1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * NVMe over Fabrics RDMA host code. 4 * Copyright (c) 2015-2016 HGST, a Western Digital Company. 5 */ 6 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 7 #include <linux/module.h> 8 #include <linux/init.h> 9 #include <linux/slab.h> 10 #include <rdma/mr_pool.h> 11 #include <linux/err.h> 12 #include <linux/string.h> 13 #include <linux/atomic.h> 14 #include <linux/blk-mq.h> 15 #include <linux/blk-mq-rdma.h> 16 #include <linux/types.h> 17 #include <linux/list.h> 18 #include <linux/mutex.h> 19 #include <linux/scatterlist.h> 20 #include <linux/nvme.h> 21 #include <asm/unaligned.h> 22 23 #include <rdma/ib_verbs.h> 24 #include <rdma/rdma_cm.h> 25 #include <linux/nvme-rdma.h> 26 27 #include "nvme.h" 28 #include "fabrics.h" 29 30 31 #define NVME_RDMA_CONNECT_TIMEOUT_MS 3000 /* 3 seconds */ 32 33 #define NVME_RDMA_MAX_SEGMENTS 256 34 35 #define NVME_RDMA_MAX_INLINE_SEGMENTS 4 36 37 struct nvme_rdma_device { 38 struct ib_device *dev; 39 struct ib_pd *pd; 40 struct kref ref; 41 struct list_head entry; 42 unsigned int num_inline_segments; 43 }; 44 45 struct nvme_rdma_qe { 46 struct ib_cqe cqe; 47 void *data; 48 u64 dma; 49 }; 50 51 struct nvme_rdma_queue; 52 struct nvme_rdma_request { 53 struct nvme_request req; 54 struct ib_mr *mr; 55 struct nvme_rdma_qe sqe; 56 union nvme_result result; 57 __le16 status; 58 refcount_t ref; 59 struct ib_sge sge[1 + NVME_RDMA_MAX_INLINE_SEGMENTS]; 60 u32 num_sge; 61 int nents; 62 struct ib_reg_wr reg_wr; 63 struct ib_cqe reg_cqe; 64 struct nvme_rdma_queue *queue; 65 struct sg_table sg_table; 66 struct scatterlist first_sgl[]; 67 }; 68 69 enum nvme_rdma_queue_flags { 70 NVME_RDMA_Q_ALLOCATED = 0, 71 NVME_RDMA_Q_LIVE = 1, 72 NVME_RDMA_Q_TR_READY = 2, 73 }; 74 75 struct nvme_rdma_queue { 76 struct nvme_rdma_qe *rsp_ring; 77 int queue_size; 78 size_t cmnd_capsule_len; 79 struct nvme_rdma_ctrl *ctrl; 80 struct nvme_rdma_device *device; 81 struct ib_cq *ib_cq; 82 struct ib_qp *qp; 83 84 unsigned long flags; 85 struct rdma_cm_id *cm_id; 86 int cm_error; 87 struct completion cm_done; 88 }; 89 90 struct nvme_rdma_ctrl { 91 /* read only in the hot path */ 92 struct nvme_rdma_queue *queues; 93 94 /* other member variables */ 95 struct blk_mq_tag_set tag_set; 96 struct work_struct err_work; 97 98 struct nvme_rdma_qe async_event_sqe; 99 100 struct delayed_work reconnect_work; 101 102 struct list_head list; 103 104 struct blk_mq_tag_set admin_tag_set; 105 struct nvme_rdma_device *device; 106 107 u32 max_fr_pages; 108 109 struct sockaddr_storage addr; 110 struct sockaddr_storage src_addr; 111 112 struct nvme_ctrl ctrl; 113 bool use_inline_data; 114 u32 io_queues[HCTX_MAX_TYPES]; 115 }; 116 117 static inline struct nvme_rdma_ctrl *to_rdma_ctrl(struct nvme_ctrl *ctrl) 118 { 119 return container_of(ctrl, struct nvme_rdma_ctrl, ctrl); 120 } 121 122 static LIST_HEAD(device_list); 123 static DEFINE_MUTEX(device_list_mutex); 124 125 static LIST_HEAD(nvme_rdma_ctrl_list); 126 static DEFINE_MUTEX(nvme_rdma_ctrl_mutex); 127 128 /* 129 * Disabling this option makes small I/O go faster, but is fundamentally 130 * unsafe. With it turned off we will have to register a global rkey that 131 * allows read and write access to all physical memory.
132 */ 133 static bool register_always = true; 134 module_param(register_always, bool, 0444); 135 MODULE_PARM_DESC(register_always, 136 "Use memory registration even for contiguous memory regions"); 137 138 static int nvme_rdma_cm_handler(struct rdma_cm_id *cm_id, 139 struct rdma_cm_event *event); 140 static void nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc); 141 142 static const struct blk_mq_ops nvme_rdma_mq_ops; 143 static const struct blk_mq_ops nvme_rdma_admin_mq_ops; 144 145 static inline int nvme_rdma_queue_idx(struct nvme_rdma_queue *queue) 146 { 147 return queue - queue->ctrl->queues; 148 } 149 150 static bool nvme_rdma_poll_queue(struct nvme_rdma_queue *queue) 151 { 152 return nvme_rdma_queue_idx(queue) > 153 queue->ctrl->io_queues[HCTX_TYPE_DEFAULT] + 154 queue->ctrl->io_queues[HCTX_TYPE_READ]; 155 } 156 157 static inline size_t nvme_rdma_inline_data_size(struct nvme_rdma_queue *queue) 158 { 159 return queue->cmnd_capsule_len - sizeof(struct nvme_command); 160 } 161 162 static void nvme_rdma_free_qe(struct ib_device *ibdev, struct nvme_rdma_qe *qe, 163 size_t capsule_size, enum dma_data_direction dir) 164 { 165 ib_dma_unmap_single(ibdev, qe->dma, capsule_size, dir); 166 kfree(qe->data); 167 } 168 169 static int nvme_rdma_alloc_qe(struct ib_device *ibdev, struct nvme_rdma_qe *qe, 170 size_t capsule_size, enum dma_data_direction dir) 171 { 172 qe->data = kzalloc(capsule_size, GFP_KERNEL); 173 if (!qe->data) 174 return -ENOMEM; 175 176 qe->dma = ib_dma_map_single(ibdev, qe->data, capsule_size, dir); 177 if (ib_dma_mapping_error(ibdev, qe->dma)) { 178 kfree(qe->data); 179 qe->data = NULL; 180 return -ENOMEM; 181 } 182 183 return 0; 184 } 185 186 static void nvme_rdma_free_ring(struct ib_device *ibdev, 187 struct nvme_rdma_qe *ring, size_t ib_queue_size, 188 size_t capsule_size, enum dma_data_direction dir) 189 { 190 int i; 191 192 for (i = 0; i < ib_queue_size; i++) 193 nvme_rdma_free_qe(ibdev, &ring[i], capsule_size, dir); 194 kfree(ring); 195 } 196 197 static struct nvme_rdma_qe *nvme_rdma_alloc_ring(struct ib_device *ibdev, 198 size_t ib_queue_size, size_t capsule_size, 199 enum dma_data_direction dir) 200 { 201 struct nvme_rdma_qe *ring; 202 int i; 203 204 ring = kcalloc(ib_queue_size, sizeof(struct nvme_rdma_qe), GFP_KERNEL); 205 if (!ring) 206 return NULL; 207 208 /* 209 * Bind the CQEs (post recv buffers) DMA mapping to the RDMA queue 210 * lifetime. It's safe, since any change in the underlying RDMA device 211 * will issue error recovery and queue re-creation.
212 */ 213 for (i = 0; i < ib_queue_size; i++) { 214 if (nvme_rdma_alloc_qe(ibdev, &ring[i], capsule_size, dir)) 215 goto out_free_ring; 216 } 217 218 return ring; 219 220 out_free_ring: 221 nvme_rdma_free_ring(ibdev, ring, i, capsule_size, dir); 222 return NULL; 223 } 224 225 static void nvme_rdma_qp_event(struct ib_event *event, void *context) 226 { 227 pr_debug("QP event %s (%d)\n", 228 ib_event_msg(event->event), event->event); 229 230 } 231 232 static int nvme_rdma_wait_for_cm(struct nvme_rdma_queue *queue) 233 { 234 int ret; 235 236 ret = wait_for_completion_interruptible_timeout(&queue->cm_done, 237 msecs_to_jiffies(NVME_RDMA_CONNECT_TIMEOUT_MS) + 1); 238 if (ret < 0) 239 return ret; 240 if (ret == 0) 241 return -ETIMEDOUT; 242 WARN_ON_ONCE(queue->cm_error > 0); 243 return queue->cm_error; 244 } 245 246 static int nvme_rdma_create_qp(struct nvme_rdma_queue *queue, const int factor) 247 { 248 struct nvme_rdma_device *dev = queue->device; 249 struct ib_qp_init_attr init_attr; 250 int ret; 251 252 memset(&init_attr, 0, sizeof(init_attr)); 253 init_attr.event_handler = nvme_rdma_qp_event; 254 /* +1 for drain */ 255 init_attr.cap.max_send_wr = factor * queue->queue_size + 1; 256 /* +1 for drain */ 257 init_attr.cap.max_recv_wr = queue->queue_size + 1; 258 init_attr.cap.max_recv_sge = 1; 259 init_attr.cap.max_send_sge = 1 + dev->num_inline_segments; 260 init_attr.sq_sig_type = IB_SIGNAL_REQ_WR; 261 init_attr.qp_type = IB_QPT_RC; 262 init_attr.send_cq = queue->ib_cq; 263 init_attr.recv_cq = queue->ib_cq; 264 265 ret = rdma_create_qp(queue->cm_id, dev->pd, &init_attr); 266 267 queue->qp = queue->cm_id->qp; 268 return ret; 269 } 270 271 static void nvme_rdma_exit_request(struct blk_mq_tag_set *set, 272 struct request *rq, unsigned int hctx_idx) 273 { 274 struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); 275 276 kfree(req->sqe.data); 277 } 278 279 static int nvme_rdma_init_request(struct blk_mq_tag_set *set, 280 struct request *rq, unsigned int hctx_idx, 281 unsigned int numa_node) 282 { 283 struct nvme_rdma_ctrl *ctrl = set->driver_data; 284 struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); 285 int queue_idx = (set == &ctrl->tag_set) ? 
hctx_idx + 1 : 0; 286 struct nvme_rdma_queue *queue = &ctrl->queues[queue_idx]; 287 288 nvme_req(rq)->ctrl = &ctrl->ctrl; 289 req->sqe.data = kzalloc(sizeof(struct nvme_command), GFP_KERNEL); 290 if (!req->sqe.data) 291 return -ENOMEM; 292 293 req->queue = queue; 294 295 return 0; 296 } 297 298 static int nvme_rdma_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, 299 unsigned int hctx_idx) 300 { 301 struct nvme_rdma_ctrl *ctrl = data; 302 struct nvme_rdma_queue *queue = &ctrl->queues[hctx_idx + 1]; 303 304 BUG_ON(hctx_idx >= ctrl->ctrl.queue_count); 305 306 hctx->driver_data = queue; 307 return 0; 308 } 309 310 static int nvme_rdma_init_admin_hctx(struct blk_mq_hw_ctx *hctx, void *data, 311 unsigned int hctx_idx) 312 { 313 struct nvme_rdma_ctrl *ctrl = data; 314 struct nvme_rdma_queue *queue = &ctrl->queues[0]; 315 316 BUG_ON(hctx_idx != 0); 317 318 hctx->driver_data = queue; 319 return 0; 320 } 321 322 static void nvme_rdma_free_dev(struct kref *ref) 323 { 324 struct nvme_rdma_device *ndev = 325 container_of(ref, struct nvme_rdma_device, ref); 326 327 mutex_lock(&device_list_mutex); 328 list_del(&ndev->entry); 329 mutex_unlock(&device_list_mutex); 330 331 ib_dealloc_pd(ndev->pd); 332 kfree(ndev); 333 } 334 335 static void nvme_rdma_dev_put(struct nvme_rdma_device *dev) 336 { 337 kref_put(&dev->ref, nvme_rdma_free_dev); 338 } 339 340 static int nvme_rdma_dev_get(struct nvme_rdma_device *dev) 341 { 342 return kref_get_unless_zero(&dev->ref); 343 } 344 345 static struct nvme_rdma_device * 346 nvme_rdma_find_get_device(struct rdma_cm_id *cm_id) 347 { 348 struct nvme_rdma_device *ndev; 349 350 mutex_lock(&device_list_mutex); 351 list_for_each_entry(ndev, &device_list, entry) { 352 if (ndev->dev->node_guid == cm_id->device->node_guid && 353 nvme_rdma_dev_get(ndev)) 354 goto out_unlock; 355 } 356 357 ndev = kzalloc(sizeof(*ndev), GFP_KERNEL); 358 if (!ndev) 359 goto out_err; 360 361 ndev->dev = cm_id->device; 362 kref_init(&ndev->ref); 363 364 ndev->pd = ib_alloc_pd(ndev->dev, 365 register_always ? 0 : IB_PD_UNSAFE_GLOBAL_RKEY); 366 if (IS_ERR(ndev->pd)) 367 goto out_free_dev; 368 369 if (!(ndev->dev->attrs.device_cap_flags & 370 IB_DEVICE_MEM_MGT_EXTENSIONS)) { 371 dev_err(&ndev->dev->dev, 372 "Memory registrations not supported.\n"); 373 goto out_free_pd; 374 } 375 376 ndev->num_inline_segments = min(NVME_RDMA_MAX_INLINE_SEGMENTS, 377 ndev->dev->attrs.max_send_sge - 1); 378 list_add(&ndev->entry, &device_list); 379 out_unlock: 380 mutex_unlock(&device_list_mutex); 381 return ndev; 382 383 out_free_pd: 384 ib_dealloc_pd(ndev->pd); 385 out_free_dev: 386 kfree(ndev); 387 out_err: 388 mutex_unlock(&device_list_mutex); 389 return NULL; 390 } 391 392 static void nvme_rdma_destroy_queue_ib(struct nvme_rdma_queue *queue) 393 { 394 struct nvme_rdma_device *dev; 395 struct ib_device *ibdev; 396 397 if (!test_and_clear_bit(NVME_RDMA_Q_TR_READY, &queue->flags)) 398 return; 399 400 dev = queue->device; 401 ibdev = dev->dev; 402 403 ib_mr_pool_destroy(queue->qp, &queue->qp->rdma_mrs); 404 405 /* 406 * The cm_id object might have been destroyed during RDMA connection 407 * establishment error flow to avoid getting other cma events, thus 408 * the destruction of the QP shouldn't use rdma_cm API. 
409 */ 410 ib_destroy_qp(queue->qp); 411 ib_free_cq(queue->ib_cq); 412 413 nvme_rdma_free_ring(ibdev, queue->rsp_ring, queue->queue_size, 414 sizeof(struct nvme_completion), DMA_FROM_DEVICE); 415 416 nvme_rdma_dev_put(dev); 417 } 418 419 static int nvme_rdma_get_max_fr_pages(struct ib_device *ibdev) 420 { 421 return min_t(u32, NVME_RDMA_MAX_SEGMENTS, 422 ibdev->attrs.max_fast_reg_page_list_len - 1); 423 } 424 425 static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue) 426 { 427 struct ib_device *ibdev; 428 const int send_wr_factor = 3; /* MR, SEND, INV */ 429 const int cq_factor = send_wr_factor + 1; /* + RECV */ 430 int comp_vector, idx = nvme_rdma_queue_idx(queue); 431 enum ib_poll_context poll_ctx; 432 int ret, pages_per_mr; 433 434 queue->device = nvme_rdma_find_get_device(queue->cm_id); 435 if (!queue->device) { 436 dev_err(queue->cm_id->device->dev.parent, 437 "no client data found!\n"); 438 return -ECONNREFUSED; 439 } 440 ibdev = queue->device->dev; 441 442 /* 443 * Spread I/O queues completion vectors according to their queue index. 444 * Admin queues can always go on completion vector 0. 445 */ 446 comp_vector = idx == 0 ? idx : idx - 1; 447 448 /* Polling queues need direct cq polling context */ 449 if (nvme_rdma_poll_queue(queue)) 450 poll_ctx = IB_POLL_DIRECT; 451 else 452 poll_ctx = IB_POLL_SOFTIRQ; 453 454 /* +1 for ib_stop_cq */ 455 queue->ib_cq = ib_alloc_cq(ibdev, queue, 456 cq_factor * queue->queue_size + 1, 457 comp_vector, poll_ctx); 458 if (IS_ERR(queue->ib_cq)) { 459 ret = PTR_ERR(queue->ib_cq); 460 goto out_put_dev; 461 } 462 463 ret = nvme_rdma_create_qp(queue, send_wr_factor); 464 if (ret) 465 goto out_destroy_ib_cq; 466 467 queue->rsp_ring = nvme_rdma_alloc_ring(ibdev, queue->queue_size, 468 sizeof(struct nvme_completion), DMA_FROM_DEVICE); 469 if (!queue->rsp_ring) { 470 ret = -ENOMEM; 471 goto out_destroy_qp; 472 } 473 474 /* 475 * Currently we don't use SG_GAPS MRs, so if the first entry is 476 * misaligned we'll end up using two entries for a single data page, 477 * so one additional entry is required.
478 */ 479 pages_per_mr = nvme_rdma_get_max_fr_pages(ibdev) + 1; 480 ret = ib_mr_pool_init(queue->qp, &queue->qp->rdma_mrs, 481 queue->queue_size, 482 IB_MR_TYPE_MEM_REG, 483 pages_per_mr, 0); 484 if (ret) { 485 dev_err(queue->ctrl->ctrl.device, 486 "failed to initialize MR pool sized %d for QID %d\n", 487 queue->queue_size, idx); 488 goto out_destroy_ring; 489 } 490 491 set_bit(NVME_RDMA_Q_TR_READY, &queue->flags); 492 493 return 0; 494 495 out_destroy_ring: 496 nvme_rdma_free_ring(ibdev, queue->rsp_ring, queue->queue_size, 497 sizeof(struct nvme_completion), DMA_FROM_DEVICE); 498 out_destroy_qp: 499 rdma_destroy_qp(queue->cm_id); 500 out_destroy_ib_cq: 501 ib_free_cq(queue->ib_cq); 502 out_put_dev: 503 nvme_rdma_dev_put(queue->device); 504 return ret; 505 } 506 507 static int nvme_rdma_alloc_queue(struct nvme_rdma_ctrl *ctrl, 508 int idx, size_t queue_size) 509 { 510 struct nvme_rdma_queue *queue; 511 struct sockaddr *src_addr = NULL; 512 int ret; 513 514 queue = &ctrl->queues[idx]; 515 queue->ctrl = ctrl; 516 init_completion(&queue->cm_done); 517 518 if (idx > 0) 519 queue->cmnd_capsule_len = ctrl->ctrl.ioccsz * 16; 520 else 521 queue->cmnd_capsule_len = sizeof(struct nvme_command); 522 523 queue->queue_size = queue_size; 524 525 queue->cm_id = rdma_create_id(&init_net, nvme_rdma_cm_handler, queue, 526 RDMA_PS_TCP, IB_QPT_RC); 527 if (IS_ERR(queue->cm_id)) { 528 dev_info(ctrl->ctrl.device, 529 "failed to create CM ID: %ld\n", PTR_ERR(queue->cm_id)); 530 return PTR_ERR(queue->cm_id); 531 } 532 533 if (ctrl->ctrl.opts->mask & NVMF_OPT_HOST_TRADDR) 534 src_addr = (struct sockaddr *)&ctrl->src_addr; 535 536 queue->cm_error = -ETIMEDOUT; 537 ret = rdma_resolve_addr(queue->cm_id, src_addr, 538 (struct sockaddr *)&ctrl->addr, 539 NVME_RDMA_CONNECT_TIMEOUT_MS); 540 if (ret) { 541 dev_info(ctrl->ctrl.device, 542 "rdma_resolve_addr failed (%d).\n", ret); 543 goto out_destroy_cm_id; 544 } 545 546 ret = nvme_rdma_wait_for_cm(queue); 547 if (ret) { 548 dev_info(ctrl->ctrl.device, 549 "rdma connection establishment failed (%d)\n", ret); 550 goto out_destroy_cm_id; 551 } 552 553 set_bit(NVME_RDMA_Q_ALLOCATED, &queue->flags); 554 555 return 0; 556 557 out_destroy_cm_id: 558 rdma_destroy_id(queue->cm_id); 559 nvme_rdma_destroy_queue_ib(queue); 560 return ret; 561 } 562 563 static void __nvme_rdma_stop_queue(struct nvme_rdma_queue *queue) 564 { 565 rdma_disconnect(queue->cm_id); 566 ib_drain_qp(queue->qp); 567 } 568 569 static void nvme_rdma_stop_queue(struct nvme_rdma_queue *queue) 570 { 571 if (!test_and_clear_bit(NVME_RDMA_Q_LIVE, &queue->flags)) 572 return; 573 __nvme_rdma_stop_queue(queue); 574 } 575 576 static void nvme_rdma_free_queue(struct nvme_rdma_queue *queue) 577 { 578 if (!test_and_clear_bit(NVME_RDMA_Q_ALLOCATED, &queue->flags)) 579 return; 580 581 nvme_rdma_destroy_queue_ib(queue); 582 rdma_destroy_id(queue->cm_id); 583 } 584 585 static void nvme_rdma_free_io_queues(struct nvme_rdma_ctrl *ctrl) 586 { 587 int i; 588 589 for (i = 1; i < ctrl->ctrl.queue_count; i++) 590 nvme_rdma_free_queue(&ctrl->queues[i]); 591 } 592 593 static void nvme_rdma_stop_io_queues(struct nvme_rdma_ctrl *ctrl) 594 { 595 int i; 596 597 for (i = 1; i < ctrl->ctrl.queue_count; i++) 598 nvme_rdma_stop_queue(&ctrl->queues[i]); 599 } 600 601 static int nvme_rdma_start_queue(struct nvme_rdma_ctrl *ctrl, int idx) 602 { 603 struct nvme_rdma_queue *queue = &ctrl->queues[idx]; 604 bool poll = nvme_rdma_poll_queue(queue); 605 int ret; 606 607 if (idx) 608 ret = nvmf_connect_io_queue(&ctrl->ctrl, idx, poll); 609 else 610 ret = 
nvmf_connect_admin_queue(&ctrl->ctrl); 611 612 if (!ret) { 613 set_bit(NVME_RDMA_Q_LIVE, &queue->flags); 614 } else { 615 if (test_bit(NVME_RDMA_Q_ALLOCATED, &queue->flags)) 616 __nvme_rdma_stop_queue(queue); 617 dev_info(ctrl->ctrl.device, 618 "failed to connect queue: %d ret=%d\n", idx, ret); 619 } 620 return ret; 621 } 622 623 static int nvme_rdma_start_io_queues(struct nvme_rdma_ctrl *ctrl) 624 { 625 int i, ret = 0; 626 627 for (i = 1; i < ctrl->ctrl.queue_count; i++) { 628 ret = nvme_rdma_start_queue(ctrl, i); 629 if (ret) 630 goto out_stop_queues; 631 } 632 633 return 0; 634 635 out_stop_queues: 636 for (i--; i >= 1; i--) 637 nvme_rdma_stop_queue(&ctrl->queues[i]); 638 return ret; 639 } 640 641 static int nvme_rdma_alloc_io_queues(struct nvme_rdma_ctrl *ctrl) 642 { 643 struct nvmf_ctrl_options *opts = ctrl->ctrl.opts; 644 struct ib_device *ibdev = ctrl->device->dev; 645 unsigned int nr_io_queues, nr_default_queues; 646 unsigned int nr_read_queues, nr_poll_queues; 647 int i, ret; 648 649 nr_read_queues = min_t(unsigned int, ibdev->num_comp_vectors, 650 min(opts->nr_io_queues, num_online_cpus())); 651 nr_default_queues = min_t(unsigned int, ibdev->num_comp_vectors, 652 min(opts->nr_write_queues, num_online_cpus())); 653 nr_poll_queues = min(opts->nr_poll_queues, num_online_cpus()); 654 nr_io_queues = nr_read_queues + nr_default_queues + nr_poll_queues; 655 656 ret = nvme_set_queue_count(&ctrl->ctrl, &nr_io_queues); 657 if (ret) 658 return ret; 659 660 ctrl->ctrl.queue_count = nr_io_queues + 1; 661 if (ctrl->ctrl.queue_count < 2) 662 return 0; 663 664 dev_info(ctrl->ctrl.device, 665 "creating %d I/O queues.\n", nr_io_queues); 666 667 if (opts->nr_write_queues && nr_read_queues < nr_io_queues) { 668 /* 669 * separate read/write queues 670 * hand out dedicated default queues only after we have 671 * sufficient read queues. 672 */ 673 ctrl->io_queues[HCTX_TYPE_READ] = nr_read_queues; 674 nr_io_queues -= ctrl->io_queues[HCTX_TYPE_READ]; 675 ctrl->io_queues[HCTX_TYPE_DEFAULT] = 676 min(nr_default_queues, nr_io_queues); 677 nr_io_queues -= ctrl->io_queues[HCTX_TYPE_DEFAULT]; 678 } else { 679 /* 680 * shared read/write queues 681 * either no write queues were requested, or we don't have 682 * sufficient queue count to have dedicated default queues. 
683 */ 684 ctrl->io_queues[HCTX_TYPE_DEFAULT] = 685 min(nr_read_queues, nr_io_queues); 686 nr_io_queues -= ctrl->io_queues[HCTX_TYPE_DEFAULT]; 687 } 688 689 if (opts->nr_poll_queues && nr_io_queues) { 690 /* map dedicated poll queues only if we have queues left */ 691 ctrl->io_queues[HCTX_TYPE_POLL] = 692 min(nr_poll_queues, nr_io_queues); 693 } 694 695 for (i = 1; i < ctrl->ctrl.queue_count; i++) { 696 ret = nvme_rdma_alloc_queue(ctrl, i, 697 ctrl->ctrl.sqsize + 1); 698 if (ret) 699 goto out_free_queues; 700 } 701 702 return 0; 703 704 out_free_queues: 705 for (i--; i >= 1; i--) 706 nvme_rdma_free_queue(&ctrl->queues[i]); 707 708 return ret; 709 } 710 711 static struct blk_mq_tag_set *nvme_rdma_alloc_tagset(struct nvme_ctrl *nctrl, 712 bool admin) 713 { 714 struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl); 715 struct blk_mq_tag_set *set; 716 int ret; 717 718 if (admin) { 719 set = &ctrl->admin_tag_set; 720 memset(set, 0, sizeof(*set)); 721 set->ops = &nvme_rdma_admin_mq_ops; 722 set->queue_depth = NVME_AQ_MQ_TAG_DEPTH; 723 set->reserved_tags = 2; /* connect + keep-alive */ 724 set->numa_node = nctrl->numa_node; 725 set->cmd_size = sizeof(struct nvme_rdma_request) + 726 NVME_INLINE_SG_CNT * sizeof(struct scatterlist); 727 set->driver_data = ctrl; 728 set->nr_hw_queues = 1; 729 set->timeout = ADMIN_TIMEOUT; 730 set->flags = BLK_MQ_F_NO_SCHED; 731 } else { 732 set = &ctrl->tag_set; 733 memset(set, 0, sizeof(*set)); 734 set->ops = &nvme_rdma_mq_ops; 735 set->queue_depth = nctrl->sqsize + 1; 736 set->reserved_tags = 1; /* fabric connect */ 737 set->numa_node = nctrl->numa_node; 738 set->flags = BLK_MQ_F_SHOULD_MERGE; 739 set->cmd_size = sizeof(struct nvme_rdma_request) + 740 NVME_INLINE_SG_CNT * sizeof(struct scatterlist); 741 set->driver_data = ctrl; 742 set->nr_hw_queues = nctrl->queue_count - 1; 743 set->timeout = NVME_IO_TIMEOUT; 744 set->nr_maps = nctrl->opts->nr_poll_queues ? HCTX_MAX_TYPES : 2; 745 } 746 747 ret = blk_mq_alloc_tag_set(set); 748 if (ret) 749 return ERR_PTR(ret); 750 751 return set; 752 } 753 754 static void nvme_rdma_destroy_admin_queue(struct nvme_rdma_ctrl *ctrl, 755 bool remove) 756 { 757 if (remove) { 758 blk_cleanup_queue(ctrl->ctrl.admin_q); 759 blk_cleanup_queue(ctrl->ctrl.fabrics_q); 760 blk_mq_free_tag_set(ctrl->ctrl.admin_tagset); 761 } 762 if (ctrl->async_event_sqe.data) { 763 nvme_rdma_free_qe(ctrl->device->dev, &ctrl->async_event_sqe, 764 sizeof(struct nvme_command), DMA_TO_DEVICE); 765 ctrl->async_event_sqe.data = NULL; 766 } 767 nvme_rdma_free_queue(&ctrl->queues[0]); 768 } 769 770 static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl, 771 bool new) 772 { 773 int error; 774 775 error = nvme_rdma_alloc_queue(ctrl, 0, NVME_AQ_DEPTH); 776 if (error) 777 return error; 778 779 ctrl->device = ctrl->queues[0].device; 780 ctrl->ctrl.numa_node = dev_to_node(ctrl->device->dev->dma_device); 781 782 ctrl->max_fr_pages = nvme_rdma_get_max_fr_pages(ctrl->device->dev); 783 784 /* 785 * Bind the async event SQE DMA mapping to the admin queue lifetime. 786 * It's safe, since any change in the underlying RDMA device will issue 787 * error recovery and queue re-creation.
788 */ 789 error = nvme_rdma_alloc_qe(ctrl->device->dev, &ctrl->async_event_sqe, 790 sizeof(struct nvme_command), DMA_TO_DEVICE); 791 if (error) 792 goto out_free_queue; 793 794 if (new) { 795 ctrl->ctrl.admin_tagset = nvme_rdma_alloc_tagset(&ctrl->ctrl, true); 796 if (IS_ERR(ctrl->ctrl.admin_tagset)) { 797 error = PTR_ERR(ctrl->ctrl.admin_tagset); 798 goto out_free_async_qe; 799 } 800 801 ctrl->ctrl.fabrics_q = blk_mq_init_queue(&ctrl->admin_tag_set); 802 if (IS_ERR(ctrl->ctrl.fabrics_q)) { 803 error = PTR_ERR(ctrl->ctrl.fabrics_q); 804 goto out_free_tagset; 805 } 806 807 ctrl->ctrl.admin_q = blk_mq_init_queue(&ctrl->admin_tag_set); 808 if (IS_ERR(ctrl->ctrl.admin_q)) { 809 error = PTR_ERR(ctrl->ctrl.admin_q); 810 goto out_cleanup_fabrics_q; 811 } 812 } 813 814 error = nvme_rdma_start_queue(ctrl, 0); 815 if (error) 816 goto out_cleanup_queue; 817 818 error = nvme_enable_ctrl(&ctrl->ctrl); 819 if (error) 820 goto out_stop_queue; 821 822 ctrl->ctrl.max_segments = ctrl->max_fr_pages; 823 ctrl->ctrl.max_hw_sectors = ctrl->max_fr_pages << (ilog2(SZ_4K) - 9); 824 825 blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); 826 827 error = nvme_init_identify(&ctrl->ctrl); 828 if (error) 829 goto out_stop_queue; 830 831 return 0; 832 833 out_stop_queue: 834 nvme_rdma_stop_queue(&ctrl->queues[0]); 835 out_cleanup_queue: 836 if (new) 837 blk_cleanup_queue(ctrl->ctrl.admin_q); 838 out_cleanup_fabrics_q: 839 if (new) 840 blk_cleanup_queue(ctrl->ctrl.fabrics_q); 841 out_free_tagset: 842 if (new) 843 blk_mq_free_tag_set(ctrl->ctrl.admin_tagset); 844 out_free_async_qe: 845 if (ctrl->async_event_sqe.data) { 846 nvme_rdma_free_qe(ctrl->device->dev, &ctrl->async_event_sqe, 847 sizeof(struct nvme_command), DMA_TO_DEVICE); 848 ctrl->async_event_sqe.data = NULL; 849 } 850 out_free_queue: 851 nvme_rdma_free_queue(&ctrl->queues[0]); 852 return error; 853 } 854 855 static void nvme_rdma_destroy_io_queues(struct nvme_rdma_ctrl *ctrl, 856 bool remove) 857 { 858 if (remove) { 859 blk_cleanup_queue(ctrl->ctrl.connect_q); 860 blk_mq_free_tag_set(ctrl->ctrl.tagset); 861 } 862 nvme_rdma_free_io_queues(ctrl); 863 } 864 865 static int nvme_rdma_configure_io_queues(struct nvme_rdma_ctrl *ctrl, bool new) 866 { 867 int ret; 868 869 ret = nvme_rdma_alloc_io_queues(ctrl); 870 if (ret) 871 return ret; 872 873 if (new) { 874 ctrl->ctrl.tagset = nvme_rdma_alloc_tagset(&ctrl->ctrl, false); 875 if (IS_ERR(ctrl->ctrl.tagset)) { 876 ret = PTR_ERR(ctrl->ctrl.tagset); 877 goto out_free_io_queues; 878 } 879 880 ctrl->ctrl.connect_q = blk_mq_init_queue(&ctrl->tag_set); 881 if (IS_ERR(ctrl->ctrl.connect_q)) { 882 ret = PTR_ERR(ctrl->ctrl.connect_q); 883 goto out_free_tag_set; 884 } 885 } else { 886 blk_mq_update_nr_hw_queues(&ctrl->tag_set, 887 ctrl->ctrl.queue_count - 1); 888 } 889 890 ret = nvme_rdma_start_io_queues(ctrl); 891 if (ret) 892 goto out_cleanup_connect_q; 893 894 return 0; 895 896 out_cleanup_connect_q: 897 if (new) 898 blk_cleanup_queue(ctrl->ctrl.connect_q); 899 out_free_tag_set: 900 if (new) 901 blk_mq_free_tag_set(ctrl->ctrl.tagset); 902 out_free_io_queues: 903 nvme_rdma_free_io_queues(ctrl); 904 return ret; 905 } 906 907 static void nvme_rdma_teardown_admin_queue(struct nvme_rdma_ctrl *ctrl, 908 bool remove) 909 { 910 blk_mq_quiesce_queue(ctrl->ctrl.admin_q); 911 nvme_rdma_stop_queue(&ctrl->queues[0]); 912 if (ctrl->ctrl.admin_tagset) { 913 blk_mq_tagset_busy_iter(ctrl->ctrl.admin_tagset, 914 nvme_cancel_request, &ctrl->ctrl); 915 blk_mq_tagset_wait_completed_request(ctrl->ctrl.admin_tagset); 916 } 917 if (remove) 918 
blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); 919 nvme_rdma_destroy_admin_queue(ctrl, remove); 920 } 921 922 static void nvme_rdma_teardown_io_queues(struct nvme_rdma_ctrl *ctrl, 923 bool remove) 924 { 925 if (ctrl->ctrl.queue_count > 1) { 926 nvme_stop_queues(&ctrl->ctrl); 927 nvme_rdma_stop_io_queues(ctrl); 928 if (ctrl->ctrl.tagset) { 929 blk_mq_tagset_busy_iter(ctrl->ctrl.tagset, 930 nvme_cancel_request, &ctrl->ctrl); 931 blk_mq_tagset_wait_completed_request(ctrl->ctrl.tagset); 932 } 933 if (remove) 934 nvme_start_queues(&ctrl->ctrl); 935 nvme_rdma_destroy_io_queues(ctrl, remove); 936 } 937 } 938 939 static void nvme_rdma_free_ctrl(struct nvme_ctrl *nctrl) 940 { 941 struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl); 942 943 if (list_empty(&ctrl->list)) 944 goto free_ctrl; 945 946 mutex_lock(&nvme_rdma_ctrl_mutex); 947 list_del(&ctrl->list); 948 mutex_unlock(&nvme_rdma_ctrl_mutex); 949 950 nvmf_free_options(nctrl->opts); 951 free_ctrl: 952 kfree(ctrl->queues); 953 kfree(ctrl); 954 } 955 956 static void nvme_rdma_reconnect_or_remove(struct nvme_rdma_ctrl *ctrl) 957 { 958 /* If we are resetting/deleting then do nothing */ 959 if (ctrl->ctrl.state != NVME_CTRL_CONNECTING) { 960 WARN_ON_ONCE(ctrl->ctrl.state == NVME_CTRL_NEW || 961 ctrl->ctrl.state == NVME_CTRL_LIVE); 962 return; 963 } 964 965 if (nvmf_should_reconnect(&ctrl->ctrl)) { 966 dev_info(ctrl->ctrl.device, "Reconnecting in %d seconds...\n", 967 ctrl->ctrl.opts->reconnect_delay); 968 queue_delayed_work(nvme_wq, &ctrl->reconnect_work, 969 ctrl->ctrl.opts->reconnect_delay * HZ); 970 } else { 971 nvme_delete_ctrl(&ctrl->ctrl); 972 } 973 } 974 975 static int nvme_rdma_setup_ctrl(struct nvme_rdma_ctrl *ctrl, bool new) 976 { 977 int ret = -EINVAL; 978 bool changed; 979 980 ret = nvme_rdma_configure_admin_queue(ctrl, new); 981 if (ret) 982 return ret; 983 984 if (ctrl->ctrl.icdoff) { 985 dev_err(ctrl->ctrl.device, "icdoff is not supported!\n"); 986 goto destroy_admin; 987 } 988 989 if (!(ctrl->ctrl.sgls & (1 << 2))) { 990 dev_err(ctrl->ctrl.device, 991 "Mandatory keyed sgls are not supported!\n"); 992 goto destroy_admin; 993 } 994 995 if (ctrl->ctrl.opts->queue_size > ctrl->ctrl.sqsize + 1) { 996 dev_warn(ctrl->ctrl.device, 997 "queue_size %zu > ctrl sqsize %u, clamping down\n", 998 ctrl->ctrl.opts->queue_size, ctrl->ctrl.sqsize + 1); 999 } 1000 1001 if (ctrl->ctrl.sqsize + 1 > ctrl->ctrl.maxcmd) { 1002 dev_warn(ctrl->ctrl.device, 1003 "sqsize %u > ctrl maxcmd %u, clamping down\n", 1004 ctrl->ctrl.sqsize + 1, ctrl->ctrl.maxcmd); 1005 ctrl->ctrl.sqsize = ctrl->ctrl.maxcmd - 1; 1006 } 1007 1008 if (ctrl->ctrl.sgls & (1 << 20)) 1009 ctrl->use_inline_data = true; 1010 1011 if (ctrl->ctrl.queue_count > 1) { 1012 ret = nvme_rdma_configure_io_queues(ctrl, new); 1013 if (ret) 1014 goto destroy_admin; 1015 } 1016 1017 changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); 1018 if (!changed) { 1019 /* 1020 * state change failure is ok if we're in DELETING state, 1021 * unless we are in the middle of creating a new controller, 1022 * to avoid races with the teardown flow.
1023 */ 1024 WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING); 1025 WARN_ON_ONCE(new); 1026 ret = -EINVAL; 1027 goto destroy_io; 1028 } 1029 1030 nvme_start_ctrl(&ctrl->ctrl); 1031 return 0; 1032 1033 destroy_io: 1034 if (ctrl->ctrl.queue_count > 1) 1035 nvme_rdma_destroy_io_queues(ctrl, new); 1036 destroy_admin: 1037 nvme_rdma_stop_queue(&ctrl->queues[0]); 1038 nvme_rdma_destroy_admin_queue(ctrl, new); 1039 return ret; 1040 } 1041 1042 static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work) 1043 { 1044 struct nvme_rdma_ctrl *ctrl = container_of(to_delayed_work(work), 1045 struct nvme_rdma_ctrl, reconnect_work); 1046 1047 ++ctrl->ctrl.nr_reconnects; 1048 1049 if (nvme_rdma_setup_ctrl(ctrl, false)) 1050 goto requeue; 1051 1052 dev_info(ctrl->ctrl.device, "Successfully reconnected (%d attempts)\n", 1053 ctrl->ctrl.nr_reconnects); 1054 1055 ctrl->ctrl.nr_reconnects = 0; 1056 1057 return; 1058 1059 requeue: 1060 dev_info(ctrl->ctrl.device, "Failed reconnect attempt %d\n", 1061 ctrl->ctrl.nr_reconnects); 1062 nvme_rdma_reconnect_or_remove(ctrl); 1063 } 1064 1065 static void nvme_rdma_error_recovery_work(struct work_struct *work) 1066 { 1067 struct nvme_rdma_ctrl *ctrl = container_of(work, 1068 struct nvme_rdma_ctrl, err_work); 1069 1070 nvme_stop_keep_alive(&ctrl->ctrl); 1071 nvme_rdma_teardown_io_queues(ctrl, false); 1072 nvme_start_queues(&ctrl->ctrl); 1073 nvme_rdma_teardown_admin_queue(ctrl, false); 1074 blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); 1075 1076 if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) { 1077 /* state change failure is ok if we're in DELETING state */ 1078 WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING); 1079 return; 1080 } 1081 1082 nvme_rdma_reconnect_or_remove(ctrl); 1083 } 1084 1085 static void nvme_rdma_error_recovery(struct nvme_rdma_ctrl *ctrl) 1086 { 1087 if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RESETTING)) 1088 return; 1089 1090 queue_work(nvme_reset_wq, &ctrl->err_work); 1091 } 1092 1093 static void nvme_rdma_wr_error(struct ib_cq *cq, struct ib_wc *wc, 1094 const char *op) 1095 { 1096 struct nvme_rdma_queue *queue = cq->cq_context; 1097 struct nvme_rdma_ctrl *ctrl = queue->ctrl; 1098 1099 if (ctrl->ctrl.state == NVME_CTRL_LIVE) 1100 dev_info(ctrl->ctrl.device, 1101 "%s for CQE 0x%p failed with status %s (%d)\n", 1102 op, wc->wr_cqe, 1103 ib_wc_status_msg(wc->status), wc->status); 1104 nvme_rdma_error_recovery(ctrl); 1105 } 1106 1107 static void nvme_rdma_memreg_done(struct ib_cq *cq, struct ib_wc *wc) 1108 { 1109 if (unlikely(wc->status != IB_WC_SUCCESS)) 1110 nvme_rdma_wr_error(cq, wc, "MEMREG"); 1111 } 1112 1113 static void nvme_rdma_inv_rkey_done(struct ib_cq *cq, struct ib_wc *wc) 1114 { 1115 struct nvme_rdma_request *req = 1116 container_of(wc->wr_cqe, struct nvme_rdma_request, reg_cqe); 1117 struct request *rq = blk_mq_rq_from_pdu(req); 1118 1119 if (unlikely(wc->status != IB_WC_SUCCESS)) { 1120 nvme_rdma_wr_error(cq, wc, "LOCAL_INV"); 1121 return; 1122 } 1123 1124 if (refcount_dec_and_test(&req->ref)) 1125 nvme_end_request(rq, req->status, req->result); 1126 1127 } 1128 1129 static int nvme_rdma_inv_rkey(struct nvme_rdma_queue *queue, 1130 struct nvme_rdma_request *req) 1131 { 1132 struct ib_send_wr wr = { 1133 .opcode = IB_WR_LOCAL_INV, 1134 .next = NULL, 1135 .num_sge = 0, 1136 .send_flags = IB_SEND_SIGNALED, 1137 .ex.invalidate_rkey = req->mr->rkey, 1138 }; 1139 1140 req->reg_cqe.done = nvme_rdma_inv_rkey_done; 1141 wr.wr_cqe = &req->reg_cqe; 1142 1143 return ib_post_send(queue->qp, &wr, NULL); 1144 } 
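/*
 * Request completion note: each request holds two references (see
 * refcount_set(&req->ref, 2) in nvme_rdma_map_data() below). One is dropped
 * by the SEND completion in nvme_rdma_send_done(); the other is dropped
 * either by nvme_rdma_process_nvme_rsp() when the NVMe completion arrives,
 * or by nvme_rdma_inv_rkey_done() above when a LOCAL_INV work request had
 * to be posted first. Only the final refcount_dec_and_test() ends the
 * block layer request.
 */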
1145 1146 static void nvme_rdma_unmap_data(struct nvme_rdma_queue *queue, 1147 struct request *rq) 1148 { 1149 struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); 1150 struct nvme_rdma_device *dev = queue->device; 1151 struct ib_device *ibdev = dev->dev; 1152 1153 if (!blk_rq_nr_phys_segments(rq)) 1154 return; 1155 1156 if (req->mr) { 1157 ib_mr_pool_put(queue->qp, &queue->qp->rdma_mrs, req->mr); 1158 req->mr = NULL; 1159 } 1160 1161 ib_dma_unmap_sg(ibdev, req->sg_table.sgl, req->nents, rq_dma_dir(rq)); 1162 sg_free_table_chained(&req->sg_table, NVME_INLINE_SG_CNT); 1163 } 1164 1165 static int nvme_rdma_set_sg_null(struct nvme_command *c) 1166 { 1167 struct nvme_keyed_sgl_desc *sg = &c->common.dptr.ksgl; 1168 1169 sg->addr = 0; 1170 put_unaligned_le24(0, sg->length); 1171 put_unaligned_le32(0, sg->key); 1172 sg->type = NVME_KEY_SGL_FMT_DATA_DESC << 4; 1173 return 0; 1174 } 1175 1176 static int nvme_rdma_map_sg_inline(struct nvme_rdma_queue *queue, 1177 struct nvme_rdma_request *req, struct nvme_command *c, 1178 int count) 1179 { 1180 struct nvme_sgl_desc *sg = &c->common.dptr.sgl; 1181 struct scatterlist *sgl = req->sg_table.sgl; 1182 struct ib_sge *sge = &req->sge[1]; 1183 u32 len = 0; 1184 int i; 1185 1186 for (i = 0; i < count; i++, sgl++, sge++) { 1187 sge->addr = sg_dma_address(sgl); 1188 sge->length = sg_dma_len(sgl); 1189 sge->lkey = queue->device->pd->local_dma_lkey; 1190 len += sge->length; 1191 } 1192 1193 sg->addr = cpu_to_le64(queue->ctrl->ctrl.icdoff); 1194 sg->length = cpu_to_le32(len); 1195 sg->type = (NVME_SGL_FMT_DATA_DESC << 4) | NVME_SGL_FMT_OFFSET; 1196 1197 req->num_sge += count; 1198 return 0; 1199 } 1200 1201 static int nvme_rdma_map_sg_single(struct nvme_rdma_queue *queue, 1202 struct nvme_rdma_request *req, struct nvme_command *c) 1203 { 1204 struct nvme_keyed_sgl_desc *sg = &c->common.dptr.ksgl; 1205 1206 sg->addr = cpu_to_le64(sg_dma_address(req->sg_table.sgl)); 1207 put_unaligned_le24(sg_dma_len(req->sg_table.sgl), sg->length); 1208 put_unaligned_le32(queue->device->pd->unsafe_global_rkey, sg->key); 1209 sg->type = NVME_KEY_SGL_FMT_DATA_DESC << 4; 1210 return 0; 1211 } 1212 1213 static int nvme_rdma_map_sg_fr(struct nvme_rdma_queue *queue, 1214 struct nvme_rdma_request *req, struct nvme_command *c, 1215 int count) 1216 { 1217 struct nvme_keyed_sgl_desc *sg = &c->common.dptr.ksgl; 1218 int nr; 1219 1220 req->mr = ib_mr_pool_get(queue->qp, &queue->qp->rdma_mrs); 1221 if (WARN_ON_ONCE(!req->mr)) 1222 return -EAGAIN; 1223 1224 /* 1225 * Align the MR to a 4K page size to match the ctrl page size and 1226 * the block virtual boundary. 
1227 */ 1228 nr = ib_map_mr_sg(req->mr, req->sg_table.sgl, count, NULL, SZ_4K); 1229 if (unlikely(nr < count)) { 1230 ib_mr_pool_put(queue->qp, &queue->qp->rdma_mrs, req->mr); 1231 req->mr = NULL; 1232 if (nr < 0) 1233 return nr; 1234 return -EINVAL; 1235 } 1236 1237 ib_update_fast_reg_key(req->mr, ib_inc_rkey(req->mr->rkey)); 1238 1239 req->reg_cqe.done = nvme_rdma_memreg_done; 1240 memset(&req->reg_wr, 0, sizeof(req->reg_wr)); 1241 req->reg_wr.wr.opcode = IB_WR_REG_MR; 1242 req->reg_wr.wr.wr_cqe = &req->reg_cqe; 1243 req->reg_wr.wr.num_sge = 0; 1244 req->reg_wr.mr = req->mr; 1245 req->reg_wr.key = req->mr->rkey; 1246 req->reg_wr.access = IB_ACCESS_LOCAL_WRITE | 1247 IB_ACCESS_REMOTE_READ | 1248 IB_ACCESS_REMOTE_WRITE; 1249 1250 sg->addr = cpu_to_le64(req->mr->iova); 1251 put_unaligned_le24(req->mr->length, sg->length); 1252 put_unaligned_le32(req->mr->rkey, sg->key); 1253 sg->type = (NVME_KEY_SGL_FMT_DATA_DESC << 4) | 1254 NVME_SGL_FMT_INVALIDATE; 1255 1256 return 0; 1257 } 1258 1259 static int nvme_rdma_map_data(struct nvme_rdma_queue *queue, 1260 struct request *rq, struct nvme_command *c) 1261 { 1262 struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); 1263 struct nvme_rdma_device *dev = queue->device; 1264 struct ib_device *ibdev = dev->dev; 1265 int count, ret; 1266 1267 req->num_sge = 1; 1268 refcount_set(&req->ref, 2); /* send and recv completions */ 1269 1270 c->common.flags |= NVME_CMD_SGL_METABUF; 1271 1272 if (!blk_rq_nr_phys_segments(rq)) 1273 return nvme_rdma_set_sg_null(c); 1274 1275 req->sg_table.sgl = req->first_sgl; 1276 ret = sg_alloc_table_chained(&req->sg_table, 1277 blk_rq_nr_phys_segments(rq), req->sg_table.sgl, 1278 NVME_INLINE_SG_CNT); 1279 if (ret) 1280 return -ENOMEM; 1281 1282 req->nents = blk_rq_map_sg(rq->q, rq, req->sg_table.sgl); 1283 1284 count = ib_dma_map_sg(ibdev, req->sg_table.sgl, req->nents, 1285 rq_dma_dir(rq)); 1286 if (unlikely(count <= 0)) { 1287 ret = -EIO; 1288 goto out_free_table; 1289 } 1290 1291 if (count <= dev->num_inline_segments) { 1292 if (rq_data_dir(rq) == WRITE && nvme_rdma_queue_idx(queue) && 1293 queue->ctrl->use_inline_data && 1294 blk_rq_payload_bytes(rq) <= 1295 nvme_rdma_inline_data_size(queue)) { 1296 ret = nvme_rdma_map_sg_inline(queue, req, c, count); 1297 goto out; 1298 } 1299 1300 if (count == 1 && dev->pd->flags & IB_PD_UNSAFE_GLOBAL_RKEY) { 1301 ret = nvme_rdma_map_sg_single(queue, req, c); 1302 goto out; 1303 } 1304 } 1305 1306 ret = nvme_rdma_map_sg_fr(queue, req, c, count); 1307 out: 1308 if (unlikely(ret)) 1309 goto out_unmap_sg; 1310 1311 return 0; 1312 1313 out_unmap_sg: 1314 ib_dma_unmap_sg(ibdev, req->sg_table.sgl, req->nents, rq_dma_dir(rq)); 1315 out_free_table: 1316 sg_free_table_chained(&req->sg_table, NVME_INLINE_SG_CNT); 1317 return ret; 1318 } 1319 1320 static void nvme_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc) 1321 { 1322 struct nvme_rdma_qe *qe = 1323 container_of(wc->wr_cqe, struct nvme_rdma_qe, cqe); 1324 struct nvme_rdma_request *req = 1325 container_of(qe, struct nvme_rdma_request, sqe); 1326 struct request *rq = blk_mq_rq_from_pdu(req); 1327 1328 if (unlikely(wc->status != IB_WC_SUCCESS)) { 1329 nvme_rdma_wr_error(cq, wc, "SEND"); 1330 return; 1331 } 1332 1333 if (refcount_dec_and_test(&req->ref)) 1334 nvme_end_request(rq, req->status, req->result); 1335 } 1336 1337 static int nvme_rdma_post_send(struct nvme_rdma_queue *queue, 1338 struct nvme_rdma_qe *qe, struct ib_sge *sge, u32 num_sge, 1339 struct ib_send_wr *first) 1340 { 1341 struct ib_send_wr wr; 1342 int ret; 1343 1344 sge->addr = 
qe->dma; 1345 sge->length = sizeof(struct nvme_command); 1346 sge->lkey = queue->device->pd->local_dma_lkey; 1347 1348 wr.next = NULL; 1349 wr.wr_cqe = &qe->cqe; 1350 wr.sg_list = sge; 1351 wr.num_sge = num_sge; 1352 wr.opcode = IB_WR_SEND; 1353 wr.send_flags = IB_SEND_SIGNALED; 1354 1355 if (first) 1356 first->next = &wr; 1357 else 1358 first = &wr; 1359 1360 ret = ib_post_send(queue->qp, first, NULL); 1361 if (unlikely(ret)) { 1362 dev_err(queue->ctrl->ctrl.device, 1363 "%s failed with error code %d\n", __func__, ret); 1364 } 1365 return ret; 1366 } 1367 1368 static int nvme_rdma_post_recv(struct nvme_rdma_queue *queue, 1369 struct nvme_rdma_qe *qe) 1370 { 1371 struct ib_recv_wr wr; 1372 struct ib_sge list; 1373 int ret; 1374 1375 list.addr = qe->dma; 1376 list.length = sizeof(struct nvme_completion); 1377 list.lkey = queue->device->pd->local_dma_lkey; 1378 1379 qe->cqe.done = nvme_rdma_recv_done; 1380 1381 wr.next = NULL; 1382 wr.wr_cqe = &qe->cqe; 1383 wr.sg_list = &list; 1384 wr.num_sge = 1; 1385 1386 ret = ib_post_recv(queue->qp, &wr, NULL); 1387 if (unlikely(ret)) { 1388 dev_err(queue->ctrl->ctrl.device, 1389 "%s failed with error code %d\n", __func__, ret); 1390 } 1391 return ret; 1392 } 1393 1394 static struct blk_mq_tags *nvme_rdma_tagset(struct nvme_rdma_queue *queue) 1395 { 1396 u32 queue_idx = nvme_rdma_queue_idx(queue); 1397 1398 if (queue_idx == 0) 1399 return queue->ctrl->admin_tag_set.tags[queue_idx]; 1400 return queue->ctrl->tag_set.tags[queue_idx - 1]; 1401 } 1402 1403 static void nvme_rdma_async_done(struct ib_cq *cq, struct ib_wc *wc) 1404 { 1405 if (unlikely(wc->status != IB_WC_SUCCESS)) 1406 nvme_rdma_wr_error(cq, wc, "ASYNC"); 1407 } 1408 1409 static void nvme_rdma_submit_async_event(struct nvme_ctrl *arg) 1410 { 1411 struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(arg); 1412 struct nvme_rdma_queue *queue = &ctrl->queues[0]; 1413 struct ib_device *dev = queue->device->dev; 1414 struct nvme_rdma_qe *sqe = &ctrl->async_event_sqe; 1415 struct nvme_command *cmd = sqe->data; 1416 struct ib_sge sge; 1417 int ret; 1418 1419 ib_dma_sync_single_for_cpu(dev, sqe->dma, sizeof(*cmd), DMA_TO_DEVICE); 1420 1421 memset(cmd, 0, sizeof(*cmd)); 1422 cmd->common.opcode = nvme_admin_async_event; 1423 cmd->common.command_id = NVME_AQ_BLK_MQ_DEPTH; 1424 cmd->common.flags |= NVME_CMD_SGL_METABUF; 1425 nvme_rdma_set_sg_null(cmd); 1426 1427 sqe->cqe.done = nvme_rdma_async_done; 1428 1429 ib_dma_sync_single_for_device(dev, sqe->dma, sizeof(*cmd), 1430 DMA_TO_DEVICE); 1431 1432 ret = nvme_rdma_post_send(queue, sqe, &sge, 1, NULL); 1433 WARN_ON_ONCE(ret); 1434 } 1435 1436 static void nvme_rdma_process_nvme_rsp(struct nvme_rdma_queue *queue, 1437 struct nvme_completion *cqe, struct ib_wc *wc) 1438 { 1439 struct request *rq; 1440 struct nvme_rdma_request *req; 1441 1442 rq = blk_mq_tag_to_rq(nvme_rdma_tagset(queue), cqe->command_id); 1443 if (!rq) { 1444 dev_err(queue->ctrl->ctrl.device, 1445 "tag 0x%x on QP %#x not found\n", 1446 cqe->command_id, queue->qp->qp_num); 1447 nvme_rdma_error_recovery(queue->ctrl); 1448 return; 1449 } 1450 req = blk_mq_rq_to_pdu(rq); 1451 1452 req->status = cqe->status; 1453 req->result = cqe->result; 1454 1455 if (wc->wc_flags & IB_WC_WITH_INVALIDATE) { 1456 if (unlikely(wc->ex.invalidate_rkey != req->mr->rkey)) { 1457 dev_err(queue->ctrl->ctrl.device, 1458 "Bogus remote invalidation for rkey %#x\n", 1459 req->mr->rkey); 1460 nvme_rdma_error_recovery(queue->ctrl); 1461 } 1462 } else if (req->mr) { 1463 int ret; 1464 1465 ret = nvme_rdma_inv_rkey(queue, req); 1466 if
(unlikely(ret < 0)) { 1467 dev_err(queue->ctrl->ctrl.device, 1468 "Queueing INV WR for rkey %#x failed (%d)\n", 1469 req->mr->rkey, ret); 1470 nvme_rdma_error_recovery(queue->ctrl); 1471 } 1472 /* the local invalidation completion will end the request */ 1473 return; 1474 } 1475 1476 if (refcount_dec_and_test(&req->ref)) 1477 nvme_end_request(rq, req->status, req->result); 1478 } 1479 1480 static void nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc) 1481 { 1482 struct nvme_rdma_qe *qe = 1483 container_of(wc->wr_cqe, struct nvme_rdma_qe, cqe); 1484 struct nvme_rdma_queue *queue = cq->cq_context; 1485 struct ib_device *ibdev = queue->device->dev; 1486 struct nvme_completion *cqe = qe->data; 1487 const size_t len = sizeof(struct nvme_completion); 1488 1489 if (unlikely(wc->status != IB_WC_SUCCESS)) { 1490 nvme_rdma_wr_error(cq, wc, "RECV"); 1491 return; 1492 } 1493 1494 ib_dma_sync_single_for_cpu(ibdev, qe->dma, len, DMA_FROM_DEVICE); 1495 /* 1496 * AEN requests are special as they don't time out and can 1497 * survive any kind of queue freeze and often don't respond to 1498 * aborts. We don't even bother to allocate a struct request 1499 * for them but rather special case them here. 1500 */ 1501 if (unlikely(nvme_is_aen_req(nvme_rdma_queue_idx(queue), 1502 cqe->command_id))) 1503 nvme_complete_async_event(&queue->ctrl->ctrl, cqe->status, 1504 &cqe->result); 1505 else 1506 nvme_rdma_process_nvme_rsp(queue, cqe, wc); 1507 ib_dma_sync_single_for_device(ibdev, qe->dma, len, DMA_FROM_DEVICE); 1508 1509 nvme_rdma_post_recv(queue, qe); 1510 } 1511 1512 static int nvme_rdma_conn_established(struct nvme_rdma_queue *queue) 1513 { 1514 int ret, i; 1515 1516 for (i = 0; i < queue->queue_size; i++) { 1517 ret = nvme_rdma_post_recv(queue, &queue->rsp_ring[i]); 1518 if (ret) 1519 goto out_destroy_queue_ib; 1520 } 1521 1522 return 0; 1523 1524 out_destroy_queue_ib: 1525 nvme_rdma_destroy_queue_ib(queue); 1526 return ret; 1527 } 1528 1529 static int nvme_rdma_conn_rejected(struct nvme_rdma_queue *queue, 1530 struct rdma_cm_event *ev) 1531 { 1532 struct rdma_cm_id *cm_id = queue->cm_id; 1533 int status = ev->status; 1534 const char *rej_msg; 1535 const struct nvme_rdma_cm_rej *rej_data; 1536 u8 rej_data_len; 1537 1538 rej_msg = rdma_reject_msg(cm_id, status); 1539 rej_data = rdma_consumer_reject_data(cm_id, ev, &rej_data_len); 1540 1541 if (rej_data && rej_data_len >= sizeof(u16)) { 1542 u16 sts = le16_to_cpu(rej_data->sts); 1543 1544 dev_err(queue->ctrl->ctrl.device, 1545 "Connect rejected: status %d (%s) nvme status %d (%s).\n", 1546 status, rej_msg, sts, nvme_rdma_cm_msg(sts)); 1547 } else { 1548 dev_err(queue->ctrl->ctrl.device, 1549 "Connect rejected: status %d (%s).\n", status, rej_msg); 1550 } 1551 1552 return -ECONNRESET; 1553 } 1554 1555 static int nvme_rdma_addr_resolved(struct nvme_rdma_queue *queue) 1556 { 1557 struct nvme_ctrl *ctrl = &queue->ctrl->ctrl; 1558 int ret; 1559 1560 ret = nvme_rdma_create_queue_ib(queue); 1561 if (ret) 1562 return ret; 1563 1564 if (ctrl->opts->tos >= 0) 1565 rdma_set_service_type(queue->cm_id, ctrl->opts->tos); 1566 ret = rdma_resolve_route(queue->cm_id, NVME_RDMA_CONNECT_TIMEOUT_MS); 1567 if (ret) { 1568 dev_err(ctrl->device, "rdma_resolve_route failed (%d).\n", 1569 queue->cm_error); 1570 goto out_destroy_queue; 1571 } 1572 1573 return 0; 1574 1575 out_destroy_queue: 1576 nvme_rdma_destroy_queue_ib(queue); 1577 return ret; 1578 } 1579 1580 static int nvme_rdma_route_resolved(struct nvme_rdma_queue *queue) 1581 { 1582 struct nvme_rdma_ctrl *ctrl = 
queue->ctrl; 1583 struct rdma_conn_param param = { }; 1584 struct nvme_rdma_cm_req priv = { }; 1585 int ret; 1586 1587 param.qp_num = queue->qp->qp_num; 1588 param.flow_control = 1; 1589 1590 param.responder_resources = queue->device->dev->attrs.max_qp_rd_atom; 1591 /* maximum retry count */ 1592 param.retry_count = 7; 1593 param.rnr_retry_count = 7; 1594 param.private_data = &priv; 1595 param.private_data_len = sizeof(priv); 1596 1597 priv.recfmt = cpu_to_le16(NVME_RDMA_CM_FMT_1_0); 1598 priv.qid = cpu_to_le16(nvme_rdma_queue_idx(queue)); 1599 /* 1600 * set the admin queue depth to the minimum size 1601 * specified by the Fabrics standard. 1602 */ 1603 if (priv.qid == 0) { 1604 priv.hrqsize = cpu_to_le16(NVME_AQ_DEPTH); 1605 priv.hsqsize = cpu_to_le16(NVME_AQ_DEPTH - 1); 1606 } else { 1607 /* 1608 * current interpretation of the fabrics spec 1609 * is at minimum you make hrqsize sqsize+1, or a 1610 * 1's based representation of sqsize. 1611 */ 1612 priv.hrqsize = cpu_to_le16(queue->queue_size); 1613 priv.hsqsize = cpu_to_le16(queue->ctrl->ctrl.sqsize); 1614 } 1615 1616 ret = rdma_connect(queue->cm_id, &param); 1617 if (ret) { 1618 dev_err(ctrl->ctrl.device, 1619 "rdma_connect failed (%d).\n", ret); 1620 goto out_destroy_queue_ib; 1621 } 1622 1623 return 0; 1624 1625 out_destroy_queue_ib: 1626 nvme_rdma_destroy_queue_ib(queue); 1627 return ret; 1628 } 1629 1630 static int nvme_rdma_cm_handler(struct rdma_cm_id *cm_id, 1631 struct rdma_cm_event *ev) 1632 { 1633 struct nvme_rdma_queue *queue = cm_id->context; 1634 int cm_error = 0; 1635 1636 dev_dbg(queue->ctrl->ctrl.device, "%s (%d): status %d id %p\n", 1637 rdma_event_msg(ev->event), ev->event, 1638 ev->status, cm_id); 1639 1640 switch (ev->event) { 1641 case RDMA_CM_EVENT_ADDR_RESOLVED: 1642 cm_error = nvme_rdma_addr_resolved(queue); 1643 break; 1644 case RDMA_CM_EVENT_ROUTE_RESOLVED: 1645 cm_error = nvme_rdma_route_resolved(queue); 1646 break; 1647 case RDMA_CM_EVENT_ESTABLISHED: 1648 queue->cm_error = nvme_rdma_conn_established(queue); 1649 /* complete cm_done regardless of success/failure */ 1650 complete(&queue->cm_done); 1651 return 0; 1652 case RDMA_CM_EVENT_REJECTED: 1653 nvme_rdma_destroy_queue_ib(queue); 1654 cm_error = nvme_rdma_conn_rejected(queue, ev); 1655 break; 1656 case RDMA_CM_EVENT_ROUTE_ERROR: 1657 case RDMA_CM_EVENT_CONNECT_ERROR: 1658 case RDMA_CM_EVENT_UNREACHABLE: 1659 nvme_rdma_destroy_queue_ib(queue); 1660 /* fall through */ 1661 case RDMA_CM_EVENT_ADDR_ERROR: 1662 dev_dbg(queue->ctrl->ctrl.device, 1663 "CM error event %d\n", ev->event); 1664 cm_error = -ECONNRESET; 1665 break; 1666 case RDMA_CM_EVENT_DISCONNECTED: 1667 case RDMA_CM_EVENT_ADDR_CHANGE: 1668 case RDMA_CM_EVENT_TIMEWAIT_EXIT: 1669 dev_dbg(queue->ctrl->ctrl.device, 1670 "disconnect received - connection closed\n"); 1671 nvme_rdma_error_recovery(queue->ctrl); 1672 break; 1673 case RDMA_CM_EVENT_DEVICE_REMOVAL: 1674 /* device removal is handled via the ib_client API */ 1675 break; 1676 default: 1677 dev_err(queue->ctrl->ctrl.device, 1678 "Unexpected RDMA CM event (%d)\n", ev->event); 1679 nvme_rdma_error_recovery(queue->ctrl); 1680 break; 1681 } 1682 1683 if (cm_error) { 1684 queue->cm_error = cm_error; 1685 complete(&queue->cm_done); 1686 } 1687 1688 return 0; 1689 } 1690 1691 static enum blk_eh_timer_return 1692 nvme_rdma_timeout(struct request *rq, bool reserved) 1693 { 1694 struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); 1695 struct nvme_rdma_queue *queue = req->queue; 1696 struct nvme_rdma_ctrl *ctrl = queue->ctrl; 1697 1698
dev_warn(ctrl->ctrl.device, "I/O %d QID %d timeout\n", 1699 rq->tag, nvme_rdma_queue_idx(queue)); 1700 1701 /* 1702 * Restart the timer if a controller reset is already scheduled. Any 1703 * timed out commands would be handled before entering the connecting 1704 * state. 1705 */ 1706 if (ctrl->ctrl.state == NVME_CTRL_RESETTING) 1707 return BLK_EH_RESET_TIMER; 1708 1709 if (ctrl->ctrl.state != NVME_CTRL_LIVE) { 1710 /* 1711 * Teardown immediately if the controller times out while starting 1712 * or if we have already started error recovery. All outstanding 1713 * requests are completed on shutdown, so we return BLK_EH_DONE. 1714 */ 1715 flush_work(&ctrl->err_work); 1716 nvme_rdma_teardown_io_queues(ctrl, false); 1717 nvme_rdma_teardown_admin_queue(ctrl, false); 1718 return BLK_EH_DONE; 1719 } 1720 1721 dev_warn(ctrl->ctrl.device, "starting error recovery\n"); 1722 nvme_rdma_error_recovery(ctrl); 1723 1724 return BLK_EH_RESET_TIMER; 1725 } 1726 1727 static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx, 1728 const struct blk_mq_queue_data *bd) 1729 { 1730 struct nvme_ns *ns = hctx->queue->queuedata; 1731 struct nvme_rdma_queue *queue = hctx->driver_data; 1732 struct request *rq = bd->rq; 1733 struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); 1734 struct nvme_rdma_qe *sqe = &req->sqe; 1735 struct nvme_command *c = sqe->data; 1736 struct ib_device *dev; 1737 bool queue_ready = test_bit(NVME_RDMA_Q_LIVE, &queue->flags); 1738 blk_status_t ret; 1739 int err; 1740 1741 WARN_ON_ONCE(rq->tag < 0); 1742 1743 if (!nvmf_check_ready(&queue->ctrl->ctrl, rq, queue_ready)) 1744 return nvmf_fail_nonready_command(&queue->ctrl->ctrl, rq); 1745 1746 dev = queue->device->dev; 1747 1748 req->sqe.dma = ib_dma_map_single(dev, req->sqe.data, 1749 sizeof(struct nvme_command), 1750 DMA_TO_DEVICE); 1751 err = ib_dma_mapping_error(dev, req->sqe.dma); 1752 if (unlikely(err)) 1753 return BLK_STS_RESOURCE; 1754 1755 ib_dma_sync_single_for_cpu(dev, sqe->dma, 1756 sizeof(struct nvme_command), DMA_TO_DEVICE); 1757 1758 ret = nvme_setup_cmd(ns, rq, c); 1759 if (ret) 1760 goto unmap_qe; 1761 1762 blk_mq_start_request(rq); 1763 1764 err = nvme_rdma_map_data(queue, rq, c); 1765 if (unlikely(err < 0)) { 1766 dev_err(queue->ctrl->ctrl.device, 1767 "Failed to map data (%d)\n", err); 1768 goto err; 1769 } 1770 1771 sqe->cqe.done = nvme_rdma_send_done; 1772 1773 ib_dma_sync_single_for_device(dev, sqe->dma, 1774 sizeof(struct nvme_command), DMA_TO_DEVICE); 1775 1776 err = nvme_rdma_post_send(queue, sqe, req->sge, req->num_sge, 1777 req->mr ?
&req->reg_wr.wr : NULL); 1778 if (unlikely(err)) 1779 goto err_unmap; 1780 1781 return BLK_STS_OK; 1782 1783 err_unmap: 1784 nvme_rdma_unmap_data(queue, rq); 1785 err: 1786 if (err == -ENOMEM || err == -EAGAIN) 1787 ret = BLK_STS_RESOURCE; 1788 else 1789 ret = BLK_STS_IOERR; 1790 nvme_cleanup_cmd(rq); 1791 unmap_qe: 1792 ib_dma_unmap_single(dev, req->sqe.dma, sizeof(struct nvme_command), 1793 DMA_TO_DEVICE); 1794 return ret; 1795 } 1796 1797 static int nvme_rdma_poll(struct blk_mq_hw_ctx *hctx) 1798 { 1799 struct nvme_rdma_queue *queue = hctx->driver_data; 1800 1801 return ib_process_cq_direct(queue->ib_cq, -1); 1802 } 1803 1804 static void nvme_rdma_complete_rq(struct request *rq) 1805 { 1806 struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); 1807 struct nvme_rdma_queue *queue = req->queue; 1808 struct ib_device *ibdev = queue->device->dev; 1809 1810 nvme_rdma_unmap_data(queue, rq); 1811 ib_dma_unmap_single(ibdev, req->sqe.dma, sizeof(struct nvme_command), 1812 DMA_TO_DEVICE); 1813 nvme_complete_rq(rq); 1814 } 1815 1816 static int nvme_rdma_map_queues(struct blk_mq_tag_set *set) 1817 { 1818 struct nvme_rdma_ctrl *ctrl = set->driver_data; 1819 struct nvmf_ctrl_options *opts = ctrl->ctrl.opts; 1820 1821 if (opts->nr_write_queues && ctrl->io_queues[HCTX_TYPE_READ]) { 1822 /* separate read/write queues */ 1823 set->map[HCTX_TYPE_DEFAULT].nr_queues = 1824 ctrl->io_queues[HCTX_TYPE_DEFAULT]; 1825 set->map[HCTX_TYPE_DEFAULT].queue_offset = 0; 1826 set->map[HCTX_TYPE_READ].nr_queues = 1827 ctrl->io_queues[HCTX_TYPE_READ]; 1828 set->map[HCTX_TYPE_READ].queue_offset = 1829 ctrl->io_queues[HCTX_TYPE_DEFAULT]; 1830 } else { 1831 /* shared read/write queues */ 1832 set->map[HCTX_TYPE_DEFAULT].nr_queues = 1833 ctrl->io_queues[HCTX_TYPE_DEFAULT]; 1834 set->map[HCTX_TYPE_DEFAULT].queue_offset = 0; 1835 set->map[HCTX_TYPE_READ].nr_queues = 1836 ctrl->io_queues[HCTX_TYPE_DEFAULT]; 1837 set->map[HCTX_TYPE_READ].queue_offset = 0; 1838 } 1839 blk_mq_rdma_map_queues(&set->map[HCTX_TYPE_DEFAULT], 1840 ctrl->device->dev, 0); 1841 blk_mq_rdma_map_queues(&set->map[HCTX_TYPE_READ], 1842 ctrl->device->dev, 0); 1843 1844 if (opts->nr_poll_queues && ctrl->io_queues[HCTX_TYPE_POLL]) { 1845 /* map dedicated poll queues only if we have queues left */ 1846 set->map[HCTX_TYPE_POLL].nr_queues = 1847 ctrl->io_queues[HCTX_TYPE_POLL]; 1848 set->map[HCTX_TYPE_POLL].queue_offset = 1849 ctrl->io_queues[HCTX_TYPE_DEFAULT] + 1850 ctrl->io_queues[HCTX_TYPE_READ]; 1851 blk_mq_map_queues(&set->map[HCTX_TYPE_POLL]); 1852 } 1853 1854 dev_info(ctrl->ctrl.device, 1855 "mapped %d/%d/%d default/read/poll queues.\n", 1856 ctrl->io_queues[HCTX_TYPE_DEFAULT], 1857 ctrl->io_queues[HCTX_TYPE_READ], 1858 ctrl->io_queues[HCTX_TYPE_POLL]); 1859 1860 return 0; 1861 } 1862 1863 static const struct blk_mq_ops nvme_rdma_mq_ops = { 1864 .queue_rq = nvme_rdma_queue_rq, 1865 .complete = nvme_rdma_complete_rq, 1866 .init_request = nvme_rdma_init_request, 1867 .exit_request = nvme_rdma_exit_request, 1868 .init_hctx = nvme_rdma_init_hctx, 1869 .timeout = nvme_rdma_timeout, 1870 .map_queues = nvme_rdma_map_queues, 1871 .poll = nvme_rdma_poll, 1872 }; 1873 1874 static const struct blk_mq_ops nvme_rdma_admin_mq_ops = { 1875 .queue_rq = nvme_rdma_queue_rq, 1876 .complete = nvme_rdma_complete_rq, 1877 .init_request = nvme_rdma_init_request, 1878 .exit_request = nvme_rdma_exit_request, 1879 .init_hctx = nvme_rdma_init_admin_hctx, 1880 .timeout = nvme_rdma_timeout, 1881 }; 1882 1883 static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown) 
1884 { 1885 cancel_work_sync(&ctrl->err_work); 1886 cancel_delayed_work_sync(&ctrl->reconnect_work); 1887 1888 nvme_rdma_teardown_io_queues(ctrl, shutdown); 1889 blk_mq_quiesce_queue(ctrl->ctrl.admin_q); 1890 if (shutdown) 1891 nvme_shutdown_ctrl(&ctrl->ctrl); 1892 else 1893 nvme_disable_ctrl(&ctrl->ctrl); 1894 nvme_rdma_teardown_admin_queue(ctrl, shutdown); 1895 } 1896 1897 static void nvme_rdma_delete_ctrl(struct nvme_ctrl *ctrl) 1898 { 1899 nvme_rdma_shutdown_ctrl(to_rdma_ctrl(ctrl), true); 1900 } 1901 1902 static void nvme_rdma_reset_ctrl_work(struct work_struct *work) 1903 { 1904 struct nvme_rdma_ctrl *ctrl = 1905 container_of(work, struct nvme_rdma_ctrl, ctrl.reset_work); 1906 1907 nvme_stop_ctrl(&ctrl->ctrl); 1908 nvme_rdma_shutdown_ctrl(ctrl, false); 1909 1910 if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) { 1911 /* state change failure should never happen */ 1912 WARN_ON_ONCE(1); 1913 return; 1914 } 1915 1916 if (nvme_rdma_setup_ctrl(ctrl, false)) 1917 goto out_fail; 1918 1919 return; 1920 1921 out_fail: 1922 ++ctrl->ctrl.nr_reconnects; 1923 nvme_rdma_reconnect_or_remove(ctrl); 1924 } 1925 1926 static const struct nvme_ctrl_ops nvme_rdma_ctrl_ops = { 1927 .name = "rdma", 1928 .module = THIS_MODULE, 1929 .flags = NVME_F_FABRICS, 1930 .reg_read32 = nvmf_reg_read32, 1931 .reg_read64 = nvmf_reg_read64, 1932 .reg_write32 = nvmf_reg_write32, 1933 .free_ctrl = nvme_rdma_free_ctrl, 1934 .submit_async_event = nvme_rdma_submit_async_event, 1935 .delete_ctrl = nvme_rdma_delete_ctrl, 1936 .get_address = nvmf_get_address, 1937 }; 1938 1939 /* 1940 * Fails a connection request if it matches an existing controller 1941 * (association) with the same tuple: 1942 * <Host NQN, Host ID, local address, remote address, remote port, SUBSYS NQN> 1943 * 1944 * if local address is not specified in the request, it will match an 1945 * existing controller with all the other parameters the same and no 1946 * local port address specified as well. 1947 * 1948 * The ports don't need to be compared as they are intrinsically 1949 * already matched by the port pointers supplied. 
1950 */ 1951 static bool 1952 nvme_rdma_existing_controller(struct nvmf_ctrl_options *opts) 1953 { 1954 struct nvme_rdma_ctrl *ctrl; 1955 bool found = false; 1956 1957 mutex_lock(&nvme_rdma_ctrl_mutex); 1958 list_for_each_entry(ctrl, &nvme_rdma_ctrl_list, list) { 1959 found = nvmf_ip_options_match(&ctrl->ctrl, opts); 1960 if (found) 1961 break; 1962 } 1963 mutex_unlock(&nvme_rdma_ctrl_mutex); 1964 1965 return found; 1966 } 1967 1968 static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev, 1969 struct nvmf_ctrl_options *opts) 1970 { 1971 struct nvme_rdma_ctrl *ctrl; 1972 int ret; 1973 bool changed; 1974 1975 ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL); 1976 if (!ctrl) 1977 return ERR_PTR(-ENOMEM); 1978 ctrl->ctrl.opts = opts; 1979 INIT_LIST_HEAD(&ctrl->list); 1980 1981 if (!(opts->mask & NVMF_OPT_TRSVCID)) { 1982 opts->trsvcid = 1983 kstrdup(__stringify(NVME_RDMA_IP_PORT), GFP_KERNEL); 1984 if (!opts->trsvcid) { 1985 ret = -ENOMEM; 1986 goto out_free_ctrl; 1987 } 1988 opts->mask |= NVMF_OPT_TRSVCID; 1989 } 1990 1991 ret = inet_pton_with_scope(&init_net, AF_UNSPEC, 1992 opts->traddr, opts->trsvcid, &ctrl->addr); 1993 if (ret) { 1994 pr_err("malformed address passed: %s:%s\n", 1995 opts->traddr, opts->trsvcid); 1996 goto out_free_ctrl; 1997 } 1998 1999 if (opts->mask & NVMF_OPT_HOST_TRADDR) { 2000 ret = inet_pton_with_scope(&init_net, AF_UNSPEC, 2001 opts->host_traddr, NULL, &ctrl->src_addr); 2002 if (ret) { 2003 pr_err("malformed src address passed: %s\n", 2004 opts->host_traddr); 2005 goto out_free_ctrl; 2006 } 2007 } 2008 2009 if (!opts->duplicate_connect && nvme_rdma_existing_controller(opts)) { 2010 ret = -EALREADY; 2011 goto out_free_ctrl; 2012 } 2013 2014 INIT_DELAYED_WORK(&ctrl->reconnect_work, 2015 nvme_rdma_reconnect_ctrl_work); 2016 INIT_WORK(&ctrl->err_work, nvme_rdma_error_recovery_work); 2017 INIT_WORK(&ctrl->ctrl.reset_work, nvme_rdma_reset_ctrl_work); 2018 2019 ctrl->ctrl.queue_count = opts->nr_io_queues + opts->nr_write_queues + 2020 opts->nr_poll_queues + 1; 2021 ctrl->ctrl.sqsize = opts->queue_size - 1; 2022 ctrl->ctrl.kato = opts->kato; 2023 2024 ret = -ENOMEM; 2025 ctrl->queues = kcalloc(ctrl->ctrl.queue_count, sizeof(*ctrl->queues), 2026 GFP_KERNEL); 2027 if (!ctrl->queues) 2028 goto out_free_ctrl; 2029 2030 ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_rdma_ctrl_ops, 2031 0 /* no quirks, we're perfect! 
*/); 2032 if (ret) 2033 goto out_kfree_queues; 2034 2035 changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING); 2036 WARN_ON_ONCE(!changed); 2037 2038 ret = nvme_rdma_setup_ctrl(ctrl, true); 2039 if (ret) 2040 goto out_uninit_ctrl; 2041 2042 dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISpcs\n", 2043 ctrl->ctrl.opts->subsysnqn, &ctrl->addr); 2044 2045 mutex_lock(&nvme_rdma_ctrl_mutex); 2046 list_add_tail(&ctrl->list, &nvme_rdma_ctrl_list); 2047 mutex_unlock(&nvme_rdma_ctrl_mutex); 2048 2049 return &ctrl->ctrl; 2050 2051 out_uninit_ctrl: 2052 nvme_uninit_ctrl(&ctrl->ctrl); 2053 nvme_put_ctrl(&ctrl->ctrl); 2054 if (ret > 0) 2055 ret = -EIO; 2056 return ERR_PTR(ret); 2057 out_kfree_queues: 2058 kfree(ctrl->queues); 2059 out_free_ctrl: 2060 kfree(ctrl); 2061 return ERR_PTR(ret); 2062 } 2063 2064 static struct nvmf_transport_ops nvme_rdma_transport = { 2065 .name = "rdma", 2066 .module = THIS_MODULE, 2067 .required_opts = NVMF_OPT_TRADDR, 2068 .allowed_opts = NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY | 2069 NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO | 2070 NVMF_OPT_NR_WRITE_QUEUES | NVMF_OPT_NR_POLL_QUEUES | 2071 NVMF_OPT_TOS, 2072 .create_ctrl = nvme_rdma_create_ctrl, 2073 }; 2074 2075 static void nvme_rdma_remove_one(struct ib_device *ib_device, void *client_data) 2076 { 2077 struct nvme_rdma_ctrl *ctrl; 2078 struct nvme_rdma_device *ndev; 2079 bool found = false; 2080 2081 mutex_lock(&device_list_mutex); 2082 list_for_each_entry(ndev, &device_list, entry) { 2083 if (ndev->dev == ib_device) { 2084 found = true; 2085 break; 2086 } 2087 } 2088 mutex_unlock(&device_list_mutex); 2089 2090 if (!found) 2091 return; 2092 2093 /* Delete all controllers using this device */ 2094 mutex_lock(&nvme_rdma_ctrl_mutex); 2095 list_for_each_entry(ctrl, &nvme_rdma_ctrl_list, list) { 2096 if (ctrl->device->dev != ib_device) 2097 continue; 2098 nvme_delete_ctrl(&ctrl->ctrl); 2099 } 2100 mutex_unlock(&nvme_rdma_ctrl_mutex); 2101 2102 flush_workqueue(nvme_delete_wq); 2103 } 2104 2105 static struct ib_client nvme_rdma_ib_client = { 2106 .name = "nvme_rdma", 2107 .remove = nvme_rdma_remove_one 2108 }; 2109 2110 static int __init nvme_rdma_init_module(void) 2111 { 2112 int ret; 2113 2114 ret = ib_register_client(&nvme_rdma_ib_client); 2115 if (ret) 2116 return ret; 2117 2118 ret = nvmf_register_transport(&nvme_rdma_transport); 2119 if (ret) 2120 goto err_unreg_client; 2121 2122 return 0; 2123 2124 err_unreg_client: 2125 ib_unregister_client(&nvme_rdma_ib_client); 2126 return ret; 2127 } 2128 2129 static void __exit nvme_rdma_cleanup_module(void) 2130 { 2131 struct nvme_rdma_ctrl *ctrl; 2132 2133 nvmf_unregister_transport(&nvme_rdma_transport); 2134 ib_unregister_client(&nvme_rdma_ib_client); 2135 2136 mutex_lock(&nvme_rdma_ctrl_mutex); 2137 list_for_each_entry(ctrl, &nvme_rdma_ctrl_list, list) 2138 nvme_delete_ctrl(&ctrl->ctrl); 2139 mutex_unlock(&nvme_rdma_ctrl_mutex); 2140 flush_workqueue(nvme_delete_wq); 2141 } 2142 2143 module_init(nvme_rdma_init_module); 2144 module_exit(nvme_rdma_cleanup_module); 2145 2146 MODULE_LICENSE("GPL v2"); 2147
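/*
 * Illustrative usage: a host controller is normally created through the
 * nvme-fabrics interface, which calls into the .create_ctrl handler of the
 * registered nvme_rdma_transport, i.e. nvme_rdma_create_ctrl(). With
 * nvme-cli this could look like the following (address, port and NQN are
 * placeholders):
 *
 *   nvme connect -t rdma -a 192.168.1.10 -s 4420 -n nqn.2016-06.io.example:subsys1
 */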