/*
 * QEMU paravirtual RDMA - Generic RDMA backend
 *
 * Copyright (C) 2018 Oracle
 * Copyright (C) 2018 Red Hat Inc
 *
 * Authors:
 *     Yuval Shaia <yuval.shaia@oracle.com>
 *     Marcel Apfelbaum <marcel@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 *
 */

#include "qemu/osdep.h"
#include "qapi/qapi-events-rdma.h"

#include <infiniband/verbs.h>

#include "contrib/rdmacm-mux/rdmacm-mux.h"
#include "trace.h"
#include "rdma_utils.h"
#include "rdma_rm.h"
#include "rdma_backend.h"

#define THR_NAME_LEN 16
#define THR_POLL_TO 5000

#define MAD_HDR_SIZE sizeof(struct ibv_grh)

typedef struct BackendCtx {
    void *up_ctx;
    struct ibv_sge sge; /* Used to save MAD recv buffer */
    RdmaBackendQP *backend_qp; /* To maintain recv buffers */
    RdmaBackendSRQ *backend_srq;
} BackendCtx;

struct backend_umad {
    struct ib_user_mad hdr;
    char mad[RDMA_MAX_PRIVATE_DATA];
};

static void (*comp_handler)(void *ctx, struct ibv_wc *wc);

static void dummy_comp_handler(void *ctx, struct ibv_wc *wc)
{
    rdma_error_report("No completion handler is registered");
}

static inline void complete_work(enum ibv_wc_status status, uint32_t vendor_err,
                                 void *ctx)
{
    struct ibv_wc wc = {};

    wc.status = status;
    wc.vendor_err = vendor_err;

    comp_handler(ctx, &wc);
}

static void free_cqe_ctx(gpointer data, gpointer user_data)
{
    BackendCtx *bctx;
    RdmaDeviceResources *rdma_dev_res = user_data;
    unsigned long cqe_ctx_id = GPOINTER_TO_INT(data);

    bctx = rdma_rm_get_cqe_ctx(rdma_dev_res, cqe_ctx_id);
    if (bctx) {
        rdma_rm_dealloc_cqe_ctx(rdma_dev_res, cqe_ctx_id);
        atomic_dec(&rdma_dev_res->stats.missing_cqe);
    }
    g_free(bctx);
}

static void clean_recv_mads(RdmaBackendDev *backend_dev)
{
    unsigned long cqe_ctx_id;

    do {
        cqe_ctx_id = rdma_protected_qlist_pop_int64(&backend_dev->
                                                    recv_mads_list);
        if (cqe_ctx_id != -ENOENT) {
            atomic_inc(&backend_dev->rdma_dev_res->stats.missing_cqe);
            free_cqe_ctx(GINT_TO_POINTER(cqe_ctx_id),
                         backend_dev->rdma_dev_res);
        }
    } while (cqe_ctx_id != -ENOENT);
}

static int rdma_poll_cq(RdmaDeviceResources *rdma_dev_res, struct ibv_cq *ibcq)
{
    int i, ne, total_ne = 0;
    BackendCtx *bctx;
    struct ibv_wc wc[2];
    RdmaProtectedGSList *cqe_ctx_list;

    qemu_mutex_lock(&rdma_dev_res->lock);
    do {
        ne = ibv_poll_cq(ibcq, ARRAY_SIZE(wc), wc);

        trace_rdma_poll_cq(ne, ibcq);

        for (i = 0; i < ne; i++) {
            bctx = rdma_rm_get_cqe_ctx(rdma_dev_res, wc[i].wr_id);
            if (unlikely(!bctx)) {
                rdma_error_report("No matching ctx for req %"PRId64,
                                  wc[i].wr_id);
                continue;
            }

            comp_handler(bctx->up_ctx, &wc[i]);

            if (bctx->backend_qp) {
                cqe_ctx_list = &bctx->backend_qp->cqe_ctx_list;
            } else {
                cqe_ctx_list = &bctx->backend_srq->cqe_ctx_list;
            }

            rdma_protected_gslist_remove_int32(cqe_ctx_list, wc[i].wr_id);
            rdma_rm_dealloc_cqe_ctx(rdma_dev_res, wc[i].wr_id);
            g_free(bctx);
        }
        total_ne += ne;
    } while (ne > 0);
    atomic_sub(&rdma_dev_res->stats.missing_cqe, total_ne);
    qemu_mutex_unlock(&rdma_dev_res->lock);

    if (ne < 0) {
        rdma_error_report("ibv_poll_cq fail, rc=%d, errno=%d", ne, errno);
    }

    rdma_dev_res->stats.completions += total_ne;

    return total_ne;
}

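/*
 * Completion-handler thread: waits (with a timeout) for events on the
 * device's completion channel, re-arms the CQ notification, drains the CQ
 * and forwards each work completion to the registered comp_handler.
 */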
static void *comp_handler_thread(void *arg)
{
    RdmaBackendDev *backend_dev = (RdmaBackendDev *)arg;
    int rc;
    struct ibv_cq *ev_cq;
    void *ev_ctx;
    int flags;
    GPollFD pfds[1];

    /* Change to non-blocking mode */
    flags = fcntl(backend_dev->channel->fd, F_GETFL);
    rc = fcntl(backend_dev->channel->fd, F_SETFL, flags | O_NONBLOCK);
    if (rc < 0) {
        rdma_error_report("Failed to change backend channel FD to non-blocking");
        return NULL;
    }

    pfds[0].fd = backend_dev->channel->fd;
    pfds[0].events = G_IO_IN | G_IO_HUP | G_IO_ERR;

    backend_dev->comp_thread.is_running = true;

    while (backend_dev->comp_thread.run) {
        do {
            rc = qemu_poll_ns(pfds, 1, THR_POLL_TO * (int64_t)SCALE_MS);
            if (!rc) {
                backend_dev->rdma_dev_res->stats.poll_cq_ppoll_to++;
            }
        } while (!rc && backend_dev->comp_thread.run);

        if (backend_dev->comp_thread.run) {
            rc = ibv_get_cq_event(backend_dev->channel, &ev_cq, &ev_ctx);
            if (unlikely(rc)) {
                rdma_error_report("ibv_get_cq_event fail, rc=%d, errno=%d", rc,
                                  errno);
                continue;
            }

            rc = ibv_req_notify_cq(ev_cq, 0);
            if (unlikely(rc)) {
                rdma_error_report("ibv_req_notify_cq fail, rc=%d, errno=%d", rc,
                                  errno);
            }

            backend_dev->rdma_dev_res->stats.poll_cq_from_bk++;
            rdma_poll_cq(backend_dev->rdma_dev_res, ev_cq);

            ibv_ack_cq_events(ev_cq, 1);
        }
    }

    backend_dev->comp_thread.is_running = false;

    qemu_thread_exit(0);

    return NULL;
}

static inline void disable_rdmacm_mux_async(RdmaBackendDev *backend_dev)
{
    atomic_set(&backend_dev->rdmacm_mux.can_receive, 0);
}

static inline void enable_rdmacm_mux_async(RdmaBackendDev *backend_dev)
{
    atomic_set(&backend_dev->rdmacm_mux.can_receive, sizeof(RdmaCmMuxMsg));
}

static inline int rdmacm_mux_can_process_async(RdmaBackendDev *backend_dev)
{
    return atomic_read(&backend_dev->rdmacm_mux.can_receive);
}

static int rdmacm_mux_check_op_status(CharBackend *mad_chr_be)
{
    RdmaCmMuxMsg msg = {};
    int ret;

    ret = qemu_chr_fe_read_all(mad_chr_be, (uint8_t *)&msg, sizeof(msg));
    if (ret != sizeof(msg)) {
        rdma_error_report("Got invalid message from mux: size %d, expecting %d",
                          ret, (int)sizeof(msg));
        return -EIO;
    }

    trace_rdmacm_mux_check_op_status(msg.hdr.msg_type, msg.hdr.op_code,
                                     msg.hdr.err_code);

    if (msg.hdr.msg_type != RDMACM_MUX_MSG_TYPE_RESP) {
        rdma_error_report("Got invalid message type %d", msg.hdr.msg_type);
        return -EIO;
    }

    if (msg.hdr.err_code != RDMACM_MUX_ERR_CODE_OK) {
        rdma_error_report("Operation failed in mux, error code %d",
                          msg.hdr.err_code);
        return -EIO;
    }

    return 0;
}

static int rdmacm_mux_send(RdmaBackendDev *backend_dev, RdmaCmMuxMsg *msg)
{
    int rc = 0;

    msg->hdr.msg_type = RDMACM_MUX_MSG_TYPE_REQ;
    trace_rdmacm_mux("send", msg->hdr.msg_type, msg->hdr.op_code);
    disable_rdmacm_mux_async(backend_dev);
    rc = qemu_chr_fe_write(backend_dev->rdmacm_mux.chr_be,
                           (const uint8_t *)msg, sizeof(*msg));
    if (rc != sizeof(*msg)) {
        enable_rdmacm_mux_async(backend_dev);
        rdma_error_report("Failed to send request to rdmacm_mux (rc=%d)", rc);
        return -EIO;
    }

    rc = rdmacm_mux_check_op_status(backend_dev->rdmacm_mux.chr_be);
    if (rc) {
        rdma_error_report("Failed to execute rdmacm_mux request %d (rc=%d)",
                          msg->hdr.op_code, rc);
    }

    enable_rdmacm_mux_async(backend_dev);

    return 0;
}

static void stop_backend_thread(RdmaBackendThread *thread)
{
    thread->run = false;
    while (thread->is_running) {
        sleep(THR_POLL_TO / SCALE_US / 2);
    }
}

static void start_comp_thread(RdmaBackendDev *backend_dev)
{
    char thread_name[THR_NAME_LEN] = {};

    stop_backend_thread(&backend_dev->comp_thread);

    snprintf(thread_name, sizeof(thread_name), "rdma_comp_%s",
             ibv_get_device_name(backend_dev->ib_dev));
    backend_dev->comp_thread.run = true;
    qemu_thread_create(&backend_dev->comp_thread.thread, thread_name,
                       comp_handler_thread, backend_dev, QEMU_THREAD_DETACHED);
}

void rdma_backend_register_comp_handler(void (*handler)(void *ctx,
                                                        struct ibv_wc *wc))
{
    comp_handler = handler;
}

void rdma_backend_unregister_comp_handler(void)
{
    rdma_backend_register_comp_handler(dummy_comp_handler);
}

int rdma_backend_query_port(RdmaBackendDev *backend_dev,
                            struct ibv_port_attr *port_attr)
{
    int rc;

    rc = ibv_query_port(backend_dev->context, backend_dev->port_num, port_attr);
    if (rc) {
        rdma_error_report("ibv_query_port fail, rc=%d, errno=%d", rc, errno);
        return -EIO;
    }

    return 0;
}

void rdma_backend_poll_cq(RdmaDeviceResources *rdma_dev_res, RdmaBackendCQ *cq)
{
    int polled;

    rdma_dev_res->stats.poll_cq_from_guest++;
    polled = rdma_poll_cq(rdma_dev_res, cq->ibcq);
    if (!polled) {
        rdma_dev_res->stats.poll_cq_from_guest_empty++;
    }
}

static GHashTable *ah_hash;

static struct ibv_ah *create_ah(RdmaBackendDev *backend_dev, struct ibv_pd *pd,
                                uint8_t sgid_idx, union ibv_gid *dgid)
{
    GBytes *ah_key = g_bytes_new(dgid, sizeof(*dgid));
    struct ibv_ah *ah = g_hash_table_lookup(ah_hash, ah_key);

    if (ah) {
        trace_rdma_create_ah_cache_hit(be64_to_cpu(dgid->global.subnet_prefix),
                                       be64_to_cpu(dgid->global.interface_id));
        g_bytes_unref(ah_key);
    } else {
        struct ibv_ah_attr ah_attr = {
            .is_global = 1,
            .port_num = backend_dev->port_num,
            .grh.hop_limit = 1,
        };

        ah_attr.grh.dgid = *dgid;
        ah_attr.grh.sgid_index = sgid_idx;

        ah = ibv_create_ah(pd, &ah_attr);
        if (ah) {
            g_hash_table_insert(ah_hash, ah_key, ah);
        } else {
            g_bytes_unref(ah_key);
            rdma_error_report("Failed to create AH for gid <0x%" PRIx64", 0x%"PRIx64">",
                              be64_to_cpu(dgid->global.subnet_prefix),
                              be64_to_cpu(dgid->global.interface_id));
        }

        trace_rdma_create_ah_cache_miss(be64_to_cpu(dgid->global.subnet_prefix),
                                        be64_to_cpu(dgid->global.interface_id));
    }

    return ah;
}

static void destroy_ah_hash_key(gpointer data)
{
    g_bytes_unref(data);
}

static void destroy_ah_hash_data(gpointer data)
{
    struct ibv_ah *ah = data;

    ibv_destroy_ah(ah);
}

static void ah_cache_init(void)
{
    ah_hash = g_hash_table_new_full(g_bytes_hash, g_bytes_equal,
                                    destroy_ah_hash_key, destroy_ah_hash_data);
}

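/*
 * Translate a guest scatter/gather list into a host one: each guest lkey is
 * resolved to its MR so that the host virtual address and the backend lkey
 * can be used in the work request posted to the real device.  The combined
 * length of all elements is accumulated into *total_length.
 */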
static int build_host_sge_array(RdmaDeviceResources *rdma_dev_res,
                                struct ibv_sge *dsge, struct ibv_sge *ssge,
                                uint8_t num_sge, uint64_t *total_length)
{
    RdmaRmMR *mr;
    int ssge_idx;

    for (ssge_idx = 0; ssge_idx < num_sge; ssge_idx++) {
        mr = rdma_rm_get_mr(rdma_dev_res, ssge[ssge_idx].lkey);
        if (unlikely(!mr)) {
            rdma_error_report("Invalid lkey 0x%x", ssge[ssge_idx].lkey);
            return VENDOR_ERR_INVLKEY | ssge[ssge_idx].lkey;
        }

        dsge->addr = (uintptr_t)mr->virt + ssge[ssge_idx].addr - mr->start;
        dsge->length = ssge[ssge_idx].length;
        dsge->lkey = rdma_backend_mr_lkey(&mr->backend_mr);

        *total_length += dsge->length;

        dsge++;
    }

    return 0;
}

static void trace_mad_message(const char *title, char *buf, int len)
{
    int i;
    char *b = g_malloc0(len * 3 + 1);
    char b1[4];

    for (i = 0; i < len; i++) {
        sprintf(b1, "%.2X ", buf[i] & 0x000000FF);
        strcat(b, b1);
    }

    trace_rdma_mad_message(title, len, b);

    g_free(b);
}

static int mad_send(RdmaBackendDev *backend_dev, uint8_t sgid_idx,
                    union ibv_gid *sgid, struct ibv_sge *sge, uint32_t num_sge)
{
    RdmaCmMuxMsg msg = {};
    char *hdr, *data;
    int ret;

    if (num_sge != 2) {
        return -EINVAL;
    }

    msg.hdr.op_code = RDMACM_MUX_OP_CODE_MAD;
    memcpy(msg.hdr.sgid.raw, sgid->raw, sizeof(msg.hdr.sgid));

    msg.umad_len = sge[0].length + sge[1].length;

    if (msg.umad_len > sizeof(msg.umad.mad)) {
        return -ENOMEM;
    }

    msg.umad.hdr.addr.qpn = htobe32(1);
    msg.umad.hdr.addr.grh_present = 1;
    msg.umad.hdr.addr.gid_index = sgid_idx;
    memcpy(msg.umad.hdr.addr.gid, sgid->raw, sizeof(msg.umad.hdr.addr.gid));
    msg.umad.hdr.addr.hop_limit = 0xFF;

    hdr = rdma_pci_dma_map(backend_dev->dev, sge[0].addr, sge[0].length);
    if (!hdr) {
        return -ENOMEM;
    }
    data = rdma_pci_dma_map(backend_dev->dev, sge[1].addr, sge[1].length);
    if (!data) {
        rdma_pci_dma_unmap(backend_dev->dev, hdr, sge[0].length);
        return -ENOMEM;
    }

    memcpy(&msg.umad.mad[0], hdr, sge[0].length);
    memcpy(&msg.umad.mad[sge[0].length], data, sge[1].length);

    rdma_pci_dma_unmap(backend_dev->dev, data, sge[1].length);
    rdma_pci_dma_unmap(backend_dev->dev, hdr, sge[0].length);

    trace_mad_message("send", msg.umad.mad, msg.umad_len);

    ret = rdmacm_mux_send(backend_dev, &msg);
    if (ret) {
        rdma_error_report("Failed to send MAD to rdma_umadmux (%d)", ret);
        return -EIO;
    }

    return 0;
}

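/*
 * Post a send work request on behalf of the guest.  QP1 (GSI) traffic has no
 * backend QP and is tunnelled to the rdmacm-mux as a MAD; QP0 (SMI) is not
 * supported.  For real QPs the guest SGEs are translated to host SGEs and
 * the completion is delivered asynchronously via the completion-handler
 * thread.
 */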
void rdma_backend_post_send(RdmaBackendDev *backend_dev,
                            RdmaBackendQP *qp, uint8_t qp_type,
                            struct ibv_sge *sge, uint32_t num_sge,
                            uint8_t sgid_idx, union ibv_gid *sgid,
                            union ibv_gid *dgid, uint32_t dqpn, uint32_t dqkey,
                            void *ctx)
{
    BackendCtx *bctx;
    struct ibv_sge new_sge[MAX_SGE];
    uint32_t bctx_id;
    int rc;
    struct ibv_send_wr wr = {}, *bad_wr;

    if (!qp->ibqp) { /* This field is not initialized for QP0 and QP1 */
        if (qp_type == IBV_QPT_SMI) {
            rdma_error_report("Got QP0 request");
            complete_work(IBV_WC_GENERAL_ERR, VENDOR_ERR_QP0, ctx);
        } else if (qp_type == IBV_QPT_GSI) {
            rc = mad_send(backend_dev, sgid_idx, sgid, sge, num_sge);
            if (rc) {
                complete_work(IBV_WC_GENERAL_ERR, VENDOR_ERR_MAD_SEND, ctx);
                backend_dev->rdma_dev_res->stats.mad_tx_err++;
            } else {
                complete_work(IBV_WC_SUCCESS, 0, ctx);
                backend_dev->rdma_dev_res->stats.mad_tx++;
            }
        }
        return;
    }

    bctx = g_malloc0(sizeof(*bctx));
    bctx->up_ctx = ctx;
    bctx->backend_qp = qp;

    rc = rdma_rm_alloc_cqe_ctx(backend_dev->rdma_dev_res, &bctx_id, bctx);
    if (unlikely(rc)) {
        complete_work(IBV_WC_GENERAL_ERR, VENDOR_ERR_NOMEM, ctx);
        goto err_free_bctx;
    }

    rdma_protected_gslist_append_int32(&qp->cqe_ctx_list, bctx_id);

    rc = build_host_sge_array(backend_dev->rdma_dev_res, new_sge, sge, num_sge,
                              &backend_dev->rdma_dev_res->stats.tx_len);
    if (rc) {
        complete_work(IBV_WC_GENERAL_ERR, rc, ctx);
        goto err_dealloc_cqe_ctx;
    }

    if (qp_type == IBV_QPT_UD) {
        wr.wr.ud.ah = create_ah(backend_dev, qp->ibpd, sgid_idx, dgid);
        if (!wr.wr.ud.ah) {
            complete_work(IBV_WC_GENERAL_ERR, VENDOR_ERR_FAIL_BACKEND, ctx);
            goto err_dealloc_cqe_ctx;
        }
        wr.wr.ud.remote_qpn = dqpn;
        wr.wr.ud.remote_qkey = dqkey;
    }

    wr.num_sge = num_sge;
    wr.opcode = IBV_WR_SEND;
    wr.send_flags = IBV_SEND_SIGNALED;
    wr.sg_list = new_sge;
    wr.wr_id = bctx_id;

    rc = ibv_post_send(qp->ibqp, &wr, &bad_wr);
    if (rc) {
        rdma_error_report("ibv_post_send fail, qpn=0x%x, rc=%d, errno=%d",
                          qp->ibqp->qp_num, rc, errno);
        complete_work(IBV_WC_GENERAL_ERR, VENDOR_ERR_FAIL_BACKEND, ctx);
        goto err_dealloc_cqe_ctx;
    }

    atomic_inc(&backend_dev->rdma_dev_res->stats.missing_cqe);
    backend_dev->rdma_dev_res->stats.tx++;

    return;

err_dealloc_cqe_ctx:
    backend_dev->rdma_dev_res->stats.tx_err++;
    rdma_rm_dealloc_cqe_ctx(backend_dev->rdma_dev_res, bctx_id);

err_free_bctx:
    g_free(bctx);
}

static unsigned int save_mad_recv_buffer(RdmaBackendDev *backend_dev,
                                         struct ibv_sge *sge, uint32_t num_sge,
                                         void *ctx)
{
    BackendCtx *bctx;
    int rc;
    uint32_t bctx_id;

    if (num_sge != 1) {
        rdma_error_report("Invalid num_sge (%d), expecting 1", num_sge);
        return VENDOR_ERR_INV_NUM_SGE;
    }

    if (sge[0].length < RDMA_MAX_PRIVATE_DATA + sizeof(struct ibv_grh)) {
        rdma_error_report("Too small buffer for MAD");
        return VENDOR_ERR_INV_MAD_BUFF;
    }

    bctx = g_malloc0(sizeof(*bctx));

    rc = rdma_rm_alloc_cqe_ctx(backend_dev->rdma_dev_res, &bctx_id, bctx);
    if (unlikely(rc)) {
        g_free(bctx);
        return VENDOR_ERR_NOMEM;
    }

    bctx->up_ctx = ctx;
    bctx->sge = *sge;

    rdma_protected_qlist_append_int64(&backend_dev->recv_mads_list, bctx_id);

    return 0;
}

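/*
 * Post a receive work request.  For the GSI QP the guest buffer is only
 * queued on recv_mads_list and consumed later, when a MAD arrives from the
 * rdmacm-mux; for real QPs the buffer is posted to the backend device.
 */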
void rdma_backend_post_recv(RdmaBackendDev *backend_dev,
                            RdmaBackendQP *qp, uint8_t qp_type,
                            struct ibv_sge *sge, uint32_t num_sge, void *ctx)
{
    BackendCtx *bctx;
    struct ibv_sge new_sge[MAX_SGE];
    uint32_t bctx_id;
    int rc;
    struct ibv_recv_wr wr = {}, *bad_wr;

    if (!qp->ibqp) { /* This field does not get initialized for QP0 and QP1 */
        if (qp_type == IBV_QPT_SMI) {
            rdma_error_report("Got QP0 request");
            complete_work(IBV_WC_GENERAL_ERR, VENDOR_ERR_QP0, ctx);
        }
        if (qp_type == IBV_QPT_GSI) {
            rc = save_mad_recv_buffer(backend_dev, sge, num_sge, ctx);
            if (rc) {
                complete_work(IBV_WC_GENERAL_ERR, rc, ctx);
                backend_dev->rdma_dev_res->stats.mad_rx_bufs_err++;
            } else {
                backend_dev->rdma_dev_res->stats.mad_rx_bufs++;
            }
        }
        return;
    }

    bctx = g_malloc0(sizeof(*bctx));
    bctx->up_ctx = ctx;
    bctx->backend_qp = qp;

    rc = rdma_rm_alloc_cqe_ctx(backend_dev->rdma_dev_res, &bctx_id, bctx);
    if (unlikely(rc)) {
        complete_work(IBV_WC_GENERAL_ERR, VENDOR_ERR_NOMEM, ctx);
        goto err_free_bctx;
    }

    rdma_protected_gslist_append_int32(&qp->cqe_ctx_list, bctx_id);

    rc = build_host_sge_array(backend_dev->rdma_dev_res, new_sge, sge, num_sge,
                              &backend_dev->rdma_dev_res->stats.rx_bufs_len);
    if (rc) {
        complete_work(IBV_WC_GENERAL_ERR, rc, ctx);
        goto err_dealloc_cqe_ctx;
    }

    wr.num_sge = num_sge;
    wr.sg_list = new_sge;
    wr.wr_id = bctx_id;
    rc = ibv_post_recv(qp->ibqp, &wr, &bad_wr);
    if (rc) {
        rdma_error_report("ibv_post_recv fail, qpn=0x%x, rc=%d, errno=%d",
                          qp->ibqp->qp_num, rc, errno);
        complete_work(IBV_WC_GENERAL_ERR, VENDOR_ERR_FAIL_BACKEND, ctx);
        goto err_dealloc_cqe_ctx;
    }

    atomic_inc(&backend_dev->rdma_dev_res->stats.missing_cqe);
    backend_dev->rdma_dev_res->stats.rx_bufs++;

    return;

err_dealloc_cqe_ctx:
    backend_dev->rdma_dev_res->stats.rx_bufs_err++;
    rdma_rm_dealloc_cqe_ctx(backend_dev->rdma_dev_res, bctx_id);

err_free_bctx:
    g_free(bctx);
}

void rdma_backend_post_srq_recv(RdmaBackendDev *backend_dev,
                                RdmaBackendSRQ *srq, struct ibv_sge *sge,
                                uint32_t num_sge, void *ctx)
{
    BackendCtx *bctx;
    struct ibv_sge new_sge[MAX_SGE];
    uint32_t bctx_id;
    int rc;
    struct ibv_recv_wr wr = {}, *bad_wr;

    bctx = g_malloc0(sizeof(*bctx));
    bctx->up_ctx = ctx;
    bctx->backend_srq = srq;

    rc = rdma_rm_alloc_cqe_ctx(backend_dev->rdma_dev_res, &bctx_id, bctx);
    if (unlikely(rc)) {
        complete_work(IBV_WC_GENERAL_ERR, VENDOR_ERR_NOMEM, ctx);
        goto err_free_bctx;
    }

    rdma_protected_gslist_append_int32(&srq->cqe_ctx_list, bctx_id);

    rc = build_host_sge_array(backend_dev->rdma_dev_res, new_sge, sge, num_sge,
                              &backend_dev->rdma_dev_res->stats.rx_bufs_len);
    if (rc) {
        complete_work(IBV_WC_GENERAL_ERR, rc, ctx);
        goto err_dealloc_cqe_ctx;
    }

    wr.num_sge = num_sge;
    wr.sg_list = new_sge;
    wr.wr_id = bctx_id;
    rc = ibv_post_srq_recv(srq->ibsrq, &wr, &bad_wr);
    if (rc) {
        rdma_error_report("ibv_post_srq_recv fail, srqn=0x%x, rc=%d, errno=%d",
                          srq->ibsrq->handle, rc, errno);
        complete_work(IBV_WC_GENERAL_ERR, VENDOR_ERR_FAIL_BACKEND, ctx);
        goto err_dealloc_cqe_ctx;
    }

    atomic_inc(&backend_dev->rdma_dev_res->stats.missing_cqe);
    backend_dev->rdma_dev_res->stats.rx_bufs++;
    backend_dev->rdma_dev_res->stats.rx_srq++;

    return;

err_dealloc_cqe_ctx:
    backend_dev->rdma_dev_res->stats.rx_bufs_err++;
    rdma_rm_dealloc_cqe_ctx(backend_dev->rdma_dev_res, bctx_id);

err_free_bctx:
    g_free(bctx);
}

int rdma_backend_create_pd(RdmaBackendDev *backend_dev, RdmaBackendPD *pd)
{
    pd->ibpd = ibv_alloc_pd(backend_dev->context);

    if (!pd->ibpd) {
        rdma_error_report("ibv_alloc_pd fail, errno=%d", errno);
        return -EIO;
    }

    return 0;
}

void rdma_backend_destroy_pd(RdmaBackendPD *pd)
{
    if (pd->ibpd) {
        ibv_dealloc_pd(pd->ibpd);
    }
}

int rdma_backend_create_mr(RdmaBackendMR *mr, RdmaBackendPD *pd, void *addr,
                           size_t length, int access)
{
    mr->ibmr = ibv_reg_mr(pd->ibpd, addr, length, access);
    if (!mr->ibmr) {
        rdma_error_report("ibv_reg_mr fail, errno=%d", errno);
        return -EIO;
    }

    mr->ibpd = pd->ibpd;

    return 0;
}

void rdma_backend_destroy_mr(RdmaBackendMR *mr)
{
    if (mr->ibmr) {
        ibv_dereg_mr(mr->ibmr);
    }
}

int rdma_backend_create_cq(RdmaBackendDev *backend_dev, RdmaBackendCQ *cq,
                           int cqe)
{
    int rc;

    cq->ibcq = ibv_create_cq(backend_dev->context, cqe + 1, NULL,
                             backend_dev->channel, 0);
    if (!cq->ibcq) {
        rdma_error_report("ibv_create_cq fail, errno=%d", errno);
        return -EIO;
    }

    rc = ibv_req_notify_cq(cq->ibcq, 0);
    if (rc) {
        rdma_warn_report("ibv_req_notify_cq fail, rc=%d, errno=%d", rc, errno);
    }

    cq->backend_dev = backend_dev;

    return 0;
}

void rdma_backend_destroy_cq(RdmaBackendCQ *cq)
{
    if (cq->ibcq) {
        ibv_destroy_cq(cq->ibcq);
    }
}

int rdma_backend_create_qp(RdmaBackendQP *qp, uint8_t qp_type,
                           RdmaBackendPD *pd, RdmaBackendCQ *scq,
                           RdmaBackendCQ *rcq, RdmaBackendSRQ *srq,
                           uint32_t max_send_wr, uint32_t max_recv_wr,
                           uint32_t max_send_sge, uint32_t max_recv_sge)
{
    struct ibv_qp_init_attr attr = {};

    qp->ibqp = 0;

    switch (qp_type) {
    case IBV_QPT_GSI:
        return 0;

    case IBV_QPT_RC:
        /* fall through */
    case IBV_QPT_UD:
        /* do nothing */
        break;

    default:
        rdma_error_report("Unsupported QP type %d", qp_type);
        return -EIO;
    }

    attr.qp_type = qp_type;
    attr.send_cq = scq->ibcq;
    attr.recv_cq = rcq->ibcq;
    attr.cap.max_send_wr = max_send_wr;
    attr.cap.max_recv_wr = max_recv_wr;
    attr.cap.max_send_sge = max_send_sge;
    attr.cap.max_recv_sge = max_recv_sge;
    if (srq) {
        attr.srq = srq->ibsrq;
    }

    qp->ibqp = ibv_create_qp(pd->ibpd, &attr);
    if (!qp->ibqp) {
        rdma_error_report("ibv_create_qp fail, errno=%d", errno);
        return -EIO;
    }

    rdma_protected_gslist_init(&qp->cqe_ctx_list);

    qp->ibpd = pd->ibpd;

    /* TODO: Query QP to get max_inline_data and save it to be used in send */

    return 0;
}

int rdma_backend_qp_state_init(RdmaBackendDev *backend_dev, RdmaBackendQP *qp,
                               uint8_t qp_type, uint32_t qkey)
{
    struct ibv_qp_attr attr = {};
    int rc, attr_mask;

    attr_mask = IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT;
    attr.qp_state = IBV_QPS_INIT;
    attr.pkey_index = 0;
    attr.port_num = backend_dev->port_num;

    switch (qp_type) {
    case IBV_QPT_RC:
        attr_mask |= IBV_QP_ACCESS_FLAGS;
        trace_rdma_backend_rc_qp_state_init(qp->ibqp->qp_num);
        break;

    case IBV_QPT_UD:
        attr.qkey = qkey;
        attr_mask |= IBV_QP_QKEY;
        trace_rdma_backend_ud_qp_state_init(qp->ibqp->qp_num, qkey);
        break;

    default:
        rdma_error_report("Unsupported QP type %d", qp_type);
        return -EIO;
    }

    rc = ibv_modify_qp(qp->ibqp, &attr, attr_mask);
    if (rc) {
        rdma_error_report("ibv_modify_qp fail, rc=%d, errno=%d", rc, errno);
        return -EIO;
    }

    return 0;
}

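/*
 * Transition a backend QP from INIT to RTR.  For RC QPs the remote GID,
 * destination QP number and RQ PSN come from the guest, while path MTU,
 * max_dest_rd_atomic and min_rnr_timer are fixed values chosen by the
 * device model.
 */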
int rdma_backend_qp_state_rtr(RdmaBackendDev *backend_dev, RdmaBackendQP *qp,
                              uint8_t qp_type, uint8_t sgid_idx,
                              union ibv_gid *dgid, uint32_t dqpn,
                              uint32_t rq_psn, uint32_t qkey, bool use_qkey)
{
    struct ibv_qp_attr attr = {};
    union ibv_gid ibv_gid = {
        .global.interface_id = dgid->global.interface_id,
        .global.subnet_prefix = dgid->global.subnet_prefix
    };
    int rc, attr_mask;

    attr.qp_state = IBV_QPS_RTR;
    attr_mask = IBV_QP_STATE;

    qp->sgid_idx = sgid_idx;

    switch (qp_type) {
    case IBV_QPT_RC:
        attr.path_mtu = IBV_MTU_1024;
        attr.dest_qp_num = dqpn;
        attr.max_dest_rd_atomic = 1;
        attr.min_rnr_timer = 12;
        attr.ah_attr.port_num = backend_dev->port_num;
        attr.ah_attr.is_global = 1;
        attr.ah_attr.grh.hop_limit = 1;
        attr.ah_attr.grh.dgid = ibv_gid;
        attr.ah_attr.grh.sgid_index = qp->sgid_idx;
        attr.rq_psn = rq_psn;

        attr_mask |= IBV_QP_AV | IBV_QP_PATH_MTU | IBV_QP_DEST_QPN |
                     IBV_QP_RQ_PSN | IBV_QP_MAX_DEST_RD_ATOMIC |
                     IBV_QP_MIN_RNR_TIMER;

        trace_rdma_backend_rc_qp_state_rtr(qp->ibqp->qp_num,
                                           be64_to_cpu(ibv_gid.global.
                                                       subnet_prefix),
                                           be64_to_cpu(ibv_gid.global.
                                                       interface_id),
                                           qp->sgid_idx, dqpn, rq_psn);
        break;

    case IBV_QPT_UD:
        if (use_qkey) {
            attr.qkey = qkey;
            attr_mask |= IBV_QP_QKEY;
        }
        trace_rdma_backend_ud_qp_state_rtr(qp->ibqp->qp_num, use_qkey ? qkey :
                                           0);
        break;
    }

    rc = ibv_modify_qp(qp->ibqp, &attr, attr_mask);
    if (rc) {
        rdma_error_report("ibv_modify_qp fail, rc=%d, errno=%d", rc, errno);
        return -EIO;
    }

    return 0;
}

int rdma_backend_qp_state_rts(RdmaBackendQP *qp, uint8_t qp_type,
                              uint32_t sq_psn, uint32_t qkey, bool use_qkey)
{
    struct ibv_qp_attr attr = {};
    int rc, attr_mask;

    attr.qp_state = IBV_QPS_RTS;
    attr.sq_psn = sq_psn;
    attr_mask = IBV_QP_STATE | IBV_QP_SQ_PSN;

    switch (qp_type) {
    case IBV_QPT_RC:
        attr.timeout = 14;
        attr.retry_cnt = 7;
        attr.rnr_retry = 7;
        attr.max_rd_atomic = 1;

        attr_mask |= IBV_QP_TIMEOUT | IBV_QP_RETRY_CNT | IBV_QP_RNR_RETRY |
                     IBV_QP_MAX_QP_RD_ATOMIC;
        trace_rdma_backend_rc_qp_state_rts(qp->ibqp->qp_num, sq_psn);
        break;

    case IBV_QPT_UD:
        if (use_qkey) {
            attr.qkey = qkey;
            attr_mask |= IBV_QP_QKEY;
        }
        trace_rdma_backend_ud_qp_state_rts(qp->ibqp->qp_num, sq_psn,
                                           use_qkey ? qkey : 0);
        break;
    }

    rc = ibv_modify_qp(qp->ibqp, &attr, attr_mask);
    if (rc) {
        rdma_error_report("ibv_modify_qp fail, rc=%d, errno=%d", rc, errno);
        return -EIO;
    }

    return 0;
}

int rdma_backend_query_qp(RdmaBackendQP *qp, struct ibv_qp_attr *attr,
                          int attr_mask, struct ibv_qp_init_attr *init_attr)
{
    if (!qp->ibqp) {
        attr->qp_state = IBV_QPS_RTS;
        return 0;
    }

    return ibv_query_qp(qp->ibqp, attr, attr_mask, init_attr);
}

void rdma_backend_destroy_qp(RdmaBackendQP *qp, RdmaDeviceResources *dev_res)
{
    if (qp->ibqp) {
        ibv_destroy_qp(qp->ibqp);
    }
    g_slist_foreach(qp->cqe_ctx_list.list, free_cqe_ctx, dev_res);
    rdma_protected_gslist_destroy(&qp->cqe_ctx_list);
}

int rdma_backend_create_srq(RdmaBackendSRQ *srq, RdmaBackendPD *pd,
                            uint32_t max_wr, uint32_t max_sge,
                            uint32_t srq_limit)
{
    struct ibv_srq_init_attr srq_init_attr = {};

    srq_init_attr.attr.max_wr = max_wr;
    srq_init_attr.attr.max_sge = max_sge;
    srq_init_attr.attr.srq_limit = srq_limit;

    srq->ibsrq = ibv_create_srq(pd->ibpd, &srq_init_attr);
    if (!srq->ibsrq) {
        rdma_error_report("ibv_create_srq failed, errno=%d", errno);
        return -EIO;
    }

    rdma_protected_gslist_init(&srq->cqe_ctx_list);

    return 0;
}

int rdma_backend_query_srq(RdmaBackendSRQ *srq, struct ibv_srq_attr *srq_attr)
{
    if (!srq->ibsrq) {
        return -EINVAL;
    }

    return ibv_query_srq(srq->ibsrq, srq_attr);
}

int rdma_backend_modify_srq(RdmaBackendSRQ *srq, struct ibv_srq_attr *srq_attr,
                            int srq_attr_mask)
{
    if (!srq->ibsrq) {
        return -EINVAL;
    }

    return ibv_modify_srq(srq->ibsrq, srq_attr, srq_attr_mask);
}

void rdma_backend_destroy_srq(RdmaBackendSRQ *srq, RdmaDeviceResources *dev_res)
{
    if (srq->ibsrq) {
        ibv_destroy_srq(srq->ibsrq);
    }
    g_slist_foreach(srq->cqe_ctx_list.list, free_cqe_ctx, dev_res);
    rdma_protected_gslist_destroy(&srq->cqe_ctx_list);
}

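/*
 * CHK_ATTR clamps a device attribute requested by the device model to the
 * capability reported by the host device, warning when the requested value
 * has to be lowered.
 */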
#define CHK_ATTR(req, dev, member, fmt) ({ \
    trace_rdma_check_dev_attr(#member, dev.member, req->member); \
    if (req->member > dev.member) { \
        rdma_warn_report("%s = "fmt" is higher than host device capability "fmt, \
                         #member, req->member, dev.member); \
        req->member = dev.member; \
    } \
})

static int init_device_caps(RdmaBackendDev *backend_dev,
                            struct ibv_device_attr *dev_attr)
{
    struct ibv_device_attr bk_dev_attr;
    int rc;

    rc = ibv_query_device(backend_dev->context, &bk_dev_attr);
    if (rc) {
        rdma_error_report("ibv_query_device fail, rc=%d, errno=%d", rc, errno);
        return -EIO;
    }

    dev_attr->max_sge = MAX_SGE;
    dev_attr->max_srq_sge = MAX_SGE;

    CHK_ATTR(dev_attr, bk_dev_attr, max_mr_size, "%" PRId64);
    CHK_ATTR(dev_attr, bk_dev_attr, max_qp, "%d");
    CHK_ATTR(dev_attr, bk_dev_attr, max_sge, "%d");
    CHK_ATTR(dev_attr, bk_dev_attr, max_cq, "%d");
    CHK_ATTR(dev_attr, bk_dev_attr, max_mr, "%d");
    CHK_ATTR(dev_attr, bk_dev_attr, max_pd, "%d");
    CHK_ATTR(dev_attr, bk_dev_attr, max_qp_rd_atom, "%d");
    CHK_ATTR(dev_attr, bk_dev_attr, max_qp_init_rd_atom, "%d");
    CHK_ATTR(dev_attr, bk_dev_attr, max_ah, "%d");
    CHK_ATTR(dev_attr, bk_dev_attr, max_srq, "%d");

    return 0;
}

static inline void build_mad_hdr(struct ibv_grh *grh, union ibv_gid *sgid,
                                 union ibv_gid *my_gid, int paylen)
{
    grh->paylen = htons(paylen);
    grh->sgid = *sgid;
    grh->dgid = *my_gid;
}

static void process_incoming_mad_req(RdmaBackendDev *backend_dev,
                                     RdmaCmMuxMsg *msg)
{
    unsigned long cqe_ctx_id;
    BackendCtx *bctx;
    char *mad;

    trace_mad_message("recv", msg->umad.mad, msg->umad_len);

    cqe_ctx_id = rdma_protected_qlist_pop_int64(&backend_dev->recv_mads_list);
    if (cqe_ctx_id == -ENOENT) {
        rdma_warn_report("No more free MADs buffers, waiting for a while");
        sleep(THR_POLL_TO);
        return;
    }

    bctx = rdma_rm_get_cqe_ctx(backend_dev->rdma_dev_res, cqe_ctx_id);
    if (unlikely(!bctx)) {
        rdma_error_report("No matching ctx for req %ld", cqe_ctx_id);
        backend_dev->rdma_dev_res->stats.mad_rx_err++;
        return;
    }

    mad = rdma_pci_dma_map(backend_dev->dev, bctx->sge.addr,
                           bctx->sge.length);
    if (!mad || bctx->sge.length < msg->umad_len + MAD_HDR_SIZE) {
        backend_dev->rdma_dev_res->stats.mad_rx_err++;
        complete_work(IBV_WC_GENERAL_ERR, VENDOR_ERR_INV_MAD_BUFF,
                      bctx->up_ctx);
    } else {
        struct ibv_wc wc = {};
        memset(mad, 0, bctx->sge.length);
        build_mad_hdr((struct ibv_grh *)mad,
                      (union ibv_gid *)&msg->umad.hdr.addr.gid, &msg->hdr.sgid,
                      msg->umad_len);
        memcpy(&mad[MAD_HDR_SIZE], msg->umad.mad, msg->umad_len);
        rdma_pci_dma_unmap(backend_dev->dev, mad, bctx->sge.length);

        wc.byte_len = msg->umad_len;
        wc.status = IBV_WC_SUCCESS;
        wc.wc_flags = IBV_WC_GRH;
        backend_dev->rdma_dev_res->stats.mad_rx++;
        comp_handler(bctx->up_ctx, &wc);
    }

    g_free(bctx);
    rdma_rm_dealloc_cqe_ctx(backend_dev->rdma_dev_res, cqe_ctx_id);
}

static inline int rdmacm_mux_can_receive(void *opaque)
{
    RdmaBackendDev *backend_dev = (RdmaBackendDev *)opaque;

    return rdmacm_mux_can_process_async(backend_dev);
}

static void rdmacm_mux_read(void *opaque, const uint8_t *buf, int size)
{
    RdmaBackendDev *backend_dev = (RdmaBackendDev *)opaque;
    RdmaCmMuxMsg *msg = (RdmaCmMuxMsg *)buf;

    trace_rdmacm_mux("read", msg->hdr.msg_type, msg->hdr.op_code);

    if (msg->hdr.msg_type != RDMACM_MUX_MSG_TYPE_REQ &&
        msg->hdr.op_code != RDMACM_MUX_OP_CODE_MAD) {
        rdma_error_report("Error: Not a MAD request, skipping");
        return;
    }
    process_incoming_mad_req(backend_dev, msg);
}

static int mad_init(RdmaBackendDev *backend_dev, CharBackend *mad_chr_be)
{
    int ret;

    backend_dev->rdmacm_mux.chr_be = mad_chr_be;

    ret = qemu_chr_fe_backend_connected(backend_dev->rdmacm_mux.chr_be);
    if (!ret) {
        rdma_error_report("Missing chardev for MAD multiplexer");
        return -EIO;
    }

    rdma_protected_qlist_init(&backend_dev->recv_mads_list);

    enable_rdmacm_mux_async(backend_dev);

    qemu_chr_fe_set_handlers(backend_dev->rdmacm_mux.chr_be,
                             rdmacm_mux_can_receive, rdmacm_mux_read, NULL,
                             NULL, backend_dev, NULL, true);

    return 0;
}

static void mad_stop(RdmaBackendDev *backend_dev)
{
    clean_recv_mads(backend_dev);
}

static void mad_fini(RdmaBackendDev *backend_dev)
{
    disable_rdmacm_mux_async(backend_dev);
    qemu_chr_fe_disconnect(backend_dev->rdmacm_mux.chr_be);
    rdma_protected_qlist_destroy(&backend_dev->recv_mads_list);
}

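/*
 * Resolve a GID to its index in the backend device's GID table by scanning
 * the table with ibv_query_gid() until a matching entry is found.
 */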
int rdma_backend_get_gid_index(RdmaBackendDev *backend_dev,
                               union ibv_gid *gid)
{
    union ibv_gid sgid;
    int ret;
    int i = 0;

    do {
        ret = ibv_query_gid(backend_dev->context, backend_dev->port_num, i,
                            &sgid);
        i++;
    } while (!ret && (memcmp(&sgid, gid, sizeof(*gid))));

    trace_rdma_backend_get_gid_index(be64_to_cpu(gid->global.subnet_prefix),
                                     be64_to_cpu(gid->global.interface_id),
                                     i - 1);

    return ret ? ret : i - 1;
}

int rdma_backend_add_gid(RdmaBackendDev *backend_dev, const char *ifname,
                         union ibv_gid *gid)
{
    RdmaCmMuxMsg msg = {};
    int ret;

    trace_rdma_backend_gid_change("add", be64_to_cpu(gid->global.subnet_prefix),
                                  be64_to_cpu(gid->global.interface_id));

    msg.hdr.op_code = RDMACM_MUX_OP_CODE_REG;
    memcpy(msg.hdr.sgid.raw, gid->raw, sizeof(msg.hdr.sgid));

    ret = rdmacm_mux_send(backend_dev, &msg);
    if (ret) {
        rdma_error_report("Failed to register GID to rdma_umadmux (%d)", ret);
        return -EIO;
    }

    qapi_event_send_rdma_gid_status_changed(ifname, true,
                                            gid->global.subnet_prefix,
                                            gid->global.interface_id);

    return ret;
}

int rdma_backend_del_gid(RdmaBackendDev *backend_dev, const char *ifname,
                         union ibv_gid *gid)
{
    RdmaCmMuxMsg msg = {};
    int ret;

    trace_rdma_backend_gid_change("del", be64_to_cpu(gid->global.subnet_prefix),
                                  be64_to_cpu(gid->global.interface_id));

    msg.hdr.op_code = RDMACM_MUX_OP_CODE_UNREG;
    memcpy(msg.hdr.sgid.raw, gid->raw, sizeof(msg.hdr.sgid));

    ret = rdmacm_mux_send(backend_dev, &msg);
    if (ret) {
        rdma_error_report("Failed to unregister GID from rdma_umadmux (%d)",
                          ret);
        return -EIO;
    }

    qapi_event_send_rdma_gid_status_changed(ifname, false,
                                            gid->global.subnet_prefix,
                                            gid->global.interface_id);

    return 0;
}

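/*
 * Open the requested (or first available) ibverbs device, create its
 * completion channel, clamp the reported device capabilities to what the
 * host supports and connect to the rdmacm-mux chardev used for MAD traffic.
 */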
int rdma_backend_init(RdmaBackendDev *backend_dev, PCIDevice *pdev,
                      RdmaDeviceResources *rdma_dev_res,
                      const char *backend_device_name, uint8_t port_num,
                      struct ibv_device_attr *dev_attr, CharBackend *mad_chr_be)
{
    int i;
    int ret = 0;
    int num_ibv_devices;
    struct ibv_device **dev_list;

    memset(backend_dev, 0, sizeof(*backend_dev));

    backend_dev->dev = pdev;
    backend_dev->port_num = port_num;
    backend_dev->rdma_dev_res = rdma_dev_res;

    rdma_backend_register_comp_handler(dummy_comp_handler);

    dev_list = ibv_get_device_list(&num_ibv_devices);
    if (!dev_list) {
        rdma_error_report("Failed to get IB devices list");
        return -EIO;
    }

    if (num_ibv_devices == 0) {
        rdma_error_report("No IB devices were found");
        ret = -ENXIO;
        goto out_free_dev_list;
    }

    if (backend_device_name) {
        for (i = 0; dev_list[i]; ++i) {
            if (!strcmp(ibv_get_device_name(dev_list[i]),
                        backend_device_name)) {
                break;
            }
        }

        backend_dev->ib_dev = dev_list[i];
        if (!backend_dev->ib_dev) {
            rdma_error_report("Failed to find IB device %s",
                              backend_device_name);
            ret = -EIO;
            goto out_free_dev_list;
        }
    } else {
        backend_dev->ib_dev = *dev_list;
    }

    rdma_info_report("uverb device %s", backend_dev->ib_dev->dev_name);

    backend_dev->context = ibv_open_device(backend_dev->ib_dev);
    if (!backend_dev->context) {
        rdma_error_report("Failed to open IB device %s",
                          ibv_get_device_name(backend_dev->ib_dev));
        ret = -EIO;
        goto out;
    }

    backend_dev->channel = ibv_create_comp_channel(backend_dev->context);
    if (!backend_dev->channel) {
        rdma_error_report("Failed to create IB communication channel");
        ret = -EIO;
        goto out_close_device;
    }

    ret = init_device_caps(backend_dev, dev_attr);
    if (ret) {
        rdma_error_report("Failed to initialize device capabilities");
        ret = -EIO;
        goto out_destroy_comm_channel;
    }

    ret = mad_init(backend_dev, mad_chr_be);
    if (ret) {
        rdma_error_report("Failed to initialize mad");
        ret = -EIO;
        goto out_destroy_comm_channel;
    }

    backend_dev->comp_thread.run = false;
    backend_dev->comp_thread.is_running = false;

    ah_cache_init();

    goto out_free_dev_list;

out_destroy_comm_channel:
    ibv_destroy_comp_channel(backend_dev->channel);

out_close_device:
    ibv_close_device(backend_dev->context);

out_free_dev_list:
    ibv_free_device_list(dev_list);

out:
    return ret;
}

void rdma_backend_start(RdmaBackendDev *backend_dev)
{
    start_comp_thread(backend_dev);
}

void rdma_backend_stop(RdmaBackendDev *backend_dev)
{
    mad_stop(backend_dev);
    stop_backend_thread(&backend_dev->comp_thread);
}

void rdma_backend_fini(RdmaBackendDev *backend_dev)
{
    mad_fini(backend_dev);
    g_hash_table_destroy(ah_hash);
    ibv_destroy_comp_channel(backend_dev->channel);
    ibv_close_device(backend_dev->context);
}