/*
 * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses. You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the BSD-type
 * license below:
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *      Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *
 *      Redistributions in binary form must reproduce the above
 *      copyright notice, this list of conditions and the following
 *      disclaimer in the documentation and/or other materials provided
 *      with the distribution.
 *
 *      Neither the name of the Network Appliance, Inc. nor the names of
 *      its contributors may be used to endorse or promote products
 *      derived from this software without specific prior written
 *      permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * verbs.c
 *
 * Encapsulates the major functions managing:
 *  o adapters
 *  o endpoints
 *  o connections
 *  o buffer memory
 */

#include <linux/interrupt.h>
#include <linux/pci.h>	/* for Tavor hack below */
#include <linux/slab.h>

#include "xprt_rdma.h"

/*
 * Globals/Macros
 */

#ifdef RPC_DEBUG
# define RPCDBG_FACILITY	RPCDBG_TRANS
#endif

/*
 * internal functions
 */

/*
 * handle replies in tasklet context, using a single, global list
 * rdma tasklet function -- just turn around and call the func
 * for all replies on the list
 */

static DEFINE_SPINLOCK(rpcrdma_tk_lock_g);
static LIST_HEAD(rpcrdma_tasklets_g);

static void
rpcrdma_run_tasklet(unsigned long data)
{
	struct rpcrdma_rep *rep;
	void (*func)(struct rpcrdma_rep *);
	unsigned long flags;

	data = data;
	spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
	while (!list_empty(&rpcrdma_tasklets_g)) {
		rep = list_entry(rpcrdma_tasklets_g.next,
				 struct rpcrdma_rep, rr_list);
		list_del(&rep->rr_list);
		func = rep->rr_func;
		rep->rr_func = NULL;
		spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);

		if (func)
			func(rep);
		else
			rpcrdma_recv_buffer_put(rep);

		spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
	}
	spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
}

static DECLARE_TASKLET(rpcrdma_tasklet_g, rpcrdma_run_tasklet, 0UL);

static inline void
rpcrdma_schedule_tasklet(struct rpcrdma_rep *rep)
{
	unsigned long flags;

	spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
	list_add_tail(&rep->rr_list, &rpcrdma_tasklets_g);
	spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
	tasklet_schedule(&rpcrdma_tasklet_g);
}

static void
rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context)
{
	struct rpcrdma_ep *ep = context;

	dprintk("RPC: %s: QP error %X on device %s ep %p\n",
		__func__, event->event, event->device->name, context);
	if (ep->rep_connected == 1) {
		ep->rep_connected = -EIO;
		ep->rep_func(ep);
		wake_up_all(&ep->rep_connect_wait);
	}
}

static void
rpcrdma_cq_async_error_upcall(struct ib_event *event, void *context)
{
	struct rpcrdma_ep *ep = context;

	dprintk("RPC: %s: CQ error %X on device %s ep %p\n",
		__func__, event->event, event->device->name, context);
	if (ep->rep_connected == 1) {
		ep->rep_connected = -EIO;
		ep->rep_func(ep);
		wake_up_all(&ep->rep_connect_wait);
	}
}

static inline
void rpcrdma_event_process(struct ib_wc *wc)
{
	struct rpcrdma_mw *frmr;
	struct rpcrdma_rep *rep =
			(struct rpcrdma_rep *)(unsigned long) wc->wr_id;

	dprintk("RPC: %s: event rep %p status %X opcode %X length %u\n",
		__func__, rep, wc->status, wc->opcode, wc->byte_len);

	if (!rep) /* send or bind completion that we don't care about */
		return;

	if (IB_WC_SUCCESS != wc->status) {
		dprintk("RPC: %s: WC opcode %d status %X, connection lost\n",
			__func__, wc->opcode, wc->status);
		rep->rr_len = ~0U;
		if (wc->opcode != IB_WC_FAST_REG_MR && wc->opcode != IB_WC_LOCAL_INV)
			rpcrdma_schedule_tasklet(rep);
		return;
	}

	switch (wc->opcode) {
	case IB_WC_FAST_REG_MR:
		frmr = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
		frmr->r.frmr.state = FRMR_IS_VALID;
		break;
	case IB_WC_LOCAL_INV:
		frmr = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
		frmr->r.frmr.state = FRMR_IS_INVALID;
		break;
	case IB_WC_RECV:
		rep->rr_len = wc->byte_len;
		ib_dma_sync_single_for_cpu(
			rdmab_to_ia(rep->rr_buffer)->ri_id->device,
			rep->rr_iov.addr, rep->rr_len, DMA_FROM_DEVICE);
		/* Keep (only) the most recent credits, after checking validity */
		if (rep->rr_len >= 16) {
			struct rpcrdma_msg *p =
					(struct rpcrdma_msg *) rep->rr_base;
			unsigned int credits = ntohl(p->rm_credit);
			if (credits == 0) {
				dprintk("RPC: %s: server"
					" dropped credits to 0!\n", __func__);
				/* don't deadlock */
				credits = 1;
			} else if (credits > rep->rr_buffer->rb_max_requests) {
				dprintk("RPC: %s: server"
					" over-crediting: %d (%d)\n",
					__func__, credits,
					rep->rr_buffer->rb_max_requests);
				credits = rep->rr_buffer->rb_max_requests;
			}
			atomic_set(&rep->rr_buffer->rb_credits, credits);
		}
		/* fall through */
	case IB_WC_BIND_MW:
		rpcrdma_schedule_tasklet(rep);
		break;
	default:
		dprintk("RPC: %s: unexpected WC event %X\n",
			__func__, wc->opcode);
		break;
	}
}

static inline int
rpcrdma_cq_poll(struct ib_cq *cq)
{
	struct ib_wc wc;
	int rc;

	for (;;) {
		rc = ib_poll_cq(cq, 1, &wc);
		if (rc < 0) {
			dprintk("RPC: %s: ib_poll_cq failed %i\n",
				__func__, rc);
			return rc;
		}
		if (rc == 0)
			break;

		rpcrdma_event_process(&wc);
	}

	return 0;
}

/*
 * rpcrdma_cq_event_upcall
 *
 * This upcall handles recv, send, bind and unbind events.
 * It is reentrant but processes single events in order to maintain
 * ordering of receives to keep server credits.
 *
 * It is the responsibility of the scheduled tasklet to return
 * recv buffers to the pool. NOTE: this affects synchronization of
 * connection shutdown. That is, the structures required for
 * the completion of the reply handler must remain intact until
 * all memory has been reclaimed.
 *
 * Note that send events are suppressed and do not result in an upcall.
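 *
 * The CQ is drained, re-armed with ib_req_notify_cq(), and then drained
 * once more: a completion that arrives between the first drain and the
 * re-arm would otherwise generate no further notification and be missed.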
 */
static void
rpcrdma_cq_event_upcall(struct ib_cq *cq, void *context)
{
	int rc;

	rc = rpcrdma_cq_poll(cq);
	if (rc)
		return;

	rc = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
	if (rc) {
		dprintk("RPC: %s: ib_req_notify_cq failed %i\n",
			__func__, rc);
		return;
	}

	rpcrdma_cq_poll(cq);
}

#ifdef RPC_DEBUG
static const char * const conn[] = {
	"address resolved",
	"address error",
	"route resolved",
	"route error",
	"connect request",
	"connect response",
	"connect error",
	"unreachable",
	"rejected",
	"established",
	"disconnected",
	"device removal"
};
#endif

static int
rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
{
	struct rpcrdma_xprt *xprt = id->context;
	struct rpcrdma_ia *ia = &xprt->rx_ia;
	struct rpcrdma_ep *ep = &xprt->rx_ep;
#ifdef RPC_DEBUG
	struct sockaddr_in *addr = (struct sockaddr_in *) &ep->rep_remote_addr;
#endif
	struct ib_qp_attr attr;
	struct ib_qp_init_attr iattr;
	int connstate = 0;

	switch (event->event) {
	case RDMA_CM_EVENT_ADDR_RESOLVED:
	case RDMA_CM_EVENT_ROUTE_RESOLVED:
		ia->ri_async_rc = 0;
		complete(&ia->ri_done);
		break;
	case RDMA_CM_EVENT_ADDR_ERROR:
		ia->ri_async_rc = -EHOSTUNREACH;
		dprintk("RPC: %s: CM address resolution error, ep 0x%p\n",
			__func__, ep);
		complete(&ia->ri_done);
		break;
	case RDMA_CM_EVENT_ROUTE_ERROR:
		ia->ri_async_rc = -ENETUNREACH;
		dprintk("RPC: %s: CM route resolution error, ep 0x%p\n",
			__func__, ep);
		complete(&ia->ri_done);
		break;
	case RDMA_CM_EVENT_ESTABLISHED:
		connstate = 1;
		ib_query_qp(ia->ri_id->qp, &attr,
			IB_QP_MAX_QP_RD_ATOMIC | IB_QP_MAX_DEST_RD_ATOMIC,
			&iattr);
		dprintk("RPC: %s: %d responder resources"
			" (%d initiator)\n",
			__func__, attr.max_dest_rd_atomic, attr.max_rd_atomic);
		goto connected;
	case RDMA_CM_EVENT_CONNECT_ERROR:
		connstate = -ENOTCONN;
		goto connected;
	case RDMA_CM_EVENT_UNREACHABLE:
		connstate = -ENETDOWN;
		goto connected;
	case RDMA_CM_EVENT_REJECTED:
		connstate = -ECONNREFUSED;
		goto connected;
	case RDMA_CM_EVENT_DISCONNECTED:
		connstate = -ECONNABORTED;
		goto connected;
	case RDMA_CM_EVENT_DEVICE_REMOVAL:
		connstate = -ENODEV;
connected:
		dprintk("RPC: %s: %s: %pI4:%u (ep 0x%p event 0x%x)\n",
			__func__,
			(event->event <= 11) ? conn[event->event] :
						"unknown connection error",
			&addr->sin_addr.s_addr,
			ntohs(addr->sin_port),
			ep, event->event);
		atomic_set(&rpcx_to_rdmax(ep->rep_xprt)->rx_buf.rb_credits, 1);
		dprintk("RPC: %s: %sconnected\n",
			__func__, connstate > 0 ? "" : "dis");
		ep->rep_connected = connstate;
		ep->rep_func(ep);
		wake_up_all(&ep->rep_connect_wait);
		break;
	default:
		dprintk("RPC: %s: unexpected CM event %d\n",
			__func__, event->event);
		break;
	}

#ifdef RPC_DEBUG
	if (connstate == 1) {
		int ird = attr.max_dest_rd_atomic;
		int tird = ep->rep_remote_cma.responder_resources;
		printk(KERN_INFO "rpcrdma: connection to %pI4:%u "
			"on %s, memreg %d slots %d ird %d%s\n",
			&addr->sin_addr.s_addr,
			ntohs(addr->sin_port),
			ia->ri_id->device->name,
			ia->ri_memreg_strategy,
			xprt->rx_buf.rb_max_requests,
			ird, ird < 4 && ird < tird / 2 ?
				" (low!)" : "");
	} else if (connstate < 0) {
		printk(KERN_INFO "rpcrdma: connection to %pI4:%u closed (%d)\n",
			&addr->sin_addr.s_addr,
			ntohs(addr->sin_port),
			connstate);
	}
#endif

	return 0;
}

static struct rdma_cm_id *
rpcrdma_create_id(struct rpcrdma_xprt *xprt,
			struct rpcrdma_ia *ia, struct sockaddr *addr)
{
	struct rdma_cm_id *id;
	int rc;

	init_completion(&ia->ri_done);

	id = rdma_create_id(rpcrdma_conn_upcall, xprt, RDMA_PS_TCP, IB_QPT_RC);
	if (IS_ERR(id)) {
		rc = PTR_ERR(id);
		dprintk("RPC: %s: rdma_create_id() failed %i\n",
			__func__, rc);
		return id;
	}

	ia->ri_async_rc = -ETIMEDOUT;
	rc = rdma_resolve_addr(id, NULL, addr, RDMA_RESOLVE_TIMEOUT);
	if (rc) {
		dprintk("RPC: %s: rdma_resolve_addr() failed %i\n",
			__func__, rc);
		goto out;
	}
	wait_for_completion_interruptible_timeout(&ia->ri_done,
				msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
	rc = ia->ri_async_rc;
	if (rc)
		goto out;

	ia->ri_async_rc = -ETIMEDOUT;
	rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT);
	if (rc) {
		dprintk("RPC: %s: rdma_resolve_route() failed %i\n",
			__func__, rc);
		goto out;
	}
	wait_for_completion_interruptible_timeout(&ia->ri_done,
				msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
	rc = ia->ri_async_rc;
	if (rc)
		goto out;

	return id;

out:
	rdma_destroy_id(id);
	return ERR_PTR(rc);
}

/*
 * Drain any cq, prior to teardown.
 */
static void
rpcrdma_clean_cq(struct ib_cq *cq)
{
	struct ib_wc wc;
	int count = 0;

	while (1 == ib_poll_cq(cq, 1, &wc))
		++count;

	if (count)
		dprintk("RPC: %s: flushed %d events (last 0x%x)\n",
			__func__, count, wc.opcode);
}

/*
 * Exported functions.
 */

/*
 * Open and initialize an Interface Adapter.
 *  o initializes fields of struct rpcrdma_ia, including
 *    interface and provider attributes and protection zone.
 */
int
rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
{
	int rc, mem_priv;
	struct ib_device_attr devattr;
	struct rpcrdma_ia *ia = &xprt->rx_ia;

	ia->ri_id = rpcrdma_create_id(xprt, ia, addr);
	if (IS_ERR(ia->ri_id)) {
		rc = PTR_ERR(ia->ri_id);
		goto out1;
	}

	ia->ri_pd = ib_alloc_pd(ia->ri_id->device);
	if (IS_ERR(ia->ri_pd)) {
		rc = PTR_ERR(ia->ri_pd);
		dprintk("RPC: %s: ib_alloc_pd() failed %i\n",
			__func__, rc);
		goto out2;
	}

	/*
	 * Query the device to determine if the requested memory
	 * registration strategy is supported. If it isn't, set the
	 * strategy to a globally supported model.
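	 *
	 * (The checks below: MEMWINDOWS needs IB_DEVICE_MEM_WINDOW,
	 * MTHCAFMR needs a provider alloc_fmr method, and FRMR needs
	 * both IB_DEVICE_MEM_MGT_EXTENSIONS and a local DMA lkey.)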
	 */
	rc = ib_query_device(ia->ri_id->device, &devattr);
	if (rc) {
		dprintk("RPC: %s: ib_query_device failed %d\n",
			__func__, rc);
		goto out2;
	}

	if (devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) {
		ia->ri_have_dma_lkey = 1;
		ia->ri_dma_lkey = ia->ri_id->device->local_dma_lkey;
	}

	switch (memreg) {
	case RPCRDMA_MEMWINDOWS:
	case RPCRDMA_MEMWINDOWS_ASYNC:
		if (!(devattr.device_cap_flags & IB_DEVICE_MEM_WINDOW)) {
			dprintk("RPC: %s: MEMWINDOWS registration "
				"specified but not supported by adapter, "
				"using slower RPCRDMA_REGISTER\n",
				__func__);
			memreg = RPCRDMA_REGISTER;
		}
		break;
	case RPCRDMA_MTHCAFMR:
		if (!ia->ri_id->device->alloc_fmr) {
#if RPCRDMA_PERSISTENT_REGISTRATION
			dprintk("RPC: %s: MTHCAFMR registration "
				"specified but not supported by adapter, "
				"using riskier RPCRDMA_ALLPHYSICAL\n",
				__func__);
			memreg = RPCRDMA_ALLPHYSICAL;
#else
			dprintk("RPC: %s: MTHCAFMR registration "
				"specified but not supported by adapter, "
				"using slower RPCRDMA_REGISTER\n",
				__func__);
			memreg = RPCRDMA_REGISTER;
#endif
		}
		break;
	case RPCRDMA_FRMR:
		/* Requires both frmr reg and local dma lkey */
		if ((devattr.device_cap_flags &
		     (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) !=
		    (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) {
#if RPCRDMA_PERSISTENT_REGISTRATION
			dprintk("RPC: %s: FRMR registration "
				"specified but not supported by adapter, "
				"using riskier RPCRDMA_ALLPHYSICAL\n",
				__func__);
			memreg = RPCRDMA_ALLPHYSICAL;
#else
			dprintk("RPC: %s: FRMR registration "
				"specified but not supported by adapter, "
				"using slower RPCRDMA_REGISTER\n",
				__func__);
			memreg = RPCRDMA_REGISTER;
#endif
		}
		break;
	}

	/*
	 * Optionally obtain an underlying physical identity mapping in
	 * order to do a memory window-based bind. This base registration
	 * is protected from remote access - that is enabled only by binding
	 * for the specific bytes targeted during each RPC operation, and
	 * revoked after the corresponding completion similar to a storage
	 * adapter.
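	 *
	 * The register_setup path below is taken by the MEMWINDOWS modes,
	 * by MTHCAFMR when no local DMA lkey is available, and (when
	 * persistent registration is compiled in) by ALLPHYSICAL.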
	 */
	switch (memreg) {
	case RPCRDMA_BOUNCEBUFFERS:
	case RPCRDMA_REGISTER:
	case RPCRDMA_FRMR:
		break;
#if RPCRDMA_PERSISTENT_REGISTRATION
	case RPCRDMA_ALLPHYSICAL:
		mem_priv = IB_ACCESS_LOCAL_WRITE |
				IB_ACCESS_REMOTE_WRITE |
				IB_ACCESS_REMOTE_READ;
		goto register_setup;
#endif
	case RPCRDMA_MEMWINDOWS_ASYNC:
	case RPCRDMA_MEMWINDOWS:
		mem_priv = IB_ACCESS_LOCAL_WRITE |
				IB_ACCESS_MW_BIND;
		goto register_setup;
	case RPCRDMA_MTHCAFMR:
		if (ia->ri_have_dma_lkey)
			break;
		mem_priv = IB_ACCESS_LOCAL_WRITE;
	register_setup:
		ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv);
		if (IS_ERR(ia->ri_bind_mem)) {
			printk(KERN_ALERT "%s: ib_get_dma_mr for "
				"phys register failed with %lX\n\t"
				"Will continue with degraded performance\n",
				__func__, PTR_ERR(ia->ri_bind_mem));
			memreg = RPCRDMA_REGISTER;
			ia->ri_bind_mem = NULL;
		}
		break;
	default:
		printk(KERN_ERR "%s: invalid memory registration mode %d\n",
				__func__, memreg);
		rc = -EINVAL;
		goto out2;
	}
	dprintk("RPC: %s: memory registration strategy is %d\n",
		__func__, memreg);

	/* Else will do memory reg/dereg for each chunk */
	ia->ri_memreg_strategy = memreg;

	return 0;
out2:
	rdma_destroy_id(ia->ri_id);
	ia->ri_id = NULL;
out1:
	return rc;
}

/*
 * Clean up/close an IA.
 *   o if event handles and PD have been initialized, free them.
 *   o close the IA
 */
void
rpcrdma_ia_close(struct rpcrdma_ia *ia)
{
	int rc;

	dprintk("RPC: %s: entering\n", __func__);
	if (ia->ri_bind_mem != NULL) {
		rc = ib_dereg_mr(ia->ri_bind_mem);
		dprintk("RPC: %s: ib_dereg_mr returned %i\n",
			__func__, rc);
	}
	if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) {
		if (ia->ri_id->qp)
			rdma_destroy_qp(ia->ri_id);
		rdma_destroy_id(ia->ri_id);
		ia->ri_id = NULL;
	}
	if (ia->ri_pd != NULL && !IS_ERR(ia->ri_pd)) {
		rc = ib_dealloc_pd(ia->ri_pd);
		dprintk("RPC: %s: ib_dealloc_pd returned %i\n",
			__func__, rc);
	}
}

/*
 * Create unconnected endpoint.
 */
int
rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
				struct rpcrdma_create_data_internal *cdata)
{
	struct ib_device_attr devattr;
	int rc, err;

	rc = ib_query_device(ia->ri_id->device, &devattr);
	if (rc) {
		dprintk("RPC: %s: ib_query_device failed %d\n",
			__func__, rc);
		return rc;
	}

	/* check provider's send/recv wr limits */
	if (cdata->max_requests > devattr.max_qp_wr)
		cdata->max_requests = devattr.max_qp_wr;

	ep->rep_attr.event_handler = rpcrdma_qp_async_error_upcall;
	ep->rep_attr.qp_context = ep;
	/* send_cq and recv_cq initialized below */
	ep->rep_attr.srq = NULL;
	ep->rep_attr.cap.max_send_wr = cdata->max_requests;
	switch (ia->ri_memreg_strategy) {
	case RPCRDMA_FRMR:
		/* Add room for frmr register and invalidate WRs.
		 * 1. FRMR reg WR for head
		 * 2. FRMR invalidate WR for head
		 * 3. FRMR reg WR for pagelist
		 * 4. FRMR invalidate WR for pagelist
		 * 5. FRMR reg WR for tail
		 * 6. FRMR invalidate WR for tail
		 * 7. The RDMA_SEND WR
		 */
		ep->rep_attr.cap.max_send_wr *= 7;
		if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr) {
			cdata->max_requests = devattr.max_qp_wr / 7;
			if (!cdata->max_requests)
				return -EINVAL;
			ep->rep_attr.cap.max_send_wr = cdata->max_requests * 7;
		}
		break;
	case RPCRDMA_MEMWINDOWS_ASYNC:
	case RPCRDMA_MEMWINDOWS:
		/* Add room for mw_binds+unbinds - overkill! */
		ep->rep_attr.cap.max_send_wr++;
		ep->rep_attr.cap.max_send_wr *= (2 * RPCRDMA_MAX_SEGS);
		if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr)
			return -EINVAL;
		break;
	default:
		break;
	}
	ep->rep_attr.cap.max_recv_wr = cdata->max_requests;
	ep->rep_attr.cap.max_send_sge = (cdata->padding ? 4 : 2);
	ep->rep_attr.cap.max_recv_sge = 1;
	ep->rep_attr.cap.max_inline_data = 0;
	ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
	ep->rep_attr.qp_type = IB_QPT_RC;
	ep->rep_attr.port_num = ~0;

	dprintk("RPC: %s: requested max: dtos: send %d recv %d; "
		"iovs: send %d recv %d\n",
		__func__,
		ep->rep_attr.cap.max_send_wr,
		ep->rep_attr.cap.max_recv_wr,
		ep->rep_attr.cap.max_send_sge,
		ep->rep_attr.cap.max_recv_sge);

	/* set trigger for requesting send completion */
	ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 /* - 1*/;
	switch (ia->ri_memreg_strategy) {
	case RPCRDMA_MEMWINDOWS_ASYNC:
	case RPCRDMA_MEMWINDOWS:
		ep->rep_cqinit -= RPCRDMA_MAX_SEGS;
		break;
	default:
		break;
	}
	if (ep->rep_cqinit <= 2)
		ep->rep_cqinit = 0;
	INIT_CQCOUNT(ep);
	ep->rep_ia = ia;
	init_waitqueue_head(&ep->rep_connect_wait);

	/*
	 * Create a single cq for receive dto and mw_bind (only ever
	 * care about unbind, really). Send completions are suppressed.
	 * Use single threaded tasklet upcalls to maintain ordering.
	 */
	ep->rep_cq = ib_create_cq(ia->ri_id->device, rpcrdma_cq_event_upcall,
				  rpcrdma_cq_async_error_upcall, NULL,
				  ep->rep_attr.cap.max_recv_wr +
				  ep->rep_attr.cap.max_send_wr + 1, 0);
	if (IS_ERR(ep->rep_cq)) {
		rc = PTR_ERR(ep->rep_cq);
		dprintk("RPC: %s: ib_create_cq failed: %i\n",
			__func__, rc);
		goto out1;
	}

	rc = ib_req_notify_cq(ep->rep_cq, IB_CQ_NEXT_COMP);
	if (rc) {
		dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
			__func__, rc);
		goto out2;
	}

	ep->rep_attr.send_cq = ep->rep_cq;
	ep->rep_attr.recv_cq = ep->rep_cq;

	/* Initialize cma parameters */

	/* RPC/RDMA does not use private data */
	ep->rep_remote_cma.private_data = NULL;
	ep->rep_remote_cma.private_data_len = 0;

	/* Client offers RDMA Read but does not initiate */
	ep->rep_remote_cma.initiator_depth = 0;
	if (ia->ri_memreg_strategy == RPCRDMA_BOUNCEBUFFERS)
		ep->rep_remote_cma.responder_resources = 0;
	else if (devattr.max_qp_rd_atom > 32)	/* arbitrary but <= 255 */
		ep->rep_remote_cma.responder_resources = 32;
	else
		ep->rep_remote_cma.responder_resources = devattr.max_qp_rd_atom;

	ep->rep_remote_cma.retry_count = 7;
	ep->rep_remote_cma.flow_control = 0;
	ep->rep_remote_cma.rnr_retry_count = 0;

	return 0;

out2:
	err = ib_destroy_cq(ep->rep_cq);
	if (err)
		dprintk("RPC: %s: ib_destroy_cq returned %i\n",
			__func__, err);
out1:
	return rc;
}

/*
 * rpcrdma_ep_destroy
 *
 * Disconnect and destroy endpoint. After this, the only
 * valid operations on the ep are to free it (if dynamically
 * allocated) or re-create it.
 *
 * The caller's error handling must be sure to not leak the endpoint
 * if this function fails.
 */
int
rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
{
	int rc;

	dprintk("RPC: %s: entering, connected is %d\n",
		__func__, ep->rep_connected);

	if (ia->ri_id->qp) {
		rc = rpcrdma_ep_disconnect(ep, ia);
		if (rc)
			dprintk("RPC: %s: rpcrdma_ep_disconnect"
				" returned %i\n", __func__, rc);
		rdma_destroy_qp(ia->ri_id);
		ia->ri_id->qp = NULL;
	}

	/* padding - could be done in rpcrdma_buffer_destroy... */
	if (ep->rep_pad_mr) {
		rpcrdma_deregister_internal(ia, ep->rep_pad_mr, &ep->rep_pad);
		ep->rep_pad_mr = NULL;
	}

	rpcrdma_clean_cq(ep->rep_cq);
	rc = ib_destroy_cq(ep->rep_cq);
	if (rc)
		dprintk("RPC: %s: ib_destroy_cq returned %i\n",
			__func__, rc);

	return rc;
}

/*
 * Connect unconnected endpoint.
 */
int
rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
{
	struct rdma_cm_id *id;
	int rc = 0;
	int retry_count = 0;

	if (ep->rep_connected != 0) {
		struct rpcrdma_xprt *xprt;
retry:
		rc = rpcrdma_ep_disconnect(ep, ia);
		if (rc && rc != -ENOTCONN)
			dprintk("RPC: %s: rpcrdma_ep_disconnect"
				" status %i\n", __func__, rc);
		rpcrdma_clean_cq(ep->rep_cq);

		xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
		id = rpcrdma_create_id(xprt, ia,
				(struct sockaddr *)&xprt->rx_data.addr);
		if (IS_ERR(id)) {
			rc = PTR_ERR(id);
			goto out;
		}
		/* TEMP TEMP TEMP - fail if new device:
		 * Deregister/remarshal *all* requests!
		 * Close and recreate adapter, pd, etc!
		 * Re-determine all attributes still sane!
		 * More stuff I haven't thought of!
		 * Rrrgh!
		 */
		if (ia->ri_id->device != id->device) {
			printk("RPC: %s: can't reconnect on "
				"different device!\n", __func__);
			rdma_destroy_id(id);
			rc = -ENETDOWN;
			goto out;
		}
		/* END TEMP */
		rdma_destroy_qp(ia->ri_id);
		rdma_destroy_id(ia->ri_id);
		ia->ri_id = id;
	}

	rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr);
	if (rc) {
		dprintk("RPC: %s: rdma_create_qp failed %i\n",
			__func__, rc);
		goto out;
	}

	/* XXX Tavor device performs badly with 2K MTU! */
	if (strnicmp(ia->ri_id->device->dma_device->bus->name, "pci", 3) == 0) {
		struct pci_dev *pcid = to_pci_dev(ia->ri_id->device->dma_device);
		if (pcid->device == PCI_DEVICE_ID_MELLANOX_TAVOR &&
		    (pcid->vendor == PCI_VENDOR_ID_MELLANOX ||
		     pcid->vendor == PCI_VENDOR_ID_TOPSPIN)) {
			struct ib_qp_attr attr = {
				.path_mtu = IB_MTU_1024
			};
			rc = ib_modify_qp(ia->ri_id->qp, &attr, IB_QP_PATH_MTU);
		}
	}

	ep->rep_connected = 0;

	rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma);
	if (rc) {
		dprintk("RPC: %s: rdma_connect() failed with %i\n",
				__func__, rc);
		goto out;
	}

	wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0);

	/*
	 * Check state. A non-peer reject indicates no listener
	 * (ECONNREFUSED), which may be a transient state. All
	 * others indicate a transport condition which has already
	 * undergone a best-effort.
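	 *
	 * A rejected connect is retried up to RDMA_CONNECT_RETRY_MAX times;
	 * other failures get one more attempt below with ORD and IRD forced
	 * to matching nonzero values, which some peer CMs require.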
	 */
	if (ep->rep_connected == -ECONNREFUSED &&
	    ++retry_count <= RDMA_CONNECT_RETRY_MAX) {
		dprintk("RPC: %s: non-peer_reject, retry\n", __func__);
		goto retry;
	}
	if (ep->rep_connected <= 0) {
		/* Sometimes, the only way to reliably connect to remote
		 * CMs is to use same nonzero values for ORD and IRD. */
		if (retry_count++ <= RDMA_CONNECT_RETRY_MAX + 1 &&
		    (ep->rep_remote_cma.responder_resources == 0 ||
		     ep->rep_remote_cma.initiator_depth !=
				ep->rep_remote_cma.responder_resources)) {
			if (ep->rep_remote_cma.responder_resources == 0)
				ep->rep_remote_cma.responder_resources = 1;
			ep->rep_remote_cma.initiator_depth =
				ep->rep_remote_cma.responder_resources;
			goto retry;
		}
		rc = ep->rep_connected;
	} else {
		dprintk("RPC: %s: connected\n", __func__);
	}

out:
	if (rc)
		ep->rep_connected = rc;
	return rc;
}

/*
 * rpcrdma_ep_disconnect
 *
 * This is separate from destroy to facilitate the ability
 * to reconnect without recreating the endpoint.
 *
 * This call is not reentrant, and must not be made in parallel
 * on the same endpoint.
 */
int
rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
{
	int rc;

	rpcrdma_clean_cq(ep->rep_cq);
	rc = rdma_disconnect(ia->ri_id);
	if (!rc) {
		/* returns without wait if not connected */
		wait_event_interruptible(ep->rep_connect_wait,
							ep->rep_connected != 1);
		dprintk("RPC: %s: after wait, %sconnected\n", __func__,
			(ep->rep_connected == 1) ? "still " : "dis");
	} else {
		dprintk("RPC: %s: rdma_disconnect %i\n", __func__, rc);
		ep->rep_connected = rc;
	}
	return rc;
}

/*
 * Initialize buffer memory
 */
int
rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
	struct rpcrdma_ia *ia, struct rpcrdma_create_data_internal *cdata)
{
	char *p;
	size_t len;
	int i, rc;
	struct rpcrdma_mw *r;

	buf->rb_max_requests = cdata->max_requests;
	spin_lock_init(&buf->rb_lock);
	atomic_set(&buf->rb_credits, 1);

	/* Need to allocate:
	 *   1. arrays for send and recv pointers
	 *   2. arrays of struct rpcrdma_req to fill in pointers
	 *   3. array of struct rpcrdma_rep for replies
	 *   4. padding, if any
	 *   5. mw's, fmr's or frmr's, if any
	 * Send/recv buffers in req/rep need to be registered
	 */

	len = buf->rb_max_requests *
		(sizeof(struct rpcrdma_req *) + sizeof(struct rpcrdma_rep *));
	len += cdata->padding;
	switch (ia->ri_memreg_strategy) {
	case RPCRDMA_FRMR:
		len += buf->rb_max_requests * RPCRDMA_MAX_SEGS *
				sizeof(struct rpcrdma_mw);
		break;
	case RPCRDMA_MTHCAFMR:
		/* TBD we are perhaps overallocating here */
		len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS *
				sizeof(struct rpcrdma_mw);
		break;
	case RPCRDMA_MEMWINDOWS_ASYNC:
	case RPCRDMA_MEMWINDOWS:
		len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS *
				sizeof(struct rpcrdma_mw);
		break;
	default:
		break;
	}

	/* allocate 1, 4 and 5 in one shot */
	p = kzalloc(len, GFP_KERNEL);
	if (p == NULL) {
		dprintk("RPC: %s: req_t/rep_t/pad kzalloc(%zd) failed\n",
			__func__, len);
		rc = -ENOMEM;
		goto out;
	}
	buf->rb_pool = p;	/* for freeing it later */

	buf->rb_send_bufs = (struct rpcrdma_req **) p;
	p = (char *) &buf->rb_send_bufs[buf->rb_max_requests];
	buf->rb_recv_bufs = (struct rpcrdma_rep **) p;
	p = (char *) &buf->rb_recv_bufs[buf->rb_max_requests];

	/*
	 * Register the zeroed pad buffer, if any.
	 */
	if (cdata->padding) {
		rc = rpcrdma_register_internal(ia, p, cdata->padding,
					    &ep->rep_pad_mr, &ep->rep_pad);
		if (rc)
			goto out;
	}
	p += cdata->padding;

	/*
	 * Allocate the fmr's, or mw's for mw_bind chunk registration.
	 * We "cycle" the mw's in order to minimize rkey reuse,
	 * and also reduce unbind-to-bind collision.
	 */
	INIT_LIST_HEAD(&buf->rb_mws);
	r = (struct rpcrdma_mw *)p;
	switch (ia->ri_memreg_strategy) {
	case RPCRDMA_FRMR:
		for (i = buf->rb_max_requests * RPCRDMA_MAX_SEGS; i; i--) {
			r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
							 RPCRDMA_MAX_SEGS);
			if (IS_ERR(r->r.frmr.fr_mr)) {
				rc = PTR_ERR(r->r.frmr.fr_mr);
				dprintk("RPC: %s: ib_alloc_fast_reg_mr"
					" failed %i\n", __func__, rc);
				goto out;
			}
			r->r.frmr.fr_pgl =
				ib_alloc_fast_reg_page_list(ia->ri_id->device,
							    RPCRDMA_MAX_SEGS);
			if (IS_ERR(r->r.frmr.fr_pgl)) {
				rc = PTR_ERR(r->r.frmr.fr_pgl);
				dprintk("RPC: %s: "
					"ib_alloc_fast_reg_page_list "
					"failed %i\n", __func__, rc);
				goto out;
			}
			list_add(&r->mw_list, &buf->rb_mws);
			++r;
		}
		break;
	case RPCRDMA_MTHCAFMR:
		/* TBD we are perhaps overallocating here */
		for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) {
			static struct ib_fmr_attr fa =
				{ RPCRDMA_MAX_DATA_SEGS, 1, PAGE_SHIFT };
			r->r.fmr = ib_alloc_fmr(ia->ri_pd,
				IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ,
				&fa);
			if (IS_ERR(r->r.fmr)) {
				rc = PTR_ERR(r->r.fmr);
				dprintk("RPC: %s: ib_alloc_fmr"
					" failed %i\n", __func__, rc);
				goto out;
			}
			list_add(&r->mw_list, &buf->rb_mws);
			++r;
		}
		break;
	case RPCRDMA_MEMWINDOWS_ASYNC:
	case RPCRDMA_MEMWINDOWS:
		/* Allocate one extra request's worth, for full cycling */
		for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) {
			r->r.mw = ib_alloc_mw(ia->ri_pd);
			if (IS_ERR(r->r.mw)) {
				rc = PTR_ERR(r->r.mw);
				dprintk("RPC: %s: ib_alloc_mw"
					" failed %i\n", __func__, rc);
				goto out;
			}
			list_add(&r->mw_list, &buf->rb_mws);
			++r;
		}
		break;
	default:
		break;
	}

	/*
	 * Allocate/init the request/reply buffers. Doing this
	 * using kmalloc for now -- one for each buf.
	 */
	for (i = 0; i < buf->rb_max_requests; i++) {
		struct rpcrdma_req *req;
		struct rpcrdma_rep *rep;

		len = cdata->inline_wsize + sizeof(struct rpcrdma_req);
		/* RPC layer requests *double* size + 1K RPC_SLACK_SPACE! */
		/* Typical ~2400b, so rounding up saves work later */
		if (len < 4096)
			len = 4096;
		req = kmalloc(len, GFP_KERNEL);
		if (req == NULL) {
			dprintk("RPC: %s: request buffer %d alloc"
				" failed\n", __func__, i);
			rc = -ENOMEM;
			goto out;
		}
		memset(req, 0, sizeof(struct rpcrdma_req));
		buf->rb_send_bufs[i] = req;
		buf->rb_send_bufs[i]->rl_buffer = buf;

		rc = rpcrdma_register_internal(ia, req->rl_base,
				len - offsetof(struct rpcrdma_req, rl_base),
				&buf->rb_send_bufs[i]->rl_handle,
				&buf->rb_send_bufs[i]->rl_iov);
		if (rc)
			goto out;

		buf->rb_send_bufs[i]->rl_size = len-sizeof(struct rpcrdma_req);

		len = cdata->inline_rsize + sizeof(struct rpcrdma_rep);
		rep = kmalloc(len, GFP_KERNEL);
		if (rep == NULL) {
			dprintk("RPC: %s: reply buffer %d alloc failed\n",
				__func__, i);
			rc = -ENOMEM;
			goto out;
		}
		memset(rep, 0, sizeof(struct rpcrdma_rep));
		buf->rb_recv_bufs[i] = rep;
		buf->rb_recv_bufs[i]->rr_buffer = buf;
		init_waitqueue_head(&rep->rr_unbind);

		rc = rpcrdma_register_internal(ia, rep->rr_base,
				len - offsetof(struct rpcrdma_rep, rr_base),
				&buf->rb_recv_bufs[i]->rr_handle,
				&buf->rb_recv_bufs[i]->rr_iov);
		if (rc)
			goto out;

	}
	dprintk("RPC: %s: max_requests %d\n",
		__func__, buf->rb_max_requests);
	/* done */
	return 0;
out:
	rpcrdma_buffer_destroy(buf);
	return rc;
}

/*
 * Unregister and destroy buffer memory. Need to deal with
 * partial initialization, so it's callable from failed create.
 * Must be called before destroying endpoint, as registrations
 * reference it.
 */
void
rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
{
	int rc, i;
	struct rpcrdma_ia *ia = rdmab_to_ia(buf);
	struct rpcrdma_mw *r;

	/* clean up in reverse order from create
	 *   1.  recv mr memory (mr free, then kfree)
	 *   1a. bind mw memory
	 *   2.  send mr memory (mr free, then kfree)
	 *   3.  padding (if any) [moved to rpcrdma_ep_destroy]
	 *   4.  arrays
	 */
	dprintk("RPC: %s: entering\n", __func__);

	for (i = 0; i < buf->rb_max_requests; i++) {
		if (buf->rb_recv_bufs && buf->rb_recv_bufs[i]) {
			rpcrdma_deregister_internal(ia,
					buf->rb_recv_bufs[i]->rr_handle,
					&buf->rb_recv_bufs[i]->rr_iov);
			kfree(buf->rb_recv_bufs[i]);
		}
		if (buf->rb_send_bufs && buf->rb_send_bufs[i]) {
			while (!list_empty(&buf->rb_mws)) {
				r = list_entry(buf->rb_mws.next,
					struct rpcrdma_mw, mw_list);
				list_del(&r->mw_list);
				switch (ia->ri_memreg_strategy) {
				case RPCRDMA_FRMR:
					rc = ib_dereg_mr(r->r.frmr.fr_mr);
					if (rc)
						dprintk("RPC: %s:"
							" ib_dereg_mr"
							" failed %i\n",
							__func__, rc);
					ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
					break;
				case RPCRDMA_MTHCAFMR:
					rc = ib_dealloc_fmr(r->r.fmr);
					if (rc)
						dprintk("RPC: %s:"
							" ib_dealloc_fmr"
							" failed %i\n",
							__func__, rc);
					break;
				case RPCRDMA_MEMWINDOWS_ASYNC:
				case RPCRDMA_MEMWINDOWS:
					rc = ib_dealloc_mw(r->r.mw);
					if (rc)
						dprintk("RPC: %s:"
							" ib_dealloc_mw"
							" failed %i\n",
							__func__, rc);
					break;
				default:
					break;
				}
			}
			rpcrdma_deregister_internal(ia,
					buf->rb_send_bufs[i]->rl_handle,
					&buf->rb_send_bufs[i]->rl_iov);
			kfree(buf->rb_send_bufs[i]);
		}
	}

	kfree(buf->rb_pool);
}

/*
 * Get a set of request/reply buffers.
 *
 * Reply buffer (if needed) is attached to send buffer upon return.
 * Rule:
 *    rb_send_index and rb_recv_index MUST always be pointing to the
 *    *next* available buffer (non-NULL). They are incremented after
 *    removing buffers, and decremented *before* returning them.
 */
struct rpcrdma_req *
rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
{
	struct rpcrdma_req *req;
	unsigned long flags;
	int i;
	struct rpcrdma_mw *r;

	spin_lock_irqsave(&buffers->rb_lock, flags);
	if (buffers->rb_send_index == buffers->rb_max_requests) {
		spin_unlock_irqrestore(&buffers->rb_lock, flags);
		dprintk("RPC: %s: out of request buffers\n", __func__);
		return ((struct rpcrdma_req *)NULL);
	}

	req = buffers->rb_send_bufs[buffers->rb_send_index];
	if (buffers->rb_send_index < buffers->rb_recv_index) {
		dprintk("RPC: %s: %d extra receives outstanding (ok)\n",
			__func__,
			buffers->rb_recv_index - buffers->rb_send_index);
		req->rl_reply = NULL;
	} else {
		req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
		buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
	}
	buffers->rb_send_bufs[buffers->rb_send_index++] = NULL;
	if (!list_empty(&buffers->rb_mws)) {
		i = RPCRDMA_MAX_SEGS - 1;
		do {
			r = list_entry(buffers->rb_mws.next,
					struct rpcrdma_mw, mw_list);
			list_del(&r->mw_list);
			req->rl_segments[i].mr_chunk.rl_mw = r;
		} while (--i >= 0);
	}
	spin_unlock_irqrestore(&buffers->rb_lock, flags);
	return req;
}

/*
 * Put request/reply buffers back into pool.
 * Pre-decrement counter/array index.
 */
void
rpcrdma_buffer_put(struct rpcrdma_req *req)
{
	struct rpcrdma_buffer *buffers = req->rl_buffer;
	struct rpcrdma_ia *ia = rdmab_to_ia(buffers);
	int i;
	unsigned long flags;

	BUG_ON(req->rl_nchunks != 0);
	spin_lock_irqsave(&buffers->rb_lock, flags);
	buffers->rb_send_bufs[--buffers->rb_send_index] = req;
	req->rl_niovs = 0;
	if (req->rl_reply) {
		buffers->rb_recv_bufs[--buffers->rb_recv_index] = req->rl_reply;
		init_waitqueue_head(&req->rl_reply->rr_unbind);
		req->rl_reply->rr_func = NULL;
		req->rl_reply = NULL;
	}
	switch (ia->ri_memreg_strategy) {
	case RPCRDMA_FRMR:
	case RPCRDMA_MTHCAFMR:
	case RPCRDMA_MEMWINDOWS_ASYNC:
	case RPCRDMA_MEMWINDOWS:
		/*
		 * Cycle mw's back in reverse order, and "spin" them.
		 * This delays and scrambles reuse as much as possible.
		 */
		i = 1;
		do {
			struct rpcrdma_mw **mw;
			mw = &req->rl_segments[i].mr_chunk.rl_mw;
			list_add_tail(&(*mw)->mw_list, &buffers->rb_mws);
			*mw = NULL;
		} while (++i < RPCRDMA_MAX_SEGS);
		list_add_tail(&req->rl_segments[0].mr_chunk.rl_mw->mw_list,
					&buffers->rb_mws);
		req->rl_segments[0].mr_chunk.rl_mw = NULL;
		break;
	default:
		break;
	}
	spin_unlock_irqrestore(&buffers->rb_lock, flags);
}

/*
 * Recover reply buffers from pool.
 * This happens when recovering from error conditions.
 * Post-increment counter/array index.
 */
void
rpcrdma_recv_buffer_get(struct rpcrdma_req *req)
{
	struct rpcrdma_buffer *buffers = req->rl_buffer;
	unsigned long flags;

	if (req->rl_iov.length == 0)	/* special case xprt_rdma_allocate() */
		buffers = ((struct rpcrdma_req *) buffers)->rl_buffer;
	spin_lock_irqsave(&buffers->rb_lock, flags);
	if (buffers->rb_recv_index < buffers->rb_max_requests) {
		req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
		buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
	}
	spin_unlock_irqrestore(&buffers->rb_lock, flags);
}

/*
 * Put reply buffers back into pool when not attached to
 * request. This happens in error conditions, and when
 * aborting unbinds. Pre-decrement counter/array index.
 */
void
rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
{
	struct rpcrdma_buffer *buffers = rep->rr_buffer;
	unsigned long flags;

	rep->rr_func = NULL;
	spin_lock_irqsave(&buffers->rb_lock, flags);
	buffers->rb_recv_bufs[--buffers->rb_recv_index] = rep;
	spin_unlock_irqrestore(&buffers->rb_lock, flags);
}

/*
 * Wrappers for internal-use kmalloc memory registration, used by buffer code.
 */

int
rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len,
				struct ib_mr **mrp, struct ib_sge *iov)
{
	struct ib_phys_buf ipb;
	struct ib_mr *mr;
	int rc;

	/*
	 * All memory passed here was kmalloc'ed, therefore phys-contiguous.
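	 *
	 * When the device supplies a local DMA lkey, or when a persistent
	 * "bind" MR already exists, no per-buffer MR is created; only
	 * otherwise is ib_reg_phys_mr() used to cover this one buffer.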
	 */
	iov->addr = ib_dma_map_single(ia->ri_id->device,
			va, len, DMA_BIDIRECTIONAL);
	iov->length = len;

	if (ia->ri_have_dma_lkey) {
		*mrp = NULL;
		iov->lkey = ia->ri_dma_lkey;
		return 0;
	} else if (ia->ri_bind_mem != NULL) {
		*mrp = NULL;
		iov->lkey = ia->ri_bind_mem->lkey;
		return 0;
	}

	ipb.addr = iov->addr;
	ipb.size = iov->length;
	mr = ib_reg_phys_mr(ia->ri_pd, &ipb, 1,
			IB_ACCESS_LOCAL_WRITE, &iov->addr);

	dprintk("RPC: %s: phys convert: 0x%llx "
			"registered 0x%llx length %d\n",
			__func__, (unsigned long long)ipb.addr,
			(unsigned long long)iov->addr, len);

	if (IS_ERR(mr)) {
		*mrp = NULL;
		rc = PTR_ERR(mr);
		dprintk("RPC: %s: failed with %i\n", __func__, rc);
	} else {
		*mrp = mr;
		iov->lkey = mr->lkey;
		rc = 0;
	}

	return rc;
}

int
rpcrdma_deregister_internal(struct rpcrdma_ia *ia,
				struct ib_mr *mr, struct ib_sge *iov)
{
	int rc;

	ib_dma_unmap_single(ia->ri_id->device,
			iov->addr, iov->length, DMA_BIDIRECTIONAL);

	if (NULL == mr)
		return 0;

	rc = ib_dereg_mr(mr);
	if (rc)
		dprintk("RPC: %s: ib_dereg_mr failed %i\n", __func__, rc);
	return rc;
}

/*
 * Wrappers for chunk registration, shared by read/write chunk code.
 */

static void
rpcrdma_map_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg, int writing)
{
	seg->mr_dir = writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
	seg->mr_dmalen = seg->mr_len;
	if (seg->mr_page)
		seg->mr_dma = ib_dma_map_page(ia->ri_id->device,
				seg->mr_page, offset_in_page(seg->mr_offset),
				seg->mr_dmalen, seg->mr_dir);
	else
		seg->mr_dma = ib_dma_map_single(ia->ri_id->device,
				seg->mr_offset,
				seg->mr_dmalen, seg->mr_dir);
	if (ib_dma_mapping_error(ia->ri_id->device, seg->mr_dma)) {
		dprintk("RPC: %s: mr_dma %llx mr_offset %p mr_dma_len %zu\n",
			__func__,
			(unsigned long long)seg->mr_dma,
			seg->mr_offset, seg->mr_dmalen);
	}
}

static void
rpcrdma_unmap_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg)
{
	if (seg->mr_page)
		ib_dma_unmap_page(ia->ri_id->device,
				seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
	else
		ib_dma_unmap_single(ia->ri_id->device,
				seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
}

static int
rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg,
			int *nsegs, int writing, struct rpcrdma_ia *ia,
			struct rpcrdma_xprt *r_xprt)
{
	struct rpcrdma_mr_seg *seg1 = seg;
	struct ib_send_wr invalidate_wr, frmr_wr, *bad_wr, *post_wr;

	u8 key;
	int len, pageoff;
	int i, rc;

	pageoff = offset_in_page(seg1->mr_offset);
	seg1->mr_offset -= pageoff;	/* start of page */
	seg1->mr_len += pageoff;
	len = -pageoff;
	if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
		*nsegs = RPCRDMA_MAX_DATA_SEGS;
	for (i = 0; i < *nsegs;) {
		rpcrdma_map_one(ia, seg, writing);
		seg1->mr_chunk.rl_mw->r.frmr.fr_pgl->page_list[i] = seg->mr_dma;
		len += seg->mr_len;
		BUG_ON(seg->mr_len > PAGE_SIZE);
		++seg;
		++i;
		/* Check for holes */
		if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
		    offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
			break;
	}
	dprintk("RPC: %s: Using frmr %p to map %d segments\n",
		__func__, seg1->mr_chunk.rl_mw, i);

	if (unlikely(seg1->mr_chunk.rl_mw->r.frmr.state == FRMR_IS_VALID)) {
		dprintk("RPC: %s: frmr %x left valid, posting invalidate.\n",
			__func__,
			seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey);
		/* Invalidate before using. */
		memset(&invalidate_wr, 0, sizeof invalidate_wr);
		invalidate_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw;
		invalidate_wr.next = &frmr_wr;
		invalidate_wr.opcode = IB_WR_LOCAL_INV;
		invalidate_wr.send_flags = IB_SEND_SIGNALED;
		invalidate_wr.ex.invalidate_rkey =
			seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
		DECR_CQCOUNT(&r_xprt->rx_ep);
		post_wr = &invalidate_wr;
	} else
		post_wr = &frmr_wr;

	/* Bump the key */
	key = (u8)(seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey & 0x000000FF);
	ib_update_fast_reg_key(seg1->mr_chunk.rl_mw->r.frmr.fr_mr, ++key);

	/* Prepare FRMR WR */
	memset(&frmr_wr, 0, sizeof frmr_wr);
	frmr_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw;
	frmr_wr.opcode = IB_WR_FAST_REG_MR;
	frmr_wr.send_flags = IB_SEND_SIGNALED;
	frmr_wr.wr.fast_reg.iova_start = seg1->mr_dma;
	frmr_wr.wr.fast_reg.page_list = seg1->mr_chunk.rl_mw->r.frmr.fr_pgl;
	frmr_wr.wr.fast_reg.page_list_len = i;
	frmr_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
	frmr_wr.wr.fast_reg.length = i << PAGE_SHIFT;
	BUG_ON(frmr_wr.wr.fast_reg.length < len);
	frmr_wr.wr.fast_reg.access_flags = (writing ?
				IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
				IB_ACCESS_REMOTE_READ);
	frmr_wr.wr.fast_reg.rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
	DECR_CQCOUNT(&r_xprt->rx_ep);

	rc = ib_post_send(ia->ri_id->qp, post_wr, &bad_wr);

	if (rc) {
		dprintk("RPC: %s: failed ib_post_send for register,"
			" status %i\n", __func__, rc);
		while (i--)
			rpcrdma_unmap_one(ia, --seg);
	} else {
		seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
		seg1->mr_base = seg1->mr_dma + pageoff;
		seg1->mr_nsegs = i;
		seg1->mr_len = len;
	}
	*nsegs = i;
	return rc;
}

static int
rpcrdma_deregister_frmr_external(struct rpcrdma_mr_seg *seg,
			struct rpcrdma_ia *ia, struct rpcrdma_xprt *r_xprt)
{
	struct rpcrdma_mr_seg *seg1 = seg;
	struct ib_send_wr invalidate_wr, *bad_wr;
	int rc;

	while (seg1->mr_nsegs--)
		rpcrdma_unmap_one(ia, seg++);

	memset(&invalidate_wr, 0, sizeof invalidate_wr);
	invalidate_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw;
	invalidate_wr.opcode = IB_WR_LOCAL_INV;
	invalidate_wr.send_flags = IB_SEND_SIGNALED;
	invalidate_wr.ex.invalidate_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
	DECR_CQCOUNT(&r_xprt->rx_ep);

	rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
	if (rc)
		dprintk("RPC: %s: failed ib_post_send for invalidate,"
			" status %i\n", __func__, rc);
	return rc;
}

static int
rpcrdma_register_fmr_external(struct rpcrdma_mr_seg *seg,
			int *nsegs, int writing, struct rpcrdma_ia *ia)
{
	struct rpcrdma_mr_seg *seg1 = seg;
	u64 physaddrs[RPCRDMA_MAX_DATA_SEGS];
	int len, pageoff, i, rc;

	pageoff = offset_in_page(seg1->mr_offset);
	seg1->mr_offset -= pageoff;	/* start of page */
	seg1->mr_len += pageoff;
	len = -pageoff;
	if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
		*nsegs = RPCRDMA_MAX_DATA_SEGS;
	for (i = 0; i < *nsegs;) {
		rpcrdma_map_one(ia, seg, writing);
		physaddrs[i] = seg->mr_dma;
		len += seg->mr_len;
		++seg;
		++i;
		/* Check for holes */
		if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
		    offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
			break;
	}
	rc = ib_map_phys_fmr(seg1->mr_chunk.rl_mw->r.fmr,
				physaddrs, i, seg1->mr_dma);
	if (rc) {
		dprintk("RPC: %s: failed ib_map_phys_fmr "
			"%u@0x%llx+%i (%d)... status %i\n", __func__,
			len, (unsigned long long)seg1->mr_dma,
			pageoff, i, rc);
		while (i--)
			rpcrdma_unmap_one(ia, --seg);
	} else {
		seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.fmr->rkey;
		seg1->mr_base = seg1->mr_dma + pageoff;
		seg1->mr_nsegs = i;
		seg1->mr_len = len;
	}
	*nsegs = i;
	return rc;
}

static int
rpcrdma_deregister_fmr_external(struct rpcrdma_mr_seg *seg,
			struct rpcrdma_ia *ia)
{
	struct rpcrdma_mr_seg *seg1 = seg;
	LIST_HEAD(l);
	int rc;

	list_add(&seg1->mr_chunk.rl_mw->r.fmr->list, &l);
	rc = ib_unmap_fmr(&l);
	while (seg1->mr_nsegs--)
		rpcrdma_unmap_one(ia, seg++);
	if (rc)
		dprintk("RPC: %s: failed ib_unmap_fmr,"
			" status %i\n", __func__, rc);
	return rc;
}

static int
rpcrdma_register_memwin_external(struct rpcrdma_mr_seg *seg,
			int *nsegs, int writing, struct rpcrdma_ia *ia,
			struct rpcrdma_xprt *r_xprt)
{
	int mem_priv = (writing ? IB_ACCESS_REMOTE_WRITE :
				  IB_ACCESS_REMOTE_READ);
	struct ib_mw_bind param;
	int rc;

	*nsegs = 1;
	rpcrdma_map_one(ia, seg, writing);
	param.mr = ia->ri_bind_mem;
	param.wr_id = 0ULL;	/* no send cookie */
	param.addr = seg->mr_dma;
	param.length = seg->mr_len;
	param.send_flags = 0;
	param.mw_access_flags = mem_priv;

	DECR_CQCOUNT(&r_xprt->rx_ep);
	rc = ib_bind_mw(ia->ri_id->qp, seg->mr_chunk.rl_mw->r.mw, &param);
	if (rc) {
		dprintk("RPC: %s: failed ib_bind_mw "
			"%u@0x%llx status %i\n",
			__func__, seg->mr_len,
			(unsigned long long)seg->mr_dma, rc);
		rpcrdma_unmap_one(ia, seg);
	} else {
		seg->mr_rkey = seg->mr_chunk.rl_mw->r.mw->rkey;
		seg->mr_base = param.addr;
		seg->mr_nsegs = 1;
	}
	return rc;
}

static int
rpcrdma_deregister_memwin_external(struct rpcrdma_mr_seg *seg,
			struct rpcrdma_ia *ia,
			struct rpcrdma_xprt *r_xprt, void **r)
{
	struct ib_mw_bind param;
	LIST_HEAD(l);
	int rc;

	BUG_ON(seg->mr_nsegs != 1);
	param.mr = ia->ri_bind_mem;
	param.addr = 0ULL;	/* unbind */
	param.length = 0;
	param.mw_access_flags = 0;
	if (*r) {
		param.wr_id = (u64) (unsigned long) *r;
		param.send_flags = IB_SEND_SIGNALED;
		INIT_CQCOUNT(&r_xprt->rx_ep);
	} else {
		param.wr_id = 0ULL;
		param.send_flags = 0;
		DECR_CQCOUNT(&r_xprt->rx_ep);
	}
	rc = ib_bind_mw(ia->ri_id->qp, seg->mr_chunk.rl_mw->r.mw, &param);
	rpcrdma_unmap_one(ia, seg);
	if (rc)
		dprintk("RPC: %s: failed ib_(un)bind_mw,"
			" status %i\n", __func__, rc);
	else
		*r = NULL;	/* will upcall on completion */
	return rc;
}

static int
rpcrdma_register_default_external(struct rpcrdma_mr_seg *seg,
			int *nsegs, int writing, struct rpcrdma_ia *ia)
{
	int mem_priv = (writing ? IB_ACCESS_REMOTE_WRITE :
				  IB_ACCESS_REMOTE_READ);
	struct rpcrdma_mr_seg *seg1 = seg;
	struct ib_phys_buf ipb[RPCRDMA_MAX_DATA_SEGS];
	int len, i, rc = 0;

	if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
		*nsegs = RPCRDMA_MAX_DATA_SEGS;
	for (len = 0, i = 0; i < *nsegs;) {
		rpcrdma_map_one(ia, seg, writing);
		ipb[i].addr = seg->mr_dma;
		ipb[i].size = seg->mr_len;
		len += seg->mr_len;
		++seg;
		++i;
		/* Check for holes */
		if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
		    offset_in_page((seg-1)->mr_offset+(seg-1)->mr_len))
			break;
	}
	seg1->mr_base = seg1->mr_dma;
	seg1->mr_chunk.rl_mr = ib_reg_phys_mr(ia->ri_pd,
				ipb, i, mem_priv, &seg1->mr_base);
	if (IS_ERR(seg1->mr_chunk.rl_mr)) {
		rc = PTR_ERR(seg1->mr_chunk.rl_mr);
		dprintk("RPC: %s: failed ib_reg_phys_mr "
			"%u@0x%llx (%d)... status %i\n",
			__func__, len,
			(unsigned long long)seg1->mr_dma, i, rc);
		while (i--)
			rpcrdma_unmap_one(ia, --seg);
	} else {
		seg1->mr_rkey = seg1->mr_chunk.rl_mr->rkey;
		seg1->mr_nsegs = i;
		seg1->mr_len = len;
	}
	*nsegs = i;
	return rc;
}

static int
rpcrdma_deregister_default_external(struct rpcrdma_mr_seg *seg,
			struct rpcrdma_ia *ia)
{
	struct rpcrdma_mr_seg *seg1 = seg;
	int rc;

	rc = ib_dereg_mr(seg1->mr_chunk.rl_mr);
	seg1->mr_chunk.rl_mr = NULL;
	while (seg1->mr_nsegs--)
		rpcrdma_unmap_one(ia, seg++);
	if (rc)
		dprintk("RPC: %s: failed ib_dereg_mr,"
			" status %i\n", __func__, rc);
	return rc;
}

int
rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
			int nsegs, int writing, struct rpcrdma_xprt *r_xprt)
{
	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
	int rc = 0;

	switch (ia->ri_memreg_strategy) {

#if RPCRDMA_PERSISTENT_REGISTRATION
	case RPCRDMA_ALLPHYSICAL:
		rpcrdma_map_one(ia, seg, writing);
		seg->mr_rkey = ia->ri_bind_mem->rkey;
		seg->mr_base = seg->mr_dma;
		seg->mr_nsegs = 1;
		nsegs = 1;
		break;
#endif

	/* Registration using frmr registration */
	case RPCRDMA_FRMR:
		rc = rpcrdma_register_frmr_external(seg, &nsegs, writing, ia, r_xprt);
		break;

	/* Registration using fmr memory registration */
	case RPCRDMA_MTHCAFMR:
		rc = rpcrdma_register_fmr_external(seg, &nsegs, writing, ia);
		break;

	/* Registration using memory windows */
	case RPCRDMA_MEMWINDOWS_ASYNC:
	case RPCRDMA_MEMWINDOWS:
		rc = rpcrdma_register_memwin_external(seg, &nsegs, writing, ia, r_xprt);
		break;

	/* Default registration each time */
	default:
		rc = rpcrdma_register_default_external(seg, &nsegs, writing, ia);
		break;
	}
	if (rc)
		return -1;

	return nsegs;
}

int
rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg,
		struct rpcrdma_xprt *r_xprt, void *r)
{
	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
	int nsegs = seg->mr_nsegs, rc;

	switch (ia->ri_memreg_strategy) {

#if RPCRDMA_PERSISTENT_REGISTRATION
	case RPCRDMA_ALLPHYSICAL:
		BUG_ON(nsegs != 1);
		rpcrdma_unmap_one(ia, seg);
		rc = 0;
		break;
#endif

	case RPCRDMA_FRMR:
		rc = rpcrdma_deregister_frmr_external(seg, ia, r_xprt);
		break;

	case RPCRDMA_MTHCAFMR:
		rc = rpcrdma_deregister_fmr_external(seg, ia);
		break;

	case RPCRDMA_MEMWINDOWS_ASYNC:
	case RPCRDMA_MEMWINDOWS:
		rc = rpcrdma_deregister_memwin_external(seg, ia, r_xprt, &r);
		break;

	default:
		rc = rpcrdma_deregister_default_external(seg, ia);
		break;
	}
	if (r) {
		struct rpcrdma_rep *rep = r;
		void (*func)(struct rpcrdma_rep *) = rep->rr_func;
		rep->rr_func = NULL;
		func(rep);	/* dereg done, callback now */
	}
	return nsegs;
}

/*
 * Prepost any receive buffer, then post send.
 *
 * Receive buffer is donated to hardware, reclaimed upon recv completion.
 */
int
rpcrdma_ep_post(struct rpcrdma_ia *ia,
		struct rpcrdma_ep *ep,
		struct rpcrdma_req *req)
{
	struct ib_send_wr send_wr, *send_wr_fail;
	struct rpcrdma_rep *rep = req->rl_reply;
	int rc;

	if (rep) {
		rc = rpcrdma_ep_post_recv(ia, ep, rep);
		if (rc)
			goto out;
		req->rl_reply = NULL;
	}

	send_wr.next = NULL;
	send_wr.wr_id = 0ULL;	/* no send cookie */
	send_wr.sg_list = req->rl_send_iov;
	send_wr.num_sge = req->rl_niovs;
	send_wr.opcode = IB_WR_SEND;
	if (send_wr.num_sge == 4)	/* no need to sync any pad (constant) */
		ib_dma_sync_single_for_device(ia->ri_id->device,
			req->rl_send_iov[3].addr, req->rl_send_iov[3].length,
			DMA_TO_DEVICE);
	ib_dma_sync_single_for_device(ia->ri_id->device,
		req->rl_send_iov[1].addr, req->rl_send_iov[1].length,
		DMA_TO_DEVICE);
	ib_dma_sync_single_for_device(ia->ri_id->device,
		req->rl_send_iov[0].addr, req->rl_send_iov[0].length,
		DMA_TO_DEVICE);

	if (DECR_CQCOUNT(ep) > 0)
		send_wr.send_flags = 0;
	else { /* Provider must take a send completion every now and then */
		INIT_CQCOUNT(ep);
		send_wr.send_flags = IB_SEND_SIGNALED;
	}

	rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail);
	if (rc)
		dprintk("RPC: %s: ib_post_send returned %i\n", __func__,
			rc);
out:
	return rc;
}

/*
 * (Re)post a receive buffer.
 */
int
rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
		     struct rpcrdma_ep *ep,
		     struct rpcrdma_rep *rep)
{
	struct ib_recv_wr recv_wr, *recv_wr_fail;
	int rc;

	recv_wr.next = NULL;
	recv_wr.wr_id = (u64) (unsigned long) rep;
	recv_wr.sg_list = &rep->rr_iov;
	recv_wr.num_sge = 1;

	ib_dma_sync_single_for_cpu(ia->ri_id->device,
		rep->rr_iov.addr, rep->rr_iov.length, DMA_BIDIRECTIONAL);

	DECR_CQCOUNT(ep);
	rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail);

	if (rc)
		dprintk("RPC: %s: ib_post_recv returned %i\n", __func__,
			rc);
	return rc;
}