1 /* 2 * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved. 3 * 4 * This software is available to you under a choice of one of two 5 * licenses. You may choose to be licensed under the terms of the GNU 6 * General Public License (GPL) Version 2, available from the file 7 * COPYING in the main directory of this source tree, or the BSD-type 8 * license below: 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 14 * Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 17 * Redistributions in binary form must reproduce the above 18 * copyright notice, this list of conditions and the following 19 * disclaimer in the documentation and/or other materials provided 20 * with the distribution. 21 * 22 * Neither the name of the Network Appliance, Inc. nor the names of 23 * its contributors may be used to endorse or promote products 24 * derived from this software without specific prior written 25 * permission. 26 * 27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 30 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 31 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 32 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 33 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 34 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 35 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 36 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 37 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
38 */ 39 40 /* 41 * verbs.c 42 * 43 * Encapsulates the major functions managing: 44 * o adapters 45 * o endpoints 46 * o connections 47 * o buffer memory 48 */ 49 50 #include <linux/pci.h> /* for Tavor hack below */ 51 #include <linux/slab.h> 52 53 #include "xprt_rdma.h" 54 55 /* 56 * Globals/Macros 57 */ 58 59 #ifdef RPC_DEBUG 60 # define RPCDBG_FACILITY RPCDBG_TRANS 61 #endif 62 63 /* 64 * internal functions 65 */ 66 67 /* 68 * handle replies in tasklet context, using a single, global list 69 * rdma tasklet function -- just turn around and call the func 70 * for all replies on the list 71 */ 72 73 static DEFINE_SPINLOCK(rpcrdma_tk_lock_g); 74 static LIST_HEAD(rpcrdma_tasklets_g); 75 76 static void 77 rpcrdma_run_tasklet(unsigned long data) 78 { 79 struct rpcrdma_rep *rep; 80 void (*func)(struct rpcrdma_rep *); 81 unsigned long flags; 82 83 data = data; 84 spin_lock_irqsave(&rpcrdma_tk_lock_g, flags); 85 while (!list_empty(&rpcrdma_tasklets_g)) { 86 rep = list_entry(rpcrdma_tasklets_g.next, 87 struct rpcrdma_rep, rr_list); 88 list_del(&rep->rr_list); 89 func = rep->rr_func; 90 rep->rr_func = NULL; 91 spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags); 92 93 if (func) 94 func(rep); 95 else 96 rpcrdma_recv_buffer_put(rep); 97 98 spin_lock_irqsave(&rpcrdma_tk_lock_g, flags); 99 } 100 spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags); 101 } 102 103 static DECLARE_TASKLET(rpcrdma_tasklet_g, rpcrdma_run_tasklet, 0UL); 104 105 static inline void 106 rpcrdma_schedule_tasklet(struct rpcrdma_rep *rep) 107 { 108 unsigned long flags; 109 110 spin_lock_irqsave(&rpcrdma_tk_lock_g, flags); 111 list_add_tail(&rep->rr_list, &rpcrdma_tasklets_g); 112 spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags); 113 tasklet_schedule(&rpcrdma_tasklet_g); 114 } 115 116 static void 117 rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context) 118 { 119 struct rpcrdma_ep *ep = context; 120 121 dprintk("RPC: %s: QP error %X on device %s ep %p\n", 122 __func__, event->event, 
event->device->name, context); 123 if (ep->rep_connected == 1) { 124 ep->rep_connected = -EIO; 125 ep->rep_func(ep); 126 wake_up_all(&ep->rep_connect_wait); 127 } 128 } 129 130 static void 131 rpcrdma_cq_async_error_upcall(struct ib_event *event, void *context) 132 { 133 struct rpcrdma_ep *ep = context; 134 135 dprintk("RPC: %s: CQ error %X on device %s ep %p\n", 136 __func__, event->event, event->device->name, context); 137 if (ep->rep_connected == 1) { 138 ep->rep_connected = -EIO; 139 ep->rep_func(ep); 140 wake_up_all(&ep->rep_connect_wait); 141 } 142 } 143 144 static inline 145 void rpcrdma_event_process(struct ib_wc *wc) 146 { 147 struct rpcrdma_rep *rep = 148 (struct rpcrdma_rep *)(unsigned long) wc->wr_id; 149 150 dprintk("RPC: %s: event rep %p status %X opcode %X length %u\n", 151 __func__, rep, wc->status, wc->opcode, wc->byte_len); 152 153 if (!rep) /* send or bind completion that we don't care about */ 154 return; 155 156 if (IB_WC_SUCCESS != wc->status) { 157 dprintk("RPC: %s: %s WC status %X, connection lost\n", 158 __func__, (wc->opcode & IB_WC_RECV) ? 
"recv" : "send", 159 wc->status); 160 rep->rr_len = ~0U; 161 rpcrdma_schedule_tasklet(rep); 162 return; 163 } 164 165 switch (wc->opcode) { 166 case IB_WC_RECV: 167 rep->rr_len = wc->byte_len; 168 ib_dma_sync_single_for_cpu( 169 rdmab_to_ia(rep->rr_buffer)->ri_id->device, 170 rep->rr_iov.addr, rep->rr_len, DMA_FROM_DEVICE); 171 /* Keep (only) the most recent credits, after check validity */ 172 if (rep->rr_len >= 16) { 173 struct rpcrdma_msg *p = 174 (struct rpcrdma_msg *) rep->rr_base; 175 unsigned int credits = ntohl(p->rm_credit); 176 if (credits == 0) { 177 dprintk("RPC: %s: server" 178 " dropped credits to 0!\n", __func__); 179 /* don't deadlock */ 180 credits = 1; 181 } else if (credits > rep->rr_buffer->rb_max_requests) { 182 dprintk("RPC: %s: server" 183 " over-crediting: %d (%d)\n", 184 __func__, credits, 185 rep->rr_buffer->rb_max_requests); 186 credits = rep->rr_buffer->rb_max_requests; 187 } 188 atomic_set(&rep->rr_buffer->rb_credits, credits); 189 } 190 /* fall through */ 191 case IB_WC_BIND_MW: 192 rpcrdma_schedule_tasklet(rep); 193 break; 194 default: 195 dprintk("RPC: %s: unexpected WC event %X\n", 196 __func__, wc->opcode); 197 break; 198 } 199 } 200 201 static inline int 202 rpcrdma_cq_poll(struct ib_cq *cq) 203 { 204 struct ib_wc wc; 205 int rc; 206 207 for (;;) { 208 rc = ib_poll_cq(cq, 1, &wc); 209 if (rc < 0) { 210 dprintk("RPC: %s: ib_poll_cq failed %i\n", 211 __func__, rc); 212 return rc; 213 } 214 if (rc == 0) 215 break; 216 217 rpcrdma_event_process(&wc); 218 } 219 220 return 0; 221 } 222 223 /* 224 * rpcrdma_cq_event_upcall 225 * 226 * This upcall handles recv, send, bind and unbind events. 227 * It is reentrant but processes single events in order to maintain 228 * ordering of receives to keep server credits. 229 * 230 * It is the responsibility of the scheduled tasklet to return 231 * recv buffers to the pool. NOTE: this affects synchronization of 232 * connection shutdown. 
That is, the structures required for
 * the completion of the reply handler must remain intact until
 * all memory has been reclaimed.
 *
 * Note that send events are suppressed and do not result in an upcall.
 */
static void
rpcrdma_cq_event_upcall(struct ib_cq *cq, void *context)
{
	int rc;

	rc = rpcrdma_cq_poll(cq);
	if (rc)
		return;

	rc = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
	if (rc) {
		dprintk("RPC: %s: ib_req_notify_cq failed %i\n",
			__func__, rc);
		return;
	}

	/* Poll again after re-arming to pick up anything that raced in */
	rpcrdma_cq_poll(cq);
}

#ifdef RPC_DEBUG
/* Debug names for CM events, indexed by event number */
static const char * const conn[] = {
	"address resolved",
	"address error",
	"route resolved",
	"route error",
	"connect request",
	"connect response",
	"connect error",
	"unreachable",
	"rejected",
	"established",
	"disconnected",
	"device removal"
};
#endif

/*
 * Connection manager event handler.
 *
 * Address/route resolution events record their outcome in
 * ia->ri_async_rc and complete ia->ri_done.  Connection state events
 * reset the credit count to 1, store the new state in
 * ep->rep_connected, call ep->rep_func and wake rep_connect_wait.
 *
 * Always returns 0.
 */
static int
rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
{
	struct rpcrdma_xprt *xprt = id->context;
	struct rpcrdma_ia *ia = &xprt->rx_ia;
	struct rpcrdma_ep *ep = &xprt->rx_ep;
#ifdef RPC_DEBUG
	struct sockaddr_in *addr = (struct sockaddr_in *) &ep->rep_remote_addr;
#endif
	struct ib_qp_attr attr;
	struct ib_qp_init_attr iattr;
	int connstate = 0;
	int rc;

	switch (event->event) {
	case RDMA_CM_EVENT_ADDR_RESOLVED:
	case RDMA_CM_EVENT_ROUTE_RESOLVED:
		ia->ri_async_rc = 0;
		complete(&ia->ri_done);
		break;
	case RDMA_CM_EVENT_ADDR_ERROR:
		ia->ri_async_rc = -EHOSTUNREACH;
		dprintk("RPC: %s: CM address resolution error, ep 0x%p\n",
			__func__, ep);
		complete(&ia->ri_done);
		break;
	case RDMA_CM_EVENT_ROUTE_ERROR:
		ia->ri_async_rc = -ENETUNREACH;
		dprintk("RPC: %s: CM route resolution error, ep 0x%p\n",
			__func__, ep);
		complete(&ia->ri_done);
		break;
	case RDMA_CM_EVENT_ESTABLISHED:
		connstate = 1;
		rc = ib_query_qp(ia->ri_id->qp, &attr,
			IB_QP_MAX_QP_RD_ATOMIC | IB_QP_MAX_DEST_RD_ATOMIC,
			&iattr);
		if (rc) {
			/*
			 * The values are only reported, never acted on;
			 * make sure we do not print stack garbage.
			 */
			dprintk("RPC: %s: ib_query_qp failed %i\n",
				__func__, rc);
			attr.max_dest_rd_atomic = 0;
			attr.max_rd_atomic = 0;
		}
		dprintk("RPC: %s: %d responder resources"
			" (%d initiator)\n",
			__func__, attr.max_dest_rd_atomic, attr.max_rd_atomic);
		goto connected;
	case RDMA_CM_EVENT_CONNECT_ERROR:
		connstate = -ENOTCONN;
		goto connected;
	case RDMA_CM_EVENT_UNREACHABLE:
		connstate = -ENETDOWN;
		goto connected;
	case RDMA_CM_EVENT_REJECTED:
		connstate = -ECONNREFUSED;
		goto connected;
	case RDMA_CM_EVENT_DISCONNECTED:
		connstate = -ECONNABORTED;
		goto connected;
	case RDMA_CM_EVENT_DEVICE_REMOVAL:
		connstate = -ENODEV;
connected:
		/* Bound the lookup by the table size, not a magic number */
		dprintk("RPC: %s: %s: %pI4:%u (ep 0x%p event 0x%x)\n",
			__func__,
			((unsigned int)event->event <
			 sizeof(conn) / sizeof(conn[0])) ?
				conn[event->event] :
				"unknown connection error",
			&addr->sin_addr.s_addr,
			ntohs(addr->sin_port),
			ep, event->event);
		atomic_set(&rpcx_to_rdmax(ep->rep_xprt)->rx_buf.rb_credits, 1);
		dprintk("RPC: %s: %sconnected\n",
			__func__, connstate > 0 ? "" : "dis");
		ep->rep_connected = connstate;
		ep->rep_func(ep);
		wake_up_all(&ep->rep_connect_wait);
		break;
	default:
		dprintk("RPC: %s: unexpected CM event %d\n",
			__func__, event->event);
		break;
	}

#ifdef RPC_DEBUG
	if (connstate == 1) {
		int ird = attr.max_dest_rd_atomic;
		int tird = ep->rep_remote_cma.responder_resources;
		printk(KERN_INFO "rpcrdma: connection to %pI4:%u "
			"on %s, memreg %d slots %d ird %d%s\n",
			&addr->sin_addr.s_addr,
			ntohs(addr->sin_port),
			ia->ri_id->device->name,
			ia->ri_memreg_strategy,
			xprt->rx_buf.rb_max_requests,
			ird, ird < 4 && ird < tird / 2 ? " (low!)" : "");
	} else if (connstate < 0) {
		printk(KERN_INFO "rpcrdma: connection to %pI4:%u closed (%d)\n",
			&addr->sin_addr.s_addr,
			ntohs(addr->sin_port),
			connstate);
	}
#endif

	return 0;
}

/*
 * Create a CM id bound to @xprt and resolve @addr and its route.
 *
 * Each resolution step presets ia->ri_async_rc to -ETIMEDOUT and then
 * waits on ia->ri_done, so a wait that ends without the upcall having
 * run is reported as a timeout.
 *
 * Returns the new id, or an ERR_PTR() on failure (the id is destroyed).
 */
static struct rdma_cm_id *
rpcrdma_create_id(struct rpcrdma_xprt *xprt,
			struct rpcrdma_ia *ia, struct sockaddr *addr)
{
	struct rdma_cm_id *id;
	int rc;

	init_completion(&ia->ri_done);

	id = rdma_create_id(rpcrdma_conn_upcall, xprt, RDMA_PS_TCP);
	if (IS_ERR(id)) {
		rc = PTR_ERR(id);
		dprintk("RPC: %s: rdma_create_id() failed %i\n",
			__func__, rc);
		return id;
	}

	ia->ri_async_rc = -ETIMEDOUT;
	rc = rdma_resolve_addr(id, NULL, addr, RDMA_RESOLVE_TIMEOUT);
	if (rc) {
		dprintk("RPC: %s: rdma_resolve_addr() failed %i\n",
			__func__, rc);
		goto out;
	}
	wait_for_completion_interruptible_timeout(&ia->ri_done,
				msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
	rc = ia->ri_async_rc;
	if (rc)
		goto out;

	ia->ri_async_rc = -ETIMEDOUT;
	rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT);
	if (rc) {
		dprintk("RPC: %s: rdma_resolve_route() failed %i\n",
			__func__, rc);
		goto out;
	}
	wait_for_completion_interruptible_timeout(&ia->ri_done,
				msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
	rc = ia->ri_async_rc;
	if (rc)
		goto out;

	return id;

out:
	rdma_destroy_id(id);
	return ERR_PTR(rc);
}

/*
 * Drain any cq, prior to teardown.
 */
static void
rpcrdma_clean_cq(struct ib_cq *cq)
{
	struct ib_wc wc;
	int count = 0;

	while (1 == ib_poll_cq(cq, 1, &wc))
		++count;

	if (count)
		dprintk("RPC: %s: flushed %d events (last 0x%x)\n",
			__func__, count, wc.opcode);
}

/*
 * Exported functions.
 */

/*
 * Open and initialize an Interface Adapter.
445 * o initializes fields of struct rpcrdma_ia, including 446 * interface and provider attributes and protection zone. 447 */ 448 int 449 rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) 450 { 451 int rc, mem_priv; 452 struct ib_device_attr devattr; 453 struct rpcrdma_ia *ia = &xprt->rx_ia; 454 455 ia->ri_id = rpcrdma_create_id(xprt, ia, addr); 456 if (IS_ERR(ia->ri_id)) { 457 rc = PTR_ERR(ia->ri_id); 458 goto out1; 459 } 460 461 ia->ri_pd = ib_alloc_pd(ia->ri_id->device); 462 if (IS_ERR(ia->ri_pd)) { 463 rc = PTR_ERR(ia->ri_pd); 464 dprintk("RPC: %s: ib_alloc_pd() failed %i\n", 465 __func__, rc); 466 goto out2; 467 } 468 469 /* 470 * Query the device to determine if the requested memory 471 * registration strategy is supported. If it isn't, set the 472 * strategy to a globally supported model. 473 */ 474 rc = ib_query_device(ia->ri_id->device, &devattr); 475 if (rc) { 476 dprintk("RPC: %s: ib_query_device failed %d\n", 477 __func__, rc); 478 goto out2; 479 } 480 481 if (devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) { 482 ia->ri_have_dma_lkey = 1; 483 ia->ri_dma_lkey = ia->ri_id->device->local_dma_lkey; 484 } 485 486 switch (memreg) { 487 case RPCRDMA_MEMWINDOWS: 488 case RPCRDMA_MEMWINDOWS_ASYNC: 489 if (!(devattr.device_cap_flags & IB_DEVICE_MEM_WINDOW)) { 490 dprintk("RPC: %s: MEMWINDOWS registration " 491 "specified but not supported by adapter, " 492 "using slower RPCRDMA_REGISTER\n", 493 __func__); 494 memreg = RPCRDMA_REGISTER; 495 } 496 break; 497 case RPCRDMA_MTHCAFMR: 498 if (!ia->ri_id->device->alloc_fmr) { 499 #if RPCRDMA_PERSISTENT_REGISTRATION 500 dprintk("RPC: %s: MTHCAFMR registration " 501 "specified but not supported by adapter, " 502 "using riskier RPCRDMA_ALLPHYSICAL\n", 503 __func__); 504 memreg = RPCRDMA_ALLPHYSICAL; 505 #else 506 dprintk("RPC: %s: MTHCAFMR registration " 507 "specified but not supported by adapter, " 508 "using slower RPCRDMA_REGISTER\n", 509 __func__); 510 memreg = RPCRDMA_REGISTER; 511 
#endif 512 } 513 break; 514 case RPCRDMA_FRMR: 515 /* Requires both frmr reg and local dma lkey */ 516 if ((devattr.device_cap_flags & 517 (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) != 518 (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) { 519 #if RPCRDMA_PERSISTENT_REGISTRATION 520 dprintk("RPC: %s: FRMR registration " 521 "specified but not supported by adapter, " 522 "using riskier RPCRDMA_ALLPHYSICAL\n", 523 __func__); 524 memreg = RPCRDMA_ALLPHYSICAL; 525 #else 526 dprintk("RPC: %s: FRMR registration " 527 "specified but not supported by adapter, " 528 "using slower RPCRDMA_REGISTER\n", 529 __func__); 530 memreg = RPCRDMA_REGISTER; 531 #endif 532 } 533 break; 534 } 535 536 /* 537 * Optionally obtain an underlying physical identity mapping in 538 * order to do a memory window-based bind. This base registration 539 * is protected from remote access - that is enabled only by binding 540 * for the specific bytes targeted during each RPC operation, and 541 * revoked after the corresponding completion similar to a storage 542 * adapter. 
543 */ 544 switch (memreg) { 545 case RPCRDMA_BOUNCEBUFFERS: 546 case RPCRDMA_REGISTER: 547 case RPCRDMA_FRMR: 548 break; 549 #if RPCRDMA_PERSISTENT_REGISTRATION 550 case RPCRDMA_ALLPHYSICAL: 551 mem_priv = IB_ACCESS_LOCAL_WRITE | 552 IB_ACCESS_REMOTE_WRITE | 553 IB_ACCESS_REMOTE_READ; 554 goto register_setup; 555 #endif 556 case RPCRDMA_MEMWINDOWS_ASYNC: 557 case RPCRDMA_MEMWINDOWS: 558 mem_priv = IB_ACCESS_LOCAL_WRITE | 559 IB_ACCESS_MW_BIND; 560 goto register_setup; 561 case RPCRDMA_MTHCAFMR: 562 if (ia->ri_have_dma_lkey) 563 break; 564 mem_priv = IB_ACCESS_LOCAL_WRITE; 565 register_setup: 566 ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv); 567 if (IS_ERR(ia->ri_bind_mem)) { 568 printk(KERN_ALERT "%s: ib_get_dma_mr for " 569 "phys register failed with %lX\n\t" 570 "Will continue with degraded performance\n", 571 __func__, PTR_ERR(ia->ri_bind_mem)); 572 memreg = RPCRDMA_REGISTER; 573 ia->ri_bind_mem = NULL; 574 } 575 break; 576 default: 577 printk(KERN_ERR "%s: invalid memory registration mode %d\n", 578 __func__, memreg); 579 rc = -EINVAL; 580 goto out2; 581 } 582 dprintk("RPC: %s: memory registration strategy is %d\n", 583 __func__, memreg); 584 585 /* Else will do memory reg/dereg for each chunk */ 586 ia->ri_memreg_strategy = memreg; 587 588 return 0; 589 out2: 590 rdma_destroy_id(ia->ri_id); 591 ia->ri_id = NULL; 592 out1: 593 return rc; 594 } 595 596 /* 597 * Clean up/close an IA. 598 * o if event handles and PD have been initialized, free them. 
599 * o close the IA 600 */ 601 void 602 rpcrdma_ia_close(struct rpcrdma_ia *ia) 603 { 604 int rc; 605 606 dprintk("RPC: %s: entering\n", __func__); 607 if (ia->ri_bind_mem != NULL) { 608 rc = ib_dereg_mr(ia->ri_bind_mem); 609 dprintk("RPC: %s: ib_dereg_mr returned %i\n", 610 __func__, rc); 611 } 612 if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) { 613 if (ia->ri_id->qp) 614 rdma_destroy_qp(ia->ri_id); 615 rdma_destroy_id(ia->ri_id); 616 ia->ri_id = NULL; 617 } 618 if (ia->ri_pd != NULL && !IS_ERR(ia->ri_pd)) { 619 rc = ib_dealloc_pd(ia->ri_pd); 620 dprintk("RPC: %s: ib_dealloc_pd returned %i\n", 621 __func__, rc); 622 } 623 } 624 625 /* 626 * Create unconnected endpoint. 627 */ 628 int 629 rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, 630 struct rpcrdma_create_data_internal *cdata) 631 { 632 struct ib_device_attr devattr; 633 int rc, err; 634 635 rc = ib_query_device(ia->ri_id->device, &devattr); 636 if (rc) { 637 dprintk("RPC: %s: ib_query_device failed %d\n", 638 __func__, rc); 639 return rc; 640 } 641 642 /* check provider's send/recv wr limits */ 643 if (cdata->max_requests > devattr.max_qp_wr) 644 cdata->max_requests = devattr.max_qp_wr; 645 646 ep->rep_attr.event_handler = rpcrdma_qp_async_error_upcall; 647 ep->rep_attr.qp_context = ep; 648 /* send_cq and recv_cq initialized below */ 649 ep->rep_attr.srq = NULL; 650 ep->rep_attr.cap.max_send_wr = cdata->max_requests; 651 switch (ia->ri_memreg_strategy) { 652 case RPCRDMA_FRMR: 653 /* Add room for frmr register and invalidate WRs. 654 * 1. FRMR reg WR for head 655 * 2. FRMR invalidate WR for head 656 * 3. FRMR reg WR for pagelist 657 * 4. FRMR invalidate WR for pagelist 658 * 5. FRMR reg WR for tail 659 * 6. FRMR invalidate WR for tail 660 * 7. 
The RDMA_SEND WR 661 */ 662 ep->rep_attr.cap.max_send_wr *= 7; 663 if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr) { 664 cdata->max_requests = devattr.max_qp_wr / 7; 665 if (!cdata->max_requests) 666 return -EINVAL; 667 ep->rep_attr.cap.max_send_wr = cdata->max_requests * 7; 668 } 669 break; 670 case RPCRDMA_MEMWINDOWS_ASYNC: 671 case RPCRDMA_MEMWINDOWS: 672 /* Add room for mw_binds+unbinds - overkill! */ 673 ep->rep_attr.cap.max_send_wr++; 674 ep->rep_attr.cap.max_send_wr *= (2 * RPCRDMA_MAX_SEGS); 675 if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr) 676 return -EINVAL; 677 break; 678 default: 679 break; 680 } 681 ep->rep_attr.cap.max_recv_wr = cdata->max_requests; 682 ep->rep_attr.cap.max_send_sge = (cdata->padding ? 4 : 2); 683 ep->rep_attr.cap.max_recv_sge = 1; 684 ep->rep_attr.cap.max_inline_data = 0; 685 ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR; 686 ep->rep_attr.qp_type = IB_QPT_RC; 687 ep->rep_attr.port_num = ~0; 688 689 dprintk("RPC: %s: requested max: dtos: send %d recv %d; " 690 "iovs: send %d recv %d\n", 691 __func__, 692 ep->rep_attr.cap.max_send_wr, 693 ep->rep_attr.cap.max_recv_wr, 694 ep->rep_attr.cap.max_send_sge, 695 ep->rep_attr.cap.max_recv_sge); 696 697 /* set trigger for requesting send completion */ 698 ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 /* - 1*/; 699 switch (ia->ri_memreg_strategy) { 700 case RPCRDMA_MEMWINDOWS_ASYNC: 701 case RPCRDMA_MEMWINDOWS: 702 ep->rep_cqinit -= RPCRDMA_MAX_SEGS; 703 break; 704 default: 705 break; 706 } 707 if (ep->rep_cqinit <= 2) 708 ep->rep_cqinit = 0; 709 INIT_CQCOUNT(ep); 710 ep->rep_ia = ia; 711 init_waitqueue_head(&ep->rep_connect_wait); 712 713 /* 714 * Create a single cq for receive dto and mw_bind (only ever 715 * care about unbind, really). Send completions are suppressed. 716 * Use single threaded tasklet upcalls to maintain ordering. 
717 */ 718 ep->rep_cq = ib_create_cq(ia->ri_id->device, rpcrdma_cq_event_upcall, 719 rpcrdma_cq_async_error_upcall, NULL, 720 ep->rep_attr.cap.max_recv_wr + 721 ep->rep_attr.cap.max_send_wr + 1, 0); 722 if (IS_ERR(ep->rep_cq)) { 723 rc = PTR_ERR(ep->rep_cq); 724 dprintk("RPC: %s: ib_create_cq failed: %i\n", 725 __func__, rc); 726 goto out1; 727 } 728 729 rc = ib_req_notify_cq(ep->rep_cq, IB_CQ_NEXT_COMP); 730 if (rc) { 731 dprintk("RPC: %s: ib_req_notify_cq failed: %i\n", 732 __func__, rc); 733 goto out2; 734 } 735 736 ep->rep_attr.send_cq = ep->rep_cq; 737 ep->rep_attr.recv_cq = ep->rep_cq; 738 739 /* Initialize cma parameters */ 740 741 /* RPC/RDMA does not use private data */ 742 ep->rep_remote_cma.private_data = NULL; 743 ep->rep_remote_cma.private_data_len = 0; 744 745 /* Client offers RDMA Read but does not initiate */ 746 ep->rep_remote_cma.initiator_depth = 0; 747 if (ia->ri_memreg_strategy == RPCRDMA_BOUNCEBUFFERS) 748 ep->rep_remote_cma.responder_resources = 0; 749 else if (devattr.max_qp_rd_atom > 32) /* arbitrary but <= 255 */ 750 ep->rep_remote_cma.responder_resources = 32; 751 else 752 ep->rep_remote_cma.responder_resources = devattr.max_qp_rd_atom; 753 754 ep->rep_remote_cma.retry_count = 7; 755 ep->rep_remote_cma.flow_control = 0; 756 ep->rep_remote_cma.rnr_retry_count = 0; 757 758 return 0; 759 760 out2: 761 err = ib_destroy_cq(ep->rep_cq); 762 if (err) 763 dprintk("RPC: %s: ib_destroy_cq returned %i\n", 764 __func__, err); 765 out1: 766 return rc; 767 } 768 769 /* 770 * rpcrdma_ep_destroy 771 * 772 * Disconnect and destroy endpoint. After this, the only 773 * valid operations on the ep are to free it (if dynamically 774 * allocated) or re-create it. 775 * 776 * The caller's error handling must be sure to not leak the endpoint 777 * if this function fails. 
778 */ 779 int 780 rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) 781 { 782 int rc; 783 784 dprintk("RPC: %s: entering, connected is %d\n", 785 __func__, ep->rep_connected); 786 787 if (ia->ri_id->qp) { 788 rc = rpcrdma_ep_disconnect(ep, ia); 789 if (rc) 790 dprintk("RPC: %s: rpcrdma_ep_disconnect" 791 " returned %i\n", __func__, rc); 792 rdma_destroy_qp(ia->ri_id); 793 ia->ri_id->qp = NULL; 794 } 795 796 /* padding - could be done in rpcrdma_buffer_destroy... */ 797 if (ep->rep_pad_mr) { 798 rpcrdma_deregister_internal(ia, ep->rep_pad_mr, &ep->rep_pad); 799 ep->rep_pad_mr = NULL; 800 } 801 802 rpcrdma_clean_cq(ep->rep_cq); 803 rc = ib_destroy_cq(ep->rep_cq); 804 if (rc) 805 dprintk("RPC: %s: ib_destroy_cq returned %i\n", 806 __func__, rc); 807 808 return rc; 809 } 810 811 /* 812 * Connect unconnected endpoint. 813 */ 814 int 815 rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) 816 { 817 struct rdma_cm_id *id; 818 int rc = 0; 819 int retry_count = 0; 820 821 if (ep->rep_connected != 0) { 822 struct rpcrdma_xprt *xprt; 823 retry: 824 rc = rpcrdma_ep_disconnect(ep, ia); 825 if (rc && rc != -ENOTCONN) 826 dprintk("RPC: %s: rpcrdma_ep_disconnect" 827 " status %i\n", __func__, rc); 828 rpcrdma_clean_cq(ep->rep_cq); 829 830 xprt = container_of(ia, struct rpcrdma_xprt, rx_ia); 831 id = rpcrdma_create_id(xprt, ia, 832 (struct sockaddr *)&xprt->rx_data.addr); 833 if (IS_ERR(id)) { 834 rc = PTR_ERR(id); 835 goto out; 836 } 837 /* TEMP TEMP TEMP - fail if new device: 838 * Deregister/remarshal *all* requests! 839 * Close and recreate adapter, pd, etc! 840 * Re-determine all attributes still sane! 841 * More stuff I haven't thought of! 842 * Rrrgh! 
843 */ 844 if (ia->ri_id->device != id->device) { 845 printk("RPC: %s: can't reconnect on " 846 "different device!\n", __func__); 847 rdma_destroy_id(id); 848 rc = -ENETDOWN; 849 goto out; 850 } 851 /* END TEMP */ 852 rdma_destroy_qp(ia->ri_id); 853 rdma_destroy_id(ia->ri_id); 854 ia->ri_id = id; 855 } 856 857 rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr); 858 if (rc) { 859 dprintk("RPC: %s: rdma_create_qp failed %i\n", 860 __func__, rc); 861 goto out; 862 } 863 864 /* XXX Tavor device performs badly with 2K MTU! */ 865 if (strnicmp(ia->ri_id->device->dma_device->bus->name, "pci", 3) == 0) { 866 struct pci_dev *pcid = to_pci_dev(ia->ri_id->device->dma_device); 867 if (pcid->device == PCI_DEVICE_ID_MELLANOX_TAVOR && 868 (pcid->vendor == PCI_VENDOR_ID_MELLANOX || 869 pcid->vendor == PCI_VENDOR_ID_TOPSPIN)) { 870 struct ib_qp_attr attr = { 871 .path_mtu = IB_MTU_1024 872 }; 873 rc = ib_modify_qp(ia->ri_id->qp, &attr, IB_QP_PATH_MTU); 874 } 875 } 876 877 ep->rep_connected = 0; 878 879 rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma); 880 if (rc) { 881 dprintk("RPC: %s: rdma_connect() failed with %i\n", 882 __func__, rc); 883 goto out; 884 } 885 886 wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0); 887 888 /* 889 * Check state. A non-peer reject indicates no listener 890 * (ECONNREFUSED), which may be a transient state. All 891 * others indicate a transport condition which has already 892 * undergone a best-effort. 893 */ 894 if (ep->rep_connected == -ECONNREFUSED && 895 ++retry_count <= RDMA_CONNECT_RETRY_MAX) { 896 dprintk("RPC: %s: non-peer_reject, retry\n", __func__); 897 goto retry; 898 } 899 if (ep->rep_connected <= 0) { 900 /* Sometimes, the only way to reliably connect to remote 901 * CMs is to use same nonzero values for ORD and IRD. 
*/ 902 if (retry_count++ <= RDMA_CONNECT_RETRY_MAX + 1 && 903 (ep->rep_remote_cma.responder_resources == 0 || 904 ep->rep_remote_cma.initiator_depth != 905 ep->rep_remote_cma.responder_resources)) { 906 if (ep->rep_remote_cma.responder_resources == 0) 907 ep->rep_remote_cma.responder_resources = 1; 908 ep->rep_remote_cma.initiator_depth = 909 ep->rep_remote_cma.responder_resources; 910 goto retry; 911 } 912 rc = ep->rep_connected; 913 } else { 914 dprintk("RPC: %s: connected\n", __func__); 915 } 916 917 out: 918 if (rc) 919 ep->rep_connected = rc; 920 return rc; 921 } 922 923 /* 924 * rpcrdma_ep_disconnect 925 * 926 * This is separate from destroy to facilitate the ability 927 * to reconnect without recreating the endpoint. 928 * 929 * This call is not reentrant, and must not be made in parallel 930 * on the same endpoint. 931 */ 932 int 933 rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) 934 { 935 int rc; 936 937 rpcrdma_clean_cq(ep->rep_cq); 938 rc = rdma_disconnect(ia->ri_id); 939 if (!rc) { 940 /* returns without wait if not connected */ 941 wait_event_interruptible(ep->rep_connect_wait, 942 ep->rep_connected != 1); 943 dprintk("RPC: %s: after wait, %sconnected\n", __func__, 944 (ep->rep_connected == 1) ? "still " : "dis"); 945 } else { 946 dprintk("RPC: %s: rdma_disconnect %i\n", __func__, rc); 947 ep->rep_connected = rc; 948 } 949 return rc; 950 } 951 952 /* 953 * Initialize buffer memory 954 */ 955 int 956 rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep, 957 struct rpcrdma_ia *ia, struct rpcrdma_create_data_internal *cdata) 958 { 959 char *p; 960 size_t len; 961 int i, rc; 962 struct rpcrdma_mw *r; 963 964 buf->rb_max_requests = cdata->max_requests; 965 spin_lock_init(&buf->rb_lock); 966 atomic_set(&buf->rb_credits, 1); 967 968 /* Need to allocate: 969 * 1. arrays for send and recv pointers 970 * 2. arrays of struct rpcrdma_req to fill in pointers 971 * 3. array of struct rpcrdma_rep for replies 972 * 4. 
padding, if any 973 * 5. mw's, fmr's or frmr's, if any 974 * Send/recv buffers in req/rep need to be registered 975 */ 976 977 len = buf->rb_max_requests * 978 (sizeof(struct rpcrdma_req *) + sizeof(struct rpcrdma_rep *)); 979 len += cdata->padding; 980 switch (ia->ri_memreg_strategy) { 981 case RPCRDMA_FRMR: 982 len += buf->rb_max_requests * RPCRDMA_MAX_SEGS * 983 sizeof(struct rpcrdma_mw); 984 break; 985 case RPCRDMA_MTHCAFMR: 986 /* TBD we are perhaps overallocating here */ 987 len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS * 988 sizeof(struct rpcrdma_mw); 989 break; 990 case RPCRDMA_MEMWINDOWS_ASYNC: 991 case RPCRDMA_MEMWINDOWS: 992 len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS * 993 sizeof(struct rpcrdma_mw); 994 break; 995 default: 996 break; 997 } 998 999 /* allocate 1, 4 and 5 in one shot */ 1000 p = kzalloc(len, GFP_KERNEL); 1001 if (p == NULL) { 1002 dprintk("RPC: %s: req_t/rep_t/pad kzalloc(%zd) failed\n", 1003 __func__, len); 1004 rc = -ENOMEM; 1005 goto out; 1006 } 1007 buf->rb_pool = p; /* for freeing it later */ 1008 1009 buf->rb_send_bufs = (struct rpcrdma_req **) p; 1010 p = (char *) &buf->rb_send_bufs[buf->rb_max_requests]; 1011 buf->rb_recv_bufs = (struct rpcrdma_rep **) p; 1012 p = (char *) &buf->rb_recv_bufs[buf->rb_max_requests]; 1013 1014 /* 1015 * Register the zeroed pad buffer, if any. 1016 */ 1017 if (cdata->padding) { 1018 rc = rpcrdma_register_internal(ia, p, cdata->padding, 1019 &ep->rep_pad_mr, &ep->rep_pad); 1020 if (rc) 1021 goto out; 1022 } 1023 p += cdata->padding; 1024 1025 /* 1026 * Allocate the fmr's, or mw's for mw_bind chunk registration. 1027 * We "cycle" the mw's in order to minimize rkey reuse, 1028 * and also reduce unbind-to-bind collision. 
1029 */ 1030 INIT_LIST_HEAD(&buf->rb_mws); 1031 r = (struct rpcrdma_mw *)p; 1032 switch (ia->ri_memreg_strategy) { 1033 case RPCRDMA_FRMR: 1034 for (i = buf->rb_max_requests * RPCRDMA_MAX_SEGS; i; i--) { 1035 r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd, 1036 RPCRDMA_MAX_SEGS); 1037 if (IS_ERR(r->r.frmr.fr_mr)) { 1038 rc = PTR_ERR(r->r.frmr.fr_mr); 1039 dprintk("RPC: %s: ib_alloc_fast_reg_mr" 1040 " failed %i\n", __func__, rc); 1041 goto out; 1042 } 1043 r->r.frmr.fr_pgl = 1044 ib_alloc_fast_reg_page_list(ia->ri_id->device, 1045 RPCRDMA_MAX_SEGS); 1046 if (IS_ERR(r->r.frmr.fr_pgl)) { 1047 rc = PTR_ERR(r->r.frmr.fr_pgl); 1048 dprintk("RPC: %s: " 1049 "ib_alloc_fast_reg_page_list " 1050 "failed %i\n", __func__, rc); 1051 goto out; 1052 } 1053 list_add(&r->mw_list, &buf->rb_mws); 1054 ++r; 1055 } 1056 break; 1057 case RPCRDMA_MTHCAFMR: 1058 /* TBD we are perhaps overallocating here */ 1059 for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) { 1060 static struct ib_fmr_attr fa = 1061 { RPCRDMA_MAX_DATA_SEGS, 1, PAGE_SHIFT }; 1062 r->r.fmr = ib_alloc_fmr(ia->ri_pd, 1063 IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ, 1064 &fa); 1065 if (IS_ERR(r->r.fmr)) { 1066 rc = PTR_ERR(r->r.fmr); 1067 dprintk("RPC: %s: ib_alloc_fmr" 1068 " failed %i\n", __func__, rc); 1069 goto out; 1070 } 1071 list_add(&r->mw_list, &buf->rb_mws); 1072 ++r; 1073 } 1074 break; 1075 case RPCRDMA_MEMWINDOWS_ASYNC: 1076 case RPCRDMA_MEMWINDOWS: 1077 /* Allocate one extra request's worth, for full cycling */ 1078 for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) { 1079 r->r.mw = ib_alloc_mw(ia->ri_pd); 1080 if (IS_ERR(r->r.mw)) { 1081 rc = PTR_ERR(r->r.mw); 1082 dprintk("RPC: %s: ib_alloc_mw" 1083 " failed %i\n", __func__, rc); 1084 goto out; 1085 } 1086 list_add(&r->mw_list, &buf->rb_mws); 1087 ++r; 1088 } 1089 break; 1090 default: 1091 break; 1092 } 1093 1094 /* 1095 * Allocate/init the request/reply buffers. Doing this 1096 * using kmalloc for now -- one for each buf. 
1097 */ 1098 for (i = 0; i < buf->rb_max_requests; i++) { 1099 struct rpcrdma_req *req; 1100 struct rpcrdma_rep *rep; 1101 1102 len = cdata->inline_wsize + sizeof(struct rpcrdma_req); 1103 /* RPC layer requests *double* size + 1K RPC_SLACK_SPACE! */ 1104 /* Typical ~2400b, so rounding up saves work later */ 1105 if (len < 4096) 1106 len = 4096; 1107 req = kmalloc(len, GFP_KERNEL); 1108 if (req == NULL) { 1109 dprintk("RPC: %s: request buffer %d alloc" 1110 " failed\n", __func__, i); 1111 rc = -ENOMEM; 1112 goto out; 1113 } 1114 memset(req, 0, sizeof(struct rpcrdma_req)); 1115 buf->rb_send_bufs[i] = req; 1116 buf->rb_send_bufs[i]->rl_buffer = buf; 1117 1118 rc = rpcrdma_register_internal(ia, req->rl_base, 1119 len - offsetof(struct rpcrdma_req, rl_base), 1120 &buf->rb_send_bufs[i]->rl_handle, 1121 &buf->rb_send_bufs[i]->rl_iov); 1122 if (rc) 1123 goto out; 1124 1125 buf->rb_send_bufs[i]->rl_size = len-sizeof(struct rpcrdma_req); 1126 1127 len = cdata->inline_rsize + sizeof(struct rpcrdma_rep); 1128 rep = kmalloc(len, GFP_KERNEL); 1129 if (rep == NULL) { 1130 dprintk("RPC: %s: reply buffer %d alloc failed\n", 1131 __func__, i); 1132 rc = -ENOMEM; 1133 goto out; 1134 } 1135 memset(rep, 0, sizeof(struct rpcrdma_rep)); 1136 buf->rb_recv_bufs[i] = rep; 1137 buf->rb_recv_bufs[i]->rr_buffer = buf; 1138 init_waitqueue_head(&rep->rr_unbind); 1139 1140 rc = rpcrdma_register_internal(ia, rep->rr_base, 1141 len - offsetof(struct rpcrdma_rep, rr_base), 1142 &buf->rb_recv_bufs[i]->rr_handle, 1143 &buf->rb_recv_bufs[i]->rr_iov); 1144 if (rc) 1145 goto out; 1146 1147 } 1148 dprintk("RPC: %s: max_requests %d\n", 1149 __func__, buf->rb_max_requests); 1150 /* done */ 1151 return 0; 1152 out: 1153 rpcrdma_buffer_destroy(buf); 1154 return rc; 1155 } 1156 1157 /* 1158 * Unregister and destroy buffer memory. Need to deal with 1159 * partial initialization, so it's callable from failed create. 1160 * Must be called before destroying endpoint, as registrations 1161 * reference it. 
1162 */ 1163 void 1164 rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) 1165 { 1166 int rc, i; 1167 struct rpcrdma_ia *ia = rdmab_to_ia(buf); 1168 struct rpcrdma_mw *r; 1169 1170 /* clean up in reverse order from create 1171 * 1. recv mr memory (mr free, then kfree) 1172 * 1a. bind mw memory 1173 * 2. send mr memory (mr free, then kfree) 1174 * 3. padding (if any) [moved to rpcrdma_ep_destroy] 1175 * 4. arrays 1176 */ 1177 dprintk("RPC: %s: entering\n", __func__); 1178 1179 for (i = 0; i < buf->rb_max_requests; i++) { 1180 if (buf->rb_recv_bufs && buf->rb_recv_bufs[i]) { 1181 rpcrdma_deregister_internal(ia, 1182 buf->rb_recv_bufs[i]->rr_handle, 1183 &buf->rb_recv_bufs[i]->rr_iov); 1184 kfree(buf->rb_recv_bufs[i]); 1185 } 1186 if (buf->rb_send_bufs && buf->rb_send_bufs[i]) { 1187 while (!list_empty(&buf->rb_mws)) { 1188 r = list_entry(buf->rb_mws.next, 1189 struct rpcrdma_mw, mw_list); 1190 list_del(&r->mw_list); 1191 switch (ia->ri_memreg_strategy) { 1192 case RPCRDMA_FRMR: 1193 rc = ib_dereg_mr(r->r.frmr.fr_mr); 1194 if (rc) 1195 dprintk("RPC: %s:" 1196 " ib_dereg_mr" 1197 " failed %i\n", 1198 __func__, rc); 1199 ib_free_fast_reg_page_list(r->r.frmr.fr_pgl); 1200 break; 1201 case RPCRDMA_MTHCAFMR: 1202 rc = ib_dealloc_fmr(r->r.fmr); 1203 if (rc) 1204 dprintk("RPC: %s:" 1205 " ib_dealloc_fmr" 1206 " failed %i\n", 1207 __func__, rc); 1208 break; 1209 case RPCRDMA_MEMWINDOWS_ASYNC: 1210 case RPCRDMA_MEMWINDOWS: 1211 rc = ib_dealloc_mw(r->r.mw); 1212 if (rc) 1213 dprintk("RPC: %s:" 1214 " ib_dealloc_mw" 1215 " failed %i\n", 1216 __func__, rc); 1217 break; 1218 default: 1219 break; 1220 } 1221 } 1222 rpcrdma_deregister_internal(ia, 1223 buf->rb_send_bufs[i]->rl_handle, 1224 &buf->rb_send_bufs[i]->rl_iov); 1225 kfree(buf->rb_send_bufs[i]); 1226 } 1227 } 1228 1229 kfree(buf->rb_pool); 1230 } 1231 1232 /* 1233 * Get a set of request/reply buffers. 1234 * 1235 * Reply buffer (if needed) is attached to send buffer upon return. 
1236 * Rule: 1237 * rb_send_index and rb_recv_index MUST always be pointing to the 1238 * *next* available buffer (non-NULL). They are incremented after 1239 * removing buffers, and decremented *before* returning them. 1240 */ 1241 struct rpcrdma_req * 1242 rpcrdma_buffer_get(struct rpcrdma_buffer *buffers) 1243 { 1244 struct rpcrdma_req *req; 1245 unsigned long flags; 1246 int i; 1247 struct rpcrdma_mw *r; 1248 1249 spin_lock_irqsave(&buffers->rb_lock, flags); 1250 if (buffers->rb_send_index == buffers->rb_max_requests) { 1251 spin_unlock_irqrestore(&buffers->rb_lock, flags); 1252 dprintk("RPC: %s: out of request buffers\n", __func__); 1253 return ((struct rpcrdma_req *)NULL); 1254 } 1255 1256 req = buffers->rb_send_bufs[buffers->rb_send_index]; 1257 if (buffers->rb_send_index < buffers->rb_recv_index) { 1258 dprintk("RPC: %s: %d extra receives outstanding (ok)\n", 1259 __func__, 1260 buffers->rb_recv_index - buffers->rb_send_index); 1261 req->rl_reply = NULL; 1262 } else { 1263 req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index]; 1264 buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL; 1265 } 1266 buffers->rb_send_bufs[buffers->rb_send_index++] = NULL; 1267 if (!list_empty(&buffers->rb_mws)) { 1268 i = RPCRDMA_MAX_SEGS - 1; 1269 do { 1270 r = list_entry(buffers->rb_mws.next, 1271 struct rpcrdma_mw, mw_list); 1272 list_del(&r->mw_list); 1273 req->rl_segments[i].mr_chunk.rl_mw = r; 1274 } while (--i >= 0); 1275 } 1276 spin_unlock_irqrestore(&buffers->rb_lock, flags); 1277 return req; 1278 } 1279 1280 /* 1281 * Put request/reply buffers back into pool. 1282 * Pre-decrement counter/array index. 
1283 */ 1284 void 1285 rpcrdma_buffer_put(struct rpcrdma_req *req) 1286 { 1287 struct rpcrdma_buffer *buffers = req->rl_buffer; 1288 struct rpcrdma_ia *ia = rdmab_to_ia(buffers); 1289 int i; 1290 unsigned long flags; 1291 1292 BUG_ON(req->rl_nchunks != 0); 1293 spin_lock_irqsave(&buffers->rb_lock, flags); 1294 buffers->rb_send_bufs[--buffers->rb_send_index] = req; 1295 req->rl_niovs = 0; 1296 if (req->rl_reply) { 1297 buffers->rb_recv_bufs[--buffers->rb_recv_index] = req->rl_reply; 1298 init_waitqueue_head(&req->rl_reply->rr_unbind); 1299 req->rl_reply->rr_func = NULL; 1300 req->rl_reply = NULL; 1301 } 1302 switch (ia->ri_memreg_strategy) { 1303 case RPCRDMA_FRMR: 1304 case RPCRDMA_MTHCAFMR: 1305 case RPCRDMA_MEMWINDOWS_ASYNC: 1306 case RPCRDMA_MEMWINDOWS: 1307 /* 1308 * Cycle mw's back in reverse order, and "spin" them. 1309 * This delays and scrambles reuse as much as possible. 1310 */ 1311 i = 1; 1312 do { 1313 struct rpcrdma_mw **mw; 1314 mw = &req->rl_segments[i].mr_chunk.rl_mw; 1315 list_add_tail(&(*mw)->mw_list, &buffers->rb_mws); 1316 *mw = NULL; 1317 } while (++i < RPCRDMA_MAX_SEGS); 1318 list_add_tail(&req->rl_segments[0].mr_chunk.rl_mw->mw_list, 1319 &buffers->rb_mws); 1320 req->rl_segments[0].mr_chunk.rl_mw = NULL; 1321 break; 1322 default: 1323 break; 1324 } 1325 spin_unlock_irqrestore(&buffers->rb_lock, flags); 1326 } 1327 1328 /* 1329 * Recover reply buffers from pool. 1330 * This happens when recovering from error conditions. 1331 * Post-increment counter/array index. 
1332 */ 1333 void 1334 rpcrdma_recv_buffer_get(struct rpcrdma_req *req) 1335 { 1336 struct rpcrdma_buffer *buffers = req->rl_buffer; 1337 unsigned long flags; 1338 1339 if (req->rl_iov.length == 0) /* special case xprt_rdma_allocate() */ 1340 buffers = ((struct rpcrdma_req *) buffers)->rl_buffer; 1341 spin_lock_irqsave(&buffers->rb_lock, flags); 1342 if (buffers->rb_recv_index < buffers->rb_max_requests) { 1343 req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index]; 1344 buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL; 1345 } 1346 spin_unlock_irqrestore(&buffers->rb_lock, flags); 1347 } 1348 1349 /* 1350 * Put reply buffers back into pool when not attached to 1351 * request. This happens in error conditions, and when 1352 * aborting unbinds. Pre-decrement counter/array index. 1353 */ 1354 void 1355 rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep) 1356 { 1357 struct rpcrdma_buffer *buffers = rep->rr_buffer; 1358 unsigned long flags; 1359 1360 rep->rr_func = NULL; 1361 spin_lock_irqsave(&buffers->rb_lock, flags); 1362 buffers->rb_recv_bufs[--buffers->rb_recv_index] = rep; 1363 spin_unlock_irqrestore(&buffers->rb_lock, flags); 1364 } 1365 1366 /* 1367 * Wrappers for internal-use kmalloc memory registration, used by buffer code. 1368 */ 1369 1370 int 1371 rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len, 1372 struct ib_mr **mrp, struct ib_sge *iov) 1373 { 1374 struct ib_phys_buf ipb; 1375 struct ib_mr *mr; 1376 int rc; 1377 1378 /* 1379 * All memory passed here was kmalloc'ed, therefore phys-contiguous. 
1380 */ 1381 iov->addr = ib_dma_map_single(ia->ri_id->device, 1382 va, len, DMA_BIDIRECTIONAL); 1383 iov->length = len; 1384 1385 if (ia->ri_have_dma_lkey) { 1386 *mrp = NULL; 1387 iov->lkey = ia->ri_dma_lkey; 1388 return 0; 1389 } else if (ia->ri_bind_mem != NULL) { 1390 *mrp = NULL; 1391 iov->lkey = ia->ri_bind_mem->lkey; 1392 return 0; 1393 } 1394 1395 ipb.addr = iov->addr; 1396 ipb.size = iov->length; 1397 mr = ib_reg_phys_mr(ia->ri_pd, &ipb, 1, 1398 IB_ACCESS_LOCAL_WRITE, &iov->addr); 1399 1400 dprintk("RPC: %s: phys convert: 0x%llx " 1401 "registered 0x%llx length %d\n", 1402 __func__, (unsigned long long)ipb.addr, 1403 (unsigned long long)iov->addr, len); 1404 1405 if (IS_ERR(mr)) { 1406 *mrp = NULL; 1407 rc = PTR_ERR(mr); 1408 dprintk("RPC: %s: failed with %i\n", __func__, rc); 1409 } else { 1410 *mrp = mr; 1411 iov->lkey = mr->lkey; 1412 rc = 0; 1413 } 1414 1415 return rc; 1416 } 1417 1418 int 1419 rpcrdma_deregister_internal(struct rpcrdma_ia *ia, 1420 struct ib_mr *mr, struct ib_sge *iov) 1421 { 1422 int rc; 1423 1424 ib_dma_unmap_single(ia->ri_id->device, 1425 iov->addr, iov->length, DMA_BIDIRECTIONAL); 1426 1427 if (NULL == mr) 1428 return 0; 1429 1430 rc = ib_dereg_mr(mr); 1431 if (rc) 1432 dprintk("RPC: %s: ib_dereg_mr failed %i\n", __func__, rc); 1433 return rc; 1434 } 1435 1436 /* 1437 * Wrappers for chunk registration, shared by read/write chunk code. 1438 */ 1439 1440 static void 1441 rpcrdma_map_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg, int writing) 1442 { 1443 seg->mr_dir = writing ? 
DMA_FROM_DEVICE : DMA_TO_DEVICE; 1444 seg->mr_dmalen = seg->mr_len; 1445 if (seg->mr_page) 1446 seg->mr_dma = ib_dma_map_page(ia->ri_id->device, 1447 seg->mr_page, offset_in_page(seg->mr_offset), 1448 seg->mr_dmalen, seg->mr_dir); 1449 else 1450 seg->mr_dma = ib_dma_map_single(ia->ri_id->device, 1451 seg->mr_offset, 1452 seg->mr_dmalen, seg->mr_dir); 1453 } 1454 1455 static void 1456 rpcrdma_unmap_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg) 1457 { 1458 if (seg->mr_page) 1459 ib_dma_unmap_page(ia->ri_id->device, 1460 seg->mr_dma, seg->mr_dmalen, seg->mr_dir); 1461 else 1462 ib_dma_unmap_single(ia->ri_id->device, 1463 seg->mr_dma, seg->mr_dmalen, seg->mr_dir); 1464 } 1465 1466 static int 1467 rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg, 1468 int *nsegs, int writing, struct rpcrdma_ia *ia, 1469 struct rpcrdma_xprt *r_xprt) 1470 { 1471 struct rpcrdma_mr_seg *seg1 = seg; 1472 struct ib_send_wr frmr_wr, *bad_wr; 1473 u8 key; 1474 int len, pageoff; 1475 int i, rc; 1476 1477 pageoff = offset_in_page(seg1->mr_offset); 1478 seg1->mr_offset -= pageoff; /* start of page */ 1479 seg1->mr_len += pageoff; 1480 len = -pageoff; 1481 if (*nsegs > RPCRDMA_MAX_DATA_SEGS) 1482 *nsegs = RPCRDMA_MAX_DATA_SEGS; 1483 for (i = 0; i < *nsegs;) { 1484 rpcrdma_map_one(ia, seg, writing); 1485 seg1->mr_chunk.rl_mw->r.frmr.fr_pgl->page_list[i] = seg->mr_dma; 1486 len += seg->mr_len; 1487 ++seg; 1488 ++i; 1489 /* Check for holes */ 1490 if ((i < *nsegs && offset_in_page(seg->mr_offset)) || 1491 offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len)) 1492 break; 1493 } 1494 dprintk("RPC: %s: Using frmr %p to map %d segments\n", 1495 __func__, seg1->mr_chunk.rl_mw, i); 1496 1497 /* Bump the key */ 1498 key = (u8)(seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey & 0x000000FF); 1499 ib_update_fast_reg_key(seg1->mr_chunk.rl_mw->r.frmr.fr_mr, ++key); 1500 1501 /* Prepare FRMR WR */ 1502 memset(&frmr_wr, 0, sizeof frmr_wr); 1503 frmr_wr.opcode = IB_WR_FAST_REG_MR; 1504 frmr_wr.send_flags = 
0; /* unsignaled */ 1505 frmr_wr.wr.fast_reg.iova_start = seg1->mr_dma; 1506 frmr_wr.wr.fast_reg.page_list = seg1->mr_chunk.rl_mw->r.frmr.fr_pgl; 1507 frmr_wr.wr.fast_reg.page_list_len = i; 1508 frmr_wr.wr.fast_reg.page_shift = PAGE_SHIFT; 1509 frmr_wr.wr.fast_reg.length = i << PAGE_SHIFT; 1510 frmr_wr.wr.fast_reg.access_flags = (writing ? 1511 IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE : 1512 IB_ACCESS_REMOTE_READ); 1513 frmr_wr.wr.fast_reg.rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey; 1514 DECR_CQCOUNT(&r_xprt->rx_ep); 1515 1516 rc = ib_post_send(ia->ri_id->qp, &frmr_wr, &bad_wr); 1517 1518 if (rc) { 1519 dprintk("RPC: %s: failed ib_post_send for register," 1520 " status %i\n", __func__, rc); 1521 while (i--) 1522 rpcrdma_unmap_one(ia, --seg); 1523 } else { 1524 seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey; 1525 seg1->mr_base = seg1->mr_dma + pageoff; 1526 seg1->mr_nsegs = i; 1527 seg1->mr_len = len; 1528 } 1529 *nsegs = i; 1530 return rc; 1531 } 1532 1533 static int 1534 rpcrdma_deregister_frmr_external(struct rpcrdma_mr_seg *seg, 1535 struct rpcrdma_ia *ia, struct rpcrdma_xprt *r_xprt) 1536 { 1537 struct rpcrdma_mr_seg *seg1 = seg; 1538 struct ib_send_wr invalidate_wr, *bad_wr; 1539 int rc; 1540 1541 while (seg1->mr_nsegs--) 1542 rpcrdma_unmap_one(ia, seg++); 1543 1544 memset(&invalidate_wr, 0, sizeof invalidate_wr); 1545 invalidate_wr.opcode = IB_WR_LOCAL_INV; 1546 invalidate_wr.send_flags = 0; /* unsignaled */ 1547 invalidate_wr.ex.invalidate_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey; 1548 DECR_CQCOUNT(&r_xprt->rx_ep); 1549 1550 rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr); 1551 if (rc) 1552 dprintk("RPC: %s: failed ib_post_send for invalidate," 1553 " status %i\n", __func__, rc); 1554 return rc; 1555 } 1556 1557 static int 1558 rpcrdma_register_fmr_external(struct rpcrdma_mr_seg *seg, 1559 int *nsegs, int writing, struct rpcrdma_ia *ia) 1560 { 1561 struct rpcrdma_mr_seg *seg1 = seg; 1562 u64 
physaddrs[RPCRDMA_MAX_DATA_SEGS]; 1563 int len, pageoff, i, rc; 1564 1565 pageoff = offset_in_page(seg1->mr_offset); 1566 seg1->mr_offset -= pageoff; /* start of page */ 1567 seg1->mr_len += pageoff; 1568 len = -pageoff; 1569 if (*nsegs > RPCRDMA_MAX_DATA_SEGS) 1570 *nsegs = RPCRDMA_MAX_DATA_SEGS; 1571 for (i = 0; i < *nsegs;) { 1572 rpcrdma_map_one(ia, seg, writing); 1573 physaddrs[i] = seg->mr_dma; 1574 len += seg->mr_len; 1575 ++seg; 1576 ++i; 1577 /* Check for holes */ 1578 if ((i < *nsegs && offset_in_page(seg->mr_offset)) || 1579 offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len)) 1580 break; 1581 } 1582 rc = ib_map_phys_fmr(seg1->mr_chunk.rl_mw->r.fmr, 1583 physaddrs, i, seg1->mr_dma); 1584 if (rc) { 1585 dprintk("RPC: %s: failed ib_map_phys_fmr " 1586 "%u@0x%llx+%i (%d)... status %i\n", __func__, 1587 len, (unsigned long long)seg1->mr_dma, 1588 pageoff, i, rc); 1589 while (i--) 1590 rpcrdma_unmap_one(ia, --seg); 1591 } else { 1592 seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.fmr->rkey; 1593 seg1->mr_base = seg1->mr_dma + pageoff; 1594 seg1->mr_nsegs = i; 1595 seg1->mr_len = len; 1596 } 1597 *nsegs = i; 1598 return rc; 1599 } 1600 1601 static int 1602 rpcrdma_deregister_fmr_external(struct rpcrdma_mr_seg *seg, 1603 struct rpcrdma_ia *ia) 1604 { 1605 struct rpcrdma_mr_seg *seg1 = seg; 1606 LIST_HEAD(l); 1607 int rc; 1608 1609 list_add(&seg1->mr_chunk.rl_mw->r.fmr->list, &l); 1610 rc = ib_unmap_fmr(&l); 1611 while (seg1->mr_nsegs--) 1612 rpcrdma_unmap_one(ia, seg++); 1613 if (rc) 1614 dprintk("RPC: %s: failed ib_unmap_fmr," 1615 " status %i\n", __func__, rc); 1616 return rc; 1617 } 1618 1619 static int 1620 rpcrdma_register_memwin_external(struct rpcrdma_mr_seg *seg, 1621 int *nsegs, int writing, struct rpcrdma_ia *ia, 1622 struct rpcrdma_xprt *r_xprt) 1623 { 1624 int mem_priv = (writing ? 
IB_ACCESS_REMOTE_WRITE : 1625 IB_ACCESS_REMOTE_READ); 1626 struct ib_mw_bind param; 1627 int rc; 1628 1629 *nsegs = 1; 1630 rpcrdma_map_one(ia, seg, writing); 1631 param.mr = ia->ri_bind_mem; 1632 param.wr_id = 0ULL; /* no send cookie */ 1633 param.addr = seg->mr_dma; 1634 param.length = seg->mr_len; 1635 param.send_flags = 0; 1636 param.mw_access_flags = mem_priv; 1637 1638 DECR_CQCOUNT(&r_xprt->rx_ep); 1639 rc = ib_bind_mw(ia->ri_id->qp, seg->mr_chunk.rl_mw->r.mw, ¶m); 1640 if (rc) { 1641 dprintk("RPC: %s: failed ib_bind_mw " 1642 "%u@0x%llx status %i\n", 1643 __func__, seg->mr_len, 1644 (unsigned long long)seg->mr_dma, rc); 1645 rpcrdma_unmap_one(ia, seg); 1646 } else { 1647 seg->mr_rkey = seg->mr_chunk.rl_mw->r.mw->rkey; 1648 seg->mr_base = param.addr; 1649 seg->mr_nsegs = 1; 1650 } 1651 return rc; 1652 } 1653 1654 static int 1655 rpcrdma_deregister_memwin_external(struct rpcrdma_mr_seg *seg, 1656 struct rpcrdma_ia *ia, 1657 struct rpcrdma_xprt *r_xprt, void **r) 1658 { 1659 struct ib_mw_bind param; 1660 LIST_HEAD(l); 1661 int rc; 1662 1663 BUG_ON(seg->mr_nsegs != 1); 1664 param.mr = ia->ri_bind_mem; 1665 param.addr = 0ULL; /* unbind */ 1666 param.length = 0; 1667 param.mw_access_flags = 0; 1668 if (*r) { 1669 param.wr_id = (u64) (unsigned long) *r; 1670 param.send_flags = IB_SEND_SIGNALED; 1671 INIT_CQCOUNT(&r_xprt->rx_ep); 1672 } else { 1673 param.wr_id = 0ULL; 1674 param.send_flags = 0; 1675 DECR_CQCOUNT(&r_xprt->rx_ep); 1676 } 1677 rc = ib_bind_mw(ia->ri_id->qp, seg->mr_chunk.rl_mw->r.mw, ¶m); 1678 rpcrdma_unmap_one(ia, seg); 1679 if (rc) 1680 dprintk("RPC: %s: failed ib_(un)bind_mw," 1681 " status %i\n", __func__, rc); 1682 else 1683 *r = NULL; /* will upcall on completion */ 1684 return rc; 1685 } 1686 1687 static int 1688 rpcrdma_register_default_external(struct rpcrdma_mr_seg *seg, 1689 int *nsegs, int writing, struct rpcrdma_ia *ia) 1690 { 1691 int mem_priv = (writing ? 
IB_ACCESS_REMOTE_WRITE : 1692 IB_ACCESS_REMOTE_READ); 1693 struct rpcrdma_mr_seg *seg1 = seg; 1694 struct ib_phys_buf ipb[RPCRDMA_MAX_DATA_SEGS]; 1695 int len, i, rc = 0; 1696 1697 if (*nsegs > RPCRDMA_MAX_DATA_SEGS) 1698 *nsegs = RPCRDMA_MAX_DATA_SEGS; 1699 for (len = 0, i = 0; i < *nsegs;) { 1700 rpcrdma_map_one(ia, seg, writing); 1701 ipb[i].addr = seg->mr_dma; 1702 ipb[i].size = seg->mr_len; 1703 len += seg->mr_len; 1704 ++seg; 1705 ++i; 1706 /* Check for holes */ 1707 if ((i < *nsegs && offset_in_page(seg->mr_offset)) || 1708 offset_in_page((seg-1)->mr_offset+(seg-1)->mr_len)) 1709 break; 1710 } 1711 seg1->mr_base = seg1->mr_dma; 1712 seg1->mr_chunk.rl_mr = ib_reg_phys_mr(ia->ri_pd, 1713 ipb, i, mem_priv, &seg1->mr_base); 1714 if (IS_ERR(seg1->mr_chunk.rl_mr)) { 1715 rc = PTR_ERR(seg1->mr_chunk.rl_mr); 1716 dprintk("RPC: %s: failed ib_reg_phys_mr " 1717 "%u@0x%llx (%d)... status %i\n", 1718 __func__, len, 1719 (unsigned long long)seg1->mr_dma, i, rc); 1720 while (i--) 1721 rpcrdma_unmap_one(ia, --seg); 1722 } else { 1723 seg1->mr_rkey = seg1->mr_chunk.rl_mr->rkey; 1724 seg1->mr_nsegs = i; 1725 seg1->mr_len = len; 1726 } 1727 *nsegs = i; 1728 return rc; 1729 } 1730 1731 static int 1732 rpcrdma_deregister_default_external(struct rpcrdma_mr_seg *seg, 1733 struct rpcrdma_ia *ia) 1734 { 1735 struct rpcrdma_mr_seg *seg1 = seg; 1736 int rc; 1737 1738 rc = ib_dereg_mr(seg1->mr_chunk.rl_mr); 1739 seg1->mr_chunk.rl_mr = NULL; 1740 while (seg1->mr_nsegs--) 1741 rpcrdma_unmap_one(ia, seg++); 1742 if (rc) 1743 dprintk("RPC: %s: failed ib_dereg_mr," 1744 " status %i\n", __func__, rc); 1745 return rc; 1746 } 1747 1748 int 1749 rpcrdma_register_external(struct rpcrdma_mr_seg *seg, 1750 int nsegs, int writing, struct rpcrdma_xprt *r_xprt) 1751 { 1752 struct rpcrdma_ia *ia = &r_xprt->rx_ia; 1753 int rc = 0; 1754 1755 switch (ia->ri_memreg_strategy) { 1756 1757 #if RPCRDMA_PERSISTENT_REGISTRATION 1758 case RPCRDMA_ALLPHYSICAL: 1759 rpcrdma_map_one(ia, seg, writing); 1760 
seg->mr_rkey = ia->ri_bind_mem->rkey; 1761 seg->mr_base = seg->mr_dma; 1762 seg->mr_nsegs = 1; 1763 nsegs = 1; 1764 break; 1765 #endif 1766 1767 /* Registration using frmr registration */ 1768 case RPCRDMA_FRMR: 1769 rc = rpcrdma_register_frmr_external(seg, &nsegs, writing, ia, r_xprt); 1770 break; 1771 1772 /* Registration using fmr memory registration */ 1773 case RPCRDMA_MTHCAFMR: 1774 rc = rpcrdma_register_fmr_external(seg, &nsegs, writing, ia); 1775 break; 1776 1777 /* Registration using memory windows */ 1778 case RPCRDMA_MEMWINDOWS_ASYNC: 1779 case RPCRDMA_MEMWINDOWS: 1780 rc = rpcrdma_register_memwin_external(seg, &nsegs, writing, ia, r_xprt); 1781 break; 1782 1783 /* Default registration each time */ 1784 default: 1785 rc = rpcrdma_register_default_external(seg, &nsegs, writing, ia); 1786 break; 1787 } 1788 if (rc) 1789 return -1; 1790 1791 return nsegs; 1792 } 1793 1794 int 1795 rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg, 1796 struct rpcrdma_xprt *r_xprt, void *r) 1797 { 1798 struct rpcrdma_ia *ia = &r_xprt->rx_ia; 1799 int nsegs = seg->mr_nsegs, rc; 1800 1801 switch (ia->ri_memreg_strategy) { 1802 1803 #if RPCRDMA_PERSISTENT_REGISTRATION 1804 case RPCRDMA_ALLPHYSICAL: 1805 BUG_ON(nsegs != 1); 1806 rpcrdma_unmap_one(ia, seg); 1807 rc = 0; 1808 break; 1809 #endif 1810 1811 case RPCRDMA_FRMR: 1812 rc = rpcrdma_deregister_frmr_external(seg, ia, r_xprt); 1813 break; 1814 1815 case RPCRDMA_MTHCAFMR: 1816 rc = rpcrdma_deregister_fmr_external(seg, ia); 1817 break; 1818 1819 case RPCRDMA_MEMWINDOWS_ASYNC: 1820 case RPCRDMA_MEMWINDOWS: 1821 rc = rpcrdma_deregister_memwin_external(seg, ia, r_xprt, &r); 1822 break; 1823 1824 default: 1825 rc = rpcrdma_deregister_default_external(seg, ia); 1826 break; 1827 } 1828 if (r) { 1829 struct rpcrdma_rep *rep = r; 1830 void (*func)(struct rpcrdma_rep *) = rep->rr_func; 1831 rep->rr_func = NULL; 1832 func(rep); /* dereg done, callback now */ 1833 } 1834 return nsegs; 1835 } 1836 1837 /* 1838 * Prepost any receive 
buffer, then post send. 1839 * 1840 * Receive buffer is donated to hardware, reclaimed upon recv completion. 1841 */ 1842 int 1843 rpcrdma_ep_post(struct rpcrdma_ia *ia, 1844 struct rpcrdma_ep *ep, 1845 struct rpcrdma_req *req) 1846 { 1847 struct ib_send_wr send_wr, *send_wr_fail; 1848 struct rpcrdma_rep *rep = req->rl_reply; 1849 int rc; 1850 1851 if (rep) { 1852 rc = rpcrdma_ep_post_recv(ia, ep, rep); 1853 if (rc) 1854 goto out; 1855 req->rl_reply = NULL; 1856 } 1857 1858 send_wr.next = NULL; 1859 send_wr.wr_id = 0ULL; /* no send cookie */ 1860 send_wr.sg_list = req->rl_send_iov; 1861 send_wr.num_sge = req->rl_niovs; 1862 send_wr.opcode = IB_WR_SEND; 1863 if (send_wr.num_sge == 4) /* no need to sync any pad (constant) */ 1864 ib_dma_sync_single_for_device(ia->ri_id->device, 1865 req->rl_send_iov[3].addr, req->rl_send_iov[3].length, 1866 DMA_TO_DEVICE); 1867 ib_dma_sync_single_for_device(ia->ri_id->device, 1868 req->rl_send_iov[1].addr, req->rl_send_iov[1].length, 1869 DMA_TO_DEVICE); 1870 ib_dma_sync_single_for_device(ia->ri_id->device, 1871 req->rl_send_iov[0].addr, req->rl_send_iov[0].length, 1872 DMA_TO_DEVICE); 1873 1874 if (DECR_CQCOUNT(ep) > 0) 1875 send_wr.send_flags = 0; 1876 else { /* Provider must take a send completion every now and then */ 1877 INIT_CQCOUNT(ep); 1878 send_wr.send_flags = IB_SEND_SIGNALED; 1879 } 1880 1881 rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail); 1882 if (rc) 1883 dprintk("RPC: %s: ib_post_send returned %i\n", __func__, 1884 rc); 1885 out: 1886 return rc; 1887 } 1888 1889 /* 1890 * (Re)post a receive buffer. 
1891 */ 1892 int 1893 rpcrdma_ep_post_recv(struct rpcrdma_ia *ia, 1894 struct rpcrdma_ep *ep, 1895 struct rpcrdma_rep *rep) 1896 { 1897 struct ib_recv_wr recv_wr, *recv_wr_fail; 1898 int rc; 1899 1900 recv_wr.next = NULL; 1901 recv_wr.wr_id = (u64) (unsigned long) rep; 1902 recv_wr.sg_list = &rep->rr_iov; 1903 recv_wr.num_sge = 1; 1904 1905 ib_dma_sync_single_for_cpu(ia->ri_id->device, 1906 rep->rr_iov.addr, rep->rr_iov.length, DMA_BIDIRECTIONAL); 1907 1908 DECR_CQCOUNT(ep); 1909 rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail); 1910 1911 if (rc) 1912 dprintk("RPC: %s: ib_post_recv returned %i\n", __func__, 1913 rc); 1914 return rc; 1915 } 1916