/*
 * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses. You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the BSD-type
 * license below:
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *      Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *
 *      Redistributions in binary form must reproduce the above
 *      copyright notice, this list of conditions and the following
 *      disclaimer in the documentation and/or other materials provided
 *      with the distribution.
 *
 *      Neither the name of the Network Appliance, Inc. nor the names of
 *      its contributors may be used to endorse or promote products
 *      derived from this software without specific prior written
 *      permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * verbs.c
 *
 * Encapsulates the major functions managing:
 *  o adapters
 *  o endpoints
 *  o connections
 *  o buffer memory
 */

#include <linux/interrupt.h>
#include <linux/slab.h>
#include <asm/bitops.h>

#include "xprt_rdma.h"

/*
 * Globals/Macros
 */

#ifdef RPC_DEBUG
# define RPCDBG_FACILITY	RPCDBG_TRANS
#endif

static void rpcrdma_reset_frmrs(struct rpcrdma_ia *);

/*
 * internal functions
 */

/*
 * handle replies in tasklet context, using a single, global list
 * rdma tasklet function -- just turn around and call the func
 * for all replies on the list
 */

static DEFINE_SPINLOCK(rpcrdma_tk_lock_g);
static LIST_HEAD(rpcrdma_tasklets_g);

static void
rpcrdma_run_tasklet(unsigned long data)
{
	struct rpcrdma_rep *rep;
	void (*func)(struct rpcrdma_rep *);
	unsigned long flags;

	data = data;
	spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
	while (!list_empty(&rpcrdma_tasklets_g)) {
		rep = list_entry(rpcrdma_tasklets_g.next,
				 struct rpcrdma_rep, rr_list);
		list_del(&rep->rr_list);
		func = rep->rr_func;
		rep->rr_func = NULL;
		spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);

		if (func)
			func(rep);
		else
			rpcrdma_recv_buffer_put(rep);

		spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
	}
	spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
}

static DECLARE_TASKLET(rpcrdma_tasklet_g, rpcrdma_run_tasklet, 0UL);

static void
rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context)
{
	struct rpcrdma_ep *ep = context;

	dprintk("RPC: %s: QP error %X on device %s ep %p\n",
		__func__, event->event, event->device->name, context);
	if (ep->rep_connected == 1) {
		ep->rep_connected = -EIO;
		ep->rep_func(ep);
		wake_up_all(&ep->rep_connect_wait);
	}
}

static void
rpcrdma_cq_async_error_upcall(struct ib_event *event, void *context)
{
	struct rpcrdma_ep *ep = context;

	dprintk("RPC: %s: CQ error %X on device %s ep %p\n",
		__func__, event->event, event->device->name, context);
	if (ep->rep_connected == 1) {
		ep->rep_connected = -EIO;
		ep->rep_func(ep);
		wake_up_all(&ep->rep_connect_wait);
	}
}

static void
rpcrdma_sendcq_process_wc(struct ib_wc *wc)
{
	struct rpcrdma_mw *frmr = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;

	dprintk("RPC: %s: frmr %p status %X opcode %d\n",
		__func__, frmr, wc->status, wc->opcode);

	if (wc->wr_id == 0ULL)
		return;
	if (wc->status != IB_WC_SUCCESS)
		frmr->r.frmr.fr_state = FRMR_IS_STALE;
}

static int
rpcrdma_sendcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep)
{
	struct ib_wc *wcs;
	int budget, count, rc;

	budget = RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE;
	do {
		wcs = ep->rep_send_wcs;

		rc = ib_poll_cq(cq, RPCRDMA_POLLSIZE, wcs);
		if (rc <= 0)
			return rc;

		count = rc;
		while (count-- > 0)
			rpcrdma_sendcq_process_wc(wcs++);
	} while (rc == RPCRDMA_POLLSIZE && --budget);
	return 0;
}

/*
 * Handle send, fast_reg_mr, and local_inv completions.
 *
 * Send events are typically suppressed and thus do not result
 * in an upcall. Occasionally one is signaled, however. This
 * prevents the provider's completion queue from wrapping and
 * losing a completion.
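 *
 * rpcrdma_sendcq_poll() above drains the CQ in batches of
 * RPCRDMA_POLLSIZE completions and gives up after roughly
 * RPCRDMA_WC_BUDGET completions, so a single upcall cannot
 * monopolize the CPU; any remainder is picked up after the
 * CQ is re-armed below.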
 */
static void
rpcrdma_sendcq_upcall(struct ib_cq *cq, void *cq_context)
{
	struct rpcrdma_ep *ep = (struct rpcrdma_ep *)cq_context;
	int rc;

	rc = rpcrdma_sendcq_poll(cq, ep);
	if (rc) {
		dprintk("RPC: %s: ib_poll_cq failed: %i\n",
			__func__, rc);
		return;
	}

	rc = ib_req_notify_cq(cq,
			IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
	if (rc == 0)
		return;
	if (rc < 0) {
		dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
			__func__, rc);
		return;
	}

	rpcrdma_sendcq_poll(cq, ep);
}

static void
rpcrdma_recvcq_process_wc(struct ib_wc *wc, struct list_head *sched_list)
{
	struct rpcrdma_rep *rep =
			(struct rpcrdma_rep *)(unsigned long)wc->wr_id;

	dprintk("RPC: %s: rep %p status %X opcode %X length %u\n",
		__func__, rep, wc->status, wc->opcode, wc->byte_len);

	if (wc->status != IB_WC_SUCCESS) {
		rep->rr_len = ~0U;
		goto out_schedule;
	}
	if (wc->opcode != IB_WC_RECV)
		return;

	rep->rr_len = wc->byte_len;
	ib_dma_sync_single_for_cpu(rdmab_to_ia(rep->rr_buffer)->ri_id->device,
				   rep->rr_iov.addr, rep->rr_len,
				   DMA_FROM_DEVICE);

	if (rep->rr_len >= 16) {
		struct rpcrdma_msg *p = (struct rpcrdma_msg *)rep->rr_base;
		unsigned int credits = ntohl(p->rm_credit);

		if (credits == 0)
			credits = 1;	/* don't deadlock */
		else if (credits > rep->rr_buffer->rb_max_requests)
			credits = rep->rr_buffer->rb_max_requests;
		atomic_set(&rep->rr_buffer->rb_credits, credits);
	}

out_schedule:
	list_add_tail(&rep->rr_list, sched_list);
}

static int
rpcrdma_recvcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep)
{
	struct list_head sched_list;
	struct ib_wc *wcs;
	int budget, count, rc;
	unsigned long flags;

	INIT_LIST_HEAD(&sched_list);
	budget = RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE;
	do {
		wcs = ep->rep_recv_wcs;

		rc = ib_poll_cq(cq, RPCRDMA_POLLSIZE, wcs);
		if (rc <= 0)
			goto out_schedule;

		count = rc;
		while (count-- > 0)
			rpcrdma_recvcq_process_wc(wcs++, &sched_list);
	} while (rc == RPCRDMA_POLLSIZE && --budget);
	rc = 0;

out_schedule:
	spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
	list_splice_tail(&sched_list, &rpcrdma_tasklets_g);
	spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
	tasklet_schedule(&rpcrdma_tasklet_g);
	return rc;
}

/*
 * Handle receive completions.
 *
 * It is reentrant but processes single events in order to maintain
 * ordering of receives to keep server credits.
 *
 * It is the responsibility of the scheduled tasklet to return
 * recv buffers to the pool. NOTE: this affects synchronization of
 * connection shutdown. That is, the structures required for
 * the completion of the reply handler must remain intact until
 * all memory has been reclaimed.
 */
static void
rpcrdma_recvcq_upcall(struct ib_cq *cq, void *cq_context)
{
	struct rpcrdma_ep *ep = (struct rpcrdma_ep *)cq_context;
	int rc;

	rc = rpcrdma_recvcq_poll(cq, ep);
	if (rc) {
		dprintk("RPC: %s: ib_poll_cq failed: %i\n",
			__func__, rc);
		return;
	}

	rc = ib_req_notify_cq(cq,
			IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
	if (rc == 0)
		return;
	if (rc < 0) {
		dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
			__func__, rc);
		return;
	}

	rpcrdma_recvcq_poll(cq, ep);
}

static void
rpcrdma_flush_cqs(struct rpcrdma_ep *ep)
{
	rpcrdma_recvcq_upcall(ep->rep_attr.recv_cq, ep);
	rpcrdma_sendcq_upcall(ep->rep_attr.send_cq, ep);
}

#ifdef RPC_DEBUG
static const char * const conn[] = {
	"address resolved",
	"address error",
	"route resolved",
	"route error",
	"connect request",
	"connect response",
	"connect error",
	"unreachable",
	"rejected",
	"established",
	"disconnected",
	"device removal",
	"multicast join",
	"multicast error",
	"address change",
	"timewait exit",
};

#define CONNECTION_MSG(status) \
	((status) < ARRAY_SIZE(conn) ? \
		conn[(status)] : "unrecognized connection error")
#endif

static int
rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
{
	struct rpcrdma_xprt *xprt = id->context;
	struct rpcrdma_ia *ia = &xprt->rx_ia;
	struct rpcrdma_ep *ep = &xprt->rx_ep;
#ifdef RPC_DEBUG
	struct sockaddr_in *addr = (struct sockaddr_in *) &ep->rep_remote_addr;
#endif
	struct ib_qp_attr attr;
	struct ib_qp_init_attr iattr;
	int connstate = 0;

	switch (event->event) {
	case RDMA_CM_EVENT_ADDR_RESOLVED:
	case RDMA_CM_EVENT_ROUTE_RESOLVED:
		ia->ri_async_rc = 0;
		complete(&ia->ri_done);
		break;
	case RDMA_CM_EVENT_ADDR_ERROR:
		ia->ri_async_rc = -EHOSTUNREACH;
		dprintk("RPC: %s: CM address resolution error, ep 0x%p\n",
			__func__, ep);
		complete(&ia->ri_done);
		break;
	case RDMA_CM_EVENT_ROUTE_ERROR:
		ia->ri_async_rc = -ENETUNREACH;
		dprintk("RPC: %s: CM route resolution error, ep 0x%p\n",
			__func__, ep);
		complete(&ia->ri_done);
		break;
	case RDMA_CM_EVENT_ESTABLISHED:
		connstate = 1;
		ib_query_qp(ia->ri_id->qp, &attr,
			IB_QP_MAX_QP_RD_ATOMIC | IB_QP_MAX_DEST_RD_ATOMIC,
			&iattr);
		dprintk("RPC: %s: %d responder resources"
			" (%d initiator)\n",
			__func__, attr.max_dest_rd_atomic, attr.max_rd_atomic);
		goto connected;
	case RDMA_CM_EVENT_CONNECT_ERROR:
		connstate = -ENOTCONN;
		goto connected;
	case RDMA_CM_EVENT_UNREACHABLE:
		connstate = -ENETDOWN;
		goto connected;
	case RDMA_CM_EVENT_REJECTED:
		connstate = -ECONNREFUSED;
		goto connected;
	case RDMA_CM_EVENT_DISCONNECTED:
		connstate = -ECONNABORTED;
		goto connected;
	case RDMA_CM_EVENT_DEVICE_REMOVAL:
		connstate = -ENODEV;
connected:
		atomic_set(&rpcx_to_rdmax(ep->rep_xprt)->rx_buf.rb_credits, 1);
		dprintk("RPC: %s: %sconnected\n",
			__func__, connstate > 0 ? "" : "dis");
"" : "dis"); 399 ep->rep_connected = connstate; 400 ep->rep_func(ep); 401 wake_up_all(&ep->rep_connect_wait); 402 /*FALLTHROUGH*/ 403 default: 404 dprintk("RPC: %s: %pI4:%u (ep 0x%p): %s\n", 405 __func__, &addr->sin_addr.s_addr, 406 ntohs(addr->sin_port), ep, 407 CONNECTION_MSG(event->event)); 408 break; 409 } 410 411 #ifdef RPC_DEBUG 412 if (connstate == 1) { 413 int ird = attr.max_dest_rd_atomic; 414 int tird = ep->rep_remote_cma.responder_resources; 415 printk(KERN_INFO "rpcrdma: connection to %pI4:%u " 416 "on %s, memreg %d slots %d ird %d%s\n", 417 &addr->sin_addr.s_addr, 418 ntohs(addr->sin_port), 419 ia->ri_id->device->name, 420 ia->ri_memreg_strategy, 421 xprt->rx_buf.rb_max_requests, 422 ird, ird < 4 && ird < tird / 2 ? " (low!)" : ""); 423 } else if (connstate < 0) { 424 printk(KERN_INFO "rpcrdma: connection to %pI4:%u closed (%d)\n", 425 &addr->sin_addr.s_addr, 426 ntohs(addr->sin_port), 427 connstate); 428 } 429 #endif 430 431 return 0; 432 } 433 434 static struct rdma_cm_id * 435 rpcrdma_create_id(struct rpcrdma_xprt *xprt, 436 struct rpcrdma_ia *ia, struct sockaddr *addr) 437 { 438 struct rdma_cm_id *id; 439 int rc; 440 441 init_completion(&ia->ri_done); 442 443 id = rdma_create_id(rpcrdma_conn_upcall, xprt, RDMA_PS_TCP, IB_QPT_RC); 444 if (IS_ERR(id)) { 445 rc = PTR_ERR(id); 446 dprintk("RPC: %s: rdma_create_id() failed %i\n", 447 __func__, rc); 448 return id; 449 } 450 451 ia->ri_async_rc = -ETIMEDOUT; 452 rc = rdma_resolve_addr(id, NULL, addr, RDMA_RESOLVE_TIMEOUT); 453 if (rc) { 454 dprintk("RPC: %s: rdma_resolve_addr() failed %i\n", 455 __func__, rc); 456 goto out; 457 } 458 wait_for_completion_interruptible_timeout(&ia->ri_done, 459 msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1); 460 rc = ia->ri_async_rc; 461 if (rc) 462 goto out; 463 464 ia->ri_async_rc = -ETIMEDOUT; 465 rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT); 466 if (rc) { 467 dprintk("RPC: %s: rdma_resolve_route() failed %i\n", 468 __func__, rc); 469 goto out; 470 } 471 wait_for_completion_interruptible_timeout(&ia->ri_done, 472 msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1); 473 rc = ia->ri_async_rc; 474 if (rc) 475 goto out; 476 477 return id; 478 479 out: 480 rdma_destroy_id(id); 481 return ERR_PTR(rc); 482 } 483 484 /* 485 * Drain any cq, prior to teardown. 486 */ 487 static void 488 rpcrdma_clean_cq(struct ib_cq *cq) 489 { 490 struct ib_wc wc; 491 int count = 0; 492 493 while (1 == ib_poll_cq(cq, 1, &wc)) 494 ++count; 495 496 if (count) 497 dprintk("RPC: %s: flushed %d events (last 0x%x)\n", 498 __func__, count, wc.opcode); 499 } 500 501 /* 502 * Exported functions. 503 */ 504 505 /* 506 * Open and initialize an Interface Adapter. 507 * o initializes fields of struct rpcrdma_ia, including 508 * interface and provider attributes and protection zone. 509 */ 510 int 511 rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) 512 { 513 int rc, mem_priv; 514 struct ib_device_attr devattr; 515 struct rpcrdma_ia *ia = &xprt->rx_ia; 516 517 ia->ri_id = rpcrdma_create_id(xprt, ia, addr); 518 if (IS_ERR(ia->ri_id)) { 519 rc = PTR_ERR(ia->ri_id); 520 goto out1; 521 } 522 523 ia->ri_pd = ib_alloc_pd(ia->ri_id->device); 524 if (IS_ERR(ia->ri_pd)) { 525 rc = PTR_ERR(ia->ri_pd); 526 dprintk("RPC: %s: ib_alloc_pd() failed %i\n", 527 __func__, rc); 528 goto out2; 529 } 530 531 /* 532 * Query the device to determine if the requested memory 533 * registration strategy is supported. If it isn't, set the 534 * strategy to a globally supported model. 
	 */
	rc = ib_query_device(ia->ri_id->device, &devattr);
	if (rc) {
		dprintk("RPC: %s: ib_query_device failed %d\n",
			__func__, rc);
		goto out2;
	}

	if (devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) {
		ia->ri_have_dma_lkey = 1;
		ia->ri_dma_lkey = ia->ri_id->device->local_dma_lkey;
	}

	if (memreg == RPCRDMA_FRMR) {
		/* Requires both frmr reg and local dma lkey */
		if ((devattr.device_cap_flags &
		     (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) !=
		    (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) {
			dprintk("RPC: %s: FRMR registration "
				"not supported by HCA\n", __func__);
			memreg = RPCRDMA_MTHCAFMR;
		} else {
			/* Mind the ia limit on FRMR page list depth */
			ia->ri_max_frmr_depth = min_t(unsigned int,
				RPCRDMA_MAX_DATA_SEGS,
				devattr.max_fast_reg_page_list_len);
		}
	}
	if (memreg == RPCRDMA_MTHCAFMR) {
		if (!ia->ri_id->device->alloc_fmr) {
			dprintk("RPC: %s: MTHCAFMR registration "
				"not supported by HCA\n", __func__);
			memreg = RPCRDMA_ALLPHYSICAL;
		}
	}

	/*
	 * Optionally obtain an underlying physical identity mapping in
	 * order to do a memory window-based bind. This base registration
	 * is protected from remote access - that is enabled only by binding
	 * for the specific bytes targeted during each RPC operation, and
	 * revoked after the corresponding completion similar to a storage
	 * adapter.
	 */
	switch (memreg) {
	case RPCRDMA_FRMR:
		break;
	case RPCRDMA_ALLPHYSICAL:
		mem_priv = IB_ACCESS_LOCAL_WRITE |
				IB_ACCESS_REMOTE_WRITE |
				IB_ACCESS_REMOTE_READ;
		goto register_setup;
	case RPCRDMA_MTHCAFMR:
		if (ia->ri_have_dma_lkey)
			break;
		mem_priv = IB_ACCESS_LOCAL_WRITE;
	register_setup:
		ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv);
		if (IS_ERR(ia->ri_bind_mem)) {
			printk(KERN_ALERT "%s: ib_get_dma_mr for "
				"phys register failed with %lX\n",
				__func__, PTR_ERR(ia->ri_bind_mem));
			rc = -ENOMEM;
			goto out2;
		}
		break;
	default:
		printk(KERN_ERR "RPC: Unsupported memory "
				"registration mode: %d\n", memreg);
		rc = -ENOMEM;
		goto out2;
	}
	dprintk("RPC: %s: memory registration strategy is %d\n",
		__func__, memreg);

	/* Else will do memory reg/dereg for each chunk */
	ia->ri_memreg_strategy = memreg;

	rwlock_init(&ia->ri_qplock);
	return 0;
out2:
	rdma_destroy_id(ia->ri_id);
	ia->ri_id = NULL;
out1:
	return rc;
}

/*
 * Clean up/close an IA.
 *  o if event handles and PD have been initialized, free them.
 *  o close the IA
 */
void
rpcrdma_ia_close(struct rpcrdma_ia *ia)
{
	int rc;

	dprintk("RPC: %s: entering\n", __func__);
	if (ia->ri_bind_mem != NULL) {
		rc = ib_dereg_mr(ia->ri_bind_mem);
		dprintk("RPC: %s: ib_dereg_mr returned %i\n",
			__func__, rc);
	}
	if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) {
		if (ia->ri_id->qp)
			rdma_destroy_qp(ia->ri_id);
		rdma_destroy_id(ia->ri_id);
		ia->ri_id = NULL;
	}
	if (ia->ri_pd != NULL && !IS_ERR(ia->ri_pd)) {
		rc = ib_dealloc_pd(ia->ri_pd);
		dprintk("RPC: %s: ib_dealloc_pd returned %i\n",
			__func__, rc);
	}
}

/*
 * Create unconnected endpoint.
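 *
 * Sizes the QP send/receive work request limits from cdata->max_requests
 * (scaled up for FRMR registration/invalidation WRs), creates the send
 * and receive completion queues, and fills in the connection parameters
 * used later by rpcrdma_ep_connect().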
 */
int
rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
				struct rpcrdma_create_data_internal *cdata)
{
	struct ib_device_attr devattr;
	struct ib_cq *sendcq, *recvcq;
	int rc, err;

	rc = ib_query_device(ia->ri_id->device, &devattr);
	if (rc) {
		dprintk("RPC: %s: ib_query_device failed %d\n",
			__func__, rc);
		return rc;
	}

	/* check provider's send/recv wr limits */
	if (cdata->max_requests > devattr.max_qp_wr)
		cdata->max_requests = devattr.max_qp_wr;

	ep->rep_attr.event_handler = rpcrdma_qp_async_error_upcall;
	ep->rep_attr.qp_context = ep;
	/* send_cq and recv_cq initialized below */
	ep->rep_attr.srq = NULL;
	ep->rep_attr.cap.max_send_wr = cdata->max_requests;
	switch (ia->ri_memreg_strategy) {
	case RPCRDMA_FRMR: {
		int depth = 7;

		/* Add room for frmr register and invalidate WRs.
		 * 1. FRMR reg WR for head
		 * 2. FRMR invalidate WR for head
		 * 3. N FRMR reg WRs for pagelist
		 * 4. N FRMR invalidate WRs for pagelist
		 * 5. FRMR reg WR for tail
		 * 6. FRMR invalidate WR for tail
		 * 7. The RDMA_SEND WR
		 */

		/* Calculate N if the device max FRMR depth is smaller than
		 * RPCRDMA_MAX_DATA_SEGS.
		 */
		if (ia->ri_max_frmr_depth < RPCRDMA_MAX_DATA_SEGS) {
			int delta = RPCRDMA_MAX_DATA_SEGS -
				    ia->ri_max_frmr_depth;

			do {
				depth += 2; /* FRMR reg + invalidate */
				delta -= ia->ri_max_frmr_depth;
			} while (delta > 0);

		}
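		/* Illustrative example (hypothetical values): if the
		 * device advertised a max FRMR depth of 16 and
		 * RPCRDMA_MAX_DATA_SEGS were 64, delta would start at 48
		 * and the loop above would run three times, leaving
		 * depth = 7 + 6 = 13 send WRs per RPC.
		 */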
		ep->rep_attr.cap.max_send_wr *= depth;
		if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr) {
			cdata->max_requests = devattr.max_qp_wr / depth;
			if (!cdata->max_requests)
				return -EINVAL;
			ep->rep_attr.cap.max_send_wr = cdata->max_requests *
						       depth;
		}
		break;
	}
	default:
		break;
	}
	ep->rep_attr.cap.max_recv_wr = cdata->max_requests;
	ep->rep_attr.cap.max_send_sge = (cdata->padding ? 4 : 2);
	ep->rep_attr.cap.max_recv_sge = 1;
	ep->rep_attr.cap.max_inline_data = 0;
	ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
	ep->rep_attr.qp_type = IB_QPT_RC;
	ep->rep_attr.port_num = ~0;

	dprintk("RPC: %s: requested max: dtos: send %d recv %d; "
		"iovs: send %d recv %d\n",
		__func__,
		ep->rep_attr.cap.max_send_wr,
		ep->rep_attr.cap.max_recv_wr,
		ep->rep_attr.cap.max_send_sge,
		ep->rep_attr.cap.max_recv_sge);

	/* set trigger for requesting send completion */
	ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 - 1;
	if (ep->rep_cqinit <= 2)
		ep->rep_cqinit = 0;
	INIT_CQCOUNT(ep);
	ep->rep_ia = ia;
	init_waitqueue_head(&ep->rep_connect_wait);
	INIT_DELAYED_WORK(&ep->rep_connect_worker, rpcrdma_connect_worker);

	sendcq = ib_create_cq(ia->ri_id->device, rpcrdma_sendcq_upcall,
				  rpcrdma_cq_async_error_upcall, ep,
				  ep->rep_attr.cap.max_send_wr + 1, 0);
	if (IS_ERR(sendcq)) {
		rc = PTR_ERR(sendcq);
		dprintk("RPC: %s: failed to create send CQ: %i\n",
			__func__, rc);
		goto out1;
	}

	rc = ib_req_notify_cq(sendcq, IB_CQ_NEXT_COMP);
	if (rc) {
		dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
			__func__, rc);
		goto out2;
	}

	recvcq = ib_create_cq(ia->ri_id->device, rpcrdma_recvcq_upcall,
				  rpcrdma_cq_async_error_upcall, ep,
				  ep->rep_attr.cap.max_recv_wr + 1, 0);
	if (IS_ERR(recvcq)) {
		rc = PTR_ERR(recvcq);
		dprintk("RPC: %s: failed to create recv CQ: %i\n",
			__func__, rc);
		goto out2;
	}

	rc = ib_req_notify_cq(recvcq, IB_CQ_NEXT_COMP);
	if (rc) {
		dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
			__func__, rc);
		ib_destroy_cq(recvcq);
		goto out2;
	}

	ep->rep_attr.send_cq = sendcq;
	ep->rep_attr.recv_cq = recvcq;

	/* Initialize cma parameters */

	/* RPC/RDMA does not use private data */
	ep->rep_remote_cma.private_data = NULL;
	ep->rep_remote_cma.private_data_len = 0;

	/* Client offers RDMA Read but does not initiate */
	ep->rep_remote_cma.initiator_depth = 0;
	if (devattr.max_qp_rd_atom > 32)	/* arbitrary but <= 255 */
		ep->rep_remote_cma.responder_resources = 32;
	else
		ep->rep_remote_cma.responder_resources = devattr.max_qp_rd_atom;

	ep->rep_remote_cma.retry_count = 7;
	ep->rep_remote_cma.flow_control = 0;
	ep->rep_remote_cma.rnr_retry_count = 0;

	return 0;

out2:
	err = ib_destroy_cq(sendcq);
	if (err)
		dprintk("RPC: %s: ib_destroy_cq returned %i\n",
			__func__, err);
out1:
	return rc;
}

/*
 * rpcrdma_ep_destroy
 *
 * Disconnect and destroy endpoint. After this, the only
 * valid operations on the ep are to free it (if dynamically
 * allocated) or re-create it.
 */
void
rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
{
	int rc;

	dprintk("RPC: %s: entering, connected is %d\n",
		__func__, ep->rep_connected);

	cancel_delayed_work_sync(&ep->rep_connect_worker);

	if (ia->ri_id->qp) {
		rpcrdma_ep_disconnect(ep, ia);
		rdma_destroy_qp(ia->ri_id);
		ia->ri_id->qp = NULL;
	}

	/* padding - could be done in rpcrdma_buffer_destroy... */
	if (ep->rep_pad_mr) {
		rpcrdma_deregister_internal(ia, ep->rep_pad_mr, &ep->rep_pad);
		ep->rep_pad_mr = NULL;
	}

	rpcrdma_clean_cq(ep->rep_attr.recv_cq);
	rc = ib_destroy_cq(ep->rep_attr.recv_cq);
	if (rc)
		dprintk("RPC: %s: ib_destroy_cq returned %i\n",
			__func__, rc);

	rpcrdma_clean_cq(ep->rep_attr.send_cq);
	rc = ib_destroy_cq(ep->rep_attr.send_cq);
	if (rc)
		dprintk("RPC: %s: ib_destroy_cq returned %i\n",
			__func__, rc);
}

/*
 * Connect unconnected endpoint.
 */
int
rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
{
	struct rdma_cm_id *id, *old;
	int rc = 0;
	int retry_count = 0;

	if (ep->rep_connected != 0) {
		struct rpcrdma_xprt *xprt;
retry:
		dprintk("RPC: %s: reconnecting...\n", __func__);

		rpcrdma_ep_disconnect(ep, ia);
		rpcrdma_flush_cqs(ep);

		if (ia->ri_memreg_strategy == RPCRDMA_FRMR)
			rpcrdma_reset_frmrs(ia);

		xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
		id = rpcrdma_create_id(xprt, ia,
				(struct sockaddr *)&xprt->rx_data.addr);
		if (IS_ERR(id)) {
			rc = -EHOSTUNREACH;
			goto out;
		}
		/* TEMP TEMP TEMP - fail if new device:
		 * Deregister/remarshal *all* requests!
		 * Close and recreate adapter, pd, etc!
		 * Re-determine all attributes still sane!
		 * More stuff I haven't thought of!
		 * Rrrgh!
		 */
		if (ia->ri_id->device != id->device) {
			printk("RPC: %s: can't reconnect on "
				"different device!\n", __func__);
			rdma_destroy_id(id);
			rc = -ENETUNREACH;
			goto out;
		}
		/* END TEMP */
		rc = rdma_create_qp(id, ia->ri_pd, &ep->rep_attr);
		if (rc) {
			dprintk("RPC: %s: rdma_create_qp failed %i\n",
				__func__, rc);
			rdma_destroy_id(id);
			rc = -ENETUNREACH;
			goto out;
		}

		write_lock(&ia->ri_qplock);
		old = ia->ri_id;
		ia->ri_id = id;
		write_unlock(&ia->ri_qplock);

		rdma_destroy_qp(old);
		rdma_destroy_id(old);
	} else {
		dprintk("RPC: %s: connecting...\n", __func__);
		rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr);
		if (rc) {
			dprintk("RPC: %s: rdma_create_qp failed %i\n",
				__func__, rc);
			/* do not update ep->rep_connected */
			return -ENETUNREACH;
		}
	}

	ep->rep_connected = 0;

	rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma);
	if (rc) {
		dprintk("RPC: %s: rdma_connect() failed with %i\n",
				__func__, rc);
		goto out;
	}

	wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0);

	/*
	 * Check state. A non-peer reject indicates no listener
	 * (ECONNREFUSED), which may be a transient state. All
	 * others indicate a transport condition which has already
	 * undergone a best-effort.
	 */
	if (ep->rep_connected == -ECONNREFUSED &&
	    ++retry_count <= RDMA_CONNECT_RETRY_MAX) {
		dprintk("RPC: %s: non-peer_reject, retry\n", __func__);
		goto retry;
	}
	if (ep->rep_connected <= 0) {
		/* Sometimes, the only way to reliably connect to remote
		 * CMs is to use same nonzero values for ORD and IRD.
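		 * (ORD is the outbound RDMA Read depth carried in
		 * initiator_depth; IRD is the inbound depth carried in
		 * responder_resources.)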
		 */
		if (retry_count++ <= RDMA_CONNECT_RETRY_MAX + 1 &&
		    (ep->rep_remote_cma.responder_resources == 0 ||
		     ep->rep_remote_cma.initiator_depth !=
				ep->rep_remote_cma.responder_resources)) {
			if (ep->rep_remote_cma.responder_resources == 0)
				ep->rep_remote_cma.responder_resources = 1;
			ep->rep_remote_cma.initiator_depth =
				ep->rep_remote_cma.responder_resources;
			goto retry;
		}
		rc = ep->rep_connected;
	} else {
		dprintk("RPC: %s: connected\n", __func__);
	}

out:
	if (rc)
		ep->rep_connected = rc;
	return rc;
}

/*
 * rpcrdma_ep_disconnect
 *
 * This is separate from destroy to facilitate the ability
 * to reconnect without recreating the endpoint.
 *
 * This call is not reentrant, and must not be made in parallel
 * on the same endpoint.
 */
void
rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
{
	int rc;

	rpcrdma_flush_cqs(ep);
	rc = rdma_disconnect(ia->ri_id);
	if (!rc) {
		/* returns without wait if not connected */
		wait_event_interruptible(ep->rep_connect_wait,
					 ep->rep_connected != 1);
		dprintk("RPC: %s: after wait, %sconnected\n", __func__,
			(ep->rep_connected == 1) ? "still " : "dis");
	} else {
		dprintk("RPC: %s: rdma_disconnect %i\n", __func__, rc);
		ep->rep_connected = rc;
	}
}

static int
rpcrdma_init_fmrs(struct rpcrdma_ia *ia, struct rpcrdma_buffer *buf)
{
	int mr_access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ;
	struct ib_fmr_attr fmr_attr = {
		.max_pages	= RPCRDMA_MAX_DATA_SEGS,
		.max_maps	= 1,
		.page_shift	= PAGE_SHIFT
	};
	struct rpcrdma_mw *r;
	int i, rc;

	i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS;
	dprintk("RPC: %s: initializing %d FMRs\n", __func__, i);

	while (i--) {
		r = kzalloc(sizeof(*r), GFP_KERNEL);
		if (r == NULL)
			return -ENOMEM;

		r->r.fmr = ib_alloc_fmr(ia->ri_pd, mr_access_flags, &fmr_attr);
		if (IS_ERR(r->r.fmr)) {
			rc = PTR_ERR(r->r.fmr);
			dprintk("RPC: %s: ib_alloc_fmr failed %i\n",
				__func__, rc);
			goto out_free;
		}

		list_add(&r->mw_list, &buf->rb_mws);
		list_add(&r->mw_all, &buf->rb_all);
	}
	return 0;

out_free:
	kfree(r);
	return rc;
}

static int
rpcrdma_init_frmrs(struct rpcrdma_ia *ia, struct rpcrdma_buffer *buf)
{
	struct rpcrdma_frmr *f;
	struct rpcrdma_mw *r;
	int i, rc;

	i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS;
	dprintk("RPC: %s: initializing %d FRMRs\n", __func__, i);

	while (i--) {
		r = kzalloc(sizeof(*r), GFP_KERNEL);
		if (r == NULL)
			return -ENOMEM;
		f = &r->r.frmr;

		f->fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
						ia->ri_max_frmr_depth);
		if (IS_ERR(f->fr_mr)) {
			rc = PTR_ERR(f->fr_mr);
			dprintk("RPC: %s: ib_alloc_fast_reg_mr "
				"failed %i\n", __func__, rc);
			goto out_free;
		}

		f->fr_pgl = ib_alloc_fast_reg_page_list(ia->ri_id->device,
							ia->ri_max_frmr_depth);
		if (IS_ERR(f->fr_pgl)) {
			rc = PTR_ERR(f->fr_pgl);
			dprintk("RPC: %s: ib_alloc_fast_reg_page_list "
				"failed %i\n", __func__, rc);

			ib_dereg_mr(f->fr_mr);
			goto out_free;
		}

		list_add(&r->mw_list, &buf->rb_mws);
		list_add(&r->mw_all, &buf->rb_all);
	}

	return 0;

out_free:
	kfree(r);
	return rc;
}

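/*
 * Allocate the buffer pool for one transport instance: the arrays of
 * rpcrdma_req/rpcrdma_rep pointers (plus any pad buffer) in a single
 * kzalloc'd region, the MW free list appropriate to the memory
 * registration strategy, and one registered send and receive buffer
 * per credit (rb_max_requests).
 */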
int
rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
	struct rpcrdma_ia *ia, struct rpcrdma_create_data_internal *cdata)
{
	char *p;
	size_t len, rlen, wlen;
	int i, rc;

	buf->rb_max_requests = cdata->max_requests;
	spin_lock_init(&buf->rb_lock);
	atomic_set(&buf->rb_credits, 1);

	/* Need to allocate:
	 *   1.  arrays for send and recv pointers
	 *   2.  arrays of struct rpcrdma_req to fill in pointers
	 *   3.  array of struct rpcrdma_rep for replies
	 *   4.  padding, if any
	 * Send/recv buffers in req/rep need to be registered
	 */
	len = buf->rb_max_requests *
		(sizeof(struct rpcrdma_req *) + sizeof(struct rpcrdma_rep *));
	len += cdata->padding;

	p = kzalloc(len, GFP_KERNEL);
	if (p == NULL) {
		dprintk("RPC: %s: req_t/rep_t/pad kzalloc(%zd) failed\n",
			__func__, len);
		rc = -ENOMEM;
		goto out;
	}
	buf->rb_pool = p;	/* for freeing it later */

	buf->rb_send_bufs = (struct rpcrdma_req **) p;
	p = (char *) &buf->rb_send_bufs[buf->rb_max_requests];
	buf->rb_recv_bufs = (struct rpcrdma_rep **) p;
	p = (char *) &buf->rb_recv_bufs[buf->rb_max_requests];

	/*
	 * Register the zeroed pad buffer, if any.
	 */
	if (cdata->padding) {
		rc = rpcrdma_register_internal(ia, p, cdata->padding,
					       &ep->rep_pad_mr, &ep->rep_pad);
		if (rc)
			goto out;
	}
	p += cdata->padding;

	INIT_LIST_HEAD(&buf->rb_mws);
	INIT_LIST_HEAD(&buf->rb_all);
	switch (ia->ri_memreg_strategy) {
	case RPCRDMA_FRMR:
		rc = rpcrdma_init_frmrs(ia, buf);
		if (rc)
			goto out;
		break;
	case RPCRDMA_MTHCAFMR:
		rc = rpcrdma_init_fmrs(ia, buf);
		if (rc)
			goto out;
		break;
	default:
		break;
	}

	/*
	 * Allocate/init the request/reply buffers. Doing this
	 * using kmalloc for now -- one for each buf.
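	 * Each buffer is rounded up to a power-of-two size (1 << fls(len))
	 * large enough to hold the inline data plus the req/rep header.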
	 */
	wlen = 1 << fls(cdata->inline_wsize + sizeof(struct rpcrdma_req));
	rlen = 1 << fls(cdata->inline_rsize + sizeof(struct rpcrdma_rep));
	dprintk("RPC: %s: wlen = %zu, rlen = %zu\n",
		__func__, wlen, rlen);

	for (i = 0; i < buf->rb_max_requests; i++) {
		struct rpcrdma_req *req;
		struct rpcrdma_rep *rep;

		req = kmalloc(wlen, GFP_KERNEL);
		if (req == NULL) {
			dprintk("RPC: %s: request buffer %d alloc"
				" failed\n", __func__, i);
			rc = -ENOMEM;
			goto out;
		}
		memset(req, 0, sizeof(struct rpcrdma_req));
		buf->rb_send_bufs[i] = req;
		buf->rb_send_bufs[i]->rl_buffer = buf;

		rc = rpcrdma_register_internal(ia, req->rl_base,
				wlen - offsetof(struct rpcrdma_req, rl_base),
				&buf->rb_send_bufs[i]->rl_handle,
				&buf->rb_send_bufs[i]->rl_iov);
		if (rc)
			goto out;

		buf->rb_send_bufs[i]->rl_size = wlen -
					sizeof(struct rpcrdma_req);

		rep = kmalloc(rlen, GFP_KERNEL);
		if (rep == NULL) {
			dprintk("RPC: %s: reply buffer %d alloc failed\n",
				__func__, i);
			rc = -ENOMEM;
			goto out;
		}
		memset(rep, 0, sizeof(struct rpcrdma_rep));
		buf->rb_recv_bufs[i] = rep;
		buf->rb_recv_bufs[i]->rr_buffer = buf;

		rc = rpcrdma_register_internal(ia, rep->rr_base,
				rlen - offsetof(struct rpcrdma_rep, rr_base),
				&buf->rb_recv_bufs[i]->rr_handle,
				&buf->rb_recv_bufs[i]->rr_iov);
		if (rc)
			goto out;

	}
	dprintk("RPC: %s: max_requests %d\n",
		__func__, buf->rb_max_requests);
	/* done */
	return 0;
out:
	rpcrdma_buffer_destroy(buf);
	return rc;
}

static void
rpcrdma_destroy_fmrs(struct rpcrdma_buffer *buf)
{
	struct rpcrdma_mw *r;
	int rc;

	while (!list_empty(&buf->rb_all)) {
		r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
		list_del(&r->mw_all);
		list_del(&r->mw_list);

		rc = ib_dealloc_fmr(r->r.fmr);
		if (rc)
			dprintk("RPC: %s: ib_dealloc_fmr failed %i\n",
				__func__, rc);

		kfree(r);
	}
}

static void
rpcrdma_destroy_frmrs(struct rpcrdma_buffer *buf)
{
	struct rpcrdma_mw *r;
	int rc;

	while (!list_empty(&buf->rb_all)) {
		r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
		list_del(&r->mw_all);
		list_del(&r->mw_list);

		rc = ib_dereg_mr(r->r.frmr.fr_mr);
		if (rc)
			dprintk("RPC: %s: ib_dereg_mr failed %i\n",
				__func__, rc);
		ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);

		kfree(r);
	}
}

void
rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
{
	struct rpcrdma_ia *ia = rdmab_to_ia(buf);
	int i;

	/* clean up in reverse order from create
	 *   1.  recv mr memory (mr free, then kfree)
	 *   2.  send mr memory (mr free, then kfree)
	 *   3.  MWs
	 */
	dprintk("RPC: %s: entering\n", __func__);

	for (i = 0; i < buf->rb_max_requests; i++) {
		if (buf->rb_recv_bufs && buf->rb_recv_bufs[i]) {
			rpcrdma_deregister_internal(ia,
					buf->rb_recv_bufs[i]->rr_handle,
					&buf->rb_recv_bufs[i]->rr_iov);
			kfree(buf->rb_recv_bufs[i]);
		}
		if (buf->rb_send_bufs && buf->rb_send_bufs[i]) {
			rpcrdma_deregister_internal(ia,
					buf->rb_send_bufs[i]->rl_handle,
					&buf->rb_send_bufs[i]->rl_iov);
			kfree(buf->rb_send_bufs[i]);
		}
	}

	switch (ia->ri_memreg_strategy) {
	case RPCRDMA_FRMR:
		rpcrdma_destroy_frmrs(buf);
		break;
	case RPCRDMA_MTHCAFMR:
		rpcrdma_destroy_fmrs(buf);
		break;
	default:
		break;
	}

	kfree(buf->rb_pool);
}

/* After a disconnect, a flushed FAST_REG_MR can leave an FRMR in
 * an unusable state. Find FRMRs in this state and dereg / reg
 * each.  FRMRs that are VALID and attached to an rpcrdma_req are
 * also torn down.
 *
 * This gives all in-use FRMRs a fresh rkey and leaves them INVALID.
 *
 * This is invoked only in the transport connect worker in order
 * to serialize with rpcrdma_register_frmr_external().
 */
static void
rpcrdma_reset_frmrs(struct rpcrdma_ia *ia)
{
	struct rpcrdma_xprt *r_xprt =
				container_of(ia, struct rpcrdma_xprt, rx_ia);
	struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
	struct list_head *pos;
	struct rpcrdma_mw *r;
	int rc;

	list_for_each(pos, &buf->rb_all) {
		r = list_entry(pos, struct rpcrdma_mw, mw_all);

		if (r->r.frmr.fr_state == FRMR_IS_INVALID)
			continue;

		rc = ib_dereg_mr(r->r.frmr.fr_mr);
		if (rc)
			dprintk("RPC: %s: ib_dereg_mr failed %i\n",
				__func__, rc);
		ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);

		r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
					ia->ri_max_frmr_depth);
		if (IS_ERR(r->r.frmr.fr_mr)) {
			rc = PTR_ERR(r->r.frmr.fr_mr);
			dprintk("RPC: %s: ib_alloc_fast_reg_mr"
				" failed %i\n", __func__, rc);
			continue;
		}
		r->r.frmr.fr_pgl = ib_alloc_fast_reg_page_list(
					ia->ri_id->device,
					ia->ri_max_frmr_depth);
		if (IS_ERR(r->r.frmr.fr_pgl)) {
			rc = PTR_ERR(r->r.frmr.fr_pgl);
			dprintk("RPC: %s: "
				"ib_alloc_fast_reg_page_list "
				"failed %i\n", __func__, rc);

			ib_dereg_mr(r->r.frmr.fr_mr);
			continue;
		}
		r->r.frmr.fr_state = FRMR_IS_INVALID;
	}
}

/* "*mw" can be NULL when rpcrdma_buffer_get_mrs() fails, leaving
 * some req segments uninitialized.
 */
static void
rpcrdma_buffer_put_mr(struct rpcrdma_mw **mw, struct rpcrdma_buffer *buf)
{
	if (*mw) {
		list_add_tail(&(*mw)->mw_list, &buf->rb_mws);
		*mw = NULL;
	}
}

/* Cycle mw's back in reverse order, and "spin" them.
 * This delays and scrambles reuse as much as possible.
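 *
 * Released MWs are appended to the tail of rb_mws while the
 * rpcrdma_buffer_get_*() helpers consume from the head, so an MW that
 * was just used is reused only after every other free MW.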
 */
static void
rpcrdma_buffer_put_mrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
{
	struct rpcrdma_mr_seg *seg = req->rl_segments;
	struct rpcrdma_mr_seg *seg1 = seg;
	int i;

	for (i = 1, seg++; i < RPCRDMA_MAX_SEGS; seg++, i++)
		rpcrdma_buffer_put_mr(&seg->mr_chunk.rl_mw, buf);
	rpcrdma_buffer_put_mr(&seg1->mr_chunk.rl_mw, buf);
}

static void
rpcrdma_buffer_put_sendbuf(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
{
	buf->rb_send_bufs[--buf->rb_send_index] = req;
	req->rl_niovs = 0;
	if (req->rl_reply) {
		buf->rb_recv_bufs[--buf->rb_recv_index] = req->rl_reply;
		req->rl_reply->rr_func = NULL;
		req->rl_reply = NULL;
	}
}

/* rpcrdma_unmap_one() was already done by rpcrdma_deregister_frmr_external().
 * Redo only the ib_post_send().
 */
static void
rpcrdma_retry_local_inv(struct rpcrdma_mw *r, struct rpcrdma_ia *ia)
{
	struct rpcrdma_xprt *r_xprt =
				container_of(ia, struct rpcrdma_xprt, rx_ia);
	struct ib_send_wr invalidate_wr, *bad_wr;
	int rc;

	dprintk("RPC: %s: FRMR %p is stale\n", __func__, r);

	/* When this FRMR is re-inserted into rb_mws, it is no longer stale */
	r->r.frmr.fr_state = FRMR_IS_INVALID;

	memset(&invalidate_wr, 0, sizeof(invalidate_wr));
	invalidate_wr.wr_id = (unsigned long)(void *)r;
	invalidate_wr.opcode = IB_WR_LOCAL_INV;
	invalidate_wr.ex.invalidate_rkey = r->r.frmr.fr_mr->rkey;
	DECR_CQCOUNT(&r_xprt->rx_ep);

	dprintk("RPC: %s: frmr %p invalidating rkey %08x\n",
		__func__, r, r->r.frmr.fr_mr->rkey);

	read_lock(&ia->ri_qplock);
	rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
	read_unlock(&ia->ri_qplock);
	if (rc) {
		/* Force rpcrdma_buffer_get() to retry */
		r->r.frmr.fr_state = FRMR_IS_STALE;
		dprintk("RPC: %s: ib_post_send failed, %i\n",
			__func__, rc);
	}
}

static void
rpcrdma_retry_flushed_linv(struct list_head *stale,
			   struct rpcrdma_buffer *buf)
{
	struct rpcrdma_ia *ia = rdmab_to_ia(buf);
	struct list_head *pos;
	struct rpcrdma_mw *r;
	unsigned long flags;

	list_for_each(pos, stale) {
		r = list_entry(pos, struct rpcrdma_mw, mw_list);
		rpcrdma_retry_local_inv(r, ia);
	}

	spin_lock_irqsave(&buf->rb_lock, flags);
	list_splice_tail(stale, &buf->rb_mws);
	spin_unlock_irqrestore(&buf->rb_lock, flags);
}

static struct rpcrdma_req *
rpcrdma_buffer_get_frmrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf,
			 struct list_head *stale)
{
	struct rpcrdma_mw *r;
	int i;

	i = RPCRDMA_MAX_SEGS - 1;
	while (!list_empty(&buf->rb_mws)) {
		r = list_entry(buf->rb_mws.next,
			       struct rpcrdma_mw, mw_list);
		list_del(&r->mw_list);
		if (r->r.frmr.fr_state == FRMR_IS_STALE) {
			list_add(&r->mw_list, stale);
			continue;
		}
		req->rl_segments[i].mr_chunk.rl_mw = r;
		if (unlikely(i-- == 0))
			return req;	/* Success */
	}

	/* Not enough entries on rb_mws for this req */
	rpcrdma_buffer_put_sendbuf(req, buf);
	rpcrdma_buffer_put_mrs(req, buf);
	return NULL;
}

static struct rpcrdma_req *
rpcrdma_buffer_get_fmrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
{
	struct rpcrdma_mw *r;
	int i;

	i = RPCRDMA_MAX_SEGS - 1;
	while (!list_empty(&buf->rb_mws)) {
		r = list_entry(buf->rb_mws.next,
			       struct rpcrdma_mw, mw_list);
		list_del(&r->mw_list);
		req->rl_segments[i].mr_chunk.rl_mw = r;
		if (unlikely(i-- == 0))
			return req;	/* Success */
	}

	/* Not enough entries on rb_mws for this req */
	rpcrdma_buffer_put_sendbuf(req, buf);
	rpcrdma_buffer_put_mrs(req, buf);
	return NULL;
}

/*
 * Get a set of request/reply buffers.
 *
 * Reply buffer (if needed) is attached to send buffer upon return.
 * Rule:
 *    rb_send_index and rb_recv_index MUST always be pointing to the
 *    *next* available buffer (non-NULL). They are incremented after
 *    removing buffers, and decremented *before* returning them.
 */
struct rpcrdma_req *
rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
{
	struct rpcrdma_ia *ia = rdmab_to_ia(buffers);
	struct list_head stale;
	struct rpcrdma_req *req;
	unsigned long flags;

	spin_lock_irqsave(&buffers->rb_lock, flags);
	if (buffers->rb_send_index == buffers->rb_max_requests) {
		spin_unlock_irqrestore(&buffers->rb_lock, flags);
		dprintk("RPC: %s: out of request buffers\n", __func__);
		return ((struct rpcrdma_req *)NULL);
	}

	req = buffers->rb_send_bufs[buffers->rb_send_index];
	if (buffers->rb_send_index < buffers->rb_recv_index) {
		dprintk("RPC: %s: %d extra receives outstanding (ok)\n",
			__func__,
			buffers->rb_recv_index - buffers->rb_send_index);
		req->rl_reply = NULL;
	} else {
		req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
		buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
	}
	buffers->rb_send_bufs[buffers->rb_send_index++] = NULL;

	INIT_LIST_HEAD(&stale);
	switch (ia->ri_memreg_strategy) {
	case RPCRDMA_FRMR:
		req = rpcrdma_buffer_get_frmrs(req, buffers, &stale);
		break;
	case RPCRDMA_MTHCAFMR:
		req = rpcrdma_buffer_get_fmrs(req, buffers);
		break;
	default:
		break;
	}
	spin_unlock_irqrestore(&buffers->rb_lock, flags);
	if (!list_empty(&stale))
		rpcrdma_retry_flushed_linv(&stale, buffers);
	return req;
}

/*
 * Put request/reply buffers back into pool.
 * Pre-decrement counter/array index.
 */
void
rpcrdma_buffer_put(struct rpcrdma_req *req)
{
	struct rpcrdma_buffer *buffers = req->rl_buffer;
	struct rpcrdma_ia *ia = rdmab_to_ia(buffers);
	unsigned long flags;

	spin_lock_irqsave(&buffers->rb_lock, flags);
	rpcrdma_buffer_put_sendbuf(req, buffers);
	switch (ia->ri_memreg_strategy) {
	case RPCRDMA_FRMR:
	case RPCRDMA_MTHCAFMR:
		rpcrdma_buffer_put_mrs(req, buffers);
		break;
	default:
		break;
	}
	spin_unlock_irqrestore(&buffers->rb_lock, flags);
}

/*
 * Recover reply buffers from pool.
 * This happens when recovering from error conditions.
 * Post-increment counter/array index.
 */
void
rpcrdma_recv_buffer_get(struct rpcrdma_req *req)
{
	struct rpcrdma_buffer *buffers = req->rl_buffer;
	unsigned long flags;

	if (req->rl_iov.length == 0)	/* special case xprt_rdma_allocate() */
		buffers = ((struct rpcrdma_req *) buffers)->rl_buffer;
	spin_lock_irqsave(&buffers->rb_lock, flags);
	if (buffers->rb_recv_index < buffers->rb_max_requests) {
		req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
		buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
	}
	spin_unlock_irqrestore(&buffers->rb_lock, flags);
}

/*
 * Put reply buffers back into pool when not attached to
 * request. This happens in error conditions.
 */
void
rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
{
	struct rpcrdma_buffer *buffers = rep->rr_buffer;
	unsigned long flags;

	rep->rr_func = NULL;
	spin_lock_irqsave(&buffers->rb_lock, flags);
	buffers->rb_recv_bufs[--buffers->rb_recv_index] = rep;
	spin_unlock_irqrestore(&buffers->rb_lock, flags);
}

/*
 * Wrappers for internal-use kmalloc memory registration, used by buffer code.
 */

int
rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len,
				struct ib_mr **mrp, struct ib_sge *iov)
{
	struct ib_phys_buf ipb;
	struct ib_mr *mr;
	int rc;

	/*
	 * All memory passed here was kmalloc'ed, therefore phys-contiguous.
	 */
	iov->addr = ib_dma_map_single(ia->ri_id->device,
			va, len, DMA_BIDIRECTIONAL);
	if (ib_dma_mapping_error(ia->ri_id->device, iov->addr))
		return -ENOMEM;

	iov->length = len;

	if (ia->ri_have_dma_lkey) {
		*mrp = NULL;
		iov->lkey = ia->ri_dma_lkey;
		return 0;
	} else if (ia->ri_bind_mem != NULL) {
		*mrp = NULL;
		iov->lkey = ia->ri_bind_mem->lkey;
		return 0;
	}

	ipb.addr = iov->addr;
	ipb.size = iov->length;
	mr = ib_reg_phys_mr(ia->ri_pd, &ipb, 1,
			IB_ACCESS_LOCAL_WRITE, &iov->addr);

	dprintk("RPC: %s: phys convert: 0x%llx "
			"registered 0x%llx length %d\n",
			__func__, (unsigned long long)ipb.addr,
			(unsigned long long)iov->addr, len);

	if (IS_ERR(mr)) {
		*mrp = NULL;
		rc = PTR_ERR(mr);
		dprintk("RPC: %s: failed with %i\n", __func__, rc);
	} else {
		*mrp = mr;
		iov->lkey = mr->lkey;
		rc = 0;
	}

	return rc;
}

int
rpcrdma_deregister_internal(struct rpcrdma_ia *ia,
				struct ib_mr *mr, struct ib_sge *iov)
{
	int rc;

	ib_dma_unmap_single(ia->ri_id->device,
			iov->addr, iov->length, DMA_BIDIRECTIONAL);

	if (NULL == mr)
		return 0;

	rc = ib_dereg_mr(mr);
	if (rc)
		dprintk("RPC: %s: ib_dereg_mr failed %i\n", __func__, rc);
	return rc;
}

/*
 * Wrappers for chunk registration, shared by read/write chunk code.
 */

static void
rpcrdma_map_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg, int writing)
{
	seg->mr_dir = writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
	seg->mr_dmalen = seg->mr_len;
	if (seg->mr_page)
		seg->mr_dma = ib_dma_map_page(ia->ri_id->device,
				seg->mr_page, offset_in_page(seg->mr_offset),
				seg->mr_dmalen, seg->mr_dir);
	else
		seg->mr_dma = ib_dma_map_single(ia->ri_id->device,
				seg->mr_offset,
				seg->mr_dmalen, seg->mr_dir);
	if (ib_dma_mapping_error(ia->ri_id->device, seg->mr_dma)) {
		dprintk("RPC: %s: mr_dma %llx mr_offset %p mr_dma_len %zu\n",
			__func__,
			(unsigned long long)seg->mr_dma,
			seg->mr_offset, seg->mr_dmalen);
	}
}

static void
rpcrdma_unmap_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg)
{
	if (seg->mr_page)
		ib_dma_unmap_page(ia->ri_id->device,
				seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
	else
		ib_dma_unmap_single(ia->ri_id->device,
				seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
}

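/* Post a FAST_REG_MR work request mapping up to ri_max_frmr_depth pages
 * for this chunk. Consecutive segments are added to the page list until
 * a page-alignment hole is found; the resulting rkey, base and length
 * are returned in seg[0] for the caller to marshal.
 */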
static int
rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg,
			int *nsegs, int writing, struct rpcrdma_ia *ia,
			struct rpcrdma_xprt *r_xprt)
{
	struct rpcrdma_mr_seg *seg1 = seg;
	struct rpcrdma_mw *mw = seg1->mr_chunk.rl_mw;
	struct rpcrdma_frmr *frmr = &mw->r.frmr;
	struct ib_mr *mr = frmr->fr_mr;
	struct ib_send_wr fastreg_wr, *bad_wr;
	u8 key;
	int len, pageoff;
	int i, rc;
	int seg_len;
	u64 pa;
	int page_no;

	pageoff = offset_in_page(seg1->mr_offset);
	seg1->mr_offset -= pageoff;	/* start of page */
	seg1->mr_len += pageoff;
	len = -pageoff;
	if (*nsegs > ia->ri_max_frmr_depth)
		*nsegs = ia->ri_max_frmr_depth;
	for (page_no = i = 0; i < *nsegs;) {
		rpcrdma_map_one(ia, seg, writing);
		pa = seg->mr_dma;
		for (seg_len = seg->mr_len; seg_len > 0; seg_len -= PAGE_SIZE) {
			frmr->fr_pgl->page_list[page_no++] = pa;
			pa += PAGE_SIZE;
		}
		len += seg->mr_len;
		++seg;
		++i;
		/* Check for holes */
		if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
		    offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
			break;
	}
	dprintk("RPC: %s: Using frmr %p to map %d segments\n",
		__func__, mw, i);

	frmr->fr_state = FRMR_IS_VALID;

	memset(&fastreg_wr, 0, sizeof(fastreg_wr));
	fastreg_wr.wr_id = (unsigned long)(void *)mw;
	fastreg_wr.opcode = IB_WR_FAST_REG_MR;
	fastreg_wr.wr.fast_reg.iova_start = seg1->mr_dma;
	fastreg_wr.wr.fast_reg.page_list = frmr->fr_pgl;
	fastreg_wr.wr.fast_reg.page_list_len = page_no;
	fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
	fastreg_wr.wr.fast_reg.length = page_no << PAGE_SHIFT;
	if (fastreg_wr.wr.fast_reg.length < len) {
		rc = -EIO;
		goto out_err;
	}

	/* Bump the key */
	key = (u8)(mr->rkey & 0x000000FF);
	ib_update_fast_reg_key(mr, ++key);

	fastreg_wr.wr.fast_reg.access_flags = (writing ?
				IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
				IB_ACCESS_REMOTE_READ);
	fastreg_wr.wr.fast_reg.rkey = mr->rkey;
	DECR_CQCOUNT(&r_xprt->rx_ep);

	rc = ib_post_send(ia->ri_id->qp, &fastreg_wr, &bad_wr);
	if (rc) {
		dprintk("RPC: %s: failed ib_post_send for register,"
			" status %i\n", __func__, rc);
		ib_update_fast_reg_key(mr, --key);
		goto out_err;
	} else {
		seg1->mr_rkey = mr->rkey;
		seg1->mr_base = seg1->mr_dma + pageoff;
		seg1->mr_nsegs = i;
		seg1->mr_len = len;
	}
	*nsegs = i;
	return 0;
out_err:
	frmr->fr_state = FRMR_IS_INVALID;
	while (i--)
		rpcrdma_unmap_one(ia, --seg);
	return rc;
}

static int
rpcrdma_deregister_frmr_external(struct rpcrdma_mr_seg *seg,
			struct rpcrdma_ia *ia, struct rpcrdma_xprt *r_xprt)
{
	struct rpcrdma_mr_seg *seg1 = seg;
	struct ib_send_wr invalidate_wr, *bad_wr;
	int rc;

	seg1->mr_chunk.rl_mw->r.frmr.fr_state = FRMR_IS_INVALID;

	memset(&invalidate_wr, 0, sizeof invalidate_wr);
	invalidate_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw;
	invalidate_wr.opcode = IB_WR_LOCAL_INV;
	invalidate_wr.ex.invalidate_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
	DECR_CQCOUNT(&r_xprt->rx_ep);

	read_lock(&ia->ri_qplock);
	while (seg1->mr_nsegs--)
		rpcrdma_unmap_one(ia, seg++);
	rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
	read_unlock(&ia->ri_qplock);
	if (rc) {
		/* Force rpcrdma_buffer_get() to retry */
		seg1->mr_chunk.rl_mw->r.frmr.fr_state = FRMR_IS_STALE;
		dprintk("RPC: %s: failed ib_post_send for invalidate,"
			" status %i\n", __func__, rc);
	}
	return rc;
}

static int
rpcrdma_register_fmr_external(struct rpcrdma_mr_seg *seg,
			int *nsegs, int writing, struct rpcrdma_ia *ia)
{
	struct rpcrdma_mr_seg *seg1 = seg;
	u64 physaddrs[RPCRDMA_MAX_DATA_SEGS];
	int len, pageoff, i, rc;

	pageoff = offset_in_page(seg1->mr_offset);
	seg1->mr_offset -= pageoff;	/* start of page */
	seg1->mr_len += pageoff;
	len = -pageoff;
	if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
		*nsegs = RPCRDMA_MAX_DATA_SEGS;
	for (i = 0; i < *nsegs;) {
		rpcrdma_map_one(ia, seg, writing);
		physaddrs[i] = seg->mr_dma;
		len += seg->mr_len;
		++seg;
		++i;
		/* Check for holes */
		if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
		    offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
			break;
	}
	rc = ib_map_phys_fmr(seg1->mr_chunk.rl_mw->r.fmr,
				physaddrs, i, seg1->mr_dma);
	if (rc) {
		dprintk("RPC: %s: failed ib_map_phys_fmr "
			"%u@0x%llx+%i (%d)... "
			"status %i\n", __func__,
			len, (unsigned long long)seg1->mr_dma,
			pageoff, i, rc);
		while (i--)
			rpcrdma_unmap_one(ia, --seg);
	} else {
		seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.fmr->rkey;
		seg1->mr_base = seg1->mr_dma + pageoff;
		seg1->mr_nsegs = i;
		seg1->mr_len = len;
	}
	*nsegs = i;
	return rc;
}

static int
rpcrdma_deregister_fmr_external(struct rpcrdma_mr_seg *seg,
			struct rpcrdma_ia *ia)
{
	struct rpcrdma_mr_seg *seg1 = seg;
	LIST_HEAD(l);
	int rc;

	list_add(&seg1->mr_chunk.rl_mw->r.fmr->list, &l);
	rc = ib_unmap_fmr(&l);
	read_lock(&ia->ri_qplock);
	while (seg1->mr_nsegs--)
		rpcrdma_unmap_one(ia, seg++);
	read_unlock(&ia->ri_qplock);
	if (rc)
		dprintk("RPC: %s: failed ib_unmap_fmr,"
			" status %i\n", __func__, rc);
	return rc;
}

int
rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
			int nsegs, int writing, struct rpcrdma_xprt *r_xprt)
{
	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
	int rc = 0;

	switch (ia->ri_memreg_strategy) {

	case RPCRDMA_ALLPHYSICAL:
		rpcrdma_map_one(ia, seg, writing);
		seg->mr_rkey = ia->ri_bind_mem->rkey;
		seg->mr_base = seg->mr_dma;
		seg->mr_nsegs = 1;
		nsegs = 1;
		break;

	/* Registration using frmr registration */
	case RPCRDMA_FRMR:
		rc = rpcrdma_register_frmr_external(seg, &nsegs, writing, ia, r_xprt);
		break;

	/* Registration using fmr memory registration */
	case RPCRDMA_MTHCAFMR:
		rc = rpcrdma_register_fmr_external(seg, &nsegs, writing, ia);
		break;

	default:
		return -1;
	}
	if (rc)
		return -1;

	return nsegs;
}

int
rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg,
		struct rpcrdma_xprt *r_xprt)
{
	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
	int nsegs = seg->mr_nsegs, rc;

	switch (ia->ri_memreg_strategy) {

	case RPCRDMA_ALLPHYSICAL:
		read_lock(&ia->ri_qplock);
		rpcrdma_unmap_one(ia, seg);
		read_unlock(&ia->ri_qplock);
		break;

	case RPCRDMA_FRMR:
		rc = rpcrdma_deregister_frmr_external(seg, ia, r_xprt);
		break;

	case RPCRDMA_MTHCAFMR:
		rc = rpcrdma_deregister_fmr_external(seg, ia);
		break;

	default:
		break;
	}
	return nsegs;
}

/*
 * Prepost any receive buffer, then post send.
 *
 * Receive buffer is donated to hardware, reclaimed upon recv completion.
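 *
 * Sends are normally posted unsignaled; DECR_CQCOUNT() below counts
 * down from rep_cqinit and, when that budget is exhausted, one send is
 * posted IB_SEND_SIGNALED so the provider's send CQ does not overflow.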
 */
int
rpcrdma_ep_post(struct rpcrdma_ia *ia,
		struct rpcrdma_ep *ep,
		struct rpcrdma_req *req)
{
	struct ib_send_wr send_wr, *send_wr_fail;
	struct rpcrdma_rep *rep = req->rl_reply;
	int rc;

	if (rep) {
		rc = rpcrdma_ep_post_recv(ia, ep, rep);
		if (rc)
			goto out;
		req->rl_reply = NULL;
	}

	send_wr.next = NULL;
	send_wr.wr_id = 0ULL;	/* no send cookie */
	send_wr.sg_list = req->rl_send_iov;
	send_wr.num_sge = req->rl_niovs;
	send_wr.opcode = IB_WR_SEND;
	if (send_wr.num_sge == 4)	/* no need to sync any pad (constant) */
		ib_dma_sync_single_for_device(ia->ri_id->device,
			req->rl_send_iov[3].addr, req->rl_send_iov[3].length,
			DMA_TO_DEVICE);
	ib_dma_sync_single_for_device(ia->ri_id->device,
		req->rl_send_iov[1].addr, req->rl_send_iov[1].length,
		DMA_TO_DEVICE);
	ib_dma_sync_single_for_device(ia->ri_id->device,
		req->rl_send_iov[0].addr, req->rl_send_iov[0].length,
		DMA_TO_DEVICE);

	if (DECR_CQCOUNT(ep) > 0)
		send_wr.send_flags = 0;
	else { /* Provider must take a send completion every now and then */
		INIT_CQCOUNT(ep);
		send_wr.send_flags = IB_SEND_SIGNALED;
	}

	rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail);
	if (rc)
		dprintk("RPC: %s: ib_post_send returned %i\n", __func__,
			rc);
out:
	return rc;
}

/*
 * (Re)post a receive buffer.
 */
int
rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
		     struct rpcrdma_ep *ep,
		     struct rpcrdma_rep *rep)
{
	struct ib_recv_wr recv_wr, *recv_wr_fail;
	int rc;

	recv_wr.next = NULL;
	recv_wr.wr_id = (u64) (unsigned long) rep;
	recv_wr.sg_list = &rep->rr_iov;
	recv_wr.num_sge = 1;

	ib_dma_sync_single_for_cpu(ia->ri_id->device,
		rep->rr_iov.addr, rep->rr_iov.length, DMA_BIDIRECTIONAL);

	rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail);

	if (rc)
		dprintk("RPC: %s: ib_post_recv returned %i\n", __func__,
			rc);
	return rc;
}

/* Physical mapping means one Read/Write list entry per-page.
 * All list entries must fit within an inline buffer
 *
 * NB: The server must return a Write list for NFS READ,
 * which has the same constraint. Factor in the inline
 * rsize as well.
 */
static size_t
rpcrdma_physical_max_payload(struct rpcrdma_xprt *r_xprt)
{
	struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
	unsigned int inline_size, pages;

	inline_size = min_t(unsigned int,
			    cdata->inline_wsize, cdata->inline_rsize);
	inline_size -= RPCRDMA_HDRLEN_MIN;
	pages = inline_size / sizeof(struct rpcrdma_segment);
	return pages << PAGE_SHIFT;
}

static size_t
rpcrdma_mr_max_payload(struct rpcrdma_xprt *r_xprt)
{
	return RPCRDMA_MAX_DATA_SEGS << PAGE_SHIFT;
}

size_t
rpcrdma_max_payload(struct rpcrdma_xprt *r_xprt)
{
	size_t result;

	switch (r_xprt->rx_ia.ri_memreg_strategy) {
	case RPCRDMA_ALLPHYSICAL:
		result = rpcrdma_physical_max_payload(r_xprt);
		break;
	default:
		result = rpcrdma_mr_max_payload(r_xprt);
	}
	return result;
}