1 /* 2 * Copyright (c) 2014-2017 Oracle. All rights reserved. 3 * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved. 4 * 5 * This software is available to you under a choice of one of two 6 * licenses. You may choose to be licensed under the terms of the GNU 7 * General Public License (GPL) Version 2, available from the file 8 * COPYING in the main directory of this source tree, or the BSD-type 9 * license below: 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 15 * Redistributions of source code must retain the above copyright 16 * notice, this list of conditions and the following disclaimer. 17 * 18 * Redistributions in binary form must reproduce the above 19 * copyright notice, this list of conditions and the following 20 * disclaimer in the documentation and/or other materials provided 21 * with the distribution. 22 * 23 * Neither the name of the Network Appliance, Inc. nor the names of 24 * its contributors may be used to endorse or promote products 25 * derived from this software without specific prior written 26 * permission. 27 * 28 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 29 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 30 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 31 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 32 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 33 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 34 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 35 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 36 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 37 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 38 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 39 */ 40 41 /* 42 * rpc_rdma.c 43 * 44 * This file contains the guts of the RPC RDMA protocol, and 45 * does marshaling/unmarshaling, etc. It is also where interfacing 46 * to the Linux RPC framework lives. 47 */ 48 49 #include "xprt_rdma.h" 50 51 #include <linux/highmem.h> 52 53 #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) 54 # define RPCDBG_FACILITY RPCDBG_TRANS 55 #endif 56 57 static const char transfertypes[][12] = { 58 "inline", /* no chunks */ 59 "read list", /* some argument via rdma read */ 60 "*read list", /* entire request via rdma read */ 61 "write list", /* some result via rdma write */ 62 "reply chunk" /* entire reply via rdma write */ 63 }; 64 65 /* Returns size of largest RPC-over-RDMA header in a Call message 66 * 67 * The largest Call header contains a full-size Read list and a 68 * minimal Reply chunk. 69 */ 70 static unsigned int rpcrdma_max_call_header_size(unsigned int maxsegs) 71 { 72 unsigned int size; 73 74 /* Fixed header fields and list discriminators */ 75 size = RPCRDMA_HDRLEN_MIN; 76 77 /* Maximum Read list size */ 78 maxsegs += 2; /* segment for head and tail buffers */ 79 size = maxsegs * rpcrdma_readchunk_maxsz * sizeof(__be32); 80 81 /* Minimal Read chunk size */ 82 size += sizeof(__be32); /* segment count */ 83 size += rpcrdma_segment_maxsz * sizeof(__be32); 84 size += sizeof(__be32); /* list discriminator */ 85 86 dprintk("RPC: %s: max call header size = %u\n", 87 __func__, size); 88 return size; 89 } 90 91 /* Returns size of largest RPC-over-RDMA header in a Reply message 92 * 93 * There is only one Write list or one Reply chunk per Reply 94 * message. The larger list is the Write list. 95 */ 96 static unsigned int rpcrdma_max_reply_header_size(unsigned int maxsegs) 97 { 98 unsigned int size; 99 100 /* Fixed header fields and list discriminators */ 101 size = RPCRDMA_HDRLEN_MIN; 102 103 /* Maximum Write list size */ 104 maxsegs += 2; /* segment for head and tail buffers */ 105 size = sizeof(__be32); /* segment count */ 106 size += maxsegs * rpcrdma_segment_maxsz * sizeof(__be32); 107 size += sizeof(__be32); /* list discriminator */ 108 109 dprintk("RPC: %s: max reply header size = %u\n", 110 __func__, size); 111 return size; 112 } 113 114 void rpcrdma_set_max_header_sizes(struct rpcrdma_xprt *r_xprt) 115 { 116 struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data; 117 struct rpcrdma_ia *ia = &r_xprt->rx_ia; 118 unsigned int maxsegs = ia->ri_max_segs; 119 120 ia->ri_max_inline_write = cdata->inline_wsize - 121 rpcrdma_max_call_header_size(maxsegs); 122 ia->ri_max_inline_read = cdata->inline_rsize - 123 rpcrdma_max_reply_header_size(maxsegs); 124 } 125 126 /* The client can send a request inline as long as the RPCRDMA header 127 * plus the RPC call fit under the transport's inline limit. If the 128 * combined call message size exceeds that limit, the client must use 129 * a Read chunk for this operation. 130 * 131 * A Read chunk is also required if sending the RPC call inline would 132 * exceed this device's max_sge limit. 133 */ 134 static bool rpcrdma_args_inline(struct rpcrdma_xprt *r_xprt, 135 struct rpc_rqst *rqst) 136 { 137 struct xdr_buf *xdr = &rqst->rq_snd_buf; 138 unsigned int count, remaining, offset; 139 140 if (xdr->len > r_xprt->rx_ia.ri_max_inline_write) 141 return false; 142 143 if (xdr->page_len) { 144 remaining = xdr->page_len; 145 offset = offset_in_page(xdr->page_base); 146 count = 0; 147 while (remaining) { 148 remaining -= min_t(unsigned int, 149 PAGE_SIZE - offset, remaining); 150 offset = 0; 151 if (++count > r_xprt->rx_ia.ri_max_send_sges) 152 return false; 153 } 154 } 155 156 return true; 157 } 158 159 /* The client can't know how large the actual reply will be. Thus it 160 * plans for the largest possible reply for that particular ULP 161 * operation. If the maximum combined reply message size exceeds that 162 * limit, the client must provide a write list or a reply chunk for 163 * this request. 164 */ 165 static bool rpcrdma_results_inline(struct rpcrdma_xprt *r_xprt, 166 struct rpc_rqst *rqst) 167 { 168 struct rpcrdma_ia *ia = &r_xprt->rx_ia; 169 170 return rqst->rq_rcv_buf.buflen <= ia->ri_max_inline_read; 171 } 172 173 /* Split @vec on page boundaries into SGEs. FMR registers pages, not 174 * a byte range. Other modes coalesce these SGEs into a single MR 175 * when they can. 176 * 177 * Returns pointer to next available SGE, and bumps the total number 178 * of SGEs consumed. 179 */ 180 static struct rpcrdma_mr_seg * 181 rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg, 182 unsigned int *n) 183 { 184 u32 remaining, page_offset; 185 char *base; 186 187 base = vec->iov_base; 188 page_offset = offset_in_page(base); 189 remaining = vec->iov_len; 190 while (remaining) { 191 seg->mr_page = NULL; 192 seg->mr_offset = base; 193 seg->mr_len = min_t(u32, PAGE_SIZE - page_offset, remaining); 194 remaining -= seg->mr_len; 195 base += seg->mr_len; 196 ++seg; 197 ++(*n); 198 page_offset = 0; 199 } 200 return seg; 201 } 202 203 /* Convert @xdrbuf into SGEs no larger than a page each. As they 204 * are registered, these SGEs are then coalesced into RDMA segments 205 * when the selected memreg mode supports it. 206 * 207 * Returns positive number of SGEs consumed, or a negative errno. 208 */ 209 210 static int 211 rpcrdma_convert_iovs(struct rpcrdma_xprt *r_xprt, struct xdr_buf *xdrbuf, 212 unsigned int pos, enum rpcrdma_chunktype type, 213 struct rpcrdma_mr_seg *seg) 214 { 215 unsigned long page_base; 216 unsigned int len, n; 217 struct page **ppages; 218 219 n = 0; 220 if (pos == 0) 221 seg = rpcrdma_convert_kvec(&xdrbuf->head[0], seg, &n); 222 223 len = xdrbuf->page_len; 224 ppages = xdrbuf->pages + (xdrbuf->page_base >> PAGE_SHIFT); 225 page_base = offset_in_page(xdrbuf->page_base); 226 while (len) { 227 if (unlikely(!*ppages)) { 228 /* XXX: Certain upper layer operations do 229 * not provide receive buffer pages. 230 */ 231 *ppages = alloc_page(GFP_ATOMIC); 232 if (!*ppages) 233 return -EAGAIN; 234 } 235 seg->mr_page = *ppages; 236 seg->mr_offset = (char *)page_base; 237 seg->mr_len = min_t(u32, PAGE_SIZE - page_base, len); 238 len -= seg->mr_len; 239 ++ppages; 240 ++seg; 241 ++n; 242 page_base = 0; 243 } 244 245 /* When encoding a Read chunk, the tail iovec contains an 246 * XDR pad and may be omitted. 247 */ 248 if (type == rpcrdma_readch && r_xprt->rx_ia.ri_implicit_roundup) 249 goto out; 250 251 /* When encoding a Write chunk, some servers need to see an 252 * extra segment for non-XDR-aligned Write chunks. The upper 253 * layer provides space in the tail iovec that may be used 254 * for this purpose. 255 */ 256 if (type == rpcrdma_writech && r_xprt->rx_ia.ri_implicit_roundup) 257 goto out; 258 259 if (xdrbuf->tail[0].iov_len) 260 seg = rpcrdma_convert_kvec(&xdrbuf->tail[0], seg, &n); 261 262 out: 263 if (unlikely(n > RPCRDMA_MAX_SEGS)) 264 return -EIO; 265 return n; 266 } 267 268 static inline int 269 encode_item_present(struct xdr_stream *xdr) 270 { 271 __be32 *p; 272 273 p = xdr_reserve_space(xdr, sizeof(*p)); 274 if (unlikely(!p)) 275 return -EMSGSIZE; 276 277 *p = xdr_one; 278 return 0; 279 } 280 281 static inline int 282 encode_item_not_present(struct xdr_stream *xdr) 283 { 284 __be32 *p; 285 286 p = xdr_reserve_space(xdr, sizeof(*p)); 287 if (unlikely(!p)) 288 return -EMSGSIZE; 289 290 *p = xdr_zero; 291 return 0; 292 } 293 294 static void 295 xdr_encode_rdma_segment(__be32 *iptr, struct rpcrdma_mw *mw) 296 { 297 *iptr++ = cpu_to_be32(mw->mw_handle); 298 *iptr++ = cpu_to_be32(mw->mw_length); 299 xdr_encode_hyper(iptr, mw->mw_offset); 300 } 301 302 static int 303 encode_rdma_segment(struct xdr_stream *xdr, struct rpcrdma_mw *mw) 304 { 305 __be32 *p; 306 307 p = xdr_reserve_space(xdr, 4 * sizeof(*p)); 308 if (unlikely(!p)) 309 return -EMSGSIZE; 310 311 xdr_encode_rdma_segment(p, mw); 312 return 0; 313 } 314 315 static int 316 encode_read_segment(struct xdr_stream *xdr, struct rpcrdma_mw *mw, 317 u32 position) 318 { 319 __be32 *p; 320 321 p = xdr_reserve_space(xdr, 6 * sizeof(*p)); 322 if (unlikely(!p)) 323 return -EMSGSIZE; 324 325 *p++ = xdr_one; /* Item present */ 326 *p++ = cpu_to_be32(position); 327 xdr_encode_rdma_segment(p, mw); 328 return 0; 329 } 330 331 /* Register and XDR encode the Read list. Supports encoding a list of read 332 * segments that belong to a single read chunk. 333 * 334 * Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64): 335 * 336 * Read chunklist (a linked list): 337 * N elements, position P (same P for all chunks of same arg!): 338 * 1 - PHLOO - 1 - PHLOO - ... - 1 - PHLOO - 0 339 * 340 * Returns zero on success, or a negative errno if a failure occurred. 341 * @xdr is advanced to the next position in the stream. 342 * 343 * Only a single @pos value is currently supported. 344 */ 345 static noinline int 346 rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, 347 struct rpc_rqst *rqst, enum rpcrdma_chunktype rtype) 348 { 349 struct xdr_stream *xdr = &req->rl_stream; 350 struct rpcrdma_mr_seg *seg; 351 struct rpcrdma_mw *mw; 352 unsigned int pos; 353 int nsegs; 354 355 pos = rqst->rq_snd_buf.head[0].iov_len; 356 if (rtype == rpcrdma_areadch) 357 pos = 0; 358 seg = req->rl_segments; 359 nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_snd_buf, pos, 360 rtype, seg); 361 if (nsegs < 0) 362 return nsegs; 363 364 do { 365 seg = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, 366 false, &mw); 367 if (IS_ERR(seg)) 368 return PTR_ERR(seg); 369 rpcrdma_push_mw(mw, &req->rl_registered); 370 371 if (encode_read_segment(xdr, mw, pos) < 0) 372 return -EMSGSIZE; 373 374 dprintk("RPC: %5u %s: pos %u %u@0x%016llx:0x%08x (%s)\n", 375 rqst->rq_task->tk_pid, __func__, pos, 376 mw->mw_length, (unsigned long long)mw->mw_offset, 377 mw->mw_handle, mw->mw_nents < nsegs ? "more" : "last"); 378 379 r_xprt->rx_stats.read_chunk_count++; 380 nsegs -= mw->mw_nents; 381 } while (nsegs); 382 383 return 0; 384 } 385 386 /* Register and XDR encode the Write list. Supports encoding a list 387 * containing one array of plain segments that belong to a single 388 * write chunk. 389 * 390 * Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64): 391 * 392 * Write chunklist (a list of (one) counted array): 393 * N elements: 394 * 1 - N - HLOO - HLOO - ... - HLOO - 0 395 * 396 * Returns zero on success, or a negative errno if a failure occurred. 397 * @xdr is advanced to the next position in the stream. 398 * 399 * Only a single Write chunk is currently supported. 400 */ 401 static noinline int 402 rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, 403 struct rpc_rqst *rqst, enum rpcrdma_chunktype wtype) 404 { 405 struct xdr_stream *xdr = &req->rl_stream; 406 struct rpcrdma_mr_seg *seg; 407 struct rpcrdma_mw *mw; 408 int nsegs, nchunks; 409 __be32 *segcount; 410 411 seg = req->rl_segments; 412 nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_rcv_buf, 413 rqst->rq_rcv_buf.head[0].iov_len, 414 wtype, seg); 415 if (nsegs < 0) 416 return nsegs; 417 418 if (encode_item_present(xdr) < 0) 419 return -EMSGSIZE; 420 segcount = xdr_reserve_space(xdr, sizeof(*segcount)); 421 if (unlikely(!segcount)) 422 return -EMSGSIZE; 423 /* Actual value encoded below */ 424 425 nchunks = 0; 426 do { 427 seg = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, 428 true, &mw); 429 if (IS_ERR(seg)) 430 return PTR_ERR(seg); 431 rpcrdma_push_mw(mw, &req->rl_registered); 432 433 if (encode_rdma_segment(xdr, mw) < 0) 434 return -EMSGSIZE; 435 436 dprintk("RPC: %5u %s: %u@0x016%llx:0x%08x (%s)\n", 437 rqst->rq_task->tk_pid, __func__, 438 mw->mw_length, (unsigned long long)mw->mw_offset, 439 mw->mw_handle, mw->mw_nents < nsegs ? "more" : "last"); 440 441 r_xprt->rx_stats.write_chunk_count++; 442 r_xprt->rx_stats.total_rdma_request += seg->mr_len; 443 nchunks++; 444 nsegs -= mw->mw_nents; 445 } while (nsegs); 446 447 /* Update count of segments in this Write chunk */ 448 *segcount = cpu_to_be32(nchunks); 449 450 return 0; 451 } 452 453 /* Register and XDR encode the Reply chunk. Supports encoding an array 454 * of plain segments that belong to a single write (reply) chunk. 455 * 456 * Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64): 457 * 458 * Reply chunk (a counted array): 459 * N elements: 460 * 1 - N - HLOO - HLOO - ... - HLOO 461 * 462 * Returns zero on success, or a negative errno if a failure occurred. 463 * @xdr is advanced to the next position in the stream. 464 */ 465 static noinline int 466 rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, 467 struct rpc_rqst *rqst, enum rpcrdma_chunktype wtype) 468 { 469 struct xdr_stream *xdr = &req->rl_stream; 470 struct rpcrdma_mr_seg *seg; 471 struct rpcrdma_mw *mw; 472 int nsegs, nchunks; 473 __be32 *segcount; 474 475 seg = req->rl_segments; 476 nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_rcv_buf, 0, wtype, seg); 477 if (nsegs < 0) 478 return nsegs; 479 480 if (encode_item_present(xdr) < 0) 481 return -EMSGSIZE; 482 segcount = xdr_reserve_space(xdr, sizeof(*segcount)); 483 if (unlikely(!segcount)) 484 return -EMSGSIZE; 485 /* Actual value encoded below */ 486 487 nchunks = 0; 488 do { 489 seg = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, 490 true, &mw); 491 if (IS_ERR(seg)) 492 return PTR_ERR(seg); 493 rpcrdma_push_mw(mw, &req->rl_registered); 494 495 if (encode_rdma_segment(xdr, mw) < 0) 496 return -EMSGSIZE; 497 498 dprintk("RPC: %5u %s: %u@0x%016llx:0x%08x (%s)\n", 499 rqst->rq_task->tk_pid, __func__, 500 mw->mw_length, (unsigned long long)mw->mw_offset, 501 mw->mw_handle, mw->mw_nents < nsegs ? "more" : "last"); 502 503 r_xprt->rx_stats.reply_chunk_count++; 504 r_xprt->rx_stats.total_rdma_request += seg->mr_len; 505 nchunks++; 506 nsegs -= mw->mw_nents; 507 } while (nsegs); 508 509 /* Update count of segments in the Reply chunk */ 510 *segcount = cpu_to_be32(nchunks); 511 512 return 0; 513 } 514 515 /** 516 * rpcrdma_unmap_sendctx - DMA-unmap Send buffers 517 * @sc: sendctx containing SGEs to unmap 518 * 519 */ 520 void 521 rpcrdma_unmap_sendctx(struct rpcrdma_sendctx *sc) 522 { 523 struct rpcrdma_ia *ia = &sc->sc_xprt->rx_ia; 524 struct ib_sge *sge; 525 unsigned int count; 526 527 dprintk("RPC: %s: unmapping %u sges for sc=%p\n", 528 __func__, sc->sc_unmap_count, sc); 529 530 /* The first two SGEs contain the transport header and 531 * the inline buffer. These are always left mapped so 532 * they can be cheaply re-used. 533 */ 534 sge = &sc->sc_sges[2]; 535 for (count = sc->sc_unmap_count; count; ++sge, --count) 536 ib_dma_unmap_page(ia->ri_device, 537 sge->addr, sge->length, DMA_TO_DEVICE); 538 539 if (test_and_clear_bit(RPCRDMA_REQ_F_TX_RESOURCES, &sc->sc_req->rl_flags)) { 540 smp_mb__after_atomic(); 541 wake_up_bit(&sc->sc_req->rl_flags, RPCRDMA_REQ_F_TX_RESOURCES); 542 } 543 } 544 545 /* Prepare an SGE for the RPC-over-RDMA transport header. 546 */ 547 static bool 548 rpcrdma_prepare_hdr_sge(struct rpcrdma_ia *ia, struct rpcrdma_req *req, 549 u32 len) 550 { 551 struct rpcrdma_sendctx *sc = req->rl_sendctx; 552 struct rpcrdma_regbuf *rb = req->rl_rdmabuf; 553 struct ib_sge *sge = sc->sc_sges; 554 555 if (!rpcrdma_dma_map_regbuf(ia, rb)) 556 goto out_regbuf; 557 sge->addr = rdmab_addr(rb); 558 sge->length = len; 559 sge->lkey = rdmab_lkey(rb); 560 561 ib_dma_sync_single_for_device(rdmab_device(rb), sge->addr, 562 sge->length, DMA_TO_DEVICE); 563 sc->sc_wr.num_sge++; 564 return true; 565 566 out_regbuf: 567 pr_err("rpcrdma: failed to DMA map a Send buffer\n"); 568 return false; 569 } 570 571 /* Prepare the Send SGEs. The head and tail iovec, and each entry 572 * in the page list, gets its own SGE. 573 */ 574 static bool 575 rpcrdma_prepare_msg_sges(struct rpcrdma_ia *ia, struct rpcrdma_req *req, 576 struct xdr_buf *xdr, enum rpcrdma_chunktype rtype) 577 { 578 struct rpcrdma_sendctx *sc = req->rl_sendctx; 579 unsigned int sge_no, page_base, len, remaining; 580 struct rpcrdma_regbuf *rb = req->rl_sendbuf; 581 struct ib_device *device = ia->ri_device; 582 struct ib_sge *sge = sc->sc_sges; 583 u32 lkey = ia->ri_pd->local_dma_lkey; 584 struct page *page, **ppages; 585 586 /* The head iovec is straightforward, as it is already 587 * DMA-mapped. Sync the content that has changed. 588 */ 589 if (!rpcrdma_dma_map_regbuf(ia, rb)) 590 goto out_regbuf; 591 sge_no = 1; 592 sge[sge_no].addr = rdmab_addr(rb); 593 sge[sge_no].length = xdr->head[0].iov_len; 594 sge[sge_no].lkey = rdmab_lkey(rb); 595 ib_dma_sync_single_for_device(rdmab_device(rb), sge[sge_no].addr, 596 sge[sge_no].length, DMA_TO_DEVICE); 597 598 /* If there is a Read chunk, the page list is being handled 599 * via explicit RDMA, and thus is skipped here. However, the 600 * tail iovec may include an XDR pad for the page list, as 601 * well as additional content, and may not reside in the 602 * same page as the head iovec. 603 */ 604 if (rtype == rpcrdma_readch) { 605 len = xdr->tail[0].iov_len; 606 607 /* Do not include the tail if it is only an XDR pad */ 608 if (len < 4) 609 goto out; 610 611 page = virt_to_page(xdr->tail[0].iov_base); 612 page_base = offset_in_page(xdr->tail[0].iov_base); 613 614 /* If the content in the page list is an odd length, 615 * xdr_write_pages() has added a pad at the beginning 616 * of the tail iovec. Force the tail's non-pad content 617 * to land at the next XDR position in the Send message. 618 */ 619 page_base += len & 3; 620 len -= len & 3; 621 goto map_tail; 622 } 623 624 /* If there is a page list present, temporarily DMA map 625 * and prepare an SGE for each page to be sent. 626 */ 627 if (xdr->page_len) { 628 ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT); 629 page_base = offset_in_page(xdr->page_base); 630 remaining = xdr->page_len; 631 while (remaining) { 632 sge_no++; 633 if (sge_no > RPCRDMA_MAX_SEND_SGES - 2) 634 goto out_mapping_overflow; 635 636 len = min_t(u32, PAGE_SIZE - page_base, remaining); 637 sge[sge_no].addr = ib_dma_map_page(device, *ppages, 638 page_base, len, 639 DMA_TO_DEVICE); 640 if (ib_dma_mapping_error(device, sge[sge_no].addr)) 641 goto out_mapping_err; 642 sge[sge_no].length = len; 643 sge[sge_no].lkey = lkey; 644 645 sc->sc_unmap_count++; 646 ppages++; 647 remaining -= len; 648 page_base = 0; 649 } 650 } 651 652 /* The tail iovec is not always constructed in the same 653 * page where the head iovec resides (see, for example, 654 * gss_wrap_req_priv). To neatly accommodate that case, 655 * DMA map it separately. 656 */ 657 if (xdr->tail[0].iov_len) { 658 page = virt_to_page(xdr->tail[0].iov_base); 659 page_base = offset_in_page(xdr->tail[0].iov_base); 660 len = xdr->tail[0].iov_len; 661 662 map_tail: 663 sge_no++; 664 sge[sge_no].addr = ib_dma_map_page(device, page, 665 page_base, len, 666 DMA_TO_DEVICE); 667 if (ib_dma_mapping_error(device, sge[sge_no].addr)) 668 goto out_mapping_err; 669 sge[sge_no].length = len; 670 sge[sge_no].lkey = lkey; 671 sc->sc_unmap_count++; 672 } 673 674 out: 675 sc->sc_wr.num_sge += sge_no; 676 if (sc->sc_unmap_count) 677 __set_bit(RPCRDMA_REQ_F_TX_RESOURCES, &req->rl_flags); 678 return true; 679 680 out_regbuf: 681 pr_err("rpcrdma: failed to DMA map a Send buffer\n"); 682 return false; 683 684 out_mapping_overflow: 685 rpcrdma_unmap_sendctx(sc); 686 pr_err("rpcrdma: too many Send SGEs (%u)\n", sge_no); 687 return false; 688 689 out_mapping_err: 690 rpcrdma_unmap_sendctx(sc); 691 pr_err("rpcrdma: Send mapping error\n"); 692 return false; 693 } 694 695 /** 696 * rpcrdma_prepare_send_sges - Construct SGEs for a Send WR 697 * @r_xprt: controlling transport 698 * @req: context of RPC Call being marshalled 699 * @hdrlen: size of transport header, in bytes 700 * @xdr: xdr_buf containing RPC Call 701 * @rtype: chunk type being encoded 702 * 703 * Returns 0 on success; otherwise a negative errno is returned. 704 */ 705 int 706 rpcrdma_prepare_send_sges(struct rpcrdma_xprt *r_xprt, 707 struct rpcrdma_req *req, u32 hdrlen, 708 struct xdr_buf *xdr, enum rpcrdma_chunktype rtype) 709 { 710 req->rl_sendctx = rpcrdma_sendctx_get_locked(&r_xprt->rx_buf); 711 if (!req->rl_sendctx) 712 return -ENOBUFS; 713 req->rl_sendctx->sc_wr.num_sge = 0; 714 req->rl_sendctx->sc_unmap_count = 0; 715 req->rl_sendctx->sc_req = req; 716 __clear_bit(RPCRDMA_REQ_F_TX_RESOURCES, &req->rl_flags); 717 718 if (!rpcrdma_prepare_hdr_sge(&r_xprt->rx_ia, req, hdrlen)) 719 return -EIO; 720 721 if (rtype != rpcrdma_areadch) 722 if (!rpcrdma_prepare_msg_sges(&r_xprt->rx_ia, req, xdr, rtype)) 723 return -EIO; 724 725 return 0; 726 } 727 728 /** 729 * rpcrdma_marshal_req - Marshal and send one RPC request 730 * @r_xprt: controlling transport 731 * @rqst: RPC request to be marshaled 732 * 733 * For the RPC in "rqst", this function: 734 * - Chooses the transfer mode (eg., RDMA_MSG or RDMA_NOMSG) 735 * - Registers Read, Write, and Reply chunks 736 * - Constructs the transport header 737 * - Posts a Send WR to send the transport header and request 738 * 739 * Returns: 740 * %0 if the RPC was sent successfully, 741 * %-ENOTCONN if the connection was lost, 742 * %-EAGAIN if not enough pages are available for on-demand reply buffer, 743 * %-ENOBUFS if no MRs are available to register chunks, 744 * %-EMSGSIZE if the transport header is too small, 745 * %-EIO if a permanent problem occurred while marshaling. 746 */ 747 int 748 rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst) 749 { 750 struct rpcrdma_req *req = rpcr_to_rdmar(rqst); 751 struct xdr_stream *xdr = &req->rl_stream; 752 enum rpcrdma_chunktype rtype, wtype; 753 bool ddp_allowed; 754 __be32 *p; 755 int ret; 756 757 #if defined(CONFIG_SUNRPC_BACKCHANNEL) 758 if (test_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state)) 759 return rpcrdma_bc_marshal_reply(rqst); 760 #endif 761 762 rpcrdma_set_xdrlen(&req->rl_hdrbuf, 0); 763 xdr_init_encode(xdr, &req->rl_hdrbuf, 764 req->rl_rdmabuf->rg_base); 765 766 /* Fixed header fields */ 767 ret = -EMSGSIZE; 768 p = xdr_reserve_space(xdr, 4 * sizeof(*p)); 769 if (!p) 770 goto out_err; 771 *p++ = rqst->rq_xid; 772 *p++ = rpcrdma_version; 773 *p++ = cpu_to_be32(r_xprt->rx_buf.rb_max_requests); 774 775 /* When the ULP employs a GSS flavor that guarantees integrity 776 * or privacy, direct data placement of individual data items 777 * is not allowed. 778 */ 779 ddp_allowed = !(rqst->rq_cred->cr_auth->au_flags & 780 RPCAUTH_AUTH_DATATOUCH); 781 782 /* 783 * Chunks needed for results? 784 * 785 * o If the expected result is under the inline threshold, all ops 786 * return as inline. 787 * o Large read ops return data as write chunk(s), header as 788 * inline. 789 * o Large non-read ops return as a single reply chunk. 790 */ 791 if (rpcrdma_results_inline(r_xprt, rqst)) 792 wtype = rpcrdma_noch; 793 else if (ddp_allowed && rqst->rq_rcv_buf.flags & XDRBUF_READ) 794 wtype = rpcrdma_writech; 795 else 796 wtype = rpcrdma_replych; 797 798 /* 799 * Chunks needed for arguments? 800 * 801 * o If the total request is under the inline threshold, all ops 802 * are sent as inline. 803 * o Large write ops transmit data as read chunk(s), header as 804 * inline. 805 * o Large non-write ops are sent with the entire message as a 806 * single read chunk (protocol 0-position special case). 807 * 808 * This assumes that the upper layer does not present a request 809 * that both has a data payload, and whose non-data arguments 810 * by themselves are larger than the inline threshold. 811 */ 812 if (rpcrdma_args_inline(r_xprt, rqst)) { 813 *p++ = rdma_msg; 814 rtype = rpcrdma_noch; 815 } else if (ddp_allowed && rqst->rq_snd_buf.flags & XDRBUF_WRITE) { 816 *p++ = rdma_msg; 817 rtype = rpcrdma_readch; 818 } else { 819 r_xprt->rx_stats.nomsg_call_count++; 820 *p++ = rdma_nomsg; 821 rtype = rpcrdma_areadch; 822 } 823 824 /* This implementation supports the following combinations 825 * of chunk lists in one RPC-over-RDMA Call message: 826 * 827 * - Read list 828 * - Write list 829 * - Reply chunk 830 * - Read list + Reply chunk 831 * 832 * It might not yet support the following combinations: 833 * 834 * - Read list + Write list 835 * 836 * It does not support the following combinations: 837 * 838 * - Write list + Reply chunk 839 * - Read list + Write list + Reply chunk 840 * 841 * This implementation supports only a single chunk in each 842 * Read or Write list. Thus for example the client cannot 843 * send a Call message with a Position Zero Read chunk and a 844 * regular Read chunk at the same time. 845 */ 846 if (rtype != rpcrdma_noch) { 847 ret = rpcrdma_encode_read_list(r_xprt, req, rqst, rtype); 848 if (ret) 849 goto out_err; 850 } 851 ret = encode_item_not_present(xdr); 852 if (ret) 853 goto out_err; 854 855 if (wtype == rpcrdma_writech) { 856 ret = rpcrdma_encode_write_list(r_xprt, req, rqst, wtype); 857 if (ret) 858 goto out_err; 859 } 860 ret = encode_item_not_present(xdr); 861 if (ret) 862 goto out_err; 863 864 if (wtype != rpcrdma_replych) 865 ret = encode_item_not_present(xdr); 866 else 867 ret = rpcrdma_encode_reply_chunk(r_xprt, req, rqst, wtype); 868 if (ret) 869 goto out_err; 870 871 dprintk("RPC: %5u %s: %s/%s: hdrlen %u rpclen\n", 872 rqst->rq_task->tk_pid, __func__, 873 transfertypes[rtype], transfertypes[wtype], 874 xdr_stream_pos(xdr)); 875 876 ret = rpcrdma_prepare_send_sges(r_xprt, req, xdr_stream_pos(xdr), 877 &rqst->rq_snd_buf, rtype); 878 if (ret) 879 goto out_err; 880 return 0; 881 882 out_err: 883 if (ret != -ENOBUFS) { 884 pr_err("rpcrdma: header marshaling failed (%d)\n", ret); 885 r_xprt->rx_stats.failed_marshal_count++; 886 } 887 return ret; 888 } 889 890 /** 891 * rpcrdma_inline_fixup - Scatter inline received data into rqst's iovecs 892 * @rqst: controlling RPC request 893 * @srcp: points to RPC message payload in receive buffer 894 * @copy_len: remaining length of receive buffer content 895 * @pad: Write chunk pad bytes needed (zero for pure inline) 896 * 897 * The upper layer has set the maximum number of bytes it can 898 * receive in each component of rq_rcv_buf. These values are set in 899 * the head.iov_len, page_len, tail.iov_len, and buflen fields. 900 * 901 * Unlike the TCP equivalent (xdr_partial_copy_from_skb), in 902 * many cases this function simply updates iov_base pointers in 903 * rq_rcv_buf to point directly to the received reply data, to 904 * avoid copying reply data. 905 * 906 * Returns the count of bytes which had to be memcopied. 907 */ 908 static unsigned long 909 rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad) 910 { 911 unsigned long fixup_copy_count; 912 int i, npages, curlen; 913 char *destp; 914 struct page **ppages; 915 int page_base; 916 917 /* The head iovec is redirected to the RPC reply message 918 * in the receive buffer, to avoid a memcopy. 919 */ 920 rqst->rq_rcv_buf.head[0].iov_base = srcp; 921 rqst->rq_private_buf.head[0].iov_base = srcp; 922 923 /* The contents of the receive buffer that follow 924 * head.iov_len bytes are copied into the page list. 925 */ 926 curlen = rqst->rq_rcv_buf.head[0].iov_len; 927 if (curlen > copy_len) 928 curlen = copy_len; 929 dprintk("RPC: %s: srcp 0x%p len %d hdrlen %d\n", 930 __func__, srcp, copy_len, curlen); 931 srcp += curlen; 932 copy_len -= curlen; 933 934 ppages = rqst->rq_rcv_buf.pages + 935 (rqst->rq_rcv_buf.page_base >> PAGE_SHIFT); 936 page_base = offset_in_page(rqst->rq_rcv_buf.page_base); 937 fixup_copy_count = 0; 938 if (copy_len && rqst->rq_rcv_buf.page_len) { 939 int pagelist_len; 940 941 pagelist_len = rqst->rq_rcv_buf.page_len; 942 if (pagelist_len > copy_len) 943 pagelist_len = copy_len; 944 npages = PAGE_ALIGN(page_base + pagelist_len) >> PAGE_SHIFT; 945 for (i = 0; i < npages; i++) { 946 curlen = PAGE_SIZE - page_base; 947 if (curlen > pagelist_len) 948 curlen = pagelist_len; 949 950 dprintk("RPC: %s: page %d" 951 " srcp 0x%p len %d curlen %d\n", 952 __func__, i, srcp, copy_len, curlen); 953 destp = kmap_atomic(ppages[i]); 954 memcpy(destp + page_base, srcp, curlen); 955 flush_dcache_page(ppages[i]); 956 kunmap_atomic(destp); 957 srcp += curlen; 958 copy_len -= curlen; 959 fixup_copy_count += curlen; 960 pagelist_len -= curlen; 961 if (!pagelist_len) 962 break; 963 page_base = 0; 964 } 965 966 /* Implicit padding for the last segment in a Write 967 * chunk is inserted inline at the front of the tail 968 * iovec. The upper layer ignores the content of 969 * the pad. Simply ensure inline content in the tail 970 * that follows the Write chunk is properly aligned. 971 */ 972 if (pad) 973 srcp -= pad; 974 } 975 976 /* The tail iovec is redirected to the remaining data 977 * in the receive buffer, to avoid a memcopy. 978 */ 979 if (copy_len || pad) { 980 rqst->rq_rcv_buf.tail[0].iov_base = srcp; 981 rqst->rq_private_buf.tail[0].iov_base = srcp; 982 } 983 984 return fixup_copy_count; 985 } 986 987 /* Caller must guarantee @rep remains stable during this call. 988 */ 989 static void 990 rpcrdma_mark_remote_invalidation(struct list_head *mws, 991 struct rpcrdma_rep *rep) 992 { 993 struct rpcrdma_mw *mw; 994 995 if (!(rep->rr_wc_flags & IB_WC_WITH_INVALIDATE)) 996 return; 997 998 list_for_each_entry(mw, mws, mw_list) 999 if (mw->mw_handle == rep->rr_inv_rkey) { 1000 mw->mw_flags = RPCRDMA_MW_F_RI; 1001 break; /* only one invalidated MR per RPC */ 1002 } 1003 } 1004 1005 /* By convention, backchannel calls arrive via rdma_msg type 1006 * messages, and never populate the chunk lists. This makes 1007 * the RPC/RDMA header small and fixed in size, so it is 1008 * straightforward to check the RPC header's direction field. 1009 */ 1010 static bool 1011 rpcrdma_is_bcall(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep) 1012 #if defined(CONFIG_SUNRPC_BACKCHANNEL) 1013 { 1014 struct xdr_stream *xdr = &rep->rr_stream; 1015 __be32 *p; 1016 1017 if (rep->rr_proc != rdma_msg) 1018 return false; 1019 1020 /* Peek at stream contents without advancing. */ 1021 p = xdr_inline_decode(xdr, 0); 1022 1023 /* Chunk lists */ 1024 if (*p++ != xdr_zero) 1025 return false; 1026 if (*p++ != xdr_zero) 1027 return false; 1028 if (*p++ != xdr_zero) 1029 return false; 1030 1031 /* RPC header */ 1032 if (*p++ != rep->rr_xid) 1033 return false; 1034 if (*p != cpu_to_be32(RPC_CALL)) 1035 return false; 1036 1037 /* Now that we are sure this is a backchannel call, 1038 * advance to the RPC header. 1039 */ 1040 p = xdr_inline_decode(xdr, 3 * sizeof(*p)); 1041 if (unlikely(!p)) 1042 goto out_short; 1043 1044 rpcrdma_bc_receive_call(r_xprt, rep); 1045 return true; 1046 1047 out_short: 1048 pr_warn("RPC/RDMA short backward direction call\n"); 1049 if (rpcrdma_ep_post_recv(&r_xprt->rx_ia, rep)) 1050 xprt_disconnect_done(&r_xprt->rx_xprt); 1051 return true; 1052 } 1053 #else /* CONFIG_SUNRPC_BACKCHANNEL */ 1054 { 1055 return false; 1056 } 1057 #endif /* CONFIG_SUNRPC_BACKCHANNEL */ 1058 1059 static int decode_rdma_segment(struct xdr_stream *xdr, u32 *length) 1060 { 1061 __be32 *p; 1062 1063 p = xdr_inline_decode(xdr, 4 * sizeof(*p)); 1064 if (unlikely(!p)) 1065 return -EIO; 1066 1067 ifdebug(FACILITY) { 1068 u64 offset; 1069 u32 handle; 1070 1071 handle = be32_to_cpup(p++); 1072 *length = be32_to_cpup(p++); 1073 xdr_decode_hyper(p, &offset); 1074 dprintk("RPC: %s: segment %u@0x%016llx:0x%08x\n", 1075 __func__, *length, (unsigned long long)offset, 1076 handle); 1077 } else { 1078 *length = be32_to_cpup(p + 1); 1079 } 1080 1081 return 0; 1082 } 1083 1084 static int decode_write_chunk(struct xdr_stream *xdr, u32 *length) 1085 { 1086 u32 segcount, seglength; 1087 __be32 *p; 1088 1089 p = xdr_inline_decode(xdr, sizeof(*p)); 1090 if (unlikely(!p)) 1091 return -EIO; 1092 1093 *length = 0; 1094 segcount = be32_to_cpup(p); 1095 while (segcount--) { 1096 if (decode_rdma_segment(xdr, &seglength)) 1097 return -EIO; 1098 *length += seglength; 1099 } 1100 1101 dprintk("RPC: %s: segcount=%u, %u bytes\n", 1102 __func__, be32_to_cpup(p), *length); 1103 return 0; 1104 } 1105 1106 /* In RPC-over-RDMA Version One replies, a Read list is never 1107 * expected. This decoder is a stub that returns an error if 1108 * a Read list is present. 1109 */ 1110 static int decode_read_list(struct xdr_stream *xdr) 1111 { 1112 __be32 *p; 1113 1114 p = xdr_inline_decode(xdr, sizeof(*p)); 1115 if (unlikely(!p)) 1116 return -EIO; 1117 if (unlikely(*p != xdr_zero)) 1118 return -EIO; 1119 return 0; 1120 } 1121 1122 /* Supports only one Write chunk in the Write list 1123 */ 1124 static int decode_write_list(struct xdr_stream *xdr, u32 *length) 1125 { 1126 u32 chunklen; 1127 bool first; 1128 __be32 *p; 1129 1130 *length = 0; 1131 first = true; 1132 do { 1133 p = xdr_inline_decode(xdr, sizeof(*p)); 1134 if (unlikely(!p)) 1135 return -EIO; 1136 if (*p == xdr_zero) 1137 break; 1138 if (!first) 1139 return -EIO; 1140 1141 if (decode_write_chunk(xdr, &chunklen)) 1142 return -EIO; 1143 *length += chunklen; 1144 first = false; 1145 } while (true); 1146 return 0; 1147 } 1148 1149 static int decode_reply_chunk(struct xdr_stream *xdr, u32 *length) 1150 { 1151 __be32 *p; 1152 1153 p = xdr_inline_decode(xdr, sizeof(*p)); 1154 if (unlikely(!p)) 1155 return -EIO; 1156 1157 *length = 0; 1158 if (*p != xdr_zero) 1159 if (decode_write_chunk(xdr, length)) 1160 return -EIO; 1161 return 0; 1162 } 1163 1164 static int 1165 rpcrdma_decode_msg(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep, 1166 struct rpc_rqst *rqst) 1167 { 1168 struct xdr_stream *xdr = &rep->rr_stream; 1169 u32 writelist, replychunk, rpclen; 1170 char *base; 1171 1172 /* Decode the chunk lists */ 1173 if (decode_read_list(xdr)) 1174 return -EIO; 1175 if (decode_write_list(xdr, &writelist)) 1176 return -EIO; 1177 if (decode_reply_chunk(xdr, &replychunk)) 1178 return -EIO; 1179 1180 /* RDMA_MSG sanity checks */ 1181 if (unlikely(replychunk)) 1182 return -EIO; 1183 1184 /* Build the RPC reply's Payload stream in rqst->rq_rcv_buf */ 1185 base = (char *)xdr_inline_decode(xdr, 0); 1186 rpclen = xdr_stream_remaining(xdr); 1187 r_xprt->rx_stats.fixup_copy_count += 1188 rpcrdma_inline_fixup(rqst, base, rpclen, writelist & 3); 1189 1190 r_xprt->rx_stats.total_rdma_reply += writelist; 1191 return rpclen + xdr_align_size(writelist); 1192 } 1193 1194 static noinline int 1195 rpcrdma_decode_nomsg(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep) 1196 { 1197 struct xdr_stream *xdr = &rep->rr_stream; 1198 u32 writelist, replychunk; 1199 1200 /* Decode the chunk lists */ 1201 if (decode_read_list(xdr)) 1202 return -EIO; 1203 if (decode_write_list(xdr, &writelist)) 1204 return -EIO; 1205 if (decode_reply_chunk(xdr, &replychunk)) 1206 return -EIO; 1207 1208 /* RDMA_NOMSG sanity checks */ 1209 if (unlikely(writelist)) 1210 return -EIO; 1211 if (unlikely(!replychunk)) 1212 return -EIO; 1213 1214 /* Reply chunk buffer already is the reply vector */ 1215 r_xprt->rx_stats.total_rdma_reply += replychunk; 1216 return replychunk; 1217 } 1218 1219 static noinline int 1220 rpcrdma_decode_error(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep, 1221 struct rpc_rqst *rqst) 1222 { 1223 struct xdr_stream *xdr = &rep->rr_stream; 1224 __be32 *p; 1225 1226 p = xdr_inline_decode(xdr, sizeof(*p)); 1227 if (unlikely(!p)) 1228 return -EIO; 1229 1230 switch (*p) { 1231 case err_vers: 1232 p = xdr_inline_decode(xdr, 2 * sizeof(*p)); 1233 if (!p) 1234 break; 1235 dprintk("RPC: %5u: %s: server reports version error (%u-%u)\n", 1236 rqst->rq_task->tk_pid, __func__, 1237 be32_to_cpup(p), be32_to_cpu(*(p + 1))); 1238 break; 1239 case err_chunk: 1240 dprintk("RPC: %5u: %s: server reports header decoding error\n", 1241 rqst->rq_task->tk_pid, __func__); 1242 break; 1243 default: 1244 dprintk("RPC: %5u: %s: server reports unrecognized error %d\n", 1245 rqst->rq_task->tk_pid, __func__, be32_to_cpup(p)); 1246 } 1247 1248 r_xprt->rx_stats.bad_reply_count++; 1249 return -EREMOTEIO; 1250 } 1251 1252 /* Perform XID lookup, reconstruction of the RPC reply, and 1253 * RPC completion while holding the transport lock to ensure 1254 * the rep, rqst, and rq_task pointers remain stable. 1255 */ 1256 void rpcrdma_complete_rqst(struct rpcrdma_rep *rep) 1257 { 1258 struct rpcrdma_xprt *r_xprt = rep->rr_rxprt; 1259 struct rpc_xprt *xprt = &r_xprt->rx_xprt; 1260 struct rpc_rqst *rqst = rep->rr_rqst; 1261 unsigned long cwnd; 1262 int status; 1263 1264 xprt->reestablish_timeout = 0; 1265 1266 switch (rep->rr_proc) { 1267 case rdma_msg: 1268 status = rpcrdma_decode_msg(r_xprt, rep, rqst); 1269 break; 1270 case rdma_nomsg: 1271 status = rpcrdma_decode_nomsg(r_xprt, rep); 1272 break; 1273 case rdma_error: 1274 status = rpcrdma_decode_error(r_xprt, rep, rqst); 1275 break; 1276 default: 1277 status = -EIO; 1278 } 1279 if (status < 0) 1280 goto out_badheader; 1281 1282 out: 1283 spin_lock(&xprt->recv_lock); 1284 cwnd = xprt->cwnd; 1285 xprt->cwnd = r_xprt->rx_buf.rb_credits << RPC_CWNDSHIFT; 1286 if (xprt->cwnd > cwnd) 1287 xprt_release_rqst_cong(rqst->rq_task); 1288 1289 xprt_complete_rqst(rqst->rq_task, status); 1290 xprt_unpin_rqst(rqst); 1291 spin_unlock(&xprt->recv_lock); 1292 return; 1293 1294 /* If the incoming reply terminated a pending RPC, the next 1295 * RPC call will post a replacement receive buffer as it is 1296 * being marshaled. 1297 */ 1298 out_badheader: 1299 dprintk("RPC: %5u %s: invalid rpcrdma reply (type %u)\n", 1300 rqst->rq_task->tk_pid, __func__, be32_to_cpu(rep->rr_proc)); 1301 r_xprt->rx_stats.bad_reply_count++; 1302 status = -EIO; 1303 goto out; 1304 } 1305 1306 void rpcrdma_release_rqst(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) 1307 { 1308 /* Invalidate and unmap the data payloads before waking 1309 * the waiting application. This guarantees the memory 1310 * regions are properly fenced from the server before the 1311 * application accesses the data. It also ensures proper 1312 * send flow control: waking the next RPC waits until this 1313 * RPC has relinquished all its Send Queue entries. 1314 */ 1315 if (!list_empty(&req->rl_registered)) 1316 r_xprt->rx_ia.ri_ops->ro_unmap_sync(r_xprt, 1317 &req->rl_registered); 1318 1319 /* Ensure that any DMA mapped pages associated with 1320 * the Send of the RPC Call have been unmapped before 1321 * allowing the RPC to complete. This protects argument 1322 * memory not controlled by the RPC client from being 1323 * re-used before we're done with it. 1324 */ 1325 if (test_bit(RPCRDMA_REQ_F_TX_RESOURCES, &req->rl_flags)) { 1326 r_xprt->rx_stats.reply_waits_for_send++; 1327 out_of_line_wait_on_bit(&req->rl_flags, 1328 RPCRDMA_REQ_F_TX_RESOURCES, 1329 bit_wait, 1330 TASK_UNINTERRUPTIBLE); 1331 } 1332 } 1333 1334 /* Reply handling runs in the poll worker thread. Anything that 1335 * might wait is deferred to a separate workqueue. 1336 */ 1337 void rpcrdma_deferred_completion(struct work_struct *work) 1338 { 1339 struct rpcrdma_rep *rep = 1340 container_of(work, struct rpcrdma_rep, rr_work); 1341 struct rpcrdma_req *req = rpcr_to_rdmar(rep->rr_rqst); 1342 1343 rpcrdma_mark_remote_invalidation(&req->rl_registered, rep); 1344 rpcrdma_release_rqst(rep->rr_rxprt, req); 1345 rpcrdma_complete_rqst(rep); 1346 } 1347 1348 /* Process received RPC/RDMA messages. 1349 * 1350 * Errors must result in the RPC task either being awakened, or 1351 * allowed to timeout, to discover the errors at that time. 1352 */ 1353 void rpcrdma_reply_handler(struct rpcrdma_rep *rep) 1354 { 1355 struct rpcrdma_xprt *r_xprt = rep->rr_rxprt; 1356 struct rpc_xprt *xprt = &r_xprt->rx_xprt; 1357 struct rpcrdma_buffer *buf = &r_xprt->rx_buf; 1358 struct rpcrdma_req *req; 1359 struct rpc_rqst *rqst; 1360 u32 credits; 1361 __be32 *p; 1362 1363 dprintk("RPC: %s: incoming rep %p\n", __func__, rep); 1364 1365 if (rep->rr_hdrbuf.head[0].iov_len == 0) 1366 goto out_badstatus; 1367 1368 xdr_init_decode(&rep->rr_stream, &rep->rr_hdrbuf, 1369 rep->rr_hdrbuf.head[0].iov_base); 1370 1371 /* Fixed transport header fields */ 1372 p = xdr_inline_decode(&rep->rr_stream, 4 * sizeof(*p)); 1373 if (unlikely(!p)) 1374 goto out_shortreply; 1375 rep->rr_xid = *p++; 1376 rep->rr_vers = *p++; 1377 credits = be32_to_cpu(*p++); 1378 rep->rr_proc = *p++; 1379 1380 if (rep->rr_vers != rpcrdma_version) 1381 goto out_badversion; 1382 1383 if (rpcrdma_is_bcall(r_xprt, rep)) 1384 return; 1385 1386 /* Match incoming rpcrdma_rep to an rpcrdma_req to 1387 * get context for handling any incoming chunks. 1388 */ 1389 spin_lock(&xprt->recv_lock); 1390 rqst = xprt_lookup_rqst(xprt, rep->rr_xid); 1391 if (!rqst) 1392 goto out_norqst; 1393 xprt_pin_rqst(rqst); 1394 1395 if (credits == 0) 1396 credits = 1; /* don't deadlock */ 1397 else if (credits > buf->rb_max_requests) 1398 credits = buf->rb_max_requests; 1399 buf->rb_credits = credits; 1400 1401 spin_unlock(&xprt->recv_lock); 1402 1403 req = rpcr_to_rdmar(rqst); 1404 req->rl_reply = rep; 1405 rep->rr_rqst = rqst; 1406 clear_bit(RPCRDMA_REQ_F_PENDING, &req->rl_flags); 1407 1408 dprintk("RPC: %s: reply %p completes request %p (xid 0x%08x)\n", 1409 __func__, rep, req, be32_to_cpu(rep->rr_xid)); 1410 1411 queue_work_on(req->rl_cpu, rpcrdma_receive_wq, &rep->rr_work); 1412 return; 1413 1414 out_badstatus: 1415 rpcrdma_recv_buffer_put(rep); 1416 if (r_xprt->rx_ep.rep_connected == 1) { 1417 r_xprt->rx_ep.rep_connected = -EIO; 1418 rpcrdma_conn_func(&r_xprt->rx_ep); 1419 } 1420 return; 1421 1422 out_badversion: 1423 dprintk("RPC: %s: invalid version %d\n", 1424 __func__, be32_to_cpu(rep->rr_vers)); 1425 goto repost; 1426 1427 /* The RPC transaction has already been terminated, or the header 1428 * is corrupt. 1429 */ 1430 out_norqst: 1431 spin_unlock(&xprt->recv_lock); 1432 dprintk("RPC: %s: no match for incoming xid 0x%08x\n", 1433 __func__, be32_to_cpu(rep->rr_xid)); 1434 goto repost; 1435 1436 out_shortreply: 1437 dprintk("RPC: %s: short/invalid reply\n", __func__); 1438 1439 /* If no pending RPC transaction was matched, post a replacement 1440 * receive buffer before returning. 1441 */ 1442 repost: 1443 r_xprt->rx_stats.bad_reply_count++; 1444 if (rpcrdma_ep_post_recv(&r_xprt->rx_ia, rep)) 1445 rpcrdma_recv_buffer_put(rep); 1446 } 1447