// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
/*
 * Copyright (c) 2014-2020, Oracle and/or its affiliates.
 * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses. You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the BSD-type
 * license below:
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *      Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *
 *      Redistributions in binary form must reproduce the above
 *      copyright notice, this list of conditions and the following
 *      disclaimer in the documentation and/or other materials provided
 *      with the distribution.
 *
 *      Neither the name of the Network Appliance, Inc. nor the names of
 *      its contributors may be used to endorse or promote products
 *      derived from this software without specific prior written
 *      permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * rpc_rdma.c
 *
 * This file contains the guts of the RPC RDMA protocol, and
 * does marshaling/unmarshaling, etc. It is also where interfacing
 * to the Linux RPC framework lives.
 */

#include <linux/highmem.h>

#include <linux/sunrpc/svc_rdma.h>

#include "xprt_rdma.h"
#include <trace/events/rpcrdma.h>

/* Returns size of largest RPC-over-RDMA header in a Call message
 *
 * The largest Call header contains a full-size Read list and a
 * minimal Reply chunk.
 */
static unsigned int rpcrdma_max_call_header_size(unsigned int maxsegs)
{
	unsigned int size;

	/* Fixed header fields and list discriminators */
	size = RPCRDMA_HDRLEN_MIN;

	/* Maximum Read list size */
	size += maxsegs * rpcrdma_readchunk_maxsz * sizeof(__be32);

	/* Minimal Reply chunk size */
	size += sizeof(__be32);	/* segment count */
	size += rpcrdma_segment_maxsz * sizeof(__be32);
	size += sizeof(__be32);	/* list discriminator */

	return size;
}
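
/* Worked example (illustration only, not used by any logic here):
 * assuming RPCRDMA_HDRLEN_MIN is 7 XDR words (28 bytes),
 * rpcrdma_readchunk_maxsz is 6 XDR words, and rpcrdma_segment_maxsz is
 * 4 XDR words, a transport whose device supports maxsegs = 8 reserves
 *
 *	28 + (8 * 6 * 4) + (4 + 16 + 4) = 244 bytes
 *
 * of the inline send buffer for the largest possible Call header,
 * leaving re_inline_send - 244 bytes for the RPC Call itself (see
 * rpcrdma_set_max_header_sizes() below).
 */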

/* Returns size of largest RPC-over-RDMA header in a Reply message
 *
 * There is only one Write list or one Reply chunk per Reply
 * message. The larger list is the Write list.
 */
static unsigned int rpcrdma_max_reply_header_size(unsigned int maxsegs)
{
	unsigned int size;

	/* Fixed header fields and list discriminators */
	size = RPCRDMA_HDRLEN_MIN;

	/* Maximum Write list size */
	size += sizeof(__be32);	/* segment count */
	size += maxsegs * rpcrdma_segment_maxsz * sizeof(__be32);
	size += sizeof(__be32);	/* list discriminator */

	return size;
}

/**
 * rpcrdma_set_max_header_sizes - Initialize inline payload sizes
 * @ep: endpoint to initialize
 *
 * The max_inline fields contain the maximum size of an RPC message
 * so the marshaling code doesn't have to repeat this calculation
 * for every RPC.
 */
void rpcrdma_set_max_header_sizes(struct rpcrdma_ep *ep)
{
	unsigned int maxsegs = ep->re_max_rdma_segs;

	ep->re_max_inline_send =
		ep->re_inline_send - rpcrdma_max_call_header_size(maxsegs);
	ep->re_max_inline_recv =
		ep->re_inline_recv - rpcrdma_max_reply_header_size(maxsegs);
}

/* The client can send a request inline as long as the RPC-over-RDMA
 * header plus the RPC call fit under the transport's inline limit. If
 * the combined call message size exceeds that limit, the client must
 * use a Read chunk for this operation.
 *
 * A Read chunk is also required if sending the RPC call inline would
 * exceed this device's max_sge limit.
 */
static bool rpcrdma_args_inline(struct rpcrdma_xprt *r_xprt,
				struct rpc_rqst *rqst)
{
	struct xdr_buf *xdr = &rqst->rq_snd_buf;
	struct rpcrdma_ep *ep = r_xprt->rx_ep;
	unsigned int count, remaining, offset;

	if (xdr->len > ep->re_max_inline_send)
		return false;

	if (xdr->page_len) {
		remaining = xdr->page_len;
		offset = offset_in_page(xdr->page_base);
		count = RPCRDMA_MIN_SEND_SGES;
		while (remaining) {
			remaining -= min_t(unsigned int,
					   PAGE_SIZE - offset, remaining);
			offset = 0;
			if (++count > ep->re_attr.cap.max_send_sge)
				return false;
		}
	}

	return true;
}

/* The client can't know how large the actual reply will be. Thus it
 * plans for the largest possible reply for that particular ULP
 * operation. If the maximum combined reply message size exceeds that
 * limit, the client must provide a Write list or a Reply chunk for
 * this request.
 */
static bool rpcrdma_results_inline(struct rpcrdma_xprt *r_xprt,
				   struct rpc_rqst *rqst)
{
	return rqst->rq_rcv_buf.buflen <= r_xprt->rx_ep->re_max_inline_recv;
}

/* The client is required to provide a Reply chunk if the maximum
 * size of the non-payload part of the RPC Reply is larger than
 * the inline threshold.
 */
static bool
rpcrdma_nonpayload_inline(const struct rpcrdma_xprt *r_xprt,
			  const struct rpc_rqst *rqst)
{
	const struct xdr_buf *buf = &rqst->rq_rcv_buf;

	return (buf->head[0].iov_len + buf->tail[0].iov_len) <
		r_xprt->rx_ep->re_max_inline_recv;
}
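
/* Worked example for rpcrdma_args_inline() above (illustration only):
 * assuming PAGE_SIZE is 4096 and RPCRDMA_MIN_SEND_SGES is 3, a Call
 * carrying 5000 bytes of page data that starts at page offset 100
 * spans two pages (3996 bytes, then 1004 bytes), so the estimated Send
 * SGE count reaches 5. Such a Call may go inline only if the device's
 * max_send_sge is at least 5 and xdr->len also fits under
 * re_max_inline_send.
 */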

/* ACL likes to be lazy in allocating pages. For TCP, these
 * pages can be allocated during receive processing. Not true
 * for RDMA, which must always provision receive buffers
 * up front.
 */
static noinline int
rpcrdma_alloc_sparse_pages(struct xdr_buf *buf)
{
	struct page **ppages;
	int len;

	len = buf->page_len;
	ppages = buf->pages + (buf->page_base >> PAGE_SHIFT);
	while (len > 0) {
		if (!*ppages)
			*ppages = alloc_page(GFP_NOWAIT | __GFP_NOWARN);
		if (!*ppages)
			return -ENOBUFS;
		ppages++;
		len -= PAGE_SIZE;
	}

	return 0;
}

/* Convert @vec to a single SGL element.
 *
 * Returns pointer to next available SGE, and bumps the total number
 * of SGEs consumed.
 */
static struct rpcrdma_mr_seg *
rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg,
		     unsigned int *n)
{
	seg->mr_page = virt_to_page(vec->iov_base);
	seg->mr_offset = offset_in_page(vec->iov_base);
	seg->mr_len = vec->iov_len;
	++seg;
	++(*n);
	return seg;
}

/* Convert @xdrbuf into SGEs no larger than a page each. As they
 * are registered, these SGEs are then coalesced into RDMA segments
 * when the selected memreg mode supports it.
 *
 * Returns positive number of SGEs consumed, or a negative errno.
 */
static int
rpcrdma_convert_iovs(struct rpcrdma_xprt *r_xprt, struct xdr_buf *xdrbuf,
		     unsigned int pos, enum rpcrdma_chunktype type,
		     struct rpcrdma_mr_seg *seg)
{
	unsigned long page_base;
	unsigned int len, n;
	struct page **ppages;

	n = 0;
	if (pos == 0)
		seg = rpcrdma_convert_kvec(&xdrbuf->head[0], seg, &n);

	len = xdrbuf->page_len;
	ppages = xdrbuf->pages + (xdrbuf->page_base >> PAGE_SHIFT);
	page_base = offset_in_page(xdrbuf->page_base);
	while (len) {
		seg->mr_page = *ppages;
		seg->mr_offset = page_base;
		seg->mr_len = min_t(u32, PAGE_SIZE - page_base, len);
		len -= seg->mr_len;
		++ppages;
		++seg;
		++n;
		page_base = 0;
	}

	if (type == rpcrdma_readch || type == rpcrdma_writech)
		goto out;

	if (xdrbuf->tail[0].iov_len)
		rpcrdma_convert_kvec(&xdrbuf->tail[0], seg, &n);

out:
	if (unlikely(n > RPCRDMA_MAX_SEGS))
		return -EIO;
	return n;
}

static int
encode_rdma_segment(struct xdr_stream *xdr, struct rpcrdma_mr *mr)
{
	__be32 *p;

	p = xdr_reserve_space(xdr, 4 * sizeof(*p));
	if (unlikely(!p))
		return -EMSGSIZE;

	xdr_encode_rdma_segment(p, mr->mr_handle, mr->mr_length, mr->mr_offset);
	return 0;
}

static int
encode_read_segment(struct xdr_stream *xdr, struct rpcrdma_mr *mr,
		    u32 position)
{
	__be32 *p;

	p = xdr_reserve_space(xdr, 6 * sizeof(*p));
	if (unlikely(!p))
		return -EMSGSIZE;

	*p++ = xdr_one;			/* Item present */
	xdr_encode_read_segment(p, position, mr->mr_handle, mr->mr_length,
				mr->mr_offset);
	return 0;
}

static struct rpcrdma_mr_seg *rpcrdma_mr_prepare(struct rpcrdma_xprt *r_xprt,
						 struct rpcrdma_req *req,
						 struct rpcrdma_mr_seg *seg,
						 int nsegs, bool writing,
						 struct rpcrdma_mr **mr)
{
	*mr = rpcrdma_mr_pop(&req->rl_free_mrs);
	if (!*mr) {
		*mr = rpcrdma_mr_get(r_xprt);
		if (!*mr)
			goto out_getmr_err;
		(*mr)->mr_req = req;
	}

	rpcrdma_mr_push(*mr, &req->rl_registered);
	return frwr_map(r_xprt, seg, nsegs, writing, req->rl_slot.rq_xid, *mr);

out_getmr_err:
	trace_xprtrdma_nomrs_err(r_xprt, req);
	xprt_wait_for_buffer_space(&r_xprt->rx_xprt);
	rpcrdma_mrs_refresh(r_xprt);
	return ERR_PTR(-EAGAIN);
}
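
/* Illustration only: for a position-zero Read chunk (pos == 0) built
 * from a Call with a 512-byte head kvec, 8192 bytes of page-aligned
 * page data, and an empty tail, rpcrdma_convert_iovs() above produces
 * three rpcrdma_mr_seg entries: one for the head kvec and one per page.
 * rpcrdma_mr_prepare() then hands those segments to frwr_map(), which
 * registers them as one or more MRs depending on how many segments the
 * device allows per MR.
 */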

/* Register and XDR encode the Read list. Supports encoding a list of read
 * segments that belong to a single read chunk.
 *
 * Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64):
 *
 *  Read chunklist (a linked list):
 *   N elements, position P (same P for all chunks of same arg!):
 *    1 - PHLOO - 1 - PHLOO - ... - 1 - PHLOO - 0
 *
 * Returns zero on success, or a negative errno if a failure occurred.
 * @xdr is advanced to the next position in the stream.
 *
 * Only a single @pos value is currently supported.
 */
static int rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt,
				    struct rpcrdma_req *req,
				    struct rpc_rqst *rqst,
				    enum rpcrdma_chunktype rtype)
{
	struct xdr_stream *xdr = &req->rl_stream;
	struct rpcrdma_mr_seg *seg;
	struct rpcrdma_mr *mr;
	unsigned int pos;
	int nsegs;

	if (rtype == rpcrdma_noch_pullup || rtype == rpcrdma_noch_mapped)
		goto done;

	pos = rqst->rq_snd_buf.head[0].iov_len;
	if (rtype == rpcrdma_areadch)
		pos = 0;
	seg = req->rl_segments;
	nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_snd_buf, pos,
				     rtype, seg);
	if (nsegs < 0)
		return nsegs;

	do {
		seg = rpcrdma_mr_prepare(r_xprt, req, seg, nsegs, false, &mr);
		if (IS_ERR(seg))
			return PTR_ERR(seg);

		if (encode_read_segment(xdr, mr, pos) < 0)
			return -EMSGSIZE;

		trace_xprtrdma_chunk_read(rqst->rq_task, pos, mr, nsegs);
		r_xprt->rx_stats.read_chunk_count++;
		nsegs -= mr->mr_nents;
	} while (nsegs);

done:
	if (xdr_stream_encode_item_absent(xdr) < 0)
		return -EMSGSIZE;
	return 0;
}
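
/* Illustration only: a Read list carrying one chunk that was registered
 * as two MRs is emitted by rpcrdma_encode_read_list() above as
 *
 *	1, P, H1, L1, OO1,  1, P, H2, L2, OO2,  0
 *
 * i.e. two read segments sharing the same position P, followed by the
 * list terminator, for a total of 2 * 6 + 1 = 13 XDR words.
 */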

/* Register and XDR encode the Write list. Supports encoding a list
 * containing one array of plain segments that belong to a single
 * write chunk.
 *
 * Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64):
 *
 *  Write chunklist (a list of (one) counted array):
 *   N elements:
 *    1 - N - HLOO - HLOO - ... - HLOO - 0
 *
 * Returns zero on success, or a negative errno if a failure occurred.
 * @xdr is advanced to the next position in the stream.
 *
 * Only a single Write chunk is currently supported.
 */
static int rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt,
				     struct rpcrdma_req *req,
				     struct rpc_rqst *rqst,
				     enum rpcrdma_chunktype wtype)
{
	struct xdr_stream *xdr = &req->rl_stream;
	struct rpcrdma_ep *ep = r_xprt->rx_ep;
	struct rpcrdma_mr_seg *seg;
	struct rpcrdma_mr *mr;
	int nsegs, nchunks;
	__be32 *segcount;

	if (wtype != rpcrdma_writech)
		goto done;

	seg = req->rl_segments;
	nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_rcv_buf,
				     rqst->rq_rcv_buf.head[0].iov_len,
				     wtype, seg);
	if (nsegs < 0)
		return nsegs;

	if (xdr_stream_encode_item_present(xdr) < 0)
		return -EMSGSIZE;
	segcount = xdr_reserve_space(xdr, sizeof(*segcount));
	if (unlikely(!segcount))
		return -EMSGSIZE;
	/* Actual value encoded below */

	nchunks = 0;
	do {
		seg = rpcrdma_mr_prepare(r_xprt, req, seg, nsegs, true, &mr);
		if (IS_ERR(seg))
			return PTR_ERR(seg);

		if (encode_rdma_segment(xdr, mr) < 0)
			return -EMSGSIZE;

		trace_xprtrdma_chunk_write(rqst->rq_task, mr, nsegs);
		r_xprt->rx_stats.write_chunk_count++;
		r_xprt->rx_stats.total_rdma_request += mr->mr_length;
		nchunks++;
		nsegs -= mr->mr_nents;
	} while (nsegs);

	if (xdr_pad_size(rqst->rq_rcv_buf.page_len)) {
		if (encode_rdma_segment(xdr, ep->re_write_pad_mr) < 0)
			return -EMSGSIZE;

		trace_xprtrdma_chunk_wp(rqst->rq_task, ep->re_write_pad_mr,
					nsegs);
		r_xprt->rx_stats.write_chunk_count++;
		r_xprt->rx_stats.total_rdma_request += mr->mr_length;
		nchunks++;
		nsegs -= mr->mr_nents;
	}

	/* Update count of segments in this Write chunk */
	*segcount = cpu_to_be32(nchunks);

done:
	if (xdr_stream_encode_item_absent(xdr) < 0)
		return -EMSGSIZE;
	return 0;
}
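
/* Illustration only: a Write list holding a single Write chunk that was
 * registered as three MRs, where the chunk's page_len is not a multiple
 * of 4, is emitted by rpcrdma_encode_write_list() above as
 *
 *	1, 4, H1 L1 OO1, H2 L2 OO2, H3 L3 OO3, Hp Lp OOp, 0
 *
 * The fourth segment (Hp Lp OOp) refers to the transport's reserved
 * write-pad MR (ep->re_write_pad_mr), giving the responder somewhere
 * to write the XDR roundup bytes; the trailing 0 terminates the list.
 */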

/* Register and XDR encode the Reply chunk. Supports encoding an array
 * of plain segments that belong to a single write (reply) chunk.
 *
 * Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64):
 *
 *  Reply chunk (a counted array):
 *   N elements:
 *    1 - N - HLOO - HLOO - ... - HLOO
 *
 * Returns zero on success, or a negative errno if a failure occurred.
 * @xdr is advanced to the next position in the stream.
 */
static int rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt,
				      struct rpcrdma_req *req,
				      struct rpc_rqst *rqst,
				      enum rpcrdma_chunktype wtype)
{
	struct xdr_stream *xdr = &req->rl_stream;
	struct rpcrdma_mr_seg *seg;
	struct rpcrdma_mr *mr;
	int nsegs, nchunks;
	__be32 *segcount;

	if (wtype != rpcrdma_replych) {
		if (xdr_stream_encode_item_absent(xdr) < 0)
			return -EMSGSIZE;
		return 0;
	}

	seg = req->rl_segments;
	nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_rcv_buf, 0, wtype, seg);
	if (nsegs < 0)
		return nsegs;

	if (xdr_stream_encode_item_present(xdr) < 0)
		return -EMSGSIZE;
	segcount = xdr_reserve_space(xdr, sizeof(*segcount));
	if (unlikely(!segcount))
		return -EMSGSIZE;
	/* Actual value encoded below */

	nchunks = 0;
	do {
		seg = rpcrdma_mr_prepare(r_xprt, req, seg, nsegs, true, &mr);
		if (IS_ERR(seg))
			return PTR_ERR(seg);

		if (encode_rdma_segment(xdr, mr) < 0)
			return -EMSGSIZE;

		trace_xprtrdma_chunk_reply(rqst->rq_task, mr, nsegs);
		r_xprt->rx_stats.reply_chunk_count++;
		r_xprt->rx_stats.total_rdma_request += mr->mr_length;
		nchunks++;
		nsegs -= mr->mr_nents;
	} while (nsegs);

	/* Update count of segments in the Reply chunk */
	*segcount = cpu_to_be32(nchunks);

	return 0;
}

static void rpcrdma_sendctx_done(struct kref *kref)
{
	struct rpcrdma_req *req =
		container_of(kref, struct rpcrdma_req, rl_kref);
	struct rpcrdma_rep *rep = req->rl_reply;

	rpcrdma_complete_rqst(rep);
	rep->rr_rxprt->rx_stats.reply_waits_for_send++;
}

/**
 * rpcrdma_sendctx_unmap - DMA-unmap Send buffer
 * @sc: sendctx containing SGEs to unmap
 *
 */
void rpcrdma_sendctx_unmap(struct rpcrdma_sendctx *sc)
{
	struct rpcrdma_regbuf *rb = sc->sc_req->rl_sendbuf;
	struct ib_sge *sge;

	if (!sc->sc_unmap_count)
		return;

	/* The first two SGEs contain the transport header and
	 * the inline buffer. These are always left mapped so
	 * they can be cheaply re-used.
	 */
	for (sge = &sc->sc_sges[2]; sc->sc_unmap_count;
	     ++sge, --sc->sc_unmap_count)
		ib_dma_unmap_page(rdmab_device(rb), sge->addr, sge->length,
				  DMA_TO_DEVICE);

	kref_put(&sc->sc_req->rl_kref, rpcrdma_sendctx_done);
}

/* Prepare an SGE for the RPC-over-RDMA transport header.
 */
static void rpcrdma_prepare_hdr_sge(struct rpcrdma_xprt *r_xprt,
				    struct rpcrdma_req *req, u32 len)
{
	struct rpcrdma_sendctx *sc = req->rl_sendctx;
	struct rpcrdma_regbuf *rb = req->rl_rdmabuf;
	struct ib_sge *sge = &sc->sc_sges[req->rl_wr.num_sge++];

	sge->addr = rdmab_addr(rb);
	sge->length = len;
	sge->lkey = rdmab_lkey(rb);

	ib_dma_sync_single_for_device(rdmab_device(rb), sge->addr, sge->length,
				      DMA_TO_DEVICE);
}
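
/* Illustration only: once the Send SGEs have been prepared, the Send
 * WR's scatter list typically looks like
 *
 *	sc_sges[0]  - transport header (rl_rdmabuf, always left mapped)
 *	sc_sges[1]  - head iovec / inline buffer (rl_sendbuf, always
 *		      left mapped)
 *	sc_sges[2+] - page-list and tail pages, DMA-mapped per Send and
 *		      counted in sc_unmap_count
 *
 * Only the entries from sc_sges[2] onward are unmapped by
 * rpcrdma_sendctx_unmap() above.
 */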

/* The head iovec is straightforward, as it is usually already
 * DMA-mapped. Sync the content that has changed.
 */
static bool rpcrdma_prepare_head_iov(struct rpcrdma_xprt *r_xprt,
				     struct rpcrdma_req *req, unsigned int len)
{
	struct rpcrdma_sendctx *sc = req->rl_sendctx;
	struct ib_sge *sge = &sc->sc_sges[req->rl_wr.num_sge++];
	struct rpcrdma_regbuf *rb = req->rl_sendbuf;

	if (!rpcrdma_regbuf_dma_map(r_xprt, rb))
		return false;

	sge->addr = rdmab_addr(rb);
	sge->length = len;
	sge->lkey = rdmab_lkey(rb);

	ib_dma_sync_single_for_device(rdmab_device(rb), sge->addr, sge->length,
				      DMA_TO_DEVICE);
	return true;
}

/* If there is a page list present, DMA map and prepare an
 * SGE for each page to be sent.
 */
static bool rpcrdma_prepare_pagelist(struct rpcrdma_req *req,
				     struct xdr_buf *xdr)
{
	struct rpcrdma_sendctx *sc = req->rl_sendctx;
	struct rpcrdma_regbuf *rb = req->rl_sendbuf;
	unsigned int page_base, len, remaining;
	struct page **ppages;
	struct ib_sge *sge;

	ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT);
	page_base = offset_in_page(xdr->page_base);
	remaining = xdr->page_len;
	while (remaining) {
		sge = &sc->sc_sges[req->rl_wr.num_sge++];
		len = min_t(unsigned int, PAGE_SIZE - page_base, remaining);
		sge->addr = ib_dma_map_page(rdmab_device(rb), *ppages,
					    page_base, len, DMA_TO_DEVICE);
		if (ib_dma_mapping_error(rdmab_device(rb), sge->addr))
			goto out_mapping_err;

		sge->length = len;
		sge->lkey = rdmab_lkey(rb);

		sc->sc_unmap_count++;
		ppages++;
		remaining -= len;
		page_base = 0;
	}

	return true;

out_mapping_err:
	trace_xprtrdma_dma_maperr(sge->addr);
	return false;
}

/* The tail iovec may include an XDR pad for the page list,
 * as well as additional content, and may not reside in the
 * same page as the head iovec.
 */
static bool rpcrdma_prepare_tail_iov(struct rpcrdma_req *req,
				     struct xdr_buf *xdr,
				     unsigned int page_base, unsigned int len)
{
	struct rpcrdma_sendctx *sc = req->rl_sendctx;
	struct ib_sge *sge = &sc->sc_sges[req->rl_wr.num_sge++];
	struct rpcrdma_regbuf *rb = req->rl_sendbuf;
	struct page *page = virt_to_page(xdr->tail[0].iov_base);

	sge->addr = ib_dma_map_page(rdmab_device(rb), page, page_base, len,
				    DMA_TO_DEVICE);
	if (ib_dma_mapping_error(rdmab_device(rb), sge->addr))
		goto out_mapping_err;

	sge->length = len;
	sge->lkey = rdmab_lkey(rb);
	++sc->sc_unmap_count;
	return true;

out_mapping_err:
	trace_xprtrdma_dma_maperr(sge->addr);
	return false;
}

/* Copy the tail to the end of the head buffer.
 */
static void rpcrdma_pullup_tail_iov(struct rpcrdma_xprt *r_xprt,
				    struct rpcrdma_req *req,
				    struct xdr_buf *xdr)
{
	unsigned char *dst;

	dst = (unsigned char *)xdr->head[0].iov_base;
	dst += xdr->head[0].iov_len + xdr->page_len;
	memmove(dst, xdr->tail[0].iov_base, xdr->tail[0].iov_len);
	r_xprt->rx_stats.pullup_copy_count += xdr->tail[0].iov_len;
}

/* Copy pagelist content into the head buffer.
 */
static void rpcrdma_pullup_pagelist(struct rpcrdma_xprt *r_xprt,
				    struct rpcrdma_req *req,
				    struct xdr_buf *xdr)
{
	unsigned int len, page_base, remaining;
	struct page **ppages;
	unsigned char *src, *dst;

	dst = (unsigned char *)xdr->head[0].iov_base;
	dst += xdr->head[0].iov_len;
	ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT);
	page_base = offset_in_page(xdr->page_base);
	remaining = xdr->page_len;
	while (remaining) {
		src = page_address(*ppages);
		src += page_base;
		len = min_t(unsigned int, PAGE_SIZE - page_base, remaining);
		memcpy(dst, src, len);
		r_xprt->rx_stats.pullup_copy_count += len;

		ppages++;
		dst += len;
		remaining -= len;
		page_base = 0;
	}
}

/* Copy the contents of @xdr into @rl_sendbuf and DMA sync it.
 * When the head, pagelist, and tail are small, a pull-up copy
 * is considerably less costly than DMA mapping the components
 * of @xdr.
 *
 * Assumptions:
 *  - the caller has already verified that the total length
 *    of the RPC Call body will fit into @rl_sendbuf.
 */
static bool rpcrdma_prepare_noch_pullup(struct rpcrdma_xprt *r_xprt,
					struct rpcrdma_req *req,
					struct xdr_buf *xdr)
{
	if (unlikely(xdr->tail[0].iov_len))
		rpcrdma_pullup_tail_iov(r_xprt, req, xdr);

	if (unlikely(xdr->page_len))
		rpcrdma_pullup_pagelist(r_xprt, req, xdr);

	/* The whole RPC message resides in the head iovec now */
	return rpcrdma_prepare_head_iov(r_xprt, req, xdr->len);
}

static bool rpcrdma_prepare_noch_mapped(struct rpcrdma_xprt *r_xprt,
					struct rpcrdma_req *req,
					struct xdr_buf *xdr)
{
	struct kvec *tail = &xdr->tail[0];

	if (!rpcrdma_prepare_head_iov(r_xprt, req, xdr->head[0].iov_len))
		return false;
	if (xdr->page_len)
		if (!rpcrdma_prepare_pagelist(req, xdr))
			return false;
	if (tail->iov_len)
		if (!rpcrdma_prepare_tail_iov(req, xdr,
					      offset_in_page(tail->iov_base),
					      tail->iov_len))
			return false;

	if (req->rl_sendctx->sc_unmap_count)
		kref_get(&req->rl_kref);
	return true;
}

static bool rpcrdma_prepare_readch(struct rpcrdma_xprt *r_xprt,
				   struct rpcrdma_req *req,
				   struct xdr_buf *xdr)
{
	if (!rpcrdma_prepare_head_iov(r_xprt, req, xdr->head[0].iov_len))
		return false;

	/* If there is a Read chunk, the page list is being handled
	 * via explicit RDMA, and thus is skipped here.
	 */

	/* Do not include the tail if it is only an XDR pad */
	if (xdr->tail[0].iov_len > 3) {
		unsigned int page_base, len;

		/* If the content in the page list is an odd length,
		 * xdr_write_pages() adds a pad at the beginning of
		 * the tail iovec. Force the tail's non-pad content to
		 * land at the next XDR position in the Send message.
		 */
		page_base = offset_in_page(xdr->tail[0].iov_base);
		len = xdr->tail[0].iov_len;
		page_base += len & 3;
		len -= len & 3;
		if (!rpcrdma_prepare_tail_iov(req, xdr, page_base, len))
			return false;
		kref_get(&req->rl_kref);
	}

	return true;
}
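
/* Illustration only: if the tail iovec of a Read-chunk Call is 7 bytes
 * long and begins with a 3-byte XDR pad left by xdr_write_pages(), the
 * logic in rpcrdma_prepare_readch() above advances page_base by
 * (7 & 3) = 3 and trims len to 4, so only the 4 bytes of real tail
 * content are added to the Send and the pad is never transmitted.
 */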

/**
 * rpcrdma_prepare_send_sges - Construct SGEs for a Send WR
 * @r_xprt: controlling transport
 * @req: context of RPC Call being marshalled
 * @hdrlen: size of transport header, in bytes
 * @xdr: xdr_buf containing RPC Call
 * @rtype: chunk type being encoded
 *
 * Returns 0 on success; otherwise a negative errno is returned.
 */
inline int rpcrdma_prepare_send_sges(struct rpcrdma_xprt *r_xprt,
				     struct rpcrdma_req *req, u32 hdrlen,
				     struct xdr_buf *xdr,
				     enum rpcrdma_chunktype rtype)
{
	int ret;

	ret = -EAGAIN;
	req->rl_sendctx = rpcrdma_sendctx_get_locked(r_xprt);
	if (!req->rl_sendctx)
		goto out_nosc;
	req->rl_sendctx->sc_unmap_count = 0;
	req->rl_sendctx->sc_req = req;
	kref_init(&req->rl_kref);
	req->rl_wr.wr_cqe = &req->rl_sendctx->sc_cqe;
	req->rl_wr.sg_list = req->rl_sendctx->sc_sges;
	req->rl_wr.num_sge = 0;
	req->rl_wr.opcode = IB_WR_SEND;

	rpcrdma_prepare_hdr_sge(r_xprt, req, hdrlen);

	ret = -EIO;
	switch (rtype) {
	case rpcrdma_noch_pullup:
		if (!rpcrdma_prepare_noch_pullup(r_xprt, req, xdr))
			goto out_unmap;
		break;
	case rpcrdma_noch_mapped:
		if (!rpcrdma_prepare_noch_mapped(r_xprt, req, xdr))
			goto out_unmap;
		break;
	case rpcrdma_readch:
		if (!rpcrdma_prepare_readch(r_xprt, req, xdr))
			goto out_unmap;
		break;
	case rpcrdma_areadch:
		break;
	default:
		goto out_unmap;
	}

	return 0;

out_unmap:
	rpcrdma_sendctx_unmap(req->rl_sendctx);
out_nosc:
	trace_xprtrdma_prepsend_failed(&req->rl_slot, ret);
	return ret;
}

/**
 * rpcrdma_marshal_req - Marshal and send one RPC request
 * @r_xprt: controlling transport
 * @rqst: RPC request to be marshaled
 *
 * For the RPC in "rqst", this function:
 *  - Chooses the transfer mode (e.g., RDMA_MSG or RDMA_NOMSG)
 *  - Registers Read, Write, and Reply chunks
 *  - Constructs the transport header
 *  - Posts a Send WR to send the transport header and request
 *
 * Returns:
 *	%0 if the RPC was sent successfully,
 *	%-ENOTCONN if the connection was lost,
 *	%-EAGAIN if the caller should call again with the same arguments,
 *	%-ENOBUFS if the caller should call again after a delay,
 *	%-EMSGSIZE if the transport header is too small,
 *	%-EIO if a permanent problem occurred while marshaling.
 */
int
rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst)
{
	struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
	struct xdr_stream *xdr = &req->rl_stream;
	enum rpcrdma_chunktype rtype, wtype;
	struct xdr_buf *buf = &rqst->rq_snd_buf;
	bool ddp_allowed;
	__be32 *p;
	int ret;

	if (unlikely(rqst->rq_rcv_buf.flags & XDRBUF_SPARSE_PAGES)) {
		ret = rpcrdma_alloc_sparse_pages(&rqst->rq_rcv_buf);
		if (ret)
			return ret;
	}

	rpcrdma_set_xdrlen(&req->rl_hdrbuf, 0);
	xdr_init_encode(xdr, &req->rl_hdrbuf, rdmab_data(req->rl_rdmabuf),
			rqst);

	/* Fixed header fields */
	ret = -EMSGSIZE;
	p = xdr_reserve_space(xdr, 4 * sizeof(*p));
	if (!p)
		goto out_err;
	*p++ = rqst->rq_xid;
	*p++ = rpcrdma_version;
	*p++ = r_xprt->rx_buf.rb_max_requests;

	/* When the ULP employs a GSS flavor that guarantees integrity
	 * or privacy, direct data placement of individual data items
	 * is not allowed.
	 */
	ddp_allowed = !test_bit(RPCAUTH_AUTH_DATATOUCH,
				&rqst->rq_cred->cr_auth->au_flags);

	/*
	 * Chunks needed for results?
	 *
	 * o If the expected result is under the inline threshold, all ops
	 *   return as inline.
	 * o Large read ops return data as write chunk(s), header as
	 *   inline.
	 * o Large non-read ops return as a single reply chunk.
	 */
	if (rpcrdma_results_inline(r_xprt, rqst))
		wtype = rpcrdma_noch;
	else if ((ddp_allowed && rqst->rq_rcv_buf.flags & XDRBUF_READ) &&
		 rpcrdma_nonpayload_inline(r_xprt, rqst))
		wtype = rpcrdma_writech;
	else
		wtype = rpcrdma_replych;

	/*
	 * Chunks needed for arguments?
	 *
	 * o If the total request is under the inline threshold, all ops
	 *   are sent as inline.
	 * o Large write ops transmit data as read chunk(s), header as
	 *   inline.
	 * o Large non-write ops are sent with the entire message as a
	 *   single read chunk (protocol 0-position special case).
	 *
	 * This assumes that the upper layer does not present a request
	 * that both has a data payload, and whose non-data arguments
	 * by themselves are larger than the inline threshold.
	 */
	if (rpcrdma_args_inline(r_xprt, rqst)) {
		*p++ = rdma_msg;
		rtype = buf->len < rdmab_length(req->rl_sendbuf) ?
			rpcrdma_noch_pullup : rpcrdma_noch_mapped;
	} else if (ddp_allowed && buf->flags & XDRBUF_WRITE) {
		*p++ = rdma_msg;
		rtype = rpcrdma_readch;
	} else {
		r_xprt->rx_stats.nomsg_call_count++;
		*p++ = rdma_nomsg;
		rtype = rpcrdma_areadch;
	}

	/* This implementation supports the following combinations
	 * of chunk lists in one RPC-over-RDMA Call message:
	 *
	 *   - Read list
	 *   - Write list
	 *   - Reply chunk
	 *   - Read list + Reply chunk
	 *
	 * It might not yet support the following combinations:
	 *
	 *   - Read list + Write list
	 *
	 * It does not support the following combinations:
	 *
	 *   - Write list + Reply chunk
	 *   - Read list + Write list + Reply chunk
	 *
	 * This implementation supports only a single chunk in each
	 * Read or Write list. Thus for example the client cannot
	 * send a Call message with a Position Zero Read chunk and a
	 * regular Read chunk at the same time.
	 */
	ret = rpcrdma_encode_read_list(r_xprt, req, rqst, rtype);
	if (ret)
		goto out_err;
	ret = rpcrdma_encode_write_list(r_xprt, req, rqst, wtype);
	if (ret)
		goto out_err;
	ret = rpcrdma_encode_reply_chunk(r_xprt, req, rqst, wtype);
	if (ret)
		goto out_err;

	ret = rpcrdma_prepare_send_sges(r_xprt, req, req->rl_hdrbuf.len,
					buf, rtype);
	if (ret)
		goto out_err;

	trace_xprtrdma_marshal(req, rtype, wtype);
	return 0;

out_err:
	trace_xprtrdma_marshal_failed(rqst, ret);
	r_xprt->rx_stats.failed_marshal_count++;
	frwr_reset(req);
	return ret;
}

static void __rpcrdma_update_cwnd_locked(struct rpc_xprt *xprt,
					 struct rpcrdma_buffer *buf,
					 u32 grant)
{
	buf->rb_credits = grant;
	xprt->cwnd = grant << RPC_CWNDSHIFT;
}

static void rpcrdma_update_cwnd(struct rpcrdma_xprt *r_xprt, u32 grant)
{
	struct rpc_xprt *xprt = &r_xprt->rx_xprt;

	spin_lock(&xprt->transport_lock);
	__rpcrdma_update_cwnd_locked(xprt, &r_xprt->rx_buf, grant);
	spin_unlock(&xprt->transport_lock);
}
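
/* Illustration only: assuming RPC_CWNDSHIFT is 8 (so RPC_CWNDSCALE is
 * 256), a credit grant of 128 from the server sets xprt->cwnd to
 * 128 << 8 = 32768, which the generic RPC congestion logic treats as
 * 128 request slots, one RPC_CWNDSCALE unit per granted credit. A
 * grant of 1, as after a reconnect, throttles the transport to a
 * single outstanding RPC until a larger grant arrives.
 */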

/**
 * rpcrdma_reset_cwnd - Reset the xprt's congestion window
 * @r_xprt: controlling transport instance
 *
 * Prepare @r_xprt for the next connection by reinitializing
 * its credit grant to one (see RFC 8166, Section 3.3.3).
 */
void rpcrdma_reset_cwnd(struct rpcrdma_xprt *r_xprt)
{
	struct rpc_xprt *xprt = &r_xprt->rx_xprt;

	spin_lock(&xprt->transport_lock);
	xprt->cong = 0;
	__rpcrdma_update_cwnd_locked(xprt, &r_xprt->rx_buf, 1);
	spin_unlock(&xprt->transport_lock);
}

/**
 * rpcrdma_inline_fixup - Scatter inline received data into rqst's iovecs
 * @rqst: controlling RPC request
 * @srcp: points to RPC message payload in receive buffer
 * @copy_len: remaining length of receive buffer content
 * @pad: Write chunk pad bytes needed (zero for pure inline)
 *
 * The upper layer has set the maximum number of bytes it can
 * receive in each component of rq_rcv_buf. These values are set in
 * the head.iov_len, page_len, tail.iov_len, and buflen fields.
 *
 * Unlike the TCP equivalent (xdr_partial_copy_from_skb), in
 * many cases this function simply updates iov_base pointers in
 * rq_rcv_buf to point directly to the received reply data, to
 * avoid copying reply data.
 *
 * Returns the count of bytes which had to be memcopied.
 */
static unsigned long
rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad)
{
	unsigned long fixup_copy_count;
	int i, npages, curlen;
	char *destp;
	struct page **ppages;
	int page_base;

	/* The head iovec is redirected to the RPC reply message
	 * in the receive buffer, to avoid a memcopy.
	 */
	rqst->rq_rcv_buf.head[0].iov_base = srcp;
	rqst->rq_private_buf.head[0].iov_base = srcp;

	/* The contents of the receive buffer that follow
	 * head.iov_len bytes are copied into the page list.
	 */
	curlen = rqst->rq_rcv_buf.head[0].iov_len;
	if (curlen > copy_len)
		curlen = copy_len;
	srcp += curlen;
	copy_len -= curlen;

	ppages = rqst->rq_rcv_buf.pages +
		(rqst->rq_rcv_buf.page_base >> PAGE_SHIFT);
	page_base = offset_in_page(rqst->rq_rcv_buf.page_base);
	fixup_copy_count = 0;
	if (copy_len && rqst->rq_rcv_buf.page_len) {
		int pagelist_len;

		pagelist_len = rqst->rq_rcv_buf.page_len;
		if (pagelist_len > copy_len)
			pagelist_len = copy_len;
		npages = PAGE_ALIGN(page_base + pagelist_len) >> PAGE_SHIFT;
		for (i = 0; i < npages; i++) {
			curlen = PAGE_SIZE - page_base;
			if (curlen > pagelist_len)
				curlen = pagelist_len;

			destp = kmap_atomic(ppages[i]);
			memcpy(destp + page_base, srcp, curlen);
			flush_dcache_page(ppages[i]);
			kunmap_atomic(destp);
			srcp += curlen;
			copy_len -= curlen;
			fixup_copy_count += curlen;
			pagelist_len -= curlen;
			if (!pagelist_len)
				break;
			page_base = 0;
		}

		/* Implicit padding for the last segment in a Write
		 * chunk is inserted inline at the front of the tail
		 * iovec. The upper layer ignores the content of
		 * the pad. Simply ensure inline content in the tail
		 * that follows the Write chunk is properly aligned.
		 */
		if (pad)
			srcp -= pad;
	}

	/* The tail iovec is redirected to the remaining data
	 * in the receive buffer, to avoid a memcopy.
	 */
	if (copy_len || pad) {
		rqst->rq_rcv_buf.tail[0].iov_base = srcp;
		rqst->rq_private_buf.tail[0].iov_base = srcp;
	}

	if (fixup_copy_count)
		trace_xprtrdma_fixup(rqst, fixup_copy_count);
	return fixup_copy_count;
}

/* By convention, backchannel calls arrive via rdma_msg type
 * messages, and never populate the chunk lists. This makes
 * the RPC/RDMA header small and fixed in size, so it is
 * straightforward to check the RPC header's direction field.
 */
static bool
rpcrdma_is_bcall(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep)
#if defined(CONFIG_SUNRPC_BACKCHANNEL)
{
	struct xdr_stream *xdr = &rep->rr_stream;
	__be32 *p;

	if (rep->rr_proc != rdma_msg)
		return false;

	/* Peek at stream contents without advancing. */
	p = xdr_inline_decode(xdr, 0);

	/* Chunk lists */
	if (xdr_item_is_present(p++))
		return false;
	if (xdr_item_is_present(p++))
		return false;
	if (xdr_item_is_present(p++))
		return false;

	/* RPC header */
	if (*p++ != rep->rr_xid)
		return false;
	if (*p != cpu_to_be32(RPC_CALL))
		return false;

	/* Now that we are sure this is a backchannel call,
	 * advance to the RPC header.
	 */
	p = xdr_inline_decode(xdr, 3 * sizeof(*p));
	if (unlikely(!p))
		return true;

	rpcrdma_bc_receive_call(r_xprt, rep);
	return true;
}
#else	/* CONFIG_SUNRPC_BACKCHANNEL */
{
	return false;
}
#endif	/* CONFIG_SUNRPC_BACKCHANNEL */

static int decode_rdma_segment(struct xdr_stream *xdr, u32 *length)
{
	u32 handle;
	u64 offset;
	__be32 *p;

	p = xdr_inline_decode(xdr, 4 * sizeof(*p));
	if (unlikely(!p))
		return -EIO;

	xdr_decode_rdma_segment(p, &handle, length, &offset);
	trace_xprtrdma_decode_seg(handle, *length, offset);
	return 0;
}

static int decode_write_chunk(struct xdr_stream *xdr, u32 *length)
{
	u32 segcount, seglength;
	__be32 *p;

	p = xdr_inline_decode(xdr, sizeof(*p));
	if (unlikely(!p))
		return -EIO;

	*length = 0;
	segcount = be32_to_cpup(p);
	while (segcount--) {
		if (decode_rdma_segment(xdr, &seglength))
			return -EIO;
		*length += seglength;
	}

	return 0;
}
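
/* Illustration only: a Reply whose Write list carries one chunk with
 * two segments appears on the wire as
 *
 *	1 (chunk present), 2 (segment count),
 *	H1, L1, OO1,  H2, L2, OO2,  0 (no more chunks)
 *
 * decode_write_chunk() above consumes the segment count and both HLOO
 * segments, returning *length = L1 + L2, which the callers below use
 * to account for data the responder placed via RDMA Write.
 */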

/* In RPC-over-RDMA Version One replies, a Read list is never
 * expected. This decoder is a stub that returns an error if
 * a Read list is present.
 */
static int decode_read_list(struct xdr_stream *xdr)
{
	__be32 *p;

	p = xdr_inline_decode(xdr, sizeof(*p));
	if (unlikely(!p))
		return -EIO;
	if (unlikely(xdr_item_is_present(p)))
		return -EIO;
	return 0;
}

/* Supports only one Write chunk in the Write list
 */
static int decode_write_list(struct xdr_stream *xdr, u32 *length)
{
	u32 chunklen;
	bool first;
	__be32 *p;

	*length = 0;
	first = true;
	do {
		p = xdr_inline_decode(xdr, sizeof(*p));
		if (unlikely(!p))
			return -EIO;
		if (xdr_item_is_absent(p))
			break;
		if (!first)
			return -EIO;

		if (decode_write_chunk(xdr, &chunklen))
			return -EIO;
		*length += chunklen;
		first = false;
	} while (true);
	return 0;
}

static int decode_reply_chunk(struct xdr_stream *xdr, u32 *length)
{
	__be32 *p;

	p = xdr_inline_decode(xdr, sizeof(*p));
	if (unlikely(!p))
		return -EIO;

	*length = 0;
	if (xdr_item_is_present(p))
		if (decode_write_chunk(xdr, length))
			return -EIO;
	return 0;
}

static int
rpcrdma_decode_msg(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep,
		   struct rpc_rqst *rqst)
{
	struct xdr_stream *xdr = &rep->rr_stream;
	u32 writelist, replychunk, rpclen;
	char *base;

	/* Decode the chunk lists */
	if (decode_read_list(xdr))
		return -EIO;
	if (decode_write_list(xdr, &writelist))
		return -EIO;
	if (decode_reply_chunk(xdr, &replychunk))
		return -EIO;

	/* RDMA_MSG sanity checks */
	if (unlikely(replychunk))
		return -EIO;

	/* Build the RPC reply's Payload stream in rqst->rq_rcv_buf */
	base = (char *)xdr_inline_decode(xdr, 0);
	rpclen = xdr_stream_remaining(xdr);
	r_xprt->rx_stats.fixup_copy_count +=
		rpcrdma_inline_fixup(rqst, base, rpclen, writelist & 3);

	r_xprt->rx_stats.total_rdma_reply += writelist;
	return rpclen + xdr_align_size(writelist);
}

static noinline int
rpcrdma_decode_nomsg(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep)
{
	struct xdr_stream *xdr = &rep->rr_stream;
	u32 writelist, replychunk;

	/* Decode the chunk lists */
	if (decode_read_list(xdr))
		return -EIO;
	if (decode_write_list(xdr, &writelist))
		return -EIO;
	if (decode_reply_chunk(xdr, &replychunk))
		return -EIO;

	/* RDMA_NOMSG sanity checks */
	if (unlikely(writelist))
		return -EIO;
	if (unlikely(!replychunk))
		return -EIO;

	/* Reply chunk buffer already is the reply vector */
	r_xprt->rx_stats.total_rdma_reply += replychunk;
	return replychunk;
}

static noinline int
rpcrdma_decode_error(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep,
		     struct rpc_rqst *rqst)
{
	struct xdr_stream *xdr = &rep->rr_stream;
	__be32 *p;

	p = xdr_inline_decode(xdr, sizeof(*p));
	if (unlikely(!p))
		return -EIO;

	switch (*p) {
	case err_vers:
		p = xdr_inline_decode(xdr, 2 * sizeof(*p));
		if (!p)
			break;
		trace_xprtrdma_err_vers(rqst, p, p + 1);
		break;
	case err_chunk:
		trace_xprtrdma_err_chunk(rqst);
		break;
	default:
		trace_xprtrdma_err_unrecognized(rqst, p);
	}

	return -EIO;
}
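
/* Illustration only: an RDMA_ERROR reply of type err_vers carries two
 * additional XDR words, the lowest and highest RPC-over-RDMA versions
 * the responder supports. rpcrdma_decode_error() above reports them
 * via the err_vers tracepoint and then returns -EIO to the caller.
 */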

/**
 * rpcrdma_unpin_rqst - Release rqst without completing it
 * @rep: RPC/RDMA Receive context
 *
 * This is done when a connection is lost so that a Reply
 * can be dropped and its matching Call can be subsequently
 * retransmitted on a new connection.
 */
void rpcrdma_unpin_rqst(struct rpcrdma_rep *rep)
{
	struct rpc_xprt *xprt = &rep->rr_rxprt->rx_xprt;
	struct rpc_rqst *rqst = rep->rr_rqst;
	struct rpcrdma_req *req = rpcr_to_rdmar(rqst);

	req->rl_reply = NULL;
	rep->rr_rqst = NULL;

	spin_lock(&xprt->queue_lock);
	xprt_unpin_rqst(rqst);
	spin_unlock(&xprt->queue_lock);
}

/**
 * rpcrdma_complete_rqst - Pass completed rqst back to RPC
 * @rep: RPC/RDMA Receive context
 *
 * Reconstruct the RPC reply and complete the transaction
 * while @rqst is still pinned to ensure the rep, rqst, and
 * rq_task pointers remain stable.
 */
void rpcrdma_complete_rqst(struct rpcrdma_rep *rep)
{
	struct rpcrdma_xprt *r_xprt = rep->rr_rxprt;
	struct rpc_xprt *xprt = &r_xprt->rx_xprt;
	struct rpc_rqst *rqst = rep->rr_rqst;
	int status;

	switch (rep->rr_proc) {
	case rdma_msg:
		status = rpcrdma_decode_msg(r_xprt, rep, rqst);
		break;
	case rdma_nomsg:
		status = rpcrdma_decode_nomsg(r_xprt, rep);
		break;
	case rdma_error:
		status = rpcrdma_decode_error(r_xprt, rep, rqst);
		break;
	default:
		status = -EIO;
	}
	if (status < 0)
		goto out_badheader;

out:
	spin_lock(&xprt->queue_lock);
	xprt_complete_rqst(rqst->rq_task, status);
	xprt_unpin_rqst(rqst);
	spin_unlock(&xprt->queue_lock);
	return;

out_badheader:
	trace_xprtrdma_reply_hdr_err(rep);
	r_xprt->rx_stats.bad_reply_count++;
	rqst->rq_task->tk_status = status;
	status = 0;
	goto out;
}

static void rpcrdma_reply_done(struct kref *kref)
{
	struct rpcrdma_req *req =
		container_of(kref, struct rpcrdma_req, rl_kref);

	rpcrdma_complete_rqst(req->rl_reply);
}
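
/* Illustration only: req->rl_kref starts at one for the Receive side,
 * and the Send side takes an extra reference when it has DMA-mapped
 * SGEs that must be unmapped after the Send completes. Whichever
 * kref_put() drops the last reference (rpcrdma_sendctx_done() or
 * rpcrdma_reply_done()) calls rpcrdma_complete_rqst(), so an RPC is
 * not completed while its Send buffers are still mapped for DMA.
 */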

/**
 * rpcrdma_reply_handler - Process received RPC/RDMA messages
 * @rep: Incoming rpcrdma_rep object to process
 *
 * Errors must result in the RPC task either being awakened, or
 * allowed to timeout, to discover the errors at that time.
 */
void rpcrdma_reply_handler(struct rpcrdma_rep *rep)
{
	struct rpcrdma_xprt *r_xprt = rep->rr_rxprt;
	struct rpc_xprt *xprt = &r_xprt->rx_xprt;
	struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
	struct rpcrdma_req *req;
	struct rpc_rqst *rqst;
	u32 credits;
	__be32 *p;

	/* Any data means we had a useful conversation, so
	 * then we don't need to delay the next reconnect.
	 */
	if (xprt->reestablish_timeout)
		xprt->reestablish_timeout = 0;

	/* Fixed transport header fields */
	xdr_init_decode(&rep->rr_stream, &rep->rr_hdrbuf,
			rep->rr_hdrbuf.head[0].iov_base, NULL);
	p = xdr_inline_decode(&rep->rr_stream, 4 * sizeof(*p));
	if (unlikely(!p))
		goto out_shortreply;
	rep->rr_xid = *p++;
	rep->rr_vers = *p++;
	credits = be32_to_cpu(*p++);
	rep->rr_proc = *p++;

	if (rep->rr_vers != rpcrdma_version)
		goto out_badversion;

	if (rpcrdma_is_bcall(r_xprt, rep))
		return;

	/* Match incoming rpcrdma_rep to an rpcrdma_req to
	 * get context for handling any incoming chunks.
	 */
	spin_lock(&xprt->queue_lock);
	rqst = xprt_lookup_rqst(xprt, rep->rr_xid);
	if (!rqst)
		goto out_norqst;
	xprt_pin_rqst(rqst);
	spin_unlock(&xprt->queue_lock);

	if (credits == 0)
		credits = 1;	/* don't deadlock */
	else if (credits > r_xprt->rx_ep->re_max_requests)
		credits = r_xprt->rx_ep->re_max_requests;
	rpcrdma_post_recvs(r_xprt, credits + (buf->rb_bc_srv_max_requests << 1),
			   false);
	if (buf->rb_credits != credits)
		rpcrdma_update_cwnd(r_xprt, credits);

	req = rpcr_to_rdmar(rqst);
	if (unlikely(req->rl_reply))
		rpcrdma_rep_put(buf, req->rl_reply);
	req->rl_reply = rep;
	rep->rr_rqst = rqst;

	trace_xprtrdma_reply(rqst->rq_task, rep, credits);

	if (rep->rr_wc_flags & IB_WC_WITH_INVALIDATE)
		frwr_reminv(rep, &req->rl_registered);
	if (!list_empty(&req->rl_registered))
		frwr_unmap_async(r_xprt, req);
		/* LocalInv completion will complete the RPC */
	else
		kref_put(&req->rl_kref, rpcrdma_reply_done);
	return;

out_badversion:
	trace_xprtrdma_reply_vers_err(rep);
	goto out;

out_norqst:
	spin_unlock(&xprt->queue_lock);
	trace_xprtrdma_reply_rqst_err(rep);
	goto out;

out_shortreply:
	trace_xprtrdma_reply_short_err(rep);

out:
	rpcrdma_rep_put(buf, rep);
}