1 // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause 2 /* 3 * Copyright (c) 2014-2017 Oracle. All rights reserved. 4 * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved. 5 * 6 * This software is available to you under a choice of one of two 7 * licenses. You may choose to be licensed under the terms of the GNU 8 * General Public License (GPL) Version 2, available from the file 9 * COPYING in the main directory of this source tree, or the BSD-type 10 * license below: 11 * 12 * Redistribution and use in source and binary forms, with or without 13 * modification, are permitted provided that the following conditions 14 * are met: 15 * 16 * Redistributions of source code must retain the above copyright 17 * notice, this list of conditions and the following disclaimer. 18 * 19 * Redistributions in binary form must reproduce the above 20 * copyright notice, this list of conditions and the following 21 * disclaimer in the documentation and/or other materials provided 22 * with the distribution. 23 * 24 * Neither the name of the Network Appliance, Inc. nor the names of 25 * its contributors may be used to endorse or promote products 26 * derived from this software without specific prior written 27 * permission. 28 * 29 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 30 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 31 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 32 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 33 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 34 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 35 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 36 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 37 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 38 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 39 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 40 */ 41 42 /* 43 * rpc_rdma.c 44 * 45 * This file contains the guts of the RPC RDMA protocol, and 46 * does marshaling/unmarshaling, etc. It is also where interfacing 47 * to the Linux RPC framework lives. 48 */ 49 50 #include <linux/highmem.h> 51 52 #include <linux/sunrpc/svc_rdma.h> 53 54 #include "xprt_rdma.h" 55 #include <trace/events/rpcrdma.h> 56 57 #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) 58 # define RPCDBG_FACILITY RPCDBG_TRANS 59 #endif 60 61 /* Returns size of largest RPC-over-RDMA header in a Call message 62 * 63 * The largest Call header contains a full-size Read list and a 64 * minimal Reply chunk. 65 */ 66 static unsigned int rpcrdma_max_call_header_size(unsigned int maxsegs) 67 { 68 unsigned int size; 69 70 /* Fixed header fields and list discriminators */ 71 size = RPCRDMA_HDRLEN_MIN; 72 73 /* Maximum Read list size */ 74 size = maxsegs * rpcrdma_readchunk_maxsz * sizeof(__be32); 75 76 /* Minimal Read chunk size */ 77 size += sizeof(__be32); /* segment count */ 78 size += rpcrdma_segment_maxsz * sizeof(__be32); 79 size += sizeof(__be32); /* list discriminator */ 80 81 return size; 82 } 83 84 /* Returns size of largest RPC-over-RDMA header in a Reply message 85 * 86 * There is only one Write list or one Reply chunk per Reply 87 * message. The larger list is the Write list. 88 */ 89 static unsigned int rpcrdma_max_reply_header_size(unsigned int maxsegs) 90 { 91 unsigned int size; 92 93 /* Fixed header fields and list discriminators */ 94 size = RPCRDMA_HDRLEN_MIN; 95 96 /* Maximum Write list size */ 97 size = sizeof(__be32); /* segment count */ 98 size += maxsegs * rpcrdma_segment_maxsz * sizeof(__be32); 99 size += sizeof(__be32); /* list discriminator */ 100 101 return size; 102 } 103 104 /** 105 * rpcrdma_set_max_header_sizes - Initialize inline payload sizes 106 * @r_xprt: transport instance to initialize 107 * 108 * The max_inline fields contain the maximum size of an RPC message 109 * so the marshaling code doesn't have to repeat this calculation 110 * for every RPC. 111 */ 112 void rpcrdma_set_max_header_sizes(struct rpcrdma_xprt *r_xprt) 113 { 114 unsigned int maxsegs = r_xprt->rx_ia.ri_max_segs; 115 struct rpcrdma_ep *ep = &r_xprt->rx_ep; 116 117 ep->rep_max_inline_send = 118 ep->rep_inline_send - rpcrdma_max_call_header_size(maxsegs); 119 ep->rep_max_inline_recv = 120 ep->rep_inline_recv - rpcrdma_max_reply_header_size(maxsegs); 121 } 122 123 /* The client can send a request inline as long as the RPCRDMA header 124 * plus the RPC call fit under the transport's inline limit. If the 125 * combined call message size exceeds that limit, the client must use 126 * a Read chunk for this operation. 127 * 128 * A Read chunk is also required if sending the RPC call inline would 129 * exceed this device's max_sge limit. 130 */ 131 static bool rpcrdma_args_inline(struct rpcrdma_xprt *r_xprt, 132 struct rpc_rqst *rqst) 133 { 134 struct xdr_buf *xdr = &rqst->rq_snd_buf; 135 unsigned int count, remaining, offset; 136 137 if (xdr->len > r_xprt->rx_ep.rep_max_inline_send) 138 return false; 139 140 if (xdr->page_len) { 141 remaining = xdr->page_len; 142 offset = offset_in_page(xdr->page_base); 143 count = RPCRDMA_MIN_SEND_SGES; 144 while (remaining) { 145 remaining -= min_t(unsigned int, 146 PAGE_SIZE - offset, remaining); 147 offset = 0; 148 if (++count > r_xprt->rx_ia.ri_max_send_sges) 149 return false; 150 } 151 } 152 153 return true; 154 } 155 156 /* The client can't know how large the actual reply will be. Thus it 157 * plans for the largest possible reply for that particular ULP 158 * operation. If the maximum combined reply message size exceeds that 159 * limit, the client must provide a write list or a reply chunk for 160 * this request. 161 */ 162 static bool rpcrdma_results_inline(struct rpcrdma_xprt *r_xprt, 163 struct rpc_rqst *rqst) 164 { 165 return rqst->rq_rcv_buf.buflen <= r_xprt->rx_ep.rep_max_inline_recv; 166 } 167 168 /* The client is required to provide a Reply chunk if the maximum 169 * size of the non-payload part of the RPC Reply is larger than 170 * the inline threshold. 171 */ 172 static bool 173 rpcrdma_nonpayload_inline(const struct rpcrdma_xprt *r_xprt, 174 const struct rpc_rqst *rqst) 175 { 176 const struct xdr_buf *buf = &rqst->rq_rcv_buf; 177 178 return (buf->head[0].iov_len + buf->tail[0].iov_len) < 179 r_xprt->rx_ep.rep_max_inline_recv; 180 } 181 182 /* Split @vec on page boundaries into SGEs. FMR registers pages, not 183 * a byte range. Other modes coalesce these SGEs into a single MR 184 * when they can. 185 * 186 * Returns pointer to next available SGE, and bumps the total number 187 * of SGEs consumed. 188 */ 189 static struct rpcrdma_mr_seg * 190 rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg, 191 unsigned int *n) 192 { 193 u32 remaining, page_offset; 194 char *base; 195 196 base = vec->iov_base; 197 page_offset = offset_in_page(base); 198 remaining = vec->iov_len; 199 while (remaining) { 200 seg->mr_page = NULL; 201 seg->mr_offset = base; 202 seg->mr_len = min_t(u32, PAGE_SIZE - page_offset, remaining); 203 remaining -= seg->mr_len; 204 base += seg->mr_len; 205 ++seg; 206 ++(*n); 207 page_offset = 0; 208 } 209 return seg; 210 } 211 212 /* Convert @xdrbuf into SGEs no larger than a page each. As they 213 * are registered, these SGEs are then coalesced into RDMA segments 214 * when the selected memreg mode supports it. 215 * 216 * Returns positive number of SGEs consumed, or a negative errno. 217 */ 218 219 static int 220 rpcrdma_convert_iovs(struct rpcrdma_xprt *r_xprt, struct xdr_buf *xdrbuf, 221 unsigned int pos, enum rpcrdma_chunktype type, 222 struct rpcrdma_mr_seg *seg) 223 { 224 unsigned long page_base; 225 unsigned int len, n; 226 struct page **ppages; 227 228 n = 0; 229 if (pos == 0) 230 seg = rpcrdma_convert_kvec(&xdrbuf->head[0], seg, &n); 231 232 len = xdrbuf->page_len; 233 ppages = xdrbuf->pages + (xdrbuf->page_base >> PAGE_SHIFT); 234 page_base = offset_in_page(xdrbuf->page_base); 235 while (len) { 236 /* ACL likes to be lazy in allocating pages - ACLs 237 * are small by default but can get huge. 238 */ 239 if (unlikely(xdrbuf->flags & XDRBUF_SPARSE_PAGES)) { 240 if (!*ppages) 241 *ppages = alloc_page(GFP_NOWAIT | __GFP_NOWARN); 242 if (!*ppages) 243 return -ENOBUFS; 244 } 245 seg->mr_page = *ppages; 246 seg->mr_offset = (char *)page_base; 247 seg->mr_len = min_t(u32, PAGE_SIZE - page_base, len); 248 len -= seg->mr_len; 249 ++ppages; 250 ++seg; 251 ++n; 252 page_base = 0; 253 } 254 255 /* When encoding a Read chunk, the tail iovec contains an 256 * XDR pad and may be omitted. 257 */ 258 if (type == rpcrdma_readch && r_xprt->rx_ia.ri_implicit_roundup) 259 goto out; 260 261 /* When encoding a Write chunk, some servers need to see an 262 * extra segment for non-XDR-aligned Write chunks. The upper 263 * layer provides space in the tail iovec that may be used 264 * for this purpose. 265 */ 266 if (type == rpcrdma_writech && r_xprt->rx_ia.ri_implicit_roundup) 267 goto out; 268 269 if (xdrbuf->tail[0].iov_len) 270 seg = rpcrdma_convert_kvec(&xdrbuf->tail[0], seg, &n); 271 272 out: 273 if (unlikely(n > RPCRDMA_MAX_SEGS)) 274 return -EIO; 275 return n; 276 } 277 278 static inline int 279 encode_item_present(struct xdr_stream *xdr) 280 { 281 __be32 *p; 282 283 p = xdr_reserve_space(xdr, sizeof(*p)); 284 if (unlikely(!p)) 285 return -EMSGSIZE; 286 287 *p = xdr_one; 288 return 0; 289 } 290 291 static inline int 292 encode_item_not_present(struct xdr_stream *xdr) 293 { 294 __be32 *p; 295 296 p = xdr_reserve_space(xdr, sizeof(*p)); 297 if (unlikely(!p)) 298 return -EMSGSIZE; 299 300 *p = xdr_zero; 301 return 0; 302 } 303 304 static void 305 xdr_encode_rdma_segment(__be32 *iptr, struct rpcrdma_mr *mr) 306 { 307 *iptr++ = cpu_to_be32(mr->mr_handle); 308 *iptr++ = cpu_to_be32(mr->mr_length); 309 xdr_encode_hyper(iptr, mr->mr_offset); 310 } 311 312 static int 313 encode_rdma_segment(struct xdr_stream *xdr, struct rpcrdma_mr *mr) 314 { 315 __be32 *p; 316 317 p = xdr_reserve_space(xdr, 4 * sizeof(*p)); 318 if (unlikely(!p)) 319 return -EMSGSIZE; 320 321 xdr_encode_rdma_segment(p, mr); 322 return 0; 323 } 324 325 static int 326 encode_read_segment(struct xdr_stream *xdr, struct rpcrdma_mr *mr, 327 u32 position) 328 { 329 __be32 *p; 330 331 p = xdr_reserve_space(xdr, 6 * sizeof(*p)); 332 if (unlikely(!p)) 333 return -EMSGSIZE; 334 335 *p++ = xdr_one; /* Item present */ 336 *p++ = cpu_to_be32(position); 337 xdr_encode_rdma_segment(p, mr); 338 return 0; 339 } 340 341 static struct rpcrdma_mr_seg *rpcrdma_mr_prepare(struct rpcrdma_xprt *r_xprt, 342 struct rpcrdma_req *req, 343 struct rpcrdma_mr_seg *seg, 344 int nsegs, bool writing, 345 struct rpcrdma_mr **mr) 346 { 347 *mr = rpcrdma_mr_pop(&req->rl_free_mrs); 348 if (!*mr) { 349 *mr = rpcrdma_mr_get(r_xprt); 350 if (!*mr) 351 goto out_getmr_err; 352 trace_xprtrdma_mr_get(req); 353 (*mr)->mr_req = req; 354 } 355 356 rpcrdma_mr_push(*mr, &req->rl_registered); 357 return frwr_map(r_xprt, seg, nsegs, writing, req->rl_slot.rq_xid, *mr); 358 359 out_getmr_err: 360 trace_xprtrdma_nomrs(req); 361 xprt_wait_for_buffer_space(&r_xprt->rx_xprt); 362 rpcrdma_mrs_refresh(r_xprt); 363 return ERR_PTR(-EAGAIN); 364 } 365 366 /* Register and XDR encode the Read list. Supports encoding a list of read 367 * segments that belong to a single read chunk. 368 * 369 * Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64): 370 * 371 * Read chunklist (a linked list): 372 * N elements, position P (same P for all chunks of same arg!): 373 * 1 - PHLOO - 1 - PHLOO - ... - 1 - PHLOO - 0 374 * 375 * Returns zero on success, or a negative errno if a failure occurred. 376 * @xdr is advanced to the next position in the stream. 377 * 378 * Only a single @pos value is currently supported. 379 */ 380 static int rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, 381 struct rpcrdma_req *req, 382 struct rpc_rqst *rqst, 383 enum rpcrdma_chunktype rtype) 384 { 385 struct xdr_stream *xdr = &req->rl_stream; 386 struct rpcrdma_mr_seg *seg; 387 struct rpcrdma_mr *mr; 388 unsigned int pos; 389 int nsegs; 390 391 if (rtype == rpcrdma_noch_pullup || rtype == rpcrdma_noch_mapped) 392 goto done; 393 394 pos = rqst->rq_snd_buf.head[0].iov_len; 395 if (rtype == rpcrdma_areadch) 396 pos = 0; 397 seg = req->rl_segments; 398 nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_snd_buf, pos, 399 rtype, seg); 400 if (nsegs < 0) 401 return nsegs; 402 403 do { 404 seg = rpcrdma_mr_prepare(r_xprt, req, seg, nsegs, false, &mr); 405 if (IS_ERR(seg)) 406 return PTR_ERR(seg); 407 408 if (encode_read_segment(xdr, mr, pos) < 0) 409 return -EMSGSIZE; 410 411 trace_xprtrdma_chunk_read(rqst->rq_task, pos, mr, nsegs); 412 r_xprt->rx_stats.read_chunk_count++; 413 nsegs -= mr->mr_nents; 414 } while (nsegs); 415 416 done: 417 return encode_item_not_present(xdr); 418 } 419 420 /* Register and XDR encode the Write list. Supports encoding a list 421 * containing one array of plain segments that belong to a single 422 * write chunk. 423 * 424 * Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64): 425 * 426 * Write chunklist (a list of (one) counted array): 427 * N elements: 428 * 1 - N - HLOO - HLOO - ... - HLOO - 0 429 * 430 * Returns zero on success, or a negative errno if a failure occurred. 431 * @xdr is advanced to the next position in the stream. 432 * 433 * Only a single Write chunk is currently supported. 434 */ 435 static int rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, 436 struct rpcrdma_req *req, 437 struct rpc_rqst *rqst, 438 enum rpcrdma_chunktype wtype) 439 { 440 struct xdr_stream *xdr = &req->rl_stream; 441 struct rpcrdma_mr_seg *seg; 442 struct rpcrdma_mr *mr; 443 int nsegs, nchunks; 444 __be32 *segcount; 445 446 if (wtype != rpcrdma_writech) 447 goto done; 448 449 seg = req->rl_segments; 450 nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_rcv_buf, 451 rqst->rq_rcv_buf.head[0].iov_len, 452 wtype, seg); 453 if (nsegs < 0) 454 return nsegs; 455 456 if (encode_item_present(xdr) < 0) 457 return -EMSGSIZE; 458 segcount = xdr_reserve_space(xdr, sizeof(*segcount)); 459 if (unlikely(!segcount)) 460 return -EMSGSIZE; 461 /* Actual value encoded below */ 462 463 nchunks = 0; 464 do { 465 seg = rpcrdma_mr_prepare(r_xprt, req, seg, nsegs, true, &mr); 466 if (IS_ERR(seg)) 467 return PTR_ERR(seg); 468 469 if (encode_rdma_segment(xdr, mr) < 0) 470 return -EMSGSIZE; 471 472 trace_xprtrdma_chunk_write(rqst->rq_task, mr, nsegs); 473 r_xprt->rx_stats.write_chunk_count++; 474 r_xprt->rx_stats.total_rdma_request += mr->mr_length; 475 nchunks++; 476 nsegs -= mr->mr_nents; 477 } while (nsegs); 478 479 /* Update count of segments in this Write chunk */ 480 *segcount = cpu_to_be32(nchunks); 481 482 done: 483 return encode_item_not_present(xdr); 484 } 485 486 /* Register and XDR encode the Reply chunk. Supports encoding an array 487 * of plain segments that belong to a single write (reply) chunk. 488 * 489 * Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64): 490 * 491 * Reply chunk (a counted array): 492 * N elements: 493 * 1 - N - HLOO - HLOO - ... - HLOO 494 * 495 * Returns zero on success, or a negative errno if a failure occurred. 496 * @xdr is advanced to the next position in the stream. 497 */ 498 static int rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, 499 struct rpcrdma_req *req, 500 struct rpc_rqst *rqst, 501 enum rpcrdma_chunktype wtype) 502 { 503 struct xdr_stream *xdr = &req->rl_stream; 504 struct rpcrdma_mr_seg *seg; 505 struct rpcrdma_mr *mr; 506 int nsegs, nchunks; 507 __be32 *segcount; 508 509 if (wtype != rpcrdma_replych) 510 return encode_item_not_present(xdr); 511 512 seg = req->rl_segments; 513 nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_rcv_buf, 0, wtype, seg); 514 if (nsegs < 0) 515 return nsegs; 516 517 if (encode_item_present(xdr) < 0) 518 return -EMSGSIZE; 519 segcount = xdr_reserve_space(xdr, sizeof(*segcount)); 520 if (unlikely(!segcount)) 521 return -EMSGSIZE; 522 /* Actual value encoded below */ 523 524 nchunks = 0; 525 do { 526 seg = rpcrdma_mr_prepare(r_xprt, req, seg, nsegs, true, &mr); 527 if (IS_ERR(seg)) 528 return PTR_ERR(seg); 529 530 if (encode_rdma_segment(xdr, mr) < 0) 531 return -EMSGSIZE; 532 533 trace_xprtrdma_chunk_reply(rqst->rq_task, mr, nsegs); 534 r_xprt->rx_stats.reply_chunk_count++; 535 r_xprt->rx_stats.total_rdma_request += mr->mr_length; 536 nchunks++; 537 nsegs -= mr->mr_nents; 538 } while (nsegs); 539 540 /* Update count of segments in the Reply chunk */ 541 *segcount = cpu_to_be32(nchunks); 542 543 return 0; 544 } 545 546 static void rpcrdma_sendctx_done(struct kref *kref) 547 { 548 struct rpcrdma_req *req = 549 container_of(kref, struct rpcrdma_req, rl_kref); 550 struct rpcrdma_rep *rep = req->rl_reply; 551 552 rpcrdma_complete_rqst(rep); 553 rep->rr_rxprt->rx_stats.reply_waits_for_send++; 554 } 555 556 /** 557 * rpcrdma_sendctx_unmap - DMA-unmap Send buffer 558 * @sc: sendctx containing SGEs to unmap 559 * 560 */ 561 void rpcrdma_sendctx_unmap(struct rpcrdma_sendctx *sc) 562 { 563 struct rpcrdma_regbuf *rb = sc->sc_req->rl_sendbuf; 564 struct ib_sge *sge; 565 566 if (!sc->sc_unmap_count) 567 return; 568 569 /* The first two SGEs contain the transport header and 570 * the inline buffer. These are always left mapped so 571 * they can be cheaply re-used. 572 */ 573 for (sge = &sc->sc_sges[2]; sc->sc_unmap_count; 574 ++sge, --sc->sc_unmap_count) 575 ib_dma_unmap_page(rdmab_device(rb), sge->addr, sge->length, 576 DMA_TO_DEVICE); 577 578 kref_put(&sc->sc_req->rl_kref, rpcrdma_sendctx_done); 579 } 580 581 /* Prepare an SGE for the RPC-over-RDMA transport header. 582 */ 583 static bool rpcrdma_prepare_hdr_sge(struct rpcrdma_xprt *r_xprt, 584 struct rpcrdma_req *req, u32 len) 585 { 586 struct rpcrdma_sendctx *sc = req->rl_sendctx; 587 struct rpcrdma_regbuf *rb = req->rl_rdmabuf; 588 struct ib_sge *sge = &sc->sc_sges[req->rl_wr.num_sge++]; 589 590 if (!rpcrdma_regbuf_dma_map(r_xprt, rb)) 591 return false; 592 sge->addr = rdmab_addr(rb); 593 sge->length = len; 594 sge->lkey = rdmab_lkey(rb); 595 596 ib_dma_sync_single_for_device(rdmab_device(rb), sge->addr, sge->length, 597 DMA_TO_DEVICE); 598 return true; 599 } 600 601 /* The head iovec is straightforward, as it is usually already 602 * DMA-mapped. Sync the content that has changed. 603 */ 604 static bool rpcrdma_prepare_head_iov(struct rpcrdma_xprt *r_xprt, 605 struct rpcrdma_req *req, unsigned int len) 606 { 607 struct rpcrdma_sendctx *sc = req->rl_sendctx; 608 struct ib_sge *sge = &sc->sc_sges[req->rl_wr.num_sge++]; 609 struct rpcrdma_regbuf *rb = req->rl_sendbuf; 610 611 if (!rpcrdma_regbuf_dma_map(r_xprt, rb)) 612 return false; 613 614 sge->addr = rdmab_addr(rb); 615 sge->length = len; 616 sge->lkey = rdmab_lkey(rb); 617 618 ib_dma_sync_single_for_device(rdmab_device(rb), sge->addr, sge->length, 619 DMA_TO_DEVICE); 620 return true; 621 } 622 623 /* If there is a page list present, DMA map and prepare an 624 * SGE for each page to be sent. 625 */ 626 static bool rpcrdma_prepare_pagelist(struct rpcrdma_req *req, 627 struct xdr_buf *xdr) 628 { 629 struct rpcrdma_sendctx *sc = req->rl_sendctx; 630 struct rpcrdma_regbuf *rb = req->rl_sendbuf; 631 unsigned int page_base, len, remaining; 632 struct page **ppages; 633 struct ib_sge *sge; 634 635 ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT); 636 page_base = offset_in_page(xdr->page_base); 637 remaining = xdr->page_len; 638 while (remaining) { 639 sge = &sc->sc_sges[req->rl_wr.num_sge++]; 640 len = min_t(unsigned int, PAGE_SIZE - page_base, remaining); 641 sge->addr = ib_dma_map_page(rdmab_device(rb), *ppages, 642 page_base, len, DMA_TO_DEVICE); 643 if (ib_dma_mapping_error(rdmab_device(rb), sge->addr)) 644 goto out_mapping_err; 645 646 sge->length = len; 647 sge->lkey = rdmab_lkey(rb); 648 649 sc->sc_unmap_count++; 650 ppages++; 651 remaining -= len; 652 page_base = 0; 653 } 654 655 return true; 656 657 out_mapping_err: 658 trace_xprtrdma_dma_maperr(sge->addr); 659 return false; 660 } 661 662 /* The tail iovec may include an XDR pad for the page list, 663 * as well as additional content, and may not reside in the 664 * same page as the head iovec. 665 */ 666 static bool rpcrdma_prepare_tail_iov(struct rpcrdma_req *req, 667 struct xdr_buf *xdr, 668 unsigned int page_base, unsigned int len) 669 { 670 struct rpcrdma_sendctx *sc = req->rl_sendctx; 671 struct ib_sge *sge = &sc->sc_sges[req->rl_wr.num_sge++]; 672 struct rpcrdma_regbuf *rb = req->rl_sendbuf; 673 struct page *page = virt_to_page(xdr->tail[0].iov_base); 674 675 sge->addr = ib_dma_map_page(rdmab_device(rb), page, page_base, len, 676 DMA_TO_DEVICE); 677 if (ib_dma_mapping_error(rdmab_device(rb), sge->addr)) 678 goto out_mapping_err; 679 680 sge->length = len; 681 sge->lkey = rdmab_lkey(rb); 682 ++sc->sc_unmap_count; 683 return true; 684 685 out_mapping_err: 686 trace_xprtrdma_dma_maperr(sge->addr); 687 return false; 688 } 689 690 /* Copy the tail to the end of the head buffer. 691 */ 692 static void rpcrdma_pullup_tail_iov(struct rpcrdma_xprt *r_xprt, 693 struct rpcrdma_req *req, 694 struct xdr_buf *xdr) 695 { 696 unsigned char *dst; 697 698 dst = (unsigned char *)xdr->head[0].iov_base; 699 dst += xdr->head[0].iov_len + xdr->page_len; 700 memmove(dst, xdr->tail[0].iov_base, xdr->tail[0].iov_len); 701 r_xprt->rx_stats.pullup_copy_count += xdr->tail[0].iov_len; 702 } 703 704 /* Copy pagelist content into the head buffer. 705 */ 706 static void rpcrdma_pullup_pagelist(struct rpcrdma_xprt *r_xprt, 707 struct rpcrdma_req *req, 708 struct xdr_buf *xdr) 709 { 710 unsigned int len, page_base, remaining; 711 struct page **ppages; 712 unsigned char *src, *dst; 713 714 dst = (unsigned char *)xdr->head[0].iov_base; 715 dst += xdr->head[0].iov_len; 716 ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT); 717 page_base = offset_in_page(xdr->page_base); 718 remaining = xdr->page_len; 719 while (remaining) { 720 src = page_address(*ppages); 721 src += page_base; 722 len = min_t(unsigned int, PAGE_SIZE - page_base, remaining); 723 memcpy(dst, src, len); 724 r_xprt->rx_stats.pullup_copy_count += len; 725 726 ppages++; 727 dst += len; 728 remaining -= len; 729 page_base = 0; 730 } 731 } 732 733 /* Copy the contents of @xdr into @rl_sendbuf and DMA sync it. 734 * When the head, pagelist, and tail are small, a pull-up copy 735 * is considerably less costly than DMA mapping the components 736 * of @xdr. 737 * 738 * Assumptions: 739 * - the caller has already verified that the total length 740 * of the RPC Call body will fit into @rl_sendbuf. 741 */ 742 static bool rpcrdma_prepare_noch_pullup(struct rpcrdma_xprt *r_xprt, 743 struct rpcrdma_req *req, 744 struct xdr_buf *xdr) 745 { 746 if (unlikely(xdr->tail[0].iov_len)) 747 rpcrdma_pullup_tail_iov(r_xprt, req, xdr); 748 749 if (unlikely(xdr->page_len)) 750 rpcrdma_pullup_pagelist(r_xprt, req, xdr); 751 752 /* The whole RPC message resides in the head iovec now */ 753 return rpcrdma_prepare_head_iov(r_xprt, req, xdr->len); 754 } 755 756 static bool rpcrdma_prepare_noch_mapped(struct rpcrdma_xprt *r_xprt, 757 struct rpcrdma_req *req, 758 struct xdr_buf *xdr) 759 { 760 struct kvec *tail = &xdr->tail[0]; 761 762 if (!rpcrdma_prepare_head_iov(r_xprt, req, xdr->head[0].iov_len)) 763 return false; 764 if (xdr->page_len) 765 if (!rpcrdma_prepare_pagelist(req, xdr)) 766 return false; 767 if (tail->iov_len) 768 if (!rpcrdma_prepare_tail_iov(req, xdr, 769 offset_in_page(tail->iov_base), 770 tail->iov_len)) 771 return false; 772 773 if (req->rl_sendctx->sc_unmap_count) 774 kref_get(&req->rl_kref); 775 return true; 776 } 777 778 static bool rpcrdma_prepare_readch(struct rpcrdma_xprt *r_xprt, 779 struct rpcrdma_req *req, 780 struct xdr_buf *xdr) 781 { 782 if (!rpcrdma_prepare_head_iov(r_xprt, req, xdr->head[0].iov_len)) 783 return false; 784 785 /* If there is a Read chunk, the page list is being handled 786 * via explicit RDMA, and thus is skipped here. 787 */ 788 789 /* Do not include the tail if it is only an XDR pad */ 790 if (xdr->tail[0].iov_len > 3) { 791 unsigned int page_base, len; 792 793 /* If the content in the page list is an odd length, 794 * xdr_write_pages() adds a pad at the beginning of 795 * the tail iovec. Force the tail's non-pad content to 796 * land at the next XDR position in the Send message. 797 */ 798 page_base = offset_in_page(xdr->tail[0].iov_base); 799 len = xdr->tail[0].iov_len; 800 page_base += len & 3; 801 len -= len & 3; 802 if (!rpcrdma_prepare_tail_iov(req, xdr, page_base, len)) 803 return false; 804 kref_get(&req->rl_kref); 805 } 806 807 return true; 808 } 809 810 /** 811 * rpcrdma_prepare_send_sges - Construct SGEs for a Send WR 812 * @r_xprt: controlling transport 813 * @req: context of RPC Call being marshalled 814 * @hdrlen: size of transport header, in bytes 815 * @xdr: xdr_buf containing RPC Call 816 * @rtype: chunk type being encoded 817 * 818 * Returns 0 on success; otherwise a negative errno is returned. 819 */ 820 inline int rpcrdma_prepare_send_sges(struct rpcrdma_xprt *r_xprt, 821 struct rpcrdma_req *req, u32 hdrlen, 822 struct xdr_buf *xdr, 823 enum rpcrdma_chunktype rtype) 824 { 825 int ret; 826 827 ret = -EAGAIN; 828 req->rl_sendctx = rpcrdma_sendctx_get_locked(r_xprt); 829 if (!req->rl_sendctx) 830 goto out_nosc; 831 req->rl_sendctx->sc_unmap_count = 0; 832 req->rl_sendctx->sc_req = req; 833 kref_init(&req->rl_kref); 834 req->rl_wr.wr_cqe = &req->rl_sendctx->sc_cqe; 835 req->rl_wr.sg_list = req->rl_sendctx->sc_sges; 836 req->rl_wr.num_sge = 0; 837 req->rl_wr.opcode = IB_WR_SEND; 838 839 ret = -EIO; 840 if (!rpcrdma_prepare_hdr_sge(r_xprt, req, hdrlen)) 841 goto out_unmap; 842 843 switch (rtype) { 844 case rpcrdma_noch_pullup: 845 if (!rpcrdma_prepare_noch_pullup(r_xprt, req, xdr)) 846 goto out_unmap; 847 break; 848 case rpcrdma_noch_mapped: 849 if (!rpcrdma_prepare_noch_mapped(r_xprt, req, xdr)) 850 goto out_unmap; 851 break; 852 case rpcrdma_readch: 853 if (!rpcrdma_prepare_readch(r_xprt, req, xdr)) 854 goto out_unmap; 855 break; 856 case rpcrdma_areadch: 857 break; 858 default: 859 goto out_unmap; 860 } 861 862 return 0; 863 864 out_unmap: 865 rpcrdma_sendctx_unmap(req->rl_sendctx); 866 out_nosc: 867 trace_xprtrdma_prepsend_failed(&req->rl_slot, ret); 868 return ret; 869 } 870 871 /** 872 * rpcrdma_marshal_req - Marshal and send one RPC request 873 * @r_xprt: controlling transport 874 * @rqst: RPC request to be marshaled 875 * 876 * For the RPC in "rqst", this function: 877 * - Chooses the transfer mode (eg., RDMA_MSG or RDMA_NOMSG) 878 * - Registers Read, Write, and Reply chunks 879 * - Constructs the transport header 880 * - Posts a Send WR to send the transport header and request 881 * 882 * Returns: 883 * %0 if the RPC was sent successfully, 884 * %-ENOTCONN if the connection was lost, 885 * %-EAGAIN if the caller should call again with the same arguments, 886 * %-ENOBUFS if the caller should call again after a delay, 887 * %-EMSGSIZE if the transport header is too small, 888 * %-EIO if a permanent problem occurred while marshaling. 889 */ 890 int 891 rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst) 892 { 893 struct rpcrdma_req *req = rpcr_to_rdmar(rqst); 894 struct xdr_stream *xdr = &req->rl_stream; 895 enum rpcrdma_chunktype rtype, wtype; 896 struct xdr_buf *buf = &rqst->rq_snd_buf; 897 bool ddp_allowed; 898 __be32 *p; 899 int ret; 900 901 rpcrdma_set_xdrlen(&req->rl_hdrbuf, 0); 902 xdr_init_encode(xdr, &req->rl_hdrbuf, rdmab_data(req->rl_rdmabuf), 903 rqst); 904 905 /* Fixed header fields */ 906 ret = -EMSGSIZE; 907 p = xdr_reserve_space(xdr, 4 * sizeof(*p)); 908 if (!p) 909 goto out_err; 910 *p++ = rqst->rq_xid; 911 *p++ = rpcrdma_version; 912 *p++ = cpu_to_be32(r_xprt->rx_buf.rb_max_requests); 913 914 /* When the ULP employs a GSS flavor that guarantees integrity 915 * or privacy, direct data placement of individual data items 916 * is not allowed. 917 */ 918 ddp_allowed = !(rqst->rq_cred->cr_auth->au_flags & 919 RPCAUTH_AUTH_DATATOUCH); 920 921 /* 922 * Chunks needed for results? 923 * 924 * o If the expected result is under the inline threshold, all ops 925 * return as inline. 926 * o Large read ops return data as write chunk(s), header as 927 * inline. 928 * o Large non-read ops return as a single reply chunk. 929 */ 930 if (rpcrdma_results_inline(r_xprt, rqst)) 931 wtype = rpcrdma_noch; 932 else if ((ddp_allowed && rqst->rq_rcv_buf.flags & XDRBUF_READ) && 933 rpcrdma_nonpayload_inline(r_xprt, rqst)) 934 wtype = rpcrdma_writech; 935 else 936 wtype = rpcrdma_replych; 937 938 /* 939 * Chunks needed for arguments? 940 * 941 * o If the total request is under the inline threshold, all ops 942 * are sent as inline. 943 * o Large write ops transmit data as read chunk(s), header as 944 * inline. 945 * o Large non-write ops are sent with the entire message as a 946 * single read chunk (protocol 0-position special case). 947 * 948 * This assumes that the upper layer does not present a request 949 * that both has a data payload, and whose non-data arguments 950 * by themselves are larger than the inline threshold. 951 */ 952 if (rpcrdma_args_inline(r_xprt, rqst)) { 953 *p++ = rdma_msg; 954 rtype = buf->len < rdmab_length(req->rl_sendbuf) ? 955 rpcrdma_noch_pullup : rpcrdma_noch_mapped; 956 } else if (ddp_allowed && buf->flags & XDRBUF_WRITE) { 957 *p++ = rdma_msg; 958 rtype = rpcrdma_readch; 959 } else { 960 r_xprt->rx_stats.nomsg_call_count++; 961 *p++ = rdma_nomsg; 962 rtype = rpcrdma_areadch; 963 } 964 965 /* This implementation supports the following combinations 966 * of chunk lists in one RPC-over-RDMA Call message: 967 * 968 * - Read list 969 * - Write list 970 * - Reply chunk 971 * - Read list + Reply chunk 972 * 973 * It might not yet support the following combinations: 974 * 975 * - Read list + Write list 976 * 977 * It does not support the following combinations: 978 * 979 * - Write list + Reply chunk 980 * - Read list + Write list + Reply chunk 981 * 982 * This implementation supports only a single chunk in each 983 * Read or Write list. Thus for example the client cannot 984 * send a Call message with a Position Zero Read chunk and a 985 * regular Read chunk at the same time. 986 */ 987 ret = rpcrdma_encode_read_list(r_xprt, req, rqst, rtype); 988 if (ret) 989 goto out_err; 990 ret = rpcrdma_encode_write_list(r_xprt, req, rqst, wtype); 991 if (ret) 992 goto out_err; 993 ret = rpcrdma_encode_reply_chunk(r_xprt, req, rqst, wtype); 994 if (ret) 995 goto out_err; 996 997 ret = rpcrdma_prepare_send_sges(r_xprt, req, req->rl_hdrbuf.len, 998 buf, rtype); 999 if (ret) 1000 goto out_err; 1001 1002 trace_xprtrdma_marshal(req, rtype, wtype); 1003 return 0; 1004 1005 out_err: 1006 trace_xprtrdma_marshal_failed(rqst, ret); 1007 r_xprt->rx_stats.failed_marshal_count++; 1008 frwr_reset(req); 1009 return ret; 1010 } 1011 1012 static void __rpcrdma_update_cwnd_locked(struct rpc_xprt *xprt, 1013 struct rpcrdma_buffer *buf, 1014 u32 grant) 1015 { 1016 buf->rb_credits = grant; 1017 xprt->cwnd = grant << RPC_CWNDSHIFT; 1018 } 1019 1020 static void rpcrdma_update_cwnd(struct rpcrdma_xprt *r_xprt, u32 grant) 1021 { 1022 struct rpc_xprt *xprt = &r_xprt->rx_xprt; 1023 1024 spin_lock(&xprt->transport_lock); 1025 __rpcrdma_update_cwnd_locked(xprt, &r_xprt->rx_buf, grant); 1026 spin_unlock(&xprt->transport_lock); 1027 } 1028 1029 /** 1030 * rpcrdma_reset_cwnd - Reset the xprt's congestion window 1031 * @r_xprt: controlling transport instance 1032 * 1033 * Prepare @r_xprt for the next connection by reinitializing 1034 * its credit grant to one (see RFC 8166, Section 3.3.3). 1035 */ 1036 void rpcrdma_reset_cwnd(struct rpcrdma_xprt *r_xprt) 1037 { 1038 struct rpc_xprt *xprt = &r_xprt->rx_xprt; 1039 1040 spin_lock(&xprt->transport_lock); 1041 xprt->cong = 0; 1042 __rpcrdma_update_cwnd_locked(xprt, &r_xprt->rx_buf, 1); 1043 spin_unlock(&xprt->transport_lock); 1044 } 1045 1046 /** 1047 * rpcrdma_inline_fixup - Scatter inline received data into rqst's iovecs 1048 * @rqst: controlling RPC request 1049 * @srcp: points to RPC message payload in receive buffer 1050 * @copy_len: remaining length of receive buffer content 1051 * @pad: Write chunk pad bytes needed (zero for pure inline) 1052 * 1053 * The upper layer has set the maximum number of bytes it can 1054 * receive in each component of rq_rcv_buf. These values are set in 1055 * the head.iov_len, page_len, tail.iov_len, and buflen fields. 1056 * 1057 * Unlike the TCP equivalent (xdr_partial_copy_from_skb), in 1058 * many cases this function simply updates iov_base pointers in 1059 * rq_rcv_buf to point directly to the received reply data, to 1060 * avoid copying reply data. 1061 * 1062 * Returns the count of bytes which had to be memcopied. 1063 */ 1064 static unsigned long 1065 rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad) 1066 { 1067 unsigned long fixup_copy_count; 1068 int i, npages, curlen; 1069 char *destp; 1070 struct page **ppages; 1071 int page_base; 1072 1073 /* The head iovec is redirected to the RPC reply message 1074 * in the receive buffer, to avoid a memcopy. 1075 */ 1076 rqst->rq_rcv_buf.head[0].iov_base = srcp; 1077 rqst->rq_private_buf.head[0].iov_base = srcp; 1078 1079 /* The contents of the receive buffer that follow 1080 * head.iov_len bytes are copied into the page list. 1081 */ 1082 curlen = rqst->rq_rcv_buf.head[0].iov_len; 1083 if (curlen > copy_len) 1084 curlen = copy_len; 1085 srcp += curlen; 1086 copy_len -= curlen; 1087 1088 ppages = rqst->rq_rcv_buf.pages + 1089 (rqst->rq_rcv_buf.page_base >> PAGE_SHIFT); 1090 page_base = offset_in_page(rqst->rq_rcv_buf.page_base); 1091 fixup_copy_count = 0; 1092 if (copy_len && rqst->rq_rcv_buf.page_len) { 1093 int pagelist_len; 1094 1095 pagelist_len = rqst->rq_rcv_buf.page_len; 1096 if (pagelist_len > copy_len) 1097 pagelist_len = copy_len; 1098 npages = PAGE_ALIGN(page_base + pagelist_len) >> PAGE_SHIFT; 1099 for (i = 0; i < npages; i++) { 1100 curlen = PAGE_SIZE - page_base; 1101 if (curlen > pagelist_len) 1102 curlen = pagelist_len; 1103 1104 destp = kmap_atomic(ppages[i]); 1105 memcpy(destp + page_base, srcp, curlen); 1106 flush_dcache_page(ppages[i]); 1107 kunmap_atomic(destp); 1108 srcp += curlen; 1109 copy_len -= curlen; 1110 fixup_copy_count += curlen; 1111 pagelist_len -= curlen; 1112 if (!pagelist_len) 1113 break; 1114 page_base = 0; 1115 } 1116 1117 /* Implicit padding for the last segment in a Write 1118 * chunk is inserted inline at the front of the tail 1119 * iovec. The upper layer ignores the content of 1120 * the pad. Simply ensure inline content in the tail 1121 * that follows the Write chunk is properly aligned. 1122 */ 1123 if (pad) 1124 srcp -= pad; 1125 } 1126 1127 /* The tail iovec is redirected to the remaining data 1128 * in the receive buffer, to avoid a memcopy. 1129 */ 1130 if (copy_len || pad) { 1131 rqst->rq_rcv_buf.tail[0].iov_base = srcp; 1132 rqst->rq_private_buf.tail[0].iov_base = srcp; 1133 } 1134 1135 if (fixup_copy_count) 1136 trace_xprtrdma_fixup(rqst, fixup_copy_count); 1137 return fixup_copy_count; 1138 } 1139 1140 /* By convention, backchannel calls arrive via rdma_msg type 1141 * messages, and never populate the chunk lists. This makes 1142 * the RPC/RDMA header small and fixed in size, so it is 1143 * straightforward to check the RPC header's direction field. 1144 */ 1145 static bool 1146 rpcrdma_is_bcall(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep) 1147 #if defined(CONFIG_SUNRPC_BACKCHANNEL) 1148 { 1149 struct xdr_stream *xdr = &rep->rr_stream; 1150 __be32 *p; 1151 1152 if (rep->rr_proc != rdma_msg) 1153 return false; 1154 1155 /* Peek at stream contents without advancing. */ 1156 p = xdr_inline_decode(xdr, 0); 1157 1158 /* Chunk lists */ 1159 if (*p++ != xdr_zero) 1160 return false; 1161 if (*p++ != xdr_zero) 1162 return false; 1163 if (*p++ != xdr_zero) 1164 return false; 1165 1166 /* RPC header */ 1167 if (*p++ != rep->rr_xid) 1168 return false; 1169 if (*p != cpu_to_be32(RPC_CALL)) 1170 return false; 1171 1172 /* Now that we are sure this is a backchannel call, 1173 * advance to the RPC header. 1174 */ 1175 p = xdr_inline_decode(xdr, 3 * sizeof(*p)); 1176 if (unlikely(!p)) 1177 goto out_short; 1178 1179 rpcrdma_bc_receive_call(r_xprt, rep); 1180 return true; 1181 1182 out_short: 1183 pr_warn("RPC/RDMA short backward direction call\n"); 1184 return true; 1185 } 1186 #else /* CONFIG_SUNRPC_BACKCHANNEL */ 1187 { 1188 return false; 1189 } 1190 #endif /* CONFIG_SUNRPC_BACKCHANNEL */ 1191 1192 static int decode_rdma_segment(struct xdr_stream *xdr, u32 *length) 1193 { 1194 u32 handle; 1195 u64 offset; 1196 __be32 *p; 1197 1198 p = xdr_inline_decode(xdr, 4 * sizeof(*p)); 1199 if (unlikely(!p)) 1200 return -EIO; 1201 1202 handle = be32_to_cpup(p++); 1203 *length = be32_to_cpup(p++); 1204 xdr_decode_hyper(p, &offset); 1205 1206 trace_xprtrdma_decode_seg(handle, *length, offset); 1207 return 0; 1208 } 1209 1210 static int decode_write_chunk(struct xdr_stream *xdr, u32 *length) 1211 { 1212 u32 segcount, seglength; 1213 __be32 *p; 1214 1215 p = xdr_inline_decode(xdr, sizeof(*p)); 1216 if (unlikely(!p)) 1217 return -EIO; 1218 1219 *length = 0; 1220 segcount = be32_to_cpup(p); 1221 while (segcount--) { 1222 if (decode_rdma_segment(xdr, &seglength)) 1223 return -EIO; 1224 *length += seglength; 1225 } 1226 1227 return 0; 1228 } 1229 1230 /* In RPC-over-RDMA Version One replies, a Read list is never 1231 * expected. This decoder is a stub that returns an error if 1232 * a Read list is present. 1233 */ 1234 static int decode_read_list(struct xdr_stream *xdr) 1235 { 1236 __be32 *p; 1237 1238 p = xdr_inline_decode(xdr, sizeof(*p)); 1239 if (unlikely(!p)) 1240 return -EIO; 1241 if (unlikely(*p != xdr_zero)) 1242 return -EIO; 1243 return 0; 1244 } 1245 1246 /* Supports only one Write chunk in the Write list 1247 */ 1248 static int decode_write_list(struct xdr_stream *xdr, u32 *length) 1249 { 1250 u32 chunklen; 1251 bool first; 1252 __be32 *p; 1253 1254 *length = 0; 1255 first = true; 1256 do { 1257 p = xdr_inline_decode(xdr, sizeof(*p)); 1258 if (unlikely(!p)) 1259 return -EIO; 1260 if (*p == xdr_zero) 1261 break; 1262 if (!first) 1263 return -EIO; 1264 1265 if (decode_write_chunk(xdr, &chunklen)) 1266 return -EIO; 1267 *length += chunklen; 1268 first = false; 1269 } while (true); 1270 return 0; 1271 } 1272 1273 static int decode_reply_chunk(struct xdr_stream *xdr, u32 *length) 1274 { 1275 __be32 *p; 1276 1277 p = xdr_inline_decode(xdr, sizeof(*p)); 1278 if (unlikely(!p)) 1279 return -EIO; 1280 1281 *length = 0; 1282 if (*p != xdr_zero) 1283 if (decode_write_chunk(xdr, length)) 1284 return -EIO; 1285 return 0; 1286 } 1287 1288 static int 1289 rpcrdma_decode_msg(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep, 1290 struct rpc_rqst *rqst) 1291 { 1292 struct xdr_stream *xdr = &rep->rr_stream; 1293 u32 writelist, replychunk, rpclen; 1294 char *base; 1295 1296 /* Decode the chunk lists */ 1297 if (decode_read_list(xdr)) 1298 return -EIO; 1299 if (decode_write_list(xdr, &writelist)) 1300 return -EIO; 1301 if (decode_reply_chunk(xdr, &replychunk)) 1302 return -EIO; 1303 1304 /* RDMA_MSG sanity checks */ 1305 if (unlikely(replychunk)) 1306 return -EIO; 1307 1308 /* Build the RPC reply's Payload stream in rqst->rq_rcv_buf */ 1309 base = (char *)xdr_inline_decode(xdr, 0); 1310 rpclen = xdr_stream_remaining(xdr); 1311 r_xprt->rx_stats.fixup_copy_count += 1312 rpcrdma_inline_fixup(rqst, base, rpclen, writelist & 3); 1313 1314 r_xprt->rx_stats.total_rdma_reply += writelist; 1315 return rpclen + xdr_align_size(writelist); 1316 } 1317 1318 static noinline int 1319 rpcrdma_decode_nomsg(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep) 1320 { 1321 struct xdr_stream *xdr = &rep->rr_stream; 1322 u32 writelist, replychunk; 1323 1324 /* Decode the chunk lists */ 1325 if (decode_read_list(xdr)) 1326 return -EIO; 1327 if (decode_write_list(xdr, &writelist)) 1328 return -EIO; 1329 if (decode_reply_chunk(xdr, &replychunk)) 1330 return -EIO; 1331 1332 /* RDMA_NOMSG sanity checks */ 1333 if (unlikely(writelist)) 1334 return -EIO; 1335 if (unlikely(!replychunk)) 1336 return -EIO; 1337 1338 /* Reply chunk buffer already is the reply vector */ 1339 r_xprt->rx_stats.total_rdma_reply += replychunk; 1340 return replychunk; 1341 } 1342 1343 static noinline int 1344 rpcrdma_decode_error(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep, 1345 struct rpc_rqst *rqst) 1346 { 1347 struct xdr_stream *xdr = &rep->rr_stream; 1348 __be32 *p; 1349 1350 p = xdr_inline_decode(xdr, sizeof(*p)); 1351 if (unlikely(!p)) 1352 return -EIO; 1353 1354 switch (*p) { 1355 case err_vers: 1356 p = xdr_inline_decode(xdr, 2 * sizeof(*p)); 1357 if (!p) 1358 break; 1359 dprintk("RPC: %s: server reports " 1360 "version error (%u-%u), xid %08x\n", __func__, 1361 be32_to_cpup(p), be32_to_cpu(*(p + 1)), 1362 be32_to_cpu(rep->rr_xid)); 1363 break; 1364 case err_chunk: 1365 dprintk("RPC: %s: server reports " 1366 "header decoding error, xid %08x\n", __func__, 1367 be32_to_cpu(rep->rr_xid)); 1368 break; 1369 default: 1370 dprintk("RPC: %s: server reports " 1371 "unrecognized error %d, xid %08x\n", __func__, 1372 be32_to_cpup(p), be32_to_cpu(rep->rr_xid)); 1373 } 1374 1375 r_xprt->rx_stats.bad_reply_count++; 1376 return -EREMOTEIO; 1377 } 1378 1379 /* Perform XID lookup, reconstruction of the RPC reply, and 1380 * RPC completion while holding the transport lock to ensure 1381 * the rep, rqst, and rq_task pointers remain stable. 1382 */ 1383 void rpcrdma_complete_rqst(struct rpcrdma_rep *rep) 1384 { 1385 struct rpcrdma_xprt *r_xprt = rep->rr_rxprt; 1386 struct rpc_xprt *xprt = &r_xprt->rx_xprt; 1387 struct rpc_rqst *rqst = rep->rr_rqst; 1388 int status; 1389 1390 switch (rep->rr_proc) { 1391 case rdma_msg: 1392 status = rpcrdma_decode_msg(r_xprt, rep, rqst); 1393 break; 1394 case rdma_nomsg: 1395 status = rpcrdma_decode_nomsg(r_xprt, rep); 1396 break; 1397 case rdma_error: 1398 status = rpcrdma_decode_error(r_xprt, rep, rqst); 1399 break; 1400 default: 1401 status = -EIO; 1402 } 1403 if (status < 0) 1404 goto out_badheader; 1405 1406 out: 1407 spin_lock(&xprt->queue_lock); 1408 xprt_complete_rqst(rqst->rq_task, status); 1409 xprt_unpin_rqst(rqst); 1410 spin_unlock(&xprt->queue_lock); 1411 return; 1412 1413 /* If the incoming reply terminated a pending RPC, the next 1414 * RPC call will post a replacement receive buffer as it is 1415 * being marshaled. 1416 */ 1417 out_badheader: 1418 trace_xprtrdma_reply_hdr(rep); 1419 r_xprt->rx_stats.bad_reply_count++; 1420 goto out; 1421 } 1422 1423 static void rpcrdma_reply_done(struct kref *kref) 1424 { 1425 struct rpcrdma_req *req = 1426 container_of(kref, struct rpcrdma_req, rl_kref); 1427 1428 rpcrdma_complete_rqst(req->rl_reply); 1429 } 1430 1431 /** 1432 * rpcrdma_reply_handler - Process received RPC/RDMA messages 1433 * @rep: Incoming rpcrdma_rep object to process 1434 * 1435 * Errors must result in the RPC task either being awakened, or 1436 * allowed to timeout, to discover the errors at that time. 1437 */ 1438 void rpcrdma_reply_handler(struct rpcrdma_rep *rep) 1439 { 1440 struct rpcrdma_xprt *r_xprt = rep->rr_rxprt; 1441 struct rpc_xprt *xprt = &r_xprt->rx_xprt; 1442 struct rpcrdma_buffer *buf = &r_xprt->rx_buf; 1443 struct rpcrdma_req *req; 1444 struct rpc_rqst *rqst; 1445 u32 credits; 1446 __be32 *p; 1447 1448 /* Any data means we had a useful conversation, so 1449 * then we don't need to delay the next reconnect. 1450 */ 1451 if (xprt->reestablish_timeout) 1452 xprt->reestablish_timeout = 0; 1453 1454 /* Fixed transport header fields */ 1455 xdr_init_decode(&rep->rr_stream, &rep->rr_hdrbuf, 1456 rep->rr_hdrbuf.head[0].iov_base, NULL); 1457 p = xdr_inline_decode(&rep->rr_stream, 4 * sizeof(*p)); 1458 if (unlikely(!p)) 1459 goto out_shortreply; 1460 rep->rr_xid = *p++; 1461 rep->rr_vers = *p++; 1462 credits = be32_to_cpu(*p++); 1463 rep->rr_proc = *p++; 1464 1465 if (rep->rr_vers != rpcrdma_version) 1466 goto out_badversion; 1467 1468 if (rpcrdma_is_bcall(r_xprt, rep)) 1469 return; 1470 1471 /* Match incoming rpcrdma_rep to an rpcrdma_req to 1472 * get context for handling any incoming chunks. 1473 */ 1474 spin_lock(&xprt->queue_lock); 1475 rqst = xprt_lookup_rqst(xprt, rep->rr_xid); 1476 if (!rqst) 1477 goto out_norqst; 1478 xprt_pin_rqst(rqst); 1479 spin_unlock(&xprt->queue_lock); 1480 1481 if (credits == 0) 1482 credits = 1; /* don't deadlock */ 1483 else if (credits > buf->rb_max_requests) 1484 credits = buf->rb_max_requests; 1485 if (buf->rb_credits != credits) 1486 rpcrdma_update_cwnd(r_xprt, credits); 1487 rpcrdma_post_recvs(r_xprt, false); 1488 1489 req = rpcr_to_rdmar(rqst); 1490 if (req->rl_reply) { 1491 trace_xprtrdma_leaked_rep(rqst, req->rl_reply); 1492 rpcrdma_recv_buffer_put(req->rl_reply); 1493 } 1494 req->rl_reply = rep; 1495 rep->rr_rqst = rqst; 1496 1497 trace_xprtrdma_reply(rqst->rq_task, rep, req, credits); 1498 1499 if (rep->rr_wc_flags & IB_WC_WITH_INVALIDATE) 1500 frwr_reminv(rep, &req->rl_registered); 1501 if (!list_empty(&req->rl_registered)) 1502 frwr_unmap_async(r_xprt, req); 1503 /* LocalInv completion will complete the RPC */ 1504 else 1505 kref_put(&req->rl_kref, rpcrdma_reply_done); 1506 return; 1507 1508 out_badversion: 1509 trace_xprtrdma_reply_vers(rep); 1510 goto out; 1511 1512 out_norqst: 1513 spin_unlock(&xprt->queue_lock); 1514 trace_xprtrdma_reply_rqst(rep); 1515 goto out; 1516 1517 out_shortreply: 1518 trace_xprtrdma_reply_short(rep); 1519 1520 out: 1521 rpcrdma_recv_buffer_put(rep); 1522 } 1523