1 // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause 2 /* 3 * Copyright (c) 2014-2017 Oracle. All rights reserved. 4 * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved. 5 * 6 * This software is available to you under a choice of one of two 7 * licenses. You may choose to be licensed under the terms of the GNU 8 * General Public License (GPL) Version 2, available from the file 9 * COPYING in the main directory of this source tree, or the BSD-type 10 * license below: 11 * 12 * Redistribution and use in source and binary forms, with or without 13 * modification, are permitted provided that the following conditions 14 * are met: 15 * 16 * Redistributions of source code must retain the above copyright 17 * notice, this list of conditions and the following disclaimer. 18 * 19 * Redistributions in binary form must reproduce the above 20 * copyright notice, this list of conditions and the following 21 * disclaimer in the documentation and/or other materials provided 22 * with the distribution. 23 * 24 * Neither the name of the Network Appliance, Inc. nor the names of 25 * its contributors may be used to endorse or promote products 26 * derived from this software without specific prior written 27 * permission. 28 * 29 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 30 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 31 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 32 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 33 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 34 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 35 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 36 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 37 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 38 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 39 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 40 */ 41 42 /* 43 * rpc_rdma.c 44 * 45 * This file contains the guts of the RPC RDMA protocol, and 46 * does marshaling/unmarshaling, etc. It is also where interfacing 47 * to the Linux RPC framework lives. 48 */ 49 50 #include <linux/highmem.h> 51 52 #include <linux/sunrpc/svc_rdma.h> 53 54 #include "xprt_rdma.h" 55 #include <trace/events/rpcrdma.h> 56 57 #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) 58 # define RPCDBG_FACILITY RPCDBG_TRANS 59 #endif 60 61 /* Returns size of largest RPC-over-RDMA header in a Call message 62 * 63 * The largest Call header contains a full-size Read list and a 64 * minimal Reply chunk. 65 */ 66 static unsigned int rpcrdma_max_call_header_size(unsigned int maxsegs) 67 { 68 unsigned int size; 69 70 /* Fixed header fields and list discriminators */ 71 size = RPCRDMA_HDRLEN_MIN; 72 73 /* Maximum Read list size */ 74 size = maxsegs * rpcrdma_readchunk_maxsz * sizeof(__be32); 75 76 /* Minimal Read chunk size */ 77 size += sizeof(__be32); /* segment count */ 78 size += rpcrdma_segment_maxsz * sizeof(__be32); 79 size += sizeof(__be32); /* list discriminator */ 80 81 return size; 82 } 83 84 /* Returns size of largest RPC-over-RDMA header in a Reply message 85 * 86 * There is only one Write list or one Reply chunk per Reply 87 * message. The larger list is the Write list. 88 */ 89 static unsigned int rpcrdma_max_reply_header_size(unsigned int maxsegs) 90 { 91 unsigned int size; 92 93 /* Fixed header fields and list discriminators */ 94 size = RPCRDMA_HDRLEN_MIN; 95 96 /* Maximum Write list size */ 97 size = sizeof(__be32); /* segment count */ 98 size += maxsegs * rpcrdma_segment_maxsz * sizeof(__be32); 99 size += sizeof(__be32); /* list discriminator */ 100 101 return size; 102 } 103 104 /** 105 * rpcrdma_set_max_header_sizes - Initialize inline payload sizes 106 * @ep: endpoint to initialize 107 * 108 * The max_inline fields contain the maximum size of an RPC message 109 * so the marshaling code doesn't have to repeat this calculation 110 * for every RPC. 111 */ 112 void rpcrdma_set_max_header_sizes(struct rpcrdma_ep *ep) 113 { 114 unsigned int maxsegs = ep->re_max_rdma_segs; 115 116 ep->re_max_inline_send = 117 ep->re_inline_send - rpcrdma_max_call_header_size(maxsegs); 118 ep->re_max_inline_recv = 119 ep->re_inline_recv - rpcrdma_max_reply_header_size(maxsegs); 120 } 121 122 /* The client can send a request inline as long as the RPCRDMA header 123 * plus the RPC call fit under the transport's inline limit. If the 124 * combined call message size exceeds that limit, the client must use 125 * a Read chunk for this operation. 126 * 127 * A Read chunk is also required if sending the RPC call inline would 128 * exceed this device's max_sge limit. 129 */ 130 static bool rpcrdma_args_inline(struct rpcrdma_xprt *r_xprt, 131 struct rpc_rqst *rqst) 132 { 133 struct xdr_buf *xdr = &rqst->rq_snd_buf; 134 struct rpcrdma_ep *ep = r_xprt->rx_ep; 135 unsigned int count, remaining, offset; 136 137 if (xdr->len > ep->re_max_inline_send) 138 return false; 139 140 if (xdr->page_len) { 141 remaining = xdr->page_len; 142 offset = offset_in_page(xdr->page_base); 143 count = RPCRDMA_MIN_SEND_SGES; 144 while (remaining) { 145 remaining -= min_t(unsigned int, 146 PAGE_SIZE - offset, remaining); 147 offset = 0; 148 if (++count > ep->re_attr.cap.max_send_sge) 149 return false; 150 } 151 } 152 153 return true; 154 } 155 156 /* The client can't know how large the actual reply will be. Thus it 157 * plans for the largest possible reply for that particular ULP 158 * operation. If the maximum combined reply message size exceeds that 159 * limit, the client must provide a write list or a reply chunk for 160 * this request. 161 */ 162 static bool rpcrdma_results_inline(struct rpcrdma_xprt *r_xprt, 163 struct rpc_rqst *rqst) 164 { 165 return rqst->rq_rcv_buf.buflen <= r_xprt->rx_ep->re_max_inline_recv; 166 } 167 168 /* The client is required to provide a Reply chunk if the maximum 169 * size of the non-payload part of the RPC Reply is larger than 170 * the inline threshold. 171 */ 172 static bool 173 rpcrdma_nonpayload_inline(const struct rpcrdma_xprt *r_xprt, 174 const struct rpc_rqst *rqst) 175 { 176 const struct xdr_buf *buf = &rqst->rq_rcv_buf; 177 178 return (buf->head[0].iov_len + buf->tail[0].iov_len) < 179 r_xprt->rx_ep->re_max_inline_recv; 180 } 181 182 /* Split @vec on page boundaries into SGEs. FMR registers pages, not 183 * a byte range. Other modes coalesce these SGEs into a single MR 184 * when they can. 185 * 186 * Returns pointer to next available SGE, and bumps the total number 187 * of SGEs consumed. 188 */ 189 static struct rpcrdma_mr_seg * 190 rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg, 191 unsigned int *n) 192 { 193 u32 remaining, page_offset; 194 char *base; 195 196 base = vec->iov_base; 197 page_offset = offset_in_page(base); 198 remaining = vec->iov_len; 199 while (remaining) { 200 seg->mr_page = NULL; 201 seg->mr_offset = base; 202 seg->mr_len = min_t(u32, PAGE_SIZE - page_offset, remaining); 203 remaining -= seg->mr_len; 204 base += seg->mr_len; 205 ++seg; 206 ++(*n); 207 page_offset = 0; 208 } 209 return seg; 210 } 211 212 /* Convert @xdrbuf into SGEs no larger than a page each. As they 213 * are registered, these SGEs are then coalesced into RDMA segments 214 * when the selected memreg mode supports it. 215 * 216 * Returns positive number of SGEs consumed, or a negative errno. 217 */ 218 219 static int 220 rpcrdma_convert_iovs(struct rpcrdma_xprt *r_xprt, struct xdr_buf *xdrbuf, 221 unsigned int pos, enum rpcrdma_chunktype type, 222 struct rpcrdma_mr_seg *seg) 223 { 224 unsigned long page_base; 225 unsigned int len, n; 226 struct page **ppages; 227 228 n = 0; 229 if (pos == 0) 230 seg = rpcrdma_convert_kvec(&xdrbuf->head[0], seg, &n); 231 232 len = xdrbuf->page_len; 233 ppages = xdrbuf->pages + (xdrbuf->page_base >> PAGE_SHIFT); 234 page_base = offset_in_page(xdrbuf->page_base); 235 while (len) { 236 /* ACL likes to be lazy in allocating pages - ACLs 237 * are small by default but can get huge. 238 */ 239 if (unlikely(xdrbuf->flags & XDRBUF_SPARSE_PAGES)) { 240 if (!*ppages) 241 *ppages = alloc_page(GFP_NOWAIT | __GFP_NOWARN); 242 if (!*ppages) 243 return -ENOBUFS; 244 } 245 seg->mr_page = *ppages; 246 seg->mr_offset = (char *)page_base; 247 seg->mr_len = min_t(u32, PAGE_SIZE - page_base, len); 248 len -= seg->mr_len; 249 ++ppages; 250 ++seg; 251 ++n; 252 page_base = 0; 253 } 254 255 /* When encoding a Read chunk, the tail iovec contains an 256 * XDR pad and may be omitted. 257 */ 258 if (type == rpcrdma_readch && r_xprt->rx_ep->re_implicit_roundup) 259 goto out; 260 261 /* When encoding a Write chunk, some servers need to see an 262 * extra segment for non-XDR-aligned Write chunks. The upper 263 * layer provides space in the tail iovec that may be used 264 * for this purpose. 265 */ 266 if (type == rpcrdma_writech && r_xprt->rx_ep->re_implicit_roundup) 267 goto out; 268 269 if (xdrbuf->tail[0].iov_len) 270 seg = rpcrdma_convert_kvec(&xdrbuf->tail[0], seg, &n); 271 272 out: 273 if (unlikely(n > RPCRDMA_MAX_SEGS)) 274 return -EIO; 275 return n; 276 } 277 278 static void 279 xdr_encode_rdma_segment(__be32 *iptr, struct rpcrdma_mr *mr) 280 { 281 *iptr++ = cpu_to_be32(mr->mr_handle); 282 *iptr++ = cpu_to_be32(mr->mr_length); 283 xdr_encode_hyper(iptr, mr->mr_offset); 284 } 285 286 static int 287 encode_rdma_segment(struct xdr_stream *xdr, struct rpcrdma_mr *mr) 288 { 289 __be32 *p; 290 291 p = xdr_reserve_space(xdr, 4 * sizeof(*p)); 292 if (unlikely(!p)) 293 return -EMSGSIZE; 294 295 xdr_encode_rdma_segment(p, mr); 296 return 0; 297 } 298 299 static int 300 encode_read_segment(struct xdr_stream *xdr, struct rpcrdma_mr *mr, 301 u32 position) 302 { 303 __be32 *p; 304 305 p = xdr_reserve_space(xdr, 6 * sizeof(*p)); 306 if (unlikely(!p)) 307 return -EMSGSIZE; 308 309 *p++ = xdr_one; /* Item present */ 310 *p++ = cpu_to_be32(position); 311 xdr_encode_rdma_segment(p, mr); 312 return 0; 313 } 314 315 static struct rpcrdma_mr_seg *rpcrdma_mr_prepare(struct rpcrdma_xprt *r_xprt, 316 struct rpcrdma_req *req, 317 struct rpcrdma_mr_seg *seg, 318 int nsegs, bool writing, 319 struct rpcrdma_mr **mr) 320 { 321 *mr = rpcrdma_mr_pop(&req->rl_free_mrs); 322 if (!*mr) { 323 *mr = rpcrdma_mr_get(r_xprt); 324 if (!*mr) 325 goto out_getmr_err; 326 trace_xprtrdma_mr_get(req); 327 (*mr)->mr_req = req; 328 } 329 330 rpcrdma_mr_push(*mr, &req->rl_registered); 331 return frwr_map(r_xprt, seg, nsegs, writing, req->rl_slot.rq_xid, *mr); 332 333 out_getmr_err: 334 trace_xprtrdma_nomrs(req); 335 xprt_wait_for_buffer_space(&r_xprt->rx_xprt); 336 rpcrdma_mrs_refresh(r_xprt); 337 return ERR_PTR(-EAGAIN); 338 } 339 340 /* Register and XDR encode the Read list. Supports encoding a list of read 341 * segments that belong to a single read chunk. 342 * 343 * Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64): 344 * 345 * Read chunklist (a linked list): 346 * N elements, position P (same P for all chunks of same arg!): 347 * 1 - PHLOO - 1 - PHLOO - ... - 1 - PHLOO - 0 348 * 349 * Returns zero on success, or a negative errno if a failure occurred. 350 * @xdr is advanced to the next position in the stream. 351 * 352 * Only a single @pos value is currently supported. 353 */ 354 static int rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, 355 struct rpcrdma_req *req, 356 struct rpc_rqst *rqst, 357 enum rpcrdma_chunktype rtype) 358 { 359 struct xdr_stream *xdr = &req->rl_stream; 360 struct rpcrdma_mr_seg *seg; 361 struct rpcrdma_mr *mr; 362 unsigned int pos; 363 int nsegs; 364 365 if (rtype == rpcrdma_noch_pullup || rtype == rpcrdma_noch_mapped) 366 goto done; 367 368 pos = rqst->rq_snd_buf.head[0].iov_len; 369 if (rtype == rpcrdma_areadch) 370 pos = 0; 371 seg = req->rl_segments; 372 nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_snd_buf, pos, 373 rtype, seg); 374 if (nsegs < 0) 375 return nsegs; 376 377 do { 378 seg = rpcrdma_mr_prepare(r_xprt, req, seg, nsegs, false, &mr); 379 if (IS_ERR(seg)) 380 return PTR_ERR(seg); 381 382 if (encode_read_segment(xdr, mr, pos) < 0) 383 return -EMSGSIZE; 384 385 trace_xprtrdma_chunk_read(rqst->rq_task, pos, mr, nsegs); 386 r_xprt->rx_stats.read_chunk_count++; 387 nsegs -= mr->mr_nents; 388 } while (nsegs); 389 390 done: 391 if (xdr_stream_encode_item_absent(xdr) < 0) 392 return -EMSGSIZE; 393 return 0; 394 } 395 396 /* Register and XDR encode the Write list. Supports encoding a list 397 * containing one array of plain segments that belong to a single 398 * write chunk. 399 * 400 * Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64): 401 * 402 * Write chunklist (a list of (one) counted array): 403 * N elements: 404 * 1 - N - HLOO - HLOO - ... - HLOO - 0 405 * 406 * Returns zero on success, or a negative errno if a failure occurred. 407 * @xdr is advanced to the next position in the stream. 408 * 409 * Only a single Write chunk is currently supported. 410 */ 411 static int rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, 412 struct rpcrdma_req *req, 413 struct rpc_rqst *rqst, 414 enum rpcrdma_chunktype wtype) 415 { 416 struct xdr_stream *xdr = &req->rl_stream; 417 struct rpcrdma_mr_seg *seg; 418 struct rpcrdma_mr *mr; 419 int nsegs, nchunks; 420 __be32 *segcount; 421 422 if (wtype != rpcrdma_writech) 423 goto done; 424 425 seg = req->rl_segments; 426 nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_rcv_buf, 427 rqst->rq_rcv_buf.head[0].iov_len, 428 wtype, seg); 429 if (nsegs < 0) 430 return nsegs; 431 432 if (xdr_stream_encode_item_present(xdr) < 0) 433 return -EMSGSIZE; 434 segcount = xdr_reserve_space(xdr, sizeof(*segcount)); 435 if (unlikely(!segcount)) 436 return -EMSGSIZE; 437 /* Actual value encoded below */ 438 439 nchunks = 0; 440 do { 441 seg = rpcrdma_mr_prepare(r_xprt, req, seg, nsegs, true, &mr); 442 if (IS_ERR(seg)) 443 return PTR_ERR(seg); 444 445 if (encode_rdma_segment(xdr, mr) < 0) 446 return -EMSGSIZE; 447 448 trace_xprtrdma_chunk_write(rqst->rq_task, mr, nsegs); 449 r_xprt->rx_stats.write_chunk_count++; 450 r_xprt->rx_stats.total_rdma_request += mr->mr_length; 451 nchunks++; 452 nsegs -= mr->mr_nents; 453 } while (nsegs); 454 455 /* Update count of segments in this Write chunk */ 456 *segcount = cpu_to_be32(nchunks); 457 458 done: 459 if (xdr_stream_encode_item_absent(xdr) < 0) 460 return -EMSGSIZE; 461 return 0; 462 } 463 464 /* Register and XDR encode the Reply chunk. Supports encoding an array 465 * of plain segments that belong to a single write (reply) chunk. 466 * 467 * Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64): 468 * 469 * Reply chunk (a counted array): 470 * N elements: 471 * 1 - N - HLOO - HLOO - ... - HLOO 472 * 473 * Returns zero on success, or a negative errno if a failure occurred. 474 * @xdr is advanced to the next position in the stream. 475 */ 476 static int rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, 477 struct rpcrdma_req *req, 478 struct rpc_rqst *rqst, 479 enum rpcrdma_chunktype wtype) 480 { 481 struct xdr_stream *xdr = &req->rl_stream; 482 struct rpcrdma_mr_seg *seg; 483 struct rpcrdma_mr *mr; 484 int nsegs, nchunks; 485 __be32 *segcount; 486 487 if (wtype != rpcrdma_replych) { 488 if (xdr_stream_encode_item_absent(xdr) < 0) 489 return -EMSGSIZE; 490 return 0; 491 } 492 493 seg = req->rl_segments; 494 nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_rcv_buf, 0, wtype, seg); 495 if (nsegs < 0) 496 return nsegs; 497 498 if (xdr_stream_encode_item_present(xdr) < 0) 499 return -EMSGSIZE; 500 segcount = xdr_reserve_space(xdr, sizeof(*segcount)); 501 if (unlikely(!segcount)) 502 return -EMSGSIZE; 503 /* Actual value encoded below */ 504 505 nchunks = 0; 506 do { 507 seg = rpcrdma_mr_prepare(r_xprt, req, seg, nsegs, true, &mr); 508 if (IS_ERR(seg)) 509 return PTR_ERR(seg); 510 511 if (encode_rdma_segment(xdr, mr) < 0) 512 return -EMSGSIZE; 513 514 trace_xprtrdma_chunk_reply(rqst->rq_task, mr, nsegs); 515 r_xprt->rx_stats.reply_chunk_count++; 516 r_xprt->rx_stats.total_rdma_request += mr->mr_length; 517 nchunks++; 518 nsegs -= mr->mr_nents; 519 } while (nsegs); 520 521 /* Update count of segments in the Reply chunk */ 522 *segcount = cpu_to_be32(nchunks); 523 524 return 0; 525 } 526 527 static void rpcrdma_sendctx_done(struct kref *kref) 528 { 529 struct rpcrdma_req *req = 530 container_of(kref, struct rpcrdma_req, rl_kref); 531 struct rpcrdma_rep *rep = req->rl_reply; 532 533 rpcrdma_complete_rqst(rep); 534 rep->rr_rxprt->rx_stats.reply_waits_for_send++; 535 } 536 537 /** 538 * rpcrdma_sendctx_unmap - DMA-unmap Send buffer 539 * @sc: sendctx containing SGEs to unmap 540 * 541 */ 542 void rpcrdma_sendctx_unmap(struct rpcrdma_sendctx *sc) 543 { 544 struct rpcrdma_regbuf *rb = sc->sc_req->rl_sendbuf; 545 struct ib_sge *sge; 546 547 if (!sc->sc_unmap_count) 548 return; 549 550 /* The first two SGEs contain the transport header and 551 * the inline buffer. These are always left mapped so 552 * they can be cheaply re-used. 553 */ 554 for (sge = &sc->sc_sges[2]; sc->sc_unmap_count; 555 ++sge, --sc->sc_unmap_count) 556 ib_dma_unmap_page(rdmab_device(rb), sge->addr, sge->length, 557 DMA_TO_DEVICE); 558 559 kref_put(&sc->sc_req->rl_kref, rpcrdma_sendctx_done); 560 } 561 562 /* Prepare an SGE for the RPC-over-RDMA transport header. 563 */ 564 static void rpcrdma_prepare_hdr_sge(struct rpcrdma_xprt *r_xprt, 565 struct rpcrdma_req *req, u32 len) 566 { 567 struct rpcrdma_sendctx *sc = req->rl_sendctx; 568 struct rpcrdma_regbuf *rb = req->rl_rdmabuf; 569 struct ib_sge *sge = &sc->sc_sges[req->rl_wr.num_sge++]; 570 571 sge->addr = rdmab_addr(rb); 572 sge->length = len; 573 sge->lkey = rdmab_lkey(rb); 574 575 ib_dma_sync_single_for_device(rdmab_device(rb), sge->addr, sge->length, 576 DMA_TO_DEVICE); 577 } 578 579 /* The head iovec is straightforward, as it is usually already 580 * DMA-mapped. Sync the content that has changed. 581 */ 582 static bool rpcrdma_prepare_head_iov(struct rpcrdma_xprt *r_xprt, 583 struct rpcrdma_req *req, unsigned int len) 584 { 585 struct rpcrdma_sendctx *sc = req->rl_sendctx; 586 struct ib_sge *sge = &sc->sc_sges[req->rl_wr.num_sge++]; 587 struct rpcrdma_regbuf *rb = req->rl_sendbuf; 588 589 if (!rpcrdma_regbuf_dma_map(r_xprt, rb)) 590 return false; 591 592 sge->addr = rdmab_addr(rb); 593 sge->length = len; 594 sge->lkey = rdmab_lkey(rb); 595 596 ib_dma_sync_single_for_device(rdmab_device(rb), sge->addr, sge->length, 597 DMA_TO_DEVICE); 598 return true; 599 } 600 601 /* If there is a page list present, DMA map and prepare an 602 * SGE for each page to be sent. 603 */ 604 static bool rpcrdma_prepare_pagelist(struct rpcrdma_req *req, 605 struct xdr_buf *xdr) 606 { 607 struct rpcrdma_sendctx *sc = req->rl_sendctx; 608 struct rpcrdma_regbuf *rb = req->rl_sendbuf; 609 unsigned int page_base, len, remaining; 610 struct page **ppages; 611 struct ib_sge *sge; 612 613 ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT); 614 page_base = offset_in_page(xdr->page_base); 615 remaining = xdr->page_len; 616 while (remaining) { 617 sge = &sc->sc_sges[req->rl_wr.num_sge++]; 618 len = min_t(unsigned int, PAGE_SIZE - page_base, remaining); 619 sge->addr = ib_dma_map_page(rdmab_device(rb), *ppages, 620 page_base, len, DMA_TO_DEVICE); 621 if (ib_dma_mapping_error(rdmab_device(rb), sge->addr)) 622 goto out_mapping_err; 623 624 sge->length = len; 625 sge->lkey = rdmab_lkey(rb); 626 627 sc->sc_unmap_count++; 628 ppages++; 629 remaining -= len; 630 page_base = 0; 631 } 632 633 return true; 634 635 out_mapping_err: 636 trace_xprtrdma_dma_maperr(sge->addr); 637 return false; 638 } 639 640 /* The tail iovec may include an XDR pad for the page list, 641 * as well as additional content, and may not reside in the 642 * same page as the head iovec. 643 */ 644 static bool rpcrdma_prepare_tail_iov(struct rpcrdma_req *req, 645 struct xdr_buf *xdr, 646 unsigned int page_base, unsigned int len) 647 { 648 struct rpcrdma_sendctx *sc = req->rl_sendctx; 649 struct ib_sge *sge = &sc->sc_sges[req->rl_wr.num_sge++]; 650 struct rpcrdma_regbuf *rb = req->rl_sendbuf; 651 struct page *page = virt_to_page(xdr->tail[0].iov_base); 652 653 sge->addr = ib_dma_map_page(rdmab_device(rb), page, page_base, len, 654 DMA_TO_DEVICE); 655 if (ib_dma_mapping_error(rdmab_device(rb), sge->addr)) 656 goto out_mapping_err; 657 658 sge->length = len; 659 sge->lkey = rdmab_lkey(rb); 660 ++sc->sc_unmap_count; 661 return true; 662 663 out_mapping_err: 664 trace_xprtrdma_dma_maperr(sge->addr); 665 return false; 666 } 667 668 /* Copy the tail to the end of the head buffer. 669 */ 670 static void rpcrdma_pullup_tail_iov(struct rpcrdma_xprt *r_xprt, 671 struct rpcrdma_req *req, 672 struct xdr_buf *xdr) 673 { 674 unsigned char *dst; 675 676 dst = (unsigned char *)xdr->head[0].iov_base; 677 dst += xdr->head[0].iov_len + xdr->page_len; 678 memmove(dst, xdr->tail[0].iov_base, xdr->tail[0].iov_len); 679 r_xprt->rx_stats.pullup_copy_count += xdr->tail[0].iov_len; 680 } 681 682 /* Copy pagelist content into the head buffer. 683 */ 684 static void rpcrdma_pullup_pagelist(struct rpcrdma_xprt *r_xprt, 685 struct rpcrdma_req *req, 686 struct xdr_buf *xdr) 687 { 688 unsigned int len, page_base, remaining; 689 struct page **ppages; 690 unsigned char *src, *dst; 691 692 dst = (unsigned char *)xdr->head[0].iov_base; 693 dst += xdr->head[0].iov_len; 694 ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT); 695 page_base = offset_in_page(xdr->page_base); 696 remaining = xdr->page_len; 697 while (remaining) { 698 src = page_address(*ppages); 699 src += page_base; 700 len = min_t(unsigned int, PAGE_SIZE - page_base, remaining); 701 memcpy(dst, src, len); 702 r_xprt->rx_stats.pullup_copy_count += len; 703 704 ppages++; 705 dst += len; 706 remaining -= len; 707 page_base = 0; 708 } 709 } 710 711 /* Copy the contents of @xdr into @rl_sendbuf and DMA sync it. 712 * When the head, pagelist, and tail are small, a pull-up copy 713 * is considerably less costly than DMA mapping the components 714 * of @xdr. 715 * 716 * Assumptions: 717 * - the caller has already verified that the total length 718 * of the RPC Call body will fit into @rl_sendbuf. 719 */ 720 static bool rpcrdma_prepare_noch_pullup(struct rpcrdma_xprt *r_xprt, 721 struct rpcrdma_req *req, 722 struct xdr_buf *xdr) 723 { 724 if (unlikely(xdr->tail[0].iov_len)) 725 rpcrdma_pullup_tail_iov(r_xprt, req, xdr); 726 727 if (unlikely(xdr->page_len)) 728 rpcrdma_pullup_pagelist(r_xprt, req, xdr); 729 730 /* The whole RPC message resides in the head iovec now */ 731 return rpcrdma_prepare_head_iov(r_xprt, req, xdr->len); 732 } 733 734 static bool rpcrdma_prepare_noch_mapped(struct rpcrdma_xprt *r_xprt, 735 struct rpcrdma_req *req, 736 struct xdr_buf *xdr) 737 { 738 struct kvec *tail = &xdr->tail[0]; 739 740 if (!rpcrdma_prepare_head_iov(r_xprt, req, xdr->head[0].iov_len)) 741 return false; 742 if (xdr->page_len) 743 if (!rpcrdma_prepare_pagelist(req, xdr)) 744 return false; 745 if (tail->iov_len) 746 if (!rpcrdma_prepare_tail_iov(req, xdr, 747 offset_in_page(tail->iov_base), 748 tail->iov_len)) 749 return false; 750 751 if (req->rl_sendctx->sc_unmap_count) 752 kref_get(&req->rl_kref); 753 return true; 754 } 755 756 static bool rpcrdma_prepare_readch(struct rpcrdma_xprt *r_xprt, 757 struct rpcrdma_req *req, 758 struct xdr_buf *xdr) 759 { 760 if (!rpcrdma_prepare_head_iov(r_xprt, req, xdr->head[0].iov_len)) 761 return false; 762 763 /* If there is a Read chunk, the page list is being handled 764 * via explicit RDMA, and thus is skipped here. 765 */ 766 767 /* Do not include the tail if it is only an XDR pad */ 768 if (xdr->tail[0].iov_len > 3) { 769 unsigned int page_base, len; 770 771 /* If the content in the page list is an odd length, 772 * xdr_write_pages() adds a pad at the beginning of 773 * the tail iovec. Force the tail's non-pad content to 774 * land at the next XDR position in the Send message. 775 */ 776 page_base = offset_in_page(xdr->tail[0].iov_base); 777 len = xdr->tail[0].iov_len; 778 page_base += len & 3; 779 len -= len & 3; 780 if (!rpcrdma_prepare_tail_iov(req, xdr, page_base, len)) 781 return false; 782 kref_get(&req->rl_kref); 783 } 784 785 return true; 786 } 787 788 /** 789 * rpcrdma_prepare_send_sges - Construct SGEs for a Send WR 790 * @r_xprt: controlling transport 791 * @req: context of RPC Call being marshalled 792 * @hdrlen: size of transport header, in bytes 793 * @xdr: xdr_buf containing RPC Call 794 * @rtype: chunk type being encoded 795 * 796 * Returns 0 on success; otherwise a negative errno is returned. 797 */ 798 inline int rpcrdma_prepare_send_sges(struct rpcrdma_xprt *r_xprt, 799 struct rpcrdma_req *req, u32 hdrlen, 800 struct xdr_buf *xdr, 801 enum rpcrdma_chunktype rtype) 802 { 803 int ret; 804 805 ret = -EAGAIN; 806 req->rl_sendctx = rpcrdma_sendctx_get_locked(r_xprt); 807 if (!req->rl_sendctx) 808 goto out_nosc; 809 req->rl_sendctx->sc_unmap_count = 0; 810 req->rl_sendctx->sc_req = req; 811 kref_init(&req->rl_kref); 812 req->rl_wr.wr_cqe = &req->rl_sendctx->sc_cqe; 813 req->rl_wr.sg_list = req->rl_sendctx->sc_sges; 814 req->rl_wr.num_sge = 0; 815 req->rl_wr.opcode = IB_WR_SEND; 816 817 rpcrdma_prepare_hdr_sge(r_xprt, req, hdrlen); 818 819 ret = -EIO; 820 switch (rtype) { 821 case rpcrdma_noch_pullup: 822 if (!rpcrdma_prepare_noch_pullup(r_xprt, req, xdr)) 823 goto out_unmap; 824 break; 825 case rpcrdma_noch_mapped: 826 if (!rpcrdma_prepare_noch_mapped(r_xprt, req, xdr)) 827 goto out_unmap; 828 break; 829 case rpcrdma_readch: 830 if (!rpcrdma_prepare_readch(r_xprt, req, xdr)) 831 goto out_unmap; 832 break; 833 case rpcrdma_areadch: 834 break; 835 default: 836 goto out_unmap; 837 } 838 839 return 0; 840 841 out_unmap: 842 rpcrdma_sendctx_unmap(req->rl_sendctx); 843 out_nosc: 844 trace_xprtrdma_prepsend_failed(&req->rl_slot, ret); 845 return ret; 846 } 847 848 /** 849 * rpcrdma_marshal_req - Marshal and send one RPC request 850 * @r_xprt: controlling transport 851 * @rqst: RPC request to be marshaled 852 * 853 * For the RPC in "rqst", this function: 854 * - Chooses the transfer mode (eg., RDMA_MSG or RDMA_NOMSG) 855 * - Registers Read, Write, and Reply chunks 856 * - Constructs the transport header 857 * - Posts a Send WR to send the transport header and request 858 * 859 * Returns: 860 * %0 if the RPC was sent successfully, 861 * %-ENOTCONN if the connection was lost, 862 * %-EAGAIN if the caller should call again with the same arguments, 863 * %-ENOBUFS if the caller should call again after a delay, 864 * %-EMSGSIZE if the transport header is too small, 865 * %-EIO if a permanent problem occurred while marshaling. 866 */ 867 int 868 rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst) 869 { 870 struct rpcrdma_req *req = rpcr_to_rdmar(rqst); 871 struct xdr_stream *xdr = &req->rl_stream; 872 enum rpcrdma_chunktype rtype, wtype; 873 struct xdr_buf *buf = &rqst->rq_snd_buf; 874 bool ddp_allowed; 875 __be32 *p; 876 int ret; 877 878 rpcrdma_set_xdrlen(&req->rl_hdrbuf, 0); 879 xdr_init_encode(xdr, &req->rl_hdrbuf, rdmab_data(req->rl_rdmabuf), 880 rqst); 881 882 /* Fixed header fields */ 883 ret = -EMSGSIZE; 884 p = xdr_reserve_space(xdr, 4 * sizeof(*p)); 885 if (!p) 886 goto out_err; 887 *p++ = rqst->rq_xid; 888 *p++ = rpcrdma_version; 889 *p++ = r_xprt->rx_buf.rb_max_requests; 890 891 /* When the ULP employs a GSS flavor that guarantees integrity 892 * or privacy, direct data placement of individual data items 893 * is not allowed. 894 */ 895 ddp_allowed = !test_bit(RPCAUTH_AUTH_DATATOUCH, 896 &rqst->rq_cred->cr_auth->au_flags); 897 898 /* 899 * Chunks needed for results? 900 * 901 * o If the expected result is under the inline threshold, all ops 902 * return as inline. 903 * o Large read ops return data as write chunk(s), header as 904 * inline. 905 * o Large non-read ops return as a single reply chunk. 906 */ 907 if (rpcrdma_results_inline(r_xprt, rqst)) 908 wtype = rpcrdma_noch; 909 else if ((ddp_allowed && rqst->rq_rcv_buf.flags & XDRBUF_READ) && 910 rpcrdma_nonpayload_inline(r_xprt, rqst)) 911 wtype = rpcrdma_writech; 912 else 913 wtype = rpcrdma_replych; 914 915 /* 916 * Chunks needed for arguments? 917 * 918 * o If the total request is under the inline threshold, all ops 919 * are sent as inline. 920 * o Large write ops transmit data as read chunk(s), header as 921 * inline. 922 * o Large non-write ops are sent with the entire message as a 923 * single read chunk (protocol 0-position special case). 924 * 925 * This assumes that the upper layer does not present a request 926 * that both has a data payload, and whose non-data arguments 927 * by themselves are larger than the inline threshold. 928 */ 929 if (rpcrdma_args_inline(r_xprt, rqst)) { 930 *p++ = rdma_msg; 931 rtype = buf->len < rdmab_length(req->rl_sendbuf) ? 932 rpcrdma_noch_pullup : rpcrdma_noch_mapped; 933 } else if (ddp_allowed && buf->flags & XDRBUF_WRITE) { 934 *p++ = rdma_msg; 935 rtype = rpcrdma_readch; 936 } else { 937 r_xprt->rx_stats.nomsg_call_count++; 938 *p++ = rdma_nomsg; 939 rtype = rpcrdma_areadch; 940 } 941 942 /* This implementation supports the following combinations 943 * of chunk lists in one RPC-over-RDMA Call message: 944 * 945 * - Read list 946 * - Write list 947 * - Reply chunk 948 * - Read list + Reply chunk 949 * 950 * It might not yet support the following combinations: 951 * 952 * - Read list + Write list 953 * 954 * It does not support the following combinations: 955 * 956 * - Write list + Reply chunk 957 * - Read list + Write list + Reply chunk 958 * 959 * This implementation supports only a single chunk in each 960 * Read or Write list. Thus for example the client cannot 961 * send a Call message with a Position Zero Read chunk and a 962 * regular Read chunk at the same time. 963 */ 964 ret = rpcrdma_encode_read_list(r_xprt, req, rqst, rtype); 965 if (ret) 966 goto out_err; 967 ret = rpcrdma_encode_write_list(r_xprt, req, rqst, wtype); 968 if (ret) 969 goto out_err; 970 ret = rpcrdma_encode_reply_chunk(r_xprt, req, rqst, wtype); 971 if (ret) 972 goto out_err; 973 974 ret = rpcrdma_prepare_send_sges(r_xprt, req, req->rl_hdrbuf.len, 975 buf, rtype); 976 if (ret) 977 goto out_err; 978 979 trace_xprtrdma_marshal(req, rtype, wtype); 980 return 0; 981 982 out_err: 983 trace_xprtrdma_marshal_failed(rqst, ret); 984 r_xprt->rx_stats.failed_marshal_count++; 985 frwr_reset(req); 986 return ret; 987 } 988 989 static void __rpcrdma_update_cwnd_locked(struct rpc_xprt *xprt, 990 struct rpcrdma_buffer *buf, 991 u32 grant) 992 { 993 buf->rb_credits = grant; 994 xprt->cwnd = grant << RPC_CWNDSHIFT; 995 } 996 997 static void rpcrdma_update_cwnd(struct rpcrdma_xprt *r_xprt, u32 grant) 998 { 999 struct rpc_xprt *xprt = &r_xprt->rx_xprt; 1000 1001 spin_lock(&xprt->transport_lock); 1002 __rpcrdma_update_cwnd_locked(xprt, &r_xprt->rx_buf, grant); 1003 spin_unlock(&xprt->transport_lock); 1004 } 1005 1006 /** 1007 * rpcrdma_reset_cwnd - Reset the xprt's congestion window 1008 * @r_xprt: controlling transport instance 1009 * 1010 * Prepare @r_xprt for the next connection by reinitializing 1011 * its credit grant to one (see RFC 8166, Section 3.3.3). 1012 */ 1013 void rpcrdma_reset_cwnd(struct rpcrdma_xprt *r_xprt) 1014 { 1015 struct rpc_xprt *xprt = &r_xprt->rx_xprt; 1016 1017 spin_lock(&xprt->transport_lock); 1018 xprt->cong = 0; 1019 __rpcrdma_update_cwnd_locked(xprt, &r_xprt->rx_buf, 1); 1020 spin_unlock(&xprt->transport_lock); 1021 } 1022 1023 /** 1024 * rpcrdma_inline_fixup - Scatter inline received data into rqst's iovecs 1025 * @rqst: controlling RPC request 1026 * @srcp: points to RPC message payload in receive buffer 1027 * @copy_len: remaining length of receive buffer content 1028 * @pad: Write chunk pad bytes needed (zero for pure inline) 1029 * 1030 * The upper layer has set the maximum number of bytes it can 1031 * receive in each component of rq_rcv_buf. These values are set in 1032 * the head.iov_len, page_len, tail.iov_len, and buflen fields. 1033 * 1034 * Unlike the TCP equivalent (xdr_partial_copy_from_skb), in 1035 * many cases this function simply updates iov_base pointers in 1036 * rq_rcv_buf to point directly to the received reply data, to 1037 * avoid copying reply data. 1038 * 1039 * Returns the count of bytes which had to be memcopied. 1040 */ 1041 static unsigned long 1042 rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad) 1043 { 1044 unsigned long fixup_copy_count; 1045 int i, npages, curlen; 1046 char *destp; 1047 struct page **ppages; 1048 int page_base; 1049 1050 /* The head iovec is redirected to the RPC reply message 1051 * in the receive buffer, to avoid a memcopy. 1052 */ 1053 rqst->rq_rcv_buf.head[0].iov_base = srcp; 1054 rqst->rq_private_buf.head[0].iov_base = srcp; 1055 1056 /* The contents of the receive buffer that follow 1057 * head.iov_len bytes are copied into the page list. 1058 */ 1059 curlen = rqst->rq_rcv_buf.head[0].iov_len; 1060 if (curlen > copy_len) 1061 curlen = copy_len; 1062 srcp += curlen; 1063 copy_len -= curlen; 1064 1065 ppages = rqst->rq_rcv_buf.pages + 1066 (rqst->rq_rcv_buf.page_base >> PAGE_SHIFT); 1067 page_base = offset_in_page(rqst->rq_rcv_buf.page_base); 1068 fixup_copy_count = 0; 1069 if (copy_len && rqst->rq_rcv_buf.page_len) { 1070 int pagelist_len; 1071 1072 pagelist_len = rqst->rq_rcv_buf.page_len; 1073 if (pagelist_len > copy_len) 1074 pagelist_len = copy_len; 1075 npages = PAGE_ALIGN(page_base + pagelist_len) >> PAGE_SHIFT; 1076 for (i = 0; i < npages; i++) { 1077 curlen = PAGE_SIZE - page_base; 1078 if (curlen > pagelist_len) 1079 curlen = pagelist_len; 1080 1081 destp = kmap_atomic(ppages[i]); 1082 memcpy(destp + page_base, srcp, curlen); 1083 flush_dcache_page(ppages[i]); 1084 kunmap_atomic(destp); 1085 srcp += curlen; 1086 copy_len -= curlen; 1087 fixup_copy_count += curlen; 1088 pagelist_len -= curlen; 1089 if (!pagelist_len) 1090 break; 1091 page_base = 0; 1092 } 1093 1094 /* Implicit padding for the last segment in a Write 1095 * chunk is inserted inline at the front of the tail 1096 * iovec. The upper layer ignores the content of 1097 * the pad. Simply ensure inline content in the tail 1098 * that follows the Write chunk is properly aligned. 1099 */ 1100 if (pad) 1101 srcp -= pad; 1102 } 1103 1104 /* The tail iovec is redirected to the remaining data 1105 * in the receive buffer, to avoid a memcopy. 1106 */ 1107 if (copy_len || pad) { 1108 rqst->rq_rcv_buf.tail[0].iov_base = srcp; 1109 rqst->rq_private_buf.tail[0].iov_base = srcp; 1110 } 1111 1112 if (fixup_copy_count) 1113 trace_xprtrdma_fixup(rqst, fixup_copy_count); 1114 return fixup_copy_count; 1115 } 1116 1117 /* By convention, backchannel calls arrive via rdma_msg type 1118 * messages, and never populate the chunk lists. This makes 1119 * the RPC/RDMA header small and fixed in size, so it is 1120 * straightforward to check the RPC header's direction field. 1121 */ 1122 static bool 1123 rpcrdma_is_bcall(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep) 1124 #if defined(CONFIG_SUNRPC_BACKCHANNEL) 1125 { 1126 struct xdr_stream *xdr = &rep->rr_stream; 1127 __be32 *p; 1128 1129 if (rep->rr_proc != rdma_msg) 1130 return false; 1131 1132 /* Peek at stream contents without advancing. */ 1133 p = xdr_inline_decode(xdr, 0); 1134 1135 /* Chunk lists */ 1136 if (*p++ != xdr_zero) 1137 return false; 1138 if (*p++ != xdr_zero) 1139 return false; 1140 if (*p++ != xdr_zero) 1141 return false; 1142 1143 /* RPC header */ 1144 if (*p++ != rep->rr_xid) 1145 return false; 1146 if (*p != cpu_to_be32(RPC_CALL)) 1147 return false; 1148 1149 /* Now that we are sure this is a backchannel call, 1150 * advance to the RPC header. 1151 */ 1152 p = xdr_inline_decode(xdr, 3 * sizeof(*p)); 1153 if (unlikely(!p)) 1154 goto out_short; 1155 1156 rpcrdma_bc_receive_call(r_xprt, rep); 1157 return true; 1158 1159 out_short: 1160 pr_warn("RPC/RDMA short backward direction call\n"); 1161 return true; 1162 } 1163 #else /* CONFIG_SUNRPC_BACKCHANNEL */ 1164 { 1165 return false; 1166 } 1167 #endif /* CONFIG_SUNRPC_BACKCHANNEL */ 1168 1169 static int decode_rdma_segment(struct xdr_stream *xdr, u32 *length) 1170 { 1171 u32 handle; 1172 u64 offset; 1173 __be32 *p; 1174 1175 p = xdr_inline_decode(xdr, 4 * sizeof(*p)); 1176 if (unlikely(!p)) 1177 return -EIO; 1178 1179 handle = be32_to_cpup(p++); 1180 *length = be32_to_cpup(p++); 1181 xdr_decode_hyper(p, &offset); 1182 1183 trace_xprtrdma_decode_seg(handle, *length, offset); 1184 return 0; 1185 } 1186 1187 static int decode_write_chunk(struct xdr_stream *xdr, u32 *length) 1188 { 1189 u32 segcount, seglength; 1190 __be32 *p; 1191 1192 p = xdr_inline_decode(xdr, sizeof(*p)); 1193 if (unlikely(!p)) 1194 return -EIO; 1195 1196 *length = 0; 1197 segcount = be32_to_cpup(p); 1198 while (segcount--) { 1199 if (decode_rdma_segment(xdr, &seglength)) 1200 return -EIO; 1201 *length += seglength; 1202 } 1203 1204 return 0; 1205 } 1206 1207 /* In RPC-over-RDMA Version One replies, a Read list is never 1208 * expected. This decoder is a stub that returns an error if 1209 * a Read list is present. 1210 */ 1211 static int decode_read_list(struct xdr_stream *xdr) 1212 { 1213 __be32 *p; 1214 1215 p = xdr_inline_decode(xdr, sizeof(*p)); 1216 if (unlikely(!p)) 1217 return -EIO; 1218 if (unlikely(*p != xdr_zero)) 1219 return -EIO; 1220 return 0; 1221 } 1222 1223 /* Supports only one Write chunk in the Write list 1224 */ 1225 static int decode_write_list(struct xdr_stream *xdr, u32 *length) 1226 { 1227 u32 chunklen; 1228 bool first; 1229 __be32 *p; 1230 1231 *length = 0; 1232 first = true; 1233 do { 1234 p = xdr_inline_decode(xdr, sizeof(*p)); 1235 if (unlikely(!p)) 1236 return -EIO; 1237 if (*p == xdr_zero) 1238 break; 1239 if (!first) 1240 return -EIO; 1241 1242 if (decode_write_chunk(xdr, &chunklen)) 1243 return -EIO; 1244 *length += chunklen; 1245 first = false; 1246 } while (true); 1247 return 0; 1248 } 1249 1250 static int decode_reply_chunk(struct xdr_stream *xdr, u32 *length) 1251 { 1252 __be32 *p; 1253 1254 p = xdr_inline_decode(xdr, sizeof(*p)); 1255 if (unlikely(!p)) 1256 return -EIO; 1257 1258 *length = 0; 1259 if (*p != xdr_zero) 1260 if (decode_write_chunk(xdr, length)) 1261 return -EIO; 1262 return 0; 1263 } 1264 1265 static int 1266 rpcrdma_decode_msg(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep, 1267 struct rpc_rqst *rqst) 1268 { 1269 struct xdr_stream *xdr = &rep->rr_stream; 1270 u32 writelist, replychunk, rpclen; 1271 char *base; 1272 1273 /* Decode the chunk lists */ 1274 if (decode_read_list(xdr)) 1275 return -EIO; 1276 if (decode_write_list(xdr, &writelist)) 1277 return -EIO; 1278 if (decode_reply_chunk(xdr, &replychunk)) 1279 return -EIO; 1280 1281 /* RDMA_MSG sanity checks */ 1282 if (unlikely(replychunk)) 1283 return -EIO; 1284 1285 /* Build the RPC reply's Payload stream in rqst->rq_rcv_buf */ 1286 base = (char *)xdr_inline_decode(xdr, 0); 1287 rpclen = xdr_stream_remaining(xdr); 1288 r_xprt->rx_stats.fixup_copy_count += 1289 rpcrdma_inline_fixup(rqst, base, rpclen, writelist & 3); 1290 1291 r_xprt->rx_stats.total_rdma_reply += writelist; 1292 return rpclen + xdr_align_size(writelist); 1293 } 1294 1295 static noinline int 1296 rpcrdma_decode_nomsg(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep) 1297 { 1298 struct xdr_stream *xdr = &rep->rr_stream; 1299 u32 writelist, replychunk; 1300 1301 /* Decode the chunk lists */ 1302 if (decode_read_list(xdr)) 1303 return -EIO; 1304 if (decode_write_list(xdr, &writelist)) 1305 return -EIO; 1306 if (decode_reply_chunk(xdr, &replychunk)) 1307 return -EIO; 1308 1309 /* RDMA_NOMSG sanity checks */ 1310 if (unlikely(writelist)) 1311 return -EIO; 1312 if (unlikely(!replychunk)) 1313 return -EIO; 1314 1315 /* Reply chunk buffer already is the reply vector */ 1316 r_xprt->rx_stats.total_rdma_reply += replychunk; 1317 return replychunk; 1318 } 1319 1320 static noinline int 1321 rpcrdma_decode_error(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep, 1322 struct rpc_rqst *rqst) 1323 { 1324 struct xdr_stream *xdr = &rep->rr_stream; 1325 __be32 *p; 1326 1327 p = xdr_inline_decode(xdr, sizeof(*p)); 1328 if (unlikely(!p)) 1329 return -EIO; 1330 1331 switch (*p) { 1332 case err_vers: 1333 p = xdr_inline_decode(xdr, 2 * sizeof(*p)); 1334 if (!p) 1335 break; 1336 dprintk("RPC: %s: server reports " 1337 "version error (%u-%u), xid %08x\n", __func__, 1338 be32_to_cpup(p), be32_to_cpu(*(p + 1)), 1339 be32_to_cpu(rep->rr_xid)); 1340 break; 1341 case err_chunk: 1342 dprintk("RPC: %s: server reports " 1343 "header decoding error, xid %08x\n", __func__, 1344 be32_to_cpu(rep->rr_xid)); 1345 break; 1346 default: 1347 dprintk("RPC: %s: server reports " 1348 "unrecognized error %d, xid %08x\n", __func__, 1349 be32_to_cpup(p), be32_to_cpu(rep->rr_xid)); 1350 } 1351 1352 return -EIO; 1353 } 1354 1355 /* Perform XID lookup, reconstruction of the RPC reply, and 1356 * RPC completion while holding the transport lock to ensure 1357 * the rep, rqst, and rq_task pointers remain stable. 1358 */ 1359 void rpcrdma_complete_rqst(struct rpcrdma_rep *rep) 1360 { 1361 struct rpcrdma_xprt *r_xprt = rep->rr_rxprt; 1362 struct rpc_xprt *xprt = &r_xprt->rx_xprt; 1363 struct rpc_rqst *rqst = rep->rr_rqst; 1364 int status; 1365 1366 switch (rep->rr_proc) { 1367 case rdma_msg: 1368 status = rpcrdma_decode_msg(r_xprt, rep, rqst); 1369 break; 1370 case rdma_nomsg: 1371 status = rpcrdma_decode_nomsg(r_xprt, rep); 1372 break; 1373 case rdma_error: 1374 status = rpcrdma_decode_error(r_xprt, rep, rqst); 1375 break; 1376 default: 1377 status = -EIO; 1378 } 1379 if (status < 0) 1380 goto out_badheader; 1381 1382 out: 1383 spin_lock(&xprt->queue_lock); 1384 xprt_complete_rqst(rqst->rq_task, status); 1385 xprt_unpin_rqst(rqst); 1386 spin_unlock(&xprt->queue_lock); 1387 return; 1388 1389 out_badheader: 1390 trace_xprtrdma_reply_hdr(rep); 1391 r_xprt->rx_stats.bad_reply_count++; 1392 rqst->rq_task->tk_status = status; 1393 status = 0; 1394 goto out; 1395 } 1396 1397 static void rpcrdma_reply_done(struct kref *kref) 1398 { 1399 struct rpcrdma_req *req = 1400 container_of(kref, struct rpcrdma_req, rl_kref); 1401 1402 rpcrdma_complete_rqst(req->rl_reply); 1403 } 1404 1405 /** 1406 * rpcrdma_reply_handler - Process received RPC/RDMA messages 1407 * @rep: Incoming rpcrdma_rep object to process 1408 * 1409 * Errors must result in the RPC task either being awakened, or 1410 * allowed to timeout, to discover the errors at that time. 1411 */ 1412 void rpcrdma_reply_handler(struct rpcrdma_rep *rep) 1413 { 1414 struct rpcrdma_xprt *r_xprt = rep->rr_rxprt; 1415 struct rpc_xprt *xprt = &r_xprt->rx_xprt; 1416 struct rpcrdma_buffer *buf = &r_xprt->rx_buf; 1417 struct rpcrdma_req *req; 1418 struct rpc_rqst *rqst; 1419 u32 credits; 1420 __be32 *p; 1421 1422 /* Any data means we had a useful conversation, so 1423 * then we don't need to delay the next reconnect. 1424 */ 1425 if (xprt->reestablish_timeout) 1426 xprt->reestablish_timeout = 0; 1427 1428 /* Fixed transport header fields */ 1429 xdr_init_decode(&rep->rr_stream, &rep->rr_hdrbuf, 1430 rep->rr_hdrbuf.head[0].iov_base, NULL); 1431 p = xdr_inline_decode(&rep->rr_stream, 4 * sizeof(*p)); 1432 if (unlikely(!p)) 1433 goto out_shortreply; 1434 rep->rr_xid = *p++; 1435 rep->rr_vers = *p++; 1436 credits = be32_to_cpu(*p++); 1437 rep->rr_proc = *p++; 1438 1439 if (rep->rr_vers != rpcrdma_version) 1440 goto out_badversion; 1441 1442 if (rpcrdma_is_bcall(r_xprt, rep)) 1443 return; 1444 1445 /* Match incoming rpcrdma_rep to an rpcrdma_req to 1446 * get context for handling any incoming chunks. 1447 */ 1448 spin_lock(&xprt->queue_lock); 1449 rqst = xprt_lookup_rqst(xprt, rep->rr_xid); 1450 if (!rqst) 1451 goto out_norqst; 1452 xprt_pin_rqst(rqst); 1453 spin_unlock(&xprt->queue_lock); 1454 1455 if (credits == 0) 1456 credits = 1; /* don't deadlock */ 1457 else if (credits > r_xprt->rx_ep->re_max_requests) 1458 credits = r_xprt->rx_ep->re_max_requests; 1459 if (buf->rb_credits != credits) 1460 rpcrdma_update_cwnd(r_xprt, credits); 1461 rpcrdma_post_recvs(r_xprt, false); 1462 1463 req = rpcr_to_rdmar(rqst); 1464 if (req->rl_reply) { 1465 trace_xprtrdma_leaked_rep(rqst, req->rl_reply); 1466 rpcrdma_recv_buffer_put(req->rl_reply); 1467 } 1468 req->rl_reply = rep; 1469 rep->rr_rqst = rqst; 1470 1471 trace_xprtrdma_reply(rqst->rq_task, rep, req, credits); 1472 1473 if (rep->rr_wc_flags & IB_WC_WITH_INVALIDATE) 1474 frwr_reminv(rep, &req->rl_registered); 1475 if (!list_empty(&req->rl_registered)) 1476 frwr_unmap_async(r_xprt, req); 1477 /* LocalInv completion will complete the RPC */ 1478 else 1479 kref_put(&req->rl_kref, rpcrdma_reply_done); 1480 return; 1481 1482 out_badversion: 1483 trace_xprtrdma_reply_vers(rep); 1484 goto out; 1485 1486 out_norqst: 1487 spin_unlock(&xprt->queue_lock); 1488 trace_xprtrdma_reply_rqst(rep); 1489 goto out; 1490 1491 out_shortreply: 1492 trace_xprtrdma_reply_short(rep); 1493 1494 out: 1495 rpcrdma_recv_buffer_put(rep); 1496 } 1497