1 /* 2 * Copyright (c) 2016, 2017 Oracle. All rights reserved. 3 * Copyright (c) 2014 Open Grid Computing, Inc. All rights reserved. 4 * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved. 5 * 6 * This software is available to you under a choice of one of two 7 * licenses. You may choose to be licensed under the terms of the GNU 8 * General Public License (GPL) Version 2, available from the file 9 * COPYING in the main directory of this source tree, or the BSD-type 10 * license below: 11 * 12 * Redistribution and use in source and binary forms, with or without 13 * modification, are permitted provided that the following conditions 14 * are met: 15 * 16 * Redistributions of source code must retain the above copyright 17 * notice, this list of conditions and the following disclaimer. 18 * 19 * Redistributions in binary form must reproduce the above 20 * copyright notice, this list of conditions and the following 21 * disclaimer in the documentation and/or other materials provided 22 * with the distribution. 23 * 24 * Neither the name of the Network Appliance, Inc. nor the names of 25 * its contributors may be used to endorse or promote products 26 * derived from this software without specific prior written 27 * permission. 28 * 29 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 30 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 31 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 32 * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT 33 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 34 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 35 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 36 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 37 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 38 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 39 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 40 * 41 * Author: Tom Tucker <tom@opengridcomputing.com> 42 */ 43 44 /* Operation 45 * 46 * The main entry point is svc_rdma_recvfrom. This is called from 47 * svc_recv when the transport indicates there is incoming data to 48 * be read. "Data Ready" is signaled when an RDMA Receive completes, 49 * or when a set of RDMA Reads complete. 50 * 51 * An svc_rqst is passed in. This structure contains an array of 52 * free pages (rq_pages) that will contain the incoming RPC message. 53 * 54 * Short messages are moved directly into svc_rqst::rq_arg, and 55 * the RPC Call is ready to be processed by the Upper Layer. 56 * svc_rdma_recvfrom returns the length of the RPC Call message, 57 * completing the reception of the RPC Call. 58 * 59 * However, when an incoming message has Read chunks, 60 * svc_rdma_recvfrom must post RDMA Reads to pull the RPC Call's 61 * data payload from the client. svc_rdma_recvfrom sets up the 62 * RDMA Reads using pages in svc_rqst::rq_pages, which are 63 * transferred to an svc_rdma_op_ctxt for the duration of the 64 * I/O. svc_rdma_recvfrom then returns zero, since the RPC message 65 * is still not yet ready. 66 * 67 * When the Read chunk payloads have become available on the 68 * server, "Data Ready" is raised again, and svc_recv calls 69 * svc_rdma_recvfrom again. 
This second call may use a different 70 * svc_rqst than the first one, thus any information that needs 71 * to be preserved across these two calls is kept in an 72 * svc_rdma_op_ctxt. 73 * 74 * The second call to svc_rdma_recvfrom performs final assembly 75 * of the RPC Call message, using the RDMA Read sink pages kept in 76 * the svc_rdma_op_ctxt. The xdr_buf is copied from the 77 * svc_rdma_op_ctxt to the second svc_rqst. The second call returns 78 * the length of the completed RPC Call message. 79 * 80 * Page Management 81 * 82 * Pages under I/O must be transferred from the first svc_rqst to an 83 * svc_rdma_op_ctxt before the first svc_rdma_recvfrom call returns. 84 * 85 * The first svc_rqst supplies pages for RDMA Reads. These are moved 86 * from rqstp::rq_pages into ctxt::pages. The consumed elements of 87 * the rq_pages array are set to NULL and refilled when the first 88 * svc_rdma_recvfrom call returns. 89 * 90 * During the second svc_rdma_recvfrom call, RDMA Read sink pages 91 * are transferred from the svc_rdma_op_ctxt to the second svc_rqst 92 * (see rdma_read_complete() below). 93 */ 94 95 #include <asm/unaligned.h> 96 #include <rdma/ib_verbs.h> 97 #include <rdma/rdma_cm.h> 98 99 #include <linux/spinlock.h> 100 101 #include <linux/sunrpc/xdr.h> 102 #include <linux/sunrpc/debug.h> 103 #include <linux/sunrpc/rpc_rdma.h> 104 #include <linux/sunrpc/svc_rdma.h> 105 106 #define RPCDBG_FACILITY RPCDBG_SVCXPRT 107 108 /* 109 * Replace the pages in the rq_pages array with the pages from the SGE in 110 * the RDMA_RECV completion. The SGL should contain full pages up until the 111 * last one. 
112 */ 113 static void rdma_build_arg_xdr(struct svc_rqst *rqstp, 114 struct svc_rdma_op_ctxt *ctxt, 115 u32 byte_count) 116 { 117 struct page *page; 118 u32 bc; 119 int sge_no; 120 121 /* Swap the page in the SGE with the page in argpages */ 122 page = ctxt->pages[0]; 123 put_page(rqstp->rq_pages[0]); 124 rqstp->rq_pages[0] = page; 125 126 /* Set up the XDR head */ 127 rqstp->rq_arg.head[0].iov_base = page_address(page); 128 rqstp->rq_arg.head[0].iov_len = 129 min_t(size_t, byte_count, ctxt->sge[0].length); 130 rqstp->rq_arg.len = byte_count; 131 rqstp->rq_arg.buflen = byte_count; 132 133 /* Compute bytes past head in the SGL */ 134 bc = byte_count - rqstp->rq_arg.head[0].iov_len; 135 136 /* If data remains, store it in the pagelist */ 137 rqstp->rq_arg.page_len = bc; 138 rqstp->rq_arg.page_base = 0; 139 140 sge_no = 1; 141 while (bc && sge_no < ctxt->count) { 142 page = ctxt->pages[sge_no]; 143 put_page(rqstp->rq_pages[sge_no]); 144 rqstp->rq_pages[sge_no] = page; 145 bc -= min_t(u32, bc, ctxt->sge[sge_no].length); 146 sge_no++; 147 } 148 rqstp->rq_respages = &rqstp->rq_pages[sge_no]; 149 rqstp->rq_next_page = rqstp->rq_respages + 1; 150 151 /* If not all pages were used from the SGL, free the remaining ones */ 152 bc = sge_no; 153 while (sge_no < ctxt->count) { 154 page = ctxt->pages[sge_no++]; 155 put_page(page); 156 } 157 ctxt->count = bc; 158 159 /* Set up tail */ 160 rqstp->rq_arg.tail[0].iov_base = NULL; 161 rqstp->rq_arg.tail[0].iov_len = 0; 162 } 163 164 /* This accommodates the largest possible Write chunk, 165 * in one segment. 166 */ 167 #define MAX_BYTES_WRITE_SEG ((u32)(RPCSVC_MAXPAGES << PAGE_SHIFT)) 168 169 /* This accommodates the largest possible Position-Zero 170 * Read chunk or Reply chunk, in one segment. 171 */ 172 #define MAX_BYTES_SPECIAL_SEG ((u32)((RPCSVC_MAXPAGES + 2) << PAGE_SHIFT)) 173 174 /* Sanity check the Read list. 175 * 176 * Implementation limits: 177 * - This implementation supports only one Read chunk. 
178 * 179 * Sanity checks: 180 * - Read list does not overflow buffer. 181 * - Segment size limited by largest NFS data payload. 182 * 183 * The segment count is limited to how many segments can 184 * fit in the transport header without overflowing the 185 * buffer. That's about 40 Read segments for a 1KB inline 186 * threshold. 187 * 188 * Returns pointer to the following Write list. 189 */ 190 static __be32 *xdr_check_read_list(__be32 *p, const __be32 *end) 191 { 192 u32 position; 193 bool first; 194 195 first = true; 196 while (*p++ != xdr_zero) { 197 if (first) { 198 position = be32_to_cpup(p++); 199 first = false; 200 } else if (be32_to_cpup(p++) != position) { 201 return NULL; 202 } 203 p++; /* handle */ 204 if (be32_to_cpup(p++) > MAX_BYTES_SPECIAL_SEG) 205 return NULL; 206 p += 2; /* offset */ 207 208 if (p > end) 209 return NULL; 210 } 211 return p; 212 } 213 214 /* The segment count is limited to how many segments can 215 * fit in the transport header without overflowing the 216 * buffer. That's about 60 Write segments for a 1KB inline 217 * threshold. 218 */ 219 static __be32 *xdr_check_write_chunk(__be32 *p, const __be32 *end, 220 u32 maxlen) 221 { 222 u32 i, segcount; 223 224 segcount = be32_to_cpup(p++); 225 for (i = 0; i < segcount; i++) { 226 p++; /* handle */ 227 if (be32_to_cpup(p++) > maxlen) 228 return NULL; 229 p += 2; /* offset */ 230 231 if (p > end) 232 return NULL; 233 } 234 235 return p; 236 } 237 238 /* Sanity check the Write list. 239 * 240 * Implementation limits: 241 * - This implementation supports only one Write chunk. 242 * 243 * Sanity checks: 244 * - Write list does not overflow buffer. 245 * - Segment size limited by largest NFS data payload. 246 * 247 * Returns pointer to the following Reply chunk. 
248 */ 249 static __be32 *xdr_check_write_list(__be32 *p, const __be32 *end) 250 { 251 u32 chcount; 252 253 chcount = 0; 254 while (*p++ != xdr_zero) { 255 p = xdr_check_write_chunk(p, end, MAX_BYTES_WRITE_SEG); 256 if (!p) 257 return NULL; 258 if (chcount++ > 1) 259 return NULL; 260 } 261 return p; 262 } 263 264 /* Sanity check the Reply chunk. 265 * 266 * Sanity checks: 267 * - Reply chunk does not overflow buffer. 268 * - Segment size limited by largest NFS data payload. 269 * 270 * Returns pointer to the following RPC header. 271 */ 272 static __be32 *xdr_check_reply_chunk(__be32 *p, const __be32 *end) 273 { 274 if (*p++ != xdr_zero) { 275 p = xdr_check_write_chunk(p, end, MAX_BYTES_SPECIAL_SEG); 276 if (!p) 277 return NULL; 278 } 279 return p; 280 } 281 282 /* On entry, xdr->head[0].iov_base points to first byte in the 283 * RPC-over-RDMA header. 284 * 285 * On successful exit, head[0] points to first byte past the 286 * RPC-over-RDMA header. For RDMA_MSG, this is the RPC message. 287 * The length of the RPC-over-RDMA header is returned. 288 * 289 * Assumptions: 290 * - The transport header is entirely contained in the head iovec. 
291 */ 292 static int svc_rdma_xdr_decode_req(struct xdr_buf *rq_arg) 293 { 294 __be32 *p, *end, *rdma_argp; 295 unsigned int hdr_len; 296 char *proc; 297 298 /* Verify that there's enough bytes for header + something */ 299 if (rq_arg->len <= RPCRDMA_HDRLEN_ERR) 300 goto out_short; 301 302 rdma_argp = rq_arg->head[0].iov_base; 303 if (*(rdma_argp + 1) != rpcrdma_version) 304 goto out_version; 305 306 switch (*(rdma_argp + 3)) { 307 case rdma_msg: 308 proc = "RDMA_MSG"; 309 break; 310 case rdma_nomsg: 311 proc = "RDMA_NOMSG"; 312 break; 313 314 case rdma_done: 315 goto out_drop; 316 317 case rdma_error: 318 goto out_drop; 319 320 default: 321 goto out_proc; 322 } 323 324 end = (__be32 *)((unsigned long)rdma_argp + rq_arg->len); 325 p = xdr_check_read_list(rdma_argp + 4, end); 326 if (!p) 327 goto out_inval; 328 p = xdr_check_write_list(p, end); 329 if (!p) 330 goto out_inval; 331 p = xdr_check_reply_chunk(p, end); 332 if (!p) 333 goto out_inval; 334 if (p > end) 335 goto out_inval; 336 337 rq_arg->head[0].iov_base = p; 338 hdr_len = (unsigned long)p - (unsigned long)rdma_argp; 339 rq_arg->head[0].iov_len -= hdr_len; 340 rq_arg->len -= hdr_len; 341 dprintk("svcrdma: received %s request for XID 0x%08x, hdr_len=%u\n", 342 proc, be32_to_cpup(rdma_argp), hdr_len); 343 return hdr_len; 344 345 out_short: 346 dprintk("svcrdma: header too short = %d\n", rq_arg->len); 347 return -EINVAL; 348 349 out_version: 350 dprintk("svcrdma: bad xprt version: %u\n", 351 be32_to_cpup(rdma_argp + 1)); 352 return -EPROTONOSUPPORT; 353 354 out_drop: 355 dprintk("svcrdma: dropping RDMA_DONE/ERROR message\n"); 356 return 0; 357 358 out_proc: 359 dprintk("svcrdma: bad rdma procedure (%u)\n", 360 be32_to_cpup(rdma_argp + 3)); 361 return -EINVAL; 362 363 out_inval: 364 dprintk("svcrdma: failed to parse transport header\n"); 365 return -EINVAL; 366 } 367 368 static void rdma_read_complete(struct svc_rqst *rqstp, 369 struct svc_rdma_op_ctxt *head) 370 { 371 int page_no; 372 373 /* Copy RPC pages 
*/ 374 for (page_no = 0; page_no < head->count; page_no++) { 375 put_page(rqstp->rq_pages[page_no]); 376 rqstp->rq_pages[page_no] = head->pages[page_no]; 377 } 378 379 /* Point rq_arg.pages past header */ 380 rqstp->rq_arg.pages = &rqstp->rq_pages[head->hdr_count]; 381 rqstp->rq_arg.page_len = head->arg.page_len; 382 383 /* rq_respages starts after the last arg page */ 384 rqstp->rq_respages = &rqstp->rq_pages[page_no]; 385 rqstp->rq_next_page = rqstp->rq_respages + 1; 386 387 /* Rebuild rq_arg head and tail. */ 388 rqstp->rq_arg.head[0] = head->arg.head[0]; 389 rqstp->rq_arg.tail[0] = head->arg.tail[0]; 390 rqstp->rq_arg.len = head->arg.len; 391 rqstp->rq_arg.buflen = head->arg.buflen; 392 } 393 394 static void svc_rdma_send_error(struct svcxprt_rdma *xprt, 395 __be32 *rdma_argp, int status) 396 { 397 struct svc_rdma_op_ctxt *ctxt; 398 __be32 *p, *err_msgp; 399 unsigned int length; 400 struct page *page; 401 int ret; 402 403 ret = svc_rdma_repost_recv(xprt, GFP_KERNEL); 404 if (ret) 405 return; 406 407 page = alloc_page(GFP_KERNEL); 408 if (!page) 409 return; 410 err_msgp = page_address(page); 411 412 p = err_msgp; 413 *p++ = *rdma_argp; 414 *p++ = *(rdma_argp + 1); 415 *p++ = xprt->sc_fc_credits; 416 *p++ = rdma_error; 417 if (status == -EPROTONOSUPPORT) { 418 *p++ = err_vers; 419 *p++ = rpcrdma_version; 420 *p++ = rpcrdma_version; 421 } else { 422 *p++ = err_chunk; 423 } 424 length = (unsigned long)p - (unsigned long)err_msgp; 425 426 /* Map transport header; no RPC message payload */ 427 ctxt = svc_rdma_get_context(xprt); 428 ret = svc_rdma_map_reply_hdr(xprt, ctxt, err_msgp, length); 429 if (ret) { 430 dprintk("svcrdma: Error %d mapping send for protocol error\n", 431 ret); 432 return; 433 } 434 435 ret = svc_rdma_post_send_wr(xprt, ctxt, 1, 0); 436 if (ret) { 437 dprintk("svcrdma: Error %d posting send for protocol error\n", 438 ret); 439 svc_rdma_unmap_dma(ctxt); 440 svc_rdma_put_context(ctxt, 1); 441 } 442 } 443 444 /* By convention, backchannel calls 
arrive via rdma_msg type 445 * messages, and never populate the chunk lists. This makes 446 * the RPC/RDMA header small and fixed in size, so it is 447 * straightforward to check the RPC header's direction field. 448 */ 449 static bool svc_rdma_is_backchannel_reply(struct svc_xprt *xprt, 450 __be32 *rdma_resp) 451 { 452 __be32 *p; 453 454 if (!xprt->xpt_bc_xprt) 455 return false; 456 457 p = rdma_resp + 3; 458 if (*p++ != rdma_msg) 459 return false; 460 461 if (*p++ != xdr_zero) 462 return false; 463 if (*p++ != xdr_zero) 464 return false; 465 if (*p++ != xdr_zero) 466 return false; 467 468 /* XID sanity */ 469 if (*p++ != *rdma_resp) 470 return false; 471 /* call direction */ 472 if (*p == cpu_to_be32(RPC_CALL)) 473 return false; 474 475 return true; 476 } 477 478 /** 479 * svc_rdma_recvfrom - Receive an RPC call 480 * @rqstp: request structure into which to receive an RPC Call 481 * 482 * Returns: 483 * The positive number of bytes in the RPC Call message, 484 * %0 if there were no Calls ready to return, 485 * %-EINVAL if the Read chunk data is too large, 486 * %-ENOMEM if rdma_rw context pool was exhausted, 487 * %-ENOTCONN if posting failed (connection is lost), 488 * %-EIO if rdma_rw initialization failed (DMA mapping, etc). 489 * 490 * Called in a loop when XPT_DATA is set. XPT_DATA is cleared only 491 * when there are no remaining ctxt's to process. 492 * 493 * The next ctxt is removed from the "receive" lists. 494 * 495 * - If the ctxt completes a Read, then finish assembling the Call 496 * message and return the number of bytes in the message. 497 * 498 * - If the ctxt completes a Receive, then construct the Call 499 * message from the contents of the Receive buffer. 500 * 501 * - If there are no Read chunks in this message, then finish 502 * assembling the Call message and return the number of bytes 503 * in the message. 504 * 505 * - If there are Read chunks in this message, post Read WRs to 506 * pull that payload and return 0. 
507 */ 508 int svc_rdma_recvfrom(struct svc_rqst *rqstp) 509 { 510 struct svc_xprt *xprt = rqstp->rq_xprt; 511 struct svcxprt_rdma *rdma_xprt = 512 container_of(xprt, struct svcxprt_rdma, sc_xprt); 513 struct svc_rdma_op_ctxt *ctxt; 514 __be32 *p; 515 int ret; 516 517 spin_lock(&rdma_xprt->sc_rq_dto_lock); 518 if (!list_empty(&rdma_xprt->sc_read_complete_q)) { 519 ctxt = list_first_entry(&rdma_xprt->sc_read_complete_q, 520 struct svc_rdma_op_ctxt, list); 521 list_del(&ctxt->list); 522 spin_unlock(&rdma_xprt->sc_rq_dto_lock); 523 rdma_read_complete(rqstp, ctxt); 524 goto complete; 525 } else if (!list_empty(&rdma_xprt->sc_rq_dto_q)) { 526 ctxt = list_first_entry(&rdma_xprt->sc_rq_dto_q, 527 struct svc_rdma_op_ctxt, list); 528 list_del(&ctxt->list); 529 } else { 530 /* No new incoming requests, terminate the loop */ 531 clear_bit(XPT_DATA, &xprt->xpt_flags); 532 spin_unlock(&rdma_xprt->sc_rq_dto_lock); 533 return 0; 534 } 535 spin_unlock(&rdma_xprt->sc_rq_dto_lock); 536 537 dprintk("svcrdma: recvfrom: ctxt=%p on xprt=%p, rqstp=%p\n", 538 ctxt, rdma_xprt, rqstp); 539 atomic_inc(&rdma_stat_recv); 540 541 /* Build up the XDR from the receive buffers. */ 542 rdma_build_arg_xdr(rqstp, ctxt, ctxt->byte_len); 543 544 /* Decode the RDMA header. 
*/ 545 p = (__be32 *)rqstp->rq_arg.head[0].iov_base; 546 ret = svc_rdma_xdr_decode_req(&rqstp->rq_arg); 547 if (ret < 0) 548 goto out_err; 549 if (ret == 0) 550 goto out_drop; 551 rqstp->rq_xprt_hlen = ret; 552 553 if (svc_rdma_is_backchannel_reply(xprt, p)) { 554 ret = svc_rdma_handle_bc_reply(xprt->xpt_bc_xprt, p, 555 &rqstp->rq_arg); 556 svc_rdma_put_context(ctxt, 0); 557 if (ret) 558 goto repost; 559 return ret; 560 } 561 562 p += rpcrdma_fixed_maxsz; 563 if (*p != xdr_zero) 564 goto out_readchunk; 565 566 complete: 567 svc_rdma_put_context(ctxt, 0); 568 dprintk("svcrdma: recvfrom: xprt=%p, rqstp=%p, rq_arg.len=%u\n", 569 rdma_xprt, rqstp, rqstp->rq_arg.len); 570 rqstp->rq_prot = IPPROTO_MAX; 571 svc_xprt_copy_addrs(rqstp, xprt); 572 return rqstp->rq_arg.len; 573 574 out_readchunk: 575 ret = svc_rdma_recv_read_chunk(rdma_xprt, rqstp, ctxt, p); 576 if (ret < 0) 577 goto out_postfail; 578 return 0; 579 580 out_err: 581 svc_rdma_send_error(rdma_xprt, p, ret); 582 svc_rdma_put_context(ctxt, 0); 583 return 0; 584 585 out_postfail: 586 if (ret == -EINVAL) 587 svc_rdma_send_error(rdma_xprt, p, ret); 588 svc_rdma_put_context(ctxt, 1); 589 return ret; 590 591 out_drop: 592 svc_rdma_put_context(ctxt, 1); 593 repost: 594 return svc_rdma_repost_recv(rdma_xprt, GFP_KERNEL); 595 } 596