// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
/*
 * Copyright (c) 2016-2018 Oracle. All rights reserved.
 * Copyright (c) 2014 Open Grid Computing, Inc. All rights reserved.
 * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses. You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the BSD-type
 * license below:
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *      Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *
 *      Redistributions in binary form must reproduce the above
 *      copyright notice, this list of conditions and the following
 *      disclaimer in the documentation and/or other materials provided
 *      with the distribution.
 *
 *      Neither the name of the Network Appliance, Inc. nor the names of
 *      its contributors may be used to endorse or promote products
 *      derived from this software without specific prior written
 *      permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * Author: Tom Tucker <tom@opengridcomputing.com>
 */

/* Operation
 *
 * The main entry point is svc_rdma_sendto. This is called by the
 * RPC server when an RPC Reply is ready to be transmitted to a client.
 *
 * The passed-in svc_rqst contains a struct xdr_buf which holds an
 * XDR-encoded RPC Reply message. sendto must construct the RPC-over-RDMA
 * transport header, post all Write WRs needed for this Reply, then post
 * a Send WR conveying the transport header and the RPC message itself to
 * the client.
 *
 * svc_rdma_sendto must fully transmit the Reply before returning, as
 * the svc_rqst will be recycled as soon as sendto returns. Remaining
 * resources referred to by the svc_rqst are also recycled at that time.
 * Therefore any resources that must remain longer must be detached
 * from the svc_rqst and released later.
 *
 * Page Management
 *
 * The I/O that performs Reply transmission is asynchronous, and may
 * complete well after sendto returns. Thus pages under I/O must be
 * removed from the svc_rqst before sendto returns.
 *
 * The logic here depends on Send Queue and completion ordering. Since
 * the Send WR is always posted last, it will always complete last. Thus
 * when it completes, it is guaranteed that all previous Write WRs have
 * also completed.
 *
 * Write WRs are constructed and posted. Each Write segment gets its own
 * svc_rdma_rw_ctxt, allowing the Write completion handler to find and
 * DMA-unmap the pages under I/O for that Write segment. The Write
 * completion handler does not release any pages.
 *
 * When the Send WR is constructed, it also gets its own svc_rdma_op_ctxt.
 * The ownership of all of the Reply's pages are transferred into that
 * ctxt, the Send WR is posted, and sendto returns.
 *
 * The svc_rdma_op_ctxt is presented when the Send WR completes.
 * The Send completion handler finally releases the Reply's pages.
 *
 * This mechanism also assumes that completions on the transport's Send
 * Completion Queue do not run in parallel. Otherwise a Write completion
 * and Send completion running at the same time could release pages that
 * are still DMA-mapped.
 *
 * Error Handling
 *
 * - If the Send WR is posted successfully, it will either complete
 *   successfully, or get flushed. Either way, the Send completion
 *   handler releases the Reply's pages.
 * - If the Send WR cannot be posted, the forward path releases
 *   the Reply's pages.
 *
 * This handles the case, without the use of page reference counting,
 * where two different Write segments send portions of the same page.
 */

#include <linux/spinlock.h>
#include <asm/unaligned.h>

#include <rdma/ib_verbs.h>
#include <rdma/rdma_cm.h>

#include <linux/sunrpc/debug.h>
#include <linux/sunrpc/rpc_rdma.h>
#include <linux/sunrpc/svc_rdma.h>

#include "xprt_rdma.h"
#include <trace/events/rpcrdma.h>

#define RPCDBG_FACILITY	RPCDBG_SVCXPRT

/* Number of pad bytes needed to round @len up to the next XDR
 * four-byte boundary; zero when @len is already aligned.
 */
static u32 xdr_padsize(u32 len)
{
	return (len & 3) ? (4 - (len & 3)) : 0;
}

/* Returns length of transport header, in bytes.
 *
 * Walks the already-encoded Reply transport header at @rdma_resp:
 * skips the fixed header fields, the (empty) Read list, each Write
 * chunk, and the Reply chunk, and returns the byte offset of the
 * first position past them.
 */
static unsigned int svc_rdma_reply_hdr_len(__be32 *rdma_resp)
{
	unsigned int nsegs;
	__be32 *p;

	p = rdma_resp;

	/* RPC-over-RDMA V1 replies never have a Read list. */
	p += rpcrdma_fixed_maxsz + 1;

	/* Skip Write list. */
	while (*p++ != xdr_zero) {
		nsegs = be32_to_cpup(p++);
		p += nsegs * rpcrdma_segment_maxsz;
	}

	/* Skip Reply chunk. */
	if (*p++ != xdr_zero) {
		nsegs = be32_to_cpup(p++);
		p += nsegs * rpcrdma_segment_maxsz;
	}

	return (unsigned long)p - (unsigned long)rdma_resp;
}

/* One Write chunk is copied from Call transport header to Reply
 * transport header. Each segment's length field is updated to
 * reflect number of bytes consumed in the segment.
 *
 * @dst: destination position in the Reply header
 * @src: source chunk in the Call header
 * @remaining: total bytes actually written into this chunk
 *
 * Returns number of segments in this chunk.
 */
static unsigned int xdr_encode_write_chunk(__be32 *dst, __be32 *src,
					   unsigned int remaining)
{
	unsigned int i, nsegs;
	u32 seg_len;

	/* Write list discriminator */
	*dst++ = *src++;

	/* number of segments in this chunk */
	nsegs = be32_to_cpup(src);
	*dst++ = *src++;

	for (i = nsegs; i; i--) {
		/* segment's RDMA handle */
		*dst++ = *src++;

		/* bytes returned in this segment */
		seg_len = be32_to_cpu(*src);
		if (remaining >= seg_len) {
			/* entire segment was consumed */
			*dst = *src;
			remaining -= seg_len;
		} else {
			/* segment only partly filled */
			*dst = cpu_to_be32(remaining);
			remaining = 0;
		}
		dst++; src++;

		/* segment's RDMA offset (64 bits, two XDR words) */
		*dst++ = *src++;
		*dst++ = *src++;
	}

	return nsegs;
}

/* The client provided a Write list in the Call message. Fill in
 * the segments in the first Write chunk in the Reply's transport
 * header with the number of bytes consumed in each segment.
 * Remaining chunks are returned unused.
 *
 * Assumptions:
 *	- Client has provided only one Write chunk
 */
static void svc_rdma_xdr_encode_write_list(__be32 *rdma_resp, __be32 *wr_ch,
					   unsigned int consumed)
{
	unsigned int nsegs;
	__be32 *p, *q;

	/* RPC-over-RDMA V1 replies never have a Read list. */
	p = rdma_resp + rpcrdma_fixed_maxsz + 1;

	q = wr_ch;
	while (*q != xdr_zero) {
		nsegs = xdr_encode_write_chunk(p, q, consumed);
		q += 2 + nsegs * rpcrdma_segment_maxsz;
		p += 2 + nsegs * rpcrdma_segment_maxsz;
		/* only the first chunk carries data (see assumption above) */
		consumed = 0;
	}

	/* Terminate Write list */
	*p++ = xdr_zero;

	/* Reply chunk discriminator; may be replaced later */
	*p = xdr_zero;
}

/* The client provided a Reply chunk in the Call message. Fill in
 * the segments in the Reply chunk in the Reply message with the
 * number of bytes consumed in each segment.
 *
 * Assumptions:
 * - Reply can always fit in the provided Reply chunk
 */
static void svc_rdma_xdr_encode_reply_chunk(__be32 *rdma_resp, __be32 *rp_ch,
					    unsigned int consumed)
{
	__be32 *p;

	/* Find the Reply chunk in the Reply's xprt header.
	 * RPC-over-RDMA V1 replies never have a Read list.
	 */
	p = rdma_resp + rpcrdma_fixed_maxsz + 1;

	/* Skip past Write list */
	while (*p++ != xdr_zero)
		p += 1 + be32_to_cpup(p) * rpcrdma_segment_maxsz;

	xdr_encode_write_chunk(p, rp_ch, consumed);
}

/* Parse the RPC Call's transport header.
 *
 * On return, *@write and *@reply point into @rdma_argp at the
 * Call's Write list and Reply chunk, respectively, or are set to
 * NULL when the corresponding item is absent.
 */
static void svc_rdma_get_write_arrays(__be32 *rdma_argp,
				      __be32 **write, __be32 **reply)
{
	__be32 *p;

	p = rdma_argp + rpcrdma_fixed_maxsz;

	/* Read list: each entry is a position field plus one segment
	 * (handle + length + 64-bit offset) = 5 XDR words.
	 */
	while (*p++ != xdr_zero)
		p += 5;

	/* Write list */
	if (*p != xdr_zero) {
		*write = p;
		while (*p++ != xdr_zero)
			p += 1 + be32_to_cpu(*p) * 4;
	} else {
		*write = NULL;
		p++;
	}

	/* Reply chunk */
	if (*p != xdr_zero)
		*reply = p;
	else
		*reply = NULL;
}

/* RPC-over-RDMA Version One private extension: Remote Invalidation.
 * Responder's choice: requester signals it can handle Send With
 * Invalidate, and responder chooses one rkey to invalidate.
 *
 * Find a candidate rkey to invalidate when sending a reply. Picks the
 * first R_key it finds in the chunk lists.
 *
 * Returns zero if RPC's chunk lists are empty.
 */
static u32 svc_rdma_get_inv_rkey(__be32 *rdma_argp,
				 __be32 *wr_lst, __be32 *rp_ch)
{
	__be32 *p;

	p = rdma_argp + rpcrdma_fixed_maxsz;
	if (*p != xdr_zero)
		p += 2;		/* first Read segment's handle */
	else if (wr_lst && be32_to_cpup(wr_lst + 1))
		p = wr_lst + 2;	/* first Write segment's handle */
	else if (rp_ch && be32_to_cpup(rp_ch + 1))
		p = rp_ch + 2;	/* first Reply segment's handle */
	else
		return 0;
	return be32_to_cpup(p);
}

/* ib_dma_map_page() is used here because svc_rdma_dma_unmap()
 * is used during completion to DMA-unmap this memory, and
 * it uses ib_dma_unmap_page() exclusively.
 */
static int svc_rdma_dma_map_buf(struct svcxprt_rdma *rdma,
				struct svc_rdma_op_ctxt *ctxt,
				unsigned int sge_no,
				unsigned char *base,
				unsigned int len)
{
	unsigned long offset = (unsigned long)base & ~PAGE_MASK;
	struct ib_device *dev = rdma->sc_cm_id->device;
	dma_addr_t dma_addr;

	dma_addr = ib_dma_map_page(dev, virt_to_page(base),
				   offset, len, DMA_TO_DEVICE);
	if (ib_dma_mapping_error(dev, dma_addr))
		goto out_maperr;

	ctxt->sge[sge_no].addr = dma_addr;
	ctxt->sge[sge_no].length = len;
	ctxt->sge[sge_no].lkey = rdma->sc_pd->local_dma_lkey;
	svc_rdma_count_mappings(rdma, ctxt);
	return 0;

out_maperr:
	pr_err("svcrdma: failed to map buffer\n");
	return -EIO;
}

/* DMA map @len bytes of @page starting at @offset into @ctxt's
 * sge[@sge_no] for a Send. The mapping is undone later by
 * svc_rdma_unmap_dma() (called on error or at Send completion).
 */
static int svc_rdma_dma_map_page(struct svcxprt_rdma *rdma,
				 struct svc_rdma_op_ctxt *ctxt,
				 unsigned int sge_no,
				 struct page *page,
				 unsigned int offset,
				 unsigned int len)
{
	struct ib_device *dev = rdma->sc_cm_id->device;
	dma_addr_t dma_addr;

	dma_addr = ib_dma_map_page(dev, page, offset, len, DMA_TO_DEVICE);
	if (ib_dma_mapping_error(dev, dma_addr))
		goto out_maperr;

	ctxt->sge[sge_no].addr = dma_addr;
	ctxt->sge[sge_no].length = len;
	ctxt->sge[sge_no].lkey = rdma->sc_pd->local_dma_lkey;
	svc_rdma_count_mappings(rdma, ctxt);
	return 0;

out_maperr:
	trace_svcrdma_dma_map_page(rdma, page);
	return -EIO;
}

/**
 * svc_rdma_map_reply_hdr - DMA map the transport header buffer
 * @rdma: controlling transport
 * @ctxt: op_ctxt for the Send WR
 * @rdma_resp: buffer containing transport header
 * @len: length of transport header
 *
 * The header page becomes ctxt->pages[0]; ownership of it passes
 * to @ctxt, so it is released at Send completion.
 *
 * Returns:
 *	%0 if the header is DMA mapped,
 *	%-EIO if DMA mapping failed.
 */
int svc_rdma_map_reply_hdr(struct svcxprt_rdma *rdma,
			   struct svc_rdma_op_ctxt *ctxt,
			   __be32 *rdma_resp,
			   unsigned int len)
{
	ctxt->direction = DMA_TO_DEVICE;
	ctxt->pages[0] = virt_to_page(rdma_resp);
	ctxt->count = 1;
	return svc_rdma_dma_map_page(rdma, ctxt, 0, ctxt->pages[0], 0, len);
}

/* Load the xdr_buf into the ctxt's sge array, and DMA map each
 * element as it is added. sge[0] is reserved for the transport
 * header (see svc_rdma_map_reply_hdr); the xdr_buf starts at sge[1].
 *
 * Returns the number of sge elements loaded on success, or
 * a negative errno on failure.
 */
static int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma,
				  struct svc_rdma_op_ctxt *ctxt,
				  struct xdr_buf *xdr, __be32 *wr_lst)
{
	unsigned int len, sge_no, remaining, page_off;
	struct page **ppages;
	unsigned char *base;
	u32 xdr_pad;
	int ret;

	sge_no = 1;

	ret = svc_rdma_dma_map_buf(rdma, ctxt, sge_no++,
				   xdr->head[0].iov_base,
				   xdr->head[0].iov_len);
	if (ret < 0)
		return ret;

	/* If a Write chunk is present, the xdr_buf's page list
	 * is not included inline. However the Upper Layer may
	 * have added XDR padding in the tail buffer, and that
	 * should not be included inline.
	 */
	if (wr_lst) {
		base = xdr->tail[0].iov_base;
		len = xdr->tail[0].iov_len;
		xdr_pad = xdr_padsize(xdr->page_len);

		if (len && xdr_pad) {
			base += xdr_pad;
			len -= xdr_pad;
		}

		goto tail;
	}

	ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT);
	page_off = xdr->page_base & ~PAGE_MASK;
	remaining = xdr->page_len;
	while (remaining) {
		len = min_t(u32, PAGE_SIZE - page_off, remaining);

		ret = svc_rdma_dma_map_page(rdma, ctxt, sge_no++,
					    *ppages++, page_off, len);
		if (ret < 0)
			return ret;

		remaining -= len;
		page_off = 0;
	}

	base = xdr->tail[0].iov_base;
	len = xdr->tail[0].iov_len;
tail:
	if (len) {
		ret = svc_rdma_dma_map_buf(rdma, ctxt, sge_no++, base, len);
		if (ret < 0)
			return ret;
	}

	/* sge_no was post-incremented past the last sge used */
	return sge_no - 1;
}

/* The svc_rqst and all resources it owns are released as soon as
 * svc_rdma_sendto returns. Transfer pages under I/O to the ctxt
 * so they are released by the Send completion handler.
 */
static void svc_rdma_save_io_pages(struct svc_rqst *rqstp,
				   struct svc_rdma_op_ctxt *ctxt)
{
	int i, pages = rqstp->rq_next_page - rqstp->rq_respages;

	/* ctxt->pages[0] already holds the transport header page */
	ctxt->count += pages;
	for (i = 0; i < pages; i++) {
		ctxt->pages[i + 1] = rqstp->rq_respages[i];
		rqstp->rq_respages[i] = NULL;
	}
	rqstp->rq_next_page = rqstp->rq_respages + 1;
}

/**
 * svc_rdma_post_send_wr - Set up and post one Send Work Request
 * @rdma: controlling transport
 * @ctxt: op_ctxt for transmitting the Send WR
 * @num_sge: number of SGEs to send
 * @inv_rkey: R_key argument to Send With Invalidate, or zero
 *
 * Returns:
 *	%0 if the Send* was posted successfully,
 *	%-ENOTCONN if the connection was lost or dropped,
 *	%-EINVAL if there was a problem with the Send we built,
 *	%-ENOMEM if ib_post_send failed.
 */
int svc_rdma_post_send_wr(struct svcxprt_rdma *rdma,
			  struct svc_rdma_op_ctxt *ctxt, int num_sge,
			  u32 inv_rkey)
{
	struct ib_send_wr *send_wr = &ctxt->send_wr;

	dprintk("svcrdma: posting Send WR with %u sge(s)\n", num_sge);

	send_wr->next = NULL;
	ctxt->cqe.done = svc_rdma_wc_send;
	send_wr->wr_cqe = &ctxt->cqe;
	send_wr->sg_list = ctxt->sge;
	send_wr->num_sge = num_sge;
	send_wr->send_flags = IB_SEND_SIGNALED;
	if (inv_rkey) {
		send_wr->opcode = IB_WR_SEND_WITH_INV;
		send_wr->ex.invalidate_rkey = inv_rkey;
	} else {
		send_wr->opcode = IB_WR_SEND;
	}

	return svc_rdma_send(rdma, send_wr);
}

/* Prepare the portion of the RPC Reply that will be transmitted
 * via RDMA Send. The RPC-over-RDMA transport header is prepared
 * in sge[0], and the RPC xdr_buf is prepared in following sges.
 *
 * Depending on whether a Write list or Reply chunk is present,
 * the server may send all, a portion of, or none of the xdr_buf.
 * In the latter case, only the transport header (sge[0]) is
 * transmitted.
 *
 * RDMA Send is the last step of transmitting an RPC reply. Pages
 * involved in the earlier RDMA Writes are here transferred out
 * of the rqstp and into the ctxt's page array. These pages are
 * DMA unmapped by each Write completion, but the subsequent Send
 * completion finally releases these pages.
 *
 * Assumptions:
 * - The Reply's transport header will never be larger than a page.
 */
static int svc_rdma_send_reply_msg(struct svcxprt_rdma *rdma,
				   __be32 *rdma_argp, __be32 *rdma_resp,
				   struct svc_rqst *rqstp,
				   __be32 *wr_lst, __be32 *rp_ch)
{
	struct svc_rdma_op_ctxt *ctxt;
	u32 inv_rkey;
	int ret;

	ctxt = svc_rdma_get_context(rdma);

	ret = svc_rdma_map_reply_hdr(rdma, ctxt, rdma_resp,
				     svc_rdma_reply_hdr_len(rdma_resp));
	if (ret < 0)
		goto err;

	/* With a Reply chunk, the whole RPC message went via RDMA
	 * Write; only the transport header is sent inline.
	 */
	if (!rp_ch) {
		ret = svc_rdma_map_reply_msg(rdma, ctxt,
					     &rqstp->rq_res, wr_lst);
		if (ret < 0)
			goto err;
	}

	svc_rdma_save_io_pages(rqstp, ctxt);

	inv_rkey = 0;
	if (rdma->sc_snd_w_inv)
		inv_rkey = svc_rdma_get_inv_rkey(rdma_argp, wr_lst, rp_ch);
	/* 1 + ret: transport header sge plus the xdr_buf sges */
	ret = svc_rdma_post_send_wr(rdma, ctxt, 1 + ret, inv_rkey);
	if (ret)
		goto err;

	return 0;

err:
	/* put_context(ctxt, 1) also releases pages owned by @ctxt,
	 * including the transport header page.
	 */
	svc_rdma_unmap_dma(ctxt);
	svc_rdma_put_context(ctxt, 1);
	return ret;
}

/* Given the client-provided Write and Reply chunks, the server was not
 * able to form a complete reply. Return an RDMA_ERROR message so the
 * client can retire this RPC transaction. As above, the Send completion
 * routine releases payload pages that were part of a previous RDMA Write.
 *
 * Remote Invalidation is skipped for simplicity.
 */
static int svc_rdma_send_error_msg(struct svcxprt_rdma *rdma,
				   __be32 *rdma_resp, struct svc_rqst *rqstp)
{
	struct svc_rdma_op_ctxt *ctxt;
	__be32 *p;
	int ret;

	ctxt = svc_rdma_get_context(rdma);

	/* Replace the original transport header with an
	 * RDMA_ERROR response. XID etc are preserved.
	 */
	trace_svcrdma_err_chunk(*rdma_resp);
	p = rdma_resp + 3;
	*p++ = rdma_error;
	*p = err_chunk;

	/* 20 == 4 fixed header words plus the error code word */
	ret = svc_rdma_map_reply_hdr(rdma, ctxt, rdma_resp, 20);
	if (ret < 0)
		goto err;

	svc_rdma_save_io_pages(rqstp, ctxt);

	ret = svc_rdma_post_send_wr(rdma, ctxt, 1 + ret, 0);
	if (ret)
		goto err;

	return 0;

err:
	svc_rdma_unmap_dma(ctxt);
	svc_rdma_put_context(ctxt, 1);
	return ret;
}

/* No-op: svcrdma constructs the transport header during
 * svc_rdma_sendto() rather than in this callback.
 */
void svc_rdma_prep_reply_hdr(struct svc_rqst *rqstp)
{
}

/**
 * svc_rdma_sendto - Transmit an RPC reply
 * @rqstp: processed RPC request, reply XDR already in ::rq_res
 *
 * Any resources still associated with @rqstp are released upon return.
 * If no reply message was possible, the connection is closed.
 *
 * Returns:
 *	%0 if an RPC reply has been successfully posted,
 *	%-ENOMEM if a resource shortage occurred (connection is lost),
 *	%-ENOTCONN if posting failed (connection is lost).
 */
int svc_rdma_sendto(struct svc_rqst *rqstp)
{
	struct svc_xprt *xprt = rqstp->rq_xprt;
	struct svcxprt_rdma *rdma =
		container_of(xprt, struct svcxprt_rdma, sc_xprt);
	struct svc_rdma_recv_ctxt *rctxt = rqstp->rq_xprt_ctxt;
	__be32 *p, *rdma_argp, *rdma_resp, *wr_lst, *rp_ch;
	struct xdr_buf *xdr = &rqstp->rq_res;
	struct page *res_page;
	int ret;

	/* Find the call's chunk lists to decide how to send the reply.
	 * Receive places the Call's xprt header at the start of page 0.
	 */
	rdma_argp = page_address(rqstp->rq_pages[0]);
	svc_rdma_get_write_arrays(rdma_argp, &wr_lst, &rp_ch);

	/* Create the RDMA response header. xprt->xpt_mutex,
	 * acquired in svc_send(), serializes RPC replies. The
	 * code path below that inserts the credit grant value
	 * into each transport header runs only inside this
	 * critical section.
	 */
	ret = -ENOMEM;
	res_page = alloc_page(GFP_KERNEL);
	if (!res_page)
		goto err0;
	rdma_resp = page_address(res_page);

	p = rdma_resp;
	*p++ = *rdma_argp;		/* XID */
	*p++ = *(rdma_argp + 1);	/* vers */
	*p++ = rdma->sc_fc_credits;
	*p++ = rp_ch ? rdma_nomsg : rdma_msg;

	/* Start with empty chunks */
	*p++ = xdr_zero;
	*p++ = xdr_zero;
	*p = xdr_zero;

	if (wr_lst) {
		/* XXX: Presume the client sent only one Write chunk */
		ret = svc_rdma_send_write_chunk(rdma, wr_lst, xdr);
		if (ret < 0)
			goto err2;
		svc_rdma_xdr_encode_write_list(rdma_resp, wr_lst, ret);
	}
	if (rp_ch) {
		ret = svc_rdma_send_reply_chunk(rdma, rp_ch, wr_lst, xdr);
		if (ret < 0)
			goto err2;
		svc_rdma_xdr_encode_reply_chunk(rdma_resp, rp_ch, ret);
	}

	/* On failure, send_reply_msg has already released res_page
	 * (it was owned by the op_ctxt), so fall through to err0.
	 */
	ret = svc_rdma_send_reply_msg(rdma, rdma_argp, rdma_resp, rqstp,
				      wr_lst, rp_ch);
	if (ret < 0)
		goto err0;
	ret = 0;

out:
	rqstp->rq_xprt_ctxt = NULL;
	svc_rdma_recv_ctxt_put(rdma, rctxt);
	return ret;

err2:
	/* Only chunk-encoding problems are reported to the client;
	 * other errors close the connection.
	 */
	if (ret != -E2BIG && ret != -EINVAL)
		goto err1;

	ret = svc_rdma_send_error_msg(rdma, rdma_resp, rqstp);
	if (ret < 0)
		goto err0;
	ret = 0;
	goto out;

err1:
	put_page(res_page);
err0:
	trace_svcrdma_send_failed(rqstp, ret);
	set_bit(XPT_CLOSE, &xprt->xpt_flags);
	ret = -ENOTCONN;
	goto out;
}