xref: /openbmc/linux/net/sunrpc/xprtrdma/rpc_rdma.c (revision 8f8d5745bb520c76b81abef4a2cb3023d0313bfd)
1 // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
2 /*
3  * Copyright (c) 2014-2017 Oracle.  All rights reserved.
4  * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
5  *
6  * This software is available to you under a choice of one of two
7  * licenses.  You may choose to be licensed under the terms of the GNU
8  * General Public License (GPL) Version 2, available from the file
9  * COPYING in the main directory of this source tree, or the BSD-type
10  * license below:
11  *
12  * Redistribution and use in source and binary forms, with or without
13  * modification, are permitted provided that the following conditions
14  * are met:
15  *
16  *      Redistributions of source code must retain the above copyright
17  *      notice, this list of conditions and the following disclaimer.
18  *
19  *      Redistributions in binary form must reproduce the above
20  *      copyright notice, this list of conditions and the following
21  *      disclaimer in the documentation and/or other materials provided
22  *      with the distribution.
23  *
24  *      Neither the name of the Network Appliance, Inc. nor the names of
25  *      its contributors may be used to endorse or promote products
26  *      derived from this software without specific prior written
27  *      permission.
28  *
29  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
30  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
31  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
32  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
33  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
34  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
35  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
36  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
37  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
38  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
39  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
40  */
41 
42 /*
43  * rpc_rdma.c
44  *
45  * This file contains the guts of the RPC RDMA protocol, and
46  * does marshaling/unmarshaling, etc. It is also where interfacing
47  * to the Linux RPC framework lives.
48  */
49 
50 #include <linux/highmem.h>
51 
52 #include <linux/sunrpc/svc_rdma.h>
53 
54 #include "xprt_rdma.h"
55 #include <trace/events/rpcrdma.h>
56 
57 #if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
58 # define RPCDBG_FACILITY	RPCDBG_TRANS
59 #endif
60 
61 /* Returns size of largest RPC-over-RDMA header in a Call message
62  *
63  * The largest Call header contains a full-size Read list and a
64  * minimal Reply chunk.
65  */
66 static unsigned int rpcrdma_max_call_header_size(unsigned int maxsegs)
67 {
68 	unsigned int size;
69 
70 	/* Fixed header fields and list discriminators */
71 	size = RPCRDMA_HDRLEN_MIN;
72 
73 	/* Maximum Read list size */
74 	size = maxsegs * rpcrdma_readchunk_maxsz * sizeof(__be32);
75 
76 	/* Minimal Read chunk size */
77 	size += sizeof(__be32);	/* segment count */
78 	size += rpcrdma_segment_maxsz * sizeof(__be32);
79 	size += sizeof(__be32);	/* list discriminator */
80 
81 	dprintk("RPC:       %s: max call header size = %u\n",
82 		__func__, size);
83 	return size;
84 }
85 
86 /* Returns size of largest RPC-over-RDMA header in a Reply message
87  *
88  * There is only one Write list or one Reply chunk per Reply
89  * message.  The larger list is the Write list.
90  */
91 static unsigned int rpcrdma_max_reply_header_size(unsigned int maxsegs)
92 {
93 	unsigned int size;
94 
95 	/* Fixed header fields and list discriminators */
96 	size = RPCRDMA_HDRLEN_MIN;
97 
98 	/* Maximum Write list size */
99 	size = sizeof(__be32);		/* segment count */
100 	size += maxsegs * rpcrdma_segment_maxsz * sizeof(__be32);
101 	size += sizeof(__be32);	/* list discriminator */
102 
103 	dprintk("RPC:       %s: max reply header size = %u\n",
104 		__func__, size);
105 	return size;
106 }
107 
108 void rpcrdma_set_max_header_sizes(struct rpcrdma_xprt *r_xprt)
109 {
110 	struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
111 	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
112 	unsigned int maxsegs = ia->ri_max_segs;
113 
114 	ia->ri_max_inline_write = cdata->inline_wsize -
115 				  rpcrdma_max_call_header_size(maxsegs);
116 	ia->ri_max_inline_read = cdata->inline_rsize -
117 				 rpcrdma_max_reply_header_size(maxsegs);
118 }
119 
120 /* The client can send a request inline as long as the RPCRDMA header
121  * plus the RPC call fit under the transport's inline limit. If the
122  * combined call message size exceeds that limit, the client must use
123  * a Read chunk for this operation.
124  *
125  * A Read chunk is also required if sending the RPC call inline would
126  * exceed this device's max_sge limit.
127  */
128 static bool rpcrdma_args_inline(struct rpcrdma_xprt *r_xprt,
129 				struct rpc_rqst *rqst)
130 {
131 	struct xdr_buf *xdr = &rqst->rq_snd_buf;
132 	unsigned int count, remaining, offset;
133 
134 	if (xdr->len > r_xprt->rx_ia.ri_max_inline_write)
135 		return false;
136 
137 	if (xdr->page_len) {
138 		remaining = xdr->page_len;
139 		offset = offset_in_page(xdr->page_base);
140 		count = RPCRDMA_MIN_SEND_SGES;
141 		while (remaining) {
142 			remaining -= min_t(unsigned int,
143 					   PAGE_SIZE - offset, remaining);
144 			offset = 0;
145 			if (++count > r_xprt->rx_ia.ri_max_send_sges)
146 				return false;
147 		}
148 	}
149 
150 	return true;
151 }
152 
153 /* The client can't know how large the actual reply will be. Thus it
154  * plans for the largest possible reply for that particular ULP
155  * operation. If the maximum combined reply message size exceeds that
156  * limit, the client must provide a write list or a reply chunk for
157  * this request.
158  */
159 static bool rpcrdma_results_inline(struct rpcrdma_xprt *r_xprt,
160 				   struct rpc_rqst *rqst)
161 {
162 	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
163 
164 	return rqst->rq_rcv_buf.buflen <= ia->ri_max_inline_read;
165 }
166 
167 /* The client is required to provide a Reply chunk if the maximum
168  * size of the non-payload part of the RPC Reply is larger than
169  * the inline threshold.
170  */
171 static bool
172 rpcrdma_nonpayload_inline(const struct rpcrdma_xprt *r_xprt,
173 			  const struct rpc_rqst *rqst)
174 {
175 	const struct xdr_buf *buf = &rqst->rq_rcv_buf;
176 	const struct rpcrdma_ia *ia = &r_xprt->rx_ia;
177 
178 	return buf->head[0].iov_len + buf->tail[0].iov_len <
179 		ia->ri_max_inline_read;
180 }
181 
182 /* Split @vec on page boundaries into SGEs. FMR registers pages, not
183  * a byte range. Other modes coalesce these SGEs into a single MR
184  * when they can.
185  *
186  * Returns pointer to next available SGE, and bumps the total number
187  * of SGEs consumed.
188  */
189 static struct rpcrdma_mr_seg *
190 rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg,
191 		     unsigned int *n)
192 {
193 	u32 remaining, page_offset;
194 	char *base;
195 
196 	base = vec->iov_base;
197 	page_offset = offset_in_page(base);
198 	remaining = vec->iov_len;
199 	while (remaining) {
200 		seg->mr_page = NULL;
201 		seg->mr_offset = base;
202 		seg->mr_len = min_t(u32, PAGE_SIZE - page_offset, remaining);
203 		remaining -= seg->mr_len;
204 		base += seg->mr_len;
205 		++seg;
206 		++(*n);
207 		page_offset = 0;
208 	}
209 	return seg;
210 }
211 
212 /* Convert @xdrbuf into SGEs no larger than a page each. As they
213  * are registered, these SGEs are then coalesced into RDMA segments
214  * when the selected memreg mode supports it.
215  *
216  * Returns positive number of SGEs consumed, or a negative errno.
217  */
218 
219 static int
220 rpcrdma_convert_iovs(struct rpcrdma_xprt *r_xprt, struct xdr_buf *xdrbuf,
221 		     unsigned int pos, enum rpcrdma_chunktype type,
222 		     struct rpcrdma_mr_seg *seg)
223 {
224 	unsigned long page_base;
225 	unsigned int len, n;
226 	struct page **ppages;
227 
228 	n = 0;
229 	if (pos == 0)
230 		seg = rpcrdma_convert_kvec(&xdrbuf->head[0], seg, &n);
231 
232 	len = xdrbuf->page_len;
233 	ppages = xdrbuf->pages + (xdrbuf->page_base >> PAGE_SHIFT);
234 	page_base = offset_in_page(xdrbuf->page_base);
235 	while (len) {
236 		/* ACL likes to be lazy in allocating pages - ACLs
237 		 * are small by default but can get huge.
238 		 */
239 		if (unlikely(xdrbuf->flags & XDRBUF_SPARSE_PAGES)) {
240 			if (!*ppages)
241 				*ppages = alloc_page(GFP_ATOMIC);
242 			if (!*ppages)
243 				return -ENOBUFS;
244 		}
245 		seg->mr_page = *ppages;
246 		seg->mr_offset = (char *)page_base;
247 		seg->mr_len = min_t(u32, PAGE_SIZE - page_base, len);
248 		len -= seg->mr_len;
249 		++ppages;
250 		++seg;
251 		++n;
252 		page_base = 0;
253 	}
254 
255 	/* When encoding a Read chunk, the tail iovec contains an
256 	 * XDR pad and may be omitted.
257 	 */
258 	if (type == rpcrdma_readch && r_xprt->rx_ia.ri_implicit_roundup)
259 		goto out;
260 
261 	/* When encoding a Write chunk, some servers need to see an
262 	 * extra segment for non-XDR-aligned Write chunks. The upper
263 	 * layer provides space in the tail iovec that may be used
264 	 * for this purpose.
265 	 */
266 	if (type == rpcrdma_writech && r_xprt->rx_ia.ri_implicit_roundup)
267 		goto out;
268 
269 	if (xdrbuf->tail[0].iov_len)
270 		seg = rpcrdma_convert_kvec(&xdrbuf->tail[0], seg, &n);
271 
272 out:
273 	if (unlikely(n > RPCRDMA_MAX_SEGS))
274 		return -EIO;
275 	return n;
276 }
277 
278 static inline int
279 encode_item_present(struct xdr_stream *xdr)
280 {
281 	__be32 *p;
282 
283 	p = xdr_reserve_space(xdr, sizeof(*p));
284 	if (unlikely(!p))
285 		return -EMSGSIZE;
286 
287 	*p = xdr_one;
288 	return 0;
289 }
290 
291 static inline int
292 encode_item_not_present(struct xdr_stream *xdr)
293 {
294 	__be32 *p;
295 
296 	p = xdr_reserve_space(xdr, sizeof(*p));
297 	if (unlikely(!p))
298 		return -EMSGSIZE;
299 
300 	*p = xdr_zero;
301 	return 0;
302 }
303 
304 static void
305 xdr_encode_rdma_segment(__be32 *iptr, struct rpcrdma_mr *mr)
306 {
307 	*iptr++ = cpu_to_be32(mr->mr_handle);
308 	*iptr++ = cpu_to_be32(mr->mr_length);
309 	xdr_encode_hyper(iptr, mr->mr_offset);
310 }
311 
312 static int
313 encode_rdma_segment(struct xdr_stream *xdr, struct rpcrdma_mr *mr)
314 {
315 	__be32 *p;
316 
317 	p = xdr_reserve_space(xdr, 4 * sizeof(*p));
318 	if (unlikely(!p))
319 		return -EMSGSIZE;
320 
321 	xdr_encode_rdma_segment(p, mr);
322 	return 0;
323 }
324 
325 static int
326 encode_read_segment(struct xdr_stream *xdr, struct rpcrdma_mr *mr,
327 		    u32 position)
328 {
329 	__be32 *p;
330 
331 	p = xdr_reserve_space(xdr, 6 * sizeof(*p));
332 	if (unlikely(!p))
333 		return -EMSGSIZE;
334 
335 	*p++ = xdr_one;			/* Item present */
336 	*p++ = cpu_to_be32(position);
337 	xdr_encode_rdma_segment(p, mr);
338 	return 0;
339 }
340 
341 /* Register and XDR encode the Read list. Supports encoding a list of read
342  * segments that belong to a single read chunk.
343  *
344  * Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64):
345  *
346  *  Read chunklist (a linked list):
347  *   N elements, position P (same P for all chunks of same arg!):
348  *    1 - PHLOO - 1 - PHLOO - ... - 1 - PHLOO - 0
349  *
350  * Returns zero on success, or a negative errno if a failure occurred.
351  * @xdr is advanced to the next position in the stream.
352  *
353  * Only a single @pos value is currently supported.
354  */
355 static noinline int
356 rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
357 			 struct rpc_rqst *rqst, enum rpcrdma_chunktype rtype)
358 {
359 	struct xdr_stream *xdr = &req->rl_stream;
360 	struct rpcrdma_mr_seg *seg;
361 	struct rpcrdma_mr *mr;
362 	unsigned int pos;
363 	int nsegs;
364 
365 	pos = rqst->rq_snd_buf.head[0].iov_len;
366 	if (rtype == rpcrdma_areadch)
367 		pos = 0;
368 	seg = req->rl_segments;
369 	nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_snd_buf, pos,
370 				     rtype, seg);
371 	if (nsegs < 0)
372 		return nsegs;
373 
374 	do {
375 		seg = frwr_map(r_xprt, seg, nsegs, false, rqst->rq_xid, &mr);
376 		if (IS_ERR(seg))
377 			return PTR_ERR(seg);
378 		rpcrdma_mr_push(mr, &req->rl_registered);
379 
380 		if (encode_read_segment(xdr, mr, pos) < 0)
381 			return -EMSGSIZE;
382 
383 		trace_xprtrdma_chunk_read(rqst->rq_task, pos, mr, nsegs);
384 		r_xprt->rx_stats.read_chunk_count++;
385 		nsegs -= mr->mr_nents;
386 	} while (nsegs);
387 
388 	return 0;
389 }
390 
391 /* Register and XDR encode the Write list. Supports encoding a list
392  * containing one array of plain segments that belong to a single
393  * write chunk.
394  *
395  * Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64):
396  *
397  *  Write chunklist (a list of (one) counted array):
398  *   N elements:
399  *    1 - N - HLOO - HLOO - ... - HLOO - 0
400  *
401  * Returns zero on success, or a negative errno if a failure occurred.
402  * @xdr is advanced to the next position in the stream.
403  *
404  * Only a single Write chunk is currently supported.
405  */
406 static noinline int
407 rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
408 			  struct rpc_rqst *rqst, enum rpcrdma_chunktype wtype)
409 {
410 	struct xdr_stream *xdr = &req->rl_stream;
411 	struct rpcrdma_mr_seg *seg;
412 	struct rpcrdma_mr *mr;
413 	int nsegs, nchunks;
414 	__be32 *segcount;
415 
416 	seg = req->rl_segments;
417 	nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_rcv_buf,
418 				     rqst->rq_rcv_buf.head[0].iov_len,
419 				     wtype, seg);
420 	if (nsegs < 0)
421 		return nsegs;
422 
423 	if (encode_item_present(xdr) < 0)
424 		return -EMSGSIZE;
425 	segcount = xdr_reserve_space(xdr, sizeof(*segcount));
426 	if (unlikely(!segcount))
427 		return -EMSGSIZE;
428 	/* Actual value encoded below */
429 
430 	nchunks = 0;
431 	do {
432 		seg = frwr_map(r_xprt, seg, nsegs, true, rqst->rq_xid, &mr);
433 		if (IS_ERR(seg))
434 			return PTR_ERR(seg);
435 		rpcrdma_mr_push(mr, &req->rl_registered);
436 
437 		if (encode_rdma_segment(xdr, mr) < 0)
438 			return -EMSGSIZE;
439 
440 		trace_xprtrdma_chunk_write(rqst->rq_task, mr, nsegs);
441 		r_xprt->rx_stats.write_chunk_count++;
442 		r_xprt->rx_stats.total_rdma_request += mr->mr_length;
443 		nchunks++;
444 		nsegs -= mr->mr_nents;
445 	} while (nsegs);
446 
447 	/* Update count of segments in this Write chunk */
448 	*segcount = cpu_to_be32(nchunks);
449 
450 	return 0;
451 }
452 
453 /* Register and XDR encode the Reply chunk. Supports encoding an array
454  * of plain segments that belong to a single write (reply) chunk.
455  *
456  * Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64):
457  *
458  *  Reply chunk (a counted array):
459  *   N elements:
460  *    1 - N - HLOO - HLOO - ... - HLOO
461  *
462  * Returns zero on success, or a negative errno if a failure occurred.
463  * @xdr is advanced to the next position in the stream.
464  */
465 static noinline int
466 rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
467 			   struct rpc_rqst *rqst, enum rpcrdma_chunktype wtype)
468 {
469 	struct xdr_stream *xdr = &req->rl_stream;
470 	struct rpcrdma_mr_seg *seg;
471 	struct rpcrdma_mr *mr;
472 	int nsegs, nchunks;
473 	__be32 *segcount;
474 
475 	seg = req->rl_segments;
476 	nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_rcv_buf, 0, wtype, seg);
477 	if (nsegs < 0)
478 		return nsegs;
479 
480 	if (encode_item_present(xdr) < 0)
481 		return -EMSGSIZE;
482 	segcount = xdr_reserve_space(xdr, sizeof(*segcount));
483 	if (unlikely(!segcount))
484 		return -EMSGSIZE;
485 	/* Actual value encoded below */
486 
487 	nchunks = 0;
488 	do {
489 		seg = frwr_map(r_xprt, seg, nsegs, true, rqst->rq_xid, &mr);
490 		if (IS_ERR(seg))
491 			return PTR_ERR(seg);
492 		rpcrdma_mr_push(mr, &req->rl_registered);
493 
494 		if (encode_rdma_segment(xdr, mr) < 0)
495 			return -EMSGSIZE;
496 
497 		trace_xprtrdma_chunk_reply(rqst->rq_task, mr, nsegs);
498 		r_xprt->rx_stats.reply_chunk_count++;
499 		r_xprt->rx_stats.total_rdma_request += mr->mr_length;
500 		nchunks++;
501 		nsegs -= mr->mr_nents;
502 	} while (nsegs);
503 
504 	/* Update count of segments in the Reply chunk */
505 	*segcount = cpu_to_be32(nchunks);
506 
507 	return 0;
508 }
509 
510 /**
511  * rpcrdma_unmap_sendctx - DMA-unmap Send buffers
512  * @sc: sendctx containing SGEs to unmap
513  *
514  */
515 void
516 rpcrdma_unmap_sendctx(struct rpcrdma_sendctx *sc)
517 {
518 	struct rpcrdma_ia *ia = &sc->sc_xprt->rx_ia;
519 	struct ib_sge *sge;
520 	unsigned int count;
521 
522 	/* The first two SGEs contain the transport header and
523 	 * the inline buffer. These are always left mapped so
524 	 * they can be cheaply re-used.
525 	 */
526 	sge = &sc->sc_sges[2];
527 	for (count = sc->sc_unmap_count; count; ++sge, --count)
528 		ib_dma_unmap_page(ia->ri_device,
529 				  sge->addr, sge->length, DMA_TO_DEVICE);
530 
531 	if (test_and_clear_bit(RPCRDMA_REQ_F_TX_RESOURCES, &sc->sc_req->rl_flags)) {
532 		smp_mb__after_atomic();
533 		wake_up_bit(&sc->sc_req->rl_flags, RPCRDMA_REQ_F_TX_RESOURCES);
534 	}
535 }
536 
537 /* Prepare an SGE for the RPC-over-RDMA transport header.
538  */
539 static bool
540 rpcrdma_prepare_hdr_sge(struct rpcrdma_ia *ia, struct rpcrdma_req *req,
541 			u32 len)
542 {
543 	struct rpcrdma_sendctx *sc = req->rl_sendctx;
544 	struct rpcrdma_regbuf *rb = req->rl_rdmabuf;
545 	struct ib_sge *sge = sc->sc_sges;
546 
547 	if (!rpcrdma_dma_map_regbuf(ia, rb))
548 		goto out_regbuf;
549 	sge->addr = rdmab_addr(rb);
550 	sge->length = len;
551 	sge->lkey = rdmab_lkey(rb);
552 
553 	ib_dma_sync_single_for_device(rdmab_device(rb), sge->addr,
554 				      sge->length, DMA_TO_DEVICE);
555 	sc->sc_wr.num_sge++;
556 	return true;
557 
558 out_regbuf:
559 	pr_err("rpcrdma: failed to DMA map a Send buffer\n");
560 	return false;
561 }
562 
563 /* Prepare the Send SGEs. The head and tail iovec, and each entry
564  * in the page list, gets its own SGE.
565  */
566 static bool
567 rpcrdma_prepare_msg_sges(struct rpcrdma_ia *ia, struct rpcrdma_req *req,
568 			 struct xdr_buf *xdr, enum rpcrdma_chunktype rtype)
569 {
570 	struct rpcrdma_sendctx *sc = req->rl_sendctx;
571 	unsigned int sge_no, page_base, len, remaining;
572 	struct rpcrdma_regbuf *rb = req->rl_sendbuf;
573 	struct ib_device *device = ia->ri_device;
574 	struct ib_sge *sge = sc->sc_sges;
575 	u32 lkey = ia->ri_pd->local_dma_lkey;
576 	struct page *page, **ppages;
577 
578 	/* The head iovec is straightforward, as it is already
579 	 * DMA-mapped. Sync the content that has changed.
580 	 */
581 	if (!rpcrdma_dma_map_regbuf(ia, rb))
582 		goto out_regbuf;
583 	sge_no = 1;
584 	sge[sge_no].addr = rdmab_addr(rb);
585 	sge[sge_no].length = xdr->head[0].iov_len;
586 	sge[sge_no].lkey = rdmab_lkey(rb);
587 	ib_dma_sync_single_for_device(rdmab_device(rb), sge[sge_no].addr,
588 				      sge[sge_no].length, DMA_TO_DEVICE);
589 
590 	/* If there is a Read chunk, the page list is being handled
591 	 * via explicit RDMA, and thus is skipped here. However, the
592 	 * tail iovec may include an XDR pad for the page list, as
593 	 * well as additional content, and may not reside in the
594 	 * same page as the head iovec.
595 	 */
596 	if (rtype == rpcrdma_readch) {
597 		len = xdr->tail[0].iov_len;
598 
599 		/* Do not include the tail if it is only an XDR pad */
600 		if (len < 4)
601 			goto out;
602 
603 		page = virt_to_page(xdr->tail[0].iov_base);
604 		page_base = offset_in_page(xdr->tail[0].iov_base);
605 
606 		/* If the content in the page list is an odd length,
607 		 * xdr_write_pages() has added a pad at the beginning
608 		 * of the tail iovec. Force the tail's non-pad content
609 		 * to land at the next XDR position in the Send message.
610 		 */
611 		page_base += len & 3;
612 		len -= len & 3;
613 		goto map_tail;
614 	}
615 
616 	/* If there is a page list present, temporarily DMA map
617 	 * and prepare an SGE for each page to be sent.
618 	 */
619 	if (xdr->page_len) {
620 		ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT);
621 		page_base = offset_in_page(xdr->page_base);
622 		remaining = xdr->page_len;
623 		while (remaining) {
624 			sge_no++;
625 			if (sge_no > RPCRDMA_MAX_SEND_SGES - 2)
626 				goto out_mapping_overflow;
627 
628 			len = min_t(u32, PAGE_SIZE - page_base, remaining);
629 			sge[sge_no].addr = ib_dma_map_page(device, *ppages,
630 							   page_base, len,
631 							   DMA_TO_DEVICE);
632 			if (ib_dma_mapping_error(device, sge[sge_no].addr))
633 				goto out_mapping_err;
634 			sge[sge_no].length = len;
635 			sge[sge_no].lkey = lkey;
636 
637 			sc->sc_unmap_count++;
638 			ppages++;
639 			remaining -= len;
640 			page_base = 0;
641 		}
642 	}
643 
644 	/* The tail iovec is not always constructed in the same
645 	 * page where the head iovec resides (see, for example,
646 	 * gss_wrap_req_priv). To neatly accommodate that case,
647 	 * DMA map it separately.
648 	 */
649 	if (xdr->tail[0].iov_len) {
650 		page = virt_to_page(xdr->tail[0].iov_base);
651 		page_base = offset_in_page(xdr->tail[0].iov_base);
652 		len = xdr->tail[0].iov_len;
653 
654 map_tail:
655 		sge_no++;
656 		sge[sge_no].addr = ib_dma_map_page(device, page,
657 						   page_base, len,
658 						   DMA_TO_DEVICE);
659 		if (ib_dma_mapping_error(device, sge[sge_no].addr))
660 			goto out_mapping_err;
661 		sge[sge_no].length = len;
662 		sge[sge_no].lkey = lkey;
663 		sc->sc_unmap_count++;
664 	}
665 
666 out:
667 	sc->sc_wr.num_sge += sge_no;
668 	if (sc->sc_unmap_count)
669 		__set_bit(RPCRDMA_REQ_F_TX_RESOURCES, &req->rl_flags);
670 	return true;
671 
672 out_regbuf:
673 	pr_err("rpcrdma: failed to DMA map a Send buffer\n");
674 	return false;
675 
676 out_mapping_overflow:
677 	rpcrdma_unmap_sendctx(sc);
678 	pr_err("rpcrdma: too many Send SGEs (%u)\n", sge_no);
679 	return false;
680 
681 out_mapping_err:
682 	rpcrdma_unmap_sendctx(sc);
683 	trace_xprtrdma_dma_maperr(sge[sge_no].addr);
684 	return false;
685 }
686 
687 /**
688  * rpcrdma_prepare_send_sges - Construct SGEs for a Send WR
689  * @r_xprt: controlling transport
690  * @req: context of RPC Call being marshalled
691  * @hdrlen: size of transport header, in bytes
692  * @xdr: xdr_buf containing RPC Call
693  * @rtype: chunk type being encoded
694  *
695  * Returns 0 on success; otherwise a negative errno is returned.
696  */
697 int
698 rpcrdma_prepare_send_sges(struct rpcrdma_xprt *r_xprt,
699 			  struct rpcrdma_req *req, u32 hdrlen,
700 			  struct xdr_buf *xdr, enum rpcrdma_chunktype rtype)
701 {
702 	req->rl_sendctx = rpcrdma_sendctx_get_locked(&r_xprt->rx_buf);
703 	if (!req->rl_sendctx)
704 		return -EAGAIN;
705 	req->rl_sendctx->sc_wr.num_sge = 0;
706 	req->rl_sendctx->sc_unmap_count = 0;
707 	req->rl_sendctx->sc_req = req;
708 	__clear_bit(RPCRDMA_REQ_F_TX_RESOURCES, &req->rl_flags);
709 
710 	if (!rpcrdma_prepare_hdr_sge(&r_xprt->rx_ia, req, hdrlen))
711 		return -EIO;
712 
713 	if (rtype != rpcrdma_areadch)
714 		if (!rpcrdma_prepare_msg_sges(&r_xprt->rx_ia, req, xdr, rtype))
715 			return -EIO;
716 
717 	return 0;
718 }
719 
720 /**
721  * rpcrdma_marshal_req - Marshal and send one RPC request
722  * @r_xprt: controlling transport
723  * @rqst: RPC request to be marshaled
724  *
725  * For the RPC in "rqst", this function:
726  *  - Chooses the transfer mode (eg., RDMA_MSG or RDMA_NOMSG)
727  *  - Registers Read, Write, and Reply chunks
728  *  - Constructs the transport header
729  *  - Posts a Send WR to send the transport header and request
730  *
731  * Returns:
732  *	%0 if the RPC was sent successfully,
733  *	%-ENOTCONN if the connection was lost,
734  *	%-EAGAIN if the caller should call again with the same arguments,
735  *	%-ENOBUFS if the caller should call again after a delay,
736  *	%-EMSGSIZE if the transport header is too small,
737  *	%-EIO if a permanent problem occurred while marshaling.
738  */
739 int
740 rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst)
741 {
742 	struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
743 	struct xdr_stream *xdr = &req->rl_stream;
744 	enum rpcrdma_chunktype rtype, wtype;
745 	bool ddp_allowed;
746 	__be32 *p;
747 	int ret;
748 
749 	rpcrdma_set_xdrlen(&req->rl_hdrbuf, 0);
750 	xdr_init_encode(xdr, &req->rl_hdrbuf,
751 			req->rl_rdmabuf->rg_base, rqst);
752 
753 	/* Fixed header fields */
754 	ret = -EMSGSIZE;
755 	p = xdr_reserve_space(xdr, 4 * sizeof(*p));
756 	if (!p)
757 		goto out_err;
758 	*p++ = rqst->rq_xid;
759 	*p++ = rpcrdma_version;
760 	*p++ = cpu_to_be32(r_xprt->rx_buf.rb_max_requests);
761 
762 	/* When the ULP employs a GSS flavor that guarantees integrity
763 	 * or privacy, direct data placement of individual data items
764 	 * is not allowed.
765 	 */
766 	ddp_allowed = !(rqst->rq_cred->cr_auth->au_flags &
767 						RPCAUTH_AUTH_DATATOUCH);
768 
769 	/*
770 	 * Chunks needed for results?
771 	 *
772 	 * o If the expected result is under the inline threshold, all ops
773 	 *   return as inline.
774 	 * o Large read ops return data as write chunk(s), header as
775 	 *   inline.
776 	 * o Large non-read ops return as a single reply chunk.
777 	 */
778 	if (rpcrdma_results_inline(r_xprt, rqst))
779 		wtype = rpcrdma_noch;
780 	else if ((ddp_allowed && rqst->rq_rcv_buf.flags & XDRBUF_READ) &&
781 		 rpcrdma_nonpayload_inline(r_xprt, rqst))
782 		wtype = rpcrdma_writech;
783 	else
784 		wtype = rpcrdma_replych;
785 
786 	/*
787 	 * Chunks needed for arguments?
788 	 *
789 	 * o If the total request is under the inline threshold, all ops
790 	 *   are sent as inline.
791 	 * o Large write ops transmit data as read chunk(s), header as
792 	 *   inline.
793 	 * o Large non-write ops are sent with the entire message as a
794 	 *   single read chunk (protocol 0-position special case).
795 	 *
796 	 * This assumes that the upper layer does not present a request
797 	 * that both has a data payload, and whose non-data arguments
798 	 * by themselves are larger than the inline threshold.
799 	 */
800 	if (rpcrdma_args_inline(r_xprt, rqst)) {
801 		*p++ = rdma_msg;
802 		rtype = rpcrdma_noch;
803 	} else if (ddp_allowed && rqst->rq_snd_buf.flags & XDRBUF_WRITE) {
804 		*p++ = rdma_msg;
805 		rtype = rpcrdma_readch;
806 	} else {
807 		r_xprt->rx_stats.nomsg_call_count++;
808 		*p++ = rdma_nomsg;
809 		rtype = rpcrdma_areadch;
810 	}
811 
812 	/* If this is a retransmit, discard previously registered
813 	 * chunks. Very likely the connection has been replaced,
814 	 * so these registrations are invalid and unusable.
815 	 */
816 	while (unlikely(!list_empty(&req->rl_registered))) {
817 		struct rpcrdma_mr *mr;
818 
819 		mr = rpcrdma_mr_pop(&req->rl_registered);
820 		rpcrdma_mr_recycle(mr);
821 	}
822 
823 	/* This implementation supports the following combinations
824 	 * of chunk lists in one RPC-over-RDMA Call message:
825 	 *
826 	 *   - Read list
827 	 *   - Write list
828 	 *   - Reply chunk
829 	 *   - Read list + Reply chunk
830 	 *
831 	 * It might not yet support the following combinations:
832 	 *
833 	 *   - Read list + Write list
834 	 *
835 	 * It does not support the following combinations:
836 	 *
837 	 *   - Write list + Reply chunk
838 	 *   - Read list + Write list + Reply chunk
839 	 *
840 	 * This implementation supports only a single chunk in each
841 	 * Read or Write list. Thus for example the client cannot
842 	 * send a Call message with a Position Zero Read chunk and a
843 	 * regular Read chunk at the same time.
844 	 */
845 	if (rtype != rpcrdma_noch) {
846 		ret = rpcrdma_encode_read_list(r_xprt, req, rqst, rtype);
847 		if (ret)
848 			goto out_err;
849 	}
850 	ret = encode_item_not_present(xdr);
851 	if (ret)
852 		goto out_err;
853 
854 	if (wtype == rpcrdma_writech) {
855 		ret = rpcrdma_encode_write_list(r_xprt, req, rqst, wtype);
856 		if (ret)
857 			goto out_err;
858 	}
859 	ret = encode_item_not_present(xdr);
860 	if (ret)
861 		goto out_err;
862 
863 	if (wtype != rpcrdma_replych)
864 		ret = encode_item_not_present(xdr);
865 	else
866 		ret = rpcrdma_encode_reply_chunk(r_xprt, req, rqst, wtype);
867 	if (ret)
868 		goto out_err;
869 
870 	trace_xprtrdma_marshal(rqst, xdr_stream_pos(xdr), rtype, wtype);
871 
872 	ret = rpcrdma_prepare_send_sges(r_xprt, req, xdr_stream_pos(xdr),
873 					&rqst->rq_snd_buf, rtype);
874 	if (ret)
875 		goto out_err;
876 	return 0;
877 
878 out_err:
879 	switch (ret) {
880 	case -EAGAIN:
881 		xprt_wait_for_buffer_space(rqst->rq_xprt);
882 		break;
883 	case -ENOBUFS:
884 		break;
885 	default:
886 		r_xprt->rx_stats.failed_marshal_count++;
887 	}
888 	return ret;
889 }
890 
891 /**
892  * rpcrdma_inline_fixup - Scatter inline received data into rqst's iovecs
893  * @rqst: controlling RPC request
894  * @srcp: points to RPC message payload in receive buffer
895  * @copy_len: remaining length of receive buffer content
896  * @pad: Write chunk pad bytes needed (zero for pure inline)
897  *
898  * The upper layer has set the maximum number of bytes it can
899  * receive in each component of rq_rcv_buf. These values are set in
900  * the head.iov_len, page_len, tail.iov_len, and buflen fields.
901  *
902  * Unlike the TCP equivalent (xdr_partial_copy_from_skb), in
903  * many cases this function simply updates iov_base pointers in
904  * rq_rcv_buf to point directly to the received reply data, to
905  * avoid copying reply data.
906  *
907  * Returns the count of bytes which had to be memcopied.
908  */
909 static unsigned long
910 rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad)
911 {
912 	unsigned long fixup_copy_count;
913 	int i, npages, curlen;
914 	char *destp;
915 	struct page **ppages;
916 	int page_base;
917 
918 	/* The head iovec is redirected to the RPC reply message
919 	 * in the receive buffer, to avoid a memcopy.
920 	 */
921 	rqst->rq_rcv_buf.head[0].iov_base = srcp;
922 	rqst->rq_private_buf.head[0].iov_base = srcp;
923 
924 	/* The contents of the receive buffer that follow
925 	 * head.iov_len bytes are copied into the page list.
926 	 */
927 	curlen = rqst->rq_rcv_buf.head[0].iov_len;
928 	if (curlen > copy_len)
929 		curlen = copy_len;
930 	trace_xprtrdma_fixup(rqst, copy_len, curlen);
931 	srcp += curlen;
932 	copy_len -= curlen;
933 
934 	ppages = rqst->rq_rcv_buf.pages +
935 		(rqst->rq_rcv_buf.page_base >> PAGE_SHIFT);
936 	page_base = offset_in_page(rqst->rq_rcv_buf.page_base);
937 	fixup_copy_count = 0;
938 	if (copy_len && rqst->rq_rcv_buf.page_len) {
939 		int pagelist_len;
940 
941 		pagelist_len = rqst->rq_rcv_buf.page_len;
942 		if (pagelist_len > copy_len)
943 			pagelist_len = copy_len;
944 		npages = PAGE_ALIGN(page_base + pagelist_len) >> PAGE_SHIFT;
945 		for (i = 0; i < npages; i++) {
946 			curlen = PAGE_SIZE - page_base;
947 			if (curlen > pagelist_len)
948 				curlen = pagelist_len;
949 
950 			trace_xprtrdma_fixup_pg(rqst, i, srcp,
951 						copy_len, curlen);
952 			destp = kmap_atomic(ppages[i]);
953 			memcpy(destp + page_base, srcp, curlen);
954 			flush_dcache_page(ppages[i]);
955 			kunmap_atomic(destp);
956 			srcp += curlen;
957 			copy_len -= curlen;
958 			fixup_copy_count += curlen;
959 			pagelist_len -= curlen;
960 			if (!pagelist_len)
961 				break;
962 			page_base = 0;
963 		}
964 
965 		/* Implicit padding for the last segment in a Write
966 		 * chunk is inserted inline at the front of the tail
967 		 * iovec. The upper layer ignores the content of
968 		 * the pad. Simply ensure inline content in the tail
969 		 * that follows the Write chunk is properly aligned.
970 		 */
971 		if (pad)
972 			srcp -= pad;
973 	}
974 
975 	/* The tail iovec is redirected to the remaining data
976 	 * in the receive buffer, to avoid a memcopy.
977 	 */
978 	if (copy_len || pad) {
979 		rqst->rq_rcv_buf.tail[0].iov_base = srcp;
980 		rqst->rq_private_buf.tail[0].iov_base = srcp;
981 	}
982 
983 	return fixup_copy_count;
984 }
985 
986 /* By convention, backchannel calls arrive via rdma_msg type
987  * messages, and never populate the chunk lists. This makes
988  * the RPC/RDMA header small and fixed in size, so it is
989  * straightforward to check the RPC header's direction field.
990  */
991 static bool
992 rpcrdma_is_bcall(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep)
993 #if defined(CONFIG_SUNRPC_BACKCHANNEL)
994 {
995 	struct xdr_stream *xdr = &rep->rr_stream;
996 	__be32 *p;
997 
998 	if (rep->rr_proc != rdma_msg)
999 		return false;
1000 
1001 	/* Peek at stream contents without advancing. */
1002 	p = xdr_inline_decode(xdr, 0);
1003 
1004 	/* Chunk lists */
1005 	if (*p++ != xdr_zero)
1006 		return false;
1007 	if (*p++ != xdr_zero)
1008 		return false;
1009 	if (*p++ != xdr_zero)
1010 		return false;
1011 
1012 	/* RPC header */
1013 	if (*p++ != rep->rr_xid)
1014 		return false;
1015 	if (*p != cpu_to_be32(RPC_CALL))
1016 		return false;
1017 
1018 	/* Now that we are sure this is a backchannel call,
1019 	 * advance to the RPC header.
1020 	 */
1021 	p = xdr_inline_decode(xdr, 3 * sizeof(*p));
1022 	if (unlikely(!p))
1023 		goto out_short;
1024 
1025 	rpcrdma_bc_receive_call(r_xprt, rep);
1026 	return true;
1027 
1028 out_short:
1029 	pr_warn("RPC/RDMA short backward direction call\n");
1030 	return true;
1031 }
1032 #else	/* CONFIG_SUNRPC_BACKCHANNEL */
1033 {
1034 	return false;
1035 }
1036 #endif	/* CONFIG_SUNRPC_BACKCHANNEL */
1037 
1038 static int decode_rdma_segment(struct xdr_stream *xdr, u32 *length)
1039 {
1040 	u32 handle;
1041 	u64 offset;
1042 	__be32 *p;
1043 
1044 	p = xdr_inline_decode(xdr, 4 * sizeof(*p));
1045 	if (unlikely(!p))
1046 		return -EIO;
1047 
1048 	handle = be32_to_cpup(p++);
1049 	*length = be32_to_cpup(p++);
1050 	xdr_decode_hyper(p, &offset);
1051 
1052 	trace_xprtrdma_decode_seg(handle, *length, offset);
1053 	return 0;
1054 }
1055 
1056 static int decode_write_chunk(struct xdr_stream *xdr, u32 *length)
1057 {
1058 	u32 segcount, seglength;
1059 	__be32 *p;
1060 
1061 	p = xdr_inline_decode(xdr, sizeof(*p));
1062 	if (unlikely(!p))
1063 		return -EIO;
1064 
1065 	*length = 0;
1066 	segcount = be32_to_cpup(p);
1067 	while (segcount--) {
1068 		if (decode_rdma_segment(xdr, &seglength))
1069 			return -EIO;
1070 		*length += seglength;
1071 	}
1072 
1073 	return 0;
1074 }
1075 
1076 /* In RPC-over-RDMA Version One replies, a Read list is never
1077  * expected. This decoder is a stub that returns an error if
1078  * a Read list is present.
1079  */
1080 static int decode_read_list(struct xdr_stream *xdr)
1081 {
1082 	__be32 *p;
1083 
1084 	p = xdr_inline_decode(xdr, sizeof(*p));
1085 	if (unlikely(!p))
1086 		return -EIO;
1087 	if (unlikely(*p != xdr_zero))
1088 		return -EIO;
1089 	return 0;
1090 }
1091 
1092 /* Supports only one Write chunk in the Write list
1093  */
1094 static int decode_write_list(struct xdr_stream *xdr, u32 *length)
1095 {
1096 	u32 chunklen;
1097 	bool first;
1098 	__be32 *p;
1099 
1100 	*length = 0;
1101 	first = true;
1102 	do {
1103 		p = xdr_inline_decode(xdr, sizeof(*p));
1104 		if (unlikely(!p))
1105 			return -EIO;
1106 		if (*p == xdr_zero)
1107 			break;
1108 		if (!first)
1109 			return -EIO;
1110 
1111 		if (decode_write_chunk(xdr, &chunklen))
1112 			return -EIO;
1113 		*length += chunklen;
1114 		first = false;
1115 	} while (true);
1116 	return 0;
1117 }
1118 
1119 static int decode_reply_chunk(struct xdr_stream *xdr, u32 *length)
1120 {
1121 	__be32 *p;
1122 
1123 	p = xdr_inline_decode(xdr, sizeof(*p));
1124 	if (unlikely(!p))
1125 		return -EIO;
1126 
1127 	*length = 0;
1128 	if (*p != xdr_zero)
1129 		if (decode_write_chunk(xdr, length))
1130 			return -EIO;
1131 	return 0;
1132 }
1133 
1134 static int
1135 rpcrdma_decode_msg(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep,
1136 		   struct rpc_rqst *rqst)
1137 {
1138 	struct xdr_stream *xdr = &rep->rr_stream;
1139 	u32 writelist, replychunk, rpclen;
1140 	char *base;
1141 
1142 	/* Decode the chunk lists */
1143 	if (decode_read_list(xdr))
1144 		return -EIO;
1145 	if (decode_write_list(xdr, &writelist))
1146 		return -EIO;
1147 	if (decode_reply_chunk(xdr, &replychunk))
1148 		return -EIO;
1149 
1150 	/* RDMA_MSG sanity checks */
1151 	if (unlikely(replychunk))
1152 		return -EIO;
1153 
1154 	/* Build the RPC reply's Payload stream in rqst->rq_rcv_buf */
1155 	base = (char *)xdr_inline_decode(xdr, 0);
1156 	rpclen = xdr_stream_remaining(xdr);
1157 	r_xprt->rx_stats.fixup_copy_count +=
1158 		rpcrdma_inline_fixup(rqst, base, rpclen, writelist & 3);
1159 
1160 	r_xprt->rx_stats.total_rdma_reply += writelist;
1161 	return rpclen + xdr_align_size(writelist);
1162 }
1163 
1164 static noinline int
1165 rpcrdma_decode_nomsg(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep)
1166 {
1167 	struct xdr_stream *xdr = &rep->rr_stream;
1168 	u32 writelist, replychunk;
1169 
1170 	/* Decode the chunk lists */
1171 	if (decode_read_list(xdr))
1172 		return -EIO;
1173 	if (decode_write_list(xdr, &writelist))
1174 		return -EIO;
1175 	if (decode_reply_chunk(xdr, &replychunk))
1176 		return -EIO;
1177 
1178 	/* RDMA_NOMSG sanity checks */
1179 	if (unlikely(writelist))
1180 		return -EIO;
1181 	if (unlikely(!replychunk))
1182 		return -EIO;
1183 
1184 	/* Reply chunk buffer already is the reply vector */
1185 	r_xprt->rx_stats.total_rdma_reply += replychunk;
1186 	return replychunk;
1187 }
1188 
1189 static noinline int
1190 rpcrdma_decode_error(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep,
1191 		     struct rpc_rqst *rqst)
1192 {
1193 	struct xdr_stream *xdr = &rep->rr_stream;
1194 	__be32 *p;
1195 
1196 	p = xdr_inline_decode(xdr, sizeof(*p));
1197 	if (unlikely(!p))
1198 		return -EIO;
1199 
1200 	switch (*p) {
1201 	case err_vers:
1202 		p = xdr_inline_decode(xdr, 2 * sizeof(*p));
1203 		if (!p)
1204 			break;
1205 		dprintk("RPC:       %s: server reports "
1206 			"version error (%u-%u), xid %08x\n", __func__,
1207 			be32_to_cpup(p), be32_to_cpu(*(p + 1)),
1208 			be32_to_cpu(rep->rr_xid));
1209 		break;
1210 	case err_chunk:
1211 		dprintk("RPC:       %s: server reports "
1212 			"header decoding error, xid %08x\n", __func__,
1213 			be32_to_cpu(rep->rr_xid));
1214 		break;
1215 	default:
1216 		dprintk("RPC:       %s: server reports "
1217 			"unrecognized error %d, xid %08x\n", __func__,
1218 			be32_to_cpup(p), be32_to_cpu(rep->rr_xid));
1219 	}
1220 
1221 	r_xprt->rx_stats.bad_reply_count++;
1222 	return -EREMOTEIO;
1223 }
1224 
1225 /* Perform XID lookup, reconstruction of the RPC reply, and
1226  * RPC completion while holding the transport lock to ensure
1227  * the rep, rqst, and rq_task pointers remain stable.
1228  */
1229 void rpcrdma_complete_rqst(struct rpcrdma_rep *rep)
1230 {
1231 	struct rpcrdma_xprt *r_xprt = rep->rr_rxprt;
1232 	struct rpc_xprt *xprt = &r_xprt->rx_xprt;
1233 	struct rpc_rqst *rqst = rep->rr_rqst;
1234 	int status;
1235 
1236 	xprt->reestablish_timeout = 0;
1237 
1238 	switch (rep->rr_proc) {
1239 	case rdma_msg:
1240 		status = rpcrdma_decode_msg(r_xprt, rep, rqst);
1241 		break;
1242 	case rdma_nomsg:
1243 		status = rpcrdma_decode_nomsg(r_xprt, rep);
1244 		break;
1245 	case rdma_error:
1246 		status = rpcrdma_decode_error(r_xprt, rep, rqst);
1247 		break;
1248 	default:
1249 		status = -EIO;
1250 	}
1251 	if (status < 0)
1252 		goto out_badheader;
1253 
1254 out:
1255 	spin_lock(&xprt->queue_lock);
1256 	xprt_complete_rqst(rqst->rq_task, status);
1257 	xprt_unpin_rqst(rqst);
1258 	spin_unlock(&xprt->queue_lock);
1259 	return;
1260 
1261 /* If the incoming reply terminated a pending RPC, the next
1262  * RPC call will post a replacement receive buffer as it is
1263  * being marshaled.
1264  */
1265 out_badheader:
1266 	trace_xprtrdma_reply_hdr(rep);
1267 	r_xprt->rx_stats.bad_reply_count++;
1268 	goto out;
1269 }
1270 
1271 void rpcrdma_release_rqst(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
1272 {
1273 	/* Invalidate and unmap the data payloads before waking
1274 	 * the waiting application. This guarantees the memory
1275 	 * regions are properly fenced from the server before the
1276 	 * application accesses the data. It also ensures proper
1277 	 * send flow control: waking the next RPC waits until this
1278 	 * RPC has relinquished all its Send Queue entries.
1279 	 */
1280 	if (!list_empty(&req->rl_registered))
1281 		frwr_unmap_sync(r_xprt, &req->rl_registered);
1282 
1283 	/* Ensure that any DMA mapped pages associated with
1284 	 * the Send of the RPC Call have been unmapped before
1285 	 * allowing the RPC to complete. This protects argument
1286 	 * memory not controlled by the RPC client from being
1287 	 * re-used before we're done with it.
1288 	 */
1289 	if (test_bit(RPCRDMA_REQ_F_TX_RESOURCES, &req->rl_flags)) {
1290 		r_xprt->rx_stats.reply_waits_for_send++;
1291 		out_of_line_wait_on_bit(&req->rl_flags,
1292 					RPCRDMA_REQ_F_TX_RESOURCES,
1293 					bit_wait,
1294 					TASK_UNINTERRUPTIBLE);
1295 	}
1296 }
1297 
1298 /* Reply handling runs in the poll worker thread. Anything that
1299  * might wait is deferred to a separate workqueue.
1300  */
1301 void rpcrdma_deferred_completion(struct work_struct *work)
1302 {
1303 	struct rpcrdma_rep *rep =
1304 			container_of(work, struct rpcrdma_rep, rr_work);
1305 	struct rpcrdma_req *req = rpcr_to_rdmar(rep->rr_rqst);
1306 	struct rpcrdma_xprt *r_xprt = rep->rr_rxprt;
1307 
1308 	trace_xprtrdma_defer_cmp(rep);
1309 	if (rep->rr_wc_flags & IB_WC_WITH_INVALIDATE)
1310 		frwr_reminv(rep, &req->rl_registered);
1311 	rpcrdma_release_rqst(r_xprt, req);
1312 	rpcrdma_complete_rqst(rep);
1313 }
1314 
1315 /* Process received RPC/RDMA messages.
1316  *
1317  * Errors must result in the RPC task either being awakened, or
1318  * allowed to timeout, to discover the errors at that time.
1319  */
1320 void rpcrdma_reply_handler(struct rpcrdma_rep *rep)
1321 {
1322 	struct rpcrdma_xprt *r_xprt = rep->rr_rxprt;
1323 	struct rpc_xprt *xprt = &r_xprt->rx_xprt;
1324 	struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
1325 	struct rpcrdma_req *req;
1326 	struct rpc_rqst *rqst;
1327 	u32 credits;
1328 	__be32 *p;
1329 
1330 	/* Fixed transport header fields */
1331 	xdr_init_decode(&rep->rr_stream, &rep->rr_hdrbuf,
1332 			rep->rr_hdrbuf.head[0].iov_base, NULL);
1333 	p = xdr_inline_decode(&rep->rr_stream, 4 * sizeof(*p));
1334 	if (unlikely(!p))
1335 		goto out_shortreply;
1336 	rep->rr_xid = *p++;
1337 	rep->rr_vers = *p++;
1338 	credits = be32_to_cpu(*p++);
1339 	rep->rr_proc = *p++;
1340 
1341 	if (rep->rr_vers != rpcrdma_version)
1342 		goto out_badversion;
1343 
1344 	if (rpcrdma_is_bcall(r_xprt, rep))
1345 		return;
1346 
1347 	/* Match incoming rpcrdma_rep to an rpcrdma_req to
1348 	 * get context for handling any incoming chunks.
1349 	 */
1350 	spin_lock(&xprt->queue_lock);
1351 	rqst = xprt_lookup_rqst(xprt, rep->rr_xid);
1352 	if (!rqst)
1353 		goto out_norqst;
1354 	xprt_pin_rqst(rqst);
1355 	spin_unlock(&xprt->queue_lock);
1356 
1357 	if (credits == 0)
1358 		credits = 1;	/* don't deadlock */
1359 	else if (credits > buf->rb_max_requests)
1360 		credits = buf->rb_max_requests;
1361 	if (buf->rb_credits != credits) {
1362 		spin_lock_bh(&xprt->transport_lock);
1363 		buf->rb_credits = credits;
1364 		xprt->cwnd = credits << RPC_CWNDSHIFT;
1365 		spin_unlock_bh(&xprt->transport_lock);
1366 	}
1367 
1368 	req = rpcr_to_rdmar(rqst);
1369 	if (req->rl_reply) {
1370 		trace_xprtrdma_leaked_rep(rqst, req->rl_reply);
1371 		rpcrdma_recv_buffer_put(req->rl_reply);
1372 	}
1373 	req->rl_reply = rep;
1374 	rep->rr_rqst = rqst;
1375 	clear_bit(RPCRDMA_REQ_F_PENDING, &req->rl_flags);
1376 
1377 	trace_xprtrdma_reply(rqst->rq_task, rep, req, credits);
1378 	queue_work(buf->rb_completion_wq, &rep->rr_work);
1379 	return;
1380 
1381 out_badversion:
1382 	trace_xprtrdma_reply_vers(rep);
1383 	goto out;
1384 
1385 out_norqst:
1386 	spin_unlock(&xprt->queue_lock);
1387 	trace_xprtrdma_reply_rqst(rep);
1388 	goto out;
1389 
1390 out_shortreply:
1391 	trace_xprtrdma_reply_short(rep);
1392 
1393 out:
1394 	rpcrdma_recv_buffer_put(rep);
1395 }
1396