xref: /openbmc/linux/net/sunrpc/xprtrdma/rpc_rdma.c (revision 943126417891372d56aa3fe46295cbf53db31370)
1 // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
2 /*
3  * Copyright (c) 2014-2017 Oracle.  All rights reserved.
4  * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
5  *
6  * This software is available to you under a choice of one of two
7  * licenses.  You may choose to be licensed under the terms of the GNU
8  * General Public License (GPL) Version 2, available from the file
9  * COPYING in the main directory of this source tree, or the BSD-type
10  * license below:
11  *
12  * Redistribution and use in source and binary forms, with or without
13  * modification, are permitted provided that the following conditions
14  * are met:
15  *
16  *      Redistributions of source code must retain the above copyright
17  *      notice, this list of conditions and the following disclaimer.
18  *
19  *      Redistributions in binary form must reproduce the above
20  *      copyright notice, this list of conditions and the following
21  *      disclaimer in the documentation and/or other materials provided
22  *      with the distribution.
23  *
24  *      Neither the name of the Network Appliance, Inc. nor the names of
25  *      its contributors may be used to endorse or promote products
26  *      derived from this software without specific prior written
27  *      permission.
28  *
29  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
30  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
31  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
32  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
33  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
34  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
35  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
36  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
37  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
38  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
39  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
40  */
41 
42 /*
43  * rpc_rdma.c
44  *
45  * This file contains the guts of the RPC RDMA protocol, and
46  * does marshaling/unmarshaling, etc. It is also where interfacing
47  * to the Linux RPC framework lives.
48  */
49 
50 #include <linux/highmem.h>
51 
52 #include <linux/sunrpc/svc_rdma.h>
53 
54 #include "xprt_rdma.h"
55 #include <trace/events/rpcrdma.h>
56 
57 #if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
58 # define RPCDBG_FACILITY	RPCDBG_TRANS
59 #endif
60 
61 /* Returns size of largest RPC-over-RDMA header in a Call message
62  *
63  * The largest Call header contains a full-size Read list and a
64  * minimal Reply chunk.
65  */
66 static unsigned int rpcrdma_max_call_header_size(unsigned int maxsegs)
67 {
68 	unsigned int size;
69 
70 	/* Fixed header fields and list discriminators */
71 	size = RPCRDMA_HDRLEN_MIN;
72 
73 	/* Maximum Read list size */
74 	size = maxsegs * rpcrdma_readchunk_maxsz * sizeof(__be32);
75 
76 	/* Minimal Read chunk size */
77 	size += sizeof(__be32);	/* segment count */
78 	size += rpcrdma_segment_maxsz * sizeof(__be32);
79 	size += sizeof(__be32);	/* list discriminator */
80 
81 	dprintk("RPC:       %s: max call header size = %u\n",
82 		__func__, size);
83 	return size;
84 }
85 
86 /* Returns size of largest RPC-over-RDMA header in a Reply message
87  *
88  * There is only one Write list or one Reply chunk per Reply
89  * message.  The larger list is the Write list.
90  */
91 static unsigned int rpcrdma_max_reply_header_size(unsigned int maxsegs)
92 {
93 	unsigned int size;
94 
95 	/* Fixed header fields and list discriminators */
96 	size = RPCRDMA_HDRLEN_MIN;
97 
98 	/* Maximum Write list size */
99 	size = sizeof(__be32);		/* segment count */
100 	size += maxsegs * rpcrdma_segment_maxsz * sizeof(__be32);
101 	size += sizeof(__be32);	/* list discriminator */
102 
103 	dprintk("RPC:       %s: max reply header size = %u\n",
104 		__func__, size);
105 	return size;
106 }
107 
108 void rpcrdma_set_max_header_sizes(struct rpcrdma_xprt *r_xprt)
109 {
110 	struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
111 	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
112 	unsigned int maxsegs = ia->ri_max_segs;
113 
114 	ia->ri_max_inline_write = cdata->inline_wsize -
115 				  rpcrdma_max_call_header_size(maxsegs);
116 	ia->ri_max_inline_read = cdata->inline_rsize -
117 				 rpcrdma_max_reply_header_size(maxsegs);
118 }
119 
120 /* The client can send a request inline as long as the RPCRDMA header
121  * plus the RPC call fit under the transport's inline limit. If the
122  * combined call message size exceeds that limit, the client must use
123  * a Read chunk for this operation.
124  *
125  * A Read chunk is also required if sending the RPC call inline would
126  * exceed this device's max_sge limit.
127  */
128 static bool rpcrdma_args_inline(struct rpcrdma_xprt *r_xprt,
129 				struct rpc_rqst *rqst)
130 {
131 	struct xdr_buf *xdr = &rqst->rq_snd_buf;
132 	unsigned int count, remaining, offset;
133 
134 	if (xdr->len > r_xprt->rx_ia.ri_max_inline_write)
135 		return false;
136 
137 	if (xdr->page_len) {
138 		remaining = xdr->page_len;
139 		offset = offset_in_page(xdr->page_base);
140 		count = RPCRDMA_MIN_SEND_SGES;
141 		while (remaining) {
142 			remaining -= min_t(unsigned int,
143 					   PAGE_SIZE - offset, remaining);
144 			offset = 0;
145 			if (++count > r_xprt->rx_ia.ri_max_send_sges)
146 				return false;
147 		}
148 	}
149 
150 	return true;
151 }
152 
153 /* The client can't know how large the actual reply will be. Thus it
154  * plans for the largest possible reply for that particular ULP
155  * operation. If the maximum combined reply message size exceeds that
156  * limit, the client must provide a write list or a reply chunk for
157  * this request.
158  */
159 static bool rpcrdma_results_inline(struct rpcrdma_xprt *r_xprt,
160 				   struct rpc_rqst *rqst)
161 {
162 	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
163 
164 	return rqst->rq_rcv_buf.buflen <= ia->ri_max_inline_read;
165 }
166 
167 /* Split @vec on page boundaries into SGEs. FMR registers pages, not
168  * a byte range. Other modes coalesce these SGEs into a single MR
169  * when they can.
170  *
171  * Returns pointer to next available SGE, and bumps the total number
172  * of SGEs consumed.
173  */
174 static struct rpcrdma_mr_seg *
175 rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg,
176 		     unsigned int *n)
177 {
178 	u32 remaining, page_offset;
179 	char *base;
180 
181 	base = vec->iov_base;
182 	page_offset = offset_in_page(base);
183 	remaining = vec->iov_len;
184 	while (remaining) {
185 		seg->mr_page = NULL;
186 		seg->mr_offset = base;
187 		seg->mr_len = min_t(u32, PAGE_SIZE - page_offset, remaining);
188 		remaining -= seg->mr_len;
189 		base += seg->mr_len;
190 		++seg;
191 		++(*n);
192 		page_offset = 0;
193 	}
194 	return seg;
195 }
196 
197 /* Convert @xdrbuf into SGEs no larger than a page each. As they
198  * are registered, these SGEs are then coalesced into RDMA segments
199  * when the selected memreg mode supports it.
200  *
201  * Returns positive number of SGEs consumed, or a negative errno.
202  */
203 
204 static int
205 rpcrdma_convert_iovs(struct rpcrdma_xprt *r_xprt, struct xdr_buf *xdrbuf,
206 		     unsigned int pos, enum rpcrdma_chunktype type,
207 		     struct rpcrdma_mr_seg *seg)
208 {
209 	unsigned long page_base;
210 	unsigned int len, n;
211 	struct page **ppages;
212 
213 	n = 0;
214 	if (pos == 0)
215 		seg = rpcrdma_convert_kvec(&xdrbuf->head[0], seg, &n);
216 
217 	len = xdrbuf->page_len;
218 	ppages = xdrbuf->pages + (xdrbuf->page_base >> PAGE_SHIFT);
219 	page_base = offset_in_page(xdrbuf->page_base);
220 	while (len) {
221 		if (unlikely(!*ppages)) {
222 			/* XXX: Certain upper layer operations do
223 			 *	not provide receive buffer pages.
224 			 */
225 			*ppages = alloc_page(GFP_ATOMIC);
226 			if (!*ppages)
227 				return -ENOBUFS;
228 		}
229 		seg->mr_page = *ppages;
230 		seg->mr_offset = (char *)page_base;
231 		seg->mr_len = min_t(u32, PAGE_SIZE - page_base, len);
232 		len -= seg->mr_len;
233 		++ppages;
234 		++seg;
235 		++n;
236 		page_base = 0;
237 	}
238 
239 	/* When encoding a Read chunk, the tail iovec contains an
240 	 * XDR pad and may be omitted.
241 	 */
242 	if (type == rpcrdma_readch && r_xprt->rx_ia.ri_implicit_roundup)
243 		goto out;
244 
245 	/* When encoding a Write chunk, some servers need to see an
246 	 * extra segment for non-XDR-aligned Write chunks. The upper
247 	 * layer provides space in the tail iovec that may be used
248 	 * for this purpose.
249 	 */
250 	if (type == rpcrdma_writech && r_xprt->rx_ia.ri_implicit_roundup)
251 		goto out;
252 
253 	if (xdrbuf->tail[0].iov_len)
254 		seg = rpcrdma_convert_kvec(&xdrbuf->tail[0], seg, &n);
255 
256 out:
257 	if (unlikely(n > RPCRDMA_MAX_SEGS))
258 		return -EIO;
259 	return n;
260 }
261 
262 static inline int
263 encode_item_present(struct xdr_stream *xdr)
264 {
265 	__be32 *p;
266 
267 	p = xdr_reserve_space(xdr, sizeof(*p));
268 	if (unlikely(!p))
269 		return -EMSGSIZE;
270 
271 	*p = xdr_one;
272 	return 0;
273 }
274 
275 static inline int
276 encode_item_not_present(struct xdr_stream *xdr)
277 {
278 	__be32 *p;
279 
280 	p = xdr_reserve_space(xdr, sizeof(*p));
281 	if (unlikely(!p))
282 		return -EMSGSIZE;
283 
284 	*p = xdr_zero;
285 	return 0;
286 }
287 
288 static void
289 xdr_encode_rdma_segment(__be32 *iptr, struct rpcrdma_mr *mr)
290 {
291 	*iptr++ = cpu_to_be32(mr->mr_handle);
292 	*iptr++ = cpu_to_be32(mr->mr_length);
293 	xdr_encode_hyper(iptr, mr->mr_offset);
294 }
295 
296 static int
297 encode_rdma_segment(struct xdr_stream *xdr, struct rpcrdma_mr *mr)
298 {
299 	__be32 *p;
300 
301 	p = xdr_reserve_space(xdr, 4 * sizeof(*p));
302 	if (unlikely(!p))
303 		return -EMSGSIZE;
304 
305 	xdr_encode_rdma_segment(p, mr);
306 	return 0;
307 }
308 
309 static int
310 encode_read_segment(struct xdr_stream *xdr, struct rpcrdma_mr *mr,
311 		    u32 position)
312 {
313 	__be32 *p;
314 
315 	p = xdr_reserve_space(xdr, 6 * sizeof(*p));
316 	if (unlikely(!p))
317 		return -EMSGSIZE;
318 
319 	*p++ = xdr_one;			/* Item present */
320 	*p++ = cpu_to_be32(position);
321 	xdr_encode_rdma_segment(p, mr);
322 	return 0;
323 }
324 
325 /* Register and XDR encode the Read list. Supports encoding a list of read
326  * segments that belong to a single read chunk.
327  *
328  * Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64):
329  *
330  *  Read chunklist (a linked list):
331  *   N elements, position P (same P for all chunks of same arg!):
332  *    1 - PHLOO - 1 - PHLOO - ... - 1 - PHLOO - 0
333  *
334  * Returns zero on success, or a negative errno if a failure occurred.
335  * @xdr is advanced to the next position in the stream.
336  *
337  * Only a single @pos value is currently supported.
338  */
339 static noinline int
340 rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
341 			 struct rpc_rqst *rqst, enum rpcrdma_chunktype rtype)
342 {
343 	struct xdr_stream *xdr = &req->rl_stream;
344 	struct rpcrdma_mr_seg *seg;
345 	struct rpcrdma_mr *mr;
346 	unsigned int pos;
347 	int nsegs;
348 
349 	pos = rqst->rq_snd_buf.head[0].iov_len;
350 	if (rtype == rpcrdma_areadch)
351 		pos = 0;
352 	seg = req->rl_segments;
353 	nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_snd_buf, pos,
354 				     rtype, seg);
355 	if (nsegs < 0)
356 		return nsegs;
357 
358 	do {
359 		seg = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs,
360 						   false, &mr);
361 		if (IS_ERR(seg))
362 			return PTR_ERR(seg);
363 		rpcrdma_mr_push(mr, &req->rl_registered);
364 
365 		if (encode_read_segment(xdr, mr, pos) < 0)
366 			return -EMSGSIZE;
367 
368 		trace_xprtrdma_read_chunk(rqst->rq_task, pos, mr, nsegs);
369 		r_xprt->rx_stats.read_chunk_count++;
370 		nsegs -= mr->mr_nents;
371 	} while (nsegs);
372 
373 	return 0;
374 }
375 
376 /* Register and XDR encode the Write list. Supports encoding a list
377  * containing one array of plain segments that belong to a single
378  * write chunk.
379  *
380  * Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64):
381  *
382  *  Write chunklist (a list of (one) counted array):
383  *   N elements:
384  *    1 - N - HLOO - HLOO - ... - HLOO - 0
385  *
386  * Returns zero on success, or a negative errno if a failure occurred.
387  * @xdr is advanced to the next position in the stream.
388  *
389  * Only a single Write chunk is currently supported.
390  */
391 static noinline int
392 rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
393 			  struct rpc_rqst *rqst, enum rpcrdma_chunktype wtype)
394 {
395 	struct xdr_stream *xdr = &req->rl_stream;
396 	struct rpcrdma_mr_seg *seg;
397 	struct rpcrdma_mr *mr;
398 	int nsegs, nchunks;
399 	__be32 *segcount;
400 
401 	seg = req->rl_segments;
402 	nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_rcv_buf,
403 				     rqst->rq_rcv_buf.head[0].iov_len,
404 				     wtype, seg);
405 	if (nsegs < 0)
406 		return nsegs;
407 
408 	if (encode_item_present(xdr) < 0)
409 		return -EMSGSIZE;
410 	segcount = xdr_reserve_space(xdr, sizeof(*segcount));
411 	if (unlikely(!segcount))
412 		return -EMSGSIZE;
413 	/* Actual value encoded below */
414 
415 	nchunks = 0;
416 	do {
417 		seg = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs,
418 						   true, &mr);
419 		if (IS_ERR(seg))
420 			return PTR_ERR(seg);
421 		rpcrdma_mr_push(mr, &req->rl_registered);
422 
423 		if (encode_rdma_segment(xdr, mr) < 0)
424 			return -EMSGSIZE;
425 
426 		trace_xprtrdma_write_chunk(rqst->rq_task, mr, nsegs);
427 		r_xprt->rx_stats.write_chunk_count++;
428 		r_xprt->rx_stats.total_rdma_request += mr->mr_length;
429 		nchunks++;
430 		nsegs -= mr->mr_nents;
431 	} while (nsegs);
432 
433 	/* Update count of segments in this Write chunk */
434 	*segcount = cpu_to_be32(nchunks);
435 
436 	return 0;
437 }
438 
439 /* Register and XDR encode the Reply chunk. Supports encoding an array
440  * of plain segments that belong to a single write (reply) chunk.
441  *
442  * Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64):
443  *
444  *  Reply chunk (a counted array):
445  *   N elements:
446  *    1 - N - HLOO - HLOO - ... - HLOO
447  *
448  * Returns zero on success, or a negative errno if a failure occurred.
449  * @xdr is advanced to the next position in the stream.
450  */
451 static noinline int
452 rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
453 			   struct rpc_rqst *rqst, enum rpcrdma_chunktype wtype)
454 {
455 	struct xdr_stream *xdr = &req->rl_stream;
456 	struct rpcrdma_mr_seg *seg;
457 	struct rpcrdma_mr *mr;
458 	int nsegs, nchunks;
459 	__be32 *segcount;
460 
461 	seg = req->rl_segments;
462 	nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_rcv_buf, 0, wtype, seg);
463 	if (nsegs < 0)
464 		return nsegs;
465 
466 	if (encode_item_present(xdr) < 0)
467 		return -EMSGSIZE;
468 	segcount = xdr_reserve_space(xdr, sizeof(*segcount));
469 	if (unlikely(!segcount))
470 		return -EMSGSIZE;
471 	/* Actual value encoded below */
472 
473 	nchunks = 0;
474 	do {
475 		seg = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs,
476 						   true, &mr);
477 		if (IS_ERR(seg))
478 			return PTR_ERR(seg);
479 		rpcrdma_mr_push(mr, &req->rl_registered);
480 
481 		if (encode_rdma_segment(xdr, mr) < 0)
482 			return -EMSGSIZE;
483 
484 		trace_xprtrdma_reply_chunk(rqst->rq_task, mr, nsegs);
485 		r_xprt->rx_stats.reply_chunk_count++;
486 		r_xprt->rx_stats.total_rdma_request += mr->mr_length;
487 		nchunks++;
488 		nsegs -= mr->mr_nents;
489 	} while (nsegs);
490 
491 	/* Update count of segments in the Reply chunk */
492 	*segcount = cpu_to_be32(nchunks);
493 
494 	return 0;
495 }
496 
497 /**
498  * rpcrdma_unmap_sendctx - DMA-unmap Send buffers
499  * @sc: sendctx containing SGEs to unmap
500  *
501  */
502 void
503 rpcrdma_unmap_sendctx(struct rpcrdma_sendctx *sc)
504 {
505 	struct rpcrdma_ia *ia = &sc->sc_xprt->rx_ia;
506 	struct ib_sge *sge;
507 	unsigned int count;
508 
509 	/* The first two SGEs contain the transport header and
510 	 * the inline buffer. These are always left mapped so
511 	 * they can be cheaply re-used.
512 	 */
513 	sge = &sc->sc_sges[2];
514 	for (count = sc->sc_unmap_count; count; ++sge, --count)
515 		ib_dma_unmap_page(ia->ri_device,
516 				  sge->addr, sge->length, DMA_TO_DEVICE);
517 
518 	if (test_and_clear_bit(RPCRDMA_REQ_F_TX_RESOURCES, &sc->sc_req->rl_flags)) {
519 		smp_mb__after_atomic();
520 		wake_up_bit(&sc->sc_req->rl_flags, RPCRDMA_REQ_F_TX_RESOURCES);
521 	}
522 }
523 
524 /* Prepare an SGE for the RPC-over-RDMA transport header.
525  */
526 static bool
527 rpcrdma_prepare_hdr_sge(struct rpcrdma_ia *ia, struct rpcrdma_req *req,
528 			u32 len)
529 {
530 	struct rpcrdma_sendctx *sc = req->rl_sendctx;
531 	struct rpcrdma_regbuf *rb = req->rl_rdmabuf;
532 	struct ib_sge *sge = sc->sc_sges;
533 
534 	if (!rpcrdma_dma_map_regbuf(ia, rb))
535 		goto out_regbuf;
536 	sge->addr = rdmab_addr(rb);
537 	sge->length = len;
538 	sge->lkey = rdmab_lkey(rb);
539 
540 	ib_dma_sync_single_for_device(rdmab_device(rb), sge->addr,
541 				      sge->length, DMA_TO_DEVICE);
542 	sc->sc_wr.num_sge++;
543 	return true;
544 
545 out_regbuf:
546 	pr_err("rpcrdma: failed to DMA map a Send buffer\n");
547 	return false;
548 }
549 
550 /* Prepare the Send SGEs. The head and tail iovec, and each entry
551  * in the page list, gets its own SGE.
552  */
553 static bool
554 rpcrdma_prepare_msg_sges(struct rpcrdma_ia *ia, struct rpcrdma_req *req,
555 			 struct xdr_buf *xdr, enum rpcrdma_chunktype rtype)
556 {
557 	struct rpcrdma_sendctx *sc = req->rl_sendctx;
558 	unsigned int sge_no, page_base, len, remaining;
559 	struct rpcrdma_regbuf *rb = req->rl_sendbuf;
560 	struct ib_device *device = ia->ri_device;
561 	struct ib_sge *sge = sc->sc_sges;
562 	u32 lkey = ia->ri_pd->local_dma_lkey;
563 	struct page *page, **ppages;
564 
565 	/* The head iovec is straightforward, as it is already
566 	 * DMA-mapped. Sync the content that has changed.
567 	 */
568 	if (!rpcrdma_dma_map_regbuf(ia, rb))
569 		goto out_regbuf;
570 	sge_no = 1;
571 	sge[sge_no].addr = rdmab_addr(rb);
572 	sge[sge_no].length = xdr->head[0].iov_len;
573 	sge[sge_no].lkey = rdmab_lkey(rb);
574 	ib_dma_sync_single_for_device(rdmab_device(rb), sge[sge_no].addr,
575 				      sge[sge_no].length, DMA_TO_DEVICE);
576 
577 	/* If there is a Read chunk, the page list is being handled
578 	 * via explicit RDMA, and thus is skipped here. However, the
579 	 * tail iovec may include an XDR pad for the page list, as
580 	 * well as additional content, and may not reside in the
581 	 * same page as the head iovec.
582 	 */
583 	if (rtype == rpcrdma_readch) {
584 		len = xdr->tail[0].iov_len;
585 
586 		/* Do not include the tail if it is only an XDR pad */
587 		if (len < 4)
588 			goto out;
589 
590 		page = virt_to_page(xdr->tail[0].iov_base);
591 		page_base = offset_in_page(xdr->tail[0].iov_base);
592 
593 		/* If the content in the page list is an odd length,
594 		 * xdr_write_pages() has added a pad at the beginning
595 		 * of the tail iovec. Force the tail's non-pad content
596 		 * to land at the next XDR position in the Send message.
597 		 */
598 		page_base += len & 3;
599 		len -= len & 3;
600 		goto map_tail;
601 	}
602 
603 	/* If there is a page list present, temporarily DMA map
604 	 * and prepare an SGE for each page to be sent.
605 	 */
606 	if (xdr->page_len) {
607 		ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT);
608 		page_base = offset_in_page(xdr->page_base);
609 		remaining = xdr->page_len;
610 		while (remaining) {
611 			sge_no++;
612 			if (sge_no > RPCRDMA_MAX_SEND_SGES - 2)
613 				goto out_mapping_overflow;
614 
615 			len = min_t(u32, PAGE_SIZE - page_base, remaining);
616 			sge[sge_no].addr = ib_dma_map_page(device, *ppages,
617 							   page_base, len,
618 							   DMA_TO_DEVICE);
619 			if (ib_dma_mapping_error(device, sge[sge_no].addr))
620 				goto out_mapping_err;
621 			sge[sge_no].length = len;
622 			sge[sge_no].lkey = lkey;
623 
624 			sc->sc_unmap_count++;
625 			ppages++;
626 			remaining -= len;
627 			page_base = 0;
628 		}
629 	}
630 
631 	/* The tail iovec is not always constructed in the same
632 	 * page where the head iovec resides (see, for example,
633 	 * gss_wrap_req_priv). To neatly accommodate that case,
634 	 * DMA map it separately.
635 	 */
636 	if (xdr->tail[0].iov_len) {
637 		page = virt_to_page(xdr->tail[0].iov_base);
638 		page_base = offset_in_page(xdr->tail[0].iov_base);
639 		len = xdr->tail[0].iov_len;
640 
641 map_tail:
642 		sge_no++;
643 		sge[sge_no].addr = ib_dma_map_page(device, page,
644 						   page_base, len,
645 						   DMA_TO_DEVICE);
646 		if (ib_dma_mapping_error(device, sge[sge_no].addr))
647 			goto out_mapping_err;
648 		sge[sge_no].length = len;
649 		sge[sge_no].lkey = lkey;
650 		sc->sc_unmap_count++;
651 	}
652 
653 out:
654 	sc->sc_wr.num_sge += sge_no;
655 	if (sc->sc_unmap_count)
656 		__set_bit(RPCRDMA_REQ_F_TX_RESOURCES, &req->rl_flags);
657 	return true;
658 
659 out_regbuf:
660 	pr_err("rpcrdma: failed to DMA map a Send buffer\n");
661 	return false;
662 
663 out_mapping_overflow:
664 	rpcrdma_unmap_sendctx(sc);
665 	pr_err("rpcrdma: too many Send SGEs (%u)\n", sge_no);
666 	return false;
667 
668 out_mapping_err:
669 	rpcrdma_unmap_sendctx(sc);
670 	pr_err("rpcrdma: Send mapping error\n");
671 	return false;
672 }
673 
674 /**
675  * rpcrdma_prepare_send_sges - Construct SGEs for a Send WR
676  * @r_xprt: controlling transport
677  * @req: context of RPC Call being marshalled
678  * @hdrlen: size of transport header, in bytes
679  * @xdr: xdr_buf containing RPC Call
680  * @rtype: chunk type being encoded
681  *
682  * Returns 0 on success; otherwise a negative errno is returned.
683  */
684 int
685 rpcrdma_prepare_send_sges(struct rpcrdma_xprt *r_xprt,
686 			  struct rpcrdma_req *req, u32 hdrlen,
687 			  struct xdr_buf *xdr, enum rpcrdma_chunktype rtype)
688 {
689 	req->rl_sendctx = rpcrdma_sendctx_get_locked(&r_xprt->rx_buf);
690 	if (!req->rl_sendctx)
691 		return -EAGAIN;
692 	req->rl_sendctx->sc_wr.num_sge = 0;
693 	req->rl_sendctx->sc_unmap_count = 0;
694 	req->rl_sendctx->sc_req = req;
695 	__clear_bit(RPCRDMA_REQ_F_TX_RESOURCES, &req->rl_flags);
696 
697 	if (!rpcrdma_prepare_hdr_sge(&r_xprt->rx_ia, req, hdrlen))
698 		return -EIO;
699 
700 	if (rtype != rpcrdma_areadch)
701 		if (!rpcrdma_prepare_msg_sges(&r_xprt->rx_ia, req, xdr, rtype))
702 			return -EIO;
703 
704 	return 0;
705 }
706 
707 /**
708  * rpcrdma_marshal_req - Marshal and send one RPC request
709  * @r_xprt: controlling transport
710  * @rqst: RPC request to be marshaled
711  *
712  * For the RPC in "rqst", this function:
713  *  - Chooses the transfer mode (eg., RDMA_MSG or RDMA_NOMSG)
714  *  - Registers Read, Write, and Reply chunks
715  *  - Constructs the transport header
716  *  - Posts a Send WR to send the transport header and request
717  *
718  * Returns:
719  *	%0 if the RPC was sent successfully,
720  *	%-ENOTCONN if the connection was lost,
721  *	%-EAGAIN if the caller should call again with the same arguments,
722  *	%-ENOBUFS if the caller should call again after a delay,
723  *	%-EMSGSIZE if the transport header is too small,
724  *	%-EIO if a permanent problem occurred while marshaling.
725  */
726 int
727 rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst)
728 {
729 	struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
730 	struct xdr_stream *xdr = &req->rl_stream;
731 	enum rpcrdma_chunktype rtype, wtype;
732 	bool ddp_allowed;
733 	__be32 *p;
734 	int ret;
735 
736 	rpcrdma_set_xdrlen(&req->rl_hdrbuf, 0);
737 	xdr_init_encode(xdr, &req->rl_hdrbuf,
738 			req->rl_rdmabuf->rg_base);
739 
740 	/* Fixed header fields */
741 	ret = -EMSGSIZE;
742 	p = xdr_reserve_space(xdr, 4 * sizeof(*p));
743 	if (!p)
744 		goto out_err;
745 	*p++ = rqst->rq_xid;
746 	*p++ = rpcrdma_version;
747 	*p++ = cpu_to_be32(r_xprt->rx_buf.rb_max_requests);
748 
749 	/* When the ULP employs a GSS flavor that guarantees integrity
750 	 * or privacy, direct data placement of individual data items
751 	 * is not allowed.
752 	 */
753 	ddp_allowed = !(rqst->rq_cred->cr_auth->au_flags &
754 						RPCAUTH_AUTH_DATATOUCH);
755 
756 	/*
757 	 * Chunks needed for results?
758 	 *
759 	 * o If the expected result is under the inline threshold, all ops
760 	 *   return as inline.
761 	 * o Large read ops return data as write chunk(s), header as
762 	 *   inline.
763 	 * o Large non-read ops return as a single reply chunk.
764 	 */
765 	if (rpcrdma_results_inline(r_xprt, rqst))
766 		wtype = rpcrdma_noch;
767 	else if (ddp_allowed && rqst->rq_rcv_buf.flags & XDRBUF_READ)
768 		wtype = rpcrdma_writech;
769 	else
770 		wtype = rpcrdma_replych;
771 
772 	/*
773 	 * Chunks needed for arguments?
774 	 *
775 	 * o If the total request is under the inline threshold, all ops
776 	 *   are sent as inline.
777 	 * o Large write ops transmit data as read chunk(s), header as
778 	 *   inline.
779 	 * o Large non-write ops are sent with the entire message as a
780 	 *   single read chunk (protocol 0-position special case).
781 	 *
782 	 * This assumes that the upper layer does not present a request
783 	 * that both has a data payload, and whose non-data arguments
784 	 * by themselves are larger than the inline threshold.
785 	 */
786 	if (rpcrdma_args_inline(r_xprt, rqst)) {
787 		*p++ = rdma_msg;
788 		rtype = rpcrdma_noch;
789 	} else if (ddp_allowed && rqst->rq_snd_buf.flags & XDRBUF_WRITE) {
790 		*p++ = rdma_msg;
791 		rtype = rpcrdma_readch;
792 	} else {
793 		r_xprt->rx_stats.nomsg_call_count++;
794 		*p++ = rdma_nomsg;
795 		rtype = rpcrdma_areadch;
796 	}
797 
798 	/* If this is a retransmit, discard previously registered
799 	 * chunks. Very likely the connection has been replaced,
800 	 * so these registrations are invalid and unusable.
801 	 */
802 	while (unlikely(!list_empty(&req->rl_registered))) {
803 		struct rpcrdma_mr *mr;
804 
805 		mr = rpcrdma_mr_pop(&req->rl_registered);
806 		rpcrdma_mr_recycle(mr);
807 	}
808 
809 	/* This implementation supports the following combinations
810 	 * of chunk lists in one RPC-over-RDMA Call message:
811 	 *
812 	 *   - Read list
813 	 *   - Write list
814 	 *   - Reply chunk
815 	 *   - Read list + Reply chunk
816 	 *
817 	 * It might not yet support the following combinations:
818 	 *
819 	 *   - Read list + Write list
820 	 *
821 	 * It does not support the following combinations:
822 	 *
823 	 *   - Write list + Reply chunk
824 	 *   - Read list + Write list + Reply chunk
825 	 *
826 	 * This implementation supports only a single chunk in each
827 	 * Read or Write list. Thus for example the client cannot
828 	 * send a Call message with a Position Zero Read chunk and a
829 	 * regular Read chunk at the same time.
830 	 */
831 	if (rtype != rpcrdma_noch) {
832 		ret = rpcrdma_encode_read_list(r_xprt, req, rqst, rtype);
833 		if (ret)
834 			goto out_err;
835 	}
836 	ret = encode_item_not_present(xdr);
837 	if (ret)
838 		goto out_err;
839 
840 	if (wtype == rpcrdma_writech) {
841 		ret = rpcrdma_encode_write_list(r_xprt, req, rqst, wtype);
842 		if (ret)
843 			goto out_err;
844 	}
845 	ret = encode_item_not_present(xdr);
846 	if (ret)
847 		goto out_err;
848 
849 	if (wtype != rpcrdma_replych)
850 		ret = encode_item_not_present(xdr);
851 	else
852 		ret = rpcrdma_encode_reply_chunk(r_xprt, req, rqst, wtype);
853 	if (ret)
854 		goto out_err;
855 
856 	trace_xprtrdma_marshal(rqst, xdr_stream_pos(xdr), rtype, wtype);
857 
858 	ret = rpcrdma_prepare_send_sges(r_xprt, req, xdr_stream_pos(xdr),
859 					&rqst->rq_snd_buf, rtype);
860 	if (ret)
861 		goto out_err;
862 	return 0;
863 
864 out_err:
865 	switch (ret) {
866 	case -EAGAIN:
867 		xprt_wait_for_buffer_space(rqst->rq_xprt);
868 		break;
869 	case -ENOBUFS:
870 		break;
871 	default:
872 		r_xprt->rx_stats.failed_marshal_count++;
873 	}
874 	return ret;
875 }
876 
877 /**
878  * rpcrdma_inline_fixup - Scatter inline received data into rqst's iovecs
879  * @rqst: controlling RPC request
880  * @srcp: points to RPC message payload in receive buffer
881  * @copy_len: remaining length of receive buffer content
882  * @pad: Write chunk pad bytes needed (zero for pure inline)
883  *
884  * The upper layer has set the maximum number of bytes it can
885  * receive in each component of rq_rcv_buf. These values are set in
886  * the head.iov_len, page_len, tail.iov_len, and buflen fields.
887  *
888  * Unlike the TCP equivalent (xdr_partial_copy_from_skb), in
889  * many cases this function simply updates iov_base pointers in
890  * rq_rcv_buf to point directly to the received reply data, to
891  * avoid copying reply data.
892  *
893  * Returns the count of bytes which had to be memcopied.
894  */
895 static unsigned long
896 rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad)
897 {
898 	unsigned long fixup_copy_count;
899 	int i, npages, curlen;
900 	char *destp;
901 	struct page **ppages;
902 	int page_base;
903 
904 	/* The head iovec is redirected to the RPC reply message
905 	 * in the receive buffer, to avoid a memcopy.
906 	 */
907 	rqst->rq_rcv_buf.head[0].iov_base = srcp;
908 	rqst->rq_private_buf.head[0].iov_base = srcp;
909 
910 	/* The contents of the receive buffer that follow
911 	 * head.iov_len bytes are copied into the page list.
912 	 */
913 	curlen = rqst->rq_rcv_buf.head[0].iov_len;
914 	if (curlen > copy_len)
915 		curlen = copy_len;
916 	trace_xprtrdma_fixup(rqst, copy_len, curlen);
917 	srcp += curlen;
918 	copy_len -= curlen;
919 
920 	ppages = rqst->rq_rcv_buf.pages +
921 		(rqst->rq_rcv_buf.page_base >> PAGE_SHIFT);
922 	page_base = offset_in_page(rqst->rq_rcv_buf.page_base);
923 	fixup_copy_count = 0;
924 	if (copy_len && rqst->rq_rcv_buf.page_len) {
925 		int pagelist_len;
926 
927 		pagelist_len = rqst->rq_rcv_buf.page_len;
928 		if (pagelist_len > copy_len)
929 			pagelist_len = copy_len;
930 		npages = PAGE_ALIGN(page_base + pagelist_len) >> PAGE_SHIFT;
931 		for (i = 0; i < npages; i++) {
932 			curlen = PAGE_SIZE - page_base;
933 			if (curlen > pagelist_len)
934 				curlen = pagelist_len;
935 
936 			trace_xprtrdma_fixup_pg(rqst, i, srcp,
937 						copy_len, curlen);
938 			destp = kmap_atomic(ppages[i]);
939 			memcpy(destp + page_base, srcp, curlen);
940 			flush_dcache_page(ppages[i]);
941 			kunmap_atomic(destp);
942 			srcp += curlen;
943 			copy_len -= curlen;
944 			fixup_copy_count += curlen;
945 			pagelist_len -= curlen;
946 			if (!pagelist_len)
947 				break;
948 			page_base = 0;
949 		}
950 
951 		/* Implicit padding for the last segment in a Write
952 		 * chunk is inserted inline at the front of the tail
953 		 * iovec. The upper layer ignores the content of
954 		 * the pad. Simply ensure inline content in the tail
955 		 * that follows the Write chunk is properly aligned.
956 		 */
957 		if (pad)
958 			srcp -= pad;
959 	}
960 
961 	/* The tail iovec is redirected to the remaining data
962 	 * in the receive buffer, to avoid a memcopy.
963 	 */
964 	if (copy_len || pad) {
965 		rqst->rq_rcv_buf.tail[0].iov_base = srcp;
966 		rqst->rq_private_buf.tail[0].iov_base = srcp;
967 	}
968 
969 	return fixup_copy_count;
970 }
971 
972 /* By convention, backchannel calls arrive via rdma_msg type
973  * messages, and never populate the chunk lists. This makes
974  * the RPC/RDMA header small and fixed in size, so it is
975  * straightforward to check the RPC header's direction field.
976  */
977 static bool
978 rpcrdma_is_bcall(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep)
979 #if defined(CONFIG_SUNRPC_BACKCHANNEL)
980 {
981 	struct xdr_stream *xdr = &rep->rr_stream;
982 	__be32 *p;
983 
984 	if (rep->rr_proc != rdma_msg)
985 		return false;
986 
987 	/* Peek at stream contents without advancing. */
988 	p = xdr_inline_decode(xdr, 0);
989 
990 	/* Chunk lists */
991 	if (*p++ != xdr_zero)
992 		return false;
993 	if (*p++ != xdr_zero)
994 		return false;
995 	if (*p++ != xdr_zero)
996 		return false;
997 
998 	/* RPC header */
999 	if (*p++ != rep->rr_xid)
1000 		return false;
1001 	if (*p != cpu_to_be32(RPC_CALL))
1002 		return false;
1003 
1004 	/* Now that we are sure this is a backchannel call,
1005 	 * advance to the RPC header.
1006 	 */
1007 	p = xdr_inline_decode(xdr, 3 * sizeof(*p));
1008 	if (unlikely(!p))
1009 		goto out_short;
1010 
1011 	rpcrdma_bc_receive_call(r_xprt, rep);
1012 	return true;
1013 
1014 out_short:
1015 	pr_warn("RPC/RDMA short backward direction call\n");
1016 	return true;
1017 }
1018 #else	/* CONFIG_SUNRPC_BACKCHANNEL */
1019 {
1020 	return false;
1021 }
1022 #endif	/* CONFIG_SUNRPC_BACKCHANNEL */
1023 
1024 static int decode_rdma_segment(struct xdr_stream *xdr, u32 *length)
1025 {
1026 	u32 handle;
1027 	u64 offset;
1028 	__be32 *p;
1029 
1030 	p = xdr_inline_decode(xdr, 4 * sizeof(*p));
1031 	if (unlikely(!p))
1032 		return -EIO;
1033 
1034 	handle = be32_to_cpup(p++);
1035 	*length = be32_to_cpup(p++);
1036 	xdr_decode_hyper(p, &offset);
1037 
1038 	trace_xprtrdma_decode_seg(handle, *length, offset);
1039 	return 0;
1040 }
1041 
1042 static int decode_write_chunk(struct xdr_stream *xdr, u32 *length)
1043 {
1044 	u32 segcount, seglength;
1045 	__be32 *p;
1046 
1047 	p = xdr_inline_decode(xdr, sizeof(*p));
1048 	if (unlikely(!p))
1049 		return -EIO;
1050 
1051 	*length = 0;
1052 	segcount = be32_to_cpup(p);
1053 	while (segcount--) {
1054 		if (decode_rdma_segment(xdr, &seglength))
1055 			return -EIO;
1056 		*length += seglength;
1057 	}
1058 
1059 	return 0;
1060 }
1061 
1062 /* In RPC-over-RDMA Version One replies, a Read list is never
1063  * expected. This decoder is a stub that returns an error if
1064  * a Read list is present.
1065  */
1066 static int decode_read_list(struct xdr_stream *xdr)
1067 {
1068 	__be32 *p;
1069 
1070 	p = xdr_inline_decode(xdr, sizeof(*p));
1071 	if (unlikely(!p))
1072 		return -EIO;
1073 	if (unlikely(*p != xdr_zero))
1074 		return -EIO;
1075 	return 0;
1076 }
1077 
1078 /* Supports only one Write chunk in the Write list
1079  */
1080 static int decode_write_list(struct xdr_stream *xdr, u32 *length)
1081 {
1082 	u32 chunklen;
1083 	bool first;
1084 	__be32 *p;
1085 
1086 	*length = 0;
1087 	first = true;
1088 	do {
1089 		p = xdr_inline_decode(xdr, sizeof(*p));
1090 		if (unlikely(!p))
1091 			return -EIO;
1092 		if (*p == xdr_zero)
1093 			break;
1094 		if (!first)
1095 			return -EIO;
1096 
1097 		if (decode_write_chunk(xdr, &chunklen))
1098 			return -EIO;
1099 		*length += chunklen;
1100 		first = false;
1101 	} while (true);
1102 	return 0;
1103 }
1104 
1105 static int decode_reply_chunk(struct xdr_stream *xdr, u32 *length)
1106 {
1107 	__be32 *p;
1108 
1109 	p = xdr_inline_decode(xdr, sizeof(*p));
1110 	if (unlikely(!p))
1111 		return -EIO;
1112 
1113 	*length = 0;
1114 	if (*p != xdr_zero)
1115 		if (decode_write_chunk(xdr, length))
1116 			return -EIO;
1117 	return 0;
1118 }
1119 
1120 static int
1121 rpcrdma_decode_msg(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep,
1122 		   struct rpc_rqst *rqst)
1123 {
1124 	struct xdr_stream *xdr = &rep->rr_stream;
1125 	u32 writelist, replychunk, rpclen;
1126 	char *base;
1127 
1128 	/* Decode the chunk lists */
1129 	if (decode_read_list(xdr))
1130 		return -EIO;
1131 	if (decode_write_list(xdr, &writelist))
1132 		return -EIO;
1133 	if (decode_reply_chunk(xdr, &replychunk))
1134 		return -EIO;
1135 
1136 	/* RDMA_MSG sanity checks */
1137 	if (unlikely(replychunk))
1138 		return -EIO;
1139 
1140 	/* Build the RPC reply's Payload stream in rqst->rq_rcv_buf */
1141 	base = (char *)xdr_inline_decode(xdr, 0);
1142 	rpclen = xdr_stream_remaining(xdr);
1143 	r_xprt->rx_stats.fixup_copy_count +=
1144 		rpcrdma_inline_fixup(rqst, base, rpclen, writelist & 3);
1145 
1146 	r_xprt->rx_stats.total_rdma_reply += writelist;
1147 	return rpclen + xdr_align_size(writelist);
1148 }
1149 
1150 static noinline int
1151 rpcrdma_decode_nomsg(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep)
1152 {
1153 	struct xdr_stream *xdr = &rep->rr_stream;
1154 	u32 writelist, replychunk;
1155 
1156 	/* Decode the chunk lists */
1157 	if (decode_read_list(xdr))
1158 		return -EIO;
1159 	if (decode_write_list(xdr, &writelist))
1160 		return -EIO;
1161 	if (decode_reply_chunk(xdr, &replychunk))
1162 		return -EIO;
1163 
1164 	/* RDMA_NOMSG sanity checks */
1165 	if (unlikely(writelist))
1166 		return -EIO;
1167 	if (unlikely(!replychunk))
1168 		return -EIO;
1169 
1170 	/* Reply chunk buffer already is the reply vector */
1171 	r_xprt->rx_stats.total_rdma_reply += replychunk;
1172 	return replychunk;
1173 }
1174 
1175 static noinline int
1176 rpcrdma_decode_error(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep,
1177 		     struct rpc_rqst *rqst)
1178 {
1179 	struct xdr_stream *xdr = &rep->rr_stream;
1180 	__be32 *p;
1181 
1182 	p = xdr_inline_decode(xdr, sizeof(*p));
1183 	if (unlikely(!p))
1184 		return -EIO;
1185 
1186 	switch (*p) {
1187 	case err_vers:
1188 		p = xdr_inline_decode(xdr, 2 * sizeof(*p));
1189 		if (!p)
1190 			break;
1191 		dprintk("RPC: %5u: %s: server reports version error (%u-%u)\n",
1192 			rqst->rq_task->tk_pid, __func__,
1193 			be32_to_cpup(p), be32_to_cpu(*(p + 1)));
1194 		break;
1195 	case err_chunk:
1196 		dprintk("RPC: %5u: %s: server reports header decoding error\n",
1197 			rqst->rq_task->tk_pid, __func__);
1198 		break;
1199 	default:
1200 		dprintk("RPC: %5u: %s: server reports unrecognized error %d\n",
1201 			rqst->rq_task->tk_pid, __func__, be32_to_cpup(p));
1202 	}
1203 
1204 	r_xprt->rx_stats.bad_reply_count++;
1205 	return -EREMOTEIO;
1206 }
1207 
1208 /* Perform XID lookup, reconstruction of the RPC reply, and
1209  * RPC completion while holding the transport lock to ensure
1210  * the rep, rqst, and rq_task pointers remain stable.
1211  */
1212 void rpcrdma_complete_rqst(struct rpcrdma_rep *rep)
1213 {
1214 	struct rpcrdma_xprt *r_xprt = rep->rr_rxprt;
1215 	struct rpc_xprt *xprt = &r_xprt->rx_xprt;
1216 	struct rpc_rqst *rqst = rep->rr_rqst;
1217 	int status;
1218 
1219 	xprt->reestablish_timeout = 0;
1220 
1221 	switch (rep->rr_proc) {
1222 	case rdma_msg:
1223 		status = rpcrdma_decode_msg(r_xprt, rep, rqst);
1224 		break;
1225 	case rdma_nomsg:
1226 		status = rpcrdma_decode_nomsg(r_xprt, rep);
1227 		break;
1228 	case rdma_error:
1229 		status = rpcrdma_decode_error(r_xprt, rep, rqst);
1230 		break;
1231 	default:
1232 		status = -EIO;
1233 	}
1234 	if (status < 0)
1235 		goto out_badheader;
1236 
1237 out:
1238 	spin_lock(&xprt->queue_lock);
1239 	xprt_complete_rqst(rqst->rq_task, status);
1240 	xprt_unpin_rqst(rqst);
1241 	spin_unlock(&xprt->queue_lock);
1242 	return;
1243 
1244 /* If the incoming reply terminated a pending RPC, the next
1245  * RPC call will post a replacement receive buffer as it is
1246  * being marshaled.
1247  */
1248 out_badheader:
1249 	trace_xprtrdma_reply_hdr(rep);
1250 	r_xprt->rx_stats.bad_reply_count++;
1251 	status = -EIO;
1252 	goto out;
1253 }
1254 
1255 void rpcrdma_release_rqst(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
1256 {
1257 	/* Invalidate and unmap the data payloads before waking
1258 	 * the waiting application. This guarantees the memory
1259 	 * regions are properly fenced from the server before the
1260 	 * application accesses the data. It also ensures proper
1261 	 * send flow control: waking the next RPC waits until this
1262 	 * RPC has relinquished all its Send Queue entries.
1263 	 */
1264 	if (!list_empty(&req->rl_registered))
1265 		r_xprt->rx_ia.ri_ops->ro_unmap_sync(r_xprt,
1266 						    &req->rl_registered);
1267 
1268 	/* Ensure that any DMA mapped pages associated with
1269 	 * the Send of the RPC Call have been unmapped before
1270 	 * allowing the RPC to complete. This protects argument
1271 	 * memory not controlled by the RPC client from being
1272 	 * re-used before we're done with it.
1273 	 */
1274 	if (test_bit(RPCRDMA_REQ_F_TX_RESOURCES, &req->rl_flags)) {
1275 		r_xprt->rx_stats.reply_waits_for_send++;
1276 		out_of_line_wait_on_bit(&req->rl_flags,
1277 					RPCRDMA_REQ_F_TX_RESOURCES,
1278 					bit_wait,
1279 					TASK_UNINTERRUPTIBLE);
1280 	}
1281 }
1282 
1283 /* Reply handling runs in the poll worker thread. Anything that
1284  * might wait is deferred to a separate workqueue.
1285  */
1286 void rpcrdma_deferred_completion(struct work_struct *work)
1287 {
1288 	struct rpcrdma_rep *rep =
1289 			container_of(work, struct rpcrdma_rep, rr_work);
1290 	struct rpcrdma_req *req = rpcr_to_rdmar(rep->rr_rqst);
1291 	struct rpcrdma_xprt *r_xprt = rep->rr_rxprt;
1292 
1293 	trace_xprtrdma_defer_cmp(rep);
1294 	if (rep->rr_wc_flags & IB_WC_WITH_INVALIDATE)
1295 		r_xprt->rx_ia.ri_ops->ro_reminv(rep, &req->rl_registered);
1296 	rpcrdma_release_rqst(r_xprt, req);
1297 	rpcrdma_complete_rqst(rep);
1298 }
1299 
1300 /* Process received RPC/RDMA messages.
1301  *
1302  * Errors must result in the RPC task either being awakened, or
1303  * allowed to timeout, to discover the errors at that time.
1304  */
1305 void rpcrdma_reply_handler(struct rpcrdma_rep *rep)
1306 {
1307 	struct rpcrdma_xprt *r_xprt = rep->rr_rxprt;
1308 	struct rpc_xprt *xprt = &r_xprt->rx_xprt;
1309 	struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
1310 	struct rpcrdma_req *req;
1311 	struct rpc_rqst *rqst;
1312 	u32 credits;
1313 	__be32 *p;
1314 
1315 	--buf->rb_posted_receives;
1316 
1317 	if (rep->rr_hdrbuf.head[0].iov_len == 0)
1318 		goto out_badstatus;
1319 
1320 	/* Fixed transport header fields */
1321 	xdr_init_decode(&rep->rr_stream, &rep->rr_hdrbuf,
1322 			rep->rr_hdrbuf.head[0].iov_base);
1323 	p = xdr_inline_decode(&rep->rr_stream, 4 * sizeof(*p));
1324 	if (unlikely(!p))
1325 		goto out_shortreply;
1326 	rep->rr_xid = *p++;
1327 	rep->rr_vers = *p++;
1328 	credits = be32_to_cpu(*p++);
1329 	rep->rr_proc = *p++;
1330 
1331 	if (rep->rr_vers != rpcrdma_version)
1332 		goto out_badversion;
1333 
1334 	if (rpcrdma_is_bcall(r_xprt, rep))
1335 		return;
1336 
1337 	/* Match incoming rpcrdma_rep to an rpcrdma_req to
1338 	 * get context for handling any incoming chunks.
1339 	 */
1340 	spin_lock(&xprt->queue_lock);
1341 	rqst = xprt_lookup_rqst(xprt, rep->rr_xid);
1342 	if (!rqst)
1343 		goto out_norqst;
1344 	xprt_pin_rqst(rqst);
1345 	spin_unlock(&xprt->queue_lock);
1346 
1347 	if (credits == 0)
1348 		credits = 1;	/* don't deadlock */
1349 	else if (credits > buf->rb_max_requests)
1350 		credits = buf->rb_max_requests;
1351 	if (buf->rb_credits != credits) {
1352 		spin_lock_bh(&xprt->transport_lock);
1353 		buf->rb_credits = credits;
1354 		xprt->cwnd = credits << RPC_CWNDSHIFT;
1355 		spin_unlock_bh(&xprt->transport_lock);
1356 	}
1357 
1358 	req = rpcr_to_rdmar(rqst);
1359 	req->rl_reply = rep;
1360 	rep->rr_rqst = rqst;
1361 	clear_bit(RPCRDMA_REQ_F_PENDING, &req->rl_flags);
1362 
1363 	trace_xprtrdma_reply(rqst->rq_task, rep, req, credits);
1364 
1365 	rpcrdma_post_recvs(r_xprt, false);
1366 	queue_work(rpcrdma_receive_wq, &rep->rr_work);
1367 	return;
1368 
1369 out_badversion:
1370 	trace_xprtrdma_reply_vers(rep);
1371 	goto repost;
1372 
1373 /* The RPC transaction has already been terminated, or the header
1374  * is corrupt.
1375  */
1376 out_norqst:
1377 	spin_unlock(&xprt->queue_lock);
1378 	trace_xprtrdma_reply_rqst(rep);
1379 	goto repost;
1380 
1381 out_shortreply:
1382 	trace_xprtrdma_reply_short(rep);
1383 
1384 /* If no pending RPC transaction was matched, post a replacement
1385  * receive buffer before returning.
1386  */
1387 repost:
1388 	rpcrdma_post_recvs(r_xprt, false);
1389 out_badstatus:
1390 	rpcrdma_recv_buffer_put(rep);
1391 }
1392