// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2015, 2017 Oracle. All rights reserved.
 * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
 */

/* Lightweight memory registration using Fast Registration Work
 * Requests (FRWR).
 *
 * FRWR features ordered asynchronous registration and invalidation
 * of arbitrarily-sized memory regions. This is the fastest and safest
 * but most complex memory registration mode.
 */

/* Normal operation
 *
 * A Memory Region is prepared for RDMA Read or Write using a FAST_REG
 * Work Request (frwr_map). When the RDMA operation is finished, this
 * Memory Region is invalidated using a LOCAL_INV Work Request
 * (frwr_unmap_async and frwr_unmap_sync).
 *
 * Typically FAST_REG Work Requests are not signaled, and neither are
 * RDMA Send Work Requests (with the exception of signaling occasionally
 * to prevent provider work queue overflows). This greatly reduces HCA
 * interrupt workload.
 */

/* Transport recovery
 *
 * frwr_map and frwr_unmap_* cannot run at the same time the transport
 * connect worker is running. The connect worker holds the transport
 * send lock, just as ->send_request does. This prevents frwr_map and
 * the connect worker from running concurrently. When a connection is
 * closed, the Receive completion queue is drained before allowing
 * the connect worker to get control. This prevents frwr_unmap and the
 * connect worker from running concurrently.
 *
 * When the underlying transport disconnects, MRs that are in flight
 * are flushed and are likely unusable. Thus all MRs are destroyed.
 * New MRs are created on demand.
 */

#include <linux/sunrpc/svc_rdma.h>

#include "xprt_rdma.h"
#include <trace/events/rpcrdma.h>

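/* Record the send CQ's resource ID and the MR's resource ID so that
 * completion tracepoints can identify which CQ and MR a completion
 * belongs to.
 */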
static void frwr_cid_init(struct rpcrdma_ep *ep,
			  struct rpcrdma_mr *mr)
{
	struct rpc_rdma_cid *cid = &mr->mr_cid;

	cid->ci_queue_id = ep->re_attr.send_cq->res.id;
	cid->ci_completion_id = mr->mr_ibmr->res.id;
}

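/* DMA unmap @mr's scatterlist if it is currently mapped. */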
static void frwr_mr_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr)
{
	if (mr->mr_device) {
		trace_xprtrdma_mr_unmap(mr);
		ib_dma_unmap_sg(mr->mr_device, mr->mr_sg, mr->mr_nents,
				mr->mr_dir);
		mr->mr_device = NULL;
	}
}

/**
 * frwr_mr_release - Destroy one MR
 * @mr: MR allocated by frwr_mr_init
 *
 */
void frwr_mr_release(struct rpcrdma_mr *mr)
{
	int rc;

	frwr_mr_unmap(mr->mr_xprt, mr);

	rc = ib_dereg_mr(mr->mr_ibmr);
	if (rc)
		trace_xprtrdma_frwr_dereg(mr, rc);
	kfree(mr->mr_sg);
	kfree(mr);
}

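/* Unmap @mr and return it to its rpcrdma_req's free list for reuse. */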
static void frwr_mr_put(struct rpcrdma_mr *mr)
{
	frwr_mr_unmap(mr->mr_xprt, mr);

	/* The MR is returned to the req's MR free list instead
	 * of to the xprt's MR free list. No spinlock is needed.
	 */
	rpcrdma_mr_push(mr, &mr->mr_req->rl_free_mrs);
}

/**
 * frwr_reset - Place MRs back on @req's free list
 * @req: request to reset
 *
 * Used after a failed marshal. For FRWR, this means the MRs
 * don't have to be fully released and recreated.
 *
 * NB: This is safe only as long as none of @req's MRs are
 * involved with an ongoing asynchronous FAST_REG or LOCAL_INV
 * Work Request.
 */
void frwr_reset(struct rpcrdma_req *req)
{
	struct rpcrdma_mr *mr;

	while ((mr = rpcrdma_mr_pop(&req->rl_registered)))
		frwr_mr_put(mr);
}

/**
 * frwr_mr_init - Initialize one MR
 * @r_xprt: controlling transport instance
 * @mr: generic MR to prepare for FRWR
 *
 * Returns zero if successful. Otherwise a negative errno
 * is returned.
 */
int frwr_mr_init(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr)
{
	struct rpcrdma_ep *ep = r_xprt->rx_ep;
	unsigned int depth = ep->re_max_fr_depth;
	struct scatterlist *sg;
	struct ib_mr *frmr;

	sg = kcalloc_node(depth, sizeof(*sg), XPRTRDMA_GFP_FLAGS,
			  ibdev_to_node(ep->re_id->device));
	if (!sg)
		return -ENOMEM;

	frmr = ib_alloc_mr(ep->re_pd, ep->re_mrtype, depth);
	if (IS_ERR(frmr))
		goto out_mr_err;

	mr->mr_xprt = r_xprt;
	mr->mr_ibmr = frmr;
	mr->mr_device = NULL;
	INIT_LIST_HEAD(&mr->mr_list);
	init_completion(&mr->mr_linv_done);
	frwr_cid_init(ep, mr);

	sg_init_table(sg, depth);
	mr->mr_sg = sg;
	return 0;

out_mr_err:
	kfree(sg);
	trace_xprtrdma_frwr_alloc(mr, PTR_ERR(frmr));
	return PTR_ERR(frmr);
}

/**
 * frwr_query_device - Prepare a transport for use with FRWR
 * @ep: endpoint to fill in
 * @device: RDMA device to query
 *
 * On success, sets:
 *	ep->re_attr
 *	ep->re_max_requests
 *	ep->re_max_rdma_segs
 *	ep->re_max_fr_depth
 *	ep->re_mrtype
 *
 * Return values:
 *   On success, returns zero.
 *   %-EINVAL - the device does not support FRWR memory registration
 *   %-ENOMEM - the device is not sufficiently capable for NFS/RDMA
 */
int frwr_query_device(struct rpcrdma_ep *ep, const struct ib_device *device)
{
	const struct ib_device_attr *attrs = &device->attrs;
	int max_qp_wr, depth, delta;
	unsigned int max_sge;

	if (!(attrs->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) ||
	    attrs->max_fast_reg_page_list_len == 0) {
		pr_err("rpcrdma: 'frwr' mode is not supported by device %s\n",
		       device->name);
		return -EINVAL;
	}

	max_sge = min_t(unsigned int, attrs->max_send_sge,
			RPCRDMA_MAX_SEND_SGES);
	if (max_sge < RPCRDMA_MIN_SEND_SGES) {
		pr_err("rpcrdma: HCA provides only %u send SGEs\n", max_sge);
		return -ENOMEM;
	}
	ep->re_attr.cap.max_send_sge = max_sge;
	ep->re_attr.cap.max_recv_sge = 1;

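	/* Prefer SG_GAPS MRs when the device supports them: they allow
	 * frwr_map() to coalesce segments without page-alignment
	 * restrictions.
	 */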
	ep->re_mrtype = IB_MR_TYPE_MEM_REG;
	if (attrs->kernel_cap_flags & IBK_SG_GAPS_REG)
		ep->re_mrtype = IB_MR_TYPE_SG_GAPS;

	/* Quirk: Some devices advertise a large max_fast_reg_page_list_len
	 * capability, but perform optimally when the MRs are not larger
	 * than a page.
	 */
	if (attrs->max_sge_rd > RPCRDMA_MAX_HDR_SEGS)
		ep->re_max_fr_depth = attrs->max_sge_rd;
	else
		ep->re_max_fr_depth = attrs->max_fast_reg_page_list_len;
	if (ep->re_max_fr_depth > RPCRDMA_MAX_DATA_SEGS)
		ep->re_max_fr_depth = RPCRDMA_MAX_DATA_SEGS;

	/* Add room for frwr register and invalidate WRs.
	 * 1. FRWR reg WR for head
	 * 2. FRWR invalidate WR for head
	 * 3. N FRWR reg WRs for pagelist
	 * 4. N FRWR invalidate WRs for pagelist
	 * 5. FRWR reg WR for tail
	 * 6. FRWR invalidate WR for tail
	 * 7. The RDMA_SEND WR
	 */
	depth = 7;

	/* Calculate N if the device max FRWR depth is smaller than
	 * RPCRDMA_MAX_DATA_SEGS.
	 */
	if (ep->re_max_fr_depth < RPCRDMA_MAX_DATA_SEGS) {
		delta = RPCRDMA_MAX_DATA_SEGS - ep->re_max_fr_depth;
		do {
			depth += 2; /* FRWR reg + invalidate */
			delta -= ep->re_max_fr_depth;
		} while (delta > 0);
	}

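	/* Size the Send Queue: leave room for backchannel WRs and the
	 * ib_drain_sq() WR, and shrink the credit limit (re_max_requests)
	 * until the per-RPC WR chains fit within the device's max_qp_wr.
	 */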
	max_qp_wr = attrs->max_qp_wr;
	max_qp_wr -= RPCRDMA_BACKWARD_WRS;
	max_qp_wr -= 1;
	if (max_qp_wr < RPCRDMA_MIN_SLOT_TABLE)
		return -ENOMEM;
	if (ep->re_max_requests > max_qp_wr)
		ep->re_max_requests = max_qp_wr;
	ep->re_attr.cap.max_send_wr = ep->re_max_requests * depth;
	if (ep->re_attr.cap.max_send_wr > max_qp_wr) {
		ep->re_max_requests = max_qp_wr / depth;
		if (!ep->re_max_requests)
			return -ENOMEM;
		ep->re_attr.cap.max_send_wr = ep->re_max_requests * depth;
	}
	ep->re_attr.cap.max_send_wr += RPCRDMA_BACKWARD_WRS;
	ep->re_attr.cap.max_send_wr += 1; /* for ib_drain_sq */
	ep->re_attr.cap.max_recv_wr = ep->re_max_requests;
	ep->re_attr.cap.max_recv_wr += RPCRDMA_BACKWARD_WRS;
	ep->re_attr.cap.max_recv_wr += RPCRDMA_MAX_RECV_BATCH;
	ep->re_attr.cap.max_recv_wr += 1; /* for ib_drain_rq */

	ep->re_max_rdma_segs =
		DIV_ROUND_UP(RPCRDMA_MAX_DATA_SEGS, ep->re_max_fr_depth);
	/* Reply chunks require segments for head and tail buffers */
	ep->re_max_rdma_segs += 2;
	if (ep->re_max_rdma_segs > RPCRDMA_MAX_HDR_SEGS)
		ep->re_max_rdma_segs = RPCRDMA_MAX_HDR_SEGS;

	/* Ensure the underlying device is capable of conveying the
	 * largest r/wsize NFS will ask for. This guarantees that
	 * failing over from one RDMA device to another will not
	 * break NFS I/O.
	 */
	if ((ep->re_max_rdma_segs * ep->re_max_fr_depth) < RPCRDMA_MAX_SEGS)
		return -ENOMEM;

	return 0;
}

/**
 * frwr_map - Register a memory region
 * @r_xprt: controlling transport
 * @seg: memory region co-ordinates
 * @nsegs: number of segments remaining
 * @writing: true when RDMA Write will be used
 * @xid: XID of RPC using the registered memory
 * @mr: MR to fill in
 *
 * Prepare a REG_MR Work Request to register a memory region
 * for remote access via RDMA READ or RDMA WRITE.
 *
 * Returns the next segment or a negative errno pointer.
 * On success, @mr is filled in.
 */
struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt,
				struct rpcrdma_mr_seg *seg,
				int nsegs, bool writing, __be32 xid,
				struct rpcrdma_mr *mr)
{
	struct rpcrdma_ep *ep = r_xprt->rx_ep;
	struct ib_reg_wr *reg_wr;
	int i, n, dma_nents;
	struct ib_mr *ibmr;
	u8 key;

	if (nsegs > ep->re_max_fr_depth)
		nsegs = ep->re_max_fr_depth;
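	/* Build the scatterlist. Unless the device supports SG_GAPS
	 * MRs, stop coalescing at the first segment that would leave
	 * a gap: one that starts at a non-zero offset, or one that
	 * follows a segment not ending on a page boundary.
	 */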
	for (i = 0; i < nsegs;) {
		sg_set_page(&mr->mr_sg[i], seg->mr_page,
			    seg->mr_len, seg->mr_offset);

		++seg;
		++i;
		if (ep->re_mrtype == IB_MR_TYPE_SG_GAPS)
			continue;
		if ((i < nsegs && seg->mr_offset) ||
		    offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
			break;
	}
	mr->mr_dir = rpcrdma_data_dir(writing);
	mr->mr_nents = i;

	dma_nents = ib_dma_map_sg(ep->re_id->device, mr->mr_sg, mr->mr_nents,
				  mr->mr_dir);
	if (!dma_nents)
		goto out_dmamap_err;
	mr->mr_device = ep->re_id->device;

	ibmr = mr->mr_ibmr;
	n = ib_map_mr_sg(ibmr, mr->mr_sg, dma_nents, NULL, PAGE_SIZE);
	if (n != dma_nents)
		goto out_mapmr_err;

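	/* Tag the MR's iova with the RPC's XID (upper 32 bits) to ease
	 * matching MRs to RPCs in wire captures, and advance the rkey's
	 * key portion before registering.
	 */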
	ibmr->iova &= 0x00000000ffffffff;
	ibmr->iova |= ((u64)be32_to_cpu(xid)) << 32;
	key = (u8)(ibmr->rkey & 0x000000FF);
	ib_update_fast_reg_key(ibmr, ++key);

	reg_wr = &mr->mr_regwr;
	reg_wr->mr = ibmr;
	reg_wr->key = ibmr->rkey;
	reg_wr->access = writing ?
			 IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
			 IB_ACCESS_REMOTE_READ;

	mr->mr_handle = ibmr->rkey;
	mr->mr_length = ibmr->length;
	mr->mr_offset = ibmr->iova;
	trace_xprtrdma_mr_map(mr);

	return seg;

out_dmamap_err:
	trace_xprtrdma_frwr_sgerr(mr, i);
	return ERR_PTR(-EIO);

out_mapmr_err:
	trace_xprtrdma_frwr_maperr(mr, n);
	return ERR_PTR(-EIO);
}

/**
 * frwr_wc_fastreg - Invoked by RDMA provider for a flushed FastReg WC
 * @cq: completion queue
 * @wc: WCE for a completed FastReg WR
 *
 * Each flushed MR gets destroyed after the QP has drained.
 */
static void frwr_wc_fastreg(struct ib_cq *cq, struct ib_wc *wc)
{
	struct ib_cqe *cqe = wc->wr_cqe;
	struct rpcrdma_mr *mr = container_of(cqe, struct rpcrdma_mr, mr_cqe);

	/* WARNING: Only wr_cqe and status are reliable at this point */
	trace_xprtrdma_wc_fastreg(wc, &mr->mr_cid);

	rpcrdma_flush_disconnect(cq->cq_context, wc);
}

/**
 * frwr_send - post Send WRs containing the RPC Call message
 * @r_xprt: controlling transport instance
 * @req: prepared RPC Call
 *
 * For FRWR, chain any FastReg WRs to the Send WR. Only a
 * single ib_post_send call is needed to register memory
 * and then post the Send WR.
 *
 * Returns the return code from ib_post_send.
 *
 * Caller must hold the transport send lock to ensure that the
 * pointers to the transport's rdma_cm_id and QP are stable.
 */
int frwr_send(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
{
	struct ib_send_wr *post_wr, *send_wr = &req->rl_wr;
	struct rpcrdma_ep *ep = r_xprt->rx_ep;
	struct rpcrdma_mr *mr;
	unsigned int num_wrs;
	int ret;

	num_wrs = 1;
	post_wr = send_wr;
	list_for_each_entry(mr, &req->rl_registered, mr_list) {
		trace_xprtrdma_mr_fastreg(mr);

		mr->mr_cqe.done = frwr_wc_fastreg;
		mr->mr_regwr.wr.next = post_wr;
		mr->mr_regwr.wr.wr_cqe = &mr->mr_cqe;
		mr->mr_regwr.wr.num_sge = 0;
		mr->mr_regwr.wr.opcode = IB_WR_REG_MR;
		mr->mr_regwr.wr.send_flags = 0;
		post_wr = &mr->mr_regwr.wr;
		++num_wrs;
	}

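	/* Signal this Send if the request still holds extra references
	 * or if the unsignaled budget is spent; otherwise leave it
	 * unsignaled to keep completion interrupts rare.
	 */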
	if ((kref_read(&req->rl_kref) > 1) || num_wrs > ep->re_send_count) {
		send_wr->send_flags |= IB_SEND_SIGNALED;
		ep->re_send_count = min_t(unsigned int, ep->re_send_batch,
					  num_wrs - ep->re_send_count);
	} else {
		send_wr->send_flags &= ~IB_SEND_SIGNALED;
		ep->re_send_count -= num_wrs;
	}

	trace_xprtrdma_post_send(req);
	ret = ib_post_send(ep->re_id->qp, post_wr, NULL);
	if (ret)
		trace_xprtrdma_post_send_err(r_xprt, req, ret);
	return ret;
}

/**
 * frwr_reminv - handle a remotely invalidated mr on the @mrs list
 * @rep: Received reply
 * @mrs: list of MRs to check
 *
 */
void frwr_reminv(struct rpcrdma_rep *rep, struct list_head *mrs)
{
	struct rpcrdma_mr *mr;

	list_for_each_entry(mr, mrs, mr_list)
		if (mr->mr_handle == rep->rr_inv_rkey) {
			list_del_init(&mr->mr_list);
			trace_xprtrdma_mr_reminv(mr);
			frwr_mr_put(mr);
			break;	/* only one invalidated MR per RPC */
		}
}

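/* Release @mr only when its LocalInv completed successfully; flushed
 * MRs are destroyed after the QP has drained.
 */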
static void frwr_mr_done(struct ib_wc *wc, struct rpcrdma_mr *mr)
{
	if (likely(wc->status == IB_WC_SUCCESS))
		frwr_mr_put(mr);
}

/**
 * frwr_wc_localinv - Invoked by RDMA provider for a LOCAL_INV WC
 * @cq: completion queue
 * @wc: WCE for a completed LocalInv WR
 *
 */
static void frwr_wc_localinv(struct ib_cq *cq, struct ib_wc *wc)
{
	struct ib_cqe *cqe = wc->wr_cqe;
	struct rpcrdma_mr *mr = container_of(cqe, struct rpcrdma_mr, mr_cqe);

	/* WARNING: Only wr_cqe and status are reliable at this point */
	trace_xprtrdma_wc_li(wc, &mr->mr_cid);
	frwr_mr_done(wc, mr);

	rpcrdma_flush_disconnect(cq->cq_context, wc);
}

/**
 * frwr_wc_localinv_wake - Invoked by RDMA provider for a LOCAL_INV WC
 * @cq: completion queue
 * @wc: WCE for a completed LocalInv WR
 *
 * Awaken anyone waiting for an MR to finish being fenced.
 */
static void frwr_wc_localinv_wake(struct ib_cq *cq, struct ib_wc *wc)
{
	struct ib_cqe *cqe = wc->wr_cqe;
	struct rpcrdma_mr *mr = container_of(cqe, struct rpcrdma_mr, mr_cqe);

	/* WARNING: Only wr_cqe and status are reliable at this point */
	trace_xprtrdma_wc_li_wake(wc, &mr->mr_cid);
	frwr_mr_done(wc, mr);
	complete(&mr->mr_linv_done);

	rpcrdma_flush_disconnect(cq->cq_context, wc);
}

/**
 * frwr_unmap_sync - invalidate memory regions that were registered for @req
 * @r_xprt: controlling transport instance
 * @req: rpcrdma_req with a non-empty list of MRs to process
 *
 * Sleeps until it is safe for the host CPU to access the previously mapped
 * memory regions. This guarantees that registered MRs are properly fenced
 * from the server before the RPC consumer accesses the data in them. It
 * also ensures proper Send flow control: waking the next RPC waits until
 * this RPC has relinquished all its Send Queue entries.
 */
void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
{
	struct ib_send_wr *first, **prev, *last;
	struct rpcrdma_ep *ep = r_xprt->rx_ep;
	const struct ib_send_wr *bad_wr;
	struct rpcrdma_mr *mr;
	int rc;

	/* ORDER: Invalidate all of the MRs first
	 *
	 * Chain the LOCAL_INV Work Requests and post them with
	 * a single ib_post_send() call.
	 */
	prev = &first;
	mr = rpcrdma_mr_pop(&req->rl_registered);
	do {
		trace_xprtrdma_mr_localinv(mr);
		r_xprt->rx_stats.local_inv_needed++;

		last = &mr->mr_invwr;
		last->next = NULL;
		last->wr_cqe = &mr->mr_cqe;
		last->sg_list = NULL;
		last->num_sge = 0;
		last->opcode = IB_WR_LOCAL_INV;
		last->send_flags = IB_SEND_SIGNALED;
		last->ex.invalidate_rkey = mr->mr_handle;

		last->wr_cqe->done = frwr_wc_localinv;

		*prev = last;
		prev = &last->next;
	} while ((mr = rpcrdma_mr_pop(&req->rl_registered)));

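	/* @mr is now the MR whose LOCAL_INV is last in the chain; its
	 * mr_linv_done completion is what this function waits on.
	 */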
	mr = container_of(last, struct rpcrdma_mr, mr_invwr);

	/* Strong send queue ordering guarantees that when the
	 * last WR in the chain completes, all WRs in the chain
	 * are complete.
	 */
	last->wr_cqe->done = frwr_wc_localinv_wake;
	reinit_completion(&mr->mr_linv_done);

	/* Transport disconnect drains the receive CQ before it
	 * replaces the QP. The RPC reply handler won't call us
	 * unless re_id->qp is a valid pointer.
	 */
	bad_wr = NULL;
	rc = ib_post_send(ep->re_id->qp, first, &bad_wr);

	/* The final LOCAL_INV WR in the chain is supposed to
	 * do the wake. If it was never posted, the wake will
	 * not happen, so don't wait in that case.
	 */
	if (bad_wr != first)
		wait_for_completion(&mr->mr_linv_done);
	if (!rc)
		return;

	/* On error, the MRs get destroyed once the QP has drained. */
	trace_xprtrdma_post_linv_err(req, rc);

	/* Force a connection loss to ensure complete recovery.
	 */
	rpcrdma_force_disconnect(ep);
}

/**
 * frwr_wc_localinv_done - Invoked by RDMA provider for a signaled LOCAL_INV WC
 * @cq: completion queue
 * @wc: WCE for a completed LocalInv WR
 *
 */
static void frwr_wc_localinv_done(struct ib_cq *cq, struct ib_wc *wc)
{
	struct ib_cqe *cqe = wc->wr_cqe;
	struct rpcrdma_mr *mr = container_of(cqe, struct rpcrdma_mr, mr_cqe);
	struct rpcrdma_rep *rep;

	/* WARNING: Only wr_cqe and status are reliable at this point */
	trace_xprtrdma_wc_li_done(wc, &mr->mr_cid);

	/* Ensure that @rep is generated before the MR is released */
	rep = mr->mr_req->rl_reply;
	smp_rmb();

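	/* On a flushed LocalInv, unpin the reply and let the flush path
	 * clean up the MR; on success, release the MR and complete the
	 * RPC.
	 */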
	if (wc->status != IB_WC_SUCCESS) {
		if (rep)
			rpcrdma_unpin_rqst(rep);
		rpcrdma_flush_disconnect(cq->cq_context, wc);
		return;
	}
	frwr_mr_put(mr);
	rpcrdma_complete_rqst(rep);
}

/**
 * frwr_unmap_async - invalidate memory regions that were registered for @req
 * @r_xprt: controlling transport instance
 * @req: rpcrdma_req with a non-empty list of MRs to process
 *
 * This guarantees that registered MRs are properly fenced from the
 * server before the RPC consumer accesses the data in them. It also
 * ensures proper Send flow control: waking the next RPC waits until
 * this RPC has relinquished all its Send Queue entries.
 */
void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
{
	struct ib_send_wr *first, *last, **prev;
	struct rpcrdma_ep *ep = r_xprt->rx_ep;
	struct rpcrdma_mr *mr;
	int rc;

	/* Chain the LOCAL_INV Work Requests and post them with
	 * a single ib_post_send() call.
	 */
	prev = &first;
	mr = rpcrdma_mr_pop(&req->rl_registered);
	do {
		trace_xprtrdma_mr_localinv(mr);
		r_xprt->rx_stats.local_inv_needed++;

		last = &mr->mr_invwr;
		last->next = NULL;
		last->wr_cqe = &mr->mr_cqe;
		last->sg_list = NULL;
		last->num_sge = 0;
		last->opcode = IB_WR_LOCAL_INV;
		last->send_flags = IB_SEND_SIGNALED;
		last->ex.invalidate_rkey = mr->mr_handle;

		last->wr_cqe->done = frwr_wc_localinv;

		*prev = last;
		prev = &last->next;
	} while ((mr = rpcrdma_mr_pop(&req->rl_registered)));

	/* Strong send queue ordering guarantees that when the
	 * last WR in the chain completes, all WRs in the chain
	 * are complete. The last completion will wake up the
	 * RPC waiter.
	 */
	last->wr_cqe->done = frwr_wc_localinv_done;

	/* Transport disconnect drains the receive CQ before it
	 * replaces the QP. The RPC reply handler won't call us
	 * unless re_id->qp is a valid pointer.
	 */
	rc = ib_post_send(ep->re_id->qp, first, NULL);
	if (!rc)
		return;

	/* On error, the MRs get destroyed once the QP has drained. */
	trace_xprtrdma_post_linv_err(req, rc);

	/* The final LOCAL_INV WR in the chain is supposed to
	 * do the wake. If it was never posted, the wake does
	 * not happen. Unpin the rqst in preparation for its
	 * retransmission.
	 */
	rpcrdma_unpin_rqst(req->rl_reply);

	/* Force a connection loss to ensure complete recovery.
	 */
	rpcrdma_force_disconnect(ep);
}

/**
 * frwr_wp_create - Create an MR for padding Write chunks
 * @r_xprt: transport resources to use
 *
 * Return 0 on success, negative errno on failure.
 */
int frwr_wp_create(struct rpcrdma_xprt *r_xprt)
{
	struct rpcrdma_ep *ep = r_xprt->rx_ep;
	struct rpcrdma_mr_seg seg;
	struct rpcrdma_mr *mr;

	mr = rpcrdma_mr_get(r_xprt);
	if (!mr)
		return -EAGAIN;
	mr->mr_req = NULL;
	ep->re_write_pad_mr = mr;

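	/* Register one XDR_UNIT from the endpoint's write pad buffer
	 * and post its FastReg WR right away; this MR provides the
	 * padding segment appended to Write chunks.
	 */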
	seg.mr_len = XDR_UNIT;
	seg.mr_page = virt_to_page(ep->re_write_pad);
	seg.mr_offset = offset_in_page(ep->re_write_pad);
	if (IS_ERR(frwr_map(r_xprt, &seg, 1, true, xdr_zero, mr)))
		return -EIO;
	trace_xprtrdma_mr_fastreg(mr);

	mr->mr_cqe.done = frwr_wc_fastreg;
	mr->mr_regwr.wr.next = NULL;
	mr->mr_regwr.wr.wr_cqe = &mr->mr_cqe;
	mr->mr_regwr.wr.num_sge = 0;
	mr->mr_regwr.wr.opcode = IB_WR_REG_MR;
	mr->mr_regwr.wr.send_flags = 0;

	return ib_post_send(ep->re_id->qp, &mr->mr_regwr.wr, NULL);
}