/*
 * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the BSD-type
 * license below:
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *      Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *
 *      Redistributions in binary form must reproduce the above
 *      copyright notice, this list of conditions and the following
 *      disclaimer in the documentation and/or other materials provided
 *      with the distribution.
 *
 *      Neither the name of the Network Appliance, Inc. nor the names of
 *      its contributors may be used to endorse or promote products
 *      derived from this software without specific prior written
 *      permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/*
 * rpc_rdma.c
 *
 * This file contains the guts of the RPC RDMA protocol, and
 * does marshaling/unmarshaling, etc. It is also where interfacing
 * to the Linux RPC framework lives.
 */

#include "xprt_rdma.h"

#include <linux/highmem.h>

#ifdef RPC_DEBUG
# define RPCDBG_FACILITY	RPCDBG_TRANS
#endif

/*
 * Transfer mode selected for one direction of an RPC.
 *
 * The numeric values double as indices into transfertypes[] below,
 * so the two definitions must be kept in the same order.
 */
enum rpcrdma_chunktype {
	rpcrdma_noch = 0,	/* no chunks, everything inline */
	rpcrdma_readch,		/* read chunk(s) at a non-zero position */
	rpcrdma_areadch,	/* read chunk(s) at position zero */
	rpcrdma_writech,	/* write chunk list */
	rpcrdma_replych		/* reply chunk */
};

#ifdef RPC_DEBUG
/*
 * Debug-only names for the chunk types, indexed by enum rpcrdma_chunktype.
 * Each string is 11 characters plus the terminating NUL.
 */
static const char transfertypes[][12] = {
	"pure inline",	/* no chunks */
	" read chunk",	/* some argument via rdma read */
	"*read chunk",	/* entire request via rdma read */
	"write chunk",	/* some result via rdma write */
	"reply chunk"	/* entire reply via rdma write */
};
#endif

/*
 * Chunk assembly from upper layer xdr_buf.
 *
 * Prepare the passed-in xdr_buf into representation as RPC/RDMA chunk
 * elements. Segments are then coalesced when registered, if possible
 * within the selected memreg mode.
 *
 * Returns positive number of segments converted, or a negative errno.
82 */ 83 84 static int 85 rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos, 86 enum rpcrdma_chunktype type, struct rpcrdma_mr_seg *seg, int nsegs) 87 { 88 int len, n = 0, p; 89 int page_base; 90 struct page **ppages; 91 92 if (pos == 0 && xdrbuf->head[0].iov_len) { 93 seg[n].mr_page = NULL; 94 seg[n].mr_offset = xdrbuf->head[0].iov_base; 95 seg[n].mr_len = xdrbuf->head[0].iov_len; 96 ++n; 97 } 98 99 len = xdrbuf->page_len; 100 ppages = xdrbuf->pages + (xdrbuf->page_base >> PAGE_SHIFT); 101 page_base = xdrbuf->page_base & ~PAGE_MASK; 102 p = 0; 103 while (len && n < nsegs) { 104 if (!ppages[p]) { 105 /* alloc the pagelist for receiving buffer */ 106 ppages[p] = alloc_page(GFP_ATOMIC); 107 if (!ppages[p]) 108 return -ENOMEM; 109 } 110 seg[n].mr_page = ppages[p]; 111 seg[n].mr_offset = (void *)(unsigned long) page_base; 112 seg[n].mr_len = min_t(u32, PAGE_SIZE - page_base, len); 113 if (seg[n].mr_len > PAGE_SIZE) 114 return -EIO; 115 len -= seg[n].mr_len; 116 ++n; 117 ++p; 118 page_base = 0; /* page offset only applies to first page */ 119 } 120 121 /* Message overflows the seg array */ 122 if (len && n == nsegs) 123 return -EIO; 124 125 if (xdrbuf->tail[0].iov_len) { 126 /* the rpcrdma protocol allows us to omit any trailing 127 * xdr pad bytes, saving the server an RDMA operation. */ 128 if (xdrbuf->tail[0].iov_len < 4 && xprt_rdma_pad_optimize) 129 return n; 130 if (n == nsegs) 131 /* Tail remains, but we're out of segments */ 132 return -EIO; 133 seg[n].mr_page = NULL; 134 seg[n].mr_offset = xdrbuf->tail[0].iov_base; 135 seg[n].mr_len = xdrbuf->tail[0].iov_len; 136 ++n; 137 } 138 139 return n; 140 } 141 142 /* 143 * Create read/write chunk lists, and reply chunks, for RDMA 144 * 145 * Assume check against THRESHOLD has been done, and chunks are required. 146 * Assume only encoding one list entry for read|write chunks. 
The NFSv3
 * protocol is simple enough to allow this as it only has a single "bulk
 * result" in each procedure - complicated NFSv4 COMPOUNDs are not. (The
 * RDMA/Sessions NFSv4 proposal addresses this for future v4 revs.)
 *
 * When used for a single reply chunk (which is a special write
 * chunk used for the entire reply, rather than just the data), it
 * is used primarily for READDIR and READLINK which would otherwise
 * be severely size-limited by a small rdma inline read max. The server
 * response will come back as an RDMA Write, followed by a message
 * of type RDMA_NOMSG carrying the xid and length. As a result, reply
 * chunks do not provide data alignment, however they do not require
 * "fixup" (moving the response to the upper layer buffer) either.
 *
 * Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64):
 *
 *  Read chunklist (a linked list):
 *   N elements, position P (same P for all chunks of same arg!):
 *    1 - PHLOO - 1 - PHLOO - ... - 1 - PHLOO - 0
 *
 *  Write chunklist (a list of (one) counted array):
 *   N elements:
 *    1 - N - HLOO - HLOO - ... - HLOO - 0
 *
 *  Reply chunk (a counted array):
 *   N elements:
 *    1 - N - HLOO - HLOO - ... - HLOO
 *
 * Returns positive RPC/RDMA header size, or negative errno.
175 */ 176 177 static ssize_t 178 rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target, 179 struct rpcrdma_msg *headerp, enum rpcrdma_chunktype type) 180 { 181 struct rpcrdma_req *req = rpcr_to_rdmar(rqst); 182 struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt); 183 int n, nsegs, nchunks = 0; 184 unsigned int pos; 185 struct rpcrdma_mr_seg *seg = req->rl_segments; 186 struct rpcrdma_read_chunk *cur_rchunk = NULL; 187 struct rpcrdma_write_array *warray = NULL; 188 struct rpcrdma_write_chunk *cur_wchunk = NULL; 189 __be32 *iptr = headerp->rm_body.rm_chunks; 190 191 if (type == rpcrdma_readch || type == rpcrdma_areadch) { 192 /* a read chunk - server will RDMA Read our memory */ 193 cur_rchunk = (struct rpcrdma_read_chunk *) iptr; 194 } else { 195 /* a write or reply chunk - server will RDMA Write our memory */ 196 *iptr++ = xdr_zero; /* encode a NULL read chunk list */ 197 if (type == rpcrdma_replych) 198 *iptr++ = xdr_zero; /* a NULL write chunk list */ 199 warray = (struct rpcrdma_write_array *) iptr; 200 cur_wchunk = (struct rpcrdma_write_chunk *) (warray + 1); 201 } 202 203 if (type == rpcrdma_replych || type == rpcrdma_areadch) 204 pos = 0; 205 else 206 pos = target->head[0].iov_len; 207 208 nsegs = rpcrdma_convert_iovs(target, pos, type, seg, RPCRDMA_MAX_SEGS); 209 if (nsegs < 0) 210 return nsegs; 211 212 do { 213 n = rpcrdma_register_external(seg, nsegs, 214 cur_wchunk != NULL, r_xprt); 215 if (n <= 0) 216 goto out; 217 if (cur_rchunk) { /* read */ 218 cur_rchunk->rc_discrim = xdr_one; 219 /* all read chunks have the same "position" */ 220 cur_rchunk->rc_position = htonl(pos); 221 cur_rchunk->rc_target.rs_handle = htonl(seg->mr_rkey); 222 cur_rchunk->rc_target.rs_length = htonl(seg->mr_len); 223 xdr_encode_hyper( 224 (__be32 *)&cur_rchunk->rc_target.rs_offset, 225 seg->mr_base); 226 dprintk("RPC: %s: read chunk " 227 "elem %d@0x%llx:0x%x pos %u (%s)\n", __func__, 228 seg->mr_len, (unsigned long long)seg->mr_base, 229 seg->mr_rkey, pos, n 
< nsegs ? "more" : "last"); 230 cur_rchunk++; 231 r_xprt->rx_stats.read_chunk_count++; 232 } else { /* write/reply */ 233 cur_wchunk->wc_target.rs_handle = htonl(seg->mr_rkey); 234 cur_wchunk->wc_target.rs_length = htonl(seg->mr_len); 235 xdr_encode_hyper( 236 (__be32 *)&cur_wchunk->wc_target.rs_offset, 237 seg->mr_base); 238 dprintk("RPC: %s: %s chunk " 239 "elem %d@0x%llx:0x%x (%s)\n", __func__, 240 (type == rpcrdma_replych) ? "reply" : "write", 241 seg->mr_len, (unsigned long long)seg->mr_base, 242 seg->mr_rkey, n < nsegs ? "more" : "last"); 243 cur_wchunk++; 244 if (type == rpcrdma_replych) 245 r_xprt->rx_stats.reply_chunk_count++; 246 else 247 r_xprt->rx_stats.write_chunk_count++; 248 r_xprt->rx_stats.total_rdma_request += seg->mr_len; 249 } 250 nchunks++; 251 seg += n; 252 nsegs -= n; 253 } while (nsegs); 254 255 /* success. all failures return above */ 256 req->rl_nchunks = nchunks; 257 258 /* 259 * finish off header. If write, marshal discrim and nchunks. 260 */ 261 if (cur_rchunk) { 262 iptr = (__be32 *) cur_rchunk; 263 *iptr++ = xdr_zero; /* finish the read chunk list */ 264 *iptr++ = xdr_zero; /* encode a NULL write chunk list */ 265 *iptr++ = xdr_zero; /* encode a NULL reply chunk */ 266 } else { 267 warray->wc_discrim = xdr_one; 268 warray->wc_nchunks = htonl(nchunks); 269 iptr = (__be32 *) cur_wchunk; 270 if (type == rpcrdma_writech) { 271 *iptr++ = xdr_zero; /* finish the write chunk list */ 272 *iptr++ = xdr_zero; /* encode a NULL reply chunk */ 273 } 274 } 275 276 /* 277 * Return header size. 278 */ 279 return (unsigned char *)iptr - (unsigned char *)headerp; 280 281 out: 282 for (pos = 0; nchunks--;) 283 pos += rpcrdma_deregister_external( 284 &req->rl_segments[pos], r_xprt); 285 return n; 286 } 287 288 /* 289 * Copy write data inline. 290 * This function is used for "small" requests. Data which is passed 291 * to RPC via iovecs (or page list) is copied directly into the 292 * pre-registered memory buffer for this request. 
For small amounts 293 * of data, this is efficient. The cutoff value is tunable. 294 */ 295 static int 296 rpcrdma_inline_pullup(struct rpc_rqst *rqst, int pad) 297 { 298 int i, npages, curlen; 299 int copy_len; 300 unsigned char *srcp, *destp; 301 struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt); 302 int page_base; 303 struct page **ppages; 304 305 destp = rqst->rq_svec[0].iov_base; 306 curlen = rqst->rq_svec[0].iov_len; 307 destp += curlen; 308 /* 309 * Do optional padding where it makes sense. Alignment of write 310 * payload can help the server, if our setting is accurate. 311 */ 312 pad -= (curlen + 36/*sizeof(struct rpcrdma_msg_padded)*/); 313 if (pad < 0 || rqst->rq_slen - curlen < RPCRDMA_INLINE_PAD_THRESH) 314 pad = 0; /* don't pad this request */ 315 316 dprintk("RPC: %s: pad %d destp 0x%p len %d hdrlen %d\n", 317 __func__, pad, destp, rqst->rq_slen, curlen); 318 319 copy_len = rqst->rq_snd_buf.page_len; 320 321 if (rqst->rq_snd_buf.tail[0].iov_len) { 322 curlen = rqst->rq_snd_buf.tail[0].iov_len; 323 if (destp + copy_len != rqst->rq_snd_buf.tail[0].iov_base) { 324 memmove(destp + copy_len, 325 rqst->rq_snd_buf.tail[0].iov_base, curlen); 326 r_xprt->rx_stats.pullup_copy_count += curlen; 327 } 328 dprintk("RPC: %s: tail destp 0x%p len %d\n", 329 __func__, destp + copy_len, curlen); 330 rqst->rq_svec[0].iov_len += curlen; 331 } 332 r_xprt->rx_stats.pullup_copy_count += copy_len; 333 334 page_base = rqst->rq_snd_buf.page_base; 335 ppages = rqst->rq_snd_buf.pages + (page_base >> PAGE_SHIFT); 336 page_base &= ~PAGE_MASK; 337 npages = PAGE_ALIGN(page_base+copy_len) >> PAGE_SHIFT; 338 for (i = 0; copy_len && i < npages; i++) { 339 curlen = PAGE_SIZE - page_base; 340 if (curlen > copy_len) 341 curlen = copy_len; 342 dprintk("RPC: %s: page %d destp 0x%p len %d curlen %d\n", 343 __func__, i, destp, copy_len, curlen); 344 srcp = kmap_atomic(ppages[i]); 345 memcpy(destp, srcp+page_base, curlen); 346 kunmap_atomic(srcp); 347 rqst->rq_svec[0].iov_len += 
curlen; 348 destp += curlen; 349 copy_len -= curlen; 350 page_base = 0; 351 } 352 /* header now contains entire send message */ 353 return pad; 354 } 355 356 /* 357 * Marshal a request: the primary job of this routine is to choose 358 * the transfer modes. See comments below. 359 * 360 * Uses multiple RDMA IOVs for a request: 361 * [0] -- RPC RDMA header, which uses memory from the *start* of the 362 * preregistered buffer that already holds the RPC data in 363 * its middle. 364 * [1] -- the RPC header/data, marshaled by RPC and the NFS protocol. 365 * [2] -- optional padding. 366 * [3] -- if padded, header only in [1] and data here. 367 * 368 * Returns zero on success, otherwise a negative errno. 369 */ 370 371 int 372 rpcrdma_marshal_req(struct rpc_rqst *rqst) 373 { 374 struct rpc_xprt *xprt = rqst->rq_xprt; 375 struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); 376 struct rpcrdma_req *req = rpcr_to_rdmar(rqst); 377 char *base; 378 size_t rpclen, padlen; 379 ssize_t hdrlen; 380 enum rpcrdma_chunktype rtype, wtype; 381 struct rpcrdma_msg *headerp; 382 383 /* 384 * rpclen gets amount of data in first buffer, which is the 385 * pre-registered buffer. 386 */ 387 base = rqst->rq_svec[0].iov_base; 388 rpclen = rqst->rq_svec[0].iov_len; 389 390 /* build RDMA header in private area at front */ 391 headerp = (struct rpcrdma_msg *) req->rl_base; 392 /* don't htonl XID, it's already done in request */ 393 headerp->rm_xid = rqst->rq_xid; 394 headerp->rm_vers = xdr_one; 395 headerp->rm_credit = htonl(r_xprt->rx_buf.rb_max_requests); 396 headerp->rm_type = htonl(RDMA_MSG); 397 398 /* 399 * Chunks needed for results? 400 * 401 * o If the expected result is under the inline threshold, all ops 402 * return as inline (but see later). 403 * o Large non-read ops return as a single reply chunk. 404 * o Large read ops return data as write chunk(s), header as inline. 
405 * 406 * Note: the NFS code sending down multiple result segments implies 407 * the op is one of read, readdir[plus], readlink or NFSv4 getacl. 408 */ 409 410 /* 411 * This code can handle read chunks, write chunks OR reply 412 * chunks -- only one type. If the request is too big to fit 413 * inline, then we will choose read chunks. If the request is 414 * a READ, then use write chunks to separate the file data 415 * into pages; otherwise use reply chunks. 416 */ 417 if (rqst->rq_rcv_buf.buflen <= RPCRDMA_INLINE_READ_THRESHOLD(rqst)) 418 wtype = rpcrdma_noch; 419 else if (rqst->rq_rcv_buf.page_len == 0) 420 wtype = rpcrdma_replych; 421 else if (rqst->rq_rcv_buf.flags & XDRBUF_READ) 422 wtype = rpcrdma_writech; 423 else 424 wtype = rpcrdma_replych; 425 426 /* 427 * Chunks needed for arguments? 428 * 429 * o If the total request is under the inline threshold, all ops 430 * are sent as inline. 431 * o Large non-write ops are sent with the entire message as a 432 * single read chunk (protocol 0-position special case). 433 * o Large write ops transmit data as read chunk(s), header as 434 * inline. 435 * 436 * Note: the NFS code sending down multiple argument segments 437 * implies the op is a write. 438 * TBD check NFSv4 setacl 439 */ 440 if (rqst->rq_snd_buf.len <= RPCRDMA_INLINE_WRITE_THRESHOLD(rqst)) 441 rtype = rpcrdma_noch; 442 else if (rqst->rq_snd_buf.page_len == 0) 443 rtype = rpcrdma_areadch; 444 else 445 rtype = rpcrdma_readch; 446 447 /* The following simplification is not true forever */ 448 if (rtype != rpcrdma_noch && wtype == rpcrdma_replych) 449 wtype = rpcrdma_noch; 450 if (rtype != rpcrdma_noch && wtype != rpcrdma_noch) { 451 dprintk("RPC: %s: cannot marshal multiple chunk lists\n", 452 __func__); 453 return -EIO; 454 } 455 456 hdrlen = 28; /*sizeof *headerp;*/ 457 padlen = 0; 458 459 /* 460 * Pull up any extra send data into the preregistered buffer. 
461 * When padding is in use and applies to the transfer, insert 462 * it and change the message type. 463 */ 464 if (rtype == rpcrdma_noch) { 465 466 padlen = rpcrdma_inline_pullup(rqst, 467 RPCRDMA_INLINE_PAD_VALUE(rqst)); 468 469 if (padlen) { 470 headerp->rm_type = htonl(RDMA_MSGP); 471 headerp->rm_body.rm_padded.rm_align = 472 htonl(RPCRDMA_INLINE_PAD_VALUE(rqst)); 473 headerp->rm_body.rm_padded.rm_thresh = 474 htonl(RPCRDMA_INLINE_PAD_THRESH); 475 headerp->rm_body.rm_padded.rm_pempty[0] = xdr_zero; 476 headerp->rm_body.rm_padded.rm_pempty[1] = xdr_zero; 477 headerp->rm_body.rm_padded.rm_pempty[2] = xdr_zero; 478 hdrlen += 2 * sizeof(u32); /* extra words in padhdr */ 479 if (wtype != rpcrdma_noch) { 480 dprintk("RPC: %s: invalid chunk list\n", 481 __func__); 482 return -EIO; 483 } 484 } else { 485 headerp->rm_body.rm_nochunks.rm_empty[0] = xdr_zero; 486 headerp->rm_body.rm_nochunks.rm_empty[1] = xdr_zero; 487 headerp->rm_body.rm_nochunks.rm_empty[2] = xdr_zero; 488 /* new length after pullup */ 489 rpclen = rqst->rq_svec[0].iov_len; 490 /* 491 * Currently we try to not actually use read inline. 492 * Reply chunks have the desirable property that 493 * they land, packed, directly in the target buffers 494 * without headers, so they require no fixup. The 495 * additional RDMA Write op sends the same amount 496 * of data, streams on-the-wire and adds no overhead 497 * on receive. Therefore, we request a reply chunk 498 * for non-writes wherever feasible and efficient. 499 */ 500 if (wtype == rpcrdma_noch) 501 wtype = rpcrdma_replych; 502 } 503 } 504 505 /* 506 * Marshal chunks. This routine will return the header length 507 * consumed by marshaling. 
508 */ 509 if (rtype != rpcrdma_noch) { 510 hdrlen = rpcrdma_create_chunks(rqst, 511 &rqst->rq_snd_buf, headerp, rtype); 512 wtype = rtype; /* simplify dprintk */ 513 514 } else if (wtype != rpcrdma_noch) { 515 hdrlen = rpcrdma_create_chunks(rqst, 516 &rqst->rq_rcv_buf, headerp, wtype); 517 } 518 if (hdrlen < 0) 519 return hdrlen; 520 521 dprintk("RPC: %s: %s: hdrlen %zd rpclen %zd padlen %zd" 522 " headerp 0x%p base 0x%p lkey 0x%x\n", 523 __func__, transfertypes[wtype], hdrlen, rpclen, padlen, 524 headerp, base, req->rl_iov.lkey); 525 526 /* 527 * initialize send_iov's - normally only two: rdma chunk header and 528 * single preregistered RPC header buffer, but if padding is present, 529 * then use a preregistered (and zeroed) pad buffer between the RPC 530 * header and any write data. In all non-rdma cases, any following 531 * data has been copied into the RPC header buffer. 532 */ 533 req->rl_send_iov[0].addr = req->rl_iov.addr; 534 req->rl_send_iov[0].length = hdrlen; 535 req->rl_send_iov[0].lkey = req->rl_iov.lkey; 536 537 req->rl_send_iov[1].addr = req->rl_iov.addr + (base - req->rl_base); 538 req->rl_send_iov[1].length = rpclen; 539 req->rl_send_iov[1].lkey = req->rl_iov.lkey; 540 541 req->rl_niovs = 2; 542 543 if (padlen) { 544 struct rpcrdma_ep *ep = &r_xprt->rx_ep; 545 546 req->rl_send_iov[2].addr = ep->rep_pad.addr; 547 req->rl_send_iov[2].length = padlen; 548 req->rl_send_iov[2].lkey = ep->rep_pad.lkey; 549 550 req->rl_send_iov[3].addr = req->rl_send_iov[1].addr + rpclen; 551 req->rl_send_iov[3].length = rqst->rq_slen - rpclen; 552 req->rl_send_iov[3].lkey = req->rl_iov.lkey; 553 554 req->rl_niovs = 4; 555 } 556 557 return 0; 558 } 559 560 /* 561 * Chase down a received write or reply chunklist to get length 562 * RDMA'd by server. See map at rpcrdma_create_chunks()! 
:-)
 */

/*
 * @rep:     received reply; rr_base/rr_len bound the received message
 * @max:     largest acceptable array count
 * @wrchunk: non-zero if the array is a write chunk list, which must be
 *           followed by a zero terminator word
 * @iptrp:   in: points at the array count; out (on success): points
 *           just past the array and, for write chunks, its terminator
 *
 * Every word is checked to lie inside the received message before it
 * is read, so a truncated reply is rejected without looking at bytes
 * beyond rr_len.
 *
 * Returns the sum of the segment lengths, or -1 if the list is
 * malformed. *iptrp is left untouched on failure.
 */
static int
rpcrdma_count_chunks(struct rpcrdma_rep *rep, unsigned int max, int wrchunk, __be32 **iptrp)
{
	unsigned int i, total_len;
	struct rpcrdma_write_chunk *cur_wchunk;
	char *end = rep->rr_base + rep->rr_len;

	/* the array count itself must have been received */
	if ((char *)(*iptrp + 1) > end)
		return -1;
	i = ntohl(**iptrp);	/* get array count */
	if (i > max)
		return -1;
	cur_wchunk = (struct rpcrdma_write_chunk *) (*iptrp + 1);
	total_len = 0;
	while (i--) {
		struct rpcrdma_segment *seg = &cur_wchunk->wc_target;

		/* don't decode a segment that was not fully received */
		if ((char *)(cur_wchunk + 1) > end)
			return -1;
		ifdebug(FACILITY) {
			u64 off;
			xdr_decode_hyper((__be32 *)&seg->rs_offset, &off);
			dprintk("RPC: %s: chunk %d@0x%llx:0x%x\n",
				__func__,
				ntohl(seg->rs_length),
				(unsigned long long)off,
				ntohl(seg->rs_handle));
		}
		total_len += ntohl(seg->rs_length);
		++cur_wchunk;
	}
	/* check and adjust for properly terminated write chunk */
	if (wrchunk) {
		__be32 *w = (__be32 *) cur_wchunk;

		if ((char *)(w + 1) > end)
			return -1;
		if (*w++ != xdr_zero)
			return -1;
		cur_wchunk = (struct rpcrdma_write_chunk *) w;
	}
	if ((char *) cur_wchunk > end)
		return -1;

	*iptrp = (__be32 *) cur_wchunk;
	return total_len;
}

/*
 * Scatter inline received data back into provided iov's.
605 */ 606 static void 607 rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad) 608 { 609 int i, npages, curlen, olen; 610 char *destp; 611 struct page **ppages; 612 int page_base; 613 614 curlen = rqst->rq_rcv_buf.head[0].iov_len; 615 if (curlen > copy_len) { /* write chunk header fixup */ 616 curlen = copy_len; 617 rqst->rq_rcv_buf.head[0].iov_len = curlen; 618 } 619 620 dprintk("RPC: %s: srcp 0x%p len %d hdrlen %d\n", 621 __func__, srcp, copy_len, curlen); 622 623 /* Shift pointer for first receive segment only */ 624 rqst->rq_rcv_buf.head[0].iov_base = srcp; 625 srcp += curlen; 626 copy_len -= curlen; 627 628 olen = copy_len; 629 i = 0; 630 rpcx_to_rdmax(rqst->rq_xprt)->rx_stats.fixup_copy_count += olen; 631 page_base = rqst->rq_rcv_buf.page_base; 632 ppages = rqst->rq_rcv_buf.pages + (page_base >> PAGE_SHIFT); 633 page_base &= ~PAGE_MASK; 634 635 if (copy_len && rqst->rq_rcv_buf.page_len) { 636 npages = PAGE_ALIGN(page_base + 637 rqst->rq_rcv_buf.page_len) >> PAGE_SHIFT; 638 for (; i < npages; i++) { 639 curlen = PAGE_SIZE - page_base; 640 if (curlen > copy_len) 641 curlen = copy_len; 642 dprintk("RPC: %s: page %d" 643 " srcp 0x%p len %d curlen %d\n", 644 __func__, i, srcp, copy_len, curlen); 645 destp = kmap_atomic(ppages[i]); 646 memcpy(destp + page_base, srcp, curlen); 647 flush_dcache_page(ppages[i]); 648 kunmap_atomic(destp); 649 srcp += curlen; 650 copy_len -= curlen; 651 if (copy_len == 0) 652 break; 653 page_base = 0; 654 } 655 } 656 657 if (copy_len && rqst->rq_rcv_buf.tail[0].iov_len) { 658 curlen = copy_len; 659 if (curlen > rqst->rq_rcv_buf.tail[0].iov_len) 660 curlen = rqst->rq_rcv_buf.tail[0].iov_len; 661 if (rqst->rq_rcv_buf.tail[0].iov_base != srcp) 662 memmove(rqst->rq_rcv_buf.tail[0].iov_base, srcp, curlen); 663 dprintk("RPC: %s: tail srcp 0x%p len %d curlen %d\n", 664 __func__, srcp, copy_len, curlen); 665 rqst->rq_rcv_buf.tail[0].iov_len = curlen; 666 copy_len -= curlen; ++i; 667 } else 668 
rqst->rq_rcv_buf.tail[0].iov_len = 0; 669 670 if (pad) { 671 /* implicit padding on terminal chunk */ 672 unsigned char *p = rqst->rq_rcv_buf.tail[0].iov_base; 673 while (pad--) 674 p[rqst->rq_rcv_buf.tail[0].iov_len++] = 0; 675 } 676 677 if (copy_len) 678 dprintk("RPC: %s: %d bytes in" 679 " %d extra segments (%d lost)\n", 680 __func__, olen, i, copy_len); 681 682 /* TBD avoid a warning from call_decode() */ 683 rqst->rq_private_buf = rqst->rq_rcv_buf; 684 } 685 686 void 687 rpcrdma_connect_worker(struct work_struct *work) 688 { 689 struct rpcrdma_ep *ep = 690 container_of(work, struct rpcrdma_ep, rep_connect_worker.work); 691 struct rpc_xprt *xprt = ep->rep_xprt; 692 693 spin_lock_bh(&xprt->transport_lock); 694 if (++xprt->connect_cookie == 0) /* maintain a reserved value */ 695 ++xprt->connect_cookie; 696 if (ep->rep_connected > 0) { 697 if (!xprt_test_and_set_connected(xprt)) 698 xprt_wake_pending_tasks(xprt, 0); 699 } else { 700 if (xprt_test_and_clear_connected(xprt)) 701 xprt_wake_pending_tasks(xprt, -ENOTCONN); 702 } 703 spin_unlock_bh(&xprt->transport_lock); 704 } 705 706 /* 707 * This function is called when an async event is posted to 708 * the connection which changes the connection state. All it 709 * does at this point is mark the connection up/down, the rpc 710 * timers do the rest. 711 */ 712 void 713 rpcrdma_conn_func(struct rpcrdma_ep *ep) 714 { 715 schedule_delayed_work(&ep->rep_connect_worker, 0); 716 } 717 718 /* 719 * Called as a tasklet to do req/reply match and complete a request 720 * Errors must result in the RPC task either being awakened, or 721 * allowed to timeout, to discover the errors at that time. 
722 */ 723 void 724 rpcrdma_reply_handler(struct rpcrdma_rep *rep) 725 { 726 struct rpcrdma_msg *headerp; 727 struct rpcrdma_req *req; 728 struct rpc_rqst *rqst; 729 struct rpc_xprt *xprt = rep->rr_xprt; 730 struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); 731 __be32 *iptr; 732 int rdmalen, status; 733 unsigned long cwnd; 734 735 /* Check status. If bad, signal disconnect and return rep to pool */ 736 if (rep->rr_len == ~0U) { 737 rpcrdma_recv_buffer_put(rep); 738 if (r_xprt->rx_ep.rep_connected == 1) { 739 r_xprt->rx_ep.rep_connected = -EIO; 740 rpcrdma_conn_func(&r_xprt->rx_ep); 741 } 742 return; 743 } 744 if (rep->rr_len < 28) { 745 dprintk("RPC: %s: short/invalid reply\n", __func__); 746 goto repost; 747 } 748 headerp = (struct rpcrdma_msg *) rep->rr_base; 749 if (headerp->rm_vers != xdr_one) { 750 dprintk("RPC: %s: invalid version %d\n", 751 __func__, ntohl(headerp->rm_vers)); 752 goto repost; 753 } 754 755 /* Get XID and try for a match. */ 756 spin_lock(&xprt->transport_lock); 757 rqst = xprt_lookup_rqst(xprt, headerp->rm_xid); 758 if (rqst == NULL) { 759 spin_unlock(&xprt->transport_lock); 760 dprintk("RPC: %s: reply 0x%p failed " 761 "to match any request xid 0x%08x len %d\n", 762 __func__, rep, headerp->rm_xid, rep->rr_len); 763 repost: 764 r_xprt->rx_stats.bad_reply_count++; 765 rep->rr_func = rpcrdma_reply_handler; 766 if (rpcrdma_ep_post_recv(&r_xprt->rx_ia, &r_xprt->rx_ep, rep)) 767 rpcrdma_recv_buffer_put(rep); 768 769 return; 770 } 771 772 /* get request object */ 773 req = rpcr_to_rdmar(rqst); 774 if (req->rl_reply) { 775 spin_unlock(&xprt->transport_lock); 776 dprintk("RPC: %s: duplicate reply 0x%p to RPC " 777 "request 0x%p: xid 0x%08x\n", __func__, rep, req, 778 headerp->rm_xid); 779 goto repost; 780 } 781 782 dprintk("RPC: %s: reply 0x%p completes request 0x%p\n" 783 " RPC request 0x%p xid 0x%08x\n", 784 __func__, rep, req, rqst, headerp->rm_xid); 785 786 /* from here on, the reply is no longer an orphan */ 787 req->rl_reply = rep; 788 
xprt->reestablish_timeout = 0; 789 790 /* check for expected message types */ 791 /* The order of some of these tests is important. */ 792 switch (headerp->rm_type) { 793 case htonl(RDMA_MSG): 794 /* never expect read chunks */ 795 /* never expect reply chunks (two ways to check) */ 796 /* never expect write chunks without having offered RDMA */ 797 if (headerp->rm_body.rm_chunks[0] != xdr_zero || 798 (headerp->rm_body.rm_chunks[1] == xdr_zero && 799 headerp->rm_body.rm_chunks[2] != xdr_zero) || 800 (headerp->rm_body.rm_chunks[1] != xdr_zero && 801 req->rl_nchunks == 0)) 802 goto badheader; 803 if (headerp->rm_body.rm_chunks[1] != xdr_zero) { 804 /* count any expected write chunks in read reply */ 805 /* start at write chunk array count */ 806 iptr = &headerp->rm_body.rm_chunks[2]; 807 rdmalen = rpcrdma_count_chunks(rep, 808 req->rl_nchunks, 1, &iptr); 809 /* check for validity, and no reply chunk after */ 810 if (rdmalen < 0 || *iptr++ != xdr_zero) 811 goto badheader; 812 rep->rr_len -= 813 ((unsigned char *)iptr - (unsigned char *)headerp); 814 status = rep->rr_len + rdmalen; 815 r_xprt->rx_stats.total_rdma_reply += rdmalen; 816 /* special case - last chunk may omit padding */ 817 if (rdmalen &= 3) { 818 rdmalen = 4 - rdmalen; 819 status += rdmalen; 820 } 821 } else { 822 /* else ordinary inline */ 823 rdmalen = 0; 824 iptr = (__be32 *)((unsigned char *)headerp + 28); 825 rep->rr_len -= 28; /*sizeof *headerp;*/ 826 status = rep->rr_len; 827 } 828 /* Fix up the rpc results for upper layer */ 829 rpcrdma_inline_fixup(rqst, (char *)iptr, rep->rr_len, rdmalen); 830 break; 831 832 case htonl(RDMA_NOMSG): 833 /* never expect read or write chunks, always reply chunks */ 834 if (headerp->rm_body.rm_chunks[0] != xdr_zero || 835 headerp->rm_body.rm_chunks[1] != xdr_zero || 836 headerp->rm_body.rm_chunks[2] != xdr_one || 837 req->rl_nchunks == 0) 838 goto badheader; 839 iptr = (__be32 *)((unsigned char *)headerp + 28); 840 rdmalen = rpcrdma_count_chunks(rep, 
req->rl_nchunks, 0, &iptr); 841 if (rdmalen < 0) 842 goto badheader; 843 r_xprt->rx_stats.total_rdma_reply += rdmalen; 844 /* Reply chunk buffer already is the reply vector - no fixup. */ 845 status = rdmalen; 846 break; 847 848 badheader: 849 default: 850 dprintk("%s: invalid rpcrdma reply header (type %d):" 851 " chunks[012] == %d %d %d" 852 " expected chunks <= %d\n", 853 __func__, ntohl(headerp->rm_type), 854 headerp->rm_body.rm_chunks[0], 855 headerp->rm_body.rm_chunks[1], 856 headerp->rm_body.rm_chunks[2], 857 req->rl_nchunks); 858 status = -EIO; 859 r_xprt->rx_stats.bad_reply_count++; 860 break; 861 } 862 863 cwnd = xprt->cwnd; 864 xprt->cwnd = atomic_read(&r_xprt->rx_buf.rb_credits) << RPC_CWNDSHIFT; 865 if (xprt->cwnd > cwnd) 866 xprt_release_rqst_cong(rqst->rq_task); 867 868 dprintk("RPC: %s: xprt_complete_rqst(0x%p, 0x%p, %d)\n", 869 __func__, xprt, rqst, status); 870 xprt_complete_rqst(rqst->rq_task, status); 871 spin_unlock(&xprt->transport_lock); 872 } 873