1 // SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause 2 3 /* Authors: Bernard Metzler <bmt@zurich.ibm.com> */ 4 /* Copyright (c) 2008-2019, IBM Corporation */ 5 6 #include <linux/errno.h> 7 #include <linux/types.h> 8 #include <linux/net.h> 9 #include <linux/scatterlist.h> 10 #include <linux/highmem.h> 11 #include <net/tcp.h> 12 13 #include <rdma/iw_cm.h> 14 #include <rdma/ib_verbs.h> 15 #include <rdma/ib_user_verbs.h> 16 17 #include "siw.h" 18 #include "siw_verbs.h" 19 #include "siw_mem.h" 20 21 #define MAX_HDR_INLINE \ 22 (((uint32_t)(sizeof(struct siw_rreq_pkt) - \ 23 sizeof(struct iwarp_send))) & 0xF8) 24 25 static struct page *siw_get_pblpage(struct siw_mem *mem, u64 addr, int *idx) 26 { 27 struct siw_pbl *pbl = mem->pbl; 28 u64 offset = addr - mem->va; 29 u64 paddr = siw_pbl_get_buffer(pbl, offset, NULL, idx); 30 31 if (paddr) 32 return virt_to_page(paddr); 33 34 return NULL; 35 } 36 37 /* 38 * Copy short payload at provided destination payload address 39 */ 40 static int siw_try_1seg(struct siw_iwarp_tx *c_tx, u64 paddr) 41 { 42 struct siw_wqe *wqe = &c_tx->wqe_active; 43 struct siw_sge *sge = &wqe->sqe.sge[0]; 44 u32 bytes = sge->length; 45 46 if (bytes > MAX_HDR_INLINE || wqe->sqe.num_sge != 1) 47 return MAX_HDR_INLINE + 1; 48 49 if (!bytes) 50 return 0; 51 52 if (tx_flags(wqe) & SIW_WQE_INLINE) { 53 memcpy((void *)paddr, &wqe->sqe.sge[1], bytes); 54 } else { 55 struct siw_mem *mem = wqe->mem[0]; 56 57 if (!mem->mem_obj) { 58 /* Kernel client using kva */ 59 memcpy((void *)paddr, (void *)sge->laddr, bytes); 60 } else if (c_tx->in_syscall) { 61 if (copy_from_user((void *)paddr, 62 (const void __user *)sge->laddr, 63 bytes)) 64 return -EFAULT; 65 } else { 66 unsigned int off = sge->laddr & ~PAGE_MASK; 67 struct page *p; 68 char *buffer; 69 int pbl_idx = 0; 70 71 if (!mem->is_pbl) 72 p = siw_get_upage(mem->umem, sge->laddr); 73 else 74 p = siw_get_pblpage(mem, sge->laddr, &pbl_idx); 75 76 if (unlikely(!p)) 77 return -EFAULT; 78 79 buffer = kmap_atomic(p); 80 81 if (likely(PAGE_SIZE - off >= bytes)) { 82 memcpy((void *)paddr, buffer + off, bytes); 83 kunmap_atomic(buffer); 84 } else { 85 unsigned long part = bytes - (PAGE_SIZE - off); 86 87 memcpy((void *)paddr, buffer + off, part); 88 kunmap_atomic(buffer); 89 90 if (!mem->is_pbl) 91 p = siw_get_upage(mem->umem, 92 sge->laddr + part); 93 else 94 p = siw_get_pblpage(mem, 95 sge->laddr + part, 96 &pbl_idx); 97 if (unlikely(!p)) 98 return -EFAULT; 99 100 buffer = kmap_atomic(p); 101 memcpy((void *)(paddr + part), buffer, 102 bytes - part); 103 kunmap_atomic(buffer); 104 } 105 } 106 } 107 return (int)bytes; 108 } 109 110 #define PKT_FRAGMENTED 1 111 #define PKT_COMPLETE 0 112 113 /* 114 * siw_qp_prepare_tx() 115 * 116 * Prepare tx state for sending out one fpdu. Builds complete pkt 117 * if no user data or only immediate data are present. 118 * 119 * returns PKT_COMPLETE if complete pkt built, PKT_FRAGMENTED otherwise. 120 */ 121 static int siw_qp_prepare_tx(struct siw_iwarp_tx *c_tx) 122 { 123 struct siw_wqe *wqe = &c_tx->wqe_active; 124 char *crc = NULL; 125 int data = 0; 126 127 switch (tx_type(wqe)) { 128 case SIW_OP_READ: 129 case SIW_OP_READ_LOCAL_INV: 130 memcpy(&c_tx->pkt.ctrl, 131 &iwarp_pktinfo[RDMAP_RDMA_READ_REQ].ctrl, 132 sizeof(struct iwarp_ctrl)); 133 134 c_tx->pkt.rreq.rsvd = 0; 135 c_tx->pkt.rreq.ddp_qn = htonl(RDMAP_UNTAGGED_QN_RDMA_READ); 136 c_tx->pkt.rreq.ddp_msn = 137 htonl(++c_tx->ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ]); 138 c_tx->pkt.rreq.ddp_mo = 0; 139 c_tx->pkt.rreq.sink_stag = htonl(wqe->sqe.sge[0].lkey); 140 c_tx->pkt.rreq.sink_to = 141 cpu_to_be64(wqe->sqe.sge[0].laddr); 142 c_tx->pkt.rreq.source_stag = htonl(wqe->sqe.rkey); 143 c_tx->pkt.rreq.source_to = cpu_to_be64(wqe->sqe.raddr); 144 c_tx->pkt.rreq.read_size = htonl(wqe->sqe.sge[0].length); 145 146 c_tx->ctrl_len = sizeof(struct iwarp_rdma_rreq); 147 crc = (char *)&c_tx->pkt.rreq_pkt.crc; 148 break; 149 150 case SIW_OP_SEND: 151 if (tx_flags(wqe) & SIW_WQE_SOLICITED) 152 memcpy(&c_tx->pkt.ctrl, 153 &iwarp_pktinfo[RDMAP_SEND_SE].ctrl, 154 sizeof(struct iwarp_ctrl)); 155 else 156 memcpy(&c_tx->pkt.ctrl, &iwarp_pktinfo[RDMAP_SEND].ctrl, 157 sizeof(struct iwarp_ctrl)); 158 159 c_tx->pkt.send.ddp_qn = RDMAP_UNTAGGED_QN_SEND; 160 c_tx->pkt.send.ddp_msn = 161 htonl(++c_tx->ddp_msn[RDMAP_UNTAGGED_QN_SEND]); 162 c_tx->pkt.send.ddp_mo = 0; 163 164 c_tx->pkt.send_inv.inval_stag = 0; 165 166 c_tx->ctrl_len = sizeof(struct iwarp_send); 167 168 crc = (char *)&c_tx->pkt.send_pkt.crc; 169 data = siw_try_1seg(c_tx, (u64)crc); 170 break; 171 172 case SIW_OP_SEND_REMOTE_INV: 173 if (tx_flags(wqe) & SIW_WQE_SOLICITED) 174 memcpy(&c_tx->pkt.ctrl, 175 &iwarp_pktinfo[RDMAP_SEND_SE_INVAL].ctrl, 176 sizeof(struct iwarp_ctrl)); 177 else 178 memcpy(&c_tx->pkt.ctrl, 179 &iwarp_pktinfo[RDMAP_SEND_INVAL].ctrl, 180 sizeof(struct iwarp_ctrl)); 181 182 c_tx->pkt.send.ddp_qn = RDMAP_UNTAGGED_QN_SEND; 183 c_tx->pkt.send.ddp_msn = 184 htonl(++c_tx->ddp_msn[RDMAP_UNTAGGED_QN_SEND]); 185 c_tx->pkt.send.ddp_mo = 0; 186 187 c_tx->pkt.send_inv.inval_stag = cpu_to_be32(wqe->sqe.rkey); 188 189 c_tx->ctrl_len = sizeof(struct iwarp_send_inv); 190 191 crc = (char *)&c_tx->pkt.send_pkt.crc; 192 data = siw_try_1seg(c_tx, (u64)crc); 193 break; 194 195 case SIW_OP_WRITE: 196 memcpy(&c_tx->pkt.ctrl, &iwarp_pktinfo[RDMAP_RDMA_WRITE].ctrl, 197 sizeof(struct iwarp_ctrl)); 198 199 c_tx->pkt.rwrite.sink_stag = htonl(wqe->sqe.rkey); 200 c_tx->pkt.rwrite.sink_to = cpu_to_be64(wqe->sqe.raddr); 201 c_tx->ctrl_len = sizeof(struct iwarp_rdma_write); 202 203 crc = (char *)&c_tx->pkt.write_pkt.crc; 204 data = siw_try_1seg(c_tx, (u64)crc); 205 break; 206 207 case SIW_OP_READ_RESPONSE: 208 memcpy(&c_tx->pkt.ctrl, 209 &iwarp_pktinfo[RDMAP_RDMA_READ_RESP].ctrl, 210 sizeof(struct iwarp_ctrl)); 211 212 /* NBO */ 213 c_tx->pkt.rresp.sink_stag = cpu_to_be32(wqe->sqe.rkey); 214 c_tx->pkt.rresp.sink_to = cpu_to_be64(wqe->sqe.raddr); 215 216 c_tx->ctrl_len = sizeof(struct iwarp_rdma_rresp); 217 218 crc = (char *)&c_tx->pkt.write_pkt.crc; 219 data = siw_try_1seg(c_tx, (u64)crc); 220 break; 221 222 default: 223 siw_dbg_qp(tx_qp(c_tx), "stale wqe type %d\n", tx_type(wqe)); 224 return -EOPNOTSUPP; 225 } 226 if (unlikely(data < 0)) 227 return data; 228 229 c_tx->ctrl_sent = 0; 230 231 if (data <= MAX_HDR_INLINE) { 232 if (data) { 233 wqe->processed = data; 234 235 c_tx->pkt.ctrl.mpa_len = 236 htons(c_tx->ctrl_len + data - MPA_HDR_SIZE); 237 238 /* Add pad, if needed */ 239 data += -(int)data & 0x3; 240 /* advance CRC location after payload */ 241 crc += data; 242 c_tx->ctrl_len += data; 243 244 if (!(c_tx->pkt.ctrl.ddp_rdmap_ctrl & DDP_FLAG_TAGGED)) 245 c_tx->pkt.c_untagged.ddp_mo = 0; 246 else 247 c_tx->pkt.c_tagged.ddp_to = 248 cpu_to_be64(wqe->sqe.raddr); 249 } 250 251 *(u32 *)crc = 0; 252 /* 253 * Do complete CRC if enabled and short packet 254 */ 255 if (c_tx->mpa_crc_hd) { 256 crypto_shash_init(c_tx->mpa_crc_hd); 257 if (crypto_shash_update(c_tx->mpa_crc_hd, 258 (u8 *)&c_tx->pkt, 259 c_tx->ctrl_len)) 260 return -EINVAL; 261 crypto_shash_final(c_tx->mpa_crc_hd, (u8 *)crc); 262 } 263 c_tx->ctrl_len += MPA_CRC_SIZE; 264 265 return PKT_COMPLETE; 266 } 267 c_tx->ctrl_len += MPA_CRC_SIZE; 268 c_tx->sge_idx = 0; 269 c_tx->sge_off = 0; 270 c_tx->pbl_idx = 0; 271 272 /* 273 * Allow direct sending out of user buffer if WR is non signalled 274 * and payload is over threshold. 275 * Per RDMA verbs, the application should not change the send buffer 276 * until the work completed. In iWarp, work completion is only 277 * local delivery to TCP. TCP may reuse the buffer for 278 * retransmission. Changing unsent data also breaks the CRC, 279 * if applied. 280 */ 281 if (c_tx->zcopy_tx && wqe->bytes >= SENDPAGE_THRESH && 282 !(tx_flags(wqe) & SIW_WQE_SIGNALLED)) 283 c_tx->use_sendpage = 1; 284 else 285 c_tx->use_sendpage = 0; 286 287 return PKT_FRAGMENTED; 288 } 289 290 /* 291 * Send out one complete control type FPDU, or header of FPDU carrying 292 * data. Used for fixed sized packets like Read.Requests or zero length 293 * SENDs, WRITEs, READ.Responses, or header only. 294 */ 295 static int siw_tx_ctrl(struct siw_iwarp_tx *c_tx, struct socket *s, 296 int flags) 297 { 298 struct msghdr msg = { .msg_flags = flags }; 299 struct kvec iov = { .iov_base = 300 (char *)&c_tx->pkt.ctrl + c_tx->ctrl_sent, 301 .iov_len = c_tx->ctrl_len - c_tx->ctrl_sent }; 302 303 int rv = kernel_sendmsg(s, &msg, &iov, 1, 304 c_tx->ctrl_len - c_tx->ctrl_sent); 305 306 if (rv >= 0) { 307 c_tx->ctrl_sent += rv; 308 309 if (c_tx->ctrl_sent == c_tx->ctrl_len) 310 rv = 0; 311 else 312 rv = -EAGAIN; 313 } 314 return rv; 315 } 316 317 /* 318 * 0copy TCP transmit interface: Use do_tcp_sendpages. 319 * 320 * Using sendpage to push page by page appears to be less efficient 321 * than using sendmsg, even if data are copied. 322 * 323 * A general performance limitation might be the extra four bytes 324 * trailer checksum segment to be pushed after user data. 325 */ 326 static int siw_tcp_sendpages(struct socket *s, struct page **page, int offset, 327 size_t size) 328 { 329 struct sock *sk = s->sk; 330 int i = 0, rv = 0, sent = 0, 331 flags = MSG_MORE | MSG_DONTWAIT | MSG_SENDPAGE_NOTLAST; 332 333 while (size) { 334 size_t bytes = min_t(size_t, PAGE_SIZE - offset, size); 335 336 if (size + offset <= PAGE_SIZE) 337 flags = MSG_MORE | MSG_DONTWAIT; 338 339 tcp_rate_check_app_limited(sk); 340 try_page_again: 341 lock_sock(sk); 342 rv = do_tcp_sendpages(sk, page[i], offset, bytes, flags); 343 release_sock(sk); 344 345 if (rv > 0) { 346 size -= rv; 347 sent += rv; 348 if (rv != bytes) { 349 offset += rv; 350 bytes -= rv; 351 goto try_page_again; 352 } 353 offset = 0; 354 } else { 355 if (rv == -EAGAIN || rv == 0) 356 break; 357 return rv; 358 } 359 i++; 360 } 361 return sent; 362 } 363 364 /* 365 * siw_0copy_tx() 366 * 367 * Pushes list of pages to TCP socket. If pages from multiple 368 * SGE's, all referenced pages of each SGE are pushed in one 369 * shot. 370 */ 371 static int siw_0copy_tx(struct socket *s, struct page **page, 372 struct siw_sge *sge, unsigned int offset, 373 unsigned int size) 374 { 375 int i = 0, sent = 0, rv; 376 int sge_bytes = min(sge->length - offset, size); 377 378 offset = (sge->laddr + offset) & ~PAGE_MASK; 379 380 while (sent != size) { 381 rv = siw_tcp_sendpages(s, &page[i], offset, sge_bytes); 382 if (rv >= 0) { 383 sent += rv; 384 if (size == sent || sge_bytes > rv) 385 break; 386 387 i += PAGE_ALIGN(sge_bytes + offset) >> PAGE_SHIFT; 388 sge++; 389 sge_bytes = min(sge->length, size - sent); 390 offset = sge->laddr & ~PAGE_MASK; 391 } else { 392 sent = rv; 393 break; 394 } 395 } 396 return sent; 397 } 398 399 #define MAX_TRAILER (MPA_CRC_SIZE + 4) 400 401 static void siw_unmap_pages(struct page **pages, int hdr_len, int num_maps) 402 { 403 if (hdr_len) { 404 ++pages; 405 --num_maps; 406 } 407 while (num_maps-- > 0) { 408 kunmap(*pages); 409 pages++; 410 } 411 } 412 413 /* 414 * siw_tx_hdt() tries to push a complete packet to TCP where all 415 * packet fragments are referenced by the elements of one iovec. 416 * For the data portion, each involved page must be referenced by 417 * one extra element. All sge's data can be non-aligned to page 418 * boundaries. Two more elements are referencing iWARP header 419 * and trailer: 420 * MAX_ARRAY = 64KB/PAGE_SIZE + 1 + (2 * (SIW_MAX_SGE - 1) + HDR + TRL 421 */ 422 #define MAX_ARRAY ((0xffff / PAGE_SIZE) + 1 + (2 * (SIW_MAX_SGE - 1) + 2)) 423 424 /* 425 * Write out iov referencing hdr, data and trailer of current FPDU. 426 * Update transmit state dependent on write return status 427 */ 428 static int siw_tx_hdt(struct siw_iwarp_tx *c_tx, struct socket *s) 429 { 430 struct siw_wqe *wqe = &c_tx->wqe_active; 431 struct siw_sge *sge = &wqe->sqe.sge[c_tx->sge_idx]; 432 struct kvec iov[MAX_ARRAY]; 433 struct page *page_array[MAX_ARRAY]; 434 struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_EOR }; 435 436 int seg = 0, do_crc = c_tx->do_crc, is_kva = 0, rv; 437 unsigned int data_len = c_tx->bytes_unsent, hdr_len = 0, trl_len = 0, 438 sge_off = c_tx->sge_off, sge_idx = c_tx->sge_idx, 439 pbl_idx = c_tx->pbl_idx; 440 441 if (c_tx->state == SIW_SEND_HDR) { 442 if (c_tx->use_sendpage) { 443 rv = siw_tx_ctrl(c_tx, s, MSG_DONTWAIT | MSG_MORE); 444 if (rv) 445 goto done; 446 447 c_tx->state = SIW_SEND_DATA; 448 } else { 449 iov[0].iov_base = 450 (char *)&c_tx->pkt.ctrl + c_tx->ctrl_sent; 451 iov[0].iov_len = hdr_len = 452 c_tx->ctrl_len - c_tx->ctrl_sent; 453 seg = 1; 454 } 455 } 456 457 wqe->processed += data_len; 458 459 while (data_len) { /* walk the list of SGE's */ 460 unsigned int sge_len = min(sge->length - sge_off, data_len); 461 unsigned int fp_off = (sge->laddr + sge_off) & ~PAGE_MASK; 462 struct siw_mem *mem; 463 464 if (!(tx_flags(wqe) & SIW_WQE_INLINE)) { 465 mem = wqe->mem[sge_idx]; 466 if (!mem->mem_obj) 467 is_kva = 1; 468 } else { 469 is_kva = 1; 470 } 471 if (is_kva && !c_tx->use_sendpage) { 472 /* 473 * tx from kernel virtual address: either inline data 474 * or memory region with assigned kernel buffer 475 */ 476 iov[seg].iov_base = (void *)(sge->laddr + sge_off); 477 iov[seg].iov_len = sge_len; 478 479 if (do_crc) 480 crypto_shash_update(c_tx->mpa_crc_hd, 481 iov[seg].iov_base, 482 sge_len); 483 sge_off += sge_len; 484 data_len -= sge_len; 485 seg++; 486 goto sge_done; 487 } 488 489 while (sge_len) { 490 size_t plen = min((int)PAGE_SIZE - fp_off, sge_len); 491 492 if (!is_kva) { 493 struct page *p; 494 495 if (mem->is_pbl) 496 p = siw_get_pblpage( 497 mem, sge->laddr + sge_off, 498 &pbl_idx); 499 else 500 p = siw_get_upage(mem->umem, 501 sge->laddr + sge_off); 502 if (unlikely(!p)) { 503 if (hdr_len) 504 seg--; 505 if (!c_tx->use_sendpage && seg) { 506 siw_unmap_pages(page_array, 507 hdr_len, seg); 508 } 509 wqe->processed -= c_tx->bytes_unsent; 510 rv = -EFAULT; 511 goto done_crc; 512 } 513 page_array[seg] = p; 514 515 if (!c_tx->use_sendpage) { 516 iov[seg].iov_base = kmap(p) + fp_off; 517 iov[seg].iov_len = plen; 518 if (do_crc) 519 crypto_shash_update( 520 c_tx->mpa_crc_hd, 521 iov[seg].iov_base, 522 plen); 523 } else if (do_crc) 524 crypto_shash_update( 525 c_tx->mpa_crc_hd, 526 page_address(p) + fp_off, 527 plen); 528 } else { 529 u64 pa = ((sge->laddr + sge_off) & PAGE_MASK); 530 531 page_array[seg] = virt_to_page(pa); 532 if (do_crc) 533 crypto_shash_update( 534 c_tx->mpa_crc_hd, 535 (void *)(sge->laddr + sge_off), 536 plen); 537 } 538 539 sge_len -= plen; 540 sge_off += plen; 541 data_len -= plen; 542 fp_off = 0; 543 544 if (++seg > (int)MAX_ARRAY) { 545 siw_dbg_qp(tx_qp(c_tx), "to many fragments\n"); 546 if (!is_kva && !c_tx->use_sendpage) { 547 siw_unmap_pages(page_array, hdr_len, 548 seg - 1); 549 } 550 wqe->processed -= c_tx->bytes_unsent; 551 rv = -EMSGSIZE; 552 goto done_crc; 553 } 554 } 555 sge_done: 556 /* Update SGE variables at end of SGE */ 557 if (sge_off == sge->length && 558 (data_len != 0 || wqe->processed < wqe->bytes)) { 559 sge_idx++; 560 sge++; 561 sge_off = 0; 562 } 563 } 564 /* trailer */ 565 if (likely(c_tx->state != SIW_SEND_TRAILER)) { 566 iov[seg].iov_base = &c_tx->trailer.pad[4 - c_tx->pad]; 567 iov[seg].iov_len = trl_len = MAX_TRAILER - (4 - c_tx->pad); 568 } else { 569 iov[seg].iov_base = &c_tx->trailer.pad[c_tx->ctrl_sent]; 570 iov[seg].iov_len = trl_len = MAX_TRAILER - c_tx->ctrl_sent; 571 } 572 573 if (c_tx->pad) { 574 *(u32 *)c_tx->trailer.pad = 0; 575 if (do_crc) 576 crypto_shash_update(c_tx->mpa_crc_hd, 577 (u8 *)&c_tx->trailer.crc - c_tx->pad, 578 c_tx->pad); 579 } 580 if (!c_tx->mpa_crc_hd) 581 c_tx->trailer.crc = 0; 582 else if (do_crc) 583 crypto_shash_final(c_tx->mpa_crc_hd, (u8 *)&c_tx->trailer.crc); 584 585 data_len = c_tx->bytes_unsent; 586 587 if (c_tx->use_sendpage) { 588 rv = siw_0copy_tx(s, page_array, &wqe->sqe.sge[c_tx->sge_idx], 589 c_tx->sge_off, data_len); 590 if (rv == data_len) { 591 rv = kernel_sendmsg(s, &msg, &iov[seg], 1, trl_len); 592 if (rv > 0) 593 rv += data_len; 594 else 595 rv = data_len; 596 } 597 } else { 598 rv = kernel_sendmsg(s, &msg, iov, seg + 1, 599 hdr_len + data_len + trl_len); 600 if (!is_kva) 601 siw_unmap_pages(page_array, hdr_len, seg); 602 } 603 if (rv < (int)hdr_len) { 604 /* Not even complete hdr pushed or negative rv */ 605 wqe->processed -= data_len; 606 if (rv >= 0) { 607 c_tx->ctrl_sent += rv; 608 rv = -EAGAIN; 609 } 610 goto done_crc; 611 } 612 rv -= hdr_len; 613 614 if (rv >= (int)data_len) { 615 /* all user data pushed to TCP or no data to push */ 616 if (data_len > 0 && wqe->processed < wqe->bytes) { 617 /* Save the current state for next tx */ 618 c_tx->sge_idx = sge_idx; 619 c_tx->sge_off = sge_off; 620 c_tx->pbl_idx = pbl_idx; 621 } 622 rv -= data_len; 623 624 if (rv == trl_len) /* all pushed */ 625 rv = 0; 626 else { 627 c_tx->state = SIW_SEND_TRAILER; 628 c_tx->ctrl_len = MAX_TRAILER; 629 c_tx->ctrl_sent = rv + 4 - c_tx->pad; 630 c_tx->bytes_unsent = 0; 631 rv = -EAGAIN; 632 } 633 634 } else if (data_len > 0) { 635 /* Maybe some user data pushed to TCP */ 636 c_tx->state = SIW_SEND_DATA; 637 wqe->processed -= data_len - rv; 638 639 if (rv) { 640 /* 641 * Some bytes out. Recompute tx state based 642 * on old state and bytes pushed 643 */ 644 unsigned int sge_unsent; 645 646 c_tx->bytes_unsent -= rv; 647 sge = &wqe->sqe.sge[c_tx->sge_idx]; 648 sge_unsent = sge->length - c_tx->sge_off; 649 650 while (sge_unsent <= rv) { 651 rv -= sge_unsent; 652 c_tx->sge_idx++; 653 c_tx->sge_off = 0; 654 sge++; 655 sge_unsent = sge->length; 656 } 657 c_tx->sge_off += rv; 658 } 659 rv = -EAGAIN; 660 } 661 done_crc: 662 c_tx->do_crc = 0; 663 done: 664 return rv; 665 } 666 667 static void siw_update_tcpseg(struct siw_iwarp_tx *c_tx, 668 struct socket *s) 669 { 670 struct tcp_sock *tp = tcp_sk(s->sk); 671 672 if (tp->gso_segs) { 673 if (c_tx->gso_seg_limit == 0) 674 c_tx->tcp_seglen = tp->mss_cache * tp->gso_segs; 675 else 676 c_tx->tcp_seglen = 677 tp->mss_cache * 678 min_t(u16, c_tx->gso_seg_limit, tp->gso_segs); 679 } else { 680 c_tx->tcp_seglen = tp->mss_cache; 681 } 682 /* Loopback may give odd numbers */ 683 c_tx->tcp_seglen &= 0xfffffff8; 684 } 685 686 /* 687 * siw_prepare_fpdu() 688 * 689 * Prepares transmit context to send out one FPDU if FPDU will contain 690 * user data and user data are not immediate data. 691 * Computes maximum FPDU length to fill up TCP MSS if possible. 692 * 693 * @qp: QP from which to transmit 694 * @wqe: Current WQE causing transmission 695 * 696 * TODO: Take into account real available sendspace on socket 697 * to avoid header misalignment due to send pausing within 698 * fpdu transmission 699 */ 700 static void siw_prepare_fpdu(struct siw_qp *qp, struct siw_wqe *wqe) 701 { 702 struct siw_iwarp_tx *c_tx = &qp->tx_ctx; 703 int data_len; 704 705 c_tx->ctrl_len = 706 iwarp_pktinfo[__rdmap_get_opcode(&c_tx->pkt.ctrl)].hdr_len; 707 c_tx->ctrl_sent = 0; 708 709 /* 710 * Update target buffer offset if any 711 */ 712 if (!(c_tx->pkt.ctrl.ddp_rdmap_ctrl & DDP_FLAG_TAGGED)) 713 /* Untagged message */ 714 c_tx->pkt.c_untagged.ddp_mo = cpu_to_be32(wqe->processed); 715 else /* Tagged message */ 716 c_tx->pkt.c_tagged.ddp_to = 717 cpu_to_be64(wqe->sqe.raddr + wqe->processed); 718 719 data_len = wqe->bytes - wqe->processed; 720 if (data_len + c_tx->ctrl_len + MPA_CRC_SIZE > c_tx->tcp_seglen) { 721 /* Trim DDP payload to fit into current TCP segment */ 722 data_len = c_tx->tcp_seglen - (c_tx->ctrl_len + MPA_CRC_SIZE); 723 c_tx->pkt.ctrl.ddp_rdmap_ctrl &= ~DDP_FLAG_LAST; 724 c_tx->pad = 0; 725 } else { 726 c_tx->pkt.ctrl.ddp_rdmap_ctrl |= DDP_FLAG_LAST; 727 c_tx->pad = -data_len & 0x3; 728 } 729 c_tx->bytes_unsent = data_len; 730 731 c_tx->pkt.ctrl.mpa_len = 732 htons(c_tx->ctrl_len + data_len - MPA_HDR_SIZE); 733 734 /* 735 * Init MPA CRC computation 736 */ 737 if (c_tx->mpa_crc_hd) { 738 crypto_shash_init(c_tx->mpa_crc_hd); 739 crypto_shash_update(c_tx->mpa_crc_hd, (u8 *)&c_tx->pkt, 740 c_tx->ctrl_len); 741 c_tx->do_crc = 1; 742 } 743 } 744 745 /* 746 * siw_check_sgl_tx() 747 * 748 * Check permissions for a list of SGE's (SGL). 749 * A successful check will have all memory referenced 750 * for transmission resolved and assigned to the WQE. 751 * 752 * @pd: Protection Domain SGL should belong to 753 * @wqe: WQE to be checked 754 * @perms: requested access permissions 755 * 756 */ 757 758 static int siw_check_sgl_tx(struct ib_pd *pd, struct siw_wqe *wqe, 759 enum ib_access_flags perms) 760 { 761 struct siw_sge *sge = &wqe->sqe.sge[0]; 762 int i, len, num_sge = wqe->sqe.num_sge; 763 764 if (unlikely(num_sge > SIW_MAX_SGE)) 765 return -EINVAL; 766 767 for (i = 0, len = 0; num_sge; num_sge--, i++, sge++) { 768 /* 769 * rdma verbs: do not check stag for a zero length sge 770 */ 771 if (sge->length) { 772 int rv = siw_check_sge(pd, sge, &wqe->mem[i], perms, 0, 773 sge->length); 774 775 if (unlikely(rv != E_ACCESS_OK)) 776 return rv; 777 } 778 len += sge->length; 779 } 780 return len; 781 } 782 783 /* 784 * siw_qp_sq_proc_tx() 785 * 786 * Process one WQE which needs transmission on the wire. 787 */ 788 static int siw_qp_sq_proc_tx(struct siw_qp *qp, struct siw_wqe *wqe) 789 { 790 struct siw_iwarp_tx *c_tx = &qp->tx_ctx; 791 struct socket *s = qp->attrs.sk; 792 int rv = 0, burst_len = qp->tx_ctx.burst; 793 enum rdmap_ecode ecode = RDMAP_ECODE_CATASTROPHIC_STREAM; 794 795 if (unlikely(wqe->wr_status == SIW_WR_IDLE)) 796 return 0; 797 798 if (!burst_len) 799 burst_len = SQ_USER_MAXBURST; 800 801 if (wqe->wr_status == SIW_WR_QUEUED) { 802 if (!(wqe->sqe.flags & SIW_WQE_INLINE)) { 803 if (tx_type(wqe) == SIW_OP_READ_RESPONSE) 804 wqe->sqe.num_sge = 1; 805 806 if (tx_type(wqe) != SIW_OP_READ && 807 tx_type(wqe) != SIW_OP_READ_LOCAL_INV) { 808 /* 809 * Reference memory to be tx'd w/o checking 810 * access for LOCAL_READ permission, since 811 * not defined in RDMA core. 812 */ 813 rv = siw_check_sgl_tx(qp->pd, wqe, 0); 814 if (rv < 0) { 815 if (tx_type(wqe) == 816 SIW_OP_READ_RESPONSE) 817 ecode = siw_rdmap_error(-rv); 818 rv = -EINVAL; 819 goto tx_error; 820 } 821 wqe->bytes = rv; 822 } else { 823 wqe->bytes = 0; 824 } 825 } else { 826 wqe->bytes = wqe->sqe.sge[0].length; 827 if (!qp->kernel_verbs) { 828 if (wqe->bytes > SIW_MAX_INLINE) { 829 rv = -EINVAL; 830 goto tx_error; 831 } 832 wqe->sqe.sge[0].laddr = (u64)&wqe->sqe.sge[1]; 833 } 834 } 835 wqe->wr_status = SIW_WR_INPROGRESS; 836 wqe->processed = 0; 837 838 siw_update_tcpseg(c_tx, s); 839 840 rv = siw_qp_prepare_tx(c_tx); 841 if (rv == PKT_FRAGMENTED) { 842 c_tx->state = SIW_SEND_HDR; 843 siw_prepare_fpdu(qp, wqe); 844 } else if (rv == PKT_COMPLETE) { 845 c_tx->state = SIW_SEND_SHORT_FPDU; 846 } else { 847 goto tx_error; 848 } 849 } 850 851 next_segment: 852 siw_dbg_qp(qp, "wr type %d, state %d, data %u, sent %u, id %llx\n", 853 tx_type(wqe), wqe->wr_status, wqe->bytes, wqe->processed, 854 wqe->sqe.id); 855 856 if (--burst_len == 0) { 857 rv = -EINPROGRESS; 858 goto tx_done; 859 } 860 if (c_tx->state == SIW_SEND_SHORT_FPDU) { 861 enum siw_opcode tx_type = tx_type(wqe); 862 unsigned int msg_flags; 863 864 if (siw_sq_empty(qp) || !siw_tcp_nagle || burst_len == 1) 865 /* 866 * End current TCP segment, if SQ runs empty, 867 * or siw_tcp_nagle is not set, or we bail out 868 * soon due to no burst credit left. 869 */ 870 msg_flags = MSG_DONTWAIT; 871 else 872 msg_flags = MSG_DONTWAIT | MSG_MORE; 873 874 rv = siw_tx_ctrl(c_tx, s, msg_flags); 875 876 if (!rv && tx_type != SIW_OP_READ && 877 tx_type != SIW_OP_READ_LOCAL_INV) 878 wqe->processed = wqe->bytes; 879 880 goto tx_done; 881 882 } else { 883 rv = siw_tx_hdt(c_tx, s); 884 } 885 if (!rv) { 886 /* 887 * One segment sent. Processing completed if last 888 * segment, Do next segment otherwise. 889 */ 890 if (unlikely(c_tx->tx_suspend)) { 891 /* 892 * Verbs, 6.4.: Try stopping sending after a full 893 * DDP segment if the connection goes down 894 * (== peer halfclose) 895 */ 896 rv = -ECONNABORTED; 897 goto tx_done; 898 } 899 if (c_tx->pkt.ctrl.ddp_rdmap_ctrl & DDP_FLAG_LAST) { 900 siw_dbg_qp(qp, "WQE completed\n"); 901 goto tx_done; 902 } 903 c_tx->state = SIW_SEND_HDR; 904 905 siw_update_tcpseg(c_tx, s); 906 907 siw_prepare_fpdu(qp, wqe); 908 goto next_segment; 909 } 910 tx_done: 911 qp->tx_ctx.burst = burst_len; 912 return rv; 913 914 tx_error: 915 if (ecode != RDMAP_ECODE_CATASTROPHIC_STREAM) 916 siw_init_terminate(qp, TERM_ERROR_LAYER_RDMAP, 917 RDMAP_ETYPE_REMOTE_PROTECTION, ecode, 1); 918 else 919 siw_init_terminate(qp, TERM_ERROR_LAYER_RDMAP, 920 RDMAP_ETYPE_CATASTROPHIC, 921 RDMAP_ECODE_UNSPECIFIED, 1); 922 return rv; 923 } 924 925 static int siw_fastreg_mr(struct ib_pd *pd, struct siw_sqe *sqe) 926 { 927 struct ib_mr *base_mr = (struct ib_mr *)sqe->base_mr; 928 struct siw_device *sdev = to_siw_dev(pd->device); 929 struct siw_mem *mem = siw_mem_id2obj(sdev, sqe->rkey >> 8); 930 int rv = 0; 931 932 siw_dbg_pd(pd, "STag 0x%08x\n", sqe->rkey); 933 934 if (unlikely(!mem || !base_mr)) { 935 pr_warn("siw: fastreg: STag 0x%08x unknown\n", sqe->rkey); 936 return -EINVAL; 937 } 938 if (unlikely(base_mr->rkey >> 8 != sqe->rkey >> 8)) { 939 pr_warn("siw: fastreg: STag 0x%08x: bad MR\n", sqe->rkey); 940 rv = -EINVAL; 941 goto out; 942 } 943 if (unlikely(mem->pd != pd)) { 944 pr_warn("siw: fastreg: PD mismatch\n"); 945 rv = -EINVAL; 946 goto out; 947 } 948 if (unlikely(mem->stag_valid)) { 949 pr_warn("siw: fastreg: STag 0x%08x already valid\n", sqe->rkey); 950 rv = -EINVAL; 951 goto out; 952 } 953 /* Refresh STag since user may have changed key part */ 954 mem->stag = sqe->rkey; 955 mem->perms = sqe->access; 956 957 siw_dbg_mem(mem, "STag now valid, MR va: 0x%016llx -> 0x%016llx\n", 958 mem->va, base_mr->iova); 959 mem->va = base_mr->iova; 960 mem->stag_valid = 1; 961 out: 962 siw_mem_put(mem); 963 return rv; 964 } 965 966 static int siw_qp_sq_proc_local(struct siw_qp *qp, struct siw_wqe *wqe) 967 { 968 int rv; 969 970 switch (tx_type(wqe)) { 971 case SIW_OP_REG_MR: 972 rv = siw_fastreg_mr(qp->pd, &wqe->sqe); 973 break; 974 975 case SIW_OP_INVAL_STAG: 976 rv = siw_invalidate_stag(qp->pd, wqe->sqe.rkey); 977 break; 978 979 default: 980 rv = -EINVAL; 981 } 982 return rv; 983 } 984 985 /* 986 * siw_qp_sq_process() 987 * 988 * Core TX path routine for RDMAP/DDP/MPA using a TCP kernel socket. 989 * Sends RDMAP payload for the current SQ WR @wqe of @qp in one or more 990 * MPA FPDUs, each containing a DDP segment. 991 * 992 * SQ processing may occur in user context as a result of posting 993 * new WQE's or from siw_sq_work_handler() context. Processing in 994 * user context is limited to non-kernel verbs users. 995 * 996 * SQ processing may get paused anytime, possibly in the middle of a WR 997 * or FPDU, if insufficient send space is available. SQ processing 998 * gets resumed from siw_sq_work_handler(), if send space becomes 999 * available again. 1000 * 1001 * Must be called with the QP state read-locked. 1002 * 1003 * Note: 1004 * An outbound RREQ can be satisfied by the corresponding RRESP 1005 * _before_ it gets assigned to the ORQ. This happens regularly 1006 * in RDMA READ via loopback case. Since both outbound RREQ and 1007 * inbound RRESP can be handled by the same CPU, locking the ORQ 1008 * is dead-lock prone and thus not an option. With that, the 1009 * RREQ gets assigned to the ORQ _before_ being sent - see 1010 * siw_activate_tx() - and pulled back in case of send failure. 1011 */ 1012 int siw_qp_sq_process(struct siw_qp *qp) 1013 { 1014 struct siw_wqe *wqe = tx_wqe(qp); 1015 enum siw_opcode tx_type; 1016 unsigned long flags; 1017 int rv = 0; 1018 1019 siw_dbg_qp(qp, "enter for type %d\n", tx_type(wqe)); 1020 1021 next_wqe: 1022 /* 1023 * Stop QP processing if SQ state changed 1024 */ 1025 if (unlikely(qp->tx_ctx.tx_suspend)) { 1026 siw_dbg_qp(qp, "tx suspended\n"); 1027 goto done; 1028 } 1029 tx_type = tx_type(wqe); 1030 1031 if (tx_type <= SIW_OP_READ_RESPONSE) 1032 rv = siw_qp_sq_proc_tx(qp, wqe); 1033 else 1034 rv = siw_qp_sq_proc_local(qp, wqe); 1035 1036 if (!rv) { 1037 /* 1038 * WQE processing done 1039 */ 1040 switch (tx_type) { 1041 case SIW_OP_SEND: 1042 case SIW_OP_SEND_REMOTE_INV: 1043 case SIW_OP_WRITE: 1044 siw_wqe_put_mem(wqe, tx_type); 1045 /* Fall through */ 1046 1047 case SIW_OP_INVAL_STAG: 1048 case SIW_OP_REG_MR: 1049 if (tx_flags(wqe) & SIW_WQE_SIGNALLED) 1050 siw_sqe_complete(qp, &wqe->sqe, wqe->bytes, 1051 SIW_WC_SUCCESS); 1052 break; 1053 1054 case SIW_OP_READ: 1055 case SIW_OP_READ_LOCAL_INV: 1056 /* 1057 * already enqueued to ORQ queue 1058 */ 1059 break; 1060 1061 case SIW_OP_READ_RESPONSE: 1062 siw_wqe_put_mem(wqe, tx_type); 1063 break; 1064 1065 default: 1066 WARN(1, "undefined WQE type %d\n", tx_type); 1067 rv = -EINVAL; 1068 goto done; 1069 } 1070 1071 spin_lock_irqsave(&qp->sq_lock, flags); 1072 wqe->wr_status = SIW_WR_IDLE; 1073 rv = siw_activate_tx(qp); 1074 spin_unlock_irqrestore(&qp->sq_lock, flags); 1075 1076 if (rv <= 0) 1077 goto done; 1078 1079 goto next_wqe; 1080 1081 } else if (rv == -EAGAIN) { 1082 siw_dbg_qp(qp, "sq paused: hd/tr %d of %d, data %d\n", 1083 qp->tx_ctx.ctrl_sent, qp->tx_ctx.ctrl_len, 1084 qp->tx_ctx.bytes_unsent); 1085 rv = 0; 1086 goto done; 1087 } else if (rv == -EINPROGRESS) { 1088 rv = siw_sq_start(qp); 1089 goto done; 1090 } else { 1091 /* 1092 * WQE processing failed. 1093 * Verbs 8.3.2: 1094 * o It turns any WQE into a signalled WQE. 1095 * o Local catastrophic error must be surfaced 1096 * o QP must be moved into Terminate state: done by code 1097 * doing socket state change processing 1098 * 1099 * o TODO: Termination message must be sent. 1100 * o TODO: Implement more precise work completion errors, 1101 * see enum ib_wc_status in ib_verbs.h 1102 */ 1103 siw_dbg_qp(qp, "wqe type %d processing failed: %d\n", 1104 tx_type(wqe), rv); 1105 1106 spin_lock_irqsave(&qp->sq_lock, flags); 1107 /* 1108 * RREQ may have already been completed by inbound RRESP! 1109 */ 1110 if (tx_type == SIW_OP_READ || 1111 tx_type == SIW_OP_READ_LOCAL_INV) { 1112 /* Cleanup pending entry in ORQ */ 1113 qp->orq_put--; 1114 qp->orq[qp->orq_put % qp->attrs.orq_size].flags = 0; 1115 } 1116 spin_unlock_irqrestore(&qp->sq_lock, flags); 1117 /* 1118 * immediately suspends further TX processing 1119 */ 1120 if (!qp->tx_ctx.tx_suspend) 1121 siw_qp_cm_drop(qp, 0); 1122 1123 switch (tx_type) { 1124 case SIW_OP_SEND: 1125 case SIW_OP_SEND_REMOTE_INV: 1126 case SIW_OP_SEND_WITH_IMM: 1127 case SIW_OP_WRITE: 1128 case SIW_OP_READ: 1129 case SIW_OP_READ_LOCAL_INV: 1130 siw_wqe_put_mem(wqe, tx_type); 1131 /* Fall through */ 1132 1133 case SIW_OP_INVAL_STAG: 1134 case SIW_OP_REG_MR: 1135 siw_sqe_complete(qp, &wqe->sqe, wqe->bytes, 1136 SIW_WC_LOC_QP_OP_ERR); 1137 1138 siw_qp_event(qp, IB_EVENT_QP_FATAL); 1139 1140 break; 1141 1142 case SIW_OP_READ_RESPONSE: 1143 siw_dbg_qp(qp, "proc. read.response failed: %d\n", rv); 1144 1145 siw_qp_event(qp, IB_EVENT_QP_REQ_ERR); 1146 1147 siw_wqe_put_mem(wqe, SIW_OP_READ_RESPONSE); 1148 1149 break; 1150 1151 default: 1152 WARN(1, "undefined WQE type %d\n", tx_type); 1153 rv = -EINVAL; 1154 } 1155 wqe->wr_status = SIW_WR_IDLE; 1156 } 1157 done: 1158 return rv; 1159 } 1160 1161 static void siw_sq_resume(struct siw_qp *qp) 1162 { 1163 if (down_read_trylock(&qp->state_lock)) { 1164 if (likely(qp->attrs.state == SIW_QP_STATE_RTS && 1165 !qp->tx_ctx.tx_suspend)) { 1166 int rv = siw_qp_sq_process(qp); 1167 1168 up_read(&qp->state_lock); 1169 1170 if (unlikely(rv < 0)) { 1171 siw_dbg_qp(qp, "SQ task failed: err %d\n", rv); 1172 1173 if (!qp->tx_ctx.tx_suspend) 1174 siw_qp_cm_drop(qp, 0); 1175 } 1176 } else { 1177 up_read(&qp->state_lock); 1178 } 1179 } else { 1180 siw_dbg_qp(qp, "Resume SQ while QP locked\n"); 1181 } 1182 siw_qp_put(qp); 1183 } 1184 1185 struct tx_task_t { 1186 struct llist_head active; 1187 wait_queue_head_t waiting; 1188 }; 1189 1190 static DEFINE_PER_CPU(struct tx_task_t, siw_tx_task_g); 1191 1192 void siw_stop_tx_thread(int nr_cpu) 1193 { 1194 kthread_stop(siw_tx_thread[nr_cpu]); 1195 wake_up(&per_cpu(siw_tx_task_g, nr_cpu).waiting); 1196 } 1197 1198 int siw_run_sq(void *data) 1199 { 1200 const int nr_cpu = (unsigned int)(long)data; 1201 struct llist_node *active; 1202 struct siw_qp *qp; 1203 struct tx_task_t *tx_task = &per_cpu(siw_tx_task_g, nr_cpu); 1204 1205 init_llist_head(&tx_task->active); 1206 init_waitqueue_head(&tx_task->waiting); 1207 1208 while (1) { 1209 struct llist_node *fifo_list = NULL; 1210 1211 wait_event_interruptible(tx_task->waiting, 1212 !llist_empty(&tx_task->active) || 1213 kthread_should_stop()); 1214 1215 if (kthread_should_stop()) 1216 break; 1217 1218 active = llist_del_all(&tx_task->active); 1219 /* 1220 * llist_del_all returns a list with newest entry first. 1221 * Re-order list for fairness among QP's. 1222 */ 1223 while (active) { 1224 struct llist_node *tmp = active; 1225 1226 active = llist_next(active); 1227 tmp->next = fifo_list; 1228 fifo_list = tmp; 1229 } 1230 while (fifo_list) { 1231 qp = container_of(fifo_list, struct siw_qp, tx_list); 1232 fifo_list = llist_next(fifo_list); 1233 qp->tx_list.next = NULL; 1234 1235 siw_sq_resume(qp); 1236 } 1237 } 1238 active = llist_del_all(&tx_task->active); 1239 if (active) { 1240 llist_for_each_entry(qp, active, tx_list) { 1241 qp->tx_list.next = NULL; 1242 siw_sq_resume(qp); 1243 } 1244 } 1245 return 0; 1246 } 1247 1248 int siw_sq_start(struct siw_qp *qp) 1249 { 1250 if (tx_wqe(qp)->wr_status == SIW_WR_IDLE) 1251 return 0; 1252 1253 if (unlikely(!cpu_online(qp->tx_cpu))) { 1254 siw_put_tx_cpu(qp->tx_cpu); 1255 qp->tx_cpu = siw_get_tx_cpu(qp->sdev); 1256 if (qp->tx_cpu < 0) { 1257 pr_warn("siw: no tx cpu available\n"); 1258 1259 return -EIO; 1260 } 1261 } 1262 siw_qp_get(qp); 1263 1264 llist_add(&qp->tx_list, &per_cpu(siw_tx_task_g, qp->tx_cpu).active); 1265 1266 wake_up(&per_cpu(siw_tx_task_g, qp->tx_cpu).waiting); 1267 1268 return 0; 1269 } 1270