// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause

/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
/* Copyright (c) 2008-2019, IBM Corporation */

#include <linux/errno.h>
#include <linux/types.h>
#include <linux/net.h>
#include <linux/scatterlist.h>
#include <linux/highmem.h>

#include <rdma/iw_cm.h>
#include <rdma/ib_verbs.h>

#include "siw.h"
#include "siw_verbs.h"
#include "siw_mem.h"

/*
 * siw_rx_umem()
 *
 * Receive data of @len into target referenced by @dest_addr.
 *
 * @srx:	Receive Context
 * @umem:	siw representation of target memory
 * @dest_addr:	user virtual address
 * @len:	number of bytes to place
 */
static int siw_rx_umem(struct siw_rx_stream *srx, struct siw_umem *umem,
		       u64 dest_addr, int len)
{
	int copied = 0;

	while (len) {
		struct page *p;
		int pg_off, bytes, rv;
		void *dest;

		p = siw_get_upage(umem, dest_addr);
		if (unlikely(!p)) {
			pr_warn("siw: %s: [QP %u]: bogus addr: %pK, %pK\n",
				__func__, qp_id(rx_qp(srx)),
				(void *)(uintptr_t)dest_addr,
				(void *)(uintptr_t)umem->fp_addr);
			/* siw internal error */
			srx->skb_copied += copied;
			srx->skb_new -= copied;

			return -EFAULT;
		}
		pg_off = dest_addr & ~PAGE_MASK;
		bytes = min(len, (int)PAGE_SIZE - pg_off);

		siw_dbg_qp(rx_qp(srx), "page %pK, bytes=%u\n", p, bytes);

		dest = kmap_atomic(p);
		rv = skb_copy_bits(srx->skb, srx->skb_offset, dest + pg_off,
				   bytes);

		if (unlikely(rv)) {
			kunmap_atomic(dest);
			srx->skb_copied += copied;
			srx->skb_new -= copied;

			pr_warn("siw: [QP %u]: %s, len %d, page %p, rv %d\n",
				qp_id(rx_qp(srx)), __func__, len, p, rv);

			return -EFAULT;
		}
		if (srx->mpa_crc_hd) {
			if (rx_qp(srx)->kernel_verbs) {
				crypto_shash_update(srx->mpa_crc_hd,
					(u8 *)(dest + pg_off), bytes);
				kunmap_atomic(dest);
			} else {
				kunmap_atomic(dest);
				/*
				 * Do CRC on original, not target buffer.
				 * Some user land applications may
				 * concurrently write the target buffer,
				 * which would yield a broken CRC.
				 * Walking the skb twice is very inefficient.
				 * Folding the CRC into skb_copy_bits()
				 * would be much better, but is currently
				 * not supported.
				 */
				siw_crc_skb(srx, bytes);
			}
		} else {
			kunmap_atomic(dest);
		}
		srx->skb_offset += bytes;
		copied += bytes;
		len -= bytes;
		dest_addr += bytes;
		pg_off = 0;
	}
	srx->skb_copied += copied;
	srx->skb_new -= copied;

	return copied;
}

static int siw_rx_kva(struct siw_rx_stream *srx, void *kva, int len)
{
	int rv;

	siw_dbg_qp(rx_qp(srx), "kva: 0x%pK, len: %u\n", kva, len);

	rv = skb_copy_bits(srx->skb, srx->skb_offset, kva, len);
	if (unlikely(rv)) {
		pr_warn("siw: [QP %u]: %s, len %d, kva 0x%pK, rv %d\n",
			qp_id(rx_qp(srx)), __func__, len, kva, rv);

		return rv;
	}
	if (srx->mpa_crc_hd)
		crypto_shash_update(srx->mpa_crc_hd, (u8 *)kva, len);

	srx->skb_offset += len;
	srx->skb_copied += len;
	srx->skb_new -= len;

	return len;
}

static int siw_rx_pbl(struct siw_rx_stream *srx, int *pbl_idx,
		      struct siw_mem *mem, u64 addr, int len)
{
	struct siw_pbl *pbl = mem->pbl;
	u64 offset = addr - mem->va;
	int copied = 0;

	while (len) {
		int bytes;
		dma_addr_t buf_addr =
			siw_pbl_get_buffer(pbl, offset, &bytes, pbl_idx);
		if (!buf_addr)
			break;

		bytes = min(bytes, len);
		if (siw_rx_kva(srx, (void *)buf_addr, bytes) == bytes) {
			copied += bytes;
			offset += bytes;
			len -= bytes;
		} else {
			break;
		}
	}
	return copied;
}
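
/*
 * Data placement happens via one of the three helpers above, selected
 * by the representation of the target memory: siw_rx_kva() for plain
 * kernel virtual addresses, siw_rx_umem() for pinned user pages, and
 * siw_rx_pbl() for physical buffer lists, which resolves each buffer
 * chunk and hands it to siw_rx_kva().
 */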

/*
 * siw_rresp_check_ntoh()
 *
 * Check incoming RRESP fragment header against expected
 * header values and update expected values for potential next
 * fragment.
 *
 * NOTE: This function must be called only if a RRESP DDP segment
 *       starts but not for fragmented consecutive pieces of an
 *       already started DDP segment.
 */
static int siw_rresp_check_ntoh(struct siw_rx_stream *srx,
				struct siw_rx_fpdu *frx)
{
	struct iwarp_rdma_rresp *rresp = &srx->hdr.rresp;
	struct siw_wqe *wqe = &frx->wqe_active;
	enum ddp_ecode ecode;

	u32 sink_stag = be32_to_cpu(rresp->sink_stag);
	u64 sink_to = be64_to_cpu(rresp->sink_to);

	if (frx->first_ddp_seg) {
		srx->ddp_stag = wqe->sqe.sge[0].lkey;
		srx->ddp_to = wqe->sqe.sge[0].laddr;
		frx->pbl_idx = 0;
	}
	/* Below checks extend beyond the semantics of DDP, and
	 * into RDMAP:
	 * We check if the read response matches exactly the
	 * read request which was sent to the remote peer to
	 * trigger this read response. RFC5040/5041 do not
	 * always have a proper error code for the detected
	 * error cases. We choose 'base or bounds error' for
	 * cases where the inbound STag is valid, but offset
	 * or length do not match our response receive state.
	 */
	if (unlikely(srx->ddp_stag != sink_stag)) {
		pr_warn("siw: [QP %u]: rresp stag: %08x != %08x\n",
			qp_id(rx_qp(srx)), sink_stag, srx->ddp_stag);
		ecode = DDP_ECODE_T_INVALID_STAG;
		goto error;
	}
	if (unlikely(srx->ddp_to != sink_to)) {
		pr_warn("siw: [QP %u]: rresp off: %016llx != %016llx\n",
			qp_id(rx_qp(srx)), (unsigned long long)sink_to,
			(unsigned long long)srx->ddp_to);
		ecode = DDP_ECODE_T_BASE_BOUNDS;
		goto error;
	}
	if (unlikely(!frx->more_ddp_segs &&
		     (wqe->processed + srx->fpdu_part_rem != wqe->bytes))) {
		pr_warn("siw: [QP %u]: rresp len: %d != %d\n",
			qp_id(rx_qp(srx)),
			wqe->processed + srx->fpdu_part_rem, wqe->bytes);
		ecode = DDP_ECODE_T_BASE_BOUNDS;
		goto error;
	}
	return 0;
error:
	siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_DDP,
			   DDP_ETYPE_TAGGED_BUF, ecode, 0);
	return -EINVAL;
}

/*
 * siw_write_check_ntoh()
 *
 * Check incoming WRITE fragment header against expected
 * header values and update expected values for potential next
 * fragment.
 *
 * NOTE: This function must be called only if a WRITE DDP segment
 *       starts but not for fragmented consecutive pieces of an
 *       already started DDP segment.
 */
static int siw_write_check_ntoh(struct siw_rx_stream *srx,
				struct siw_rx_fpdu *frx)
{
	struct iwarp_rdma_write *write = &srx->hdr.rwrite;
	enum ddp_ecode ecode;

	u32 sink_stag = be32_to_cpu(write->sink_stag);
	u64 sink_to = be64_to_cpu(write->sink_to);

	if (frx->first_ddp_seg) {
		srx->ddp_stag = sink_stag;
		srx->ddp_to = sink_to;
		frx->pbl_idx = 0;
	} else {
		if (unlikely(srx->ddp_stag != sink_stag)) {
			pr_warn("siw: [QP %u]: write stag: %08x != %08x\n",
				qp_id(rx_qp(srx)), sink_stag,
				srx->ddp_stag);
			ecode = DDP_ECODE_T_INVALID_STAG;
			goto error;
		}
		if (unlikely(srx->ddp_to != sink_to)) {
			pr_warn("siw: [QP %u]: write off: %016llx != %016llx\n",
				qp_id(rx_qp(srx)),
				(unsigned long long)sink_to,
				(unsigned long long)srx->ddp_to);
			ecode = DDP_ECODE_T_BASE_BOUNDS;
			goto error;
		}
	}
	return 0;
error:
	siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_DDP,
			   DDP_ETYPE_TAGGED_BUF, ecode, 0);
	return -EINVAL;
}

/*
 * siw_send_check_ntoh()
 *
 * Check incoming SEND fragment header against expected
 * header values and update expected MSN if no next
 * fragment expected.
 *
 * NOTE: This function must be called only if a SEND DDP segment
 *       starts but not for fragmented consecutive pieces of an
 *       already started DDP segment.
 */
static int siw_send_check_ntoh(struct siw_rx_stream *srx,
			       struct siw_rx_fpdu *frx)
{
	struct iwarp_send_inv *send = &srx->hdr.send_inv;
	struct siw_wqe *wqe = &frx->wqe_active;
	enum ddp_ecode ecode;

	u32 ddp_msn = be32_to_cpu(send->ddp_msn);
	u32 ddp_mo = be32_to_cpu(send->ddp_mo);
	u32 ddp_qn = be32_to_cpu(send->ddp_qn);

	if (unlikely(ddp_qn != RDMAP_UNTAGGED_QN_SEND)) {
		pr_warn("siw: [QP %u]: invalid ddp qn %d for send\n",
			qp_id(rx_qp(srx)), ddp_qn);
		ecode = DDP_ECODE_UT_INVALID_QN;
		goto error;
	}
	if (unlikely(ddp_msn != srx->ddp_msn[RDMAP_UNTAGGED_QN_SEND])) {
		pr_warn("siw: [QP %u]: send msn: %u != %u\n",
			qp_id(rx_qp(srx)), ddp_msn,
			srx->ddp_msn[RDMAP_UNTAGGED_QN_SEND]);
		ecode = DDP_ECODE_UT_INVALID_MSN_RANGE;
		goto error;
	}
	if (unlikely(ddp_mo != wqe->processed)) {
		pr_warn("siw: [QP %u], send mo: %u != %u\n",
			qp_id(rx_qp(srx)), ddp_mo, wqe->processed);
		ecode = DDP_ECODE_UT_INVALID_MO;
		goto error;
	}
	if (frx->first_ddp_seg) {
		/* initialize user memory write position */
		frx->sge_idx = 0;
		frx->sge_off = 0;
		frx->pbl_idx = 0;

		/* only valid for SEND_INV and SEND_SE_INV operations */
		srx->inval_stag = be32_to_cpu(send->inval_stag);
	}
	if (unlikely(wqe->bytes < wqe->processed + srx->fpdu_part_rem)) {
		siw_dbg_qp(rx_qp(srx), "receive space short: %d - %d < %d\n",
			   wqe->bytes, wqe->processed, srx->fpdu_part_rem);
		wqe->wc_status = SIW_WC_LOC_LEN_ERR;
		ecode = DDP_ECODE_UT_INVALID_MSN_NOBUF;
		goto error;
	}
	return 0;
error:
	siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_DDP,
			   DDP_ETYPE_UNTAGGED_BUF, ecode, 0);
	return -EINVAL;
}
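
/*
 * siw_rqe_get()
 *
 * Fetch the next valid receive queue element from the QP's RQ, or from
 * the SRQ if one is attached, and copy its SGE list into the QP-local
 * WQE used for untagged receive processing. The application-owned RQE
 * slot is released right away, so it can be re-posted while the SEND is
 * still being placed. For an armed SRQ, the remaining fill level is
 * tested and, once it drops below srq->limit, the SRQ is disarmed and
 * IB_EVENT_SRQ_LIMIT_REACHED is signalled.
 */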
static struct siw_wqe *siw_rqe_get(struct siw_qp *qp)
{
	struct siw_rqe *rqe;
	struct siw_srq *srq;
	struct siw_wqe *wqe = NULL;
	bool srq_event = false;
	unsigned long flags;

	srq = qp->srq;
	if (srq) {
		spin_lock_irqsave(&srq->lock, flags);
		if (unlikely(!srq->num_rqe))
			goto out;

		rqe = &srq->recvq[srq->rq_get % srq->num_rqe];
	} else {
		if (unlikely(!qp->recvq))
			goto out;

		rqe = &qp->recvq[qp->rq_get % qp->attrs.rq_size];
	}
	if (likely(rqe->flags == SIW_WQE_VALID)) {
		int num_sge = rqe->num_sge;

		if (likely(num_sge <= SIW_MAX_SGE)) {
			int i = 0;

			wqe = rx_wqe(&qp->rx_untagged);
			rx_type(wqe) = SIW_OP_RECEIVE;
			wqe->wr_status = SIW_WR_INPROGRESS;
			wqe->bytes = 0;
			wqe->processed = 0;

			wqe->rqe.id = rqe->id;
			wqe->rqe.num_sge = num_sge;

			while (i < num_sge) {
				wqe->rqe.sge[i].laddr = rqe->sge[i].laddr;
				wqe->rqe.sge[i].lkey = rqe->sge[i].lkey;
				wqe->rqe.sge[i].length = rqe->sge[i].length;
				wqe->bytes += wqe->rqe.sge[i].length;
				wqe->mem[i] = NULL;
				i++;
			}
			/* can be re-used by appl */
			smp_store_mb(rqe->flags, 0);
		} else {
			siw_dbg_qp(qp, "too many sge's: %d\n", rqe->num_sge);
			if (srq)
				spin_unlock_irqrestore(&srq->lock, flags);
			return NULL;
		}
		if (!srq) {
			qp->rq_get++;
		} else {
			if (srq->armed) {
				/* Test SRQ limit */
				u32 off = (srq->rq_get + srq->limit) %
					  srq->num_rqe;
				struct siw_rqe *rqe2 = &srq->recvq[off];

				if (!(rqe2->flags & SIW_WQE_VALID)) {
					srq->armed = 0;
					srq_event = true;
				}
			}
			srq->rq_get++;
		}
	}
out:
	if (srq) {
		spin_unlock_irqrestore(&srq->lock, flags);
		if (srq_event)
			siw_srq_event(srq, IB_EVENT_SRQ_LIMIT_REACHED);
	}
	return wqe;
}

/*
 * siw_proc_send:
 *
 * Process one incoming SEND and place data into memory referenced by
 * receive wqe.
 *
 * Function supports partially received sends (suspending/resuming
 * current receive wqe processing)
 *
 * return value:
 *	0:       reached the end of a DDP segment
 *	-EAGAIN: to be called again to finish the DDP segment
 */
int siw_proc_send(struct siw_qp *qp)
{
	struct siw_rx_stream *srx = &qp->rx_stream;
	struct siw_rx_fpdu *frx = &qp->rx_untagged;
	struct siw_wqe *wqe;
	u32 data_bytes; /* all data bytes available */
	u32 rcvd_bytes; /* sum of data bytes rcvd */
	int rv = 0;

	if (frx->first_ddp_seg) {
		wqe = siw_rqe_get(qp);
		if (unlikely(!wqe)) {
			siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
					   DDP_ETYPE_UNTAGGED_BUF,
					   DDP_ECODE_UT_INVALID_MSN_NOBUF, 0);
			return -ENOENT;
		}
	} else {
		wqe = rx_wqe(frx);
	}
	if (srx->state == SIW_GET_DATA_START) {
		rv = siw_send_check_ntoh(srx, frx);
		if (unlikely(rv)) {
			siw_qp_event(qp, IB_EVENT_QP_FATAL);
			return rv;
		}
		if (!srx->fpdu_part_rem) /* zero length SEND */
			return 0;
	}
	data_bytes = min(srx->fpdu_part_rem, srx->skb_new);
	rcvd_bytes = 0;

	/* A zero length SEND will skip the loop below */
	while (data_bytes) {
		struct ib_pd *pd;
		struct siw_mem **mem, *mem_p;
		struct siw_sge *sge;
		u32 sge_bytes; /* data bytes avail for SGE */

		sge = &wqe->rqe.sge[frx->sge_idx];

		if (!sge->length) {
			/* just skip empty sge's */
			frx->sge_idx++;
			frx->sge_off = 0;
			frx->pbl_idx = 0;
			continue;
		}
		sge_bytes = min(data_bytes, sge->length - frx->sge_off);
		mem = &wqe->mem[frx->sge_idx];

		/*
		 * check with QP's PD if no SRQ present, SRQ's PD otherwise
		 */
		pd = qp->srq == NULL ? qp->pd : qp->srq->base_srq.pd;

		rv = siw_check_sge(pd, sge, mem, IB_ACCESS_LOCAL_WRITE,
				   frx->sge_off, sge_bytes);
		if (unlikely(rv)) {
			siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
					   DDP_ETYPE_CATASTROPHIC,
					   DDP_ECODE_CATASTROPHIC, 0);

			siw_qp_event(qp, IB_EVENT_QP_ACCESS_ERR);
			break;
		}
		mem_p = *mem;
		if (mem_p->mem_obj == NULL)
			rv = siw_rx_kva(srx,
				(void *)(uintptr_t)(sge->laddr + frx->sge_off),
				sge_bytes);
		else if (!mem_p->is_pbl)
			rv = siw_rx_umem(srx, mem_p->umem,
					 sge->laddr + frx->sge_off, sge_bytes);
		else
			rv = siw_rx_pbl(srx, &frx->pbl_idx, mem_p,
					sge->laddr + frx->sge_off, sge_bytes);

		if (unlikely(rv != sge_bytes)) {
			wqe->processed += rcvd_bytes;

			siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
					   DDP_ETYPE_CATASTROPHIC,
					   DDP_ECODE_CATASTROPHIC, 0);
			return -EINVAL;
		}
		frx->sge_off += rv;

		if (frx->sge_off == sge->length) {
			frx->sge_idx++;
			frx->sge_off = 0;
			frx->pbl_idx = 0;
		}
		data_bytes -= rv;
		rcvd_bytes += rv;

		srx->fpdu_part_rem -= rv;
		srx->fpdu_part_rcvd += rv;
	}
	wqe->processed += rcvd_bytes;

	if (!srx->fpdu_part_rem)
		return 0;

	return (rv < 0) ? rv : -EAGAIN;
}

/*
 * siw_proc_write:
 *
 * Place incoming WRITE after referencing and checking target buffer.
 *
 * Function supports partially received WRITEs (suspending/resuming
 * current receive processing)
 *
 * return value:
 *	0:       reached the end of a DDP segment
 *	-EAGAIN: to be called again to finish the DDP segment
 */
int siw_proc_write(struct siw_qp *qp)
{
	struct siw_rx_stream *srx = &qp->rx_stream;
	struct siw_rx_fpdu *frx = &qp->rx_tagged;
	struct siw_mem *mem;
	int bytes, rv;

	if (srx->state == SIW_GET_DATA_START) {
		if (!srx->fpdu_part_rem) /* zero length WRITE */
			return 0;

		rv = siw_write_check_ntoh(srx, frx);
		if (unlikely(rv)) {
			siw_qp_event(qp, IB_EVENT_QP_FATAL);
			return rv;
		}
	}
	bytes = min(srx->fpdu_part_rem, srx->skb_new);

	if (frx->first_ddp_seg) {
		struct siw_wqe *wqe = rx_wqe(frx);

		rx_mem(frx) = siw_mem_id2obj(qp->sdev, srx->ddp_stag >> 8);
		if (unlikely(!rx_mem(frx))) {
			siw_dbg_qp(qp,
				   "sink stag not found/invalid, stag 0x%08x\n",
				   srx->ddp_stag);

			siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
					   DDP_ETYPE_TAGGED_BUF,
					   DDP_ECODE_T_INVALID_STAG, 0);
			return -EINVAL;
		}
		wqe->rqe.num_sge = 1;
		rx_type(wqe) = SIW_OP_WRITE;
		wqe->wr_status = SIW_WR_INPROGRESS;
	}
	mem = rx_mem(frx);

	/*
	 * Check if application re-registered memory with different
	 * key field of STag.
	 */
	if (unlikely(mem->stag != srx->ddp_stag)) {
		siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
				   DDP_ETYPE_TAGGED_BUF,
				   DDP_ECODE_T_INVALID_STAG, 0);
		return -EINVAL;
	}
	rv = siw_check_mem(qp->pd, mem, srx->ddp_to + srx->fpdu_part_rcvd,
			   IB_ACCESS_REMOTE_WRITE, bytes);
	if (unlikely(rv)) {
		siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
				   DDP_ETYPE_TAGGED_BUF, siw_tagged_error(-rv),
				   0);

		siw_qp_event(qp, IB_EVENT_QP_ACCESS_ERR);

		return -EINVAL;
	}

	if (mem->mem_obj == NULL)
		rv = siw_rx_kva(srx,
			(void *)(uintptr_t)(srx->ddp_to + srx->fpdu_part_rcvd),
			bytes);
	else if (!mem->is_pbl)
		rv = siw_rx_umem(srx, mem->umem,
				 srx->ddp_to + srx->fpdu_part_rcvd, bytes);
	else
		rv = siw_rx_pbl(srx, &frx->pbl_idx, mem,
				srx->ddp_to + srx->fpdu_part_rcvd, bytes);

	if (unlikely(rv != bytes)) {
		siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
				   DDP_ETYPE_CATASTROPHIC,
				   DDP_ECODE_CATASTROPHIC, 0);
		return -EINVAL;
	}
	srx->fpdu_part_rem -= rv;
	srx->fpdu_part_rcvd += rv;

	if (!srx->fpdu_part_rem) {
		srx->ddp_to += srx->fpdu_part_rcvd;
		return 0;
	}
	return -EAGAIN;
}

/*
 * Inbound RREQs cannot carry user data.
 */
int siw_proc_rreq(struct siw_qp *qp)
{
	struct siw_rx_stream *srx = &qp->rx_stream;

	if (!srx->fpdu_part_rem)
		return 0;

	pr_warn("siw: [QP %u]: rreq with mpa len %d\n", qp_id(qp),
		be16_to_cpu(srx->hdr.ctrl.mpa_len));

	return -EPROTO;
}

/*
 * siw_init_rresp:
 *
 * Process inbound RDMA READ REQ. Produce a pseudo READ RESPONSE WQE.
 * Put it at the tail of the IRQ, if there is another WQE currently in
 * transmit processing. If not, make it the current WQE to be processed
 * and schedule transmit processing.
 *
 * Can be called from softirq context and from process
 * context (RREAD socket loopback case!)
 *
 * return value:
 *	0:       success,
 *	failure code otherwise
 */

static int siw_init_rresp(struct siw_qp *qp, struct siw_rx_stream *srx)
{
	struct siw_wqe *tx_work = tx_wqe(qp);
	struct siw_sqe *resp;

	uint64_t raddr = be64_to_cpu(srx->hdr.rreq.sink_to),
		 laddr = be64_to_cpu(srx->hdr.rreq.source_to);
	uint32_t length = be32_to_cpu(srx->hdr.rreq.read_size),
		 lkey = be32_to_cpu(srx->hdr.rreq.source_stag),
		 rkey = be32_to_cpu(srx->hdr.rreq.sink_stag),
		 msn = be32_to_cpu(srx->hdr.rreq.ddp_msn);

	int run_sq = 1, rv = 0;
	unsigned long flags;

	if (unlikely(msn != srx->ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ])) {
		siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
				   DDP_ETYPE_UNTAGGED_BUF,
				   DDP_ECODE_UT_INVALID_MSN_RANGE, 0);
		return -EPROTO;
	}
	spin_lock_irqsave(&qp->sq_lock, flags);

	if (tx_work->wr_status == SIW_WR_IDLE) {
		/*
		 * immediately schedule READ response w/o
		 * consuming IRQ entry: IRQ must be empty.
		 */
		tx_work->processed = 0;
		tx_work->mem[0] = NULL;
		tx_work->wr_status = SIW_WR_QUEUED;
		resp = &tx_work->sqe;
	} else {
		resp = irq_alloc_free(qp);
		run_sq = 0;
	}
	if (likely(resp)) {
		resp->opcode = SIW_OP_READ_RESPONSE;

		resp->sge[0].length = length;
		resp->sge[0].laddr = laddr;
		resp->sge[0].lkey = lkey;

		/* Keep aside message sequence number for potential
		 * error reporting during Read Response generation.
		 */
		resp->sge[1].length = msn;

		resp->raddr = raddr;
		resp->rkey = rkey;
		resp->num_sge = length ? 1 : 0;

		/* RRESP now valid as current TX wqe or placed into IRQ */
		smp_store_mb(resp->flags, SIW_WQE_VALID);
	} else {
		pr_warn("siw: [QP %u]: irq %d exceeded %d\n", qp_id(qp),
			qp->irq_put % qp->attrs.irq_size, qp->attrs.irq_size);

		siw_init_terminate(qp, TERM_ERROR_LAYER_RDMAP,
				   RDMAP_ETYPE_REMOTE_OPERATION,
				   RDMAP_ECODE_CATASTROPHIC_STREAM, 0);
		rv = -EPROTO;
	}

	spin_unlock_irqrestore(&qp->sq_lock, flags);

	if (run_sq)
		rv = siw_sq_start(qp);

	return rv;
}

/*
 * Only called at start of Read.Response processing.
 * Transfer pending Read from tip of ORQ into current rx wqe,
 * but keep ORQ entry valid until Read.Response processing done.
 * No Queue locking needed.
 */
static int siw_orqe_start_rx(struct siw_qp *qp)
{
	struct siw_sqe *orqe;
	struct siw_wqe *wqe = NULL;

	/* make sure ORQ indices are current */
	smp_mb();

	orqe = orq_get_current(qp);
	if (READ_ONCE(orqe->flags) & SIW_WQE_VALID) {
		/* RRESP is a TAGGED RDMAP operation */
		wqe = rx_wqe(&qp->rx_tagged);
		wqe->sqe.id = orqe->id;
		wqe->sqe.opcode = orqe->opcode;
		wqe->sqe.sge[0].laddr = orqe->sge[0].laddr;
		wqe->sqe.sge[0].lkey = orqe->sge[0].lkey;
		wqe->sqe.sge[0].length = orqe->sge[0].length;
		wqe->sqe.flags = orqe->flags;
		wqe->sqe.num_sge = 1;
		wqe->bytes = orqe->sge[0].length;
		wqe->processed = 0;
		wqe->mem[0] = NULL;
		/* make sure WQE is completely written before valid */
		smp_wmb();
		wqe->wr_status = SIW_WR_INPROGRESS;

		return 0;
	}
	return -EPROTO;
}

/*
 * siw_proc_rresp:
 *
 * Place incoming RRESP data into memory referenced by RREQ WQE
 * which is at the tip of the ORQ
 *
 * Function supports partially received RRESPs (suspending/resuming
 * current receive processing)
 */
int siw_proc_rresp(struct siw_qp *qp)
{
	struct siw_rx_stream *srx = &qp->rx_stream;
	struct siw_rx_fpdu *frx = &qp->rx_tagged;
	struct siw_wqe *wqe = rx_wqe(frx);
	struct siw_mem **mem, *mem_p;
	struct siw_sge *sge;
	int bytes, rv;

	if (frx->first_ddp_seg) {
		if (unlikely(wqe->wr_status != SIW_WR_IDLE)) {
			pr_warn("siw: [QP %u]: proc RRESP: status %d, op %d\n",
				qp_id(qp), wqe->wr_status, wqe->sqe.opcode);
			rv = -EPROTO;
			goto error_term;
		}
		/*
		 * fetch pending RREQ from orq
		 */
		rv = siw_orqe_start_rx(qp);
		if (rv) {
			pr_warn("siw: [QP %u]: ORQ empty at idx %d\n",
				qp_id(qp), qp->orq_get % qp->attrs.orq_size);
			goto error_term;
		}
		rv = siw_rresp_check_ntoh(srx, frx);
		if (unlikely(rv)) {
			siw_qp_event(qp, IB_EVENT_QP_FATAL);
			return rv;
		}
	} else {
		if (unlikely(wqe->wr_status != SIW_WR_INPROGRESS)) {
			pr_warn("siw: [QP %u]: resume RRESP: status %d\n",
				qp_id(qp), wqe->wr_status);
			rv = -EPROTO;
			goto error_term;
		}
	}
	if (!srx->fpdu_part_rem) /* zero length RRESPONSE */
		return 0;

	sge = wqe->sqe.sge; /* there is only one */
	mem = &wqe->mem[0];

	if (!(*mem)) {
		/*
		 * check target memory which resolves memory on first fragment
		 */
		rv = siw_check_sge(qp->pd, sge, mem, IB_ACCESS_LOCAL_WRITE, 0,
				   wqe->bytes);
		if (unlikely(rv)) {
			siw_dbg_qp(qp, "target mem check: %d\n", rv);
			wqe->wc_status = SIW_WC_LOC_PROT_ERR;

			siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
					   DDP_ETYPE_TAGGED_BUF,
					   siw_tagged_error(-rv), 0);

			siw_qp_event(qp, IB_EVENT_QP_ACCESS_ERR);

			return -EINVAL;
		}
	}
	mem_p = *mem;

	bytes = min(srx->fpdu_part_rem, srx->skb_new);

	if (mem_p->mem_obj == NULL)
		rv = siw_rx_kva(srx,
			(void *)(uintptr_t)(sge->laddr + wqe->processed),
			bytes);
	else if (!mem_p->is_pbl)
		rv = siw_rx_umem(srx, mem_p->umem, sge->laddr + wqe->processed,
				 bytes);
	else
		rv = siw_rx_pbl(srx, &frx->pbl_idx, mem_p,
				sge->laddr + wqe->processed, bytes);
	if (rv != bytes) {
		wqe->wc_status = SIW_WC_GENERAL_ERR;
		rv = -EINVAL;
		goto error_term;
	}
	srx->fpdu_part_rem -= rv;
	srx->fpdu_part_rcvd += rv;
	wqe->processed += rv;

	if (!srx->fpdu_part_rem) {
		srx->ddp_to += srx->fpdu_part_rcvd;
		return 0;
	}
	return -EAGAIN;

error_term:
	siw_init_terminate(qp, TERM_ERROR_LAYER_DDP, DDP_ETYPE_CATASTROPHIC,
			   DDP_ECODE_CATASTROPHIC, 0);
	return rv;
}

int siw_proc_terminate(struct siw_qp *qp)
{
	struct siw_rx_stream *srx = &qp->rx_stream;
	struct sk_buff *skb = srx->skb;
	struct iwarp_terminate *term = &srx->hdr.terminate;
	union iwarp_hdr term_info;
	u8 *infop = (u8 *)&term_info;
	enum rdma_opcode op;
	u16 to_copy = sizeof(struct iwarp_ctrl);

	pr_warn("siw: got TERMINATE. layer %d, type %d, code %d\n",
		__rdmap_term_layer(term), __rdmap_term_etype(term),
		__rdmap_term_ecode(term));

	if (be32_to_cpu(term->ddp_qn) != RDMAP_UNTAGGED_QN_TERMINATE ||
	    be32_to_cpu(term->ddp_msn) !=
		    qp->rx_stream.ddp_msn[RDMAP_UNTAGGED_QN_TERMINATE] ||
	    be32_to_cpu(term->ddp_mo) != 0) {
		pr_warn("siw: rx bogus TERM [QN x%08x, MSN x%08x, MO x%08x]\n",
			be32_to_cpu(term->ddp_qn), be32_to_cpu(term->ddp_msn),
			be32_to_cpu(term->ddp_mo));
		return -ECONNRESET;
	}
	/*
	 * Receive remaining pieces of TERM if indicated
	 */
	if (!term->flag_m)
		return -ECONNRESET;

	/* Do not take the effort to reassemble a network fragmented
	 * TERM message
	 */
	if (srx->skb_new < sizeof(struct iwarp_ctrl_tagged))
		return -ECONNRESET;

	memset(infop, 0, sizeof(term_info));

	skb_copy_bits(skb, srx->skb_offset, infop, to_copy);

	op = __rdmap_get_opcode(&term_info.ctrl);
	if (op >= RDMAP_TERMINATE)
		goto out;

	infop += to_copy;
	srx->skb_offset += to_copy;
	srx->skb_new -= to_copy;
	srx->skb_copied += to_copy;
	srx->fpdu_part_rcvd += to_copy;
	srx->fpdu_part_rem -= to_copy;

	to_copy = iwarp_pktinfo[op].hdr_len - to_copy;

	/* Again, no network fragmented TERM's */
	if (to_copy + MPA_CRC_SIZE > srx->skb_new)
		return -ECONNRESET;

	skb_copy_bits(skb, srx->skb_offset, infop, to_copy);

	if (term->flag_r) {
		siw_dbg_qp(qp, "TERM reports RDMAP hdr type %u, len %u (%s)\n",
			   op, be16_to_cpu(term_info.ctrl.mpa_len),
			   term->flag_m ? "valid" : "invalid");
	} else if (term->flag_d) {
		siw_dbg_qp(qp, "TERM reports DDP hdr type %u, len %u (%s)\n",
			   op, be16_to_cpu(term_info.ctrl.mpa_len),
			   term->flag_m ? "valid" : "invalid");
	}
out:
	srx->skb_new -= to_copy;
	srx->skb_offset += to_copy;
	srx->skb_copied += to_copy;
	srx->fpdu_part_rcvd += to_copy;
	srx->fpdu_part_rem -= to_copy;

	return -ECONNRESET;
}
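
/*
 * Trailer handling: with MPA framing, each FPDU carries the 2-byte MPA
 * length, the DDP/RDMAP header and payload, padding up to a 4-byte
 * boundary, and a 4-byte CRC. When siw_get_trailer() runs,
 * srx->fpdu_part_rem covers just the pad and CRC bytes of the current
 * FPDU, which are copied in one piece ending at srx->trailer.crc.
 */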
static int siw_get_trailer(struct siw_qp *qp, struct siw_rx_stream *srx)
{
	struct sk_buff *skb = srx->skb;
	u8 *tbuf = (u8 *)&srx->trailer.crc - srx->pad;
	__wsum crc_in, crc_own = 0;

	siw_dbg_qp(qp, "expected %d, available %d, pad %u\n",
		   srx->fpdu_part_rem, srx->skb_new, srx->pad);

	if (srx->skb_new < srx->fpdu_part_rem)
		return -EAGAIN;

	skb_copy_bits(skb, srx->skb_offset, tbuf, srx->fpdu_part_rem);

	if (srx->mpa_crc_hd && srx->pad)
		crypto_shash_update(srx->mpa_crc_hd, tbuf, srx->pad);

	srx->skb_new -= srx->fpdu_part_rem;
	srx->skb_offset += srx->fpdu_part_rem;
	srx->skb_copied += srx->fpdu_part_rem;

	if (!srx->mpa_crc_hd)
		return 0;

	/*
	 * CRC32 is computed, transmitted and received directly in NBO,
	 * so there's never a reason to convert byte order.
	 */
	crypto_shash_final(srx->mpa_crc_hd, (u8 *)&crc_own);
	crc_in = (__force __wsum)srx->trailer.crc;

	if (unlikely(crc_in != crc_own)) {
		pr_warn("siw: crc error. in: %08x, own %08x, op %u\n",
			crc_in, crc_own, qp->rx_stream.rdmap_op);

		siw_init_terminate(qp, TERM_ERROR_LAYER_LLP,
				   LLP_ETYPE_MPA,
				   LLP_ECODE_RECEIVED_CRC, 0);
		return -EINVAL;
	}
	return 0;
}

#define MIN_DDP_HDR sizeof(struct iwarp_ctrl_tagged)

static int siw_get_hdr(struct siw_rx_stream *srx)
{
	struct sk_buff *skb = srx->skb;
	struct siw_qp *qp = rx_qp(srx);
	struct iwarp_ctrl *c_hdr = &srx->hdr.ctrl;
	struct siw_rx_fpdu *frx;
	u8 opcode;
	int bytes;

	if (srx->fpdu_part_rcvd < MIN_DDP_HDR) {
		/*
		 * copy a minimum sized (tagged) DDP frame control part
		 */
		bytes = min_t(int, srx->skb_new,
			      MIN_DDP_HDR - srx->fpdu_part_rcvd);

		skb_copy_bits(skb, srx->skb_offset,
			      (char *)c_hdr + srx->fpdu_part_rcvd, bytes);

		srx->fpdu_part_rcvd += bytes;

		srx->skb_new -= bytes;
		srx->skb_offset += bytes;
		srx->skb_copied += bytes;

		if (srx->fpdu_part_rcvd < MIN_DDP_HDR)
			return -EAGAIN;

		if (unlikely(__ddp_get_version(c_hdr) != DDP_VERSION)) {
			enum ddp_etype etype;
			enum ddp_ecode ecode;

			pr_warn("siw: received ddp version unsupported %d\n",
				__ddp_get_version(c_hdr));

			if (c_hdr->ddp_rdmap_ctrl & DDP_FLAG_TAGGED) {
				etype = DDP_ETYPE_TAGGED_BUF;
				ecode = DDP_ECODE_T_VERSION;
			} else {
				etype = DDP_ETYPE_UNTAGGED_BUF;
				ecode = DDP_ECODE_UT_VERSION;
			}
			siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_DDP,
					   etype, ecode, 0);
			return -EINVAL;
		}
		if (unlikely(__rdmap_get_version(c_hdr) != RDMAP_VERSION)) {
			pr_warn("siw: received rdmap version unsupported %d\n",
				__rdmap_get_version(c_hdr));

			siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_RDMAP,
					   RDMAP_ETYPE_REMOTE_OPERATION,
					   RDMAP_ECODE_VERSION, 0);
			return -EINVAL;
		}
		opcode = __rdmap_get_opcode(c_hdr);

		if (opcode > RDMAP_TERMINATE) {
			pr_warn("siw: received unknown packet type %u\n",
				opcode);

			siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_RDMAP,
					   RDMAP_ETYPE_REMOTE_OPERATION,
					   RDMAP_ECODE_OPCODE, 0);
			return -EINVAL;
		}
		siw_dbg_qp(rx_qp(srx), "new header, opcode %u\n", opcode);
	} else {
		opcode = __rdmap_get_opcode(c_hdr);
	}
	set_rx_fpdu_context(qp, opcode);
	frx = qp->rx_fpdu;

	/*
	 * Figure out len of current hdr: variable length of
	 * iwarp hdr may force us to copy hdr information in
	 * two steps. Only tagged DDP messages are already
	 * completely received.
	 */
	if (iwarp_pktinfo[opcode].hdr_len > sizeof(struct iwarp_ctrl_tagged)) {
		bytes = iwarp_pktinfo[opcode].hdr_len - MIN_DDP_HDR;

		if (srx->skb_new < bytes)
			return -EAGAIN;

		skb_copy_bits(skb, srx->skb_offset,
			      (char *)c_hdr + srx->fpdu_part_rcvd, bytes);

		srx->fpdu_part_rcvd += bytes;

		srx->skb_new -= bytes;
		srx->skb_offset += bytes;
		srx->skb_copied += bytes;
	}

	/*
	 * DDP/RDMAP header receive completed. Check if the current
	 * DDP segment starts a new RDMAP message or continues a previously
	 * started RDMAP message.
	 *
	 * Alternating reception of DDP segments (or FPDUs) from incomplete
	 * tagged and untagged RDMAP messages is supported, as long as
	 * the current tagged or untagged message gets eventually completed
	 * w/o intersection from another message of the same type
	 * (tagged/untagged). E.g., a WRITE can get intersected by a SEND,
	 * but not by a READ RESPONSE etc.
	 */
	if (srx->mpa_crc_hd) {
		/*
		 * Restart CRC computation
		 */
		crypto_shash_init(srx->mpa_crc_hd);
		crypto_shash_update(srx->mpa_crc_hd, (u8 *)c_hdr,
				    srx->fpdu_part_rcvd);
	}
	if (frx->more_ddp_segs) {
		frx->first_ddp_seg = 0;
		if (frx->prev_rdmap_op != opcode) {
			pr_warn("siw: packet intersection: %u : %u\n",
				frx->prev_rdmap_op, opcode);
			/*
			 * The last inbound RDMA operation of same type
			 * (tagged or untagged) is left unfinished.
			 * To complete it in error, make it the current
			 * operation again, even with the header already
			 * overwritten. For error handling, only the opcode
			 * and current rx context are relevant.
			 */
			set_rx_fpdu_context(qp, frx->prev_rdmap_op);
			__rdmap_set_opcode(c_hdr, frx->prev_rdmap_op);
			return -EPROTO;
		}
	} else {
		frx->prev_rdmap_op = opcode;
		frx->first_ddp_seg = 1;
	}
	frx->more_ddp_segs = c_hdr->ddp_rdmap_ctrl & DDP_FLAG_LAST ? 0 : 1;

	return 0;
}
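
/*
 * siw_check_tx_fence()
 *
 * Called when an inbound READ RESPONSE completed the READ at the
 * current ORQ position: the ORQ entry is freed and, if SQ processing
 * was fenced (tx_ctx.orq_fence), it is resumed: a waiting READ is
 * moved into the freed ORQ slot, while other queued work resumes once
 * the ORQ has fully drained.
 */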
static int siw_check_tx_fence(struct siw_qp *qp)
{
	struct siw_wqe *tx_waiting = tx_wqe(qp);
	struct siw_sqe *rreq;
	int resume_tx = 0, rv = 0;
	unsigned long flags;

	spin_lock_irqsave(&qp->orq_lock, flags);

	rreq = orq_get_current(qp);

	/* free current orq entry */
	WRITE_ONCE(rreq->flags, 0);

	if (qp->tx_ctx.orq_fence) {
		if (unlikely(tx_waiting->wr_status != SIW_WR_QUEUED)) {
			pr_warn("siw: [QP %u]: fence resume: bad status %d\n",
				qp_id(qp), tx_waiting->wr_status);
			rv = -EPROTO;
			goto out;
		}
		/* resume SQ processing */
		if (tx_waiting->sqe.opcode == SIW_OP_READ ||
		    tx_waiting->sqe.opcode == SIW_OP_READ_LOCAL_INV) {
			rreq = orq_get_tail(qp);
			if (unlikely(!rreq)) {
				pr_warn("siw: [QP %u]: no ORQE\n", qp_id(qp));
				rv = -EPROTO;
				goto out;
			}
			siw_read_to_orq(rreq, &tx_waiting->sqe);

			qp->orq_put++;
			qp->tx_ctx.orq_fence = 0;
			resume_tx = 1;

		} else if (siw_orq_empty(qp)) {
			qp->tx_ctx.orq_fence = 0;
			resume_tx = 1;
		} else {
			pr_warn("siw: [QP %u]: fence resume: orq idx: %d:%d\n",
				qp_id(qp), qp->orq_get, qp->orq_put);
			rv = -EPROTO;
		}
	}
	qp->orq_get++;
out:
	spin_unlock_irqrestore(&qp->orq_lock, flags);

	if (resume_tx)
		rv = siw_sq_start(qp);

	return rv;
}

/*
 * siw_rdmap_complete()
 *
 * Complete processing of an RDMA message after receiving all
 * DDP segments, or abort processing after encountering an error case.
 *
 * o SENDs + RRESPs need completion,
 * o RREQs need READ RESPONSE initialization,
 * o WRITEs need memory dereferencing.
 *
 * TODO: Failed WRITEs need local error to be surfaced.
 */
static int siw_rdmap_complete(struct siw_qp *qp, int error)
{
	struct siw_rx_stream *srx = &qp->rx_stream;
	struct siw_wqe *wqe = rx_wqe(qp->rx_fpdu);
	enum siw_wc_status wc_status = wqe->wc_status;
	u8 opcode = __rdmap_get_opcode(&srx->hdr.ctrl);
	int rv = 0;

	switch (opcode) {
	case RDMAP_SEND_SE:
	case RDMAP_SEND_SE_INVAL:
		wqe->rqe.flags |= SIW_WQE_SOLICITED;
		/* Fall through */

	case RDMAP_SEND:
	case RDMAP_SEND_INVAL:
		if (wqe->wr_status == SIW_WR_IDLE)
			break;

		srx->ddp_msn[RDMAP_UNTAGGED_QN_SEND]++;

		if (error != 0 && wc_status == SIW_WC_SUCCESS)
			wc_status = SIW_WC_GENERAL_ERR;
		/*
		 * Handle STag invalidation request
		 */
		if (wc_status == SIW_WC_SUCCESS &&
		    (opcode == RDMAP_SEND_INVAL ||
		     opcode == RDMAP_SEND_SE_INVAL)) {
			rv = siw_invalidate_stag(qp->pd, srx->inval_stag);
			if (rv) {
				siw_init_terminate(
					qp, TERM_ERROR_LAYER_RDMAP,
					rv == -EACCES ?
						RDMAP_ETYPE_REMOTE_PROTECTION :
						RDMAP_ETYPE_REMOTE_OPERATION,
					RDMAP_ECODE_CANNOT_INVALIDATE, 0);

				wc_status = SIW_WC_REM_INV_REQ_ERR;
			}
			rv = siw_rqe_complete(qp, &wqe->rqe, wqe->processed,
					      rv ? 0 : srx->inval_stag,
					      wc_status);
		} else {
			rv = siw_rqe_complete(qp, &wqe->rqe, wqe->processed,
					      0, wc_status);
		}
		siw_wqe_put_mem(wqe, SIW_OP_RECEIVE);
		break;

	case RDMAP_RDMA_READ_RESP:
		if (wqe->wr_status == SIW_WR_IDLE)
			break;

		if (error != 0) {
			if ((srx->state == SIW_GET_HDR &&
			     qp->rx_fpdu->first_ddp_seg) || error == -ENODATA)
				/* possible RREQ in ORQ left untouched */
				break;

			if (wc_status == SIW_WC_SUCCESS)
				wc_status = SIW_WC_GENERAL_ERR;
		} else if (qp->kernel_verbs &&
			   rx_type(wqe) == SIW_OP_READ_LOCAL_INV) {
			/*
			 * Handle any STag invalidation request
			 */
			rv = siw_invalidate_stag(qp->pd, wqe->sqe.sge[0].lkey);
			if (rv) {
				siw_init_terminate(qp, TERM_ERROR_LAYER_RDMAP,
						   RDMAP_ETYPE_CATASTROPHIC,
						   RDMAP_ECODE_UNSPECIFIED, 0);

				if (wc_status == SIW_WC_SUCCESS) {
					wc_status = SIW_WC_GENERAL_ERR;
					error = rv;
				}
			}
		}
		/*
		 * All errors turn the wqe into signalled.
		 */
		if ((wqe->sqe.flags & SIW_WQE_SIGNALLED) || error != 0)
			rv = siw_sqe_complete(qp, &wqe->sqe, wqe->processed,
					      wc_status);
		siw_wqe_put_mem(wqe, SIW_OP_READ);

		if (!error)
			rv = siw_check_tx_fence(qp);
		else
			/* Disable current ORQ element */
			WRITE_ONCE(orq_get_current(qp)->flags, 0);
		break;

	case RDMAP_RDMA_READ_REQ:
		if (!error) {
			rv = siw_init_rresp(qp, srx);
			srx->ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ]++;
		}
		break;

	case RDMAP_RDMA_WRITE:
		if (wqe->wr_status == SIW_WR_IDLE)
			break;

		/*
		 * Free References from memory object if
		 * attached to receive context (inbound WRITE).
		 * While a zero-length WRITE is allowed,
		 * no memory reference got created.
		 */
		if (rx_mem(&qp->rx_tagged)) {
			siw_mem_put(rx_mem(&qp->rx_tagged));
			rx_mem(&qp->rx_tagged) = NULL;
		}
		break;

	default:
		break;
	}
	wqe->wr_status = SIW_WR_IDLE;

	return rv;
}
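
/*
 * Receive state machine, as driven by siw_tcp_rx_data() below:
 *
 *   SIW_GET_HDR        -> copy and validate the MPA/DDP/RDMAP header
 *   SIW_GET_DATA_START -> first payload fragment of a DDP segment,
 *                         dispatched to the opcode's rx_data handler
 *   SIW_GET_DATA_MORE  -> further fragments of the same DDP segment
 *   SIW_GET_TRAILER    -> consume padding and MPA CRC, then complete
 *                         the RDMAP message if this was the last segment
 *
 * A handler returning -EAGAIN just means the TCP stream ran dry;
 * processing resumes with the next inbound skb.
 */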

/*
 * siw_tcp_rx_data()
 *
 * Main routine to consume inbound TCP payload
 *
 * @rd_desc:	read descriptor
 * @skb:	socket buffer
 * @off:	offset in skb
 * @len:	skb->len - offset : payload in skb
 */
int siw_tcp_rx_data(read_descriptor_t *rd_desc, struct sk_buff *skb,
		    unsigned int off, size_t len)
{
	struct siw_qp *qp = rd_desc->arg.data;
	struct siw_rx_stream *srx = &qp->rx_stream;
	int rv;

	srx->skb = skb;
	srx->skb_new = skb->len - off;
	srx->skb_offset = off;
	srx->skb_copied = 0;

	siw_dbg_qp(qp, "new data, len %d\n", srx->skb_new);

	while (srx->skb_new) {
		int run_completion = 1;

		if (unlikely(srx->rx_suspend)) {
			/* Do not process any more data */
			srx->skb_copied += srx->skb_new;
			break;
		}
		switch (srx->state) {
		case SIW_GET_HDR:
			rv = siw_get_hdr(srx);
			if (!rv) {
				srx->fpdu_part_rem =
					be16_to_cpu(srx->hdr.ctrl.mpa_len) -
					srx->fpdu_part_rcvd + MPA_HDR_SIZE;

				if (srx->fpdu_part_rem)
					srx->pad = -srx->fpdu_part_rem & 0x3;
				else
					srx->pad = 0;

				srx->state = SIW_GET_DATA_START;
				srx->fpdu_part_rcvd = 0;
			}
			break;

		case SIW_GET_DATA_MORE:
			/*
			 * Another data fragment of the same DDP segment.
			 * Setting first_ddp_seg = 0 avoids repeating
			 * initializations that shall occur only once per
			 * DDP segment.
			 */
			qp->rx_fpdu->first_ddp_seg = 0;
			/* Fall through */

		case SIW_GET_DATA_START:
			/*
			 * Headers will be checked by the opcode-specific
			 * data receive function below.
			 */
			rv = iwarp_pktinfo[qp->rx_stream.rdmap_op].rx_data(qp);
			if (!rv) {
				int mpa_len =
					be16_to_cpu(srx->hdr.ctrl.mpa_len)
					+ MPA_HDR_SIZE;

				srx->fpdu_part_rem = (-mpa_len & 0x3)
						     + MPA_CRC_SIZE;
				srx->fpdu_part_rcvd = 0;
				srx->state = SIW_GET_TRAILER;
			} else {
				if (unlikely(rv == -ECONNRESET))
					run_completion = 0;
				else
					srx->state = SIW_GET_DATA_MORE;
			}
			break;

		case SIW_GET_TRAILER:
			/*
			 * read CRC + any padding
			 */
			rv = siw_get_trailer(qp, srx);
			if (likely(!rv)) {
				/*
				 * FPDU completed.
				 * complete RDMAP message if last fragment
				 */
				srx->state = SIW_GET_HDR;
				srx->fpdu_part_rcvd = 0;

				if (!(srx->hdr.ctrl.ddp_rdmap_ctrl &
				      DDP_FLAG_LAST))
					/* more frags */
					break;

				rv = siw_rdmap_complete(qp, 0);
				run_completion = 0;
			}
			break;

		default:
			pr_warn("QP[%u]: RX out of state\n", qp_id(qp));
			rv = -EPROTO;
			run_completion = 0;
		}
		if (unlikely(rv != 0 && rv != -EAGAIN)) {
			if ((srx->state > SIW_GET_HDR ||
			     qp->rx_fpdu->more_ddp_segs) && run_completion)
				siw_rdmap_complete(qp, rv);

			siw_dbg_qp(qp, "rx error %d, rx state %d\n", rv,
				   srx->state);

			siw_qp_cm_drop(qp, 1);

			break;
		}
		if (rv) {
			siw_dbg_qp(qp, "fpdu fragment, state %d, missing %d\n",
				   srx->state, srx->fpdu_part_rem);
			break;
		}
	}
	return srx->skb_copied;
}