xref: /openbmc/linux/drivers/infiniband/sw/siw/siw_qp_rx.c (revision 04eb94d526423ff082efce61f4f26b0369d0bfdd)
1 // SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
2 
3 /* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
4 /* Copyright (c) 2008-2019, IBM Corporation */
5 
6 #include <linux/errno.h>
7 #include <linux/types.h>
8 #include <linux/net.h>
9 #include <linux/scatterlist.h>
10 #include <linux/highmem.h>
11 
12 #include <rdma/iw_cm.h>
13 #include <rdma/ib_verbs.h>
14 
15 #include "siw.h"
16 #include "siw_verbs.h"
17 #include "siw_mem.h"
18 
19 /*
20  * siw_rx_umem()
21  *
22  * Receive data of @len into target referenced by @dest_addr.
23  *
24  * @srx:	Receive Context
25  * @umem:	siw representation of target memory
26  * @dest_addr:	user virtual address
27  * @len:	number of bytes to place
28  */
29 static int siw_rx_umem(struct siw_rx_stream *srx, struct siw_umem *umem,
30 		       u64 dest_addr, int len)
31 {
32 	int copied = 0;
33 
34 	while (len) {
35 		struct page *p;
36 		int pg_off, bytes, rv;
37 		void *dest;
38 
39 		p = siw_get_upage(umem, dest_addr);
40 		if (unlikely(!p)) {
41 			pr_warn("siw: %s: [QP %u]: bogus addr: %p, %p\n",
42 				__func__, qp_id(rx_qp(srx)),
43 				(void *)dest_addr, (void *)umem->fp_addr);
44 			/* siw internal error */
45 			srx->skb_copied += copied;
46 			srx->skb_new -= copied;
47 
48 			return -EFAULT;
49 		}
50 		pg_off = dest_addr & ~PAGE_MASK;
51 		bytes = min(len, (int)PAGE_SIZE - pg_off);
52 
53 		siw_dbg_qp(rx_qp(srx), "page %p, bytes=%u\n", p, bytes);
54 
55 		dest = kmap_atomic(p);
56 		rv = skb_copy_bits(srx->skb, srx->skb_offset, dest + pg_off,
57 				   bytes);
58 
59 		if (unlikely(rv)) {
60 			kunmap_atomic(dest);
61 			srx->skb_copied += copied;
62 			srx->skb_new -= copied;
63 
64 			pr_warn("siw: [QP %u]: %s, len %d, page %p, rv %d\n",
65 				qp_id(rx_qp(srx)), __func__, len, p, rv);
66 
67 			return -EFAULT;
68 		}
69 		if (srx->mpa_crc_hd) {
70 			if (rx_qp(srx)->kernel_verbs) {
71 				crypto_shash_update(srx->mpa_crc_hd,
72 					(u8 *)(dest + pg_off), bytes);
73 				kunmap_atomic(dest);
74 			} else {
75 				kunmap_atomic(dest);
76 				/*
77 				 * Do CRC on original, not target buffer.
78 				 * Some user land applications may
79 				 * concurrently write the target buffer,
80 				 * which would yield a broken CRC.
81 				 * Walking the skb twice is very ineffcient.
82 				 * Folding the CRC into skb_copy_bits()
83 				 * would be much better, but is currently
84 				 * not supported.
85 				 */
86 				siw_crc_skb(srx, bytes);
87 			}
88 		} else {
89 			kunmap_atomic(dest);
90 		}
91 		srx->skb_offset += bytes;
92 		copied += bytes;
93 		len -= bytes;
94 		dest_addr += bytes;
95 		pg_off = 0;
96 	}
97 	srx->skb_copied += copied;
98 	srx->skb_new -= copied;
99 
100 	return copied;
101 }
102 
103 static int siw_rx_kva(struct siw_rx_stream *srx, void *kva, int len)
104 {
105 	int rv;
106 
107 	siw_dbg_qp(rx_qp(srx), "kva: 0x%p, len: %u\n", kva, len);
108 
109 	rv = skb_copy_bits(srx->skb, srx->skb_offset, kva, len);
110 	if (unlikely(rv)) {
111 		pr_warn("siw: [QP %u]: %s, len %d, kva 0x%p, rv %d\n",
112 			qp_id(rx_qp(srx)), __func__, len, kva, rv);
113 
114 		return rv;
115 	}
116 	if (srx->mpa_crc_hd)
117 		crypto_shash_update(srx->mpa_crc_hd, (u8 *)kva, len);
118 
119 	srx->skb_offset += len;
120 	srx->skb_copied += len;
121 	srx->skb_new -= len;
122 
123 	return len;
124 }
125 
126 static int siw_rx_pbl(struct siw_rx_stream *srx, int *pbl_idx,
127 		      struct siw_mem *mem, u64 addr, int len)
128 {
129 	struct siw_pbl *pbl = mem->pbl;
130 	u64 offset = addr - mem->va;
131 	int copied = 0;
132 
133 	while (len) {
134 		int bytes;
135 		u64 buf_addr =
136 			siw_pbl_get_buffer(pbl, offset, &bytes, pbl_idx);
137 		if (!buf_addr)
138 			break;
139 
140 		bytes = min(bytes, len);
141 		if (siw_rx_kva(srx, (void *)buf_addr, bytes) == bytes) {
142 			copied += bytes;
143 			offset += bytes;
144 			len -= bytes;
145 		} else {
146 			break;
147 		}
148 	}
149 	return copied;
150 }
151 
152 /*
153  * siw_rresp_check_ntoh()
154  *
155  * Check incoming RRESP fragment header against expected
156  * header values and update expected values for potential next
157  * fragment.
158  *
159  * NOTE: This function must be called only if a RRESP DDP segment
160  *       starts but not for fragmented consecutive pieces of an
161  *       already started DDP segment.
162  */
163 static int siw_rresp_check_ntoh(struct siw_rx_stream *srx,
164 				struct siw_rx_fpdu *frx)
165 {
166 	struct iwarp_rdma_rresp *rresp = &srx->hdr.rresp;
167 	struct siw_wqe *wqe = &frx->wqe_active;
168 	enum ddp_ecode ecode;
169 
170 	u32 sink_stag = be32_to_cpu(rresp->sink_stag);
171 	u64 sink_to = be64_to_cpu(rresp->sink_to);
172 
173 	if (frx->first_ddp_seg) {
174 		srx->ddp_stag = wqe->sqe.sge[0].lkey;
175 		srx->ddp_to = wqe->sqe.sge[0].laddr;
176 		frx->pbl_idx = 0;
177 	}
178 	/* Below checks extend beyond the semantics of DDP, and
179 	 * into RDMAP:
180 	 * We check if the read response matches exactly the
181 	 * read request which was send to the remote peer to
182 	 * trigger this read response. RFC5040/5041 do not
183 	 * always have a proper error code for the detected
184 	 * error cases. We choose 'base or bounds error' for
185 	 * cases where the inbound STag is valid, but offset
186 	 * or length do not match our response receive state.
187 	 */
188 	if (unlikely(srx->ddp_stag != sink_stag)) {
189 		pr_warn("siw: [QP %u]: rresp stag: %08x != %08x\n",
190 			qp_id(rx_qp(srx)), sink_stag, srx->ddp_stag);
191 		ecode = DDP_ECODE_T_INVALID_STAG;
192 		goto error;
193 	}
194 	if (unlikely(srx->ddp_to != sink_to)) {
195 		pr_warn("siw: [QP %u]: rresp off: %016llx != %016llx\n",
196 			qp_id(rx_qp(srx)), (unsigned long long)sink_to,
197 			(unsigned long long)srx->ddp_to);
198 		ecode = DDP_ECODE_T_BASE_BOUNDS;
199 		goto error;
200 	}
201 	if (unlikely(!frx->more_ddp_segs &&
202 		     (wqe->processed + srx->fpdu_part_rem != wqe->bytes))) {
203 		pr_warn("siw: [QP %u]: rresp len: %d != %d\n",
204 			qp_id(rx_qp(srx)),
205 			wqe->processed + srx->fpdu_part_rem, wqe->bytes);
206 		ecode = DDP_ECODE_T_BASE_BOUNDS;
207 		goto error;
208 	}
209 	return 0;
210 error:
211 	siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_DDP,
212 			   DDP_ETYPE_TAGGED_BUF, ecode, 0);
213 	return -EINVAL;
214 }
215 
216 /*
217  * siw_write_check_ntoh()
218  *
219  * Check incoming WRITE fragment header against expected
220  * header values and update expected values for potential next
221  * fragment
222  *
223  * NOTE: This function must be called only if a WRITE DDP segment
224  *       starts but not for fragmented consecutive pieces of an
225  *       already started DDP segment.
226  */
227 static int siw_write_check_ntoh(struct siw_rx_stream *srx,
228 				struct siw_rx_fpdu *frx)
229 {
230 	struct iwarp_rdma_write *write = &srx->hdr.rwrite;
231 	enum ddp_ecode ecode;
232 
233 	u32 sink_stag = be32_to_cpu(write->sink_stag);
234 	u64 sink_to = be64_to_cpu(write->sink_to);
235 
236 	if (frx->first_ddp_seg) {
237 		srx->ddp_stag = sink_stag;
238 		srx->ddp_to = sink_to;
239 		frx->pbl_idx = 0;
240 	} else {
241 		if (unlikely(srx->ddp_stag != sink_stag)) {
242 			pr_warn("siw: [QP %u]: write stag: %08x != %08x\n",
243 				qp_id(rx_qp(srx)), sink_stag,
244 				srx->ddp_stag);
245 			ecode = DDP_ECODE_T_INVALID_STAG;
246 			goto error;
247 		}
248 		if (unlikely(srx->ddp_to != sink_to)) {
249 			pr_warn("siw: [QP %u]: write off: %016llx != %016llx\n",
250 				qp_id(rx_qp(srx)),
251 				(unsigned long long)sink_to,
252 				(unsigned long long)srx->ddp_to);
253 			ecode = DDP_ECODE_T_BASE_BOUNDS;
254 			goto error;
255 		}
256 	}
257 	return 0;
258 error:
259 	siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_DDP,
260 			   DDP_ETYPE_TAGGED_BUF, ecode, 0);
261 	return -EINVAL;
262 }
263 
264 /*
265  * siw_send_check_ntoh()
266  *
267  * Check incoming SEND fragment header against expected
268  * header values and update expected MSN if no next
269  * fragment expected
270  *
271  * NOTE: This function must be called only if a SEND DDP segment
272  *       starts but not for fragmented consecutive pieces of an
273  *       already started DDP segment.
274  */
275 static int siw_send_check_ntoh(struct siw_rx_stream *srx,
276 			       struct siw_rx_fpdu *frx)
277 {
278 	struct iwarp_send_inv *send = &srx->hdr.send_inv;
279 	struct siw_wqe *wqe = &frx->wqe_active;
280 	enum ddp_ecode ecode;
281 
282 	u32 ddp_msn = be32_to_cpu(send->ddp_msn);
283 	u32 ddp_mo = be32_to_cpu(send->ddp_mo);
284 	u32 ddp_qn = be32_to_cpu(send->ddp_qn);
285 
286 	if (unlikely(ddp_qn != RDMAP_UNTAGGED_QN_SEND)) {
287 		pr_warn("siw: [QP %u]: invalid ddp qn %d for send\n",
288 			qp_id(rx_qp(srx)), ddp_qn);
289 		ecode = DDP_ECODE_UT_INVALID_QN;
290 		goto error;
291 	}
292 	if (unlikely(ddp_msn != srx->ddp_msn[RDMAP_UNTAGGED_QN_SEND])) {
293 		pr_warn("siw: [QP %u]: send msn: %u != %u\n",
294 			qp_id(rx_qp(srx)), ddp_msn,
295 			srx->ddp_msn[RDMAP_UNTAGGED_QN_SEND]);
296 		ecode = DDP_ECODE_UT_INVALID_MSN_RANGE;
297 		goto error;
298 	}
299 	if (unlikely(ddp_mo != wqe->processed)) {
300 		pr_warn("siw: [QP %u], send mo: %u != %u\n",
301 			qp_id(rx_qp(srx)), ddp_mo, wqe->processed);
302 		ecode = DDP_ECODE_UT_INVALID_MO;
303 		goto error;
304 	}
305 	if (frx->first_ddp_seg) {
306 		/* initialize user memory write position */
307 		frx->sge_idx = 0;
308 		frx->sge_off = 0;
309 		frx->pbl_idx = 0;
310 
311 		/* only valid for SEND_INV and SEND_SE_INV operations */
312 		srx->inval_stag = be32_to_cpu(send->inval_stag);
313 	}
314 	if (unlikely(wqe->bytes < wqe->processed + srx->fpdu_part_rem)) {
315 		siw_dbg_qp(rx_qp(srx), "receive space short: %d - %d < %d\n",
316 			   wqe->bytes, wqe->processed, srx->fpdu_part_rem);
317 		wqe->wc_status = SIW_WC_LOC_LEN_ERR;
318 		ecode = DDP_ECODE_UT_INVALID_MSN_NOBUF;
319 		goto error;
320 	}
321 	return 0;
322 error:
323 	siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_DDP,
324 			   DDP_ETYPE_UNTAGGED_BUF, ecode, 0);
325 	return -EINVAL;
326 }
327 
328 static struct siw_wqe *siw_rqe_get(struct siw_qp *qp)
329 {
330 	struct siw_rqe *rqe;
331 	struct siw_srq *srq;
332 	struct siw_wqe *wqe = NULL;
333 	bool srq_event = false;
334 	unsigned long flags;
335 
336 	srq = qp->srq;
337 	if (srq) {
338 		spin_lock_irqsave(&srq->lock, flags);
339 		if (unlikely(!srq->num_rqe))
340 			goto out;
341 
342 		rqe = &srq->recvq[srq->rq_get % srq->num_rqe];
343 	} else {
344 		if (unlikely(!qp->recvq))
345 			goto out;
346 
347 		rqe = &qp->recvq[qp->rq_get % qp->attrs.rq_size];
348 	}
349 	if (likely(rqe->flags == SIW_WQE_VALID)) {
350 		int num_sge = rqe->num_sge;
351 
352 		if (likely(num_sge <= SIW_MAX_SGE)) {
353 			int i = 0;
354 
355 			wqe = rx_wqe(&qp->rx_untagged);
356 			rx_type(wqe) = SIW_OP_RECEIVE;
357 			wqe->wr_status = SIW_WR_INPROGRESS;
358 			wqe->bytes = 0;
359 			wqe->processed = 0;
360 
361 			wqe->rqe.id = rqe->id;
362 			wqe->rqe.num_sge = num_sge;
363 
364 			while (i < num_sge) {
365 				wqe->rqe.sge[i].laddr = rqe->sge[i].laddr;
366 				wqe->rqe.sge[i].lkey = rqe->sge[i].lkey;
367 				wqe->rqe.sge[i].length = rqe->sge[i].length;
368 				wqe->bytes += wqe->rqe.sge[i].length;
369 				wqe->mem[i] = NULL;
370 				i++;
371 			}
372 			/* can be re-used by appl */
373 			smp_store_mb(rqe->flags, 0);
374 		} else {
375 			siw_dbg_qp(qp, "too many sge's: %d\n", rqe->num_sge);
376 			if (srq)
377 				spin_unlock_irqrestore(&srq->lock, flags);
378 			return NULL;
379 		}
380 		if (!srq) {
381 			qp->rq_get++;
382 		} else {
383 			if (srq->armed) {
384 				/* Test SRQ limit */
385 				u32 off = (srq->rq_get + srq->limit) %
386 					  srq->num_rqe;
387 				struct siw_rqe *rqe2 = &srq->recvq[off];
388 
389 				if (!(rqe2->flags & SIW_WQE_VALID)) {
390 					srq->armed = 0;
391 					srq_event = true;
392 				}
393 			}
394 			srq->rq_get++;
395 		}
396 	}
397 out:
398 	if (srq) {
399 		spin_unlock_irqrestore(&srq->lock, flags);
400 		if (srq_event)
401 			siw_srq_event(srq, IB_EVENT_SRQ_LIMIT_REACHED);
402 	}
403 	return wqe;
404 }
405 
406 /*
407  * siw_proc_send:
408  *
409  * Process one incoming SEND and place data into memory referenced by
410  * receive wqe.
411  *
412  * Function supports partially received sends (suspending/resuming
413  * current receive wqe processing)
414  *
415  * return value:
416  *	0:       reached the end of a DDP segment
417  *	-EAGAIN: to be called again to finish the DDP segment
418  */
419 int siw_proc_send(struct siw_qp *qp)
420 {
421 	struct siw_rx_stream *srx = &qp->rx_stream;
422 	struct siw_rx_fpdu *frx = &qp->rx_untagged;
423 	struct siw_wqe *wqe;
424 	u32 data_bytes; /* all data bytes available */
425 	u32 rcvd_bytes; /* sum of data bytes rcvd */
426 	int rv = 0;
427 
428 	if (frx->first_ddp_seg) {
429 		wqe = siw_rqe_get(qp);
430 		if (unlikely(!wqe)) {
431 			siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
432 					   DDP_ETYPE_UNTAGGED_BUF,
433 					   DDP_ECODE_UT_INVALID_MSN_NOBUF, 0);
434 			return -ENOENT;
435 		}
436 	} else {
437 		wqe = rx_wqe(frx);
438 	}
439 	if (srx->state == SIW_GET_DATA_START) {
440 		rv = siw_send_check_ntoh(srx, frx);
441 		if (unlikely(rv)) {
442 			siw_qp_event(qp, IB_EVENT_QP_FATAL);
443 			return rv;
444 		}
445 		if (!srx->fpdu_part_rem) /* zero length SEND */
446 			return 0;
447 	}
448 	data_bytes = min(srx->fpdu_part_rem, srx->skb_new);
449 	rcvd_bytes = 0;
450 
451 	/* A zero length SEND will skip below loop */
452 	while (data_bytes) {
453 		struct ib_pd *pd;
454 		struct siw_mem **mem, *mem_p;
455 		struct siw_sge *sge;
456 		u32 sge_bytes; /* data bytes avail for SGE */
457 
458 		sge = &wqe->rqe.sge[frx->sge_idx];
459 
460 		if (!sge->length) {
461 			/* just skip empty sge's */
462 			frx->sge_idx++;
463 			frx->sge_off = 0;
464 			frx->pbl_idx = 0;
465 			continue;
466 		}
467 		sge_bytes = min(data_bytes, sge->length - frx->sge_off);
468 		mem = &wqe->mem[frx->sge_idx];
469 
470 		/*
471 		 * check with QP's PD if no SRQ present, SRQ's PD otherwise
472 		 */
473 		pd = qp->srq == NULL ? qp->pd : qp->srq->base_srq.pd;
474 
475 		rv = siw_check_sge(pd, sge, mem, IB_ACCESS_LOCAL_WRITE,
476 				   frx->sge_off, sge_bytes);
477 		if (unlikely(rv)) {
478 			siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
479 					   DDP_ETYPE_CATASTROPHIC,
480 					   DDP_ECODE_CATASTROPHIC, 0);
481 
482 			siw_qp_event(qp, IB_EVENT_QP_ACCESS_ERR);
483 			break;
484 		}
485 		mem_p = *mem;
486 		if (mem_p->mem_obj == NULL)
487 			rv = siw_rx_kva(srx,
488 					(void *)(sge->laddr + frx->sge_off),
489 					sge_bytes);
490 		else if (!mem_p->is_pbl)
491 			rv = siw_rx_umem(srx, mem_p->umem,
492 					 sge->laddr + frx->sge_off, sge_bytes);
493 		else
494 			rv = siw_rx_pbl(srx, &frx->pbl_idx, mem_p,
495 					sge->laddr + frx->sge_off, sge_bytes);
496 
497 		if (unlikely(rv != sge_bytes)) {
498 			wqe->processed += rcvd_bytes;
499 
500 			siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
501 					   DDP_ETYPE_CATASTROPHIC,
502 					   DDP_ECODE_CATASTROPHIC, 0);
503 			return -EINVAL;
504 		}
505 		frx->sge_off += rv;
506 
507 		if (frx->sge_off == sge->length) {
508 			frx->sge_idx++;
509 			frx->sge_off = 0;
510 			frx->pbl_idx = 0;
511 		}
512 		data_bytes -= rv;
513 		rcvd_bytes += rv;
514 
515 		srx->fpdu_part_rem -= rv;
516 		srx->fpdu_part_rcvd += rv;
517 	}
518 	wqe->processed += rcvd_bytes;
519 
520 	if (!srx->fpdu_part_rem)
521 		return 0;
522 
523 	return (rv < 0) ? rv : -EAGAIN;
524 }
525 
526 /*
527  * siw_proc_write:
528  *
529  * Place incoming WRITE after referencing and checking target buffer
530 
531  * Function supports partially received WRITEs (suspending/resuming
532  * current receive processing)
533  *
534  * return value:
535  *	0:       reached the end of a DDP segment
536  *	-EAGAIN: to be called again to finish the DDP segment
537  */
538 int siw_proc_write(struct siw_qp *qp)
539 {
540 	struct siw_rx_stream *srx = &qp->rx_stream;
541 	struct siw_rx_fpdu *frx = &qp->rx_tagged;
542 	struct siw_mem *mem;
543 	int bytes, rv;
544 
545 	if (srx->state == SIW_GET_DATA_START) {
546 		if (!srx->fpdu_part_rem) /* zero length WRITE */
547 			return 0;
548 
549 		rv = siw_write_check_ntoh(srx, frx);
550 		if (unlikely(rv)) {
551 			siw_qp_event(qp, IB_EVENT_QP_FATAL);
552 			return rv;
553 		}
554 	}
555 	bytes = min(srx->fpdu_part_rem, srx->skb_new);
556 
557 	if (frx->first_ddp_seg) {
558 		struct siw_wqe *wqe = rx_wqe(frx);
559 
560 		rx_mem(frx) = siw_mem_id2obj(qp->sdev, srx->ddp_stag >> 8);
561 		if (unlikely(!rx_mem(frx))) {
562 			siw_dbg_qp(qp,
563 				   "sink stag not found/invalid, stag 0x%08x\n",
564 				   srx->ddp_stag);
565 
566 			siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
567 					   DDP_ETYPE_TAGGED_BUF,
568 					   DDP_ECODE_T_INVALID_STAG, 0);
569 			return -EINVAL;
570 		}
571 		wqe->rqe.num_sge = 1;
572 		rx_type(wqe) = SIW_OP_WRITE;
573 		wqe->wr_status = SIW_WR_INPROGRESS;
574 	}
575 	mem = rx_mem(frx);
576 
577 	/*
578 	 * Check if application re-registered memory with different
579 	 * key field of STag.
580 	 */
581 	if (unlikely(mem->stag != srx->ddp_stag)) {
582 		siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
583 				   DDP_ETYPE_TAGGED_BUF,
584 				   DDP_ECODE_T_INVALID_STAG, 0);
585 		return -EINVAL;
586 	}
587 	rv = siw_check_mem(qp->pd, mem, srx->ddp_to + srx->fpdu_part_rcvd,
588 			   IB_ACCESS_REMOTE_WRITE, bytes);
589 	if (unlikely(rv)) {
590 		siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
591 				   DDP_ETYPE_TAGGED_BUF, siw_tagged_error(-rv),
592 				   0);
593 
594 		siw_qp_event(qp, IB_EVENT_QP_ACCESS_ERR);
595 
596 		return -EINVAL;
597 	}
598 
599 	if (mem->mem_obj == NULL)
600 		rv = siw_rx_kva(srx,
601 				(void *)(srx->ddp_to + srx->fpdu_part_rcvd),
602 				bytes);
603 	else if (!mem->is_pbl)
604 		rv = siw_rx_umem(srx, mem->umem,
605 				 srx->ddp_to + srx->fpdu_part_rcvd, bytes);
606 	else
607 		rv = siw_rx_pbl(srx, &frx->pbl_idx, mem,
608 				srx->ddp_to + srx->fpdu_part_rcvd, bytes);
609 
610 	if (unlikely(rv != bytes)) {
611 		siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
612 				   DDP_ETYPE_CATASTROPHIC,
613 				   DDP_ECODE_CATASTROPHIC, 0);
614 		return -EINVAL;
615 	}
616 	srx->fpdu_part_rem -= rv;
617 	srx->fpdu_part_rcvd += rv;
618 
619 	if (!srx->fpdu_part_rem) {
620 		srx->ddp_to += srx->fpdu_part_rcvd;
621 		return 0;
622 	}
623 	return -EAGAIN;
624 }
625 
626 /*
627  * Inbound RREQ's cannot carry user data.
628  */
629 int siw_proc_rreq(struct siw_qp *qp)
630 {
631 	struct siw_rx_stream *srx = &qp->rx_stream;
632 
633 	if (!srx->fpdu_part_rem)
634 		return 0;
635 
636 	pr_warn("siw: [QP %u]: rreq with mpa len %d\n", qp_id(qp),
637 		be16_to_cpu(srx->hdr.ctrl.mpa_len));
638 
639 	return -EPROTO;
640 }
641 
642 /*
643  * siw_init_rresp:
644  *
645  * Process inbound RDMA READ REQ. Produce a pseudo READ RESPONSE WQE.
646  * Put it at the tail of the IRQ, if there is another WQE currently in
647  * transmit processing. If not, make it the current WQE to be processed
648  * and schedule transmit processing.
649  *
650  * Can be called from softirq context and from process
651  * context (RREAD socket loopback case!)
652  *
653  * return value:
654  *	0:      success,
655  *		failure code otherwise
656  */
657 
658 static int siw_init_rresp(struct siw_qp *qp, struct siw_rx_stream *srx)
659 {
660 	struct siw_wqe *tx_work = tx_wqe(qp);
661 	struct siw_sqe *resp;
662 
663 	uint64_t raddr = be64_to_cpu(srx->hdr.rreq.sink_to),
664 		 laddr = be64_to_cpu(srx->hdr.rreq.source_to);
665 	uint32_t length = be32_to_cpu(srx->hdr.rreq.read_size),
666 		 lkey = be32_to_cpu(srx->hdr.rreq.source_stag),
667 		 rkey = be32_to_cpu(srx->hdr.rreq.sink_stag),
668 		 msn = be32_to_cpu(srx->hdr.rreq.ddp_msn);
669 
670 	int run_sq = 1, rv = 0;
671 	unsigned long flags;
672 
673 	if (unlikely(msn != srx->ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ])) {
674 		siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
675 				   DDP_ETYPE_UNTAGGED_BUF,
676 				   DDP_ECODE_UT_INVALID_MSN_RANGE, 0);
677 		return -EPROTO;
678 	}
679 	spin_lock_irqsave(&qp->sq_lock, flags);
680 
681 	if (tx_work->wr_status == SIW_WR_IDLE) {
682 		/*
683 		 * immediately schedule READ response w/o
684 		 * consuming IRQ entry: IRQ must be empty.
685 		 */
686 		tx_work->processed = 0;
687 		tx_work->mem[0] = NULL;
688 		tx_work->wr_status = SIW_WR_QUEUED;
689 		resp = &tx_work->sqe;
690 	} else {
691 		resp = irq_alloc_free(qp);
692 		run_sq = 0;
693 	}
694 	if (likely(resp)) {
695 		resp->opcode = SIW_OP_READ_RESPONSE;
696 
697 		resp->sge[0].length = length;
698 		resp->sge[0].laddr = laddr;
699 		resp->sge[0].lkey = lkey;
700 
701 		/* Keep aside message sequence number for potential
702 		 * error reporting during Read Response generation.
703 		 */
704 		resp->sge[1].length = msn;
705 
706 		resp->raddr = raddr;
707 		resp->rkey = rkey;
708 		resp->num_sge = length ? 1 : 0;
709 
710 		/* RRESP now valid as current TX wqe or placed into IRQ */
711 		smp_store_mb(resp->flags, SIW_WQE_VALID);
712 	} else {
713 		pr_warn("siw: [QP %u]: irq %d exceeded %d\n", qp_id(qp),
714 			qp->irq_put % qp->attrs.irq_size, qp->attrs.irq_size);
715 
716 		siw_init_terminate(qp, TERM_ERROR_LAYER_RDMAP,
717 				   RDMAP_ETYPE_REMOTE_OPERATION,
718 				   RDMAP_ECODE_CATASTROPHIC_STREAM, 0);
719 		rv = -EPROTO;
720 	}
721 
722 	spin_unlock_irqrestore(&qp->sq_lock, flags);
723 
724 	if (run_sq)
725 		rv = siw_sq_start(qp);
726 
727 	return rv;
728 }
729 
730 /*
731  * Only called at start of Read.Resonse processing.
732  * Transfer pending Read from tip of ORQ into currrent rx wqe,
733  * but keep ORQ entry valid until Read.Response processing done.
734  * No Queue locking needed.
735  */
736 static int siw_orqe_start_rx(struct siw_qp *qp)
737 {
738 	struct siw_sqe *orqe;
739 	struct siw_wqe *wqe = NULL;
740 
741 	/* make sure ORQ indices are current */
742 	smp_mb();
743 
744 	orqe = orq_get_current(qp);
745 	if (READ_ONCE(orqe->flags) & SIW_WQE_VALID) {
746 		/* RRESP is a TAGGED RDMAP operation */
747 		wqe = rx_wqe(&qp->rx_tagged);
748 		wqe->sqe.id = orqe->id;
749 		wqe->sqe.opcode = orqe->opcode;
750 		wqe->sqe.sge[0].laddr = orqe->sge[0].laddr;
751 		wqe->sqe.sge[0].lkey = orqe->sge[0].lkey;
752 		wqe->sqe.sge[0].length = orqe->sge[0].length;
753 		wqe->sqe.flags = orqe->flags;
754 		wqe->sqe.num_sge = 1;
755 		wqe->bytes = orqe->sge[0].length;
756 		wqe->processed = 0;
757 		wqe->mem[0] = NULL;
758 		/* make sure WQE is completely written before valid */
759 		smp_wmb();
760 		wqe->wr_status = SIW_WR_INPROGRESS;
761 
762 		return 0;
763 	}
764 	return -EPROTO;
765 }
766 
767 /*
768  * siw_proc_rresp:
769  *
770  * Place incoming RRESP data into memory referenced by RREQ WQE
771  * which is at the tip of the ORQ
772  *
773  * Function supports partially received RRESP's (suspending/resuming
774  * current receive processing)
775  */
776 int siw_proc_rresp(struct siw_qp *qp)
777 {
778 	struct siw_rx_stream *srx = &qp->rx_stream;
779 	struct siw_rx_fpdu *frx = &qp->rx_tagged;
780 	struct siw_wqe *wqe = rx_wqe(frx);
781 	struct siw_mem **mem, *mem_p;
782 	struct siw_sge *sge;
783 	int bytes, rv;
784 
785 	if (frx->first_ddp_seg) {
786 		if (unlikely(wqe->wr_status != SIW_WR_IDLE)) {
787 			pr_warn("siw: [QP %u]: proc RRESP: status %d, op %d\n",
788 				qp_id(qp), wqe->wr_status, wqe->sqe.opcode);
789 			rv = -EPROTO;
790 			goto error_term;
791 		}
792 		/*
793 		 * fetch pending RREQ from orq
794 		 */
795 		rv = siw_orqe_start_rx(qp);
796 		if (rv) {
797 			pr_warn("siw: [QP %u]: ORQ empty at idx %d\n",
798 				qp_id(qp), qp->orq_get % qp->attrs.orq_size);
799 			goto error_term;
800 		}
801 		rv = siw_rresp_check_ntoh(srx, frx);
802 		if (unlikely(rv)) {
803 			siw_qp_event(qp, IB_EVENT_QP_FATAL);
804 			return rv;
805 		}
806 	} else {
807 		if (unlikely(wqe->wr_status != SIW_WR_INPROGRESS)) {
808 			pr_warn("siw: [QP %u]: resume RRESP: status %d\n",
809 				qp_id(qp), wqe->wr_status);
810 			rv = -EPROTO;
811 			goto error_term;
812 		}
813 	}
814 	if (!srx->fpdu_part_rem) /* zero length RRESPONSE */
815 		return 0;
816 
817 	sge = wqe->sqe.sge; /* there is only one */
818 	mem = &wqe->mem[0];
819 
820 	if (!(*mem)) {
821 		/*
822 		 * check target memory which resolves memory on first fragment
823 		 */
824 		rv = siw_check_sge(qp->pd, sge, mem, IB_ACCESS_LOCAL_WRITE, 0,
825 				   wqe->bytes);
826 		if (unlikely(rv)) {
827 			siw_dbg_qp(qp, "target mem check: %d\n", rv);
828 			wqe->wc_status = SIW_WC_LOC_PROT_ERR;
829 
830 			siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
831 					   DDP_ETYPE_TAGGED_BUF,
832 					   siw_tagged_error(-rv), 0);
833 
834 			siw_qp_event(qp, IB_EVENT_QP_ACCESS_ERR);
835 
836 			return -EINVAL;
837 		}
838 	}
839 	mem_p = *mem;
840 
841 	bytes = min(srx->fpdu_part_rem, srx->skb_new);
842 
843 	if (mem_p->mem_obj == NULL)
844 		rv = siw_rx_kva(srx, (void *)(sge->laddr + wqe->processed),
845 				bytes);
846 	else if (!mem_p->is_pbl)
847 		rv = siw_rx_umem(srx, mem_p->umem, sge->laddr + wqe->processed,
848 				 bytes);
849 	else
850 		rv = siw_rx_pbl(srx, &frx->pbl_idx, mem_p,
851 				sge->laddr + wqe->processed, bytes);
852 	if (rv != bytes) {
853 		wqe->wc_status = SIW_WC_GENERAL_ERR;
854 		rv = -EINVAL;
855 		goto error_term;
856 	}
857 	srx->fpdu_part_rem -= rv;
858 	srx->fpdu_part_rcvd += rv;
859 	wqe->processed += rv;
860 
861 	if (!srx->fpdu_part_rem) {
862 		srx->ddp_to += srx->fpdu_part_rcvd;
863 		return 0;
864 	}
865 	return -EAGAIN;
866 
867 error_term:
868 	siw_init_terminate(qp, TERM_ERROR_LAYER_DDP, DDP_ETYPE_CATASTROPHIC,
869 			   DDP_ECODE_CATASTROPHIC, 0);
870 	return rv;
871 }
872 
873 int siw_proc_terminate(struct siw_qp *qp)
874 {
875 	struct siw_rx_stream *srx = &qp->rx_stream;
876 	struct sk_buff *skb = srx->skb;
877 	struct iwarp_terminate *term = &srx->hdr.terminate;
878 	union iwarp_hdr term_info;
879 	u8 *infop = (u8 *)&term_info;
880 	enum rdma_opcode op;
881 	u16 to_copy = sizeof(struct iwarp_ctrl);
882 
883 	pr_warn("siw: got TERMINATE. layer %d, type %d, code %d\n",
884 		__rdmap_term_layer(term), __rdmap_term_etype(term),
885 		__rdmap_term_ecode(term));
886 
887 	if (be32_to_cpu(term->ddp_qn) != RDMAP_UNTAGGED_QN_TERMINATE ||
888 	    be32_to_cpu(term->ddp_msn) !=
889 		    qp->rx_stream.ddp_msn[RDMAP_UNTAGGED_QN_TERMINATE] ||
890 	    be32_to_cpu(term->ddp_mo) != 0) {
891 		pr_warn("siw: rx bogus TERM [QN x%08x, MSN x%08x, MO x%08x]\n",
892 			be32_to_cpu(term->ddp_qn), be32_to_cpu(term->ddp_msn),
893 			be32_to_cpu(term->ddp_mo));
894 		return -ECONNRESET;
895 	}
896 	/*
897 	 * Receive remaining pieces of TERM if indicated
898 	 */
899 	if (!term->flag_m)
900 		return -ECONNRESET;
901 
902 	/* Do not take the effort to reassemble a network fragmented
903 	 * TERM message
904 	 */
905 	if (srx->skb_new < sizeof(struct iwarp_ctrl_tagged))
906 		return -ECONNRESET;
907 
908 	memset(infop, 0, sizeof(term_info));
909 
910 	skb_copy_bits(skb, srx->skb_offset, infop, to_copy);
911 
912 	op = __rdmap_get_opcode(&term_info.ctrl);
913 	if (op >= RDMAP_TERMINATE)
914 		goto out;
915 
916 	infop += to_copy;
917 	srx->skb_offset += to_copy;
918 	srx->skb_new -= to_copy;
919 	srx->skb_copied += to_copy;
920 	srx->fpdu_part_rcvd += to_copy;
921 	srx->fpdu_part_rem -= to_copy;
922 
923 	to_copy = iwarp_pktinfo[op].hdr_len - to_copy;
924 
925 	/* Again, no network fragmented TERM's */
926 	if (to_copy + MPA_CRC_SIZE > srx->skb_new)
927 		return -ECONNRESET;
928 
929 	skb_copy_bits(skb, srx->skb_offset, infop, to_copy);
930 
931 	if (term->flag_r) {
932 		siw_dbg_qp(qp, "TERM reports RDMAP hdr type %u, len %u (%s)\n",
933 			   op, be16_to_cpu(term_info.ctrl.mpa_len),
934 			   term->flag_m ? "valid" : "invalid");
935 	} else if (term->flag_d) {
936 		siw_dbg_qp(qp, "TERM reports DDP hdr type %u, len %u (%s)\n",
937 			   op, be16_to_cpu(term_info.ctrl.mpa_len),
938 			   term->flag_m ? "valid" : "invalid");
939 	}
940 out:
941 	srx->skb_new -= to_copy;
942 	srx->skb_offset += to_copy;
943 	srx->skb_copied += to_copy;
944 	srx->fpdu_part_rcvd += to_copy;
945 	srx->fpdu_part_rem -= to_copy;
946 
947 	return -ECONNRESET;
948 }
949 
950 static int siw_get_trailer(struct siw_qp *qp, struct siw_rx_stream *srx)
951 {
952 	struct sk_buff *skb = srx->skb;
953 	u8 *tbuf = (u8 *)&srx->trailer.crc - srx->pad;
954 	__wsum crc_in, crc_own = 0;
955 
956 	siw_dbg_qp(qp, "expected %d, available %d, pad %u\n",
957 		   srx->fpdu_part_rem, srx->skb_new, srx->pad);
958 
959 	if (srx->skb_new < srx->fpdu_part_rem)
960 		return -EAGAIN;
961 
962 	skb_copy_bits(skb, srx->skb_offset, tbuf, srx->fpdu_part_rem);
963 
964 	if (srx->mpa_crc_hd && srx->pad)
965 		crypto_shash_update(srx->mpa_crc_hd, tbuf, srx->pad);
966 
967 	srx->skb_new -= srx->fpdu_part_rem;
968 	srx->skb_offset += srx->fpdu_part_rem;
969 	srx->skb_copied += srx->fpdu_part_rem;
970 
971 	if (!srx->mpa_crc_hd)
972 		return 0;
973 
974 	/*
975 	 * CRC32 is computed, transmitted and received directly in NBO,
976 	 * so there's never a reason to convert byte order.
977 	 */
978 	crypto_shash_final(srx->mpa_crc_hd, (u8 *)&crc_own);
979 	crc_in = (__force __wsum)srx->trailer.crc;
980 
981 	if (unlikely(crc_in != crc_own)) {
982 		pr_warn("siw: crc error. in: %08x, own %08x, op %u\n",
983 			crc_in, crc_own, qp->rx_stream.rdmap_op);
984 
985 		siw_init_terminate(qp, TERM_ERROR_LAYER_LLP,
986 				   LLP_ETYPE_MPA,
987 				   LLP_ECODE_RECEIVED_CRC, 0);
988 		return -EINVAL;
989 	}
990 	return 0;
991 }
992 
/*
 * Size of the tagged DDP control header. siw_get_hdr() receives this
 * many bytes first, before the opcode specific header length is known.
 */
#define MIN_DDP_HDR sizeof(struct iwarp_ctrl_tagged)
994 
995 static int siw_get_hdr(struct siw_rx_stream *srx)
996 {
997 	struct sk_buff *skb = srx->skb;
998 	struct siw_qp *qp = rx_qp(srx);
999 	struct iwarp_ctrl *c_hdr = &srx->hdr.ctrl;
1000 	struct siw_rx_fpdu *frx;
1001 	u8 opcode;
1002 	int bytes;
1003 
1004 	if (srx->fpdu_part_rcvd < MIN_DDP_HDR) {
1005 		/*
1006 		 * copy a mimimum sized (tagged) DDP frame control part
1007 		 */
1008 		bytes = min_t(int, srx->skb_new,
1009 			      MIN_DDP_HDR - srx->fpdu_part_rcvd);
1010 
1011 		skb_copy_bits(skb, srx->skb_offset,
1012 			      (char *)c_hdr + srx->fpdu_part_rcvd, bytes);
1013 
1014 		srx->fpdu_part_rcvd += bytes;
1015 
1016 		srx->skb_new -= bytes;
1017 		srx->skb_offset += bytes;
1018 		srx->skb_copied += bytes;
1019 
1020 		if (srx->fpdu_part_rcvd < MIN_DDP_HDR)
1021 			return -EAGAIN;
1022 
1023 		if (unlikely(__ddp_get_version(c_hdr) != DDP_VERSION)) {
1024 			enum ddp_etype etype;
1025 			enum ddp_ecode ecode;
1026 
1027 			pr_warn("siw: received ddp version unsupported %d\n",
1028 				__ddp_get_version(c_hdr));
1029 
1030 			if (c_hdr->ddp_rdmap_ctrl & DDP_FLAG_TAGGED) {
1031 				etype = DDP_ETYPE_TAGGED_BUF;
1032 				ecode = DDP_ECODE_T_VERSION;
1033 			} else {
1034 				etype = DDP_ETYPE_UNTAGGED_BUF;
1035 				ecode = DDP_ECODE_UT_VERSION;
1036 			}
1037 			siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_DDP,
1038 					   etype, ecode, 0);
1039 			return -EINVAL;
1040 		}
1041 		if (unlikely(__rdmap_get_version(c_hdr) != RDMAP_VERSION)) {
1042 			pr_warn("siw: received rdmap version unsupported %d\n",
1043 				__rdmap_get_version(c_hdr));
1044 
1045 			siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_RDMAP,
1046 					   RDMAP_ETYPE_REMOTE_OPERATION,
1047 					   RDMAP_ECODE_VERSION, 0);
1048 			return -EINVAL;
1049 		}
1050 		opcode = __rdmap_get_opcode(c_hdr);
1051 
1052 		if (opcode > RDMAP_TERMINATE) {
1053 			pr_warn("siw: received unknown packet type %u\n",
1054 				opcode);
1055 
1056 			siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_RDMAP,
1057 					   RDMAP_ETYPE_REMOTE_OPERATION,
1058 					   RDMAP_ECODE_OPCODE, 0);
1059 			return -EINVAL;
1060 		}
1061 		siw_dbg_qp(rx_qp(srx), "new header, opcode %u\n", opcode);
1062 	} else {
1063 		opcode = __rdmap_get_opcode(c_hdr);
1064 	}
1065 	set_rx_fpdu_context(qp, opcode);
1066 	frx = qp->rx_fpdu;
1067 
1068 	/*
1069 	 * Figure out len of current hdr: variable length of
1070 	 * iwarp hdr may force us to copy hdr information in
1071 	 * two steps. Only tagged DDP messages are already
1072 	 * completely received.
1073 	 */
1074 	if (iwarp_pktinfo[opcode].hdr_len > sizeof(struct iwarp_ctrl_tagged)) {
1075 		bytes = iwarp_pktinfo[opcode].hdr_len - MIN_DDP_HDR;
1076 
1077 		if (srx->skb_new < bytes)
1078 			return -EAGAIN;
1079 
1080 		skb_copy_bits(skb, srx->skb_offset,
1081 			      (char *)c_hdr + srx->fpdu_part_rcvd, bytes);
1082 
1083 		srx->fpdu_part_rcvd += bytes;
1084 
1085 		srx->skb_new -= bytes;
1086 		srx->skb_offset += bytes;
1087 		srx->skb_copied += bytes;
1088 	}
1089 
1090 	/*
1091 	 * DDP/RDMAP header receive completed. Check if the current
1092 	 * DDP segment starts a new RDMAP message or continues a previously
1093 	 * started RDMAP message.
1094 	 *
1095 	 * Alternating reception of DDP segments (or FPDUs) from incomplete
1096 	 * tagged and untagged RDMAP messages is supported, as long as
1097 	 * the current tagged or untagged message gets eventually completed
1098 	 * w/o intersection from another message of the same type
1099 	 * (tagged/untagged). E.g., a WRITE can get intersected by a SEND,
1100 	 * but not by a READ RESPONSE etc.
1101 	 */
1102 	if (srx->mpa_crc_hd) {
1103 		/*
1104 		 * Restart CRC computation
1105 		 */
1106 		crypto_shash_init(srx->mpa_crc_hd);
1107 		crypto_shash_update(srx->mpa_crc_hd, (u8 *)c_hdr,
1108 				    srx->fpdu_part_rcvd);
1109 	}
1110 	if (frx->more_ddp_segs) {
1111 		frx->first_ddp_seg = 0;
1112 		if (frx->prev_rdmap_op != opcode) {
1113 			pr_warn("siw: packet intersection: %u : %u\n",
1114 				frx->prev_rdmap_op, opcode);
1115 			/*
1116 			 * The last inbound RDMA operation of same type
1117 			 * (tagged or untagged) is left unfinished.
1118 			 * To complete it in error, make it the current
1119 			 * operation again, even with the header already
1120 			 * overwritten. For error handling, only the opcode
1121 			 * and current rx context are relevant.
1122 			 */
1123 			set_rx_fpdu_context(qp, frx->prev_rdmap_op);
1124 			__rdmap_set_opcode(c_hdr, frx->prev_rdmap_op);
1125 			return -EPROTO;
1126 		}
1127 	} else {
1128 		frx->prev_rdmap_op = opcode;
1129 		frx->first_ddp_seg = 1;
1130 	}
1131 	frx->more_ddp_segs = c_hdr->ddp_rdmap_ctrl & DDP_FLAG_LAST ? 0 : 1;
1132 
1133 	return 0;
1134 }
1135 
1136 static int siw_check_tx_fence(struct siw_qp *qp)
1137 {
1138 	struct siw_wqe *tx_waiting = tx_wqe(qp);
1139 	struct siw_sqe *rreq;
1140 	int resume_tx = 0, rv = 0;
1141 	unsigned long flags;
1142 
1143 	spin_lock_irqsave(&qp->orq_lock, flags);
1144 
1145 	rreq = orq_get_current(qp);
1146 
1147 	/* free current orq entry */
1148 	WRITE_ONCE(rreq->flags, 0);
1149 
1150 	if (qp->tx_ctx.orq_fence) {
1151 		if (unlikely(tx_waiting->wr_status != SIW_WR_QUEUED)) {
1152 			pr_warn("siw: [QP %u]: fence resume: bad status %d\n",
1153 				qp_id(qp), tx_waiting->wr_status);
1154 			rv = -EPROTO;
1155 			goto out;
1156 		}
1157 		/* resume SQ processing */
1158 		if (tx_waiting->sqe.opcode == SIW_OP_READ ||
1159 		    tx_waiting->sqe.opcode == SIW_OP_READ_LOCAL_INV) {
1160 			rreq = orq_get_tail(qp);
1161 			if (unlikely(!rreq)) {
1162 				pr_warn("siw: [QP %u]: no ORQE\n", qp_id(qp));
1163 				rv = -EPROTO;
1164 				goto out;
1165 			}
1166 			siw_read_to_orq(rreq, &tx_waiting->sqe);
1167 
1168 			qp->orq_put++;
1169 			qp->tx_ctx.orq_fence = 0;
1170 			resume_tx = 1;
1171 
1172 		} else if (siw_orq_empty(qp)) {
1173 			qp->tx_ctx.orq_fence = 0;
1174 			resume_tx = 1;
1175 		} else {
1176 			pr_warn("siw: [QP %u]: fence resume: orq idx: %d:%d\n",
1177 				qp_id(qp), qp->orq_get, qp->orq_put);
1178 			rv = -EPROTO;
1179 		}
1180 	}
1181 	qp->orq_get++;
1182 out:
1183 	spin_unlock_irqrestore(&qp->orq_lock, flags);
1184 
1185 	if (resume_tx)
1186 		rv = siw_sq_start(qp);
1187 
1188 	return rv;
1189 }
1190 
1191 /*
1192  * siw_rdmap_complete()
1193  *
1194  * Complete processing of an RDMA message after receiving all
1195  * DDP segmens or ABort processing after encountering error case.
1196  *
1197  *   o SENDs + RRESPs will need for completion,
1198  *   o RREQs need for  READ RESPONSE initialization
1199  *   o WRITEs need memory dereferencing
1200  *
1201  * TODO: Failed WRITEs need local error to be surfaced.
1202  */
1203 static int siw_rdmap_complete(struct siw_qp *qp, int error)
1204 {
1205 	struct siw_rx_stream *srx = &qp->rx_stream;
1206 	struct siw_wqe *wqe = rx_wqe(qp->rx_fpdu);
1207 	enum siw_wc_status wc_status = wqe->wc_status;
1208 	u8 opcode = __rdmap_get_opcode(&srx->hdr.ctrl);
1209 	int rv = 0;
1210 
1211 	switch (opcode) {
1212 	case RDMAP_SEND_SE:
1213 	case RDMAP_SEND_SE_INVAL:
1214 		wqe->rqe.flags |= SIW_WQE_SOLICITED;
1215 		/* Fall through */
1216 
1217 	case RDMAP_SEND:
1218 	case RDMAP_SEND_INVAL:
1219 		if (wqe->wr_status == SIW_WR_IDLE)
1220 			break;
1221 
1222 		srx->ddp_msn[RDMAP_UNTAGGED_QN_SEND]++;
1223 
1224 		if (error != 0 && wc_status == SIW_WC_SUCCESS)
1225 			wc_status = SIW_WC_GENERAL_ERR;
1226 		/*
1227 		 * Handle STag invalidation request
1228 		 */
1229 		if (wc_status == SIW_WC_SUCCESS &&
1230 		    (opcode == RDMAP_SEND_INVAL ||
1231 		     opcode == RDMAP_SEND_SE_INVAL)) {
1232 			rv = siw_invalidate_stag(qp->pd, srx->inval_stag);
1233 			if (rv) {
1234 				siw_init_terminate(
1235 					qp, TERM_ERROR_LAYER_RDMAP,
1236 					rv == -EACCES ?
1237 						RDMAP_ETYPE_REMOTE_PROTECTION :
1238 						RDMAP_ETYPE_REMOTE_OPERATION,
1239 					RDMAP_ECODE_CANNOT_INVALIDATE, 0);
1240 
1241 				wc_status = SIW_WC_REM_INV_REQ_ERR;
1242 			}
1243 			rv = siw_rqe_complete(qp, &wqe->rqe, wqe->processed,
1244 					      rv ? 0 : srx->inval_stag,
1245 					      wc_status);
1246 		} else {
1247 			rv = siw_rqe_complete(qp, &wqe->rqe, wqe->processed,
1248 					      0, wc_status);
1249 		}
1250 		siw_wqe_put_mem(wqe, SIW_OP_RECEIVE);
1251 		break;
1252 
1253 	case RDMAP_RDMA_READ_RESP:
1254 		if (wqe->wr_status == SIW_WR_IDLE)
1255 			break;
1256 
1257 		if (error != 0) {
1258 			if ((srx->state == SIW_GET_HDR &&
1259 			     qp->rx_fpdu->first_ddp_seg) || error == -ENODATA)
1260 				/* possible RREQ in ORQ left untouched */
1261 				break;
1262 
1263 			if (wc_status == SIW_WC_SUCCESS)
1264 				wc_status = SIW_WC_GENERAL_ERR;
1265 		} else if (qp->kernel_verbs &&
1266 			   rx_type(wqe) == SIW_OP_READ_LOCAL_INV) {
1267 			/*
1268 			 * Handle any STag invalidation request
1269 			 */
1270 			rv = siw_invalidate_stag(qp->pd, wqe->sqe.sge[0].lkey);
1271 			if (rv) {
1272 				siw_init_terminate(qp, TERM_ERROR_LAYER_RDMAP,
1273 						   RDMAP_ETYPE_CATASTROPHIC,
1274 						   RDMAP_ECODE_UNSPECIFIED, 0);
1275 
1276 				if (wc_status == SIW_WC_SUCCESS) {
1277 					wc_status = SIW_WC_GENERAL_ERR;
1278 					error = rv;
1279 				}
1280 			}
1281 		}
1282 		/*
1283 		 * All errors turn the wqe into signalled.
1284 		 */
1285 		if ((wqe->sqe.flags & SIW_WQE_SIGNALLED) || error != 0)
1286 			rv = siw_sqe_complete(qp, &wqe->sqe, wqe->processed,
1287 					      wc_status);
1288 		siw_wqe_put_mem(wqe, SIW_OP_READ);
1289 
1290 		if (!error)
1291 			rv = siw_check_tx_fence(qp);
1292 		else
1293 			/* Disable current ORQ eleement */
1294 			WRITE_ONCE(orq_get_current(qp)->flags, 0);
1295 		break;
1296 
1297 	case RDMAP_RDMA_READ_REQ:
1298 		if (!error) {
1299 			rv = siw_init_rresp(qp, srx);
1300 			srx->ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ]++;
1301 		}
1302 		break;
1303 
1304 	case RDMAP_RDMA_WRITE:
1305 		if (wqe->wr_status == SIW_WR_IDLE)
1306 			break;
1307 
1308 		/*
1309 		 * Free References from memory object if
1310 		 * attached to receive context (inbound WRITE).
1311 		 * While a zero-length WRITE is allowed,
1312 		 * no memory reference got created.
1313 		 */
1314 		if (rx_mem(&qp->rx_tagged)) {
1315 			siw_mem_put(rx_mem(&qp->rx_tagged));
1316 			rx_mem(&qp->rx_tagged) = NULL;
1317 		}
1318 		break;
1319 
1320 	default:
1321 		break;
1322 	}
1323 	wqe->wr_status = SIW_WR_IDLE;
1324 
1325 	return rv;
1326 }
1327 
1328 /*
1329  * siw_tcp_rx_data()
1330  *
1331  * Main routine to consume inbound TCP payload
1332  *
1333  * @rd_desc:	read descriptor
1334  * @skb:	socket buffer
1335  * @off:	offset in skb
1336  * @len:	skb->len - offset : payload in skb
1337  */
1338 int siw_tcp_rx_data(read_descriptor_t *rd_desc, struct sk_buff *skb,
1339 		    unsigned int off, size_t len)
1340 {
1341 	struct siw_qp *qp = rd_desc->arg.data;
1342 	struct siw_rx_stream *srx = &qp->rx_stream;
1343 	int rv;
1344 
1345 	srx->skb = skb;
1346 	srx->skb_new = skb->len - off;
1347 	srx->skb_offset = off;
1348 	srx->skb_copied = 0;
1349 
1350 	siw_dbg_qp(qp, "new data, len %d\n", srx->skb_new);
1351 
1352 	while (srx->skb_new) {
1353 		int run_completion = 1;
1354 
1355 		if (unlikely(srx->rx_suspend)) {
1356 			/* Do not process any more data */
1357 			srx->skb_copied += srx->skb_new;
1358 			break;
1359 		}
1360 		switch (srx->state) {
1361 		case SIW_GET_HDR:
1362 			rv = siw_get_hdr(srx);
1363 			if (!rv) {
1364 				srx->fpdu_part_rem =
1365 					be16_to_cpu(srx->hdr.ctrl.mpa_len) -
1366 					srx->fpdu_part_rcvd + MPA_HDR_SIZE;
1367 
1368 				if (srx->fpdu_part_rem)
1369 					srx->pad = -srx->fpdu_part_rem & 0x3;
1370 				else
1371 					srx->pad = 0;
1372 
1373 				srx->state = SIW_GET_DATA_START;
1374 				srx->fpdu_part_rcvd = 0;
1375 			}
1376 			break;
1377 
1378 		case SIW_GET_DATA_MORE:
1379 			/*
1380 			 * Another data fragment of the same DDP segment.
1381 			 * Setting first_ddp_seg = 0 avoids repeating
1382 			 * initializations that shall occur only once per
1383 			 * DDP segment.
1384 			 */
1385 			qp->rx_fpdu->first_ddp_seg = 0;
1386 			/* Fall through */
1387 
1388 		case SIW_GET_DATA_START:
1389 			/*
1390 			 * Headers will be checked by the opcode-specific
1391 			 * data receive function below.
1392 			 */
1393 			rv = iwarp_pktinfo[qp->rx_stream.rdmap_op].rx_data(qp);
1394 			if (!rv) {
1395 				int mpa_len =
1396 					be16_to_cpu(srx->hdr.ctrl.mpa_len)
1397 					+ MPA_HDR_SIZE;
1398 
1399 				srx->fpdu_part_rem = (-mpa_len & 0x3)
1400 						      + MPA_CRC_SIZE;
1401 				srx->fpdu_part_rcvd = 0;
1402 				srx->state = SIW_GET_TRAILER;
1403 			} else {
1404 				if (unlikely(rv == -ECONNRESET))
1405 					run_completion = 0;
1406 				else
1407 					srx->state = SIW_GET_DATA_MORE;
1408 			}
1409 			break;
1410 
1411 		case SIW_GET_TRAILER:
1412 			/*
1413 			 * read CRC + any padding
1414 			 */
1415 			rv = siw_get_trailer(qp, srx);
1416 			if (likely(!rv)) {
1417 				/*
1418 				 * FPDU completed.
1419 				 * complete RDMAP message if last fragment
1420 				 */
1421 				srx->state = SIW_GET_HDR;
1422 				srx->fpdu_part_rcvd = 0;
1423 
1424 				if (!(srx->hdr.ctrl.ddp_rdmap_ctrl &
1425 				      DDP_FLAG_LAST))
1426 					/* more frags */
1427 					break;
1428 
1429 				rv = siw_rdmap_complete(qp, 0);
1430 				run_completion = 0;
1431 			}
1432 			break;
1433 
1434 		default:
1435 			pr_warn("QP[%u]: RX out of state\n", qp_id(qp));
1436 			rv = -EPROTO;
1437 			run_completion = 0;
1438 		}
1439 		if (unlikely(rv != 0 && rv != -EAGAIN)) {
1440 			if ((srx->state > SIW_GET_HDR ||
1441 			     qp->rx_fpdu->more_ddp_segs) && run_completion)
1442 				siw_rdmap_complete(qp, rv);
1443 
1444 			siw_dbg_qp(qp, "rx error %d, rx state %d\n", rv,
1445 				   srx->state);
1446 
1447 			siw_qp_cm_drop(qp, 1);
1448 
1449 			break;
1450 		}
1451 		if (rv) {
1452 			siw_dbg_qp(qp, "fpdu fragment, state %d, missing %d\n",
1453 				   srx->state, srx->fpdu_part_rem);
1454 			break;
1455 		}
1456 	}
1457 	return srx->skb_copied;
1458 }
1459