1 // SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
2 
3 /* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
4 /* Copyright (c) 2008-2019, IBM Corporation */
5 
6 #include <linux/errno.h>
7 #include <linux/types.h>
8 #include <linux/net.h>
9 #include <linux/scatterlist.h>
10 #include <linux/highmem.h>
11 
12 #include <rdma/iw_cm.h>
13 #include <rdma/ib_verbs.h>
14 
15 #include "siw.h"
16 #include "siw_verbs.h"
17 #include "siw_mem.h"
18 
19 /*
20  * siw_rx_umem()
21  *
22  * Receive data of @len into target referenced by @dest_addr.
23  *
24  * @srx:	Receive Context
25  * @umem:	siw representation of target memory
26  * @dest_addr:	user virtual address
27  * @len:	number of bytes to place
28  */
29 static int siw_rx_umem(struct siw_rx_stream *srx, struct siw_umem *umem,
30 		       u64 dest_addr, int len)
31 {
32 	int copied = 0;
33 
34 	while (len) {
35 		struct page *p;
36 		int pg_off, bytes, rv;
37 		void *dest;
38 
39 		p = siw_get_upage(umem, dest_addr);
40 		if (unlikely(!p)) {
41 			pr_warn("siw: %s: [QP %u]: bogus addr: %pK, %pK\n",
42 				__func__, qp_id(rx_qp(srx)),
43 				(void *)(uintptr_t)dest_addr,
44 				(void *)(uintptr_t)umem->fp_addr);
45 			/* siw internal error */
46 			srx->skb_copied += copied;
47 			srx->skb_new -= copied;
48 
49 			return -EFAULT;
50 		}
51 		pg_off = dest_addr & ~PAGE_MASK;
52 		bytes = min(len, (int)PAGE_SIZE - pg_off);
53 
54 		siw_dbg_qp(rx_qp(srx), "page %pK, bytes=%u\n", p, bytes);
55 
56 		dest = kmap_atomic(p);
57 		rv = skb_copy_bits(srx->skb, srx->skb_offset, dest + pg_off,
58 				   bytes);
59 
60 		if (unlikely(rv)) {
61 			kunmap_atomic(dest);
62 			srx->skb_copied += copied;
63 			srx->skb_new -= copied;
64 
65 			pr_warn("siw: [QP %u]: %s, len %d, page %p, rv %d\n",
66 				qp_id(rx_qp(srx)), __func__, len, p, rv);
67 
68 			return -EFAULT;
69 		}
70 		if (srx->mpa_crc_hd) {
71 			if (rx_qp(srx)->kernel_verbs) {
72 				crypto_shash_update(srx->mpa_crc_hd,
73 					(u8 *)(dest + pg_off), bytes);
74 				kunmap_atomic(dest);
75 			} else {
76 				kunmap_atomic(dest);
77 				/*
78 				 * Do CRC on original, not target buffer.
79 				 * Some user land applications may
80 				 * concurrently write the target buffer,
81 				 * which would yield a broken CRC.
82 				 * Walking the skb twice is very inefficient.
83 				 * Folding the CRC into skb_copy_bits()
84 				 * would be much better, but is currently
85 				 * not supported.
86 				 */
87 				siw_crc_skb(srx, bytes);
88 			}
89 		} else {
90 			kunmap_atomic(dest);
91 		}
92 		srx->skb_offset += bytes;
93 		copied += bytes;
94 		len -= bytes;
95 		dest_addr += bytes;
96 		pg_off = 0;
97 	}
98 	srx->skb_copied += copied;
99 	srx->skb_new -= copied;
100 
101 	return copied;
102 }
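
/*
 * Worked example of the chunking above (illustrative only, assuming
 * 4 KiB pages): placing len = 6000 bytes at a dest_addr whose page
 * offset is 0xf00 proceeds exactly as the loop computes it:
 *
 *   iteration 1: pg_off = 0xf00, bytes = min(6000, 4096 - 3840) =  256
 *   iteration 2: pg_off = 0,     bytes = min(5744, 4096)        = 4096
 *   iteration 3: pg_off = 0,     bytes = min(1648, 4096)        = 1648
 *
 * Each iteration maps one target page via siw_get_upage()/kmap_atomic()
 * and pulls its chunk out of the socket buffer with skb_copy_bits().
 */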
103 
104 static int siw_rx_kva(struct siw_rx_stream *srx, void *kva, int len)
105 {
106 	int rv;
107 
108 	siw_dbg_qp(rx_qp(srx), "kva: 0x%pK, len: %u\n", kva, len);
109 
110 	rv = skb_copy_bits(srx->skb, srx->skb_offset, kva, len);
111 	if (unlikely(rv)) {
112 		pr_warn("siw: [QP %u]: %s, len %d, kva 0x%pK, rv %d\n",
113 			qp_id(rx_qp(srx)), __func__, len, kva, rv);
114 
115 		return rv;
116 	}
117 	if (srx->mpa_crc_hd)
118 		crypto_shash_update(srx->mpa_crc_hd, (u8 *)kva, len);
119 
120 	srx->skb_offset += len;
121 	srx->skb_copied += len;
122 	srx->skb_new -= len;
123 
124 	return len;
125 }
126 
127 static int siw_rx_pbl(struct siw_rx_stream *srx, int *pbl_idx,
128 		      struct siw_mem *mem, u64 addr, int len)
129 {
130 	struct siw_pbl *pbl = mem->pbl;
131 	u64 offset = addr - mem->va;
132 	int copied = 0;
133 
134 	while (len) {
135 		int bytes;
136 		dma_addr_t buf_addr =
137 			siw_pbl_get_buffer(pbl, offset, &bytes, pbl_idx);
138 		if (!buf_addr)
139 			break;
140 
141 		bytes = min(bytes, len);
142 		if (siw_rx_kva(srx, (void *)buf_addr, bytes) == bytes) {
143 			copied += bytes;
144 			offset += bytes;
145 			len -= bytes;
146 		} else {
147 			break;
148 		}
149 	}
150 	return copied;
151 }
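
/*
 * Illustrative sketch (chunk sizes assumed): for a PBL-backed MR the
 * target is a list of physically contiguous buffers. siw_pbl_get_buffer()
 * translates the offset (addr - mem->va) into the address of the chunk
 * holding that offset and, presumably, how many contiguous bytes remain
 * in it, so a 10000-byte placement into two 8192-byte chunks would be
 * delivered as two siw_rx_kva() calls of 8192 and 1808 bytes.
 */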
152 
153 /*
154  * siw_rresp_check_ntoh()
155  *
156  * Check incoming RRESP fragment header against expected
157  * header values and update expected values for potential next
158  * fragment.
159  *
160  * NOTE: This function must be called only if an RRESP DDP segment
161  *       starts but not for fragmented consecutive pieces of an
162  *       already started DDP segment.
163  */
164 static int siw_rresp_check_ntoh(struct siw_rx_stream *srx,
165 				struct siw_rx_fpdu *frx)
166 {
167 	struct iwarp_rdma_rresp *rresp = &srx->hdr.rresp;
168 	struct siw_wqe *wqe = &frx->wqe_active;
169 	enum ddp_ecode ecode;
170 
171 	u32 sink_stag = be32_to_cpu(rresp->sink_stag);
172 	u64 sink_to = be64_to_cpu(rresp->sink_to);
173 
174 	if (frx->first_ddp_seg) {
175 		srx->ddp_stag = wqe->sqe.sge[0].lkey;
176 		srx->ddp_to = wqe->sqe.sge[0].laddr;
177 		frx->pbl_idx = 0;
178 	}
179 	/* Below checks extend beyond the semantics of DDP, and
180 	 * into RDMAP:
181 	 * We check if the read response matches exactly the
182 	 * read request which was sent to the remote peer to
183 	 * trigger this read response. RFC5040/5041 do not
184 	 * always have a proper error code for the detected
185 	 * error cases. We choose 'base or bounds error' for
186 	 * cases where the inbound STag is valid, but offset
187 	 * or length do not match our response receive state.
188 	 */
189 	if (unlikely(srx->ddp_stag != sink_stag)) {
190 		pr_warn("siw: [QP %u]: rresp stag: %08x != %08x\n",
191 			qp_id(rx_qp(srx)), sink_stag, srx->ddp_stag);
192 		ecode = DDP_ECODE_T_INVALID_STAG;
193 		goto error;
194 	}
195 	if (unlikely(srx->ddp_to != sink_to)) {
196 		pr_warn("siw: [QP %u]: rresp off: %016llx != %016llx\n",
197 			qp_id(rx_qp(srx)), (unsigned long long)sink_to,
198 			(unsigned long long)srx->ddp_to);
199 		ecode = DDP_ECODE_T_BASE_BOUNDS;
200 		goto error;
201 	}
202 	if (unlikely(!frx->more_ddp_segs &&
203 		     (wqe->processed + srx->fpdu_part_rem != wqe->bytes))) {
204 		pr_warn("siw: [QP %u]: rresp len: %d != %d\n",
205 			qp_id(rx_qp(srx)),
206 			wqe->processed + srx->fpdu_part_rem, wqe->bytes);
207 		ecode = DDP_ECODE_T_BASE_BOUNDS;
208 		goto error;
209 	}
210 	return 0;
211 error:
212 	siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_DDP,
213 			   DDP_ETYPE_TAGGED_BUF, ecode, 0);
214 	return -EINVAL;
215 }
216 
217 /*
218  * siw_write_check_ntoh()
219  *
220  * Check incoming WRITE fragment header against expected
221  * header values and update expected values for potential next
222  * fragment
223  *
224  * NOTE: This function must be called only if a WRITE DDP segment
225  *       starts but not for fragmented consecutive pieces of an
226  *       already started DDP segment.
227  */
228 static int siw_write_check_ntoh(struct siw_rx_stream *srx,
229 				struct siw_rx_fpdu *frx)
230 {
231 	struct iwarp_rdma_write *write = &srx->hdr.rwrite;
232 	enum ddp_ecode ecode;
233 
234 	u32 sink_stag = be32_to_cpu(write->sink_stag);
235 	u64 sink_to = be64_to_cpu(write->sink_to);
236 
237 	if (frx->first_ddp_seg) {
238 		srx->ddp_stag = sink_stag;
239 		srx->ddp_to = sink_to;
240 		frx->pbl_idx = 0;
241 	} else {
242 		if (unlikely(srx->ddp_stag != sink_stag)) {
243 			pr_warn("siw: [QP %u]: write stag: %08x != %08x\n",
244 				qp_id(rx_qp(srx)), sink_stag,
245 				srx->ddp_stag);
246 			ecode = DDP_ECODE_T_INVALID_STAG;
247 			goto error;
248 		}
249 		if (unlikely(srx->ddp_to != sink_to)) {
250 			pr_warn("siw: [QP %u]: write off: %016llx != %016llx\n",
251 				qp_id(rx_qp(srx)),
252 				(unsigned long long)sink_to,
253 				(unsigned long long)srx->ddp_to);
254 			ecode = DDP_ECODE_T_BASE_BOUNDS;
255 			goto error;
256 		}
257 	}
258 	return 0;
259 error:
260 	siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_DDP,
261 			   DDP_ETYPE_TAGGED_BUF, ecode, 0);
262 	return -EINVAL;
263 }
264 
265 /*
266  * siw_send_check_ntoh()
267  *
268  * Check incoming SEND fragment header against expected
269  * header values and update expected MSN if no next
270  * fragment expected
271  *
272  * NOTE: This function must be called only if a SEND DDP segment
273  *       starts but not for fragmented consecutive pieces of an
274  *       already started DDP segment.
275  */
276 static int siw_send_check_ntoh(struct siw_rx_stream *srx,
277 			       struct siw_rx_fpdu *frx)
278 {
279 	struct iwarp_send_inv *send = &srx->hdr.send_inv;
280 	struct siw_wqe *wqe = &frx->wqe_active;
281 	enum ddp_ecode ecode;
282 
283 	u32 ddp_msn = be32_to_cpu(send->ddp_msn);
284 	u32 ddp_mo = be32_to_cpu(send->ddp_mo);
285 	u32 ddp_qn = be32_to_cpu(send->ddp_qn);
286 
287 	if (unlikely(ddp_qn != RDMAP_UNTAGGED_QN_SEND)) {
288 		pr_warn("siw: [QP %u]: invalid ddp qn %d for send\n",
289 			qp_id(rx_qp(srx)), ddp_qn);
290 		ecode = DDP_ECODE_UT_INVALID_QN;
291 		goto error;
292 	}
293 	if (unlikely(ddp_msn != srx->ddp_msn[RDMAP_UNTAGGED_QN_SEND])) {
294 		pr_warn("siw: [QP %u]: send msn: %u != %u\n",
295 			qp_id(rx_qp(srx)), ddp_msn,
296 			srx->ddp_msn[RDMAP_UNTAGGED_QN_SEND]);
297 		ecode = DDP_ECODE_UT_INVALID_MSN_RANGE;
298 		goto error;
299 	}
300 	if (unlikely(ddp_mo != wqe->processed)) {
301 		pr_warn("siw: [QP %u], send mo: %u != %u\n",
302 			qp_id(rx_qp(srx)), ddp_mo, wqe->processed);
303 		ecode = DDP_ECODE_UT_INVALID_MO;
304 		goto error;
305 	}
306 	if (frx->first_ddp_seg) {
307 		/* initialize user memory write position */
308 		frx->sge_idx = 0;
309 		frx->sge_off = 0;
310 		frx->pbl_idx = 0;
311 
312 		/* only valid for SEND_INV and SEND_SE_INV operations */
313 		srx->inval_stag = be32_to_cpu(send->inval_stag);
314 	}
315 	if (unlikely(wqe->bytes < wqe->processed + srx->fpdu_part_rem)) {
316 		siw_dbg_qp(rx_qp(srx), "receive space short: %d - %d < %d\n",
317 			   wqe->bytes, wqe->processed, srx->fpdu_part_rem);
318 		wqe->wc_status = SIW_WC_LOC_LEN_ERR;
319 		ecode = DDP_ECODE_UT_INVALID_MSN_NOBUF;
320 		goto error;
321 	}
322 	return 0;
323 error:
324 	siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_DDP,
325 			   DDP_ETYPE_UNTAGGED_BUF, ecode, 0);
326 	return -EINVAL;
327 }
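
/*
 * Worked example with assumed numbers: a 3000-byte SEND split into two
 * DDP segments of 1500 bytes each arrives as
 *
 *   segment 1: ddp_qn = RDMAP_UNTAGGED_QN_SEND, ddp_msn = 7, ddp_mo = 0
 *   segment 2: ddp_qn = RDMAP_UNTAGGED_QN_SEND, ddp_msn = 7, ddp_mo = 1500
 *
 * The checks above pass as long as the peer's MSN matches the locally
 * expected ddp_msn[RDMAP_UNTAGGED_QN_SEND] (here assumed to be 7) and
 * the message offset equals the bytes already placed (wqe->processed).
 * The expected MSN itself only advances in siw_rdmap_complete() once
 * the whole message is finished.
 */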
328 
329 static struct siw_wqe *siw_rqe_get(struct siw_qp *qp)
330 {
331 	struct siw_rqe *rqe;
332 	struct siw_srq *srq;
333 	struct siw_wqe *wqe = NULL;
334 	bool srq_event = false;
335 	unsigned long flags;
336 
337 	srq = qp->srq;
338 	if (srq) {
339 		spin_lock_irqsave(&srq->lock, flags);
340 		if (unlikely(!srq->num_rqe))
341 			goto out;
342 
343 		rqe = &srq->recvq[srq->rq_get % srq->num_rqe];
344 	} else {
345 		if (unlikely(!qp->recvq))
346 			goto out;
347 
348 		rqe = &qp->recvq[qp->rq_get % qp->attrs.rq_size];
349 	}
350 	if (likely(rqe->flags == SIW_WQE_VALID)) {
351 		int num_sge = rqe->num_sge;
352 
353 		if (likely(num_sge <= SIW_MAX_SGE)) {
354 			int i = 0;
355 
356 			wqe = rx_wqe(&qp->rx_untagged);
357 			rx_type(wqe) = SIW_OP_RECEIVE;
358 			wqe->wr_status = SIW_WR_INPROGRESS;
359 			wqe->bytes = 0;
360 			wqe->processed = 0;
361 
362 			wqe->rqe.id = rqe->id;
363 			wqe->rqe.num_sge = num_sge;
364 
365 			while (i < num_sge) {
366 				wqe->rqe.sge[i].laddr = rqe->sge[i].laddr;
367 				wqe->rqe.sge[i].lkey = rqe->sge[i].lkey;
368 				wqe->rqe.sge[i].length = rqe->sge[i].length;
369 				wqe->bytes += wqe->rqe.sge[i].length;
370 				wqe->mem[i] = NULL;
371 				i++;
372 			}
373 			/* can be re-used by appl */
374 			smp_store_mb(rqe->flags, 0);
375 		} else {
376 			siw_dbg_qp(qp, "too many sge's: %d\n", rqe->num_sge);
377 			if (srq)
378 				spin_unlock_irqrestore(&srq->lock, flags);
379 			return NULL;
380 		}
381 		if (!srq) {
382 			qp->rq_get++;
383 		} else {
384 			if (srq->armed) {
385 				/* Test SRQ limit */
386 				u32 off = (srq->rq_get + srq->limit) %
387 					  srq->num_rqe;
388 				struct siw_rqe *rqe2 = &srq->recvq[off];
389 
390 				if (!(rqe2->flags & SIW_WQE_VALID)) {
391 					srq->armed = 0;
392 					srq_event = true;
393 				}
394 			}
395 			srq->rq_get++;
396 		}
397 	}
398 out:
399 	if (srq) {
400 		spin_unlock_irqrestore(&srq->lock, flags);
401 		if (srq_event)
402 			siw_srq_event(srq, IB_EVENT_SRQ_LIMIT_REACHED);
403 	}
404 	return wqe;
405 }
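
/*
 * Illustrative example of the SRQ limit test above (numbers assumed):
 * with num_rqe = 64, rq_get = 100 and limit = 16, the code peeks at
 * recvq[(100 + 16) % 64] = recvq[52]. If that slot is not (or no
 * longer) SIW_WQE_VALID, fewer than 16 receives remain posted, so the
 * SRQ is disarmed and IB_EVENT_SRQ_LIMIT_REACHED is delivered after
 * the lock has been dropped.
 */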
406 
407 /*
408  * siw_proc_send:
409  *
410  * Process one incoming SEND and place data into memory referenced by
411  * receive wqe.
412  *
413  * Function supports partially received sends (suspending/resuming
414  * current receive wqe processing)
415  *
416  * return value:
417  *	0:       reached the end of a DDP segment
418  *	-EAGAIN: to be called again to finish the DDP segment
419  */
420 int siw_proc_send(struct siw_qp *qp)
421 {
422 	struct siw_rx_stream *srx = &qp->rx_stream;
423 	struct siw_rx_fpdu *frx = &qp->rx_untagged;
424 	struct siw_wqe *wqe;
425 	u32 data_bytes; /* all data bytes available */
426 	u32 rcvd_bytes; /* sum of data bytes rcvd */
427 	int rv = 0;
428 
429 	if (frx->first_ddp_seg) {
430 		wqe = siw_rqe_get(qp);
431 		if (unlikely(!wqe)) {
432 			siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
433 					   DDP_ETYPE_UNTAGGED_BUF,
434 					   DDP_ECODE_UT_INVALID_MSN_NOBUF, 0);
435 			return -ENOENT;
436 		}
437 	} else {
438 		wqe = rx_wqe(frx);
439 	}
440 	if (srx->state == SIW_GET_DATA_START) {
441 		rv = siw_send_check_ntoh(srx, frx);
442 		if (unlikely(rv)) {
443 			siw_qp_event(qp, IB_EVENT_QP_FATAL);
444 			return rv;
445 		}
446 		if (!srx->fpdu_part_rem) /* zero length SEND */
447 			return 0;
448 	}
449 	data_bytes = min(srx->fpdu_part_rem, srx->skb_new);
450 	rcvd_bytes = 0;
451 
452 	/* A zero length SEND will skip below loop */
453 	while (data_bytes) {
454 		struct ib_pd *pd;
455 		struct siw_mem **mem, *mem_p;
456 		struct siw_sge *sge;
457 		u32 sge_bytes; /* data bytes avail for SGE */
458 
459 		sge = &wqe->rqe.sge[frx->sge_idx];
460 
461 		if (!sge->length) {
462 			/* just skip empty sge's */
463 			frx->sge_idx++;
464 			frx->sge_off = 0;
465 			frx->pbl_idx = 0;
466 			continue;
467 		}
468 		sge_bytes = min(data_bytes, sge->length - frx->sge_off);
469 		mem = &wqe->mem[frx->sge_idx];
470 
471 		/*
472 		 * check with QP's PD if no SRQ present, SRQ's PD otherwise
473 		 */
474 		pd = qp->srq == NULL ? qp->pd : qp->srq->base_srq.pd;
475 
476 		rv = siw_check_sge(pd, sge, mem, IB_ACCESS_LOCAL_WRITE,
477 				   frx->sge_off, sge_bytes);
478 		if (unlikely(rv)) {
479 			siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
480 					   DDP_ETYPE_CATASTROPHIC,
481 					   DDP_ECODE_CATASTROPHIC, 0);
482 
483 			siw_qp_event(qp, IB_EVENT_QP_ACCESS_ERR);
484 			break;
485 		}
486 		mem_p = *mem;
487 		if (mem_p->mem_obj == NULL)
488 			rv = siw_rx_kva(srx,
489 				(void *)(uintptr_t)(sge->laddr + frx->sge_off),
490 				sge_bytes);
491 		else if (!mem_p->is_pbl)
492 			rv = siw_rx_umem(srx, mem_p->umem,
493 					 sge->laddr + frx->sge_off, sge_bytes);
494 		else
495 			rv = siw_rx_pbl(srx, &frx->pbl_idx, mem_p,
496 					sge->laddr + frx->sge_off, sge_bytes);
497 
498 		if (unlikely(rv != sge_bytes)) {
499 			wqe->processed += rcvd_bytes;
500 
501 			siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
502 					   DDP_ETYPE_CATASTROPHIC,
503 					   DDP_ECODE_CATASTROPHIC, 0);
504 			return -EINVAL;
505 		}
506 		frx->sge_off += rv;
507 
508 		if (frx->sge_off == sge->length) {
509 			frx->sge_idx++;
510 			frx->sge_off = 0;
511 			frx->pbl_idx = 0;
512 		}
513 		data_bytes -= rv;
514 		rcvd_bytes += rv;
515 
516 		srx->fpdu_part_rem -= rv;
517 		srx->fpdu_part_rcvd += rv;
518 	}
519 	wqe->processed += rcvd_bytes;
520 
521 	if (!srx->fpdu_part_rem)
522 		return 0;
523 
524 	return (rv < 0) ? rv : -EAGAIN;
525 }
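
/*
 * Illustrative walk of the SGE loop above (all numbers assumed): a
 * receive WQE with two SGEs of 1024 and 4096 bytes absorbing a
 * 3000-byte SEND segment of which only 2048 bytes are currently in the
 * skb (data_bytes = min(3000, 2048) = 2048):
 *
 *   pass 1: sge[0], sge_bytes = min(2048, 1024 - 0) = 1024 -> sge_idx++
 *   pass 2: sge[1], sge_bytes = min(1024, 4096 - 0) = 1024
 *
 * With fpdu_part_rem still non-zero, -EAGAIN is returned and processing
 * resumes at sge[1], sge_off = 1024 once more skb data arrives.
 */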
526 
527 /*
528  * siw_proc_write:
529  *
530  * Place incoming WRITE after referencing and checking target buffer
531  *
532  * Function supports partially received WRITEs (suspending/resuming
533  * current receive processing)
534  *
535  * return value:
536  *	0:       reached the end of a DDP segment
537  *	-EAGAIN: to be called again to finish the DDP segment
538  */
539 int siw_proc_write(struct siw_qp *qp)
540 {
541 	struct siw_rx_stream *srx = &qp->rx_stream;
542 	struct siw_rx_fpdu *frx = &qp->rx_tagged;
543 	struct siw_mem *mem;
544 	int bytes, rv;
545 
546 	if (srx->state == SIW_GET_DATA_START) {
547 		if (!srx->fpdu_part_rem) /* zero length WRITE */
548 			return 0;
549 
550 		rv = siw_write_check_ntoh(srx, frx);
551 		if (unlikely(rv)) {
552 			siw_qp_event(qp, IB_EVENT_QP_FATAL);
553 			return rv;
554 		}
555 	}
556 	bytes = min(srx->fpdu_part_rem, srx->skb_new);
557 
558 	if (frx->first_ddp_seg) {
559 		struct siw_wqe *wqe = rx_wqe(frx);
560 
561 		rx_mem(frx) = siw_mem_id2obj(qp->sdev, srx->ddp_stag >> 8);
562 		if (unlikely(!rx_mem(frx))) {
563 			siw_dbg_qp(qp,
564 				   "sink stag not found/invalid, stag 0x%08x\n",
565 				   srx->ddp_stag);
566 
567 			siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
568 					   DDP_ETYPE_TAGGED_BUF,
569 					   DDP_ECODE_T_INVALID_STAG, 0);
570 			return -EINVAL;
571 		}
572 		wqe->rqe.num_sge = 1;
573 		rx_type(wqe) = SIW_OP_WRITE;
574 		wqe->wr_status = SIW_WR_INPROGRESS;
575 	}
576 	mem = rx_mem(frx);
577 
578 	/*
579 	 * Check if application re-registered memory with different
580 	 * key field of STag.
581 	 */
582 	if (unlikely(mem->stag != srx->ddp_stag)) {
583 		siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
584 				   DDP_ETYPE_TAGGED_BUF,
585 				   DDP_ECODE_T_INVALID_STAG, 0);
586 		return -EINVAL;
587 	}
588 	rv = siw_check_mem(qp->pd, mem, srx->ddp_to + srx->fpdu_part_rcvd,
589 			   IB_ACCESS_REMOTE_WRITE, bytes);
590 	if (unlikely(rv)) {
591 		siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
592 				   DDP_ETYPE_TAGGED_BUF, siw_tagged_error(-rv),
593 				   0);
594 
595 		siw_qp_event(qp, IB_EVENT_QP_ACCESS_ERR);
596 
597 		return -EINVAL;
598 	}
599 
600 	if (mem->mem_obj == NULL)
601 		rv = siw_rx_kva(srx,
602 			(void *)(uintptr_t)(srx->ddp_to + srx->fpdu_part_rcvd),
603 			bytes);
604 	else if (!mem->is_pbl)
605 		rv = siw_rx_umem(srx, mem->umem,
606 				 srx->ddp_to + srx->fpdu_part_rcvd, bytes);
607 	else
608 		rv = siw_rx_pbl(srx, &frx->pbl_idx, mem,
609 				srx->ddp_to + srx->fpdu_part_rcvd, bytes);
610 
611 	if (unlikely(rv != bytes)) {
612 		siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
613 				   DDP_ETYPE_CATASTROPHIC,
614 				   DDP_ECODE_CATASTROPHIC, 0);
615 		return -EINVAL;
616 	}
617 	srx->fpdu_part_rem -= rv;
618 	srx->fpdu_part_rcvd += rv;
619 
620 	if (!srx->fpdu_part_rem) {
621 		srx->ddp_to += srx->fpdu_part_rcvd;
622 		return 0;
623 	}
624 	return -EAGAIN;
625 }
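
/*
 * Illustrative note on the tagged placement above (sizes assumed): the
 * memory object is looked up once per inbound WRITE by the STag index
 * (srx->ddp_stag >> 8, i.e. the STag without its 8-bit key) and kept in
 * rx_mem() until siw_rdmap_complete(). Every fragment of a segment is
 * placed at ddp_to + fpdu_part_rcvd, so a 2000-byte segment arriving in
 * skb pieces of 1200 and 800 bytes is written at offsets 0 and 1200 of
 * the advertised sink address before ddp_to advances to the next
 * segment.
 */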
626 
627 /*
628  * Inbound RREQs cannot carry user data.
629  */
630 int siw_proc_rreq(struct siw_qp *qp)
631 {
632 	struct siw_rx_stream *srx = &qp->rx_stream;
633 
634 	if (!srx->fpdu_part_rem)
635 		return 0;
636 
637 	pr_warn("siw: [QP %u]: rreq with mpa len %d\n", qp_id(qp),
638 		be16_to_cpu(srx->hdr.ctrl.mpa_len));
639 
640 	return -EPROTO;
641 }
642 
643 /*
644  * siw_init_rresp:
645  *
646  * Process inbound RDMA READ REQ. Produce a pseudo READ RESPONSE WQE.
647  * Put it at the tail of the IRQ, if there is another WQE currently in
648  * transmit processing. If not, make it the current WQE to be processed
649  * and schedule transmit processing.
650  *
651  * Can be called from softirq context and from process
652  * context (RREAD socket loopback case!)
653  *
654  * return value:
655  *	0:      success,
656  *		failure code otherwise
657  */
658 
659 static int siw_init_rresp(struct siw_qp *qp, struct siw_rx_stream *srx)
660 {
661 	struct siw_wqe *tx_work = tx_wqe(qp);
662 	struct siw_sqe *resp;
663 
664 	uint64_t raddr = be64_to_cpu(srx->hdr.rreq.sink_to),
665 		 laddr = be64_to_cpu(srx->hdr.rreq.source_to);
666 	uint32_t length = be32_to_cpu(srx->hdr.rreq.read_size),
667 		 lkey = be32_to_cpu(srx->hdr.rreq.source_stag),
668 		 rkey = be32_to_cpu(srx->hdr.rreq.sink_stag),
669 		 msn = be32_to_cpu(srx->hdr.rreq.ddp_msn);
670 
671 	int run_sq = 1, rv = 0;
672 	unsigned long flags;
673 
674 	if (unlikely(msn != srx->ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ])) {
675 		siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
676 				   DDP_ETYPE_UNTAGGED_BUF,
677 				   DDP_ECODE_UT_INVALID_MSN_RANGE, 0);
678 		return -EPROTO;
679 	}
680 	spin_lock_irqsave(&qp->sq_lock, flags);
681 
682 	if (tx_work->wr_status == SIW_WR_IDLE) {
683 		/*
684 		 * immediately schedule READ response w/o
685 		 * consuming IRQ entry: IRQ must be empty.
686 		 */
687 		tx_work->processed = 0;
688 		tx_work->mem[0] = NULL;
689 		tx_work->wr_status = SIW_WR_QUEUED;
690 		resp = &tx_work->sqe;
691 	} else {
692 		resp = irq_alloc_free(qp);
693 		run_sq = 0;
694 	}
695 	if (likely(resp)) {
696 		resp->opcode = SIW_OP_READ_RESPONSE;
697 
698 		resp->sge[0].length = length;
699 		resp->sge[0].laddr = laddr;
700 		resp->sge[0].lkey = lkey;
701 
702 		/* Keep aside message sequence number for potential
703 		 * error reporting during Read Response generation.
704 		 */
705 		resp->sge[1].length = msn;
706 
707 		resp->raddr = raddr;
708 		resp->rkey = rkey;
709 		resp->num_sge = length ? 1 : 0;
710 
711 		/* RRESP now valid as current TX wqe or placed into IRQ */
712 		smp_store_mb(resp->flags, SIW_WQE_VALID);
713 	} else {
714 		pr_warn("siw: [QP %u]: irq %d exceeded %d\n", qp_id(qp),
715 			qp->irq_put % qp->attrs.irq_size, qp->attrs.irq_size);
716 
717 		siw_init_terminate(qp, TERM_ERROR_LAYER_RDMAP,
718 				   RDMAP_ETYPE_REMOTE_OPERATION,
719 				   RDMAP_ECODE_CATASTROPHIC_STREAM, 0);
720 		rv = -EPROTO;
721 	}
722 
723 	spin_unlock_irqrestore(&qp->sq_lock, flags);
724 
725 	if (run_sq)
726 		rv = siw_sq_start(qp);
727 
728 	return rv;
729 }
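
/*
 * Shape of the pseudo READ RESPONSE SQE built above, shown for an
 * assumed inbound RREQ with sink_stag S, sink_to T, source_stag L,
 * source_to A, read_size N and ddp_msn M:
 *
 *   resp->opcode        = SIW_OP_READ_RESPONSE
 *   resp->sge[0]        = { .laddr = A, .lkey = L, .length = N }
 *   resp->raddr/->rkey  = T / S      (peer's sink buffer)
 *   resp->sge[1].length = M          (stashed MSN for error reporting)
 *   resp->num_sge       = N ? 1 : 0
 *
 * It either becomes the current TX WQE (IRQ empty, SQ gets kicked) or
 * is appended to the IRQ for the active transmitter to pick up later.
 */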
730 
731 /*
732  * Only called at start of Read.Response processing.
733  * Transfer pending Read from tip of ORQ into current rx wqe,
734  * but keep ORQ entry valid until Read.Response processing done.
735  * No Queue locking needed.
736  */
737 static int siw_orqe_start_rx(struct siw_qp *qp)
738 {
739 	struct siw_sqe *orqe;
740 	struct siw_wqe *wqe = NULL;
741 
742 	/* make sure ORQ indices are current */
743 	smp_mb();
744 
745 	orqe = orq_get_current(qp);
746 	if (READ_ONCE(orqe->flags) & SIW_WQE_VALID) {
747 		/* RRESP is a TAGGED RDMAP operation */
748 		wqe = rx_wqe(&qp->rx_tagged);
749 		wqe->sqe.id = orqe->id;
750 		wqe->sqe.opcode = orqe->opcode;
751 		wqe->sqe.sge[0].laddr = orqe->sge[0].laddr;
752 		wqe->sqe.sge[0].lkey = orqe->sge[0].lkey;
753 		wqe->sqe.sge[0].length = orqe->sge[0].length;
754 		wqe->sqe.flags = orqe->flags;
755 		wqe->sqe.num_sge = 1;
756 		wqe->bytes = orqe->sge[0].length;
757 		wqe->processed = 0;
758 		wqe->mem[0] = NULL;
759 		/* make sure WQE is completely written before valid */
760 		smp_wmb();
761 		wqe->wr_status = SIW_WR_INPROGRESS;
762 
763 		return 0;
764 	}
765 	return -EPROTO;
766 }
767 
768 /*
769  * siw_proc_rresp:
770  *
771  * Place incoming RRESP data into memory referenced by RREQ WQE
772  * which is at the tip of the ORQ
773  *
774  * Function supports partially received RRESP's (suspending/resuming
775  * current receive processing)
776  */
777 int siw_proc_rresp(struct siw_qp *qp)
778 {
779 	struct siw_rx_stream *srx = &qp->rx_stream;
780 	struct siw_rx_fpdu *frx = &qp->rx_tagged;
781 	struct siw_wqe *wqe = rx_wqe(frx);
782 	struct siw_mem **mem, *mem_p;
783 	struct siw_sge *sge;
784 	int bytes, rv;
785 
786 	if (frx->first_ddp_seg) {
787 		if (unlikely(wqe->wr_status != SIW_WR_IDLE)) {
788 			pr_warn("siw: [QP %u]: proc RRESP: status %d, op %d\n",
789 				qp_id(qp), wqe->wr_status, wqe->sqe.opcode);
790 			rv = -EPROTO;
791 			goto error_term;
792 		}
793 		/*
794 		 * fetch pending RREQ from orq
795 		 */
796 		rv = siw_orqe_start_rx(qp);
797 		if (rv) {
798 			pr_warn("siw: [QP %u]: ORQ empty at idx %d\n",
799 				qp_id(qp), qp->orq_get % qp->attrs.orq_size);
800 			goto error_term;
801 		}
802 		rv = siw_rresp_check_ntoh(srx, frx);
803 		if (unlikely(rv)) {
804 			siw_qp_event(qp, IB_EVENT_QP_FATAL);
805 			return rv;
806 		}
807 	} else {
808 		if (unlikely(wqe->wr_status != SIW_WR_INPROGRESS)) {
809 			pr_warn("siw: [QP %u]: resume RRESP: status %d\n",
810 				qp_id(qp), wqe->wr_status);
811 			rv = -EPROTO;
812 			goto error_term;
813 		}
814 	}
815 	if (!srx->fpdu_part_rem) /* zero length RRESPONSE */
816 		return 0;
817 
818 	sge = wqe->sqe.sge; /* there is only one */
819 	mem = &wqe->mem[0];
820 
821 	if (!(*mem)) {
822 		/*
823 		 * check target memory; this resolves the memory object on the first fragment
824 		 */
825 		rv = siw_check_sge(qp->pd, sge, mem, IB_ACCESS_LOCAL_WRITE, 0,
826 				   wqe->bytes);
827 		if (unlikely(rv)) {
828 			siw_dbg_qp(qp, "target mem check: %d\n", rv);
829 			wqe->wc_status = SIW_WC_LOC_PROT_ERR;
830 
831 			siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
832 					   DDP_ETYPE_TAGGED_BUF,
833 					   siw_tagged_error(-rv), 0);
834 
835 			siw_qp_event(qp, IB_EVENT_QP_ACCESS_ERR);
836 
837 			return -EINVAL;
838 		}
839 	}
840 	mem_p = *mem;
841 
842 	bytes = min(srx->fpdu_part_rem, srx->skb_new);
843 
844 	if (mem_p->mem_obj == NULL)
845 		rv = siw_rx_kva(srx,
846 			(void *)(uintptr_t)(sge->laddr + wqe->processed),
847 			bytes);
848 	else if (!mem_p->is_pbl)
849 		rv = siw_rx_umem(srx, mem_p->umem, sge->laddr + wqe->processed,
850 				 bytes);
851 	else
852 		rv = siw_rx_pbl(srx, &frx->pbl_idx, mem_p,
853 				sge->laddr + wqe->processed, bytes);
854 	if (rv != bytes) {
855 		wqe->wc_status = SIW_WC_GENERAL_ERR;
856 		rv = -EINVAL;
857 		goto error_term;
858 	}
859 	srx->fpdu_part_rem -= rv;
860 	srx->fpdu_part_rcvd += rv;
861 	wqe->processed += rv;
862 
863 	if (!srx->fpdu_part_rem) {
864 		srx->ddp_to += srx->fpdu_part_rcvd;
865 		return 0;
866 	}
867 	return -EAGAIN;
868 
869 error_term:
870 	siw_init_terminate(qp, TERM_ERROR_LAYER_DDP, DDP_ETYPE_CATASTROPHIC,
871 			   DDP_ECODE_CATASTROPHIC, 0);
872 	return rv;
873 }
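
/*
 * Placement sketch for the function above (segment sizes assumed): the
 * single SGE copied from the ORQ entry by siw_orqe_start_rx() describes
 * the local READ sink buffer, so a response arriving in three DDP
 * segments of 1024 bytes each lands at laddr + 0, + 1024 and + 2048,
 * with wqe->processed tracking the running offset across segments and
 * -EAGAIN returned whenever the current skb holds only part of a
 * segment.
 */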
874 
875 int siw_proc_terminate(struct siw_qp *qp)
876 {
877 	struct siw_rx_stream *srx = &qp->rx_stream;
878 	struct sk_buff *skb = srx->skb;
879 	struct iwarp_terminate *term = &srx->hdr.terminate;
880 	union iwarp_hdr term_info;
881 	u8 *infop = (u8 *)&term_info;
882 	enum rdma_opcode op;
883 	u16 to_copy = sizeof(struct iwarp_ctrl);
884 
885 	pr_warn("siw: got TERMINATE. layer %d, type %d, code %d\n",
886 		__rdmap_term_layer(term), __rdmap_term_etype(term),
887 		__rdmap_term_ecode(term));
888 
889 	if (be32_to_cpu(term->ddp_qn) != RDMAP_UNTAGGED_QN_TERMINATE ||
890 	    be32_to_cpu(term->ddp_msn) !=
891 		    qp->rx_stream.ddp_msn[RDMAP_UNTAGGED_QN_TERMINATE] ||
892 	    be32_to_cpu(term->ddp_mo) != 0) {
893 		pr_warn("siw: rx bogus TERM [QN x%08x, MSN x%08x, MO x%08x]\n",
894 			be32_to_cpu(term->ddp_qn), be32_to_cpu(term->ddp_msn),
895 			be32_to_cpu(term->ddp_mo));
896 		return -ECONNRESET;
897 	}
898 	/*
899 	 * Receive remaining pieces of TERM if indicated
900 	 */
901 	if (!term->flag_m)
902 		return -ECONNRESET;
903 
904 	/* Do not bother to reassemble a network fragmented
905 	 * TERM message
906 	 */
907 	if (srx->skb_new < sizeof(struct iwarp_ctrl_tagged))
908 		return -ECONNRESET;
909 
910 	memset(infop, 0, sizeof(term_info));
911 
912 	skb_copy_bits(skb, srx->skb_offset, infop, to_copy);
913 
914 	op = __rdmap_get_opcode(&term_info.ctrl);
915 	if (op >= RDMAP_TERMINATE)
916 		goto out;
917 
918 	infop += to_copy;
919 	srx->skb_offset += to_copy;
920 	srx->skb_new -= to_copy;
921 	srx->skb_copied += to_copy;
922 	srx->fpdu_part_rcvd += to_copy;
923 	srx->fpdu_part_rem -= to_copy;
924 
925 	to_copy = iwarp_pktinfo[op].hdr_len - to_copy;
926 
927 	/* Again, no network fragmented TERMs */
928 	if (to_copy + MPA_CRC_SIZE > srx->skb_new)
929 		return -ECONNRESET;
930 
931 	skb_copy_bits(skb, srx->skb_offset, infop, to_copy);
932 
933 	if (term->flag_r) {
934 		siw_dbg_qp(qp, "TERM reports RDMAP hdr type %u, len %u (%s)\n",
935 			   op, be16_to_cpu(term_info.ctrl.mpa_len),
936 			   term->flag_m ? "valid" : "invalid");
937 	} else if (term->flag_d) {
938 		siw_dbg_qp(qp, "TERM reports DDP hdr type %u, len %u (%s)\n",
939 			   op, be16_to_cpu(term_info.ctrl.mpa_len),
940 			   term->flag_m ? "valid" : "invalid");
941 	}
942 out:
943 	srx->skb_new -= to_copy;
944 	srx->skb_offset += to_copy;
945 	srx->skb_copied += to_copy;
946 	srx->fpdu_part_rcvd += to_copy;
947 	srx->fpdu_part_rem -= to_copy;
948 
949 	return -ECONNRESET;
950 }
951 
952 static int siw_get_trailer(struct siw_qp *qp, struct siw_rx_stream *srx)
953 {
954 	struct sk_buff *skb = srx->skb;
955 	u8 *tbuf = (u8 *)&srx->trailer.crc - srx->pad;
956 	__wsum crc_in, crc_own = 0;
957 
958 	siw_dbg_qp(qp, "expected %d, available %d, pad %u\n",
959 		   srx->fpdu_part_rem, srx->skb_new, srx->pad);
960 
961 	if (srx->skb_new < srx->fpdu_part_rem)
962 		return -EAGAIN;
963 
964 	skb_copy_bits(skb, srx->skb_offset, tbuf, srx->fpdu_part_rem);
965 
966 	if (srx->mpa_crc_hd && srx->pad)
967 		crypto_shash_update(srx->mpa_crc_hd, tbuf, srx->pad);
968 
969 	srx->skb_new -= srx->fpdu_part_rem;
970 	srx->skb_offset += srx->fpdu_part_rem;
971 	srx->skb_copied += srx->fpdu_part_rem;
972 
973 	if (!srx->mpa_crc_hd)
974 		return 0;
975 
976 	/*
977 	 * CRC32 is computed, transmitted and received directly in NBO,
978 	 * so there's never a reason to convert byte order.
979 	 */
980 	crypto_shash_final(srx->mpa_crc_hd, (u8 *)&crc_own);
981 	crc_in = (__force __wsum)srx->trailer.crc;
982 
983 	if (unlikely(crc_in != crc_own)) {
984 		pr_warn("siw: crc error. in: %08x, own %08x, op %u\n",
985 			crc_in, crc_own, qp->rx_stream.rdmap_op);
986 
987 		siw_init_terminate(qp, TERM_ERROR_LAYER_LLP,
988 				   LLP_ETYPE_MPA,
989 				   LLP_ECODE_RECEIVED_CRC, 0);
990 		return -EINVAL;
991 	}
992 	return 0;
993 }
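
/*
 * Worked example for the trailer handling above (pad value assumed): an
 * FPDU whose MPA length implies 2 pad bytes waits for fpdu_part_rem =
 * 2 + MPA_CRC_SIZE trailer bytes. tbuf points 2 bytes in front of
 * trailer.crc, so after skb_copy_bits() the pad bytes sit just before
 * the CRC field; only those pad bytes are still fed into the running
 * CRC before it is finalized and compared against the received value.
 */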
994 
995 #define MIN_DDP_HDR sizeof(struct iwarp_ctrl_tagged)
996 
997 static int siw_get_hdr(struct siw_rx_stream *srx)
998 {
999 	struct sk_buff *skb = srx->skb;
1000 	struct siw_qp *qp = rx_qp(srx);
1001 	struct iwarp_ctrl *c_hdr = &srx->hdr.ctrl;
1002 	struct siw_rx_fpdu *frx;
1003 	u8 opcode;
1004 	int bytes;
1005 
1006 	if (srx->fpdu_part_rcvd < MIN_DDP_HDR) {
1007 		/*
1008 		 * copy a minimum sized (tagged) DDP frame control part
1009 		 */
1010 		bytes = min_t(int, srx->skb_new,
1011 			      MIN_DDP_HDR - srx->fpdu_part_rcvd);
1012 
1013 		skb_copy_bits(skb, srx->skb_offset,
1014 			      (char *)c_hdr + srx->fpdu_part_rcvd, bytes);
1015 
1016 		srx->fpdu_part_rcvd += bytes;
1017 
1018 		srx->skb_new -= bytes;
1019 		srx->skb_offset += bytes;
1020 		srx->skb_copied += bytes;
1021 
1022 		if (srx->fpdu_part_rcvd < MIN_DDP_HDR)
1023 			return -EAGAIN;
1024 
1025 		if (unlikely(__ddp_get_version(c_hdr) != DDP_VERSION)) {
1026 			enum ddp_etype etype;
1027 			enum ddp_ecode ecode;
1028 
1029 			pr_warn("siw: received ddp version unsupported %d\n",
1030 				__ddp_get_version(c_hdr));
1031 
1032 			if (c_hdr->ddp_rdmap_ctrl & DDP_FLAG_TAGGED) {
1033 				etype = DDP_ETYPE_TAGGED_BUF;
1034 				ecode = DDP_ECODE_T_VERSION;
1035 			} else {
1036 				etype = DDP_ETYPE_UNTAGGED_BUF;
1037 				ecode = DDP_ECODE_UT_VERSION;
1038 			}
1039 			siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_DDP,
1040 					   etype, ecode, 0);
1041 			return -EINVAL;
1042 		}
1043 		if (unlikely(__rdmap_get_version(c_hdr) != RDMAP_VERSION)) {
1044 			pr_warn("siw: received rdmap version unsupported %d\n",
1045 				__rdmap_get_version(c_hdr));
1046 
1047 			siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_RDMAP,
1048 					   RDMAP_ETYPE_REMOTE_OPERATION,
1049 					   RDMAP_ECODE_VERSION, 0);
1050 			return -EINVAL;
1051 		}
1052 		opcode = __rdmap_get_opcode(c_hdr);
1053 
1054 		if (opcode > RDMAP_TERMINATE) {
1055 			pr_warn("siw: received unknown packet type %u\n",
1056 				opcode);
1057 
1058 			siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_RDMAP,
1059 					   RDMAP_ETYPE_REMOTE_OPERATION,
1060 					   RDMAP_ECODE_OPCODE, 0);
1061 			return -EINVAL;
1062 		}
1063 		siw_dbg_qp(rx_qp(srx), "new header, opcode %u\n", opcode);
1064 	} else {
1065 		opcode = __rdmap_get_opcode(c_hdr);
1066 	}
1067 	set_rx_fpdu_context(qp, opcode);
1068 	frx = qp->rx_fpdu;
1069 
1070 	/*
1071 	 * Figure out len of current hdr: variable length of
1072 	 * iwarp hdr may force us to copy hdr information in
1073 	 * two steps. Only tagged DDP messages are already
1074 	 * completely received.
1075 	 */
1076 	if (iwarp_pktinfo[opcode].hdr_len > sizeof(struct iwarp_ctrl_tagged)) {
1077 		bytes = iwarp_pktinfo[opcode].hdr_len - MIN_DDP_HDR;
1078 
1079 		if (srx->skb_new < bytes)
1080 			return -EAGAIN;
1081 
1082 		skb_copy_bits(skb, srx->skb_offset,
1083 			      (char *)c_hdr + srx->fpdu_part_rcvd, bytes);
1084 
1085 		srx->fpdu_part_rcvd += bytes;
1086 
1087 		srx->skb_new -= bytes;
1088 		srx->skb_offset += bytes;
1089 		srx->skb_copied += bytes;
1090 	}
1091 
1092 	/*
1093 	 * DDP/RDMAP header receive completed. Check if the current
1094 	 * DDP segment starts a new RDMAP message or continues a previously
1095 	 * started RDMAP message.
1096 	 *
1097 	 * Alternating reception of DDP segments (or FPDUs) from incomplete
1098 	 * tagged and untagged RDMAP messages is supported, as long as
1099 	 * the current tagged or untagged message gets eventually completed
1100 	 * w/o intersection from another message of the same type
1101 	 * (tagged/untagged). E.g., a WRITE can get intersected by a SEND,
1102 	 * but not by a READ RESPONSE etc.
1103 	 */
1104 	if (srx->mpa_crc_hd) {
1105 		/*
1106 		 * Restart CRC computation
1107 		 */
1108 		crypto_shash_init(srx->mpa_crc_hd);
1109 		crypto_shash_update(srx->mpa_crc_hd, (u8 *)c_hdr,
1110 				    srx->fpdu_part_rcvd);
1111 	}
1112 	if (frx->more_ddp_segs) {
1113 		frx->first_ddp_seg = 0;
1114 		if (frx->prev_rdmap_op != opcode) {
1115 			pr_warn("siw: packet intersection: %u : %u\n",
1116 				frx->prev_rdmap_op, opcode);
1117 			/*
1118 			 * The last inbound RDMA operation of same type
1119 			 * (tagged or untagged) is left unfinished.
1120 			 * To complete it in error, make it the current
1121 			 * operation again, even with the header already
1122 			 * overwritten. For error handling, only the opcode
1123 			 * and current rx context are relevant.
1124 			 */
1125 			set_rx_fpdu_context(qp, frx->prev_rdmap_op);
1126 			__rdmap_set_opcode(c_hdr, frx->prev_rdmap_op);
1127 			return -EPROTO;
1128 		}
1129 	} else {
1130 		frx->prev_rdmap_op = opcode;
1131 		frx->first_ddp_seg = 1;
1132 	}
1133 	frx->more_ddp_segs = c_hdr->ddp_rdmap_ctrl & DDP_FLAG_LAST ? 0 : 1;
1134 
1135 	return 0;
1136 }
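
/*
 * Illustrative note on the two-step header copy above (exact sizes live
 * in iwarp_pktinfo[] and the wire format definitions): a tagged WRITE
 * header already fits into the first MIN_DDP_HDR bytes, while e.g. an
 * RDMA READ REQUEST header is longer, so the remaining
 *   iwarp_pktinfo[RDMAP_RDMA_READ_REQ].hdr_len - MIN_DDP_HDR
 * bytes are pulled from the skb in a second step once available;
 * otherwise -EAGAIN defers header completion to the next skb.
 */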
1137 
1138 static int siw_check_tx_fence(struct siw_qp *qp)
1139 {
1140 	struct siw_wqe *tx_waiting = tx_wqe(qp);
1141 	struct siw_sqe *rreq;
1142 	int resume_tx = 0, rv = 0;
1143 	unsigned long flags;
1144 
1145 	spin_lock_irqsave(&qp->orq_lock, flags);
1146 
1147 	rreq = orq_get_current(qp);
1148 
1149 	/* free current orq entry */
1150 	WRITE_ONCE(rreq->flags, 0);
1151 
1152 	if (qp->tx_ctx.orq_fence) {
1153 		if (unlikely(tx_waiting->wr_status != SIW_WR_QUEUED)) {
1154 			pr_warn("siw: [QP %u]: fence resume: bad status %d\n",
1155 				qp_id(qp), tx_waiting->wr_status);
1156 			rv = -EPROTO;
1157 			goto out;
1158 		}
1159 		/* resume SQ processing */
1160 		if (tx_waiting->sqe.opcode == SIW_OP_READ ||
1161 		    tx_waiting->sqe.opcode == SIW_OP_READ_LOCAL_INV) {
1162 			rreq = orq_get_tail(qp);
1163 			if (unlikely(!rreq)) {
1164 				pr_warn("siw: [QP %u]: no ORQE\n", qp_id(qp));
1165 				rv = -EPROTO;
1166 				goto out;
1167 			}
1168 			siw_read_to_orq(rreq, &tx_waiting->sqe);
1169 
1170 			qp->orq_put++;
1171 			qp->tx_ctx.orq_fence = 0;
1172 			resume_tx = 1;
1173 
1174 		} else if (siw_orq_empty(qp)) {
1175 			qp->tx_ctx.orq_fence = 0;
1176 			resume_tx = 1;
1177 		} else {
1178 			pr_warn("siw: [QP %u]: fence resume: orq idx: %d:%d\n",
1179 				qp_id(qp), qp->orq_get, qp->orq_put);
1180 			rv = -EPROTO;
1181 		}
1182 	}
1183 	qp->orq_get++;
1184 out:
1185 	spin_unlock_irqrestore(&qp->orq_lock, flags);
1186 
1187 	if (resume_tx)
1188 		rv = siw_sq_start(qp);
1189 
1190 	return rv;
1191 }
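
/*
 * Example sequence the fence logic above resolves (scenario assumed):
 * the SQ stalled with orq_fence set because a READ was posted while the
 * ORQ was full. When the READ RESPONSE completing the ORQ head arrives,
 * its entry is freed here, the waiting READ is moved into the ORQ tail
 * via siw_read_to_orq(), and siw_sq_start() resumes transmission. A
 * waiting non-READ work request resumes only once the ORQ has drained
 * completely (siw_orq_empty()).
 */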
1192 
1193 /*
1194  * siw_rdmap_complete()
1195  *
1196  * Complete processing of an RDMA message after receiving all
1197  * DDP segments or abort processing after encountering an error case.
1198  *
1199  *   o SENDs + RRESPs need completion processing,
1200  *   o RREQs need READ RESPONSE initialization,
1201  *   o WRITEs need memory dereferencing
1202  *
1203  * TODO: Failed WRITEs need local error to be surfaced.
1204  */
1205 static int siw_rdmap_complete(struct siw_qp *qp, int error)
1206 {
1207 	struct siw_rx_stream *srx = &qp->rx_stream;
1208 	struct siw_wqe *wqe = rx_wqe(qp->rx_fpdu);
1209 	enum siw_wc_status wc_status = wqe->wc_status;
1210 	u8 opcode = __rdmap_get_opcode(&srx->hdr.ctrl);
1211 	int rv = 0;
1212 
1213 	switch (opcode) {
1214 	case RDMAP_SEND_SE:
1215 	case RDMAP_SEND_SE_INVAL:
1216 		wqe->rqe.flags |= SIW_WQE_SOLICITED;
1217 		/* Fall through */
1218 
1219 	case RDMAP_SEND:
1220 	case RDMAP_SEND_INVAL:
1221 		if (wqe->wr_status == SIW_WR_IDLE)
1222 			break;
1223 
1224 		srx->ddp_msn[RDMAP_UNTAGGED_QN_SEND]++;
1225 
1226 		if (error != 0 && wc_status == SIW_WC_SUCCESS)
1227 			wc_status = SIW_WC_GENERAL_ERR;
1228 		/*
1229 		 * Handle STag invalidation request
1230 		 */
1231 		if (wc_status == SIW_WC_SUCCESS &&
1232 		    (opcode == RDMAP_SEND_INVAL ||
1233 		     opcode == RDMAP_SEND_SE_INVAL)) {
1234 			rv = siw_invalidate_stag(qp->pd, srx->inval_stag);
1235 			if (rv) {
1236 				siw_init_terminate(
1237 					qp, TERM_ERROR_LAYER_RDMAP,
1238 					rv == -EACCES ?
1239 						RDMAP_ETYPE_REMOTE_PROTECTION :
1240 						RDMAP_ETYPE_REMOTE_OPERATION,
1241 					RDMAP_ECODE_CANNOT_INVALIDATE, 0);
1242 
1243 				wc_status = SIW_WC_REM_INV_REQ_ERR;
1244 			}
1245 			rv = siw_rqe_complete(qp, &wqe->rqe, wqe->processed,
1246 					      rv ? 0 : srx->inval_stag,
1247 					      wc_status);
1248 		} else {
1249 			rv = siw_rqe_complete(qp, &wqe->rqe, wqe->processed,
1250 					      0, wc_status);
1251 		}
1252 		siw_wqe_put_mem(wqe, SIW_OP_RECEIVE);
1253 		break;
1254 
1255 	case RDMAP_RDMA_READ_RESP:
1256 		if (wqe->wr_status == SIW_WR_IDLE)
1257 			break;
1258 
1259 		if (error != 0) {
1260 			if ((srx->state == SIW_GET_HDR &&
1261 			     qp->rx_fpdu->first_ddp_seg) || error == -ENODATA)
1262 				/* possible RREQ in ORQ left untouched */
1263 				break;
1264 
1265 			if (wc_status == SIW_WC_SUCCESS)
1266 				wc_status = SIW_WC_GENERAL_ERR;
1267 		} else if (qp->kernel_verbs &&
1268 			   rx_type(wqe) == SIW_OP_READ_LOCAL_INV) {
1269 			/*
1270 			 * Handle any STag invalidation request
1271 			 */
1272 			rv = siw_invalidate_stag(qp->pd, wqe->sqe.sge[0].lkey);
1273 			if (rv) {
1274 				siw_init_terminate(qp, TERM_ERROR_LAYER_RDMAP,
1275 						   RDMAP_ETYPE_CATASTROPHIC,
1276 						   RDMAP_ECODE_UNSPECIFIED, 0);
1277 
1278 				if (wc_status == SIW_WC_SUCCESS) {
1279 					wc_status = SIW_WC_GENERAL_ERR;
1280 					error = rv;
1281 				}
1282 			}
1283 		}
1284 		/*
1285 		 * All errors turn the wqe into signalled.
1286 		 */
1287 		if ((wqe->sqe.flags & SIW_WQE_SIGNALLED) || error != 0)
1288 			rv = siw_sqe_complete(qp, &wqe->sqe, wqe->processed,
1289 					      wc_status);
1290 		siw_wqe_put_mem(wqe, SIW_OP_READ);
1291 
1292 		if (!error)
1293 			rv = siw_check_tx_fence(qp);
1294 		else
1295 			/* Disable current ORQ element */
1296 			WRITE_ONCE(orq_get_current(qp)->flags, 0);
1297 		break;
1298 
1299 	case RDMAP_RDMA_READ_REQ:
1300 		if (!error) {
1301 			rv = siw_init_rresp(qp, srx);
1302 			srx->ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ]++;
1303 		}
1304 		break;
1305 
1306 	case RDMAP_RDMA_WRITE:
1307 		if (wqe->wr_status == SIW_WR_IDLE)
1308 			break;
1309 
1310 		/*
1311 		 * Free the reference on the memory object if
1312 		 * attached to receive context (inbound WRITE).
1313 		 * While a zero-length WRITE is allowed,
1314 		 * no memory reference got created.
1315 		 */
1316 		if (rx_mem(&qp->rx_tagged)) {
1317 			siw_mem_put(rx_mem(&qp->rx_tagged));
1318 			rx_mem(&qp->rx_tagged) = NULL;
1319 		}
1320 		break;
1321 
1322 	default:
1323 		break;
1324 	}
1325 	wqe->wr_status = SIW_WR_IDLE;
1326 
1327 	return rv;
1328 }
1329 
1330 /*
1331  * siw_tcp_rx_data()
1332  *
1333  * Main routine to consume inbound TCP payload
1334  *
1335  * @rd_desc:	read descriptor
1336  * @skb:	socket buffer
1337  * @off:	offset in skb
1338  * @len:	skb->len - offset : payload in skb
1339  */
1340 int siw_tcp_rx_data(read_descriptor_t *rd_desc, struct sk_buff *skb,
1341 		    unsigned int off, size_t len)
1342 {
1343 	struct siw_qp *qp = rd_desc->arg.data;
1344 	struct siw_rx_stream *srx = &qp->rx_stream;
1345 	int rv;
1346 
1347 	srx->skb = skb;
1348 	srx->skb_new = skb->len - off;
1349 	srx->skb_offset = off;
1350 	srx->skb_copied = 0;
1351 
1352 	siw_dbg_qp(qp, "new data, len %d\n", srx->skb_new);
1353 
1354 	while (srx->skb_new) {
1355 		int run_completion = 1;
1356 
1357 		if (unlikely(srx->rx_suspend)) {
1358 			/* Do not process any more data */
1359 			srx->skb_copied += srx->skb_new;
1360 			break;
1361 		}
1362 		switch (srx->state) {
1363 		case SIW_GET_HDR:
1364 			rv = siw_get_hdr(srx);
1365 			if (!rv) {
1366 				srx->fpdu_part_rem =
1367 					be16_to_cpu(srx->hdr.ctrl.mpa_len) -
1368 					srx->fpdu_part_rcvd + MPA_HDR_SIZE;
1369 
1370 				if (srx->fpdu_part_rem)
1371 					srx->pad = -srx->fpdu_part_rem & 0x3;
1372 				else
1373 					srx->pad = 0;
1374 
1375 				srx->state = SIW_GET_DATA_START;
1376 				srx->fpdu_part_rcvd = 0;
1377 			}
1378 			break;
1379 
1380 		case SIW_GET_DATA_MORE:
1381 			/*
1382 			 * Another data fragment of the same DDP segment.
1383 			 * Setting first_ddp_seg = 0 avoids repeating
1384 			 * initializations that shall occur only once per
1385 			 * DDP segment.
1386 			 */
1387 			qp->rx_fpdu->first_ddp_seg = 0;
1388 			/* Fall through */
1389 
1390 		case SIW_GET_DATA_START:
1391 			/*
1392 			 * Headers will be checked by the opcode-specific
1393 			 * data receive function below.
1394 			 */
1395 			rv = iwarp_pktinfo[qp->rx_stream.rdmap_op].rx_data(qp);
1396 			if (!rv) {
1397 				int mpa_len =
1398 					be16_to_cpu(srx->hdr.ctrl.mpa_len)
1399 					+ MPA_HDR_SIZE;
1400 
1401 				srx->fpdu_part_rem = (-mpa_len & 0x3)
1402 						      + MPA_CRC_SIZE;
1403 				srx->fpdu_part_rcvd = 0;
1404 				srx->state = SIW_GET_TRAILER;
1405 			} else {
1406 				if (unlikely(rv == -ECONNRESET))
1407 					run_completion = 0;
1408 				else
1409 					srx->state = SIW_GET_DATA_MORE;
1410 			}
1411 			break;
1412 
1413 		case SIW_GET_TRAILER:
1414 			/*
1415 			 * read CRC + any padding
1416 			 */
1417 			rv = siw_get_trailer(qp, srx);
1418 			if (likely(!rv)) {
1419 				/*
1420 				 * FPDU completed.
1421 				 * complete RDMAP message if last fragment
1422 				 */
1423 				srx->state = SIW_GET_HDR;
1424 				srx->fpdu_part_rcvd = 0;
1425 
1426 				if (!(srx->hdr.ctrl.ddp_rdmap_ctrl &
1427 				      DDP_FLAG_LAST))
1428 					/* more frags */
1429 					break;
1430 
1431 				rv = siw_rdmap_complete(qp, 0);
1432 				run_completion = 0;
1433 			}
1434 			break;
1435 
1436 		default:
1437 			pr_warn("QP[%u]: RX out of state\n", qp_id(qp));
1438 			rv = -EPROTO;
1439 			run_completion = 0;
1440 		}
1441 		if (unlikely(rv != 0 && rv != -EAGAIN)) {
1442 			if ((srx->state > SIW_GET_HDR ||
1443 			     qp->rx_fpdu->more_ddp_segs) && run_completion)
1444 				siw_rdmap_complete(qp, rv);
1445 
1446 			siw_dbg_qp(qp, "rx error %d, rx state %d\n", rv,
1447 				   srx->state);
1448 
1449 			siw_qp_cm_drop(qp, 1);
1450 
1451 			break;
1452 		}
1453 		if (rv) {
1454 			siw_dbg_qp(qp, "fpdu fragment, state %d, missing %d\n",
1455 				   srx->state, srx->fpdu_part_rem);
1456 			break;
1457 		}
1458 	}
1459 	return srx->skb_copied;
1460 }
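
/*
 * Per-FPDU walk of the receive state machine above (summary only, no
 * new logic):
 *
 *   SIW_GET_HDR        - payload bytes still expected afterwards:
 *                        MPA_HDR_SIZE + mpa_len - fpdu_part_rcvd
 *   SIW_GET_DATA_START/
 *   SIW_GET_DATA_MORE  - opcode specific rx_data() placement, looping
 *                        via -EAGAIN while the skb runs dry
 *   SIW_GET_TRAILER    - (-(MPA_HDR_SIZE + mpa_len) & 0x3) pad bytes
 *                        plus MPA_CRC_SIZE, CRC check and, for the last
 *                        DDP segment, siw_rdmap_complete()
 *
 * The function returns the number of bytes consumed (skb_copied) to the
 * socket's read_sock callback path.
 */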
1461