xref: /openbmc/linux/drivers/infiniband/hw/hfi1/rc.c (revision 68198dca)
1 /*
2  * Copyright(c) 2015, 2016 Intel Corporation.
3  *
4  * This file is provided under a dual BSD/GPLv2 license.  When using or
5  * redistributing this file, you may do so under either license.
6  *
7  * GPL LICENSE SUMMARY
8  *
9  * This program is free software; you can redistribute it and/or modify
10  * it under the terms of version 2 of the GNU General Public License as
11  * published by the Free Software Foundation.
12  *
13  * This program is distributed in the hope that it will be useful, but
14  * WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16  * General Public License for more details.
17  *
18  * BSD LICENSE
19  *
20  * Redistribution and use in source and binary forms, with or without
21  * modification, are permitted provided that the following conditions
22  * are met:
23  *
24  *  - Redistributions of source code must retain the above copyright
25  *    notice, this list of conditions and the following disclaimer.
26  *  - Redistributions in binary form must reproduce the above copyright
27  *    notice, this list of conditions and the following disclaimer in
28  *    the documentation and/or other materials provided with the
29  *    distribution.
30  *  - Neither the name of Intel Corporation nor the names of its
31  *    contributors may be used to endorse or promote products derived
32  *    from this software without specific prior written permission.
33  *
34  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
35  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
36  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
37  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
38  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
39  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
40  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
41  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
42  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
43  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
44  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
45  *
46  */
47 
48 #include <linux/io.h>
49 #include <rdma/rdma_vt.h>
50 #include <rdma/rdmavt_qp.h>
51 
52 #include "hfi.h"
53 #include "qp.h"
54 #include "verbs_txreq.h"
55 #include "trace.h"
56 
57 /* cut down ridiculously long IB macro names */
58 #define OP(x) RC_OP(x)
59 
60 static u32 restart_sge(struct rvt_sge_state *ss, struct rvt_swqe *wqe,
61 		       u32 psn, u32 pmtu)
62 {
63 	u32 len;
64 
65 	len = delta_psn(psn, wqe->psn) * pmtu;
66 	ss->sge = wqe->sg_list[0];
67 	ss->sg_list = wqe->sg_list + 1;
68 	ss->num_sge = wqe->wr.num_sge;
69 	ss->total_len = wqe->length;
70 	rvt_skip_sge(ss, len, false);
71 	return wqe->length - len;
72 }
73 
/**
 * make_rc_ack - construct a response packet (ACK, NAK, or RDMA read)
 * @dev: the device for this QP (not referenced in this function's body)
 * @qp: a pointer to the QP
 * @ohdr: a pointer to the IB header being constructed
 * @ps: the xmit packet state
 *
 * The packet to build is selected by qp->s_ack_state, which is advanced
 * here so that the next call continues where this one stopped.
 *
 * Return 1 if constructed; otherwise, return 0.  When 0 is returned,
 * s_ack_state is reset to ACKNOWLEDGE and RVT_S_RESP_PENDING,
 * RVT_S_ACK_PENDING and RVT_S_AHG_VALID are cleared from qp->s_flags.
 * Note that we are in the responder's side of the QP context.
 * Note the QP s_lock must be held.
 */
static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp,
		       struct ib_other_headers *ohdr,
		       struct hfi1_pkt_state *ps)
{
	struct rvt_ack_entry *e;
	u32 hwords;
	u32 len;
	u32 bth0;
	u32 bth2;
	int middle = 0;
	u32 pmtu = qp->pmtu;
	struct hfi1_qp_priv *priv = qp->priv;

	lockdep_assert_held(&qp->s_lock);
	/* Don't send an ACK if we aren't supposed to. */
	if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK))
		goto bail;

	if (priv->hdr_type == HFI1_PKT_TYPE_9B)
		/* header size in 32-bit words LRH+BTH = (8+12)/4. */
		hwords = 5;
	else
		/* header size in 32-bit words 16B LRH+BTH = (16+12)/4. */
		hwords = 7;

	switch (qp->s_ack_state) {
	case OP(RDMA_READ_RESPONSE_LAST):
	case OP(RDMA_READ_RESPONSE_ONLY):
		/* The read response is complete: drop the MR reference. */
		e = &qp->s_ack_queue[qp->s_tail_ack_queue];
		if (e->rdma_sge.mr) {
			rvt_put_mr(e->rdma_sge.mr);
			e->rdma_sge.mr = NULL;
		}
		/* FALLTHROUGH */
	case OP(ATOMIC_ACKNOWLEDGE):
		/*
		 * We can increment the tail pointer now that the last
		 * response has been sent instead of only being
		 * constructed.
		 *
		 * NOTE(review): wrapping with '>' implies s_ack_queue holds
		 * HFI1_MAX_RDMA_ATOMIC + 1 entries - confirm against the
		 * allocation.
		 */
		if (++qp->s_tail_ack_queue > HFI1_MAX_RDMA_ATOMIC)
			qp->s_tail_ack_queue = 0;
		/* FALLTHROUGH */
	case OP(SEND_ONLY):
	case OP(ACKNOWLEDGE):
		/* Check for no next entry in the queue. */
		if (qp->r_head_ack_queue == qp->s_tail_ack_queue) {
			/* Nothing queued: send a plain ACK if one is owed. */
			if (qp->s_flags & RVT_S_ACK_PENDING)
				goto normal;
			goto bail;
		}

		e = &qp->s_ack_queue[qp->s_tail_ack_queue];
		if (e->opcode == OP(RDMA_READ_REQUEST)) {
			/*
			 * If a RDMA read response is being resent and
			 * we haven't seen the duplicate request yet,
			 * then stop sending the remaining responses the
			 * responder has seen until the requester re-sends it.
			 */
			len = e->rdma_sge.sge_length;
			if (len && !e->rdma_sge.mr) {
				qp->s_tail_ack_queue = qp->r_head_ack_queue;
				goto bail;
			}
			/* Copy SGE state in case we need to resend */
			ps->s_txreq->mr = e->rdma_sge.mr;
			if (ps->s_txreq->mr)
				rvt_get_mr(ps->s_txreq->mr);
			qp->s_ack_rdma_sge.sge = e->rdma_sge;
			qp->s_ack_rdma_sge.num_sge = 1;
			ps->s_txreq->ss = &qp->s_ack_rdma_sge;
			/* More than one packet needed: start with FIRST. */
			if (len > pmtu) {
				len = pmtu;
				qp->s_ack_state = OP(RDMA_READ_RESPONSE_FIRST);
			} else {
				qp->s_ack_state = OP(RDMA_READ_RESPONSE_ONLY);
				e->sent = 1;
			}
			ohdr->u.aeth = rvt_compute_aeth(qp);
			hwords++;
			qp->s_ack_rdma_psn = e->psn;
			bth2 = mask_psn(qp->s_ack_rdma_psn++);
		} else {
			/* COMPARE_SWAP or FETCH_ADD */
			ps->s_txreq->ss = NULL;
			len = 0;
			qp->s_ack_state = OP(ATOMIC_ACKNOWLEDGE);
			ohdr->u.at.aeth = rvt_compute_aeth(qp);
			ib_u64_put(e->atomic_data, &ohdr->u.at.atomic_ack_eth);
			hwords += sizeof(ohdr->u.at) / sizeof(u32);
			bth2 = mask_psn(e->psn);
			e->sent = 1;
		}
		bth0 = qp->s_ack_state << 24;
		break;

	case OP(RDMA_READ_RESPONSE_FIRST):
		qp->s_ack_state = OP(RDMA_READ_RESPONSE_MIDDLE);
		/* FALLTHROUGH */
	case OP(RDMA_READ_RESPONSE_MIDDLE):
		ps->s_txreq->ss = &qp->s_ack_rdma_sge;
		ps->s_txreq->mr = qp->s_ack_rdma_sge.sge.mr;
		if (ps->s_txreq->mr)
			rvt_get_mr(ps->s_txreq->mr);
		len = qp->s_ack_rdma_sge.sge.sge_length;
		if (len > pmtu) {
			/* Still a MIDDLE packet; no AETH is added. */
			len = pmtu;
			middle = HFI1_CAP_IS_KSET(SDMA_AHG);
		} else {
			/* Remaining data fits: this is the LAST packet. */
			ohdr->u.aeth = rvt_compute_aeth(qp);
			hwords++;
			qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST);
			e = &qp->s_ack_queue[qp->s_tail_ack_queue];
			e->sent = 1;
		}
		bth0 = qp->s_ack_state << 24;
		bth2 = mask_psn(qp->s_ack_rdma_psn++);
		break;

	default:
normal:
		/*
		 * Send a regular ACK.
		 * Set the s_ack_state so we wait until after sending
		 * the ACK before setting s_ack_state to ACKNOWLEDGE
		 * (see above).
		 */
		qp->s_ack_state = OP(SEND_ONLY);
		qp->s_flags &= ~RVT_S_ACK_PENDING;
		ps->s_txreq->ss = NULL;
		/* A pending NAK code replaces the computed AETH. */
		if (qp->s_nak_state)
			ohdr->u.aeth =
				cpu_to_be32((qp->r_msn & IB_MSN_MASK) |
					    (qp->s_nak_state <<
					     IB_AETH_CREDIT_SHIFT));
		else
			ohdr->u.aeth = rvt_compute_aeth(qp);
		hwords++;
		len = 0;
		bth0 = OP(ACKNOWLEDGE) << 24;
		bth2 = mask_psn(qp->s_ack_psn);
	}
	/* Common tail: record sizes in the txreq and build the header. */
	qp->s_rdma_ack_cnt++;
	qp->s_hdrwords = hwords;
	ps->s_txreq->sde = priv->s_sde;
	ps->s_txreq->s_cur_size = len;
	hfi1_make_ruc_header(qp, ohdr, bth0, bth2, middle, ps);
	/* pbc */
	ps->s_txreq->hdr_dwords = qp->s_hdrwords + 2;
	return 1;

bail:
	/* Nothing to send: go back to the idle responder state. */
	qp->s_ack_state = OP(ACKNOWLEDGE);
	/*
	 * Ensure s_rdma_ack_cnt changes are committed prior to resetting
	 * RVT_S_RESP_PENDING
	 */
	smp_wmb();
	qp->s_flags &= ~(RVT_S_RESP_PENDING
				| RVT_S_ACK_PENDING
				| RVT_S_AHG_VALID);
	return 0;
}
249 
/**
 * hfi1_make_rc_req - construct a request packet (SEND, RDMA r/w, ATOMIC)
 * @qp: a pointer to the QP
 * @ps: the xmit packet state; ps->s_txreq is allocated here and holds the
 *      packet on success
 *
 * Assumes s_lock is held.
 *
 * Pending responses (see make_rc_ack()) are built in preference to new
 * requests.
 *
 * Return 1 if constructed; otherwise, return 0.  1 is also returned, with
 * ps->s_txreq set to NULL, after a WQE was completed here without building
 * a packet (flush in the error state, or a local operation).  When 0 is
 * returned, ps->s_txreq is NULL, RVT_S_BUSY is cleared from qp->s_flags
 * and qp->s_hdrwords is zeroed.
 */
int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
{
	struct hfi1_qp_priv *priv = qp->priv;
	struct hfi1_ibdev *dev = to_idev(qp->ibqp.device);
	struct ib_other_headers *ohdr;
	struct rvt_sge_state *ss;
	struct rvt_swqe *wqe;
	u32 hwords;
	u32 len;
	u32 bth0 = 0;
	u32 bth2;
	u32 pmtu = qp->pmtu;
	char newreq;
	int middle = 0;
	int delta;

	lockdep_assert_held(&qp->s_lock);
	ps->s_txreq = get_txreq(ps->dev, qp);
	if (IS_ERR(ps->s_txreq))
		goto bail_no_tx;

	/* Pick the base header size and where the BTH lives in the txreq. */
	if (priv->hdr_type == HFI1_PKT_TYPE_9B) {
		/* header size in 32-bit words LRH+BTH = (8+12)/4. */
		hwords = 5;
		if (rdma_ah_get_ah_flags(&qp->remote_ah_attr) & IB_AH_GRH)
			ohdr = &ps->s_txreq->phdr.hdr.ibh.u.l.oth;
		else
			ohdr = &ps->s_txreq->phdr.hdr.ibh.u.oth;
	} else {
		/* header size in 32-bit words 16B LRH+BTH = (16+12)/4. */
		hwords = 7;
		if ((rdma_ah_get_ah_flags(&qp->remote_ah_attr) & IB_AH_GRH) &&
		    (hfi1_check_mcast(rdma_ah_get_dlid(&qp->remote_ah_attr))))
			ohdr = &ps->s_txreq->phdr.hdr.opah.u.l.oth;
		else
			ohdr = &ps->s_txreq->phdr.hdr.opah.u.oth;
	}

	/* Sending responses has higher priority over sending requests. */
	if ((qp->s_flags & RVT_S_RESP_PENDING) &&
	    make_rc_ack(dev, qp, ohdr, ps))
		return 1;

	if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_SEND_OK)) {
		if (!(ib_rvt_state_ops[qp->state] & RVT_FLUSH_SEND))
			goto bail;
		/* We are in the error state, flush the work request. */
		smp_read_barrier_depends(); /* see post_one_send() */
		if (qp->s_last == READ_ONCE(qp->s_head))
			goto bail;
		/* If DMAs are in progress, we can't flush immediately. */
		if (iowait_sdma_pending(&priv->s_iowait)) {
			qp->s_flags |= RVT_S_WAIT_DMA;
			goto bail;
		}
		clear_ahg(qp);
		wqe = rvt_get_swqe_ptr(qp, qp->s_last);
		/*
		 * WQEs that precede s_acked complete with success; the
		 * others are flushed.
		 */
		hfi1_send_complete(qp, wqe, qp->s_last != qp->s_acked ?
			IB_WC_SUCCESS : IB_WC_WR_FLUSH_ERR);
		/* will get called again */
		goto done_free_tx;
	}

	if (qp->s_flags & (RVT_S_WAIT_RNR | RVT_S_WAIT_ACK))
		goto bail;

	/*
	 * s_psn at or below s_sending_hpsn: wait (RVT_S_WAIT_PSN) while
	 * s_sending_psn <= s_sending_hpsn, otherwise restart the sending
	 * window at s_psn.
	 */
	if (cmp_psn(qp->s_psn, qp->s_sending_hpsn) <= 0) {
		if (cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) <= 0) {
			qp->s_flags |= RVT_S_WAIT_PSN;
			goto bail;
		}
		qp->s_sending_psn = qp->s_psn;
		qp->s_sending_hpsn = qp->s_psn - 1;
	}

	/* Send a request. */
	wqe = rvt_get_swqe_ptr(qp, qp->s_cur);
	switch (qp->s_state) {
	default:
		if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_NEXT_SEND_OK))
			goto bail;
		/*
		 * Resend an old request or start a new one.
		 *
		 * We keep track of the current SWQE so that
		 * we don't reset the "furthest progress" state
		 * if we need to back up.
		 */
		newreq = 0;
		if (qp->s_cur == qp->s_tail) {
			/* Check if send work queue is empty. */
			smp_read_barrier_depends(); /* see post_one_send() */
			if (qp->s_tail == READ_ONCE(qp->s_head)) {
				clear_ahg(qp);
				goto bail;
			}
			/*
			 * If a fence is requested, wait for previous
			 * RDMA read and atomic operations to finish.
			 */
			if ((wqe->wr.send_flags & IB_SEND_FENCE) &&
			    qp->s_num_rd_atomic) {
				qp->s_flags |= RVT_S_WAIT_FENCE;
				goto bail;
			}
			/*
			 * Local operations are processed immediately
			 * after all prior requests have completed
			 */
			if (wqe->wr.opcode == IB_WR_REG_MR ||
			    wqe->wr.opcode == IB_WR_LOCAL_INV) {
				int local_ops = 0;
				int err = 0;

				if (qp->s_last != qp->s_cur)
					goto bail;
				if (++qp->s_cur == qp->s_size)
					qp->s_cur = 0;
				if (++qp->s_tail == qp->s_size)
					qp->s_tail = 0;
				if (!(wqe->wr.send_flags &
				      RVT_SEND_COMPLETION_ONLY)) {
					err = rvt_invalidate_rkey(
						qp,
						wqe->wr.ex.invalidate_rkey);
					local_ops = 1;
				}
				hfi1_send_complete(qp, wqe,
						   err ? IB_WC_LOC_PROT_ERR
						       : IB_WC_SUCCESS);
				if (local_ops)
					atomic_dec(&qp->local_ops_pending);
				qp->s_hdrwords = 0;
				goto done_free_tx;
			}

			newreq = 1;
			qp->s_psn = wqe->psn;
		}
		/*
		 * Note that we have to be careful not to modify the
		 * original work request since we may need to resend
		 * it.
		 */
		len = wqe->length;
		ss = &qp->s_sge;
		bth2 = mask_psn(qp->s_psn);
		switch (wqe->wr.opcode) {
		case IB_WR_SEND:
		case IB_WR_SEND_WITH_IMM:
		case IB_WR_SEND_WITH_INV:
			/* If no credit, return. */
			if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT) &&
			    rvt_cmp_msn(wqe->ssn, qp->s_lsn + 1) > 0) {
				qp->s_flags |= RVT_S_WAIT_SSN_CREDIT;
				goto bail;
			}
			/* Multi-packet: s_cur stays put until the last one. */
			if (len > pmtu) {
				qp->s_state = OP(SEND_FIRST);
				len = pmtu;
				break;
			}
			if (wqe->wr.opcode == IB_WR_SEND) {
				qp->s_state = OP(SEND_ONLY);
			} else if (wqe->wr.opcode == IB_WR_SEND_WITH_IMM) {
				qp->s_state = OP(SEND_ONLY_WITH_IMMEDIATE);
				/* Immediate data comes after the BTH */
				ohdr->u.imm_data = wqe->wr.ex.imm_data;
				hwords += 1;
			} else {
				qp->s_state = OP(SEND_ONLY_WITH_INVALIDATE);
				/* Invalidate rkey comes after the BTH */
				ohdr->u.ieth = cpu_to_be32(
						wqe->wr.ex.invalidate_rkey);
				hwords += 1;
			}
			if (wqe->wr.send_flags & IB_SEND_SOLICITED)
				bth0 |= IB_BTH_SOLICITED;
			bth2 |= IB_BTH_REQ_ACK;
			if (++qp->s_cur == qp->s_size)
				qp->s_cur = 0;
			break;

		case IB_WR_RDMA_WRITE:
			/* No credit check here; s_lsn advances for new requests. */
			if (newreq && !(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
				qp->s_lsn++;
			goto no_flow_control;
		case IB_WR_RDMA_WRITE_WITH_IMM:
			/* If no credit, return. */
			if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT) &&
			    rvt_cmp_msn(wqe->ssn, qp->s_lsn + 1) > 0) {
				qp->s_flags |= RVT_S_WAIT_SSN_CREDIT;
				goto bail;
			}
no_flow_control:
			put_ib_reth_vaddr(
				wqe->rdma_wr.remote_addr,
				&ohdr->u.rc.reth);
			ohdr->u.rc.reth.rkey =
				cpu_to_be32(wqe->rdma_wr.rkey);
			ohdr->u.rc.reth.length = cpu_to_be32(len);
			hwords += sizeof(struct ib_reth) / sizeof(u32);
			if (len > pmtu) {
				qp->s_state = OP(RDMA_WRITE_FIRST);
				len = pmtu;
				break;
			}
			if (wqe->wr.opcode == IB_WR_RDMA_WRITE) {
				qp->s_state = OP(RDMA_WRITE_ONLY);
			} else {
				qp->s_state =
					OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE);
				/* Immediate data comes after RETH */
				ohdr->u.rc.imm_data = wqe->wr.ex.imm_data;
				hwords += 1;
				if (wqe->wr.send_flags & IB_SEND_SOLICITED)
					bth0 |= IB_BTH_SOLICITED;
			}
			bth2 |= IB_BTH_REQ_ACK;
			if (++qp->s_cur == qp->s_size)
				qp->s_cur = 0;
			break;

		case IB_WR_RDMA_READ:
			/*
			 * Don't allow more operations to be started
			 * than the QP limits allow.
			 */
			if (newreq) {
				if (qp->s_num_rd_atomic >=
				    qp->s_max_rd_atomic) {
					qp->s_flags |= RVT_S_WAIT_RDMAR;
					goto bail;
				}
				qp->s_num_rd_atomic++;
				if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
					qp->s_lsn++;
			}
			put_ib_reth_vaddr(
				wqe->rdma_wr.remote_addr,
				&ohdr->u.rc.reth);
			ohdr->u.rc.reth.rkey =
				cpu_to_be32(wqe->rdma_wr.rkey);
			ohdr->u.rc.reth.length = cpu_to_be32(len);
			qp->s_state = OP(RDMA_READ_REQUEST);
			hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32);
			/* The request itself carries no payload. */
			ss = NULL;
			len = 0;
			bth2 |= IB_BTH_REQ_ACK;
			if (++qp->s_cur == qp->s_size)
				qp->s_cur = 0;
			break;

		case IB_WR_ATOMIC_CMP_AND_SWP:
		case IB_WR_ATOMIC_FETCH_AND_ADD:
			/*
			 * Don't allow more operations to be started
			 * than the QP limits allow.
			 */
			if (newreq) {
				if (qp->s_num_rd_atomic >=
				    qp->s_max_rd_atomic) {
					qp->s_flags |= RVT_S_WAIT_RDMAR;
					goto bail;
				}
				qp->s_num_rd_atomic++;
				if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
					qp->s_lsn++;
			}
			if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP) {
				qp->s_state = OP(COMPARE_SWAP);
				put_ib_ateth_swap(wqe->atomic_wr.swap,
						  &ohdr->u.atomic_eth);
				put_ib_ateth_compare(wqe->atomic_wr.compare_add,
						     &ohdr->u.atomic_eth);
			} else {
				/* The add operand travels in the swap field. */
				qp->s_state = OP(FETCH_ADD);
				put_ib_ateth_swap(wqe->atomic_wr.compare_add,
						  &ohdr->u.atomic_eth);
				put_ib_ateth_compare(0, &ohdr->u.atomic_eth);
			}
			put_ib_ateth_vaddr(wqe->atomic_wr.remote_addr,
					   &ohdr->u.atomic_eth);
			ohdr->u.atomic_eth.rkey = cpu_to_be32(
				wqe->atomic_wr.rkey);
			hwords += sizeof(struct ib_atomic_eth) / sizeof(u32);
			ss = NULL;
			len = 0;
			bth2 |= IB_BTH_REQ_ACK;
			if (++qp->s_cur == qp->s_size)
				qp->s_cur = 0;
			break;

		default:
			goto bail;
		}
		/* Load the QP's SGE state from the (unmodified) WQE. */
		qp->s_sge.sge = wqe->sg_list[0];
		qp->s_sge.sg_list = wqe->sg_list + 1;
		qp->s_sge.num_sge = wqe->wr.num_sge;
		qp->s_sge.total_len = wqe->length;
		qp->s_len = wqe->length;
		if (newreq) {
			qp->s_tail++;
			if (qp->s_tail >= qp->s_size)
				qp->s_tail = 0;
		}
		/* A read request jumps s_psn past the request's last PSN. */
		if (wqe->wr.opcode == IB_WR_RDMA_READ)
			qp->s_psn = wqe->lpsn + 1;
		else
			qp->s_psn++;
		break;

	case OP(RDMA_READ_RESPONSE_FIRST):
		/*
		 * qp->s_state is normally set to the opcode of the
		 * last packet constructed for new requests and therefore
		 * is never set to RDMA read response.
		 * RDMA_READ_RESPONSE_FIRST is used by the ACK processing
		 * thread to indicate a SEND needs to be restarted from an
		 * earlier PSN without interfering with the sending thread.
		 * See restart_rc().
		 */
		qp->s_len = restart_sge(&qp->s_sge, wqe, qp->s_psn, pmtu);
		/* FALLTHROUGH */
	case OP(SEND_FIRST):
		qp->s_state = OP(SEND_MIDDLE);
		/* FALLTHROUGH */
	case OP(SEND_MIDDLE):
		bth2 = mask_psn(qp->s_psn++);
		ss = &qp->s_sge;
		len = qp->s_len;
		if (len > pmtu) {
			len = pmtu;
			middle = HFI1_CAP_IS_KSET(SDMA_AHG);
			break;
		}
		/* Remaining data fits: emit the matching LAST opcode. */
		if (wqe->wr.opcode == IB_WR_SEND) {
			qp->s_state = OP(SEND_LAST);
		} else if (wqe->wr.opcode == IB_WR_SEND_WITH_IMM) {
			qp->s_state = OP(SEND_LAST_WITH_IMMEDIATE);
			/* Immediate data comes after the BTH */
			ohdr->u.imm_data = wqe->wr.ex.imm_data;
			hwords += 1;
		} else {
			qp->s_state = OP(SEND_LAST_WITH_INVALIDATE);
			/* invalidate data comes after the BTH */
			ohdr->u.ieth = cpu_to_be32(wqe->wr.ex.invalidate_rkey);
			hwords += 1;
		}
		if (wqe->wr.send_flags & IB_SEND_SOLICITED)
			bth0 |= IB_BTH_SOLICITED;
		bth2 |= IB_BTH_REQ_ACK;
		qp->s_cur++;
		if (qp->s_cur >= qp->s_size)
			qp->s_cur = 0;
		break;

	case OP(RDMA_READ_RESPONSE_LAST):
		/*
		 * qp->s_state is normally set to the opcode of the
		 * last packet constructed for new requests and therefore
		 * is never set to RDMA read response.
		 * RDMA_READ_RESPONSE_LAST is used by the ACK processing
		 * thread to indicate a RDMA write needs to be restarted from
		 * an earlier PSN without interfering with the sending thread.
		 * See restart_rc().
		 */
		qp->s_len = restart_sge(&qp->s_sge, wqe, qp->s_psn, pmtu);
		/* FALLTHROUGH */
	case OP(RDMA_WRITE_FIRST):
		qp->s_state = OP(RDMA_WRITE_MIDDLE);
		/* FALLTHROUGH */
	case OP(RDMA_WRITE_MIDDLE):
		bth2 = mask_psn(qp->s_psn++);
		ss = &qp->s_sge;
		len = qp->s_len;
		if (len > pmtu) {
			len = pmtu;
			middle = HFI1_CAP_IS_KSET(SDMA_AHG);
			break;
		}
		if (wqe->wr.opcode == IB_WR_RDMA_WRITE) {
			qp->s_state = OP(RDMA_WRITE_LAST);
		} else {
			qp->s_state = OP(RDMA_WRITE_LAST_WITH_IMMEDIATE);
			/* Immediate data comes after the BTH */
			ohdr->u.imm_data = wqe->wr.ex.imm_data;
			hwords += 1;
			if (wqe->wr.send_flags & IB_SEND_SOLICITED)
				bth0 |= IB_BTH_SOLICITED;
		}
		bth2 |= IB_BTH_REQ_ACK;
		qp->s_cur++;
		if (qp->s_cur >= qp->s_size)
			qp->s_cur = 0;
		break;

	case OP(RDMA_READ_RESPONSE_MIDDLE):
		/*
		 * qp->s_state is normally set to the opcode of the
		 * last packet constructed for new requests and therefore
		 * is never set to RDMA read response.
		 * RDMA_READ_RESPONSE_MIDDLE is used by the ACK processing
		 * thread to indicate a RDMA read needs to be restarted from
		 * an earlier PSN without interfering with the sending thread.
		 * See restart_rc().
		 */
		/*
		 * Re-issue the read for the part not yet received: offset
		 * the remote address and shrink the length by the bytes
		 * that precede s_psn.
		 */
		len = (delta_psn(qp->s_psn, wqe->psn)) * pmtu;
		put_ib_reth_vaddr(
			wqe->rdma_wr.remote_addr + len,
			&ohdr->u.rc.reth);
		ohdr->u.rc.reth.rkey =
			cpu_to_be32(wqe->rdma_wr.rkey);
		ohdr->u.rc.reth.length = cpu_to_be32(wqe->length - len);
		qp->s_state = OP(RDMA_READ_REQUEST);
		hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32);
		bth2 = mask_psn(qp->s_psn) | IB_BTH_REQ_ACK;
		qp->s_psn = wqe->lpsn + 1;
		ss = NULL;
		len = 0;
		qp->s_cur++;
		if (qp->s_cur == qp->s_size)
			qp->s_cur = 0;
		break;
	}
	/* Common tail for every packet built above. */
	qp->s_sending_hpsn = bth2;
	delta = delta_psn(bth2, wqe->psn);
	/* Request an ACK every HFI1_PSN_CREDIT packets within a request. */
	if (delta && delta % HFI1_PSN_CREDIT == 0)
		bth2 |= IB_BTH_REQ_ACK;
	/* RVT_S_SEND_ONE: send this packet, then wait for its ACK. */
	if (qp->s_flags & RVT_S_SEND_ONE) {
		qp->s_flags &= ~RVT_S_SEND_ONE;
		qp->s_flags |= RVT_S_WAIT_ACK;
		bth2 |= IB_BTH_REQ_ACK;
	}
	qp->s_len -= len;
	qp->s_hdrwords = hwords;
	ps->s_txreq->sde = priv->s_sde;
	ps->s_txreq->ss = ss;
	ps->s_txreq->s_cur_size = len;
	hfi1_make_ruc_header(
		qp,
		ohdr,
		bth0 | (qp->s_state << 24),
		bth2,
		middle,
		ps);
	/* pbc */
	ps->s_txreq->hdr_dwords = qp->s_hdrwords + 2;
	return 1;

done_free_tx:
	/* A WQE was completed without a packet; report progress anyway. */
	hfi1_put_txreq(ps->s_txreq);
	ps->s_txreq = NULL;
	return 1;

bail:
	hfi1_put_txreq(ps->s_txreq);

bail_no_tx:
	/* Nothing to send: mark the send engine idle for this QP. */
	ps->s_txreq = NULL;
	qp->s_flags &= ~RVT_S_BUSY;
	qp->s_hdrwords = 0;
	return 0;
}
722 
723 static inline void hfi1_make_bth_aeth(struct rvt_qp *qp,
724 				      struct ib_other_headers *ohdr,
725 				      u32 bth0, u32 bth1)
726 {
727 	if (qp->r_nak_state)
728 		ohdr->u.aeth = cpu_to_be32((qp->r_msn & IB_MSN_MASK) |
729 					    (qp->r_nak_state <<
730 					     IB_AETH_CREDIT_SHIFT));
731 	else
732 		ohdr->u.aeth = rvt_compute_aeth(qp);
733 
734 	ohdr->bth[0] = cpu_to_be32(bth0);
735 	ohdr->bth[1] = cpu_to_be32(bth1 | qp->remote_qpn);
736 	ohdr->bth[2] = cpu_to_be32(mask_psn(qp->r_ack_psn));
737 }
738 
739 static inline void hfi1_queue_rc_ack(struct rvt_qp *qp, bool is_fecn)
740 {
741 	struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
742 	unsigned long flags;
743 
744 	spin_lock_irqsave(&qp->s_lock, flags);
745 	if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK))
746 		goto unlock;
747 	this_cpu_inc(*ibp->rvp.rc_qacks);
748 	qp->s_flags |= RVT_S_ACK_PENDING | RVT_S_RESP_PENDING;
749 	qp->s_nak_state = qp->r_nak_state;
750 	qp->s_ack_psn = qp->r_ack_psn;
751 	if (is_fecn)
752 		qp->s_flags |= RVT_S_ECN;
753 
754 	/* Schedule the send tasklet. */
755 	hfi1_schedule_send(qp);
756 unlock:
757 	spin_unlock_irqrestore(&qp->s_lock, flags);
758 }
759 
760 static inline void hfi1_make_rc_ack_9B(struct rvt_qp *qp,
761 				       struct hfi1_opa_header *opa_hdr,
762 				       u8 sc5, bool is_fecn,
763 				       u64 *pbc_flags, u32 *hwords,
764 				       u32 *nwords)
765 {
766 	struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
767 	struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
768 	struct ib_header *hdr = &opa_hdr->ibh;
769 	struct ib_other_headers *ohdr;
770 	u16 lrh0 = HFI1_LRH_BTH;
771 	u16 pkey;
772 	u32 bth0, bth1;
773 
774 	opa_hdr->hdr_type = HFI1_PKT_TYPE_9B;
775 	ohdr = &hdr->u.oth;
776 	/* header size in 32-bit words LRH+BTH+AETH = (8+12+4)/4 */
777 	*hwords = 6;
778 
779 	if (unlikely(rdma_ah_get_ah_flags(&qp->remote_ah_attr) & IB_AH_GRH)) {
780 		*hwords += hfi1_make_grh(ibp, &hdr->u.l.grh,
781 					 rdma_ah_read_grh(&qp->remote_ah_attr),
782 					 *hwords - 2, SIZE_OF_CRC);
783 		ohdr = &hdr->u.l.oth;
784 		lrh0 = HFI1_LRH_GRH;
785 	}
786 	/* set PBC_DC_INFO bit (aka SC[4]) in pbc_flags */
787 	*pbc_flags |= ((!!(sc5 & 0x10)) << PBC_DC_INFO_SHIFT);
788 
789 	/* read pkey_index w/o lock (its atomic) */
790 	pkey = hfi1_get_pkey(ibp, qp->s_pkey_index);
791 
792 	lrh0 |= (sc5 & IB_SC_MASK) << IB_SC_SHIFT |
793 		(rdma_ah_get_sl(&qp->remote_ah_attr) & IB_SL_MASK) <<
794 			IB_SL_SHIFT;
795 
796 	hfi1_make_ib_hdr(hdr, lrh0, *hwords + SIZE_OF_CRC,
797 			 opa_get_lid(rdma_ah_get_dlid(&qp->remote_ah_attr), 9B),
798 			 ppd->lid | rdma_ah_get_path_bits(&qp->remote_ah_attr));
799 
800 	bth0 = pkey | (OP(ACKNOWLEDGE) << 24);
801 	if (qp->s_mig_state == IB_MIG_MIGRATED)
802 		bth0 |= IB_BTH_MIG_REQ;
803 	bth1 = (!!is_fecn) << IB_BECN_SHIFT;
804 	hfi1_make_bth_aeth(qp, ohdr, bth0, bth1);
805 }
806 
807 static inline void hfi1_make_rc_ack_16B(struct rvt_qp *qp,
808 					struct hfi1_opa_header *opa_hdr,
809 					u8 sc5, bool is_fecn,
810 					u64 *pbc_flags, u32 *hwords,
811 					u32 *nwords)
812 {
813 	struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
814 	struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
815 	struct hfi1_16b_header *hdr = &opa_hdr->opah;
816 	struct ib_other_headers *ohdr;
817 	u32 bth0, bth1 = 0;
818 	u16 len, pkey;
819 	u8 becn = !!is_fecn;
820 	u8 l4 = OPA_16B_L4_IB_LOCAL;
821 	u8 extra_bytes;
822 
823 	opa_hdr->hdr_type = HFI1_PKT_TYPE_16B;
824 	ohdr = &hdr->u.oth;
825 	/* header size in 32-bit words 16B LRH+BTH+AETH = (16+12+4)/4 */
826 	*hwords = 8;
827 	extra_bytes = hfi1_get_16b_padding(*hwords << 2, 0);
828 	*nwords = SIZE_OF_CRC + ((extra_bytes + SIZE_OF_LT) >> 2);
829 
830 	if (unlikely(rdma_ah_get_ah_flags(&qp->remote_ah_attr) & IB_AH_GRH) &&
831 	    hfi1_check_mcast(rdma_ah_get_dlid(&qp->remote_ah_attr))) {
832 		*hwords += hfi1_make_grh(ibp, &hdr->u.l.grh,
833 					 rdma_ah_read_grh(&qp->remote_ah_attr),
834 					 *hwords - 4, *nwords);
835 		ohdr = &hdr->u.l.oth;
836 		l4 = OPA_16B_L4_IB_GLOBAL;
837 	}
838 	*pbc_flags |= PBC_PACKET_BYPASS | PBC_INSERT_BYPASS_ICRC;
839 
840 	/* read pkey_index w/o lock (its atomic) */
841 	pkey = hfi1_get_pkey(ibp, qp->s_pkey_index);
842 
843 	/* Convert dwords to flits */
844 	len = (*hwords + *nwords) >> 1;
845 
846 	hfi1_make_16b_hdr(hdr,
847 			  ppd->lid | rdma_ah_get_path_bits(&qp->remote_ah_attr),
848 			  opa_get_lid(rdma_ah_get_dlid(&qp->remote_ah_attr),
849 				      16B),
850 			  len, pkey, becn, 0, l4, sc5);
851 
852 	bth0 = pkey | (OP(ACKNOWLEDGE) << 24);
853 	bth0 |= extra_bytes << 20;
854 	if (qp->s_mig_state == IB_MIG_MIGRATED)
855 		bth1 = OPA_BTH_MIG_REQ;
856 	hfi1_make_bth_aeth(qp, ohdr, bth0, bth1);
857 }
858 
859 typedef void (*hfi1_make_rc_ack)(struct rvt_qp *qp,
860 				 struct hfi1_opa_header *opa_hdr,
861 				 u8 sc5, bool is_fecn,
862 				 u64 *pbc_flags, u32 *hwords,
863 				 u32 *nwords);
864 
865 /* We support only two types - 9B and 16B for now */
866 static const hfi1_make_rc_ack hfi1_make_rc_ack_tbl[2] = {
867 	[HFI1_PKT_TYPE_9B] = &hfi1_make_rc_ack_9B,
868 	[HFI1_PKT_TYPE_16B] = &hfi1_make_rc_ack_16B
869 };
870 
871 /**
872  * hfi1_send_rc_ack - Construct an ACK packet and send it
873  * @qp: a pointer to the QP
874  *
875  * This is called from hfi1_rc_rcv() and handle_receive_interrupt().
876  * Note that RDMA reads and atomics are handled in the
877  * send side QP state and send engine.
878  */
879 void hfi1_send_rc_ack(struct hfi1_ctxtdata *rcd,
880 		      struct rvt_qp *qp, bool is_fecn)
881 {
882 	struct hfi1_ibport *ibp = rcd_to_iport(rcd);
883 	struct hfi1_qp_priv *priv = qp->priv;
884 	struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
885 	u8 sc5 = ibp->sl_to_sc[rdma_ah_get_sl(&qp->remote_ah_attr)];
886 	u64 pbc, pbc_flags = 0;
887 	u32 hwords = 0;
888 	u32 nwords = 0;
889 	u32 plen;
890 	struct pio_buf *pbuf;
891 	struct hfi1_opa_header opa_hdr;
892 
893 	/* clear the defer count */
894 	qp->r_adefered = 0;
895 
896 	/* Don't send ACK or NAK if a RDMA read or atomic is pending. */
897 	if (qp->s_flags & RVT_S_RESP_PENDING) {
898 		hfi1_queue_rc_ack(qp, is_fecn);
899 		return;
900 	}
901 
902 	/* Ensure s_rdma_ack_cnt changes are committed */
903 	smp_read_barrier_depends();
904 	if (qp->s_rdma_ack_cnt) {
905 		hfi1_queue_rc_ack(qp, is_fecn);
906 		return;
907 	}
908 
909 	/* Don't try to send ACKs if the link isn't ACTIVE */
910 	if (driver_lstate(ppd) != IB_PORT_ACTIVE)
911 		return;
912 
913 	/* Make the appropriate header */
914 	hfi1_make_rc_ack_tbl[priv->hdr_type](qp, &opa_hdr, sc5, is_fecn,
915 					     &pbc_flags, &hwords, &nwords);
916 
917 	plen = 2 /* PBC */ + hwords + nwords;
918 	pbc = create_pbc(ppd, pbc_flags, qp->srate_mbps,
919 			 sc_to_vlt(ppd->dd, sc5), plen);
920 	pbuf = sc_buffer_alloc(rcd->sc, plen, NULL, NULL);
921 	if (!pbuf) {
922 		/*
923 		 * We have no room to send at the moment.  Pass
924 		 * responsibility for sending the ACK to the send engine
925 		 * so that when enough buffer space becomes available,
926 		 * the ACK is sent ahead of other outgoing packets.
927 		 */
928 		hfi1_queue_rc_ack(qp, is_fecn);
929 		return;
930 	}
931 	trace_ack_output_ibhdr(dd_from_ibdev(qp->ibqp.device),
932 			       &opa_hdr, ib_is_sc5(sc5));
933 
934 	/* write the pbc and data */
935 	ppd->dd->pio_inline_send(ppd->dd, pbuf, pbc,
936 				 (priv->hdr_type == HFI1_PKT_TYPE_9B ?
937 				 (void *)&opa_hdr.ibh :
938 				 (void *)&opa_hdr.opah), hwords);
939 	return;
940 }
941 
942 /**
943  * reset_psn - reset the QP state to send starting from PSN
944  * @qp: the QP
945  * @psn: the packet sequence number to restart at
946  *
947  * This is called from hfi1_rc_rcv() to process an incoming RC ACK
948  * for the given QP.
949  * Called at interrupt level with the QP s_lock held.
950  */
951 static void reset_psn(struct rvt_qp *qp, u32 psn)
952 {
953 	u32 n = qp->s_acked;
954 	struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, n);
955 	u32 opcode;
956 
957 	lockdep_assert_held(&qp->s_lock);
958 	qp->s_cur = n;
959 
960 	/*
961 	 * If we are starting the request from the beginning,
962 	 * let the normal send code handle initialization.
963 	 */
964 	if (cmp_psn(psn, wqe->psn) <= 0) {
965 		qp->s_state = OP(SEND_LAST);
966 		goto done;
967 	}
968 
969 	/* Find the work request opcode corresponding to the given PSN. */
970 	opcode = wqe->wr.opcode;
971 	for (;;) {
972 		int diff;
973 
974 		if (++n == qp->s_size)
975 			n = 0;
976 		if (n == qp->s_tail)
977 			break;
978 		wqe = rvt_get_swqe_ptr(qp, n);
979 		diff = cmp_psn(psn, wqe->psn);
980 		if (diff < 0)
981 			break;
982 		qp->s_cur = n;
983 		/*
984 		 * If we are starting the request from the beginning,
985 		 * let the normal send code handle initialization.
986 		 */
987 		if (diff == 0) {
988 			qp->s_state = OP(SEND_LAST);
989 			goto done;
990 		}
991 		opcode = wqe->wr.opcode;
992 	}
993 
994 	/*
995 	 * Set the state to restart in the middle of a request.
996 	 * Don't change the s_sge, s_cur_sge, or s_cur_size.
997 	 * See hfi1_make_rc_req().
998 	 */
999 	switch (opcode) {
1000 	case IB_WR_SEND:
1001 	case IB_WR_SEND_WITH_IMM:
1002 		qp->s_state = OP(RDMA_READ_RESPONSE_FIRST);
1003 		break;
1004 
1005 	case IB_WR_RDMA_WRITE:
1006 	case IB_WR_RDMA_WRITE_WITH_IMM:
1007 		qp->s_state = OP(RDMA_READ_RESPONSE_LAST);
1008 		break;
1009 
1010 	case IB_WR_RDMA_READ:
1011 		qp->s_state = OP(RDMA_READ_RESPONSE_MIDDLE);
1012 		break;
1013 
1014 	default:
1015 		/*
1016 		 * This case shouldn't happen since its only
1017 		 * one PSN per req.
1018 		 */
1019 		qp->s_state = OP(SEND_LAST);
1020 	}
1021 done:
1022 	qp->s_psn = psn;
1023 	/*
1024 	 * Set RVT_S_WAIT_PSN as rc_complete() may start the timer
1025 	 * asynchronously before the send engine can get scheduled.
1026 	 * Doing it in hfi1_make_rc_req() is too late.
1027 	 */
1028 	if ((cmp_psn(qp->s_psn, qp->s_sending_hpsn) <= 0) &&
1029 	    (cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) <= 0))
1030 		qp->s_flags |= RVT_S_WAIT_PSN;
1031 	qp->s_flags &= ~RVT_S_AHG_VALID;
1032 }
1033 
1034 /*
1035  * Back up requester to resend the last un-ACKed request.
1036  * The QP r_lock and s_lock should be held and interrupts disabled.
1037  */
1038 void hfi1_restart_rc(struct rvt_qp *qp, u32 psn, int wait)
1039 {
1040 	struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
1041 	struct hfi1_ibport *ibp;
1042 
1043 	lockdep_assert_held(&qp->r_lock);
1044 	lockdep_assert_held(&qp->s_lock);
1045 	if (qp->s_retry == 0) {
1046 		if (qp->s_mig_state == IB_MIG_ARMED) {
1047 			hfi1_migrate_qp(qp);
1048 			qp->s_retry = qp->s_retry_cnt;
1049 		} else if (qp->s_last == qp->s_acked) {
1050 			hfi1_send_complete(qp, wqe, IB_WC_RETRY_EXC_ERR);
1051 			rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
1052 			return;
1053 		} else { /* need to handle delayed completion */
1054 			return;
1055 		}
1056 	} else {
1057 		qp->s_retry--;
1058 	}
1059 
1060 	ibp = to_iport(qp->ibqp.device, qp->port_num);
1061 	if (wqe->wr.opcode == IB_WR_RDMA_READ)
1062 		ibp->rvp.n_rc_resends++;
1063 	else
1064 		ibp->rvp.n_rc_resends += delta_psn(qp->s_psn, psn);
1065 
1066 	qp->s_flags &= ~(RVT_S_WAIT_FENCE | RVT_S_WAIT_RDMAR |
1067 			 RVT_S_WAIT_SSN_CREDIT | RVT_S_WAIT_PSN |
1068 			 RVT_S_WAIT_ACK);
1069 	if (wait)
1070 		qp->s_flags |= RVT_S_SEND_ONE;
1071 	reset_psn(qp, psn);
1072 }
1073 
1074 /*
1075  * Set qp->s_sending_psn to the next PSN after the given one.
1076  * This would be psn+1 except when RDMA reads are present.
1077  */
1078 static void reset_sending_psn(struct rvt_qp *qp, u32 psn)
1079 {
1080 	struct rvt_swqe *wqe;
1081 	u32 n = qp->s_last;
1082 
1083 	lockdep_assert_held(&qp->s_lock);
1084 	/* Find the work request corresponding to the given PSN. */
1085 	for (;;) {
1086 		wqe = rvt_get_swqe_ptr(qp, n);
1087 		if (cmp_psn(psn, wqe->lpsn) <= 0) {
1088 			if (wqe->wr.opcode == IB_WR_RDMA_READ)
1089 				qp->s_sending_psn = wqe->lpsn + 1;
1090 			else
1091 				qp->s_sending_psn = psn + 1;
1092 			break;
1093 		}
1094 		if (++n == qp->s_size)
1095 			n = 0;
1096 		if (n == qp->s_tail)
1097 			break;
1098 	}
1099 }
1100 
1101 /*
1102  * This should be called with the QP s_lock held and interrupts disabled.
1103  */
1104 void hfi1_rc_send_complete(struct rvt_qp *qp, struct hfi1_opa_header *opah)
1105 {
1106 	struct ib_other_headers *ohdr;
1107 	struct hfi1_qp_priv *priv = qp->priv;
1108 	struct rvt_swqe *wqe;
1109 	struct ib_header *hdr = NULL;
1110 	struct hfi1_16b_header *hdr_16b = NULL;
1111 	u32 opcode;
1112 	u32 psn;
1113 
1114 	lockdep_assert_held(&qp->s_lock);
1115 	if (!(ib_rvt_state_ops[qp->state] & RVT_SEND_OR_FLUSH_OR_RECV_OK))
1116 		return;
1117 
1118 	/* Find out where the BTH is */
1119 	if (priv->hdr_type == HFI1_PKT_TYPE_9B) {
1120 		hdr = &opah->ibh;
1121 		if (ib_get_lnh(hdr) == HFI1_LRH_BTH)
1122 			ohdr = &hdr->u.oth;
1123 		else
1124 			ohdr = &hdr->u.l.oth;
1125 	} else {
1126 		u8 l4;
1127 
1128 		hdr_16b = &opah->opah;
1129 		l4  = hfi1_16B_get_l4(hdr_16b);
1130 		if (l4 == OPA_16B_L4_IB_LOCAL)
1131 			ohdr = &hdr_16b->u.oth;
1132 		else
1133 			ohdr = &hdr_16b->u.l.oth;
1134 	}
1135 
1136 	opcode = ib_bth_get_opcode(ohdr);
1137 	if (opcode >= OP(RDMA_READ_RESPONSE_FIRST) &&
1138 	    opcode <= OP(ATOMIC_ACKNOWLEDGE)) {
1139 		WARN_ON(!qp->s_rdma_ack_cnt);
1140 		qp->s_rdma_ack_cnt--;
1141 		return;
1142 	}
1143 
1144 	psn = ib_bth_get_psn(ohdr);
1145 	reset_sending_psn(qp, psn);
1146 
1147 	/*
1148 	 * Start timer after a packet requesting an ACK has been sent and
1149 	 * there are still requests that haven't been acked.
1150 	 */
1151 	if ((psn & IB_BTH_REQ_ACK) && qp->s_acked != qp->s_tail &&
1152 	    !(qp->s_flags &
1153 		(RVT_S_TIMER | RVT_S_WAIT_RNR | RVT_S_WAIT_PSN)) &&
1154 		(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK))
1155 		rvt_add_retry_timer(qp);
1156 
1157 	while (qp->s_last != qp->s_acked) {
1158 		u32 s_last;
1159 
1160 		wqe = rvt_get_swqe_ptr(qp, qp->s_last);
1161 		if (cmp_psn(wqe->lpsn, qp->s_sending_psn) >= 0 &&
1162 		    cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) <= 0)
1163 			break;
1164 		s_last = qp->s_last;
1165 		trace_hfi1_qp_send_completion(qp, wqe, s_last);
1166 		if (++s_last >= qp->s_size)
1167 			s_last = 0;
1168 		qp->s_last = s_last;
1169 		/* see post_send() */
1170 		barrier();
1171 		rvt_put_swqe(wqe);
1172 		rvt_qp_swqe_complete(qp,
1173 				     wqe,
1174 				     ib_hfi1_wc_opcode[wqe->wr.opcode],
1175 				     IB_WC_SUCCESS);
1176 	}
1177 	/*
1178 	 * If we were waiting for sends to complete before re-sending,
1179 	 * and they are now complete, restart sending.
1180 	 */
1181 	trace_hfi1_sendcomplete(qp, psn);
1182 	if (qp->s_flags & RVT_S_WAIT_PSN &&
1183 	    cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) > 0) {
1184 		qp->s_flags &= ~RVT_S_WAIT_PSN;
1185 		qp->s_sending_psn = qp->s_psn;
1186 		qp->s_sending_hpsn = qp->s_psn - 1;
1187 		hfi1_schedule_send(qp);
1188 	}
1189 }
1190 
1191 static inline void update_last_psn(struct rvt_qp *qp, u32 psn)
1192 {
1193 	qp->s_last_psn = psn;
1194 }
1195 
1196 /*
1197  * Generate a SWQE completion.
1198  * This is similar to hfi1_send_complete but has to check to be sure
1199  * that the SGEs are not being referenced if the SWQE is being resent.
1200  */
1201 static struct rvt_swqe *do_rc_completion(struct rvt_qp *qp,
1202 					 struct rvt_swqe *wqe,
1203 					 struct hfi1_ibport *ibp)
1204 {
1205 	lockdep_assert_held(&qp->s_lock);
1206 	/*
1207 	 * Don't decrement refcount and don't generate a
1208 	 * completion if the SWQE is being resent until the send
1209 	 * is finished.
1210 	 */
1211 	if (cmp_psn(wqe->lpsn, qp->s_sending_psn) < 0 ||
1212 	    cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) > 0) {
1213 		u32 s_last;
1214 
1215 		rvt_put_swqe(wqe);
1216 		s_last = qp->s_last;
1217 		trace_hfi1_qp_send_completion(qp, wqe, s_last);
1218 		if (++s_last >= qp->s_size)
1219 			s_last = 0;
1220 		qp->s_last = s_last;
1221 		/* see post_send() */
1222 		barrier();
1223 		rvt_qp_swqe_complete(qp,
1224 				     wqe,
1225 				     ib_hfi1_wc_opcode[wqe->wr.opcode],
1226 				     IB_WC_SUCCESS);
1227 	} else {
1228 		struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
1229 
1230 		this_cpu_inc(*ibp->rvp.rc_delayed_comp);
1231 		/*
1232 		 * If send progress not running attempt to progress
1233 		 * SDMA queue.
1234 		 */
1235 		if (ppd->dd->flags & HFI1_HAS_SEND_DMA) {
1236 			struct sdma_engine *engine;
1237 			u8 sl = rdma_ah_get_sl(&qp->remote_ah_attr);
1238 			u8 sc5;
1239 
1240 			/* For now use sc to find engine */
1241 			sc5 = ibp->sl_to_sc[sl];
1242 			engine = qp_to_sdma_engine(qp, sc5);
1243 			sdma_engine_progress_schedule(engine);
1244 		}
1245 	}
1246 
1247 	qp->s_retry = qp->s_retry_cnt;
1248 	update_last_psn(qp, wqe->lpsn);
1249 
1250 	/*
1251 	 * If we are completing a request which is in the process of
1252 	 * being resent, we can stop re-sending it since we know the
1253 	 * responder has already seen it.
1254 	 */
1255 	if (qp->s_acked == qp->s_cur) {
1256 		if (++qp->s_cur >= qp->s_size)
1257 			qp->s_cur = 0;
1258 		qp->s_acked = qp->s_cur;
1259 		wqe = rvt_get_swqe_ptr(qp, qp->s_cur);
1260 		if (qp->s_acked != qp->s_tail) {
1261 			qp->s_state = OP(SEND_LAST);
1262 			qp->s_psn = wqe->psn;
1263 		}
1264 	} else {
1265 		if (++qp->s_acked >= qp->s_size)
1266 			qp->s_acked = 0;
1267 		if (qp->state == IB_QPS_SQD && qp->s_acked == qp->s_cur)
1268 			qp->s_draining = 0;
1269 		wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
1270 	}
1271 	return wqe;
1272 }
1273 
1274 /**
1275  * do_rc_ack - process an incoming RC ACK
1276  * @qp: the QP the ACK came in on
1277  * @psn: the packet sequence number of the ACK
1278  * @opcode: the opcode of the request that resulted in the ACK
1279  *
1280  * This is called from rc_rcv_resp() to process an incoming RC ACK
1281  * for the given QP.
1282  * May be called at interrupt level, with the QP s_lock held.
1283  * Returns 1 if OK, 0 if current operation should be aborted (NAK).
1284  */
1285 static int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode,
1286 		     u64 val, struct hfi1_ctxtdata *rcd)
1287 {
1288 	struct hfi1_ibport *ibp;
1289 	enum ib_wc_status status;
1290 	struct rvt_swqe *wqe;
1291 	int ret = 0;
1292 	u32 ack_psn;
1293 	int diff;
1294 
1295 	lockdep_assert_held(&qp->s_lock);
1296 	/*
1297 	 * Note that NAKs implicitly ACK outstanding SEND and RDMA write
1298 	 * requests and implicitly NAK RDMA read and atomic requests issued
1299 	 * before the NAK'ed request.  The MSN won't include the NAK'ed
1300 	 * request but will include an ACK'ed request(s).
1301 	 */
1302 	ack_psn = psn;
1303 	if (aeth >> IB_AETH_NAK_SHIFT)
1304 		ack_psn--;
1305 	wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
1306 	ibp = rcd_to_iport(rcd);
1307 
1308 	/*
1309 	 * The MSN might be for a later WQE than the PSN indicates so
1310 	 * only complete WQEs that the PSN finishes.
1311 	 */
1312 	while ((diff = delta_psn(ack_psn, wqe->lpsn)) >= 0) {
1313 		/*
1314 		 * RDMA_READ_RESPONSE_ONLY is a special case since
1315 		 * we want to generate completion events for everything
1316 		 * before the RDMA read, copy the data, then generate
1317 		 * the completion for the read.
1318 		 */
1319 		if (wqe->wr.opcode == IB_WR_RDMA_READ &&
1320 		    opcode == OP(RDMA_READ_RESPONSE_ONLY) &&
1321 		    diff == 0) {
1322 			ret = 1;
1323 			goto bail_stop;
1324 		}
1325 		/*
1326 		 * If this request is a RDMA read or atomic, and the ACK is
1327 		 * for a later operation, this ACK NAKs the RDMA read or
1328 		 * atomic.  In other words, only a RDMA_READ_LAST or ONLY
1329 		 * can ACK a RDMA read and likewise for atomic ops.  Note
1330 		 * that the NAK case can only happen if relaxed ordering is
1331 		 * used and requests are sent after an RDMA read or atomic
1332 		 * is sent but before the response is received.
1333 		 */
1334 		if ((wqe->wr.opcode == IB_WR_RDMA_READ &&
1335 		     (opcode != OP(RDMA_READ_RESPONSE_LAST) || diff != 0)) ||
1336 		    ((wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
1337 		      wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) &&
1338 		     (opcode != OP(ATOMIC_ACKNOWLEDGE) || diff != 0))) {
1339 			/* Retry this request. */
1340 			if (!(qp->r_flags & RVT_R_RDMAR_SEQ)) {
1341 				qp->r_flags |= RVT_R_RDMAR_SEQ;
1342 				hfi1_restart_rc(qp, qp->s_last_psn + 1, 0);
1343 				if (list_empty(&qp->rspwait)) {
1344 					qp->r_flags |= RVT_R_RSP_SEND;
1345 					rvt_get_qp(qp);
1346 					list_add_tail(&qp->rspwait,
1347 						      &rcd->qp_wait_list);
1348 				}
1349 			}
1350 			/*
1351 			 * No need to process the ACK/NAK since we are
1352 			 * restarting an earlier request.
1353 			 */
1354 			goto bail_stop;
1355 		}
1356 		if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
1357 		    wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) {
1358 			u64 *vaddr = wqe->sg_list[0].vaddr;
1359 			*vaddr = val;
1360 		}
1361 		if (qp->s_num_rd_atomic &&
1362 		    (wqe->wr.opcode == IB_WR_RDMA_READ ||
1363 		     wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
1364 		     wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)) {
1365 			qp->s_num_rd_atomic--;
1366 			/* Restart sending task if fence is complete */
1367 			if ((qp->s_flags & RVT_S_WAIT_FENCE) &&
1368 			    !qp->s_num_rd_atomic) {
1369 				qp->s_flags &= ~(RVT_S_WAIT_FENCE |
1370 						 RVT_S_WAIT_ACK);
1371 				hfi1_schedule_send(qp);
1372 			} else if (qp->s_flags & RVT_S_WAIT_RDMAR) {
1373 				qp->s_flags &= ~(RVT_S_WAIT_RDMAR |
1374 						 RVT_S_WAIT_ACK);
1375 				hfi1_schedule_send(qp);
1376 			}
1377 		}
1378 		wqe = do_rc_completion(qp, wqe, ibp);
1379 		if (qp->s_acked == qp->s_tail)
1380 			break;
1381 	}
1382 
1383 	switch (aeth >> IB_AETH_NAK_SHIFT) {
1384 	case 0:         /* ACK */
1385 		this_cpu_inc(*ibp->rvp.rc_acks);
1386 		if (qp->s_acked != qp->s_tail) {
1387 			/*
1388 			 * We are expecting more ACKs so
1389 			 * mod the retry timer.
1390 			 */
1391 			rvt_mod_retry_timer(qp);
1392 			/*
1393 			 * We can stop re-sending the earlier packets and
1394 			 * continue with the next packet the receiver wants.
1395 			 */
1396 			if (cmp_psn(qp->s_psn, psn) <= 0)
1397 				reset_psn(qp, psn + 1);
1398 		} else {
1399 			/* No more acks - kill all timers */
1400 			rvt_stop_rc_timers(qp);
1401 			if (cmp_psn(qp->s_psn, psn) <= 0) {
1402 				qp->s_state = OP(SEND_LAST);
1403 				qp->s_psn = psn + 1;
1404 			}
1405 		}
1406 		if (qp->s_flags & RVT_S_WAIT_ACK) {
1407 			qp->s_flags &= ~RVT_S_WAIT_ACK;
1408 			hfi1_schedule_send(qp);
1409 		}
1410 		rvt_get_credit(qp, aeth);
1411 		qp->s_rnr_retry = qp->s_rnr_retry_cnt;
1412 		qp->s_retry = qp->s_retry_cnt;
1413 		update_last_psn(qp, psn);
1414 		return 1;
1415 
1416 	case 1:         /* RNR NAK */
1417 		ibp->rvp.n_rnr_naks++;
1418 		if (qp->s_acked == qp->s_tail)
1419 			goto bail_stop;
1420 		if (qp->s_flags & RVT_S_WAIT_RNR)
1421 			goto bail_stop;
1422 		if (qp->s_rnr_retry == 0) {
1423 			status = IB_WC_RNR_RETRY_EXC_ERR;
1424 			goto class_b;
1425 		}
1426 		if (qp->s_rnr_retry_cnt < 7)
1427 			qp->s_rnr_retry--;
1428 
1429 		/* The last valid PSN is the previous PSN. */
1430 		update_last_psn(qp, psn - 1);
1431 
1432 		ibp->rvp.n_rc_resends += delta_psn(qp->s_psn, psn);
1433 
1434 		reset_psn(qp, psn);
1435 
1436 		qp->s_flags &= ~(RVT_S_WAIT_SSN_CREDIT | RVT_S_WAIT_ACK);
1437 		rvt_stop_rc_timers(qp);
1438 		rvt_add_rnr_timer(qp, aeth);
1439 		return 0;
1440 
1441 	case 3:         /* NAK */
1442 		if (qp->s_acked == qp->s_tail)
1443 			goto bail_stop;
1444 		/* The last valid PSN is the previous PSN. */
1445 		update_last_psn(qp, psn - 1);
1446 		switch ((aeth >> IB_AETH_CREDIT_SHIFT) &
1447 			IB_AETH_CREDIT_MASK) {
1448 		case 0: /* PSN sequence error */
1449 			ibp->rvp.n_seq_naks++;
1450 			/*
1451 			 * Back up to the responder's expected PSN.
1452 			 * Note that we might get a NAK in the middle of an
1453 			 * RDMA READ response which terminates the RDMA
1454 			 * READ.
1455 			 */
1456 			hfi1_restart_rc(qp, psn, 0);
1457 			hfi1_schedule_send(qp);
1458 			break;
1459 
1460 		case 1: /* Invalid Request */
1461 			status = IB_WC_REM_INV_REQ_ERR;
1462 			ibp->rvp.n_other_naks++;
1463 			goto class_b;
1464 
1465 		case 2: /* Remote Access Error */
1466 			status = IB_WC_REM_ACCESS_ERR;
1467 			ibp->rvp.n_other_naks++;
1468 			goto class_b;
1469 
1470 		case 3: /* Remote Operation Error */
1471 			status = IB_WC_REM_OP_ERR;
1472 			ibp->rvp.n_other_naks++;
1473 class_b:
1474 			if (qp->s_last == qp->s_acked) {
1475 				hfi1_send_complete(qp, wqe, status);
1476 				rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
1477 			}
1478 			break;
1479 
1480 		default:
1481 			/* Ignore other reserved NAK error codes */
1482 			goto reserved;
1483 		}
1484 		qp->s_retry = qp->s_retry_cnt;
1485 		qp->s_rnr_retry = qp->s_rnr_retry_cnt;
1486 		goto bail_stop;
1487 
1488 	default:                /* 2: reserved */
1489 reserved:
1490 		/* Ignore reserved NAK codes. */
1491 		goto bail_stop;
1492 	}
1493 	/* cannot be reached  */
1494 bail_stop:
1495 	rvt_stop_rc_timers(qp);
1496 	return ret;
1497 }
1498 
1499 /*
1500  * We have seen an out of sequence RDMA read middle or last packet.
1501  * This ACKs SENDs and RDMA writes up to the first RDMA read or atomic SWQE.
1502  */
1503 static void rdma_seq_err(struct rvt_qp *qp, struct hfi1_ibport *ibp, u32 psn,
1504 			 struct hfi1_ctxtdata *rcd)
1505 {
1506 	struct rvt_swqe *wqe;
1507 
1508 	lockdep_assert_held(&qp->s_lock);
1509 	/* Remove QP from retry timer */
1510 	rvt_stop_rc_timers(qp);
1511 
1512 	wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
1513 
1514 	while (cmp_psn(psn, wqe->lpsn) > 0) {
1515 		if (wqe->wr.opcode == IB_WR_RDMA_READ ||
1516 		    wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
1517 		    wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)
1518 			break;
1519 		wqe = do_rc_completion(qp, wqe, ibp);
1520 	}
1521 
1522 	ibp->rvp.n_rdma_seq++;
1523 	qp->r_flags |= RVT_R_RDMAR_SEQ;
1524 	hfi1_restart_rc(qp, qp->s_last_psn + 1, 0);
1525 	if (list_empty(&qp->rspwait)) {
1526 		qp->r_flags |= RVT_R_RSP_SEND;
1527 		rvt_get_qp(qp);
1528 		list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
1529 	}
1530 }
1531 
1532 /**
1533  * rc_rcv_resp - process an incoming RC response packet
1534  * @packet: data packet information
1535  *
1536  * This is called from hfi1_rc_rcv() to process an incoming RC response
1537  * packet for the given QP.
1538  * Called at interrupt level.
1539  */
1540 static void rc_rcv_resp(struct hfi1_packet *packet)
1541 {
1542 	struct hfi1_ctxtdata *rcd = packet->rcd;
1543 	void *data = packet->payload;
1544 	u32 tlen = packet->tlen;
1545 	struct rvt_qp *qp = packet->qp;
1546 	struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
1547 	struct ib_other_headers *ohdr = packet->ohdr;
1548 	struct rvt_swqe *wqe;
1549 	enum ib_wc_status status;
1550 	unsigned long flags;
1551 	int diff;
1552 	u64 val;
1553 	u32 aeth;
1554 	u32 psn = ib_bth_get_psn(packet->ohdr);
1555 	u32 pmtu = qp->pmtu;
1556 	u16 hdrsize = packet->hlen;
1557 	u8 opcode = packet->opcode;
1558 	u8 pad = packet->pad;
1559 	u8 extra_bytes = pad + packet->extra_byte + (SIZE_OF_CRC << 2);
1560 
1561 	spin_lock_irqsave(&qp->s_lock, flags);
1562 	trace_hfi1_ack(qp, psn);
1563 
1564 	/* Ignore invalid responses. */
1565 	smp_read_barrier_depends(); /* see post_one_send */
1566 	if (cmp_psn(psn, READ_ONCE(qp->s_next_psn)) >= 0)
1567 		goto ack_done;
1568 
1569 	/* Ignore duplicate responses. */
1570 	diff = cmp_psn(psn, qp->s_last_psn);
1571 	if (unlikely(diff <= 0)) {
1572 		/* Update credits for "ghost" ACKs */
1573 		if (diff == 0 && opcode == OP(ACKNOWLEDGE)) {
1574 			aeth = be32_to_cpu(ohdr->u.aeth);
1575 			if ((aeth >> IB_AETH_NAK_SHIFT) == 0)
1576 				rvt_get_credit(qp, aeth);
1577 		}
1578 		goto ack_done;
1579 	}
1580 
1581 	/*
1582 	 * Skip everything other than the PSN we expect, if we are waiting
1583 	 * for a reply to a restarted RDMA read or atomic op.
1584 	 */
1585 	if (qp->r_flags & RVT_R_RDMAR_SEQ) {
1586 		if (cmp_psn(psn, qp->s_last_psn + 1) != 0)
1587 			goto ack_done;
1588 		qp->r_flags &= ~RVT_R_RDMAR_SEQ;
1589 	}
1590 
1591 	if (unlikely(qp->s_acked == qp->s_tail))
1592 		goto ack_done;
1593 	wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
1594 	status = IB_WC_SUCCESS;
1595 
1596 	switch (opcode) {
1597 	case OP(ACKNOWLEDGE):
1598 	case OP(ATOMIC_ACKNOWLEDGE):
1599 	case OP(RDMA_READ_RESPONSE_FIRST):
1600 		aeth = be32_to_cpu(ohdr->u.aeth);
1601 		if (opcode == OP(ATOMIC_ACKNOWLEDGE))
1602 			val = ib_u64_get(&ohdr->u.at.atomic_ack_eth);
1603 		else
1604 			val = 0;
1605 		if (!do_rc_ack(qp, aeth, psn, opcode, val, rcd) ||
1606 		    opcode != OP(RDMA_READ_RESPONSE_FIRST))
1607 			goto ack_done;
1608 		wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
1609 		if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
1610 			goto ack_op_err;
1611 		/*
1612 		 * If this is a response to a resent RDMA read, we
1613 		 * have to be careful to copy the data to the right
1614 		 * location.
1615 		 */
1616 		qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge,
1617 						  wqe, psn, pmtu);
1618 		goto read_middle;
1619 
1620 	case OP(RDMA_READ_RESPONSE_MIDDLE):
1621 		/* no AETH, no ACK */
1622 		if (unlikely(cmp_psn(psn, qp->s_last_psn + 1)))
1623 			goto ack_seq_err;
1624 		if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
1625 			goto ack_op_err;
1626 read_middle:
1627 		if (unlikely(tlen != (hdrsize + pmtu + extra_bytes)))
1628 			goto ack_len_err;
1629 		if (unlikely(pmtu >= qp->s_rdma_read_len))
1630 			goto ack_len_err;
1631 
1632 		/*
1633 		 * We got a response so update the timeout.
1634 		 * 4.096 usec. * (1 << qp->timeout)
1635 		 */
1636 		rvt_mod_retry_timer(qp);
1637 		if (qp->s_flags & RVT_S_WAIT_ACK) {
1638 			qp->s_flags &= ~RVT_S_WAIT_ACK;
1639 			hfi1_schedule_send(qp);
1640 		}
1641 
1642 		if (opcode == OP(RDMA_READ_RESPONSE_MIDDLE))
1643 			qp->s_retry = qp->s_retry_cnt;
1644 
1645 		/*
1646 		 * Update the RDMA receive state but do the copy w/o
1647 		 * holding the locks and blocking interrupts.
1648 		 */
1649 		qp->s_rdma_read_len -= pmtu;
1650 		update_last_psn(qp, psn);
1651 		spin_unlock_irqrestore(&qp->s_lock, flags);
1652 		hfi1_copy_sge(&qp->s_rdma_read_sge, data, pmtu, false, false);
1653 		goto bail;
1654 
1655 	case OP(RDMA_READ_RESPONSE_ONLY):
1656 		aeth = be32_to_cpu(ohdr->u.aeth);
1657 		if (!do_rc_ack(qp, aeth, psn, opcode, 0, rcd))
1658 			goto ack_done;
1659 		/*
1660 		 * Check that the data size is >= 0 && <= pmtu.
1661 		 * Remember to account for ICRC (4).
1662 		 */
1663 		if (unlikely(tlen < (hdrsize + extra_bytes)))
1664 			goto ack_len_err;
1665 		/*
1666 		 * If this is a response to a resent RDMA read, we
1667 		 * have to be careful to copy the data to the right
1668 		 * location.
1669 		 */
1670 		wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
1671 		qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge,
1672 						  wqe, psn, pmtu);
1673 		goto read_last;
1674 
1675 	case OP(RDMA_READ_RESPONSE_LAST):
1676 		/* ACKs READ req. */
1677 		if (unlikely(cmp_psn(psn, qp->s_last_psn + 1)))
1678 			goto ack_seq_err;
1679 		if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
1680 			goto ack_op_err;
1681 		/*
1682 		 * Check that the data size is >= 1 && <= pmtu.
1683 		 * Remember to account for ICRC (4).
1684 		 */
1685 		if (unlikely(tlen <= (hdrsize + extra_bytes)))
1686 			goto ack_len_err;
1687 read_last:
1688 		tlen -= hdrsize + extra_bytes;
1689 		if (unlikely(tlen != qp->s_rdma_read_len))
1690 			goto ack_len_err;
1691 		aeth = be32_to_cpu(ohdr->u.aeth);
1692 		hfi1_copy_sge(&qp->s_rdma_read_sge, data, tlen, false, false);
1693 		WARN_ON(qp->s_rdma_read_sge.num_sge);
1694 		(void)do_rc_ack(qp, aeth, psn,
1695 				 OP(RDMA_READ_RESPONSE_LAST), 0, rcd);
1696 		goto ack_done;
1697 	}
1698 
1699 ack_op_err:
1700 	status = IB_WC_LOC_QP_OP_ERR;
1701 	goto ack_err;
1702 
1703 ack_seq_err:
1704 	rdma_seq_err(qp, ibp, psn, rcd);
1705 	goto ack_done;
1706 
1707 ack_len_err:
1708 	status = IB_WC_LOC_LEN_ERR;
1709 ack_err:
1710 	if (qp->s_last == qp->s_acked) {
1711 		hfi1_send_complete(qp, wqe, status);
1712 		rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
1713 	}
1714 ack_done:
1715 	spin_unlock_irqrestore(&qp->s_lock, flags);
1716 bail:
1717 	return;
1718 }
1719 
1720 static inline void rc_defered_ack(struct hfi1_ctxtdata *rcd,
1721 				  struct rvt_qp *qp)
1722 {
1723 	if (list_empty(&qp->rspwait)) {
1724 		qp->r_flags |= RVT_R_RSP_NAK;
1725 		rvt_get_qp(qp);
1726 		list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
1727 	}
1728 }
1729 
1730 static inline void rc_cancel_ack(struct rvt_qp *qp)
1731 {
1732 	qp->r_adefered = 0;
1733 	if (list_empty(&qp->rspwait))
1734 		return;
1735 	list_del_init(&qp->rspwait);
1736 	qp->r_flags &= ~RVT_R_RSP_NAK;
1737 	rvt_put_qp(qp);
1738 }
1739 
1740 /**
1741  * rc_rcv_error - process an incoming duplicate or error RC packet
1742  * @ohdr: the other headers for this packet
1743  * @data: the packet data
1744  * @qp: the QP for this packet
1745  * @opcode: the opcode for this packet
1746  * @psn: the packet sequence number for this packet
1747  * @diff: the difference between the PSN and the expected PSN
1748  *
1749  * This is called from hfi1_rc_rcv() to process an unexpected
1750  * incoming RC packet for the given QP.
1751  * Called at interrupt level.
1752  * Return 1 if no more processing is needed; otherwise return 0 to
1753  * schedule a response to be sent.
1754  */
1755 static noinline int rc_rcv_error(struct ib_other_headers *ohdr, void *data,
1756 				 struct rvt_qp *qp, u32 opcode, u32 psn,
1757 				 int diff, struct hfi1_ctxtdata *rcd)
1758 {
1759 	struct hfi1_ibport *ibp = rcd_to_iport(rcd);
1760 	struct rvt_ack_entry *e;
1761 	unsigned long flags;
1762 	u8 i, prev;
1763 	int old_req;
1764 
1765 	trace_hfi1_rcv_error(qp, psn);
1766 	if (diff > 0) {
1767 		/*
1768 		 * Packet sequence error.
1769 		 * A NAK will ACK earlier sends and RDMA writes.
1770 		 * Don't queue the NAK if we already sent one.
1771 		 */
1772 		if (!qp->r_nak_state) {
1773 			ibp->rvp.n_rc_seqnak++;
1774 			qp->r_nak_state = IB_NAK_PSN_ERROR;
1775 			/* Use the expected PSN. */
1776 			qp->r_ack_psn = qp->r_psn;
1777 			/*
1778 			 * Wait to send the sequence NAK until all packets
1779 			 * in the receive queue have been processed.
1780 			 * Otherwise, we end up propagating congestion.
1781 			 */
1782 			rc_defered_ack(rcd, qp);
1783 		}
1784 		goto done;
1785 	}
1786 
1787 	/*
1788 	 * Handle a duplicate request.  Don't re-execute SEND, RDMA
1789 	 * write or atomic op.  Don't NAK errors, just silently drop
1790 	 * the duplicate request.  Note that r_sge, r_len, and
1791 	 * r_rcv_len may be in use so don't modify them.
1792 	 *
1793 	 * We are supposed to ACK the earliest duplicate PSN but we
1794 	 * can coalesce an outstanding duplicate ACK.  We have to
1795 	 * send the earliest so that RDMA reads can be restarted at
1796 	 * the requester's expected PSN.
1797 	 *
1798 	 * First, find where this duplicate PSN falls within the
1799 	 * ACKs previously sent.
1800 	 * old_req is true if there is an older response that is scheduled
1801 	 * to be sent before sending this one.
1802 	 */
1803 	e = NULL;
1804 	old_req = 1;
1805 	ibp->rvp.n_rc_dupreq++;
1806 
1807 	spin_lock_irqsave(&qp->s_lock, flags);
1808 
1809 	for (i = qp->r_head_ack_queue; ; i = prev) {
1810 		if (i == qp->s_tail_ack_queue)
1811 			old_req = 0;
1812 		if (i)
1813 			prev = i - 1;
1814 		else
1815 			prev = HFI1_MAX_RDMA_ATOMIC;
1816 		if (prev == qp->r_head_ack_queue) {
1817 			e = NULL;
1818 			break;
1819 		}
1820 		e = &qp->s_ack_queue[prev];
1821 		if (!e->opcode) {
1822 			e = NULL;
1823 			break;
1824 		}
1825 		if (cmp_psn(psn, e->psn) >= 0) {
1826 			if (prev == qp->s_tail_ack_queue &&
1827 			    cmp_psn(psn, e->lpsn) <= 0)
1828 				old_req = 0;
1829 			break;
1830 		}
1831 	}
1832 	switch (opcode) {
1833 	case OP(RDMA_READ_REQUEST): {
1834 		struct ib_reth *reth;
1835 		u32 offset;
1836 		u32 len;
1837 
1838 		/*
1839 		 * If we didn't find the RDMA read request in the ack queue,
1840 		 * we can ignore this request.
1841 		 */
1842 		if (!e || e->opcode != OP(RDMA_READ_REQUEST))
1843 			goto unlock_done;
1844 		/* RETH comes after BTH */
1845 		reth = &ohdr->u.rc.reth;
1846 		/*
1847 		 * Address range must be a subset of the original
1848 		 * request and start on pmtu boundaries.
1849 		 * We reuse the old ack_queue slot since the requester
1850 		 * should not back up and request an earlier PSN for the
1851 		 * same request.
1852 		 */
1853 		offset = delta_psn(psn, e->psn) * qp->pmtu;
1854 		len = be32_to_cpu(reth->length);
1855 		if (unlikely(offset + len != e->rdma_sge.sge_length))
1856 			goto unlock_done;
1857 		if (e->rdma_sge.mr) {
1858 			rvt_put_mr(e->rdma_sge.mr);
1859 			e->rdma_sge.mr = NULL;
1860 		}
1861 		if (len != 0) {
1862 			u32 rkey = be32_to_cpu(reth->rkey);
1863 			u64 vaddr = get_ib_reth_vaddr(reth);
1864 			int ok;
1865 
1866 			ok = rvt_rkey_ok(qp, &e->rdma_sge, len, vaddr, rkey,
1867 					 IB_ACCESS_REMOTE_READ);
1868 			if (unlikely(!ok))
1869 				goto unlock_done;
1870 		} else {
1871 			e->rdma_sge.vaddr = NULL;
1872 			e->rdma_sge.length = 0;
1873 			e->rdma_sge.sge_length = 0;
1874 		}
1875 		e->psn = psn;
1876 		if (old_req)
1877 			goto unlock_done;
1878 		qp->s_tail_ack_queue = prev;
1879 		break;
1880 	}
1881 
1882 	case OP(COMPARE_SWAP):
1883 	case OP(FETCH_ADD): {
1884 		/*
1885 		 * If we didn't find the atomic request in the ack queue
1886 		 * or the send engine is already backed up to send an
1887 		 * earlier entry, we can ignore this request.
1888 		 */
1889 		if (!e || e->opcode != (u8)opcode || old_req)
1890 			goto unlock_done;
1891 		qp->s_tail_ack_queue = prev;
1892 		break;
1893 	}
1894 
1895 	default:
1896 		/*
1897 		 * Ignore this operation if it doesn't request an ACK
1898 		 * or an earlier RDMA read or atomic is going to be resent.
1899 		 */
1900 		if (!(psn & IB_BTH_REQ_ACK) || old_req)
1901 			goto unlock_done;
1902 		/*
1903 		 * Resend the most recent ACK if this request is
1904 		 * after all the previous RDMA reads and atomics.
1905 		 */
1906 		if (i == qp->r_head_ack_queue) {
1907 			spin_unlock_irqrestore(&qp->s_lock, flags);
1908 			qp->r_nak_state = 0;
1909 			qp->r_ack_psn = qp->r_psn - 1;
1910 			goto send_ack;
1911 		}
1912 
1913 		/*
1914 		 * Resend the RDMA read or atomic op which
1915 		 * ACKs this duplicate request.
1916 		 */
1917 		qp->s_tail_ack_queue = i;
1918 		break;
1919 	}
1920 	qp->s_ack_state = OP(ACKNOWLEDGE);
1921 	qp->s_flags |= RVT_S_RESP_PENDING;
1922 	qp->r_nak_state = 0;
1923 	hfi1_schedule_send(qp);
1924 
1925 unlock_done:
1926 	spin_unlock_irqrestore(&qp->s_lock, flags);
1927 done:
1928 	return 1;
1929 
1930 send_ack:
1931 	return 0;
1932 }
1933 
1934 static inline void update_ack_queue(struct rvt_qp *qp, unsigned n)
1935 {
1936 	unsigned next;
1937 
1938 	next = n + 1;
1939 	if (next > HFI1_MAX_RDMA_ATOMIC)
1940 		next = 0;
1941 	qp->s_tail_ack_queue = next;
1942 	qp->s_ack_state = OP(ACKNOWLEDGE);
1943 }
1944 
1945 static void log_cca_event(struct hfi1_pportdata *ppd, u8 sl, u32 rlid,
1946 			  u32 lqpn, u32 rqpn, u8 svc_type)
1947 {
1948 	struct opa_hfi1_cong_log_event_internal *cc_event;
1949 	unsigned long flags;
1950 
1951 	if (sl >= OPA_MAX_SLS)
1952 		return;
1953 
1954 	spin_lock_irqsave(&ppd->cc_log_lock, flags);
1955 
1956 	ppd->threshold_cong_event_map[sl / 8] |= 1 << (sl % 8);
1957 	ppd->threshold_event_counter++;
1958 
1959 	cc_event = &ppd->cc_events[ppd->cc_log_idx++];
1960 	if (ppd->cc_log_idx == OPA_CONG_LOG_ELEMS)
1961 		ppd->cc_log_idx = 0;
1962 	cc_event->lqpn = lqpn & RVT_QPN_MASK;
1963 	cc_event->rqpn = rqpn & RVT_QPN_MASK;
1964 	cc_event->sl = sl;
1965 	cc_event->svc_type = svc_type;
1966 	cc_event->rlid = rlid;
1967 	/* keep timestamp in units of 1.024 usec */
1968 	cc_event->timestamp = ktime_get_ns() / 1024;
1969 
1970 	spin_unlock_irqrestore(&ppd->cc_log_lock, flags);
1971 }
1972 
1973 void process_becn(struct hfi1_pportdata *ppd, u8 sl, u32 rlid, u32 lqpn,
1974 		  u32 rqpn, u8 svc_type)
1975 {
1976 	struct cca_timer *cca_timer;
1977 	u16 ccti, ccti_incr, ccti_timer, ccti_limit;
1978 	u8 trigger_threshold;
1979 	struct cc_state *cc_state;
1980 	unsigned long flags;
1981 
1982 	if (sl >= OPA_MAX_SLS)
1983 		return;
1984 
1985 	cc_state = get_cc_state(ppd);
1986 
1987 	if (!cc_state)
1988 		return;
1989 
1990 	/*
1991 	 * 1) increase CCTI (for this SL)
1992 	 * 2) select IPG (i.e., call set_link_ipg())
1993 	 * 3) start timer
1994 	 */
1995 	ccti_limit = cc_state->cct.ccti_limit;
1996 	ccti_incr = cc_state->cong_setting.entries[sl].ccti_increase;
1997 	ccti_timer = cc_state->cong_setting.entries[sl].ccti_timer;
1998 	trigger_threshold =
1999 		cc_state->cong_setting.entries[sl].trigger_threshold;
2000 
2001 	spin_lock_irqsave(&ppd->cca_timer_lock, flags);
2002 
2003 	cca_timer = &ppd->cca_timer[sl];
2004 	if (cca_timer->ccti < ccti_limit) {
2005 		if (cca_timer->ccti + ccti_incr <= ccti_limit)
2006 			cca_timer->ccti += ccti_incr;
2007 		else
2008 			cca_timer->ccti = ccti_limit;
2009 		set_link_ipg(ppd);
2010 	}
2011 
2012 	ccti = cca_timer->ccti;
2013 
2014 	if (!hrtimer_active(&cca_timer->hrtimer)) {
2015 		/* ccti_timer is in units of 1.024 usec */
2016 		unsigned long nsec = 1024 * ccti_timer;
2017 
2018 		hrtimer_start(&cca_timer->hrtimer, ns_to_ktime(nsec),
2019 			      HRTIMER_MODE_REL);
2020 	}
2021 
2022 	spin_unlock_irqrestore(&ppd->cca_timer_lock, flags);
2023 
2024 	if ((trigger_threshold != 0) && (ccti >= trigger_threshold))
2025 		log_cca_event(ppd, sl, rlid, lqpn, rqpn, svc_type);
2026 }
2027 
2028 /**
2029  * hfi1_rc_rcv - process an incoming RC packet
2030  * @packet: data packet information
2031  *
2032  * This is called from qp_rcv() to process an incoming RC packet
2033  * for the given QP.
2034  * May be called at interrupt level.
 *
 * The caller must hold qp->r_lock (asserted via lockdep).
 *
 * Processing order:
 *  1. hfi1_ruc_check_hdr() vets the header; on a non-zero result this
 *     function returns immediately.
 *  2. The return value of process_ecn() is kept in is_fecn.  When it is
 *     set, the response, read/atomic and ACK-requested paths send an
 *     ACK right away instead of returning or deferring.
 *  3. Response opcodes (RDMA_READ_RESPONSE_FIRST through
 *     ATOMIC_ACKNOWLEDGE) are handed to rc_rcv_resp().
 *  4. A PSN that differs from qp->r_psn is handed to rc_rcv_error().
 *  5. The opcode is checked against qp->r_state for sequence errors.
 *  6. The request is executed per opcode; failures branch to the
 *     rnr_nak/nack_* labels at the end of the function.
2035  */
2036 void hfi1_rc_rcv(struct hfi1_packet *packet)
2037 {
2038 	struct hfi1_ctxtdata *rcd = packet->rcd;
2039 	void *data = packet->payload;
2040 	u32 tlen = packet->tlen;
2041 	struct rvt_qp *qp = packet->qp;
2042 	struct hfi1_ibport *ibp = rcd_to_iport(rcd);
2043 	struct ib_other_headers *ohdr = packet->ohdr;
2044 	u32 bth0 = be32_to_cpu(ohdr->bth[0]);
2045 	u32 opcode = packet->opcode;
2046 	u32 hdrsize = packet->hlen;
2047 	u32 psn = ib_bth_get_psn(packet->ohdr);
2048 	u32 pad = packet->pad;
2049 	struct ib_wc wc;
2050 	u32 pmtu = qp->pmtu;
2051 	int diff;
2052 	struct ib_reth *reth;
2053 	unsigned long flags;
2054 	int ret;
2055 	bool is_fecn = false;
2056 	bool copy_last = false;
2057 	u32 rkey;
	/* Bytes counted in tlen that are neither header nor payload. */
2058 	u8 extra_bytes = pad + packet->extra_byte + (SIZE_OF_CRC << 2);
2059 
2060 	lockdep_assert_held(&qp->r_lock);
2061 
2062 	if (hfi1_ruc_check_hdr(ibp, packet))
2063 		return;
2064 
2065 	is_fecn = process_ecn(qp, packet, false);
2066 
2067 	/*
2068 	 * Process responses (ACKs) before anything else.  Note that the
2069 	 * packet sequence number will be for something in the send work
2070 	 * queue rather than the expected receive packet sequence number.
2071 	 * In other words, this QP is the requester.
2072 	 */
2073 	if (opcode >= OP(RDMA_READ_RESPONSE_FIRST) &&
2074 	    opcode <= OP(ATOMIC_ACKNOWLEDGE)) {
2075 		rc_rcv_resp(packet);
2076 		if (is_fecn)
2077 			goto send_ack;
2078 		return;
2079 	}
2080 
2081 	/* Compute 24 bits worth of difference. */
2082 	diff = delta_psn(psn, qp->r_psn);
2083 	if (unlikely(diff)) {
		/*
		 * Not the expected PSN.  A non-zero return from
		 * rc_rcv_error() means nothing more is needed here;
		 * zero means an ACK/NAK must go out now.
		 */
2084 		if (rc_rcv_error(ohdr, data, qp, opcode, psn, diff, rcd))
2085 			return;
2086 		goto send_ack;
2087 	}
2088 
2089 	/* Check for opcode sequence errors. */
2090 	switch (qp->r_state) {
2091 	case OP(SEND_FIRST):
2092 	case OP(SEND_MIDDLE):
		/* Inside a multi-packet SEND: only a continuation is valid. */
2093 		if (opcode == OP(SEND_MIDDLE) ||
2094 		    opcode == OP(SEND_LAST) ||
2095 		    opcode == OP(SEND_LAST_WITH_IMMEDIATE) ||
2096 		    opcode == OP(SEND_LAST_WITH_INVALIDATE))
2097 			break;
2098 		goto nack_inv;
2099 
2100 	case OP(RDMA_WRITE_FIRST):
2101 	case OP(RDMA_WRITE_MIDDLE):
		/* Inside a multi-packet RDMA WRITE: only a continuation is valid. */
2102 		if (opcode == OP(RDMA_WRITE_MIDDLE) ||
2103 		    opcode == OP(RDMA_WRITE_LAST) ||
2104 		    opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
2105 			break;
2106 		goto nack_inv;
2107 
2108 	default:
		/* No message in progress: a MIDDLE/LAST packet is out of sequence. */
2109 		if (opcode == OP(SEND_MIDDLE) ||
2110 		    opcode == OP(SEND_LAST) ||
2111 		    opcode == OP(SEND_LAST_WITH_IMMEDIATE) ||
2112 		    opcode == OP(SEND_LAST_WITH_INVALIDATE) ||
2113 		    opcode == OP(RDMA_WRITE_MIDDLE) ||
2114 		    opcode == OP(RDMA_WRITE_LAST) ||
2115 		    opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
2116 			goto nack_inv;
2117 		/*
2118 		 * Note that it is up to the requester to not send a new
2119 		 * RDMA read or atomic operation before receiving an ACK
2120 		 * for the previous operation.
2121 		 */
2122 		break;
2123 	}
2124 
	/* First in-sequence request while still in RTR: report it via rvt_comm_est(). */
2125 	if (qp->state == IB_QPS_RTR && !(qp->r_flags & RVT_R_COMM_EST))
2126 		rvt_comm_est(qp);
2127 
2128 	/* OK, process the packet. */
2129 	switch (opcode) {
2130 	case OP(SEND_FIRST):
		/*
		 * Fetch a receive WQE: a negative return is an operational
		 * error, zero leads to an RNR NAK (presumably no RWQE was
		 * available -- confirm against hfi1_rvt_get_rwqe()).
		 */
2131 		ret = hfi1_rvt_get_rwqe(qp, 0);
2132 		if (ret < 0)
2133 			goto nack_op_err;
2134 		if (!ret)
2135 			goto rnr_nak;
2136 		qp->r_rcv_len = 0;
2137 		/* FALLTHROUGH */
2138 	case OP(SEND_MIDDLE):
2139 	case OP(RDMA_WRITE_MIDDLE):
		/* Also entered by goto from RDMA_WRITE_FIRST once r_sge is set up. */
2140 send_middle:
2141 		/* Check for invalid length PMTU or posted rwqe len. */
2142 		/*
2143 		 * There will be no padding for 9B packet but 16B packets
2144 		 * will come in with some padding since we always add
2145 		 * CRC and LT bytes which will need to be flit aligned
2146 		 */
		/* FIRST/MIDDLE packets must carry exactly one PMTU of payload. */
2147 		if (unlikely(tlen != (hdrsize + pmtu + extra_bytes)))
2148 			goto nack_inv;
2149 		qp->r_rcv_len += pmtu;
2150 		if (unlikely(qp->r_rcv_len > qp->r_len))
2151 			goto nack_inv;
2152 		hfi1_copy_sge(&qp->r_sge, data, pmtu, true, false);
2153 		break;
2154 
2155 	case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE):
2156 		/* consume RWQE */
2157 		ret = hfi1_rvt_get_rwqe(qp, 1);
2158 		if (ret < 0)
2159 			goto nack_op_err;
2160 		if (!ret)
2161 			goto rnr_nak;
2162 		goto send_last_imm;
2163 
2164 	case OP(SEND_ONLY):
2165 	case OP(SEND_ONLY_WITH_IMMEDIATE):
2166 	case OP(SEND_ONLY_WITH_INVALIDATE):
2167 		ret = hfi1_rvt_get_rwqe(qp, 0);
2168 		if (ret < 0)
2169 			goto nack_op_err;
2170 		if (!ret)
2171 			goto rnr_nak;
2172 		qp->r_rcv_len = 0;
		/* Join the matching LAST variant for the rest of the work. */
2173 		if (opcode == OP(SEND_ONLY))
2174 			goto no_immediate_data;
2175 		if (opcode == OP(SEND_ONLY_WITH_INVALIDATE))
2176 			goto send_last_inv;
2177 		/* FALLTHROUGH -- for SEND_ONLY_WITH_IMMEDIATE */
2178 	case OP(SEND_LAST_WITH_IMMEDIATE):
2179 send_last_imm:
2180 		wc.ex.imm_data = ohdr->u.imm_data;
2181 		wc.wc_flags = IB_WC_WITH_IMM;
2182 		goto send_last;
2183 	case OP(SEND_LAST_WITH_INVALIDATE):
2184 send_last_inv:
2185 		rkey = be32_to_cpu(ohdr->u.ieth);
		/*
		 * When rvt_invalidate_rkey() returns non-zero the message is
		 * completed as a plain send, without the invalidate flag.
		 */
2186 		if (rvt_invalidate_rkey(qp, rkey))
2187 			goto no_immediate_data;
2188 		wc.ex.invalidate_rkey = rkey;
2189 		wc.wc_flags = IB_WC_WITH_INVALIDATE;
2190 		goto send_last;
2191 	case OP(RDMA_WRITE_LAST):
		/* copy_last is handed to hfi1_copy_sge() for the final copy below. */
2192 		copy_last = rvt_is_user_qp(qp);
2193 		/* fall through */
2194 	case OP(SEND_LAST):
2195 no_immediate_data:
2196 		wc.wc_flags = 0;
2197 		wc.ex.imm_data = 0;
2198 send_last:
2199 		/* Check for invalid length. */
		/*
		 * The packet must be long enough to hold the header plus the
		 * trailing bytes; a zero-length payload is accepted.
		 */
2201 		if (unlikely(tlen < (hdrsize + extra_bytes)))
2202 			goto nack_inv;
2203 		/* Don't count the CRC(and padding and LT byte for 16B). */
2204 		tlen -= (hdrsize + extra_bytes);
		/* Total message length = bytes from earlier packets + this payload. */
2205 		wc.byte_len = tlen + qp->r_rcv_len;
2206 		if (unlikely(wc.byte_len > qp->r_len))
2207 			goto nack_inv;
2208 		hfi1_copy_sge(&qp->r_sge, data, tlen, true, copy_last);
2209 		rvt_put_ss(&qp->r_sge);
2210 		qp->r_msn++;
		/* No completion is generated unless RVT_R_WRID_VALID was set. */
2211 		if (!__test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags))
2212 			break;
2213 		wc.wr_id = qp->r_wr_id;
2214 		wc.status = IB_WC_SUCCESS;
2215 		if (opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE) ||
2216 		    opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE))
2217 			wc.opcode = IB_WC_RECV_RDMA_WITH_IMM;
2218 		else
2219 			wc.opcode = IB_WC_RECV;
2220 		wc.qp = &qp->ibqp;
2221 		wc.src_qp = qp->remote_qpn;
2222 		wc.slid = rdma_ah_get_dlid(&qp->remote_ah_attr) & U16_MAX;
2223 		/*
2224 		 * It seems that IB mandates the presence of an SL in a
2225 		 * work completion only for the UD transport (see section
2226 		 * 11.4.2 of IBTA Vol. 1).
2227 		 *
2228 		 * However, the way the SL is chosen below is consistent
2229 		 * with the way that IB/qib works and is trying avoid
2230 		 * introducing incompatibilities.
2231 		 *
2232 		 * See also OPA Vol. 1, section 9.7.6, and table 9-17.
2233 		 */
2234 		wc.sl = rdma_ah_get_sl(&qp->remote_ah_attr);
2235 		/* zero fields that are N/A */
2236 		wc.vendor_err = 0;
2237 		wc.pkey_index = 0;
2238 		wc.dlid_path_bits = 0;
2239 		wc.port_num = 0;
2240 		/* Signal completion event if the solicited bit is set. */
2241 		rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc,
2242 			     (bth0 & IB_BTH_SOLICITED) != 0);
2243 		break;
2244 
2245 	case OP(RDMA_WRITE_ONLY):
2246 		copy_last = rvt_is_user_qp(qp);
2247 		/* fall through */
2248 	case OP(RDMA_WRITE_FIRST):
2249 	case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE):
2250 		if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
2251 			goto nack_inv;
		/* Take the message length and target buffer from the RETH. */
2253 		reth = &ohdr->u.rc.reth;
2254 		qp->r_len = be32_to_cpu(reth->length);
2255 		qp->r_rcv_len = 0;
2256 		qp->r_sge.sg_list = NULL;
2257 		if (qp->r_len != 0) {
			/* NOTE(review): this rkey shadows the function-scope rkey. */
2258 			u32 rkey = be32_to_cpu(reth->rkey);
2259 			u64 vaddr = get_ib_reth_vaddr(reth);
2260 			int ok;
2261 
2262 			/* Check rkey & NAK */
2263 			ok = rvt_rkey_ok(qp, &qp->r_sge.sge, qp->r_len, vaddr,
2264 					 rkey, IB_ACCESS_REMOTE_WRITE);
2265 			if (unlikely(!ok))
2266 				goto nack_acc;
2267 			qp->r_sge.num_sge = 1;
2268 		} else {
			/* Zero-length write: no rkey check, leave r_sge empty. */
2269 			qp->r_sge.num_sge = 0;
2270 			qp->r_sge.sge.mr = NULL;
2271 			qp->r_sge.sge.vaddr = NULL;
2272 			qp->r_sge.sge.length = 0;
2273 			qp->r_sge.sge.sge_length = 0;
2274 		}
2275 		if (opcode == OP(RDMA_WRITE_FIRST))
2276 			goto send_middle;
2277 		else if (opcode == OP(RDMA_WRITE_ONLY))
2278 			goto no_immediate_data;
		/* RDMA_WRITE_ONLY_WITH_IMMEDIATE: a receive WQE is needed as well. */
2279 		ret = hfi1_rvt_get_rwqe(qp, 1);
2280 		if (ret < 0)
2281 			goto nack_op_err;
2282 		if (!ret) {
2283 			/* peer will send again */
			/* release r_sge, which was set up from the RETH above */
2284 			rvt_put_ss(&qp->r_sge);
2285 			goto rnr_nak;
2286 		}
2287 		wc.ex.imm_data = ohdr->u.rc.imm_data;
2288 		wc.wc_flags = IB_WC_WITH_IMM;
2289 		goto send_last;
2290 
2291 	case OP(RDMA_READ_REQUEST): {
2292 		struct rvt_ack_entry *e;
2293 		u32 len;
		/*
		 * NOTE(review): next is u8 while update_ack_queue() takes
		 * unsigned; assumes HFI1_MAX_RDMA_ATOMIC fits in a u8 --
		 * confirm.
		 */
2294 		u8 next;
2295 
2296 		if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ)))
2297 			goto nack_inv;
2298 		next = qp->r_head_ack_queue + 1;
2299 		/* s_ack_queue is size HFI1_MAX_RDMA_ATOMIC+1 so use > not >= */
2300 		if (next > HFI1_MAX_RDMA_ATOMIC)
2301 			next = 0;
2302 		spin_lock_irqsave(&qp->s_lock, flags);
		/*
		 * Head would catch up with tail: NAK unless the tail entry
		 * has already been sent, in which case the tail is advanced
		 * past it.
		 */
2303 		if (unlikely(next == qp->s_tail_ack_queue)) {
2304 			if (!qp->s_ack_queue[next].sent)
2305 				goto nack_inv_unlck;
2306 			update_ack_queue(qp, next);
2307 		}
2308 		e = &qp->s_ack_queue[qp->r_head_ack_queue];
		/* Drop the MR reference left in this slot by an earlier read. */
2309 		if (e->opcode == OP(RDMA_READ_REQUEST) && e->rdma_sge.mr) {
2310 			rvt_put_mr(e->rdma_sge.mr);
2311 			e->rdma_sge.mr = NULL;
2312 		}
2313 		reth = &ohdr->u.rc.reth;
2314 		len = be32_to_cpu(reth->length);
2315 		if (len) {
2316 			u32 rkey = be32_to_cpu(reth->rkey);
2317 			u64 vaddr = get_ib_reth_vaddr(reth);
2318 			int ok;
2319 
2320 			/* Check rkey & NAK */
2321 			ok = rvt_rkey_ok(qp, &e->rdma_sge, len, vaddr,
2322 					 rkey, IB_ACCESS_REMOTE_READ);
2323 			if (unlikely(!ok))
2324 				goto nack_acc_unlck;
2325 			/*
2326 			 * Update the next expected PSN.  We add 1 later
2327 			 * below, so only add the remainder here.
2328 			 */
2329 			qp->r_psn += rvt_div_mtu(qp, len - 1);
2330 		} else {
2331 			e->rdma_sge.mr = NULL;
2332 			e->rdma_sge.vaddr = NULL;
2333 			e->rdma_sge.length = 0;
2334 			e->rdma_sge.sge_length = 0;
2335 		}
		/* Record the request; lpsn is the PSN of the last response packet. */
2336 		e->opcode = opcode;
2337 		e->sent = 0;
2338 		e->psn = psn;
2339 		e->lpsn = qp->r_psn;
2340 		/*
2341 		 * We need to increment the MSN here instead of when we
2342 		 * finish sending the result since a duplicate request would
2343 		 * increment it more than once.
2344 		 */
2345 		qp->r_msn++;
2346 		qp->r_psn++;
2347 		qp->r_state = opcode;
2348 		qp->r_nak_state = 0;
2349 		qp->r_head_ack_queue = next;
2350 
2351 		/* Schedule the send engine. */
2352 		qp->s_flags |= RVT_S_RESP_PENDING;
2353 		hfi1_schedule_send(qp);
2354 
2355 		spin_unlock_irqrestore(&qp->s_lock, flags);
2356 		if (is_fecn)
2357 			goto send_ack;
2358 		return;
2359 	}
2360 
2361 	case OP(COMPARE_SWAP):
2362 	case OP(FETCH_ADD): {
2363 		struct ib_atomic_eth *ateth;
2364 		struct rvt_ack_entry *e;
2365 		u64 vaddr;
2366 		atomic64_t *maddr;
2367 		u64 sdata;
		/* NOTE(review): this rkey shadows the function-scope rkey. */
2368 		u32 rkey;
2369 		u8 next;
2370 
2371 		if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC)))
2372 			goto nack_inv;
		/* Same ack queue slot handling as for RDMA_READ_REQUEST. */
2373 		next = qp->r_head_ack_queue + 1;
2374 		if (next > HFI1_MAX_RDMA_ATOMIC)
2375 			next = 0;
2376 		spin_lock_irqsave(&qp->s_lock, flags);
2377 		if (unlikely(next == qp->s_tail_ack_queue)) {
2378 			if (!qp->s_ack_queue[next].sent)
2379 				goto nack_inv_unlck;
2380 			update_ack_queue(qp, next);
2381 		}
2382 		e = &qp->s_ack_queue[qp->r_head_ack_queue];
2383 		if (e->opcode == OP(RDMA_READ_REQUEST) && e->rdma_sge.mr) {
2384 			rvt_put_mr(e->rdma_sge.mr);
2385 			e->rdma_sge.mr = NULL;
2386 		}
2387 		ateth = &ohdr->u.atomic_eth;
2388 		vaddr = get_ib_ateth_vaddr(ateth);
		/* The target address must be aligned to sizeof(u64). */
2389 		if (unlikely(vaddr & (sizeof(u64) - 1)))
2390 			goto nack_inv_unlck;
2391 		rkey = be32_to_cpu(ateth->rkey);
2392 		/* Check rkey & NAK */
2393 		if (unlikely(!rvt_rkey_ok(qp, &qp->r_sge.sge, sizeof(u64),
2394 					  vaddr, rkey,
2395 					  IB_ACCESS_REMOTE_ATOMIC)))
2396 			goto nack_acc_unlck;
2397 		/* Perform atomic OP and save result. */
		/*
		 * For FETCH_ADD, subtracting the addend from the value
		 * returned by atomic64_add_return() gives the value before
		 * the add; for COMPARE_SWAP the cmpxchg() result is saved.
		 */
2398 		maddr = (atomic64_t *)qp->r_sge.sge.vaddr;
2399 		sdata = get_ib_ateth_swap(ateth);
2400 		e->atomic_data = (opcode == OP(FETCH_ADD)) ?
2401 			(u64)atomic64_add_return(sdata, maddr) - sdata :
2402 			(u64)cmpxchg((u64 *)qp->r_sge.sge.vaddr,
2403 				      get_ib_ateth_compare(ateth),
2404 				      sdata);
		/* Done with the target memory: drop the MR reference. */
2405 		rvt_put_mr(qp->r_sge.sge.mr);
2406 		qp->r_sge.num_sge = 0;
2407 		e->opcode = opcode;
2408 		e->sent = 0;
2409 		e->psn = psn;
2410 		e->lpsn = psn;
2411 		qp->r_msn++;
2412 		qp->r_psn++;
2413 		qp->r_state = opcode;
2414 		qp->r_nak_state = 0;
2415 		qp->r_head_ack_queue = next;
2416 
2417 		/* Schedule the send engine. */
2418 		qp->s_flags |= RVT_S_RESP_PENDING;
2419 		hfi1_schedule_send(qp);
2420 
2421 		spin_unlock_irqrestore(&qp->s_lock, flags);
2422 		if (is_fecn)
2423 			goto send_ack;
2424 		return;
2425 	}
2426 
2427 	default:
2428 		/* NAK unknown opcodes. */
2429 		goto nack_inv;
2430 	}
	/* Reached via "break" above: SEND and RDMA WRITE packets that were accepted. */
2431 	qp->r_psn++;
2432 	qp->r_state = opcode;
2433 	qp->r_ack_psn = psn;
2434 	qp->r_nak_state = 0;
2435 	/* Send an ACK if requested or required. */
2436 	if (psn & IB_BTH_REQ_ACK) {
		/*
		 * ACK immediately when packet->numpkt is 0, when
		 * HFI1_PSN_CREDIT or more ACKs are already deferred, or
		 * when FECN was seen; otherwise count and defer this one.
		 */
2437 		if (packet->numpkt == 0) {
2438 			rc_cancel_ack(qp);
2439 			goto send_ack;
2440 		}
2441 		if (qp->r_adefered >= HFI1_PSN_CREDIT) {
2442 			rc_cancel_ack(qp);
2443 			goto send_ack;
2444 		}
2445 		if (unlikely(is_fecn)) {
2446 			rc_cancel_ack(qp);
2447 			goto send_ack;
2448 		}
2449 		qp->r_adefered++;
2450 		rc_defered_ack(rcd, qp);
2451 	}
2452 	return;
2453 
	/* hfi1_rvt_get_rwqe() returned 0: answer with an RNR NAK. */
2454 rnr_nak:
2455 	qp->r_nak_state = qp->r_min_rnr_timer | IB_RNR_NAK;
2456 	qp->r_ack_psn = qp->r_psn;
2457 	/* Queue RNR NAK for later */
2458 	rc_defered_ack(rcd, qp);
2459 	return;
2460 
	/* hfi1_rvt_get_rwqe() failed: flag a QP error and NAK. */
2461 nack_op_err:
2462 	rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
2463 	qp->r_nak_state = IB_NAK_REMOTE_OPERATIONAL_ERROR;
2464 	qp->r_ack_psn = qp->r_psn;
2465 	/* Queue NAK for later */
2466 	rc_defered_ack(rcd, qp);
2467 	return;
2468 
	/* The *_unlck entry points are for paths that still hold qp->s_lock. */
2469 nack_inv_unlck:
2470 	spin_unlock_irqrestore(&qp->s_lock, flags);
2471 nack_inv:
2472 	rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
2473 	qp->r_nak_state = IB_NAK_INVALID_REQUEST;
2474 	qp->r_ack_psn = qp->r_psn;
2475 	/* Queue NAK for later */
2476 	rc_defered_ack(rcd, qp);
2477 	return;
2478 
2479 nack_acc_unlck:
2480 	spin_unlock_irqrestore(&qp->s_lock, flags);
2481 nack_acc:
2482 	rvt_rc_error(qp, IB_WC_LOC_PROT_ERR);
2483 	qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR;
2484 	qp->r_ack_psn = qp->r_psn;
	/* Unlike the NAKs above, this one is not deferred: fall into send_ack. */
2485 send_ack:
2486 	hfi1_send_rc_ack(rcd, qp, is_fecn);
2487 }
2488 
2489 void hfi1_rc_hdrerr(
2490 	struct hfi1_ctxtdata *rcd,
2491 	struct hfi1_packet *packet,
2492 	struct rvt_qp *qp)
2493 {
2494 	struct hfi1_ibport *ibp = rcd_to_iport(rcd);
2495 	int diff;
2496 	u32 opcode;
2497 	u32 psn;
2498 
2499 	if (hfi1_ruc_check_hdr(ibp, packet))
2500 		return;
2501 
2502 	psn = ib_bth_get_psn(packet->ohdr);
2503 	opcode = ib_bth_get_opcode(packet->ohdr);
2504 
2505 	/* Only deal with RDMA Writes for now */
2506 	if (opcode < IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST) {
2507 		diff = delta_psn(psn, qp->r_psn);
2508 		if (!qp->r_nak_state && diff >= 0) {
2509 			ibp->rvp.n_rc_seqnak++;
2510 			qp->r_nak_state = IB_NAK_PSN_ERROR;
2511 			/* Use the expected PSN. */
2512 			qp->r_ack_psn = qp->r_psn;
2513 			/*
2514 			 * Wait to send the sequence
2515 			 * NAK until all packets
2516 			 * in the receive queue have
2517 			 * been processed.
2518 			 * Otherwise, we end up
2519 			 * propagating congestion.
2520 			 */
2521 			rc_defered_ack(rcd, qp);
2522 		} /* Out of sequence NAK */
2523 	} /* QP Request NAKs */
2524 }
2525