xref: /openbmc/linux/drivers/infiniband/hw/qib/qib_rc.c (revision 68198dca)
1 /*
2  * Copyright (c) 2006, 2007, 2008, 2009 QLogic Corporation. All rights reserved.
3  * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
4  *
5  * This software is available to you under a choice of one of two
6  * licenses.  You may choose to be licensed under the terms of the GNU
7  * General Public License (GPL) Version 2, available from the file
8  * COPYING in the main directory of this source tree, or the
9  * OpenIB.org BSD license below:
10  *
11  *     Redistribution and use in source and binary forms, with or
12  *     without modification, are permitted provided that the following
13  *     conditions are met:
14  *
15  *      - Redistributions of source code must retain the above
16  *        copyright notice, this list of conditions and the following
17  *        disclaimer.
18  *
19  *      - Redistributions in binary form must reproduce the above
20  *        copyright notice, this list of conditions and the following
21  *        disclaimer in the documentation and/or other materials
22  *        provided with the distribution.
23  *
24  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
25  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
26  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
27  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
28  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
29  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
30  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
31  * SOFTWARE.
32  */
33 
34 #include <linux/io.h>
35 
36 #include "qib.h"
37 
38 /* cut down ridiculously long IB macro names */
39 #define OP(x) IB_OPCODE_RC_##x
40 
41 
42 static u32 restart_sge(struct rvt_sge_state *ss, struct rvt_swqe *wqe,
43 		       u32 psn, u32 pmtu)
44 {
45 	u32 len;
46 
47 	len = ((psn - wqe->psn) & QIB_PSN_MASK) * pmtu;
48 	ss->sge = wqe->sg_list[0];
49 	ss->sg_list = wqe->sg_list + 1;
50 	ss->num_sge = wqe->wr.num_sge;
51 	ss->total_len = wqe->length;
52 	rvt_skip_sge(ss, len, false);
53 	return wqe->length - len;
54 }
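
/*
 * Editor's sketch, not part of the upstream driver: restart_sge() above
 * resumes in the middle of a SEND or RDMA write by converting "packets
 * the peer already received" into a byte offset.  The helper below (a
 * hypothetical name, marked __maybe_unused so it is never referenced)
 * repeats the same arithmetic: the 24-bit wrapped PSN difference times
 * the path MTU.  E.g. first_psn = 0xfffffe, restart_psn = 0x000001 and
 * pmtu = 2048 gives 3 * 2048 = 6144 bytes already delivered.
 */
static u32 __maybe_unused restart_offset_sketch(u32 first_psn,
						u32 restart_psn, u32 pmtu)
{
	return ((restart_psn - first_psn) & QIB_PSN_MASK) * pmtu;
}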
55 
56 /**
57  * qib_make_rc_ack - construct a response packet (ACK, NAK, or RDMA read)
58  * @dev: the device for this QP
59  * @qp: a pointer to the QP
60  * @ohdr: a pointer to the IB header being constructed
61  * @pmtu: the path MTU
62  *
63  * Return 1 if constructed; otherwise, return 0.
64  * Note that we are on the responder side of the QP context.
65  * Note the QP s_lock must be held.
66  */
67 static int qib_make_rc_ack(struct qib_ibdev *dev, struct rvt_qp *qp,
68 			   struct ib_other_headers *ohdr, u32 pmtu)
69 {
70 	struct rvt_ack_entry *e;
71 	u32 hwords;
72 	u32 len;
73 	u32 bth0;
74 	u32 bth2;
75 
76 	/* Don't send an ACK if we aren't supposed to. */
77 	if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK))
78 		goto bail;
79 
80 	/* header size in 32-bit words LRH+BTH = (8+12)/4. */
81 	hwords = 5;
82 
83 	switch (qp->s_ack_state) {
84 	case OP(RDMA_READ_RESPONSE_LAST):
85 	case OP(RDMA_READ_RESPONSE_ONLY):
86 		e = &qp->s_ack_queue[qp->s_tail_ack_queue];
87 		if (e->rdma_sge.mr) {
88 			rvt_put_mr(e->rdma_sge.mr);
89 			e->rdma_sge.mr = NULL;
90 		}
91 		/* FALLTHROUGH */
92 	case OP(ATOMIC_ACKNOWLEDGE):
93 		/*
94 		 * We can increment the tail pointer now that the last
95 		 * response has been sent instead of only being
96 		 * constructed.
97 		 */
98 		if (++qp->s_tail_ack_queue > QIB_MAX_RDMA_ATOMIC)
99 			qp->s_tail_ack_queue = 0;
100 		/* FALLTHROUGH */
101 	case OP(SEND_ONLY):
102 	case OP(ACKNOWLEDGE):
103 		/* Check for no next entry in the queue. */
104 		if (qp->r_head_ack_queue == qp->s_tail_ack_queue) {
105 			if (qp->s_flags & RVT_S_ACK_PENDING)
106 				goto normal;
107 			goto bail;
108 		}
109 
110 		e = &qp->s_ack_queue[qp->s_tail_ack_queue];
111 		if (e->opcode == OP(RDMA_READ_REQUEST)) {
112 			/*
113 			 * If an RDMA read response is being resent and
114 			 * we haven't seen the duplicate request yet, then
115 			 * stop sending the remaining responses until the
116 			 * requester resends the original request.
117 			 */
118 			len = e->rdma_sge.sge_length;
119 			if (len && !e->rdma_sge.mr) {
120 				qp->s_tail_ack_queue = qp->r_head_ack_queue;
121 				goto bail;
122 			}
123 			/* Copy SGE state in case we need to resend */
124 			qp->s_rdma_mr = e->rdma_sge.mr;
125 			if (qp->s_rdma_mr)
126 				rvt_get_mr(qp->s_rdma_mr);
127 			qp->s_ack_rdma_sge.sge = e->rdma_sge;
128 			qp->s_ack_rdma_sge.num_sge = 1;
129 			qp->s_cur_sge = &qp->s_ack_rdma_sge;
130 			if (len > pmtu) {
131 				len = pmtu;
132 				qp->s_ack_state = OP(RDMA_READ_RESPONSE_FIRST);
133 			} else {
134 				qp->s_ack_state = OP(RDMA_READ_RESPONSE_ONLY);
135 				e->sent = 1;
136 			}
137 			ohdr->u.aeth = rvt_compute_aeth(qp);
138 			hwords++;
139 			qp->s_ack_rdma_psn = e->psn;
140 			bth2 = qp->s_ack_rdma_psn++ & QIB_PSN_MASK;
141 		} else {
142 			/* COMPARE_SWAP or FETCH_ADD */
143 			qp->s_cur_sge = NULL;
144 			len = 0;
145 			qp->s_ack_state = OP(ATOMIC_ACKNOWLEDGE);
146 			ohdr->u.at.aeth = rvt_compute_aeth(qp);
147 			ib_u64_put(e->atomic_data, &ohdr->u.at.atomic_ack_eth);
148 			hwords += sizeof(ohdr->u.at) / sizeof(u32);
149 			bth2 = e->psn & QIB_PSN_MASK;
150 			e->sent = 1;
151 		}
152 		bth0 = qp->s_ack_state << 24;
153 		break;
154 
155 	case OP(RDMA_READ_RESPONSE_FIRST):
156 		qp->s_ack_state = OP(RDMA_READ_RESPONSE_MIDDLE);
157 		/* FALLTHROUGH */
158 	case OP(RDMA_READ_RESPONSE_MIDDLE):
159 		qp->s_cur_sge = &qp->s_ack_rdma_sge;
160 		qp->s_rdma_mr = qp->s_ack_rdma_sge.sge.mr;
161 		if (qp->s_rdma_mr)
162 			rvt_get_mr(qp->s_rdma_mr);
163 		len = qp->s_ack_rdma_sge.sge.sge_length;
164 		if (len > pmtu)
165 			len = pmtu;
166 		else {
167 			ohdr->u.aeth = rvt_compute_aeth(qp);
168 			hwords++;
169 			qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST);
170 			e = &qp->s_ack_queue[qp->s_tail_ack_queue];
171 			e->sent = 1;
172 		}
173 		bth0 = qp->s_ack_state << 24;
174 		bth2 = qp->s_ack_rdma_psn++ & QIB_PSN_MASK;
175 		break;
176 
177 	default:
178 normal:
179 		/*
180 		 * Send a regular ACK.
181 		 * Set s_ack_state so that it is not reset to
182 		 * ACKNOWLEDGE until after this ACK has been sent
183 		 * (see above).
184 		 */
185 		qp->s_ack_state = OP(SEND_ONLY);
186 		qp->s_flags &= ~RVT_S_ACK_PENDING;
187 		qp->s_cur_sge = NULL;
188 		if (qp->s_nak_state)
189 			ohdr->u.aeth =
190 				cpu_to_be32((qp->r_msn & IB_MSN_MASK) |
191 					    (qp->s_nak_state <<
192 					     IB_AETH_CREDIT_SHIFT));
193 		else
194 			ohdr->u.aeth = rvt_compute_aeth(qp);
195 		hwords++;
196 		len = 0;
197 		bth0 = OP(ACKNOWLEDGE) << 24;
198 		bth2 = qp->s_ack_psn & QIB_PSN_MASK;
199 	}
200 	qp->s_rdma_ack_cnt++;
201 	qp->s_hdrwords = hwords;
202 	qp->s_cur_size = len;
203 	qib_make_ruc_header(qp, ohdr, bth0, bth2);
204 	return 1;
205 
206 bail:
207 	qp->s_ack_state = OP(ACKNOWLEDGE);
208 	qp->s_flags &= ~(RVT_S_RESP_PENDING | RVT_S_ACK_PENDING);
209 	return 0;
210 }
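
/*
 * Editor's sketch, not part of the upstream driver: the responder ACK
 * queue walked above is a ring of QIB_MAX_RDMA_ATOMIC + 1 entries, with
 * r_head_ack_queue as the producer index and s_tail_ack_queue as the
 * consumer index.  The hypothetical helper below shows the wrap rule
 * used by the "++qp->s_tail_ack_queue" increment in the switch above.
 */
static unsigned __maybe_unused next_ack_queue_index(unsigned n)
{
	return (n + 1 > QIB_MAX_RDMA_ATOMIC) ? 0 : n + 1;
}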
211 
212 /**
213  * qib_make_rc_req - construct a request packet (SEND, RDMA r/w, ATOMIC)
214  * @qp: a pointer to the QP
215  *
216  * Assumes the s_lock is held.
217  *
218  * Return 1 if constructed; otherwise, return 0.
219  */
220 int qib_make_rc_req(struct rvt_qp *qp, unsigned long *flags)
221 {
222 	struct qib_qp_priv *priv = qp->priv;
223 	struct qib_ibdev *dev = to_idev(qp->ibqp.device);
224 	struct ib_other_headers *ohdr;
225 	struct rvt_sge_state *ss;
226 	struct rvt_swqe *wqe;
227 	u32 hwords;
228 	u32 len;
229 	u32 bth0;
230 	u32 bth2;
231 	u32 pmtu = qp->pmtu;
232 	char newreq;
233 	int ret = 0;
234 	int delta;
235 
236 	ohdr = &priv->s_hdr->u.oth;
237 	if (rdma_ah_get_ah_flags(&qp->remote_ah_attr) & IB_AH_GRH)
238 		ohdr = &priv->s_hdr->u.l.oth;
239 
240 	/* Sending responses has higher priority than sending requests. */
241 	if ((qp->s_flags & RVT_S_RESP_PENDING) &&
242 	    qib_make_rc_ack(dev, qp, ohdr, pmtu))
243 		goto done;
244 
245 	if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_SEND_OK)) {
246 		if (!(ib_rvt_state_ops[qp->state] & RVT_FLUSH_SEND))
247 			goto bail;
248 		/* We are in the error state, flush the work request. */
249 		smp_read_barrier_depends(); /* see post_one_send() */
250 		if (qp->s_last == READ_ONCE(qp->s_head))
251 			goto bail;
252 		/* If DMAs are in progress, we can't flush immediately. */
253 		if (atomic_read(&priv->s_dma_busy)) {
254 			qp->s_flags |= RVT_S_WAIT_DMA;
255 			goto bail;
256 		}
257 		wqe = rvt_get_swqe_ptr(qp, qp->s_last);
258 		qib_send_complete(qp, wqe, qp->s_last != qp->s_acked ?
259 			IB_WC_SUCCESS : IB_WC_WR_FLUSH_ERR);
260 		/* will get called again */
261 		goto done;
262 	}
263 
264 	if (qp->s_flags & (RVT_S_WAIT_RNR | RVT_S_WAIT_ACK))
265 		goto bail;
266 
267 	if (qib_cmp24(qp->s_psn, qp->s_sending_hpsn) <= 0) {
268 		if (qib_cmp24(qp->s_sending_psn, qp->s_sending_hpsn) <= 0) {
269 			qp->s_flags |= RVT_S_WAIT_PSN;
270 			goto bail;
271 		}
272 		qp->s_sending_psn = qp->s_psn;
273 		qp->s_sending_hpsn = qp->s_psn - 1;
274 	}
275 
276 	/* header size in 32-bit words LRH+BTH = (8+12)/4. */
277 	hwords = 5;
278 	bth0 = 0;
279 
280 	/* Send a request. */
281 	wqe = rvt_get_swqe_ptr(qp, qp->s_cur);
282 	switch (qp->s_state) {
283 	default:
284 		if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_NEXT_SEND_OK))
285 			goto bail;
286 		/*
287 		 * Resend an old request or start a new one.
288 		 *
289 		 * We keep track of the current SWQE so that
290 		 * we don't reset the "furthest progress" state
291 		 * if we need to back up.
292 		 */
293 		newreq = 0;
294 		if (qp->s_cur == qp->s_tail) {
295 			/* Check if send work queue is empty. */
296 			smp_read_barrier_depends(); /* see post_one_send() */
297 			if (qp->s_tail == READ_ONCE(qp->s_head))
298 				goto bail;
299 			/*
300 			 * If a fence is requested, wait for previous
301 			 * RDMA read and atomic operations to finish.
302 			 */
303 			if ((wqe->wr.send_flags & IB_SEND_FENCE) &&
304 			    qp->s_num_rd_atomic) {
305 				qp->s_flags |= RVT_S_WAIT_FENCE;
306 				goto bail;
307 			}
308 			newreq = 1;
309 			qp->s_psn = wqe->psn;
310 		}
311 		/*
312 		 * Note that we have to be careful not to modify the
313 		 * original work request since we may need to resend
314 		 * it.
315 		 */
316 		len = wqe->length;
317 		ss = &qp->s_sge;
318 		bth2 = qp->s_psn & QIB_PSN_MASK;
319 		switch (wqe->wr.opcode) {
320 		case IB_WR_SEND:
321 		case IB_WR_SEND_WITH_IMM:
322 			/* If no credit, return. */
323 			if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT) &&
324 			    rvt_cmp_msn(wqe->ssn, qp->s_lsn + 1) > 0) {
325 				qp->s_flags |= RVT_S_WAIT_SSN_CREDIT;
326 				goto bail;
327 			}
328 			if (len > pmtu) {
329 				qp->s_state = OP(SEND_FIRST);
330 				len = pmtu;
331 				break;
332 			}
333 			if (wqe->wr.opcode == IB_WR_SEND)
334 				qp->s_state = OP(SEND_ONLY);
335 			else {
336 				qp->s_state = OP(SEND_ONLY_WITH_IMMEDIATE);
337 				/* Immediate data comes after the BTH */
338 				ohdr->u.imm_data = wqe->wr.ex.imm_data;
339 				hwords += 1;
340 			}
341 			if (wqe->wr.send_flags & IB_SEND_SOLICITED)
342 				bth0 |= IB_BTH_SOLICITED;
343 			bth2 |= IB_BTH_REQ_ACK;
344 			if (++qp->s_cur == qp->s_size)
345 				qp->s_cur = 0;
346 			break;
347 
348 		case IB_WR_RDMA_WRITE:
349 			if (newreq && !(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
350 				qp->s_lsn++;
351 			goto no_flow_control;
352 		case IB_WR_RDMA_WRITE_WITH_IMM:
353 			/* If no credit, return. */
354 			if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT) &&
355 			    rvt_cmp_msn(wqe->ssn, qp->s_lsn + 1) > 0) {
356 				qp->s_flags |= RVT_S_WAIT_SSN_CREDIT;
357 				goto bail;
358 			}
359 no_flow_control:
360 			ohdr->u.rc.reth.vaddr =
361 				cpu_to_be64(wqe->rdma_wr.remote_addr);
362 			ohdr->u.rc.reth.rkey =
363 				cpu_to_be32(wqe->rdma_wr.rkey);
364 			ohdr->u.rc.reth.length = cpu_to_be32(len);
365 			hwords += sizeof(struct ib_reth) / sizeof(u32);
366 			if (len > pmtu) {
367 				qp->s_state = OP(RDMA_WRITE_FIRST);
368 				len = pmtu;
369 				break;
370 			}
371 			if (wqe->rdma_wr.wr.opcode == IB_WR_RDMA_WRITE)
372 				qp->s_state = OP(RDMA_WRITE_ONLY);
373 			else {
374 				qp->s_state = OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE);
375 				/* Immediate data comes after RETH */
376 				ohdr->u.rc.imm_data =
377 					wqe->rdma_wr.wr.ex.imm_data;
378 				hwords += 1;
379 				if (wqe->rdma_wr.wr.send_flags & IB_SEND_SOLICITED)
380 					bth0 |= IB_BTH_SOLICITED;
381 			}
382 			bth2 |= IB_BTH_REQ_ACK;
383 			if (++qp->s_cur == qp->s_size)
384 				qp->s_cur = 0;
385 			break;
386 
387 		case IB_WR_RDMA_READ:
388 			/*
389 			 * Don't allow more operations to be started
390 			 * than the QP limits allow.
391 			 */
392 			if (newreq) {
393 				if (qp->s_num_rd_atomic >=
394 				    qp->s_max_rd_atomic) {
395 					qp->s_flags |= RVT_S_WAIT_RDMAR;
396 					goto bail;
397 				}
398 				qp->s_num_rd_atomic++;
399 				if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
400 					qp->s_lsn++;
401 			}
402 
403 			ohdr->u.rc.reth.vaddr =
404 				cpu_to_be64(wqe->rdma_wr.remote_addr);
405 			ohdr->u.rc.reth.rkey =
406 				cpu_to_be32(wqe->rdma_wr.rkey);
407 			ohdr->u.rc.reth.length = cpu_to_be32(len);
408 			qp->s_state = OP(RDMA_READ_REQUEST);
409 			hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32);
410 			ss = NULL;
411 			len = 0;
412 			bth2 |= IB_BTH_REQ_ACK;
413 			if (++qp->s_cur == qp->s_size)
414 				qp->s_cur = 0;
415 			break;
416 
417 		case IB_WR_ATOMIC_CMP_AND_SWP:
418 		case IB_WR_ATOMIC_FETCH_AND_ADD:
419 			/*
420 			 * Don't allow more operations to be started
421 			 * than the QP limits allow.
422 			 */
423 			if (newreq) {
424 				if (qp->s_num_rd_atomic >=
425 				    qp->s_max_rd_atomic) {
426 					qp->s_flags |= RVT_S_WAIT_RDMAR;
427 					goto bail;
428 				}
429 				qp->s_num_rd_atomic++;
430 				if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
431 					qp->s_lsn++;
432 			}
433 			if (wqe->atomic_wr.wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP) {
434 				qp->s_state = OP(COMPARE_SWAP);
435 				put_ib_ateth_swap(wqe->atomic_wr.swap,
436 						  &ohdr->u.atomic_eth);
437 				put_ib_ateth_swap(wqe->atomic_wr.compare_add,
438 						  &ohdr->u.atomic_eth);
439 			} else {
440 				qp->s_state = OP(FETCH_ADD);
441 				put_ib_ateth_swap(wqe->atomic_wr.compare_add,
442 						  &ohdr->u.atomic_eth);
443 				put_ib_ateth_swap(0, &ohdr->u.atomic_eth);
444 			}
445 			put_ib_ateth_vaddr(wqe->atomic_wr.remote_addr,
446 					   &ohdr->u.atomic_eth);
447 			ohdr->u.atomic_eth.rkey = cpu_to_be32(
448 				wqe->atomic_wr.rkey);
449 			hwords += sizeof(struct ib_atomic_eth) / sizeof(u32);
450 			ss = NULL;
451 			len = 0;
452 			bth2 |= IB_BTH_REQ_ACK;
453 			if (++qp->s_cur == qp->s_size)
454 				qp->s_cur = 0;
455 			break;
456 
457 		default:
458 			goto bail;
459 		}
460 		qp->s_sge.sge = wqe->sg_list[0];
461 		qp->s_sge.sg_list = wqe->sg_list + 1;
462 		qp->s_sge.num_sge = wqe->wr.num_sge;
463 		qp->s_sge.total_len = wqe->length;
464 		qp->s_len = wqe->length;
465 		if (newreq) {
466 			qp->s_tail++;
467 			if (qp->s_tail >= qp->s_size)
468 				qp->s_tail = 0;
469 		}
470 		if (wqe->wr.opcode == IB_WR_RDMA_READ)
471 			qp->s_psn = wqe->lpsn + 1;
472 		else
473 			qp->s_psn++;
474 		break;
475 
476 	case OP(RDMA_READ_RESPONSE_FIRST):
477 		/*
478 		 * qp->s_state is normally set to the opcode of the
479 		 * last packet constructed for new requests and therefore
480 		 * is never set to RDMA read response.
481 		 * RDMA_READ_RESPONSE_FIRST is used by the ACK processing
482 		 * thread to indicate a SEND needs to be restarted from an
483 		 * earlier PSN without interfering with the sending thread.
484 		 * See qib_restart_rc().
485 		 */
486 		qp->s_len = restart_sge(&qp->s_sge, wqe, qp->s_psn, pmtu);
487 		/* FALLTHROUGH */
488 	case OP(SEND_FIRST):
489 		qp->s_state = OP(SEND_MIDDLE);
490 		/* FALLTHROUGH */
491 	case OP(SEND_MIDDLE):
492 		bth2 = qp->s_psn++ & QIB_PSN_MASK;
493 		ss = &qp->s_sge;
494 		len = qp->s_len;
495 		if (len > pmtu) {
496 			len = pmtu;
497 			break;
498 		}
499 		if (wqe->wr.opcode == IB_WR_SEND)
500 			qp->s_state = OP(SEND_LAST);
501 		else {
502 			qp->s_state = OP(SEND_LAST_WITH_IMMEDIATE);
503 			/* Immediate data comes after the BTH */
504 			ohdr->u.imm_data = wqe->wr.ex.imm_data;
505 			hwords += 1;
506 		}
507 		if (wqe->wr.send_flags & IB_SEND_SOLICITED)
508 			bth0 |= IB_BTH_SOLICITED;
509 		bth2 |= IB_BTH_REQ_ACK;
510 		qp->s_cur++;
511 		if (qp->s_cur >= qp->s_size)
512 			qp->s_cur = 0;
513 		break;
514 
515 	case OP(RDMA_READ_RESPONSE_LAST):
516 		/*
517 		 * qp->s_state is normally set to the opcode of the
518 		 * last packet constructed for new requests and therefore
519 		 * is never set to RDMA read response.
520 		 * RDMA_READ_RESPONSE_LAST is used by the ACK processing
521 		 * thread to indicate a RDMA write needs to be restarted from
522 		 * an earlier PSN without interfering with the sending thread.
523 		 * See qib_restart_rc().
524 		 */
525 		qp->s_len = restart_sge(&qp->s_sge, wqe, qp->s_psn, pmtu);
526 		/* FALLTHROUGH */
527 	case OP(RDMA_WRITE_FIRST):
528 		qp->s_state = OP(RDMA_WRITE_MIDDLE);
529 		/* FALLTHROUGH */
530 	case OP(RDMA_WRITE_MIDDLE):
531 		bth2 = qp->s_psn++ & QIB_PSN_MASK;
532 		ss = &qp->s_sge;
533 		len = qp->s_len;
534 		if (len > pmtu) {
535 			len = pmtu;
536 			break;
537 		}
538 		if (wqe->wr.opcode == IB_WR_RDMA_WRITE)
539 			qp->s_state = OP(RDMA_WRITE_LAST);
540 		else {
541 			qp->s_state = OP(RDMA_WRITE_LAST_WITH_IMMEDIATE);
542 			/* Immediate data comes after the BTH */
543 			ohdr->u.imm_data = wqe->wr.ex.imm_data;
544 			hwords += 1;
545 			if (wqe->wr.send_flags & IB_SEND_SOLICITED)
546 				bth0 |= IB_BTH_SOLICITED;
547 		}
548 		bth2 |= IB_BTH_REQ_ACK;
549 		qp->s_cur++;
550 		if (qp->s_cur >= qp->s_size)
551 			qp->s_cur = 0;
552 		break;
553 
554 	case OP(RDMA_READ_RESPONSE_MIDDLE):
555 		/*
556 		 * qp->s_state is normally set to the opcode of the
557 		 * last packet constructed for new requests and therefore
558 		 * is never set to RDMA read response.
559 		 * RDMA_READ_RESPONSE_MIDDLE is used by the ACK processing
560 		 * thread to indicate a RDMA read needs to be restarted from
561 		 * an earlier PSN without interfering with the sending thread.
562 		 * See qib_restart_rc().
563 		 */
564 		len = ((qp->s_psn - wqe->psn) & QIB_PSN_MASK) * pmtu;
565 		ohdr->u.rc.reth.vaddr =
566 			cpu_to_be64(wqe->rdma_wr.remote_addr + len);
567 		ohdr->u.rc.reth.rkey =
568 			cpu_to_be32(wqe->rdma_wr.rkey);
569 		ohdr->u.rc.reth.length = cpu_to_be32(wqe->length - len);
570 		qp->s_state = OP(RDMA_READ_REQUEST);
571 		hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32);
572 		bth2 = (qp->s_psn & QIB_PSN_MASK) | IB_BTH_REQ_ACK;
573 		qp->s_psn = wqe->lpsn + 1;
574 		ss = NULL;
575 		len = 0;
576 		qp->s_cur++;
577 		if (qp->s_cur == qp->s_size)
578 			qp->s_cur = 0;
579 		break;
580 	}
581 	qp->s_sending_hpsn = bth2;
582 	delta = (((int) bth2 - (int) wqe->psn) << 8) >> 8;
583 	if (delta && delta % QIB_PSN_CREDIT == 0)
584 		bth2 |= IB_BTH_REQ_ACK;
585 	if (qp->s_flags & RVT_S_SEND_ONE) {
586 		qp->s_flags &= ~RVT_S_SEND_ONE;
587 		qp->s_flags |= RVT_S_WAIT_ACK;
588 		bth2 |= IB_BTH_REQ_ACK;
589 	}
590 	qp->s_len -= len;
591 	qp->s_hdrwords = hwords;
592 	qp->s_cur_sge = ss;
593 	qp->s_cur_size = len;
594 	qib_make_ruc_header(qp, ohdr, bth0 | (qp->s_state << 24), bth2);
595 done:
596 	return 1;
597 bail:
598 	qp->s_flags &= ~RVT_S_BUSY;
599 	return ret;
600 }
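
/*
 * Editor's sketch, not part of the upstream driver: the
 * "(((int) bth2 - (int) wqe->psn) << 8) >> 8" expression near the end of
 * qib_make_rc_req() sign-extends the 24-bit PSN difference so that every
 * QIB_PSN_CREDIT-th packet of a long request also carries
 * IB_BTH_REQ_ACK, keeping ACKs (and credit updates) flowing before the
 * final packet.  A hypothetical helper with the same arithmetic:
 */
static int __maybe_unused psn_delta24_sketch(u32 cur_psn, u32 first_psn)
{
	/* signed distance from the WQE's first PSN, modulo 2^24 */
	return (((int) cur_psn - (int) first_psn) << 8) >> 8;
}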
601 
602 /**
603  * qib_send_rc_ack - Construct an ACK packet and send it
604  * @qp: a pointer to the QP
605  *
606  * This is called from qib_rc_rcv() and qib_kreceive().
607  * Note that RDMA reads and atomics are handled in the
608  * send side QP state and tasklet.
609  */
610 void qib_send_rc_ack(struct rvt_qp *qp)
611 {
612 	struct qib_devdata *dd = dd_from_ibdev(qp->ibqp.device);
613 	struct qib_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
614 	struct qib_pportdata *ppd = ppd_from_ibp(ibp);
615 	u64 pbc;
616 	u16 lrh0;
617 	u32 bth0;
618 	u32 hwords;
619 	u32 pbufn;
620 	u32 __iomem *piobuf;
621 	struct ib_header hdr;
622 	struct ib_other_headers *ohdr;
623 	u32 control;
624 	unsigned long flags;
625 
626 	spin_lock_irqsave(&qp->s_lock, flags);
627 
628 	if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK))
629 		goto unlock;
630 
631 	/* Don't send an ACK or NAK if an RDMA read or atomic is pending. */
632 	if ((qp->s_flags & RVT_S_RESP_PENDING) || qp->s_rdma_ack_cnt)
633 		goto queue_ack;
634 
635 	/* Construct the header with s_lock held so APM doesn't change it. */
636 	ohdr = &hdr.u.oth;
637 	lrh0 = QIB_LRH_BTH;
638 	/* header size in 32-bit words LRH+BTH+AETH = (8+12+4)/4. */
639 	hwords = 6;
640 	if (unlikely(rdma_ah_get_ah_flags(&qp->remote_ah_attr) &
641 		     IB_AH_GRH)) {
642 		hwords += qib_make_grh(ibp, &hdr.u.l.grh,
643 				       rdma_ah_read_grh(&qp->remote_ah_attr),
644 				       hwords, 0);
645 		ohdr = &hdr.u.l.oth;
646 		lrh0 = QIB_LRH_GRH;
647 	}
648 	/* read pkey_index w/o lock (it's atomic) */
649 	bth0 = qib_get_pkey(ibp, qp->s_pkey_index) | (OP(ACKNOWLEDGE) << 24);
650 	if (qp->s_mig_state == IB_MIG_MIGRATED)
651 		bth0 |= IB_BTH_MIG_REQ;
652 	if (qp->r_nak_state)
653 		ohdr->u.aeth = cpu_to_be32((qp->r_msn & IB_MSN_MASK) |
654 					    (qp->r_nak_state <<
655 					     IB_AETH_CREDIT_SHIFT));
656 	else
657 		ohdr->u.aeth = rvt_compute_aeth(qp);
658 	lrh0 |= ibp->sl_to_vl[rdma_ah_get_sl(&qp->remote_ah_attr)] << 12 |
659 		rdma_ah_get_sl(&qp->remote_ah_attr) << 4;
660 	hdr.lrh[0] = cpu_to_be16(lrh0);
661 	hdr.lrh[1] = cpu_to_be16(rdma_ah_get_dlid(&qp->remote_ah_attr));
662 	hdr.lrh[2] = cpu_to_be16(hwords + SIZE_OF_CRC);
663 	hdr.lrh[3] = cpu_to_be16(ppd->lid |
664 				 rdma_ah_get_path_bits(&qp->remote_ah_attr));
665 	ohdr->bth[0] = cpu_to_be32(bth0);
666 	ohdr->bth[1] = cpu_to_be32(qp->remote_qpn);
667 	ohdr->bth[2] = cpu_to_be32(qp->r_ack_psn & QIB_PSN_MASK);
668 
669 	spin_unlock_irqrestore(&qp->s_lock, flags);
670 
671 	/* Don't try to send ACKs if the link isn't ACTIVE */
672 	if (!(ppd->lflags & QIBL_LINKACTIVE))
673 		goto done;
674 
675 	control = dd->f_setpbc_control(ppd, hwords + SIZE_OF_CRC,
676 				       qp->s_srate, lrh0 >> 12);
677 	/* length is + 1 for the control dword */
678 	pbc = ((u64) control << 32) | (hwords + 1);
679 
680 	piobuf = dd->f_getsendbuf(ppd, pbc, &pbufn);
681 	if (!piobuf) {
682 		/*
683 		 * We are out of PIO buffers at the moment.
684 		 * Pass responsibility for sending the ACK to the
685 		 * send tasklet so that when a PIO buffer becomes
686 		 * available, the ACK is sent ahead of other outgoing
687 		 * packets.
688 		 */
689 		spin_lock_irqsave(&qp->s_lock, flags);
690 		goto queue_ack;
691 	}
692 
693 	/*
694 	 * Write the pbc.
695 	 * We have to flush after the PBC for correctness
696 	 * on some CPUs, or the WC buffer can be written out of order.
697 	 */
698 	writeq(pbc, piobuf);
699 
700 	if (dd->flags & QIB_PIO_FLUSH_WC) {
701 		u32 *hdrp = (u32 *) &hdr;
702 
703 		qib_flush_wc();
704 		qib_pio_copy(piobuf + 2, hdrp, hwords - 1);
705 		qib_flush_wc();
706 		__raw_writel(hdrp[hwords - 1], piobuf + hwords + 1);
707 	} else
708 		qib_pio_copy(piobuf + 2, (u32 *) &hdr, hwords);
709 
710 	if (dd->flags & QIB_USE_SPCL_TRIG) {
711 		u32 spcl_off = (pbufn >= dd->piobcnt2k) ? 2047 : 1023;
712 
713 		qib_flush_wc();
714 		__raw_writel(0xaebecede, piobuf + spcl_off);
715 	}
716 
717 	qib_flush_wc();
718 	qib_sendbuf_done(dd, pbufn);
719 
720 	this_cpu_inc(ibp->pmastats->n_unicast_xmit);
721 	goto done;
722 
723 queue_ack:
724 	if (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) {
725 		this_cpu_inc(*ibp->rvp.rc_qacks);
726 		qp->s_flags |= RVT_S_ACK_PENDING | RVT_S_RESP_PENDING;
727 		qp->s_nak_state = qp->r_nak_state;
728 		qp->s_ack_psn = qp->r_ack_psn;
729 
730 		/* Schedule the send tasklet. */
731 		qib_schedule_send(qp);
732 	}
733 unlock:
734 	spin_unlock_irqrestore(&qp->s_lock, flags);
735 done:
736 	return;
737 }
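
/*
 * Editor's sketch, not part of the upstream driver: the PBC written to
 * the PIO buffer above packs the chip-specific control word returned by
 * f_setpbc_control() into the upper 32 bits and the packet length in
 * dwords (plus one for the PBC dword itself) into the lower 32 bits.
 */
static u64 __maybe_unused build_pbc_sketch(u32 control, u32 hwords)
{
	return ((u64) control << 32) | (hwords + 1);
}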
738 
739 /**
740  * reset_psn - reset the QP state to send starting from PSN
741  * @qp: the QP
742  * @psn: the packet sequence number to restart at
743  *
744  * This is called from qib_rc_rcv() to process an incoming RC ACK
745  * for the given QP.
746  * Called at interrupt level with the QP s_lock held.
747  */
748 static void reset_psn(struct rvt_qp *qp, u32 psn)
749 {
750 	u32 n = qp->s_acked;
751 	struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, n);
752 	u32 opcode;
753 
754 	qp->s_cur = n;
755 
756 	/*
757 	 * If we are starting the request from the beginning,
758 	 * let the normal send code handle initialization.
759 	 */
760 	if (qib_cmp24(psn, wqe->psn) <= 0) {
761 		qp->s_state = OP(SEND_LAST);
762 		goto done;
763 	}
764 
765 	/* Find the work request opcode corresponding to the given PSN. */
766 	opcode = wqe->wr.opcode;
767 	for (;;) {
768 		int diff;
769 
770 		if (++n == qp->s_size)
771 			n = 0;
772 		if (n == qp->s_tail)
773 			break;
774 		wqe = rvt_get_swqe_ptr(qp, n);
775 		diff = qib_cmp24(psn, wqe->psn);
776 		if (diff < 0)
777 			break;
778 		qp->s_cur = n;
779 		/*
780 		 * If we are starting the request from the beginning,
781 		 * let the normal send code handle initialization.
782 		 */
783 		if (diff == 0) {
784 			qp->s_state = OP(SEND_LAST);
785 			goto done;
786 		}
787 		opcode = wqe->wr.opcode;
788 	}
789 
790 	/*
791 	 * Set the state to restart in the middle of a request.
792 	 * Don't change the s_sge, s_cur_sge, or s_cur_size.
793 	 * See qib_make_rc_req().
794 	 */
795 	switch (opcode) {
796 	case IB_WR_SEND:
797 	case IB_WR_SEND_WITH_IMM:
798 		qp->s_state = OP(RDMA_READ_RESPONSE_FIRST);
799 		break;
800 
801 	case IB_WR_RDMA_WRITE:
802 	case IB_WR_RDMA_WRITE_WITH_IMM:
803 		qp->s_state = OP(RDMA_READ_RESPONSE_LAST);
804 		break;
805 
806 	case IB_WR_RDMA_READ:
807 		qp->s_state = OP(RDMA_READ_RESPONSE_MIDDLE);
808 		break;
809 
810 	default:
811 		/*
812 		 * This case shouldn't happen since there is only
813 		 * one PSN per request.
814 		 */
815 		qp->s_state = OP(SEND_LAST);
816 	}
817 done:
818 	qp->s_psn = psn;
819 	/*
820 	 * Set RVT_S_WAIT_PSN as qib_rc_complete() may start the timer
821 	 * asynchronously before the send tasklet can get scheduled.
822 	 * Doing it in qib_make_rc_req() is too late.
823 	 */
824 	if ((qib_cmp24(qp->s_psn, qp->s_sending_hpsn) <= 0) &&
825 	    (qib_cmp24(qp->s_sending_psn, qp->s_sending_hpsn) <= 0))
826 		qp->s_flags |= RVT_S_WAIT_PSN;
827 }
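
/*
 * Editor's sketch, not part of the upstream driver: PSNs are 24-bit
 * circular sequence numbers, so ordering tests such as qib_cmp24() used
 * throughout this file cannot compare raw values; the wrapped difference
 * has to be sign-extended first.  A minimal equivalent of the semantics
 * (negative if @a is "before" @b, zero if equal, positive if "after"):
 */
static int __maybe_unused cmp24_sketch(u32 a, u32 b)
{
	return (((int) a - (int) b) << 8) >> 8;
}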
828 
829 /*
830  * Back up the requester to resend the last un-ACKed request.
831  * The QP r_lock and s_lock should be held and interrupts disabled.
832  */
833 void qib_restart_rc(struct rvt_qp *qp, u32 psn, int wait)
834 {
835 	struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
836 	struct qib_ibport *ibp;
837 
838 	if (qp->s_retry == 0) {
839 		if (qp->s_mig_state == IB_MIG_ARMED) {
840 			qib_migrate_qp(qp);
841 			qp->s_retry = qp->s_retry_cnt;
842 		} else if (qp->s_last == qp->s_acked) {
843 			qib_send_complete(qp, wqe, IB_WC_RETRY_EXC_ERR);
844 			rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
845 			return;
846 		} else /* XXX need to handle delayed completion */
847 			return;
848 	} else
849 		qp->s_retry--;
850 
851 	ibp = to_iport(qp->ibqp.device, qp->port_num);
852 	if (wqe->wr.opcode == IB_WR_RDMA_READ)
853 		ibp->rvp.n_rc_resends++;
854 	else
855 		ibp->rvp.n_rc_resends += (qp->s_psn - psn) & QIB_PSN_MASK;
856 
857 	qp->s_flags &= ~(RVT_S_WAIT_FENCE | RVT_S_WAIT_RDMAR |
858 			 RVT_S_WAIT_SSN_CREDIT | RVT_S_WAIT_PSN |
859 			 RVT_S_WAIT_ACK);
860 	if (wait)
861 		qp->s_flags |= RVT_S_SEND_ONE;
862 	reset_psn(qp, psn);
863 }
864 
865 /*
866  * Set qp->s_sending_psn to the next PSN after the given one.
867  * This would be psn+1 except when RDMA reads are present.
868  */
869 static void reset_sending_psn(struct rvt_qp *qp, u32 psn)
870 {
871 	struct rvt_swqe *wqe;
872 	u32 n = qp->s_last;
873 
874 	/* Find the work request corresponding to the given PSN. */
875 	for (;;) {
876 		wqe = rvt_get_swqe_ptr(qp, n);
877 		if (qib_cmp24(psn, wqe->lpsn) <= 0) {
878 			if (wqe->wr.opcode == IB_WR_RDMA_READ)
879 				qp->s_sending_psn = wqe->lpsn + 1;
880 			else
881 				qp->s_sending_psn = psn + 1;
882 			break;
883 		}
884 		if (++n == qp->s_size)
885 			n = 0;
886 		if (n == qp->s_tail)
887 			break;
888 	}
889 }
890 
891 /*
892  * This should be called with the QP s_lock held and interrupts disabled.
893  */
894 void qib_rc_send_complete(struct rvt_qp *qp, struct ib_header *hdr)
895 {
896 	struct ib_other_headers *ohdr;
897 	struct rvt_swqe *wqe;
898 	u32 opcode;
899 	u32 psn;
900 
901 	if (!(ib_rvt_state_ops[qp->state] & RVT_SEND_OR_FLUSH_OR_RECV_OK))
902 		return;
903 
904 	/* Find out where the BTH is */
905 	if ((be16_to_cpu(hdr->lrh[0]) & 3) == QIB_LRH_BTH)
906 		ohdr = &hdr->u.oth;
907 	else
908 		ohdr = &hdr->u.l.oth;
909 
910 	opcode = be32_to_cpu(ohdr->bth[0]) >> 24;
911 	if (opcode >= OP(RDMA_READ_RESPONSE_FIRST) &&
912 	    opcode <= OP(ATOMIC_ACKNOWLEDGE)) {
913 		WARN_ON(!qp->s_rdma_ack_cnt);
914 		qp->s_rdma_ack_cnt--;
915 		return;
916 	}
917 
918 	psn = be32_to_cpu(ohdr->bth[2]);
919 	reset_sending_psn(qp, psn);
920 
921 	/*
922 	 * Start timer after a packet requesting an ACK has been sent and
923 	 * there are still requests that haven't been acked.
924 	 */
925 	if ((psn & IB_BTH_REQ_ACK) && qp->s_acked != qp->s_tail &&
926 	    !(qp->s_flags & (RVT_S_TIMER | RVT_S_WAIT_RNR | RVT_S_WAIT_PSN)) &&
927 	    (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK))
928 		rvt_add_retry_timer(qp);
929 
930 	while (qp->s_last != qp->s_acked) {
931 		u32 s_last;
932 
933 		wqe = rvt_get_swqe_ptr(qp, qp->s_last);
934 		if (qib_cmp24(wqe->lpsn, qp->s_sending_psn) >= 0 &&
935 		    qib_cmp24(qp->s_sending_psn, qp->s_sending_hpsn) <= 0)
936 			break;
937 		s_last = qp->s_last;
938 		if (++s_last >= qp->s_size)
939 			s_last = 0;
940 		qp->s_last = s_last;
941 		/* see post_send() */
942 		barrier();
943 		rvt_put_swqe(wqe);
944 		rvt_qp_swqe_complete(qp,
945 				     wqe,
946 				     ib_qib_wc_opcode[wqe->wr.opcode],
947 				     IB_WC_SUCCESS);
948 	}
949 	/*
950 	 * If we were waiting for sends to complete before resending,
951 	 * and they are now complete, restart sending.
952 	 */
953 	if (qp->s_flags & RVT_S_WAIT_PSN &&
954 	    qib_cmp24(qp->s_sending_psn, qp->s_sending_hpsn) > 0) {
955 		qp->s_flags &= ~RVT_S_WAIT_PSN;
956 		qp->s_sending_psn = qp->s_psn;
957 		qp->s_sending_hpsn = qp->s_psn - 1;
958 		qib_schedule_send(qp);
959 	}
960 }
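
/*
 * Editor's sketch, not part of the upstream driver: the completion loop
 * above refuses to retire a WQE while its last PSN is still inside the
 * [s_sending_psn, s_sending_hpsn] window, i.e. while some of its packets
 * are still in flight through the send engine.  The same test as a
 * hypothetical boolean helper:
 */
static int __maybe_unused wqe_still_sending_sketch(u32 lpsn, u32 sending_psn,
						   u32 sending_hpsn)
{
	return qib_cmp24(lpsn, sending_psn) >= 0 &&
	       qib_cmp24(sending_psn, sending_hpsn) <= 0;
}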
961 
962 static inline void update_last_psn(struct rvt_qp *qp, u32 psn)
963 {
964 	qp->s_last_psn = psn;
965 }
966 
967 /*
968  * Generate a SWQE completion.
969  * This is similar to qib_send_complete but has to check to be sure
970  * that the SGEs are not being referenced if the SWQE is being resent.
971  */
972 static struct rvt_swqe *do_rc_completion(struct rvt_qp *qp,
973 					 struct rvt_swqe *wqe,
974 					 struct qib_ibport *ibp)
975 {
976 	/*
977 	 * Don't decrement refcount and don't generate a
978 	 * completion if the SWQE is being resent until the send
979 	 * is finished.
980 	 */
981 	if (qib_cmp24(wqe->lpsn, qp->s_sending_psn) < 0 ||
982 	    qib_cmp24(qp->s_sending_psn, qp->s_sending_hpsn) > 0) {
983 		u32 s_last;
984 
985 		rvt_put_swqe(wqe);
986 		s_last = qp->s_last;
987 		if (++s_last >= qp->s_size)
988 			s_last = 0;
989 		qp->s_last = s_last;
990 		/* see post_send() */
991 		barrier();
992 		rvt_qp_swqe_complete(qp,
993 				     wqe,
994 				     ib_qib_wc_opcode[wqe->wr.opcode],
995 				     IB_WC_SUCCESS);
996 	} else
997 		this_cpu_inc(*ibp->rvp.rc_delayed_comp);
998 
999 	qp->s_retry = qp->s_retry_cnt;
1000 	update_last_psn(qp, wqe->lpsn);
1001 
1002 	/*
1003 	 * If we are completing a request which is in the process of
1004 	 * being resent, we can stop resending it since we know the
1005 	 * responder has already seen it.
1006 	 */
1007 	if (qp->s_acked == qp->s_cur) {
1008 		if (++qp->s_cur >= qp->s_size)
1009 			qp->s_cur = 0;
1010 		qp->s_acked = qp->s_cur;
1011 		wqe = rvt_get_swqe_ptr(qp, qp->s_cur);
1012 		if (qp->s_acked != qp->s_tail) {
1013 			qp->s_state = OP(SEND_LAST);
1014 			qp->s_psn = wqe->psn;
1015 		}
1016 	} else {
1017 		if (++qp->s_acked >= qp->s_size)
1018 			qp->s_acked = 0;
1019 		if (qp->state == IB_QPS_SQD && qp->s_acked == qp->s_cur)
1020 			qp->s_draining = 0;
1021 		wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
1022 	}
1023 	return wqe;
1024 }
1025 
1026 /**
1027  * do_rc_ack - process an incoming RC ACK
1028  * @qp: the QP the ACK came in on
1029  * @psn: the packet sequence number of the ACK
1030  * @opcode: the opcode of the request that resulted in the ACK
1031  *
1032  * This is called from qib_rc_rcv_resp() to process an incoming RC ACK
1033  * for the given QP.
1034  * Called at interrupt level with the QP s_lock held.
1035  * Returns 1 if OK, 0 if current operation should be aborted (NAK).
1036  */
1037 static int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode,
1038 		     u64 val, struct qib_ctxtdata *rcd)
1039 {
1040 	struct qib_ibport *ibp;
1041 	enum ib_wc_status status;
1042 	struct rvt_swqe *wqe;
1043 	int ret = 0;
1044 	u32 ack_psn;
1045 	int diff;
1046 
1047 	/*
1048 	 * Note that NAKs implicitly ACK outstanding SEND and RDMA write
1049 	 * requests and implicitly NAK RDMA read and atomic requests issued
1050 	 * before the NAK'ed request.  The MSN won't include the NAK'ed
1051 	 * request but will include any ACK'ed requests.
1052 	 */
1053 	ack_psn = psn;
1054 	if (aeth >> IB_AETH_NAK_SHIFT)
1055 		ack_psn--;
1056 	wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
1057 	ibp = to_iport(qp->ibqp.device, qp->port_num);
1058 
1059 	/*
1060 	 * The MSN might be for a later WQE than the PSN indicates so
1061 	 * only complete WQEs that the PSN finishes.
1062 	 */
1063 	while ((diff = qib_cmp24(ack_psn, wqe->lpsn)) >= 0) {
1064 		/*
1065 		 * RDMA_READ_RESPONSE_ONLY is a special case since
1066 		 * we want to generate completion events for everything
1067 		 * before the RDMA read, copy the data, then generate
1068 		 * the completion for the read.
1069 		 */
1070 		if (wqe->wr.opcode == IB_WR_RDMA_READ &&
1071 		    opcode == OP(RDMA_READ_RESPONSE_ONLY) &&
1072 		    diff == 0) {
1073 			ret = 1;
1074 			goto bail;
1075 		}
1076 		/*
1077 		 * If this request is a RDMA read or atomic, and the ACK is
1078 		 * for a later operation, this ACK NAKs the RDMA read or
1079 		 * atomic.  In other words, only a RDMA_READ_LAST or ONLY
1080 		 * can ACK a RDMA read and likewise for atomic ops.  Note
1081 		 * that the NAK case can only happen if relaxed ordering is
1082 		 * used and requests are sent after an RDMA read or atomic
1083 		 * is sent but before the response is received.
1084 		 */
1085 		if ((wqe->wr.opcode == IB_WR_RDMA_READ &&
1086 		     (opcode != OP(RDMA_READ_RESPONSE_LAST) || diff != 0)) ||
1087 		    ((wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
1088 		      wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) &&
1089 		     (opcode != OP(ATOMIC_ACKNOWLEDGE) || diff != 0))) {
1090 			/* Retry this request. */
1091 			if (!(qp->r_flags & RVT_R_RDMAR_SEQ)) {
1092 				qp->r_flags |= RVT_R_RDMAR_SEQ;
1093 				qib_restart_rc(qp, qp->s_last_psn + 1, 0);
1094 				if (list_empty(&qp->rspwait)) {
1095 					qp->r_flags |= RVT_R_RSP_SEND;
1096 					rvt_get_qp(qp);
1097 					list_add_tail(&qp->rspwait,
1098 						      &rcd->qp_wait_list);
1099 				}
1100 			}
1101 			/*
1102 			 * No need to process the ACK/NAK since we are
1103 			 * restarting an earlier request.
1104 			 */
1105 			goto bail;
1106 		}
1107 		if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
1108 		    wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) {
1109 			u64 *vaddr = wqe->sg_list[0].vaddr;
1110 			*vaddr = val;
1111 		}
1112 		if (qp->s_num_rd_atomic &&
1113 		    (wqe->wr.opcode == IB_WR_RDMA_READ ||
1114 		     wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
1115 		     wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)) {
1116 			qp->s_num_rd_atomic--;
1117 			/* Restart sending task if fence is complete */
1118 			if ((qp->s_flags & RVT_S_WAIT_FENCE) &&
1119 			    !qp->s_num_rd_atomic) {
1120 				qp->s_flags &= ~(RVT_S_WAIT_FENCE |
1121 						 RVT_S_WAIT_ACK);
1122 				qib_schedule_send(qp);
1123 			} else if (qp->s_flags & RVT_S_WAIT_RDMAR) {
1124 				qp->s_flags &= ~(RVT_S_WAIT_RDMAR |
1125 						 RVT_S_WAIT_ACK);
1126 				qib_schedule_send(qp);
1127 			}
1128 		}
1129 		wqe = do_rc_completion(qp, wqe, ibp);
1130 		if (qp->s_acked == qp->s_tail)
1131 			break;
1132 	}
1133 
1134 	switch (aeth >> IB_AETH_NAK_SHIFT) {
1135 	case 0:         /* ACK */
1136 		this_cpu_inc(*ibp->rvp.rc_acks);
1137 		if (qp->s_acked != qp->s_tail) {
1138 			/*
1139 			 * We are expecting more ACKs so
1140 			 * reset the retransmit timer.
1141 			 */
1142 			rvt_mod_retry_timer(qp);
1143 			/*
1144 			 * We can stop resending the earlier packets and
1145 			 * continue with the next packet the receiver wants.
1146 			 */
1147 			if (qib_cmp24(qp->s_psn, psn) <= 0)
1148 				reset_psn(qp, psn + 1);
1149 		} else {
1150 			/* No more acks - kill all timers */
1151 			rvt_stop_rc_timers(qp);
1152 			if (qib_cmp24(qp->s_psn, psn) <= 0) {
1153 				qp->s_state = OP(SEND_LAST);
1154 				qp->s_psn = psn + 1;
1155 			}
1156 		}
1157 		if (qp->s_flags & RVT_S_WAIT_ACK) {
1158 			qp->s_flags &= ~RVT_S_WAIT_ACK;
1159 			qib_schedule_send(qp);
1160 		}
1161 		rvt_get_credit(qp, aeth);
1162 		qp->s_rnr_retry = qp->s_rnr_retry_cnt;
1163 		qp->s_retry = qp->s_retry_cnt;
1164 		update_last_psn(qp, psn);
1165 		return 1;
1166 
1167 	case 1:         /* RNR NAK */
1168 		ibp->rvp.n_rnr_naks++;
1169 		if (qp->s_acked == qp->s_tail)
1170 			goto bail;
1171 		if (qp->s_flags & RVT_S_WAIT_RNR)
1172 			goto bail;
1173 		if (qp->s_rnr_retry == 0) {
1174 			status = IB_WC_RNR_RETRY_EXC_ERR;
1175 			goto class_b;
1176 		}
1177 		if (qp->s_rnr_retry_cnt < 7)
1178 			qp->s_rnr_retry--;
1179 
1180 		/* The last valid PSN is the previous PSN. */
1181 		update_last_psn(qp, psn - 1);
1182 
1183 		ibp->rvp.n_rc_resends += (qp->s_psn - psn) & QIB_PSN_MASK;
1184 
1185 		reset_psn(qp, psn);
1186 
1187 		qp->s_flags &= ~(RVT_S_WAIT_SSN_CREDIT | RVT_S_WAIT_ACK);
1188 		rvt_stop_rc_timers(qp);
1189 		rvt_add_rnr_timer(qp, aeth);
1190 		return 0;
1191 
1192 	case 3:         /* NAK */
1193 		if (qp->s_acked == qp->s_tail)
1194 			goto bail;
1195 		/* The last valid PSN is the previous PSN. */
1196 		update_last_psn(qp, psn - 1);
1197 		switch ((aeth >> IB_AETH_CREDIT_SHIFT) &
1198 			IB_AETH_CREDIT_MASK) {
1199 		case 0: /* PSN sequence error */
1200 			ibp->rvp.n_seq_naks++;
1201 			/*
1202 			 * Back up to the responder's expected PSN.
1203 			 * Note that we might get a NAK in the middle of an
1204 			 * RDMA READ response which terminates the RDMA
1205 			 * READ.
1206 			 */
1207 			qib_restart_rc(qp, psn, 0);
1208 			qib_schedule_send(qp);
1209 			break;
1210 
1211 		case 1: /* Invalid Request */
1212 			status = IB_WC_REM_INV_REQ_ERR;
1213 			ibp->rvp.n_other_naks++;
1214 			goto class_b;
1215 
1216 		case 2: /* Remote Access Error */
1217 			status = IB_WC_REM_ACCESS_ERR;
1218 			ibp->rvp.n_other_naks++;
1219 			goto class_b;
1220 
1221 		case 3: /* Remote Operation Error */
1222 			status = IB_WC_REM_OP_ERR;
1223 			ibp->rvp.n_other_naks++;
1224 class_b:
1225 			if (qp->s_last == qp->s_acked) {
1226 				qib_send_complete(qp, wqe, status);
1227 				rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
1228 			}
1229 			break;
1230 
1231 		default:
1232 			/* Ignore other reserved NAK error codes */
1233 			goto reserved;
1234 		}
1235 		qp->s_retry = qp->s_retry_cnt;
1236 		qp->s_rnr_retry = qp->s_rnr_retry_cnt;
1237 		goto bail;
1238 
1239 	default:                /* 2: reserved */
1240 reserved:
1241 		/* Ignore reserved NAK codes. */
1242 		goto bail;
1243 	}
1244 
1245 bail:
1246 	rvt_stop_rc_timers(qp);
1247 	return ret;
1248 }
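
/*
 * Editor's sketch, not part of the upstream driver: do_rc_ack() above
 * keys off two AETH fields, the ACK/NAK code in the top bits
 * (0 = ACK, 1 = RNR NAK, 3 = NAK, others reserved) and, for NAKs, the
 * 5-bit syndrome selecting sequence error, invalid request, remote
 * access error or remote operation error; the low 24 bits carry the MSN.
 */
static void __maybe_unused decode_aeth_sketch(u32 aeth, u32 *nak_code,
					      u32 *syndrome, u32 *msn)
{
	*nak_code = aeth >> IB_AETH_NAK_SHIFT;
	*syndrome = (aeth >> IB_AETH_CREDIT_SHIFT) & IB_AETH_CREDIT_MASK;
	*msn = aeth & IB_MSN_MASK;
}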
1249 
1250 /*
1251  * We have seen an out of sequence RDMA read middle or last packet.
1252  * This ACKs SENDs and RDMA writes up to the first RDMA read or atomic SWQE.
1253  */
1254 static void rdma_seq_err(struct rvt_qp *qp, struct qib_ibport *ibp, u32 psn,
1255 			 struct qib_ctxtdata *rcd)
1256 {
1257 	struct rvt_swqe *wqe;
1258 
1259 	/* Remove QP from retry timer */
1260 	rvt_stop_rc_timers(qp);
1261 
1262 	wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
1263 
1264 	while (qib_cmp24(psn, wqe->lpsn) > 0) {
1265 		if (wqe->wr.opcode == IB_WR_RDMA_READ ||
1266 		    wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
1267 		    wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)
1268 			break;
1269 		wqe = do_rc_completion(qp, wqe, ibp);
1270 	}
1271 
1272 	ibp->rvp.n_rdma_seq++;
1273 	qp->r_flags |= RVT_R_RDMAR_SEQ;
1274 	qib_restart_rc(qp, qp->s_last_psn + 1, 0);
1275 	if (list_empty(&qp->rspwait)) {
1276 		qp->r_flags |= RVT_R_RSP_SEND;
1277 		rvt_get_qp(qp);
1278 		list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
1279 	}
1280 }
1281 
1282 /**
1283  * qib_rc_rcv_resp - process an incoming RC response packet
1284  * @ibp: the port this packet came in on
1285  * @ohdr: the other headers for this packet
1286  * @data: the packet data
1287  * @tlen: the packet length
1288  * @qp: the QP for this packet
1289  * @opcode: the opcode for this packet
1290  * @psn: the packet sequence number for this packet
1291  * @hdrsize: the header length
1292  * @pmtu: the path MTU
1293  *
1294  * This is called from qib_rc_rcv() to process an incoming RC response
1295  * packet for the given QP.
1296  * Called at interrupt level.
1297  */
1298 static void qib_rc_rcv_resp(struct qib_ibport *ibp,
1299 			    struct ib_other_headers *ohdr,
1300 			    void *data, u32 tlen,
1301 			    struct rvt_qp *qp,
1302 			    u32 opcode,
1303 			    u32 psn, u32 hdrsize, u32 pmtu,
1304 			    struct qib_ctxtdata *rcd)
1305 {
1306 	struct rvt_swqe *wqe;
1307 	struct qib_pportdata *ppd = ppd_from_ibp(ibp);
1308 	enum ib_wc_status status;
1309 	unsigned long flags;
1310 	int diff;
1311 	u32 pad;
1312 	u32 aeth;
1313 	u64 val;
1314 
1315 	if (opcode != OP(RDMA_READ_RESPONSE_MIDDLE)) {
1316 		/*
1317 		 * If the ACK'd PSN is on the SDMA busy list, try to make
1318 		 * progress to reclaim SDMA credits.
1319 		 */
1320 		if ((qib_cmp24(psn, qp->s_sending_psn) >= 0) &&
1321 		    (qib_cmp24(qp->s_sending_psn, qp->s_sending_hpsn) <= 0)) {
1322 
1323 			/*
1324 			 * If the send tasklet is not running, attempt to
1325 			 * progress the SDMA queue.
1326 			 */
1327 			if (!(qp->s_flags & RVT_S_BUSY)) {
1328 				/* Acquire SDMA Lock */
1329 				spin_lock_irqsave(&ppd->sdma_lock, flags);
1330 				/* Invoke sdma make progress */
1331 				qib_sdma_make_progress(ppd);
1332 				/* Release SDMA Lock */
1333 				spin_unlock_irqrestore(&ppd->sdma_lock, flags);
1334 			}
1335 		}
1336 	}
1337 
1338 	spin_lock_irqsave(&qp->s_lock, flags);
1339 	if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK))
1340 		goto ack_done;
1341 
1342 	/* Ignore invalid responses. */
1343 	smp_read_barrier_depends(); /* see post_one_send */
1344 	if (qib_cmp24(psn, READ_ONCE(qp->s_next_psn)) >= 0)
1345 		goto ack_done;
1346 
1347 	/* Ignore duplicate responses. */
1348 	diff = qib_cmp24(psn, qp->s_last_psn);
1349 	if (unlikely(diff <= 0)) {
1350 		/* Update credits for "ghost" ACKs */
1351 		if (diff == 0 && opcode == OP(ACKNOWLEDGE)) {
1352 			aeth = be32_to_cpu(ohdr->u.aeth);
1353 			if ((aeth >> IB_AETH_NAK_SHIFT) == 0)
1354 				rvt_get_credit(qp, aeth);
1355 		}
1356 		goto ack_done;
1357 	}
1358 
1359 	/*
1360 	 * Skip everything other than the PSN we expect, if we are waiting
1361 	 * for a reply to a restarted RDMA read or atomic op.
1362 	 */
1363 	if (qp->r_flags & RVT_R_RDMAR_SEQ) {
1364 		if (qib_cmp24(psn, qp->s_last_psn + 1) != 0)
1365 			goto ack_done;
1366 		qp->r_flags &= ~RVT_R_RDMAR_SEQ;
1367 	}
1368 
1369 	if (unlikely(qp->s_acked == qp->s_tail))
1370 		goto ack_done;
1371 	wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
1372 	status = IB_WC_SUCCESS;
1373 
1374 	switch (opcode) {
1375 	case OP(ACKNOWLEDGE):
1376 	case OP(ATOMIC_ACKNOWLEDGE):
1377 	case OP(RDMA_READ_RESPONSE_FIRST):
1378 		aeth = be32_to_cpu(ohdr->u.aeth);
1379 		if (opcode == OP(ATOMIC_ACKNOWLEDGE))
1380 			val = ib_u64_get(&ohdr->u.at.atomic_ack_eth);
1381 		else
1382 			val = 0;
1383 		if (!do_rc_ack(qp, aeth, psn, opcode, val, rcd) ||
1384 		    opcode != OP(RDMA_READ_RESPONSE_FIRST))
1385 			goto ack_done;
1386 		hdrsize += 4;
1387 		wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
1388 		if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
1389 			goto ack_op_err;
1390 		/*
1391 		 * If this is a response to a resent RDMA read, we
1392 		 * have to be careful to copy the data to the right
1393 		 * location.
1394 		 */
1395 		qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge,
1396 						  wqe, psn, pmtu);
1397 		goto read_middle;
1398 
1399 	case OP(RDMA_READ_RESPONSE_MIDDLE):
1400 		/* no AETH, no ACK */
1401 		if (unlikely(qib_cmp24(psn, qp->s_last_psn + 1)))
1402 			goto ack_seq_err;
1403 		if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
1404 			goto ack_op_err;
1405 read_middle:
1406 		if (unlikely(tlen != (hdrsize + pmtu + 4)))
1407 			goto ack_len_err;
1408 		if (unlikely(pmtu >= qp->s_rdma_read_len))
1409 			goto ack_len_err;
1410 
1411 		/*
1412 		 * We got a response so update the timeout.
1413 		 * 4.096 usec. * (1 << qp->timeout)
1414 		 */
1415 		rvt_mod_retry_timer(qp);
1416 		if (qp->s_flags & RVT_S_WAIT_ACK) {
1417 			qp->s_flags &= ~RVT_S_WAIT_ACK;
1418 			qib_schedule_send(qp);
1419 		}
1420 
1421 		if (opcode == OP(RDMA_READ_RESPONSE_MIDDLE))
1422 			qp->s_retry = qp->s_retry_cnt;
1423 
1424 		/*
1425 		 * Update the RDMA receive state but do the copy w/o
1426 		 * holding the locks and blocking interrupts.
1427 		 */
1428 		qp->s_rdma_read_len -= pmtu;
1429 		update_last_psn(qp, psn);
1430 		spin_unlock_irqrestore(&qp->s_lock, flags);
1431 		qib_copy_sge(&qp->s_rdma_read_sge, data, pmtu, 0);
1432 		goto bail;
1433 
1434 	case OP(RDMA_READ_RESPONSE_ONLY):
1435 		aeth = be32_to_cpu(ohdr->u.aeth);
1436 		if (!do_rc_ack(qp, aeth, psn, opcode, 0, rcd))
1437 			goto ack_done;
1438 		/* Get the number of bytes the message was padded by. */
1439 		pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
1440 		/*
1441 		 * Check that the data size is >= 0 && <= pmtu.
1442 		 * Remember to account for the AETH header (4) and
1443 		 * ICRC (4).
1444 		 */
1445 		if (unlikely(tlen < (hdrsize + pad + 8)))
1446 			goto ack_len_err;
1447 		/*
1448 		 * If this is a response to a resent RDMA read, we
1449 		 * have to be careful to copy the data to the right
1450 		 * location.
1451 		 */
1452 		wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
1453 		qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge,
1454 						  wqe, psn, pmtu);
1455 		goto read_last;
1456 
1457 	case OP(RDMA_READ_RESPONSE_LAST):
1458 		/* ACKs READ req. */
1459 		if (unlikely(qib_cmp24(psn, qp->s_last_psn + 1)))
1460 			goto ack_seq_err;
1461 		if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
1462 			goto ack_op_err;
1463 		/* Get the number of bytes the message was padded by. */
1464 		pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
1465 		/*
1466 		 * Check that the data size is >= 1 && <= pmtu.
1467 		 * Remember to account for the AETH header (4) and
1468 		 * ICRC (4).
1469 		 */
1470 		if (unlikely(tlen <= (hdrsize + pad + 8)))
1471 			goto ack_len_err;
1472 read_last:
1473 		tlen -= hdrsize + pad + 8;
1474 		if (unlikely(tlen != qp->s_rdma_read_len))
1475 			goto ack_len_err;
1476 		aeth = be32_to_cpu(ohdr->u.aeth);
1477 		qib_copy_sge(&qp->s_rdma_read_sge, data, tlen, 0);
1478 		WARN_ON(qp->s_rdma_read_sge.num_sge);
1479 		(void) do_rc_ack(qp, aeth, psn,
1480 				 OP(RDMA_READ_RESPONSE_LAST), 0, rcd);
1481 		goto ack_done;
1482 	}
1483 
1484 ack_op_err:
1485 	status = IB_WC_LOC_QP_OP_ERR;
1486 	goto ack_err;
1487 
1488 ack_seq_err:
1489 	rdma_seq_err(qp, ibp, psn, rcd);
1490 	goto ack_done;
1491 
1492 ack_len_err:
1493 	status = IB_WC_LOC_LEN_ERR;
1494 ack_err:
1495 	if (qp->s_last == qp->s_acked) {
1496 		qib_send_complete(qp, wqe, status);
1497 		rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
1498 	}
1499 ack_done:
1500 	spin_unlock_irqrestore(&qp->s_lock, flags);
1501 bail:
1502 	return;
1503 }
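
/*
 * Editor's sketch, not part of the upstream driver: the length checks in
 * qib_rc_rcv_resp() account for everything in an RDMA read LAST/ONLY
 * response that is not payload: the LRH/GRH/BTH headers (hdrsize), the
 * 0-3 pad bytes advertised in the BTH, the 4-byte AETH and the 4-byte
 * ICRC, which is where the "+ 8" above comes from.
 */
static u32 __maybe_unused read_response_payload_sketch(u32 tlen, u32 hdrsize,
						       u32 pad)
{
	return tlen - (hdrsize + pad + 4 /* AETH */ + 4 /* ICRC */);
}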
1504 
1505 /**
1506  * qib_rc_rcv_error - process an incoming duplicate or error RC packet
1507  * @ohdr: the other headers for this packet
1508  * @data: the packet data
1509  * @qp: the QP for this packet
1510  * @opcode: the opcode for this packet
1511  * @psn: the packet sequence number for this packet
1512  * @diff: the difference between the PSN and the expected PSN
1513  *
1514  * This is called from qib_rc_rcv() to process an unexpected
1515  * incoming RC packet for the given QP.
1516  * Called at interrupt level.
1517  * Return 1 if no more processing is needed; otherwise return 0 to
1518  * schedule a response to be sent.
1519  */
1520 static int qib_rc_rcv_error(struct ib_other_headers *ohdr,
1521 			    void *data,
1522 			    struct rvt_qp *qp,
1523 			    u32 opcode,
1524 			    u32 psn,
1525 			    int diff,
1526 			    struct qib_ctxtdata *rcd)
1527 {
1528 	struct qib_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
1529 	struct rvt_ack_entry *e;
1530 	unsigned long flags;
1531 	u8 i, prev;
1532 	int old_req;
1533 
1534 	if (diff > 0) {
1535 		/*
1536 		 * Packet sequence error.
1537 		 * A NAK will ACK earlier sends and RDMA writes.
1538 		 * Don't queue the NAK if we already sent one.
1539 		 */
1540 		if (!qp->r_nak_state) {
1541 			ibp->rvp.n_rc_seqnak++;
1542 			qp->r_nak_state = IB_NAK_PSN_ERROR;
1543 			/* Use the expected PSN. */
1544 			qp->r_ack_psn = qp->r_psn;
1545 			/*
1546 			 * Wait to send the sequence NAK until all packets
1547 			 * in the receive queue have been processed.
1548 			 * Otherwise, we end up propagating congestion.
1549 			 */
1550 			if (list_empty(&qp->rspwait)) {
1551 				qp->r_flags |= RVT_R_RSP_NAK;
1552 				rvt_get_qp(qp);
1553 				list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
1554 			}
1555 		}
1556 		goto done;
1557 	}
1558 
1559 	/*
1560 	 * Handle a duplicate request.  Don't re-execute SEND, RDMA
1561 	 * write or atomic op.  Don't NAK errors, just silently drop
1562 	 * the duplicate request.  Note that r_sge, r_len, and
1563 	 * r_rcv_len may be in use so don't modify them.
1564 	 *
1565 	 * We are supposed to ACK the earliest duplicate PSN but we
1566 	 * can coalesce an outstanding duplicate ACK.  We have to
1567 	 * send the earliest so that RDMA reads can be restarted at
1568 	 * the requester's expected PSN.
1569 	 *
1570 	 * First, find where this duplicate PSN falls within the
1571 	 * ACKs previously sent.
1572 	 * old_req is true if there is an older response that is scheduled
1573 	 * to be sent before sending this one.
1574 	 */
1575 	e = NULL;
1576 	old_req = 1;
1577 	ibp->rvp.n_rc_dupreq++;
1578 
1579 	spin_lock_irqsave(&qp->s_lock, flags);
1580 
1581 	for (i = qp->r_head_ack_queue; ; i = prev) {
1582 		if (i == qp->s_tail_ack_queue)
1583 			old_req = 0;
1584 		if (i)
1585 			prev = i - 1;
1586 		else
1587 			prev = QIB_MAX_RDMA_ATOMIC;
1588 		if (prev == qp->r_head_ack_queue) {
1589 			e = NULL;
1590 			break;
1591 		}
1592 		e = &qp->s_ack_queue[prev];
1593 		if (!e->opcode) {
1594 			e = NULL;
1595 			break;
1596 		}
1597 		if (qib_cmp24(psn, e->psn) >= 0) {
1598 			if (prev == qp->s_tail_ack_queue &&
1599 			    qib_cmp24(psn, e->lpsn) <= 0)
1600 				old_req = 0;
1601 			break;
1602 		}
1603 	}
1604 	switch (opcode) {
1605 	case OP(RDMA_READ_REQUEST): {
1606 		struct ib_reth *reth;
1607 		u32 offset;
1608 		u32 len;
1609 
1610 		/*
1611 		 * If we didn't find the RDMA read request in the ack queue,
1612 		 * we can ignore this request.
1613 		 */
1614 		if (!e || e->opcode != OP(RDMA_READ_REQUEST))
1615 			goto unlock_done;
1616 		/* RETH comes after BTH */
1617 		reth = &ohdr->u.rc.reth;
1618 		/*
1619 		 * Address range must be a subset of the original
1620 		 * request and start on pmtu boundaries.
1621 		 * We reuse the old ack_queue slot since the requester
1622 		 * should not back up and request an earlier PSN for the
1623 		 * same request.
1624 		 */
1625 		offset = ((psn - e->psn) & QIB_PSN_MASK) *
1626 			qp->pmtu;
1627 		len = be32_to_cpu(reth->length);
1628 		if (unlikely(offset + len != e->rdma_sge.sge_length))
1629 			goto unlock_done;
1630 		if (e->rdma_sge.mr) {
1631 			rvt_put_mr(e->rdma_sge.mr);
1632 			e->rdma_sge.mr = NULL;
1633 		}
1634 		if (len != 0) {
1635 			u32 rkey = be32_to_cpu(reth->rkey);
1636 			u64 vaddr = be64_to_cpu(reth->vaddr);
1637 			int ok;
1638 
1639 			ok = rvt_rkey_ok(qp, &e->rdma_sge, len, vaddr, rkey,
1640 					 IB_ACCESS_REMOTE_READ);
1641 			if (unlikely(!ok))
1642 				goto unlock_done;
1643 		} else {
1644 			e->rdma_sge.vaddr = NULL;
1645 			e->rdma_sge.length = 0;
1646 			e->rdma_sge.sge_length = 0;
1647 		}
1648 		e->psn = psn;
1649 		if (old_req)
1650 			goto unlock_done;
1651 		qp->s_tail_ack_queue = prev;
1652 		break;
1653 	}
1654 
1655 	case OP(COMPARE_SWAP):
1656 	case OP(FETCH_ADD): {
1657 		/*
1658 		 * If we didn't find the atomic request in the ack queue
1659 		 * or the send tasklet is already backed up to send an
1660 		 * earlier entry, we can ignore this request.
1661 		 */
1662 		if (!e || e->opcode != (u8) opcode || old_req)
1663 			goto unlock_done;
1664 		qp->s_tail_ack_queue = prev;
1665 		break;
1666 	}
1667 
1668 	default:
1669 		/*
1670 		 * Ignore this operation if it doesn't request an ACK
1671 		 * or an earlier RDMA read or atomic is going to be resent.
1672 		 */
1673 		if (!(psn & IB_BTH_REQ_ACK) || old_req)
1674 			goto unlock_done;
1675 		/*
1676 		 * Resend the most recent ACK if this request is
1677 		 * after all the previous RDMA reads and atomics.
1678 		 */
1679 		if (i == qp->r_head_ack_queue) {
1680 			spin_unlock_irqrestore(&qp->s_lock, flags);
1681 			qp->r_nak_state = 0;
1682 			qp->r_ack_psn = qp->r_psn - 1;
1683 			goto send_ack;
1684 		}
1685 		/*
1686 		 * Try to send a simple ACK to work around a Mellanox bug
1687 		 * which doesn't accept a RDMA read response or atomic
1688 		 * response as an ACK for earlier SENDs or RDMA writes.
1689 		 */
1690 		if (!(qp->s_flags & RVT_S_RESP_PENDING)) {
1691 			spin_unlock_irqrestore(&qp->s_lock, flags);
1692 			qp->r_nak_state = 0;
1693 			qp->r_ack_psn = qp->s_ack_queue[i].psn - 1;
1694 			goto send_ack;
1695 		}
1696 		/*
1697 		 * Resend the RDMA read or atomic op which
1698 		 * ACKs this duplicate request.
1699 		 */
1700 		qp->s_tail_ack_queue = i;
1701 		break;
1702 	}
1703 	qp->s_ack_state = OP(ACKNOWLEDGE);
1704 	qp->s_flags |= RVT_S_RESP_PENDING;
1705 	qp->r_nak_state = 0;
1706 	qib_schedule_send(qp);
1707 
1708 unlock_done:
1709 	spin_unlock_irqrestore(&qp->s_lock, flags);
1710 done:
1711 	return 1;
1712 
1713 send_ack:
1714 	return 0;
1715 }
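
/*
 * Editor's sketch, not part of the upstream driver: the duplicate-request
 * scan in qib_rc_rcv_error() walks the ACK queue backwards from
 * r_head_ack_queue, wrapping from entry 0 back to QIB_MAX_RDMA_ATOMIC,
 * until it finds the entry the duplicate PSN falls into or comes all the
 * way around to the head again.  The wrap rule, as a hypothetical helper:
 */
static unsigned __maybe_unused prev_ack_queue_index(unsigned i)
{
	return i ? i - 1 : QIB_MAX_RDMA_ATOMIC;
}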
1716 
1717 static inline void qib_update_ack_queue(struct rvt_qp *qp, unsigned n)
1718 {
1719 	unsigned next;
1720 
1721 	next = n + 1;
1722 	if (next > QIB_MAX_RDMA_ATOMIC)
1723 		next = 0;
1724 	qp->s_tail_ack_queue = next;
1725 	qp->s_ack_state = OP(ACKNOWLEDGE);
1726 }
1727 
1728 /**
1729  * qib_rc_rcv - process an incoming RC packet
1730  * @rcd: the context pointer
1731  * @hdr: the header of this packet
1732  * @has_grh: true if the header has a GRH
1733  * @data: the packet data
1734  * @tlen: the packet length
1735  * @qp: the QP for this packet
1736  *
1737  * This is called from qib_qp_rcv() to process an incoming RC packet
1738  * for the given QP.
1739  * Called at interrupt level.
1740  */
1741 void qib_rc_rcv(struct qib_ctxtdata *rcd, struct ib_header *hdr,
1742 		int has_grh, void *data, u32 tlen, struct rvt_qp *qp)
1743 {
1744 	struct qib_ibport *ibp = &rcd->ppd->ibport_data;
1745 	struct ib_other_headers *ohdr;
1746 	u32 opcode;
1747 	u32 hdrsize;
1748 	u32 psn;
1749 	u32 pad;
1750 	struct ib_wc wc;
1751 	u32 pmtu = qp->pmtu;
1752 	int diff;
1753 	struct ib_reth *reth;
1754 	unsigned long flags;
1755 	int ret;
1756 
1757 	/* Check for GRH */
1758 	if (!has_grh) {
1759 		ohdr = &hdr->u.oth;
1760 		hdrsize = 8 + 12;       /* LRH + BTH */
1761 	} else {
1762 		ohdr = &hdr->u.l.oth;
1763 		hdrsize = 8 + 40 + 12;  /* LRH + GRH + BTH */
1764 	}
1765 
1766 	opcode = be32_to_cpu(ohdr->bth[0]);
1767 	if (qib_ruc_check_hdr(ibp, hdr, has_grh, qp, opcode))
1768 		return;
1769 
1770 	psn = be32_to_cpu(ohdr->bth[2]);
1771 	opcode >>= 24;
1772 
1773 	/*
1774 	 * Process responses (ACKs) before anything else.  Note that the
1775 	 * packet sequence number will be for something in the send work
1776 	 * queue rather than the expected receive packet sequence number.
1777 	 * In other words, this QP is the requester.
1778 	 */
1779 	if (opcode >= OP(RDMA_READ_RESPONSE_FIRST) &&
1780 	    opcode <= OP(ATOMIC_ACKNOWLEDGE)) {
1781 		qib_rc_rcv_resp(ibp, ohdr, data, tlen, qp, opcode, psn,
1782 				hdrsize, pmtu, rcd);
1783 		return;
1784 	}
1785 
1786 	/* Compute 24 bits worth of difference. */
1787 	diff = qib_cmp24(psn, qp->r_psn);
1788 	if (unlikely(diff)) {
1789 		if (qib_rc_rcv_error(ohdr, data, qp, opcode, psn, diff, rcd))
1790 			return;
1791 		goto send_ack;
1792 	}
1793 
1794 	/* Check for opcode sequence errors. */
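	/*
	 * While a multi-packet SEND or RDMA WRITE is in progress, only
	 * the matching MIDDLE/LAST opcodes are legal.  From any other
	 * state, a MIDDLE or LAST opcode has no FIRST to pair with.
	 */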
1795 	switch (qp->r_state) {
1796 	case OP(SEND_FIRST):
1797 	case OP(SEND_MIDDLE):
1798 		if (opcode == OP(SEND_MIDDLE) ||
1799 		    opcode == OP(SEND_LAST) ||
1800 		    opcode == OP(SEND_LAST_WITH_IMMEDIATE))
1801 			break;
1802 		goto nack_inv;
1803 
1804 	case OP(RDMA_WRITE_FIRST):
1805 	case OP(RDMA_WRITE_MIDDLE):
1806 		if (opcode == OP(RDMA_WRITE_MIDDLE) ||
1807 		    opcode == OP(RDMA_WRITE_LAST) ||
1808 		    opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
1809 			break;
1810 		goto nack_inv;
1811 
1812 	default:
1813 		if (opcode == OP(SEND_MIDDLE) ||
1814 		    opcode == OP(SEND_LAST) ||
1815 		    opcode == OP(SEND_LAST_WITH_IMMEDIATE) ||
1816 		    opcode == OP(RDMA_WRITE_MIDDLE) ||
1817 		    opcode == OP(RDMA_WRITE_LAST) ||
1818 		    opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
1819 			goto nack_inv;
1820 		/*
1821 		 * Note that it is up to the requester to not send a new
1822 		 * RDMA read or atomic operation before receiving an ACK
1823 		 * for the previous operation.
1824 		 */
1825 		break;
1826 	}
1827 
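	/*
	 * The first valid request received while still in RTR means the
	 * connection is established; rvt_comm_est() reports this to the
	 * ULP via the communication-established event.
	 */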
1828 	if (qp->state == IB_QPS_RTR && !(qp->r_flags & RVT_R_COMM_EST))
1829 		rvt_comm_est(qp);
1830 
1831 	/* OK, process the packet. */
1832 	switch (opcode) {
1833 	case OP(SEND_FIRST):
1834 		ret = qib_get_rwqe(qp, 0);
1835 		if (ret < 0)
1836 			goto nack_op_err;
1837 		if (!ret)
1838 			goto rnr_nak;
1839 		qp->r_rcv_len = 0;
1840 		/* FALLTHROUGH */
1841 	case OP(SEND_MIDDLE):
1842 	case OP(RDMA_WRITE_MIDDLE):
1843 send_middle:
1844 		/* Check for invalid length: one PMTU payload, within the posted RWQE. */
1845 		if (unlikely(tlen != (hdrsize + pmtu + 4)))
1846 			goto nack_inv;
1847 		qp->r_rcv_len += pmtu;
1848 		if (unlikely(qp->r_rcv_len > qp->r_len))
1849 			goto nack_inv;
1850 		qib_copy_sge(&qp->r_sge, data, pmtu, 1);
1851 		break;
1852 
1853 	case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE):
1854 		/* consume RWQE */
1855 		ret = qib_get_rwqe(qp, 1);
1856 		if (ret < 0)
1857 			goto nack_op_err;
1858 		if (!ret)
1859 			goto rnr_nak;
1860 		goto send_last_imm;
1861 
1862 	case OP(SEND_ONLY):
1863 	case OP(SEND_ONLY_WITH_IMMEDIATE):
1864 		ret = qib_get_rwqe(qp, 0);
1865 		if (ret < 0)
1866 			goto nack_op_err;
1867 		if (!ret)
1868 			goto rnr_nak;
1869 		qp->r_rcv_len = 0;
1870 		if (opcode == OP(SEND_ONLY))
1871 			goto no_immediate_data;
1872 		/* fall through -- for SEND_ONLY_WITH_IMMEDIATE */
1873 	case OP(SEND_LAST_WITH_IMMEDIATE):
1874 send_last_imm:
1875 		wc.ex.imm_data = ohdr->u.imm_data;
1876 		hdrsize += 4;
1877 		wc.wc_flags = IB_WC_WITH_IMM;
1878 		goto send_last;
1879 	case OP(SEND_LAST):
1880 	case OP(RDMA_WRITE_LAST):
1881 no_immediate_data:
1882 		wc.wc_flags = 0;
1883 		wc.ex.imm_data = 0;
1884 send_last:
1885 		/* Get the number of bytes the message was padded by. */
1886 		pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
1887 		/* Check for invalid length. */
1888 		/* XXX LAST len should be >= 1 */
1889 		if (unlikely(tlen < (hdrsize + pad + 4)))
1890 			goto nack_inv;
1891 		/* Don't count the header, pad bytes or ICRC in the payload. */
1892 		tlen -= (hdrsize + pad + 4);
1893 		wc.byte_len = tlen + qp->r_rcv_len;
1894 		if (unlikely(wc.byte_len > qp->r_len))
1895 			goto nack_inv;
1896 		qib_copy_sge(&qp->r_sge, data, tlen, 1);
1897 		rvt_put_ss(&qp->r_sge);
1898 		qp->r_msn++;
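		/*
		 * Only generate a receive completion if an RWQE was
		 * consumed; a plain RDMA WRITE does not use one, so
		 * RVT_R_WRID_VALID is not set for it.
		 */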
1899 		if (!test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags))
1900 			break;
1901 		wc.wr_id = qp->r_wr_id;
1902 		wc.status = IB_WC_SUCCESS;
1903 		if (opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE) ||
1904 		    opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE))
1905 			wc.opcode = IB_WC_RECV_RDMA_WITH_IMM;
1906 		else
1907 			wc.opcode = IB_WC_RECV;
1908 		wc.qp = &qp->ibqp;
1909 		wc.src_qp = qp->remote_qpn;
1910 		wc.slid = rdma_ah_get_dlid(&qp->remote_ah_attr);
1911 		wc.sl = rdma_ah_get_sl(&qp->remote_ah_attr);
1912 		/* zero fields that are N/A */
1913 		wc.vendor_err = 0;
1914 		wc.pkey_index = 0;
1915 		wc.dlid_path_bits = 0;
1916 		wc.port_num = 0;
1917 		/* Signal completion event if the solicited bit is set. */
1918 		rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc,
1919 			     (ohdr->bth[0] &
1920 			      cpu_to_be32(IB_BTH_SOLICITED)) != 0);
1921 		break;
1922 
1923 	case OP(RDMA_WRITE_FIRST):
1924 	case OP(RDMA_WRITE_ONLY):
1925 	case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE):
1926 		if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
1927 			goto nack_inv;
1928 		/* consume RWQE */
1929 		/* set up the RDMA write target from the RETH */
1930 		hdrsize += sizeof(*reth);
1931 		qp->r_len = be32_to_cpu(reth->length);
1932 		qp->r_rcv_len = 0;
1933 		qp->r_sge.sg_list = NULL;
1934 		if (qp->r_len != 0) {
1935 			u32 rkey = be32_to_cpu(reth->rkey);
1936 			u64 vaddr = be64_to_cpu(reth->vaddr);
1937 			int ok;
1938 
1939 			/* Check rkey & NAK */
1940 			ok = rvt_rkey_ok(qp, &qp->r_sge.sge, qp->r_len, vaddr,
1941 					 rkey, IB_ACCESS_REMOTE_WRITE);
1942 			if (unlikely(!ok))
1943 				goto nack_acc;
1944 			qp->r_sge.num_sge = 1;
1945 		} else {
1946 			qp->r_sge.num_sge = 0;
1947 			qp->r_sge.sge.mr = NULL;
1948 			qp->r_sge.sge.vaddr = NULL;
1949 			qp->r_sge.sge.length = 0;
1950 			qp->r_sge.sge.sge_length = 0;
1951 		}
1952 		if (opcode == OP(RDMA_WRITE_FIRST))
1953 			goto send_middle;
1954 		else if (opcode == OP(RDMA_WRITE_ONLY))
1955 			goto no_immediate_data;
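		/*
		 * Only RDMA_WRITE_ONLY_WITH_IMMEDIATE gets here; it
		 * consumes an RWQE so the immediate data can be reported
		 * in a receive completion.
		 */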
1956 		ret = qib_get_rwqe(qp, 1);
1957 		if (ret < 0)
1958 			goto nack_op_err;
1959 		if (!ret) {
1960 			rvt_put_ss(&qp->r_sge);
1961 			goto rnr_nak;
1962 		}
1963 		wc.ex.imm_data = ohdr->u.rc.imm_data;
1964 		hdrsize += 4;
1965 		wc.wc_flags = IB_WC_WITH_IMM;
1966 		goto send_last;
1967 
1968 	case OP(RDMA_READ_REQUEST): {
1969 		struct rvt_ack_entry *e;
1970 		u32 len;
1971 		u8 next;
1972 
1973 		if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ)))
1974 			goto nack_inv;
1975 		next = qp->r_head_ack_queue + 1;
1976 		/* s_ack_queue is size QIB_MAX_RDMA_ATOMIC+1 so use > not >= */
1977 		if (next > QIB_MAX_RDMA_ATOMIC)
1978 			next = 0;
1979 		spin_lock_irqsave(&qp->s_lock, flags);
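		/*
		 * If the new entry would catch up with the tail, the ACK
		 * queue is full.  Reuse the oldest entry only if its
		 * response has already been sent; otherwise NAK the
		 * request as invalid.
		 */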
1980 		if (unlikely(next == qp->s_tail_ack_queue)) {
1981 			if (!qp->s_ack_queue[next].sent)
1982 				goto nack_inv_unlck;
1983 			qib_update_ack_queue(qp, next);
1984 		}
1985 		e = &qp->s_ack_queue[qp->r_head_ack_queue];
1986 		if (e->opcode == OP(RDMA_READ_REQUEST) && e->rdma_sge.mr) {
1987 			rvt_put_mr(e->rdma_sge.mr);
1988 			e->rdma_sge.mr = NULL;
1989 		}
1990 		reth = &ohdr->u.rc.reth;
1991 		len = be32_to_cpu(reth->length);
1992 		if (len) {
1993 			u32 rkey = be32_to_cpu(reth->rkey);
1994 			u64 vaddr = be64_to_cpu(reth->vaddr);
1995 			int ok;
1996 
1997 			/* Check rkey & NAK */
1998 			ok = rvt_rkey_ok(qp, &e->rdma_sge, len, vaddr,
1999 					 rkey, IB_ACCESS_REMOTE_READ);
2000 			if (unlikely(!ok))
2001 				goto nack_acc_unlck;
2002 			/*
2003 			 * Update the next expected PSN.  We add 1 later
2004 			 * below, so only add the remainder here.
2005 			 */
2006 			qp->r_psn += rvt_div_mtu(qp, len - 1);
2007 		} else {
2008 			e->rdma_sge.mr = NULL;
2009 			e->rdma_sge.vaddr = NULL;
2010 			e->rdma_sge.length = 0;
2011 			e->rdma_sge.sge_length = 0;
2012 		}
2013 		e->opcode = opcode;
2014 		e->sent = 0;
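		/*
		 * e->psn and e->lpsn bracket the PSNs of the read
		 * response: the request PSN and the PSN of the last
		 * response packet, respectively.
		 */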
2015 		e->psn = psn;
2016 		e->lpsn = qp->r_psn;
2017 		/*
2018 		 * We need to increment the MSN here instead of when we
2019 		 * finish sending the result since a duplicate request would
2020 		 * increment it more than once.
2021 		 */
2022 		qp->r_msn++;
2023 		qp->r_psn++;
2024 		qp->r_state = opcode;
2025 		qp->r_nak_state = 0;
2026 		qp->r_head_ack_queue = next;
2027 
2028 		/* Schedule the send tasklet. */
2029 		qp->s_flags |= RVT_S_RESP_PENDING;
2030 		qib_schedule_send(qp);
2031 
2032 		goto sunlock;
2033 	}
2034 
2035 	case OP(COMPARE_SWAP):
2036 	case OP(FETCH_ADD): {
2037 		struct ib_atomic_eth *ateth;
2038 		struct rvt_ack_entry *e;
2039 		u64 vaddr;
2040 		atomic64_t *maddr;
2041 		u64 sdata;
2042 		u32 rkey;
2043 		u8 next;
2044 
2045 		if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC)))
2046 			goto nack_inv;
2047 		next = qp->r_head_ack_queue + 1;
2048 		if (next > QIB_MAX_RDMA_ATOMIC)
2049 			next = 0;
2050 		spin_lock_irqsave(&qp->s_lock, flags);
2051 		if (unlikely(next == qp->s_tail_ack_queue)) {
2052 			if (!qp->s_ack_queue[next].sent)
2053 				goto nack_inv_unlck;
2054 			qib_update_ack_queue(qp, next);
2055 		}
2056 		e = &qp->s_ack_queue[qp->r_head_ack_queue];
2057 		if (e->opcode == OP(RDMA_READ_REQUEST) && e->rdma_sge.mr) {
2058 			rvt_put_mr(e->rdma_sge.mr);
2059 			e->rdma_sge.mr = NULL;
2060 		}
2061 		ateth = &ohdr->u.atomic_eth;
2062 		vaddr = get_ib_ateth_vaddr(ateth);
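		/* The target of an atomic operation must be 8-byte aligned. */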
2063 		if (unlikely(vaddr & (sizeof(u64) - 1)))
2064 			goto nack_inv_unlck;
2065 		rkey = be32_to_cpu(ateth->rkey);
2066 		/* Check rkey & NAK */
2067 		if (unlikely(!rvt_rkey_ok(qp, &qp->r_sge.sge, sizeof(u64),
2068 					  vaddr, rkey,
2069 					  IB_ACCESS_REMOTE_ATOMIC)))
2070 			goto nack_acc_unlck;
2071 		/* Perform atomic OP and save result. */
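		/*
		 * Either way, the value saved for the ATOMIC_ACKNOWLEDGE is
		 * the original contents of the target: atomic64_add_return()
		 * minus the addend for FETCH_ADD, or the old value returned
		 * by cmpxchg() for COMPARE_SWAP.
		 */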
2072 		maddr = (atomic64_t *) qp->r_sge.sge.vaddr;
2073 		sdata = get_ib_ateth_swap(ateth);
2074 		e->atomic_data = (opcode == OP(FETCH_ADD)) ?
2075 			(u64) atomic64_add_return(sdata, maddr) - sdata :
2076 			(u64) cmpxchg((u64 *) qp->r_sge.sge.vaddr,
2077 				      get_ib_ateth_compare(ateth),
2078 				      sdata);
2079 		rvt_put_mr(qp->r_sge.sge.mr);
2080 		qp->r_sge.num_sge = 0;
2081 		e->opcode = opcode;
2082 		e->sent = 0;
2083 		e->psn = psn;
2084 		e->lpsn = psn;
2085 		qp->r_msn++;
2086 		qp->r_psn++;
2087 		qp->r_state = opcode;
2088 		qp->r_nak_state = 0;
2089 		qp->r_head_ack_queue = next;
2090 
2091 		/* Schedule the send tasklet. */
2092 		qp->s_flags |= RVT_S_RESP_PENDING;
2093 		qib_schedule_send(qp);
2094 
2095 		goto sunlock;
2096 	}
2097 
2098 	default:
2099 		/* NAK unknown opcodes. */
2100 		goto nack_inv;
2101 	}
2102 	qp->r_psn++;
2103 	qp->r_state = opcode;
2104 	qp->r_ack_psn = psn;
2105 	qp->r_nak_state = 0;
2106 	/* Send an ACK if requested or required. */
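	/* Bit 31 of the BTH PSN word is the AckReq bit. */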
2107 	if (psn & (1 << 31))
2108 		goto send_ack;
2109 	return;
2110 
2111 rnr_nak:
2112 	qp->r_nak_state = IB_RNR_NAK | qp->r_min_rnr_timer;
2113 	qp->r_ack_psn = qp->r_psn;
2114 	/* Queue RNR NAK for later */
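	/*
	 * Setting RVT_R_RSP_NAK and queueing the QP on the context's wait
	 * list defers the NAK until the receive path finishes processing
	 * the current batch of packets.
	 */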
2115 	if (list_empty(&qp->rspwait)) {
2116 		qp->r_flags |= RVT_R_RSP_NAK;
2117 		rvt_get_qp(qp);
2118 		list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
2119 	}
2120 	return;
2121 
2122 nack_op_err:
2123 	rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
2124 	qp->r_nak_state = IB_NAK_REMOTE_OPERATIONAL_ERROR;
2125 	qp->r_ack_psn = qp->r_psn;
2126 	/* Queue NAK for later */
2127 	if (list_empty(&qp->rspwait)) {
2128 		qp->r_flags |= RVT_R_RSP_NAK;
2129 		rvt_get_qp(qp);
2130 		list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
2131 	}
2132 	return;
2133 
2134 nack_inv_unlck:
2135 	spin_unlock_irqrestore(&qp->s_lock, flags);
2136 nack_inv:
2137 	rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
2138 	qp->r_nak_state = IB_NAK_INVALID_REQUEST;
2139 	qp->r_ack_psn = qp->r_psn;
2140 	/* Queue NAK for later */
2141 	if (list_empty(&qp->rspwait)) {
2142 		qp->r_flags |= RVT_R_RSP_NAK;
2143 		rvt_get_qp(qp);
2144 		list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
2145 	}
2146 	return;
2147 
2148 nack_acc_unlck:
2149 	spin_unlock_irqrestore(&qp->s_lock, flags);
2150 nack_acc:
2151 	rvt_rc_error(qp, IB_WC_LOC_PROT_ERR);
2152 	qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR;
2153 	qp->r_ack_psn = qp->r_psn;
2154 send_ack:
2155 	qib_send_rc_ack(qp);
2156 	return;
2157 
2158 sunlock:
2159 	spin_unlock_irqrestore(&qp->s_lock, flags);
2160 }
2161