xref: /openbmc/linux/drivers/infiniband/hw/qib/qib_rc.c (revision 615c36f5)
1 /*
2  * Copyright (c) 2006, 2007, 2008, 2009 QLogic Corporation. All rights reserved.
3  * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
4  *
5  * This software is available to you under a choice of one of two
6  * licenses.  You may choose to be licensed under the terms of the GNU
7  * General Public License (GPL) Version 2, available from the file
8  * COPYING in the main directory of this source tree, or the
9  * OpenIB.org BSD license below:
10  *
11  *     Redistribution and use in source and binary forms, with or
12  *     without modification, are permitted provided that the following
13  *     conditions are met:
14  *
15  *      - Redistributions of source code must retain the above
16  *        copyright notice, this list of conditions and the following
17  *        disclaimer.
18  *
19  *      - Redistributions in binary form must reproduce the above
20  *        copyright notice, this list of conditions and the following
21  *        disclaimer in the documentation and/or other materials
22  *        provided with the distribution.
23  *
24  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
25  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
26  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
27  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
28  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
29  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
30  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
31  * SOFTWARE.
32  */
33 
34 #include <linux/io.h>
35 
36 #include "qib.h"
37 
/*
 * Shorthand for the verbose RC opcode constants:
 * OP(FOO) expands to IB_OPCODE_RC_FOO.
 */
#define OP(x) IB_OPCODE_RC_##x

/* Forward declaration: start_timer() installs this as the timer handler. */
static void rc_timeout(unsigned long arg);
42 
43 static u32 restart_sge(struct qib_sge_state *ss, struct qib_swqe *wqe,
44 		       u32 psn, u32 pmtu)
45 {
46 	u32 len;
47 
48 	len = ((psn - wqe->psn) & QIB_PSN_MASK) * pmtu;
49 	ss->sge = wqe->sg_list[0];
50 	ss->sg_list = wqe->sg_list + 1;
51 	ss->num_sge = wqe->wr.num_sge;
52 	ss->total_len = wqe->length;
53 	qib_skip_sge(ss, len, 0);
54 	return wqe->length - len;
55 }
56 
57 static void start_timer(struct qib_qp *qp)
58 {
59 	qp->s_flags |= QIB_S_TIMER;
60 	qp->s_timer.function = rc_timeout;
61 	/* 4.096 usec. * (1 << qp->timeout) */
62 	qp->s_timer.expires = jiffies + qp->timeout_jiffies;
63 	add_timer(&qp->s_timer);
64 }
65 
66 /**
67  * qib_make_rc_ack - construct a response packet (ACK, NAK, or RDMA read)
68  * @dev: the device for this QP
69  * @qp: a pointer to the QP
70  * @ohdr: a pointer to the IB header being constructed
71  * @pmtu: the path MTU
72  *
73  * Return 1 if constructed; otherwise, return 0.
74  * Note that we are in the responder's side of the QP context.
75  * Note the QP s_lock must be held.
76  */
77 static int qib_make_rc_ack(struct qib_ibdev *dev, struct qib_qp *qp,
78 			   struct qib_other_headers *ohdr, u32 pmtu)
79 {
80 	struct qib_ack_entry *e;
81 	u32 hwords;
82 	u32 len;
83 	u32 bth0;
84 	u32 bth2;
85 
86 	/* Don't send an ACK if we aren't supposed to. */
87 	if (!(ib_qib_state_ops[qp->state] & QIB_PROCESS_RECV_OK))
88 		goto bail;
89 
90 	/* header size in 32-bit words LRH+BTH = (8+12)/4. */
91 	hwords = 5;
92 
93 	switch (qp->s_ack_state) {
94 	case OP(RDMA_READ_RESPONSE_LAST):
95 	case OP(RDMA_READ_RESPONSE_ONLY):
96 		e = &qp->s_ack_queue[qp->s_tail_ack_queue];
97 		if (e->rdma_sge.mr) {
98 			atomic_dec(&e->rdma_sge.mr->refcount);
99 			e->rdma_sge.mr = NULL;
100 		}
101 		/* FALLTHROUGH */
102 	case OP(ATOMIC_ACKNOWLEDGE):
103 		/*
104 		 * We can increment the tail pointer now that the last
105 		 * response has been sent instead of only being
106 		 * constructed.
107 		 */
108 		if (++qp->s_tail_ack_queue > QIB_MAX_RDMA_ATOMIC)
109 			qp->s_tail_ack_queue = 0;
110 		/* FALLTHROUGH */
111 	case OP(SEND_ONLY):
112 	case OP(ACKNOWLEDGE):
113 		/* Check for no next entry in the queue. */
114 		if (qp->r_head_ack_queue == qp->s_tail_ack_queue) {
115 			if (qp->s_flags & QIB_S_ACK_PENDING)
116 				goto normal;
117 			goto bail;
118 		}
119 
120 		e = &qp->s_ack_queue[qp->s_tail_ack_queue];
121 		if (e->opcode == OP(RDMA_READ_REQUEST)) {
122 			/*
123 			 * If a RDMA read response is being resent and
124 			 * we haven't seen the duplicate request yet,
125 			 * then stop sending the remaining responses the
126 			 * responder has seen until the requester resends it.
127 			 */
128 			len = e->rdma_sge.sge_length;
129 			if (len && !e->rdma_sge.mr) {
130 				qp->s_tail_ack_queue = qp->r_head_ack_queue;
131 				goto bail;
132 			}
133 			/* Copy SGE state in case we need to resend */
134 			qp->s_rdma_mr = e->rdma_sge.mr;
135 			if (qp->s_rdma_mr)
136 				atomic_inc(&qp->s_rdma_mr->refcount);
137 			qp->s_ack_rdma_sge.sge = e->rdma_sge;
138 			qp->s_ack_rdma_sge.num_sge = 1;
139 			qp->s_cur_sge = &qp->s_ack_rdma_sge;
140 			if (len > pmtu) {
141 				len = pmtu;
142 				qp->s_ack_state = OP(RDMA_READ_RESPONSE_FIRST);
143 			} else {
144 				qp->s_ack_state = OP(RDMA_READ_RESPONSE_ONLY);
145 				e->sent = 1;
146 			}
147 			ohdr->u.aeth = qib_compute_aeth(qp);
148 			hwords++;
149 			qp->s_ack_rdma_psn = e->psn;
150 			bth2 = qp->s_ack_rdma_psn++ & QIB_PSN_MASK;
151 		} else {
152 			/* COMPARE_SWAP or FETCH_ADD */
153 			qp->s_cur_sge = NULL;
154 			len = 0;
155 			qp->s_ack_state = OP(ATOMIC_ACKNOWLEDGE);
156 			ohdr->u.at.aeth = qib_compute_aeth(qp);
157 			ohdr->u.at.atomic_ack_eth[0] =
158 				cpu_to_be32(e->atomic_data >> 32);
159 			ohdr->u.at.atomic_ack_eth[1] =
160 				cpu_to_be32(e->atomic_data);
161 			hwords += sizeof(ohdr->u.at) / sizeof(u32);
162 			bth2 = e->psn & QIB_PSN_MASK;
163 			e->sent = 1;
164 		}
165 		bth0 = qp->s_ack_state << 24;
166 		break;
167 
168 	case OP(RDMA_READ_RESPONSE_FIRST):
169 		qp->s_ack_state = OP(RDMA_READ_RESPONSE_MIDDLE);
170 		/* FALLTHROUGH */
171 	case OP(RDMA_READ_RESPONSE_MIDDLE):
172 		qp->s_cur_sge = &qp->s_ack_rdma_sge;
173 		qp->s_rdma_mr = qp->s_ack_rdma_sge.sge.mr;
174 		if (qp->s_rdma_mr)
175 			atomic_inc(&qp->s_rdma_mr->refcount);
176 		len = qp->s_ack_rdma_sge.sge.sge_length;
177 		if (len > pmtu)
178 			len = pmtu;
179 		else {
180 			ohdr->u.aeth = qib_compute_aeth(qp);
181 			hwords++;
182 			qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST);
183 			e = &qp->s_ack_queue[qp->s_tail_ack_queue];
184 			e->sent = 1;
185 		}
186 		bth0 = qp->s_ack_state << 24;
187 		bth2 = qp->s_ack_rdma_psn++ & QIB_PSN_MASK;
188 		break;
189 
190 	default:
191 normal:
192 		/*
193 		 * Send a regular ACK.
194 		 * Set the s_ack_state so we wait until after sending
195 		 * the ACK before setting s_ack_state to ACKNOWLEDGE
196 		 * (see above).
197 		 */
198 		qp->s_ack_state = OP(SEND_ONLY);
199 		qp->s_flags &= ~QIB_S_ACK_PENDING;
200 		qp->s_cur_sge = NULL;
201 		if (qp->s_nak_state)
202 			ohdr->u.aeth =
203 				cpu_to_be32((qp->r_msn & QIB_MSN_MASK) |
204 					    (qp->s_nak_state <<
205 					     QIB_AETH_CREDIT_SHIFT));
206 		else
207 			ohdr->u.aeth = qib_compute_aeth(qp);
208 		hwords++;
209 		len = 0;
210 		bth0 = OP(ACKNOWLEDGE) << 24;
211 		bth2 = qp->s_ack_psn & QIB_PSN_MASK;
212 	}
213 	qp->s_rdma_ack_cnt++;
214 	qp->s_hdrwords = hwords;
215 	qp->s_cur_size = len;
216 	qib_make_ruc_header(qp, ohdr, bth0, bth2);
217 	return 1;
218 
219 bail:
220 	qp->s_ack_state = OP(ACKNOWLEDGE);
221 	qp->s_flags &= ~(QIB_S_RESP_PENDING | QIB_S_ACK_PENDING);
222 	return 0;
223 }
224 
225 /**
226  * qib_make_rc_req - construct a request packet (SEND, RDMA r/w, ATOMIC)
227  * @qp: a pointer to the QP
228  *
229  * Return 1 if constructed; otherwise, return 0.
230  */
231 int qib_make_rc_req(struct qib_qp *qp)
232 {
233 	struct qib_ibdev *dev = to_idev(qp->ibqp.device);
234 	struct qib_other_headers *ohdr;
235 	struct qib_sge_state *ss;
236 	struct qib_swqe *wqe;
237 	u32 hwords;
238 	u32 len;
239 	u32 bth0;
240 	u32 bth2;
241 	u32 pmtu = qp->pmtu;
242 	char newreq;
243 	unsigned long flags;
244 	int ret = 0;
245 	int delta;
246 
247 	ohdr = &qp->s_hdr.u.oth;
248 	if (qp->remote_ah_attr.ah_flags & IB_AH_GRH)
249 		ohdr = &qp->s_hdr.u.l.oth;
250 
251 	/*
252 	 * The lock is needed to synchronize between the sending tasklet,
253 	 * the receive interrupt handler, and timeout resends.
254 	 */
255 	spin_lock_irqsave(&qp->s_lock, flags);
256 
257 	/* Sending responses has higher priority over sending requests. */
258 	if ((qp->s_flags & QIB_S_RESP_PENDING) &&
259 	    qib_make_rc_ack(dev, qp, ohdr, pmtu))
260 		goto done;
261 
262 	if (!(ib_qib_state_ops[qp->state] & QIB_PROCESS_SEND_OK)) {
263 		if (!(ib_qib_state_ops[qp->state] & QIB_FLUSH_SEND))
264 			goto bail;
265 		/* We are in the error state, flush the work request. */
266 		if (qp->s_last == qp->s_head)
267 			goto bail;
268 		/* If DMAs are in progress, we can't flush immediately. */
269 		if (atomic_read(&qp->s_dma_busy)) {
270 			qp->s_flags |= QIB_S_WAIT_DMA;
271 			goto bail;
272 		}
273 		wqe = get_swqe_ptr(qp, qp->s_last);
274 		qib_send_complete(qp, wqe, qp->s_last != qp->s_acked ?
275 			IB_WC_SUCCESS : IB_WC_WR_FLUSH_ERR);
276 		/* will get called again */
277 		goto done;
278 	}
279 
280 	if (qp->s_flags & (QIB_S_WAIT_RNR | QIB_S_WAIT_ACK))
281 		goto bail;
282 
283 	if (qib_cmp24(qp->s_psn, qp->s_sending_hpsn) <= 0) {
284 		if (qib_cmp24(qp->s_sending_psn, qp->s_sending_hpsn) <= 0) {
285 			qp->s_flags |= QIB_S_WAIT_PSN;
286 			goto bail;
287 		}
288 		qp->s_sending_psn = qp->s_psn;
289 		qp->s_sending_hpsn = qp->s_psn - 1;
290 	}
291 
292 	/* header size in 32-bit words LRH+BTH = (8+12)/4. */
293 	hwords = 5;
294 	bth0 = 0;
295 
296 	/* Send a request. */
297 	wqe = get_swqe_ptr(qp, qp->s_cur);
298 	switch (qp->s_state) {
299 	default:
300 		if (!(ib_qib_state_ops[qp->state] & QIB_PROCESS_NEXT_SEND_OK))
301 			goto bail;
302 		/*
303 		 * Resend an old request or start a new one.
304 		 *
305 		 * We keep track of the current SWQE so that
306 		 * we don't reset the "furthest progress" state
307 		 * if we need to back up.
308 		 */
309 		newreq = 0;
310 		if (qp->s_cur == qp->s_tail) {
311 			/* Check if send work queue is empty. */
312 			if (qp->s_tail == qp->s_head)
313 				goto bail;
314 			/*
315 			 * If a fence is requested, wait for previous
316 			 * RDMA read and atomic operations to finish.
317 			 */
318 			if ((wqe->wr.send_flags & IB_SEND_FENCE) &&
319 			    qp->s_num_rd_atomic) {
320 				qp->s_flags |= QIB_S_WAIT_FENCE;
321 				goto bail;
322 			}
323 			wqe->psn = qp->s_next_psn;
324 			newreq = 1;
325 		}
326 		/*
327 		 * Note that we have to be careful not to modify the
328 		 * original work request since we may need to resend
329 		 * it.
330 		 */
331 		len = wqe->length;
332 		ss = &qp->s_sge;
333 		bth2 = qp->s_psn & QIB_PSN_MASK;
334 		switch (wqe->wr.opcode) {
335 		case IB_WR_SEND:
336 		case IB_WR_SEND_WITH_IMM:
337 			/* If no credit, return. */
338 			if (!(qp->s_flags & QIB_S_UNLIMITED_CREDIT) &&
339 			    qib_cmp24(wqe->ssn, qp->s_lsn + 1) > 0) {
340 				qp->s_flags |= QIB_S_WAIT_SSN_CREDIT;
341 				goto bail;
342 			}
343 			wqe->lpsn = wqe->psn;
344 			if (len > pmtu) {
345 				wqe->lpsn += (len - 1) / pmtu;
346 				qp->s_state = OP(SEND_FIRST);
347 				len = pmtu;
348 				break;
349 			}
350 			if (wqe->wr.opcode == IB_WR_SEND)
351 				qp->s_state = OP(SEND_ONLY);
352 			else {
353 				qp->s_state = OP(SEND_ONLY_WITH_IMMEDIATE);
354 				/* Immediate data comes after the BTH */
355 				ohdr->u.imm_data = wqe->wr.ex.imm_data;
356 				hwords += 1;
357 			}
358 			if (wqe->wr.send_flags & IB_SEND_SOLICITED)
359 				bth0 |= IB_BTH_SOLICITED;
360 			bth2 |= IB_BTH_REQ_ACK;
361 			if (++qp->s_cur == qp->s_size)
362 				qp->s_cur = 0;
363 			break;
364 
365 		case IB_WR_RDMA_WRITE:
366 			if (newreq && !(qp->s_flags & QIB_S_UNLIMITED_CREDIT))
367 				qp->s_lsn++;
368 			/* FALLTHROUGH */
369 		case IB_WR_RDMA_WRITE_WITH_IMM:
370 			/* If no credit, return. */
371 			if (!(qp->s_flags & QIB_S_UNLIMITED_CREDIT) &&
372 			    qib_cmp24(wqe->ssn, qp->s_lsn + 1) > 0) {
373 				qp->s_flags |= QIB_S_WAIT_SSN_CREDIT;
374 				goto bail;
375 			}
376 			ohdr->u.rc.reth.vaddr =
377 				cpu_to_be64(wqe->wr.wr.rdma.remote_addr);
378 			ohdr->u.rc.reth.rkey =
379 				cpu_to_be32(wqe->wr.wr.rdma.rkey);
380 			ohdr->u.rc.reth.length = cpu_to_be32(len);
381 			hwords += sizeof(struct ib_reth) / sizeof(u32);
382 			wqe->lpsn = wqe->psn;
383 			if (len > pmtu) {
384 				wqe->lpsn += (len - 1) / pmtu;
385 				qp->s_state = OP(RDMA_WRITE_FIRST);
386 				len = pmtu;
387 				break;
388 			}
389 			if (wqe->wr.opcode == IB_WR_RDMA_WRITE)
390 				qp->s_state = OP(RDMA_WRITE_ONLY);
391 			else {
392 				qp->s_state =
393 					OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE);
394 				/* Immediate data comes after RETH */
395 				ohdr->u.rc.imm_data = wqe->wr.ex.imm_data;
396 				hwords += 1;
397 				if (wqe->wr.send_flags & IB_SEND_SOLICITED)
398 					bth0 |= IB_BTH_SOLICITED;
399 			}
400 			bth2 |= IB_BTH_REQ_ACK;
401 			if (++qp->s_cur == qp->s_size)
402 				qp->s_cur = 0;
403 			break;
404 
405 		case IB_WR_RDMA_READ:
406 			/*
407 			 * Don't allow more operations to be started
408 			 * than the QP limits allow.
409 			 */
410 			if (newreq) {
411 				if (qp->s_num_rd_atomic >=
412 				    qp->s_max_rd_atomic) {
413 					qp->s_flags |= QIB_S_WAIT_RDMAR;
414 					goto bail;
415 				}
416 				qp->s_num_rd_atomic++;
417 				if (!(qp->s_flags & QIB_S_UNLIMITED_CREDIT))
418 					qp->s_lsn++;
419 				/*
420 				 * Adjust s_next_psn to count the
421 				 * expected number of responses.
422 				 */
423 				if (len > pmtu)
424 					qp->s_next_psn += (len - 1) / pmtu;
425 				wqe->lpsn = qp->s_next_psn++;
426 			}
427 			ohdr->u.rc.reth.vaddr =
428 				cpu_to_be64(wqe->wr.wr.rdma.remote_addr);
429 			ohdr->u.rc.reth.rkey =
430 				cpu_to_be32(wqe->wr.wr.rdma.rkey);
431 			ohdr->u.rc.reth.length = cpu_to_be32(len);
432 			qp->s_state = OP(RDMA_READ_REQUEST);
433 			hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32);
434 			ss = NULL;
435 			len = 0;
436 			bth2 |= IB_BTH_REQ_ACK;
437 			if (++qp->s_cur == qp->s_size)
438 				qp->s_cur = 0;
439 			break;
440 
441 		case IB_WR_ATOMIC_CMP_AND_SWP:
442 		case IB_WR_ATOMIC_FETCH_AND_ADD:
443 			/*
444 			 * Don't allow more operations to be started
445 			 * than the QP limits allow.
446 			 */
447 			if (newreq) {
448 				if (qp->s_num_rd_atomic >=
449 				    qp->s_max_rd_atomic) {
450 					qp->s_flags |= QIB_S_WAIT_RDMAR;
451 					goto bail;
452 				}
453 				qp->s_num_rd_atomic++;
454 				if (!(qp->s_flags & QIB_S_UNLIMITED_CREDIT))
455 					qp->s_lsn++;
456 				wqe->lpsn = wqe->psn;
457 			}
458 			if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP) {
459 				qp->s_state = OP(COMPARE_SWAP);
460 				ohdr->u.atomic_eth.swap_data = cpu_to_be64(
461 					wqe->wr.wr.atomic.swap);
462 				ohdr->u.atomic_eth.compare_data = cpu_to_be64(
463 					wqe->wr.wr.atomic.compare_add);
464 			} else {
465 				qp->s_state = OP(FETCH_ADD);
466 				ohdr->u.atomic_eth.swap_data = cpu_to_be64(
467 					wqe->wr.wr.atomic.compare_add);
468 				ohdr->u.atomic_eth.compare_data = 0;
469 			}
470 			ohdr->u.atomic_eth.vaddr[0] = cpu_to_be32(
471 				wqe->wr.wr.atomic.remote_addr >> 32);
472 			ohdr->u.atomic_eth.vaddr[1] = cpu_to_be32(
473 				wqe->wr.wr.atomic.remote_addr);
474 			ohdr->u.atomic_eth.rkey = cpu_to_be32(
475 				wqe->wr.wr.atomic.rkey);
476 			hwords += sizeof(struct ib_atomic_eth) / sizeof(u32);
477 			ss = NULL;
478 			len = 0;
479 			bth2 |= IB_BTH_REQ_ACK;
480 			if (++qp->s_cur == qp->s_size)
481 				qp->s_cur = 0;
482 			break;
483 
484 		default:
485 			goto bail;
486 		}
487 		qp->s_sge.sge = wqe->sg_list[0];
488 		qp->s_sge.sg_list = wqe->sg_list + 1;
489 		qp->s_sge.num_sge = wqe->wr.num_sge;
490 		qp->s_sge.total_len = wqe->length;
491 		qp->s_len = wqe->length;
492 		if (newreq) {
493 			qp->s_tail++;
494 			if (qp->s_tail >= qp->s_size)
495 				qp->s_tail = 0;
496 		}
497 		if (wqe->wr.opcode == IB_WR_RDMA_READ)
498 			qp->s_psn = wqe->lpsn + 1;
499 		else {
500 			qp->s_psn++;
501 			if (qib_cmp24(qp->s_psn, qp->s_next_psn) > 0)
502 				qp->s_next_psn = qp->s_psn;
503 		}
504 		break;
505 
506 	case OP(RDMA_READ_RESPONSE_FIRST):
507 		/*
508 		 * qp->s_state is normally set to the opcode of the
509 		 * last packet constructed for new requests and therefore
510 		 * is never set to RDMA read response.
511 		 * RDMA_READ_RESPONSE_FIRST is used by the ACK processing
512 		 * thread to indicate a SEND needs to be restarted from an
513 		 * earlier PSN without interferring with the sending thread.
514 		 * See qib_restart_rc().
515 		 */
516 		qp->s_len = restart_sge(&qp->s_sge, wqe, qp->s_psn, pmtu);
517 		/* FALLTHROUGH */
518 	case OP(SEND_FIRST):
519 		qp->s_state = OP(SEND_MIDDLE);
520 		/* FALLTHROUGH */
521 	case OP(SEND_MIDDLE):
522 		bth2 = qp->s_psn++ & QIB_PSN_MASK;
523 		if (qib_cmp24(qp->s_psn, qp->s_next_psn) > 0)
524 			qp->s_next_psn = qp->s_psn;
525 		ss = &qp->s_sge;
526 		len = qp->s_len;
527 		if (len > pmtu) {
528 			len = pmtu;
529 			break;
530 		}
531 		if (wqe->wr.opcode == IB_WR_SEND)
532 			qp->s_state = OP(SEND_LAST);
533 		else {
534 			qp->s_state = OP(SEND_LAST_WITH_IMMEDIATE);
535 			/* Immediate data comes after the BTH */
536 			ohdr->u.imm_data = wqe->wr.ex.imm_data;
537 			hwords += 1;
538 		}
539 		if (wqe->wr.send_flags & IB_SEND_SOLICITED)
540 			bth0 |= IB_BTH_SOLICITED;
541 		bth2 |= IB_BTH_REQ_ACK;
542 		qp->s_cur++;
543 		if (qp->s_cur >= qp->s_size)
544 			qp->s_cur = 0;
545 		break;
546 
547 	case OP(RDMA_READ_RESPONSE_LAST):
548 		/*
549 		 * qp->s_state is normally set to the opcode of the
550 		 * last packet constructed for new requests and therefore
551 		 * is never set to RDMA read response.
552 		 * RDMA_READ_RESPONSE_LAST is used by the ACK processing
553 		 * thread to indicate a RDMA write needs to be restarted from
554 		 * an earlier PSN without interferring with the sending thread.
555 		 * See qib_restart_rc().
556 		 */
557 		qp->s_len = restart_sge(&qp->s_sge, wqe, qp->s_psn, pmtu);
558 		/* FALLTHROUGH */
559 	case OP(RDMA_WRITE_FIRST):
560 		qp->s_state = OP(RDMA_WRITE_MIDDLE);
561 		/* FALLTHROUGH */
562 	case OP(RDMA_WRITE_MIDDLE):
563 		bth2 = qp->s_psn++ & QIB_PSN_MASK;
564 		if (qib_cmp24(qp->s_psn, qp->s_next_psn) > 0)
565 			qp->s_next_psn = qp->s_psn;
566 		ss = &qp->s_sge;
567 		len = qp->s_len;
568 		if (len > pmtu) {
569 			len = pmtu;
570 			break;
571 		}
572 		if (wqe->wr.opcode == IB_WR_RDMA_WRITE)
573 			qp->s_state = OP(RDMA_WRITE_LAST);
574 		else {
575 			qp->s_state = OP(RDMA_WRITE_LAST_WITH_IMMEDIATE);
576 			/* Immediate data comes after the BTH */
577 			ohdr->u.imm_data = wqe->wr.ex.imm_data;
578 			hwords += 1;
579 			if (wqe->wr.send_flags & IB_SEND_SOLICITED)
580 				bth0 |= IB_BTH_SOLICITED;
581 		}
582 		bth2 |= IB_BTH_REQ_ACK;
583 		qp->s_cur++;
584 		if (qp->s_cur >= qp->s_size)
585 			qp->s_cur = 0;
586 		break;
587 
588 	case OP(RDMA_READ_RESPONSE_MIDDLE):
589 		/*
590 		 * qp->s_state is normally set to the opcode of the
591 		 * last packet constructed for new requests and therefore
592 		 * is never set to RDMA read response.
593 		 * RDMA_READ_RESPONSE_MIDDLE is used by the ACK processing
594 		 * thread to indicate a RDMA read needs to be restarted from
595 		 * an earlier PSN without interferring with the sending thread.
596 		 * See qib_restart_rc().
597 		 */
598 		len = ((qp->s_psn - wqe->psn) & QIB_PSN_MASK) * pmtu;
599 		ohdr->u.rc.reth.vaddr =
600 			cpu_to_be64(wqe->wr.wr.rdma.remote_addr + len);
601 		ohdr->u.rc.reth.rkey =
602 			cpu_to_be32(wqe->wr.wr.rdma.rkey);
603 		ohdr->u.rc.reth.length = cpu_to_be32(wqe->length - len);
604 		qp->s_state = OP(RDMA_READ_REQUEST);
605 		hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32);
606 		bth2 = (qp->s_psn & QIB_PSN_MASK) | IB_BTH_REQ_ACK;
607 		qp->s_psn = wqe->lpsn + 1;
608 		ss = NULL;
609 		len = 0;
610 		qp->s_cur++;
611 		if (qp->s_cur == qp->s_size)
612 			qp->s_cur = 0;
613 		break;
614 	}
615 	qp->s_sending_hpsn = bth2;
616 	delta = (((int) bth2 - (int) wqe->psn) << 8) >> 8;
617 	if (delta && delta % QIB_PSN_CREDIT == 0)
618 		bth2 |= IB_BTH_REQ_ACK;
619 	if (qp->s_flags & QIB_S_SEND_ONE) {
620 		qp->s_flags &= ~QIB_S_SEND_ONE;
621 		qp->s_flags |= QIB_S_WAIT_ACK;
622 		bth2 |= IB_BTH_REQ_ACK;
623 	}
624 	qp->s_len -= len;
625 	qp->s_hdrwords = hwords;
626 	qp->s_cur_sge = ss;
627 	qp->s_cur_size = len;
628 	qib_make_ruc_header(qp, ohdr, bth0 | (qp->s_state << 24), bth2);
629 done:
630 	ret = 1;
631 	goto unlock;
632 
633 bail:
634 	qp->s_flags &= ~QIB_S_BUSY;
635 unlock:
636 	spin_unlock_irqrestore(&qp->s_lock, flags);
637 	return ret;
638 }
639 
640 /**
641  * qib_send_rc_ack - Construct an ACK packet and send it
642  * @qp: a pointer to the QP
643  *
644  * This is called from qib_rc_rcv() and qib_kreceive().
645  * Note that RDMA reads and atomics are handled in the
646  * send side QP state and tasklet.
647  */
648 void qib_send_rc_ack(struct qib_qp *qp)
649 {
650 	struct qib_devdata *dd = dd_from_ibdev(qp->ibqp.device);
651 	struct qib_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
652 	struct qib_pportdata *ppd = ppd_from_ibp(ibp);
653 	u64 pbc;
654 	u16 lrh0;
655 	u32 bth0;
656 	u32 hwords;
657 	u32 pbufn;
658 	u32 __iomem *piobuf;
659 	struct qib_ib_header hdr;
660 	struct qib_other_headers *ohdr;
661 	u32 control;
662 	unsigned long flags;
663 
664 	spin_lock_irqsave(&qp->s_lock, flags);
665 
666 	if (!(ib_qib_state_ops[qp->state] & QIB_PROCESS_RECV_OK))
667 		goto unlock;
668 
669 	/* Don't send ACK or NAK if a RDMA read or atomic is pending. */
670 	if ((qp->s_flags & QIB_S_RESP_PENDING) || qp->s_rdma_ack_cnt)
671 		goto queue_ack;
672 
673 	/* Construct the header with s_lock held so APM doesn't change it. */
674 	ohdr = &hdr.u.oth;
675 	lrh0 = QIB_LRH_BTH;
676 	/* header size in 32-bit words LRH+BTH+AETH = (8+12+4)/4. */
677 	hwords = 6;
678 	if (unlikely(qp->remote_ah_attr.ah_flags & IB_AH_GRH)) {
679 		hwords += qib_make_grh(ibp, &hdr.u.l.grh,
680 				       &qp->remote_ah_attr.grh, hwords, 0);
681 		ohdr = &hdr.u.l.oth;
682 		lrh0 = QIB_LRH_GRH;
683 	}
684 	/* read pkey_index w/o lock (its atomic) */
685 	bth0 = qib_get_pkey(ibp, qp->s_pkey_index) | (OP(ACKNOWLEDGE) << 24);
686 	if (qp->s_mig_state == IB_MIG_MIGRATED)
687 		bth0 |= IB_BTH_MIG_REQ;
688 	if (qp->r_nak_state)
689 		ohdr->u.aeth = cpu_to_be32((qp->r_msn & QIB_MSN_MASK) |
690 					    (qp->r_nak_state <<
691 					     QIB_AETH_CREDIT_SHIFT));
692 	else
693 		ohdr->u.aeth = qib_compute_aeth(qp);
694 	lrh0 |= ibp->sl_to_vl[qp->remote_ah_attr.sl] << 12 |
695 		qp->remote_ah_attr.sl << 4;
696 	hdr.lrh[0] = cpu_to_be16(lrh0);
697 	hdr.lrh[1] = cpu_to_be16(qp->remote_ah_attr.dlid);
698 	hdr.lrh[2] = cpu_to_be16(hwords + SIZE_OF_CRC);
699 	hdr.lrh[3] = cpu_to_be16(ppd->lid | qp->remote_ah_attr.src_path_bits);
700 	ohdr->bth[0] = cpu_to_be32(bth0);
701 	ohdr->bth[1] = cpu_to_be32(qp->remote_qpn);
702 	ohdr->bth[2] = cpu_to_be32(qp->r_ack_psn & QIB_PSN_MASK);
703 
704 	spin_unlock_irqrestore(&qp->s_lock, flags);
705 
706 	/* Don't try to send ACKs if the link isn't ACTIVE */
707 	if (!(ppd->lflags & QIBL_LINKACTIVE))
708 		goto done;
709 
710 	control = dd->f_setpbc_control(ppd, hwords + SIZE_OF_CRC,
711 				       qp->s_srate, lrh0 >> 12);
712 	/* length is + 1 for the control dword */
713 	pbc = ((u64) control << 32) | (hwords + 1);
714 
715 	piobuf = dd->f_getsendbuf(ppd, pbc, &pbufn);
716 	if (!piobuf) {
717 		/*
718 		 * We are out of PIO buffers at the moment.
719 		 * Pass responsibility for sending the ACK to the
720 		 * send tasklet so that when a PIO buffer becomes
721 		 * available, the ACK is sent ahead of other outgoing
722 		 * packets.
723 		 */
724 		spin_lock_irqsave(&qp->s_lock, flags);
725 		goto queue_ack;
726 	}
727 
728 	/*
729 	 * Write the pbc.
730 	 * We have to flush after the PBC for correctness
731 	 * on some cpus or WC buffer can be written out of order.
732 	 */
733 	writeq(pbc, piobuf);
734 
735 	if (dd->flags & QIB_PIO_FLUSH_WC) {
736 		u32 *hdrp = (u32 *) &hdr;
737 
738 		qib_flush_wc();
739 		qib_pio_copy(piobuf + 2, hdrp, hwords - 1);
740 		qib_flush_wc();
741 		__raw_writel(hdrp[hwords - 1], piobuf + hwords + 1);
742 	} else
743 		qib_pio_copy(piobuf + 2, (u32 *) &hdr, hwords);
744 
745 	if (dd->flags & QIB_USE_SPCL_TRIG) {
746 		u32 spcl_off = (pbufn >= dd->piobcnt2k) ? 2047 : 1023;
747 
748 		qib_flush_wc();
749 		__raw_writel(0xaebecede, piobuf + spcl_off);
750 	}
751 
752 	qib_flush_wc();
753 	qib_sendbuf_done(dd, pbufn);
754 
755 	ibp->n_unicast_xmit++;
756 	goto done;
757 
758 queue_ack:
759 	if (ib_qib_state_ops[qp->state] & QIB_PROCESS_RECV_OK) {
760 		ibp->n_rc_qacks++;
761 		qp->s_flags |= QIB_S_ACK_PENDING | QIB_S_RESP_PENDING;
762 		qp->s_nak_state = qp->r_nak_state;
763 		qp->s_ack_psn = qp->r_ack_psn;
764 
765 		/* Schedule the send tasklet. */
766 		qib_schedule_send(qp);
767 	}
768 unlock:
769 	spin_unlock_irqrestore(&qp->s_lock, flags);
770 done:
771 	return;
772 }
773 
774 /**
775  * reset_psn - reset the QP state to send starting from PSN
776  * @qp: the QP
777  * @psn: the packet sequence number to restart at
778  *
779  * This is called from qib_rc_rcv() to process an incoming RC ACK
780  * for the given QP.
781  * Called at interrupt level with the QP s_lock held.
782  */
783 static void reset_psn(struct qib_qp *qp, u32 psn)
784 {
785 	u32 n = qp->s_acked;
786 	struct qib_swqe *wqe = get_swqe_ptr(qp, n);
787 	u32 opcode;
788 
789 	qp->s_cur = n;
790 
791 	/*
792 	 * If we are starting the request from the beginning,
793 	 * let the normal send code handle initialization.
794 	 */
795 	if (qib_cmp24(psn, wqe->psn) <= 0) {
796 		qp->s_state = OP(SEND_LAST);
797 		goto done;
798 	}
799 
800 	/* Find the work request opcode corresponding to the given PSN. */
801 	opcode = wqe->wr.opcode;
802 	for (;;) {
803 		int diff;
804 
805 		if (++n == qp->s_size)
806 			n = 0;
807 		if (n == qp->s_tail)
808 			break;
809 		wqe = get_swqe_ptr(qp, n);
810 		diff = qib_cmp24(psn, wqe->psn);
811 		if (diff < 0)
812 			break;
813 		qp->s_cur = n;
814 		/*
815 		 * If we are starting the request from the beginning,
816 		 * let the normal send code handle initialization.
817 		 */
818 		if (diff == 0) {
819 			qp->s_state = OP(SEND_LAST);
820 			goto done;
821 		}
822 		opcode = wqe->wr.opcode;
823 	}
824 
825 	/*
826 	 * Set the state to restart in the middle of a request.
827 	 * Don't change the s_sge, s_cur_sge, or s_cur_size.
828 	 * See qib_make_rc_req().
829 	 */
830 	switch (opcode) {
831 	case IB_WR_SEND:
832 	case IB_WR_SEND_WITH_IMM:
833 		qp->s_state = OP(RDMA_READ_RESPONSE_FIRST);
834 		break;
835 
836 	case IB_WR_RDMA_WRITE:
837 	case IB_WR_RDMA_WRITE_WITH_IMM:
838 		qp->s_state = OP(RDMA_READ_RESPONSE_LAST);
839 		break;
840 
841 	case IB_WR_RDMA_READ:
842 		qp->s_state = OP(RDMA_READ_RESPONSE_MIDDLE);
843 		break;
844 
845 	default:
846 		/*
847 		 * This case shouldn't happen since its only
848 		 * one PSN per req.
849 		 */
850 		qp->s_state = OP(SEND_LAST);
851 	}
852 done:
853 	qp->s_psn = psn;
854 	/*
855 	 * Set QIB_S_WAIT_PSN as qib_rc_complete() may start the timer
856 	 * asynchronously before the send tasklet can get scheduled.
857 	 * Doing it in qib_make_rc_req() is too late.
858 	 */
859 	if ((qib_cmp24(qp->s_psn, qp->s_sending_hpsn) <= 0) &&
860 	    (qib_cmp24(qp->s_sending_psn, qp->s_sending_hpsn) <= 0))
861 		qp->s_flags |= QIB_S_WAIT_PSN;
862 }
863 
864 /*
865  * Back up requester to resend the last un-ACKed request.
866  * The QP r_lock and s_lock should be held and interrupts disabled.
867  */
868 static void qib_restart_rc(struct qib_qp *qp, u32 psn, int wait)
869 {
870 	struct qib_swqe *wqe = get_swqe_ptr(qp, qp->s_acked);
871 	struct qib_ibport *ibp;
872 
873 	if (qp->s_retry == 0) {
874 		if (qp->s_mig_state == IB_MIG_ARMED) {
875 			qib_migrate_qp(qp);
876 			qp->s_retry = qp->s_retry_cnt;
877 		} else if (qp->s_last == qp->s_acked) {
878 			qib_send_complete(qp, wqe, IB_WC_RETRY_EXC_ERR);
879 			qib_error_qp(qp, IB_WC_WR_FLUSH_ERR);
880 			return;
881 		} else /* XXX need to handle delayed completion */
882 			return;
883 	} else
884 		qp->s_retry--;
885 
886 	ibp = to_iport(qp->ibqp.device, qp->port_num);
887 	if (wqe->wr.opcode == IB_WR_RDMA_READ)
888 		ibp->n_rc_resends++;
889 	else
890 		ibp->n_rc_resends += (qp->s_psn - psn) & QIB_PSN_MASK;
891 
892 	qp->s_flags &= ~(QIB_S_WAIT_FENCE | QIB_S_WAIT_RDMAR |
893 			 QIB_S_WAIT_SSN_CREDIT | QIB_S_WAIT_PSN |
894 			 QIB_S_WAIT_ACK);
895 	if (wait)
896 		qp->s_flags |= QIB_S_SEND_ONE;
897 	reset_psn(qp, psn);
898 }
899 
900 /*
901  * This is called from s_timer for missing responses.
902  */
903 static void rc_timeout(unsigned long arg)
904 {
905 	struct qib_qp *qp = (struct qib_qp *)arg;
906 	struct qib_ibport *ibp;
907 	unsigned long flags;
908 
909 	spin_lock_irqsave(&qp->r_lock, flags);
910 	spin_lock(&qp->s_lock);
911 	if (qp->s_flags & QIB_S_TIMER) {
912 		ibp = to_iport(qp->ibqp.device, qp->port_num);
913 		ibp->n_rc_timeouts++;
914 		qp->s_flags &= ~QIB_S_TIMER;
915 		del_timer(&qp->s_timer);
916 		qib_restart_rc(qp, qp->s_last_psn + 1, 1);
917 		qib_schedule_send(qp);
918 	}
919 	spin_unlock(&qp->s_lock);
920 	spin_unlock_irqrestore(&qp->r_lock, flags);
921 }
922 
923 /*
924  * This is called from s_timer for RNR timeouts.
925  */
926 void qib_rc_rnr_retry(unsigned long arg)
927 {
928 	struct qib_qp *qp = (struct qib_qp *)arg;
929 	unsigned long flags;
930 
931 	spin_lock_irqsave(&qp->s_lock, flags);
932 	if (qp->s_flags & QIB_S_WAIT_RNR) {
933 		qp->s_flags &= ~QIB_S_WAIT_RNR;
934 		del_timer(&qp->s_timer);
935 		qib_schedule_send(qp);
936 	}
937 	spin_unlock_irqrestore(&qp->s_lock, flags);
938 }
939 
940 /*
941  * Set qp->s_sending_psn to the next PSN after the given one.
942  * This would be psn+1 except when RDMA reads are present.
943  */
944 static void reset_sending_psn(struct qib_qp *qp, u32 psn)
945 {
946 	struct qib_swqe *wqe;
947 	u32 n = qp->s_last;
948 
949 	/* Find the work request corresponding to the given PSN. */
950 	for (;;) {
951 		wqe = get_swqe_ptr(qp, n);
952 		if (qib_cmp24(psn, wqe->lpsn) <= 0) {
953 			if (wqe->wr.opcode == IB_WR_RDMA_READ)
954 				qp->s_sending_psn = wqe->lpsn + 1;
955 			else
956 				qp->s_sending_psn = psn + 1;
957 			break;
958 		}
959 		if (++n == qp->s_size)
960 			n = 0;
961 		if (n == qp->s_tail)
962 			break;
963 	}
964 }
965 
966 /*
967  * This should be called with the QP s_lock held and interrupts disabled.
968  */
969 void qib_rc_send_complete(struct qib_qp *qp, struct qib_ib_header *hdr)
970 {
971 	struct qib_other_headers *ohdr;
972 	struct qib_swqe *wqe;
973 	struct ib_wc wc;
974 	unsigned i;
975 	u32 opcode;
976 	u32 psn;
977 
978 	if (!(ib_qib_state_ops[qp->state] & QIB_PROCESS_OR_FLUSH_SEND))
979 		return;
980 
981 	/* Find out where the BTH is */
982 	if ((be16_to_cpu(hdr->lrh[0]) & 3) == QIB_LRH_BTH)
983 		ohdr = &hdr->u.oth;
984 	else
985 		ohdr = &hdr->u.l.oth;
986 
987 	opcode = be32_to_cpu(ohdr->bth[0]) >> 24;
988 	if (opcode >= OP(RDMA_READ_RESPONSE_FIRST) &&
989 	    opcode <= OP(ATOMIC_ACKNOWLEDGE)) {
990 		WARN_ON(!qp->s_rdma_ack_cnt);
991 		qp->s_rdma_ack_cnt--;
992 		return;
993 	}
994 
995 	psn = be32_to_cpu(ohdr->bth[2]);
996 	reset_sending_psn(qp, psn);
997 
998 	/*
999 	 * Start timer after a packet requesting an ACK has been sent and
1000 	 * there are still requests that haven't been acked.
1001 	 */
1002 	if ((psn & IB_BTH_REQ_ACK) && qp->s_acked != qp->s_tail &&
1003 	    !(qp->s_flags & (QIB_S_TIMER | QIB_S_WAIT_RNR | QIB_S_WAIT_PSN)) &&
1004 	    (ib_qib_state_ops[qp->state] & QIB_PROCESS_RECV_OK))
1005 		start_timer(qp);
1006 
1007 	while (qp->s_last != qp->s_acked) {
1008 		wqe = get_swqe_ptr(qp, qp->s_last);
1009 		if (qib_cmp24(wqe->lpsn, qp->s_sending_psn) >= 0 &&
1010 		    qib_cmp24(qp->s_sending_psn, qp->s_sending_hpsn) <= 0)
1011 			break;
1012 		for (i = 0; i < wqe->wr.num_sge; i++) {
1013 			struct qib_sge *sge = &wqe->sg_list[i];
1014 
1015 			atomic_dec(&sge->mr->refcount);
1016 		}
1017 		/* Post a send completion queue entry if requested. */
1018 		if (!(qp->s_flags & QIB_S_SIGNAL_REQ_WR) ||
1019 		    (wqe->wr.send_flags & IB_SEND_SIGNALED)) {
1020 			memset(&wc, 0, sizeof wc);
1021 			wc.wr_id = wqe->wr.wr_id;
1022 			wc.status = IB_WC_SUCCESS;
1023 			wc.opcode = ib_qib_wc_opcode[wqe->wr.opcode];
1024 			wc.byte_len = wqe->length;
1025 			wc.qp = &qp->ibqp;
1026 			qib_cq_enter(to_icq(qp->ibqp.send_cq), &wc, 0);
1027 		}
1028 		if (++qp->s_last >= qp->s_size)
1029 			qp->s_last = 0;
1030 	}
1031 	/*
1032 	 * If we were waiting for sends to complete before resending,
1033 	 * and they are now complete, restart sending.
1034 	 */
1035 	if (qp->s_flags & QIB_S_WAIT_PSN &&
1036 	    qib_cmp24(qp->s_sending_psn, qp->s_sending_hpsn) > 0) {
1037 		qp->s_flags &= ~QIB_S_WAIT_PSN;
1038 		qp->s_sending_psn = qp->s_psn;
1039 		qp->s_sending_hpsn = qp->s_psn - 1;
1040 		qib_schedule_send(qp);
1041 	}
1042 }
1043 
1044 static inline void update_last_psn(struct qib_qp *qp, u32 psn)
1045 {
1046 	qp->s_last_psn = psn;
1047 }
1048 
1049 /*
1050  * Generate a SWQE completion.
1051  * This is similar to qib_send_complete but has to check to be sure
1052  * that the SGEs are not being referenced if the SWQE is being resent.
1053  */
1054 static struct qib_swqe *do_rc_completion(struct qib_qp *qp,
1055 					 struct qib_swqe *wqe,
1056 					 struct qib_ibport *ibp)
1057 {
1058 	struct ib_wc wc;
1059 	unsigned i;
1060 
1061 	/*
1062 	 * Don't decrement refcount and don't generate a
1063 	 * completion if the SWQE is being resent until the send
1064 	 * is finished.
1065 	 */
1066 	if (qib_cmp24(wqe->lpsn, qp->s_sending_psn) < 0 ||
1067 	    qib_cmp24(qp->s_sending_psn, qp->s_sending_hpsn) > 0) {
1068 		for (i = 0; i < wqe->wr.num_sge; i++) {
1069 			struct qib_sge *sge = &wqe->sg_list[i];
1070 
1071 			atomic_dec(&sge->mr->refcount);
1072 		}
1073 		/* Post a send completion queue entry if requested. */
1074 		if (!(qp->s_flags & QIB_S_SIGNAL_REQ_WR) ||
1075 		    (wqe->wr.send_flags & IB_SEND_SIGNALED)) {
1076 			memset(&wc, 0, sizeof wc);
1077 			wc.wr_id = wqe->wr.wr_id;
1078 			wc.status = IB_WC_SUCCESS;
1079 			wc.opcode = ib_qib_wc_opcode[wqe->wr.opcode];
1080 			wc.byte_len = wqe->length;
1081 			wc.qp = &qp->ibqp;
1082 			qib_cq_enter(to_icq(qp->ibqp.send_cq), &wc, 0);
1083 		}
1084 		if (++qp->s_last >= qp->s_size)
1085 			qp->s_last = 0;
1086 	} else
1087 		ibp->n_rc_delayed_comp++;
1088 
1089 	qp->s_retry = qp->s_retry_cnt;
1090 	update_last_psn(qp, wqe->lpsn);
1091 
1092 	/*
1093 	 * If we are completing a request which is in the process of
1094 	 * being resent, we can stop resending it since we know the
1095 	 * responder has already seen it.
1096 	 */
1097 	if (qp->s_acked == qp->s_cur) {
1098 		if (++qp->s_cur >= qp->s_size)
1099 			qp->s_cur = 0;
1100 		qp->s_acked = qp->s_cur;
1101 		wqe = get_swqe_ptr(qp, qp->s_cur);
1102 		if (qp->s_acked != qp->s_tail) {
1103 			qp->s_state = OP(SEND_LAST);
1104 			qp->s_psn = wqe->psn;
1105 		}
1106 	} else {
1107 		if (++qp->s_acked >= qp->s_size)
1108 			qp->s_acked = 0;
1109 		if (qp->state == IB_QPS_SQD && qp->s_acked == qp->s_cur)
1110 			qp->s_draining = 0;
1111 		wqe = get_swqe_ptr(qp, qp->s_acked);
1112 	}
1113 	return wqe;
1114 }
1115 
1116 /**
1117  * do_rc_ack - process an incoming RC ACK
1118  * @qp: the QP the ACK came in on
      * @aeth: the AETH word in CPU byte order; its top three bits
      *        (aeth >> 29) select the handling below: 0 = ACK,
      *        1 = RNR NAK, 3 = NAK, anything else is ignored
1119  * @psn: the packet sequence number of the ACK
1120  * @opcode: the opcode of the response packet carrying the ACK; it is
      *          compared against OP(RDMA_READ_RESPONSE_ONLY),
      *          OP(RDMA_READ_RESPONSE_LAST) and OP(ATOMIC_ACKNOWLEDGE)
      * @val: 64-bit value stored through the first SGE's vaddr when an
      *       atomic SWQE is completed (callers in this file pass 0 for
      *       non-atomic responses)
      * @rcd: receive context whose qp_wait_list the QP is appended to when
      *       an RDMA read or atomic request has to be restarted
1121  *
1122  * This is called from qib_rc_rcv_resp() to process an incoming RC ACK
1123  * for the given QP.
1124  * Called at interrupt level with the QP s_lock held.
1125  * Returns 1 if OK, 0 if current operation should be aborted (NAK).
1126  */
1127 static int do_rc_ack(struct qib_qp *qp, u32 aeth, u32 psn, int opcode,
1128 		     u64 val, struct qib_ctxtdata *rcd)
1129 {
1130 	struct qib_ibport *ibp;
1131 	enum ib_wc_status status;
1132 	struct qib_swqe *wqe;
1133 	int ret = 0;
1134 	u32 ack_psn;
1135 	int diff;
1136 
1137 	/* Remove QP from retry timer */
1138 	if (qp->s_flags & (QIB_S_TIMER | QIB_S_WAIT_RNR)) {
1139 		qp->s_flags &= ~(QIB_S_TIMER | QIB_S_WAIT_RNR);
1140 		del_timer(&qp->s_timer);
1141 	}
1142 
1143 	/*
1144 	 * Note that NAKs implicitly ACK outstanding SEND and RDMA write
1145 	 * requests and implicitly NAK RDMA read and atomic requests issued
1146 	 * before the NAK'ed request.  The MSN won't include the NAK'ed
1147 	 * request but will include an ACK'ed request(s).
1148 	 */
1149 	ack_psn = psn;
1150 	if (aeth >> 29)
1151 		ack_psn--;
1152 	wqe = get_swqe_ptr(qp, qp->s_acked);
1153 	ibp = to_iport(qp->ibqp.device, qp->port_num);
1154 
1155 	/*
1156 	 * The MSN might be for a later WQE than the PSN indicates so
1157 	 * only complete WQEs that the PSN finishes.
1158 	 */
1159 	while ((diff = qib_cmp24(ack_psn, wqe->lpsn)) >= 0) {
1160 		/*
1161 		 * RDMA_READ_RESPONSE_ONLY is a special case since
1162 		 * we want to generate completion events for everything
1163 		 * before the RDMA read, copy the data, then generate
1164 		 * the completion for the read.
1165 		 */
1166 		if (wqe->wr.opcode == IB_WR_RDMA_READ &&
1167 		    opcode == OP(RDMA_READ_RESPONSE_ONLY) &&
1168 		    diff == 0) {
1169 			ret = 1;
1170 			goto bail;
1171 		}
1172 		/*
1173 		 * If this request is a RDMA read or atomic, and the ACK is
1174 		 * for a later operation, this ACK NAKs the RDMA read or
1175 		 * atomic.  In other words, only a RDMA_READ_LAST or ONLY
1176 		 * can ACK a RDMA read and likewise for atomic ops.  Note
1177 		 * that the NAK case can only happen if relaxed ordering is
1178 		 * used and requests are sent after an RDMA read or atomic
1179 		 * is sent but before the response is received.
1180 		 */
1181 		if ((wqe->wr.opcode == IB_WR_RDMA_READ &&
1182 		     (opcode != OP(RDMA_READ_RESPONSE_LAST) || diff != 0)) ||
1183 		    ((wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
1184 		      wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) &&
1185 		     (opcode != OP(ATOMIC_ACKNOWLEDGE) || diff != 0))) {
1186 			/* Retry this request. */
     			/*
     			 * QIB_R_RDMAR_SEQ ensures the restart is issued only
     			 * once; qib_rc_rcv_resp() clears the flag when the
     			 * expected PSN (s_last_psn + 1) arrives.
     			 */
1187 			if (!(qp->r_flags & QIB_R_RDMAR_SEQ)) {
1188 				qp->r_flags |= QIB_R_RDMAR_SEQ;
1189 				qib_restart_rc(qp, qp->s_last_psn + 1, 0);
1190 				if (list_empty(&qp->rspwait)) {
1191 					qp->r_flags |= QIB_R_RSP_SEND;
1192 					atomic_inc(&qp->refcount);
1193 					list_add_tail(&qp->rspwait,
1194 						      &rcd->qp_wait_list);
1195 				}
1196 			}
1197 			/*
1198 			 * No need to process the ACK/NAK since we are
1199 			 * restarting an earlier request.
1200 			 */
1201 			goto bail;
1202 		}
     		/* An atomic was acknowledged: hand its result to the consumer. */
1203 		if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
1204 		    wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) {
1205 			u64 *vaddr = wqe->sg_list[0].vaddr;
1206 			*vaddr = val;
1207 		}
     		/*
     		 * A read/atomic finished: drop the outstanding count and wake
     		 * the send side if it was waiting on a fence or on a free
     		 * RDMA read/atomic slot.
     		 */
1208 		if (qp->s_num_rd_atomic &&
1209 		    (wqe->wr.opcode == IB_WR_RDMA_READ ||
1210 		     wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
1211 		     wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)) {
1212 			qp->s_num_rd_atomic--;
1213 			/* Restart sending task if fence is complete */
1214 			if ((qp->s_flags & QIB_S_WAIT_FENCE) &&
1215 			    !qp->s_num_rd_atomic) {
1216 				qp->s_flags &= ~(QIB_S_WAIT_FENCE |
1217 						 QIB_S_WAIT_ACK);
1218 				qib_schedule_send(qp);
1219 			} else if (qp->s_flags & QIB_S_WAIT_RDMAR) {
1220 				qp->s_flags &= ~(QIB_S_WAIT_RDMAR |
1221 						 QIB_S_WAIT_ACK);
1222 				qib_schedule_send(qp);
1223 			}
1224 		}
1225 		wqe = do_rc_completion(qp, wqe, ibp);
     		/* Stop once every posted request has been acknowledged. */
1226 		if (qp->s_acked == qp->s_tail)
1227 			break;
1228 	}
1229 
     	/* Dispatch on the response class held in the top bits of the AETH. */
1230 	switch (aeth >> 29) {
1231 	case 0:         /* ACK */
1232 		ibp->n_rc_acks++;
1233 		if (qp->s_acked != qp->s_tail) {
1234 			/*
1235 			 * We are expecting more ACKs so
1236 			 * reset the retransmit timer.
1237 			 */
1238 			start_timer(qp);
1239 			/*
1240 			 * We can stop resending the earlier packets and
1241 			 * continue with the next packet the receiver wants.
1242 			 */
1243 			if (qib_cmp24(qp->s_psn, psn) <= 0)
1244 				reset_psn(qp, psn + 1);
1245 		} else if (qib_cmp24(qp->s_psn, psn) <= 0) {
1246 			qp->s_state = OP(SEND_LAST);
1247 			qp->s_psn = psn + 1;
1248 		}
1249 		if (qp->s_flags & QIB_S_WAIT_ACK) {
1250 			qp->s_flags &= ~QIB_S_WAIT_ACK;
1251 			qib_schedule_send(qp);
1252 		}
1253 		qib_get_credit(qp, aeth);
1254 		qp->s_rnr_retry = qp->s_rnr_retry_cnt;
1255 		qp->s_retry = qp->s_retry_cnt;
1256 		update_last_psn(qp, psn);
1257 		ret = 1;
1258 		goto bail;
1259 
1260 	case 1:         /* RNR NAK */
1261 		ibp->n_rnr_naks++;
1262 		if (qp->s_acked == qp->s_tail)
1263 			goto bail;
1264 		if (qp->s_flags & QIB_S_WAIT_RNR)
1265 			goto bail;
1266 		if (qp->s_rnr_retry == 0) {
1267 			status = IB_WC_RNR_RETRY_EXC_ERR;
1268 			goto class_b;
1269 		}
     		/*
     		 * NOTE(review): a configured s_rnr_retry_cnt of 7 is never
     		 * decremented - presumably the "retry indefinitely"
     		 * encoding; confirm against the IB specification.
     		 */
1270 		if (qp->s_rnr_retry_cnt < 7)
1271 			qp->s_rnr_retry--;
1272 
1273 		/* The last valid PSN is the previous PSN. */
1274 		update_last_psn(qp, psn - 1);
1275 
1276 		ibp->n_rc_resends += (qp->s_psn - psn) & QIB_PSN_MASK;
1277 
1278 		reset_psn(qp, psn);
1279 
     		/*
     		 * Wait before retrying: the delay is looked up in
     		 * ib_qib_rnr_table using the AETH credit field, and the
     		 * timer callback is qib_rc_rnr_retry().
     		 */
1280 		qp->s_flags &= ~(QIB_S_WAIT_SSN_CREDIT | QIB_S_WAIT_ACK);
1281 		qp->s_flags |= QIB_S_WAIT_RNR;
1282 		qp->s_timer.function = qib_rc_rnr_retry;
1283 		qp->s_timer.expires = jiffies + usecs_to_jiffies(
1284 			ib_qib_rnr_table[(aeth >> QIB_AETH_CREDIT_SHIFT) &
1285 					   QIB_AETH_CREDIT_MASK]);
1286 		add_timer(&qp->s_timer);
1287 		goto bail;
1288 
1289 	case 3:         /* NAK */
1290 		if (qp->s_acked == qp->s_tail)
1291 			goto bail;
1292 		/* The last valid PSN is the previous PSN. */
1293 		update_last_psn(qp, psn - 1);
     		/* The AETH credit field carries the NAK code. */
1294 		switch ((aeth >> QIB_AETH_CREDIT_SHIFT) &
1295 			QIB_AETH_CREDIT_MASK) {
1296 		case 0: /* PSN sequence error */
1297 			ibp->n_seq_naks++;
1298 			/*
1299 			 * Back up to the responder's expected PSN.
1300 			 * Note that we might get a NAK in the middle of an
1301 			 * RDMA READ response which terminates the RDMA
1302 			 * READ.
1303 			 */
1304 			qib_restart_rc(qp, psn, 0);
1305 			qib_schedule_send(qp);
1306 			break;
1307 
1308 		case 1: /* Invalid Request */
1309 			status = IB_WC_REM_INV_REQ_ERR;
1310 			ibp->n_other_naks++;
1311 			goto class_b;
1312 
1313 		case 2: /* Remote Access Error */
1314 			status = IB_WC_REM_ACCESS_ERR;
1315 			ibp->n_other_naks++;
1316 			goto class_b;
1317 
1318 		case 3: /* Remote Operation Error */
1319 			status = IB_WC_REM_OP_ERR;
1320 			ibp->n_other_naks++;
     			/*
     			 * class_b: shared error path, also reached by goto
     			 * from the RNR-retries-exhausted case above.  The
     			 * current SWQE is completed with 'status' and the QP
     			 * is put into the error state, but only when
     			 * s_last == s_acked.
     			 */
1321 class_b:
1322 			if (qp->s_last == qp->s_acked) {
1323 				qib_send_complete(qp, wqe, status);
1324 				qib_error_qp(qp, IB_WC_WR_FLUSH_ERR);
1325 			}
1326 			break;
1327 
1328 		default:
1329 			/* Ignore other reserved NAK error codes */
1330 			goto reserved;
1331 		}
1332 		qp->s_retry = qp->s_retry_cnt;
1333 		qp->s_rnr_retry = qp->s_rnr_retry_cnt;
1334 		goto bail;
1335 
1336 	default:                /* 2: reserved */
1337 reserved:
1338 		/* Ignore reserved NAK codes. */
1339 		goto bail;
1340 	}
1341 
     	/* ret is 1 only for a plain ACK or the RDMA_READ_RESPONSE_ONLY case. */
1342 bail:
1343 	return ret;
1344 }
1345 
1346 /*
1347  * We have seen an out of sequence RDMA read middle or last packet.
1348  * This ACKs SENDs and RDMA writes up to the first RDMA read or atomic SWQE.
1349  */
1350 static void rdma_seq_err(struct qib_qp *qp, struct qib_ibport *ibp, u32 psn,
1351 			 struct qib_ctxtdata *rcd)
1352 {
1353 	struct qib_swqe *wqe;
1354 
1355 	/* Remove QP from retry timer */
1356 	if (qp->s_flags & (QIB_S_TIMER | QIB_S_WAIT_RNR)) {
1357 		qp->s_flags &= ~(QIB_S_TIMER | QIB_S_WAIT_RNR);
1358 		del_timer(&qp->s_timer);
1359 	}
1360 
1361 	wqe = get_swqe_ptr(qp, qp->s_acked);
1362 
1363 	while (qib_cmp24(psn, wqe->lpsn) > 0) {
1364 		if (wqe->wr.opcode == IB_WR_RDMA_READ ||
1365 		    wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
1366 		    wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)
1367 			break;
1368 		wqe = do_rc_completion(qp, wqe, ibp);
1369 	}
1370 
1371 	ibp->n_rdma_seq++;
1372 	qp->r_flags |= QIB_R_RDMAR_SEQ;
1373 	qib_restart_rc(qp, qp->s_last_psn + 1, 0);
1374 	if (list_empty(&qp->rspwait)) {
1375 		qp->r_flags |= QIB_R_RSP_SEND;
1376 		atomic_inc(&qp->refcount);
1377 		list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
1378 	}
1379 }
1380 
1381 /**
1382  * qib_rc_rcv_resp - process an incoming RC response packet
1383  * @ibp: the port this packet came in on
1384  * @ohdr: the other headers for this packet
1385  * @data: the packet data
1386  * @tlen: the packet length
1387  * @qp: the QP for this packet
1388  * @opcode: the opcode for this packet
1389  * @psn: the packet sequence number for this packet
1390  * @hdrsize: the header length
1391  * @pmtu: the path MTU
      * @rcd: the receive context; handed on to do_rc_ack() and rdma_seq_err()
1392  *
1393  * This is called from qib_rc_rcv() to process an incoming RC response
1394  * packet for the given QP.
1395  * Called at interrupt level.
      *
      * Locking: takes qp->s_lock itself (after briefly taking ppd->sdma_lock
      * when SDMA progress is attempted).  The lock is dropped either at
      * ack_done or, for first/middle read responses, just before the payload
      * copy.
1396  */
1397 static void qib_rc_rcv_resp(struct qib_ibport *ibp,
1398 			    struct qib_other_headers *ohdr,
1399 			    void *data, u32 tlen,
1400 			    struct qib_qp *qp,
1401 			    u32 opcode,
1402 			    u32 psn, u32 hdrsize, u32 pmtu,
1403 			    struct qib_ctxtdata *rcd)
1404 {
1405 	struct qib_swqe *wqe;
1406 	struct qib_pportdata *ppd = ppd_from_ibp(ibp);
1407 	enum ib_wc_status status;
1408 	unsigned long flags;
1409 	int diff;
1410 	u32 pad;
1411 	u32 aeth;
1412 	u64 val;
1413 
1414 	if (opcode != OP(RDMA_READ_RESPONSE_MIDDLE)) {
1415 		/*
1416 		 * If ACK'd PSN on SDMA busy list try to make progress to
1417 		 * reclaim SDMA credits.
1418 		 */
1419 		if ((qib_cmp24(psn, qp->s_sending_psn) >= 0) &&
1420 		    (qib_cmp24(qp->s_sending_psn, qp->s_sending_hpsn) <= 0)) {
1421 
1422 			/*
1423 			 * If send tasklet not running attempt to progress
1424 			 * SDMA queue.
1425 			 */
1426 			if (!(qp->s_flags & QIB_S_BUSY)) {
1427 				/* Acquire SDMA Lock */
1428 				spin_lock_irqsave(&ppd->sdma_lock, flags);
1429 				/* Invoke sdma make progress */
1430 				qib_sdma_make_progress(ppd);
1431 				/* Release SDMA Lock */
1432 				spin_unlock_irqrestore(&ppd->sdma_lock, flags);
1433 			}
1434 		}
1435 	}
1436 
1437 	spin_lock_irqsave(&qp->s_lock, flags);
1438 	if (!(ib_qib_state_ops[qp->state] & QIB_PROCESS_RECV_OK))
1439 		goto ack_done;
1440 
1441 	/* Ignore invalid responses. */
1442 	if (qib_cmp24(psn, qp->s_next_psn) >= 0)
1443 		goto ack_done;
1444 
1445 	/* Ignore duplicate responses. */
1446 	diff = qib_cmp24(psn, qp->s_last_psn);
1447 	if (unlikely(diff <= 0)) {
1448 		/* Update credits for "ghost" ACKs */
1449 		if (diff == 0 && opcode == OP(ACKNOWLEDGE)) {
1450 			aeth = be32_to_cpu(ohdr->u.aeth);
1451 			if ((aeth >> 29) == 0)
1452 				qib_get_credit(qp, aeth);
1453 		}
1454 		goto ack_done;
1455 	}
1456 
1457 	/*
1458 	 * Skip everything other than the PSN we expect, if we are waiting
1459 	 * for a reply to a restarted RDMA read or atomic op.
1460 	 */
1461 	if (qp->r_flags & QIB_R_RDMAR_SEQ) {
1462 		if (qib_cmp24(psn, qp->s_last_psn + 1) != 0)
1463 			goto ack_done;
1464 		qp->r_flags &= ~QIB_R_RDMAR_SEQ;
1465 	}
1466 
     	/* Nothing is outstanding, so there is nothing to acknowledge. */
1467 	if (unlikely(qp->s_acked == qp->s_tail))
1468 		goto ack_done;
1469 	wqe = get_swqe_ptr(qp, qp->s_acked);
1470 	status = IB_WC_SUCCESS;
1471 
1472 	switch (opcode) {
1473 	case OP(ACKNOWLEDGE):
1474 	case OP(ATOMIC_ACKNOWLEDGE):
1475 	case OP(RDMA_READ_RESPONSE_FIRST):
1476 		aeth = be32_to_cpu(ohdr->u.aeth);
1477 		if (opcode == OP(ATOMIC_ACKNOWLEDGE)) {
1478 			__be32 *p = ohdr->u.at.atomic_ack_eth;
1479 
     			/* Assemble the 64-bit result from two big-endian words. */
1480 			val = ((u64) be32_to_cpu(p[0]) << 32) |
1481 				be32_to_cpu(p[1]);
1482 		} else
1483 			val = 0;
     		/* Only a successfully ACKed read-response-first carries data. */
1484 		if (!do_rc_ack(qp, aeth, psn, opcode, val, rcd) ||
1485 		    opcode != OP(RDMA_READ_RESPONSE_FIRST))
1486 			goto ack_done;
     		/* The first read response carries an AETH: 4 more header bytes. */
1487 		hdrsize += 4;
     		/* do_rc_ack() may have advanced s_acked; refetch the SWQE. */
1488 		wqe = get_swqe_ptr(qp, qp->s_acked);
1489 		if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
1490 			goto ack_op_err;
1491 		/*
1492 		 * If this is a response to a resent RDMA read, we
1493 		 * have to be careful to copy the data to the right
1494 		 * location.
1495 		 */
1496 		qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge,
1497 						  wqe, psn, pmtu);
1498 		goto read_middle;
1499 
1500 	case OP(RDMA_READ_RESPONSE_MIDDLE):
1501 		/* no AETH, no ACK */
1502 		if (unlikely(qib_cmp24(psn, qp->s_last_psn + 1)))
1503 			goto ack_seq_err;
1504 		if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
1505 			goto ack_op_err;
1506 read_middle:
     		/*
     		 * A first/middle packet must carry exactly one PMTU of
     		 * payload (plus 4 bytes of ICRC) and must leave data for
     		 * a later packet.
     		 */
1507 		if (unlikely(tlen != (hdrsize + pmtu + 4)))
1508 			goto ack_len_err;
1509 		if (unlikely(pmtu >= qp->s_rdma_read_len))
1510 			goto ack_len_err;
1511 
1512 		/*
1513 		 * We got a response so update the timeout.
1514 		 * 4.096 usec. * (1 << qp->timeout)
1515 		 */
1516 		qp->s_flags |= QIB_S_TIMER;
1517 		mod_timer(&qp->s_timer, jiffies + qp->timeout_jiffies);
1518 		if (qp->s_flags & QIB_S_WAIT_ACK) {
1519 			qp->s_flags &= ~QIB_S_WAIT_ACK;
1520 			qib_schedule_send(qp);
1521 		}
1522 
1523 		if (opcode == OP(RDMA_READ_RESPONSE_MIDDLE))
1524 			qp->s_retry = qp->s_retry_cnt;
1525 
1526 		/*
1527 		 * Update the RDMA receive state but do the copy w/o
1528 		 * holding the locks and blocking interrupts.
1529 		 */
1530 		qp->s_rdma_read_len -= pmtu;
1531 		update_last_psn(qp, psn);
1532 		spin_unlock_irqrestore(&qp->s_lock, flags);
1533 		qib_copy_sge(&qp->s_rdma_read_sge, data, pmtu, 0);
     		/* s_lock was already released above, so skip ack_done. */
1534 		goto bail;
1535 
1536 	case OP(RDMA_READ_RESPONSE_ONLY):
1537 		aeth = be32_to_cpu(ohdr->u.aeth);
1538 		if (!do_rc_ack(qp, aeth, psn, opcode, 0, rcd))
1539 			goto ack_done;
1540 		/* Get the number of bytes the message was padded by. */
1541 		pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
1542 		/*
1543 		 * Check that the data size is >= 0 && <= pmtu.
1544 		 * Remember to account for the AETH header (4) and
1545 		 * ICRC (4).
1546 		 */
1547 		if (unlikely(tlen < (hdrsize + pad + 8)))
1548 			goto ack_len_err;
1549 		/*
1550 		 * If this is a response to a resent RDMA read, we
1551 		 * have to be careful to copy the data to the right
1552 		 * location.
1553 		 */
1554 		wqe = get_swqe_ptr(qp, qp->s_acked);
1555 		qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge,
1556 						  wqe, psn, pmtu);
1557 		goto read_last;
1558 
1559 	case OP(RDMA_READ_RESPONSE_LAST):
1560 		/* ACKs READ req. */
1561 		if (unlikely(qib_cmp24(psn, qp->s_last_psn + 1)))
1562 			goto ack_seq_err;
1563 		if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
1564 			goto ack_op_err;
1565 		/* Get the number of bytes the message was padded by. */
1566 		pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
1567 		/*
1568 		 * Check that the data size is >= 1 && <= pmtu.
1569 		 * Remember to account for the AETH header (4) and
1570 		 * ICRC (4).
1571 		 */
1572 		if (unlikely(tlen <= (hdrsize + pad + 8)))
1573 			goto ack_len_err;
1574 read_last:
     		/*
     		 * Reduce tlen to the payload size; it must match exactly
     		 * what is still expected for this read.
     		 */
1575 		tlen -= hdrsize + pad + 8;
1576 		if (unlikely(tlen != qp->s_rdma_read_len))
1577 			goto ack_len_err;
1578 		aeth = be32_to_cpu(ohdr->u.aeth);
1579 		qib_copy_sge(&qp->s_rdma_read_sge, data, tlen, 0);
1580 		WARN_ON(qp->s_rdma_read_sge.num_sge);
     		/*
     		 * The data is in place; now complete the read itself.  The
     		 * opcode is forced to RDMA_READ_RESPONSE_LAST so that a
     		 * RESPONSE_ONLY packet is also accepted as the final ACK.
     		 */
1581 		(void) do_rc_ack(qp, aeth, psn,
1582 				 OP(RDMA_READ_RESPONSE_LAST), 0, rcd);
1583 		goto ack_done;
1584 	}
1585 
     	/*
     	 * Error exits.  ack_op_err is also where control lands if the
     	 * switch above matched no case.
     	 */
1586 ack_op_err:
1587 	status = IB_WC_LOC_QP_OP_ERR;
1588 	goto ack_err;
1589 
1590 ack_seq_err:
1591 	rdma_seq_err(qp, ibp, psn, rcd);
1592 	goto ack_done;
1593 
1594 ack_len_err:
1595 	status = IB_WC_LOC_LEN_ERR;
1596 ack_err:
     	/* Fail the SWQE and error the QP only when s_last == s_acked. */
1597 	if (qp->s_last == qp->s_acked) {
1598 		qib_send_complete(qp, wqe, status);
1599 		qib_error_qp(qp, IB_WC_WR_FLUSH_ERR);
1600 	}
1601 ack_done:
1602 	spin_unlock_irqrestore(&qp->s_lock, flags);
     	/* bail: reached directly only after s_lock has been released. */
1603 bail:
1604 	return;
1605 }
1606 
1607 /**
1608  * qib_rc_rcv_error - process an incoming duplicate or error RC packet
1609  * @ohdr: the other headers for this packet
1610  * @data: the packet data
1611  * @qp: the QP for this packet
1612  * @opcode: the opcode for this packet
1613  * @psn: the packet sequence number for this packet
1614  * @diff: the difference between the PSN and the expected PSN
      * @rcd: the receive context; the QP is appended to its qp_wait_list
      *       when a sequence NAK has to be sent later
1615  *
1616  * This is called from qib_rc_rcv() to process an unexpected
1617  * incoming RC packet for the given QP.
1618  * Called at interrupt level.
1619  * Return 1 if no more processing is needed; otherwise return 0 to
1620  * schedule a response to be sent.
      *
      * Locking: qp->s_lock is taken here for the duplicate-request path and
      * released on every exit from that path.
1621  */
1622 static int qib_rc_rcv_error(struct qib_other_headers *ohdr,
1623 			    void *data,
1624 			    struct qib_qp *qp,
1625 			    u32 opcode,
1626 			    u32 psn,
1627 			    int diff,
1628 			    struct qib_ctxtdata *rcd)
1629 {
1630 	struct qib_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
1631 	struct qib_ack_entry *e;
1632 	unsigned long flags;
1633 	u8 i, prev;
1634 	int old_req;
1635 
1636 	if (diff > 0) {
1637 		/*
1638 		 * Packet sequence error.
1639 		 * A NAK will ACK earlier sends and RDMA writes.
1640 		 * Don't queue the NAK if we already sent one.
1641 		 */
1642 		if (!qp->r_nak_state) {
1643 			ibp->n_rc_seqnak++;
1644 			qp->r_nak_state = IB_NAK_PSN_ERROR;
1645 			/* Use the expected PSN. */
1646 			qp->r_ack_psn = qp->r_psn;
1647 			/*
1648 			 * Wait to send the sequence NAK until all packets
1649 			 * in the receive queue have been processed.
1650 			 * Otherwise, we end up propagating congestion.
1651 			 */
1652 			if (list_empty(&qp->rspwait)) {
1653 				qp->r_flags |= QIB_R_RSP_NAK;
1654 				atomic_inc(&qp->refcount);
1655 				list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
1656 			}
1657 		}
1658 		goto done;
1659 	}
1660 
1661 	/*
1662 	 * Handle a duplicate request.  Don't re-execute SEND, RDMA
1663 	 * write or atomic op.  Don't NAK errors, just silently drop
1664 	 * the duplicate request.  Note that r_sge, r_len, and
1665 	 * r_rcv_len may be in use so don't modify them.
1666 	 *
1667 	 * We are supposed to ACK the earliest duplicate PSN but we
1668 	 * can coalesce an outstanding duplicate ACK.  We have to
1669 	 * send the earliest so that RDMA reads can be restarted at
1670 	 * the requester's expected PSN.
1671 	 *
1672 	 * First, find where this duplicate PSN falls within the
1673 	 * ACKs previously sent.
1674 	 * old_req is true if there is an older response that is scheduled
1675 	 * to be sent before sending this one.
1676 	 */
1677 	e = NULL;
1678 	old_req = 1;
1679 	ibp->n_rc_dupreq++;
1680 
1681 	spin_lock_irqsave(&qp->s_lock, flags);
1682 
     	/*
     	 * Walk the ack queue backwards from r_head_ack_queue; indices wrap
     	 * from 0 to QIB_MAX_RDMA_ATOMIC.  On exit 'prev' indexes the entry
     	 * 'e' (when one was found) and 'i' is the slot after it.
     	 */
1683 	for (i = qp->r_head_ack_queue; ; i = prev) {
1684 		if (i == qp->s_tail_ack_queue)
1685 			old_req = 0;
1686 		if (i)
1687 			prev = i - 1;
1688 		else
1689 			prev = QIB_MAX_RDMA_ATOMIC;
     		/* Wrapped all the way round without finding a match. */
1690 		if (prev == qp->r_head_ack_queue) {
1691 			e = NULL;
1692 			break;
1693 		}
1694 		e = &qp->s_ack_queue[prev];
     		/* A zero opcode is treated as an unused slot: stop here. */
1695 		if (!e->opcode) {
1696 			e = NULL;
1697 			break;
1698 		}
     		/* First entry starting at or before this PSN: it is the match. */
1699 		if (qib_cmp24(psn, e->psn) >= 0) {
1700 			if (prev == qp->s_tail_ack_queue &&
1701 			    qib_cmp24(psn, e->lpsn) <= 0)
1702 				old_req = 0;
1703 			break;
1704 		}
1705 	}
1706 	switch (opcode) {
1707 	case OP(RDMA_READ_REQUEST): {
1708 		struct ib_reth *reth;
1709 		u32 offset;
1710 		u32 len;
1711 
1712 		/*
1713 		 * If we didn't find the RDMA read request in the ack queue,
1714 		 * we can ignore this request.
1715 		 */
1716 		if (!e || e->opcode != OP(RDMA_READ_REQUEST))
1717 			goto unlock_done;
1718 		/* RETH comes after BTH */
1719 		reth = &ohdr->u.rc.reth;
1720 		/*
1721 		 * Address range must be a subset of the original
1722 		 * request and start on pmtu boundaries.
1723 		 * We reuse the old ack_queue slot since the requester
1724 		 * should not back up and request an earlier PSN for the
1725 		 * same request.
1726 		 */
1727 		offset = ((psn - e->psn) & QIB_PSN_MASK) *
1728 			qp->pmtu;
1729 		len = be32_to_cpu(reth->length);
1730 		if (unlikely(offset + len != e->rdma_sge.sge_length))
1731 			goto unlock_done;
     		/* Drop the old MR reference before the SGE is set up again. */
1732 		if (e->rdma_sge.mr) {
1733 			atomic_dec(&e->rdma_sge.mr->refcount);
1734 			e->rdma_sge.mr = NULL;
1735 		}
1736 		if (len != 0) {
1737 			u32 rkey = be32_to_cpu(reth->rkey);
1738 			u64 vaddr = be64_to_cpu(reth->vaddr);
1739 			int ok;
1740 
1741 			ok = qib_rkey_ok(qp, &e->rdma_sge, len, vaddr, rkey,
1742 					 IB_ACCESS_REMOTE_READ);
1743 			if (unlikely(!ok))
1744 				goto unlock_done;
1745 		} else {
1746 			e->rdma_sge.vaddr = NULL;
1747 			e->rdma_sge.length = 0;
1748 			e->rdma_sge.sge_length = 0;
1749 		}
1750 		e->psn = psn;
1751 		if (old_req)
1752 			goto unlock_done;
     		/* Back the send side up so this read response is sent again. */
1753 		qp->s_tail_ack_queue = prev;
1754 		break;
1755 	}
1756 
1757 	case OP(COMPARE_SWAP):
1758 	case OP(FETCH_ADD): {
1759 		/*
1760 		 * If we didn't find the atomic request in the ack queue
1761 		 * or the send tasklet is already backed up to send an
1762 		 * earlier entry, we can ignore this request.
1763 		 */
1764 		if (!e || e->opcode != (u8) opcode || old_req)
1765 			goto unlock_done;
1766 		qp->s_tail_ack_queue = prev;
1767 		break;
1768 	}
1769 
1770 	default:
1771 		/*
1772 		 * Ignore this operation if it doesn't request an ACK
1773 		 * or an earlier RDMA read or atomic is going to be resent.
1774 		 */
1775 		if (!(psn & IB_BTH_REQ_ACK) || old_req)
1776 			goto unlock_done;
1777 		/*
1778 		 * Resend the most recent ACK if this request is
1779 		 * after all the previous RDMA reads and atomics.
1780 		 */
1781 		if (i == qp->r_head_ack_queue) {
1782 			spin_unlock_irqrestore(&qp->s_lock, flags);
1783 			qp->r_nak_state = 0;
1784 			qp->r_ack_psn = qp->r_psn - 1;
1785 			goto send_ack;
1786 		}
1787 		/*
1788 		 * Try to send a simple ACK to work around a Mellanox bug
1789 		 * which doesn't accept a RDMA read response or atomic
1790 		 * response as an ACK for earlier SENDs or RDMA writes.
1791 		 */
1792 		if (!(qp->s_flags & QIB_S_RESP_PENDING)) {
1793 			spin_unlock_irqrestore(&qp->s_lock, flags);
1794 			qp->r_nak_state = 0;
1795 			qp->r_ack_psn = qp->s_ack_queue[i].psn - 1;
1796 			goto send_ack;
1797 		}
1798 		/*
1799 		 * Resend the RDMA read or atomic op which
1800 		 * ACKs this duplicate request.
1801 		 */
1802 		qp->s_tail_ack_queue = i;
1803 		break;
1804 	}
     	/* Let the send side transmit the response from s_tail_ack_queue. */
1805 	qp->s_ack_state = OP(ACKNOWLEDGE);
1806 	qp->s_flags |= QIB_S_RESP_PENDING;
1807 	qp->r_nak_state = 0;
1808 	qib_schedule_send(qp);
1809 
1810 unlock_done:
1811 	spin_unlock_irqrestore(&qp->s_lock, flags);
1812 done:
1813 	return 1;
1814 
     	/* s_lock is already released; returning 0 makes qib_rc_rcv() send an ACK. */
1815 send_ack:
1816 	return 0;
1817 }
1818 
1819 void qib_rc_error(struct qib_qp *qp, enum ib_wc_status err)
1820 {
1821 	unsigned long flags;
1822 	int lastwqe;
1823 
1824 	spin_lock_irqsave(&qp->s_lock, flags);
1825 	lastwqe = qib_error_qp(qp, err);
1826 	spin_unlock_irqrestore(&qp->s_lock, flags);
1827 
1828 	if (lastwqe) {
1829 		struct ib_event ev;
1830 
1831 		ev.device = qp->ibqp.device;
1832 		ev.element.qp = &qp->ibqp;
1833 		ev.event = IB_EVENT_QP_LAST_WQE_REACHED;
1834 		qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
1835 	}
1836 }
1837 
1838 static inline void qib_update_ack_queue(struct qib_qp *qp, unsigned n)
1839 {
1840 	unsigned next;
1841 
1842 	next = n + 1;
1843 	if (next > QIB_MAX_RDMA_ATOMIC)
1844 		next = 0;
1845 	qp->s_tail_ack_queue = next;
1846 	qp->s_ack_state = OP(ACKNOWLEDGE);
1847 }
1848 
1849 /**
1850  * qib_rc_rcv - process an incoming RC packet
1851  * @rcd: the context pointer
1852  * @hdr: the header of this packet
1853  * @has_grh: true if the header has a GRH
1854  * @data: the packet data
1855  * @tlen: the packet length
1856  * @qp: the QP for this packet
1857  *
1858  * This is called from qib_qp_rcv() to process an incoming RC packet
1859  * for the given QP.
1860  * Called at interrupt level.
1861  */
1862 void qib_rc_rcv(struct qib_ctxtdata *rcd, struct qib_ib_header *hdr,
1863 		int has_grh, void *data, u32 tlen, struct qib_qp *qp)
1864 {
1865 	struct qib_ibport *ibp = &rcd->ppd->ibport_data;
1866 	struct qib_other_headers *ohdr;
1867 	u32 opcode;
1868 	u32 hdrsize;
1869 	u32 psn;
1870 	u32 pad;
1871 	struct ib_wc wc;
1872 	u32 pmtu = qp->pmtu;
1873 	int diff;
1874 	struct ib_reth *reth;
1875 	unsigned long flags;
1876 	int ret;
1877 
1878 	/* Check for GRH */
1879 	if (!has_grh) {
1880 		ohdr = &hdr->u.oth;
1881 		hdrsize = 8 + 12;       /* LRH + BTH */
1882 	} else {
1883 		ohdr = &hdr->u.l.oth;
1884 		hdrsize = 8 + 40 + 12;  /* LRH + GRH + BTH */
1885 	}
1886 
1887 	opcode = be32_to_cpu(ohdr->bth[0]);
1888 	if (qib_ruc_check_hdr(ibp, hdr, has_grh, qp, opcode))
1889 		return;
1890 
1891 	psn = be32_to_cpu(ohdr->bth[2]);
1892 	opcode >>= 24;
1893 
1894 	/*
1895 	 * Process responses (ACKs) before anything else.  Note that the
1896 	 * packet sequence number will be for something in the send work
1897 	 * queue rather than the expected receive packet sequence number.
1898 	 * In other words, this QP is the requester.
1899 	 */
1900 	if (opcode >= OP(RDMA_READ_RESPONSE_FIRST) &&
1901 	    opcode <= OP(ATOMIC_ACKNOWLEDGE)) {
1902 		qib_rc_rcv_resp(ibp, ohdr, data, tlen, qp, opcode, psn,
1903 				hdrsize, pmtu, rcd);
1904 		return;
1905 	}
1906 
1907 	/* Compute 24 bits worth of difference. */
1908 	diff = qib_cmp24(psn, qp->r_psn);
1909 	if (unlikely(diff)) {
1910 		if (qib_rc_rcv_error(ohdr, data, qp, opcode, psn, diff, rcd))
1911 			return;
1912 		goto send_ack;
1913 	}
1914 
1915 	/* Check for opcode sequence errors. */
1916 	switch (qp->r_state) {
1917 	case OP(SEND_FIRST):
1918 	case OP(SEND_MIDDLE):
1919 		if (opcode == OP(SEND_MIDDLE) ||
1920 		    opcode == OP(SEND_LAST) ||
1921 		    opcode == OP(SEND_LAST_WITH_IMMEDIATE))
1922 			break;
1923 		goto nack_inv;
1924 
1925 	case OP(RDMA_WRITE_FIRST):
1926 	case OP(RDMA_WRITE_MIDDLE):
1927 		if (opcode == OP(RDMA_WRITE_MIDDLE) ||
1928 		    opcode == OP(RDMA_WRITE_LAST) ||
1929 		    opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
1930 			break;
1931 		goto nack_inv;
1932 
1933 	default:
1934 		if (opcode == OP(SEND_MIDDLE) ||
1935 		    opcode == OP(SEND_LAST) ||
1936 		    opcode == OP(SEND_LAST_WITH_IMMEDIATE) ||
1937 		    opcode == OP(RDMA_WRITE_MIDDLE) ||
1938 		    opcode == OP(RDMA_WRITE_LAST) ||
1939 		    opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
1940 			goto nack_inv;
1941 		/*
1942 		 * Note that it is up to the requester to not send a new
1943 		 * RDMA read or atomic operation before receiving an ACK
1944 		 * for the previous operation.
1945 		 */
1946 		break;
1947 	}
1948 
1949 	if (qp->state == IB_QPS_RTR && !(qp->r_flags & QIB_R_COMM_EST)) {
1950 		qp->r_flags |= QIB_R_COMM_EST;
1951 		if (qp->ibqp.event_handler) {
1952 			struct ib_event ev;
1953 
1954 			ev.device = qp->ibqp.device;
1955 			ev.element.qp = &qp->ibqp;
1956 			ev.event = IB_EVENT_COMM_EST;
1957 			qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
1958 		}
1959 	}
1960 
1961 	/* OK, process the packet. */
1962 	switch (opcode) {
1963 	case OP(SEND_FIRST):
1964 		ret = qib_get_rwqe(qp, 0);
1965 		if (ret < 0)
1966 			goto nack_op_err;
1967 		if (!ret)
1968 			goto rnr_nak;
1969 		qp->r_rcv_len = 0;
1970 		/* FALLTHROUGH */
1971 	case OP(SEND_MIDDLE):
1972 	case OP(RDMA_WRITE_MIDDLE):
1973 send_middle:
1974 		/* Check for invalid length PMTU or posted rwqe len. */
1975 		if (unlikely(tlen != (hdrsize + pmtu + 4)))
1976 			goto nack_inv;
1977 		qp->r_rcv_len += pmtu;
1978 		if (unlikely(qp->r_rcv_len > qp->r_len))
1979 			goto nack_inv;
1980 		qib_copy_sge(&qp->r_sge, data, pmtu, 1);
1981 		break;
1982 
1983 	case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE):
1984 		/* consume RWQE */
1985 		ret = qib_get_rwqe(qp, 1);
1986 		if (ret < 0)
1987 			goto nack_op_err;
1988 		if (!ret)
1989 			goto rnr_nak;
1990 		goto send_last_imm;
1991 
1992 	case OP(SEND_ONLY):
1993 	case OP(SEND_ONLY_WITH_IMMEDIATE):
1994 		ret = qib_get_rwqe(qp, 0);
1995 		if (ret < 0)
1996 			goto nack_op_err;
1997 		if (!ret)
1998 			goto rnr_nak;
1999 		qp->r_rcv_len = 0;
2000 		if (opcode == OP(SEND_ONLY))
2001 			goto no_immediate_data;
2002 		/* FALLTHROUGH for SEND_ONLY_WITH_IMMEDIATE */
2003 	case OP(SEND_LAST_WITH_IMMEDIATE):
2004 send_last_imm:
2005 		wc.ex.imm_data = ohdr->u.imm_data;
2006 		hdrsize += 4;
2007 		wc.wc_flags = IB_WC_WITH_IMM;
2008 		goto send_last;
2009 	case OP(SEND_LAST):
2010 	case OP(RDMA_WRITE_LAST):
2011 no_immediate_data:
2012 		wc.wc_flags = 0;
2013 		wc.ex.imm_data = 0;
2014 send_last:
2015 		/* Get the number of bytes the message was padded by. */
2016 		pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
2017 		/* Check for invalid length. */
2018 		/* XXX LAST len should be >= 1 */
2019 		if (unlikely(tlen < (hdrsize + pad + 4)))
2020 			goto nack_inv;
2021 		/* Don't count the CRC. */
2022 		tlen -= (hdrsize + pad + 4);
2023 		wc.byte_len = tlen + qp->r_rcv_len;
2024 		if (unlikely(wc.byte_len > qp->r_len))
2025 			goto nack_inv;
2026 		qib_copy_sge(&qp->r_sge, data, tlen, 1);
2027 		while (qp->r_sge.num_sge) {
2028 			atomic_dec(&qp->r_sge.sge.mr->refcount);
2029 			if (--qp->r_sge.num_sge)
2030 				qp->r_sge.sge = *qp->r_sge.sg_list++;
2031 		}
2032 		qp->r_msn++;
2033 		if (!test_and_clear_bit(QIB_R_WRID_VALID, &qp->r_aflags))
2034 			break;
2035 		wc.wr_id = qp->r_wr_id;
2036 		wc.status = IB_WC_SUCCESS;
2037 		if (opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE) ||
2038 		    opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE))
2039 			wc.opcode = IB_WC_RECV_RDMA_WITH_IMM;
2040 		else
2041 			wc.opcode = IB_WC_RECV;
2042 		wc.qp = &qp->ibqp;
2043 		wc.src_qp = qp->remote_qpn;
2044 		wc.slid = qp->remote_ah_attr.dlid;
2045 		wc.sl = qp->remote_ah_attr.sl;
2046 		/* zero fields that are N/A */
2047 		wc.vendor_err = 0;
2048 		wc.pkey_index = 0;
2049 		wc.dlid_path_bits = 0;
2050 		wc.port_num = 0;
2051 		wc.csum_ok = 0;
2052 		/* Signal completion event if the solicited bit is set. */
2053 		qib_cq_enter(to_icq(qp->ibqp.recv_cq), &wc,
2054 			     (ohdr->bth[0] &
2055 			      cpu_to_be32(IB_BTH_SOLICITED)) != 0);
2056 		break;
2057 
2058 	case OP(RDMA_WRITE_FIRST):
2059 	case OP(RDMA_WRITE_ONLY):
2060 	case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE):
2061 		if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
2062 			goto nack_inv;
2063 		/* consume RWQE */
2064 		reth = &ohdr->u.rc.reth;
2065 		hdrsize += sizeof(*reth);
2066 		qp->r_len = be32_to_cpu(reth->length);
2067 		qp->r_rcv_len = 0;
2068 		qp->r_sge.sg_list = NULL;
2069 		if (qp->r_len != 0) {
2070 			u32 rkey = be32_to_cpu(reth->rkey);
2071 			u64 vaddr = be64_to_cpu(reth->vaddr);
2072 			int ok;
2073 
2074 			/* Check rkey & NAK */
2075 			ok = qib_rkey_ok(qp, &qp->r_sge.sge, qp->r_len, vaddr,
2076 					 rkey, IB_ACCESS_REMOTE_WRITE);
2077 			if (unlikely(!ok))
2078 				goto nack_acc;
2079 			qp->r_sge.num_sge = 1;
2080 		} else {
2081 			qp->r_sge.num_sge = 0;
2082 			qp->r_sge.sge.mr = NULL;
2083 			qp->r_sge.sge.vaddr = NULL;
2084 			qp->r_sge.sge.length = 0;
2085 			qp->r_sge.sge.sge_length = 0;
2086 		}
2087 		if (opcode == OP(RDMA_WRITE_FIRST))
2088 			goto send_middle;
2089 		else if (opcode == OP(RDMA_WRITE_ONLY))
2090 			goto no_immediate_data;
2091 		ret = qib_get_rwqe(qp, 1);
2092 		if (ret < 0)
2093 			goto nack_op_err;
2094 		if (!ret)
2095 			goto rnr_nak;
2096 		wc.ex.imm_data = ohdr->u.rc.imm_data;
2097 		hdrsize += 4;
2098 		wc.wc_flags = IB_WC_WITH_IMM;
2099 		goto send_last;
2100 
2101 	case OP(RDMA_READ_REQUEST): {
2102 		struct qib_ack_entry *e;
2103 		u32 len;
2104 		u8 next;
2105 
2106 		if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ)))
2107 			goto nack_inv;
2108 		next = qp->r_head_ack_queue + 1;
2109 		/* s_ack_queue is size QIB_MAX_RDMA_ATOMIC+1 so use > not >= */
2110 		if (next > QIB_MAX_RDMA_ATOMIC)
2111 			next = 0;
2112 		spin_lock_irqsave(&qp->s_lock, flags);
2113 		if (unlikely(next == qp->s_tail_ack_queue)) {
2114 			if (!qp->s_ack_queue[next].sent)
2115 				goto nack_inv_unlck;
2116 			qib_update_ack_queue(qp, next);
2117 		}
2118 		e = &qp->s_ack_queue[qp->r_head_ack_queue];
2119 		if (e->opcode == OP(RDMA_READ_REQUEST) && e->rdma_sge.mr) {
2120 			atomic_dec(&e->rdma_sge.mr->refcount);
2121 			e->rdma_sge.mr = NULL;
2122 		}
2123 		reth = &ohdr->u.rc.reth;
2124 		len = be32_to_cpu(reth->length);
2125 		if (len) {
2126 			u32 rkey = be32_to_cpu(reth->rkey);
2127 			u64 vaddr = be64_to_cpu(reth->vaddr);
2128 			int ok;
2129 
2130 			/* Check rkey & NAK */
2131 			ok = qib_rkey_ok(qp, &e->rdma_sge, len, vaddr,
2132 					 rkey, IB_ACCESS_REMOTE_READ);
2133 			if (unlikely(!ok))
2134 				goto nack_acc_unlck;
2135 			/*
2136 			 * Update the next expected PSN.  We add 1 later
2137 			 * below, so only add the remainder here.
2138 			 */
2139 			if (len > pmtu)
2140 				qp->r_psn += (len - 1) / pmtu;
2141 		} else {
2142 			e->rdma_sge.mr = NULL;
2143 			e->rdma_sge.vaddr = NULL;
2144 			e->rdma_sge.length = 0;
2145 			e->rdma_sge.sge_length = 0;
2146 		}
2147 		e->opcode = opcode;
2148 		e->sent = 0;
2149 		e->psn = psn;
2150 		e->lpsn = qp->r_psn;
2151 		/*
2152 		 * We need to increment the MSN here instead of when we
2153 		 * finish sending the result since a duplicate request would
2154 		 * increment it more than once.
2155 		 */
2156 		qp->r_msn++;
2157 		qp->r_psn++;
2158 		qp->r_state = opcode;
2159 		qp->r_nak_state = 0;
2160 		qp->r_head_ack_queue = next;
2161 
2162 		/* Schedule the send tasklet. */
2163 		qp->s_flags |= QIB_S_RESP_PENDING;
2164 		qib_schedule_send(qp);
2165 
2166 		goto sunlock;
2167 	}
2168 
2169 	case OP(COMPARE_SWAP):
2170 	case OP(FETCH_ADD): {
2171 		struct ib_atomic_eth *ateth;
2172 		struct qib_ack_entry *e;
2173 		u64 vaddr;
2174 		atomic64_t *maddr;
2175 		u64 sdata;
2176 		u32 rkey;
2177 		u8 next;
2178 
2179 		if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC)))
2180 			goto nack_inv;
2181 		next = qp->r_head_ack_queue + 1;
2182 		if (next > QIB_MAX_RDMA_ATOMIC)
2183 			next = 0;
2184 		spin_lock_irqsave(&qp->s_lock, flags);
2185 		if (unlikely(next == qp->s_tail_ack_queue)) {
2186 			if (!qp->s_ack_queue[next].sent)
2187 				goto nack_inv_unlck;
2188 			qib_update_ack_queue(qp, next);
2189 		}
2190 		e = &qp->s_ack_queue[qp->r_head_ack_queue];
2191 		if (e->opcode == OP(RDMA_READ_REQUEST) && e->rdma_sge.mr) {
2192 			atomic_dec(&e->rdma_sge.mr->refcount);
2193 			e->rdma_sge.mr = NULL;
2194 		}
2195 		ateth = &ohdr->u.atomic_eth;
2196 		vaddr = ((u64) be32_to_cpu(ateth->vaddr[0]) << 32) |
2197 			be32_to_cpu(ateth->vaddr[1]);
2198 		if (unlikely(vaddr & (sizeof(u64) - 1)))
2199 			goto nack_inv_unlck;
2200 		rkey = be32_to_cpu(ateth->rkey);
2201 		/* Check rkey & NAK */
2202 		if (unlikely(!qib_rkey_ok(qp, &qp->r_sge.sge, sizeof(u64),
2203 					  vaddr, rkey,
2204 					  IB_ACCESS_REMOTE_ATOMIC)))
2205 			goto nack_acc_unlck;
2206 		/* Perform atomic OP and save result. */
2207 		maddr = (atomic64_t *) qp->r_sge.sge.vaddr;
2208 		sdata = be64_to_cpu(ateth->swap_data);
2209 		e->atomic_data = (opcode == OP(FETCH_ADD)) ?
2210 			(u64) atomic64_add_return(sdata, maddr) - sdata :
2211 			(u64) cmpxchg((u64 *) qp->r_sge.sge.vaddr,
2212 				      be64_to_cpu(ateth->compare_data),
2213 				      sdata);
2214 		atomic_dec(&qp->r_sge.sge.mr->refcount);
2215 		qp->r_sge.num_sge = 0;
2216 		e->opcode = opcode;
2217 		e->sent = 0;
2218 		e->psn = psn;
2219 		e->lpsn = psn;
2220 		qp->r_msn++;
2221 		qp->r_psn++;
2222 		qp->r_state = opcode;
2223 		qp->r_nak_state = 0;
2224 		qp->r_head_ack_queue = next;
2225 
2226 		/* Schedule the send tasklet. */
2227 		qp->s_flags |= QIB_S_RESP_PENDING;
2228 		qib_schedule_send(qp);
2229 
2230 		goto sunlock;
2231 	}
2232 
2233 	default:
2234 		/* NAK unknown opcodes. */
2235 		goto nack_inv;
2236 	}
2237 	qp->r_psn++;
2238 	qp->r_state = opcode;
2239 	qp->r_ack_psn = psn;
2240 	qp->r_nak_state = 0;
2241 	/* Send an ACK if requested or required. */
2242 	if (psn & (1 << 31))
2243 		goto send_ack;
2244 	return;
2245 
2246 rnr_nak:
2247 	qp->r_nak_state = IB_RNR_NAK | qp->r_min_rnr_timer;
2248 	qp->r_ack_psn = qp->r_psn;
2249 	/* Queue RNR NAK for later */
2250 	if (list_empty(&qp->rspwait)) {
2251 		qp->r_flags |= QIB_R_RSP_NAK;
2252 		atomic_inc(&qp->refcount);
2253 		list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
2254 	}
2255 	return;
2256 
2257 nack_op_err:
2258 	qib_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
2259 	qp->r_nak_state = IB_NAK_REMOTE_OPERATIONAL_ERROR;
2260 	qp->r_ack_psn = qp->r_psn;
2261 	/* Queue NAK for later */
2262 	if (list_empty(&qp->rspwait)) {
2263 		qp->r_flags |= QIB_R_RSP_NAK;
2264 		atomic_inc(&qp->refcount);
2265 		list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
2266 	}
2267 	return;
2268 
2269 nack_inv_unlck:
2270 	spin_unlock_irqrestore(&qp->s_lock, flags);
2271 nack_inv:
2272 	qib_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
2273 	qp->r_nak_state = IB_NAK_INVALID_REQUEST;
2274 	qp->r_ack_psn = qp->r_psn;
2275 	/* Queue NAK for later */
2276 	if (list_empty(&qp->rspwait)) {
2277 		qp->r_flags |= QIB_R_RSP_NAK;
2278 		atomic_inc(&qp->refcount);
2279 		list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
2280 	}
2281 	return;
2282 
2283 nack_acc_unlck:
2284 	spin_unlock_irqrestore(&qp->s_lock, flags);
2285 nack_acc:
2286 	qib_rc_error(qp, IB_WC_LOC_PROT_ERR);
2287 	qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR;
2288 	qp->r_ack_psn = qp->r_psn;
2289 send_ack:
2290 	qib_send_rc_ack(qp);
2291 	return;
2292 
2293 sunlock:
2294 	spin_unlock_irqrestore(&qp->s_lock, flags);
2295 }
2296