xref: /openbmc/linux/drivers/infiniband/hw/hfi1/tid_rdma.c (revision 023e41632e065d49bcbe31b3c4b336217f96a271)
1 // SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
2 /*
3  * Copyright(c) 2018 Intel Corporation.
4  *
5  */
6 
7 #include "hfi.h"
8 #include "qp.h"
9 #include "rc.h"
10 #include "verbs.h"
11 #include "tid_rdma.h"
12 #include "exp_rcv.h"
13 #include "trace.h"
14 
15 /**
16  * DOC: TID RDMA READ protocol
17  *
18  * This is an end-to-end protocol at the hfi1 level between two nodes that
19  * improves performance by avoiding data copy on the requester side. It
20  * converts a qualified RDMA READ request into a TID RDMA READ request on
21  * the requester side and thereafter handles the request and response
22  * differently. To be qualified, the RDMA READ request should meet the
23  * following:
24  * -- The total data length should be greater than 256K;
25  * -- The total data length should be a multiple of 4K page size;
26  * -- Each local scatter-gather entry should be 4K page aligned;
27  * -- Each local scatter-gather entry should be a multiple of 4K page size;
28  */
29 
/*
 * Bits that kern_set_hw_flow() may OR into the 64-bit value it writes to a
 * RCV_TID_FLOW_TABLE entry.
 */
#define RCV_TID_FLOW_TABLE_CTRL_FLOW_VALID_SMASK		BIT_ULL(32)
#define RCV_TID_FLOW_TABLE_CTRL_HDR_SUPP_EN_SMASK		BIT_ULL(33)
#define RCV_TID_FLOW_TABLE_CTRL_KEEP_AFTER_SEQ_ERR_SMASK	BIT_ULL(34)
#define RCV_TID_FLOW_TABLE_CTRL_KEEP_ON_GEN_ERR_SMASK		BIT_ULL(35)
#define RCV_TID_FLOW_TABLE_STATUS_SEQ_MISMATCH_SMASK		BIT_ULL(37)
#define RCV_TID_FLOW_TABLE_STATUS_GEN_MISMATCH_SMASK		BIT_ULL(38)

/* Maximum number of packets within a flow generation. */
#define MAX_TID_FLOW_PSN	BIT(HFI1_KDETH_BTH_SEQ_SHIFT)
39 
40 #define GENERATION_MASK 0xFFFFF
41 
42 static u32 mask_generation(u32 a)
43 {
44 	return a & GENERATION_MASK;
45 }
46 
47 /* Reserved generation value to set to unused flows for kernel contexts */
48 #define KERN_GENERATION_RESERVED mask_generation(U32_MAX)
49 
/*
 * J_KEY for kernel contexts when TID RDMA is used.
 * See generate_jkey() in hfi.h for more information.
 *
 * hfi1_kern_exp_rcv_init() checks at build time that TID_RDMA_JKEY lies
 * within [HFI1_KERNEL_MIN_JKEY, HFI1_KERNEL_MAX_JKEY].
 */
#define TID_RDMA_JKEY			32
#define HFI1_KERNEL_MIN_JKEY		HFI1_ADMIN_JKEY_RANGE
#define HFI1_KERNEL_MAX_JKEY		(2 * HFI1_ADMIN_JKEY_RANGE - 1)

/* Maximum number of segments in flight per QP request. */
#define TID_RDMA_MAX_READ_SEGS_PER_REQ	6
#define TID_RDMA_MAX_WRITE_SEGS_PER_REQ	4

/* Larger of the READ and WRITE per-request segment limits. */
#define MAX_REQ \
	max_t(u16, TID_RDMA_MAX_READ_SEGS_PER_REQ, \
	      TID_RDMA_MAX_WRITE_SEGS_PER_REQ)

/* MAX_REQ + 1, rounded up to the next power of two. */
#define MAX_FLOWS			roundup_pow_of_two(MAX_REQ + 1)

/* Number of PAGE_SIZE pages in one MAX_EXPECTED_BUFFER. */
#define MAX_EXPECTED_PAGES		(MAX_EXPECTED_BUFFER / PAGE_SIZE)

#define TID_RDMA_DESTQP_FLOW_SHIFT	11
#define TID_RDMA_DESTQP_FLOW_MASK	0x1f

#define TID_FLOW_SW_PSN			BIT(0)
71 
/*
 * OPFN TID layout
 *
 * 63               47               31               15
 * NNNNNNNNKKKKKKKK MMMMMMMMMMMTTTTT DDDDDDUVVVJJJJJJ RRRRRRWWWWWWCCCC
 * 3210987654321098 7654321098765432 1098765432109876 5432109876543210
 *
 * N [63:56] - the context Number
 * K [55:48] - the Kdeth_qp
 * M [47:37] - Max_len
 * T [36:32] - Timeout
 * D [31:26] - reserveD
 * U [25]    - Urg capable
 * V [24:22] - version
 * J [21:16] - Jkey
 * R [15:10] - max_Read
 * W [9:4]   - max_Write
 * C [3:0]   - Capcode
 *
 * Each field below is described by a right-justified mask and the shift
 * that moves it to its position in the 64-bit word.
 */
#define TID_OPFN_QP_CTXT_MASK		0xff
#define TID_OPFN_QP_CTXT_SHIFT		56
#define TID_OPFN_QP_KDETH_MASK		0xff
#define TID_OPFN_QP_KDETH_SHIFT		48
#define TID_OPFN_MAX_LEN_MASK		0x7ff
#define TID_OPFN_MAX_LEN_SHIFT		37
#define TID_OPFN_TIMEOUT_MASK		0x1f
#define TID_OPFN_TIMEOUT_SHIFT		32
#define TID_OPFN_RESERVED_MASK		0x3f
#define TID_OPFN_RESERVED_SHIFT		26
#define TID_OPFN_URG_MASK		0x1
#define TID_OPFN_URG_SHIFT		25
#define TID_OPFN_VER_MASK		0x7
#define TID_OPFN_VER_SHIFT		22
#define TID_OPFN_JKEY_MASK		0x3f
#define TID_OPFN_JKEY_SHIFT		16
#define TID_OPFN_MAX_READ_MASK		0x3f
#define TID_OPFN_MAX_READ_SHIFT		10
#define TID_OPFN_MAX_WRITE_MASK		0x3f
#define TID_OPFN_MAX_WRITE_SHIFT	4
111 
112 static u32 tid_rdma_flow_wt;
113 
114 static void tid_rdma_trigger_resume(struct work_struct *work);
115 static void hfi1_kern_exp_rcv_free_flows(struct tid_rdma_request *req);
116 static int hfi1_kern_exp_rcv_alloc_flows(struct tid_rdma_request *req,
117 					 gfp_t gfp);
118 static void hfi1_init_trdma_req(struct rvt_qp *qp,
119 				struct tid_rdma_request *req);
120 static void hfi1_tid_write_alloc_resources(struct rvt_qp *qp, bool intr_ctx);
121 static void hfi1_tid_timeout(struct timer_list *t);
122 static void hfi1_add_tid_reap_timer(struct rvt_qp *qp);
123 static void hfi1_mod_tid_reap_timer(struct rvt_qp *qp);
124 static void hfi1_mod_tid_retry_timer(struct rvt_qp *qp);
125 static int hfi1_stop_tid_retry_timer(struct rvt_qp *qp);
126 static void hfi1_tid_retry_timeout(struct timer_list *t);
127 static int make_tid_rdma_ack(struct rvt_qp *qp,
128 			     struct ib_other_headers *ohdr,
129 			     struct hfi1_pkt_state *ps);
130 static void hfi1_do_tid_send(struct rvt_qp *qp);
131 
132 static u64 tid_rdma_opfn_encode(struct tid_rdma_params *p)
133 {
134 	return
135 		(((u64)p->qp & TID_OPFN_QP_CTXT_MASK) <<
136 			TID_OPFN_QP_CTXT_SHIFT) |
137 		((((u64)p->qp >> 16) & TID_OPFN_QP_KDETH_MASK) <<
138 			TID_OPFN_QP_KDETH_SHIFT) |
139 		(((u64)((p->max_len >> PAGE_SHIFT) - 1) &
140 			TID_OPFN_MAX_LEN_MASK) << TID_OPFN_MAX_LEN_SHIFT) |
141 		(((u64)p->timeout & TID_OPFN_TIMEOUT_MASK) <<
142 			TID_OPFN_TIMEOUT_SHIFT) |
143 		(((u64)p->urg & TID_OPFN_URG_MASK) << TID_OPFN_URG_SHIFT) |
144 		(((u64)p->jkey & TID_OPFN_JKEY_MASK) << TID_OPFN_JKEY_SHIFT) |
145 		(((u64)p->max_read & TID_OPFN_MAX_READ_MASK) <<
146 			TID_OPFN_MAX_READ_SHIFT) |
147 		(((u64)p->max_write & TID_OPFN_MAX_WRITE_MASK) <<
148 			TID_OPFN_MAX_WRITE_SHIFT);
149 }
150 
151 static void tid_rdma_opfn_decode(struct tid_rdma_params *p, u64 data)
152 {
153 	p->max_len = (((data >> TID_OPFN_MAX_LEN_SHIFT) &
154 		TID_OPFN_MAX_LEN_MASK) + 1) << PAGE_SHIFT;
155 	p->jkey = (data >> TID_OPFN_JKEY_SHIFT) & TID_OPFN_JKEY_MASK;
156 	p->max_write = (data >> TID_OPFN_MAX_WRITE_SHIFT) &
157 		TID_OPFN_MAX_WRITE_MASK;
158 	p->max_read = (data >> TID_OPFN_MAX_READ_SHIFT) &
159 		TID_OPFN_MAX_READ_MASK;
160 	p->qp =
161 		((((data >> TID_OPFN_QP_KDETH_SHIFT) & TID_OPFN_QP_KDETH_MASK)
162 			<< 16) |
163 		((data >> TID_OPFN_QP_CTXT_SHIFT) & TID_OPFN_QP_CTXT_MASK));
164 	p->urg = (data >> TID_OPFN_URG_SHIFT) & TID_OPFN_URG_MASK;
165 	p->timeout = (data >> TID_OPFN_TIMEOUT_SHIFT) & TID_OPFN_TIMEOUT_MASK;
166 }
167 
168 void tid_rdma_opfn_init(struct rvt_qp *qp, struct tid_rdma_params *p)
169 {
170 	struct hfi1_qp_priv *priv = qp->priv;
171 
172 	p->qp = (kdeth_qp << 16) | priv->rcd->ctxt;
173 	p->max_len = TID_RDMA_MAX_SEGMENT_SIZE;
174 	p->jkey = priv->rcd->jkey;
175 	p->max_read = TID_RDMA_MAX_READ_SEGS_PER_REQ;
176 	p->max_write = TID_RDMA_MAX_WRITE_SEGS_PER_REQ;
177 	p->timeout = qp->timeout;
178 	p->urg = is_urg_masked(priv->rcd);
179 }
180 
181 bool tid_rdma_conn_req(struct rvt_qp *qp, u64 *data)
182 {
183 	struct hfi1_qp_priv *priv = qp->priv;
184 
185 	*data = tid_rdma_opfn_encode(&priv->tid_rdma.local);
186 	return true;
187 }
188 
189 bool tid_rdma_conn_reply(struct rvt_qp *qp, u64 data)
190 {
191 	struct hfi1_qp_priv *priv = qp->priv;
192 	struct tid_rdma_params *remote, *old;
193 	bool ret = true;
194 
195 	old = rcu_dereference_protected(priv->tid_rdma.remote,
196 					lockdep_is_held(&priv->opfn.lock));
197 	data &= ~0xfULL;
198 	/*
199 	 * If data passed in is zero, return true so as not to continue the
200 	 * negotiation process
201 	 */
202 	if (!data || !HFI1_CAP_IS_KSET(TID_RDMA))
203 		goto null;
204 	/*
205 	 * If kzalloc fails, return false. This will result in:
206 	 * * at the requester a new OPFN request being generated to retry
207 	 *   the negotiation
208 	 * * at the responder, 0 being returned to the requester so as to
209 	 *   disable TID RDMA at both the requester and the responder
210 	 */
211 	remote = kzalloc(sizeof(*remote), GFP_ATOMIC);
212 	if (!remote) {
213 		ret = false;
214 		goto null;
215 	}
216 
217 	tid_rdma_opfn_decode(remote, data);
218 	priv->tid_timer_timeout_jiffies =
219 		usecs_to_jiffies((((4096UL * (1UL << remote->timeout)) /
220 				   1000UL) << 3) * 7);
221 	trace_hfi1_opfn_param(qp, 0, &priv->tid_rdma.local);
222 	trace_hfi1_opfn_param(qp, 1, remote);
223 	rcu_assign_pointer(priv->tid_rdma.remote, remote);
224 	/*
225 	 * A TID RDMA READ request's segment size is not equal to
226 	 * remote->max_len only when the request's data length is smaller
227 	 * than remote->max_len. In that case, there will be only one segment.
228 	 * Therefore, when priv->pkts_ps is used to calculate req->cur_seg
229 	 * during retry, it will lead to req->cur_seg = 0, which is exactly
230 	 * what is expected.
231 	 */
232 	priv->pkts_ps = (u16)rvt_div_mtu(qp, remote->max_len);
233 	priv->timeout_shift = ilog2(priv->pkts_ps - 1) + 1;
234 	goto free;
235 null:
236 	RCU_INIT_POINTER(priv->tid_rdma.remote, NULL);
237 	priv->timeout_shift = 0;
238 free:
239 	if (old)
240 		kfree_rcu(old, rcu_head);
241 	return ret;
242 }
243 
244 bool tid_rdma_conn_resp(struct rvt_qp *qp, u64 *data)
245 {
246 	bool ret;
247 
248 	ret = tid_rdma_conn_reply(qp, *data);
249 	*data = 0;
250 	/*
251 	 * If tid_rdma_conn_reply() returns error, set *data as 0 to indicate
252 	 * TID RDMA could not be enabled. This will result in TID RDMA being
253 	 * disabled at the requester too.
254 	 */
255 	if (ret)
256 		(void)tid_rdma_conn_req(qp, data);
257 	return ret;
258 }
259 
260 void tid_rdma_conn_error(struct rvt_qp *qp)
261 {
262 	struct hfi1_qp_priv *priv = qp->priv;
263 	struct tid_rdma_params *old;
264 
265 	old = rcu_dereference_protected(priv->tid_rdma.remote,
266 					lockdep_is_held(&priv->opfn.lock));
267 	RCU_INIT_POINTER(priv->tid_rdma.remote, NULL);
268 	if (old)
269 		kfree_rcu(old, rcu_head);
270 }
271 
272 /* This is called at context initialization time */
273 int hfi1_kern_exp_rcv_init(struct hfi1_ctxtdata *rcd, int reinit)
274 {
275 	if (reinit)
276 		return 0;
277 
278 	BUILD_BUG_ON(TID_RDMA_JKEY < HFI1_KERNEL_MIN_JKEY);
279 	BUILD_BUG_ON(TID_RDMA_JKEY > HFI1_KERNEL_MAX_JKEY);
280 	rcd->jkey = TID_RDMA_JKEY;
281 	hfi1_set_ctxt_jkey(rcd->dd, rcd, rcd->jkey);
282 	return hfi1_alloc_ctxt_rcv_groups(rcd);
283 }
284 
285 /**
286  * qp_to_rcd - determine the receive context used by a qp
287  * @qp - the qp
288  *
289  * This routine returns the receive context associated
290  * with a a qp's qpn.
291  *
292  * Returns the context.
293  */
294 static struct hfi1_ctxtdata *qp_to_rcd(struct rvt_dev_info *rdi,
295 				       struct rvt_qp *qp)
296 {
297 	struct hfi1_ibdev *verbs_dev = container_of(rdi,
298 						    struct hfi1_ibdev,
299 						    rdi);
300 	struct hfi1_devdata *dd = container_of(verbs_dev,
301 					       struct hfi1_devdata,
302 					       verbs_dev);
303 	unsigned int ctxt;
304 
305 	if (qp->ibqp.qp_num == 0)
306 		ctxt = 0;
307 	else
308 		ctxt = ((qp->ibqp.qp_num >> dd->qos_shift) %
309 			(dd->n_krcv_queues - 1)) + 1;
310 
311 	return dd->rcd[ctxt];
312 }
313 
314 int hfi1_qp_priv_init(struct rvt_dev_info *rdi, struct rvt_qp *qp,
315 		      struct ib_qp_init_attr *init_attr)
316 {
317 	struct hfi1_qp_priv *qpriv = qp->priv;
318 	int i, ret;
319 
320 	qpriv->rcd = qp_to_rcd(rdi, qp);
321 
322 	spin_lock_init(&qpriv->opfn.lock);
323 	INIT_WORK(&qpriv->opfn.opfn_work, opfn_send_conn_request);
324 	INIT_WORK(&qpriv->tid_rdma.trigger_work, tid_rdma_trigger_resume);
325 	qpriv->flow_state.psn = 0;
326 	qpriv->flow_state.index = RXE_NUM_TID_FLOWS;
327 	qpriv->flow_state.last_index = RXE_NUM_TID_FLOWS;
328 	qpriv->flow_state.generation = KERN_GENERATION_RESERVED;
329 	qpriv->s_state = TID_OP(WRITE_RESP);
330 	qpriv->s_tid_cur = HFI1_QP_WQE_INVALID;
331 	qpriv->s_tid_head = HFI1_QP_WQE_INVALID;
332 	qpriv->s_tid_tail = HFI1_QP_WQE_INVALID;
333 	qpriv->rnr_nak_state = TID_RNR_NAK_INIT;
334 	qpriv->r_tid_head = HFI1_QP_WQE_INVALID;
335 	qpriv->r_tid_tail = HFI1_QP_WQE_INVALID;
336 	qpriv->r_tid_ack = HFI1_QP_WQE_INVALID;
337 	qpriv->r_tid_alloc = HFI1_QP_WQE_INVALID;
338 	atomic_set(&qpriv->n_requests, 0);
339 	atomic_set(&qpriv->n_tid_requests, 0);
340 	timer_setup(&qpriv->s_tid_timer, hfi1_tid_timeout, 0);
341 	timer_setup(&qpriv->s_tid_retry_timer, hfi1_tid_retry_timeout, 0);
342 	INIT_LIST_HEAD(&qpriv->tid_wait);
343 
344 	if (init_attr->qp_type == IB_QPT_RC && HFI1_CAP_IS_KSET(TID_RDMA)) {
345 		struct hfi1_devdata *dd = qpriv->rcd->dd;
346 
347 		qpriv->pages = kzalloc_node(TID_RDMA_MAX_PAGES *
348 						sizeof(*qpriv->pages),
349 					    GFP_KERNEL, dd->node);
350 		if (!qpriv->pages)
351 			return -ENOMEM;
352 		for (i = 0; i < qp->s_size; i++) {
353 			struct hfi1_swqe_priv *priv;
354 			struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, i);
355 
356 			priv = kzalloc_node(sizeof(*priv), GFP_KERNEL,
357 					    dd->node);
358 			if (!priv)
359 				return -ENOMEM;
360 
361 			hfi1_init_trdma_req(qp, &priv->tid_req);
362 			priv->tid_req.e.swqe = wqe;
363 			wqe->priv = priv;
364 		}
365 		for (i = 0; i < rvt_max_atomic(rdi); i++) {
366 			struct hfi1_ack_priv *priv;
367 
368 			priv = kzalloc_node(sizeof(*priv), GFP_KERNEL,
369 					    dd->node);
370 			if (!priv)
371 				return -ENOMEM;
372 
373 			hfi1_init_trdma_req(qp, &priv->tid_req);
374 			priv->tid_req.e.ack = &qp->s_ack_queue[i];
375 
376 			ret = hfi1_kern_exp_rcv_alloc_flows(&priv->tid_req,
377 							    GFP_KERNEL);
378 			if (ret) {
379 				kfree(priv);
380 				return ret;
381 			}
382 			qp->s_ack_queue[i].priv = priv;
383 		}
384 	}
385 
386 	return 0;
387 }
388 
389 void hfi1_qp_priv_tid_free(struct rvt_dev_info *rdi, struct rvt_qp *qp)
390 {
391 	struct hfi1_qp_priv *qpriv = qp->priv;
392 	struct rvt_swqe *wqe;
393 	u32 i;
394 
395 	if (qp->ibqp.qp_type == IB_QPT_RC && HFI1_CAP_IS_KSET(TID_RDMA)) {
396 		for (i = 0; i < qp->s_size; i++) {
397 			wqe = rvt_get_swqe_ptr(qp, i);
398 			kfree(wqe->priv);
399 			wqe->priv = NULL;
400 		}
401 		for (i = 0; i < rvt_max_atomic(rdi); i++) {
402 			struct hfi1_ack_priv *priv = qp->s_ack_queue[i].priv;
403 
404 			if (priv)
405 				hfi1_kern_exp_rcv_free_flows(&priv->tid_req);
406 			kfree(priv);
407 			qp->s_ack_queue[i].priv = NULL;
408 		}
409 		cancel_work_sync(&qpriv->opfn.opfn_work);
410 		kfree(qpriv->pages);
411 		qpriv->pages = NULL;
412 	}
413 }
414 
415 /* Flow and tid waiter functions */
416 /**
417  * DOC: lock ordering
418  *
419  * There are two locks involved with the queuing
420  * routines: the qp s_lock and the exp_lock.
421  *
422  * Since the tid space allocation is called from
423  * the send engine, the qp s_lock is already held.
424  *
425  * The allocation routines will get the exp_lock.
426  *
427  * The first_qp() call is provided to allow the head of
428  * the rcd wait queue to be fetched under the exp_lock and
429  * followed by a drop of the exp_lock.
430  *
431  * Any qp in the wait list will have the qp reference count held
432  * to hold the qp in memory.
433  */
434 
435 /*
436  * return head of rcd wait list
437  *
438  * Must hold the exp_lock.
439  *
440  * Get a reference to the QP to hold the QP in memory.
441  *
442  * The caller must release the reference when the local
443  * is no longer being used.
444  */
445 static struct rvt_qp *first_qp(struct hfi1_ctxtdata *rcd,
446 			       struct tid_queue *queue)
447 	__must_hold(&rcd->exp_lock)
448 {
449 	struct hfi1_qp_priv *priv;
450 
451 	lockdep_assert_held(&rcd->exp_lock);
452 	priv = list_first_entry_or_null(&queue->queue_head,
453 					struct hfi1_qp_priv,
454 					tid_wait);
455 	if (!priv)
456 		return NULL;
457 	rvt_get_qp(priv->owner);
458 	return priv->owner;
459 }
460 
461 /**
462  * kernel_tid_waiters - determine rcd wait
463  * @rcd: the receive context
464  * @qp: the head of the qp being processed
465  *
466  * This routine will return false IFF
467  * the list is NULL or the head of the
468  * list is the indicated qp.
469  *
470  * Must hold the qp s_lock and the exp_lock.
471  *
472  * Return:
473  * false if either of the conditions below are statisfied:
474  * 1. The list is empty or
475  * 2. The indicated qp is at the head of the list and the
476  *    HFI1_S_WAIT_TID_SPACE bit is set in qp->s_flags.
477  * true is returned otherwise.
478  */
479 static bool kernel_tid_waiters(struct hfi1_ctxtdata *rcd,
480 			       struct tid_queue *queue, struct rvt_qp *qp)
481 	__must_hold(&rcd->exp_lock) __must_hold(&qp->s_lock)
482 {
483 	struct rvt_qp *fqp;
484 	bool ret = true;
485 
486 	lockdep_assert_held(&qp->s_lock);
487 	lockdep_assert_held(&rcd->exp_lock);
488 	fqp = first_qp(rcd, queue);
489 	if (!fqp || (fqp == qp && (qp->s_flags & HFI1_S_WAIT_TID_SPACE)))
490 		ret = false;
491 	rvt_put_qp(fqp);
492 	return ret;
493 }
494 
495 /**
496  * dequeue_tid_waiter - dequeue the qp from the list
497  * @qp - the qp to remove the wait list
498  *
499  * This routine removes the indicated qp from the
500  * wait list if it is there.
501  *
502  * This should be done after the hardware flow and
503  * tid array resources have been allocated.
504  *
505  * Must hold the qp s_lock and the rcd exp_lock.
506  *
507  * It assumes the s_lock to protect the s_flags
508  * field and to reliably test the HFI1_S_WAIT_TID_SPACE flag.
509  */
510 static void dequeue_tid_waiter(struct hfi1_ctxtdata *rcd,
511 			       struct tid_queue *queue, struct rvt_qp *qp)
512 	__must_hold(&rcd->exp_lock) __must_hold(&qp->s_lock)
513 {
514 	struct hfi1_qp_priv *priv = qp->priv;
515 
516 	lockdep_assert_held(&qp->s_lock);
517 	lockdep_assert_held(&rcd->exp_lock);
518 	if (list_empty(&priv->tid_wait))
519 		return;
520 	list_del_init(&priv->tid_wait);
521 	qp->s_flags &= ~HFI1_S_WAIT_TID_SPACE;
522 	queue->dequeue++;
523 	rvt_put_qp(qp);
524 }
525 
526 /**
527  * queue_qp_for_tid_wait - suspend QP on tid space
528  * @rcd: the receive context
529  * @qp: the qp
530  *
531  * The qp is inserted at the tail of the rcd
532  * wait queue and the HFI1_S_WAIT_TID_SPACE s_flag is set.
533  *
534  * Must hold the qp s_lock and the exp_lock.
535  */
536 static void queue_qp_for_tid_wait(struct hfi1_ctxtdata *rcd,
537 				  struct tid_queue *queue, struct rvt_qp *qp)
538 	__must_hold(&rcd->exp_lock) __must_hold(&qp->s_lock)
539 {
540 	struct hfi1_qp_priv *priv = qp->priv;
541 
542 	lockdep_assert_held(&qp->s_lock);
543 	lockdep_assert_held(&rcd->exp_lock);
544 	if (list_empty(&priv->tid_wait)) {
545 		qp->s_flags |= HFI1_S_WAIT_TID_SPACE;
546 		list_add_tail(&priv->tid_wait, &queue->queue_head);
547 		priv->tid_enqueue = ++queue->enqueue;
548 		rcd->dd->verbs_dev.n_tidwait++;
549 		trace_hfi1_qpsleep(qp, HFI1_S_WAIT_TID_SPACE);
550 		rvt_get_qp(qp);
551 	}
552 }
553 
554 /**
555  * __trigger_tid_waiter - trigger tid waiter
556  * @qp: the qp
557  *
558  * This is a private entrance to schedule the qp
559  * assuming the caller is holding the qp->s_lock.
560  */
561 static void __trigger_tid_waiter(struct rvt_qp *qp)
562 	__must_hold(&qp->s_lock)
563 {
564 	lockdep_assert_held(&qp->s_lock);
565 	if (!(qp->s_flags & HFI1_S_WAIT_TID_SPACE))
566 		return;
567 	trace_hfi1_qpwakeup(qp, HFI1_S_WAIT_TID_SPACE);
568 	hfi1_schedule_send(qp);
569 }
570 
571 /**
572  * tid_rdma_schedule_tid_wakeup - schedule wakeup for a qp
573  * @qp - the qp
574  *
575  * trigger a schedule or a waiting qp in a deadlock
576  * safe manner.  The qp reference is held prior
577  * to this call via first_qp().
578  *
579  * If the qp trigger was already scheduled (!rval)
580  * the the reference is dropped, otherwise the resume
581  * or the destroy cancel will dispatch the reference.
582  */
583 static void tid_rdma_schedule_tid_wakeup(struct rvt_qp *qp)
584 {
585 	struct hfi1_qp_priv *priv;
586 	struct hfi1_ibport *ibp;
587 	struct hfi1_pportdata *ppd;
588 	struct hfi1_devdata *dd;
589 	bool rval;
590 
591 	if (!qp)
592 		return;
593 
594 	priv = qp->priv;
595 	ibp = to_iport(qp->ibqp.device, qp->port_num);
596 	ppd = ppd_from_ibp(ibp);
597 	dd = dd_from_ibdev(qp->ibqp.device);
598 
599 	rval = queue_work_on(priv->s_sde ?
600 			     priv->s_sde->cpu :
601 			     cpumask_first(cpumask_of_node(dd->node)),
602 			     ppd->hfi1_wq,
603 			     &priv->tid_rdma.trigger_work);
604 	if (!rval)
605 		rvt_put_qp(qp);
606 }
607 
608 /**
609  * tid_rdma_trigger_resume - field a trigger work request
610  * @work - the work item
611  *
612  * Complete the off qp trigger processing by directly
613  * calling the progress routine.
614  */
615 static void tid_rdma_trigger_resume(struct work_struct *work)
616 {
617 	struct tid_rdma_qp_params *tr;
618 	struct hfi1_qp_priv *priv;
619 	struct rvt_qp *qp;
620 
621 	tr = container_of(work, struct tid_rdma_qp_params, trigger_work);
622 	priv = container_of(tr, struct hfi1_qp_priv, tid_rdma);
623 	qp = priv->owner;
624 	spin_lock_irq(&qp->s_lock);
625 	if (qp->s_flags & HFI1_S_WAIT_TID_SPACE) {
626 		spin_unlock_irq(&qp->s_lock);
627 		hfi1_do_send(priv->owner, true);
628 	} else {
629 		spin_unlock_irq(&qp->s_lock);
630 	}
631 	rvt_put_qp(qp);
632 }
633 
634 /**
635  * tid_rdma_flush_wait - unwind any tid space wait
636  *
637  * This is called when resetting a qp to
638  * allow a destroy or reset to get rid
639  * of any tid space linkage and reference counts.
640  */
641 static void _tid_rdma_flush_wait(struct rvt_qp *qp, struct tid_queue *queue)
642 	__must_hold(&qp->s_lock)
643 {
644 	struct hfi1_qp_priv *priv;
645 
646 	if (!qp)
647 		return;
648 	lockdep_assert_held(&qp->s_lock);
649 	priv = qp->priv;
650 	qp->s_flags &= ~HFI1_S_WAIT_TID_SPACE;
651 	spin_lock(&priv->rcd->exp_lock);
652 	if (!list_empty(&priv->tid_wait)) {
653 		list_del_init(&priv->tid_wait);
654 		qp->s_flags &= ~HFI1_S_WAIT_TID_SPACE;
655 		queue->dequeue++;
656 		rvt_put_qp(qp);
657 	}
658 	spin_unlock(&priv->rcd->exp_lock);
659 }
660 
661 void hfi1_tid_rdma_flush_wait(struct rvt_qp *qp)
662 	__must_hold(&qp->s_lock)
663 {
664 	struct hfi1_qp_priv *priv = qp->priv;
665 
666 	_tid_rdma_flush_wait(qp, &priv->rcd->flow_queue);
667 	_tid_rdma_flush_wait(qp, &priv->rcd->rarr_queue);
668 }
669 
670 /* Flow functions */
671 /**
672  * kern_reserve_flow - allocate a hardware flow
673  * @rcd - the context to use for allocation
674  * @last - the index of the preferred flow. Use RXE_NUM_TID_FLOWS to
675  *         signify "don't care".
676  *
677  * Use a bit mask based allocation to reserve a hardware
678  * flow for use in receiving KDETH data packets. If a preferred flow is
679  * specified the function will attempt to reserve that flow again, if
680  * available.
681  *
682  * The exp_lock must be held.
683  *
684  * Return:
685  * On success: a value postive value between 0 and RXE_NUM_TID_FLOWS - 1
686  * On failure: -EAGAIN
687  */
688 static int kern_reserve_flow(struct hfi1_ctxtdata *rcd, int last)
689 	__must_hold(&rcd->exp_lock)
690 {
691 	int nr;
692 
693 	/* Attempt to reserve the preferred flow index */
694 	if (last >= 0 && last < RXE_NUM_TID_FLOWS &&
695 	    !test_and_set_bit(last, &rcd->flow_mask))
696 		return last;
697 
698 	nr = ffz(rcd->flow_mask);
699 	BUILD_BUG_ON(RXE_NUM_TID_FLOWS >=
700 		     (sizeof(rcd->flow_mask) * BITS_PER_BYTE));
701 	if (nr > (RXE_NUM_TID_FLOWS - 1))
702 		return -EAGAIN;
703 	set_bit(nr, &rcd->flow_mask);
704 	return nr;
705 }
706 
707 static void kern_set_hw_flow(struct hfi1_ctxtdata *rcd, u32 generation,
708 			     u32 flow_idx)
709 {
710 	u64 reg;
711 
712 	reg = ((u64)generation << HFI1_KDETH_BTH_SEQ_SHIFT) |
713 		RCV_TID_FLOW_TABLE_CTRL_FLOW_VALID_SMASK |
714 		RCV_TID_FLOW_TABLE_CTRL_KEEP_AFTER_SEQ_ERR_SMASK |
715 		RCV_TID_FLOW_TABLE_CTRL_KEEP_ON_GEN_ERR_SMASK |
716 		RCV_TID_FLOW_TABLE_STATUS_SEQ_MISMATCH_SMASK |
717 		RCV_TID_FLOW_TABLE_STATUS_GEN_MISMATCH_SMASK;
718 
719 	if (generation != KERN_GENERATION_RESERVED)
720 		reg |= RCV_TID_FLOW_TABLE_CTRL_HDR_SUPP_EN_SMASK;
721 
722 	write_uctxt_csr(rcd->dd, rcd->ctxt,
723 			RCV_TID_FLOW_TABLE + 8 * flow_idx, reg);
724 }
725 
726 static u32 kern_setup_hw_flow(struct hfi1_ctxtdata *rcd, u32 flow_idx)
727 	__must_hold(&rcd->exp_lock)
728 {
729 	u32 generation = rcd->flows[flow_idx].generation;
730 
731 	kern_set_hw_flow(rcd, generation, flow_idx);
732 	return generation;
733 }
734 
735 static u32 kern_flow_generation_next(u32 gen)
736 {
737 	u32 generation = mask_generation(gen + 1);
738 
739 	if (generation == KERN_GENERATION_RESERVED)
740 		generation = mask_generation(generation + 1);
741 	return generation;
742 }
743 
744 static void kern_clear_hw_flow(struct hfi1_ctxtdata *rcd, u32 flow_idx)
745 	__must_hold(&rcd->exp_lock)
746 {
747 	rcd->flows[flow_idx].generation =
748 		kern_flow_generation_next(rcd->flows[flow_idx].generation);
749 	kern_set_hw_flow(rcd, KERN_GENERATION_RESERVED, flow_idx);
750 }
751 
752 int hfi1_kern_setup_hw_flow(struct hfi1_ctxtdata *rcd, struct rvt_qp *qp)
753 {
754 	struct hfi1_qp_priv *qpriv = (struct hfi1_qp_priv *)qp->priv;
755 	struct tid_flow_state *fs = &qpriv->flow_state;
756 	struct rvt_qp *fqp;
757 	unsigned long flags;
758 	int ret = 0;
759 
760 	/* The QP already has an allocated flow */
761 	if (fs->index != RXE_NUM_TID_FLOWS)
762 		return ret;
763 
764 	spin_lock_irqsave(&rcd->exp_lock, flags);
765 	if (kernel_tid_waiters(rcd, &rcd->flow_queue, qp))
766 		goto queue;
767 
768 	ret = kern_reserve_flow(rcd, fs->last_index);
769 	if (ret < 0)
770 		goto queue;
771 	fs->index = ret;
772 	fs->last_index = fs->index;
773 
774 	/* Generation received in a RESYNC overrides default flow generation */
775 	if (fs->generation != KERN_GENERATION_RESERVED)
776 		rcd->flows[fs->index].generation = fs->generation;
777 	fs->generation = kern_setup_hw_flow(rcd, fs->index);
778 	fs->psn = 0;
779 	fs->flags = 0;
780 	dequeue_tid_waiter(rcd, &rcd->flow_queue, qp);
781 	/* get head before dropping lock */
782 	fqp = first_qp(rcd, &rcd->flow_queue);
783 	spin_unlock_irqrestore(&rcd->exp_lock, flags);
784 
785 	tid_rdma_schedule_tid_wakeup(fqp);
786 	return 0;
787 queue:
788 	queue_qp_for_tid_wait(rcd, &rcd->flow_queue, qp);
789 	spin_unlock_irqrestore(&rcd->exp_lock, flags);
790 	return -EAGAIN;
791 }
792 
793 void hfi1_kern_clear_hw_flow(struct hfi1_ctxtdata *rcd, struct rvt_qp *qp)
794 {
795 	struct hfi1_qp_priv *qpriv = (struct hfi1_qp_priv *)qp->priv;
796 	struct tid_flow_state *fs = &qpriv->flow_state;
797 	struct rvt_qp *fqp;
798 	unsigned long flags;
799 
800 	if (fs->index >= RXE_NUM_TID_FLOWS)
801 		return;
802 	spin_lock_irqsave(&rcd->exp_lock, flags);
803 	kern_clear_hw_flow(rcd, fs->index);
804 	clear_bit(fs->index, &rcd->flow_mask);
805 	fs->index = RXE_NUM_TID_FLOWS;
806 	fs->psn = 0;
807 	fs->generation = KERN_GENERATION_RESERVED;
808 
809 	/* get head before dropping lock */
810 	fqp = first_qp(rcd, &rcd->flow_queue);
811 	spin_unlock_irqrestore(&rcd->exp_lock, flags);
812 
813 	if (fqp == qp) {
814 		__trigger_tid_waiter(fqp);
815 		rvt_put_qp(fqp);
816 	} else {
817 		tid_rdma_schedule_tid_wakeup(fqp);
818 	}
819 }
820 
821 void hfi1_kern_init_ctxt_generations(struct hfi1_ctxtdata *rcd)
822 {
823 	int i;
824 
825 	for (i = 0; i < RXE_NUM_TID_FLOWS; i++) {
826 		rcd->flows[i].generation = mask_generation(prandom_u32());
827 		kern_set_hw_flow(rcd, KERN_GENERATION_RESERVED, i);
828 	}
829 }
830 
831 /* TID allocation functions */
832 static u8 trdma_pset_order(struct tid_rdma_pageset *s)
833 {
834 	u8 count = s->count;
835 
836 	return ilog2(count) + 1;
837 }
838 
839 /**
840  * tid_rdma_find_phys_blocks_4k - get groups base on mr info
841  * @npages - number of pages
842  * @pages - pointer to an array of page structs
843  * @list - page set array to return
844  *
845  * This routine returns the number of groups associated with
846  * the current sge information.  This implementation is based
847  * on the expected receive find_phys_blocks() adjusted to
848  * use the MR information vs. the pfn.
849  *
850  * Return:
851  * the number of RcvArray entries
852  */
853 static u32 tid_rdma_find_phys_blocks_4k(struct tid_rdma_flow *flow,
854 					struct page **pages,
855 					u32 npages,
856 					struct tid_rdma_pageset *list)
857 {
858 	u32 pagecount, pageidx, setcount = 0, i;
859 	void *vaddr, *this_vaddr;
860 
861 	if (!npages)
862 		return 0;
863 
864 	/*
865 	 * Look for sets of physically contiguous pages in the user buffer.
866 	 * This will allow us to optimize Expected RcvArray entry usage by
867 	 * using the bigger supported sizes.
868 	 */
869 	vaddr = page_address(pages[0]);
870 	trace_hfi1_tid_flow_page(flow->req->qp, flow, 0, 0, 0, vaddr);
871 	for (pageidx = 0, pagecount = 1, i = 1; i <= npages; i++) {
872 		this_vaddr = i < npages ? page_address(pages[i]) : NULL;
873 		trace_hfi1_tid_flow_page(flow->req->qp, flow, i, 0, 0,
874 					 this_vaddr);
875 		/*
876 		 * If the vaddr's are not sequential, pages are not physically
877 		 * contiguous.
878 		 */
879 		if (this_vaddr != (vaddr + PAGE_SIZE)) {
880 			/*
881 			 * At this point we have to loop over the set of
882 			 * physically contiguous pages and break them down it
883 			 * sizes supported by the HW.
884 			 * There are two main constraints:
885 			 *     1. The max buffer size is MAX_EXPECTED_BUFFER.
886 			 *        If the total set size is bigger than that
887 			 *        program only a MAX_EXPECTED_BUFFER chunk.
888 			 *     2. The buffer size has to be a power of two. If
889 			 *        it is not, round down to the closes power of
890 			 *        2 and program that size.
891 			 */
892 			while (pagecount) {
893 				int maxpages = pagecount;
894 				u32 bufsize = pagecount * PAGE_SIZE;
895 
896 				if (bufsize > MAX_EXPECTED_BUFFER)
897 					maxpages =
898 						MAX_EXPECTED_BUFFER >>
899 						PAGE_SHIFT;
900 				else if (!is_power_of_2(bufsize))
901 					maxpages =
902 						rounddown_pow_of_two(bufsize) >>
903 						PAGE_SHIFT;
904 
905 				list[setcount].idx = pageidx;
906 				list[setcount].count = maxpages;
907 				trace_hfi1_tid_pageset(flow->req->qp, setcount,
908 						       list[setcount].idx,
909 						       list[setcount].count);
910 				pagecount -= maxpages;
911 				pageidx += maxpages;
912 				setcount++;
913 			}
914 			pageidx = i;
915 			pagecount = 1;
916 			vaddr = this_vaddr;
917 		} else {
918 			vaddr += PAGE_SIZE;
919 			pagecount++;
920 		}
921 	}
922 	/* insure we always return an even number of sets */
923 	if (setcount & 1)
924 		list[setcount++].count = 0;
925 	return setcount;
926 }
927 
928 /**
929  * tid_flush_pages - dump out pages into pagesets
930  * @list - list of pagesets
931  * @idx - pointer to current page index
932  * @pages - number of pages to dump
933  * @sets - current number of pagesset
934  *
935  * This routine flushes out accumuated pages.
936  *
937  * To insure an even number of sets the
938  * code may add a filler.
939  *
940  * This can happen with when pages is not
941  * a power of 2 or pages is a power of 2
942  * less than the maximum pages.
943  *
944  * Return:
945  * The new number of sets
946  */
947 
948 static u32 tid_flush_pages(struct tid_rdma_pageset *list,
949 			   u32 *idx, u32 pages, u32 sets)
950 {
951 	while (pages) {
952 		u32 maxpages = pages;
953 
954 		if (maxpages > MAX_EXPECTED_PAGES)
955 			maxpages = MAX_EXPECTED_PAGES;
956 		else if (!is_power_of_2(maxpages))
957 			maxpages = rounddown_pow_of_two(maxpages);
958 		list[sets].idx = *idx;
959 		list[sets++].count = maxpages;
960 		*idx += maxpages;
961 		pages -= maxpages;
962 	}
963 	/* might need a filler */
964 	if (sets & 1)
965 		list[sets++].count = 0;
966 	return sets;
967 }
968 
969 /**
970  * tid_rdma_find_phys_blocks_8k - get groups base on mr info
971  * @pages - pointer to an array of page structs
972  * @npages - number of pages
973  * @list - page set array to return
974  *
975  * This routine parses an array of pages to compute pagesets
976  * in an 8k compatible way.
977  *
978  * pages are tested two at a time, i, i + 1 for contiguous
979  * pages and i - 1 and i contiguous pages.
980  *
981  * If any condition is false, any accumlated pages are flushed and
982  * v0,v1 are emitted as separate PAGE_SIZE pagesets
983  *
984  * Otherwise, the current 8k is totaled for a future flush.
985  *
986  * Return:
987  * The number of pagesets
988  * list set with the returned number of pagesets
989  *
990  */
991 static u32 tid_rdma_find_phys_blocks_8k(struct tid_rdma_flow *flow,
992 					struct page **pages,
993 					u32 npages,
994 					struct tid_rdma_pageset *list)
995 {
996 	u32 idx, sets = 0, i;
997 	u32 pagecnt = 0;
998 	void *v0, *v1, *vm1;
999 
1000 	if (!npages)
1001 		return 0;
1002 	for (idx = 0, i = 0, vm1 = NULL; i < npages; i += 2) {
1003 		/* get a new v0 */
1004 		v0 = page_address(pages[i]);
1005 		trace_hfi1_tid_flow_page(flow->req->qp, flow, i, 1, 0, v0);
1006 		v1 = i + 1 < npages ?
1007 				page_address(pages[i + 1]) : NULL;
1008 		trace_hfi1_tid_flow_page(flow->req->qp, flow, i, 1, 1, v1);
1009 		/* compare i, i + 1 vaddr */
1010 		if (v1 != (v0 + PAGE_SIZE)) {
1011 			/* flush out pages */
1012 			sets = tid_flush_pages(list, &idx, pagecnt, sets);
1013 			/* output v0,v1 as two pagesets */
1014 			list[sets].idx = idx++;
1015 			list[sets++].count = 1;
1016 			if (v1) {
1017 				list[sets].count = 1;
1018 				list[sets++].idx = idx++;
1019 			} else {
1020 				list[sets++].count = 0;
1021 			}
1022 			vm1 = NULL;
1023 			pagecnt = 0;
1024 			continue;
1025 		}
1026 		/* i,i+1 consecutive, look at i-1,i */
1027 		if (vm1 && v0 != (vm1 + PAGE_SIZE)) {
1028 			/* flush out pages */
1029 			sets = tid_flush_pages(list, &idx, pagecnt, sets);
1030 			pagecnt = 0;
1031 		}
1032 		/* pages will always be a multiple of 8k */
1033 		pagecnt += 2;
1034 		/* save i-1 */
1035 		vm1 = v1;
1036 		/* move to next pair */
1037 	}
1038 	/* dump residual pages at end */
1039 	sets = tid_flush_pages(list, &idx, npages - idx, sets);
1040 	/* by design cannot be odd sets */
1041 	WARN_ON(sets & 1);
1042 	return sets;
1043 }
1044 
1045 /**
1046  * Find pages for one segment of a sge array represented by @ss. The function
1047  * does not check the sge, the sge must have been checked for alignment with a
1048  * prior call to hfi1_kern_trdma_ok. Other sge checking is done as part of
1049  * rvt_lkey_ok and rvt_rkey_ok. Also, the function only modifies the local sge
1050  * copy maintained in @ss->sge, the original sge is not modified.
1051  *
1052  * Unlike IB RDMA WRITE, we can't decrement ss->num_sge here because we are not
1053  * releasing the MR reference count at the same time. Otherwise, we'll "leak"
1054  * references to the MR. This difference requires that we keep track of progress
1055  * into the sg_list. This is done by the cur_seg cursor in the tid_rdma_request
1056  * structure.
1057  */
1058 static u32 kern_find_pages(struct tid_rdma_flow *flow,
1059 			   struct page **pages,
1060 			   struct rvt_sge_state *ss, bool *last)
1061 {
1062 	struct tid_rdma_request *req = flow->req;
1063 	struct rvt_sge *sge = &ss->sge;
1064 	u32 length = flow->req->seg_len;
1065 	u32 len = PAGE_SIZE;
1066 	u32 i = 0;
1067 
1068 	while (length && req->isge < ss->num_sge) {
1069 		pages[i++] = virt_to_page(sge->vaddr);
1070 
1071 		sge->vaddr += len;
1072 		sge->length -= len;
1073 		sge->sge_length -= len;
1074 		if (!sge->sge_length) {
1075 			if (++req->isge < ss->num_sge)
1076 				*sge = ss->sg_list[req->isge - 1];
1077 		} else if (sge->length == 0 && sge->mr->lkey) {
1078 			if (++sge->n >= RVT_SEGSZ) {
1079 				++sge->m;
1080 				sge->n = 0;
1081 			}
1082 			sge->vaddr = sge->mr->map[sge->m]->segs[sge->n].vaddr;
1083 			sge->length = sge->mr->map[sge->m]->segs[sge->n].length;
1084 		}
1085 		length -= len;
1086 	}
1087 
1088 	flow->length = flow->req->seg_len - length;
1089 	*last = req->isge == ss->num_sge ? false : true;
1090 	return i;
1091 }
1092 
1093 static void dma_unmap_flow(struct tid_rdma_flow *flow)
1094 {
1095 	struct hfi1_devdata *dd;
1096 	int i;
1097 	struct tid_rdma_pageset *pset;
1098 
1099 	dd = flow->req->rcd->dd;
1100 	for (i = 0, pset = &flow->pagesets[0]; i < flow->npagesets;
1101 			i++, pset++) {
1102 		if (pset->count && pset->addr) {
1103 			dma_unmap_page(&dd->pcidev->dev,
1104 				       pset->addr,
1105 				       PAGE_SIZE * pset->count,
1106 				       DMA_FROM_DEVICE);
1107 			pset->mapped = 0;
1108 		}
1109 	}
1110 }
1111 
1112 static int dma_map_flow(struct tid_rdma_flow *flow, struct page **pages)
1113 {
1114 	int i;
1115 	struct hfi1_devdata *dd = flow->req->rcd->dd;
1116 	struct tid_rdma_pageset *pset;
1117 
1118 	for (i = 0, pset = &flow->pagesets[0]; i < flow->npagesets;
1119 			i++, pset++) {
1120 		if (pset->count) {
1121 			pset->addr = dma_map_page(&dd->pcidev->dev,
1122 						  pages[pset->idx],
1123 						  0,
1124 						  PAGE_SIZE * pset->count,
1125 						  DMA_FROM_DEVICE);
1126 
1127 			if (dma_mapping_error(&dd->pcidev->dev, pset->addr)) {
1128 				dma_unmap_flow(flow);
1129 				return -ENOMEM;
1130 			}
1131 			pset->mapped = 1;
1132 		}
1133 	}
1134 	return 0;
1135 }
1136 
1137 static inline bool dma_mapped(struct tid_rdma_flow *flow)
1138 {
1139 	return !!flow->pagesets[0].mapped;
1140 }
1141 
1142 /*
1143  * Get pages pointers and identify contiguous physical memory chunks for a
1144  * segment. All segments are of length flow->req->seg_len.
1145  */
1146 static int kern_get_phys_blocks(struct tid_rdma_flow *flow,
1147 				struct page **pages,
1148 				struct rvt_sge_state *ss, bool *last)
1149 {
1150 	u8 npages;
1151 
1152 	/* Reuse previously computed pagesets, if any */
1153 	if (flow->npagesets) {
1154 		trace_hfi1_tid_flow_alloc(flow->req->qp, flow->req->setup_head,
1155 					  flow);
1156 		if (!dma_mapped(flow))
1157 			return dma_map_flow(flow, pages);
1158 		return 0;
1159 	}
1160 
1161 	npages = kern_find_pages(flow, pages, ss, last);
1162 
1163 	if (flow->req->qp->pmtu == enum_to_mtu(OPA_MTU_4096))
1164 		flow->npagesets =
1165 			tid_rdma_find_phys_blocks_4k(flow, pages, npages,
1166 						     flow->pagesets);
1167 	else
1168 		flow->npagesets =
1169 			tid_rdma_find_phys_blocks_8k(flow, pages, npages,
1170 						     flow->pagesets);
1171 
1172 	return dma_map_flow(flow, pages);
1173 }
1174 
1175 static inline void kern_add_tid_node(struct tid_rdma_flow *flow,
1176 				     struct hfi1_ctxtdata *rcd, char *s,
1177 				     struct tid_group *grp, u8 cnt)
1178 {
1179 	struct kern_tid_node *node = &flow->tnode[flow->tnode_cnt++];
1180 
1181 	WARN_ON_ONCE(flow->tnode_cnt >=
1182 		     (TID_RDMA_MAX_SEGMENT_SIZE >> PAGE_SHIFT));
1183 	if (WARN_ON_ONCE(cnt & 1))
1184 		dd_dev_err(rcd->dd,
1185 			   "unexpected odd allocation cnt %u map 0x%x used %u",
1186 			   cnt, grp->map, grp->used);
1187 
1188 	node->grp = grp;
1189 	node->map = grp->map;
1190 	node->cnt = cnt;
1191 	trace_hfi1_tid_node_add(flow->req->qp, s, flow->tnode_cnt - 1,
1192 				grp->base, grp->map, grp->used, cnt);
1193 }
1194 
/*
 * Try to allocate pageset_count TID's from TID groups for a context
 *
 * This function allocates TID's without moving groups between lists or
 * modifying grp->map. This is done as follows, being cognizant of the lists
 * between which the TID groups will move:
 * 1. First allocate complete groups of 8 TID's since this is more efficient,
 *    these groups will move from group->full without affecting used
 * 2. If more TID's are needed allocate from used (will move from used->full or
 *    stay in used)
 * 3. If we still don't have the required number of TID's go back and look again
 *    at a complete group (will move from group->used)
 *
 * The chosen groups are recorded in flow->tnode through kern_add_tid_node();
 * flow->tnode_cnt is reset to 0 on entry.
 *
 * Return: 0 when enough entries were found to cover flow->npagesets,
 * -EAGAIN otherwise.
 */
static int kern_alloc_tids(struct tid_rdma_flow *flow)
{
	struct hfi1_ctxtdata *rcd = flow->req->rcd;
	struct hfi1_devdata *dd = rcd->dd;
	u32 ngroups, pageidx = 0;
	struct tid_group *group = NULL, *used;
	u8 use;

	flow->tnode_cnt = 0;
	/* number of whole groups the pagesets would fill */
	ngroups = flow->npagesets / dd->rcv_entries.group_size;
	if (!ngroups)
		goto used_list;

	/* First look at complete groups */
	list_for_each_entry(group,  &rcd->tid_group_list.list, list) {
		kern_add_tid_node(flow, rcd, "complete groups", group,
				  group->size);

		pageidx += group->size;
		if (!--ngroups)
			break;
	}

	if (pageidx >= flow->npagesets)
		goto ok;

used_list:
	/* Now look at partially used groups */
	list_for_each_entry(used, &rcd->tid_used_list.list, list) {
		/* take what is still needed, limited to the free entries */
		use = min_t(u32, flow->npagesets - pageidx,
			    used->size - used->used);
		kern_add_tid_node(flow, rcd, "used groups", used, use);

		pageidx += use;
		if (pageidx >= flow->npagesets)
			goto ok;
	}

	/*
	 * Look again at a complete group, continuing from where we left.
	 * However, if we are at the head, we have reached the end of the
	 * complete groups list from the first loop above
	 */
	if (group && &group->list == &rcd->tid_group_list.list)
		goto bail_eagain;
	group = list_prepare_entry(group, &rcd->tid_group_list.list,
				   list);
	if (list_is_last(&group->list, &rcd->tid_group_list.list))
		goto bail_eagain;
	group = list_next_entry(group, list);
	use = min_t(u32, flow->npagesets - pageidx, group->size);
	kern_add_tid_node(flow, rcd, "complete continue", group, use);
	pageidx += use;
	if (pageidx >= flow->npagesets)
		goto ok;
bail_eagain:
	trace_hfi1_msg_alloc_tids(flow->req->qp, " insufficient tids: needed ",
				  (u64)flow->npagesets);
	return -EAGAIN;
ok:
	return 0;
}
1270 
/*
 * Program the receive array entries of one TID group for a flow.
 *
 * @flow: flow being set up
 * @grp_num: index into flow->tnode of the group to program
 * @pset_idx: in/out cursor into flow->pagesets; advanced by one for every
 *            entry programmed here
 *
 * Every entry of the group is visited. Entries already marked in the map
 * saved in the node, or beyond the node's allocation count, are only passed
 * to rcv_array_wc_fill(). Each remaining entry is programmed from the next
 * pageset (as PT_INVALID for a zero-count pageset), is accounted in
 * grp->used and grp->map, and the group is moved between the context's
 * group/used/full lists as its usage changes. Formatted TID entries are
 * appended to flow->tid_entry and flow->npkts is increased to match.
 */
static void kern_program_rcv_group(struct tid_rdma_flow *flow, int grp_num,
				   u32 *pset_idx)
{
	struct hfi1_ctxtdata *rcd = flow->req->rcd;
	struct hfi1_devdata *dd = rcd->dd;
	struct kern_tid_node *node = &flow->tnode[grp_num];
	struct tid_group *grp = node->grp;
	struct tid_rdma_pageset *pset;
	/* path MTU expressed in pages */
	u32 pmtu_pg = flow->req->qp->pmtu >> PAGE_SHIFT;
	u32 rcventry, npages = 0, pair = 0, tidctrl;
	u8 i, cnt = 0;

	for (i = 0; i < grp->size; i++) {
		rcventry = grp->base + i;

		/* entry was in use at allocation time, or quota is reached */
		if (node->map & BIT(i) || cnt >= node->cnt) {
			rcv_array_wc_fill(dd, rcventry);
			continue;
		}
		pset = &flow->pagesets[(*pset_idx)++];
		if (pset->count) {
			hfi1_put_tid(dd, rcventry, PT_EXPECTED,
				     pset->addr, trdma_pset_order(pset));
		} else {
			hfi1_put_tid(dd, rcventry, PT_INVALID, 0, 0);
		}
		/* npages keeps accumulating across the two halves of a pair */
		npages += pset->count;

		rcventry -= rcd->expected_base;
		/*
		 * 0x3 when the previous iteration opened a pair that this
		 * entry completes; otherwise 0x2 for an odd and 0x1 for an
		 * even entry offset.
		 */
		tidctrl = pair ? 0x3 : rcventry & 0x1 ? 0x2 : 0x1;
		/*
		 * A single TID entry will be used to use a rcvarr pair (with
		 * tidctrl 0x3), if ALL these are true (a) the bit pos is even
		 * (b) the group map shows current and the next bits as free
		 * indicating two consecutive rcvarry entries are available (c)
		 * we actually need 2 more entries
		 */
		pair = !(i & 0x1) && !((node->map >> i) & 0x3) &&
			node->cnt >= cnt + 2;
		if (!pair) {
			/* no pair pending: emit the TID entry now */
			if (!pset->count)
				tidctrl = 0x1;
			flow->tid_entry[flow->tidcnt++] =
				EXP_TID_SET(IDX, rcventry >> 1) |
				EXP_TID_SET(CTRL, tidctrl) |
				EXP_TID_SET(LEN, npages);
			trace_hfi1_tid_entry_alloc(/* entry */
			   flow->req->qp, flow->tidcnt - 1,
			   flow->tid_entry[flow->tidcnt - 1]);

			/* Efficient DIV_ROUND_UP(npages, pmtu_pg) */
			flow->npkts += (npages + pmtu_pg - 1) >> ilog2(pmtu_pg);
			npages = 0;
		}

		/* the list move is decided on the usage before this entry */
		if (grp->used == grp->size - 1)
			tid_group_move(grp, &rcd->tid_used_list,
				       &rcd->tid_full_list);
		else if (!grp->used)
			tid_group_move(grp, &rcd->tid_group_list,
				       &rcd->tid_used_list);

		grp->used++;
		grp->map |= BIT(i);
		cnt++;
	}
}
1338 
1339 static void kern_unprogram_rcv_group(struct tid_rdma_flow *flow, int grp_num)
1340 {
1341 	struct hfi1_ctxtdata *rcd = flow->req->rcd;
1342 	struct hfi1_devdata *dd = rcd->dd;
1343 	struct kern_tid_node *node = &flow->tnode[grp_num];
1344 	struct tid_group *grp = node->grp;
1345 	u32 rcventry;
1346 	u8 i, cnt = 0;
1347 
1348 	for (i = 0; i < grp->size; i++) {
1349 		rcventry = grp->base + i;
1350 
1351 		if (node->map & BIT(i) || cnt >= node->cnt) {
1352 			rcv_array_wc_fill(dd, rcventry);
1353 			continue;
1354 		}
1355 
1356 		hfi1_put_tid(dd, rcventry, PT_INVALID, 0, 0);
1357 
1358 		grp->used--;
1359 		grp->map &= ~BIT(i);
1360 		cnt++;
1361 
1362 		if (grp->used == grp->size - 1)
1363 			tid_group_move(grp, &rcd->tid_full_list,
1364 				       &rcd->tid_used_list);
1365 		else if (!grp->used)
1366 			tid_group_move(grp, &rcd->tid_used_list,
1367 				       &rcd->tid_group_list);
1368 	}
1369 	if (WARN_ON_ONCE(cnt & 1)) {
1370 		struct hfi1_ctxtdata *rcd = flow->req->rcd;
1371 		struct hfi1_devdata *dd = rcd->dd;
1372 
1373 		dd_dev_err(dd, "unexpected odd free cnt %u map 0x%x used %u",
1374 			   cnt, grp->map, grp->used);
1375 	}
1376 }
1377 
1378 static void kern_program_rcvarray(struct tid_rdma_flow *flow)
1379 {
1380 	u32 pset_idx = 0;
1381 	int i;
1382 
1383 	flow->npkts = 0;
1384 	flow->tidcnt = 0;
1385 	for (i = 0; i < flow->tnode_cnt; i++)
1386 		kern_program_rcv_group(flow, i, &pset_idx);
1387 	trace_hfi1_tid_flow_alloc(flow->req->qp, flow->req->setup_head, flow);
1388 }
1389 
/**
 * hfi1_kern_exp_rcv_setup() - setup TID's and flow for one segment of a
 * TID RDMA request
 *
 * @req: TID RDMA request for which the segment/flow is being set up
 * @ss: sge state, maintains state across successive segments of a sge
 * @last: output flag written while the pages of the segment are collected
 *        (see kern_find_pages()); it is left untouched when previously
 *        computed pagesets are reused
 *
 * This function
 * (1) finds a free flow entry in the flow circular buffer
 * (2) finds pages and contiguous physical chunks constituting one segment
 *     of an sge
 * (3) allocates TID group entries for those chunks
 * (4) programs rcvarray entries in the hardware corresponding to those
 *     TID's
 * (5) computes a tidarray with formatted TID entries which can be sent
 *     to the sender
 * (6) Reserves and programs HW flows.
 * (7) It also manages queuing the QP when TID/flow resources are not
 *     available.
 *
 * @req points to struct tid_rdma_request of which the segments are a part. The
 * function uses qp, rcd and seg_len members of @req. On success the flow at
 * req->setup_head has been prepared and req->setup_head is advanced by one.
 * For that flow, flow->tid_entry contains the TID array which the sender can
 * use for TID RDMA sends and flow->npkts contains number of packets required
 * to send the segment.
 *
 * hfi1_check_sge_align should be called prior to calling this function and if
 * it signals error TID RDMA cannot be used for this sge and this function
 * should not be called.
 *
 * For the queuing, caller must hold the flow->req->qp s_lock from the send
 * engine and the function will procure the exp_lock.
 *
 * Return:
 * 0 on success.
 * -EINVAL if the flow circular buffer has no space left or already holds
 * req->n_flows entries.
 * -ENOMEM if the pages of the segment could not be DMA mapped; the QP is
 * handed to hfi1_wait_kmem() before returning.
 * -EAGAIN if kernel_tid_waiters() reports waiters on the context's
 * rarr_queue or if not enough TIDs could be allocated. The QP is queued on
 * rarr_queue in this case and the function should be called again with the
 * previous arguments to retry the TID allocation.
 */
int hfi1_kern_exp_rcv_setup(struct tid_rdma_request *req,
			    struct rvt_sge_state *ss, bool *last)
	__must_hold(&req->qp->s_lock)
{
	struct tid_rdma_flow *flow = &req->flows[req->setup_head];
	struct hfi1_ctxtdata *rcd = req->rcd;
	struct hfi1_qp_priv *qpriv = req->qp->priv;
	unsigned long flags;
	struct rvt_qp *fqp;
	u16 clear_tail = req->clear_tail;

	lockdep_assert_held(&req->qp->s_lock);
	/*
	 * We return error if either (a) we don't have space in the flow
	 * circular buffer, or (b) we already have max entries in the buffer.
	 * Max entries depend on the type of request we are processing and the
	 * negotiated TID RDMA parameters.
	 */
	if (!CIRC_SPACE(req->setup_head, clear_tail, MAX_FLOWS) ||
	    CIRC_CNT(req->setup_head, clear_tail, MAX_FLOWS) >=
	    req->n_flows)
		return -EINVAL;

	/*
	 * Get pages, identify contiguous physical memory chunks for the segment
	 * If we can not determine a DMA address mapping we will treat it just
	 * like if we ran out of space above.
	 */
	if (kern_get_phys_blocks(flow, qpriv->pages, ss, last)) {
		hfi1_wait_kmem(flow->req->qp);
		return -ENOMEM;
	}

	spin_lock_irqsave(&rcd->exp_lock, flags);
	if (kernel_tid_waiters(rcd, &rcd->rarr_queue, flow->req->qp))
		goto queue;

	/*
	 * At this point we know the number of pagesets and hence the number of
	 * TID's to map the segment. Allocate the TID's from the TID groups. If
	 * we cannot allocate the required number we exit and try again later
	 */
	if (kern_alloc_tids(flow))
		goto queue;
	/*
	 * Finally program the TID entries with the pagesets, compute the
	 * tidarray and enable the HW flow
	 */
	kern_program_rcvarray(flow);

	/*
	 * Setup the flow state with relevant information.
	 * This information is used for tracking the sequence of data packets
	 * for the segment.
	 * The flow is setup here as this is the most accurate time and place
	 * to do so. Doing at a later time runs the risk of the flow data in
	 * qpriv getting out of sync.
	 */
	memset(&flow->flow_state, 0x0, sizeof(flow->flow_state));
	flow->idx = qpriv->flow_state.index;
	flow->flow_state.generation = qpriv->flow_state.generation;
	flow->flow_state.spsn = qpriv->flow_state.psn;
	flow->flow_state.lpsn = flow->flow_state.spsn + flow->npkts - 1;
	flow->flow_state.r_next_psn =
		full_flow_psn(flow, flow->flow_state.spsn);
	/* the next segment starts right after this one's packets */
	qpriv->flow_state.psn += flow->npkts;

	dequeue_tid_waiter(rcd, &rcd->rarr_queue, flow->req->qp);
	/* get head before dropping lock */
	fqp = first_qp(rcd, &rcd->rarr_queue);
	spin_unlock_irqrestore(&rcd->exp_lock, flags);
	tid_rdma_schedule_tid_wakeup(fqp);

	req->setup_head = (req->setup_head + 1) & (MAX_FLOWS - 1);
	return 0;
queue:
	queue_qp_for_tid_wait(rcd, &rcd->rarr_queue, flow->req->qp);
	spin_unlock_irqrestore(&rcd->exp_lock, flags);
	return -EAGAIN;
}
1512 
/*
 * Forget the pagesets computed for a flow. kern_get_phys_blocks() treats a
 * non-zero npagesets as "reuse the previous pagesets", so clearing it makes
 * the next setup of this flow collect its pages afresh.
 */
static void hfi1_tid_rdma_reset_flow(struct tid_rdma_flow *flow)
{
	flow->npagesets = 0;
}
1517 
1518 /*
1519  * This function is called after one segment has been successfully sent to
1520  * release the flow and TID HW/SW resources for that segment. The segments for a
1521  * TID RDMA request are setup and cleared in FIFO order which is managed using a
1522  * circular buffer.
1523  */
1524 int hfi1_kern_exp_rcv_clear(struct tid_rdma_request *req)
1525 	__must_hold(&req->qp->s_lock)
1526 {
1527 	struct tid_rdma_flow *flow = &req->flows[req->clear_tail];
1528 	struct hfi1_ctxtdata *rcd = req->rcd;
1529 	unsigned long flags;
1530 	int i;
1531 	struct rvt_qp *fqp;
1532 
1533 	lockdep_assert_held(&req->qp->s_lock);
1534 	/* Exit if we have nothing in the flow circular buffer */
1535 	if (!CIRC_CNT(req->setup_head, req->clear_tail, MAX_FLOWS))
1536 		return -EINVAL;
1537 
1538 	spin_lock_irqsave(&rcd->exp_lock, flags);
1539 
1540 	for (i = 0; i < flow->tnode_cnt; i++)
1541 		kern_unprogram_rcv_group(flow, i);
1542 	/* To prevent double unprogramming */
1543 	flow->tnode_cnt = 0;
1544 	/* get head before dropping lock */
1545 	fqp = first_qp(rcd, &rcd->rarr_queue);
1546 	spin_unlock_irqrestore(&rcd->exp_lock, flags);
1547 
1548 	dma_unmap_flow(flow);
1549 
1550 	hfi1_tid_rdma_reset_flow(flow);
1551 	req->clear_tail = (req->clear_tail + 1) & (MAX_FLOWS - 1);
1552 
1553 	if (fqp == req->qp) {
1554 		__trigger_tid_waiter(fqp);
1555 		rvt_put_qp(fqp);
1556 	} else {
1557 		tid_rdma_schedule_tid_wakeup(fqp);
1558 	}
1559 
1560 	return 0;
1561 }
1562 
1563 /*
1564  * This function is called to release all the tid entries for
1565  * a request.
1566  */
1567 void hfi1_kern_exp_rcv_clear_all(struct tid_rdma_request *req)
1568 	__must_hold(&req->qp->s_lock)
1569 {
1570 	/* Use memory barrier for proper ordering */
1571 	while (CIRC_CNT(req->setup_head, req->clear_tail, MAX_FLOWS)) {
1572 		if (hfi1_kern_exp_rcv_clear(req))
1573 			break;
1574 	}
1575 }
1576 
/**
 * hfi1_kern_exp_rcv_free_flows - free previously allocated flow information
 * @req: the tid rdma request to be cleaned
 *
 * Frees the flow array of @req and clears the pointer so that
 * hfi1_kern_exp_rcv_alloc_flows() allocates a new array on its next call.
 */
static void hfi1_kern_exp_rcv_free_flows(struct tid_rdma_request *req)
{
	kfree(req->flows);
	req->flows = NULL;
}
1586 
1587 /**
1588  * __trdma_clean_swqe - clean up for large sized QPs
1589  * @qp: the queue patch
1590  * @wqe: the send wqe
1591  */
1592 void __trdma_clean_swqe(struct rvt_qp *qp, struct rvt_swqe *wqe)
1593 {
1594 	struct hfi1_swqe_priv *p = wqe->priv;
1595 
1596 	hfi1_kern_exp_rcv_free_flows(&p->tid_req);
1597 }
1598 
1599 /*
1600  * This can be called at QP create time or in the data path.
1601  */
1602 static int hfi1_kern_exp_rcv_alloc_flows(struct tid_rdma_request *req,
1603 					 gfp_t gfp)
1604 {
1605 	struct tid_rdma_flow *flows;
1606 	int i;
1607 
1608 	if (likely(req->flows))
1609 		return 0;
1610 	flows = kmalloc_node(MAX_FLOWS * sizeof(*flows), gfp,
1611 			     req->rcd->numa_id);
1612 	if (!flows)
1613 		return -ENOMEM;
1614 	/* mini init */
1615 	for (i = 0; i < MAX_FLOWS; i++) {
1616 		flows[i].req = req;
1617 		flows[i].npagesets = 0;
1618 		flows[i].pagesets[0].mapped =  0;
1619 	}
1620 	req->flows = flows;
1621 	return 0;
1622 }
1623 
1624 static void hfi1_init_trdma_req(struct rvt_qp *qp,
1625 				struct tid_rdma_request *req)
1626 {
1627 	struct hfi1_qp_priv *qpriv = qp->priv;
1628 
1629 	/*
1630 	 * Initialize various TID RDMA request variables.
1631 	 * These variables are "static", which is why they
1632 	 * can be pre-initialized here before the WRs has
1633 	 * even been submitted.
1634 	 * However, non-NULL values for these variables do not
1635 	 * imply that this WQE has been enabled for TID RDMA.
1636 	 * Drivers should check the WQE's opcode to determine
1637 	 * if a request is a TID RDMA one or not.
1638 	 */
1639 	req->qp = qp;
1640 	req->rcd = qpriv->rcd;
1641 }
1642 
1643 u64 hfi1_access_sw_tid_wait(const struct cntr_entry *entry,
1644 			    void *context, int vl, int mode, u64 data)
1645 {
1646 	struct hfi1_devdata *dd = context;
1647 
1648 	return dd->verbs_dev.n_tidwait;
1649 }
1650 
1651 static struct tid_rdma_flow *find_flow_ib(struct tid_rdma_request *req,
1652 					  u32 psn, u16 *fidx)
1653 {
1654 	u16 head, tail;
1655 	struct tid_rdma_flow *flow;
1656 
1657 	head = req->setup_head;
1658 	tail = req->clear_tail;
1659 	for ( ; CIRC_CNT(head, tail, MAX_FLOWS);
1660 	     tail = CIRC_NEXT(tail, MAX_FLOWS)) {
1661 		flow = &req->flows[tail];
1662 		if (cmp_psn(psn, flow->flow_state.ib_spsn) >= 0 &&
1663 		    cmp_psn(psn, flow->flow_state.ib_lpsn) <= 0) {
1664 			if (fidx)
1665 				*fidx = tail;
1666 			return flow;
1667 		}
1668 	}
1669 	return NULL;
1670 }
1671 
1672 static struct tid_rdma_flow *
1673 __find_flow_ranged(struct tid_rdma_request *req, u16 head, u16 tail,
1674 		   u32 psn, u16 *fidx)
1675 {
1676 	for ( ; CIRC_CNT(head, tail, MAX_FLOWS);
1677 	      tail = CIRC_NEXT(tail, MAX_FLOWS)) {
1678 		struct tid_rdma_flow *flow = &req->flows[tail];
1679 		u32 spsn, lpsn;
1680 
1681 		spsn = full_flow_psn(flow, flow->flow_state.spsn);
1682 		lpsn = full_flow_psn(flow, flow->flow_state.lpsn);
1683 
1684 		if (cmp_psn(psn, spsn) >= 0 && cmp_psn(psn, lpsn) <= 0) {
1685 			if (fidx)
1686 				*fidx = tail;
1687 			return flow;
1688 		}
1689 	}
1690 	return NULL;
1691 }
1692 
1693 static struct tid_rdma_flow *find_flow(struct tid_rdma_request *req,
1694 				       u32 psn, u16 *fidx)
1695 {
1696 	return __find_flow_ranged(req, req->setup_head, req->clear_tail, psn,
1697 				  fidx);
1698 }
1699 
1700 /* TID RDMA READ functions */
/*
 * Build the TID RDMA READ REQ header and payload description for the flow
 * at req->flow_idx.
 *
 * @wqe: send wqe the request belongs to
 * @ohdr: header area; ohdr->u.tid_rdma.r_req is cleared and filled in
 * @bth1: in/out; the QPN bits are replaced with the remote TID RDMA qp
 * @bth2: out; set to the IB PSN of the request with IB_BTH_REQ_ACK or'ed in
 * @len: in: length of the data to read; out: length of the request payload
 *       (the flow's TID entries from flow->tid_idx onwards)
 *
 * The private sge state of the wqe is pointed at the flow's TID entries so
 * that they are sent as the packet payload. On the way out the segment and
 * flow cursors as well as the pending/ack counters are advanced.
 *
 * Return: size of the r_req header in 32-bit words.
 */
u32 hfi1_build_tid_rdma_read_packet(struct rvt_swqe *wqe,
				    struct ib_other_headers *ohdr, u32 *bth1,
				    u32 *bth2, u32 *len)
{
	struct tid_rdma_request *req = wqe_to_tid_req(wqe);
	struct tid_rdma_flow *flow = &req->flows[req->flow_idx];
	struct rvt_qp *qp = req->qp;
	struct hfi1_qp_priv *qpriv = qp->priv;
	struct hfi1_swqe_priv *wpriv = wqe->priv;
	struct tid_rdma_read_req *rreq = &ohdr->u.tid_rdma.r_req;
	struct tid_rdma_params *remote;
	u32 req_len = 0;
	void *req_addr = NULL;

	/* This is the IB psn used to send the request */
	*bth2 = mask_psn(flow->flow_state.ib_spsn + flow->pkt);
	trace_hfi1_tid_flow_build_read_pkt(qp, req->flow_idx, flow);

	/* TID Entries for TID RDMA READ payload */
	req_addr = &flow->tid_entry[flow->tid_idx];
	req_len = sizeof(*flow->tid_entry) *
			(flow->tidcnt - flow->tid_idx);

	memset(&ohdr->u.tid_rdma.r_req, 0, sizeof(ohdr->u.tid_rdma.r_req));
	wpriv->ss.sge.vaddr = req_addr;
	wpriv->ss.sge.sge_length = req_len;
	wpriv->ss.sge.length = wpriv->ss.sge.sge_length;
	/*
	 * We can safely zero these out. Since the first SGE covers the
	 * entire packet, nothing else should even look at the MR.
	 */
	wpriv->ss.sge.mr = NULL;
	wpriv->ss.sge.m = 0;
	wpriv->ss.sge.n = 0;

	wpriv->ss.sg_list = NULL;
	wpriv->ss.total_len = wpriv->ss.sge.sge_length;
	wpriv->ss.num_sge = 1;

	/* Construct the TID RDMA READ REQ packet header */
	rcu_read_lock();
	/* the remote parameters are only dereferenced inside the RCU section */
	remote = rcu_dereference(qpriv->tid_rdma.remote);

	KDETH_RESET(rreq->kdeth0, KVER, 0x1);
	KDETH_RESET(rreq->kdeth1, JKEY, remote->jkey);
	rreq->reth.vaddr = cpu_to_be64(wqe->rdma_wr.remote_addr +
			   req->cur_seg * req->seg_len + flow->sent);
	rreq->reth.rkey = cpu_to_be32(wqe->rdma_wr.rkey);
	rreq->reth.length = cpu_to_be32(*len);
	/* flow generation in the upper bits, flow sequence number below */
	rreq->tid_flow_psn =
		cpu_to_be32((flow->flow_state.generation <<
			     HFI1_KDETH_BTH_SEQ_SHIFT) |
			    ((flow->flow_state.spsn + flow->pkt) &
			     HFI1_KDETH_BTH_SEQ_MASK));
	rreq->tid_flow_qp =
		cpu_to_be32(qpriv->tid_rdma.local.qp |
			    ((flow->idx & TID_RDMA_DESTQP_FLOW_MASK) <<
			     TID_RDMA_DESTQP_FLOW_SHIFT) |
			    qpriv->rcd->ctxt);
	rreq->verbs_qp = cpu_to_be32(qp->remote_qpn);
	*bth1 &= ~RVT_QPN_MASK;
	*bth1 |= remote->qp;
	*bth2 |= IB_BTH_REQ_ACK;
	rcu_read_unlock();

	/* We are done with this segment */
	flow->sent += *len;
	req->cur_seg++;
	qp->s_state = TID_OP(READ_REQ);
	req->ack_pending++;
	req->flow_idx = (req->flow_idx + 1) & (MAX_FLOWS - 1);
	qpriv->pending_tid_r_segs++;
	qp->s_num_rd_atomic++;

	/* Set the TID RDMA READ request payload size */
	*len = req_len;

	return sizeof(ohdr->u.tid_rdma.r_req) / sizeof(u32);
}
1780 
/*
 * Prepare and build the TID RDMA READ request for the next segment of @wqe.
 *
 * @len: contains the data length to read upon entry and the read request
 *       payload length upon exit.
 *
 * When req->flow_idx has caught up with req->setup_head a new segment is
 * being sent: the HW flow and the TID resources for it are set up first.
 * Otherwise the segment is a resend and the resources set up earlier are
 * reused.
 *
 * Return: the number of header words reported by
 * hfi1_build_tid_rdma_read_packet(), or 0 when no packet was built because
 * the request has to wait (pending segments before a sync, no HW flow, or
 * no TID resources).
 */
u32 hfi1_build_tid_rdma_read_req(struct rvt_qp *qp, struct rvt_swqe *wqe,
				 struct ib_other_headers *ohdr, u32 *bth1,
				 u32 *bth2, u32 *len)
	__must_hold(&qp->s_lock)
{
	struct hfi1_qp_priv *qpriv = qp->priv;
	struct tid_rdma_request *req = wqe_to_tid_req(wqe);
	struct tid_rdma_flow *flow = NULL;
	u32 hdwords = 0;
	bool last;
	bool retry = true;
	/* packets needed for the requested length at the QP's MTU */
	u32 npkts = rvt_div_round_up_mtu(qp, *len);

	trace_hfi1_tid_req_build_read_req(qp, 0, wqe->wr.opcode, wqe->psn,
					  wqe->lpsn, req);
	/*
	 * Check sync conditions. Make sure that there are no pending
	 * segments before freeing the flow.
	 */
sync_check:
	if (req->state == TID_REQUEST_SYNC) {
		if (qpriv->pending_tid_r_segs)
			goto done;

		hfi1_kern_clear_hw_flow(req->rcd, qp);
		req->state = TID_REQUEST_ACTIVE;
	}

	/*
	 * If the request for this segment is resent, the tid resources should
	 * have been allocated before. In this case, req->flow_idx should
	 * fall behind req->setup_head.
	 */
	if (req->flow_idx == req->setup_head) {
		retry = false;
		if (req->state == TID_REQUEST_RESEND) {
			/*
			 * This is the first new segment for a request whose
			 * earlier segments have been re-sent. We need to
			 * set up the sge pointer correctly.
			 */
			restart_sge(&qp->s_sge, wqe, req->s_next_psn,
				    qp->pmtu);
			req->isge = 0;
			req->state = TID_REQUEST_ACTIVE;
		}

		/*
		 * Check sync. The last PSN of each generation is reserved for
		 * RESYNC.
		 */
		if ((qpriv->flow_state.psn + npkts) > MAX_TID_FLOW_PSN - 1) {
			req->state = TID_REQUEST_SYNC;
			goto sync_check;
		}

		/* Allocate the flow if not yet */
		if (hfi1_kern_setup_hw_flow(qpriv->rcd, qp))
			goto done;

		/*
		 * The following call will advance req->setup_head after
		 * allocating the tid entries.
		 */
		if (hfi1_kern_exp_rcv_setup(req, &qp->s_sge, &last)) {
			req->state = TID_REQUEST_QUEUED;

			/*
			 * We don't have resources for this segment. The QP has
			 * already been queued.
			 */
			goto done;
		}
	}

	/* req->flow_idx should only be one slot behind req->setup_head */
	flow = &req->flows[req->flow_idx];
	flow->pkt = 0;
	flow->tid_idx = 0;
	flow->sent = 0;
	if (!retry) {
		/* Set the first and last IB PSN for the flow in use.*/
		flow->flow_state.ib_spsn = req->s_next_psn;
		flow->flow_state.ib_lpsn =
			flow->flow_state.ib_spsn + flow->npkts - 1;
	}

	/* Calculate the next segment start psn.*/
	req->s_next_psn += flow->npkts;

	/* Build the packet header */
	hdwords = hfi1_build_tid_rdma_read_packet(wqe, ohdr, bth1, bth2, len);
done:
	return hdwords;
}
1880 
/*
 * Validate and accept the TID RDMA READ request parameters.
 *
 * The TID entries carried as packet payload are copied into the flow at
 * req->setup_head and checked: the payload must fit into flow->tid_entry,
 * no entry may have a zero length, and the entries together must cover at
 * least @len bytes. Only then are the flow, the ack entry @e and the
 * request state initialised from the packet and req->setup_head advanced.
 * Note that the flow's tid_entry, tidcnt and npkts members are already
 * overwritten when a later check rejects the request.
 *
 * Return 0 if the request is accepted successfully;
 * Return 1 otherwise.
 */
static int tid_rdma_rcv_read_request(struct rvt_qp *qp,
				     struct rvt_ack_entry *e,
				     struct hfi1_packet *packet,
				     struct ib_other_headers *ohdr,
				     u32 bth0, u32 psn, u64 vaddr, u32 len)
{
	struct hfi1_qp_priv *qpriv = qp->priv;
	struct tid_rdma_request *req;
	struct tid_rdma_flow *flow;
	u32 flow_psn, i, tidlen = 0, pktlen, tlen;

	req = ack_to_tid_req(e);

	/* Validate the payload first */
	flow = &req->flows[req->setup_head];

	/* payload length = packet length - (header length + ICRC length) */
	pktlen = packet->tlen - (packet->hlen + 4);
	/* never copy more than the flow's TID entry array can hold */
	if (pktlen > sizeof(flow->tid_entry))
		return 1;
	memcpy(flow->tid_entry, packet->ebuf, pktlen);
	flow->tidcnt = pktlen / sizeof(*flow->tid_entry);

	/*
	 * Walk the TID_ENTRY list to make sure we have enough space for a
	 * complete segment. Also calculate the number of required packets.
	 */
	flow->npkts = rvt_div_round_up_mtu(qp, len);
	for (i = 0; i < flow->tidcnt; i++) {
		trace_hfi1_tid_entry_rcv_read_req(qp, i,
						  flow->tid_entry[i]);
		tlen = EXP_TID_GET(flow->tid_entry[i], LEN);
		if (!tlen)
			return 1;

		/*
		 * For tid pair (tidctr == 3), the buffer size of the pair
		 * should be the sum of the buffer size described by each
		 * tid entry. However, only the first entry needs to be
		 * specified in the request (see WFR HAS Section 8.5.7.1).
		 */
		tidlen += tlen;
	}
	/* tidlen counts pages; reject if they cannot hold len bytes */
	if (tidlen * PAGE_SIZE < len)
		return 1;

	/* Empty the flow array */
	req->clear_tail = req->setup_head;
	flow->pkt = 0;
	flow->tid_idx = 0;
	flow->tid_offset = 0;
	flow->sent = 0;
	flow->tid_qpn = be32_to_cpu(ohdr->u.tid_rdma.r_req.tid_flow_qp);
	flow->idx = (flow->tid_qpn >> TID_RDMA_DESTQP_FLOW_SHIFT) &
		    TID_RDMA_DESTQP_FLOW_MASK;
	/* split the flow PSN into generation and sequence number */
	flow_psn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.r_req.tid_flow_psn));
	flow->flow_state.generation = flow_psn >> HFI1_KDETH_BTH_SEQ_SHIFT;
	flow->flow_state.spsn = flow_psn & HFI1_KDETH_BTH_SEQ_MASK;
	flow->length = len;

	flow->flow_state.lpsn = flow->flow_state.spsn +
		flow->npkts - 1;
	flow->flow_state.ib_spsn = psn;
	flow->flow_state.ib_lpsn = flow->flow_state.ib_spsn + flow->npkts - 1;

	trace_hfi1_tid_flow_rcv_read_req(qp, req->setup_head, flow);
	/* Set the initial flow index to the current flow. */
	req->flow_idx = req->setup_head;

	/* advance circular buffer head */
	req->setup_head = (req->setup_head + 1) & (MAX_FLOWS - 1);

	/*
	 * Compute last PSN for request.
	 */
	e->opcode = (bth0 >> 24) & 0xff;
	e->psn = psn;
	e->lpsn = psn + flow->npkts - 1;
	e->sent = 0;

	req->n_flows = qpriv->tid_rdma.local.max_read;
	req->state = TID_REQUEST_ACTIVE;
	req->cur_seg = 0;
	req->comp_seg = 0;
	req->ack_seg = 0;
	req->isge = 0;
	req->seg_len = qpriv->tid_rdma.local.max_len;
	req->total_len = len;
	req->total_segs = 1;
	req->r_flow_psn = e->psn;

	trace_hfi1_tid_req_rcv_read_req(qp, 0, e->opcode, e->psn, e->lpsn,
					req);
	return 0;
}
1981 
1982 static int tid_rdma_rcv_error(struct hfi1_packet *packet,
1983 			      struct ib_other_headers *ohdr,
1984 			      struct rvt_qp *qp, u32 psn, int diff)
1985 {
1986 	struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
1987 	struct hfi1_ctxtdata *rcd = ((struct hfi1_qp_priv *)qp->priv)->rcd;
1988 	struct hfi1_ibdev *dev = to_idev(qp->ibqp.device);
1989 	struct hfi1_qp_priv *qpriv = qp->priv;
1990 	struct rvt_ack_entry *e;
1991 	struct tid_rdma_request *req;
1992 	unsigned long flags;
1993 	u8 prev;
1994 	bool old_req;
1995 
1996 	trace_hfi1_rsp_tid_rcv_error(qp, psn);
1997 	trace_hfi1_tid_rdma_rcv_err(qp, 0, psn, diff);
1998 	if (diff > 0) {
1999 		/* sequence error */
2000 		if (!qp->r_nak_state) {
2001 			ibp->rvp.n_rc_seqnak++;
2002 			qp->r_nak_state = IB_NAK_PSN_ERROR;
2003 			qp->r_ack_psn = qp->r_psn;
2004 			rc_defered_ack(rcd, qp);
2005 		}
2006 		goto done;
2007 	}
2008 
2009 	ibp->rvp.n_rc_dupreq++;
2010 
2011 	spin_lock_irqsave(&qp->s_lock, flags);
2012 	e = find_prev_entry(qp, psn, &prev, NULL, &old_req);
2013 	if (!e || (e->opcode != TID_OP(READ_REQ) &&
2014 		   e->opcode != TID_OP(WRITE_REQ)))
2015 		goto unlock;
2016 
2017 	req = ack_to_tid_req(e);
2018 	req->r_flow_psn = psn;
2019 	trace_hfi1_tid_req_rcv_err(qp, 0, e->opcode, e->psn, e->lpsn, req);
2020 	if (e->opcode == TID_OP(READ_REQ)) {
2021 		struct ib_reth *reth;
2022 		u32 offset;
2023 		u32 len;
2024 		u32 rkey;
2025 		u64 vaddr;
2026 		int ok;
2027 		u32 bth0;
2028 
2029 		reth = &ohdr->u.tid_rdma.r_req.reth;
2030 		/*
2031 		 * The requester always restarts from the start of the original
2032 		 * request.
2033 		 */
2034 		offset = delta_psn(psn, e->psn) * qp->pmtu;
2035 		len = be32_to_cpu(reth->length);
2036 		if (psn != e->psn || len != req->total_len)
2037 			goto unlock;
2038 
2039 		if (e->rdma_sge.mr) {
2040 			rvt_put_mr(e->rdma_sge.mr);
2041 			e->rdma_sge.mr = NULL;
2042 		}
2043 
2044 		rkey = be32_to_cpu(reth->rkey);
2045 		vaddr = get_ib_reth_vaddr(reth);
2046 
2047 		qp->r_len = len;
2048 		ok = rvt_rkey_ok(qp, &e->rdma_sge, len, vaddr, rkey,
2049 				 IB_ACCESS_REMOTE_READ);
2050 		if (unlikely(!ok))
2051 			goto unlock;
2052 
2053 		/*
2054 		 * If all the response packets for the current request have
2055 		 * been sent out and this request is complete (old_request
2056 		 * == false) and the TID flow may be unusable (the
2057 		 * req->clear_tail is advanced). However, when an earlier
2058 		 * request is received, this request will not be complete any
2059 		 * more (qp->s_tail_ack_queue is moved back, see below).
2060 		 * Consequently, we need to update the TID flow info everytime
2061 		 * a duplicate request is received.
2062 		 */
2063 		bth0 = be32_to_cpu(ohdr->bth[0]);
2064 		if (tid_rdma_rcv_read_request(qp, e, packet, ohdr, bth0, psn,
2065 					      vaddr, len))
2066 			goto unlock;
2067 
2068 		/*
2069 		 * True if the request is already scheduled (between
2070 		 * qp->s_tail_ack_queue and qp->r_head_ack_queue);
2071 		 */
2072 		if (old_req)
2073 			goto unlock;
2074 	} else {
2075 		struct flow_state *fstate;
2076 		bool schedule = false;
2077 		u8 i;
2078 
2079 		if (req->state == TID_REQUEST_RESEND) {
2080 			req->state = TID_REQUEST_RESEND_ACTIVE;
2081 		} else if (req->state == TID_REQUEST_INIT_RESEND) {
2082 			req->state = TID_REQUEST_INIT;
2083 			schedule = true;
2084 		}
2085 
2086 		/*
2087 		 * True if the request is already scheduled (between
2088 		 * qp->s_tail_ack_queue and qp->r_head_ack_queue).
2089 		 * Also, don't change requests, which are at the SYNC
2090 		 * point and haven't generated any responses yet.
2091 		 * There is nothing to retransmit for them yet.
2092 		 */
2093 		if (old_req || req->state == TID_REQUEST_INIT ||
2094 		    (req->state == TID_REQUEST_SYNC && !req->cur_seg)) {
2095 			for (i = prev + 1; ; i++) {
2096 				if (i > rvt_size_atomic(&dev->rdi))
2097 					i = 0;
2098 				if (i == qp->r_head_ack_queue)
2099 					break;
2100 				e = &qp->s_ack_queue[i];
2101 				req = ack_to_tid_req(e);
2102 				if (e->opcode == TID_OP(WRITE_REQ) &&
2103 				    req->state == TID_REQUEST_INIT)
2104 					req->state = TID_REQUEST_INIT_RESEND;
2105 			}
2106 			/*
2107 			 * If the state of the request has been changed,
2108 			 * the first leg needs to get scheduled in order to
2109 			 * pick up the change. Otherwise, normal response
2110 			 * processing should take care of it.
2111 			 */
2112 			if (!schedule)
2113 				goto unlock;
2114 		}
2115 
2116 		/*
2117 		 * If there is no more allocated segment, just schedule the qp
2118 		 * without changing any state.
2119 		 */
2120 		if (req->clear_tail == req->setup_head)
2121 			goto schedule;
2122 		/*
2123 		 * If this request has sent responses for segments, which have
2124 		 * not received data yet (flow_idx != clear_tail), the flow_idx
2125 		 * pointer needs to be adjusted so the same responses can be
2126 		 * re-sent.
2127 		 */
2128 		if (CIRC_CNT(req->flow_idx, req->clear_tail, MAX_FLOWS)) {
2129 			fstate = &req->flows[req->clear_tail].flow_state;
2130 			qpriv->pending_tid_w_segs -=
2131 				CIRC_CNT(req->flow_idx, req->clear_tail,
2132 					 MAX_FLOWS);
2133 			req->flow_idx =
2134 				CIRC_ADD(req->clear_tail,
2135 					 delta_psn(psn, fstate->resp_ib_psn),
2136 					 MAX_FLOWS);
2137 			qpriv->pending_tid_w_segs +=
2138 				delta_psn(psn, fstate->resp_ib_psn);
2139 			/*
2140 			 * When flow_idx == setup_head, we've gotten a duplicate
2141 			 * request for a segment, which has not been allocated
2142 			 * yet. In that case, don't adjust this request.
2143 			 * However, we still want to go through the loop below
2144 			 * to adjust all subsequent requests.
2145 			 */
2146 			if (CIRC_CNT(req->setup_head, req->flow_idx,
2147 				     MAX_FLOWS)) {
2148 				req->cur_seg = delta_psn(psn, e->psn);
2149 				req->state = TID_REQUEST_RESEND_ACTIVE;
2150 			}
2151 		}
2152 
2153 		for (i = prev + 1; ; i++) {
2154 			/*
2155 			 * Look at everything up to and including
2156 			 * s_tail_ack_queue
2157 			 */
2158 			if (i > rvt_size_atomic(&dev->rdi))
2159 				i = 0;
2160 			if (i == qp->r_head_ack_queue)
2161 				break;
2162 			e = &qp->s_ack_queue[i];
2163 			req = ack_to_tid_req(e);
2164 			trace_hfi1_tid_req_rcv_err(qp, 0, e->opcode, e->psn,
2165 						   e->lpsn, req);
2166 			if (e->opcode != TID_OP(WRITE_REQ) ||
2167 			    req->cur_seg == req->comp_seg ||
2168 			    req->state == TID_REQUEST_INIT ||
2169 			    req->state == TID_REQUEST_INIT_RESEND) {
2170 				if (req->state == TID_REQUEST_INIT)
2171 					req->state = TID_REQUEST_INIT_RESEND;
2172 				continue;
2173 			}
2174 			qpriv->pending_tid_w_segs -=
2175 				CIRC_CNT(req->flow_idx,
2176 					 req->clear_tail,
2177 					 MAX_FLOWS);
2178 			req->flow_idx = req->clear_tail;
2179 			req->state = TID_REQUEST_RESEND;
2180 			req->cur_seg = req->comp_seg;
2181 		}
2182 		qpriv->s_flags &= ~HFI1_R_TID_WAIT_INTERLCK;
2183 	}
2184 	/* Re-process old requests.*/
2185 	if (qp->s_acked_ack_queue == qp->s_tail_ack_queue)
2186 		qp->s_acked_ack_queue = prev;
2187 	qp->s_tail_ack_queue = prev;
2188 	/*
2189 	 * Since the qp->s_tail_ack_queue is modified, the
2190 	 * qp->s_ack_state must be changed to re-initialize
2191 	 * qp->s_ack_rdma_sge; Otherwise, we will end up in
2192 	 * wrong memory region.
2193 	 */
2194 	qp->s_ack_state = OP(ACKNOWLEDGE);
2195 schedule:
2196 	/*
2197 	 * It's possible to receive a retry psn that is earlier than an RNRNAK
2198 	 * psn. In this case, the rnrnak state should be cleared.
2199 	 */
2200 	if (qpriv->rnr_nak_state) {
2201 		qp->s_nak_state = 0;
2202 		qpriv->rnr_nak_state = TID_RNR_NAK_INIT;
2203 		qp->r_psn = e->lpsn + 1;
2204 		hfi1_tid_write_alloc_resources(qp, true);
2205 	}
2206 
2207 	qp->r_state = e->opcode;
2208 	qp->r_nak_state = 0;
2209 	qp->s_flags |= RVT_S_RESP_PENDING;
2210 	hfi1_schedule_send(qp);
2211 unlock:
2212 	spin_unlock_irqrestore(&qp->s_lock, flags);
2213 done:
2214 	return 1;
2215 }
2216 
2217 void hfi1_rc_rcv_tid_rdma_read_req(struct hfi1_packet *packet)
2218 {
2219 	/* HANDLER FOR TID RDMA READ REQUEST packet (Responder side)*/
2220 
2221 	/*
2222 	 * 1. Verify TID RDMA READ REQ as per IB_OPCODE_RC_RDMA_READ
2223 	 *    (see hfi1_rc_rcv())
2224 	 * 2. Put TID RDMA READ REQ into the response queueu (s_ack_queue)
2225 	 *     - Setup struct tid_rdma_req with request info
2226 	 *     - Initialize struct tid_rdma_flow info;
2227 	 *     - Copy TID entries;
2228 	 * 3. Set the qp->s_ack_state.
2229 	 * 4. Set RVT_S_RESP_PENDING in s_flags.
2230 	 * 5. Kick the send engine (hfi1_schedule_send())
2231 	 */
2232 	struct hfi1_ctxtdata *rcd = packet->rcd;
2233 	struct rvt_qp *qp = packet->qp;
2234 	struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
2235 	struct ib_other_headers *ohdr = packet->ohdr;
2236 	struct rvt_ack_entry *e;
2237 	unsigned long flags;
2238 	struct ib_reth *reth;
2239 	struct hfi1_qp_priv *qpriv = qp->priv;
2240 	u32 bth0, psn, len, rkey;
2241 	bool is_fecn;
2242 	u8 next;
2243 	u64 vaddr;
2244 	int diff;
2245 	u8 nack_state = IB_NAK_INVALID_REQUEST;
2246 
2247 	bth0 = be32_to_cpu(ohdr->bth[0]);
2248 	if (hfi1_ruc_check_hdr(ibp, packet))
2249 		return;
2250 
2251 	is_fecn = process_ecn(qp, packet);
2252 	psn = mask_psn(be32_to_cpu(ohdr->bth[2]));
2253 	trace_hfi1_rsp_rcv_tid_read_req(qp, psn);
2254 
2255 	if (qp->state == IB_QPS_RTR && !(qp->r_flags & RVT_R_COMM_EST))
2256 		rvt_comm_est(qp);
2257 
2258 	if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ)))
2259 		goto nack_inv;
2260 
2261 	reth = &ohdr->u.tid_rdma.r_req.reth;
2262 	vaddr = be64_to_cpu(reth->vaddr);
2263 	len = be32_to_cpu(reth->length);
2264 	/* The length needs to be in multiples of PAGE_SIZE */
2265 	if (!len || len & ~PAGE_MASK || len > qpriv->tid_rdma.local.max_len)
2266 		goto nack_inv;
2267 
2268 	diff = delta_psn(psn, qp->r_psn);
2269 	if (unlikely(diff)) {
2270 		if (tid_rdma_rcv_error(packet, ohdr, qp, psn, diff))
2271 			return;
2272 		goto send_ack;
2273 	}
2274 
2275 	/* We've verified the request, insert it into the ack queue. */
2276 	next = qp->r_head_ack_queue + 1;
2277 	if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device)))
2278 		next = 0;
2279 	spin_lock_irqsave(&qp->s_lock, flags);
2280 	if (unlikely(next == qp->s_tail_ack_queue)) {
2281 		if (!qp->s_ack_queue[next].sent) {
2282 			nack_state = IB_NAK_REMOTE_OPERATIONAL_ERROR;
2283 			goto nack_inv_unlock;
2284 		}
2285 		update_ack_queue(qp, next);
2286 	}
2287 	e = &qp->s_ack_queue[qp->r_head_ack_queue];
2288 	if (e->rdma_sge.mr) {
2289 		rvt_put_mr(e->rdma_sge.mr);
2290 		e->rdma_sge.mr = NULL;
2291 	}
2292 
2293 	rkey = be32_to_cpu(reth->rkey);
2294 	qp->r_len = len;
2295 
2296 	if (unlikely(!rvt_rkey_ok(qp, &e->rdma_sge, qp->r_len, vaddr,
2297 				  rkey, IB_ACCESS_REMOTE_READ)))
2298 		goto nack_acc;
2299 
2300 	/* Accept the request parameters */
2301 	if (tid_rdma_rcv_read_request(qp, e, packet, ohdr, bth0, psn, vaddr,
2302 				      len))
2303 		goto nack_inv_unlock;
2304 
2305 	qp->r_state = e->opcode;
2306 	qp->r_nak_state = 0;
2307 	/*
2308 	 * We need to increment the MSN here instead of when we
2309 	 * finish sending the result since a duplicate request would
2310 	 * increment it more than once.
2311 	 */
2312 	qp->r_msn++;
2313 	qp->r_psn += e->lpsn - e->psn + 1;
2314 
2315 	qp->r_head_ack_queue = next;
2316 
2317 	/*
2318 	 * For all requests other than TID WRITE which are added to the ack
2319 	 * queue, qpriv->r_tid_alloc follows qp->r_head_ack_queue. It is ok to
2320 	 * do this because of interlocks between these and TID WRITE
2321 	 * requests. The same change has also been made in hfi1_rc_rcv().
2322 	 */
2323 	qpriv->r_tid_alloc = qp->r_head_ack_queue;
2324 
2325 	/* Schedule the send tasklet. */
2326 	qp->s_flags |= RVT_S_RESP_PENDING;
2327 	hfi1_schedule_send(qp);
2328 
2329 	spin_unlock_irqrestore(&qp->s_lock, flags);
2330 	if (is_fecn)
2331 		goto send_ack;
2332 	return;
2333 
2334 nack_inv_unlock:
2335 	spin_unlock_irqrestore(&qp->s_lock, flags);
2336 nack_inv:
2337 	rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
2338 	qp->r_nak_state = nack_state;
2339 	qp->r_ack_psn = qp->r_psn;
2340 	/* Queue NAK for later */
2341 	rc_defered_ack(rcd, qp);
2342 	return;
2343 nack_acc:
2344 	spin_unlock_irqrestore(&qp->s_lock, flags);
2345 	rvt_rc_error(qp, IB_WC_LOC_PROT_ERR);
2346 	qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR;
2347 	qp->r_ack_psn = qp->r_psn;
2348 send_ack:
2349 	hfi1_send_rc_ack(packet, is_fecn);
2350 }
2351 
2352 u32 hfi1_build_tid_rdma_read_resp(struct rvt_qp *qp, struct rvt_ack_entry *e,
2353 				  struct ib_other_headers *ohdr, u32 *bth0,
2354 				  u32 *bth1, u32 *bth2, u32 *len, bool *last)
2355 {
2356 	struct hfi1_ack_priv *epriv = e->priv;
2357 	struct tid_rdma_request *req = &epriv->tid_req;
2358 	struct hfi1_qp_priv *qpriv = qp->priv;
2359 	struct tid_rdma_flow *flow = &req->flows[req->clear_tail];
2360 	u32 tidentry = flow->tid_entry[flow->tid_idx];
2361 	u32 tidlen = EXP_TID_GET(tidentry, LEN) << PAGE_SHIFT;
2362 	struct tid_rdma_read_resp *resp = &ohdr->u.tid_rdma.r_rsp;
2363 	u32 next_offset, om = KDETH_OM_LARGE;
2364 	bool last_pkt;
2365 	u32 hdwords = 0;
2366 	struct tid_rdma_params *remote;
2367 
2368 	*len = min_t(u32, qp->pmtu, tidlen - flow->tid_offset);
2369 	flow->sent += *len;
2370 	next_offset = flow->tid_offset + *len;
2371 	last_pkt = (flow->sent >= flow->length);
2372 
2373 	trace_hfi1_tid_entry_build_read_resp(qp, flow->tid_idx, tidentry);
2374 	trace_hfi1_tid_flow_build_read_resp(qp, req->clear_tail, flow);
2375 
2376 	rcu_read_lock();
2377 	remote = rcu_dereference(qpriv->tid_rdma.remote);
2378 	if (!remote) {
2379 		rcu_read_unlock();
2380 		goto done;
2381 	}
2382 	KDETH_RESET(resp->kdeth0, KVER, 0x1);
2383 	KDETH_SET(resp->kdeth0, SH, !last_pkt);
2384 	KDETH_SET(resp->kdeth0, INTR, !!(!last_pkt && remote->urg));
2385 	KDETH_SET(resp->kdeth0, TIDCTRL, EXP_TID_GET(tidentry, CTRL));
2386 	KDETH_SET(resp->kdeth0, TID, EXP_TID_GET(tidentry, IDX));
2387 	KDETH_SET(resp->kdeth0, OM, om == KDETH_OM_LARGE);
2388 	KDETH_SET(resp->kdeth0, OFFSET, flow->tid_offset / om);
2389 	KDETH_RESET(resp->kdeth1, JKEY, remote->jkey);
2390 	resp->verbs_qp = cpu_to_be32(qp->remote_qpn);
2391 	rcu_read_unlock();
2392 
2393 	resp->aeth = rvt_compute_aeth(qp);
2394 	resp->verbs_psn = cpu_to_be32(mask_psn(flow->flow_state.ib_spsn +
2395 					       flow->pkt));
2396 
2397 	*bth0 = TID_OP(READ_RESP) << 24;
2398 	*bth1 = flow->tid_qpn;
2399 	*bth2 = mask_psn(((flow->flow_state.spsn + flow->pkt++) &
2400 			  HFI1_KDETH_BTH_SEQ_MASK) |
2401 			 (flow->flow_state.generation <<
2402 			  HFI1_KDETH_BTH_SEQ_SHIFT));
2403 	*last = last_pkt;
2404 	if (last_pkt)
2405 		/* Advance to next flow */
2406 		req->clear_tail = (req->clear_tail + 1) &
2407 				  (MAX_FLOWS - 1);
2408 
2409 	if (next_offset >= tidlen) {
2410 		flow->tid_offset = 0;
2411 		flow->tid_idx++;
2412 	} else {
2413 		flow->tid_offset = next_offset;
2414 	}
2415 
2416 	hdwords = sizeof(ohdr->u.tid_rdma.r_rsp) / sizeof(u32);
2417 
2418 done:
2419 	return hdwords;
2420 }
2421 
2422 static inline struct tid_rdma_request *
2423 find_tid_request(struct rvt_qp *qp, u32 psn, enum ib_wr_opcode opcode)
2424 	__must_hold(&qp->s_lock)
2425 {
2426 	struct rvt_swqe *wqe;
2427 	struct tid_rdma_request *req = NULL;
2428 	u32 i, end;
2429 
2430 	end = qp->s_cur + 1;
2431 	if (end == qp->s_size)
2432 		end = 0;
2433 	for (i = qp->s_acked; i != end;) {
2434 		wqe = rvt_get_swqe_ptr(qp, i);
2435 		if (cmp_psn(psn, wqe->psn) >= 0 &&
2436 		    cmp_psn(psn, wqe->lpsn) <= 0) {
2437 			if (wqe->wr.opcode == opcode)
2438 				req = wqe_to_tid_req(wqe);
2439 			break;
2440 		}
2441 		if (++i == qp->s_size)
2442 			i = 0;
2443 	}
2444 
2445 	return req;
2446 }
2447 
2448 void hfi1_rc_rcv_tid_rdma_read_resp(struct hfi1_packet *packet)
2449 {
2450 	/* HANDLER FOR TID RDMA READ RESPONSE packet (Requestor side */
2451 
2452 	/*
2453 	 * 1. Find matching SWQE
2454 	 * 2. Check that the entire segment has been read.
2455 	 * 3. Remove HFI1_S_WAIT_TID_RESP from s_flags.
2456 	 * 4. Free the TID flow resources.
2457 	 * 5. Kick the send engine (hfi1_schedule_send())
2458 	 */
2459 	struct ib_other_headers *ohdr = packet->ohdr;
2460 	struct rvt_qp *qp = packet->qp;
2461 	struct hfi1_qp_priv *priv = qp->priv;
2462 	struct hfi1_ctxtdata *rcd = packet->rcd;
2463 	struct tid_rdma_request *req;
2464 	struct tid_rdma_flow *flow;
2465 	u32 opcode, aeth;
2466 	bool is_fecn;
2467 	unsigned long flags;
2468 	u32 kpsn, ipsn;
2469 
2470 	trace_hfi1_sender_rcv_tid_read_resp(qp);
2471 	is_fecn = process_ecn(qp, packet);
2472 	kpsn = mask_psn(be32_to_cpu(ohdr->bth[2]));
2473 	aeth = be32_to_cpu(ohdr->u.tid_rdma.r_rsp.aeth);
2474 	opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff;
2475 
2476 	spin_lock_irqsave(&qp->s_lock, flags);
2477 	ipsn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.r_rsp.verbs_psn));
2478 	req = find_tid_request(qp, ipsn, IB_WR_TID_RDMA_READ);
2479 	if (unlikely(!req))
2480 		goto ack_op_err;
2481 
2482 	flow = &req->flows[req->clear_tail];
2483 	/* When header suppression is disabled */
2484 	if (cmp_psn(ipsn, flow->flow_state.ib_lpsn))
2485 		goto ack_done;
2486 	req->ack_pending--;
2487 	priv->pending_tid_r_segs--;
2488 	qp->s_num_rd_atomic--;
2489 	if ((qp->s_flags & RVT_S_WAIT_FENCE) &&
2490 	    !qp->s_num_rd_atomic) {
2491 		qp->s_flags &= ~(RVT_S_WAIT_FENCE |
2492 				 RVT_S_WAIT_ACK);
2493 		hfi1_schedule_send(qp);
2494 	}
2495 	if (qp->s_flags & RVT_S_WAIT_RDMAR) {
2496 		qp->s_flags &= ~(RVT_S_WAIT_RDMAR | RVT_S_WAIT_ACK);
2497 		hfi1_schedule_send(qp);
2498 	}
2499 
2500 	trace_hfi1_ack(qp, ipsn);
2501 	trace_hfi1_tid_req_rcv_read_resp(qp, 0, req->e.swqe->wr.opcode,
2502 					 req->e.swqe->psn, req->e.swqe->lpsn,
2503 					 req);
2504 	trace_hfi1_tid_flow_rcv_read_resp(qp, req->clear_tail, flow);
2505 
2506 	/* Release the tid resources */
2507 	hfi1_kern_exp_rcv_clear(req);
2508 
2509 	if (!do_rc_ack(qp, aeth, ipsn, opcode, 0, rcd))
2510 		goto ack_done;
2511 
2512 	/* If not done yet, build next read request */
2513 	if (++req->comp_seg >= req->total_segs) {
2514 		priv->tid_r_comp++;
2515 		req->state = TID_REQUEST_COMPLETE;
2516 	}
2517 
2518 	/*
2519 	 * Clear the hw flow under two conditions:
2520 	 * 1. This request is a sync point and it is complete;
2521 	 * 2. Current request is completed and there are no more requests.
2522 	 */
2523 	if ((req->state == TID_REQUEST_SYNC &&
2524 	     req->comp_seg == req->cur_seg) ||
2525 	    priv->tid_r_comp == priv->tid_r_reqs) {
2526 		hfi1_kern_clear_hw_flow(priv->rcd, qp);
2527 		if (req->state == TID_REQUEST_SYNC)
2528 			req->state = TID_REQUEST_ACTIVE;
2529 	}
2530 
2531 	hfi1_schedule_send(qp);
2532 	goto ack_done;
2533 
2534 ack_op_err:
2535 	/*
2536 	 * The test indicates that the send engine has finished its cleanup
2537 	 * after sending the request and it's now safe to put the QP into error
2538 	 * state. However, if the wqe queue is empty (qp->s_acked == qp->s_tail
2539 	 * == qp->s_head), it would be unsafe to complete the wqe pointed by
2540 	 * qp->s_acked here. Putting the qp into error state will safely flush
2541 	 * all remaining requests.
2542 	 */
2543 	if (qp->s_last == qp->s_acked)
2544 		rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
2545 
2546 ack_done:
2547 	spin_unlock_irqrestore(&qp->s_lock, flags);
2548 	if (is_fecn)
2549 		hfi1_send_rc_ack(packet, is_fecn);
2550 }
2551 
2552 void hfi1_kern_read_tid_flow_free(struct rvt_qp *qp)
2553 	__must_hold(&qp->s_lock)
2554 {
2555 	u32 n = qp->s_acked;
2556 	struct rvt_swqe *wqe;
2557 	struct tid_rdma_request *req;
2558 	struct hfi1_qp_priv *priv = qp->priv;
2559 
2560 	lockdep_assert_held(&qp->s_lock);
2561 	/* Free any TID entries */
2562 	while (n != qp->s_tail) {
2563 		wqe = rvt_get_swqe_ptr(qp, n);
2564 		if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) {
2565 			req = wqe_to_tid_req(wqe);
2566 			hfi1_kern_exp_rcv_clear_all(req);
2567 		}
2568 
2569 		if (++n == qp->s_size)
2570 			n = 0;
2571 	}
2572 	/* Free flow */
2573 	hfi1_kern_clear_hw_flow(priv->rcd, qp);
2574 }
2575 
2576 static bool tid_rdma_tid_err(struct hfi1_ctxtdata *rcd,
2577 			     struct hfi1_packet *packet, u8 rcv_type,
2578 			     u8 opcode)
2579 {
2580 	struct rvt_qp *qp = packet->qp;
2581 	struct hfi1_qp_priv *qpriv = qp->priv;
2582 	u32 ipsn;
2583 	struct ib_other_headers *ohdr = packet->ohdr;
2584 	struct rvt_ack_entry *e;
2585 	struct tid_rdma_request *req;
2586 	struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device);
2587 	u32 i;
2588 
2589 	if (rcv_type >= RHF_RCV_TYPE_IB)
2590 		goto done;
2591 
2592 	spin_lock(&qp->s_lock);
2593 
2594 	/*
2595 	 * We've ran out of space in the eager buffer.
2596 	 * Eagerly received KDETH packets which require space in the
2597 	 * Eager buffer (packet that have payload) are TID RDMA WRITE
2598 	 * response packets. In this case, we have to re-transmit the
2599 	 * TID RDMA WRITE request.
2600 	 */
2601 	if (rcv_type == RHF_RCV_TYPE_EAGER) {
2602 		hfi1_restart_rc(qp, qp->s_last_psn + 1, 1);
2603 		hfi1_schedule_send(qp);
2604 		goto done_unlock;
2605 	}
2606 
2607 	/*
2608 	 * For TID READ response, error out QP after freeing the tid
2609 	 * resources.
2610 	 */
2611 	if (opcode == TID_OP(READ_RESP)) {
2612 		ipsn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.r_rsp.verbs_psn));
2613 		if (cmp_psn(ipsn, qp->s_last_psn) > 0 &&
2614 		    cmp_psn(ipsn, qp->s_psn) < 0) {
2615 			hfi1_kern_read_tid_flow_free(qp);
2616 			spin_unlock(&qp->s_lock);
2617 			rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
2618 			goto done;
2619 		}
2620 		goto done_unlock;
2621 	}
2622 
2623 	/*
2624 	 * Error out the qp for TID RDMA WRITE
2625 	 */
2626 	hfi1_kern_clear_hw_flow(qpriv->rcd, qp);
2627 	for (i = 0; i < rvt_max_atomic(rdi); i++) {
2628 		e = &qp->s_ack_queue[i];
2629 		if (e->opcode == TID_OP(WRITE_REQ)) {
2630 			req = ack_to_tid_req(e);
2631 			hfi1_kern_exp_rcv_clear_all(req);
2632 		}
2633 	}
2634 	spin_unlock(&qp->s_lock);
2635 	rvt_rc_error(qp, IB_WC_LOC_LEN_ERR);
2636 	goto done;
2637 
2638 done_unlock:
2639 	spin_unlock(&qp->s_lock);
2640 done:
2641 	return true;
2642 }
2643 
2644 static void restart_tid_rdma_read_req(struct hfi1_ctxtdata *rcd,
2645 				      struct rvt_qp *qp, struct rvt_swqe *wqe)
2646 {
2647 	struct tid_rdma_request *req;
2648 	struct tid_rdma_flow *flow;
2649 
2650 	/* Start from the right segment */
2651 	qp->r_flags |= RVT_R_RDMAR_SEQ;
2652 	req = wqe_to_tid_req(wqe);
2653 	flow = &req->flows[req->clear_tail];
2654 	hfi1_restart_rc(qp, flow->flow_state.ib_spsn, 0);
2655 	if (list_empty(&qp->rspwait)) {
2656 		qp->r_flags |= RVT_R_RSP_SEND;
2657 		rvt_get_qp(qp);
2658 		list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
2659 	}
2660 }
2661 
2662 /*
2663  * Handle the KDETH eflags for TID RDMA READ response.
2664  *
2665  * Return true if the last packet for a segment has been received and it is
2666  * time to process the response normally; otherwise, return true.
2667  *
2668  * The caller must hold the packet->qp->r_lock and the rcu_read_lock.
2669  */
2670 static bool handle_read_kdeth_eflags(struct hfi1_ctxtdata *rcd,
2671 				     struct hfi1_packet *packet, u8 rcv_type,
2672 				     u8 rte, u32 psn, u32 ibpsn)
2673 	__must_hold(&packet->qp->r_lock) __must_hold(RCU)
2674 {
2675 	struct hfi1_pportdata *ppd = rcd->ppd;
2676 	struct hfi1_devdata *dd = ppd->dd;
2677 	struct hfi1_ibport *ibp;
2678 	struct rvt_swqe *wqe;
2679 	struct tid_rdma_request *req;
2680 	struct tid_rdma_flow *flow;
2681 	u32 ack_psn;
2682 	struct rvt_qp *qp = packet->qp;
2683 	struct hfi1_qp_priv *priv = qp->priv;
2684 	bool ret = true;
2685 	int diff = 0;
2686 	u32 fpsn;
2687 
2688 	lockdep_assert_held(&qp->r_lock);
2689 	/* If the psn is out of valid range, drop the packet */
2690 	if (cmp_psn(ibpsn, qp->s_last_psn) < 0 ||
2691 	    cmp_psn(ibpsn, qp->s_psn) > 0)
2692 		return ret;
2693 
2694 	spin_lock(&qp->s_lock);
2695 	/*
2696 	 * Note that NAKs implicitly ACK outstanding SEND and RDMA write
2697 	 * requests and implicitly NAK RDMA read and atomic requests issued
2698 	 * before the NAK'ed request.
2699 	 */
2700 	ack_psn = ibpsn - 1;
2701 	wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
2702 	ibp = to_iport(qp->ibqp.device, qp->port_num);
2703 
2704 	/* Complete WQEs that the PSN finishes. */
2705 	while ((int)delta_psn(ack_psn, wqe->lpsn) >= 0) {
2706 		/*
2707 		 * If this request is a RDMA read or atomic, and the NACK is
2708 		 * for a later operation, this NACK NAKs the RDMA read or
2709 		 * atomic.
2710 		 */
2711 		if (wqe->wr.opcode == IB_WR_RDMA_READ ||
2712 		    wqe->wr.opcode == IB_WR_TID_RDMA_READ ||
2713 		    wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
2714 		    wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) {
2715 			/* Retry this request. */
2716 			if (!(qp->r_flags & RVT_R_RDMAR_SEQ)) {
2717 				qp->r_flags |= RVT_R_RDMAR_SEQ;
2718 				if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) {
2719 					restart_tid_rdma_read_req(rcd, qp,
2720 								  wqe);
2721 				} else {
2722 					hfi1_restart_rc(qp, qp->s_last_psn + 1,
2723 							0);
2724 					if (list_empty(&qp->rspwait)) {
2725 						qp->r_flags |= RVT_R_RSP_SEND;
2726 						rvt_get_qp(qp);
2727 						list_add_tail(/* wait */
2728 						   &qp->rspwait,
2729 						   &rcd->qp_wait_list);
2730 					}
2731 				}
2732 			}
2733 			/*
2734 			 * No need to process the NAK since we are
2735 			 * restarting an earlier request.
2736 			 */
2737 			break;
2738 		}
2739 
2740 		wqe = do_rc_completion(qp, wqe, ibp);
2741 		if (qp->s_acked == qp->s_tail)
2742 			break;
2743 	}
2744 
2745 	/* Handle the eflags for the request */
2746 	if (wqe->wr.opcode != IB_WR_TID_RDMA_READ)
2747 		goto s_unlock;
2748 
2749 	req = wqe_to_tid_req(wqe);
2750 	switch (rcv_type) {
2751 	case RHF_RCV_TYPE_EXPECTED:
2752 		switch (rte) {
2753 		case RHF_RTE_EXPECTED_FLOW_SEQ_ERR:
2754 			/*
2755 			 * On the first occurrence of a Flow Sequence error,
2756 			 * the flag TID_FLOW_SW_PSN is set.
2757 			 *
2758 			 * After that, the flow is *not* reprogrammed and the
2759 			 * protocol falls back to SW PSN checking. This is done
2760 			 * to prevent continuous Flow Sequence errors for any
2761 			 * packets that could be still in the fabric.
2762 			 */
2763 			flow = find_flow(req, psn, NULL);
2764 			if (!flow) {
2765 				/*
2766 				 * We can't find the IB PSN matching the
2767 				 * received KDETH PSN. The only thing we can
2768 				 * do at this point is report the error to
2769 				 * the QP.
2770 				 */
2771 				hfi1_kern_read_tid_flow_free(qp);
2772 				spin_unlock(&qp->s_lock);
2773 				rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
2774 				return ret;
2775 			}
2776 			if (priv->flow_state.flags & TID_FLOW_SW_PSN) {
2777 				diff = cmp_psn(psn,
2778 					       priv->flow_state.r_next_psn);
2779 				if (diff > 0) {
2780 					if (!(qp->r_flags & RVT_R_RDMAR_SEQ))
2781 						restart_tid_rdma_read_req(rcd,
2782 									  qp,
2783 									  wqe);
2784 
2785 					/* Drop the packet.*/
2786 					goto s_unlock;
2787 				} else if (diff < 0) {
2788 					/*
2789 					 * If a response packet for a restarted
2790 					 * request has come back, reset the
2791 					 * restart flag.
2792 					 */
2793 					if (qp->r_flags & RVT_R_RDMAR_SEQ)
2794 						qp->r_flags &=
2795 							~RVT_R_RDMAR_SEQ;
2796 
2797 					/* Drop the packet.*/
2798 					goto s_unlock;
2799 				}
2800 
2801 				/*
2802 				 * If SW PSN verification is successful and
2803 				 * this is the last packet in the segment, tell
2804 				 * the caller to process it as a normal packet.
2805 				 */
2806 				fpsn = full_flow_psn(flow,
2807 						     flow->flow_state.lpsn);
2808 				if (cmp_psn(fpsn, psn) == 0) {
2809 					ret = false;
2810 					if (qp->r_flags & RVT_R_RDMAR_SEQ)
2811 						qp->r_flags &=
2812 							~RVT_R_RDMAR_SEQ;
2813 				}
2814 				priv->flow_state.r_next_psn++;
2815 			} else {
2816 				u64 reg;
2817 				u32 last_psn;
2818 
2819 				/*
2820 				 * The only sane way to get the amount of
2821 				 * progress is to read the HW flow state.
2822 				 */
2823 				reg = read_uctxt_csr(dd, rcd->ctxt,
2824 						     RCV_TID_FLOW_TABLE +
2825 						     (8 * flow->idx));
2826 				last_psn = mask_psn(reg);
2827 
2828 				priv->flow_state.r_next_psn = last_psn;
2829 				priv->flow_state.flags |= TID_FLOW_SW_PSN;
2830 				/*
2831 				 * If no request has been restarted yet,
2832 				 * restart the current one.
2833 				 */
2834 				if (!(qp->r_flags & RVT_R_RDMAR_SEQ))
2835 					restart_tid_rdma_read_req(rcd, qp,
2836 								  wqe);
2837 			}
2838 
2839 			break;
2840 
2841 		case RHF_RTE_EXPECTED_FLOW_GEN_ERR:
2842 			/*
2843 			 * Since the TID flow is able to ride through
2844 			 * generation mismatch, drop this stale packet.
2845 			 */
2846 			break;
2847 
2848 		default:
2849 			break;
2850 		}
2851 		break;
2852 
2853 	case RHF_RCV_TYPE_ERROR:
2854 		switch (rte) {
2855 		case RHF_RTE_ERROR_OP_CODE_ERR:
2856 		case RHF_RTE_ERROR_KHDR_MIN_LEN_ERR:
2857 		case RHF_RTE_ERROR_KHDR_HCRC_ERR:
2858 		case RHF_RTE_ERROR_KHDR_KVER_ERR:
2859 		case RHF_RTE_ERROR_CONTEXT_ERR:
2860 		case RHF_RTE_ERROR_KHDR_TID_ERR:
2861 		default:
2862 			break;
2863 		}
2864 	default:
2865 		break;
2866 	}
2867 s_unlock:
2868 	spin_unlock(&qp->s_lock);
2869 	return ret;
2870 }
2871 
2872 bool hfi1_handle_kdeth_eflags(struct hfi1_ctxtdata *rcd,
2873 			      struct hfi1_pportdata *ppd,
2874 			      struct hfi1_packet *packet)
2875 {
2876 	struct hfi1_ibport *ibp = &ppd->ibport_data;
2877 	struct hfi1_devdata *dd = ppd->dd;
2878 	struct rvt_dev_info *rdi = &dd->verbs_dev.rdi;
2879 	u8 rcv_type = rhf_rcv_type(packet->rhf);
2880 	u8 rte = rhf_rcv_type_err(packet->rhf);
2881 	struct ib_header *hdr = packet->hdr;
2882 	struct ib_other_headers *ohdr = NULL;
2883 	int lnh = be16_to_cpu(hdr->lrh[0]) & 3;
2884 	u16 lid  = be16_to_cpu(hdr->lrh[1]);
2885 	u8 opcode;
2886 	u32 qp_num, psn, ibpsn;
2887 	struct rvt_qp *qp;
2888 	struct hfi1_qp_priv *qpriv;
2889 	unsigned long flags;
2890 	bool ret = true;
2891 	struct rvt_ack_entry *e;
2892 	struct tid_rdma_request *req;
2893 	struct tid_rdma_flow *flow;
2894 
2895 	trace_hfi1_msg_handle_kdeth_eflags(NULL, "Kdeth error: rhf ",
2896 					   packet->rhf);
2897 	if (packet->rhf & (RHF_VCRC_ERR | RHF_ICRC_ERR))
2898 		return ret;
2899 
2900 	packet->ohdr = &hdr->u.oth;
2901 	ohdr = packet->ohdr;
2902 	trace_input_ibhdr(rcd->dd, packet, !!(rhf_dc_info(packet->rhf)));
2903 
2904 	/* Get the destination QP number. */
2905 	qp_num = be32_to_cpu(ohdr->u.tid_rdma.r_rsp.verbs_qp) &
2906 		RVT_QPN_MASK;
2907 	if (lid >= be16_to_cpu(IB_MULTICAST_LID_BASE))
2908 		goto drop;
2909 
2910 	psn = mask_psn(be32_to_cpu(ohdr->bth[2]));
2911 	opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff;
2912 
2913 	rcu_read_lock();
2914 	qp = rvt_lookup_qpn(rdi, &ibp->rvp, qp_num);
2915 	if (!qp)
2916 		goto rcu_unlock;
2917 
2918 	packet->qp = qp;
2919 
2920 	/* Check for valid receive state. */
2921 	spin_lock_irqsave(&qp->r_lock, flags);
2922 	if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) {
2923 		ibp->rvp.n_pkt_drops++;
2924 		goto r_unlock;
2925 	}
2926 
2927 	if (packet->rhf & RHF_TID_ERR) {
2928 		/* For TIDERR and RC QPs preemptively schedule a NAK */
2929 		u32 tlen = rhf_pkt_len(packet->rhf); /* in bytes */
2930 
2931 		/* Sanity check packet */
2932 		if (tlen < 24)
2933 			goto r_unlock;
2934 
2935 		/*
2936 		 * Check for GRH. We should never get packets with GRH in this
2937 		 * path.
2938 		 */
2939 		if (lnh == HFI1_LRH_GRH)
2940 			goto r_unlock;
2941 
2942 		if (tid_rdma_tid_err(rcd, packet, rcv_type, opcode))
2943 			goto r_unlock;
2944 	}
2945 
2946 	/* handle TID RDMA READ */
2947 	if (opcode == TID_OP(READ_RESP)) {
2948 		ibpsn = be32_to_cpu(ohdr->u.tid_rdma.r_rsp.verbs_psn);
2949 		ibpsn = mask_psn(ibpsn);
2950 		ret = handle_read_kdeth_eflags(rcd, packet, rcv_type, rte, psn,
2951 					       ibpsn);
2952 		goto r_unlock;
2953 	}
2954 
2955 	/*
2956 	 * qp->s_tail_ack_queue points to the rvt_ack_entry currently being
2957 	 * processed. These a completed sequentially so we can be sure that
2958 	 * the pointer will not change until the entire request has completed.
2959 	 */
2960 	spin_lock(&qp->s_lock);
2961 	qpriv = qp->priv;
2962 	e = &qp->s_ack_queue[qpriv->r_tid_tail];
2963 	req = ack_to_tid_req(e);
2964 	flow = &req->flows[req->clear_tail];
2965 	trace_hfi1_eflags_err_write(qp, rcv_type, rte, psn);
2966 	trace_hfi1_rsp_handle_kdeth_eflags(qp, psn);
2967 	trace_hfi1_tid_write_rsp_handle_kdeth_eflags(qp);
2968 	trace_hfi1_tid_req_handle_kdeth_eflags(qp, 0, e->opcode, e->psn,
2969 					       e->lpsn, req);
2970 	trace_hfi1_tid_flow_handle_kdeth_eflags(qp, req->clear_tail, flow);
2971 
2972 	switch (rcv_type) {
2973 	case RHF_RCV_TYPE_EXPECTED:
2974 		switch (rte) {
2975 		case RHF_RTE_EXPECTED_FLOW_SEQ_ERR:
2976 			if (!(qpriv->s_flags & HFI1_R_TID_SW_PSN)) {
2977 				u64 reg;
2978 
2979 				qpriv->s_flags |= HFI1_R_TID_SW_PSN;
2980 				/*
2981 				 * The only sane way to get the amount of
2982 				 * progress is to read the HW flow state.
2983 				 */
2984 				reg = read_uctxt_csr(dd, rcd->ctxt,
2985 						     RCV_TID_FLOW_TABLE +
2986 						     (8 * flow->idx));
2987 				flow->flow_state.r_next_psn = mask_psn(reg);
2988 				qpriv->r_next_psn_kdeth =
2989 					flow->flow_state.r_next_psn;
2990 				goto nak_psn;
2991 			} else {
2992 				/*
2993 				 * If the received PSN does not match the next
2994 				 * expected PSN, NAK the packet.
2995 				 * However, only do that if we know that the a
2996 				 * NAK has already been sent. Otherwise, this
2997 				 * mismatch could be due to packets that were
2998 				 * already in flight.
2999 				 */
3000 				if (psn != flow->flow_state.r_next_psn) {
3001 					psn = flow->flow_state.r_next_psn;
3002 					goto nak_psn;
3003 				}
3004 
3005 				qpriv->s_nak_state = 0;
3006 				/*
3007 				 * If SW PSN verification is successful and this
3008 				 * is the last packet in the segment, tell the
3009 				 * caller to process it as a normal packet.
3010 				 */
3011 				if (psn == full_flow_psn(flow,
3012 							 flow->flow_state.lpsn))
3013 					ret = false;
3014 				qpriv->r_next_psn_kdeth =
3015 					++flow->flow_state.r_next_psn;
3016 			}
3017 			break;
3018 
3019 		case RHF_RTE_EXPECTED_FLOW_GEN_ERR:
3020 			goto nak_psn;
3021 
3022 		default:
3023 			break;
3024 		}
3025 		break;
3026 
3027 	case RHF_RCV_TYPE_ERROR:
3028 		switch (rte) {
3029 		case RHF_RTE_ERROR_OP_CODE_ERR:
3030 		case RHF_RTE_ERROR_KHDR_MIN_LEN_ERR:
3031 		case RHF_RTE_ERROR_KHDR_HCRC_ERR:
3032 		case RHF_RTE_ERROR_KHDR_KVER_ERR:
3033 		case RHF_RTE_ERROR_CONTEXT_ERR:
3034 		case RHF_RTE_ERROR_KHDR_TID_ERR:
3035 		default:
3036 			break;
3037 		}
3038 	default:
3039 		break;
3040 	}
3041 
3042 unlock:
3043 	spin_unlock(&qp->s_lock);
3044 r_unlock:
3045 	spin_unlock_irqrestore(&qp->r_lock, flags);
3046 rcu_unlock:
3047 	rcu_read_unlock();
3048 drop:
3049 	return ret;
3050 nak_psn:
3051 	ibp->rvp.n_rc_seqnak++;
3052 	if (!qpriv->s_nak_state) {
3053 		qpriv->s_nak_state = IB_NAK_PSN_ERROR;
3054 		/* We are NAK'ing the next expected PSN */
3055 		qpriv->s_nak_psn = mask_psn(flow->flow_state.r_next_psn);
3056 		qpriv->s_flags |= RVT_S_ACK_PENDING;
3057 		if (qpriv->r_tid_ack == HFI1_QP_WQE_INVALID)
3058 			qpriv->r_tid_ack = qpriv->r_tid_tail;
3059 		hfi1_schedule_tid_send(qp);
3060 	}
3061 	goto unlock;
3062 }
3063 
3064 /*
3065  * "Rewind" the TID request information.
3066  * This means that we reset the state back to ACTIVE,
3067  * find the proper flow, set the flow index to that flow,
3068  * and reset the flow information.
3069  */
3070 void hfi1_tid_rdma_restart_req(struct rvt_qp *qp, struct rvt_swqe *wqe,
3071 			       u32 *bth2)
3072 {
3073 	struct tid_rdma_request *req = wqe_to_tid_req(wqe);
3074 	struct tid_rdma_flow *flow;
3075 	struct hfi1_qp_priv *qpriv = qp->priv;
3076 	int diff, delta_pkts;
3077 	u32 tididx = 0, i;
3078 	u16 fidx;
3079 
3080 	if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) {
3081 		*bth2 = mask_psn(qp->s_psn);
3082 		flow = find_flow_ib(req, *bth2, &fidx);
3083 		if (!flow) {
3084 			trace_hfi1_msg_tid_restart_req(/* msg */
3085 			   qp, "!!!!!! Could not find flow to restart: bth2 ",
3086 			   (u64)*bth2);
3087 			trace_hfi1_tid_req_restart_req(qp, 0, wqe->wr.opcode,
3088 						       wqe->psn, wqe->lpsn,
3089 						       req);
3090 			return;
3091 		}
3092 	} else {
3093 		fidx = req->acked_tail;
3094 		flow = &req->flows[fidx];
3095 		*bth2 = mask_psn(req->r_ack_psn);
3096 	}
3097 
3098 	if (wqe->wr.opcode == IB_WR_TID_RDMA_READ)
3099 		delta_pkts = delta_psn(*bth2, flow->flow_state.ib_spsn);
3100 	else
3101 		delta_pkts = delta_psn(*bth2,
3102 				       full_flow_psn(flow,
3103 						     flow->flow_state.spsn));
3104 
3105 	trace_hfi1_tid_flow_restart_req(qp, fidx, flow);
3106 	diff = delta_pkts + flow->resync_npkts;
3107 
3108 	flow->sent = 0;
3109 	flow->pkt = 0;
3110 	flow->tid_idx = 0;
3111 	flow->tid_offset = 0;
3112 	if (diff) {
3113 		for (tididx = 0; tididx < flow->tidcnt; tididx++) {
3114 			u32 tidentry = flow->tid_entry[tididx], tidlen,
3115 				tidnpkts, npkts;
3116 
3117 			flow->tid_offset = 0;
3118 			tidlen = EXP_TID_GET(tidentry, LEN) * PAGE_SIZE;
3119 			tidnpkts = rvt_div_round_up_mtu(qp, tidlen);
3120 			npkts = min_t(u32, diff, tidnpkts);
3121 			flow->pkt += npkts;
3122 			flow->sent += (npkts == tidnpkts ? tidlen :
3123 				       npkts * qp->pmtu);
3124 			flow->tid_offset += npkts * qp->pmtu;
3125 			diff -= npkts;
3126 			if (!diff)
3127 				break;
3128 		}
3129 	}
3130 	if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE) {
3131 		rvt_skip_sge(&qpriv->tid_ss, (req->cur_seg * req->seg_len) +
3132 			     flow->sent, 0);
3133 		/*
3134 		 * Packet PSN is based on flow_state.spsn + flow->pkt. However,
3135 		 * during a RESYNC, the generation is incremented and the
3136 		 * sequence is reset to 0. Since we've adjusted the npkts in the
3137 		 * flow and the SGE has been sufficiently advanced, we have to
3138 		 * adjust flow->pkt in order to calculate the correct PSN.
3139 		 */
3140 		flow->pkt -= flow->resync_npkts;
3141 	}
3142 
3143 	if (flow->tid_offset ==
3144 	    EXP_TID_GET(flow->tid_entry[tididx], LEN) * PAGE_SIZE) {
3145 		tididx++;
3146 		flow->tid_offset = 0;
3147 	}
3148 	flow->tid_idx = tididx;
3149 	if (wqe->wr.opcode == IB_WR_TID_RDMA_READ)
3150 		/* Move flow_idx to correct index */
3151 		req->flow_idx = fidx;
3152 	else
3153 		req->clear_tail = fidx;
3154 
3155 	trace_hfi1_tid_flow_restart_req(qp, fidx, flow);
3156 	trace_hfi1_tid_req_restart_req(qp, 0, wqe->wr.opcode, wqe->psn,
3157 				       wqe->lpsn, req);
3158 	req->state = TID_REQUEST_ACTIVE;
3159 	if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE) {
3160 		/* Reset all the flows that we are going to resend */
3161 		fidx = CIRC_NEXT(fidx, MAX_FLOWS);
3162 		i = qpriv->s_tid_tail;
3163 		do {
3164 			for (; CIRC_CNT(req->setup_head, fidx, MAX_FLOWS);
3165 			      fidx = CIRC_NEXT(fidx, MAX_FLOWS)) {
3166 				req->flows[fidx].sent = 0;
3167 				req->flows[fidx].pkt = 0;
3168 				req->flows[fidx].tid_idx = 0;
3169 				req->flows[fidx].tid_offset = 0;
3170 				req->flows[fidx].resync_npkts = 0;
3171 			}
3172 			if (i == qpriv->s_tid_cur)
3173 				break;
3174 			do {
3175 				i = (++i == qp->s_size ? 0 : i);
3176 				wqe = rvt_get_swqe_ptr(qp, i);
3177 			} while (wqe->wr.opcode != IB_WR_TID_RDMA_WRITE);
3178 			req = wqe_to_tid_req(wqe);
3179 			req->cur_seg = req->ack_seg;
3180 			fidx = req->acked_tail;
3181 			/* Pull req->clear_tail back */
3182 			req->clear_tail = fidx;
3183 		} while (1);
3184 	}
3185 }
3186 
3187 void hfi1_qp_kern_exp_rcv_clear_all(struct rvt_qp *qp)
3188 {
3189 	int i, ret;
3190 	struct hfi1_qp_priv *qpriv = qp->priv;
3191 	struct tid_flow_state *fs;
3192 
3193 	if (qp->ibqp.qp_type != IB_QPT_RC || !HFI1_CAP_IS_KSET(TID_RDMA))
3194 		return;
3195 
3196 	/*
3197 	 * First, clear the flow to help prevent any delayed packets from
3198 	 * being delivered.
3199 	 */
3200 	fs = &qpriv->flow_state;
3201 	if (fs->index != RXE_NUM_TID_FLOWS)
3202 		hfi1_kern_clear_hw_flow(qpriv->rcd, qp);
3203 
3204 	for (i = qp->s_acked; i != qp->s_head;) {
3205 		struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, i);
3206 
3207 		if (++i == qp->s_size)
3208 			i = 0;
3209 		/* Free only locally allocated TID entries */
3210 		if (wqe->wr.opcode != IB_WR_TID_RDMA_READ)
3211 			continue;
3212 		do {
3213 			struct hfi1_swqe_priv *priv = wqe->priv;
3214 
3215 			ret = hfi1_kern_exp_rcv_clear(&priv->tid_req);
3216 		} while (!ret);
3217 	}
3218 	for (i = qp->s_acked_ack_queue; i != qp->r_head_ack_queue;) {
3219 		struct rvt_ack_entry *e = &qp->s_ack_queue[i];
3220 
3221 		if (++i == rvt_max_atomic(ib_to_rvt(qp->ibqp.device)))
3222 			i = 0;
3223 		/* Free only locally allocated TID entries */
3224 		if (e->opcode != TID_OP(WRITE_REQ))
3225 			continue;
3226 		do {
3227 			struct hfi1_ack_priv *priv = e->priv;
3228 
3229 			ret = hfi1_kern_exp_rcv_clear(&priv->tid_req);
3230 		} while (!ret);
3231 	}
3232 }
3233 
3234 bool hfi1_tid_rdma_wqe_interlock(struct rvt_qp *qp, struct rvt_swqe *wqe)
3235 {
3236 	struct rvt_swqe *prev;
3237 	struct hfi1_qp_priv *priv = qp->priv;
3238 	u32 s_prev;
3239 	struct tid_rdma_request *req;
3240 
3241 	s_prev = (qp->s_cur == 0 ? qp->s_size : qp->s_cur) - 1;
3242 	prev = rvt_get_swqe_ptr(qp, s_prev);
3243 
3244 	switch (wqe->wr.opcode) {
3245 	case IB_WR_SEND:
3246 	case IB_WR_SEND_WITH_IMM:
3247 	case IB_WR_SEND_WITH_INV:
3248 	case IB_WR_ATOMIC_CMP_AND_SWP:
3249 	case IB_WR_ATOMIC_FETCH_AND_ADD:
3250 	case IB_WR_RDMA_WRITE:
3251 		switch (prev->wr.opcode) {
3252 		case IB_WR_TID_RDMA_WRITE:
3253 			req = wqe_to_tid_req(prev);
3254 			if (req->ack_seg != req->total_segs)
3255 				goto interlock;
3256 		default:
3257 			break;
3258 		}
3259 		break;
3260 	case IB_WR_RDMA_READ:
3261 		if (prev->wr.opcode != IB_WR_TID_RDMA_WRITE)
3262 			break;
3263 		/* fall through */
3264 	case IB_WR_TID_RDMA_READ:
3265 		switch (prev->wr.opcode) {
3266 		case IB_WR_RDMA_READ:
3267 			if (qp->s_acked != qp->s_cur)
3268 				goto interlock;
3269 			break;
3270 		case IB_WR_TID_RDMA_WRITE:
3271 			req = wqe_to_tid_req(prev);
3272 			if (req->ack_seg != req->total_segs)
3273 				goto interlock;
3274 		default:
3275 			break;
3276 		}
3277 	default:
3278 		break;
3279 	}
3280 	return false;
3281 
3282 interlock:
3283 	priv->s_flags |= HFI1_S_TID_WAIT_INTERLCK;
3284 	return true;
3285 }
3286 
3287 /* Does @sge meet the alignment requirements for tid rdma? */
3288 static inline bool hfi1_check_sge_align(struct rvt_qp *qp,
3289 					struct rvt_sge *sge, int num_sge)
3290 {
3291 	int i;
3292 
3293 	for (i = 0; i < num_sge; i++, sge++) {
3294 		trace_hfi1_sge_check_align(qp, i, sge);
3295 		if ((u64)sge->vaddr & ~PAGE_MASK ||
3296 		    sge->sge_length & ~PAGE_MASK)
3297 			return false;
3298 	}
3299 	return true;
3300 }
3301 
3302 void setup_tid_rdma_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe)
3303 {
3304 	struct hfi1_qp_priv *qpriv = (struct hfi1_qp_priv *)qp->priv;
3305 	struct hfi1_swqe_priv *priv = wqe->priv;
3306 	struct tid_rdma_params *remote;
3307 	enum ib_wr_opcode new_opcode;
3308 	bool do_tid_rdma = false;
3309 	struct hfi1_pportdata *ppd = qpriv->rcd->ppd;
3310 
3311 	if ((rdma_ah_get_dlid(&qp->remote_ah_attr) & ~((1 << ppd->lmc) - 1)) ==
3312 				ppd->lid)
3313 		return;
3314 	if (qpriv->hdr_type != HFI1_PKT_TYPE_9B)
3315 		return;
3316 
3317 	rcu_read_lock();
3318 	remote = rcu_dereference(qpriv->tid_rdma.remote);
3319 	/*
3320 	 * If TID RDMA is disabled by the negotiation, don't
3321 	 * use it.
3322 	 */
3323 	if (!remote)
3324 		goto exit;
3325 
3326 	if (wqe->wr.opcode == IB_WR_RDMA_READ) {
3327 		if (hfi1_check_sge_align(qp, &wqe->sg_list[0],
3328 					 wqe->wr.num_sge)) {
3329 			new_opcode = IB_WR_TID_RDMA_READ;
3330 			do_tid_rdma = true;
3331 		}
3332 	} else if (wqe->wr.opcode == IB_WR_RDMA_WRITE) {
3333 		/*
3334 		 * TID RDMA is enabled for this RDMA WRITE request iff:
3335 		 *   1. The remote address is page-aligned,
3336 		 *   2. The length is larger than the minimum segment size,
3337 		 *   3. The length is page-multiple.
3338 		 */
3339 		if (!(wqe->rdma_wr.remote_addr & ~PAGE_MASK) &&
3340 		    !(wqe->length & ~PAGE_MASK)) {
3341 			new_opcode = IB_WR_TID_RDMA_WRITE;
3342 			do_tid_rdma = true;
3343 		}
3344 	}
3345 
3346 	if (do_tid_rdma) {
3347 		if (hfi1_kern_exp_rcv_alloc_flows(&priv->tid_req, GFP_ATOMIC))
3348 			goto exit;
3349 		wqe->wr.opcode = new_opcode;
3350 		priv->tid_req.seg_len =
3351 			min_t(u32, remote->max_len, wqe->length);
3352 		priv->tid_req.total_segs =
3353 			DIV_ROUND_UP(wqe->length, priv->tid_req.seg_len);
3354 		/* Compute the last PSN of the request */
3355 		wqe->lpsn = wqe->psn;
3356 		if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) {
3357 			priv->tid_req.n_flows = remote->max_read;
3358 			qpriv->tid_r_reqs++;
3359 			wqe->lpsn += rvt_div_round_up_mtu(qp, wqe->length) - 1;
3360 		} else {
3361 			wqe->lpsn += priv->tid_req.total_segs - 1;
3362 			atomic_inc(&qpriv->n_requests);
3363 		}
3364 
3365 		priv->tid_req.cur_seg = 0;
3366 		priv->tid_req.comp_seg = 0;
3367 		priv->tid_req.ack_seg = 0;
3368 		priv->tid_req.state = TID_REQUEST_INACTIVE;
3369 		/*
3370 		 * Reset acked_tail.
3371 		 * TID RDMA READ does not have ACKs so it does not
3372 		 * update the pointer. We have to reset it so TID RDMA
3373 		 * WRITE does not get confused.
3374 		 */
3375 		priv->tid_req.acked_tail = priv->tid_req.setup_head;
3376 		trace_hfi1_tid_req_setup_tid_wqe(qp, 1, wqe->wr.opcode,
3377 						 wqe->psn, wqe->lpsn,
3378 						 &priv->tid_req);
3379 	}
3380 exit:
3381 	rcu_read_unlock();
3382 }
3383 
3384 /* TID RDMA WRITE functions */
3385 
3386 u32 hfi1_build_tid_rdma_write_req(struct rvt_qp *qp, struct rvt_swqe *wqe,
3387 				  struct ib_other_headers *ohdr,
3388 				  u32 *bth1, u32 *bth2, u32 *len)
3389 {
3390 	struct hfi1_qp_priv *qpriv = qp->priv;
3391 	struct tid_rdma_request *req = wqe_to_tid_req(wqe);
3392 	struct tid_rdma_params *remote;
3393 
3394 	rcu_read_lock();
3395 	remote = rcu_dereference(qpriv->tid_rdma.remote);
3396 	/*
3397 	 * Set the number of flow to be used based on negotiated
3398 	 * parameters.
3399 	 */
3400 	req->n_flows = remote->max_write;
3401 	req->state = TID_REQUEST_ACTIVE;
3402 
3403 	KDETH_RESET(ohdr->u.tid_rdma.w_req.kdeth0, KVER, 0x1);
3404 	KDETH_RESET(ohdr->u.tid_rdma.w_req.kdeth1, JKEY, remote->jkey);
3405 	ohdr->u.tid_rdma.w_req.reth.vaddr =
3406 		cpu_to_be64(wqe->rdma_wr.remote_addr + (wqe->length - *len));
3407 	ohdr->u.tid_rdma.w_req.reth.rkey =
3408 		cpu_to_be32(wqe->rdma_wr.rkey);
3409 	ohdr->u.tid_rdma.w_req.reth.length = cpu_to_be32(*len);
3410 	ohdr->u.tid_rdma.w_req.verbs_qp = cpu_to_be32(qp->remote_qpn);
3411 	*bth1 &= ~RVT_QPN_MASK;
3412 	*bth1 |= remote->qp;
3413 	qp->s_state = TID_OP(WRITE_REQ);
3414 	qp->s_flags |= HFI1_S_WAIT_TID_RESP;
3415 	*bth2 |= IB_BTH_REQ_ACK;
3416 	*len = 0;
3417 
3418 	rcu_read_unlock();
3419 	return sizeof(ohdr->u.tid_rdma.w_req) / sizeof(u32);
3420 }
3421 
3422 void hfi1_compute_tid_rdma_flow_wt(void)
3423 {
3424 	/*
3425 	 * Heuristic for computing the RNR timeout when waiting on the flow
3426 	 * queue. Rather than a computationaly expensive exact estimate of when
3427 	 * a flow will be available, we assume that if a QP is at position N in
3428 	 * the flow queue it has to wait approximately (N + 1) * (number of
3429 	 * segments between two sync points), assuming PMTU of 4K. The rationale
3430 	 * for this is that flows are released and recycled at each sync point.
3431 	 */
3432 	tid_rdma_flow_wt = MAX_TID_FLOW_PSN * enum_to_mtu(OPA_MTU_4096) /
3433 		TID_RDMA_MAX_SEGMENT_SIZE;
3434 }
3435 
3436 static u32 position_in_queue(struct hfi1_qp_priv *qpriv,
3437 			     struct tid_queue *queue)
3438 {
3439 	return qpriv->tid_enqueue - queue->dequeue;
3440 }
3441 
3442 /*
3443  * @qp: points to rvt_qp context.
3444  * @to_seg: desired RNR timeout in segments.
3445  * Return: index of the next highest timeout in the ib_hfi1_rnr_table[]
3446  */
3447 static u32 hfi1_compute_tid_rnr_timeout(struct rvt_qp *qp, u32 to_seg)
3448 {
3449 	struct hfi1_qp_priv *qpriv = qp->priv;
3450 	u64 timeout;
3451 	u32 bytes_per_us;
3452 	u8 i;
3453 
3454 	bytes_per_us = active_egress_rate(qpriv->rcd->ppd) / 8;
3455 	timeout = (to_seg * TID_RDMA_MAX_SEGMENT_SIZE) / bytes_per_us;
3456 	/*
3457 	 * Find the next highest value in the RNR table to the required
3458 	 * timeout. This gives the responder some padding.
3459 	 */
3460 	for (i = 1; i <= IB_AETH_CREDIT_MASK; i++)
3461 		if (rvt_rnr_tbl_to_usec(i) >= timeout)
3462 			return i;
3463 	return 0;
3464 }
3465 
3466 /**
3467  * Central place for resource allocation at TID write responder,
3468  * is called from write_req and write_data interrupt handlers as
3469  * well as the send thread when a queued QP is scheduled for
3470  * resource allocation.
3471  *
3472  * Iterates over (a) segments of a request and then (b) queued requests
3473  * themselves to allocate resources for up to local->max_write
3474  * segments across multiple requests. Stop allocating when we
3475  * hit a sync point, resume allocating after data packets at
3476  * sync point have been received.
3477  *
3478  * Resource allocation and sending of responses is decoupled. The
3479  * request/segment which are being allocated and sent are as follows.
3480  * Resources are allocated for:
3481  *     [request: qpriv->r_tid_alloc, segment: req->alloc_seg]
3482  * The send thread sends:
3483  *     [request: qp->s_tail_ack_queue, segment:req->cur_seg]
 *
 * @qp: the responder QP to allocate resources for
 * @intr_ctx: true when called from a receive interrupt handler; only then
 *            is an RNR NAK scheduled after an -EAGAIN allocation failure
 *
 * Context: qp->s_lock must be held (asserted below). The RNR NAK path
 * additionally asserts that qp->r_lock is held.
3484  */
3485 static void hfi1_tid_write_alloc_resources(struct rvt_qp *qp, bool intr_ctx)
3486 {
3487 	struct tid_rdma_request *req;
3488 	struct hfi1_qp_priv *qpriv = qp->priv;
3489 	struct hfi1_ctxtdata *rcd = qpriv->rcd;
3490 	struct tid_rdma_params *local = &qpriv->tid_rdma.local;
3491 	struct rvt_ack_entry *e;
3492 	u32 npkts, to_seg;
3493 	bool last;
3494 	int ret = 0;
3495 
3496 	lockdep_assert_held(&qp->s_lock);
3497 
3498 	while (1) {
3499 		trace_hfi1_rsp_tid_write_alloc_res(qp, 0);
3500 		trace_hfi1_tid_write_rsp_alloc_res(qp);
3501 		/*
3502 		 * Don't allocate more segments if a RNR NAK has already been
3503 		 * scheduled to avoid messing up qp->r_psn: the RNR NAK will
3504 		 * be sent only when all allocated segments have been sent.
3505 		 * However, if more segments are allocated before that, TID RDMA
3506 		 * WRITE RESP packets will be sent out for these new segments
3507 		 * before the RNR NAK packet. When the requester receives the
3508 		 * RNR NAK packet, it will restart with qp->s_last_psn + 1,
3509 		 * which does not match qp->r_psn and will be dropped.
3510 		 * Consequently, the requester will exhaust its retries and
3511 		 * put the qp into error state.
3512 		 */
3513 		if (qpriv->rnr_nak_state == TID_RNR_NAK_SEND)
3514 			break;
3515 
3516 		/* No requests left to process */
3517 		if (qpriv->r_tid_alloc == qpriv->r_tid_head) {
3518 			/* If all data has been received, clear the flow */
3519 			if (qpriv->flow_state.index < RXE_NUM_TID_FLOWS &&
3520 			    !qpriv->alloc_w_segs)
3521 				hfi1_kern_clear_hw_flow(rcd, qp);
3522 			break;
3523 		}
3524 
3525 		e = &qp->s_ack_queue[qpriv->r_tid_alloc];
		/* Skip ack queue entries that are not TID RDMA WRITE requests */
3526 		if (e->opcode != TID_OP(WRITE_REQ))
3527 			goto next_req;
3528 		req = ack_to_tid_req(e);
3529 		trace_hfi1_tid_req_write_alloc_res(qp, 0, e->opcode, e->psn,
3530 						   e->lpsn, req);
3531 		/* Finished allocating for all segments of this request */
3532 		if (req->alloc_seg >= req->total_segs)
3533 			goto next_req;
3534 
3535 		/* Can allocate only a maximum of local->max_write for a QP */
3536 		if (qpriv->alloc_w_segs >= local->max_write)
3537 			break;
3538 
3539 		/* Don't allocate at a sync point with data packets pending */
3540 		if (qpriv->sync_pt && qpriv->alloc_w_segs)
3541 			break;
3542 
3543 		/* All data received at the sync point, continue */
3544 		if (qpriv->sync_pt && !qpriv->alloc_w_segs) {
3545 			hfi1_kern_clear_hw_flow(rcd, qp);
3546 			qpriv->sync_pt = false;
3547 			if (qpriv->s_flags & HFI1_R_TID_SW_PSN)
3548 				qpriv->s_flags &= ~HFI1_R_TID_SW_PSN;
3549 		}
3550 
3551 		/* Allocate flow if we don't have one */
3552 		if (qpriv->flow_state.index >= RXE_NUM_TID_FLOWS) {
3553 			ret = hfi1_kern_setup_hw_flow(qpriv->rcd, qp);
3554 			if (ret) {
				/* RNR wait scales with our flow queue position */
3555 				to_seg = tid_rdma_flow_wt *
3556 					position_in_queue(qpriv,
3557 							  &rcd->flow_queue);
3558 				break;
3559 			}
3560 		}
3561 
3562 		npkts = rvt_div_round_up_mtu(qp, req->seg_len);
3563 
3564 		/*
3565 		 * We are at a sync point if we run out of KDETH PSN space.
3566 		 * Last PSN of every generation is reserved for RESYNC.
3567 		 */
3568 		if (qpriv->flow_state.psn + npkts > MAX_TID_FLOW_PSN - 1) {
3569 			qpriv->sync_pt = true;
3570 			break;
3571 		}
3572 
3573 		/*
3574 		 * If overtaking req->acked_tail, send an RNR NAK. Because the
3575 		 * QP is not queued in this case, and the issue can only be
3576 		 * caused by a delay in scheduling the second leg which we
3577 		 * cannot estimate, we use a rather arbitrary RNR timeout of
3578 		 * (MAX_FLOWS / 2) segments
3579 		 */
3580 		if (!CIRC_SPACE(req->setup_head, req->acked_tail,
3581 				MAX_FLOWS)) {
3582 			ret = -EAGAIN;
3583 			to_seg = MAX_FLOWS >> 1;
3584 			qpriv->s_flags |= RVT_S_ACK_PENDING;
3585 			hfi1_schedule_tid_send(qp);
3586 			break;
3587 		}
3588 
3589 		/* Try to allocate rcv array / TID entries */
3590 		ret = hfi1_kern_exp_rcv_setup(req, &req->ss, &last);
3591 		if (ret == -EAGAIN)
3592 			to_seg = position_in_queue(qpriv, &rcd->rarr_queue);
3593 		if (ret)
3594 			break;
3595 
		/* One more segment allocated, for the QP and for this request */
3596 		qpriv->alloc_w_segs++;
3597 		req->alloc_seg++;
3598 		continue;
3599 next_req:
3600 		/* Begin processing the next request */
3601 		if (++qpriv->r_tid_alloc >
3602 		    rvt_size_atomic(ib_to_rvt(qp->ibqp.device)))
3603 			qpriv->r_tid_alloc = 0;
3604 	}
3605 
3606 	/*
3607 	 * Schedule an RNR NAK to be sent if (a) flow or rcv array allocation
3608 	 * has failed (b) we are called from the rcv handler interrupt context
3609 	 * (c) an RNR NAK has not already been scheduled
3610 	 */
3611 	if (ret == -EAGAIN && intr_ctx && !qp->r_nak_state)
3612 		goto send_rnr_nak;
3613 
3614 	return;
3615 
3616 send_rnr_nak:
3617 	lockdep_assert_held(&qp->r_lock);
3618 
3619 	/* Set r_nak_state to prevent unrelated events from generating NAK's */
3620 	qp->r_nak_state = hfi1_compute_tid_rnr_timeout(qp, to_seg) | IB_RNR_NAK;
3621 
3622 	/* Pull back r_psn to the segment being RNR NAK'd */
3623 	qp->r_psn = e->psn + req->alloc_seg;
3624 	qp->r_ack_psn = qp->r_psn;
3625 	/*
3626 	 * Pull back r_head_ack_queue to the ack entry following the request
3627 	 * being RNR NAK'd. This allows resources to be allocated to the request
3628 	 * if the queued QP is scheduled.
3629 	 */
3630 	qp->r_head_ack_queue = qpriv->r_tid_alloc + 1;
3631 	if (qp->r_head_ack_queue > rvt_size_atomic(ib_to_rvt(qp->ibqp.device)))
3632 		qp->r_head_ack_queue = 0;
3633 	qpriv->r_tid_head = qp->r_head_ack_queue;
3634 	/*
3635 	 * These send side fields are used in make_rc_ack(). They are set in
3636 	 * hfi1_send_rc_ack() but must be set here before dropping qp->s_lock
3637 	 * for consistency
3638 	 */
3639 	qp->s_nak_state = qp->r_nak_state;
3640 	qp->s_ack_psn = qp->r_ack_psn;
3641 	/*
3642 	 * Clear the ACK PENDING flag to prevent unwanted ACK because we
3643 	 * have modified qp->s_ack_psn here.
3644 	 */
3645 	qp->s_flags &= ~(RVT_S_ACK_PENDING);
3646 
3647 	trace_hfi1_rsp_tid_write_alloc_res(qp, qp->r_psn);
3648 	/*
3649 	 * qpriv->rnr_nak_state is used to determine when the scheduled RNR NAK
3650 	 * has actually been sent. qp->s_flags RVT_S_ACK_PENDING bit cannot be
3651 	 * used for this because qp->s_lock is dropped before calling
3652 	 * hfi1_send_rc_ack() leading to inconsistency between the receive
3653 	 * interrupt handlers and the send thread in make_rc_ack()
3654 	 */
3655 	qpriv->rnr_nak_state = TID_RNR_NAK_SEND;
3656 
3657 	/*
3658 	 * Schedule RNR NAK to be sent. RNR NAK's are scheduled from the receive
3659 	 * interrupt handlers but will be sent from the send engine behind any
3660 	 * previous responses that may have been scheduled
3661 	 */
3662 	rc_defered_ack(rcd, qp);
3663 }
3664 
/**
 * hfi1_rc_rcv_tid_rdma_write_req - responder-side handler for a received
 * TID RDMA WRITE REQ packet
 * @packet: the received packet; packet->qp, packet->rcd and packet->ohdr
 *          are read here
 *
 * Validates the request, records it in the qp->s_ack_queue entry at
 * qp->r_head_ack_queue, advances the head, tries to allocate receive
 * resources for it and schedules the send engine. A request that was
 * previously RNR NAK'd and is now resent re-uses its original entry.
 * Invalid requests and access errors put the QP in error and NAK the packet.
 */
3665 void hfi1_rc_rcv_tid_rdma_write_req(struct hfi1_packet *packet)
3666 {
3667 	/* HANDLER FOR TID RDMA WRITE REQUEST packet (Responder side)*/
3668 
3669 	/*
3670 	 * 1. Verify TID RDMA WRITE REQ as per IB_OPCODE_RC_RDMA_WRITE_FIRST
3671 	 *    (see hfi1_rc_rcv())
3672 	 *     - Don't allow 0-length requests.
3673 	 * 2. Put TID RDMA WRITE REQ into the response queue (s_ack_queue)
3674 	 *     - Setup struct tid_rdma_req with request info
3675 	 *     - Prepare struct tid_rdma_flow array?
3676 	 * 3. Set the qp->s_ack_state as state diagram in design doc.
3677 	 * 4. Set RVT_S_RESP_PENDING in s_flags.
3678 	 * 5. Kick the send engine (hfi1_schedule_send())
3679 	 */
3680 	struct hfi1_ctxtdata *rcd = packet->rcd;
3681 	struct rvt_qp *qp = packet->qp;
3682 	struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
3683 	struct ib_other_headers *ohdr = packet->ohdr;
3684 	struct rvt_ack_entry *e;
3685 	unsigned long flags;
3686 	struct ib_reth *reth;
3687 	struct hfi1_qp_priv *qpriv = qp->priv;
3688 	struct tid_rdma_request *req;
3689 	u32 bth0, psn, len, rkey, num_segs;
3690 	bool is_fecn;
3691 	u8 next;
3692 	u64 vaddr;
3693 	int diff;
3694 
3695 	bth0 = be32_to_cpu(ohdr->bth[0]);
3696 	if (hfi1_ruc_check_hdr(ibp, packet))
3697 		return;
3698 
3699 	is_fecn = process_ecn(qp, packet);
3700 	psn = mask_psn(be32_to_cpu(ohdr->bth[2]));
3701 	trace_hfi1_rsp_rcv_tid_write_req(qp, psn);
3702 
3703 	if (qp->state == IB_QPS_RTR && !(qp->r_flags & RVT_R_COMM_EST))
3704 		rvt_comm_est(qp);
3705 
3706 	if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
3707 		goto nack_inv;
3708 
3709 	reth = &ohdr->u.tid_rdma.w_req.reth;
3710 	vaddr = be64_to_cpu(reth->vaddr);
3711 	len = be32_to_cpu(reth->length);
3712 
	/* Segments are sized by the local max_len; one PSN per segment */
3713 	num_segs = DIV_ROUND_UP(len, qpriv->tid_rdma.local.max_len);
3714 	diff = delta_psn(psn, qp->r_psn);
	/* The PSN is not the one expected next: handle as a receive error */
3715 	if (unlikely(diff)) {
3716 		if (tid_rdma_rcv_error(packet, ohdr, qp, psn, diff))
3717 			return;
3718 		goto send_ack;
3719 	}
3720 
3721 	/*
3722 	 * The resent request which was previously RNR NAK'd is inserted at the
3723 	 * location of the original request, which is one entry behind
3724 	 * r_head_ack_queue
3725 	 */
3726 	if (qpriv->rnr_nak_state)
3727 		qp->r_head_ack_queue = qp->r_head_ack_queue ?
3728 			qp->r_head_ack_queue - 1 :
3729 			rvt_size_atomic(ib_to_rvt(qp->ibqp.device));
3730 
3731 	/* We've verified the request, insert it into the ack queue. */
3732 	next = qp->r_head_ack_queue + 1;
3733 	if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device)))
3734 		next = 0;
3735 	spin_lock_irqsave(&qp->s_lock, flags);
	/* Ack queue full: the oldest entry may be recycled only once sent */
3736 	if (unlikely(next == qp->s_acked_ack_queue)) {
3737 		if (!qp->s_ack_queue[next].sent)
3738 			goto nack_inv_unlock;
3739 		update_ack_queue(qp, next);
3740 	}
3741 	e = &qp->s_ack_queue[qp->r_head_ack_queue];
3742 	req = ack_to_tid_req(e);
3743 
3744 	/* Bring previously RNR NAK'd request back to life */
3745 	if (qpriv->rnr_nak_state) {
3746 		qp->r_nak_state = 0;
3747 		qp->s_nak_state = 0;
3748 		qpriv->rnr_nak_state = TID_RNR_NAK_INIT;
3749 		qp->r_psn = e->lpsn + 1;
3750 		req->state = TID_REQUEST_INIT;
3751 		goto update_head;
3752 	}
3753 
	/* Drop the MR reference left in this entry by its previous use */
3754 	if (e->rdma_sge.mr) {
3755 		rvt_put_mr(e->rdma_sge.mr);
3756 		e->rdma_sge.mr = NULL;
3757 	}
3758 
3759 	/* The length needs to be in multiples of PAGE_SIZE */
3760 	if (!len || len & ~PAGE_MASK)
3761 		goto nack_inv_unlock;
3762 
3763 	rkey = be32_to_cpu(reth->rkey);
3764 	qp->r_len = len;
3765 
	/*
	 * Refuse to overwrite an entry whose previous TID RDMA WRITE request
	 * still has flows that are set up but not yet cleared or acked.
	 */
3766 	if (e->opcode == TID_OP(WRITE_REQ) &&
3767 	    (req->setup_head != req->clear_tail ||
3768 	     req->clear_tail != req->acked_tail))
3769 		goto nack_inv_unlock;
3770 
3771 	if (unlikely(!rvt_rkey_ok(qp, &e->rdma_sge, qp->r_len, vaddr,
3772 				  rkey, IB_ACCESS_REMOTE_WRITE)))
3773 		goto nack_acc;
3774 
	/* r_psn now names the last segment; it is incremented again below */
3775 	qp->r_psn += num_segs - 1;
3776 
3777 	e->opcode = (bth0 >> 24) & 0xff;
3778 	e->psn = psn;
3779 	e->lpsn = qp->r_psn;
3780 	e->sent = 0;
3781 
3782 	req->n_flows = min_t(u16, num_segs, qpriv->tid_rdma.local.max_write);
3783 	req->state = TID_REQUEST_INIT;
3784 	req->cur_seg = 0;
3785 	req->comp_seg = 0;
3786 	req->ack_seg = 0;
3787 	req->alloc_seg = 0;
3788 	req->isge = 0;
3789 	req->seg_len = qpriv->tid_rdma.local.max_len;
3790 	req->total_len = len;
3791 	req->total_segs = num_segs;
3792 	req->r_flow_psn = e->psn;
3793 	req->ss.sge = e->rdma_sge;
3794 	req->ss.num_sge = 1;
3795 
	/* No flows are in use yet: all flow indices start at setup_head */
3796 	req->flow_idx = req->setup_head;
3797 	req->clear_tail = req->setup_head;
3798 	req->acked_tail = req->setup_head;
3799 
3800 	qp->r_state = e->opcode;
3801 	qp->r_nak_state = 0;
3802 	/*
3803 	 * We need to increment the MSN here instead of when we
3804 	 * finish sending the result since a duplicate request would
3805 	 * increment it more than once.
3806 	 */
3807 	qp->r_msn++;
3808 	qp->r_psn++;
3809 
3810 	trace_hfi1_tid_req_rcv_write_req(qp, 0, e->opcode, e->psn, e->lpsn,
3811 					 req);
3812 
	/*
	 * Point r_tid_tail (and r_tid_ack, if it was tracking the tail) at
	 * the new request when no request is tracked yet, or when the entry
	 * at the tail is not a TID RDMA WRITE request or has had all of its
	 * segments completed.
	 */
3813 	if (qpriv->r_tid_tail == HFI1_QP_WQE_INVALID) {
3814 		qpriv->r_tid_tail = qp->r_head_ack_queue;
3815 	} else if (qpriv->r_tid_tail == qpriv->r_tid_head) {
3816 		struct tid_rdma_request *ptr;
3817 
3818 		e = &qp->s_ack_queue[qpriv->r_tid_tail];
3819 		ptr = ack_to_tid_req(e);
3820 
3821 		if (e->opcode != TID_OP(WRITE_REQ) ||
3822 		    ptr->comp_seg == ptr->total_segs) {
3823 			if (qpriv->r_tid_tail == qpriv->r_tid_ack)
3824 				qpriv->r_tid_ack = qp->r_head_ack_queue;
3825 			qpriv->r_tid_tail = qp->r_head_ack_queue;
3826 		}
3827 	}
3828 update_head:
3829 	qp->r_head_ack_queue = next;
3830 	qpriv->r_tid_head = qp->r_head_ack_queue;
3831 
	/* intr_ctx == true: an allocation failure may schedule an RNR NAK */
3832 	hfi1_tid_write_alloc_resources(qp, true);
3833 	trace_hfi1_tid_write_rsp_rcv_req(qp);
3834 
3835 	/* Schedule the send tasklet. */
3836 	qp->s_flags |= RVT_S_RESP_PENDING;
3837 	hfi1_schedule_send(qp);
3838 
3839 	spin_unlock_irqrestore(&qp->s_lock, flags);
3840 	if (is_fecn)
3841 		goto send_ack;
3842 	return;
3843 
3844 nack_inv_unlock:
3845 	spin_unlock_irqrestore(&qp->s_lock, flags);
3846 nack_inv:
	/* Invalid request: error the QP and defer the NAK */
3847 	rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
3848 	qp->r_nak_state = IB_NAK_INVALID_REQUEST;
3849 	qp->r_ack_psn = qp->r_psn;
3850 	/* Queue NAK for later */
3851 	rc_defered_ack(rcd, qp);
3852 	return;
3853 nack_acc:
	/* rkey validation failed: error the QP and NAK immediately */
3854 	spin_unlock_irqrestore(&qp->s_lock, flags);
3855 	rvt_rc_error(qp, IB_WC_LOC_PROT_ERR);
3856 	qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR;
3857 	qp->r_ack_psn = qp->r_psn;
3858 send_ack:
3859 	hfi1_send_rc_ack(packet, is_fecn);
3860 }
3861 
3862 u32 hfi1_build_tid_rdma_write_resp(struct rvt_qp *qp, struct rvt_ack_entry *e,
3863 				   struct ib_other_headers *ohdr, u32 *bth1,
3864 				   u32 bth2, u32 *len,
3865 				   struct rvt_sge_state **ss)
3866 {
3867 	struct hfi1_ack_priv *epriv = e->priv;
3868 	struct tid_rdma_request *req = &epriv->tid_req;
3869 	struct hfi1_qp_priv *qpriv = qp->priv;
3870 	struct tid_rdma_flow *flow = NULL;
3871 	u32 resp_len = 0, hdwords = 0;
3872 	void *resp_addr = NULL;
3873 	struct tid_rdma_params *remote;
3874 
3875 	trace_hfi1_tid_req_build_write_resp(qp, 0, e->opcode, e->psn, e->lpsn,
3876 					    req);
3877 	trace_hfi1_tid_write_rsp_build_resp(qp);
3878 	trace_hfi1_rsp_build_tid_write_resp(qp, bth2);
3879 	flow = &req->flows[req->flow_idx];
3880 	switch (req->state) {
3881 	default:
3882 		/*
3883 		 * Try to allocate resources here in case QP was queued and was
3884 		 * later scheduled when resources became available
3885 		 */
3886 		hfi1_tid_write_alloc_resources(qp, false);
3887 
3888 		/* We've already sent everything which is ready */
3889 		if (req->cur_seg >= req->alloc_seg)
3890 			goto done;
3891 
3892 		/*
3893 		 * Resources can be assigned but responses cannot be sent in
3894 		 * rnr_nak state, till the resent request is received
3895 		 */
3896 		if (qpriv->rnr_nak_state == TID_RNR_NAK_SENT)
3897 			goto done;
3898 
3899 		req->state = TID_REQUEST_ACTIVE;
3900 		trace_hfi1_tid_flow_build_write_resp(qp, req->flow_idx, flow);
3901 		req->flow_idx = CIRC_NEXT(req->flow_idx, MAX_FLOWS);
3902 		hfi1_add_tid_reap_timer(qp);
3903 		break;
3904 
3905 	case TID_REQUEST_RESEND_ACTIVE:
3906 	case TID_REQUEST_RESEND:
3907 		trace_hfi1_tid_flow_build_write_resp(qp, req->flow_idx, flow);
3908 		req->flow_idx = CIRC_NEXT(req->flow_idx, MAX_FLOWS);
3909 		if (!CIRC_CNT(req->setup_head, req->flow_idx, MAX_FLOWS))
3910 			req->state = TID_REQUEST_ACTIVE;
3911 
3912 		hfi1_mod_tid_reap_timer(qp);
3913 		break;
3914 	}
3915 	flow->flow_state.resp_ib_psn = bth2;
3916 	resp_addr = (void *)flow->tid_entry;
3917 	resp_len = sizeof(*flow->tid_entry) * flow->tidcnt;
3918 	req->cur_seg++;
3919 
3920 	memset(&ohdr->u.tid_rdma.w_rsp, 0, sizeof(ohdr->u.tid_rdma.w_rsp));
3921 	epriv->ss.sge.vaddr = resp_addr;
3922 	epriv->ss.sge.sge_length = resp_len;
3923 	epriv->ss.sge.length = epriv->ss.sge.sge_length;
3924 	/*
3925 	 * We can safely zero these out. Since the first SGE covers the
3926 	 * entire packet, nothing else should even look at the MR.
3927 	 */
3928 	epriv->ss.sge.mr = NULL;
3929 	epriv->ss.sge.m = 0;
3930 	epriv->ss.sge.n = 0;
3931 
3932 	epriv->ss.sg_list = NULL;
3933 	epriv->ss.total_len = epriv->ss.sge.sge_length;
3934 	epriv->ss.num_sge = 1;
3935 
3936 	*ss = &epriv->ss;
3937 	*len = epriv->ss.total_len;
3938 
3939 	/* Construct the TID RDMA WRITE RESP packet header */
3940 	rcu_read_lock();
3941 	remote = rcu_dereference(qpriv->tid_rdma.remote);
3942 
3943 	KDETH_RESET(ohdr->u.tid_rdma.w_rsp.kdeth0, KVER, 0x1);
3944 	KDETH_RESET(ohdr->u.tid_rdma.w_rsp.kdeth1, JKEY, remote->jkey);
3945 	ohdr->u.tid_rdma.w_rsp.aeth = rvt_compute_aeth(qp);
3946 	ohdr->u.tid_rdma.w_rsp.tid_flow_psn =
3947 		cpu_to_be32((flow->flow_state.generation <<
3948 			     HFI1_KDETH_BTH_SEQ_SHIFT) |
3949 			    (flow->flow_state.spsn &
3950 			     HFI1_KDETH_BTH_SEQ_MASK));
3951 	ohdr->u.tid_rdma.w_rsp.tid_flow_qp =
3952 		cpu_to_be32(qpriv->tid_rdma.local.qp |
3953 			    ((flow->idx & TID_RDMA_DESTQP_FLOW_MASK) <<
3954 			     TID_RDMA_DESTQP_FLOW_SHIFT) |
3955 			    qpriv->rcd->ctxt);
3956 	ohdr->u.tid_rdma.w_rsp.verbs_qp = cpu_to_be32(qp->remote_qpn);
3957 	*bth1 = remote->qp;
3958 	rcu_read_unlock();
3959 	hdwords = sizeof(ohdr->u.tid_rdma.w_rsp) / sizeof(u32);
3960 	qpriv->pending_tid_w_segs++;
3961 done:
3962 	return hdwords;
3963 }
3964 
3965 static void hfi1_add_tid_reap_timer(struct rvt_qp *qp)
3966 {
3967 	struct hfi1_qp_priv *qpriv = qp->priv;
3968 
3969 	lockdep_assert_held(&qp->s_lock);
3970 	if (!(qpriv->s_flags & HFI1_R_TID_RSC_TIMER)) {
3971 		qpriv->s_flags |= HFI1_R_TID_RSC_TIMER;
3972 		qpriv->s_tid_timer.expires = jiffies +
3973 			qpriv->tid_timer_timeout_jiffies;
3974 		add_timer(&qpriv->s_tid_timer);
3975 	}
3976 }
3977 
3978 static void hfi1_mod_tid_reap_timer(struct rvt_qp *qp)
3979 {
3980 	struct hfi1_qp_priv *qpriv = qp->priv;
3981 
3982 	lockdep_assert_held(&qp->s_lock);
3983 	qpriv->s_flags |= HFI1_R_TID_RSC_TIMER;
3984 	mod_timer(&qpriv->s_tid_timer, jiffies +
3985 		  qpriv->tid_timer_timeout_jiffies);
3986 }
3987 
3988 static int hfi1_stop_tid_reap_timer(struct rvt_qp *qp)
3989 {
3990 	struct hfi1_qp_priv *qpriv = qp->priv;
3991 	int rval = 0;
3992 
3993 	lockdep_assert_held(&qp->s_lock);
3994 	if (qpriv->s_flags & HFI1_R_TID_RSC_TIMER) {
3995 		rval = del_timer(&qpriv->s_tid_timer);
3996 		qpriv->s_flags &= ~HFI1_R_TID_RSC_TIMER;
3997 	}
3998 	return rval;
3999 }
4000 
4001 void hfi1_del_tid_reap_timer(struct rvt_qp *qp)
4002 {
4003 	struct hfi1_qp_priv *qpriv = qp->priv;
4004 
4005 	del_timer_sync(&qpriv->s_tid_timer);
4006 	qpriv->s_flags &= ~HFI1_R_TID_RSC_TIMER;
4007 }
4008 
4009 static void hfi1_tid_timeout(struct timer_list *t)
4010 {
4011 	struct hfi1_qp_priv *qpriv = from_timer(qpriv, t, s_tid_timer);
4012 	struct rvt_qp *qp = qpriv->owner;
4013 	struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device);
4014 	unsigned long flags;
4015 	u32 i;
4016 
4017 	spin_lock_irqsave(&qp->r_lock, flags);
4018 	spin_lock(&qp->s_lock);
4019 	if (qpriv->s_flags & HFI1_R_TID_RSC_TIMER) {
4020 		dd_dev_warn(dd_from_ibdev(qp->ibqp.device), "[QP%u] %s %d\n",
4021 			    qp->ibqp.qp_num, __func__, __LINE__);
4022 		trace_hfi1_msg_tid_timeout(/* msg */
4023 			qp, "resource timeout = ",
4024 			(u64)qpriv->tid_timer_timeout_jiffies);
4025 		hfi1_stop_tid_reap_timer(qp);
4026 		/*
4027 		 * Go though the entire ack queue and clear any outstanding
4028 		 * HW flow and RcvArray resources.
4029 		 */
4030 		hfi1_kern_clear_hw_flow(qpriv->rcd, qp);
4031 		for (i = 0; i < rvt_max_atomic(rdi); i++) {
4032 			struct tid_rdma_request *req =
4033 				ack_to_tid_req(&qp->s_ack_queue[i]);
4034 
4035 			hfi1_kern_exp_rcv_clear_all(req);
4036 		}
4037 		spin_unlock(&qp->s_lock);
4038 		if (qp->ibqp.event_handler) {
4039 			struct ib_event ev;
4040 
4041 			ev.device = qp->ibqp.device;
4042 			ev.element.qp = &qp->ibqp;
4043 			ev.event = IB_EVENT_QP_FATAL;
4044 			qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
4045 		}
4046 		rvt_rc_error(qp, IB_WC_RESP_TIMEOUT_ERR);
4047 		goto unlock_r_lock;
4048 	}
4049 	spin_unlock(&qp->s_lock);
4050 unlock_r_lock:
4051 	spin_unlock_irqrestore(&qp->r_lock, flags);
4052 }
4053 
/**
 * hfi1_rc_rcv_tid_rdma_write_resp - handle a received TID RDMA WRITE RESP
 * @packet: the received packet
 *
 * Requester-side handler. Under qp->s_lock it validates the response PSN,
 * runs the packet through do_rc_ack(), copies the TID entries carried in the
 * payload into the flow at req->setup_head and schedules the TID send
 * engine. A payload that does not fit in flow->tid_entry, contains a TID
 * entry with a zero LEN field, or does not cover flow->length results in
 * rvt_error_qp() being called with IB_WC_LOC_LEN_ERR. If process_ecn()
 * reported FECN, an RC ACK is sent after the lock is dropped.
 */
void hfi1_rc_rcv_tid_rdma_write_resp(struct hfi1_packet *packet)
{
	/* HANDLER FOR TID RDMA WRITE RESPONSE packet (Requester side) */

	/*
	 * 1. Find matching SWQE
	 * 2. Check that TIDENTRY array has enough space for a complete
	 *    segment. If not, put QP in error state.
	 * 3. Save response data in struct tid_rdma_req and struct tid_rdma_flow
	 * 4. Remove HFI1_S_WAIT_TID_RESP from s_flags.
	 * 5. Set qp->s_state
	 * 6. Kick the send engine (hfi1_schedule_send())
	 */
	struct ib_other_headers *ohdr = packet->ohdr;
	struct rvt_qp *qp = packet->qp;
	struct hfi1_qp_priv *qpriv = qp->priv;
	struct hfi1_ctxtdata *rcd = packet->rcd;
	struct rvt_swqe *wqe;
	struct tid_rdma_request *req;
	struct tid_rdma_flow *flow;
	enum ib_wc_status status;
	u32 opcode, aeth, psn, flow_psn, i, tidlen = 0, pktlen;
	bool is_fecn;
	unsigned long flags;

	is_fecn = process_ecn(qp, packet);
	psn = mask_psn(be32_to_cpu(ohdr->bth[2]));
	aeth = be32_to_cpu(ohdr->u.tid_rdma.w_rsp.aeth);
	/* The opcode is the top byte of BTH word 0 */
	opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff;

	spin_lock_irqsave(&qp->s_lock, flags);

	/* Ignore invalid responses */
	if (cmp_psn(psn, qp->s_next_psn) >= 0)
		goto ack_done;

	/* Ignore duplicate responses. */
	if (unlikely(cmp_psn(psn, qp->s_last_psn) <= 0))
		goto ack_done;

	/* Nothing to do when s_acked has caught up with s_tail */
	if (unlikely(qp->s_acked == qp->s_tail))
		goto ack_done;

	/*
	 * If we are waiting for a particular packet sequence number
	 * due to a request being resent, check for it. Otherwise,
	 * ensure that we haven't missed anything.
	 */
	if (qp->r_flags & RVT_R_RDMAR_SEQ) {
		if (cmp_psn(psn, qp->s_last_psn + 1) != 0)
			goto ack_done;
		qp->r_flags &= ~RVT_R_RDMAR_SEQ;
	}

	/* The response must belong to the TID RDMA WRITE at s_tid_cur */
	wqe = rvt_get_swqe_ptr(qp, qpriv->s_tid_cur);
	if (unlikely(wqe->wr.opcode != IB_WR_TID_RDMA_WRITE))
		goto ack_op_err;

	req = wqe_to_tid_req(wqe);
	/*
	 * If we've lost ACKs and our acked_tail pointer is too far
	 * behind, don't overwrite segments. Just drop the packet and
	 * let the reliability protocol take care of it.
	 */
	if (!CIRC_SPACE(req->setup_head, req->acked_tail, MAX_FLOWS))
		goto ack_done;

	/*
	 * The call to do_rc_ack() should be last in the chain of
	 * packet checks because it will end up updating the QP state.
	 * Therefore, anything that would prevent the packet from
	 * being accepted as a successful response should be prior
	 * to it.
	 */
	if (!do_rc_ack(qp, aeth, psn, opcode, 0, rcd))
		goto ack_done;

	trace_hfi1_ack(qp, psn);

	/* Reset the flow at setup_head and fill it from the response header */
	flow = &req->flows[req->setup_head];
	flow->pkt = 0;
	flow->tid_idx = 0;
	flow->tid_offset = 0;
	flow->sent = 0;
	flow->resync_npkts = 0;
	flow->tid_qpn = be32_to_cpu(ohdr->u.tid_rdma.w_rsp.tid_flow_qp);
	flow->idx = (flow->tid_qpn >> TID_RDMA_DESTQP_FLOW_SHIFT) &
		TID_RDMA_DESTQP_FLOW_MASK;
	/* tid_flow_psn carries the generation above the starting sequence */
	flow_psn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.w_rsp.tid_flow_psn));
	flow->flow_state.generation = flow_psn >> HFI1_KDETH_BTH_SEQ_SHIFT;
	flow->flow_state.spsn = flow_psn & HFI1_KDETH_BTH_SEQ_MASK;
	flow->flow_state.resp_ib_psn = psn;
	/* This segment is seg_len bytes, or whatever is left of the WQE */
	flow->length = min_t(u32, req->seg_len,
			     (wqe->length - (req->comp_seg * req->seg_len)));

	flow->npkts = rvt_div_round_up_mtu(qp, flow->length);
	flow->flow_state.lpsn = flow->flow_state.spsn +
		flow->npkts - 1;
	/* payload length = packet length - (header length + ICRC length) */
	pktlen = packet->tlen - (packet->hlen + 4);
	if (pktlen > sizeof(flow->tid_entry)) {
		status = IB_WC_LOC_LEN_ERR;
		goto ack_err;
	}
	memcpy(flow->tid_entry, packet->ebuf, pktlen);
	flow->tidcnt = pktlen / sizeof(*flow->tid_entry);
	trace_hfi1_tid_flow_rcv_write_resp(qp, req->setup_head, flow);

	/*
	 * NOTE(review): comp_seg is incremented before the TID entry checks
	 * below, so it stays incremented when those checks fail.
	 */
	req->comp_seg++;
	trace_hfi1_tid_write_sender_rcv_resp(qp, 0);
	/*
	 * Walk the TID_ENTRY list to make sure we have enough space for a
	 * complete segment.
	 */
	for (i = 0; i < flow->tidcnt; i++) {
		trace_hfi1_tid_entry_rcv_write_resp(/* entry */
			qp, i, flow->tid_entry[i]);
		if (!EXP_TID_GET(flow->tid_entry[i], LEN)) {
			status = IB_WC_LOC_LEN_ERR;
			goto ack_err;
		}
		tidlen += EXP_TID_GET(flow->tid_entry[i], LEN);
	}
	/* tidlen is a sum of LEN fields, scaled by PAGE_SIZE to get bytes */
	if (tidlen * PAGE_SIZE < flow->length) {
		status = IB_WC_LOC_LEN_ERR;
		goto ack_err;
	}

	trace_hfi1_tid_req_rcv_write_resp(qp, 0, wqe->wr.opcode, wqe->psn,
					  wqe->lpsn, req);
	/*
	 * If this is the first response for this request, set the initial
	 * flow index to the current flow.
	 */
	if (!cmp_psn(psn, wqe->psn)) {
		req->r_last_acked = mask_psn(wqe->psn - 1);
		/* Set acked flow index to head index */
		req->acked_tail = req->setup_head;
	}

	/* advance circular buffer head */
	req->setup_head = CIRC_NEXT(req->setup_head, MAX_FLOWS);
	req->state = TID_REQUEST_ACTIVE;

	/*
	 * If all responses for this TID RDMA WRITE request have been received
	 * advance the pointer to the next one.
	 * Since TID RDMA requests could be mixed in with regular IB requests,
	 * they might not appear sequentially in the queue. Therefore, the
	 * next request needs to be "found".
	 */
	if (qpriv->s_tid_cur != qpriv->s_tid_head &&
	    req->comp_seg == req->total_segs) {
		/* Scan forward, wrapping at s_size, stopping at s_tid_head */
		for (i = qpriv->s_tid_cur + 1; ; i++) {
			if (i == qp->s_size)
				i = 0;
			wqe = rvt_get_swqe_ptr(qp, i);
			if (i == qpriv->s_tid_head)
				break;
			if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE)
				break;
		}
		qpriv->s_tid_cur = i;
	}
	qp->s_flags &= ~HFI1_S_WAIT_TID_RESP;

	hfi1_schedule_tid_send(qp);
	goto ack_done;

ack_op_err:
	/* Response arrived for a WQE that is not a TID RDMA WRITE */
	status = IB_WC_LOC_QP_OP_ERR;
ack_err:
	rvt_error_qp(qp, status);
ack_done:
	spin_unlock_irqrestore(&qp->s_lock, flags);
	if (is_fecn)
		hfi1_send_rc_ack(packet, is_fecn);
}
4232 
4233 bool hfi1_build_tid_rdma_packet(struct rvt_swqe *wqe,
4234 				struct ib_other_headers *ohdr,
4235 				u32 *bth1, u32 *bth2, u32 *len)
4236 {
4237 	struct tid_rdma_request *req = wqe_to_tid_req(wqe);
4238 	struct tid_rdma_flow *flow = &req->flows[req->clear_tail];
4239 	struct tid_rdma_params *remote;
4240 	struct rvt_qp *qp = req->qp;
4241 	struct hfi1_qp_priv *qpriv = qp->priv;
4242 	u32 tidentry = flow->tid_entry[flow->tid_idx];
4243 	u32 tidlen = EXP_TID_GET(tidentry, LEN) << PAGE_SHIFT;
4244 	struct tid_rdma_write_data *wd = &ohdr->u.tid_rdma.w_data;
4245 	u32 next_offset, om = KDETH_OM_LARGE;
4246 	bool last_pkt;
4247 
4248 	if (!tidlen) {
4249 		hfi1_trdma_send_complete(qp, wqe, IB_WC_REM_INV_RD_REQ_ERR);
4250 		rvt_error_qp(qp, IB_WC_REM_INV_RD_REQ_ERR);
4251 	}
4252 
4253 	*len = min_t(u32, qp->pmtu, tidlen - flow->tid_offset);
4254 	flow->sent += *len;
4255 	next_offset = flow->tid_offset + *len;
4256 	last_pkt = (flow->tid_idx == (flow->tidcnt - 1) &&
4257 		    next_offset >= tidlen) || (flow->sent >= flow->length);
4258 	trace_hfi1_tid_entry_build_write_data(qp, flow->tid_idx, tidentry);
4259 	trace_hfi1_tid_flow_build_write_data(qp, req->clear_tail, flow);
4260 
4261 	rcu_read_lock();
4262 	remote = rcu_dereference(qpriv->tid_rdma.remote);
4263 	KDETH_RESET(wd->kdeth0, KVER, 0x1);
4264 	KDETH_SET(wd->kdeth0, SH, !last_pkt);
4265 	KDETH_SET(wd->kdeth0, INTR, !!(!last_pkt && remote->urg));
4266 	KDETH_SET(wd->kdeth0, TIDCTRL, EXP_TID_GET(tidentry, CTRL));
4267 	KDETH_SET(wd->kdeth0, TID, EXP_TID_GET(tidentry, IDX));
4268 	KDETH_SET(wd->kdeth0, OM, om == KDETH_OM_LARGE);
4269 	KDETH_SET(wd->kdeth0, OFFSET, flow->tid_offset / om);
4270 	KDETH_RESET(wd->kdeth1, JKEY, remote->jkey);
4271 	wd->verbs_qp = cpu_to_be32(qp->remote_qpn);
4272 	rcu_read_unlock();
4273 
4274 	*bth1 = flow->tid_qpn;
4275 	*bth2 = mask_psn(((flow->flow_state.spsn + flow->pkt++) &
4276 			 HFI1_KDETH_BTH_SEQ_MASK) |
4277 			 (flow->flow_state.generation <<
4278 			  HFI1_KDETH_BTH_SEQ_SHIFT));
4279 	if (last_pkt) {
4280 		/* PSNs are zero-based, so +1 to count number of packets */
4281 		if (flow->flow_state.lpsn + 1 +
4282 		    rvt_div_round_up_mtu(qp, req->seg_len) >
4283 		    MAX_TID_FLOW_PSN)
4284 			req->state = TID_REQUEST_SYNC;
4285 		*bth2 |= IB_BTH_REQ_ACK;
4286 	}
4287 
4288 	if (next_offset >= tidlen) {
4289 		flow->tid_offset = 0;
4290 		flow->tid_idx++;
4291 	} else {
4292 		flow->tid_offset = next_offset;
4293 	}
4294 	return last_pkt;
4295 }
4296 
4297 void hfi1_rc_rcv_tid_rdma_write_data(struct hfi1_packet *packet)
4298 {
4299 	struct rvt_qp *qp = packet->qp;
4300 	struct hfi1_qp_priv *priv = qp->priv;
4301 	struct hfi1_ctxtdata *rcd = priv->rcd;
4302 	struct ib_other_headers *ohdr = packet->ohdr;
4303 	struct rvt_ack_entry *e;
4304 	struct tid_rdma_request *req;
4305 	struct tid_rdma_flow *flow;
4306 	struct hfi1_ibdev *dev = to_idev(qp->ibqp.device);
4307 	unsigned long flags;
4308 	u32 psn, next;
4309 	u8 opcode;
4310 
4311 	psn = mask_psn(be32_to_cpu(ohdr->bth[2]));
4312 	opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff;
4313 
4314 	/*
4315 	 * All error handling should be done by now. If we are here, the packet
4316 	 * is either good or been accepted by the error handler.
4317 	 */
4318 	spin_lock_irqsave(&qp->s_lock, flags);
4319 	e = &qp->s_ack_queue[priv->r_tid_tail];
4320 	req = ack_to_tid_req(e);
4321 	flow = &req->flows[req->clear_tail];
4322 	if (cmp_psn(psn, full_flow_psn(flow, flow->flow_state.lpsn))) {
4323 		if (cmp_psn(psn, flow->flow_state.r_next_psn))
4324 			goto send_nak;
4325 		flow->flow_state.r_next_psn++;
4326 		goto exit;
4327 	}
4328 	flow->flow_state.r_next_psn = mask_psn(psn + 1);
4329 	hfi1_kern_exp_rcv_clear(req);
4330 	priv->alloc_w_segs--;
4331 	rcd->flows[flow->idx].psn = psn & HFI1_KDETH_BTH_SEQ_MASK;
4332 	req->comp_seg++;
4333 	priv->s_nak_state = 0;
4334 
4335 	/*
4336 	 * Release the flow if one of the following conditions has been met:
4337 	 *  - The request has reached a sync point AND all outstanding
4338 	 *    segments have been completed, or
4339 	 *  - The entire request is complete and there are no more requests
4340 	 *    (of any kind) in the queue.
4341 	 */
4342 	trace_hfi1_rsp_rcv_tid_write_data(qp, psn);
4343 	trace_hfi1_tid_req_rcv_write_data(qp, 0, e->opcode, e->psn, e->lpsn,
4344 					  req);
4345 	trace_hfi1_tid_write_rsp_rcv_data(qp);
4346 	if (priv->r_tid_ack == HFI1_QP_WQE_INVALID)
4347 		priv->r_tid_ack = priv->r_tid_tail;
4348 
4349 	if (opcode == TID_OP(WRITE_DATA_LAST)) {
4350 		for (next = priv->r_tid_tail + 1; ; next++) {
4351 			if (next > rvt_size_atomic(&dev->rdi))
4352 				next = 0;
4353 			if (next == priv->r_tid_head)
4354 				break;
4355 			e = &qp->s_ack_queue[next];
4356 			if (e->opcode == TID_OP(WRITE_REQ))
4357 				break;
4358 		}
4359 		priv->r_tid_tail = next;
4360 		if (++qp->s_acked_ack_queue > rvt_size_atomic(&dev->rdi))
4361 			qp->s_acked_ack_queue = 0;
4362 	}
4363 
4364 	hfi1_tid_write_alloc_resources(qp, true);
4365 
4366 	/*
4367 	 * If we need to generate more responses, schedule the
4368 	 * send engine.
4369 	 */
4370 	if (req->cur_seg < req->total_segs ||
4371 	    qp->s_tail_ack_queue != qp->r_head_ack_queue) {
4372 		qp->s_flags |= RVT_S_RESP_PENDING;
4373 		hfi1_schedule_send(qp);
4374 	}
4375 
4376 	priv->pending_tid_w_segs--;
4377 	if (priv->s_flags & HFI1_R_TID_RSC_TIMER) {
4378 		if (priv->pending_tid_w_segs)
4379 			hfi1_mod_tid_reap_timer(req->qp);
4380 		else
4381 			hfi1_stop_tid_reap_timer(req->qp);
4382 	}
4383 
4384 done:
4385 	priv->s_flags |= RVT_S_ACK_PENDING;
4386 	hfi1_schedule_tid_send(qp);
4387 exit:
4388 	priv->r_next_psn_kdeth = flow->flow_state.r_next_psn;
4389 	spin_unlock_irqrestore(&qp->s_lock, flags);
4390 	return;
4391 
4392 send_nak:
4393 	if (!priv->s_nak_state) {
4394 		priv->s_nak_state = IB_NAK_PSN_ERROR;
4395 		priv->s_nak_psn = flow->flow_state.r_next_psn;
4396 		priv->s_flags |= RVT_S_ACK_PENDING;
4397 		if (priv->r_tid_ack == HFI1_QP_WQE_INVALID)
4398 			priv->r_tid_ack = priv->r_tid_tail;
4399 		hfi1_schedule_tid_send(qp);
4400 	}
4401 	goto done;
4402 }
4403 
4404 static bool hfi1_tid_rdma_is_resync_psn(u32 psn)
4405 {
4406 	return (bool)((psn & HFI1_KDETH_BTH_SEQ_MASK) ==
4407 		      HFI1_KDETH_BTH_SEQ_MASK);
4408 }
4409 
4410 u32 hfi1_build_tid_rdma_write_ack(struct rvt_qp *qp, struct rvt_ack_entry *e,
4411 				  struct ib_other_headers *ohdr, u16 iflow,
4412 				  u32 *bth1, u32 *bth2)
4413 {
4414 	struct hfi1_qp_priv *qpriv = qp->priv;
4415 	struct tid_flow_state *fs = &qpriv->flow_state;
4416 	struct tid_rdma_request *req = ack_to_tid_req(e);
4417 	struct tid_rdma_flow *flow = &req->flows[iflow];
4418 	struct tid_rdma_params *remote;
4419 
4420 	rcu_read_lock();
4421 	remote = rcu_dereference(qpriv->tid_rdma.remote);
4422 	KDETH_RESET(ohdr->u.tid_rdma.ack.kdeth1, JKEY, remote->jkey);
4423 	ohdr->u.tid_rdma.ack.verbs_qp = cpu_to_be32(qp->remote_qpn);
4424 	*bth1 = remote->qp;
4425 	rcu_read_unlock();
4426 
4427 	if (qpriv->resync) {
4428 		*bth2 = mask_psn((fs->generation <<
4429 				  HFI1_KDETH_BTH_SEQ_SHIFT) - 1);
4430 		ohdr->u.tid_rdma.ack.aeth = rvt_compute_aeth(qp);
4431 	} else if (qpriv->s_nak_state) {
4432 		*bth2 = mask_psn(qpriv->s_nak_psn);
4433 		ohdr->u.tid_rdma.ack.aeth =
4434 			cpu_to_be32((qp->r_msn & IB_MSN_MASK) |
4435 				    (qpriv->s_nak_state <<
4436 				     IB_AETH_CREDIT_SHIFT));
4437 	} else {
4438 		*bth2 = full_flow_psn(flow, flow->flow_state.lpsn);
4439 		ohdr->u.tid_rdma.ack.aeth = rvt_compute_aeth(qp);
4440 	}
4441 	KDETH_RESET(ohdr->u.tid_rdma.ack.kdeth0, KVER, 0x1);
4442 	ohdr->u.tid_rdma.ack.tid_flow_qp =
4443 		cpu_to_be32(qpriv->tid_rdma.local.qp |
4444 			    ((flow->idx & TID_RDMA_DESTQP_FLOW_MASK) <<
4445 			     TID_RDMA_DESTQP_FLOW_SHIFT) |
4446 			    qpriv->rcd->ctxt);
4447 
4448 	ohdr->u.tid_rdma.ack.tid_flow_psn = 0;
4449 	ohdr->u.tid_rdma.ack.verbs_psn =
4450 		cpu_to_be32(flow->flow_state.resp_ib_psn);
4451 
4452 	if (qpriv->resync) {
4453 		/*
4454 		 * If the PSN before the current expect KDETH PSN is the
4455 		 * RESYNC PSN, then we never received a good TID RDMA WRITE
4456 		 * DATA packet after a previous RESYNC.
4457 		 * In this case, the next expected KDETH PSN stays the same.
4458 		 */
4459 		if (hfi1_tid_rdma_is_resync_psn(qpriv->r_next_psn_kdeth - 1)) {
4460 			ohdr->u.tid_rdma.ack.tid_flow_psn =
4461 				cpu_to_be32(qpriv->r_next_psn_kdeth_save);
4462 		} else {
4463 			/*
4464 			 * Because the KDETH PSNs jump during a RESYNC, it's
4465 			 * not possible to infer (or compute) the previous value
4466 			 * of r_next_psn_kdeth in the case of back-to-back
4467 			 * RESYNC packets. Therefore, we save it.
4468 			 */
4469 			qpriv->r_next_psn_kdeth_save =
4470 				qpriv->r_next_psn_kdeth - 1;
4471 			ohdr->u.tid_rdma.ack.tid_flow_psn =
4472 				cpu_to_be32(qpriv->r_next_psn_kdeth_save);
4473 			qpriv->r_next_psn_kdeth = mask_psn(*bth2 + 1);
4474 		}
4475 		qpriv->resync = false;
4476 	}
4477 
4478 	return sizeof(ohdr->u.tid_rdma.ack) / sizeof(u32);
4479 }
4480 
/**
 * hfi1_rc_rcv_tid_rdma_ack - handle a received TID RDMA ACK packet
 * @packet: the received packet
 *
 * Requester-side handler, run under qp->s_lock. It first advances
 * req->ack_seg and req->acked_tail over every flow whose last KDETH PSN is
 * covered by the ACK, completing the WQE through do_rc_completion() once all
 * of its segments are acked. It then acts on the AETH type (aeth >> 29):
 *  - 0: a regular ACK adjusts the TID retry timer; an ACK to a RESYNC
 *       renumbers the KDETH PSN ranges of all flows not yet acked and
 *       restarts sending from the last acked segment;
 *  - 3: a NAK with a PSN sequence error restarts sending from the last
 *       acked segment.
 * Packets whose PSN does not match s_resync_psn while HFI1_S_WAIT_HALT is
 * set, ACKs for a WQE that is not a TID RDMA WRITE, and stale ACKs are
 * dropped.
 */
void hfi1_rc_rcv_tid_rdma_ack(struct hfi1_packet *packet)
{
	struct ib_other_headers *ohdr = packet->ohdr;
	struct rvt_qp *qp = packet->qp;
	struct hfi1_qp_priv *qpriv = qp->priv;
	struct rvt_swqe *wqe;
	struct tid_rdma_request *req;
	struct tid_rdma_flow *flow;
	/* NOTE(review): fspsn is assigned in the NAK path but never read */
	u32 aeth, psn, req_psn, ack_psn, fspsn, resync_psn, ack_kpsn;
	bool is_fecn;
	unsigned long flags;
	u16 fidx;

	trace_hfi1_tid_write_sender_rcv_tid_ack(qp, 0);
	/* NOTE(review): is_fecn is stored but not used further in this function */
	is_fecn = process_ecn(qp, packet);
	psn = mask_psn(be32_to_cpu(ohdr->bth[2]));
	aeth = be32_to_cpu(ohdr->u.tid_rdma.ack.aeth);
	req_psn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.ack.verbs_psn));
	resync_psn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.ack.tid_flow_psn));

	spin_lock_irqsave(&qp->s_lock, flags);
	trace_hfi1_rcv_tid_ack(qp, aeth, psn, req_psn, resync_psn);

	/* If we are waiting for an ACK to RESYNC, drop any other packets */
	if ((qp->s_flags & HFI1_S_WAIT_HALT) &&
	    cmp_psn(psn, qpriv->s_resync_psn))
		goto ack_op_err;

	/*
	 * ack_psn is the IB PSN and ack_kpsn the KDETH PSN being
	 * acknowledged. For an ACK to a RESYNC the KDETH PSN is taken from
	 * the tid_flow_psn header field rather than from the BTH.
	 */
	ack_psn = req_psn;
	if (hfi1_tid_rdma_is_resync_psn(psn))
		ack_kpsn = resync_psn;
	else
		ack_kpsn = psn;
	/*
	 * A non-zero AETH type is not a plain ACK; presumably the PSNs then
	 * name the first packet NOT received, so step back by one.
	 */
	if (aeth >> 29) {
		ack_psn--;
		ack_kpsn--;
	}

	wqe = rvt_get_swqe_ptr(qp, qp->s_acked);

	if (wqe->wr.opcode != IB_WR_TID_RDMA_WRITE)
		goto ack_op_err;

	req = wqe_to_tid_req(wqe);
	trace_hfi1_tid_req_rcv_tid_ack(qp, 0, wqe->wr.opcode, wqe->psn,
				       wqe->lpsn, req);
	flow = &req->flows[req->acked_tail];
	trace_hfi1_tid_flow_rcv_tid_ack(qp, req->acked_tail, flow);

	/* Drop stale ACK/NAK */
	if (cmp_psn(psn, full_flow_psn(flow, flow->flow_state.spsn)) < 0)
		goto ack_op_err;

	/*
	 * Retire every sent segment whose last KDETH PSN is covered by this
	 * ACK. This may cross into the next WQE when a request completes.
	 */
	while (cmp_psn(ack_kpsn,
		       full_flow_psn(flow, flow->flow_state.lpsn)) >= 0 &&
	       req->ack_seg < req->cur_seg) {
		req->ack_seg++;
		/* advance acked segment pointer */
		req->acked_tail = CIRC_NEXT(req->acked_tail, MAX_FLOWS);
		req->r_last_acked = flow->flow_state.resp_ib_psn;
		trace_hfi1_tid_req_rcv_tid_ack(qp, 0, wqe->wr.opcode, wqe->psn,
					       wqe->lpsn, req);
		if (req->ack_seg == req->total_segs) {
			req->state = TID_REQUEST_COMPLETE;
			/* do_rc_completion() returns the WQE to look at next */
			wqe = do_rc_completion(qp, wqe,
					       to_iport(qp->ibqp.device,
							qp->port_num));
			trace_hfi1_sender_rcv_tid_ack(qp);
			atomic_dec(&qpriv->n_tid_requests);
			if (qp->s_acked == qp->s_tail)
				break;
			if (wqe->wr.opcode != IB_WR_TID_RDMA_WRITE)
				break;
			req = wqe_to_tid_req(wqe);
		}
		flow = &req->flows[req->acked_tail];
		trace_hfi1_tid_flow_rcv_tid_ack(qp, req->acked_tail, flow);
	}

	trace_hfi1_tid_req_rcv_tid_ack(qp, 0, wqe->wr.opcode, wqe->psn,
				       wqe->lpsn, req);
	/* Dispatch on the AETH type held in the top three bits */
	switch (aeth >> 29) {
	case 0:         /* ACK */
		if (qpriv->s_flags & RVT_S_WAIT_ACK)
			qpriv->s_flags &= ~RVT_S_WAIT_ACK;
		if (!hfi1_tid_rdma_is_resync_psn(psn)) {
			/* Check if there is any pending TID ACK */
			if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE &&
			    req->ack_seg < req->cur_seg)
				hfi1_mod_tid_retry_timer(qp);
			else
				hfi1_stop_tid_retry_timer(qp);
			hfi1_schedule_send(qp);
		} else {
			u32 spsn, fpsn, last_acked, generation;
			struct tid_rdma_request *rptr;

			/* ACK(RESYNC) */
			hfi1_stop_tid_retry_timer(qp);
			/* Allow new requests (see hfi1_make_tid_rdma_pkt) */
			qp->s_flags &= ~HFI1_S_WAIT_HALT;
			/*
			 * Clear RVT_S_SEND_ONE flag in case that the TID RDMA
			 * ACK is received after the TID retry timer is fired
			 * again. In this case, do not send any more TID
			 * RESYNC request or wait for any more TID ACK packet.
			 */
			qpriv->s_flags &= ~RVT_S_SEND_ONE;
			hfi1_schedule_send(qp);

			/* Nothing left to resend: everything has been acked */
			if ((qp->s_acked == qpriv->s_tid_tail &&
			     req->ack_seg == req->total_segs) ||
			    qp->s_acked == qp->s_tail) {
				qpriv->s_state = TID_OP(WRITE_DATA_LAST);
				goto done;
			}

			/* Every segment received so far has been acked */
			if (req->ack_seg == req->comp_seg) {
				qpriv->s_state = TID_OP(WRITE_DATA);
				goto done;
			}

			/*
			 * The PSN to start with is the next PSN after the
			 * RESYNC PSN.
			 */
			psn = mask_psn(psn + 1);
			generation = psn >> HFI1_KDETH_BTH_SEQ_SHIFT;
			spsn = 0;

			/*
			 * Update to the correct WQE when we get an ACK(RESYNC)
			 * in the middle of a request.
			 */
			if (delta_psn(ack_psn, wqe->lpsn))
				wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
			req = wqe_to_tid_req(wqe);
			flow = &req->flows[req->acked_tail];
			/*
			 * RESYNC re-numbers the PSN ranges of all remaining
			 * segments. Also, PSN's start from 0 in the middle of a
			 * segment and the first segment size is less than the
			 * default number of packets. flow->resync_npkts is used
			 * to track the number of packets from the start of the
			 * real segment to the point of 0 PSN after the RESYNC
			 * in order to later correctly rewind the SGE.
			 */
			fpsn = full_flow_psn(flow, flow->flow_state.spsn);
			req->r_ack_psn = psn;
			flow->resync_npkts +=
				delta_psn(mask_psn(resync_psn + 1), fpsn);
			/*
			 * Renumber all packet sequence number ranges
			 * based on the new generation.
			 */
			last_acked = qp->s_acked;
			rptr = req;
			/* Walk the WQEs from s_acked through s_tid_cur */
			while (1) {
				/* start from last acked segment */
				for (fidx = rptr->acked_tail;
				     CIRC_CNT(rptr->setup_head, fidx,
					      MAX_FLOWS);
				     fidx = CIRC_NEXT(fidx, MAX_FLOWS)) {
					u32 lpsn;
					u32 gen;

					flow = &rptr->flows[fidx];
					gen = flow->flow_state.generation;
					/*
					 * A flow already in the new generation
					 * is expected to start at spsn; warn
					 * and skip it otherwise.
					 */
					if (WARN_ON(gen == generation &&
						    flow->flow_state.spsn !=
						     spsn))
						continue;
					lpsn = flow->flow_state.lpsn;
					lpsn = full_flow_psn(flow, lpsn);
					/* packets of this flow past resync_psn */
					flow->npkts =
						delta_psn(lpsn,
							  mask_psn(resync_psn)
							  );
					flow->flow_state.generation =
						generation;
					flow->flow_state.spsn = spsn;
					flow->flow_state.lpsn =
						flow->flow_state.spsn +
						flow->npkts - 1;
					flow->pkt = 0;
					/* the next flow continues after this one */
					spsn += flow->npkts;
					resync_psn += flow->npkts;
					trace_hfi1_tid_flow_rcv_tid_ack(qp,
									fidx,
									flow);
				}
				if (++last_acked == qpriv->s_tid_cur + 1)
					break;
				if (last_acked == qp->s_size)
					last_acked = 0;
				wqe = rvt_get_swqe_ptr(qp, last_acked);
				rptr = wqe_to_tid_req(wqe);
			}
			/* Resume sending from the last acked segment */
			req->cur_seg = req->ack_seg;
			qpriv->s_tid_tail = qp->s_acked;
			qpriv->s_state = TID_OP(WRITE_REQ);
			hfi1_schedule_tid_send(qp);
		}
done:
		/* Any ACK refills the retry budget */
		qpriv->s_retry = qp->s_retry_cnt;
		break;

	case 3:         /* NAK */
		hfi1_stop_tid_retry_timer(qp);
		switch ((aeth >> IB_AETH_CREDIT_SHIFT) &
			IB_AETH_CREDIT_MASK) {
		case 0: /* PSN sequence error */
			flow = &req->flows[req->acked_tail];
			fspsn = full_flow_psn(flow, flow->flow_state.spsn);
			trace_hfi1_tid_flow_rcv_tid_ack(qp, req->acked_tail,
							flow);
			/* Resume sending from the last acked segment */
			req->r_ack_psn = mask_psn(be32_to_cpu(ohdr->bth[2]));
			req->cur_seg = req->ack_seg;
			qpriv->s_tid_tail = qp->s_acked;
			qpriv->s_state = TID_OP(WRITE_REQ);
			qpriv->s_retry = qp->s_retry_cnt;
			hfi1_schedule_tid_send(qp);
			break;

		default:
			break;
		}
		break;

	default:
		break;
	}

ack_op_err:
	spin_unlock_irqrestore(&qp->s_lock, flags);
}
4717 
4718 void hfi1_add_tid_retry_timer(struct rvt_qp *qp)
4719 {
4720 	struct hfi1_qp_priv *priv = qp->priv;
4721 	struct ib_qp *ibqp = &qp->ibqp;
4722 	struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device);
4723 
4724 	lockdep_assert_held(&qp->s_lock);
4725 	if (!(priv->s_flags & HFI1_S_TID_RETRY_TIMER)) {
4726 		priv->s_flags |= HFI1_S_TID_RETRY_TIMER;
4727 		priv->s_tid_retry_timer.expires = jiffies +
4728 			priv->tid_retry_timeout_jiffies + rdi->busy_jiffies;
4729 		add_timer(&priv->s_tid_retry_timer);
4730 	}
4731 }
4732 
4733 static void hfi1_mod_tid_retry_timer(struct rvt_qp *qp)
4734 {
4735 	struct hfi1_qp_priv *priv = qp->priv;
4736 	struct ib_qp *ibqp = &qp->ibqp;
4737 	struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device);
4738 
4739 	lockdep_assert_held(&qp->s_lock);
4740 	priv->s_flags |= HFI1_S_TID_RETRY_TIMER;
4741 	mod_timer(&priv->s_tid_retry_timer, jiffies +
4742 		  priv->tid_retry_timeout_jiffies + rdi->busy_jiffies);
4743 }
4744 
4745 static int hfi1_stop_tid_retry_timer(struct rvt_qp *qp)
4746 {
4747 	struct hfi1_qp_priv *priv = qp->priv;
4748 	int rval = 0;
4749 
4750 	lockdep_assert_held(&qp->s_lock);
4751 	if (priv->s_flags & HFI1_S_TID_RETRY_TIMER) {
4752 		rval = del_timer(&priv->s_tid_retry_timer);
4753 		priv->s_flags &= ~HFI1_S_TID_RETRY_TIMER;
4754 	}
4755 	return rval;
4756 }
4757 
4758 void hfi1_del_tid_retry_timer(struct rvt_qp *qp)
4759 {
4760 	struct hfi1_qp_priv *priv = qp->priv;
4761 
4762 	del_timer_sync(&priv->s_tid_retry_timer);
4763 	priv->s_flags &= ~HFI1_S_TID_RETRY_TIMER;
4764 }
4765 
4766 static void hfi1_tid_retry_timeout(struct timer_list *t)
4767 {
4768 	struct hfi1_qp_priv *priv = from_timer(priv, t, s_tid_retry_timer);
4769 	struct rvt_qp *qp = priv->owner;
4770 	struct rvt_swqe *wqe;
4771 	unsigned long flags;
4772 	struct tid_rdma_request *req;
4773 
4774 	spin_lock_irqsave(&qp->r_lock, flags);
4775 	spin_lock(&qp->s_lock);
4776 	trace_hfi1_tid_write_sender_retry_timeout(qp, 0);
4777 	if (priv->s_flags & HFI1_S_TID_RETRY_TIMER) {
4778 		hfi1_stop_tid_retry_timer(qp);
4779 		if (!priv->s_retry) {
4780 			trace_hfi1_msg_tid_retry_timeout(/* msg */
4781 				qp,
4782 				"Exhausted retries. Tid retry timeout = ",
4783 				(u64)priv->tid_retry_timeout_jiffies);
4784 
4785 			wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
4786 			hfi1_trdma_send_complete(qp, wqe, IB_WC_RETRY_EXC_ERR);
4787 			rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
4788 		} else {
4789 			wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
4790 			req = wqe_to_tid_req(wqe);
4791 			trace_hfi1_tid_req_tid_retry_timeout(/* req */
4792 			   qp, 0, wqe->wr.opcode, wqe->psn, wqe->lpsn, req);
4793 
4794 			priv->s_flags &= ~RVT_S_WAIT_ACK;
4795 			/* Only send one packet (the RESYNC) */
4796 			priv->s_flags |= RVT_S_SEND_ONE;
4797 			/*
4798 			 * No additional request shall be made by this QP until
4799 			 * the RESYNC has been complete.
4800 			 */
4801 			qp->s_flags |= HFI1_S_WAIT_HALT;
4802 			priv->s_state = TID_OP(RESYNC);
4803 			priv->s_retry--;
4804 			hfi1_schedule_tid_send(qp);
4805 		}
4806 	}
4807 	spin_unlock(&qp->s_lock);
4808 	spin_unlock_irqrestore(&qp->r_lock, flags);
4809 }
4810 
4811 u32 hfi1_build_tid_rdma_resync(struct rvt_qp *qp, struct rvt_swqe *wqe,
4812 			       struct ib_other_headers *ohdr, u32 *bth1,
4813 			       u32 *bth2, u16 fidx)
4814 {
4815 	struct hfi1_qp_priv *qpriv = qp->priv;
4816 	struct tid_rdma_params *remote;
4817 	struct tid_rdma_request *req = wqe_to_tid_req(wqe);
4818 	struct tid_rdma_flow *flow = &req->flows[fidx];
4819 	u32 generation;
4820 
4821 	rcu_read_lock();
4822 	remote = rcu_dereference(qpriv->tid_rdma.remote);
4823 	KDETH_RESET(ohdr->u.tid_rdma.ack.kdeth1, JKEY, remote->jkey);
4824 	ohdr->u.tid_rdma.ack.verbs_qp = cpu_to_be32(qp->remote_qpn);
4825 	*bth1 = remote->qp;
4826 	rcu_read_unlock();
4827 
4828 	generation = kern_flow_generation_next(flow->flow_state.generation);
4829 	*bth2 = mask_psn((generation << HFI1_KDETH_BTH_SEQ_SHIFT) - 1);
4830 	qpriv->s_resync_psn = *bth2;
4831 	*bth2 |= IB_BTH_REQ_ACK;
4832 	KDETH_RESET(ohdr->u.tid_rdma.ack.kdeth0, KVER, 0x1);
4833 
4834 	return sizeof(ohdr->u.tid_rdma.resync) / sizeof(u32);
4835 }
4836 
4837 void hfi1_rc_rcv_tid_rdma_resync(struct hfi1_packet *packet)
4838 {
4839 	struct ib_other_headers *ohdr = packet->ohdr;
4840 	struct rvt_qp *qp = packet->qp;
4841 	struct hfi1_qp_priv *qpriv = qp->priv;
4842 	struct hfi1_ctxtdata *rcd = qpriv->rcd;
4843 	struct hfi1_ibdev *dev = to_idev(qp->ibqp.device);
4844 	struct rvt_ack_entry *e;
4845 	struct tid_rdma_request *req;
4846 	struct tid_rdma_flow *flow;
4847 	struct tid_flow_state *fs = &qpriv->flow_state;
4848 	u32 psn, generation, idx, gen_next;
4849 	bool is_fecn;
4850 	unsigned long flags;
4851 
4852 	is_fecn = process_ecn(qp, packet);
4853 	psn = mask_psn(be32_to_cpu(ohdr->bth[2]));
4854 
4855 	generation = mask_psn(psn + 1) >> HFI1_KDETH_BTH_SEQ_SHIFT;
4856 	spin_lock_irqsave(&qp->s_lock, flags);
4857 
4858 	gen_next = (fs->generation == KERN_GENERATION_RESERVED) ?
4859 		generation : kern_flow_generation_next(fs->generation);
4860 	/*
4861 	 * RESYNC packet contains the "next" generation and can only be
4862 	 * from the current or previous generations
4863 	 */
4864 	if (generation != mask_generation(gen_next - 1) &&
4865 	    generation != gen_next)
4866 		goto bail;
4867 	/* Already processing a resync */
4868 	if (qpriv->resync)
4869 		goto bail;
4870 
4871 	spin_lock(&rcd->exp_lock);
4872 	if (fs->index >= RXE_NUM_TID_FLOWS) {
4873 		/*
4874 		 * If we don't have a flow, save the generation so it can be
4875 		 * applied when a new flow is allocated
4876 		 */
4877 		fs->generation = generation;
4878 	} else {
4879 		/* Reprogram the QP flow with new generation */
4880 		rcd->flows[fs->index].generation = generation;
4881 		fs->generation = kern_setup_hw_flow(rcd, fs->index);
4882 	}
4883 	fs->psn = 0;
4884 	/*
4885 	 * Disable SW PSN checking since a RESYNC is equivalent to a
4886 	 * sync point and the flow has/will be reprogrammed
4887 	 */
4888 	qpriv->s_flags &= ~HFI1_R_TID_SW_PSN;
4889 	trace_hfi1_tid_write_rsp_rcv_resync(qp);
4890 
4891 	/*
4892 	 * Reset all TID flow information with the new generation.
4893 	 * This is done for all requests and segments after the
4894 	 * last received segment
4895 	 */
4896 	for (idx = qpriv->r_tid_tail; ; idx++) {
4897 		u16 flow_idx;
4898 
4899 		if (idx > rvt_size_atomic(&dev->rdi))
4900 			idx = 0;
4901 		e = &qp->s_ack_queue[idx];
4902 		if (e->opcode == TID_OP(WRITE_REQ)) {
4903 			req = ack_to_tid_req(e);
4904 			trace_hfi1_tid_req_rcv_resync(qp, 0, e->opcode, e->psn,
4905 						      e->lpsn, req);
4906 
4907 			/* start from last unacked segment */
4908 			for (flow_idx = req->clear_tail;
4909 			     CIRC_CNT(req->setup_head, flow_idx,
4910 				      MAX_FLOWS);
4911 			     flow_idx = CIRC_NEXT(flow_idx, MAX_FLOWS)) {
4912 				u32 lpsn;
4913 				u32 next;
4914 
4915 				flow = &req->flows[flow_idx];
4916 				lpsn = full_flow_psn(flow,
4917 						     flow->flow_state.lpsn);
4918 				next = flow->flow_state.r_next_psn;
4919 				flow->npkts = delta_psn(lpsn, next - 1);
4920 				flow->flow_state.generation = fs->generation;
4921 				flow->flow_state.spsn = fs->psn;
4922 				flow->flow_state.lpsn =
4923 					flow->flow_state.spsn + flow->npkts - 1;
4924 				flow->flow_state.r_next_psn =
4925 					full_flow_psn(flow,
4926 						      flow->flow_state.spsn);
4927 				fs->psn += flow->npkts;
4928 				trace_hfi1_tid_flow_rcv_resync(qp, flow_idx,
4929 							       flow);
4930 			}
4931 		}
4932 		if (idx == qp->s_tail_ack_queue)
4933 			break;
4934 	}
4935 
4936 	spin_unlock(&rcd->exp_lock);
4937 	qpriv->resync = true;
4938 	/* RESYNC request always gets a TID RDMA ACK. */
4939 	qpriv->s_nak_state = 0;
4940 	qpriv->s_flags |= RVT_S_ACK_PENDING;
4941 	hfi1_schedule_tid_send(qp);
4942 bail:
4943 	spin_unlock_irqrestore(&qp->s_lock, flags);
4944 }
4945 
4946 /*
4947  * Call this function when the last TID RDMA WRITE DATA packet for a request
4948  * is built.
4949  */
4950 static void update_tid_tail(struct rvt_qp *qp)
4951 	__must_hold(&qp->s_lock)
4952 {
4953 	struct hfi1_qp_priv *priv = qp->priv;
4954 	u32 i;
4955 	struct rvt_swqe *wqe;
4956 
4957 	lockdep_assert_held(&qp->s_lock);
4958 	/* Can't move beyond s_tid_cur */
4959 	if (priv->s_tid_tail == priv->s_tid_cur)
4960 		return;
4961 	for (i = priv->s_tid_tail + 1; ; i++) {
4962 		if (i == qp->s_size)
4963 			i = 0;
4964 
4965 		if (i == priv->s_tid_cur)
4966 			break;
4967 		wqe = rvt_get_swqe_ptr(qp, i);
4968 		if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE)
4969 			break;
4970 	}
4971 	priv->s_tid_tail = i;
4972 	priv->s_state = TID_OP(WRITE_RESP);
4973 }
4974 
4975 int hfi1_make_tid_rdma_pkt(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
4976 	__must_hold(&qp->s_lock)
4977 {
4978 	struct hfi1_qp_priv *priv = qp->priv;
4979 	struct rvt_swqe *wqe;
4980 	u32 bth1 = 0, bth2 = 0, hwords = 5, len, middle = 0;
4981 	struct ib_other_headers *ohdr;
4982 	struct rvt_sge_state *ss = &qp->s_sge;
4983 	struct rvt_ack_entry *e = &qp->s_ack_queue[qp->s_tail_ack_queue];
4984 	struct tid_rdma_request *req = ack_to_tid_req(e);
4985 	bool last = false;
4986 	u8 opcode = TID_OP(WRITE_DATA);
4987 
4988 	lockdep_assert_held(&qp->s_lock);
4989 	trace_hfi1_tid_write_sender_make_tid_pkt(qp, 0);
4990 	/*
4991 	 * Prioritize the sending of the requests and responses over the
4992 	 * sending of the TID RDMA data packets.
4993 	 */
4994 	if (((atomic_read(&priv->n_tid_requests) < HFI1_TID_RDMA_WRITE_CNT) &&
4995 	     atomic_read(&priv->n_requests) &&
4996 	     !(qp->s_flags & (RVT_S_BUSY | RVT_S_WAIT_ACK |
4997 			     HFI1_S_ANY_WAIT_IO))) ||
4998 	    (e->opcode == TID_OP(WRITE_REQ) && req->cur_seg < req->alloc_seg &&
4999 	     !(qp->s_flags & (RVT_S_BUSY | HFI1_S_ANY_WAIT_IO)))) {
5000 		struct iowait_work *iowork;
5001 
5002 		iowork = iowait_get_ib_work(&priv->s_iowait);
5003 		ps->s_txreq = get_waiting_verbs_txreq(iowork);
5004 		if (ps->s_txreq || hfi1_make_rc_req(qp, ps)) {
5005 			priv->s_flags |= HFI1_S_TID_BUSY_SET;
5006 			return 1;
5007 		}
5008 	}
5009 
5010 	ps->s_txreq = get_txreq(ps->dev, qp);
5011 	if (!ps->s_txreq)
5012 		goto bail_no_tx;
5013 
5014 	ohdr = &ps->s_txreq->phdr.hdr.ibh.u.oth;
5015 
5016 	if ((priv->s_flags & RVT_S_ACK_PENDING) &&
5017 	    make_tid_rdma_ack(qp, ohdr, ps))
5018 		return 1;
5019 
5020 	if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_SEND_OK)) {
5021 		if (!(ib_rvt_state_ops[qp->state] & RVT_FLUSH_SEND))
5022 			goto bail;
5023 		/* We are in the error state, flush the work request. */
5024 		if (qp->s_last == READ_ONCE(qp->s_head))
5025 			goto bail;
5026 		/* If DMAs are in progress, we can't flush immediately. */
5027 		if (iowait_sdma_pending(&priv->s_iowait)) {
5028 			qp->s_flags |= RVT_S_WAIT_DMA;
5029 			goto bail;
5030 		}
5031 		clear_ahg(qp);
5032 		wqe = rvt_get_swqe_ptr(qp, qp->s_last);
5033 		hfi1_trdma_send_complete(qp, wqe, qp->s_last != qp->s_acked ?
5034 					 IB_WC_SUCCESS : IB_WC_WR_FLUSH_ERR);
5035 		/* will get called again */
5036 		goto done_free_tx;
5037 	}
5038 
5039 	if (priv->s_flags & RVT_S_WAIT_ACK)
5040 		goto bail;
5041 
5042 	/* Check whether there is anything to do. */
5043 	if (priv->s_tid_tail == HFI1_QP_WQE_INVALID)
5044 		goto bail;
5045 	wqe = rvt_get_swqe_ptr(qp, priv->s_tid_tail);
5046 	req = wqe_to_tid_req(wqe);
5047 	trace_hfi1_tid_req_make_tid_pkt(qp, 0, wqe->wr.opcode, wqe->psn,
5048 					wqe->lpsn, req);
5049 	switch (priv->s_state) {
5050 	case TID_OP(WRITE_REQ):
5051 	case TID_OP(WRITE_RESP):
5052 		priv->tid_ss.sge = wqe->sg_list[0];
5053 		priv->tid_ss.sg_list = wqe->sg_list + 1;
5054 		priv->tid_ss.num_sge = wqe->wr.num_sge;
5055 		priv->tid_ss.total_len = wqe->length;
5056 
5057 		if (priv->s_state == TID_OP(WRITE_REQ))
5058 			hfi1_tid_rdma_restart_req(qp, wqe, &bth2);
5059 		priv->s_state = TID_OP(WRITE_DATA);
5060 		/* fall through */
5061 
5062 	case TID_OP(WRITE_DATA):
5063 		/*
5064 		 * 1. Check whether TID RDMA WRITE RESP available.
5065 		 * 2. If no:
5066 		 *    2.1 If have more segments and no TID RDMA WRITE RESP,
5067 		 *        set HFI1_S_WAIT_TID_RESP
5068 		 *    2.2 Return indicating no progress made.
5069 		 * 3. If yes:
5070 		 *    3.1 Build TID RDMA WRITE DATA packet.
5071 		 *    3.2 If last packet in segment:
5072 		 *        3.2.1 Change KDETH header bits
5073 		 *        3.2.2 Advance RESP pointers.
5074 		 *    3.3 Return indicating progress made.
5075 		 */
5076 		trace_hfi1_sender_make_tid_pkt(qp);
5077 		trace_hfi1_tid_write_sender_make_tid_pkt(qp, 0);
5078 		wqe = rvt_get_swqe_ptr(qp, priv->s_tid_tail);
5079 		req = wqe_to_tid_req(wqe);
5080 		len = wqe->length;
5081 
5082 		if (!req->comp_seg || req->cur_seg == req->comp_seg)
5083 			goto bail;
5084 
5085 		trace_hfi1_tid_req_make_tid_pkt(qp, 0, wqe->wr.opcode,
5086 						wqe->psn, wqe->lpsn, req);
5087 		last = hfi1_build_tid_rdma_packet(wqe, ohdr, &bth1, &bth2,
5088 						  &len);
5089 
5090 		if (last) {
5091 			/* move pointer to next flow */
5092 			req->clear_tail = CIRC_NEXT(req->clear_tail,
5093 						    MAX_FLOWS);
5094 			if (++req->cur_seg < req->total_segs) {
5095 				if (!CIRC_CNT(req->setup_head, req->clear_tail,
5096 					      MAX_FLOWS))
5097 					qp->s_flags |= HFI1_S_WAIT_TID_RESP;
5098 			} else {
5099 				priv->s_state = TID_OP(WRITE_DATA_LAST);
5100 				opcode = TID_OP(WRITE_DATA_LAST);
5101 
5102 				/* Advance the s_tid_tail now */
5103 				update_tid_tail(qp);
5104 			}
5105 		}
5106 		hwords += sizeof(ohdr->u.tid_rdma.w_data) / sizeof(u32);
5107 		ss = &priv->tid_ss;
5108 		break;
5109 
5110 	case TID_OP(RESYNC):
5111 		trace_hfi1_sender_make_tid_pkt(qp);
5112 		/* Use generation from the most recently received response */
5113 		wqe = rvt_get_swqe_ptr(qp, priv->s_tid_cur);
5114 		req = wqe_to_tid_req(wqe);
5115 		/* If no responses for this WQE look at the previous one */
5116 		if (!req->comp_seg) {
5117 			wqe = rvt_get_swqe_ptr(qp,
5118 					       (!priv->s_tid_cur ? qp->s_size :
5119 						priv->s_tid_cur) - 1);
5120 			req = wqe_to_tid_req(wqe);
5121 		}
5122 		hwords += hfi1_build_tid_rdma_resync(qp, wqe, ohdr, &bth1,
5123 						     &bth2,
5124 						     CIRC_PREV(req->setup_head,
5125 							       MAX_FLOWS));
5126 		ss = NULL;
5127 		len = 0;
5128 		opcode = TID_OP(RESYNC);
5129 		break;
5130 
5131 	default:
5132 		goto bail;
5133 	}
5134 	if (priv->s_flags & RVT_S_SEND_ONE) {
5135 		priv->s_flags &= ~RVT_S_SEND_ONE;
5136 		priv->s_flags |= RVT_S_WAIT_ACK;
5137 		bth2 |= IB_BTH_REQ_ACK;
5138 	}
5139 	qp->s_len -= len;
5140 	ps->s_txreq->hdr_dwords = hwords;
5141 	ps->s_txreq->sde = priv->s_sde;
5142 	ps->s_txreq->ss = ss;
5143 	ps->s_txreq->s_cur_size = len;
5144 	hfi1_make_ruc_header(qp, ohdr, (opcode << 24), bth1, bth2,
5145 			     middle, ps);
5146 	return 1;
5147 done_free_tx:
5148 	hfi1_put_txreq(ps->s_txreq);
5149 	ps->s_txreq = NULL;
5150 	return 1;
5151 
5152 bail:
5153 	hfi1_put_txreq(ps->s_txreq);
5154 bail_no_tx:
5155 	ps->s_txreq = NULL;
5156 	priv->s_flags &= ~RVT_S_BUSY;
5157 	/*
5158 	 * If we didn't get a txreq, the QP will be woken up later to try
5159 	 * again, set the flags to the the wake up which work item to wake
5160 	 * up.
5161 	 * (A better algorithm should be found to do this and generalize the
5162 	 * sleep/wakeup flags.)
5163 	 */
5164 	iowait_set_flag(&priv->s_iowait, IOWAIT_PENDING_TID);
5165 	return 0;
5166 }
5167 
5168 static int make_tid_rdma_ack(struct rvt_qp *qp,
5169 			     struct ib_other_headers *ohdr,
5170 			     struct hfi1_pkt_state *ps)
5171 {
5172 	struct rvt_ack_entry *e;
5173 	struct hfi1_qp_priv *qpriv = qp->priv;
5174 	struct hfi1_ibdev *dev = to_idev(qp->ibqp.device);
5175 	u32 hwords, next;
5176 	u32 len = 0;
5177 	u32 bth1 = 0, bth2 = 0;
5178 	int middle = 0;
5179 	u16 flow;
5180 	struct tid_rdma_request *req, *nreq;
5181 
5182 	trace_hfi1_tid_write_rsp_make_tid_ack(qp);
5183 	/* Don't send an ACK if we aren't supposed to. */
5184 	if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK))
5185 		goto bail;
5186 
5187 	/* header size in 32-bit words LRH+BTH = (8+12)/4. */
5188 	hwords = 5;
5189 
5190 	e = &qp->s_ack_queue[qpriv->r_tid_ack];
5191 	req = ack_to_tid_req(e);
5192 	/*
5193 	 * In the RESYNC case, we are exactly one segment past the
5194 	 * previously sent ack or at the previously sent NAK. So to send
5195 	 * the resync ack, we go back one segment (which might be part of
5196 	 * the previous request) and let the do-while loop execute again.
5197 	 * The advantage of executing the do-while loop is that any data
5198 	 * received after the previous ack is automatically acked in the
5199 	 * RESYNC ack. It turns out that for the do-while loop we only need
5200 	 * to pull back qpriv->r_tid_ack, not the segment
5201 	 * indices/counters. The scheme works even if the previous request
5202 	 * was not a TID WRITE request.
5203 	 */
5204 	if (qpriv->resync) {
5205 		if (!req->ack_seg || req->ack_seg == req->total_segs)
5206 			qpriv->r_tid_ack = !qpriv->r_tid_ack ?
5207 				rvt_size_atomic(&dev->rdi) :
5208 				qpriv->r_tid_ack - 1;
5209 		e = &qp->s_ack_queue[qpriv->r_tid_ack];
5210 		req = ack_to_tid_req(e);
5211 	}
5212 
5213 	trace_hfi1_rsp_make_tid_ack(qp, e->psn);
5214 	trace_hfi1_tid_req_make_tid_ack(qp, 0, e->opcode, e->psn, e->lpsn,
5215 					req);
5216 	/*
5217 	 * If we've sent all the ACKs that we can, we are done
5218 	 * until we get more segments...
5219 	 */
5220 	if (!qpriv->s_nak_state && !qpriv->resync &&
5221 	    req->ack_seg == req->comp_seg)
5222 		goto bail;
5223 
5224 	do {
5225 		/*
5226 		 * To deal with coalesced ACKs, the acked_tail pointer
5227 		 * into the flow array is used. The distance between it
5228 		 * and the clear_tail is the number of flows that are
5229 		 * being ACK'ed.
5230 		 */
5231 		req->ack_seg +=
5232 			/* Get up-to-date value */
5233 			CIRC_CNT(req->clear_tail, req->acked_tail,
5234 				 MAX_FLOWS);
5235 		/* Advance acked index */
5236 		req->acked_tail = req->clear_tail;
5237 
5238 		/*
5239 		 * req->clear_tail points to the segment currently being
5240 		 * received. So, when sending an ACK, the previous
5241 		 * segment is being ACK'ed.
5242 		 */
5243 		flow = CIRC_PREV(req->acked_tail, MAX_FLOWS);
5244 		if (req->ack_seg != req->total_segs)
5245 			break;
5246 		req->state = TID_REQUEST_COMPLETE;
5247 
5248 		next = qpriv->r_tid_ack + 1;
5249 		if (next > rvt_size_atomic(&dev->rdi))
5250 			next = 0;
5251 		qpriv->r_tid_ack = next;
5252 		if (qp->s_ack_queue[next].opcode != TID_OP(WRITE_REQ))
5253 			break;
5254 		nreq = ack_to_tid_req(&qp->s_ack_queue[next]);
5255 		if (!nreq->comp_seg || nreq->ack_seg == nreq->comp_seg)
5256 			break;
5257 
5258 		/* Move to the next ack entry now */
5259 		e = &qp->s_ack_queue[qpriv->r_tid_ack];
5260 		req = ack_to_tid_req(e);
5261 	} while (1);
5262 
5263 	/*
5264 	 * At this point qpriv->r_tid_ack == qpriv->r_tid_tail but e and
5265 	 * req could be pointing at the previous ack queue entry
5266 	 */
5267 	if (qpriv->s_nak_state ||
5268 	    (qpriv->resync &&
5269 	     !hfi1_tid_rdma_is_resync_psn(qpriv->r_next_psn_kdeth - 1) &&
5270 	     (cmp_psn(qpriv->r_next_psn_kdeth - 1,
5271 		      full_flow_psn(&req->flows[flow],
5272 				    req->flows[flow].flow_state.lpsn)) > 0))) {
5273 		/*
5274 		 * A NAK will implicitly acknowledge all previous TID RDMA
5275 		 * requests. Therefore, we NAK with the req->acked_tail
5276 		 * segment for the request at qpriv->r_tid_ack (same at
5277 		 * this point as the req->clear_tail segment for the
5278 		 * qpriv->r_tid_tail request)
5279 		 */
5280 		e = &qp->s_ack_queue[qpriv->r_tid_ack];
5281 		req = ack_to_tid_req(e);
5282 		flow = req->acked_tail;
5283 	} else if (req->ack_seg == req->total_segs &&
5284 		   qpriv->s_flags & HFI1_R_TID_WAIT_INTERLCK)
5285 		qpriv->s_flags &= ~HFI1_R_TID_WAIT_INTERLCK;
5286 
5287 	trace_hfi1_tid_write_rsp_make_tid_ack(qp);
5288 	trace_hfi1_tid_req_make_tid_ack(qp, 0, e->opcode, e->psn, e->lpsn,
5289 					req);
5290 	hwords += hfi1_build_tid_rdma_write_ack(qp, e, ohdr, flow, &bth1,
5291 						&bth2);
5292 	len = 0;
5293 	qpriv->s_flags &= ~RVT_S_ACK_PENDING;
5294 	ps->s_txreq->hdr_dwords = hwords;
5295 	ps->s_txreq->sde = qpriv->s_sde;
5296 	ps->s_txreq->s_cur_size = len;
5297 	ps->s_txreq->ss = NULL;
5298 	hfi1_make_ruc_header(qp, ohdr, (TID_OP(ACK) << 24), bth1, bth2, middle,
5299 			     ps);
5300 	ps->s_txreq->txreq.flags |= SDMA_TXREQ_F_VIP;
5301 	return 1;
5302 bail:
5303 	/*
5304 	 * Ensure s_rdma_ack_cnt changes are committed prior to resetting
5305 	 * RVT_S_RESP_PENDING
5306 	 */
5307 	smp_wmb();
5308 	qpriv->s_flags &= ~RVT_S_ACK_PENDING;
5309 	return 0;
5310 }
5311 
5312 static int hfi1_send_tid_ok(struct rvt_qp *qp)
5313 {
5314 	struct hfi1_qp_priv *priv = qp->priv;
5315 
5316 	return !(priv->s_flags & RVT_S_BUSY ||
5317 		 qp->s_flags & HFI1_S_ANY_WAIT_IO) &&
5318 		(verbs_txreq_queued(iowait_get_tid_work(&priv->s_iowait)) ||
5319 		 (priv->s_flags & RVT_S_RESP_PENDING) ||
5320 		 !(qp->s_flags & HFI1_S_ANY_TID_WAIT_SEND));
5321 }
5322 
5323 void _hfi1_do_tid_send(struct work_struct *work)
5324 {
5325 	struct iowait_work *w = container_of(work, struct iowait_work, iowork);
5326 	struct rvt_qp *qp = iowait_to_qp(w->iow);
5327 
5328 	hfi1_do_tid_send(qp);
5329 }
5330 
5331 static void hfi1_do_tid_send(struct rvt_qp *qp)
5332 {
5333 	struct hfi1_pkt_state ps;
5334 	struct hfi1_qp_priv *priv = qp->priv;
5335 
5336 	ps.dev = to_idev(qp->ibqp.device);
5337 	ps.ibp = to_iport(qp->ibqp.device, qp->port_num);
5338 	ps.ppd = ppd_from_ibp(ps.ibp);
5339 	ps.wait = iowait_get_tid_work(&priv->s_iowait);
5340 	ps.in_thread = false;
5341 	ps.timeout_int = qp->timeout_jiffies / 8;
5342 
5343 	trace_hfi1_rc_do_tid_send(qp, false);
5344 	spin_lock_irqsave(&qp->s_lock, ps.flags);
5345 
5346 	/* Return if we are already busy processing a work request. */
5347 	if (!hfi1_send_tid_ok(qp)) {
5348 		if (qp->s_flags & HFI1_S_ANY_WAIT_IO)
5349 			iowait_set_flag(&priv->s_iowait, IOWAIT_PENDING_TID);
5350 		spin_unlock_irqrestore(&qp->s_lock, ps.flags);
5351 		return;
5352 	}
5353 
5354 	priv->s_flags |= RVT_S_BUSY;
5355 
5356 	ps.timeout = jiffies + ps.timeout_int;
5357 	ps.cpu = priv->s_sde ? priv->s_sde->cpu :
5358 		cpumask_first(cpumask_of_node(ps.ppd->dd->node));
5359 	ps.pkts_sent = false;
5360 
5361 	/* insure a pre-built packet is handled  */
5362 	ps.s_txreq = get_waiting_verbs_txreq(ps.wait);
5363 	do {
5364 		/* Check for a constructed packet to be sent. */
5365 		if (ps.s_txreq) {
5366 			if (priv->s_flags & HFI1_S_TID_BUSY_SET) {
5367 				qp->s_flags |= RVT_S_BUSY;
5368 				ps.wait = iowait_get_ib_work(&priv->s_iowait);
5369 			}
5370 			spin_unlock_irqrestore(&qp->s_lock, ps.flags);
5371 
5372 			/*
5373 			 * If the packet cannot be sent now, return and
5374 			 * the send tasklet will be woken up later.
5375 			 */
5376 			if (hfi1_verbs_send(qp, &ps))
5377 				return;
5378 
5379 			/* allow other tasks to run */
5380 			if (hfi1_schedule_send_yield(qp, &ps, true))
5381 				return;
5382 
5383 			spin_lock_irqsave(&qp->s_lock, ps.flags);
5384 			if (priv->s_flags & HFI1_S_TID_BUSY_SET) {
5385 				qp->s_flags &= ~RVT_S_BUSY;
5386 				priv->s_flags &= ~HFI1_S_TID_BUSY_SET;
5387 				ps.wait = iowait_get_tid_work(&priv->s_iowait);
5388 				if (iowait_flag_set(&priv->s_iowait,
5389 						    IOWAIT_PENDING_IB))
5390 					hfi1_schedule_send(qp);
5391 			}
5392 		}
5393 	} while (hfi1_make_tid_rdma_pkt(qp, &ps));
5394 	iowait_starve_clear(ps.pkts_sent, &priv->s_iowait);
5395 	spin_unlock_irqrestore(&qp->s_lock, ps.flags);
5396 }
5397 
5398 static bool _hfi1_schedule_tid_send(struct rvt_qp *qp)
5399 {
5400 	struct hfi1_qp_priv *priv = qp->priv;
5401 	struct hfi1_ibport *ibp =
5402 		to_iport(qp->ibqp.device, qp->port_num);
5403 	struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
5404 	struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device);
5405 
5406 	return iowait_tid_schedule(&priv->s_iowait, ppd->hfi1_wq,
5407 				   priv->s_sde ?
5408 				   priv->s_sde->cpu :
5409 				   cpumask_first(cpumask_of_node(dd->node)));
5410 }
5411 
5412 /**
5413  * hfi1_schedule_tid_send - schedule progress on TID RDMA state machine
5414  * @qp: the QP
5415  *
5416  * This schedules qp progress on the TID RDMA state machine. Caller
5417  * should hold the s_lock.
5418  * Unlike hfi1_schedule_send(), this cannot use hfi1_send_ok() because
5419  * the two state machines can step on each other with respect to the
5420  * RVT_S_BUSY flag.
5421  * Therefore, a modified test is used.
5422  * @return true if the second leg is scheduled;
5423  *  false if the second leg is not scheduled.
5424  */
5425 bool hfi1_schedule_tid_send(struct rvt_qp *qp)
5426 {
5427 	lockdep_assert_held(&qp->s_lock);
5428 	if (hfi1_send_tid_ok(qp)) {
5429 		/*
5430 		 * The following call returns true if the qp is not on the
5431 		 * queue and false if the qp is already on the queue before
5432 		 * this call. Either way, the qp will be on the queue when the
5433 		 * call returns.
5434 		 */
5435 		_hfi1_schedule_tid_send(qp);
5436 		return true;
5437 	}
5438 	if (qp->s_flags & HFI1_S_ANY_WAIT_IO)
5439 		iowait_set_flag(&((struct hfi1_qp_priv *)qp->priv)->s_iowait,
5440 				IOWAIT_PENDING_TID);
5441 	return false;
5442 }
5443 
5444 bool hfi1_tid_rdma_ack_interlock(struct rvt_qp *qp, struct rvt_ack_entry *e)
5445 {
5446 	struct rvt_ack_entry *prev;
5447 	struct tid_rdma_request *req;
5448 	struct hfi1_ibdev *dev = to_idev(qp->ibqp.device);
5449 	struct hfi1_qp_priv *priv = qp->priv;
5450 	u32 s_prev;
5451 
5452 	s_prev = qp->s_tail_ack_queue == 0 ? rvt_size_atomic(&dev->rdi) :
5453 		(qp->s_tail_ack_queue - 1);
5454 	prev = &qp->s_ack_queue[s_prev];
5455 
5456 	if ((e->opcode == TID_OP(READ_REQ) ||
5457 	     e->opcode == OP(RDMA_READ_REQUEST)) &&
5458 	    prev->opcode == TID_OP(WRITE_REQ)) {
5459 		req = ack_to_tid_req(prev);
5460 		if (req->ack_seg != req->total_segs) {
5461 			priv->s_flags |= HFI1_R_TID_WAIT_INTERLCK;
5462 			return true;
5463 		}
5464 	}
5465 	return false;
5466 }
5467