1 // SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
2 /*
3  * Copyright(c) 2018 Intel Corporation.
4  *
5  */
6 
7 #include "hfi.h"
8 #include "qp.h"
9 #include "rc.h"
10 #include "verbs.h"
11 #include "tid_rdma.h"
12 #include "exp_rcv.h"
13 #include "trace.h"
14 
15 /**
16  * DOC: TID RDMA READ protocol
17  *
18  * This is an end-to-end protocol at the hfi1 level between two nodes that
19  * improves performance by avoiding data copy on the requester side. It
20  * converts a qualified RDMA READ request into a TID RDMA READ request on
21  * the requester side and thereafter handles the request and response
22  * differently. To be qualified, the RDMA READ request should meet the
23  * following:
24  * -- The total data length should be greater than 256K;
25  * -- The total data length should be a multiple of 4K page size;
26  * -- Each local scatter-gather entry should be 4K page aligned;
27  * -- Each local scatter-gather entry should be a multiple of 4K page size;
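 *
 * For example, with 4K pages a 1MB RDMA READ whose local scatter-gather
 * entries are all 4K aligned and 4K multiples qualifies and is converted,
 * while a 128K request does not (the total length is not greater than 256K)
 * and is issued as an ordinary RDMA READ.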
28  */
29 
30 #define RCV_TID_FLOW_TABLE_CTRL_FLOW_VALID_SMASK BIT_ULL(32)
31 #define RCV_TID_FLOW_TABLE_CTRL_HDR_SUPP_EN_SMASK BIT_ULL(33)
32 #define RCV_TID_FLOW_TABLE_CTRL_KEEP_AFTER_SEQ_ERR_SMASK BIT_ULL(34)
33 #define RCV_TID_FLOW_TABLE_CTRL_KEEP_ON_GEN_ERR_SMASK BIT_ULL(35)
34 #define RCV_TID_FLOW_TABLE_STATUS_SEQ_MISMATCH_SMASK BIT_ULL(37)
35 #define RCV_TID_FLOW_TABLE_STATUS_GEN_MISMATCH_SMASK BIT_ULL(38)
36 
37 /* Maximum number of packets within a flow generation. */
38 #define MAX_TID_FLOW_PSN BIT(HFI1_KDETH_BTH_SEQ_SHIFT)
39 
40 #define GENERATION_MASK 0xFFFFF
41 
42 static u32 mask_generation(u32 a)
43 {
44 	return a & GENERATION_MASK;
45 }
46 
47 /* Reserved generation value to set to unused flows for kernel contexts */
48 #define KERN_GENERATION_RESERVED mask_generation(U32_MAX)
49 
50 /*
51  * J_KEY for kernel contexts when TID RDMA is used.
52  * See generate_jkey() in hfi.h for more information.
53  */
54 #define TID_RDMA_JKEY                   32
55 #define HFI1_KERNEL_MIN_JKEY HFI1_ADMIN_JKEY_RANGE
56 #define HFI1_KERNEL_MAX_JKEY (2 * HFI1_ADMIN_JKEY_RANGE - 1)
57 
58 /* Maximum number of segments in flight per QP request. */
59 #define TID_RDMA_MAX_READ_SEGS_PER_REQ  6
60 #define TID_RDMA_MAX_WRITE_SEGS_PER_REQ 4
61 #define MAX_REQ max_t(u16, TID_RDMA_MAX_READ_SEGS_PER_REQ, \
62 			TID_RDMA_MAX_WRITE_SEGS_PER_REQ)
63 #define MAX_FLOWS roundup_pow_of_two(MAX_REQ + 1)
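/*
 * With the values above, MAX_REQ is 6 and MAX_FLOWS evaluates to
 * roundup_pow_of_two(7) == 8 flow entries per request.
 */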
64 
65 #define MAX_EXPECTED_PAGES     (MAX_EXPECTED_BUFFER / PAGE_SIZE)
66 
67 #define TID_RDMA_DESTQP_FLOW_SHIFT      11
68 #define TID_RDMA_DESTQP_FLOW_MASK       0x1f
69 
70 #define TID_OPFN_QP_CTXT_MASK 0xff
71 #define TID_OPFN_QP_CTXT_SHIFT 56
72 #define TID_OPFN_QP_KDETH_MASK 0xff
73 #define TID_OPFN_QP_KDETH_SHIFT 48
74 #define TID_OPFN_MAX_LEN_MASK 0x7ff
75 #define TID_OPFN_MAX_LEN_SHIFT 37
76 #define TID_OPFN_TIMEOUT_MASK 0x1f
77 #define TID_OPFN_TIMEOUT_SHIFT 32
78 #define TID_OPFN_RESERVED_MASK 0x3f
79 #define TID_OPFN_RESERVED_SHIFT 26
80 #define TID_OPFN_URG_MASK 0x1
81 #define TID_OPFN_URG_SHIFT 25
82 #define TID_OPFN_VER_MASK 0x7
83 #define TID_OPFN_VER_SHIFT 22
84 #define TID_OPFN_JKEY_MASK 0x3f
85 #define TID_OPFN_JKEY_SHIFT 16
86 #define TID_OPFN_MAX_READ_MASK 0x3f
87 #define TID_OPFN_MAX_READ_SHIFT 10
88 #define TID_OPFN_MAX_WRITE_MASK 0x3f
89 #define TID_OPFN_MAX_WRITE_SHIFT 4
90 
91 /*
92  * OPFN TID layout
93  *
94  * 63               47               31               15
95  * NNNNNNNNKKKKKKKK MMMMMMMMMMMTTTTT DDDDDDUVVVJJJJJJ RRRRRRWWWWWWCCCC
96  * 3210987654321098 7654321098765432 1098765432109876 5432109876543210
97  * N - the context Number
98  * K - the Kdeth_qp
99  * M - Max_len
100  * T - Timeout
101  * D - reserveD
102  * V - version
103  * U - Urg capable
104  * J - Jkey
105  * R - max_Read
106  * W - max_Write
107  * C - Capcode
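 *
 * As a worked example (assuming 4K pages), a 256K max_len is encoded in the
 * M field as (256K >> PAGE_SHIFT) - 1 = 63 by tid_rdma_opfn_encode(); the
 * decoder adds 1 back and shifts by PAGE_SHIFT to recover 256K.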
108  */
109 
110 static u32 tid_rdma_flow_wt;
111 
112 static void tid_rdma_trigger_resume(struct work_struct *work);
113 static void hfi1_kern_exp_rcv_free_flows(struct tid_rdma_request *req);
114 static int hfi1_kern_exp_rcv_alloc_flows(struct tid_rdma_request *req,
115 					 gfp_t gfp);
116 static void hfi1_init_trdma_req(struct rvt_qp *qp,
117 				struct tid_rdma_request *req);
118 static void hfi1_tid_write_alloc_resources(struct rvt_qp *qp, bool intr_ctx);
119 static void hfi1_tid_timeout(struct timer_list *t);
120 static void hfi1_add_tid_reap_timer(struct rvt_qp *qp);
121 static void hfi1_mod_tid_reap_timer(struct rvt_qp *qp);
122 static void hfi1_mod_tid_retry_timer(struct rvt_qp *qp);
123 static int hfi1_stop_tid_retry_timer(struct rvt_qp *qp);
124 static void hfi1_tid_retry_timeout(struct timer_list *t);
125 static int make_tid_rdma_ack(struct rvt_qp *qp,
126 			     struct ib_other_headers *ohdr,
127 			     struct hfi1_pkt_state *ps);
128 static void hfi1_do_tid_send(struct rvt_qp *qp);
129 static u32 read_r_next_psn(struct hfi1_devdata *dd, u8 ctxt, u8 fidx);
130 static void tid_rdma_rcv_err(struct hfi1_packet *packet,
131 			     struct ib_other_headers *ohdr,
132 			     struct rvt_qp *qp, u32 psn, int diff, bool fecn);
133 static void update_r_next_psn_fecn(struct hfi1_packet *packet,
134 				   struct hfi1_qp_priv *priv,
135 				   struct hfi1_ctxtdata *rcd,
136 				   struct tid_rdma_flow *flow,
137 				   bool fecn);
138 
139 static u64 tid_rdma_opfn_encode(struct tid_rdma_params *p)
140 {
141 	return
142 		(((u64)p->qp & TID_OPFN_QP_CTXT_MASK) <<
143 			TID_OPFN_QP_CTXT_SHIFT) |
144 		((((u64)p->qp >> 16) & TID_OPFN_QP_KDETH_MASK) <<
145 			TID_OPFN_QP_KDETH_SHIFT) |
146 		(((u64)((p->max_len >> PAGE_SHIFT) - 1) &
147 			TID_OPFN_MAX_LEN_MASK) << TID_OPFN_MAX_LEN_SHIFT) |
148 		(((u64)p->timeout & TID_OPFN_TIMEOUT_MASK) <<
149 			TID_OPFN_TIMEOUT_SHIFT) |
150 		(((u64)p->urg & TID_OPFN_URG_MASK) << TID_OPFN_URG_SHIFT) |
151 		(((u64)p->jkey & TID_OPFN_JKEY_MASK) << TID_OPFN_JKEY_SHIFT) |
152 		(((u64)p->max_read & TID_OPFN_MAX_READ_MASK) <<
153 			TID_OPFN_MAX_READ_SHIFT) |
154 		(((u64)p->max_write & TID_OPFN_MAX_WRITE_MASK) <<
155 			TID_OPFN_MAX_WRITE_SHIFT);
156 }
157 
158 static void tid_rdma_opfn_decode(struct tid_rdma_params *p, u64 data)
159 {
160 	p->max_len = (((data >> TID_OPFN_MAX_LEN_SHIFT) &
161 		TID_OPFN_MAX_LEN_MASK) + 1) << PAGE_SHIFT;
162 	p->jkey = (data >> TID_OPFN_JKEY_SHIFT) & TID_OPFN_JKEY_MASK;
163 	p->max_write = (data >> TID_OPFN_MAX_WRITE_SHIFT) &
164 		TID_OPFN_MAX_WRITE_MASK;
165 	p->max_read = (data >> TID_OPFN_MAX_READ_SHIFT) &
166 		TID_OPFN_MAX_READ_MASK;
167 	p->qp =
168 		((((data >> TID_OPFN_QP_KDETH_SHIFT) & TID_OPFN_QP_KDETH_MASK)
169 			<< 16) |
170 		((data >> TID_OPFN_QP_CTXT_SHIFT) & TID_OPFN_QP_CTXT_MASK));
171 	p->urg = (data >> TID_OPFN_URG_SHIFT) & TID_OPFN_URG_MASK;
172 	p->timeout = (data >> TID_OPFN_TIMEOUT_SHIFT) & TID_OPFN_TIMEOUT_MASK;
173 }
174 
175 void tid_rdma_opfn_init(struct rvt_qp *qp, struct tid_rdma_params *p)
176 {
177 	struct hfi1_qp_priv *priv = qp->priv;
178 
179 	p->qp = (kdeth_qp << 16) | priv->rcd->ctxt;
180 	p->max_len = TID_RDMA_MAX_SEGMENT_SIZE;
181 	p->jkey = priv->rcd->jkey;
182 	p->max_read = TID_RDMA_MAX_READ_SEGS_PER_REQ;
183 	p->max_write = TID_RDMA_MAX_WRITE_SEGS_PER_REQ;
184 	p->timeout = qp->timeout;
185 	p->urg = is_urg_masked(priv->rcd);
186 }
187 
188 bool tid_rdma_conn_req(struct rvt_qp *qp, u64 *data)
189 {
190 	struct hfi1_qp_priv *priv = qp->priv;
191 
192 	*data = tid_rdma_opfn_encode(&priv->tid_rdma.local);
193 	return true;
194 }
195 
196 bool tid_rdma_conn_reply(struct rvt_qp *qp, u64 data)
197 {
198 	struct hfi1_qp_priv *priv = qp->priv;
199 	struct tid_rdma_params *remote, *old;
200 	bool ret = true;
201 
202 	old = rcu_dereference_protected(priv->tid_rdma.remote,
203 					lockdep_is_held(&priv->opfn.lock));
204 	data &= ~0xfULL;
205 	/*
206 	 * If data passed in is zero, return true so as not to continue the
207 	 * negotiation process
208 	 */
209 	if (!data || !HFI1_CAP_IS_KSET(TID_RDMA))
210 		goto null;
211 	/*
212 	 * If kzalloc fails, return false. This will result in:
213 	 * * at the requester a new OPFN request being generated to retry
214 	 *   the negotiation
215 	 * * at the responder, 0 being returned to the requester so as to
216 	 *   disable TID RDMA at both the requester and the responder
217 	 */
218 	remote = kzalloc(sizeof(*remote), GFP_ATOMIC);
219 	if (!remote) {
220 		ret = false;
221 		goto null;
222 	}
223 
224 	tid_rdma_opfn_decode(remote, data);
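	/*
	 * remote->timeout is the IB ACK timeout exponent (4.096 usec << timeout).
	 * The expression below converts 4096 ns << timeout to microseconds and
	 * then scales it by 8 * 7 = 56 for the TID reap timer period.
	 */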
225 	priv->tid_timer_timeout_jiffies =
226 		usecs_to_jiffies((((4096UL * (1UL << remote->timeout)) /
227 				   1000UL) << 3) * 7);
228 	trace_hfi1_opfn_param(qp, 0, &priv->tid_rdma.local);
229 	trace_hfi1_opfn_param(qp, 1, remote);
230 	rcu_assign_pointer(priv->tid_rdma.remote, remote);
231 	/*
232 	 * A TID RDMA READ request's segment size is not equal to
233 	 * remote->max_len only when the request's data length is smaller
234 	 * than remote->max_len. In that case, there will be only one segment.
235 	 * Therefore, when priv->pkts_ps is used to calculate req->cur_seg
236 	 * during retry, it will lead to req->cur_seg = 0, which is exactly
237 	 * what is expected.
238 	 */
239 	priv->pkts_ps = (u16)rvt_div_mtu(qp, remote->max_len);
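	/* ilog2(pkts_ps - 1) + 1 is ceil(log2(pkts_ps)) for pkts_ps > 1 */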
240 	priv->timeout_shift = ilog2(priv->pkts_ps - 1) + 1;
241 	goto free;
242 null:
243 	RCU_INIT_POINTER(priv->tid_rdma.remote, NULL);
244 	priv->timeout_shift = 0;
245 free:
246 	if (old)
247 		kfree_rcu(old, rcu_head);
248 	return ret;
249 }
250 
251 bool tid_rdma_conn_resp(struct rvt_qp *qp, u64 *data)
252 {
253 	bool ret;
254 
255 	ret = tid_rdma_conn_reply(qp, *data);
256 	*data = 0;
257 	/*
258 	 * If tid_rdma_conn_reply() returns error, set *data as 0 to indicate
259 	 * TID RDMA could not be enabled. This will result in TID RDMA being
260 	 * disabled at the requester too.
261 	 */
262 	if (ret)
263 		(void)tid_rdma_conn_req(qp, data);
264 	return ret;
265 }
266 
267 void tid_rdma_conn_error(struct rvt_qp *qp)
268 {
269 	struct hfi1_qp_priv *priv = qp->priv;
270 	struct tid_rdma_params *old;
271 
272 	old = rcu_dereference_protected(priv->tid_rdma.remote,
273 					lockdep_is_held(&priv->opfn.lock));
274 	RCU_INIT_POINTER(priv->tid_rdma.remote, NULL);
275 	if (old)
276 		kfree_rcu(old, rcu_head);
277 }
278 
279 /* This is called at context initialization time */
280 int hfi1_kern_exp_rcv_init(struct hfi1_ctxtdata *rcd, int reinit)
281 {
282 	if (reinit)
283 		return 0;
284 
285 	BUILD_BUG_ON(TID_RDMA_JKEY < HFI1_KERNEL_MIN_JKEY);
286 	BUILD_BUG_ON(TID_RDMA_JKEY > HFI1_KERNEL_MAX_JKEY);
287 	rcd->jkey = TID_RDMA_JKEY;
288 	hfi1_set_ctxt_jkey(rcd->dd, rcd, rcd->jkey);
289 	return hfi1_alloc_ctxt_rcv_groups(rcd);
290 }
291 
292 /**
293  * qp_to_rcd - determine the receive context used by a qp
 * @rdi - the rdmavt dev info
 * @qp - the qp
295  *
296  * This routine returns the receive context associated
 * with a qp's qpn.
298  *
299  * Returns the context.
300  */
301 static struct hfi1_ctxtdata *qp_to_rcd(struct rvt_dev_info *rdi,
302 				       struct rvt_qp *qp)
303 {
304 	struct hfi1_ibdev *verbs_dev = container_of(rdi,
305 						    struct hfi1_ibdev,
306 						    rdi);
307 	struct hfi1_devdata *dd = container_of(verbs_dev,
308 					       struct hfi1_devdata,
309 					       verbs_dev);
310 	unsigned int ctxt;
311 
312 	if (qp->ibqp.qp_num == 0)
313 		ctxt = 0;
314 	else
315 		ctxt = ((qp->ibqp.qp_num >> dd->qos_shift) %
316 			(dd->n_krcv_queues - 1)) + 1;
317 
318 	return dd->rcd[ctxt];
319 }
320 
321 int hfi1_qp_priv_init(struct rvt_dev_info *rdi, struct rvt_qp *qp,
322 		      struct ib_qp_init_attr *init_attr)
323 {
324 	struct hfi1_qp_priv *qpriv = qp->priv;
325 	int i, ret;
326 
327 	qpriv->rcd = qp_to_rcd(rdi, qp);
328 
329 	spin_lock_init(&qpriv->opfn.lock);
330 	INIT_WORK(&qpriv->opfn.opfn_work, opfn_send_conn_request);
331 	INIT_WORK(&qpriv->tid_rdma.trigger_work, tid_rdma_trigger_resume);
332 	qpriv->flow_state.psn = 0;
333 	qpriv->flow_state.index = RXE_NUM_TID_FLOWS;
334 	qpriv->flow_state.last_index = RXE_NUM_TID_FLOWS;
335 	qpriv->flow_state.generation = KERN_GENERATION_RESERVED;
336 	qpriv->s_state = TID_OP(WRITE_RESP);
337 	qpriv->s_tid_cur = HFI1_QP_WQE_INVALID;
338 	qpriv->s_tid_head = HFI1_QP_WQE_INVALID;
339 	qpriv->s_tid_tail = HFI1_QP_WQE_INVALID;
340 	qpriv->rnr_nak_state = TID_RNR_NAK_INIT;
341 	qpriv->r_tid_head = HFI1_QP_WQE_INVALID;
342 	qpriv->r_tid_tail = HFI1_QP_WQE_INVALID;
343 	qpriv->r_tid_ack = HFI1_QP_WQE_INVALID;
344 	qpriv->r_tid_alloc = HFI1_QP_WQE_INVALID;
345 	atomic_set(&qpriv->n_requests, 0);
346 	atomic_set(&qpriv->n_tid_requests, 0);
347 	timer_setup(&qpriv->s_tid_timer, hfi1_tid_timeout, 0);
348 	timer_setup(&qpriv->s_tid_retry_timer, hfi1_tid_retry_timeout, 0);
349 	INIT_LIST_HEAD(&qpriv->tid_wait);
350 
351 	if (init_attr->qp_type == IB_QPT_RC && HFI1_CAP_IS_KSET(TID_RDMA)) {
352 		struct hfi1_devdata *dd = qpriv->rcd->dd;
353 
354 		qpriv->pages = kzalloc_node(TID_RDMA_MAX_PAGES *
355 						sizeof(*qpriv->pages),
356 					    GFP_KERNEL, dd->node);
357 		if (!qpriv->pages)
358 			return -ENOMEM;
359 		for (i = 0; i < qp->s_size; i++) {
360 			struct hfi1_swqe_priv *priv;
361 			struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, i);
362 
363 			priv = kzalloc_node(sizeof(*priv), GFP_KERNEL,
364 					    dd->node);
365 			if (!priv)
366 				return -ENOMEM;
367 
368 			hfi1_init_trdma_req(qp, &priv->tid_req);
369 			priv->tid_req.e.swqe = wqe;
370 			wqe->priv = priv;
371 		}
372 		for (i = 0; i < rvt_max_atomic(rdi); i++) {
373 			struct hfi1_ack_priv *priv;
374 
375 			priv = kzalloc_node(sizeof(*priv), GFP_KERNEL,
376 					    dd->node);
377 			if (!priv)
378 				return -ENOMEM;
379 
380 			hfi1_init_trdma_req(qp, &priv->tid_req);
381 			priv->tid_req.e.ack = &qp->s_ack_queue[i];
382 
383 			ret = hfi1_kern_exp_rcv_alloc_flows(&priv->tid_req,
384 							    GFP_KERNEL);
385 			if (ret) {
386 				kfree(priv);
387 				return ret;
388 			}
389 			qp->s_ack_queue[i].priv = priv;
390 		}
391 	}
392 
393 	return 0;
394 }
395 
396 void hfi1_qp_priv_tid_free(struct rvt_dev_info *rdi, struct rvt_qp *qp)
397 {
398 	struct hfi1_qp_priv *qpriv = qp->priv;
399 	struct rvt_swqe *wqe;
400 	u32 i;
401 
402 	if (qp->ibqp.qp_type == IB_QPT_RC && HFI1_CAP_IS_KSET(TID_RDMA)) {
403 		for (i = 0; i < qp->s_size; i++) {
404 			wqe = rvt_get_swqe_ptr(qp, i);
405 			kfree(wqe->priv);
406 			wqe->priv = NULL;
407 		}
408 		for (i = 0; i < rvt_max_atomic(rdi); i++) {
409 			struct hfi1_ack_priv *priv = qp->s_ack_queue[i].priv;
410 
411 			if (priv)
412 				hfi1_kern_exp_rcv_free_flows(&priv->tid_req);
413 			kfree(priv);
414 			qp->s_ack_queue[i].priv = NULL;
415 		}
416 		cancel_work_sync(&qpriv->opfn.opfn_work);
417 		kfree(qpriv->pages);
418 		qpriv->pages = NULL;
419 	}
420 }
421 
422 /* Flow and tid waiter functions */
423 /**
424  * DOC: lock ordering
425  *
426  * There are two locks involved with the queuing
427  * routines: the qp s_lock and the exp_lock.
428  *
429  * Since the tid space allocation is called from
430  * the send engine, the qp s_lock is already held.
431  *
432  * The allocation routines will get the exp_lock.
433  *
434  * The first_qp() call is provided to allow the head of
435  * the rcd wait queue to be fetched under the exp_lock and
436  * followed by a drop of the exp_lock.
437  *
438  * Any qp in the wait list will have the qp reference count held
439  * to hold the qp in memory.
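 *
 * A typical allocation sequence is therefore: with the qp s_lock held, take
 * the exp_lock, attempt the allocation, fetch the next waiter with
 * first_qp(), drop the exp_lock, and wake that waiter outside the exp_lock
 * via tid_rdma_schedule_tid_wakeup().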
440  */
441 
442 /*
443  * return head of rcd wait list
444  *
445  * Must hold the exp_lock.
446  *
447  * Get a reference to the QP to hold the QP in memory.
448  *
449  * The caller must release the reference when the local
 * The caller must release the reference when the
 * local pointer is no longer being used.
452 static struct rvt_qp *first_qp(struct hfi1_ctxtdata *rcd,
453 			       struct tid_queue *queue)
454 	__must_hold(&rcd->exp_lock)
455 {
456 	struct hfi1_qp_priv *priv;
457 
458 	lockdep_assert_held(&rcd->exp_lock);
459 	priv = list_first_entry_or_null(&queue->queue_head,
460 					struct hfi1_qp_priv,
461 					tid_wait);
462 	if (!priv)
463 		return NULL;
464 	rvt_get_qp(priv->owner);
465 	return priv->owner;
466 }
467 
468 /**
469  * kernel_tid_waiters - determine rcd wait
470  * @rcd: the receive context
471  * @qp: the head of the qp being processed
472  *
473  * This routine will return false IFF
 * the list is empty or the head of the
475  * list is the indicated qp.
476  *
477  * Must hold the qp s_lock and the exp_lock.
478  *
479  * Return:
 * false if either of the conditions below is satisfied:
481  * 1. The list is empty or
482  * 2. The indicated qp is at the head of the list and the
483  *    HFI1_S_WAIT_TID_SPACE bit is set in qp->s_flags.
484  * true is returned otherwise.
485  */
486 static bool kernel_tid_waiters(struct hfi1_ctxtdata *rcd,
487 			       struct tid_queue *queue, struct rvt_qp *qp)
488 	__must_hold(&rcd->exp_lock) __must_hold(&qp->s_lock)
489 {
490 	struct rvt_qp *fqp;
491 	bool ret = true;
492 
493 	lockdep_assert_held(&qp->s_lock);
494 	lockdep_assert_held(&rcd->exp_lock);
495 	fqp = first_qp(rcd, queue);
496 	if (!fqp || (fqp == qp && (qp->s_flags & HFI1_S_WAIT_TID_SPACE)))
497 		ret = false;
498 	rvt_put_qp(fqp);
499 	return ret;
500 }
501 
502 /**
503  * dequeue_tid_waiter - dequeue the qp from the list
 * @qp - the qp to remove from the wait list
505  *
506  * This routine removes the indicated qp from the
507  * wait list if it is there.
508  *
509  * This should be done after the hardware flow and
510  * tid array resources have been allocated.
511  *
512  * Must hold the qp s_lock and the rcd exp_lock.
513  *
514  * It assumes the s_lock to protect the s_flags
515  * field and to reliably test the HFI1_S_WAIT_TID_SPACE flag.
516  */
517 static void dequeue_tid_waiter(struct hfi1_ctxtdata *rcd,
518 			       struct tid_queue *queue, struct rvt_qp *qp)
519 	__must_hold(&rcd->exp_lock) __must_hold(&qp->s_lock)
520 {
521 	struct hfi1_qp_priv *priv = qp->priv;
522 
523 	lockdep_assert_held(&qp->s_lock);
524 	lockdep_assert_held(&rcd->exp_lock);
525 	if (list_empty(&priv->tid_wait))
526 		return;
527 	list_del_init(&priv->tid_wait);
528 	qp->s_flags &= ~HFI1_S_WAIT_TID_SPACE;
529 	queue->dequeue++;
530 	rvt_put_qp(qp);
531 }
532 
533 /**
534  * queue_qp_for_tid_wait - suspend QP on tid space
535  * @rcd: the receive context
536  * @qp: the qp
537  *
538  * The qp is inserted at the tail of the rcd
539  * wait queue and the HFI1_S_WAIT_TID_SPACE s_flag is set.
540  *
541  * Must hold the qp s_lock and the exp_lock.
542  */
543 static void queue_qp_for_tid_wait(struct hfi1_ctxtdata *rcd,
544 				  struct tid_queue *queue, struct rvt_qp *qp)
545 	__must_hold(&rcd->exp_lock) __must_hold(&qp->s_lock)
546 {
547 	struct hfi1_qp_priv *priv = qp->priv;
548 
549 	lockdep_assert_held(&qp->s_lock);
550 	lockdep_assert_held(&rcd->exp_lock);
551 	if (list_empty(&priv->tid_wait)) {
552 		qp->s_flags |= HFI1_S_WAIT_TID_SPACE;
553 		list_add_tail(&priv->tid_wait, &queue->queue_head);
554 		priv->tid_enqueue = ++queue->enqueue;
555 		rcd->dd->verbs_dev.n_tidwait++;
556 		trace_hfi1_qpsleep(qp, HFI1_S_WAIT_TID_SPACE);
557 		rvt_get_qp(qp);
558 	}
559 }
560 
561 /**
562  * __trigger_tid_waiter - trigger tid waiter
563  * @qp: the qp
564  *
565  * This is a private entrance to schedule the qp
566  * assuming the caller is holding the qp->s_lock.
567  */
568 static void __trigger_tid_waiter(struct rvt_qp *qp)
569 	__must_hold(&qp->s_lock)
570 {
571 	lockdep_assert_held(&qp->s_lock);
572 	if (!(qp->s_flags & HFI1_S_WAIT_TID_SPACE))
573 		return;
574 	trace_hfi1_qpwakeup(qp, HFI1_S_WAIT_TID_SPACE);
575 	hfi1_schedule_send(qp);
576 }
577 
578 /**
579  * tid_rdma_schedule_tid_wakeup - schedule wakeup for a qp
580  * @qp - the qp
581  *
 * Trigger a schedule for a waiting qp in a
 * deadlock-safe manner. The qp reference is held prior
584  * to this call via first_qp().
585  *
586  * If the qp trigger was already scheduled (!rval)
 * the reference is dropped, otherwise the resume
588  * or the destroy cancel will dispatch the reference.
589  */
590 static void tid_rdma_schedule_tid_wakeup(struct rvt_qp *qp)
591 {
592 	struct hfi1_qp_priv *priv;
593 	struct hfi1_ibport *ibp;
594 	struct hfi1_pportdata *ppd;
595 	struct hfi1_devdata *dd;
596 	bool rval;
597 
598 	if (!qp)
599 		return;
600 
601 	priv = qp->priv;
602 	ibp = to_iport(qp->ibqp.device, qp->port_num);
603 	ppd = ppd_from_ibp(ibp);
604 	dd = dd_from_ibdev(qp->ibqp.device);
605 
606 	rval = queue_work_on(priv->s_sde ?
607 			     priv->s_sde->cpu :
608 			     cpumask_first(cpumask_of_node(dd->node)),
609 			     ppd->hfi1_wq,
610 			     &priv->tid_rdma.trigger_work);
611 	if (!rval)
612 		rvt_put_qp(qp);
613 }
614 
615 /**
616  * tid_rdma_trigger_resume - field a trigger work request
617  * @work - the work item
618  *
619  * Complete the off qp trigger processing by directly
620  * calling the progress routine.
621  */
622 static void tid_rdma_trigger_resume(struct work_struct *work)
623 {
624 	struct tid_rdma_qp_params *tr;
625 	struct hfi1_qp_priv *priv;
626 	struct rvt_qp *qp;
627 
628 	tr = container_of(work, struct tid_rdma_qp_params, trigger_work);
629 	priv = container_of(tr, struct hfi1_qp_priv, tid_rdma);
630 	qp = priv->owner;
631 	spin_lock_irq(&qp->s_lock);
632 	if (qp->s_flags & HFI1_S_WAIT_TID_SPACE) {
633 		spin_unlock_irq(&qp->s_lock);
634 		hfi1_do_send(priv->owner, true);
635 	} else {
636 		spin_unlock_irq(&qp->s_lock);
637 	}
638 	rvt_put_qp(qp);
639 }
640 
641 /**
642  * tid_rdma_flush_wait - unwind any tid space wait
643  *
644  * This is called when resetting a qp to
645  * allow a destroy or reset to get rid
646  * of any tid space linkage and reference counts.
647  */
648 static void _tid_rdma_flush_wait(struct rvt_qp *qp, struct tid_queue *queue)
649 	__must_hold(&qp->s_lock)
650 {
651 	struct hfi1_qp_priv *priv;
652 
653 	if (!qp)
654 		return;
655 	lockdep_assert_held(&qp->s_lock);
656 	priv = qp->priv;
657 	qp->s_flags &= ~HFI1_S_WAIT_TID_SPACE;
658 	spin_lock(&priv->rcd->exp_lock);
659 	if (!list_empty(&priv->tid_wait)) {
660 		list_del_init(&priv->tid_wait);
661 		qp->s_flags &= ~HFI1_S_WAIT_TID_SPACE;
662 		queue->dequeue++;
663 		rvt_put_qp(qp);
664 	}
665 	spin_unlock(&priv->rcd->exp_lock);
666 }
667 
668 void hfi1_tid_rdma_flush_wait(struct rvt_qp *qp)
669 	__must_hold(&qp->s_lock)
670 {
671 	struct hfi1_qp_priv *priv = qp->priv;
672 
673 	_tid_rdma_flush_wait(qp, &priv->rcd->flow_queue);
674 	_tid_rdma_flush_wait(qp, &priv->rcd->rarr_queue);
675 }
676 
677 /* Flow functions */
678 /**
679  * kern_reserve_flow - allocate a hardware flow
680  * @rcd - the context to use for allocation
681  * @last - the index of the preferred flow. Use RXE_NUM_TID_FLOWS to
682  *         signify "don't care".
683  *
684  * Use a bit mask based allocation to reserve a hardware
685  * flow for use in receiving KDETH data packets. If a preferred flow is
686  * specified the function will attempt to reserve that flow again, if
687  * available.
688  *
689  * The exp_lock must be held.
690  *
691  * Return:
 * On success: a value between 0 and RXE_NUM_TID_FLOWS - 1
693  * On failure: -EAGAIN
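 *
 * For example, if @last is 3 and flow 3 is still free in rcd->flow_mask it
 * is reserved again; otherwise the lowest numbered free flow is chosen.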
694  */
695 static int kern_reserve_flow(struct hfi1_ctxtdata *rcd, int last)
696 	__must_hold(&rcd->exp_lock)
697 {
698 	int nr;
699 
700 	/* Attempt to reserve the preferred flow index */
701 	if (last >= 0 && last < RXE_NUM_TID_FLOWS &&
702 	    !test_and_set_bit(last, &rcd->flow_mask))
703 		return last;
704 
705 	nr = ffz(rcd->flow_mask);
706 	BUILD_BUG_ON(RXE_NUM_TID_FLOWS >=
707 		     (sizeof(rcd->flow_mask) * BITS_PER_BYTE));
708 	if (nr > (RXE_NUM_TID_FLOWS - 1))
709 		return -EAGAIN;
710 	set_bit(nr, &rcd->flow_mask);
711 	return nr;
712 }
713 
714 static void kern_set_hw_flow(struct hfi1_ctxtdata *rcd, u32 generation,
715 			     u32 flow_idx)
716 {
717 	u64 reg;
718 
719 	reg = ((u64)generation << HFI1_KDETH_BTH_SEQ_SHIFT) |
720 		RCV_TID_FLOW_TABLE_CTRL_FLOW_VALID_SMASK |
721 		RCV_TID_FLOW_TABLE_CTRL_KEEP_AFTER_SEQ_ERR_SMASK |
722 		RCV_TID_FLOW_TABLE_CTRL_KEEP_ON_GEN_ERR_SMASK |
723 		RCV_TID_FLOW_TABLE_STATUS_SEQ_MISMATCH_SMASK |
724 		RCV_TID_FLOW_TABLE_STATUS_GEN_MISMATCH_SMASK;
725 
726 	if (generation != KERN_GENERATION_RESERVED)
727 		reg |= RCV_TID_FLOW_TABLE_CTRL_HDR_SUPP_EN_SMASK;
728 
729 	write_uctxt_csr(rcd->dd, rcd->ctxt,
730 			RCV_TID_FLOW_TABLE + 8 * flow_idx, reg);
731 }
732 
733 static u32 kern_setup_hw_flow(struct hfi1_ctxtdata *rcd, u32 flow_idx)
734 	__must_hold(&rcd->exp_lock)
735 {
736 	u32 generation = rcd->flows[flow_idx].generation;
737 
738 	kern_set_hw_flow(rcd, generation, flow_idx);
739 	return generation;
740 }
741 
742 static u32 kern_flow_generation_next(u32 gen)
743 {
744 	u32 generation = mask_generation(gen + 1);
745 
746 	if (generation == KERN_GENERATION_RESERVED)
747 		generation = mask_generation(generation + 1);
748 	return generation;
749 }
750 
751 static void kern_clear_hw_flow(struct hfi1_ctxtdata *rcd, u32 flow_idx)
752 	__must_hold(&rcd->exp_lock)
753 {
754 	rcd->flows[flow_idx].generation =
755 		kern_flow_generation_next(rcd->flows[flow_idx].generation);
756 	kern_set_hw_flow(rcd, KERN_GENERATION_RESERVED, flow_idx);
757 }
758 
759 int hfi1_kern_setup_hw_flow(struct hfi1_ctxtdata *rcd, struct rvt_qp *qp)
760 {
761 	struct hfi1_qp_priv *qpriv = (struct hfi1_qp_priv *)qp->priv;
762 	struct tid_flow_state *fs = &qpriv->flow_state;
763 	struct rvt_qp *fqp;
764 	unsigned long flags;
765 	int ret = 0;
766 
767 	/* The QP already has an allocated flow */
768 	if (fs->index != RXE_NUM_TID_FLOWS)
769 		return ret;
770 
771 	spin_lock_irqsave(&rcd->exp_lock, flags);
772 	if (kernel_tid_waiters(rcd, &rcd->flow_queue, qp))
773 		goto queue;
774 
775 	ret = kern_reserve_flow(rcd, fs->last_index);
776 	if (ret < 0)
777 		goto queue;
778 	fs->index = ret;
779 	fs->last_index = fs->index;
780 
781 	/* Generation received in a RESYNC overrides default flow generation */
782 	if (fs->generation != KERN_GENERATION_RESERVED)
783 		rcd->flows[fs->index].generation = fs->generation;
784 	fs->generation = kern_setup_hw_flow(rcd, fs->index);
785 	fs->psn = 0;
786 	dequeue_tid_waiter(rcd, &rcd->flow_queue, qp);
787 	/* get head before dropping lock */
788 	fqp = first_qp(rcd, &rcd->flow_queue);
789 	spin_unlock_irqrestore(&rcd->exp_lock, flags);
790 
791 	tid_rdma_schedule_tid_wakeup(fqp);
792 	return 0;
793 queue:
794 	queue_qp_for_tid_wait(rcd, &rcd->flow_queue, qp);
795 	spin_unlock_irqrestore(&rcd->exp_lock, flags);
796 	return -EAGAIN;
797 }
798 
799 void hfi1_kern_clear_hw_flow(struct hfi1_ctxtdata *rcd, struct rvt_qp *qp)
800 {
801 	struct hfi1_qp_priv *qpriv = (struct hfi1_qp_priv *)qp->priv;
802 	struct tid_flow_state *fs = &qpriv->flow_state;
803 	struct rvt_qp *fqp;
804 	unsigned long flags;
805 
806 	if (fs->index >= RXE_NUM_TID_FLOWS)
807 		return;
808 	spin_lock_irqsave(&rcd->exp_lock, flags);
809 	kern_clear_hw_flow(rcd, fs->index);
810 	clear_bit(fs->index, &rcd->flow_mask);
811 	fs->index = RXE_NUM_TID_FLOWS;
812 	fs->psn = 0;
813 	fs->generation = KERN_GENERATION_RESERVED;
814 
815 	/* get head before dropping lock */
816 	fqp = first_qp(rcd, &rcd->flow_queue);
817 	spin_unlock_irqrestore(&rcd->exp_lock, flags);
818 
819 	if (fqp == qp) {
820 		__trigger_tid_waiter(fqp);
821 		rvt_put_qp(fqp);
822 	} else {
823 		tid_rdma_schedule_tid_wakeup(fqp);
824 	}
825 }
826 
827 void hfi1_kern_init_ctxt_generations(struct hfi1_ctxtdata *rcd)
828 {
829 	int i;
830 
831 	for (i = 0; i < RXE_NUM_TID_FLOWS; i++) {
832 		rcd->flows[i].generation = mask_generation(prandom_u32());
833 		kern_set_hw_flow(rcd, KERN_GENERATION_RESERVED, i);
834 	}
835 }
836 
837 /* TID allocation functions */
838 static u8 trdma_pset_order(struct tid_rdma_pageset *s)
839 {
840 	u8 count = s->count;
841 
842 	return ilog2(count) + 1;
843 }
844 
845 /**
 * tid_rdma_find_phys_blocks_4k - get groups based on mr info
 * @flow - the TID RDMA flow the pages belong to
 * @pages - pointer to an array of page structs
 * @npages - number of pages
 * @list - page set array to return
850  *
851  * This routine returns the number of groups associated with
852  * the current sge information.  This implementation is based
853  * on the expected receive find_phys_blocks() adjusted to
854  * use the MR information vs. the pfn.
855  *
856  * Return:
857  * the number of RcvArray entries
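 *
 * For example, 16 physically contiguous 4K pages yield a single 64K pageset,
 * while a page that breaks contiguity starts a new pageset of its own.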
858  */
859 static u32 tid_rdma_find_phys_blocks_4k(struct tid_rdma_flow *flow,
860 					struct page **pages,
861 					u32 npages,
862 					struct tid_rdma_pageset *list)
863 {
864 	u32 pagecount, pageidx, setcount = 0, i;
865 	void *vaddr, *this_vaddr;
866 
867 	if (!npages)
868 		return 0;
869 
870 	/*
871 	 * Look for sets of physically contiguous pages in the user buffer.
872 	 * This will allow us to optimize Expected RcvArray entry usage by
873 	 * using the bigger supported sizes.
874 	 */
875 	vaddr = page_address(pages[0]);
876 	trace_hfi1_tid_flow_page(flow->req->qp, flow, 0, 0, 0, vaddr);
877 	for (pageidx = 0, pagecount = 1, i = 1; i <= npages; i++) {
878 		this_vaddr = i < npages ? page_address(pages[i]) : NULL;
879 		trace_hfi1_tid_flow_page(flow->req->qp, flow, i, 0, 0,
880 					 this_vaddr);
881 		/*
882 		 * If the vaddr's are not sequential, pages are not physically
883 		 * contiguous.
884 		 */
885 		if (this_vaddr != (vaddr + PAGE_SIZE)) {
886 			/*
887 			 * At this point we have to loop over the set of
			 * physically contiguous pages and break them down into
889 			 * sizes supported by the HW.
890 			 * There are two main constraints:
891 			 *     1. The max buffer size is MAX_EXPECTED_BUFFER.
892 			 *        If the total set size is bigger than that
893 			 *        program only a MAX_EXPECTED_BUFFER chunk.
894 			 *     2. The buffer size has to be a power of two. If
			 * it is not, round down to the closest power of
896 			 *        2 and program that size.
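			 *        For example, a run of 9 contiguous pages is
			 *        split into an 8 page chunk followed by a
			 *        1 page chunk.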
897 			 */
898 			while (pagecount) {
899 				int maxpages = pagecount;
900 				u32 bufsize = pagecount * PAGE_SIZE;
901 
902 				if (bufsize > MAX_EXPECTED_BUFFER)
903 					maxpages =
904 						MAX_EXPECTED_BUFFER >>
905 						PAGE_SHIFT;
906 				else if (!is_power_of_2(bufsize))
907 					maxpages =
908 						rounddown_pow_of_two(bufsize) >>
909 						PAGE_SHIFT;
910 
911 				list[setcount].idx = pageidx;
912 				list[setcount].count = maxpages;
913 				trace_hfi1_tid_pageset(flow->req->qp, setcount,
914 						       list[setcount].idx,
915 						       list[setcount].count);
916 				pagecount -= maxpages;
917 				pageidx += maxpages;
918 				setcount++;
919 			}
920 			pageidx = i;
921 			pagecount = 1;
922 			vaddr = this_vaddr;
923 		} else {
924 			vaddr += PAGE_SIZE;
925 			pagecount++;
926 		}
927 	}
	/* ensure we always return an even number of sets */
929 	if (setcount & 1)
930 		list[setcount++].count = 0;
931 	return setcount;
932 }
933 
934 /**
935  * tid_flush_pages - dump out pages into pagesets
936  * @list - list of pagesets
937  * @idx - pointer to current page index
938  * @pages - number of pages to dump
 * @sets - current number of pagesets
940  *
 * This routine flushes out accumulated pages.
 *
 * To ensure an even number of sets the
 * code may add a filler.
 *
 * This can happen when pages is not
947  * a power of 2 or pages is a power of 2
948  * less than the maximum pages.
949  *
950  * Return:
951  * The new number of sets
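 *
 * For example, flushing 5 pages produces a 4 page set and a 1 page set;
 * since that is an even number of sets, no filler is needed.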
952  */
953 
954 static u32 tid_flush_pages(struct tid_rdma_pageset *list,
955 			   u32 *idx, u32 pages, u32 sets)
956 {
957 	while (pages) {
958 		u32 maxpages = pages;
959 
960 		if (maxpages > MAX_EXPECTED_PAGES)
961 			maxpages = MAX_EXPECTED_PAGES;
962 		else if (!is_power_of_2(maxpages))
963 			maxpages = rounddown_pow_of_two(maxpages);
964 		list[sets].idx = *idx;
965 		list[sets++].count = maxpages;
966 		*idx += maxpages;
967 		pages -= maxpages;
968 	}
969 	/* might need a filler */
970 	if (sets & 1)
971 		list[sets++].count = 0;
972 	return sets;
973 }
974 
975 /**
 * tid_rdma_find_phys_blocks_8k - get groups based on mr info
 * @flow - the TID RDMA flow the pages belong to
977  * @pages - pointer to an array of page structs
978  * @npages - number of pages
979  * @list - page set array to return
980  *
981  * This routine parses an array of pages to compute pagesets
982  * in an 8k compatible way.
983  *
 * Pages are tested two at a time: i and i + 1 for contiguous
 * pages, and then i - 1 and i for contiguous pages.
 *
 * If either test fails, any accumulated pages are flushed and
 * v0, v1 are emitted as separate PAGE_SIZE pagesets.
989  *
990  * Otherwise, the current 8k is totaled for a future flush.
991  *
992  * Return:
993  * The number of pagesets
994  * list set with the returned number of pagesets
995  *
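 * For example, eight physically contiguous pages produce a single 8 page
 * pageset plus a zero-count filler to keep the set count even.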
996  */
997 static u32 tid_rdma_find_phys_blocks_8k(struct tid_rdma_flow *flow,
998 					struct page **pages,
999 					u32 npages,
1000 					struct tid_rdma_pageset *list)
1001 {
1002 	u32 idx, sets = 0, i;
1003 	u32 pagecnt = 0;
1004 	void *v0, *v1, *vm1;
1005 
1006 	if (!npages)
1007 		return 0;
1008 	for (idx = 0, i = 0, vm1 = NULL; i < npages; i += 2) {
1009 		/* get a new v0 */
1010 		v0 = page_address(pages[i]);
1011 		trace_hfi1_tid_flow_page(flow->req->qp, flow, i, 1, 0, v0);
1012 		v1 = i + 1 < npages ?
1013 				page_address(pages[i + 1]) : NULL;
1014 		trace_hfi1_tid_flow_page(flow->req->qp, flow, i, 1, 1, v1);
1015 		/* compare i, i + 1 vaddr */
1016 		if (v1 != (v0 + PAGE_SIZE)) {
1017 			/* flush out pages */
1018 			sets = tid_flush_pages(list, &idx, pagecnt, sets);
1019 			/* output v0,v1 as two pagesets */
1020 			list[sets].idx = idx++;
1021 			list[sets++].count = 1;
1022 			if (v1) {
1023 				list[sets].count = 1;
1024 				list[sets++].idx = idx++;
1025 			} else {
1026 				list[sets++].count = 0;
1027 			}
1028 			vm1 = NULL;
1029 			pagecnt = 0;
1030 			continue;
1031 		}
1032 		/* i,i+1 consecutive, look at i-1,i */
1033 		if (vm1 && v0 != (vm1 + PAGE_SIZE)) {
1034 			/* flush out pages */
1035 			sets = tid_flush_pages(list, &idx, pagecnt, sets);
1036 			pagecnt = 0;
1037 		}
1038 		/* pages will always be a multiple of 8k */
1039 		pagecnt += 2;
1040 		/* save i-1 */
1041 		vm1 = v1;
1042 		/* move to next pair */
1043 	}
1044 	/* dump residual pages at end */
1045 	sets = tid_flush_pages(list, &idx, npages - idx, sets);
1046 	/* by design cannot be odd sets */
1047 	WARN_ON(sets & 1);
1048 	return sets;
1049 }
1050 
1051 /**
1052  * Find pages for one segment of a sge array represented by @ss. The function
1053  * does not check the sge, the sge must have been checked for alignment with a
1054  * prior call to hfi1_kern_trdma_ok. Other sge checking is done as part of
1055  * rvt_lkey_ok and rvt_rkey_ok. Also, the function only modifies the local sge
1056  * copy maintained in @ss->sge, the original sge is not modified.
1057  *
1058  * Unlike IB RDMA WRITE, we can't decrement ss->num_sge here because we are not
1059  * releasing the MR reference count at the same time. Otherwise, we'll "leak"
1060  * references to the MR. This difference requires that we keep track of progress
 * into the sg_list. This is done by the isge cursor in the tid_rdma_request
 * structure.
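 *
 * For example, with a 256K seg_len and 4K pages a single call fills up to 64
 * page pointers into @pages, advancing req->isge as individual sges are
 * consumed.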
1063  */
1064 static u32 kern_find_pages(struct tid_rdma_flow *flow,
1065 			   struct page **pages,
1066 			   struct rvt_sge_state *ss, bool *last)
1067 {
1068 	struct tid_rdma_request *req = flow->req;
1069 	struct rvt_sge *sge = &ss->sge;
1070 	u32 length = flow->req->seg_len;
1071 	u32 len = PAGE_SIZE;
1072 	u32 i = 0;
1073 
1074 	while (length && req->isge < ss->num_sge) {
1075 		pages[i++] = virt_to_page(sge->vaddr);
1076 
1077 		sge->vaddr += len;
1078 		sge->length -= len;
1079 		sge->sge_length -= len;
1080 		if (!sge->sge_length) {
1081 			if (++req->isge < ss->num_sge)
1082 				*sge = ss->sg_list[req->isge - 1];
1083 		} else if (sge->length == 0 && sge->mr->lkey) {
1084 			if (++sge->n >= RVT_SEGSZ) {
1085 				++sge->m;
1086 				sge->n = 0;
1087 			}
1088 			sge->vaddr = sge->mr->map[sge->m]->segs[sge->n].vaddr;
1089 			sge->length = sge->mr->map[sge->m]->segs[sge->n].length;
1090 		}
1091 		length -= len;
1092 	}
1093 
1094 	flow->length = flow->req->seg_len - length;
1095 	*last = req->isge == ss->num_sge ? false : true;
1096 	return i;
1097 }
1098 
1099 static void dma_unmap_flow(struct tid_rdma_flow *flow)
1100 {
1101 	struct hfi1_devdata *dd;
1102 	int i;
1103 	struct tid_rdma_pageset *pset;
1104 
1105 	dd = flow->req->rcd->dd;
1106 	for (i = 0, pset = &flow->pagesets[0]; i < flow->npagesets;
1107 			i++, pset++) {
1108 		if (pset->count && pset->addr) {
1109 			dma_unmap_page(&dd->pcidev->dev,
1110 				       pset->addr,
1111 				       PAGE_SIZE * pset->count,
1112 				       DMA_FROM_DEVICE);
1113 			pset->mapped = 0;
1114 		}
1115 	}
1116 }
1117 
1118 static int dma_map_flow(struct tid_rdma_flow *flow, struct page **pages)
1119 {
1120 	int i;
1121 	struct hfi1_devdata *dd = flow->req->rcd->dd;
1122 	struct tid_rdma_pageset *pset;
1123 
1124 	for (i = 0, pset = &flow->pagesets[0]; i < flow->npagesets;
1125 			i++, pset++) {
1126 		if (pset->count) {
1127 			pset->addr = dma_map_page(&dd->pcidev->dev,
1128 						  pages[pset->idx],
1129 						  0,
1130 						  PAGE_SIZE * pset->count,
1131 						  DMA_FROM_DEVICE);
1132 
1133 			if (dma_mapping_error(&dd->pcidev->dev, pset->addr)) {
1134 				dma_unmap_flow(flow);
1135 				return -ENOMEM;
1136 			}
1137 			pset->mapped = 1;
1138 		}
1139 	}
1140 	return 0;
1141 }
1142 
1143 static inline bool dma_mapped(struct tid_rdma_flow *flow)
1144 {
1145 	return !!flow->pagesets[0].mapped;
1146 }
1147 
1148 /*
1149  * Get pages pointers and identify contiguous physical memory chunks for a
1150  * segment. All segments are of length flow->req->seg_len.
1151  */
1152 static int kern_get_phys_blocks(struct tid_rdma_flow *flow,
1153 				struct page **pages,
1154 				struct rvt_sge_state *ss, bool *last)
1155 {
1156 	u8 npages;
1157 
1158 	/* Reuse previously computed pagesets, if any */
1159 	if (flow->npagesets) {
1160 		trace_hfi1_tid_flow_alloc(flow->req->qp, flow->req->setup_head,
1161 					  flow);
1162 		if (!dma_mapped(flow))
1163 			return dma_map_flow(flow, pages);
1164 		return 0;
1165 	}
1166 
1167 	npages = kern_find_pages(flow, pages, ss, last);
1168 
1169 	if (flow->req->qp->pmtu == enum_to_mtu(OPA_MTU_4096))
1170 		flow->npagesets =
1171 			tid_rdma_find_phys_blocks_4k(flow, pages, npages,
1172 						     flow->pagesets);
1173 	else
1174 		flow->npagesets =
1175 			tid_rdma_find_phys_blocks_8k(flow, pages, npages,
1176 						     flow->pagesets);
1177 
1178 	return dma_map_flow(flow, pages);
1179 }
1180 
1181 static inline void kern_add_tid_node(struct tid_rdma_flow *flow,
1182 				     struct hfi1_ctxtdata *rcd, char *s,
1183 				     struct tid_group *grp, u8 cnt)
1184 {
1185 	struct kern_tid_node *node = &flow->tnode[flow->tnode_cnt++];
1186 
1187 	WARN_ON_ONCE(flow->tnode_cnt >=
1188 		     (TID_RDMA_MAX_SEGMENT_SIZE >> PAGE_SHIFT));
1189 	if (WARN_ON_ONCE(cnt & 1))
1190 		dd_dev_err(rcd->dd,
1191 			   "unexpected odd allocation cnt %u map 0x%x used %u",
1192 			   cnt, grp->map, grp->used);
1193 
1194 	node->grp = grp;
1195 	node->map = grp->map;
1196 	node->cnt = cnt;
1197 	trace_hfi1_tid_node_add(flow->req->qp, s, flow->tnode_cnt - 1,
1198 				grp->base, grp->map, grp->used, cnt);
1199 }
1200 
1201 /*
1202  * Try to allocate pageset_count TID's from TID groups for a context
1203  *
1204  * This function allocates TID's without moving groups between lists or
 * modifying grp->map. This is done as follows, being cognizant of the lists
1206  * between which the TID groups will move:
1207  * 1. First allocate complete groups of 8 TID's since this is more efficient,
1208  *    these groups will move from group->full without affecting used
1209  * 2. If more TID's are needed allocate from used (will move from used->full or
1210  *    stay in used)
1211  * 3. If we still don't have the required number of TID's go back and look again
1212  *    at a complete group (will move from group->used)
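 *
 * For example, with a group size of 8, a request for 18 TID's first takes two
 * complete groups (16 TID's), then tries to cover the remaining 2 from a
 * partially used group, and only then falls back to another complete group.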
1213  */
1214 static int kern_alloc_tids(struct tid_rdma_flow *flow)
1215 {
1216 	struct hfi1_ctxtdata *rcd = flow->req->rcd;
1217 	struct hfi1_devdata *dd = rcd->dd;
1218 	u32 ngroups, pageidx = 0;
1219 	struct tid_group *group = NULL, *used;
1220 	u8 use;
1221 
1222 	flow->tnode_cnt = 0;
1223 	ngroups = flow->npagesets / dd->rcv_entries.group_size;
1224 	if (!ngroups)
1225 		goto used_list;
1226 
1227 	/* First look at complete groups */
1228 	list_for_each_entry(group,  &rcd->tid_group_list.list, list) {
1229 		kern_add_tid_node(flow, rcd, "complete groups", group,
1230 				  group->size);
1231 
1232 		pageidx += group->size;
1233 		if (!--ngroups)
1234 			break;
1235 	}
1236 
1237 	if (pageidx >= flow->npagesets)
1238 		goto ok;
1239 
1240 used_list:
1241 	/* Now look at partially used groups */
1242 	list_for_each_entry(used, &rcd->tid_used_list.list, list) {
1243 		use = min_t(u32, flow->npagesets - pageidx,
1244 			    used->size - used->used);
1245 		kern_add_tid_node(flow, rcd, "used groups", used, use);
1246 
1247 		pageidx += use;
1248 		if (pageidx >= flow->npagesets)
1249 			goto ok;
1250 	}
1251 
1252 	/*
	 * Look again at a complete group, continuing from where we left off.
1254 	 * However, if we are at the head, we have reached the end of the
1255 	 * complete groups list from the first loop above
1256 	 */
1257 	if (group && &group->list == &rcd->tid_group_list.list)
1258 		goto bail_eagain;
1259 	group = list_prepare_entry(group, &rcd->tid_group_list.list,
1260 				   list);
1261 	if (list_is_last(&group->list, &rcd->tid_group_list.list))
1262 		goto bail_eagain;
1263 	group = list_next_entry(group, list);
1264 	use = min_t(u32, flow->npagesets - pageidx, group->size);
1265 	kern_add_tid_node(flow, rcd, "complete continue", group, use);
1266 	pageidx += use;
1267 	if (pageidx >= flow->npagesets)
1268 		goto ok;
1269 bail_eagain:
1270 	trace_hfi1_msg_alloc_tids(flow->req->qp, " insufficient tids: needed ",
1271 				  (u64)flow->npagesets);
1272 	return -EAGAIN;
1273 ok:
1274 	return 0;
1275 }
1276 
1277 static void kern_program_rcv_group(struct tid_rdma_flow *flow, int grp_num,
1278 				   u32 *pset_idx)
1279 {
1280 	struct hfi1_ctxtdata *rcd = flow->req->rcd;
1281 	struct hfi1_devdata *dd = rcd->dd;
1282 	struct kern_tid_node *node = &flow->tnode[grp_num];
1283 	struct tid_group *grp = node->grp;
1284 	struct tid_rdma_pageset *pset;
1285 	u32 pmtu_pg = flow->req->qp->pmtu >> PAGE_SHIFT;
1286 	u32 rcventry, npages = 0, pair = 0, tidctrl;
1287 	u8 i, cnt = 0;
1288 
1289 	for (i = 0; i < grp->size; i++) {
1290 		rcventry = grp->base + i;
1291 
1292 		if (node->map & BIT(i) || cnt >= node->cnt) {
1293 			rcv_array_wc_fill(dd, rcventry);
1294 			continue;
1295 		}
1296 		pset = &flow->pagesets[(*pset_idx)++];
1297 		if (pset->count) {
1298 			hfi1_put_tid(dd, rcventry, PT_EXPECTED,
1299 				     pset->addr, trdma_pset_order(pset));
1300 		} else {
1301 			hfi1_put_tid(dd, rcventry, PT_INVALID, 0, 0);
1302 		}
1303 		npages += pset->count;
1304 
1305 		rcventry -= rcd->expected_base;
1306 		tidctrl = pair ? 0x3 : rcventry & 0x1 ? 0x2 : 0x1;
1307 		/*
		 * A single TID entry will be used for a rcvarray pair (with
		 * tidctrl 0x3) if ALL of these are true: (a) the bit pos is even,
		 * (b) the group map shows the current and the next bits as free,
		 * indicating two consecutive rcvarray entries are available, and
		 * (c) we actually need 2 more entries.
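		 * Otherwise tidctrl is 0x1 for an even (relative) rcventry and
		 * 0x2 for an odd one, so the entry covers a single rcvarray
		 * slot.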
1313 		 */
1314 		pair = !(i & 0x1) && !((node->map >> i) & 0x3) &&
1315 			node->cnt >= cnt + 2;
1316 		if (!pair) {
1317 			if (!pset->count)
1318 				tidctrl = 0x1;
1319 			flow->tid_entry[flow->tidcnt++] =
1320 				EXP_TID_SET(IDX, rcventry >> 1) |
1321 				EXP_TID_SET(CTRL, tidctrl) |
1322 				EXP_TID_SET(LEN, npages);
1323 			trace_hfi1_tid_entry_alloc(/* entry */
1324 			   flow->req->qp, flow->tidcnt - 1,
1325 			   flow->tid_entry[flow->tidcnt - 1]);
1326 
1327 			/* Efficient DIV_ROUND_UP(npages, pmtu_pg) */
1328 			flow->npkts += (npages + pmtu_pg - 1) >> ilog2(pmtu_pg);
1329 			npages = 0;
1330 		}
1331 
1332 		if (grp->used == grp->size - 1)
1333 			tid_group_move(grp, &rcd->tid_used_list,
1334 				       &rcd->tid_full_list);
1335 		else if (!grp->used)
1336 			tid_group_move(grp, &rcd->tid_group_list,
1337 				       &rcd->tid_used_list);
1338 
1339 		grp->used++;
1340 		grp->map |= BIT(i);
1341 		cnt++;
1342 	}
1343 }
1344 
1345 static void kern_unprogram_rcv_group(struct tid_rdma_flow *flow, int grp_num)
1346 {
1347 	struct hfi1_ctxtdata *rcd = flow->req->rcd;
1348 	struct hfi1_devdata *dd = rcd->dd;
1349 	struct kern_tid_node *node = &flow->tnode[grp_num];
1350 	struct tid_group *grp = node->grp;
1351 	u32 rcventry;
1352 	u8 i, cnt = 0;
1353 
1354 	for (i = 0; i < grp->size; i++) {
1355 		rcventry = grp->base + i;
1356 
1357 		if (node->map & BIT(i) || cnt >= node->cnt) {
1358 			rcv_array_wc_fill(dd, rcventry);
1359 			continue;
1360 		}
1361 
1362 		hfi1_put_tid(dd, rcventry, PT_INVALID, 0, 0);
1363 
1364 		grp->used--;
1365 		grp->map &= ~BIT(i);
1366 		cnt++;
1367 
1368 		if (grp->used == grp->size - 1)
1369 			tid_group_move(grp, &rcd->tid_full_list,
1370 				       &rcd->tid_used_list);
1371 		else if (!grp->used)
1372 			tid_group_move(grp, &rcd->tid_used_list,
1373 				       &rcd->tid_group_list);
1374 	}
1375 	if (WARN_ON_ONCE(cnt & 1)) {
1376 		struct hfi1_ctxtdata *rcd = flow->req->rcd;
1377 		struct hfi1_devdata *dd = rcd->dd;
1378 
1379 		dd_dev_err(dd, "unexpected odd free cnt %u map 0x%x used %u",
1380 			   cnt, grp->map, grp->used);
1381 	}
1382 }
1383 
1384 static void kern_program_rcvarray(struct tid_rdma_flow *flow)
1385 {
1386 	u32 pset_idx = 0;
1387 	int i;
1388 
1389 	flow->npkts = 0;
1390 	flow->tidcnt = 0;
1391 	for (i = 0; i < flow->tnode_cnt; i++)
1392 		kern_program_rcv_group(flow, i, &pset_idx);
1393 	trace_hfi1_tid_flow_alloc(flow->req->qp, flow->req->setup_head, flow);
1394 }
1395 
1396 /**
 * hfi1_kern_exp_rcv_setup() - set up TID's and flow for one segment of a
1398  * TID RDMA request
1399  *
1400  * @req: TID RDMA request for which the segment/flow is being set up
1401  * @ss: sge state, maintains state across successive segments of a sge
1402  * @last: set to true after the last sge segment has been processed
1403  *
1404  * This function
1405  * (1) finds a free flow entry in the flow circular buffer
 * (2) finds pages and contiguous physical chunks constituting one segment
1407  *     of an sge
1408  * (3) allocates TID group entries for those chunks
1409  * (4) programs rcvarray entries in the hardware corresponding to those
1410  *     TID's
1411  * (5) computes a tidarray with formatted TID entries which can be sent
1412  *     to the sender
1413  * (6) Reserves and programs HW flows.
 * (7) It also manages queuing the QP when TID/flow resources are not
1415  *     available.
1416  *
1417  * @req points to struct tid_rdma_request of which the segments are a part. The
1418  * function uses qp, rcd and seg_len members of @req. In the absence of errors,
1419  * req->flow_idx is the index of the flow which has been prepared in this
1420  * invocation of function call. With flow = &req->flows[req->flow_idx],
1421  * flow->tid_entry contains the TID array which the sender can use for TID RDMA
1422  * sends and flow->npkts contains number of packets required to send the
1423  * segment.
1424  *
 * hfi1_check_sge_align should be called prior to calling this function; if
 * it signals an error, TID RDMA cannot be used for this sge and this
 * function should not be called.
1428  *
1429  * For the queuing, caller must hold the flow->req->qp s_lock from the send
1430  * engine and the function will procure the exp_lock.
1431  *
1432  * Return:
1433  * The function returns -EAGAIN if sufficient number of TID/flow resources to
1434  * map the segment could not be allocated. In this case the function should be
1435  * called again with previous arguments to retry the TID allocation. There are
1436  * no other error returns. The function returns 0 on success.
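 *
 * A rough caller sketch (not the exact send-engine code; the local variable
 * names are illustrative only):
 *
 *	do {
 *		ret = hfi1_kern_exp_rcv_setup(req, &ss, &last);
 *		if (ret)
 *			break;
 *	} while (!last);
 *
 * where a break on -EAGAIN leaves the QP queued on the tid wait list to be
 * retried once resources free up.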
1437  */
1438 int hfi1_kern_exp_rcv_setup(struct tid_rdma_request *req,
1439 			    struct rvt_sge_state *ss, bool *last)
1440 	__must_hold(&req->qp->s_lock)
1441 {
1442 	struct tid_rdma_flow *flow = &req->flows[req->setup_head];
1443 	struct hfi1_ctxtdata *rcd = req->rcd;
1444 	struct hfi1_qp_priv *qpriv = req->qp->priv;
1445 	unsigned long flags;
1446 	struct rvt_qp *fqp;
1447 	u16 clear_tail = req->clear_tail;
1448 
1449 	lockdep_assert_held(&req->qp->s_lock);
1450 	/*
1451 	 * We return error if either (a) we don't have space in the flow
1452 	 * circular buffer, or (b) we already have max entries in the buffer.
1453 	 * Max entries depend on the type of request we are processing and the
1454 	 * negotiated TID RDMA parameters.
1455 	 */
1456 	if (!CIRC_SPACE(req->setup_head, clear_tail, MAX_FLOWS) ||
1457 	    CIRC_CNT(req->setup_head, clear_tail, MAX_FLOWS) >=
1458 	    req->n_flows)
1459 		return -EINVAL;
1460 
1461 	/*
	 * Get pages and identify contiguous physical memory chunks for the
	 * segment. If we cannot determine a DMA address mapping we will treat
	 * it just as if we ran out of space above.
1465 	 */
1466 	if (kern_get_phys_blocks(flow, qpriv->pages, ss, last)) {
1467 		hfi1_wait_kmem(flow->req->qp);
1468 		return -ENOMEM;
1469 	}
1470 
1471 	spin_lock_irqsave(&rcd->exp_lock, flags);
1472 	if (kernel_tid_waiters(rcd, &rcd->rarr_queue, flow->req->qp))
1473 		goto queue;
1474 
1475 	/*
1476 	 * At this point we know the number of pagesets and hence the number of
1477 	 * TID's to map the segment. Allocate the TID's from the TID groups. If
1478 	 * we cannot allocate the required number we exit and try again later
1479 	 */
1480 	if (kern_alloc_tids(flow))
1481 		goto queue;
1482 	/*
1483 	 * Finally program the TID entries with the pagesets, compute the
1484 	 * tidarray and enable the HW flow
1485 	 */
1486 	kern_program_rcvarray(flow);
1487 
1488 	/*
1489 	 * Setup the flow state with relevant information.
1490 	 * This information is used for tracking the sequence of data packets
1491 	 * for the segment.
1492 	 * The flow is setup here as this is the most accurate time and place
1493 	 * to do so. Doing at a later time runs the risk of the flow data in
1494 	 * qpriv getting out of sync.
1495 	 */
1496 	memset(&flow->flow_state, 0x0, sizeof(flow->flow_state));
1497 	flow->idx = qpriv->flow_state.index;
1498 	flow->flow_state.generation = qpriv->flow_state.generation;
1499 	flow->flow_state.spsn = qpriv->flow_state.psn;
1500 	flow->flow_state.lpsn = flow->flow_state.spsn + flow->npkts - 1;
1501 	flow->flow_state.r_next_psn =
1502 		full_flow_psn(flow, flow->flow_state.spsn);
1503 	qpriv->flow_state.psn += flow->npkts;
1504 
1505 	dequeue_tid_waiter(rcd, &rcd->rarr_queue, flow->req->qp);
1506 	/* get head before dropping lock */
1507 	fqp = first_qp(rcd, &rcd->rarr_queue);
1508 	spin_unlock_irqrestore(&rcd->exp_lock, flags);
1509 	tid_rdma_schedule_tid_wakeup(fqp);
1510 
1511 	req->setup_head = (req->setup_head + 1) & (MAX_FLOWS - 1);
1512 	return 0;
1513 queue:
1514 	queue_qp_for_tid_wait(rcd, &rcd->rarr_queue, flow->req->qp);
1515 	spin_unlock_irqrestore(&rcd->exp_lock, flags);
1516 	return -EAGAIN;
1517 }
1518 
1519 static void hfi1_tid_rdma_reset_flow(struct tid_rdma_flow *flow)
1520 {
1521 	flow->npagesets = 0;
1522 }
1523 
1524 /*
1525  * This function is called after one segment has been successfully sent to
1526  * release the flow and TID HW/SW resources for that segment. The segments for a
 * TID RDMA request are set up and cleared in FIFO order, which is managed
 * using a circular buffer.
1529  */
1530 int hfi1_kern_exp_rcv_clear(struct tid_rdma_request *req)
1531 	__must_hold(&req->qp->s_lock)
1532 {
1533 	struct tid_rdma_flow *flow = &req->flows[req->clear_tail];
1534 	struct hfi1_ctxtdata *rcd = req->rcd;
1535 	unsigned long flags;
1536 	int i;
1537 	struct rvt_qp *fqp;
1538 
1539 	lockdep_assert_held(&req->qp->s_lock);
1540 	/* Exit if we have nothing in the flow circular buffer */
1541 	if (!CIRC_CNT(req->setup_head, req->clear_tail, MAX_FLOWS))
1542 		return -EINVAL;
1543 
1544 	spin_lock_irqsave(&rcd->exp_lock, flags);
1545 
1546 	for (i = 0; i < flow->tnode_cnt; i++)
1547 		kern_unprogram_rcv_group(flow, i);
1548 	/* To prevent double unprogramming */
1549 	flow->tnode_cnt = 0;
1550 	/* get head before dropping lock */
1551 	fqp = first_qp(rcd, &rcd->rarr_queue);
1552 	spin_unlock_irqrestore(&rcd->exp_lock, flags);
1553 
1554 	dma_unmap_flow(flow);
1555 
1556 	hfi1_tid_rdma_reset_flow(flow);
1557 	req->clear_tail = (req->clear_tail + 1) & (MAX_FLOWS - 1);
1558 
1559 	if (fqp == req->qp) {
1560 		__trigger_tid_waiter(fqp);
1561 		rvt_put_qp(fqp);
1562 	} else {
1563 		tid_rdma_schedule_tid_wakeup(fqp);
1564 	}
1565 
1566 	return 0;
1567 }
1568 
1569 /*
1570  * This function is called to release all the tid entries for
1571  * a request.
1572  */
1573 void hfi1_kern_exp_rcv_clear_all(struct tid_rdma_request *req)
1574 	__must_hold(&req->qp->s_lock)
1575 {
1576 	/* Use memory barrier for proper ordering */
1577 	while (CIRC_CNT(req->setup_head, req->clear_tail, MAX_FLOWS)) {
1578 		if (hfi1_kern_exp_rcv_clear(req))
1579 			break;
1580 	}
1581 }
1582 
1583 /**
 * hfi1_kern_exp_rcv_free_flows - free previously allocated flow information
1585  * @req - the tid rdma request to be cleaned
1586  */
1587 static void hfi1_kern_exp_rcv_free_flows(struct tid_rdma_request *req)
1588 {
1589 	kfree(req->flows);
1590 	req->flows = NULL;
1591 }
1592 
1593 /**
1594  * __trdma_clean_swqe - clean up for large sized QPs
 * @qp: the queue pair
1596  * @wqe: the send wqe
1597  */
1598 void __trdma_clean_swqe(struct rvt_qp *qp, struct rvt_swqe *wqe)
1599 {
1600 	struct hfi1_swqe_priv *p = wqe->priv;
1601 
1602 	hfi1_kern_exp_rcv_free_flows(&p->tid_req);
1603 }
1604 
1605 /*
1606  * This can be called at QP create time or in the data path.
1607  */
1608 static int hfi1_kern_exp_rcv_alloc_flows(struct tid_rdma_request *req,
1609 					 gfp_t gfp)
1610 {
1611 	struct tid_rdma_flow *flows;
1612 	int i;
1613 
1614 	if (likely(req->flows))
1615 		return 0;
1616 	flows = kmalloc_node(MAX_FLOWS * sizeof(*flows), gfp,
1617 			     req->rcd->numa_id);
1618 	if (!flows)
1619 		return -ENOMEM;
1620 	/* mini init */
1621 	for (i = 0; i < MAX_FLOWS; i++) {
1622 		flows[i].req = req;
1623 		flows[i].npagesets = 0;
1624 		flows[i].pagesets[0].mapped =  0;
1625 	}
1626 	req->flows = flows;
1627 	return 0;
1628 }
1629 
1630 static void hfi1_init_trdma_req(struct rvt_qp *qp,
1631 				struct tid_rdma_request *req)
1632 {
1633 	struct hfi1_qp_priv *qpriv = qp->priv;
1634 
1635 	/*
1636 	 * Initialize various TID RDMA request variables.
1637 	 * These variables are "static", which is why they
 * can be pre-initialized here before the WRs have
1639 	 * even been submitted.
1640 	 * However, non-NULL values for these variables do not
1641 	 * imply that this WQE has been enabled for TID RDMA.
1642 	 * Drivers should check the WQE's opcode to determine
1643 	 * if a request is a TID RDMA one or not.
1644 	 */
1645 	req->qp = qp;
1646 	req->rcd = qpriv->rcd;
1647 }
1648 
1649 u64 hfi1_access_sw_tid_wait(const struct cntr_entry *entry,
1650 			    void *context, int vl, int mode, u64 data)
1651 {
1652 	struct hfi1_devdata *dd = context;
1653 
1654 	return dd->verbs_dev.n_tidwait;
1655 }
1656 
1657 static struct tid_rdma_flow *find_flow_ib(struct tid_rdma_request *req,
1658 					  u32 psn, u16 *fidx)
1659 {
1660 	u16 head, tail;
1661 	struct tid_rdma_flow *flow;
1662 
1663 	head = req->setup_head;
1664 	tail = req->clear_tail;
1665 	for ( ; CIRC_CNT(head, tail, MAX_FLOWS);
1666 	     tail = CIRC_NEXT(tail, MAX_FLOWS)) {
1667 		flow = &req->flows[tail];
1668 		if (cmp_psn(psn, flow->flow_state.ib_spsn) >= 0 &&
1669 		    cmp_psn(psn, flow->flow_state.ib_lpsn) <= 0) {
1670 			if (fidx)
1671 				*fidx = tail;
1672 			return flow;
1673 		}
1674 	}
1675 	return NULL;
1676 }
1677 
1678 static struct tid_rdma_flow *
1679 __find_flow_ranged(struct tid_rdma_request *req, u16 head, u16 tail,
1680 		   u32 psn, u16 *fidx)
1681 {
1682 	for ( ; CIRC_CNT(head, tail, MAX_FLOWS);
1683 	      tail = CIRC_NEXT(tail, MAX_FLOWS)) {
1684 		struct tid_rdma_flow *flow = &req->flows[tail];
1685 		u32 spsn, lpsn;
1686 
1687 		spsn = full_flow_psn(flow, flow->flow_state.spsn);
1688 		lpsn = full_flow_psn(flow, flow->flow_state.lpsn);
1689 
1690 		if (cmp_psn(psn, spsn) >= 0 && cmp_psn(psn, lpsn) <= 0) {
1691 			if (fidx)
1692 				*fidx = tail;
1693 			return flow;
1694 		}
1695 	}
1696 	return NULL;
1697 }
1698 
1699 static struct tid_rdma_flow *find_flow(struct tid_rdma_request *req,
1700 				       u32 psn, u16 *fidx)
1701 {
1702 	return __find_flow_ranged(req, req->setup_head, req->clear_tail, psn,
1703 				  fidx);
1704 }
1705 
1706 /* TID RDMA READ functions */
1707 u32 hfi1_build_tid_rdma_read_packet(struct rvt_swqe *wqe,
1708 				    struct ib_other_headers *ohdr, u32 *bth1,
1709 				    u32 *bth2, u32 *len)
1710 {
1711 	struct tid_rdma_request *req = wqe_to_tid_req(wqe);
1712 	struct tid_rdma_flow *flow = &req->flows[req->flow_idx];
1713 	struct rvt_qp *qp = req->qp;
1714 	struct hfi1_qp_priv *qpriv = qp->priv;
1715 	struct hfi1_swqe_priv *wpriv = wqe->priv;
1716 	struct tid_rdma_read_req *rreq = &ohdr->u.tid_rdma.r_req;
1717 	struct tid_rdma_params *remote;
1718 	u32 req_len = 0;
1719 	void *req_addr = NULL;
1720 
1721 	/* This is the IB psn used to send the request */
1722 	*bth2 = mask_psn(flow->flow_state.ib_spsn + flow->pkt);
1723 	trace_hfi1_tid_flow_build_read_pkt(qp, req->flow_idx, flow);
1724 
1725 	/* TID Entries for TID RDMA READ payload */
1726 	req_addr = &flow->tid_entry[flow->tid_idx];
1727 	req_len = sizeof(*flow->tid_entry) *
1728 			(flow->tidcnt - flow->tid_idx);
1729 
1730 	memset(&ohdr->u.tid_rdma.r_req, 0, sizeof(ohdr->u.tid_rdma.r_req));
1731 	wpriv->ss.sge.vaddr = req_addr;
1732 	wpriv->ss.sge.sge_length = req_len;
1733 	wpriv->ss.sge.length = wpriv->ss.sge.sge_length;
1734 	/*
1735 	 * We can safely zero these out. Since the first SGE covers the
1736 	 * entire packet, nothing else should even look at the MR.
1737 	 */
1738 	wpriv->ss.sge.mr = NULL;
1739 	wpriv->ss.sge.m = 0;
1740 	wpriv->ss.sge.n = 0;
1741 
1742 	wpriv->ss.sg_list = NULL;
1743 	wpriv->ss.total_len = wpriv->ss.sge.sge_length;
1744 	wpriv->ss.num_sge = 1;
1745 
1746 	/* Construct the TID RDMA READ REQ packet header */
1747 	rcu_read_lock();
1748 	remote = rcu_dereference(qpriv->tid_rdma.remote);
1749 
1750 	KDETH_RESET(rreq->kdeth0, KVER, 0x1);
1751 	KDETH_RESET(rreq->kdeth1, JKEY, remote->jkey);
1752 	rreq->reth.vaddr = cpu_to_be64(wqe->rdma_wr.remote_addr +
1753 			   req->cur_seg * req->seg_len + flow->sent);
1754 	rreq->reth.rkey = cpu_to_be32(wqe->rdma_wr.rkey);
1755 	rreq->reth.length = cpu_to_be32(*len);
1756 	rreq->tid_flow_psn =
1757 		cpu_to_be32((flow->flow_state.generation <<
1758 			     HFI1_KDETH_BTH_SEQ_SHIFT) |
1759 			    ((flow->flow_state.spsn + flow->pkt) &
1760 			     HFI1_KDETH_BTH_SEQ_MASK));
1761 	rreq->tid_flow_qp =
1762 		cpu_to_be32(qpriv->tid_rdma.local.qp |
1763 			    ((flow->idx & TID_RDMA_DESTQP_FLOW_MASK) <<
1764 			     TID_RDMA_DESTQP_FLOW_SHIFT) |
1765 			    qpriv->rcd->ctxt);
1766 	rreq->verbs_qp = cpu_to_be32(qp->remote_qpn);
1767 	*bth1 &= ~RVT_QPN_MASK;
1768 	*bth1 |= remote->qp;
1769 	*bth2 |= IB_BTH_REQ_ACK;
1770 	rcu_read_unlock();
1771 
1772 	/* We are done with this segment */
1773 	flow->sent += *len;
1774 	req->cur_seg++;
1775 	qp->s_state = TID_OP(READ_REQ);
1776 	req->ack_pending++;
1777 	req->flow_idx = (req->flow_idx + 1) & (MAX_FLOWS - 1);
1778 	qpriv->pending_tid_r_segs++;
1779 	qp->s_num_rd_atomic++;
1780 
1781 	/* Set the TID RDMA READ request payload size */
1782 	*len = req_len;
1783 
1784 	return sizeof(ohdr->u.tid_rdma.r_req) / sizeof(u32);
1785 }
1786 
1787 /*
1788  * @len: contains the data length to read upon entry and the read request
1789  *       payload length upon exit.
1790  */
1791 u32 hfi1_build_tid_rdma_read_req(struct rvt_qp *qp, struct rvt_swqe *wqe,
1792 				 struct ib_other_headers *ohdr, u32 *bth1,
1793 				 u32 *bth2, u32 *len)
1794 	__must_hold(&qp->s_lock)
1795 {
1796 	struct hfi1_qp_priv *qpriv = qp->priv;
1797 	struct tid_rdma_request *req = wqe_to_tid_req(wqe);
1798 	struct tid_rdma_flow *flow = NULL;
1799 	u32 hdwords = 0;
1800 	bool last;
1801 	bool retry = true;
1802 	u32 npkts = rvt_div_round_up_mtu(qp, *len);
1803 
1804 	trace_hfi1_tid_req_build_read_req(qp, 0, wqe->wr.opcode, wqe->psn,
1805 					  wqe->lpsn, req);
1806 	/*
1807 	 * Check sync conditions. Make sure that there are no pending
1808 	 * segments before freeing the flow.
1809 	 */
1810 sync_check:
1811 	if (req->state == TID_REQUEST_SYNC) {
1812 		if (qpriv->pending_tid_r_segs)
1813 			goto done;
1814 
1815 		hfi1_kern_clear_hw_flow(req->rcd, qp);
1816 		qpriv->s_flags &= ~HFI1_R_TID_SW_PSN;
1817 		req->state = TID_REQUEST_ACTIVE;
1818 	}
1819 
1820 	/*
1821 	 * If the request for this segment is resent, the tid resources should
1822 	 * have been allocated before. In this case, req->flow_idx should
1823 	 * fall behind req->setup_head.
1824 	 */
1825 	if (req->flow_idx == req->setup_head) {
1826 		retry = false;
1827 		if (req->state == TID_REQUEST_RESEND) {
1828 			/*
1829 			 * This is the first new segment for a request whose
1830 			 * earlier segments have been re-sent. We need to
1831 			 * set up the sge pointer correctly.
1832 			 */
1833 			restart_sge(&qp->s_sge, wqe, req->s_next_psn,
1834 				    qp->pmtu);
1835 			req->isge = 0;
1836 			req->state = TID_REQUEST_ACTIVE;
1837 		}
1838 
1839 		/*
1840 		 * Check sync. The last PSN of each generation is reserved for
1841 		 * RESYNC.
1842 		 */
1843 		if ((qpriv->flow_state.psn + npkts) > MAX_TID_FLOW_PSN - 1) {
1844 			req->state = TID_REQUEST_SYNC;
1845 			goto sync_check;
1846 		}
1847 
1848 		/* Allocate the flow if not yet */
1849 		if (hfi1_kern_setup_hw_flow(qpriv->rcd, qp))
1850 			goto done;
1851 
1852 		/*
1853 		 * The following call will advance req->setup_head after
1854 		 * allocating the tid entries.
1855 		 */
1856 		if (hfi1_kern_exp_rcv_setup(req, &qp->s_sge, &last)) {
1857 			req->state = TID_REQUEST_QUEUED;
1858 
1859 			/*
1860 			 * We don't have resources for this segment. The QP has
1861 			 * already been queued.
1862 			 */
1863 			goto done;
1864 		}
1865 	}
1866 
1867 	/* req->flow_idx should only be one slot behind req->setup_head */
1868 	flow = &req->flows[req->flow_idx];
1869 	flow->pkt = 0;
1870 	flow->tid_idx = 0;
1871 	flow->sent = 0;
1872 	if (!retry) {
1873 		/* Set the first and last IB PSN for the flow in use.*/
1874 		flow->flow_state.ib_spsn = req->s_next_psn;
1875 		flow->flow_state.ib_lpsn =
1876 			flow->flow_state.ib_spsn + flow->npkts - 1;
1877 	}
1878 
1879 	/* Calculate the next segment start psn.*/
1880 	req->s_next_psn += flow->npkts;
1881 
1882 	/* Build the packet header */
1883 	hdwords = hfi1_build_tid_rdma_read_packet(wqe, ohdr, bth1, bth2, len);
1884 done:
1885 	return hdwords;
1886 }
1887 
1888 /*
1889  * Validate and accept the TID RDMA READ request parameters.
1890  * Return 0 if the request is accepted successfully;
1891  * Return 1 otherwise.
1892  */
1893 static int tid_rdma_rcv_read_request(struct rvt_qp *qp,
1894 				     struct rvt_ack_entry *e,
1895 				     struct hfi1_packet *packet,
1896 				     struct ib_other_headers *ohdr,
1897 				     u32 bth0, u32 psn, u64 vaddr, u32 len)
1898 {
1899 	struct hfi1_qp_priv *qpriv = qp->priv;
1900 	struct tid_rdma_request *req;
1901 	struct tid_rdma_flow *flow;
1902 	u32 flow_psn, i, tidlen = 0, pktlen, tlen;
1903 
1904 	req = ack_to_tid_req(e);
1905 
1906 	/* Validate the payload first */
1907 	flow = &req->flows[req->setup_head];
1908 
1909 	/* payload length = packet length - (header length + ICRC length) */
1910 	pktlen = packet->tlen - (packet->hlen + 4);
1911 	if (pktlen > sizeof(flow->tid_entry))
1912 		return 1;
1913 	memcpy(flow->tid_entry, packet->ebuf, pktlen);
1914 	flow->tidcnt = pktlen / sizeof(*flow->tid_entry);
1915 
1916 	/*
1917 	 * Walk the TID_ENTRY list to make sure we have enough space for a
1918 	 * complete segment. Also calculate the number of required packets.
1919 	 */
1920 	flow->npkts = rvt_div_round_up_mtu(qp, len);
1921 	for (i = 0; i < flow->tidcnt; i++) {
1922 		trace_hfi1_tid_entry_rcv_read_req(qp, i,
1923 						  flow->tid_entry[i]);
1924 		tlen = EXP_TID_GET(flow->tid_entry[i], LEN);
1925 		if (!tlen)
1926 			return 1;
1927 
1928 		/*
1929 		 * For tid pair (tidctrl == 3), the buffer size of the pair
1930 		 * should be the sum of the buffer size described by each
1931 		 * tid entry. However, only the first entry needs to be
1932 		 * specified in the request (see WFR HAS Section 8.5.7.1).
1933 		 */
1934 		tidlen += tlen;
1935 	}
1936 	if (tidlen * PAGE_SIZE < len)
1937 		return 1;
1938 
1939 	/* Empty the flow array */
1940 	req->clear_tail = req->setup_head;
1941 	flow->pkt = 0;
1942 	flow->tid_idx = 0;
1943 	flow->tid_offset = 0;
1944 	flow->sent = 0;
1945 	flow->tid_qpn = be32_to_cpu(ohdr->u.tid_rdma.r_req.tid_flow_qp);
1946 	flow->idx = (flow->tid_qpn >> TID_RDMA_DESTQP_FLOW_SHIFT) &
1947 		    TID_RDMA_DESTQP_FLOW_MASK;
1948 	flow_psn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.r_req.tid_flow_psn));
1949 	flow->flow_state.generation = flow_psn >> HFI1_KDETH_BTH_SEQ_SHIFT;
1950 	flow->flow_state.spsn = flow_psn & HFI1_KDETH_BTH_SEQ_MASK;
1951 	flow->length = len;
1952 
1953 	flow->flow_state.lpsn = flow->flow_state.spsn +
1954 		flow->npkts - 1;
1955 	flow->flow_state.ib_spsn = psn;
1956 	flow->flow_state.ib_lpsn = flow->flow_state.ib_spsn + flow->npkts - 1;
1957 
1958 	trace_hfi1_tid_flow_rcv_read_req(qp, req->setup_head, flow);
1959 	/* Set the initial flow index to the current flow. */
1960 	req->flow_idx = req->setup_head;
1961 
1962 	/* advance circular buffer head */
1963 	req->setup_head = (req->setup_head + 1) & (MAX_FLOWS - 1);
1964 
1965 	/*
1966 	 * Compute last PSN for request.
1967 	 */
1968 	e->opcode = (bth0 >> 24) & 0xff;
1969 	e->psn = psn;
1970 	e->lpsn = psn + flow->npkts - 1;
1971 	e->sent = 0;
1972 
1973 	req->n_flows = qpriv->tid_rdma.local.max_read;
1974 	req->state = TID_REQUEST_ACTIVE;
1975 	req->cur_seg = 0;
1976 	req->comp_seg = 0;
1977 	req->ack_seg = 0;
1978 	req->isge = 0;
1979 	req->seg_len = qpriv->tid_rdma.local.max_len;
1980 	req->total_len = len;
1981 	req->total_segs = 1;
1982 	req->r_flow_psn = e->psn;
1983 
1984 	trace_hfi1_tid_req_rcv_read_req(qp, 0, e->opcode, e->psn, e->lpsn,
1985 					req);
1986 	return 0;
1987 }
1988 
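/*
 * Responder-side handling of an out-of-sequence or duplicate TID RDMA
 * request. A PSN ahead of the expected one schedules a NAK; a duplicate
 * locates the matching ack queue entry, refreshes the TID flow information,
 * and rewinds qp->s_tail_ack_queue so the response can be re-sent.
 */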
1989 static int tid_rdma_rcv_error(struct hfi1_packet *packet,
1990 			      struct ib_other_headers *ohdr,
1991 			      struct rvt_qp *qp, u32 psn, int diff)
1992 {
1993 	struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
1994 	struct hfi1_ctxtdata *rcd = ((struct hfi1_qp_priv *)qp->priv)->rcd;
1995 	struct hfi1_ibdev *dev = to_idev(qp->ibqp.device);
1996 	struct hfi1_qp_priv *qpriv = qp->priv;
1997 	struct rvt_ack_entry *e;
1998 	struct tid_rdma_request *req;
1999 	unsigned long flags;
2000 	u8 prev;
2001 	bool old_req;
2002 
2003 	trace_hfi1_rsp_tid_rcv_error(qp, psn);
2004 	trace_hfi1_tid_rdma_rcv_err(qp, 0, psn, diff);
2005 	if (diff > 0) {
2006 		/* sequence error */
2007 		if (!qp->r_nak_state) {
2008 			ibp->rvp.n_rc_seqnak++;
2009 			qp->r_nak_state = IB_NAK_PSN_ERROR;
2010 			qp->r_ack_psn = qp->r_psn;
2011 			rc_defered_ack(rcd, qp);
2012 		}
2013 		goto done;
2014 	}
2015 
2016 	ibp->rvp.n_rc_dupreq++;
2017 
2018 	spin_lock_irqsave(&qp->s_lock, flags);
2019 	e = find_prev_entry(qp, psn, &prev, NULL, &old_req);
2020 	if (!e || (e->opcode != TID_OP(READ_REQ) &&
2021 		   e->opcode != TID_OP(WRITE_REQ)))
2022 		goto unlock;
2023 
2024 	req = ack_to_tid_req(e);
2025 	req->r_flow_psn = psn;
2026 	trace_hfi1_tid_req_rcv_err(qp, 0, e->opcode, e->psn, e->lpsn, req);
2027 	if (e->opcode == TID_OP(READ_REQ)) {
2028 		struct ib_reth *reth;
2029 		u32 offset;
2030 		u32 len;
2031 		u32 rkey;
2032 		u64 vaddr;
2033 		int ok;
2034 		u32 bth0;
2035 
2036 		reth = &ohdr->u.tid_rdma.r_req.reth;
2037 		/*
2038 		 * The requester always restarts from the start of the original
2039 		 * request.
2040 		 */
2041 		offset = delta_psn(psn, e->psn) * qp->pmtu;
2042 		len = be32_to_cpu(reth->length);
2043 		if (psn != e->psn || len != req->total_len)
2044 			goto unlock;
2045 
2046 		release_rdma_sge_mr(e);
2047 
2048 		rkey = be32_to_cpu(reth->rkey);
2049 		vaddr = get_ib_reth_vaddr(reth);
2050 
2051 		qp->r_len = len;
2052 		ok = rvt_rkey_ok(qp, &e->rdma_sge, len, vaddr, rkey,
2053 				 IB_ACCESS_REMOTE_READ);
2054 		if (unlikely(!ok))
2055 			goto unlock;
2056 
2057 		/*
2058 		 * If all the response packets for the current request have
2059 		 * been sent out and this request is complete (old_req ==
2060 		 * false), the TID flow may be unusable (req->clear_tail has
2061 		 * been advanced). However, when an earlier request is
2062 		 * received, this request will no longer be complete
2063 		 * (qp->s_tail_ack_queue is moved back, see below).
2064 		 * Consequently, we need to update the TID flow info every
2065 		 * time a duplicate request is received.
2066 		 */
2067 		bth0 = be32_to_cpu(ohdr->bth[0]);
2068 		if (tid_rdma_rcv_read_request(qp, e, packet, ohdr, bth0, psn,
2069 					      vaddr, len))
2070 			goto unlock;
2071 
2072 		/*
2073 		 * True if the request is already scheduled (between
2074 		 * qp->s_tail_ack_queue and qp->r_head_ack_queue).
2075 		 */
2076 		if (old_req)
2077 			goto unlock;
2078 	} else {
2079 		struct flow_state *fstate;
2080 		bool schedule = false;
2081 		u8 i;
2082 
2083 		if (req->state == TID_REQUEST_RESEND) {
2084 			req->state = TID_REQUEST_RESEND_ACTIVE;
2085 		} else if (req->state == TID_REQUEST_INIT_RESEND) {
2086 			req->state = TID_REQUEST_INIT;
2087 			schedule = true;
2088 		}
2089 
2090 		/*
2091 		 * True if the request is already scheduled (between
2092 		 * qp->s_tail_ack_queue and qp->r_head_ack_queue).
2093 		 * Also, don't change requests, which are at the SYNC
2094 		 * point and haven't generated any responses yet.
2095 		 * There is nothing to retransmit for them yet.
2096 		 */
2097 		if (old_req || req->state == TID_REQUEST_INIT ||
2098 		    (req->state == TID_REQUEST_SYNC && !req->cur_seg)) {
2099 			for (i = prev + 1; ; i++) {
2100 				if (i > rvt_size_atomic(&dev->rdi))
2101 					i = 0;
2102 				if (i == qp->r_head_ack_queue)
2103 					break;
2104 				e = &qp->s_ack_queue[i];
2105 				req = ack_to_tid_req(e);
2106 				if (e->opcode == TID_OP(WRITE_REQ) &&
2107 				    req->state == TID_REQUEST_INIT)
2108 					req->state = TID_REQUEST_INIT_RESEND;
2109 			}
2110 			/*
2111 			 * If the state of the request has been changed,
2112 			 * the first leg needs to get scheduled in order to
2113 			 * pick up the change. Otherwise, normal response
2114 			 * processing should take care of it.
2115 			 */
2116 			if (!schedule)
2117 				goto unlock;
2118 		}
2119 
2120 		/*
2121 		 * If there are no more allocated segments, just schedule the qp
2122 		 * without changing any state.
2123 		 */
2124 		if (req->clear_tail == req->setup_head)
2125 			goto schedule;
2126 		/*
2127 		 * If this request has sent responses for segments, which have
2128 		 * not received data yet (flow_idx != clear_tail), the flow_idx
2129 		 * pointer needs to be adjusted so the same responses can be
2130 		 * re-sent.
2131 		 */
2132 		if (CIRC_CNT(req->flow_idx, req->clear_tail, MAX_FLOWS)) {
2133 			fstate = &req->flows[req->clear_tail].flow_state;
2134 			qpriv->pending_tid_w_segs -=
2135 				CIRC_CNT(req->flow_idx, req->clear_tail,
2136 					 MAX_FLOWS);
2137 			req->flow_idx =
2138 				CIRC_ADD(req->clear_tail,
2139 					 delta_psn(psn, fstate->resp_ib_psn),
2140 					 MAX_FLOWS);
2141 			qpriv->pending_tid_w_segs +=
2142 				delta_psn(psn, fstate->resp_ib_psn);
2143 			/*
2144 			 * When flow_idx == setup_head, we've gotten a duplicate
2145 			 * request for a segment, which has not been allocated
2146 			 * yet. In that case, don't adjust this request.
2147 			 * However, we still want to go through the loop below
2148 			 * to adjust all subsequent requests.
2149 			 */
2150 			if (CIRC_CNT(req->setup_head, req->flow_idx,
2151 				     MAX_FLOWS)) {
2152 				req->cur_seg = delta_psn(psn, e->psn);
2153 				req->state = TID_REQUEST_RESEND_ACTIVE;
2154 			}
2155 		}
2156 
2157 		for (i = prev + 1; ; i++) {
2158 			/*
2159 			 * Look at everything up to and including
2160 			 * s_tail_ack_queue
2161 			 */
2162 			if (i > rvt_size_atomic(&dev->rdi))
2163 				i = 0;
2164 			if (i == qp->r_head_ack_queue)
2165 				break;
2166 			e = &qp->s_ack_queue[i];
2167 			req = ack_to_tid_req(e);
2168 			trace_hfi1_tid_req_rcv_err(qp, 0, e->opcode, e->psn,
2169 						   e->lpsn, req);
2170 			if (e->opcode != TID_OP(WRITE_REQ) ||
2171 			    req->cur_seg == req->comp_seg ||
2172 			    req->state == TID_REQUEST_INIT ||
2173 			    req->state == TID_REQUEST_INIT_RESEND) {
2174 				if (req->state == TID_REQUEST_INIT)
2175 					req->state = TID_REQUEST_INIT_RESEND;
2176 				continue;
2177 			}
2178 			qpriv->pending_tid_w_segs -=
2179 				CIRC_CNT(req->flow_idx,
2180 					 req->clear_tail,
2181 					 MAX_FLOWS);
2182 			req->flow_idx = req->clear_tail;
2183 			req->state = TID_REQUEST_RESEND;
2184 			req->cur_seg = req->comp_seg;
2185 		}
2186 		qpriv->s_flags &= ~HFI1_R_TID_WAIT_INTERLCK;
2187 	}
2188 	/* Re-process old requests.*/
2189 	if (qp->s_acked_ack_queue == qp->s_tail_ack_queue)
2190 		qp->s_acked_ack_queue = prev;
2191 	qp->s_tail_ack_queue = prev;
2192 	/*
2193 	 * Since the qp->s_tail_ack_queue is modified, the
2194 	 * qp->s_ack_state must be changed to re-initialize
2195 	 * qp->s_ack_rdma_sge; Otherwise, we will end up in
2196 	 * wrong memory region.
2197 	 */
2198 	qp->s_ack_state = OP(ACKNOWLEDGE);
2199 schedule:
2200 	/*
2201 	 * It's possible to receive a retry PSN that is earlier than an RNR NAK
2202 	 * PSN. In this case, the RNR NAK state should be cleared.
2203 	 */
2204 	if (qpriv->rnr_nak_state) {
2205 		qp->s_nak_state = 0;
2206 		qpriv->rnr_nak_state = TID_RNR_NAK_INIT;
2207 		qp->r_psn = e->lpsn + 1;
2208 		hfi1_tid_write_alloc_resources(qp, true);
2209 	}
2210 
2211 	qp->r_state = e->opcode;
2212 	qp->r_nak_state = 0;
2213 	qp->s_flags |= RVT_S_RESP_PENDING;
2214 	hfi1_schedule_send(qp);
2215 unlock:
2216 	spin_unlock_irqrestore(&qp->s_lock, flags);
2217 done:
2218 	return 1;
2219 }
2220 
2221 void hfi1_rc_rcv_tid_rdma_read_req(struct hfi1_packet *packet)
2222 {
2223 	/* HANDLER FOR TID RDMA READ REQUEST packet (Responder side) */
2224 
2225 	/*
2226 	 * 1. Verify TID RDMA READ REQ as per IB_OPCODE_RC_RDMA_READ
2227 	 *    (see hfi1_rc_rcv())
2228 	 * 2. Put TID RDMA READ REQ into the response queue (s_ack_queue)
2229 	 *     - Setup struct tid_rdma_req with request info
2230 	 *     - Initialize struct tid_rdma_flow info;
2231 	 *     - Copy TID entries;
2232 	 * 3. Set the qp->s_ack_state.
2233 	 * 4. Set RVT_S_RESP_PENDING in s_flags.
2234 	 * 5. Kick the send engine (hfi1_schedule_send())
2235 	 */
2236 	struct hfi1_ctxtdata *rcd = packet->rcd;
2237 	struct rvt_qp *qp = packet->qp;
2238 	struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
2239 	struct ib_other_headers *ohdr = packet->ohdr;
2240 	struct rvt_ack_entry *e;
2241 	unsigned long flags;
2242 	struct ib_reth *reth;
2243 	struct hfi1_qp_priv *qpriv = qp->priv;
2244 	u32 bth0, psn, len, rkey;
2245 	bool fecn;
2246 	u8 next;
2247 	u64 vaddr;
2248 	int diff;
2249 	u8 nack_state = IB_NAK_INVALID_REQUEST;
2250 
2251 	bth0 = be32_to_cpu(ohdr->bth[0]);
2252 	if (hfi1_ruc_check_hdr(ibp, packet))
2253 		return;
2254 
2255 	fecn = process_ecn(qp, packet);
2256 	psn = mask_psn(be32_to_cpu(ohdr->bth[2]));
2257 	trace_hfi1_rsp_rcv_tid_read_req(qp, psn);
2258 
2259 	if (qp->state == IB_QPS_RTR && !(qp->r_flags & RVT_R_COMM_EST))
2260 		rvt_comm_est(qp);
2261 
2262 	if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ)))
2263 		goto nack_inv;
2264 
2265 	reth = &ohdr->u.tid_rdma.r_req.reth;
2266 	vaddr = be64_to_cpu(reth->vaddr);
2267 	len = be32_to_cpu(reth->length);
2268 	/* The length needs to be in multiples of PAGE_SIZE */
2269 	if (!len || len & ~PAGE_MASK || len > qpriv->tid_rdma.local.max_len)
2270 		goto nack_inv;
2271 
2272 	diff = delta_psn(psn, qp->r_psn);
2273 	if (unlikely(diff)) {
2274 		tid_rdma_rcv_err(packet, ohdr, qp, psn, diff, fecn);
2275 		return;
2276 	}
2277 
2278 	/* We've verified the request, insert it into the ack queue. */
2279 	next = qp->r_head_ack_queue + 1;
2280 	if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device)))
2281 		next = 0;
2282 	spin_lock_irqsave(&qp->s_lock, flags);
2283 	if (unlikely(next == qp->s_tail_ack_queue)) {
2284 		if (!qp->s_ack_queue[next].sent) {
2285 			nack_state = IB_NAK_REMOTE_OPERATIONAL_ERROR;
2286 			goto nack_inv_unlock;
2287 		}
2288 		update_ack_queue(qp, next);
2289 	}
2290 	e = &qp->s_ack_queue[qp->r_head_ack_queue];
2291 	release_rdma_sge_mr(e);
2292 
2293 	rkey = be32_to_cpu(reth->rkey);
2294 	qp->r_len = len;
2295 
2296 	if (unlikely(!rvt_rkey_ok(qp, &e->rdma_sge, qp->r_len, vaddr,
2297 				  rkey, IB_ACCESS_REMOTE_READ)))
2298 		goto nack_acc;
2299 
2300 	/* Accept the request parameters */
2301 	if (tid_rdma_rcv_read_request(qp, e, packet, ohdr, bth0, psn, vaddr,
2302 				      len))
2303 		goto nack_inv_unlock;
2304 
2305 	qp->r_state = e->opcode;
2306 	qp->r_nak_state = 0;
2307 	/*
2308 	 * We need to increment the MSN here instead of when we
2309 	 * finish sending the result since a duplicate request would
2310 	 * increment it more than once.
2311 	 */
2312 	qp->r_msn++;
2313 	qp->r_psn += e->lpsn - e->psn + 1;
2314 
2315 	qp->r_head_ack_queue = next;
2316 
2317 	/*
2318 	 * For all requests other than TID WRITE which are added to the ack
2319 	 * queue, qpriv->r_tid_alloc follows qp->r_head_ack_queue. It is ok to
2320 	 * do this because of interlocks between these and TID WRITE
2321 	 * requests. The same change has also been made in hfi1_rc_rcv().
2322 	 */
2323 	qpriv->r_tid_alloc = qp->r_head_ack_queue;
2324 
2325 	/* Schedule the send tasklet. */
2326 	qp->s_flags |= RVT_S_RESP_PENDING;
2327 	if (fecn)
2328 		qp->s_flags |= RVT_S_ECN;
2329 	hfi1_schedule_send(qp);
2330 
2331 	spin_unlock_irqrestore(&qp->s_lock, flags);
2332 	return;
2333 
2334 nack_inv_unlock:
2335 	spin_unlock_irqrestore(&qp->s_lock, flags);
2336 nack_inv:
2337 	rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
2338 	qp->r_nak_state = nack_state;
2339 	qp->r_ack_psn = qp->r_psn;
2340 	/* Queue NAK for later */
2341 	rc_defered_ack(rcd, qp);
2342 	return;
2343 nack_acc:
2344 	spin_unlock_irqrestore(&qp->s_lock, flags);
2345 	rvt_rc_error(qp, IB_WC_LOC_PROT_ERR);
2346 	qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR;
2347 	qp->r_ack_psn = qp->r_psn;
2348 }
2349 
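/*
 * Build one TID RDMA READ RESP packet for the flow at req->clear_tail. *len
 * is set to the payload length of this packet and *last is set when the
 * final packet of the segment is built, at which point req->clear_tail is
 * advanced to the next flow. Returns the header size in 32-bit words.
 */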
2350 u32 hfi1_build_tid_rdma_read_resp(struct rvt_qp *qp, struct rvt_ack_entry *e,
2351 				  struct ib_other_headers *ohdr, u32 *bth0,
2352 				  u32 *bth1, u32 *bth2, u32 *len, bool *last)
2353 {
2354 	struct hfi1_ack_priv *epriv = e->priv;
2355 	struct tid_rdma_request *req = &epriv->tid_req;
2356 	struct hfi1_qp_priv *qpriv = qp->priv;
2357 	struct tid_rdma_flow *flow = &req->flows[req->clear_tail];
2358 	u32 tidentry = flow->tid_entry[flow->tid_idx];
2359 	u32 tidlen = EXP_TID_GET(tidentry, LEN) << PAGE_SHIFT;
2360 	struct tid_rdma_read_resp *resp = &ohdr->u.tid_rdma.r_rsp;
2361 	u32 next_offset, om = KDETH_OM_LARGE;
2362 	bool last_pkt;
2363 	u32 hdwords = 0;
2364 	struct tid_rdma_params *remote;
2365 
2366 	*len = min_t(u32, qp->pmtu, tidlen - flow->tid_offset);
2367 	flow->sent += *len;
2368 	next_offset = flow->tid_offset + *len;
2369 	last_pkt = (flow->sent >= flow->length);
2370 
2371 	trace_hfi1_tid_entry_build_read_resp(qp, flow->tid_idx, tidentry);
2372 	trace_hfi1_tid_flow_build_read_resp(qp, req->clear_tail, flow);
2373 
2374 	rcu_read_lock();
2375 	remote = rcu_dereference(qpriv->tid_rdma.remote);
2376 	if (!remote) {
2377 		rcu_read_unlock();
2378 		goto done;
2379 	}
2380 	KDETH_RESET(resp->kdeth0, KVER, 0x1);
2381 	KDETH_SET(resp->kdeth0, SH, !last_pkt);
2382 	KDETH_SET(resp->kdeth0, INTR, !!(!last_pkt && remote->urg));
2383 	KDETH_SET(resp->kdeth0, TIDCTRL, EXP_TID_GET(tidentry, CTRL));
2384 	KDETH_SET(resp->kdeth0, TID, EXP_TID_GET(tidentry, IDX));
2385 	KDETH_SET(resp->kdeth0, OM, om == KDETH_OM_LARGE);
2386 	KDETH_SET(resp->kdeth0, OFFSET, flow->tid_offset / om);
2387 	KDETH_RESET(resp->kdeth1, JKEY, remote->jkey);
2388 	resp->verbs_qp = cpu_to_be32(qp->remote_qpn);
2389 	rcu_read_unlock();
2390 
2391 	resp->aeth = rvt_compute_aeth(qp);
2392 	resp->verbs_psn = cpu_to_be32(mask_psn(flow->flow_state.ib_spsn +
2393 					       flow->pkt));
2394 
2395 	*bth0 = TID_OP(READ_RESP) << 24;
2396 	*bth1 = flow->tid_qpn;
2397 	*bth2 = mask_psn(((flow->flow_state.spsn + flow->pkt++) &
2398 			  HFI1_KDETH_BTH_SEQ_MASK) |
2399 			 (flow->flow_state.generation <<
2400 			  HFI1_KDETH_BTH_SEQ_SHIFT));
2401 	*last = last_pkt;
2402 	if (last_pkt)
2403 		/* Advance to next flow */
2404 		req->clear_tail = (req->clear_tail + 1) &
2405 				  (MAX_FLOWS - 1);
2406 
2407 	if (next_offset >= tidlen) {
2408 		flow->tid_offset = 0;
2409 		flow->tid_idx++;
2410 	} else {
2411 		flow->tid_offset = next_offset;
2412 	}
2413 
2414 	hdwords = sizeof(ohdr->u.tid_rdma.r_rsp) / sizeof(u32);
2415 
2416 done:
2417 	return hdwords;
2418 }
2419 
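/*
 * Scan the send queue from qp->s_acked through qp->s_cur for the WQE whose
 * PSN range [wqe->psn, wqe->lpsn] contains @psn and return its TID RDMA
 * request if the WQE's opcode matches @opcode; otherwise return NULL.
 */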
2420 static inline struct tid_rdma_request *
2421 find_tid_request(struct rvt_qp *qp, u32 psn, enum ib_wr_opcode opcode)
2422 	__must_hold(&qp->s_lock)
2423 {
2424 	struct rvt_swqe *wqe;
2425 	struct tid_rdma_request *req = NULL;
2426 	u32 i, end;
2427 
2428 	end = qp->s_cur + 1;
2429 	if (end == qp->s_size)
2430 		end = 0;
2431 	for (i = qp->s_acked; i != end;) {
2432 		wqe = rvt_get_swqe_ptr(qp, i);
2433 		if (cmp_psn(psn, wqe->psn) >= 0 &&
2434 		    cmp_psn(psn, wqe->lpsn) <= 0) {
2435 			if (wqe->wr.opcode == opcode)
2436 				req = wqe_to_tid_req(wqe);
2437 			break;
2438 		}
2439 		if (++i == qp->s_size)
2440 			i = 0;
2441 	}
2442 
2443 	return req;
2444 }
2445 
2446 void hfi1_rc_rcv_tid_rdma_read_resp(struct hfi1_packet *packet)
2447 {
2448 	/* HANDLER FOR TID RDMA READ RESPONSE packet (Requester side) */
2449 
2450 	/*
2451 	 * 1. Find matching SWQE
2452 	 * 2. Check that the entire segment has been read.
2453 	 * 3. Remove HFI1_S_WAIT_TID_RESP from s_flags.
2454 	 * 4. Free the TID flow resources.
2455 	 * 5. Kick the send engine (hfi1_schedule_send())
2456 	 */
2457 	struct ib_other_headers *ohdr = packet->ohdr;
2458 	struct rvt_qp *qp = packet->qp;
2459 	struct hfi1_qp_priv *priv = qp->priv;
2460 	struct hfi1_ctxtdata *rcd = packet->rcd;
2461 	struct tid_rdma_request *req;
2462 	struct tid_rdma_flow *flow;
2463 	u32 opcode, aeth;
2464 	bool fecn;
2465 	unsigned long flags;
2466 	u32 kpsn, ipsn;
2467 
2468 	trace_hfi1_sender_rcv_tid_read_resp(qp);
2469 	fecn = process_ecn(qp, packet);
2470 	kpsn = mask_psn(be32_to_cpu(ohdr->bth[2]));
2471 	aeth = be32_to_cpu(ohdr->u.tid_rdma.r_rsp.aeth);
2472 	opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff;
2473 
2474 	spin_lock_irqsave(&qp->s_lock, flags);
2475 	ipsn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.r_rsp.verbs_psn));
2476 	req = find_tid_request(qp, ipsn, IB_WR_TID_RDMA_READ);
2477 	if (unlikely(!req))
2478 		goto ack_op_err;
2479 
2480 	flow = &req->flows[req->clear_tail];
2481 	/* When header suppression is disabled */
2482 	if (cmp_psn(ipsn, flow->flow_state.ib_lpsn)) {
2483 		update_r_next_psn_fecn(packet, priv, rcd, flow, fecn);
2484 
2485 		if (cmp_psn(kpsn, flow->flow_state.r_next_psn))
2486 			goto ack_done;
2487 		flow->flow_state.r_next_psn = mask_psn(kpsn + 1);
2488 		/*
2489 		 * Copy the payload to destination buffer if this packet is
2490 		 * delivered as an eager packet due to RSM rule and FECN.
2491 		 * The RSM rule matches on the FECN bit in the BTH and the SH
2492 		 * bit in the KDETH header and therefore will not match the
2493 		 * last packet of each segment, which has the SH bit cleared.
2494 		 */
2495 		if (fecn && packet->etype == RHF_RCV_TYPE_EAGER) {
2496 			struct rvt_sge_state ss;
2497 			u32 len;
2498 			u32 tlen = packet->tlen;
2499 			u16 hdrsize = packet->hlen;
2500 			u8 pad = packet->pad;
2501 			u8 extra_bytes = pad + packet->extra_byte +
2502 				(SIZE_OF_CRC << 2);
2503 			u32 pmtu = qp->pmtu;
2504 
2505 			if (unlikely(tlen != (hdrsize + pmtu + extra_bytes)))
2506 				goto ack_op_err;
2507 			len = restart_sge(&ss, req->e.swqe, ipsn, pmtu);
2508 			if (unlikely(len < pmtu))
2509 				goto ack_op_err;
2510 			rvt_copy_sge(qp, &ss, packet->payload, pmtu, false,
2511 				     false);
2512 			/* Raise the sw sequence check flag for next packet */
2513 			priv->s_flags |= HFI1_R_TID_SW_PSN;
2514 		}
2515 
2516 		goto ack_done;
2517 	}
2518 	flow->flow_state.r_next_psn = mask_psn(kpsn + 1);
2519 	req->ack_pending--;
2520 	priv->pending_tid_r_segs--;
2521 	qp->s_num_rd_atomic--;
2522 	if ((qp->s_flags & RVT_S_WAIT_FENCE) &&
2523 	    !qp->s_num_rd_atomic) {
2524 		qp->s_flags &= ~(RVT_S_WAIT_FENCE |
2525 				 RVT_S_WAIT_ACK);
2526 		hfi1_schedule_send(qp);
2527 	}
2528 	if (qp->s_flags & RVT_S_WAIT_RDMAR) {
2529 		qp->s_flags &= ~(RVT_S_WAIT_RDMAR | RVT_S_WAIT_ACK);
2530 		hfi1_schedule_send(qp);
2531 	}
2532 
2533 	trace_hfi1_ack(qp, ipsn);
2534 	trace_hfi1_tid_req_rcv_read_resp(qp, 0, req->e.swqe->wr.opcode,
2535 					 req->e.swqe->psn, req->e.swqe->lpsn,
2536 					 req);
2537 	trace_hfi1_tid_flow_rcv_read_resp(qp, req->clear_tail, flow);
2538 
2539 	/* Release the tid resources */
2540 	hfi1_kern_exp_rcv_clear(req);
2541 
2542 	if (!do_rc_ack(qp, aeth, ipsn, opcode, 0, rcd))
2543 		goto ack_done;
2544 
2545 	/* If not done yet, build next read request */
2546 	if (++req->comp_seg >= req->total_segs) {
2547 		priv->tid_r_comp++;
2548 		req->state = TID_REQUEST_COMPLETE;
2549 	}
2550 
2551 	/*
2552 	 * Clear the hw flow under two conditions:
2553 	 * 1. This request is a sync point and it is complete;
2554 	 * 2. Current request is completed and there are no more requests.
2555 	 */
2556 	if ((req->state == TID_REQUEST_SYNC &&
2557 	     req->comp_seg == req->cur_seg) ||
2558 	    priv->tid_r_comp == priv->tid_r_reqs) {
2559 		hfi1_kern_clear_hw_flow(priv->rcd, qp);
2560 		priv->s_flags &= ~HFI1_R_TID_SW_PSN;
2561 		if (req->state == TID_REQUEST_SYNC)
2562 			req->state = TID_REQUEST_ACTIVE;
2563 	}
2564 
2565 	hfi1_schedule_send(qp);
2566 	goto ack_done;
2567 
2568 ack_op_err:
2569 	/*
2570 	 * The test indicates that the send engine has finished its cleanup
2571 	 * after sending the request and it's now safe to put the QP into error
2572 	 * state. However, if the wqe queue is empty (qp->s_acked == qp->s_tail
2573 	 * == qp->s_head), it would be unsafe to complete the wqe pointed to by
2574 	 * qp->s_acked here. Putting the qp into error state will safely flush
2575 	 * all remaining requests.
2576 	 */
2577 	if (qp->s_last == qp->s_acked)
2578 		rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
2579 
2580 ack_done:
2581 	spin_unlock_irqrestore(&qp->s_lock, flags);
2582 }
2583 
2584 void hfi1_kern_read_tid_flow_free(struct rvt_qp *qp)
2585 	__must_hold(&qp->s_lock)
2586 {
2587 	u32 n = qp->s_acked;
2588 	struct rvt_swqe *wqe;
2589 	struct tid_rdma_request *req;
2590 	struct hfi1_qp_priv *priv = qp->priv;
2591 
2592 	lockdep_assert_held(&qp->s_lock);
2593 	/* Free any TID entries */
2594 	while (n != qp->s_tail) {
2595 		wqe = rvt_get_swqe_ptr(qp, n);
2596 		if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) {
2597 			req = wqe_to_tid_req(wqe);
2598 			hfi1_kern_exp_rcv_clear_all(req);
2599 		}
2600 
2601 		if (++n == qp->s_size)
2602 			n = 0;
2603 	}
2604 	/* Free flow */
2605 	hfi1_kern_clear_hw_flow(priv->rcd, qp);
2606 }
2607 
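/*
 * Handle an RHF TID error for a KDETH packet. Running out of eager buffer
 * space triggers a restart of the TID RDMA WRITE request; a TID RDMA READ
 * RESP within the outstanding window frees the read flow resources and puts
 * the QP into error; any other case clears the hardware flow and the TID
 * entries of pending TID RDMA WRITE requests and errors out the QP.
 */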
2608 static bool tid_rdma_tid_err(struct hfi1_ctxtdata *rcd,
2609 			     struct hfi1_packet *packet, u8 rcv_type,
2610 			     u8 opcode)
2611 {
2612 	struct rvt_qp *qp = packet->qp;
2613 	struct hfi1_qp_priv *qpriv = qp->priv;
2614 	u32 ipsn;
2615 	struct ib_other_headers *ohdr = packet->ohdr;
2616 	struct rvt_ack_entry *e;
2617 	struct tid_rdma_request *req;
2618 	struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device);
2619 	u32 i;
2620 
2621 	if (rcv_type >= RHF_RCV_TYPE_IB)
2622 		goto done;
2623 
2624 	spin_lock(&qp->s_lock);
2625 
2626 	/*
2627 	 * We've run out of space in the eager buffer.
2628 	 * Eagerly received KDETH packets which require space in the
2629 	 * eager buffer (packets that have a payload) are TID RDMA WRITE
2630 	 * response packets. In this case, we have to re-transmit the
2631 	 * TID RDMA WRITE request.
2632 	 */
2633 	if (rcv_type == RHF_RCV_TYPE_EAGER) {
2634 		hfi1_restart_rc(qp, qp->s_last_psn + 1, 1);
2635 		hfi1_schedule_send(qp);
2636 		goto done_unlock;
2637 	}
2638 
2639 	/*
2640 	 * For TID READ response, error out QP after freeing the tid
2641 	 * resources.
2642 	 */
2643 	if (opcode == TID_OP(READ_RESP)) {
2644 		ipsn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.r_rsp.verbs_psn));
2645 		if (cmp_psn(ipsn, qp->s_last_psn) > 0 &&
2646 		    cmp_psn(ipsn, qp->s_psn) < 0) {
2647 			hfi1_kern_read_tid_flow_free(qp);
2648 			spin_unlock(&qp->s_lock);
2649 			rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
2650 			goto done;
2651 		}
2652 		goto done_unlock;
2653 	}
2654 
2655 	/*
2656 	 * Error out the qp for TID RDMA WRITE
2657 	 */
2658 	hfi1_kern_clear_hw_flow(qpriv->rcd, qp);
2659 	for (i = 0; i < rvt_max_atomic(rdi); i++) {
2660 		e = &qp->s_ack_queue[i];
2661 		if (e->opcode == TID_OP(WRITE_REQ)) {
2662 			req = ack_to_tid_req(e);
2663 			hfi1_kern_exp_rcv_clear_all(req);
2664 		}
2665 	}
2666 	spin_unlock(&qp->s_lock);
2667 	rvt_rc_error(qp, IB_WC_LOC_LEN_ERR);
2668 	goto done;
2669 
2670 done_unlock:
2671 	spin_unlock(&qp->s_lock);
2672 done:
2673 	return true;
2674 }
2675 
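/*
 * Restart a TID RDMA READ request from its first incomplete segment (the
 * flow at req->clear_tail): mark the QP for sequenced RDMA READ restart,
 * rewind to the flow's starting IB PSN, and queue the QP on the receive
 * context's wait list so the restarted request gets scheduled.
 */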
2676 static void restart_tid_rdma_read_req(struct hfi1_ctxtdata *rcd,
2677 				      struct rvt_qp *qp, struct rvt_swqe *wqe)
2678 {
2679 	struct tid_rdma_request *req;
2680 	struct tid_rdma_flow *flow;
2681 
2682 	/* Start from the right segment */
2683 	qp->r_flags |= RVT_R_RDMAR_SEQ;
2684 	req = wqe_to_tid_req(wqe);
2685 	flow = &req->flows[req->clear_tail];
2686 	hfi1_restart_rc(qp, flow->flow_state.ib_spsn, 0);
2687 	if (list_empty(&qp->rspwait)) {
2688 		qp->r_flags |= RVT_R_RSP_SEND;
2689 		rvt_get_qp(qp);
2690 		list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
2691 	}
2692 }
2693 
2694 /*
2695  * Handle the KDETH eflags for TID RDMA READ response.
2696  *
2697  * Return false if the last packet for a segment has been received and it is
2698  * time for the caller to process the response normally; otherwise, return true.
2699  *
2700  * The caller must hold the packet->qp->r_lock and the rcu_read_lock.
2701  */
2702 static bool handle_read_kdeth_eflags(struct hfi1_ctxtdata *rcd,
2703 				     struct hfi1_packet *packet, u8 rcv_type,
2704 				     u8 rte, u32 psn, u32 ibpsn)
2705 	__must_hold(&packet->qp->r_lock) __must_hold(RCU)
2706 {
2707 	struct hfi1_pportdata *ppd = rcd->ppd;
2708 	struct hfi1_devdata *dd = ppd->dd;
2709 	struct hfi1_ibport *ibp;
2710 	struct rvt_swqe *wqe;
2711 	struct tid_rdma_request *req;
2712 	struct tid_rdma_flow *flow;
2713 	u32 ack_psn;
2714 	struct rvt_qp *qp = packet->qp;
2715 	struct hfi1_qp_priv *priv = qp->priv;
2716 	bool ret = true;
2717 	int diff = 0;
2718 	u32 fpsn;
2719 
2720 	lockdep_assert_held(&qp->r_lock);
2721 	/* If the psn is out of valid range, drop the packet */
2722 	if (cmp_psn(ibpsn, qp->s_last_psn) < 0 ||
2723 	    cmp_psn(ibpsn, qp->s_psn) > 0)
2724 		return ret;
2725 
2726 	spin_lock(&qp->s_lock);
2727 	/*
2728 	 * Note that NAKs implicitly ACK outstanding SEND and RDMA write
2729 	 * requests and implicitly NAK RDMA read and atomic requests issued
2730 	 * before the NAK'ed request.
2731 	 */
2732 	ack_psn = ibpsn - 1;
2733 	wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
2734 	ibp = to_iport(qp->ibqp.device, qp->port_num);
2735 
2736 	/* Complete WQEs that the PSN finishes. */
2737 	while ((int)delta_psn(ack_psn, wqe->lpsn) >= 0) {
2738 		/*
2739 		 * If this request is an RDMA read or atomic, and the NAK is
2740 		 * for a later operation, this NAK NAKs the RDMA read or
2741 		 * atomic.
2742 		 */
2743 		if (wqe->wr.opcode == IB_WR_RDMA_READ ||
2744 		    wqe->wr.opcode == IB_WR_TID_RDMA_READ ||
2745 		    wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
2746 		    wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) {
2747 			/* Retry this request. */
2748 			if (!(qp->r_flags & RVT_R_RDMAR_SEQ)) {
2749 				qp->r_flags |= RVT_R_RDMAR_SEQ;
2750 				if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) {
2751 					restart_tid_rdma_read_req(rcd, qp,
2752 								  wqe);
2753 				} else {
2754 					hfi1_restart_rc(qp, qp->s_last_psn + 1,
2755 							0);
2756 					if (list_empty(&qp->rspwait)) {
2757 						qp->r_flags |= RVT_R_RSP_SEND;
2758 						rvt_get_qp(qp);
2759 						list_add_tail(/* wait */
2760 						   &qp->rspwait,
2761 						   &rcd->qp_wait_list);
2762 					}
2763 				}
2764 			}
2765 			/*
2766 			 * No need to process the NAK since we are
2767 			 * restarting an earlier request.
2768 			 */
2769 			break;
2770 		}
2771 
2772 		wqe = do_rc_completion(qp, wqe, ibp);
2773 		if (qp->s_acked == qp->s_tail)
2774 			break;
2775 	}
2776 
2777 	/* Handle the eflags for the request */
2778 	if (wqe->wr.opcode != IB_WR_TID_RDMA_READ)
2779 		goto s_unlock;
2780 
2781 	req = wqe_to_tid_req(wqe);
2782 	switch (rcv_type) {
2783 	case RHF_RCV_TYPE_EXPECTED:
2784 		switch (rte) {
2785 		case RHF_RTE_EXPECTED_FLOW_SEQ_ERR:
2786 			/*
2787 			 * On the first occurrence of a Flow Sequence error,
2788 			 * the flag HFI1_R_TID_SW_PSN is set.
2789 			 *
2790 			 * After that, the flow is *not* reprogrammed and the
2791 			 * protocol falls back to SW PSN checking. This is done
2792 			 * to prevent continuous Flow Sequence errors for any
2793 			 * packets that could be still in the fabric.
2794 			 */
2795 			flow = find_flow(req, psn, NULL);
2796 			if (!flow) {
2797 				/*
2798 				 * We can't find the IB PSN matching the
2799 				 * received KDETH PSN. The only thing we can
2800 				 * do at this point is report the error to
2801 				 * the QP.
2802 				 */
2803 				hfi1_kern_read_tid_flow_free(qp);
2804 				spin_unlock(&qp->s_lock);
2805 				rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
2806 				return ret;
2807 			}
2808 			if (priv->s_flags & HFI1_R_TID_SW_PSN) {
2809 				diff = cmp_psn(psn,
2810 					       flow->flow_state.r_next_psn);
2811 				if (diff > 0) {
2812 					if (!(qp->r_flags & RVT_R_RDMAR_SEQ))
2813 						restart_tid_rdma_read_req(rcd,
2814 									  qp,
2815 									  wqe);
2816 
2817 					/* Drop the packet.*/
2818 					goto s_unlock;
2819 				} else if (diff < 0) {
2820 					/*
2821 					 * If a response packet for a restarted
2822 					 * request has come back, reset the
2823 					 * restart flag.
2824 					 */
2825 					if (qp->r_flags & RVT_R_RDMAR_SEQ)
2826 						qp->r_flags &=
2827 							~RVT_R_RDMAR_SEQ;
2828 
2829 					/* Drop the packet.*/
2830 					goto s_unlock;
2831 				}
2832 
2833 				/*
2834 				 * If SW PSN verification is successful and
2835 				 * this is the last packet in the segment, tell
2836 				 * the caller to process it as a normal packet.
2837 				 */
2838 				fpsn = full_flow_psn(flow,
2839 						     flow->flow_state.lpsn);
2840 				if (cmp_psn(fpsn, psn) == 0) {
2841 					ret = false;
2842 					if (qp->r_flags & RVT_R_RDMAR_SEQ)
2843 						qp->r_flags &=
2844 							~RVT_R_RDMAR_SEQ;
2845 				}
2846 				flow->flow_state.r_next_psn =
2847 					mask_psn(psn + 1);
2848 			} else {
2849 				u32 last_psn;
2850 
2851 				last_psn = read_r_next_psn(dd, rcd->ctxt,
2852 							   flow->idx);
2853 				flow->flow_state.r_next_psn = last_psn;
2854 				priv->s_flags |= HFI1_R_TID_SW_PSN;
2855 				/*
2856 				 * If no request has been restarted yet,
2857 				 * restart the current one.
2858 				 */
2859 				if (!(qp->r_flags & RVT_R_RDMAR_SEQ))
2860 					restart_tid_rdma_read_req(rcd, qp,
2861 								  wqe);
2862 			}
2863 
2864 			break;
2865 
2866 		case RHF_RTE_EXPECTED_FLOW_GEN_ERR:
2867 			/*
2868 			 * Since the TID flow is able to ride through
2869 			 * generation mismatch, drop this stale packet.
2870 			 */
2871 			break;
2872 
2873 		default:
2874 			break;
2875 		}
2876 		break;
2877 
2878 	case RHF_RCV_TYPE_ERROR:
2879 		switch (rte) {
2880 		case RHF_RTE_ERROR_OP_CODE_ERR:
2881 		case RHF_RTE_ERROR_KHDR_MIN_LEN_ERR:
2882 		case RHF_RTE_ERROR_KHDR_HCRC_ERR:
2883 		case RHF_RTE_ERROR_KHDR_KVER_ERR:
2884 		case RHF_RTE_ERROR_CONTEXT_ERR:
2885 		case RHF_RTE_ERROR_KHDR_TID_ERR:
2886 		default:
2887 			break;
2888 		}
2889 	default:
2890 		break;
2891 	}
2892 s_unlock:
2893 	spin_unlock(&qp->s_lock);
2894 	return ret;
2895 }
2896 
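/*
 * Top-level handler for KDETH eflag errors on TID RDMA packets. Returns true
 * if the packet has been fully handled here (and should be dropped by the
 * caller) or false if the caller should continue processing it as a normal
 * packet.
 */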
2897 bool hfi1_handle_kdeth_eflags(struct hfi1_ctxtdata *rcd,
2898 			      struct hfi1_pportdata *ppd,
2899 			      struct hfi1_packet *packet)
2900 {
2901 	struct hfi1_ibport *ibp = &ppd->ibport_data;
2902 	struct hfi1_devdata *dd = ppd->dd;
2903 	struct rvt_dev_info *rdi = &dd->verbs_dev.rdi;
2904 	u8 rcv_type = rhf_rcv_type(packet->rhf);
2905 	u8 rte = rhf_rcv_type_err(packet->rhf);
2906 	struct ib_header *hdr = packet->hdr;
2907 	struct ib_other_headers *ohdr = NULL;
2908 	int lnh = be16_to_cpu(hdr->lrh[0]) & 3;
2909 	u16 lid  = be16_to_cpu(hdr->lrh[1]);
2910 	u8 opcode;
2911 	u32 qp_num, psn, ibpsn;
2912 	struct rvt_qp *qp;
2913 	struct hfi1_qp_priv *qpriv;
2914 	unsigned long flags;
2915 	bool ret = true;
2916 	struct rvt_ack_entry *e;
2917 	struct tid_rdma_request *req;
2918 	struct tid_rdma_flow *flow;
2919 	int diff = 0;
2920 
2921 	trace_hfi1_msg_handle_kdeth_eflags(NULL, "Kdeth error: rhf ",
2922 					   packet->rhf);
2923 	if (packet->rhf & RHF_ICRC_ERR)
2924 		return ret;
2925 
2926 	packet->ohdr = &hdr->u.oth;
2927 	ohdr = packet->ohdr;
2928 	trace_input_ibhdr(rcd->dd, packet, !!(rhf_dc_info(packet->rhf)));
2929 
2930 	/* Get the destination QP number. */
2931 	qp_num = be32_to_cpu(ohdr->u.tid_rdma.r_rsp.verbs_qp) &
2932 		RVT_QPN_MASK;
2933 	if (lid >= be16_to_cpu(IB_MULTICAST_LID_BASE))
2934 		goto drop;
2935 
2936 	psn = mask_psn(be32_to_cpu(ohdr->bth[2]));
2937 	opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff;
2938 
2939 	rcu_read_lock();
2940 	qp = rvt_lookup_qpn(rdi, &ibp->rvp, qp_num);
2941 	if (!qp)
2942 		goto rcu_unlock;
2943 
2944 	packet->qp = qp;
2945 
2946 	/* Check for valid receive state. */
2947 	spin_lock_irqsave(&qp->r_lock, flags);
2948 	if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) {
2949 		ibp->rvp.n_pkt_drops++;
2950 		goto r_unlock;
2951 	}
2952 
2953 	if (packet->rhf & RHF_TID_ERR) {
2954 		/* For TIDERR and RC QPs preemptively schedule a NAK */
2955 		u32 tlen = rhf_pkt_len(packet->rhf); /* in bytes */
2956 
2957 		/* Sanity check packet */
2958 		if (tlen < 24)
2959 			goto r_unlock;
2960 
2961 		/*
2962 		 * Check for GRH. We should never get packets with GRH in this
2963 		 * path.
2964 		 */
2965 		if (lnh == HFI1_LRH_GRH)
2966 			goto r_unlock;
2967 
2968 		if (tid_rdma_tid_err(rcd, packet, rcv_type, opcode))
2969 			goto r_unlock;
2970 	}
2971 
2972 	/* handle TID RDMA READ */
2973 	if (opcode == TID_OP(READ_RESP)) {
2974 		ibpsn = be32_to_cpu(ohdr->u.tid_rdma.r_rsp.verbs_psn);
2975 		ibpsn = mask_psn(ibpsn);
2976 		ret = handle_read_kdeth_eflags(rcd, packet, rcv_type, rte, psn,
2977 					       ibpsn);
2978 		goto r_unlock;
2979 	}
2980 
2981 	/*
2982 	 * qp->s_tail_ack_queue points to the rvt_ack_entry currently being
2983 	 * processed. These are completed sequentially so we can be sure that
2984 	 * the pointer will not change until the entire request has completed.
2985 	 */
2986 	spin_lock(&qp->s_lock);
2987 	qpriv = qp->priv;
2988 	e = &qp->s_ack_queue[qpriv->r_tid_tail];
2989 	req = ack_to_tid_req(e);
2990 	flow = &req->flows[req->clear_tail];
2991 	trace_hfi1_eflags_err_write(qp, rcv_type, rte, psn);
2992 	trace_hfi1_rsp_handle_kdeth_eflags(qp, psn);
2993 	trace_hfi1_tid_write_rsp_handle_kdeth_eflags(qp);
2994 	trace_hfi1_tid_req_handle_kdeth_eflags(qp, 0, e->opcode, e->psn,
2995 					       e->lpsn, req);
2996 	trace_hfi1_tid_flow_handle_kdeth_eflags(qp, req->clear_tail, flow);
2997 
2998 	switch (rcv_type) {
2999 	case RHF_RCV_TYPE_EXPECTED:
3000 		switch (rte) {
3001 		case RHF_RTE_EXPECTED_FLOW_SEQ_ERR:
3002 			if (!(qpriv->s_flags & HFI1_R_TID_SW_PSN)) {
3003 				qpriv->s_flags |= HFI1_R_TID_SW_PSN;
3004 				flow->flow_state.r_next_psn =
3005 					read_r_next_psn(dd, rcd->ctxt,
3006 							flow->idx);
3007 				qpriv->r_next_psn_kdeth =
3008 					flow->flow_state.r_next_psn;
3009 				goto nak_psn;
3010 			} else {
3011 				/*
3012 				 * If the received PSN does not match the next
3013 				 * expected PSN, NAK the packet.
3014 				 * However, only do that if we know that a
3015 				 * NAK has already been sent. Otherwise, this
3016 				 * mismatch could be due to packets that were
3017 				 * already in flight.
3018 				 */
3019 				diff = cmp_psn(psn,
3020 					       flow->flow_state.r_next_psn);
3021 				if (diff > 0)
3022 					goto nak_psn;
3023 				else if (diff < 0)
3024 					break;
3025 
3026 				qpriv->s_nak_state = 0;
3027 				/*
3028 				 * If SW PSN verification is successful and this
3029 				 * is the last packet in the segment, tell the
3030 				 * caller to process it as a normal packet.
3031 				 */
3032 				if (psn == full_flow_psn(flow,
3033 							 flow->flow_state.lpsn))
3034 					ret = false;
3035 				flow->flow_state.r_next_psn =
3036 					mask_psn(psn + 1);
3037 				qpriv->r_next_psn_kdeth =
3038 					flow->flow_state.r_next_psn;
3039 			}
3040 			break;
3041 
3042 		case RHF_RTE_EXPECTED_FLOW_GEN_ERR:
3043 			goto nak_psn;
3044 
3045 		default:
3046 			break;
3047 		}
3048 		break;
3049 
3050 	case RHF_RCV_TYPE_ERROR:
3051 		switch (rte) {
3052 		case RHF_RTE_ERROR_OP_CODE_ERR:
3053 		case RHF_RTE_ERROR_KHDR_MIN_LEN_ERR:
3054 		case RHF_RTE_ERROR_KHDR_HCRC_ERR:
3055 		case RHF_RTE_ERROR_KHDR_KVER_ERR:
3056 		case RHF_RTE_ERROR_CONTEXT_ERR:
3057 		case RHF_RTE_ERROR_KHDR_TID_ERR:
3058 		default:
3059 			break;
3060 		}
3061 	default:
3062 		break;
3063 	}
3064 
3065 unlock:
3066 	spin_unlock(&qp->s_lock);
3067 r_unlock:
3068 	spin_unlock_irqrestore(&qp->r_lock, flags);
3069 rcu_unlock:
3070 	rcu_read_unlock();
3071 drop:
3072 	return ret;
3073 nak_psn:
3074 	ibp->rvp.n_rc_seqnak++;
3075 	if (!qpriv->s_nak_state) {
3076 		qpriv->s_nak_state = IB_NAK_PSN_ERROR;
3077 		/* We are NAK'ing the next expected PSN */
3078 		qpriv->s_nak_psn = mask_psn(flow->flow_state.r_next_psn);
3079 		qpriv->s_flags |= RVT_S_ACK_PENDING;
3080 		if (qpriv->r_tid_ack == HFI1_QP_WQE_INVALID)
3081 			qpriv->r_tid_ack = qpriv->r_tid_tail;
3082 		hfi1_schedule_tid_send(qp);
3083 	}
3084 	goto unlock;
3085 }
3086 
3087 /*
3088  * "Rewind" the TID request information.
3089  * This means that we reset the state back to ACTIVE,
3090  * find the proper flow, set the flow index to that flow,
3091  * and reset the flow information.
3092  */
3093 void hfi1_tid_rdma_restart_req(struct rvt_qp *qp, struct rvt_swqe *wqe,
3094 			       u32 *bth2)
3095 {
3096 	struct tid_rdma_request *req = wqe_to_tid_req(wqe);
3097 	struct tid_rdma_flow *flow;
3098 	struct hfi1_qp_priv *qpriv = qp->priv;
3099 	int diff, delta_pkts;
3100 	u32 tididx = 0, i;
3101 	u16 fidx;
3102 
3103 	if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) {
3104 		*bth2 = mask_psn(qp->s_psn);
3105 		flow = find_flow_ib(req, *bth2, &fidx);
3106 		if (!flow) {
3107 			trace_hfi1_msg_tid_restart_req(/* msg */
3108 			   qp, "!!!!!! Could not find flow to restart: bth2 ",
3109 			   (u64)*bth2);
3110 			trace_hfi1_tid_req_restart_req(qp, 0, wqe->wr.opcode,
3111 						       wqe->psn, wqe->lpsn,
3112 						       req);
3113 			return;
3114 		}
3115 	} else {
3116 		fidx = req->acked_tail;
3117 		flow = &req->flows[fidx];
3118 		*bth2 = mask_psn(req->r_ack_psn);
3119 	}
3120 
3121 	if (wqe->wr.opcode == IB_WR_TID_RDMA_READ)
3122 		delta_pkts = delta_psn(*bth2, flow->flow_state.ib_spsn);
3123 	else
3124 		delta_pkts = delta_psn(*bth2,
3125 				       full_flow_psn(flow,
3126 						     flow->flow_state.spsn));
3127 
3128 	trace_hfi1_tid_flow_restart_req(qp, fidx, flow);
3129 	diff = delta_pkts + flow->resync_npkts;
3130 
3131 	flow->sent = 0;
3132 	flow->pkt = 0;
3133 	flow->tid_idx = 0;
3134 	flow->tid_offset = 0;
3135 	if (diff) {
3136 		for (tididx = 0; tididx < flow->tidcnt; tididx++) {
3137 			u32 tidentry = flow->tid_entry[tididx], tidlen,
3138 				tidnpkts, npkts;
3139 
3140 			flow->tid_offset = 0;
3141 			tidlen = EXP_TID_GET(tidentry, LEN) * PAGE_SIZE;
3142 			tidnpkts = rvt_div_round_up_mtu(qp, tidlen);
3143 			npkts = min_t(u32, diff, tidnpkts);
3144 			flow->pkt += npkts;
3145 			flow->sent += (npkts == tidnpkts ? tidlen :
3146 				       npkts * qp->pmtu);
3147 			flow->tid_offset += npkts * qp->pmtu;
3148 			diff -= npkts;
3149 			if (!diff)
3150 				break;
3151 		}
3152 	}
3153 	if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE) {
3154 		rvt_skip_sge(&qpriv->tid_ss, (req->cur_seg * req->seg_len) +
3155 			     flow->sent, 0);
3156 		/*
3157 		 * Packet PSN is based on flow_state.spsn + flow->pkt. However,
3158 		 * during a RESYNC, the generation is incremented and the
3159 		 * sequence is reset to 0. Since we've adjusted the npkts in the
3160 		 * flow and the SGE has been sufficiently advanced, we have to
3161 		 * adjust flow->pkt in order to calculate the correct PSN.
3162 		 */
3163 		flow->pkt -= flow->resync_npkts;
3164 	}
3165 
3166 	if (flow->tid_offset ==
3167 	    EXP_TID_GET(flow->tid_entry[tididx], LEN) * PAGE_SIZE) {
3168 		tididx++;
3169 		flow->tid_offset = 0;
3170 	}
3171 	flow->tid_idx = tididx;
3172 	if (wqe->wr.opcode == IB_WR_TID_RDMA_READ)
3173 		/* Move flow_idx to correct index */
3174 		req->flow_idx = fidx;
3175 	else
3176 		req->clear_tail = fidx;
3177 
3178 	trace_hfi1_tid_flow_restart_req(qp, fidx, flow);
3179 	trace_hfi1_tid_req_restart_req(qp, 0, wqe->wr.opcode, wqe->psn,
3180 				       wqe->lpsn, req);
3181 	req->state = TID_REQUEST_ACTIVE;
3182 	if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE) {
3183 		/* Reset all the flows that we are going to resend */
3184 		fidx = CIRC_NEXT(fidx, MAX_FLOWS);
3185 		i = qpriv->s_tid_tail;
3186 		do {
3187 			for (; CIRC_CNT(req->setup_head, fidx, MAX_FLOWS);
3188 			      fidx = CIRC_NEXT(fidx, MAX_FLOWS)) {
3189 				req->flows[fidx].sent = 0;
3190 				req->flows[fidx].pkt = 0;
3191 				req->flows[fidx].tid_idx = 0;
3192 				req->flows[fidx].tid_offset = 0;
3193 				req->flows[fidx].resync_npkts = 0;
3194 			}
3195 			if (i == qpriv->s_tid_cur)
3196 				break;
3197 			do {
3198 				i = (++i == qp->s_size ? 0 : i);
3199 				wqe = rvt_get_swqe_ptr(qp, i);
3200 			} while (wqe->wr.opcode != IB_WR_TID_RDMA_WRITE);
3201 			req = wqe_to_tid_req(wqe);
3202 			req->cur_seg = req->ack_seg;
3203 			fidx = req->acked_tail;
3204 			/* Pull req->clear_tail back */
3205 			req->clear_tail = fidx;
3206 		} while (1);
3207 	}
3208 }
3209 
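/*
 * Release all locally allocated TID resources for a QP: the hardware flow,
 * the TID entries of outstanding TID RDMA READ requests on the send queue,
 * and the TID entries of TID RDMA WRITE requests in the ack queue.
 */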
3210 void hfi1_qp_kern_exp_rcv_clear_all(struct rvt_qp *qp)
3211 {
3212 	int i, ret;
3213 	struct hfi1_qp_priv *qpriv = qp->priv;
3214 	struct tid_flow_state *fs;
3215 
3216 	if (qp->ibqp.qp_type != IB_QPT_RC || !HFI1_CAP_IS_KSET(TID_RDMA))
3217 		return;
3218 
3219 	/*
3220 	 * First, clear the flow to help prevent any delayed packets from
3221 	 * being delivered.
3222 	 */
3223 	fs = &qpriv->flow_state;
3224 	if (fs->index != RXE_NUM_TID_FLOWS)
3225 		hfi1_kern_clear_hw_flow(qpriv->rcd, qp);
3226 
3227 	for (i = qp->s_acked; i != qp->s_head;) {
3228 		struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, i);
3229 
3230 		if (++i == qp->s_size)
3231 			i = 0;
3232 		/* Free only locally allocated TID entries */
3233 		if (wqe->wr.opcode != IB_WR_TID_RDMA_READ)
3234 			continue;
3235 		do {
3236 			struct hfi1_swqe_priv *priv = wqe->priv;
3237 
3238 			ret = hfi1_kern_exp_rcv_clear(&priv->tid_req);
3239 		} while (!ret);
3240 	}
3241 	for (i = qp->s_acked_ack_queue; i != qp->r_head_ack_queue;) {
3242 		struct rvt_ack_entry *e = &qp->s_ack_queue[i];
3243 
3244 		if (++i == rvt_max_atomic(ib_to_rvt(qp->ibqp.device)))
3245 			i = 0;
3246 		/* Free only locally allocated TID entries */
3247 		if (e->opcode != TID_OP(WRITE_REQ))
3248 			continue;
3249 		do {
3250 			struct hfi1_ack_priv *priv = e->priv;
3251 
3252 			ret = hfi1_kern_exp_rcv_clear(&priv->tid_req);
3253 		} while (!ret);
3254 	}
3255 }
3256 
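/*
 * Decide whether @wqe must wait for the preceding WQE to finish before it
 * can be processed (for example, when it follows a TID RDMA WRITE that has
 * not been fully acked). If so, set HFI1_S_TID_WAIT_INTERLCK and return
 * true.
 */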
3257 bool hfi1_tid_rdma_wqe_interlock(struct rvt_qp *qp, struct rvt_swqe *wqe)
3258 {
3259 	struct rvt_swqe *prev;
3260 	struct hfi1_qp_priv *priv = qp->priv;
3261 	u32 s_prev;
3262 	struct tid_rdma_request *req;
3263 
3264 	s_prev = (qp->s_cur == 0 ? qp->s_size : qp->s_cur) - 1;
3265 	prev = rvt_get_swqe_ptr(qp, s_prev);
3266 
3267 	switch (wqe->wr.opcode) {
3268 	case IB_WR_SEND:
3269 	case IB_WR_SEND_WITH_IMM:
3270 	case IB_WR_SEND_WITH_INV:
3271 	case IB_WR_ATOMIC_CMP_AND_SWP:
3272 	case IB_WR_ATOMIC_FETCH_AND_ADD:
3273 	case IB_WR_RDMA_WRITE:
3274 		switch (prev->wr.opcode) {
3275 		case IB_WR_TID_RDMA_WRITE:
3276 			req = wqe_to_tid_req(prev);
3277 			if (req->ack_seg != req->total_segs)
3278 				goto interlock;
3279 		default:
3280 			break;
3281 		}
3282 		break;
3283 	case IB_WR_RDMA_READ:
3284 		if (prev->wr.opcode != IB_WR_TID_RDMA_WRITE)
3285 			break;
3286 		/* fall through */
3287 	case IB_WR_TID_RDMA_READ:
3288 		switch (prev->wr.opcode) {
3289 		case IB_WR_RDMA_READ:
3290 			if (qp->s_acked != qp->s_cur)
3291 				goto interlock;
3292 			break;
3293 		case IB_WR_TID_RDMA_WRITE:
3294 			req = wqe_to_tid_req(prev);
3295 			if (req->ack_seg != req->total_segs)
3296 				goto interlock;
3297 		default:
3298 			break;
3299 		}
3300 	default:
3301 		break;
3302 	}
3303 	return false;
3304 
3305 interlock:
3306 	priv->s_flags |= HFI1_S_TID_WAIT_INTERLCK;
3307 	return true;
3308 }
3309 
3310 /* Does @sge meet the alignment requirements for tid rdma? */
3311 static inline bool hfi1_check_sge_align(struct rvt_qp *qp,
3312 					struct rvt_sge *sge, int num_sge)
3313 {
3314 	int i;
3315 
3316 	for (i = 0; i < num_sge; i++, sge++) {
3317 		trace_hfi1_sge_check_align(qp, i, sge);
3318 		if ((u64)sge->vaddr & ~PAGE_MASK ||
3319 		    sge->sge_length & ~PAGE_MASK)
3320 			return false;
3321 	}
3322 	return true;
3323 }
3324 
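/*
 * Called when a send WQE is set up: if the RDMA READ/WRITE request qualifies
 * for TID RDMA (non-loopback, 9B packets, properly aligned buffers and
 * length), allocate the flow array, switch the opcode to the TID RDMA
 * variant, and compute the segment count and the request's last PSN.
 */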
3325 void setup_tid_rdma_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe)
3326 {
3327 	struct hfi1_qp_priv *qpriv = (struct hfi1_qp_priv *)qp->priv;
3328 	struct hfi1_swqe_priv *priv = wqe->priv;
3329 	struct tid_rdma_params *remote;
3330 	enum ib_wr_opcode new_opcode;
3331 	bool do_tid_rdma = false;
3332 	struct hfi1_pportdata *ppd = qpriv->rcd->ppd;
3333 
3334 	if ((rdma_ah_get_dlid(&qp->remote_ah_attr) & ~((1 << ppd->lmc) - 1)) ==
3335 				ppd->lid)
3336 		return;
3337 	if (qpriv->hdr_type != HFI1_PKT_TYPE_9B)
3338 		return;
3339 
3340 	rcu_read_lock();
3341 	remote = rcu_dereference(qpriv->tid_rdma.remote);
3342 	/*
3343 	 * If TID RDMA is disabled by the negotiation, don't
3344 	 * use it.
3345 	 */
3346 	if (!remote)
3347 		goto exit;
3348 
3349 	if (wqe->wr.opcode == IB_WR_RDMA_READ) {
3350 		if (hfi1_check_sge_align(qp, &wqe->sg_list[0],
3351 					 wqe->wr.num_sge)) {
3352 			new_opcode = IB_WR_TID_RDMA_READ;
3353 			do_tid_rdma = true;
3354 		}
3355 	} else if (wqe->wr.opcode == IB_WR_RDMA_WRITE) {
3356 		/*
3357 		 * TID RDMA is enabled for this RDMA WRITE request iff:
3358 		 *   1. The remote address is page-aligned,
3359 		 *   2. The length is larger than the minimum segment size,
3360 		 *   3. The length is a multiple of the page size.
3361 		 */
3362 		if (!(wqe->rdma_wr.remote_addr & ~PAGE_MASK) &&
3363 		    !(wqe->length & ~PAGE_MASK)) {
3364 			new_opcode = IB_WR_TID_RDMA_WRITE;
3365 			do_tid_rdma = true;
3366 		}
3367 	}
3368 
3369 	if (do_tid_rdma) {
3370 		if (hfi1_kern_exp_rcv_alloc_flows(&priv->tid_req, GFP_ATOMIC))
3371 			goto exit;
3372 		wqe->wr.opcode = new_opcode;
3373 		priv->tid_req.seg_len =
3374 			min_t(u32, remote->max_len, wqe->length);
3375 		priv->tid_req.total_segs =
3376 			DIV_ROUND_UP(wqe->length, priv->tid_req.seg_len);
3377 		/* Compute the last PSN of the request */
3378 		wqe->lpsn = wqe->psn;
3379 		if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) {
3380 			priv->tid_req.n_flows = remote->max_read;
3381 			qpriv->tid_r_reqs++;
3382 			wqe->lpsn += rvt_div_round_up_mtu(qp, wqe->length) - 1;
3383 		} else {
3384 			wqe->lpsn += priv->tid_req.total_segs - 1;
3385 			atomic_inc(&qpriv->n_requests);
3386 		}
3387 
3388 		priv->tid_req.cur_seg = 0;
3389 		priv->tid_req.comp_seg = 0;
3390 		priv->tid_req.ack_seg = 0;
3391 		priv->tid_req.state = TID_REQUEST_INACTIVE;
3392 		/*
3393 		 * Reset acked_tail.
3394 		 * TID RDMA READ does not have ACKs so it does not
3395 		 * update the pointer. We have to reset it so TID RDMA
3396 		 * WRITE does not get confused.
3397 		 */
3398 		priv->tid_req.acked_tail = priv->tid_req.setup_head;
3399 		trace_hfi1_tid_req_setup_tid_wqe(qp, 1, wqe->wr.opcode,
3400 						 wqe->psn, wqe->lpsn,
3401 						 &priv->tid_req);
3402 	}
3403 exit:
3404 	rcu_read_unlock();
3405 }
3406 
3407 /* TID RDMA WRITE functions */
3408 
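/*
 * Build the TID RDMA WRITE REQ header. The request itself carries no payload
 * (the data is sent separately once the responder returns TID entries), so
 * *len is set to 0. Returns the header size in 32-bit words.
 */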
3409 u32 hfi1_build_tid_rdma_write_req(struct rvt_qp *qp, struct rvt_swqe *wqe,
3410 				  struct ib_other_headers *ohdr,
3411 				  u32 *bth1, u32 *bth2, u32 *len)
3412 {
3413 	struct hfi1_qp_priv *qpriv = qp->priv;
3414 	struct tid_rdma_request *req = wqe_to_tid_req(wqe);
3415 	struct tid_rdma_params *remote;
3416 
3417 	rcu_read_lock();
3418 	remote = rcu_dereference(qpriv->tid_rdma.remote);
3419 	/*
3420 	 * Set the number of flows to be used based on the negotiated
3421 	 * parameters.
3422 	 */
3423 	req->n_flows = remote->max_write;
3424 	req->state = TID_REQUEST_ACTIVE;
3425 
3426 	KDETH_RESET(ohdr->u.tid_rdma.w_req.kdeth0, KVER, 0x1);
3427 	KDETH_RESET(ohdr->u.tid_rdma.w_req.kdeth1, JKEY, remote->jkey);
3428 	ohdr->u.tid_rdma.w_req.reth.vaddr =
3429 		cpu_to_be64(wqe->rdma_wr.remote_addr + (wqe->length - *len));
3430 	ohdr->u.tid_rdma.w_req.reth.rkey =
3431 		cpu_to_be32(wqe->rdma_wr.rkey);
3432 	ohdr->u.tid_rdma.w_req.reth.length = cpu_to_be32(*len);
3433 	ohdr->u.tid_rdma.w_req.verbs_qp = cpu_to_be32(qp->remote_qpn);
3434 	*bth1 &= ~RVT_QPN_MASK;
3435 	*bth1 |= remote->qp;
3436 	qp->s_state = TID_OP(WRITE_REQ);
3437 	qp->s_flags |= HFI1_S_WAIT_TID_RESP;
3438 	*bth2 |= IB_BTH_REQ_ACK;
3439 	*len = 0;
3440 
3441 	rcu_read_unlock();
3442 	return sizeof(ohdr->u.tid_rdma.w_req) / sizeof(u32);
3443 }
3444 
3445 void hfi1_compute_tid_rdma_flow_wt(void)
3446 {
3447 	/*
3448 	 * Heuristic for computing the RNR timeout when waiting on the flow
	 * queue. Rather than a computationally expensive exact estimate of
	 * when a flow will be available, we assume that if a QP is at
	 * position N in the flow queue, it has to wait approximately
	 * (N + 1) * (the number of segments between two sync points),
	 * assuming a PMTU of 4K. The rationale
3453 	 * for this is that flows are released and recycled at each sync point.
3454 	 */
3455 	tid_rdma_flow_wt = MAX_TID_FLOW_PSN * enum_to_mtu(OPA_MTU_4096) /
3456 		TID_RDMA_MAX_SEGMENT_SIZE;
3457 }
3458 
3459 static u32 position_in_queue(struct hfi1_qp_priv *qpriv,
3460 			     struct tid_queue *queue)
3461 {
3462 	return qpriv->tid_enqueue - queue->dequeue;
3463 }
3464 
3465 /*
3466  * @qp: points to rvt_qp context.
3467  * @to_seg: desired RNR timeout in segments.
3468  * Return: index of the next highest timeout in the ib_hfi1_rnr_table[]
3469  */
3470 static u32 hfi1_compute_tid_rnr_timeout(struct rvt_qp *qp, u32 to_seg)
3471 {
3472 	struct hfi1_qp_priv *qpriv = qp->priv;
3473 	u64 timeout;
3474 	u32 bytes_per_us;
3475 	u8 i;
3476 
3477 	bytes_per_us = active_egress_rate(qpriv->rcd->ppd) / 8;
3478 	timeout = (to_seg * TID_RDMA_MAX_SEGMENT_SIZE) / bytes_per_us;
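	/* timeout is in microseconds, matching rvt_rnr_tbl_to_usec() below */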
3479 	/*
	 * Find the next-highest value in the RNR table relative to the
	 * required timeout. This gives the responder some padding.
3482 	 */
3483 	for (i = 1; i <= IB_AETH_CREDIT_MASK; i++)
3484 		if (rvt_rnr_tbl_to_usec(i) >= timeout)
3485 			return i;
3486 	return 0;
3487 }
3488 
3489 /**
3490  * Central place for resource allocation at TID write responder,
3491  * is called from write_req and write_data interrupt handlers as
3492  * well as the send thread when a queued QP is scheduled for
3493  * resource allocation.
3494  *
3495  * Iterates over (a) segments of a request and then (b) queued requests
3496  * themselves to allocate resources for up to local->max_write
3497  * segments across multiple requests. Stop allocating when we
3498  * hit a sync point, resume allocating after data packets at
3499  * sync point have been received.
3500  *
3501  * Resource allocation and sending of responses is decoupled. The
3502  * request/segment which are being allocated and sent are as follows.
3503  * Resources are allocated for:
3504  *     [request: qpriv->r_tid_alloc, segment: req->alloc_seg]
3505  * The send thread sends:
3506  *     [request: qp->s_tail_ack_queue, segment:req->cur_seg]
3507  */
3508 static void hfi1_tid_write_alloc_resources(struct rvt_qp *qp, bool intr_ctx)
3509 {
3510 	struct tid_rdma_request *req;
3511 	struct hfi1_qp_priv *qpriv = qp->priv;
3512 	struct hfi1_ctxtdata *rcd = qpriv->rcd;
3513 	struct tid_rdma_params *local = &qpriv->tid_rdma.local;
3514 	struct rvt_ack_entry *e;
3515 	u32 npkts, to_seg;
3516 	bool last;
3517 	int ret = 0;
3518 
3519 	lockdep_assert_held(&qp->s_lock);
3520 
3521 	while (1) {
3522 		trace_hfi1_rsp_tid_write_alloc_res(qp, 0);
3523 		trace_hfi1_tid_write_rsp_alloc_res(qp);
3524 		/*
3525 		 * Don't allocate more segments if a RNR NAK has already been
3526 		 * scheduled to avoid messing up qp->r_psn: the RNR NAK will
3527 		 * be sent only when all allocated segments have been sent.
3528 		 * However, if more segments are allocated before that, TID RDMA
3529 		 * WRITE RESP packets will be sent out for these new segments
3530 		 * before the RNR NAK packet. When the requester receives the
3531 		 * RNR NAK packet, it will restart with qp->s_last_psn + 1,
3532 		 * which does not match qp->r_psn and will be dropped.
3533 		 * Consequently, the requester will exhaust its retries and
3534 		 * put the qp into error state.
3535 		 */
3536 		if (qpriv->rnr_nak_state == TID_RNR_NAK_SEND)
3537 			break;
3538 
3539 		/* No requests left to process */
3540 		if (qpriv->r_tid_alloc == qpriv->r_tid_head) {
3541 			/* If all data has been received, clear the flow */
3542 			if (qpriv->flow_state.index < RXE_NUM_TID_FLOWS &&
3543 			    !qpriv->alloc_w_segs) {
3544 				hfi1_kern_clear_hw_flow(rcd, qp);
3545 				qpriv->s_flags &= ~HFI1_R_TID_SW_PSN;
3546 			}
3547 			break;
3548 		}
3549 
3550 		e = &qp->s_ack_queue[qpriv->r_tid_alloc];
3551 		if (e->opcode != TID_OP(WRITE_REQ))
3552 			goto next_req;
3553 		req = ack_to_tid_req(e);
3554 		trace_hfi1_tid_req_write_alloc_res(qp, 0, e->opcode, e->psn,
3555 						   e->lpsn, req);
3556 		/* Finished allocating for all segments of this request */
3557 		if (req->alloc_seg >= req->total_segs)
3558 			goto next_req;
3559 
3560 		/* Can allocate only a maximum of local->max_write for a QP */
3561 		if (qpriv->alloc_w_segs >= local->max_write)
3562 			break;
3563 
3564 		/* Don't allocate at a sync point with data packets pending */
3565 		if (qpriv->sync_pt && qpriv->alloc_w_segs)
3566 			break;
3567 
3568 		/* All data received at the sync point, continue */
3569 		if (qpriv->sync_pt && !qpriv->alloc_w_segs) {
3570 			hfi1_kern_clear_hw_flow(rcd, qp);
3571 			qpriv->sync_pt = false;
3572 			qpriv->s_flags &= ~HFI1_R_TID_SW_PSN;
3573 		}
3574 
3575 		/* Allocate flow if we don't have one */
3576 		if (qpriv->flow_state.index >= RXE_NUM_TID_FLOWS) {
3577 			ret = hfi1_kern_setup_hw_flow(qpriv->rcd, qp);
3578 			if (ret) {
3579 				to_seg = tid_rdma_flow_wt *
3580 					position_in_queue(qpriv,
3581 							  &rcd->flow_queue);
3582 				break;
3583 			}
3584 		}
3585 
3586 		npkts = rvt_div_round_up_mtu(qp, req->seg_len);
3587 
3588 		/*
3589 		 * We are at a sync point if we run out of KDETH PSN space.
3590 		 * Last PSN of every generation is reserved for RESYNC.
3591 		 */
3592 		if (qpriv->flow_state.psn + npkts > MAX_TID_FLOW_PSN - 1) {
3593 			qpriv->sync_pt = true;
3594 			break;
3595 		}
3596 
3597 		/*
		 * If overtaking req->acked_tail, send an RNR NAK. Because the
		 * QP is not queued in this case, and the issue can only be
		 * caused by a delay in scheduling the second leg, which we
		 * cannot estimate, we use a rather arbitrary RNR timeout of
		 * (MAX_FLOWS / 2) segments.
3603 		 */
3604 		if (!CIRC_SPACE(req->setup_head, req->acked_tail,
3605 				MAX_FLOWS)) {
3606 			ret = -EAGAIN;
3607 			to_seg = MAX_FLOWS >> 1;
3608 			qpriv->s_flags |= RVT_S_ACK_PENDING;
3609 			hfi1_schedule_tid_send(qp);
3610 			break;
3611 		}
3612 
3613 		/* Try to allocate rcv array / TID entries */
3614 		ret = hfi1_kern_exp_rcv_setup(req, &req->ss, &last);
3615 		if (ret == -EAGAIN)
3616 			to_seg = position_in_queue(qpriv, &rcd->rarr_queue);
3617 		if (ret)
3618 			break;
3619 
3620 		qpriv->alloc_w_segs++;
3621 		req->alloc_seg++;
3622 		continue;
3623 next_req:
3624 		/* Begin processing the next request */
3625 		if (++qpriv->r_tid_alloc >
3626 		    rvt_size_atomic(ib_to_rvt(qp->ibqp.device)))
3627 			qpriv->r_tid_alloc = 0;
3628 	}
3629 
3630 	/*
	 * Schedule an RNR NAK to be sent if (a) flow or rcv array allocation
	 * has failed, (b) we are called from the receive handler interrupt
	 * context, and (c) an RNR NAK has not already been scheduled.
3634 	 */
3635 	if (ret == -EAGAIN && intr_ctx && !qp->r_nak_state)
3636 		goto send_rnr_nak;
3637 
3638 	return;
3639 
3640 send_rnr_nak:
3641 	lockdep_assert_held(&qp->r_lock);
3642 
	/* Set r_nak_state to prevent unrelated events from generating NAKs */
3644 	qp->r_nak_state = hfi1_compute_tid_rnr_timeout(qp, to_seg) | IB_RNR_NAK;
3645 
3646 	/* Pull back r_psn to the segment being RNR NAK'd */
3647 	qp->r_psn = e->psn + req->alloc_seg;
3648 	qp->r_ack_psn = qp->r_psn;
3649 	/*
3650 	 * Pull back r_head_ack_queue to the ack entry following the request
3651 	 * being RNR NAK'd. This allows resources to be allocated to the request
3652 	 * if the queued QP is scheduled.
3653 	 */
3654 	qp->r_head_ack_queue = qpriv->r_tid_alloc + 1;
3655 	if (qp->r_head_ack_queue > rvt_size_atomic(ib_to_rvt(qp->ibqp.device)))
3656 		qp->r_head_ack_queue = 0;
3657 	qpriv->r_tid_head = qp->r_head_ack_queue;
3658 	/*
3659 	 * These send side fields are used in make_rc_ack(). They are set in
3660 	 * hfi1_send_rc_ack() but must be set here before dropping qp->s_lock
3661 	 * for consistency
3662 	 */
3663 	qp->s_nak_state = qp->r_nak_state;
3664 	qp->s_ack_psn = qp->r_ack_psn;
3665 	/*
3666 	 * Clear the ACK PENDING flag to prevent unwanted ACK because we
3667 	 * have modified qp->s_ack_psn here.
3668 	 */
3669 	qp->s_flags &= ~(RVT_S_ACK_PENDING);
3670 
3671 	trace_hfi1_rsp_tid_write_alloc_res(qp, qp->r_psn);
3672 	/*
3673 	 * qpriv->rnr_nak_state is used to determine when the scheduled RNR NAK
	 * has actually been sent. The RVT_S_ACK_PENDING bit in qp->s_flags
	 * cannot be used for this because qp->s_lock is dropped before
	 * calling hfi1_send_rc_ack(), leading to inconsistency between the
	 * receive interrupt handlers and the send thread in make_rc_ack().
3678 	 */
3679 	qpriv->rnr_nak_state = TID_RNR_NAK_SEND;
3680 
3681 	/*
	 * Schedule the RNR NAK to be sent. RNR NAKs are scheduled from the
	 * receive interrupt handlers but are sent from the send engine behind
	 * any previous responses that may have been scheduled.
3685 	 */
3686 	rc_defered_ack(rcd, qp);
3687 }
3688 
3689 void hfi1_rc_rcv_tid_rdma_write_req(struct hfi1_packet *packet)
3690 {
	/* HANDLER FOR TID RDMA WRITE REQUEST packet (Responder side) */
3692 
3693 	/*
3694 	 * 1. Verify TID RDMA WRITE REQ as per IB_OPCODE_RC_RDMA_WRITE_FIRST
3695 	 *    (see hfi1_rc_rcv())
3696 	 *     - Don't allow 0-length requests.
	 * 2. Put TID RDMA WRITE REQ into the response queue (s_ack_queue)
	 *     - Setup struct tid_rdma_req with request info
	 *     - Prepare struct tid_rdma_flow array?
	 * 3. Set qp->s_ack_state as per the state diagram in the design doc.
3701 	 * 4. Set RVT_S_RESP_PENDING in s_flags.
3702 	 * 5. Kick the send engine (hfi1_schedule_send())
3703 	 */
3704 	struct hfi1_ctxtdata *rcd = packet->rcd;
3705 	struct rvt_qp *qp = packet->qp;
3706 	struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
3707 	struct ib_other_headers *ohdr = packet->ohdr;
3708 	struct rvt_ack_entry *e;
3709 	unsigned long flags;
3710 	struct ib_reth *reth;
3711 	struct hfi1_qp_priv *qpriv = qp->priv;
3712 	struct tid_rdma_request *req;
3713 	u32 bth0, psn, len, rkey, num_segs;
3714 	bool fecn;
3715 	u8 next;
3716 	u64 vaddr;
3717 	int diff;
3718 
3719 	bth0 = be32_to_cpu(ohdr->bth[0]);
3720 	if (hfi1_ruc_check_hdr(ibp, packet))
3721 		return;
3722 
3723 	fecn = process_ecn(qp, packet);
3724 	psn = mask_psn(be32_to_cpu(ohdr->bth[2]));
3725 	trace_hfi1_rsp_rcv_tid_write_req(qp, psn);
3726 
3727 	if (qp->state == IB_QPS_RTR && !(qp->r_flags & RVT_R_COMM_EST))
3728 		rvt_comm_est(qp);
3729 
3730 	if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
3731 		goto nack_inv;
3732 
3733 	reth = &ohdr->u.tid_rdma.w_req.reth;
3734 	vaddr = be64_to_cpu(reth->vaddr);
3735 	len = be32_to_cpu(reth->length);
3736 
3737 	num_segs = DIV_ROUND_UP(len, qpriv->tid_rdma.local.max_len);
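	/*
	 * Each segment consumes one IB PSN, so this request will occupy
	 * PSNs [psn, psn + num_segs - 1].
	 */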
3738 	diff = delta_psn(psn, qp->r_psn);
3739 	if (unlikely(diff)) {
3740 		tid_rdma_rcv_err(packet, ohdr, qp, psn, diff, fecn);
3741 		return;
3742 	}
3743 
3744 	/*
3745 	 * The resent request which was previously RNR NAK'd is inserted at the
3746 	 * location of the original request, which is one entry behind
3747 	 * r_head_ack_queue
3748 	 */
3749 	if (qpriv->rnr_nak_state)
3750 		qp->r_head_ack_queue = qp->r_head_ack_queue ?
3751 			qp->r_head_ack_queue - 1 :
3752 			rvt_size_atomic(ib_to_rvt(qp->ibqp.device));
3753 
3754 	/* We've verified the request, insert it into the ack queue. */
3755 	next = qp->r_head_ack_queue + 1;
3756 	if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device)))
3757 		next = 0;
3758 	spin_lock_irqsave(&qp->s_lock, flags);
3759 	if (unlikely(next == qp->s_acked_ack_queue)) {
3760 		if (!qp->s_ack_queue[next].sent)
3761 			goto nack_inv_unlock;
3762 		update_ack_queue(qp, next);
3763 	}
3764 	e = &qp->s_ack_queue[qp->r_head_ack_queue];
3765 	req = ack_to_tid_req(e);
3766 
3767 	/* Bring previously RNR NAK'd request back to life */
3768 	if (qpriv->rnr_nak_state) {
3769 		qp->r_nak_state = 0;
3770 		qp->s_nak_state = 0;
3771 		qpriv->rnr_nak_state = TID_RNR_NAK_INIT;
3772 		qp->r_psn = e->lpsn + 1;
3773 		req->state = TID_REQUEST_INIT;
3774 		goto update_head;
3775 	}
3776 
3777 	release_rdma_sge_mr(e);
3778 
3779 	/* The length needs to be in multiples of PAGE_SIZE */
3780 	if (!len || len & ~PAGE_MASK)
3781 		goto nack_inv_unlock;
3782 
3783 	rkey = be32_to_cpu(reth->rkey);
3784 	qp->r_len = len;
3785 
3786 	if (e->opcode == TID_OP(WRITE_REQ) &&
3787 	    (req->setup_head != req->clear_tail ||
3788 	     req->clear_tail != req->acked_tail))
3789 		goto nack_inv_unlock;
3790 
3791 	if (unlikely(!rvt_rkey_ok(qp, &e->rdma_sge, qp->r_len, vaddr,
3792 				  rkey, IB_ACCESS_REMOTE_WRITE)))
3793 		goto nack_acc;
3794 
3795 	qp->r_psn += num_segs - 1;
3796 
3797 	e->opcode = (bth0 >> 24) & 0xff;
3798 	e->psn = psn;
3799 	e->lpsn = qp->r_psn;
3800 	e->sent = 0;
3801 
3802 	req->n_flows = min_t(u16, num_segs, qpriv->tid_rdma.local.max_write);
3803 	req->state = TID_REQUEST_INIT;
3804 	req->cur_seg = 0;
3805 	req->comp_seg = 0;
3806 	req->ack_seg = 0;
3807 	req->alloc_seg = 0;
3808 	req->isge = 0;
3809 	req->seg_len = qpriv->tid_rdma.local.max_len;
3810 	req->total_len = len;
3811 	req->total_segs = num_segs;
3812 	req->r_flow_psn = e->psn;
3813 	req->ss.sge = e->rdma_sge;
3814 	req->ss.num_sge = 1;
3815 
3816 	req->flow_idx = req->setup_head;
3817 	req->clear_tail = req->setup_head;
3818 	req->acked_tail = req->setup_head;
3819 
3820 	qp->r_state = e->opcode;
3821 	qp->r_nak_state = 0;
3822 	/*
3823 	 * We need to increment the MSN here instead of when we
3824 	 * finish sending the result since a duplicate request would
3825 	 * increment it more than once.
3826 	 */
3827 	qp->r_msn++;
3828 	qp->r_psn++;
3829 
3830 	trace_hfi1_tid_req_rcv_write_req(qp, 0, e->opcode, e->psn, e->lpsn,
3831 					 req);
3832 
3833 	if (qpriv->r_tid_tail == HFI1_QP_WQE_INVALID) {
3834 		qpriv->r_tid_tail = qp->r_head_ack_queue;
3835 	} else if (qpriv->r_tid_tail == qpriv->r_tid_head) {
3836 		struct tid_rdma_request *ptr;
3837 
3838 		e = &qp->s_ack_queue[qpriv->r_tid_tail];
3839 		ptr = ack_to_tid_req(e);
3840 
3841 		if (e->opcode != TID_OP(WRITE_REQ) ||
3842 		    ptr->comp_seg == ptr->total_segs) {
3843 			if (qpriv->r_tid_tail == qpriv->r_tid_ack)
3844 				qpriv->r_tid_ack = qp->r_head_ack_queue;
3845 			qpriv->r_tid_tail = qp->r_head_ack_queue;
3846 		}
3847 	}
3848 update_head:
3849 	qp->r_head_ack_queue = next;
3850 	qpriv->r_tid_head = qp->r_head_ack_queue;
3851 
3852 	hfi1_tid_write_alloc_resources(qp, true);
3853 	trace_hfi1_tid_write_rsp_rcv_req(qp);
3854 
3855 	/* Schedule the send tasklet. */
3856 	qp->s_flags |= RVT_S_RESP_PENDING;
3857 	if (fecn)
3858 		qp->s_flags |= RVT_S_ECN;
3859 	hfi1_schedule_send(qp);
3860 
3861 	spin_unlock_irqrestore(&qp->s_lock, flags);
3862 	return;
3863 
3864 nack_inv_unlock:
3865 	spin_unlock_irqrestore(&qp->s_lock, flags);
3866 nack_inv:
3867 	rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
3868 	qp->r_nak_state = IB_NAK_INVALID_REQUEST;
3869 	qp->r_ack_psn = qp->r_psn;
3870 	/* Queue NAK for later */
3871 	rc_defered_ack(rcd, qp);
3872 	return;
3873 nack_acc:
3874 	spin_unlock_irqrestore(&qp->s_lock, flags);
3875 	rvt_rc_error(qp, IB_WC_LOC_PROT_ERR);
3876 	qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR;
3877 	qp->r_ack_psn = qp->r_psn;
3878 }
3879 
3880 u32 hfi1_build_tid_rdma_write_resp(struct rvt_qp *qp, struct rvt_ack_entry *e,
3881 				   struct ib_other_headers *ohdr, u32 *bth1,
3882 				   u32 bth2, u32 *len,
3883 				   struct rvt_sge_state **ss)
3884 {
3885 	struct hfi1_ack_priv *epriv = e->priv;
3886 	struct tid_rdma_request *req = &epriv->tid_req;
3887 	struct hfi1_qp_priv *qpriv = qp->priv;
3888 	struct tid_rdma_flow *flow = NULL;
3889 	u32 resp_len = 0, hdwords = 0;
3890 	void *resp_addr = NULL;
3891 	struct tid_rdma_params *remote;
3892 
3893 	trace_hfi1_tid_req_build_write_resp(qp, 0, e->opcode, e->psn, e->lpsn,
3894 					    req);
3895 	trace_hfi1_tid_write_rsp_build_resp(qp);
3896 	trace_hfi1_rsp_build_tid_write_resp(qp, bth2);
3897 	flow = &req->flows[req->flow_idx];
3898 	switch (req->state) {
3899 	default:
3900 		/*
		 * Try to allocate resources here in case the QP was queued
		 * and was later scheduled when resources became available.
3903 		 */
3904 		hfi1_tid_write_alloc_resources(qp, false);
3905 
3906 		/* We've already sent everything which is ready */
3907 		if (req->cur_seg >= req->alloc_seg)
3908 			goto done;
3909 
3910 		/*
		 * Resources can be assigned but responses cannot be sent
		 * while in the RNR NAK state, until the resent request is
		 * received.
3913 		 */
3914 		if (qpriv->rnr_nak_state == TID_RNR_NAK_SENT)
3915 			goto done;
3916 
3917 		req->state = TID_REQUEST_ACTIVE;
3918 		trace_hfi1_tid_flow_build_write_resp(qp, req->flow_idx, flow);
3919 		req->flow_idx = CIRC_NEXT(req->flow_idx, MAX_FLOWS);
3920 		hfi1_add_tid_reap_timer(qp);
3921 		break;
3922 
3923 	case TID_REQUEST_RESEND_ACTIVE:
3924 	case TID_REQUEST_RESEND:
3925 		trace_hfi1_tid_flow_build_write_resp(qp, req->flow_idx, flow);
3926 		req->flow_idx = CIRC_NEXT(req->flow_idx, MAX_FLOWS);
3927 		if (!CIRC_CNT(req->setup_head, req->flow_idx, MAX_FLOWS))
3928 			req->state = TID_REQUEST_ACTIVE;
3929 
3930 		hfi1_mod_tid_reap_timer(qp);
3931 		break;
3932 	}
3933 	flow->flow_state.resp_ib_psn = bth2;
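	/*
	 * The response payload is the flow's TID entry list; the requester
	 * caches it and uses it to build the TID RDMA WRITE DATA packets.
	 */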
3934 	resp_addr = (void *)flow->tid_entry;
3935 	resp_len = sizeof(*flow->tid_entry) * flow->tidcnt;
3936 	req->cur_seg++;
3937 
3938 	memset(&ohdr->u.tid_rdma.w_rsp, 0, sizeof(ohdr->u.tid_rdma.w_rsp));
3939 	epriv->ss.sge.vaddr = resp_addr;
3940 	epriv->ss.sge.sge_length = resp_len;
3941 	epriv->ss.sge.length = epriv->ss.sge.sge_length;
3942 	/*
3943 	 * We can safely zero these out. Since the first SGE covers the
3944 	 * entire packet, nothing else should even look at the MR.
3945 	 */
3946 	epriv->ss.sge.mr = NULL;
3947 	epriv->ss.sge.m = 0;
3948 	epriv->ss.sge.n = 0;
3949 
3950 	epriv->ss.sg_list = NULL;
3951 	epriv->ss.total_len = epriv->ss.sge.sge_length;
3952 	epriv->ss.num_sge = 1;
3953 
3954 	*ss = &epriv->ss;
3955 	*len = epriv->ss.total_len;
3956 
3957 	/* Construct the TID RDMA WRITE RESP packet header */
3958 	rcu_read_lock();
3959 	remote = rcu_dereference(qpriv->tid_rdma.remote);
3960 
3961 	KDETH_RESET(ohdr->u.tid_rdma.w_rsp.kdeth0, KVER, 0x1);
3962 	KDETH_RESET(ohdr->u.tid_rdma.w_rsp.kdeth1, JKEY, remote->jkey);
3963 	ohdr->u.tid_rdma.w_rsp.aeth = rvt_compute_aeth(qp);
3964 	ohdr->u.tid_rdma.w_rsp.tid_flow_psn =
3965 		cpu_to_be32((flow->flow_state.generation <<
3966 			     HFI1_KDETH_BTH_SEQ_SHIFT) |
3967 			    (flow->flow_state.spsn &
3968 			     HFI1_KDETH_BTH_SEQ_MASK));
3969 	ohdr->u.tid_rdma.w_rsp.tid_flow_qp =
3970 		cpu_to_be32(qpriv->tid_rdma.local.qp |
3971 			    ((flow->idx & TID_RDMA_DESTQP_FLOW_MASK) <<
3972 			     TID_RDMA_DESTQP_FLOW_SHIFT) |
3973 			    qpriv->rcd->ctxt);
3974 	ohdr->u.tid_rdma.w_rsp.verbs_qp = cpu_to_be32(qp->remote_qpn);
3975 	*bth1 = remote->qp;
3976 	rcu_read_unlock();
3977 	hdwords = sizeof(ohdr->u.tid_rdma.w_rsp) / sizeof(u32);
3978 	qpriv->pending_tid_w_segs++;
3979 done:
3980 	return hdwords;
3981 }
3982 
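/*
 * The TID "reap" timer guards the responder's HW flow and RcvArray
 * resources: if the requester stops sending TID RDMA WRITE DATA packets,
 * hfi1_tid_timeout() reclaims the resources and moves the QP to the error
 * state.
 */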
3983 static void hfi1_add_tid_reap_timer(struct rvt_qp *qp)
3984 {
3985 	struct hfi1_qp_priv *qpriv = qp->priv;
3986 
3987 	lockdep_assert_held(&qp->s_lock);
3988 	if (!(qpriv->s_flags & HFI1_R_TID_RSC_TIMER)) {
3989 		qpriv->s_flags |= HFI1_R_TID_RSC_TIMER;
3990 		qpriv->s_tid_timer.expires = jiffies +
3991 			qpriv->tid_timer_timeout_jiffies;
3992 		add_timer(&qpriv->s_tid_timer);
3993 	}
3994 }
3995 
3996 static void hfi1_mod_tid_reap_timer(struct rvt_qp *qp)
3997 {
3998 	struct hfi1_qp_priv *qpriv = qp->priv;
3999 
4000 	lockdep_assert_held(&qp->s_lock);
4001 	qpriv->s_flags |= HFI1_R_TID_RSC_TIMER;
4002 	mod_timer(&qpriv->s_tid_timer, jiffies +
4003 		  qpriv->tid_timer_timeout_jiffies);
4004 }
4005 
4006 static int hfi1_stop_tid_reap_timer(struct rvt_qp *qp)
4007 {
4008 	struct hfi1_qp_priv *qpriv = qp->priv;
4009 	int rval = 0;
4010 
4011 	lockdep_assert_held(&qp->s_lock);
4012 	if (qpriv->s_flags & HFI1_R_TID_RSC_TIMER) {
4013 		rval = del_timer(&qpriv->s_tid_timer);
4014 		qpriv->s_flags &= ~HFI1_R_TID_RSC_TIMER;
4015 	}
4016 	return rval;
4017 }
4018 
4019 void hfi1_del_tid_reap_timer(struct rvt_qp *qp)
4020 {
4021 	struct hfi1_qp_priv *qpriv = qp->priv;
4022 
4023 	del_timer_sync(&qpriv->s_tid_timer);
4024 	qpriv->s_flags &= ~HFI1_R_TID_RSC_TIMER;
4025 }
4026 
4027 static void hfi1_tid_timeout(struct timer_list *t)
4028 {
4029 	struct hfi1_qp_priv *qpriv = from_timer(qpriv, t, s_tid_timer);
4030 	struct rvt_qp *qp = qpriv->owner;
4031 	struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device);
4032 	unsigned long flags;
4033 	u32 i;
4034 
4035 	spin_lock_irqsave(&qp->r_lock, flags);
4036 	spin_lock(&qp->s_lock);
4037 	if (qpriv->s_flags & HFI1_R_TID_RSC_TIMER) {
4038 		dd_dev_warn(dd_from_ibdev(qp->ibqp.device), "[QP%u] %s %d\n",
4039 			    qp->ibqp.qp_num, __func__, __LINE__);
4040 		trace_hfi1_msg_tid_timeout(/* msg */
4041 			qp, "resource timeout = ",
4042 			(u64)qpriv->tid_timer_timeout_jiffies);
4043 		hfi1_stop_tid_reap_timer(qp);
4044 		/*
		 * Go through the entire ack queue and clear any outstanding
4046 		 * HW flow and RcvArray resources.
4047 		 */
4048 		hfi1_kern_clear_hw_flow(qpriv->rcd, qp);
4049 		for (i = 0; i < rvt_max_atomic(rdi); i++) {
4050 			struct tid_rdma_request *req =
4051 				ack_to_tid_req(&qp->s_ack_queue[i]);
4052 
4053 			hfi1_kern_exp_rcv_clear_all(req);
4054 		}
4055 		spin_unlock(&qp->s_lock);
4056 		if (qp->ibqp.event_handler) {
4057 			struct ib_event ev;
4058 
4059 			ev.device = qp->ibqp.device;
4060 			ev.element.qp = &qp->ibqp;
4061 			ev.event = IB_EVENT_QP_FATAL;
4062 			qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
4063 		}
4064 		rvt_rc_error(qp, IB_WC_RESP_TIMEOUT_ERR);
4065 		goto unlock_r_lock;
4066 	}
4067 	spin_unlock(&qp->s_lock);
4068 unlock_r_lock:
4069 	spin_unlock_irqrestore(&qp->r_lock, flags);
4070 }
4071 
4072 void hfi1_rc_rcv_tid_rdma_write_resp(struct hfi1_packet *packet)
4073 {
	/* HANDLER FOR TID RDMA WRITE RESPONSE packet (Requester side) */
4075 
4076 	/*
4077 	 * 1. Find matching SWQE
4078 	 * 2. Check that TIDENTRY array has enough space for a complete
4079 	 *    segment. If not, put QP in error state.
4080 	 * 3. Save response data in struct tid_rdma_req and struct tid_rdma_flow
4081 	 * 4. Remove HFI1_S_WAIT_TID_RESP from s_flags.
4082 	 * 5. Set qp->s_state
4083 	 * 6. Kick the send engine (hfi1_schedule_send())
4084 	 */
4085 	struct ib_other_headers *ohdr = packet->ohdr;
4086 	struct rvt_qp *qp = packet->qp;
4087 	struct hfi1_qp_priv *qpriv = qp->priv;
4088 	struct hfi1_ctxtdata *rcd = packet->rcd;
4089 	struct rvt_swqe *wqe;
4090 	struct tid_rdma_request *req;
4091 	struct tid_rdma_flow *flow;
4092 	enum ib_wc_status status;
4093 	u32 opcode, aeth, psn, flow_psn, i, tidlen = 0, pktlen;
4094 	bool fecn;
4095 	unsigned long flags;
4096 
4097 	fecn = process_ecn(qp, packet);
4098 	psn = mask_psn(be32_to_cpu(ohdr->bth[2]));
4099 	aeth = be32_to_cpu(ohdr->u.tid_rdma.w_rsp.aeth);
4100 	opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff;
4101 
4102 	spin_lock_irqsave(&qp->s_lock, flags);
4103 
4104 	/* Ignore invalid responses */
4105 	if (cmp_psn(psn, qp->s_next_psn) >= 0)
4106 		goto ack_done;
4107 
4108 	/* Ignore duplicate responses. */
4109 	if (unlikely(cmp_psn(psn, qp->s_last_psn) <= 0))
4110 		goto ack_done;
4111 
4112 	if (unlikely(qp->s_acked == qp->s_tail))
4113 		goto ack_done;
4114 
4115 	/*
4116 	 * If we are waiting for a particular packet sequence number
4117 	 * due to a request being resent, check for it. Otherwise,
4118 	 * ensure that we haven't missed anything.
4119 	 */
4120 	if (qp->r_flags & RVT_R_RDMAR_SEQ) {
4121 		if (cmp_psn(psn, qp->s_last_psn + 1) != 0)
4122 			goto ack_done;
4123 		qp->r_flags &= ~RVT_R_RDMAR_SEQ;
4124 	}
4125 
4126 	wqe = rvt_get_swqe_ptr(qp, qpriv->s_tid_cur);
4127 	if (unlikely(wqe->wr.opcode != IB_WR_TID_RDMA_WRITE))
4128 		goto ack_op_err;
4129 
4130 	req = wqe_to_tid_req(wqe);
4131 	/*
4132 	 * If we've lost ACKs and our acked_tail pointer is too far
4133 	 * behind, don't overwrite segments. Just drop the packet and
4134 	 * let the reliability protocol take care of it.
4135 	 */
4136 	if (!CIRC_SPACE(req->setup_head, req->acked_tail, MAX_FLOWS))
4137 		goto ack_done;
4138 
4139 	/*
4140 	 * The call to do_rc_ack() should be last in the chain of
4141 	 * packet checks because it will end up updating the QP state.
4142 	 * Therefore, anything that would prevent the packet from
4143 	 * being accepted as a successful response should be prior
4144 	 * to it.
4145 	 */
4146 	if (!do_rc_ack(qp, aeth, psn, opcode, 0, rcd))
4147 		goto ack_done;
4148 
4149 	trace_hfi1_ack(qp, psn);
4150 
4151 	flow = &req->flows[req->setup_head];
4152 	flow->pkt = 0;
4153 	flow->tid_idx = 0;
4154 	flow->tid_offset = 0;
4155 	flow->sent = 0;
4156 	flow->resync_npkts = 0;
4157 	flow->tid_qpn = be32_to_cpu(ohdr->u.tid_rdma.w_rsp.tid_flow_qp);
4158 	flow->idx = (flow->tid_qpn >> TID_RDMA_DESTQP_FLOW_SHIFT) &
4159 		TID_RDMA_DESTQP_FLOW_MASK;
4160 	flow_psn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.w_rsp.tid_flow_psn));
4161 	flow->flow_state.generation = flow_psn >> HFI1_KDETH_BTH_SEQ_SHIFT;
4162 	flow->flow_state.spsn = flow_psn & HFI1_KDETH_BTH_SEQ_MASK;
4163 	flow->flow_state.resp_ib_psn = psn;
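	/* The last segment of the request may be shorter than seg_len. */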
4164 	flow->length = min_t(u32, req->seg_len,
4165 			     (wqe->length - (req->comp_seg * req->seg_len)));
4166 
4167 	flow->npkts = rvt_div_round_up_mtu(qp, flow->length);
4168 	flow->flow_state.lpsn = flow->flow_state.spsn +
4169 		flow->npkts - 1;
4170 	/* payload length = packet length - (header length + ICRC length) */
4171 	pktlen = packet->tlen - (packet->hlen + 4);
4172 	if (pktlen > sizeof(flow->tid_entry)) {
4173 		status = IB_WC_LOC_LEN_ERR;
4174 		goto ack_err;
4175 	}
4176 	memcpy(flow->tid_entry, packet->ebuf, pktlen);
4177 	flow->tidcnt = pktlen / sizeof(*flow->tid_entry);
4178 	trace_hfi1_tid_flow_rcv_write_resp(qp, req->setup_head, flow);
4179 
4180 	req->comp_seg++;
4181 	trace_hfi1_tid_write_sender_rcv_resp(qp, 0);
4182 	/*
4183 	 * Walk the TID_ENTRY list to make sure we have enough space for a
4184 	 * complete segment.
4185 	 */
4186 	for (i = 0; i < flow->tidcnt; i++) {
4187 		trace_hfi1_tid_entry_rcv_write_resp(/* entry */
4188 			qp, i, flow->tid_entry[i]);
4189 		if (!EXP_TID_GET(flow->tid_entry[i], LEN)) {
4190 			status = IB_WC_LOC_LEN_ERR;
4191 			goto ack_err;
4192 		}
4193 		tidlen += EXP_TID_GET(flow->tid_entry[i], LEN);
4194 	}
4195 	if (tidlen * PAGE_SIZE < flow->length) {
4196 		status = IB_WC_LOC_LEN_ERR;
4197 		goto ack_err;
4198 	}
4199 
4200 	trace_hfi1_tid_req_rcv_write_resp(qp, 0, wqe->wr.opcode, wqe->psn,
4201 					  wqe->lpsn, req);
4202 	/*
4203 	 * If this is the first response for this request, set the initial
4204 	 * flow index to the current flow.
4205 	 */
4206 	if (!cmp_psn(psn, wqe->psn)) {
4207 		req->r_last_acked = mask_psn(wqe->psn - 1);
4208 		/* Set acked flow index to head index */
4209 		req->acked_tail = req->setup_head;
4210 	}
4211 
4212 	/* advance circular buffer head */
4213 	req->setup_head = CIRC_NEXT(req->setup_head, MAX_FLOWS);
4214 	req->state = TID_REQUEST_ACTIVE;
4215 
4216 	/*
	 * If all responses for this TID RDMA WRITE request have been
	 * received, advance the pointer to the next one.
4219 	 * Since TID RDMA requests could be mixed in with regular IB requests,
4220 	 * they might not appear sequentially in the queue. Therefore, the
4221 	 * next request needs to be "found".
4222 	 */
4223 	if (qpriv->s_tid_cur != qpriv->s_tid_head &&
4224 	    req->comp_seg == req->total_segs) {
4225 		for (i = qpriv->s_tid_cur + 1; ; i++) {
4226 			if (i == qp->s_size)
4227 				i = 0;
4228 			wqe = rvt_get_swqe_ptr(qp, i);
4229 			if (i == qpriv->s_tid_head)
4230 				break;
4231 			if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE)
4232 				break;
4233 		}
4234 		qpriv->s_tid_cur = i;
4235 	}
4236 	qp->s_flags &= ~HFI1_S_WAIT_TID_RESP;
4237 	hfi1_schedule_tid_send(qp);
4238 	goto ack_done;
4239 
4240 ack_op_err:
4241 	status = IB_WC_LOC_QP_OP_ERR;
4242 ack_err:
4243 	rvt_error_qp(qp, status);
4244 ack_done:
4245 	if (fecn)
4246 		qp->s_flags |= RVT_S_ECN;
4247 	spin_unlock_irqrestore(&qp->s_lock, flags);
4248 }
4249 
4250 bool hfi1_build_tid_rdma_packet(struct rvt_swqe *wqe,
4251 				struct ib_other_headers *ohdr,
4252 				u32 *bth1, u32 *bth2, u32 *len)
4253 {
4254 	struct tid_rdma_request *req = wqe_to_tid_req(wqe);
4255 	struct tid_rdma_flow *flow = &req->flows[req->clear_tail];
4256 	struct tid_rdma_params *remote;
4257 	struct rvt_qp *qp = req->qp;
4258 	struct hfi1_qp_priv *qpriv = qp->priv;
4259 	u32 tidentry = flow->tid_entry[flow->tid_idx];
4260 	u32 tidlen = EXP_TID_GET(tidentry, LEN) << PAGE_SHIFT;
4261 	struct tid_rdma_write_data *wd = &ohdr->u.tid_rdma.w_data;
4262 	u32 next_offset, om = KDETH_OM_LARGE;
4263 	bool last_pkt;
4264 
4265 	if (!tidlen) {
4266 		hfi1_trdma_send_complete(qp, wqe, IB_WC_REM_INV_RD_REQ_ERR);
4267 		rvt_error_qp(qp, IB_WC_REM_INV_RD_REQ_ERR);
4268 	}
4269 
4270 	*len = min_t(u32, qp->pmtu, tidlen - flow->tid_offset);
4271 	flow->sent += *len;
4272 	next_offset = flow->tid_offset + *len;
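	/*
	 * This is the last packet of the segment if the final TID entry is
	 * exhausted or the entire flow length has been sent.
	 */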
4273 	last_pkt = (flow->tid_idx == (flow->tidcnt - 1) &&
4274 		    next_offset >= tidlen) || (flow->sent >= flow->length);
4275 	trace_hfi1_tid_entry_build_write_data(qp, flow->tid_idx, tidentry);
4276 	trace_hfi1_tid_flow_build_write_data(qp, req->clear_tail, flow);
4277 
4278 	rcu_read_lock();
4279 	remote = rcu_dereference(qpriv->tid_rdma.remote);
4280 	KDETH_RESET(wd->kdeth0, KVER, 0x1);
4281 	KDETH_SET(wd->kdeth0, SH, !last_pkt);
4282 	KDETH_SET(wd->kdeth0, INTR, !!(!last_pkt && remote->urg));
4283 	KDETH_SET(wd->kdeth0, TIDCTRL, EXP_TID_GET(tidentry, CTRL));
4284 	KDETH_SET(wd->kdeth0, TID, EXP_TID_GET(tidentry, IDX));
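	/*
	 * The KDETH OFFSET field is expressed in units of om; the OM bit
	 * indicates that the large offset mode is in use.
	 */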
4285 	KDETH_SET(wd->kdeth0, OM, om == KDETH_OM_LARGE);
4286 	KDETH_SET(wd->kdeth0, OFFSET, flow->tid_offset / om);
4287 	KDETH_RESET(wd->kdeth1, JKEY, remote->jkey);
4288 	wd->verbs_qp = cpu_to_be32(qp->remote_qpn);
4289 	rcu_read_unlock();
4290 
4291 	*bth1 = flow->tid_qpn;
4292 	*bth2 = mask_psn(((flow->flow_state.spsn + flow->pkt++) &
4293 			 HFI1_KDETH_BTH_SEQ_MASK) |
4294 			 (flow->flow_state.generation <<
4295 			  HFI1_KDETH_BTH_SEQ_SHIFT));
4296 	if (last_pkt) {
4297 		/* PSNs are zero-based, so +1 to count number of packets */
4298 		if (flow->flow_state.lpsn + 1 +
4299 		    rvt_div_round_up_mtu(qp, req->seg_len) >
4300 		    MAX_TID_FLOW_PSN)
4301 			req->state = TID_REQUEST_SYNC;
4302 		*bth2 |= IB_BTH_REQ_ACK;
4303 	}
4304 
4305 	if (next_offset >= tidlen) {
4306 		flow->tid_offset = 0;
4307 		flow->tid_idx++;
4308 	} else {
4309 		flow->tid_offset = next_offset;
4310 	}
4311 	return last_pkt;
4312 }
4313 
4314 void hfi1_rc_rcv_tid_rdma_write_data(struct hfi1_packet *packet)
4315 {
4316 	struct rvt_qp *qp = packet->qp;
4317 	struct hfi1_qp_priv *priv = qp->priv;
4318 	struct hfi1_ctxtdata *rcd = priv->rcd;
4319 	struct ib_other_headers *ohdr = packet->ohdr;
4320 	struct rvt_ack_entry *e;
4321 	struct tid_rdma_request *req;
4322 	struct tid_rdma_flow *flow;
4323 	struct hfi1_ibdev *dev = to_idev(qp->ibqp.device);
4324 	unsigned long flags;
4325 	u32 psn, next;
4326 	u8 opcode;
4327 	bool fecn;
4328 
4329 	fecn = process_ecn(qp, packet);
4330 	psn = mask_psn(be32_to_cpu(ohdr->bth[2]));
4331 	opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff;
4332 
4333 	/*
4334 	 * All error handling should be done by now. If we are here, the packet
	 * is either good or has been accepted by the error handler.
4336 	 */
4337 	spin_lock_irqsave(&qp->s_lock, flags);
4338 	e = &qp->s_ack_queue[priv->r_tid_tail];
4339 	req = ack_to_tid_req(e);
4340 	flow = &req->flows[req->clear_tail];
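	/*
	 * A PSN other than the last PSN of the flow means the segment is not
	 * yet complete: track the next expected PSN (and copy the payload for
	 * FECN-marked eager packets). Segment completion is handled below.
	 */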
4341 	if (cmp_psn(psn, full_flow_psn(flow, flow->flow_state.lpsn))) {
4342 		update_r_next_psn_fecn(packet, priv, rcd, flow, fecn);
4343 
4344 		if (cmp_psn(psn, flow->flow_state.r_next_psn))
4345 			goto send_nak;
4346 
4347 		flow->flow_state.r_next_psn = mask_psn(psn + 1);
4348 		/*
		 * Copy the payload to the destination buffer if this packet
		 * is delivered as an eager packet due to the RSM rule and
		 * FECN. The RSM rule matches on the FECN bit in the BTH and
		 * the SH bit in the KDETH header, and therefore will not
		 * match the last packet of each segment, which has the SH
		 * bit cleared.
4354 		 */
4355 		if (fecn && packet->etype == RHF_RCV_TYPE_EAGER) {
4356 			struct rvt_sge_state ss;
4357 			u32 len;
4358 			u32 tlen = packet->tlen;
4359 			u16 hdrsize = packet->hlen;
4360 			u8 pad = packet->pad;
4361 			u8 extra_bytes = pad + packet->extra_byte +
4362 				(SIZE_OF_CRC << 2);
4363 			u32 pmtu = qp->pmtu;
4364 
4365 			if (unlikely(tlen != (hdrsize + pmtu + extra_bytes)))
4366 				goto send_nak;
4367 			len = req->comp_seg * req->seg_len;
4368 			len += delta_psn(psn,
4369 				full_flow_psn(flow, flow->flow_state.spsn)) *
4370 				pmtu;
4371 			if (unlikely(req->total_len - len < pmtu))
4372 				goto send_nak;
4373 
4374 			/*
4375 			 * The e->rdma_sge field is set when TID RDMA WRITE REQ
4376 			 * is first received and is never modified thereafter.
4377 			 */
4378 			ss.sge = e->rdma_sge;
4379 			ss.sg_list = NULL;
4380 			ss.num_sge = 1;
4381 			ss.total_len = req->total_len;
4382 			rvt_skip_sge(&ss, len, false);
4383 			rvt_copy_sge(qp, &ss, packet->payload, pmtu, false,
4384 				     false);
4385 			/* Raise the sw sequence check flag for next packet */
4386 			priv->r_next_psn_kdeth = mask_psn(psn + 1);
4387 			priv->s_flags |= HFI1_R_TID_SW_PSN;
4388 		}
4389 		goto exit;
4390 	}
4391 	flow->flow_state.r_next_psn = mask_psn(psn + 1);
4392 	hfi1_kern_exp_rcv_clear(req);
4393 	priv->alloc_w_segs--;
4394 	rcd->flows[flow->idx].psn = psn & HFI1_KDETH_BTH_SEQ_MASK;
4395 	req->comp_seg++;
4396 	priv->s_nak_state = 0;
4397 
4398 	/*
4399 	 * Release the flow if one of the following conditions has been met:
4400 	 *  - The request has reached a sync point AND all outstanding
4401 	 *    segments have been completed, or
4402 	 *  - The entire request is complete and there are no more requests
4403 	 *    (of any kind) in the queue.
4404 	 */
4405 	trace_hfi1_rsp_rcv_tid_write_data(qp, psn);
4406 	trace_hfi1_tid_req_rcv_write_data(qp, 0, e->opcode, e->psn, e->lpsn,
4407 					  req);
4408 	trace_hfi1_tid_write_rsp_rcv_data(qp);
4409 	if (priv->r_tid_ack == HFI1_QP_WQE_INVALID)
4410 		priv->r_tid_ack = priv->r_tid_tail;
4411 
4412 	if (opcode == TID_OP(WRITE_DATA_LAST)) {
4413 		release_rdma_sge_mr(e);
4414 		for (next = priv->r_tid_tail + 1; ; next++) {
4415 			if (next > rvt_size_atomic(&dev->rdi))
4416 				next = 0;
4417 			if (next == priv->r_tid_head)
4418 				break;
4419 			e = &qp->s_ack_queue[next];
4420 			if (e->opcode == TID_OP(WRITE_REQ))
4421 				break;
4422 		}
4423 		priv->r_tid_tail = next;
4424 		if (++qp->s_acked_ack_queue > rvt_size_atomic(&dev->rdi))
4425 			qp->s_acked_ack_queue = 0;
4426 	}
4427 
4428 	hfi1_tid_write_alloc_resources(qp, true);
4429 
4430 	/*
4431 	 * If we need to generate more responses, schedule the
4432 	 * send engine.
4433 	 */
4434 	if (req->cur_seg < req->total_segs ||
4435 	    qp->s_tail_ack_queue != qp->r_head_ack_queue) {
4436 		qp->s_flags |= RVT_S_RESP_PENDING;
4437 		hfi1_schedule_send(qp);
4438 	}
4439 
4440 	priv->pending_tid_w_segs--;
4441 	if (priv->s_flags & HFI1_R_TID_RSC_TIMER) {
4442 		if (priv->pending_tid_w_segs)
4443 			hfi1_mod_tid_reap_timer(req->qp);
4444 		else
4445 			hfi1_stop_tid_reap_timer(req->qp);
4446 	}
4447 
4448 done:
4449 	priv->s_flags |= RVT_S_ACK_PENDING;
4450 	hfi1_schedule_tid_send(qp);
4451 exit:
4452 	priv->r_next_psn_kdeth = flow->flow_state.r_next_psn;
4453 	if (fecn)
4454 		qp->s_flags |= RVT_S_ECN;
4455 	spin_unlock_irqrestore(&qp->s_lock, flags);
4456 	return;
4457 
4458 send_nak:
4459 	if (!priv->s_nak_state) {
4460 		priv->s_nak_state = IB_NAK_PSN_ERROR;
4461 		priv->s_nak_psn = flow->flow_state.r_next_psn;
4462 		priv->s_flags |= RVT_S_ACK_PENDING;
4463 		if (priv->r_tid_ack == HFI1_QP_WQE_INVALID)
4464 			priv->r_tid_ack = priv->r_tid_tail;
4465 		hfi1_schedule_tid_send(qp);
4466 	}
4467 	goto done;
4468 }
4469 
4470 static bool hfi1_tid_rdma_is_resync_psn(u32 psn)
4471 {
4472 	return (bool)((psn & HFI1_KDETH_BTH_SEQ_MASK) ==
4473 		      HFI1_KDETH_BTH_SEQ_MASK);
4474 }
4475 
4476 u32 hfi1_build_tid_rdma_write_ack(struct rvt_qp *qp, struct rvt_ack_entry *e,
4477 				  struct ib_other_headers *ohdr, u16 iflow,
4478 				  u32 *bth1, u32 *bth2)
4479 {
4480 	struct hfi1_qp_priv *qpriv = qp->priv;
4481 	struct tid_flow_state *fs = &qpriv->flow_state;
4482 	struct tid_rdma_request *req = ack_to_tid_req(e);
4483 	struct tid_rdma_flow *flow = &req->flows[iflow];
4484 	struct tid_rdma_params *remote;
4485 
4486 	rcu_read_lock();
4487 	remote = rcu_dereference(qpriv->tid_rdma.remote);
4488 	KDETH_RESET(ohdr->u.tid_rdma.ack.kdeth1, JKEY, remote->jkey);
4489 	ohdr->u.tid_rdma.ack.verbs_qp = cpu_to_be32(qp->remote_qpn);
4490 	*bth1 = remote->qp;
4491 	rcu_read_unlock();
4492 
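	/*
	 * BTH.PSN carries the RESYNC PSN when acking a RESYNC, the NAK'd PSN
	 * when a NAK is pending, or the last flow PSN of the segment being
	 * acked otherwise.
	 */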
4493 	if (qpriv->resync) {
4494 		*bth2 = mask_psn((fs->generation <<
4495 				  HFI1_KDETH_BTH_SEQ_SHIFT) - 1);
4496 		ohdr->u.tid_rdma.ack.aeth = rvt_compute_aeth(qp);
4497 	} else if (qpriv->s_nak_state) {
4498 		*bth2 = mask_psn(qpriv->s_nak_psn);
4499 		ohdr->u.tid_rdma.ack.aeth =
4500 			cpu_to_be32((qp->r_msn & IB_MSN_MASK) |
4501 				    (qpriv->s_nak_state <<
4502 				     IB_AETH_CREDIT_SHIFT));
4503 	} else {
4504 		*bth2 = full_flow_psn(flow, flow->flow_state.lpsn);
4505 		ohdr->u.tid_rdma.ack.aeth = rvt_compute_aeth(qp);
4506 	}
4507 	KDETH_RESET(ohdr->u.tid_rdma.ack.kdeth0, KVER, 0x1);
4508 	ohdr->u.tid_rdma.ack.tid_flow_qp =
4509 		cpu_to_be32(qpriv->tid_rdma.local.qp |
4510 			    ((flow->idx & TID_RDMA_DESTQP_FLOW_MASK) <<
4511 			     TID_RDMA_DESTQP_FLOW_SHIFT) |
4512 			    qpriv->rcd->ctxt);
4513 
4514 	ohdr->u.tid_rdma.ack.tid_flow_psn = 0;
4515 	ohdr->u.tid_rdma.ack.verbs_psn =
4516 		cpu_to_be32(flow->flow_state.resp_ib_psn);
4517 
4518 	if (qpriv->resync) {
4519 		/*
		 * If the PSN before the current expected KDETH PSN is the
4521 		 * RESYNC PSN, then we never received a good TID RDMA WRITE
4522 		 * DATA packet after a previous RESYNC.
4523 		 * In this case, the next expected KDETH PSN stays the same.
4524 		 */
4525 		if (hfi1_tid_rdma_is_resync_psn(qpriv->r_next_psn_kdeth - 1)) {
4526 			ohdr->u.tid_rdma.ack.tid_flow_psn =
4527 				cpu_to_be32(qpriv->r_next_psn_kdeth_save);
4528 		} else {
4529 			/*
4530 			 * Because the KDETH PSNs jump during a RESYNC, it's
4531 			 * not possible to infer (or compute) the previous value
4532 			 * of r_next_psn_kdeth in the case of back-to-back
4533 			 * RESYNC packets. Therefore, we save it.
4534 			 */
4535 			qpriv->r_next_psn_kdeth_save =
4536 				qpriv->r_next_psn_kdeth - 1;
4537 			ohdr->u.tid_rdma.ack.tid_flow_psn =
4538 				cpu_to_be32(qpriv->r_next_psn_kdeth_save);
4539 			qpriv->r_next_psn_kdeth = mask_psn(*bth2 + 1);
4540 		}
4541 		qpriv->resync = false;
4542 	}
4543 
4544 	return sizeof(ohdr->u.tid_rdma.ack) / sizeof(u32);
4545 }
4546 
4547 void hfi1_rc_rcv_tid_rdma_ack(struct hfi1_packet *packet)
4548 {
4549 	struct ib_other_headers *ohdr = packet->ohdr;
4550 	struct rvt_qp *qp = packet->qp;
4551 	struct hfi1_qp_priv *qpriv = qp->priv;
4552 	struct rvt_swqe *wqe;
4553 	struct tid_rdma_request *req;
4554 	struct tid_rdma_flow *flow;
4555 	u32 aeth, psn, req_psn, ack_psn, fspsn, resync_psn, ack_kpsn;
4556 	unsigned long flags;
4557 	u16 fidx;
4558 
4559 	trace_hfi1_tid_write_sender_rcv_tid_ack(qp, 0);
4560 	process_ecn(qp, packet);
4561 	psn = mask_psn(be32_to_cpu(ohdr->bth[2]));
4562 	aeth = be32_to_cpu(ohdr->u.tid_rdma.ack.aeth);
4563 	req_psn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.ack.verbs_psn));
4564 	resync_psn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.ack.tid_flow_psn));
4565 
4566 	spin_lock_irqsave(&qp->s_lock, flags);
4567 	trace_hfi1_rcv_tid_ack(qp, aeth, psn, req_psn, resync_psn);
4568 
4569 	/* If we are waiting for an ACK to RESYNC, drop any other packets */
4570 	if ((qp->s_flags & HFI1_S_WAIT_HALT) &&
4571 	    cmp_psn(psn, qpriv->s_resync_psn))
4572 		goto ack_op_err;
4573 
4574 	ack_psn = req_psn;
4575 	if (hfi1_tid_rdma_is_resync_psn(psn))
4576 		ack_kpsn = resync_psn;
4577 	else
4578 		ack_kpsn = psn;
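	/*
	 * For a NAK (non-zero syndrome), the indicated PSN itself has not
	 * been received successfully; only the PSNs before it are acked.
	 */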
4579 	if (aeth >> 29) {
4580 		ack_psn--;
4581 		ack_kpsn--;
4582 	}
4583 
4584 	wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
4585 
4586 	if (wqe->wr.opcode != IB_WR_TID_RDMA_WRITE)
4587 		goto ack_op_err;
4588 
4589 	req = wqe_to_tid_req(wqe);
4590 	trace_hfi1_tid_req_rcv_tid_ack(qp, 0, wqe->wr.opcode, wqe->psn,
4591 				       wqe->lpsn, req);
4592 	flow = &req->flows[req->acked_tail];
4593 	trace_hfi1_tid_flow_rcv_tid_ack(qp, req->acked_tail, flow);
4594 
4595 	/* Drop stale ACK/NAK */
4596 	if (cmp_psn(psn, full_flow_psn(flow, flow->flow_state.spsn)) < 0)
4597 		goto ack_op_err;
4598 
4599 	while (cmp_psn(ack_kpsn,
4600 		       full_flow_psn(flow, flow->flow_state.lpsn)) >= 0 &&
4601 	       req->ack_seg < req->cur_seg) {
4602 		req->ack_seg++;
4603 		/* advance acked segment pointer */
4604 		req->acked_tail = CIRC_NEXT(req->acked_tail, MAX_FLOWS);
4605 		req->r_last_acked = flow->flow_state.resp_ib_psn;
4606 		trace_hfi1_tid_req_rcv_tid_ack(qp, 0, wqe->wr.opcode, wqe->psn,
4607 					       wqe->lpsn, req);
4608 		if (req->ack_seg == req->total_segs) {
4609 			req->state = TID_REQUEST_COMPLETE;
4610 			wqe = do_rc_completion(qp, wqe,
4611 					       to_iport(qp->ibqp.device,
4612 							qp->port_num));
4613 			trace_hfi1_sender_rcv_tid_ack(qp);
4614 			atomic_dec(&qpriv->n_tid_requests);
4615 			if (qp->s_acked == qp->s_tail)
4616 				break;
4617 			if (wqe->wr.opcode != IB_WR_TID_RDMA_WRITE)
4618 				break;
4619 			req = wqe_to_tid_req(wqe);
4620 		}
4621 		flow = &req->flows[req->acked_tail];
4622 		trace_hfi1_tid_flow_rcv_tid_ack(qp, req->acked_tail, flow);
4623 	}
4624 
4625 	trace_hfi1_tid_req_rcv_tid_ack(qp, 0, wqe->wr.opcode, wqe->psn,
4626 				       wqe->lpsn, req);
4627 	switch (aeth >> 29) {
4628 	case 0:         /* ACK */
4629 		if (qpriv->s_flags & RVT_S_WAIT_ACK)
4630 			qpriv->s_flags &= ~RVT_S_WAIT_ACK;
4631 		if (!hfi1_tid_rdma_is_resync_psn(psn)) {
4632 			/* Check if there is any pending TID ACK */
4633 			if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE &&
4634 			    req->ack_seg < req->cur_seg)
4635 				hfi1_mod_tid_retry_timer(qp);
4636 			else
4637 				hfi1_stop_tid_retry_timer(qp);
4638 			hfi1_schedule_send(qp);
4639 		} else {
4640 			u32 spsn, fpsn, last_acked, generation;
4641 			struct tid_rdma_request *rptr;
4642 
4643 			/* ACK(RESYNC) */
4644 			hfi1_stop_tid_retry_timer(qp);
4645 			/* Allow new requests (see hfi1_make_tid_rdma_pkt) */
4646 			qp->s_flags &= ~HFI1_S_WAIT_HALT;
4647 			/*
			 * Clear the RVT_S_SEND_ONE flag in case the TID RDMA
			 * ACK is received after the TID retry timer has fired
			 * again. In this case, do not send any more TID
			 * RESYNC requests or wait for any more TID ACK
			 * packets.
4652 			 */
4653 			qpriv->s_flags &= ~RVT_S_SEND_ONE;
4654 			hfi1_schedule_send(qp);
4655 
4656 			if ((qp->s_acked == qpriv->s_tid_tail &&
4657 			     req->ack_seg == req->total_segs) ||
4658 			    qp->s_acked == qp->s_tail) {
4659 				qpriv->s_state = TID_OP(WRITE_DATA_LAST);
4660 				goto done;
4661 			}
4662 
4663 			if (req->ack_seg == req->comp_seg) {
4664 				qpriv->s_state = TID_OP(WRITE_DATA);
4665 				goto done;
4666 			}
4667 
4668 			/*
4669 			 * The PSN to start with is the next PSN after the
4670 			 * RESYNC PSN.
4671 			 */
4672 			psn = mask_psn(psn + 1);
4673 			generation = psn >> HFI1_KDETH_BTH_SEQ_SHIFT;
4674 			spsn = 0;
4675 
4676 			/*
4677 			 * Update to the correct WQE when we get an ACK(RESYNC)
4678 			 * in the middle of a request.
4679 			 */
4680 			if (delta_psn(ack_psn, wqe->lpsn))
4681 				wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
4682 			req = wqe_to_tid_req(wqe);
4683 			flow = &req->flows[req->acked_tail];
4684 			/*
			 * RESYNC re-numbers the PSN ranges of all remaining
			 * segments. Also, PSNs start from 0 in the middle of
			 * a segment, so the first segment is smaller than the
			 * default number of packets. flow->resync_npkts is
			 * used to track the number of packets from the start
			 * of the real segment to the point of the 0 PSN after
			 * the RESYNC, so that the SGE can later be rewound
			 * correctly.
4692 			 */
4693 			fpsn = full_flow_psn(flow, flow->flow_state.spsn);
4694 			req->r_ack_psn = psn;
4695 			flow->resync_npkts +=
4696 				delta_psn(mask_psn(resync_psn + 1), fpsn);
4697 			/*
4698 			 * Renumber all packet sequence number ranges
4699 			 * based on the new generation.
4700 			 */
4701 			last_acked = qp->s_acked;
4702 			rptr = req;
4703 			while (1) {
4704 				/* start from last acked segment */
4705 				for (fidx = rptr->acked_tail;
4706 				     CIRC_CNT(rptr->setup_head, fidx,
4707 					      MAX_FLOWS);
4708 				     fidx = CIRC_NEXT(fidx, MAX_FLOWS)) {
4709 					u32 lpsn;
4710 					u32 gen;
4711 
4712 					flow = &rptr->flows[fidx];
4713 					gen = flow->flow_state.generation;
4714 					if (WARN_ON(gen == generation &&
4715 						    flow->flow_state.spsn !=
4716 						     spsn))
4717 						continue;
4718 					lpsn = flow->flow_state.lpsn;
4719 					lpsn = full_flow_psn(flow, lpsn);
4720 					flow->npkts =
4721 						delta_psn(lpsn,
4722 							  mask_psn(resync_psn)
4723 							  );
4724 					flow->flow_state.generation =
4725 						generation;
4726 					flow->flow_state.spsn = spsn;
4727 					flow->flow_state.lpsn =
4728 						flow->flow_state.spsn +
4729 						flow->npkts - 1;
4730 					flow->pkt = 0;
4731 					spsn += flow->npkts;
4732 					resync_psn += flow->npkts;
4733 					trace_hfi1_tid_flow_rcv_tid_ack(qp,
4734 									fidx,
4735 									flow);
4736 				}
4737 				if (++last_acked == qpriv->s_tid_cur + 1)
4738 					break;
4739 				if (last_acked == qp->s_size)
4740 					last_acked = 0;
4741 				wqe = rvt_get_swqe_ptr(qp, last_acked);
4742 				rptr = wqe_to_tid_req(wqe);
4743 			}
4744 			req->cur_seg = req->ack_seg;
4745 			qpriv->s_tid_tail = qp->s_acked;
4746 			qpriv->s_state = TID_OP(WRITE_REQ);
4747 			hfi1_schedule_tid_send(qp);
4748 		}
4749 done:
4750 		qpriv->s_retry = qp->s_retry_cnt;
4751 		break;
4752 
4753 	case 3:         /* NAK */
4754 		hfi1_stop_tid_retry_timer(qp);
4755 		switch ((aeth >> IB_AETH_CREDIT_SHIFT) &
4756 			IB_AETH_CREDIT_MASK) {
4757 		case 0: /* PSN sequence error */
4758 			flow = &req->flows[req->acked_tail];
4759 			fspsn = full_flow_psn(flow, flow->flow_state.spsn);
4760 			trace_hfi1_tid_flow_rcv_tid_ack(qp, req->acked_tail,
4761 							flow);
4762 			req->r_ack_psn = mask_psn(be32_to_cpu(ohdr->bth[2]));
4763 			req->cur_seg = req->ack_seg;
4764 			qpriv->s_tid_tail = qp->s_acked;
4765 			qpriv->s_state = TID_OP(WRITE_REQ);
4766 			qpriv->s_retry = qp->s_retry_cnt;
4767 			hfi1_schedule_tid_send(qp);
4768 			break;
4769 
4770 		default:
4771 			break;
4772 		}
4773 		break;
4774 
4775 	default:
4776 		break;
4777 	}
4778 
4779 ack_op_err:
4780 	spin_unlock_irqrestore(&qp->s_lock, flags);
4781 }
4782 
4783 void hfi1_add_tid_retry_timer(struct rvt_qp *qp)
4784 {
4785 	struct hfi1_qp_priv *priv = qp->priv;
4786 	struct ib_qp *ibqp = &qp->ibqp;
4787 	struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device);
4788 
4789 	lockdep_assert_held(&qp->s_lock);
4790 	if (!(priv->s_flags & HFI1_S_TID_RETRY_TIMER)) {
4791 		priv->s_flags |= HFI1_S_TID_RETRY_TIMER;
4792 		priv->s_tid_retry_timer.expires = jiffies +
4793 			priv->tid_retry_timeout_jiffies + rdi->busy_jiffies;
4794 		add_timer(&priv->s_tid_retry_timer);
4795 	}
4796 }
4797 
4798 static void hfi1_mod_tid_retry_timer(struct rvt_qp *qp)
4799 {
4800 	struct hfi1_qp_priv *priv = qp->priv;
4801 	struct ib_qp *ibqp = &qp->ibqp;
4802 	struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device);
4803 
4804 	lockdep_assert_held(&qp->s_lock);
4805 	priv->s_flags |= HFI1_S_TID_RETRY_TIMER;
4806 	mod_timer(&priv->s_tid_retry_timer, jiffies +
4807 		  priv->tid_retry_timeout_jiffies + rdi->busy_jiffies);
4808 }
4809 
4810 static int hfi1_stop_tid_retry_timer(struct rvt_qp *qp)
4811 {
4812 	struct hfi1_qp_priv *priv = qp->priv;
4813 	int rval = 0;
4814 
4815 	lockdep_assert_held(&qp->s_lock);
4816 	if (priv->s_flags & HFI1_S_TID_RETRY_TIMER) {
4817 		rval = del_timer(&priv->s_tid_retry_timer);
4818 		priv->s_flags &= ~HFI1_S_TID_RETRY_TIMER;
4819 	}
4820 	return rval;
4821 }
4822 
4823 void hfi1_del_tid_retry_timer(struct rvt_qp *qp)
4824 {
4825 	struct hfi1_qp_priv *priv = qp->priv;
4826 
4827 	del_timer_sync(&priv->s_tid_retry_timer);
4828 	priv->s_flags &= ~HFI1_S_TID_RETRY_TIMER;
4829 }
4830 
4831 static void hfi1_tid_retry_timeout(struct timer_list *t)
4832 {
4833 	struct hfi1_qp_priv *priv = from_timer(priv, t, s_tid_retry_timer);
4834 	struct rvt_qp *qp = priv->owner;
4835 	struct rvt_swqe *wqe;
4836 	unsigned long flags;
4837 	struct tid_rdma_request *req;
4838 
4839 	spin_lock_irqsave(&qp->r_lock, flags);
4840 	spin_lock(&qp->s_lock);
4841 	trace_hfi1_tid_write_sender_retry_timeout(qp, 0);
4842 	if (priv->s_flags & HFI1_S_TID_RETRY_TIMER) {
4843 		hfi1_stop_tid_retry_timer(qp);
4844 		if (!priv->s_retry) {
4845 			trace_hfi1_msg_tid_retry_timeout(/* msg */
4846 				qp,
4847 				"Exhausted retries. Tid retry timeout = ",
4848 				(u64)priv->tid_retry_timeout_jiffies);
4849 
4850 			wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
4851 			hfi1_trdma_send_complete(qp, wqe, IB_WC_RETRY_EXC_ERR);
4852 			rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
4853 		} else {
4854 			wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
4855 			req = wqe_to_tid_req(wqe);
4856 			trace_hfi1_tid_req_tid_retry_timeout(/* req */
4857 			   qp, 0, wqe->wr.opcode, wqe->psn, wqe->lpsn, req);
4858 
4859 			priv->s_flags &= ~RVT_S_WAIT_ACK;
4860 			/* Only send one packet (the RESYNC) */
4861 			priv->s_flags |= RVT_S_SEND_ONE;
4862 			/*
4863 			 * No additional request shall be made by this QP until
			 * the RESYNC has been completed.
4865 			 */
4866 			qp->s_flags |= HFI1_S_WAIT_HALT;
4867 			priv->s_state = TID_OP(RESYNC);
4868 			priv->s_retry--;
4869 			hfi1_schedule_tid_send(qp);
4870 		}
4871 	}
4872 	spin_unlock(&qp->s_lock);
4873 	spin_unlock_irqrestore(&qp->r_lock, flags);
4874 }
4875 
4876 u32 hfi1_build_tid_rdma_resync(struct rvt_qp *qp, struct rvt_swqe *wqe,
4877 			       struct ib_other_headers *ohdr, u32 *bth1,
4878 			       u32 *bth2, u16 fidx)
4879 {
4880 	struct hfi1_qp_priv *qpriv = qp->priv;
4881 	struct tid_rdma_params *remote;
4882 	struct tid_rdma_request *req = wqe_to_tid_req(wqe);
4883 	struct tid_rdma_flow *flow = &req->flows[fidx];
4884 	u32 generation;
4885 
4886 	rcu_read_lock();
4887 	remote = rcu_dereference(qpriv->tid_rdma.remote);
4888 	KDETH_RESET(ohdr->u.tid_rdma.ack.kdeth1, JKEY, remote->jkey);
4889 	ohdr->u.tid_rdma.ack.verbs_qp = cpu_to_be32(qp->remote_qpn);
4890 	*bth1 = remote->qp;
4891 	rcu_read_unlock();
4892 
4893 	generation = kern_flow_generation_next(flow->flow_state.generation);
4894 	*bth2 = mask_psn((generation << HFI1_KDETH_BTH_SEQ_SHIFT) - 1);
4895 	qpriv->s_resync_psn = *bth2;
4896 	*bth2 |= IB_BTH_REQ_ACK;
4897 	KDETH_RESET(ohdr->u.tid_rdma.ack.kdeth0, KVER, 0x1);
4898 
4899 	return sizeof(ohdr->u.tid_rdma.resync) / sizeof(u32);
4900 }
4901 
4902 void hfi1_rc_rcv_tid_rdma_resync(struct hfi1_packet *packet)
4903 {
4904 	struct ib_other_headers *ohdr = packet->ohdr;
4905 	struct rvt_qp *qp = packet->qp;
4906 	struct hfi1_qp_priv *qpriv = qp->priv;
4907 	struct hfi1_ctxtdata *rcd = qpriv->rcd;
4908 	struct hfi1_ibdev *dev = to_idev(qp->ibqp.device);
4909 	struct rvt_ack_entry *e;
4910 	struct tid_rdma_request *req;
4911 	struct tid_rdma_flow *flow;
4912 	struct tid_flow_state *fs = &qpriv->flow_state;
4913 	u32 psn, generation, idx, gen_next;
4914 	bool fecn;
4915 	unsigned long flags;
4916 
4917 	fecn = process_ecn(qp, packet);
4918 	psn = mask_psn(be32_to_cpu(ohdr->bth[2]));
4919 
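	/*
	 * The RESYNC PSN is the last PSN of the generation preceding the one
	 * being requested, so psn + 1 recovers the requested generation.
	 */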
4920 	generation = mask_psn(psn + 1) >> HFI1_KDETH_BTH_SEQ_SHIFT;
4921 	spin_lock_irqsave(&qp->s_lock, flags);
4922 
4923 	gen_next = (fs->generation == KERN_GENERATION_RESERVED) ?
4924 		generation : kern_flow_generation_next(fs->generation);
4925 	/*
	 * The RESYNC packet contains the "next" generation and can only
	 * come from the current or the previous generation.
4928 	 */
4929 	if (generation != mask_generation(gen_next - 1) &&
4930 	    generation != gen_next)
4931 		goto bail;
4932 	/* Already processing a resync */
4933 	if (qpriv->resync)
4934 		goto bail;
4935 
4936 	spin_lock(&rcd->exp_lock);
4937 	if (fs->index >= RXE_NUM_TID_FLOWS) {
4938 		/*
4939 		 * If we don't have a flow, save the generation so it can be
4940 		 * applied when a new flow is allocated
4941 		 */
4942 		fs->generation = generation;
4943 	} else {
4944 		/* Reprogram the QP flow with new generation */
4945 		rcd->flows[fs->index].generation = generation;
4946 		fs->generation = kern_setup_hw_flow(rcd, fs->index);
4947 	}
4948 	fs->psn = 0;
4949 	/*
4950 	 * Disable SW PSN checking since a RESYNC is equivalent to a
4951 	 * sync point and the flow has/will be reprogrammed
4952 	 */
4953 	qpriv->s_flags &= ~HFI1_R_TID_SW_PSN;
4954 	trace_hfi1_tid_write_rsp_rcv_resync(qp);
4955 
4956 	/*
4957 	 * Reset all TID flow information with the new generation.
4958 	 * This is done for all requests and segments after the
4959 	 * last received segment
4960 	 */
4961 	for (idx = qpriv->r_tid_tail; ; idx++) {
4962 		u16 flow_idx;
4963 
4964 		if (idx > rvt_size_atomic(&dev->rdi))
4965 			idx = 0;
4966 		e = &qp->s_ack_queue[idx];
4967 		if (e->opcode == TID_OP(WRITE_REQ)) {
4968 			req = ack_to_tid_req(e);
4969 			trace_hfi1_tid_req_rcv_resync(qp, 0, e->opcode, e->psn,
4970 						      e->lpsn, req);
4971 
4972 			/* start from last unacked segment */
4973 			for (flow_idx = req->clear_tail;
4974 			     CIRC_CNT(req->setup_head, flow_idx,
4975 				      MAX_FLOWS);
4976 			     flow_idx = CIRC_NEXT(flow_idx, MAX_FLOWS)) {
4977 				u32 lpsn;
4978 				u32 next;
4979 
4980 				flow = &req->flows[flow_idx];
4981 				lpsn = full_flow_psn(flow,
4982 						     flow->flow_state.lpsn);
4983 				next = flow->flow_state.r_next_psn;
4984 				flow->npkts = delta_psn(lpsn, next - 1);
4985 				flow->flow_state.generation = fs->generation;
4986 				flow->flow_state.spsn = fs->psn;
4987 				flow->flow_state.lpsn =
4988 					flow->flow_state.spsn + flow->npkts - 1;
4989 				flow->flow_state.r_next_psn =
4990 					full_flow_psn(flow,
4991 						      flow->flow_state.spsn);
4992 				fs->psn += flow->npkts;
4993 				trace_hfi1_tid_flow_rcv_resync(qp, flow_idx,
4994 							       flow);
4995 			}
4996 		}
4997 		if (idx == qp->s_tail_ack_queue)
4998 			break;
4999 	}
5000 
5001 	spin_unlock(&rcd->exp_lock);
5002 	qpriv->resync = true;
5003 	/* RESYNC request always gets a TID RDMA ACK. */
5004 	qpriv->s_nak_state = 0;
5005 	qpriv->s_flags |= RVT_S_ACK_PENDING;
5006 	hfi1_schedule_tid_send(qp);
5007 bail:
5008 	if (fecn)
5009 		qp->s_flags |= RVT_S_ECN;
5010 	spin_unlock_irqrestore(&qp->s_lock, flags);
5011 }
5012 
5013 /*
5014  * Call this function when the last TID RDMA WRITE DATA packet for a request
5015  * is built.
5016  */
5017 static void update_tid_tail(struct rvt_qp *qp)
5018 	__must_hold(&qp->s_lock)
5019 {
5020 	struct hfi1_qp_priv *priv = qp->priv;
5021 	u32 i;
5022 	struct rvt_swqe *wqe;
5023 
5024 	lockdep_assert_held(&qp->s_lock);
5025 	/* Can't move beyond s_tid_cur */
5026 	if (priv->s_tid_tail == priv->s_tid_cur)
5027 		return;
5028 	for (i = priv->s_tid_tail + 1; ; i++) {
5029 		if (i == qp->s_size)
5030 			i = 0;
5031 
5032 		if (i == priv->s_tid_cur)
5033 			break;
5034 		wqe = rvt_get_swqe_ptr(qp, i);
5035 		if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE)
5036 			break;
5037 	}
5038 	priv->s_tid_tail = i;
5039 	priv->s_state = TID_OP(WRITE_RESP);
5040 }
5041 
5042 int hfi1_make_tid_rdma_pkt(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
5043 	__must_hold(&qp->s_lock)
5044 {
5045 	struct hfi1_qp_priv *priv = qp->priv;
5046 	struct rvt_swqe *wqe;
5047 	u32 bth1 = 0, bth2 = 0, hwords = 5, len, middle = 0;
5048 	struct ib_other_headers *ohdr;
5049 	struct rvt_sge_state *ss = &qp->s_sge;
5050 	struct rvt_ack_entry *e = &qp->s_ack_queue[qp->s_tail_ack_queue];
5051 	struct tid_rdma_request *req = ack_to_tid_req(e);
5052 	bool last = false;
5053 	u8 opcode = TID_OP(WRITE_DATA);
5054 
5055 	lockdep_assert_held(&qp->s_lock);
5056 	trace_hfi1_tid_write_sender_make_tid_pkt(qp, 0);
5057 	/*
5058 	 * Prioritize the sending of the requests and responses over the
5059 	 * sending of the TID RDMA data packets.
5060 	 */
5061 	if (((atomic_read(&priv->n_tid_requests) < HFI1_TID_RDMA_WRITE_CNT) &&
5062 	     atomic_read(&priv->n_requests) &&
5063 	     !(qp->s_flags & (RVT_S_BUSY | RVT_S_WAIT_ACK |
5064 			     HFI1_S_ANY_WAIT_IO))) ||
5065 	    (e->opcode == TID_OP(WRITE_REQ) && req->cur_seg < req->alloc_seg &&
5066 	     !(qp->s_flags & (RVT_S_BUSY | HFI1_S_ANY_WAIT_IO)))) {
5067 		struct iowait_work *iowork;
5068 
5069 		iowork = iowait_get_ib_work(&priv->s_iowait);
5070 		ps->s_txreq = get_waiting_verbs_txreq(iowork);
5071 		if (ps->s_txreq || hfi1_make_rc_req(qp, ps)) {
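			/*
			 * HFI1_S_TID_BUSY_SET tells hfi1_do_tid_send() that
			 * this packet was built by the first (RC) leg: it
			 * will take RVT_S_BUSY and temporarily switch to the
			 * IB iowait work while sending it.
			 */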
5072 			priv->s_flags |= HFI1_S_TID_BUSY_SET;
5073 			return 1;
5074 		}
5075 	}
5076 
5077 	ps->s_txreq = get_txreq(ps->dev, qp);
5078 	if (!ps->s_txreq)
5079 		goto bail_no_tx;
5080 
5081 	ohdr = &ps->s_txreq->phdr.hdr.ibh.u.oth;
5082 
5083 	if ((priv->s_flags & RVT_S_ACK_PENDING) &&
5084 	    make_tid_rdma_ack(qp, ohdr, ps))
5085 		return 1;
5086 
5087 	/*
5088 	 * Bail out if we can't send data.
	 * Note that this check must be done after the call to
	 * make_tid_rdma_ack() because the responding QP could be in
	 * RTR state, where it can send a TID RDMA ACK but not TID RDMA
	 * WRITE DATA.
5092 	 */
5093 	if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_SEND_OK))
5094 		goto bail;
5095 
5096 	if (priv->s_flags & RVT_S_WAIT_ACK)
5097 		goto bail;
5098 
5099 	/* Check whether there is anything to do. */
5100 	if (priv->s_tid_tail == HFI1_QP_WQE_INVALID)
5101 		goto bail;
5102 	wqe = rvt_get_swqe_ptr(qp, priv->s_tid_tail);
5103 	req = wqe_to_tid_req(wqe);
5104 	trace_hfi1_tid_req_make_tid_pkt(qp, 0, wqe->wr.opcode, wqe->psn,
5105 					wqe->lpsn, req);
5106 	switch (priv->s_state) {
5107 	case TID_OP(WRITE_REQ):
5108 	case TID_OP(WRITE_RESP):
5109 		priv->tid_ss.sge = wqe->sg_list[0];
5110 		priv->tid_ss.sg_list = wqe->sg_list + 1;
5111 		priv->tid_ss.num_sge = wqe->wr.num_sge;
5112 		priv->tid_ss.total_len = wqe->length;
5113 
5114 		if (priv->s_state == TID_OP(WRITE_REQ))
5115 			hfi1_tid_rdma_restart_req(qp, wqe, &bth2);
5116 		priv->s_state = TID_OP(WRITE_DATA);
5117 		/* fall through */
5118 
5119 	case TID_OP(WRITE_DATA):
5120 		/*
		 * 1. Check whether a TID RDMA WRITE RESP is available.
		 * 2. If not:
		 *    2.1 If there are more segments and no TID RDMA WRITE
		 *        RESP, set HFI1_S_WAIT_TID_RESP.
		 *    2.2 Return, indicating no progress was made.
		 * 3. If so:
		 *    3.1 Build a TID RDMA WRITE DATA packet.
		 *    3.2 If it is the last packet in the segment:
		 *        3.2.1 Change the KDETH header bits.
		 *        3.2.2 Advance the RESP pointers.
		 *    3.3 Return, indicating progress was made.
5132 		 */
5133 		trace_hfi1_sender_make_tid_pkt(qp);
5134 		trace_hfi1_tid_write_sender_make_tid_pkt(qp, 0);
5135 		wqe = rvt_get_swqe_ptr(qp, priv->s_tid_tail);
5136 		req = wqe_to_tid_req(wqe);
5137 		len = wqe->length;
5138 
5139 		if (!req->comp_seg || req->cur_seg == req->comp_seg)
5140 			goto bail;
5141 
5142 		trace_hfi1_tid_req_make_tid_pkt(qp, 0, wqe->wr.opcode,
5143 						wqe->psn, wqe->lpsn, req);
5144 		last = hfi1_build_tid_rdma_packet(wqe, ohdr, &bth1, &bth2,
5145 						  &len);
5146 
5147 		if (last) {
5148 			/* move pointer to next flow */
5149 			req->clear_tail = CIRC_NEXT(req->clear_tail,
5150 						    MAX_FLOWS);
5151 			if (++req->cur_seg < req->total_segs) {
5152 				if (!CIRC_CNT(req->setup_head, req->clear_tail,
5153 					      MAX_FLOWS))
5154 					qp->s_flags |= HFI1_S_WAIT_TID_RESP;
5155 			} else {
5156 				priv->s_state = TID_OP(WRITE_DATA_LAST);
5157 				opcode = TID_OP(WRITE_DATA_LAST);
5158 
5159 				/* Advance the s_tid_tail now */
5160 				update_tid_tail(qp);
5161 			}
5162 		}
5163 		hwords += sizeof(ohdr->u.tid_rdma.w_data) / sizeof(u32);
5164 		ss = &priv->tid_ss;
5165 		break;
5166 
5167 	case TID_OP(RESYNC):
5168 		trace_hfi1_sender_make_tid_pkt(qp);
5169 		/* Use generation from the most recently received response */
5170 		wqe = rvt_get_swqe_ptr(qp, priv->s_tid_cur);
5171 		req = wqe_to_tid_req(wqe);
5172 		/* If no responses for this WQE look at the previous one */
5173 		if (!req->comp_seg) {
5174 			wqe = rvt_get_swqe_ptr(qp,
5175 					       (!priv->s_tid_cur ? qp->s_size :
5176 						priv->s_tid_cur) - 1);
5177 			req = wqe_to_tid_req(wqe);
5178 		}
5179 		hwords += hfi1_build_tid_rdma_resync(qp, wqe, ohdr, &bth1,
5180 						     &bth2,
5181 						     CIRC_PREV(req->setup_head,
5182 							       MAX_FLOWS));
5183 		ss = NULL;
5184 		len = 0;
5185 		opcode = TID_OP(RESYNC);
5186 		break;
5187 
5188 	default:
5189 		goto bail;
5190 	}
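	/*
	 * When the QP has been told to send only one packet
	 * (RVT_S_SEND_ONE), request an ACK for it and wait for that ACK
	 * before sending more.
	 */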
5191 	if (priv->s_flags & RVT_S_SEND_ONE) {
5192 		priv->s_flags &= ~RVT_S_SEND_ONE;
5193 		priv->s_flags |= RVT_S_WAIT_ACK;
5194 		bth2 |= IB_BTH_REQ_ACK;
5195 	}
5196 	qp->s_len -= len;
5197 	ps->s_txreq->hdr_dwords = hwords;
5198 	ps->s_txreq->sde = priv->s_sde;
5199 	ps->s_txreq->ss = ss;
5200 	ps->s_txreq->s_cur_size = len;
5201 	hfi1_make_ruc_header(qp, ohdr, (opcode << 24), bth1, bth2,
5202 			     middle, ps);
5203 	return 1;
5204 bail:
5205 	hfi1_put_txreq(ps->s_txreq);
5206 bail_no_tx:
5207 	ps->s_txreq = NULL;
5208 	priv->s_flags &= ~RVT_S_BUSY;
5209 	/*
	 * If we didn't get a txreq, the QP will be woken up later to try
	 * again. Set the iowait flag to indicate which work item should be
	 * woken up.
5213 	 * (A better algorithm should be found to do this and generalize the
5214 	 * sleep/wakeup flags.)
5215 	 */
5216 	iowait_set_flag(&priv->s_iowait, IOWAIT_PENDING_TID);
5217 	return 0;
5218 }
5219 
5220 static int make_tid_rdma_ack(struct rvt_qp *qp,
5221 			     struct ib_other_headers *ohdr,
5222 			     struct hfi1_pkt_state *ps)
5223 {
5224 	struct rvt_ack_entry *e;
5225 	struct hfi1_qp_priv *qpriv = qp->priv;
5226 	struct hfi1_ibdev *dev = to_idev(qp->ibqp.device);
5227 	u32 hwords, next;
5228 	u32 len = 0;
5229 	u32 bth1 = 0, bth2 = 0;
5230 	int middle = 0;
5231 	u16 flow;
5232 	struct tid_rdma_request *req, *nreq;
5233 
5234 	trace_hfi1_tid_write_rsp_make_tid_ack(qp);
5235 	/* Don't send an ACK if we aren't supposed to. */
5236 	if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK))
5237 		goto bail;
5238 
5239 	/* header size in 32-bit words LRH+BTH = (8+12)/4. */
5240 	hwords = 5;
5241 
5242 	e = &qp->s_ack_queue[qpriv->r_tid_ack];
5243 	req = ack_to_tid_req(e);
5244 	/*
5245 	 * In the RESYNC case, we are exactly one segment past the
5246 	 * previously sent ack or at the previously sent NAK. So to send
5247 	 * the resync ack, we go back one segment (which might be part of
5248 	 * the previous request) and let the do-while loop execute again.
5249 	 * The advantage of executing the do-while loop is that any data
5250 	 * received after the previous ack is automatically acked in the
5251 	 * RESYNC ack. It turns out that for the do-while loop we only need
5252 	 * to pull back qpriv->r_tid_ack, not the segment
5253 	 * indices/counters. The scheme works even if the previous request
5254 	 * was not a TID WRITE request.
5255 	 */
5256 	if (qpriv->resync) {
5257 		if (!req->ack_seg || req->ack_seg == req->total_segs)
5258 			qpriv->r_tid_ack = !qpriv->r_tid_ack ?
5259 				rvt_size_atomic(&dev->rdi) :
5260 				qpriv->r_tid_ack - 1;
5261 		e = &qp->s_ack_queue[qpriv->r_tid_ack];
5262 		req = ack_to_tid_req(e);
5263 	}
5264 
5265 	trace_hfi1_rsp_make_tid_ack(qp, e->psn);
5266 	trace_hfi1_tid_req_make_tid_ack(qp, 0, e->opcode, e->psn, e->lpsn,
5267 					req);
5268 	/*
5269 	 * If we've sent all the ACKs that we can, we are done
5270 	 * until we get more segments...
5271 	 */
5272 	if (!qpriv->s_nak_state && !qpriv->resync &&
5273 	    req->ack_seg == req->comp_seg)
5274 		goto bail;
5275 
5276 	do {
5277 		/*
5278 		 * To deal with coalesced ACKs, the acked_tail pointer
5279 		 * into the flow array is used. The distance between it
5280 		 * and the clear_tail is the number of flows that are
5281 		 * being ACK'ed.
5282 		 */
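		/*
		 * Illustration (hypothetical indices): with MAX_FLOWS == 8,
		 * acked_tail == 7 and clear_tail == 2,
		 * CIRC_CNT(2, 7, 8) = (2 - 7) & 7 = 3, i.e. three segments
		 * are coalesced into this ACK.
		 */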
5283 		req->ack_seg +=
5284 			/* Get up-to-date value */
5285 			CIRC_CNT(req->clear_tail, req->acked_tail,
5286 				 MAX_FLOWS);
5287 		/* Advance acked index */
5288 		req->acked_tail = req->clear_tail;
5289 
5290 		/*
5291 		 * req->clear_tail points to the segment currently being
5292 		 * received. So, when sending an ACK, the previous
5293 		 * segment is being ACK'ed.
5294 		 */
5295 		flow = CIRC_PREV(req->acked_tail, MAX_FLOWS);
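		/*
		 * Stop coalescing if this request is not yet fully ACK'ed;
		 * otherwise mark it complete and try to fold the next
		 * TID RDMA WRITE request into the same ACK.
		 */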
5296 		if (req->ack_seg != req->total_segs)
5297 			break;
5298 		req->state = TID_REQUEST_COMPLETE;
5299 
5300 		next = qpriv->r_tid_ack + 1;
5301 		if (next > rvt_size_atomic(&dev->rdi))
5302 			next = 0;
5303 		qpriv->r_tid_ack = next;
5304 		if (qp->s_ack_queue[next].opcode != TID_OP(WRITE_REQ))
5305 			break;
5306 		nreq = ack_to_tid_req(&qp->s_ack_queue[next]);
5307 		if (!nreq->comp_seg || nreq->ack_seg == nreq->comp_seg)
5308 			break;
5309 
5310 		/* Move to the next ack entry now */
5311 		e = &qp->s_ack_queue[qpriv->r_tid_ack];
5312 		req = ack_to_tid_req(e);
5313 	} while (1);
5314 
5315 	/*
5316 	 * At this point qpriv->r_tid_ack == qpriv->r_tid_tail but e and
5317 	 * req could be pointing at the previous ack queue entry
5318 	 */
5319 	if (qpriv->s_nak_state ||
5320 	    (qpriv->resync &&
5321 	     !hfi1_tid_rdma_is_resync_psn(qpriv->r_next_psn_kdeth - 1) &&
5322 	     (cmp_psn(qpriv->r_next_psn_kdeth - 1,
5323 		      full_flow_psn(&req->flows[flow],
5324 				    req->flows[flow].flow_state.lpsn)) > 0))) {
5325 		/*
5326 		 * A NAK will implicitly acknowledge all previous TID RDMA
5327 		 * requests. Therefore, we NAK with the req->acked_tail
5328 		 * segment for the request at qpriv->r_tid_ack (same at
5329 		 * this point as the req->clear_tail segment for the
5330 		 * qpriv->r_tid_tail request)
5331 		 */
5332 		e = &qp->s_ack_queue[qpriv->r_tid_ack];
5333 		req = ack_to_tid_req(e);
5334 		flow = req->acked_tail;
5335 	} else if (req->ack_seg == req->total_segs &&
5336 		   qpriv->s_flags & HFI1_R_TID_WAIT_INTERLCK)
5337 		qpriv->s_flags &= ~HFI1_R_TID_WAIT_INTERLCK;
5338 
5339 	trace_hfi1_tid_write_rsp_make_tid_ack(qp);
5340 	trace_hfi1_tid_req_make_tid_ack(qp, 0, e->opcode, e->psn, e->lpsn,
5341 					req);
5342 	hwords += hfi1_build_tid_rdma_write_ack(qp, e, ohdr, flow, &bth1,
5343 						&bth2);
5344 	len = 0;
5345 	qpriv->s_flags &= ~RVT_S_ACK_PENDING;
5346 	ps->s_txreq->hdr_dwords = hwords;
5347 	ps->s_txreq->sde = qpriv->s_sde;
5348 	ps->s_txreq->s_cur_size = len;
5349 	ps->s_txreq->ss = NULL;
5350 	hfi1_make_ruc_header(qp, ohdr, (TID_OP(ACK) << 24), bth1, bth2, middle,
5351 			     ps);
5352 	ps->s_txreq->txreq.flags |= SDMA_TXREQ_F_VIP;
5353 	return 1;
5354 bail:
5355 	/*
	 * Ensure s_rdma_ack_cnt changes are committed prior to resetting
	 * RVT_S_ACK_PENDING
5358 	 */
5359 	smp_wmb();
5360 	qpriv->s_flags &= ~RVT_S_ACK_PENDING;
5361 	return 0;
5362 }
5363 
5364 static int hfi1_send_tid_ok(struct rvt_qp *qp)
5365 {
5366 	struct hfi1_qp_priv *priv = qp->priv;
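	/*
	 * Do not send while the TID leg is busy (RVT_S_BUSY) or the QP is
	 * waiting on I/O. Otherwise, allow the send if a TID txreq is
	 * already queued, a response is pending, or the QP is not blocked
	 * on a TID-specific send wait condition.
	 */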
5367 
5368 	return !(priv->s_flags & RVT_S_BUSY ||
5369 		 qp->s_flags & HFI1_S_ANY_WAIT_IO) &&
5370 		(verbs_txreq_queued(iowait_get_tid_work(&priv->s_iowait)) ||
5371 		 (priv->s_flags & RVT_S_RESP_PENDING) ||
5372 		 !(qp->s_flags & HFI1_S_ANY_TID_WAIT_SEND));
5373 }
5374 
5375 void _hfi1_do_tid_send(struct work_struct *work)
5376 {
5377 	struct iowait_work *w = container_of(work, struct iowait_work, iowork);
5378 	struct rvt_qp *qp = iowait_to_qp(w->iow);
5379 
5380 	hfi1_do_tid_send(qp);
5381 }
5382 
5383 static void hfi1_do_tid_send(struct rvt_qp *qp)
5384 {
5385 	struct hfi1_pkt_state ps;
5386 	struct hfi1_qp_priv *priv = qp->priv;
5387 
5388 	ps.dev = to_idev(qp->ibqp.device);
5389 	ps.ibp = to_iport(qp->ibqp.device, qp->port_num);
5390 	ps.ppd = ppd_from_ibp(ps.ibp);
5391 	ps.wait = iowait_get_tid_work(&priv->s_iowait);
5392 	ps.in_thread = false;
5393 	ps.timeout_int = qp->timeout_jiffies / 8;
5394 
5395 	trace_hfi1_rc_do_tid_send(qp, false);
5396 	spin_lock_irqsave(&qp->s_lock, ps.flags);
5397 
5398 	/* Return if we are already busy processing a work request. */
5399 	if (!hfi1_send_tid_ok(qp)) {
5400 		if (qp->s_flags & HFI1_S_ANY_WAIT_IO)
5401 			iowait_set_flag(&priv->s_iowait, IOWAIT_PENDING_TID);
5402 		spin_unlock_irqrestore(&qp->s_lock, ps.flags);
5403 		return;
5404 	}
5405 
5406 	priv->s_flags |= RVT_S_BUSY;
5407 
5408 	ps.timeout = jiffies + ps.timeout_int;
5409 	ps.cpu = priv->s_sde ? priv->s_sde->cpu :
5410 		cpumask_first(cpumask_of_node(ps.ppd->dd->node));
5411 	ps.pkts_sent = false;
5412 
	/* ensure a pre-built packet is handled */
5414 	ps.s_txreq = get_waiting_verbs_txreq(ps.wait);
5415 	do {
5416 		/* Check for a constructed packet to be sent. */
5417 		if (ps.s_txreq) {
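			/*
			 * A pre-built packet flagged with HFI1_S_TID_BUSY_SET
			 * belongs to the first (RC) leg; send it on that
			 * leg's behalf using the IB iowait work and mark the
			 * QP busy so the two legs do not collide.
			 */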
5418 			if (priv->s_flags & HFI1_S_TID_BUSY_SET) {
5419 				qp->s_flags |= RVT_S_BUSY;
5420 				ps.wait = iowait_get_ib_work(&priv->s_iowait);
5421 			}
5422 			spin_unlock_irqrestore(&qp->s_lock, ps.flags);
5423 
5424 			/*
5425 			 * If the packet cannot be sent now, return and
5426 			 * the send tasklet will be woken up later.
5427 			 */
5428 			if (hfi1_verbs_send(qp, &ps))
5429 				return;
5430 
5431 			/* allow other tasks to run */
5432 			if (hfi1_schedule_send_yield(qp, &ps, true))
5433 				return;
5434 
5435 			spin_lock_irqsave(&qp->s_lock, ps.flags);
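			/*
			 * Restore the TID leg's context: drop RVT_S_BUSY,
			 * clear HFI1_S_TID_BUSY_SET, switch back to the TID
			 * iowait work, and reschedule the IB send engine if
			 * it was left pending while this leg was busy.
			 */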
5436 			if (priv->s_flags & HFI1_S_TID_BUSY_SET) {
5437 				qp->s_flags &= ~RVT_S_BUSY;
5438 				priv->s_flags &= ~HFI1_S_TID_BUSY_SET;
5439 				ps.wait = iowait_get_tid_work(&priv->s_iowait);
5440 				if (iowait_flag_set(&priv->s_iowait,
5441 						    IOWAIT_PENDING_IB))
5442 					hfi1_schedule_send(qp);
5443 			}
5444 		}
5445 	} while (hfi1_make_tid_rdma_pkt(qp, &ps));
5446 	iowait_starve_clear(ps.pkts_sent, &priv->s_iowait);
5447 	spin_unlock_irqrestore(&qp->s_lock, ps.flags);
5448 }
5449 
5450 static bool _hfi1_schedule_tid_send(struct rvt_qp *qp)
5451 {
5452 	struct hfi1_qp_priv *priv = qp->priv;
5453 	struct hfi1_ibport *ibp =
5454 		to_iport(qp->ibqp.device, qp->port_num);
5455 	struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
5456 	struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device);
5457 
5458 	return iowait_tid_schedule(&priv->s_iowait, ppd->hfi1_wq,
5459 				   priv->s_sde ?
5460 				   priv->s_sde->cpu :
5461 				   cpumask_first(cpumask_of_node(dd->node)));
5462 }
5463 
5464 /**
5465  * hfi1_schedule_tid_send - schedule progress on TID RDMA state machine
5466  * @qp: the QP
5467  *
5468  * This schedules qp progress on the TID RDMA state machine. Caller
5469  * should hold the s_lock.
5470  * Unlike hfi1_schedule_send(), this cannot use hfi1_send_ok() because
5471  * the two state machines can step on each other with respect to the
5472  * RVT_S_BUSY flag.
5473  * Therefore, a modified test is used.
 *
 * Return: true if the second leg is scheduled;
 *         false if the second leg is not scheduled.
5476  */
5477 bool hfi1_schedule_tid_send(struct rvt_qp *qp)
5478 {
5479 	lockdep_assert_held(&qp->s_lock);
5480 	if (hfi1_send_tid_ok(qp)) {
5481 		/*
5482 		 * The following call returns true if the qp is not on the
5483 		 * queue and false if the qp is already on the queue before
5484 		 * this call. Either way, the qp will be on the queue when the
5485 		 * call returns.
5486 		 */
5487 		_hfi1_schedule_tid_send(qp);
5488 		return true;
5489 	}
5490 	if (qp->s_flags & HFI1_S_ANY_WAIT_IO)
5491 		iowait_set_flag(&((struct hfi1_qp_priv *)qp->priv)->s_iowait,
5492 				IOWAIT_PENDING_TID);
5493 	return false;
5494 }
5495 
5496 bool hfi1_tid_rdma_ack_interlock(struct rvt_qp *qp, struct rvt_ack_entry *e)
5497 {
5498 	struct rvt_ack_entry *prev;
5499 	struct tid_rdma_request *req;
5500 	struct hfi1_ibdev *dev = to_idev(qp->ibqp.device);
5501 	struct hfi1_qp_priv *priv = qp->priv;
5502 	u32 s_prev;
5503 
5504 	s_prev = qp->s_tail_ack_queue == 0 ? rvt_size_atomic(&dev->rdi) :
5505 		(qp->s_tail_ack_queue - 1);
5506 	prev = &qp->s_ack_queue[s_prev];
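	/*
	 * Interlock: if this (TID) RDMA READ request immediately follows a
	 * TID RDMA WRITE request whose segments have not all been ACK'ed,
	 * hold the READ back until the WRITE is fully acknowledged.
	 */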
5507 
5508 	if ((e->opcode == TID_OP(READ_REQ) ||
5509 	     e->opcode == OP(RDMA_READ_REQUEST)) &&
5510 	    prev->opcode == TID_OP(WRITE_REQ)) {
5511 		req = ack_to_tid_req(prev);
5512 		if (req->ack_seg != req->total_segs) {
5513 			priv->s_flags |= HFI1_R_TID_WAIT_INTERLCK;
5514 			return true;
5515 		}
5516 	}
5517 	return false;
5518 }
5519 
5520 static u32 read_r_next_psn(struct hfi1_devdata *dd, u8 ctxt, u8 fidx)
5521 {
5522 	u64 reg;
5523 
5524 	/*
5525 	 * The only sane way to get the amount of
5526 	 * progress is to read the HW flow state.
5527 	 */
5528 	reg = read_uctxt_csr(dd, ctxt, RCV_TID_FLOW_TABLE + (8 * fidx));
5529 	return mask_psn(reg);
5530 }
5531 
5532 static void tid_rdma_rcv_err(struct hfi1_packet *packet,
5533 			     struct ib_other_headers *ohdr,
5534 			     struct rvt_qp *qp, u32 psn, int diff, bool fecn)
5535 {
5536 	unsigned long flags;
5537 
5538 	tid_rdma_rcv_error(packet, ohdr, qp, psn, diff);
5539 	if (fecn) {
5540 		spin_lock_irqsave(&qp->s_lock, flags);
5541 		qp->s_flags |= RVT_S_ECN;
5542 		spin_unlock_irqrestore(&qp->s_lock, flags);
5543 	}
5544 }
5545 
5546 static void update_r_next_psn_fecn(struct hfi1_packet *packet,
5547 				   struct hfi1_qp_priv *priv,
5548 				   struct hfi1_ctxtdata *rcd,
5549 				   struct tid_rdma_flow *flow,
5550 				   bool fecn)
5551 {
5552 	/*
5553 	 * If a start/middle packet is delivered here due to
5554 	 * RSM rule and FECN, we need to update the r_next_psn.
5555 	 */
5556 	if (fecn && packet->etype == RHF_RCV_TYPE_EAGER &&
5557 	    !(priv->s_flags & HFI1_R_TID_SW_PSN)) {
5558 		struct hfi1_devdata *dd = rcd->dd;
5559 
5560 		flow->flow_state.r_next_psn =
5561 			read_r_next_psn(dd, rcd->ctxt, flow->idx);
5562 	}
5563 }
5564