// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
/*
 * Copyright(c) 2018 Intel Corporation.
 *
 */

#include "hfi.h"
#include "qp.h"
#include "rc.h"
#include "verbs.h"
#include "tid_rdma.h"
#include "exp_rcv.h"
#include "trace.h"

/**
 * DOC: TID RDMA READ protocol
 *
 * This is an end-to-end protocol at the hfi1 level between two nodes that
 * improves performance by avoiding data copy on the requester side. It
 * converts a qualified RDMA READ request into a TID RDMA READ request on
 * the requester side and thereafter handles the request and response
 * differently. To be qualified, the RDMA READ request should meet the
 * following:
 * -- The total data length should be greater than 256K;
 * -- The total data length should be a multiple of 4K page size;
 * -- Each local scatter-gather entry should be 4K page aligned;
 * -- Each local scatter-gather entry should be a multiple of 4K page size;
 */
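/*
 * For illustration only (not from the hardware spec): a 1MB RDMA READ whose
 * local SGEs are each 4K aligned and 4K multiples meets the criteria above
 * and is converted, while a 255K RDMA READ is not converted (it is neither
 * larger than 256K nor a multiple of the 4K page size) and goes out as a
 * regular RDMA READ.
 */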

#define RCV_TID_FLOW_TABLE_CTRL_FLOW_VALID_SMASK BIT_ULL(32)
#define RCV_TID_FLOW_TABLE_CTRL_HDR_SUPP_EN_SMASK BIT_ULL(33)
#define RCV_TID_FLOW_TABLE_CTRL_KEEP_AFTER_SEQ_ERR_SMASK BIT_ULL(34)
#define RCV_TID_FLOW_TABLE_CTRL_KEEP_ON_GEN_ERR_SMASK BIT_ULL(35)
#define RCV_TID_FLOW_TABLE_STATUS_SEQ_MISMATCH_SMASK BIT_ULL(37)
#define RCV_TID_FLOW_TABLE_STATUS_GEN_MISMATCH_SMASK BIT_ULL(38)

/* Maximum number of packets within a flow generation. */
#define MAX_TID_FLOW_PSN BIT(HFI1_KDETH_BTH_SEQ_SHIFT)

#define GENERATION_MASK 0xFFFFF

static u32 mask_generation(u32 a)
{
	return a & GENERATION_MASK;
}

/* Reserved generation value to set to unused flows for kernel contexts */
#define KERN_GENERATION_RESERVED mask_generation(U32_MAX)

/*
 * J_KEY for kernel contexts when TID RDMA is used.
 * See generate_jkey() in hfi.h for more information.
 */
#define TID_RDMA_JKEY 32
#define HFI1_KERNEL_MIN_JKEY HFI1_ADMIN_JKEY_RANGE
#define HFI1_KERNEL_MAX_JKEY (2 * HFI1_ADMIN_JKEY_RANGE - 1)

/* Maximum number of segments in flight per QP request. */
#define TID_RDMA_MAX_READ_SEGS_PER_REQ 6
#define TID_RDMA_MAX_WRITE_SEGS_PER_REQ 4
#define MAX_REQ max_t(u16, TID_RDMA_MAX_READ_SEGS_PER_REQ, \
			TID_RDMA_MAX_WRITE_SEGS_PER_REQ)
#define MAX_FLOWS roundup_pow_of_two(MAX_REQ + 1)

#define MAX_EXPECTED_PAGES (MAX_EXPECTED_BUFFER / PAGE_SIZE)

#define TID_RDMA_DESTQP_FLOW_SHIFT 11
#define TID_RDMA_DESTQP_FLOW_MASK 0x1f

#define TID_OPFN_QP_CTXT_MASK 0xff
#define TID_OPFN_QP_CTXT_SHIFT 56
#define TID_OPFN_QP_KDETH_MASK 0xff
#define TID_OPFN_QP_KDETH_SHIFT 48
#define TID_OPFN_MAX_LEN_MASK 0x7ff
#define TID_OPFN_MAX_LEN_SHIFT 37
#define TID_OPFN_TIMEOUT_MASK 0x1f
#define TID_OPFN_TIMEOUT_SHIFT 32
#define TID_OPFN_RESERVED_MASK 0x3f
#define TID_OPFN_RESERVED_SHIFT 26
#define TID_OPFN_URG_MASK 0x1
#define TID_OPFN_URG_SHIFT 25
#define TID_OPFN_VER_MASK 0x7
#define TID_OPFN_VER_SHIFT 22
#define TID_OPFN_JKEY_MASK 0x3f
#define TID_OPFN_JKEY_SHIFT 16
#define TID_OPFN_MAX_READ_MASK 0x3f
#define TID_OPFN_MAX_READ_SHIFT 10
#define TID_OPFN_MAX_WRITE_MASK 0x3f
#define TID_OPFN_MAX_WRITE_SHIFT 4

/*
 * OPFN TID layout
 *
 * 63               47               31               15
 * NNNNNNNNKKKKKKKK MMMMMMMMMMMTTTTT DDDDDDUVVVJJJJJJ RRRRRRWWWWWWCCCC
 * 3210987654321098 7654321098765432 1098765432109876 5432109876543210
 * N - the context Number
 * K - the Kdeth_qp
 * M - Max_len
 * T - Timeout
 * D - reserveD
 * V - version
 * U - Urg capable
 * J - Jkey
 * R - max_Read
 * W - max_Write
 * C - Capcode
 */

static u32 tid_rdma_flow_wt;

static void tid_rdma_trigger_resume(struct work_struct *work);
static void hfi1_kern_exp_rcv_free_flows(struct tid_rdma_request *req);
static int hfi1_kern_exp_rcv_alloc_flows(struct tid_rdma_request *req,
					 gfp_t gfp);
static void hfi1_init_trdma_req(struct rvt_qp *qp,
				struct tid_rdma_request *req);
static void hfi1_tid_write_alloc_resources(struct rvt_qp *qp, bool intr_ctx);
static void hfi1_tid_timeout(struct timer_list *t);
static void hfi1_add_tid_reap_timer(struct rvt_qp *qp);
static void hfi1_mod_tid_reap_timer(struct rvt_qp *qp);
static void hfi1_mod_tid_retry_timer(struct rvt_qp *qp);
static int hfi1_stop_tid_retry_timer(struct rvt_qp *qp);
static void hfi1_tid_retry_timeout(struct timer_list *t);
static int make_tid_rdma_ack(struct rvt_qp *qp,
			     struct ib_other_headers *ohdr,
			     struct hfi1_pkt_state *ps);
static void hfi1_do_tid_send(struct rvt_qp *qp);
static u32 read_r_next_psn(struct hfi1_devdata *dd, u8 ctxt, u8 fidx);
static void tid_rdma_rcv_err(struct hfi1_packet *packet,
			     struct ib_other_headers *ohdr,
			     struct rvt_qp *qp, u32 psn, int diff, bool fecn);
static void update_r_next_psn_fecn(struct hfi1_packet *packet,
				   struct hfi1_qp_priv *priv,
				   struct hfi1_ctxtdata *rcd,
				   struct tid_rdma_flow *flow,
				   bool fecn);

static u64 tid_rdma_opfn_encode(struct tid_rdma_params *p)
{
	return
		(((u64)p->qp & TID_OPFN_QP_CTXT_MASK) <<
			TID_OPFN_QP_CTXT_SHIFT) |
		((((u64)p->qp >> 16) & TID_OPFN_QP_KDETH_MASK) <<
			TID_OPFN_QP_KDETH_SHIFT) |
		(((u64)((p->max_len >> PAGE_SHIFT) - 1) &
			TID_OPFN_MAX_LEN_MASK) << TID_OPFN_MAX_LEN_SHIFT) |
		(((u64)p->timeout & TID_OPFN_TIMEOUT_MASK) <<
			TID_OPFN_TIMEOUT_SHIFT) |
		(((u64)p->urg & TID_OPFN_URG_MASK) << TID_OPFN_URG_SHIFT) |
		(((u64)p->jkey & TID_OPFN_JKEY_MASK) << TID_OPFN_JKEY_SHIFT) |
		(((u64)p->max_read & TID_OPFN_MAX_READ_MASK) <<
			TID_OPFN_MAX_READ_SHIFT) |
		(((u64)p->max_write & TID_OPFN_MAX_WRITE_MASK) <<
			TID_OPFN_MAX_WRITE_SHIFT);
}
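/*
 * Worked example (illustrative only): with 4K pages, a local max_len of
 * 256K is packed into the M field as (256K >> PAGE_SHIFT) - 1 = 63 and is
 * recovered as (63 + 1) << PAGE_SHIFT = 256K by tid_rdma_opfn_decode()
 * below.  The context-number and KDETH-QP halves of p->qp are split into
 * the N and K fields the same way and rejoined on the decode side; the
 * reserved and capcode bits are not carried by the encode/decode pair.
 */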

static void tid_rdma_opfn_decode(struct tid_rdma_params *p, u64 data)
{
	p->max_len = (((data >> TID_OPFN_MAX_LEN_SHIFT) &
		       TID_OPFN_MAX_LEN_MASK) + 1) << PAGE_SHIFT;
	p->jkey = (data >> TID_OPFN_JKEY_SHIFT) & TID_OPFN_JKEY_MASK;
	p->max_write = (data >> TID_OPFN_MAX_WRITE_SHIFT) &
		TID_OPFN_MAX_WRITE_MASK;
	p->max_read = (data >> TID_OPFN_MAX_READ_SHIFT) &
		TID_OPFN_MAX_READ_MASK;
	p->qp =
		((((data >> TID_OPFN_QP_KDETH_SHIFT) & TID_OPFN_QP_KDETH_MASK)
			<< 16) |
		 ((data >> TID_OPFN_QP_CTXT_SHIFT) & TID_OPFN_QP_CTXT_MASK));
	p->urg = (data >> TID_OPFN_URG_SHIFT) & TID_OPFN_URG_MASK;
	p->timeout = (data >> TID_OPFN_TIMEOUT_SHIFT) & TID_OPFN_TIMEOUT_MASK;
}

void tid_rdma_opfn_init(struct rvt_qp *qp, struct tid_rdma_params *p)
{
	struct hfi1_qp_priv *priv = qp->priv;

	p->qp = (kdeth_qp << 16) | priv->rcd->ctxt;
	p->max_len = TID_RDMA_MAX_SEGMENT_SIZE;
	p->jkey = priv->rcd->jkey;
	p->max_read = TID_RDMA_MAX_READ_SEGS_PER_REQ;
	p->max_write = TID_RDMA_MAX_WRITE_SEGS_PER_REQ;
	p->timeout = qp->timeout;
	p->urg = is_urg_masked(priv->rcd);
}

bool tid_rdma_conn_req(struct rvt_qp *qp, u64 *data)
{
	struct hfi1_qp_priv *priv = qp->priv;

	*data = tid_rdma_opfn_encode(&priv->tid_rdma.local);
	return true;
}

bool tid_rdma_conn_reply(struct rvt_qp *qp, u64 data)
{
	struct hfi1_qp_priv *priv = qp->priv;
	struct tid_rdma_params *remote, *old;
	bool ret = true;

	old = rcu_dereference_protected(priv->tid_rdma.remote,
					lockdep_is_held(&priv->opfn.lock));
	data &= ~0xfULL;
	/*
	 * If data passed in is zero, return true so as not to continue the
	 * negotiation process
	 */
	if (!data || !HFI1_CAP_IS_KSET(TID_RDMA))
		goto null;
	/*
	 * If kzalloc fails, return false. This will result in:
	 * * at the requester a new OPFN request being generated to retry
	 *   the negotiation
	 * * at the responder, 0 being returned to the requester so as to
	 *   disable TID RDMA at both the requester and the responder
	 */
	remote = kzalloc(sizeof(*remote), GFP_ATOMIC);
	if (!remote) {
		ret = false;
		goto null;
	}

	tid_rdma_opfn_decode(remote, data);
	priv->tid_timer_timeout_jiffies =
		usecs_to_jiffies((((4096UL * (1UL << remote->timeout)) /
				   1000UL) << 3) * 7);
	trace_hfi1_opfn_param(qp, 0, &priv->tid_rdma.local);
	trace_hfi1_opfn_param(qp, 1, remote);
	rcu_assign_pointer(priv->tid_rdma.remote, remote);
	/*
	 * A TID RDMA READ request's segment size is not equal to
	 * remote->max_len only when the request's data length is smaller
	 * than remote->max_len. In that case, there will be only one segment.
	 * Therefore, when priv->pkts_ps is used to calculate req->cur_seg
	 * during retry, it will lead to req->cur_seg = 0, which is exactly
	 * what is expected.
238 */ 239 priv->pkts_ps = (u16)rvt_div_mtu(qp, remote->max_len); 240 priv->timeout_shift = ilog2(priv->pkts_ps - 1) + 1; 241 goto free; 242 null: 243 RCU_INIT_POINTER(priv->tid_rdma.remote, NULL); 244 priv->timeout_shift = 0; 245 free: 246 if (old) 247 kfree_rcu(old, rcu_head); 248 return ret; 249 } 250 251 bool tid_rdma_conn_resp(struct rvt_qp *qp, u64 *data) 252 { 253 bool ret; 254 255 ret = tid_rdma_conn_reply(qp, *data); 256 *data = 0; 257 /* 258 * If tid_rdma_conn_reply() returns error, set *data as 0 to indicate 259 * TID RDMA could not be enabled. This will result in TID RDMA being 260 * disabled at the requester too. 261 */ 262 if (ret) 263 (void)tid_rdma_conn_req(qp, data); 264 return ret; 265 } 266 267 void tid_rdma_conn_error(struct rvt_qp *qp) 268 { 269 struct hfi1_qp_priv *priv = qp->priv; 270 struct tid_rdma_params *old; 271 272 old = rcu_dereference_protected(priv->tid_rdma.remote, 273 lockdep_is_held(&priv->opfn.lock)); 274 RCU_INIT_POINTER(priv->tid_rdma.remote, NULL); 275 if (old) 276 kfree_rcu(old, rcu_head); 277 } 278 279 /* This is called at context initialization time */ 280 int hfi1_kern_exp_rcv_init(struct hfi1_ctxtdata *rcd, int reinit) 281 { 282 if (reinit) 283 return 0; 284 285 BUILD_BUG_ON(TID_RDMA_JKEY < HFI1_KERNEL_MIN_JKEY); 286 BUILD_BUG_ON(TID_RDMA_JKEY > HFI1_KERNEL_MAX_JKEY); 287 rcd->jkey = TID_RDMA_JKEY; 288 hfi1_set_ctxt_jkey(rcd->dd, rcd, rcd->jkey); 289 return hfi1_alloc_ctxt_rcv_groups(rcd); 290 } 291 292 /** 293 * qp_to_rcd - determine the receive context used by a qp 294 * @qp - the qp 295 * 296 * This routine returns the receive context associated 297 * with a a qp's qpn. 298 * 299 * Returns the context. 300 */ 301 static struct hfi1_ctxtdata *qp_to_rcd(struct rvt_dev_info *rdi, 302 struct rvt_qp *qp) 303 { 304 struct hfi1_ibdev *verbs_dev = container_of(rdi, 305 struct hfi1_ibdev, 306 rdi); 307 struct hfi1_devdata *dd = container_of(verbs_dev, 308 struct hfi1_devdata, 309 verbs_dev); 310 unsigned int ctxt; 311 312 if (qp->ibqp.qp_num == 0) 313 ctxt = 0; 314 else 315 ctxt = hfi1_get_qp_map(dd, qp->ibqp.qp_num >> dd->qos_shift); 316 return dd->rcd[ctxt]; 317 } 318 319 int hfi1_qp_priv_init(struct rvt_dev_info *rdi, struct rvt_qp *qp, 320 struct ib_qp_init_attr *init_attr) 321 { 322 struct hfi1_qp_priv *qpriv = qp->priv; 323 int i, ret; 324 325 qpriv->rcd = qp_to_rcd(rdi, qp); 326 327 spin_lock_init(&qpriv->opfn.lock); 328 INIT_WORK(&qpriv->opfn.opfn_work, opfn_send_conn_request); 329 INIT_WORK(&qpriv->tid_rdma.trigger_work, tid_rdma_trigger_resume); 330 qpriv->flow_state.psn = 0; 331 qpriv->flow_state.index = RXE_NUM_TID_FLOWS; 332 qpriv->flow_state.last_index = RXE_NUM_TID_FLOWS; 333 qpriv->flow_state.generation = KERN_GENERATION_RESERVED; 334 qpriv->s_state = TID_OP(WRITE_RESP); 335 qpriv->s_tid_cur = HFI1_QP_WQE_INVALID; 336 qpriv->s_tid_head = HFI1_QP_WQE_INVALID; 337 qpriv->s_tid_tail = HFI1_QP_WQE_INVALID; 338 qpriv->rnr_nak_state = TID_RNR_NAK_INIT; 339 qpriv->r_tid_head = HFI1_QP_WQE_INVALID; 340 qpriv->r_tid_tail = HFI1_QP_WQE_INVALID; 341 qpriv->r_tid_ack = HFI1_QP_WQE_INVALID; 342 qpriv->r_tid_alloc = HFI1_QP_WQE_INVALID; 343 atomic_set(&qpriv->n_requests, 0); 344 atomic_set(&qpriv->n_tid_requests, 0); 345 timer_setup(&qpriv->s_tid_timer, hfi1_tid_timeout, 0); 346 timer_setup(&qpriv->s_tid_retry_timer, hfi1_tid_retry_timeout, 0); 347 INIT_LIST_HEAD(&qpriv->tid_wait); 348 349 if (init_attr->qp_type == IB_QPT_RC && HFI1_CAP_IS_KSET(TID_RDMA)) { 350 struct hfi1_devdata *dd = qpriv->rcd->dd; 351 352 qpriv->pages = 
			kzalloc_node(TID_RDMA_MAX_PAGES *
				     sizeof(*qpriv->pages),
				     GFP_KERNEL, dd->node);
		if (!qpriv->pages)
			return -ENOMEM;
		for (i = 0; i < qp->s_size; i++) {
			struct hfi1_swqe_priv *priv;
			struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, i);

			priv = kzalloc_node(sizeof(*priv), GFP_KERNEL,
					    dd->node);
			if (!priv)
				return -ENOMEM;

			hfi1_init_trdma_req(qp, &priv->tid_req);
			priv->tid_req.e.swqe = wqe;
			wqe->priv = priv;
		}
		for (i = 0; i < rvt_max_atomic(rdi); i++) {
			struct hfi1_ack_priv *priv;

			priv = kzalloc_node(sizeof(*priv), GFP_KERNEL,
					    dd->node);
			if (!priv)
				return -ENOMEM;

			hfi1_init_trdma_req(qp, &priv->tid_req);
			priv->tid_req.e.ack = &qp->s_ack_queue[i];

			ret = hfi1_kern_exp_rcv_alloc_flows(&priv->tid_req,
							    GFP_KERNEL);
			if (ret) {
				kfree(priv);
				return ret;
			}
			qp->s_ack_queue[i].priv = priv;
		}
	}

	return 0;
}

void hfi1_qp_priv_tid_free(struct rvt_dev_info *rdi, struct rvt_qp *qp)
{
	struct hfi1_qp_priv *qpriv = qp->priv;
	struct rvt_swqe *wqe;
	u32 i;

	if (qp->ibqp.qp_type == IB_QPT_RC && HFI1_CAP_IS_KSET(TID_RDMA)) {
		for (i = 0; i < qp->s_size; i++) {
			wqe = rvt_get_swqe_ptr(qp, i);
			kfree(wqe->priv);
			wqe->priv = NULL;
		}
		for (i = 0; i < rvt_max_atomic(rdi); i++) {
			struct hfi1_ack_priv *priv = qp->s_ack_queue[i].priv;

			if (priv)
				hfi1_kern_exp_rcv_free_flows(&priv->tid_req);
			kfree(priv);
			qp->s_ack_queue[i].priv = NULL;
		}
		cancel_work_sync(&qpriv->opfn.opfn_work);
		kfree(qpriv->pages);
		qpriv->pages = NULL;
	}
}

/* Flow and tid waiter functions */
/**
 * DOC: lock ordering
 *
 * There are two locks involved with the queuing
 * routines: the qp s_lock and the exp_lock.
 *
 * Since the tid space allocation is called from
 * the send engine, the qp s_lock is already held.
 *
 * The allocation routines will get the exp_lock.
 *
 * The first_qp() call is provided to allow the head of
 * the rcd wait queue to be fetched under the exp_lock and
 * followed by a drop of the exp_lock.
 *
 * Any qp in the wait list will have the qp reference count held
 * to hold the qp in memory.
 */
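/*
 * Sketch of the resulting call pattern (mirrors hfi1_kern_setup_hw_flow()
 * and hfi1_kern_exp_rcv_setup() below); names in <> are placeholders:
 *
 *	// qp->s_lock is already held by the send engine
 *	spin_lock_irqsave(&rcd->exp_lock, flags);
 *	if (kernel_tid_waiters(rcd, queue, qp) || <allocation fails>) {
 *		queue_qp_for_tid_wait(rcd, queue, qp);
 *		spin_unlock_irqrestore(&rcd->exp_lock, flags);
 *		return -EAGAIN;
 *	}
 *	<program the allocated resources>
 *	dequeue_tid_waiter(rcd, queue, qp);
 *	fqp = first_qp(rcd, queue);	// takes a reference; may be NULL
 *	spin_unlock_irqrestore(&rcd->exp_lock, flags);
 *	tid_rdma_schedule_tid_wakeup(fqp); // reference dropped here or by
 *					   // the trigger work item
 */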
483 */ 484 static bool kernel_tid_waiters(struct hfi1_ctxtdata *rcd, 485 struct tid_queue *queue, struct rvt_qp *qp) 486 __must_hold(&rcd->exp_lock) __must_hold(&qp->s_lock) 487 { 488 struct rvt_qp *fqp; 489 bool ret = true; 490 491 lockdep_assert_held(&qp->s_lock); 492 lockdep_assert_held(&rcd->exp_lock); 493 fqp = first_qp(rcd, queue); 494 if (!fqp || (fqp == qp && (qp->s_flags & HFI1_S_WAIT_TID_SPACE))) 495 ret = false; 496 rvt_put_qp(fqp); 497 return ret; 498 } 499 500 /** 501 * dequeue_tid_waiter - dequeue the qp from the list 502 * @qp - the qp to remove the wait list 503 * 504 * This routine removes the indicated qp from the 505 * wait list if it is there. 506 * 507 * This should be done after the hardware flow and 508 * tid array resources have been allocated. 509 * 510 * Must hold the qp s_lock and the rcd exp_lock. 511 * 512 * It assumes the s_lock to protect the s_flags 513 * field and to reliably test the HFI1_S_WAIT_TID_SPACE flag. 514 */ 515 static void dequeue_tid_waiter(struct hfi1_ctxtdata *rcd, 516 struct tid_queue *queue, struct rvt_qp *qp) 517 __must_hold(&rcd->exp_lock) __must_hold(&qp->s_lock) 518 { 519 struct hfi1_qp_priv *priv = qp->priv; 520 521 lockdep_assert_held(&qp->s_lock); 522 lockdep_assert_held(&rcd->exp_lock); 523 if (list_empty(&priv->tid_wait)) 524 return; 525 list_del_init(&priv->tid_wait); 526 qp->s_flags &= ~HFI1_S_WAIT_TID_SPACE; 527 queue->dequeue++; 528 rvt_put_qp(qp); 529 } 530 531 /** 532 * queue_qp_for_tid_wait - suspend QP on tid space 533 * @rcd: the receive context 534 * @qp: the qp 535 * 536 * The qp is inserted at the tail of the rcd 537 * wait queue and the HFI1_S_WAIT_TID_SPACE s_flag is set. 538 * 539 * Must hold the qp s_lock and the exp_lock. 540 */ 541 static void queue_qp_for_tid_wait(struct hfi1_ctxtdata *rcd, 542 struct tid_queue *queue, struct rvt_qp *qp) 543 __must_hold(&rcd->exp_lock) __must_hold(&qp->s_lock) 544 { 545 struct hfi1_qp_priv *priv = qp->priv; 546 547 lockdep_assert_held(&qp->s_lock); 548 lockdep_assert_held(&rcd->exp_lock); 549 if (list_empty(&priv->tid_wait)) { 550 qp->s_flags |= HFI1_S_WAIT_TID_SPACE; 551 list_add_tail(&priv->tid_wait, &queue->queue_head); 552 priv->tid_enqueue = ++queue->enqueue; 553 rcd->dd->verbs_dev.n_tidwait++; 554 trace_hfi1_qpsleep(qp, HFI1_S_WAIT_TID_SPACE); 555 rvt_get_qp(qp); 556 } 557 } 558 559 /** 560 * __trigger_tid_waiter - trigger tid waiter 561 * @qp: the qp 562 * 563 * This is a private entrance to schedule the qp 564 * assuming the caller is holding the qp->s_lock. 565 */ 566 static void __trigger_tid_waiter(struct rvt_qp *qp) 567 __must_hold(&qp->s_lock) 568 { 569 lockdep_assert_held(&qp->s_lock); 570 if (!(qp->s_flags & HFI1_S_WAIT_TID_SPACE)) 571 return; 572 trace_hfi1_qpwakeup(qp, HFI1_S_WAIT_TID_SPACE); 573 hfi1_schedule_send(qp); 574 } 575 576 /** 577 * tid_rdma_schedule_tid_wakeup - schedule wakeup for a qp 578 * @qp - the qp 579 * 580 * trigger a schedule or a waiting qp in a deadlock 581 * safe manner. The qp reference is held prior 582 * to this call via first_qp(). 583 * 584 * If the qp trigger was already scheduled (!rval) 585 * the the reference is dropped, otherwise the resume 586 * or the destroy cancel will dispatch the reference. 
587 */ 588 static void tid_rdma_schedule_tid_wakeup(struct rvt_qp *qp) 589 { 590 struct hfi1_qp_priv *priv; 591 struct hfi1_ibport *ibp; 592 struct hfi1_pportdata *ppd; 593 struct hfi1_devdata *dd; 594 bool rval; 595 596 if (!qp) 597 return; 598 599 priv = qp->priv; 600 ibp = to_iport(qp->ibqp.device, qp->port_num); 601 ppd = ppd_from_ibp(ibp); 602 dd = dd_from_ibdev(qp->ibqp.device); 603 604 rval = queue_work_on(priv->s_sde ? 605 priv->s_sde->cpu : 606 cpumask_first(cpumask_of_node(dd->node)), 607 ppd->hfi1_wq, 608 &priv->tid_rdma.trigger_work); 609 if (!rval) 610 rvt_put_qp(qp); 611 } 612 613 /** 614 * tid_rdma_trigger_resume - field a trigger work request 615 * @work - the work item 616 * 617 * Complete the off qp trigger processing by directly 618 * calling the progress routine. 619 */ 620 static void tid_rdma_trigger_resume(struct work_struct *work) 621 { 622 struct tid_rdma_qp_params *tr; 623 struct hfi1_qp_priv *priv; 624 struct rvt_qp *qp; 625 626 tr = container_of(work, struct tid_rdma_qp_params, trigger_work); 627 priv = container_of(tr, struct hfi1_qp_priv, tid_rdma); 628 qp = priv->owner; 629 spin_lock_irq(&qp->s_lock); 630 if (qp->s_flags & HFI1_S_WAIT_TID_SPACE) { 631 spin_unlock_irq(&qp->s_lock); 632 hfi1_do_send(priv->owner, true); 633 } else { 634 spin_unlock_irq(&qp->s_lock); 635 } 636 rvt_put_qp(qp); 637 } 638 639 /** 640 * tid_rdma_flush_wait - unwind any tid space wait 641 * 642 * This is called when resetting a qp to 643 * allow a destroy or reset to get rid 644 * of any tid space linkage and reference counts. 645 */ 646 static void _tid_rdma_flush_wait(struct rvt_qp *qp, struct tid_queue *queue) 647 __must_hold(&qp->s_lock) 648 { 649 struct hfi1_qp_priv *priv; 650 651 if (!qp) 652 return; 653 lockdep_assert_held(&qp->s_lock); 654 priv = qp->priv; 655 qp->s_flags &= ~HFI1_S_WAIT_TID_SPACE; 656 spin_lock(&priv->rcd->exp_lock); 657 if (!list_empty(&priv->tid_wait)) { 658 list_del_init(&priv->tid_wait); 659 qp->s_flags &= ~HFI1_S_WAIT_TID_SPACE; 660 queue->dequeue++; 661 rvt_put_qp(qp); 662 } 663 spin_unlock(&priv->rcd->exp_lock); 664 } 665 666 void hfi1_tid_rdma_flush_wait(struct rvt_qp *qp) 667 __must_hold(&qp->s_lock) 668 { 669 struct hfi1_qp_priv *priv = qp->priv; 670 671 _tid_rdma_flush_wait(qp, &priv->rcd->flow_queue); 672 _tid_rdma_flush_wait(qp, &priv->rcd->rarr_queue); 673 } 674 675 /* Flow functions */ 676 /** 677 * kern_reserve_flow - allocate a hardware flow 678 * @rcd - the context to use for allocation 679 * @last - the index of the preferred flow. Use RXE_NUM_TID_FLOWS to 680 * signify "don't care". 681 * 682 * Use a bit mask based allocation to reserve a hardware 683 * flow for use in receiving KDETH data packets. If a preferred flow is 684 * specified the function will attempt to reserve that flow again, if 685 * available. 686 * 687 * The exp_lock must be held. 
688 * 689 * Return: 690 * On success: a value postive value between 0 and RXE_NUM_TID_FLOWS - 1 691 * On failure: -EAGAIN 692 */ 693 static int kern_reserve_flow(struct hfi1_ctxtdata *rcd, int last) 694 __must_hold(&rcd->exp_lock) 695 { 696 int nr; 697 698 /* Attempt to reserve the preferred flow index */ 699 if (last >= 0 && last < RXE_NUM_TID_FLOWS && 700 !test_and_set_bit(last, &rcd->flow_mask)) 701 return last; 702 703 nr = ffz(rcd->flow_mask); 704 BUILD_BUG_ON(RXE_NUM_TID_FLOWS >= 705 (sizeof(rcd->flow_mask) * BITS_PER_BYTE)); 706 if (nr > (RXE_NUM_TID_FLOWS - 1)) 707 return -EAGAIN; 708 set_bit(nr, &rcd->flow_mask); 709 return nr; 710 } 711 712 static void kern_set_hw_flow(struct hfi1_ctxtdata *rcd, u32 generation, 713 u32 flow_idx) 714 { 715 u64 reg; 716 717 reg = ((u64)generation << HFI1_KDETH_BTH_SEQ_SHIFT) | 718 RCV_TID_FLOW_TABLE_CTRL_FLOW_VALID_SMASK | 719 RCV_TID_FLOW_TABLE_CTRL_KEEP_AFTER_SEQ_ERR_SMASK | 720 RCV_TID_FLOW_TABLE_CTRL_KEEP_ON_GEN_ERR_SMASK | 721 RCV_TID_FLOW_TABLE_STATUS_SEQ_MISMATCH_SMASK | 722 RCV_TID_FLOW_TABLE_STATUS_GEN_MISMATCH_SMASK; 723 724 if (generation != KERN_GENERATION_RESERVED) 725 reg |= RCV_TID_FLOW_TABLE_CTRL_HDR_SUPP_EN_SMASK; 726 727 write_uctxt_csr(rcd->dd, rcd->ctxt, 728 RCV_TID_FLOW_TABLE + 8 * flow_idx, reg); 729 } 730 731 static u32 kern_setup_hw_flow(struct hfi1_ctxtdata *rcd, u32 flow_idx) 732 __must_hold(&rcd->exp_lock) 733 { 734 u32 generation = rcd->flows[flow_idx].generation; 735 736 kern_set_hw_flow(rcd, generation, flow_idx); 737 return generation; 738 } 739 740 static u32 kern_flow_generation_next(u32 gen) 741 { 742 u32 generation = mask_generation(gen + 1); 743 744 if (generation == KERN_GENERATION_RESERVED) 745 generation = mask_generation(generation + 1); 746 return generation; 747 } 748 749 static void kern_clear_hw_flow(struct hfi1_ctxtdata *rcd, u32 flow_idx) 750 __must_hold(&rcd->exp_lock) 751 { 752 rcd->flows[flow_idx].generation = 753 kern_flow_generation_next(rcd->flows[flow_idx].generation); 754 kern_set_hw_flow(rcd, KERN_GENERATION_RESERVED, flow_idx); 755 } 756 757 int hfi1_kern_setup_hw_flow(struct hfi1_ctxtdata *rcd, struct rvt_qp *qp) 758 { 759 struct hfi1_qp_priv *qpriv = (struct hfi1_qp_priv *)qp->priv; 760 struct tid_flow_state *fs = &qpriv->flow_state; 761 struct rvt_qp *fqp; 762 unsigned long flags; 763 int ret = 0; 764 765 /* The QP already has an allocated flow */ 766 if (fs->index != RXE_NUM_TID_FLOWS) 767 return ret; 768 769 spin_lock_irqsave(&rcd->exp_lock, flags); 770 if (kernel_tid_waiters(rcd, &rcd->flow_queue, qp)) 771 goto queue; 772 773 ret = kern_reserve_flow(rcd, fs->last_index); 774 if (ret < 0) 775 goto queue; 776 fs->index = ret; 777 fs->last_index = fs->index; 778 779 /* Generation received in a RESYNC overrides default flow generation */ 780 if (fs->generation != KERN_GENERATION_RESERVED) 781 rcd->flows[fs->index].generation = fs->generation; 782 fs->generation = kern_setup_hw_flow(rcd, fs->index); 783 fs->psn = 0; 784 dequeue_tid_waiter(rcd, &rcd->flow_queue, qp); 785 /* get head before dropping lock */ 786 fqp = first_qp(rcd, &rcd->flow_queue); 787 spin_unlock_irqrestore(&rcd->exp_lock, flags); 788 789 tid_rdma_schedule_tid_wakeup(fqp); 790 return 0; 791 queue: 792 queue_qp_for_tid_wait(rcd, &rcd->flow_queue, qp); 793 spin_unlock_irqrestore(&rcd->exp_lock, flags); 794 return -EAGAIN; 795 } 796 797 void hfi1_kern_clear_hw_flow(struct hfi1_ctxtdata *rcd, struct rvt_qp *qp) 798 { 799 struct hfi1_qp_priv *qpriv = (struct hfi1_qp_priv *)qp->priv; 800 struct tid_flow_state *fs = 
		&qpriv->flow_state;
	struct rvt_qp *fqp;
	unsigned long flags;

	if (fs->index >= RXE_NUM_TID_FLOWS)
		return;
	spin_lock_irqsave(&rcd->exp_lock, flags);
	kern_clear_hw_flow(rcd, fs->index);
	clear_bit(fs->index, &rcd->flow_mask);
	fs->index = RXE_NUM_TID_FLOWS;
	fs->psn = 0;
	fs->generation = KERN_GENERATION_RESERVED;

	/* get head before dropping lock */
	fqp = first_qp(rcd, &rcd->flow_queue);
	spin_unlock_irqrestore(&rcd->exp_lock, flags);

	if (fqp == qp) {
		__trigger_tid_waiter(fqp);
		rvt_put_qp(fqp);
	} else {
		tid_rdma_schedule_tid_wakeup(fqp);
	}
}

void hfi1_kern_init_ctxt_generations(struct hfi1_ctxtdata *rcd)
{
	int i;

	for (i = 0; i < RXE_NUM_TID_FLOWS; i++) {
		rcd->flows[i].generation = mask_generation(prandom_u32());
		kern_set_hw_flow(rcd, KERN_GENERATION_RESERVED, i);
	}
}

/* TID allocation functions */
static u8 trdma_pset_order(struct tid_rdma_pageset *s)
{
	u8 count = s->count;

	return ilog2(count) + 1;
}

/**
 * tid_rdma_find_phys_blocks_4k - get groups based on mr info
 * @npages - number of pages
 * @pages - pointer to an array of page structs
 * @list - page set array to return
 *
 * This routine returns the number of groups associated with
 * the current sge information. This implementation is based
 * on the expected receive find_phys_blocks() adjusted to
 * use the MR information vs. the pfn.
 *
 * Return:
 * the number of RcvArray entries
 */
static u32 tid_rdma_find_phys_blocks_4k(struct tid_rdma_flow *flow,
					struct page **pages,
					u32 npages,
					struct tid_rdma_pageset *list)
{
	u32 pagecount, pageidx, setcount = 0, i;
	void *vaddr, *this_vaddr;

	if (!npages)
		return 0;

	/*
	 * Look for sets of physically contiguous pages in the user buffer.
	 * This will allow us to optimize Expected RcvArray entry usage by
	 * using the bigger supported sizes.
	 */
	vaddr = page_address(pages[0]);
	trace_hfi1_tid_flow_page(flow->req->qp, flow, 0, 0, 0, vaddr);
	for (pageidx = 0, pagecount = 1, i = 1; i <= npages; i++) {
		this_vaddr = i < npages ? page_address(pages[i]) : NULL;
		trace_hfi1_tid_flow_page(flow->req->qp, flow, i, 0, 0,
					 this_vaddr);
		/*
		 * If the vaddr's are not sequential, pages are not physically
		 * contiguous.
		 */
		if (this_vaddr != (vaddr + PAGE_SIZE)) {
			/*
			 * At this point we have to loop over the set of
			 * physically contiguous pages and break them down
			 * into sizes supported by the HW.
			 * There are two main constraints:
			 * 1. The max buffer size is MAX_EXPECTED_BUFFER.
			 *    If the total set size is bigger than that
			 *    program only a MAX_EXPECTED_BUFFER chunk.
			 * 2. The buffer size has to be a power of two. If
			 *    it is not, round down to the closest power of
			 *    2 and program that size.
895 */ 896 while (pagecount) { 897 int maxpages = pagecount; 898 u32 bufsize = pagecount * PAGE_SIZE; 899 900 if (bufsize > MAX_EXPECTED_BUFFER) 901 maxpages = 902 MAX_EXPECTED_BUFFER >> 903 PAGE_SHIFT; 904 else if (!is_power_of_2(bufsize)) 905 maxpages = 906 rounddown_pow_of_two(bufsize) >> 907 PAGE_SHIFT; 908 909 list[setcount].idx = pageidx; 910 list[setcount].count = maxpages; 911 trace_hfi1_tid_pageset(flow->req->qp, setcount, 912 list[setcount].idx, 913 list[setcount].count); 914 pagecount -= maxpages; 915 pageidx += maxpages; 916 setcount++; 917 } 918 pageidx = i; 919 pagecount = 1; 920 vaddr = this_vaddr; 921 } else { 922 vaddr += PAGE_SIZE; 923 pagecount++; 924 } 925 } 926 /* insure we always return an even number of sets */ 927 if (setcount & 1) 928 list[setcount++].count = 0; 929 return setcount; 930 } 931 932 /** 933 * tid_flush_pages - dump out pages into pagesets 934 * @list - list of pagesets 935 * @idx - pointer to current page index 936 * @pages - number of pages to dump 937 * @sets - current number of pagesset 938 * 939 * This routine flushes out accumuated pages. 940 * 941 * To insure an even number of sets the 942 * code may add a filler. 943 * 944 * This can happen with when pages is not 945 * a power of 2 or pages is a power of 2 946 * less than the maximum pages. 947 * 948 * Return: 949 * The new number of sets 950 */ 951 952 static u32 tid_flush_pages(struct tid_rdma_pageset *list, 953 u32 *idx, u32 pages, u32 sets) 954 { 955 while (pages) { 956 u32 maxpages = pages; 957 958 if (maxpages > MAX_EXPECTED_PAGES) 959 maxpages = MAX_EXPECTED_PAGES; 960 else if (!is_power_of_2(maxpages)) 961 maxpages = rounddown_pow_of_two(maxpages); 962 list[sets].idx = *idx; 963 list[sets++].count = maxpages; 964 *idx += maxpages; 965 pages -= maxpages; 966 } 967 /* might need a filler */ 968 if (sets & 1) 969 list[sets++].count = 0; 970 return sets; 971 } 972 973 /** 974 * tid_rdma_find_phys_blocks_8k - get groups base on mr info 975 * @pages - pointer to an array of page structs 976 * @npages - number of pages 977 * @list - page set array to return 978 * 979 * This routine parses an array of pages to compute pagesets 980 * in an 8k compatible way. 981 * 982 * pages are tested two at a time, i, i + 1 for contiguous 983 * pages and i - 1 and i contiguous pages. 984 * 985 * If any condition is false, any accumlated pages are flushed and 986 * v0,v1 are emitted as separate PAGE_SIZE pagesets 987 * 988 * Otherwise, the current 8k is totaled for a future flush. 989 * 990 * Return: 991 * The number of pagesets 992 * list set with the returned number of pagesets 993 * 994 */ 995 static u32 tid_rdma_find_phys_blocks_8k(struct tid_rdma_flow *flow, 996 struct page **pages, 997 u32 npages, 998 struct tid_rdma_pageset *list) 999 { 1000 u32 idx, sets = 0, i; 1001 u32 pagecnt = 0; 1002 void *v0, *v1, *vm1; 1003 1004 if (!npages) 1005 return 0; 1006 for (idx = 0, i = 0, vm1 = NULL; i < npages; i += 2) { 1007 /* get a new v0 */ 1008 v0 = page_address(pages[i]); 1009 trace_hfi1_tid_flow_page(flow->req->qp, flow, i, 1, 0, v0); 1010 v1 = i + 1 < npages ? 
				page_address(pages[i + 1]) : NULL;
		trace_hfi1_tid_flow_page(flow->req->qp, flow, i, 1, 1, v1);
		/* compare i, i + 1 vaddr */
		if (v1 != (v0 + PAGE_SIZE)) {
			/* flush out pages */
			sets = tid_flush_pages(list, &idx, pagecnt, sets);
			/* output v0,v1 as two pagesets */
			list[sets].idx = idx++;
			list[sets++].count = 1;
			if (v1) {
				list[sets].count = 1;
				list[sets++].idx = idx++;
			} else {
				list[sets++].count = 0;
			}
			vm1 = NULL;
			pagecnt = 0;
			continue;
		}
		/* i,i+1 consecutive, look at i-1,i */
		if (vm1 && v0 != (vm1 + PAGE_SIZE)) {
			/* flush out pages */
			sets = tid_flush_pages(list, &idx, pagecnt, sets);
			pagecnt = 0;
		}
		/* pages will always be a multiple of 8k */
		pagecnt += 2;
		/* save i-1 */
		vm1 = v1;
		/* move to next pair */
	}
	/* dump residual pages at end */
	sets = tid_flush_pages(list, &idx, npages - idx, sets);
	/* by design cannot be odd sets */
	WARN_ON(sets & 1);
	return sets;
}

/**
 * Find pages for one segment of a sge array represented by @ss. The function
 * does not check the sge, the sge must have been checked for alignment with a
 * prior call to hfi1_kern_trdma_ok. Other sge checking is done as part of
 * rvt_lkey_ok and rvt_rkey_ok. Also, the function only modifies the local sge
 * copy maintained in @ss->sge, the original sge is not modified.
 *
 * Unlike IB RDMA WRITE, we can't decrement ss->num_sge here because we are not
 * releasing the MR reference count at the same time. Otherwise, we'll "leak"
 * references to the MR. This difference requires that we keep track of progress
 * into the sg_list. This is done by the cur_seg cursor in the tid_rdma_request
 * structure.
 */
static u32 kern_find_pages(struct tid_rdma_flow *flow,
			   struct page **pages,
			   struct rvt_sge_state *ss, bool *last)
{
	struct tid_rdma_request *req = flow->req;
	struct rvt_sge *sge = &ss->sge;
	u32 length = flow->req->seg_len;
	u32 len = PAGE_SIZE;
	u32 i = 0;

	while (length && req->isge < ss->num_sge) {
		pages[i++] = virt_to_page(sge->vaddr);

		sge->vaddr += len;
		sge->length -= len;
		sge->sge_length -= len;
		if (!sge->sge_length) {
			if (++req->isge < ss->num_sge)
				*sge = ss->sg_list[req->isge - 1];
		} else if (sge->length == 0 && sge->mr->lkey) {
			if (++sge->n >= RVT_SEGSZ) {
				++sge->m;
				sge->n = 0;
			}
			sge->vaddr = sge->mr->map[sge->m]->segs[sge->n].vaddr;
			sge->length = sge->mr->map[sge->m]->segs[sge->n].length;
		}
		length -= len;
	}

	flow->length = flow->req->seg_len - length;
	*last = req->isge == ss->num_sge ?
		false : true;
	return i;
}

static void dma_unmap_flow(struct tid_rdma_flow *flow)
{
	struct hfi1_devdata *dd;
	int i;
	struct tid_rdma_pageset *pset;

	dd = flow->req->rcd->dd;
	for (i = 0, pset = &flow->pagesets[0]; i < flow->npagesets;
			i++, pset++) {
		if (pset->count && pset->addr) {
			dma_unmap_page(&dd->pcidev->dev,
				       pset->addr,
				       PAGE_SIZE * pset->count,
				       DMA_FROM_DEVICE);
			pset->mapped = 0;
		}
	}
}

static int dma_map_flow(struct tid_rdma_flow *flow, struct page **pages)
{
	int i;
	struct hfi1_devdata *dd = flow->req->rcd->dd;
	struct tid_rdma_pageset *pset;

	for (i = 0, pset = &flow->pagesets[0]; i < flow->npagesets;
			i++, pset++) {
		if (pset->count) {
			pset->addr = dma_map_page(&dd->pcidev->dev,
						  pages[pset->idx],
						  0,
						  PAGE_SIZE * pset->count,
						  DMA_FROM_DEVICE);

			if (dma_mapping_error(&dd->pcidev->dev, pset->addr)) {
				dma_unmap_flow(flow);
				return -ENOMEM;
			}
			pset->mapped = 1;
		}
	}
	return 0;
}

static inline bool dma_mapped(struct tid_rdma_flow *flow)
{
	return !!flow->pagesets[0].mapped;
}

/*
 * Get pages pointers and identify contiguous physical memory chunks for a
 * segment. All segments are of length flow->req->seg_len.
 */
static int kern_get_phys_blocks(struct tid_rdma_flow *flow,
				struct page **pages,
				struct rvt_sge_state *ss, bool *last)
{
	u8 npages;

	/* Reuse previously computed pagesets, if any */
	if (flow->npagesets) {
		trace_hfi1_tid_flow_alloc(flow->req->qp, flow->req->setup_head,
					  flow);
		if (!dma_mapped(flow))
			return dma_map_flow(flow, pages);
		return 0;
	}

	npages = kern_find_pages(flow, pages, ss, last);

	if (flow->req->qp->pmtu == enum_to_mtu(OPA_MTU_4096))
		flow->npagesets =
			tid_rdma_find_phys_blocks_4k(flow, pages, npages,
						     flow->pagesets);
	else
		flow->npagesets =
			tid_rdma_find_phys_blocks_8k(flow, pages, npages,
						     flow->pagesets);

	return dma_map_flow(flow, pages);
}

static inline void kern_add_tid_node(struct tid_rdma_flow *flow,
				     struct hfi1_ctxtdata *rcd, char *s,
				     struct tid_group *grp, u8 cnt)
{
	struct kern_tid_node *node = &flow->tnode[flow->tnode_cnt++];

	WARN_ON_ONCE(flow->tnode_cnt >=
		     (TID_RDMA_MAX_SEGMENT_SIZE >> PAGE_SHIFT));
	if (WARN_ON_ONCE(cnt & 1))
		dd_dev_err(rcd->dd,
			   "unexpected odd allocation cnt %u map 0x%x used %u",
			   cnt, grp->map, grp->used);

	node->grp = grp;
	node->map = grp->map;
	node->cnt = cnt;
	trace_hfi1_tid_node_add(flow->req->qp, s, flow->tnode_cnt - 1,
				grp->base, grp->map, grp->used, cnt);
}

/*
 * Try to allocate pageset_count TID's from TID groups for a context
 *
 * This function allocates TID's without moving groups between lists or
 * modifying grp->map. This is done as follows, being cognizant of the lists
 * between which the TID groups will move:
 * 1. First allocate complete groups of 8 TID's since this is more efficient,
 *    these groups will move from group->full without affecting used
 * 2. If more TID's are needed allocate from used (will move from used->full or
 *    stay in used)
 * 3. If we still don't have the required number of TID's go back and look
 *    again at a complete group (will move from group->used)
 */
static int kern_alloc_tids(struct tid_rdma_flow *flow)
{
	struct hfi1_ctxtdata *rcd = flow->req->rcd;
	struct hfi1_devdata *dd = rcd->dd;
	u32 ngroups, pageidx = 0;
	struct tid_group *group = NULL, *used;
	u8 use;

	flow->tnode_cnt = 0;
	ngroups = flow->npagesets / dd->rcv_entries.group_size;
	if (!ngroups)
		goto used_list;

	/* First look at complete groups */
	list_for_each_entry(group, &rcd->tid_group_list.list, list) {
		kern_add_tid_node(flow, rcd, "complete groups", group,
				  group->size);

		pageidx += group->size;
		if (!--ngroups)
			break;
	}

	if (pageidx >= flow->npagesets)
		goto ok;

used_list:
	/* Now look at partially used groups */
	list_for_each_entry(used, &rcd->tid_used_list.list, list) {
		use = min_t(u32, flow->npagesets - pageidx,
			    used->size - used->used);
		kern_add_tid_node(flow, rcd, "used groups", used, use);

		pageidx += use;
		if (pageidx >= flow->npagesets)
			goto ok;
	}

	/*
	 * Look again at a complete group, continuing from where we left.
	 * However, if we are at the head, we have reached the end of the
	 * complete groups list from the first loop above
	 */
	if (group && &group->list == &rcd->tid_group_list.list)
		goto bail_eagain;
	group = list_prepare_entry(group, &rcd->tid_group_list.list,
				   list);
	if (list_is_last(&group->list, &rcd->tid_group_list.list))
		goto bail_eagain;
	group = list_next_entry(group, list);
	use = min_t(u32, flow->npagesets - pageidx, group->size);
	kern_add_tid_node(flow, rcd, "complete continue", group, use);
	pageidx += use;
	if (pageidx >= flow->npagesets)
		goto ok;
bail_eagain:
	trace_hfi1_msg_alloc_tids(flow->req->qp, " insufficient tids: needed ",
				  (u64)flow->npagesets);
	return -EAGAIN;
ok:
	return 0;
}

static void kern_program_rcv_group(struct tid_rdma_flow *flow, int grp_num,
				   u32 *pset_idx)
{
	struct hfi1_ctxtdata *rcd = flow->req->rcd;
	struct hfi1_devdata *dd = rcd->dd;
	struct kern_tid_node *node = &flow->tnode[grp_num];
	struct tid_group *grp = node->grp;
	struct tid_rdma_pageset *pset;
	u32 pmtu_pg = flow->req->qp->pmtu >> PAGE_SHIFT;
	u32 rcventry, npages = 0, pair = 0, tidctrl;
	u8 i, cnt = 0;

	for (i = 0; i < grp->size; i++) {
		rcventry = grp->base + i;

		if (node->map & BIT(i) || cnt >= node->cnt) {
			rcv_array_wc_fill(dd, rcventry);
			continue;
		}
		pset = &flow->pagesets[(*pset_idx)++];
		if (pset->count) {
			hfi1_put_tid(dd, rcventry, PT_EXPECTED,
				     pset->addr, trdma_pset_order(pset));
		} else {
			hfi1_put_tid(dd, rcventry, PT_INVALID, 0, 0);
		}
		npages += pset->count;

		rcventry -= rcd->expected_base;
		tidctrl = pair ? 0x3 : rcventry & 0x1 ?
			0x2 : 0x1;
		/*
		 * A single TID entry will be used to use a rcvarray pair (with
		 * tidctrl 0x3), if ALL these are true (a) the bit pos is even
		 * (b) the group map shows current and the next bits as free
		 * indicating two consecutive rcvarray entries are available
		 * (c) we actually need 2 more entries
		 */
		pair = !(i & 0x1) && !((node->map >> i) & 0x3) &&
		       node->cnt >= cnt + 2;
		if (!pair) {
			if (!pset->count)
				tidctrl = 0x1;
			flow->tid_entry[flow->tidcnt++] =
				EXP_TID_SET(IDX, rcventry >> 1) |
				EXP_TID_SET(CTRL, tidctrl) |
				EXP_TID_SET(LEN, npages);
			trace_hfi1_tid_entry_alloc(/* entry */
			   flow->req->qp, flow->tidcnt - 1,
			   flow->tid_entry[flow->tidcnt - 1]);

			/* Efficient DIV_ROUND_UP(npages, pmtu_pg) */
			flow->npkts += (npages + pmtu_pg - 1) >> ilog2(pmtu_pg);
			npages = 0;
		}

		if (grp->used == grp->size - 1)
			tid_group_move(grp, &rcd->tid_used_list,
				       &rcd->tid_full_list);
		else if (!grp->used)
			tid_group_move(grp, &rcd->tid_group_list,
				       &rcd->tid_used_list);

		grp->used++;
		grp->map |= BIT(i);
		cnt++;
	}
}

static void kern_unprogram_rcv_group(struct tid_rdma_flow *flow, int grp_num)
{
	struct hfi1_ctxtdata *rcd = flow->req->rcd;
	struct hfi1_devdata *dd = rcd->dd;
	struct kern_tid_node *node = &flow->tnode[grp_num];
	struct tid_group *grp = node->grp;
	u32 rcventry;
	u8 i, cnt = 0;

	for (i = 0; i < grp->size; i++) {
		rcventry = grp->base + i;

		if (node->map & BIT(i) || cnt >= node->cnt) {
			rcv_array_wc_fill(dd, rcventry);
			continue;
		}

		hfi1_put_tid(dd, rcventry, PT_INVALID, 0, 0);

		grp->used--;
		grp->map &= ~BIT(i);
		cnt++;

		if (grp->used == grp->size - 1)
			tid_group_move(grp, &rcd->tid_full_list,
				       &rcd->tid_used_list);
		else if (!grp->used)
			tid_group_move(grp, &rcd->tid_used_list,
				       &rcd->tid_group_list);
	}
	if (WARN_ON_ONCE(cnt & 1)) {
		struct hfi1_ctxtdata *rcd = flow->req->rcd;
		struct hfi1_devdata *dd = rcd->dd;

		dd_dev_err(dd, "unexpected odd free cnt %u map 0x%x used %u",
			   cnt, grp->map, grp->used);
	}
}

static void kern_program_rcvarray(struct tid_rdma_flow *flow)
{
	u32 pset_idx = 0;
	int i;

	flow->npkts = 0;
	flow->tidcnt = 0;
	for (i = 0; i < flow->tnode_cnt; i++)
		kern_program_rcv_group(flow, i, &pset_idx);
	trace_hfi1_tid_flow_alloc(flow->req->qp, flow->req->setup_head, flow);
}

/**
 * hfi1_kern_exp_rcv_setup() - setup TID's and flow for one segment of a
 *			       TID RDMA request
 *
 * @req: TID RDMA request for which the segment/flow is being set up
 * @ss: sge state, maintains state across successive segments of a sge
 * @last: set to true after the last sge segment has been processed
 *
 * This function
 * (1) finds a free flow entry in the flow circular buffer
 * (2) finds pages and contiguous physical chunks constituting one segment
 *     of an sge
 * (3) allocates TID group entries for those chunks
 * (4) programs rcvarray entries in the hardware corresponding to those
 *     TID's
 * (5) computes a tidarray with formatted TID entries which can be sent
 *     to the sender
 * (6) Reserves and programs HW flows.
 * (7) It also manages queuing the QP when TID/flow resources are not
 *     available.
 *
 * @req points to struct tid_rdma_request of which the segments are a part. The
 * function uses qp, rcd and seg_len members of @req. In the absence of errors,
 * req->flow_idx is the index of the flow which has been prepared in this
 * invocation of function call. With flow = &req->flows[req->flow_idx],
 * flow->tid_entry contains the TID array which the sender can use for TID RDMA
 * sends and flow->npkts contains number of packets required to send the
 * segment.
 *
 * hfi1_check_sge_align should be called prior to calling this function and if
 * it signals error TID RDMA cannot be used for this sge and this function
 * should not be called.
 *
 * For the queuing, caller must hold the flow->req->qp s_lock from the send
 * engine and the function will procure the exp_lock.
 *
 * Return:
 * The function returns -EAGAIN if sufficient number of TID/flow resources to
 * map the segment could not be allocated. In this case the function should be
 * called again with previous arguments to retry the TID allocation. There are
 * no other error returns. The function returns 0 on success.
 */
int hfi1_kern_exp_rcv_setup(struct tid_rdma_request *req,
			    struct rvt_sge_state *ss, bool *last)
	__must_hold(&req->qp->s_lock)
{
	struct tid_rdma_flow *flow = &req->flows[req->setup_head];
	struct hfi1_ctxtdata *rcd = req->rcd;
	struct hfi1_qp_priv *qpriv = req->qp->priv;
	unsigned long flags;
	struct rvt_qp *fqp;
	u16 clear_tail = req->clear_tail;

	lockdep_assert_held(&req->qp->s_lock);
	/*
	 * We return error if either (a) we don't have space in the flow
	 * circular buffer, or (b) we already have max entries in the buffer.
	 * Max entries depend on the type of request we are processing and the
	 * negotiated TID RDMA parameters.
	 */
	if (!CIRC_SPACE(req->setup_head, clear_tail, MAX_FLOWS) ||
	    CIRC_CNT(req->setup_head, clear_tail, MAX_FLOWS) >=
	    req->n_flows)
		return -EINVAL;

	/*
	 * Get pages, identify contiguous physical memory chunks for the
	 * segment. If we can not determine a DMA address mapping we will
	 * treat it just like if we ran out of space above.
	 */
	if (kern_get_phys_blocks(flow, qpriv->pages, ss, last)) {
		hfi1_wait_kmem(flow->req->qp);
		return -ENOMEM;
	}

	spin_lock_irqsave(&rcd->exp_lock, flags);
	if (kernel_tid_waiters(rcd, &rcd->rarr_queue, flow->req->qp))
		goto queue;

	/*
	 * At this point we know the number of pagesets and hence the number of
	 * TID's to map the segment. Allocate the TID's from the TID groups. If
	 * we cannot allocate the required number we exit and try again later
	 */
	if (kern_alloc_tids(flow))
		goto queue;
	/*
	 * Finally program the TID entries with the pagesets, compute the
	 * tidarray and enable the HW flow
	 */
	kern_program_rcvarray(flow);

	/*
	 * Setup the flow state with relevant information.
	 * This information is used for tracking the sequence of data packets
	 * for the segment.
	 * The flow is setup here as this is the most accurate time and place
	 * to do so. Doing at a later time runs the risk of the flow data in
	 * qpriv getting out of sync.
1493 */ 1494 memset(&flow->flow_state, 0x0, sizeof(flow->flow_state)); 1495 flow->idx = qpriv->flow_state.index; 1496 flow->flow_state.generation = qpriv->flow_state.generation; 1497 flow->flow_state.spsn = qpriv->flow_state.psn; 1498 flow->flow_state.lpsn = flow->flow_state.spsn + flow->npkts - 1; 1499 flow->flow_state.r_next_psn = 1500 full_flow_psn(flow, flow->flow_state.spsn); 1501 qpriv->flow_state.psn += flow->npkts; 1502 1503 dequeue_tid_waiter(rcd, &rcd->rarr_queue, flow->req->qp); 1504 /* get head before dropping lock */ 1505 fqp = first_qp(rcd, &rcd->rarr_queue); 1506 spin_unlock_irqrestore(&rcd->exp_lock, flags); 1507 tid_rdma_schedule_tid_wakeup(fqp); 1508 1509 req->setup_head = (req->setup_head + 1) & (MAX_FLOWS - 1); 1510 return 0; 1511 queue: 1512 queue_qp_for_tid_wait(rcd, &rcd->rarr_queue, flow->req->qp); 1513 spin_unlock_irqrestore(&rcd->exp_lock, flags); 1514 return -EAGAIN; 1515 } 1516 1517 static void hfi1_tid_rdma_reset_flow(struct tid_rdma_flow *flow) 1518 { 1519 flow->npagesets = 0; 1520 } 1521 1522 /* 1523 * This function is called after one segment has been successfully sent to 1524 * release the flow and TID HW/SW resources for that segment. The segments for a 1525 * TID RDMA request are setup and cleared in FIFO order which is managed using a 1526 * circular buffer. 1527 */ 1528 int hfi1_kern_exp_rcv_clear(struct tid_rdma_request *req) 1529 __must_hold(&req->qp->s_lock) 1530 { 1531 struct tid_rdma_flow *flow = &req->flows[req->clear_tail]; 1532 struct hfi1_ctxtdata *rcd = req->rcd; 1533 unsigned long flags; 1534 int i; 1535 struct rvt_qp *fqp; 1536 1537 lockdep_assert_held(&req->qp->s_lock); 1538 /* Exit if we have nothing in the flow circular buffer */ 1539 if (!CIRC_CNT(req->setup_head, req->clear_tail, MAX_FLOWS)) 1540 return -EINVAL; 1541 1542 spin_lock_irqsave(&rcd->exp_lock, flags); 1543 1544 for (i = 0; i < flow->tnode_cnt; i++) 1545 kern_unprogram_rcv_group(flow, i); 1546 /* To prevent double unprogramming */ 1547 flow->tnode_cnt = 0; 1548 /* get head before dropping lock */ 1549 fqp = first_qp(rcd, &rcd->rarr_queue); 1550 spin_unlock_irqrestore(&rcd->exp_lock, flags); 1551 1552 dma_unmap_flow(flow); 1553 1554 hfi1_tid_rdma_reset_flow(flow); 1555 req->clear_tail = (req->clear_tail + 1) & (MAX_FLOWS - 1); 1556 1557 if (fqp == req->qp) { 1558 __trigger_tid_waiter(fqp); 1559 rvt_put_qp(fqp); 1560 } else { 1561 tid_rdma_schedule_tid_wakeup(fqp); 1562 } 1563 1564 return 0; 1565 } 1566 1567 /* 1568 * This function is called to release all the tid entries for 1569 * a request. 1570 */ 1571 void hfi1_kern_exp_rcv_clear_all(struct tid_rdma_request *req) 1572 __must_hold(&req->qp->s_lock) 1573 { 1574 /* Use memory barrier for proper ordering */ 1575 while (CIRC_CNT(req->setup_head, req->clear_tail, MAX_FLOWS)) { 1576 if (hfi1_kern_exp_rcv_clear(req)) 1577 break; 1578 } 1579 } 1580 1581 /** 1582 * hfi1_kern_exp_rcv_free_flows - free priviously allocated flow information 1583 * @req - the tid rdma request to be cleaned 1584 */ 1585 static void hfi1_kern_exp_rcv_free_flows(struct tid_rdma_request *req) 1586 { 1587 kfree(req->flows); 1588 req->flows = NULL; 1589 } 1590 1591 /** 1592 * __trdma_clean_swqe - clean up for large sized QPs 1593 * @qp: the queue patch 1594 * @wqe: the send wqe 1595 */ 1596 void __trdma_clean_swqe(struct rvt_qp *qp, struct rvt_swqe *wqe) 1597 { 1598 struct hfi1_swqe_priv *p = wqe->priv; 1599 1600 hfi1_kern_exp_rcv_free_flows(&p->tid_req); 1601 } 1602 1603 /* 1604 * This can be called at QP create time or in the data path. 
1605 */ 1606 static int hfi1_kern_exp_rcv_alloc_flows(struct tid_rdma_request *req, 1607 gfp_t gfp) 1608 { 1609 struct tid_rdma_flow *flows; 1610 int i; 1611 1612 if (likely(req->flows)) 1613 return 0; 1614 flows = kmalloc_node(MAX_FLOWS * sizeof(*flows), gfp, 1615 req->rcd->numa_id); 1616 if (!flows) 1617 return -ENOMEM; 1618 /* mini init */ 1619 for (i = 0; i < MAX_FLOWS; i++) { 1620 flows[i].req = req; 1621 flows[i].npagesets = 0; 1622 flows[i].pagesets[0].mapped = 0; 1623 } 1624 req->flows = flows; 1625 return 0; 1626 } 1627 1628 static void hfi1_init_trdma_req(struct rvt_qp *qp, 1629 struct tid_rdma_request *req) 1630 { 1631 struct hfi1_qp_priv *qpriv = qp->priv; 1632 1633 /* 1634 * Initialize various TID RDMA request variables. 1635 * These variables are "static", which is why they 1636 * can be pre-initialized here before the WRs has 1637 * even been submitted. 1638 * However, non-NULL values for these variables do not 1639 * imply that this WQE has been enabled for TID RDMA. 1640 * Drivers should check the WQE's opcode to determine 1641 * if a request is a TID RDMA one or not. 1642 */ 1643 req->qp = qp; 1644 req->rcd = qpriv->rcd; 1645 } 1646 1647 u64 hfi1_access_sw_tid_wait(const struct cntr_entry *entry, 1648 void *context, int vl, int mode, u64 data) 1649 { 1650 struct hfi1_devdata *dd = context; 1651 1652 return dd->verbs_dev.n_tidwait; 1653 } 1654 1655 static struct tid_rdma_flow *find_flow_ib(struct tid_rdma_request *req, 1656 u32 psn, u16 *fidx) 1657 { 1658 u16 head, tail; 1659 struct tid_rdma_flow *flow; 1660 1661 head = req->setup_head; 1662 tail = req->clear_tail; 1663 for ( ; CIRC_CNT(head, tail, MAX_FLOWS); 1664 tail = CIRC_NEXT(tail, MAX_FLOWS)) { 1665 flow = &req->flows[tail]; 1666 if (cmp_psn(psn, flow->flow_state.ib_spsn) >= 0 && 1667 cmp_psn(psn, flow->flow_state.ib_lpsn) <= 0) { 1668 if (fidx) 1669 *fidx = tail; 1670 return flow; 1671 } 1672 } 1673 return NULL; 1674 } 1675 1676 static struct tid_rdma_flow * 1677 __find_flow_ranged(struct tid_rdma_request *req, u16 head, u16 tail, 1678 u32 psn, u16 *fidx) 1679 { 1680 for ( ; CIRC_CNT(head, tail, MAX_FLOWS); 1681 tail = CIRC_NEXT(tail, MAX_FLOWS)) { 1682 struct tid_rdma_flow *flow = &req->flows[tail]; 1683 u32 spsn, lpsn; 1684 1685 spsn = full_flow_psn(flow, flow->flow_state.spsn); 1686 lpsn = full_flow_psn(flow, flow->flow_state.lpsn); 1687 1688 if (cmp_psn(psn, spsn) >= 0 && cmp_psn(psn, lpsn) <= 0) { 1689 if (fidx) 1690 *fidx = tail; 1691 return flow; 1692 } 1693 } 1694 return NULL; 1695 } 1696 1697 static struct tid_rdma_flow *find_flow(struct tid_rdma_request *req, 1698 u32 psn, u16 *fidx) 1699 { 1700 return __find_flow_ranged(req, req->setup_head, req->clear_tail, psn, 1701 fidx); 1702 } 1703 1704 /* TID RDMA READ functions */ 1705 u32 hfi1_build_tid_rdma_read_packet(struct rvt_swqe *wqe, 1706 struct ib_other_headers *ohdr, u32 *bth1, 1707 u32 *bth2, u32 *len) 1708 { 1709 struct tid_rdma_request *req = wqe_to_tid_req(wqe); 1710 struct tid_rdma_flow *flow = &req->flows[req->flow_idx]; 1711 struct rvt_qp *qp = req->qp; 1712 struct hfi1_qp_priv *qpriv = qp->priv; 1713 struct hfi1_swqe_priv *wpriv = wqe->priv; 1714 struct tid_rdma_read_req *rreq = &ohdr->u.tid_rdma.r_req; 1715 struct tid_rdma_params *remote; 1716 u32 req_len = 0; 1717 void *req_addr = NULL; 1718 1719 /* This is the IB psn used to send the request */ 1720 *bth2 = mask_psn(flow->flow_state.ib_spsn + flow->pkt); 1721 trace_hfi1_tid_flow_build_read_pkt(qp, req->flow_idx, flow); 1722 1723 /* TID Entries for TID RDMA READ payload */ 1724 req_addr = 
		&flow->tid_entry[flow->tid_idx];
	req_len = sizeof(*flow->tid_entry) *
			(flow->tidcnt - flow->tid_idx);

	memset(&ohdr->u.tid_rdma.r_req, 0, sizeof(ohdr->u.tid_rdma.r_req));
	wpriv->ss.sge.vaddr = req_addr;
	wpriv->ss.sge.sge_length = req_len;
	wpriv->ss.sge.length = wpriv->ss.sge.sge_length;
	/*
	 * We can safely zero these out. Since the first SGE covers the
	 * entire packet, nothing else should even look at the MR.
	 */
	wpriv->ss.sge.mr = NULL;
	wpriv->ss.sge.m = 0;
	wpriv->ss.sge.n = 0;

	wpriv->ss.sg_list = NULL;
	wpriv->ss.total_len = wpriv->ss.sge.sge_length;
	wpriv->ss.num_sge = 1;

	/* Construct the TID RDMA READ REQ packet header */
	rcu_read_lock();
	remote = rcu_dereference(qpriv->tid_rdma.remote);

	KDETH_RESET(rreq->kdeth0, KVER, 0x1);
	KDETH_RESET(rreq->kdeth1, JKEY, remote->jkey);
	rreq->reth.vaddr = cpu_to_be64(wqe->rdma_wr.remote_addr +
			   req->cur_seg * req->seg_len + flow->sent);
	rreq->reth.rkey = cpu_to_be32(wqe->rdma_wr.rkey);
	rreq->reth.length = cpu_to_be32(*len);
	rreq->tid_flow_psn =
		cpu_to_be32((flow->flow_state.generation <<
			     HFI1_KDETH_BTH_SEQ_SHIFT) |
			    ((flow->flow_state.spsn + flow->pkt) &
			     HFI1_KDETH_BTH_SEQ_MASK));
	rreq->tid_flow_qp =
		cpu_to_be32(qpriv->tid_rdma.local.qp |
			    ((flow->idx & TID_RDMA_DESTQP_FLOW_MASK) <<
			     TID_RDMA_DESTQP_FLOW_SHIFT) |
			    qpriv->rcd->ctxt);
	rreq->verbs_qp = cpu_to_be32(qp->remote_qpn);
	*bth1 &= ~RVT_QPN_MASK;
	*bth1 |= remote->qp;
	*bth2 |= IB_BTH_REQ_ACK;
	rcu_read_unlock();

	/* We are done with this segment */
	flow->sent += *len;
	req->cur_seg++;
	qp->s_state = TID_OP(READ_REQ);
	req->ack_pending++;
	req->flow_idx = (req->flow_idx + 1) & (MAX_FLOWS - 1);
	qpriv->pending_tid_r_segs++;
	qp->s_num_rd_atomic++;

	/* Set the TID RDMA READ request payload size */
	*len = req_len;

	return sizeof(ohdr->u.tid_rdma.r_req) / sizeof(u32);
}

/*
 * @len: contains the data length to read upon entry and the read request
 *       payload length upon exit.
 */
u32 hfi1_build_tid_rdma_read_req(struct rvt_qp *qp, struct rvt_swqe *wqe,
				 struct ib_other_headers *ohdr, u32 *bth1,
				 u32 *bth2, u32 *len)
	__must_hold(&qp->s_lock)
{
	struct hfi1_qp_priv *qpriv = qp->priv;
	struct tid_rdma_request *req = wqe_to_tid_req(wqe);
	struct tid_rdma_flow *flow = NULL;
	u32 hdwords = 0;
	bool last;
	bool retry = true;
	u32 npkts = rvt_div_round_up_mtu(qp, *len);

	trace_hfi1_tid_req_build_read_req(qp, 0, wqe->wr.opcode, wqe->psn,
					  wqe->lpsn, req);
	/*
	 * Check sync conditions. Make sure that there are no pending
	 * segments before freeing the flow.
	 */
sync_check:
	if (req->state == TID_REQUEST_SYNC) {
		if (qpriv->pending_tid_r_segs)
			goto done;

		hfi1_kern_clear_hw_flow(req->rcd, qp);
		qpriv->s_flags &= ~HFI1_R_TID_SW_PSN;
		req->state = TID_REQUEST_ACTIVE;
	}

	/*
	 * If the request for this segment is resent, the tid resources should
	 * have been allocated before. In this case, req->flow_idx should
	 * fall behind req->setup_head.
	 */
	if (req->flow_idx == req->setup_head) {
		retry = false;
		if (req->state == TID_REQUEST_RESEND) {
			/*
			 * This is the first new segment for a request whose
			 * earlier segments have been re-sent.
We need to 1829 * set up the sge pointer correctly. 1830 */ 1831 restart_sge(&qp->s_sge, wqe, req->s_next_psn, 1832 qp->pmtu); 1833 req->isge = 0; 1834 req->state = TID_REQUEST_ACTIVE; 1835 } 1836 1837 /* 1838 * Check sync. The last PSN of each generation is reserved for 1839 * RESYNC. 1840 */ 1841 if ((qpriv->flow_state.psn + npkts) > MAX_TID_FLOW_PSN - 1) { 1842 req->state = TID_REQUEST_SYNC; 1843 goto sync_check; 1844 } 1845 1846 /* Allocate the flow if not yet */ 1847 if (hfi1_kern_setup_hw_flow(qpriv->rcd, qp)) 1848 goto done; 1849 1850 /* 1851 * The following call will advance req->setup_head after 1852 * allocating the tid entries. 1853 */ 1854 if (hfi1_kern_exp_rcv_setup(req, &qp->s_sge, &last)) { 1855 req->state = TID_REQUEST_QUEUED; 1856 1857 /* 1858 * We don't have resources for this segment. The QP has 1859 * already been queued. 1860 */ 1861 goto done; 1862 } 1863 } 1864 1865 /* req->flow_idx should only be one slot behind req->setup_head */ 1866 flow = &req->flows[req->flow_idx]; 1867 flow->pkt = 0; 1868 flow->tid_idx = 0; 1869 flow->sent = 0; 1870 if (!retry) { 1871 /* Set the first and last IB PSN for the flow in use.*/ 1872 flow->flow_state.ib_spsn = req->s_next_psn; 1873 flow->flow_state.ib_lpsn = 1874 flow->flow_state.ib_spsn + flow->npkts - 1; 1875 } 1876 1877 /* Calculate the next segment start psn.*/ 1878 req->s_next_psn += flow->npkts; 1879 1880 /* Build the packet header */ 1881 hdwords = hfi1_build_tid_rdma_read_packet(wqe, ohdr, bth1, bth2, len); 1882 done: 1883 return hdwords; 1884 } 1885 1886 /* 1887 * Validate and accept the TID RDMA READ request parameters. 1888 * Return 0 if the request is accepted successfully; 1889 * Return 1 otherwise. 1890 */ 1891 static int tid_rdma_rcv_read_request(struct rvt_qp *qp, 1892 struct rvt_ack_entry *e, 1893 struct hfi1_packet *packet, 1894 struct ib_other_headers *ohdr, 1895 u32 bth0, u32 psn, u64 vaddr, u32 len) 1896 { 1897 struct hfi1_qp_priv *qpriv = qp->priv; 1898 struct tid_rdma_request *req; 1899 struct tid_rdma_flow *flow; 1900 u32 flow_psn, i, tidlen = 0, pktlen, tlen; 1901 1902 req = ack_to_tid_req(e); 1903 1904 /* Validate the payload first */ 1905 flow = &req->flows[req->setup_head]; 1906 1907 /* payload length = packet length - (header length + ICRC length) */ 1908 pktlen = packet->tlen - (packet->hlen + 4); 1909 if (pktlen > sizeof(flow->tid_entry)) 1910 return 1; 1911 memcpy(flow->tid_entry, packet->ebuf, pktlen); 1912 flow->tidcnt = pktlen / sizeof(*flow->tid_entry); 1913 1914 /* 1915 * Walk the TID_ENTRY list to make sure we have enough space for a 1916 * complete segment. Also calculate the number of required packets. 1917 */ 1918 flow->npkts = rvt_div_round_up_mtu(qp, len); 1919 for (i = 0; i < flow->tidcnt; i++) { 1920 trace_hfi1_tid_entry_rcv_read_req(qp, i, 1921 flow->tid_entry[i]); 1922 tlen = EXP_TID_GET(flow->tid_entry[i], LEN); 1923 if (!tlen) 1924 return 1; 1925 1926 /* 1927 * For tid pair (tidctr == 3), the buffer size of the pair 1928 * should be the sum of the buffer size described by each 1929 * tid entry. However, only the first entry needs to be 1930 * specified in the request (see WFR HAS Section 8.5.7.1). 
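 * Note that the LEN field of a TID entry is given in pages; the sum is converted to bytes below before it is compared against the requested length.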
1931 */ 1932 tidlen += tlen; 1933 } 1934 if (tidlen * PAGE_SIZE < len) 1935 return 1; 1936 1937 /* Empty the flow array */ 1938 req->clear_tail = req->setup_head; 1939 flow->pkt = 0; 1940 flow->tid_idx = 0; 1941 flow->tid_offset = 0; 1942 flow->sent = 0; 1943 flow->tid_qpn = be32_to_cpu(ohdr->u.tid_rdma.r_req.tid_flow_qp); 1944 flow->idx = (flow->tid_qpn >> TID_RDMA_DESTQP_FLOW_SHIFT) & 1945 TID_RDMA_DESTQP_FLOW_MASK; 1946 flow_psn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.r_req.tid_flow_psn)); 1947 flow->flow_state.generation = flow_psn >> HFI1_KDETH_BTH_SEQ_SHIFT; 1948 flow->flow_state.spsn = flow_psn & HFI1_KDETH_BTH_SEQ_MASK; 1949 flow->length = len; 1950 1951 flow->flow_state.lpsn = flow->flow_state.spsn + 1952 flow->npkts - 1; 1953 flow->flow_state.ib_spsn = psn; 1954 flow->flow_state.ib_lpsn = flow->flow_state.ib_spsn + flow->npkts - 1; 1955 1956 trace_hfi1_tid_flow_rcv_read_req(qp, req->setup_head, flow); 1957 /* Set the initial flow index to the current flow. */ 1958 req->flow_idx = req->setup_head; 1959 1960 /* advance circular buffer head */ 1961 req->setup_head = (req->setup_head + 1) & (MAX_FLOWS - 1); 1962 1963 /* 1964 * Compute last PSN for request. 1965 */ 1966 e->opcode = (bth0 >> 24) & 0xff; 1967 e->psn = psn; 1968 e->lpsn = psn + flow->npkts - 1; 1969 e->sent = 0; 1970 1971 req->n_flows = qpriv->tid_rdma.local.max_read; 1972 req->state = TID_REQUEST_ACTIVE; 1973 req->cur_seg = 0; 1974 req->comp_seg = 0; 1975 req->ack_seg = 0; 1976 req->isge = 0; 1977 req->seg_len = qpriv->tid_rdma.local.max_len; 1978 req->total_len = len; 1979 req->total_segs = 1; 1980 req->r_flow_psn = e->psn; 1981 1982 trace_hfi1_tid_req_rcv_read_req(qp, 0, e->opcode, e->psn, e->lpsn, 1983 req); 1984 return 0; 1985 } 1986 1987 static int tid_rdma_rcv_error(struct hfi1_packet *packet, 1988 struct ib_other_headers *ohdr, 1989 struct rvt_qp *qp, u32 psn, int diff) 1990 { 1991 struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); 1992 struct hfi1_ctxtdata *rcd = ((struct hfi1_qp_priv *)qp->priv)->rcd; 1993 struct hfi1_ibdev *dev = to_idev(qp->ibqp.device); 1994 struct hfi1_qp_priv *qpriv = qp->priv; 1995 struct rvt_ack_entry *e; 1996 struct tid_rdma_request *req; 1997 unsigned long flags; 1998 u8 prev; 1999 bool old_req; 2000 2001 trace_hfi1_rsp_tid_rcv_error(qp, psn); 2002 trace_hfi1_tid_rdma_rcv_err(qp, 0, psn, diff); 2003 if (diff > 0) { 2004 /* sequence error */ 2005 if (!qp->r_nak_state) { 2006 ibp->rvp.n_rc_seqnak++; 2007 qp->r_nak_state = IB_NAK_PSN_ERROR; 2008 qp->r_ack_psn = qp->r_psn; 2009 rc_defered_ack(rcd, qp); 2010 } 2011 goto done; 2012 } 2013 2014 ibp->rvp.n_rc_dupreq++; 2015 2016 spin_lock_irqsave(&qp->s_lock, flags); 2017 e = find_prev_entry(qp, psn, &prev, NULL, &old_req); 2018 if (!e || (e->opcode != TID_OP(READ_REQ) && 2019 e->opcode != TID_OP(WRITE_REQ))) 2020 goto unlock; 2021 2022 req = ack_to_tid_req(e); 2023 req->r_flow_psn = psn; 2024 trace_hfi1_tid_req_rcv_err(qp, 0, e->opcode, e->psn, e->lpsn, req); 2025 if (e->opcode == TID_OP(READ_REQ)) { 2026 struct ib_reth *reth; 2027 u32 offset; 2028 u32 len; 2029 u32 rkey; 2030 u64 vaddr; 2031 int ok; 2032 u32 bth0; 2033 2034 reth = &ohdr->u.tid_rdma.r_req.reth; 2035 /* 2036 * The requester always restarts from the start of the original 2037 * request. 
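 * A duplicate TID RDMA READ REQ must therefore carry the original starting PSN and the original total length; anything else is dropped by the checks below.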
2038 */ 2039 offset = delta_psn(psn, e->psn) * qp->pmtu; 2040 len = be32_to_cpu(reth->length); 2041 if (psn != e->psn || len != req->total_len) 2042 goto unlock; 2043 2044 release_rdma_sge_mr(e); 2045 2046 rkey = be32_to_cpu(reth->rkey); 2047 vaddr = get_ib_reth_vaddr(reth); 2048 2049 qp->r_len = len; 2050 ok = rvt_rkey_ok(qp, &e->rdma_sge, len, vaddr, rkey, 2051 IB_ACCESS_REMOTE_READ); 2052 if (unlikely(!ok)) 2053 goto unlock; 2054 2055 /* 2056 * If all the response packets for the current request have 2057 * been sent out and this request is complete (old_request 2058 * == false), the TID flow may be unusable (req->clear_tail 2059 * has been advanced). However, when an earlier 2060 * request is received, this request will no longer be 2061 * complete (qp->s_tail_ack_queue is moved back, see below). 2062 * Consequently, we need to update the TID flow info every time 2063 * a duplicate request is received. 2064 */ 2065 bth0 = be32_to_cpu(ohdr->bth[0]); 2066 if (tid_rdma_rcv_read_request(qp, e, packet, ohdr, bth0, psn, 2067 vaddr, len)) 2068 goto unlock; 2069 2070 /* 2071 * True if the request is already scheduled (between 2072 * qp->s_tail_ack_queue and qp->r_head_ack_queue). 2073 */ 2074 if (old_req) 2075 goto unlock; 2076 } else { 2077 struct flow_state *fstate; 2078 bool schedule = false; 2079 u8 i; 2080 2081 if (req->state == TID_REQUEST_RESEND) { 2082 req->state = TID_REQUEST_RESEND_ACTIVE; 2083 } else if (req->state == TID_REQUEST_INIT_RESEND) { 2084 req->state = TID_REQUEST_INIT; 2085 schedule = true; 2086 } 2087 2088 /* 2089 * True if the request is already scheduled (between 2090 * qp->s_tail_ack_queue and qp->r_head_ack_queue). 2091 * Also, don't change requests which are at the SYNC 2092 * point and haven't generated any responses yet. 2093 * There is nothing to retransmit for them yet. 2094 */ 2095 if (old_req || req->state == TID_REQUEST_INIT || 2096 (req->state == TID_REQUEST_SYNC && !req->cur_seg)) { 2097 for (i = prev + 1; ; i++) { 2098 if (i > rvt_size_atomic(&dev->rdi)) 2099 i = 0; 2100 if (i == qp->r_head_ack_queue) 2101 break; 2102 e = &qp->s_ack_queue[i]; 2103 req = ack_to_tid_req(e); 2104 if (e->opcode == TID_OP(WRITE_REQ) && 2105 req->state == TID_REQUEST_INIT) 2106 req->state = TID_REQUEST_INIT_RESEND; 2107 } 2108 /* 2109 * If the state of the request has been changed, 2110 * the first leg needs to get scheduled in order to 2111 * pick up the change. Otherwise, normal response 2112 * processing should take care of it. 2113 */ 2114 if (!schedule) 2115 goto unlock; 2116 } 2117 2118 /* 2119 * If there are no more allocated segments, just schedule the 2120 * qp without changing any state. 2121 */ 2122 if (req->clear_tail == req->setup_head) 2123 goto schedule; 2124 /* 2125 * If this request has sent responses for segments which have 2126 * not received data yet (flow_idx != clear_tail), the flow_idx 2127 * pointer needs to be adjusted so the same responses can be 2128 * re-sent. 2129 */ 2130 if (CIRC_CNT(req->flow_idx, req->clear_tail, MAX_FLOWS)) { 2131 fstate = &req->flows[req->clear_tail].flow_state; 2132 qpriv->pending_tid_w_segs -= 2133 CIRC_CNT(req->flow_idx, req->clear_tail, 2134 MAX_FLOWS); 2135 req->flow_idx = 2136 CIRC_ADD(req->clear_tail, 2137 delta_psn(psn, fstate->resp_ib_psn), 2138 MAX_FLOWS); 2139 qpriv->pending_tid_w_segs += 2140 delta_psn(psn, fstate->resp_ib_psn); 2141 /* 2142 * When flow_idx == setup_head, we've gotten a duplicate 2143 * request for a segment which has not been allocated 2144 * yet. In that case, don't adjust this request.
2145 * However, we still want to go through the loop below 2146 * to adjust all subsequent requests. 2147 */ 2148 if (CIRC_CNT(req->setup_head, req->flow_idx, 2149 MAX_FLOWS)) { 2150 req->cur_seg = delta_psn(psn, e->psn); 2151 req->state = TID_REQUEST_RESEND_ACTIVE; 2152 } 2153 } 2154 2155 for (i = prev + 1; ; i++) { 2156 /* 2157 * Look at everything up to and including 2158 * s_tail_ack_queue 2159 */ 2160 if (i > rvt_size_atomic(&dev->rdi)) 2161 i = 0; 2162 if (i == qp->r_head_ack_queue) 2163 break; 2164 e = &qp->s_ack_queue[i]; 2165 req = ack_to_tid_req(e); 2166 trace_hfi1_tid_req_rcv_err(qp, 0, e->opcode, e->psn, 2167 e->lpsn, req); 2168 if (e->opcode != TID_OP(WRITE_REQ) || 2169 req->cur_seg == req->comp_seg || 2170 req->state == TID_REQUEST_INIT || 2171 req->state == TID_REQUEST_INIT_RESEND) { 2172 if (req->state == TID_REQUEST_INIT) 2173 req->state = TID_REQUEST_INIT_RESEND; 2174 continue; 2175 } 2176 qpriv->pending_tid_w_segs -= 2177 CIRC_CNT(req->flow_idx, 2178 req->clear_tail, 2179 MAX_FLOWS); 2180 req->flow_idx = req->clear_tail; 2181 req->state = TID_REQUEST_RESEND; 2182 req->cur_seg = req->comp_seg; 2183 } 2184 qpriv->s_flags &= ~HFI1_R_TID_WAIT_INTERLCK; 2185 } 2186 /* Re-process old requests.*/ 2187 if (qp->s_acked_ack_queue == qp->s_tail_ack_queue) 2188 qp->s_acked_ack_queue = prev; 2189 qp->s_tail_ack_queue = prev; 2190 /* 2191 * Since the qp->s_tail_ack_queue is modified, the 2192 * qp->s_ack_state must be changed to re-initialize 2193 * qp->s_ack_rdma_sge; Otherwise, we will end up in 2194 * wrong memory region. 2195 */ 2196 qp->s_ack_state = OP(ACKNOWLEDGE); 2197 schedule: 2198 /* 2199 * It's possible to receive a retry psn that is earlier than an RNRNAK 2200 * psn. In this case, the rnrnak state should be cleared. 2201 */ 2202 if (qpriv->rnr_nak_state) { 2203 qp->s_nak_state = 0; 2204 qpriv->rnr_nak_state = TID_RNR_NAK_INIT; 2205 qp->r_psn = e->lpsn + 1; 2206 hfi1_tid_write_alloc_resources(qp, true); 2207 } 2208 2209 qp->r_state = e->opcode; 2210 qp->r_nak_state = 0; 2211 qp->s_flags |= RVT_S_RESP_PENDING; 2212 hfi1_schedule_send(qp); 2213 unlock: 2214 spin_unlock_irqrestore(&qp->s_lock, flags); 2215 done: 2216 return 1; 2217 } 2218 2219 void hfi1_rc_rcv_tid_rdma_read_req(struct hfi1_packet *packet) 2220 { 2221 /* HANDLER FOR TID RDMA READ REQUEST packet (Responder side)*/ 2222 2223 /* 2224 * 1. Verify TID RDMA READ REQ as per IB_OPCODE_RC_RDMA_READ 2225 * (see hfi1_rc_rcv()) 2226 * 2. Put TID RDMA READ REQ into the response queueu (s_ack_queue) 2227 * - Setup struct tid_rdma_req with request info 2228 * - Initialize struct tid_rdma_flow info; 2229 * - Copy TID entries; 2230 * 3. Set the qp->s_ack_state. 2231 * 4. Set RVT_S_RESP_PENDING in s_flags. 2232 * 5. 
Kick the send engine (hfi1_schedule_send()) 2233 */ 2234 struct hfi1_ctxtdata *rcd = packet->rcd; 2235 struct rvt_qp *qp = packet->qp; 2236 struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); 2237 struct ib_other_headers *ohdr = packet->ohdr; 2238 struct rvt_ack_entry *e; 2239 unsigned long flags; 2240 struct ib_reth *reth; 2241 struct hfi1_qp_priv *qpriv = qp->priv; 2242 u32 bth0, psn, len, rkey; 2243 bool fecn; 2244 u8 next; 2245 u64 vaddr; 2246 int diff; 2247 u8 nack_state = IB_NAK_INVALID_REQUEST; 2248 2249 bth0 = be32_to_cpu(ohdr->bth[0]); 2250 if (hfi1_ruc_check_hdr(ibp, packet)) 2251 return; 2252 2253 fecn = process_ecn(qp, packet); 2254 psn = mask_psn(be32_to_cpu(ohdr->bth[2])); 2255 trace_hfi1_rsp_rcv_tid_read_req(qp, psn); 2256 2257 if (qp->state == IB_QPS_RTR && !(qp->r_flags & RVT_R_COMM_EST)) 2258 rvt_comm_est(qp); 2259 2260 if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ))) 2261 goto nack_inv; 2262 2263 reth = &ohdr->u.tid_rdma.r_req.reth; 2264 vaddr = be64_to_cpu(reth->vaddr); 2265 len = be32_to_cpu(reth->length); 2266 /* The length needs to be in multiples of PAGE_SIZE */ 2267 if (!len || len & ~PAGE_MASK || len > qpriv->tid_rdma.local.max_len) 2268 goto nack_inv; 2269 2270 diff = delta_psn(psn, qp->r_psn); 2271 if (unlikely(diff)) { 2272 tid_rdma_rcv_err(packet, ohdr, qp, psn, diff, fecn); 2273 return; 2274 } 2275 2276 /* We've verified the request, insert it into the ack queue. */ 2277 next = qp->r_head_ack_queue + 1; 2278 if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device))) 2279 next = 0; 2280 spin_lock_irqsave(&qp->s_lock, flags); 2281 if (unlikely(next == qp->s_tail_ack_queue)) { 2282 if (!qp->s_ack_queue[next].sent) { 2283 nack_state = IB_NAK_REMOTE_OPERATIONAL_ERROR; 2284 goto nack_inv_unlock; 2285 } 2286 update_ack_queue(qp, next); 2287 } 2288 e = &qp->s_ack_queue[qp->r_head_ack_queue]; 2289 release_rdma_sge_mr(e); 2290 2291 rkey = be32_to_cpu(reth->rkey); 2292 qp->r_len = len; 2293 2294 if (unlikely(!rvt_rkey_ok(qp, &e->rdma_sge, qp->r_len, vaddr, 2295 rkey, IB_ACCESS_REMOTE_READ))) 2296 goto nack_acc; 2297 2298 /* Accept the request parameters */ 2299 if (tid_rdma_rcv_read_request(qp, e, packet, ohdr, bth0, psn, vaddr, 2300 len)) 2301 goto nack_inv_unlock; 2302 2303 qp->r_state = e->opcode; 2304 qp->r_nak_state = 0; 2305 /* 2306 * We need to increment the MSN here instead of when we 2307 * finish sending the result since a duplicate request would 2308 * increment it more than once. 2309 */ 2310 qp->r_msn++; 2311 qp->r_psn += e->lpsn - e->psn + 1; 2312 2313 qp->r_head_ack_queue = next; 2314 2315 /* 2316 * For all requests other than TID WRITE which are added to the ack 2317 * queue, qpriv->r_tid_alloc follows qp->r_head_ack_queue. It is ok to 2318 * do this because of interlocks between these and TID WRITE 2319 * requests. The same change has also been made in hfi1_rc_rcv(). 2320 */ 2321 qpriv->r_tid_alloc = qp->r_head_ack_queue; 2322 2323 /* Schedule the send tasklet. 
*/ 2324 qp->s_flags |= RVT_S_RESP_PENDING; 2325 if (fecn) 2326 qp->s_flags |= RVT_S_ECN; 2327 hfi1_schedule_send(qp); 2328 2329 spin_unlock_irqrestore(&qp->s_lock, flags); 2330 return; 2331 2332 nack_inv_unlock: 2333 spin_unlock_irqrestore(&qp->s_lock, flags); 2334 nack_inv: 2335 rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR); 2336 qp->r_nak_state = nack_state; 2337 qp->r_ack_psn = qp->r_psn; 2338 /* Queue NAK for later */ 2339 rc_defered_ack(rcd, qp); 2340 return; 2341 nack_acc: 2342 spin_unlock_irqrestore(&qp->s_lock, flags); 2343 rvt_rc_error(qp, IB_WC_LOC_PROT_ERR); 2344 qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR; 2345 qp->r_ack_psn = qp->r_psn; 2346 } 2347 2348 u32 hfi1_build_tid_rdma_read_resp(struct rvt_qp *qp, struct rvt_ack_entry *e, 2349 struct ib_other_headers *ohdr, u32 *bth0, 2350 u32 *bth1, u32 *bth2, u32 *len, bool *last) 2351 { 2352 struct hfi1_ack_priv *epriv = e->priv; 2353 struct tid_rdma_request *req = &epriv->tid_req; 2354 struct hfi1_qp_priv *qpriv = qp->priv; 2355 struct tid_rdma_flow *flow = &req->flows[req->clear_tail]; 2356 u32 tidentry = flow->tid_entry[flow->tid_idx]; 2357 u32 tidlen = EXP_TID_GET(tidentry, LEN) << PAGE_SHIFT; 2358 struct tid_rdma_read_resp *resp = &ohdr->u.tid_rdma.r_rsp; 2359 u32 next_offset, om = KDETH_OM_LARGE; 2360 bool last_pkt; 2361 u32 hdwords = 0; 2362 struct tid_rdma_params *remote; 2363 2364 *len = min_t(u32, qp->pmtu, tidlen - flow->tid_offset); 2365 flow->sent += *len; 2366 next_offset = flow->tid_offset + *len; 2367 last_pkt = (flow->sent >= flow->length); 2368 2369 trace_hfi1_tid_entry_build_read_resp(qp, flow->tid_idx, tidentry); 2370 trace_hfi1_tid_flow_build_read_resp(qp, req->clear_tail, flow); 2371 2372 rcu_read_lock(); 2373 remote = rcu_dereference(qpriv->tid_rdma.remote); 2374 if (!remote) { 2375 rcu_read_unlock(); 2376 goto done; 2377 } 2378 KDETH_RESET(resp->kdeth0, KVER, 0x1); 2379 KDETH_SET(resp->kdeth0, SH, !last_pkt); 2380 KDETH_SET(resp->kdeth0, INTR, !!(!last_pkt && remote->urg)); 2381 KDETH_SET(resp->kdeth0, TIDCTRL, EXP_TID_GET(tidentry, CTRL)); 2382 KDETH_SET(resp->kdeth0, TID, EXP_TID_GET(tidentry, IDX)); 2383 KDETH_SET(resp->kdeth0, OM, om == KDETH_OM_LARGE); 2384 KDETH_SET(resp->kdeth0, OFFSET, flow->tid_offset / om); 2385 KDETH_RESET(resp->kdeth1, JKEY, remote->jkey); 2386 resp->verbs_qp = cpu_to_be32(qp->remote_qpn); 2387 rcu_read_unlock(); 2388 2389 resp->aeth = rvt_compute_aeth(qp); 2390 resp->verbs_psn = cpu_to_be32(mask_psn(flow->flow_state.ib_spsn + 2391 flow->pkt)); 2392 2393 *bth0 = TID_OP(READ_RESP) << 24; 2394 *bth1 = flow->tid_qpn; 2395 *bth2 = mask_psn(((flow->flow_state.spsn + flow->pkt++) & 2396 HFI1_KDETH_BTH_SEQ_MASK) | 2397 (flow->flow_state.generation << 2398 HFI1_KDETH_BTH_SEQ_SHIFT)); 2399 *last = last_pkt; 2400 if (last_pkt) 2401 /* Advance to next flow */ 2402 req->clear_tail = (req->clear_tail + 1) & 2403 (MAX_FLOWS - 1); 2404 2405 if (next_offset >= tidlen) { 2406 flow->tid_offset = 0; 2407 flow->tid_idx++; 2408 } else { 2409 flow->tid_offset = next_offset; 2410 } 2411 2412 hdwords = sizeof(ohdr->u.tid_rdma.r_rsp) / sizeof(u32); 2413 2414 done: 2415 return hdwords; 2416 } 2417 2418 static inline struct tid_rdma_request * 2419 find_tid_request(struct rvt_qp *qp, u32 psn, enum ib_wr_opcode opcode) 2420 __must_hold(&qp->s_lock) 2421 { 2422 struct rvt_swqe *wqe; 2423 struct tid_rdma_request *req = NULL; 2424 u32 i, end; 2425 2426 end = qp->s_cur + 1; 2427 if (end == qp->s_size) 2428 end = 0; 2429 for (i = qp->s_acked; i != end;) { 2430 wqe = rvt_get_swqe_ptr(qp, i); 2431 if (cmp_psn(psn, 
wqe->psn) >= 0 && 2432 cmp_psn(psn, wqe->lpsn) <= 0) { 2433 if (wqe->wr.opcode == opcode) 2434 req = wqe_to_tid_req(wqe); 2435 break; 2436 } 2437 if (++i == qp->s_size) 2438 i = 0; 2439 } 2440 2441 return req; 2442 } 2443 2444 void hfi1_rc_rcv_tid_rdma_read_resp(struct hfi1_packet *packet) 2445 { 2446 /* HANDLER FOR TID RDMA READ RESPONSE packet (Requestor side */ 2447 2448 /* 2449 * 1. Find matching SWQE 2450 * 2. Check that the entire segment has been read. 2451 * 3. Remove HFI1_S_WAIT_TID_RESP from s_flags. 2452 * 4. Free the TID flow resources. 2453 * 5. Kick the send engine (hfi1_schedule_send()) 2454 */ 2455 struct ib_other_headers *ohdr = packet->ohdr; 2456 struct rvt_qp *qp = packet->qp; 2457 struct hfi1_qp_priv *priv = qp->priv; 2458 struct hfi1_ctxtdata *rcd = packet->rcd; 2459 struct tid_rdma_request *req; 2460 struct tid_rdma_flow *flow; 2461 u32 opcode, aeth; 2462 bool fecn; 2463 unsigned long flags; 2464 u32 kpsn, ipsn; 2465 2466 trace_hfi1_sender_rcv_tid_read_resp(qp); 2467 fecn = process_ecn(qp, packet); 2468 kpsn = mask_psn(be32_to_cpu(ohdr->bth[2])); 2469 aeth = be32_to_cpu(ohdr->u.tid_rdma.r_rsp.aeth); 2470 opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff; 2471 2472 spin_lock_irqsave(&qp->s_lock, flags); 2473 ipsn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.r_rsp.verbs_psn)); 2474 req = find_tid_request(qp, ipsn, IB_WR_TID_RDMA_READ); 2475 if (unlikely(!req)) 2476 goto ack_op_err; 2477 2478 flow = &req->flows[req->clear_tail]; 2479 /* When header suppression is disabled */ 2480 if (cmp_psn(ipsn, flow->flow_state.ib_lpsn)) { 2481 update_r_next_psn_fecn(packet, priv, rcd, flow, fecn); 2482 2483 if (cmp_psn(kpsn, flow->flow_state.r_next_psn)) 2484 goto ack_done; 2485 flow->flow_state.r_next_psn = mask_psn(kpsn + 1); 2486 /* 2487 * Copy the payload to destination buffer if this packet is 2488 * delivered as an eager packet due to RSM rule and FECN. 2489 * The RSM rule selects FECN bit in BTH and SH bit in 2490 * KDETH header and therefore will not match the last 2491 * packet of each segment that has SH bit cleared. 
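 * In that case the payload is copied into the destination SGE below and software PSN checking is turned on for the remaining packets of the segment.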
2492 */ 2493 if (fecn && packet->etype == RHF_RCV_TYPE_EAGER) { 2494 struct rvt_sge_state ss; 2495 u32 len; 2496 u32 tlen = packet->tlen; 2497 u16 hdrsize = packet->hlen; 2498 u8 pad = packet->pad; 2499 u8 extra_bytes = pad + packet->extra_byte + 2500 (SIZE_OF_CRC << 2); 2501 u32 pmtu = qp->pmtu; 2502 2503 if (unlikely(tlen != (hdrsize + pmtu + extra_bytes))) 2504 goto ack_op_err; 2505 len = restart_sge(&ss, req->e.swqe, ipsn, pmtu); 2506 if (unlikely(len < pmtu)) 2507 goto ack_op_err; 2508 rvt_copy_sge(qp, &ss, packet->payload, pmtu, false, 2509 false); 2510 /* Raise the sw sequence check flag for next packet */ 2511 priv->s_flags |= HFI1_R_TID_SW_PSN; 2512 } 2513 2514 goto ack_done; 2515 } 2516 flow->flow_state.r_next_psn = mask_psn(kpsn + 1); 2517 req->ack_pending--; 2518 priv->pending_tid_r_segs--; 2519 qp->s_num_rd_atomic--; 2520 if ((qp->s_flags & RVT_S_WAIT_FENCE) && 2521 !qp->s_num_rd_atomic) { 2522 qp->s_flags &= ~(RVT_S_WAIT_FENCE | 2523 RVT_S_WAIT_ACK); 2524 hfi1_schedule_send(qp); 2525 } 2526 if (qp->s_flags & RVT_S_WAIT_RDMAR) { 2527 qp->s_flags &= ~(RVT_S_WAIT_RDMAR | RVT_S_WAIT_ACK); 2528 hfi1_schedule_send(qp); 2529 } 2530 2531 trace_hfi1_ack(qp, ipsn); 2532 trace_hfi1_tid_req_rcv_read_resp(qp, 0, req->e.swqe->wr.opcode, 2533 req->e.swqe->psn, req->e.swqe->lpsn, 2534 req); 2535 trace_hfi1_tid_flow_rcv_read_resp(qp, req->clear_tail, flow); 2536 2537 /* Release the tid resources */ 2538 hfi1_kern_exp_rcv_clear(req); 2539 2540 if (!do_rc_ack(qp, aeth, ipsn, opcode, 0, rcd)) 2541 goto ack_done; 2542 2543 /* If not done yet, build next read request */ 2544 if (++req->comp_seg >= req->total_segs) { 2545 priv->tid_r_comp++; 2546 req->state = TID_REQUEST_COMPLETE; 2547 } 2548 2549 /* 2550 * Clear the hw flow under two conditions: 2551 * 1. This request is a sync point and it is complete; 2552 * 2. Current request is completed and there are no more requests. 2553 */ 2554 if ((req->state == TID_REQUEST_SYNC && 2555 req->comp_seg == req->cur_seg) || 2556 priv->tid_r_comp == priv->tid_r_reqs) { 2557 hfi1_kern_clear_hw_flow(priv->rcd, qp); 2558 priv->s_flags &= ~HFI1_R_TID_SW_PSN; 2559 if (req->state == TID_REQUEST_SYNC) 2560 req->state = TID_REQUEST_ACTIVE; 2561 } 2562 2563 hfi1_schedule_send(qp); 2564 goto ack_done; 2565 2566 ack_op_err: 2567 /* 2568 * The test indicates that the send engine has finished its cleanup 2569 * after sending the request and it's now safe to put the QP into error 2570 * state. However, if the wqe queue is empty (qp->s_acked == qp->s_tail 2571 * == qp->s_head), it would be unsafe to complete the wqe pointed by 2572 * qp->s_acked here. Putting the qp into error state will safely flush 2573 * all remaining requests. 
2574 */ 2575 if (qp->s_last == qp->s_acked) 2576 rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR); 2577 2578 ack_done: 2579 spin_unlock_irqrestore(&qp->s_lock, flags); 2580 } 2581 2582 void hfi1_kern_read_tid_flow_free(struct rvt_qp *qp) 2583 __must_hold(&qp->s_lock) 2584 { 2585 u32 n = qp->s_acked; 2586 struct rvt_swqe *wqe; 2587 struct tid_rdma_request *req; 2588 struct hfi1_qp_priv *priv = qp->priv; 2589 2590 lockdep_assert_held(&qp->s_lock); 2591 /* Free any TID entries */ 2592 while (n != qp->s_tail) { 2593 wqe = rvt_get_swqe_ptr(qp, n); 2594 if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) { 2595 req = wqe_to_tid_req(wqe); 2596 hfi1_kern_exp_rcv_clear_all(req); 2597 } 2598 2599 if (++n == qp->s_size) 2600 n = 0; 2601 } 2602 /* Free flow */ 2603 hfi1_kern_clear_hw_flow(priv->rcd, qp); 2604 } 2605 2606 static bool tid_rdma_tid_err(struct hfi1_ctxtdata *rcd, 2607 struct hfi1_packet *packet, u8 rcv_type, 2608 u8 opcode) 2609 { 2610 struct rvt_qp *qp = packet->qp; 2611 struct hfi1_qp_priv *qpriv = qp->priv; 2612 u32 ipsn; 2613 struct ib_other_headers *ohdr = packet->ohdr; 2614 struct rvt_ack_entry *e; 2615 struct tid_rdma_request *req; 2616 struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device); 2617 u32 i; 2618 2619 if (rcv_type >= RHF_RCV_TYPE_IB) 2620 goto done; 2621 2622 spin_lock(&qp->s_lock); 2623 2624 /* 2625 * We've run out of space in the eager buffer. 2626 * Eagerly received KDETH packets which require space in the 2627 * eager buffer (packets that have a payload) are TID RDMA WRITE 2628 * response packets. In this case, we have to re-transmit the 2629 * TID RDMA WRITE request. 2630 */ 2631 if (rcv_type == RHF_RCV_TYPE_EAGER) { 2632 hfi1_restart_rc(qp, qp->s_last_psn + 1, 1); 2633 hfi1_schedule_send(qp); 2634 goto done_unlock; 2635 } 2636 2637 /* 2638 * For TID READ response, error out QP after freeing the tid 2639 * resources. 2640 */ 2641 if (opcode == TID_OP(READ_RESP)) { 2642 ipsn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.r_rsp.verbs_psn)); 2643 if (cmp_psn(ipsn, qp->s_last_psn) > 0 && 2644 cmp_psn(ipsn, qp->s_psn) < 0) { 2645 hfi1_kern_read_tid_flow_free(qp); 2646 spin_unlock(&qp->s_lock); 2647 rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR); 2648 goto done; 2649 } 2650 goto done_unlock; 2651 } 2652 2653 /* 2654 * Error out the qp for TID RDMA WRITE 2655 */ 2656 hfi1_kern_clear_hw_flow(qpriv->rcd, qp); 2657 for (i = 0; i < rvt_max_atomic(rdi); i++) { 2658 e = &qp->s_ack_queue[i]; 2659 if (e->opcode == TID_OP(WRITE_REQ)) { 2660 req = ack_to_tid_req(e); 2661 hfi1_kern_exp_rcv_clear_all(req); 2662 } 2663 } 2664 spin_unlock(&qp->s_lock); 2665 rvt_rc_error(qp, IB_WC_LOC_LEN_ERR); 2666 goto done; 2667 2668 done_unlock: 2669 spin_unlock(&qp->s_lock); 2670 done: 2671 return true; 2672 } 2673 2674 static void restart_tid_rdma_read_req(struct hfi1_ctxtdata *rcd, 2675 struct rvt_qp *qp, struct rvt_swqe *wqe) 2676 { 2677 struct tid_rdma_request *req; 2678 struct tid_rdma_flow *flow; 2679 2680 /* Start from the right segment */ 2681 qp->r_flags |= RVT_R_RDMAR_SEQ; 2682 req = wqe_to_tid_req(wqe); 2683 flow = &req->flows[req->clear_tail]; 2684 hfi1_restart_rc(qp, flow->flow_state.ib_spsn, 0); 2685 if (list_empty(&qp->rspwait)) { 2686 qp->r_flags |= RVT_R_RSP_SEND; 2687 rvt_get_qp(qp); 2688 list_add_tail(&qp->rspwait, &rcd->qp_wait_list); 2689 } 2690 } 2691 2692 /* 2693 * Handle the KDETH eflags for TID RDMA READ response. 2694 * 2695 * Return false if the last packet for a segment has been received and it is 2696 * time to process the response normally; otherwise, return true.
2697 * 2698 * The caller must hold the packet->qp->r_lock and the rcu_read_lock. 2699 */ 2700 static bool handle_read_kdeth_eflags(struct hfi1_ctxtdata *rcd, 2701 struct hfi1_packet *packet, u8 rcv_type, 2702 u8 rte, u32 psn, u32 ibpsn) 2703 __must_hold(&packet->qp->r_lock) __must_hold(RCU) 2704 { 2705 struct hfi1_pportdata *ppd = rcd->ppd; 2706 struct hfi1_devdata *dd = ppd->dd; 2707 struct hfi1_ibport *ibp; 2708 struct rvt_swqe *wqe; 2709 struct tid_rdma_request *req; 2710 struct tid_rdma_flow *flow; 2711 u32 ack_psn; 2712 struct rvt_qp *qp = packet->qp; 2713 struct hfi1_qp_priv *priv = qp->priv; 2714 bool ret = true; 2715 int diff = 0; 2716 u32 fpsn; 2717 2718 lockdep_assert_held(&qp->r_lock); 2719 /* If the psn is out of valid range, drop the packet */ 2720 if (cmp_psn(ibpsn, qp->s_last_psn) < 0 || 2721 cmp_psn(ibpsn, qp->s_psn) > 0) 2722 return ret; 2723 2724 spin_lock(&qp->s_lock); 2725 /* 2726 * Note that NAKs implicitly ACK outstanding SEND and RDMA write 2727 * requests and implicitly NAK RDMA read and atomic requests issued 2728 * before the NAK'ed request. 2729 */ 2730 ack_psn = ibpsn - 1; 2731 wqe = rvt_get_swqe_ptr(qp, qp->s_acked); 2732 ibp = to_iport(qp->ibqp.device, qp->port_num); 2733 2734 /* Complete WQEs that the PSN finishes. */ 2735 while ((int)delta_psn(ack_psn, wqe->lpsn) >= 0) { 2736 /* 2737 * If this request is a RDMA read or atomic, and the NACK is 2738 * for a later operation, this NACK NAKs the RDMA read or 2739 * atomic. 2740 */ 2741 if (wqe->wr.opcode == IB_WR_RDMA_READ || 2742 wqe->wr.opcode == IB_WR_TID_RDMA_READ || 2743 wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP || 2744 wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) { 2745 /* Retry this request. */ 2746 if (!(qp->r_flags & RVT_R_RDMAR_SEQ)) { 2747 qp->r_flags |= RVT_R_RDMAR_SEQ; 2748 if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) { 2749 restart_tid_rdma_read_req(rcd, qp, 2750 wqe); 2751 } else { 2752 hfi1_restart_rc(qp, qp->s_last_psn + 1, 2753 0); 2754 if (list_empty(&qp->rspwait)) { 2755 qp->r_flags |= RVT_R_RSP_SEND; 2756 rvt_get_qp(qp); 2757 list_add_tail(/* wait */ 2758 &qp->rspwait, 2759 &rcd->qp_wait_list); 2760 } 2761 } 2762 } 2763 /* 2764 * No need to process the NAK since we are 2765 * restarting an earlier request. 2766 */ 2767 break; 2768 } 2769 2770 wqe = do_rc_completion(qp, wqe, ibp); 2771 if (qp->s_acked == qp->s_tail) 2772 break; 2773 } 2774 2775 /* Handle the eflags for the request */ 2776 if (wqe->wr.opcode != IB_WR_TID_RDMA_READ) 2777 goto s_unlock; 2778 2779 req = wqe_to_tid_req(wqe); 2780 switch (rcv_type) { 2781 case RHF_RCV_TYPE_EXPECTED: 2782 switch (rte) { 2783 case RHF_RTE_EXPECTED_FLOW_SEQ_ERR: 2784 /* 2785 * On the first occurrence of a Flow Sequence error, 2786 * the flag TID_FLOW_SW_PSN is set. 2787 * 2788 * After that, the flow is *not* reprogrammed and the 2789 * protocol falls back to SW PSN checking. This is done 2790 * to prevent continuous Flow Sequence errors for any 2791 * packets that could be still in the fabric. 2792 */ 2793 flow = find_flow(req, psn, NULL); 2794 if (!flow) { 2795 /* 2796 * We can't find the IB PSN matching the 2797 * received KDETH PSN. The only thing we can 2798 * do at this point is report the error to 2799 * the QP. 
2800 */ 2801 hfi1_kern_read_tid_flow_free(qp); 2802 spin_unlock(&qp->s_lock); 2803 rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR); 2804 return ret; 2805 } 2806 if (priv->s_flags & HFI1_R_TID_SW_PSN) { 2807 diff = cmp_psn(psn, 2808 flow->flow_state.r_next_psn); 2809 if (diff > 0) { 2810 if (!(qp->r_flags & RVT_R_RDMAR_SEQ)) 2811 restart_tid_rdma_read_req(rcd, 2812 qp, 2813 wqe); 2814 2815 /* Drop the packet.*/ 2816 goto s_unlock; 2817 } else if (diff < 0) { 2818 /* 2819 * If a response packet for a restarted 2820 * request has come back, reset the 2821 * restart flag. 2822 */ 2823 if (qp->r_flags & RVT_R_RDMAR_SEQ) 2824 qp->r_flags &= 2825 ~RVT_R_RDMAR_SEQ; 2826 2827 /* Drop the packet.*/ 2828 goto s_unlock; 2829 } 2830 2831 /* 2832 * If SW PSN verification is successful and 2833 * this is the last packet in the segment, tell 2834 * the caller to process it as a normal packet. 2835 */ 2836 fpsn = full_flow_psn(flow, 2837 flow->flow_state.lpsn); 2838 if (cmp_psn(fpsn, psn) == 0) { 2839 ret = false; 2840 if (qp->r_flags & RVT_R_RDMAR_SEQ) 2841 qp->r_flags &= 2842 ~RVT_R_RDMAR_SEQ; 2843 } 2844 flow->flow_state.r_next_psn = 2845 mask_psn(psn + 1); 2846 } else { 2847 u32 last_psn; 2848 2849 last_psn = read_r_next_psn(dd, rcd->ctxt, 2850 flow->idx); 2851 flow->flow_state.r_next_psn = last_psn; 2852 priv->s_flags |= HFI1_R_TID_SW_PSN; 2853 /* 2854 * If no request has been restarted yet, 2855 * restart the current one. 2856 */ 2857 if (!(qp->r_flags & RVT_R_RDMAR_SEQ)) 2858 restart_tid_rdma_read_req(rcd, qp, 2859 wqe); 2860 } 2861 2862 break; 2863 2864 case RHF_RTE_EXPECTED_FLOW_GEN_ERR: 2865 /* 2866 * Since the TID flow is able to ride through 2867 * generation mismatch, drop this stale packet. 2868 */ 2869 break; 2870 2871 default: 2872 break; 2873 } 2874 break; 2875 2876 case RHF_RCV_TYPE_ERROR: 2877 switch (rte) { 2878 case RHF_RTE_ERROR_OP_CODE_ERR: 2879 case RHF_RTE_ERROR_KHDR_MIN_LEN_ERR: 2880 case RHF_RTE_ERROR_KHDR_HCRC_ERR: 2881 case RHF_RTE_ERROR_KHDR_KVER_ERR: 2882 case RHF_RTE_ERROR_CONTEXT_ERR: 2883 case RHF_RTE_ERROR_KHDR_TID_ERR: 2884 default: 2885 break; 2886 } 2887 default: 2888 break; 2889 } 2890 s_unlock: 2891 spin_unlock(&qp->s_lock); 2892 return ret; 2893 } 2894 2895 bool hfi1_handle_kdeth_eflags(struct hfi1_ctxtdata *rcd, 2896 struct hfi1_pportdata *ppd, 2897 struct hfi1_packet *packet) 2898 { 2899 struct hfi1_ibport *ibp = &ppd->ibport_data; 2900 struct hfi1_devdata *dd = ppd->dd; 2901 struct rvt_dev_info *rdi = &dd->verbs_dev.rdi; 2902 u8 rcv_type = rhf_rcv_type(packet->rhf); 2903 u8 rte = rhf_rcv_type_err(packet->rhf); 2904 struct ib_header *hdr = packet->hdr; 2905 struct ib_other_headers *ohdr = NULL; 2906 int lnh = be16_to_cpu(hdr->lrh[0]) & 3; 2907 u16 lid = be16_to_cpu(hdr->lrh[1]); 2908 u8 opcode; 2909 u32 qp_num, psn, ibpsn; 2910 struct rvt_qp *qp; 2911 struct hfi1_qp_priv *qpriv; 2912 unsigned long flags; 2913 bool ret = true; 2914 struct rvt_ack_entry *e; 2915 struct tid_rdma_request *req; 2916 struct tid_rdma_flow *flow; 2917 int diff = 0; 2918 2919 trace_hfi1_msg_handle_kdeth_eflags(NULL, "Kdeth error: rhf ", 2920 packet->rhf); 2921 if (packet->rhf & RHF_ICRC_ERR) 2922 return ret; 2923 2924 packet->ohdr = &hdr->u.oth; 2925 ohdr = packet->ohdr; 2926 trace_input_ibhdr(rcd->dd, packet, !!(rhf_dc_info(packet->rhf))); 2927 2928 /* Get the destination QP number. 
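 * For TID RDMA packets the true verbs QP number is carried in the KDETH verbs_qp field; bth[1] holds the KDETH QP and flow index instead.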
*/ 2929 qp_num = be32_to_cpu(ohdr->u.tid_rdma.r_rsp.verbs_qp) & 2930 RVT_QPN_MASK; 2931 if (lid >= be16_to_cpu(IB_MULTICAST_LID_BASE)) 2932 goto drop; 2933 2934 psn = mask_psn(be32_to_cpu(ohdr->bth[2])); 2935 opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff; 2936 2937 rcu_read_lock(); 2938 qp = rvt_lookup_qpn(rdi, &ibp->rvp, qp_num); 2939 if (!qp) 2940 goto rcu_unlock; 2941 2942 packet->qp = qp; 2943 2944 /* Check for valid receive state. */ 2945 spin_lock_irqsave(&qp->r_lock, flags); 2946 if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) { 2947 ibp->rvp.n_pkt_drops++; 2948 goto r_unlock; 2949 } 2950 2951 if (packet->rhf & RHF_TID_ERR) { 2952 /* For TIDERR and RC QPs preemptively schedule a NAK */ 2953 u32 tlen = rhf_pkt_len(packet->rhf); /* in bytes */ 2954 2955 /* Sanity check packet */ 2956 if (tlen < 24) 2957 goto r_unlock; 2958 2959 /* 2960 * Check for GRH. We should never get packets with GRH in this 2961 * path. 2962 */ 2963 if (lnh == HFI1_LRH_GRH) 2964 goto r_unlock; 2965 2966 if (tid_rdma_tid_err(rcd, packet, rcv_type, opcode)) 2967 goto r_unlock; 2968 } 2969 2970 /* handle TID RDMA READ */ 2971 if (opcode == TID_OP(READ_RESP)) { 2972 ibpsn = be32_to_cpu(ohdr->u.tid_rdma.r_rsp.verbs_psn); 2973 ibpsn = mask_psn(ibpsn); 2974 ret = handle_read_kdeth_eflags(rcd, packet, rcv_type, rte, psn, 2975 ibpsn); 2976 goto r_unlock; 2977 } 2978 2979 /* 2980 * qp->s_tail_ack_queue points to the rvt_ack_entry currently being 2981 * processed. These are completed sequentially so we can be sure that 2982 * the pointer will not change until the entire request has completed. 2983 */ 2984 spin_lock(&qp->s_lock); 2985 qpriv = qp->priv; 2986 e = &qp->s_ack_queue[qpriv->r_tid_tail]; 2987 req = ack_to_tid_req(e); 2988 flow = &req->flows[req->clear_tail]; 2989 trace_hfi1_eflags_err_write(qp, rcv_type, rte, psn); 2990 trace_hfi1_rsp_handle_kdeth_eflags(qp, psn); 2991 trace_hfi1_tid_write_rsp_handle_kdeth_eflags(qp); 2992 trace_hfi1_tid_req_handle_kdeth_eflags(qp, 0, e->opcode, e->psn, 2993 e->lpsn, req); 2994 trace_hfi1_tid_flow_handle_kdeth_eflags(qp, req->clear_tail, flow); 2995 2996 switch (rcv_type) { 2997 case RHF_RCV_TYPE_EXPECTED: 2998 switch (rte) { 2999 case RHF_RTE_EXPECTED_FLOW_SEQ_ERR: 3000 if (!(qpriv->s_flags & HFI1_R_TID_SW_PSN)) { 3001 qpriv->s_flags |= HFI1_R_TID_SW_PSN; 3002 flow->flow_state.r_next_psn = 3003 read_r_next_psn(dd, rcd->ctxt, 3004 flow->idx); 3005 qpriv->r_next_psn_kdeth = 3006 flow->flow_state.r_next_psn; 3007 goto nak_psn; 3008 } else { 3009 /* 3010 * If the received PSN does not match the next 3011 * expected PSN, NAK the packet. 3012 * However, only do that if we know that a 3013 * NAK has already been sent. Otherwise, this 3014 * mismatch could be due to packets that were 3015 * already in flight. 3016 */ 3017 diff = cmp_psn(psn, 3018 flow->flow_state.r_next_psn); 3019 if (diff > 0) 3020 goto nak_psn; 3021 else if (diff < 0) 3022 break; 3023 3024 qpriv->s_nak_state = 0; 3025 /* 3026 * If SW PSN verification is successful and this 3027 * is the last packet in the segment, tell the 3028 * caller to process it as a normal packet.
3029 */ 3030 if (psn == full_flow_psn(flow, 3031 flow->flow_state.lpsn)) 3032 ret = false; 3033 flow->flow_state.r_next_psn = 3034 mask_psn(psn + 1); 3035 qpriv->r_next_psn_kdeth = 3036 flow->flow_state.r_next_psn; 3037 } 3038 break; 3039 3040 case RHF_RTE_EXPECTED_FLOW_GEN_ERR: 3041 goto nak_psn; 3042 3043 default: 3044 break; 3045 } 3046 break; 3047 3048 case RHF_RCV_TYPE_ERROR: 3049 switch (rte) { 3050 case RHF_RTE_ERROR_OP_CODE_ERR: 3051 case RHF_RTE_ERROR_KHDR_MIN_LEN_ERR: 3052 case RHF_RTE_ERROR_KHDR_HCRC_ERR: 3053 case RHF_RTE_ERROR_KHDR_KVER_ERR: 3054 case RHF_RTE_ERROR_CONTEXT_ERR: 3055 case RHF_RTE_ERROR_KHDR_TID_ERR: 3056 default: 3057 break; 3058 } 3059 default: 3060 break; 3061 } 3062 3063 unlock: 3064 spin_unlock(&qp->s_lock); 3065 r_unlock: 3066 spin_unlock_irqrestore(&qp->r_lock, flags); 3067 rcu_unlock: 3068 rcu_read_unlock(); 3069 drop: 3070 return ret; 3071 nak_psn: 3072 ibp->rvp.n_rc_seqnak++; 3073 if (!qpriv->s_nak_state) { 3074 qpriv->s_nak_state = IB_NAK_PSN_ERROR; 3075 /* We are NAK'ing the next expected PSN */ 3076 qpriv->s_nak_psn = mask_psn(flow->flow_state.r_next_psn); 3077 qpriv->s_flags |= RVT_S_ACK_PENDING; 3078 if (qpriv->r_tid_ack == HFI1_QP_WQE_INVALID) 3079 qpriv->r_tid_ack = qpriv->r_tid_tail; 3080 hfi1_schedule_tid_send(qp); 3081 } 3082 goto unlock; 3083 } 3084 3085 /* 3086 * "Rewind" the TID request information. 3087 * This means that we reset the state back to ACTIVE, 3088 * find the proper flow, set the flow index to that flow, 3089 * and reset the flow information. 3090 */ 3091 void hfi1_tid_rdma_restart_req(struct rvt_qp *qp, struct rvt_swqe *wqe, 3092 u32 *bth2) 3093 { 3094 struct tid_rdma_request *req = wqe_to_tid_req(wqe); 3095 struct tid_rdma_flow *flow; 3096 struct hfi1_qp_priv *qpriv = qp->priv; 3097 int diff, delta_pkts; 3098 u32 tididx = 0, i; 3099 u16 fidx; 3100 3101 if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) { 3102 *bth2 = mask_psn(qp->s_psn); 3103 flow = find_flow_ib(req, *bth2, &fidx); 3104 if (!flow) { 3105 trace_hfi1_msg_tid_restart_req(/* msg */ 3106 qp, "!!!!!! Could not find flow to restart: bth2 ", 3107 (u64)*bth2); 3108 trace_hfi1_tid_req_restart_req(qp, 0, wqe->wr.opcode, 3109 wqe->psn, wqe->lpsn, 3110 req); 3111 return; 3112 } 3113 } else { 3114 fidx = req->acked_tail; 3115 flow = &req->flows[fidx]; 3116 *bth2 = mask_psn(req->r_ack_psn); 3117 } 3118 3119 if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) 3120 delta_pkts = delta_psn(*bth2, flow->flow_state.ib_spsn); 3121 else 3122 delta_pkts = delta_psn(*bth2, 3123 full_flow_psn(flow, 3124 flow->flow_state.spsn)); 3125 3126 trace_hfi1_tid_flow_restart_req(qp, fidx, flow); 3127 diff = delta_pkts + flow->resync_npkts; 3128 3129 flow->sent = 0; 3130 flow->pkt = 0; 3131 flow->tid_idx = 0; 3132 flow->tid_offset = 0; 3133 if (diff) { 3134 for (tididx = 0; tididx < flow->tidcnt; tididx++) { 3135 u32 tidentry = flow->tid_entry[tididx], tidlen, 3136 tidnpkts, npkts; 3137 3138 flow->tid_offset = 0; 3139 tidlen = EXP_TID_GET(tidentry, LEN) * PAGE_SIZE; 3140 tidnpkts = rvt_div_round_up_mtu(qp, tidlen); 3141 npkts = min_t(u32, diff, tidnpkts); 3142 flow->pkt += npkts; 3143 flow->sent += (npkts == tidnpkts ? tidlen : 3144 npkts * qp->pmtu); 3145 flow->tid_offset += npkts * qp->pmtu; 3146 diff -= npkts; 3147 if (!diff) 3148 break; 3149 } 3150 } 3151 if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE) { 3152 rvt_skip_sge(&qpriv->tid_ss, (req->cur_seg * req->seg_len) + 3153 flow->sent, 0); 3154 /* 3155 * Packet PSN is based on flow_state.spsn + flow->pkt. 
However, 3156 * during a RESYNC, the generation is incremented and the 3157 * sequence is reset to 0. Since we've adjusted the npkts in the 3158 * flow and the SGE has been sufficiently advanced, we have to 3159 * adjust flow->pkt in order to calculate the correct PSN. 3160 */ 3161 flow->pkt -= flow->resync_npkts; 3162 } 3163 3164 if (flow->tid_offset == 3165 EXP_TID_GET(flow->tid_entry[tididx], LEN) * PAGE_SIZE) { 3166 tididx++; 3167 flow->tid_offset = 0; 3168 } 3169 flow->tid_idx = tididx; 3170 if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) 3171 /* Move flow_idx to correct index */ 3172 req->flow_idx = fidx; 3173 else 3174 req->clear_tail = fidx; 3175 3176 trace_hfi1_tid_flow_restart_req(qp, fidx, flow); 3177 trace_hfi1_tid_req_restart_req(qp, 0, wqe->wr.opcode, wqe->psn, 3178 wqe->lpsn, req); 3179 req->state = TID_REQUEST_ACTIVE; 3180 if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE) { 3181 /* Reset all the flows that we are going to resend */ 3182 fidx = CIRC_NEXT(fidx, MAX_FLOWS); 3183 i = qpriv->s_tid_tail; 3184 do { 3185 for (; CIRC_CNT(req->setup_head, fidx, MAX_FLOWS); 3186 fidx = CIRC_NEXT(fidx, MAX_FLOWS)) { 3187 req->flows[fidx].sent = 0; 3188 req->flows[fidx].pkt = 0; 3189 req->flows[fidx].tid_idx = 0; 3190 req->flows[fidx].tid_offset = 0; 3191 req->flows[fidx].resync_npkts = 0; 3192 } 3193 if (i == qpriv->s_tid_cur) 3194 break; 3195 do { 3196 i = (++i == qp->s_size ? 0 : i); 3197 wqe = rvt_get_swqe_ptr(qp, i); 3198 } while (wqe->wr.opcode != IB_WR_TID_RDMA_WRITE); 3199 req = wqe_to_tid_req(wqe); 3200 req->cur_seg = req->ack_seg; 3201 fidx = req->acked_tail; 3202 /* Pull req->clear_tail back */ 3203 req->clear_tail = fidx; 3204 } while (1); 3205 } 3206 } 3207 3208 void hfi1_qp_kern_exp_rcv_clear_all(struct rvt_qp *qp) 3209 { 3210 int i, ret; 3211 struct hfi1_qp_priv *qpriv = qp->priv; 3212 struct tid_flow_state *fs; 3213 3214 if (qp->ibqp.qp_type != IB_QPT_RC || !HFI1_CAP_IS_KSET(TID_RDMA)) 3215 return; 3216 3217 /* 3218 * First, clear the flow to help prevent any delayed packets from 3219 * being delivered. 3220 */ 3221 fs = &qpriv->flow_state; 3222 if (fs->index != RXE_NUM_TID_FLOWS) 3223 hfi1_kern_clear_hw_flow(qpriv->rcd, qp); 3224 3225 for (i = qp->s_acked; i != qp->s_head;) { 3226 struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, i); 3227 3228 if (++i == qp->s_size) 3229 i = 0; 3230 /* Free only locally allocated TID entries */ 3231 if (wqe->wr.opcode != IB_WR_TID_RDMA_READ) 3232 continue; 3233 do { 3234 struct hfi1_swqe_priv *priv = wqe->priv; 3235 3236 ret = hfi1_kern_exp_rcv_clear(&priv->tid_req); 3237 } while (!ret); 3238 } 3239 for (i = qp->s_acked_ack_queue; i != qp->r_head_ack_queue;) { 3240 struct rvt_ack_entry *e = &qp->s_ack_queue[i]; 3241 3242 if (++i == rvt_max_atomic(ib_to_rvt(qp->ibqp.device))) 3243 i = 0; 3244 /* Free only locally allocated TID entries */ 3245 if (e->opcode != TID_OP(WRITE_REQ)) 3246 continue; 3247 do { 3248 struct hfi1_ack_priv *priv = e->priv; 3249 3250 ret = hfi1_kern_exp_rcv_clear(&priv->tid_req); 3251 } while (!ret); 3252 } 3253 } 3254 3255 bool hfi1_tid_rdma_wqe_interlock(struct rvt_qp *qp, struct rvt_swqe *wqe) 3256 { 3257 struct rvt_swqe *prev; 3258 struct hfi1_qp_priv *priv = qp->priv; 3259 u32 s_prev; 3260 struct tid_rdma_request *req; 3261 3262 s_prev = (qp->s_cur == 0 ? 
qp->s_size : qp->s_cur) - 1; 3263 prev = rvt_get_swqe_ptr(qp, s_prev); 3264 3265 switch (wqe->wr.opcode) { 3266 case IB_WR_SEND: 3267 case IB_WR_SEND_WITH_IMM: 3268 case IB_WR_SEND_WITH_INV: 3269 case IB_WR_ATOMIC_CMP_AND_SWP: 3270 case IB_WR_ATOMIC_FETCH_AND_ADD: 3271 case IB_WR_RDMA_WRITE: 3272 switch (prev->wr.opcode) { 3273 case IB_WR_TID_RDMA_WRITE: 3274 req = wqe_to_tid_req(prev); 3275 if (req->ack_seg != req->total_segs) 3276 goto interlock; 3277 default: 3278 break; 3279 } 3280 break; 3281 case IB_WR_RDMA_READ: 3282 if (prev->wr.opcode != IB_WR_TID_RDMA_WRITE) 3283 break; 3284 /* fall through */ 3285 case IB_WR_TID_RDMA_READ: 3286 switch (prev->wr.opcode) { 3287 case IB_WR_RDMA_READ: 3288 if (qp->s_acked != qp->s_cur) 3289 goto interlock; 3290 break; 3291 case IB_WR_TID_RDMA_WRITE: 3292 req = wqe_to_tid_req(prev); 3293 if (req->ack_seg != req->total_segs) 3294 goto interlock; 3295 default: 3296 break; 3297 } 3298 default: 3299 break; 3300 } 3301 return false; 3302 3303 interlock: 3304 priv->s_flags |= HFI1_S_TID_WAIT_INTERLCK; 3305 return true; 3306 } 3307 3308 /* Does @sge meet the alignment requirements for tid rdma? */ 3309 static inline bool hfi1_check_sge_align(struct rvt_qp *qp, 3310 struct rvt_sge *sge, int num_sge) 3311 { 3312 int i; 3313 3314 for (i = 0; i < num_sge; i++, sge++) { 3315 trace_hfi1_sge_check_align(qp, i, sge); 3316 if ((u64)sge->vaddr & ~PAGE_MASK || 3317 sge->sge_length & ~PAGE_MASK) 3318 return false; 3319 } 3320 return true; 3321 } 3322 3323 void setup_tid_rdma_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe) 3324 { 3325 struct hfi1_qp_priv *qpriv = (struct hfi1_qp_priv *)qp->priv; 3326 struct hfi1_swqe_priv *priv = wqe->priv; 3327 struct tid_rdma_params *remote; 3328 enum ib_wr_opcode new_opcode; 3329 bool do_tid_rdma = false; 3330 struct hfi1_pportdata *ppd = qpriv->rcd->ppd; 3331 3332 if ((rdma_ah_get_dlid(&qp->remote_ah_attr) & ~((1 << ppd->lmc) - 1)) == 3333 ppd->lid) 3334 return; 3335 if (qpriv->hdr_type != HFI1_PKT_TYPE_9B) 3336 return; 3337 3338 rcu_read_lock(); 3339 remote = rcu_dereference(qpriv->tid_rdma.remote); 3340 /* 3341 * If TID RDMA is disabled by the negotiation, don't 3342 * use it. 3343 */ 3344 if (!remote) 3345 goto exit; 3346 3347 if (wqe->wr.opcode == IB_WR_RDMA_READ) { 3348 if (hfi1_check_sge_align(qp, &wqe->sg_list[0], 3349 wqe->wr.num_sge)) { 3350 new_opcode = IB_WR_TID_RDMA_READ; 3351 do_tid_rdma = true; 3352 } 3353 } else if (wqe->wr.opcode == IB_WR_RDMA_WRITE) { 3354 /* 3355 * TID RDMA is enabled for this RDMA WRITE request iff: 3356 * 1. The remote address is page-aligned, 3357 * 2. The length is larger than the minimum segment size, 3358 * 3. The length is page-multiple. 
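 * Unlike the RDMA READ case above, the local SG list is not required to be page aligned here.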
3359 */ 3360 if (!(wqe->rdma_wr.remote_addr & ~PAGE_MASK) && 3361 !(wqe->length & ~PAGE_MASK)) { 3362 new_opcode = IB_WR_TID_RDMA_WRITE; 3363 do_tid_rdma = true; 3364 } 3365 } 3366 3367 if (do_tid_rdma) { 3368 if (hfi1_kern_exp_rcv_alloc_flows(&priv->tid_req, GFP_ATOMIC)) 3369 goto exit; 3370 wqe->wr.opcode = new_opcode; 3371 priv->tid_req.seg_len = 3372 min_t(u32, remote->max_len, wqe->length); 3373 priv->tid_req.total_segs = 3374 DIV_ROUND_UP(wqe->length, priv->tid_req.seg_len); 3375 /* Compute the last PSN of the request */ 3376 wqe->lpsn = wqe->psn; 3377 if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) { 3378 priv->tid_req.n_flows = remote->max_read; 3379 qpriv->tid_r_reqs++; 3380 wqe->lpsn += rvt_div_round_up_mtu(qp, wqe->length) - 1; 3381 } else { 3382 wqe->lpsn += priv->tid_req.total_segs - 1; 3383 atomic_inc(&qpriv->n_requests); 3384 } 3385 3386 priv->tid_req.cur_seg = 0; 3387 priv->tid_req.comp_seg = 0; 3388 priv->tid_req.ack_seg = 0; 3389 priv->tid_req.state = TID_REQUEST_INACTIVE; 3390 /* 3391 * Reset acked_tail. 3392 * TID RDMA READ does not have ACKs so it does not 3393 * update the pointer. We have to reset it so TID RDMA 3394 * WRITE does not get confused. 3395 */ 3396 priv->tid_req.acked_tail = priv->tid_req.setup_head; 3397 trace_hfi1_tid_req_setup_tid_wqe(qp, 1, wqe->wr.opcode, 3398 wqe->psn, wqe->lpsn, 3399 &priv->tid_req); 3400 } 3401 exit: 3402 rcu_read_unlock(); 3403 } 3404 3405 /* TID RDMA WRITE functions */ 3406 3407 u32 hfi1_build_tid_rdma_write_req(struct rvt_qp *qp, struct rvt_swqe *wqe, 3408 struct ib_other_headers *ohdr, 3409 u32 *bth1, u32 *bth2, u32 *len) 3410 { 3411 struct hfi1_qp_priv *qpriv = qp->priv; 3412 struct tid_rdma_request *req = wqe_to_tid_req(wqe); 3413 struct tid_rdma_params *remote; 3414 3415 rcu_read_lock(); 3416 remote = rcu_dereference(qpriv->tid_rdma.remote); 3417 /* 3418 * Set the number of flow to be used based on negotiated 3419 * parameters. 3420 */ 3421 req->n_flows = remote->max_write; 3422 req->state = TID_REQUEST_ACTIVE; 3423 3424 KDETH_RESET(ohdr->u.tid_rdma.w_req.kdeth0, KVER, 0x1); 3425 KDETH_RESET(ohdr->u.tid_rdma.w_req.kdeth1, JKEY, remote->jkey); 3426 ohdr->u.tid_rdma.w_req.reth.vaddr = 3427 cpu_to_be64(wqe->rdma_wr.remote_addr + (wqe->length - *len)); 3428 ohdr->u.tid_rdma.w_req.reth.rkey = 3429 cpu_to_be32(wqe->rdma_wr.rkey); 3430 ohdr->u.tid_rdma.w_req.reth.length = cpu_to_be32(*len); 3431 ohdr->u.tid_rdma.w_req.verbs_qp = cpu_to_be32(qp->remote_qpn); 3432 *bth1 &= ~RVT_QPN_MASK; 3433 *bth1 |= remote->qp; 3434 qp->s_state = TID_OP(WRITE_REQ); 3435 qp->s_flags |= HFI1_S_WAIT_TID_RESP; 3436 *bth2 |= IB_BTH_REQ_ACK; 3437 *len = 0; 3438 3439 rcu_read_unlock(); 3440 return sizeof(ohdr->u.tid_rdma.w_req) / sizeof(u32); 3441 } 3442 3443 void hfi1_compute_tid_rdma_flow_wt(void) 3444 { 3445 /* 3446 * Heuristic for computing the RNR timeout when waiting on the flow 3447 * queue. Rather than a computationaly expensive exact estimate of when 3448 * a flow will be available, we assume that if a QP is at position N in 3449 * the flow queue it has to wait approximately (N + 1) * (number of 3450 * segments between two sync points), assuming PMTU of 4K. The rationale 3451 * for this is that flows are released and recycled at each sync point. 
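 * The value computed below is the number of segments that fit in one flow generation at 4K PMTU; hfi1_tid_write_alloc_resources() multiplies it by the QP's position in the flow queue to arrive at the RNR timeout in segments.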
3452 */ 3453 tid_rdma_flow_wt = MAX_TID_FLOW_PSN * enum_to_mtu(OPA_MTU_4096) / 3454 TID_RDMA_MAX_SEGMENT_SIZE; 3455 } 3456 3457 static u32 position_in_queue(struct hfi1_qp_priv *qpriv, 3458 struct tid_queue *queue) 3459 { 3460 return qpriv->tid_enqueue - queue->dequeue; 3461 } 3462 3463 /* 3464 * @qp: points to rvt_qp context. 3465 * @to_seg: desired RNR timeout in segments. 3466 * Return: index of the next highest timeout in the ib_hfi1_rnr_table[] 3467 */ 3468 static u32 hfi1_compute_tid_rnr_timeout(struct rvt_qp *qp, u32 to_seg) 3469 { 3470 struct hfi1_qp_priv *qpriv = qp->priv; 3471 u64 timeout; 3472 u32 bytes_per_us; 3473 u8 i; 3474 3475 bytes_per_us = active_egress_rate(qpriv->rcd->ppd) / 8; 3476 timeout = (to_seg * TID_RDMA_MAX_SEGMENT_SIZE) / bytes_per_us; 3477 /* 3478 * Find the next highest value in the RNR table to the required 3479 * timeout. This gives the responder some padding. 3480 */ 3481 for (i = 1; i <= IB_AETH_CREDIT_MASK; i++) 3482 if (rvt_rnr_tbl_to_usec(i) >= timeout) 3483 return i; 3484 return 0; 3485 } 3486 3487 /** 3488 * Central place for resource allocation at TID write responder, 3489 * is called from write_req and write_data interrupt handlers as 3490 * well as the send thread when a queued QP is scheduled for 3491 * resource allocation. 3492 * 3493 * Iterates over (a) segments of a request and then (b) queued requests 3494 * themselves to allocate resources for up to local->max_write 3495 * segments across multiple requests. Stop allocating when we 3496 * hit a sync point, resume allocating after data packets at 3497 * sync point have been received. 3498 * 3499 * Resource allocation and sending of responses is decoupled. The 3500 * request/segment which are being allocated and sent are as follows. 3501 * Resources are allocated for: 3502 * [request: qpriv->r_tid_alloc, segment: req->alloc_seg] 3503 * The send thread sends: 3504 * [request: qp->s_tail_ack_queue, segment:req->cur_seg] 3505 */ 3506 static void hfi1_tid_write_alloc_resources(struct rvt_qp *qp, bool intr_ctx) 3507 { 3508 struct tid_rdma_request *req; 3509 struct hfi1_qp_priv *qpriv = qp->priv; 3510 struct hfi1_ctxtdata *rcd = qpriv->rcd; 3511 struct tid_rdma_params *local = &qpriv->tid_rdma.local; 3512 struct rvt_ack_entry *e; 3513 u32 npkts, to_seg; 3514 bool last; 3515 int ret = 0; 3516 3517 lockdep_assert_held(&qp->s_lock); 3518 3519 while (1) { 3520 trace_hfi1_rsp_tid_write_alloc_res(qp, 0); 3521 trace_hfi1_tid_write_rsp_alloc_res(qp); 3522 /* 3523 * Don't allocate more segments if a RNR NAK has already been 3524 * scheduled to avoid messing up qp->r_psn: the RNR NAK will 3525 * be sent only when all allocated segments have been sent. 3526 * However, if more segments are allocated before that, TID RDMA 3527 * WRITE RESP packets will be sent out for these new segments 3528 * before the RNR NAK packet. When the requester receives the 3529 * RNR NAK packet, it will restart with qp->s_last_psn + 1, 3530 * which does not match qp->r_psn and will be dropped. 3531 * Consequently, the requester will exhaust its retries and 3532 * put the qp into error state. 
3533 */ 3534 if (qpriv->rnr_nak_state == TID_RNR_NAK_SEND) 3535 break; 3536 3537 /* No requests left to process */ 3538 if (qpriv->r_tid_alloc == qpriv->r_tid_head) { 3539 /* If all data has been received, clear the flow */ 3540 if (qpriv->flow_state.index < RXE_NUM_TID_FLOWS && 3541 !qpriv->alloc_w_segs) { 3542 hfi1_kern_clear_hw_flow(rcd, qp); 3543 qpriv->s_flags &= ~HFI1_R_TID_SW_PSN; 3544 } 3545 break; 3546 } 3547 3548 e = &qp->s_ack_queue[qpriv->r_tid_alloc]; 3549 if (e->opcode != TID_OP(WRITE_REQ)) 3550 goto next_req; 3551 req = ack_to_tid_req(e); 3552 trace_hfi1_tid_req_write_alloc_res(qp, 0, e->opcode, e->psn, 3553 e->lpsn, req); 3554 /* Finished allocating for all segments of this request */ 3555 if (req->alloc_seg >= req->total_segs) 3556 goto next_req; 3557 3558 /* Can allocate only a maximum of local->max_write for a QP */ 3559 if (qpriv->alloc_w_segs >= local->max_write) 3560 break; 3561 3562 /* Don't allocate at a sync point with data packets pending */ 3563 if (qpriv->sync_pt && qpriv->alloc_w_segs) 3564 break; 3565 3566 /* All data received at the sync point, continue */ 3567 if (qpriv->sync_pt && !qpriv->alloc_w_segs) { 3568 hfi1_kern_clear_hw_flow(rcd, qp); 3569 qpriv->sync_pt = false; 3570 qpriv->s_flags &= ~HFI1_R_TID_SW_PSN; 3571 } 3572 3573 /* Allocate flow if we don't have one */ 3574 if (qpriv->flow_state.index >= RXE_NUM_TID_FLOWS) { 3575 ret = hfi1_kern_setup_hw_flow(qpriv->rcd, qp); 3576 if (ret) { 3577 to_seg = tid_rdma_flow_wt * 3578 position_in_queue(qpriv, 3579 &rcd->flow_queue); 3580 break; 3581 } 3582 } 3583 3584 npkts = rvt_div_round_up_mtu(qp, req->seg_len); 3585 3586 /* 3587 * We are at a sync point if we run out of KDETH PSN space. 3588 * Last PSN of every generation is reserved for RESYNC. 3589 */ 3590 if (qpriv->flow_state.psn + npkts > MAX_TID_FLOW_PSN - 1) { 3591 qpriv->sync_pt = true; 3592 break; 3593 } 3594 3595 /* 3596 * If overtaking req->acked_tail, send an RNR NAK. 
Because the 3597 * QP is not queued in this case, and the issue can only be 3598 * caused due a delay in scheduling the second leg which we 3599 * cannot estimate, we use a rather arbitrary RNR timeout of 3600 * (MAX_FLOWS / 2) segments 3601 */ 3602 if (!CIRC_SPACE(req->setup_head, req->acked_tail, 3603 MAX_FLOWS)) { 3604 ret = -EAGAIN; 3605 to_seg = MAX_FLOWS >> 1; 3606 qpriv->s_flags |= RVT_S_ACK_PENDING; 3607 hfi1_schedule_tid_send(qp); 3608 break; 3609 } 3610 3611 /* Try to allocate rcv array / TID entries */ 3612 ret = hfi1_kern_exp_rcv_setup(req, &req->ss, &last); 3613 if (ret == -EAGAIN) 3614 to_seg = position_in_queue(qpriv, &rcd->rarr_queue); 3615 if (ret) 3616 break; 3617 3618 qpriv->alloc_w_segs++; 3619 req->alloc_seg++; 3620 continue; 3621 next_req: 3622 /* Begin processing the next request */ 3623 if (++qpriv->r_tid_alloc > 3624 rvt_size_atomic(ib_to_rvt(qp->ibqp.device))) 3625 qpriv->r_tid_alloc = 0; 3626 } 3627 3628 /* 3629 * Schedule an RNR NAK to be sent if (a) flow or rcv array allocation 3630 * has failed (b) we are called from the rcv handler interrupt context 3631 * (c) an RNR NAK has not already been scheduled 3632 */ 3633 if (ret == -EAGAIN && intr_ctx && !qp->r_nak_state) 3634 goto send_rnr_nak; 3635 3636 return; 3637 3638 send_rnr_nak: 3639 lockdep_assert_held(&qp->r_lock); 3640 3641 /* Set r_nak_state to prevent unrelated events from generating NAK's */ 3642 qp->r_nak_state = hfi1_compute_tid_rnr_timeout(qp, to_seg) | IB_RNR_NAK; 3643 3644 /* Pull back r_psn to the segment being RNR NAK'd */ 3645 qp->r_psn = e->psn + req->alloc_seg; 3646 qp->r_ack_psn = qp->r_psn; 3647 /* 3648 * Pull back r_head_ack_queue to the ack entry following the request 3649 * being RNR NAK'd. This allows resources to be allocated to the request 3650 * if the queued QP is scheduled. 3651 */ 3652 qp->r_head_ack_queue = qpriv->r_tid_alloc + 1; 3653 if (qp->r_head_ack_queue > rvt_size_atomic(ib_to_rvt(qp->ibqp.device))) 3654 qp->r_head_ack_queue = 0; 3655 qpriv->r_tid_head = qp->r_head_ack_queue; 3656 /* 3657 * These send side fields are used in make_rc_ack(). They are set in 3658 * hfi1_send_rc_ack() but must be set here before dropping qp->s_lock 3659 * for consistency 3660 */ 3661 qp->s_nak_state = qp->r_nak_state; 3662 qp->s_ack_psn = qp->r_ack_psn; 3663 /* 3664 * Clear the ACK PENDING flag to prevent unwanted ACK because we 3665 * have modified qp->s_ack_psn here. 3666 */ 3667 qp->s_flags &= ~(RVT_S_ACK_PENDING); 3668 3669 trace_hfi1_rsp_tid_write_alloc_res(qp, qp->r_psn); 3670 /* 3671 * qpriv->rnr_nak_state is used to determine when the scheduled RNR NAK 3672 * has actually been sent. qp->s_flags RVT_S_ACK_PENDING bit cannot be 3673 * used for this because qp->s_lock is dropped before calling 3674 * hfi1_send_rc_ack() leading to inconsistency between the receive 3675 * interrupt handlers and the send thread in make_rc_ack() 3676 */ 3677 qpriv->rnr_nak_state = TID_RNR_NAK_SEND; 3678 3679 /* 3680 * Schedule RNR NAK to be sent. RNR NAK's are scheduled from the receive 3681 * interrupt handlers but will be sent from the send engine behind any 3682 * previous responses that may have been scheduled 3683 */ 3684 rc_defered_ack(rcd, qp); 3685 } 3686 3687 void hfi1_rc_rcv_tid_rdma_write_req(struct hfi1_packet *packet) 3688 { 3689 /* HANDLER FOR TID RDMA WRITE REQUEST packet (Responder side)*/ 3690 3691 /* 3692 * 1. Verify TID RDMA WRITE REQ as per IB_OPCODE_RC_RDMA_WRITE_FIRST 3693 * (see hfi1_rc_rcv()) 3694 * - Don't allow 0-length requests. 3695 * 2. 
Put TID RDMA WRITE REQ into the response queue (s_ack_queue)
3696 * - Setup struct tid_rdma_req with request info
3697 * - Prepare struct tid_rdma_flow array?
3698 * 3. Set the qp->s_ack_state as state diagram in design doc.
3699 * 4. Set RVT_S_RESP_PENDING in s_flags.
3700 * 5. Kick the send engine (hfi1_schedule_send())
3701 */
3702 struct hfi1_ctxtdata *rcd = packet->rcd;
3703 struct rvt_qp *qp = packet->qp;
3704 struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
3705 struct ib_other_headers *ohdr = packet->ohdr;
3706 struct rvt_ack_entry *e;
3707 unsigned long flags;
3708 struct ib_reth *reth;
3709 struct hfi1_qp_priv *qpriv = qp->priv;
3710 struct tid_rdma_request *req;
3711 u32 bth0, psn, len, rkey, num_segs;
3712 bool fecn;
3713 u8 next;
3714 u64 vaddr;
3715 int diff;
3716
3717 bth0 = be32_to_cpu(ohdr->bth[0]);
3718 if (hfi1_ruc_check_hdr(ibp, packet))
3719 return;
3720
3721 fecn = process_ecn(qp, packet);
3722 psn = mask_psn(be32_to_cpu(ohdr->bth[2]));
3723 trace_hfi1_rsp_rcv_tid_write_req(qp, psn);
3724
3725 if (qp->state == IB_QPS_RTR && !(qp->r_flags & RVT_R_COMM_EST))
3726 rvt_comm_est(qp);
3727
3728 if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
3729 goto nack_inv;
3730
3731 reth = &ohdr->u.tid_rdma.w_req.reth;
3732 vaddr = be64_to_cpu(reth->vaddr);
3733 len = be32_to_cpu(reth->length);
3734
3735 num_segs = DIV_ROUND_UP(len, qpriv->tid_rdma.local.max_len);
3736 diff = delta_psn(psn, qp->r_psn);
3737 if (unlikely(diff)) {
3738 tid_rdma_rcv_err(packet, ohdr, qp, psn, diff, fecn);
3739 return;
3740 }
3741
3742 /*
3743 * The resent request which was previously RNR NAK'd is inserted at the
3744 * location of the original request, which is one entry behind
3745 * r_head_ack_queue
3746 */
3747 if (qpriv->rnr_nak_state)
3748 qp->r_head_ack_queue = qp->r_head_ack_queue ?
3749 qp->r_head_ack_queue - 1 :
3750 rvt_size_atomic(ib_to_rvt(qp->ibqp.device));
3751
3752 /* We've verified the request, insert it into the ack queue.
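 *
 * As an illustration (numbers are hypothetical): a 1 MB request with a
 * local max_len of 256 KB gives num_segs = DIV_ROUND_UP(len, max_len) = 4,
 * so the entry created below spans IB PSNs e->psn .. e->psn + num_segs - 1
 * and is serviced one segment at a time as flow resources become
 * available.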
*/ 3753 next = qp->r_head_ack_queue + 1; 3754 if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device))) 3755 next = 0; 3756 spin_lock_irqsave(&qp->s_lock, flags); 3757 if (unlikely(next == qp->s_acked_ack_queue)) { 3758 if (!qp->s_ack_queue[next].sent) 3759 goto nack_inv_unlock; 3760 update_ack_queue(qp, next); 3761 } 3762 e = &qp->s_ack_queue[qp->r_head_ack_queue]; 3763 req = ack_to_tid_req(e); 3764 3765 /* Bring previously RNR NAK'd request back to life */ 3766 if (qpriv->rnr_nak_state) { 3767 qp->r_nak_state = 0; 3768 qp->s_nak_state = 0; 3769 qpriv->rnr_nak_state = TID_RNR_NAK_INIT; 3770 qp->r_psn = e->lpsn + 1; 3771 req->state = TID_REQUEST_INIT; 3772 goto update_head; 3773 } 3774 3775 release_rdma_sge_mr(e); 3776 3777 /* The length needs to be in multiples of PAGE_SIZE */ 3778 if (!len || len & ~PAGE_MASK) 3779 goto nack_inv_unlock; 3780 3781 rkey = be32_to_cpu(reth->rkey); 3782 qp->r_len = len; 3783 3784 if (e->opcode == TID_OP(WRITE_REQ) && 3785 (req->setup_head != req->clear_tail || 3786 req->clear_tail != req->acked_tail)) 3787 goto nack_inv_unlock; 3788 3789 if (unlikely(!rvt_rkey_ok(qp, &e->rdma_sge, qp->r_len, vaddr, 3790 rkey, IB_ACCESS_REMOTE_WRITE))) 3791 goto nack_acc; 3792 3793 qp->r_psn += num_segs - 1; 3794 3795 e->opcode = (bth0 >> 24) & 0xff; 3796 e->psn = psn; 3797 e->lpsn = qp->r_psn; 3798 e->sent = 0; 3799 3800 req->n_flows = min_t(u16, num_segs, qpriv->tid_rdma.local.max_write); 3801 req->state = TID_REQUEST_INIT; 3802 req->cur_seg = 0; 3803 req->comp_seg = 0; 3804 req->ack_seg = 0; 3805 req->alloc_seg = 0; 3806 req->isge = 0; 3807 req->seg_len = qpriv->tid_rdma.local.max_len; 3808 req->total_len = len; 3809 req->total_segs = num_segs; 3810 req->r_flow_psn = e->psn; 3811 req->ss.sge = e->rdma_sge; 3812 req->ss.num_sge = 1; 3813 3814 req->flow_idx = req->setup_head; 3815 req->clear_tail = req->setup_head; 3816 req->acked_tail = req->setup_head; 3817 3818 qp->r_state = e->opcode; 3819 qp->r_nak_state = 0; 3820 /* 3821 * We need to increment the MSN here instead of when we 3822 * finish sending the result since a duplicate request would 3823 * increment it more than once. 3824 */ 3825 qp->r_msn++; 3826 qp->r_psn++; 3827 3828 trace_hfi1_tid_req_rcv_write_req(qp, 0, e->opcode, e->psn, e->lpsn, 3829 req); 3830 3831 if (qpriv->r_tid_tail == HFI1_QP_WQE_INVALID) { 3832 qpriv->r_tid_tail = qp->r_head_ack_queue; 3833 } else if (qpriv->r_tid_tail == qpriv->r_tid_head) { 3834 struct tid_rdma_request *ptr; 3835 3836 e = &qp->s_ack_queue[qpriv->r_tid_tail]; 3837 ptr = ack_to_tid_req(e); 3838 3839 if (e->opcode != TID_OP(WRITE_REQ) || 3840 ptr->comp_seg == ptr->total_segs) { 3841 if (qpriv->r_tid_tail == qpriv->r_tid_ack) 3842 qpriv->r_tid_ack = qp->r_head_ack_queue; 3843 qpriv->r_tid_tail = qp->r_head_ack_queue; 3844 } 3845 } 3846 update_head: 3847 qp->r_head_ack_queue = next; 3848 qpriv->r_tid_head = qp->r_head_ack_queue; 3849 3850 hfi1_tid_write_alloc_resources(qp, true); 3851 trace_hfi1_tid_write_rsp_rcv_req(qp); 3852 3853 /* Schedule the send tasklet. 
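 *
 * Setting RVT_S_RESP_PENDING below only marks this entry as pending
 * work; the TID RDMA WRITE RESP itself is built later, when the send
 * engine reaches the entry (see hfi1_build_tid_rdma_write_resp()).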
*/ 3854 qp->s_flags |= RVT_S_RESP_PENDING; 3855 if (fecn) 3856 qp->s_flags |= RVT_S_ECN; 3857 hfi1_schedule_send(qp); 3858 3859 spin_unlock_irqrestore(&qp->s_lock, flags); 3860 return; 3861 3862 nack_inv_unlock: 3863 spin_unlock_irqrestore(&qp->s_lock, flags); 3864 nack_inv: 3865 rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR); 3866 qp->r_nak_state = IB_NAK_INVALID_REQUEST; 3867 qp->r_ack_psn = qp->r_psn; 3868 /* Queue NAK for later */ 3869 rc_defered_ack(rcd, qp); 3870 return; 3871 nack_acc: 3872 spin_unlock_irqrestore(&qp->s_lock, flags); 3873 rvt_rc_error(qp, IB_WC_LOC_PROT_ERR); 3874 qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR; 3875 qp->r_ack_psn = qp->r_psn; 3876 } 3877 3878 u32 hfi1_build_tid_rdma_write_resp(struct rvt_qp *qp, struct rvt_ack_entry *e, 3879 struct ib_other_headers *ohdr, u32 *bth1, 3880 u32 bth2, u32 *len, 3881 struct rvt_sge_state **ss) 3882 { 3883 struct hfi1_ack_priv *epriv = e->priv; 3884 struct tid_rdma_request *req = &epriv->tid_req; 3885 struct hfi1_qp_priv *qpriv = qp->priv; 3886 struct tid_rdma_flow *flow = NULL; 3887 u32 resp_len = 0, hdwords = 0; 3888 void *resp_addr = NULL; 3889 struct tid_rdma_params *remote; 3890 3891 trace_hfi1_tid_req_build_write_resp(qp, 0, e->opcode, e->psn, e->lpsn, 3892 req); 3893 trace_hfi1_tid_write_rsp_build_resp(qp); 3894 trace_hfi1_rsp_build_tid_write_resp(qp, bth2); 3895 flow = &req->flows[req->flow_idx]; 3896 switch (req->state) { 3897 default: 3898 /* 3899 * Try to allocate resources here in case QP was queued and was 3900 * later scheduled when resources became available 3901 */ 3902 hfi1_tid_write_alloc_resources(qp, false); 3903 3904 /* We've already sent everything which is ready */ 3905 if (req->cur_seg >= req->alloc_seg) 3906 goto done; 3907 3908 /* 3909 * Resources can be assigned but responses cannot be sent in 3910 * rnr_nak state, till the resent request is received 3911 */ 3912 if (qpriv->rnr_nak_state == TID_RNR_NAK_SENT) 3913 goto done; 3914 3915 req->state = TID_REQUEST_ACTIVE; 3916 trace_hfi1_tid_flow_build_write_resp(qp, req->flow_idx, flow); 3917 req->flow_idx = CIRC_NEXT(req->flow_idx, MAX_FLOWS); 3918 hfi1_add_tid_reap_timer(qp); 3919 break; 3920 3921 case TID_REQUEST_RESEND_ACTIVE: 3922 case TID_REQUEST_RESEND: 3923 trace_hfi1_tid_flow_build_write_resp(qp, req->flow_idx, flow); 3924 req->flow_idx = CIRC_NEXT(req->flow_idx, MAX_FLOWS); 3925 if (!CIRC_CNT(req->setup_head, req->flow_idx, MAX_FLOWS)) 3926 req->state = TID_REQUEST_ACTIVE; 3927 3928 hfi1_mod_tid_reap_timer(qp); 3929 break; 3930 } 3931 flow->flow_state.resp_ib_psn = bth2; 3932 resp_addr = (void *)flow->tid_entry; 3933 resp_len = sizeof(*flow->tid_entry) * flow->tidcnt; 3934 req->cur_seg++; 3935 3936 memset(&ohdr->u.tid_rdma.w_rsp, 0, sizeof(ohdr->u.tid_rdma.w_rsp)); 3937 epriv->ss.sge.vaddr = resp_addr; 3938 epriv->ss.sge.sge_length = resp_len; 3939 epriv->ss.sge.length = epriv->ss.sge.sge_length; 3940 /* 3941 * We can safely zero these out. Since the first SGE covers the 3942 * entire packet, nothing else should even look at the MR. 
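 *
 * The response "payload" is simply the TID entry array for this flow:
 * resp_len above is flow->tidcnt entries of sizeof(*flow->tid_entry)
 * bytes each, and the single SGE points directly at that kernel
 * buffer, so no MR is needed.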
3943 */ 3944 epriv->ss.sge.mr = NULL; 3945 epriv->ss.sge.m = 0; 3946 epriv->ss.sge.n = 0; 3947 3948 epriv->ss.sg_list = NULL; 3949 epriv->ss.total_len = epriv->ss.sge.sge_length; 3950 epriv->ss.num_sge = 1; 3951 3952 *ss = &epriv->ss; 3953 *len = epriv->ss.total_len; 3954 3955 /* Construct the TID RDMA WRITE RESP packet header */ 3956 rcu_read_lock(); 3957 remote = rcu_dereference(qpriv->tid_rdma.remote); 3958 3959 KDETH_RESET(ohdr->u.tid_rdma.w_rsp.kdeth0, KVER, 0x1); 3960 KDETH_RESET(ohdr->u.tid_rdma.w_rsp.kdeth1, JKEY, remote->jkey); 3961 ohdr->u.tid_rdma.w_rsp.aeth = rvt_compute_aeth(qp); 3962 ohdr->u.tid_rdma.w_rsp.tid_flow_psn = 3963 cpu_to_be32((flow->flow_state.generation << 3964 HFI1_KDETH_BTH_SEQ_SHIFT) | 3965 (flow->flow_state.spsn & 3966 HFI1_KDETH_BTH_SEQ_MASK)); 3967 ohdr->u.tid_rdma.w_rsp.tid_flow_qp = 3968 cpu_to_be32(qpriv->tid_rdma.local.qp | 3969 ((flow->idx & TID_RDMA_DESTQP_FLOW_MASK) << 3970 TID_RDMA_DESTQP_FLOW_SHIFT) | 3971 qpriv->rcd->ctxt); 3972 ohdr->u.tid_rdma.w_rsp.verbs_qp = cpu_to_be32(qp->remote_qpn); 3973 *bth1 = remote->qp; 3974 rcu_read_unlock(); 3975 hdwords = sizeof(ohdr->u.tid_rdma.w_rsp) / sizeof(u32); 3976 qpriv->pending_tid_w_segs++; 3977 done: 3978 return hdwords; 3979 } 3980 3981 static void hfi1_add_tid_reap_timer(struct rvt_qp *qp) 3982 { 3983 struct hfi1_qp_priv *qpriv = qp->priv; 3984 3985 lockdep_assert_held(&qp->s_lock); 3986 if (!(qpriv->s_flags & HFI1_R_TID_RSC_TIMER)) { 3987 qpriv->s_flags |= HFI1_R_TID_RSC_TIMER; 3988 qpriv->s_tid_timer.expires = jiffies + 3989 qpriv->tid_timer_timeout_jiffies; 3990 add_timer(&qpriv->s_tid_timer); 3991 } 3992 } 3993 3994 static void hfi1_mod_tid_reap_timer(struct rvt_qp *qp) 3995 { 3996 struct hfi1_qp_priv *qpriv = qp->priv; 3997 3998 lockdep_assert_held(&qp->s_lock); 3999 qpriv->s_flags |= HFI1_R_TID_RSC_TIMER; 4000 mod_timer(&qpriv->s_tid_timer, jiffies + 4001 qpriv->tid_timer_timeout_jiffies); 4002 } 4003 4004 static int hfi1_stop_tid_reap_timer(struct rvt_qp *qp) 4005 { 4006 struct hfi1_qp_priv *qpriv = qp->priv; 4007 int rval = 0; 4008 4009 lockdep_assert_held(&qp->s_lock); 4010 if (qpriv->s_flags & HFI1_R_TID_RSC_TIMER) { 4011 rval = del_timer(&qpriv->s_tid_timer); 4012 qpriv->s_flags &= ~HFI1_R_TID_RSC_TIMER; 4013 } 4014 return rval; 4015 } 4016 4017 void hfi1_del_tid_reap_timer(struct rvt_qp *qp) 4018 { 4019 struct hfi1_qp_priv *qpriv = qp->priv; 4020 4021 del_timer_sync(&qpriv->s_tid_timer); 4022 qpriv->s_flags &= ~HFI1_R_TID_RSC_TIMER; 4023 } 4024 4025 static void hfi1_tid_timeout(struct timer_list *t) 4026 { 4027 struct hfi1_qp_priv *qpriv = from_timer(qpriv, t, s_tid_timer); 4028 struct rvt_qp *qp = qpriv->owner; 4029 struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device); 4030 unsigned long flags; 4031 u32 i; 4032 4033 spin_lock_irqsave(&qp->r_lock, flags); 4034 spin_lock(&qp->s_lock); 4035 if (qpriv->s_flags & HFI1_R_TID_RSC_TIMER) { 4036 dd_dev_warn(dd_from_ibdev(qp->ibqp.device), "[QP%u] %s %d\n", 4037 qp->ibqp.qp_num, __func__, __LINE__); 4038 trace_hfi1_msg_tid_timeout(/* msg */ 4039 qp, "resource timeout = ", 4040 (u64)qpriv->tid_timer_timeout_jiffies); 4041 hfi1_stop_tid_reap_timer(qp); 4042 /* 4043 * Go though the entire ack queue and clear any outstanding 4044 * HW flow and RcvArray resources. 
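 *
 * Every slot (0 .. rvt_max_atomic(rdi) - 1) is walked because the QP
 * is moved to the error state below, so no entry may keep its RcvArray
 * programming past this point.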
4045 */ 4046 hfi1_kern_clear_hw_flow(qpriv->rcd, qp); 4047 for (i = 0; i < rvt_max_atomic(rdi); i++) { 4048 struct tid_rdma_request *req = 4049 ack_to_tid_req(&qp->s_ack_queue[i]); 4050 4051 hfi1_kern_exp_rcv_clear_all(req); 4052 } 4053 spin_unlock(&qp->s_lock); 4054 if (qp->ibqp.event_handler) { 4055 struct ib_event ev; 4056 4057 ev.device = qp->ibqp.device; 4058 ev.element.qp = &qp->ibqp; 4059 ev.event = IB_EVENT_QP_FATAL; 4060 qp->ibqp.event_handler(&ev, qp->ibqp.qp_context); 4061 } 4062 rvt_rc_error(qp, IB_WC_RESP_TIMEOUT_ERR); 4063 goto unlock_r_lock; 4064 } 4065 spin_unlock(&qp->s_lock); 4066 unlock_r_lock: 4067 spin_unlock_irqrestore(&qp->r_lock, flags); 4068 } 4069 4070 void hfi1_rc_rcv_tid_rdma_write_resp(struct hfi1_packet *packet) 4071 { 4072 /* HANDLER FOR TID RDMA WRITE RESPONSE packet (Requestor side */ 4073 4074 /* 4075 * 1. Find matching SWQE 4076 * 2. Check that TIDENTRY array has enough space for a complete 4077 * segment. If not, put QP in error state. 4078 * 3. Save response data in struct tid_rdma_req and struct tid_rdma_flow 4079 * 4. Remove HFI1_S_WAIT_TID_RESP from s_flags. 4080 * 5. Set qp->s_state 4081 * 6. Kick the send engine (hfi1_schedule_send()) 4082 */ 4083 struct ib_other_headers *ohdr = packet->ohdr; 4084 struct rvt_qp *qp = packet->qp; 4085 struct hfi1_qp_priv *qpriv = qp->priv; 4086 struct hfi1_ctxtdata *rcd = packet->rcd; 4087 struct rvt_swqe *wqe; 4088 struct tid_rdma_request *req; 4089 struct tid_rdma_flow *flow; 4090 enum ib_wc_status status; 4091 u32 opcode, aeth, psn, flow_psn, i, tidlen = 0, pktlen; 4092 bool fecn; 4093 unsigned long flags; 4094 4095 fecn = process_ecn(qp, packet); 4096 psn = mask_psn(be32_to_cpu(ohdr->bth[2])); 4097 aeth = be32_to_cpu(ohdr->u.tid_rdma.w_rsp.aeth); 4098 opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff; 4099 4100 spin_lock_irqsave(&qp->s_lock, flags); 4101 4102 /* Ignore invalid responses */ 4103 if (cmp_psn(psn, qp->s_next_psn) >= 0) 4104 goto ack_done; 4105 4106 /* Ignore duplicate responses. */ 4107 if (unlikely(cmp_psn(psn, qp->s_last_psn) <= 0)) 4108 goto ack_done; 4109 4110 if (unlikely(qp->s_acked == qp->s_tail)) 4111 goto ack_done; 4112 4113 /* 4114 * If we are waiting for a particular packet sequence number 4115 * due to a request being resent, check for it. Otherwise, 4116 * ensure that we haven't missed anything. 4117 */ 4118 if (qp->r_flags & RVT_R_RDMAR_SEQ) { 4119 if (cmp_psn(psn, qp->s_last_psn + 1) != 0) 4120 goto ack_done; 4121 qp->r_flags &= ~RVT_R_RDMAR_SEQ; 4122 } 4123 4124 wqe = rvt_get_swqe_ptr(qp, qpriv->s_tid_cur); 4125 if (unlikely(wqe->wr.opcode != IB_WR_TID_RDMA_WRITE)) 4126 goto ack_op_err; 4127 4128 req = wqe_to_tid_req(wqe); 4129 /* 4130 * If we've lost ACKs and our acked_tail pointer is too far 4131 * behind, don't overwrite segments. Just drop the packet and 4132 * let the reliability protocol take care of it. 4133 */ 4134 if (!CIRC_SPACE(req->setup_head, req->acked_tail, MAX_FLOWS)) 4135 goto ack_done; 4136 4137 /* 4138 * The call to do_rc_ack() should be last in the chain of 4139 * packet checks because it will end up updating the QP state. 4140 * Therefore, anything that would prevent the packet from 4141 * being accepted as a successful response should be prior 4142 * to it. 
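 *
 * For example, do_rc_ack() may advance qp->s_acked and complete
 * earlier WQEs; doing that for a packet that is subsequently rejected
 * would leave the requester's state ahead of what was actually
 * accepted.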
4143 */ 4144 if (!do_rc_ack(qp, aeth, psn, opcode, 0, rcd)) 4145 goto ack_done; 4146 4147 trace_hfi1_ack(qp, psn); 4148 4149 flow = &req->flows[req->setup_head]; 4150 flow->pkt = 0; 4151 flow->tid_idx = 0; 4152 flow->tid_offset = 0; 4153 flow->sent = 0; 4154 flow->resync_npkts = 0; 4155 flow->tid_qpn = be32_to_cpu(ohdr->u.tid_rdma.w_rsp.tid_flow_qp); 4156 flow->idx = (flow->tid_qpn >> TID_RDMA_DESTQP_FLOW_SHIFT) & 4157 TID_RDMA_DESTQP_FLOW_MASK; 4158 flow_psn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.w_rsp.tid_flow_psn)); 4159 flow->flow_state.generation = flow_psn >> HFI1_KDETH_BTH_SEQ_SHIFT; 4160 flow->flow_state.spsn = flow_psn & HFI1_KDETH_BTH_SEQ_MASK; 4161 flow->flow_state.resp_ib_psn = psn; 4162 flow->length = min_t(u32, req->seg_len, 4163 (wqe->length - (req->comp_seg * req->seg_len))); 4164 4165 flow->npkts = rvt_div_round_up_mtu(qp, flow->length); 4166 flow->flow_state.lpsn = flow->flow_state.spsn + 4167 flow->npkts - 1; 4168 /* payload length = packet length - (header length + ICRC length) */ 4169 pktlen = packet->tlen - (packet->hlen + 4); 4170 if (pktlen > sizeof(flow->tid_entry)) { 4171 status = IB_WC_LOC_LEN_ERR; 4172 goto ack_err; 4173 } 4174 memcpy(flow->tid_entry, packet->ebuf, pktlen); 4175 flow->tidcnt = pktlen / sizeof(*flow->tid_entry); 4176 trace_hfi1_tid_flow_rcv_write_resp(qp, req->setup_head, flow); 4177 4178 req->comp_seg++; 4179 trace_hfi1_tid_write_sender_rcv_resp(qp, 0); 4180 /* 4181 * Walk the TID_ENTRY list to make sure we have enough space for a 4182 * complete segment. 4183 */ 4184 for (i = 0; i < flow->tidcnt; i++) { 4185 trace_hfi1_tid_entry_rcv_write_resp(/* entry */ 4186 qp, i, flow->tid_entry[i]); 4187 if (!EXP_TID_GET(flow->tid_entry[i], LEN)) { 4188 status = IB_WC_LOC_LEN_ERR; 4189 goto ack_err; 4190 } 4191 tidlen += EXP_TID_GET(flow->tid_entry[i], LEN); 4192 } 4193 if (tidlen * PAGE_SIZE < flow->length) { 4194 status = IB_WC_LOC_LEN_ERR; 4195 goto ack_err; 4196 } 4197 4198 trace_hfi1_tid_req_rcv_write_resp(qp, 0, wqe->wr.opcode, wqe->psn, 4199 wqe->lpsn, req); 4200 /* 4201 * If this is the first response for this request, set the initial 4202 * flow index to the current flow. 4203 */ 4204 if (!cmp_psn(psn, wqe->psn)) { 4205 req->r_last_acked = mask_psn(wqe->psn - 1); 4206 /* Set acked flow index to head index */ 4207 req->acked_tail = req->setup_head; 4208 } 4209 4210 /* advance circular buffer head */ 4211 req->setup_head = CIRC_NEXT(req->setup_head, MAX_FLOWS); 4212 req->state = TID_REQUEST_ACTIVE; 4213 4214 /* 4215 * If all responses for this TID RDMA WRITE request have been received 4216 * advance the pointer to the next one. 4217 * Since TID RDMA requests could be mixed in with regular IB requests, 4218 * they might not appear sequentially in the queue. Therefore, the 4219 * next request needs to be "found". 
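 *
 * A hypothetical example: with s_size = 16, s_tid_cur = 14 and the
 * next TID RDMA WRITE WQE in slot 1, the loop below wraps through
 * slots 15 and 0 before stopping at slot 1 (or at s_tid_head if no
 * further TID RDMA WRITE is queued).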
4220 */ 4221 if (qpriv->s_tid_cur != qpriv->s_tid_head && 4222 req->comp_seg == req->total_segs) { 4223 for (i = qpriv->s_tid_cur + 1; ; i++) { 4224 if (i == qp->s_size) 4225 i = 0; 4226 wqe = rvt_get_swqe_ptr(qp, i); 4227 if (i == qpriv->s_tid_head) 4228 break; 4229 if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE) 4230 break; 4231 } 4232 qpriv->s_tid_cur = i; 4233 } 4234 qp->s_flags &= ~HFI1_S_WAIT_TID_RESP; 4235 hfi1_schedule_tid_send(qp); 4236 goto ack_done; 4237 4238 ack_op_err: 4239 status = IB_WC_LOC_QP_OP_ERR; 4240 ack_err: 4241 rvt_error_qp(qp, status); 4242 ack_done: 4243 if (fecn) 4244 qp->s_flags |= RVT_S_ECN; 4245 spin_unlock_irqrestore(&qp->s_lock, flags); 4246 } 4247 4248 bool hfi1_build_tid_rdma_packet(struct rvt_swqe *wqe, 4249 struct ib_other_headers *ohdr, 4250 u32 *bth1, u32 *bth2, u32 *len) 4251 { 4252 struct tid_rdma_request *req = wqe_to_tid_req(wqe); 4253 struct tid_rdma_flow *flow = &req->flows[req->clear_tail]; 4254 struct tid_rdma_params *remote; 4255 struct rvt_qp *qp = req->qp; 4256 struct hfi1_qp_priv *qpriv = qp->priv; 4257 u32 tidentry = flow->tid_entry[flow->tid_idx]; 4258 u32 tidlen = EXP_TID_GET(tidentry, LEN) << PAGE_SHIFT; 4259 struct tid_rdma_write_data *wd = &ohdr->u.tid_rdma.w_data; 4260 u32 next_offset, om = KDETH_OM_LARGE; 4261 bool last_pkt; 4262 4263 if (!tidlen) { 4264 hfi1_trdma_send_complete(qp, wqe, IB_WC_REM_INV_RD_REQ_ERR); 4265 rvt_error_qp(qp, IB_WC_REM_INV_RD_REQ_ERR); 4266 } 4267 4268 *len = min_t(u32, qp->pmtu, tidlen - flow->tid_offset); 4269 flow->sent += *len; 4270 next_offset = flow->tid_offset + *len; 4271 last_pkt = (flow->tid_idx == (flow->tidcnt - 1) && 4272 next_offset >= tidlen) || (flow->sent >= flow->length); 4273 trace_hfi1_tid_entry_build_write_data(qp, flow->tid_idx, tidentry); 4274 trace_hfi1_tid_flow_build_write_data(qp, req->clear_tail, flow); 4275 4276 rcu_read_lock(); 4277 remote = rcu_dereference(qpriv->tid_rdma.remote); 4278 KDETH_RESET(wd->kdeth0, KVER, 0x1); 4279 KDETH_SET(wd->kdeth0, SH, !last_pkt); 4280 KDETH_SET(wd->kdeth0, INTR, !!(!last_pkt && remote->urg)); 4281 KDETH_SET(wd->kdeth0, TIDCTRL, EXP_TID_GET(tidentry, CTRL)); 4282 KDETH_SET(wd->kdeth0, TID, EXP_TID_GET(tidentry, IDX)); 4283 KDETH_SET(wd->kdeth0, OM, om == KDETH_OM_LARGE); 4284 KDETH_SET(wd->kdeth0, OFFSET, flow->tid_offset / om); 4285 KDETH_RESET(wd->kdeth1, JKEY, remote->jkey); 4286 wd->verbs_qp = cpu_to_be32(qp->remote_qpn); 4287 rcu_read_unlock(); 4288 4289 *bth1 = flow->tid_qpn; 4290 *bth2 = mask_psn(((flow->flow_state.spsn + flow->pkt++) & 4291 HFI1_KDETH_BTH_SEQ_MASK) | 4292 (flow->flow_state.generation << 4293 HFI1_KDETH_BTH_SEQ_SHIFT)); 4294 if (last_pkt) { 4295 /* PSNs are zero-based, so +1 to count number of packets */ 4296 if (flow->flow_state.lpsn + 1 + 4297 rvt_div_round_up_mtu(qp, req->seg_len) > 4298 MAX_TID_FLOW_PSN) 4299 req->state = TID_REQUEST_SYNC; 4300 *bth2 |= IB_BTH_REQ_ACK; 4301 } 4302 4303 if (next_offset >= tidlen) { 4304 flow->tid_offset = 0; 4305 flow->tid_idx++; 4306 } else { 4307 flow->tid_offset = next_offset; 4308 } 4309 return last_pkt; 4310 } 4311 4312 void hfi1_rc_rcv_tid_rdma_write_data(struct hfi1_packet *packet) 4313 { 4314 struct rvt_qp *qp = packet->qp; 4315 struct hfi1_qp_priv *priv = qp->priv; 4316 struct hfi1_ctxtdata *rcd = priv->rcd; 4317 struct ib_other_headers *ohdr = packet->ohdr; 4318 struct rvt_ack_entry *e; 4319 struct tid_rdma_request *req; 4320 struct tid_rdma_flow *flow; 4321 struct hfi1_ibdev *dev = to_idev(qp->ibqp.device); 4322 unsigned long flags; 4323 u32 psn, next; 4324 u8 opcode; 4325 bool 
fecn; 4326 4327 fecn = process_ecn(qp, packet); 4328 psn = mask_psn(be32_to_cpu(ohdr->bth[2])); 4329 opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff; 4330 4331 /* 4332 * All error handling should be done by now. If we are here, the packet 4333 * is either good or been accepted by the error handler. 4334 */ 4335 spin_lock_irqsave(&qp->s_lock, flags); 4336 e = &qp->s_ack_queue[priv->r_tid_tail]; 4337 req = ack_to_tid_req(e); 4338 flow = &req->flows[req->clear_tail]; 4339 if (cmp_psn(psn, full_flow_psn(flow, flow->flow_state.lpsn))) { 4340 update_r_next_psn_fecn(packet, priv, rcd, flow, fecn); 4341 4342 if (cmp_psn(psn, flow->flow_state.r_next_psn)) 4343 goto send_nak; 4344 4345 flow->flow_state.r_next_psn = mask_psn(psn + 1); 4346 /* 4347 * Copy the payload to destination buffer if this packet is 4348 * delivered as an eager packet due to RSM rule and FECN. 4349 * The RSM rule selects FECN bit in BTH and SH bit in 4350 * KDETH header and therefore will not match the last 4351 * packet of each segment that has SH bit cleared. 4352 */ 4353 if (fecn && packet->etype == RHF_RCV_TYPE_EAGER) { 4354 struct rvt_sge_state ss; 4355 u32 len; 4356 u32 tlen = packet->tlen; 4357 u16 hdrsize = packet->hlen; 4358 u8 pad = packet->pad; 4359 u8 extra_bytes = pad + packet->extra_byte + 4360 (SIZE_OF_CRC << 2); 4361 u32 pmtu = qp->pmtu; 4362 4363 if (unlikely(tlen != (hdrsize + pmtu + extra_bytes))) 4364 goto send_nak; 4365 len = req->comp_seg * req->seg_len; 4366 len += delta_psn(psn, 4367 full_flow_psn(flow, flow->flow_state.spsn)) * 4368 pmtu; 4369 if (unlikely(req->total_len - len < pmtu)) 4370 goto send_nak; 4371 4372 /* 4373 * The e->rdma_sge field is set when TID RDMA WRITE REQ 4374 * is first received and is never modified thereafter. 4375 */ 4376 ss.sge = e->rdma_sge; 4377 ss.sg_list = NULL; 4378 ss.num_sge = 1; 4379 ss.total_len = req->total_len; 4380 rvt_skip_sge(&ss, len, false); 4381 rvt_copy_sge(qp, &ss, packet->payload, pmtu, false, 4382 false); 4383 /* Raise the sw sequence check flag for next packet */ 4384 priv->r_next_psn_kdeth = mask_psn(psn + 1); 4385 priv->s_flags |= HFI1_R_TID_SW_PSN; 4386 } 4387 goto exit; 4388 } 4389 flow->flow_state.r_next_psn = mask_psn(psn + 1); 4390 hfi1_kern_exp_rcv_clear(req); 4391 priv->alloc_w_segs--; 4392 rcd->flows[flow->idx].psn = psn & HFI1_KDETH_BTH_SEQ_MASK; 4393 req->comp_seg++; 4394 priv->s_nak_state = 0; 4395 4396 /* 4397 * Release the flow if one of the following conditions has been met: 4398 * - The request has reached a sync point AND all outstanding 4399 * segments have been completed, or 4400 * - The entire request is complete and there are no more requests 4401 * (of any kind) in the queue. 
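 *
 * The release itself is not performed here;
 * hfi1_tid_write_alloc_resources() is called further down and
 * re-evaluates these conditions (sync point and "no requests left"
 * handling) before clearing the hardware flow.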
4402 */ 4403 trace_hfi1_rsp_rcv_tid_write_data(qp, psn); 4404 trace_hfi1_tid_req_rcv_write_data(qp, 0, e->opcode, e->psn, e->lpsn, 4405 req); 4406 trace_hfi1_tid_write_rsp_rcv_data(qp); 4407 if (priv->r_tid_ack == HFI1_QP_WQE_INVALID) 4408 priv->r_tid_ack = priv->r_tid_tail; 4409 4410 if (opcode == TID_OP(WRITE_DATA_LAST)) { 4411 release_rdma_sge_mr(e); 4412 for (next = priv->r_tid_tail + 1; ; next++) { 4413 if (next > rvt_size_atomic(&dev->rdi)) 4414 next = 0; 4415 if (next == priv->r_tid_head) 4416 break; 4417 e = &qp->s_ack_queue[next]; 4418 if (e->opcode == TID_OP(WRITE_REQ)) 4419 break; 4420 } 4421 priv->r_tid_tail = next; 4422 if (++qp->s_acked_ack_queue > rvt_size_atomic(&dev->rdi)) 4423 qp->s_acked_ack_queue = 0; 4424 } 4425 4426 hfi1_tid_write_alloc_resources(qp, true); 4427 4428 /* 4429 * If we need to generate more responses, schedule the 4430 * send engine. 4431 */ 4432 if (req->cur_seg < req->total_segs || 4433 qp->s_tail_ack_queue != qp->r_head_ack_queue) { 4434 qp->s_flags |= RVT_S_RESP_PENDING; 4435 hfi1_schedule_send(qp); 4436 } 4437 4438 priv->pending_tid_w_segs--; 4439 if (priv->s_flags & HFI1_R_TID_RSC_TIMER) { 4440 if (priv->pending_tid_w_segs) 4441 hfi1_mod_tid_reap_timer(req->qp); 4442 else 4443 hfi1_stop_tid_reap_timer(req->qp); 4444 } 4445 4446 done: 4447 priv->s_flags |= RVT_S_ACK_PENDING; 4448 hfi1_schedule_tid_send(qp); 4449 exit: 4450 priv->r_next_psn_kdeth = flow->flow_state.r_next_psn; 4451 if (fecn) 4452 qp->s_flags |= RVT_S_ECN; 4453 spin_unlock_irqrestore(&qp->s_lock, flags); 4454 return; 4455 4456 send_nak: 4457 if (!priv->s_nak_state) { 4458 priv->s_nak_state = IB_NAK_PSN_ERROR; 4459 priv->s_nak_psn = flow->flow_state.r_next_psn; 4460 priv->s_flags |= RVT_S_ACK_PENDING; 4461 if (priv->r_tid_ack == HFI1_QP_WQE_INVALID) 4462 priv->r_tid_ack = priv->r_tid_tail; 4463 hfi1_schedule_tid_send(qp); 4464 } 4465 goto done; 4466 } 4467 4468 static bool hfi1_tid_rdma_is_resync_psn(u32 psn) 4469 { 4470 return (bool)((psn & HFI1_KDETH_BTH_SEQ_MASK) == 4471 HFI1_KDETH_BTH_SEQ_MASK); 4472 } 4473 4474 u32 hfi1_build_tid_rdma_write_ack(struct rvt_qp *qp, struct rvt_ack_entry *e, 4475 struct ib_other_headers *ohdr, u16 iflow, 4476 u32 *bth1, u32 *bth2) 4477 { 4478 struct hfi1_qp_priv *qpriv = qp->priv; 4479 struct tid_flow_state *fs = &qpriv->flow_state; 4480 struct tid_rdma_request *req = ack_to_tid_req(e); 4481 struct tid_rdma_flow *flow = &req->flows[iflow]; 4482 struct tid_rdma_params *remote; 4483 4484 rcu_read_lock(); 4485 remote = rcu_dereference(qpriv->tid_rdma.remote); 4486 KDETH_RESET(ohdr->u.tid_rdma.ack.kdeth1, JKEY, remote->jkey); 4487 ohdr->u.tid_rdma.ack.verbs_qp = cpu_to_be32(qp->remote_qpn); 4488 *bth1 = remote->qp; 4489 rcu_read_unlock(); 4490 4491 if (qpriv->resync) { 4492 *bth2 = mask_psn((fs->generation << 4493 HFI1_KDETH_BTH_SEQ_SHIFT) - 1); 4494 ohdr->u.tid_rdma.ack.aeth = rvt_compute_aeth(qp); 4495 } else if (qpriv->s_nak_state) { 4496 *bth2 = mask_psn(qpriv->s_nak_psn); 4497 ohdr->u.tid_rdma.ack.aeth = 4498 cpu_to_be32((qp->r_msn & IB_MSN_MASK) | 4499 (qpriv->s_nak_state << 4500 IB_AETH_CREDIT_SHIFT)); 4501 } else { 4502 *bth2 = full_flow_psn(flow, flow->flow_state.lpsn); 4503 ohdr->u.tid_rdma.ack.aeth = rvt_compute_aeth(qp); 4504 } 4505 KDETH_RESET(ohdr->u.tid_rdma.ack.kdeth0, KVER, 0x1); 4506 ohdr->u.tid_rdma.ack.tid_flow_qp = 4507 cpu_to_be32(qpriv->tid_rdma.local.qp | 4508 ((flow->idx & TID_RDMA_DESTQP_FLOW_MASK) << 4509 TID_RDMA_DESTQP_FLOW_SHIFT) | 4510 qpriv->rcd->ctxt); 4511 4512 ohdr->u.tid_rdma.ack.tid_flow_psn = 0; 4513 
ohdr->u.tid_rdma.ack.verbs_psn = 4514 cpu_to_be32(flow->flow_state.resp_ib_psn); 4515 4516 if (qpriv->resync) { 4517 /* 4518 * If the PSN before the current expect KDETH PSN is the 4519 * RESYNC PSN, then we never received a good TID RDMA WRITE 4520 * DATA packet after a previous RESYNC. 4521 * In this case, the next expected KDETH PSN stays the same. 4522 */ 4523 if (hfi1_tid_rdma_is_resync_psn(qpriv->r_next_psn_kdeth - 1)) { 4524 ohdr->u.tid_rdma.ack.tid_flow_psn = 4525 cpu_to_be32(qpriv->r_next_psn_kdeth_save); 4526 } else { 4527 /* 4528 * Because the KDETH PSNs jump during a RESYNC, it's 4529 * not possible to infer (or compute) the previous value 4530 * of r_next_psn_kdeth in the case of back-to-back 4531 * RESYNC packets. Therefore, we save it. 4532 */ 4533 qpriv->r_next_psn_kdeth_save = 4534 qpriv->r_next_psn_kdeth - 1; 4535 ohdr->u.tid_rdma.ack.tid_flow_psn = 4536 cpu_to_be32(qpriv->r_next_psn_kdeth_save); 4537 qpriv->r_next_psn_kdeth = mask_psn(*bth2 + 1); 4538 } 4539 qpriv->resync = false; 4540 } 4541 4542 return sizeof(ohdr->u.tid_rdma.ack) / sizeof(u32); 4543 } 4544 4545 void hfi1_rc_rcv_tid_rdma_ack(struct hfi1_packet *packet) 4546 { 4547 struct ib_other_headers *ohdr = packet->ohdr; 4548 struct rvt_qp *qp = packet->qp; 4549 struct hfi1_qp_priv *qpriv = qp->priv; 4550 struct rvt_swqe *wqe; 4551 struct tid_rdma_request *req; 4552 struct tid_rdma_flow *flow; 4553 u32 aeth, psn, req_psn, ack_psn, fspsn, resync_psn, ack_kpsn; 4554 unsigned long flags; 4555 u16 fidx; 4556 4557 trace_hfi1_tid_write_sender_rcv_tid_ack(qp, 0); 4558 process_ecn(qp, packet); 4559 psn = mask_psn(be32_to_cpu(ohdr->bth[2])); 4560 aeth = be32_to_cpu(ohdr->u.tid_rdma.ack.aeth); 4561 req_psn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.ack.verbs_psn)); 4562 resync_psn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.ack.tid_flow_psn)); 4563 4564 spin_lock_irqsave(&qp->s_lock, flags); 4565 trace_hfi1_rcv_tid_ack(qp, aeth, psn, req_psn, resync_psn); 4566 4567 /* If we are waiting for an ACK to RESYNC, drop any other packets */ 4568 if ((qp->s_flags & HFI1_S_WAIT_HALT) && 4569 cmp_psn(psn, qpriv->s_resync_psn)) 4570 goto ack_op_err; 4571 4572 ack_psn = req_psn; 4573 if (hfi1_tid_rdma_is_resync_psn(psn)) 4574 ack_kpsn = resync_psn; 4575 else 4576 ack_kpsn = psn; 4577 if (aeth >> 29) { 4578 ack_psn--; 4579 ack_kpsn--; 4580 } 4581 4582 wqe = rvt_get_swqe_ptr(qp, qp->s_acked); 4583 4584 if (wqe->wr.opcode != IB_WR_TID_RDMA_WRITE) 4585 goto ack_op_err; 4586 4587 req = wqe_to_tid_req(wqe); 4588 trace_hfi1_tid_req_rcv_tid_ack(qp, 0, wqe->wr.opcode, wqe->psn, 4589 wqe->lpsn, req); 4590 flow = &req->flows[req->acked_tail]; 4591 trace_hfi1_tid_flow_rcv_tid_ack(qp, req->acked_tail, flow); 4592 4593 /* Drop stale ACK/NAK */ 4594 if (cmp_psn(psn, full_flow_psn(flow, flow->flow_state.spsn)) < 0) 4595 goto ack_op_err; 4596 4597 while (cmp_psn(ack_kpsn, 4598 full_flow_psn(flow, flow->flow_state.lpsn)) >= 0 && 4599 req->ack_seg < req->cur_seg) { 4600 req->ack_seg++; 4601 /* advance acked segment pointer */ 4602 req->acked_tail = CIRC_NEXT(req->acked_tail, MAX_FLOWS); 4603 req->r_last_acked = flow->flow_state.resp_ib_psn; 4604 trace_hfi1_tid_req_rcv_tid_ack(qp, 0, wqe->wr.opcode, wqe->psn, 4605 wqe->lpsn, req); 4606 if (req->ack_seg == req->total_segs) { 4607 req->state = TID_REQUEST_COMPLETE; 4608 wqe = do_rc_completion(qp, wqe, 4609 to_iport(qp->ibqp.device, 4610 qp->port_num)); 4611 trace_hfi1_sender_rcv_tid_ack(qp); 4612 atomic_dec(&qpriv->n_tid_requests); 4613 if (qp->s_acked == qp->s_tail) 4614 break; 4615 if (wqe->wr.opcode != 
IB_WR_TID_RDMA_WRITE) 4616 break; 4617 req = wqe_to_tid_req(wqe); 4618 } 4619 flow = &req->flows[req->acked_tail]; 4620 trace_hfi1_tid_flow_rcv_tid_ack(qp, req->acked_tail, flow); 4621 } 4622 4623 trace_hfi1_tid_req_rcv_tid_ack(qp, 0, wqe->wr.opcode, wqe->psn, 4624 wqe->lpsn, req); 4625 switch (aeth >> 29) { 4626 case 0: /* ACK */ 4627 if (qpriv->s_flags & RVT_S_WAIT_ACK) 4628 qpriv->s_flags &= ~RVT_S_WAIT_ACK; 4629 if (!hfi1_tid_rdma_is_resync_psn(psn)) { 4630 /* Check if there is any pending TID ACK */ 4631 if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE && 4632 req->ack_seg < req->cur_seg) 4633 hfi1_mod_tid_retry_timer(qp); 4634 else 4635 hfi1_stop_tid_retry_timer(qp); 4636 hfi1_schedule_send(qp); 4637 } else { 4638 u32 spsn, fpsn, last_acked, generation; 4639 struct tid_rdma_request *rptr; 4640 4641 /* ACK(RESYNC) */ 4642 hfi1_stop_tid_retry_timer(qp); 4643 /* Allow new requests (see hfi1_make_tid_rdma_pkt) */ 4644 qp->s_flags &= ~HFI1_S_WAIT_HALT; 4645 /* 4646 * Clear RVT_S_SEND_ONE flag in case that the TID RDMA 4647 * ACK is received after the TID retry timer is fired 4648 * again. In this case, do not send any more TID 4649 * RESYNC request or wait for any more TID ACK packet. 4650 */ 4651 qpriv->s_flags &= ~RVT_S_SEND_ONE; 4652 hfi1_schedule_send(qp); 4653 4654 if ((qp->s_acked == qpriv->s_tid_tail && 4655 req->ack_seg == req->total_segs) || 4656 qp->s_acked == qp->s_tail) { 4657 qpriv->s_state = TID_OP(WRITE_DATA_LAST); 4658 goto done; 4659 } 4660 4661 if (req->ack_seg == req->comp_seg) { 4662 qpriv->s_state = TID_OP(WRITE_DATA); 4663 goto done; 4664 } 4665 4666 /* 4667 * The PSN to start with is the next PSN after the 4668 * RESYNC PSN. 4669 */ 4670 psn = mask_psn(psn + 1); 4671 generation = psn >> HFI1_KDETH_BTH_SEQ_SHIFT; 4672 spsn = 0; 4673 4674 /* 4675 * Update to the correct WQE when we get an ACK(RESYNC) 4676 * in the middle of a request. 4677 */ 4678 if (delta_psn(ack_psn, wqe->lpsn)) 4679 wqe = rvt_get_swqe_ptr(qp, qp->s_acked); 4680 req = wqe_to_tid_req(wqe); 4681 flow = &req->flows[req->acked_tail]; 4682 /* 4683 * RESYNC re-numbers the PSN ranges of all remaining 4684 * segments. Also, PSN's start from 0 in the middle of a 4685 * segment and the first segment size is less than the 4686 * default number of packets. flow->resync_npkts is used 4687 * to track the number of packets from the start of the 4688 * real segment to the point of 0 PSN after the RESYNC 4689 * in order to later correctly rewind the SGE. 4690 */ 4691 fpsn = full_flow_psn(flow, flow->flow_state.spsn); 4692 req->r_ack_psn = psn; 4693 flow->resync_npkts += 4694 delta_psn(mask_psn(resync_psn + 1), fpsn); 4695 /* 4696 * Renumber all packet sequence number ranges 4697 * based on the new generation. 
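 *
 * As a reminder of the layout (a sketch built from the macros used in
 * this file), a KDETH flow PSN is composed as
 *
 *	(generation << HFI1_KDETH_BTH_SEQ_SHIFT) |
 *	(sequence & HFI1_KDETH_BTH_SEQ_MASK)
 *
 * so the loop below only installs the new generation in each remaining
 * flow and recomputes spsn/lpsn/npkts relative to resync_psn; the
 * IB-level PSNs of the request are left untouched.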
4698 */ 4699 last_acked = qp->s_acked; 4700 rptr = req; 4701 while (1) { 4702 /* start from last acked segment */ 4703 for (fidx = rptr->acked_tail; 4704 CIRC_CNT(rptr->setup_head, fidx, 4705 MAX_FLOWS); 4706 fidx = CIRC_NEXT(fidx, MAX_FLOWS)) { 4707 u32 lpsn; 4708 u32 gen; 4709 4710 flow = &rptr->flows[fidx]; 4711 gen = flow->flow_state.generation; 4712 if (WARN_ON(gen == generation && 4713 flow->flow_state.spsn != 4714 spsn)) 4715 continue; 4716 lpsn = flow->flow_state.lpsn; 4717 lpsn = full_flow_psn(flow, lpsn); 4718 flow->npkts = 4719 delta_psn(lpsn, 4720 mask_psn(resync_psn) 4721 ); 4722 flow->flow_state.generation = 4723 generation; 4724 flow->flow_state.spsn = spsn; 4725 flow->flow_state.lpsn = 4726 flow->flow_state.spsn + 4727 flow->npkts - 1; 4728 flow->pkt = 0; 4729 spsn += flow->npkts; 4730 resync_psn += flow->npkts; 4731 trace_hfi1_tid_flow_rcv_tid_ack(qp, 4732 fidx, 4733 flow); 4734 } 4735 if (++last_acked == qpriv->s_tid_cur + 1) 4736 break; 4737 if (last_acked == qp->s_size) 4738 last_acked = 0; 4739 wqe = rvt_get_swqe_ptr(qp, last_acked); 4740 rptr = wqe_to_tid_req(wqe); 4741 } 4742 req->cur_seg = req->ack_seg; 4743 qpriv->s_tid_tail = qp->s_acked; 4744 qpriv->s_state = TID_OP(WRITE_REQ); 4745 hfi1_schedule_tid_send(qp); 4746 } 4747 done: 4748 qpriv->s_retry = qp->s_retry_cnt; 4749 break; 4750 4751 case 3: /* NAK */ 4752 hfi1_stop_tid_retry_timer(qp); 4753 switch ((aeth >> IB_AETH_CREDIT_SHIFT) & 4754 IB_AETH_CREDIT_MASK) { 4755 case 0: /* PSN sequence error */ 4756 flow = &req->flows[req->acked_tail]; 4757 fspsn = full_flow_psn(flow, flow->flow_state.spsn); 4758 trace_hfi1_tid_flow_rcv_tid_ack(qp, req->acked_tail, 4759 flow); 4760 req->r_ack_psn = mask_psn(be32_to_cpu(ohdr->bth[2])); 4761 req->cur_seg = req->ack_seg; 4762 qpriv->s_tid_tail = qp->s_acked; 4763 qpriv->s_state = TID_OP(WRITE_REQ); 4764 qpriv->s_retry = qp->s_retry_cnt; 4765 hfi1_schedule_tid_send(qp); 4766 break; 4767 4768 default: 4769 break; 4770 } 4771 break; 4772 4773 default: 4774 break; 4775 } 4776 4777 ack_op_err: 4778 spin_unlock_irqrestore(&qp->s_lock, flags); 4779 } 4780 4781 void hfi1_add_tid_retry_timer(struct rvt_qp *qp) 4782 { 4783 struct hfi1_qp_priv *priv = qp->priv; 4784 struct ib_qp *ibqp = &qp->ibqp; 4785 struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device); 4786 4787 lockdep_assert_held(&qp->s_lock); 4788 if (!(priv->s_flags & HFI1_S_TID_RETRY_TIMER)) { 4789 priv->s_flags |= HFI1_S_TID_RETRY_TIMER; 4790 priv->s_tid_retry_timer.expires = jiffies + 4791 priv->tid_retry_timeout_jiffies + rdi->busy_jiffies; 4792 add_timer(&priv->s_tid_retry_timer); 4793 } 4794 } 4795 4796 static void hfi1_mod_tid_retry_timer(struct rvt_qp *qp) 4797 { 4798 struct hfi1_qp_priv *priv = qp->priv; 4799 struct ib_qp *ibqp = &qp->ibqp; 4800 struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device); 4801 4802 lockdep_assert_held(&qp->s_lock); 4803 priv->s_flags |= HFI1_S_TID_RETRY_TIMER; 4804 mod_timer(&priv->s_tid_retry_timer, jiffies + 4805 priv->tid_retry_timeout_jiffies + rdi->busy_jiffies); 4806 } 4807 4808 static int hfi1_stop_tid_retry_timer(struct rvt_qp *qp) 4809 { 4810 struct hfi1_qp_priv *priv = qp->priv; 4811 int rval = 0; 4812 4813 lockdep_assert_held(&qp->s_lock); 4814 if (priv->s_flags & HFI1_S_TID_RETRY_TIMER) { 4815 rval = del_timer(&priv->s_tid_retry_timer); 4816 priv->s_flags &= ~HFI1_S_TID_RETRY_TIMER; 4817 } 4818 return rval; 4819 } 4820 4821 void hfi1_del_tid_retry_timer(struct rvt_qp *qp) 4822 { 4823 struct hfi1_qp_priv *priv = qp->priv; 4824 4825 del_timer_sync(&priv->s_tid_retry_timer); 4826 
priv->s_flags &= ~HFI1_S_TID_RETRY_TIMER; 4827 } 4828 4829 static void hfi1_tid_retry_timeout(struct timer_list *t) 4830 { 4831 struct hfi1_qp_priv *priv = from_timer(priv, t, s_tid_retry_timer); 4832 struct rvt_qp *qp = priv->owner; 4833 struct rvt_swqe *wqe; 4834 unsigned long flags; 4835 struct tid_rdma_request *req; 4836 4837 spin_lock_irqsave(&qp->r_lock, flags); 4838 spin_lock(&qp->s_lock); 4839 trace_hfi1_tid_write_sender_retry_timeout(qp, 0); 4840 if (priv->s_flags & HFI1_S_TID_RETRY_TIMER) { 4841 hfi1_stop_tid_retry_timer(qp); 4842 if (!priv->s_retry) { 4843 trace_hfi1_msg_tid_retry_timeout(/* msg */ 4844 qp, 4845 "Exhausted retries. Tid retry timeout = ", 4846 (u64)priv->tid_retry_timeout_jiffies); 4847 4848 wqe = rvt_get_swqe_ptr(qp, qp->s_acked); 4849 hfi1_trdma_send_complete(qp, wqe, IB_WC_RETRY_EXC_ERR); 4850 rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR); 4851 } else { 4852 wqe = rvt_get_swqe_ptr(qp, qp->s_acked); 4853 req = wqe_to_tid_req(wqe); 4854 trace_hfi1_tid_req_tid_retry_timeout(/* req */ 4855 qp, 0, wqe->wr.opcode, wqe->psn, wqe->lpsn, req); 4856 4857 priv->s_flags &= ~RVT_S_WAIT_ACK; 4858 /* Only send one packet (the RESYNC) */ 4859 priv->s_flags |= RVT_S_SEND_ONE; 4860 /* 4861 * No additional request shall be made by this QP until 4862 * the RESYNC has been complete. 4863 */ 4864 qp->s_flags |= HFI1_S_WAIT_HALT; 4865 priv->s_state = TID_OP(RESYNC); 4866 priv->s_retry--; 4867 hfi1_schedule_tid_send(qp); 4868 } 4869 } 4870 spin_unlock(&qp->s_lock); 4871 spin_unlock_irqrestore(&qp->r_lock, flags); 4872 } 4873 4874 u32 hfi1_build_tid_rdma_resync(struct rvt_qp *qp, struct rvt_swqe *wqe, 4875 struct ib_other_headers *ohdr, u32 *bth1, 4876 u32 *bth2, u16 fidx) 4877 { 4878 struct hfi1_qp_priv *qpriv = qp->priv; 4879 struct tid_rdma_params *remote; 4880 struct tid_rdma_request *req = wqe_to_tid_req(wqe); 4881 struct tid_rdma_flow *flow = &req->flows[fidx]; 4882 u32 generation; 4883 4884 rcu_read_lock(); 4885 remote = rcu_dereference(qpriv->tid_rdma.remote); 4886 KDETH_RESET(ohdr->u.tid_rdma.ack.kdeth1, JKEY, remote->jkey); 4887 ohdr->u.tid_rdma.ack.verbs_qp = cpu_to_be32(qp->remote_qpn); 4888 *bth1 = remote->qp; 4889 rcu_read_unlock(); 4890 4891 generation = kern_flow_generation_next(flow->flow_state.generation); 4892 *bth2 = mask_psn((generation << HFI1_KDETH_BTH_SEQ_SHIFT) - 1); 4893 qpriv->s_resync_psn = *bth2; 4894 *bth2 |= IB_BTH_REQ_ACK; 4895 KDETH_RESET(ohdr->u.tid_rdma.ack.kdeth0, KVER, 0x1); 4896 4897 return sizeof(ohdr->u.tid_rdma.resync) / sizeof(u32); 4898 } 4899 4900 void hfi1_rc_rcv_tid_rdma_resync(struct hfi1_packet *packet) 4901 { 4902 struct ib_other_headers *ohdr = packet->ohdr; 4903 struct rvt_qp *qp = packet->qp; 4904 struct hfi1_qp_priv *qpriv = qp->priv; 4905 struct hfi1_ctxtdata *rcd = qpriv->rcd; 4906 struct hfi1_ibdev *dev = to_idev(qp->ibqp.device); 4907 struct rvt_ack_entry *e; 4908 struct tid_rdma_request *req; 4909 struct tid_rdma_flow *flow; 4910 struct tid_flow_state *fs = &qpriv->flow_state; 4911 u32 psn, generation, idx, gen_next; 4912 bool fecn; 4913 unsigned long flags; 4914 4915 fecn = process_ecn(qp, packet); 4916 psn = mask_psn(be32_to_cpu(ohdr->bth[2])); 4917 4918 generation = mask_psn(psn + 1) >> HFI1_KDETH_BTH_SEQ_SHIFT; 4919 spin_lock_irqsave(&qp->s_lock, flags); 4920 4921 gen_next = (fs->generation == KERN_GENERATION_RESERVED) ? 
4922 generation : kern_flow_generation_next(fs->generation); 4923 /* 4924 * RESYNC packet contains the "next" generation and can only be 4925 * from the current or previous generations 4926 */ 4927 if (generation != mask_generation(gen_next - 1) && 4928 generation != gen_next) 4929 goto bail; 4930 /* Already processing a resync */ 4931 if (qpriv->resync) 4932 goto bail; 4933 4934 spin_lock(&rcd->exp_lock); 4935 if (fs->index >= RXE_NUM_TID_FLOWS) { 4936 /* 4937 * If we don't have a flow, save the generation so it can be 4938 * applied when a new flow is allocated 4939 */ 4940 fs->generation = generation; 4941 } else { 4942 /* Reprogram the QP flow with new generation */ 4943 rcd->flows[fs->index].generation = generation; 4944 fs->generation = kern_setup_hw_flow(rcd, fs->index); 4945 } 4946 fs->psn = 0; 4947 /* 4948 * Disable SW PSN checking since a RESYNC is equivalent to a 4949 * sync point and the flow has/will be reprogrammed 4950 */ 4951 qpriv->s_flags &= ~HFI1_R_TID_SW_PSN; 4952 trace_hfi1_tid_write_rsp_rcv_resync(qp); 4953 4954 /* 4955 * Reset all TID flow information with the new generation. 4956 * This is done for all requests and segments after the 4957 * last received segment 4958 */ 4959 for (idx = qpriv->r_tid_tail; ; idx++) { 4960 u16 flow_idx; 4961 4962 if (idx > rvt_size_atomic(&dev->rdi)) 4963 idx = 0; 4964 e = &qp->s_ack_queue[idx]; 4965 if (e->opcode == TID_OP(WRITE_REQ)) { 4966 req = ack_to_tid_req(e); 4967 trace_hfi1_tid_req_rcv_resync(qp, 0, e->opcode, e->psn, 4968 e->lpsn, req); 4969 4970 /* start from last unacked segment */ 4971 for (flow_idx = req->clear_tail; 4972 CIRC_CNT(req->setup_head, flow_idx, 4973 MAX_FLOWS); 4974 flow_idx = CIRC_NEXT(flow_idx, MAX_FLOWS)) { 4975 u32 lpsn; 4976 u32 next; 4977 4978 flow = &req->flows[flow_idx]; 4979 lpsn = full_flow_psn(flow, 4980 flow->flow_state.lpsn); 4981 next = flow->flow_state.r_next_psn; 4982 flow->npkts = delta_psn(lpsn, next - 1); 4983 flow->flow_state.generation = fs->generation; 4984 flow->flow_state.spsn = fs->psn; 4985 flow->flow_state.lpsn = 4986 flow->flow_state.spsn + flow->npkts - 1; 4987 flow->flow_state.r_next_psn = 4988 full_flow_psn(flow, 4989 flow->flow_state.spsn); 4990 fs->psn += flow->npkts; 4991 trace_hfi1_tid_flow_rcv_resync(qp, flow_idx, 4992 flow); 4993 } 4994 } 4995 if (idx == qp->s_tail_ack_queue) 4996 break; 4997 } 4998 4999 spin_unlock(&rcd->exp_lock); 5000 qpriv->resync = true; 5001 /* RESYNC request always gets a TID RDMA ACK. */ 5002 qpriv->s_nak_state = 0; 5003 qpriv->s_flags |= RVT_S_ACK_PENDING; 5004 hfi1_schedule_tid_send(qp); 5005 bail: 5006 if (fecn) 5007 qp->s_flags |= RVT_S_ECN; 5008 spin_unlock_irqrestore(&qp->s_lock, flags); 5009 } 5010 5011 /* 5012 * Call this function when the last TID RDMA WRITE DATA packet for a request 5013 * is built. 
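 *
 * s_tid_tail is advanced no further than the next TID RDMA WRITE WQE
 * (or s_tid_cur); intervening non-TID WQEs are skipped and s_state is
 * reset to TID_OP(WRITE_RESP), so the next request first waits for a
 * response before sending data.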
5014 */ 5015 static void update_tid_tail(struct rvt_qp *qp) 5016 __must_hold(&qp->s_lock) 5017 { 5018 struct hfi1_qp_priv *priv = qp->priv; 5019 u32 i; 5020 struct rvt_swqe *wqe; 5021 5022 lockdep_assert_held(&qp->s_lock); 5023 /* Can't move beyond s_tid_cur */ 5024 if (priv->s_tid_tail == priv->s_tid_cur) 5025 return; 5026 for (i = priv->s_tid_tail + 1; ; i++) { 5027 if (i == qp->s_size) 5028 i = 0; 5029 5030 if (i == priv->s_tid_cur) 5031 break; 5032 wqe = rvt_get_swqe_ptr(qp, i); 5033 if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE) 5034 break; 5035 } 5036 priv->s_tid_tail = i; 5037 priv->s_state = TID_OP(WRITE_RESP); 5038 } 5039 5040 int hfi1_make_tid_rdma_pkt(struct rvt_qp *qp, struct hfi1_pkt_state *ps) 5041 __must_hold(&qp->s_lock) 5042 { 5043 struct hfi1_qp_priv *priv = qp->priv; 5044 struct rvt_swqe *wqe; 5045 u32 bth1 = 0, bth2 = 0, hwords = 5, len, middle = 0; 5046 struct ib_other_headers *ohdr; 5047 struct rvt_sge_state *ss = &qp->s_sge; 5048 struct rvt_ack_entry *e = &qp->s_ack_queue[qp->s_tail_ack_queue]; 5049 struct tid_rdma_request *req = ack_to_tid_req(e); 5050 bool last = false; 5051 u8 opcode = TID_OP(WRITE_DATA); 5052 5053 lockdep_assert_held(&qp->s_lock); 5054 trace_hfi1_tid_write_sender_make_tid_pkt(qp, 0); 5055 /* 5056 * Prioritize the sending of the requests and responses over the 5057 * sending of the TID RDMA data packets. 5058 */ 5059 if (((atomic_read(&priv->n_tid_requests) < HFI1_TID_RDMA_WRITE_CNT) && 5060 atomic_read(&priv->n_requests) && 5061 !(qp->s_flags & (RVT_S_BUSY | RVT_S_WAIT_ACK | 5062 HFI1_S_ANY_WAIT_IO))) || 5063 (e->opcode == TID_OP(WRITE_REQ) && req->cur_seg < req->alloc_seg && 5064 !(qp->s_flags & (RVT_S_BUSY | HFI1_S_ANY_WAIT_IO)))) { 5065 struct iowait_work *iowork; 5066 5067 iowork = iowait_get_ib_work(&priv->s_iowait); 5068 ps->s_txreq = get_waiting_verbs_txreq(iowork); 5069 if (ps->s_txreq || hfi1_make_rc_req(qp, ps)) { 5070 priv->s_flags |= HFI1_S_TID_BUSY_SET; 5071 return 1; 5072 } 5073 } 5074 5075 ps->s_txreq = get_txreq(ps->dev, qp); 5076 if (!ps->s_txreq) 5077 goto bail_no_tx; 5078 5079 ohdr = &ps->s_txreq->phdr.hdr.ibh.u.oth; 5080 5081 if ((priv->s_flags & RVT_S_ACK_PENDING) && 5082 make_tid_rdma_ack(qp, ohdr, ps)) 5083 return 1; 5084 5085 /* 5086 * Bail out if we can't send data. 5087 * Be reminded that this check must been done after the call to 5088 * make_tid_rdma_ack() because the responding QP could be in 5089 * RTR state where it can send TID RDMA ACK, not TID RDMA WRITE DATA. 5090 */ 5091 if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_SEND_OK)) 5092 goto bail; 5093 5094 if (priv->s_flags & RVT_S_WAIT_ACK) 5095 goto bail; 5096 5097 /* Check whether there is anything to do. */ 5098 if (priv->s_tid_tail == HFI1_QP_WQE_INVALID) 5099 goto bail; 5100 wqe = rvt_get_swqe_ptr(qp, priv->s_tid_tail); 5101 req = wqe_to_tid_req(wqe); 5102 trace_hfi1_tid_req_make_tid_pkt(qp, 0, wqe->wr.opcode, wqe->psn, 5103 wqe->lpsn, req); 5104 switch (priv->s_state) { 5105 case TID_OP(WRITE_REQ): 5106 case TID_OP(WRITE_RESP): 5107 priv->tid_ss.sge = wqe->sg_list[0]; 5108 priv->tid_ss.sg_list = wqe->sg_list + 1; 5109 priv->tid_ss.num_sge = wqe->wr.num_sge; 5110 priv->tid_ss.total_len = wqe->length; 5111 5112 if (priv->s_state == TID_OP(WRITE_REQ)) 5113 hfi1_tid_rdma_restart_req(qp, wqe, &bth2); 5114 priv->s_state = TID_OP(WRITE_DATA); 5115 /* fall through */ 5116 5117 case TID_OP(WRITE_DATA): 5118 /* 5119 * 1. Check whether TID RDMA WRITE RESP available. 5120 * 2. 
If no: 5121 * 2.1 If have more segments and no TID RDMA WRITE RESP, 5122 * set HFI1_S_WAIT_TID_RESP 5123 * 2.2 Return indicating no progress made. 5124 * 3. If yes: 5125 * 3.1 Build TID RDMA WRITE DATA packet. 5126 * 3.2 If last packet in segment: 5127 * 3.2.1 Change KDETH header bits 5128 * 3.2.2 Advance RESP pointers. 5129 * 3.3 Return indicating progress made. 5130 */ 5131 trace_hfi1_sender_make_tid_pkt(qp); 5132 trace_hfi1_tid_write_sender_make_tid_pkt(qp, 0); 5133 wqe = rvt_get_swqe_ptr(qp, priv->s_tid_tail); 5134 req = wqe_to_tid_req(wqe); 5135 len = wqe->length; 5136 5137 if (!req->comp_seg || req->cur_seg == req->comp_seg) 5138 goto bail; 5139 5140 trace_hfi1_tid_req_make_tid_pkt(qp, 0, wqe->wr.opcode, 5141 wqe->psn, wqe->lpsn, req); 5142 last = hfi1_build_tid_rdma_packet(wqe, ohdr, &bth1, &bth2, 5143 &len); 5144 5145 if (last) { 5146 /* move pointer to next flow */ 5147 req->clear_tail = CIRC_NEXT(req->clear_tail, 5148 MAX_FLOWS); 5149 if (++req->cur_seg < req->total_segs) { 5150 if (!CIRC_CNT(req->setup_head, req->clear_tail, 5151 MAX_FLOWS)) 5152 qp->s_flags |= HFI1_S_WAIT_TID_RESP; 5153 } else { 5154 priv->s_state = TID_OP(WRITE_DATA_LAST); 5155 opcode = TID_OP(WRITE_DATA_LAST); 5156 5157 /* Advance the s_tid_tail now */ 5158 update_tid_tail(qp); 5159 } 5160 } 5161 hwords += sizeof(ohdr->u.tid_rdma.w_data) / sizeof(u32); 5162 ss = &priv->tid_ss; 5163 break; 5164 5165 case TID_OP(RESYNC): 5166 trace_hfi1_sender_make_tid_pkt(qp); 5167 /* Use generation from the most recently received response */ 5168 wqe = rvt_get_swqe_ptr(qp, priv->s_tid_cur); 5169 req = wqe_to_tid_req(wqe); 5170 /* If no responses for this WQE look at the previous one */ 5171 if (!req->comp_seg) { 5172 wqe = rvt_get_swqe_ptr(qp, 5173 (!priv->s_tid_cur ? qp->s_size : 5174 priv->s_tid_cur) - 1); 5175 req = wqe_to_tid_req(wqe); 5176 } 5177 hwords += hfi1_build_tid_rdma_resync(qp, wqe, ohdr, &bth1, 5178 &bth2, 5179 CIRC_PREV(req->setup_head, 5180 MAX_FLOWS)); 5181 ss = NULL; 5182 len = 0; 5183 opcode = TID_OP(RESYNC); 5184 break; 5185 5186 default: 5187 goto bail; 5188 } 5189 if (priv->s_flags & RVT_S_SEND_ONE) { 5190 priv->s_flags &= ~RVT_S_SEND_ONE; 5191 priv->s_flags |= RVT_S_WAIT_ACK; 5192 bth2 |= IB_BTH_REQ_ACK; 5193 } 5194 qp->s_len -= len; 5195 ps->s_txreq->hdr_dwords = hwords; 5196 ps->s_txreq->sde = priv->s_sde; 5197 ps->s_txreq->ss = ss; 5198 ps->s_txreq->s_cur_size = len; 5199 hfi1_make_ruc_header(qp, ohdr, (opcode << 24), bth1, bth2, 5200 middle, ps); 5201 return 1; 5202 bail: 5203 hfi1_put_txreq(ps->s_txreq); 5204 bail_no_tx: 5205 ps->s_txreq = NULL; 5206 priv->s_flags &= ~RVT_S_BUSY; 5207 /* 5208 * If we didn't get a txreq, the QP will be woken up later to try 5209 * again, set the flags to the the wake up which work item to wake 5210 * up. 5211 * (A better algorithm should be found to do this and generalize the 5212 * sleep/wakeup flags.) 5213 */ 5214 iowait_set_flag(&priv->s_iowait, IOWAIT_PENDING_TID); 5215 return 0; 5216 } 5217 5218 static int make_tid_rdma_ack(struct rvt_qp *qp, 5219 struct ib_other_headers *ohdr, 5220 struct hfi1_pkt_state *ps) 5221 { 5222 struct rvt_ack_entry *e; 5223 struct hfi1_qp_priv *qpriv = qp->priv; 5224 struct hfi1_ibdev *dev = to_idev(qp->ibqp.device); 5225 u32 hwords, next; 5226 u32 len = 0; 5227 u32 bth1 = 0, bth2 = 0; 5228 int middle = 0; 5229 u16 flow; 5230 struct tid_rdma_request *req, *nreq; 5231 5232 trace_hfi1_tid_write_rsp_make_tid_ack(qp); 5233 /* Don't send an ACK if we aren't supposed to. 
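 *
 * TID ACKs are responder-side traffic and are therefore gated on
 * RVT_PROCESS_RECV_OK rather than on the send-ok checks used for
 * requester packets; a QP still in RTR can legitimately emit them.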
*/ 5234 if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) 5235 goto bail; 5236 5237 /* header size in 32-bit words LRH+BTH = (8+12)/4. */ 5238 hwords = 5; 5239 5240 e = &qp->s_ack_queue[qpriv->r_tid_ack]; 5241 req = ack_to_tid_req(e); 5242 /* 5243 * In the RESYNC case, we are exactly one segment past the 5244 * previously sent ack or at the previously sent NAK. So to send 5245 * the resync ack, we go back one segment (which might be part of 5246 * the previous request) and let the do-while loop execute again. 5247 * The advantage of executing the do-while loop is that any data 5248 * received after the previous ack is automatically acked in the 5249 * RESYNC ack. It turns out that for the do-while loop we only need 5250 * to pull back qpriv->r_tid_ack, not the segment 5251 * indices/counters. The scheme works even if the previous request 5252 * was not a TID WRITE request. 5253 */ 5254 if (qpriv->resync) { 5255 if (!req->ack_seg || req->ack_seg == req->total_segs) 5256 qpriv->r_tid_ack = !qpriv->r_tid_ack ? 5257 rvt_size_atomic(&dev->rdi) : 5258 qpriv->r_tid_ack - 1; 5259 e = &qp->s_ack_queue[qpriv->r_tid_ack]; 5260 req = ack_to_tid_req(e); 5261 } 5262 5263 trace_hfi1_rsp_make_tid_ack(qp, e->psn); 5264 trace_hfi1_tid_req_make_tid_ack(qp, 0, e->opcode, e->psn, e->lpsn, 5265 req); 5266 /* 5267 * If we've sent all the ACKs that we can, we are done 5268 * until we get more segments... 5269 */ 5270 if (!qpriv->s_nak_state && !qpriv->resync && 5271 req->ack_seg == req->comp_seg) 5272 goto bail; 5273 5274 do { 5275 /* 5276 * To deal with coalesced ACKs, the acked_tail pointer 5277 * into the flow array is used. The distance between it 5278 * and the clear_tail is the number of flows that are 5279 * being ACK'ed. 5280 */ 5281 req->ack_seg += 5282 /* Get up-to-date value */ 5283 CIRC_CNT(req->clear_tail, req->acked_tail, 5284 MAX_FLOWS); 5285 /* Advance acked index */ 5286 req->acked_tail = req->clear_tail; 5287 5288 /* 5289 * req->clear_tail points to the segment currently being 5290 * received. So, when sending an ACK, the previous 5291 * segment is being ACK'ed. 5292 */ 5293 flow = CIRC_PREV(req->acked_tail, MAX_FLOWS); 5294 if (req->ack_seg != req->total_segs) 5295 break; 5296 req->state = TID_REQUEST_COMPLETE; 5297 5298 next = qpriv->r_tid_ack + 1; 5299 if (next > rvt_size_atomic(&dev->rdi)) 5300 next = 0; 5301 qpriv->r_tid_ack = next; 5302 if (qp->s_ack_queue[next].opcode != TID_OP(WRITE_REQ)) 5303 break; 5304 nreq = ack_to_tid_req(&qp->s_ack_queue[next]); 5305 if (!nreq->comp_seg || nreq->ack_seg == nreq->comp_seg) 5306 break; 5307 5308 /* Move to the next ack entry now */ 5309 e = &qp->s_ack_queue[qpriv->r_tid_ack]; 5310 req = ack_to_tid_req(e); 5311 } while (1); 5312 5313 /* 5314 * At this point qpriv->r_tid_ack == qpriv->r_tid_tail but e and 5315 * req could be pointing at the previous ack queue entry 5316 */ 5317 if (qpriv->s_nak_state || 5318 (qpriv->resync && 5319 !hfi1_tid_rdma_is_resync_psn(qpriv->r_next_psn_kdeth - 1) && 5320 (cmp_psn(qpriv->r_next_psn_kdeth - 1, 5321 full_flow_psn(&req->flows[flow], 5322 req->flows[flow].flow_state.lpsn)) > 0))) { 5323 /* 5324 * A NAK will implicitly acknowledge all previous TID RDMA 5325 * requests. 
Therefore, we NAK with the req->acked_tail 5326 * segment for the request at qpriv->r_tid_ack (same at 5327 * this point as the req->clear_tail segment for the 5328 * qpriv->r_tid_tail request) 5329 */ 5330 e = &qp->s_ack_queue[qpriv->r_tid_ack]; 5331 req = ack_to_tid_req(e); 5332 flow = req->acked_tail; 5333 } else if (req->ack_seg == req->total_segs && 5334 qpriv->s_flags & HFI1_R_TID_WAIT_INTERLCK) 5335 qpriv->s_flags &= ~HFI1_R_TID_WAIT_INTERLCK; 5336 5337 trace_hfi1_tid_write_rsp_make_tid_ack(qp); 5338 trace_hfi1_tid_req_make_tid_ack(qp, 0, e->opcode, e->psn, e->lpsn, 5339 req); 5340 hwords += hfi1_build_tid_rdma_write_ack(qp, e, ohdr, flow, &bth1, 5341 &bth2); 5342 len = 0; 5343 qpriv->s_flags &= ~RVT_S_ACK_PENDING; 5344 ps->s_txreq->hdr_dwords = hwords; 5345 ps->s_txreq->sde = qpriv->s_sde; 5346 ps->s_txreq->s_cur_size = len; 5347 ps->s_txreq->ss = NULL; 5348 hfi1_make_ruc_header(qp, ohdr, (TID_OP(ACK) << 24), bth1, bth2, middle, 5349 ps); 5350 ps->s_txreq->txreq.flags |= SDMA_TXREQ_F_VIP; 5351 return 1; 5352 bail: 5353 /* 5354 * Ensure s_rdma_ack_cnt changes are committed prior to resetting 5355 * RVT_S_RESP_PENDING 5356 */ 5357 smp_wmb(); 5358 qpriv->s_flags &= ~RVT_S_ACK_PENDING; 5359 return 0; 5360 } 5361 5362 static int hfi1_send_tid_ok(struct rvt_qp *qp) 5363 { 5364 struct hfi1_qp_priv *priv = qp->priv; 5365 5366 return !(priv->s_flags & RVT_S_BUSY || 5367 qp->s_flags & HFI1_S_ANY_WAIT_IO) && 5368 (verbs_txreq_queued(iowait_get_tid_work(&priv->s_iowait)) || 5369 (priv->s_flags & RVT_S_RESP_PENDING) || 5370 !(qp->s_flags & HFI1_S_ANY_TID_WAIT_SEND)); 5371 } 5372 5373 void _hfi1_do_tid_send(struct work_struct *work) 5374 { 5375 struct iowait_work *w = container_of(work, struct iowait_work, iowork); 5376 struct rvt_qp *qp = iowait_to_qp(w->iow); 5377 5378 hfi1_do_tid_send(qp); 5379 } 5380 5381 static void hfi1_do_tid_send(struct rvt_qp *qp) 5382 { 5383 struct hfi1_pkt_state ps; 5384 struct hfi1_qp_priv *priv = qp->priv; 5385 5386 ps.dev = to_idev(qp->ibqp.device); 5387 ps.ibp = to_iport(qp->ibqp.device, qp->port_num); 5388 ps.ppd = ppd_from_ibp(ps.ibp); 5389 ps.wait = iowait_get_tid_work(&priv->s_iowait); 5390 ps.in_thread = false; 5391 ps.timeout_int = qp->timeout_jiffies / 8; 5392 5393 trace_hfi1_rc_do_tid_send(qp, false); 5394 spin_lock_irqsave(&qp->s_lock, ps.flags); 5395 5396 /* Return if we are already busy processing a work request. */ 5397 if (!hfi1_send_tid_ok(qp)) { 5398 if (qp->s_flags & HFI1_S_ANY_WAIT_IO) 5399 iowait_set_flag(&priv->s_iowait, IOWAIT_PENDING_TID); 5400 spin_unlock_irqrestore(&qp->s_lock, ps.flags); 5401 return; 5402 } 5403 5404 priv->s_flags |= RVT_S_BUSY; 5405 5406 ps.timeout = jiffies + ps.timeout_int; 5407 ps.cpu = priv->s_sde ? priv->s_sde->cpu : 5408 cpumask_first(cpumask_of_node(ps.ppd->dd->node)); 5409 ps.pkts_sent = false; 5410 5411 /* insure a pre-built packet is handled */ 5412 ps.s_txreq = get_waiting_verbs_txreq(ps.wait); 5413 do { 5414 /* Check for a constructed packet to be sent. */ 5415 if (ps.s_txreq) { 5416 if (priv->s_flags & HFI1_S_TID_BUSY_SET) { 5417 qp->s_flags |= RVT_S_BUSY; 5418 ps.wait = iowait_get_ib_work(&priv->s_iowait); 5419 } 5420 spin_unlock_irqrestore(&qp->s_lock, ps.flags); 5421 5422 /* 5423 * If the packet cannot be sent now, return and 5424 * the send tasklet will be woken up later. 
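 *
 * A failure from hfi1_verbs_send() typically means the txreq was
 * parked on the QP's iowait; when this work item runs again, the
 * get_waiting_verbs_txreq() call above picks the pre-built packet up
 * before constructing a new one.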
5425 */ 5426 if (hfi1_verbs_send(qp, &ps)) 5427 return; 5428 5429 /* allow other tasks to run */ 5430 if (hfi1_schedule_send_yield(qp, &ps, true)) 5431 return; 5432 5433 spin_lock_irqsave(&qp->s_lock, ps.flags); 5434 if (priv->s_flags & HFI1_S_TID_BUSY_SET) { 5435 qp->s_flags &= ~RVT_S_BUSY; 5436 priv->s_flags &= ~HFI1_S_TID_BUSY_SET; 5437 ps.wait = iowait_get_tid_work(&priv->s_iowait); 5438 if (iowait_flag_set(&priv->s_iowait, 5439 IOWAIT_PENDING_IB)) 5440 hfi1_schedule_send(qp); 5441 } 5442 } 5443 } while (hfi1_make_tid_rdma_pkt(qp, &ps)); 5444 iowait_starve_clear(ps.pkts_sent, &priv->s_iowait); 5445 spin_unlock_irqrestore(&qp->s_lock, ps.flags); 5446 } 5447 5448 static bool _hfi1_schedule_tid_send(struct rvt_qp *qp) 5449 { 5450 struct hfi1_qp_priv *priv = qp->priv; 5451 struct hfi1_ibport *ibp = 5452 to_iport(qp->ibqp.device, qp->port_num); 5453 struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); 5454 struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device); 5455 5456 return iowait_tid_schedule(&priv->s_iowait, ppd->hfi1_wq, 5457 priv->s_sde ? 5458 priv->s_sde->cpu : 5459 cpumask_first(cpumask_of_node(dd->node))); 5460 } 5461 5462 /** 5463 * hfi1_schedule_tid_send - schedule progress on TID RDMA state machine 5464 * @qp: the QP 5465 * 5466 * This schedules qp progress on the TID RDMA state machine. Caller 5467 * should hold the s_lock. 5468 * Unlike hfi1_schedule_send(), this cannot use hfi1_send_ok() because 5469 * the two state machines can step on each other with respect to the 5470 * RVT_S_BUSY flag. 5471 * Therefore, a modified test is used. 5472 * @return true if the second leg is scheduled; 5473 * false if the second leg is not scheduled. 5474 */ 5475 bool hfi1_schedule_tid_send(struct rvt_qp *qp) 5476 { 5477 lockdep_assert_held(&qp->s_lock); 5478 if (hfi1_send_tid_ok(qp)) { 5479 /* 5480 * The following call returns true if the qp is not on the 5481 * queue and false if the qp is already on the queue before 5482 * this call. Either way, the qp will be on the queue when the 5483 * call returns. 5484 */ 5485 _hfi1_schedule_tid_send(qp); 5486 return true; 5487 } 5488 if (qp->s_flags & HFI1_S_ANY_WAIT_IO) 5489 iowait_set_flag(&((struct hfi1_qp_priv *)qp->priv)->s_iowait, 5490 IOWAIT_PENDING_TID); 5491 return false; 5492 } 5493 5494 bool hfi1_tid_rdma_ack_interlock(struct rvt_qp *qp, struct rvt_ack_entry *e) 5495 { 5496 struct rvt_ack_entry *prev; 5497 struct tid_rdma_request *req; 5498 struct hfi1_ibdev *dev = to_idev(qp->ibqp.device); 5499 struct hfi1_qp_priv *priv = qp->priv; 5500 u32 s_prev; 5501 5502 s_prev = qp->s_tail_ack_queue == 0 ? rvt_size_atomic(&dev->rdi) : 5503 (qp->s_tail_ack_queue - 1); 5504 prev = &qp->s_ack_queue[s_prev]; 5505 5506 if ((e->opcode == TID_OP(READ_REQ) || 5507 e->opcode == OP(RDMA_READ_REQUEST)) && 5508 prev->opcode == TID_OP(WRITE_REQ)) { 5509 req = ack_to_tid_req(prev); 5510 if (req->ack_seg != req->total_segs) { 5511 priv->s_flags |= HFI1_R_TID_WAIT_INTERLCK; 5512 return true; 5513 } 5514 } 5515 return false; 5516 } 5517 5518 static u32 read_r_next_psn(struct hfi1_devdata *dd, u8 ctxt, u8 fidx) 5519 { 5520 u64 reg; 5521 5522 /* 5523 * The only sane way to get the amount of 5524 * progress is to read the HW flow state. 
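 *
 * The flow table is one 64-bit CSR per flow index, hence the
 * (8 * fidx) byte offset; the next-PSN value lives in the low bits, so
 * the per-flow control/status bits higher up in the register are
 * stripped with mask_psn().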
5525 */ 5526 reg = read_uctxt_csr(dd, ctxt, RCV_TID_FLOW_TABLE + (8 * fidx)); 5527 return mask_psn(reg); 5528 } 5529 5530 static void tid_rdma_rcv_err(struct hfi1_packet *packet, 5531 struct ib_other_headers *ohdr, 5532 struct rvt_qp *qp, u32 psn, int diff, bool fecn) 5533 { 5534 unsigned long flags; 5535 5536 tid_rdma_rcv_error(packet, ohdr, qp, psn, diff); 5537 if (fecn) { 5538 spin_lock_irqsave(&qp->s_lock, flags); 5539 qp->s_flags |= RVT_S_ECN; 5540 spin_unlock_irqrestore(&qp->s_lock, flags); 5541 } 5542 } 5543 5544 static void update_r_next_psn_fecn(struct hfi1_packet *packet, 5545 struct hfi1_qp_priv *priv, 5546 struct hfi1_ctxtdata *rcd, 5547 struct tid_rdma_flow *flow, 5548 bool fecn) 5549 { 5550 /* 5551 * If a start/middle packet is delivered here due to 5552 * RSM rule and FECN, we need to update the r_next_psn. 5553 */ 5554 if (fecn && packet->etype == RHF_RCV_TYPE_EAGER && 5555 !(priv->s_flags & HFI1_R_TID_SW_PSN)) { 5556 struct hfi1_devdata *dd = rcd->dd; 5557 5558 flow->flow_state.r_next_psn = 5559 read_r_next_psn(dd, rcd->ctxt, flow->idx); 5560 } 5561 } 5562