1 // SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) 2 /* 3 * Copyright(c) 2018 - 2020 Intel Corporation. 4 * 5 */ 6 7 #include "hfi.h" 8 #include "qp.h" 9 #include "rc.h" 10 #include "verbs.h" 11 #include "tid_rdma.h" 12 #include "exp_rcv.h" 13 #include "trace.h" 14 15 /** 16 * DOC: TID RDMA READ protocol 17 * 18 * This is an end-to-end protocol at the hfi1 level between two nodes that 19 * improves performance by avoiding data copy on the requester side. It 20 * converts a qualified RDMA READ request into a TID RDMA READ request on 21 * the requester side and thereafter handles the request and response 22 * differently. To be qualified, the RDMA READ request should meet the 23 * following: 24 * -- The total data length should be greater than 256K; 25 * -- The total data length should be a multiple of 4K page size; 26 * -- Each local scatter-gather entry should be 4K page aligned; 27 * -- Each local scatter-gather entry should be a multiple of 4K page size; 28 */ 29 30 #define RCV_TID_FLOW_TABLE_CTRL_FLOW_VALID_SMASK BIT_ULL(32) 31 #define RCV_TID_FLOW_TABLE_CTRL_HDR_SUPP_EN_SMASK BIT_ULL(33) 32 #define RCV_TID_FLOW_TABLE_CTRL_KEEP_AFTER_SEQ_ERR_SMASK BIT_ULL(34) 33 #define RCV_TID_FLOW_TABLE_CTRL_KEEP_ON_GEN_ERR_SMASK BIT_ULL(35) 34 #define RCV_TID_FLOW_TABLE_STATUS_SEQ_MISMATCH_SMASK BIT_ULL(37) 35 #define RCV_TID_FLOW_TABLE_STATUS_GEN_MISMATCH_SMASK BIT_ULL(38) 36 37 /* Maximum number of packets within a flow generation. */ 38 #define MAX_TID_FLOW_PSN BIT(HFI1_KDETH_BTH_SEQ_SHIFT) 39 40 #define GENERATION_MASK 0xFFFFF 41 42 static u32 mask_generation(u32 a) 43 { 44 return a & GENERATION_MASK; 45 } 46 47 /* Reserved generation value to set to unused flows for kernel contexts */ 48 #define KERN_GENERATION_RESERVED mask_generation(U32_MAX) 49 50 /* 51 * J_KEY for kernel contexts when TID RDMA is used. 52 * See generate_jkey() in hfi.h for more information. 53 */ 54 #define TID_RDMA_JKEY 32 55 #define HFI1_KERNEL_MIN_JKEY HFI1_ADMIN_JKEY_RANGE 56 #define HFI1_KERNEL_MAX_JKEY (2 * HFI1_ADMIN_JKEY_RANGE - 1) 57 58 /* Maximum number of segments in flight per QP request. */ 59 #define TID_RDMA_MAX_READ_SEGS_PER_REQ 6 60 #define TID_RDMA_MAX_WRITE_SEGS_PER_REQ 4 61 #define MAX_REQ max_t(u16, TID_RDMA_MAX_READ_SEGS_PER_REQ, \ 62 TID_RDMA_MAX_WRITE_SEGS_PER_REQ) 63 #define MAX_FLOWS roundup_pow_of_two(MAX_REQ + 1) 64 65 #define MAX_EXPECTED_PAGES (MAX_EXPECTED_BUFFER / PAGE_SIZE) 66 67 #define TID_RDMA_DESTQP_FLOW_SHIFT 11 68 #define TID_RDMA_DESTQP_FLOW_MASK 0x1f 69 70 #define TID_OPFN_QP_CTXT_MASK 0xff 71 #define TID_OPFN_QP_CTXT_SHIFT 56 72 #define TID_OPFN_QP_KDETH_MASK 0xff 73 #define TID_OPFN_QP_KDETH_SHIFT 48 74 #define TID_OPFN_MAX_LEN_MASK 0x7ff 75 #define TID_OPFN_MAX_LEN_SHIFT 37 76 #define TID_OPFN_TIMEOUT_MASK 0x1f 77 #define TID_OPFN_TIMEOUT_SHIFT 32 78 #define TID_OPFN_RESERVED_MASK 0x3f 79 #define TID_OPFN_RESERVED_SHIFT 26 80 #define TID_OPFN_URG_MASK 0x1 81 #define TID_OPFN_URG_SHIFT 25 82 #define TID_OPFN_VER_MASK 0x7 83 #define TID_OPFN_VER_SHIFT 22 84 #define TID_OPFN_JKEY_MASK 0x3f 85 #define TID_OPFN_JKEY_SHIFT 16 86 #define TID_OPFN_MAX_READ_MASK 0x3f 87 #define TID_OPFN_MAX_READ_SHIFT 10 88 #define TID_OPFN_MAX_WRITE_MASK 0x3f 89 #define TID_OPFN_MAX_WRITE_SHIFT 4 90 91 /* 92 * OPFN TID layout 93 * 94 * 63 47 31 15 95 * NNNNNNNNKKKKKKKK MMMMMMMMMMMTTTTT DDDDDDUVVVJJJJJJ RRRRRRWWWWWWCCCC 96 * 3210987654321098 7654321098765432 1098765432109876 5432109876543210 97 * N - the context Number 98 * K - the Kdeth_qp 99 * M - Max_len 100 * T - Timeout 101 * D - reserveD 102 * V - version 103 * U - Urg capable 104 * J - Jkey 105 * R - max_Read 106 * W - max_Write 107 * C - Capcode 108 */ 109 110 static void tid_rdma_trigger_resume(struct work_struct *work); 111 static void hfi1_kern_exp_rcv_free_flows(struct tid_rdma_request *req); 112 static int hfi1_kern_exp_rcv_alloc_flows(struct tid_rdma_request *req, 113 gfp_t gfp); 114 static void hfi1_init_trdma_req(struct rvt_qp *qp, 115 struct tid_rdma_request *req); 116 static void hfi1_tid_write_alloc_resources(struct rvt_qp *qp, bool intr_ctx); 117 static void hfi1_tid_timeout(struct timer_list *t); 118 static void hfi1_add_tid_reap_timer(struct rvt_qp *qp); 119 static void hfi1_mod_tid_reap_timer(struct rvt_qp *qp); 120 static void hfi1_mod_tid_retry_timer(struct rvt_qp *qp); 121 static int hfi1_stop_tid_retry_timer(struct rvt_qp *qp); 122 static void hfi1_tid_retry_timeout(struct timer_list *t); 123 static int make_tid_rdma_ack(struct rvt_qp *qp, 124 struct ib_other_headers *ohdr, 125 struct hfi1_pkt_state *ps); 126 static void hfi1_do_tid_send(struct rvt_qp *qp); 127 static u32 read_r_next_psn(struct hfi1_devdata *dd, u8 ctxt, u8 fidx); 128 static void tid_rdma_rcv_err(struct hfi1_packet *packet, 129 struct ib_other_headers *ohdr, 130 struct rvt_qp *qp, u32 psn, int diff, bool fecn); 131 static void update_r_next_psn_fecn(struct hfi1_packet *packet, 132 struct hfi1_qp_priv *priv, 133 struct hfi1_ctxtdata *rcd, 134 struct tid_rdma_flow *flow, 135 bool fecn); 136 137 static void validate_r_tid_ack(struct hfi1_qp_priv *priv) 138 { 139 if (priv->r_tid_ack == HFI1_QP_WQE_INVALID) 140 priv->r_tid_ack = priv->r_tid_tail; 141 } 142 143 static void tid_rdma_schedule_ack(struct rvt_qp *qp) 144 { 145 struct hfi1_qp_priv *priv = qp->priv; 146 147 priv->s_flags |= RVT_S_ACK_PENDING; 148 hfi1_schedule_tid_send(qp); 149 } 150 151 static void tid_rdma_trigger_ack(struct rvt_qp *qp) 152 { 153 validate_r_tid_ack(qp->priv); 154 tid_rdma_schedule_ack(qp); 155 } 156 157 static u64 tid_rdma_opfn_encode(struct tid_rdma_params *p) 158 { 159 return 160 (((u64)p->qp & TID_OPFN_QP_CTXT_MASK) << 161 TID_OPFN_QP_CTXT_SHIFT) | 162 ((((u64)p->qp >> 16) & TID_OPFN_QP_KDETH_MASK) << 163 TID_OPFN_QP_KDETH_SHIFT) | 164 (((u64)((p->max_len >> PAGE_SHIFT) - 1) & 165 TID_OPFN_MAX_LEN_MASK) << TID_OPFN_MAX_LEN_SHIFT) | 166 (((u64)p->timeout & TID_OPFN_TIMEOUT_MASK) << 167 TID_OPFN_TIMEOUT_SHIFT) | 168 (((u64)p->urg & TID_OPFN_URG_MASK) << TID_OPFN_URG_SHIFT) | 169 (((u64)p->jkey & TID_OPFN_JKEY_MASK) << TID_OPFN_JKEY_SHIFT) | 170 (((u64)p->max_read & TID_OPFN_MAX_READ_MASK) << 171 TID_OPFN_MAX_READ_SHIFT) | 172 (((u64)p->max_write & TID_OPFN_MAX_WRITE_MASK) << 173 TID_OPFN_MAX_WRITE_SHIFT); 174 } 175 176 static void tid_rdma_opfn_decode(struct tid_rdma_params *p, u64 data) 177 { 178 p->max_len = (((data >> TID_OPFN_MAX_LEN_SHIFT) & 179 TID_OPFN_MAX_LEN_MASK) + 1) << PAGE_SHIFT; 180 p->jkey = (data >> TID_OPFN_JKEY_SHIFT) & TID_OPFN_JKEY_MASK; 181 p->max_write = (data >> TID_OPFN_MAX_WRITE_SHIFT) & 182 TID_OPFN_MAX_WRITE_MASK; 183 p->max_read = (data >> TID_OPFN_MAX_READ_SHIFT) & 184 TID_OPFN_MAX_READ_MASK; 185 p->qp = 186 ((((data >> TID_OPFN_QP_KDETH_SHIFT) & TID_OPFN_QP_KDETH_MASK) 187 << 16) | 188 ((data >> TID_OPFN_QP_CTXT_SHIFT) & TID_OPFN_QP_CTXT_MASK)); 189 p->urg = (data >> TID_OPFN_URG_SHIFT) & TID_OPFN_URG_MASK; 190 p->timeout = (data >> TID_OPFN_TIMEOUT_SHIFT) & TID_OPFN_TIMEOUT_MASK; 191 } 192 193 void tid_rdma_opfn_init(struct rvt_qp *qp, struct tid_rdma_params *p) 194 { 195 struct hfi1_qp_priv *priv = qp->priv; 196 197 p->qp = (RVT_KDETH_QP_PREFIX << 16) | priv->rcd->ctxt; 198 p->max_len = TID_RDMA_MAX_SEGMENT_SIZE; 199 p->jkey = priv->rcd->jkey; 200 p->max_read = TID_RDMA_MAX_READ_SEGS_PER_REQ; 201 p->max_write = TID_RDMA_MAX_WRITE_SEGS_PER_REQ; 202 p->timeout = qp->timeout; 203 p->urg = is_urg_masked(priv->rcd); 204 } 205 206 bool tid_rdma_conn_req(struct rvt_qp *qp, u64 *data) 207 { 208 struct hfi1_qp_priv *priv = qp->priv; 209 210 *data = tid_rdma_opfn_encode(&priv->tid_rdma.local); 211 return true; 212 } 213 214 bool tid_rdma_conn_reply(struct rvt_qp *qp, u64 data) 215 { 216 struct hfi1_qp_priv *priv = qp->priv; 217 struct tid_rdma_params *remote, *old; 218 bool ret = true; 219 220 old = rcu_dereference_protected(priv->tid_rdma.remote, 221 lockdep_is_held(&priv->opfn.lock)); 222 data &= ~0xfULL; 223 /* 224 * If data passed in is zero, return true so as not to continue the 225 * negotiation process 226 */ 227 if (!data || !HFI1_CAP_IS_KSET(TID_RDMA)) 228 goto null; 229 /* 230 * If kzalloc fails, return false. This will result in: 231 * * at the requester a new OPFN request being generated to retry 232 * the negotiation 233 * * at the responder, 0 being returned to the requester so as to 234 * disable TID RDMA at both the requester and the responder 235 */ 236 remote = kzalloc(sizeof(*remote), GFP_ATOMIC); 237 if (!remote) { 238 ret = false; 239 goto null; 240 } 241 242 tid_rdma_opfn_decode(remote, data); 243 priv->tid_timer_timeout_jiffies = 244 usecs_to_jiffies((((4096UL * (1UL << remote->timeout)) / 245 1000UL) << 3) * 7); 246 trace_hfi1_opfn_param(qp, 0, &priv->tid_rdma.local); 247 trace_hfi1_opfn_param(qp, 1, remote); 248 rcu_assign_pointer(priv->tid_rdma.remote, remote); 249 /* 250 * A TID RDMA READ request's segment size is not equal to 251 * remote->max_len only when the request's data length is smaller 252 * than remote->max_len. In that case, there will be only one segment. 253 * Therefore, when priv->pkts_ps is used to calculate req->cur_seg 254 * during retry, it will lead to req->cur_seg = 0, which is exactly 255 * what is expected. 256 */ 257 priv->pkts_ps = (u16)rvt_div_mtu(qp, remote->max_len); 258 priv->timeout_shift = ilog2(priv->pkts_ps - 1) + 1; 259 goto free; 260 null: 261 RCU_INIT_POINTER(priv->tid_rdma.remote, NULL); 262 priv->timeout_shift = 0; 263 free: 264 if (old) 265 kfree_rcu(old, rcu_head); 266 return ret; 267 } 268 269 bool tid_rdma_conn_resp(struct rvt_qp *qp, u64 *data) 270 { 271 bool ret; 272 273 ret = tid_rdma_conn_reply(qp, *data); 274 *data = 0; 275 /* 276 * If tid_rdma_conn_reply() returns error, set *data as 0 to indicate 277 * TID RDMA could not be enabled. This will result in TID RDMA being 278 * disabled at the requester too. 279 */ 280 if (ret) 281 (void)tid_rdma_conn_req(qp, data); 282 return ret; 283 } 284 285 void tid_rdma_conn_error(struct rvt_qp *qp) 286 { 287 struct hfi1_qp_priv *priv = qp->priv; 288 struct tid_rdma_params *old; 289 290 old = rcu_dereference_protected(priv->tid_rdma.remote, 291 lockdep_is_held(&priv->opfn.lock)); 292 RCU_INIT_POINTER(priv->tid_rdma.remote, NULL); 293 if (old) 294 kfree_rcu(old, rcu_head); 295 } 296 297 /* This is called at context initialization time */ 298 int hfi1_kern_exp_rcv_init(struct hfi1_ctxtdata *rcd, int reinit) 299 { 300 if (reinit) 301 return 0; 302 303 BUILD_BUG_ON(TID_RDMA_JKEY < HFI1_KERNEL_MIN_JKEY); 304 BUILD_BUG_ON(TID_RDMA_JKEY > HFI1_KERNEL_MAX_JKEY); 305 rcd->jkey = TID_RDMA_JKEY; 306 hfi1_set_ctxt_jkey(rcd->dd, rcd, rcd->jkey); 307 return hfi1_alloc_ctxt_rcv_groups(rcd); 308 } 309 310 /** 311 * qp_to_rcd - determine the receive context used by a qp 312 * @qp - the qp 313 * 314 * This routine returns the receive context associated 315 * with a a qp's qpn. 316 * 317 * Returns the context. 318 */ 319 static struct hfi1_ctxtdata *qp_to_rcd(struct rvt_dev_info *rdi, 320 struct rvt_qp *qp) 321 { 322 struct hfi1_ibdev *verbs_dev = container_of(rdi, 323 struct hfi1_ibdev, 324 rdi); 325 struct hfi1_devdata *dd = container_of(verbs_dev, 326 struct hfi1_devdata, 327 verbs_dev); 328 unsigned int ctxt; 329 330 if (qp->ibqp.qp_num == 0) 331 ctxt = 0; 332 else 333 ctxt = hfi1_get_qp_map(dd, qp->ibqp.qp_num >> dd->qos_shift); 334 return dd->rcd[ctxt]; 335 } 336 337 int hfi1_qp_priv_init(struct rvt_dev_info *rdi, struct rvt_qp *qp, 338 struct ib_qp_init_attr *init_attr) 339 { 340 struct hfi1_qp_priv *qpriv = qp->priv; 341 int i, ret; 342 343 qpriv->rcd = qp_to_rcd(rdi, qp); 344 345 spin_lock_init(&qpriv->opfn.lock); 346 INIT_WORK(&qpriv->opfn.opfn_work, opfn_send_conn_request); 347 INIT_WORK(&qpriv->tid_rdma.trigger_work, tid_rdma_trigger_resume); 348 qpriv->flow_state.psn = 0; 349 qpriv->flow_state.index = RXE_NUM_TID_FLOWS; 350 qpriv->flow_state.last_index = RXE_NUM_TID_FLOWS; 351 qpriv->flow_state.generation = KERN_GENERATION_RESERVED; 352 qpriv->s_state = TID_OP(WRITE_RESP); 353 qpriv->s_tid_cur = HFI1_QP_WQE_INVALID; 354 qpriv->s_tid_head = HFI1_QP_WQE_INVALID; 355 qpriv->s_tid_tail = HFI1_QP_WQE_INVALID; 356 qpriv->rnr_nak_state = TID_RNR_NAK_INIT; 357 qpriv->r_tid_head = HFI1_QP_WQE_INVALID; 358 qpriv->r_tid_tail = HFI1_QP_WQE_INVALID; 359 qpriv->r_tid_ack = HFI1_QP_WQE_INVALID; 360 qpriv->r_tid_alloc = HFI1_QP_WQE_INVALID; 361 atomic_set(&qpriv->n_requests, 0); 362 atomic_set(&qpriv->n_tid_requests, 0); 363 timer_setup(&qpriv->s_tid_timer, hfi1_tid_timeout, 0); 364 timer_setup(&qpriv->s_tid_retry_timer, hfi1_tid_retry_timeout, 0); 365 INIT_LIST_HEAD(&qpriv->tid_wait); 366 367 if (init_attr->qp_type == IB_QPT_RC && HFI1_CAP_IS_KSET(TID_RDMA)) { 368 struct hfi1_devdata *dd = qpriv->rcd->dd; 369 370 qpriv->pages = kzalloc_node(TID_RDMA_MAX_PAGES * 371 sizeof(*qpriv->pages), 372 GFP_KERNEL, dd->node); 373 if (!qpriv->pages) 374 return -ENOMEM; 375 for (i = 0; i < qp->s_size; i++) { 376 struct hfi1_swqe_priv *priv; 377 struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, i); 378 379 priv = kzalloc_node(sizeof(*priv), GFP_KERNEL, 380 dd->node); 381 if (!priv) 382 return -ENOMEM; 383 384 hfi1_init_trdma_req(qp, &priv->tid_req); 385 priv->tid_req.e.swqe = wqe; 386 wqe->priv = priv; 387 } 388 for (i = 0; i < rvt_max_atomic(rdi); i++) { 389 struct hfi1_ack_priv *priv; 390 391 priv = kzalloc_node(sizeof(*priv), GFP_KERNEL, 392 dd->node); 393 if (!priv) 394 return -ENOMEM; 395 396 hfi1_init_trdma_req(qp, &priv->tid_req); 397 priv->tid_req.e.ack = &qp->s_ack_queue[i]; 398 399 ret = hfi1_kern_exp_rcv_alloc_flows(&priv->tid_req, 400 GFP_KERNEL); 401 if (ret) { 402 kfree(priv); 403 return ret; 404 } 405 qp->s_ack_queue[i].priv = priv; 406 } 407 } 408 409 return 0; 410 } 411 412 void hfi1_qp_priv_tid_free(struct rvt_dev_info *rdi, struct rvt_qp *qp) 413 { 414 struct hfi1_qp_priv *qpriv = qp->priv; 415 struct rvt_swqe *wqe; 416 u32 i; 417 418 if (qp->ibqp.qp_type == IB_QPT_RC && HFI1_CAP_IS_KSET(TID_RDMA)) { 419 for (i = 0; i < qp->s_size; i++) { 420 wqe = rvt_get_swqe_ptr(qp, i); 421 kfree(wqe->priv); 422 wqe->priv = NULL; 423 } 424 for (i = 0; i < rvt_max_atomic(rdi); i++) { 425 struct hfi1_ack_priv *priv = qp->s_ack_queue[i].priv; 426 427 if (priv) 428 hfi1_kern_exp_rcv_free_flows(&priv->tid_req); 429 kfree(priv); 430 qp->s_ack_queue[i].priv = NULL; 431 } 432 cancel_work_sync(&qpriv->opfn.opfn_work); 433 kfree(qpriv->pages); 434 qpriv->pages = NULL; 435 } 436 } 437 438 /* Flow and tid waiter functions */ 439 /** 440 * DOC: lock ordering 441 * 442 * There are two locks involved with the queuing 443 * routines: the qp s_lock and the exp_lock. 444 * 445 * Since the tid space allocation is called from 446 * the send engine, the qp s_lock is already held. 447 * 448 * The allocation routines will get the exp_lock. 449 * 450 * The first_qp() call is provided to allow the head of 451 * the rcd wait queue to be fetched under the exp_lock and 452 * followed by a drop of the exp_lock. 453 * 454 * Any qp in the wait list will have the qp reference count held 455 * to hold the qp in memory. 456 */ 457 458 /* 459 * return head of rcd wait list 460 * 461 * Must hold the exp_lock. 462 * 463 * Get a reference to the QP to hold the QP in memory. 464 * 465 * The caller must release the reference when the local 466 * is no longer being used. 467 */ 468 static struct rvt_qp *first_qp(struct hfi1_ctxtdata *rcd, 469 struct tid_queue *queue) 470 __must_hold(&rcd->exp_lock) 471 { 472 struct hfi1_qp_priv *priv; 473 474 lockdep_assert_held(&rcd->exp_lock); 475 priv = list_first_entry_or_null(&queue->queue_head, 476 struct hfi1_qp_priv, 477 tid_wait); 478 if (!priv) 479 return NULL; 480 rvt_get_qp(priv->owner); 481 return priv->owner; 482 } 483 484 /** 485 * kernel_tid_waiters - determine rcd wait 486 * @rcd: the receive context 487 * @qp: the head of the qp being processed 488 * 489 * This routine will return false IFF 490 * the list is NULL or the head of the 491 * list is the indicated qp. 492 * 493 * Must hold the qp s_lock and the exp_lock. 494 * 495 * Return: 496 * false if either of the conditions below are satisfied: 497 * 1. The list is empty or 498 * 2. The indicated qp is at the head of the list and the 499 * HFI1_S_WAIT_TID_SPACE bit is set in qp->s_flags. 500 * true is returned otherwise. 501 */ 502 static bool kernel_tid_waiters(struct hfi1_ctxtdata *rcd, 503 struct tid_queue *queue, struct rvt_qp *qp) 504 __must_hold(&rcd->exp_lock) __must_hold(&qp->s_lock) 505 { 506 struct rvt_qp *fqp; 507 bool ret = true; 508 509 lockdep_assert_held(&qp->s_lock); 510 lockdep_assert_held(&rcd->exp_lock); 511 fqp = first_qp(rcd, queue); 512 if (!fqp || (fqp == qp && (qp->s_flags & HFI1_S_WAIT_TID_SPACE))) 513 ret = false; 514 rvt_put_qp(fqp); 515 return ret; 516 } 517 518 /** 519 * dequeue_tid_waiter - dequeue the qp from the list 520 * @qp - the qp to remove the wait list 521 * 522 * This routine removes the indicated qp from the 523 * wait list if it is there. 524 * 525 * This should be done after the hardware flow and 526 * tid array resources have been allocated. 527 * 528 * Must hold the qp s_lock and the rcd exp_lock. 529 * 530 * It assumes the s_lock to protect the s_flags 531 * field and to reliably test the HFI1_S_WAIT_TID_SPACE flag. 532 */ 533 static void dequeue_tid_waiter(struct hfi1_ctxtdata *rcd, 534 struct tid_queue *queue, struct rvt_qp *qp) 535 __must_hold(&rcd->exp_lock) __must_hold(&qp->s_lock) 536 { 537 struct hfi1_qp_priv *priv = qp->priv; 538 539 lockdep_assert_held(&qp->s_lock); 540 lockdep_assert_held(&rcd->exp_lock); 541 if (list_empty(&priv->tid_wait)) 542 return; 543 list_del_init(&priv->tid_wait); 544 qp->s_flags &= ~HFI1_S_WAIT_TID_SPACE; 545 queue->dequeue++; 546 rvt_put_qp(qp); 547 } 548 549 /** 550 * queue_qp_for_tid_wait - suspend QP on tid space 551 * @rcd: the receive context 552 * @qp: the qp 553 * 554 * The qp is inserted at the tail of the rcd 555 * wait queue and the HFI1_S_WAIT_TID_SPACE s_flag is set. 556 * 557 * Must hold the qp s_lock and the exp_lock. 558 */ 559 static void queue_qp_for_tid_wait(struct hfi1_ctxtdata *rcd, 560 struct tid_queue *queue, struct rvt_qp *qp) 561 __must_hold(&rcd->exp_lock) __must_hold(&qp->s_lock) 562 { 563 struct hfi1_qp_priv *priv = qp->priv; 564 565 lockdep_assert_held(&qp->s_lock); 566 lockdep_assert_held(&rcd->exp_lock); 567 if (list_empty(&priv->tid_wait)) { 568 qp->s_flags |= HFI1_S_WAIT_TID_SPACE; 569 list_add_tail(&priv->tid_wait, &queue->queue_head); 570 priv->tid_enqueue = ++queue->enqueue; 571 rcd->dd->verbs_dev.n_tidwait++; 572 trace_hfi1_qpsleep(qp, HFI1_S_WAIT_TID_SPACE); 573 rvt_get_qp(qp); 574 } 575 } 576 577 /** 578 * __trigger_tid_waiter - trigger tid waiter 579 * @qp: the qp 580 * 581 * This is a private entrance to schedule the qp 582 * assuming the caller is holding the qp->s_lock. 583 */ 584 static void __trigger_tid_waiter(struct rvt_qp *qp) 585 __must_hold(&qp->s_lock) 586 { 587 lockdep_assert_held(&qp->s_lock); 588 if (!(qp->s_flags & HFI1_S_WAIT_TID_SPACE)) 589 return; 590 trace_hfi1_qpwakeup(qp, HFI1_S_WAIT_TID_SPACE); 591 hfi1_schedule_send(qp); 592 } 593 594 /** 595 * tid_rdma_schedule_tid_wakeup - schedule wakeup for a qp 596 * @qp - the qp 597 * 598 * trigger a schedule or a waiting qp in a deadlock 599 * safe manner. The qp reference is held prior 600 * to this call via first_qp(). 601 * 602 * If the qp trigger was already scheduled (!rval) 603 * the the reference is dropped, otherwise the resume 604 * or the destroy cancel will dispatch the reference. 605 */ 606 static void tid_rdma_schedule_tid_wakeup(struct rvt_qp *qp) 607 { 608 struct hfi1_qp_priv *priv; 609 struct hfi1_ibport *ibp; 610 struct hfi1_pportdata *ppd; 611 struct hfi1_devdata *dd; 612 bool rval; 613 614 if (!qp) 615 return; 616 617 priv = qp->priv; 618 ibp = to_iport(qp->ibqp.device, qp->port_num); 619 ppd = ppd_from_ibp(ibp); 620 dd = dd_from_ibdev(qp->ibqp.device); 621 622 rval = queue_work_on(priv->s_sde ? 623 priv->s_sde->cpu : 624 cpumask_first(cpumask_of_node(dd->node)), 625 ppd->hfi1_wq, 626 &priv->tid_rdma.trigger_work); 627 if (!rval) 628 rvt_put_qp(qp); 629 } 630 631 /** 632 * tid_rdma_trigger_resume - field a trigger work request 633 * @work - the work item 634 * 635 * Complete the off qp trigger processing by directly 636 * calling the progress routine. 637 */ 638 static void tid_rdma_trigger_resume(struct work_struct *work) 639 { 640 struct tid_rdma_qp_params *tr; 641 struct hfi1_qp_priv *priv; 642 struct rvt_qp *qp; 643 644 tr = container_of(work, struct tid_rdma_qp_params, trigger_work); 645 priv = container_of(tr, struct hfi1_qp_priv, tid_rdma); 646 qp = priv->owner; 647 spin_lock_irq(&qp->s_lock); 648 if (qp->s_flags & HFI1_S_WAIT_TID_SPACE) { 649 spin_unlock_irq(&qp->s_lock); 650 hfi1_do_send(priv->owner, true); 651 } else { 652 spin_unlock_irq(&qp->s_lock); 653 } 654 rvt_put_qp(qp); 655 } 656 657 /** 658 * tid_rdma_flush_wait - unwind any tid space wait 659 * 660 * This is called when resetting a qp to 661 * allow a destroy or reset to get rid 662 * of any tid space linkage and reference counts. 663 */ 664 static void _tid_rdma_flush_wait(struct rvt_qp *qp, struct tid_queue *queue) 665 __must_hold(&qp->s_lock) 666 { 667 struct hfi1_qp_priv *priv; 668 669 if (!qp) 670 return; 671 lockdep_assert_held(&qp->s_lock); 672 priv = qp->priv; 673 qp->s_flags &= ~HFI1_S_WAIT_TID_SPACE; 674 spin_lock(&priv->rcd->exp_lock); 675 if (!list_empty(&priv->tid_wait)) { 676 list_del_init(&priv->tid_wait); 677 qp->s_flags &= ~HFI1_S_WAIT_TID_SPACE; 678 queue->dequeue++; 679 rvt_put_qp(qp); 680 } 681 spin_unlock(&priv->rcd->exp_lock); 682 } 683 684 void hfi1_tid_rdma_flush_wait(struct rvt_qp *qp) 685 __must_hold(&qp->s_lock) 686 { 687 struct hfi1_qp_priv *priv = qp->priv; 688 689 _tid_rdma_flush_wait(qp, &priv->rcd->flow_queue); 690 _tid_rdma_flush_wait(qp, &priv->rcd->rarr_queue); 691 } 692 693 /* Flow functions */ 694 /** 695 * kern_reserve_flow - allocate a hardware flow 696 * @rcd - the context to use for allocation 697 * @last - the index of the preferred flow. Use RXE_NUM_TID_FLOWS to 698 * signify "don't care". 699 * 700 * Use a bit mask based allocation to reserve a hardware 701 * flow for use in receiving KDETH data packets. If a preferred flow is 702 * specified the function will attempt to reserve that flow again, if 703 * available. 704 * 705 * The exp_lock must be held. 706 * 707 * Return: 708 * On success: a value postive value between 0 and RXE_NUM_TID_FLOWS - 1 709 * On failure: -EAGAIN 710 */ 711 static int kern_reserve_flow(struct hfi1_ctxtdata *rcd, int last) 712 __must_hold(&rcd->exp_lock) 713 { 714 int nr; 715 716 /* Attempt to reserve the preferred flow index */ 717 if (last >= 0 && last < RXE_NUM_TID_FLOWS && 718 !test_and_set_bit(last, &rcd->flow_mask)) 719 return last; 720 721 nr = ffz(rcd->flow_mask); 722 BUILD_BUG_ON(RXE_NUM_TID_FLOWS >= 723 (sizeof(rcd->flow_mask) * BITS_PER_BYTE)); 724 if (nr > (RXE_NUM_TID_FLOWS - 1)) 725 return -EAGAIN; 726 set_bit(nr, &rcd->flow_mask); 727 return nr; 728 } 729 730 static void kern_set_hw_flow(struct hfi1_ctxtdata *rcd, u32 generation, 731 u32 flow_idx) 732 { 733 u64 reg; 734 735 reg = ((u64)generation << HFI1_KDETH_BTH_SEQ_SHIFT) | 736 RCV_TID_FLOW_TABLE_CTRL_FLOW_VALID_SMASK | 737 RCV_TID_FLOW_TABLE_CTRL_KEEP_AFTER_SEQ_ERR_SMASK | 738 RCV_TID_FLOW_TABLE_CTRL_KEEP_ON_GEN_ERR_SMASK | 739 RCV_TID_FLOW_TABLE_STATUS_SEQ_MISMATCH_SMASK | 740 RCV_TID_FLOW_TABLE_STATUS_GEN_MISMATCH_SMASK; 741 742 if (generation != KERN_GENERATION_RESERVED) 743 reg |= RCV_TID_FLOW_TABLE_CTRL_HDR_SUPP_EN_SMASK; 744 745 write_uctxt_csr(rcd->dd, rcd->ctxt, 746 RCV_TID_FLOW_TABLE + 8 * flow_idx, reg); 747 } 748 749 static u32 kern_setup_hw_flow(struct hfi1_ctxtdata *rcd, u32 flow_idx) 750 __must_hold(&rcd->exp_lock) 751 { 752 u32 generation = rcd->flows[flow_idx].generation; 753 754 kern_set_hw_flow(rcd, generation, flow_idx); 755 return generation; 756 } 757 758 static u32 kern_flow_generation_next(u32 gen) 759 { 760 u32 generation = mask_generation(gen + 1); 761 762 if (generation == KERN_GENERATION_RESERVED) 763 generation = mask_generation(generation + 1); 764 return generation; 765 } 766 767 static void kern_clear_hw_flow(struct hfi1_ctxtdata *rcd, u32 flow_idx) 768 __must_hold(&rcd->exp_lock) 769 { 770 rcd->flows[flow_idx].generation = 771 kern_flow_generation_next(rcd->flows[flow_idx].generation); 772 kern_set_hw_flow(rcd, KERN_GENERATION_RESERVED, flow_idx); 773 } 774 775 int hfi1_kern_setup_hw_flow(struct hfi1_ctxtdata *rcd, struct rvt_qp *qp) 776 { 777 struct hfi1_qp_priv *qpriv = (struct hfi1_qp_priv *)qp->priv; 778 struct tid_flow_state *fs = &qpriv->flow_state; 779 struct rvt_qp *fqp; 780 unsigned long flags; 781 int ret = 0; 782 783 /* The QP already has an allocated flow */ 784 if (fs->index != RXE_NUM_TID_FLOWS) 785 return ret; 786 787 spin_lock_irqsave(&rcd->exp_lock, flags); 788 if (kernel_tid_waiters(rcd, &rcd->flow_queue, qp)) 789 goto queue; 790 791 ret = kern_reserve_flow(rcd, fs->last_index); 792 if (ret < 0) 793 goto queue; 794 fs->index = ret; 795 fs->last_index = fs->index; 796 797 /* Generation received in a RESYNC overrides default flow generation */ 798 if (fs->generation != KERN_GENERATION_RESERVED) 799 rcd->flows[fs->index].generation = fs->generation; 800 fs->generation = kern_setup_hw_flow(rcd, fs->index); 801 fs->psn = 0; 802 dequeue_tid_waiter(rcd, &rcd->flow_queue, qp); 803 /* get head before dropping lock */ 804 fqp = first_qp(rcd, &rcd->flow_queue); 805 spin_unlock_irqrestore(&rcd->exp_lock, flags); 806 807 tid_rdma_schedule_tid_wakeup(fqp); 808 return 0; 809 queue: 810 queue_qp_for_tid_wait(rcd, &rcd->flow_queue, qp); 811 spin_unlock_irqrestore(&rcd->exp_lock, flags); 812 return -EAGAIN; 813 } 814 815 void hfi1_kern_clear_hw_flow(struct hfi1_ctxtdata *rcd, struct rvt_qp *qp) 816 { 817 struct hfi1_qp_priv *qpriv = (struct hfi1_qp_priv *)qp->priv; 818 struct tid_flow_state *fs = &qpriv->flow_state; 819 struct rvt_qp *fqp; 820 unsigned long flags; 821 822 if (fs->index >= RXE_NUM_TID_FLOWS) 823 return; 824 spin_lock_irqsave(&rcd->exp_lock, flags); 825 kern_clear_hw_flow(rcd, fs->index); 826 clear_bit(fs->index, &rcd->flow_mask); 827 fs->index = RXE_NUM_TID_FLOWS; 828 fs->psn = 0; 829 fs->generation = KERN_GENERATION_RESERVED; 830 831 /* get head before dropping lock */ 832 fqp = first_qp(rcd, &rcd->flow_queue); 833 spin_unlock_irqrestore(&rcd->exp_lock, flags); 834 835 if (fqp == qp) { 836 __trigger_tid_waiter(fqp); 837 rvt_put_qp(fqp); 838 } else { 839 tid_rdma_schedule_tid_wakeup(fqp); 840 } 841 } 842 843 void hfi1_kern_init_ctxt_generations(struct hfi1_ctxtdata *rcd) 844 { 845 int i; 846 847 for (i = 0; i < RXE_NUM_TID_FLOWS; i++) { 848 rcd->flows[i].generation = mask_generation(prandom_u32()); 849 kern_set_hw_flow(rcd, KERN_GENERATION_RESERVED, i); 850 } 851 } 852 853 /* TID allocation functions */ 854 static u8 trdma_pset_order(struct tid_rdma_pageset *s) 855 { 856 u8 count = s->count; 857 858 return ilog2(count) + 1; 859 } 860 861 /** 862 * tid_rdma_find_phys_blocks_4k - get groups base on mr info 863 * @npages - number of pages 864 * @pages - pointer to an array of page structs 865 * @list - page set array to return 866 * 867 * This routine returns the number of groups associated with 868 * the current sge information. This implementation is based 869 * on the expected receive find_phys_blocks() adjusted to 870 * use the MR information vs. the pfn. 871 * 872 * Return: 873 * the number of RcvArray entries 874 */ 875 static u32 tid_rdma_find_phys_blocks_4k(struct tid_rdma_flow *flow, 876 struct page **pages, 877 u32 npages, 878 struct tid_rdma_pageset *list) 879 { 880 u32 pagecount, pageidx, setcount = 0, i; 881 void *vaddr, *this_vaddr; 882 883 if (!npages) 884 return 0; 885 886 /* 887 * Look for sets of physically contiguous pages in the user buffer. 888 * This will allow us to optimize Expected RcvArray entry usage by 889 * using the bigger supported sizes. 890 */ 891 vaddr = page_address(pages[0]); 892 trace_hfi1_tid_flow_page(flow->req->qp, flow, 0, 0, 0, vaddr); 893 for (pageidx = 0, pagecount = 1, i = 1; i <= npages; i++) { 894 this_vaddr = i < npages ? page_address(pages[i]) : NULL; 895 trace_hfi1_tid_flow_page(flow->req->qp, flow, i, 0, 0, 896 this_vaddr); 897 /* 898 * If the vaddr's are not sequential, pages are not physically 899 * contiguous. 900 */ 901 if (this_vaddr != (vaddr + PAGE_SIZE)) { 902 /* 903 * At this point we have to loop over the set of 904 * physically contiguous pages and break them down it 905 * sizes supported by the HW. 906 * There are two main constraints: 907 * 1. The max buffer size is MAX_EXPECTED_BUFFER. 908 * If the total set size is bigger than that 909 * program only a MAX_EXPECTED_BUFFER chunk. 910 * 2. The buffer size has to be a power of two. If 911 * it is not, round down to the closes power of 912 * 2 and program that size. 913 */ 914 while (pagecount) { 915 int maxpages = pagecount; 916 u32 bufsize = pagecount * PAGE_SIZE; 917 918 if (bufsize > MAX_EXPECTED_BUFFER) 919 maxpages = 920 MAX_EXPECTED_BUFFER >> 921 PAGE_SHIFT; 922 else if (!is_power_of_2(bufsize)) 923 maxpages = 924 rounddown_pow_of_two(bufsize) >> 925 PAGE_SHIFT; 926 927 list[setcount].idx = pageidx; 928 list[setcount].count = maxpages; 929 trace_hfi1_tid_pageset(flow->req->qp, setcount, 930 list[setcount].idx, 931 list[setcount].count); 932 pagecount -= maxpages; 933 pageidx += maxpages; 934 setcount++; 935 } 936 pageidx = i; 937 pagecount = 1; 938 vaddr = this_vaddr; 939 } else { 940 vaddr += PAGE_SIZE; 941 pagecount++; 942 } 943 } 944 /* insure we always return an even number of sets */ 945 if (setcount & 1) 946 list[setcount++].count = 0; 947 return setcount; 948 } 949 950 /** 951 * tid_flush_pages - dump out pages into pagesets 952 * @list - list of pagesets 953 * @idx - pointer to current page index 954 * @pages - number of pages to dump 955 * @sets - current number of pagesset 956 * 957 * This routine flushes out accumuated pages. 958 * 959 * To insure an even number of sets the 960 * code may add a filler. 961 * 962 * This can happen with when pages is not 963 * a power of 2 or pages is a power of 2 964 * less than the maximum pages. 965 * 966 * Return: 967 * The new number of sets 968 */ 969 970 static u32 tid_flush_pages(struct tid_rdma_pageset *list, 971 u32 *idx, u32 pages, u32 sets) 972 { 973 while (pages) { 974 u32 maxpages = pages; 975 976 if (maxpages > MAX_EXPECTED_PAGES) 977 maxpages = MAX_EXPECTED_PAGES; 978 else if (!is_power_of_2(maxpages)) 979 maxpages = rounddown_pow_of_two(maxpages); 980 list[sets].idx = *idx; 981 list[sets++].count = maxpages; 982 *idx += maxpages; 983 pages -= maxpages; 984 } 985 /* might need a filler */ 986 if (sets & 1) 987 list[sets++].count = 0; 988 return sets; 989 } 990 991 /** 992 * tid_rdma_find_phys_blocks_8k - get groups base on mr info 993 * @pages - pointer to an array of page structs 994 * @npages - number of pages 995 * @list - page set array to return 996 * 997 * This routine parses an array of pages to compute pagesets 998 * in an 8k compatible way. 999 * 1000 * pages are tested two at a time, i, i + 1 for contiguous 1001 * pages and i - 1 and i contiguous pages. 1002 * 1003 * If any condition is false, any accumlated pages are flushed and 1004 * v0,v1 are emitted as separate PAGE_SIZE pagesets 1005 * 1006 * Otherwise, the current 8k is totaled for a future flush. 1007 * 1008 * Return: 1009 * The number of pagesets 1010 * list set with the returned number of pagesets 1011 * 1012 */ 1013 static u32 tid_rdma_find_phys_blocks_8k(struct tid_rdma_flow *flow, 1014 struct page **pages, 1015 u32 npages, 1016 struct tid_rdma_pageset *list) 1017 { 1018 u32 idx, sets = 0, i; 1019 u32 pagecnt = 0; 1020 void *v0, *v1, *vm1; 1021 1022 if (!npages) 1023 return 0; 1024 for (idx = 0, i = 0, vm1 = NULL; i < npages; i += 2) { 1025 /* get a new v0 */ 1026 v0 = page_address(pages[i]); 1027 trace_hfi1_tid_flow_page(flow->req->qp, flow, i, 1, 0, v0); 1028 v1 = i + 1 < npages ? 1029 page_address(pages[i + 1]) : NULL; 1030 trace_hfi1_tid_flow_page(flow->req->qp, flow, i, 1, 1, v1); 1031 /* compare i, i + 1 vaddr */ 1032 if (v1 != (v0 + PAGE_SIZE)) { 1033 /* flush out pages */ 1034 sets = tid_flush_pages(list, &idx, pagecnt, sets); 1035 /* output v0,v1 as two pagesets */ 1036 list[sets].idx = idx++; 1037 list[sets++].count = 1; 1038 if (v1) { 1039 list[sets].count = 1; 1040 list[sets++].idx = idx++; 1041 } else { 1042 list[sets++].count = 0; 1043 } 1044 vm1 = NULL; 1045 pagecnt = 0; 1046 continue; 1047 } 1048 /* i,i+1 consecutive, look at i-1,i */ 1049 if (vm1 && v0 != (vm1 + PAGE_SIZE)) { 1050 /* flush out pages */ 1051 sets = tid_flush_pages(list, &idx, pagecnt, sets); 1052 pagecnt = 0; 1053 } 1054 /* pages will always be a multiple of 8k */ 1055 pagecnt += 2; 1056 /* save i-1 */ 1057 vm1 = v1; 1058 /* move to next pair */ 1059 } 1060 /* dump residual pages at end */ 1061 sets = tid_flush_pages(list, &idx, npages - idx, sets); 1062 /* by design cannot be odd sets */ 1063 WARN_ON(sets & 1); 1064 return sets; 1065 } 1066 1067 /** 1068 * Find pages for one segment of a sge array represented by @ss. The function 1069 * does not check the sge, the sge must have been checked for alignment with a 1070 * prior call to hfi1_kern_trdma_ok. Other sge checking is done as part of 1071 * rvt_lkey_ok and rvt_rkey_ok. Also, the function only modifies the local sge 1072 * copy maintained in @ss->sge, the original sge is not modified. 1073 * 1074 * Unlike IB RDMA WRITE, we can't decrement ss->num_sge here because we are not 1075 * releasing the MR reference count at the same time. Otherwise, we'll "leak" 1076 * references to the MR. This difference requires that we keep track of progress 1077 * into the sg_list. This is done by the cur_seg cursor in the tid_rdma_request 1078 * structure. 1079 */ 1080 static u32 kern_find_pages(struct tid_rdma_flow *flow, 1081 struct page **pages, 1082 struct rvt_sge_state *ss, bool *last) 1083 { 1084 struct tid_rdma_request *req = flow->req; 1085 struct rvt_sge *sge = &ss->sge; 1086 u32 length = flow->req->seg_len; 1087 u32 len = PAGE_SIZE; 1088 u32 i = 0; 1089 1090 while (length && req->isge < ss->num_sge) { 1091 pages[i++] = virt_to_page(sge->vaddr); 1092 1093 sge->vaddr += len; 1094 sge->length -= len; 1095 sge->sge_length -= len; 1096 if (!sge->sge_length) { 1097 if (++req->isge < ss->num_sge) 1098 *sge = ss->sg_list[req->isge - 1]; 1099 } else if (sge->length == 0 && sge->mr->lkey) { 1100 if (++sge->n >= RVT_SEGSZ) { 1101 ++sge->m; 1102 sge->n = 0; 1103 } 1104 sge->vaddr = sge->mr->map[sge->m]->segs[sge->n].vaddr; 1105 sge->length = sge->mr->map[sge->m]->segs[sge->n].length; 1106 } 1107 length -= len; 1108 } 1109 1110 flow->length = flow->req->seg_len - length; 1111 *last = req->isge == ss->num_sge ? false : true; 1112 return i; 1113 } 1114 1115 static void dma_unmap_flow(struct tid_rdma_flow *flow) 1116 { 1117 struct hfi1_devdata *dd; 1118 int i; 1119 struct tid_rdma_pageset *pset; 1120 1121 dd = flow->req->rcd->dd; 1122 for (i = 0, pset = &flow->pagesets[0]; i < flow->npagesets; 1123 i++, pset++) { 1124 if (pset->count && pset->addr) { 1125 dma_unmap_page(&dd->pcidev->dev, 1126 pset->addr, 1127 PAGE_SIZE * pset->count, 1128 DMA_FROM_DEVICE); 1129 pset->mapped = 0; 1130 } 1131 } 1132 } 1133 1134 static int dma_map_flow(struct tid_rdma_flow *flow, struct page **pages) 1135 { 1136 int i; 1137 struct hfi1_devdata *dd = flow->req->rcd->dd; 1138 struct tid_rdma_pageset *pset; 1139 1140 for (i = 0, pset = &flow->pagesets[0]; i < flow->npagesets; 1141 i++, pset++) { 1142 if (pset->count) { 1143 pset->addr = dma_map_page(&dd->pcidev->dev, 1144 pages[pset->idx], 1145 0, 1146 PAGE_SIZE * pset->count, 1147 DMA_FROM_DEVICE); 1148 1149 if (dma_mapping_error(&dd->pcidev->dev, pset->addr)) { 1150 dma_unmap_flow(flow); 1151 return -ENOMEM; 1152 } 1153 pset->mapped = 1; 1154 } 1155 } 1156 return 0; 1157 } 1158 1159 static inline bool dma_mapped(struct tid_rdma_flow *flow) 1160 { 1161 return !!flow->pagesets[0].mapped; 1162 } 1163 1164 /* 1165 * Get pages pointers and identify contiguous physical memory chunks for a 1166 * segment. All segments are of length flow->req->seg_len. 1167 */ 1168 static int kern_get_phys_blocks(struct tid_rdma_flow *flow, 1169 struct page **pages, 1170 struct rvt_sge_state *ss, bool *last) 1171 { 1172 u8 npages; 1173 1174 /* Reuse previously computed pagesets, if any */ 1175 if (flow->npagesets) { 1176 trace_hfi1_tid_flow_alloc(flow->req->qp, flow->req->setup_head, 1177 flow); 1178 if (!dma_mapped(flow)) 1179 return dma_map_flow(flow, pages); 1180 return 0; 1181 } 1182 1183 npages = kern_find_pages(flow, pages, ss, last); 1184 1185 if (flow->req->qp->pmtu == enum_to_mtu(OPA_MTU_4096)) 1186 flow->npagesets = 1187 tid_rdma_find_phys_blocks_4k(flow, pages, npages, 1188 flow->pagesets); 1189 else 1190 flow->npagesets = 1191 tid_rdma_find_phys_blocks_8k(flow, pages, npages, 1192 flow->pagesets); 1193 1194 return dma_map_flow(flow, pages); 1195 } 1196 1197 static inline void kern_add_tid_node(struct tid_rdma_flow *flow, 1198 struct hfi1_ctxtdata *rcd, char *s, 1199 struct tid_group *grp, u8 cnt) 1200 { 1201 struct kern_tid_node *node = &flow->tnode[flow->tnode_cnt++]; 1202 1203 WARN_ON_ONCE(flow->tnode_cnt >= 1204 (TID_RDMA_MAX_SEGMENT_SIZE >> PAGE_SHIFT)); 1205 if (WARN_ON_ONCE(cnt & 1)) 1206 dd_dev_err(rcd->dd, 1207 "unexpected odd allocation cnt %u map 0x%x used %u", 1208 cnt, grp->map, grp->used); 1209 1210 node->grp = grp; 1211 node->map = grp->map; 1212 node->cnt = cnt; 1213 trace_hfi1_tid_node_add(flow->req->qp, s, flow->tnode_cnt - 1, 1214 grp->base, grp->map, grp->used, cnt); 1215 } 1216 1217 /* 1218 * Try to allocate pageset_count TID's from TID groups for a context 1219 * 1220 * This function allocates TID's without moving groups between lists or 1221 * modifying grp->map. This is done as follows, being cogizant of the lists 1222 * between which the TID groups will move: 1223 * 1. First allocate complete groups of 8 TID's since this is more efficient, 1224 * these groups will move from group->full without affecting used 1225 * 2. If more TID's are needed allocate from used (will move from used->full or 1226 * stay in used) 1227 * 3. If we still don't have the required number of TID's go back and look again 1228 * at a complete group (will move from group->used) 1229 */ 1230 static int kern_alloc_tids(struct tid_rdma_flow *flow) 1231 { 1232 struct hfi1_ctxtdata *rcd = flow->req->rcd; 1233 struct hfi1_devdata *dd = rcd->dd; 1234 u32 ngroups, pageidx = 0; 1235 struct tid_group *group = NULL, *used; 1236 u8 use; 1237 1238 flow->tnode_cnt = 0; 1239 ngroups = flow->npagesets / dd->rcv_entries.group_size; 1240 if (!ngroups) 1241 goto used_list; 1242 1243 /* First look at complete groups */ 1244 list_for_each_entry(group, &rcd->tid_group_list.list, list) { 1245 kern_add_tid_node(flow, rcd, "complete groups", group, 1246 group->size); 1247 1248 pageidx += group->size; 1249 if (!--ngroups) 1250 break; 1251 } 1252 1253 if (pageidx >= flow->npagesets) 1254 goto ok; 1255 1256 used_list: 1257 /* Now look at partially used groups */ 1258 list_for_each_entry(used, &rcd->tid_used_list.list, list) { 1259 use = min_t(u32, flow->npagesets - pageidx, 1260 used->size - used->used); 1261 kern_add_tid_node(flow, rcd, "used groups", used, use); 1262 1263 pageidx += use; 1264 if (pageidx >= flow->npagesets) 1265 goto ok; 1266 } 1267 1268 /* 1269 * Look again at a complete group, continuing from where we left. 1270 * However, if we are at the head, we have reached the end of the 1271 * complete groups list from the first loop above 1272 */ 1273 if (group && &group->list == &rcd->tid_group_list.list) 1274 goto bail_eagain; 1275 group = list_prepare_entry(group, &rcd->tid_group_list.list, 1276 list); 1277 if (list_is_last(&group->list, &rcd->tid_group_list.list)) 1278 goto bail_eagain; 1279 group = list_next_entry(group, list); 1280 use = min_t(u32, flow->npagesets - pageidx, group->size); 1281 kern_add_tid_node(flow, rcd, "complete continue", group, use); 1282 pageidx += use; 1283 if (pageidx >= flow->npagesets) 1284 goto ok; 1285 bail_eagain: 1286 trace_hfi1_msg_alloc_tids(flow->req->qp, " insufficient tids: needed ", 1287 (u64)flow->npagesets); 1288 return -EAGAIN; 1289 ok: 1290 return 0; 1291 } 1292 1293 static void kern_program_rcv_group(struct tid_rdma_flow *flow, int grp_num, 1294 u32 *pset_idx) 1295 { 1296 struct hfi1_ctxtdata *rcd = flow->req->rcd; 1297 struct hfi1_devdata *dd = rcd->dd; 1298 struct kern_tid_node *node = &flow->tnode[grp_num]; 1299 struct tid_group *grp = node->grp; 1300 struct tid_rdma_pageset *pset; 1301 u32 pmtu_pg = flow->req->qp->pmtu >> PAGE_SHIFT; 1302 u32 rcventry, npages = 0, pair = 0, tidctrl; 1303 u8 i, cnt = 0; 1304 1305 for (i = 0; i < grp->size; i++) { 1306 rcventry = grp->base + i; 1307 1308 if (node->map & BIT(i) || cnt >= node->cnt) { 1309 rcv_array_wc_fill(dd, rcventry); 1310 continue; 1311 } 1312 pset = &flow->pagesets[(*pset_idx)++]; 1313 if (pset->count) { 1314 hfi1_put_tid(dd, rcventry, PT_EXPECTED, 1315 pset->addr, trdma_pset_order(pset)); 1316 } else { 1317 hfi1_put_tid(dd, rcventry, PT_INVALID, 0, 0); 1318 } 1319 npages += pset->count; 1320 1321 rcventry -= rcd->expected_base; 1322 tidctrl = pair ? 0x3 : rcventry & 0x1 ? 0x2 : 0x1; 1323 /* 1324 * A single TID entry will be used to use a rcvarr pair (with 1325 * tidctrl 0x3), if ALL these are true (a) the bit pos is even 1326 * (b) the group map shows current and the next bits as free 1327 * indicating two consecutive rcvarry entries are available (c) 1328 * we actually need 2 more entries 1329 */ 1330 pair = !(i & 0x1) && !((node->map >> i) & 0x3) && 1331 node->cnt >= cnt + 2; 1332 if (!pair) { 1333 if (!pset->count) 1334 tidctrl = 0x1; 1335 flow->tid_entry[flow->tidcnt++] = 1336 EXP_TID_SET(IDX, rcventry >> 1) | 1337 EXP_TID_SET(CTRL, tidctrl) | 1338 EXP_TID_SET(LEN, npages); 1339 trace_hfi1_tid_entry_alloc(/* entry */ 1340 flow->req->qp, flow->tidcnt - 1, 1341 flow->tid_entry[flow->tidcnt - 1]); 1342 1343 /* Efficient DIV_ROUND_UP(npages, pmtu_pg) */ 1344 flow->npkts += (npages + pmtu_pg - 1) >> ilog2(pmtu_pg); 1345 npages = 0; 1346 } 1347 1348 if (grp->used == grp->size - 1) 1349 tid_group_move(grp, &rcd->tid_used_list, 1350 &rcd->tid_full_list); 1351 else if (!grp->used) 1352 tid_group_move(grp, &rcd->tid_group_list, 1353 &rcd->tid_used_list); 1354 1355 grp->used++; 1356 grp->map |= BIT(i); 1357 cnt++; 1358 } 1359 } 1360 1361 static void kern_unprogram_rcv_group(struct tid_rdma_flow *flow, int grp_num) 1362 { 1363 struct hfi1_ctxtdata *rcd = flow->req->rcd; 1364 struct hfi1_devdata *dd = rcd->dd; 1365 struct kern_tid_node *node = &flow->tnode[grp_num]; 1366 struct tid_group *grp = node->grp; 1367 u32 rcventry; 1368 u8 i, cnt = 0; 1369 1370 for (i = 0; i < grp->size; i++) { 1371 rcventry = grp->base + i; 1372 1373 if (node->map & BIT(i) || cnt >= node->cnt) { 1374 rcv_array_wc_fill(dd, rcventry); 1375 continue; 1376 } 1377 1378 hfi1_put_tid(dd, rcventry, PT_INVALID, 0, 0); 1379 1380 grp->used--; 1381 grp->map &= ~BIT(i); 1382 cnt++; 1383 1384 if (grp->used == grp->size - 1) 1385 tid_group_move(grp, &rcd->tid_full_list, 1386 &rcd->tid_used_list); 1387 else if (!grp->used) 1388 tid_group_move(grp, &rcd->tid_used_list, 1389 &rcd->tid_group_list); 1390 } 1391 if (WARN_ON_ONCE(cnt & 1)) { 1392 struct hfi1_ctxtdata *rcd = flow->req->rcd; 1393 struct hfi1_devdata *dd = rcd->dd; 1394 1395 dd_dev_err(dd, "unexpected odd free cnt %u map 0x%x used %u", 1396 cnt, grp->map, grp->used); 1397 } 1398 } 1399 1400 static void kern_program_rcvarray(struct tid_rdma_flow *flow) 1401 { 1402 u32 pset_idx = 0; 1403 int i; 1404 1405 flow->npkts = 0; 1406 flow->tidcnt = 0; 1407 for (i = 0; i < flow->tnode_cnt; i++) 1408 kern_program_rcv_group(flow, i, &pset_idx); 1409 trace_hfi1_tid_flow_alloc(flow->req->qp, flow->req->setup_head, flow); 1410 } 1411 1412 /** 1413 * hfi1_kern_exp_rcv_setup() - setup TID's and flow for one segment of a 1414 * TID RDMA request 1415 * 1416 * @req: TID RDMA request for which the segment/flow is being set up 1417 * @ss: sge state, maintains state across successive segments of a sge 1418 * @last: set to true after the last sge segment has been processed 1419 * 1420 * This function 1421 * (1) finds a free flow entry in the flow circular buffer 1422 * (2) finds pages and continuous physical chunks constituing one segment 1423 * of an sge 1424 * (3) allocates TID group entries for those chunks 1425 * (4) programs rcvarray entries in the hardware corresponding to those 1426 * TID's 1427 * (5) computes a tidarray with formatted TID entries which can be sent 1428 * to the sender 1429 * (6) Reserves and programs HW flows. 1430 * (7) It also manages queing the QP when TID/flow resources are not 1431 * available. 1432 * 1433 * @req points to struct tid_rdma_request of which the segments are a part. The 1434 * function uses qp, rcd and seg_len members of @req. In the absence of errors, 1435 * req->flow_idx is the index of the flow which has been prepared in this 1436 * invocation of function call. With flow = &req->flows[req->flow_idx], 1437 * flow->tid_entry contains the TID array which the sender can use for TID RDMA 1438 * sends and flow->npkts contains number of packets required to send the 1439 * segment. 1440 * 1441 * hfi1_check_sge_align should be called prior to calling this function and if 1442 * it signals error TID RDMA cannot be used for this sge and this function 1443 * should not be called. 1444 * 1445 * For the queuing, caller must hold the flow->req->qp s_lock from the send 1446 * engine and the function will procure the exp_lock. 1447 * 1448 * Return: 1449 * The function returns -EAGAIN if sufficient number of TID/flow resources to 1450 * map the segment could not be allocated. In this case the function should be 1451 * called again with previous arguments to retry the TID allocation. There are 1452 * no other error returns. The function returns 0 on success. 1453 */ 1454 int hfi1_kern_exp_rcv_setup(struct tid_rdma_request *req, 1455 struct rvt_sge_state *ss, bool *last) 1456 __must_hold(&req->qp->s_lock) 1457 { 1458 struct tid_rdma_flow *flow = &req->flows[req->setup_head]; 1459 struct hfi1_ctxtdata *rcd = req->rcd; 1460 struct hfi1_qp_priv *qpriv = req->qp->priv; 1461 unsigned long flags; 1462 struct rvt_qp *fqp; 1463 u16 clear_tail = req->clear_tail; 1464 1465 lockdep_assert_held(&req->qp->s_lock); 1466 /* 1467 * We return error if either (a) we don't have space in the flow 1468 * circular buffer, or (b) we already have max entries in the buffer. 1469 * Max entries depend on the type of request we are processing and the 1470 * negotiated TID RDMA parameters. 1471 */ 1472 if (!CIRC_SPACE(req->setup_head, clear_tail, MAX_FLOWS) || 1473 CIRC_CNT(req->setup_head, clear_tail, MAX_FLOWS) >= 1474 req->n_flows) 1475 return -EINVAL; 1476 1477 /* 1478 * Get pages, identify contiguous physical memory chunks for the segment 1479 * If we can not determine a DMA address mapping we will treat it just 1480 * like if we ran out of space above. 1481 */ 1482 if (kern_get_phys_blocks(flow, qpriv->pages, ss, last)) { 1483 hfi1_wait_kmem(flow->req->qp); 1484 return -ENOMEM; 1485 } 1486 1487 spin_lock_irqsave(&rcd->exp_lock, flags); 1488 if (kernel_tid_waiters(rcd, &rcd->rarr_queue, flow->req->qp)) 1489 goto queue; 1490 1491 /* 1492 * At this point we know the number of pagesets and hence the number of 1493 * TID's to map the segment. Allocate the TID's from the TID groups. If 1494 * we cannot allocate the required number we exit and try again later 1495 */ 1496 if (kern_alloc_tids(flow)) 1497 goto queue; 1498 /* 1499 * Finally program the TID entries with the pagesets, compute the 1500 * tidarray and enable the HW flow 1501 */ 1502 kern_program_rcvarray(flow); 1503 1504 /* 1505 * Setup the flow state with relevant information. 1506 * This information is used for tracking the sequence of data packets 1507 * for the segment. 1508 * The flow is setup here as this is the most accurate time and place 1509 * to do so. Doing at a later time runs the risk of the flow data in 1510 * qpriv getting out of sync. 1511 */ 1512 memset(&flow->flow_state, 0x0, sizeof(flow->flow_state)); 1513 flow->idx = qpriv->flow_state.index; 1514 flow->flow_state.generation = qpriv->flow_state.generation; 1515 flow->flow_state.spsn = qpriv->flow_state.psn; 1516 flow->flow_state.lpsn = flow->flow_state.spsn + flow->npkts - 1; 1517 flow->flow_state.r_next_psn = 1518 full_flow_psn(flow, flow->flow_state.spsn); 1519 qpriv->flow_state.psn += flow->npkts; 1520 1521 dequeue_tid_waiter(rcd, &rcd->rarr_queue, flow->req->qp); 1522 /* get head before dropping lock */ 1523 fqp = first_qp(rcd, &rcd->rarr_queue); 1524 spin_unlock_irqrestore(&rcd->exp_lock, flags); 1525 tid_rdma_schedule_tid_wakeup(fqp); 1526 1527 req->setup_head = (req->setup_head + 1) & (MAX_FLOWS - 1); 1528 return 0; 1529 queue: 1530 queue_qp_for_tid_wait(rcd, &rcd->rarr_queue, flow->req->qp); 1531 spin_unlock_irqrestore(&rcd->exp_lock, flags); 1532 return -EAGAIN; 1533 } 1534 1535 static void hfi1_tid_rdma_reset_flow(struct tid_rdma_flow *flow) 1536 { 1537 flow->npagesets = 0; 1538 } 1539 1540 /* 1541 * This function is called after one segment has been successfully sent to 1542 * release the flow and TID HW/SW resources for that segment. The segments for a 1543 * TID RDMA request are setup and cleared in FIFO order which is managed using a 1544 * circular buffer. 1545 */ 1546 int hfi1_kern_exp_rcv_clear(struct tid_rdma_request *req) 1547 __must_hold(&req->qp->s_lock) 1548 { 1549 struct tid_rdma_flow *flow = &req->flows[req->clear_tail]; 1550 struct hfi1_ctxtdata *rcd = req->rcd; 1551 unsigned long flags; 1552 int i; 1553 struct rvt_qp *fqp; 1554 1555 lockdep_assert_held(&req->qp->s_lock); 1556 /* Exit if we have nothing in the flow circular buffer */ 1557 if (!CIRC_CNT(req->setup_head, req->clear_tail, MAX_FLOWS)) 1558 return -EINVAL; 1559 1560 spin_lock_irqsave(&rcd->exp_lock, flags); 1561 1562 for (i = 0; i < flow->tnode_cnt; i++) 1563 kern_unprogram_rcv_group(flow, i); 1564 /* To prevent double unprogramming */ 1565 flow->tnode_cnt = 0; 1566 /* get head before dropping lock */ 1567 fqp = first_qp(rcd, &rcd->rarr_queue); 1568 spin_unlock_irqrestore(&rcd->exp_lock, flags); 1569 1570 dma_unmap_flow(flow); 1571 1572 hfi1_tid_rdma_reset_flow(flow); 1573 req->clear_tail = (req->clear_tail + 1) & (MAX_FLOWS - 1); 1574 1575 if (fqp == req->qp) { 1576 __trigger_tid_waiter(fqp); 1577 rvt_put_qp(fqp); 1578 } else { 1579 tid_rdma_schedule_tid_wakeup(fqp); 1580 } 1581 1582 return 0; 1583 } 1584 1585 /* 1586 * This function is called to release all the tid entries for 1587 * a request. 1588 */ 1589 void hfi1_kern_exp_rcv_clear_all(struct tid_rdma_request *req) 1590 __must_hold(&req->qp->s_lock) 1591 { 1592 /* Use memory barrier for proper ordering */ 1593 while (CIRC_CNT(req->setup_head, req->clear_tail, MAX_FLOWS)) { 1594 if (hfi1_kern_exp_rcv_clear(req)) 1595 break; 1596 } 1597 } 1598 1599 /** 1600 * hfi1_kern_exp_rcv_free_flows - free priviously allocated flow information 1601 * @req - the tid rdma request to be cleaned 1602 */ 1603 static void hfi1_kern_exp_rcv_free_flows(struct tid_rdma_request *req) 1604 { 1605 kfree(req->flows); 1606 req->flows = NULL; 1607 } 1608 1609 /** 1610 * __trdma_clean_swqe - clean up for large sized QPs 1611 * @qp: the queue patch 1612 * @wqe: the send wqe 1613 */ 1614 void __trdma_clean_swqe(struct rvt_qp *qp, struct rvt_swqe *wqe) 1615 { 1616 struct hfi1_swqe_priv *p = wqe->priv; 1617 1618 hfi1_kern_exp_rcv_free_flows(&p->tid_req); 1619 } 1620 1621 /* 1622 * This can be called at QP create time or in the data path. 1623 */ 1624 static int hfi1_kern_exp_rcv_alloc_flows(struct tid_rdma_request *req, 1625 gfp_t gfp) 1626 { 1627 struct tid_rdma_flow *flows; 1628 int i; 1629 1630 if (likely(req->flows)) 1631 return 0; 1632 flows = kmalloc_node(MAX_FLOWS * sizeof(*flows), gfp, 1633 req->rcd->numa_id); 1634 if (!flows) 1635 return -ENOMEM; 1636 /* mini init */ 1637 for (i = 0; i < MAX_FLOWS; i++) { 1638 flows[i].req = req; 1639 flows[i].npagesets = 0; 1640 flows[i].pagesets[0].mapped = 0; 1641 flows[i].resync_npkts = 0; 1642 } 1643 req->flows = flows; 1644 return 0; 1645 } 1646 1647 static void hfi1_init_trdma_req(struct rvt_qp *qp, 1648 struct tid_rdma_request *req) 1649 { 1650 struct hfi1_qp_priv *qpriv = qp->priv; 1651 1652 /* 1653 * Initialize various TID RDMA request variables. 1654 * These variables are "static", which is why they 1655 * can be pre-initialized here before the WRs has 1656 * even been submitted. 1657 * However, non-NULL values for these variables do not 1658 * imply that this WQE has been enabled for TID RDMA. 1659 * Drivers should check the WQE's opcode to determine 1660 * if a request is a TID RDMA one or not. 1661 */ 1662 req->qp = qp; 1663 req->rcd = qpriv->rcd; 1664 } 1665 1666 u64 hfi1_access_sw_tid_wait(const struct cntr_entry *entry, 1667 void *context, int vl, int mode, u64 data) 1668 { 1669 struct hfi1_devdata *dd = context; 1670 1671 return dd->verbs_dev.n_tidwait; 1672 } 1673 1674 static struct tid_rdma_flow *find_flow_ib(struct tid_rdma_request *req, 1675 u32 psn, u16 *fidx) 1676 { 1677 u16 head, tail; 1678 struct tid_rdma_flow *flow; 1679 1680 head = req->setup_head; 1681 tail = req->clear_tail; 1682 for ( ; CIRC_CNT(head, tail, MAX_FLOWS); 1683 tail = CIRC_NEXT(tail, MAX_FLOWS)) { 1684 flow = &req->flows[tail]; 1685 if (cmp_psn(psn, flow->flow_state.ib_spsn) >= 0 && 1686 cmp_psn(psn, flow->flow_state.ib_lpsn) <= 0) { 1687 if (fidx) 1688 *fidx = tail; 1689 return flow; 1690 } 1691 } 1692 return NULL; 1693 } 1694 1695 /* TID RDMA READ functions */ 1696 u32 hfi1_build_tid_rdma_read_packet(struct rvt_swqe *wqe, 1697 struct ib_other_headers *ohdr, u32 *bth1, 1698 u32 *bth2, u32 *len) 1699 { 1700 struct tid_rdma_request *req = wqe_to_tid_req(wqe); 1701 struct tid_rdma_flow *flow = &req->flows[req->flow_idx]; 1702 struct rvt_qp *qp = req->qp; 1703 struct hfi1_qp_priv *qpriv = qp->priv; 1704 struct hfi1_swqe_priv *wpriv = wqe->priv; 1705 struct tid_rdma_read_req *rreq = &ohdr->u.tid_rdma.r_req; 1706 struct tid_rdma_params *remote; 1707 u32 req_len = 0; 1708 void *req_addr = NULL; 1709 1710 /* This is the IB psn used to send the request */ 1711 *bth2 = mask_psn(flow->flow_state.ib_spsn + flow->pkt); 1712 trace_hfi1_tid_flow_build_read_pkt(qp, req->flow_idx, flow); 1713 1714 /* TID Entries for TID RDMA READ payload */ 1715 req_addr = &flow->tid_entry[flow->tid_idx]; 1716 req_len = sizeof(*flow->tid_entry) * 1717 (flow->tidcnt - flow->tid_idx); 1718 1719 memset(&ohdr->u.tid_rdma.r_req, 0, sizeof(ohdr->u.tid_rdma.r_req)); 1720 wpriv->ss.sge.vaddr = req_addr; 1721 wpriv->ss.sge.sge_length = req_len; 1722 wpriv->ss.sge.length = wpriv->ss.sge.sge_length; 1723 /* 1724 * We can safely zero these out. Since the first SGE covers the 1725 * entire packet, nothing else should even look at the MR. 1726 */ 1727 wpriv->ss.sge.mr = NULL; 1728 wpriv->ss.sge.m = 0; 1729 wpriv->ss.sge.n = 0; 1730 1731 wpriv->ss.sg_list = NULL; 1732 wpriv->ss.total_len = wpriv->ss.sge.sge_length; 1733 wpriv->ss.num_sge = 1; 1734 1735 /* Construct the TID RDMA READ REQ packet header */ 1736 rcu_read_lock(); 1737 remote = rcu_dereference(qpriv->tid_rdma.remote); 1738 1739 KDETH_RESET(rreq->kdeth0, KVER, 0x1); 1740 KDETH_RESET(rreq->kdeth1, JKEY, remote->jkey); 1741 rreq->reth.vaddr = cpu_to_be64(wqe->rdma_wr.remote_addr + 1742 req->cur_seg * req->seg_len + flow->sent); 1743 rreq->reth.rkey = cpu_to_be32(wqe->rdma_wr.rkey); 1744 rreq->reth.length = cpu_to_be32(*len); 1745 rreq->tid_flow_psn = 1746 cpu_to_be32((flow->flow_state.generation << 1747 HFI1_KDETH_BTH_SEQ_SHIFT) | 1748 ((flow->flow_state.spsn + flow->pkt) & 1749 HFI1_KDETH_BTH_SEQ_MASK)); 1750 rreq->tid_flow_qp = 1751 cpu_to_be32(qpriv->tid_rdma.local.qp | 1752 ((flow->idx & TID_RDMA_DESTQP_FLOW_MASK) << 1753 TID_RDMA_DESTQP_FLOW_SHIFT) | 1754 qpriv->rcd->ctxt); 1755 rreq->verbs_qp = cpu_to_be32(qp->remote_qpn); 1756 *bth1 &= ~RVT_QPN_MASK; 1757 *bth1 |= remote->qp; 1758 *bth2 |= IB_BTH_REQ_ACK; 1759 rcu_read_unlock(); 1760 1761 /* We are done with this segment */ 1762 flow->sent += *len; 1763 req->cur_seg++; 1764 qp->s_state = TID_OP(READ_REQ); 1765 req->ack_pending++; 1766 req->flow_idx = (req->flow_idx + 1) & (MAX_FLOWS - 1); 1767 qpriv->pending_tid_r_segs++; 1768 qp->s_num_rd_atomic++; 1769 1770 /* Set the TID RDMA READ request payload size */ 1771 *len = req_len; 1772 1773 return sizeof(ohdr->u.tid_rdma.r_req) / sizeof(u32); 1774 } 1775 1776 /* 1777 * @len: contains the data length to read upon entry and the read request 1778 * payload length upon exit. 1779 */ 1780 u32 hfi1_build_tid_rdma_read_req(struct rvt_qp *qp, struct rvt_swqe *wqe, 1781 struct ib_other_headers *ohdr, u32 *bth1, 1782 u32 *bth2, u32 *len) 1783 __must_hold(&qp->s_lock) 1784 { 1785 struct hfi1_qp_priv *qpriv = qp->priv; 1786 struct tid_rdma_request *req = wqe_to_tid_req(wqe); 1787 struct tid_rdma_flow *flow = NULL; 1788 u32 hdwords = 0; 1789 bool last; 1790 bool retry = true; 1791 u32 npkts = rvt_div_round_up_mtu(qp, *len); 1792 1793 trace_hfi1_tid_req_build_read_req(qp, 0, wqe->wr.opcode, wqe->psn, 1794 wqe->lpsn, req); 1795 /* 1796 * Check sync conditions. Make sure that there are no pending 1797 * segments before freeing the flow. 1798 */ 1799 sync_check: 1800 if (req->state == TID_REQUEST_SYNC) { 1801 if (qpriv->pending_tid_r_segs) 1802 goto done; 1803 1804 hfi1_kern_clear_hw_flow(req->rcd, qp); 1805 qpriv->s_flags &= ~HFI1_R_TID_SW_PSN; 1806 req->state = TID_REQUEST_ACTIVE; 1807 } 1808 1809 /* 1810 * If the request for this segment is resent, the tid resources should 1811 * have been allocated before. In this case, req->flow_idx should 1812 * fall behind req->setup_head. 1813 */ 1814 if (req->flow_idx == req->setup_head) { 1815 retry = false; 1816 if (req->state == TID_REQUEST_RESEND) { 1817 /* 1818 * This is the first new segment for a request whose 1819 * earlier segments have been re-sent. We need to 1820 * set up the sge pointer correctly. 1821 */ 1822 restart_sge(&qp->s_sge, wqe, req->s_next_psn, 1823 qp->pmtu); 1824 req->isge = 0; 1825 req->state = TID_REQUEST_ACTIVE; 1826 } 1827 1828 /* 1829 * Check sync. The last PSN of each generation is reserved for 1830 * RESYNC. 1831 */ 1832 if ((qpriv->flow_state.psn + npkts) > MAX_TID_FLOW_PSN - 1) { 1833 req->state = TID_REQUEST_SYNC; 1834 goto sync_check; 1835 } 1836 1837 /* Allocate the flow if not yet */ 1838 if (hfi1_kern_setup_hw_flow(qpriv->rcd, qp)) 1839 goto done; 1840 1841 /* 1842 * The following call will advance req->setup_head after 1843 * allocating the tid entries. 1844 */ 1845 if (hfi1_kern_exp_rcv_setup(req, &qp->s_sge, &last)) { 1846 req->state = TID_REQUEST_QUEUED; 1847 1848 /* 1849 * We don't have resources for this segment. The QP has 1850 * already been queued. 1851 */ 1852 goto done; 1853 } 1854 } 1855 1856 /* req->flow_idx should only be one slot behind req->setup_head */ 1857 flow = &req->flows[req->flow_idx]; 1858 flow->pkt = 0; 1859 flow->tid_idx = 0; 1860 flow->sent = 0; 1861 if (!retry) { 1862 /* Set the first and last IB PSN for the flow in use.*/ 1863 flow->flow_state.ib_spsn = req->s_next_psn; 1864 flow->flow_state.ib_lpsn = 1865 flow->flow_state.ib_spsn + flow->npkts - 1; 1866 } 1867 1868 /* Calculate the next segment start psn.*/ 1869 req->s_next_psn += flow->npkts; 1870 1871 /* Build the packet header */ 1872 hdwords = hfi1_build_tid_rdma_read_packet(wqe, ohdr, bth1, bth2, len); 1873 done: 1874 return hdwords; 1875 } 1876 1877 /* 1878 * Validate and accept the TID RDMA READ request parameters. 1879 * Return 0 if the request is accepted successfully; 1880 * Return 1 otherwise. 1881 */ 1882 static int tid_rdma_rcv_read_request(struct rvt_qp *qp, 1883 struct rvt_ack_entry *e, 1884 struct hfi1_packet *packet, 1885 struct ib_other_headers *ohdr, 1886 u32 bth0, u32 psn, u64 vaddr, u32 len) 1887 { 1888 struct hfi1_qp_priv *qpriv = qp->priv; 1889 struct tid_rdma_request *req; 1890 struct tid_rdma_flow *flow; 1891 u32 flow_psn, i, tidlen = 0, pktlen, tlen; 1892 1893 req = ack_to_tid_req(e); 1894 1895 /* Validate the payload first */ 1896 flow = &req->flows[req->setup_head]; 1897 1898 /* payload length = packet length - (header length + ICRC length) */ 1899 pktlen = packet->tlen - (packet->hlen + 4); 1900 if (pktlen > sizeof(flow->tid_entry)) 1901 return 1; 1902 memcpy(flow->tid_entry, packet->ebuf, pktlen); 1903 flow->tidcnt = pktlen / sizeof(*flow->tid_entry); 1904 1905 /* 1906 * Walk the TID_ENTRY list to make sure we have enough space for a 1907 * complete segment. Also calculate the number of required packets. 1908 */ 1909 flow->npkts = rvt_div_round_up_mtu(qp, len); 1910 for (i = 0; i < flow->tidcnt; i++) { 1911 trace_hfi1_tid_entry_rcv_read_req(qp, i, 1912 flow->tid_entry[i]); 1913 tlen = EXP_TID_GET(flow->tid_entry[i], LEN); 1914 if (!tlen) 1915 return 1; 1916 1917 /* 1918 * For tid pair (tidctr == 3), the buffer size of the pair 1919 * should be the sum of the buffer size described by each 1920 * tid entry. However, only the first entry needs to be 1921 * specified in the request (see WFR HAS Section 8.5.7.1). 1922 */ 1923 tidlen += tlen; 1924 } 1925 if (tidlen * PAGE_SIZE < len) 1926 return 1; 1927 1928 /* Empty the flow array */ 1929 req->clear_tail = req->setup_head; 1930 flow->pkt = 0; 1931 flow->tid_idx = 0; 1932 flow->tid_offset = 0; 1933 flow->sent = 0; 1934 flow->tid_qpn = be32_to_cpu(ohdr->u.tid_rdma.r_req.tid_flow_qp); 1935 flow->idx = (flow->tid_qpn >> TID_RDMA_DESTQP_FLOW_SHIFT) & 1936 TID_RDMA_DESTQP_FLOW_MASK; 1937 flow_psn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.r_req.tid_flow_psn)); 1938 flow->flow_state.generation = flow_psn >> HFI1_KDETH_BTH_SEQ_SHIFT; 1939 flow->flow_state.spsn = flow_psn & HFI1_KDETH_BTH_SEQ_MASK; 1940 flow->length = len; 1941 1942 flow->flow_state.lpsn = flow->flow_state.spsn + 1943 flow->npkts - 1; 1944 flow->flow_state.ib_spsn = psn; 1945 flow->flow_state.ib_lpsn = flow->flow_state.ib_spsn + flow->npkts - 1; 1946 1947 trace_hfi1_tid_flow_rcv_read_req(qp, req->setup_head, flow); 1948 /* Set the initial flow index to the current flow. */ 1949 req->flow_idx = req->setup_head; 1950 1951 /* advance circular buffer head */ 1952 req->setup_head = (req->setup_head + 1) & (MAX_FLOWS - 1); 1953 1954 /* 1955 * Compute last PSN for request. 1956 */ 1957 e->opcode = (bth0 >> 24) & 0xff; 1958 e->psn = psn; 1959 e->lpsn = psn + flow->npkts - 1; 1960 e->sent = 0; 1961 1962 req->n_flows = qpriv->tid_rdma.local.max_read; 1963 req->state = TID_REQUEST_ACTIVE; 1964 req->cur_seg = 0; 1965 req->comp_seg = 0; 1966 req->ack_seg = 0; 1967 req->isge = 0; 1968 req->seg_len = qpriv->tid_rdma.local.max_len; 1969 req->total_len = len; 1970 req->total_segs = 1; 1971 req->r_flow_psn = e->psn; 1972 1973 trace_hfi1_tid_req_rcv_read_req(qp, 0, e->opcode, e->psn, e->lpsn, 1974 req); 1975 return 0; 1976 } 1977 1978 static int tid_rdma_rcv_error(struct hfi1_packet *packet, 1979 struct ib_other_headers *ohdr, 1980 struct rvt_qp *qp, u32 psn, int diff) 1981 { 1982 struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); 1983 struct hfi1_ctxtdata *rcd = ((struct hfi1_qp_priv *)qp->priv)->rcd; 1984 struct hfi1_ibdev *dev = to_idev(qp->ibqp.device); 1985 struct hfi1_qp_priv *qpriv = qp->priv; 1986 struct rvt_ack_entry *e; 1987 struct tid_rdma_request *req; 1988 unsigned long flags; 1989 u8 prev; 1990 bool old_req; 1991 1992 trace_hfi1_rsp_tid_rcv_error(qp, psn); 1993 trace_hfi1_tid_rdma_rcv_err(qp, 0, psn, diff); 1994 if (diff > 0) { 1995 /* sequence error */ 1996 if (!qp->r_nak_state) { 1997 ibp->rvp.n_rc_seqnak++; 1998 qp->r_nak_state = IB_NAK_PSN_ERROR; 1999 qp->r_ack_psn = qp->r_psn; 2000 rc_defered_ack(rcd, qp); 2001 } 2002 goto done; 2003 } 2004 2005 ibp->rvp.n_rc_dupreq++; 2006 2007 spin_lock_irqsave(&qp->s_lock, flags); 2008 e = find_prev_entry(qp, psn, &prev, NULL, &old_req); 2009 if (!e || (e->opcode != TID_OP(READ_REQ) && 2010 e->opcode != TID_OP(WRITE_REQ))) 2011 goto unlock; 2012 2013 req = ack_to_tid_req(e); 2014 req->r_flow_psn = psn; 2015 trace_hfi1_tid_req_rcv_err(qp, 0, e->opcode, e->psn, e->lpsn, req); 2016 if (e->opcode == TID_OP(READ_REQ)) { 2017 struct ib_reth *reth; 2018 u32 len; 2019 u32 rkey; 2020 u64 vaddr; 2021 int ok; 2022 u32 bth0; 2023 2024 reth = &ohdr->u.tid_rdma.r_req.reth; 2025 /* 2026 * The requester always restarts from the start of the original 2027 * request. 2028 */ 2029 len = be32_to_cpu(reth->length); 2030 if (psn != e->psn || len != req->total_len) 2031 goto unlock; 2032 2033 release_rdma_sge_mr(e); 2034 2035 rkey = be32_to_cpu(reth->rkey); 2036 vaddr = get_ib_reth_vaddr(reth); 2037 2038 qp->r_len = len; 2039 ok = rvt_rkey_ok(qp, &e->rdma_sge, len, vaddr, rkey, 2040 IB_ACCESS_REMOTE_READ); 2041 if (unlikely(!ok)) 2042 goto unlock; 2043 2044 /* 2045 * If all the response packets for the current request have 2046 * been sent out and this request is complete (old_request 2047 * == false) and the TID flow may be unusable (the 2048 * req->clear_tail is advanced). However, when an earlier 2049 * request is received, this request will not be complete any 2050 * more (qp->s_tail_ack_queue is moved back, see below). 2051 * Consequently, we need to update the TID flow info everytime 2052 * a duplicate request is received. 2053 */ 2054 bth0 = be32_to_cpu(ohdr->bth[0]); 2055 if (tid_rdma_rcv_read_request(qp, e, packet, ohdr, bth0, psn, 2056 vaddr, len)) 2057 goto unlock; 2058 2059 /* 2060 * True if the request is already scheduled (between 2061 * qp->s_tail_ack_queue and qp->r_head_ack_queue); 2062 */ 2063 if (old_req) 2064 goto unlock; 2065 } else { 2066 struct flow_state *fstate; 2067 bool schedule = false; 2068 u8 i; 2069 2070 if (req->state == TID_REQUEST_RESEND) { 2071 req->state = TID_REQUEST_RESEND_ACTIVE; 2072 } else if (req->state == TID_REQUEST_INIT_RESEND) { 2073 req->state = TID_REQUEST_INIT; 2074 schedule = true; 2075 } 2076 2077 /* 2078 * True if the request is already scheduled (between 2079 * qp->s_tail_ack_queue and qp->r_head_ack_queue). 2080 * Also, don't change requests, which are at the SYNC 2081 * point and haven't generated any responses yet. 2082 * There is nothing to retransmit for them yet. 2083 */ 2084 if (old_req || req->state == TID_REQUEST_INIT || 2085 (req->state == TID_REQUEST_SYNC && !req->cur_seg)) { 2086 for (i = prev + 1; ; i++) { 2087 if (i > rvt_size_atomic(&dev->rdi)) 2088 i = 0; 2089 if (i == qp->r_head_ack_queue) 2090 break; 2091 e = &qp->s_ack_queue[i]; 2092 req = ack_to_tid_req(e); 2093 if (e->opcode == TID_OP(WRITE_REQ) && 2094 req->state == TID_REQUEST_INIT) 2095 req->state = TID_REQUEST_INIT_RESEND; 2096 } 2097 /* 2098 * If the state of the request has been changed, 2099 * the first leg needs to get scheduled in order to 2100 * pick up the change. Otherwise, normal response 2101 * processing should take care of it. 2102 */ 2103 if (!schedule) 2104 goto unlock; 2105 } 2106 2107 /* 2108 * If there is no more allocated segment, just schedule the qp 2109 * without changing any state. 2110 */ 2111 if (req->clear_tail == req->setup_head) 2112 goto schedule; 2113 /* 2114 * If this request has sent responses for segments, which have 2115 * not received data yet (flow_idx != clear_tail), the flow_idx 2116 * pointer needs to be adjusted so the same responses can be 2117 * re-sent. 2118 */ 2119 if (CIRC_CNT(req->flow_idx, req->clear_tail, MAX_FLOWS)) { 2120 fstate = &req->flows[req->clear_tail].flow_state; 2121 qpriv->pending_tid_w_segs -= 2122 CIRC_CNT(req->flow_idx, req->clear_tail, 2123 MAX_FLOWS); 2124 req->flow_idx = 2125 CIRC_ADD(req->clear_tail, 2126 delta_psn(psn, fstate->resp_ib_psn), 2127 MAX_FLOWS); 2128 qpriv->pending_tid_w_segs += 2129 delta_psn(psn, fstate->resp_ib_psn); 2130 /* 2131 * When flow_idx == setup_head, we've gotten a duplicate 2132 * request for a segment, which has not been allocated 2133 * yet. In that case, don't adjust this request. 2134 * However, we still want to go through the loop below 2135 * to adjust all subsequent requests. 2136 */ 2137 if (CIRC_CNT(req->setup_head, req->flow_idx, 2138 MAX_FLOWS)) { 2139 req->cur_seg = delta_psn(psn, e->psn); 2140 req->state = TID_REQUEST_RESEND_ACTIVE; 2141 } 2142 } 2143 2144 for (i = prev + 1; ; i++) { 2145 /* 2146 * Look at everything up to and including 2147 * s_tail_ack_queue 2148 */ 2149 if (i > rvt_size_atomic(&dev->rdi)) 2150 i = 0; 2151 if (i == qp->r_head_ack_queue) 2152 break; 2153 e = &qp->s_ack_queue[i]; 2154 req = ack_to_tid_req(e); 2155 trace_hfi1_tid_req_rcv_err(qp, 0, e->opcode, e->psn, 2156 e->lpsn, req); 2157 if (e->opcode != TID_OP(WRITE_REQ) || 2158 req->cur_seg == req->comp_seg || 2159 req->state == TID_REQUEST_INIT || 2160 req->state == TID_REQUEST_INIT_RESEND) { 2161 if (req->state == TID_REQUEST_INIT) 2162 req->state = TID_REQUEST_INIT_RESEND; 2163 continue; 2164 } 2165 qpriv->pending_tid_w_segs -= 2166 CIRC_CNT(req->flow_idx, 2167 req->clear_tail, 2168 MAX_FLOWS); 2169 req->flow_idx = req->clear_tail; 2170 req->state = TID_REQUEST_RESEND; 2171 req->cur_seg = req->comp_seg; 2172 } 2173 qpriv->s_flags &= ~HFI1_R_TID_WAIT_INTERLCK; 2174 } 2175 /* Re-process old requests.*/ 2176 if (qp->s_acked_ack_queue == qp->s_tail_ack_queue) 2177 qp->s_acked_ack_queue = prev; 2178 qp->s_tail_ack_queue = prev; 2179 /* 2180 * Since the qp->s_tail_ack_queue is modified, the 2181 * qp->s_ack_state must be changed to re-initialize 2182 * qp->s_ack_rdma_sge; Otherwise, we will end up in 2183 * wrong memory region. 2184 */ 2185 qp->s_ack_state = OP(ACKNOWLEDGE); 2186 schedule: 2187 /* 2188 * It's possible to receive a retry psn that is earlier than an RNRNAK 2189 * psn. In this case, the rnrnak state should be cleared. 2190 */ 2191 if (qpriv->rnr_nak_state) { 2192 qp->s_nak_state = 0; 2193 qpriv->rnr_nak_state = TID_RNR_NAK_INIT; 2194 qp->r_psn = e->lpsn + 1; 2195 hfi1_tid_write_alloc_resources(qp, true); 2196 } 2197 2198 qp->r_state = e->opcode; 2199 qp->r_nak_state = 0; 2200 qp->s_flags |= RVT_S_RESP_PENDING; 2201 hfi1_schedule_send(qp); 2202 unlock: 2203 spin_unlock_irqrestore(&qp->s_lock, flags); 2204 done: 2205 return 1; 2206 } 2207 2208 void hfi1_rc_rcv_tid_rdma_read_req(struct hfi1_packet *packet) 2209 { 2210 /* HANDLER FOR TID RDMA READ REQUEST packet (Responder side)*/ 2211 2212 /* 2213 * 1. Verify TID RDMA READ REQ as per IB_OPCODE_RC_RDMA_READ 2214 * (see hfi1_rc_rcv()) 2215 * 2. Put TID RDMA READ REQ into the response queueu (s_ack_queue) 2216 * - Setup struct tid_rdma_req with request info 2217 * - Initialize struct tid_rdma_flow info; 2218 * - Copy TID entries; 2219 * 3. Set the qp->s_ack_state. 2220 * 4. Set RVT_S_RESP_PENDING in s_flags. 2221 * 5. Kick the send engine (hfi1_schedule_send()) 2222 */ 2223 struct hfi1_ctxtdata *rcd = packet->rcd; 2224 struct rvt_qp *qp = packet->qp; 2225 struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); 2226 struct ib_other_headers *ohdr = packet->ohdr; 2227 struct rvt_ack_entry *e; 2228 unsigned long flags; 2229 struct ib_reth *reth; 2230 struct hfi1_qp_priv *qpriv = qp->priv; 2231 u32 bth0, psn, len, rkey; 2232 bool fecn; 2233 u8 next; 2234 u64 vaddr; 2235 int diff; 2236 u8 nack_state = IB_NAK_INVALID_REQUEST; 2237 2238 bth0 = be32_to_cpu(ohdr->bth[0]); 2239 if (hfi1_ruc_check_hdr(ibp, packet)) 2240 return; 2241 2242 fecn = process_ecn(qp, packet); 2243 psn = mask_psn(be32_to_cpu(ohdr->bth[2])); 2244 trace_hfi1_rsp_rcv_tid_read_req(qp, psn); 2245 2246 if (qp->state == IB_QPS_RTR && !(qp->r_flags & RVT_R_COMM_EST)) 2247 rvt_comm_est(qp); 2248 2249 if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ))) 2250 goto nack_inv; 2251 2252 reth = &ohdr->u.tid_rdma.r_req.reth; 2253 vaddr = be64_to_cpu(reth->vaddr); 2254 len = be32_to_cpu(reth->length); 2255 /* The length needs to be in multiples of PAGE_SIZE */ 2256 if (!len || len & ~PAGE_MASK || len > qpriv->tid_rdma.local.max_len) 2257 goto nack_inv; 2258 2259 diff = delta_psn(psn, qp->r_psn); 2260 if (unlikely(diff)) { 2261 tid_rdma_rcv_err(packet, ohdr, qp, psn, diff, fecn); 2262 return; 2263 } 2264 2265 /* We've verified the request, insert it into the ack queue. */ 2266 next = qp->r_head_ack_queue + 1; 2267 if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device))) 2268 next = 0; 2269 spin_lock_irqsave(&qp->s_lock, flags); 2270 if (unlikely(next == qp->s_tail_ack_queue)) { 2271 if (!qp->s_ack_queue[next].sent) { 2272 nack_state = IB_NAK_REMOTE_OPERATIONAL_ERROR; 2273 goto nack_inv_unlock; 2274 } 2275 update_ack_queue(qp, next); 2276 } 2277 e = &qp->s_ack_queue[qp->r_head_ack_queue]; 2278 release_rdma_sge_mr(e); 2279 2280 rkey = be32_to_cpu(reth->rkey); 2281 qp->r_len = len; 2282 2283 if (unlikely(!rvt_rkey_ok(qp, &e->rdma_sge, qp->r_len, vaddr, 2284 rkey, IB_ACCESS_REMOTE_READ))) 2285 goto nack_acc; 2286 2287 /* Accept the request parameters */ 2288 if (tid_rdma_rcv_read_request(qp, e, packet, ohdr, bth0, psn, vaddr, 2289 len)) 2290 goto nack_inv_unlock; 2291 2292 qp->r_state = e->opcode; 2293 qp->r_nak_state = 0; 2294 /* 2295 * We need to increment the MSN here instead of when we 2296 * finish sending the result since a duplicate request would 2297 * increment it more than once. 2298 */ 2299 qp->r_msn++; 2300 qp->r_psn += e->lpsn - e->psn + 1; 2301 2302 qp->r_head_ack_queue = next; 2303 2304 /* 2305 * For all requests other than TID WRITE which are added to the ack 2306 * queue, qpriv->r_tid_alloc follows qp->r_head_ack_queue. It is ok to 2307 * do this because of interlocks between these and TID WRITE 2308 * requests. The same change has also been made in hfi1_rc_rcv(). 2309 */ 2310 qpriv->r_tid_alloc = qp->r_head_ack_queue; 2311 2312 /* Schedule the send tasklet. */ 2313 qp->s_flags |= RVT_S_RESP_PENDING; 2314 if (fecn) 2315 qp->s_flags |= RVT_S_ECN; 2316 hfi1_schedule_send(qp); 2317 2318 spin_unlock_irqrestore(&qp->s_lock, flags); 2319 return; 2320 2321 nack_inv_unlock: 2322 spin_unlock_irqrestore(&qp->s_lock, flags); 2323 nack_inv: 2324 rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR); 2325 qp->r_nak_state = nack_state; 2326 qp->r_ack_psn = qp->r_psn; 2327 /* Queue NAK for later */ 2328 rc_defered_ack(rcd, qp); 2329 return; 2330 nack_acc: 2331 spin_unlock_irqrestore(&qp->s_lock, flags); 2332 rvt_rc_error(qp, IB_WC_LOC_PROT_ERR); 2333 qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR; 2334 qp->r_ack_psn = qp->r_psn; 2335 } 2336 2337 u32 hfi1_build_tid_rdma_read_resp(struct rvt_qp *qp, struct rvt_ack_entry *e, 2338 struct ib_other_headers *ohdr, u32 *bth0, 2339 u32 *bth1, u32 *bth2, u32 *len, bool *last) 2340 { 2341 struct hfi1_ack_priv *epriv = e->priv; 2342 struct tid_rdma_request *req = &epriv->tid_req; 2343 struct hfi1_qp_priv *qpriv = qp->priv; 2344 struct tid_rdma_flow *flow = &req->flows[req->clear_tail]; 2345 u32 tidentry = flow->tid_entry[flow->tid_idx]; 2346 u32 tidlen = EXP_TID_GET(tidentry, LEN) << PAGE_SHIFT; 2347 struct tid_rdma_read_resp *resp = &ohdr->u.tid_rdma.r_rsp; 2348 u32 next_offset, om = KDETH_OM_LARGE; 2349 bool last_pkt; 2350 u32 hdwords = 0; 2351 struct tid_rdma_params *remote; 2352 2353 *len = min_t(u32, qp->pmtu, tidlen - flow->tid_offset); 2354 flow->sent += *len; 2355 next_offset = flow->tid_offset + *len; 2356 last_pkt = (flow->sent >= flow->length); 2357 2358 trace_hfi1_tid_entry_build_read_resp(qp, flow->tid_idx, tidentry); 2359 trace_hfi1_tid_flow_build_read_resp(qp, req->clear_tail, flow); 2360 2361 rcu_read_lock(); 2362 remote = rcu_dereference(qpriv->tid_rdma.remote); 2363 if (!remote) { 2364 rcu_read_unlock(); 2365 goto done; 2366 } 2367 KDETH_RESET(resp->kdeth0, KVER, 0x1); 2368 KDETH_SET(resp->kdeth0, SH, !last_pkt); 2369 KDETH_SET(resp->kdeth0, INTR, !!(!last_pkt && remote->urg)); 2370 KDETH_SET(resp->kdeth0, TIDCTRL, EXP_TID_GET(tidentry, CTRL)); 2371 KDETH_SET(resp->kdeth0, TID, EXP_TID_GET(tidentry, IDX)); 2372 KDETH_SET(resp->kdeth0, OM, om == KDETH_OM_LARGE); 2373 KDETH_SET(resp->kdeth0, OFFSET, flow->tid_offset / om); 2374 KDETH_RESET(resp->kdeth1, JKEY, remote->jkey); 2375 resp->verbs_qp = cpu_to_be32(qp->remote_qpn); 2376 rcu_read_unlock(); 2377 2378 resp->aeth = rvt_compute_aeth(qp); 2379 resp->verbs_psn = cpu_to_be32(mask_psn(flow->flow_state.ib_spsn + 2380 flow->pkt)); 2381 2382 *bth0 = TID_OP(READ_RESP) << 24; 2383 *bth1 = flow->tid_qpn; 2384 *bth2 = mask_psn(((flow->flow_state.spsn + flow->pkt++) & 2385 HFI1_KDETH_BTH_SEQ_MASK) | 2386 (flow->flow_state.generation << 2387 HFI1_KDETH_BTH_SEQ_SHIFT)); 2388 *last = last_pkt; 2389 if (last_pkt) 2390 /* Advance to next flow */ 2391 req->clear_tail = (req->clear_tail + 1) & 2392 (MAX_FLOWS - 1); 2393 2394 if (next_offset >= tidlen) { 2395 flow->tid_offset = 0; 2396 flow->tid_idx++; 2397 } else { 2398 flow->tid_offset = next_offset; 2399 } 2400 2401 hdwords = sizeof(ohdr->u.tid_rdma.r_rsp) / sizeof(u32); 2402 2403 done: 2404 return hdwords; 2405 } 2406 2407 static inline struct tid_rdma_request * 2408 find_tid_request(struct rvt_qp *qp, u32 psn, enum ib_wr_opcode opcode) 2409 __must_hold(&qp->s_lock) 2410 { 2411 struct rvt_swqe *wqe; 2412 struct tid_rdma_request *req = NULL; 2413 u32 i, end; 2414 2415 end = qp->s_cur + 1; 2416 if (end == qp->s_size) 2417 end = 0; 2418 for (i = qp->s_acked; i != end;) { 2419 wqe = rvt_get_swqe_ptr(qp, i); 2420 if (cmp_psn(psn, wqe->psn) >= 0 && 2421 cmp_psn(psn, wqe->lpsn) <= 0) { 2422 if (wqe->wr.opcode == opcode) 2423 req = wqe_to_tid_req(wqe); 2424 break; 2425 } 2426 if (++i == qp->s_size) 2427 i = 0; 2428 } 2429 2430 return req; 2431 } 2432 2433 void hfi1_rc_rcv_tid_rdma_read_resp(struct hfi1_packet *packet) 2434 { 2435 /* HANDLER FOR TID RDMA READ RESPONSE packet (Requestor side */ 2436 2437 /* 2438 * 1. Find matching SWQE 2439 * 2. Check that the entire segment has been read. 2440 * 3. Remove HFI1_S_WAIT_TID_RESP from s_flags. 2441 * 4. Free the TID flow resources. 2442 * 5. Kick the send engine (hfi1_schedule_send()) 2443 */ 2444 struct ib_other_headers *ohdr = packet->ohdr; 2445 struct rvt_qp *qp = packet->qp; 2446 struct hfi1_qp_priv *priv = qp->priv; 2447 struct hfi1_ctxtdata *rcd = packet->rcd; 2448 struct tid_rdma_request *req; 2449 struct tid_rdma_flow *flow; 2450 u32 opcode, aeth; 2451 bool fecn; 2452 unsigned long flags; 2453 u32 kpsn, ipsn; 2454 2455 trace_hfi1_sender_rcv_tid_read_resp(qp); 2456 fecn = process_ecn(qp, packet); 2457 kpsn = mask_psn(be32_to_cpu(ohdr->bth[2])); 2458 aeth = be32_to_cpu(ohdr->u.tid_rdma.r_rsp.aeth); 2459 opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff; 2460 2461 spin_lock_irqsave(&qp->s_lock, flags); 2462 ipsn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.r_rsp.verbs_psn)); 2463 req = find_tid_request(qp, ipsn, IB_WR_TID_RDMA_READ); 2464 if (unlikely(!req)) 2465 goto ack_op_err; 2466 2467 flow = &req->flows[req->clear_tail]; 2468 /* When header suppression is disabled */ 2469 if (cmp_psn(ipsn, flow->flow_state.ib_lpsn)) { 2470 update_r_next_psn_fecn(packet, priv, rcd, flow, fecn); 2471 2472 if (cmp_psn(kpsn, flow->flow_state.r_next_psn)) 2473 goto ack_done; 2474 flow->flow_state.r_next_psn = mask_psn(kpsn + 1); 2475 /* 2476 * Copy the payload to destination buffer if this packet is 2477 * delivered as an eager packet due to RSM rule and FECN. 2478 * The RSM rule selects FECN bit in BTH and SH bit in 2479 * KDETH header and therefore will not match the last 2480 * packet of each segment that has SH bit cleared. 2481 */ 2482 if (fecn && packet->etype == RHF_RCV_TYPE_EAGER) { 2483 struct rvt_sge_state ss; 2484 u32 len; 2485 u32 tlen = packet->tlen; 2486 u16 hdrsize = packet->hlen; 2487 u8 pad = packet->pad; 2488 u8 extra_bytes = pad + packet->extra_byte + 2489 (SIZE_OF_CRC << 2); 2490 u32 pmtu = qp->pmtu; 2491 2492 if (unlikely(tlen != (hdrsize + pmtu + extra_bytes))) 2493 goto ack_op_err; 2494 len = restart_sge(&ss, req->e.swqe, ipsn, pmtu); 2495 if (unlikely(len < pmtu)) 2496 goto ack_op_err; 2497 rvt_copy_sge(qp, &ss, packet->payload, pmtu, false, 2498 false); 2499 /* Raise the sw sequence check flag for next packet */ 2500 priv->s_flags |= HFI1_R_TID_SW_PSN; 2501 } 2502 2503 goto ack_done; 2504 } 2505 flow->flow_state.r_next_psn = mask_psn(kpsn + 1); 2506 req->ack_pending--; 2507 priv->pending_tid_r_segs--; 2508 qp->s_num_rd_atomic--; 2509 if ((qp->s_flags & RVT_S_WAIT_FENCE) && 2510 !qp->s_num_rd_atomic) { 2511 qp->s_flags &= ~(RVT_S_WAIT_FENCE | 2512 RVT_S_WAIT_ACK); 2513 hfi1_schedule_send(qp); 2514 } 2515 if (qp->s_flags & RVT_S_WAIT_RDMAR) { 2516 qp->s_flags &= ~(RVT_S_WAIT_RDMAR | RVT_S_WAIT_ACK); 2517 hfi1_schedule_send(qp); 2518 } 2519 2520 trace_hfi1_ack(qp, ipsn); 2521 trace_hfi1_tid_req_rcv_read_resp(qp, 0, req->e.swqe->wr.opcode, 2522 req->e.swqe->psn, req->e.swqe->lpsn, 2523 req); 2524 trace_hfi1_tid_flow_rcv_read_resp(qp, req->clear_tail, flow); 2525 2526 /* Release the tid resources */ 2527 hfi1_kern_exp_rcv_clear(req); 2528 2529 if (!do_rc_ack(qp, aeth, ipsn, opcode, 0, rcd)) 2530 goto ack_done; 2531 2532 /* If not done yet, build next read request */ 2533 if (++req->comp_seg >= req->total_segs) { 2534 priv->tid_r_comp++; 2535 req->state = TID_REQUEST_COMPLETE; 2536 } 2537 2538 /* 2539 * Clear the hw flow under two conditions: 2540 * 1. This request is a sync point and it is complete; 2541 * 2. Current request is completed and there are no more requests. 2542 */ 2543 if ((req->state == TID_REQUEST_SYNC && 2544 req->comp_seg == req->cur_seg) || 2545 priv->tid_r_comp == priv->tid_r_reqs) { 2546 hfi1_kern_clear_hw_flow(priv->rcd, qp); 2547 priv->s_flags &= ~HFI1_R_TID_SW_PSN; 2548 if (req->state == TID_REQUEST_SYNC) 2549 req->state = TID_REQUEST_ACTIVE; 2550 } 2551 2552 hfi1_schedule_send(qp); 2553 goto ack_done; 2554 2555 ack_op_err: 2556 /* 2557 * The test indicates that the send engine has finished its cleanup 2558 * after sending the request and it's now safe to put the QP into error 2559 * state. However, if the wqe queue is empty (qp->s_acked == qp->s_tail 2560 * == qp->s_head), it would be unsafe to complete the wqe pointed by 2561 * qp->s_acked here. Putting the qp into error state will safely flush 2562 * all remaining requests. 2563 */ 2564 if (qp->s_last == qp->s_acked) 2565 rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR); 2566 2567 ack_done: 2568 spin_unlock_irqrestore(&qp->s_lock, flags); 2569 } 2570 2571 void hfi1_kern_read_tid_flow_free(struct rvt_qp *qp) 2572 __must_hold(&qp->s_lock) 2573 { 2574 u32 n = qp->s_acked; 2575 struct rvt_swqe *wqe; 2576 struct tid_rdma_request *req; 2577 struct hfi1_qp_priv *priv = qp->priv; 2578 2579 lockdep_assert_held(&qp->s_lock); 2580 /* Free any TID entries */ 2581 while (n != qp->s_tail) { 2582 wqe = rvt_get_swqe_ptr(qp, n); 2583 if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) { 2584 req = wqe_to_tid_req(wqe); 2585 hfi1_kern_exp_rcv_clear_all(req); 2586 } 2587 2588 if (++n == qp->s_size) 2589 n = 0; 2590 } 2591 /* Free flow */ 2592 hfi1_kern_clear_hw_flow(priv->rcd, qp); 2593 } 2594 2595 static bool tid_rdma_tid_err(struct hfi1_packet *packet, u8 rcv_type) 2596 { 2597 struct rvt_qp *qp = packet->qp; 2598 2599 if (rcv_type >= RHF_RCV_TYPE_IB) 2600 goto done; 2601 2602 spin_lock(&qp->s_lock); 2603 2604 /* 2605 * We've ran out of space in the eager buffer. 2606 * Eagerly received KDETH packets which require space in the 2607 * Eager buffer (packet that have payload) are TID RDMA WRITE 2608 * response packets. In this case, we have to re-transmit the 2609 * TID RDMA WRITE request. 2610 */ 2611 if (rcv_type == RHF_RCV_TYPE_EAGER) { 2612 hfi1_restart_rc(qp, qp->s_last_psn + 1, 1); 2613 hfi1_schedule_send(qp); 2614 } 2615 2616 /* Since no payload is delivered, just drop the packet */ 2617 spin_unlock(&qp->s_lock); 2618 done: 2619 return true; 2620 } 2621 2622 static void restart_tid_rdma_read_req(struct hfi1_ctxtdata *rcd, 2623 struct rvt_qp *qp, struct rvt_swqe *wqe) 2624 { 2625 struct tid_rdma_request *req; 2626 struct tid_rdma_flow *flow; 2627 2628 /* Start from the right segment */ 2629 qp->r_flags |= RVT_R_RDMAR_SEQ; 2630 req = wqe_to_tid_req(wqe); 2631 flow = &req->flows[req->clear_tail]; 2632 hfi1_restart_rc(qp, flow->flow_state.ib_spsn, 0); 2633 if (list_empty(&qp->rspwait)) { 2634 qp->r_flags |= RVT_R_RSP_SEND; 2635 rvt_get_qp(qp); 2636 list_add_tail(&qp->rspwait, &rcd->qp_wait_list); 2637 } 2638 } 2639 2640 /* 2641 * Handle the KDETH eflags for TID RDMA READ response. 2642 * 2643 * Return true if the last packet for a segment has been received and it is 2644 * time to process the response normally; otherwise, return true. 2645 * 2646 * The caller must hold the packet->qp->r_lock and the rcu_read_lock. 2647 */ 2648 static bool handle_read_kdeth_eflags(struct hfi1_ctxtdata *rcd, 2649 struct hfi1_packet *packet, u8 rcv_type, 2650 u8 rte, u32 psn, u32 ibpsn) 2651 __must_hold(&packet->qp->r_lock) __must_hold(RCU) 2652 { 2653 struct hfi1_pportdata *ppd = rcd->ppd; 2654 struct hfi1_devdata *dd = ppd->dd; 2655 struct hfi1_ibport *ibp; 2656 struct rvt_swqe *wqe; 2657 struct tid_rdma_request *req; 2658 struct tid_rdma_flow *flow; 2659 u32 ack_psn; 2660 struct rvt_qp *qp = packet->qp; 2661 struct hfi1_qp_priv *priv = qp->priv; 2662 bool ret = true; 2663 int diff = 0; 2664 u32 fpsn; 2665 2666 lockdep_assert_held(&qp->r_lock); 2667 trace_hfi1_rsp_read_kdeth_eflags(qp, ibpsn); 2668 trace_hfi1_sender_read_kdeth_eflags(qp); 2669 trace_hfi1_tid_read_sender_kdeth_eflags(qp, 0); 2670 spin_lock(&qp->s_lock); 2671 /* If the psn is out of valid range, drop the packet */ 2672 if (cmp_psn(ibpsn, qp->s_last_psn) < 0 || 2673 cmp_psn(ibpsn, qp->s_psn) > 0) 2674 goto s_unlock; 2675 2676 /* 2677 * Note that NAKs implicitly ACK outstanding SEND and RDMA write 2678 * requests and implicitly NAK RDMA read and atomic requests issued 2679 * before the NAK'ed request. 2680 */ 2681 ack_psn = ibpsn - 1; 2682 wqe = rvt_get_swqe_ptr(qp, qp->s_acked); 2683 ibp = to_iport(qp->ibqp.device, qp->port_num); 2684 2685 /* Complete WQEs that the PSN finishes. */ 2686 while ((int)delta_psn(ack_psn, wqe->lpsn) >= 0) { 2687 /* 2688 * If this request is a RDMA read or atomic, and the NACK is 2689 * for a later operation, this NACK NAKs the RDMA read or 2690 * atomic. 2691 */ 2692 if (wqe->wr.opcode == IB_WR_RDMA_READ || 2693 wqe->wr.opcode == IB_WR_TID_RDMA_READ || 2694 wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP || 2695 wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) { 2696 /* Retry this request. */ 2697 if (!(qp->r_flags & RVT_R_RDMAR_SEQ)) { 2698 qp->r_flags |= RVT_R_RDMAR_SEQ; 2699 if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) { 2700 restart_tid_rdma_read_req(rcd, qp, 2701 wqe); 2702 } else { 2703 hfi1_restart_rc(qp, qp->s_last_psn + 1, 2704 0); 2705 if (list_empty(&qp->rspwait)) { 2706 qp->r_flags |= RVT_R_RSP_SEND; 2707 rvt_get_qp(qp); 2708 list_add_tail(/* wait */ 2709 &qp->rspwait, 2710 &rcd->qp_wait_list); 2711 } 2712 } 2713 } 2714 /* 2715 * No need to process the NAK since we are 2716 * restarting an earlier request. 2717 */ 2718 break; 2719 } 2720 2721 wqe = do_rc_completion(qp, wqe, ibp); 2722 if (qp->s_acked == qp->s_tail) 2723 goto s_unlock; 2724 } 2725 2726 if (qp->s_acked == qp->s_tail) 2727 goto s_unlock; 2728 2729 /* Handle the eflags for the request */ 2730 if (wqe->wr.opcode != IB_WR_TID_RDMA_READ) 2731 goto s_unlock; 2732 2733 req = wqe_to_tid_req(wqe); 2734 trace_hfi1_tid_req_read_kdeth_eflags(qp, 0, wqe->wr.opcode, wqe->psn, 2735 wqe->lpsn, req); 2736 switch (rcv_type) { 2737 case RHF_RCV_TYPE_EXPECTED: 2738 switch (rte) { 2739 case RHF_RTE_EXPECTED_FLOW_SEQ_ERR: 2740 /* 2741 * On the first occurrence of a Flow Sequence error, 2742 * the flag TID_FLOW_SW_PSN is set. 2743 * 2744 * After that, the flow is *not* reprogrammed and the 2745 * protocol falls back to SW PSN checking. This is done 2746 * to prevent continuous Flow Sequence errors for any 2747 * packets that could be still in the fabric. 2748 */ 2749 flow = &req->flows[req->clear_tail]; 2750 trace_hfi1_tid_flow_read_kdeth_eflags(qp, 2751 req->clear_tail, 2752 flow); 2753 if (priv->s_flags & HFI1_R_TID_SW_PSN) { 2754 diff = cmp_psn(psn, 2755 flow->flow_state.r_next_psn); 2756 if (diff > 0) { 2757 /* Drop the packet.*/ 2758 goto s_unlock; 2759 } else if (diff < 0) { 2760 /* 2761 * If a response packet for a restarted 2762 * request has come back, reset the 2763 * restart flag. 2764 */ 2765 if (qp->r_flags & RVT_R_RDMAR_SEQ) 2766 qp->r_flags &= 2767 ~RVT_R_RDMAR_SEQ; 2768 2769 /* Drop the packet.*/ 2770 goto s_unlock; 2771 } 2772 2773 /* 2774 * If SW PSN verification is successful and 2775 * this is the last packet in the segment, tell 2776 * the caller to process it as a normal packet. 2777 */ 2778 fpsn = full_flow_psn(flow, 2779 flow->flow_state.lpsn); 2780 if (cmp_psn(fpsn, psn) == 0) { 2781 ret = false; 2782 if (qp->r_flags & RVT_R_RDMAR_SEQ) 2783 qp->r_flags &= 2784 ~RVT_R_RDMAR_SEQ; 2785 } 2786 flow->flow_state.r_next_psn = 2787 mask_psn(psn + 1); 2788 } else { 2789 u32 last_psn; 2790 2791 last_psn = read_r_next_psn(dd, rcd->ctxt, 2792 flow->idx); 2793 flow->flow_state.r_next_psn = last_psn; 2794 priv->s_flags |= HFI1_R_TID_SW_PSN; 2795 /* 2796 * If no request has been restarted yet, 2797 * restart the current one. 2798 */ 2799 if (!(qp->r_flags & RVT_R_RDMAR_SEQ)) 2800 restart_tid_rdma_read_req(rcd, qp, 2801 wqe); 2802 } 2803 2804 break; 2805 2806 case RHF_RTE_EXPECTED_FLOW_GEN_ERR: 2807 /* 2808 * Since the TID flow is able to ride through 2809 * generation mismatch, drop this stale packet. 2810 */ 2811 break; 2812 2813 default: 2814 break; 2815 } 2816 break; 2817 2818 case RHF_RCV_TYPE_ERROR: 2819 switch (rte) { 2820 case RHF_RTE_ERROR_OP_CODE_ERR: 2821 case RHF_RTE_ERROR_KHDR_MIN_LEN_ERR: 2822 case RHF_RTE_ERROR_KHDR_HCRC_ERR: 2823 case RHF_RTE_ERROR_KHDR_KVER_ERR: 2824 case RHF_RTE_ERROR_CONTEXT_ERR: 2825 case RHF_RTE_ERROR_KHDR_TID_ERR: 2826 default: 2827 break; 2828 } 2829 default: 2830 break; 2831 } 2832 s_unlock: 2833 spin_unlock(&qp->s_lock); 2834 return ret; 2835 } 2836 2837 bool hfi1_handle_kdeth_eflags(struct hfi1_ctxtdata *rcd, 2838 struct hfi1_pportdata *ppd, 2839 struct hfi1_packet *packet) 2840 { 2841 struct hfi1_ibport *ibp = &ppd->ibport_data; 2842 struct hfi1_devdata *dd = ppd->dd; 2843 struct rvt_dev_info *rdi = &dd->verbs_dev.rdi; 2844 u8 rcv_type = rhf_rcv_type(packet->rhf); 2845 u8 rte = rhf_rcv_type_err(packet->rhf); 2846 struct ib_header *hdr = packet->hdr; 2847 struct ib_other_headers *ohdr = NULL; 2848 int lnh = be16_to_cpu(hdr->lrh[0]) & 3; 2849 u16 lid = be16_to_cpu(hdr->lrh[1]); 2850 u8 opcode; 2851 u32 qp_num, psn, ibpsn; 2852 struct rvt_qp *qp; 2853 struct hfi1_qp_priv *qpriv; 2854 unsigned long flags; 2855 bool ret = true; 2856 struct rvt_ack_entry *e; 2857 struct tid_rdma_request *req; 2858 struct tid_rdma_flow *flow; 2859 int diff = 0; 2860 2861 trace_hfi1_msg_handle_kdeth_eflags(NULL, "Kdeth error: rhf ", 2862 packet->rhf); 2863 if (packet->rhf & RHF_ICRC_ERR) 2864 return ret; 2865 2866 packet->ohdr = &hdr->u.oth; 2867 ohdr = packet->ohdr; 2868 trace_input_ibhdr(rcd->dd, packet, !!(rhf_dc_info(packet->rhf))); 2869 2870 /* Get the destination QP number. */ 2871 qp_num = be32_to_cpu(ohdr->u.tid_rdma.r_rsp.verbs_qp) & 2872 RVT_QPN_MASK; 2873 if (lid >= be16_to_cpu(IB_MULTICAST_LID_BASE)) 2874 goto drop; 2875 2876 psn = mask_psn(be32_to_cpu(ohdr->bth[2])); 2877 opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff; 2878 2879 rcu_read_lock(); 2880 qp = rvt_lookup_qpn(rdi, &ibp->rvp, qp_num); 2881 if (!qp) 2882 goto rcu_unlock; 2883 2884 packet->qp = qp; 2885 2886 /* Check for valid receive state. */ 2887 spin_lock_irqsave(&qp->r_lock, flags); 2888 if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) { 2889 ibp->rvp.n_pkt_drops++; 2890 goto r_unlock; 2891 } 2892 2893 if (packet->rhf & RHF_TID_ERR) { 2894 /* For TIDERR and RC QPs preemptively schedule a NAK */ 2895 u32 tlen = rhf_pkt_len(packet->rhf); /* in bytes */ 2896 2897 /* Sanity check packet */ 2898 if (tlen < 24) 2899 goto r_unlock; 2900 2901 /* 2902 * Check for GRH. We should never get packets with GRH in this 2903 * path. 2904 */ 2905 if (lnh == HFI1_LRH_GRH) 2906 goto r_unlock; 2907 2908 if (tid_rdma_tid_err(packet, rcv_type)) 2909 goto r_unlock; 2910 } 2911 2912 /* handle TID RDMA READ */ 2913 if (opcode == TID_OP(READ_RESP)) { 2914 ibpsn = be32_to_cpu(ohdr->u.tid_rdma.r_rsp.verbs_psn); 2915 ibpsn = mask_psn(ibpsn); 2916 ret = handle_read_kdeth_eflags(rcd, packet, rcv_type, rte, psn, 2917 ibpsn); 2918 goto r_unlock; 2919 } 2920 2921 /* 2922 * qp->s_tail_ack_queue points to the rvt_ack_entry currently being 2923 * processed. These a completed sequentially so we can be sure that 2924 * the pointer will not change until the entire request has completed. 2925 */ 2926 spin_lock(&qp->s_lock); 2927 qpriv = qp->priv; 2928 if (qpriv->r_tid_tail == HFI1_QP_WQE_INVALID || 2929 qpriv->r_tid_tail == qpriv->r_tid_head) 2930 goto unlock; 2931 e = &qp->s_ack_queue[qpriv->r_tid_tail]; 2932 if (e->opcode != TID_OP(WRITE_REQ)) 2933 goto unlock; 2934 req = ack_to_tid_req(e); 2935 if (req->comp_seg == req->cur_seg) 2936 goto unlock; 2937 flow = &req->flows[req->clear_tail]; 2938 trace_hfi1_eflags_err_write(qp, rcv_type, rte, psn); 2939 trace_hfi1_rsp_handle_kdeth_eflags(qp, psn); 2940 trace_hfi1_tid_write_rsp_handle_kdeth_eflags(qp); 2941 trace_hfi1_tid_req_handle_kdeth_eflags(qp, 0, e->opcode, e->psn, 2942 e->lpsn, req); 2943 trace_hfi1_tid_flow_handle_kdeth_eflags(qp, req->clear_tail, flow); 2944 2945 switch (rcv_type) { 2946 case RHF_RCV_TYPE_EXPECTED: 2947 switch (rte) { 2948 case RHF_RTE_EXPECTED_FLOW_SEQ_ERR: 2949 if (!(qpriv->s_flags & HFI1_R_TID_SW_PSN)) { 2950 qpriv->s_flags |= HFI1_R_TID_SW_PSN; 2951 flow->flow_state.r_next_psn = 2952 read_r_next_psn(dd, rcd->ctxt, 2953 flow->idx); 2954 qpriv->r_next_psn_kdeth = 2955 flow->flow_state.r_next_psn; 2956 goto nak_psn; 2957 } else { 2958 /* 2959 * If the received PSN does not match the next 2960 * expected PSN, NAK the packet. 2961 * However, only do that if we know that the a 2962 * NAK has already been sent. Otherwise, this 2963 * mismatch could be due to packets that were 2964 * already in flight. 2965 */ 2966 diff = cmp_psn(psn, 2967 flow->flow_state.r_next_psn); 2968 if (diff > 0) 2969 goto nak_psn; 2970 else if (diff < 0) 2971 break; 2972 2973 qpriv->s_nak_state = 0; 2974 /* 2975 * If SW PSN verification is successful and this 2976 * is the last packet in the segment, tell the 2977 * caller to process it as a normal packet. 2978 */ 2979 if (psn == full_flow_psn(flow, 2980 flow->flow_state.lpsn)) 2981 ret = false; 2982 flow->flow_state.r_next_psn = 2983 mask_psn(psn + 1); 2984 qpriv->r_next_psn_kdeth = 2985 flow->flow_state.r_next_psn; 2986 } 2987 break; 2988 2989 case RHF_RTE_EXPECTED_FLOW_GEN_ERR: 2990 goto nak_psn; 2991 2992 default: 2993 break; 2994 } 2995 break; 2996 2997 case RHF_RCV_TYPE_ERROR: 2998 switch (rte) { 2999 case RHF_RTE_ERROR_OP_CODE_ERR: 3000 case RHF_RTE_ERROR_KHDR_MIN_LEN_ERR: 3001 case RHF_RTE_ERROR_KHDR_HCRC_ERR: 3002 case RHF_RTE_ERROR_KHDR_KVER_ERR: 3003 case RHF_RTE_ERROR_CONTEXT_ERR: 3004 case RHF_RTE_ERROR_KHDR_TID_ERR: 3005 default: 3006 break; 3007 } 3008 default: 3009 break; 3010 } 3011 3012 unlock: 3013 spin_unlock(&qp->s_lock); 3014 r_unlock: 3015 spin_unlock_irqrestore(&qp->r_lock, flags); 3016 rcu_unlock: 3017 rcu_read_unlock(); 3018 drop: 3019 return ret; 3020 nak_psn: 3021 ibp->rvp.n_rc_seqnak++; 3022 if (!qpriv->s_nak_state) { 3023 qpriv->s_nak_state = IB_NAK_PSN_ERROR; 3024 /* We are NAK'ing the next expected PSN */ 3025 qpriv->s_nak_psn = mask_psn(flow->flow_state.r_next_psn); 3026 tid_rdma_trigger_ack(qp); 3027 } 3028 goto unlock; 3029 } 3030 3031 /* 3032 * "Rewind" the TID request information. 3033 * This means that we reset the state back to ACTIVE, 3034 * find the proper flow, set the flow index to that flow, 3035 * and reset the flow information. 3036 */ 3037 void hfi1_tid_rdma_restart_req(struct rvt_qp *qp, struct rvt_swqe *wqe, 3038 u32 *bth2) 3039 { 3040 struct tid_rdma_request *req = wqe_to_tid_req(wqe); 3041 struct tid_rdma_flow *flow; 3042 struct hfi1_qp_priv *qpriv = qp->priv; 3043 int diff, delta_pkts; 3044 u32 tididx = 0, i; 3045 u16 fidx; 3046 3047 if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) { 3048 *bth2 = mask_psn(qp->s_psn); 3049 flow = find_flow_ib(req, *bth2, &fidx); 3050 if (!flow) { 3051 trace_hfi1_msg_tid_restart_req(/* msg */ 3052 qp, "!!!!!! Could not find flow to restart: bth2 ", 3053 (u64)*bth2); 3054 trace_hfi1_tid_req_restart_req(qp, 0, wqe->wr.opcode, 3055 wqe->psn, wqe->lpsn, 3056 req); 3057 return; 3058 } 3059 } else { 3060 fidx = req->acked_tail; 3061 flow = &req->flows[fidx]; 3062 *bth2 = mask_psn(req->r_ack_psn); 3063 } 3064 3065 if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) 3066 delta_pkts = delta_psn(*bth2, flow->flow_state.ib_spsn); 3067 else 3068 delta_pkts = delta_psn(*bth2, 3069 full_flow_psn(flow, 3070 flow->flow_state.spsn)); 3071 3072 trace_hfi1_tid_flow_restart_req(qp, fidx, flow); 3073 diff = delta_pkts + flow->resync_npkts; 3074 3075 flow->sent = 0; 3076 flow->pkt = 0; 3077 flow->tid_idx = 0; 3078 flow->tid_offset = 0; 3079 if (diff) { 3080 for (tididx = 0; tididx < flow->tidcnt; tididx++) { 3081 u32 tidentry = flow->tid_entry[tididx], tidlen, 3082 tidnpkts, npkts; 3083 3084 flow->tid_offset = 0; 3085 tidlen = EXP_TID_GET(tidentry, LEN) * PAGE_SIZE; 3086 tidnpkts = rvt_div_round_up_mtu(qp, tidlen); 3087 npkts = min_t(u32, diff, tidnpkts); 3088 flow->pkt += npkts; 3089 flow->sent += (npkts == tidnpkts ? tidlen : 3090 npkts * qp->pmtu); 3091 flow->tid_offset += npkts * qp->pmtu; 3092 diff -= npkts; 3093 if (!diff) 3094 break; 3095 } 3096 } 3097 if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE) { 3098 rvt_skip_sge(&qpriv->tid_ss, (req->cur_seg * req->seg_len) + 3099 flow->sent, 0); 3100 /* 3101 * Packet PSN is based on flow_state.spsn + flow->pkt. However, 3102 * during a RESYNC, the generation is incremented and the 3103 * sequence is reset to 0. Since we've adjusted the npkts in the 3104 * flow and the SGE has been sufficiently advanced, we have to 3105 * adjust flow->pkt in order to calculate the correct PSN. 3106 */ 3107 flow->pkt -= flow->resync_npkts; 3108 } 3109 3110 if (flow->tid_offset == 3111 EXP_TID_GET(flow->tid_entry[tididx], LEN) * PAGE_SIZE) { 3112 tididx++; 3113 flow->tid_offset = 0; 3114 } 3115 flow->tid_idx = tididx; 3116 if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) 3117 /* Move flow_idx to correct index */ 3118 req->flow_idx = fidx; 3119 else 3120 req->clear_tail = fidx; 3121 3122 trace_hfi1_tid_flow_restart_req(qp, fidx, flow); 3123 trace_hfi1_tid_req_restart_req(qp, 0, wqe->wr.opcode, wqe->psn, 3124 wqe->lpsn, req); 3125 req->state = TID_REQUEST_ACTIVE; 3126 if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE) { 3127 /* Reset all the flows that we are going to resend */ 3128 fidx = CIRC_NEXT(fidx, MAX_FLOWS); 3129 i = qpriv->s_tid_tail; 3130 do { 3131 for (; CIRC_CNT(req->setup_head, fidx, MAX_FLOWS); 3132 fidx = CIRC_NEXT(fidx, MAX_FLOWS)) { 3133 req->flows[fidx].sent = 0; 3134 req->flows[fidx].pkt = 0; 3135 req->flows[fidx].tid_idx = 0; 3136 req->flows[fidx].tid_offset = 0; 3137 req->flows[fidx].resync_npkts = 0; 3138 } 3139 if (i == qpriv->s_tid_cur) 3140 break; 3141 do { 3142 i = (++i == qp->s_size ? 0 : i); 3143 wqe = rvt_get_swqe_ptr(qp, i); 3144 } while (wqe->wr.opcode != IB_WR_TID_RDMA_WRITE); 3145 req = wqe_to_tid_req(wqe); 3146 req->cur_seg = req->ack_seg; 3147 fidx = req->acked_tail; 3148 /* Pull req->clear_tail back */ 3149 req->clear_tail = fidx; 3150 } while (1); 3151 } 3152 } 3153 3154 void hfi1_qp_kern_exp_rcv_clear_all(struct rvt_qp *qp) 3155 { 3156 int i, ret; 3157 struct hfi1_qp_priv *qpriv = qp->priv; 3158 struct tid_flow_state *fs; 3159 3160 if (qp->ibqp.qp_type != IB_QPT_RC || !HFI1_CAP_IS_KSET(TID_RDMA)) 3161 return; 3162 3163 /* 3164 * First, clear the flow to help prevent any delayed packets from 3165 * being delivered. 3166 */ 3167 fs = &qpriv->flow_state; 3168 if (fs->index != RXE_NUM_TID_FLOWS) 3169 hfi1_kern_clear_hw_flow(qpriv->rcd, qp); 3170 3171 for (i = qp->s_acked; i != qp->s_head;) { 3172 struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, i); 3173 3174 if (++i == qp->s_size) 3175 i = 0; 3176 /* Free only locally allocated TID entries */ 3177 if (wqe->wr.opcode != IB_WR_TID_RDMA_READ) 3178 continue; 3179 do { 3180 struct hfi1_swqe_priv *priv = wqe->priv; 3181 3182 ret = hfi1_kern_exp_rcv_clear(&priv->tid_req); 3183 } while (!ret); 3184 } 3185 for (i = qp->s_acked_ack_queue; i != qp->r_head_ack_queue;) { 3186 struct rvt_ack_entry *e = &qp->s_ack_queue[i]; 3187 3188 if (++i == rvt_max_atomic(ib_to_rvt(qp->ibqp.device))) 3189 i = 0; 3190 /* Free only locally allocated TID entries */ 3191 if (e->opcode != TID_OP(WRITE_REQ)) 3192 continue; 3193 do { 3194 struct hfi1_ack_priv *priv = e->priv; 3195 3196 ret = hfi1_kern_exp_rcv_clear(&priv->tid_req); 3197 } while (!ret); 3198 } 3199 } 3200 3201 bool hfi1_tid_rdma_wqe_interlock(struct rvt_qp *qp, struct rvt_swqe *wqe) 3202 { 3203 struct rvt_swqe *prev; 3204 struct hfi1_qp_priv *priv = qp->priv; 3205 u32 s_prev; 3206 struct tid_rdma_request *req; 3207 3208 s_prev = (qp->s_cur == 0 ? qp->s_size : qp->s_cur) - 1; 3209 prev = rvt_get_swqe_ptr(qp, s_prev); 3210 3211 switch (wqe->wr.opcode) { 3212 case IB_WR_SEND: 3213 case IB_WR_SEND_WITH_IMM: 3214 case IB_WR_SEND_WITH_INV: 3215 case IB_WR_ATOMIC_CMP_AND_SWP: 3216 case IB_WR_ATOMIC_FETCH_AND_ADD: 3217 case IB_WR_RDMA_WRITE: 3218 case IB_WR_RDMA_WRITE_WITH_IMM: 3219 switch (prev->wr.opcode) { 3220 case IB_WR_TID_RDMA_WRITE: 3221 req = wqe_to_tid_req(prev); 3222 if (req->ack_seg != req->total_segs) 3223 goto interlock; 3224 default: 3225 break; 3226 } 3227 break; 3228 case IB_WR_RDMA_READ: 3229 if (prev->wr.opcode != IB_WR_TID_RDMA_WRITE) 3230 break; 3231 fallthrough; 3232 case IB_WR_TID_RDMA_READ: 3233 switch (prev->wr.opcode) { 3234 case IB_WR_RDMA_READ: 3235 if (qp->s_acked != qp->s_cur) 3236 goto interlock; 3237 break; 3238 case IB_WR_TID_RDMA_WRITE: 3239 req = wqe_to_tid_req(prev); 3240 if (req->ack_seg != req->total_segs) 3241 goto interlock; 3242 default: 3243 break; 3244 } 3245 default: 3246 break; 3247 } 3248 return false; 3249 3250 interlock: 3251 priv->s_flags |= HFI1_S_TID_WAIT_INTERLCK; 3252 return true; 3253 } 3254 3255 /* Does @sge meet the alignment requirements for tid rdma? */ 3256 static inline bool hfi1_check_sge_align(struct rvt_qp *qp, 3257 struct rvt_sge *sge, int num_sge) 3258 { 3259 int i; 3260 3261 for (i = 0; i < num_sge; i++, sge++) { 3262 trace_hfi1_sge_check_align(qp, i, sge); 3263 if ((u64)sge->vaddr & ~PAGE_MASK || 3264 sge->sge_length & ~PAGE_MASK) 3265 return false; 3266 } 3267 return true; 3268 } 3269 3270 void setup_tid_rdma_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe) 3271 { 3272 struct hfi1_qp_priv *qpriv = (struct hfi1_qp_priv *)qp->priv; 3273 struct hfi1_swqe_priv *priv = wqe->priv; 3274 struct tid_rdma_params *remote; 3275 enum ib_wr_opcode new_opcode; 3276 bool do_tid_rdma = false; 3277 struct hfi1_pportdata *ppd = qpriv->rcd->ppd; 3278 3279 if ((rdma_ah_get_dlid(&qp->remote_ah_attr) & ~((1 << ppd->lmc) - 1)) == 3280 ppd->lid) 3281 return; 3282 if (qpriv->hdr_type != HFI1_PKT_TYPE_9B) 3283 return; 3284 3285 rcu_read_lock(); 3286 remote = rcu_dereference(qpriv->tid_rdma.remote); 3287 /* 3288 * If TID RDMA is disabled by the negotiation, don't 3289 * use it. 3290 */ 3291 if (!remote) 3292 goto exit; 3293 3294 if (wqe->wr.opcode == IB_WR_RDMA_READ) { 3295 if (hfi1_check_sge_align(qp, &wqe->sg_list[0], 3296 wqe->wr.num_sge)) { 3297 new_opcode = IB_WR_TID_RDMA_READ; 3298 do_tid_rdma = true; 3299 } 3300 } else if (wqe->wr.opcode == IB_WR_RDMA_WRITE) { 3301 /* 3302 * TID RDMA is enabled for this RDMA WRITE request iff: 3303 * 1. The remote address is page-aligned, 3304 * 2. The length is larger than the minimum segment size, 3305 * 3. The length is page-multiple. 3306 */ 3307 if (!(wqe->rdma_wr.remote_addr & ~PAGE_MASK) && 3308 !(wqe->length & ~PAGE_MASK)) { 3309 new_opcode = IB_WR_TID_RDMA_WRITE; 3310 do_tid_rdma = true; 3311 } 3312 } 3313 3314 if (do_tid_rdma) { 3315 if (hfi1_kern_exp_rcv_alloc_flows(&priv->tid_req, GFP_ATOMIC)) 3316 goto exit; 3317 wqe->wr.opcode = new_opcode; 3318 priv->tid_req.seg_len = 3319 min_t(u32, remote->max_len, wqe->length); 3320 priv->tid_req.total_segs = 3321 DIV_ROUND_UP(wqe->length, priv->tid_req.seg_len); 3322 /* Compute the last PSN of the request */ 3323 wqe->lpsn = wqe->psn; 3324 if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) { 3325 priv->tid_req.n_flows = remote->max_read; 3326 qpriv->tid_r_reqs++; 3327 wqe->lpsn += rvt_div_round_up_mtu(qp, wqe->length) - 1; 3328 } else { 3329 wqe->lpsn += priv->tid_req.total_segs - 1; 3330 atomic_inc(&qpriv->n_requests); 3331 } 3332 3333 priv->tid_req.cur_seg = 0; 3334 priv->tid_req.comp_seg = 0; 3335 priv->tid_req.ack_seg = 0; 3336 priv->tid_req.state = TID_REQUEST_INACTIVE; 3337 /* 3338 * Reset acked_tail. 3339 * TID RDMA READ does not have ACKs so it does not 3340 * update the pointer. We have to reset it so TID RDMA 3341 * WRITE does not get confused. 3342 */ 3343 priv->tid_req.acked_tail = priv->tid_req.setup_head; 3344 trace_hfi1_tid_req_setup_tid_wqe(qp, 1, wqe->wr.opcode, 3345 wqe->psn, wqe->lpsn, 3346 &priv->tid_req); 3347 } 3348 exit: 3349 rcu_read_unlock(); 3350 } 3351 3352 /* TID RDMA WRITE functions */ 3353 3354 u32 hfi1_build_tid_rdma_write_req(struct rvt_qp *qp, struct rvt_swqe *wqe, 3355 struct ib_other_headers *ohdr, 3356 u32 *bth1, u32 *bth2, u32 *len) 3357 { 3358 struct hfi1_qp_priv *qpriv = qp->priv; 3359 struct tid_rdma_request *req = wqe_to_tid_req(wqe); 3360 struct tid_rdma_params *remote; 3361 3362 rcu_read_lock(); 3363 remote = rcu_dereference(qpriv->tid_rdma.remote); 3364 /* 3365 * Set the number of flow to be used based on negotiated 3366 * parameters. 3367 */ 3368 req->n_flows = remote->max_write; 3369 req->state = TID_REQUEST_ACTIVE; 3370 3371 KDETH_RESET(ohdr->u.tid_rdma.w_req.kdeth0, KVER, 0x1); 3372 KDETH_RESET(ohdr->u.tid_rdma.w_req.kdeth1, JKEY, remote->jkey); 3373 ohdr->u.tid_rdma.w_req.reth.vaddr = 3374 cpu_to_be64(wqe->rdma_wr.remote_addr + (wqe->length - *len)); 3375 ohdr->u.tid_rdma.w_req.reth.rkey = 3376 cpu_to_be32(wqe->rdma_wr.rkey); 3377 ohdr->u.tid_rdma.w_req.reth.length = cpu_to_be32(*len); 3378 ohdr->u.tid_rdma.w_req.verbs_qp = cpu_to_be32(qp->remote_qpn); 3379 *bth1 &= ~RVT_QPN_MASK; 3380 *bth1 |= remote->qp; 3381 qp->s_state = TID_OP(WRITE_REQ); 3382 qp->s_flags |= HFI1_S_WAIT_TID_RESP; 3383 *bth2 |= IB_BTH_REQ_ACK; 3384 *len = 0; 3385 3386 rcu_read_unlock(); 3387 return sizeof(ohdr->u.tid_rdma.w_req) / sizeof(u32); 3388 } 3389 3390 static u32 hfi1_compute_tid_rdma_flow_wt(struct rvt_qp *qp) 3391 { 3392 /* 3393 * Heuristic for computing the RNR timeout when waiting on the flow 3394 * queue. Rather than a computationaly expensive exact estimate of when 3395 * a flow will be available, we assume that if a QP is at position N in 3396 * the flow queue it has to wait approximately (N + 1) * (number of 3397 * segments between two sync points). The rationale for this is that 3398 * flows are released and recycled at each sync point. 3399 */ 3400 return (MAX_TID_FLOW_PSN * qp->pmtu) >> TID_RDMA_SEGMENT_SHIFT; 3401 } 3402 3403 static u32 position_in_queue(struct hfi1_qp_priv *qpriv, 3404 struct tid_queue *queue) 3405 { 3406 return qpriv->tid_enqueue - queue->dequeue; 3407 } 3408 3409 /* 3410 * @qp: points to rvt_qp context. 3411 * @to_seg: desired RNR timeout in segments. 3412 * Return: index of the next highest timeout in the ib_hfi1_rnr_table[] 3413 */ 3414 static u32 hfi1_compute_tid_rnr_timeout(struct rvt_qp *qp, u32 to_seg) 3415 { 3416 struct hfi1_qp_priv *qpriv = qp->priv; 3417 u64 timeout; 3418 u32 bytes_per_us; 3419 u8 i; 3420 3421 bytes_per_us = active_egress_rate(qpriv->rcd->ppd) / 8; 3422 timeout = (to_seg * TID_RDMA_MAX_SEGMENT_SIZE) / bytes_per_us; 3423 /* 3424 * Find the next highest value in the RNR table to the required 3425 * timeout. This gives the responder some padding. 3426 */ 3427 for (i = 1; i <= IB_AETH_CREDIT_MASK; i++) 3428 if (rvt_rnr_tbl_to_usec(i) >= timeout) 3429 return i; 3430 return 0; 3431 } 3432 3433 /** 3434 * Central place for resource allocation at TID write responder, 3435 * is called from write_req and write_data interrupt handlers as 3436 * well as the send thread when a queued QP is scheduled for 3437 * resource allocation. 3438 * 3439 * Iterates over (a) segments of a request and then (b) queued requests 3440 * themselves to allocate resources for up to local->max_write 3441 * segments across multiple requests. Stop allocating when we 3442 * hit a sync point, resume allocating after data packets at 3443 * sync point have been received. 3444 * 3445 * Resource allocation and sending of responses is decoupled. The 3446 * request/segment which are being allocated and sent are as follows. 3447 * Resources are allocated for: 3448 * [request: qpriv->r_tid_alloc, segment: req->alloc_seg] 3449 * The send thread sends: 3450 * [request: qp->s_tail_ack_queue, segment:req->cur_seg] 3451 */ 3452 static void hfi1_tid_write_alloc_resources(struct rvt_qp *qp, bool intr_ctx) 3453 { 3454 struct tid_rdma_request *req; 3455 struct hfi1_qp_priv *qpriv = qp->priv; 3456 struct hfi1_ctxtdata *rcd = qpriv->rcd; 3457 struct tid_rdma_params *local = &qpriv->tid_rdma.local; 3458 struct rvt_ack_entry *e; 3459 u32 npkts, to_seg; 3460 bool last; 3461 int ret = 0; 3462 3463 lockdep_assert_held(&qp->s_lock); 3464 3465 while (1) { 3466 trace_hfi1_rsp_tid_write_alloc_res(qp, 0); 3467 trace_hfi1_tid_write_rsp_alloc_res(qp); 3468 /* 3469 * Don't allocate more segments if a RNR NAK has already been 3470 * scheduled to avoid messing up qp->r_psn: the RNR NAK will 3471 * be sent only when all allocated segments have been sent. 3472 * However, if more segments are allocated before that, TID RDMA 3473 * WRITE RESP packets will be sent out for these new segments 3474 * before the RNR NAK packet. When the requester receives the 3475 * RNR NAK packet, it will restart with qp->s_last_psn + 1, 3476 * which does not match qp->r_psn and will be dropped. 3477 * Consequently, the requester will exhaust its retries and 3478 * put the qp into error state. 3479 */ 3480 if (qpriv->rnr_nak_state == TID_RNR_NAK_SEND) 3481 break; 3482 3483 /* No requests left to process */ 3484 if (qpriv->r_tid_alloc == qpriv->r_tid_head) { 3485 /* If all data has been received, clear the flow */ 3486 if (qpriv->flow_state.index < RXE_NUM_TID_FLOWS && 3487 !qpriv->alloc_w_segs) { 3488 hfi1_kern_clear_hw_flow(rcd, qp); 3489 qpriv->s_flags &= ~HFI1_R_TID_SW_PSN; 3490 } 3491 break; 3492 } 3493 3494 e = &qp->s_ack_queue[qpriv->r_tid_alloc]; 3495 if (e->opcode != TID_OP(WRITE_REQ)) 3496 goto next_req; 3497 req = ack_to_tid_req(e); 3498 trace_hfi1_tid_req_write_alloc_res(qp, 0, e->opcode, e->psn, 3499 e->lpsn, req); 3500 /* Finished allocating for all segments of this request */ 3501 if (req->alloc_seg >= req->total_segs) 3502 goto next_req; 3503 3504 /* Can allocate only a maximum of local->max_write for a QP */ 3505 if (qpriv->alloc_w_segs >= local->max_write) 3506 break; 3507 3508 /* Don't allocate at a sync point with data packets pending */ 3509 if (qpriv->sync_pt && qpriv->alloc_w_segs) 3510 break; 3511 3512 /* All data received at the sync point, continue */ 3513 if (qpriv->sync_pt && !qpriv->alloc_w_segs) { 3514 hfi1_kern_clear_hw_flow(rcd, qp); 3515 qpriv->sync_pt = false; 3516 qpriv->s_flags &= ~HFI1_R_TID_SW_PSN; 3517 } 3518 3519 /* Allocate flow if we don't have one */ 3520 if (qpriv->flow_state.index >= RXE_NUM_TID_FLOWS) { 3521 ret = hfi1_kern_setup_hw_flow(qpriv->rcd, qp); 3522 if (ret) { 3523 to_seg = hfi1_compute_tid_rdma_flow_wt(qp) * 3524 position_in_queue(qpriv, 3525 &rcd->flow_queue); 3526 break; 3527 } 3528 } 3529 3530 npkts = rvt_div_round_up_mtu(qp, req->seg_len); 3531 3532 /* 3533 * We are at a sync point if we run out of KDETH PSN space. 3534 * Last PSN of every generation is reserved for RESYNC. 3535 */ 3536 if (qpriv->flow_state.psn + npkts > MAX_TID_FLOW_PSN - 1) { 3537 qpriv->sync_pt = true; 3538 break; 3539 } 3540 3541 /* 3542 * If overtaking req->acked_tail, send an RNR NAK. Because the 3543 * QP is not queued in this case, and the issue can only be 3544 * caused by a delay in scheduling the second leg which we 3545 * cannot estimate, we use a rather arbitrary RNR timeout of 3546 * (MAX_FLOWS / 2) segments 3547 */ 3548 if (!CIRC_SPACE(req->setup_head, req->acked_tail, 3549 MAX_FLOWS)) { 3550 ret = -EAGAIN; 3551 to_seg = MAX_FLOWS >> 1; 3552 tid_rdma_trigger_ack(qp); 3553 break; 3554 } 3555 3556 /* Try to allocate rcv array / TID entries */ 3557 ret = hfi1_kern_exp_rcv_setup(req, &req->ss, &last); 3558 if (ret == -EAGAIN) 3559 to_seg = position_in_queue(qpriv, &rcd->rarr_queue); 3560 if (ret) 3561 break; 3562 3563 qpriv->alloc_w_segs++; 3564 req->alloc_seg++; 3565 continue; 3566 next_req: 3567 /* Begin processing the next request */ 3568 if (++qpriv->r_tid_alloc > 3569 rvt_size_atomic(ib_to_rvt(qp->ibqp.device))) 3570 qpriv->r_tid_alloc = 0; 3571 } 3572 3573 /* 3574 * Schedule an RNR NAK to be sent if (a) flow or rcv array allocation 3575 * has failed (b) we are called from the rcv handler interrupt context 3576 * (c) an RNR NAK has not already been scheduled 3577 */ 3578 if (ret == -EAGAIN && intr_ctx && !qp->r_nak_state) 3579 goto send_rnr_nak; 3580 3581 return; 3582 3583 send_rnr_nak: 3584 lockdep_assert_held(&qp->r_lock); 3585 3586 /* Set r_nak_state to prevent unrelated events from generating NAK's */ 3587 qp->r_nak_state = hfi1_compute_tid_rnr_timeout(qp, to_seg) | IB_RNR_NAK; 3588 3589 /* Pull back r_psn to the segment being RNR NAK'd */ 3590 qp->r_psn = e->psn + req->alloc_seg; 3591 qp->r_ack_psn = qp->r_psn; 3592 /* 3593 * Pull back r_head_ack_queue to the ack entry following the request 3594 * being RNR NAK'd. This allows resources to be allocated to the request 3595 * if the queued QP is scheduled. 3596 */ 3597 qp->r_head_ack_queue = qpriv->r_tid_alloc + 1; 3598 if (qp->r_head_ack_queue > rvt_size_atomic(ib_to_rvt(qp->ibqp.device))) 3599 qp->r_head_ack_queue = 0; 3600 qpriv->r_tid_head = qp->r_head_ack_queue; 3601 /* 3602 * These send side fields are used in make_rc_ack(). They are set in 3603 * hfi1_send_rc_ack() but must be set here before dropping qp->s_lock 3604 * for consistency 3605 */ 3606 qp->s_nak_state = qp->r_nak_state; 3607 qp->s_ack_psn = qp->r_ack_psn; 3608 /* 3609 * Clear the ACK PENDING flag to prevent unwanted ACK because we 3610 * have modified qp->s_ack_psn here. 3611 */ 3612 qp->s_flags &= ~(RVT_S_ACK_PENDING); 3613 3614 trace_hfi1_rsp_tid_write_alloc_res(qp, qp->r_psn); 3615 /* 3616 * qpriv->rnr_nak_state is used to determine when the scheduled RNR NAK 3617 * has actually been sent. qp->s_flags RVT_S_ACK_PENDING bit cannot be 3618 * used for this because qp->s_lock is dropped before calling 3619 * hfi1_send_rc_ack() leading to inconsistency between the receive 3620 * interrupt handlers and the send thread in make_rc_ack() 3621 */ 3622 qpriv->rnr_nak_state = TID_RNR_NAK_SEND; 3623 3624 /* 3625 * Schedule RNR NAK to be sent. RNR NAK's are scheduled from the receive 3626 * interrupt handlers but will be sent from the send engine behind any 3627 * previous responses that may have been scheduled 3628 */ 3629 rc_defered_ack(rcd, qp); 3630 } 3631 3632 void hfi1_rc_rcv_tid_rdma_write_req(struct hfi1_packet *packet) 3633 { 3634 /* HANDLER FOR TID RDMA WRITE REQUEST packet (Responder side)*/ 3635 3636 /* 3637 * 1. Verify TID RDMA WRITE REQ as per IB_OPCODE_RC_RDMA_WRITE_FIRST 3638 * (see hfi1_rc_rcv()) 3639 * - Don't allow 0-length requests. 3640 * 2. Put TID RDMA WRITE REQ into the response queueu (s_ack_queue) 3641 * - Setup struct tid_rdma_req with request info 3642 * - Prepare struct tid_rdma_flow array? 3643 * 3. Set the qp->s_ack_state as state diagram in design doc. 3644 * 4. Set RVT_S_RESP_PENDING in s_flags. 3645 * 5. Kick the send engine (hfi1_schedule_send()) 3646 */ 3647 struct hfi1_ctxtdata *rcd = packet->rcd; 3648 struct rvt_qp *qp = packet->qp; 3649 struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); 3650 struct ib_other_headers *ohdr = packet->ohdr; 3651 struct rvt_ack_entry *e; 3652 unsigned long flags; 3653 struct ib_reth *reth; 3654 struct hfi1_qp_priv *qpriv = qp->priv; 3655 struct tid_rdma_request *req; 3656 u32 bth0, psn, len, rkey, num_segs; 3657 bool fecn; 3658 u8 next; 3659 u64 vaddr; 3660 int diff; 3661 3662 bth0 = be32_to_cpu(ohdr->bth[0]); 3663 if (hfi1_ruc_check_hdr(ibp, packet)) 3664 return; 3665 3666 fecn = process_ecn(qp, packet); 3667 psn = mask_psn(be32_to_cpu(ohdr->bth[2])); 3668 trace_hfi1_rsp_rcv_tid_write_req(qp, psn); 3669 3670 if (qp->state == IB_QPS_RTR && !(qp->r_flags & RVT_R_COMM_EST)) 3671 rvt_comm_est(qp); 3672 3673 if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE))) 3674 goto nack_inv; 3675 3676 reth = &ohdr->u.tid_rdma.w_req.reth; 3677 vaddr = be64_to_cpu(reth->vaddr); 3678 len = be32_to_cpu(reth->length); 3679 3680 num_segs = DIV_ROUND_UP(len, qpriv->tid_rdma.local.max_len); 3681 diff = delta_psn(psn, qp->r_psn); 3682 if (unlikely(diff)) { 3683 tid_rdma_rcv_err(packet, ohdr, qp, psn, diff, fecn); 3684 return; 3685 } 3686 3687 /* 3688 * The resent request which was previously RNR NAK'd is inserted at the 3689 * location of the original request, which is one entry behind 3690 * r_head_ack_queue 3691 */ 3692 if (qpriv->rnr_nak_state) 3693 qp->r_head_ack_queue = qp->r_head_ack_queue ? 3694 qp->r_head_ack_queue - 1 : 3695 rvt_size_atomic(ib_to_rvt(qp->ibqp.device)); 3696 3697 /* We've verified the request, insert it into the ack queue. */ 3698 next = qp->r_head_ack_queue + 1; 3699 if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device))) 3700 next = 0; 3701 spin_lock_irqsave(&qp->s_lock, flags); 3702 if (unlikely(next == qp->s_acked_ack_queue)) { 3703 if (!qp->s_ack_queue[next].sent) 3704 goto nack_inv_unlock; 3705 update_ack_queue(qp, next); 3706 } 3707 e = &qp->s_ack_queue[qp->r_head_ack_queue]; 3708 req = ack_to_tid_req(e); 3709 3710 /* Bring previously RNR NAK'd request back to life */ 3711 if (qpriv->rnr_nak_state) { 3712 qp->r_nak_state = 0; 3713 qp->s_nak_state = 0; 3714 qpriv->rnr_nak_state = TID_RNR_NAK_INIT; 3715 qp->r_psn = e->lpsn + 1; 3716 req->state = TID_REQUEST_INIT; 3717 goto update_head; 3718 } 3719 3720 release_rdma_sge_mr(e); 3721 3722 /* The length needs to be in multiples of PAGE_SIZE */ 3723 if (!len || len & ~PAGE_MASK) 3724 goto nack_inv_unlock; 3725 3726 rkey = be32_to_cpu(reth->rkey); 3727 qp->r_len = len; 3728 3729 if (e->opcode == TID_OP(WRITE_REQ) && 3730 (req->setup_head != req->clear_tail || 3731 req->clear_tail != req->acked_tail)) 3732 goto nack_inv_unlock; 3733 3734 if (unlikely(!rvt_rkey_ok(qp, &e->rdma_sge, qp->r_len, vaddr, 3735 rkey, IB_ACCESS_REMOTE_WRITE))) 3736 goto nack_acc; 3737 3738 qp->r_psn += num_segs - 1; 3739 3740 e->opcode = (bth0 >> 24) & 0xff; 3741 e->psn = psn; 3742 e->lpsn = qp->r_psn; 3743 e->sent = 0; 3744 3745 req->n_flows = min_t(u16, num_segs, qpriv->tid_rdma.local.max_write); 3746 req->state = TID_REQUEST_INIT; 3747 req->cur_seg = 0; 3748 req->comp_seg = 0; 3749 req->ack_seg = 0; 3750 req->alloc_seg = 0; 3751 req->isge = 0; 3752 req->seg_len = qpriv->tid_rdma.local.max_len; 3753 req->total_len = len; 3754 req->total_segs = num_segs; 3755 req->r_flow_psn = e->psn; 3756 req->ss.sge = e->rdma_sge; 3757 req->ss.num_sge = 1; 3758 3759 req->flow_idx = req->setup_head; 3760 req->clear_tail = req->setup_head; 3761 req->acked_tail = req->setup_head; 3762 3763 qp->r_state = e->opcode; 3764 qp->r_nak_state = 0; 3765 /* 3766 * We need to increment the MSN here instead of when we 3767 * finish sending the result since a duplicate request would 3768 * increment it more than once. 3769 */ 3770 qp->r_msn++; 3771 qp->r_psn++; 3772 3773 trace_hfi1_tid_req_rcv_write_req(qp, 0, e->opcode, e->psn, e->lpsn, 3774 req); 3775 3776 if (qpriv->r_tid_tail == HFI1_QP_WQE_INVALID) { 3777 qpriv->r_tid_tail = qp->r_head_ack_queue; 3778 } else if (qpriv->r_tid_tail == qpriv->r_tid_head) { 3779 struct tid_rdma_request *ptr; 3780 3781 e = &qp->s_ack_queue[qpriv->r_tid_tail]; 3782 ptr = ack_to_tid_req(e); 3783 3784 if (e->opcode != TID_OP(WRITE_REQ) || 3785 ptr->comp_seg == ptr->total_segs) { 3786 if (qpriv->r_tid_tail == qpriv->r_tid_ack) 3787 qpriv->r_tid_ack = qp->r_head_ack_queue; 3788 qpriv->r_tid_tail = qp->r_head_ack_queue; 3789 } 3790 } 3791 update_head: 3792 qp->r_head_ack_queue = next; 3793 qpriv->r_tid_head = qp->r_head_ack_queue; 3794 3795 hfi1_tid_write_alloc_resources(qp, true); 3796 trace_hfi1_tid_write_rsp_rcv_req(qp); 3797 3798 /* Schedule the send tasklet. */ 3799 qp->s_flags |= RVT_S_RESP_PENDING; 3800 if (fecn) 3801 qp->s_flags |= RVT_S_ECN; 3802 hfi1_schedule_send(qp); 3803 3804 spin_unlock_irqrestore(&qp->s_lock, flags); 3805 return; 3806 3807 nack_inv_unlock: 3808 spin_unlock_irqrestore(&qp->s_lock, flags); 3809 nack_inv: 3810 rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR); 3811 qp->r_nak_state = IB_NAK_INVALID_REQUEST; 3812 qp->r_ack_psn = qp->r_psn; 3813 /* Queue NAK for later */ 3814 rc_defered_ack(rcd, qp); 3815 return; 3816 nack_acc: 3817 spin_unlock_irqrestore(&qp->s_lock, flags); 3818 rvt_rc_error(qp, IB_WC_LOC_PROT_ERR); 3819 qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR; 3820 qp->r_ack_psn = qp->r_psn; 3821 } 3822 3823 u32 hfi1_build_tid_rdma_write_resp(struct rvt_qp *qp, struct rvt_ack_entry *e, 3824 struct ib_other_headers *ohdr, u32 *bth1, 3825 u32 bth2, u32 *len, 3826 struct rvt_sge_state **ss) 3827 { 3828 struct hfi1_ack_priv *epriv = e->priv; 3829 struct tid_rdma_request *req = &epriv->tid_req; 3830 struct hfi1_qp_priv *qpriv = qp->priv; 3831 struct tid_rdma_flow *flow = NULL; 3832 u32 resp_len = 0, hdwords = 0; 3833 void *resp_addr = NULL; 3834 struct tid_rdma_params *remote; 3835 3836 trace_hfi1_tid_req_build_write_resp(qp, 0, e->opcode, e->psn, e->lpsn, 3837 req); 3838 trace_hfi1_tid_write_rsp_build_resp(qp); 3839 trace_hfi1_rsp_build_tid_write_resp(qp, bth2); 3840 flow = &req->flows[req->flow_idx]; 3841 switch (req->state) { 3842 default: 3843 /* 3844 * Try to allocate resources here in case QP was queued and was 3845 * later scheduled when resources became available 3846 */ 3847 hfi1_tid_write_alloc_resources(qp, false); 3848 3849 /* We've already sent everything which is ready */ 3850 if (req->cur_seg >= req->alloc_seg) 3851 goto done; 3852 3853 /* 3854 * Resources can be assigned but responses cannot be sent in 3855 * rnr_nak state, till the resent request is received 3856 */ 3857 if (qpriv->rnr_nak_state == TID_RNR_NAK_SENT) 3858 goto done; 3859 3860 req->state = TID_REQUEST_ACTIVE; 3861 trace_hfi1_tid_flow_build_write_resp(qp, req->flow_idx, flow); 3862 req->flow_idx = CIRC_NEXT(req->flow_idx, MAX_FLOWS); 3863 hfi1_add_tid_reap_timer(qp); 3864 break; 3865 3866 case TID_REQUEST_RESEND_ACTIVE: 3867 case TID_REQUEST_RESEND: 3868 trace_hfi1_tid_flow_build_write_resp(qp, req->flow_idx, flow); 3869 req->flow_idx = CIRC_NEXT(req->flow_idx, MAX_FLOWS); 3870 if (!CIRC_CNT(req->setup_head, req->flow_idx, MAX_FLOWS)) 3871 req->state = TID_REQUEST_ACTIVE; 3872 3873 hfi1_mod_tid_reap_timer(qp); 3874 break; 3875 } 3876 flow->flow_state.resp_ib_psn = bth2; 3877 resp_addr = (void *)flow->tid_entry; 3878 resp_len = sizeof(*flow->tid_entry) * flow->tidcnt; 3879 req->cur_seg++; 3880 3881 memset(&ohdr->u.tid_rdma.w_rsp, 0, sizeof(ohdr->u.tid_rdma.w_rsp)); 3882 epriv->ss.sge.vaddr = resp_addr; 3883 epriv->ss.sge.sge_length = resp_len; 3884 epriv->ss.sge.length = epriv->ss.sge.sge_length; 3885 /* 3886 * We can safely zero these out. Since the first SGE covers the 3887 * entire packet, nothing else should even look at the MR. 3888 */ 3889 epriv->ss.sge.mr = NULL; 3890 epriv->ss.sge.m = 0; 3891 epriv->ss.sge.n = 0; 3892 3893 epriv->ss.sg_list = NULL; 3894 epriv->ss.total_len = epriv->ss.sge.sge_length; 3895 epriv->ss.num_sge = 1; 3896 3897 *ss = &epriv->ss; 3898 *len = epriv->ss.total_len; 3899 3900 /* Construct the TID RDMA WRITE RESP packet header */ 3901 rcu_read_lock(); 3902 remote = rcu_dereference(qpriv->tid_rdma.remote); 3903 3904 KDETH_RESET(ohdr->u.tid_rdma.w_rsp.kdeth0, KVER, 0x1); 3905 KDETH_RESET(ohdr->u.tid_rdma.w_rsp.kdeth1, JKEY, remote->jkey); 3906 ohdr->u.tid_rdma.w_rsp.aeth = rvt_compute_aeth(qp); 3907 ohdr->u.tid_rdma.w_rsp.tid_flow_psn = 3908 cpu_to_be32((flow->flow_state.generation << 3909 HFI1_KDETH_BTH_SEQ_SHIFT) | 3910 (flow->flow_state.spsn & 3911 HFI1_KDETH_BTH_SEQ_MASK)); 3912 ohdr->u.tid_rdma.w_rsp.tid_flow_qp = 3913 cpu_to_be32(qpriv->tid_rdma.local.qp | 3914 ((flow->idx & TID_RDMA_DESTQP_FLOW_MASK) << 3915 TID_RDMA_DESTQP_FLOW_SHIFT) | 3916 qpriv->rcd->ctxt); 3917 ohdr->u.tid_rdma.w_rsp.verbs_qp = cpu_to_be32(qp->remote_qpn); 3918 *bth1 = remote->qp; 3919 rcu_read_unlock(); 3920 hdwords = sizeof(ohdr->u.tid_rdma.w_rsp) / sizeof(u32); 3921 qpriv->pending_tid_w_segs++; 3922 done: 3923 return hdwords; 3924 } 3925 3926 static void hfi1_add_tid_reap_timer(struct rvt_qp *qp) 3927 { 3928 struct hfi1_qp_priv *qpriv = qp->priv; 3929 3930 lockdep_assert_held(&qp->s_lock); 3931 if (!(qpriv->s_flags & HFI1_R_TID_RSC_TIMER)) { 3932 qpriv->s_flags |= HFI1_R_TID_RSC_TIMER; 3933 qpriv->s_tid_timer.expires = jiffies + 3934 qpriv->tid_timer_timeout_jiffies; 3935 add_timer(&qpriv->s_tid_timer); 3936 } 3937 } 3938 3939 static void hfi1_mod_tid_reap_timer(struct rvt_qp *qp) 3940 { 3941 struct hfi1_qp_priv *qpriv = qp->priv; 3942 3943 lockdep_assert_held(&qp->s_lock); 3944 qpriv->s_flags |= HFI1_R_TID_RSC_TIMER; 3945 mod_timer(&qpriv->s_tid_timer, jiffies + 3946 qpriv->tid_timer_timeout_jiffies); 3947 } 3948 3949 static int hfi1_stop_tid_reap_timer(struct rvt_qp *qp) 3950 { 3951 struct hfi1_qp_priv *qpriv = qp->priv; 3952 int rval = 0; 3953 3954 lockdep_assert_held(&qp->s_lock); 3955 if (qpriv->s_flags & HFI1_R_TID_RSC_TIMER) { 3956 rval = del_timer(&qpriv->s_tid_timer); 3957 qpriv->s_flags &= ~HFI1_R_TID_RSC_TIMER; 3958 } 3959 return rval; 3960 } 3961 3962 void hfi1_del_tid_reap_timer(struct rvt_qp *qp) 3963 { 3964 struct hfi1_qp_priv *qpriv = qp->priv; 3965 3966 del_timer_sync(&qpriv->s_tid_timer); 3967 qpriv->s_flags &= ~HFI1_R_TID_RSC_TIMER; 3968 } 3969 3970 static void hfi1_tid_timeout(struct timer_list *t) 3971 { 3972 struct hfi1_qp_priv *qpriv = from_timer(qpriv, t, s_tid_timer); 3973 struct rvt_qp *qp = qpriv->owner; 3974 struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device); 3975 unsigned long flags; 3976 u32 i; 3977 3978 spin_lock_irqsave(&qp->r_lock, flags); 3979 spin_lock(&qp->s_lock); 3980 if (qpriv->s_flags & HFI1_R_TID_RSC_TIMER) { 3981 dd_dev_warn(dd_from_ibdev(qp->ibqp.device), "[QP%u] %s %d\n", 3982 qp->ibqp.qp_num, __func__, __LINE__); 3983 trace_hfi1_msg_tid_timeout(/* msg */ 3984 qp, "resource timeout = ", 3985 (u64)qpriv->tid_timer_timeout_jiffies); 3986 hfi1_stop_tid_reap_timer(qp); 3987 /* 3988 * Go though the entire ack queue and clear any outstanding 3989 * HW flow and RcvArray resources. 3990 */ 3991 hfi1_kern_clear_hw_flow(qpriv->rcd, qp); 3992 for (i = 0; i < rvt_max_atomic(rdi); i++) { 3993 struct tid_rdma_request *req = 3994 ack_to_tid_req(&qp->s_ack_queue[i]); 3995 3996 hfi1_kern_exp_rcv_clear_all(req); 3997 } 3998 spin_unlock(&qp->s_lock); 3999 if (qp->ibqp.event_handler) { 4000 struct ib_event ev; 4001 4002 ev.device = qp->ibqp.device; 4003 ev.element.qp = &qp->ibqp; 4004 ev.event = IB_EVENT_QP_FATAL; 4005 qp->ibqp.event_handler(&ev, qp->ibqp.qp_context); 4006 } 4007 rvt_rc_error(qp, IB_WC_RESP_TIMEOUT_ERR); 4008 goto unlock_r_lock; 4009 } 4010 spin_unlock(&qp->s_lock); 4011 unlock_r_lock: 4012 spin_unlock_irqrestore(&qp->r_lock, flags); 4013 } 4014 4015 void hfi1_rc_rcv_tid_rdma_write_resp(struct hfi1_packet *packet) 4016 { 4017 /* HANDLER FOR TID RDMA WRITE RESPONSE packet (Requestor side */ 4018 4019 /* 4020 * 1. Find matching SWQE 4021 * 2. Check that TIDENTRY array has enough space for a complete 4022 * segment. If not, put QP in error state. 4023 * 3. Save response data in struct tid_rdma_req and struct tid_rdma_flow 4024 * 4. Remove HFI1_S_WAIT_TID_RESP from s_flags. 4025 * 5. Set qp->s_state 4026 * 6. Kick the send engine (hfi1_schedule_send()) 4027 */ 4028 struct ib_other_headers *ohdr = packet->ohdr; 4029 struct rvt_qp *qp = packet->qp; 4030 struct hfi1_qp_priv *qpriv = qp->priv; 4031 struct hfi1_ctxtdata *rcd = packet->rcd; 4032 struct rvt_swqe *wqe; 4033 struct tid_rdma_request *req; 4034 struct tid_rdma_flow *flow; 4035 enum ib_wc_status status; 4036 u32 opcode, aeth, psn, flow_psn, i, tidlen = 0, pktlen; 4037 bool fecn; 4038 unsigned long flags; 4039 4040 fecn = process_ecn(qp, packet); 4041 psn = mask_psn(be32_to_cpu(ohdr->bth[2])); 4042 aeth = be32_to_cpu(ohdr->u.tid_rdma.w_rsp.aeth); 4043 opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff; 4044 4045 spin_lock_irqsave(&qp->s_lock, flags); 4046 4047 /* Ignore invalid responses */ 4048 if (cmp_psn(psn, qp->s_next_psn) >= 0) 4049 goto ack_done; 4050 4051 /* Ignore duplicate responses. */ 4052 if (unlikely(cmp_psn(psn, qp->s_last_psn) <= 0)) 4053 goto ack_done; 4054 4055 if (unlikely(qp->s_acked == qp->s_tail)) 4056 goto ack_done; 4057 4058 /* 4059 * If we are waiting for a particular packet sequence number 4060 * due to a request being resent, check for it. Otherwise, 4061 * ensure that we haven't missed anything. 4062 */ 4063 if (qp->r_flags & RVT_R_RDMAR_SEQ) { 4064 if (cmp_psn(psn, qp->s_last_psn + 1) != 0) 4065 goto ack_done; 4066 qp->r_flags &= ~RVT_R_RDMAR_SEQ; 4067 } 4068 4069 wqe = rvt_get_swqe_ptr(qp, qpriv->s_tid_cur); 4070 if (unlikely(wqe->wr.opcode != IB_WR_TID_RDMA_WRITE)) 4071 goto ack_op_err; 4072 4073 req = wqe_to_tid_req(wqe); 4074 /* 4075 * If we've lost ACKs and our acked_tail pointer is too far 4076 * behind, don't overwrite segments. Just drop the packet and 4077 * let the reliability protocol take care of it. 4078 */ 4079 if (!CIRC_SPACE(req->setup_head, req->acked_tail, MAX_FLOWS)) 4080 goto ack_done; 4081 4082 /* 4083 * The call to do_rc_ack() should be last in the chain of 4084 * packet checks because it will end up updating the QP state. 4085 * Therefore, anything that would prevent the packet from 4086 * being accepted as a successful response should be prior 4087 * to it. 4088 */ 4089 if (!do_rc_ack(qp, aeth, psn, opcode, 0, rcd)) 4090 goto ack_done; 4091 4092 trace_hfi1_ack(qp, psn); 4093 4094 flow = &req->flows[req->setup_head]; 4095 flow->pkt = 0; 4096 flow->tid_idx = 0; 4097 flow->tid_offset = 0; 4098 flow->sent = 0; 4099 flow->resync_npkts = 0; 4100 flow->tid_qpn = be32_to_cpu(ohdr->u.tid_rdma.w_rsp.tid_flow_qp); 4101 flow->idx = (flow->tid_qpn >> TID_RDMA_DESTQP_FLOW_SHIFT) & 4102 TID_RDMA_DESTQP_FLOW_MASK; 4103 flow_psn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.w_rsp.tid_flow_psn)); 4104 flow->flow_state.generation = flow_psn >> HFI1_KDETH_BTH_SEQ_SHIFT; 4105 flow->flow_state.spsn = flow_psn & HFI1_KDETH_BTH_SEQ_MASK; 4106 flow->flow_state.resp_ib_psn = psn; 4107 flow->length = min_t(u32, req->seg_len, 4108 (wqe->length - (req->comp_seg * req->seg_len))); 4109 4110 flow->npkts = rvt_div_round_up_mtu(qp, flow->length); 4111 flow->flow_state.lpsn = flow->flow_state.spsn + 4112 flow->npkts - 1; 4113 /* payload length = packet length - (header length + ICRC length) */ 4114 pktlen = packet->tlen - (packet->hlen + 4); 4115 if (pktlen > sizeof(flow->tid_entry)) { 4116 status = IB_WC_LOC_LEN_ERR; 4117 goto ack_err; 4118 } 4119 memcpy(flow->tid_entry, packet->ebuf, pktlen); 4120 flow->tidcnt = pktlen / sizeof(*flow->tid_entry); 4121 trace_hfi1_tid_flow_rcv_write_resp(qp, req->setup_head, flow); 4122 4123 req->comp_seg++; 4124 trace_hfi1_tid_write_sender_rcv_resp(qp, 0); 4125 /* 4126 * Walk the TID_ENTRY list to make sure we have enough space for a 4127 * complete segment. 4128 */ 4129 for (i = 0; i < flow->tidcnt; i++) { 4130 trace_hfi1_tid_entry_rcv_write_resp(/* entry */ 4131 qp, i, flow->tid_entry[i]); 4132 if (!EXP_TID_GET(flow->tid_entry[i], LEN)) { 4133 status = IB_WC_LOC_LEN_ERR; 4134 goto ack_err; 4135 } 4136 tidlen += EXP_TID_GET(flow->tid_entry[i], LEN); 4137 } 4138 if (tidlen * PAGE_SIZE < flow->length) { 4139 status = IB_WC_LOC_LEN_ERR; 4140 goto ack_err; 4141 } 4142 4143 trace_hfi1_tid_req_rcv_write_resp(qp, 0, wqe->wr.opcode, wqe->psn, 4144 wqe->lpsn, req); 4145 /* 4146 * If this is the first response for this request, set the initial 4147 * flow index to the current flow. 4148 */ 4149 if (!cmp_psn(psn, wqe->psn)) { 4150 req->r_last_acked = mask_psn(wqe->psn - 1); 4151 /* Set acked flow index to head index */ 4152 req->acked_tail = req->setup_head; 4153 } 4154 4155 /* advance circular buffer head */ 4156 req->setup_head = CIRC_NEXT(req->setup_head, MAX_FLOWS); 4157 req->state = TID_REQUEST_ACTIVE; 4158 4159 /* 4160 * If all responses for this TID RDMA WRITE request have been received 4161 * advance the pointer to the next one. 4162 * Since TID RDMA requests could be mixed in with regular IB requests, 4163 * they might not appear sequentially in the queue. Therefore, the 4164 * next request needs to be "found". 4165 */ 4166 if (qpriv->s_tid_cur != qpriv->s_tid_head && 4167 req->comp_seg == req->total_segs) { 4168 for (i = qpriv->s_tid_cur + 1; ; i++) { 4169 if (i == qp->s_size) 4170 i = 0; 4171 wqe = rvt_get_swqe_ptr(qp, i); 4172 if (i == qpriv->s_tid_head) 4173 break; 4174 if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE) 4175 break; 4176 } 4177 qpriv->s_tid_cur = i; 4178 } 4179 qp->s_flags &= ~HFI1_S_WAIT_TID_RESP; 4180 hfi1_schedule_tid_send(qp); 4181 goto ack_done; 4182 4183 ack_op_err: 4184 status = IB_WC_LOC_QP_OP_ERR; 4185 ack_err: 4186 rvt_error_qp(qp, status); 4187 ack_done: 4188 if (fecn) 4189 qp->s_flags |= RVT_S_ECN; 4190 spin_unlock_irqrestore(&qp->s_lock, flags); 4191 } 4192 4193 bool hfi1_build_tid_rdma_packet(struct rvt_swqe *wqe, 4194 struct ib_other_headers *ohdr, 4195 u32 *bth1, u32 *bth2, u32 *len) 4196 { 4197 struct tid_rdma_request *req = wqe_to_tid_req(wqe); 4198 struct tid_rdma_flow *flow = &req->flows[req->clear_tail]; 4199 struct tid_rdma_params *remote; 4200 struct rvt_qp *qp = req->qp; 4201 struct hfi1_qp_priv *qpriv = qp->priv; 4202 u32 tidentry = flow->tid_entry[flow->tid_idx]; 4203 u32 tidlen = EXP_TID_GET(tidentry, LEN) << PAGE_SHIFT; 4204 struct tid_rdma_write_data *wd = &ohdr->u.tid_rdma.w_data; 4205 u32 next_offset, om = KDETH_OM_LARGE; 4206 bool last_pkt; 4207 4208 if (!tidlen) { 4209 hfi1_trdma_send_complete(qp, wqe, IB_WC_REM_INV_RD_REQ_ERR); 4210 rvt_error_qp(qp, IB_WC_REM_INV_RD_REQ_ERR); 4211 } 4212 4213 *len = min_t(u32, qp->pmtu, tidlen - flow->tid_offset); 4214 flow->sent += *len; 4215 next_offset = flow->tid_offset + *len; 4216 last_pkt = (flow->tid_idx == (flow->tidcnt - 1) && 4217 next_offset >= tidlen) || (flow->sent >= flow->length); 4218 trace_hfi1_tid_entry_build_write_data(qp, flow->tid_idx, tidentry); 4219 trace_hfi1_tid_flow_build_write_data(qp, req->clear_tail, flow); 4220 4221 rcu_read_lock(); 4222 remote = rcu_dereference(qpriv->tid_rdma.remote); 4223 KDETH_RESET(wd->kdeth0, KVER, 0x1); 4224 KDETH_SET(wd->kdeth0, SH, !last_pkt); 4225 KDETH_SET(wd->kdeth0, INTR, !!(!last_pkt && remote->urg)); 4226 KDETH_SET(wd->kdeth0, TIDCTRL, EXP_TID_GET(tidentry, CTRL)); 4227 KDETH_SET(wd->kdeth0, TID, EXP_TID_GET(tidentry, IDX)); 4228 KDETH_SET(wd->kdeth0, OM, om == KDETH_OM_LARGE); 4229 KDETH_SET(wd->kdeth0, OFFSET, flow->tid_offset / om); 4230 KDETH_RESET(wd->kdeth1, JKEY, remote->jkey); 4231 wd->verbs_qp = cpu_to_be32(qp->remote_qpn); 4232 rcu_read_unlock(); 4233 4234 *bth1 = flow->tid_qpn; 4235 *bth2 = mask_psn(((flow->flow_state.spsn + flow->pkt++) & 4236 HFI1_KDETH_BTH_SEQ_MASK) | 4237 (flow->flow_state.generation << 4238 HFI1_KDETH_BTH_SEQ_SHIFT)); 4239 if (last_pkt) { 4240 /* PSNs are zero-based, so +1 to count number of packets */ 4241 if (flow->flow_state.lpsn + 1 + 4242 rvt_div_round_up_mtu(qp, req->seg_len) > 4243 MAX_TID_FLOW_PSN) 4244 req->state = TID_REQUEST_SYNC; 4245 *bth2 |= IB_BTH_REQ_ACK; 4246 } 4247 4248 if (next_offset >= tidlen) { 4249 flow->tid_offset = 0; 4250 flow->tid_idx++; 4251 } else { 4252 flow->tid_offset = next_offset; 4253 } 4254 return last_pkt; 4255 } 4256 4257 void hfi1_rc_rcv_tid_rdma_write_data(struct hfi1_packet *packet) 4258 { 4259 struct rvt_qp *qp = packet->qp; 4260 struct hfi1_qp_priv *priv = qp->priv; 4261 struct hfi1_ctxtdata *rcd = priv->rcd; 4262 struct ib_other_headers *ohdr = packet->ohdr; 4263 struct rvt_ack_entry *e; 4264 struct tid_rdma_request *req; 4265 struct tid_rdma_flow *flow; 4266 struct hfi1_ibdev *dev = to_idev(qp->ibqp.device); 4267 unsigned long flags; 4268 u32 psn, next; 4269 u8 opcode; 4270 bool fecn; 4271 4272 fecn = process_ecn(qp, packet); 4273 psn = mask_psn(be32_to_cpu(ohdr->bth[2])); 4274 opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff; 4275 4276 /* 4277 * All error handling should be done by now. If we are here, the packet 4278 * is either good or been accepted by the error handler. 4279 */ 4280 spin_lock_irqsave(&qp->s_lock, flags); 4281 e = &qp->s_ack_queue[priv->r_tid_tail]; 4282 req = ack_to_tid_req(e); 4283 flow = &req->flows[req->clear_tail]; 4284 if (cmp_psn(psn, full_flow_psn(flow, flow->flow_state.lpsn))) { 4285 update_r_next_psn_fecn(packet, priv, rcd, flow, fecn); 4286 4287 if (cmp_psn(psn, flow->flow_state.r_next_psn)) 4288 goto send_nak; 4289 4290 flow->flow_state.r_next_psn = mask_psn(psn + 1); 4291 /* 4292 * Copy the payload to destination buffer if this packet is 4293 * delivered as an eager packet due to RSM rule and FECN. 4294 * The RSM rule selects FECN bit in BTH and SH bit in 4295 * KDETH header and therefore will not match the last 4296 * packet of each segment that has SH bit cleared. 4297 */ 4298 if (fecn && packet->etype == RHF_RCV_TYPE_EAGER) { 4299 struct rvt_sge_state ss; 4300 u32 len; 4301 u32 tlen = packet->tlen; 4302 u16 hdrsize = packet->hlen; 4303 u8 pad = packet->pad; 4304 u8 extra_bytes = pad + packet->extra_byte + 4305 (SIZE_OF_CRC << 2); 4306 u32 pmtu = qp->pmtu; 4307 4308 if (unlikely(tlen != (hdrsize + pmtu + extra_bytes))) 4309 goto send_nak; 4310 len = req->comp_seg * req->seg_len; 4311 len += delta_psn(psn, 4312 full_flow_psn(flow, flow->flow_state.spsn)) * 4313 pmtu; 4314 if (unlikely(req->total_len - len < pmtu)) 4315 goto send_nak; 4316 4317 /* 4318 * The e->rdma_sge field is set when TID RDMA WRITE REQ 4319 * is first received and is never modified thereafter. 4320 */ 4321 ss.sge = e->rdma_sge; 4322 ss.sg_list = NULL; 4323 ss.num_sge = 1; 4324 ss.total_len = req->total_len; 4325 rvt_skip_sge(&ss, len, false); 4326 rvt_copy_sge(qp, &ss, packet->payload, pmtu, false, 4327 false); 4328 /* Raise the sw sequence check flag for next packet */ 4329 priv->r_next_psn_kdeth = mask_psn(psn + 1); 4330 priv->s_flags |= HFI1_R_TID_SW_PSN; 4331 } 4332 goto exit; 4333 } 4334 flow->flow_state.r_next_psn = mask_psn(psn + 1); 4335 hfi1_kern_exp_rcv_clear(req); 4336 priv->alloc_w_segs--; 4337 rcd->flows[flow->idx].psn = psn & HFI1_KDETH_BTH_SEQ_MASK; 4338 req->comp_seg++; 4339 priv->s_nak_state = 0; 4340 4341 /* 4342 * Release the flow if one of the following conditions has been met: 4343 * - The request has reached a sync point AND all outstanding 4344 * segments have been completed, or 4345 * - The entire request is complete and there are no more requests 4346 * (of any kind) in the queue. 4347 */ 4348 trace_hfi1_rsp_rcv_tid_write_data(qp, psn); 4349 trace_hfi1_tid_req_rcv_write_data(qp, 0, e->opcode, e->psn, e->lpsn, 4350 req); 4351 trace_hfi1_tid_write_rsp_rcv_data(qp); 4352 validate_r_tid_ack(priv); 4353 4354 if (opcode == TID_OP(WRITE_DATA_LAST)) { 4355 release_rdma_sge_mr(e); 4356 for (next = priv->r_tid_tail + 1; ; next++) { 4357 if (next > rvt_size_atomic(&dev->rdi)) 4358 next = 0; 4359 if (next == priv->r_tid_head) 4360 break; 4361 e = &qp->s_ack_queue[next]; 4362 if (e->opcode == TID_OP(WRITE_REQ)) 4363 break; 4364 } 4365 priv->r_tid_tail = next; 4366 if (++qp->s_acked_ack_queue > rvt_size_atomic(&dev->rdi)) 4367 qp->s_acked_ack_queue = 0; 4368 } 4369 4370 hfi1_tid_write_alloc_resources(qp, true); 4371 4372 /* 4373 * If we need to generate more responses, schedule the 4374 * send engine. 4375 */ 4376 if (req->cur_seg < req->total_segs || 4377 qp->s_tail_ack_queue != qp->r_head_ack_queue) { 4378 qp->s_flags |= RVT_S_RESP_PENDING; 4379 hfi1_schedule_send(qp); 4380 } 4381 4382 priv->pending_tid_w_segs--; 4383 if (priv->s_flags & HFI1_R_TID_RSC_TIMER) { 4384 if (priv->pending_tid_w_segs) 4385 hfi1_mod_tid_reap_timer(req->qp); 4386 else 4387 hfi1_stop_tid_reap_timer(req->qp); 4388 } 4389 4390 done: 4391 tid_rdma_schedule_ack(qp); 4392 exit: 4393 priv->r_next_psn_kdeth = flow->flow_state.r_next_psn; 4394 if (fecn) 4395 qp->s_flags |= RVT_S_ECN; 4396 spin_unlock_irqrestore(&qp->s_lock, flags); 4397 return; 4398 4399 send_nak: 4400 if (!priv->s_nak_state) { 4401 priv->s_nak_state = IB_NAK_PSN_ERROR; 4402 priv->s_nak_psn = flow->flow_state.r_next_psn; 4403 tid_rdma_trigger_ack(qp); 4404 } 4405 goto done; 4406 } 4407 4408 static bool hfi1_tid_rdma_is_resync_psn(u32 psn) 4409 { 4410 return (bool)((psn & HFI1_KDETH_BTH_SEQ_MASK) == 4411 HFI1_KDETH_BTH_SEQ_MASK); 4412 } 4413 4414 u32 hfi1_build_tid_rdma_write_ack(struct rvt_qp *qp, struct rvt_ack_entry *e, 4415 struct ib_other_headers *ohdr, u16 iflow, 4416 u32 *bth1, u32 *bth2) 4417 { 4418 struct hfi1_qp_priv *qpriv = qp->priv; 4419 struct tid_flow_state *fs = &qpriv->flow_state; 4420 struct tid_rdma_request *req = ack_to_tid_req(e); 4421 struct tid_rdma_flow *flow = &req->flows[iflow]; 4422 struct tid_rdma_params *remote; 4423 4424 rcu_read_lock(); 4425 remote = rcu_dereference(qpriv->tid_rdma.remote); 4426 KDETH_RESET(ohdr->u.tid_rdma.ack.kdeth1, JKEY, remote->jkey); 4427 ohdr->u.tid_rdma.ack.verbs_qp = cpu_to_be32(qp->remote_qpn); 4428 *bth1 = remote->qp; 4429 rcu_read_unlock(); 4430 4431 if (qpriv->resync) { 4432 *bth2 = mask_psn((fs->generation << 4433 HFI1_KDETH_BTH_SEQ_SHIFT) - 1); 4434 ohdr->u.tid_rdma.ack.aeth = rvt_compute_aeth(qp); 4435 } else if (qpriv->s_nak_state) { 4436 *bth2 = mask_psn(qpriv->s_nak_psn); 4437 ohdr->u.tid_rdma.ack.aeth = 4438 cpu_to_be32((qp->r_msn & IB_MSN_MASK) | 4439 (qpriv->s_nak_state << 4440 IB_AETH_CREDIT_SHIFT)); 4441 } else { 4442 *bth2 = full_flow_psn(flow, flow->flow_state.lpsn); 4443 ohdr->u.tid_rdma.ack.aeth = rvt_compute_aeth(qp); 4444 } 4445 KDETH_RESET(ohdr->u.tid_rdma.ack.kdeth0, KVER, 0x1); 4446 ohdr->u.tid_rdma.ack.tid_flow_qp = 4447 cpu_to_be32(qpriv->tid_rdma.local.qp | 4448 ((flow->idx & TID_RDMA_DESTQP_FLOW_MASK) << 4449 TID_RDMA_DESTQP_FLOW_SHIFT) | 4450 qpriv->rcd->ctxt); 4451 4452 ohdr->u.tid_rdma.ack.tid_flow_psn = 0; 4453 ohdr->u.tid_rdma.ack.verbs_psn = 4454 cpu_to_be32(flow->flow_state.resp_ib_psn); 4455 4456 if (qpriv->resync) { 4457 /* 4458 * If the PSN before the current expect KDETH PSN is the 4459 * RESYNC PSN, then we never received a good TID RDMA WRITE 4460 * DATA packet after a previous RESYNC. 4461 * In this case, the next expected KDETH PSN stays the same. 4462 */ 4463 if (hfi1_tid_rdma_is_resync_psn(qpriv->r_next_psn_kdeth - 1)) { 4464 ohdr->u.tid_rdma.ack.tid_flow_psn = 4465 cpu_to_be32(qpriv->r_next_psn_kdeth_save); 4466 } else { 4467 /* 4468 * Because the KDETH PSNs jump during a RESYNC, it's 4469 * not possible to infer (or compute) the previous value 4470 * of r_next_psn_kdeth in the case of back-to-back 4471 * RESYNC packets. Therefore, we save it. 4472 */ 4473 qpriv->r_next_psn_kdeth_save = 4474 qpriv->r_next_psn_kdeth - 1; 4475 ohdr->u.tid_rdma.ack.tid_flow_psn = 4476 cpu_to_be32(qpriv->r_next_psn_kdeth_save); 4477 qpriv->r_next_psn_kdeth = mask_psn(*bth2 + 1); 4478 } 4479 qpriv->resync = false; 4480 } 4481 4482 return sizeof(ohdr->u.tid_rdma.ack) / sizeof(u32); 4483 } 4484 4485 void hfi1_rc_rcv_tid_rdma_ack(struct hfi1_packet *packet) 4486 { 4487 struct ib_other_headers *ohdr = packet->ohdr; 4488 struct rvt_qp *qp = packet->qp; 4489 struct hfi1_qp_priv *qpriv = qp->priv; 4490 struct rvt_swqe *wqe; 4491 struct tid_rdma_request *req; 4492 struct tid_rdma_flow *flow; 4493 u32 aeth, psn, req_psn, ack_psn, flpsn, resync_psn, ack_kpsn; 4494 unsigned long flags; 4495 u16 fidx; 4496 4497 trace_hfi1_tid_write_sender_rcv_tid_ack(qp, 0); 4498 process_ecn(qp, packet); 4499 psn = mask_psn(be32_to_cpu(ohdr->bth[2])); 4500 aeth = be32_to_cpu(ohdr->u.tid_rdma.ack.aeth); 4501 req_psn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.ack.verbs_psn)); 4502 resync_psn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.ack.tid_flow_psn)); 4503 4504 spin_lock_irqsave(&qp->s_lock, flags); 4505 trace_hfi1_rcv_tid_ack(qp, aeth, psn, req_psn, resync_psn); 4506 4507 /* If we are waiting for an ACK to RESYNC, drop any other packets */ 4508 if ((qp->s_flags & HFI1_S_WAIT_HALT) && 4509 cmp_psn(psn, qpriv->s_resync_psn)) 4510 goto ack_op_err; 4511 4512 ack_psn = req_psn; 4513 if (hfi1_tid_rdma_is_resync_psn(psn)) 4514 ack_kpsn = resync_psn; 4515 else 4516 ack_kpsn = psn; 4517 if (aeth >> 29) { 4518 ack_psn--; 4519 ack_kpsn--; 4520 } 4521 4522 if (unlikely(qp->s_acked == qp->s_tail)) 4523 goto ack_op_err; 4524 4525 wqe = rvt_get_swqe_ptr(qp, qp->s_acked); 4526 4527 if (wqe->wr.opcode != IB_WR_TID_RDMA_WRITE) 4528 goto ack_op_err; 4529 4530 req = wqe_to_tid_req(wqe); 4531 trace_hfi1_tid_req_rcv_tid_ack(qp, 0, wqe->wr.opcode, wqe->psn, 4532 wqe->lpsn, req); 4533 flow = &req->flows[req->acked_tail]; 4534 trace_hfi1_tid_flow_rcv_tid_ack(qp, req->acked_tail, flow); 4535 4536 /* Drop stale ACK/NAK */ 4537 if (cmp_psn(psn, full_flow_psn(flow, flow->flow_state.spsn)) < 0 || 4538 cmp_psn(req_psn, flow->flow_state.resp_ib_psn) < 0) 4539 goto ack_op_err; 4540 4541 while (cmp_psn(ack_kpsn, 4542 full_flow_psn(flow, flow->flow_state.lpsn)) >= 0 && 4543 req->ack_seg < req->cur_seg) { 4544 req->ack_seg++; 4545 /* advance acked segment pointer */ 4546 req->acked_tail = CIRC_NEXT(req->acked_tail, MAX_FLOWS); 4547 req->r_last_acked = flow->flow_state.resp_ib_psn; 4548 trace_hfi1_tid_req_rcv_tid_ack(qp, 0, wqe->wr.opcode, wqe->psn, 4549 wqe->lpsn, req); 4550 if (req->ack_seg == req->total_segs) { 4551 req->state = TID_REQUEST_COMPLETE; 4552 wqe = do_rc_completion(qp, wqe, 4553 to_iport(qp->ibqp.device, 4554 qp->port_num)); 4555 trace_hfi1_sender_rcv_tid_ack(qp); 4556 atomic_dec(&qpriv->n_tid_requests); 4557 if (qp->s_acked == qp->s_tail) 4558 break; 4559 if (wqe->wr.opcode != IB_WR_TID_RDMA_WRITE) 4560 break; 4561 req = wqe_to_tid_req(wqe); 4562 } 4563 flow = &req->flows[req->acked_tail]; 4564 trace_hfi1_tid_flow_rcv_tid_ack(qp, req->acked_tail, flow); 4565 } 4566 4567 trace_hfi1_tid_req_rcv_tid_ack(qp, 0, wqe->wr.opcode, wqe->psn, 4568 wqe->lpsn, req); 4569 switch (aeth >> 29) { 4570 case 0: /* ACK */ 4571 if (qpriv->s_flags & RVT_S_WAIT_ACK) 4572 qpriv->s_flags &= ~RVT_S_WAIT_ACK; 4573 if (!hfi1_tid_rdma_is_resync_psn(psn)) { 4574 /* Check if there is any pending TID ACK */ 4575 if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE && 4576 req->ack_seg < req->cur_seg) 4577 hfi1_mod_tid_retry_timer(qp); 4578 else 4579 hfi1_stop_tid_retry_timer(qp); 4580 hfi1_schedule_send(qp); 4581 } else { 4582 u32 spsn, fpsn, last_acked, generation; 4583 struct tid_rdma_request *rptr; 4584 4585 /* ACK(RESYNC) */ 4586 hfi1_stop_tid_retry_timer(qp); 4587 /* Allow new requests (see hfi1_make_tid_rdma_pkt) */ 4588 qp->s_flags &= ~HFI1_S_WAIT_HALT; 4589 /* 4590 * Clear RVT_S_SEND_ONE flag in case that the TID RDMA 4591 * ACK is received after the TID retry timer is fired 4592 * again. In this case, do not send any more TID 4593 * RESYNC request or wait for any more TID ACK packet. 4594 */ 4595 qpriv->s_flags &= ~RVT_S_SEND_ONE; 4596 hfi1_schedule_send(qp); 4597 4598 if ((qp->s_acked == qpriv->s_tid_tail && 4599 req->ack_seg == req->total_segs) || 4600 qp->s_acked == qp->s_tail) { 4601 qpriv->s_state = TID_OP(WRITE_DATA_LAST); 4602 goto done; 4603 } 4604 4605 if (req->ack_seg == req->comp_seg) { 4606 qpriv->s_state = TID_OP(WRITE_DATA); 4607 goto done; 4608 } 4609 4610 /* 4611 * The PSN to start with is the next PSN after the 4612 * RESYNC PSN. 4613 */ 4614 psn = mask_psn(psn + 1); 4615 generation = psn >> HFI1_KDETH_BTH_SEQ_SHIFT; 4616 spsn = 0; 4617 4618 /* 4619 * Update to the correct WQE when we get an ACK(RESYNC) 4620 * in the middle of a request. 4621 */ 4622 if (delta_psn(ack_psn, wqe->lpsn)) 4623 wqe = rvt_get_swqe_ptr(qp, qp->s_acked); 4624 req = wqe_to_tid_req(wqe); 4625 flow = &req->flows[req->acked_tail]; 4626 /* 4627 * RESYNC re-numbers the PSN ranges of all remaining 4628 * segments. Also, PSN's start from 0 in the middle of a 4629 * segment and the first segment size is less than the 4630 * default number of packets. flow->resync_npkts is used 4631 * to track the number of packets from the start of the 4632 * real segment to the point of 0 PSN after the RESYNC 4633 * in order to later correctly rewind the SGE. 4634 */ 4635 fpsn = full_flow_psn(flow, flow->flow_state.spsn); 4636 req->r_ack_psn = psn; 4637 /* 4638 * If resync_psn points to the last flow PSN for a 4639 * segment and the new segment (likely from a new 4640 * request) starts with a new generation number, we 4641 * need to adjust resync_psn accordingly. 4642 */ 4643 if (flow->flow_state.generation != 4644 (resync_psn >> HFI1_KDETH_BTH_SEQ_SHIFT)) 4645 resync_psn = mask_psn(fpsn - 1); 4646 flow->resync_npkts += 4647 delta_psn(mask_psn(resync_psn + 1), fpsn); 4648 /* 4649 * Renumber all packet sequence number ranges 4650 * based on the new generation. 4651 */ 4652 last_acked = qp->s_acked; 4653 rptr = req; 4654 while (1) { 4655 /* start from last acked segment */ 4656 for (fidx = rptr->acked_tail; 4657 CIRC_CNT(rptr->setup_head, fidx, 4658 MAX_FLOWS); 4659 fidx = CIRC_NEXT(fidx, MAX_FLOWS)) { 4660 u32 lpsn; 4661 u32 gen; 4662 4663 flow = &rptr->flows[fidx]; 4664 gen = flow->flow_state.generation; 4665 if (WARN_ON(gen == generation && 4666 flow->flow_state.spsn != 4667 spsn)) 4668 continue; 4669 lpsn = flow->flow_state.lpsn; 4670 lpsn = full_flow_psn(flow, lpsn); 4671 flow->npkts = 4672 delta_psn(lpsn, 4673 mask_psn(resync_psn) 4674 ); 4675 flow->flow_state.generation = 4676 generation; 4677 flow->flow_state.spsn = spsn; 4678 flow->flow_state.lpsn = 4679 flow->flow_state.spsn + 4680 flow->npkts - 1; 4681 flow->pkt = 0; 4682 spsn += flow->npkts; 4683 resync_psn += flow->npkts; 4684 trace_hfi1_tid_flow_rcv_tid_ack(qp, 4685 fidx, 4686 flow); 4687 } 4688 if (++last_acked == qpriv->s_tid_cur + 1) 4689 break; 4690 if (last_acked == qp->s_size) 4691 last_acked = 0; 4692 wqe = rvt_get_swqe_ptr(qp, last_acked); 4693 rptr = wqe_to_tid_req(wqe); 4694 } 4695 req->cur_seg = req->ack_seg; 4696 qpriv->s_tid_tail = qp->s_acked; 4697 qpriv->s_state = TID_OP(WRITE_REQ); 4698 hfi1_schedule_tid_send(qp); 4699 } 4700 done: 4701 qpriv->s_retry = qp->s_retry_cnt; 4702 break; 4703 4704 case 3: /* NAK */ 4705 hfi1_stop_tid_retry_timer(qp); 4706 switch ((aeth >> IB_AETH_CREDIT_SHIFT) & 4707 IB_AETH_CREDIT_MASK) { 4708 case 0: /* PSN sequence error */ 4709 if (!req->flows) 4710 break; 4711 flow = &req->flows[req->acked_tail]; 4712 flpsn = full_flow_psn(flow, flow->flow_state.lpsn); 4713 if (cmp_psn(psn, flpsn) > 0) 4714 break; 4715 trace_hfi1_tid_flow_rcv_tid_ack(qp, req->acked_tail, 4716 flow); 4717 req->r_ack_psn = mask_psn(be32_to_cpu(ohdr->bth[2])); 4718 req->cur_seg = req->ack_seg; 4719 qpriv->s_tid_tail = qp->s_acked; 4720 qpriv->s_state = TID_OP(WRITE_REQ); 4721 qpriv->s_retry = qp->s_retry_cnt; 4722 hfi1_schedule_tid_send(qp); 4723 break; 4724 4725 default: 4726 break; 4727 } 4728 break; 4729 4730 default: 4731 break; 4732 } 4733 4734 ack_op_err: 4735 spin_unlock_irqrestore(&qp->s_lock, flags); 4736 } 4737 4738 void hfi1_add_tid_retry_timer(struct rvt_qp *qp) 4739 { 4740 struct hfi1_qp_priv *priv = qp->priv; 4741 struct ib_qp *ibqp = &qp->ibqp; 4742 struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device); 4743 4744 lockdep_assert_held(&qp->s_lock); 4745 if (!(priv->s_flags & HFI1_S_TID_RETRY_TIMER)) { 4746 priv->s_flags |= HFI1_S_TID_RETRY_TIMER; 4747 priv->s_tid_retry_timer.expires = jiffies + 4748 priv->tid_retry_timeout_jiffies + rdi->busy_jiffies; 4749 add_timer(&priv->s_tid_retry_timer); 4750 } 4751 } 4752 4753 static void hfi1_mod_tid_retry_timer(struct rvt_qp *qp) 4754 { 4755 struct hfi1_qp_priv *priv = qp->priv; 4756 struct ib_qp *ibqp = &qp->ibqp; 4757 struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device); 4758 4759 lockdep_assert_held(&qp->s_lock); 4760 priv->s_flags |= HFI1_S_TID_RETRY_TIMER; 4761 mod_timer(&priv->s_tid_retry_timer, jiffies + 4762 priv->tid_retry_timeout_jiffies + rdi->busy_jiffies); 4763 } 4764 4765 static int hfi1_stop_tid_retry_timer(struct rvt_qp *qp) 4766 { 4767 struct hfi1_qp_priv *priv = qp->priv; 4768 int rval = 0; 4769 4770 lockdep_assert_held(&qp->s_lock); 4771 if (priv->s_flags & HFI1_S_TID_RETRY_TIMER) { 4772 rval = del_timer(&priv->s_tid_retry_timer); 4773 priv->s_flags &= ~HFI1_S_TID_RETRY_TIMER; 4774 } 4775 return rval; 4776 } 4777 4778 void hfi1_del_tid_retry_timer(struct rvt_qp *qp) 4779 { 4780 struct hfi1_qp_priv *priv = qp->priv; 4781 4782 del_timer_sync(&priv->s_tid_retry_timer); 4783 priv->s_flags &= ~HFI1_S_TID_RETRY_TIMER; 4784 } 4785 4786 static void hfi1_tid_retry_timeout(struct timer_list *t) 4787 { 4788 struct hfi1_qp_priv *priv = from_timer(priv, t, s_tid_retry_timer); 4789 struct rvt_qp *qp = priv->owner; 4790 struct rvt_swqe *wqe; 4791 unsigned long flags; 4792 struct tid_rdma_request *req; 4793 4794 spin_lock_irqsave(&qp->r_lock, flags); 4795 spin_lock(&qp->s_lock); 4796 trace_hfi1_tid_write_sender_retry_timeout(qp, 0); 4797 if (priv->s_flags & HFI1_S_TID_RETRY_TIMER) { 4798 hfi1_stop_tid_retry_timer(qp); 4799 if (!priv->s_retry) { 4800 trace_hfi1_msg_tid_retry_timeout(/* msg */ 4801 qp, 4802 "Exhausted retries. Tid retry timeout = ", 4803 (u64)priv->tid_retry_timeout_jiffies); 4804 4805 wqe = rvt_get_swqe_ptr(qp, qp->s_acked); 4806 hfi1_trdma_send_complete(qp, wqe, IB_WC_RETRY_EXC_ERR); 4807 rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR); 4808 } else { 4809 wqe = rvt_get_swqe_ptr(qp, qp->s_acked); 4810 req = wqe_to_tid_req(wqe); 4811 trace_hfi1_tid_req_tid_retry_timeout(/* req */ 4812 qp, 0, wqe->wr.opcode, wqe->psn, wqe->lpsn, req); 4813 4814 priv->s_flags &= ~RVT_S_WAIT_ACK; 4815 /* Only send one packet (the RESYNC) */ 4816 priv->s_flags |= RVT_S_SEND_ONE; 4817 /* 4818 * No additional request shall be made by this QP until 4819 * the RESYNC has been complete. 4820 */ 4821 qp->s_flags |= HFI1_S_WAIT_HALT; 4822 priv->s_state = TID_OP(RESYNC); 4823 priv->s_retry--; 4824 hfi1_schedule_tid_send(qp); 4825 } 4826 } 4827 spin_unlock(&qp->s_lock); 4828 spin_unlock_irqrestore(&qp->r_lock, flags); 4829 } 4830 4831 u32 hfi1_build_tid_rdma_resync(struct rvt_qp *qp, struct rvt_swqe *wqe, 4832 struct ib_other_headers *ohdr, u32 *bth1, 4833 u32 *bth2, u16 fidx) 4834 { 4835 struct hfi1_qp_priv *qpriv = qp->priv; 4836 struct tid_rdma_params *remote; 4837 struct tid_rdma_request *req = wqe_to_tid_req(wqe); 4838 struct tid_rdma_flow *flow = &req->flows[fidx]; 4839 u32 generation; 4840 4841 rcu_read_lock(); 4842 remote = rcu_dereference(qpriv->tid_rdma.remote); 4843 KDETH_RESET(ohdr->u.tid_rdma.ack.kdeth1, JKEY, remote->jkey); 4844 ohdr->u.tid_rdma.ack.verbs_qp = cpu_to_be32(qp->remote_qpn); 4845 *bth1 = remote->qp; 4846 rcu_read_unlock(); 4847 4848 generation = kern_flow_generation_next(flow->flow_state.generation); 4849 *bth2 = mask_psn((generation << HFI1_KDETH_BTH_SEQ_SHIFT) - 1); 4850 qpriv->s_resync_psn = *bth2; 4851 *bth2 |= IB_BTH_REQ_ACK; 4852 KDETH_RESET(ohdr->u.tid_rdma.ack.kdeth0, KVER, 0x1); 4853 4854 return sizeof(ohdr->u.tid_rdma.resync) / sizeof(u32); 4855 } 4856 4857 void hfi1_rc_rcv_tid_rdma_resync(struct hfi1_packet *packet) 4858 { 4859 struct ib_other_headers *ohdr = packet->ohdr; 4860 struct rvt_qp *qp = packet->qp; 4861 struct hfi1_qp_priv *qpriv = qp->priv; 4862 struct hfi1_ctxtdata *rcd = qpriv->rcd; 4863 struct hfi1_ibdev *dev = to_idev(qp->ibqp.device); 4864 struct rvt_ack_entry *e; 4865 struct tid_rdma_request *req; 4866 struct tid_rdma_flow *flow; 4867 struct tid_flow_state *fs = &qpriv->flow_state; 4868 u32 psn, generation, idx, gen_next; 4869 bool fecn; 4870 unsigned long flags; 4871 4872 fecn = process_ecn(qp, packet); 4873 psn = mask_psn(be32_to_cpu(ohdr->bth[2])); 4874 4875 generation = mask_psn(psn + 1) >> HFI1_KDETH_BTH_SEQ_SHIFT; 4876 spin_lock_irqsave(&qp->s_lock, flags); 4877 4878 gen_next = (fs->generation == KERN_GENERATION_RESERVED) ? 4879 generation : kern_flow_generation_next(fs->generation); 4880 /* 4881 * RESYNC packet contains the "next" generation and can only be 4882 * from the current or previous generations 4883 */ 4884 if (generation != mask_generation(gen_next - 1) && 4885 generation != gen_next) 4886 goto bail; 4887 /* Already processing a resync */ 4888 if (qpriv->resync) 4889 goto bail; 4890 4891 spin_lock(&rcd->exp_lock); 4892 if (fs->index >= RXE_NUM_TID_FLOWS) { 4893 /* 4894 * If we don't have a flow, save the generation so it can be 4895 * applied when a new flow is allocated 4896 */ 4897 fs->generation = generation; 4898 } else { 4899 /* Reprogram the QP flow with new generation */ 4900 rcd->flows[fs->index].generation = generation; 4901 fs->generation = kern_setup_hw_flow(rcd, fs->index); 4902 } 4903 fs->psn = 0; 4904 /* 4905 * Disable SW PSN checking since a RESYNC is equivalent to a 4906 * sync point and the flow has/will be reprogrammed 4907 */ 4908 qpriv->s_flags &= ~HFI1_R_TID_SW_PSN; 4909 trace_hfi1_tid_write_rsp_rcv_resync(qp); 4910 4911 /* 4912 * Reset all TID flow information with the new generation. 4913 * This is done for all requests and segments after the 4914 * last received segment 4915 */ 4916 for (idx = qpriv->r_tid_tail; ; idx++) { 4917 u16 flow_idx; 4918 4919 if (idx > rvt_size_atomic(&dev->rdi)) 4920 idx = 0; 4921 e = &qp->s_ack_queue[idx]; 4922 if (e->opcode == TID_OP(WRITE_REQ)) { 4923 req = ack_to_tid_req(e); 4924 trace_hfi1_tid_req_rcv_resync(qp, 0, e->opcode, e->psn, 4925 e->lpsn, req); 4926 4927 /* start from last unacked segment */ 4928 for (flow_idx = req->clear_tail; 4929 CIRC_CNT(req->setup_head, flow_idx, 4930 MAX_FLOWS); 4931 flow_idx = CIRC_NEXT(flow_idx, MAX_FLOWS)) { 4932 u32 lpsn; 4933 u32 next; 4934 4935 flow = &req->flows[flow_idx]; 4936 lpsn = full_flow_psn(flow, 4937 flow->flow_state.lpsn); 4938 next = flow->flow_state.r_next_psn; 4939 flow->npkts = delta_psn(lpsn, next - 1); 4940 flow->flow_state.generation = fs->generation; 4941 flow->flow_state.spsn = fs->psn; 4942 flow->flow_state.lpsn = 4943 flow->flow_state.spsn + flow->npkts - 1; 4944 flow->flow_state.r_next_psn = 4945 full_flow_psn(flow, 4946 flow->flow_state.spsn); 4947 fs->psn += flow->npkts; 4948 trace_hfi1_tid_flow_rcv_resync(qp, flow_idx, 4949 flow); 4950 } 4951 } 4952 if (idx == qp->s_tail_ack_queue) 4953 break; 4954 } 4955 4956 spin_unlock(&rcd->exp_lock); 4957 qpriv->resync = true; 4958 /* RESYNC request always gets a TID RDMA ACK. */ 4959 qpriv->s_nak_state = 0; 4960 tid_rdma_trigger_ack(qp); 4961 bail: 4962 if (fecn) 4963 qp->s_flags |= RVT_S_ECN; 4964 spin_unlock_irqrestore(&qp->s_lock, flags); 4965 } 4966 4967 /* 4968 * Call this function when the last TID RDMA WRITE DATA packet for a request 4969 * is built. 4970 */ 4971 static void update_tid_tail(struct rvt_qp *qp) 4972 __must_hold(&qp->s_lock) 4973 { 4974 struct hfi1_qp_priv *priv = qp->priv; 4975 u32 i; 4976 struct rvt_swqe *wqe; 4977 4978 lockdep_assert_held(&qp->s_lock); 4979 /* Can't move beyond s_tid_cur */ 4980 if (priv->s_tid_tail == priv->s_tid_cur) 4981 return; 4982 for (i = priv->s_tid_tail + 1; ; i++) { 4983 if (i == qp->s_size) 4984 i = 0; 4985 4986 if (i == priv->s_tid_cur) 4987 break; 4988 wqe = rvt_get_swqe_ptr(qp, i); 4989 if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE) 4990 break; 4991 } 4992 priv->s_tid_tail = i; 4993 priv->s_state = TID_OP(WRITE_RESP); 4994 } 4995 4996 int hfi1_make_tid_rdma_pkt(struct rvt_qp *qp, struct hfi1_pkt_state *ps) 4997 __must_hold(&qp->s_lock) 4998 { 4999 struct hfi1_qp_priv *priv = qp->priv; 5000 struct rvt_swqe *wqe; 5001 u32 bth1 = 0, bth2 = 0, hwords = 5, len, middle = 0; 5002 struct ib_other_headers *ohdr; 5003 struct rvt_sge_state *ss = &qp->s_sge; 5004 struct rvt_ack_entry *e = &qp->s_ack_queue[qp->s_tail_ack_queue]; 5005 struct tid_rdma_request *req = ack_to_tid_req(e); 5006 bool last = false; 5007 u8 opcode = TID_OP(WRITE_DATA); 5008 5009 lockdep_assert_held(&qp->s_lock); 5010 trace_hfi1_tid_write_sender_make_tid_pkt(qp, 0); 5011 /* 5012 * Prioritize the sending of the requests and responses over the 5013 * sending of the TID RDMA data packets. 5014 */ 5015 if (((atomic_read(&priv->n_tid_requests) < HFI1_TID_RDMA_WRITE_CNT) && 5016 atomic_read(&priv->n_requests) && 5017 !(qp->s_flags & (RVT_S_BUSY | RVT_S_WAIT_ACK | 5018 HFI1_S_ANY_WAIT_IO))) || 5019 (e->opcode == TID_OP(WRITE_REQ) && req->cur_seg < req->alloc_seg && 5020 !(qp->s_flags & (RVT_S_BUSY | HFI1_S_ANY_WAIT_IO)))) { 5021 struct iowait_work *iowork; 5022 5023 iowork = iowait_get_ib_work(&priv->s_iowait); 5024 ps->s_txreq = get_waiting_verbs_txreq(iowork); 5025 if (ps->s_txreq || hfi1_make_rc_req(qp, ps)) { 5026 priv->s_flags |= HFI1_S_TID_BUSY_SET; 5027 return 1; 5028 } 5029 } 5030 5031 ps->s_txreq = get_txreq(ps->dev, qp); 5032 if (!ps->s_txreq) 5033 goto bail_no_tx; 5034 5035 ohdr = &ps->s_txreq->phdr.hdr.ibh.u.oth; 5036 5037 if ((priv->s_flags & RVT_S_ACK_PENDING) && 5038 make_tid_rdma_ack(qp, ohdr, ps)) 5039 return 1; 5040 5041 /* 5042 * Bail out if we can't send data. 5043 * Be reminded that this check must been done after the call to 5044 * make_tid_rdma_ack() because the responding QP could be in 5045 * RTR state where it can send TID RDMA ACK, not TID RDMA WRITE DATA. 5046 */ 5047 if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_SEND_OK)) 5048 goto bail; 5049 5050 if (priv->s_flags & RVT_S_WAIT_ACK) 5051 goto bail; 5052 5053 /* Check whether there is anything to do. */ 5054 if (priv->s_tid_tail == HFI1_QP_WQE_INVALID) 5055 goto bail; 5056 wqe = rvt_get_swqe_ptr(qp, priv->s_tid_tail); 5057 req = wqe_to_tid_req(wqe); 5058 trace_hfi1_tid_req_make_tid_pkt(qp, 0, wqe->wr.opcode, wqe->psn, 5059 wqe->lpsn, req); 5060 switch (priv->s_state) { 5061 case TID_OP(WRITE_REQ): 5062 case TID_OP(WRITE_RESP): 5063 priv->tid_ss.sge = wqe->sg_list[0]; 5064 priv->tid_ss.sg_list = wqe->sg_list + 1; 5065 priv->tid_ss.num_sge = wqe->wr.num_sge; 5066 priv->tid_ss.total_len = wqe->length; 5067 5068 if (priv->s_state == TID_OP(WRITE_REQ)) 5069 hfi1_tid_rdma_restart_req(qp, wqe, &bth2); 5070 priv->s_state = TID_OP(WRITE_DATA); 5071 fallthrough; 5072 5073 case TID_OP(WRITE_DATA): 5074 /* 5075 * 1. Check whether TID RDMA WRITE RESP available. 5076 * 2. If no: 5077 * 2.1 If have more segments and no TID RDMA WRITE RESP, 5078 * set HFI1_S_WAIT_TID_RESP 5079 * 2.2 Return indicating no progress made. 5080 * 3. If yes: 5081 * 3.1 Build TID RDMA WRITE DATA packet. 5082 * 3.2 If last packet in segment: 5083 * 3.2.1 Change KDETH header bits 5084 * 3.2.2 Advance RESP pointers. 5085 * 3.3 Return indicating progress made. 5086 */ 5087 trace_hfi1_sender_make_tid_pkt(qp); 5088 trace_hfi1_tid_write_sender_make_tid_pkt(qp, 0); 5089 wqe = rvt_get_swqe_ptr(qp, priv->s_tid_tail); 5090 req = wqe_to_tid_req(wqe); 5091 len = wqe->length; 5092 5093 if (!req->comp_seg || req->cur_seg == req->comp_seg) 5094 goto bail; 5095 5096 trace_hfi1_tid_req_make_tid_pkt(qp, 0, wqe->wr.opcode, 5097 wqe->psn, wqe->lpsn, req); 5098 last = hfi1_build_tid_rdma_packet(wqe, ohdr, &bth1, &bth2, 5099 &len); 5100 5101 if (last) { 5102 /* move pointer to next flow */ 5103 req->clear_tail = CIRC_NEXT(req->clear_tail, 5104 MAX_FLOWS); 5105 if (++req->cur_seg < req->total_segs) { 5106 if (!CIRC_CNT(req->setup_head, req->clear_tail, 5107 MAX_FLOWS)) 5108 qp->s_flags |= HFI1_S_WAIT_TID_RESP; 5109 } else { 5110 priv->s_state = TID_OP(WRITE_DATA_LAST); 5111 opcode = TID_OP(WRITE_DATA_LAST); 5112 5113 /* Advance the s_tid_tail now */ 5114 update_tid_tail(qp); 5115 } 5116 } 5117 hwords += sizeof(ohdr->u.tid_rdma.w_data) / sizeof(u32); 5118 ss = &priv->tid_ss; 5119 break; 5120 5121 case TID_OP(RESYNC): 5122 trace_hfi1_sender_make_tid_pkt(qp); 5123 /* Use generation from the most recently received response */ 5124 wqe = rvt_get_swqe_ptr(qp, priv->s_tid_cur); 5125 req = wqe_to_tid_req(wqe); 5126 /* If no responses for this WQE look at the previous one */ 5127 if (!req->comp_seg) { 5128 wqe = rvt_get_swqe_ptr(qp, 5129 (!priv->s_tid_cur ? qp->s_size : 5130 priv->s_tid_cur) - 1); 5131 req = wqe_to_tid_req(wqe); 5132 } 5133 hwords += hfi1_build_tid_rdma_resync(qp, wqe, ohdr, &bth1, 5134 &bth2, 5135 CIRC_PREV(req->setup_head, 5136 MAX_FLOWS)); 5137 ss = NULL; 5138 len = 0; 5139 opcode = TID_OP(RESYNC); 5140 break; 5141 5142 default: 5143 goto bail; 5144 } 5145 if (priv->s_flags & RVT_S_SEND_ONE) { 5146 priv->s_flags &= ~RVT_S_SEND_ONE; 5147 priv->s_flags |= RVT_S_WAIT_ACK; 5148 bth2 |= IB_BTH_REQ_ACK; 5149 } 5150 qp->s_len -= len; 5151 ps->s_txreq->hdr_dwords = hwords; 5152 ps->s_txreq->sde = priv->s_sde; 5153 ps->s_txreq->ss = ss; 5154 ps->s_txreq->s_cur_size = len; 5155 hfi1_make_ruc_header(qp, ohdr, (opcode << 24), bth1, bth2, 5156 middle, ps); 5157 return 1; 5158 bail: 5159 hfi1_put_txreq(ps->s_txreq); 5160 bail_no_tx: 5161 ps->s_txreq = NULL; 5162 priv->s_flags &= ~RVT_S_BUSY; 5163 /* 5164 * If we didn't get a txreq, the QP will be woken up later to try 5165 * again, set the flags to the the wake up which work item to wake 5166 * up. 5167 * (A better algorithm should be found to do this and generalize the 5168 * sleep/wakeup flags.) 5169 */ 5170 iowait_set_flag(&priv->s_iowait, IOWAIT_PENDING_TID); 5171 return 0; 5172 } 5173 5174 static int make_tid_rdma_ack(struct rvt_qp *qp, 5175 struct ib_other_headers *ohdr, 5176 struct hfi1_pkt_state *ps) 5177 { 5178 struct rvt_ack_entry *e; 5179 struct hfi1_qp_priv *qpriv = qp->priv; 5180 struct hfi1_ibdev *dev = to_idev(qp->ibqp.device); 5181 u32 hwords, next; 5182 u32 len = 0; 5183 u32 bth1 = 0, bth2 = 0; 5184 int middle = 0; 5185 u16 flow; 5186 struct tid_rdma_request *req, *nreq; 5187 5188 trace_hfi1_tid_write_rsp_make_tid_ack(qp); 5189 /* Don't send an ACK if we aren't supposed to. */ 5190 if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) 5191 goto bail; 5192 5193 /* header size in 32-bit words LRH+BTH = (8+12)/4. */ 5194 hwords = 5; 5195 5196 e = &qp->s_ack_queue[qpriv->r_tid_ack]; 5197 req = ack_to_tid_req(e); 5198 /* 5199 * In the RESYNC case, we are exactly one segment past the 5200 * previously sent ack or at the previously sent NAK. So to send 5201 * the resync ack, we go back one segment (which might be part of 5202 * the previous request) and let the do-while loop execute again. 5203 * The advantage of executing the do-while loop is that any data 5204 * received after the previous ack is automatically acked in the 5205 * RESYNC ack. It turns out that for the do-while loop we only need 5206 * to pull back qpriv->r_tid_ack, not the segment 5207 * indices/counters. The scheme works even if the previous request 5208 * was not a TID WRITE request. 5209 */ 5210 if (qpriv->resync) { 5211 if (!req->ack_seg || req->ack_seg == req->total_segs) 5212 qpriv->r_tid_ack = !qpriv->r_tid_ack ? 5213 rvt_size_atomic(&dev->rdi) : 5214 qpriv->r_tid_ack - 1; 5215 e = &qp->s_ack_queue[qpriv->r_tid_ack]; 5216 req = ack_to_tid_req(e); 5217 } 5218 5219 trace_hfi1_rsp_make_tid_ack(qp, e->psn); 5220 trace_hfi1_tid_req_make_tid_ack(qp, 0, e->opcode, e->psn, e->lpsn, 5221 req); 5222 /* 5223 * If we've sent all the ACKs that we can, we are done 5224 * until we get more segments... 5225 */ 5226 if (!qpriv->s_nak_state && !qpriv->resync && 5227 req->ack_seg == req->comp_seg) 5228 goto bail; 5229 5230 do { 5231 /* 5232 * To deal with coalesced ACKs, the acked_tail pointer 5233 * into the flow array is used. The distance between it 5234 * and the clear_tail is the number of flows that are 5235 * being ACK'ed. 5236 */ 5237 req->ack_seg += 5238 /* Get up-to-date value */ 5239 CIRC_CNT(req->clear_tail, req->acked_tail, 5240 MAX_FLOWS); 5241 /* Advance acked index */ 5242 req->acked_tail = req->clear_tail; 5243 5244 /* 5245 * req->clear_tail points to the segment currently being 5246 * received. So, when sending an ACK, the previous 5247 * segment is being ACK'ed. 5248 */ 5249 flow = CIRC_PREV(req->acked_tail, MAX_FLOWS); 5250 if (req->ack_seg != req->total_segs) 5251 break; 5252 req->state = TID_REQUEST_COMPLETE; 5253 5254 next = qpriv->r_tid_ack + 1; 5255 if (next > rvt_size_atomic(&dev->rdi)) 5256 next = 0; 5257 qpriv->r_tid_ack = next; 5258 if (qp->s_ack_queue[next].opcode != TID_OP(WRITE_REQ)) 5259 break; 5260 nreq = ack_to_tid_req(&qp->s_ack_queue[next]); 5261 if (!nreq->comp_seg || nreq->ack_seg == nreq->comp_seg) 5262 break; 5263 5264 /* Move to the next ack entry now */ 5265 e = &qp->s_ack_queue[qpriv->r_tid_ack]; 5266 req = ack_to_tid_req(e); 5267 } while (1); 5268 5269 /* 5270 * At this point qpriv->r_tid_ack == qpriv->r_tid_tail but e and 5271 * req could be pointing at the previous ack queue entry 5272 */ 5273 if (qpriv->s_nak_state || 5274 (qpriv->resync && 5275 !hfi1_tid_rdma_is_resync_psn(qpriv->r_next_psn_kdeth - 1) && 5276 (cmp_psn(qpriv->r_next_psn_kdeth - 1, 5277 full_flow_psn(&req->flows[flow], 5278 req->flows[flow].flow_state.lpsn)) > 0))) { 5279 /* 5280 * A NAK will implicitly acknowledge all previous TID RDMA 5281 * requests. Therefore, we NAK with the req->acked_tail 5282 * segment for the request at qpriv->r_tid_ack (same at 5283 * this point as the req->clear_tail segment for the 5284 * qpriv->r_tid_tail request) 5285 */ 5286 e = &qp->s_ack_queue[qpriv->r_tid_ack]; 5287 req = ack_to_tid_req(e); 5288 flow = req->acked_tail; 5289 } else if (req->ack_seg == req->total_segs && 5290 qpriv->s_flags & HFI1_R_TID_WAIT_INTERLCK) 5291 qpriv->s_flags &= ~HFI1_R_TID_WAIT_INTERLCK; 5292 5293 trace_hfi1_tid_write_rsp_make_tid_ack(qp); 5294 trace_hfi1_tid_req_make_tid_ack(qp, 0, e->opcode, e->psn, e->lpsn, 5295 req); 5296 hwords += hfi1_build_tid_rdma_write_ack(qp, e, ohdr, flow, &bth1, 5297 &bth2); 5298 len = 0; 5299 qpriv->s_flags &= ~RVT_S_ACK_PENDING; 5300 ps->s_txreq->hdr_dwords = hwords; 5301 ps->s_txreq->sde = qpriv->s_sde; 5302 ps->s_txreq->s_cur_size = len; 5303 ps->s_txreq->ss = NULL; 5304 hfi1_make_ruc_header(qp, ohdr, (TID_OP(ACK) << 24), bth1, bth2, middle, 5305 ps); 5306 ps->s_txreq->txreq.flags |= SDMA_TXREQ_F_VIP; 5307 return 1; 5308 bail: 5309 /* 5310 * Ensure s_rdma_ack_cnt changes are committed prior to resetting 5311 * RVT_S_RESP_PENDING 5312 */ 5313 smp_wmb(); 5314 qpriv->s_flags &= ~RVT_S_ACK_PENDING; 5315 return 0; 5316 } 5317 5318 static int hfi1_send_tid_ok(struct rvt_qp *qp) 5319 { 5320 struct hfi1_qp_priv *priv = qp->priv; 5321 5322 return !(priv->s_flags & RVT_S_BUSY || 5323 qp->s_flags & HFI1_S_ANY_WAIT_IO) && 5324 (verbs_txreq_queued(iowait_get_tid_work(&priv->s_iowait)) || 5325 (priv->s_flags & RVT_S_RESP_PENDING) || 5326 !(qp->s_flags & HFI1_S_ANY_TID_WAIT_SEND)); 5327 } 5328 5329 void _hfi1_do_tid_send(struct work_struct *work) 5330 { 5331 struct iowait_work *w = container_of(work, struct iowait_work, iowork); 5332 struct rvt_qp *qp = iowait_to_qp(w->iow); 5333 5334 hfi1_do_tid_send(qp); 5335 } 5336 5337 static void hfi1_do_tid_send(struct rvt_qp *qp) 5338 { 5339 struct hfi1_pkt_state ps; 5340 struct hfi1_qp_priv *priv = qp->priv; 5341 5342 ps.dev = to_idev(qp->ibqp.device); 5343 ps.ibp = to_iport(qp->ibqp.device, qp->port_num); 5344 ps.ppd = ppd_from_ibp(ps.ibp); 5345 ps.wait = iowait_get_tid_work(&priv->s_iowait); 5346 ps.in_thread = false; 5347 ps.timeout_int = qp->timeout_jiffies / 8; 5348 5349 trace_hfi1_rc_do_tid_send(qp, false); 5350 spin_lock_irqsave(&qp->s_lock, ps.flags); 5351 5352 /* Return if we are already busy processing a work request. */ 5353 if (!hfi1_send_tid_ok(qp)) { 5354 if (qp->s_flags & HFI1_S_ANY_WAIT_IO) 5355 iowait_set_flag(&priv->s_iowait, IOWAIT_PENDING_TID); 5356 spin_unlock_irqrestore(&qp->s_lock, ps.flags); 5357 return; 5358 } 5359 5360 priv->s_flags |= RVT_S_BUSY; 5361 5362 ps.timeout = jiffies + ps.timeout_int; 5363 ps.cpu = priv->s_sde ? priv->s_sde->cpu : 5364 cpumask_first(cpumask_of_node(ps.ppd->dd->node)); 5365 ps.pkts_sent = false; 5366 5367 /* insure a pre-built packet is handled */ 5368 ps.s_txreq = get_waiting_verbs_txreq(ps.wait); 5369 do { 5370 /* Check for a constructed packet to be sent. */ 5371 if (ps.s_txreq) { 5372 if (priv->s_flags & HFI1_S_TID_BUSY_SET) { 5373 qp->s_flags |= RVT_S_BUSY; 5374 ps.wait = iowait_get_ib_work(&priv->s_iowait); 5375 } 5376 spin_unlock_irqrestore(&qp->s_lock, ps.flags); 5377 5378 /* 5379 * If the packet cannot be sent now, return and 5380 * the send tasklet will be woken up later. 5381 */ 5382 if (hfi1_verbs_send(qp, &ps)) 5383 return; 5384 5385 /* allow other tasks to run */ 5386 if (hfi1_schedule_send_yield(qp, &ps, true)) 5387 return; 5388 5389 spin_lock_irqsave(&qp->s_lock, ps.flags); 5390 if (priv->s_flags & HFI1_S_TID_BUSY_SET) { 5391 qp->s_flags &= ~RVT_S_BUSY; 5392 priv->s_flags &= ~HFI1_S_TID_BUSY_SET; 5393 ps.wait = iowait_get_tid_work(&priv->s_iowait); 5394 if (iowait_flag_set(&priv->s_iowait, 5395 IOWAIT_PENDING_IB)) 5396 hfi1_schedule_send(qp); 5397 } 5398 } 5399 } while (hfi1_make_tid_rdma_pkt(qp, &ps)); 5400 iowait_starve_clear(ps.pkts_sent, &priv->s_iowait); 5401 spin_unlock_irqrestore(&qp->s_lock, ps.flags); 5402 } 5403 5404 static bool _hfi1_schedule_tid_send(struct rvt_qp *qp) 5405 { 5406 struct hfi1_qp_priv *priv = qp->priv; 5407 struct hfi1_ibport *ibp = 5408 to_iport(qp->ibqp.device, qp->port_num); 5409 struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); 5410 struct hfi1_devdata *dd = ppd->dd; 5411 5412 if ((dd->flags & HFI1_SHUTDOWN)) 5413 return true; 5414 5415 return iowait_tid_schedule(&priv->s_iowait, ppd->hfi1_wq, 5416 priv->s_sde ? 5417 priv->s_sde->cpu : 5418 cpumask_first(cpumask_of_node(dd->node))); 5419 } 5420 5421 /** 5422 * hfi1_schedule_tid_send - schedule progress on TID RDMA state machine 5423 * @qp: the QP 5424 * 5425 * This schedules qp progress on the TID RDMA state machine. Caller 5426 * should hold the s_lock. 5427 * Unlike hfi1_schedule_send(), this cannot use hfi1_send_ok() because 5428 * the two state machines can step on each other with respect to the 5429 * RVT_S_BUSY flag. 5430 * Therefore, a modified test is used. 5431 * @return true if the second leg is scheduled; 5432 * false if the second leg is not scheduled. 5433 */ 5434 bool hfi1_schedule_tid_send(struct rvt_qp *qp) 5435 { 5436 lockdep_assert_held(&qp->s_lock); 5437 if (hfi1_send_tid_ok(qp)) { 5438 /* 5439 * The following call returns true if the qp is not on the 5440 * queue and false if the qp is already on the queue before 5441 * this call. Either way, the qp will be on the queue when the 5442 * call returns. 5443 */ 5444 _hfi1_schedule_tid_send(qp); 5445 return true; 5446 } 5447 if (qp->s_flags & HFI1_S_ANY_WAIT_IO) 5448 iowait_set_flag(&((struct hfi1_qp_priv *)qp->priv)->s_iowait, 5449 IOWAIT_PENDING_TID); 5450 return false; 5451 } 5452 5453 bool hfi1_tid_rdma_ack_interlock(struct rvt_qp *qp, struct rvt_ack_entry *e) 5454 { 5455 struct rvt_ack_entry *prev; 5456 struct tid_rdma_request *req; 5457 struct hfi1_ibdev *dev = to_idev(qp->ibqp.device); 5458 struct hfi1_qp_priv *priv = qp->priv; 5459 u32 s_prev; 5460 5461 s_prev = qp->s_tail_ack_queue == 0 ? rvt_size_atomic(&dev->rdi) : 5462 (qp->s_tail_ack_queue - 1); 5463 prev = &qp->s_ack_queue[s_prev]; 5464 5465 if ((e->opcode == TID_OP(READ_REQ) || 5466 e->opcode == OP(RDMA_READ_REQUEST)) && 5467 prev->opcode == TID_OP(WRITE_REQ)) { 5468 req = ack_to_tid_req(prev); 5469 if (req->ack_seg != req->total_segs) { 5470 priv->s_flags |= HFI1_R_TID_WAIT_INTERLCK; 5471 return true; 5472 } 5473 } 5474 return false; 5475 } 5476 5477 static u32 read_r_next_psn(struct hfi1_devdata *dd, u8 ctxt, u8 fidx) 5478 { 5479 u64 reg; 5480 5481 /* 5482 * The only sane way to get the amount of 5483 * progress is to read the HW flow state. 5484 */ 5485 reg = read_uctxt_csr(dd, ctxt, RCV_TID_FLOW_TABLE + (8 * fidx)); 5486 return mask_psn(reg); 5487 } 5488 5489 static void tid_rdma_rcv_err(struct hfi1_packet *packet, 5490 struct ib_other_headers *ohdr, 5491 struct rvt_qp *qp, u32 psn, int diff, bool fecn) 5492 { 5493 unsigned long flags; 5494 5495 tid_rdma_rcv_error(packet, ohdr, qp, psn, diff); 5496 if (fecn) { 5497 spin_lock_irqsave(&qp->s_lock, flags); 5498 qp->s_flags |= RVT_S_ECN; 5499 spin_unlock_irqrestore(&qp->s_lock, flags); 5500 } 5501 } 5502 5503 static void update_r_next_psn_fecn(struct hfi1_packet *packet, 5504 struct hfi1_qp_priv *priv, 5505 struct hfi1_ctxtdata *rcd, 5506 struct tid_rdma_flow *flow, 5507 bool fecn) 5508 { 5509 /* 5510 * If a start/middle packet is delivered here due to 5511 * RSM rule and FECN, we need to update the r_next_psn. 5512 */ 5513 if (fecn && packet->etype == RHF_RCV_TYPE_EAGER && 5514 !(priv->s_flags & HFI1_R_TID_SW_PSN)) { 5515 struct hfi1_devdata *dd = rcd->dd; 5516 5517 flow->flow_state.r_next_psn = 5518 read_r_next_psn(dd, rcd->ctxt, flow->idx); 5519 } 5520 } 5521