1 // SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) 2 /* 3 * Copyright(c) 2018 - 2020 Intel Corporation. 4 * 5 */ 6 7 #include "hfi.h" 8 #include "qp.h" 9 #include "rc.h" 10 #include "verbs.h" 11 #include "tid_rdma.h" 12 #include "exp_rcv.h" 13 #include "trace.h" 14 15 /** 16 * DOC: TID RDMA READ protocol 17 * 18 * This is an end-to-end protocol at the hfi1 level between two nodes that 19 * improves performance by avoiding data copy on the requester side. It 20 * converts a qualified RDMA READ request into a TID RDMA READ request on 21 * the requester side and thereafter handles the request and response 22 * differently. To be qualified, the RDMA READ request should meet the 23 * following: 24 * -- The total data length should be greater than 256K; 25 * -- The total data length should be a multiple of 4K page size; 26 * -- Each local scatter-gather entry should be 4K page aligned; 27 * -- Each local scatter-gather entry should be a multiple of 4K page size; 28 */ 29 30 #define RCV_TID_FLOW_TABLE_CTRL_FLOW_VALID_SMASK BIT_ULL(32) 31 #define RCV_TID_FLOW_TABLE_CTRL_HDR_SUPP_EN_SMASK BIT_ULL(33) 32 #define RCV_TID_FLOW_TABLE_CTRL_KEEP_AFTER_SEQ_ERR_SMASK BIT_ULL(34) 33 #define RCV_TID_FLOW_TABLE_CTRL_KEEP_ON_GEN_ERR_SMASK BIT_ULL(35) 34 #define RCV_TID_FLOW_TABLE_STATUS_SEQ_MISMATCH_SMASK BIT_ULL(37) 35 #define RCV_TID_FLOW_TABLE_STATUS_GEN_MISMATCH_SMASK BIT_ULL(38) 36 37 /* Maximum number of packets within a flow generation. */ 38 #define MAX_TID_FLOW_PSN BIT(HFI1_KDETH_BTH_SEQ_SHIFT) 39 40 #define GENERATION_MASK 0xFFFFF 41 42 static u32 mask_generation(u32 a) 43 { 44 return a & GENERATION_MASK; 45 } 46 47 /* Reserved generation value to set to unused flows for kernel contexts */ 48 #define KERN_GENERATION_RESERVED mask_generation(U32_MAX) 49 50 /* 51 * J_KEY for kernel contexts when TID RDMA is used. 52 * See generate_jkey() in hfi.h for more information. 53 */ 54 #define TID_RDMA_JKEY 32 55 #define HFI1_KERNEL_MIN_JKEY HFI1_ADMIN_JKEY_RANGE 56 #define HFI1_KERNEL_MAX_JKEY (2 * HFI1_ADMIN_JKEY_RANGE - 1) 57 58 /* Maximum number of segments in flight per QP request. 
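 *
 * Worked example (editor's note, derived from the definitions just below):
 * MAX_REQ = max(TID_RDMA_MAX_READ_SEGS_PER_REQ, TID_RDMA_MAX_WRITE_SEGS_PER_REQ)
 *         = max(6, 4) = 6,
 * so MAX_FLOWS = roundup_pow_of_two(6 + 1) = 8.  A power-of-two size is what
 * lets the flow circular buffer be indexed with "& (MAX_FLOWS - 1)" and
 * measured with the CIRC_CNT()/CIRC_SPACE() macros later in this file.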
*/ 59 #define TID_RDMA_MAX_READ_SEGS_PER_REQ 6 60 #define TID_RDMA_MAX_WRITE_SEGS_PER_REQ 4 61 #define MAX_REQ max_t(u16, TID_RDMA_MAX_READ_SEGS_PER_REQ, \ 62 TID_RDMA_MAX_WRITE_SEGS_PER_REQ) 63 #define MAX_FLOWS roundup_pow_of_two(MAX_REQ + 1) 64 65 #define MAX_EXPECTED_PAGES (MAX_EXPECTED_BUFFER / PAGE_SIZE) 66 67 #define TID_RDMA_DESTQP_FLOW_SHIFT 11 68 #define TID_RDMA_DESTQP_FLOW_MASK 0x1f 69 70 #define TID_OPFN_QP_CTXT_MASK 0xff 71 #define TID_OPFN_QP_CTXT_SHIFT 56 72 #define TID_OPFN_QP_KDETH_MASK 0xff 73 #define TID_OPFN_QP_KDETH_SHIFT 48 74 #define TID_OPFN_MAX_LEN_MASK 0x7ff 75 #define TID_OPFN_MAX_LEN_SHIFT 37 76 #define TID_OPFN_TIMEOUT_MASK 0x1f 77 #define TID_OPFN_TIMEOUT_SHIFT 32 78 #define TID_OPFN_RESERVED_MASK 0x3f 79 #define TID_OPFN_RESERVED_SHIFT 26 80 #define TID_OPFN_URG_MASK 0x1 81 #define TID_OPFN_URG_SHIFT 25 82 #define TID_OPFN_VER_MASK 0x7 83 #define TID_OPFN_VER_SHIFT 22 84 #define TID_OPFN_JKEY_MASK 0x3f 85 #define TID_OPFN_JKEY_SHIFT 16 86 #define TID_OPFN_MAX_READ_MASK 0x3f 87 #define TID_OPFN_MAX_READ_SHIFT 10 88 #define TID_OPFN_MAX_WRITE_MASK 0x3f 89 #define TID_OPFN_MAX_WRITE_SHIFT 4 90 91 /* 92 * OPFN TID layout 93 * 94 * 63 47 31 15 95 * NNNNNNNNKKKKKKKK MMMMMMMMMMMTTTTT DDDDDDUVVVJJJJJJ RRRRRRWWWWWWCCCC 96 * 3210987654321098 7654321098765432 1098765432109876 5432109876543210 97 * N - the context Number 98 * K - the Kdeth_qp 99 * M - Max_len 100 * T - Timeout 101 * D - reserveD 102 * V - version 103 * U - Urg capable 104 * J - Jkey 105 * R - max_Read 106 * W - max_Write 107 * C - Capcode 108 */ 109 110 static void tid_rdma_trigger_resume(struct work_struct *work); 111 static void hfi1_kern_exp_rcv_free_flows(struct tid_rdma_request *req); 112 static int hfi1_kern_exp_rcv_alloc_flows(struct tid_rdma_request *req, 113 gfp_t gfp); 114 static void hfi1_init_trdma_req(struct rvt_qp *qp, 115 struct tid_rdma_request *req); 116 static void hfi1_tid_write_alloc_resources(struct rvt_qp *qp, bool intr_ctx); 117 static void hfi1_tid_timeout(struct timer_list *t); 118 static void hfi1_add_tid_reap_timer(struct rvt_qp *qp); 119 static void hfi1_mod_tid_reap_timer(struct rvt_qp *qp); 120 static void hfi1_mod_tid_retry_timer(struct rvt_qp *qp); 121 static int hfi1_stop_tid_retry_timer(struct rvt_qp *qp); 122 static void hfi1_tid_retry_timeout(struct timer_list *t); 123 static int make_tid_rdma_ack(struct rvt_qp *qp, 124 struct ib_other_headers *ohdr, 125 struct hfi1_pkt_state *ps); 126 static void hfi1_do_tid_send(struct rvt_qp *qp); 127 static u32 read_r_next_psn(struct hfi1_devdata *dd, u8 ctxt, u8 fidx); 128 static void tid_rdma_rcv_err(struct hfi1_packet *packet, 129 struct ib_other_headers *ohdr, 130 struct rvt_qp *qp, u32 psn, int diff, bool fecn); 131 static void update_r_next_psn_fecn(struct hfi1_packet *packet, 132 struct hfi1_qp_priv *priv, 133 struct hfi1_ctxtdata *rcd, 134 struct tid_rdma_flow *flow, 135 bool fecn); 136 137 static void validate_r_tid_ack(struct hfi1_qp_priv *priv) 138 { 139 if (priv->r_tid_ack == HFI1_QP_WQE_INVALID) 140 priv->r_tid_ack = priv->r_tid_tail; 141 } 142 143 static void tid_rdma_schedule_ack(struct rvt_qp *qp) 144 { 145 struct hfi1_qp_priv *priv = qp->priv; 146 147 priv->s_flags |= RVT_S_ACK_PENDING; 148 hfi1_schedule_tid_send(qp); 149 } 150 151 static void tid_rdma_trigger_ack(struct rvt_qp *qp) 152 { 153 validate_r_tid_ack(qp->priv); 154 tid_rdma_schedule_ack(qp); 155 } 156 157 static u64 tid_rdma_opfn_encode(struct tid_rdma_params *p) 158 { 159 return 160 (((u64)p->qp & TID_OPFN_QP_CTXT_MASK) << 161 TID_OPFN_QP_CTXT_SHIFT) | 
162 ((((u64)p->qp >> 16) & TID_OPFN_QP_KDETH_MASK) << 163 TID_OPFN_QP_KDETH_SHIFT) | 164 (((u64)((p->max_len >> PAGE_SHIFT) - 1) & 165 TID_OPFN_MAX_LEN_MASK) << TID_OPFN_MAX_LEN_SHIFT) | 166 (((u64)p->timeout & TID_OPFN_TIMEOUT_MASK) << 167 TID_OPFN_TIMEOUT_SHIFT) | 168 (((u64)p->urg & TID_OPFN_URG_MASK) << TID_OPFN_URG_SHIFT) | 169 (((u64)p->jkey & TID_OPFN_JKEY_MASK) << TID_OPFN_JKEY_SHIFT) | 170 (((u64)p->max_read & TID_OPFN_MAX_READ_MASK) << 171 TID_OPFN_MAX_READ_SHIFT) | 172 (((u64)p->max_write & TID_OPFN_MAX_WRITE_MASK) << 173 TID_OPFN_MAX_WRITE_SHIFT); 174 } 175 176 static void tid_rdma_opfn_decode(struct tid_rdma_params *p, u64 data) 177 { 178 p->max_len = (((data >> TID_OPFN_MAX_LEN_SHIFT) & 179 TID_OPFN_MAX_LEN_MASK) + 1) << PAGE_SHIFT; 180 p->jkey = (data >> TID_OPFN_JKEY_SHIFT) & TID_OPFN_JKEY_MASK; 181 p->max_write = (data >> TID_OPFN_MAX_WRITE_SHIFT) & 182 TID_OPFN_MAX_WRITE_MASK; 183 p->max_read = (data >> TID_OPFN_MAX_READ_SHIFT) & 184 TID_OPFN_MAX_READ_MASK; 185 p->qp = 186 ((((data >> TID_OPFN_QP_KDETH_SHIFT) & TID_OPFN_QP_KDETH_MASK) 187 << 16) | 188 ((data >> TID_OPFN_QP_CTXT_SHIFT) & TID_OPFN_QP_CTXT_MASK)); 189 p->urg = (data >> TID_OPFN_URG_SHIFT) & TID_OPFN_URG_MASK; 190 p->timeout = (data >> TID_OPFN_TIMEOUT_SHIFT) & TID_OPFN_TIMEOUT_MASK; 191 } 192 193 void tid_rdma_opfn_init(struct rvt_qp *qp, struct tid_rdma_params *p) 194 { 195 struct hfi1_qp_priv *priv = qp->priv; 196 197 p->qp = (RVT_KDETH_QP_PREFIX << 16) | priv->rcd->ctxt; 198 p->max_len = TID_RDMA_MAX_SEGMENT_SIZE; 199 p->jkey = priv->rcd->jkey; 200 p->max_read = TID_RDMA_MAX_READ_SEGS_PER_REQ; 201 p->max_write = TID_RDMA_MAX_WRITE_SEGS_PER_REQ; 202 p->timeout = qp->timeout; 203 p->urg = is_urg_masked(priv->rcd); 204 } 205 206 bool tid_rdma_conn_req(struct rvt_qp *qp, u64 *data) 207 { 208 struct hfi1_qp_priv *priv = qp->priv; 209 210 *data = tid_rdma_opfn_encode(&priv->tid_rdma.local); 211 return true; 212 } 213 214 bool tid_rdma_conn_reply(struct rvt_qp *qp, u64 data) 215 { 216 struct hfi1_qp_priv *priv = qp->priv; 217 struct tid_rdma_params *remote, *old; 218 bool ret = true; 219 220 old = rcu_dereference_protected(priv->tid_rdma.remote, 221 lockdep_is_held(&priv->opfn.lock)); 222 data &= ~0xfULL; 223 /* 224 * If data passed in is zero, return true so as not to continue the 225 * negotiation process 226 */ 227 if (!data || !HFI1_CAP_IS_KSET(TID_RDMA)) 228 goto null; 229 /* 230 * If kzalloc fails, return false. This will result in: 231 * * at the requester a new OPFN request being generated to retry 232 * the negotiation 233 * * at the responder, 0 being returned to the requester so as to 234 * disable TID RDMA at both the requester and the responder 235 */ 236 remote = kzalloc(sizeof(*remote), GFP_ATOMIC); 237 if (!remote) { 238 ret = false; 239 goto null; 240 } 241 242 tid_rdma_opfn_decode(remote, data); 243 priv->tid_timer_timeout_jiffies = 244 usecs_to_jiffies((((4096UL * (1UL << remote->timeout)) / 245 1000UL) << 3) * 7); 246 trace_hfi1_opfn_param(qp, 0, &priv->tid_rdma.local); 247 trace_hfi1_opfn_param(qp, 1, remote); 248 rcu_assign_pointer(priv->tid_rdma.remote, remote); 249 /* 250 * A TID RDMA READ request's segment size is not equal to 251 * remote->max_len only when the request's data length is smaller 252 * than remote->max_len. In that case, there will be only one segment. 253 * Therefore, when priv->pkts_ps is used to calculate req->cur_seg 254 * during retry, it will lead to req->cur_seg = 0, which is exactly 255 * what is expected. 
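	 *
	 * Worked example (editor's illustration, assuming a 4096-byte path
	 * MTU and remote->max_len == 256 KiB): pkts_ps = 262144 / 4096 = 64
	 * packets per segment, and timeout_shift = ilog2(64 - 1) + 1 = 6,
	 * i.e. the ceiling of log2(pkts_ps).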
256 */ 257 priv->pkts_ps = (u16)rvt_div_mtu(qp, remote->max_len); 258 priv->timeout_shift = ilog2(priv->pkts_ps - 1) + 1; 259 goto free; 260 null: 261 RCU_INIT_POINTER(priv->tid_rdma.remote, NULL); 262 priv->timeout_shift = 0; 263 free: 264 if (old) 265 kfree_rcu(old, rcu_head); 266 return ret; 267 } 268 269 bool tid_rdma_conn_resp(struct rvt_qp *qp, u64 *data) 270 { 271 bool ret; 272 273 ret = tid_rdma_conn_reply(qp, *data); 274 *data = 0; 275 /* 276 * If tid_rdma_conn_reply() returns error, set *data as 0 to indicate 277 * TID RDMA could not be enabled. This will result in TID RDMA being 278 * disabled at the requester too. 279 */ 280 if (ret) 281 (void)tid_rdma_conn_req(qp, data); 282 return ret; 283 } 284 285 void tid_rdma_conn_error(struct rvt_qp *qp) 286 { 287 struct hfi1_qp_priv *priv = qp->priv; 288 struct tid_rdma_params *old; 289 290 old = rcu_dereference_protected(priv->tid_rdma.remote, 291 lockdep_is_held(&priv->opfn.lock)); 292 RCU_INIT_POINTER(priv->tid_rdma.remote, NULL); 293 if (old) 294 kfree_rcu(old, rcu_head); 295 } 296 297 /* This is called at context initialization time */ 298 int hfi1_kern_exp_rcv_init(struct hfi1_ctxtdata *rcd, int reinit) 299 { 300 if (reinit) 301 return 0; 302 303 BUILD_BUG_ON(TID_RDMA_JKEY < HFI1_KERNEL_MIN_JKEY); 304 BUILD_BUG_ON(TID_RDMA_JKEY > HFI1_KERNEL_MAX_JKEY); 305 rcd->jkey = TID_RDMA_JKEY; 306 hfi1_set_ctxt_jkey(rcd->dd, rcd, rcd->jkey); 307 return hfi1_alloc_ctxt_rcv_groups(rcd); 308 } 309 310 /** 311 * qp_to_rcd - determine the receive context used by a qp 312 * @qp - the qp 313 * 314 * This routine returns the receive context associated 315 * with a a qp's qpn. 316 * 317 * Returns the context. 318 */ 319 static struct hfi1_ctxtdata *qp_to_rcd(struct rvt_dev_info *rdi, 320 struct rvt_qp *qp) 321 { 322 struct hfi1_ibdev *verbs_dev = container_of(rdi, 323 struct hfi1_ibdev, 324 rdi); 325 struct hfi1_devdata *dd = container_of(verbs_dev, 326 struct hfi1_devdata, 327 verbs_dev); 328 unsigned int ctxt; 329 330 if (qp->ibqp.qp_num == 0) 331 ctxt = 0; 332 else 333 ctxt = hfi1_get_qp_map(dd, qp->ibqp.qp_num >> dd->qos_shift); 334 return dd->rcd[ctxt]; 335 } 336 337 int hfi1_qp_priv_init(struct rvt_dev_info *rdi, struct rvt_qp *qp, 338 struct ib_qp_init_attr *init_attr) 339 { 340 struct hfi1_qp_priv *qpriv = qp->priv; 341 int i, ret; 342 343 qpriv->rcd = qp_to_rcd(rdi, qp); 344 345 spin_lock_init(&qpriv->opfn.lock); 346 INIT_WORK(&qpriv->opfn.opfn_work, opfn_send_conn_request); 347 INIT_WORK(&qpriv->tid_rdma.trigger_work, tid_rdma_trigger_resume); 348 qpriv->flow_state.psn = 0; 349 qpriv->flow_state.index = RXE_NUM_TID_FLOWS; 350 qpriv->flow_state.last_index = RXE_NUM_TID_FLOWS; 351 qpriv->flow_state.generation = KERN_GENERATION_RESERVED; 352 qpriv->s_state = TID_OP(WRITE_RESP); 353 qpriv->s_tid_cur = HFI1_QP_WQE_INVALID; 354 qpriv->s_tid_head = HFI1_QP_WQE_INVALID; 355 qpriv->s_tid_tail = HFI1_QP_WQE_INVALID; 356 qpriv->rnr_nak_state = TID_RNR_NAK_INIT; 357 qpriv->r_tid_head = HFI1_QP_WQE_INVALID; 358 qpriv->r_tid_tail = HFI1_QP_WQE_INVALID; 359 qpriv->r_tid_ack = HFI1_QP_WQE_INVALID; 360 qpriv->r_tid_alloc = HFI1_QP_WQE_INVALID; 361 atomic_set(&qpriv->n_requests, 0); 362 atomic_set(&qpriv->n_tid_requests, 0); 363 timer_setup(&qpriv->s_tid_timer, hfi1_tid_timeout, 0); 364 timer_setup(&qpriv->s_tid_retry_timer, hfi1_tid_retry_timeout, 0); 365 INIT_LIST_HEAD(&qpriv->tid_wait); 366 367 if (init_attr->qp_type == IB_QPT_RC && HFI1_CAP_IS_KSET(TID_RDMA)) { 368 struct hfi1_devdata *dd = qpriv->rcd->dd; 369 370 qpriv->pages = 
kzalloc_node(TID_RDMA_MAX_PAGES * 371 sizeof(*qpriv->pages), 372 GFP_KERNEL, dd->node); 373 if (!qpriv->pages) 374 return -ENOMEM; 375 for (i = 0; i < qp->s_size; i++) { 376 struct hfi1_swqe_priv *priv; 377 struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, i); 378 379 priv = kzalloc_node(sizeof(*priv), GFP_KERNEL, 380 dd->node); 381 if (!priv) 382 return -ENOMEM; 383 384 hfi1_init_trdma_req(qp, &priv->tid_req); 385 priv->tid_req.e.swqe = wqe; 386 wqe->priv = priv; 387 } 388 for (i = 0; i < rvt_max_atomic(rdi); i++) { 389 struct hfi1_ack_priv *priv; 390 391 priv = kzalloc_node(sizeof(*priv), GFP_KERNEL, 392 dd->node); 393 if (!priv) 394 return -ENOMEM; 395 396 hfi1_init_trdma_req(qp, &priv->tid_req); 397 priv->tid_req.e.ack = &qp->s_ack_queue[i]; 398 399 ret = hfi1_kern_exp_rcv_alloc_flows(&priv->tid_req, 400 GFP_KERNEL); 401 if (ret) { 402 kfree(priv); 403 return ret; 404 } 405 qp->s_ack_queue[i].priv = priv; 406 } 407 } 408 409 return 0; 410 } 411 412 void hfi1_qp_priv_tid_free(struct rvt_dev_info *rdi, struct rvt_qp *qp) 413 { 414 struct hfi1_qp_priv *qpriv = qp->priv; 415 struct rvt_swqe *wqe; 416 u32 i; 417 418 if (qp->ibqp.qp_type == IB_QPT_RC && HFI1_CAP_IS_KSET(TID_RDMA)) { 419 for (i = 0; i < qp->s_size; i++) { 420 wqe = rvt_get_swqe_ptr(qp, i); 421 kfree(wqe->priv); 422 wqe->priv = NULL; 423 } 424 for (i = 0; i < rvt_max_atomic(rdi); i++) { 425 struct hfi1_ack_priv *priv = qp->s_ack_queue[i].priv; 426 427 if (priv) 428 hfi1_kern_exp_rcv_free_flows(&priv->tid_req); 429 kfree(priv); 430 qp->s_ack_queue[i].priv = NULL; 431 } 432 cancel_work_sync(&qpriv->opfn.opfn_work); 433 kfree(qpriv->pages); 434 qpriv->pages = NULL; 435 } 436 } 437 438 /* Flow and tid waiter functions */ 439 /** 440 * DOC: lock ordering 441 * 442 * There are two locks involved with the queuing 443 * routines: the qp s_lock and the exp_lock. 444 * 445 * Since the tid space allocation is called from 446 * the send engine, the qp s_lock is already held. 447 * 448 * The allocation routines will get the exp_lock. 449 * 450 * The first_qp() call is provided to allow the head of 451 * the rcd wait queue to be fetched under the exp_lock and 452 * followed by a drop of the exp_lock. 453 * 454 * Any qp in the wait list will have the qp reference count held 455 * to hold the qp in memory. 456 */ 457 458 /* 459 * return head of rcd wait list 460 * 461 * Must hold the exp_lock. 462 * 463 * Get a reference to the QP to hold the QP in memory. 464 * 465 * The caller must release the reference when the local 466 * is no longer being used. 467 */ 468 static struct rvt_qp *first_qp(struct hfi1_ctxtdata *rcd, 469 struct tid_queue *queue) 470 __must_hold(&rcd->exp_lock) 471 { 472 struct hfi1_qp_priv *priv; 473 474 lockdep_assert_held(&rcd->exp_lock); 475 priv = list_first_entry_or_null(&queue->queue_head, 476 struct hfi1_qp_priv, 477 tid_wait); 478 if (!priv) 479 return NULL; 480 rvt_get_qp(priv->owner); 481 return priv->owner; 482 } 483 484 /** 485 * kernel_tid_waiters - determine rcd wait 486 * @rcd: the receive context 487 * @qp: the head of the qp being processed 488 * 489 * This routine will return false IFF 490 * the list is NULL or the head of the 491 * list is the indicated qp. 492 * 493 * Must hold the qp s_lock and the exp_lock. 494 * 495 * Return: 496 * false if either of the conditions below are satisfied: 497 * 1. The list is empty or 498 * 2. The indicated qp is at the head of the list and the 499 * HFI1_S_WAIT_TID_SPACE bit is set in qp->s_flags. 500 * true is returned otherwise. 
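 *
 * Minimal usage sketch (editor's illustration of the pattern used by the
 * allocation paths below; both locks are assumed to be held already):
 *
 *	spin_lock_irqsave(&rcd->exp_lock, flags);
 *	if (kernel_tid_waiters(rcd, &rcd->flow_queue, qp))
 *		goto queue;
 *	(reserve the resource, dequeue this qp, then wake the next waiter)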
 */
static bool kernel_tid_waiters(struct hfi1_ctxtdata *rcd,
			       struct tid_queue *queue, struct rvt_qp *qp)
	__must_hold(&rcd->exp_lock) __must_hold(&qp->s_lock)
{
	struct rvt_qp *fqp;
	bool ret = true;

	lockdep_assert_held(&qp->s_lock);
	lockdep_assert_held(&rcd->exp_lock);
	fqp = first_qp(rcd, queue);
	if (!fqp || (fqp == qp && (qp->s_flags & HFI1_S_WAIT_TID_SPACE)))
		ret = false;
	rvt_put_qp(fqp);
	return ret;
}

/**
 * dequeue_tid_waiter - dequeue the qp from the list
 * @qp - the qp to remove from the wait list
 *
 * This routine removes the indicated qp from the
 * wait list if it is there.
 *
 * This should be done after the hardware flow and
 * tid array resources have been allocated.
 *
 * Must hold the qp s_lock and the rcd exp_lock.
 *
 * It assumes the s_lock to protect the s_flags
 * field and to reliably test the HFI1_S_WAIT_TID_SPACE flag.
 */
static void dequeue_tid_waiter(struct hfi1_ctxtdata *rcd,
			       struct tid_queue *queue, struct rvt_qp *qp)
	__must_hold(&rcd->exp_lock) __must_hold(&qp->s_lock)
{
	struct hfi1_qp_priv *priv = qp->priv;

	lockdep_assert_held(&qp->s_lock);
	lockdep_assert_held(&rcd->exp_lock);
	if (list_empty(&priv->tid_wait))
		return;
	list_del_init(&priv->tid_wait);
	qp->s_flags &= ~HFI1_S_WAIT_TID_SPACE;
	queue->dequeue++;
	rvt_put_qp(qp);
}

/**
 * queue_qp_for_tid_wait - suspend QP on tid space
 * @rcd: the receive context
 * @qp: the qp
 *
 * The qp is inserted at the tail of the rcd
 * wait queue and the HFI1_S_WAIT_TID_SPACE s_flag is set.
 *
 * Must hold the qp s_lock and the exp_lock.
 */
static void queue_qp_for_tid_wait(struct hfi1_ctxtdata *rcd,
				  struct tid_queue *queue, struct rvt_qp *qp)
	__must_hold(&rcd->exp_lock) __must_hold(&qp->s_lock)
{
	struct hfi1_qp_priv *priv = qp->priv;

	lockdep_assert_held(&qp->s_lock);
	lockdep_assert_held(&rcd->exp_lock);
	if (list_empty(&priv->tid_wait)) {
		qp->s_flags |= HFI1_S_WAIT_TID_SPACE;
		list_add_tail(&priv->tid_wait, &queue->queue_head);
		priv->tid_enqueue = ++queue->enqueue;
		rcd->dd->verbs_dev.n_tidwait++;
		trace_hfi1_qpsleep(qp, HFI1_S_WAIT_TID_SPACE);
		rvt_get_qp(qp);
	}
}

/**
 * __trigger_tid_waiter - trigger tid waiter
 * @qp: the qp
 *
 * This is a private entrance to schedule the qp
 * assuming the caller is holding the qp->s_lock.
 */
static void __trigger_tid_waiter(struct rvt_qp *qp)
	__must_hold(&qp->s_lock)
{
	lockdep_assert_held(&qp->s_lock);
	if (!(qp->s_flags & HFI1_S_WAIT_TID_SPACE))
		return;
	trace_hfi1_qpwakeup(qp, HFI1_S_WAIT_TID_SPACE);
	hfi1_schedule_send(qp);
}

/**
 * tid_rdma_schedule_tid_wakeup - schedule wakeup for a qp
 * @qp - the qp
 *
 * Trigger a schedule of a waiting qp in a deadlock
 * safe manner. The qp reference is held prior
 * to this call via first_qp().
 *
 * If the qp trigger was already scheduled (!rval)
 * the reference is dropped, otherwise the resume
 * or the destroy cancel will dispatch the reference.
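 *
 * Typical caller sequence (editor's sketch, mirroring the allocation
 * paths below):
 *
 *	fqp = first_qp(rcd, &rcd->flow_queue);		(under exp_lock)
 *	spin_unlock_irqrestore(&rcd->exp_lock, flags);
 *	tid_rdma_schedule_tid_wakeup(fqp);		(reference handed off)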
605 */ 606 static void tid_rdma_schedule_tid_wakeup(struct rvt_qp *qp) 607 { 608 struct hfi1_qp_priv *priv; 609 struct hfi1_ibport *ibp; 610 struct hfi1_pportdata *ppd; 611 struct hfi1_devdata *dd; 612 bool rval; 613 614 if (!qp) 615 return; 616 617 priv = qp->priv; 618 ibp = to_iport(qp->ibqp.device, qp->port_num); 619 ppd = ppd_from_ibp(ibp); 620 dd = dd_from_ibdev(qp->ibqp.device); 621 622 rval = queue_work_on(priv->s_sde ? 623 priv->s_sde->cpu : 624 cpumask_first(cpumask_of_node(dd->node)), 625 ppd->hfi1_wq, 626 &priv->tid_rdma.trigger_work); 627 if (!rval) 628 rvt_put_qp(qp); 629 } 630 631 /** 632 * tid_rdma_trigger_resume - field a trigger work request 633 * @work - the work item 634 * 635 * Complete the off qp trigger processing by directly 636 * calling the progress routine. 637 */ 638 static void tid_rdma_trigger_resume(struct work_struct *work) 639 { 640 struct tid_rdma_qp_params *tr; 641 struct hfi1_qp_priv *priv; 642 struct rvt_qp *qp; 643 644 tr = container_of(work, struct tid_rdma_qp_params, trigger_work); 645 priv = container_of(tr, struct hfi1_qp_priv, tid_rdma); 646 qp = priv->owner; 647 spin_lock_irq(&qp->s_lock); 648 if (qp->s_flags & HFI1_S_WAIT_TID_SPACE) { 649 spin_unlock_irq(&qp->s_lock); 650 hfi1_do_send(priv->owner, true); 651 } else { 652 spin_unlock_irq(&qp->s_lock); 653 } 654 rvt_put_qp(qp); 655 } 656 657 /** 658 * tid_rdma_flush_wait - unwind any tid space wait 659 * 660 * This is called when resetting a qp to 661 * allow a destroy or reset to get rid 662 * of any tid space linkage and reference counts. 663 */ 664 static void _tid_rdma_flush_wait(struct rvt_qp *qp, struct tid_queue *queue) 665 __must_hold(&qp->s_lock) 666 { 667 struct hfi1_qp_priv *priv; 668 669 if (!qp) 670 return; 671 lockdep_assert_held(&qp->s_lock); 672 priv = qp->priv; 673 qp->s_flags &= ~HFI1_S_WAIT_TID_SPACE; 674 spin_lock(&priv->rcd->exp_lock); 675 if (!list_empty(&priv->tid_wait)) { 676 list_del_init(&priv->tid_wait); 677 qp->s_flags &= ~HFI1_S_WAIT_TID_SPACE; 678 queue->dequeue++; 679 rvt_put_qp(qp); 680 } 681 spin_unlock(&priv->rcd->exp_lock); 682 } 683 684 void hfi1_tid_rdma_flush_wait(struct rvt_qp *qp) 685 __must_hold(&qp->s_lock) 686 { 687 struct hfi1_qp_priv *priv = qp->priv; 688 689 _tid_rdma_flush_wait(qp, &priv->rcd->flow_queue); 690 _tid_rdma_flush_wait(qp, &priv->rcd->rarr_queue); 691 } 692 693 /* Flow functions */ 694 /** 695 * kern_reserve_flow - allocate a hardware flow 696 * @rcd - the context to use for allocation 697 * @last - the index of the preferred flow. Use RXE_NUM_TID_FLOWS to 698 * signify "don't care". 699 * 700 * Use a bit mask based allocation to reserve a hardware 701 * flow for use in receiving KDETH data packets. If a preferred flow is 702 * specified the function will attempt to reserve that flow again, if 703 * available. 704 * 705 * The exp_lock must be held. 
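 *
 * Worked example (editor's illustration): with rcd->flow_mask == 0x5
 * (flows 0 and 2 busy) and last == 2, the preferred index is already
 * taken, so ffz() selects flow 1, its bit is set and 1 is returned.
 * Only when all RXE_NUM_TID_FLOWS bits are already set does the
 * function fail with -EAGAIN.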
 *
 * Return:
 * On success: a positive value between 0 and RXE_NUM_TID_FLOWS - 1
 * On failure: -EAGAIN
 */
static int kern_reserve_flow(struct hfi1_ctxtdata *rcd, int last)
	__must_hold(&rcd->exp_lock)
{
	int nr;

	/* Attempt to reserve the preferred flow index */
	if (last >= 0 && last < RXE_NUM_TID_FLOWS &&
	    !test_and_set_bit(last, &rcd->flow_mask))
		return last;

	nr = ffz(rcd->flow_mask);
	BUILD_BUG_ON(RXE_NUM_TID_FLOWS >=
		     (sizeof(rcd->flow_mask) * BITS_PER_BYTE));
	if (nr > (RXE_NUM_TID_FLOWS - 1))
		return -EAGAIN;
	set_bit(nr, &rcd->flow_mask);
	return nr;
}

static void kern_set_hw_flow(struct hfi1_ctxtdata *rcd, u32 generation,
			     u32 flow_idx)
{
	u64 reg;

	reg = ((u64)generation << HFI1_KDETH_BTH_SEQ_SHIFT) |
		RCV_TID_FLOW_TABLE_CTRL_FLOW_VALID_SMASK |
		RCV_TID_FLOW_TABLE_CTRL_KEEP_AFTER_SEQ_ERR_SMASK |
		RCV_TID_FLOW_TABLE_CTRL_KEEP_ON_GEN_ERR_SMASK |
		RCV_TID_FLOW_TABLE_STATUS_SEQ_MISMATCH_SMASK |
		RCV_TID_FLOW_TABLE_STATUS_GEN_MISMATCH_SMASK;

	if (generation != KERN_GENERATION_RESERVED)
		reg |= RCV_TID_FLOW_TABLE_CTRL_HDR_SUPP_EN_SMASK;

	write_uctxt_csr(rcd->dd, rcd->ctxt,
			RCV_TID_FLOW_TABLE + 8 * flow_idx, reg);
}

static u32 kern_setup_hw_flow(struct hfi1_ctxtdata *rcd, u32 flow_idx)
	__must_hold(&rcd->exp_lock)
{
	u32 generation = rcd->flows[flow_idx].generation;

	kern_set_hw_flow(rcd, generation, flow_idx);
	return generation;
}

static u32 kern_flow_generation_next(u32 gen)
{
	u32 generation = mask_generation(gen + 1);

	if (generation == KERN_GENERATION_RESERVED)
		generation = mask_generation(generation + 1);
	return generation;
}

static void kern_clear_hw_flow(struct hfi1_ctxtdata *rcd, u32 flow_idx)
	__must_hold(&rcd->exp_lock)
{
	rcd->flows[flow_idx].generation =
		kern_flow_generation_next(rcd->flows[flow_idx].generation);
	kern_set_hw_flow(rcd, KERN_GENERATION_RESERVED, flow_idx);
}

int hfi1_kern_setup_hw_flow(struct hfi1_ctxtdata *rcd, struct rvt_qp *qp)
{
	struct hfi1_qp_priv *qpriv = (struct hfi1_qp_priv *)qp->priv;
	struct tid_flow_state *fs = &qpriv->flow_state;
	struct rvt_qp *fqp;
	unsigned long flags;
	int ret = 0;

	/* The QP already has an allocated flow */
	if (fs->index != RXE_NUM_TID_FLOWS)
		return ret;

	spin_lock_irqsave(&rcd->exp_lock, flags);
	if (kernel_tid_waiters(rcd, &rcd->flow_queue, qp))
		goto queue;

	ret = kern_reserve_flow(rcd, fs->last_index);
	if (ret < 0)
		goto queue;
	fs->index = ret;
	fs->last_index = fs->index;

	/* Generation received in a RESYNC overrides default flow generation */
	if (fs->generation != KERN_GENERATION_RESERVED)
		rcd->flows[fs->index].generation = fs->generation;
	fs->generation = kern_setup_hw_flow(rcd, fs->index);
	fs->psn = 0;
	dequeue_tid_waiter(rcd, &rcd->flow_queue, qp);
	/* get head before dropping lock */
	fqp = first_qp(rcd, &rcd->flow_queue);
	spin_unlock_irqrestore(&rcd->exp_lock, flags);

	tid_rdma_schedule_tid_wakeup(fqp);
	return 0;
queue:
	queue_qp_for_tid_wait(rcd, &rcd->flow_queue, qp);
	spin_unlock_irqrestore(&rcd->exp_lock, flags);
	return -EAGAIN;
}

void hfi1_kern_clear_hw_flow(struct hfi1_ctxtdata *rcd, struct rvt_qp *qp)
{
	struct hfi1_qp_priv *qpriv = (struct hfi1_qp_priv *)qp->priv;
	struct tid_flow_state *fs =
		&qpriv->flow_state;
	struct rvt_qp *fqp;
	unsigned long flags;

	if (fs->index >= RXE_NUM_TID_FLOWS)
		return;
	spin_lock_irqsave(&rcd->exp_lock, flags);
	kern_clear_hw_flow(rcd, fs->index);
	clear_bit(fs->index, &rcd->flow_mask);
	fs->index = RXE_NUM_TID_FLOWS;
	fs->psn = 0;
	fs->generation = KERN_GENERATION_RESERVED;

	/* get head before dropping lock */
	fqp = first_qp(rcd, &rcd->flow_queue);
	spin_unlock_irqrestore(&rcd->exp_lock, flags);

	if (fqp == qp) {
		__trigger_tid_waiter(fqp);
		rvt_put_qp(fqp);
	} else {
		tid_rdma_schedule_tid_wakeup(fqp);
	}
}

void hfi1_kern_init_ctxt_generations(struct hfi1_ctxtdata *rcd)
{
	int i;

	for (i = 0; i < RXE_NUM_TID_FLOWS; i++) {
		rcd->flows[i].generation = mask_generation(prandom_u32());
		kern_set_hw_flow(rcd, KERN_GENERATION_RESERVED, i);
	}
}

/* TID allocation functions */
static u8 trdma_pset_order(struct tid_rdma_pageset *s)
{
	u8 count = s->count;

	return ilog2(count) + 1;
}

/**
 * tid_rdma_find_phys_blocks_4k - get groups based on mr info
 * @npages - number of pages
 * @pages - pointer to an array of page structs
 * @list - page set array to return
 *
 * This routine returns the number of groups associated with
 * the current sge information. This implementation is based
 * on the expected receive find_phys_blocks() adjusted to
 * use the MR information vs. the pfn.
 *
 * Return:
 * the number of RcvArray entries
 */
static u32 tid_rdma_find_phys_blocks_4k(struct tid_rdma_flow *flow,
					struct page **pages,
					u32 npages,
					struct tid_rdma_pageset *list)
{
	u32 pagecount, pageidx, setcount = 0, i;
	void *vaddr, *this_vaddr;

	if (!npages)
		return 0;

	/*
	 * Look for sets of physically contiguous pages in the user buffer.
	 * This will allow us to optimize Expected RcvArray entry usage by
	 * using the bigger supported sizes.
	 */
	vaddr = page_address(pages[0]);
	trace_hfi1_tid_flow_page(flow->req->qp, flow, 0, 0, 0, vaddr);
	for (pageidx = 0, pagecount = 1, i = 1; i <= npages; i++) {
		this_vaddr = i < npages ? page_address(pages[i]) : NULL;
		trace_hfi1_tid_flow_page(flow->req->qp, flow, i, 0, 0,
					 this_vaddr);
		/*
		 * If the vaddr's are not sequential, pages are not physically
		 * contiguous.
		 */
		if (this_vaddr != (vaddr + PAGE_SIZE)) {
			/*
			 * At this point we have to loop over the set of
			 * physically contiguous pages and break them down into
			 * sizes supported by the HW.
			 * There are two main constraints:
			 * 1. The max buffer size is MAX_EXPECTED_BUFFER.
			 *    If the total set size is bigger than that
			 *    program only a MAX_EXPECTED_BUFFER chunk.
			 * 2. The buffer size has to be a power of two. If
			 *    it is not, round down to the closest power of
			 *    2 and program that size.
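			 *
			 * Worked example (editor's illustration): a run of 9
			 * physically contiguous 4 KiB pages (36 KiB) is not a
			 * power of two, so it is emitted as an 8-page / 32 KiB
			 * pageset followed by a 1-page / 4 KiB pageset by the
			 * loop below.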
913 */ 914 while (pagecount) { 915 int maxpages = pagecount; 916 u32 bufsize = pagecount * PAGE_SIZE; 917 918 if (bufsize > MAX_EXPECTED_BUFFER) 919 maxpages = 920 MAX_EXPECTED_BUFFER >> 921 PAGE_SHIFT; 922 else if (!is_power_of_2(bufsize)) 923 maxpages = 924 rounddown_pow_of_two(bufsize) >> 925 PAGE_SHIFT; 926 927 list[setcount].idx = pageidx; 928 list[setcount].count = maxpages; 929 trace_hfi1_tid_pageset(flow->req->qp, setcount, 930 list[setcount].idx, 931 list[setcount].count); 932 pagecount -= maxpages; 933 pageidx += maxpages; 934 setcount++; 935 } 936 pageidx = i; 937 pagecount = 1; 938 vaddr = this_vaddr; 939 } else { 940 vaddr += PAGE_SIZE; 941 pagecount++; 942 } 943 } 944 /* insure we always return an even number of sets */ 945 if (setcount & 1) 946 list[setcount++].count = 0; 947 return setcount; 948 } 949 950 /** 951 * tid_flush_pages - dump out pages into pagesets 952 * @list - list of pagesets 953 * @idx - pointer to current page index 954 * @pages - number of pages to dump 955 * @sets - current number of pagesset 956 * 957 * This routine flushes out accumuated pages. 958 * 959 * To insure an even number of sets the 960 * code may add a filler. 961 * 962 * This can happen with when pages is not 963 * a power of 2 or pages is a power of 2 964 * less than the maximum pages. 965 * 966 * Return: 967 * The new number of sets 968 */ 969 970 static u32 tid_flush_pages(struct tid_rdma_pageset *list, 971 u32 *idx, u32 pages, u32 sets) 972 { 973 while (pages) { 974 u32 maxpages = pages; 975 976 if (maxpages > MAX_EXPECTED_PAGES) 977 maxpages = MAX_EXPECTED_PAGES; 978 else if (!is_power_of_2(maxpages)) 979 maxpages = rounddown_pow_of_two(maxpages); 980 list[sets].idx = *idx; 981 list[sets++].count = maxpages; 982 *idx += maxpages; 983 pages -= maxpages; 984 } 985 /* might need a filler */ 986 if (sets & 1) 987 list[sets++].count = 0; 988 return sets; 989 } 990 991 /** 992 * tid_rdma_find_phys_blocks_8k - get groups base on mr info 993 * @pages - pointer to an array of page structs 994 * @npages - number of pages 995 * @list - page set array to return 996 * 997 * This routine parses an array of pages to compute pagesets 998 * in an 8k compatible way. 999 * 1000 * pages are tested two at a time, i, i + 1 for contiguous 1001 * pages and i - 1 and i contiguous pages. 1002 * 1003 * If any condition is false, any accumlated pages are flushed and 1004 * v0,v1 are emitted as separate PAGE_SIZE pagesets 1005 * 1006 * Otherwise, the current 8k is totaled for a future flush. 1007 * 1008 * Return: 1009 * The number of pagesets 1010 * list set with the returned number of pagesets 1011 * 1012 */ 1013 static u32 tid_rdma_find_phys_blocks_8k(struct tid_rdma_flow *flow, 1014 struct page **pages, 1015 u32 npages, 1016 struct tid_rdma_pageset *list) 1017 { 1018 u32 idx, sets = 0, i; 1019 u32 pagecnt = 0; 1020 void *v0, *v1, *vm1; 1021 1022 if (!npages) 1023 return 0; 1024 for (idx = 0, i = 0, vm1 = NULL; i < npages; i += 2) { 1025 /* get a new v0 */ 1026 v0 = page_address(pages[i]); 1027 trace_hfi1_tid_flow_page(flow->req->qp, flow, i, 1, 0, v0); 1028 v1 = i + 1 < npages ? 
1029 page_address(pages[i + 1]) : NULL; 1030 trace_hfi1_tid_flow_page(flow->req->qp, flow, i, 1, 1, v1); 1031 /* compare i, i + 1 vaddr */ 1032 if (v1 != (v0 + PAGE_SIZE)) { 1033 /* flush out pages */ 1034 sets = tid_flush_pages(list, &idx, pagecnt, sets); 1035 /* output v0,v1 as two pagesets */ 1036 list[sets].idx = idx++; 1037 list[sets++].count = 1; 1038 if (v1) { 1039 list[sets].count = 1; 1040 list[sets++].idx = idx++; 1041 } else { 1042 list[sets++].count = 0; 1043 } 1044 vm1 = NULL; 1045 pagecnt = 0; 1046 continue; 1047 } 1048 /* i,i+1 consecutive, look at i-1,i */ 1049 if (vm1 && v0 != (vm1 + PAGE_SIZE)) { 1050 /* flush out pages */ 1051 sets = tid_flush_pages(list, &idx, pagecnt, sets); 1052 pagecnt = 0; 1053 } 1054 /* pages will always be a multiple of 8k */ 1055 pagecnt += 2; 1056 /* save i-1 */ 1057 vm1 = v1; 1058 /* move to next pair */ 1059 } 1060 /* dump residual pages at end */ 1061 sets = tid_flush_pages(list, &idx, npages - idx, sets); 1062 /* by design cannot be odd sets */ 1063 WARN_ON(sets & 1); 1064 return sets; 1065 } 1066 1067 /** 1068 * Find pages for one segment of a sge array represented by @ss. The function 1069 * does not check the sge, the sge must have been checked for alignment with a 1070 * prior call to hfi1_kern_trdma_ok. Other sge checking is done as part of 1071 * rvt_lkey_ok and rvt_rkey_ok. Also, the function only modifies the local sge 1072 * copy maintained in @ss->sge, the original sge is not modified. 1073 * 1074 * Unlike IB RDMA WRITE, we can't decrement ss->num_sge here because we are not 1075 * releasing the MR reference count at the same time. Otherwise, we'll "leak" 1076 * references to the MR. This difference requires that we keep track of progress 1077 * into the sg_list. This is done by the cur_seg cursor in the tid_rdma_request 1078 * structure. 1079 */ 1080 static u32 kern_find_pages(struct tid_rdma_flow *flow, 1081 struct page **pages, 1082 struct rvt_sge_state *ss, bool *last) 1083 { 1084 struct tid_rdma_request *req = flow->req; 1085 struct rvt_sge *sge = &ss->sge; 1086 u32 length = flow->req->seg_len; 1087 u32 len = PAGE_SIZE; 1088 u32 i = 0; 1089 1090 while (length && req->isge < ss->num_sge) { 1091 pages[i++] = virt_to_page(sge->vaddr); 1092 1093 sge->vaddr += len; 1094 sge->length -= len; 1095 sge->sge_length -= len; 1096 if (!sge->sge_length) { 1097 if (++req->isge < ss->num_sge) 1098 *sge = ss->sg_list[req->isge - 1]; 1099 } else if (sge->length == 0 && sge->mr->lkey) { 1100 if (++sge->n >= RVT_SEGSZ) { 1101 ++sge->m; 1102 sge->n = 0; 1103 } 1104 sge->vaddr = sge->mr->map[sge->m]->segs[sge->n].vaddr; 1105 sge->length = sge->mr->map[sge->m]->segs[sge->n].length; 1106 } 1107 length -= len; 1108 } 1109 1110 flow->length = flow->req->seg_len - length; 1111 *last = req->isge == ss->num_sge ? 
false : true; 1112 return i; 1113 } 1114 1115 static void dma_unmap_flow(struct tid_rdma_flow *flow) 1116 { 1117 struct hfi1_devdata *dd; 1118 int i; 1119 struct tid_rdma_pageset *pset; 1120 1121 dd = flow->req->rcd->dd; 1122 for (i = 0, pset = &flow->pagesets[0]; i < flow->npagesets; 1123 i++, pset++) { 1124 if (pset->count && pset->addr) { 1125 dma_unmap_page(&dd->pcidev->dev, 1126 pset->addr, 1127 PAGE_SIZE * pset->count, 1128 DMA_FROM_DEVICE); 1129 pset->mapped = 0; 1130 } 1131 } 1132 } 1133 1134 static int dma_map_flow(struct tid_rdma_flow *flow, struct page **pages) 1135 { 1136 int i; 1137 struct hfi1_devdata *dd = flow->req->rcd->dd; 1138 struct tid_rdma_pageset *pset; 1139 1140 for (i = 0, pset = &flow->pagesets[0]; i < flow->npagesets; 1141 i++, pset++) { 1142 if (pset->count) { 1143 pset->addr = dma_map_page(&dd->pcidev->dev, 1144 pages[pset->idx], 1145 0, 1146 PAGE_SIZE * pset->count, 1147 DMA_FROM_DEVICE); 1148 1149 if (dma_mapping_error(&dd->pcidev->dev, pset->addr)) { 1150 dma_unmap_flow(flow); 1151 return -ENOMEM; 1152 } 1153 pset->mapped = 1; 1154 } 1155 } 1156 return 0; 1157 } 1158 1159 static inline bool dma_mapped(struct tid_rdma_flow *flow) 1160 { 1161 return !!flow->pagesets[0].mapped; 1162 } 1163 1164 /* 1165 * Get pages pointers and identify contiguous physical memory chunks for a 1166 * segment. All segments are of length flow->req->seg_len. 1167 */ 1168 static int kern_get_phys_blocks(struct tid_rdma_flow *flow, 1169 struct page **pages, 1170 struct rvt_sge_state *ss, bool *last) 1171 { 1172 u8 npages; 1173 1174 /* Reuse previously computed pagesets, if any */ 1175 if (flow->npagesets) { 1176 trace_hfi1_tid_flow_alloc(flow->req->qp, flow->req->setup_head, 1177 flow); 1178 if (!dma_mapped(flow)) 1179 return dma_map_flow(flow, pages); 1180 return 0; 1181 } 1182 1183 npages = kern_find_pages(flow, pages, ss, last); 1184 1185 if (flow->req->qp->pmtu == enum_to_mtu(OPA_MTU_4096)) 1186 flow->npagesets = 1187 tid_rdma_find_phys_blocks_4k(flow, pages, npages, 1188 flow->pagesets); 1189 else 1190 flow->npagesets = 1191 tid_rdma_find_phys_blocks_8k(flow, pages, npages, 1192 flow->pagesets); 1193 1194 return dma_map_flow(flow, pages); 1195 } 1196 1197 static inline void kern_add_tid_node(struct tid_rdma_flow *flow, 1198 struct hfi1_ctxtdata *rcd, char *s, 1199 struct tid_group *grp, u8 cnt) 1200 { 1201 struct kern_tid_node *node = &flow->tnode[flow->tnode_cnt++]; 1202 1203 WARN_ON_ONCE(flow->tnode_cnt >= 1204 (TID_RDMA_MAX_SEGMENT_SIZE >> PAGE_SHIFT)); 1205 if (WARN_ON_ONCE(cnt & 1)) 1206 dd_dev_err(rcd->dd, 1207 "unexpected odd allocation cnt %u map 0x%x used %u", 1208 cnt, grp->map, grp->used); 1209 1210 node->grp = grp; 1211 node->map = grp->map; 1212 node->cnt = cnt; 1213 trace_hfi1_tid_node_add(flow->req->qp, s, flow->tnode_cnt - 1, 1214 grp->base, grp->map, grp->used, cnt); 1215 } 1216 1217 /* 1218 * Try to allocate pageset_count TID's from TID groups for a context 1219 * 1220 * This function allocates TID's without moving groups between lists or 1221 * modifying grp->map. This is done as follows, being cogizant of the lists 1222 * between which the TID groups will move: 1223 * 1. First allocate complete groups of 8 TID's since this is more efficient, 1224 * these groups will move from group->full without affecting used 1225 * 2. If more TID's are needed allocate from used (will move from used->full or 1226 * stay in used) 1227 * 3. 
If we still don't have the required number of TID's go back and look again 1228 * at a complete group (will move from group->used) 1229 */ 1230 static int kern_alloc_tids(struct tid_rdma_flow *flow) 1231 { 1232 struct hfi1_ctxtdata *rcd = flow->req->rcd; 1233 struct hfi1_devdata *dd = rcd->dd; 1234 u32 ngroups, pageidx = 0; 1235 struct tid_group *group = NULL, *used; 1236 u8 use; 1237 1238 flow->tnode_cnt = 0; 1239 ngroups = flow->npagesets / dd->rcv_entries.group_size; 1240 if (!ngroups) 1241 goto used_list; 1242 1243 /* First look at complete groups */ 1244 list_for_each_entry(group, &rcd->tid_group_list.list, list) { 1245 kern_add_tid_node(flow, rcd, "complete groups", group, 1246 group->size); 1247 1248 pageidx += group->size; 1249 if (!--ngroups) 1250 break; 1251 } 1252 1253 if (pageidx >= flow->npagesets) 1254 goto ok; 1255 1256 used_list: 1257 /* Now look at partially used groups */ 1258 list_for_each_entry(used, &rcd->tid_used_list.list, list) { 1259 use = min_t(u32, flow->npagesets - pageidx, 1260 used->size - used->used); 1261 kern_add_tid_node(flow, rcd, "used groups", used, use); 1262 1263 pageidx += use; 1264 if (pageidx >= flow->npagesets) 1265 goto ok; 1266 } 1267 1268 /* 1269 * Look again at a complete group, continuing from where we left. 1270 * However, if we are at the head, we have reached the end of the 1271 * complete groups list from the first loop above 1272 */ 1273 if (group && &group->list == &rcd->tid_group_list.list) 1274 goto bail_eagain; 1275 group = list_prepare_entry(group, &rcd->tid_group_list.list, 1276 list); 1277 if (list_is_last(&group->list, &rcd->tid_group_list.list)) 1278 goto bail_eagain; 1279 group = list_next_entry(group, list); 1280 use = min_t(u32, flow->npagesets - pageidx, group->size); 1281 kern_add_tid_node(flow, rcd, "complete continue", group, use); 1282 pageidx += use; 1283 if (pageidx >= flow->npagesets) 1284 goto ok; 1285 bail_eagain: 1286 trace_hfi1_msg_alloc_tids(flow->req->qp, " insufficient tids: needed ", 1287 (u64)flow->npagesets); 1288 return -EAGAIN; 1289 ok: 1290 return 0; 1291 } 1292 1293 static void kern_program_rcv_group(struct tid_rdma_flow *flow, int grp_num, 1294 u32 *pset_idx) 1295 { 1296 struct hfi1_ctxtdata *rcd = flow->req->rcd; 1297 struct hfi1_devdata *dd = rcd->dd; 1298 struct kern_tid_node *node = &flow->tnode[grp_num]; 1299 struct tid_group *grp = node->grp; 1300 struct tid_rdma_pageset *pset; 1301 u32 pmtu_pg = flow->req->qp->pmtu >> PAGE_SHIFT; 1302 u32 rcventry, npages = 0, pair = 0, tidctrl; 1303 u8 i, cnt = 0; 1304 1305 for (i = 0; i < grp->size; i++) { 1306 rcventry = grp->base + i; 1307 1308 if (node->map & BIT(i) || cnt >= node->cnt) { 1309 rcv_array_wc_fill(dd, rcventry); 1310 continue; 1311 } 1312 pset = &flow->pagesets[(*pset_idx)++]; 1313 if (pset->count) { 1314 hfi1_put_tid(dd, rcventry, PT_EXPECTED, 1315 pset->addr, trdma_pset_order(pset)); 1316 } else { 1317 hfi1_put_tid(dd, rcventry, PT_INVALID, 0, 0); 1318 } 1319 npages += pset->count; 1320 1321 rcventry -= rcd->expected_base; 1322 tidctrl = pair ? 0x3 : rcventry & 0x1 ? 
			  0x2 : 0x1;
		/*
		 * A single TID entry will be used for a rcvarr pair (with
		 * tidctrl 0x3), if ALL these are true (a) the bit pos is even
		 * (b) the group map shows current and the next bits as free
		 * indicating two consecutive rcvarray entries are available (c)
		 * we actually need 2 more entries
		 */
		pair = !(i & 0x1) && !((node->map >> i) & 0x3) &&
		       node->cnt >= cnt + 2;
		if (!pair) {
			if (!pset->count)
				tidctrl = 0x1;
			flow->tid_entry[flow->tidcnt++] =
				EXP_TID_SET(IDX, rcventry >> 1) |
				EXP_TID_SET(CTRL, tidctrl) |
				EXP_TID_SET(LEN, npages);
			trace_hfi1_tid_entry_alloc(/* entry */
				flow->req->qp, flow->tidcnt - 1,
				flow->tid_entry[flow->tidcnt - 1]);

			/* Efficient DIV_ROUND_UP(npages, pmtu_pg) */
			flow->npkts += (npages + pmtu_pg - 1) >> ilog2(pmtu_pg);
			npages = 0;
		}

		if (grp->used == grp->size - 1)
			tid_group_move(grp, &rcd->tid_used_list,
				       &rcd->tid_full_list);
		else if (!grp->used)
			tid_group_move(grp, &rcd->tid_group_list,
				       &rcd->tid_used_list);

		grp->used++;
		grp->map |= BIT(i);
		cnt++;
	}
}

static void kern_unprogram_rcv_group(struct tid_rdma_flow *flow, int grp_num)
{
	struct hfi1_ctxtdata *rcd = flow->req->rcd;
	struct hfi1_devdata *dd = rcd->dd;
	struct kern_tid_node *node = &flow->tnode[grp_num];
	struct tid_group *grp = node->grp;
	u32 rcventry;
	u8 i, cnt = 0;

	for (i = 0; i < grp->size; i++) {
		rcventry = grp->base + i;

		if (node->map & BIT(i) || cnt >= node->cnt) {
			rcv_array_wc_fill(dd, rcventry);
			continue;
		}

		hfi1_put_tid(dd, rcventry, PT_INVALID, 0, 0);

		grp->used--;
		grp->map &= ~BIT(i);
		cnt++;

		if (grp->used == grp->size - 1)
			tid_group_move(grp, &rcd->tid_full_list,
				       &rcd->tid_used_list);
		else if (!grp->used)
			tid_group_move(grp, &rcd->tid_used_list,
				       &rcd->tid_group_list);
	}
	if (WARN_ON_ONCE(cnt & 1)) {
		struct hfi1_ctxtdata *rcd = flow->req->rcd;
		struct hfi1_devdata *dd = rcd->dd;

		dd_dev_err(dd, "unexpected odd free cnt %u map 0x%x used %u",
			   cnt, grp->map, grp->used);
	}
}

static void kern_program_rcvarray(struct tid_rdma_flow *flow)
{
	u32 pset_idx = 0;
	int i;

	flow->npkts = 0;
	flow->tidcnt = 0;
	for (i = 0; i < flow->tnode_cnt; i++)
		kern_program_rcv_group(flow, i, &pset_idx);
	trace_hfi1_tid_flow_alloc(flow->req->qp, flow->req->setup_head, flow);
}

/**
 * hfi1_kern_exp_rcv_setup() - setup TID's and flow for one segment of a
 *			       TID RDMA request
 *
 * @req: TID RDMA request for which the segment/flow is being set up
 * @ss: sge state, maintains state across successive segments of a sge
 * @last: set to true after the last sge segment has been processed
 *
 * This function
 * (1) finds a free flow entry in the flow circular buffer
 * (2) finds pages and contiguous physical chunks constituting one segment
 *     of an sge
 * (3) allocates TID group entries for those chunks
 * (4) programs rcvarray entries in the hardware corresponding to those
 *     TID's
 * (5) computes a tidarray with formatted TID entries which can be sent
 *     to the sender
 * (6) Reserves and programs HW flows.
 * (7) It also manages queuing the QP when TID/flow resources are not
 *     available.
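 *
 * Usage sketch (editor's illustration of the caller pattern in
 * hfi1_build_tid_rdma_read_req() further below; the QP s_lock is held):
 *
 *	if (hfi1_kern_setup_hw_flow(qpriv->rcd, qp))
 *		goto done;
 *	if (hfi1_kern_exp_rcv_setup(req, &qp->s_sge, &last)) {
 *		req->state = TID_REQUEST_QUEUED;
 *		goto done;
 *	}
 *	flow = &req->flows[req->flow_idx];
 *
 * Both helpers queue the QP and return non-zero when resources are not
 * available, so the send engine simply retries the same segment later.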
1432 * 1433 * @req points to struct tid_rdma_request of which the segments are a part. The 1434 * function uses qp, rcd and seg_len members of @req. In the absence of errors, 1435 * req->flow_idx is the index of the flow which has been prepared in this 1436 * invocation of function call. With flow = &req->flows[req->flow_idx], 1437 * flow->tid_entry contains the TID array which the sender can use for TID RDMA 1438 * sends and flow->npkts contains number of packets required to send the 1439 * segment. 1440 * 1441 * hfi1_check_sge_align should be called prior to calling this function and if 1442 * it signals error TID RDMA cannot be used for this sge and this function 1443 * should not be called. 1444 * 1445 * For the queuing, caller must hold the flow->req->qp s_lock from the send 1446 * engine and the function will procure the exp_lock. 1447 * 1448 * Return: 1449 * The function returns -EAGAIN if sufficient number of TID/flow resources to 1450 * map the segment could not be allocated. In this case the function should be 1451 * called again with previous arguments to retry the TID allocation. There are 1452 * no other error returns. The function returns 0 on success. 1453 */ 1454 int hfi1_kern_exp_rcv_setup(struct tid_rdma_request *req, 1455 struct rvt_sge_state *ss, bool *last) 1456 __must_hold(&req->qp->s_lock) 1457 { 1458 struct tid_rdma_flow *flow = &req->flows[req->setup_head]; 1459 struct hfi1_ctxtdata *rcd = req->rcd; 1460 struct hfi1_qp_priv *qpriv = req->qp->priv; 1461 unsigned long flags; 1462 struct rvt_qp *fqp; 1463 u16 clear_tail = req->clear_tail; 1464 1465 lockdep_assert_held(&req->qp->s_lock); 1466 /* 1467 * We return error if either (a) we don't have space in the flow 1468 * circular buffer, or (b) we already have max entries in the buffer. 1469 * Max entries depend on the type of request we are processing and the 1470 * negotiated TID RDMA parameters. 1471 */ 1472 if (!CIRC_SPACE(req->setup_head, clear_tail, MAX_FLOWS) || 1473 CIRC_CNT(req->setup_head, clear_tail, MAX_FLOWS) >= 1474 req->n_flows) 1475 return -EINVAL; 1476 1477 /* 1478 * Get pages, identify contiguous physical memory chunks for the segment 1479 * If we can not determine a DMA address mapping we will treat it just 1480 * like if we ran out of space above. 1481 */ 1482 if (kern_get_phys_blocks(flow, qpriv->pages, ss, last)) { 1483 hfi1_wait_kmem(flow->req->qp); 1484 return -ENOMEM; 1485 } 1486 1487 spin_lock_irqsave(&rcd->exp_lock, flags); 1488 if (kernel_tid_waiters(rcd, &rcd->rarr_queue, flow->req->qp)) 1489 goto queue; 1490 1491 /* 1492 * At this point we know the number of pagesets and hence the number of 1493 * TID's to map the segment. Allocate the TID's from the TID groups. If 1494 * we cannot allocate the required number we exit and try again later 1495 */ 1496 if (kern_alloc_tids(flow)) 1497 goto queue; 1498 /* 1499 * Finally program the TID entries with the pagesets, compute the 1500 * tidarray and enable the HW flow 1501 */ 1502 kern_program_rcvarray(flow); 1503 1504 /* 1505 * Setup the flow state with relevant information. 1506 * This information is used for tracking the sequence of data packets 1507 * for the segment. 1508 * The flow is setup here as this is the most accurate time and place 1509 * to do so. Doing at a later time runs the risk of the flow data in 1510 * qpriv getting out of sync. 
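	 *
	 * Worked example (editor's illustration): if this segment needs
	 * flow->npkts == 64 packets and qpriv->flow_state.psn is currently
	 * 10, the flow is given spsn == 10 and lpsn == 73, and the
	 * generation's PSN counter advances to 74 for the next segment.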
1511 */ 1512 memset(&flow->flow_state, 0x0, sizeof(flow->flow_state)); 1513 flow->idx = qpriv->flow_state.index; 1514 flow->flow_state.generation = qpriv->flow_state.generation; 1515 flow->flow_state.spsn = qpriv->flow_state.psn; 1516 flow->flow_state.lpsn = flow->flow_state.spsn + flow->npkts - 1; 1517 flow->flow_state.r_next_psn = 1518 full_flow_psn(flow, flow->flow_state.spsn); 1519 qpriv->flow_state.psn += flow->npkts; 1520 1521 dequeue_tid_waiter(rcd, &rcd->rarr_queue, flow->req->qp); 1522 /* get head before dropping lock */ 1523 fqp = first_qp(rcd, &rcd->rarr_queue); 1524 spin_unlock_irqrestore(&rcd->exp_lock, flags); 1525 tid_rdma_schedule_tid_wakeup(fqp); 1526 1527 req->setup_head = (req->setup_head + 1) & (MAX_FLOWS - 1); 1528 return 0; 1529 queue: 1530 queue_qp_for_tid_wait(rcd, &rcd->rarr_queue, flow->req->qp); 1531 spin_unlock_irqrestore(&rcd->exp_lock, flags); 1532 return -EAGAIN; 1533 } 1534 1535 static void hfi1_tid_rdma_reset_flow(struct tid_rdma_flow *flow) 1536 { 1537 flow->npagesets = 0; 1538 } 1539 1540 /* 1541 * This function is called after one segment has been successfully sent to 1542 * release the flow and TID HW/SW resources for that segment. The segments for a 1543 * TID RDMA request are setup and cleared in FIFO order which is managed using a 1544 * circular buffer. 1545 */ 1546 int hfi1_kern_exp_rcv_clear(struct tid_rdma_request *req) 1547 __must_hold(&req->qp->s_lock) 1548 { 1549 struct tid_rdma_flow *flow = &req->flows[req->clear_tail]; 1550 struct hfi1_ctxtdata *rcd = req->rcd; 1551 unsigned long flags; 1552 int i; 1553 struct rvt_qp *fqp; 1554 1555 lockdep_assert_held(&req->qp->s_lock); 1556 /* Exit if we have nothing in the flow circular buffer */ 1557 if (!CIRC_CNT(req->setup_head, req->clear_tail, MAX_FLOWS)) 1558 return -EINVAL; 1559 1560 spin_lock_irqsave(&rcd->exp_lock, flags); 1561 1562 for (i = 0; i < flow->tnode_cnt; i++) 1563 kern_unprogram_rcv_group(flow, i); 1564 /* To prevent double unprogramming */ 1565 flow->tnode_cnt = 0; 1566 /* get head before dropping lock */ 1567 fqp = first_qp(rcd, &rcd->rarr_queue); 1568 spin_unlock_irqrestore(&rcd->exp_lock, flags); 1569 1570 dma_unmap_flow(flow); 1571 1572 hfi1_tid_rdma_reset_flow(flow); 1573 req->clear_tail = (req->clear_tail + 1) & (MAX_FLOWS - 1); 1574 1575 if (fqp == req->qp) { 1576 __trigger_tid_waiter(fqp); 1577 rvt_put_qp(fqp); 1578 } else { 1579 tid_rdma_schedule_tid_wakeup(fqp); 1580 } 1581 1582 return 0; 1583 } 1584 1585 /* 1586 * This function is called to release all the tid entries for 1587 * a request. 1588 */ 1589 void hfi1_kern_exp_rcv_clear_all(struct tid_rdma_request *req) 1590 __must_hold(&req->qp->s_lock) 1591 { 1592 /* Use memory barrier for proper ordering */ 1593 while (CIRC_CNT(req->setup_head, req->clear_tail, MAX_FLOWS)) { 1594 if (hfi1_kern_exp_rcv_clear(req)) 1595 break; 1596 } 1597 } 1598 1599 /** 1600 * hfi1_kern_exp_rcv_free_flows - free priviously allocated flow information 1601 * @req - the tid rdma request to be cleaned 1602 */ 1603 static void hfi1_kern_exp_rcv_free_flows(struct tid_rdma_request *req) 1604 { 1605 kfree(req->flows); 1606 req->flows = NULL; 1607 } 1608 1609 /** 1610 * __trdma_clean_swqe - clean up for large sized QPs 1611 * @qp: the queue patch 1612 * @wqe: the send wqe 1613 */ 1614 void __trdma_clean_swqe(struct rvt_qp *qp, struct rvt_swqe *wqe) 1615 { 1616 struct hfi1_swqe_priv *p = wqe->priv; 1617 1618 hfi1_kern_exp_rcv_free_flows(&p->tid_req); 1619 } 1620 1621 /* 1622 * This can be called at QP create time or in the data path. 
1623 */ 1624 static int hfi1_kern_exp_rcv_alloc_flows(struct tid_rdma_request *req, 1625 gfp_t gfp) 1626 { 1627 struct tid_rdma_flow *flows; 1628 int i; 1629 1630 if (likely(req->flows)) 1631 return 0; 1632 flows = kmalloc_node(MAX_FLOWS * sizeof(*flows), gfp, 1633 req->rcd->numa_id); 1634 if (!flows) 1635 return -ENOMEM; 1636 /* mini init */ 1637 for (i = 0; i < MAX_FLOWS; i++) { 1638 flows[i].req = req; 1639 flows[i].npagesets = 0; 1640 flows[i].pagesets[0].mapped = 0; 1641 flows[i].resync_npkts = 0; 1642 } 1643 req->flows = flows; 1644 return 0; 1645 } 1646 1647 static void hfi1_init_trdma_req(struct rvt_qp *qp, 1648 struct tid_rdma_request *req) 1649 { 1650 struct hfi1_qp_priv *qpriv = qp->priv; 1651 1652 /* 1653 * Initialize various TID RDMA request variables. 1654 * These variables are "static", which is why they 1655 * can be pre-initialized here before the WRs has 1656 * even been submitted. 1657 * However, non-NULL values for these variables do not 1658 * imply that this WQE has been enabled for TID RDMA. 1659 * Drivers should check the WQE's opcode to determine 1660 * if a request is a TID RDMA one or not. 1661 */ 1662 req->qp = qp; 1663 req->rcd = qpriv->rcd; 1664 } 1665 1666 u64 hfi1_access_sw_tid_wait(const struct cntr_entry *entry, 1667 void *context, int vl, int mode, u64 data) 1668 { 1669 struct hfi1_devdata *dd = context; 1670 1671 return dd->verbs_dev.n_tidwait; 1672 } 1673 1674 static struct tid_rdma_flow *find_flow_ib(struct tid_rdma_request *req, 1675 u32 psn, u16 *fidx) 1676 { 1677 u16 head, tail; 1678 struct tid_rdma_flow *flow; 1679 1680 head = req->setup_head; 1681 tail = req->clear_tail; 1682 for ( ; CIRC_CNT(head, tail, MAX_FLOWS); 1683 tail = CIRC_NEXT(tail, MAX_FLOWS)) { 1684 flow = &req->flows[tail]; 1685 if (cmp_psn(psn, flow->flow_state.ib_spsn) >= 0 && 1686 cmp_psn(psn, flow->flow_state.ib_lpsn) <= 0) { 1687 if (fidx) 1688 *fidx = tail; 1689 return flow; 1690 } 1691 } 1692 return NULL; 1693 } 1694 1695 /* TID RDMA READ functions */ 1696 u32 hfi1_build_tid_rdma_read_packet(struct rvt_swqe *wqe, 1697 struct ib_other_headers *ohdr, u32 *bth1, 1698 u32 *bth2, u32 *len) 1699 { 1700 struct tid_rdma_request *req = wqe_to_tid_req(wqe); 1701 struct tid_rdma_flow *flow = &req->flows[req->flow_idx]; 1702 struct rvt_qp *qp = req->qp; 1703 struct hfi1_qp_priv *qpriv = qp->priv; 1704 struct hfi1_swqe_priv *wpriv = wqe->priv; 1705 struct tid_rdma_read_req *rreq = &ohdr->u.tid_rdma.r_req; 1706 struct tid_rdma_params *remote; 1707 u32 req_len = 0; 1708 void *req_addr = NULL; 1709 1710 /* This is the IB psn used to send the request */ 1711 *bth2 = mask_psn(flow->flow_state.ib_spsn + flow->pkt); 1712 trace_hfi1_tid_flow_build_read_pkt(qp, req->flow_idx, flow); 1713 1714 /* TID Entries for TID RDMA READ payload */ 1715 req_addr = &flow->tid_entry[flow->tid_idx]; 1716 req_len = sizeof(*flow->tid_entry) * 1717 (flow->tidcnt - flow->tid_idx); 1718 1719 memset(&ohdr->u.tid_rdma.r_req, 0, sizeof(ohdr->u.tid_rdma.r_req)); 1720 wpriv->ss.sge.vaddr = req_addr; 1721 wpriv->ss.sge.sge_length = req_len; 1722 wpriv->ss.sge.length = wpriv->ss.sge.sge_length; 1723 /* 1724 * We can safely zero these out. Since the first SGE covers the 1725 * entire packet, nothing else should even look at the MR. 
1726 */ 1727 wpriv->ss.sge.mr = NULL; 1728 wpriv->ss.sge.m = 0; 1729 wpriv->ss.sge.n = 0; 1730 1731 wpriv->ss.sg_list = NULL; 1732 wpriv->ss.total_len = wpriv->ss.sge.sge_length; 1733 wpriv->ss.num_sge = 1; 1734 1735 /* Construct the TID RDMA READ REQ packet header */ 1736 rcu_read_lock(); 1737 remote = rcu_dereference(qpriv->tid_rdma.remote); 1738 1739 KDETH_RESET(rreq->kdeth0, KVER, 0x1); 1740 KDETH_RESET(rreq->kdeth1, JKEY, remote->jkey); 1741 rreq->reth.vaddr = cpu_to_be64(wqe->rdma_wr.remote_addr + 1742 req->cur_seg * req->seg_len + flow->sent); 1743 rreq->reth.rkey = cpu_to_be32(wqe->rdma_wr.rkey); 1744 rreq->reth.length = cpu_to_be32(*len); 1745 rreq->tid_flow_psn = 1746 cpu_to_be32((flow->flow_state.generation << 1747 HFI1_KDETH_BTH_SEQ_SHIFT) | 1748 ((flow->flow_state.spsn + flow->pkt) & 1749 HFI1_KDETH_BTH_SEQ_MASK)); 1750 rreq->tid_flow_qp = 1751 cpu_to_be32(qpriv->tid_rdma.local.qp | 1752 ((flow->idx & TID_RDMA_DESTQP_FLOW_MASK) << 1753 TID_RDMA_DESTQP_FLOW_SHIFT) | 1754 qpriv->rcd->ctxt); 1755 rreq->verbs_qp = cpu_to_be32(qp->remote_qpn); 1756 *bth1 &= ~RVT_QPN_MASK; 1757 *bth1 |= remote->qp; 1758 *bth2 |= IB_BTH_REQ_ACK; 1759 rcu_read_unlock(); 1760 1761 /* We are done with this segment */ 1762 flow->sent += *len; 1763 req->cur_seg++; 1764 qp->s_state = TID_OP(READ_REQ); 1765 req->ack_pending++; 1766 req->flow_idx = (req->flow_idx + 1) & (MAX_FLOWS - 1); 1767 qpriv->pending_tid_r_segs++; 1768 qp->s_num_rd_atomic++; 1769 1770 /* Set the TID RDMA READ request payload size */ 1771 *len = req_len; 1772 1773 return sizeof(ohdr->u.tid_rdma.r_req) / sizeof(u32); 1774 } 1775 1776 /* 1777 * @len: contains the data length to read upon entry and the read request 1778 * payload length upon exit. 1779 */ 1780 u32 hfi1_build_tid_rdma_read_req(struct rvt_qp *qp, struct rvt_swqe *wqe, 1781 struct ib_other_headers *ohdr, u32 *bth1, 1782 u32 *bth2, u32 *len) 1783 __must_hold(&qp->s_lock) 1784 { 1785 struct hfi1_qp_priv *qpriv = qp->priv; 1786 struct tid_rdma_request *req = wqe_to_tid_req(wqe); 1787 struct tid_rdma_flow *flow = NULL; 1788 u32 hdwords = 0; 1789 bool last; 1790 bool retry = true; 1791 u32 npkts = rvt_div_round_up_mtu(qp, *len); 1792 1793 trace_hfi1_tid_req_build_read_req(qp, 0, wqe->wr.opcode, wqe->psn, 1794 wqe->lpsn, req); 1795 /* 1796 * Check sync conditions. Make sure that there are no pending 1797 * segments before freeing the flow. 1798 */ 1799 sync_check: 1800 if (req->state == TID_REQUEST_SYNC) { 1801 if (qpriv->pending_tid_r_segs) 1802 goto done; 1803 1804 hfi1_kern_clear_hw_flow(req->rcd, qp); 1805 qpriv->s_flags &= ~HFI1_R_TID_SW_PSN; 1806 req->state = TID_REQUEST_ACTIVE; 1807 } 1808 1809 /* 1810 * If the request for this segment is resent, the tid resources should 1811 * have been allocated before. In this case, req->flow_idx should 1812 * fall behind req->setup_head. 1813 */ 1814 if (req->flow_idx == req->setup_head) { 1815 retry = false; 1816 if (req->state == TID_REQUEST_RESEND) { 1817 /* 1818 * This is the first new segment for a request whose 1819 * earlier segments have been re-sent. We need to 1820 * set up the sge pointer correctly. 1821 */ 1822 restart_sge(&qp->s_sge, wqe, req->s_next_psn, 1823 qp->pmtu); 1824 req->isge = 0; 1825 req->state = TID_REQUEST_ACTIVE; 1826 } 1827 1828 /* 1829 * Check sync. The last PSN of each generation is reserved for 1830 * RESYNC. 
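		 *
		 * Editor's note (assuming HFI1_KDETH_BTH_SEQ_SHIFT == 11, so
		 * MAX_TID_FLOW_PSN == 2048): flow PSNs 0..2046 of a generation
		 * carry data and PSN 2047 is kept for RESYNC, which is why a
		 * segment that would run past 2046 first forces the request
		 * into TID_REQUEST_SYNC below.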
1831 */ 1832 if ((qpriv->flow_state.psn + npkts) > MAX_TID_FLOW_PSN - 1) { 1833 req->state = TID_REQUEST_SYNC; 1834 goto sync_check; 1835 } 1836 1837 /* Allocate the flow if not yet */ 1838 if (hfi1_kern_setup_hw_flow(qpriv->rcd, qp)) 1839 goto done; 1840 1841 /* 1842 * The following call will advance req->setup_head after 1843 * allocating the tid entries. 1844 */ 1845 if (hfi1_kern_exp_rcv_setup(req, &qp->s_sge, &last)) { 1846 req->state = TID_REQUEST_QUEUED; 1847 1848 /* 1849 * We don't have resources for this segment. The QP has 1850 * already been queued. 1851 */ 1852 goto done; 1853 } 1854 } 1855 1856 /* req->flow_idx should only be one slot behind req->setup_head */ 1857 flow = &req->flows[req->flow_idx]; 1858 flow->pkt = 0; 1859 flow->tid_idx = 0; 1860 flow->sent = 0; 1861 if (!retry) { 1862 /* Set the first and last IB PSN for the flow in use.*/ 1863 flow->flow_state.ib_spsn = req->s_next_psn; 1864 flow->flow_state.ib_lpsn = 1865 flow->flow_state.ib_spsn + flow->npkts - 1; 1866 } 1867 1868 /* Calculate the next segment start psn.*/ 1869 req->s_next_psn += flow->npkts; 1870 1871 /* Build the packet header */ 1872 hdwords = hfi1_build_tid_rdma_read_packet(wqe, ohdr, bth1, bth2, len); 1873 done: 1874 return hdwords; 1875 } 1876 1877 /* 1878 * Validate and accept the TID RDMA READ request parameters. 1879 * Return 0 if the request is accepted successfully; 1880 * Return 1 otherwise. 1881 */ 1882 static int tid_rdma_rcv_read_request(struct rvt_qp *qp, 1883 struct rvt_ack_entry *e, 1884 struct hfi1_packet *packet, 1885 struct ib_other_headers *ohdr, 1886 u32 bth0, u32 psn, u64 vaddr, u32 len) 1887 { 1888 struct hfi1_qp_priv *qpriv = qp->priv; 1889 struct tid_rdma_request *req; 1890 struct tid_rdma_flow *flow; 1891 u32 flow_psn, i, tidlen = 0, pktlen, tlen; 1892 1893 req = ack_to_tid_req(e); 1894 1895 /* Validate the payload first */ 1896 flow = &req->flows[req->setup_head]; 1897 1898 /* payload length = packet length - (header length + ICRC length) */ 1899 pktlen = packet->tlen - (packet->hlen + 4); 1900 if (pktlen > sizeof(flow->tid_entry)) 1901 return 1; 1902 memcpy(flow->tid_entry, packet->ebuf, pktlen); 1903 flow->tidcnt = pktlen / sizeof(*flow->tid_entry); 1904 1905 /* 1906 * Walk the TID_ENTRY list to make sure we have enough space for a 1907 * complete segment. Also calculate the number of required packets. 1908 */ 1909 flow->npkts = rvt_div_round_up_mtu(qp, len); 1910 for (i = 0; i < flow->tidcnt; i++) { 1911 trace_hfi1_tid_entry_rcv_read_req(qp, i, 1912 flow->tid_entry[i]); 1913 tlen = EXP_TID_GET(flow->tid_entry[i], LEN); 1914 if (!tlen) 1915 return 1; 1916 1917 /* 1918 * For tid pair (tidctr == 3), the buffer size of the pair 1919 * should be the sum of the buffer size described by each 1920 * tid entry. However, only the first entry needs to be 1921 * specified in the request (see WFR HAS Section 8.5.7.1). 
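* The accumulated tidlen is checked against the requested length once the loop completes.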
1922 */ 1923 tidlen += tlen; 1924 } 1925 if (tidlen * PAGE_SIZE < len) 1926 return 1; 1927 1928 /* Empty the flow array */ 1929 req->clear_tail = req->setup_head; 1930 flow->pkt = 0; 1931 flow->tid_idx = 0; 1932 flow->tid_offset = 0; 1933 flow->sent = 0; 1934 flow->tid_qpn = be32_to_cpu(ohdr->u.tid_rdma.r_req.tid_flow_qp); 1935 flow->idx = (flow->tid_qpn >> TID_RDMA_DESTQP_FLOW_SHIFT) & 1936 TID_RDMA_DESTQP_FLOW_MASK; 1937 flow_psn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.r_req.tid_flow_psn)); 1938 flow->flow_state.generation = flow_psn >> HFI1_KDETH_BTH_SEQ_SHIFT; 1939 flow->flow_state.spsn = flow_psn & HFI1_KDETH_BTH_SEQ_MASK; 1940 flow->length = len; 1941 1942 flow->flow_state.lpsn = flow->flow_state.spsn + 1943 flow->npkts - 1; 1944 flow->flow_state.ib_spsn = psn; 1945 flow->flow_state.ib_lpsn = flow->flow_state.ib_spsn + flow->npkts - 1; 1946 1947 trace_hfi1_tid_flow_rcv_read_req(qp, req->setup_head, flow); 1948 /* Set the initial flow index to the current flow. */ 1949 req->flow_idx = req->setup_head; 1950 1951 /* advance circular buffer head */ 1952 req->setup_head = (req->setup_head + 1) & (MAX_FLOWS - 1); 1953 1954 /* 1955 * Compute last PSN for request. 1956 */ 1957 e->opcode = (bth0 >> 24) & 0xff; 1958 e->psn = psn; 1959 e->lpsn = psn + flow->npkts - 1; 1960 e->sent = 0; 1961 1962 req->n_flows = qpriv->tid_rdma.local.max_read; 1963 req->state = TID_REQUEST_ACTIVE; 1964 req->cur_seg = 0; 1965 req->comp_seg = 0; 1966 req->ack_seg = 0; 1967 req->isge = 0; 1968 req->seg_len = qpriv->tid_rdma.local.max_len; 1969 req->total_len = len; 1970 req->total_segs = 1; 1971 req->r_flow_psn = e->psn; 1972 1973 trace_hfi1_tid_req_rcv_read_req(qp, 0, e->opcode, e->psn, e->lpsn, 1974 req); 1975 return 0; 1976 } 1977 1978 static int tid_rdma_rcv_error(struct hfi1_packet *packet, 1979 struct ib_other_headers *ohdr, 1980 struct rvt_qp *qp, u32 psn, int diff) 1981 { 1982 struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); 1983 struct hfi1_ctxtdata *rcd = ((struct hfi1_qp_priv *)qp->priv)->rcd; 1984 struct hfi1_ibdev *dev = to_idev(qp->ibqp.device); 1985 struct hfi1_qp_priv *qpriv = qp->priv; 1986 struct rvt_ack_entry *e; 1987 struct tid_rdma_request *req; 1988 unsigned long flags; 1989 u8 prev; 1990 bool old_req; 1991 1992 trace_hfi1_rsp_tid_rcv_error(qp, psn); 1993 trace_hfi1_tid_rdma_rcv_err(qp, 0, psn, diff); 1994 if (diff > 0) { 1995 /* sequence error */ 1996 if (!qp->r_nak_state) { 1997 ibp->rvp.n_rc_seqnak++; 1998 qp->r_nak_state = IB_NAK_PSN_ERROR; 1999 qp->r_ack_psn = qp->r_psn; 2000 rc_defered_ack(rcd, qp); 2001 } 2002 goto done; 2003 } 2004 2005 ibp->rvp.n_rc_dupreq++; 2006 2007 spin_lock_irqsave(&qp->s_lock, flags); 2008 e = find_prev_entry(qp, psn, &prev, NULL, &old_req); 2009 if (!e || (e->opcode != TID_OP(READ_REQ) && 2010 e->opcode != TID_OP(WRITE_REQ))) 2011 goto unlock; 2012 2013 req = ack_to_tid_req(e); 2014 req->r_flow_psn = psn; 2015 trace_hfi1_tid_req_rcv_err(qp, 0, e->opcode, e->psn, e->lpsn, req); 2016 if (e->opcode == TID_OP(READ_REQ)) { 2017 struct ib_reth *reth; 2018 u32 len; 2019 u32 rkey; 2020 u64 vaddr; 2021 int ok; 2022 u32 bth0; 2023 2024 reth = &ohdr->u.tid_rdma.r_req.reth; 2025 /* 2026 * The requester always restarts from the start of the original 2027 * request. 
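* A duplicate READ is therefore only accepted if its PSN and length match the values saved in the ack entry.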
2028 */ 2029 len = be32_to_cpu(reth->length); 2030 if (psn != e->psn || len != req->total_len) 2031 goto unlock; 2032 2033 release_rdma_sge_mr(e); 2034 2035 rkey = be32_to_cpu(reth->rkey); 2036 vaddr = get_ib_reth_vaddr(reth); 2037 2038 qp->r_len = len; 2039 ok = rvt_rkey_ok(qp, &e->rdma_sge, len, vaddr, rkey, 2040 IB_ACCESS_REMOTE_READ); 2041 if (unlikely(!ok)) 2042 goto unlock; 2043 2044 /* 2045 * If all the response packets for the current request have 2046 * been sent out and this request is complete (old_request 2047 * == false) and the TID flow may be unusable (the 2048 * req->clear_tail is advanced). However, when an earlier 2049 * request is received, this request will not be complete any 2050 * more (qp->s_tail_ack_queue is moved back, see below). 2051 * Consequently, we need to update the TID flow info everytime 2052 * a duplicate request is received. 2053 */ 2054 bth0 = be32_to_cpu(ohdr->bth[0]); 2055 if (tid_rdma_rcv_read_request(qp, e, packet, ohdr, bth0, psn, 2056 vaddr, len)) 2057 goto unlock; 2058 2059 /* 2060 * True if the request is already scheduled (between 2061 * qp->s_tail_ack_queue and qp->r_head_ack_queue); 2062 */ 2063 if (old_req) 2064 goto unlock; 2065 } else { 2066 struct flow_state *fstate; 2067 bool schedule = false; 2068 u8 i; 2069 2070 if (req->state == TID_REQUEST_RESEND) { 2071 req->state = TID_REQUEST_RESEND_ACTIVE; 2072 } else if (req->state == TID_REQUEST_INIT_RESEND) { 2073 req->state = TID_REQUEST_INIT; 2074 schedule = true; 2075 } 2076 2077 /* 2078 * True if the request is already scheduled (between 2079 * qp->s_tail_ack_queue and qp->r_head_ack_queue). 2080 * Also, don't change requests, which are at the SYNC 2081 * point and haven't generated any responses yet. 2082 * There is nothing to retransmit for them yet. 2083 */ 2084 if (old_req || req->state == TID_REQUEST_INIT || 2085 (req->state == TID_REQUEST_SYNC && !req->cur_seg)) { 2086 for (i = prev + 1; ; i++) { 2087 if (i > rvt_size_atomic(&dev->rdi)) 2088 i = 0; 2089 if (i == qp->r_head_ack_queue) 2090 break; 2091 e = &qp->s_ack_queue[i]; 2092 req = ack_to_tid_req(e); 2093 if (e->opcode == TID_OP(WRITE_REQ) && 2094 req->state == TID_REQUEST_INIT) 2095 req->state = TID_REQUEST_INIT_RESEND; 2096 } 2097 /* 2098 * If the state of the request has been changed, 2099 * the first leg needs to get scheduled in order to 2100 * pick up the change. Otherwise, normal response 2101 * processing should take care of it. 2102 */ 2103 if (!schedule) 2104 goto unlock; 2105 } 2106 2107 /* 2108 * If there is no more allocated segment, just schedule the qp 2109 * without changing any state. 2110 */ 2111 if (req->clear_tail == req->setup_head) 2112 goto schedule; 2113 /* 2114 * If this request has sent responses for segments, which have 2115 * not received data yet (flow_idx != clear_tail), the flow_idx 2116 * pointer needs to be adjusted so the same responses can be 2117 * re-sent. 2118 */ 2119 if (CIRC_CNT(req->flow_idx, req->clear_tail, MAX_FLOWS)) { 2120 fstate = &req->flows[req->clear_tail].flow_state; 2121 qpriv->pending_tid_w_segs -= 2122 CIRC_CNT(req->flow_idx, req->clear_tail, 2123 MAX_FLOWS); 2124 req->flow_idx = 2125 CIRC_ADD(req->clear_tail, 2126 delta_psn(psn, fstate->resp_ib_psn), 2127 MAX_FLOWS); 2128 qpriv->pending_tid_w_segs += 2129 delta_psn(psn, fstate->resp_ib_psn); 2130 /* 2131 * When flow_idx == setup_head, we've gotten a duplicate 2132 * request for a segment, which has not been allocated 2133 * yet. In that case, don't adjust this request. 
2134 * However, we still want to go through the loop below 2135 * to adjust all subsequent requests. 2136 */ 2137 if (CIRC_CNT(req->setup_head, req->flow_idx, 2138 MAX_FLOWS)) { 2139 req->cur_seg = delta_psn(psn, e->psn); 2140 req->state = TID_REQUEST_RESEND_ACTIVE; 2141 } 2142 } 2143 2144 for (i = prev + 1; ; i++) { 2145 /* 2146 * Look at everything up to and including 2147 * s_tail_ack_queue 2148 */ 2149 if (i > rvt_size_atomic(&dev->rdi)) 2150 i = 0; 2151 if (i == qp->r_head_ack_queue) 2152 break; 2153 e = &qp->s_ack_queue[i]; 2154 req = ack_to_tid_req(e); 2155 trace_hfi1_tid_req_rcv_err(qp, 0, e->opcode, e->psn, 2156 e->lpsn, req); 2157 if (e->opcode != TID_OP(WRITE_REQ) || 2158 req->cur_seg == req->comp_seg || 2159 req->state == TID_REQUEST_INIT || 2160 req->state == TID_REQUEST_INIT_RESEND) { 2161 if (req->state == TID_REQUEST_INIT) 2162 req->state = TID_REQUEST_INIT_RESEND; 2163 continue; 2164 } 2165 qpriv->pending_tid_w_segs -= 2166 CIRC_CNT(req->flow_idx, 2167 req->clear_tail, 2168 MAX_FLOWS); 2169 req->flow_idx = req->clear_tail; 2170 req->state = TID_REQUEST_RESEND; 2171 req->cur_seg = req->comp_seg; 2172 } 2173 qpriv->s_flags &= ~HFI1_R_TID_WAIT_INTERLCK; 2174 } 2175 /* Re-process old requests.*/ 2176 if (qp->s_acked_ack_queue == qp->s_tail_ack_queue) 2177 qp->s_acked_ack_queue = prev; 2178 qp->s_tail_ack_queue = prev; 2179 /* 2180 * Since the qp->s_tail_ack_queue is modified, the 2181 * qp->s_ack_state must be changed to re-initialize 2182 * qp->s_ack_rdma_sge; Otherwise, we will end up in 2183 * the wrong memory region. 2184 */ 2185 qp->s_ack_state = OP(ACKNOWLEDGE); 2186 schedule: 2187 /* 2188 * It's possible to receive a retry psn that is earlier than an RNR NAK 2189 * psn. In this case, the RNR NAK state should be cleared. 2190 */ 2191 if (qpriv->rnr_nak_state) { 2192 qp->s_nak_state = 0; 2193 qpriv->rnr_nak_state = TID_RNR_NAK_INIT; 2194 qp->r_psn = e->lpsn + 1; 2195 hfi1_tid_write_alloc_resources(qp, true); 2196 } 2197 2198 qp->r_state = e->opcode; 2199 qp->r_nak_state = 0; 2200 qp->s_flags |= RVT_S_RESP_PENDING; 2201 hfi1_schedule_send(qp); 2202 unlock: 2203 spin_unlock_irqrestore(&qp->s_lock, flags); 2204 done: 2205 return 1; 2206 } 2207 2208 void hfi1_rc_rcv_tid_rdma_read_req(struct hfi1_packet *packet) 2209 { 2210 /* HANDLER FOR TID RDMA READ REQUEST packet (Responder side) */ 2211 2212 /* 2213 * 1. Verify TID RDMA READ REQ as per IB_OPCODE_RC_RDMA_READ 2214 * (see hfi1_rc_rcv()) 2215 * 2. Put TID RDMA READ REQ into the response queue (s_ack_queue) 2216 * - Setup struct tid_rdma_req with request info 2217 * - Initialize struct tid_rdma_flow info; 2218 * - Copy TID entries; 2219 * 3. Set the qp->s_ack_state. 2220 * 4. Set RVT_S_RESP_PENDING in s_flags. 2221 * 5.
Kick the send engine (hfi1_schedule_send()) 2222 */ 2223 struct hfi1_ctxtdata *rcd = packet->rcd; 2224 struct rvt_qp *qp = packet->qp; 2225 struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); 2226 struct ib_other_headers *ohdr = packet->ohdr; 2227 struct rvt_ack_entry *e; 2228 unsigned long flags; 2229 struct ib_reth *reth; 2230 struct hfi1_qp_priv *qpriv = qp->priv; 2231 u32 bth0, psn, len, rkey; 2232 bool fecn; 2233 u8 next; 2234 u64 vaddr; 2235 int diff; 2236 u8 nack_state = IB_NAK_INVALID_REQUEST; 2237 2238 bth0 = be32_to_cpu(ohdr->bth[0]); 2239 if (hfi1_ruc_check_hdr(ibp, packet)) 2240 return; 2241 2242 fecn = process_ecn(qp, packet); 2243 psn = mask_psn(be32_to_cpu(ohdr->bth[2])); 2244 trace_hfi1_rsp_rcv_tid_read_req(qp, psn); 2245 2246 if (qp->state == IB_QPS_RTR && !(qp->r_flags & RVT_R_COMM_EST)) 2247 rvt_comm_est(qp); 2248 2249 if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ))) 2250 goto nack_inv; 2251 2252 reth = &ohdr->u.tid_rdma.r_req.reth; 2253 vaddr = be64_to_cpu(reth->vaddr); 2254 len = be32_to_cpu(reth->length); 2255 /* The length needs to be in multiples of PAGE_SIZE */ 2256 if (!len || len & ~PAGE_MASK || len > qpriv->tid_rdma.local.max_len) 2257 goto nack_inv; 2258 2259 diff = delta_psn(psn, qp->r_psn); 2260 if (unlikely(diff)) { 2261 tid_rdma_rcv_err(packet, ohdr, qp, psn, diff, fecn); 2262 return; 2263 } 2264 2265 /* We've verified the request, insert it into the ack queue. */ 2266 next = qp->r_head_ack_queue + 1; 2267 if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device))) 2268 next = 0; 2269 spin_lock_irqsave(&qp->s_lock, flags); 2270 if (unlikely(next == qp->s_tail_ack_queue)) { 2271 if (!qp->s_ack_queue[next].sent) { 2272 nack_state = IB_NAK_REMOTE_OPERATIONAL_ERROR; 2273 goto nack_inv_unlock; 2274 } 2275 update_ack_queue(qp, next); 2276 } 2277 e = &qp->s_ack_queue[qp->r_head_ack_queue]; 2278 release_rdma_sge_mr(e); 2279 2280 rkey = be32_to_cpu(reth->rkey); 2281 qp->r_len = len; 2282 2283 if (unlikely(!rvt_rkey_ok(qp, &e->rdma_sge, qp->r_len, vaddr, 2284 rkey, IB_ACCESS_REMOTE_READ))) 2285 goto nack_acc; 2286 2287 /* Accept the request parameters */ 2288 if (tid_rdma_rcv_read_request(qp, e, packet, ohdr, bth0, psn, vaddr, 2289 len)) 2290 goto nack_inv_unlock; 2291 2292 qp->r_state = e->opcode; 2293 qp->r_nak_state = 0; 2294 /* 2295 * We need to increment the MSN here instead of when we 2296 * finish sending the result since a duplicate request would 2297 * increment it more than once. 2298 */ 2299 qp->r_msn++; 2300 qp->r_psn += e->lpsn - e->psn + 1; 2301 2302 qp->r_head_ack_queue = next; 2303 2304 /* 2305 * For all requests other than TID WRITE which are added to the ack 2306 * queue, qpriv->r_tid_alloc follows qp->r_head_ack_queue. It is ok to 2307 * do this because of interlocks between these and TID WRITE 2308 * requests. The same change has also been made in hfi1_rc_rcv(). 2309 */ 2310 qpriv->r_tid_alloc = qp->r_head_ack_queue; 2311 2312 /* Schedule the send tasklet. 
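* The send engine will see RVT_S_RESP_PENDING and build the TID RDMA READ RESP packets for this request.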
*/ 2313 qp->s_flags |= RVT_S_RESP_PENDING; 2314 if (fecn) 2315 qp->s_flags |= RVT_S_ECN; 2316 hfi1_schedule_send(qp); 2317 2318 spin_unlock_irqrestore(&qp->s_lock, flags); 2319 return; 2320 2321 nack_inv_unlock: 2322 spin_unlock_irqrestore(&qp->s_lock, flags); 2323 nack_inv: 2324 rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR); 2325 qp->r_nak_state = nack_state; 2326 qp->r_ack_psn = qp->r_psn; 2327 /* Queue NAK for later */ 2328 rc_defered_ack(rcd, qp); 2329 return; 2330 nack_acc: 2331 spin_unlock_irqrestore(&qp->s_lock, flags); 2332 rvt_rc_error(qp, IB_WC_LOC_PROT_ERR); 2333 qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR; 2334 qp->r_ack_psn = qp->r_psn; 2335 } 2336 2337 u32 hfi1_build_tid_rdma_read_resp(struct rvt_qp *qp, struct rvt_ack_entry *e, 2338 struct ib_other_headers *ohdr, u32 *bth0, 2339 u32 *bth1, u32 *bth2, u32 *len, bool *last) 2340 { 2341 struct hfi1_ack_priv *epriv = e->priv; 2342 struct tid_rdma_request *req = &epriv->tid_req; 2343 struct hfi1_qp_priv *qpriv = qp->priv; 2344 struct tid_rdma_flow *flow = &req->flows[req->clear_tail]; 2345 u32 tidentry = flow->tid_entry[flow->tid_idx]; 2346 u32 tidlen = EXP_TID_GET(tidentry, LEN) << PAGE_SHIFT; 2347 struct tid_rdma_read_resp *resp = &ohdr->u.tid_rdma.r_rsp; 2348 u32 next_offset, om = KDETH_OM_LARGE; 2349 bool last_pkt; 2350 u32 hdwords = 0; 2351 struct tid_rdma_params *remote; 2352 2353 *len = min_t(u32, qp->pmtu, tidlen - flow->tid_offset); 2354 flow->sent += *len; 2355 next_offset = flow->tid_offset + *len; 2356 last_pkt = (flow->sent >= flow->length); 2357 2358 trace_hfi1_tid_entry_build_read_resp(qp, flow->tid_idx, tidentry); 2359 trace_hfi1_tid_flow_build_read_resp(qp, req->clear_tail, flow); 2360 2361 rcu_read_lock(); 2362 remote = rcu_dereference(qpriv->tid_rdma.remote); 2363 if (!remote) { 2364 rcu_read_unlock(); 2365 goto done; 2366 } 2367 KDETH_RESET(resp->kdeth0, KVER, 0x1); 2368 KDETH_SET(resp->kdeth0, SH, !last_pkt); 2369 KDETH_SET(resp->kdeth0, INTR, !!(!last_pkt && remote->urg)); 2370 KDETH_SET(resp->kdeth0, TIDCTRL, EXP_TID_GET(tidentry, CTRL)); 2371 KDETH_SET(resp->kdeth0, TID, EXP_TID_GET(tidentry, IDX)); 2372 KDETH_SET(resp->kdeth0, OM, om == KDETH_OM_LARGE); 2373 KDETH_SET(resp->kdeth0, OFFSET, flow->tid_offset / om); 2374 KDETH_RESET(resp->kdeth1, JKEY, remote->jkey); 2375 resp->verbs_qp = cpu_to_be32(qp->remote_qpn); 2376 rcu_read_unlock(); 2377 2378 resp->aeth = rvt_compute_aeth(qp); 2379 resp->verbs_psn = cpu_to_be32(mask_psn(flow->flow_state.ib_spsn + 2380 flow->pkt)); 2381 2382 *bth0 = TID_OP(READ_RESP) << 24; 2383 *bth1 = flow->tid_qpn; 2384 *bth2 = mask_psn(((flow->flow_state.spsn + flow->pkt++) & 2385 HFI1_KDETH_BTH_SEQ_MASK) | 2386 (flow->flow_state.generation << 2387 HFI1_KDETH_BTH_SEQ_SHIFT)); 2388 *last = last_pkt; 2389 if (last_pkt) 2390 /* Advance to next flow */ 2391 req->clear_tail = (req->clear_tail + 1) & 2392 (MAX_FLOWS - 1); 2393 2394 if (next_offset >= tidlen) { 2395 flow->tid_offset = 0; 2396 flow->tid_idx++; 2397 } else { 2398 flow->tid_offset = next_offset; 2399 } 2400 2401 hdwords = sizeof(ohdr->u.tid_rdma.r_rsp) / sizeof(u32); 2402 2403 done: 2404 return hdwords; 2405 } 2406 2407 static inline struct tid_rdma_request * 2408 find_tid_request(struct rvt_qp *qp, u32 psn, enum ib_wr_opcode opcode) 2409 __must_hold(&qp->s_lock) 2410 { 2411 struct rvt_swqe *wqe; 2412 struct tid_rdma_request *req = NULL; 2413 u32 i, end; 2414 2415 end = qp->s_cur + 1; 2416 if (end == qp->s_size) 2417 end = 0; 2418 for (i = qp->s_acked; i != end;) { 2419 wqe = rvt_get_swqe_ptr(qp, i); 2420 if (cmp_psn(psn, 
wqe->psn) >= 0 && 2421 cmp_psn(psn, wqe->lpsn) <= 0) { 2422 if (wqe->wr.opcode == opcode) 2423 req = wqe_to_tid_req(wqe); 2424 break; 2425 } 2426 if (++i == qp->s_size) 2427 i = 0; 2428 } 2429 2430 return req; 2431 } 2432 2433 void hfi1_rc_rcv_tid_rdma_read_resp(struct hfi1_packet *packet) 2434 { 2435 /* HANDLER FOR TID RDMA READ RESPONSE packet (Requestor side) */ 2436 2437 /* 2438 * 1. Find matching SWQE 2439 * 2. Check that the entire segment has been read. 2440 * 3. Remove HFI1_S_WAIT_TID_RESP from s_flags. 2441 * 4. Free the TID flow resources. 2442 * 5. Kick the send engine (hfi1_schedule_send()) 2443 */ 2444 struct ib_other_headers *ohdr = packet->ohdr; 2445 struct rvt_qp *qp = packet->qp; 2446 struct hfi1_qp_priv *priv = qp->priv; 2447 struct hfi1_ctxtdata *rcd = packet->rcd; 2448 struct tid_rdma_request *req; 2449 struct tid_rdma_flow *flow; 2450 u32 opcode, aeth; 2451 bool fecn; 2452 unsigned long flags; 2453 u32 kpsn, ipsn; 2454 2455 trace_hfi1_sender_rcv_tid_read_resp(qp); 2456 fecn = process_ecn(qp, packet); 2457 kpsn = mask_psn(be32_to_cpu(ohdr->bth[2])); 2458 aeth = be32_to_cpu(ohdr->u.tid_rdma.r_rsp.aeth); 2459 opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff; 2460 2461 spin_lock_irqsave(&qp->s_lock, flags); 2462 ipsn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.r_rsp.verbs_psn)); 2463 req = find_tid_request(qp, ipsn, IB_WR_TID_RDMA_READ); 2464 if (unlikely(!req)) 2465 goto ack_op_err; 2466 2467 flow = &req->flows[req->clear_tail]; 2468 /* When header suppression is disabled */ 2469 if (cmp_psn(ipsn, flow->flow_state.ib_lpsn)) { 2470 update_r_next_psn_fecn(packet, priv, rcd, flow, fecn); 2471 2472 if (cmp_psn(kpsn, flow->flow_state.r_next_psn)) 2473 goto ack_done; 2474 flow->flow_state.r_next_psn = mask_psn(kpsn + 1); 2475 /* 2476 * Copy the payload to destination buffer if this packet is 2477 * delivered as an eager packet due to RSM rule and FECN. 2478 * The RSM rule selects FECN bit in BTH and SH bit in 2479 * KDETH header and therefore will not match the last 2480 * packet of each segment that has SH bit cleared.
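* Such a last packet is delivered as an eager packet instead, and its payload is copied to the destination buffer by software below.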
2481 */ 2482 if (fecn && packet->etype == RHF_RCV_TYPE_EAGER) { 2483 struct rvt_sge_state ss; 2484 u32 len; 2485 u32 tlen = packet->tlen; 2486 u16 hdrsize = packet->hlen; 2487 u8 pad = packet->pad; 2488 u8 extra_bytes = pad + packet->extra_byte + 2489 (SIZE_OF_CRC << 2); 2490 u32 pmtu = qp->pmtu; 2491 2492 if (unlikely(tlen != (hdrsize + pmtu + extra_bytes))) 2493 goto ack_op_err; 2494 len = restart_sge(&ss, req->e.swqe, ipsn, pmtu); 2495 if (unlikely(len < pmtu)) 2496 goto ack_op_err; 2497 rvt_copy_sge(qp, &ss, packet->payload, pmtu, false, 2498 false); 2499 /* Raise the sw sequence check flag for next packet */ 2500 priv->s_flags |= HFI1_R_TID_SW_PSN; 2501 } 2502 2503 goto ack_done; 2504 } 2505 flow->flow_state.r_next_psn = mask_psn(kpsn + 1); 2506 req->ack_pending--; 2507 priv->pending_tid_r_segs--; 2508 qp->s_num_rd_atomic--; 2509 if ((qp->s_flags & RVT_S_WAIT_FENCE) && 2510 !qp->s_num_rd_atomic) { 2511 qp->s_flags &= ~(RVT_S_WAIT_FENCE | 2512 RVT_S_WAIT_ACK); 2513 hfi1_schedule_send(qp); 2514 } 2515 if (qp->s_flags & RVT_S_WAIT_RDMAR) { 2516 qp->s_flags &= ~(RVT_S_WAIT_RDMAR | RVT_S_WAIT_ACK); 2517 hfi1_schedule_send(qp); 2518 } 2519 2520 trace_hfi1_ack(qp, ipsn); 2521 trace_hfi1_tid_req_rcv_read_resp(qp, 0, req->e.swqe->wr.opcode, 2522 req->e.swqe->psn, req->e.swqe->lpsn, 2523 req); 2524 trace_hfi1_tid_flow_rcv_read_resp(qp, req->clear_tail, flow); 2525 2526 /* Release the tid resources */ 2527 hfi1_kern_exp_rcv_clear(req); 2528 2529 if (!do_rc_ack(qp, aeth, ipsn, opcode, 0, rcd)) 2530 goto ack_done; 2531 2532 /* If not done yet, build next read request */ 2533 if (++req->comp_seg >= req->total_segs) { 2534 priv->tid_r_comp++; 2535 req->state = TID_REQUEST_COMPLETE; 2536 } 2537 2538 /* 2539 * Clear the hw flow under two conditions: 2540 * 1. This request is a sync point and it is complete; 2541 * 2. Current request is completed and there are no more requests. 2542 */ 2543 if ((req->state == TID_REQUEST_SYNC && 2544 req->comp_seg == req->cur_seg) || 2545 priv->tid_r_comp == priv->tid_r_reqs) { 2546 hfi1_kern_clear_hw_flow(priv->rcd, qp); 2547 priv->s_flags &= ~HFI1_R_TID_SW_PSN; 2548 if (req->state == TID_REQUEST_SYNC) 2549 req->state = TID_REQUEST_ACTIVE; 2550 } 2551 2552 hfi1_schedule_send(qp); 2553 goto ack_done; 2554 2555 ack_op_err: 2556 /* 2557 * The test indicates that the send engine has finished its cleanup 2558 * after sending the request and it's now safe to put the QP into error 2559 * state. However, if the wqe queue is empty (qp->s_acked == qp->s_tail 2560 * == qp->s_head), it would be unsafe to complete the wqe pointed by 2561 * qp->s_acked here. Putting the qp into error state will safely flush 2562 * all remaining requests. 
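* (Hence the qp->s_last == qp->s_acked check before rvt_error_qp() below.)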
2563 */ 2564 if (qp->s_last == qp->s_acked) 2565 rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR); 2566 2567 ack_done: 2568 spin_unlock_irqrestore(&qp->s_lock, flags); 2569 } 2570 2571 void hfi1_kern_read_tid_flow_free(struct rvt_qp *qp) 2572 __must_hold(&qp->s_lock) 2573 { 2574 u32 n = qp->s_acked; 2575 struct rvt_swqe *wqe; 2576 struct tid_rdma_request *req; 2577 struct hfi1_qp_priv *priv = qp->priv; 2578 2579 lockdep_assert_held(&qp->s_lock); 2580 /* Free any TID entries */ 2581 while (n != qp->s_tail) { 2582 wqe = rvt_get_swqe_ptr(qp, n); 2583 if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) { 2584 req = wqe_to_tid_req(wqe); 2585 hfi1_kern_exp_rcv_clear_all(req); 2586 } 2587 2588 if (++n == qp->s_size) 2589 n = 0; 2590 } 2591 /* Free flow */ 2592 hfi1_kern_clear_hw_flow(priv->rcd, qp); 2593 } 2594 2595 static bool tid_rdma_tid_err(struct hfi1_packet *packet, u8 rcv_type) 2596 { 2597 struct rvt_qp *qp = packet->qp; 2598 2599 if (rcv_type >= RHF_RCV_TYPE_IB) 2600 goto done; 2601 2602 spin_lock(&qp->s_lock); 2603 2604 /* 2605 * We've run out of space in the eager buffer. 2606 * Eagerly received KDETH packets which require space in the 2607 * Eager buffer (packets that have a payload) are TID RDMA WRITE 2608 * response packets. In this case, we have to re-transmit the 2609 * TID RDMA WRITE request. 2610 */ 2611 if (rcv_type == RHF_RCV_TYPE_EAGER) { 2612 hfi1_restart_rc(qp, qp->s_last_psn + 1, 1); 2613 hfi1_schedule_send(qp); 2614 } 2615 2616 /* Since no payload is delivered, just drop the packet */ 2617 spin_unlock(&qp->s_lock); 2618 done: 2619 return true; 2620 } 2621 2622 static void restart_tid_rdma_read_req(struct hfi1_ctxtdata *rcd, 2623 struct rvt_qp *qp, struct rvt_swqe *wqe) 2624 { 2625 struct tid_rdma_request *req; 2626 struct tid_rdma_flow *flow; 2627 2628 /* Start from the right segment */ 2629 qp->r_flags |= RVT_R_RDMAR_SEQ; 2630 req = wqe_to_tid_req(wqe); 2631 flow = &req->flows[req->clear_tail]; 2632 hfi1_restart_rc(qp, flow->flow_state.ib_spsn, 0); 2633 if (list_empty(&qp->rspwait)) { 2634 qp->r_flags |= RVT_R_RSP_SEND; 2635 rvt_get_qp(qp); 2636 list_add_tail(&qp->rspwait, &rcd->qp_wait_list); 2637 } 2638 } 2639 2640 /* 2641 * Handle the KDETH eflags for TID RDMA READ response. 2642 * 2643 * Return false if the last packet for a segment has been received and it is 2644 * time to process the response normally; otherwise, return true. 2645 * 2646 * The caller must hold the packet->qp->r_lock and the rcu_read_lock.
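* The qp->s_lock is acquired internally while the send queue is examined.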
2647 */ 2648 static bool handle_read_kdeth_eflags(struct hfi1_ctxtdata *rcd, 2649 struct hfi1_packet *packet, u8 rcv_type, 2650 u8 rte, u32 psn, u32 ibpsn) 2651 __must_hold(&packet->qp->r_lock) __must_hold(RCU) 2652 { 2653 struct hfi1_pportdata *ppd = rcd->ppd; 2654 struct hfi1_devdata *dd = ppd->dd; 2655 struct hfi1_ibport *ibp; 2656 struct rvt_swqe *wqe; 2657 struct tid_rdma_request *req; 2658 struct tid_rdma_flow *flow; 2659 u32 ack_psn; 2660 struct rvt_qp *qp = packet->qp; 2661 struct hfi1_qp_priv *priv = qp->priv; 2662 bool ret = true; 2663 int diff = 0; 2664 u32 fpsn; 2665 2666 lockdep_assert_held(&qp->r_lock); 2667 trace_hfi1_rsp_read_kdeth_eflags(qp, ibpsn); 2668 trace_hfi1_sender_read_kdeth_eflags(qp); 2669 trace_hfi1_tid_read_sender_kdeth_eflags(qp, 0); 2670 spin_lock(&qp->s_lock); 2671 /* If the psn is out of valid range, drop the packet */ 2672 if (cmp_psn(ibpsn, qp->s_last_psn) < 0 || 2673 cmp_psn(ibpsn, qp->s_psn) > 0) 2674 goto s_unlock; 2675 2676 /* 2677 * Note that NAKs implicitly ACK outstanding SEND and RDMA write 2678 * requests and implicitly NAK RDMA read and atomic requests issued 2679 * before the NAK'ed request. 2680 */ 2681 ack_psn = ibpsn - 1; 2682 wqe = rvt_get_swqe_ptr(qp, qp->s_acked); 2683 ibp = to_iport(qp->ibqp.device, qp->port_num); 2684 2685 /* Complete WQEs that the PSN finishes. */ 2686 while ((int)delta_psn(ack_psn, wqe->lpsn) >= 0) { 2687 /* 2688 * If this request is a RDMA read or atomic, and the NACK is 2689 * for a later operation, this NACK NAKs the RDMA read or 2690 * atomic. 2691 */ 2692 if (wqe->wr.opcode == IB_WR_RDMA_READ || 2693 wqe->wr.opcode == IB_WR_TID_RDMA_READ || 2694 wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP || 2695 wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) { 2696 /* Retry this request. */ 2697 if (!(qp->r_flags & RVT_R_RDMAR_SEQ)) { 2698 qp->r_flags |= RVT_R_RDMAR_SEQ; 2699 if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) { 2700 restart_tid_rdma_read_req(rcd, qp, 2701 wqe); 2702 } else { 2703 hfi1_restart_rc(qp, qp->s_last_psn + 1, 2704 0); 2705 if (list_empty(&qp->rspwait)) { 2706 qp->r_flags |= RVT_R_RSP_SEND; 2707 rvt_get_qp(qp); 2708 list_add_tail(/* wait */ 2709 &qp->rspwait, 2710 &rcd->qp_wait_list); 2711 } 2712 } 2713 } 2714 /* 2715 * No need to process the NAK since we are 2716 * restarting an earlier request. 2717 */ 2718 break; 2719 } 2720 2721 wqe = do_rc_completion(qp, wqe, ibp); 2722 if (qp->s_acked == qp->s_tail) 2723 goto s_unlock; 2724 } 2725 2726 if (qp->s_acked == qp->s_tail) 2727 goto s_unlock; 2728 2729 /* Handle the eflags for the request */ 2730 if (wqe->wr.opcode != IB_WR_TID_RDMA_READ) 2731 goto s_unlock; 2732 2733 req = wqe_to_tid_req(wqe); 2734 trace_hfi1_tid_req_read_kdeth_eflags(qp, 0, wqe->wr.opcode, wqe->psn, 2735 wqe->lpsn, req); 2736 switch (rcv_type) { 2737 case RHF_RCV_TYPE_EXPECTED: 2738 switch (rte) { 2739 case RHF_RTE_EXPECTED_FLOW_SEQ_ERR: 2740 /* 2741 * On the first occurrence of a Flow Sequence error, 2742 * the flag TID_FLOW_SW_PSN is set. 2743 * 2744 * After that, the flow is *not* reprogrammed and the 2745 * protocol falls back to SW PSN checking. This is done 2746 * to prevent continuous Flow Sequence errors for any 2747 * packets that could be still in the fabric. 
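* Once in SW PSN mode, r_next_psn is advanced by software for each packet that passes the checks below.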
2748 */ 2749 flow = &req->flows[req->clear_tail]; 2750 trace_hfi1_tid_flow_read_kdeth_eflags(qp, 2751 req->clear_tail, 2752 flow); 2753 if (priv->s_flags & HFI1_R_TID_SW_PSN) { 2754 diff = cmp_psn(psn, 2755 flow->flow_state.r_next_psn); 2756 if (diff > 0) { 2757 /* Drop the packet.*/ 2758 goto s_unlock; 2759 } else if (diff < 0) { 2760 /* 2761 * If a response packet for a restarted 2762 * request has come back, reset the 2763 * restart flag. 2764 */ 2765 if (qp->r_flags & RVT_R_RDMAR_SEQ) 2766 qp->r_flags &= 2767 ~RVT_R_RDMAR_SEQ; 2768 2769 /* Drop the packet.*/ 2770 goto s_unlock; 2771 } 2772 2773 /* 2774 * If SW PSN verification is successful and 2775 * this is the last packet in the segment, tell 2776 * the caller to process it as a normal packet. 2777 */ 2778 fpsn = full_flow_psn(flow, 2779 flow->flow_state.lpsn); 2780 if (cmp_psn(fpsn, psn) == 0) { 2781 ret = false; 2782 if (qp->r_flags & RVT_R_RDMAR_SEQ) 2783 qp->r_flags &= 2784 ~RVT_R_RDMAR_SEQ; 2785 } 2786 flow->flow_state.r_next_psn = 2787 mask_psn(psn + 1); 2788 } else { 2789 u32 last_psn; 2790 2791 last_psn = read_r_next_psn(dd, rcd->ctxt, 2792 flow->idx); 2793 flow->flow_state.r_next_psn = last_psn; 2794 priv->s_flags |= HFI1_R_TID_SW_PSN; 2795 /* 2796 * If no request has been restarted yet, 2797 * restart the current one. 2798 */ 2799 if (!(qp->r_flags & RVT_R_RDMAR_SEQ)) 2800 restart_tid_rdma_read_req(rcd, qp, 2801 wqe); 2802 } 2803 2804 break; 2805 2806 case RHF_RTE_EXPECTED_FLOW_GEN_ERR: 2807 /* 2808 * Since the TID flow is able to ride through 2809 * generation mismatch, drop this stale packet. 2810 */ 2811 break; 2812 2813 default: 2814 break; 2815 } 2816 break; 2817 2818 case RHF_RCV_TYPE_ERROR: 2819 switch (rte) { 2820 case RHF_RTE_ERROR_OP_CODE_ERR: 2821 case RHF_RTE_ERROR_KHDR_MIN_LEN_ERR: 2822 case RHF_RTE_ERROR_KHDR_HCRC_ERR: 2823 case RHF_RTE_ERROR_KHDR_KVER_ERR: 2824 case RHF_RTE_ERROR_CONTEXT_ERR: 2825 case RHF_RTE_ERROR_KHDR_TID_ERR: 2826 default: 2827 break; 2828 } 2829 break; 2830 default: 2831 break; 2832 } 2833 s_unlock: 2834 spin_unlock(&qp->s_lock); 2835 return ret; 2836 } 2837 2838 bool hfi1_handle_kdeth_eflags(struct hfi1_ctxtdata *rcd, 2839 struct hfi1_pportdata *ppd, 2840 struct hfi1_packet *packet) 2841 { 2842 struct hfi1_ibport *ibp = &ppd->ibport_data; 2843 struct hfi1_devdata *dd = ppd->dd; 2844 struct rvt_dev_info *rdi = &dd->verbs_dev.rdi; 2845 u8 rcv_type = rhf_rcv_type(packet->rhf); 2846 u8 rte = rhf_rcv_type_err(packet->rhf); 2847 struct ib_header *hdr = packet->hdr; 2848 struct ib_other_headers *ohdr = NULL; 2849 int lnh = be16_to_cpu(hdr->lrh[0]) & 3; 2850 u16 lid = be16_to_cpu(hdr->lrh[1]); 2851 u8 opcode; 2852 u32 qp_num, psn, ibpsn; 2853 struct rvt_qp *qp; 2854 struct hfi1_qp_priv *qpriv; 2855 unsigned long flags; 2856 bool ret = true; 2857 struct rvt_ack_entry *e; 2858 struct tid_rdma_request *req; 2859 struct tid_rdma_flow *flow; 2860 int diff = 0; 2861 2862 trace_hfi1_msg_handle_kdeth_eflags(NULL, "Kdeth error: rhf ", 2863 packet->rhf); 2864 if (packet->rhf & RHF_ICRC_ERR) 2865 return ret; 2866 2867 packet->ohdr = &hdr->u.oth; 2868 ohdr = packet->ohdr; 2869 trace_input_ibhdr(rcd->dd, packet, !!(rhf_dc_info(packet->rhf))); 2870 2871 /* Get the destination QP number. 
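* For TID RDMA packets the verbs QP number is carried in the KDETH verbs_qp field rather than in the BTH.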
*/ 2872 qp_num = be32_to_cpu(ohdr->u.tid_rdma.r_rsp.verbs_qp) & 2873 RVT_QPN_MASK; 2874 if (lid >= be16_to_cpu(IB_MULTICAST_LID_BASE)) 2875 goto drop; 2876 2877 psn = mask_psn(be32_to_cpu(ohdr->bth[2])); 2878 opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff; 2879 2880 rcu_read_lock(); 2881 qp = rvt_lookup_qpn(rdi, &ibp->rvp, qp_num); 2882 if (!qp) 2883 goto rcu_unlock; 2884 2885 packet->qp = qp; 2886 2887 /* Check for valid receive state. */ 2888 spin_lock_irqsave(&qp->r_lock, flags); 2889 if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) { 2890 ibp->rvp.n_pkt_drops++; 2891 goto r_unlock; 2892 } 2893 2894 if (packet->rhf & RHF_TID_ERR) { 2895 /* For TIDERR and RC QPs preemptively schedule a NAK */ 2896 u32 tlen = rhf_pkt_len(packet->rhf); /* in bytes */ 2897 2898 /* Sanity check packet */ 2899 if (tlen < 24) 2900 goto r_unlock; 2901 2902 /* 2903 * Check for GRH. We should never get packets with GRH in this 2904 * path. 2905 */ 2906 if (lnh == HFI1_LRH_GRH) 2907 goto r_unlock; 2908 2909 if (tid_rdma_tid_err(packet, rcv_type)) 2910 goto r_unlock; 2911 } 2912 2913 /* handle TID RDMA READ */ 2914 if (opcode == TID_OP(READ_RESP)) { 2915 ibpsn = be32_to_cpu(ohdr->u.tid_rdma.r_rsp.verbs_psn); 2916 ibpsn = mask_psn(ibpsn); 2917 ret = handle_read_kdeth_eflags(rcd, packet, rcv_type, rte, psn, 2918 ibpsn); 2919 goto r_unlock; 2920 } 2921 2922 /* 2923 * qp->s_tail_ack_queue points to the rvt_ack_entry currently being 2924 * processed. These are completed sequentially so we can be sure that 2925 * the pointer will not change until the entire request has completed. 2926 */ 2927 spin_lock(&qp->s_lock); 2928 qpriv = qp->priv; 2929 if (qpriv->r_tid_tail == HFI1_QP_WQE_INVALID || 2930 qpriv->r_tid_tail == qpriv->r_tid_head) 2931 goto unlock; 2932 e = &qp->s_ack_queue[qpriv->r_tid_tail]; 2933 if (e->opcode != TID_OP(WRITE_REQ)) 2934 goto unlock; 2935 req = ack_to_tid_req(e); 2936 if (req->comp_seg == req->cur_seg) 2937 goto unlock; 2938 flow = &req->flows[req->clear_tail]; 2939 trace_hfi1_eflags_err_write(qp, rcv_type, rte, psn); 2940 trace_hfi1_rsp_handle_kdeth_eflags(qp, psn); 2941 trace_hfi1_tid_write_rsp_handle_kdeth_eflags(qp); 2942 trace_hfi1_tid_req_handle_kdeth_eflags(qp, 0, e->opcode, e->psn, 2943 e->lpsn, req); 2944 trace_hfi1_tid_flow_handle_kdeth_eflags(qp, req->clear_tail, flow); 2945 2946 switch (rcv_type) { 2947 case RHF_RCV_TYPE_EXPECTED: 2948 switch (rte) { 2949 case RHF_RTE_EXPECTED_FLOW_SEQ_ERR: 2950 if (!(qpriv->s_flags & HFI1_R_TID_SW_PSN)) { 2951 qpriv->s_flags |= HFI1_R_TID_SW_PSN; 2952 flow->flow_state.r_next_psn = 2953 read_r_next_psn(dd, rcd->ctxt, 2954 flow->idx); 2955 qpriv->r_next_psn_kdeth = 2956 flow->flow_state.r_next_psn; 2957 goto nak_psn; 2958 } else { 2959 /* 2960 * If the received PSN does not match the next 2961 * expected PSN, NAK the packet. 2962 * However, only do that if we know that a 2963 * NAK has already been sent. Otherwise, this 2964 * mismatch could be due to packets that were 2965 * already in flight. 2966 */ 2967 diff = cmp_psn(psn, 2968 flow->flow_state.r_next_psn); 2969 if (diff > 0) 2970 goto nak_psn; 2971 else if (diff < 0) 2972 break; 2973 2974 qpriv->s_nak_state = 0; 2975 /* 2976 * If SW PSN verification is successful and this 2977 * is the last packet in the segment, tell the 2978 * caller to process it as a normal packet.
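* (ret is set to false below for exactly that case.)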
2979 */ 2980 if (psn == full_flow_psn(flow, 2981 flow->flow_state.lpsn)) 2982 ret = false; 2983 flow->flow_state.r_next_psn = 2984 mask_psn(psn + 1); 2985 qpriv->r_next_psn_kdeth = 2986 flow->flow_state.r_next_psn; 2987 } 2988 break; 2989 2990 case RHF_RTE_EXPECTED_FLOW_GEN_ERR: 2991 goto nak_psn; 2992 2993 default: 2994 break; 2995 } 2996 break; 2997 2998 case RHF_RCV_TYPE_ERROR: 2999 switch (rte) { 3000 case RHF_RTE_ERROR_OP_CODE_ERR: 3001 case RHF_RTE_ERROR_KHDR_MIN_LEN_ERR: 3002 case RHF_RTE_ERROR_KHDR_HCRC_ERR: 3003 case RHF_RTE_ERROR_KHDR_KVER_ERR: 3004 case RHF_RTE_ERROR_CONTEXT_ERR: 3005 case RHF_RTE_ERROR_KHDR_TID_ERR: 3006 default: 3007 break; 3008 } 3009 break; 3010 default: 3011 break; 3012 } 3013 3014 unlock: 3015 spin_unlock(&qp->s_lock); 3016 r_unlock: 3017 spin_unlock_irqrestore(&qp->r_lock, flags); 3018 rcu_unlock: 3019 rcu_read_unlock(); 3020 drop: 3021 return ret; 3022 nak_psn: 3023 ibp->rvp.n_rc_seqnak++; 3024 if (!qpriv->s_nak_state) { 3025 qpriv->s_nak_state = IB_NAK_PSN_ERROR; 3026 /* We are NAK'ing the next expected PSN */ 3027 qpriv->s_nak_psn = mask_psn(flow->flow_state.r_next_psn); 3028 tid_rdma_trigger_ack(qp); 3029 } 3030 goto unlock; 3031 } 3032 3033 /* 3034 * "Rewind" the TID request information. 3035 * This means that we reset the state back to ACTIVE, 3036 * find the proper flow, set the flow index to that flow, 3037 * and reset the flow information. 3038 */ 3039 void hfi1_tid_rdma_restart_req(struct rvt_qp *qp, struct rvt_swqe *wqe, 3040 u32 *bth2) 3041 { 3042 struct tid_rdma_request *req = wqe_to_tid_req(wqe); 3043 struct tid_rdma_flow *flow; 3044 struct hfi1_qp_priv *qpriv = qp->priv; 3045 int diff, delta_pkts; 3046 u32 tididx = 0, i; 3047 u16 fidx; 3048 3049 if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) { 3050 *bth2 = mask_psn(qp->s_psn); 3051 flow = find_flow_ib(req, *bth2, &fidx); 3052 if (!flow) { 3053 trace_hfi1_msg_tid_restart_req(/* msg */ 3054 qp, "!!!!!! Could not find flow to restart: bth2 ", 3055 (u64)*bth2); 3056 trace_hfi1_tid_req_restart_req(qp, 0, wqe->wr.opcode, 3057 wqe->psn, wqe->lpsn, 3058 req); 3059 return; 3060 } 3061 } else { 3062 fidx = req->acked_tail; 3063 flow = &req->flows[fidx]; 3064 *bth2 = mask_psn(req->r_ack_psn); 3065 } 3066 3067 if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) 3068 delta_pkts = delta_psn(*bth2, flow->flow_state.ib_spsn); 3069 else 3070 delta_pkts = delta_psn(*bth2, 3071 full_flow_psn(flow, 3072 flow->flow_state.spsn)); 3073 3074 trace_hfi1_tid_flow_restart_req(qp, fidx, flow); 3075 diff = delta_pkts + flow->resync_npkts; 3076 3077 flow->sent = 0; 3078 flow->pkt = 0; 3079 flow->tid_idx = 0; 3080 flow->tid_offset = 0; 3081 if (diff) { 3082 for (tididx = 0; tididx < flow->tidcnt; tididx++) { 3083 u32 tidentry = flow->tid_entry[tididx], tidlen, 3084 tidnpkts, npkts; 3085 3086 flow->tid_offset = 0; 3087 tidlen = EXP_TID_GET(tidentry, LEN) * PAGE_SIZE; 3088 tidnpkts = rvt_div_round_up_mtu(qp, tidlen); 3089 npkts = min_t(u32, diff, tidnpkts); 3090 flow->pkt += npkts; 3091 flow->sent += (npkts == tidnpkts ? tidlen : 3092 npkts * qp->pmtu); 3093 flow->tid_offset += npkts * qp->pmtu; 3094 diff -= npkts; 3095 if (!diff) 3096 break; 3097 } 3098 } 3099 if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE) { 3100 rvt_skip_sge(&qpriv->tid_ss, (req->cur_seg * req->seg_len) + 3101 flow->sent, 0); 3102 /* 3103 * Packet PSN is based on flow_state.spsn + flow->pkt. However, 3104 * during a RESYNC, the generation is incremented and the 3105 * sequence is reset to 0. 
Since we've adjusted the npkts in the 3106 * flow and the SGE has been sufficiently advanced, we have to 3107 * adjust flow->pkt in order to calculate the correct PSN. 3108 */ 3109 flow->pkt -= flow->resync_npkts; 3110 } 3111 3112 if (flow->tid_offset == 3113 EXP_TID_GET(flow->tid_entry[tididx], LEN) * PAGE_SIZE) { 3114 tididx++; 3115 flow->tid_offset = 0; 3116 } 3117 flow->tid_idx = tididx; 3118 if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) 3119 /* Move flow_idx to correct index */ 3120 req->flow_idx = fidx; 3121 else 3122 req->clear_tail = fidx; 3123 3124 trace_hfi1_tid_flow_restart_req(qp, fidx, flow); 3125 trace_hfi1_tid_req_restart_req(qp, 0, wqe->wr.opcode, wqe->psn, 3126 wqe->lpsn, req); 3127 req->state = TID_REQUEST_ACTIVE; 3128 if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE) { 3129 /* Reset all the flows that we are going to resend */ 3130 fidx = CIRC_NEXT(fidx, MAX_FLOWS); 3131 i = qpriv->s_tid_tail; 3132 do { 3133 for (; CIRC_CNT(req->setup_head, fidx, MAX_FLOWS); 3134 fidx = CIRC_NEXT(fidx, MAX_FLOWS)) { 3135 req->flows[fidx].sent = 0; 3136 req->flows[fidx].pkt = 0; 3137 req->flows[fidx].tid_idx = 0; 3138 req->flows[fidx].tid_offset = 0; 3139 req->flows[fidx].resync_npkts = 0; 3140 } 3141 if (i == qpriv->s_tid_cur) 3142 break; 3143 do { 3144 i = (++i == qp->s_size ? 0 : i); 3145 wqe = rvt_get_swqe_ptr(qp, i); 3146 } while (wqe->wr.opcode != IB_WR_TID_RDMA_WRITE); 3147 req = wqe_to_tid_req(wqe); 3148 req->cur_seg = req->ack_seg; 3149 fidx = req->acked_tail; 3150 /* Pull req->clear_tail back */ 3151 req->clear_tail = fidx; 3152 } while (1); 3153 } 3154 } 3155 3156 void hfi1_qp_kern_exp_rcv_clear_all(struct rvt_qp *qp) 3157 { 3158 int i, ret; 3159 struct hfi1_qp_priv *qpriv = qp->priv; 3160 struct tid_flow_state *fs; 3161 3162 if (qp->ibqp.qp_type != IB_QPT_RC || !HFI1_CAP_IS_KSET(TID_RDMA)) 3163 return; 3164 3165 /* 3166 * First, clear the flow to help prevent any delayed packets from 3167 * being delivered. 3168 */ 3169 fs = &qpriv->flow_state; 3170 if (fs->index != RXE_NUM_TID_FLOWS) 3171 hfi1_kern_clear_hw_flow(qpriv->rcd, qp); 3172 3173 for (i = qp->s_acked; i != qp->s_head;) { 3174 struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, i); 3175 3176 if (++i == qp->s_size) 3177 i = 0; 3178 /* Free only locally allocated TID entries */ 3179 if (wqe->wr.opcode != IB_WR_TID_RDMA_READ) 3180 continue; 3181 do { 3182 struct hfi1_swqe_priv *priv = wqe->priv; 3183 3184 ret = hfi1_kern_exp_rcv_clear(&priv->tid_req); 3185 } while (!ret); 3186 } 3187 for (i = qp->s_acked_ack_queue; i != qp->r_head_ack_queue;) { 3188 struct rvt_ack_entry *e = &qp->s_ack_queue[i]; 3189 3190 if (++i == rvt_max_atomic(ib_to_rvt(qp->ibqp.device))) 3191 i = 0; 3192 /* Free only locally allocated TID entries */ 3193 if (e->opcode != TID_OP(WRITE_REQ)) 3194 continue; 3195 do { 3196 struct hfi1_ack_priv *priv = e->priv; 3197 3198 ret = hfi1_kern_exp_rcv_clear(&priv->tid_req); 3199 } while (!ret); 3200 } 3201 } 3202 3203 bool hfi1_tid_rdma_wqe_interlock(struct rvt_qp *qp, struct rvt_swqe *wqe) 3204 { 3205 struct rvt_swqe *prev; 3206 struct hfi1_qp_priv *priv = qp->priv; 3207 u32 s_prev; 3208 struct tid_rdma_request *req; 3209 3210 s_prev = (qp->s_cur == 0 ? 
qp->s_size : qp->s_cur) - 1; 3211 prev = rvt_get_swqe_ptr(qp, s_prev); 3212 3213 switch (wqe->wr.opcode) { 3214 case IB_WR_SEND: 3215 case IB_WR_SEND_WITH_IMM: 3216 case IB_WR_SEND_WITH_INV: 3217 case IB_WR_ATOMIC_CMP_AND_SWP: 3218 case IB_WR_ATOMIC_FETCH_AND_ADD: 3219 case IB_WR_RDMA_WRITE: 3220 case IB_WR_RDMA_WRITE_WITH_IMM: 3221 switch (prev->wr.opcode) { 3222 case IB_WR_TID_RDMA_WRITE: 3223 req = wqe_to_tid_req(prev); 3224 if (req->ack_seg != req->total_segs) 3225 goto interlock; 3226 break; 3227 default: 3228 break; 3229 } 3230 break; 3231 case IB_WR_RDMA_READ: 3232 if (prev->wr.opcode != IB_WR_TID_RDMA_WRITE) 3233 break; 3234 fallthrough; 3235 case IB_WR_TID_RDMA_READ: 3236 switch (prev->wr.opcode) { 3237 case IB_WR_RDMA_READ: 3238 if (qp->s_acked != qp->s_cur) 3239 goto interlock; 3240 break; 3241 case IB_WR_TID_RDMA_WRITE: 3242 req = wqe_to_tid_req(prev); 3243 if (req->ack_seg != req->total_segs) 3244 goto interlock; 3245 break; 3246 default: 3247 break; 3248 } 3249 break; 3250 default: 3251 break; 3252 } 3253 return false; 3254 3255 interlock: 3256 priv->s_flags |= HFI1_S_TID_WAIT_INTERLCK; 3257 return true; 3258 } 3259 3260 /* Does @sge meet the alignment requirements for tid rdma? */ 3261 static inline bool hfi1_check_sge_align(struct rvt_qp *qp, 3262 struct rvt_sge *sge, int num_sge) 3263 { 3264 int i; 3265 3266 for (i = 0; i < num_sge; i++, sge++) { 3267 trace_hfi1_sge_check_align(qp, i, sge); 3268 if ((u64)sge->vaddr & ~PAGE_MASK || 3269 sge->sge_length & ~PAGE_MASK) 3270 return false; 3271 } 3272 return true; 3273 } 3274 3275 void setup_tid_rdma_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe) 3276 { 3277 struct hfi1_qp_priv *qpriv = (struct hfi1_qp_priv *)qp->priv; 3278 struct hfi1_swqe_priv *priv = wqe->priv; 3279 struct tid_rdma_params *remote; 3280 enum ib_wr_opcode new_opcode; 3281 bool do_tid_rdma = false; 3282 struct hfi1_pportdata *ppd = qpriv->rcd->ppd; 3283 3284 if ((rdma_ah_get_dlid(&qp->remote_ah_attr) & ~((1 << ppd->lmc) - 1)) == 3285 ppd->lid) 3286 return; 3287 if (qpriv->hdr_type != HFI1_PKT_TYPE_9B) 3288 return; 3289 3290 rcu_read_lock(); 3291 remote = rcu_dereference(qpriv->tid_rdma.remote); 3292 /* 3293 * If TID RDMA is disabled by the negotiation, don't 3294 * use it. 3295 */ 3296 if (!remote) 3297 goto exit; 3298 3299 if (wqe->wr.opcode == IB_WR_RDMA_READ) { 3300 if (hfi1_check_sge_align(qp, &wqe->sg_list[0], 3301 wqe->wr.num_sge)) { 3302 new_opcode = IB_WR_TID_RDMA_READ; 3303 do_tid_rdma = true; 3304 } 3305 } else if (wqe->wr.opcode == IB_WR_RDMA_WRITE) { 3306 /* 3307 * TID RDMA is enabled for this RDMA WRITE request iff: 3308 * 1. The remote address is page-aligned, 3309 * 2. The length is larger than the minimum segment size, 3310 * 3. The length is page-multiple. 
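* Conditions 1 and 3 are checked explicitly below; the segment size itself comes from the negotiated remote->max_len.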
3311 */ 3312 if (!(wqe->rdma_wr.remote_addr & ~PAGE_MASK) && 3313 !(wqe->length & ~PAGE_MASK)) { 3314 new_opcode = IB_WR_TID_RDMA_WRITE; 3315 do_tid_rdma = true; 3316 } 3317 } 3318 3319 if (do_tid_rdma) { 3320 if (hfi1_kern_exp_rcv_alloc_flows(&priv->tid_req, GFP_ATOMIC)) 3321 goto exit; 3322 wqe->wr.opcode = new_opcode; 3323 priv->tid_req.seg_len = 3324 min_t(u32, remote->max_len, wqe->length); 3325 priv->tid_req.total_segs = 3326 DIV_ROUND_UP(wqe->length, priv->tid_req.seg_len); 3327 /* Compute the last PSN of the request */ 3328 wqe->lpsn = wqe->psn; 3329 if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) { 3330 priv->tid_req.n_flows = remote->max_read; 3331 qpriv->tid_r_reqs++; 3332 wqe->lpsn += rvt_div_round_up_mtu(qp, wqe->length) - 1; 3333 } else { 3334 wqe->lpsn += priv->tid_req.total_segs - 1; 3335 atomic_inc(&qpriv->n_requests); 3336 } 3337 3338 priv->tid_req.cur_seg = 0; 3339 priv->tid_req.comp_seg = 0; 3340 priv->tid_req.ack_seg = 0; 3341 priv->tid_req.state = TID_REQUEST_INACTIVE; 3342 /* 3343 * Reset acked_tail. 3344 * TID RDMA READ does not have ACKs so it does not 3345 * update the pointer. We have to reset it so TID RDMA 3346 * WRITE does not get confused. 3347 */ 3348 priv->tid_req.acked_tail = priv->tid_req.setup_head; 3349 trace_hfi1_tid_req_setup_tid_wqe(qp, 1, wqe->wr.opcode, 3350 wqe->psn, wqe->lpsn, 3351 &priv->tid_req); 3352 } 3353 exit: 3354 rcu_read_unlock(); 3355 } 3356 3357 /* TID RDMA WRITE functions */ 3358 3359 u32 hfi1_build_tid_rdma_write_req(struct rvt_qp *qp, struct rvt_swqe *wqe, 3360 struct ib_other_headers *ohdr, 3361 u32 *bth1, u32 *bth2, u32 *len) 3362 { 3363 struct hfi1_qp_priv *qpriv = qp->priv; 3364 struct tid_rdma_request *req = wqe_to_tid_req(wqe); 3365 struct tid_rdma_params *remote; 3366 3367 rcu_read_lock(); 3368 remote = rcu_dereference(qpriv->tid_rdma.remote); 3369 /* 3370 * Set the number of flow to be used based on negotiated 3371 * parameters. 3372 */ 3373 req->n_flows = remote->max_write; 3374 req->state = TID_REQUEST_ACTIVE; 3375 3376 KDETH_RESET(ohdr->u.tid_rdma.w_req.kdeth0, KVER, 0x1); 3377 KDETH_RESET(ohdr->u.tid_rdma.w_req.kdeth1, JKEY, remote->jkey); 3378 ohdr->u.tid_rdma.w_req.reth.vaddr = 3379 cpu_to_be64(wqe->rdma_wr.remote_addr + (wqe->length - *len)); 3380 ohdr->u.tid_rdma.w_req.reth.rkey = 3381 cpu_to_be32(wqe->rdma_wr.rkey); 3382 ohdr->u.tid_rdma.w_req.reth.length = cpu_to_be32(*len); 3383 ohdr->u.tid_rdma.w_req.verbs_qp = cpu_to_be32(qp->remote_qpn); 3384 *bth1 &= ~RVT_QPN_MASK; 3385 *bth1 |= remote->qp; 3386 qp->s_state = TID_OP(WRITE_REQ); 3387 qp->s_flags |= HFI1_S_WAIT_TID_RESP; 3388 *bth2 |= IB_BTH_REQ_ACK; 3389 *len = 0; 3390 3391 rcu_read_unlock(); 3392 return sizeof(ohdr->u.tid_rdma.w_req) / sizeof(u32); 3393 } 3394 3395 static u32 hfi1_compute_tid_rdma_flow_wt(struct rvt_qp *qp) 3396 { 3397 /* 3398 * Heuristic for computing the RNR timeout when waiting on the flow 3399 * queue. Rather than a computationaly expensive exact estimate of when 3400 * a flow will be available, we assume that if a QP is at position N in 3401 * the flow queue it has to wait approximately (N + 1) * (number of 3402 * segments between two sync points). The rationale for this is that 3403 * flows are released and recycled at each sync point. 3404 */ 3405 return (MAX_TID_FLOW_PSN * qp->pmtu) >> TID_RDMA_SEGMENT_SHIFT; 3406 } 3407 3408 static u32 position_in_queue(struct hfi1_qp_priv *qpriv, 3409 struct tid_queue *queue) 3410 { 3411 return qpriv->tid_enqueue - queue->dequeue; 3412 } 3413 3414 /* 3415 * @qp: points to rvt_qp context. 
3416 * @to_seg: desired RNR timeout in segments. 3417 * Return: index of the next highest timeout in the ib_hfi1_rnr_table[] 3418 */ 3419 static u32 hfi1_compute_tid_rnr_timeout(struct rvt_qp *qp, u32 to_seg) 3420 { 3421 struct hfi1_qp_priv *qpriv = qp->priv; 3422 u64 timeout; 3423 u32 bytes_per_us; 3424 u8 i; 3425 3426 bytes_per_us = active_egress_rate(qpriv->rcd->ppd) / 8; 3427 timeout = (to_seg * TID_RDMA_MAX_SEGMENT_SIZE) / bytes_per_us; 3428 /* 3429 * Find the next highest value in the RNR table to the required 3430 * timeout. This gives the responder some padding. 3431 */ 3432 for (i = 1; i <= IB_AETH_CREDIT_MASK; i++) 3433 if (rvt_rnr_tbl_to_usec(i) >= timeout) 3434 return i; 3435 return 0; 3436 } 3437 3438 /** 3439 * Central place for resource allocation at TID write responder, 3440 * is called from write_req and write_data interrupt handlers as 3441 * well as the send thread when a queued QP is scheduled for 3442 * resource allocation. 3443 * 3444 * Iterates over (a) segments of a request and then (b) queued requests 3445 * themselves to allocate resources for up to local->max_write 3446 * segments across multiple requests. Stop allocating when we 3447 * hit a sync point, resume allocating after data packets at 3448 * sync point have been received. 3449 * 3450 * Resource allocation and sending of responses is decoupled. The 3451 * request/segment which are being allocated and sent are as follows. 3452 * Resources are allocated for: 3453 * [request: qpriv->r_tid_alloc, segment: req->alloc_seg] 3454 * The send thread sends: 3455 * [request: qp->s_tail_ack_queue, segment:req->cur_seg] 3456 */ 3457 static void hfi1_tid_write_alloc_resources(struct rvt_qp *qp, bool intr_ctx) 3458 { 3459 struct tid_rdma_request *req; 3460 struct hfi1_qp_priv *qpriv = qp->priv; 3461 struct hfi1_ctxtdata *rcd = qpriv->rcd; 3462 struct tid_rdma_params *local = &qpriv->tid_rdma.local; 3463 struct rvt_ack_entry *e; 3464 u32 npkts, to_seg; 3465 bool last; 3466 int ret = 0; 3467 3468 lockdep_assert_held(&qp->s_lock); 3469 3470 while (1) { 3471 trace_hfi1_rsp_tid_write_alloc_res(qp, 0); 3472 trace_hfi1_tid_write_rsp_alloc_res(qp); 3473 /* 3474 * Don't allocate more segments if a RNR NAK has already been 3475 * scheduled to avoid messing up qp->r_psn: the RNR NAK will 3476 * be sent only when all allocated segments have been sent. 3477 * However, if more segments are allocated before that, TID RDMA 3478 * WRITE RESP packets will be sent out for these new segments 3479 * before the RNR NAK packet. When the requester receives the 3480 * RNR NAK packet, it will restart with qp->s_last_psn + 1, 3481 * which does not match qp->r_psn and will be dropped. 3482 * Consequently, the requester will exhaust its retries and 3483 * put the qp into error state. 
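* So, while an RNR NAK is pending (TID_RNR_NAK_SEND), stop allocating and simply break out below.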
3484 */ 3485 if (qpriv->rnr_nak_state == TID_RNR_NAK_SEND) 3486 break; 3487 3488 /* No requests left to process */ 3489 if (qpriv->r_tid_alloc == qpriv->r_tid_head) { 3490 /* If all data has been received, clear the flow */ 3491 if (qpriv->flow_state.index < RXE_NUM_TID_FLOWS && 3492 !qpriv->alloc_w_segs) { 3493 hfi1_kern_clear_hw_flow(rcd, qp); 3494 qpriv->s_flags &= ~HFI1_R_TID_SW_PSN; 3495 } 3496 break; 3497 } 3498 3499 e = &qp->s_ack_queue[qpriv->r_tid_alloc]; 3500 if (e->opcode != TID_OP(WRITE_REQ)) 3501 goto next_req; 3502 req = ack_to_tid_req(e); 3503 trace_hfi1_tid_req_write_alloc_res(qp, 0, e->opcode, e->psn, 3504 e->lpsn, req); 3505 /* Finished allocating for all segments of this request */ 3506 if (req->alloc_seg >= req->total_segs) 3507 goto next_req; 3508 3509 /* Can allocate only a maximum of local->max_write for a QP */ 3510 if (qpriv->alloc_w_segs >= local->max_write) 3511 break; 3512 3513 /* Don't allocate at a sync point with data packets pending */ 3514 if (qpriv->sync_pt && qpriv->alloc_w_segs) 3515 break; 3516 3517 /* All data received at the sync point, continue */ 3518 if (qpriv->sync_pt && !qpriv->alloc_w_segs) { 3519 hfi1_kern_clear_hw_flow(rcd, qp); 3520 qpriv->sync_pt = false; 3521 qpriv->s_flags &= ~HFI1_R_TID_SW_PSN; 3522 } 3523 3524 /* Allocate flow if we don't have one */ 3525 if (qpriv->flow_state.index >= RXE_NUM_TID_FLOWS) { 3526 ret = hfi1_kern_setup_hw_flow(qpriv->rcd, qp); 3527 if (ret) { 3528 to_seg = hfi1_compute_tid_rdma_flow_wt(qp) * 3529 position_in_queue(qpriv, 3530 &rcd->flow_queue); 3531 break; 3532 } 3533 } 3534 3535 npkts = rvt_div_round_up_mtu(qp, req->seg_len); 3536 3537 /* 3538 * We are at a sync point if we run out of KDETH PSN space. 3539 * Last PSN of every generation is reserved for RESYNC. 3540 */ 3541 if (qpriv->flow_state.psn + npkts > MAX_TID_FLOW_PSN - 1) { 3542 qpriv->sync_pt = true; 3543 break; 3544 } 3545 3546 /* 3547 * If overtaking req->acked_tail, send an RNR NAK. 
Because the 3548 * QP is not queued in this case, and the issue can only be 3549 * caused by a delay in scheduling the second leg which we 3550 * cannot estimate, we use a rather arbitrary RNR timeout of 3551 * (MAX_FLOWS / 2) segments 3552 */ 3553 if (!CIRC_SPACE(req->setup_head, req->acked_tail, 3554 MAX_FLOWS)) { 3555 ret = -EAGAIN; 3556 to_seg = MAX_FLOWS >> 1; 3557 tid_rdma_trigger_ack(qp); 3558 break; 3559 } 3560 3561 /* Try to allocate rcv array / TID entries */ 3562 ret = hfi1_kern_exp_rcv_setup(req, &req->ss, &last); 3563 if (ret == -EAGAIN) 3564 to_seg = position_in_queue(qpriv, &rcd->rarr_queue); 3565 if (ret) 3566 break; 3567 3568 qpriv->alloc_w_segs++; 3569 req->alloc_seg++; 3570 continue; 3571 next_req: 3572 /* Begin processing the next request */ 3573 if (++qpriv->r_tid_alloc > 3574 rvt_size_atomic(ib_to_rvt(qp->ibqp.device))) 3575 qpriv->r_tid_alloc = 0; 3576 } 3577 3578 /* 3579 * Schedule an RNR NAK to be sent if (a) flow or rcv array allocation 3580 * has failed (b) we are called from the rcv handler interrupt context 3581 * (c) an RNR NAK has not already been scheduled 3582 */ 3583 if (ret == -EAGAIN && intr_ctx && !qp->r_nak_state) 3584 goto send_rnr_nak; 3585 3586 return; 3587 3588 send_rnr_nak: 3589 lockdep_assert_held(&qp->r_lock); 3590 3591 /* Set r_nak_state to prevent unrelated events from generating NAK's */ 3592 qp->r_nak_state = hfi1_compute_tid_rnr_timeout(qp, to_seg) | IB_RNR_NAK; 3593 3594 /* Pull back r_psn to the segment being RNR NAK'd */ 3595 qp->r_psn = e->psn + req->alloc_seg; 3596 qp->r_ack_psn = qp->r_psn; 3597 /* 3598 * Pull back r_head_ack_queue to the ack entry following the request 3599 * being RNR NAK'd. This allows resources to be allocated to the request 3600 * if the queued QP is scheduled. 3601 */ 3602 qp->r_head_ack_queue = qpriv->r_tid_alloc + 1; 3603 if (qp->r_head_ack_queue > rvt_size_atomic(ib_to_rvt(qp->ibqp.device))) 3604 qp->r_head_ack_queue = 0; 3605 qpriv->r_tid_head = qp->r_head_ack_queue; 3606 /* 3607 * These send side fields are used in make_rc_ack(). They are set in 3608 * hfi1_send_rc_ack() but must be set here before dropping qp->s_lock 3609 * for consistency 3610 */ 3611 qp->s_nak_state = qp->r_nak_state; 3612 qp->s_ack_psn = qp->r_ack_psn; 3613 /* 3614 * Clear the ACK PENDING flag to prevent unwanted ACK because we 3615 * have modified qp->s_ack_psn here. 3616 */ 3617 qp->s_flags &= ~(RVT_S_ACK_PENDING); 3618 3619 trace_hfi1_rsp_tid_write_alloc_res(qp, qp->r_psn); 3620 /* 3621 * qpriv->rnr_nak_state is used to determine when the scheduled RNR NAK 3622 * has actually been sent. qp->s_flags RVT_S_ACK_PENDING bit cannot be 3623 * used for this because qp->s_lock is dropped before calling 3624 * hfi1_send_rc_ack() leading to inconsistency between the receive 3625 * interrupt handlers and the send thread in make_rc_ack() 3626 */ 3627 qpriv->rnr_nak_state = TID_RNR_NAK_SEND; 3628 3629 /* 3630 * Schedule RNR NAK to be sent. RNR NAK's are scheduled from the receive 3631 * interrupt handlers but will be sent from the send engine behind any 3632 * previous responses that may have been scheduled 3633 */ 3634 rc_defered_ack(rcd, qp); 3635 } 3636 3637 void hfi1_rc_rcv_tid_rdma_write_req(struct hfi1_packet *packet) 3638 { 3639 /* HANDLER FOR TID RDMA WRITE REQUEST packet (Responder side)*/ 3640 3641 /* 3642 * 1. Verify TID RDMA WRITE REQ as per IB_OPCODE_RC_RDMA_WRITE_FIRST 3643 * (see hfi1_rc_rcv()) 3644 * - Don't allow 0-length requests. 3645 * 2. 
Put TID RDMA WRITE REQ into the response queue (s_ack_queue) 3646 * - Setup struct tid_rdma_req with request info 3647 * - Prepare struct tid_rdma_flow array? 3648 * 3. Set the qp->s_ack_state as per the state diagram in the design doc. 3649 * 4. Set RVT_S_RESP_PENDING in s_flags. 3650 * 5. Kick the send engine (hfi1_schedule_send()) 3651 */ 3652 struct hfi1_ctxtdata *rcd = packet->rcd; 3653 struct rvt_qp *qp = packet->qp; 3654 struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); 3655 struct ib_other_headers *ohdr = packet->ohdr; 3656 struct rvt_ack_entry *e; 3657 unsigned long flags; 3658 struct ib_reth *reth; 3659 struct hfi1_qp_priv *qpriv = qp->priv; 3660 struct tid_rdma_request *req; 3661 u32 bth0, psn, len, rkey, num_segs; 3662 bool fecn; 3663 u8 next; 3664 u64 vaddr; 3665 int diff; 3666 3667 bth0 = be32_to_cpu(ohdr->bth[0]); 3668 if (hfi1_ruc_check_hdr(ibp, packet)) 3669 return; 3670 3671 fecn = process_ecn(qp, packet); 3672 psn = mask_psn(be32_to_cpu(ohdr->bth[2])); 3673 trace_hfi1_rsp_rcv_tid_write_req(qp, psn); 3674 3675 if (qp->state == IB_QPS_RTR && !(qp->r_flags & RVT_R_COMM_EST)) 3676 rvt_comm_est(qp); 3677 3678 if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE))) 3679 goto nack_inv; 3680 3681 reth = &ohdr->u.tid_rdma.w_req.reth; 3682 vaddr = be64_to_cpu(reth->vaddr); 3683 len = be32_to_cpu(reth->length); 3684 3685 num_segs = DIV_ROUND_UP(len, qpriv->tid_rdma.local.max_len); 3686 diff = delta_psn(psn, qp->r_psn); 3687 if (unlikely(diff)) { 3688 tid_rdma_rcv_err(packet, ohdr, qp, psn, diff, fecn); 3689 return; 3690 } 3691 3692 /* 3693 * The resent request which was previously RNR NAK'd is inserted at the 3694 * location of the original request, which is one entry behind 3695 * r_head_ack_queue 3696 */ 3697 if (qpriv->rnr_nak_state) 3698 qp->r_head_ack_queue = qp->r_head_ack_queue ? 3699 qp->r_head_ack_queue - 1 : 3700 rvt_size_atomic(ib_to_rvt(qp->ibqp.device)); 3701 3702 /* We've verified the request, insert it into the ack queue.
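 * The ack queue has rvt_size_atomic() + 1 entries. If advancing the head
 * would run into s_acked_ack_queue, the colliding entry can only be
 * recycled via update_ack_queue() once it has been sent; otherwise the
 * request is NAK'd as invalid.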
*/ 3703 next = qp->r_head_ack_queue + 1; 3704 if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device))) 3705 next = 0; 3706 spin_lock_irqsave(&qp->s_lock, flags); 3707 if (unlikely(next == qp->s_acked_ack_queue)) { 3708 if (!qp->s_ack_queue[next].sent) 3709 goto nack_inv_unlock; 3710 update_ack_queue(qp, next); 3711 } 3712 e = &qp->s_ack_queue[qp->r_head_ack_queue]; 3713 req = ack_to_tid_req(e); 3714 3715 /* Bring previously RNR NAK'd request back to life */ 3716 if (qpriv->rnr_nak_state) { 3717 qp->r_nak_state = 0; 3718 qp->s_nak_state = 0; 3719 qpriv->rnr_nak_state = TID_RNR_NAK_INIT; 3720 qp->r_psn = e->lpsn + 1; 3721 req->state = TID_REQUEST_INIT; 3722 goto update_head; 3723 } 3724 3725 release_rdma_sge_mr(e); 3726 3727 /* The length needs to be in multiples of PAGE_SIZE */ 3728 if (!len || len & ~PAGE_MASK) 3729 goto nack_inv_unlock; 3730 3731 rkey = be32_to_cpu(reth->rkey); 3732 qp->r_len = len; 3733 3734 if (e->opcode == TID_OP(WRITE_REQ) && 3735 (req->setup_head != req->clear_tail || 3736 req->clear_tail != req->acked_tail)) 3737 goto nack_inv_unlock; 3738 3739 if (unlikely(!rvt_rkey_ok(qp, &e->rdma_sge, qp->r_len, vaddr, 3740 rkey, IB_ACCESS_REMOTE_WRITE))) 3741 goto nack_acc; 3742 3743 qp->r_psn += num_segs - 1; 3744 3745 e->opcode = (bth0 >> 24) & 0xff; 3746 e->psn = psn; 3747 e->lpsn = qp->r_psn; 3748 e->sent = 0; 3749 3750 req->n_flows = min_t(u16, num_segs, qpriv->tid_rdma.local.max_write); 3751 req->state = TID_REQUEST_INIT; 3752 req->cur_seg = 0; 3753 req->comp_seg = 0; 3754 req->ack_seg = 0; 3755 req->alloc_seg = 0; 3756 req->isge = 0; 3757 req->seg_len = qpriv->tid_rdma.local.max_len; 3758 req->total_len = len; 3759 req->total_segs = num_segs; 3760 req->r_flow_psn = e->psn; 3761 req->ss.sge = e->rdma_sge; 3762 req->ss.num_sge = 1; 3763 3764 req->flow_idx = req->setup_head; 3765 req->clear_tail = req->setup_head; 3766 req->acked_tail = req->setup_head; 3767 3768 qp->r_state = e->opcode; 3769 qp->r_nak_state = 0; 3770 /* 3771 * We need to increment the MSN here instead of when we 3772 * finish sending the result since a duplicate request would 3773 * increment it more than once. 3774 */ 3775 qp->r_msn++; 3776 qp->r_psn++; 3777 3778 trace_hfi1_tid_req_rcv_write_req(qp, 0, e->opcode, e->psn, e->lpsn, 3779 req); 3780 3781 if (qpriv->r_tid_tail == HFI1_QP_WQE_INVALID) { 3782 qpriv->r_tid_tail = qp->r_head_ack_queue; 3783 } else if (qpriv->r_tid_tail == qpriv->r_tid_head) { 3784 struct tid_rdma_request *ptr; 3785 3786 e = &qp->s_ack_queue[qpriv->r_tid_tail]; 3787 ptr = ack_to_tid_req(e); 3788 3789 if (e->opcode != TID_OP(WRITE_REQ) || 3790 ptr->comp_seg == ptr->total_segs) { 3791 if (qpriv->r_tid_tail == qpriv->r_tid_ack) 3792 qpriv->r_tid_ack = qp->r_head_ack_queue; 3793 qpriv->r_tid_tail = qp->r_head_ack_queue; 3794 } 3795 } 3796 update_head: 3797 qp->r_head_ack_queue = next; 3798 qpriv->r_tid_head = qp->r_head_ack_queue; 3799 3800 hfi1_tid_write_alloc_resources(qp, true); 3801 trace_hfi1_tid_write_rsp_rcv_req(qp); 3802 3803 /* Schedule the send tasklet. 
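 * RVT_S_RESP_PENDING makes the send engine generate the queued TID RDMA
 * WRITE RESP; RVT_S_ECN records that the request arrived with FECN set.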
*/ 3804 qp->s_flags |= RVT_S_RESP_PENDING; 3805 if (fecn) 3806 qp->s_flags |= RVT_S_ECN; 3807 hfi1_schedule_send(qp); 3808 3809 spin_unlock_irqrestore(&qp->s_lock, flags); 3810 return; 3811 3812 nack_inv_unlock: 3813 spin_unlock_irqrestore(&qp->s_lock, flags); 3814 nack_inv: 3815 rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR); 3816 qp->r_nak_state = IB_NAK_INVALID_REQUEST; 3817 qp->r_ack_psn = qp->r_psn; 3818 /* Queue NAK for later */ 3819 rc_defered_ack(rcd, qp); 3820 return; 3821 nack_acc: 3822 spin_unlock_irqrestore(&qp->s_lock, flags); 3823 rvt_rc_error(qp, IB_WC_LOC_PROT_ERR); 3824 qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR; 3825 qp->r_ack_psn = qp->r_psn; 3826 } 3827 3828 u32 hfi1_build_tid_rdma_write_resp(struct rvt_qp *qp, struct rvt_ack_entry *e, 3829 struct ib_other_headers *ohdr, u32 *bth1, 3830 u32 bth2, u32 *len, 3831 struct rvt_sge_state **ss) 3832 { 3833 struct hfi1_ack_priv *epriv = e->priv; 3834 struct tid_rdma_request *req = &epriv->tid_req; 3835 struct hfi1_qp_priv *qpriv = qp->priv; 3836 struct tid_rdma_flow *flow = NULL; 3837 u32 resp_len = 0, hdwords = 0; 3838 void *resp_addr = NULL; 3839 struct tid_rdma_params *remote; 3840 3841 trace_hfi1_tid_req_build_write_resp(qp, 0, e->opcode, e->psn, e->lpsn, 3842 req); 3843 trace_hfi1_tid_write_rsp_build_resp(qp); 3844 trace_hfi1_rsp_build_tid_write_resp(qp, bth2); 3845 flow = &req->flows[req->flow_idx]; 3846 switch (req->state) { 3847 default: 3848 /* 3849 * Try to allocate resources here in case QP was queued and was 3850 * later scheduled when resources became available 3851 */ 3852 hfi1_tid_write_alloc_resources(qp, false); 3853 3854 /* We've already sent everything which is ready */ 3855 if (req->cur_seg >= req->alloc_seg) 3856 goto done; 3857 3858 /* 3859 * Resources can be assigned but responses cannot be sent in 3860 * rnr_nak state, till the resent request is received 3861 */ 3862 if (qpriv->rnr_nak_state == TID_RNR_NAK_SENT) 3863 goto done; 3864 3865 req->state = TID_REQUEST_ACTIVE; 3866 trace_hfi1_tid_flow_build_write_resp(qp, req->flow_idx, flow); 3867 req->flow_idx = CIRC_NEXT(req->flow_idx, MAX_FLOWS); 3868 hfi1_add_tid_reap_timer(qp); 3869 break; 3870 3871 case TID_REQUEST_RESEND_ACTIVE: 3872 case TID_REQUEST_RESEND: 3873 trace_hfi1_tid_flow_build_write_resp(qp, req->flow_idx, flow); 3874 req->flow_idx = CIRC_NEXT(req->flow_idx, MAX_FLOWS); 3875 if (!CIRC_CNT(req->setup_head, req->flow_idx, MAX_FLOWS)) 3876 req->state = TID_REQUEST_ACTIVE; 3877 3878 hfi1_mod_tid_reap_timer(qp); 3879 break; 3880 } 3881 flow->flow_state.resp_ib_psn = bth2; 3882 resp_addr = (void *)flow->tid_entry; 3883 resp_len = sizeof(*flow->tid_entry) * flow->tidcnt; 3884 req->cur_seg++; 3885 3886 memset(&ohdr->u.tid_rdma.w_rsp, 0, sizeof(ohdr->u.tid_rdma.w_rsp)); 3887 epriv->ss.sge.vaddr = resp_addr; 3888 epriv->ss.sge.sge_length = resp_len; 3889 epriv->ss.sge.length = epriv->ss.sge.sge_length; 3890 /* 3891 * We can safely zero these out. Since the first SGE covers the 3892 * entire packet, nothing else should even look at the MR. 
3893 */ 3894 epriv->ss.sge.mr = NULL; 3895 epriv->ss.sge.m = 0; 3896 epriv->ss.sge.n = 0; 3897 3898 epriv->ss.sg_list = NULL; 3899 epriv->ss.total_len = epriv->ss.sge.sge_length; 3900 epriv->ss.num_sge = 1; 3901 3902 *ss = &epriv->ss; 3903 *len = epriv->ss.total_len; 3904 3905 /* Construct the TID RDMA WRITE RESP packet header */ 3906 rcu_read_lock(); 3907 remote = rcu_dereference(qpriv->tid_rdma.remote); 3908 3909 KDETH_RESET(ohdr->u.tid_rdma.w_rsp.kdeth0, KVER, 0x1); 3910 KDETH_RESET(ohdr->u.tid_rdma.w_rsp.kdeth1, JKEY, remote->jkey); 3911 ohdr->u.tid_rdma.w_rsp.aeth = rvt_compute_aeth(qp); 3912 ohdr->u.tid_rdma.w_rsp.tid_flow_psn = 3913 cpu_to_be32((flow->flow_state.generation << 3914 HFI1_KDETH_BTH_SEQ_SHIFT) | 3915 (flow->flow_state.spsn & 3916 HFI1_KDETH_BTH_SEQ_MASK)); 3917 ohdr->u.tid_rdma.w_rsp.tid_flow_qp = 3918 cpu_to_be32(qpriv->tid_rdma.local.qp | 3919 ((flow->idx & TID_RDMA_DESTQP_FLOW_MASK) << 3920 TID_RDMA_DESTQP_FLOW_SHIFT) | 3921 qpriv->rcd->ctxt); 3922 ohdr->u.tid_rdma.w_rsp.verbs_qp = cpu_to_be32(qp->remote_qpn); 3923 *bth1 = remote->qp; 3924 rcu_read_unlock(); 3925 hdwords = sizeof(ohdr->u.tid_rdma.w_rsp) / sizeof(u32); 3926 qpriv->pending_tid_w_segs++; 3927 done: 3928 return hdwords; 3929 } 3930 3931 static void hfi1_add_tid_reap_timer(struct rvt_qp *qp) 3932 { 3933 struct hfi1_qp_priv *qpriv = qp->priv; 3934 3935 lockdep_assert_held(&qp->s_lock); 3936 if (!(qpriv->s_flags & HFI1_R_TID_RSC_TIMER)) { 3937 qpriv->s_flags |= HFI1_R_TID_RSC_TIMER; 3938 qpriv->s_tid_timer.expires = jiffies + 3939 qpriv->tid_timer_timeout_jiffies; 3940 add_timer(&qpriv->s_tid_timer); 3941 } 3942 } 3943 3944 static void hfi1_mod_tid_reap_timer(struct rvt_qp *qp) 3945 { 3946 struct hfi1_qp_priv *qpriv = qp->priv; 3947 3948 lockdep_assert_held(&qp->s_lock); 3949 qpriv->s_flags |= HFI1_R_TID_RSC_TIMER; 3950 mod_timer(&qpriv->s_tid_timer, jiffies + 3951 qpriv->tid_timer_timeout_jiffies); 3952 } 3953 3954 static int hfi1_stop_tid_reap_timer(struct rvt_qp *qp) 3955 { 3956 struct hfi1_qp_priv *qpriv = qp->priv; 3957 int rval = 0; 3958 3959 lockdep_assert_held(&qp->s_lock); 3960 if (qpriv->s_flags & HFI1_R_TID_RSC_TIMER) { 3961 rval = del_timer(&qpriv->s_tid_timer); 3962 qpriv->s_flags &= ~HFI1_R_TID_RSC_TIMER; 3963 } 3964 return rval; 3965 } 3966 3967 void hfi1_del_tid_reap_timer(struct rvt_qp *qp) 3968 { 3969 struct hfi1_qp_priv *qpriv = qp->priv; 3970 3971 del_timer_sync(&qpriv->s_tid_timer); 3972 qpriv->s_flags &= ~HFI1_R_TID_RSC_TIMER; 3973 } 3974 3975 static void hfi1_tid_timeout(struct timer_list *t) 3976 { 3977 struct hfi1_qp_priv *qpriv = from_timer(qpriv, t, s_tid_timer); 3978 struct rvt_qp *qp = qpriv->owner; 3979 struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device); 3980 unsigned long flags; 3981 u32 i; 3982 3983 spin_lock_irqsave(&qp->r_lock, flags); 3984 spin_lock(&qp->s_lock); 3985 if (qpriv->s_flags & HFI1_R_TID_RSC_TIMER) { 3986 dd_dev_warn(dd_from_ibdev(qp->ibqp.device), "[QP%u] %s %d\n", 3987 qp->ibqp.qp_num, __func__, __LINE__); 3988 trace_hfi1_msg_tid_timeout(/* msg */ 3989 qp, "resource timeout = ", 3990 (u64)qpriv->tid_timer_timeout_jiffies); 3991 hfi1_stop_tid_reap_timer(qp); 3992 /* 3993 * Go though the entire ack queue and clear any outstanding 3994 * HW flow and RcvArray resources. 
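 * Once the resources are reclaimed, the QP is moved to the error state:
 * an IB_EVENT_QP_FATAL event is raised and outstanding requests are
 * flushed with IB_WC_RESP_TIMEOUT_ERR.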
3995 */ 3996 hfi1_kern_clear_hw_flow(qpriv->rcd, qp); 3997 for (i = 0; i < rvt_max_atomic(rdi); i++) { 3998 struct tid_rdma_request *req = 3999 ack_to_tid_req(&qp->s_ack_queue[i]); 4000 4001 hfi1_kern_exp_rcv_clear_all(req); 4002 } 4003 spin_unlock(&qp->s_lock); 4004 if (qp->ibqp.event_handler) { 4005 struct ib_event ev; 4006 4007 ev.device = qp->ibqp.device; 4008 ev.element.qp = &qp->ibqp; 4009 ev.event = IB_EVENT_QP_FATAL; 4010 qp->ibqp.event_handler(&ev, qp->ibqp.qp_context); 4011 } 4012 rvt_rc_error(qp, IB_WC_RESP_TIMEOUT_ERR); 4013 goto unlock_r_lock; 4014 } 4015 spin_unlock(&qp->s_lock); 4016 unlock_r_lock: 4017 spin_unlock_irqrestore(&qp->r_lock, flags); 4018 } 4019 4020 void hfi1_rc_rcv_tid_rdma_write_resp(struct hfi1_packet *packet) 4021 { 4022 /* HANDLER FOR TID RDMA WRITE RESPONSE packet (Requestor side */ 4023 4024 /* 4025 * 1. Find matching SWQE 4026 * 2. Check that TIDENTRY array has enough space for a complete 4027 * segment. If not, put QP in error state. 4028 * 3. Save response data in struct tid_rdma_req and struct tid_rdma_flow 4029 * 4. Remove HFI1_S_WAIT_TID_RESP from s_flags. 4030 * 5. Set qp->s_state 4031 * 6. Kick the send engine (hfi1_schedule_send()) 4032 */ 4033 struct ib_other_headers *ohdr = packet->ohdr; 4034 struct rvt_qp *qp = packet->qp; 4035 struct hfi1_qp_priv *qpriv = qp->priv; 4036 struct hfi1_ctxtdata *rcd = packet->rcd; 4037 struct rvt_swqe *wqe; 4038 struct tid_rdma_request *req; 4039 struct tid_rdma_flow *flow; 4040 enum ib_wc_status status; 4041 u32 opcode, aeth, psn, flow_psn, i, tidlen = 0, pktlen; 4042 bool fecn; 4043 unsigned long flags; 4044 4045 fecn = process_ecn(qp, packet); 4046 psn = mask_psn(be32_to_cpu(ohdr->bth[2])); 4047 aeth = be32_to_cpu(ohdr->u.tid_rdma.w_rsp.aeth); 4048 opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff; 4049 4050 spin_lock_irqsave(&qp->s_lock, flags); 4051 4052 /* Ignore invalid responses */ 4053 if (cmp_psn(psn, qp->s_next_psn) >= 0) 4054 goto ack_done; 4055 4056 /* Ignore duplicate responses. */ 4057 if (unlikely(cmp_psn(psn, qp->s_last_psn) <= 0)) 4058 goto ack_done; 4059 4060 if (unlikely(qp->s_acked == qp->s_tail)) 4061 goto ack_done; 4062 4063 /* 4064 * If we are waiting for a particular packet sequence number 4065 * due to a request being resent, check for it. Otherwise, 4066 * ensure that we haven't missed anything. 4067 */ 4068 if (qp->r_flags & RVT_R_RDMAR_SEQ) { 4069 if (cmp_psn(psn, qp->s_last_psn + 1) != 0) 4070 goto ack_done; 4071 qp->r_flags &= ~RVT_R_RDMAR_SEQ; 4072 } 4073 4074 wqe = rvt_get_swqe_ptr(qp, qpriv->s_tid_cur); 4075 if (unlikely(wqe->wr.opcode != IB_WR_TID_RDMA_WRITE)) 4076 goto ack_op_err; 4077 4078 req = wqe_to_tid_req(wqe); 4079 /* 4080 * If we've lost ACKs and our acked_tail pointer is too far 4081 * behind, don't overwrite segments. Just drop the packet and 4082 * let the reliability protocol take care of it. 4083 */ 4084 if (!CIRC_SPACE(req->setup_head, req->acked_tail, MAX_FLOWS)) 4085 goto ack_done; 4086 4087 /* 4088 * The call to do_rc_ack() should be last in the chain of 4089 * packet checks because it will end up updating the QP state. 4090 * Therefore, anything that would prevent the packet from 4091 * being accepted as a successful response should be prior 4092 * to it. 
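 * A zero return from do_rc_ack() means this packet must not be treated
 * as a successful response, so processing stops here.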
4093 */ 4094 if (!do_rc_ack(qp, aeth, psn, opcode, 0, rcd)) 4095 goto ack_done; 4096 4097 trace_hfi1_ack(qp, psn); 4098 4099 flow = &req->flows[req->setup_head]; 4100 flow->pkt = 0; 4101 flow->tid_idx = 0; 4102 flow->tid_offset = 0; 4103 flow->sent = 0; 4104 flow->resync_npkts = 0; 4105 flow->tid_qpn = be32_to_cpu(ohdr->u.tid_rdma.w_rsp.tid_flow_qp); 4106 flow->idx = (flow->tid_qpn >> TID_RDMA_DESTQP_FLOW_SHIFT) & 4107 TID_RDMA_DESTQP_FLOW_MASK; 4108 flow_psn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.w_rsp.tid_flow_psn)); 4109 flow->flow_state.generation = flow_psn >> HFI1_KDETH_BTH_SEQ_SHIFT; 4110 flow->flow_state.spsn = flow_psn & HFI1_KDETH_BTH_SEQ_MASK; 4111 flow->flow_state.resp_ib_psn = psn; 4112 flow->length = min_t(u32, req->seg_len, 4113 (wqe->length - (req->comp_seg * req->seg_len))); 4114 4115 flow->npkts = rvt_div_round_up_mtu(qp, flow->length); 4116 flow->flow_state.lpsn = flow->flow_state.spsn + 4117 flow->npkts - 1; 4118 /* payload length = packet length - (header length + ICRC length) */ 4119 pktlen = packet->tlen - (packet->hlen + 4); 4120 if (pktlen > sizeof(flow->tid_entry)) { 4121 status = IB_WC_LOC_LEN_ERR; 4122 goto ack_err; 4123 } 4124 memcpy(flow->tid_entry, packet->ebuf, pktlen); 4125 flow->tidcnt = pktlen / sizeof(*flow->tid_entry); 4126 trace_hfi1_tid_flow_rcv_write_resp(qp, req->setup_head, flow); 4127 4128 req->comp_seg++; 4129 trace_hfi1_tid_write_sender_rcv_resp(qp, 0); 4130 /* 4131 * Walk the TID_ENTRY list to make sure we have enough space for a 4132 * complete segment. 4133 */ 4134 for (i = 0; i < flow->tidcnt; i++) { 4135 trace_hfi1_tid_entry_rcv_write_resp(/* entry */ 4136 qp, i, flow->tid_entry[i]); 4137 if (!EXP_TID_GET(flow->tid_entry[i], LEN)) { 4138 status = IB_WC_LOC_LEN_ERR; 4139 goto ack_err; 4140 } 4141 tidlen += EXP_TID_GET(flow->tid_entry[i], LEN); 4142 } 4143 if (tidlen * PAGE_SIZE < flow->length) { 4144 status = IB_WC_LOC_LEN_ERR; 4145 goto ack_err; 4146 } 4147 4148 trace_hfi1_tid_req_rcv_write_resp(qp, 0, wqe->wr.opcode, wqe->psn, 4149 wqe->lpsn, req); 4150 /* 4151 * If this is the first response for this request, set the initial 4152 * flow index to the current flow. 4153 */ 4154 if (!cmp_psn(psn, wqe->psn)) { 4155 req->r_last_acked = mask_psn(wqe->psn - 1); 4156 /* Set acked flow index to head index */ 4157 req->acked_tail = req->setup_head; 4158 } 4159 4160 /* advance circular buffer head */ 4161 req->setup_head = CIRC_NEXT(req->setup_head, MAX_FLOWS); 4162 req->state = TID_REQUEST_ACTIVE; 4163 4164 /* 4165 * If all responses for this TID RDMA WRITE request have been received 4166 * advance the pointer to the next one. 4167 * Since TID RDMA requests could be mixed in with regular IB requests, 4168 * they might not appear sequentially in the queue. Therefore, the 4169 * next request needs to be "found". 
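 * The search below starts at s_tid_cur + 1, wraps at qp->s_size, and
 * stops at either s_tid_head or the next TID RDMA WRITE WQE; s_tid_cur
 * is then advanced to that slot.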
4170 */ 4171 if (qpriv->s_tid_cur != qpriv->s_tid_head && 4172 req->comp_seg == req->total_segs) { 4173 for (i = qpriv->s_tid_cur + 1; ; i++) { 4174 if (i == qp->s_size) 4175 i = 0; 4176 wqe = rvt_get_swqe_ptr(qp, i); 4177 if (i == qpriv->s_tid_head) 4178 break; 4179 if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE) 4180 break; 4181 } 4182 qpriv->s_tid_cur = i; 4183 } 4184 qp->s_flags &= ~HFI1_S_WAIT_TID_RESP; 4185 hfi1_schedule_tid_send(qp); 4186 goto ack_done; 4187 4188 ack_op_err: 4189 status = IB_WC_LOC_QP_OP_ERR; 4190 ack_err: 4191 rvt_error_qp(qp, status); 4192 ack_done: 4193 if (fecn) 4194 qp->s_flags |= RVT_S_ECN; 4195 spin_unlock_irqrestore(&qp->s_lock, flags); 4196 } 4197 4198 bool hfi1_build_tid_rdma_packet(struct rvt_swqe *wqe, 4199 struct ib_other_headers *ohdr, 4200 u32 *bth1, u32 *bth2, u32 *len) 4201 { 4202 struct tid_rdma_request *req = wqe_to_tid_req(wqe); 4203 struct tid_rdma_flow *flow = &req->flows[req->clear_tail]; 4204 struct tid_rdma_params *remote; 4205 struct rvt_qp *qp = req->qp; 4206 struct hfi1_qp_priv *qpriv = qp->priv; 4207 u32 tidentry = flow->tid_entry[flow->tid_idx]; 4208 u32 tidlen = EXP_TID_GET(tidentry, LEN) << PAGE_SHIFT; 4209 struct tid_rdma_write_data *wd = &ohdr->u.tid_rdma.w_data; 4210 u32 next_offset, om = KDETH_OM_LARGE; 4211 bool last_pkt; 4212 4213 if (!tidlen) { 4214 hfi1_trdma_send_complete(qp, wqe, IB_WC_REM_INV_RD_REQ_ERR); 4215 rvt_error_qp(qp, IB_WC_REM_INV_RD_REQ_ERR); 4216 } 4217 4218 *len = min_t(u32, qp->pmtu, tidlen - flow->tid_offset); 4219 flow->sent += *len; 4220 next_offset = flow->tid_offset + *len; 4221 last_pkt = (flow->tid_idx == (flow->tidcnt - 1) && 4222 next_offset >= tidlen) || (flow->sent >= flow->length); 4223 trace_hfi1_tid_entry_build_write_data(qp, flow->tid_idx, tidentry); 4224 trace_hfi1_tid_flow_build_write_data(qp, req->clear_tail, flow); 4225 4226 rcu_read_lock(); 4227 remote = rcu_dereference(qpriv->tid_rdma.remote); 4228 KDETH_RESET(wd->kdeth0, KVER, 0x1); 4229 KDETH_SET(wd->kdeth0, SH, !last_pkt); 4230 KDETH_SET(wd->kdeth0, INTR, !!(!last_pkt && remote->urg)); 4231 KDETH_SET(wd->kdeth0, TIDCTRL, EXP_TID_GET(tidentry, CTRL)); 4232 KDETH_SET(wd->kdeth0, TID, EXP_TID_GET(tidentry, IDX)); 4233 KDETH_SET(wd->kdeth0, OM, om == KDETH_OM_LARGE); 4234 KDETH_SET(wd->kdeth0, OFFSET, flow->tid_offset / om); 4235 KDETH_RESET(wd->kdeth1, JKEY, remote->jkey); 4236 wd->verbs_qp = cpu_to_be32(qp->remote_qpn); 4237 rcu_read_unlock(); 4238 4239 *bth1 = flow->tid_qpn; 4240 *bth2 = mask_psn(((flow->flow_state.spsn + flow->pkt++) & 4241 HFI1_KDETH_BTH_SEQ_MASK) | 4242 (flow->flow_state.generation << 4243 HFI1_KDETH_BTH_SEQ_SHIFT)); 4244 if (last_pkt) { 4245 /* PSNs are zero-based, so +1 to count number of packets */ 4246 if (flow->flow_state.lpsn + 1 + 4247 rvt_div_round_up_mtu(qp, req->seg_len) > 4248 MAX_TID_FLOW_PSN) 4249 req->state = TID_REQUEST_SYNC; 4250 *bth2 |= IB_BTH_REQ_ACK; 4251 } 4252 4253 if (next_offset >= tidlen) { 4254 flow->tid_offset = 0; 4255 flow->tid_idx++; 4256 } else { 4257 flow->tid_offset = next_offset; 4258 } 4259 return last_pkt; 4260 } 4261 4262 void hfi1_rc_rcv_tid_rdma_write_data(struct hfi1_packet *packet) 4263 { 4264 struct rvt_qp *qp = packet->qp; 4265 struct hfi1_qp_priv *priv = qp->priv; 4266 struct hfi1_ctxtdata *rcd = priv->rcd; 4267 struct ib_other_headers *ohdr = packet->ohdr; 4268 struct rvt_ack_entry *e; 4269 struct tid_rdma_request *req; 4270 struct tid_rdma_flow *flow; 4271 struct hfi1_ibdev *dev = to_idev(qp->ibqp.device); 4272 unsigned long flags; 4273 u32 psn, next; 4274 u8 opcode; 4275 bool 
fecn; 4276 4277 fecn = process_ecn(qp, packet); 4278 psn = mask_psn(be32_to_cpu(ohdr->bth[2])); 4279 opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff; 4280 4281 /* 4282 * All error handling should be done by now. If we are here, the packet 4283 * is either good or been accepted by the error handler. 4284 */ 4285 spin_lock_irqsave(&qp->s_lock, flags); 4286 e = &qp->s_ack_queue[priv->r_tid_tail]; 4287 req = ack_to_tid_req(e); 4288 flow = &req->flows[req->clear_tail]; 4289 if (cmp_psn(psn, full_flow_psn(flow, flow->flow_state.lpsn))) { 4290 update_r_next_psn_fecn(packet, priv, rcd, flow, fecn); 4291 4292 if (cmp_psn(psn, flow->flow_state.r_next_psn)) 4293 goto send_nak; 4294 4295 flow->flow_state.r_next_psn = mask_psn(psn + 1); 4296 /* 4297 * Copy the payload to destination buffer if this packet is 4298 * delivered as an eager packet due to RSM rule and FECN. 4299 * The RSM rule selects FECN bit in BTH and SH bit in 4300 * KDETH header and therefore will not match the last 4301 * packet of each segment that has SH bit cleared. 4302 */ 4303 if (fecn && packet->etype == RHF_RCV_TYPE_EAGER) { 4304 struct rvt_sge_state ss; 4305 u32 len; 4306 u32 tlen = packet->tlen; 4307 u16 hdrsize = packet->hlen; 4308 u8 pad = packet->pad; 4309 u8 extra_bytes = pad + packet->extra_byte + 4310 (SIZE_OF_CRC << 2); 4311 u32 pmtu = qp->pmtu; 4312 4313 if (unlikely(tlen != (hdrsize + pmtu + extra_bytes))) 4314 goto send_nak; 4315 len = req->comp_seg * req->seg_len; 4316 len += delta_psn(psn, 4317 full_flow_psn(flow, flow->flow_state.spsn)) * 4318 pmtu; 4319 if (unlikely(req->total_len - len < pmtu)) 4320 goto send_nak; 4321 4322 /* 4323 * The e->rdma_sge field is set when TID RDMA WRITE REQ 4324 * is first received and is never modified thereafter. 4325 */ 4326 ss.sge = e->rdma_sge; 4327 ss.sg_list = NULL; 4328 ss.num_sge = 1; 4329 ss.total_len = req->total_len; 4330 rvt_skip_sge(&ss, len, false); 4331 rvt_copy_sge(qp, &ss, packet->payload, pmtu, false, 4332 false); 4333 /* Raise the sw sequence check flag for next packet */ 4334 priv->r_next_psn_kdeth = mask_psn(psn + 1); 4335 priv->s_flags |= HFI1_R_TID_SW_PSN; 4336 } 4337 goto exit; 4338 } 4339 flow->flow_state.r_next_psn = mask_psn(psn + 1); 4340 hfi1_kern_exp_rcv_clear(req); 4341 priv->alloc_w_segs--; 4342 rcd->flows[flow->idx].psn = psn & HFI1_KDETH_BTH_SEQ_MASK; 4343 req->comp_seg++; 4344 priv->s_nak_state = 0; 4345 4346 /* 4347 * Release the flow if one of the following conditions has been met: 4348 * - The request has reached a sync point AND all outstanding 4349 * segments have been completed, or 4350 * - The entire request is complete and there are no more requests 4351 * (of any kind) in the queue. 4352 */ 4353 trace_hfi1_rsp_rcv_tid_write_data(qp, psn); 4354 trace_hfi1_tid_req_rcv_write_data(qp, 0, e->opcode, e->psn, e->lpsn, 4355 req); 4356 trace_hfi1_tid_write_rsp_rcv_data(qp); 4357 validate_r_tid_ack(priv); 4358 4359 if (opcode == TID_OP(WRITE_DATA_LAST)) { 4360 release_rdma_sge_mr(e); 4361 for (next = priv->r_tid_tail + 1; ; next++) { 4362 if (next > rvt_size_atomic(&dev->rdi)) 4363 next = 0; 4364 if (next == priv->r_tid_head) 4365 break; 4366 e = &qp->s_ack_queue[next]; 4367 if (e->opcode == TID_OP(WRITE_REQ)) 4368 break; 4369 } 4370 priv->r_tid_tail = next; 4371 if (++qp->s_acked_ack_queue > rvt_size_atomic(&dev->rdi)) 4372 qp->s_acked_ack_queue = 0; 4373 } 4374 4375 hfi1_tid_write_alloc_resources(qp, true); 4376 4377 /* 4378 * If we need to generate more responses, schedule the 4379 * send engine. 
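 * More responses are needed while this request still has segments
 * without a TID RDMA WRITE RESP (cur_seg < total_segs) or other entries
 * sit behind it in the ack queue.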
4380 */ 4381 if (req->cur_seg < req->total_segs || 4382 qp->s_tail_ack_queue != qp->r_head_ack_queue) { 4383 qp->s_flags |= RVT_S_RESP_PENDING; 4384 hfi1_schedule_send(qp); 4385 } 4386 4387 priv->pending_tid_w_segs--; 4388 if (priv->s_flags & HFI1_R_TID_RSC_TIMER) { 4389 if (priv->pending_tid_w_segs) 4390 hfi1_mod_tid_reap_timer(req->qp); 4391 else 4392 hfi1_stop_tid_reap_timer(req->qp); 4393 } 4394 4395 done: 4396 tid_rdma_schedule_ack(qp); 4397 exit: 4398 priv->r_next_psn_kdeth = flow->flow_state.r_next_psn; 4399 if (fecn) 4400 qp->s_flags |= RVT_S_ECN; 4401 spin_unlock_irqrestore(&qp->s_lock, flags); 4402 return; 4403 4404 send_nak: 4405 if (!priv->s_nak_state) { 4406 priv->s_nak_state = IB_NAK_PSN_ERROR; 4407 priv->s_nak_psn = flow->flow_state.r_next_psn; 4408 tid_rdma_trigger_ack(qp); 4409 } 4410 goto done; 4411 } 4412 4413 static bool hfi1_tid_rdma_is_resync_psn(u32 psn) 4414 { 4415 return (bool)((psn & HFI1_KDETH_BTH_SEQ_MASK) == 4416 HFI1_KDETH_BTH_SEQ_MASK); 4417 } 4418 4419 u32 hfi1_build_tid_rdma_write_ack(struct rvt_qp *qp, struct rvt_ack_entry *e, 4420 struct ib_other_headers *ohdr, u16 iflow, 4421 u32 *bth1, u32 *bth2) 4422 { 4423 struct hfi1_qp_priv *qpriv = qp->priv; 4424 struct tid_flow_state *fs = &qpriv->flow_state; 4425 struct tid_rdma_request *req = ack_to_tid_req(e); 4426 struct tid_rdma_flow *flow = &req->flows[iflow]; 4427 struct tid_rdma_params *remote; 4428 4429 rcu_read_lock(); 4430 remote = rcu_dereference(qpriv->tid_rdma.remote); 4431 KDETH_RESET(ohdr->u.tid_rdma.ack.kdeth1, JKEY, remote->jkey); 4432 ohdr->u.tid_rdma.ack.verbs_qp = cpu_to_be32(qp->remote_qpn); 4433 *bth1 = remote->qp; 4434 rcu_read_unlock(); 4435 4436 if (qpriv->resync) { 4437 *bth2 = mask_psn((fs->generation << 4438 HFI1_KDETH_BTH_SEQ_SHIFT) - 1); 4439 ohdr->u.tid_rdma.ack.aeth = rvt_compute_aeth(qp); 4440 } else if (qpriv->s_nak_state) { 4441 *bth2 = mask_psn(qpriv->s_nak_psn); 4442 ohdr->u.tid_rdma.ack.aeth = 4443 cpu_to_be32((qp->r_msn & IB_MSN_MASK) | 4444 (qpriv->s_nak_state << 4445 IB_AETH_CREDIT_SHIFT)); 4446 } else { 4447 *bth2 = full_flow_psn(flow, flow->flow_state.lpsn); 4448 ohdr->u.tid_rdma.ack.aeth = rvt_compute_aeth(qp); 4449 } 4450 KDETH_RESET(ohdr->u.tid_rdma.ack.kdeth0, KVER, 0x1); 4451 ohdr->u.tid_rdma.ack.tid_flow_qp = 4452 cpu_to_be32(qpriv->tid_rdma.local.qp | 4453 ((flow->idx & TID_RDMA_DESTQP_FLOW_MASK) << 4454 TID_RDMA_DESTQP_FLOW_SHIFT) | 4455 qpriv->rcd->ctxt); 4456 4457 ohdr->u.tid_rdma.ack.tid_flow_psn = 0; 4458 ohdr->u.tid_rdma.ack.verbs_psn = 4459 cpu_to_be32(flow->flow_state.resp_ib_psn); 4460 4461 if (qpriv->resync) { 4462 /* 4463 * If the PSN before the current expect KDETH PSN is the 4464 * RESYNC PSN, then we never received a good TID RDMA WRITE 4465 * DATA packet after a previous RESYNC. 4466 * In this case, the next expected KDETH PSN stays the same. 4467 */ 4468 if (hfi1_tid_rdma_is_resync_psn(qpriv->r_next_psn_kdeth - 1)) { 4469 ohdr->u.tid_rdma.ack.tid_flow_psn = 4470 cpu_to_be32(qpriv->r_next_psn_kdeth_save); 4471 } else { 4472 /* 4473 * Because the KDETH PSNs jump during a RESYNC, it's 4474 * not possible to infer (or compute) the previous value 4475 * of r_next_psn_kdeth in the case of back-to-back 4476 * RESYNC packets. Therefore, we save it. 
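 * The saved value is what gets advertised in tid_flow_psn below, so the
 * requester learns how far the KDETH PSN space actually advanced before
 * the RESYNC.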
4477 */ 4478 qpriv->r_next_psn_kdeth_save = 4479 qpriv->r_next_psn_kdeth - 1; 4480 ohdr->u.tid_rdma.ack.tid_flow_psn = 4481 cpu_to_be32(qpriv->r_next_psn_kdeth_save); 4482 qpriv->r_next_psn_kdeth = mask_psn(*bth2 + 1); 4483 } 4484 qpriv->resync = false; 4485 } 4486 4487 return sizeof(ohdr->u.tid_rdma.ack) / sizeof(u32); 4488 } 4489 4490 void hfi1_rc_rcv_tid_rdma_ack(struct hfi1_packet *packet) 4491 { 4492 struct ib_other_headers *ohdr = packet->ohdr; 4493 struct rvt_qp *qp = packet->qp; 4494 struct hfi1_qp_priv *qpriv = qp->priv; 4495 struct rvt_swqe *wqe; 4496 struct tid_rdma_request *req; 4497 struct tid_rdma_flow *flow; 4498 u32 aeth, psn, req_psn, ack_psn, flpsn, resync_psn, ack_kpsn; 4499 unsigned long flags; 4500 u16 fidx; 4501 4502 trace_hfi1_tid_write_sender_rcv_tid_ack(qp, 0); 4503 process_ecn(qp, packet); 4504 psn = mask_psn(be32_to_cpu(ohdr->bth[2])); 4505 aeth = be32_to_cpu(ohdr->u.tid_rdma.ack.aeth); 4506 req_psn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.ack.verbs_psn)); 4507 resync_psn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.ack.tid_flow_psn)); 4508 4509 spin_lock_irqsave(&qp->s_lock, flags); 4510 trace_hfi1_rcv_tid_ack(qp, aeth, psn, req_psn, resync_psn); 4511 4512 /* If we are waiting for an ACK to RESYNC, drop any other packets */ 4513 if ((qp->s_flags & HFI1_S_WAIT_HALT) && 4514 cmp_psn(psn, qpriv->s_resync_psn)) 4515 goto ack_op_err; 4516 4517 ack_psn = req_psn; 4518 if (hfi1_tid_rdma_is_resync_psn(psn)) 4519 ack_kpsn = resync_psn; 4520 else 4521 ack_kpsn = psn; 4522 if (aeth >> 29) { 4523 ack_psn--; 4524 ack_kpsn--; 4525 } 4526 4527 if (unlikely(qp->s_acked == qp->s_tail)) 4528 goto ack_op_err; 4529 4530 wqe = rvt_get_swqe_ptr(qp, qp->s_acked); 4531 4532 if (wqe->wr.opcode != IB_WR_TID_RDMA_WRITE) 4533 goto ack_op_err; 4534 4535 req = wqe_to_tid_req(wqe); 4536 trace_hfi1_tid_req_rcv_tid_ack(qp, 0, wqe->wr.opcode, wqe->psn, 4537 wqe->lpsn, req); 4538 flow = &req->flows[req->acked_tail]; 4539 trace_hfi1_tid_flow_rcv_tid_ack(qp, req->acked_tail, flow); 4540 4541 /* Drop stale ACK/NAK */ 4542 if (cmp_psn(psn, full_flow_psn(flow, flow->flow_state.spsn)) < 0 || 4543 cmp_psn(req_psn, flow->flow_state.resp_ib_psn) < 0) 4544 goto ack_op_err; 4545 4546 while (cmp_psn(ack_kpsn, 4547 full_flow_psn(flow, flow->flow_state.lpsn)) >= 0 && 4548 req->ack_seg < req->cur_seg) { 4549 req->ack_seg++; 4550 /* advance acked segment pointer */ 4551 req->acked_tail = CIRC_NEXT(req->acked_tail, MAX_FLOWS); 4552 req->r_last_acked = flow->flow_state.resp_ib_psn; 4553 trace_hfi1_tid_req_rcv_tid_ack(qp, 0, wqe->wr.opcode, wqe->psn, 4554 wqe->lpsn, req); 4555 if (req->ack_seg == req->total_segs) { 4556 req->state = TID_REQUEST_COMPLETE; 4557 wqe = do_rc_completion(qp, wqe, 4558 to_iport(qp->ibqp.device, 4559 qp->port_num)); 4560 trace_hfi1_sender_rcv_tid_ack(qp); 4561 atomic_dec(&qpriv->n_tid_requests); 4562 if (qp->s_acked == qp->s_tail) 4563 break; 4564 if (wqe->wr.opcode != IB_WR_TID_RDMA_WRITE) 4565 break; 4566 req = wqe_to_tid_req(wqe); 4567 } 4568 flow = &req->flows[req->acked_tail]; 4569 trace_hfi1_tid_flow_rcv_tid_ack(qp, req->acked_tail, flow); 4570 } 4571 4572 trace_hfi1_tid_req_rcv_tid_ack(qp, 0, wqe->wr.opcode, wqe->psn, 4573 wqe->lpsn, req); 4574 switch (aeth >> 29) { 4575 case 0: /* ACK */ 4576 if (qpriv->s_flags & RVT_S_WAIT_ACK) 4577 qpriv->s_flags &= ~RVT_S_WAIT_ACK; 4578 if (!hfi1_tid_rdma_is_resync_psn(psn)) { 4579 /* Check if there is any pending TID ACK */ 4580 if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE && 4581 req->ack_seg < req->cur_seg) 4582 hfi1_mod_tid_retry_timer(qp); 4583 
else 4584 hfi1_stop_tid_retry_timer(qp); 4585 hfi1_schedule_send(qp); 4586 } else { 4587 u32 spsn, fpsn, last_acked, generation; 4588 struct tid_rdma_request *rptr; 4589 4590 /* ACK(RESYNC) */ 4591 hfi1_stop_tid_retry_timer(qp); 4592 /* Allow new requests (see hfi1_make_tid_rdma_pkt) */ 4593 qp->s_flags &= ~HFI1_S_WAIT_HALT; 4594 /* 4595 * Clear RVT_S_SEND_ONE flag in case that the TID RDMA 4596 * ACK is received after the TID retry timer is fired 4597 * again. In this case, do not send any more TID 4598 * RESYNC request or wait for any more TID ACK packet. 4599 */ 4600 qpriv->s_flags &= ~RVT_S_SEND_ONE; 4601 hfi1_schedule_send(qp); 4602 4603 if ((qp->s_acked == qpriv->s_tid_tail && 4604 req->ack_seg == req->total_segs) || 4605 qp->s_acked == qp->s_tail) { 4606 qpriv->s_state = TID_OP(WRITE_DATA_LAST); 4607 goto done; 4608 } 4609 4610 if (req->ack_seg == req->comp_seg) { 4611 qpriv->s_state = TID_OP(WRITE_DATA); 4612 goto done; 4613 } 4614 4615 /* 4616 * The PSN to start with is the next PSN after the 4617 * RESYNC PSN. 4618 */ 4619 psn = mask_psn(psn + 1); 4620 generation = psn >> HFI1_KDETH_BTH_SEQ_SHIFT; 4621 spsn = 0; 4622 4623 /* 4624 * Update to the correct WQE when we get an ACK(RESYNC) 4625 * in the middle of a request. 4626 */ 4627 if (delta_psn(ack_psn, wqe->lpsn)) 4628 wqe = rvt_get_swqe_ptr(qp, qp->s_acked); 4629 req = wqe_to_tid_req(wqe); 4630 flow = &req->flows[req->acked_tail]; 4631 /* 4632 * RESYNC re-numbers the PSN ranges of all remaining 4633 * segments. Also, PSN's start from 0 in the middle of a 4634 * segment and the first segment size is less than the 4635 * default number of packets. flow->resync_npkts is used 4636 * to track the number of packets from the start of the 4637 * real segment to the point of 0 PSN after the RESYNC 4638 * in order to later correctly rewind the SGE. 4639 */ 4640 fpsn = full_flow_psn(flow, flow->flow_state.spsn); 4641 req->r_ack_psn = psn; 4642 /* 4643 * If resync_psn points to the last flow PSN for a 4644 * segment and the new segment (likely from a new 4645 * request) starts with a new generation number, we 4646 * need to adjust resync_psn accordingly. 4647 */ 4648 if (flow->flow_state.generation != 4649 (resync_psn >> HFI1_KDETH_BTH_SEQ_SHIFT)) 4650 resync_psn = mask_psn(fpsn - 1); 4651 flow->resync_npkts += 4652 delta_psn(mask_psn(resync_psn + 1), fpsn); 4653 /* 4654 * Renumber all packet sequence number ranges 4655 * based on the new generation. 
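 * For example, if 3 packets of the current flow and 5 of the next flow
 * remain outstanding past the RESYNC point, they are renumbered to
 * sequences 0-2 and 3-7 of the new generation, and resync_psn advances
 * by the same counts so later requests continue the numbering.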
4656 */ 4657 last_acked = qp->s_acked; 4658 rptr = req; 4659 while (1) { 4660 /* start from last acked segment */ 4661 for (fidx = rptr->acked_tail; 4662 CIRC_CNT(rptr->setup_head, fidx, 4663 MAX_FLOWS); 4664 fidx = CIRC_NEXT(fidx, MAX_FLOWS)) { 4665 u32 lpsn; 4666 u32 gen; 4667 4668 flow = &rptr->flows[fidx]; 4669 gen = flow->flow_state.generation; 4670 if (WARN_ON(gen == generation && 4671 flow->flow_state.spsn != 4672 spsn)) 4673 continue; 4674 lpsn = flow->flow_state.lpsn; 4675 lpsn = full_flow_psn(flow, lpsn); 4676 flow->npkts = 4677 delta_psn(lpsn, 4678 mask_psn(resync_psn) 4679 ); 4680 flow->flow_state.generation = 4681 generation; 4682 flow->flow_state.spsn = spsn; 4683 flow->flow_state.lpsn = 4684 flow->flow_state.spsn + 4685 flow->npkts - 1; 4686 flow->pkt = 0; 4687 spsn += flow->npkts; 4688 resync_psn += flow->npkts; 4689 trace_hfi1_tid_flow_rcv_tid_ack(qp, 4690 fidx, 4691 flow); 4692 } 4693 if (++last_acked == qpriv->s_tid_cur + 1) 4694 break; 4695 if (last_acked == qp->s_size) 4696 last_acked = 0; 4697 wqe = rvt_get_swqe_ptr(qp, last_acked); 4698 rptr = wqe_to_tid_req(wqe); 4699 } 4700 req->cur_seg = req->ack_seg; 4701 qpriv->s_tid_tail = qp->s_acked; 4702 qpriv->s_state = TID_OP(WRITE_REQ); 4703 hfi1_schedule_tid_send(qp); 4704 } 4705 done: 4706 qpriv->s_retry = qp->s_retry_cnt; 4707 break; 4708 4709 case 3: /* NAK */ 4710 hfi1_stop_tid_retry_timer(qp); 4711 switch ((aeth >> IB_AETH_CREDIT_SHIFT) & 4712 IB_AETH_CREDIT_MASK) { 4713 case 0: /* PSN sequence error */ 4714 if (!req->flows) 4715 break; 4716 flow = &req->flows[req->acked_tail]; 4717 flpsn = full_flow_psn(flow, flow->flow_state.lpsn); 4718 if (cmp_psn(psn, flpsn) > 0) 4719 break; 4720 trace_hfi1_tid_flow_rcv_tid_ack(qp, req->acked_tail, 4721 flow); 4722 req->r_ack_psn = mask_psn(be32_to_cpu(ohdr->bth[2])); 4723 req->cur_seg = req->ack_seg; 4724 qpriv->s_tid_tail = qp->s_acked; 4725 qpriv->s_state = TID_OP(WRITE_REQ); 4726 qpriv->s_retry = qp->s_retry_cnt; 4727 hfi1_schedule_tid_send(qp); 4728 break; 4729 4730 default: 4731 break; 4732 } 4733 break; 4734 4735 default: 4736 break; 4737 } 4738 4739 ack_op_err: 4740 spin_unlock_irqrestore(&qp->s_lock, flags); 4741 } 4742 4743 void hfi1_add_tid_retry_timer(struct rvt_qp *qp) 4744 { 4745 struct hfi1_qp_priv *priv = qp->priv; 4746 struct ib_qp *ibqp = &qp->ibqp; 4747 struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device); 4748 4749 lockdep_assert_held(&qp->s_lock); 4750 if (!(priv->s_flags & HFI1_S_TID_RETRY_TIMER)) { 4751 priv->s_flags |= HFI1_S_TID_RETRY_TIMER; 4752 priv->s_tid_retry_timer.expires = jiffies + 4753 priv->tid_retry_timeout_jiffies + rdi->busy_jiffies; 4754 add_timer(&priv->s_tid_retry_timer); 4755 } 4756 } 4757 4758 static void hfi1_mod_tid_retry_timer(struct rvt_qp *qp) 4759 { 4760 struct hfi1_qp_priv *priv = qp->priv; 4761 struct ib_qp *ibqp = &qp->ibqp; 4762 struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device); 4763 4764 lockdep_assert_held(&qp->s_lock); 4765 priv->s_flags |= HFI1_S_TID_RETRY_TIMER; 4766 mod_timer(&priv->s_tid_retry_timer, jiffies + 4767 priv->tid_retry_timeout_jiffies + rdi->busy_jiffies); 4768 } 4769 4770 static int hfi1_stop_tid_retry_timer(struct rvt_qp *qp) 4771 { 4772 struct hfi1_qp_priv *priv = qp->priv; 4773 int rval = 0; 4774 4775 lockdep_assert_held(&qp->s_lock); 4776 if (priv->s_flags & HFI1_S_TID_RETRY_TIMER) { 4777 rval = del_timer(&priv->s_tid_retry_timer); 4778 priv->s_flags &= ~HFI1_S_TID_RETRY_TIMER; 4779 } 4780 return rval; 4781 } 4782 4783 void hfi1_del_tid_retry_timer(struct rvt_qp *qp) 4784 { 4785 struct hfi1_qp_priv *priv 
= qp->priv; 4786 4787 del_timer_sync(&priv->s_tid_retry_timer); 4788 priv->s_flags &= ~HFI1_S_TID_RETRY_TIMER; 4789 } 4790 4791 static void hfi1_tid_retry_timeout(struct timer_list *t) 4792 { 4793 struct hfi1_qp_priv *priv = from_timer(priv, t, s_tid_retry_timer); 4794 struct rvt_qp *qp = priv->owner; 4795 struct rvt_swqe *wqe; 4796 unsigned long flags; 4797 struct tid_rdma_request *req; 4798 4799 spin_lock_irqsave(&qp->r_lock, flags); 4800 spin_lock(&qp->s_lock); 4801 trace_hfi1_tid_write_sender_retry_timeout(qp, 0); 4802 if (priv->s_flags & HFI1_S_TID_RETRY_TIMER) { 4803 hfi1_stop_tid_retry_timer(qp); 4804 if (!priv->s_retry) { 4805 trace_hfi1_msg_tid_retry_timeout(/* msg */ 4806 qp, 4807 "Exhausted retries. Tid retry timeout = ", 4808 (u64)priv->tid_retry_timeout_jiffies); 4809 4810 wqe = rvt_get_swqe_ptr(qp, qp->s_acked); 4811 hfi1_trdma_send_complete(qp, wqe, IB_WC_RETRY_EXC_ERR); 4812 rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR); 4813 } else { 4814 wqe = rvt_get_swqe_ptr(qp, qp->s_acked); 4815 req = wqe_to_tid_req(wqe); 4816 trace_hfi1_tid_req_tid_retry_timeout(/* req */ 4817 qp, 0, wqe->wr.opcode, wqe->psn, wqe->lpsn, req); 4818 4819 priv->s_flags &= ~RVT_S_WAIT_ACK; 4820 /* Only send one packet (the RESYNC) */ 4821 priv->s_flags |= RVT_S_SEND_ONE; 4822 /* 4823 * No additional request shall be made by this QP until 4824 * the RESYNC has been complete. 4825 */ 4826 qp->s_flags |= HFI1_S_WAIT_HALT; 4827 priv->s_state = TID_OP(RESYNC); 4828 priv->s_retry--; 4829 hfi1_schedule_tid_send(qp); 4830 } 4831 } 4832 spin_unlock(&qp->s_lock); 4833 spin_unlock_irqrestore(&qp->r_lock, flags); 4834 } 4835 4836 u32 hfi1_build_tid_rdma_resync(struct rvt_qp *qp, struct rvt_swqe *wqe, 4837 struct ib_other_headers *ohdr, u32 *bth1, 4838 u32 *bth2, u16 fidx) 4839 { 4840 struct hfi1_qp_priv *qpriv = qp->priv; 4841 struct tid_rdma_params *remote; 4842 struct tid_rdma_request *req = wqe_to_tid_req(wqe); 4843 struct tid_rdma_flow *flow = &req->flows[fidx]; 4844 u32 generation; 4845 4846 rcu_read_lock(); 4847 remote = rcu_dereference(qpriv->tid_rdma.remote); 4848 KDETH_RESET(ohdr->u.tid_rdma.ack.kdeth1, JKEY, remote->jkey); 4849 ohdr->u.tid_rdma.ack.verbs_qp = cpu_to_be32(qp->remote_qpn); 4850 *bth1 = remote->qp; 4851 rcu_read_unlock(); 4852 4853 generation = kern_flow_generation_next(flow->flow_state.generation); 4854 *bth2 = mask_psn((generation << HFI1_KDETH_BTH_SEQ_SHIFT) - 1); 4855 qpriv->s_resync_psn = *bth2; 4856 *bth2 |= IB_BTH_REQ_ACK; 4857 KDETH_RESET(ohdr->u.tid_rdma.ack.kdeth0, KVER, 0x1); 4858 4859 return sizeof(ohdr->u.tid_rdma.resync) / sizeof(u32); 4860 } 4861 4862 void hfi1_rc_rcv_tid_rdma_resync(struct hfi1_packet *packet) 4863 { 4864 struct ib_other_headers *ohdr = packet->ohdr; 4865 struct rvt_qp *qp = packet->qp; 4866 struct hfi1_qp_priv *qpriv = qp->priv; 4867 struct hfi1_ctxtdata *rcd = qpriv->rcd; 4868 struct hfi1_ibdev *dev = to_idev(qp->ibqp.device); 4869 struct rvt_ack_entry *e; 4870 struct tid_rdma_request *req; 4871 struct tid_rdma_flow *flow; 4872 struct tid_flow_state *fs = &qpriv->flow_state; 4873 u32 psn, generation, idx, gen_next; 4874 bool fecn; 4875 unsigned long flags; 4876 4877 fecn = process_ecn(qp, packet); 4878 psn = mask_psn(be32_to_cpu(ohdr->bth[2])); 4879 4880 generation = mask_psn(psn + 1) >> HFI1_KDETH_BTH_SEQ_SHIFT; 4881 spin_lock_irqsave(&qp->s_lock, flags); 4882 4883 gen_next = (fs->generation == KERN_GENERATION_RESERVED) ? 
4884 generation : kern_flow_generation_next(fs->generation); 4885 /* 4886 * RESYNC packet contains the "next" generation and can only be 4887 * from the current or previous generations 4888 */ 4889 if (generation != mask_generation(gen_next - 1) && 4890 generation != gen_next) 4891 goto bail; 4892 /* Already processing a resync */ 4893 if (qpriv->resync) 4894 goto bail; 4895 4896 spin_lock(&rcd->exp_lock); 4897 if (fs->index >= RXE_NUM_TID_FLOWS) { 4898 /* 4899 * If we don't have a flow, save the generation so it can be 4900 * applied when a new flow is allocated 4901 */ 4902 fs->generation = generation; 4903 } else { 4904 /* Reprogram the QP flow with new generation */ 4905 rcd->flows[fs->index].generation = generation; 4906 fs->generation = kern_setup_hw_flow(rcd, fs->index); 4907 } 4908 fs->psn = 0; 4909 /* 4910 * Disable SW PSN checking since a RESYNC is equivalent to a 4911 * sync point and the flow has/will be reprogrammed 4912 */ 4913 qpriv->s_flags &= ~HFI1_R_TID_SW_PSN; 4914 trace_hfi1_tid_write_rsp_rcv_resync(qp); 4915 4916 /* 4917 * Reset all TID flow information with the new generation. 4918 * This is done for all requests and segments after the 4919 * last received segment 4920 */ 4921 for (idx = qpriv->r_tid_tail; ; idx++) { 4922 u16 flow_idx; 4923 4924 if (idx > rvt_size_atomic(&dev->rdi)) 4925 idx = 0; 4926 e = &qp->s_ack_queue[idx]; 4927 if (e->opcode == TID_OP(WRITE_REQ)) { 4928 req = ack_to_tid_req(e); 4929 trace_hfi1_tid_req_rcv_resync(qp, 0, e->opcode, e->psn, 4930 e->lpsn, req); 4931 4932 /* start from last unacked segment */ 4933 for (flow_idx = req->clear_tail; 4934 CIRC_CNT(req->setup_head, flow_idx, 4935 MAX_FLOWS); 4936 flow_idx = CIRC_NEXT(flow_idx, MAX_FLOWS)) { 4937 u32 lpsn; 4938 u32 next; 4939 4940 flow = &req->flows[flow_idx]; 4941 lpsn = full_flow_psn(flow, 4942 flow->flow_state.lpsn); 4943 next = flow->flow_state.r_next_psn; 4944 flow->npkts = delta_psn(lpsn, next - 1); 4945 flow->flow_state.generation = fs->generation; 4946 flow->flow_state.spsn = fs->psn; 4947 flow->flow_state.lpsn = 4948 flow->flow_state.spsn + flow->npkts - 1; 4949 flow->flow_state.r_next_psn = 4950 full_flow_psn(flow, 4951 flow->flow_state.spsn); 4952 fs->psn += flow->npkts; 4953 trace_hfi1_tid_flow_rcv_resync(qp, flow_idx, 4954 flow); 4955 } 4956 } 4957 if (idx == qp->s_tail_ack_queue) 4958 break; 4959 } 4960 4961 spin_unlock(&rcd->exp_lock); 4962 qpriv->resync = true; 4963 /* RESYNC request always gets a TID RDMA ACK. */ 4964 qpriv->s_nak_state = 0; 4965 tid_rdma_trigger_ack(qp); 4966 bail: 4967 if (fecn) 4968 qp->s_flags |= RVT_S_ECN; 4969 spin_unlock_irqrestore(&qp->s_lock, flags); 4970 } 4971 4972 /* 4973 * Call this function when the last TID RDMA WRITE DATA packet for a request 4974 * is built. 
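 * It advances s_tid_tail (never past s_tid_cur) to the next TID RDMA
 * WRITE WQE and puts the second leg into the WRITE_RESP state for that
 * request.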
4975 */ 4976 static void update_tid_tail(struct rvt_qp *qp) 4977 __must_hold(&qp->s_lock) 4978 { 4979 struct hfi1_qp_priv *priv = qp->priv; 4980 u32 i; 4981 struct rvt_swqe *wqe; 4982 4983 lockdep_assert_held(&qp->s_lock); 4984 /* Can't move beyond s_tid_cur */ 4985 if (priv->s_tid_tail == priv->s_tid_cur) 4986 return; 4987 for (i = priv->s_tid_tail + 1; ; i++) { 4988 if (i == qp->s_size) 4989 i = 0; 4990 4991 if (i == priv->s_tid_cur) 4992 break; 4993 wqe = rvt_get_swqe_ptr(qp, i); 4994 if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE) 4995 break; 4996 } 4997 priv->s_tid_tail = i; 4998 priv->s_state = TID_OP(WRITE_RESP); 4999 } 5000 5001 int hfi1_make_tid_rdma_pkt(struct rvt_qp *qp, struct hfi1_pkt_state *ps) 5002 __must_hold(&qp->s_lock) 5003 { 5004 struct hfi1_qp_priv *priv = qp->priv; 5005 struct rvt_swqe *wqe; 5006 u32 bth1 = 0, bth2 = 0, hwords = 5, len, middle = 0; 5007 struct ib_other_headers *ohdr; 5008 struct rvt_sge_state *ss = &qp->s_sge; 5009 struct rvt_ack_entry *e = &qp->s_ack_queue[qp->s_tail_ack_queue]; 5010 struct tid_rdma_request *req = ack_to_tid_req(e); 5011 bool last = false; 5012 u8 opcode = TID_OP(WRITE_DATA); 5013 5014 lockdep_assert_held(&qp->s_lock); 5015 trace_hfi1_tid_write_sender_make_tid_pkt(qp, 0); 5016 /* 5017 * Prioritize the sending of the requests and responses over the 5018 * sending of the TID RDMA data packets. 5019 */ 5020 if (((atomic_read(&priv->n_tid_requests) < HFI1_TID_RDMA_WRITE_CNT) && 5021 atomic_read(&priv->n_requests) && 5022 !(qp->s_flags & (RVT_S_BUSY | RVT_S_WAIT_ACK | 5023 HFI1_S_ANY_WAIT_IO))) || 5024 (e->opcode == TID_OP(WRITE_REQ) && req->cur_seg < req->alloc_seg && 5025 !(qp->s_flags & (RVT_S_BUSY | HFI1_S_ANY_WAIT_IO)))) { 5026 struct iowait_work *iowork; 5027 5028 iowork = iowait_get_ib_work(&priv->s_iowait); 5029 ps->s_txreq = get_waiting_verbs_txreq(iowork); 5030 if (ps->s_txreq || hfi1_make_rc_req(qp, ps)) { 5031 priv->s_flags |= HFI1_S_TID_BUSY_SET; 5032 return 1; 5033 } 5034 } 5035 5036 ps->s_txreq = get_txreq(ps->dev, qp); 5037 if (!ps->s_txreq) 5038 goto bail_no_tx; 5039 5040 ohdr = &ps->s_txreq->phdr.hdr.ibh.u.oth; 5041 5042 if ((priv->s_flags & RVT_S_ACK_PENDING) && 5043 make_tid_rdma_ack(qp, ohdr, ps)) 5044 return 1; 5045 5046 /* 5047 * Bail out if we can't send data. 5048 * Be reminded that this check must been done after the call to 5049 * make_tid_rdma_ack() because the responding QP could be in 5050 * RTR state where it can send TID RDMA ACK, not TID RDMA WRITE DATA. 5051 */ 5052 if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_SEND_OK)) 5053 goto bail; 5054 5055 if (priv->s_flags & RVT_S_WAIT_ACK) 5056 goto bail; 5057 5058 /* Check whether there is anything to do. */ 5059 if (priv->s_tid_tail == HFI1_QP_WQE_INVALID) 5060 goto bail; 5061 wqe = rvt_get_swqe_ptr(qp, priv->s_tid_tail); 5062 req = wqe_to_tid_req(wqe); 5063 trace_hfi1_tid_req_make_tid_pkt(qp, 0, wqe->wr.opcode, wqe->psn, 5064 wqe->lpsn, req); 5065 switch (priv->s_state) { 5066 case TID_OP(WRITE_REQ): 5067 case TID_OP(WRITE_RESP): 5068 priv->tid_ss.sge = wqe->sg_list[0]; 5069 priv->tid_ss.sg_list = wqe->sg_list + 1; 5070 priv->tid_ss.num_sge = wqe->wr.num_sge; 5071 priv->tid_ss.total_len = wqe->length; 5072 5073 if (priv->s_state == TID_OP(WRITE_REQ)) 5074 hfi1_tid_rdma_restart_req(qp, wqe, &bth2); 5075 priv->s_state = TID_OP(WRITE_DATA); 5076 fallthrough; 5077 5078 case TID_OP(WRITE_DATA): 5079 /* 5080 * 1. Check whether TID RDMA WRITE RESP available. 5081 * 2. 
If no: 5082 * 2.1 If have more segments and no TID RDMA WRITE RESP, 5083 * set HFI1_S_WAIT_TID_RESP 5084 * 2.2 Return indicating no progress made. 5085 * 3. If yes: 5086 * 3.1 Build TID RDMA WRITE DATA packet. 5087 * 3.2 If last packet in segment: 5088 * 3.2.1 Change KDETH header bits 5089 * 3.2.2 Advance RESP pointers. 5090 * 3.3 Return indicating progress made. 5091 */ 5092 trace_hfi1_sender_make_tid_pkt(qp); 5093 trace_hfi1_tid_write_sender_make_tid_pkt(qp, 0); 5094 wqe = rvt_get_swqe_ptr(qp, priv->s_tid_tail); 5095 req = wqe_to_tid_req(wqe); 5096 len = wqe->length; 5097 5098 if (!req->comp_seg || req->cur_seg == req->comp_seg) 5099 goto bail; 5100 5101 trace_hfi1_tid_req_make_tid_pkt(qp, 0, wqe->wr.opcode, 5102 wqe->psn, wqe->lpsn, req); 5103 last = hfi1_build_tid_rdma_packet(wqe, ohdr, &bth1, &bth2, 5104 &len); 5105 5106 if (last) { 5107 /* move pointer to next flow */ 5108 req->clear_tail = CIRC_NEXT(req->clear_tail, 5109 MAX_FLOWS); 5110 if (++req->cur_seg < req->total_segs) { 5111 if (!CIRC_CNT(req->setup_head, req->clear_tail, 5112 MAX_FLOWS)) 5113 qp->s_flags |= HFI1_S_WAIT_TID_RESP; 5114 } else { 5115 priv->s_state = TID_OP(WRITE_DATA_LAST); 5116 opcode = TID_OP(WRITE_DATA_LAST); 5117 5118 /* Advance the s_tid_tail now */ 5119 update_tid_tail(qp); 5120 } 5121 } 5122 hwords += sizeof(ohdr->u.tid_rdma.w_data) / sizeof(u32); 5123 ss = &priv->tid_ss; 5124 break; 5125 5126 case TID_OP(RESYNC): 5127 trace_hfi1_sender_make_tid_pkt(qp); 5128 /* Use generation from the most recently received response */ 5129 wqe = rvt_get_swqe_ptr(qp, priv->s_tid_cur); 5130 req = wqe_to_tid_req(wqe); 5131 /* If no responses for this WQE look at the previous one */ 5132 if (!req->comp_seg) { 5133 wqe = rvt_get_swqe_ptr(qp, 5134 (!priv->s_tid_cur ? qp->s_size : 5135 priv->s_tid_cur) - 1); 5136 req = wqe_to_tid_req(wqe); 5137 } 5138 hwords += hfi1_build_tid_rdma_resync(qp, wqe, ohdr, &bth1, 5139 &bth2, 5140 CIRC_PREV(req->setup_head, 5141 MAX_FLOWS)); 5142 ss = NULL; 5143 len = 0; 5144 opcode = TID_OP(RESYNC); 5145 break; 5146 5147 default: 5148 goto bail; 5149 } 5150 if (priv->s_flags & RVT_S_SEND_ONE) { 5151 priv->s_flags &= ~RVT_S_SEND_ONE; 5152 priv->s_flags |= RVT_S_WAIT_ACK; 5153 bth2 |= IB_BTH_REQ_ACK; 5154 } 5155 qp->s_len -= len; 5156 ps->s_txreq->hdr_dwords = hwords; 5157 ps->s_txreq->sde = priv->s_sde; 5158 ps->s_txreq->ss = ss; 5159 ps->s_txreq->s_cur_size = len; 5160 hfi1_make_ruc_header(qp, ohdr, (opcode << 24), bth1, bth2, 5161 middle, ps); 5162 return 1; 5163 bail: 5164 hfi1_put_txreq(ps->s_txreq); 5165 bail_no_tx: 5166 ps->s_txreq = NULL; 5167 priv->s_flags &= ~RVT_S_BUSY; 5168 /* 5169 * If we didn't get a txreq, the QP will be woken up later to try 5170 * again, set the flags to the the wake up which work item to wake 5171 * up. 5172 * (A better algorithm should be found to do this and generalize the 5173 * sleep/wakeup flags.) 5174 */ 5175 iowait_set_flag(&priv->s_iowait, IOWAIT_PENDING_TID); 5176 return 0; 5177 } 5178 5179 static int make_tid_rdma_ack(struct rvt_qp *qp, 5180 struct ib_other_headers *ohdr, 5181 struct hfi1_pkt_state *ps) 5182 { 5183 struct rvt_ack_entry *e; 5184 struct hfi1_qp_priv *qpriv = qp->priv; 5185 struct hfi1_ibdev *dev = to_idev(qp->ibqp.device); 5186 u32 hwords, next; 5187 u32 len = 0; 5188 u32 bth1 = 0, bth2 = 0; 5189 int middle = 0; 5190 u16 flow; 5191 struct tid_rdma_request *req, *nreq; 5192 5193 trace_hfi1_tid_write_rsp_make_tid_ack(qp); 5194 /* Don't send an ACK if we aren't supposed to. 
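 * (the QP must be in a state that still allows receive-side processing,
 * i.e. RVT_PROCESS_RECV_OK)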
*/ 5195 if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) 5196 goto bail; 5197 5198 /* header size in 32-bit words LRH+BTH = (8+12)/4. */ 5199 hwords = 5; 5200 5201 e = &qp->s_ack_queue[qpriv->r_tid_ack]; 5202 req = ack_to_tid_req(e); 5203 /* 5204 * In the RESYNC case, we are exactly one segment past the 5205 * previously sent ack or at the previously sent NAK. So to send 5206 * the resync ack, we go back one segment (which might be part of 5207 * the previous request) and let the do-while loop execute again. 5208 * The advantage of executing the do-while loop is that any data 5209 * received after the previous ack is automatically acked in the 5210 * RESYNC ack. It turns out that for the do-while loop we only need 5211 * to pull back qpriv->r_tid_ack, not the segment 5212 * indices/counters. The scheme works even if the previous request 5213 * was not a TID WRITE request. 5214 */ 5215 if (qpriv->resync) { 5216 if (!req->ack_seg || req->ack_seg == req->total_segs) 5217 qpriv->r_tid_ack = !qpriv->r_tid_ack ? 5218 rvt_size_atomic(&dev->rdi) : 5219 qpriv->r_tid_ack - 1; 5220 e = &qp->s_ack_queue[qpriv->r_tid_ack]; 5221 req = ack_to_tid_req(e); 5222 } 5223 5224 trace_hfi1_rsp_make_tid_ack(qp, e->psn); 5225 trace_hfi1_tid_req_make_tid_ack(qp, 0, e->opcode, e->psn, e->lpsn, 5226 req); 5227 /* 5228 * If we've sent all the ACKs that we can, we are done 5229 * until we get more segments... 5230 */ 5231 if (!qpriv->s_nak_state && !qpriv->resync && 5232 req->ack_seg == req->comp_seg) 5233 goto bail; 5234 5235 do { 5236 /* 5237 * To deal with coalesced ACKs, the acked_tail pointer 5238 * into the flow array is used. The distance between it 5239 * and the clear_tail is the number of flows that are 5240 * being ACK'ed. 5241 */ 5242 req->ack_seg += 5243 /* Get up-to-date value */ 5244 CIRC_CNT(req->clear_tail, req->acked_tail, 5245 MAX_FLOWS); 5246 /* Advance acked index */ 5247 req->acked_tail = req->clear_tail; 5248 5249 /* 5250 * req->clear_tail points to the segment currently being 5251 * received. So, when sending an ACK, the previous 5252 * segment is being ACK'ed. 5253 */ 5254 flow = CIRC_PREV(req->acked_tail, MAX_FLOWS); 5255 if (req->ack_seg != req->total_segs) 5256 break; 5257 req->state = TID_REQUEST_COMPLETE; 5258 5259 next = qpriv->r_tid_ack + 1; 5260 if (next > rvt_size_atomic(&dev->rdi)) 5261 next = 0; 5262 qpriv->r_tid_ack = next; 5263 if (qp->s_ack_queue[next].opcode != TID_OP(WRITE_REQ)) 5264 break; 5265 nreq = ack_to_tid_req(&qp->s_ack_queue[next]); 5266 if (!nreq->comp_seg || nreq->ack_seg == nreq->comp_seg) 5267 break; 5268 5269 /* Move to the next ack entry now */ 5270 e = &qp->s_ack_queue[qpriv->r_tid_ack]; 5271 req = ack_to_tid_req(e); 5272 } while (1); 5273 5274 /* 5275 * At this point qpriv->r_tid_ack == qpriv->r_tid_tail but e and 5276 * req could be pointing at the previous ack queue entry 5277 */ 5278 if (qpriv->s_nak_state || 5279 (qpriv->resync && 5280 !hfi1_tid_rdma_is_resync_psn(qpriv->r_next_psn_kdeth - 1) && 5281 (cmp_psn(qpriv->r_next_psn_kdeth - 1, 5282 full_flow_psn(&req->flows[flow], 5283 req->flows[flow].flow_state.lpsn)) > 0))) { 5284 /* 5285 * A NAK will implicitly acknowledge all previous TID RDMA 5286 * requests. 
Therefore, we NAK with the req->acked_tail 5287 * segment for the request at qpriv->r_tid_ack (same at 5288 * this point as the req->clear_tail segment for the 5289 * qpriv->r_tid_tail request) 5290 */ 5291 e = &qp->s_ack_queue[qpriv->r_tid_ack]; 5292 req = ack_to_tid_req(e); 5293 flow = req->acked_tail; 5294 } else if (req->ack_seg == req->total_segs && 5295 qpriv->s_flags & HFI1_R_TID_WAIT_INTERLCK) 5296 qpriv->s_flags &= ~HFI1_R_TID_WAIT_INTERLCK; 5297 5298 trace_hfi1_tid_write_rsp_make_tid_ack(qp); 5299 trace_hfi1_tid_req_make_tid_ack(qp, 0, e->opcode, e->psn, e->lpsn, 5300 req); 5301 hwords += hfi1_build_tid_rdma_write_ack(qp, e, ohdr, flow, &bth1, 5302 &bth2); 5303 len = 0; 5304 qpriv->s_flags &= ~RVT_S_ACK_PENDING; 5305 ps->s_txreq->hdr_dwords = hwords; 5306 ps->s_txreq->sde = qpriv->s_sde; 5307 ps->s_txreq->s_cur_size = len; 5308 ps->s_txreq->ss = NULL; 5309 hfi1_make_ruc_header(qp, ohdr, (TID_OP(ACK) << 24), bth1, bth2, middle, 5310 ps); 5311 ps->s_txreq->txreq.flags |= SDMA_TXREQ_F_VIP; 5312 return 1; 5313 bail: 5314 /* 5315 * Ensure s_rdma_ack_cnt changes are committed prior to resetting 5316 * RVT_S_RESP_PENDING 5317 */ 5318 smp_wmb(); 5319 qpriv->s_flags &= ~RVT_S_ACK_PENDING; 5320 return 0; 5321 } 5322 5323 static int hfi1_send_tid_ok(struct rvt_qp *qp) 5324 { 5325 struct hfi1_qp_priv *priv = qp->priv; 5326 5327 return !(priv->s_flags & RVT_S_BUSY || 5328 qp->s_flags & HFI1_S_ANY_WAIT_IO) && 5329 (verbs_txreq_queued(iowait_get_tid_work(&priv->s_iowait)) || 5330 (priv->s_flags & RVT_S_RESP_PENDING) || 5331 !(qp->s_flags & HFI1_S_ANY_TID_WAIT_SEND)); 5332 } 5333 5334 void _hfi1_do_tid_send(struct work_struct *work) 5335 { 5336 struct iowait_work *w = container_of(work, struct iowait_work, iowork); 5337 struct rvt_qp *qp = iowait_to_qp(w->iow); 5338 5339 hfi1_do_tid_send(qp); 5340 } 5341 5342 static void hfi1_do_tid_send(struct rvt_qp *qp) 5343 { 5344 struct hfi1_pkt_state ps; 5345 struct hfi1_qp_priv *priv = qp->priv; 5346 5347 ps.dev = to_idev(qp->ibqp.device); 5348 ps.ibp = to_iport(qp->ibqp.device, qp->port_num); 5349 ps.ppd = ppd_from_ibp(ps.ibp); 5350 ps.wait = iowait_get_tid_work(&priv->s_iowait); 5351 ps.in_thread = false; 5352 ps.timeout_int = qp->timeout_jiffies / 8; 5353 5354 trace_hfi1_rc_do_tid_send(qp, false); 5355 spin_lock_irqsave(&qp->s_lock, ps.flags); 5356 5357 /* Return if we are already busy processing a work request. */ 5358 if (!hfi1_send_tid_ok(qp)) { 5359 if (qp->s_flags & HFI1_S_ANY_WAIT_IO) 5360 iowait_set_flag(&priv->s_iowait, IOWAIT_PENDING_TID); 5361 spin_unlock_irqrestore(&qp->s_lock, ps.flags); 5362 return; 5363 } 5364 5365 priv->s_flags |= RVT_S_BUSY; 5366 5367 ps.timeout = jiffies + ps.timeout_int; 5368 ps.cpu = priv->s_sde ? priv->s_sde->cpu : 5369 cpumask_first(cpumask_of_node(ps.ppd->dd->node)); 5370 ps.pkts_sent = false; 5371 5372 /* insure a pre-built packet is handled */ 5373 ps.s_txreq = get_waiting_verbs_txreq(ps.wait); 5374 do { 5375 /* Check for a constructed packet to be sent. */ 5376 if (ps.s_txreq) { 5377 if (priv->s_flags & HFI1_S_TID_BUSY_SET) { 5378 qp->s_flags |= RVT_S_BUSY; 5379 ps.wait = iowait_get_ib_work(&priv->s_iowait); 5380 } 5381 spin_unlock_irqrestore(&qp->s_lock, ps.flags); 5382 5383 /* 5384 * If the packet cannot be sent now, return and 5385 * the send tasklet will be woken up later. 
5386 */ 5387 if (hfi1_verbs_send(qp, &ps)) 5388 return; 5389 5390 /* allow other tasks to run */ 5391 if (hfi1_schedule_send_yield(qp, &ps, true)) 5392 return; 5393 5394 spin_lock_irqsave(&qp->s_lock, ps.flags); 5395 if (priv->s_flags & HFI1_S_TID_BUSY_SET) { 5396 qp->s_flags &= ~RVT_S_BUSY; 5397 priv->s_flags &= ~HFI1_S_TID_BUSY_SET; 5398 ps.wait = iowait_get_tid_work(&priv->s_iowait); 5399 if (iowait_flag_set(&priv->s_iowait, 5400 IOWAIT_PENDING_IB)) 5401 hfi1_schedule_send(qp); 5402 } 5403 } 5404 } while (hfi1_make_tid_rdma_pkt(qp, &ps)); 5405 iowait_starve_clear(ps.pkts_sent, &priv->s_iowait); 5406 spin_unlock_irqrestore(&qp->s_lock, ps.flags); 5407 } 5408 5409 static bool _hfi1_schedule_tid_send(struct rvt_qp *qp) 5410 { 5411 struct hfi1_qp_priv *priv = qp->priv; 5412 struct hfi1_ibport *ibp = 5413 to_iport(qp->ibqp.device, qp->port_num); 5414 struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); 5415 struct hfi1_devdata *dd = ppd->dd; 5416 5417 if ((dd->flags & HFI1_SHUTDOWN)) 5418 return true; 5419 5420 return iowait_tid_schedule(&priv->s_iowait, ppd->hfi1_wq, 5421 priv->s_sde ? 5422 priv->s_sde->cpu : 5423 cpumask_first(cpumask_of_node(dd->node))); 5424 } 5425 5426 /** 5427 * hfi1_schedule_tid_send - schedule progress on TID RDMA state machine 5428 * @qp: the QP 5429 * 5430 * This schedules qp progress on the TID RDMA state machine. Caller 5431 * should hold the s_lock. 5432 * Unlike hfi1_schedule_send(), this cannot use hfi1_send_ok() because 5433 * the two state machines can step on each other with respect to the 5434 * RVT_S_BUSY flag. 5435 * Therefore, a modified test is used. 5436 * @return true if the second leg is scheduled; 5437 * false if the second leg is not scheduled. 5438 */ 5439 bool hfi1_schedule_tid_send(struct rvt_qp *qp) 5440 { 5441 lockdep_assert_held(&qp->s_lock); 5442 if (hfi1_send_tid_ok(qp)) { 5443 /* 5444 * The following call returns true if the qp is not on the 5445 * queue and false if the qp is already on the queue before 5446 * this call. Either way, the qp will be on the queue when the 5447 * call returns. 5448 */ 5449 _hfi1_schedule_tid_send(qp); 5450 return true; 5451 } 5452 if (qp->s_flags & HFI1_S_ANY_WAIT_IO) 5453 iowait_set_flag(&((struct hfi1_qp_priv *)qp->priv)->s_iowait, 5454 IOWAIT_PENDING_TID); 5455 return false; 5456 } 5457 5458 bool hfi1_tid_rdma_ack_interlock(struct rvt_qp *qp, struct rvt_ack_entry *e) 5459 { 5460 struct rvt_ack_entry *prev; 5461 struct tid_rdma_request *req; 5462 struct hfi1_ibdev *dev = to_idev(qp->ibqp.device); 5463 struct hfi1_qp_priv *priv = qp->priv; 5464 u32 s_prev; 5465 5466 s_prev = qp->s_tail_ack_queue == 0 ? rvt_size_atomic(&dev->rdi) : 5467 (qp->s_tail_ack_queue - 1); 5468 prev = &qp->s_ack_queue[s_prev]; 5469 5470 if ((e->opcode == TID_OP(READ_REQ) || 5471 e->opcode == OP(RDMA_READ_REQUEST)) && 5472 prev->opcode == TID_OP(WRITE_REQ)) { 5473 req = ack_to_tid_req(prev); 5474 if (req->ack_seg != req->total_segs) { 5475 priv->s_flags |= HFI1_R_TID_WAIT_INTERLCK; 5476 return true; 5477 } 5478 } 5479 return false; 5480 } 5481 5482 static u32 read_r_next_psn(struct hfi1_devdata *dd, u8 ctxt, u8 fidx) 5483 { 5484 u64 reg; 5485 5486 /* 5487 * The only sane way to get the amount of 5488 * progress is to read the HW flow state. 
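 * Each flow has one 64-bit entry in the per-context RCV_TID_FLOW_TABLE,
 * so flow fidx is read at byte offset 8 * fidx and the PSN is taken from
 * the low bits of the register.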
5489 */ 5490 reg = read_uctxt_csr(dd, ctxt, RCV_TID_FLOW_TABLE + (8 * fidx)); 5491 return mask_psn(reg); 5492 } 5493 5494 static void tid_rdma_rcv_err(struct hfi1_packet *packet, 5495 struct ib_other_headers *ohdr, 5496 struct rvt_qp *qp, u32 psn, int diff, bool fecn) 5497 { 5498 unsigned long flags; 5499 5500 tid_rdma_rcv_error(packet, ohdr, qp, psn, diff); 5501 if (fecn) { 5502 spin_lock_irqsave(&qp->s_lock, flags); 5503 qp->s_flags |= RVT_S_ECN; 5504 spin_unlock_irqrestore(&qp->s_lock, flags); 5505 } 5506 } 5507 5508 static void update_r_next_psn_fecn(struct hfi1_packet *packet, 5509 struct hfi1_qp_priv *priv, 5510 struct hfi1_ctxtdata *rcd, 5511 struct tid_rdma_flow *flow, 5512 bool fecn) 5513 { 5514 /* 5515 * If a start/middle packet is delivered here due to 5516 * RSM rule and FECN, we need to update the r_next_psn. 5517 */ 5518 if (fecn && packet->etype == RHF_RCV_TYPE_EAGER && 5519 !(priv->s_flags & HFI1_R_TID_SW_PSN)) { 5520 struct hfi1_devdata *dd = rcd->dd; 5521 5522 flow->flow_state.r_next_psn = 5523 read_r_next_psn(dd, rcd->ctxt, flow->idx); 5524 } 5525 } 5526
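/*
 * A minimal, self-contained userspace sketch (not driver code) of the
 * KDETH flow PSN arithmetic used throughout this file: a flow PSN is a
 * generation number concatenated with a per-generation sequence number,
 * the last sequence of every generation is reserved for RESYNC, and a
 * segment may only be allocated if it fits below that reserved value.
 * The 11-bit sequence width, the 24-bit PSN mask and every ex_* name
 * below are illustrative assumptions, not the driver's definitions.
 */
#include <inttypes.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define EX_KDETH_SEQ_SHIFT 11                              /* assumed */
#define EX_KDETH_SEQ_MASK  ((UINT32_C(1) << EX_KDETH_SEQ_SHIFT) - 1)
#define EX_MAX_FLOW_PSN    (UINT32_C(1) << EX_KDETH_SEQ_SHIFT)
#define EX_PSN_MASK        UINT32_C(0xFFFFFF)              /* assumed */

static uint32_t ex_mask_psn(uint32_t psn)
{
	return psn & EX_PSN_MASK;
}

/* Compose a full flow PSN from a generation and a sequence number. */
static uint32_t ex_full_flow_psn(uint32_t generation, uint32_t seq)
{
	return ex_mask_psn((generation << EX_KDETH_SEQ_SHIFT) |
			   (seq & EX_KDETH_SEQ_MASK));
}

/* The last sequence number of a generation is the RESYNC PSN. */
static bool ex_is_resync_psn(uint32_t psn)
{
	return (psn & EX_KDETH_SEQ_MASK) == EX_KDETH_SEQ_MASK;
}

/*
 * Mirror of the sync-point test: a segment of npkts packets fits only
 * if it ends before the reserved RESYNC sequence number.
 */
static bool ex_segment_fits(uint32_t next_seq, uint32_t npkts)
{
	return next_seq + npkts <= EX_MAX_FLOW_PSN - 1;
}

int main(void)
{
	uint32_t gen = 3, spsn = 5, npkts = 4;
	uint32_t lpsn = spsn + npkts - 1;
	uint32_t resync = ex_full_flow_psn(gen, EX_KDETH_SEQ_MASK);

	printf("segment PSNs 0x%" PRIx32 "..0x%" PRIx32 "\n",
	       ex_full_flow_psn(gen, spsn), ex_full_flow_psn(gen, lpsn));
	printf("RESYNC PSN of generation %" PRIu32 " is 0x%" PRIx32 " (%d)\n",
	       gen, resync, ex_is_resync_psn(resync));
	printf("segment of %" PRIu32 " packets at sequence %" PRIu32 " fits: %d\n",
	       npkts, spsn, ex_segment_fits(spsn, npkts));
	return 0;
}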