1 /* 2 * Copyright(c) 2015 - 2018 Intel Corporation. 3 * 4 * This file is provided under a dual BSD/GPLv2 license. When using or 5 * redistributing this file, you may do so under either license. 6 * 7 * GPL LICENSE SUMMARY 8 * 9 * This program is free software; you can redistribute it and/or modify 10 * it under the terms of version 2 of the GNU General Public License as 11 * published by the Free Software Foundation. 12 * 13 * This program is distributed in the hope that it will be useful, but 14 * WITHOUT ANY WARRANTY; without even the implied warranty of 15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 16 * General Public License for more details. 17 * 18 * BSD LICENSE 19 * 20 * Redistribution and use in source and binary forms, with or without 21 * modification, are permitted provided that the following conditions 22 * are met: 23 * 24 * - Redistributions of source code must retain the above copyright 25 * notice, this list of conditions and the following disclaimer. 26 * - Redistributions in binary form must reproduce the above copyright 27 * notice, this list of conditions and the following disclaimer in 28 * the documentation and/or other materials provided with the 29 * distribution. 30 * - Neither the name of Intel Corporation nor the names of its 31 * contributors may be used to endorse or promote products derived 32 * from this software without specific prior written permission. 33 * 34 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 35 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 36 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 37 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 38 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 39 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 40 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 41 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 42 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 43 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 44 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 45 * 46 */ 47 48 #include <linux/io.h> 49 #include <rdma/rdma_vt.h> 50 #include <rdma/rdmavt_qp.h> 51 52 #include "hfi.h" 53 #include "qp.h" 54 #include "rc.h" 55 #include "verbs_txreq.h" 56 #include "trace.h" 57 58 struct rvt_ack_entry *find_prev_entry(struct rvt_qp *qp, u32 psn, u8 *prev, 59 u8 *prev_ack, bool *scheduled) 60 __must_hold(&qp->s_lock) 61 { 62 struct rvt_ack_entry *e = NULL; 63 u8 i, p; 64 bool s = true; 65 66 for (i = qp->r_head_ack_queue; ; i = p) { 67 if (i == qp->s_tail_ack_queue) 68 s = false; 69 if (i) 70 p = i - 1; 71 else 72 p = rvt_size_atomic(ib_to_rvt(qp->ibqp.device)); 73 if (p == qp->r_head_ack_queue) { 74 e = NULL; 75 break; 76 } 77 e = &qp->s_ack_queue[p]; 78 if (!e->opcode) { 79 e = NULL; 80 break; 81 } 82 if (cmp_psn(psn, e->psn) >= 0) { 83 if (p == qp->s_tail_ack_queue && 84 cmp_psn(psn, e->lpsn) <= 0) 85 s = false; 86 break; 87 } 88 } 89 if (prev) 90 *prev = p; 91 if (prev_ack) 92 *prev_ack = i; 93 if (scheduled) 94 *scheduled = s; 95 return e; 96 } 97 98 /** 99 * make_rc_ack - construct a response packet (ACK, NAK, or RDMA read) 100 * @dev: the device for this QP 101 * @qp: a pointer to the QP 102 * @ohdr: a pointer to the IB header being constructed 103 * @ps: the xmit packet state 104 * 105 * Return 1 if constructed; otherwise, return 0. 
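 *
 * For reference, a sketch of the caller pattern (mirroring the check in
 * hfi1_make_rc_req() later in this file, nothing new introduced): the
 * return value is what lets a pending response preempt new requests.
 *
 *        if ((qp->s_flags & RVT_S_RESP_PENDING) &&
 *            make_rc_ack(dev, qp, ohdr, ps))
 *                return 1;
 *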
106 * Note that we are in the responder's side of the QP context. 107 * Note the QP s_lock must be held. 108 */ 109 static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp, 110 struct ib_other_headers *ohdr, 111 struct hfi1_pkt_state *ps) 112 { 113 struct rvt_ack_entry *e; 114 u32 hwords, hdrlen; 115 u32 len = 0; 116 u32 bth0 = 0, bth2 = 0; 117 u32 bth1 = qp->remote_qpn | (HFI1_CAP_IS_KSET(OPFN) << IB_BTHE_E_SHIFT); 118 int middle = 0; 119 u32 pmtu = qp->pmtu; 120 struct hfi1_qp_priv *qpriv = qp->priv; 121 bool last_pkt; 122 u32 delta; 123 u8 next = qp->s_tail_ack_queue; 124 struct tid_rdma_request *req; 125 126 trace_hfi1_rsp_make_rc_ack(qp, 0); 127 lockdep_assert_held(&qp->s_lock); 128 /* Don't send an ACK if we aren't supposed to. */ 129 if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) 130 goto bail; 131 132 if (qpriv->hdr_type == HFI1_PKT_TYPE_9B) 133 /* header size in 32-bit words LRH+BTH = (8+12)/4. */ 134 hwords = 5; 135 else 136 /* header size in 32-bit words 16B LRH+BTH = (16+12)/4. */ 137 hwords = 7; 138 139 switch (qp->s_ack_state) { 140 case OP(RDMA_READ_RESPONSE_LAST): 141 case OP(RDMA_READ_RESPONSE_ONLY): 142 e = &qp->s_ack_queue[qp->s_tail_ack_queue]; 143 if (e->rdma_sge.mr) { 144 rvt_put_mr(e->rdma_sge.mr); 145 e->rdma_sge.mr = NULL; 146 } 147 /* FALLTHROUGH */ 148 case OP(ATOMIC_ACKNOWLEDGE): 149 /* 150 * We can increment the tail pointer now that the last 151 * response has been sent instead of only being 152 * constructed. 153 */ 154 if (++next > rvt_size_atomic(&dev->rdi)) 155 next = 0; 156 /* 157 * Only advance the s_acked_ack_queue pointer if there 158 * have been no TID RDMA requests. 159 */ 160 e = &qp->s_ack_queue[qp->s_tail_ack_queue]; 161 if (e->opcode != TID_OP(WRITE_REQ) && 162 qp->s_acked_ack_queue == qp->s_tail_ack_queue) 163 qp->s_acked_ack_queue = next; 164 qp->s_tail_ack_queue = next; 165 trace_hfi1_rsp_make_rc_ack(qp, e->psn); 166 /* FALLTHROUGH */ 167 case OP(SEND_ONLY): 168 case OP(ACKNOWLEDGE): 169 /* Check for no next entry in the queue. */ 170 if (qp->r_head_ack_queue == qp->s_tail_ack_queue) { 171 if (qp->s_flags & RVT_S_ACK_PENDING) 172 goto normal; 173 goto bail; 174 } 175 176 e = &qp->s_ack_queue[qp->s_tail_ack_queue]; 177 /* Check for tid write fence */ 178 if ((qpriv->s_flags & HFI1_R_TID_WAIT_INTERLCK) || 179 hfi1_tid_rdma_ack_interlock(qp, e)) { 180 iowait_set_flag(&qpriv->s_iowait, IOWAIT_PENDING_IB); 181 goto bail; 182 } 183 if (e->opcode == OP(RDMA_READ_REQUEST)) { 184 /* 185 * If a RDMA read response is being resent and 186 * we haven't seen the duplicate request yet, 187 * then stop sending the remaining responses the 188 * responder has seen until the requester re-sends it. 
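 *
 * A minimal sketch of the recovery done just below (same fields as the
 * code that follows; the s_acked_ack_queue adjustment is omitted here):
 * the tail is moved up to the head so the ack queue looks empty until
 * the duplicate request re-arms this entry.
 *
 *        qp->s_tail_ack_queue = qp->r_head_ack_queue;
 *        goto bail;
 *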
189 */ 190 len = e->rdma_sge.sge_length; 191 if (len && !e->rdma_sge.mr) { 192 if (qp->s_acked_ack_queue == 193 qp->s_tail_ack_queue) 194 qp->s_acked_ack_queue = 195 qp->r_head_ack_queue; 196 qp->s_tail_ack_queue = qp->r_head_ack_queue; 197 goto bail; 198 } 199 /* Copy SGE state in case we need to resend */ 200 ps->s_txreq->mr = e->rdma_sge.mr; 201 if (ps->s_txreq->mr) 202 rvt_get_mr(ps->s_txreq->mr); 203 qp->s_ack_rdma_sge.sge = e->rdma_sge; 204 qp->s_ack_rdma_sge.num_sge = 1; 205 ps->s_txreq->ss = &qp->s_ack_rdma_sge; 206 if (len > pmtu) { 207 len = pmtu; 208 qp->s_ack_state = OP(RDMA_READ_RESPONSE_FIRST); 209 } else { 210 qp->s_ack_state = OP(RDMA_READ_RESPONSE_ONLY); 211 e->sent = 1; 212 } 213 ohdr->u.aeth = rvt_compute_aeth(qp); 214 hwords++; 215 qp->s_ack_rdma_psn = e->psn; 216 bth2 = mask_psn(qp->s_ack_rdma_psn++); 217 } else if (e->opcode == TID_OP(WRITE_REQ)) { 218 /* 219 * If a TID RDMA WRITE RESP is being resent, we have to 220 * wait for the actual request. All requests that are to 221 * be resent will have their state set to 222 * TID_REQUEST_RESEND. When the new request arrives, the 223 * state will be changed to TID_REQUEST_RESEND_ACTIVE. 224 */ 225 req = ack_to_tid_req(e); 226 if (req->state == TID_REQUEST_RESEND || 227 req->state == TID_REQUEST_INIT_RESEND) 228 goto bail; 229 qp->s_ack_state = TID_OP(WRITE_RESP); 230 qp->s_ack_rdma_psn = mask_psn(e->psn + req->cur_seg); 231 goto write_resp; 232 } else if (e->opcode == TID_OP(READ_REQ)) { 233 /* 234 * If a TID RDMA read response is being resent and 235 * we haven't seen the duplicate request yet, 236 * then stop sending the remaining responses the 237 * responder has seen until the requester re-sends it. 238 */ 239 len = e->rdma_sge.sge_length; 240 if (len && !e->rdma_sge.mr) { 241 if (qp->s_acked_ack_queue == 242 qp->s_tail_ack_queue) 243 qp->s_acked_ack_queue = 244 qp->r_head_ack_queue; 245 qp->s_tail_ack_queue = qp->r_head_ack_queue; 246 goto bail; 247 } 248 /* Copy SGE state in case we need to resend */ 249 ps->s_txreq->mr = e->rdma_sge.mr; 250 if (ps->s_txreq->mr) 251 rvt_get_mr(ps->s_txreq->mr); 252 qp->s_ack_rdma_sge.sge = e->rdma_sge; 253 qp->s_ack_rdma_sge.num_sge = 1; 254 qp->s_ack_state = TID_OP(READ_RESP); 255 goto read_resp; 256 } else { 257 /* COMPARE_SWAP or FETCH_ADD */ 258 ps->s_txreq->ss = NULL; 259 len = 0; 260 qp->s_ack_state = OP(ATOMIC_ACKNOWLEDGE); 261 ohdr->u.at.aeth = rvt_compute_aeth(qp); 262 ib_u64_put(e->atomic_data, &ohdr->u.at.atomic_ack_eth); 263 hwords += sizeof(ohdr->u.at) / sizeof(u32); 264 bth2 = mask_psn(e->psn); 265 e->sent = 1; 266 } 267 trace_hfi1_tid_write_rsp_make_rc_ack(qp); 268 bth0 = qp->s_ack_state << 24; 269 break; 270 271 case OP(RDMA_READ_RESPONSE_FIRST): 272 qp->s_ack_state = OP(RDMA_READ_RESPONSE_MIDDLE); 273 /* FALLTHROUGH */ 274 case OP(RDMA_READ_RESPONSE_MIDDLE): 275 ps->s_txreq->ss = &qp->s_ack_rdma_sge; 276 ps->s_txreq->mr = qp->s_ack_rdma_sge.sge.mr; 277 if (ps->s_txreq->mr) 278 rvt_get_mr(ps->s_txreq->mr); 279 len = qp->s_ack_rdma_sge.sge.sge_length; 280 if (len > pmtu) { 281 len = pmtu; 282 middle = HFI1_CAP_IS_KSET(SDMA_AHG); 283 } else { 284 ohdr->u.aeth = rvt_compute_aeth(qp); 285 hwords++; 286 qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST); 287 e = &qp->s_ack_queue[qp->s_tail_ack_queue]; 288 e->sent = 1; 289 } 290 bth0 = qp->s_ack_state << 24; 291 bth2 = mask_psn(qp->s_ack_rdma_psn++); 292 break; 293 294 case TID_OP(WRITE_RESP): 295 write_resp: 296 /* 297 * 1. Check if RVT_S_ACK_PENDING is set. If yes, 298 * goto normal. 299 * 2. Attempt to allocate TID resources. 
300 * 3. Remove RVT_S_RESP_PENDING flags from s_flags 301 * 4. If resources not available: 302 * 4.1 Set RVT_S_WAIT_TID_SPACE 303 * 4.2 Queue QP on RCD TID queue 304 * 4.3 Put QP on iowait list. 305 * 4.4 Build IB RNR NAK with appropriate timeout value 306 * 4.5 Return indication progress made. 307 * 5. If resources are available: 308 * 5.1 Program HW flow CSRs 309 * 5.2 Build TID RDMA WRITE RESP packet 310 * 5.3 If more resources needed, do 2.1 - 2.3. 311 * 5.4 Wake up next QP on RCD TID queue. 312 * 5.5 Return indication progress made. 313 */ 314 315 e = &qp->s_ack_queue[qp->s_tail_ack_queue]; 316 req = ack_to_tid_req(e); 317 318 /* 319 * Send scheduled RNR NAK's. RNR NAK's need to be sent at 320 * segment boundaries, not at request boundaries. Don't change 321 * s_ack_state because we are still in the middle of a request 322 */ 323 if (qpriv->rnr_nak_state == TID_RNR_NAK_SEND && 324 qp->s_tail_ack_queue == qpriv->r_tid_alloc && 325 req->cur_seg == req->alloc_seg) { 326 qpriv->rnr_nak_state = TID_RNR_NAK_SENT; 327 goto normal_no_state; 328 } 329 330 bth2 = mask_psn(qp->s_ack_rdma_psn); 331 hdrlen = hfi1_build_tid_rdma_write_resp(qp, e, ohdr, &bth1, 332 bth2, &len, 333 &ps->s_txreq->ss); 334 if (!hdrlen) 335 return 0; 336 337 hwords += hdrlen; 338 bth0 = qp->s_ack_state << 24; 339 qp->s_ack_rdma_psn++; 340 trace_hfi1_tid_req_make_rc_ack_write(qp, 0, e->opcode, e->psn, 341 e->lpsn, req); 342 if (req->cur_seg != req->total_segs) 343 break; 344 345 e->sent = 1; 346 qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST); 347 break; 348 349 case TID_OP(READ_RESP): 350 read_resp: 351 e = &qp->s_ack_queue[qp->s_tail_ack_queue]; 352 ps->s_txreq->ss = &qp->s_ack_rdma_sge; 353 delta = hfi1_build_tid_rdma_read_resp(qp, e, ohdr, &bth0, 354 &bth1, &bth2, &len, 355 &last_pkt); 356 if (delta == 0) 357 goto error_qp; 358 hwords += delta; 359 if (last_pkt) { 360 e->sent = 1; 361 /* 362 * Increment qp->s_tail_ack_queue through s_ack_state 363 * transition. 364 */ 365 qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST); 366 } 367 break; 368 case TID_OP(READ_REQ): 369 goto bail; 370 371 default: 372 normal: 373 /* 374 * Send a regular ACK. 375 * Set the s_ack_state so we wait until after sending 376 * the ACK before setting s_ack_state to ACKNOWLEDGE 377 * (see above). 
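 *
 * For reference, a sketch of how the AETH is composed below when a NAK
 * is outstanding (same fields as the code that follows):
 *
 *        aeth = cpu_to_be32((qp->r_msn & IB_MSN_MASK) |
 *                           (qp->s_nak_state << IB_AETH_CREDIT_SHIFT));
 *
 * otherwise rvt_compute_aeth(qp) supplies the credit/MSN value instead.
 *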
378 */ 379 qp->s_ack_state = OP(SEND_ONLY); 380 normal_no_state: 381 if (qp->s_nak_state) 382 ohdr->u.aeth = 383 cpu_to_be32((qp->r_msn & IB_MSN_MASK) | 384 (qp->s_nak_state << 385 IB_AETH_CREDIT_SHIFT)); 386 else 387 ohdr->u.aeth = rvt_compute_aeth(qp); 388 hwords++; 389 len = 0; 390 bth0 = OP(ACKNOWLEDGE) << 24; 391 bth2 = mask_psn(qp->s_ack_psn); 392 qp->s_flags &= ~RVT_S_ACK_PENDING; 393 ps->s_txreq->txreq.flags |= SDMA_TXREQ_F_VIP; 394 ps->s_txreq->ss = NULL; 395 } 396 qp->s_rdma_ack_cnt++; 397 ps->s_txreq->sde = qpriv->s_sde; 398 ps->s_txreq->s_cur_size = len; 399 ps->s_txreq->hdr_dwords = hwords; 400 hfi1_make_ruc_header(qp, ohdr, bth0, bth1, bth2, middle, ps); 401 return 1; 402 error_qp: 403 spin_unlock_irqrestore(&qp->s_lock, ps->flags); 404 spin_lock_irqsave(&qp->r_lock, ps->flags); 405 spin_lock(&qp->s_lock); 406 rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR); 407 spin_unlock(&qp->s_lock); 408 spin_unlock_irqrestore(&qp->r_lock, ps->flags); 409 spin_lock_irqsave(&qp->s_lock, ps->flags); 410 bail: 411 qp->s_ack_state = OP(ACKNOWLEDGE); 412 /* 413 * Ensure s_rdma_ack_cnt changes are committed prior to resetting 414 * RVT_S_RESP_PENDING 415 */ 416 smp_wmb(); 417 qp->s_flags &= ~(RVT_S_RESP_PENDING 418 | RVT_S_ACK_PENDING 419 | HFI1_S_AHG_VALID); 420 return 0; 421 } 422 423 /** 424 * hfi1_make_rc_req - construct a request packet (SEND, RDMA r/w, ATOMIC) 425 * @qp: a pointer to the QP 426 * 427 * Assumes s_lock is held. 428 * 429 * Return 1 if constructed; otherwise, return 0. 430 */ 431 int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) 432 { 433 struct hfi1_qp_priv *priv = qp->priv; 434 struct hfi1_ibdev *dev = to_idev(qp->ibqp.device); 435 struct ib_other_headers *ohdr; 436 struct rvt_sge_state *ss = NULL; 437 struct rvt_swqe *wqe; 438 struct hfi1_swqe_priv *wpriv; 439 struct tid_rdma_request *req = NULL; 440 /* header size in 32-bit words LRH+BTH = (8+12)/4. */ 441 u32 hwords = 5; 442 u32 len = 0; 443 u32 bth0 = 0, bth2 = 0; 444 u32 bth1 = qp->remote_qpn | (HFI1_CAP_IS_KSET(OPFN) << IB_BTHE_E_SHIFT); 445 u32 pmtu = qp->pmtu; 446 char newreq; 447 int middle = 0; 448 int delta; 449 struct tid_rdma_flow *flow = NULL; 450 struct tid_rdma_params *remote; 451 452 trace_hfi1_sender_make_rc_req(qp); 453 lockdep_assert_held(&qp->s_lock); 454 ps->s_txreq = get_txreq(ps->dev, qp); 455 if (!ps->s_txreq) 456 goto bail_no_tx; 457 458 if (priv->hdr_type == HFI1_PKT_TYPE_9B) { 459 /* header size in 32-bit words LRH+BTH = (8+12)/4. */ 460 hwords = 5; 461 if (rdma_ah_get_ah_flags(&qp->remote_ah_attr) & IB_AH_GRH) 462 ohdr = &ps->s_txreq->phdr.hdr.ibh.u.l.oth; 463 else 464 ohdr = &ps->s_txreq->phdr.hdr.ibh.u.oth; 465 } else { 466 /* header size in 32-bit words 16B LRH+BTH = (16+12)/4. */ 467 hwords = 7; 468 if ((rdma_ah_get_ah_flags(&qp->remote_ah_attr) & IB_AH_GRH) && 469 (hfi1_check_mcast(rdma_ah_get_dlid(&qp->remote_ah_attr)))) 470 ohdr = &ps->s_txreq->phdr.hdr.opah.u.l.oth; 471 else 472 ohdr = &ps->s_txreq->phdr.hdr.opah.u.oth; 473 } 474 475 /* Sending responses has higher priority over sending requests. */ 476 if ((qp->s_flags & RVT_S_RESP_PENDING) && 477 make_rc_ack(dev, qp, ohdr, ps)) 478 return 1; 479 480 if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_SEND_OK)) { 481 if (!(ib_rvt_state_ops[qp->state] & RVT_FLUSH_SEND)) 482 goto bail; 483 /* We are in the error state, flush the work request. */ 484 if (qp->s_last == READ_ONCE(qp->s_head)) 485 goto bail; 486 /* If DMAs are in progress, we can't flush immediately. 
*/ 487 if (iowait_sdma_pending(&priv->s_iowait)) { 488 qp->s_flags |= RVT_S_WAIT_DMA; 489 goto bail; 490 } 491 clear_ahg(qp); 492 wqe = rvt_get_swqe_ptr(qp, qp->s_last); 493 hfi1_trdma_send_complete(qp, wqe, qp->s_last != qp->s_acked ? 494 IB_WC_SUCCESS : IB_WC_WR_FLUSH_ERR); 495 /* will get called again */ 496 goto done_free_tx; 497 } 498 499 if (qp->s_flags & (RVT_S_WAIT_RNR | RVT_S_WAIT_ACK | HFI1_S_WAIT_HALT)) 500 goto bail; 501 502 if (cmp_psn(qp->s_psn, qp->s_sending_hpsn) <= 0) { 503 if (cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) <= 0) { 504 qp->s_flags |= RVT_S_WAIT_PSN; 505 goto bail; 506 } 507 qp->s_sending_psn = qp->s_psn; 508 qp->s_sending_hpsn = qp->s_psn - 1; 509 } 510 511 /* Send a request. */ 512 wqe = rvt_get_swqe_ptr(qp, qp->s_cur); 513 check_s_state: 514 switch (qp->s_state) { 515 default: 516 if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_NEXT_SEND_OK)) 517 goto bail; 518 /* 519 * Resend an old request or start a new one. 520 * 521 * We keep track of the current SWQE so that 522 * we don't reset the "furthest progress" state 523 * if we need to back up. 524 */ 525 newreq = 0; 526 if (qp->s_cur == qp->s_tail) { 527 /* Check if send work queue is empty. */ 528 if (qp->s_tail == READ_ONCE(qp->s_head)) { 529 clear_ahg(qp); 530 goto bail; 531 } 532 /* 533 * If a fence is requested, wait for previous 534 * RDMA read and atomic operations to finish. 535 * However, there is no need to guard against 536 * TID RDMA READ after TID RDMA READ. 537 */ 538 if ((wqe->wr.send_flags & IB_SEND_FENCE) && 539 qp->s_num_rd_atomic && 540 (wqe->wr.opcode != IB_WR_TID_RDMA_READ || 541 priv->pending_tid_r_segs < qp->s_num_rd_atomic)) { 542 qp->s_flags |= RVT_S_WAIT_FENCE; 543 goto bail; 544 } 545 /* 546 * Local operations are processed immediately 547 * after all prior requests have completed 548 */ 549 if (wqe->wr.opcode == IB_WR_REG_MR || 550 wqe->wr.opcode == IB_WR_LOCAL_INV) { 551 int local_ops = 0; 552 int err = 0; 553 554 if (qp->s_last != qp->s_cur) 555 goto bail; 556 if (++qp->s_cur == qp->s_size) 557 qp->s_cur = 0; 558 if (++qp->s_tail == qp->s_size) 559 qp->s_tail = 0; 560 if (!(wqe->wr.send_flags & 561 RVT_SEND_COMPLETION_ONLY)) { 562 err = rvt_invalidate_rkey( 563 qp, 564 wqe->wr.ex.invalidate_rkey); 565 local_ops = 1; 566 } 567 rvt_send_complete(qp, wqe, 568 err ? IB_WC_LOC_PROT_ERR 569 : IB_WC_SUCCESS); 570 if (local_ops) 571 atomic_dec(&qp->local_ops_pending); 572 goto done_free_tx; 573 } 574 575 newreq = 1; 576 qp->s_psn = wqe->psn; 577 } 578 /* 579 * Note that we have to be careful not to modify the 580 * original work request since we may need to resend 581 * it. 582 */ 583 len = wqe->length; 584 ss = &qp->s_sge; 585 bth2 = mask_psn(qp->s_psn); 586 587 /* 588 * Interlock between various IB requests and TID RDMA 589 * if necessary. 590 */ 591 if ((priv->s_flags & HFI1_S_TID_WAIT_INTERLCK) || 592 hfi1_tid_rdma_wqe_interlock(qp, wqe)) 593 goto bail; 594 595 switch (wqe->wr.opcode) { 596 case IB_WR_SEND: 597 case IB_WR_SEND_WITH_IMM: 598 case IB_WR_SEND_WITH_INV: 599 /* If no credit, return. 
*/ 600 if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT) && 601 rvt_cmp_msn(wqe->ssn, qp->s_lsn + 1) > 0) { 602 qp->s_flags |= RVT_S_WAIT_SSN_CREDIT; 603 goto bail; 604 } 605 if (len > pmtu) { 606 qp->s_state = OP(SEND_FIRST); 607 len = pmtu; 608 break; 609 } 610 if (wqe->wr.opcode == IB_WR_SEND) { 611 qp->s_state = OP(SEND_ONLY); 612 } else if (wqe->wr.opcode == IB_WR_SEND_WITH_IMM) { 613 qp->s_state = OP(SEND_ONLY_WITH_IMMEDIATE); 614 /* Immediate data comes after the BTH */ 615 ohdr->u.imm_data = wqe->wr.ex.imm_data; 616 hwords += 1; 617 } else { 618 qp->s_state = OP(SEND_ONLY_WITH_INVALIDATE); 619 /* Invalidate rkey comes after the BTH */ 620 ohdr->u.ieth = cpu_to_be32( 621 wqe->wr.ex.invalidate_rkey); 622 hwords += 1; 623 } 624 if (wqe->wr.send_flags & IB_SEND_SOLICITED) 625 bth0 |= IB_BTH_SOLICITED; 626 bth2 |= IB_BTH_REQ_ACK; 627 if (++qp->s_cur == qp->s_size) 628 qp->s_cur = 0; 629 break; 630 631 case IB_WR_RDMA_WRITE: 632 if (newreq && !(qp->s_flags & RVT_S_UNLIMITED_CREDIT)) 633 qp->s_lsn++; 634 goto no_flow_control; 635 case IB_WR_RDMA_WRITE_WITH_IMM: 636 /* If no credit, return. */ 637 if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT) && 638 rvt_cmp_msn(wqe->ssn, qp->s_lsn + 1) > 0) { 639 qp->s_flags |= RVT_S_WAIT_SSN_CREDIT; 640 goto bail; 641 } 642 no_flow_control: 643 put_ib_reth_vaddr( 644 wqe->rdma_wr.remote_addr, 645 &ohdr->u.rc.reth); 646 ohdr->u.rc.reth.rkey = 647 cpu_to_be32(wqe->rdma_wr.rkey); 648 ohdr->u.rc.reth.length = cpu_to_be32(len); 649 hwords += sizeof(struct ib_reth) / sizeof(u32); 650 if (len > pmtu) { 651 qp->s_state = OP(RDMA_WRITE_FIRST); 652 len = pmtu; 653 break; 654 } 655 if (wqe->wr.opcode == IB_WR_RDMA_WRITE) { 656 qp->s_state = OP(RDMA_WRITE_ONLY); 657 } else { 658 qp->s_state = 659 OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE); 660 /* Immediate data comes after RETH */ 661 ohdr->u.rc.imm_data = wqe->wr.ex.imm_data; 662 hwords += 1; 663 if (wqe->wr.send_flags & IB_SEND_SOLICITED) 664 bth0 |= IB_BTH_SOLICITED; 665 } 666 bth2 |= IB_BTH_REQ_ACK; 667 if (++qp->s_cur == qp->s_size) 668 qp->s_cur = 0; 669 break; 670 671 case IB_WR_TID_RDMA_WRITE: 672 if (newreq) { 673 /* 674 * Limit the number of TID RDMA WRITE requests. 675 */ 676 if (atomic_read(&priv->n_tid_requests) >= 677 HFI1_TID_RDMA_WRITE_CNT) 678 goto bail; 679 680 if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT)) 681 qp->s_lsn++; 682 } 683 684 hwords += hfi1_build_tid_rdma_write_req(qp, wqe, ohdr, 685 &bth1, &bth2, 686 &len); 687 ss = NULL; 688 if (priv->s_tid_cur == HFI1_QP_WQE_INVALID) { 689 priv->s_tid_cur = qp->s_cur; 690 if (priv->s_tid_tail == HFI1_QP_WQE_INVALID) { 691 priv->s_tid_tail = qp->s_cur; 692 priv->s_state = TID_OP(WRITE_RESP); 693 } 694 } else if (priv->s_tid_cur == priv->s_tid_head) { 695 struct rvt_swqe *__w; 696 struct tid_rdma_request *__r; 697 698 __w = rvt_get_swqe_ptr(qp, priv->s_tid_cur); 699 __r = wqe_to_tid_req(__w); 700 701 /* 702 * The s_tid_cur pointer is advanced to s_cur if 703 * any of the following conditions about the WQE 704 * to which s_ti_cur currently points to are 705 * satisfied: 706 * 1. The request is not a TID RDMA WRITE 707 * request, 708 * 2. The request is in the INACTIVE or 709 * COMPLETE states (TID RDMA READ requests 710 * stay at INACTIVE and TID RDMA WRITE 711 * transition to COMPLETE when done), 712 * 3. The request is in the ACTIVE or SYNC 713 * state and the number of completed 714 * segments is equal to the total segment 715 * count. 716 * (If ACTIVE, the request is waiting for 717 * ACKs. 
If SYNC, the request has not 718 * received any responses because it's 719 * waiting on a sync point.) 720 */ 721 if (__w->wr.opcode != IB_WR_TID_RDMA_WRITE || 722 __r->state == TID_REQUEST_INACTIVE || 723 __r->state == TID_REQUEST_COMPLETE || 724 ((__r->state == TID_REQUEST_ACTIVE || 725 __r->state == TID_REQUEST_SYNC) && 726 __r->comp_seg == __r->total_segs)) { 727 if (priv->s_tid_tail == 728 priv->s_tid_cur && 729 priv->s_state == 730 TID_OP(WRITE_DATA_LAST)) { 731 priv->s_tid_tail = qp->s_cur; 732 priv->s_state = 733 TID_OP(WRITE_RESP); 734 } 735 priv->s_tid_cur = qp->s_cur; 736 } 737 /* 738 * A corner case: when the last TID RDMA WRITE 739 * request was completed, s_tid_head, 740 * s_tid_cur, and s_tid_tail all point to the 741 * same location. Other requests are posted and 742 * s_cur wraps around to the same location, 743 * where a new TID RDMA WRITE is posted. In 744 * this case, none of the indices need to be 745 * updated. However, the priv->s_state should. 746 */ 747 if (priv->s_tid_tail == qp->s_cur && 748 priv->s_state == TID_OP(WRITE_DATA_LAST)) 749 priv->s_state = TID_OP(WRITE_RESP); 750 } 751 req = wqe_to_tid_req(wqe); 752 if (newreq) { 753 priv->s_tid_head = qp->s_cur; 754 priv->pending_tid_w_resp += req->total_segs; 755 atomic_inc(&priv->n_tid_requests); 756 atomic_dec(&priv->n_requests); 757 } else { 758 req->state = TID_REQUEST_RESEND; 759 req->comp_seg = delta_psn(bth2, wqe->psn); 760 /* 761 * Pull back any segments since we are going 762 * to re-receive them. 763 */ 764 req->setup_head = req->clear_tail; 765 priv->pending_tid_w_resp += 766 delta_psn(wqe->lpsn, bth2) + 1; 767 } 768 769 trace_hfi1_tid_write_sender_make_req(qp, newreq); 770 trace_hfi1_tid_req_make_req_write(qp, newreq, 771 wqe->wr.opcode, 772 wqe->psn, wqe->lpsn, 773 req); 774 if (++qp->s_cur == qp->s_size) 775 qp->s_cur = 0; 776 break; 777 778 case IB_WR_RDMA_READ: 779 /* 780 * Don't allow more operations to be started 781 * than the QP limits allow. 782 */ 783 if (qp->s_num_rd_atomic >= 784 qp->s_max_rd_atomic) { 785 qp->s_flags |= RVT_S_WAIT_RDMAR; 786 goto bail; 787 } 788 qp->s_num_rd_atomic++; 789 if (newreq && !(qp->s_flags & RVT_S_UNLIMITED_CREDIT)) 790 qp->s_lsn++; 791 put_ib_reth_vaddr( 792 wqe->rdma_wr.remote_addr, 793 &ohdr->u.rc.reth); 794 ohdr->u.rc.reth.rkey = 795 cpu_to_be32(wqe->rdma_wr.rkey); 796 ohdr->u.rc.reth.length = cpu_to_be32(len); 797 qp->s_state = OP(RDMA_READ_REQUEST); 798 hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32); 799 ss = NULL; 800 len = 0; 801 bth2 |= IB_BTH_REQ_ACK; 802 if (++qp->s_cur == qp->s_size) 803 qp->s_cur = 0; 804 break; 805 806 case IB_WR_TID_RDMA_READ: 807 trace_hfi1_tid_read_sender_make_req(qp, newreq); 808 wpriv = wqe->priv; 809 req = wqe_to_tid_req(wqe); 810 trace_hfi1_tid_req_make_req_read(qp, newreq, 811 wqe->wr.opcode, 812 wqe->psn, wqe->lpsn, 813 req); 814 delta = cmp_psn(qp->s_psn, wqe->psn); 815 816 /* 817 * Don't allow more operations to be started 818 * than the QP limits allow. We could get here under 819 * three conditions; (1) It's a new request; (2) We are 820 * sending the second or later segment of a request, 821 * but the qp->s_state is set to OP(RDMA_READ_REQUEST) 822 * when the last segment of a previous request is 823 * received just before this; (3) We are re-sending a 824 * request. 
825 */ 826 if (qp->s_num_rd_atomic >= qp->s_max_rd_atomic) { 827 qp->s_flags |= RVT_S_WAIT_RDMAR; 828 goto bail; 829 } 830 if (newreq) { 831 struct tid_rdma_flow *flow = 832 &req->flows[req->setup_head]; 833 834 /* 835 * Set up s_sge as it is needed for TID 836 * allocation. However, if the pages have been 837 * walked and mapped, skip it. An earlier try 838 * has failed to allocate the TID entries. 839 */ 840 if (!flow->npagesets) { 841 qp->s_sge.sge = wqe->sg_list[0]; 842 qp->s_sge.sg_list = wqe->sg_list + 1; 843 qp->s_sge.num_sge = wqe->wr.num_sge; 844 qp->s_sge.total_len = wqe->length; 845 qp->s_len = wqe->length; 846 req->isge = 0; 847 req->clear_tail = req->setup_head; 848 req->flow_idx = req->setup_head; 849 req->state = TID_REQUEST_ACTIVE; 850 } 851 } else if (delta == 0) { 852 /* Re-send a request */ 853 req->cur_seg = 0; 854 req->comp_seg = 0; 855 req->ack_pending = 0; 856 req->flow_idx = req->clear_tail; 857 req->state = TID_REQUEST_RESEND; 858 } 859 req->s_next_psn = qp->s_psn; 860 /* Read one segment at a time */ 861 len = min_t(u32, req->seg_len, 862 wqe->length - req->seg_len * req->cur_seg); 863 delta = hfi1_build_tid_rdma_read_req(qp, wqe, ohdr, 864 &bth1, &bth2, 865 &len); 866 if (delta <= 0) { 867 /* Wait for TID space */ 868 goto bail; 869 } 870 if (newreq && !(qp->s_flags & RVT_S_UNLIMITED_CREDIT)) 871 qp->s_lsn++; 872 hwords += delta; 873 ss = &wpriv->ss; 874 /* Check if this is the last segment */ 875 if (req->cur_seg >= req->total_segs && 876 ++qp->s_cur == qp->s_size) 877 qp->s_cur = 0; 878 break; 879 880 case IB_WR_ATOMIC_CMP_AND_SWP: 881 case IB_WR_ATOMIC_FETCH_AND_ADD: 882 /* 883 * Don't allow more operations to be started 884 * than the QP limits allow. 885 */ 886 if (qp->s_num_rd_atomic >= 887 qp->s_max_rd_atomic) { 888 qp->s_flags |= RVT_S_WAIT_RDMAR; 889 goto bail; 890 } 891 qp->s_num_rd_atomic++; 892 893 /* FALLTHROUGH */ 894 case IB_WR_OPFN: 895 if (newreq && !(qp->s_flags & RVT_S_UNLIMITED_CREDIT)) 896 qp->s_lsn++; 897 if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP || 898 wqe->wr.opcode == IB_WR_OPFN) { 899 qp->s_state = OP(COMPARE_SWAP); 900 put_ib_ateth_swap(wqe->atomic_wr.swap, 901 &ohdr->u.atomic_eth); 902 put_ib_ateth_compare(wqe->atomic_wr.compare_add, 903 &ohdr->u.atomic_eth); 904 } else { 905 qp->s_state = OP(FETCH_ADD); 906 put_ib_ateth_swap(wqe->atomic_wr.compare_add, 907 &ohdr->u.atomic_eth); 908 put_ib_ateth_compare(0, &ohdr->u.atomic_eth); 909 } 910 put_ib_ateth_vaddr(wqe->atomic_wr.remote_addr, 911 &ohdr->u.atomic_eth); 912 ohdr->u.atomic_eth.rkey = cpu_to_be32( 913 wqe->atomic_wr.rkey); 914 hwords += sizeof(struct ib_atomic_eth) / sizeof(u32); 915 ss = NULL; 916 len = 0; 917 bth2 |= IB_BTH_REQ_ACK; 918 if (++qp->s_cur == qp->s_size) 919 qp->s_cur = 0; 920 break; 921 922 default: 923 goto bail; 924 } 925 if (wqe->wr.opcode != IB_WR_TID_RDMA_READ) { 926 qp->s_sge.sge = wqe->sg_list[0]; 927 qp->s_sge.sg_list = wqe->sg_list + 1; 928 qp->s_sge.num_sge = wqe->wr.num_sge; 929 qp->s_sge.total_len = wqe->length; 930 qp->s_len = wqe->length; 931 } 932 if (newreq) { 933 qp->s_tail++; 934 if (qp->s_tail >= qp->s_size) 935 qp->s_tail = 0; 936 } 937 if (wqe->wr.opcode == IB_WR_RDMA_READ || 938 wqe->wr.opcode == IB_WR_TID_RDMA_WRITE) 939 qp->s_psn = wqe->lpsn + 1; 940 else if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) 941 qp->s_psn = req->s_next_psn; 942 else 943 qp->s_psn++; 944 break; 945 946 case OP(RDMA_READ_RESPONSE_FIRST): 947 /* 948 * qp->s_state is normally set to the opcode of the 949 * last packet constructed for new requests and therefore 950 * is 
never set to RDMA read response. 951 * RDMA_READ_RESPONSE_FIRST is used by the ACK processing 952 * thread to indicate a SEND needs to be restarted from an 953 * earlier PSN without interfering with the sending thread. 954 * See restart_rc(). 955 */ 956 qp->s_len = restart_sge(&qp->s_sge, wqe, qp->s_psn, pmtu); 957 /* FALLTHROUGH */ 958 case OP(SEND_FIRST): 959 qp->s_state = OP(SEND_MIDDLE); 960 /* FALLTHROUGH */ 961 case OP(SEND_MIDDLE): 962 bth2 = mask_psn(qp->s_psn++); 963 ss = &qp->s_sge; 964 len = qp->s_len; 965 if (len > pmtu) { 966 len = pmtu; 967 middle = HFI1_CAP_IS_KSET(SDMA_AHG); 968 break; 969 } 970 if (wqe->wr.opcode == IB_WR_SEND) { 971 qp->s_state = OP(SEND_LAST); 972 } else if (wqe->wr.opcode == IB_WR_SEND_WITH_IMM) { 973 qp->s_state = OP(SEND_LAST_WITH_IMMEDIATE); 974 /* Immediate data comes after the BTH */ 975 ohdr->u.imm_data = wqe->wr.ex.imm_data; 976 hwords += 1; 977 } else { 978 qp->s_state = OP(SEND_LAST_WITH_INVALIDATE); 979 /* invalidate data comes after the BTH */ 980 ohdr->u.ieth = cpu_to_be32(wqe->wr.ex.invalidate_rkey); 981 hwords += 1; 982 } 983 if (wqe->wr.send_flags & IB_SEND_SOLICITED) 984 bth0 |= IB_BTH_SOLICITED; 985 bth2 |= IB_BTH_REQ_ACK; 986 qp->s_cur++; 987 if (qp->s_cur >= qp->s_size) 988 qp->s_cur = 0; 989 break; 990 991 case OP(RDMA_READ_RESPONSE_LAST): 992 /* 993 * qp->s_state is normally set to the opcode of the 994 * last packet constructed for new requests and therefore 995 * is never set to RDMA read response. 996 * RDMA_READ_RESPONSE_LAST is used by the ACK processing 997 * thread to indicate a RDMA write needs to be restarted from 998 * an earlier PSN without interfering with the sending thread. 999 * See restart_rc(). 1000 */ 1001 qp->s_len = restart_sge(&qp->s_sge, wqe, qp->s_psn, pmtu); 1002 /* FALLTHROUGH */ 1003 case OP(RDMA_WRITE_FIRST): 1004 qp->s_state = OP(RDMA_WRITE_MIDDLE); 1005 /* FALLTHROUGH */ 1006 case OP(RDMA_WRITE_MIDDLE): 1007 bth2 = mask_psn(qp->s_psn++); 1008 ss = &qp->s_sge; 1009 len = qp->s_len; 1010 if (len > pmtu) { 1011 len = pmtu; 1012 middle = HFI1_CAP_IS_KSET(SDMA_AHG); 1013 break; 1014 } 1015 if (wqe->wr.opcode == IB_WR_RDMA_WRITE) { 1016 qp->s_state = OP(RDMA_WRITE_LAST); 1017 } else { 1018 qp->s_state = OP(RDMA_WRITE_LAST_WITH_IMMEDIATE); 1019 /* Immediate data comes after the BTH */ 1020 ohdr->u.imm_data = wqe->wr.ex.imm_data; 1021 hwords += 1; 1022 if (wqe->wr.send_flags & IB_SEND_SOLICITED) 1023 bth0 |= IB_BTH_SOLICITED; 1024 } 1025 bth2 |= IB_BTH_REQ_ACK; 1026 qp->s_cur++; 1027 if (qp->s_cur >= qp->s_size) 1028 qp->s_cur = 0; 1029 break; 1030 1031 case OP(RDMA_READ_RESPONSE_MIDDLE): 1032 /* 1033 * qp->s_state is normally set to the opcode of the 1034 * last packet constructed for new requests and therefore 1035 * is never set to RDMA read response. 1036 * RDMA_READ_RESPONSE_MIDDLE is used by the ACK processing 1037 * thread to indicate a RDMA read needs to be restarted from 1038 * an earlier PSN without interfering with the sending thread. 1039 * See restart_rc(). 
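 *
 * Sketch of the restart arithmetic used below ("vaddr" and "length"
 * stand in for the put_ib_reth_vaddr() and reth.length updates that
 * follow): the PSN delta gives how much of the read already completed,
 * and the new RETH is built at that byte offset.
 *
 *        len    = delta_psn(qp->s_psn, wqe->psn) * pmtu;
 *        vaddr  = wqe->rdma_wr.remote_addr + len;
 *        length = wqe->length - len;
 *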
1040 */ 1041 len = (delta_psn(qp->s_psn, wqe->psn)) * pmtu; 1042 put_ib_reth_vaddr( 1043 wqe->rdma_wr.remote_addr + len, 1044 &ohdr->u.rc.reth); 1045 ohdr->u.rc.reth.rkey = 1046 cpu_to_be32(wqe->rdma_wr.rkey); 1047 ohdr->u.rc.reth.length = cpu_to_be32(wqe->length - len); 1048 qp->s_state = OP(RDMA_READ_REQUEST); 1049 hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32); 1050 bth2 = mask_psn(qp->s_psn) | IB_BTH_REQ_ACK; 1051 qp->s_psn = wqe->lpsn + 1; 1052 ss = NULL; 1053 len = 0; 1054 qp->s_cur++; 1055 if (qp->s_cur == qp->s_size) 1056 qp->s_cur = 0; 1057 break; 1058 1059 case TID_OP(WRITE_RESP): 1060 /* 1061 * This value for s_state is used for restarting a TID RDMA 1062 * WRITE request. See comment in OP(RDMA_READ_RESPONSE_MIDDLE 1063 * for more). 1064 */ 1065 req = wqe_to_tid_req(wqe); 1066 req->state = TID_REQUEST_RESEND; 1067 rcu_read_lock(); 1068 remote = rcu_dereference(priv->tid_rdma.remote); 1069 req->comp_seg = delta_psn(qp->s_psn, wqe->psn); 1070 len = wqe->length - (req->comp_seg * remote->max_len); 1071 rcu_read_unlock(); 1072 1073 bth2 = mask_psn(qp->s_psn); 1074 hwords += hfi1_build_tid_rdma_write_req(qp, wqe, ohdr, &bth1, 1075 &bth2, &len); 1076 qp->s_psn = wqe->lpsn + 1; 1077 ss = NULL; 1078 qp->s_state = TID_OP(WRITE_REQ); 1079 priv->pending_tid_w_resp += delta_psn(wqe->lpsn, bth2) + 1; 1080 priv->s_tid_cur = qp->s_cur; 1081 if (++qp->s_cur == qp->s_size) 1082 qp->s_cur = 0; 1083 trace_hfi1_tid_req_make_req_write(qp, 0, wqe->wr.opcode, 1084 wqe->psn, wqe->lpsn, req); 1085 break; 1086 1087 case TID_OP(READ_RESP): 1088 if (wqe->wr.opcode != IB_WR_TID_RDMA_READ) 1089 goto bail; 1090 /* This is used to restart a TID read request */ 1091 req = wqe_to_tid_req(wqe); 1092 wpriv = wqe->priv; 1093 /* 1094 * Back down. The field qp->s_psn has been set to the psn with 1095 * which the request should be restart. It's OK to use division 1096 * as this is on the retry path. 1097 */ 1098 req->cur_seg = delta_psn(qp->s_psn, wqe->psn) / priv->pkts_ps; 1099 1100 /* 1101 * The following function need to be redefined to return the 1102 * status to make sure that we find the flow. At the same 1103 * time, we can use the req->state change to check if the 1104 * call succeeds or not. 1105 */ 1106 req->state = TID_REQUEST_RESEND; 1107 hfi1_tid_rdma_restart_req(qp, wqe, &bth2); 1108 if (req->state != TID_REQUEST_ACTIVE) { 1109 /* 1110 * Failed to find the flow. Release all allocated tid 1111 * resources. 
1112 */ 1113 hfi1_kern_exp_rcv_clear_all(req); 1114 hfi1_kern_clear_hw_flow(priv->rcd, qp); 1115 1116 hfi1_trdma_send_complete(qp, wqe, IB_WC_LOC_QP_OP_ERR); 1117 goto bail; 1118 } 1119 req->state = TID_REQUEST_RESEND; 1120 len = min_t(u32, req->seg_len, 1121 wqe->length - req->seg_len * req->cur_seg); 1122 flow = &req->flows[req->flow_idx]; 1123 len -= flow->sent; 1124 req->s_next_psn = flow->flow_state.ib_lpsn + 1; 1125 delta = hfi1_build_tid_rdma_read_packet(wqe, ohdr, &bth1, 1126 &bth2, &len); 1127 if (delta <= 0) { 1128 /* Wait for TID space */ 1129 goto bail; 1130 } 1131 hwords += delta; 1132 ss = &wpriv->ss; 1133 /* Check if this is the last segment */ 1134 if (req->cur_seg >= req->total_segs && 1135 ++qp->s_cur == qp->s_size) 1136 qp->s_cur = 0; 1137 qp->s_psn = req->s_next_psn; 1138 trace_hfi1_tid_req_make_req_read(qp, 0, wqe->wr.opcode, 1139 wqe->psn, wqe->lpsn, req); 1140 break; 1141 case TID_OP(READ_REQ): 1142 req = wqe_to_tid_req(wqe); 1143 delta = cmp_psn(qp->s_psn, wqe->psn); 1144 /* 1145 * If the current WR is not TID RDMA READ, or this is the start 1146 * of a new request, we need to change the qp->s_state so that 1147 * the request can be set up properly. 1148 */ 1149 if (wqe->wr.opcode != IB_WR_TID_RDMA_READ || delta == 0 || 1150 qp->s_cur == qp->s_tail) { 1151 qp->s_state = OP(RDMA_READ_REQUEST); 1152 if (delta == 0 || qp->s_cur == qp->s_tail) 1153 goto check_s_state; 1154 else 1155 goto bail; 1156 } 1157 1158 /* Rate limiting */ 1159 if (qp->s_num_rd_atomic >= qp->s_max_rd_atomic) { 1160 qp->s_flags |= RVT_S_WAIT_RDMAR; 1161 goto bail; 1162 } 1163 1164 wpriv = wqe->priv; 1165 /* Read one segment at a time */ 1166 len = min_t(u32, req->seg_len, 1167 wqe->length - req->seg_len * req->cur_seg); 1168 delta = hfi1_build_tid_rdma_read_req(qp, wqe, ohdr, &bth1, 1169 &bth2, &len); 1170 if (delta <= 0) { 1171 /* Wait for TID space */ 1172 goto bail; 1173 } 1174 hwords += delta; 1175 ss = &wpriv->ss; 1176 /* Check if this is the last segment */ 1177 if (req->cur_seg >= req->total_segs && 1178 ++qp->s_cur == qp->s_size) 1179 qp->s_cur = 0; 1180 qp->s_psn = req->s_next_psn; 1181 trace_hfi1_tid_req_make_req_read(qp, 0, wqe->wr.opcode, 1182 wqe->psn, wqe->lpsn, req); 1183 break; 1184 } 1185 qp->s_sending_hpsn = bth2; 1186 delta = delta_psn(bth2, wqe->psn); 1187 if (delta && delta % HFI1_PSN_CREDIT == 0 && 1188 wqe->wr.opcode != IB_WR_TID_RDMA_WRITE) 1189 bth2 |= IB_BTH_REQ_ACK; 1190 if (qp->s_flags & RVT_S_SEND_ONE) { 1191 qp->s_flags &= ~RVT_S_SEND_ONE; 1192 qp->s_flags |= RVT_S_WAIT_ACK; 1193 bth2 |= IB_BTH_REQ_ACK; 1194 } 1195 qp->s_len -= len; 1196 ps->s_txreq->hdr_dwords = hwords; 1197 ps->s_txreq->sde = priv->s_sde; 1198 ps->s_txreq->ss = ss; 1199 ps->s_txreq->s_cur_size = len; 1200 hfi1_make_ruc_header( 1201 qp, 1202 ohdr, 1203 bth0 | (qp->s_state << 24), 1204 bth1, 1205 bth2, 1206 middle, 1207 ps); 1208 return 1; 1209 1210 done_free_tx: 1211 hfi1_put_txreq(ps->s_txreq); 1212 ps->s_txreq = NULL; 1213 return 1; 1214 1215 bail: 1216 hfi1_put_txreq(ps->s_txreq); 1217 1218 bail_no_tx: 1219 ps->s_txreq = NULL; 1220 qp->s_flags &= ~RVT_S_BUSY; 1221 /* 1222 * If we didn't get a txreq, the QP will be woken up later to try 1223 * again. Set the flags to indicate which work item to wake 1224 * up. 
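 *
 * Sketch of the wakeup marking (identical to the statement below); the
 * assumption here, not established elsewhere in this file, is that
 * IOWAIT_PENDING_IB selects the regular verbs work item while the TID
 * RDMA paths use their own pending flag:
 *
 *        iowait_set_flag(&priv->s_iowait, IOWAIT_PENDING_IB);
 *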
1225 */ 1226 iowait_set_flag(&priv->s_iowait, IOWAIT_PENDING_IB); 1227 return 0; 1228 } 1229 1230 static inline void hfi1_make_bth_aeth(struct rvt_qp *qp, 1231 struct ib_other_headers *ohdr, 1232 u32 bth0, u32 bth1) 1233 { 1234 if (qp->r_nak_state) 1235 ohdr->u.aeth = cpu_to_be32((qp->r_msn & IB_MSN_MASK) | 1236 (qp->r_nak_state << 1237 IB_AETH_CREDIT_SHIFT)); 1238 else 1239 ohdr->u.aeth = rvt_compute_aeth(qp); 1240 1241 ohdr->bth[0] = cpu_to_be32(bth0); 1242 ohdr->bth[1] = cpu_to_be32(bth1 | qp->remote_qpn); 1243 ohdr->bth[2] = cpu_to_be32(mask_psn(qp->r_ack_psn)); 1244 } 1245 1246 static inline void hfi1_queue_rc_ack(struct hfi1_packet *packet, bool is_fecn) 1247 { 1248 struct rvt_qp *qp = packet->qp; 1249 struct hfi1_ibport *ibp; 1250 unsigned long flags; 1251 1252 spin_lock_irqsave(&qp->s_lock, flags); 1253 if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) 1254 goto unlock; 1255 ibp = rcd_to_iport(packet->rcd); 1256 this_cpu_inc(*ibp->rvp.rc_qacks); 1257 qp->s_flags |= RVT_S_ACK_PENDING | RVT_S_RESP_PENDING; 1258 qp->s_nak_state = qp->r_nak_state; 1259 qp->s_ack_psn = qp->r_ack_psn; 1260 if (is_fecn) 1261 qp->s_flags |= RVT_S_ECN; 1262 1263 /* Schedule the send tasklet. */ 1264 hfi1_schedule_send(qp); 1265 unlock: 1266 spin_unlock_irqrestore(&qp->s_lock, flags); 1267 } 1268 1269 static inline void hfi1_make_rc_ack_9B(struct hfi1_packet *packet, 1270 struct hfi1_opa_header *opa_hdr, 1271 u8 sc5, bool is_fecn, 1272 u64 *pbc_flags, u32 *hwords, 1273 u32 *nwords) 1274 { 1275 struct rvt_qp *qp = packet->qp; 1276 struct hfi1_ibport *ibp = rcd_to_iport(packet->rcd); 1277 struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); 1278 struct ib_header *hdr = &opa_hdr->ibh; 1279 struct ib_other_headers *ohdr; 1280 u16 lrh0 = HFI1_LRH_BTH; 1281 u16 pkey; 1282 u32 bth0, bth1; 1283 1284 opa_hdr->hdr_type = HFI1_PKT_TYPE_9B; 1285 ohdr = &hdr->u.oth; 1286 /* header size in 32-bit words LRH+BTH+AETH = (8+12+4)/4 */ 1287 *hwords = 6; 1288 1289 if (unlikely(rdma_ah_get_ah_flags(&qp->remote_ah_attr) & IB_AH_GRH)) { 1290 *hwords += hfi1_make_grh(ibp, &hdr->u.l.grh, 1291 rdma_ah_read_grh(&qp->remote_ah_attr), 1292 *hwords - 2, SIZE_OF_CRC); 1293 ohdr = &hdr->u.l.oth; 1294 lrh0 = HFI1_LRH_GRH; 1295 } 1296 /* set PBC_DC_INFO bit (aka SC[4]) in pbc_flags */ 1297 *pbc_flags |= ((!!(sc5 & 0x10)) << PBC_DC_INFO_SHIFT); 1298 1299 /* read pkey_index w/o lock (its atomic) */ 1300 pkey = hfi1_get_pkey(ibp, qp->s_pkey_index); 1301 1302 lrh0 |= (sc5 & IB_SC_MASK) << IB_SC_SHIFT | 1303 (rdma_ah_get_sl(&qp->remote_ah_attr) & IB_SL_MASK) << 1304 IB_SL_SHIFT; 1305 1306 hfi1_make_ib_hdr(hdr, lrh0, *hwords + SIZE_OF_CRC, 1307 opa_get_lid(rdma_ah_get_dlid(&qp->remote_ah_attr), 9B), 1308 ppd->lid | rdma_ah_get_path_bits(&qp->remote_ah_attr)); 1309 1310 bth0 = pkey | (OP(ACKNOWLEDGE) << 24); 1311 if (qp->s_mig_state == IB_MIG_MIGRATED) 1312 bth0 |= IB_BTH_MIG_REQ; 1313 bth1 = (!!is_fecn) << IB_BECN_SHIFT; 1314 /* 1315 * Inline ACKs go out without the use of the Verbs send engine, so 1316 * we need to set the STL Verbs Extended bit here 1317 */ 1318 bth1 |= HFI1_CAP_IS_KSET(OPFN) << IB_BTHE_E_SHIFT; 1319 hfi1_make_bth_aeth(qp, ohdr, bth0, bth1); 1320 } 1321 1322 static inline void hfi1_make_rc_ack_16B(struct hfi1_packet *packet, 1323 struct hfi1_opa_header *opa_hdr, 1324 u8 sc5, bool is_fecn, 1325 u64 *pbc_flags, u32 *hwords, 1326 u32 *nwords) 1327 { 1328 struct rvt_qp *qp = packet->qp; 1329 struct hfi1_ibport *ibp = rcd_to_iport(packet->rcd); 1330 struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); 1331 struct hfi1_16b_header *hdr = 
&opa_hdr->opah; 1332 struct ib_other_headers *ohdr; 1333 u32 bth0, bth1 = 0; 1334 u16 len, pkey; 1335 bool becn = is_fecn; 1336 u8 l4 = OPA_16B_L4_IB_LOCAL; 1337 u8 extra_bytes; 1338 1339 opa_hdr->hdr_type = HFI1_PKT_TYPE_16B; 1340 ohdr = &hdr->u.oth; 1341 /* header size in 32-bit words 16B LRH+BTH+AETH = (16+12+4)/4 */ 1342 *hwords = 8; 1343 extra_bytes = hfi1_get_16b_padding(*hwords << 2, 0); 1344 *nwords = SIZE_OF_CRC + ((extra_bytes + SIZE_OF_LT) >> 2); 1345 1346 if (unlikely(rdma_ah_get_ah_flags(&qp->remote_ah_attr) & IB_AH_GRH) && 1347 hfi1_check_mcast(rdma_ah_get_dlid(&qp->remote_ah_attr))) { 1348 *hwords += hfi1_make_grh(ibp, &hdr->u.l.grh, 1349 rdma_ah_read_grh(&qp->remote_ah_attr), 1350 *hwords - 4, *nwords); 1351 ohdr = &hdr->u.l.oth; 1352 l4 = OPA_16B_L4_IB_GLOBAL; 1353 } 1354 *pbc_flags |= PBC_PACKET_BYPASS | PBC_INSERT_BYPASS_ICRC; 1355 1356 /* read pkey_index w/o lock (its atomic) */ 1357 pkey = hfi1_get_pkey(ibp, qp->s_pkey_index); 1358 1359 /* Convert dwords to flits */ 1360 len = (*hwords + *nwords) >> 1; 1361 1362 hfi1_make_16b_hdr(hdr, ppd->lid | 1363 (rdma_ah_get_path_bits(&qp->remote_ah_attr) & 1364 ((1 << ppd->lmc) - 1)), 1365 opa_get_lid(rdma_ah_get_dlid(&qp->remote_ah_attr), 1366 16B), len, pkey, becn, 0, l4, sc5); 1367 1368 bth0 = pkey | (OP(ACKNOWLEDGE) << 24); 1369 bth0 |= extra_bytes << 20; 1370 if (qp->s_mig_state == IB_MIG_MIGRATED) 1371 bth1 = OPA_BTH_MIG_REQ; 1372 hfi1_make_bth_aeth(qp, ohdr, bth0, bth1); 1373 } 1374 1375 typedef void (*hfi1_make_rc_ack)(struct hfi1_packet *packet, 1376 struct hfi1_opa_header *opa_hdr, 1377 u8 sc5, bool is_fecn, 1378 u64 *pbc_flags, u32 *hwords, 1379 u32 *nwords); 1380 1381 /* We support only two types - 9B and 16B for now */ 1382 static const hfi1_make_rc_ack hfi1_make_rc_ack_tbl[2] = { 1383 [HFI1_PKT_TYPE_9B] = &hfi1_make_rc_ack_9B, 1384 [HFI1_PKT_TYPE_16B] = &hfi1_make_rc_ack_16B 1385 }; 1386 1387 /** 1388 * hfi1_send_rc_ack - Construct an ACK packet and send it 1389 * @qp: a pointer to the QP 1390 * 1391 * This is called from hfi1_rc_rcv() and handle_receive_interrupt(). 1392 * Note that RDMA reads and atomics are handled in the 1393 * send side QP state and send engine. 1394 */ 1395 void hfi1_send_rc_ack(struct hfi1_packet *packet, bool is_fecn) 1396 { 1397 struct hfi1_ctxtdata *rcd = packet->rcd; 1398 struct rvt_qp *qp = packet->qp; 1399 struct hfi1_ibport *ibp = rcd_to_iport(rcd); 1400 struct hfi1_qp_priv *priv = qp->priv; 1401 struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); 1402 u8 sc5 = ibp->sl_to_sc[rdma_ah_get_sl(&qp->remote_ah_attr)]; 1403 u64 pbc, pbc_flags = 0; 1404 u32 hwords = 0; 1405 u32 nwords = 0; 1406 u32 plen; 1407 struct pio_buf *pbuf; 1408 struct hfi1_opa_header opa_hdr; 1409 1410 /* clear the defer count */ 1411 qp->r_adefered = 0; 1412 1413 /* Don't send ACK or NAK if a RDMA read or atomic is pending. 
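 *
 * Sketch of the deferral taken just below (same calls as the code that
 * follows): the ACK is handed back to the send engine so that it is
 * coalesced with the pending response instead of being sent inline.
 *
 *        if (qp->s_flags & RVT_S_RESP_PENDING) {
 *                hfi1_queue_rc_ack(packet, is_fecn);
 *                return;
 *        }
 *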
*/ 1414 if (qp->s_flags & RVT_S_RESP_PENDING) { 1415 hfi1_queue_rc_ack(packet, is_fecn); 1416 return; 1417 } 1418 1419 /* Ensure s_rdma_ack_cnt changes are committed */ 1420 if (qp->s_rdma_ack_cnt) { 1421 hfi1_queue_rc_ack(packet, is_fecn); 1422 return; 1423 } 1424 1425 /* Don't try to send ACKs if the link isn't ACTIVE */ 1426 if (driver_lstate(ppd) != IB_PORT_ACTIVE) 1427 return; 1428 1429 /* Make the appropriate header */ 1430 hfi1_make_rc_ack_tbl[priv->hdr_type](packet, &opa_hdr, sc5, is_fecn, 1431 &pbc_flags, &hwords, &nwords); 1432 1433 plen = 2 /* PBC */ + hwords + nwords; 1434 pbc = create_pbc(ppd, pbc_flags, qp->srate_mbps, 1435 sc_to_vlt(ppd->dd, sc5), plen); 1436 pbuf = sc_buffer_alloc(rcd->sc, plen, NULL, NULL); 1437 if (!pbuf) { 1438 /* 1439 * We have no room to send at the moment. Pass 1440 * responsibility for sending the ACK to the send engine 1441 * so that when enough buffer space becomes available, 1442 * the ACK is sent ahead of other outgoing packets. 1443 */ 1444 hfi1_queue_rc_ack(packet, is_fecn); 1445 return; 1446 } 1447 trace_ack_output_ibhdr(dd_from_ibdev(qp->ibqp.device), 1448 &opa_hdr, ib_is_sc5(sc5)); 1449 1450 /* write the pbc and data */ 1451 ppd->dd->pio_inline_send(ppd->dd, pbuf, pbc, 1452 (priv->hdr_type == HFI1_PKT_TYPE_9B ? 1453 (void *)&opa_hdr.ibh : 1454 (void *)&opa_hdr.opah), hwords); 1455 return; 1456 } 1457 1458 /** 1459 * update_num_rd_atomic - update the qp->s_num_rd_atomic 1460 * @qp: the QP 1461 * @psn: the packet sequence number to restart at 1462 * @wqe: the wqe 1463 * 1464 * This is called from reset_psn() to update qp->s_num_rd_atomic 1465 * for the current wqe. 1466 * Called at interrupt level with the QP s_lock held. 1467 */ 1468 static void update_num_rd_atomic(struct rvt_qp *qp, u32 psn, 1469 struct rvt_swqe *wqe) 1470 { 1471 u32 opcode = wqe->wr.opcode; 1472 1473 if (opcode == IB_WR_RDMA_READ || 1474 opcode == IB_WR_ATOMIC_CMP_AND_SWP || 1475 opcode == IB_WR_ATOMIC_FETCH_AND_ADD) { 1476 qp->s_num_rd_atomic++; 1477 } else if (opcode == IB_WR_TID_RDMA_READ) { 1478 struct tid_rdma_request *req = wqe_to_tid_req(wqe); 1479 struct hfi1_qp_priv *priv = qp->priv; 1480 1481 if (cmp_psn(psn, wqe->lpsn) <= 0) { 1482 u32 cur_seg; 1483 1484 cur_seg = (psn - wqe->psn) / priv->pkts_ps; 1485 req->ack_pending = cur_seg - req->comp_seg; 1486 priv->pending_tid_r_segs += req->ack_pending; 1487 qp->s_num_rd_atomic += req->ack_pending; 1488 } else { 1489 priv->pending_tid_r_segs += req->total_segs; 1490 qp->s_num_rd_atomic += req->total_segs; 1491 } 1492 } 1493 } 1494 1495 /** 1496 * reset_psn - reset the QP state to send starting from PSN 1497 * @qp: the QP 1498 * @psn: the packet sequence number to restart at 1499 * 1500 * This is called from hfi1_rc_rcv() to process an incoming RC ACK 1501 * for the given QP. 1502 * Called at interrupt level with the QP s_lock held. 1503 */ 1504 static void reset_psn(struct rvt_qp *qp, u32 psn) 1505 { 1506 u32 n = qp->s_acked; 1507 struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, n); 1508 u32 opcode; 1509 struct hfi1_qp_priv *priv = qp->priv; 1510 1511 lockdep_assert_held(&qp->s_lock); 1512 qp->s_cur = n; 1513 priv->pending_tid_r_segs = 0; 1514 priv->pending_tid_w_resp = 0; 1515 qp->s_num_rd_atomic = 0; 1516 1517 /* 1518 * If we are starting the request from the beginning, 1519 * let the normal send code handle initialization. 
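 *
 * Sketch of the check performed right below (same fields as the code
 * that follows); OP(SEND_LAST) is the neutral state that makes
 * hfi1_make_rc_req() treat the WQE as a fresh request:
 *
 *        if (cmp_psn(psn, wqe->psn) <= 0) {
 *                qp->s_state = OP(SEND_LAST);
 *                goto done;
 *        }
 *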
1520 */ 1521 if (cmp_psn(psn, wqe->psn) <= 0) { 1522 qp->s_state = OP(SEND_LAST); 1523 goto done; 1524 } 1525 update_num_rd_atomic(qp, psn, wqe); 1526 1527 /* Find the work request opcode corresponding to the given PSN. */ 1528 for (;;) { 1529 int diff; 1530 1531 if (++n == qp->s_size) 1532 n = 0; 1533 if (n == qp->s_tail) 1534 break; 1535 wqe = rvt_get_swqe_ptr(qp, n); 1536 diff = cmp_psn(psn, wqe->psn); 1537 if (diff < 0) { 1538 /* Point wqe back to the previous one*/ 1539 wqe = rvt_get_swqe_ptr(qp, qp->s_cur); 1540 break; 1541 } 1542 qp->s_cur = n; 1543 /* 1544 * If we are starting the request from the beginning, 1545 * let the normal send code handle initialization. 1546 */ 1547 if (diff == 0) { 1548 qp->s_state = OP(SEND_LAST); 1549 goto done; 1550 } 1551 1552 update_num_rd_atomic(qp, psn, wqe); 1553 } 1554 opcode = wqe->wr.opcode; 1555 1556 /* 1557 * Set the state to restart in the middle of a request. 1558 * Don't change the s_sge, s_cur_sge, or s_cur_size. 1559 * See hfi1_make_rc_req(). 1560 */ 1561 switch (opcode) { 1562 case IB_WR_SEND: 1563 case IB_WR_SEND_WITH_IMM: 1564 qp->s_state = OP(RDMA_READ_RESPONSE_FIRST); 1565 break; 1566 1567 case IB_WR_RDMA_WRITE: 1568 case IB_WR_RDMA_WRITE_WITH_IMM: 1569 qp->s_state = OP(RDMA_READ_RESPONSE_LAST); 1570 break; 1571 1572 case IB_WR_TID_RDMA_WRITE: 1573 qp->s_state = TID_OP(WRITE_RESP); 1574 break; 1575 1576 case IB_WR_RDMA_READ: 1577 qp->s_state = OP(RDMA_READ_RESPONSE_MIDDLE); 1578 break; 1579 1580 case IB_WR_TID_RDMA_READ: 1581 qp->s_state = TID_OP(READ_RESP); 1582 break; 1583 1584 default: 1585 /* 1586 * This case shouldn't happen since its only 1587 * one PSN per req. 1588 */ 1589 qp->s_state = OP(SEND_LAST); 1590 } 1591 done: 1592 priv->s_flags &= ~HFI1_S_TID_WAIT_INTERLCK; 1593 qp->s_psn = psn; 1594 /* 1595 * Set RVT_S_WAIT_PSN as rc_complete() may start the timer 1596 * asynchronously before the send engine can get scheduled. 1597 * Doing it in hfi1_make_rc_req() is too late. 1598 */ 1599 if ((cmp_psn(qp->s_psn, qp->s_sending_hpsn) <= 0) && 1600 (cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) <= 0)) 1601 qp->s_flags |= RVT_S_WAIT_PSN; 1602 qp->s_flags &= ~HFI1_S_AHG_VALID; 1603 trace_hfi1_sender_reset_psn(qp); 1604 } 1605 1606 /* 1607 * Back up requester to resend the last un-ACKed request. 1608 * The QP r_lock and s_lock should be held and interrupts disabled. 
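 *
 * Illustrative caller locking (a sketch only, following the r_lock ->
 * s_lock ordering used by the error_qp path in make_rc_ack() above;
 * the wait argument of 0 matches the use in set_restart_qp() below):
 *
 *        spin_lock_irqsave(&qp->r_lock, flags);
 *        spin_lock(&qp->s_lock);
 *        hfi1_restart_rc(qp, qp->s_last_psn + 1, 0);
 *        spin_unlock(&qp->s_lock);
 *        spin_unlock_irqrestore(&qp->r_lock, flags);
 *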
1609 */ 1610 void hfi1_restart_rc(struct rvt_qp *qp, u32 psn, int wait) 1611 { 1612 struct hfi1_qp_priv *priv = qp->priv; 1613 struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, qp->s_acked); 1614 struct hfi1_ibport *ibp; 1615 1616 lockdep_assert_held(&qp->r_lock); 1617 lockdep_assert_held(&qp->s_lock); 1618 trace_hfi1_sender_restart_rc(qp); 1619 if (qp->s_retry == 0) { 1620 if (qp->s_mig_state == IB_MIG_ARMED) { 1621 hfi1_migrate_qp(qp); 1622 qp->s_retry = qp->s_retry_cnt; 1623 } else if (qp->s_last == qp->s_acked) { 1624 /* 1625 * We need special handling for the OPFN request WQEs as 1626 * they are not allowed to generate real user errors 1627 */ 1628 if (wqe->wr.opcode == IB_WR_OPFN) { 1629 struct hfi1_ibport *ibp = 1630 to_iport(qp->ibqp.device, qp->port_num); 1631 /* 1632 * Call opfn_conn_reply() with capcode and 1633 * remaining data as 0 to close out the 1634 * current request 1635 */ 1636 opfn_conn_reply(qp, priv->opfn.curr); 1637 wqe = do_rc_completion(qp, wqe, ibp); 1638 qp->s_flags &= ~RVT_S_WAIT_ACK; 1639 } else { 1640 trace_hfi1_tid_write_sender_restart_rc(qp, 0); 1641 if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) { 1642 struct tid_rdma_request *req; 1643 1644 req = wqe_to_tid_req(wqe); 1645 hfi1_kern_exp_rcv_clear_all(req); 1646 hfi1_kern_clear_hw_flow(priv->rcd, qp); 1647 } 1648 1649 hfi1_trdma_send_complete(qp, wqe, 1650 IB_WC_RETRY_EXC_ERR); 1651 rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR); 1652 } 1653 return; 1654 } else { /* need to handle delayed completion */ 1655 return; 1656 } 1657 } else { 1658 qp->s_retry--; 1659 } 1660 1661 ibp = to_iport(qp->ibqp.device, qp->port_num); 1662 if (wqe->wr.opcode == IB_WR_RDMA_READ || 1663 wqe->wr.opcode == IB_WR_TID_RDMA_READ) 1664 ibp->rvp.n_rc_resends++; 1665 else 1666 ibp->rvp.n_rc_resends += delta_psn(qp->s_psn, psn); 1667 1668 qp->s_flags &= ~(RVT_S_WAIT_FENCE | RVT_S_WAIT_RDMAR | 1669 RVT_S_WAIT_SSN_CREDIT | RVT_S_WAIT_PSN | 1670 RVT_S_WAIT_ACK | HFI1_S_WAIT_TID_RESP); 1671 if (wait) 1672 qp->s_flags |= RVT_S_SEND_ONE; 1673 reset_psn(qp, psn); 1674 } 1675 1676 /* 1677 * Set qp->s_sending_psn to the next PSN after the given one. 1678 * This would be psn+1 except when RDMA reads or TID RDMA ops 1679 * are present. 1680 */ 1681 static void reset_sending_psn(struct rvt_qp *qp, u32 psn) 1682 { 1683 struct rvt_swqe *wqe; 1684 u32 n = qp->s_last; 1685 1686 lockdep_assert_held(&qp->s_lock); 1687 /* Find the work request corresponding to the given PSN. */ 1688 for (;;) { 1689 wqe = rvt_get_swqe_ptr(qp, n); 1690 if (cmp_psn(psn, wqe->lpsn) <= 0) { 1691 if (wqe->wr.opcode == IB_WR_RDMA_READ || 1692 wqe->wr.opcode == IB_WR_TID_RDMA_READ || 1693 wqe->wr.opcode == IB_WR_TID_RDMA_WRITE) 1694 qp->s_sending_psn = wqe->lpsn + 1; 1695 else 1696 qp->s_sending_psn = psn + 1; 1697 break; 1698 } 1699 if (++n == qp->s_size) 1700 n = 0; 1701 if (n == qp->s_tail) 1702 break; 1703 } 1704 } 1705 1706 /* 1707 * This should be called with the QP s_lock held and interrupts disabled. 
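 *
 * When lockdep is enabled the requirement is also checked at runtime;
 * sketch of the assertion used immediately below:
 *
 *        lockdep_assert_held(&qp->s_lock);
 *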
1708 */ 1709 void hfi1_rc_send_complete(struct rvt_qp *qp, struct hfi1_opa_header *opah) 1710 { 1711 struct ib_other_headers *ohdr; 1712 struct hfi1_qp_priv *priv = qp->priv; 1713 struct rvt_swqe *wqe; 1714 struct ib_header *hdr = NULL; 1715 struct hfi1_16b_header *hdr_16b = NULL; 1716 u32 opcode, head, tail; 1717 u32 psn; 1718 struct tid_rdma_request *req; 1719 1720 lockdep_assert_held(&qp->s_lock); 1721 if (!(ib_rvt_state_ops[qp->state] & RVT_SEND_OR_FLUSH_OR_RECV_OK)) 1722 return; 1723 1724 /* Find out where the BTH is */ 1725 if (priv->hdr_type == HFI1_PKT_TYPE_9B) { 1726 hdr = &opah->ibh; 1727 if (ib_get_lnh(hdr) == HFI1_LRH_BTH) 1728 ohdr = &hdr->u.oth; 1729 else 1730 ohdr = &hdr->u.l.oth; 1731 } else { 1732 u8 l4; 1733 1734 hdr_16b = &opah->opah; 1735 l4 = hfi1_16B_get_l4(hdr_16b); 1736 if (l4 == OPA_16B_L4_IB_LOCAL) 1737 ohdr = &hdr_16b->u.oth; 1738 else 1739 ohdr = &hdr_16b->u.l.oth; 1740 } 1741 1742 opcode = ib_bth_get_opcode(ohdr); 1743 if ((opcode >= OP(RDMA_READ_RESPONSE_FIRST) && 1744 opcode <= OP(ATOMIC_ACKNOWLEDGE)) || 1745 opcode == TID_OP(READ_RESP) || 1746 opcode == TID_OP(WRITE_RESP)) { 1747 WARN_ON(!qp->s_rdma_ack_cnt); 1748 qp->s_rdma_ack_cnt--; 1749 return; 1750 } 1751 1752 psn = ib_bth_get_psn(ohdr); 1753 /* 1754 * Don't attempt to reset the sending PSN for packets in the 1755 * KDETH PSN space since the PSN does not match anything. 1756 */ 1757 if (opcode != TID_OP(WRITE_DATA) && 1758 opcode != TID_OP(WRITE_DATA_LAST) && 1759 opcode != TID_OP(ACK) && opcode != TID_OP(RESYNC)) 1760 reset_sending_psn(qp, psn); 1761 1762 /* Handle TID RDMA WRITE packets differently */ 1763 if (opcode >= TID_OP(WRITE_REQ) && 1764 opcode <= TID_OP(WRITE_DATA_LAST)) { 1765 head = priv->s_tid_head; 1766 tail = priv->s_tid_cur; 1767 /* 1768 * s_tid_cur is set to s_tid_head in the case, where 1769 * a new TID RDMA request is being started and all 1770 * previous ones have been completed. 1771 * Therefore, we need to do a secondary check in order 1772 * to properly determine whether we should start the 1773 * RC timer. 1774 */ 1775 wqe = rvt_get_swqe_ptr(qp, tail); 1776 req = wqe_to_tid_req(wqe); 1777 if (head == tail && req->comp_seg < req->total_segs) { 1778 if (tail == 0) 1779 tail = qp->s_size - 1; 1780 else 1781 tail -= 1; 1782 } 1783 } else { 1784 head = qp->s_tail; 1785 tail = qp->s_acked; 1786 } 1787 1788 /* 1789 * Start timer after a packet requesting an ACK has been sent and 1790 * there are still requests that haven't been acked. 1791 */ 1792 if ((psn & IB_BTH_REQ_ACK) && tail != head && 1793 opcode != TID_OP(WRITE_DATA) && opcode != TID_OP(WRITE_DATA_LAST) && 1794 opcode != TID_OP(RESYNC) && 1795 !(qp->s_flags & 1796 (RVT_S_TIMER | RVT_S_WAIT_RNR | RVT_S_WAIT_PSN)) && 1797 (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) { 1798 if (opcode == TID_OP(READ_REQ)) 1799 rvt_add_retry_timer_ext(qp, priv->timeout_shift); 1800 else 1801 rvt_add_retry_timer(qp); 1802 } 1803 1804 /* Start TID RDMA ACK timer */ 1805 if ((opcode == TID_OP(WRITE_DATA) || 1806 opcode == TID_OP(WRITE_DATA_LAST) || 1807 opcode == TID_OP(RESYNC)) && 1808 (psn & IB_BTH_REQ_ACK) && 1809 !(priv->s_flags & HFI1_S_TID_RETRY_TIMER) && 1810 (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) { 1811 /* 1812 * The TID RDMA ACK packet could be received before this 1813 * function is called. Therefore, add the timer only if TID 1814 * RDMA ACK packets are actually pending. 
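 *
 * Sketch of the "still pending" test applied right below (same fields
 * as the code that follows): the retry timer is armed only while some
 * data segments remain unacknowledged.
 *
 *        if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE &&
 *            req->ack_seg < req->cur_seg)
 *                hfi1_add_tid_retry_timer(qp);
 *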
1815 */ 1816 wqe = rvt_get_swqe_ptr(qp, qp->s_acked); 1817 req = wqe_to_tid_req(wqe); 1818 if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE && 1819 req->ack_seg < req->cur_seg) 1820 hfi1_add_tid_retry_timer(qp); 1821 } 1822 1823 while (qp->s_last != qp->s_acked) { 1824 u32 s_last; 1825 1826 wqe = rvt_get_swqe_ptr(qp, qp->s_last); 1827 if (cmp_psn(wqe->lpsn, qp->s_sending_psn) >= 0 && 1828 cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) <= 0) 1829 break; 1830 trdma_clean_swqe(qp, wqe); 1831 rvt_qp_wqe_unreserve(qp, wqe); 1832 s_last = qp->s_last; 1833 trace_hfi1_qp_send_completion(qp, wqe, s_last); 1834 if (++s_last >= qp->s_size) 1835 s_last = 0; 1836 qp->s_last = s_last; 1837 /* see post_send() */ 1838 barrier(); 1839 rvt_put_swqe(wqe); 1840 rvt_qp_swqe_complete(qp, 1841 wqe, 1842 ib_hfi1_wc_opcode[wqe->wr.opcode], 1843 IB_WC_SUCCESS); 1844 } 1845 /* 1846 * If we were waiting for sends to complete before re-sending, 1847 * and they are now complete, restart sending. 1848 */ 1849 trace_hfi1_sendcomplete(qp, psn); 1850 if (qp->s_flags & RVT_S_WAIT_PSN && 1851 cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) > 0) { 1852 qp->s_flags &= ~RVT_S_WAIT_PSN; 1853 qp->s_sending_psn = qp->s_psn; 1854 qp->s_sending_hpsn = qp->s_psn - 1; 1855 hfi1_schedule_send(qp); 1856 } 1857 } 1858 1859 static inline void update_last_psn(struct rvt_qp *qp, u32 psn) 1860 { 1861 qp->s_last_psn = psn; 1862 } 1863 1864 /* 1865 * Generate a SWQE completion. 1866 * This is similar to hfi1_send_complete but has to check to be sure 1867 * that the SGEs are not being referenced if the SWQE is being resent. 1868 */ 1869 struct rvt_swqe *do_rc_completion(struct rvt_qp *qp, 1870 struct rvt_swqe *wqe, 1871 struct hfi1_ibport *ibp) 1872 { 1873 struct hfi1_qp_priv *priv = qp->priv; 1874 1875 lockdep_assert_held(&qp->s_lock); 1876 /* 1877 * Don't decrement refcount and don't generate a 1878 * completion if the SWQE is being resent until the send 1879 * is finished. 1880 */ 1881 trace_hfi1_rc_completion(qp, wqe->lpsn); 1882 if (cmp_psn(wqe->lpsn, qp->s_sending_psn) < 0 || 1883 cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) > 0) { 1884 u32 s_last; 1885 1886 trdma_clean_swqe(qp, wqe); 1887 rvt_put_swqe(wqe); 1888 rvt_qp_wqe_unreserve(qp, wqe); 1889 s_last = qp->s_last; 1890 trace_hfi1_qp_send_completion(qp, wqe, s_last); 1891 if (++s_last >= qp->s_size) 1892 s_last = 0; 1893 qp->s_last = s_last; 1894 /* see post_send() */ 1895 barrier(); 1896 rvt_qp_swqe_complete(qp, 1897 wqe, 1898 ib_hfi1_wc_opcode[wqe->wr.opcode], 1899 IB_WC_SUCCESS); 1900 } else { 1901 struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); 1902 1903 this_cpu_inc(*ibp->rvp.rc_delayed_comp); 1904 /* 1905 * If send progress not running attempt to progress 1906 * SDMA queue. 1907 */ 1908 if (ppd->dd->flags & HFI1_HAS_SEND_DMA) { 1909 struct sdma_engine *engine; 1910 u8 sl = rdma_ah_get_sl(&qp->remote_ah_attr); 1911 u8 sc5; 1912 1913 /* For now use sc to find engine */ 1914 sc5 = ibp->sl_to_sc[sl]; 1915 engine = qp_to_sdma_engine(qp, sc5); 1916 sdma_engine_progress_schedule(engine); 1917 } 1918 } 1919 1920 qp->s_retry = qp->s_retry_cnt; 1921 /* 1922 * Don't update the last PSN if the request being completed is 1923 * a TID RDMA WRITE request. 1924 * Completion of the TID RDMA WRITE requests are done by the 1925 * TID RDMA ACKs and as such could be for a request that has 1926 * already been ACKed as far as the IB state machine is 1927 * concerned. 
1928 */ 1929 if (wqe->wr.opcode != IB_WR_TID_RDMA_WRITE) 1930 update_last_psn(qp, wqe->lpsn); 1931 1932 /* 1933 * If we are completing a request which is in the process of 1934 * being resent, we can stop re-sending it since we know the 1935 * responder has already seen it. 1936 */ 1937 if (qp->s_acked == qp->s_cur) { 1938 if (++qp->s_cur >= qp->s_size) 1939 qp->s_cur = 0; 1940 qp->s_acked = qp->s_cur; 1941 wqe = rvt_get_swqe_ptr(qp, qp->s_cur); 1942 if (qp->s_acked != qp->s_tail) { 1943 qp->s_state = OP(SEND_LAST); 1944 qp->s_psn = wqe->psn; 1945 } 1946 } else { 1947 if (++qp->s_acked >= qp->s_size) 1948 qp->s_acked = 0; 1949 if (qp->state == IB_QPS_SQD && qp->s_acked == qp->s_cur) 1950 qp->s_draining = 0; 1951 wqe = rvt_get_swqe_ptr(qp, qp->s_acked); 1952 } 1953 if (priv->s_flags & HFI1_S_TID_WAIT_INTERLCK) { 1954 priv->s_flags &= ~HFI1_S_TID_WAIT_INTERLCK; 1955 hfi1_schedule_send(qp); 1956 } 1957 return wqe; 1958 } 1959 1960 static void set_restart_qp(struct rvt_qp *qp, struct hfi1_ctxtdata *rcd) 1961 { 1962 /* Retry this request. */ 1963 if (!(qp->r_flags & RVT_R_RDMAR_SEQ)) { 1964 qp->r_flags |= RVT_R_RDMAR_SEQ; 1965 hfi1_restart_rc(qp, qp->s_last_psn + 1, 0); 1966 if (list_empty(&qp->rspwait)) { 1967 qp->r_flags |= RVT_R_RSP_SEND; 1968 rvt_get_qp(qp); 1969 list_add_tail(&qp->rspwait, &rcd->qp_wait_list); 1970 } 1971 } 1972 } 1973 1974 /** 1975 * update_qp_retry_state - Update qp retry state. 1976 * @qp: the QP 1977 * @psn: the packet sequence number of the TID RDMA WRITE RESP. 1978 * @spsn: The start psn for the given TID RDMA WRITE swqe. 1979 * @lpsn: The last psn for the given TID RDMA WRITE swqe. 1980 * 1981 * This function is called to update the qp retry state upon 1982 * receiving a TID WRITE RESP after the qp is scheduled to retry 1983 * a request. 1984 */ 1985 static void update_qp_retry_state(struct rvt_qp *qp, u32 psn, u32 spsn, 1986 u32 lpsn) 1987 { 1988 struct hfi1_qp_priv *qpriv = qp->priv; 1989 1990 qp->s_psn = psn + 1; 1991 /* 1992 * If this is the first TID RDMA WRITE RESP packet for the current 1993 * request, change the s_state so that the retry will be processed 1994 * correctly. Similarly, if this is the last TID RDMA WRITE RESP 1995 * packet, change the s_state and advance the s_cur. 1996 */ 1997 if (cmp_psn(psn, lpsn) >= 0) { 1998 qp->s_cur = qpriv->s_tid_cur + 1; 1999 if (qp->s_cur >= qp->s_size) 2000 qp->s_cur = 0; 2001 qp->s_state = TID_OP(WRITE_REQ); 2002 } else if (!cmp_psn(psn, spsn)) { 2003 qp->s_cur = qpriv->s_tid_cur; 2004 qp->s_state = TID_OP(WRITE_RESP); 2005 } 2006 } 2007 2008 /** 2009 * do_rc_ack - process an incoming RC ACK 2010 * @qp: the QP the ACK came in on 2011 * @psn: the packet sequence number of the ACK 2012 * @opcode: the opcode of the request that resulted in the ACK 2013 * 2014 * This is called from rc_rcv_resp() to process an incoming RC ACK 2015 * for the given QP. 2016 * May be called at interrupt level, with the QP s_lock held. 2017 * Returns 1 if OK, 0 if current operation should be aborted (NAK). 
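 * @aeth is the AETH word from the packet, @val carries the payload of an
 * atomic acknowledge (also used for OPFN replies), and @rcd is the
 * receive context the ACK arrived on.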
2018 */ 2019 int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode, 2020 u64 val, struct hfi1_ctxtdata *rcd) 2021 { 2022 struct hfi1_ibport *ibp; 2023 enum ib_wc_status status; 2024 struct hfi1_qp_priv *qpriv = qp->priv; 2025 struct rvt_swqe *wqe; 2026 int ret = 0; 2027 u32 ack_psn; 2028 int diff; 2029 struct rvt_dev_info *rdi; 2030 2031 lockdep_assert_held(&qp->s_lock); 2032 /* 2033 * Note that NAKs implicitly ACK outstanding SEND and RDMA write 2034 * requests and implicitly NAK RDMA read and atomic requests issued 2035 * before the NAK'ed request. The MSN won't include the NAK'ed 2036 * request but will include an ACK'ed request(s). 2037 */ 2038 ack_psn = psn; 2039 if (aeth >> IB_AETH_NAK_SHIFT) 2040 ack_psn--; 2041 wqe = rvt_get_swqe_ptr(qp, qp->s_acked); 2042 ibp = rcd_to_iport(rcd); 2043 2044 /* 2045 * The MSN might be for a later WQE than the PSN indicates so 2046 * only complete WQEs that the PSN finishes. 2047 */ 2048 while ((diff = delta_psn(ack_psn, wqe->lpsn)) >= 0) { 2049 /* 2050 * RDMA_READ_RESPONSE_ONLY is a special case since 2051 * we want to generate completion events for everything 2052 * before the RDMA read, copy the data, then generate 2053 * the completion for the read. 2054 */ 2055 if (wqe->wr.opcode == IB_WR_RDMA_READ && 2056 opcode == OP(RDMA_READ_RESPONSE_ONLY) && 2057 diff == 0) { 2058 ret = 1; 2059 goto bail_stop; 2060 } 2061 /* 2062 * If this request is a RDMA read or atomic, and the ACK is 2063 * for a later operation, this ACK NAKs the RDMA read or 2064 * atomic. In other words, only a RDMA_READ_LAST or ONLY 2065 * can ACK a RDMA read and likewise for atomic ops. Note 2066 * that the NAK case can only happen if relaxed ordering is 2067 * used and requests are sent after an RDMA read or atomic 2068 * is sent but before the response is received. 2069 */ 2070 if ((wqe->wr.opcode == IB_WR_RDMA_READ && 2071 (opcode != OP(RDMA_READ_RESPONSE_LAST) || diff != 0)) || 2072 (wqe->wr.opcode == IB_WR_TID_RDMA_READ && 2073 (opcode != TID_OP(READ_RESP) || diff != 0)) || 2074 ((wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP || 2075 wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) && 2076 (opcode != OP(ATOMIC_ACKNOWLEDGE) || diff != 0)) || 2077 (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE && 2078 (delta_psn(psn, qp->s_last_psn) != 1))) { 2079 set_restart_qp(qp, rcd); 2080 /* 2081 * No need to process the ACK/NAK since we are 2082 * restarting an earlier request. 2083 */ 2084 goto bail_stop; 2085 } 2086 if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP || 2087 wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) { 2088 u64 *vaddr = wqe->sg_list[0].vaddr; 2089 *vaddr = val; 2090 } 2091 if (wqe->wr.opcode == IB_WR_OPFN) 2092 opfn_conn_reply(qp, val); 2093 2094 if (qp->s_num_rd_atomic && 2095 (wqe->wr.opcode == IB_WR_RDMA_READ || 2096 wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP || 2097 wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)) { 2098 qp->s_num_rd_atomic--; 2099 /* Restart sending task if fence is complete */ 2100 if ((qp->s_flags & RVT_S_WAIT_FENCE) && 2101 !qp->s_num_rd_atomic) { 2102 qp->s_flags &= ~(RVT_S_WAIT_FENCE | 2103 RVT_S_WAIT_ACK); 2104 hfi1_schedule_send(qp); 2105 } else if (qp->s_flags & RVT_S_WAIT_RDMAR) { 2106 qp->s_flags &= ~(RVT_S_WAIT_RDMAR | 2107 RVT_S_WAIT_ACK); 2108 hfi1_schedule_send(qp); 2109 } 2110 } 2111 2112 /* 2113 * TID RDMA WRITE requests will be completed by the TID RDMA 2114 * ACK packet handler (see tid_rdma.c). 
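 * The break below therefore leaves the WQE in place rather than running
 * do_rc_completion() on it.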
2115 */ 2116 if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE) 2117 break; 2118 2119 wqe = do_rc_completion(qp, wqe, ibp); 2120 if (qp->s_acked == qp->s_tail) 2121 break; 2122 } 2123 2124 trace_hfi1_rc_ack_do(qp, aeth, psn, wqe); 2125 trace_hfi1_sender_do_rc_ack(qp); 2126 switch (aeth >> IB_AETH_NAK_SHIFT) { 2127 case 0: /* ACK */ 2128 this_cpu_inc(*ibp->rvp.rc_acks); 2129 if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) { 2130 if (wqe_to_tid_req(wqe)->ack_pending) 2131 rvt_mod_retry_timer_ext(qp, 2132 qpriv->timeout_shift); 2133 else 2134 rvt_stop_rc_timers(qp); 2135 } else if (qp->s_acked != qp->s_tail) { 2136 struct rvt_swqe *__w = NULL; 2137 2138 if (qpriv->s_tid_cur != HFI1_QP_WQE_INVALID) 2139 __w = rvt_get_swqe_ptr(qp, qpriv->s_tid_cur); 2140 2141 /* 2142 * Stop timers if we've received all of the TID RDMA 2143 * WRITE * responses. 2144 */ 2145 if (__w && __w->wr.opcode == IB_WR_TID_RDMA_WRITE && 2146 opcode == TID_OP(WRITE_RESP)) { 2147 /* 2148 * Normally, the loop above would correctly 2149 * process all WQEs from s_acked onward and 2150 * either complete them or check for correct 2151 * PSN sequencing. 2152 * However, for TID RDMA, due to pipelining, 2153 * the response may not be for the request at 2154 * s_acked so the above look would just be 2155 * skipped. This does not allow for checking 2156 * the PSN sequencing. It has to be done 2157 * separately. 2158 */ 2159 if (cmp_psn(psn, qp->s_last_psn + 1)) { 2160 set_restart_qp(qp, rcd); 2161 goto bail_stop; 2162 } 2163 /* 2164 * If the psn is being resent, stop the 2165 * resending. 2166 */ 2167 if (qp->s_cur != qp->s_tail && 2168 cmp_psn(qp->s_psn, psn) <= 0) 2169 update_qp_retry_state(qp, psn, 2170 __w->psn, 2171 __w->lpsn); 2172 else if (--qpriv->pending_tid_w_resp) 2173 rvt_mod_retry_timer(qp); 2174 else 2175 rvt_stop_rc_timers(qp); 2176 } else { 2177 /* 2178 * We are expecting more ACKs so 2179 * mod the retry timer. 2180 */ 2181 rvt_mod_retry_timer(qp); 2182 /* 2183 * We can stop re-sending the earlier packets 2184 * and continue with the next packet the 2185 * receiver wants. 2186 */ 2187 if (cmp_psn(qp->s_psn, psn) <= 0) 2188 reset_psn(qp, psn + 1); 2189 } 2190 } else { 2191 /* No more acks - kill all timers */ 2192 rvt_stop_rc_timers(qp); 2193 if (cmp_psn(qp->s_psn, psn) <= 0) { 2194 qp->s_state = OP(SEND_LAST); 2195 qp->s_psn = psn + 1; 2196 } 2197 } 2198 if (qp->s_flags & RVT_S_WAIT_ACK) { 2199 qp->s_flags &= ~RVT_S_WAIT_ACK; 2200 hfi1_schedule_send(qp); 2201 } 2202 rvt_get_credit(qp, aeth); 2203 qp->s_rnr_retry = qp->s_rnr_retry_cnt; 2204 qp->s_retry = qp->s_retry_cnt; 2205 /* 2206 * If the current request is a TID RDMA WRITE request and the 2207 * response is not a TID RDMA WRITE RESP packet, s_last_psn 2208 * can't be advanced. 2209 */ 2210 if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE && 2211 opcode != TID_OP(WRITE_RESP) && 2212 cmp_psn(psn, wqe->psn) >= 0) 2213 return 1; 2214 update_last_psn(qp, psn); 2215 return 1; 2216 2217 case 1: /* RNR NAK */ 2218 ibp->rvp.n_rnr_naks++; 2219 if (qp->s_acked == qp->s_tail) 2220 goto bail_stop; 2221 if (qp->s_flags & RVT_S_WAIT_RNR) 2222 goto bail_stop; 2223 rdi = ib_to_rvt(qp->ibqp.device); 2224 if (qp->s_rnr_retry == 0 && 2225 !((rdi->post_parms[wqe->wr.opcode].flags & 2226 RVT_OPERATION_IGN_RNR_CNT) && 2227 qp->s_rnr_retry_cnt == 0)) { 2228 status = IB_WC_RNR_RETRY_EXC_ERR; 2229 goto class_b; 2230 } 2231 if (qp->s_rnr_retry_cnt < 7 && qp->s_rnr_retry_cnt > 0) 2232 qp->s_rnr_retry--; 2233 2234 /* 2235 * The last valid PSN is the previous PSN. 
For TID RDMA WRITE 2236 * request, s_last_psn should be incremented only when a TID 2237 * RDMA WRITE RESP is received to avoid skipping lost TID RDMA 2238 * WRITE RESP packets. 2239 */ 2240 if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE) { 2241 reset_psn(qp, qp->s_last_psn + 1); 2242 } else { 2243 update_last_psn(qp, psn - 1); 2244 reset_psn(qp, psn); 2245 } 2246 2247 ibp->rvp.n_rc_resends += delta_psn(qp->s_psn, psn); 2248 qp->s_flags &= ~(RVT_S_WAIT_SSN_CREDIT | RVT_S_WAIT_ACK); 2249 rvt_stop_rc_timers(qp); 2250 rvt_add_rnr_timer(qp, aeth); 2251 return 0; 2252 2253 case 3: /* NAK */ 2254 if (qp->s_acked == qp->s_tail) 2255 goto bail_stop; 2256 /* The last valid PSN is the previous PSN. */ 2257 update_last_psn(qp, psn - 1); 2258 switch ((aeth >> IB_AETH_CREDIT_SHIFT) & 2259 IB_AETH_CREDIT_MASK) { 2260 case 0: /* PSN sequence error */ 2261 ibp->rvp.n_seq_naks++; 2262 /* 2263 * Back up to the responder's expected PSN. 2264 * Note that we might get a NAK in the middle of an 2265 * RDMA READ response which terminates the RDMA 2266 * READ. 2267 */ 2268 hfi1_restart_rc(qp, psn, 0); 2269 hfi1_schedule_send(qp); 2270 break; 2271 2272 case 1: /* Invalid Request */ 2273 status = IB_WC_REM_INV_REQ_ERR; 2274 ibp->rvp.n_other_naks++; 2275 goto class_b; 2276 2277 case 2: /* Remote Access Error */ 2278 status = IB_WC_REM_ACCESS_ERR; 2279 ibp->rvp.n_other_naks++; 2280 goto class_b; 2281 2282 case 3: /* Remote Operation Error */ 2283 status = IB_WC_REM_OP_ERR; 2284 ibp->rvp.n_other_naks++; 2285 class_b: 2286 if (qp->s_last == qp->s_acked) { 2287 if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) 2288 hfi1_kern_read_tid_flow_free(qp); 2289 2290 hfi1_trdma_send_complete(qp, wqe, status); 2291 rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR); 2292 } 2293 break; 2294 2295 default: 2296 /* Ignore other reserved NAK error codes */ 2297 goto reserved; 2298 } 2299 qp->s_retry = qp->s_retry_cnt; 2300 qp->s_rnr_retry = qp->s_rnr_retry_cnt; 2301 goto bail_stop; 2302 2303 default: /* 2: reserved */ 2304 reserved: 2305 /* Ignore reserved NAK codes. */ 2306 goto bail_stop; 2307 } 2308 /* cannot be reached */ 2309 bail_stop: 2310 rvt_stop_rc_timers(qp); 2311 return ret; 2312 } 2313 2314 /* 2315 * We have seen an out of sequence RDMA read middle or last packet. 2316 * This ACKs SENDs and RDMA writes up to the first RDMA read or atomic SWQE. 2317 */ 2318 static void rdma_seq_err(struct rvt_qp *qp, struct hfi1_ibport *ibp, u32 psn, 2319 struct hfi1_ctxtdata *rcd) 2320 { 2321 struct rvt_swqe *wqe; 2322 2323 lockdep_assert_held(&qp->s_lock); 2324 /* Remove QP from retry timer */ 2325 rvt_stop_rc_timers(qp); 2326 2327 wqe = rvt_get_swqe_ptr(qp, qp->s_acked); 2328 2329 while (cmp_psn(psn, wqe->lpsn) > 0) { 2330 if (wqe->wr.opcode == IB_WR_RDMA_READ || 2331 wqe->wr.opcode == IB_WR_TID_RDMA_READ || 2332 wqe->wr.opcode == IB_WR_TID_RDMA_WRITE || 2333 wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP || 2334 wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) 2335 break; 2336 wqe = do_rc_completion(qp, wqe, ibp); 2337 } 2338 2339 ibp->rvp.n_rdma_seq++; 2340 qp->r_flags |= RVT_R_RDMAR_SEQ; 2341 hfi1_restart_rc(qp, qp->s_last_psn + 1, 0); 2342 if (list_empty(&qp->rspwait)) { 2343 qp->r_flags |= RVT_R_RSP_SEND; 2344 rvt_get_qp(qp); 2345 list_add_tail(&qp->rspwait, &rcd->qp_wait_list); 2346 } 2347 } 2348 2349 /** 2350 * rc_rcv_resp - process an incoming RC response packet 2351 * @packet: data packet information 2352 * 2353 * This is called from hfi1_rc_rcv() to process an incoming RC response 2354 * packet for the given QP. 2355 * Called at interrupt level. 
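 * Responses with a PSN at or beyond s_next_psn, or at or behind
 * s_last_psn, are ignored (aside from crediting duplicate "ghost" ACKs).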
2356 */ 2357 static void rc_rcv_resp(struct hfi1_packet *packet) 2358 { 2359 struct hfi1_ctxtdata *rcd = packet->rcd; 2360 void *data = packet->payload; 2361 u32 tlen = packet->tlen; 2362 struct rvt_qp *qp = packet->qp; 2363 struct hfi1_ibport *ibp; 2364 struct ib_other_headers *ohdr = packet->ohdr; 2365 struct rvt_swqe *wqe; 2366 enum ib_wc_status status; 2367 unsigned long flags; 2368 int diff; 2369 u64 val; 2370 u32 aeth; 2371 u32 psn = ib_bth_get_psn(packet->ohdr); 2372 u32 pmtu = qp->pmtu; 2373 u16 hdrsize = packet->hlen; 2374 u8 opcode = packet->opcode; 2375 u8 pad = packet->pad; 2376 u8 extra_bytes = pad + packet->extra_byte + (SIZE_OF_CRC << 2); 2377 2378 spin_lock_irqsave(&qp->s_lock, flags); 2379 trace_hfi1_ack(qp, psn); 2380 2381 /* Ignore invalid responses. */ 2382 if (cmp_psn(psn, READ_ONCE(qp->s_next_psn)) >= 0) 2383 goto ack_done; 2384 2385 /* Ignore duplicate responses. */ 2386 diff = cmp_psn(psn, qp->s_last_psn); 2387 if (unlikely(diff <= 0)) { 2388 /* Update credits for "ghost" ACKs */ 2389 if (diff == 0 && opcode == OP(ACKNOWLEDGE)) { 2390 aeth = be32_to_cpu(ohdr->u.aeth); 2391 if ((aeth >> IB_AETH_NAK_SHIFT) == 0) 2392 rvt_get_credit(qp, aeth); 2393 } 2394 goto ack_done; 2395 } 2396 2397 /* 2398 * Skip everything other than the PSN we expect, if we are waiting 2399 * for a reply to a restarted RDMA read or atomic op. 2400 */ 2401 if (qp->r_flags & RVT_R_RDMAR_SEQ) { 2402 if (cmp_psn(psn, qp->s_last_psn + 1) != 0) 2403 goto ack_done; 2404 qp->r_flags &= ~RVT_R_RDMAR_SEQ; 2405 } 2406 2407 if (unlikely(qp->s_acked == qp->s_tail)) 2408 goto ack_done; 2409 wqe = rvt_get_swqe_ptr(qp, qp->s_acked); 2410 status = IB_WC_SUCCESS; 2411 2412 switch (opcode) { 2413 case OP(ACKNOWLEDGE): 2414 case OP(ATOMIC_ACKNOWLEDGE): 2415 case OP(RDMA_READ_RESPONSE_FIRST): 2416 aeth = be32_to_cpu(ohdr->u.aeth); 2417 if (opcode == OP(ATOMIC_ACKNOWLEDGE)) 2418 val = ib_u64_get(&ohdr->u.at.atomic_ack_eth); 2419 else 2420 val = 0; 2421 if (!do_rc_ack(qp, aeth, psn, opcode, val, rcd) || 2422 opcode != OP(RDMA_READ_RESPONSE_FIRST)) 2423 goto ack_done; 2424 wqe = rvt_get_swqe_ptr(qp, qp->s_acked); 2425 if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ)) 2426 goto ack_op_err; 2427 /* 2428 * If this is a response to a resent RDMA read, we 2429 * have to be careful to copy the data to the right 2430 * location. 2431 */ 2432 qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge, 2433 wqe, psn, pmtu); 2434 goto read_middle; 2435 2436 case OP(RDMA_READ_RESPONSE_MIDDLE): 2437 /* no AETH, no ACK */ 2438 if (unlikely(cmp_psn(psn, qp->s_last_psn + 1))) 2439 goto ack_seq_err; 2440 if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ)) 2441 goto ack_op_err; 2442 read_middle: 2443 if (unlikely(tlen != (hdrsize + pmtu + extra_bytes))) 2444 goto ack_len_err; 2445 if (unlikely(pmtu >= qp->s_rdma_read_len)) 2446 goto ack_len_err; 2447 2448 /* 2449 * We got a response so update the timeout. 2450 * 4.096 usec. * (1 << qp->timeout) 2451 */ 2452 rvt_mod_retry_timer(qp); 2453 if (qp->s_flags & RVT_S_WAIT_ACK) { 2454 qp->s_flags &= ~RVT_S_WAIT_ACK; 2455 hfi1_schedule_send(qp); 2456 } 2457 2458 if (opcode == OP(RDMA_READ_RESPONSE_MIDDLE)) 2459 qp->s_retry = qp->s_retry_cnt; 2460 2461 /* 2462 * Update the RDMA receive state but do the copy w/o 2463 * holding the locks and blocking interrupts. 
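 * s_rdma_read_len and s_last_psn are updated while s_lock is still held;
 * the rvt_copy_sge() of the payload then runs after the unlock below.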
2464 */ 2465 qp->s_rdma_read_len -= pmtu; 2466 update_last_psn(qp, psn); 2467 spin_unlock_irqrestore(&qp->s_lock, flags); 2468 rvt_copy_sge(qp, &qp->s_rdma_read_sge, 2469 data, pmtu, false, false); 2470 goto bail; 2471 2472 case OP(RDMA_READ_RESPONSE_ONLY): 2473 aeth = be32_to_cpu(ohdr->u.aeth); 2474 if (!do_rc_ack(qp, aeth, psn, opcode, 0, rcd)) 2475 goto ack_done; 2476 /* 2477 * Check that the data size is >= 0 && <= pmtu. 2478 * Remember to account for ICRC (4). 2479 */ 2480 if (unlikely(tlen < (hdrsize + extra_bytes))) 2481 goto ack_len_err; 2482 /* 2483 * If this is a response to a resent RDMA read, we 2484 * have to be careful to copy the data to the right 2485 * location. 2486 */ 2487 wqe = rvt_get_swqe_ptr(qp, qp->s_acked); 2488 qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge, 2489 wqe, psn, pmtu); 2490 goto read_last; 2491 2492 case OP(RDMA_READ_RESPONSE_LAST): 2493 /* ACKs READ req. */ 2494 if (unlikely(cmp_psn(psn, qp->s_last_psn + 1))) 2495 goto ack_seq_err; 2496 if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ)) 2497 goto ack_op_err; 2498 /* 2499 * Check that the data size is >= 1 && <= pmtu. 2500 * Remember to account for ICRC (4). 2501 */ 2502 if (unlikely(tlen <= (hdrsize + extra_bytes))) 2503 goto ack_len_err; 2504 read_last: 2505 tlen -= hdrsize + extra_bytes; 2506 if (unlikely(tlen != qp->s_rdma_read_len)) 2507 goto ack_len_err; 2508 aeth = be32_to_cpu(ohdr->u.aeth); 2509 rvt_copy_sge(qp, &qp->s_rdma_read_sge, 2510 data, tlen, false, false); 2511 WARN_ON(qp->s_rdma_read_sge.num_sge); 2512 (void)do_rc_ack(qp, aeth, psn, 2513 OP(RDMA_READ_RESPONSE_LAST), 0, rcd); 2514 goto ack_done; 2515 } 2516 2517 ack_op_err: 2518 status = IB_WC_LOC_QP_OP_ERR; 2519 goto ack_err; 2520 2521 ack_seq_err: 2522 ibp = rcd_to_iport(rcd); 2523 rdma_seq_err(qp, ibp, psn, rcd); 2524 goto ack_done; 2525 2526 ack_len_err: 2527 status = IB_WC_LOC_LEN_ERR; 2528 ack_err: 2529 if (qp->s_last == qp->s_acked) { 2530 rvt_send_complete(qp, wqe, status); 2531 rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR); 2532 } 2533 ack_done: 2534 spin_unlock_irqrestore(&qp->s_lock, flags); 2535 bail: 2536 return; 2537 } 2538 2539 static inline void rc_cancel_ack(struct rvt_qp *qp) 2540 { 2541 qp->r_adefered = 0; 2542 if (list_empty(&qp->rspwait)) 2543 return; 2544 list_del_init(&qp->rspwait); 2545 qp->r_flags &= ~RVT_R_RSP_NAK; 2546 rvt_put_qp(qp); 2547 } 2548 2549 /** 2550 * rc_rcv_error - process an incoming duplicate or error RC packet 2551 * @ohdr: the other headers for this packet 2552 * @data: the packet data 2553 * @qp: the QP for this packet 2554 * @opcode: the opcode for this packet 2555 * @psn: the packet sequence number for this packet 2556 * @diff: the difference between the PSN and the expected PSN 2557 * 2558 * This is called from hfi1_rc_rcv() to process an unexpected 2559 * incoming RC packet for the given QP. 2560 * Called at interrupt level. 2561 * Return 1 if no more processing is needed; otherwise return 0 to 2562 * schedule a response to be sent. 2563 */ 2564 static noinline int rc_rcv_error(struct ib_other_headers *ohdr, void *data, 2565 struct rvt_qp *qp, u32 opcode, u32 psn, 2566 int diff, struct hfi1_ctxtdata *rcd) 2567 { 2568 struct hfi1_ibport *ibp = rcd_to_iport(rcd); 2569 struct rvt_ack_entry *e; 2570 unsigned long flags; 2571 u8 prev; 2572 u8 mra; /* most recent ACK */ 2573 bool old_req; 2574 2575 trace_hfi1_rcv_error(qp, psn); 2576 if (diff > 0) { 2577 /* 2578 * Packet sequence error. 2579 * A NAK will ACK earlier sends and RDMA writes. 2580 * Don't queue the NAK if we already sent one. 
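 * A non-zero r_nak_state marks that a NAK is already queued; it is
 * cleared again once the expected PSN arrives (see hfi1_rc_rcv()).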
2581 */ 2582 if (!qp->r_nak_state) { 2583 ibp->rvp.n_rc_seqnak++; 2584 qp->r_nak_state = IB_NAK_PSN_ERROR; 2585 /* Use the expected PSN. */ 2586 qp->r_ack_psn = qp->r_psn; 2587 /* 2588 * Wait to send the sequence NAK until all packets 2589 * in the receive queue have been processed. 2590 * Otherwise, we end up propagating congestion. 2591 */ 2592 rc_defered_ack(rcd, qp); 2593 } 2594 goto done; 2595 } 2596 2597 /* 2598 * Handle a duplicate request. Don't re-execute SEND, RDMA 2599 * write or atomic op. Don't NAK errors, just silently drop 2600 * the duplicate request. Note that r_sge, r_len, and 2601 * r_rcv_len may be in use so don't modify them. 2602 * 2603 * We are supposed to ACK the earliest duplicate PSN but we 2604 * can coalesce an outstanding duplicate ACK. We have to 2605 * send the earliest so that RDMA reads can be restarted at 2606 * the requester's expected PSN. 2607 * 2608 * First, find where this duplicate PSN falls within the 2609 * ACKs previously sent. 2610 * old_req is true if there is an older response that is scheduled 2611 * to be sent before sending this one. 2612 */ 2613 e = NULL; 2614 old_req = 1; 2615 ibp->rvp.n_rc_dupreq++; 2616 2617 spin_lock_irqsave(&qp->s_lock, flags); 2618 2619 e = find_prev_entry(qp, psn, &prev, &mra, &old_req); 2620 2621 switch (opcode) { 2622 case OP(RDMA_READ_REQUEST): { 2623 struct ib_reth *reth; 2624 u32 offset; 2625 u32 len; 2626 2627 /* 2628 * If we didn't find the RDMA read request in the ack queue, 2629 * we can ignore this request. 2630 */ 2631 if (!e || e->opcode != OP(RDMA_READ_REQUEST)) 2632 goto unlock_done; 2633 /* RETH comes after BTH */ 2634 reth = &ohdr->u.rc.reth; 2635 /* 2636 * Address range must be a subset of the original 2637 * request and start on pmtu boundaries. 2638 * We reuse the old ack_queue slot since the requester 2639 * should not back up and request an earlier PSN for the 2640 * same request. 2641 */ 2642 offset = delta_psn(psn, e->psn) * qp->pmtu; 2643 len = be32_to_cpu(reth->length); 2644 if (unlikely(offset + len != e->rdma_sge.sge_length)) 2645 goto unlock_done; 2646 if (e->rdma_sge.mr) { 2647 rvt_put_mr(e->rdma_sge.mr); 2648 e->rdma_sge.mr = NULL; 2649 } 2650 if (len != 0) { 2651 u32 rkey = be32_to_cpu(reth->rkey); 2652 u64 vaddr = get_ib_reth_vaddr(reth); 2653 int ok; 2654 2655 ok = rvt_rkey_ok(qp, &e->rdma_sge, len, vaddr, rkey, 2656 IB_ACCESS_REMOTE_READ); 2657 if (unlikely(!ok)) 2658 goto unlock_done; 2659 } else { 2660 e->rdma_sge.vaddr = NULL; 2661 e->rdma_sge.length = 0; 2662 e->rdma_sge.sge_length = 0; 2663 } 2664 e->psn = psn; 2665 if (old_req) 2666 goto unlock_done; 2667 if (qp->s_acked_ack_queue == qp->s_tail_ack_queue) 2668 qp->s_acked_ack_queue = prev; 2669 qp->s_tail_ack_queue = prev; 2670 break; 2671 } 2672 2673 case OP(COMPARE_SWAP): 2674 case OP(FETCH_ADD): { 2675 /* 2676 * If we didn't find the atomic request in the ack queue 2677 * or the send engine is already backed up to send an 2678 * earlier entry, we can ignore this request. 2679 */ 2680 if (!e || e->opcode != (u8)opcode || old_req) 2681 goto unlock_done; 2682 if (qp->s_tail_ack_queue == qp->s_acked_ack_queue) 2683 qp->s_acked_ack_queue = prev; 2684 qp->s_tail_ack_queue = prev; 2685 break; 2686 } 2687 2688 default: 2689 /* 2690 * Ignore this operation if it doesn't request an ACK 2691 * or an earlier RDMA read or atomic is going to be resent. 
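 * Otherwise either resend the most recent ACK (when this duplicate comes
 * after all queued reads/atomics) or back the ack queue up to the entry
 * that covers it.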
2692 */ 2693 if (!(psn & IB_BTH_REQ_ACK) || old_req) 2694 goto unlock_done; 2695 /* 2696 * Resend the most recent ACK if this request is 2697 * after all the previous RDMA reads and atomics. 2698 */ 2699 if (mra == qp->r_head_ack_queue) { 2700 spin_unlock_irqrestore(&qp->s_lock, flags); 2701 qp->r_nak_state = 0; 2702 qp->r_ack_psn = qp->r_psn - 1; 2703 goto send_ack; 2704 } 2705 2706 /* 2707 * Resend the RDMA read or atomic op which 2708 * ACKs this duplicate request. 2709 */ 2710 if (qp->s_tail_ack_queue == qp->s_acked_ack_queue) 2711 qp->s_acked_ack_queue = mra; 2712 qp->s_tail_ack_queue = mra; 2713 break; 2714 } 2715 qp->s_ack_state = OP(ACKNOWLEDGE); 2716 qp->s_flags |= RVT_S_RESP_PENDING; 2717 qp->r_nak_state = 0; 2718 hfi1_schedule_send(qp); 2719 2720 unlock_done: 2721 spin_unlock_irqrestore(&qp->s_lock, flags); 2722 done: 2723 return 1; 2724 2725 send_ack: 2726 return 0; 2727 } 2728 2729 static void log_cca_event(struct hfi1_pportdata *ppd, u8 sl, u32 rlid, 2730 u32 lqpn, u32 rqpn, u8 svc_type) 2731 { 2732 struct opa_hfi1_cong_log_event_internal *cc_event; 2733 unsigned long flags; 2734 2735 if (sl >= OPA_MAX_SLS) 2736 return; 2737 2738 spin_lock_irqsave(&ppd->cc_log_lock, flags); 2739 2740 ppd->threshold_cong_event_map[sl / 8] |= 1 << (sl % 8); 2741 ppd->threshold_event_counter++; 2742 2743 cc_event = &ppd->cc_events[ppd->cc_log_idx++]; 2744 if (ppd->cc_log_idx == OPA_CONG_LOG_ELEMS) 2745 ppd->cc_log_idx = 0; 2746 cc_event->lqpn = lqpn & RVT_QPN_MASK; 2747 cc_event->rqpn = rqpn & RVT_QPN_MASK; 2748 cc_event->sl = sl; 2749 cc_event->svc_type = svc_type; 2750 cc_event->rlid = rlid; 2751 /* keep timestamp in units of 1.024 usec */ 2752 cc_event->timestamp = ktime_get_ns() / 1024; 2753 2754 spin_unlock_irqrestore(&ppd->cc_log_lock, flags); 2755 } 2756 2757 void process_becn(struct hfi1_pportdata *ppd, u8 sl, u32 rlid, u32 lqpn, 2758 u32 rqpn, u8 svc_type) 2759 { 2760 struct cca_timer *cca_timer; 2761 u16 ccti, ccti_incr, ccti_timer, ccti_limit; 2762 u8 trigger_threshold; 2763 struct cc_state *cc_state; 2764 unsigned long flags; 2765 2766 if (sl >= OPA_MAX_SLS) 2767 return; 2768 2769 cc_state = get_cc_state(ppd); 2770 2771 if (!cc_state) 2772 return; 2773 2774 /* 2775 * 1) increase CCTI (for this SL) 2776 * 2) select IPG (i.e., call set_link_ipg()) 2777 * 3) start timer 2778 */ 2779 ccti_limit = cc_state->cct.ccti_limit; 2780 ccti_incr = cc_state->cong_setting.entries[sl].ccti_increase; 2781 ccti_timer = cc_state->cong_setting.entries[sl].ccti_timer; 2782 trigger_threshold = 2783 cc_state->cong_setting.entries[sl].trigger_threshold; 2784 2785 spin_lock_irqsave(&ppd->cca_timer_lock, flags); 2786 2787 cca_timer = &ppd->cca_timer[sl]; 2788 if (cca_timer->ccti < ccti_limit) { 2789 if (cca_timer->ccti + ccti_incr <= ccti_limit) 2790 cca_timer->ccti += ccti_incr; 2791 else 2792 cca_timer->ccti = ccti_limit; 2793 set_link_ipg(ppd); 2794 } 2795 2796 ccti = cca_timer->ccti; 2797 2798 if (!hrtimer_active(&cca_timer->hrtimer)) { 2799 /* ccti_timer is in units of 1.024 usec */ 2800 unsigned long nsec = 1024 * ccti_timer; 2801 2802 hrtimer_start(&cca_timer->hrtimer, ns_to_ktime(nsec), 2803 HRTIMER_MODE_REL_PINNED); 2804 } 2805 2806 spin_unlock_irqrestore(&ppd->cca_timer_lock, flags); 2807 2808 if ((trigger_threshold != 0) && (ccti >= trigger_threshold)) 2809 log_cca_event(ppd, sl, rlid, lqpn, rqpn, svc_type); 2810 } 2811 2812 /** 2813 * hfi1_rc_rcv - process an incoming RC packet 2814 * @packet: data packet information 2815 * 2816 * This is called from qp_rcv() to process an incoming RC packet 2817 
* for the given QP. 2818 * May be called at interrupt level. 2819 */ 2820 void hfi1_rc_rcv(struct hfi1_packet *packet) 2821 { 2822 struct hfi1_ctxtdata *rcd = packet->rcd; 2823 void *data = packet->payload; 2824 u32 tlen = packet->tlen; 2825 struct rvt_qp *qp = packet->qp; 2826 struct hfi1_qp_priv *qpriv = qp->priv; 2827 struct hfi1_ibport *ibp = rcd_to_iport(rcd); 2828 struct ib_other_headers *ohdr = packet->ohdr; 2829 u32 opcode = packet->opcode; 2830 u32 hdrsize = packet->hlen; 2831 u32 psn = ib_bth_get_psn(packet->ohdr); 2832 u32 pad = packet->pad; 2833 struct ib_wc wc; 2834 u32 pmtu = qp->pmtu; 2835 int diff; 2836 struct ib_reth *reth; 2837 unsigned long flags; 2838 int ret; 2839 bool copy_last = false, fecn; 2840 u32 rkey; 2841 u8 extra_bytes = pad + packet->extra_byte + (SIZE_OF_CRC << 2); 2842 2843 lockdep_assert_held(&qp->r_lock); 2844 2845 if (hfi1_ruc_check_hdr(ibp, packet)) 2846 return; 2847 2848 fecn = process_ecn(qp, packet); 2849 opfn_trigger_conn_request(qp, be32_to_cpu(ohdr->bth[1])); 2850 2851 /* 2852 * Process responses (ACKs) before anything else. Note that the 2853 * packet sequence number will be for something in the send work 2854 * queue rather than the expected receive packet sequence number. 2855 * In other words, this QP is the requester. 2856 */ 2857 if (opcode >= OP(RDMA_READ_RESPONSE_FIRST) && 2858 opcode <= OP(ATOMIC_ACKNOWLEDGE)) { 2859 rc_rcv_resp(packet); 2860 return; 2861 } 2862 2863 /* Compute 24 bits worth of difference. */ 2864 diff = delta_psn(psn, qp->r_psn); 2865 if (unlikely(diff)) { 2866 if (rc_rcv_error(ohdr, data, qp, opcode, psn, diff, rcd)) 2867 return; 2868 goto send_ack; 2869 } 2870 2871 /* Check for opcode sequence errors. */ 2872 switch (qp->r_state) { 2873 case OP(SEND_FIRST): 2874 case OP(SEND_MIDDLE): 2875 if (opcode == OP(SEND_MIDDLE) || 2876 opcode == OP(SEND_LAST) || 2877 opcode == OP(SEND_LAST_WITH_IMMEDIATE) || 2878 opcode == OP(SEND_LAST_WITH_INVALIDATE)) 2879 break; 2880 goto nack_inv; 2881 2882 case OP(RDMA_WRITE_FIRST): 2883 case OP(RDMA_WRITE_MIDDLE): 2884 if (opcode == OP(RDMA_WRITE_MIDDLE) || 2885 opcode == OP(RDMA_WRITE_LAST) || 2886 opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE)) 2887 break; 2888 goto nack_inv; 2889 2890 default: 2891 if (opcode == OP(SEND_MIDDLE) || 2892 opcode == OP(SEND_LAST) || 2893 opcode == OP(SEND_LAST_WITH_IMMEDIATE) || 2894 opcode == OP(SEND_LAST_WITH_INVALIDATE) || 2895 opcode == OP(RDMA_WRITE_MIDDLE) || 2896 opcode == OP(RDMA_WRITE_LAST) || 2897 opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE)) 2898 goto nack_inv; 2899 /* 2900 * Note that it is up to the requester to not send a new 2901 * RDMA read or atomic operation before receiving an ACK 2902 * for the previous operation. 2903 */ 2904 break; 2905 } 2906 2907 if (qp->state == IB_QPS_RTR && !(qp->r_flags & RVT_R_COMM_EST)) 2908 rvt_comm_est(qp); 2909 2910 /* OK, process the packet. */ 2911 switch (opcode) { 2912 case OP(SEND_FIRST): 2913 ret = rvt_get_rwqe(qp, false); 2914 if (ret < 0) 2915 goto nack_op_err; 2916 if (!ret) 2917 goto rnr_nak; 2918 qp->r_rcv_len = 0; 2919 /* FALLTHROUGH */ 2920 case OP(SEND_MIDDLE): 2921 case OP(RDMA_WRITE_MIDDLE): 2922 send_middle: 2923 /* Check for invalid length PMTU or posted rwqe len. 
*/ 2924 /* 2925 * There will be no padding for 9B packet but 16B packets 2926 * will come in with some padding since we always add 2927 * CRC and LT bytes which will need to be flit aligned 2928 */ 2929 if (unlikely(tlen != (hdrsize + pmtu + extra_bytes))) 2930 goto nack_inv; 2931 qp->r_rcv_len += pmtu; 2932 if (unlikely(qp->r_rcv_len > qp->r_len)) 2933 goto nack_inv; 2934 rvt_copy_sge(qp, &qp->r_sge, data, pmtu, true, false); 2935 break; 2936 2937 case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE): 2938 /* consume RWQE */ 2939 ret = rvt_get_rwqe(qp, true); 2940 if (ret < 0) 2941 goto nack_op_err; 2942 if (!ret) 2943 goto rnr_nak; 2944 goto send_last_imm; 2945 2946 case OP(SEND_ONLY): 2947 case OP(SEND_ONLY_WITH_IMMEDIATE): 2948 case OP(SEND_ONLY_WITH_INVALIDATE): 2949 ret = rvt_get_rwqe(qp, false); 2950 if (ret < 0) 2951 goto nack_op_err; 2952 if (!ret) 2953 goto rnr_nak; 2954 qp->r_rcv_len = 0; 2955 if (opcode == OP(SEND_ONLY)) 2956 goto no_immediate_data; 2957 if (opcode == OP(SEND_ONLY_WITH_INVALIDATE)) 2958 goto send_last_inv; 2959 /* FALLTHROUGH -- for SEND_ONLY_WITH_IMMEDIATE */ 2960 case OP(SEND_LAST_WITH_IMMEDIATE): 2961 send_last_imm: 2962 wc.ex.imm_data = ohdr->u.imm_data; 2963 wc.wc_flags = IB_WC_WITH_IMM; 2964 goto send_last; 2965 case OP(SEND_LAST_WITH_INVALIDATE): 2966 send_last_inv: 2967 rkey = be32_to_cpu(ohdr->u.ieth); 2968 if (rvt_invalidate_rkey(qp, rkey)) 2969 goto no_immediate_data; 2970 wc.ex.invalidate_rkey = rkey; 2971 wc.wc_flags = IB_WC_WITH_INVALIDATE; 2972 goto send_last; 2973 case OP(RDMA_WRITE_LAST): 2974 copy_last = rvt_is_user_qp(qp); 2975 /* fall through */ 2976 case OP(SEND_LAST): 2977 no_immediate_data: 2978 wc.wc_flags = 0; 2979 wc.ex.imm_data = 0; 2980 send_last: 2981 /* Check for invalid length. */ 2982 /* LAST len should be >= 1 */ 2983 if (unlikely(tlen < (hdrsize + extra_bytes))) 2984 goto nack_inv; 2985 /* Don't count the CRC(and padding and LT byte for 16B). */ 2986 tlen -= (hdrsize + extra_bytes); 2987 wc.byte_len = tlen + qp->r_rcv_len; 2988 if (unlikely(wc.byte_len > qp->r_len)) 2989 goto nack_inv; 2990 rvt_copy_sge(qp, &qp->r_sge, data, tlen, true, copy_last); 2991 rvt_put_ss(&qp->r_sge); 2992 qp->r_msn++; 2993 if (!__test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags)) 2994 break; 2995 wc.wr_id = qp->r_wr_id; 2996 wc.status = IB_WC_SUCCESS; 2997 if (opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE) || 2998 opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE)) 2999 wc.opcode = IB_WC_RECV_RDMA_WITH_IMM; 3000 else 3001 wc.opcode = IB_WC_RECV; 3002 wc.qp = &qp->ibqp; 3003 wc.src_qp = qp->remote_qpn; 3004 wc.slid = rdma_ah_get_dlid(&qp->remote_ah_attr) & U16_MAX; 3005 /* 3006 * It seems that IB mandates the presence of an SL in a 3007 * work completion only for the UD transport (see section 3008 * 11.4.2 of IBTA Vol. 1). 3009 * 3010 * However, the way the SL is chosen below is consistent 3011 * with the way that IB/qib works and is trying avoid 3012 * introducing incompatibilities. 3013 * 3014 * See also OPA Vol. 1, section 9.7.6, and table 9-17. 3015 */ 3016 wc.sl = rdma_ah_get_sl(&qp->remote_ah_attr); 3017 /* zero fields that are N/A */ 3018 wc.vendor_err = 0; 3019 wc.pkey_index = 0; 3020 wc.dlid_path_bits = 0; 3021 wc.port_num = 0; 3022 /* Signal completion event if the solicited bit is set. 
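 * ib_bth_is_solicited() extracts the BTH solicited-event bit, which
 * rvt_cq_enter() uses when deciding whether to raise a completion event
 * for the receive CQ.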
*/ 3023 rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, 3024 ib_bth_is_solicited(ohdr)); 3025 break; 3026 3027 case OP(RDMA_WRITE_ONLY): 3028 copy_last = rvt_is_user_qp(qp); 3029 /* fall through */ 3030 case OP(RDMA_WRITE_FIRST): 3031 case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE): 3032 if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE))) 3033 goto nack_inv; 3034 /* consume RWQE */ 3035 reth = &ohdr->u.rc.reth; 3036 qp->r_len = be32_to_cpu(reth->length); 3037 qp->r_rcv_len = 0; 3038 qp->r_sge.sg_list = NULL; 3039 if (qp->r_len != 0) { 3040 u32 rkey = be32_to_cpu(reth->rkey); 3041 u64 vaddr = get_ib_reth_vaddr(reth); 3042 int ok; 3043 3044 /* Check rkey & NAK */ 3045 ok = rvt_rkey_ok(qp, &qp->r_sge.sge, qp->r_len, vaddr, 3046 rkey, IB_ACCESS_REMOTE_WRITE); 3047 if (unlikely(!ok)) 3048 goto nack_acc; 3049 qp->r_sge.num_sge = 1; 3050 } else { 3051 qp->r_sge.num_sge = 0; 3052 qp->r_sge.sge.mr = NULL; 3053 qp->r_sge.sge.vaddr = NULL; 3054 qp->r_sge.sge.length = 0; 3055 qp->r_sge.sge.sge_length = 0; 3056 } 3057 if (opcode == OP(RDMA_WRITE_FIRST)) 3058 goto send_middle; 3059 else if (opcode == OP(RDMA_WRITE_ONLY)) 3060 goto no_immediate_data; 3061 ret = rvt_get_rwqe(qp, true); 3062 if (ret < 0) 3063 goto nack_op_err; 3064 if (!ret) { 3065 /* peer will send again */ 3066 rvt_put_ss(&qp->r_sge); 3067 goto rnr_nak; 3068 } 3069 wc.ex.imm_data = ohdr->u.rc.imm_data; 3070 wc.wc_flags = IB_WC_WITH_IMM; 3071 goto send_last; 3072 3073 case OP(RDMA_READ_REQUEST): { 3074 struct rvt_ack_entry *e; 3075 u32 len; 3076 u8 next; 3077 3078 if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ))) 3079 goto nack_inv; 3080 next = qp->r_head_ack_queue + 1; 3081 /* s_ack_queue is size rvt_size_atomic()+1 so use > not >= */ 3082 if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device))) 3083 next = 0; 3084 spin_lock_irqsave(&qp->s_lock, flags); 3085 if (unlikely(next == qp->s_acked_ack_queue)) { 3086 if (!qp->s_ack_queue[next].sent) 3087 goto nack_inv_unlck; 3088 update_ack_queue(qp, next); 3089 } 3090 e = &qp->s_ack_queue[qp->r_head_ack_queue]; 3091 if (e->rdma_sge.mr) { 3092 rvt_put_mr(e->rdma_sge.mr); 3093 e->rdma_sge.mr = NULL; 3094 } 3095 reth = &ohdr->u.rc.reth; 3096 len = be32_to_cpu(reth->length); 3097 if (len) { 3098 u32 rkey = be32_to_cpu(reth->rkey); 3099 u64 vaddr = get_ib_reth_vaddr(reth); 3100 int ok; 3101 3102 /* Check rkey & NAK */ 3103 ok = rvt_rkey_ok(qp, &e->rdma_sge, len, vaddr, 3104 rkey, IB_ACCESS_REMOTE_READ); 3105 if (unlikely(!ok)) 3106 goto nack_acc_unlck; 3107 /* 3108 * Update the next expected PSN. We add 1 later 3109 * below, so only add the remainder here. 3110 */ 3111 qp->r_psn += rvt_div_mtu(qp, len - 1); 3112 } else { 3113 e->rdma_sge.mr = NULL; 3114 e->rdma_sge.vaddr = NULL; 3115 e->rdma_sge.length = 0; 3116 e->rdma_sge.sge_length = 0; 3117 } 3118 e->opcode = opcode; 3119 e->sent = 0; 3120 e->psn = psn; 3121 e->lpsn = qp->r_psn; 3122 /* 3123 * We need to increment the MSN here instead of when we 3124 * finish sending the result since a duplicate request would 3125 * increment it more than once. 3126 */ 3127 qp->r_msn++; 3128 qp->r_psn++; 3129 qp->r_state = opcode; 3130 qp->r_nak_state = 0; 3131 qp->r_head_ack_queue = next; 3132 qpriv->r_tid_alloc = qp->r_head_ack_queue; 3133 3134 /* Schedule the send engine. 
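 * RVT_S_RESP_PENDING marks the ack queue entry set up above so that the
 * send engine builds the RDMA read response (see make_rc_ack()).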
*/ 3135 qp->s_flags |= RVT_S_RESP_PENDING; 3136 if (fecn) 3137 qp->s_flags |= RVT_S_ECN; 3138 hfi1_schedule_send(qp); 3139 3140 spin_unlock_irqrestore(&qp->s_lock, flags); 3141 return; 3142 } 3143 3144 case OP(COMPARE_SWAP): 3145 case OP(FETCH_ADD): { 3146 struct ib_atomic_eth *ateth = &ohdr->u.atomic_eth; 3147 u64 vaddr = get_ib_ateth_vaddr(ateth); 3148 bool opfn = opcode == OP(COMPARE_SWAP) && 3149 vaddr == HFI1_VERBS_E_ATOMIC_VADDR; 3150 struct rvt_ack_entry *e; 3151 atomic64_t *maddr; 3152 u64 sdata; 3153 u32 rkey; 3154 u8 next; 3155 3156 if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC) && 3157 !opfn)) 3158 goto nack_inv; 3159 next = qp->r_head_ack_queue + 1; 3160 if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device))) 3161 next = 0; 3162 spin_lock_irqsave(&qp->s_lock, flags); 3163 if (unlikely(next == qp->s_acked_ack_queue)) { 3164 if (!qp->s_ack_queue[next].sent) 3165 goto nack_inv_unlck; 3166 update_ack_queue(qp, next); 3167 } 3168 e = &qp->s_ack_queue[qp->r_head_ack_queue]; 3169 if (e->rdma_sge.mr) { 3170 rvt_put_mr(e->rdma_sge.mr); 3171 e->rdma_sge.mr = NULL; 3172 } 3173 /* Process OPFN special virtual address */ 3174 if (opfn) { 3175 opfn_conn_response(qp, e, ateth); 3176 goto ack; 3177 } 3178 if (unlikely(vaddr & (sizeof(u64) - 1))) 3179 goto nack_inv_unlck; 3180 rkey = be32_to_cpu(ateth->rkey); 3181 /* Check rkey & NAK */ 3182 if (unlikely(!rvt_rkey_ok(qp, &qp->r_sge.sge, sizeof(u64), 3183 vaddr, rkey, 3184 IB_ACCESS_REMOTE_ATOMIC))) 3185 goto nack_acc_unlck; 3186 /* Perform atomic OP and save result. */ 3187 maddr = (atomic64_t *)qp->r_sge.sge.vaddr; 3188 sdata = get_ib_ateth_swap(ateth); 3189 e->atomic_data = (opcode == OP(FETCH_ADD)) ? 3190 (u64)atomic64_add_return(sdata, maddr) - sdata : 3191 (u64)cmpxchg((u64 *)qp->r_sge.sge.vaddr, 3192 get_ib_ateth_compare(ateth), 3193 sdata); 3194 rvt_put_mr(qp->r_sge.sge.mr); 3195 qp->r_sge.num_sge = 0; 3196 ack: 3197 e->opcode = opcode; 3198 e->sent = 0; 3199 e->psn = psn; 3200 e->lpsn = psn; 3201 qp->r_msn++; 3202 qp->r_psn++; 3203 qp->r_state = opcode; 3204 qp->r_nak_state = 0; 3205 qp->r_head_ack_queue = next; 3206 qpriv->r_tid_alloc = qp->r_head_ack_queue; 3207 3208 /* Schedule the send engine. */ 3209 qp->s_flags |= RVT_S_RESP_PENDING; 3210 if (fecn) 3211 qp->s_flags |= RVT_S_ECN; 3212 hfi1_schedule_send(qp); 3213 3214 spin_unlock_irqrestore(&qp->s_lock, flags); 3215 return; 3216 } 3217 3218 default: 3219 /* NAK unknown opcodes. */ 3220 goto nack_inv; 3221 } 3222 qp->r_psn++; 3223 qp->r_state = opcode; 3224 qp->r_ack_psn = psn; 3225 qp->r_nak_state = 0; 3226 /* Send an ACK if requested or required. 
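 * ACK coalescing: when packet->numpkt is zero, a FECN is present, or
 * HFI1_PSN_CREDIT deferred packets have accumulated, an immediate ACK is
 * sent; otherwise the ACK is deferred via r_adefered and rc_defered_ack().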
*/ 3227 if (psn & IB_BTH_REQ_ACK || fecn) { 3228 if (packet->numpkt == 0 || fecn || 3229 qp->r_adefered >= HFI1_PSN_CREDIT) { 3230 rc_cancel_ack(qp); 3231 goto send_ack; 3232 } 3233 qp->r_adefered++; 3234 rc_defered_ack(rcd, qp); 3235 } 3236 return; 3237 3238 rnr_nak: 3239 qp->r_nak_state = qp->r_min_rnr_timer | IB_RNR_NAK; 3240 qp->r_ack_psn = qp->r_psn; 3241 /* Queue RNR NAK for later */ 3242 rc_defered_ack(rcd, qp); 3243 return; 3244 3245 nack_op_err: 3246 rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR); 3247 qp->r_nak_state = IB_NAK_REMOTE_OPERATIONAL_ERROR; 3248 qp->r_ack_psn = qp->r_psn; 3249 /* Queue NAK for later */ 3250 rc_defered_ack(rcd, qp); 3251 return; 3252 3253 nack_inv_unlck: 3254 spin_unlock_irqrestore(&qp->s_lock, flags); 3255 nack_inv: 3256 rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR); 3257 qp->r_nak_state = IB_NAK_INVALID_REQUEST; 3258 qp->r_ack_psn = qp->r_psn; 3259 /* Queue NAK for later */ 3260 rc_defered_ack(rcd, qp); 3261 return; 3262 3263 nack_acc_unlck: 3264 spin_unlock_irqrestore(&qp->s_lock, flags); 3265 nack_acc: 3266 rvt_rc_error(qp, IB_WC_LOC_PROT_ERR); 3267 qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR; 3268 qp->r_ack_psn = qp->r_psn; 3269 send_ack: 3270 hfi1_send_rc_ack(packet, fecn); 3271 } 3272 3273 void hfi1_rc_hdrerr( 3274 struct hfi1_ctxtdata *rcd, 3275 struct hfi1_packet *packet, 3276 struct rvt_qp *qp) 3277 { 3278 struct hfi1_ibport *ibp = rcd_to_iport(rcd); 3279 int diff; 3280 u32 opcode; 3281 u32 psn; 3282 3283 if (hfi1_ruc_check_hdr(ibp, packet)) 3284 return; 3285 3286 psn = ib_bth_get_psn(packet->ohdr); 3287 opcode = ib_bth_get_opcode(packet->ohdr); 3288 3289 /* Only deal with RDMA Writes for now */ 3290 if (opcode < IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST) { 3291 diff = delta_psn(psn, qp->r_psn); 3292 if (!qp->r_nak_state && diff >= 0) { 3293 ibp->rvp.n_rc_seqnak++; 3294 qp->r_nak_state = IB_NAK_PSN_ERROR; 3295 /* Use the expected PSN. */ 3296 qp->r_ack_psn = qp->r_psn; 3297 /* 3298 * Wait to send the sequence 3299 * NAK until all packets 3300 * in the receive queue have 3301 * been processed. 3302 * Otherwise, we end up 3303 * propagating congestion. 3304 */ 3305 rc_defered_ack(rcd, qp); 3306 } /* Out of sequence NAK */ 3307 } /* QP Request NAKs */ 3308 } 3309