/*
 * Copyright(c) 2015, 2016 Intel Corporation.
 *
 * This file is provided under a dual BSD/GPLv2 license.  When using or
 * redistributing this file, you may do so under either license.
 *
 * GPL LICENSE SUMMARY
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * BSD LICENSE
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *  - Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  - Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *  - Neither the name of Intel Corporation nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */

#include <linux/io.h>
#include <rdma/rdma_vt.h>
#include <rdma/rdmavt_qp.h>

#include "hfi.h"
#include "qp.h"
#include "verbs_txreq.h"
#include "trace.h"

/* cut down ridiculously long IB macro names */
#define OP(x) RC_OP(x)

static u32 restart_sge(struct rvt_sge_state *ss, struct rvt_swqe *wqe,
		       u32 psn, u32 pmtu)
{
	u32 len;

	len = delta_psn(psn, wqe->psn) * pmtu;
	ss->sge = wqe->sg_list[0];
	ss->sg_list = wqe->sg_list + 1;
	ss->num_sge = wqe->wr.num_sge;
	ss->total_len = wqe->length;
	rvt_skip_sge(ss, len, false);
	return wqe->length - len;
}
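
/*
 * For example, with a 4096-byte pmtu, restarting three packets into a
 * 16384-byte send skips delta_psn() * pmtu = 3 * 4096 = 12288 bytes of
 * the SGE list and leaves 4096 bytes still to be (re)sent.
 */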

/**
 * make_rc_ack - construct a response packet (ACK, NAK, or RDMA read)
 * @dev: the device for this QP
 * @qp: a pointer to the QP
 * @ohdr: a pointer to the IB header being constructed
 * @ps: the xmit packet state
 *
 * Return 1 if constructed; otherwise, return 0.
 * Note that we are in the responder's side of the QP context.
 * Note the QP s_lock must be held.
 */
static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp,
		       struct ib_other_headers *ohdr,
		       struct hfi1_pkt_state *ps)
{
	struct rvt_ack_entry *e;
	u32 hwords;
	u32 len;
	u32 bth0;
	u32 bth2;
	int middle = 0;
	u32 pmtu = qp->pmtu;
	struct hfi1_qp_priv *priv = qp->priv;

	lockdep_assert_held(&qp->s_lock);
	/* Don't send an ACK if we aren't supposed to. */
	if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK))
		goto bail;

	if (priv->hdr_type == HFI1_PKT_TYPE_9B)
		/* header size in 32-bit words LRH+BTH = (8+12)/4. */
		hwords = 5;
	else
		/* header size in 32-bit words 16B LRH+BTH = (16+12)/4. */
		hwords = 7;

	switch (qp->s_ack_state) {
	case OP(RDMA_READ_RESPONSE_LAST):
	case OP(RDMA_READ_RESPONSE_ONLY):
		e = &qp->s_ack_queue[qp->s_tail_ack_queue];
		if (e->rdma_sge.mr) {
			rvt_put_mr(e->rdma_sge.mr);
			e->rdma_sge.mr = NULL;
		}
		/* FALLTHROUGH */
	case OP(ATOMIC_ACKNOWLEDGE):
		/*
		 * We can increment the tail pointer now that the last
		 * response has been sent instead of only being
		 * constructed.
		 */
		if (++qp->s_tail_ack_queue > HFI1_MAX_RDMA_ATOMIC)
			qp->s_tail_ack_queue = 0;
		/* FALLTHROUGH */
	case OP(SEND_ONLY):
	case OP(ACKNOWLEDGE):
		/* Check for no next entry in the queue. */
		if (qp->r_head_ack_queue == qp->s_tail_ack_queue) {
			if (qp->s_flags & RVT_S_ACK_PENDING)
				goto normal;
			goto bail;
		}

		e = &qp->s_ack_queue[qp->s_tail_ack_queue];
		if (e->opcode == OP(RDMA_READ_REQUEST)) {
			/*
			 * If a RDMA read response is being resent and
			 * we haven't seen the duplicate request yet,
			 * then stop sending the remaining responses the
			 * responder has seen until the requester re-sends it.
			 */
			len = e->rdma_sge.sge_length;
			if (len && !e->rdma_sge.mr) {
				qp->s_tail_ack_queue = qp->r_head_ack_queue;
				goto bail;
			}
			/* Copy SGE state in case we need to resend */
			ps->s_txreq->mr = e->rdma_sge.mr;
			if (ps->s_txreq->mr)
				rvt_get_mr(ps->s_txreq->mr);
			qp->s_ack_rdma_sge.sge = e->rdma_sge;
			qp->s_ack_rdma_sge.num_sge = 1;
			ps->s_txreq->ss = &qp->s_ack_rdma_sge;
			if (len > pmtu) {
				len = pmtu;
				qp->s_ack_state = OP(RDMA_READ_RESPONSE_FIRST);
			} else {
				qp->s_ack_state = OP(RDMA_READ_RESPONSE_ONLY);
				e->sent = 1;
			}
			ohdr->u.aeth = rvt_compute_aeth(qp);
			hwords++;
			qp->s_ack_rdma_psn = e->psn;
			bth2 = mask_psn(qp->s_ack_rdma_psn++);
		} else {
			/* COMPARE_SWAP or FETCH_ADD */
			ps->s_txreq->ss = NULL;
			len = 0;
			qp->s_ack_state = OP(ATOMIC_ACKNOWLEDGE);
			ohdr->u.at.aeth = rvt_compute_aeth(qp);
			ib_u64_put(e->atomic_data, &ohdr->u.at.atomic_ack_eth);
			hwords += sizeof(ohdr->u.at) / sizeof(u32);
			bth2 = mask_psn(e->psn);
			e->sent = 1;
		}
		bth0 = qp->s_ack_state << 24;
		break;

	case OP(RDMA_READ_RESPONSE_FIRST):
		qp->s_ack_state = OP(RDMA_READ_RESPONSE_MIDDLE);
		/* FALLTHROUGH */
	case OP(RDMA_READ_RESPONSE_MIDDLE):
		ps->s_txreq->ss = &qp->s_ack_rdma_sge;
		ps->s_txreq->mr = qp->s_ack_rdma_sge.sge.mr;
		if (ps->s_txreq->mr)
			rvt_get_mr(ps->s_txreq->mr);
		len = qp->s_ack_rdma_sge.sge.sge_length;
		if (len > pmtu) {
			len = pmtu;
			middle = HFI1_CAP_IS_KSET(SDMA_AHG);
		} else {
			ohdr->u.aeth = rvt_compute_aeth(qp);
			hwords++;
			qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST);
			e = &qp->s_ack_queue[qp->s_tail_ack_queue];
			e->sent = 1;
		}
		bth0 = qp->s_ack_state << 24;
		bth2 = mask_psn(qp->s_ack_rdma_psn++);
		break;

	default:
normal:
		/*
		 * Send a regular ACK.
		 * Set the s_ack_state so we wait until after sending
		 * the ACK before setting s_ack_state to ACKNOWLEDGE
		 * (see above).
		 */
		qp->s_ack_state = OP(SEND_ONLY);
		qp->s_flags &= ~RVT_S_ACK_PENDING;
		ps->s_txreq->ss = NULL;
		if (qp->s_nak_state)
			ohdr->u.aeth =
				cpu_to_be32((qp->r_msn & IB_MSN_MASK) |
					    (qp->s_nak_state <<
					     IB_AETH_CREDIT_SHIFT));
		else
			ohdr->u.aeth = rvt_compute_aeth(qp);
		hwords++;
		len = 0;
		bth0 = OP(ACKNOWLEDGE) << 24;
		bth2 = mask_psn(qp->s_ack_psn);
	}
	qp->s_rdma_ack_cnt++;
	qp->s_hdrwords = hwords;
	ps->s_txreq->sde = priv->s_sde;
	ps->s_txreq->s_cur_size = len;
	hfi1_make_ruc_header(qp, ohdr, bth0, bth2, middle, ps);
	/* pbc */
	ps->s_txreq->hdr_dwords = qp->s_hdrwords + 2;
	return 1;

bail:
	qp->s_ack_state = OP(ACKNOWLEDGE);
	/*
	 * Ensure s_rdma_ack_cnt changes are committed prior to resetting
	 * RVT_S_RESP_PENDING
	 */
	smp_wmb();
	qp->s_flags &= ~(RVT_S_RESP_PENDING
				| RVT_S_ACK_PENDING
				| RVT_S_AHG_VALID);
	return 0;
}

/**
 * hfi1_make_rc_req - construct a request packet (SEND, RDMA r/w, ATOMIC)
 * @qp: a pointer to the QP
 * @ps: the current packet state
 *
 * Assumes s_lock is held.
 *
 * Return 1 if constructed; otherwise, return 0.
 */
int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
{
	struct hfi1_qp_priv *priv = qp->priv;
	struct hfi1_ibdev *dev = to_idev(qp->ibqp.device);
	struct ib_other_headers *ohdr;
	struct rvt_sge_state *ss;
	struct rvt_swqe *wqe;
	u32 hwords;
	u32 len;
	u32 bth0 = 0;
	u32 bth2;
	u32 pmtu = qp->pmtu;
	char newreq;
	int middle = 0;
	int delta;

	lockdep_assert_held(&qp->s_lock);
	ps->s_txreq = get_txreq(ps->dev, qp);
	if (IS_ERR(ps->s_txreq))
		goto bail_no_tx;

	if (priv->hdr_type == HFI1_PKT_TYPE_9B) {
		/* header size in 32-bit words LRH+BTH = (8+12)/4. */
		hwords = 5;
		if (rdma_ah_get_ah_flags(&qp->remote_ah_attr) & IB_AH_GRH)
			ohdr = &ps->s_txreq->phdr.hdr.ibh.u.l.oth;
		else
			ohdr = &ps->s_txreq->phdr.hdr.ibh.u.oth;
	} else {
		/* header size in 32-bit words 16B LRH+BTH = (16+12)/4. */
		hwords = 7;
		if ((rdma_ah_get_ah_flags(&qp->remote_ah_attr) & IB_AH_GRH) &&
		    (hfi1_check_mcast(rdma_ah_get_dlid(&qp->remote_ah_attr))))
			ohdr = &ps->s_txreq->phdr.hdr.opah.u.l.oth;
		else
			ohdr = &ps->s_txreq->phdr.hdr.opah.u.oth;
	}

	/* Sending responses has higher priority than sending requests. */
	if ((qp->s_flags & RVT_S_RESP_PENDING) &&
	    make_rc_ack(dev, qp, ohdr, ps))
		return 1;

	if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_SEND_OK)) {
		if (!(ib_rvt_state_ops[qp->state] & RVT_FLUSH_SEND))
			goto bail;
		/* We are in the error state, flush the work request. */
		smp_read_barrier_depends(); /* see post_one_send() */
		if (qp->s_last == READ_ONCE(qp->s_head))
			goto bail;
		/* If DMAs are in progress, we can't flush immediately. */
		if (iowait_sdma_pending(&priv->s_iowait)) {
			qp->s_flags |= RVT_S_WAIT_DMA;
			goto bail;
		}
		clear_ahg(qp);
		wqe = rvt_get_swqe_ptr(qp, qp->s_last);
		hfi1_send_complete(qp, wqe, qp->s_last != qp->s_acked ?
				   IB_WC_SUCCESS : IB_WC_WR_FLUSH_ERR);
		/* will get called again */
		goto done_free_tx;
	}

	if (qp->s_flags & (RVT_S_WAIT_RNR | RVT_S_WAIT_ACK))
		goto bail;

	if (cmp_psn(qp->s_psn, qp->s_sending_hpsn) <= 0) {
		if (cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) <= 0) {
			qp->s_flags |= RVT_S_WAIT_PSN;
			goto bail;
		}
		qp->s_sending_psn = qp->s_psn;
		qp->s_sending_hpsn = qp->s_psn - 1;
	}

	/* Send a request. */
	wqe = rvt_get_swqe_ptr(qp, qp->s_cur);
	switch (qp->s_state) {
	default:
		if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_NEXT_SEND_OK))
			goto bail;
		/*
		 * Resend an old request or start a new one.
		 *
		 * We keep track of the current SWQE so that
		 * we don't reset the "furthest progress" state
		 * if we need to back up.
		 */
		newreq = 0;
		if (qp->s_cur == qp->s_tail) {
			/* Check if send work queue is empty. */
			smp_read_barrier_depends(); /* see post_one_send() */
			if (qp->s_tail == READ_ONCE(qp->s_head)) {
				clear_ahg(qp);
				goto bail;
			}
			/*
			 * If a fence is requested, wait for previous
			 * RDMA read and atomic operations to finish.
			 */
			if ((wqe->wr.send_flags & IB_SEND_FENCE) &&
			    qp->s_num_rd_atomic) {
				qp->s_flags |= RVT_S_WAIT_FENCE;
				goto bail;
			}
			/*
			 * Local operations are processed immediately
			 * after all prior requests have completed
			 */
			if (wqe->wr.opcode == IB_WR_REG_MR ||
			    wqe->wr.opcode == IB_WR_LOCAL_INV) {
				int local_ops = 0;
				int err = 0;

				if (qp->s_last != qp->s_cur)
					goto bail;
				if (++qp->s_cur == qp->s_size)
					qp->s_cur = 0;
				if (++qp->s_tail == qp->s_size)
					qp->s_tail = 0;
				if (!(wqe->wr.send_flags &
				      RVT_SEND_COMPLETION_ONLY)) {
					err = rvt_invalidate_rkey(
						qp,
						wqe->wr.ex.invalidate_rkey);
					local_ops = 1;
				}
				hfi1_send_complete(qp, wqe,
						   err ? IB_WC_LOC_PROT_ERR
						       : IB_WC_SUCCESS);
				if (local_ops)
					atomic_dec(&qp->local_ops_pending);
				qp->s_hdrwords = 0;
				goto done_free_tx;
			}

			newreq = 1;
			qp->s_psn = wqe->psn;
		}
		/*
		 * Note that we have to be careful not to modify the
		 * original work request since we may need to resend
		 * it.
		 */
		len = wqe->length;
		ss = &qp->s_sge;
		bth2 = mask_psn(qp->s_psn);
		switch (wqe->wr.opcode) {
		case IB_WR_SEND:
		case IB_WR_SEND_WITH_IMM:
		case IB_WR_SEND_WITH_INV:
			/* If no credit, return. */
			if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT) &&
			    rvt_cmp_msn(wqe->ssn, qp->s_lsn + 1) > 0) {
				qp->s_flags |= RVT_S_WAIT_SSN_CREDIT;
				goto bail;
			}
			if (len > pmtu) {
				qp->s_state = OP(SEND_FIRST);
				len = pmtu;
				break;
			}
			if (wqe->wr.opcode == IB_WR_SEND) {
				qp->s_state = OP(SEND_ONLY);
			} else if (wqe->wr.opcode == IB_WR_SEND_WITH_IMM) {
				qp->s_state = OP(SEND_ONLY_WITH_IMMEDIATE);
				/* Immediate data comes after the BTH */
				ohdr->u.imm_data = wqe->wr.ex.imm_data;
				hwords += 1;
			} else {
				qp->s_state = OP(SEND_ONLY_WITH_INVALIDATE);
				/* Invalidate rkey comes after the BTH */
				ohdr->u.ieth = cpu_to_be32(
						wqe->wr.ex.invalidate_rkey);
				hwords += 1;
			}
			if (wqe->wr.send_flags & IB_SEND_SOLICITED)
				bth0 |= IB_BTH_SOLICITED;
			bth2 |= IB_BTH_REQ_ACK;
			if (++qp->s_cur == qp->s_size)
				qp->s_cur = 0;
			break;

		case IB_WR_RDMA_WRITE:
			if (newreq && !(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
				qp->s_lsn++;
			goto no_flow_control;
		case IB_WR_RDMA_WRITE_WITH_IMM:
			/* If no credit, return. */
			if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT) &&
			    rvt_cmp_msn(wqe->ssn, qp->s_lsn + 1) > 0) {
				qp->s_flags |= RVT_S_WAIT_SSN_CREDIT;
				goto bail;
			}
no_flow_control:
			put_ib_reth_vaddr(
				wqe->rdma_wr.remote_addr,
				&ohdr->u.rc.reth);
			ohdr->u.rc.reth.rkey =
				cpu_to_be32(wqe->rdma_wr.rkey);
			ohdr->u.rc.reth.length = cpu_to_be32(len);
			hwords += sizeof(struct ib_reth) / sizeof(u32);
			if (len > pmtu) {
				qp->s_state = OP(RDMA_WRITE_FIRST);
				len = pmtu;
				break;
			}
			if (wqe->wr.opcode == IB_WR_RDMA_WRITE) {
				qp->s_state = OP(RDMA_WRITE_ONLY);
			} else {
				qp->s_state =
					OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE);
				/* Immediate data comes after RETH */
				ohdr->u.rc.imm_data = wqe->wr.ex.imm_data;
				hwords += 1;
				if (wqe->wr.send_flags & IB_SEND_SOLICITED)
					bth0 |= IB_BTH_SOLICITED;
			}
			bth2 |= IB_BTH_REQ_ACK;
			if (++qp->s_cur == qp->s_size)
				qp->s_cur = 0;
			break;

		case IB_WR_RDMA_READ:
			/*
			 * Don't allow more operations to be started
			 * than the QP limits allow.
			 */
			if (newreq) {
				if (qp->s_num_rd_atomic >=
				    qp->s_max_rd_atomic) {
					qp->s_flags |= RVT_S_WAIT_RDMAR;
					goto bail;
				}
				qp->s_num_rd_atomic++;
				if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
					qp->s_lsn++;
			}
			put_ib_reth_vaddr(
				wqe->rdma_wr.remote_addr,
				&ohdr->u.rc.reth);
			ohdr->u.rc.reth.rkey =
				cpu_to_be32(wqe->rdma_wr.rkey);
			ohdr->u.rc.reth.length = cpu_to_be32(len);
			qp->s_state = OP(RDMA_READ_REQUEST);
			hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32);
			ss = NULL;
			len = 0;
			bth2 |= IB_BTH_REQ_ACK;
			if (++qp->s_cur == qp->s_size)
				qp->s_cur = 0;
			break;

		case IB_WR_ATOMIC_CMP_AND_SWP:
		case IB_WR_ATOMIC_FETCH_AND_ADD:
			/*
			 * Don't allow more operations to be started
			 * than the QP limits allow.
			 */
			if (newreq) {
				if (qp->s_num_rd_atomic >=
				    qp->s_max_rd_atomic) {
					qp->s_flags |= RVT_S_WAIT_RDMAR;
					goto bail;
				}
				qp->s_num_rd_atomic++;
				if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
					qp->s_lsn++;
			}
			if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP) {
				qp->s_state = OP(COMPARE_SWAP);
				put_ib_ateth_swap(wqe->atomic_wr.swap,
						  &ohdr->u.atomic_eth);
				put_ib_ateth_compare(wqe->atomic_wr.compare_add,
						     &ohdr->u.atomic_eth);
			} else {
				qp->s_state = OP(FETCH_ADD);
				put_ib_ateth_swap(wqe->atomic_wr.compare_add,
						  &ohdr->u.atomic_eth);
				put_ib_ateth_compare(0, &ohdr->u.atomic_eth);
			}
			put_ib_ateth_vaddr(wqe->atomic_wr.remote_addr,
					   &ohdr->u.atomic_eth);
			ohdr->u.atomic_eth.rkey = cpu_to_be32(
				wqe->atomic_wr.rkey);
			hwords += sizeof(struct ib_atomic_eth) / sizeof(u32);
			ss = NULL;
			len = 0;
			bth2 |= IB_BTH_REQ_ACK;
			if (++qp->s_cur == qp->s_size)
				qp->s_cur = 0;
			break;

		default:
			goto bail;
		}
		qp->s_sge.sge = wqe->sg_list[0];
		qp->s_sge.sg_list = wqe->sg_list + 1;
		qp->s_sge.num_sge = wqe->wr.num_sge;
		qp->s_sge.total_len = wqe->length;
		qp->s_len = wqe->length;
		if (newreq) {
			qp->s_tail++;
			if (qp->s_tail >= qp->s_size)
				qp->s_tail = 0;
		}
		if (wqe->wr.opcode == IB_WR_RDMA_READ)
			qp->s_psn = wqe->lpsn + 1;
		else
			qp->s_psn++;
		break;

	case OP(RDMA_READ_RESPONSE_FIRST):
		/*
		 * qp->s_state is normally set to the opcode of the
		 * last packet constructed for new requests and therefore
		 * is never set to RDMA read response.
		 * RDMA_READ_RESPONSE_FIRST is used by the ACK processing
		 * thread to indicate a SEND needs to be restarted from an
		 * earlier PSN without interfering with the sending thread.
		 * See restart_rc().
		 */
		qp->s_len = restart_sge(&qp->s_sge, wqe, qp->s_psn, pmtu);
		/* FALLTHROUGH */
	case OP(SEND_FIRST):
		qp->s_state = OP(SEND_MIDDLE);
		/* FALLTHROUGH */
	case OP(SEND_MIDDLE):
		bth2 = mask_psn(qp->s_psn++);
		ss = &qp->s_sge;
		len = qp->s_len;
		if (len > pmtu) {
			len = pmtu;
			middle = HFI1_CAP_IS_KSET(SDMA_AHG);
			break;
		}
		if (wqe->wr.opcode == IB_WR_SEND) {
			qp->s_state = OP(SEND_LAST);
		} else if (wqe->wr.opcode == IB_WR_SEND_WITH_IMM) {
			qp->s_state = OP(SEND_LAST_WITH_IMMEDIATE);
			/* Immediate data comes after the BTH */
			ohdr->u.imm_data = wqe->wr.ex.imm_data;
			hwords += 1;
		} else {
			qp->s_state = OP(SEND_LAST_WITH_INVALIDATE);
			/* invalidate data comes after the BTH */
			ohdr->u.ieth = cpu_to_be32(wqe->wr.ex.invalidate_rkey);
			hwords += 1;
		}
		if (wqe->wr.send_flags & IB_SEND_SOLICITED)
			bth0 |= IB_BTH_SOLICITED;
		bth2 |= IB_BTH_REQ_ACK;
		qp->s_cur++;
		if (qp->s_cur >= qp->s_size)
			qp->s_cur = 0;
		break;

	case OP(RDMA_READ_RESPONSE_LAST):
		/*
		 * qp->s_state is normally set to the opcode of the
		 * last packet constructed for new requests and therefore
		 * is never set to RDMA read response.
		 * RDMA_READ_RESPONSE_LAST is used by the ACK processing
		 * thread to indicate a RDMA write needs to be restarted from
		 * an earlier PSN without interfering with the sending thread.
		 * See restart_rc().
		 */
		qp->s_len = restart_sge(&qp->s_sge, wqe, qp->s_psn, pmtu);
		/* FALLTHROUGH */
	case OP(RDMA_WRITE_FIRST):
		qp->s_state = OP(RDMA_WRITE_MIDDLE);
		/* FALLTHROUGH */
	case OP(RDMA_WRITE_MIDDLE):
		bth2 = mask_psn(qp->s_psn++);
		ss = &qp->s_sge;
		len = qp->s_len;
		if (len > pmtu) {
			len = pmtu;
			middle = HFI1_CAP_IS_KSET(SDMA_AHG);
			break;
		}
		if (wqe->wr.opcode == IB_WR_RDMA_WRITE) {
			qp->s_state = OP(RDMA_WRITE_LAST);
		} else {
			qp->s_state = OP(RDMA_WRITE_LAST_WITH_IMMEDIATE);
			/* Immediate data comes after the BTH */
			ohdr->u.imm_data = wqe->wr.ex.imm_data;
			hwords += 1;
			if (wqe->wr.send_flags & IB_SEND_SOLICITED)
				bth0 |= IB_BTH_SOLICITED;
		}
		bth2 |= IB_BTH_REQ_ACK;
		qp->s_cur++;
		if (qp->s_cur >= qp->s_size)
			qp->s_cur = 0;
		break;

	case OP(RDMA_READ_RESPONSE_MIDDLE):
		/*
		 * qp->s_state is normally set to the opcode of the
		 * last packet constructed for new requests and therefore
		 * is never set to RDMA read response.
		 * RDMA_READ_RESPONSE_MIDDLE is used by the ACK processing
		 * thread to indicate a RDMA read needs to be restarted from
		 * an earlier PSN without interfering with the sending thread.
		 * See restart_rc().
		 */
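		/*
		 * A restarted RDMA READ is re-requested only for the
		 * remainder: the new RETH below starts at remote_addr + len
		 * and asks for wqe->length - len bytes, where len covers the
		 * packets already acknowledged
		 * (delta_psn(qp->s_psn, wqe->psn) * pmtu).
		 */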
		len = (delta_psn(qp->s_psn, wqe->psn)) * pmtu;
		put_ib_reth_vaddr(
			wqe->rdma_wr.remote_addr + len,
			&ohdr->u.rc.reth);
		ohdr->u.rc.reth.rkey =
			cpu_to_be32(wqe->rdma_wr.rkey);
		ohdr->u.rc.reth.length = cpu_to_be32(wqe->length - len);
		qp->s_state = OP(RDMA_READ_REQUEST);
		hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32);
		bth2 = mask_psn(qp->s_psn) | IB_BTH_REQ_ACK;
		qp->s_psn = wqe->lpsn + 1;
		ss = NULL;
		len = 0;
		qp->s_cur++;
		if (qp->s_cur == qp->s_size)
			qp->s_cur = 0;
		break;
	}
	qp->s_sending_hpsn = bth2;
	delta = delta_psn(bth2, wqe->psn);
	if (delta && delta % HFI1_PSN_CREDIT == 0)
		bth2 |= IB_BTH_REQ_ACK;
	if (qp->s_flags & RVT_S_SEND_ONE) {
		qp->s_flags &= ~RVT_S_SEND_ONE;
		qp->s_flags |= RVT_S_WAIT_ACK;
		bth2 |= IB_BTH_REQ_ACK;
	}
	qp->s_len -= len;
	qp->s_hdrwords = hwords;
	ps->s_txreq->sde = priv->s_sde;
	ps->s_txreq->ss = ss;
	ps->s_txreq->s_cur_size = len;
	hfi1_make_ruc_header(
		qp,
		ohdr,
		bth0 | (qp->s_state << 24),
		bth2,
		middle,
		ps);
	/* pbc */
	ps->s_txreq->hdr_dwords = qp->s_hdrwords + 2;
	return 1;

done_free_tx:
	hfi1_put_txreq(ps->s_txreq);
	ps->s_txreq = NULL;
	return 1;

bail:
	hfi1_put_txreq(ps->s_txreq);

bail_no_tx:
	ps->s_txreq = NULL;
	qp->s_flags &= ~RVT_S_BUSY;
	qp->s_hdrwords = 0;
	return 0;
}

static inline void hfi1_make_bth_aeth(struct rvt_qp *qp,
				      struct ib_other_headers *ohdr,
				      u32 bth0, u32 bth1)
{
	if (qp->r_nak_state)
		ohdr->u.aeth = cpu_to_be32((qp->r_msn & IB_MSN_MASK) |
					    (qp->r_nak_state <<
					     IB_AETH_CREDIT_SHIFT));
	else
		ohdr->u.aeth = rvt_compute_aeth(qp);

	ohdr->bth[0] = cpu_to_be32(bth0);
	ohdr->bth[1] = cpu_to_be32(bth1 | qp->remote_qpn);
	ohdr->bth[2] = cpu_to_be32(mask_psn(qp->r_ack_psn));
}

static inline void hfi1_queue_rc_ack(struct rvt_qp *qp, bool is_fecn)
{
	struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
	unsigned long flags;

	spin_lock_irqsave(&qp->s_lock, flags);
	if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK))
		goto unlock;
	this_cpu_inc(*ibp->rvp.rc_qacks);
	qp->s_flags |= RVT_S_ACK_PENDING | RVT_S_RESP_PENDING;
	qp->s_nak_state = qp->r_nak_state;
	qp->s_ack_psn = qp->r_ack_psn;
	if (is_fecn)
		qp->s_flags |= RVT_S_ECN;

	/* Schedule the send tasklet. */
	hfi1_schedule_send(qp);
unlock:
	spin_unlock_irqrestore(&qp->s_lock, flags);
}

static inline void hfi1_make_rc_ack_9B(struct rvt_qp *qp,
				       struct hfi1_opa_header *opa_hdr,
				       u8 sc5, bool is_fecn,
				       u64 *pbc_flags, u32 *hwords,
				       u32 *nwords)
{
	struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
	struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
	struct ib_header *hdr = &opa_hdr->ibh;
	struct ib_other_headers *ohdr;
	u16 lrh0 = HFI1_LRH_BTH;
	u16 pkey;
	u32 bth0, bth1;

	opa_hdr->hdr_type = HFI1_PKT_TYPE_9B;
	ohdr = &hdr->u.oth;
	/* header size in 32-bit words LRH+BTH+AETH = (8+12+4)/4 */
	*hwords = 6;

	if (unlikely(rdma_ah_get_ah_flags(&qp->remote_ah_attr) & IB_AH_GRH)) {
		*hwords += hfi1_make_grh(ibp, &hdr->u.l.grh,
					 rdma_ah_read_grh(&qp->remote_ah_attr),
					 *hwords - 2, SIZE_OF_CRC);
		ohdr = &hdr->u.l.oth;
		lrh0 = HFI1_LRH_GRH;
	}
	/* set PBC_DC_INFO bit (aka SC[4]) in pbc_flags */
	*pbc_flags |= ((!!(sc5 & 0x10)) << PBC_DC_INFO_SHIFT);

	/* read pkey_index w/o lock (it's atomic) */
	pkey = hfi1_get_pkey(ibp, qp->s_pkey_index);

	lrh0 |= (sc5 & IB_SC_MASK) << IB_SC_SHIFT |
		(rdma_ah_get_sl(&qp->remote_ah_attr) & IB_SL_MASK) <<
			IB_SL_SHIFT;

	hfi1_make_ib_hdr(hdr, lrh0, *hwords + SIZE_OF_CRC,
			 opa_get_lid(rdma_ah_get_dlid(&qp->remote_ah_attr), 9B),
			 ppd->lid | rdma_ah_get_path_bits(&qp->remote_ah_attr));

	bth0 = pkey | (OP(ACKNOWLEDGE) << 24);
	if (qp->s_mig_state == IB_MIG_MIGRATED)
		bth0 |= IB_BTH_MIG_REQ;
	bth1 = (!!is_fecn) << IB_BECN_SHIFT;
	hfi1_make_bth_aeth(qp, ohdr, bth0, bth1);
}

static inline void hfi1_make_rc_ack_16B(struct rvt_qp *qp,
					struct hfi1_opa_header *opa_hdr,
					u8 sc5, bool is_fecn,
					u64 *pbc_flags, u32 *hwords,
					u32 *nwords)
{
	struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
	struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
	struct hfi1_16b_header *hdr = &opa_hdr->opah;
	struct ib_other_headers *ohdr;
	u32 bth0, bth1 = 0;
	u16 len, pkey;
	u8 becn = !!is_fecn;
	u8 l4 = OPA_16B_L4_IB_LOCAL;
	u8 extra_bytes;

	opa_hdr->hdr_type = HFI1_PKT_TYPE_16B;
	ohdr = &hdr->u.oth;
	/* header size in 32-bit words 16B LRH+BTH+AETH = (16+12+4)/4 */
	*hwords = 8;
	extra_bytes = hfi1_get_16b_padding(*hwords << 2, 0);
	*nwords = SIZE_OF_CRC + ((extra_bytes + SIZE_OF_LT) >> 2);

	if (unlikely(rdma_ah_get_ah_flags(&qp->remote_ah_attr) & IB_AH_GRH) &&
	    hfi1_check_mcast(rdma_ah_get_dlid(&qp->remote_ah_attr))) {
		*hwords += hfi1_make_grh(ibp, &hdr->u.l.grh,
					 rdma_ah_read_grh(&qp->remote_ah_attr),
					 *hwords - 4, *nwords);
		ohdr = &hdr->u.l.oth;
		l4 = OPA_16B_L4_IB_GLOBAL;
	}
	*pbc_flags |= PBC_PACKET_BYPASS | PBC_INSERT_BYPASS_ICRC;

	/* read pkey_index w/o lock (it's atomic) */
	pkey = hfi1_get_pkey(ibp, qp->s_pkey_index);

	/* Convert dwords to flits */
	len = (*hwords + *nwords) >> 1;

	hfi1_make_16b_hdr(hdr,
			  ppd->lid | rdma_ah_get_path_bits(&qp->remote_ah_attr),
			  opa_get_lid(rdma_ah_get_dlid(&qp->remote_ah_attr),
				      16B),
			  len, pkey, becn, 0, l4, sc5);

	bth0 = pkey | (OP(ACKNOWLEDGE) << 24);
	bth0 |= extra_bytes << 20;
	if (qp->s_mig_state == IB_MIG_MIGRATED)
		bth1 = OPA_BTH_MIG_REQ;
	hfi1_make_bth_aeth(qp, ohdr, bth0, bth1);
}

typedef void (*hfi1_make_rc_ack)(struct rvt_qp *qp,
				 struct hfi1_opa_header *opa_hdr,
				 u8 sc5, bool is_fecn,
				 u64 *pbc_flags, u32 *hwords,
				 u32 *nwords);

/* We support only two types - 9B and 16B for now */
static const hfi1_make_rc_ack hfi1_make_rc_ack_tbl[2] = {
	[HFI1_PKT_TYPE_9B] = &hfi1_make_rc_ack_9B,
	[HFI1_PKT_TYPE_16B] = &hfi1_make_rc_ack_16B
};

/**
 * hfi1_send_rc_ack - Construct an ACK packet and send it
 * @qp: a pointer to the QP
 *
 * This is called from hfi1_rc_rcv() and handle_receive_interrupt().
 * Note that RDMA reads and atomics are handled in the
 * send side QP state and send engine.
 */
void hfi1_send_rc_ack(struct hfi1_ctxtdata *rcd,
		      struct rvt_qp *qp, bool is_fecn)
{
	struct hfi1_ibport *ibp = rcd_to_iport(rcd);
	struct hfi1_qp_priv *priv = qp->priv;
	struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
	u8 sc5 = ibp->sl_to_sc[rdma_ah_get_sl(&qp->remote_ah_attr)];
	u64 pbc, pbc_flags = 0;
	u32 hwords = 0;
	u32 nwords = 0;
	u32 plen;
	struct pio_buf *pbuf;
	struct hfi1_opa_header opa_hdr;

	/* clear the defer count */
	qp->r_adefered = 0;

	/* Don't send ACK or NAK if a RDMA read or atomic is pending. */
	if (qp->s_flags & RVT_S_RESP_PENDING) {
		hfi1_queue_rc_ack(qp, is_fecn);
		return;
	}

	/* Ensure s_rdma_ack_cnt changes are committed */
	smp_read_barrier_depends();
	if (qp->s_rdma_ack_cnt) {
		hfi1_queue_rc_ack(qp, is_fecn);
		return;
	}

	/* Don't try to send ACKs if the link isn't ACTIVE */
	if (driver_lstate(ppd) != IB_PORT_ACTIVE)
		return;

	/* Make the appropriate header */
	hfi1_make_rc_ack_tbl[priv->hdr_type](qp, &opa_hdr, sc5, is_fecn,
					     &pbc_flags, &hwords, &nwords);

	plen = 2 /* PBC */ + hwords + nwords;
	pbc = create_pbc(ppd, pbc_flags, qp->srate_mbps,
			 sc_to_vlt(ppd->dd, sc5), plen);
	pbuf = sc_buffer_alloc(rcd->sc, plen, NULL, NULL);
	if (!pbuf) {
		/*
		 * We have no room to send at the moment.  Pass
		 * responsibility for sending the ACK to the send engine
		 * so that when enough buffer space becomes available,
		 * the ACK is sent ahead of other outgoing packets.
		 */
		hfi1_queue_rc_ack(qp, is_fecn);
		return;
	}
	trace_ack_output_ibhdr(dd_from_ibdev(qp->ibqp.device),
			       &opa_hdr, ib_is_sc5(sc5));

	/* write the pbc and data */
	ppd->dd->pio_inline_send(ppd->dd, pbuf, pbc,
				 (priv->hdr_type == HFI1_PKT_TYPE_9B ?
				  (void *)&opa_hdr.ibh :
				  (void *)&opa_hdr.opah), hwords);
	return;
}

/**
 * reset_psn - reset the QP state to send starting from PSN
 * @qp: the QP
 * @psn: the packet sequence number to restart at
 *
 * This is called from hfi1_rc_rcv() to process an incoming RC ACK
 * for the given QP.
 * Called at interrupt level with the QP s_lock held.
 */
static void reset_psn(struct rvt_qp *qp, u32 psn)
{
	u32 n = qp->s_acked;
	struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, n);
	u32 opcode;

	lockdep_assert_held(&qp->s_lock);
	qp->s_cur = n;

	/*
	 * If we are starting the request from the beginning,
	 * let the normal send code handle initialization.
	 */
	if (cmp_psn(psn, wqe->psn) <= 0) {
		qp->s_state = OP(SEND_LAST);
		goto done;
	}

	/* Find the work request opcode corresponding to the given PSN. */
	opcode = wqe->wr.opcode;
	for (;;) {
		int diff;

		if (++n == qp->s_size)
			n = 0;
		if (n == qp->s_tail)
			break;
		wqe = rvt_get_swqe_ptr(qp, n);
		diff = cmp_psn(psn, wqe->psn);
		if (diff < 0)
			break;
		qp->s_cur = n;
		/*
		 * If we are starting the request from the beginning,
		 * let the normal send code handle initialization.
		 */
		if (diff == 0) {
			qp->s_state = OP(SEND_LAST);
			goto done;
		}
		opcode = wqe->wr.opcode;
	}

	/*
	 * Set the state to restart in the middle of a request.
	 * Don't change the s_sge, s_cur_sge, or s_cur_size.
	 * See hfi1_make_rc_req().
	 */
	switch (opcode) {
	case IB_WR_SEND:
	case IB_WR_SEND_WITH_IMM:
		qp->s_state = OP(RDMA_READ_RESPONSE_FIRST);
		break;

	case IB_WR_RDMA_WRITE:
	case IB_WR_RDMA_WRITE_WITH_IMM:
		qp->s_state = OP(RDMA_READ_RESPONSE_LAST);
		break;

	case IB_WR_RDMA_READ:
		qp->s_state = OP(RDMA_READ_RESPONSE_MIDDLE);
		break;

	default:
		/*
		 * This case shouldn't happen since it's only
		 * one PSN per req.
		 */
		qp->s_state = OP(SEND_LAST);
	}
done:
	qp->s_psn = psn;
	/*
	 * Set RVT_S_WAIT_PSN as rc_complete() may start the timer
	 * asynchronously before the send engine can get scheduled.
	 * Doing it in hfi1_make_rc_req() is too late.
	 */
	if ((cmp_psn(qp->s_psn, qp->s_sending_hpsn) <= 0) &&
	    (cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) <= 0))
		qp->s_flags |= RVT_S_WAIT_PSN;
	qp->s_flags &= ~RVT_S_AHG_VALID;
}

/*
 * Back up requester to resend the last un-ACKed request.
 * The QP r_lock and s_lock should be held and interrupts disabled.
 */
void hfi1_restart_rc(struct rvt_qp *qp, u32 psn, int wait)
{
	struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
	struct hfi1_ibport *ibp;

	lockdep_assert_held(&qp->r_lock);
	lockdep_assert_held(&qp->s_lock);
	if (qp->s_retry == 0) {
		if (qp->s_mig_state == IB_MIG_ARMED) {
			hfi1_migrate_qp(qp);
			qp->s_retry = qp->s_retry_cnt;
		} else if (qp->s_last == qp->s_acked) {
			hfi1_send_complete(qp, wqe, IB_WC_RETRY_EXC_ERR);
			rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
			return;
		} else { /* need to handle delayed completion */
			return;
		}
	} else {
		qp->s_retry--;
	}

	ibp = to_iport(qp->ibqp.device, qp->port_num);
	if (wqe->wr.opcode == IB_WR_RDMA_READ)
		ibp->rvp.n_rc_resends++;
	else
		ibp->rvp.n_rc_resends += delta_psn(qp->s_psn, psn);

	qp->s_flags &= ~(RVT_S_WAIT_FENCE | RVT_S_WAIT_RDMAR |
			 RVT_S_WAIT_SSN_CREDIT | RVT_S_WAIT_PSN |
			 RVT_S_WAIT_ACK);
	if (wait)
		qp->s_flags |= RVT_S_SEND_ONE;
	reset_psn(qp, psn);
}

/*
 * Set qp->s_sending_psn to the next PSN after the given one.
 * This would be psn+1 except when RDMA reads are present.
 */
static void reset_sending_psn(struct rvt_qp *qp, u32 psn)
{
	struct rvt_swqe *wqe;
	u32 n = qp->s_last;

	lockdep_assert_held(&qp->s_lock);
	/* Find the work request corresponding to the given PSN. */
	for (;;) {
		wqe = rvt_get_swqe_ptr(qp, n);
		if (cmp_psn(psn, wqe->lpsn) <= 0) {
			if (wqe->wr.opcode == IB_WR_RDMA_READ)
				qp->s_sending_psn = wqe->lpsn + 1;
			else
				qp->s_sending_psn = psn + 1;
			break;
		}
		if (++n == qp->s_size)
			n = 0;
		if (n == qp->s_tail)
			break;
	}
}

/*
 * This should be called with the QP s_lock held and interrupts disabled.
 */
void hfi1_rc_send_complete(struct rvt_qp *qp, struct hfi1_opa_header *opah)
{
	struct ib_other_headers *ohdr;
	struct hfi1_qp_priv *priv = qp->priv;
	struct rvt_swqe *wqe;
	struct ib_header *hdr = NULL;
	struct hfi1_16b_header *hdr_16b = NULL;
	u32 opcode;
	u32 psn;

	lockdep_assert_held(&qp->s_lock);
	if (!(ib_rvt_state_ops[qp->state] & RVT_SEND_OR_FLUSH_OR_RECV_OK))
		return;

	/* Find out where the BTH is */
	if (priv->hdr_type == HFI1_PKT_TYPE_9B) {
		hdr = &opah->ibh;
		if (ib_get_lnh(hdr) == HFI1_LRH_BTH)
			ohdr = &hdr->u.oth;
		else
			ohdr = &hdr->u.l.oth;
	} else {
		u8 l4;

		hdr_16b = &opah->opah;
		l4 = hfi1_16B_get_l4(hdr_16b);
		if (l4 == OPA_16B_L4_IB_LOCAL)
			ohdr = &hdr_16b->u.oth;
		else
			ohdr = &hdr_16b->u.l.oth;
	}

	opcode = ib_bth_get_opcode(ohdr);
	if (opcode >= OP(RDMA_READ_RESPONSE_FIRST) &&
	    opcode <= OP(ATOMIC_ACKNOWLEDGE)) {
		WARN_ON(!qp->s_rdma_ack_cnt);
		qp->s_rdma_ack_cnt--;
		return;
	}

	psn = ib_bth_get_psn(ohdr);
	reset_sending_psn(qp, psn);

	/*
	 * Start timer after a packet requesting an ACK has been sent and
	 * there are still requests that haven't been acked.
	 */
	if ((psn & IB_BTH_REQ_ACK) && qp->s_acked != qp->s_tail &&
	    !(qp->s_flags &
	      (RVT_S_TIMER | RVT_S_WAIT_RNR | RVT_S_WAIT_PSN)) &&
	    (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK))
		rvt_add_retry_timer(qp);

	while (qp->s_last != qp->s_acked) {
		u32 s_last;

		wqe = rvt_get_swqe_ptr(qp, qp->s_last);
		if (cmp_psn(wqe->lpsn, qp->s_sending_psn) >= 0 &&
		    cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) <= 0)
			break;
		s_last = qp->s_last;
		trace_hfi1_qp_send_completion(qp, wqe, s_last);
		if (++s_last >= qp->s_size)
			s_last = 0;
		qp->s_last = s_last;
		/* see post_send() */
		barrier();
		rvt_put_swqe(wqe);
		rvt_qp_swqe_complete(qp,
				     wqe,
				     ib_hfi1_wc_opcode[wqe->wr.opcode],
				     IB_WC_SUCCESS);
	}
	/*
	 * If we were waiting for sends to complete before re-sending,
	 * and they are now complete, restart sending.
	 */
	trace_hfi1_sendcomplete(qp, psn);
	if (qp->s_flags & RVT_S_WAIT_PSN &&
	    cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) > 0) {
		qp->s_flags &= ~RVT_S_WAIT_PSN;
		qp->s_sending_psn = qp->s_psn;
		qp->s_sending_hpsn = qp->s_psn - 1;
		hfi1_schedule_send(qp);
	}
}

static inline void update_last_psn(struct rvt_qp *qp, u32 psn)
{
	qp->s_last_psn = psn;
}

/*
 * Generate a SWQE completion.
 * This is similar to hfi1_send_complete but has to check to be sure
 * that the SGEs are not being referenced if the SWQE is being resent.
 */
static struct rvt_swqe *do_rc_completion(struct rvt_qp *qp,
					 struct rvt_swqe *wqe,
					 struct hfi1_ibport *ibp)
{
	lockdep_assert_held(&qp->s_lock);
	/*
	 * Don't decrement refcount and don't generate a
	 * completion if the SWQE is being resent until the send
	 * is finished.
	 */
	if (cmp_psn(wqe->lpsn, qp->s_sending_psn) < 0 ||
	    cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) > 0) {
		u32 s_last;

		rvt_put_swqe(wqe);
		s_last = qp->s_last;
		trace_hfi1_qp_send_completion(qp, wqe, s_last);
		if (++s_last >= qp->s_size)
			s_last = 0;
		qp->s_last = s_last;
		/* see post_send() */
		barrier();
		rvt_qp_swqe_complete(qp,
				     wqe,
				     ib_hfi1_wc_opcode[wqe->wr.opcode],
				     IB_WC_SUCCESS);
	} else {
		struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);

		this_cpu_inc(*ibp->rvp.rc_delayed_comp);
		/*
		 * If send progress not running attempt to progress
		 * SDMA queue.
		 */
		if (ppd->dd->flags & HFI1_HAS_SEND_DMA) {
			struct sdma_engine *engine;
			u8 sl = rdma_ah_get_sl(&qp->remote_ah_attr);
			u8 sc5;

			/* For now use sc to find engine */
			sc5 = ibp->sl_to_sc[sl];
			engine = qp_to_sdma_engine(qp, sc5);
			sdma_engine_progress_schedule(engine);
		}
	}

	qp->s_retry = qp->s_retry_cnt;
	update_last_psn(qp, wqe->lpsn);

	/*
	 * If we are completing a request which is in the process of
	 * being resent, we can stop re-sending it since we know the
	 * responder has already seen it.
	 */
	if (qp->s_acked == qp->s_cur) {
		if (++qp->s_cur >= qp->s_size)
			qp->s_cur = 0;
		qp->s_acked = qp->s_cur;
		wqe = rvt_get_swqe_ptr(qp, qp->s_cur);
		if (qp->s_acked != qp->s_tail) {
			qp->s_state = OP(SEND_LAST);
			qp->s_psn = wqe->psn;
		}
	} else {
		if (++qp->s_acked >= qp->s_size)
			qp->s_acked = 0;
		if (qp->state == IB_QPS_SQD && qp->s_acked == qp->s_cur)
			qp->s_draining = 0;
		wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
	}
	return wqe;
}

/**
 * do_rc_ack - process an incoming RC ACK
 * @qp: the QP the ACK came in on
 * @psn: the packet sequence number of the ACK
 * @opcode: the opcode of the request that resulted in the ACK
 *
 * This is called from rc_rcv_resp() to process an incoming RC ACK
 * for the given QP.
 * May be called at interrupt level, with the QP s_lock held.
 * Returns 1 if OK, 0 if current operation should be aborted (NAK).
 */
static int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode,
		     u64 val, struct hfi1_ctxtdata *rcd)
{
	struct hfi1_ibport *ibp;
	enum ib_wc_status status;
	struct rvt_swqe *wqe;
	int ret = 0;
	u32 ack_psn;
	int diff;

	lockdep_assert_held(&qp->s_lock);
	/*
	 * Note that NAKs implicitly ACK outstanding SEND and RDMA write
	 * requests and implicitly NAK RDMA read and atomic requests issued
	 * before the NAK'ed request.  The MSN won't include the NAK'ed
	 * request but will include an ACK'ed request(s).
	 */
	ack_psn = psn;
	if (aeth >> IB_AETH_NAK_SHIFT)
		ack_psn--;
	wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
	ibp = rcd_to_iport(rcd);

	/*
	 * The MSN might be for a later WQE than the PSN indicates so
	 * only complete WQEs that the PSN finishes.
	 */
	while ((diff = delta_psn(ack_psn, wqe->lpsn)) >= 0) {
		/*
		 * RDMA_READ_RESPONSE_ONLY is a special case since
		 * we want to generate completion events for everything
		 * before the RDMA read, copy the data, then generate
		 * the completion for the read.
		 */
		if (wqe->wr.opcode == IB_WR_RDMA_READ &&
		    opcode == OP(RDMA_READ_RESPONSE_ONLY) &&
		    diff == 0) {
			ret = 1;
			goto bail_stop;
		}
		/*
		 * If this request is a RDMA read or atomic, and the ACK is
		 * for a later operation, this ACK NAKs the RDMA read or
		 * atomic.  In other words, only a RDMA_READ_LAST or ONLY
		 * can ACK a RDMA read and likewise for atomic ops.  Note
		 * that the NAK case can only happen if relaxed ordering is
		 * used and requests are sent after an RDMA read or atomic
		 * is sent but before the response is received.
		 */
		if ((wqe->wr.opcode == IB_WR_RDMA_READ &&
		     (opcode != OP(RDMA_READ_RESPONSE_LAST) || diff != 0)) ||
		    ((wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
		      wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) &&
		     (opcode != OP(ATOMIC_ACKNOWLEDGE) || diff != 0))) {
			/* Retry this request. */
			if (!(qp->r_flags & RVT_R_RDMAR_SEQ)) {
				qp->r_flags |= RVT_R_RDMAR_SEQ;
				hfi1_restart_rc(qp, qp->s_last_psn + 1, 0);
				if (list_empty(&qp->rspwait)) {
					qp->r_flags |= RVT_R_RSP_SEND;
					rvt_get_qp(qp);
					list_add_tail(&qp->rspwait,
						      &rcd->qp_wait_list);
				}
			}
			/*
			 * No need to process the ACK/NAK since we are
			 * restarting an earlier request.
			 */
			goto bail_stop;
		}
		if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
		    wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) {
			u64 *vaddr = wqe->sg_list[0].vaddr;
			*vaddr = val;
		}
		if (qp->s_num_rd_atomic &&
		    (wqe->wr.opcode == IB_WR_RDMA_READ ||
		     wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
		     wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)) {
			qp->s_num_rd_atomic--;
			/* Restart sending task if fence is complete */
			if ((qp->s_flags & RVT_S_WAIT_FENCE) &&
			    !qp->s_num_rd_atomic) {
				qp->s_flags &= ~(RVT_S_WAIT_FENCE |
						 RVT_S_WAIT_ACK);
				hfi1_schedule_send(qp);
			} else if (qp->s_flags & RVT_S_WAIT_RDMAR) {
				qp->s_flags &= ~(RVT_S_WAIT_RDMAR |
						 RVT_S_WAIT_ACK);
				hfi1_schedule_send(qp);
			}
		}
		wqe = do_rc_completion(qp, wqe, ibp);
		if (qp->s_acked == qp->s_tail)
			break;
	}

	switch (aeth >> IB_AETH_NAK_SHIFT) {
	case 0:         /* ACK */
		this_cpu_inc(*ibp->rvp.rc_acks);
		if (qp->s_acked != qp->s_tail) {
			/*
			 * We are expecting more ACKs so
			 * mod the retry timer.
			 */
			rvt_mod_retry_timer(qp);
			/*
			 * We can stop re-sending the earlier packets and
			 * continue with the next packet the receiver wants.
			 */
			if (cmp_psn(qp->s_psn, psn) <= 0)
				reset_psn(qp, psn + 1);
		} else {
			/* No more acks - kill all timers */
			rvt_stop_rc_timers(qp);
			if (cmp_psn(qp->s_psn, psn) <= 0) {
				qp->s_state = OP(SEND_LAST);
				qp->s_psn = psn + 1;
			}
		}
		if (qp->s_flags & RVT_S_WAIT_ACK) {
			qp->s_flags &= ~RVT_S_WAIT_ACK;
			hfi1_schedule_send(qp);
		}
		rvt_get_credit(qp, aeth);
		qp->s_rnr_retry = qp->s_rnr_retry_cnt;
		qp->s_retry = qp->s_retry_cnt;
		update_last_psn(qp, psn);
		return 1;

	case 1:         /* RNR NAK */
		ibp->rvp.n_rnr_naks++;
		if (qp->s_acked == qp->s_tail)
			goto bail_stop;
		if (qp->s_flags & RVT_S_WAIT_RNR)
			goto bail_stop;
		if (qp->s_rnr_retry == 0) {
			status = IB_WC_RNR_RETRY_EXC_ERR;
			goto class_b;
		}
		if (qp->s_rnr_retry_cnt < 7)
			qp->s_rnr_retry--;

		/* The last valid PSN is the previous PSN. */
		update_last_psn(qp, psn - 1);

		ibp->rvp.n_rc_resends += delta_psn(qp->s_psn, psn);

		reset_psn(qp, psn);

		qp->s_flags &= ~(RVT_S_WAIT_SSN_CREDIT | RVT_S_WAIT_ACK);
		rvt_stop_rc_timers(qp);
		rvt_add_rnr_timer(qp, aeth);
		return 0;

	case 3:         /* NAK */
		if (qp->s_acked == qp->s_tail)
			goto bail_stop;
		/* The last valid PSN is the previous PSN. */
		update_last_psn(qp, psn - 1);
		switch ((aeth >> IB_AETH_CREDIT_SHIFT) &
			IB_AETH_CREDIT_MASK) {
		case 0: /* PSN sequence error */
			ibp->rvp.n_seq_naks++;
			/*
			 * Back up to the responder's expected PSN.
			 * Note that we might get a NAK in the middle of an
			 * RDMA READ response which terminates the RDMA
			 * READ.
			 */
			hfi1_restart_rc(qp, psn, 0);
			hfi1_schedule_send(qp);
			break;

		case 1: /* Invalid Request */
			status = IB_WC_REM_INV_REQ_ERR;
			ibp->rvp.n_other_naks++;
			goto class_b;

		case 2: /* Remote Access Error */
			status = IB_WC_REM_ACCESS_ERR;
			ibp->rvp.n_other_naks++;
			goto class_b;

		case 3: /* Remote Operation Error */
			status = IB_WC_REM_OP_ERR;
			ibp->rvp.n_other_naks++;
class_b:
			if (qp->s_last == qp->s_acked) {
				hfi1_send_complete(qp, wqe, status);
				rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
			}
			break;

		default:
			/* Ignore other reserved NAK error codes */
			goto reserved;
		}
		qp->s_retry = qp->s_retry_cnt;
		qp->s_rnr_retry = qp->s_rnr_retry_cnt;
		goto bail_stop;

	default:                /* 2: reserved */
reserved:
		/* Ignore reserved NAK codes. */
		goto bail_stop;
	}
	/* cannot be reached  */
bail_stop:
	rvt_stop_rc_timers(qp);
	return ret;
}

/*
 * We have seen an out of sequence RDMA read middle or last packet.
 * This ACKs SENDs and RDMA writes up to the first RDMA read or atomic SWQE.
 */
static void rdma_seq_err(struct rvt_qp *qp, struct hfi1_ibport *ibp, u32 psn,
			 struct hfi1_ctxtdata *rcd)
{
	struct rvt_swqe *wqe;

	lockdep_assert_held(&qp->s_lock);
	/* Remove QP from retry timer */
	rvt_stop_rc_timers(qp);

	wqe = rvt_get_swqe_ptr(qp, qp->s_acked);

	while (cmp_psn(psn, wqe->lpsn) > 0) {
		if (wqe->wr.opcode == IB_WR_RDMA_READ ||
		    wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
		    wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)
			break;
		wqe = do_rc_completion(qp, wqe, ibp);
	}

	ibp->rvp.n_rdma_seq++;
	qp->r_flags |= RVT_R_RDMAR_SEQ;
	hfi1_restart_rc(qp, qp->s_last_psn + 1, 0);
	if (list_empty(&qp->rspwait)) {
		qp->r_flags |= RVT_R_RSP_SEND;
		rvt_get_qp(qp);
		list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
	}
}

/**
 * rc_rcv_resp - process an incoming RC response packet
 * @packet: data packet information
 *
 * This is called from hfi1_rc_rcv() to process an incoming RC response
 * packet for the given QP.
 * Called at interrupt level.
 */
static void rc_rcv_resp(struct hfi1_packet *packet)
{
	struct hfi1_ctxtdata *rcd = packet->rcd;
	void *data = packet->payload;
	u32 tlen = packet->tlen;
	struct rvt_qp *qp = packet->qp;
	struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
	struct ib_other_headers *ohdr = packet->ohdr;
	struct rvt_swqe *wqe;
	enum ib_wc_status status;
	unsigned long flags;
	int diff;
	u64 val;
	u32 aeth;
	u32 psn = ib_bth_get_psn(packet->ohdr);
	u32 pmtu = qp->pmtu;
	u16 hdrsize = packet->hlen;
	u8 opcode = packet->opcode;
	u8 pad = packet->pad;
	u8 extra_bytes = pad + packet->extra_byte + (SIZE_OF_CRC << 2);

	spin_lock_irqsave(&qp->s_lock, flags);
	trace_hfi1_ack(qp, psn);

	/* Ignore invalid responses. */
	smp_read_barrier_depends(); /* see post_one_send */
	if (cmp_psn(psn, READ_ONCE(qp->s_next_psn)) >= 0)
		goto ack_done;

	/* Ignore duplicate responses. */
	diff = cmp_psn(psn, qp->s_last_psn);
	if (unlikely(diff <= 0)) {
		/* Update credits for "ghost" ACKs */
		if (diff == 0 && opcode == OP(ACKNOWLEDGE)) {
			aeth = be32_to_cpu(ohdr->u.aeth);
			if ((aeth >> IB_AETH_NAK_SHIFT) == 0)
				rvt_get_credit(qp, aeth);
		}
		goto ack_done;
	}

	/*
	 * Skip everything other than the PSN we expect, if we are waiting
	 * for a reply to a restarted RDMA read or atomic op.
	 */
	if (qp->r_flags & RVT_R_RDMAR_SEQ) {
		if (cmp_psn(psn, qp->s_last_psn + 1) != 0)
			goto ack_done;
		qp->r_flags &= ~RVT_R_RDMAR_SEQ;
	}

	if (unlikely(qp->s_acked == qp->s_tail))
		goto ack_done;
	wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
	status = IB_WC_SUCCESS;

	switch (opcode) {
	case OP(ACKNOWLEDGE):
	case OP(ATOMIC_ACKNOWLEDGE):
	case OP(RDMA_READ_RESPONSE_FIRST):
		aeth = be32_to_cpu(ohdr->u.aeth);
		if (opcode == OP(ATOMIC_ACKNOWLEDGE))
			val = ib_u64_get(&ohdr->u.at.atomic_ack_eth);
		else
			val = 0;
		if (!do_rc_ack(qp, aeth, psn, opcode, val, rcd) ||
		    opcode != OP(RDMA_READ_RESPONSE_FIRST))
			goto ack_done;
		wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
		if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
			goto ack_op_err;
		/*
		 * If this is a response to a resent RDMA read, we
		 * have to be careful to copy the data to the right
		 * location.
		 */
		qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge,
						  wqe, psn, pmtu);
		goto read_middle;

	case OP(RDMA_READ_RESPONSE_MIDDLE):
		/* no AETH, no ACK */
		if (unlikely(cmp_psn(psn, qp->s_last_psn + 1)))
			goto ack_seq_err;
		if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
			goto ack_op_err;
read_middle:
		if (unlikely(tlen != (hdrsize + pmtu + extra_bytes)))
			goto ack_len_err;
		if (unlikely(pmtu >= qp->s_rdma_read_len))
			goto ack_len_err;

		/*
		 * We got a response so update the timeout.
		 * 4.096 usec. * (1 << qp->timeout)
		 */
		rvt_mod_retry_timer(qp);
		if (qp->s_flags & RVT_S_WAIT_ACK) {
			qp->s_flags &= ~RVT_S_WAIT_ACK;
			hfi1_schedule_send(qp);
		}

		if (opcode == OP(RDMA_READ_RESPONSE_MIDDLE))
			qp->s_retry = qp->s_retry_cnt;

		/*
		 * Update the RDMA receive state but do the copy w/o
		 * holding the locks and blocking interrupts.
		 */
		qp->s_rdma_read_len -= pmtu;
		update_last_psn(qp, psn);
		spin_unlock_irqrestore(&qp->s_lock, flags);
		hfi1_copy_sge(&qp->s_rdma_read_sge, data, pmtu, false, false);
		goto bail;

	case OP(RDMA_READ_RESPONSE_ONLY):
		aeth = be32_to_cpu(ohdr->u.aeth);
		if (!do_rc_ack(qp, aeth, psn, opcode, 0, rcd))
			goto ack_done;
		/*
		 * Check that the data size is >= 0 && <= pmtu.
		 * Remember to account for ICRC (4).
		 */
		if (unlikely(tlen < (hdrsize + extra_bytes)))
			goto ack_len_err;
		/*
		 * If this is a response to a resent RDMA read, we
		 * have to be careful to copy the data to the right
		 * location.
		 */
		wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
		qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge,
						  wqe, psn, pmtu);
		goto read_last;

	case OP(RDMA_READ_RESPONSE_LAST):
		/* ACKs READ req. */
		if (unlikely(cmp_psn(psn, qp->s_last_psn + 1)))
			goto ack_seq_err;
		if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
			goto ack_op_err;
		/*
		 * Check that the data size is >= 1 && <= pmtu.
		 * Remember to account for ICRC (4).
		 */
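		/*
		 * tlen counts the header, the payload, the padding, any
		 * extra tail byte and the ICRC (see the extra_bytes
		 * computation above), so subtracting hdrsize + extra_bytes
		 * below leaves just the READ payload, which must match
		 * s_rdma_read_len exactly.
		 */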
		if (unlikely(tlen <= (hdrsize + extra_bytes)))
			goto ack_len_err;
read_last:
		tlen -= hdrsize + extra_bytes;
		if (unlikely(tlen != qp->s_rdma_read_len))
			goto ack_len_err;
		aeth = be32_to_cpu(ohdr->u.aeth);
		hfi1_copy_sge(&qp->s_rdma_read_sge, data, tlen, false, false);
		WARN_ON(qp->s_rdma_read_sge.num_sge);
		(void)do_rc_ack(qp, aeth, psn,
				OP(RDMA_READ_RESPONSE_LAST), 0, rcd);
		goto ack_done;
	}

ack_op_err:
	status = IB_WC_LOC_QP_OP_ERR;
	goto ack_err;

ack_seq_err:
	rdma_seq_err(qp, ibp, psn, rcd);
	goto ack_done;

ack_len_err:
	status = IB_WC_LOC_LEN_ERR;
ack_err:
	if (qp->s_last == qp->s_acked) {
		hfi1_send_complete(qp, wqe, status);
		rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
	}
ack_done:
	spin_unlock_irqrestore(&qp->s_lock, flags);
bail:
	return;
}

static inline void rc_defered_ack(struct hfi1_ctxtdata *rcd,
				  struct rvt_qp *qp)
{
	if (list_empty(&qp->rspwait)) {
		qp->r_flags |= RVT_R_RSP_NAK;
		rvt_get_qp(qp);
		list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
	}
}

static inline void rc_cancel_ack(struct rvt_qp *qp)
{
	qp->r_adefered = 0;
	if (list_empty(&qp->rspwait))
		return;
	list_del_init(&qp->rspwait);
	qp->r_flags &= ~RVT_R_RSP_NAK;
	rvt_put_qp(qp);
}

/**
 * rc_rcv_error - process an incoming duplicate or error RC packet
 * @ohdr: the other headers for this packet
 * @data: the packet data
 * @qp: the QP for this packet
 * @opcode: the opcode for this packet
 * @psn: the packet sequence number for this packet
 * @diff: the difference between the PSN and the expected PSN
 *
 * This is called from hfi1_rc_rcv() to process an unexpected
 * incoming RC packet for the given QP.
 * Called at interrupt level.
 * Return 1 if no more processing is needed; otherwise return 0 to
 * schedule a response to be sent.
 */
static noinline int rc_rcv_error(struct ib_other_headers *ohdr, void *data,
				 struct rvt_qp *qp, u32 opcode, u32 psn,
				 int diff, struct hfi1_ctxtdata *rcd)
{
	struct hfi1_ibport *ibp = rcd_to_iport(rcd);
	struct rvt_ack_entry *e;
	unsigned long flags;
	u8 i, prev;
	int old_req;

	trace_hfi1_rcv_error(qp, psn);
	if (diff > 0) {
		/*
		 * Packet sequence error.
		 * A NAK will ACK earlier sends and RDMA writes.
		 * Don't queue the NAK if we already sent one.
		 */
		if (!qp->r_nak_state) {
			ibp->rvp.n_rc_seqnak++;
			qp->r_nak_state = IB_NAK_PSN_ERROR;
			/* Use the expected PSN. */
			qp->r_ack_psn = qp->r_psn;
			/*
			 * Wait to send the sequence NAK until all packets
			 * in the receive queue have been processed.
			 * Otherwise, we end up propagating congestion.
			 */
			rc_defered_ack(rcd, qp);
		}
		goto done;
	}

	/*
	 * Handle a duplicate request.  Don't re-execute SEND, RDMA
	 * write or atomic op.  Don't NAK errors, just silently drop
	 * the duplicate request.  Note that r_sge, r_len, and
	 * r_rcv_len may be in use so don't modify them.
	 *
	 * We are supposed to ACK the earliest duplicate PSN but we
	 * can coalesce an outstanding duplicate ACK.  We have to
	 * send the earliest so that RDMA reads can be restarted at
	 * the requester's expected PSN.
	 *
	 * First, find where this duplicate PSN falls within the
	 * ACKs previously sent.
	 * old_req is true if there is an older response that is scheduled
	 * to be sent before sending this one.
	 */
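	/*
	 * The ack queue is a small ring of HFI1_MAX_RDMA_ATOMIC + 1 entries
	 * indexed by r_head_ack_queue/s_tail_ack_queue; the search below
	 * walks it backwards from the head, wrapping from 0 back to
	 * HFI1_MAX_RDMA_ATOMIC, until it finds the entry covering this PSN.
	 */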
	e = NULL;
	old_req = 1;
	ibp->rvp.n_rc_dupreq++;

	spin_lock_irqsave(&qp->s_lock, flags);

	for (i = qp->r_head_ack_queue; ; i = prev) {
		if (i == qp->s_tail_ack_queue)
			old_req = 0;
		if (i)
			prev = i - 1;
		else
			prev = HFI1_MAX_RDMA_ATOMIC;
		if (prev == qp->r_head_ack_queue) {
			e = NULL;
			break;
		}
		e = &qp->s_ack_queue[prev];
		if (!e->opcode) {
			e = NULL;
			break;
		}
		if (cmp_psn(psn, e->psn) >= 0) {
			if (prev == qp->s_tail_ack_queue &&
			    cmp_psn(psn, e->lpsn) <= 0)
				old_req = 0;
			break;
		}
	}
	switch (opcode) {
	case OP(RDMA_READ_REQUEST): {
		struct ib_reth *reth;
		u32 offset;
		u32 len;

		/*
		 * If we didn't find the RDMA read request in the ack queue,
		 * we can ignore this request.
		 */
		if (!e || e->opcode != OP(RDMA_READ_REQUEST))
			goto unlock_done;
		/* RETH comes after BTH */
		reth = &ohdr->u.rc.reth;
		/*
		 * Address range must be a subset of the original
		 * request and start on pmtu boundaries.
		 * We reuse the old ack_queue slot since the requester
		 * should not back up and request an earlier PSN for the
		 * same request.
		 */
		offset = delta_psn(psn, e->psn) * qp->pmtu;
		len = be32_to_cpu(reth->length);
		if (unlikely(offset + len != e->rdma_sge.sge_length))
			goto unlock_done;
		if (e->rdma_sge.mr) {
			rvt_put_mr(e->rdma_sge.mr);
			e->rdma_sge.mr = NULL;
		}
		if (len != 0) {
			u32 rkey = be32_to_cpu(reth->rkey);
			u64 vaddr = get_ib_reth_vaddr(reth);
			int ok;

			ok = rvt_rkey_ok(qp, &e->rdma_sge, len, vaddr, rkey,
					 IB_ACCESS_REMOTE_READ);
			if (unlikely(!ok))
				goto unlock_done;
		} else {
			e->rdma_sge.vaddr = NULL;
			e->rdma_sge.length = 0;
			e->rdma_sge.sge_length = 0;
		}
		e->psn = psn;
		if (old_req)
			goto unlock_done;
		qp->s_tail_ack_queue = prev;
		break;
	}

	case OP(COMPARE_SWAP):
	case OP(FETCH_ADD): {
		/*
		 * If we didn't find the atomic request in the ack queue
		 * or the send engine is already backed up to send an
		 * earlier entry, we can ignore this request.
		 */
		if (!e || e->opcode != (u8)opcode || old_req)
			goto unlock_done;
		qp->s_tail_ack_queue = prev;
		break;
	}

	default:
		/*
		 * Ignore this operation if it doesn't request an ACK
		 * or an earlier RDMA read or atomic is going to be resent.
		 */
		if (!(psn & IB_BTH_REQ_ACK) || old_req)
			goto unlock_done;
		/*
		 * Resend the most recent ACK if this request is
		 * after all the previous RDMA reads and atomics.
		 */
		if (i == qp->r_head_ack_queue) {
			spin_unlock_irqrestore(&qp->s_lock, flags);
			qp->r_nak_state = 0;
			qp->r_ack_psn = qp->r_psn - 1;
			goto send_ack;
		}

		/*
		 * Resend the RDMA read or atomic op which
		 * ACKs this duplicate request.
1916 */ 1917 qp->s_tail_ack_queue = i; 1918 break; 1919 } 1920 qp->s_ack_state = OP(ACKNOWLEDGE); 1921 qp->s_flags |= RVT_S_RESP_PENDING; 1922 qp->r_nak_state = 0; 1923 hfi1_schedule_send(qp); 1924 1925 unlock_done: 1926 spin_unlock_irqrestore(&qp->s_lock, flags); 1927 done: 1928 return 1; 1929 1930 send_ack: 1931 return 0; 1932 } 1933 1934 static inline void update_ack_queue(struct rvt_qp *qp, unsigned n) 1935 { 1936 unsigned next; 1937 1938 next = n + 1; 1939 if (next > HFI1_MAX_RDMA_ATOMIC) 1940 next = 0; 1941 qp->s_tail_ack_queue = next; 1942 qp->s_ack_state = OP(ACKNOWLEDGE); 1943 } 1944 1945 static void log_cca_event(struct hfi1_pportdata *ppd, u8 sl, u32 rlid, 1946 u32 lqpn, u32 rqpn, u8 svc_type) 1947 { 1948 struct opa_hfi1_cong_log_event_internal *cc_event; 1949 unsigned long flags; 1950 1951 if (sl >= OPA_MAX_SLS) 1952 return; 1953 1954 spin_lock_irqsave(&ppd->cc_log_lock, flags); 1955 1956 ppd->threshold_cong_event_map[sl / 8] |= 1 << (sl % 8); 1957 ppd->threshold_event_counter++; 1958 1959 cc_event = &ppd->cc_events[ppd->cc_log_idx++]; 1960 if (ppd->cc_log_idx == OPA_CONG_LOG_ELEMS) 1961 ppd->cc_log_idx = 0; 1962 cc_event->lqpn = lqpn & RVT_QPN_MASK; 1963 cc_event->rqpn = rqpn & RVT_QPN_MASK; 1964 cc_event->sl = sl; 1965 cc_event->svc_type = svc_type; 1966 cc_event->rlid = rlid; 1967 /* keep timestamp in units of 1.024 usec */ 1968 cc_event->timestamp = ktime_get_ns() / 1024; 1969 1970 spin_unlock_irqrestore(&ppd->cc_log_lock, flags); 1971 } 1972 1973 void process_becn(struct hfi1_pportdata *ppd, u8 sl, u32 rlid, u32 lqpn, 1974 u32 rqpn, u8 svc_type) 1975 { 1976 struct cca_timer *cca_timer; 1977 u16 ccti, ccti_incr, ccti_timer, ccti_limit; 1978 u8 trigger_threshold; 1979 struct cc_state *cc_state; 1980 unsigned long flags; 1981 1982 if (sl >= OPA_MAX_SLS) 1983 return; 1984 1985 cc_state = get_cc_state(ppd); 1986 1987 if (!cc_state) 1988 return; 1989 1990 /* 1991 * 1) increase CCTI (for this SL) 1992 * 2) select IPG (i.e., call set_link_ipg()) 1993 * 3) start timer 1994 */ 1995 ccti_limit = cc_state->cct.ccti_limit; 1996 ccti_incr = cc_state->cong_setting.entries[sl].ccti_increase; 1997 ccti_timer = cc_state->cong_setting.entries[sl].ccti_timer; 1998 trigger_threshold = 1999 cc_state->cong_setting.entries[sl].trigger_threshold; 2000 2001 spin_lock_irqsave(&ppd->cca_timer_lock, flags); 2002 2003 cca_timer = &ppd->cca_timer[sl]; 2004 if (cca_timer->ccti < ccti_limit) { 2005 if (cca_timer->ccti + ccti_incr <= ccti_limit) 2006 cca_timer->ccti += ccti_incr; 2007 else 2008 cca_timer->ccti = ccti_limit; 2009 set_link_ipg(ppd); 2010 } 2011 2012 ccti = cca_timer->ccti; 2013 2014 if (!hrtimer_active(&cca_timer->hrtimer)) { 2015 /* ccti_timer is in units of 1.024 usec */ 2016 unsigned long nsec = 1024 * ccti_timer; 2017 2018 hrtimer_start(&cca_timer->hrtimer, ns_to_ktime(nsec), 2019 HRTIMER_MODE_REL); 2020 } 2021 2022 spin_unlock_irqrestore(&ppd->cca_timer_lock, flags); 2023 2024 if ((trigger_threshold != 0) && (ccti >= trigger_threshold)) 2025 log_cca_event(ppd, sl, rlid, lqpn, rqpn, svc_type); 2026 } 2027 2028 /** 2029 * hfi1_rc_rcv - process an incoming RC packet 2030 * @packet: data packet information 2031 * 2032 * This is called from qp_rcv() to process an incoming RC packet 2033 * for the given QP. 2034 * May be called at interrupt level. 
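 *
 * Responses (ACKs, RDMA read data and atomic results) are handed off
 * to rc_rcv_resp(), out-of-sequence and duplicate requests go through
 * rc_rcv_error(), and new requests are executed below with the QP
 * r_lock held by the caller.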
2035 */ 2036 void hfi1_rc_rcv(struct hfi1_packet *packet) 2037 { 2038 struct hfi1_ctxtdata *rcd = packet->rcd; 2039 void *data = packet->payload; 2040 u32 tlen = packet->tlen; 2041 struct rvt_qp *qp = packet->qp; 2042 struct hfi1_ibport *ibp = rcd_to_iport(rcd); 2043 struct ib_other_headers *ohdr = packet->ohdr; 2044 u32 bth0 = be32_to_cpu(ohdr->bth[0]); 2045 u32 opcode = packet->opcode; 2046 u32 hdrsize = packet->hlen; 2047 u32 psn = ib_bth_get_psn(packet->ohdr); 2048 u32 pad = packet->pad; 2049 struct ib_wc wc; 2050 u32 pmtu = qp->pmtu; 2051 int diff; 2052 struct ib_reth *reth; 2053 unsigned long flags; 2054 int ret; 2055 bool is_fecn = false; 2056 bool copy_last = false; 2057 u32 rkey; 2058 u8 extra_bytes = pad + packet->extra_byte + (SIZE_OF_CRC << 2); 2059 2060 lockdep_assert_held(&qp->r_lock); 2061 2062 if (hfi1_ruc_check_hdr(ibp, packet)) 2063 return; 2064 2065 is_fecn = process_ecn(qp, packet, false); 2066 2067 /* 2068 * Process responses (ACKs) before anything else. Note that the 2069 * packet sequence number will be for something in the send work 2070 * queue rather than the expected receive packet sequence number. 2071 * In other words, this QP is the requester. 2072 */ 2073 if (opcode >= OP(RDMA_READ_RESPONSE_FIRST) && 2074 opcode <= OP(ATOMIC_ACKNOWLEDGE)) { 2075 rc_rcv_resp(packet); 2076 if (is_fecn) 2077 goto send_ack; 2078 return; 2079 } 2080 2081 /* Compute 24 bits worth of difference. */ 2082 diff = delta_psn(psn, qp->r_psn); 2083 if (unlikely(diff)) { 2084 if (rc_rcv_error(ohdr, data, qp, opcode, psn, diff, rcd)) 2085 return; 2086 goto send_ack; 2087 } 2088 2089 /* Check for opcode sequence errors. */ 2090 switch (qp->r_state) { 2091 case OP(SEND_FIRST): 2092 case OP(SEND_MIDDLE): 2093 if (opcode == OP(SEND_MIDDLE) || 2094 opcode == OP(SEND_LAST) || 2095 opcode == OP(SEND_LAST_WITH_IMMEDIATE) || 2096 opcode == OP(SEND_LAST_WITH_INVALIDATE)) 2097 break; 2098 goto nack_inv; 2099 2100 case OP(RDMA_WRITE_FIRST): 2101 case OP(RDMA_WRITE_MIDDLE): 2102 if (opcode == OP(RDMA_WRITE_MIDDLE) || 2103 opcode == OP(RDMA_WRITE_LAST) || 2104 opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE)) 2105 break; 2106 goto nack_inv; 2107 2108 default: 2109 if (opcode == OP(SEND_MIDDLE) || 2110 opcode == OP(SEND_LAST) || 2111 opcode == OP(SEND_LAST_WITH_IMMEDIATE) || 2112 opcode == OP(SEND_LAST_WITH_INVALIDATE) || 2113 opcode == OP(RDMA_WRITE_MIDDLE) || 2114 opcode == OP(RDMA_WRITE_LAST) || 2115 opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE)) 2116 goto nack_inv; 2117 /* 2118 * Note that it is up to the requester to not send a new 2119 * RDMA read or atomic operation before receiving an ACK 2120 * for the previous operation. 2121 */ 2122 break; 2123 } 2124 2125 if (qp->state == IB_QPS_RTR && !(qp->r_flags & RVT_R_COMM_EST)) 2126 rvt_comm_est(qp); 2127 2128 /* OK, process the packet. */ 2129 switch (opcode) { 2130 case OP(SEND_FIRST): 2131 ret = hfi1_rvt_get_rwqe(qp, 0); 2132 if (ret < 0) 2133 goto nack_op_err; 2134 if (!ret) 2135 goto rnr_nak; 2136 qp->r_rcv_len = 0; 2137 /* FALLTHROUGH */ 2138 case OP(SEND_MIDDLE): 2139 case OP(RDMA_WRITE_MIDDLE): 2140 send_middle: 2141 /* Check for invalid length PMTU or posted rwqe len. 
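 *
 * Each FIRST/MIDDLE packet must carry exactly one PMTU of payload,
 * so the checks below require
 *
 *	tlen == hdrsize + pmtu + extra_bytes
 *
 * where extra_bytes covers the pad, the ICRC and, for 16B, the LT
 * byte, and the running r_rcv_len must not overrun the posted
 * receive length r_len.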
*/ 2142 /* 2143 * There will be no padding for 9B packet but 16B packets 2144 * will come in with some padding since we always add 2145 * CRC and LT bytes which will need to be flit aligned 2146 */ 2147 if (unlikely(tlen != (hdrsize + pmtu + extra_bytes))) 2148 goto nack_inv; 2149 qp->r_rcv_len += pmtu; 2150 if (unlikely(qp->r_rcv_len > qp->r_len)) 2151 goto nack_inv; 2152 hfi1_copy_sge(&qp->r_sge, data, pmtu, true, false); 2153 break; 2154 2155 case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE): 2156 /* consume RWQE */ 2157 ret = hfi1_rvt_get_rwqe(qp, 1); 2158 if (ret < 0) 2159 goto nack_op_err; 2160 if (!ret) 2161 goto rnr_nak; 2162 goto send_last_imm; 2163 2164 case OP(SEND_ONLY): 2165 case OP(SEND_ONLY_WITH_IMMEDIATE): 2166 case OP(SEND_ONLY_WITH_INVALIDATE): 2167 ret = hfi1_rvt_get_rwqe(qp, 0); 2168 if (ret < 0) 2169 goto nack_op_err; 2170 if (!ret) 2171 goto rnr_nak; 2172 qp->r_rcv_len = 0; 2173 if (opcode == OP(SEND_ONLY)) 2174 goto no_immediate_data; 2175 if (opcode == OP(SEND_ONLY_WITH_INVALIDATE)) 2176 goto send_last_inv; 2177 /* FALLTHROUGH -- for SEND_ONLY_WITH_IMMEDIATE */ 2178 case OP(SEND_LAST_WITH_IMMEDIATE): 2179 send_last_imm: 2180 wc.ex.imm_data = ohdr->u.imm_data; 2181 wc.wc_flags = IB_WC_WITH_IMM; 2182 goto send_last; 2183 case OP(SEND_LAST_WITH_INVALIDATE): 2184 send_last_inv: 2185 rkey = be32_to_cpu(ohdr->u.ieth); 2186 if (rvt_invalidate_rkey(qp, rkey)) 2187 goto no_immediate_data; 2188 wc.ex.invalidate_rkey = rkey; 2189 wc.wc_flags = IB_WC_WITH_INVALIDATE; 2190 goto send_last; 2191 case OP(RDMA_WRITE_LAST): 2192 copy_last = rvt_is_user_qp(qp); 2193 /* fall through */ 2194 case OP(SEND_LAST): 2195 no_immediate_data: 2196 wc.wc_flags = 0; 2197 wc.ex.imm_data = 0; 2198 send_last: 2199 /* Check for invalid length. */ 2200 /* LAST len should be >= 1 */ 2201 if (unlikely(tlen < (hdrsize + extra_bytes))) 2202 goto nack_inv; 2203 /* Don't count the CRC(and padding and LT byte for 16B). */ 2204 tlen -= (hdrsize + extra_bytes); 2205 wc.byte_len = tlen + qp->r_rcv_len; 2206 if (unlikely(wc.byte_len > qp->r_len)) 2207 goto nack_inv; 2208 hfi1_copy_sge(&qp->r_sge, data, tlen, true, copy_last); 2209 rvt_put_ss(&qp->r_sge); 2210 qp->r_msn++; 2211 if (!__test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags)) 2212 break; 2213 wc.wr_id = qp->r_wr_id; 2214 wc.status = IB_WC_SUCCESS; 2215 if (opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE) || 2216 opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE)) 2217 wc.opcode = IB_WC_RECV_RDMA_WITH_IMM; 2218 else 2219 wc.opcode = IB_WC_RECV; 2220 wc.qp = &qp->ibqp; 2221 wc.src_qp = qp->remote_qpn; 2222 wc.slid = rdma_ah_get_dlid(&qp->remote_ah_attr) & U16_MAX; 2223 /* 2224 * It seems that IB mandates the presence of an SL in a 2225 * work completion only for the UD transport (see section 2226 * 11.4.2 of IBTA Vol. 1). 2227 * 2228 * However, the way the SL is chosen below is consistent 2229 * with the way that IB/qib works and is trying avoid 2230 * introducing incompatibilities. 2231 * 2232 * See also OPA Vol. 1, section 9.7.6, and table 9-17. 2233 */ 2234 wc.sl = rdma_ah_get_sl(&qp->remote_ah_attr); 2235 /* zero fields that are N/A */ 2236 wc.vendor_err = 0; 2237 wc.pkey_index = 0; 2238 wc.dlid_path_bits = 0; 2239 wc.port_num = 0; 2240 /* Signal completion event if the solicited bit is set. 
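 *
 * The solicited event bit from bth0 is passed through as the
 * "solicited" argument of rvt_cq_enter(), so a CQ armed for
 * solicited-only notification is only woken when the sender
 * requested it.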
*/ 2241 rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, 2242 (bth0 & IB_BTH_SOLICITED) != 0); 2243 break; 2244 2245 case OP(RDMA_WRITE_ONLY): 2246 copy_last = rvt_is_user_qp(qp); 2247 /* fall through */ 2248 case OP(RDMA_WRITE_FIRST): 2249 case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE): 2250 if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE))) 2251 goto nack_inv; 2252 /* consume RWQE */ 2253 reth = &ohdr->u.rc.reth; 2254 qp->r_len = be32_to_cpu(reth->length); 2255 qp->r_rcv_len = 0; 2256 qp->r_sge.sg_list = NULL; 2257 if (qp->r_len != 0) { 2258 u32 rkey = be32_to_cpu(reth->rkey); 2259 u64 vaddr = get_ib_reth_vaddr(reth); 2260 int ok; 2261 2262 /* Check rkey & NAK */ 2263 ok = rvt_rkey_ok(qp, &qp->r_sge.sge, qp->r_len, vaddr, 2264 rkey, IB_ACCESS_REMOTE_WRITE); 2265 if (unlikely(!ok)) 2266 goto nack_acc; 2267 qp->r_sge.num_sge = 1; 2268 } else { 2269 qp->r_sge.num_sge = 0; 2270 qp->r_sge.sge.mr = NULL; 2271 qp->r_sge.sge.vaddr = NULL; 2272 qp->r_sge.sge.length = 0; 2273 qp->r_sge.sge.sge_length = 0; 2274 } 2275 if (opcode == OP(RDMA_WRITE_FIRST)) 2276 goto send_middle; 2277 else if (opcode == OP(RDMA_WRITE_ONLY)) 2278 goto no_immediate_data; 2279 ret = hfi1_rvt_get_rwqe(qp, 1); 2280 if (ret < 0) 2281 goto nack_op_err; 2282 if (!ret) { 2283 /* peer will send again */ 2284 rvt_put_ss(&qp->r_sge); 2285 goto rnr_nak; 2286 } 2287 wc.ex.imm_data = ohdr->u.rc.imm_data; 2288 wc.wc_flags = IB_WC_WITH_IMM; 2289 goto send_last; 2290 2291 case OP(RDMA_READ_REQUEST): { 2292 struct rvt_ack_entry *e; 2293 u32 len; 2294 u8 next; 2295 2296 if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ))) 2297 goto nack_inv; 2298 next = qp->r_head_ack_queue + 1; 2299 /* s_ack_queue is size HFI1_MAX_RDMA_ATOMIC+1 so use > not >= */ 2300 if (next > HFI1_MAX_RDMA_ATOMIC) 2301 next = 0; 2302 spin_lock_irqsave(&qp->s_lock, flags); 2303 if (unlikely(next == qp->s_tail_ack_queue)) { 2304 if (!qp->s_ack_queue[next].sent) 2305 goto nack_inv_unlck; 2306 update_ack_queue(qp, next); 2307 } 2308 e = &qp->s_ack_queue[qp->r_head_ack_queue]; 2309 if (e->opcode == OP(RDMA_READ_REQUEST) && e->rdma_sge.mr) { 2310 rvt_put_mr(e->rdma_sge.mr); 2311 e->rdma_sge.mr = NULL; 2312 } 2313 reth = &ohdr->u.rc.reth; 2314 len = be32_to_cpu(reth->length); 2315 if (len) { 2316 u32 rkey = be32_to_cpu(reth->rkey); 2317 u64 vaddr = get_ib_reth_vaddr(reth); 2318 int ok; 2319 2320 /* Check rkey & NAK */ 2321 ok = rvt_rkey_ok(qp, &e->rdma_sge, len, vaddr, 2322 rkey, IB_ACCESS_REMOTE_READ); 2323 if (unlikely(!ok)) 2324 goto nack_acc_unlck; 2325 /* 2326 * Update the next expected PSN. We add 1 later 2327 * below, so only add the remainder here. 2328 */ 2329 qp->r_psn += rvt_div_mtu(qp, len - 1); 2330 } else { 2331 e->rdma_sge.mr = NULL; 2332 e->rdma_sge.vaddr = NULL; 2333 e->rdma_sge.length = 0; 2334 e->rdma_sge.sge_length = 0; 2335 } 2336 e->opcode = opcode; 2337 e->sent = 0; 2338 e->psn = psn; 2339 e->lpsn = qp->r_psn; 2340 /* 2341 * We need to increment the MSN here instead of when we 2342 * finish sending the result since a duplicate request would 2343 * increment it more than once. 2344 */ 2345 qp->r_msn++; 2346 qp->r_psn++; 2347 qp->r_state = opcode; 2348 qp->r_nak_state = 0; 2349 qp->r_head_ack_queue = next; 2350 2351 /* Schedule the send engine. 
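 *
 * The read response data is generated later from the s_ack_queue
 * entry queued above: RVT_S_RESP_PENDING marks the response as
 * outstanding and hfi1_schedule_send() kicks the send engine while
 * s_lock is still held.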
*/ 2352 qp->s_flags |= RVT_S_RESP_PENDING; 2353 hfi1_schedule_send(qp); 2354 2355 spin_unlock_irqrestore(&qp->s_lock, flags); 2356 if (is_fecn) 2357 goto send_ack; 2358 return; 2359 } 2360 2361 case OP(COMPARE_SWAP): 2362 case OP(FETCH_ADD): { 2363 struct ib_atomic_eth *ateth; 2364 struct rvt_ack_entry *e; 2365 u64 vaddr; 2366 atomic64_t *maddr; 2367 u64 sdata; 2368 u32 rkey; 2369 u8 next; 2370 2371 if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC))) 2372 goto nack_inv; 2373 next = qp->r_head_ack_queue + 1; 2374 if (next > HFI1_MAX_RDMA_ATOMIC) 2375 next = 0; 2376 spin_lock_irqsave(&qp->s_lock, flags); 2377 if (unlikely(next == qp->s_tail_ack_queue)) { 2378 if (!qp->s_ack_queue[next].sent) 2379 goto nack_inv_unlck; 2380 update_ack_queue(qp, next); 2381 } 2382 e = &qp->s_ack_queue[qp->r_head_ack_queue]; 2383 if (e->opcode == OP(RDMA_READ_REQUEST) && e->rdma_sge.mr) { 2384 rvt_put_mr(e->rdma_sge.mr); 2385 e->rdma_sge.mr = NULL; 2386 } 2387 ateth = &ohdr->u.atomic_eth; 2388 vaddr = get_ib_ateth_vaddr(ateth); 2389 if (unlikely(vaddr & (sizeof(u64) - 1))) 2390 goto nack_inv_unlck; 2391 rkey = be32_to_cpu(ateth->rkey); 2392 /* Check rkey & NAK */ 2393 if (unlikely(!rvt_rkey_ok(qp, &qp->r_sge.sge, sizeof(u64), 2394 vaddr, rkey, 2395 IB_ACCESS_REMOTE_ATOMIC))) 2396 goto nack_acc_unlck; 2397 /* Perform atomic OP and save result. */ 2398 maddr = (atomic64_t *)qp->r_sge.sge.vaddr; 2399 sdata = get_ib_ateth_swap(ateth); 2400 e->atomic_data = (opcode == OP(FETCH_ADD)) ? 2401 (u64)atomic64_add_return(sdata, maddr) - sdata : 2402 (u64)cmpxchg((u64 *)qp->r_sge.sge.vaddr, 2403 get_ib_ateth_compare(ateth), 2404 sdata); 2405 rvt_put_mr(qp->r_sge.sge.mr); 2406 qp->r_sge.num_sge = 0; 2407 e->opcode = opcode; 2408 e->sent = 0; 2409 e->psn = psn; 2410 e->lpsn = psn; 2411 qp->r_msn++; 2412 qp->r_psn++; 2413 qp->r_state = opcode; 2414 qp->r_nak_state = 0; 2415 qp->r_head_ack_queue = next; 2416 2417 /* Schedule the send engine. */ 2418 qp->s_flags |= RVT_S_RESP_PENDING; 2419 hfi1_schedule_send(qp); 2420 2421 spin_unlock_irqrestore(&qp->s_lock, flags); 2422 if (is_fecn) 2423 goto send_ack; 2424 return; 2425 } 2426 2427 default: 2428 /* NAK unknown opcodes. */ 2429 goto nack_inv; 2430 } 2431 qp->r_psn++; 2432 qp->r_state = opcode; 2433 qp->r_ack_psn = psn; 2434 qp->r_nak_state = 0; 2435 /* Send an ACK if requested or required. 
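 *
 * ACK coalescing: when the AckReq bit is set the ACK is normally
 * deferred (r_adefered is bumped and the QP is queued via
 * rc_defered_ack()), but it is sent right away when packet->numpkt
 * is zero, when HFI1_PSN_CREDIT deferred ACKs have already been
 * accumulated, or when a FECN was seen on this packet.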
*/ 2436 if (psn & IB_BTH_REQ_ACK) { 2437 if (packet->numpkt == 0) { 2438 rc_cancel_ack(qp); 2439 goto send_ack; 2440 } 2441 if (qp->r_adefered >= HFI1_PSN_CREDIT) { 2442 rc_cancel_ack(qp); 2443 goto send_ack; 2444 } 2445 if (unlikely(is_fecn)) { 2446 rc_cancel_ack(qp); 2447 goto send_ack; 2448 } 2449 qp->r_adefered++; 2450 rc_defered_ack(rcd, qp); 2451 } 2452 return; 2453 2454 rnr_nak: 2455 qp->r_nak_state = qp->r_min_rnr_timer | IB_RNR_NAK; 2456 qp->r_ack_psn = qp->r_psn; 2457 /* Queue RNR NAK for later */ 2458 rc_defered_ack(rcd, qp); 2459 return; 2460 2461 nack_op_err: 2462 rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR); 2463 qp->r_nak_state = IB_NAK_REMOTE_OPERATIONAL_ERROR; 2464 qp->r_ack_psn = qp->r_psn; 2465 /* Queue NAK for later */ 2466 rc_defered_ack(rcd, qp); 2467 return; 2468 2469 nack_inv_unlck: 2470 spin_unlock_irqrestore(&qp->s_lock, flags); 2471 nack_inv: 2472 rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR); 2473 qp->r_nak_state = IB_NAK_INVALID_REQUEST; 2474 qp->r_ack_psn = qp->r_psn; 2475 /* Queue NAK for later */ 2476 rc_defered_ack(rcd, qp); 2477 return; 2478 2479 nack_acc_unlck: 2480 spin_unlock_irqrestore(&qp->s_lock, flags); 2481 nack_acc: 2482 rvt_rc_error(qp, IB_WC_LOC_PROT_ERR); 2483 qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR; 2484 qp->r_ack_psn = qp->r_psn; 2485 send_ack: 2486 hfi1_send_rc_ack(rcd, qp, is_fecn); 2487 } 2488 2489 void hfi1_rc_hdrerr( 2490 struct hfi1_ctxtdata *rcd, 2491 struct hfi1_packet *packet, 2492 struct rvt_qp *qp) 2493 { 2494 struct hfi1_ibport *ibp = rcd_to_iport(rcd); 2495 int diff; 2496 u32 opcode; 2497 u32 psn; 2498 2499 if (hfi1_ruc_check_hdr(ibp, packet)) 2500 return; 2501 2502 psn = ib_bth_get_psn(packet->ohdr); 2503 opcode = ib_bth_get_opcode(packet->ohdr); 2504 2505 /* Only deal with RDMA Writes for now */ 2506 if (opcode < IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST) { 2507 diff = delta_psn(psn, qp->r_psn); 2508 if (!qp->r_nak_state && diff >= 0) { 2509 ibp->rvp.n_rc_seqnak++; 2510 qp->r_nak_state = IB_NAK_PSN_ERROR; 2511 /* Use the expected PSN. */ 2512 qp->r_ack_psn = qp->r_psn; 2513 /* 2514 * Wait to send the sequence 2515 * NAK until all packets 2516 * in the receive queue have 2517 * been processed. 2518 * Otherwise, we end up 2519 * propagating congestion. 2520 */ 2521 rc_defered_ack(rcd, qp); 2522 } /* Out of sequence NAK */ 2523 } /* QP Request NAKs */ 2524 } 2525
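
/*
 * Editorial illustration only, not part of the driver: the PSN checks in
 * this file treat PSNs as 24-bit sequence numbers that wrap.  The sketch
 * below shows one way a wrap-aware comparison can be written, using
 * sign_extend32() from <linux/bitops.h>; the helpers actually used here
 * (cmp_psn()/delta_psn() from rdmavt) may differ in detail.
 */
#if 0	/* illustrative sketch, never compiled */
static int example_cmp_psn_24bit(u32 a, u32 b)
{
	/*
	 * Sign-extend the low 24 bits of the difference: the result is
	 * negative if a is "before" b, zero if equal, and positive if
	 * after, even across the 0xffffff -> 0 wrap.
	 */
	return sign_extend32(a - b, 23);
}
#endif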