1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Shared Memory Communications over RDMA (SMC-R) and RoCE 4 * 5 * Work Requests exploiting Infiniband API 6 * 7 * Work requests (WR) of type ib_post_send or ib_post_recv respectively 8 * are submitted to either RC SQ or RC RQ respectively 9 * (reliably connected send/receive queue) 10 * and become work queue entries (WQEs). 11 * While an SQ WR/WQE is pending, we track it until transmission completion. 12 * Through a send or receive completion queue (CQ) respectively, 13 * we get completion queue entries (CQEs) [aka work completions (WCs)]. 14 * Since the CQ callback is called from IRQ context, we split work by using 15 * bottom halves implemented by tasklets. 16 * 17 * SMC uses this to exchange LLC (link layer control) 18 * and CDC (connection data control) messages. 19 * 20 * Copyright IBM Corp. 2016 21 * 22 * Author(s): Steffen Maier <maier@linux.vnet.ibm.com> 23 */ 24 25 #include <linux/atomic.h> 26 #include <linux/hashtable.h> 27 #include <linux/wait.h> 28 #include <rdma/ib_verbs.h> 29 #include <asm/div64.h> 30 31 #include "smc.h" 32 #include "smc_wr.h" 33 34 #define SMC_WR_MAX_POLL_CQE 10 /* max. # of compl. queue elements in 1 poll */ 35 36 #define SMC_WR_RX_HASH_BITS 4 37 static DEFINE_HASHTABLE(smc_wr_rx_hash, SMC_WR_RX_HASH_BITS); 38 static DEFINE_SPINLOCK(smc_wr_rx_hash_lock); 39 40 struct smc_wr_tx_pend { /* control data for a pending send request */ 41 u64 wr_id; /* work request id sent */ 42 smc_wr_tx_handler handler; 43 enum ib_wc_status wc_status; /* CQE status */ 44 struct smc_link *link; 45 u32 idx; 46 struct smc_wr_tx_pend_priv priv; 47 u8 compl_requested; 48 }; 49 50 /******************************** send queue *********************************/ 51 52 /*------------------------------- completion --------------------------------*/ 53 54 /* returns true if at least one tx work request is pending on the given link */ 55 static inline bool smc_wr_is_tx_pend(struct smc_link *link) 56 { 57 if (find_first_bit(link->wr_tx_mask, link->wr_tx_cnt) != 58 link->wr_tx_cnt) { 59 return true; 60 } 61 return false; 62 } 63 64 /* wait till all pending tx work requests on the given link are completed */ 65 void smc_wr_tx_wait_no_pending_sends(struct smc_link *link) 66 { 67 wait_event(link->wr_tx_wait, !smc_wr_is_tx_pend(link)); 68 } 69 70 static inline int smc_wr_tx_find_pending_index(struct smc_link *link, u64 wr_id) 71 { 72 u32 i; 73 74 for (i = 0; i < link->wr_tx_cnt; i++) { 75 if (link->wr_tx_pends[i].wr_id == wr_id) 76 return i; 77 } 78 return link->wr_tx_cnt; 79 } 80 81 static inline void smc_wr_tx_process_cqe(struct ib_wc *wc) 82 { 83 struct smc_wr_tx_pend pnd_snd; 84 struct smc_link *link; 85 u32 pnd_snd_idx; 86 87 link = wc->qp->qp_context; 88 89 if (wc->opcode == IB_WC_REG_MR) { 90 if (wc->status) 91 link->wr_reg_state = FAILED; 92 else 93 link->wr_reg_state = CONFIRMED; 94 smc_wr_wakeup_reg_wait(link); 95 return; 96 } 97 98 pnd_snd_idx = smc_wr_tx_find_pending_index(link, wc->wr_id); 99 if (pnd_snd_idx == link->wr_tx_cnt) { 100 if (link->lgr->smc_version != SMC_V2 || 101 link->wr_tx_v2_pend->wr_id != wc->wr_id) 102 return; 103 link->wr_tx_v2_pend->wc_status = wc->status; 104 memcpy(&pnd_snd, link->wr_tx_v2_pend, sizeof(pnd_snd)); 105 /* clear the full struct smc_wr_tx_pend including .priv */ 106 memset(link->wr_tx_v2_pend, 0, 107 sizeof(*link->wr_tx_v2_pend)); 108 memset(link->lgr->wr_tx_buf_v2, 0, 109 sizeof(*link->lgr->wr_tx_buf_v2)); 110 } else { 111 link->wr_tx_pends[pnd_snd_idx].wc_status = wc->status; 112 if (link->wr_tx_pends[pnd_snd_idx].compl_requested) 113 complete(&link->wr_tx_compl[pnd_snd_idx]); 114 memcpy(&pnd_snd, &link->wr_tx_pends[pnd_snd_idx], 115 sizeof(pnd_snd)); 116 /* clear the full struct smc_wr_tx_pend including .priv */ 117 memset(&link->wr_tx_pends[pnd_snd_idx], 0, 118 sizeof(link->wr_tx_pends[pnd_snd_idx])); 119 memset(&link->wr_tx_bufs[pnd_snd_idx], 0, 120 sizeof(link->wr_tx_bufs[pnd_snd_idx])); 121 if (!test_and_clear_bit(pnd_snd_idx, link->wr_tx_mask)) 122 return; 123 } 124 125 if (wc->status) { 126 if (link->lgr->smc_version == SMC_V2) { 127 memset(link->wr_tx_v2_pend, 0, 128 sizeof(*link->wr_tx_v2_pend)); 129 memset(link->lgr->wr_tx_buf_v2, 0, 130 sizeof(*link->lgr->wr_tx_buf_v2)); 131 } 132 /* terminate link */ 133 smcr_link_down_cond_sched(link); 134 } 135 if (pnd_snd.handler) 136 pnd_snd.handler(&pnd_snd.priv, link, wc->status); 137 wake_up(&link->wr_tx_wait); 138 } 139 140 static void smc_wr_tx_tasklet_fn(struct tasklet_struct *t) 141 { 142 struct smc_ib_device *dev = from_tasklet(dev, t, send_tasklet); 143 struct ib_wc wc[SMC_WR_MAX_POLL_CQE]; 144 int i = 0, rc; 145 int polled = 0; 146 147 again: 148 polled++; 149 do { 150 memset(&wc, 0, sizeof(wc)); 151 rc = ib_poll_cq(dev->roce_cq_send, SMC_WR_MAX_POLL_CQE, wc); 152 if (polled == 1) { 153 ib_req_notify_cq(dev->roce_cq_send, 154 IB_CQ_NEXT_COMP | 155 IB_CQ_REPORT_MISSED_EVENTS); 156 } 157 if (!rc) 158 break; 159 for (i = 0; i < rc; i++) 160 smc_wr_tx_process_cqe(&wc[i]); 161 } while (rc > 0); 162 if (polled == 1) 163 goto again; 164 } 165 166 void smc_wr_tx_cq_handler(struct ib_cq *ib_cq, void *cq_context) 167 { 168 struct smc_ib_device *dev = (struct smc_ib_device *)cq_context; 169 170 tasklet_schedule(&dev->send_tasklet); 171 } 172 173 /*---------------------------- request submission ---------------------------*/ 174 175 static inline int smc_wr_tx_get_free_slot_index(struct smc_link *link, u32 *idx) 176 { 177 *idx = link->wr_tx_cnt; 178 if (!smc_link_sendable(link)) 179 return -ENOLINK; 180 for_each_clear_bit(*idx, link->wr_tx_mask, link->wr_tx_cnt) { 181 if (!test_and_set_bit(*idx, link->wr_tx_mask)) 182 return 0; 183 } 184 *idx = link->wr_tx_cnt; 185 return -EBUSY; 186 } 187 188 /** 189 * smc_wr_tx_get_free_slot() - returns buffer for message assembly, 190 * and sets info for pending transmit tracking 191 * @link: Pointer to smc_link used to later send the message. 192 * @handler: Send completion handler function pointer. 193 * @wr_buf: Out value returns pointer to message buffer. 194 * @wr_rdma_buf: Out value returns pointer to rdma work request. 195 * @wr_pend_priv: Out value returns pointer serving as handler context. 196 * 197 * Return: 0 on success, or -errno on error. 198 */ 199 int smc_wr_tx_get_free_slot(struct smc_link *link, 200 smc_wr_tx_handler handler, 201 struct smc_wr_buf **wr_buf, 202 struct smc_rdma_wr **wr_rdma_buf, 203 struct smc_wr_tx_pend_priv **wr_pend_priv) 204 { 205 struct smc_link_group *lgr = smc_get_lgr(link); 206 struct smc_wr_tx_pend *wr_pend; 207 u32 idx = link->wr_tx_cnt; 208 struct ib_send_wr *wr_ib; 209 u64 wr_id; 210 int rc; 211 212 *wr_buf = NULL; 213 *wr_pend_priv = NULL; 214 if (in_softirq() || lgr->terminating) { 215 rc = smc_wr_tx_get_free_slot_index(link, &idx); 216 if (rc) 217 return rc; 218 } else { 219 rc = wait_event_interruptible_timeout( 220 link->wr_tx_wait, 221 !smc_link_sendable(link) || 222 lgr->terminating || 223 (smc_wr_tx_get_free_slot_index(link, &idx) != -EBUSY), 224 SMC_WR_TX_WAIT_FREE_SLOT_TIME); 225 if (!rc) { 226 /* timeout - terminate link */ 227 smcr_link_down_cond_sched(link); 228 return -EPIPE; 229 } 230 if (idx == link->wr_tx_cnt) 231 return -EPIPE; 232 } 233 wr_id = smc_wr_tx_get_next_wr_id(link); 234 wr_pend = &link->wr_tx_pends[idx]; 235 wr_pend->wr_id = wr_id; 236 wr_pend->handler = handler; 237 wr_pend->link = link; 238 wr_pend->idx = idx; 239 wr_ib = &link->wr_tx_ibs[idx]; 240 wr_ib->wr_id = wr_id; 241 *wr_buf = &link->wr_tx_bufs[idx]; 242 if (wr_rdma_buf) 243 *wr_rdma_buf = &link->wr_tx_rdmas[idx]; 244 *wr_pend_priv = &wr_pend->priv; 245 return 0; 246 } 247 248 int smc_wr_tx_get_v2_slot(struct smc_link *link, 249 smc_wr_tx_handler handler, 250 struct smc_wr_v2_buf **wr_buf, 251 struct smc_wr_tx_pend_priv **wr_pend_priv) 252 { 253 struct smc_wr_tx_pend *wr_pend; 254 struct ib_send_wr *wr_ib; 255 u64 wr_id; 256 257 if (link->wr_tx_v2_pend->idx == link->wr_tx_cnt) 258 return -EBUSY; 259 260 *wr_buf = NULL; 261 *wr_pend_priv = NULL; 262 wr_id = smc_wr_tx_get_next_wr_id(link); 263 wr_pend = link->wr_tx_v2_pend; 264 wr_pend->wr_id = wr_id; 265 wr_pend->handler = handler; 266 wr_pend->link = link; 267 wr_pend->idx = link->wr_tx_cnt; 268 wr_ib = link->wr_tx_v2_ib; 269 wr_ib->wr_id = wr_id; 270 *wr_buf = link->lgr->wr_tx_buf_v2; 271 *wr_pend_priv = &wr_pend->priv; 272 return 0; 273 } 274 275 int smc_wr_tx_put_slot(struct smc_link *link, 276 struct smc_wr_tx_pend_priv *wr_pend_priv) 277 { 278 struct smc_wr_tx_pend *pend; 279 280 pend = container_of(wr_pend_priv, struct smc_wr_tx_pend, priv); 281 if (pend->idx < link->wr_tx_cnt) { 282 u32 idx = pend->idx; 283 284 /* clear the full struct smc_wr_tx_pend including .priv */ 285 memset(&link->wr_tx_pends[idx], 0, 286 sizeof(link->wr_tx_pends[idx])); 287 memset(&link->wr_tx_bufs[idx], 0, 288 sizeof(link->wr_tx_bufs[idx])); 289 test_and_clear_bit(idx, link->wr_tx_mask); 290 wake_up(&link->wr_tx_wait); 291 return 1; 292 } else if (link->lgr->smc_version == SMC_V2 && 293 pend->idx == link->wr_tx_cnt) { 294 /* Large v2 buffer */ 295 memset(&link->wr_tx_v2_pend, 0, 296 sizeof(link->wr_tx_v2_pend)); 297 memset(&link->lgr->wr_tx_buf_v2, 0, 298 sizeof(link->lgr->wr_tx_buf_v2)); 299 return 1; 300 } 301 302 return 0; 303 } 304 305 /* Send prepared WR slot via ib_post_send. 306 * @priv: pointer to smc_wr_tx_pend_priv identifying prepared message buffer 307 */ 308 int smc_wr_tx_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv) 309 { 310 struct smc_wr_tx_pend *pend; 311 int rc; 312 313 ib_req_notify_cq(link->smcibdev->roce_cq_send, 314 IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); 315 pend = container_of(priv, struct smc_wr_tx_pend, priv); 316 rc = ib_post_send(link->roce_qp, &link->wr_tx_ibs[pend->idx], NULL); 317 if (rc) { 318 smc_wr_tx_put_slot(link, priv); 319 smcr_link_down_cond_sched(link); 320 } 321 return rc; 322 } 323 324 int smc_wr_tx_v2_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv, 325 int len) 326 { 327 int rc; 328 329 link->wr_tx_v2_ib->sg_list[0].length = len; 330 ib_req_notify_cq(link->smcibdev->roce_cq_send, 331 IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); 332 rc = ib_post_send(link->roce_qp, link->wr_tx_v2_ib, NULL); 333 if (rc) { 334 smc_wr_tx_put_slot(link, priv); 335 smcr_link_down_cond_sched(link); 336 } 337 return rc; 338 } 339 340 /* Send prepared WR slot via ib_post_send and wait for send completion 341 * notification. 342 * @priv: pointer to smc_wr_tx_pend_priv identifying prepared message buffer 343 */ 344 int smc_wr_tx_send_wait(struct smc_link *link, struct smc_wr_tx_pend_priv *priv, 345 unsigned long timeout) 346 { 347 struct smc_wr_tx_pend *pend; 348 u32 pnd_idx; 349 int rc; 350 351 pend = container_of(priv, struct smc_wr_tx_pend, priv); 352 pend->compl_requested = 1; 353 pnd_idx = pend->idx; 354 init_completion(&link->wr_tx_compl[pnd_idx]); 355 356 rc = smc_wr_tx_send(link, priv); 357 if (rc) 358 return rc; 359 /* wait for completion by smc_wr_tx_process_cqe() */ 360 rc = wait_for_completion_interruptible_timeout( 361 &link->wr_tx_compl[pnd_idx], timeout); 362 if (rc <= 0) 363 rc = -ENODATA; 364 if (rc > 0) 365 rc = 0; 366 return rc; 367 } 368 369 /* Register a memory region and wait for result. */ 370 int smc_wr_reg_send(struct smc_link *link, struct ib_mr *mr) 371 { 372 int rc; 373 374 ib_req_notify_cq(link->smcibdev->roce_cq_send, 375 IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); 376 link->wr_reg_state = POSTED; 377 link->wr_reg.wr.wr_id = (u64)(uintptr_t)mr; 378 link->wr_reg.mr = mr; 379 link->wr_reg.key = mr->rkey; 380 rc = ib_post_send(link->roce_qp, &link->wr_reg.wr, NULL); 381 if (rc) 382 return rc; 383 384 atomic_inc(&link->wr_reg_refcnt); 385 rc = wait_event_interruptible_timeout(link->wr_reg_wait, 386 (link->wr_reg_state != POSTED), 387 SMC_WR_REG_MR_WAIT_TIME); 388 if (atomic_dec_and_test(&link->wr_reg_refcnt)) 389 wake_up_all(&link->wr_reg_wait); 390 if (!rc) { 391 /* timeout - terminate link */ 392 smcr_link_down_cond_sched(link); 393 return -EPIPE; 394 } 395 if (rc == -ERESTARTSYS) 396 return -EINTR; 397 switch (link->wr_reg_state) { 398 case CONFIRMED: 399 rc = 0; 400 break; 401 case FAILED: 402 rc = -EIO; 403 break; 404 case POSTED: 405 rc = -EPIPE; 406 break; 407 } 408 return rc; 409 } 410 411 /****************************** receive queue ********************************/ 412 413 int smc_wr_rx_register_handler(struct smc_wr_rx_handler *handler) 414 { 415 struct smc_wr_rx_handler *h_iter; 416 int rc = 0; 417 418 spin_lock(&smc_wr_rx_hash_lock); 419 hash_for_each_possible(smc_wr_rx_hash, h_iter, list, handler->type) { 420 if (h_iter->type == handler->type) { 421 rc = -EEXIST; 422 goto out_unlock; 423 } 424 } 425 hash_add(smc_wr_rx_hash, &handler->list, handler->type); 426 out_unlock: 427 spin_unlock(&smc_wr_rx_hash_lock); 428 return rc; 429 } 430 431 /* Demultiplex a received work request based on the message type to its handler. 432 * Relies on smc_wr_rx_hash having been completely filled before any IB WRs, 433 * and not being modified any more afterwards so we don't need to lock it. 434 */ 435 static inline void smc_wr_rx_demultiplex(struct ib_wc *wc) 436 { 437 struct smc_link *link = (struct smc_link *)wc->qp->qp_context; 438 struct smc_wr_rx_handler *handler; 439 struct smc_wr_rx_hdr *wr_rx; 440 u64 temp_wr_id; 441 u32 index; 442 443 if (wc->byte_len < sizeof(*wr_rx)) 444 return; /* short message */ 445 temp_wr_id = wc->wr_id; 446 index = do_div(temp_wr_id, link->wr_rx_cnt); 447 wr_rx = (struct smc_wr_rx_hdr *)&link->wr_rx_bufs[index]; 448 hash_for_each_possible(smc_wr_rx_hash, handler, list, wr_rx->type) { 449 if (handler->type == wr_rx->type) 450 handler->handler(wc, wr_rx); 451 } 452 } 453 454 static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num) 455 { 456 struct smc_link *link; 457 int i; 458 459 for (i = 0; i < num; i++) { 460 link = wc[i].qp->qp_context; 461 if (wc[i].status == IB_WC_SUCCESS) { 462 link->wr_rx_tstamp = jiffies; 463 smc_wr_rx_demultiplex(&wc[i]); 464 smc_wr_rx_post(link); /* refill WR RX */ 465 } else { 466 /* handle status errors */ 467 switch (wc[i].status) { 468 case IB_WC_RETRY_EXC_ERR: 469 case IB_WC_RNR_RETRY_EXC_ERR: 470 case IB_WC_WR_FLUSH_ERR: 471 smcr_link_down_cond_sched(link); 472 break; 473 default: 474 smc_wr_rx_post(link); /* refill WR RX */ 475 break; 476 } 477 } 478 } 479 } 480 481 static void smc_wr_rx_tasklet_fn(struct tasklet_struct *t) 482 { 483 struct smc_ib_device *dev = from_tasklet(dev, t, recv_tasklet); 484 struct ib_wc wc[SMC_WR_MAX_POLL_CQE]; 485 int polled = 0; 486 int rc; 487 488 again: 489 polled++; 490 do { 491 memset(&wc, 0, sizeof(wc)); 492 rc = ib_poll_cq(dev->roce_cq_recv, SMC_WR_MAX_POLL_CQE, wc); 493 if (polled == 1) { 494 ib_req_notify_cq(dev->roce_cq_recv, 495 IB_CQ_SOLICITED_MASK 496 | IB_CQ_REPORT_MISSED_EVENTS); 497 } 498 if (!rc) 499 break; 500 smc_wr_rx_process_cqes(&wc[0], rc); 501 } while (rc > 0); 502 if (polled == 1) 503 goto again; 504 } 505 506 void smc_wr_rx_cq_handler(struct ib_cq *ib_cq, void *cq_context) 507 { 508 struct smc_ib_device *dev = (struct smc_ib_device *)cq_context; 509 510 tasklet_schedule(&dev->recv_tasklet); 511 } 512 513 int smc_wr_rx_post_init(struct smc_link *link) 514 { 515 u32 i; 516 int rc = 0; 517 518 for (i = 0; i < link->wr_rx_cnt; i++) 519 rc = smc_wr_rx_post(link); 520 return rc; 521 } 522 523 /***************************** init, exit, misc ******************************/ 524 525 void smc_wr_remember_qp_attr(struct smc_link *lnk) 526 { 527 struct ib_qp_attr *attr = &lnk->qp_attr; 528 struct ib_qp_init_attr init_attr; 529 530 memset(attr, 0, sizeof(*attr)); 531 memset(&init_attr, 0, sizeof(init_attr)); 532 ib_query_qp(lnk->roce_qp, attr, 533 IB_QP_STATE | 534 IB_QP_CUR_STATE | 535 IB_QP_PKEY_INDEX | 536 IB_QP_PORT | 537 IB_QP_QKEY | 538 IB_QP_AV | 539 IB_QP_PATH_MTU | 540 IB_QP_TIMEOUT | 541 IB_QP_RETRY_CNT | 542 IB_QP_RNR_RETRY | 543 IB_QP_RQ_PSN | 544 IB_QP_ALT_PATH | 545 IB_QP_MIN_RNR_TIMER | 546 IB_QP_SQ_PSN | 547 IB_QP_PATH_MIG_STATE | 548 IB_QP_CAP | 549 IB_QP_DEST_QPN, 550 &init_attr); 551 552 lnk->wr_tx_cnt = min_t(size_t, SMC_WR_BUF_CNT, 553 lnk->qp_attr.cap.max_send_wr); 554 lnk->wr_rx_cnt = min_t(size_t, SMC_WR_BUF_CNT * 3, 555 lnk->qp_attr.cap.max_recv_wr); 556 } 557 558 static void smc_wr_init_sge(struct smc_link *lnk) 559 { 560 int sges_per_buf = (lnk->lgr->smc_version == SMC_V2) ? 2 : 1; 561 u32 i; 562 563 for (i = 0; i < lnk->wr_tx_cnt; i++) { 564 lnk->wr_tx_sges[i].addr = 565 lnk->wr_tx_dma_addr + i * SMC_WR_BUF_SIZE; 566 lnk->wr_tx_sges[i].length = SMC_WR_TX_SIZE; 567 lnk->wr_tx_sges[i].lkey = lnk->roce_pd->local_dma_lkey; 568 lnk->wr_tx_rdma_sges[i].tx_rdma_sge[0].wr_tx_rdma_sge[0].lkey = 569 lnk->roce_pd->local_dma_lkey; 570 lnk->wr_tx_rdma_sges[i].tx_rdma_sge[0].wr_tx_rdma_sge[1].lkey = 571 lnk->roce_pd->local_dma_lkey; 572 lnk->wr_tx_rdma_sges[i].tx_rdma_sge[1].wr_tx_rdma_sge[0].lkey = 573 lnk->roce_pd->local_dma_lkey; 574 lnk->wr_tx_rdma_sges[i].tx_rdma_sge[1].wr_tx_rdma_sge[1].lkey = 575 lnk->roce_pd->local_dma_lkey; 576 lnk->wr_tx_ibs[i].next = NULL; 577 lnk->wr_tx_ibs[i].sg_list = &lnk->wr_tx_sges[i]; 578 lnk->wr_tx_ibs[i].num_sge = 1; 579 lnk->wr_tx_ibs[i].opcode = IB_WR_SEND; 580 lnk->wr_tx_ibs[i].send_flags = 581 IB_SEND_SIGNALED | IB_SEND_SOLICITED; 582 lnk->wr_tx_rdmas[i].wr_tx_rdma[0].wr.opcode = IB_WR_RDMA_WRITE; 583 lnk->wr_tx_rdmas[i].wr_tx_rdma[1].wr.opcode = IB_WR_RDMA_WRITE; 584 lnk->wr_tx_rdmas[i].wr_tx_rdma[0].wr.sg_list = 585 lnk->wr_tx_rdma_sges[i].tx_rdma_sge[0].wr_tx_rdma_sge; 586 lnk->wr_tx_rdmas[i].wr_tx_rdma[1].wr.sg_list = 587 lnk->wr_tx_rdma_sges[i].tx_rdma_sge[1].wr_tx_rdma_sge; 588 } 589 590 if (lnk->lgr->smc_version == SMC_V2) { 591 lnk->wr_tx_v2_sge->addr = lnk->wr_tx_v2_dma_addr; 592 lnk->wr_tx_v2_sge->length = SMC_WR_BUF_V2_SIZE; 593 lnk->wr_tx_v2_sge->lkey = lnk->roce_pd->local_dma_lkey; 594 595 lnk->wr_tx_v2_ib->next = NULL; 596 lnk->wr_tx_v2_ib->sg_list = lnk->wr_tx_v2_sge; 597 lnk->wr_tx_v2_ib->num_sge = 1; 598 lnk->wr_tx_v2_ib->opcode = IB_WR_SEND; 599 lnk->wr_tx_v2_ib->send_flags = 600 IB_SEND_SIGNALED | IB_SEND_SOLICITED; 601 } 602 603 /* With SMC-Rv2 there can be messages larger than SMC_WR_TX_SIZE. 604 * Each ib_recv_wr gets 2 sges, the second one is a spillover buffer 605 * and the same buffer for all sges. When a larger message arrived then 606 * the content of the first small sge is copied to the beginning of 607 * the larger spillover buffer, allowing easy data mapping. 608 */ 609 for (i = 0; i < lnk->wr_rx_cnt; i++) { 610 int x = i * sges_per_buf; 611 612 lnk->wr_rx_sges[x].addr = 613 lnk->wr_rx_dma_addr + i * SMC_WR_BUF_SIZE; 614 lnk->wr_rx_sges[x].length = SMC_WR_TX_SIZE; 615 lnk->wr_rx_sges[x].lkey = lnk->roce_pd->local_dma_lkey; 616 if (lnk->lgr->smc_version == SMC_V2) { 617 lnk->wr_rx_sges[x + 1].addr = 618 lnk->wr_rx_v2_dma_addr + SMC_WR_TX_SIZE; 619 lnk->wr_rx_sges[x + 1].length = 620 SMC_WR_BUF_V2_SIZE - SMC_WR_TX_SIZE; 621 lnk->wr_rx_sges[x + 1].lkey = 622 lnk->roce_pd->local_dma_lkey; 623 } 624 lnk->wr_rx_ibs[i].next = NULL; 625 lnk->wr_rx_ibs[i].sg_list = &lnk->wr_rx_sges[x]; 626 lnk->wr_rx_ibs[i].num_sge = sges_per_buf; 627 } 628 lnk->wr_reg.wr.next = NULL; 629 lnk->wr_reg.wr.num_sge = 0; 630 lnk->wr_reg.wr.send_flags = IB_SEND_SIGNALED; 631 lnk->wr_reg.wr.opcode = IB_WR_REG_MR; 632 lnk->wr_reg.access = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE; 633 } 634 635 void smc_wr_free_link(struct smc_link *lnk) 636 { 637 struct ib_device *ibdev; 638 639 if (!lnk->smcibdev) 640 return; 641 ibdev = lnk->smcibdev->ibdev; 642 643 smc_wr_wakeup_reg_wait(lnk); 644 smc_wr_wakeup_tx_wait(lnk); 645 646 smc_wr_tx_wait_no_pending_sends(lnk); 647 wait_event(lnk->wr_reg_wait, (!atomic_read(&lnk->wr_reg_refcnt))); 648 wait_event(lnk->wr_tx_wait, (!atomic_read(&lnk->wr_tx_refcnt))); 649 650 if (lnk->wr_rx_dma_addr) { 651 ib_dma_unmap_single(ibdev, lnk->wr_rx_dma_addr, 652 SMC_WR_BUF_SIZE * lnk->wr_rx_cnt, 653 DMA_FROM_DEVICE); 654 lnk->wr_rx_dma_addr = 0; 655 } 656 if (lnk->wr_rx_v2_dma_addr) { 657 ib_dma_unmap_single(ibdev, lnk->wr_rx_v2_dma_addr, 658 SMC_WR_BUF_V2_SIZE, 659 DMA_FROM_DEVICE); 660 lnk->wr_rx_v2_dma_addr = 0; 661 } 662 if (lnk->wr_tx_dma_addr) { 663 ib_dma_unmap_single(ibdev, lnk->wr_tx_dma_addr, 664 SMC_WR_BUF_SIZE * lnk->wr_tx_cnt, 665 DMA_TO_DEVICE); 666 lnk->wr_tx_dma_addr = 0; 667 } 668 if (lnk->wr_tx_v2_dma_addr) { 669 ib_dma_unmap_single(ibdev, lnk->wr_tx_v2_dma_addr, 670 SMC_WR_BUF_V2_SIZE, 671 DMA_TO_DEVICE); 672 lnk->wr_tx_v2_dma_addr = 0; 673 } 674 } 675 676 void smc_wr_free_lgr_mem(struct smc_link_group *lgr) 677 { 678 if (lgr->smc_version < SMC_V2) 679 return; 680 681 kfree(lgr->wr_rx_buf_v2); 682 lgr->wr_rx_buf_v2 = NULL; 683 kfree(lgr->wr_tx_buf_v2); 684 lgr->wr_tx_buf_v2 = NULL; 685 } 686 687 void smc_wr_free_link_mem(struct smc_link *lnk) 688 { 689 kfree(lnk->wr_tx_v2_ib); 690 lnk->wr_tx_v2_ib = NULL; 691 kfree(lnk->wr_tx_v2_sge); 692 lnk->wr_tx_v2_sge = NULL; 693 kfree(lnk->wr_tx_v2_pend); 694 lnk->wr_tx_v2_pend = NULL; 695 kfree(lnk->wr_tx_compl); 696 lnk->wr_tx_compl = NULL; 697 kfree(lnk->wr_tx_pends); 698 lnk->wr_tx_pends = NULL; 699 kfree(lnk->wr_tx_mask); 700 lnk->wr_tx_mask = NULL; 701 kfree(lnk->wr_tx_sges); 702 lnk->wr_tx_sges = NULL; 703 kfree(lnk->wr_tx_rdma_sges); 704 lnk->wr_tx_rdma_sges = NULL; 705 kfree(lnk->wr_rx_sges); 706 lnk->wr_rx_sges = NULL; 707 kfree(lnk->wr_tx_rdmas); 708 lnk->wr_tx_rdmas = NULL; 709 kfree(lnk->wr_rx_ibs); 710 lnk->wr_rx_ibs = NULL; 711 kfree(lnk->wr_tx_ibs); 712 lnk->wr_tx_ibs = NULL; 713 kfree(lnk->wr_tx_bufs); 714 lnk->wr_tx_bufs = NULL; 715 kfree(lnk->wr_rx_bufs); 716 lnk->wr_rx_bufs = NULL; 717 } 718 719 int smc_wr_alloc_lgr_mem(struct smc_link_group *lgr) 720 { 721 if (lgr->smc_version < SMC_V2) 722 return 0; 723 724 lgr->wr_rx_buf_v2 = kzalloc(SMC_WR_BUF_V2_SIZE, GFP_KERNEL); 725 if (!lgr->wr_rx_buf_v2) 726 return -ENOMEM; 727 lgr->wr_tx_buf_v2 = kzalloc(SMC_WR_BUF_V2_SIZE, GFP_KERNEL); 728 if (!lgr->wr_tx_buf_v2) { 729 kfree(lgr->wr_rx_buf_v2); 730 return -ENOMEM; 731 } 732 return 0; 733 } 734 735 int smc_wr_alloc_link_mem(struct smc_link *link) 736 { 737 int sges_per_buf = link->lgr->smc_version == SMC_V2 ? 2 : 1; 738 739 /* allocate link related memory */ 740 link->wr_tx_bufs = kcalloc(SMC_WR_BUF_CNT, SMC_WR_BUF_SIZE, GFP_KERNEL); 741 if (!link->wr_tx_bufs) 742 goto no_mem; 743 link->wr_rx_bufs = kcalloc(SMC_WR_BUF_CNT * 3, SMC_WR_BUF_SIZE, 744 GFP_KERNEL); 745 if (!link->wr_rx_bufs) 746 goto no_mem_wr_tx_bufs; 747 link->wr_tx_ibs = kcalloc(SMC_WR_BUF_CNT, sizeof(link->wr_tx_ibs[0]), 748 GFP_KERNEL); 749 if (!link->wr_tx_ibs) 750 goto no_mem_wr_rx_bufs; 751 link->wr_rx_ibs = kcalloc(SMC_WR_BUF_CNT * 3, 752 sizeof(link->wr_rx_ibs[0]), 753 GFP_KERNEL); 754 if (!link->wr_rx_ibs) 755 goto no_mem_wr_tx_ibs; 756 link->wr_tx_rdmas = kcalloc(SMC_WR_BUF_CNT, 757 sizeof(link->wr_tx_rdmas[0]), 758 GFP_KERNEL); 759 if (!link->wr_tx_rdmas) 760 goto no_mem_wr_rx_ibs; 761 link->wr_tx_rdma_sges = kcalloc(SMC_WR_BUF_CNT, 762 sizeof(link->wr_tx_rdma_sges[0]), 763 GFP_KERNEL); 764 if (!link->wr_tx_rdma_sges) 765 goto no_mem_wr_tx_rdmas; 766 link->wr_tx_sges = kcalloc(SMC_WR_BUF_CNT, sizeof(link->wr_tx_sges[0]), 767 GFP_KERNEL); 768 if (!link->wr_tx_sges) 769 goto no_mem_wr_tx_rdma_sges; 770 link->wr_rx_sges = kcalloc(SMC_WR_BUF_CNT * 3, 771 sizeof(link->wr_rx_sges[0]) * sges_per_buf, 772 GFP_KERNEL); 773 if (!link->wr_rx_sges) 774 goto no_mem_wr_tx_sges; 775 link->wr_tx_mask = kcalloc(BITS_TO_LONGS(SMC_WR_BUF_CNT), 776 sizeof(*link->wr_tx_mask), 777 GFP_KERNEL); 778 if (!link->wr_tx_mask) 779 goto no_mem_wr_rx_sges; 780 link->wr_tx_pends = kcalloc(SMC_WR_BUF_CNT, 781 sizeof(link->wr_tx_pends[0]), 782 GFP_KERNEL); 783 if (!link->wr_tx_pends) 784 goto no_mem_wr_tx_mask; 785 link->wr_tx_compl = kcalloc(SMC_WR_BUF_CNT, 786 sizeof(link->wr_tx_compl[0]), 787 GFP_KERNEL); 788 if (!link->wr_tx_compl) 789 goto no_mem_wr_tx_pends; 790 791 if (link->lgr->smc_version == SMC_V2) { 792 link->wr_tx_v2_ib = kzalloc(sizeof(*link->wr_tx_v2_ib), 793 GFP_KERNEL); 794 if (!link->wr_tx_v2_ib) 795 goto no_mem_tx_compl; 796 link->wr_tx_v2_sge = kzalloc(sizeof(*link->wr_tx_v2_sge), 797 GFP_KERNEL); 798 if (!link->wr_tx_v2_sge) 799 goto no_mem_v2_ib; 800 link->wr_tx_v2_pend = kzalloc(sizeof(*link->wr_tx_v2_pend), 801 GFP_KERNEL); 802 if (!link->wr_tx_v2_pend) 803 goto no_mem_v2_sge; 804 } 805 return 0; 806 807 no_mem_v2_sge: 808 kfree(link->wr_tx_v2_sge); 809 no_mem_v2_ib: 810 kfree(link->wr_tx_v2_ib); 811 no_mem_tx_compl: 812 kfree(link->wr_tx_compl); 813 no_mem_wr_tx_pends: 814 kfree(link->wr_tx_pends); 815 no_mem_wr_tx_mask: 816 kfree(link->wr_tx_mask); 817 no_mem_wr_rx_sges: 818 kfree(link->wr_rx_sges); 819 no_mem_wr_tx_sges: 820 kfree(link->wr_tx_sges); 821 no_mem_wr_tx_rdma_sges: 822 kfree(link->wr_tx_rdma_sges); 823 no_mem_wr_tx_rdmas: 824 kfree(link->wr_tx_rdmas); 825 no_mem_wr_rx_ibs: 826 kfree(link->wr_rx_ibs); 827 no_mem_wr_tx_ibs: 828 kfree(link->wr_tx_ibs); 829 no_mem_wr_rx_bufs: 830 kfree(link->wr_rx_bufs); 831 no_mem_wr_tx_bufs: 832 kfree(link->wr_tx_bufs); 833 no_mem: 834 return -ENOMEM; 835 } 836 837 void smc_wr_remove_dev(struct smc_ib_device *smcibdev) 838 { 839 tasklet_kill(&smcibdev->recv_tasklet); 840 tasklet_kill(&smcibdev->send_tasklet); 841 } 842 843 void smc_wr_add_dev(struct smc_ib_device *smcibdev) 844 { 845 tasklet_setup(&smcibdev->recv_tasklet, smc_wr_rx_tasklet_fn); 846 tasklet_setup(&smcibdev->send_tasklet, smc_wr_tx_tasklet_fn); 847 } 848 849 int smc_wr_create_link(struct smc_link *lnk) 850 { 851 struct ib_device *ibdev = lnk->smcibdev->ibdev; 852 int rc = 0; 853 854 smc_wr_tx_set_wr_id(&lnk->wr_tx_id, 0); 855 lnk->wr_rx_id = 0; 856 lnk->wr_rx_dma_addr = ib_dma_map_single( 857 ibdev, lnk->wr_rx_bufs, SMC_WR_BUF_SIZE * lnk->wr_rx_cnt, 858 DMA_FROM_DEVICE); 859 if (ib_dma_mapping_error(ibdev, lnk->wr_rx_dma_addr)) { 860 lnk->wr_rx_dma_addr = 0; 861 rc = -EIO; 862 goto out; 863 } 864 if (lnk->lgr->smc_version == SMC_V2) { 865 lnk->wr_rx_v2_dma_addr = ib_dma_map_single(ibdev, 866 lnk->lgr->wr_rx_buf_v2, SMC_WR_BUF_V2_SIZE, 867 DMA_FROM_DEVICE); 868 if (ib_dma_mapping_error(ibdev, lnk->wr_rx_v2_dma_addr)) { 869 lnk->wr_rx_v2_dma_addr = 0; 870 rc = -EIO; 871 goto dma_unmap; 872 } 873 lnk->wr_tx_v2_dma_addr = ib_dma_map_single(ibdev, 874 lnk->lgr->wr_tx_buf_v2, SMC_WR_BUF_V2_SIZE, 875 DMA_TO_DEVICE); 876 if (ib_dma_mapping_error(ibdev, lnk->wr_tx_v2_dma_addr)) { 877 lnk->wr_tx_v2_dma_addr = 0; 878 rc = -EIO; 879 goto dma_unmap; 880 } 881 } 882 lnk->wr_tx_dma_addr = ib_dma_map_single( 883 ibdev, lnk->wr_tx_bufs, SMC_WR_BUF_SIZE * lnk->wr_tx_cnt, 884 DMA_TO_DEVICE); 885 if (ib_dma_mapping_error(ibdev, lnk->wr_tx_dma_addr)) { 886 rc = -EIO; 887 goto dma_unmap; 888 } 889 smc_wr_init_sge(lnk); 890 memset(lnk->wr_tx_mask, 0, 891 BITS_TO_LONGS(SMC_WR_BUF_CNT) * sizeof(*lnk->wr_tx_mask)); 892 init_waitqueue_head(&lnk->wr_tx_wait); 893 atomic_set(&lnk->wr_tx_refcnt, 0); 894 init_waitqueue_head(&lnk->wr_reg_wait); 895 atomic_set(&lnk->wr_reg_refcnt, 0); 896 return rc; 897 898 dma_unmap: 899 if (lnk->wr_rx_v2_dma_addr) { 900 ib_dma_unmap_single(ibdev, lnk->wr_rx_v2_dma_addr, 901 SMC_WR_BUF_V2_SIZE, 902 DMA_FROM_DEVICE); 903 lnk->wr_rx_v2_dma_addr = 0; 904 } 905 if (lnk->wr_tx_v2_dma_addr) { 906 ib_dma_unmap_single(ibdev, lnk->wr_tx_v2_dma_addr, 907 SMC_WR_BUF_V2_SIZE, 908 DMA_TO_DEVICE); 909 lnk->wr_tx_v2_dma_addr = 0; 910 } 911 ib_dma_unmap_single(ibdev, lnk->wr_rx_dma_addr, 912 SMC_WR_BUF_SIZE * lnk->wr_rx_cnt, 913 DMA_FROM_DEVICE); 914 lnk->wr_rx_dma_addr = 0; 915 out: 916 return rc; 917 } 918