// SPDX-License-Identifier: GPL-2.0
/*
 * Shared Memory Communications over RDMA (SMC-R) and RoCE
 *
 * Work Requests exploiting Infiniband API
 *
 * Work requests (WR) of type ib_post_send or ib_post_recv
 * are submitted to the RC SQ or RC RQ, respectively
 * (reliably connected send/receive queue),
 * and become work queue entries (WQEs).
 * While an SQ WR/WQE is pending, we track it until transmission completion.
 * Through a send or receive completion queue (CQ) respectively,
 * we get completion queue entries (CQEs) [aka work completions (WCs)].
 * Since the CQ callback is called from IRQ context, we split work by using
 * bottom halves implemented by tasklets.
 *
 * SMC uses this to exchange LLC (link layer control)
 * and CDC (connection data control) messages.
 *
 * Copyright IBM Corp. 2016
 *
 * Author(s): Steffen Maier <maier@linux.vnet.ibm.com>
 */

#include <linux/atomic.h>
#include <linux/hashtable.h>
#include <linux/wait.h>
#include <rdma/ib_verbs.h>
#include <asm/div64.h>

#include "smc.h"
#include "smc_wr.h"

#define SMC_WR_MAX_POLL_CQE 10	/* max. # of compl. queue elements in 1 poll */

#define SMC_WR_RX_HASH_BITS 4
static DEFINE_HASHTABLE(smc_wr_rx_hash, SMC_WR_RX_HASH_BITS);
static DEFINE_SPINLOCK(smc_wr_rx_hash_lock);

struct smc_wr_tx_pend {	/* control data for a pending send request */
	u64			wr_id;		/* work request id sent */
	smc_wr_tx_handler	handler;
	enum ib_wc_status	wc_status;	/* CQE status */
	struct smc_link		*link;
	u32			idx;
	struct smc_wr_tx_pend_priv priv;
	u8			compl_requested;
};

/******************************** send queue *********************************/

/*------------------------------- completion --------------------------------*/

/* returns true if at least one tx work request is pending on the given link */
static inline bool smc_wr_is_tx_pend(struct smc_link *link)
{
	return find_first_bit(link->wr_tx_mask, link->wr_tx_cnt) !=
							link->wr_tx_cnt;
}

/* wait till all pending tx work requests on the given link are completed */
int smc_wr_tx_wait_no_pending_sends(struct smc_link *link)
{
	if (wait_event_timeout(link->wr_tx_wait, !smc_wr_is_tx_pend(link),
			       SMC_WR_TX_WAIT_PENDING_TIME))
		return 0;
	else /* timeout */
		return -EPIPE;
}

static inline int smc_wr_tx_find_pending_index(struct smc_link *link, u64 wr_id)
{
	u32 i;

	for (i = 0; i < link->wr_tx_cnt; i++) {
		if (link->wr_tx_pends[i].wr_id == wr_id)
			return i;
	}
	return link->wr_tx_cnt;
}
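
/* Process one send completion: resolve the completed slot (either a regular
 * tx slot or the single large SMC-Rv2 slot), hand the completion status to
 * the registered tx handler, and release the slot. A completion with error
 * status flushes all remaining pending tx slots and schedules link
 * termination.
 */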
static inline void smc_wr_tx_process_cqe(struct ib_wc *wc)
{
	struct smc_wr_tx_pend pnd_snd;
	struct smc_link *link;
	u32 pnd_snd_idx;
	int i;

	link = wc->qp->qp_context;

	if (wc->opcode == IB_WC_REG_MR) {
		if (wc->status)
			link->wr_reg_state = FAILED;
		else
			link->wr_reg_state = CONFIRMED;
		smc_wr_wakeup_reg_wait(link);
		return;
	}

	pnd_snd_idx = smc_wr_tx_find_pending_index(link, wc->wr_id);
	if (pnd_snd_idx == link->wr_tx_cnt) {
		if (link->lgr->smc_version != SMC_V2 ||
		    link->wr_tx_v2_pend->wr_id != wc->wr_id)
			return;
		link->wr_tx_v2_pend->wc_status = wc->status;
		memcpy(&pnd_snd, link->wr_tx_v2_pend, sizeof(pnd_snd));
		/* clear the full struct smc_wr_tx_pend including .priv */
		memset(link->wr_tx_v2_pend, 0,
		       sizeof(*link->wr_tx_v2_pend));
		memset(link->lgr->wr_tx_buf_v2, 0,
		       sizeof(*link->lgr->wr_tx_buf_v2));
	} else {
		link->wr_tx_pends[pnd_snd_idx].wc_status = wc->status;
		if (link->wr_tx_pends[pnd_snd_idx].compl_requested)
			complete(&link->wr_tx_compl[pnd_snd_idx]);
		memcpy(&pnd_snd, &link->wr_tx_pends[pnd_snd_idx],
		       sizeof(pnd_snd));
		/* clear the full struct smc_wr_tx_pend including .priv */
		memset(&link->wr_tx_pends[pnd_snd_idx], 0,
		       sizeof(link->wr_tx_pends[pnd_snd_idx]));
		memset(&link->wr_tx_bufs[pnd_snd_idx], 0,
		       sizeof(link->wr_tx_bufs[pnd_snd_idx]));
		if (!test_and_clear_bit(pnd_snd_idx, link->wr_tx_mask))
			return;
	}

	if (wc->status) {
		for_each_set_bit(i, link->wr_tx_mask, link->wr_tx_cnt) {
			/* clear full struct smc_wr_tx_pend including .priv */
			memset(&link->wr_tx_pends[i], 0,
			       sizeof(link->wr_tx_pends[i]));
			memset(&link->wr_tx_bufs[i], 0,
			       sizeof(link->wr_tx_bufs[i]));
			clear_bit(i, link->wr_tx_mask);
		}
		if (link->lgr->smc_version == SMC_V2) {
			memset(link->wr_tx_v2_pend, 0,
			       sizeof(*link->wr_tx_v2_pend));
			memset(link->lgr->wr_tx_buf_v2, 0,
			       sizeof(*link->lgr->wr_tx_buf_v2));
		}
		/* terminate link */
		smcr_link_down_cond_sched(link);
	}
	if (pnd_snd.handler)
		pnd_snd.handler(&pnd_snd.priv, link, wc->status);
	wake_up(&link->wr_tx_wait);
}

static void smc_wr_tx_tasklet_fn(struct tasklet_struct *t)
{
	struct smc_ib_device *dev = from_tasklet(dev, t, send_tasklet);
	struct ib_wc wc[SMC_WR_MAX_POLL_CQE];
	int i = 0, rc;
	int polled = 0;

again:
	polled++;
	do {
		memset(&wc, 0, sizeof(wc));
		rc = ib_poll_cq(dev->roce_cq_send, SMC_WR_MAX_POLL_CQE, wc);
		if (polled == 1) {
			ib_req_notify_cq(dev->roce_cq_send,
					 IB_CQ_NEXT_COMP |
					 IB_CQ_REPORT_MISSED_EVENTS);
		}
		if (!rc)
			break;
		for (i = 0; i < rc; i++)
			smc_wr_tx_process_cqe(&wc[i]);
	} while (rc > 0);
	if (polled == 1)
		goto again;
}

void smc_wr_tx_cq_handler(struct ib_cq *ib_cq, void *cq_context)
{
	struct smc_ib_device *dev = (struct smc_ib_device *)cq_context;

	tasklet_schedule(&dev->send_tasklet);
}

/*---------------------------- request submission ---------------------------*/
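
/* A send slot is claimed by atomically setting its bit in wr_tx_mask with
 * test_and_set_bit(), so concurrent senders never pick the same slot. The
 * slot index is used for wr_tx_bufs[], wr_tx_ibs[] and wr_tx_pends[] alike.
 */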
static inline int smc_wr_tx_get_free_slot_index(struct smc_link *link, u32 *idx)
{
	*idx = link->wr_tx_cnt;
	if (!smc_link_usable(link))
		return -ENOLINK;
	for_each_clear_bit(*idx, link->wr_tx_mask, link->wr_tx_cnt) {
		if (!test_and_set_bit(*idx, link->wr_tx_mask))
			return 0;
	}
	*idx = link->wr_tx_cnt;
	return -EBUSY;
}

/**
 * smc_wr_tx_get_free_slot() - returns buffer for message assembly,
 *			and sets info for pending transmit tracking
 * @link:		Pointer to smc_link used to later send the message.
 * @handler:		Send completion handler function pointer.
 * @wr_buf:		Out value returns pointer to message buffer.
 * @wr_rdma_buf:	Out value returns pointer to rdma work request.
 * @wr_pend_priv:	Out value returns pointer serving as handler context.
 *
 * Return: 0 on success, or -errno on error.
 */
int smc_wr_tx_get_free_slot(struct smc_link *link,
			    smc_wr_tx_handler handler,
			    struct smc_wr_buf **wr_buf,
			    struct smc_rdma_wr **wr_rdma_buf,
			    struct smc_wr_tx_pend_priv **wr_pend_priv)
{
	struct smc_link_group *lgr = smc_get_lgr(link);
	struct smc_wr_tx_pend *wr_pend;
	u32 idx = link->wr_tx_cnt;
	struct ib_send_wr *wr_ib;
	u64 wr_id;
	int rc;

	*wr_buf = NULL;
	*wr_pend_priv = NULL;
	if (in_softirq() || lgr->terminating) {
		rc = smc_wr_tx_get_free_slot_index(link, &idx);
		if (rc)
			return rc;
	} else {
		rc = wait_event_interruptible_timeout(
			link->wr_tx_wait,
			!smc_link_usable(link) ||
			lgr->terminating ||
			(smc_wr_tx_get_free_slot_index(link, &idx) != -EBUSY),
			SMC_WR_TX_WAIT_FREE_SLOT_TIME);
		if (!rc) {
			/* timeout - terminate link */
			smcr_link_down_cond_sched(link);
			return -EPIPE;
		}
		if (idx == link->wr_tx_cnt)
			return -EPIPE;
	}
	wr_id = smc_wr_tx_get_next_wr_id(link);
	wr_pend = &link->wr_tx_pends[idx];
	wr_pend->wr_id = wr_id;
	wr_pend->handler = handler;
	wr_pend->link = link;
	wr_pend->idx = idx;
	wr_ib = &link->wr_tx_ibs[idx];
	wr_ib->wr_id = wr_id;
	*wr_buf = &link->wr_tx_bufs[idx];
	if (wr_rdma_buf)
		*wr_rdma_buf = &link->wr_tx_rdmas[idx];
	*wr_pend_priv = &wr_pend->priv;
	return 0;
}

int smc_wr_tx_get_v2_slot(struct smc_link *link,
			  smc_wr_tx_handler handler,
			  struct smc_wr_v2_buf **wr_buf,
			  struct smc_wr_tx_pend_priv **wr_pend_priv)
{
	struct smc_wr_tx_pend *wr_pend;
	struct ib_send_wr *wr_ib;
	u64 wr_id;

	if (link->wr_tx_v2_pend->idx == link->wr_tx_cnt)
		return -EBUSY;

	*wr_buf = NULL;
	*wr_pend_priv = NULL;
	wr_id = smc_wr_tx_get_next_wr_id(link);
	wr_pend = link->wr_tx_v2_pend;
	wr_pend->wr_id = wr_id;
	wr_pend->handler = handler;
	wr_pend->link = link;
	wr_pend->idx = link->wr_tx_cnt;
	wr_ib = link->wr_tx_v2_ib;
	wr_ib->wr_id = wr_id;
	*wr_buf = link->lgr->wr_tx_buf_v2;
	*wr_pend_priv = &wr_pend->priv;
	return 0;
}

int smc_wr_tx_put_slot(struct smc_link *link,
		       struct smc_wr_tx_pend_priv *wr_pend_priv)
{
	struct smc_wr_tx_pend *pend;

	pend = container_of(wr_pend_priv, struct smc_wr_tx_pend, priv);
	if (pend->idx < link->wr_tx_cnt) {
		u32 idx = pend->idx;

		/* clear the full struct smc_wr_tx_pend including .priv */
		memset(&link->wr_tx_pends[idx], 0,
		       sizeof(link->wr_tx_pends[idx]));
		memset(&link->wr_tx_bufs[idx], 0,
		       sizeof(link->wr_tx_bufs[idx]));
		test_and_clear_bit(idx, link->wr_tx_mask);
		wake_up(&link->wr_tx_wait);
		return 1;
	} else if (link->lgr->smc_version == SMC_V2 &&
		   pend->idx == link->wr_tx_cnt) {
		/* Large v2 buffer */
		memset(&link->wr_tx_v2_pend, 0,
		       sizeof(link->wr_tx_v2_pend));
		memset(&link->lgr->wr_tx_buf_v2, 0,
		       sizeof(link->lgr->wr_tx_buf_v2));
		return 1;
	}

	return 0;
}
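
/* Illustrative use of the send slot API by a caller such as the LLC or CDC
 * layer (a sketch, not a literal call site; my_tx_handler stands for any
 * smc_wr_tx_handler callback):
 *
 *	struct smc_wr_tx_pend_priv *pend;
 *	struct smc_wr_buf *wr_buf;
 *	int rc;
 *
 *	rc = smc_wr_tx_get_free_slot(link, my_tx_handler, &wr_buf, NULL, &pend);
 *	if (rc)
 *		return rc;
 *	... assemble the message in *wr_buf ...
 *	rc = smc_wr_tx_send(link, pend);
 *
 * A caller that gives up before sending releases the slot again with
 * smc_wr_tx_put_slot(link, pend).
 */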
/* Send prepared WR slot via ib_post_send.
 * @priv: pointer to smc_wr_tx_pend_priv identifying prepared message buffer
 */
int smc_wr_tx_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv)
{
	struct smc_wr_tx_pend *pend;
	int rc;

	ib_req_notify_cq(link->smcibdev->roce_cq_send,
			 IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
	pend = container_of(priv, struct smc_wr_tx_pend, priv);
	rc = ib_post_send(link->roce_qp, &link->wr_tx_ibs[pend->idx], NULL);
	if (rc) {
		smc_wr_tx_put_slot(link, priv);
		smcr_link_down_cond_sched(link);
	}
	return rc;
}

int smc_wr_tx_v2_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv,
		      int len)
{
	int rc;

	link->wr_tx_v2_ib->sg_list[0].length = len;
	ib_req_notify_cq(link->smcibdev->roce_cq_send,
			 IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
	rc = ib_post_send(link->roce_qp, link->wr_tx_v2_ib, NULL);
	if (rc) {
		smc_wr_tx_put_slot(link, priv);
		smcr_link_down_cond_sched(link);
	}
	return rc;
}

/* Send prepared WR slot via ib_post_send and wait for send completion
 * notification.
 * @priv: pointer to smc_wr_tx_pend_priv identifying prepared message buffer
 */
int smc_wr_tx_send_wait(struct smc_link *link, struct smc_wr_tx_pend_priv *priv,
			unsigned long timeout)
{
	struct smc_wr_tx_pend *pend;
	int rc;

	pend = container_of(priv, struct smc_wr_tx_pend, priv);
	pend->compl_requested = 1;
	init_completion(&link->wr_tx_compl[pend->idx]);

	rc = smc_wr_tx_send(link, priv);
	if (rc)
		return rc;
	/* wait for completion by smc_wr_tx_process_cqe() */
	rc = wait_for_completion_interruptible_timeout(
					&link->wr_tx_compl[pend->idx], timeout);
	if (rc <= 0)
		rc = -ENODATA;
	if (rc > 0)
		rc = 0;
	return rc;
}
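
/* Note: smc_wr_tx_send_wait() differs from smc_wr_tx_send() only in that it
 * sleeps until the send CQE has been processed (signalled through
 * link->wr_tx_compl[idx] in smc_wr_tx_process_cqe()); if @timeout expires or
 * the wait is interrupted, it returns -ENODATA.
 */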
/* Register a memory region and wait for result. */
int smc_wr_reg_send(struct smc_link *link, struct ib_mr *mr)
{
	int rc;

	ib_req_notify_cq(link->smcibdev->roce_cq_send,
			 IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
	link->wr_reg_state = POSTED;
	link->wr_reg.wr.wr_id = (u64)(uintptr_t)mr;
	link->wr_reg.mr = mr;
	link->wr_reg.key = mr->rkey;
	rc = ib_post_send(link->roce_qp, &link->wr_reg.wr, NULL);
	if (rc)
		return rc;

	atomic_inc(&link->wr_reg_refcnt);
	rc = wait_event_interruptible_timeout(link->wr_reg_wait,
					      (link->wr_reg_state != POSTED),
					      SMC_WR_REG_MR_WAIT_TIME);
	if (atomic_dec_and_test(&link->wr_reg_refcnt))
		wake_up_all(&link->wr_reg_wait);
	if (!rc) {
		/* timeout - terminate link */
		smcr_link_down_cond_sched(link);
		return -EPIPE;
	}
	if (rc == -ERESTARTSYS)
		return -EINTR;
	switch (link->wr_reg_state) {
	case CONFIRMED:
		rc = 0;
		break;
	case FAILED:
		rc = -EIO;
		break;
	case POSTED:
		rc = -EPIPE;
		break;
	}
	return rc;
}

void smc_wr_tx_dismiss_slots(struct smc_link *link, u8 wr_tx_hdr_type,
			     smc_wr_tx_filter filter,
			     smc_wr_tx_dismisser dismisser,
			     unsigned long data)
{
	struct smc_wr_tx_pend_priv *tx_pend;
	struct smc_wr_rx_hdr *wr_tx;
	int i;

	for_each_set_bit(i, link->wr_tx_mask, link->wr_tx_cnt) {
		wr_tx = (struct smc_wr_rx_hdr *)&link->wr_tx_bufs[i];
		if (wr_tx->type != wr_tx_hdr_type)
			continue;
		tx_pend = &link->wr_tx_pends[i].priv;
		if (filter(tx_pend, data))
			dismisser(tx_pend);
	}
}

/****************************** receive queue ********************************/

int smc_wr_rx_register_handler(struct smc_wr_rx_handler *handler)
{
	struct smc_wr_rx_handler *h_iter;
	int rc = 0;

	spin_lock(&smc_wr_rx_hash_lock);
	hash_for_each_possible(smc_wr_rx_hash, h_iter, list, handler->type) {
		if (h_iter->type == handler->type) {
			rc = -EEXIST;
			goto out_unlock;
		}
	}
	hash_add(smc_wr_rx_hash, &handler->list, handler->type);
out_unlock:
	spin_unlock(&smc_wr_rx_hash_lock);
	return rc;
}
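
/* Illustrative registration of a receive handler (a sketch; the real
 * handlers are registered by the LLC and CDC code, and MY_MSG_TYPE/my_rx_fn
 * below are placeholders):
 *
 *	static struct smc_wr_rx_handler my_rx_handler = {
 *		.handler	= my_rx_fn,
 *		.type		= MY_MSG_TYPE,
 *	};
 *
 *	rc = smc_wr_rx_register_handler(&my_rx_handler);
 *
 * Registration must be complete before the first receive WR can complete,
 * see the comment on smc_wr_rx_demultiplex() below.
 */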
/* Demultiplex a received work request based on the message type to its handler.
 * Relies on smc_wr_rx_hash having been completely filled before any IB WRs are
 * posted, and not being modified anymore afterwards, so we don't need to lock
 * it.
 */
static inline void smc_wr_rx_demultiplex(struct ib_wc *wc)
{
	struct smc_link *link = (struct smc_link *)wc->qp->qp_context;
	struct smc_wr_rx_handler *handler;
	struct smc_wr_rx_hdr *wr_rx;
	u64 temp_wr_id;
	u32 index;

	if (wc->byte_len < sizeof(*wr_rx))
		return; /* short message */
	temp_wr_id = wc->wr_id;
	index = do_div(temp_wr_id, link->wr_rx_cnt);
	wr_rx = (struct smc_wr_rx_hdr *)&link->wr_rx_bufs[index];
	hash_for_each_possible(smc_wr_rx_hash, handler, list, wr_rx->type) {
		if (handler->type == wr_rx->type)
			handler->handler(wc, wr_rx);
	}
}

static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num)
{
	struct smc_link *link;
	int i;

	for (i = 0; i < num; i++) {
		link = wc[i].qp->qp_context;
		if (wc[i].status == IB_WC_SUCCESS) {
			link->wr_rx_tstamp = jiffies;
			smc_wr_rx_demultiplex(&wc[i]);
			smc_wr_rx_post(link); /* refill WR RX */
		} else {
			/* handle status errors */
			switch (wc[i].status) {
			case IB_WC_RETRY_EXC_ERR:
			case IB_WC_RNR_RETRY_EXC_ERR:
			case IB_WC_WR_FLUSH_ERR:
				smcr_link_down_cond_sched(link);
				break;
			default:
				smc_wr_rx_post(link); /* refill WR RX */
				break;
			}
		}
	}
}

static void smc_wr_rx_tasklet_fn(struct tasklet_struct *t)
{
	struct smc_ib_device *dev = from_tasklet(dev, t, recv_tasklet);
	struct ib_wc wc[SMC_WR_MAX_POLL_CQE];
	int polled = 0;
	int rc;

again:
	polled++;
	do {
		memset(&wc, 0, sizeof(wc));
		rc = ib_poll_cq(dev->roce_cq_recv, SMC_WR_MAX_POLL_CQE, wc);
		if (polled == 1) {
			ib_req_notify_cq(dev->roce_cq_recv,
					 IB_CQ_SOLICITED_MASK |
					 IB_CQ_REPORT_MISSED_EVENTS);
		}
		if (!rc)
			break;
		smc_wr_rx_process_cqes(&wc[0], rc);
	} while (rc > 0);
	if (polled == 1)
		goto again;
}

void smc_wr_rx_cq_handler(struct ib_cq *ib_cq, void *cq_context)
{
	struct smc_ib_device *dev = (struct smc_ib_device *)cq_context;

	tasklet_schedule(&dev->recv_tasklet);
}

int smc_wr_rx_post_init(struct smc_link *link)
{
	u32 i;
	int rc = 0;

	for (i = 0; i < link->wr_rx_cnt; i++)
		rc = smc_wr_rx_post(link);
	return rc;
}

/***************************** init, exit, misc ******************************/

void smc_wr_remember_qp_attr(struct smc_link *lnk)
{
	struct ib_qp_attr *attr = &lnk->qp_attr;
	struct ib_qp_init_attr init_attr;

	memset(attr, 0, sizeof(*attr));
	memset(&init_attr, 0, sizeof(init_attr));
	ib_query_qp(lnk->roce_qp, attr,
		    IB_QP_STATE |
		    IB_QP_CUR_STATE |
		    IB_QP_PKEY_INDEX |
		    IB_QP_PORT |
		    IB_QP_QKEY |
		    IB_QP_AV |
		    IB_QP_PATH_MTU |
		    IB_QP_TIMEOUT |
		    IB_QP_RETRY_CNT |
		    IB_QP_RNR_RETRY |
		    IB_QP_RQ_PSN |
		    IB_QP_ALT_PATH |
		    IB_QP_MIN_RNR_TIMER |
		    IB_QP_SQ_PSN |
		    IB_QP_PATH_MIG_STATE |
		    IB_QP_CAP |
		    IB_QP_DEST_QPN,
		    &init_attr);

	lnk->wr_tx_cnt = min_t(size_t, SMC_WR_BUF_CNT,
			       lnk->qp_attr.cap.max_send_wr);
	lnk->wr_rx_cnt = min_t(size_t, SMC_WR_BUF_CNT * 3,
			       lnk->qp_attr.cap.max_recv_wr);
}
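
/* The send ring holds at most SMC_WR_BUF_CNT work requests and the receive
 * ring three times as many, both further limited by the queue sizes the QP
 * actually supports (cap.max_send_wr/max_recv_wr above).
 */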
static void smc_wr_init_sge(struct smc_link *lnk)
{
	int sges_per_buf = (lnk->lgr->smc_version == SMC_V2) ? 2 : 1;
	u32 i;

	for (i = 0; i < lnk->wr_tx_cnt; i++) {
		lnk->wr_tx_sges[i].addr =
			lnk->wr_tx_dma_addr + i * SMC_WR_BUF_SIZE;
		lnk->wr_tx_sges[i].length = SMC_WR_TX_SIZE;
		lnk->wr_tx_sges[i].lkey = lnk->roce_pd->local_dma_lkey;
		lnk->wr_tx_rdma_sges[i].tx_rdma_sge[0].wr_tx_rdma_sge[0].lkey =
			lnk->roce_pd->local_dma_lkey;
		lnk->wr_tx_rdma_sges[i].tx_rdma_sge[0].wr_tx_rdma_sge[1].lkey =
			lnk->roce_pd->local_dma_lkey;
		lnk->wr_tx_rdma_sges[i].tx_rdma_sge[1].wr_tx_rdma_sge[0].lkey =
			lnk->roce_pd->local_dma_lkey;
		lnk->wr_tx_rdma_sges[i].tx_rdma_sge[1].wr_tx_rdma_sge[1].lkey =
			lnk->roce_pd->local_dma_lkey;
		lnk->wr_tx_ibs[i].next = NULL;
		lnk->wr_tx_ibs[i].sg_list = &lnk->wr_tx_sges[i];
		lnk->wr_tx_ibs[i].num_sge = 1;
		lnk->wr_tx_ibs[i].opcode = IB_WR_SEND;
		lnk->wr_tx_ibs[i].send_flags =
			IB_SEND_SIGNALED | IB_SEND_SOLICITED;
		lnk->wr_tx_rdmas[i].wr_tx_rdma[0].wr.opcode = IB_WR_RDMA_WRITE;
		lnk->wr_tx_rdmas[i].wr_tx_rdma[1].wr.opcode = IB_WR_RDMA_WRITE;
		lnk->wr_tx_rdmas[i].wr_tx_rdma[0].wr.sg_list =
			lnk->wr_tx_rdma_sges[i].tx_rdma_sge[0].wr_tx_rdma_sge;
		lnk->wr_tx_rdmas[i].wr_tx_rdma[1].wr.sg_list =
			lnk->wr_tx_rdma_sges[i].tx_rdma_sge[1].wr_tx_rdma_sge;
	}

	if (lnk->lgr->smc_version == SMC_V2) {
		lnk->wr_tx_v2_sge->addr = lnk->wr_tx_v2_dma_addr;
		lnk->wr_tx_v2_sge->length = SMC_WR_BUF_V2_SIZE;
		lnk->wr_tx_v2_sge->lkey = lnk->roce_pd->local_dma_lkey;

		lnk->wr_tx_v2_ib->next = NULL;
		lnk->wr_tx_v2_ib->sg_list = lnk->wr_tx_v2_sge;
		lnk->wr_tx_v2_ib->num_sge = 1;
		lnk->wr_tx_v2_ib->opcode = IB_WR_SEND;
		lnk->wr_tx_v2_ib->send_flags =
			IB_SEND_SIGNALED | IB_SEND_SOLICITED;
	}

	/* With SMC-Rv2 there can be messages larger than SMC_WR_TX_SIZE.
	 * Each ib_recv_wr gets 2 sges, the second one being a spillover
	 * buffer that is shared by all receive WRs. When a larger message
	 * arrives, the content of the first small sge is copied to the
	 * beginning of the larger spillover buffer, allowing easy data
	 * mapping.
	 */
	for (i = 0; i < lnk->wr_rx_cnt; i++) {
		int x = i * sges_per_buf;

		lnk->wr_rx_sges[x].addr =
			lnk->wr_rx_dma_addr + i * SMC_WR_BUF_SIZE;
		lnk->wr_rx_sges[x].length = SMC_WR_TX_SIZE;
		lnk->wr_rx_sges[x].lkey = lnk->roce_pd->local_dma_lkey;
		if (lnk->lgr->smc_version == SMC_V2) {
			lnk->wr_rx_sges[x + 1].addr =
				lnk->wr_rx_v2_dma_addr + SMC_WR_TX_SIZE;
			lnk->wr_rx_sges[x + 1].length =
				SMC_WR_BUF_V2_SIZE - SMC_WR_TX_SIZE;
			lnk->wr_rx_sges[x + 1].lkey =
				lnk->roce_pd->local_dma_lkey;
		}
		lnk->wr_rx_ibs[i].next = NULL;
		lnk->wr_rx_ibs[i].sg_list = &lnk->wr_rx_sges[x];
		lnk->wr_rx_ibs[i].num_sge = sges_per_buf;
	}
	lnk->wr_reg.wr.next = NULL;
	lnk->wr_reg.wr.num_sge = 0;
	lnk->wr_reg.wr.send_flags = IB_SEND_SIGNALED;
	lnk->wr_reg.wr.opcode = IB_WR_REG_MR;
	lnk->wr_reg.access = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE;
}
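
/* Tear down a link's WR resources: first wake up any waiters and drain
 * pending sends and outstanding wr_reg/wr_tx references, then unmap the DMA
 * regions, so no work request can still reference the buffers being freed.
 */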
void smc_wr_free_link(struct smc_link *lnk)
{
	struct ib_device *ibdev;

	if (!lnk->smcibdev)
		return;
	ibdev = lnk->smcibdev->ibdev;

	smc_wr_wakeup_reg_wait(lnk);
	smc_wr_wakeup_tx_wait(lnk);

	if (smc_wr_tx_wait_no_pending_sends(lnk))
		memset(lnk->wr_tx_mask, 0,
		       BITS_TO_LONGS(SMC_WR_BUF_CNT) *
						sizeof(*lnk->wr_tx_mask));
	wait_event(lnk->wr_reg_wait, (!atomic_read(&lnk->wr_reg_refcnt)));
	wait_event(lnk->wr_tx_wait, (!atomic_read(&lnk->wr_tx_refcnt)));

	if (lnk->wr_rx_dma_addr) {
		ib_dma_unmap_single(ibdev, lnk->wr_rx_dma_addr,
				    SMC_WR_BUF_SIZE * lnk->wr_rx_cnt,
				    DMA_FROM_DEVICE);
		lnk->wr_rx_dma_addr = 0;
	}
	if (lnk->wr_rx_v2_dma_addr) {
		ib_dma_unmap_single(ibdev, lnk->wr_rx_v2_dma_addr,
				    SMC_WR_BUF_V2_SIZE,
				    DMA_FROM_DEVICE);
		lnk->wr_rx_v2_dma_addr = 0;
	}
	if (lnk->wr_tx_dma_addr) {
		ib_dma_unmap_single(ibdev, lnk->wr_tx_dma_addr,
				    SMC_WR_BUF_SIZE * lnk->wr_tx_cnt,
				    DMA_TO_DEVICE);
		lnk->wr_tx_dma_addr = 0;
	}
	if (lnk->wr_tx_v2_dma_addr) {
		ib_dma_unmap_single(ibdev, lnk->wr_tx_v2_dma_addr,
				    SMC_WR_BUF_V2_SIZE,
				    DMA_TO_DEVICE);
		lnk->wr_tx_v2_dma_addr = 0;
	}
}

void smc_wr_free_lgr_mem(struct smc_link_group *lgr)
{
	if (lgr->smc_version < SMC_V2)
		return;

	kfree(lgr->wr_rx_buf_v2);
	lgr->wr_rx_buf_v2 = NULL;
	kfree(lgr->wr_tx_buf_v2);
	lgr->wr_tx_buf_v2 = NULL;
}

void smc_wr_free_link_mem(struct smc_link *lnk)
{
	kfree(lnk->wr_tx_v2_ib);
	lnk->wr_tx_v2_ib = NULL;
	kfree(lnk->wr_tx_v2_sge);
	lnk->wr_tx_v2_sge = NULL;
	kfree(lnk->wr_tx_v2_pend);
	lnk->wr_tx_v2_pend = NULL;
	kfree(lnk->wr_tx_compl);
	lnk->wr_tx_compl = NULL;
	kfree(lnk->wr_tx_pends);
	lnk->wr_tx_pends = NULL;
	kfree(lnk->wr_tx_mask);
	lnk->wr_tx_mask = NULL;
	kfree(lnk->wr_tx_sges);
	lnk->wr_tx_sges = NULL;
	kfree(lnk->wr_tx_rdma_sges);
	lnk->wr_tx_rdma_sges = NULL;
	kfree(lnk->wr_rx_sges);
	lnk->wr_rx_sges = NULL;
	kfree(lnk->wr_tx_rdmas);
	lnk->wr_tx_rdmas = NULL;
	kfree(lnk->wr_rx_ibs);
	lnk->wr_rx_ibs = NULL;
	kfree(lnk->wr_tx_ibs);
	lnk->wr_tx_ibs = NULL;
	kfree(lnk->wr_tx_bufs);
	lnk->wr_tx_bufs = NULL;
	kfree(lnk->wr_rx_bufs);
	lnk->wr_rx_bufs = NULL;
}
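
/* The v2 send/receive buffers below are allocated once per link group and
 * shared by its links, while smc_wr_alloc_link_mem() allocates the per-link
 * arrays: SMC_WR_BUF_CNT send slots and three times as many receive slots.
 */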
int smc_wr_alloc_lgr_mem(struct smc_link_group *lgr)
{
	if (lgr->smc_version < SMC_V2)
		return 0;

	lgr->wr_rx_buf_v2 = kzalloc(SMC_WR_BUF_V2_SIZE, GFP_KERNEL);
	if (!lgr->wr_rx_buf_v2)
		return -ENOMEM;
	lgr->wr_tx_buf_v2 = kzalloc(SMC_WR_BUF_V2_SIZE, GFP_KERNEL);
	if (!lgr->wr_tx_buf_v2) {
		kfree(lgr->wr_rx_buf_v2);
		return -ENOMEM;
	}
	return 0;
}

int smc_wr_alloc_link_mem(struct smc_link *link)
{
	int sges_per_buf = link->lgr->smc_version == SMC_V2 ? 2 : 1;

	/* allocate link related memory */
	link->wr_tx_bufs = kcalloc(SMC_WR_BUF_CNT, SMC_WR_BUF_SIZE, GFP_KERNEL);
	if (!link->wr_tx_bufs)
		goto no_mem;
	link->wr_rx_bufs = kcalloc(SMC_WR_BUF_CNT * 3, SMC_WR_BUF_SIZE,
				   GFP_KERNEL);
	if (!link->wr_rx_bufs)
		goto no_mem_wr_tx_bufs;
	link->wr_tx_ibs = kcalloc(SMC_WR_BUF_CNT, sizeof(link->wr_tx_ibs[0]),
				  GFP_KERNEL);
	if (!link->wr_tx_ibs)
		goto no_mem_wr_rx_bufs;
	link->wr_rx_ibs = kcalloc(SMC_WR_BUF_CNT * 3,
				  sizeof(link->wr_rx_ibs[0]),
				  GFP_KERNEL);
	if (!link->wr_rx_ibs)
		goto no_mem_wr_tx_ibs;
	link->wr_tx_rdmas = kcalloc(SMC_WR_BUF_CNT,
				    sizeof(link->wr_tx_rdmas[0]),
				    GFP_KERNEL);
	if (!link->wr_tx_rdmas)
		goto no_mem_wr_rx_ibs;
	link->wr_tx_rdma_sges = kcalloc(SMC_WR_BUF_CNT,
					sizeof(link->wr_tx_rdma_sges[0]),
					GFP_KERNEL);
	if (!link->wr_tx_rdma_sges)
		goto no_mem_wr_tx_rdmas;
	link->wr_tx_sges = kcalloc(SMC_WR_BUF_CNT, sizeof(link->wr_tx_sges[0]),
				   GFP_KERNEL);
	if (!link->wr_tx_sges)
		goto no_mem_wr_tx_rdma_sges;
	link->wr_rx_sges = kcalloc(SMC_WR_BUF_CNT * 3,
				   sizeof(link->wr_rx_sges[0]) * sges_per_buf,
				   GFP_KERNEL);
	if (!link->wr_rx_sges)
		goto no_mem_wr_tx_sges;
	link->wr_tx_mask = kcalloc(BITS_TO_LONGS(SMC_WR_BUF_CNT),
				   sizeof(*link->wr_tx_mask),
				   GFP_KERNEL);
	if (!link->wr_tx_mask)
		goto no_mem_wr_rx_sges;
	link->wr_tx_pends = kcalloc(SMC_WR_BUF_CNT,
				    sizeof(link->wr_tx_pends[0]),
				    GFP_KERNEL);
	if (!link->wr_tx_pends)
		goto no_mem_wr_tx_mask;
	link->wr_tx_compl = kcalloc(SMC_WR_BUF_CNT,
				    sizeof(link->wr_tx_compl[0]),
				    GFP_KERNEL);
	if (!link->wr_tx_compl)
		goto no_mem_wr_tx_pends;

	if (link->lgr->smc_version == SMC_V2) {
		link->wr_tx_v2_ib = kzalloc(sizeof(*link->wr_tx_v2_ib),
					    GFP_KERNEL);
		if (!link->wr_tx_v2_ib)
			goto no_mem_tx_compl;
		link->wr_tx_v2_sge = kzalloc(sizeof(*link->wr_tx_v2_sge),
					     GFP_KERNEL);
		if (!link->wr_tx_v2_sge)
			goto no_mem_v2_ib;
		link->wr_tx_v2_pend = kzalloc(sizeof(*link->wr_tx_v2_pend),
					      GFP_KERNEL);
		if (!link->wr_tx_v2_pend)
			goto no_mem_v2_sge;
	}
	return 0;

no_mem_v2_sge:
	kfree(link->wr_tx_v2_sge);
no_mem_v2_ib:
	kfree(link->wr_tx_v2_ib);
no_mem_tx_compl:
	kfree(link->wr_tx_compl);
no_mem_wr_tx_pends:
	kfree(link->wr_tx_pends);
no_mem_wr_tx_mask:
	kfree(link->wr_tx_mask);
no_mem_wr_rx_sges:
	kfree(link->wr_rx_sges);
no_mem_wr_tx_sges:
	kfree(link->wr_tx_sges);
no_mem_wr_tx_rdma_sges:
	kfree(link->wr_tx_rdma_sges);
no_mem_wr_tx_rdmas:
	kfree(link->wr_tx_rdmas);
no_mem_wr_rx_ibs:
	kfree(link->wr_rx_ibs);
no_mem_wr_tx_ibs:
	kfree(link->wr_tx_ibs);
no_mem_wr_rx_bufs:
	kfree(link->wr_rx_bufs);
no_mem_wr_tx_bufs:
	kfree(link->wr_tx_bufs);
no_mem:
	return -ENOMEM;
}

void smc_wr_remove_dev(struct smc_ib_device *smcibdev)
{
	tasklet_kill(&smcibdev->recv_tasklet);
	tasklet_kill(&smcibdev->send_tasklet);
}

void smc_wr_add_dev(struct smc_ib_device *smcibdev)
{
	tasklet_setup(&smcibdev->recv_tasklet, smc_wr_rx_tasklet_fn);
	tasklet_setup(&smcibdev->send_tasklet, smc_wr_tx_tasklet_fn);
}
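
/* Rough bring-up order as seen from this file (a sketch; the precise call
 * sites live in the link and link group setup code outside this file):
 *
 *	smc_wr_add_dev(smcibdev);	(once per RoCE device)
 *	smc_wr_alloc_lgr_mem(lgr);	(v2 buffers, once per link group)
 *	smc_wr_alloc_link_mem(lnk);	(per-link arrays)
 *	smc_wr_create_link(lnk);	(DMA mapping and SGE/WR setup)
 *	smc_wr_rx_post_init(lnk);	(pre-post the receive ring)
 *
 * Teardown uses smc_wr_free_link(), smc_wr_free_link_mem(),
 * smc_wr_free_lgr_mem() and smc_wr_remove_dev() in roughly reverse order.
 */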
int smc_wr_create_link(struct smc_link *lnk)
{
	struct ib_device *ibdev = lnk->smcibdev->ibdev;
	int rc = 0;

	smc_wr_tx_set_wr_id(&lnk->wr_tx_id, 0);
	lnk->wr_rx_id = 0;
	lnk->wr_rx_dma_addr = ib_dma_map_single(
		ibdev, lnk->wr_rx_bufs, SMC_WR_BUF_SIZE * lnk->wr_rx_cnt,
		DMA_FROM_DEVICE);
	if (ib_dma_mapping_error(ibdev, lnk->wr_rx_dma_addr)) {
		lnk->wr_rx_dma_addr = 0;
		rc = -EIO;
		goto out;
	}
	if (lnk->lgr->smc_version == SMC_V2) {
		lnk->wr_rx_v2_dma_addr = ib_dma_map_single(ibdev,
			lnk->lgr->wr_rx_buf_v2, SMC_WR_BUF_V2_SIZE,
			DMA_FROM_DEVICE);
		if (ib_dma_mapping_error(ibdev, lnk->wr_rx_v2_dma_addr)) {
			lnk->wr_rx_v2_dma_addr = 0;
			rc = -EIO;
			goto dma_unmap;
		}
		lnk->wr_tx_v2_dma_addr = ib_dma_map_single(ibdev,
			lnk->lgr->wr_tx_buf_v2, SMC_WR_BUF_V2_SIZE,
			DMA_TO_DEVICE);
		if (ib_dma_mapping_error(ibdev, lnk->wr_tx_v2_dma_addr)) {
			lnk->wr_tx_v2_dma_addr = 0;
			rc = -EIO;
			goto dma_unmap;
		}
	}
	lnk->wr_tx_dma_addr = ib_dma_map_single(
		ibdev, lnk->wr_tx_bufs, SMC_WR_BUF_SIZE * lnk->wr_tx_cnt,
		DMA_TO_DEVICE);
	if (ib_dma_mapping_error(ibdev, lnk->wr_tx_dma_addr)) {
		rc = -EIO;
		goto dma_unmap;
	}
	smc_wr_init_sge(lnk);
	memset(lnk->wr_tx_mask, 0,
	       BITS_TO_LONGS(SMC_WR_BUF_CNT) * sizeof(*lnk->wr_tx_mask));
	init_waitqueue_head(&lnk->wr_tx_wait);
	atomic_set(&lnk->wr_tx_refcnt, 0);
	init_waitqueue_head(&lnk->wr_reg_wait);
	atomic_set(&lnk->wr_reg_refcnt, 0);
	return rc;

dma_unmap:
	if (lnk->wr_rx_v2_dma_addr) {
		ib_dma_unmap_single(ibdev, lnk->wr_rx_v2_dma_addr,
				    SMC_WR_BUF_V2_SIZE,
				    DMA_FROM_DEVICE);
		lnk->wr_rx_v2_dma_addr = 0;
	}
	if (lnk->wr_tx_v2_dma_addr) {
		ib_dma_unmap_single(ibdev, lnk->wr_tx_v2_dma_addr,
				    SMC_WR_BUF_V2_SIZE,
				    DMA_TO_DEVICE);
		lnk->wr_tx_v2_dma_addr = 0;
	}
	ib_dma_unmap_single(ibdev, lnk->wr_rx_dma_addr,
			    SMC_WR_BUF_SIZE * lnk->wr_rx_cnt,
			    DMA_FROM_DEVICE);
	lnk->wr_rx_dma_addr = 0;
out:
	return rc;
}