// SPDX-License-Identifier: GPL-2.0
/*
 * Shared Memory Communications over RDMA (SMC-R) and RoCE
 *
 * Work Requests exploiting Infiniband API
 *
 * Work requests (WR) of type ib_post_send or ib_post_recv are submitted
 * to the RC SQ or RC RQ, respectively (reliably connected send/receive
 * queue), and become work queue entries (WQEs).
 * While an SQ WR/WQE is pending, we track it until transmission completion.
 * Through a send or receive completion queue (CQ), we get completion queue
 * entries (CQEs) [aka work completions (WCs)].
 * Since the CQ callback is called from IRQ context, we split work by using
 * bottom halves implemented by tasklets.
 *
 * SMC uses this to exchange LLC (link layer control)
 * and CDC (connection data control) messages.
 *
 * Copyright IBM Corp. 2016
 *
 * Author(s): Steffen Maier <maier@linux.vnet.ibm.com>
 */

#include <linux/atomic.h>
#include <linux/hashtable.h>
#include <linux/wait.h>
#include <rdma/ib_verbs.h>
#include <asm/div64.h>

#include "smc.h"
#include "smc_wr.h"

#define SMC_WR_MAX_POLL_CQE 10	/* max. # of compl. queue elements in 1 poll */

#define SMC_WR_RX_HASH_BITS 4
static DEFINE_HASHTABLE(smc_wr_rx_hash, SMC_WR_RX_HASH_BITS);
static DEFINE_SPINLOCK(smc_wr_rx_hash_lock);

struct smc_wr_tx_pend {	/* control data for a pending send request */
	u64			wr_id;		/* work request id sent */
	smc_wr_tx_handler	handler;
	enum ib_wc_status	wc_status;	/* CQE status */
	struct smc_link		*link;
	u32			idx;
	struct smc_wr_tx_pend_priv priv;
	u8			compl_requested;
};

/******************************** send queue *********************************/

/*------------------------------- completion --------------------------------*/

/* returns true if at least one tx work request is pending on the given link */
static inline bool smc_wr_is_tx_pend(struct smc_link *link)
{
	return !bitmap_empty(link->wr_tx_mask, link->wr_tx_cnt);
}

/* wait till all pending tx work requests on the given link are completed */
int smc_wr_tx_wait_no_pending_sends(struct smc_link *link)
{
	if (wait_event_timeout(link->wr_tx_wait, !smc_wr_is_tx_pend(link),
			       SMC_WR_TX_WAIT_PENDING_TIME))
		return 0;
	else /* timeout */
		return -EPIPE;
}

static inline int smc_wr_tx_find_pending_index(struct smc_link *link, u64 wr_id)
{
	u32 i;

	for (i = 0; i < link->wr_tx_cnt; i++) {
		if (link->wr_tx_pends[i].wr_id == wr_id)
			return i;
	}
	return link->wr_tx_cnt;
}

static inline void smc_wr_tx_process_cqe(struct ib_wc *wc)
{
	struct smc_wr_tx_pend pnd_snd;
	struct smc_link *link;
	u32 pnd_snd_idx;
	int i;

	link = wc->qp->qp_context;

	if (wc->opcode == IB_WC_REG_MR) {
		if (wc->status)
			link->wr_reg_state = FAILED;
		else
			link->wr_reg_state = CONFIRMED;
		smc_wr_wakeup_reg_wait(link);
		return;
	}

	pnd_snd_idx = smc_wr_tx_find_pending_index(link, wc->wr_id);
	if (pnd_snd_idx == link->wr_tx_cnt) {
		if (link->lgr->smc_version != SMC_V2 ||
		    link->wr_tx_v2_pend->wr_id != wc->wr_id)
			return;
		link->wr_tx_v2_pend->wc_status = wc->status;
		memcpy(&pnd_snd, link->wr_tx_v2_pend, sizeof(pnd_snd));
		/* clear the full struct smc_wr_tx_pend including .priv */
		memset(link->wr_tx_v2_pend, 0,
		       sizeof(*link->wr_tx_v2_pend));
		memset(link->lgr->wr_tx_buf_v2, 0,
		       sizeof(*link->lgr->wr_tx_buf_v2));
	} else {
		link->wr_tx_pends[pnd_snd_idx].wc_status = wc->status;
		if (link->wr_tx_pends[pnd_snd_idx].compl_requested)
			complete(&link->wr_tx_compl[pnd_snd_idx]);
		memcpy(&pnd_snd, &link->wr_tx_pends[pnd_snd_idx],
		       sizeof(pnd_snd));
		/* clear the full struct smc_wr_tx_pend including .priv */
		memset(&link->wr_tx_pends[pnd_snd_idx], 0,
		       sizeof(link->wr_tx_pends[pnd_snd_idx]));
		memset(&link->wr_tx_bufs[pnd_snd_idx], 0,
		       sizeof(link->wr_tx_bufs[pnd_snd_idx]));
		if (!test_and_clear_bit(pnd_snd_idx, link->wr_tx_mask))
			return;
	}

	if (wc->status) {
		for_each_set_bit(i, link->wr_tx_mask, link->wr_tx_cnt) {
			/* clear full struct smc_wr_tx_pend including .priv */
			memset(&link->wr_tx_pends[i], 0,
			       sizeof(link->wr_tx_pends[i]));
			memset(&link->wr_tx_bufs[i], 0,
			       sizeof(link->wr_tx_bufs[i]));
			clear_bit(i, link->wr_tx_mask);
		}
		if (link->lgr->smc_version == SMC_V2) {
			memset(link->wr_tx_v2_pend, 0,
			       sizeof(*link->wr_tx_v2_pend));
			memset(link->lgr->wr_tx_buf_v2, 0,
			       sizeof(*link->lgr->wr_tx_buf_v2));
		}
		/* terminate link */
		smcr_link_down_cond_sched(link);
	}
	if (pnd_snd.handler)
		pnd_snd.handler(&pnd_snd.priv, link, wc->status);
	wake_up(&link->wr_tx_wait);
}
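
/* Bottom half for send completions: polls the send CQ in batches of up to
 * SMC_WR_MAX_POLL_CQE entries. The CQ is re-armed on the first pass and,
 * after arming, polled one more time ("goto again") so that completions
 * racing with ib_req_notify_cq() are not missed.
 */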

static void smc_wr_tx_tasklet_fn(struct tasklet_struct *t)
{
	struct smc_ib_device *dev = from_tasklet(dev, t, send_tasklet);
	struct ib_wc wc[SMC_WR_MAX_POLL_CQE];
	int i = 0, rc;
	int polled = 0;

again:
	polled++;
	do {
		memset(&wc, 0, sizeof(wc));
		rc = ib_poll_cq(dev->roce_cq_send, SMC_WR_MAX_POLL_CQE, wc);
		if (polled == 1) {
			ib_req_notify_cq(dev->roce_cq_send,
					 IB_CQ_NEXT_COMP |
					 IB_CQ_REPORT_MISSED_EVENTS);
		}
		if (!rc)
			break;
		for (i = 0; i < rc; i++)
			smc_wr_tx_process_cqe(&wc[i]);
	} while (rc > 0);
	if (polled == 1)
		goto again;
}

void smc_wr_tx_cq_handler(struct ib_cq *ib_cq, void *cq_context)
{
	struct smc_ib_device *dev = (struct smc_ib_device *)cq_context;

	tasklet_schedule(&dev->send_tasklet);
}

/*---------------------------- request submission ---------------------------*/

static inline int smc_wr_tx_get_free_slot_index(struct smc_link *link, u32 *idx)
{
	*idx = link->wr_tx_cnt;
	if (!smc_link_usable(link))
		return -ENOLINK;
	for_each_clear_bit(*idx, link->wr_tx_mask, link->wr_tx_cnt) {
		if (!test_and_set_bit(*idx, link->wr_tx_mask))
			return 0;
	}
	*idx = link->wr_tx_cnt;
	return -EBUSY;
}

/**
 * smc_wr_tx_get_free_slot() - returns buffer for message assembly,
 *	and sets info for pending transmit tracking
 * @link: Pointer to smc_link used to later send the message.
 * @handler: Send completion handler function pointer.
 * @wr_buf: Out value returns pointer to message buffer.
 * @wr_rdma_buf: Out value returns pointer to rdma work request.
 * @wr_pend_priv: Out value returns pointer serving as handler context.
 *
 * Return: 0 on success, or -errno on error.
 */
int smc_wr_tx_get_free_slot(struct smc_link *link,
			    smc_wr_tx_handler handler,
			    struct smc_wr_buf **wr_buf,
			    struct smc_rdma_wr **wr_rdma_buf,
			    struct smc_wr_tx_pend_priv **wr_pend_priv)
{
	struct smc_link_group *lgr = smc_get_lgr(link);
	struct smc_wr_tx_pend *wr_pend;
	u32 idx = link->wr_tx_cnt;
	struct ib_send_wr *wr_ib;
	u64 wr_id;
	int rc;

	*wr_buf = NULL;
	*wr_pend_priv = NULL;
	if (in_softirq() || lgr->terminating) {
		rc = smc_wr_tx_get_free_slot_index(link, &idx);
		if (rc)
			return rc;
	} else {
		rc = wait_event_interruptible_timeout(
			link->wr_tx_wait,
			!smc_link_usable(link) ||
			lgr->terminating ||
			(smc_wr_tx_get_free_slot_index(link, &idx) != -EBUSY),
			SMC_WR_TX_WAIT_FREE_SLOT_TIME);
		if (!rc) {
			/* timeout - terminate link */
			smcr_link_down_cond_sched(link);
			return -EPIPE;
		}
		if (idx == link->wr_tx_cnt)
			return -EPIPE;
	}
	wr_id = smc_wr_tx_get_next_wr_id(link);
	wr_pend = &link->wr_tx_pends[idx];
	wr_pend->wr_id = wr_id;
	wr_pend->handler = handler;
	wr_pend->link = link;
	wr_pend->idx = idx;
	wr_ib = &link->wr_tx_ibs[idx];
	wr_ib->wr_id = wr_id;
	*wr_buf = &link->wr_tx_bufs[idx];
	if (wr_rdma_buf)
		*wr_rdma_buf = &link->wr_tx_rdmas[idx];
	*wr_pend_priv = &wr_pend->priv;
	return 0;
}
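
/* Typical send-side calling sequence (illustrative sketch only; the handler
 * name and the assembly step are placeholders, not actual caller code):
 *
 *	struct smc_wr_tx_pend_priv *pend;
 *	struct smc_wr_buf *wr_buf;
 *	int rc;
 *
 *	rc = smc_wr_tx_get_free_slot(link, my_tx_handler, &wr_buf,
 *				     NULL, &pend);
 *	if (rc)
 *		return rc;
 *	(... assemble the message in *wr_buf ...)
 *	rc = smc_wr_tx_send(link, pend);
 *
 * If message assembly fails, the slot must be returned with
 * smc_wr_tx_put_slot() instead of being sent.
 */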

int smc_wr_tx_get_v2_slot(struct smc_link *link,
			  smc_wr_tx_handler handler,
			  struct smc_wr_v2_buf **wr_buf,
			  struct smc_wr_tx_pend_priv **wr_pend_priv)
{
	struct smc_wr_tx_pend *wr_pend;
	struct ib_send_wr *wr_ib;
	u64 wr_id;

	if (link->wr_tx_v2_pend->idx == link->wr_tx_cnt)
		return -EBUSY;

	*wr_buf = NULL;
	*wr_pend_priv = NULL;
	wr_id = smc_wr_tx_get_next_wr_id(link);
	wr_pend = link->wr_tx_v2_pend;
	wr_pend->wr_id = wr_id;
	wr_pend->handler = handler;
	wr_pend->link = link;
	wr_pend->idx = link->wr_tx_cnt;
	wr_ib = link->wr_tx_v2_ib;
	wr_ib->wr_id = wr_id;
	*wr_buf = link->lgr->wr_tx_buf_v2;
	*wr_pend_priv = &wr_pend->priv;
	return 0;
}

int smc_wr_tx_put_slot(struct smc_link *link,
		       struct smc_wr_tx_pend_priv *wr_pend_priv)
{
	struct smc_wr_tx_pend *pend;

	pend = container_of(wr_pend_priv, struct smc_wr_tx_pend, priv);
	if (pend->idx < link->wr_tx_cnt) {
		u32 idx = pend->idx;

		/* clear the full struct smc_wr_tx_pend including .priv */
		memset(&link->wr_tx_pends[idx], 0,
		       sizeof(link->wr_tx_pends[idx]));
		memset(&link->wr_tx_bufs[idx], 0,
		       sizeof(link->wr_tx_bufs[idx]));
		test_and_clear_bit(idx, link->wr_tx_mask);
		wake_up(&link->wr_tx_wait);
		return 1;
	} else if (link->lgr->smc_version == SMC_V2 &&
		   pend->idx == link->wr_tx_cnt) {
		/* Large v2 buffer */
		memset(link->wr_tx_v2_pend, 0,
		       sizeof(*link->wr_tx_v2_pend));
		memset(link->lgr->wr_tx_buf_v2, 0,
		       sizeof(*link->lgr->wr_tx_buf_v2));
		return 1;
	}

	return 0;
}
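
/* SMC-Rv2 counterpart of the slot handling above (illustrative sketch only;
 * my_tx_handler and the assembly step are placeholders): there is a single
 * large v2 send buffer per link group, sent with an explicit length:
 *
 *	struct smc_wr_tx_pend_priv *pend;
 *	struct smc_wr_v2_buf *wr_buf;
 *	int rc;
 *
 *	rc = smc_wr_tx_get_v2_slot(link, my_tx_handler, &wr_buf, &pend);
 *	if (rc)
 *		return rc;
 *	(... assemble len bytes in *wr_buf ...)
 *	rc = smc_wr_tx_v2_send(link, pend, len);
 */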

/* Send prepared WR slot via ib_post_send.
 * @priv: pointer to smc_wr_tx_pend_priv identifying prepared message buffer
 */
int smc_wr_tx_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv)
{
	struct smc_wr_tx_pend *pend;
	int rc;

	ib_req_notify_cq(link->smcibdev->roce_cq_send,
			 IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
	pend = container_of(priv, struct smc_wr_tx_pend, priv);
	rc = ib_post_send(link->roce_qp, &link->wr_tx_ibs[pend->idx], NULL);
	if (rc) {
		smc_wr_tx_put_slot(link, priv);
		smcr_link_down_cond_sched(link);
	}
	return rc;
}

int smc_wr_tx_v2_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv,
		      int len)
{
	int rc;

	link->wr_tx_v2_ib->sg_list[0].length = len;
	ib_req_notify_cq(link->smcibdev->roce_cq_send,
			 IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
	rc = ib_post_send(link->roce_qp, link->wr_tx_v2_ib, NULL);
	if (rc) {
		smc_wr_tx_put_slot(link, priv);
		smcr_link_down_cond_sched(link);
	}
	return rc;
}

/* Send prepared WR slot via ib_post_send and wait for send completion
 * notification.
 * @priv: pointer to smc_wr_tx_pend_priv identifying prepared message buffer
 */
int smc_wr_tx_send_wait(struct smc_link *link, struct smc_wr_tx_pend_priv *priv,
			unsigned long timeout)
{
	struct smc_wr_tx_pend *pend;
	int rc;

	pend = container_of(priv, struct smc_wr_tx_pend, priv);
	pend->compl_requested = 1;
	init_completion(&link->wr_tx_compl[pend->idx]);

	rc = smc_wr_tx_send(link, priv);
	if (rc)
		return rc;
	/* wait for completion by smc_wr_tx_process_cqe() */
	rc = wait_for_completion_interruptible_timeout(
					&link->wr_tx_compl[pend->idx], timeout);
	if (rc <= 0)
		rc = -ENODATA;
	if (rc > 0)
		rc = 0;
	return rc;
}
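
/* Example use of the waiting variant (sketch only; the timeout value is a
 * placeholder): callers that need the send CQE before proceeding do:
 *
 *	rc = smc_wr_tx_send_wait(link, pend, 2 * HZ);
 *	if (rc)
 *		return rc;	(-ENODATA: no completion within the timeout)
 */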

/* Register a memory region and wait for result. */
int smc_wr_reg_send(struct smc_link *link, struct ib_mr *mr)
{
	int rc;

	ib_req_notify_cq(link->smcibdev->roce_cq_send,
			 IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
	link->wr_reg_state = POSTED;
	link->wr_reg.wr.wr_id = (u64)(uintptr_t)mr;
	link->wr_reg.mr = mr;
	link->wr_reg.key = mr->rkey;
	rc = ib_post_send(link->roce_qp, &link->wr_reg.wr, NULL);
	if (rc)
		return rc;

	atomic_inc(&link->wr_reg_refcnt);
	rc = wait_event_interruptible_timeout(link->wr_reg_wait,
					      (link->wr_reg_state != POSTED),
					      SMC_WR_REG_MR_WAIT_TIME);
	if (atomic_dec_and_test(&link->wr_reg_refcnt))
		wake_up_all(&link->wr_reg_wait);
	if (!rc) {
		/* timeout - terminate link */
		smcr_link_down_cond_sched(link);
		return -EPIPE;
	}
	if (rc == -ERESTARTSYS)
		return -EINTR;
	switch (link->wr_reg_state) {
	case CONFIRMED:
		rc = 0;
		break;
	case FAILED:
		rc = -EIO;
		break;
	case POSTED:
		rc = -EPIPE;
		break;
	}
	return rc;
}

void smc_wr_tx_dismiss_slots(struct smc_link *link, u8 wr_tx_hdr_type,
			     smc_wr_tx_filter filter,
			     smc_wr_tx_dismisser dismisser,
			     unsigned long data)
{
	struct smc_wr_tx_pend_priv *tx_pend;
	struct smc_wr_rx_hdr *wr_tx;
	int i;

	for_each_set_bit(i, link->wr_tx_mask, link->wr_tx_cnt) {
		wr_tx = (struct smc_wr_rx_hdr *)&link->wr_tx_bufs[i];
		if (wr_tx->type != wr_tx_hdr_type)
			continue;
		tx_pend = &link->wr_tx_pends[i].priv;
		if (filter(tx_pend, data))
			dismisser(tx_pend);
	}
}

/****************************** receive queue ********************************/

int smc_wr_rx_register_handler(struct smc_wr_rx_handler *handler)
{
	struct smc_wr_rx_handler *h_iter;
	int rc = 0;

	spin_lock(&smc_wr_rx_hash_lock);
	hash_for_each_possible(smc_wr_rx_hash, h_iter, list, handler->type) {
		if (h_iter->type == handler->type) {
			rc = -EEXIST;
			goto out_unlock;
		}
	}
	hash_add(smc_wr_rx_hash, &handler->list, handler->type);
out_unlock:
	spin_unlock(&smc_wr_rx_hash_lock);
	return rc;
}
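
/* Example registration (sketch only; the handler function and message type
 * are placeholders): users such as the LLC and CDC layers register one
 * handler per message type at init time, before any receive WRs are posted:
 *
 *	static struct smc_wr_rx_handler my_rx_handler = {
 *		.handler	= my_rx_handler_fn,
 *		.type		= MY_MSG_TYPE,
 *	};
 *
 *	rc = smc_wr_rx_register_handler(&my_rx_handler);
 */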

/* Demultiplex a received work request based on the message type to its handler.
 * Relies on smc_wr_rx_hash having been completely filled before any IB WRs
 * are posted, and not being modified afterwards, so we don't need to lock it.
 */
static inline void smc_wr_rx_demultiplex(struct ib_wc *wc)
{
	struct smc_link *link = (struct smc_link *)wc->qp->qp_context;
	struct smc_wr_rx_handler *handler;
	struct smc_wr_rx_hdr *wr_rx;
	u64 temp_wr_id;
	u32 index;

	if (wc->byte_len < sizeof(*wr_rx))
		return; /* short message */
	temp_wr_id = wc->wr_id;
	index = do_div(temp_wr_id, link->wr_rx_cnt);
	wr_rx = (struct smc_wr_rx_hdr *)&link->wr_rx_bufs[index];
	hash_for_each_possible(smc_wr_rx_hash, handler, list, wr_rx->type) {
		if (handler->type == wr_rx->type)
			handler->handler(wc, wr_rx);
	}
}

static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num)
{
	struct smc_link *link;
	int i;

	for (i = 0; i < num; i++) {
		link = wc[i].qp->qp_context;
		if (wc[i].status == IB_WC_SUCCESS) {
			link->wr_rx_tstamp = jiffies;
			smc_wr_rx_demultiplex(&wc[i]);
			smc_wr_rx_post(link); /* refill WR RX */
		} else {
			/* handle status errors */
			switch (wc[i].status) {
			case IB_WC_RETRY_EXC_ERR:
			case IB_WC_RNR_RETRY_EXC_ERR:
			case IB_WC_WR_FLUSH_ERR:
				smcr_link_down_cond_sched(link);
				break;
			default:
				smc_wr_rx_post(link); /* refill WR RX */
				break;
			}
		}
	}
}

static void smc_wr_rx_tasklet_fn(struct tasklet_struct *t)
{
	struct smc_ib_device *dev = from_tasklet(dev, t, recv_tasklet);
	struct ib_wc wc[SMC_WR_MAX_POLL_CQE];
	int polled = 0;
	int rc;

again:
	polled++;
	do {
		memset(&wc, 0, sizeof(wc));
		rc = ib_poll_cq(dev->roce_cq_recv, SMC_WR_MAX_POLL_CQE, wc);
		if (polled == 1) {
			ib_req_notify_cq(dev->roce_cq_recv,
					 IB_CQ_SOLICITED_MASK
					 | IB_CQ_REPORT_MISSED_EVENTS);
		}
		if (!rc)
			break;
		smc_wr_rx_process_cqes(&wc[0], rc);
	} while (rc > 0);
	if (polled == 1)
		goto again;
}

void smc_wr_rx_cq_handler(struct ib_cq *ib_cq, void *cq_context)
{
	struct smc_ib_device *dev = (struct smc_ib_device *)cq_context;

	tasklet_schedule(&dev->recv_tasklet);
}

int smc_wr_rx_post_init(struct smc_link *link)
{
	u32 i;
	int rc = 0;

	for (i = 0; i < link->wr_rx_cnt; i++)
		rc = smc_wr_rx_post(link);
	return rc;
}

/***************************** init, exit, misc ******************************/

void smc_wr_remember_qp_attr(struct smc_link *lnk)
{
	struct ib_qp_attr *attr = &lnk->qp_attr;
	struct ib_qp_init_attr init_attr;

	memset(attr, 0, sizeof(*attr));
	memset(&init_attr, 0, sizeof(init_attr));
	ib_query_qp(lnk->roce_qp, attr,
		    IB_QP_STATE |
		    IB_QP_CUR_STATE |
		    IB_QP_PKEY_INDEX |
		    IB_QP_PORT |
		    IB_QP_QKEY |
		    IB_QP_AV |
		    IB_QP_PATH_MTU |
		    IB_QP_TIMEOUT |
		    IB_QP_RETRY_CNT |
		    IB_QP_RNR_RETRY |
		    IB_QP_RQ_PSN |
		    IB_QP_ALT_PATH |
		    IB_QP_MIN_RNR_TIMER |
		    IB_QP_SQ_PSN |
		    IB_QP_PATH_MIG_STATE |
		    IB_QP_CAP |
		    IB_QP_DEST_QPN,
		    &init_attr);

	lnk->wr_tx_cnt = min_t(size_t, SMC_WR_BUF_CNT,
			       lnk->qp_attr.cap.max_send_wr);
	lnk->wr_rx_cnt = min_t(size_t, SMC_WR_BUF_CNT * 3,
			       lnk->qp_attr.cap.max_recv_wr);
}
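
/* Worked example for the clamping above (assuming SMC_WR_BUF_CNT is 16, as
 * defined in smc_wr.h): a QP created with max_send_wr = 8 yields
 * wr_tx_cnt = min(16, 8) = 8 send slots, while max_recv_wr = 64 yields
 * wr_rx_cnt = min(48, 64) = 48 receive buffers.
 */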

static void smc_wr_init_sge(struct smc_link *lnk)
{
	int sges_per_buf = (lnk->lgr->smc_version == SMC_V2) ? 2 : 1;
	u32 i;

	for (i = 0; i < lnk->wr_tx_cnt; i++) {
		lnk->wr_tx_sges[i].addr =
			lnk->wr_tx_dma_addr + i * SMC_WR_BUF_SIZE;
		lnk->wr_tx_sges[i].length = SMC_WR_TX_SIZE;
		lnk->wr_tx_sges[i].lkey = lnk->roce_pd->local_dma_lkey;
		lnk->wr_tx_rdma_sges[i].tx_rdma_sge[0].wr_tx_rdma_sge[0].lkey =
			lnk->roce_pd->local_dma_lkey;
		lnk->wr_tx_rdma_sges[i].tx_rdma_sge[0].wr_tx_rdma_sge[1].lkey =
			lnk->roce_pd->local_dma_lkey;
		lnk->wr_tx_rdma_sges[i].tx_rdma_sge[1].wr_tx_rdma_sge[0].lkey =
			lnk->roce_pd->local_dma_lkey;
		lnk->wr_tx_rdma_sges[i].tx_rdma_sge[1].wr_tx_rdma_sge[1].lkey =
			lnk->roce_pd->local_dma_lkey;
		lnk->wr_tx_ibs[i].next = NULL;
		lnk->wr_tx_ibs[i].sg_list = &lnk->wr_tx_sges[i];
		lnk->wr_tx_ibs[i].num_sge = 1;
		lnk->wr_tx_ibs[i].opcode = IB_WR_SEND;
		lnk->wr_tx_ibs[i].send_flags =
			IB_SEND_SIGNALED | IB_SEND_SOLICITED;
		lnk->wr_tx_rdmas[i].wr_tx_rdma[0].wr.opcode = IB_WR_RDMA_WRITE;
		lnk->wr_tx_rdmas[i].wr_tx_rdma[1].wr.opcode = IB_WR_RDMA_WRITE;
		lnk->wr_tx_rdmas[i].wr_tx_rdma[0].wr.sg_list =
			lnk->wr_tx_rdma_sges[i].tx_rdma_sge[0].wr_tx_rdma_sge;
		lnk->wr_tx_rdmas[i].wr_tx_rdma[1].wr.sg_list =
			lnk->wr_tx_rdma_sges[i].tx_rdma_sge[1].wr_tx_rdma_sge;
	}

	if (lnk->lgr->smc_version == SMC_V2) {
		lnk->wr_tx_v2_sge->addr = lnk->wr_tx_v2_dma_addr;
		lnk->wr_tx_v2_sge->length = SMC_WR_BUF_V2_SIZE;
		lnk->wr_tx_v2_sge->lkey = lnk->roce_pd->local_dma_lkey;

		lnk->wr_tx_v2_ib->next = NULL;
		lnk->wr_tx_v2_ib->sg_list = lnk->wr_tx_v2_sge;
		lnk->wr_tx_v2_ib->num_sge = 1;
		lnk->wr_tx_v2_ib->opcode = IB_WR_SEND;
		lnk->wr_tx_v2_ib->send_flags =
			IB_SEND_SIGNALED | IB_SEND_SOLICITED;
	}

	/* With SMC-Rv2 there can be messages larger than SMC_WR_TX_SIZE.
	 * Each ib_recv_wr gets 2 sges, the second one is a spillover buffer
	 * that is shared by all receive WRs. When a larger message arrives,
	 * the content of the first small sge is copied to the beginning of
	 * the larger spillover buffer, allowing easy data mapping.
	 */
	for (i = 0; i < lnk->wr_rx_cnt; i++) {
		int x = i * sges_per_buf;

		lnk->wr_rx_sges[x].addr =
			lnk->wr_rx_dma_addr + i * SMC_WR_BUF_SIZE;
		lnk->wr_rx_sges[x].length = SMC_WR_TX_SIZE;
		lnk->wr_rx_sges[x].lkey = lnk->roce_pd->local_dma_lkey;
		if (lnk->lgr->smc_version == SMC_V2) {
			lnk->wr_rx_sges[x + 1].addr =
				lnk->wr_rx_v2_dma_addr + SMC_WR_TX_SIZE;
			lnk->wr_rx_sges[x + 1].length =
				SMC_WR_BUF_V2_SIZE - SMC_WR_TX_SIZE;
			lnk->wr_rx_sges[x + 1].lkey =
				lnk->roce_pd->local_dma_lkey;
		}
		lnk->wr_rx_ibs[i].next = NULL;
		lnk->wr_rx_ibs[i].sg_list = &lnk->wr_rx_sges[x];
		lnk->wr_rx_ibs[i].num_sge = sges_per_buf;
	}
	lnk->wr_reg.wr.next = NULL;
	lnk->wr_reg.wr.num_sge = 0;
	lnk->wr_reg.wr.send_flags = IB_SEND_SIGNALED;
	lnk->wr_reg.wr.opcode = IB_WR_REG_MR;
	lnk->wr_reg.access = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE;
}
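
/* Layout example for the receive sges set up above (sketch, SMC-Rv2 case,
 * i.e. sges_per_buf == 2): receive buffer i uses wr_rx_sges[2 * i] for the
 * first SMC_WR_TX_SIZE bytes of a message and wr_rx_sges[2 * i + 1] for any
 * remainder, which the HCA scatters into the shared spillover buffer at
 * offset SMC_WR_TX_SIZE.
 */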

void smc_wr_free_link(struct smc_link *lnk)
{
	struct ib_device *ibdev;

	if (!lnk->smcibdev)
		return;
	ibdev = lnk->smcibdev->ibdev;

	smc_wr_wakeup_reg_wait(lnk);
	smc_wr_wakeup_tx_wait(lnk);

	if (smc_wr_tx_wait_no_pending_sends(lnk))
		bitmap_zero(lnk->wr_tx_mask, SMC_WR_BUF_CNT);
	wait_event(lnk->wr_reg_wait, (!atomic_read(&lnk->wr_reg_refcnt)));
	wait_event(lnk->wr_tx_wait, (!atomic_read(&lnk->wr_tx_refcnt)));

	if (lnk->wr_rx_dma_addr) {
		ib_dma_unmap_single(ibdev, lnk->wr_rx_dma_addr,
				    SMC_WR_BUF_SIZE * lnk->wr_rx_cnt,
				    DMA_FROM_DEVICE);
		lnk->wr_rx_dma_addr = 0;
	}
	if (lnk->wr_rx_v2_dma_addr) {
		ib_dma_unmap_single(ibdev, lnk->wr_rx_v2_dma_addr,
				    SMC_WR_BUF_V2_SIZE,
				    DMA_FROM_DEVICE);
		lnk->wr_rx_v2_dma_addr = 0;
	}
	if (lnk->wr_tx_dma_addr) {
		ib_dma_unmap_single(ibdev, lnk->wr_tx_dma_addr,
				    SMC_WR_BUF_SIZE * lnk->wr_tx_cnt,
				    DMA_TO_DEVICE);
		lnk->wr_tx_dma_addr = 0;
	}
	if (lnk->wr_tx_v2_dma_addr) {
		ib_dma_unmap_single(ibdev, lnk->wr_tx_v2_dma_addr,
				    SMC_WR_BUF_V2_SIZE,
				    DMA_TO_DEVICE);
		lnk->wr_tx_v2_dma_addr = 0;
	}
}

void smc_wr_free_lgr_mem(struct smc_link_group *lgr)
{
	if (lgr->smc_version < SMC_V2)
		return;

	kfree(lgr->wr_rx_buf_v2);
	lgr->wr_rx_buf_v2 = NULL;
	kfree(lgr->wr_tx_buf_v2);
	lgr->wr_tx_buf_v2 = NULL;
}

void smc_wr_free_link_mem(struct smc_link *lnk)
{
	kfree(lnk->wr_tx_v2_ib);
	lnk->wr_tx_v2_ib = NULL;
	kfree(lnk->wr_tx_v2_sge);
	lnk->wr_tx_v2_sge = NULL;
	kfree(lnk->wr_tx_v2_pend);
	lnk->wr_tx_v2_pend = NULL;
	kfree(lnk->wr_tx_compl);
	lnk->wr_tx_compl = NULL;
	kfree(lnk->wr_tx_pends);
	lnk->wr_tx_pends = NULL;
	bitmap_free(lnk->wr_tx_mask);
	lnk->wr_tx_mask = NULL;
	kfree(lnk->wr_tx_sges);
	lnk->wr_tx_sges = NULL;
	kfree(lnk->wr_tx_rdma_sges);
	lnk->wr_tx_rdma_sges = NULL;
	kfree(lnk->wr_rx_sges);
	lnk->wr_rx_sges = NULL;
	kfree(lnk->wr_tx_rdmas);
	lnk->wr_tx_rdmas = NULL;
	kfree(lnk->wr_rx_ibs);
	lnk->wr_rx_ibs = NULL;
	kfree(lnk->wr_tx_ibs);
	lnk->wr_tx_ibs = NULL;
	kfree(lnk->wr_tx_bufs);
	lnk->wr_tx_bufs = NULL;
	kfree(lnk->wr_rx_bufs);
	lnk->wr_rx_bufs = NULL;
}

int smc_wr_alloc_lgr_mem(struct smc_link_group *lgr)
{
	if (lgr->smc_version < SMC_V2)
		return 0;

	lgr->wr_rx_buf_v2 = kzalloc(SMC_WR_BUF_V2_SIZE, GFP_KERNEL);
	if (!lgr->wr_rx_buf_v2)
		return -ENOMEM;
	lgr->wr_tx_buf_v2 = kzalloc(SMC_WR_BUF_V2_SIZE, GFP_KERNEL);
	if (!lgr->wr_tx_buf_v2) {
		kfree(lgr->wr_rx_buf_v2);
		return -ENOMEM;
	}
	return 0;
}
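
/* Allocate all per-link work request arrays: SMC_WR_BUF_CNT send slots and
 * three times as many receive buffers, plus the descriptors for the single
 * large v2 send buffer when the link group runs SMC-Rv2.
 */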

int smc_wr_alloc_link_mem(struct smc_link *link)
{
	int sges_per_buf = link->lgr->smc_version == SMC_V2 ? 2 : 1;

	/* allocate link related memory */
	link->wr_tx_bufs = kcalloc(SMC_WR_BUF_CNT, SMC_WR_BUF_SIZE, GFP_KERNEL);
	if (!link->wr_tx_bufs)
		goto no_mem;
	link->wr_rx_bufs = kcalloc(SMC_WR_BUF_CNT * 3, SMC_WR_BUF_SIZE,
				   GFP_KERNEL);
	if (!link->wr_rx_bufs)
		goto no_mem_wr_tx_bufs;
	link->wr_tx_ibs = kcalloc(SMC_WR_BUF_CNT, sizeof(link->wr_tx_ibs[0]),
				  GFP_KERNEL);
	if (!link->wr_tx_ibs)
		goto no_mem_wr_rx_bufs;
	link->wr_rx_ibs = kcalloc(SMC_WR_BUF_CNT * 3,
				  sizeof(link->wr_rx_ibs[0]),
				  GFP_KERNEL);
	if (!link->wr_rx_ibs)
		goto no_mem_wr_tx_ibs;
	link->wr_tx_rdmas = kcalloc(SMC_WR_BUF_CNT,
				    sizeof(link->wr_tx_rdmas[0]),
				    GFP_KERNEL);
	if (!link->wr_tx_rdmas)
		goto no_mem_wr_rx_ibs;
	link->wr_tx_rdma_sges = kcalloc(SMC_WR_BUF_CNT,
					sizeof(link->wr_tx_rdma_sges[0]),
					GFP_KERNEL);
	if (!link->wr_tx_rdma_sges)
		goto no_mem_wr_tx_rdmas;
	link->wr_tx_sges = kcalloc(SMC_WR_BUF_CNT, sizeof(link->wr_tx_sges[0]),
				   GFP_KERNEL);
	if (!link->wr_tx_sges)
		goto no_mem_wr_tx_rdma_sges;
	link->wr_rx_sges = kcalloc(SMC_WR_BUF_CNT * 3,
				   sizeof(link->wr_rx_sges[0]) * sges_per_buf,
				   GFP_KERNEL);
	if (!link->wr_rx_sges)
		goto no_mem_wr_tx_sges;
	link->wr_tx_mask = bitmap_zalloc(SMC_WR_BUF_CNT, GFP_KERNEL);
	if (!link->wr_tx_mask)
		goto no_mem_wr_rx_sges;
	link->wr_tx_pends = kcalloc(SMC_WR_BUF_CNT,
				    sizeof(link->wr_tx_pends[0]),
				    GFP_KERNEL);
	if (!link->wr_tx_pends)
		goto no_mem_wr_tx_mask;
	link->wr_tx_compl = kcalloc(SMC_WR_BUF_CNT,
				    sizeof(link->wr_tx_compl[0]),
				    GFP_KERNEL);
	if (!link->wr_tx_compl)
		goto no_mem_wr_tx_pends;

	if (link->lgr->smc_version == SMC_V2) {
		link->wr_tx_v2_ib = kzalloc(sizeof(*link->wr_tx_v2_ib),
					    GFP_KERNEL);
		if (!link->wr_tx_v2_ib)
			goto no_mem_tx_compl;
		link->wr_tx_v2_sge = kzalloc(sizeof(*link->wr_tx_v2_sge),
					     GFP_KERNEL);
		if (!link->wr_tx_v2_sge)
			goto no_mem_v2_ib;
		link->wr_tx_v2_pend = kzalloc(sizeof(*link->wr_tx_v2_pend),
					      GFP_KERNEL);
		if (!link->wr_tx_v2_pend)
			goto no_mem_v2_sge;
	}
	return 0;

no_mem_v2_sge:
	kfree(link->wr_tx_v2_sge);
no_mem_v2_ib:
	kfree(link->wr_tx_v2_ib);
no_mem_tx_compl:
	kfree(link->wr_tx_compl);
no_mem_wr_tx_pends:
	kfree(link->wr_tx_pends);
no_mem_wr_tx_mask:
	bitmap_free(link->wr_tx_mask);
no_mem_wr_rx_sges:
	kfree(link->wr_rx_sges);
no_mem_wr_tx_sges:
	kfree(link->wr_tx_sges);
no_mem_wr_tx_rdma_sges:
	kfree(link->wr_tx_rdma_sges);
no_mem_wr_tx_rdmas:
	kfree(link->wr_tx_rdmas);
no_mem_wr_rx_ibs:
	kfree(link->wr_rx_ibs);
no_mem_wr_tx_ibs:
	kfree(link->wr_tx_ibs);
no_mem_wr_rx_bufs:
	kfree(link->wr_rx_bufs);
no_mem_wr_tx_bufs:
	kfree(link->wr_tx_bufs);
no_mem:
	return -ENOMEM;
}

void smc_wr_remove_dev(struct smc_ib_device *smcibdev)
{
	tasklet_kill(&smcibdev->recv_tasklet);
	tasklet_kill(&smcibdev->send_tasklet);
}

void smc_wr_add_dev(struct smc_ib_device *smcibdev)
{
	tasklet_setup(&smcibdev->recv_tasklet, smc_wr_rx_tasklet_fn);
	tasklet_setup(&smcibdev->send_tasklet, smc_wr_tx_tasklet_fn);
}
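
/* Complete the link setup after memory allocation: map the send/receive
 * buffer arrays (and the v2 buffers, if any) for DMA, then initialize the
 * sges, the tx slot bitmap and the wait queues.
 */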

int smc_wr_create_link(struct smc_link *lnk)
{
	struct ib_device *ibdev = lnk->smcibdev->ibdev;
	int rc = 0;

	smc_wr_tx_set_wr_id(&lnk->wr_tx_id, 0);
	lnk->wr_rx_id = 0;
	lnk->wr_rx_dma_addr = ib_dma_map_single(
		ibdev, lnk->wr_rx_bufs, SMC_WR_BUF_SIZE * lnk->wr_rx_cnt,
		DMA_FROM_DEVICE);
	if (ib_dma_mapping_error(ibdev, lnk->wr_rx_dma_addr)) {
		lnk->wr_rx_dma_addr = 0;
		rc = -EIO;
		goto out;
	}
	if (lnk->lgr->smc_version == SMC_V2) {
		lnk->wr_rx_v2_dma_addr = ib_dma_map_single(ibdev,
			lnk->lgr->wr_rx_buf_v2, SMC_WR_BUF_V2_SIZE,
			DMA_FROM_DEVICE);
		if (ib_dma_mapping_error(ibdev, lnk->wr_rx_v2_dma_addr)) {
			lnk->wr_rx_v2_dma_addr = 0;
			rc = -EIO;
			goto dma_unmap;
		}
		lnk->wr_tx_v2_dma_addr = ib_dma_map_single(ibdev,
			lnk->lgr->wr_tx_buf_v2, SMC_WR_BUF_V2_SIZE,
			DMA_TO_DEVICE);
		if (ib_dma_mapping_error(ibdev, lnk->wr_tx_v2_dma_addr)) {
			lnk->wr_tx_v2_dma_addr = 0;
			rc = -EIO;
			goto dma_unmap;
		}
	}
	lnk->wr_tx_dma_addr = ib_dma_map_single(
		ibdev, lnk->wr_tx_bufs, SMC_WR_BUF_SIZE * lnk->wr_tx_cnt,
		DMA_TO_DEVICE);
	if (ib_dma_mapping_error(ibdev, lnk->wr_tx_dma_addr)) {
		rc = -EIO;
		goto dma_unmap;
	}
	smc_wr_init_sge(lnk);
	bitmap_zero(lnk->wr_tx_mask, SMC_WR_BUF_CNT);
	init_waitqueue_head(&lnk->wr_tx_wait);
	atomic_set(&lnk->wr_tx_refcnt, 0);
	init_waitqueue_head(&lnk->wr_reg_wait);
	atomic_set(&lnk->wr_reg_refcnt, 0);
	return rc;

dma_unmap:
	if (lnk->wr_rx_v2_dma_addr) {
		ib_dma_unmap_single(ibdev, lnk->wr_rx_v2_dma_addr,
				    SMC_WR_BUF_V2_SIZE,
				    DMA_FROM_DEVICE);
		lnk->wr_rx_v2_dma_addr = 0;
	}
	if (lnk->wr_tx_v2_dma_addr) {
		ib_dma_unmap_single(ibdev, lnk->wr_tx_v2_dma_addr,
				    SMC_WR_BUF_V2_SIZE,
				    DMA_TO_DEVICE);
		lnk->wr_tx_v2_dma_addr = 0;
	}
	ib_dma_unmap_single(ibdev, lnk->wr_rx_dma_addr,
			    SMC_WR_BUF_SIZE * lnk->wr_rx_cnt,
			    DMA_FROM_DEVICE);
	lnk->wr_rx_dma_addr = 0;
out:
	return rc;
}
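
/* Rough lifecycle of the WR resources in this file (sketch; the actual
 * callers live outside this file, e.g. in smc_core.c and smc_ib.c):
 * smc_wr_add_dev() arms the tasklets per device; per link,
 * smc_wr_alloc_link_mem() and smc_wr_remember_qp_attr() precede
 * smc_wr_create_link(), and receive buffers are posted via
 * smc_wr_rx_post_init() once the QP is ready; teardown runs
 * smc_wr_free_link() before smc_wr_free_link_mem(), and
 * smc_wr_remove_dev() when the device goes away.
 */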