// SPDX-License-Identifier: GPL-2.0
/*
 * Shared Memory Communications over RDMA (SMC-R) and RoCE
 *
 * IB infrastructure:
 * Establish SMC-R as an Infiniband Client to be notified about added and
 * removed IB devices of type RDMA.
 * Determine device and port characteristics for these IB devices.
 *
 * Copyright IBM Corp. 2016
 *
 * Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
 */

#include <linux/random.h>
#include <linux/workqueue.h>
#include <linux/scatterlist.h>
#include <linux/wait.h>
#include <linux/mutex.h>
#include <rdma/ib_verbs.h>
#include <rdma/ib_cache.h>

#include "smc_pnet.h"
#include "smc_ib.h"
#include "smc_core.h"
#include "smc_wr.h"
#include "smc.h"
#include "smc_netlink.h"

#define SMC_MAX_CQE 32766	/* max. # of completion queue elements */

#define SMC_QP_MIN_RNR_TIMER		5
#define SMC_QP_TIMEOUT			15 /* 4096 * 2 ** timeout usec */
#define SMC_QP_RETRY_CNT		7 /* 7: infinite */
#define SMC_QP_RNR_RETRY		7 /* 7: infinite */

struct smc_ib_devices smc_ib_devices = {	/* smc-registered ib devices */
	.mutex = __MUTEX_INITIALIZER(smc_ib_devices.mutex),
	.list = LIST_HEAD_INIT(smc_ib_devices.list),
};

u8 local_systemid[SMC_SYSTEMID_LEN];		/* unique system identifier */

/* move the QP of a link into the INIT state */
static int smc_ib_modify_qp_init(struct smc_link *lnk)
{
	struct ib_qp_attr qp_attr;

	memset(&qp_attr, 0, sizeof(qp_attr));
	qp_attr.qp_state = IB_QPS_INIT;
	qp_attr.pkey_index = 0;
	qp_attr.port_num = lnk->ibport;
	qp_attr.qp_access_flags = IB_ACCESS_LOCAL_WRITE
				| IB_ACCESS_REMOTE_WRITE;
	return ib_modify_qp(lnk->roce_qp, &qp_attr,
			    IB_QP_STATE | IB_QP_PKEY_INDEX |
			    IB_QP_ACCESS_FLAGS | IB_QP_PORT);
}

/* move the QP of a link into the RTR (ready to receive) state */
static int smc_ib_modify_qp_rtr(struct smc_link *lnk)
{
	enum ib_qp_attr_mask qp_attr_mask =
		IB_QP_STATE | IB_QP_AV | IB_QP_PATH_MTU | IB_QP_DEST_QPN |
		IB_QP_RQ_PSN | IB_QP_MAX_DEST_RD_ATOMIC | IB_QP_MIN_RNR_TIMER;
	struct ib_qp_attr qp_attr;

	memset(&qp_attr, 0, sizeof(qp_attr));
	qp_attr.qp_state = IB_QPS_RTR;
	qp_attr.path_mtu = min(lnk->path_mtu, lnk->peer_mtu);
	qp_attr.ah_attr.type = RDMA_AH_ATTR_TYPE_ROCE;
	rdma_ah_set_port_num(&qp_attr.ah_attr, lnk->ibport);
	rdma_ah_set_grh(&qp_attr.ah_attr, NULL, 0, lnk->sgid_index, 1, 0);
	rdma_ah_set_dgid_raw(&qp_attr.ah_attr, lnk->peer_gid);
	memcpy(&qp_attr.ah_attr.roce.dmac, lnk->peer_mac,
	       sizeof(lnk->peer_mac));
	qp_attr.dest_qp_num = lnk->peer_qpn;
	qp_attr.rq_psn = lnk->peer_psn; /* starting receive packet seq # */
	qp_attr.max_dest_rd_atomic = 1; /* max # of resources for incoming
					 * requests
					 */
	qp_attr.min_rnr_timer = SMC_QP_MIN_RNR_TIMER;

	return ib_modify_qp(lnk->roce_qp, &qp_attr, qp_attr_mask);
}

/* move the QP of a link into the RTS (ready to send) state */
int smc_ib_modify_qp_rts(struct smc_link *lnk)
{
	struct ib_qp_attr qp_attr;

	memset(&qp_attr, 0, sizeof(qp_attr));
	qp_attr.qp_state = IB_QPS_RTS;
	qp_attr.timeout = SMC_QP_TIMEOUT;	/* local ack timeout */
	qp_attr.retry_cnt = SMC_QP_RETRY_CNT;	/* retry count */
	qp_attr.rnr_retry = SMC_QP_RNR_RETRY;	/* RNR retries, 7=infinite */
	qp_attr.sq_psn = lnk->psn_initial;	/* starting send packet seq # */
	qp_attr.max_rd_atomic = 1;	/* # of outstanding RDMA reads and
					 * atomic ops allowed
					 */
	return ib_modify_qp(lnk->roce_qp, &qp_attr,
			    IB_QP_STATE | IB_QP_TIMEOUT | IB_QP_RETRY_CNT |
			    IB_QP_SQ_PSN | IB_QP_RNR_RETRY |
			    IB_QP_MAX_QP_RD_ATOMIC);
}

/* move the QP of a link back into the RESET state */
int smc_ib_modify_qp_reset(struct smc_link *lnk)
{
	struct ib_qp_attr qp_attr;

	memset(&qp_attr, 0, sizeof(qp_attr));
	qp_attr.qp_state = IB_QPS_RESET;
	return ib_modify_qp(lnk->roce_qp, &qp_attr, IB_QP_STATE);
}

/* bring the QP of a new link from RESET via INIT to RTR,
 * and to RTS on the server side
 */
int smc_ib_ready_link(struct smc_link *lnk)
{
	struct smc_link_group *lgr = smc_get_lgr(lnk);
	int rc = 0;

	rc = smc_ib_modify_qp_init(lnk);
	if (rc)
		goto out;

	rc = smc_ib_modify_qp_rtr(lnk);
	if (rc)
		goto out;
	smc_wr_remember_qp_attr(lnk);
	rc = ib_req_notify_cq(lnk->smcibdev->roce_cq_recv,
			      IB_CQ_SOLICITED_MASK);
	if (rc)
		goto out;
	rc = smc_wr_rx_post_init(lnk);
	if (rc)
		goto out;
	smc_wr_remember_qp_attr(lnk);

	if (lgr->role == SMC_SERV) {
		rc = smc_ib_modify_qp_rts(lnk);
		if (rc)
			goto out;
		smc_wr_remember_qp_attr(lnk);
	}
out:
	return rc;
}

/* read the MAC address of an ib device port from its GID table entry 0 */
static int smc_ib_fill_mac(struct smc_ib_device *smcibdev, u8 ibport)
{
	const struct ib_gid_attr *attr;
	int rc;

	attr = rdma_get_gid_attr(smcibdev->ibdev, ibport, 0);
	if (IS_ERR(attr))
		return -ENODEV;

	rc = rdma_read_gid_l2_fields(attr, NULL, smcibdev->mac[ibport - 1]);
	rdma_put_gid_attr(attr);
	return rc;
}

/* Create an identifier unique for this instance of SMC-R.
 * The MAC-address of the first active registered IB device
 * plus a random 2-byte number is used to create this identifier.
 * This name is delivered to the peer during connection initialization.
 */
static inline void smc_ib_define_local_systemid(struct smc_ib_device *smcibdev,
						u8 ibport)
{
	memcpy(&local_systemid[2], &smcibdev->mac[ibport - 1],
	       sizeof(smcibdev->mac[ibport - 1]));
}

bool smc_ib_is_valid_local_systemid(void)
{
	return !is_zero_ether_addr(&local_systemid[2]);
}

static void smc_ib_init_local_systemid(void)
{
	get_random_bytes(&local_systemid[0], 2);
}

bool smc_ib_port_active(struct smc_ib_device *smcibdev, u8 ibport)
{
	return smcibdev->pattr[ibport - 1].state == IB_PORT_ACTIVE;
}

/* determine the gid for an ib-device port and vlan id */
int smc_ib_determine_gid(struct smc_ib_device *smcibdev, u8 ibport,
			 unsigned short vlan_id, u8 gid[], u8 *sgid_index)
{
	const struct ib_gid_attr *attr;
	const struct net_device *ndev;
	int i;

	for (i = 0; i < smcibdev->pattr[ibport - 1].gid_tbl_len; i++) {
		attr = rdma_get_gid_attr(smcibdev->ibdev, ibport, i);
		if (IS_ERR(attr))
			continue;

		rcu_read_lock();
		ndev = rdma_read_gid_attr_ndev_rcu(attr);
		if (!IS_ERR(ndev) &&
		    ((!vlan_id && !is_vlan_dev(ndev)) ||
		     (vlan_id && is_vlan_dev(ndev) &&
		      vlan_dev_vlan_id(ndev) == vlan_id)) &&
		    attr->gid_type == IB_GID_TYPE_ROCE) {
			rcu_read_unlock();
			if (gid)
				memcpy(gid, &attr->gid, SMC_GID_SIZE);
			if (sgid_index)
				*sgid_index = attr->index;
			rdma_put_gid_attr(attr);
			return 0;
		}
		rcu_read_unlock();
		rdma_put_gid_attr(attr);
	}
	return -ENODEV;
}

static int smc_ib_remember_port_attr(struct smc_ib_device *smcibdev, u8 ibport)
{
	int rc;

	memset(&smcibdev->pattr[ibport - 1], 0,
	       sizeof(smcibdev->pattr[ibport - 1]));
	rc = ib_query_port(smcibdev->ibdev, ibport,
			   &smcibdev->pattr[ibport - 1]);
	if (rc)
		goto out;
	/* the SMC protocol requires specification of the RoCE MAC address */
	rc = smc_ib_fill_mac(smcibdev, ibport);
	if (rc)
		goto out;
	if (!smc_ib_is_valid_local_systemid() &&
	    smc_ib_port_active(smcibdev, ibport))
		/* create unique system identifier */
		smc_ib_define_local_systemid(smcibdev, ibport);
out:
	return rc;
}

/* process context wrapper for might_sleep smc_ib_remember_port_attr */
static void smc_ib_port_event_work(struct work_struct *work)
{
	struct smc_ib_device *smcibdev = container_of(
		work, struct smc_ib_device, port_event_work);
	u8 port_idx;

	for_each_set_bit(port_idx, &smcibdev->port_event_mask, SMC_MAX_PORTS) {
		smc_ib_remember_port_attr(smcibdev, port_idx + 1);
		clear_bit(port_idx, &smcibdev->port_event_mask);
		if (!smc_ib_port_active(smcibdev, port_idx + 1)) {
			set_bit(port_idx, smcibdev->ports_going_away);
			smcr_port_err(smcibdev, port_idx + 1);
		} else {
			clear_bit(port_idx, smcibdev->ports_going_away);
			smcr_port_add(smcibdev, port_idx + 1);
		}
	}
}

/* can be called in IRQ context */
static void smc_ib_global_event_handler(struct ib_event_handler *handler,
					struct ib_event *ibevent)
{
	struct smc_ib_device *smcibdev;
	bool schedule = false;
	u8 port_idx;

	smcibdev = container_of(handler, struct smc_ib_device, event_handler);

	switch (ibevent->event) {
	case IB_EVENT_DEVICE_FATAL:
		/* terminate all ports on device */
		for (port_idx = 0; port_idx < SMC_MAX_PORTS; port_idx++) {
			set_bit(port_idx, &smcibdev->port_event_mask);
			if (!test_and_set_bit(port_idx,
					      smcibdev->ports_going_away))
				schedule = true;
		}
		if (schedule)
			schedule_work(&smcibdev->port_event_work);
		break;
	case IB_EVENT_PORT_ACTIVE:
		port_idx = ibevent->element.port_num - 1;
		if (port_idx >= SMC_MAX_PORTS)
			break;
		set_bit(port_idx, &smcibdev->port_event_mask);
		if (test_and_clear_bit(port_idx, smcibdev->ports_going_away))
			schedule_work(&smcibdev->port_event_work);
		break;
	case IB_EVENT_PORT_ERR:
		port_idx = ibevent->element.port_num - 1;
		if (port_idx >= SMC_MAX_PORTS)
			break;
		set_bit(port_idx, &smcibdev->port_event_mask);
		if (!test_and_set_bit(port_idx, smcibdev->ports_going_away))
			schedule_work(&smcibdev->port_event_work);
		break;
	case IB_EVENT_GID_CHANGE:
		port_idx = ibevent->element.port_num - 1;
		if (port_idx >= SMC_MAX_PORTS)
			break;
		set_bit(port_idx, &smcibdev->port_event_mask);
		schedule_work(&smcibdev->port_event_work);
		break;
	default:
		break;
	}
}

void smc_ib_dealloc_protection_domain(struct smc_link *lnk)
{
	if (lnk->roce_pd)
		ib_dealloc_pd(lnk->roce_pd);
	lnk->roce_pd = NULL;
}

int smc_ib_create_protection_domain(struct smc_link *lnk)
{
	int rc;

	lnk->roce_pd = ib_alloc_pd(lnk->smcibdev->ibdev, 0);
	rc = PTR_ERR_OR_ZERO(lnk->roce_pd);
	if (IS_ERR(lnk->roce_pd))
		lnk->roce_pd = NULL;
	return rc;
}

static bool smcr_diag_is_dev_critical(struct smc_lgr_list *smc_lgr,
				      struct smc_ib_device *smcibdev)
{
	struct smc_link_group *lgr;
	bool rc = false;
	int i;

	spin_lock_bh(&smc_lgr->lock);
	list_for_each_entry(lgr, &smc_lgr->list, list) {
		if (lgr->is_smcd)
			continue;
		for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
			if (lgr->lnk[i].state == SMC_LNK_UNUSED ||
			    lgr->lnk[i].smcibdev != smcibdev)
				continue;
			if (lgr->type == SMC_LGR_SINGLE ||
			    lgr->type == SMC_LGR_ASYMMETRIC_LOCAL) {
				rc = true;
				goto out;
			}
		}
	}
out:
	spin_unlock_bh(&smc_lgr->lock);
	return rc;
}

static int smc_nl_handle_dev_port(struct sk_buff *skb,
				  struct ib_device *ibdev,
				  struct smc_ib_device *smcibdev,
				  int port)
{
	char smc_pnet[SMC_MAX_PNETID_LEN + 1];
	struct nlattr *port_attrs;
	unsigned char port_state;
	int lnk_count = 0;

	port_attrs = nla_nest_start(skb, SMC_NLA_DEV_PORT + port);
	if (!port_attrs)
		goto errout;

	if (nla_put_u8(skb, SMC_NLA_DEV_PORT_PNET_USR,
		       smcibdev->pnetid_by_user[port]))
		goto errattr;
	snprintf(smc_pnet, sizeof(smc_pnet), "%s",
		 (char *)&smcibdev->pnetid[port]);
	if (nla_put_string(skb, SMC_NLA_DEV_PORT_PNETID, smc_pnet))
		goto errattr;
	if (nla_put_u32(skb, SMC_NLA_DEV_PORT_NETDEV,
			smcibdev->ndev_ifidx[port]))
		goto errattr;
	if (nla_put_u8(skb, SMC_NLA_DEV_PORT_VALID, 1))
		goto errattr;
	port_state = smc_ib_port_active(smcibdev, port + 1);
	if (nla_put_u8(skb, SMC_NLA_DEV_PORT_STATE, port_state))
		goto errattr;
	lnk_count = atomic_read(&smcibdev->lnk_cnt_by_port[port]);
	if (nla_put_u32(skb, SMC_NLA_DEV_PORT_LNK_CNT, lnk_count))
		goto errattr;
	nla_nest_end(skb, port_attrs);
	return 0;
errattr:
	nla_nest_cancel(skb, port_attrs);
errout:
	return -EMSGSIZE;
}

/* fill one netlink message describing an SMC-R ib device */
static int smc_nl_handle_smcr_dev(struct smc_ib_device *smcibdev,
				  struct sk_buff *skb,
				  struct netlink_callback *cb)
{
	char smc_ibname[IB_DEVICE_NAME_MAX + 1];
	struct smc_pci_dev smc_pci_dev;
	struct pci_dev *pci_dev;
	unsigned char is_crit;
	struct nlattr *attrs;
	void *nlh;
	int i;

	nlh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
			  &smc_gen_nl_family, NLM_F_MULTI,
			  SMC_NETLINK_GET_DEV_SMCR);
	if (!nlh)
		goto errmsg;
	attrs = nla_nest_start(skb, SMC_GEN_DEV_SMCR);
	if (!attrs)
		goto errout;
	is_crit = smcr_diag_is_dev_critical(&smc_lgr_list, smcibdev);
	if (nla_put_u8(skb, SMC_NLA_DEV_IS_CRIT, is_crit))
		goto errattr;
	memset(&smc_pci_dev, 0, sizeof(smc_pci_dev));
	pci_dev = to_pci_dev(smcibdev->ibdev->dev.parent);
	smc_set_pci_values(pci_dev, &smc_pci_dev);
	if (nla_put_u32(skb, SMC_NLA_DEV_PCI_FID, smc_pci_dev.pci_fid))
		goto errattr;
	if (nla_put_u16(skb, SMC_NLA_DEV_PCI_CHID, smc_pci_dev.pci_pchid))
		goto errattr;
	if (nla_put_u16(skb, SMC_NLA_DEV_PCI_VENDOR, smc_pci_dev.pci_vendor))
		goto errattr;
	if (nla_put_u16(skb, SMC_NLA_DEV_PCI_DEVICE, smc_pci_dev.pci_device))
		goto errattr;
	if (nla_put_string(skb, SMC_NLA_DEV_PCI_ID, smc_pci_dev.pci_id))
		goto errattr;
	snprintf(smc_ibname, sizeof(smc_ibname), "%s", smcibdev->ibdev->name);
	if (nla_put_string(skb, SMC_NLA_DEV_IB_NAME, smc_ibname))
		goto errattr;
	for (i = 1; i <= SMC_MAX_PORTS; i++) {
		if (!rdma_is_port_valid(smcibdev->ibdev, i))
			continue;
		if (smc_nl_handle_dev_port(skb, smcibdev->ibdev,
					   smcibdev, i - 1))
			goto errattr;
	}

	nla_nest_end(skb, attrs);
	genlmsg_end(skb, nlh);
	return 0;

errattr:
	nla_nest_cancel(skb, attrs);
errout:
	genlmsg_cancel(skb, nlh);
errmsg:
	return -EMSGSIZE;
}

/* walk the list of smc-registered ib devices, continuing the dump at the
 * position stored in the netlink callback context
 */
static void smc_nl_prep_smcr_dev(struct smc_ib_devices *dev_list,
				 struct sk_buff *skb,
				 struct netlink_callback *cb)
{
	struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb);
	struct smc_ib_device *smcibdev;
	int snum = cb_ctx->pos[0];
	int num = 0;

	mutex_lock(&dev_list->mutex);
	list_for_each_entry(smcibdev, &dev_list->list, list) {
		if (num < snum)
			goto next;
		if (smc_nl_handle_smcr_dev(smcibdev, skb, cb))
			goto errout;
next:
		num++;
	}
errout:
	mutex_unlock(&dev_list->mutex);
	cb_ctx->pos[0] = num;
}

int smcr_nl_get_device(struct sk_buff *skb, struct netlink_callback *cb)
{
	smc_nl_prep_smcr_dev(&smc_ib_devices, skb, cb);
	return skb->len;
}

static void smc_ib_qp_event_handler(struct ib_event *ibevent, void *priv)
{
	struct smc_link *lnk = (struct smc_link *)priv;
	struct smc_ib_device *smcibdev = lnk->smcibdev;
	u8 port_idx;

	switch (ibevent->event) {
	case IB_EVENT_QP_FATAL:
	case IB_EVENT_QP_ACCESS_ERR:
		port_idx = ibevent->element.qp->port - 1;
		if (port_idx >= SMC_MAX_PORTS)
			break;
		set_bit(port_idx, &smcibdev->port_event_mask);
		if (!test_and_set_bit(port_idx, smcibdev->ports_going_away))
			schedule_work(&smcibdev->port_event_work);
		break;
	default:
		break;
	}
}

void smc_ib_destroy_queue_pair(struct smc_link *lnk)
{
	if (lnk->roce_qp)
		ib_destroy_qp(lnk->roce_qp);
	lnk->roce_qp = NULL;
}

/* create a queue pair within the protection domain for a link */
int smc_ib_create_queue_pair(struct smc_link *lnk)
{
	struct ib_qp_init_attr qp_attr = {
		.event_handler = smc_ib_qp_event_handler,
		.qp_context = lnk,
		.send_cq = lnk->smcibdev->roce_cq_send,
		.recv_cq = lnk->smcibdev->roce_cq_recv,
		.srq = NULL,
		.cap = {
				/* include unsolicited rdma_writes as well,
				 * there are max. 2 RDMA_WRITE per 1 WR_SEND
				 */
			.max_send_wr = SMC_WR_BUF_CNT * 3,
			.max_recv_wr = SMC_WR_BUF_CNT * 3,
			.max_send_sge = SMC_IB_MAX_SEND_SGE,
			.max_recv_sge = 1,
		},
		.sq_sig_type = IB_SIGNAL_REQ_WR,
		.qp_type = IB_QPT_RC,
	};
	int rc;

	lnk->roce_qp = ib_create_qp(lnk->roce_pd, &qp_attr);
	rc = PTR_ERR_OR_ZERO(lnk->roce_qp);
	if (IS_ERR(lnk->roce_qp))
		lnk->roce_qp = NULL;
	else
		smc_wr_remember_qp_attr(lnk);
	return rc;
}

void smc_ib_put_memory_region(struct ib_mr *mr)
{
	ib_dereg_mr(mr);
}

static int smc_ib_map_mr_sg(struct smc_buf_desc *buf_slot, u8 link_idx)
{
	unsigned int offset = 0;
	int sg_num;

	/* map the largest prefix of a dma mapped SG list */
	sg_num = ib_map_mr_sg(buf_slot->mr_rx[link_idx],
			      buf_slot->sgt[link_idx].sgl,
			      buf_slot->sgt[link_idx].orig_nents,
			      &offset, PAGE_SIZE);

	return sg_num;
}

/* Allocate a memory region and map the dma mapped SG list of buf_slot */
int smc_ib_get_memory_region(struct ib_pd *pd, int access_flags,
			     struct smc_buf_desc *buf_slot, u8 link_idx)
{
	if (buf_slot->mr_rx[link_idx])
		return 0; /* already done */

	buf_slot->mr_rx[link_idx] =
		ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, 1 << buf_slot->order);
	if (IS_ERR(buf_slot->mr_rx[link_idx])) {
		int rc;

		rc = PTR_ERR(buf_slot->mr_rx[link_idx]);
		buf_slot->mr_rx[link_idx] = NULL;
		return rc;
	}

	if (smc_ib_map_mr_sg(buf_slot, link_idx) != 1)
		return -EINVAL;

	return 0;
}

/* synchronize buffer usage for cpu access */
void smc_ib_sync_sg_for_cpu(struct smc_link *lnk,
			    struct smc_buf_desc *buf_slot,
			    enum dma_data_direction data_direction)
{
	struct scatterlist *sg;
	unsigned int i;

	/* for now there is just one DMA address */
	for_each_sg(buf_slot->sgt[lnk->link_idx].sgl, sg,
		    buf_slot->sgt[lnk->link_idx].nents, i) {
		if (!sg_dma_len(sg))
			break;
		ib_dma_sync_single_for_cpu(lnk->smcibdev->ibdev,
					   sg_dma_address(sg),
					   sg_dma_len(sg),
					   data_direction);
	}
}

/* synchronize buffer usage for device access */
void smc_ib_sync_sg_for_device(struct smc_link *lnk,
			       struct smc_buf_desc *buf_slot,
			       enum dma_data_direction data_direction)
{
	struct scatterlist *sg;
	unsigned int i;

	/* for now there is just one DMA address */
	for_each_sg(buf_slot->sgt[lnk->link_idx].sgl, sg,
		    buf_slot->sgt[lnk->link_idx].nents, i) {
		if (!sg_dma_len(sg))
			break;
		ib_dma_sync_single_for_device(lnk->smcibdev->ibdev,
					      sg_dma_address(sg),
					      sg_dma_len(sg),
					      data_direction);
	}
}

/* Map a new TX or RX buffer SG-table to DMA */
int smc_ib_buf_map_sg(struct smc_link *lnk,
		      struct smc_buf_desc *buf_slot,
		      enum dma_data_direction data_direction)
{
	int mapped_nents;

	mapped_nents = ib_dma_map_sg(lnk->smcibdev->ibdev,
				     buf_slot->sgt[lnk->link_idx].sgl,
				     buf_slot->sgt[lnk->link_idx].orig_nents,
				     data_direction);
	if (!mapped_nents)
		return -ENOMEM;

	return mapped_nents;
}

void smc_ib_buf_unmap_sg(struct smc_link *lnk,
			 struct smc_buf_desc *buf_slot,
			 enum dma_data_direction data_direction)
{
	if (!buf_slot->sgt[lnk->link_idx].sgl->dma_address)
		return; /* already unmapped */

	ib_dma_unmap_sg(lnk->smcibdev->ibdev,
			buf_slot->sgt[lnk->link_idx].sgl,
			buf_slot->sgt[lnk->link_idx].orig_nents,
			data_direction);
	buf_slot->sgt[lnk->link_idx].sgl->dma_address = 0;
}

/* create the send and receive completion queues of an ib device,
 * done once per device
 */
long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev)
{
	struct ib_cq_init_attr cqattr =	{
		.cqe = SMC_MAX_CQE, .comp_vector = 0 };
	int cqe_size_order, smc_order;
	long rc;

	mutex_lock(&smcibdev->mutex);
	rc = 0;
	if (smcibdev->initialized)
		goto out;
	/* the calculated number of cq entries fits to mlx5 cq allocation */
	cqe_size_order = cache_line_size() == 128 ? 7 : 6;
	smc_order = MAX_ORDER - cqe_size_order - 1;
	if (SMC_MAX_CQE + 2 > (0x00000001 << smc_order) * PAGE_SIZE)
		cqattr.cqe = (0x00000001 << smc_order) * PAGE_SIZE - 2;
	smcibdev->roce_cq_send = ib_create_cq(smcibdev->ibdev,
					      smc_wr_tx_cq_handler, NULL,
					      smcibdev, &cqattr);
	rc = PTR_ERR_OR_ZERO(smcibdev->roce_cq_send);
	if (IS_ERR(smcibdev->roce_cq_send)) {
		smcibdev->roce_cq_send = NULL;
		goto out;
	}
	smcibdev->roce_cq_recv = ib_create_cq(smcibdev->ibdev,
					      smc_wr_rx_cq_handler, NULL,
					      smcibdev, &cqattr);
	rc = PTR_ERR_OR_ZERO(smcibdev->roce_cq_recv);
	if (IS_ERR(smcibdev->roce_cq_recv)) {
		smcibdev->roce_cq_recv = NULL;
		goto err;
	}
	smc_wr_add_dev(smcibdev);
	smcibdev->initialized = 1;
	goto out;

err:
	ib_destroy_cq(smcibdev->roce_cq_send);
out:
	mutex_unlock(&smcibdev->mutex);
	return rc;
}

static void smc_ib_cleanup_per_ibdev(struct smc_ib_device *smcibdev)
{
	mutex_lock(&smcibdev->mutex);
	if (!smcibdev->initialized)
		goto out;
	smcibdev->initialized = 0;
	ib_destroy_cq(smcibdev->roce_cq_recv);
	ib_destroy_cq(smcibdev->roce_cq_send);
	smc_wr_remove_dev(smcibdev);
out:
	mutex_unlock(&smcibdev->mutex);
}

static struct ib_client smc_ib_client;

static void smc_copy_netdev_ifindex(struct smc_ib_device *smcibdev, int port)
{
	struct ib_device *ibdev = smcibdev->ibdev;
	struct net_device *ndev;

	if (!ibdev->ops.get_netdev)
		return;
	ndev = ibdev->ops.get_netdev(ibdev, port + 1);
	if (ndev) {
		smcibdev->ndev_ifidx[port] = ndev->ifindex;
		dev_put(ndev);
	}
}

void smc_ib_ndev_change(struct net_device *ndev, unsigned long event)
{
	struct smc_ib_device *smcibdev;
	struct ib_device *libdev;
	struct net_device *lndev;
	u8 port_cnt;
	int i;

	mutex_lock(&smc_ib_devices.mutex);
	list_for_each_entry(smcibdev, &smc_ib_devices.list, list) {
		port_cnt = smcibdev->ibdev->phys_port_cnt;
		for (i = 0; i < min_t(size_t, port_cnt, SMC_MAX_PORTS); i++) {
			libdev = smcibdev->ibdev;
			if (!libdev->ops.get_netdev)
				continue;
			lndev = libdev->ops.get_netdev(libdev, i + 1);
			if (lndev)
				dev_put(lndev);
			if (lndev != ndev)
				continue;
			if (event == NETDEV_REGISTER)
				smcibdev->ndev_ifidx[i] = ndev->ifindex;
			if (event == NETDEV_UNREGISTER)
				smcibdev->ndev_ifidx[i] = 0;
		}
	}
	mutex_unlock(&smc_ib_devices.mutex);
}

/* callback function for ib_register_client() */
static int smc_ib_add_dev(struct ib_device *ibdev)
{
	struct smc_ib_device *smcibdev;
	u8 port_cnt;
	int i;

	if (ibdev->node_type != RDMA_NODE_IB_CA)
		return -EOPNOTSUPP;

	smcibdev = kzalloc(sizeof(*smcibdev), GFP_KERNEL);
	if (!smcibdev)
		return -ENOMEM;

	smcibdev->ibdev = ibdev;
	INIT_WORK(&smcibdev->port_event_work, smc_ib_port_event_work);
	atomic_set(&smcibdev->lnk_cnt, 0);
	init_waitqueue_head(&smcibdev->lnks_deleted);
	mutex_init(&smcibdev->mutex);
	mutex_lock(&smc_ib_devices.mutex);
	list_add_tail(&smcibdev->list, &smc_ib_devices.list);
	mutex_unlock(&smc_ib_devices.mutex);
	ib_set_client_data(ibdev, &smc_ib_client, smcibdev);
	INIT_IB_EVENT_HANDLER(&smcibdev->event_handler, smcibdev->ibdev,
			      smc_ib_global_event_handler);
	ib_register_event_handler(&smcibdev->event_handler);

	/* trigger reading of the port attributes */
	port_cnt = smcibdev->ibdev->phys_port_cnt;
pr_warn_ratelimited("smc: adding ib device %s with port count %d\n", 789 smcibdev->ibdev->name, port_cnt); 790 for (i = 0; 791 i < min_t(size_t, port_cnt, SMC_MAX_PORTS); 792 i++) { 793 set_bit(i, &smcibdev->port_event_mask); 794 /* determine pnetids of the port */ 795 if (smc_pnetid_by_dev_port(ibdev->dev.parent, i, 796 smcibdev->pnetid[i])) 797 smc_pnetid_by_table_ib(smcibdev, i + 1); 798 smc_copy_netdev_ifindex(smcibdev, i); 799 pr_warn_ratelimited("smc: ib device %s port %d has pnetid " 800 "%.16s%s\n", 801 smcibdev->ibdev->name, i + 1, 802 smcibdev->pnetid[i], 803 smcibdev->pnetid_by_user[i] ? 804 " (user defined)" : 805 ""); 806 } 807 schedule_work(&smcibdev->port_event_work); 808 return 0; 809 } 810 811 /* callback function for ib_unregister_client() */ 812 static void smc_ib_remove_dev(struct ib_device *ibdev, void *client_data) 813 { 814 struct smc_ib_device *smcibdev = client_data; 815 816 mutex_lock(&smc_ib_devices.mutex); 817 list_del_init(&smcibdev->list); /* remove from smc_ib_devices */ 818 mutex_unlock(&smc_ib_devices.mutex); 819 pr_warn_ratelimited("smc: removing ib device %s\n", 820 smcibdev->ibdev->name); 821 smc_smcr_terminate_all(smcibdev); 822 smc_ib_cleanup_per_ibdev(smcibdev); 823 ib_unregister_event_handler(&smcibdev->event_handler); 824 cancel_work_sync(&smcibdev->port_event_work); 825 kfree(smcibdev); 826 } 827 828 static struct ib_client smc_ib_client = { 829 .name = "smc_ib", 830 .add = smc_ib_add_dev, 831 .remove = smc_ib_remove_dev, 832 }; 833 834 int __init smc_ib_register_client(void) 835 { 836 smc_ib_init_local_systemid(); 837 return ib_register_client(&smc_ib_client); 838 } 839 840 void smc_ib_unregister_client(void) 841 { 842 ib_unregister_client(&smc_ib_client); 843 } 844