// SPDX-License-Identifier: GPL-2.0
/*
 * Shared Memory Communications over RDMA (SMC-R) and RoCE
 *
 * IB infrastructure:
 * Establish SMC-R as an Infiniband Client to be notified about added and
 * removed IB devices of type RDMA.
 * Determine device and port characteristics for these IB devices.
 *
 * Copyright IBM Corp. 2016
 *
 * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
 */

#include <linux/etherdevice.h>
#include <linux/if_vlan.h>
#include <linux/random.h>
#include <linux/workqueue.h>
#include <linux/scatterlist.h>
#include <linux/wait.h>
#include <linux/mutex.h>
#include <linux/inetdevice.h>
#include <rdma/ib_verbs.h>
#include <rdma/ib_cache.h>

#include "smc_pnet.h"
#include "smc_ib.h"
#include "smc_core.h"
#include "smc_wr.h"
#include "smc.h"
#include "smc_netlink.h"

#define SMC_MAX_CQE 32766	/* max. # of completion queue elements */

#define SMC_QP_MIN_RNR_TIMER		5
#define SMC_QP_TIMEOUT			15 /* 4096 * 2 ** timeout usec */
#define SMC_QP_RETRY_CNT		7 /* 7: infinite */
#define SMC_QP_RNR_RETRY		7 /* 7: infinite */

struct smc_ib_devices smc_ib_devices = {	/* smc-registered ib devices */
	.mutex = __MUTEX_INITIALIZER(smc_ib_devices.mutex),
	.list = LIST_HEAD_INIT(smc_ib_devices.list),
};

u8 local_systemid[SMC_SYSTEMID_LEN];		/* unique system identifier */

static int smc_ib_modify_qp_init(struct smc_link *lnk)
{
	struct ib_qp_attr qp_attr;

	memset(&qp_attr, 0, sizeof(qp_attr));
	qp_attr.qp_state = IB_QPS_INIT;
	qp_attr.pkey_index = 0;
	qp_attr.port_num = lnk->ibport;
	qp_attr.qp_access_flags = IB_ACCESS_LOCAL_WRITE
				| IB_ACCESS_REMOTE_WRITE;
	return ib_modify_qp(lnk->roce_qp, &qp_attr,
			    IB_QP_STATE | IB_QP_PKEY_INDEX |
			    IB_QP_ACCESS_FLAGS | IB_QP_PORT);
}

static int smc_ib_modify_qp_rtr(struct smc_link *lnk)
{
	enum ib_qp_attr_mask qp_attr_mask =
		IB_QP_STATE | IB_QP_AV | IB_QP_PATH_MTU | IB_QP_DEST_QPN |
		IB_QP_RQ_PSN | IB_QP_MAX_DEST_RD_ATOMIC | IB_QP_MIN_RNR_TIMER;
	struct ib_qp_attr qp_attr;
	u8 hop_lim = 1;

	memset(&qp_attr, 0, sizeof(qp_attr));
	qp_attr.qp_state = IB_QPS_RTR;
	qp_attr.path_mtu = min(lnk->path_mtu, lnk->peer_mtu);
	qp_attr.ah_attr.type = RDMA_AH_ATTR_TYPE_ROCE;
	rdma_ah_set_port_num(&qp_attr.ah_attr, lnk->ibport);
	if (lnk->lgr->smc_version == SMC_V2 && lnk->lgr->uses_gateway)
		hop_lim = IPV6_DEFAULT_HOPLIMIT;
	rdma_ah_set_grh(&qp_attr.ah_attr, NULL, 0, lnk->sgid_index, hop_lim, 0);
	rdma_ah_set_dgid_raw(&qp_attr.ah_attr, lnk->peer_gid);
	if (lnk->lgr->smc_version == SMC_V2 && lnk->lgr->uses_gateway)
		memcpy(&qp_attr.ah_attr.roce.dmac, lnk->lgr->nexthop_mac,
		       sizeof(lnk->lgr->nexthop_mac));
	else
		memcpy(&qp_attr.ah_attr.roce.dmac, lnk->peer_mac,
		       sizeof(lnk->peer_mac));
	qp_attr.dest_qp_num = lnk->peer_qpn;
	qp_attr.rq_psn = lnk->peer_psn; /* starting receive packet seq # */
	qp_attr.max_dest_rd_atomic = 1; /* max # of resources for incoming
					 * requests
					 */
	qp_attr.min_rnr_timer = SMC_QP_MIN_RNR_TIMER;

	return ib_modify_qp(lnk->roce_qp, &qp_attr, qp_attr_mask);
}

int smc_ib_modify_qp_rts(struct smc_link *lnk)
{
	struct ib_qp_attr qp_attr;

	memset(&qp_attr, 0, sizeof(qp_attr));
	qp_attr.qp_state = IB_QPS_RTS;
	qp_attr.timeout = SMC_QP_TIMEOUT;	/* local ack timeout */
	qp_attr.retry_cnt = SMC_QP_RETRY_CNT;	/* retry count */
	qp_attr.rnr_retry = SMC_QP_RNR_RETRY;	/* RNR retries, 7=infinite */
	qp_attr.sq_psn = lnk->psn_initial;	/* starting send packet seq # */
	qp_attr.max_rd_atomic = 1;	/* # of outstanding RDMA reads and
					 * atomic ops allowed
					 */
	return ib_modify_qp(lnk->roce_qp, &qp_attr,
			    IB_QP_STATE | IB_QP_TIMEOUT | IB_QP_RETRY_CNT |
			    IB_QP_SQ_PSN | IB_QP_RNR_RETRY |
			    IB_QP_MAX_QP_RD_ATOMIC);
}

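/* move the QP into the error state so that all outstanding work requests
 * are flushed with an error completion
 */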
int smc_ib_modify_qp_error(struct smc_link *lnk)
{
	struct ib_qp_attr qp_attr;

	memset(&qp_attr, 0, sizeof(qp_attr));
	qp_attr.qp_state = IB_QPS_ERR;
	return ib_modify_qp(lnk->roce_qp, &qp_attr, IB_QP_STATE);
}

int smc_ib_ready_link(struct smc_link *lnk)
{
	struct smc_link_group *lgr = smc_get_lgr(lnk);
	int rc = 0;

	rc = smc_ib_modify_qp_init(lnk);
	if (rc)
		goto out;

	rc = smc_ib_modify_qp_rtr(lnk);
	if (rc)
		goto out;
	smc_wr_remember_qp_attr(lnk);
	rc = ib_req_notify_cq(lnk->smcibdev->roce_cq_recv,
			      IB_CQ_SOLICITED_MASK);
	if (rc)
		goto out;
	rc = smc_wr_rx_post_init(lnk);
	if (rc)
		goto out;
	smc_wr_remember_qp_attr(lnk);

	if (lgr->role == SMC_SERV) {
		rc = smc_ib_modify_qp_rts(lnk);
		if (rc)
			goto out;
		smc_wr_remember_qp_attr(lnk);
	}
out:
	return rc;
}

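/* determine the MAC address of an IB port from GID table entry 0 */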
static int smc_ib_fill_mac(struct smc_ib_device *smcibdev, u8 ibport)
{
	const struct ib_gid_attr *attr;
	int rc;

	attr = rdma_get_gid_attr(smcibdev->ibdev, ibport, 0);
	if (IS_ERR(attr))
		return -ENODEV;

	rc = rdma_read_gid_l2_fields(attr, NULL, smcibdev->mac[ibport - 1]);
	rdma_put_gid_attr(attr);
	return rc;
}

/* Create an identifier unique for this instance of SMC-R.
 * The MAC-address of the first active registered IB device
 * plus a random 2-byte number is used to create this identifier.
 * This name is delivered to the peer during connection initialization.
 */
static inline void smc_ib_define_local_systemid(struct smc_ib_device *smcibdev,
						u8 ibport)
{
	memcpy(&local_systemid[2], &smcibdev->mac[ibport - 1],
	       sizeof(smcibdev->mac[ibport - 1]));
}

bool smc_ib_is_valid_local_systemid(void)
{
	return !is_zero_ether_addr(&local_systemid[2]);
}

static void smc_ib_init_local_systemid(void)
{
	get_random_bytes(&local_systemid[0], 2);
}

bool smc_ib_port_active(struct smc_ib_device *smcibdev, u8 ibport)
{
	return smcibdev->pattr[ibport - 1].state == IB_PORT_ACTIVE;
}

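/* resolve the IPv4 route to daddr and return the next hop's MAC address
 * as well as whether the route goes via a gateway
 */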
int smc_ib_find_route(struct net *net, __be32 saddr, __be32 daddr,
		      u8 nexthop_mac[], u8 *uses_gateway)
{
	struct neighbour *neigh = NULL;
	struct rtable *rt = NULL;
	struct flowi4 fl4 = {
		.saddr = saddr,
		.daddr = daddr
	};

	if (daddr == cpu_to_be32(INADDR_NONE))
		goto out;
	rt = ip_route_output_flow(net, &fl4, NULL);
	if (IS_ERR(rt))
		goto out;
	if (rt->rt_uses_gateway && rt->rt_gw_family != AF_INET)
		goto out_rt;
	neigh = dst_neigh_lookup(&rt->dst, &fl4.daddr);
	if (!neigh)
		goto out_rt;
	memcpy(nexthop_mac, neigh->ha, ETH_ALEN);
	*uses_gateway = rt->rt_uses_gateway;
	neigh_release(neigh);
	ip_rt_put(rt);
	return 0;

out_rt:
	ip_rt_put(rt);
out:
	return -ENOENT;
}

static int smc_ib_determine_gid_rcu(const struct net_device *ndev,
				    const struct ib_gid_attr *attr,
				    u8 gid[], u8 *sgid_index,
				    struct smc_init_info_smcrv2 *smcrv2)
{
	if (!smcrv2 && attr->gid_type == IB_GID_TYPE_ROCE) {
		if (gid)
			memcpy(gid, &attr->gid, SMC_GID_SIZE);
		if (sgid_index)
			*sgid_index = attr->index;
		return 0;
	}
	if (smcrv2 && attr->gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP &&
	    smc_ib_gid_to_ipv4((u8 *)&attr->gid) != cpu_to_be32(INADDR_NONE)) {
		struct in_device *in_dev = __in_dev_get_rcu(ndev);
		struct net *net = dev_net(ndev);
		const struct in_ifaddr *ifa;
		bool subnet_match = false;

		if (!in_dev)
			goto out;
		in_dev_for_each_ifa_rcu(ifa, in_dev) {
			if (!inet_ifa_match(smcrv2->saddr, ifa))
				continue;
			subnet_match = true;
			break;
		}
		if (!subnet_match)
			goto out;
		if (smcrv2->daddr && smc_ib_find_route(net, smcrv2->saddr,
						       smcrv2->daddr,
						       smcrv2->nexthop_mac,
						       &smcrv2->uses_gateway))
			goto out;

		if (gid)
			memcpy(gid, &attr->gid, SMC_GID_SIZE);
		if (sgid_index)
			*sgid_index = attr->index;
		return 0;
	}
out:
	return -ENODEV;
}

/* determine the gid for an ib-device port and vlan id */
int smc_ib_determine_gid(struct smc_ib_device *smcibdev, u8 ibport,
			 unsigned short vlan_id, u8 gid[], u8 *sgid_index,
			 struct smc_init_info_smcrv2 *smcrv2)
{
	const struct ib_gid_attr *attr;
	const struct net_device *ndev;
	int i;

	for (i = 0; i < smcibdev->pattr[ibport - 1].gid_tbl_len; i++) {
		attr = rdma_get_gid_attr(smcibdev->ibdev, ibport, i);
		if (IS_ERR(attr))
			continue;

		rcu_read_lock();
		ndev = rdma_read_gid_attr_ndev_rcu(attr);
		if (!IS_ERR(ndev) &&
		    ((!vlan_id && !is_vlan_dev(ndev)) ||
		     (vlan_id && is_vlan_dev(ndev) &&
		      vlan_dev_vlan_id(ndev) == vlan_id))) {
			if (!smc_ib_determine_gid_rcu(ndev, attr, gid,
						      sgid_index, smcrv2)) {
				rcu_read_unlock();
				rdma_put_gid_attr(attr);
				return 0;
			}
		}
		rcu_read_unlock();
		rdma_put_gid_attr(attr);
	}
	return -ENODEV;
}

/* check if gid is still defined on smcibdev */
static bool smc_ib_check_link_gid(u8 gid[SMC_GID_SIZE], bool smcrv2,
				  struct smc_ib_device *smcibdev, u8 ibport)
{
	const struct ib_gid_attr *attr;
	bool rc = false;
	int i;

	for (i = 0; !rc && i < smcibdev->pattr[ibport - 1].gid_tbl_len; i++) {
		attr = rdma_get_gid_attr(smcibdev->ibdev, ibport, i);
		if (IS_ERR(attr))
			continue;

		rcu_read_lock();
		if ((!smcrv2 && attr->gid_type == IB_GID_TYPE_ROCE) ||
		    (smcrv2 && attr->gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP &&
		     !(ipv6_addr_type((const struct in6_addr *)&attr->gid)
		       & IPV6_ADDR_LINKLOCAL)))
			if (!memcmp(gid, &attr->gid, SMC_GID_SIZE))
				rc = true;
		rcu_read_unlock();
		rdma_put_gid_attr(attr);
	}
	return rc;
}

/* check all links if the gid is still defined on smcibdev */
static void smc_ib_gid_check(struct smc_ib_device *smcibdev, u8 ibport)
{
	struct smc_link_group *lgr;
	int i;

	spin_lock_bh(&smc_lgr_list.lock);
	list_for_each_entry(lgr, &smc_lgr_list.list, list) {
		if (strncmp(smcibdev->pnetid[ibport - 1], lgr->pnet_id,
			    SMC_MAX_PNETID_LEN))
			continue; /* lgr is not affected */
		if (list_empty(&lgr->list))
			continue;
		for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
			if (lgr->lnk[i].state == SMC_LNK_UNUSED ||
			    lgr->lnk[i].smcibdev != smcibdev)
				continue;
			if (!smc_ib_check_link_gid(lgr->lnk[i].gid,
						   lgr->smc_version == SMC_V2,
						   smcibdev, ibport))
				smcr_port_err(smcibdev, ibport);
		}
	}
	spin_unlock_bh(&smc_lgr_list.lock);
}

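/* (re)read and cache the attributes and MAC address of an IB port; the
 * first active port seen also determines the local system identifier
 */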
static int smc_ib_remember_port_attr(struct smc_ib_device *smcibdev, u8 ibport)
{
	int rc;

	memset(&smcibdev->pattr[ibport - 1], 0,
	       sizeof(smcibdev->pattr[ibport - 1]));
	rc = ib_query_port(smcibdev->ibdev, ibport,
			   &smcibdev->pattr[ibport - 1]);
	if (rc)
		goto out;
	/* the SMC protocol requires specification of the RoCE MAC address */
	rc = smc_ib_fill_mac(smcibdev, ibport);
	if (rc)
		goto out;
	if (!smc_ib_is_valid_local_systemid() &&
	    smc_ib_port_active(smcibdev, ibport))
		/* create unique system identifier */
		smc_ib_define_local_systemid(smcibdev, ibport);
out:
	return rc;
}

/* process context wrapper for might_sleep smc_ib_remember_port_attr */
static void smc_ib_port_event_work(struct work_struct *work)
{
	struct smc_ib_device *smcibdev = container_of(
		work, struct smc_ib_device, port_event_work);
	u8 port_idx;

	for_each_set_bit(port_idx, &smcibdev->port_event_mask, SMC_MAX_PORTS) {
		smc_ib_remember_port_attr(smcibdev, port_idx + 1);
		clear_bit(port_idx, &smcibdev->port_event_mask);
		if (!smc_ib_port_active(smcibdev, port_idx + 1)) {
			set_bit(port_idx, smcibdev->ports_going_away);
			smcr_port_err(smcibdev, port_idx + 1);
		} else {
			clear_bit(port_idx, smcibdev->ports_going_away);
			smcr_port_add(smcibdev, port_idx + 1);
			smc_ib_gid_check(smcibdev, port_idx + 1);
		}
	}
}

/* can be called in IRQ context */
static void smc_ib_global_event_handler(struct ib_event_handler *handler,
					struct ib_event *ibevent)
{
	struct smc_ib_device *smcibdev;
	bool schedule = false;
	u8 port_idx;

	smcibdev = container_of(handler, struct smc_ib_device, event_handler);

	switch (ibevent->event) {
	case IB_EVENT_DEVICE_FATAL:
		/* terminate all ports on device */
		for (port_idx = 0; port_idx < SMC_MAX_PORTS; port_idx++) {
			set_bit(port_idx, &smcibdev->port_event_mask);
			if (!test_and_set_bit(port_idx,
					      smcibdev->ports_going_away))
				schedule = true;
		}
		if (schedule)
			schedule_work(&smcibdev->port_event_work);
		break;
	case IB_EVENT_PORT_ACTIVE:
		port_idx = ibevent->element.port_num - 1;
		if (port_idx >= SMC_MAX_PORTS)
			break;
		set_bit(port_idx, &smcibdev->port_event_mask);
		if (test_and_clear_bit(port_idx, smcibdev->ports_going_away))
			schedule_work(&smcibdev->port_event_work);
		break;
	case IB_EVENT_PORT_ERR:
		port_idx = ibevent->element.port_num - 1;
		if (port_idx >= SMC_MAX_PORTS)
			break;
		set_bit(port_idx, &smcibdev->port_event_mask);
		if (!test_and_set_bit(port_idx, smcibdev->ports_going_away))
			schedule_work(&smcibdev->port_event_work);
		break;
	case IB_EVENT_GID_CHANGE:
		port_idx = ibevent->element.port_num - 1;
		if (port_idx >= SMC_MAX_PORTS)
			break;
		set_bit(port_idx, &smcibdev->port_event_mask);
		schedule_work(&smcibdev->port_event_work);
		break;
	default:
		break;
	}
}

void smc_ib_dealloc_protection_domain(struct smc_link *lnk)
{
	if (lnk->roce_pd)
		ib_dealloc_pd(lnk->roce_pd);
	lnk->roce_pd = NULL;
}

int smc_ib_create_protection_domain(struct smc_link *lnk)
{
	int rc;

	lnk->roce_pd = ib_alloc_pd(lnk->smcibdev->ibdev, 0);
	rc = PTR_ERR_OR_ZERO(lnk->roce_pd);
	if (IS_ERR(lnk->roce_pd))
		lnk->roce_pd = NULL;
	return rc;
}

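/* check whether any link group still depends on this IB device as its only
 * usable device, i.e. runs as a single or locally asymmetric link group
 */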
static bool smcr_diag_is_dev_critical(struct smc_lgr_list *smc_lgr,
				      struct smc_ib_device *smcibdev)
{
	struct smc_link_group *lgr;
	bool rc = false;
	int i;

	spin_lock_bh(&smc_lgr->lock);
	list_for_each_entry(lgr, &smc_lgr->list, list) {
		if (lgr->is_smcd)
			continue;
		for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
			if (lgr->lnk[i].state == SMC_LNK_UNUSED ||
			    lgr->lnk[i].smcibdev != smcibdev)
				continue;
			if (lgr->type == SMC_LGR_SINGLE ||
			    lgr->type == SMC_LGR_ASYMMETRIC_LOCAL) {
				rc = true;
				goto out;
			}
		}
	}
out:
	spin_unlock_bh(&smc_lgr->lock);
	return rc;
}

static int smc_nl_handle_dev_port(struct sk_buff *skb,
				  struct ib_device *ibdev,
				  struct smc_ib_device *smcibdev,
				  int port)
{
	char smc_pnet[SMC_MAX_PNETID_LEN + 1];
	struct nlattr *port_attrs;
	unsigned char port_state;
	int lnk_count = 0;

	port_attrs = nla_nest_start(skb, SMC_NLA_DEV_PORT + port);
	if (!port_attrs)
		goto errout;

	if (nla_put_u8(skb, SMC_NLA_DEV_PORT_PNET_USR,
		       smcibdev->pnetid_by_user[port]))
		goto errattr;
	memcpy(smc_pnet, &smcibdev->pnetid[port], SMC_MAX_PNETID_LEN);
	smc_pnet[SMC_MAX_PNETID_LEN] = 0;
	if (nla_put_string(skb, SMC_NLA_DEV_PORT_PNETID, smc_pnet))
		goto errattr;
	if (nla_put_u32(skb, SMC_NLA_DEV_PORT_NETDEV,
			smcibdev->ndev_ifidx[port]))
		goto errattr;
	if (nla_put_u8(skb, SMC_NLA_DEV_PORT_VALID, 1))
		goto errattr;
	port_state = smc_ib_port_active(smcibdev, port + 1);
	if (nla_put_u8(skb, SMC_NLA_DEV_PORT_STATE, port_state))
		goto errattr;
	lnk_count = atomic_read(&smcibdev->lnk_cnt_by_port[port]);
	if (nla_put_u32(skb, SMC_NLA_DEV_PORT_LNK_CNT, lnk_count))
		goto errattr;
	nla_nest_end(skb, port_attrs);
	return 0;
errattr:
	nla_nest_cancel(skb, port_attrs);
errout:
	return -EMSGSIZE;
}

static bool smc_nl_handle_pci_values(const struct smc_pci_dev *smc_pci_dev,
				     struct sk_buff *skb)
{
	if (nla_put_u32(skb, SMC_NLA_DEV_PCI_FID, smc_pci_dev->pci_fid))
		return false;
	if (nla_put_u16(skb, SMC_NLA_DEV_PCI_CHID, smc_pci_dev->pci_pchid))
		return false;
	if (nla_put_u16(skb, SMC_NLA_DEV_PCI_VENDOR, smc_pci_dev->pci_vendor))
		return false;
	if (nla_put_u16(skb, SMC_NLA_DEV_PCI_DEVICE, smc_pci_dev->pci_device))
		return false;
	if (nla_put_string(skb, SMC_NLA_DEV_PCI_ID, smc_pci_dev->pci_id))
		return false;
	return true;
}

static int smc_nl_handle_smcr_dev(struct smc_ib_device *smcibdev,
				  struct sk_buff *skb,
				  struct netlink_callback *cb)
{
	char smc_ibname[IB_DEVICE_NAME_MAX];
	struct smc_pci_dev smc_pci_dev;
	struct pci_dev *pci_dev;
	unsigned char is_crit;
	struct nlattr *attrs;
	void *nlh;
	int i;

	nlh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
			  &smc_gen_nl_family, NLM_F_MULTI,
			  SMC_NETLINK_GET_DEV_SMCR);
	if (!nlh)
		goto errmsg;
	attrs = nla_nest_start(skb, SMC_GEN_DEV_SMCR);
	if (!attrs)
		goto errout;
	is_crit = smcr_diag_is_dev_critical(&smc_lgr_list, smcibdev);
	if (nla_put_u8(skb, SMC_NLA_DEV_IS_CRIT, is_crit))
		goto errattr;
	if (smcibdev->ibdev->dev.parent) {
		memset(&smc_pci_dev, 0, sizeof(smc_pci_dev));
		pci_dev = to_pci_dev(smcibdev->ibdev->dev.parent);
		smc_set_pci_values(pci_dev, &smc_pci_dev);
		if (!smc_nl_handle_pci_values(&smc_pci_dev, skb))
			goto errattr;
	}
	snprintf(smc_ibname, sizeof(smc_ibname), "%s", smcibdev->ibdev->name);
	if (nla_put_string(skb, SMC_NLA_DEV_IB_NAME, smc_ibname))
		goto errattr;
	for (i = 1; i <= SMC_MAX_PORTS; i++) {
		if (!rdma_is_port_valid(smcibdev->ibdev, i))
			continue;
		if (smc_nl_handle_dev_port(skb, smcibdev->ibdev,
					   smcibdev, i - 1))
			goto errattr;
	}

	nla_nest_end(skb, attrs);
	genlmsg_end(skb, nlh);
	return 0;

errattr:
	nla_nest_cancel(skb, attrs);
errout:
	genlmsg_cancel(skb, nlh);
errmsg:
	return -EMSGSIZE;
}

static void smc_nl_prep_smcr_dev(struct smc_ib_devices *dev_list,
				 struct sk_buff *skb,
				 struct netlink_callback *cb)
{
	struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb);
	struct smc_ib_device *smcibdev;
	int snum = cb_ctx->pos[0];
	int num = 0;

	mutex_lock(&dev_list->mutex);
	list_for_each_entry(smcibdev, &dev_list->list, list) {
		if (num < snum)
			goto next;
		if (smc_nl_handle_smcr_dev(smcibdev, skb, cb))
			goto errout;
next:
		num++;
	}
errout:
	mutex_unlock(&dev_list->mutex);
	cb_ctx->pos[0] = num;
}

int smcr_nl_get_device(struct sk_buff *skb, struct netlink_callback *cb)
{
	smc_nl_prep_smcr_dev(&smc_ib_devices, skb, cb);
	return skb->len;
}

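/* QP event callback: treat fatal QP errors like a failing port and hand
 * the affected port to the port event worker
 */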
static void smc_ib_qp_event_handler(struct ib_event *ibevent, void *priv)
{
	struct smc_link *lnk = (struct smc_link *)priv;
	struct smc_ib_device *smcibdev = lnk->smcibdev;
	u8 port_idx;

	switch (ibevent->event) {
	case IB_EVENT_QP_FATAL:
	case IB_EVENT_QP_ACCESS_ERR:
		port_idx = ibevent->element.qp->port - 1;
		if (port_idx >= SMC_MAX_PORTS)
			break;
		set_bit(port_idx, &smcibdev->port_event_mask);
		if (!test_and_set_bit(port_idx, smcibdev->ports_going_away))
			schedule_work(&smcibdev->port_event_work);
		break;
	default:
		break;
	}
}

void smc_ib_destroy_queue_pair(struct smc_link *lnk)
{
	if (lnk->roce_qp)
		ib_destroy_qp(lnk->roce_qp);
	lnk->roce_qp = NULL;
}

/* create a queue pair within the protection domain for a link */
int smc_ib_create_queue_pair(struct smc_link *lnk)
{
	int sges_per_buf = (lnk->lgr->smc_version == SMC_V2) ? 2 : 1;
	struct ib_qp_init_attr qp_attr = {
		.event_handler = smc_ib_qp_event_handler,
		.qp_context = lnk,
		.send_cq = lnk->smcibdev->roce_cq_send,
		.recv_cq = lnk->smcibdev->roce_cq_recv,
		.srq = NULL,
		.cap = {
				/* include unsolicited rdma_writes as well,
				 * there are max. 2 RDMA_WRITE per 1 WR_SEND
				 */
			.max_send_wr = SMC_WR_BUF_CNT * 3,
			.max_recv_wr = SMC_WR_BUF_CNT * 3,
			.max_send_sge = SMC_IB_MAX_SEND_SGE,
			.max_recv_sge = sges_per_buf,
			.max_inline_data = 0,
		},
		.sq_sig_type = IB_SIGNAL_REQ_WR,
		.qp_type = IB_QPT_RC,
	};
	int rc;

	lnk->roce_qp = ib_create_qp(lnk->roce_pd, &qp_attr);
	rc = PTR_ERR_OR_ZERO(lnk->roce_qp);
	if (IS_ERR(lnk->roce_qp))
		lnk->roce_qp = NULL;
	else
		smc_wr_remember_qp_attr(lnk);
	return rc;
}

void smc_ib_put_memory_region(struct ib_mr *mr)
{
	ib_dereg_mr(mr);
}

static int smc_ib_map_mr_sg(struct smc_buf_desc *buf_slot, u8 link_idx)
{
	unsigned int offset = 0;
	int sg_num;

	/* map the largest prefix of a dma mapped SG list */
	sg_num = ib_map_mr_sg(buf_slot->mr[link_idx],
			      buf_slot->sgt[link_idx].sgl,
			      buf_slot->sgt[link_idx].orig_nents,
			      &offset, PAGE_SIZE);

	return sg_num;
}

/* Allocate a memory region and map the dma mapped SG list of buf_slot */
int smc_ib_get_memory_region(struct ib_pd *pd, int access_flags,
			     struct smc_buf_desc *buf_slot, u8 link_idx)
{
	if (buf_slot->mr[link_idx])
		return 0; /* already done */

	buf_slot->mr[link_idx] =
		ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, 1 << buf_slot->order);
	if (IS_ERR(buf_slot->mr[link_idx])) {
		int rc;

		rc = PTR_ERR(buf_slot->mr[link_idx]);
		buf_slot->mr[link_idx] = NULL;
		return rc;
	}

	if (smc_ib_map_mr_sg(buf_slot, link_idx) !=
	    buf_slot->sgt[link_idx].orig_nents)
		return -EINVAL;

	return 0;
}

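/* check whether any DMA address in the buffer's SG list requires explicit
 * syncing between CPU and device access
 */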
bool smc_ib_is_sg_need_sync(struct smc_link *lnk,
			    struct smc_buf_desc *buf_slot)
{
	struct scatterlist *sg;
	unsigned int i;
	bool ret = false;

	/* for now there is just one DMA address */
	for_each_sg(buf_slot->sgt[lnk->link_idx].sgl, sg,
		    buf_slot->sgt[lnk->link_idx].nents, i) {
		if (!sg_dma_len(sg))
			break;
		if (dma_need_sync(lnk->smcibdev->ibdev->dma_device,
				  sg_dma_address(sg))) {
			ret = true;
			goto out;
		}
	}

out:
	return ret;
}

/* synchronize buffer usage for cpu access */
void smc_ib_sync_sg_for_cpu(struct smc_link *lnk,
			    struct smc_buf_desc *buf_slot,
			    enum dma_data_direction data_direction)
{
	struct scatterlist *sg;
	unsigned int i;

	if (!(buf_slot->is_dma_need_sync & (1U << lnk->link_idx)))
		return;

	/* for now there is just one DMA address */
	for_each_sg(buf_slot->sgt[lnk->link_idx].sgl, sg,
		    buf_slot->sgt[lnk->link_idx].nents, i) {
		if (!sg_dma_len(sg))
			break;
		ib_dma_sync_single_for_cpu(lnk->smcibdev->ibdev,
					   sg_dma_address(sg),
					   sg_dma_len(sg),
					   data_direction);
	}
}

/* synchronize buffer usage for device access */
void smc_ib_sync_sg_for_device(struct smc_link *lnk,
			       struct smc_buf_desc *buf_slot,
			       enum dma_data_direction data_direction)
{
	struct scatterlist *sg;
	unsigned int i;

	if (!(buf_slot->is_dma_need_sync & (1U << lnk->link_idx)))
		return;

	/* for now there is just one DMA address */
	for_each_sg(buf_slot->sgt[lnk->link_idx].sgl, sg,
		    buf_slot->sgt[lnk->link_idx].nents, i) {
		if (!sg_dma_len(sg))
			break;
		ib_dma_sync_single_for_device(lnk->smcibdev->ibdev,
					      sg_dma_address(sg),
					      sg_dma_len(sg),
					      data_direction);
	}
}

/* Map a new TX or RX buffer SG-table to DMA */
int smc_ib_buf_map_sg(struct smc_link *lnk,
		      struct smc_buf_desc *buf_slot,
		      enum dma_data_direction data_direction)
{
	int mapped_nents;

	mapped_nents = ib_dma_map_sg(lnk->smcibdev->ibdev,
				     buf_slot->sgt[lnk->link_idx].sgl,
				     buf_slot->sgt[lnk->link_idx].orig_nents,
				     data_direction);
	if (!mapped_nents)
		return -ENOMEM;

	return mapped_nents;
}

void smc_ib_buf_unmap_sg(struct smc_link *lnk,
			 struct smc_buf_desc *buf_slot,
			 enum dma_data_direction data_direction)
{
	if (!buf_slot->sgt[lnk->link_idx].sgl->dma_address)
		return; /* already unmapped */

	ib_dma_unmap_sg(lnk->smcibdev->ibdev,
			buf_slot->sgt[lnk->link_idx].sgl,
			buf_slot->sgt[lnk->link_idx].orig_nents,
			data_direction);
	buf_slot->sgt[lnk->link_idx].sgl->dma_address = 0;
}

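/* one-time per-device initialization: create the send and receive
 * completion queues and register the device with the work request layer
 */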
long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev)
{
	struct ib_cq_init_attr cqattr = {
		.cqe = SMC_MAX_CQE, .comp_vector = 0 };
	int cqe_size_order, smc_order;
	long rc;

	mutex_lock(&smcibdev->mutex);
	rc = 0;
	if (smcibdev->initialized)
		goto out;
	/* the calculated number of cq entries fits to mlx5 cq allocation */
	cqe_size_order = cache_line_size() == 128 ? 7 : 6;
	smc_order = MAX_ORDER - cqe_size_order;
	if (SMC_MAX_CQE + 2 > (0x00000001 << smc_order) * PAGE_SIZE)
		cqattr.cqe = (0x00000001 << smc_order) * PAGE_SIZE - 2;
	smcibdev->roce_cq_send = ib_create_cq(smcibdev->ibdev,
					      smc_wr_tx_cq_handler, NULL,
					      smcibdev, &cqattr);
	rc = PTR_ERR_OR_ZERO(smcibdev->roce_cq_send);
	if (IS_ERR(smcibdev->roce_cq_send)) {
		smcibdev->roce_cq_send = NULL;
		goto out;
	}
	smcibdev->roce_cq_recv = ib_create_cq(smcibdev->ibdev,
					      smc_wr_rx_cq_handler, NULL,
					      smcibdev, &cqattr);
	rc = PTR_ERR_OR_ZERO(smcibdev->roce_cq_recv);
	if (IS_ERR(smcibdev->roce_cq_recv)) {
		smcibdev->roce_cq_recv = NULL;
		goto err;
	}
	smc_wr_add_dev(smcibdev);
	smcibdev->initialized = 1;
	goto out;

err:
	ib_destroy_cq(smcibdev->roce_cq_send);
out:
	mutex_unlock(&smcibdev->mutex);
	return rc;
}

static void smc_ib_cleanup_per_ibdev(struct smc_ib_device *smcibdev)
{
	mutex_lock(&smcibdev->mutex);
	if (!smcibdev->initialized)
		goto out;
	smcibdev->initialized = 0;
	ib_destroy_cq(smcibdev->roce_cq_recv);
	ib_destroy_cq(smcibdev->roce_cq_send);
	smc_wr_remove_dev(smcibdev);
out:
	mutex_unlock(&smcibdev->mutex);
}

static struct ib_client smc_ib_client;

static void smc_copy_netdev_ifindex(struct smc_ib_device *smcibdev, int port)
{
	struct ib_device *ibdev = smcibdev->ibdev;
	struct net_device *ndev;

	if (!ibdev->ops.get_netdev)
		return;
	ndev = ibdev->ops.get_netdev(ibdev, port + 1);
	if (ndev) {
		smcibdev->ndev_ifidx[port] = ndev->ifindex;
		dev_put(ndev);
	}
}

void smc_ib_ndev_change(struct net_device *ndev, unsigned long event)
{
	struct smc_ib_device *smcibdev;
	struct ib_device *libdev;
	struct net_device *lndev;
	u8 port_cnt;
	int i;

	mutex_lock(&smc_ib_devices.mutex);
	list_for_each_entry(smcibdev, &smc_ib_devices.list, list) {
		port_cnt = smcibdev->ibdev->phys_port_cnt;
		for (i = 0; i < min_t(size_t, port_cnt, SMC_MAX_PORTS); i++) {
			libdev = smcibdev->ibdev;
			if (!libdev->ops.get_netdev)
				continue;
			lndev = libdev->ops.get_netdev(libdev, i + 1);
			dev_put(lndev);
			if (lndev != ndev)
				continue;
			if (event == NETDEV_REGISTER)
				smcibdev->ndev_ifidx[i] = ndev->ifindex;
			if (event == NETDEV_UNREGISTER)
				smcibdev->ndev_ifidx[i] = 0;
		}
	}
	mutex_unlock(&smc_ib_devices.mutex);
}

/* callback function for ib_register_client() */
static int smc_ib_add_dev(struct ib_device *ibdev)
{
	struct smc_ib_device *smcibdev;
	u8 port_cnt;
	int i;

	if (ibdev->node_type != RDMA_NODE_IB_CA)
		return -EOPNOTSUPP;

	smcibdev = kzalloc(sizeof(*smcibdev), GFP_KERNEL);
	if (!smcibdev)
		return -ENOMEM;

	smcibdev->ibdev = ibdev;
	INIT_WORK(&smcibdev->port_event_work, smc_ib_port_event_work);
	atomic_set(&smcibdev->lnk_cnt, 0);
	init_waitqueue_head(&smcibdev->lnks_deleted);
	mutex_init(&smcibdev->mutex);
	mutex_lock(&smc_ib_devices.mutex);
	list_add_tail(&smcibdev->list, &smc_ib_devices.list);
	mutex_unlock(&smc_ib_devices.mutex);
	ib_set_client_data(ibdev, &smc_ib_client, smcibdev);
	INIT_IB_EVENT_HANDLER(&smcibdev->event_handler, smcibdev->ibdev,
			      smc_ib_global_event_handler);
	ib_register_event_handler(&smcibdev->event_handler);

	/* trigger reading of the port attributes */
	port_cnt = smcibdev->ibdev->phys_port_cnt;
	pr_warn_ratelimited("smc: adding ib device %s with port count %d\n",
			    smcibdev->ibdev->name, port_cnt);
	for (i = 0;
	     i < min_t(size_t, port_cnt, SMC_MAX_PORTS);
	     i++) {
		set_bit(i, &smcibdev->port_event_mask);
		/* determine pnetids of the port */
		if (smc_pnetid_by_dev_port(ibdev->dev.parent, i,
					   smcibdev->pnetid[i]))
			smc_pnetid_by_table_ib(smcibdev, i + 1);
		smc_copy_netdev_ifindex(smcibdev, i);
		pr_warn_ratelimited("smc: ib device %s port %d has pnetid "
				    "%.16s%s\n",
				    smcibdev->ibdev->name, i + 1,
				    smcibdev->pnetid[i],
				    smcibdev->pnetid_by_user[i] ?
				     " (user defined)" :
				     "");
	}
	schedule_work(&smcibdev->port_event_work);
	return 0;
}

/* callback function for ib_unregister_client() */
static void smc_ib_remove_dev(struct ib_device *ibdev, void *client_data)
{
	struct smc_ib_device *smcibdev = client_data;

	mutex_lock(&smc_ib_devices.mutex);
	list_del_init(&smcibdev->list); /* remove from smc_ib_devices */
	mutex_unlock(&smc_ib_devices.mutex);
	pr_warn_ratelimited("smc: removing ib device %s\n",
			    smcibdev->ibdev->name);
	smc_smcr_terminate_all(smcibdev);
	smc_ib_cleanup_per_ibdev(smcibdev);
	ib_unregister_event_handler(&smcibdev->event_handler);
	cancel_work_sync(&smcibdev->port_event_work);
	kfree(smcibdev);
}

static struct ib_client smc_ib_client = {
	.name	= "smc_ib",
	.add	= smc_ib_add_dev,
	.remove = smc_ib_remove_dev,
};

int __init smc_ib_register_client(void)
{
	smc_ib_init_local_systemid();
	return ib_register_client(&smc_ib_client);
}

void smc_ib_unregister_client(void)
{
	ib_unregister_client(&smc_ib_client);
}