// SPDX-License-Identifier: GPL-2.0
/*
 * Shared Memory Communications over RDMA (SMC-R) and RoCE
 *
 * IB infrastructure:
 * Establish SMC-R as an Infiniband Client to be notified about added and
 * removed IB devices of type RDMA.
 * Determine device and port characteristics for these IB devices.
 *
 * Copyright IBM Corp. 2016
 *
 * Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
 */

#include <linux/random.h>
#include <linux/workqueue.h>
#include <linux/scatterlist.h>
#include <linux/wait.h>
#include <linux/mutex.h>
#include <linux/inetdevice.h>
#include <rdma/ib_verbs.h>
#include <rdma/ib_cache.h>

#include "smc_pnet.h"
#include "smc_ib.h"
#include "smc_core.h"
#include "smc_wr.h"
#include "smc.h"
#include "smc_netlink.h"

#define SMC_MAX_CQE 32766	/* max. # of completion queue elements */

#define SMC_QP_MIN_RNR_TIMER		5
#define SMC_QP_TIMEOUT			15 /* 4096 * 2 ** timeout usec */
#define SMC_QP_RETRY_CNT		7 /* 7: infinite */
#define SMC_QP_RNR_RETRY		7 /* 7: infinite */

struct smc_ib_devices smc_ib_devices = {	/* smc-registered ib devices */
	.mutex = __MUTEX_INITIALIZER(smc_ib_devices.mutex),
	.list = LIST_HEAD_INIT(smc_ib_devices.list),
};

u8 local_systemid[SMC_SYSTEMID_LEN];		/* unique system identifier */

static int smc_ib_modify_qp_init(struct smc_link *lnk)
{
	struct ib_qp_attr qp_attr;

	memset(&qp_attr, 0, sizeof(qp_attr));
	qp_attr.qp_state = IB_QPS_INIT;
	qp_attr.pkey_index = 0;
	qp_attr.port_num = lnk->ibport;
	qp_attr.qp_access_flags = IB_ACCESS_LOCAL_WRITE
				| IB_ACCESS_REMOTE_WRITE;
	return ib_modify_qp(lnk->roce_qp, &qp_attr,
			    IB_QP_STATE | IB_QP_PKEY_INDEX |
			    IB_QP_ACCESS_FLAGS | IB_QP_PORT);
}

static int smc_ib_modify_qp_rtr(struct smc_link *lnk)
{
	enum ib_qp_attr_mask qp_attr_mask =
		IB_QP_STATE | IB_QP_AV | IB_QP_PATH_MTU | IB_QP_DEST_QPN |
		IB_QP_RQ_PSN | IB_QP_MAX_DEST_RD_ATOMIC | IB_QP_MIN_RNR_TIMER;
	struct ib_qp_attr qp_attr;
	u8 hop_lim = 1;

	memset(&qp_attr, 0, sizeof(qp_attr));
	qp_attr.qp_state = IB_QPS_RTR;
	qp_attr.path_mtu = min(lnk->path_mtu, lnk->peer_mtu);
	qp_attr.ah_attr.type = RDMA_AH_ATTR_TYPE_ROCE;
	rdma_ah_set_port_num(&qp_attr.ah_attr, lnk->ibport);
	if (lnk->lgr->smc_version == SMC_V2 && lnk->lgr->uses_gateway)
		hop_lim = IPV6_DEFAULT_HOPLIMIT;
	rdma_ah_set_grh(&qp_attr.ah_attr, NULL, 0, lnk->sgid_index, hop_lim, 0);
	rdma_ah_set_dgid_raw(&qp_attr.ah_attr, lnk->peer_gid);
	if (lnk->lgr->smc_version == SMC_V2 && lnk->lgr->uses_gateway)
		memcpy(&qp_attr.ah_attr.roce.dmac, lnk->lgr->nexthop_mac,
		       sizeof(lnk->lgr->nexthop_mac));
	else
		memcpy(&qp_attr.ah_attr.roce.dmac, lnk->peer_mac,
		       sizeof(lnk->peer_mac));
	qp_attr.dest_qp_num = lnk->peer_qpn;
	qp_attr.rq_psn = lnk->peer_psn; /* starting receive packet seq # */
	qp_attr.max_dest_rd_atomic = 1; /* max # of resources for incoming
					 * requests
					 */
	qp_attr.min_rnr_timer = SMC_QP_MIN_RNR_TIMER;

	return ib_modify_qp(lnk->roce_qp, &qp_attr, qp_attr_mask);
}

int smc_ib_modify_qp_rts(struct smc_link *lnk)
{
	struct ib_qp_attr qp_attr;

	memset(&qp_attr, 0, sizeof(qp_attr));
	qp_attr.qp_state = IB_QPS_RTS;
	qp_attr.timeout = SMC_QP_TIMEOUT;	/* local ack timeout */
	qp_attr.retry_cnt = SMC_QP_RETRY_CNT;	/* retry count */
	qp_attr.rnr_retry = SMC_QP_RNR_RETRY;	/* RNR retries, 7=infinite */
	qp_attr.sq_psn = lnk->psn_initial;	/* starting send packet seq # */
	qp_attr.max_rd_atomic = 1;	/* # of outstanding RDMA reads and
					 * atomic ops allowed
					 */
	return ib_modify_qp(lnk->roce_qp, &qp_attr,
			    IB_QP_STATE | IB_QP_TIMEOUT | IB_QP_RETRY_CNT |
			    IB_QP_SQ_PSN | IB_QP_RNR_RETRY |
			    IB_QP_MAX_QP_RD_ATOMIC);
}

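/* The modify_qp helpers above and below drive the RC QP of an SMC-R link
 * through the usual verbs state machine: smc_ib_ready_link() takes a
 * freshly created QP from RESET via INIT to RTR and, in the SMC server
 * role, directly on to RTS; smc_ib_modify_qp_reset() returns the QP to
 * the RESET state.
 */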
int smc_ib_modify_qp_reset(struct smc_link *lnk)
{
	struct ib_qp_attr qp_attr;

	memset(&qp_attr, 0, sizeof(qp_attr));
	qp_attr.qp_state = IB_QPS_RESET;
	return ib_modify_qp(lnk->roce_qp, &qp_attr, IB_QP_STATE);
}

int smc_ib_ready_link(struct smc_link *lnk)
{
	struct smc_link_group *lgr = smc_get_lgr(lnk);
	int rc = 0;

	rc = smc_ib_modify_qp_init(lnk);
	if (rc)
		goto out;

	rc = smc_ib_modify_qp_rtr(lnk);
	if (rc)
		goto out;
	smc_wr_remember_qp_attr(lnk);
	rc = ib_req_notify_cq(lnk->smcibdev->roce_cq_recv,
			      IB_CQ_SOLICITED_MASK);
	if (rc)
		goto out;
	rc = smc_wr_rx_post_init(lnk);
	if (rc)
		goto out;
	smc_wr_remember_qp_attr(lnk);

	if (lgr->role == SMC_SERV) {
		rc = smc_ib_modify_qp_rts(lnk);
		if (rc)
			goto out;
		smc_wr_remember_qp_attr(lnk);
	}
out:
	return rc;
}

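/* Read the MAC address bound to GID index 0 of an IB port and cache it
 * in smcibdev->mac[]; this MAC also seeds the local system identifier.
 */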
static int smc_ib_fill_mac(struct smc_ib_device *smcibdev, u8 ibport)
{
	const struct ib_gid_attr *attr;
	int rc;

	attr = rdma_get_gid_attr(smcibdev->ibdev, ibport, 0);
	if (IS_ERR(attr))
		return -ENODEV;

	rc = rdma_read_gid_l2_fields(attr, NULL, smcibdev->mac[ibport - 1]);
	rdma_put_gid_attr(attr);
	return rc;
}

/* Create an identifier unique for this instance of SMC-R.
 * The MAC-address of the first active registered IB device
 * plus a random 2-byte number is used to create this identifier.
 * This name is delivered to the peer during connection initialization.
 */
static inline void smc_ib_define_local_systemid(struct smc_ib_device *smcibdev,
						u8 ibport)
{
	memcpy(&local_systemid[2], &smcibdev->mac[ibport - 1],
	       sizeof(smcibdev->mac[ibport - 1]));
}

bool smc_ib_is_valid_local_systemid(void)
{
	return !is_zero_ether_addr(&local_systemid[2]);
}

static void smc_ib_init_local_systemid(void)
{
	get_random_bytes(&local_systemid[0], 2);
}

bool smc_ib_port_active(struct smc_ib_device *smcibdev, u8 ibport)
{
	return smcibdev->pattr[ibport - 1].state == IB_PORT_ACTIVE;
}

int smc_ib_find_route(__be32 saddr, __be32 daddr,
		      u8 nexthop_mac[], u8 *uses_gateway)
{
	struct neighbour *neigh = NULL;
	struct rtable *rt = NULL;
	struct flowi4 fl4 = {
		.saddr = saddr,
		.daddr = daddr
	};

	if (daddr == cpu_to_be32(INADDR_NONE))
		goto out;
	rt = ip_route_output_flow(&init_net, &fl4, NULL);
	if (IS_ERR(rt))
		goto out;
	if (rt->rt_uses_gateway && rt->rt_gw_family != AF_INET)
		goto out;
	neigh = rt->dst.ops->neigh_lookup(&rt->dst, NULL, &fl4.daddr);
	if (neigh) {
		memcpy(nexthop_mac, neigh->ha, ETH_ALEN);
		*uses_gateway = rt->rt_uses_gateway;
		return 0;
	}
out:
	return -ENOENT;
}

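/* Check whether a GID table entry is usable for the requested SMC mode:
 * SMC-Rv1 takes a RoCEv1 GID as-is, while SMC-Rv2 needs a RoCEv2 (UDP
 * encapsulated) GID whose IPv4 address shares a subnet with the source
 * address and, if a destination is given, has a resolvable next hop.
 */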
static int smc_ib_determine_gid_rcu(const struct net_device *ndev,
				    const struct ib_gid_attr *attr,
				    u8 gid[], u8 *sgid_index,
				    struct smc_init_info_smcrv2 *smcrv2)
{
	if (!smcrv2 && attr->gid_type == IB_GID_TYPE_ROCE) {
		if (gid)
			memcpy(gid, &attr->gid, SMC_GID_SIZE);
		if (sgid_index)
			*sgid_index = attr->index;
		return 0;
	}
	if (smcrv2 && attr->gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP &&
	    smc_ib_gid_to_ipv4((u8 *)&attr->gid) != cpu_to_be32(INADDR_NONE)) {
		struct in_device *in_dev = __in_dev_get_rcu(ndev);
		const struct in_ifaddr *ifa;
		bool subnet_match = false;

		if (!in_dev)
			goto out;
		in_dev_for_each_ifa_rcu(ifa, in_dev) {
			if (!inet_ifa_match(smcrv2->saddr, ifa))
				continue;
			subnet_match = true;
			break;
		}
		if (!subnet_match)
			goto out;
		if (smcrv2->daddr && smc_ib_find_route(smcrv2->saddr,
						       smcrv2->daddr,
						       smcrv2->nexthop_mac,
						       &smcrv2->uses_gateway))
			goto out;

		if (gid)
			memcpy(gid, &attr->gid, SMC_GID_SIZE);
		if (sgid_index)
			*sgid_index = attr->index;
		return 0;
	}
out:
	return -ENODEV;
}

/* determine the gid for an ib-device port and vlan id */
int smc_ib_determine_gid(struct smc_ib_device *smcibdev, u8 ibport,
			 unsigned short vlan_id, u8 gid[], u8 *sgid_index,
			 struct smc_init_info_smcrv2 *smcrv2)
{
	const struct ib_gid_attr *attr;
	const struct net_device *ndev;
	int i;

	for (i = 0; i < smcibdev->pattr[ibport - 1].gid_tbl_len; i++) {
		attr = rdma_get_gid_attr(smcibdev->ibdev, ibport, i);
		if (IS_ERR(attr))
			continue;

		rcu_read_lock();
		ndev = rdma_read_gid_attr_ndev_rcu(attr);
		if (!IS_ERR(ndev) &&
		    ((!vlan_id && !is_vlan_dev(ndev)) ||
		     (vlan_id && is_vlan_dev(ndev) &&
		      vlan_dev_vlan_id(ndev) == vlan_id))) {
			if (!smc_ib_determine_gid_rcu(ndev, attr, gid,
						      sgid_index, smcrv2)) {
				rcu_read_unlock();
				rdma_put_gid_attr(attr);
				return 0;
			}
		}
		rcu_read_unlock();
		rdma_put_gid_attr(attr);
	}
	return -ENODEV;
}

/* check if gid is still defined on smcibdev */
static bool smc_ib_check_link_gid(u8 gid[SMC_GID_SIZE], bool smcrv2,
				  struct smc_ib_device *smcibdev, u8 ibport)
{
	const struct ib_gid_attr *attr;
	bool rc = false;
	int i;

	for (i = 0; !rc && i < smcibdev->pattr[ibport - 1].gid_tbl_len; i++) {
		attr = rdma_get_gid_attr(smcibdev->ibdev, ibport, i);
		if (IS_ERR(attr))
			continue;

		rcu_read_lock();
		if ((!smcrv2 && attr->gid_type == IB_GID_TYPE_ROCE) ||
		    (smcrv2 && attr->gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP &&
		     !(ipv6_addr_type((const struct in6_addr *)&attr->gid)
		       & IPV6_ADDR_LINKLOCAL)))
			if (!memcmp(gid, &attr->gid, SMC_GID_SIZE))
				rc = true;
		rcu_read_unlock();
		rdma_put_gid_attr(attr);
	}
	return rc;
}

/* check all links if the gid is still defined on smcibdev */
static void smc_ib_gid_check(struct smc_ib_device *smcibdev, u8 ibport)
{
	struct smc_link_group *lgr;
	int i;

	spin_lock_bh(&smc_lgr_list.lock);
	list_for_each_entry(lgr, &smc_lgr_list.list, list) {
		if (strncmp(smcibdev->pnetid[ibport - 1], lgr->pnet_id,
			    SMC_MAX_PNETID_LEN))
			continue; /* lgr is not affected */
		if (list_empty(&lgr->list))
			continue;
		for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
			if (lgr->lnk[i].state == SMC_LNK_UNUSED ||
			    lgr->lnk[i].smcibdev != smcibdev)
				continue;
			if (!smc_ib_check_link_gid(lgr->lnk[i].gid,
						   lgr->smc_version == SMC_V2,
						   smcibdev, ibport))
				smcr_port_err(smcibdev, ibport);
		}
	}
	spin_unlock_bh(&smc_lgr_list.lock);
}

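/* (Re-)read the attributes of an IB port into smcibdev->pattr[] and
 * refresh the cached MAC; the first active port also provides the MAC
 * part of the local system identifier.
 */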
static int smc_ib_remember_port_attr(struct smc_ib_device *smcibdev, u8 ibport)
{
	int rc;

	memset(&smcibdev->pattr[ibport - 1], 0,
	       sizeof(smcibdev->pattr[ibport - 1]));
	rc = ib_query_port(smcibdev->ibdev, ibport,
			   &smcibdev->pattr[ibport - 1]);
	if (rc)
		goto out;
	/* the SMC protocol requires specification of the RoCE MAC address */
	rc = smc_ib_fill_mac(smcibdev, ibport);
	if (rc)
		goto out;
	if (!smc_ib_is_valid_local_systemid() &&
	    smc_ib_port_active(smcibdev, ibport))
		/* create unique system identifier */
		smc_ib_define_local_systemid(smcibdev, ibport);
out:
	return rc;
}

/* process context wrapper for might_sleep smc_ib_remember_port_attr */
static void smc_ib_port_event_work(struct work_struct *work)
{
	struct smc_ib_device *smcibdev = container_of(
		work, struct smc_ib_device, port_event_work);
	u8 port_idx;

	for_each_set_bit(port_idx, &smcibdev->port_event_mask, SMC_MAX_PORTS) {
		smc_ib_remember_port_attr(smcibdev, port_idx + 1);
		clear_bit(port_idx, &smcibdev->port_event_mask);
		if (!smc_ib_port_active(smcibdev, port_idx + 1)) {
			set_bit(port_idx, smcibdev->ports_going_away);
			smcr_port_err(smcibdev, port_idx + 1);
		} else {
			clear_bit(port_idx, smcibdev->ports_going_away);
			smcr_port_add(smcibdev, port_idx + 1);
			smc_ib_gid_check(smcibdev, port_idx + 1);
		}
	}
}

/* can be called in IRQ context */
static void smc_ib_global_event_handler(struct ib_event_handler *handler,
					struct ib_event *ibevent)
{
	struct smc_ib_device *smcibdev;
	bool schedule = false;
	u8 port_idx;

	smcibdev = container_of(handler, struct smc_ib_device, event_handler);

	switch (ibevent->event) {
	case IB_EVENT_DEVICE_FATAL:
		/* terminate all ports on device */
		for (port_idx = 0; port_idx < SMC_MAX_PORTS; port_idx++) {
			set_bit(port_idx, &smcibdev->port_event_mask);
			if (!test_and_set_bit(port_idx,
					      smcibdev->ports_going_away))
				schedule = true;
		}
		if (schedule)
			schedule_work(&smcibdev->port_event_work);
		break;
	case IB_EVENT_PORT_ACTIVE:
		port_idx = ibevent->element.port_num - 1;
		if (port_idx >= SMC_MAX_PORTS)
			break;
		set_bit(port_idx, &smcibdev->port_event_mask);
		if (test_and_clear_bit(port_idx, smcibdev->ports_going_away))
			schedule_work(&smcibdev->port_event_work);
		break;
	case IB_EVENT_PORT_ERR:
		port_idx = ibevent->element.port_num - 1;
		if (port_idx >= SMC_MAX_PORTS)
			break;
		set_bit(port_idx, &smcibdev->port_event_mask);
		if (!test_and_set_bit(port_idx, smcibdev->ports_going_away))
			schedule_work(&smcibdev->port_event_work);
		break;
	case IB_EVENT_GID_CHANGE:
		port_idx = ibevent->element.port_num - 1;
		if (port_idx >= SMC_MAX_PORTS)
			break;
		set_bit(port_idx, &smcibdev->port_event_mask);
		schedule_work(&smcibdev->port_event_work);
		break;
	default:
		break;
	}
}

void smc_ib_dealloc_protection_domain(struct smc_link *lnk)
{
	if (lnk->roce_pd)
		ib_dealloc_pd(lnk->roce_pd);
	lnk->roce_pd = NULL;
}

int smc_ib_create_protection_domain(struct smc_link *lnk)
{
	int rc;

	lnk->roce_pd = ib_alloc_pd(lnk->smcibdev->ibdev, 0);
	rc = PTR_ERR_OR_ZERO(lnk->roce_pd);
	if (IS_ERR(lnk->roce_pd))
		lnk->roce_pd = NULL;
	return rc;
}

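/* The following helpers implement the SMC_NETLINK_GET_DEV_SMCR dump:
 * for each registered IB device they report whether it is critical for
 * a link group (type SINGLE or ASYMMETRIC_LOCAL), its PCI properties,
 * and per-port PNET id, netdev ifindex, port state and link count.
 */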
static bool smcr_diag_is_dev_critical(struct smc_lgr_list *smc_lgr,
				      struct smc_ib_device *smcibdev)
{
	struct smc_link_group *lgr;
	bool rc = false;
	int i;

	spin_lock_bh(&smc_lgr->lock);
	list_for_each_entry(lgr, &smc_lgr->list, list) {
		if (lgr->is_smcd)
			continue;
		for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
			if (lgr->lnk[i].state == SMC_LNK_UNUSED ||
			    lgr->lnk[i].smcibdev != smcibdev)
				continue;
			if (lgr->type == SMC_LGR_SINGLE ||
			    lgr->type == SMC_LGR_ASYMMETRIC_LOCAL) {
				rc = true;
				goto out;
			}
		}
	}
out:
	spin_unlock_bh(&smc_lgr->lock);
	return rc;
}

static int smc_nl_handle_dev_port(struct sk_buff *skb,
				  struct ib_device *ibdev,
				  struct smc_ib_device *smcibdev,
				  int port)
{
	char smc_pnet[SMC_MAX_PNETID_LEN + 1];
	struct nlattr *port_attrs;
	unsigned char port_state;
	int lnk_count = 0;

	port_attrs = nla_nest_start(skb, SMC_NLA_DEV_PORT + port);
	if (!port_attrs)
		goto errout;

	if (nla_put_u8(skb, SMC_NLA_DEV_PORT_PNET_USR,
		       smcibdev->pnetid_by_user[port]))
		goto errattr;
	memcpy(smc_pnet, &smcibdev->pnetid[port], SMC_MAX_PNETID_LEN);
	smc_pnet[SMC_MAX_PNETID_LEN] = 0;
	if (nla_put_string(skb, SMC_NLA_DEV_PORT_PNETID, smc_pnet))
		goto errattr;
	if (nla_put_u32(skb, SMC_NLA_DEV_PORT_NETDEV,
			smcibdev->ndev_ifidx[port]))
		goto errattr;
	if (nla_put_u8(skb, SMC_NLA_DEV_PORT_VALID, 1))
		goto errattr;
	port_state = smc_ib_port_active(smcibdev, port + 1);
	if (nla_put_u8(skb, SMC_NLA_DEV_PORT_STATE, port_state))
		goto errattr;
	lnk_count = atomic_read(&smcibdev->lnk_cnt_by_port[port]);
	if (nla_put_u32(skb, SMC_NLA_DEV_PORT_LNK_CNT, lnk_count))
		goto errattr;
	nla_nest_end(skb, port_attrs);
	return 0;
errattr:
	nla_nest_cancel(skb, port_attrs);
errout:
	return -EMSGSIZE;
}

static bool smc_nl_handle_pci_values(const struct smc_pci_dev *smc_pci_dev,
				     struct sk_buff *skb)
{
	if (nla_put_u32(skb, SMC_NLA_DEV_PCI_FID, smc_pci_dev->pci_fid))
		return false;
	if (nla_put_u16(skb, SMC_NLA_DEV_PCI_CHID, smc_pci_dev->pci_pchid))
		return false;
	if (nla_put_u16(skb, SMC_NLA_DEV_PCI_VENDOR, smc_pci_dev->pci_vendor))
		return false;
	if (nla_put_u16(skb, SMC_NLA_DEV_PCI_DEVICE, smc_pci_dev->pci_device))
		return false;
	if (nla_put_string(skb, SMC_NLA_DEV_PCI_ID, smc_pci_dev->pci_id))
		return false;
	return true;
}

static int smc_nl_handle_smcr_dev(struct smc_ib_device *smcibdev,
				  struct sk_buff *skb,
				  struct netlink_callback *cb)
{
	char smc_ibname[IB_DEVICE_NAME_MAX];
	struct smc_pci_dev smc_pci_dev;
	struct pci_dev *pci_dev;
	unsigned char is_crit;
	struct nlattr *attrs;
	void *nlh;
	int i;

	nlh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
			  &smc_gen_nl_family, NLM_F_MULTI,
			  SMC_NETLINK_GET_DEV_SMCR);
	if (!nlh)
		goto errmsg;
	attrs = nla_nest_start(skb, SMC_GEN_DEV_SMCR);
	if (!attrs)
		goto errout;
	is_crit = smcr_diag_is_dev_critical(&smc_lgr_list, smcibdev);
	if (nla_put_u8(skb, SMC_NLA_DEV_IS_CRIT, is_crit))
		goto errattr;
	if (smcibdev->ibdev->dev.parent) {
		memset(&smc_pci_dev, 0, sizeof(smc_pci_dev));
		pci_dev = to_pci_dev(smcibdev->ibdev->dev.parent);
		smc_set_pci_values(pci_dev, &smc_pci_dev);
		if (!smc_nl_handle_pci_values(&smc_pci_dev, skb))
			goto errattr;
	}
	snprintf(smc_ibname, sizeof(smc_ibname), "%s", smcibdev->ibdev->name);
	if (nla_put_string(skb, SMC_NLA_DEV_IB_NAME, smc_ibname))
		goto errattr;
	for (i = 1; i <= SMC_MAX_PORTS; i++) {
		if (!rdma_is_port_valid(smcibdev->ibdev, i))
			continue;
		if (smc_nl_handle_dev_port(skb, smcibdev->ibdev,
					   smcibdev, i - 1))
			goto errattr;
	}

	nla_nest_end(skb, attrs);
	genlmsg_end(skb, nlh);
	return 0;

errattr:
	nla_nest_cancel(skb, attrs);
errout:
	genlmsg_cancel(skb, nlh);
errmsg:
	return -EMSGSIZE;
}

static void smc_nl_prep_smcr_dev(struct smc_ib_devices *dev_list,
				 struct sk_buff *skb,
				 struct netlink_callback *cb)
{
	struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb);
	struct smc_ib_device *smcibdev;
	int snum = cb_ctx->pos[0];
	int num = 0;

	mutex_lock(&dev_list->mutex);
	list_for_each_entry(smcibdev, &dev_list->list, list) {
		if (num < snum)
			goto next;
		if (smc_nl_handle_smcr_dev(smcibdev, skb, cb))
			goto errout;
next:
		num++;
	}
errout:
	mutex_unlock(&dev_list->mutex);
	cb_ctx->pos[0] = num;
}

int smcr_nl_get_device(struct sk_buff *skb, struct netlink_callback *cb)
{
	smc_nl_prep_smcr_dev(&smc_ib_devices, skb, cb);
	return skb->len;
}

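/* QP event callback registered through ib_create_qp(); fatal QP errors
 * are funneled into the same per-port error handling as IB port events.
 */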
static void smc_ib_qp_event_handler(struct ib_event *ibevent, void *priv)
{
	struct smc_link *lnk = (struct smc_link *)priv;
	struct smc_ib_device *smcibdev = lnk->smcibdev;
	u8 port_idx;

	switch (ibevent->event) {
	case IB_EVENT_QP_FATAL:
	case IB_EVENT_QP_ACCESS_ERR:
		port_idx = ibevent->element.qp->port - 1;
		if (port_idx >= SMC_MAX_PORTS)
			break;
		set_bit(port_idx, &smcibdev->port_event_mask);
		if (!test_and_set_bit(port_idx, smcibdev->ports_going_away))
			schedule_work(&smcibdev->port_event_work);
		break;
	default:
		break;
	}
}

void smc_ib_destroy_queue_pair(struct smc_link *lnk)
{
	if (lnk->roce_qp)
		ib_destroy_qp(lnk->roce_qp);
	lnk->roce_qp = NULL;
}

/* create a queue pair within the protection domain for a link */
int smc_ib_create_queue_pair(struct smc_link *lnk)
{
	int sges_per_buf = (lnk->lgr->smc_version == SMC_V2) ? 2 : 1;
	struct ib_qp_init_attr qp_attr = {
		.event_handler = smc_ib_qp_event_handler,
		.qp_context = lnk,
		.send_cq = lnk->smcibdev->roce_cq_send,
		.recv_cq = lnk->smcibdev->roce_cq_recv,
		.srq = NULL,
		.cap = {
				/* include unsolicited rdma_writes as well,
				 * there are max. 2 RDMA_WRITE per 1 WR_SEND
				 */
			.max_send_wr = SMC_WR_BUF_CNT * 3,
			.max_recv_wr = SMC_WR_BUF_CNT * 3,
			.max_send_sge = SMC_IB_MAX_SEND_SGE,
			.max_recv_sge = sges_per_buf,
		},
		.sq_sig_type = IB_SIGNAL_REQ_WR,
		.qp_type = IB_QPT_RC,
	};
	int rc;

	lnk->roce_qp = ib_create_qp(lnk->roce_pd, &qp_attr);
	rc = PTR_ERR_OR_ZERO(lnk->roce_qp);
	if (IS_ERR(lnk->roce_qp))
		lnk->roce_qp = NULL;
	else
		smc_wr_remember_qp_attr(lnk);
	return rc;
}

void smc_ib_put_memory_region(struct ib_mr *mr)
{
	ib_dereg_mr(mr);
}

static int smc_ib_map_mr_sg(struct smc_buf_desc *buf_slot, u8 link_idx)
{
	unsigned int offset = 0;
	int sg_num;

	/* map the largest prefix of a dma mapped SG list */
	sg_num = ib_map_mr_sg(buf_slot->mr_rx[link_idx],
			      buf_slot->sgt[link_idx].sgl,
			      buf_slot->sgt[link_idx].orig_nents,
			      &offset, PAGE_SIZE);

	return sg_num;
}

/* Allocate a memory region and map the dma mapped SG list of buf_slot */
int smc_ib_get_memory_region(struct ib_pd *pd, int access_flags,
			     struct smc_buf_desc *buf_slot, u8 link_idx)
{
	if (buf_slot->mr_rx[link_idx])
		return 0; /* already done */

	buf_slot->mr_rx[link_idx] =
		ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, 1 << buf_slot->order);
	if (IS_ERR(buf_slot->mr_rx[link_idx])) {
		int rc;

		rc = PTR_ERR(buf_slot->mr_rx[link_idx]);
		buf_slot->mr_rx[link_idx] = NULL;
		return rc;
	}

	if (smc_ib_map_mr_sg(buf_slot, link_idx) != 1)
		return -EINVAL;

	return 0;
}

/* synchronize buffer usage for cpu access */
void smc_ib_sync_sg_for_cpu(struct smc_link *lnk,
			    struct smc_buf_desc *buf_slot,
			    enum dma_data_direction data_direction)
{
	struct scatterlist *sg;
	unsigned int i;

	/* for now there is just one DMA address */
	for_each_sg(buf_slot->sgt[lnk->link_idx].sgl, sg,
		    buf_slot->sgt[lnk->link_idx].nents, i) {
		if (!sg_dma_len(sg))
			break;
		ib_dma_sync_single_for_cpu(lnk->smcibdev->ibdev,
					   sg_dma_address(sg),
					   sg_dma_len(sg),
					   data_direction);
	}
}

/* synchronize buffer usage for device access */
void smc_ib_sync_sg_for_device(struct smc_link *lnk,
			       struct smc_buf_desc *buf_slot,
			       enum dma_data_direction data_direction)
{
	struct scatterlist *sg;
	unsigned int i;

	/* for now there is just one DMA address */
	for_each_sg(buf_slot->sgt[lnk->link_idx].sgl, sg,
		    buf_slot->sgt[lnk->link_idx].nents, i) {
		if (!sg_dma_len(sg))
			break;
		ib_dma_sync_single_for_device(lnk->smcibdev->ibdev,
					      sg_dma_address(sg),
					      sg_dma_len(sg),
					      data_direction);
	}
}

/* Map a new TX or RX buffer SG-table to DMA */
int smc_ib_buf_map_sg(struct smc_link *lnk,
		      struct smc_buf_desc *buf_slot,
		      enum dma_data_direction data_direction)
{
	int mapped_nents;

	mapped_nents = ib_dma_map_sg(lnk->smcibdev->ibdev,
				     buf_slot->sgt[lnk->link_idx].sgl,
				     buf_slot->sgt[lnk->link_idx].orig_nents,
				     data_direction);
	if (!mapped_nents)
		return -ENOMEM;

	return mapped_nents;
}

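/* Unmap a TX or RX buffer SG-table from DMA, unless already unmapped */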
void smc_ib_buf_unmap_sg(struct smc_link *lnk,
			 struct smc_buf_desc *buf_slot,
			 enum dma_data_direction data_direction)
{
	if (!buf_slot->sgt[lnk->link_idx].sgl->dma_address)
		return; /* already unmapped */

	ib_dma_unmap_sg(lnk->smcibdev->ibdev,
			buf_slot->sgt[lnk->link_idx].sgl,
			buf_slot->sgt[lnk->link_idx].orig_nents,
			data_direction);
	buf_slot->sgt[lnk->link_idx].sgl->dma_address = 0;
}

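/* One-time setup per IB device: create the send and receive completion
 * queues and register the device with the work request layer.  The CQE
 * count is capped below so that the resulting CQ still fits the mlx5
 * CQ allocation scheme.
 */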
long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev)
{
	struct ib_cq_init_attr cqattr =	{
		.cqe = SMC_MAX_CQE, .comp_vector = 0 };
	int cqe_size_order, smc_order;
	long rc;

	mutex_lock(&smcibdev->mutex);
	rc = 0;
	if (smcibdev->initialized)
		goto out;
	/* the calculated number of cq entries fits to mlx5 cq allocation */
	cqe_size_order = cache_line_size() == 128 ? 7 : 6;
	smc_order = MAX_ORDER - cqe_size_order - 1;
	if (SMC_MAX_CQE + 2 > (0x00000001 << smc_order) * PAGE_SIZE)
		cqattr.cqe = (0x00000001 << smc_order) * PAGE_SIZE - 2;
	smcibdev->roce_cq_send = ib_create_cq(smcibdev->ibdev,
					      smc_wr_tx_cq_handler, NULL,
					      smcibdev, &cqattr);
	rc = PTR_ERR_OR_ZERO(smcibdev->roce_cq_send);
	if (IS_ERR(smcibdev->roce_cq_send)) {
		smcibdev->roce_cq_send = NULL;
		goto out;
	}
	smcibdev->roce_cq_recv = ib_create_cq(smcibdev->ibdev,
					      smc_wr_rx_cq_handler, NULL,
					      smcibdev, &cqattr);
	rc = PTR_ERR_OR_ZERO(smcibdev->roce_cq_recv);
	if (IS_ERR(smcibdev->roce_cq_recv)) {
		smcibdev->roce_cq_recv = NULL;
		goto err;
	}
	smc_wr_add_dev(smcibdev);
	smcibdev->initialized = 1;
	goto out;

err:
	ib_destroy_cq(smcibdev->roce_cq_send);
out:
	mutex_unlock(&smcibdev->mutex);
	return rc;
}

static void smc_ib_cleanup_per_ibdev(struct smc_ib_device *smcibdev)
{
	mutex_lock(&smcibdev->mutex);
	if (!smcibdev->initialized)
		goto out;
	smcibdev->initialized = 0;
	ib_destroy_cq(smcibdev->roce_cq_recv);
	ib_destroy_cq(smcibdev->roce_cq_send);
	smc_wr_remove_dev(smcibdev);
out:
	mutex_unlock(&smcibdev->mutex);
}

static struct ib_client smc_ib_client;

static void smc_copy_netdev_ifindex(struct smc_ib_device *smcibdev, int port)
{
	struct ib_device *ibdev = smcibdev->ibdev;
	struct net_device *ndev;

	if (!ibdev->ops.get_netdev)
		return;
	ndev = ibdev->ops.get_netdev(ibdev, port + 1);
	if (ndev) {
		smcibdev->ndev_ifidx[port] = ndev->ifindex;
		dev_put(ndev);
	}
}

void smc_ib_ndev_change(struct net_device *ndev, unsigned long event)
{
	struct smc_ib_device *smcibdev;
	struct ib_device *libdev;
	struct net_device *lndev;
	u8 port_cnt;
	int i;

	mutex_lock(&smc_ib_devices.mutex);
	list_for_each_entry(smcibdev, &smc_ib_devices.list, list) {
		port_cnt = smcibdev->ibdev->phys_port_cnt;
		for (i = 0; i < min_t(size_t, port_cnt, SMC_MAX_PORTS); i++) {
			libdev = smcibdev->ibdev;
			if (!libdev->ops.get_netdev)
				continue;
			lndev = libdev->ops.get_netdev(libdev, i + 1);
			dev_put(lndev);
			if (lndev != ndev)
				continue;
			if (event == NETDEV_REGISTER)
				smcibdev->ndev_ifidx[i] = ndev->ifindex;
			if (event == NETDEV_UNREGISTER)
				smcibdev->ndev_ifidx[i] = 0;
		}
	}
	mutex_unlock(&smc_ib_devices.mutex);
}

/* callback function for ib_register_client() */
static int smc_ib_add_dev(struct ib_device *ibdev)
{
	struct smc_ib_device *smcibdev;
	u8 port_cnt;
	int i;

	if (ibdev->node_type != RDMA_NODE_IB_CA)
		return -EOPNOTSUPP;

	smcibdev = kzalloc(sizeof(*smcibdev), GFP_KERNEL);
	if (!smcibdev)
		return -ENOMEM;

	smcibdev->ibdev = ibdev;
	INIT_WORK(&smcibdev->port_event_work, smc_ib_port_event_work);
	atomic_set(&smcibdev->lnk_cnt, 0);
	init_waitqueue_head(&smcibdev->lnks_deleted);
	mutex_init(&smcibdev->mutex);
	mutex_lock(&smc_ib_devices.mutex);
	list_add_tail(&smcibdev->list, &smc_ib_devices.list);
	mutex_unlock(&smc_ib_devices.mutex);
	ib_set_client_data(ibdev, &smc_ib_client, smcibdev);
	INIT_IB_EVENT_HANDLER(&smcibdev->event_handler, smcibdev->ibdev,
			      smc_ib_global_event_handler);
	ib_register_event_handler(&smcibdev->event_handler);

	/* trigger reading of the port attributes */
	port_cnt = smcibdev->ibdev->phys_port_cnt;
	pr_warn_ratelimited("smc: adding ib device %s with port count %d\n",
			    smcibdev->ibdev->name, port_cnt);
	for (i = 0;
	     i < min_t(size_t, port_cnt, SMC_MAX_PORTS);
	     i++) {
		set_bit(i, &smcibdev->port_event_mask);
		/* determine pnetids of the port */
		if (smc_pnetid_by_dev_port(ibdev->dev.parent, i,
					   smcibdev->pnetid[i]))
			smc_pnetid_by_table_ib(smcibdev, i + 1);
		smc_copy_netdev_ifindex(smcibdev, i);
		pr_warn_ratelimited("smc: ib device %s port %d has pnetid "
				    "%.16s%s\n",
				    smcibdev->ibdev->name, i + 1,
				    smcibdev->pnetid[i],
				    smcibdev->pnetid_by_user[i] ?
				    " (user defined)" :
				    "");
	}
	schedule_work(&smcibdev->port_event_work);
	return 0;
}

/* callback function for ib_unregister_client() */
static void smc_ib_remove_dev(struct ib_device *ibdev, void *client_data)
{
	struct smc_ib_device *smcibdev = client_data;

	mutex_lock(&smc_ib_devices.mutex);
	list_del_init(&smcibdev->list); /* remove from smc_ib_devices */
	mutex_unlock(&smc_ib_devices.mutex);
	pr_warn_ratelimited("smc: removing ib device %s\n",
			    smcibdev->ibdev->name);
	smc_smcr_terminate_all(smcibdev);
	smc_ib_cleanup_per_ibdev(smcibdev);
	ib_unregister_event_handler(&smcibdev->event_handler);
	cancel_work_sync(&smcibdev->port_event_work);
	kfree(smcibdev);
}

static struct ib_client smc_ib_client = {
	.name	= "smc_ib",
	.add	= smc_ib_add_dev,
	.remove = smc_ib_remove_dev,
};

int __init smc_ib_register_client(void)
{
	smc_ib_init_local_systemid();
	return ib_register_client(&smc_ib_client);
}

void smc_ib_unregister_client(void)
{
	ib_unregister_client(&smc_ib_client);
}