1 /* 2 * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. 3 * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. 4 * 5 * This software is available to you under a choice of one of two 6 * licenses. You may choose to be licensed under the terms of the GNU 7 * General Public License (GPL) Version 2, available from the file 8 * COPYING in the main directory of this source tree, or the 9 * OpenIB.org BSD license below: 10 * 11 * Redistribution and use in source and binary forms, with or 12 * without modification, are permitted provided that the following 13 * conditions are met: 14 * 15 * - Redistributions of source code must retain the above 16 * copyright notice, this list of conditions and the following 17 * disclaimer. 18 * 19 * - Redistributions in binary form must reproduce the above 20 * copyright notice, this list of conditions and the following 21 * disclaimer in the documentation and/or other materials 22 * provided with the distribution. 23 * 24 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 25 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 26 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 27 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 28 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 29 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 30 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 31 * SOFTWARE. 32 */ 33 34 #include <linux/skbuff.h> 35 #include <linux/if_arp.h> 36 #include <linux/netdevice.h> 37 #include <linux/if.h> 38 #include <linux/if_vlan.h> 39 #include <net/udp_tunnel.h> 40 #include <net/sch_generic.h> 41 #include <linux/netfilter.h> 42 #include <rdma/ib_addr.h> 43 44 #include "rxe.h" 45 #include "rxe_net.h" 46 #include "rxe_loc.h" 47 48 static struct rxe_recv_sockets recv_sockets; 49 50 struct device *rxe_dma_device(struct rxe_dev *rxe) 51 { 52 struct net_device *ndev; 53 54 ndev = rxe->ndev; 55 56 if (is_vlan_dev(ndev)) 57 ndev = vlan_dev_real_dev(ndev); 58 59 return ndev->dev.parent; 60 } 61 62 int rxe_mcast_add(struct rxe_dev *rxe, union ib_gid *mgid) 63 { 64 int err; 65 unsigned char ll_addr[ETH_ALEN]; 66 67 ipv6_eth_mc_map((struct in6_addr *)mgid->raw, ll_addr); 68 err = dev_mc_add(rxe->ndev, ll_addr); 69 70 return err; 71 } 72 73 int rxe_mcast_delete(struct rxe_dev *rxe, union ib_gid *mgid) 74 { 75 int err; 76 unsigned char ll_addr[ETH_ALEN]; 77 78 ipv6_eth_mc_map((struct in6_addr *)mgid->raw, ll_addr); 79 err = dev_mc_del(rxe->ndev, ll_addr); 80 81 return err; 82 } 83 84 static struct dst_entry *rxe_find_route4(struct net_device *ndev, 85 struct in_addr *saddr, 86 struct in_addr *daddr) 87 { 88 struct rtable *rt; 89 struct flowi4 fl = { { 0 } }; 90 91 memset(&fl, 0, sizeof(fl)); 92 fl.flowi4_oif = ndev->ifindex; 93 memcpy(&fl.saddr, saddr, sizeof(*saddr)); 94 memcpy(&fl.daddr, daddr, sizeof(*daddr)); 95 fl.flowi4_proto = IPPROTO_UDP; 96 97 rt = ip_route_output_key(&init_net, &fl); 98 if (IS_ERR(rt)) { 99 pr_err_ratelimited("no route to %pI4\n", &daddr->s_addr); 100 return NULL; 101 } 102 103 return &rt->dst; 104 } 105 106 #if IS_ENABLED(CONFIG_IPV6) 107 static struct dst_entry *rxe_find_route6(struct net_device *ndev, 108 struct in6_addr *saddr, 109 struct in6_addr *daddr) 110 { 111 struct dst_entry *ndst; 112 struct flowi6 fl6 = { { 0 } }; 113 114 memset(&fl6, 0, sizeof(fl6)); 115 fl6.flowi6_oif = ndev->ifindex; 116 memcpy(&fl6.saddr, saddr, sizeof(*saddr)); 117 memcpy(&fl6.daddr, daddr, sizeof(*daddr)); 118 fl6.flowi6_proto = IPPROTO_UDP; 119 120 if (unlikely(ipv6_stub->ipv6_dst_lookup(sock_net(recv_sockets.sk6->sk), 121 recv_sockets.sk6->sk, &ndst, &fl6))) { 122 pr_err_ratelimited("no route to %pI6\n", daddr); 123 goto put; 124 } 125 126 if (unlikely(ndst->error)) { 127 pr_err("no route to %pI6\n", daddr); 128 goto put; 129 } 130 131 return ndst; 132 put: 133 dst_release(ndst); 134 return NULL; 135 } 136 137 #else 138 139 static struct dst_entry *rxe_find_route6(struct net_device *ndev, 140 struct in6_addr *saddr, 141 struct in6_addr *daddr) 142 { 143 return NULL; 144 } 145 146 #endif 147 148 static struct dst_entry *rxe_find_route(struct net_device *ndev, 149 struct rxe_qp *qp, 150 struct rxe_av *av) 151 { 152 struct dst_entry *dst = NULL; 153 154 if (qp_type(qp) == IB_QPT_RC) 155 dst = sk_dst_get(qp->sk->sk); 156 157 if (!dst || !dst_check(dst, qp->dst_cookie)) { 158 if (dst) 159 dst_release(dst); 160 161 if (av->network_type == RDMA_NETWORK_IPV4) { 162 struct in_addr *saddr; 163 struct in_addr *daddr; 164 165 saddr = &av->sgid_addr._sockaddr_in.sin_addr; 166 daddr = &av->dgid_addr._sockaddr_in.sin_addr; 167 dst = rxe_find_route4(ndev, saddr, daddr); 168 } else if (av->network_type == RDMA_NETWORK_IPV6) { 169 struct in6_addr *saddr6; 170 struct in6_addr *daddr6; 171 172 saddr6 = &av->sgid_addr._sockaddr_in6.sin6_addr; 173 daddr6 = &av->dgid_addr._sockaddr_in6.sin6_addr; 174 dst = rxe_find_route6(ndev, saddr6, daddr6); 175 #if IS_ENABLED(CONFIG_IPV6) 176 if (dst) 177 qp->dst_cookie = 178 rt6_get_cookie((struct rt6_info *)dst); 179 #endif 180 } 181 182 if (dst && (qp_type(qp) == IB_QPT_RC)) { 183 dst_hold(dst); 184 sk_dst_set(qp->sk->sk, dst); 185 } 186 } 187 return dst; 188 } 189 190 static int rxe_udp_encap_recv(struct sock *sk, struct sk_buff *skb) 191 { 192 struct udphdr *udph; 193 struct net_device *ndev = skb->dev; 194 struct net_device *rdev = ndev; 195 struct rxe_dev *rxe = rxe_get_dev_from_net(ndev); 196 struct rxe_pkt_info *pkt = SKB_TO_PKT(skb); 197 198 if (!rxe && is_vlan_dev(rdev)) { 199 rdev = vlan_dev_real_dev(ndev); 200 rxe = rxe_get_dev_from_net(rdev); 201 } 202 if (!rxe) 203 goto drop; 204 205 if (skb_linearize(skb)) { 206 pr_err("skb_linearize failed\n"); 207 ib_device_put(&rxe->ib_dev); 208 goto drop; 209 } 210 211 udph = udp_hdr(skb); 212 pkt->rxe = rxe; 213 pkt->port_num = 1; 214 pkt->hdr = (u8 *)(udph + 1); 215 pkt->mask = RXE_GRH_MASK; 216 pkt->paylen = be16_to_cpu(udph->len) - sizeof(*udph); 217 218 rxe_rcv(skb); 219 220 /* 221 * FIXME: this is in the wrong place, it needs to be done when pkt is 222 * destroyed 223 */ 224 ib_device_put(&rxe->ib_dev); 225 226 return 0; 227 drop: 228 kfree_skb(skb); 229 230 return 0; 231 } 232 233 static struct socket *rxe_setup_udp_tunnel(struct net *net, __be16 port, 234 bool ipv6) 235 { 236 int err; 237 struct socket *sock; 238 struct udp_port_cfg udp_cfg = { }; 239 struct udp_tunnel_sock_cfg tnl_cfg = { }; 240 241 if (ipv6) { 242 udp_cfg.family = AF_INET6; 243 udp_cfg.ipv6_v6only = 1; 244 } else { 245 udp_cfg.family = AF_INET; 246 } 247 248 udp_cfg.local_udp_port = port; 249 250 /* Create UDP socket */ 251 err = udp_sock_create(net, &udp_cfg, &sock); 252 if (err < 0) { 253 pr_err("failed to create udp socket. err = %d\n", err); 254 return ERR_PTR(err); 255 } 256 257 tnl_cfg.encap_type = 1; 258 tnl_cfg.encap_rcv = rxe_udp_encap_recv; 259 260 /* Setup UDP tunnel */ 261 setup_udp_tunnel_sock(net, sock, &tnl_cfg); 262 263 return sock; 264 } 265 266 static void rxe_release_udp_tunnel(struct socket *sk) 267 { 268 if (sk) 269 udp_tunnel_sock_release(sk); 270 } 271 272 static void prepare_udp_hdr(struct sk_buff *skb, __be16 src_port, 273 __be16 dst_port) 274 { 275 struct udphdr *udph; 276 277 __skb_push(skb, sizeof(*udph)); 278 skb_reset_transport_header(skb); 279 udph = udp_hdr(skb); 280 281 udph->dest = dst_port; 282 udph->source = src_port; 283 udph->len = htons(skb->len); 284 udph->check = 0; 285 } 286 287 static void prepare_ipv4_hdr(struct dst_entry *dst, struct sk_buff *skb, 288 __be32 saddr, __be32 daddr, __u8 proto, 289 __u8 tos, __u8 ttl, __be16 df, bool xnet) 290 { 291 struct iphdr *iph; 292 293 skb_scrub_packet(skb, xnet); 294 295 skb_clear_hash(skb); 296 skb_dst_set(skb, dst_clone(dst)); 297 memset(IPCB(skb), 0, sizeof(*IPCB(skb))); 298 299 skb_push(skb, sizeof(struct iphdr)); 300 skb_reset_network_header(skb); 301 302 iph = ip_hdr(skb); 303 304 iph->version = IPVERSION; 305 iph->ihl = sizeof(struct iphdr) >> 2; 306 iph->frag_off = df; 307 iph->protocol = proto; 308 iph->tos = tos; 309 iph->daddr = daddr; 310 iph->saddr = saddr; 311 iph->ttl = ttl; 312 __ip_select_ident(dev_net(dst->dev), iph, 313 skb_shinfo(skb)->gso_segs ?: 1); 314 iph->tot_len = htons(skb->len); 315 ip_send_check(iph); 316 } 317 318 static void prepare_ipv6_hdr(struct dst_entry *dst, struct sk_buff *skb, 319 struct in6_addr *saddr, struct in6_addr *daddr, 320 __u8 proto, __u8 prio, __u8 ttl) 321 { 322 struct ipv6hdr *ip6h; 323 324 memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); 325 IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED 326 | IPSKB_REROUTED); 327 skb_dst_set(skb, dst_clone(dst)); 328 329 __skb_push(skb, sizeof(*ip6h)); 330 skb_reset_network_header(skb); 331 ip6h = ipv6_hdr(skb); 332 ip6_flow_hdr(ip6h, prio, htonl(0)); 333 ip6h->payload_len = htons(skb->len); 334 ip6h->nexthdr = proto; 335 ip6h->hop_limit = ttl; 336 ip6h->daddr = *daddr; 337 ip6h->saddr = *saddr; 338 ip6h->payload_len = htons(skb->len - sizeof(*ip6h)); 339 } 340 341 static int prepare4(struct rxe_pkt_info *pkt, struct sk_buff *skb, 342 struct rxe_av *av) 343 { 344 struct rxe_qp *qp = pkt->qp; 345 struct dst_entry *dst; 346 bool xnet = false; 347 __be16 df = htons(IP_DF); 348 struct in_addr *saddr = &av->sgid_addr._sockaddr_in.sin_addr; 349 struct in_addr *daddr = &av->dgid_addr._sockaddr_in.sin_addr; 350 351 dst = rxe_find_route(skb->dev, qp, av); 352 if (!dst) { 353 pr_err("Host not reachable\n"); 354 return -EHOSTUNREACH; 355 } 356 357 prepare_udp_hdr(skb, cpu_to_be16(qp->src_port), 358 cpu_to_be16(ROCE_V2_UDP_DPORT)); 359 360 prepare_ipv4_hdr(dst, skb, saddr->s_addr, daddr->s_addr, IPPROTO_UDP, 361 av->grh.traffic_class, av->grh.hop_limit, df, xnet); 362 363 dst_release(dst); 364 return 0; 365 } 366 367 static int prepare6(struct rxe_pkt_info *pkt, struct sk_buff *skb, 368 struct rxe_av *av) 369 { 370 struct rxe_qp *qp = pkt->qp; 371 struct dst_entry *dst; 372 struct in6_addr *saddr = &av->sgid_addr._sockaddr_in6.sin6_addr; 373 struct in6_addr *daddr = &av->dgid_addr._sockaddr_in6.sin6_addr; 374 375 dst = rxe_find_route(skb->dev, qp, av); 376 if (!dst) { 377 pr_err("Host not reachable\n"); 378 return -EHOSTUNREACH; 379 } 380 381 prepare_udp_hdr(skb, cpu_to_be16(qp->src_port), 382 cpu_to_be16(ROCE_V2_UDP_DPORT)); 383 384 prepare_ipv6_hdr(dst, skb, saddr, daddr, IPPROTO_UDP, 385 av->grh.traffic_class, 386 av->grh.hop_limit); 387 388 dst_release(dst); 389 return 0; 390 } 391 392 int rxe_prepare(struct rxe_pkt_info *pkt, struct sk_buff *skb, u32 *crc) 393 { 394 int err = 0; 395 struct rxe_av *av = rxe_get_av(pkt); 396 397 if (av->network_type == RDMA_NETWORK_IPV4) 398 err = prepare4(pkt, skb, av); 399 else if (av->network_type == RDMA_NETWORK_IPV6) 400 err = prepare6(pkt, skb, av); 401 402 *crc = rxe_icrc_hdr(pkt, skb); 403 404 if (ether_addr_equal(skb->dev->dev_addr, av->dmac)) 405 pkt->mask |= RXE_LOOPBACK_MASK; 406 407 return err; 408 } 409 410 static void rxe_skb_tx_dtor(struct sk_buff *skb) 411 { 412 struct sock *sk = skb->sk; 413 struct rxe_qp *qp = sk->sk_user_data; 414 int skb_out = atomic_dec_return(&qp->skb_out); 415 416 if (unlikely(qp->need_req_skb && 417 skb_out < RXE_INFLIGHT_SKBS_PER_QP_LOW)) 418 rxe_run_task(&qp->req.task, 1); 419 420 rxe_drop_ref(qp); 421 } 422 423 int rxe_send(struct rxe_pkt_info *pkt, struct sk_buff *skb) 424 { 425 struct rxe_av *av; 426 int err; 427 428 av = rxe_get_av(pkt); 429 430 skb->destructor = rxe_skb_tx_dtor; 431 skb->sk = pkt->qp->sk->sk; 432 433 rxe_add_ref(pkt->qp); 434 atomic_inc(&pkt->qp->skb_out); 435 436 if (av->network_type == RDMA_NETWORK_IPV4) { 437 err = ip_local_out(dev_net(skb_dst(skb)->dev), skb->sk, skb); 438 } else if (av->network_type == RDMA_NETWORK_IPV6) { 439 err = ip6_local_out(dev_net(skb_dst(skb)->dev), skb->sk, skb); 440 } else { 441 pr_err("Unknown layer 3 protocol: %d\n", av->network_type); 442 atomic_dec(&pkt->qp->skb_out); 443 rxe_drop_ref(pkt->qp); 444 kfree_skb(skb); 445 return -EINVAL; 446 } 447 448 if (unlikely(net_xmit_eval(err))) { 449 pr_debug("error sending packet: %d\n", err); 450 return -EAGAIN; 451 } 452 453 return 0; 454 } 455 456 void rxe_loopback(struct sk_buff *skb) 457 { 458 rxe_rcv(skb); 459 } 460 461 struct sk_buff *rxe_init_packet(struct rxe_dev *rxe, struct rxe_av *av, 462 int paylen, struct rxe_pkt_info *pkt) 463 { 464 unsigned int hdr_len; 465 struct sk_buff *skb; 466 struct net_device *ndev; 467 const struct ib_gid_attr *attr; 468 const int port_num = 1; 469 470 attr = rdma_get_gid_attr(&rxe->ib_dev, port_num, av->grh.sgid_index); 471 if (IS_ERR(attr)) 472 return NULL; 473 ndev = attr->ndev; 474 475 if (av->network_type == RDMA_NETWORK_IPV4) 476 hdr_len = ETH_HLEN + sizeof(struct udphdr) + 477 sizeof(struct iphdr); 478 else 479 hdr_len = ETH_HLEN + sizeof(struct udphdr) + 480 sizeof(struct ipv6hdr); 481 482 skb = alloc_skb(paylen + hdr_len + LL_RESERVED_SPACE(ndev), 483 GFP_ATOMIC); 484 485 if (unlikely(!skb)) 486 goto out; 487 488 skb_reserve(skb, hdr_len + LL_RESERVED_SPACE(rxe->ndev)); 489 490 skb->dev = ndev; 491 if (av->network_type == RDMA_NETWORK_IPV4) 492 skb->protocol = htons(ETH_P_IP); 493 else 494 skb->protocol = htons(ETH_P_IPV6); 495 496 pkt->rxe = rxe; 497 pkt->port_num = port_num; 498 pkt->hdr = skb_put_zero(skb, paylen); 499 pkt->mask |= RXE_GRH_MASK; 500 501 out: 502 rdma_put_gid_attr(attr); 503 return skb; 504 } 505 506 /* 507 * this is required by rxe_cfg to match rxe devices in 508 * /sys/class/infiniband up with their underlying ethernet devices 509 */ 510 const char *rxe_parent_name(struct rxe_dev *rxe, unsigned int port_num) 511 { 512 return rxe->ndev->name; 513 } 514 515 enum rdma_link_layer rxe_link_layer(struct rxe_dev *rxe, unsigned int port_num) 516 { 517 return IB_LINK_LAYER_ETHERNET; 518 } 519 520 int rxe_net_add(const char *ibdev_name, struct net_device *ndev) 521 { 522 int err; 523 struct rxe_dev *rxe = NULL; 524 525 rxe = ib_alloc_device(rxe_dev, ib_dev); 526 if (!rxe) 527 return -ENOMEM; 528 529 rxe->ndev = ndev; 530 531 err = rxe_add(rxe, ndev->mtu, ibdev_name); 532 if (err) { 533 ib_dealloc_device(&rxe->ib_dev); 534 return err; 535 } 536 537 return 0; 538 } 539 540 static void rxe_port_event(struct rxe_dev *rxe, 541 enum ib_event_type event) 542 { 543 struct ib_event ev; 544 545 ev.device = &rxe->ib_dev; 546 ev.element.port_num = 1; 547 ev.event = event; 548 549 ib_dispatch_event(&ev); 550 } 551 552 /* Caller must hold net_info_lock */ 553 void rxe_port_up(struct rxe_dev *rxe) 554 { 555 struct rxe_port *port; 556 557 port = &rxe->port; 558 port->attr.state = IB_PORT_ACTIVE; 559 560 rxe_port_event(rxe, IB_EVENT_PORT_ACTIVE); 561 dev_info(&rxe->ib_dev.dev, "set active\n"); 562 } 563 564 /* Caller must hold net_info_lock */ 565 void rxe_port_down(struct rxe_dev *rxe) 566 { 567 struct rxe_port *port; 568 569 port = &rxe->port; 570 port->attr.state = IB_PORT_DOWN; 571 572 rxe_port_event(rxe, IB_EVENT_PORT_ERR); 573 rxe_counter_inc(rxe, RXE_CNT_LINK_DOWNED); 574 dev_info(&rxe->ib_dev.dev, "set down\n"); 575 } 576 577 void rxe_set_port_state(struct rxe_dev *rxe) 578 { 579 if (netif_running(rxe->ndev) && netif_carrier_ok(rxe->ndev)) 580 rxe_port_up(rxe); 581 else 582 rxe_port_down(rxe); 583 } 584 585 static int rxe_notify(struct notifier_block *not_blk, 586 unsigned long event, 587 void *arg) 588 { 589 struct net_device *ndev = netdev_notifier_info_to_dev(arg); 590 struct rxe_dev *rxe = rxe_get_dev_from_net(ndev); 591 592 if (!rxe) 593 return NOTIFY_OK; 594 595 switch (event) { 596 case NETDEV_UNREGISTER: 597 ib_unregister_device_queued(&rxe->ib_dev); 598 break; 599 case NETDEV_UP: 600 rxe_port_up(rxe); 601 break; 602 case NETDEV_DOWN: 603 rxe_port_down(rxe); 604 break; 605 case NETDEV_CHANGEMTU: 606 pr_info("%s changed mtu to %d\n", ndev->name, ndev->mtu); 607 rxe_set_mtu(rxe, ndev->mtu); 608 break; 609 case NETDEV_CHANGE: 610 rxe_set_port_state(rxe); 611 break; 612 case NETDEV_REBOOT: 613 case NETDEV_GOING_DOWN: 614 case NETDEV_CHANGEADDR: 615 case NETDEV_CHANGENAME: 616 case NETDEV_FEAT_CHANGE: 617 default: 618 pr_info("ignoring netdev event = %ld for %s\n", 619 event, ndev->name); 620 break; 621 } 622 623 ib_device_put(&rxe->ib_dev); 624 return NOTIFY_OK; 625 } 626 627 static struct notifier_block rxe_net_notifier = { 628 .notifier_call = rxe_notify, 629 }; 630 631 static int rxe_net_ipv4_init(void) 632 { 633 recv_sockets.sk4 = rxe_setup_udp_tunnel(&init_net, 634 htons(ROCE_V2_UDP_DPORT), false); 635 if (IS_ERR(recv_sockets.sk4)) { 636 recv_sockets.sk4 = NULL; 637 pr_err("Failed to create IPv4 UDP tunnel\n"); 638 return -1; 639 } 640 641 return 0; 642 } 643 644 static int rxe_net_ipv6_init(void) 645 { 646 #if IS_ENABLED(CONFIG_IPV6) 647 648 recv_sockets.sk6 = rxe_setup_udp_tunnel(&init_net, 649 htons(ROCE_V2_UDP_DPORT), true); 650 if (IS_ERR(recv_sockets.sk6)) { 651 recv_sockets.sk6 = NULL; 652 pr_err("Failed to create IPv6 UDP tunnel\n"); 653 return -1; 654 } 655 #endif 656 return 0; 657 } 658 659 void rxe_net_exit(void) 660 { 661 rxe_release_udp_tunnel(recv_sockets.sk6); 662 rxe_release_udp_tunnel(recv_sockets.sk4); 663 unregister_netdevice_notifier(&rxe_net_notifier); 664 } 665 666 int rxe_net_init(void) 667 { 668 int err; 669 670 recv_sockets.sk6 = NULL; 671 672 err = rxe_net_ipv4_init(); 673 if (err) 674 return err; 675 err = rxe_net_ipv6_init(); 676 if (err) 677 goto err_out; 678 err = register_netdevice_notifier(&rxe_net_notifier); 679 if (err) { 680 pr_err("Failed to register netdev notifier\n"); 681 goto err_out; 682 } 683 return 0; 684 err_out: 685 rxe_net_exit(); 686 return err; 687 } 688