1 /* 2 * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. 3 * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. 4 * 5 * This software is available to you under a choice of one of two 6 * licenses. You may choose to be licensed under the terms of the GNU 7 * General Public License (GPL) Version 2, available from the file 8 * COPYING in the main directory of this source tree, or the 9 * OpenIB.org BSD license below: 10 * 11 * Redistribution and use in source and binary forms, with or 12 * without modification, are permitted provided that the following 13 * conditions are met: 14 * 15 * - Redistributions of source code must retain the above 16 * copyright notice, this list of conditions and the following 17 * disclaimer. 18 * 19 * - Redistributions in binary form must reproduce the above 20 * copyright notice, this list of conditions and the following 21 * disclaimer in the documentation and/or other materials 22 * provided with the distribution. 23 * 24 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 25 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 26 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 27 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 28 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 29 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 30 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 31 * SOFTWARE. 32 */ 33 34 #include <linux/skbuff.h> 35 #include <linux/if_arp.h> 36 #include <linux/netdevice.h> 37 #include <linux/if.h> 38 #include <linux/if_vlan.h> 39 #include <net/udp_tunnel.h> 40 #include <net/sch_generic.h> 41 #include <linux/netfilter.h> 42 #include <rdma/ib_addr.h> 43 44 #include "rxe.h" 45 #include "rxe_net.h" 46 #include "rxe_loc.h" 47 48 static struct rxe_recv_sockets recv_sockets; 49 50 struct device *rxe_dma_device(struct rxe_dev *rxe) 51 { 52 struct net_device *ndev; 53 54 ndev = rxe->ndev; 55 56 if (is_vlan_dev(ndev)) 57 ndev = vlan_dev_real_dev(ndev); 58 59 return ndev->dev.parent; 60 } 61 62 int rxe_mcast_add(struct rxe_dev *rxe, union ib_gid *mgid) 63 { 64 int err; 65 unsigned char ll_addr[ETH_ALEN]; 66 67 ipv6_eth_mc_map((struct in6_addr *)mgid->raw, ll_addr); 68 err = dev_mc_add(rxe->ndev, ll_addr); 69 70 return err; 71 } 72 73 int rxe_mcast_delete(struct rxe_dev *rxe, union ib_gid *mgid) 74 { 75 int err; 76 unsigned char ll_addr[ETH_ALEN]; 77 78 ipv6_eth_mc_map((struct in6_addr *)mgid->raw, ll_addr); 79 err = dev_mc_del(rxe->ndev, ll_addr); 80 81 return err; 82 } 83 84 static struct dst_entry *rxe_find_route4(struct net_device *ndev, 85 struct in_addr *saddr, 86 struct in_addr *daddr) 87 { 88 struct rtable *rt; 89 struct flowi4 fl = { { 0 } }; 90 91 memset(&fl, 0, sizeof(fl)); 92 fl.flowi4_oif = ndev->ifindex; 93 memcpy(&fl.saddr, saddr, sizeof(*saddr)); 94 memcpy(&fl.daddr, daddr, sizeof(*daddr)); 95 fl.flowi4_proto = IPPROTO_UDP; 96 97 rt = ip_route_output_key(&init_net, &fl); 98 if (IS_ERR(rt)) { 99 pr_err_ratelimited("no route to %pI4\n", &daddr->s_addr); 100 return NULL; 101 } 102 103 return &rt->dst; 104 } 105 106 #if IS_ENABLED(CONFIG_IPV6) 107 static struct dst_entry *rxe_find_route6(struct net_device *ndev, 108 struct in6_addr *saddr, 109 struct in6_addr *daddr) 110 { 111 struct dst_entry *ndst; 112 struct flowi6 fl6 = { { 0 } }; 113 114 memset(&fl6, 0, sizeof(fl6)); 115 fl6.flowi6_oif = ndev->ifindex; 116 memcpy(&fl6.saddr, saddr, sizeof(*saddr)); 117 memcpy(&fl6.daddr, daddr, sizeof(*daddr)); 118 fl6.flowi6_proto = IPPROTO_UDP; 119 120 if (unlikely(ipv6_stub->ipv6_dst_lookup(sock_net(recv_sockets.sk6->sk), 121 recv_sockets.sk6->sk, &ndst, &fl6))) { 122 pr_err_ratelimited("no route to %pI6\n", daddr); 123 goto put; 124 } 125 126 if (unlikely(ndst->error)) { 127 pr_err("no route to %pI6\n", daddr); 128 goto put; 129 } 130 131 return ndst; 132 put: 133 dst_release(ndst); 134 return NULL; 135 } 136 137 #else 138 139 static struct dst_entry *rxe_find_route6(struct net_device *ndev, 140 struct in6_addr *saddr, 141 struct in6_addr *daddr) 142 { 143 return NULL; 144 } 145 146 #endif 147 148 static struct dst_entry *rxe_find_route(struct net_device *ndev, 149 struct rxe_qp *qp, 150 struct rxe_av *av) 151 { 152 struct dst_entry *dst = NULL; 153 154 if (qp_type(qp) == IB_QPT_RC) 155 dst = sk_dst_get(qp->sk->sk); 156 157 if (!dst || !dst_check(dst, qp->dst_cookie)) { 158 if (dst) 159 dst_release(dst); 160 161 if (av->network_type == RDMA_NETWORK_IPV4) { 162 struct in_addr *saddr; 163 struct in_addr *daddr; 164 165 saddr = &av->sgid_addr._sockaddr_in.sin_addr; 166 daddr = &av->dgid_addr._sockaddr_in.sin_addr; 167 dst = rxe_find_route4(ndev, saddr, daddr); 168 } else if (av->network_type == RDMA_NETWORK_IPV6) { 169 struct in6_addr *saddr6; 170 struct in6_addr *daddr6; 171 172 saddr6 = &av->sgid_addr._sockaddr_in6.sin6_addr; 173 daddr6 = &av->dgid_addr._sockaddr_in6.sin6_addr; 174 dst = rxe_find_route6(ndev, saddr6, daddr6); 175 #if IS_ENABLED(CONFIG_IPV6) 176 if (dst) 177 qp->dst_cookie = 178 rt6_get_cookie((struct rt6_info *)dst); 179 #endif 180 } 181 182 if (dst && (qp_type(qp) == IB_QPT_RC)) { 183 dst_hold(dst); 184 sk_dst_set(qp->sk->sk, dst); 185 } 186 } 187 return dst; 188 } 189 190 static int rxe_udp_encap_recv(struct sock *sk, struct sk_buff *skb) 191 { 192 struct udphdr *udph; 193 struct net_device *ndev = skb->dev; 194 struct net_device *rdev = ndev; 195 struct rxe_dev *rxe = rxe_get_dev_from_net(ndev); 196 struct rxe_pkt_info *pkt = SKB_TO_PKT(skb); 197 198 if (!rxe && is_vlan_dev(rdev)) { 199 rdev = vlan_dev_real_dev(ndev); 200 rxe = rxe_get_dev_from_net(rdev); 201 } 202 if (!rxe) 203 goto drop; 204 205 if (skb_linearize(skb)) { 206 pr_err("skb_linearize failed\n"); 207 ib_device_put(&rxe->ib_dev); 208 goto drop; 209 } 210 211 udph = udp_hdr(skb); 212 pkt->rxe = rxe; 213 pkt->port_num = 1; 214 pkt->hdr = (u8 *)(udph + 1); 215 pkt->mask = RXE_GRH_MASK; 216 pkt->paylen = be16_to_cpu(udph->len) - sizeof(*udph); 217 218 rxe_rcv(skb); 219 220 /* 221 * FIXME: this is in the wrong place, it needs to be done when pkt is 222 * destroyed 223 */ 224 ib_device_put(&rxe->ib_dev); 225 226 return 0; 227 drop: 228 kfree_skb(skb); 229 230 return 0; 231 } 232 233 static struct socket *rxe_setup_udp_tunnel(struct net *net, __be16 port, 234 bool ipv6) 235 { 236 int err; 237 struct socket *sock; 238 struct udp_port_cfg udp_cfg = { }; 239 struct udp_tunnel_sock_cfg tnl_cfg = { }; 240 241 if (ipv6) { 242 udp_cfg.family = AF_INET6; 243 udp_cfg.ipv6_v6only = 1; 244 } else { 245 udp_cfg.family = AF_INET; 246 } 247 248 udp_cfg.local_udp_port = port; 249 250 /* Create UDP socket */ 251 err = udp_sock_create(net, &udp_cfg, &sock); 252 if (err < 0) { 253 pr_err("failed to create udp socket. err = %d\n", err); 254 return ERR_PTR(err); 255 } 256 257 tnl_cfg.encap_type = 1; 258 tnl_cfg.encap_rcv = rxe_udp_encap_recv; 259 260 /* Setup UDP tunnel */ 261 setup_udp_tunnel_sock(net, sock, &tnl_cfg); 262 263 return sock; 264 } 265 266 static void rxe_release_udp_tunnel(struct socket *sk) 267 { 268 if (sk) 269 udp_tunnel_sock_release(sk); 270 } 271 272 static void prepare_udp_hdr(struct sk_buff *skb, __be16 src_port, 273 __be16 dst_port) 274 { 275 struct udphdr *udph; 276 277 __skb_push(skb, sizeof(*udph)); 278 skb_reset_transport_header(skb); 279 udph = udp_hdr(skb); 280 281 udph->dest = dst_port; 282 udph->source = src_port; 283 udph->len = htons(skb->len); 284 udph->check = 0; 285 } 286 287 static void prepare_ipv4_hdr(struct dst_entry *dst, struct sk_buff *skb, 288 __be32 saddr, __be32 daddr, __u8 proto, 289 __u8 tos, __u8 ttl, __be16 df, bool xnet) 290 { 291 struct iphdr *iph; 292 293 skb_scrub_packet(skb, xnet); 294 295 skb_clear_hash(skb); 296 skb_dst_set(skb, dst_clone(dst)); 297 memset(IPCB(skb), 0, sizeof(*IPCB(skb))); 298 299 skb_push(skb, sizeof(struct iphdr)); 300 skb_reset_network_header(skb); 301 302 iph = ip_hdr(skb); 303 304 iph->version = IPVERSION; 305 iph->ihl = sizeof(struct iphdr) >> 2; 306 iph->frag_off = df; 307 iph->protocol = proto; 308 iph->tos = tos; 309 iph->daddr = daddr; 310 iph->saddr = saddr; 311 iph->ttl = ttl; 312 __ip_select_ident(dev_net(dst->dev), iph, 313 skb_shinfo(skb)->gso_segs ?: 1); 314 iph->tot_len = htons(skb->len); 315 ip_send_check(iph); 316 } 317 318 static void prepare_ipv6_hdr(struct dst_entry *dst, struct sk_buff *skb, 319 struct in6_addr *saddr, struct in6_addr *daddr, 320 __u8 proto, __u8 prio, __u8 ttl) 321 { 322 struct ipv6hdr *ip6h; 323 324 memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); 325 IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED 326 | IPSKB_REROUTED); 327 skb_dst_set(skb, dst_clone(dst)); 328 329 __skb_push(skb, sizeof(*ip6h)); 330 skb_reset_network_header(skb); 331 ip6h = ipv6_hdr(skb); 332 ip6_flow_hdr(ip6h, prio, htonl(0)); 333 ip6h->payload_len = htons(skb->len); 334 ip6h->nexthdr = proto; 335 ip6h->hop_limit = ttl; 336 ip6h->daddr = *daddr; 337 ip6h->saddr = *saddr; 338 ip6h->payload_len = htons(skb->len - sizeof(*ip6h)); 339 } 340 341 static int prepare4(struct rxe_pkt_info *pkt, struct sk_buff *skb) 342 { 343 struct rxe_qp *qp = pkt->qp; 344 struct dst_entry *dst; 345 bool xnet = false; 346 __be16 df = htons(IP_DF); 347 struct rxe_av *av = rxe_get_av(pkt); 348 struct in_addr *saddr = &av->sgid_addr._sockaddr_in.sin_addr; 349 struct in_addr *daddr = &av->dgid_addr._sockaddr_in.sin_addr; 350 351 dst = rxe_find_route(skb->dev, qp, av); 352 if (!dst) { 353 pr_err("Host not reachable\n"); 354 return -EHOSTUNREACH; 355 } 356 357 prepare_udp_hdr(skb, cpu_to_be16(qp->src_port), 358 cpu_to_be16(ROCE_V2_UDP_DPORT)); 359 360 prepare_ipv4_hdr(dst, skb, saddr->s_addr, daddr->s_addr, IPPROTO_UDP, 361 av->grh.traffic_class, av->grh.hop_limit, df, xnet); 362 363 dst_release(dst); 364 return 0; 365 } 366 367 static int prepare6(struct rxe_pkt_info *pkt, struct sk_buff *skb) 368 { 369 struct rxe_qp *qp = pkt->qp; 370 struct dst_entry *dst; 371 struct rxe_av *av = rxe_get_av(pkt); 372 struct in6_addr *saddr = &av->sgid_addr._sockaddr_in6.sin6_addr; 373 struct in6_addr *daddr = &av->dgid_addr._sockaddr_in6.sin6_addr; 374 375 dst = rxe_find_route(skb->dev, qp, av); 376 if (!dst) { 377 pr_err("Host not reachable\n"); 378 return -EHOSTUNREACH; 379 } 380 381 prepare_udp_hdr(skb, cpu_to_be16(qp->src_port), 382 cpu_to_be16(ROCE_V2_UDP_DPORT)); 383 384 prepare_ipv6_hdr(dst, skb, saddr, daddr, IPPROTO_UDP, 385 av->grh.traffic_class, 386 av->grh.hop_limit); 387 388 dst_release(dst); 389 return 0; 390 } 391 392 int rxe_prepare(struct rxe_pkt_info *pkt, struct sk_buff *skb, u32 *crc) 393 { 394 int err = 0; 395 396 if (skb->protocol == htons(ETH_P_IP)) 397 err = prepare4(pkt, skb); 398 else if (skb->protocol == htons(ETH_P_IPV6)) 399 err = prepare6(pkt, skb); 400 401 *crc = rxe_icrc_hdr(pkt, skb); 402 403 if (ether_addr_equal(skb->dev->dev_addr, rxe_get_av(pkt)->dmac)) 404 pkt->mask |= RXE_LOOPBACK_MASK; 405 406 return err; 407 } 408 409 static void rxe_skb_tx_dtor(struct sk_buff *skb) 410 { 411 struct sock *sk = skb->sk; 412 struct rxe_qp *qp = sk->sk_user_data; 413 int skb_out = atomic_dec_return(&qp->skb_out); 414 415 if (unlikely(qp->need_req_skb && 416 skb_out < RXE_INFLIGHT_SKBS_PER_QP_LOW)) 417 rxe_run_task(&qp->req.task, 1); 418 419 rxe_drop_ref(qp); 420 } 421 422 int rxe_send(struct rxe_pkt_info *pkt, struct sk_buff *skb) 423 { 424 int err; 425 426 skb->destructor = rxe_skb_tx_dtor; 427 skb->sk = pkt->qp->sk->sk; 428 429 rxe_add_ref(pkt->qp); 430 atomic_inc(&pkt->qp->skb_out); 431 432 if (skb->protocol == htons(ETH_P_IP)) { 433 err = ip_local_out(dev_net(skb_dst(skb)->dev), skb->sk, skb); 434 } else if (skb->protocol == htons(ETH_P_IPV6)) { 435 err = ip6_local_out(dev_net(skb_dst(skb)->dev), skb->sk, skb); 436 } else { 437 pr_err("Unknown layer 3 protocol: %d\n", skb->protocol); 438 atomic_dec(&pkt->qp->skb_out); 439 rxe_drop_ref(pkt->qp); 440 kfree_skb(skb); 441 return -EINVAL; 442 } 443 444 if (unlikely(net_xmit_eval(err))) { 445 pr_debug("error sending packet: %d\n", err); 446 return -EAGAIN; 447 } 448 449 return 0; 450 } 451 452 void rxe_loopback(struct sk_buff *skb) 453 { 454 rxe_rcv(skb); 455 } 456 457 struct sk_buff *rxe_init_packet(struct rxe_dev *rxe, struct rxe_av *av, 458 int paylen, struct rxe_pkt_info *pkt) 459 { 460 unsigned int hdr_len; 461 struct sk_buff *skb = NULL; 462 struct net_device *ndev; 463 const struct ib_gid_attr *attr; 464 const int port_num = 1; 465 466 attr = rdma_get_gid_attr(&rxe->ib_dev, port_num, av->grh.sgid_index); 467 if (IS_ERR(attr)) 468 return NULL; 469 470 if (av->network_type == RDMA_NETWORK_IPV4) 471 hdr_len = ETH_HLEN + sizeof(struct udphdr) + 472 sizeof(struct iphdr); 473 else 474 hdr_len = ETH_HLEN + sizeof(struct udphdr) + 475 sizeof(struct ipv6hdr); 476 477 rcu_read_lock(); 478 ndev = rdma_read_gid_attr_ndev_rcu(attr); 479 if (IS_ERR(ndev)) { 480 rcu_read_unlock(); 481 goto out; 482 } 483 skb = alloc_skb(paylen + hdr_len + LL_RESERVED_SPACE(ndev), 484 GFP_ATOMIC); 485 486 if (unlikely(!skb)) { 487 rcu_read_unlock(); 488 goto out; 489 } 490 491 skb_reserve(skb, hdr_len + LL_RESERVED_SPACE(ndev)); 492 493 /* FIXME: hold reference to this netdev until life of this skb. */ 494 skb->dev = ndev; 495 rcu_read_unlock(); 496 497 if (av->network_type == RDMA_NETWORK_IPV4) 498 skb->protocol = htons(ETH_P_IP); 499 else 500 skb->protocol = htons(ETH_P_IPV6); 501 502 pkt->rxe = rxe; 503 pkt->port_num = port_num; 504 pkt->hdr = skb_put_zero(skb, paylen); 505 pkt->mask |= RXE_GRH_MASK; 506 507 out: 508 rdma_put_gid_attr(attr); 509 return skb; 510 } 511 512 /* 513 * this is required by rxe_cfg to match rxe devices in 514 * /sys/class/infiniband up with their underlying ethernet devices 515 */ 516 const char *rxe_parent_name(struct rxe_dev *rxe, unsigned int port_num) 517 { 518 return rxe->ndev->name; 519 } 520 521 enum rdma_link_layer rxe_link_layer(struct rxe_dev *rxe, unsigned int port_num) 522 { 523 return IB_LINK_LAYER_ETHERNET; 524 } 525 526 int rxe_net_add(const char *ibdev_name, struct net_device *ndev) 527 { 528 int err; 529 struct rxe_dev *rxe = NULL; 530 531 rxe = ib_alloc_device(rxe_dev, ib_dev); 532 if (!rxe) 533 return -ENOMEM; 534 535 rxe->ndev = ndev; 536 537 err = rxe_add(rxe, ndev->mtu, ibdev_name); 538 if (err) { 539 ib_dealloc_device(&rxe->ib_dev); 540 return err; 541 } 542 543 return 0; 544 } 545 546 static void rxe_port_event(struct rxe_dev *rxe, 547 enum ib_event_type event) 548 { 549 struct ib_event ev; 550 551 ev.device = &rxe->ib_dev; 552 ev.element.port_num = 1; 553 ev.event = event; 554 555 ib_dispatch_event(&ev); 556 } 557 558 /* Caller must hold net_info_lock */ 559 void rxe_port_up(struct rxe_dev *rxe) 560 { 561 struct rxe_port *port; 562 563 port = &rxe->port; 564 port->attr.state = IB_PORT_ACTIVE; 565 566 rxe_port_event(rxe, IB_EVENT_PORT_ACTIVE); 567 dev_info(&rxe->ib_dev.dev, "set active\n"); 568 } 569 570 /* Caller must hold net_info_lock */ 571 void rxe_port_down(struct rxe_dev *rxe) 572 { 573 struct rxe_port *port; 574 575 port = &rxe->port; 576 port->attr.state = IB_PORT_DOWN; 577 578 rxe_port_event(rxe, IB_EVENT_PORT_ERR); 579 rxe_counter_inc(rxe, RXE_CNT_LINK_DOWNED); 580 dev_info(&rxe->ib_dev.dev, "set down\n"); 581 } 582 583 void rxe_set_port_state(struct rxe_dev *rxe) 584 { 585 if (netif_running(rxe->ndev) && netif_carrier_ok(rxe->ndev)) 586 rxe_port_up(rxe); 587 else 588 rxe_port_down(rxe); 589 } 590 591 static int rxe_notify(struct notifier_block *not_blk, 592 unsigned long event, 593 void *arg) 594 { 595 struct net_device *ndev = netdev_notifier_info_to_dev(arg); 596 struct rxe_dev *rxe = rxe_get_dev_from_net(ndev); 597 598 if (!rxe) 599 return NOTIFY_OK; 600 601 switch (event) { 602 case NETDEV_UNREGISTER: 603 ib_unregister_device_queued(&rxe->ib_dev); 604 break; 605 case NETDEV_UP: 606 rxe_port_up(rxe); 607 break; 608 case NETDEV_DOWN: 609 rxe_port_down(rxe); 610 break; 611 case NETDEV_CHANGEMTU: 612 pr_info("%s changed mtu to %d\n", ndev->name, ndev->mtu); 613 rxe_set_mtu(rxe, ndev->mtu); 614 break; 615 case NETDEV_CHANGE: 616 rxe_set_port_state(rxe); 617 break; 618 case NETDEV_REBOOT: 619 case NETDEV_GOING_DOWN: 620 case NETDEV_CHANGEADDR: 621 case NETDEV_CHANGENAME: 622 case NETDEV_FEAT_CHANGE: 623 default: 624 pr_info("ignoring netdev event = %ld for %s\n", 625 event, ndev->name); 626 break; 627 } 628 629 ib_device_put(&rxe->ib_dev); 630 return NOTIFY_OK; 631 } 632 633 static struct notifier_block rxe_net_notifier = { 634 .notifier_call = rxe_notify, 635 }; 636 637 static int rxe_net_ipv4_init(void) 638 { 639 recv_sockets.sk4 = rxe_setup_udp_tunnel(&init_net, 640 htons(ROCE_V2_UDP_DPORT), false); 641 if (IS_ERR(recv_sockets.sk4)) { 642 recv_sockets.sk4 = NULL; 643 pr_err("Failed to create IPv4 UDP tunnel\n"); 644 return -1; 645 } 646 647 return 0; 648 } 649 650 static int rxe_net_ipv6_init(void) 651 { 652 #if IS_ENABLED(CONFIG_IPV6) 653 654 recv_sockets.sk6 = rxe_setup_udp_tunnel(&init_net, 655 htons(ROCE_V2_UDP_DPORT), true); 656 if (IS_ERR(recv_sockets.sk6)) { 657 recv_sockets.sk6 = NULL; 658 pr_err("Failed to create IPv6 UDP tunnel\n"); 659 return -1; 660 } 661 #endif 662 return 0; 663 } 664 665 void rxe_net_exit(void) 666 { 667 rxe_release_udp_tunnel(recv_sockets.sk6); 668 rxe_release_udp_tunnel(recv_sockets.sk4); 669 unregister_netdevice_notifier(&rxe_net_notifier); 670 } 671 672 int rxe_net_init(void) 673 { 674 int err; 675 676 recv_sockets.sk6 = NULL; 677 678 err = rxe_net_ipv4_init(); 679 if (err) 680 return err; 681 err = rxe_net_ipv6_init(); 682 if (err) 683 goto err_out; 684 err = register_netdevice_notifier(&rxe_net_notifier); 685 if (err) { 686 pr_err("Failed to register netdev notifier\n"); 687 goto err_out; 688 } 689 return 0; 690 err_out: 691 rxe_net_exit(); 692 return err; 693 } 694