1 /* 2 * drivers/net/veth.c 3 * 4 * Copyright (C) 2007 OpenVZ http://openvz.org, SWsoft Inc 5 * 6 * Author: Pavel Emelianov <xemul@openvz.org> 7 * Ethtool interface from: Eric W. Biederman <ebiederm@xmission.com> 8 * 9 */ 10 11 #include <linux/netdevice.h> 12 #include <linux/slab.h> 13 #include <linux/ethtool.h> 14 #include <linux/etherdevice.h> 15 #include <linux/u64_stats_sync.h> 16 17 #include <net/rtnetlink.h> 18 #include <net/dst.h> 19 #include <net/xfrm.h> 20 #include <net/xdp.h> 21 #include <linux/veth.h> 22 #include <linux/module.h> 23 #include <linux/bpf.h> 24 #include <linux/filter.h> 25 #include <linux/ptr_ring.h> 26 #include <linux/bpf_trace.h> 27 #include <linux/net_tstamp.h> 28 29 #define DRV_NAME "veth" 30 #define DRV_VERSION "1.0" 31 32 #define VETH_XDP_FLAG BIT(0) 33 #define VETH_RING_SIZE 256 34 #define VETH_XDP_HEADROOM (XDP_PACKET_HEADROOM + NET_IP_ALIGN) 35 36 /* Separating two types of XDP xmit */ 37 #define VETH_XDP_TX BIT(0) 38 #define VETH_XDP_REDIR BIT(1) 39 40 struct pcpu_vstats { 41 u64 packets; 42 u64 bytes; 43 struct u64_stats_sync syncp; 44 }; 45 46 struct veth_rq { 47 struct napi_struct xdp_napi; 48 struct net_device *dev; 49 struct bpf_prog __rcu *xdp_prog; 50 struct xdp_mem_info xdp_mem; 51 bool rx_notify_masked; 52 struct ptr_ring xdp_ring; 53 struct xdp_rxq_info xdp_rxq; 54 }; 55 56 struct veth_priv { 57 struct net_device __rcu *peer; 58 atomic64_t dropped; 59 struct bpf_prog *_xdp_prog; 60 struct veth_rq *rq; 61 unsigned int requested_headroom; 62 }; 63 64 /* 65 * ethtool interface 66 */ 67 68 static struct { 69 const char string[ETH_GSTRING_LEN]; 70 } ethtool_stats_keys[] = { 71 { "peer_ifindex" }, 72 }; 73 74 static int veth_get_link_ksettings(struct net_device *dev, 75 struct ethtool_link_ksettings *cmd) 76 { 77 cmd->base.speed = SPEED_10000; 78 cmd->base.duplex = DUPLEX_FULL; 79 cmd->base.port = PORT_TP; 80 cmd->base.autoneg = AUTONEG_DISABLE; 81 return 0; 82 } 83 84 static void veth_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info) 85 { 86 strlcpy(info->driver, DRV_NAME, sizeof(info->driver)); 87 strlcpy(info->version, DRV_VERSION, sizeof(info->version)); 88 } 89 90 static void veth_get_strings(struct net_device *dev, u32 stringset, u8 *buf) 91 { 92 switch(stringset) { 93 case ETH_SS_STATS: 94 memcpy(buf, ðtool_stats_keys, sizeof(ethtool_stats_keys)); 95 break; 96 } 97 } 98 99 static int veth_get_sset_count(struct net_device *dev, int sset) 100 { 101 switch (sset) { 102 case ETH_SS_STATS: 103 return ARRAY_SIZE(ethtool_stats_keys); 104 default: 105 return -EOPNOTSUPP; 106 } 107 } 108 109 static void veth_get_ethtool_stats(struct net_device *dev, 110 struct ethtool_stats *stats, u64 *data) 111 { 112 struct veth_priv *priv = netdev_priv(dev); 113 struct net_device *peer = rtnl_dereference(priv->peer); 114 115 data[0] = peer ? peer->ifindex : 0; 116 } 117 118 static int veth_get_ts_info(struct net_device *dev, 119 struct ethtool_ts_info *info) 120 { 121 info->so_timestamping = 122 SOF_TIMESTAMPING_TX_SOFTWARE | 123 SOF_TIMESTAMPING_RX_SOFTWARE | 124 SOF_TIMESTAMPING_SOFTWARE; 125 info->phc_index = -1; 126 127 return 0; 128 } 129 130 static const struct ethtool_ops veth_ethtool_ops = { 131 .get_drvinfo = veth_get_drvinfo, 132 .get_link = ethtool_op_get_link, 133 .get_strings = veth_get_strings, 134 .get_sset_count = veth_get_sset_count, 135 .get_ethtool_stats = veth_get_ethtool_stats, 136 .get_link_ksettings = veth_get_link_ksettings, 137 .get_ts_info = veth_get_ts_info, 138 }; 139 140 /* general routines */ 141 142 static bool veth_is_xdp_frame(void *ptr) 143 { 144 return (unsigned long)ptr & VETH_XDP_FLAG; 145 } 146 147 static void *veth_ptr_to_xdp(void *ptr) 148 { 149 return (void *)((unsigned long)ptr & ~VETH_XDP_FLAG); 150 } 151 152 static void *veth_xdp_to_ptr(void *ptr) 153 { 154 return (void *)((unsigned long)ptr | VETH_XDP_FLAG); 155 } 156 157 static void veth_ptr_free(void *ptr) 158 { 159 if (veth_is_xdp_frame(ptr)) 160 xdp_return_frame(veth_ptr_to_xdp(ptr)); 161 else 162 kfree_skb(ptr); 163 } 164 165 static void __veth_xdp_flush(struct veth_rq *rq) 166 { 167 /* Write ptr_ring before reading rx_notify_masked */ 168 smp_mb(); 169 if (!rq->rx_notify_masked) { 170 rq->rx_notify_masked = true; 171 napi_schedule(&rq->xdp_napi); 172 } 173 } 174 175 static int veth_xdp_rx(struct veth_rq *rq, struct sk_buff *skb) 176 { 177 if (unlikely(ptr_ring_produce(&rq->xdp_ring, skb))) { 178 dev_kfree_skb_any(skb); 179 return NET_RX_DROP; 180 } 181 182 return NET_RX_SUCCESS; 183 } 184 185 static int veth_forward_skb(struct net_device *dev, struct sk_buff *skb, 186 struct veth_rq *rq, bool xdp) 187 { 188 return __dev_forward_skb(dev, skb) ?: xdp ? 189 veth_xdp_rx(rq, skb) : 190 netif_rx(skb); 191 } 192 193 static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev) 194 { 195 struct veth_priv *rcv_priv, *priv = netdev_priv(dev); 196 struct veth_rq *rq = NULL; 197 struct net_device *rcv; 198 int length = skb->len; 199 bool rcv_xdp = false; 200 int rxq; 201 202 rcu_read_lock(); 203 rcv = rcu_dereference(priv->peer); 204 if (unlikely(!rcv)) { 205 kfree_skb(skb); 206 goto drop; 207 } 208 209 rcv_priv = netdev_priv(rcv); 210 rxq = skb_get_queue_mapping(skb); 211 if (rxq < rcv->real_num_rx_queues) { 212 rq = &rcv_priv->rq[rxq]; 213 rcv_xdp = rcu_access_pointer(rq->xdp_prog); 214 if (rcv_xdp) 215 skb_record_rx_queue(skb, rxq); 216 } 217 218 skb_tx_timestamp(skb); 219 if (likely(veth_forward_skb(rcv, skb, rq, rcv_xdp) == NET_RX_SUCCESS)) { 220 struct pcpu_vstats *stats = this_cpu_ptr(dev->vstats); 221 222 u64_stats_update_begin(&stats->syncp); 223 stats->bytes += length; 224 stats->packets++; 225 u64_stats_update_end(&stats->syncp); 226 } else { 227 drop: 228 atomic64_inc(&priv->dropped); 229 } 230 231 if (rcv_xdp) 232 __veth_xdp_flush(rq); 233 234 rcu_read_unlock(); 235 236 return NETDEV_TX_OK; 237 } 238 239 static u64 veth_stats_one(struct pcpu_vstats *result, struct net_device *dev) 240 { 241 struct veth_priv *priv = netdev_priv(dev); 242 int cpu; 243 244 result->packets = 0; 245 result->bytes = 0; 246 for_each_possible_cpu(cpu) { 247 struct pcpu_vstats *stats = per_cpu_ptr(dev->vstats, cpu); 248 u64 packets, bytes; 249 unsigned int start; 250 251 do { 252 start = u64_stats_fetch_begin_irq(&stats->syncp); 253 packets = stats->packets; 254 bytes = stats->bytes; 255 } while (u64_stats_fetch_retry_irq(&stats->syncp, start)); 256 result->packets += packets; 257 result->bytes += bytes; 258 } 259 return atomic64_read(&priv->dropped); 260 } 261 262 static void veth_get_stats64(struct net_device *dev, 263 struct rtnl_link_stats64 *tot) 264 { 265 struct veth_priv *priv = netdev_priv(dev); 266 struct net_device *peer; 267 struct pcpu_vstats one; 268 269 tot->tx_dropped = veth_stats_one(&one, dev); 270 tot->tx_bytes = one.bytes; 271 tot->tx_packets = one.packets; 272 273 rcu_read_lock(); 274 peer = rcu_dereference(priv->peer); 275 if (peer) { 276 tot->rx_dropped = veth_stats_one(&one, peer); 277 tot->rx_bytes = one.bytes; 278 tot->rx_packets = one.packets; 279 } 280 rcu_read_unlock(); 281 } 282 283 /* fake multicast ability */ 284 static void veth_set_multicast_list(struct net_device *dev) 285 { 286 } 287 288 static struct sk_buff *veth_build_skb(void *head, int headroom, int len, 289 int buflen) 290 { 291 struct sk_buff *skb; 292 293 if (!buflen) { 294 buflen = SKB_DATA_ALIGN(headroom + len) + 295 SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); 296 } 297 skb = build_skb(head, buflen); 298 if (!skb) 299 return NULL; 300 301 skb_reserve(skb, headroom); 302 skb_put(skb, len); 303 304 return skb; 305 } 306 307 static int veth_select_rxq(struct net_device *dev) 308 { 309 return smp_processor_id() % dev->real_num_rx_queues; 310 } 311 312 static int veth_xdp_xmit(struct net_device *dev, int n, 313 struct xdp_frame **frames, u32 flags) 314 { 315 struct veth_priv *rcv_priv, *priv = netdev_priv(dev); 316 struct net_device *rcv; 317 unsigned int max_len; 318 struct veth_rq *rq; 319 int i, drops = 0; 320 321 if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK)) 322 return -EINVAL; 323 324 rcv = rcu_dereference(priv->peer); 325 if (unlikely(!rcv)) 326 return -ENXIO; 327 328 rcv_priv = netdev_priv(rcv); 329 rq = &rcv_priv->rq[veth_select_rxq(rcv)]; 330 /* Non-NULL xdp_prog ensures that xdp_ring is initialized on receive 331 * side. This means an XDP program is loaded on the peer and the peer 332 * device is up. 333 */ 334 if (!rcu_access_pointer(rq->xdp_prog)) 335 return -ENXIO; 336 337 max_len = rcv->mtu + rcv->hard_header_len + VLAN_HLEN; 338 339 spin_lock(&rq->xdp_ring.producer_lock); 340 for (i = 0; i < n; i++) { 341 struct xdp_frame *frame = frames[i]; 342 void *ptr = veth_xdp_to_ptr(frame); 343 344 if (unlikely(frame->len > max_len || 345 __ptr_ring_produce(&rq->xdp_ring, ptr))) { 346 xdp_return_frame_rx_napi(frame); 347 drops++; 348 } 349 } 350 spin_unlock(&rq->xdp_ring.producer_lock); 351 352 if (flags & XDP_XMIT_FLUSH) 353 __veth_xdp_flush(rq); 354 355 return n - drops; 356 } 357 358 static void veth_xdp_flush(struct net_device *dev) 359 { 360 struct veth_priv *rcv_priv, *priv = netdev_priv(dev); 361 struct net_device *rcv; 362 struct veth_rq *rq; 363 364 rcu_read_lock(); 365 rcv = rcu_dereference(priv->peer); 366 if (unlikely(!rcv)) 367 goto out; 368 369 rcv_priv = netdev_priv(rcv); 370 rq = &rcv_priv->rq[veth_select_rxq(rcv)]; 371 /* xdp_ring is initialized on receive side? */ 372 if (unlikely(!rcu_access_pointer(rq->xdp_prog))) 373 goto out; 374 375 __veth_xdp_flush(rq); 376 out: 377 rcu_read_unlock(); 378 } 379 380 static int veth_xdp_tx(struct net_device *dev, struct xdp_buff *xdp) 381 { 382 struct xdp_frame *frame = convert_to_xdp_frame(xdp); 383 384 if (unlikely(!frame)) 385 return -EOVERFLOW; 386 387 return veth_xdp_xmit(dev, 1, &frame, 0); 388 } 389 390 static struct sk_buff *veth_xdp_rcv_one(struct veth_rq *rq, 391 struct xdp_frame *frame, 392 unsigned int *xdp_xmit) 393 { 394 void *hard_start = frame->data - frame->headroom; 395 void *head = hard_start - sizeof(struct xdp_frame); 396 int len = frame->len, delta = 0; 397 struct xdp_frame orig_frame; 398 struct bpf_prog *xdp_prog; 399 unsigned int headroom; 400 struct sk_buff *skb; 401 402 rcu_read_lock(); 403 xdp_prog = rcu_dereference(rq->xdp_prog); 404 if (likely(xdp_prog)) { 405 struct xdp_buff xdp; 406 u32 act; 407 408 xdp.data_hard_start = hard_start; 409 xdp.data = frame->data; 410 xdp.data_end = frame->data + frame->len; 411 xdp.data_meta = frame->data - frame->metasize; 412 xdp.rxq = &rq->xdp_rxq; 413 414 act = bpf_prog_run_xdp(xdp_prog, &xdp); 415 416 switch (act) { 417 case XDP_PASS: 418 delta = frame->data - xdp.data; 419 len = xdp.data_end - xdp.data; 420 break; 421 case XDP_TX: 422 orig_frame = *frame; 423 xdp.data_hard_start = head; 424 xdp.rxq->mem = frame->mem; 425 if (unlikely(veth_xdp_tx(rq->dev, &xdp) < 0)) { 426 trace_xdp_exception(rq->dev, xdp_prog, act); 427 frame = &orig_frame; 428 goto err_xdp; 429 } 430 *xdp_xmit |= VETH_XDP_TX; 431 rcu_read_unlock(); 432 goto xdp_xmit; 433 case XDP_REDIRECT: 434 orig_frame = *frame; 435 xdp.data_hard_start = head; 436 xdp.rxq->mem = frame->mem; 437 if (xdp_do_redirect(rq->dev, &xdp, xdp_prog)) { 438 frame = &orig_frame; 439 goto err_xdp; 440 } 441 *xdp_xmit |= VETH_XDP_REDIR; 442 rcu_read_unlock(); 443 goto xdp_xmit; 444 default: 445 bpf_warn_invalid_xdp_action(act); 446 case XDP_ABORTED: 447 trace_xdp_exception(rq->dev, xdp_prog, act); 448 case XDP_DROP: 449 goto err_xdp; 450 } 451 } 452 rcu_read_unlock(); 453 454 headroom = sizeof(struct xdp_frame) + frame->headroom - delta; 455 skb = veth_build_skb(head, headroom, len, 0); 456 if (!skb) { 457 xdp_return_frame(frame); 458 goto err; 459 } 460 461 xdp_scrub_frame(frame); 462 skb->protocol = eth_type_trans(skb, rq->dev); 463 err: 464 return skb; 465 err_xdp: 466 rcu_read_unlock(); 467 xdp_return_frame(frame); 468 xdp_xmit: 469 return NULL; 470 } 471 472 static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq, struct sk_buff *skb, 473 unsigned int *xdp_xmit) 474 { 475 u32 pktlen, headroom, act, metalen; 476 void *orig_data, *orig_data_end; 477 struct bpf_prog *xdp_prog; 478 int mac_len, delta, off; 479 struct xdp_buff xdp; 480 481 rcu_read_lock(); 482 xdp_prog = rcu_dereference(rq->xdp_prog); 483 if (unlikely(!xdp_prog)) { 484 rcu_read_unlock(); 485 goto out; 486 } 487 488 mac_len = skb->data - skb_mac_header(skb); 489 pktlen = skb->len + mac_len; 490 headroom = skb_headroom(skb) - mac_len; 491 492 if (skb_shared(skb) || skb_head_is_locked(skb) || 493 skb_is_nonlinear(skb) || headroom < XDP_PACKET_HEADROOM) { 494 struct sk_buff *nskb; 495 int size, head_off; 496 void *head, *start; 497 struct page *page; 498 499 size = SKB_DATA_ALIGN(VETH_XDP_HEADROOM + pktlen) + 500 SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); 501 if (size > PAGE_SIZE) 502 goto drop; 503 504 page = alloc_page(GFP_ATOMIC | __GFP_NOWARN); 505 if (!page) 506 goto drop; 507 508 head = page_address(page); 509 start = head + VETH_XDP_HEADROOM; 510 if (skb_copy_bits(skb, -mac_len, start, pktlen)) { 511 page_frag_free(head); 512 goto drop; 513 } 514 515 nskb = veth_build_skb(head, 516 VETH_XDP_HEADROOM + mac_len, skb->len, 517 PAGE_SIZE); 518 if (!nskb) { 519 page_frag_free(head); 520 goto drop; 521 } 522 523 skb_copy_header(nskb, skb); 524 head_off = skb_headroom(nskb) - skb_headroom(skb); 525 skb_headers_offset_update(nskb, head_off); 526 if (skb->sk) 527 skb_set_owner_w(nskb, skb->sk); 528 consume_skb(skb); 529 skb = nskb; 530 } 531 532 xdp.data_hard_start = skb->head; 533 xdp.data = skb_mac_header(skb); 534 xdp.data_end = xdp.data + pktlen; 535 xdp.data_meta = xdp.data; 536 xdp.rxq = &rq->xdp_rxq; 537 orig_data = xdp.data; 538 orig_data_end = xdp.data_end; 539 540 act = bpf_prog_run_xdp(xdp_prog, &xdp); 541 542 switch (act) { 543 case XDP_PASS: 544 break; 545 case XDP_TX: 546 get_page(virt_to_page(xdp.data)); 547 consume_skb(skb); 548 xdp.rxq->mem = rq->xdp_mem; 549 if (unlikely(veth_xdp_tx(rq->dev, &xdp) < 0)) { 550 trace_xdp_exception(rq->dev, xdp_prog, act); 551 goto err_xdp; 552 } 553 *xdp_xmit |= VETH_XDP_TX; 554 rcu_read_unlock(); 555 goto xdp_xmit; 556 case XDP_REDIRECT: 557 get_page(virt_to_page(xdp.data)); 558 consume_skb(skb); 559 xdp.rxq->mem = rq->xdp_mem; 560 if (xdp_do_redirect(rq->dev, &xdp, xdp_prog)) 561 goto err_xdp; 562 *xdp_xmit |= VETH_XDP_REDIR; 563 rcu_read_unlock(); 564 goto xdp_xmit; 565 default: 566 bpf_warn_invalid_xdp_action(act); 567 case XDP_ABORTED: 568 trace_xdp_exception(rq->dev, xdp_prog, act); 569 case XDP_DROP: 570 goto drop; 571 } 572 rcu_read_unlock(); 573 574 delta = orig_data - xdp.data; 575 off = mac_len + delta; 576 if (off > 0) 577 __skb_push(skb, off); 578 else if (off < 0) 579 __skb_pull(skb, -off); 580 skb->mac_header -= delta; 581 off = xdp.data_end - orig_data_end; 582 if (off != 0) 583 __skb_put(skb, off); 584 skb->protocol = eth_type_trans(skb, rq->dev); 585 586 metalen = xdp.data - xdp.data_meta; 587 if (metalen) 588 skb_metadata_set(skb, metalen); 589 out: 590 return skb; 591 drop: 592 rcu_read_unlock(); 593 kfree_skb(skb); 594 return NULL; 595 err_xdp: 596 rcu_read_unlock(); 597 page_frag_free(xdp.data); 598 xdp_xmit: 599 return NULL; 600 } 601 602 static int veth_xdp_rcv(struct veth_rq *rq, int budget, unsigned int *xdp_xmit) 603 { 604 int i, done = 0; 605 606 for (i = 0; i < budget; i++) { 607 void *ptr = __ptr_ring_consume(&rq->xdp_ring); 608 struct sk_buff *skb; 609 610 if (!ptr) 611 break; 612 613 if (veth_is_xdp_frame(ptr)) { 614 skb = veth_xdp_rcv_one(rq, veth_ptr_to_xdp(ptr), 615 xdp_xmit); 616 } else { 617 skb = veth_xdp_rcv_skb(rq, ptr, xdp_xmit); 618 } 619 620 if (skb) 621 napi_gro_receive(&rq->xdp_napi, skb); 622 623 done++; 624 } 625 626 return done; 627 } 628 629 static int veth_poll(struct napi_struct *napi, int budget) 630 { 631 struct veth_rq *rq = 632 container_of(napi, struct veth_rq, xdp_napi); 633 unsigned int xdp_xmit = 0; 634 int done; 635 636 xdp_set_return_frame_no_direct(); 637 done = veth_xdp_rcv(rq, budget, &xdp_xmit); 638 639 if (done < budget && napi_complete_done(napi, done)) { 640 /* Write rx_notify_masked before reading ptr_ring */ 641 smp_store_mb(rq->rx_notify_masked, false); 642 if (unlikely(!__ptr_ring_empty(&rq->xdp_ring))) { 643 rq->rx_notify_masked = true; 644 napi_schedule(&rq->xdp_napi); 645 } 646 } 647 648 if (xdp_xmit & VETH_XDP_TX) 649 veth_xdp_flush(rq->dev); 650 if (xdp_xmit & VETH_XDP_REDIR) 651 xdp_do_flush_map(); 652 xdp_clear_return_frame_no_direct(); 653 654 return done; 655 } 656 657 static int veth_napi_add(struct net_device *dev) 658 { 659 struct veth_priv *priv = netdev_priv(dev); 660 int err, i; 661 662 for (i = 0; i < dev->real_num_rx_queues; i++) { 663 struct veth_rq *rq = &priv->rq[i]; 664 665 err = ptr_ring_init(&rq->xdp_ring, VETH_RING_SIZE, GFP_KERNEL); 666 if (err) 667 goto err_xdp_ring; 668 } 669 670 for (i = 0; i < dev->real_num_rx_queues; i++) { 671 struct veth_rq *rq = &priv->rq[i]; 672 673 netif_napi_add(dev, &rq->xdp_napi, veth_poll, NAPI_POLL_WEIGHT); 674 napi_enable(&rq->xdp_napi); 675 } 676 677 return 0; 678 err_xdp_ring: 679 for (i--; i >= 0; i--) 680 ptr_ring_cleanup(&priv->rq[i].xdp_ring, veth_ptr_free); 681 682 return err; 683 } 684 685 static void veth_napi_del(struct net_device *dev) 686 { 687 struct veth_priv *priv = netdev_priv(dev); 688 int i; 689 690 for (i = 0; i < dev->real_num_rx_queues; i++) { 691 struct veth_rq *rq = &priv->rq[i]; 692 693 napi_disable(&rq->xdp_napi); 694 napi_hash_del(&rq->xdp_napi); 695 } 696 synchronize_net(); 697 698 for (i = 0; i < dev->real_num_rx_queues; i++) { 699 struct veth_rq *rq = &priv->rq[i]; 700 701 netif_napi_del(&rq->xdp_napi); 702 rq->rx_notify_masked = false; 703 ptr_ring_cleanup(&rq->xdp_ring, veth_ptr_free); 704 } 705 } 706 707 static int veth_enable_xdp(struct net_device *dev) 708 { 709 struct veth_priv *priv = netdev_priv(dev); 710 int err, i; 711 712 if (!xdp_rxq_info_is_reg(&priv->rq[0].xdp_rxq)) { 713 for (i = 0; i < dev->real_num_rx_queues; i++) { 714 struct veth_rq *rq = &priv->rq[i]; 715 716 err = xdp_rxq_info_reg(&rq->xdp_rxq, dev, i); 717 if (err < 0) 718 goto err_rxq_reg; 719 720 err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq, 721 MEM_TYPE_PAGE_SHARED, 722 NULL); 723 if (err < 0) 724 goto err_reg_mem; 725 726 /* Save original mem info as it can be overwritten */ 727 rq->xdp_mem = rq->xdp_rxq.mem; 728 } 729 730 err = veth_napi_add(dev); 731 if (err) 732 goto err_rxq_reg; 733 } 734 735 for (i = 0; i < dev->real_num_rx_queues; i++) 736 rcu_assign_pointer(priv->rq[i].xdp_prog, priv->_xdp_prog); 737 738 return 0; 739 err_reg_mem: 740 xdp_rxq_info_unreg(&priv->rq[i].xdp_rxq); 741 err_rxq_reg: 742 for (i--; i >= 0; i--) 743 xdp_rxq_info_unreg(&priv->rq[i].xdp_rxq); 744 745 return err; 746 } 747 748 static void veth_disable_xdp(struct net_device *dev) 749 { 750 struct veth_priv *priv = netdev_priv(dev); 751 int i; 752 753 for (i = 0; i < dev->real_num_rx_queues; i++) 754 rcu_assign_pointer(priv->rq[i].xdp_prog, NULL); 755 veth_napi_del(dev); 756 for (i = 0; i < dev->real_num_rx_queues; i++) { 757 struct veth_rq *rq = &priv->rq[i]; 758 759 rq->xdp_rxq.mem = rq->xdp_mem; 760 xdp_rxq_info_unreg(&rq->xdp_rxq); 761 } 762 } 763 764 static int veth_open(struct net_device *dev) 765 { 766 struct veth_priv *priv = netdev_priv(dev); 767 struct net_device *peer = rtnl_dereference(priv->peer); 768 int err; 769 770 if (!peer) 771 return -ENOTCONN; 772 773 if (priv->_xdp_prog) { 774 err = veth_enable_xdp(dev); 775 if (err) 776 return err; 777 } 778 779 if (peer->flags & IFF_UP) { 780 netif_carrier_on(dev); 781 netif_carrier_on(peer); 782 } 783 784 return 0; 785 } 786 787 static int veth_close(struct net_device *dev) 788 { 789 struct veth_priv *priv = netdev_priv(dev); 790 struct net_device *peer = rtnl_dereference(priv->peer); 791 792 netif_carrier_off(dev); 793 if (peer) 794 netif_carrier_off(peer); 795 796 if (priv->_xdp_prog) 797 veth_disable_xdp(dev); 798 799 return 0; 800 } 801 802 static int is_valid_veth_mtu(int mtu) 803 { 804 return mtu >= ETH_MIN_MTU && mtu <= ETH_MAX_MTU; 805 } 806 807 static int veth_alloc_queues(struct net_device *dev) 808 { 809 struct veth_priv *priv = netdev_priv(dev); 810 int i; 811 812 priv->rq = kcalloc(dev->num_rx_queues, sizeof(*priv->rq), GFP_KERNEL); 813 if (!priv->rq) 814 return -ENOMEM; 815 816 for (i = 0; i < dev->num_rx_queues; i++) 817 priv->rq[i].dev = dev; 818 819 return 0; 820 } 821 822 static void veth_free_queues(struct net_device *dev) 823 { 824 struct veth_priv *priv = netdev_priv(dev); 825 826 kfree(priv->rq); 827 } 828 829 static int veth_dev_init(struct net_device *dev) 830 { 831 int err; 832 833 dev->vstats = netdev_alloc_pcpu_stats(struct pcpu_vstats); 834 if (!dev->vstats) 835 return -ENOMEM; 836 837 err = veth_alloc_queues(dev); 838 if (err) { 839 free_percpu(dev->vstats); 840 return err; 841 } 842 843 return 0; 844 } 845 846 static void veth_dev_free(struct net_device *dev) 847 { 848 veth_free_queues(dev); 849 free_percpu(dev->vstats); 850 } 851 852 #ifdef CONFIG_NET_POLL_CONTROLLER 853 static void veth_poll_controller(struct net_device *dev) 854 { 855 /* veth only receives frames when its peer sends one 856 * Since it has nothing to do with disabling irqs, we are guaranteed 857 * never to have pending data when we poll for it so 858 * there is nothing to do here. 859 * 860 * We need this though so netpoll recognizes us as an interface that 861 * supports polling, which enables bridge devices in virt setups to 862 * still use netconsole 863 */ 864 } 865 #endif /* CONFIG_NET_POLL_CONTROLLER */ 866 867 static int veth_get_iflink(const struct net_device *dev) 868 { 869 struct veth_priv *priv = netdev_priv(dev); 870 struct net_device *peer; 871 int iflink; 872 873 rcu_read_lock(); 874 peer = rcu_dereference(priv->peer); 875 iflink = peer ? peer->ifindex : 0; 876 rcu_read_unlock(); 877 878 return iflink; 879 } 880 881 static netdev_features_t veth_fix_features(struct net_device *dev, 882 netdev_features_t features) 883 { 884 struct veth_priv *priv = netdev_priv(dev); 885 struct net_device *peer; 886 887 peer = rtnl_dereference(priv->peer); 888 if (peer) { 889 struct veth_priv *peer_priv = netdev_priv(peer); 890 891 if (peer_priv->_xdp_prog) 892 features &= ~NETIF_F_GSO_SOFTWARE; 893 } 894 895 return features; 896 } 897 898 static void veth_set_rx_headroom(struct net_device *dev, int new_hr) 899 { 900 struct veth_priv *peer_priv, *priv = netdev_priv(dev); 901 struct net_device *peer; 902 903 if (new_hr < 0) 904 new_hr = 0; 905 906 rcu_read_lock(); 907 peer = rcu_dereference(priv->peer); 908 if (unlikely(!peer)) 909 goto out; 910 911 peer_priv = netdev_priv(peer); 912 priv->requested_headroom = new_hr; 913 new_hr = max(priv->requested_headroom, peer_priv->requested_headroom); 914 dev->needed_headroom = new_hr; 915 peer->needed_headroom = new_hr; 916 917 out: 918 rcu_read_unlock(); 919 } 920 921 static int veth_xdp_set(struct net_device *dev, struct bpf_prog *prog, 922 struct netlink_ext_ack *extack) 923 { 924 struct veth_priv *priv = netdev_priv(dev); 925 struct bpf_prog *old_prog; 926 struct net_device *peer; 927 unsigned int max_mtu; 928 int err; 929 930 old_prog = priv->_xdp_prog; 931 priv->_xdp_prog = prog; 932 peer = rtnl_dereference(priv->peer); 933 934 if (prog) { 935 if (!peer) { 936 NL_SET_ERR_MSG_MOD(extack, "Cannot set XDP when peer is detached"); 937 err = -ENOTCONN; 938 goto err; 939 } 940 941 max_mtu = PAGE_SIZE - VETH_XDP_HEADROOM - 942 peer->hard_header_len - 943 SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); 944 if (peer->mtu > max_mtu) { 945 NL_SET_ERR_MSG_MOD(extack, "Peer MTU is too large to set XDP"); 946 err = -ERANGE; 947 goto err; 948 } 949 950 if (dev->real_num_rx_queues < peer->real_num_tx_queues) { 951 NL_SET_ERR_MSG_MOD(extack, "XDP expects number of rx queues not less than peer tx queues"); 952 err = -ENOSPC; 953 goto err; 954 } 955 956 if (dev->flags & IFF_UP) { 957 err = veth_enable_xdp(dev); 958 if (err) { 959 NL_SET_ERR_MSG_MOD(extack, "Setup for XDP failed"); 960 goto err; 961 } 962 } 963 964 if (!old_prog) { 965 peer->hw_features &= ~NETIF_F_GSO_SOFTWARE; 966 peer->max_mtu = max_mtu; 967 } 968 } 969 970 if (old_prog) { 971 if (!prog) { 972 if (dev->flags & IFF_UP) 973 veth_disable_xdp(dev); 974 975 if (peer) { 976 peer->hw_features |= NETIF_F_GSO_SOFTWARE; 977 peer->max_mtu = ETH_MAX_MTU; 978 } 979 } 980 bpf_prog_put(old_prog); 981 } 982 983 if ((!!old_prog ^ !!prog) && peer) 984 netdev_update_features(peer); 985 986 return 0; 987 err: 988 priv->_xdp_prog = old_prog; 989 990 return err; 991 } 992 993 static u32 veth_xdp_query(struct net_device *dev) 994 { 995 struct veth_priv *priv = netdev_priv(dev); 996 const struct bpf_prog *xdp_prog; 997 998 xdp_prog = priv->_xdp_prog; 999 if (xdp_prog) 1000 return xdp_prog->aux->id; 1001 1002 return 0; 1003 } 1004 1005 static int veth_xdp(struct net_device *dev, struct netdev_bpf *xdp) 1006 { 1007 switch (xdp->command) { 1008 case XDP_SETUP_PROG: 1009 return veth_xdp_set(dev, xdp->prog, xdp->extack); 1010 case XDP_QUERY_PROG: 1011 xdp->prog_id = veth_xdp_query(dev); 1012 return 0; 1013 default: 1014 return -EINVAL; 1015 } 1016 } 1017 1018 static const struct net_device_ops veth_netdev_ops = { 1019 .ndo_init = veth_dev_init, 1020 .ndo_open = veth_open, 1021 .ndo_stop = veth_close, 1022 .ndo_start_xmit = veth_xmit, 1023 .ndo_get_stats64 = veth_get_stats64, 1024 .ndo_set_rx_mode = veth_set_multicast_list, 1025 .ndo_set_mac_address = eth_mac_addr, 1026 #ifdef CONFIG_NET_POLL_CONTROLLER 1027 .ndo_poll_controller = veth_poll_controller, 1028 #endif 1029 .ndo_get_iflink = veth_get_iflink, 1030 .ndo_fix_features = veth_fix_features, 1031 .ndo_features_check = passthru_features_check, 1032 .ndo_set_rx_headroom = veth_set_rx_headroom, 1033 .ndo_bpf = veth_xdp, 1034 .ndo_xdp_xmit = veth_xdp_xmit, 1035 }; 1036 1037 #define VETH_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HW_CSUM | \ 1038 NETIF_F_RXCSUM | NETIF_F_SCTP_CRC | NETIF_F_HIGHDMA | \ 1039 NETIF_F_GSO_SOFTWARE | NETIF_F_GSO_ENCAP_ALL | \ 1040 NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX | \ 1041 NETIF_F_HW_VLAN_STAG_TX | NETIF_F_HW_VLAN_STAG_RX ) 1042 1043 static void veth_setup(struct net_device *dev) 1044 { 1045 ether_setup(dev); 1046 1047 dev->priv_flags &= ~IFF_TX_SKB_SHARING; 1048 dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; 1049 dev->priv_flags |= IFF_NO_QUEUE; 1050 dev->priv_flags |= IFF_PHONY_HEADROOM; 1051 1052 dev->netdev_ops = &veth_netdev_ops; 1053 dev->ethtool_ops = &veth_ethtool_ops; 1054 dev->features |= NETIF_F_LLTX; 1055 dev->features |= VETH_FEATURES; 1056 dev->vlan_features = dev->features & 1057 ~(NETIF_F_HW_VLAN_CTAG_TX | 1058 NETIF_F_HW_VLAN_STAG_TX | 1059 NETIF_F_HW_VLAN_CTAG_RX | 1060 NETIF_F_HW_VLAN_STAG_RX); 1061 dev->needs_free_netdev = true; 1062 dev->priv_destructor = veth_dev_free; 1063 dev->max_mtu = ETH_MAX_MTU; 1064 1065 dev->hw_features = VETH_FEATURES; 1066 dev->hw_enc_features = VETH_FEATURES; 1067 dev->mpls_features = NETIF_F_HW_CSUM | NETIF_F_GSO_SOFTWARE; 1068 } 1069 1070 /* 1071 * netlink interface 1072 */ 1073 1074 static int veth_validate(struct nlattr *tb[], struct nlattr *data[], 1075 struct netlink_ext_ack *extack) 1076 { 1077 if (tb[IFLA_ADDRESS]) { 1078 if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) 1079 return -EINVAL; 1080 if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) 1081 return -EADDRNOTAVAIL; 1082 } 1083 if (tb[IFLA_MTU]) { 1084 if (!is_valid_veth_mtu(nla_get_u32(tb[IFLA_MTU]))) 1085 return -EINVAL; 1086 } 1087 return 0; 1088 } 1089 1090 static struct rtnl_link_ops veth_link_ops; 1091 1092 static int veth_newlink(struct net *src_net, struct net_device *dev, 1093 struct nlattr *tb[], struct nlattr *data[], 1094 struct netlink_ext_ack *extack) 1095 { 1096 int err; 1097 struct net_device *peer; 1098 struct veth_priv *priv; 1099 char ifname[IFNAMSIZ]; 1100 struct nlattr *peer_tb[IFLA_MAX + 1], **tbp; 1101 unsigned char name_assign_type; 1102 struct ifinfomsg *ifmp; 1103 struct net *net; 1104 1105 /* 1106 * create and register peer first 1107 */ 1108 if (data != NULL && data[VETH_INFO_PEER] != NULL) { 1109 struct nlattr *nla_peer; 1110 1111 nla_peer = data[VETH_INFO_PEER]; 1112 ifmp = nla_data(nla_peer); 1113 err = rtnl_nla_parse_ifla(peer_tb, 1114 nla_data(nla_peer) + sizeof(struct ifinfomsg), 1115 nla_len(nla_peer) - sizeof(struct ifinfomsg), 1116 NULL); 1117 if (err < 0) 1118 return err; 1119 1120 err = veth_validate(peer_tb, NULL, extack); 1121 if (err < 0) 1122 return err; 1123 1124 tbp = peer_tb; 1125 } else { 1126 ifmp = NULL; 1127 tbp = tb; 1128 } 1129 1130 if (ifmp && tbp[IFLA_IFNAME]) { 1131 nla_strlcpy(ifname, tbp[IFLA_IFNAME], IFNAMSIZ); 1132 name_assign_type = NET_NAME_USER; 1133 } else { 1134 snprintf(ifname, IFNAMSIZ, DRV_NAME "%%d"); 1135 name_assign_type = NET_NAME_ENUM; 1136 } 1137 1138 net = rtnl_link_get_net(src_net, tbp); 1139 if (IS_ERR(net)) 1140 return PTR_ERR(net); 1141 1142 peer = rtnl_create_link(net, ifname, name_assign_type, 1143 &veth_link_ops, tbp); 1144 if (IS_ERR(peer)) { 1145 put_net(net); 1146 return PTR_ERR(peer); 1147 } 1148 1149 if (!ifmp || !tbp[IFLA_ADDRESS]) 1150 eth_hw_addr_random(peer); 1151 1152 if (ifmp && (dev->ifindex != 0)) 1153 peer->ifindex = ifmp->ifi_index; 1154 1155 peer->gso_max_size = dev->gso_max_size; 1156 peer->gso_max_segs = dev->gso_max_segs; 1157 1158 err = register_netdevice(peer); 1159 put_net(net); 1160 net = NULL; 1161 if (err < 0) 1162 goto err_register_peer; 1163 1164 netif_carrier_off(peer); 1165 1166 err = rtnl_configure_link(peer, ifmp); 1167 if (err < 0) 1168 goto err_configure_peer; 1169 1170 /* 1171 * register dev last 1172 * 1173 * note, that since we've registered new device the dev's name 1174 * should be re-allocated 1175 */ 1176 1177 if (tb[IFLA_ADDRESS] == NULL) 1178 eth_hw_addr_random(dev); 1179 1180 if (tb[IFLA_IFNAME]) 1181 nla_strlcpy(dev->name, tb[IFLA_IFNAME], IFNAMSIZ); 1182 else 1183 snprintf(dev->name, IFNAMSIZ, DRV_NAME "%%d"); 1184 1185 err = register_netdevice(dev); 1186 if (err < 0) 1187 goto err_register_dev; 1188 1189 netif_carrier_off(dev); 1190 1191 /* 1192 * tie the deviced together 1193 */ 1194 1195 priv = netdev_priv(dev); 1196 rcu_assign_pointer(priv->peer, peer); 1197 1198 priv = netdev_priv(peer); 1199 rcu_assign_pointer(priv->peer, dev); 1200 1201 return 0; 1202 1203 err_register_dev: 1204 /* nothing to do */ 1205 err_configure_peer: 1206 unregister_netdevice(peer); 1207 return err; 1208 1209 err_register_peer: 1210 free_netdev(peer); 1211 return err; 1212 } 1213 1214 static void veth_dellink(struct net_device *dev, struct list_head *head) 1215 { 1216 struct veth_priv *priv; 1217 struct net_device *peer; 1218 1219 priv = netdev_priv(dev); 1220 peer = rtnl_dereference(priv->peer); 1221 1222 /* Note : dellink() is called from default_device_exit_batch(), 1223 * before a rcu_synchronize() point. The devices are guaranteed 1224 * not being freed before one RCU grace period. 1225 */ 1226 RCU_INIT_POINTER(priv->peer, NULL); 1227 unregister_netdevice_queue(dev, head); 1228 1229 if (peer) { 1230 priv = netdev_priv(peer); 1231 RCU_INIT_POINTER(priv->peer, NULL); 1232 unregister_netdevice_queue(peer, head); 1233 } 1234 } 1235 1236 static const struct nla_policy veth_policy[VETH_INFO_MAX + 1] = { 1237 [VETH_INFO_PEER] = { .len = sizeof(struct ifinfomsg) }, 1238 }; 1239 1240 static struct net *veth_get_link_net(const struct net_device *dev) 1241 { 1242 struct veth_priv *priv = netdev_priv(dev); 1243 struct net_device *peer = rtnl_dereference(priv->peer); 1244 1245 return peer ? dev_net(peer) : dev_net(dev); 1246 } 1247 1248 static struct rtnl_link_ops veth_link_ops = { 1249 .kind = DRV_NAME, 1250 .priv_size = sizeof(struct veth_priv), 1251 .setup = veth_setup, 1252 .validate = veth_validate, 1253 .newlink = veth_newlink, 1254 .dellink = veth_dellink, 1255 .policy = veth_policy, 1256 .maxtype = VETH_INFO_MAX, 1257 .get_link_net = veth_get_link_net, 1258 }; 1259 1260 /* 1261 * init/fini 1262 */ 1263 1264 static __init int veth_init(void) 1265 { 1266 return rtnl_link_register(&veth_link_ops); 1267 } 1268 1269 static __exit void veth_exit(void) 1270 { 1271 rtnl_link_unregister(&veth_link_ops); 1272 } 1273 1274 module_init(veth_init); 1275 module_exit(veth_exit); 1276 1277 MODULE_DESCRIPTION("Virtual Ethernet Tunnel"); 1278 MODULE_LICENSE("GPL v2"); 1279 MODULE_ALIAS_RTNL_LINK(DRV_NAME); 1280