/*
 *  drivers/net/veth.c
 *
 *  Copyright (C) 2007 OpenVZ http://openvz.org, SWsoft Inc
 *
 * Author: Pavel Emelianov <xemul@openvz.org>
 * Ethtool interface from: Eric W. Biederman <ebiederm@xmission.com>
 *
 */

#include <linux/netdevice.h>
#include <linux/slab.h>
#include <linux/ethtool.h>
#include <linux/etherdevice.h>
#include <linux/u64_stats_sync.h>

#include <net/rtnetlink.h>
#include <net/dst.h>
#include <net/xfrm.h>
#include <net/xdp.h>
#include <linux/veth.h>
#include <linux/module.h>
#include <linux/bpf.h>
#include <linux/filter.h>
#include <linux/ptr_ring.h>
#include <linux/bpf_trace.h>
#include <linux/net_tstamp.h>

#define DRV_NAME	"veth"
#define DRV_VERSION	"1.0"

#define VETH_XDP_FLAG		BIT(0)
#define VETH_RING_SIZE		256
#define VETH_XDP_HEADROOM	(XDP_PACKET_HEADROOM + NET_IP_ALIGN)

/* Separating two types of XDP xmit */
#define VETH_XDP_TX		BIT(0)
#define VETH_XDP_REDIR		BIT(1)

struct veth_rq {
	struct napi_struct	xdp_napi;
	struct net_device	*dev;
	struct bpf_prog __rcu	*xdp_prog;
	struct xdp_mem_info	xdp_mem;
	bool			rx_notify_masked;
	struct ptr_ring		xdp_ring;
	struct xdp_rxq_info	xdp_rxq;
};

struct veth_priv {
	struct net_device __rcu	*peer;
	atomic64_t		dropped;
	struct bpf_prog		*_xdp_prog;
	struct veth_rq		*rq;
	unsigned int		requested_headroom;
};

/*
 * ethtool interface
 */

static struct {
	const char string[ETH_GSTRING_LEN];
} ethtool_stats_keys[] = {
	{ "peer_ifindex" },
};

static int veth_get_link_ksettings(struct net_device *dev,
				   struct ethtool_link_ksettings *cmd)
{
	cmd->base.speed		= SPEED_10000;
	cmd->base.duplex	= DUPLEX_FULL;
	cmd->base.port		= PORT_TP;
	cmd->base.autoneg	= AUTONEG_DISABLE;
	return 0;
}

static void veth_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info)
{
	strlcpy(info->driver, DRV_NAME, sizeof(info->driver));
	strlcpy(info->version, DRV_VERSION, sizeof(info->version));
}

static void veth_get_strings(struct net_device *dev, u32 stringset, u8 *buf)
{
	switch (stringset) {
	case ETH_SS_STATS:
		memcpy(buf, &ethtool_stats_keys, sizeof(ethtool_stats_keys));
		break;
	}
}

static int veth_get_sset_count(struct net_device *dev, int sset)
{
	switch (sset) {
	case ETH_SS_STATS:
		return ARRAY_SIZE(ethtool_stats_keys);
	default:
		return -EOPNOTSUPP;
	}
}

static void veth_get_ethtool_stats(struct net_device *dev,
				   struct ethtool_stats *stats, u64 *data)
{
	struct veth_priv *priv = netdev_priv(dev);
	struct net_device *peer = rtnl_dereference(priv->peer);

	data[0] = peer ? peer->ifindex : 0;
}
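/* Illustrative user-space view of the single ethtool statistic exported
 * above (the ifindex value here is made up):
 *
 *	$ ethtool -S veth0
 *	NIC statistics:
 *	     peer_ifindex: 5
 */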
static int veth_get_ts_info(struct net_device *dev,
			    struct ethtool_ts_info *info)
{
	info->so_timestamping =
		SOF_TIMESTAMPING_TX_SOFTWARE |
		SOF_TIMESTAMPING_RX_SOFTWARE |
		SOF_TIMESTAMPING_SOFTWARE;
	info->phc_index = -1;

	return 0;
}

static const struct ethtool_ops veth_ethtool_ops = {
	.get_drvinfo		= veth_get_drvinfo,
	.get_link		= ethtool_op_get_link,
	.get_strings		= veth_get_strings,
	.get_sset_count		= veth_get_sset_count,
	.get_ethtool_stats	= veth_get_ethtool_stats,
	.get_link_ksettings	= veth_get_link_ksettings,
	.get_ts_info		= veth_get_ts_info,
};

/* general routines */

static bool veth_is_xdp_frame(void *ptr)
{
	return (unsigned long)ptr & VETH_XDP_FLAG;
}

static void *veth_ptr_to_xdp(void *ptr)
{
	return (void *)((unsigned long)ptr & ~VETH_XDP_FLAG);
}

static void *veth_xdp_to_ptr(void *ptr)
{
	return (void *)((unsigned long)ptr | VETH_XDP_FLAG);
}

static void veth_ptr_free(void *ptr)
{
	if (veth_is_xdp_frame(ptr))
		xdp_return_frame(veth_ptr_to_xdp(ptr));
	else
		kfree_skb(ptr);
}

static void __veth_xdp_flush(struct veth_rq *rq)
{
	/* Write ptr_ring before reading rx_notify_masked */
	smp_mb();
	if (!rq->rx_notify_masked) {
		rq->rx_notify_masked = true;
		napi_schedule(&rq->xdp_napi);
	}
}

static int veth_xdp_rx(struct veth_rq *rq, struct sk_buff *skb)
{
	if (unlikely(ptr_ring_produce(&rq->xdp_ring, skb))) {
		dev_kfree_skb_any(skb);
		return NET_RX_DROP;
	}

	return NET_RX_SUCCESS;
}

static int veth_forward_skb(struct net_device *dev, struct sk_buff *skb,
			    struct veth_rq *rq, bool xdp)
{
	return __dev_forward_skb(dev, skb) ?: xdp ?
		veth_xdp_rx(rq, skb) :
		netif_rx(skb);
}
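/* A note on the pointer tagging used by the helpers above (a sketch, not
 * a normative description): sk_buffs and xdp_frames travel through the
 * same xdp_ring, so the low bit of the stored pointer (VETH_XDP_FLAG)
 * records which kind a consumed entry is. This works because both
 * allocations are at least word-aligned, so bit 0 of a valid pointer is
 * always zero. With a made-up address:
 *
 *	frame              = 0xffff888012345680
 *	veth_xdp_to_ptr() -> 0xffff888012345681	(tag set)
 *	veth_ptr_to_xdp() -> 0xffff888012345680	(tag cleared)
 */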
static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct veth_priv *rcv_priv, *priv = netdev_priv(dev);
	struct veth_rq *rq = NULL;
	struct net_device *rcv;
	int length = skb->len;
	bool rcv_xdp = false;
	int rxq;

	rcu_read_lock();
	rcv = rcu_dereference(priv->peer);
	if (unlikely(!rcv)) {
		kfree_skb(skb);
		goto drop;
	}

	rcv_priv = netdev_priv(rcv);
	rxq = skb_get_queue_mapping(skb);
	if (rxq < rcv->real_num_rx_queues) {
		rq = &rcv_priv->rq[rxq];
		rcv_xdp = rcu_access_pointer(rq->xdp_prog);
		if (rcv_xdp)
			skb_record_rx_queue(skb, rxq);
	}

	skb_tx_timestamp(skb);
	if (likely(veth_forward_skb(rcv, skb, rq, rcv_xdp) == NET_RX_SUCCESS)) {
		struct pcpu_lstats *stats = this_cpu_ptr(dev->lstats);

		u64_stats_update_begin(&stats->syncp);
		stats->bytes += length;
		stats->packets++;
		u64_stats_update_end(&stats->syncp);
	} else {
drop:
		atomic64_inc(&priv->dropped);
	}

	if (rcv_xdp)
		__veth_xdp_flush(rq);

	rcu_read_unlock();

	return NETDEV_TX_OK;
}

static u64 veth_stats_one(struct pcpu_lstats *result, struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);
	int cpu;

	result->packets = 0;
	result->bytes = 0;
	for_each_possible_cpu(cpu) {
		struct pcpu_lstats *stats = per_cpu_ptr(dev->lstats, cpu);
		u64 packets, bytes;
		unsigned int start;

		do {
			start = u64_stats_fetch_begin_irq(&stats->syncp);
			packets = stats->packets;
			bytes = stats->bytes;
		} while (u64_stats_fetch_retry_irq(&stats->syncp, start));
		result->packets += packets;
		result->bytes += bytes;
	}
	return atomic64_read(&priv->dropped);
}

static void veth_get_stats64(struct net_device *dev,
			     struct rtnl_link_stats64 *tot)
{
	struct veth_priv *priv = netdev_priv(dev);
	struct net_device *peer;
	struct pcpu_lstats one;

	tot->tx_dropped = veth_stats_one(&one, dev);
	tot->tx_bytes = one.bytes;
	tot->tx_packets = one.packets;

	rcu_read_lock();
	peer = rcu_dereference(priv->peer);
	if (peer) {
		tot->rx_dropped = veth_stats_one(&one, peer);
		tot->rx_bytes = one.bytes;
		tot->rx_packets = one.packets;
	}
	rcu_read_unlock();
}

/* fake multicast ability */
static void veth_set_multicast_list(struct net_device *dev)
{
}

static struct sk_buff *veth_build_skb(void *head, int headroom, int len,
				      int buflen)
{
	struct sk_buff *skb;

	if (!buflen) {
		buflen = SKB_DATA_ALIGN(headroom + len) +
			 SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
	}
	skb = build_skb(head, buflen);
	if (!skb)
		return NULL;

	skb_reserve(skb, headroom);
	skb_put(skb, len);

	return skb;
}

static int veth_select_rxq(struct net_device *dev)
{
	return smp_processor_id() % dev->real_num_rx_queues;
}
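/* Worked example for the queue selection above (illustrative numbers):
 * with real_num_rx_queues == 4, a sender running on CPU 6 enqueues to
 * peer rxq 6 % 4 == 2. Frames from a given CPU therefore always land on
 * the same peer queue, though several CPUs may share one queue when
 * there are more CPUs than queues.
 */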
static int veth_xdp_xmit(struct net_device *dev, int n,
			 struct xdp_frame **frames, u32 flags)
{
	struct veth_priv *rcv_priv, *priv = netdev_priv(dev);
	struct net_device *rcv;
	unsigned int max_len;
	struct veth_rq *rq;
	int i, drops = 0;

	if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK))
		return -EINVAL;

	rcv = rcu_dereference(priv->peer);
	if (unlikely(!rcv))
		return -ENXIO;

	rcv_priv = netdev_priv(rcv);
	rq = &rcv_priv->rq[veth_select_rxq(rcv)];
	/* Non-NULL xdp_prog ensures that xdp_ring is initialized on receive
	 * side. This means an XDP program is loaded on the peer and the peer
	 * device is up.
	 */
	if (!rcu_access_pointer(rq->xdp_prog))
		return -ENXIO;

	max_len = rcv->mtu + rcv->hard_header_len + VLAN_HLEN;

	spin_lock(&rq->xdp_ring.producer_lock);
	for (i = 0; i < n; i++) {
		struct xdp_frame *frame = frames[i];
		void *ptr = veth_xdp_to_ptr(frame);

		if (unlikely(frame->len > max_len ||
			     __ptr_ring_produce(&rq->xdp_ring, ptr))) {
			xdp_return_frame_rx_napi(frame);
			drops++;
		}
	}
	spin_unlock(&rq->xdp_ring.producer_lock);

	if (flags & XDP_XMIT_FLUSH)
		__veth_xdp_flush(rq);

	return n - drops;
}

static void veth_xdp_flush(struct net_device *dev)
{
	struct veth_priv *rcv_priv, *priv = netdev_priv(dev);
	struct net_device *rcv;
	struct veth_rq *rq;

	rcu_read_lock();
	rcv = rcu_dereference(priv->peer);
	if (unlikely(!rcv))
		goto out;

	rcv_priv = netdev_priv(rcv);
	rq = &rcv_priv->rq[veth_select_rxq(rcv)];
	/* A non-NULL xdp_prog means the xdp_ring is initialized on the
	 * receive side.
	 */
	if (unlikely(!rcu_access_pointer(rq->xdp_prog)))
		goto out;

	__veth_xdp_flush(rq);
out:
	rcu_read_unlock();
}

static int veth_xdp_tx(struct net_device *dev, struct xdp_buff *xdp)
{
	struct xdp_frame *frame = convert_to_xdp_frame(xdp);

	if (unlikely(!frame))
		return -EOVERFLOW;

	return veth_xdp_xmit(dev, 1, &frame, 0);
}

static struct sk_buff *veth_xdp_rcv_one(struct veth_rq *rq,
					struct xdp_frame *frame,
					unsigned int *xdp_xmit)
{
	void *hard_start = frame->data - frame->headroom;
	void *head = hard_start - sizeof(struct xdp_frame);
	int len = frame->len, delta = 0;
	struct xdp_frame orig_frame;
	struct bpf_prog *xdp_prog;
	unsigned int headroom;
	struct sk_buff *skb;

	rcu_read_lock();
	xdp_prog = rcu_dereference(rq->xdp_prog);
	if (likely(xdp_prog)) {
		struct xdp_buff xdp;
		u32 act;

		xdp.data_hard_start = hard_start;
		xdp.data = frame->data;
		xdp.data_end = frame->data + frame->len;
		xdp.data_meta = frame->data - frame->metasize;
		xdp.rxq = &rq->xdp_rxq;

		act = bpf_prog_run_xdp(xdp_prog, &xdp);

		switch (act) {
		case XDP_PASS:
			delta = frame->data - xdp.data;
			len = xdp.data_end - xdp.data;
			break;
		case XDP_TX:
			orig_frame = *frame;
			xdp.data_hard_start = head;
			xdp.rxq->mem = frame->mem;
			if (unlikely(veth_xdp_tx(rq->dev, &xdp) < 0)) {
				trace_xdp_exception(rq->dev, xdp_prog, act);
				frame = &orig_frame;
				goto err_xdp;
			}
			*xdp_xmit |= VETH_XDP_TX;
			rcu_read_unlock();
			goto xdp_xmit;
		case XDP_REDIRECT:
			orig_frame = *frame;
			xdp.data_hard_start = head;
			xdp.rxq->mem = frame->mem;
			if (xdp_do_redirect(rq->dev, &xdp, xdp_prog)) {
				frame = &orig_frame;
				goto err_xdp;
			}
			*xdp_xmit |= VETH_XDP_REDIR;
			rcu_read_unlock();
			goto xdp_xmit;
		default:
			bpf_warn_invalid_xdp_action(act);
			/* fall through */
		case XDP_ABORTED:
			trace_xdp_exception(rq->dev, xdp_prog, act);
			/* fall through */
		case XDP_DROP:
			goto err_xdp;
		}
	}
	rcu_read_unlock();

	headroom = sizeof(struct xdp_frame) + frame->headroom - delta;
	skb = veth_build_skb(head, headroom, len, 0);
	if (!skb) {
		xdp_return_frame(frame);
		goto err;
	}

	xdp_scrub_frame(frame);
	skb->protocol = eth_type_trans(skb, rq->dev);
err:
	return skb;
err_xdp:
	rcu_read_unlock();
	xdp_return_frame(frame);
xdp_xmit:
	return NULL;
}
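/* Buffer layout assumed by veth_xdp_rcv_one() above (a sketch derived
 * from the pointer arithmetic in that function, not a normative
 * description of struct xdp_frame):
 *
 *	head        hard_start  frame->data
 *	|           |           |
 *	v           v           v
 *	+-----------+-----------+--------------+----------------------+
 *	| xdp_frame | headroom  | packet data  | skb_shared_info ...  |
 *	+-----------+-----------+--------------+----------------------+
 *
 * The xdp_frame metadata lives at the start of its own buffer, which is
 * why head = hard_start - sizeof(struct xdp_frame), and why the frame is
 * scrubbed (xdp_scrub_frame()) before the same memory is handed to
 * build_skb() on XDP_PASS.
 */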
static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq, struct sk_buff *skb,
					unsigned int *xdp_xmit)
{
	u32 pktlen, headroom, act, metalen;
	void *orig_data, *orig_data_end;
	struct bpf_prog *xdp_prog;
	int mac_len, delta, off;
	struct xdp_buff xdp;

	skb_orphan(skb);

	rcu_read_lock();
	xdp_prog = rcu_dereference(rq->xdp_prog);
	if (unlikely(!xdp_prog)) {
		rcu_read_unlock();
		goto out;
	}

	mac_len = skb->data - skb_mac_header(skb);
	pktlen = skb->len + mac_len;
	headroom = skb_headroom(skb) - mac_len;

	if (skb_shared(skb) || skb_head_is_locked(skb) ||
	    skb_is_nonlinear(skb) || headroom < XDP_PACKET_HEADROOM) {
		struct sk_buff *nskb;
		int size, head_off;
		void *head, *start;
		struct page *page;

		size = SKB_DATA_ALIGN(VETH_XDP_HEADROOM + pktlen) +
		       SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
		if (size > PAGE_SIZE)
			goto drop;

		page = alloc_page(GFP_ATOMIC | __GFP_NOWARN);
		if (!page)
			goto drop;

		head = page_address(page);
		start = head + VETH_XDP_HEADROOM;
		if (skb_copy_bits(skb, -mac_len, start, pktlen)) {
			page_frag_free(head);
			goto drop;
		}

		nskb = veth_build_skb(head,
				      VETH_XDP_HEADROOM + mac_len, skb->len,
				      PAGE_SIZE);
		if (!nskb) {
			page_frag_free(head);
			goto drop;
		}

		skb_copy_header(nskb, skb);
		head_off = skb_headroom(nskb) - skb_headroom(skb);
		skb_headers_offset_update(nskb, head_off);
		consume_skb(skb);
		skb = nskb;
	}

	xdp.data_hard_start = skb->head;
	xdp.data = skb_mac_header(skb);
	xdp.data_end = xdp.data + pktlen;
	xdp.data_meta = xdp.data;
	xdp.rxq = &rq->xdp_rxq;
	orig_data = xdp.data;
	orig_data_end = xdp.data_end;

	act = bpf_prog_run_xdp(xdp_prog, &xdp);

	switch (act) {
	case XDP_PASS:
		break;
	case XDP_TX:
		get_page(virt_to_page(xdp.data));
		consume_skb(skb);
		xdp.rxq->mem = rq->xdp_mem;
		if (unlikely(veth_xdp_tx(rq->dev, &xdp) < 0)) {
			trace_xdp_exception(rq->dev, xdp_prog, act);
			goto err_xdp;
		}
		*xdp_xmit |= VETH_XDP_TX;
		rcu_read_unlock();
		goto xdp_xmit;
	case XDP_REDIRECT:
		get_page(virt_to_page(xdp.data));
		consume_skb(skb);
		xdp.rxq->mem = rq->xdp_mem;
		if (xdp_do_redirect(rq->dev, &xdp, xdp_prog))
			goto err_xdp;
		*xdp_xmit |= VETH_XDP_REDIR;
		rcu_read_unlock();
		goto xdp_xmit;
	default:
		bpf_warn_invalid_xdp_action(act);
		/* fall through */
	case XDP_ABORTED:
		trace_xdp_exception(rq->dev, xdp_prog, act);
		/* fall through */
	case XDP_DROP:
		goto drop;
	}
	rcu_read_unlock();

	delta = orig_data - xdp.data;
	off = mac_len + delta;
	if (off > 0)
		__skb_push(skb, off);
	else if (off < 0)
		__skb_pull(skb, -off);
	skb->mac_header -= delta;
	off = xdp.data_end - orig_data_end;
	if (off != 0)
		__skb_put(skb, off);
	skb->protocol = eth_type_trans(skb, rq->dev);

	metalen = xdp.data - xdp.data_meta;
	if (metalen)
		skb_metadata_set(skb, metalen);
out:
	return skb;
drop:
	rcu_read_unlock();
	kfree_skb(skb);
	return NULL;
err_xdp:
	rcu_read_unlock();
	page_frag_free(xdp.data);
xdp_xmit:
	return NULL;
}
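/* Why veth_xdp_rcv_skb() sometimes reallocates: an XDP program expects a
 * single linear buffer with XDP_PACKET_HEADROOM of writable headroom, and
 * it may rewrite the packet in place. A shared, cloned, or nonlinear skb
 * (or one with too little headroom) violates those assumptions, so the
 * data is copied into a freshly allocated page first. Rough bound,
 * assuming 4 KiB pages: headroom + packet + skb_shared_info must fit in
 * one page, so packets larger than roughly PAGE_SIZE - VETH_XDP_HEADROOM -
 * SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) bytes are dropped here.
 */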
static int veth_xdp_rcv(struct veth_rq *rq, int budget, unsigned int *xdp_xmit)
{
	int i, done = 0;

	for (i = 0; i < budget; i++) {
		void *ptr = __ptr_ring_consume(&rq->xdp_ring);
		struct sk_buff *skb;

		if (!ptr)
			break;

		if (veth_is_xdp_frame(ptr)) {
			skb = veth_xdp_rcv_one(rq, veth_ptr_to_xdp(ptr),
					       xdp_xmit);
		} else {
			skb = veth_xdp_rcv_skb(rq, ptr, xdp_xmit);
		}

		if (skb)
			napi_gro_receive(&rq->xdp_napi, skb);

		done++;
	}

	return done;
}

static int veth_poll(struct napi_struct *napi, int budget)
{
	struct veth_rq *rq =
		container_of(napi, struct veth_rq, xdp_napi);
	unsigned int xdp_xmit = 0;
	int done;

	xdp_set_return_frame_no_direct();
	done = veth_xdp_rcv(rq, budget, &xdp_xmit);

	if (done < budget && napi_complete_done(napi, done)) {
		/* Write rx_notify_masked before reading ptr_ring */
		smp_store_mb(rq->rx_notify_masked, false);
		if (unlikely(!__ptr_ring_empty(&rq->xdp_ring))) {
			rq->rx_notify_masked = true;
			napi_schedule(&rq->xdp_napi);
		}
	}

	if (xdp_xmit & VETH_XDP_TX)
		veth_xdp_flush(rq->dev);
	if (xdp_xmit & VETH_XDP_REDIR)
		xdp_do_flush_map();
	xdp_clear_return_frame_no_direct();

	return done;
}

static int veth_napi_add(struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);
	int err, i;

	for (i = 0; i < dev->real_num_rx_queues; i++) {
		struct veth_rq *rq = &priv->rq[i];

		err = ptr_ring_init(&rq->xdp_ring, VETH_RING_SIZE, GFP_KERNEL);
		if (err)
			goto err_xdp_ring;
	}

	for (i = 0; i < dev->real_num_rx_queues; i++) {
		struct veth_rq *rq = &priv->rq[i];

		netif_napi_add(dev, &rq->xdp_napi, veth_poll, NAPI_POLL_WEIGHT);
		napi_enable(&rq->xdp_napi);
	}

	return 0;
err_xdp_ring:
	for (i--; i >= 0; i--)
		ptr_ring_cleanup(&priv->rq[i].xdp_ring, veth_ptr_free);

	return err;
}

static void veth_napi_del(struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);
	int i;

	for (i = 0; i < dev->real_num_rx_queues; i++) {
		struct veth_rq *rq = &priv->rq[i];

		napi_disable(&rq->xdp_napi);
		napi_hash_del(&rq->xdp_napi);
	}
	synchronize_net();

	for (i = 0; i < dev->real_num_rx_queues; i++) {
		struct veth_rq *rq = &priv->rq[i];

		netif_napi_del(&rq->xdp_napi);
		rq->rx_notify_masked = false;
		ptr_ring_cleanup(&rq->xdp_ring, veth_ptr_free);
	}
}

static int veth_enable_xdp(struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);
	int err, i;

	if (!xdp_rxq_info_is_reg(&priv->rq[0].xdp_rxq)) {
		for (i = 0; i < dev->real_num_rx_queues; i++) {
			struct veth_rq *rq = &priv->rq[i];

			err = xdp_rxq_info_reg(&rq->xdp_rxq, dev, i);
			if (err < 0)
				goto err_rxq_reg;

			err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq,
							 MEM_TYPE_PAGE_SHARED,
							 NULL);
			if (err < 0)
				goto err_reg_mem;

			/* Save original mem info as it can be overwritten */
			rq->xdp_mem = rq->xdp_rxq.mem;
		}

		err = veth_napi_add(dev);
		if (err)
			goto err_rxq_reg;
	}

	for (i = 0; i < dev->real_num_rx_queues; i++)
		rcu_assign_pointer(priv->rq[i].xdp_prog, priv->_xdp_prog);

	return 0;
err_reg_mem:
	xdp_rxq_info_unreg(&priv->rq[i].xdp_rxq);
err_rxq_reg:
	for (i--; i >= 0; i--)
		xdp_rxq_info_unreg(&priv->rq[i].xdp_rxq);

	return err;
}
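/* Informal sketch of the wakeup handshake between __veth_xdp_flush()
 * (producer side) and veth_poll() above (consumer side):
 *
 *	producer                        consumer
 *	--------                        --------
 *	ptr_ring produce
 *	smp_mb()                        napi_complete_done()
 *	read rx_notify_masked           smp_store_mb(rx_notify_masked, false)
 *	if unmasked: napi_schedule()    re-check __ptr_ring_empty()
 *
 * The paired barriers guarantee that either the producer observes
 * rx_notify_masked == false and schedules NAPI, or the consumer's
 * re-check observes the freshly produced entry; an enqueued frame can
 * therefore never be stranded in the ring with no one scheduled to
 * consume it.
 */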
static void veth_disable_xdp(struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);
	int i;

	for (i = 0; i < dev->real_num_rx_queues; i++)
		rcu_assign_pointer(priv->rq[i].xdp_prog, NULL);
	veth_napi_del(dev);
	for (i = 0; i < dev->real_num_rx_queues; i++) {
		struct veth_rq *rq = &priv->rq[i];

		rq->xdp_rxq.mem = rq->xdp_mem;
		xdp_rxq_info_unreg(&rq->xdp_rxq);
	}
}

static int veth_open(struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);
	struct net_device *peer = rtnl_dereference(priv->peer);
	int err;

	if (!peer)
		return -ENOTCONN;

	if (priv->_xdp_prog) {
		err = veth_enable_xdp(dev);
		if (err)
			return err;
	}

	if (peer->flags & IFF_UP) {
		netif_carrier_on(dev);
		netif_carrier_on(peer);
	}

	return 0;
}

static int veth_close(struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);
	struct net_device *peer = rtnl_dereference(priv->peer);

	netif_carrier_off(dev);
	if (peer)
		netif_carrier_off(peer);

	if (priv->_xdp_prog)
		veth_disable_xdp(dev);

	return 0;
}

static int is_valid_veth_mtu(int mtu)
{
	return mtu >= ETH_MIN_MTU && mtu <= ETH_MAX_MTU;
}

static int veth_alloc_queues(struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);
	int i;

	priv->rq = kcalloc(dev->num_rx_queues, sizeof(*priv->rq), GFP_KERNEL);
	if (!priv->rq)
		return -ENOMEM;

	for (i = 0; i < dev->num_rx_queues; i++)
		priv->rq[i].dev = dev;

	return 0;
}

static void veth_free_queues(struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);

	kfree(priv->rq);
}

static int veth_dev_init(struct net_device *dev)
{
	int err;

	dev->lstats = netdev_alloc_pcpu_stats(struct pcpu_lstats);
	if (!dev->lstats)
		return -ENOMEM;

	err = veth_alloc_queues(dev);
	if (err) {
		free_percpu(dev->lstats);
		return err;
	}

	return 0;
}

static void veth_dev_free(struct net_device *dev)
{
	veth_free_queues(dev);
	free_percpu(dev->lstats);
}

#ifdef CONFIG_NET_POLL_CONTROLLER
static void veth_poll_controller(struct net_device *dev)
{
	/* veth only receives frames when its peer sends one.
	 * Since it has nothing to do with disabling irqs, we are guaranteed
	 * never to have pending data when we poll for it, so there is
	 * nothing to do here.
	 *
	 * We need this though so netpoll recognizes us as an interface that
	 * supports polling, which enables bridge devices in virt setups to
	 * still use netconsole.
	 */
}
#endif	/* CONFIG_NET_POLL_CONTROLLER */

static int veth_get_iflink(const struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);
	struct net_device *peer;
	int iflink;

	rcu_read_lock();
	peer = rcu_dereference(priv->peer);
	iflink = peer ? peer->ifindex : 0;
	rcu_read_unlock();

	return iflink;
}
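/* Background for veth_fix_features() below (a summary of behavior already
 * in this file, not new policy): while an XDP program is attached to one
 * side of the pair, the other side must not send GSO packets, because
 * the XDP receive path assumes linear, page-sized buffers. veth_xdp_set()
 * therefore clears NETIF_F_GSO_SOFTWARE in the peer's hw_features, and
 * veth_fix_features() keeps it cleared across feature renegotiation.
 */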
static netdev_features_t veth_fix_features(struct net_device *dev,
					   netdev_features_t features)
{
	struct veth_priv *priv = netdev_priv(dev);
	struct net_device *peer;

	peer = rtnl_dereference(priv->peer);
	if (peer) {
		struct veth_priv *peer_priv = netdev_priv(peer);

		if (peer_priv->_xdp_prog)
			features &= ~NETIF_F_GSO_SOFTWARE;
	}

	return features;
}

static void veth_set_rx_headroom(struct net_device *dev, int new_hr)
{
	struct veth_priv *peer_priv, *priv = netdev_priv(dev);
	struct net_device *peer;

	if (new_hr < 0)
		new_hr = 0;

	rcu_read_lock();
	peer = rcu_dereference(priv->peer);
	if (unlikely(!peer))
		goto out;

	peer_priv = netdev_priv(peer);
	priv->requested_headroom = new_hr;
	new_hr = max(priv->requested_headroom, peer_priv->requested_headroom);
	dev->needed_headroom = new_hr;
	peer->needed_headroom = new_hr;

out:
	rcu_read_unlock();
}

static int veth_xdp_set(struct net_device *dev, struct bpf_prog *prog,
			struct netlink_ext_ack *extack)
{
	struct veth_priv *priv = netdev_priv(dev);
	struct bpf_prog *old_prog;
	struct net_device *peer;
	unsigned int max_mtu;
	int err;

	old_prog = priv->_xdp_prog;
	priv->_xdp_prog = prog;
	peer = rtnl_dereference(priv->peer);

	if (prog) {
		if (!peer) {
			NL_SET_ERR_MSG_MOD(extack, "Cannot set XDP when peer is detached");
			err = -ENOTCONN;
			goto err;
		}

		max_mtu = PAGE_SIZE - VETH_XDP_HEADROOM -
			  peer->hard_header_len -
			  SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
		if (peer->mtu > max_mtu) {
			NL_SET_ERR_MSG_MOD(extack, "Peer MTU is too large to set XDP");
			err = -ERANGE;
			goto err;
		}

		if (dev->real_num_rx_queues < peer->real_num_tx_queues) {
			NL_SET_ERR_MSG_MOD(extack, "XDP requires at least as many RX queues as the peer has TX queues");
			err = -ENOSPC;
			goto err;
		}

		if (dev->flags & IFF_UP) {
			err = veth_enable_xdp(dev);
			if (err) {
				NL_SET_ERR_MSG_MOD(extack, "Setup for XDP failed");
				goto err;
			}
		}

		if (!old_prog) {
			peer->hw_features &= ~NETIF_F_GSO_SOFTWARE;
			peer->max_mtu = max_mtu;
		}
	}

	if (old_prog) {
		if (!prog) {
			if (dev->flags & IFF_UP)
				veth_disable_xdp(dev);

			if (peer) {
				peer->hw_features |= NETIF_F_GSO_SOFTWARE;
				peer->max_mtu = ETH_MAX_MTU;
			}
		}
		bpf_prog_put(old_prog);
	}

	if ((!!old_prog ^ !!prog) && peer)
		netdev_update_features(peer);

	return 0;
err:
	priv->_xdp_prog = old_prog;

	return err;
}

static u32 veth_xdp_query(struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);
	const struct bpf_prog *xdp_prog;

	xdp_prog = priv->_xdp_prog;
	if (xdp_prog)
		return xdp_prog->aux->id;

	return 0;
}

static int veth_xdp(struct net_device *dev, struct netdev_bpf *xdp)
{
	switch (xdp->command) {
	case XDP_SETUP_PROG:
		return veth_xdp_set(dev, xdp->prog, xdp->extack);
	case XDP_QUERY_PROG:
		xdp->prog_id = veth_xdp_query(dev);
		return 0;
	default:
		return -EINVAL;
	}
}
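/* Worked example for the max_mtu check in veth_xdp_set() above, with
 * illustrative numbers for x86_64 (NET_IP_ALIGN == 0, 4 KiB pages; the
 * exact skb_shared_info size depends on kernel config):
 *
 *	max_mtu = 4096	(PAGE_SIZE)
 *		-  256	(VETH_XDP_HEADROOM)
 *		-   14	(hard_header_len)
 *		- SKB_DATA_ALIGN(sizeof(struct skb_shared_info))
 *	       ~= 3500 bytes
 *
 * i.e. attaching XDP requires the peer's MTU to be small enough that
 * headroom + frame + skb_shared_info fit in a single page.
 */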
static const struct net_device_ops veth_netdev_ops = {
	.ndo_init		= veth_dev_init,
	.ndo_open		= veth_open,
	.ndo_stop		= veth_close,
	.ndo_start_xmit		= veth_xmit,
	.ndo_get_stats64	= veth_get_stats64,
	.ndo_set_rx_mode	= veth_set_multicast_list,
	.ndo_set_mac_address	= eth_mac_addr,
#ifdef CONFIG_NET_POLL_CONTROLLER
	.ndo_poll_controller	= veth_poll_controller,
#endif
	.ndo_get_iflink		= veth_get_iflink,
	.ndo_fix_features	= veth_fix_features,
	.ndo_features_check	= passthru_features_check,
	.ndo_set_rx_headroom	= veth_set_rx_headroom,
	.ndo_bpf		= veth_xdp,
	.ndo_xdp_xmit		= veth_xdp_xmit,
};

#define VETH_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HW_CSUM | \
		       NETIF_F_RXCSUM | NETIF_F_SCTP_CRC | NETIF_F_HIGHDMA | \
		       NETIF_F_GSO_SOFTWARE | NETIF_F_GSO_ENCAP_ALL | \
		       NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX | \
		       NETIF_F_HW_VLAN_STAG_TX | NETIF_F_HW_VLAN_STAG_RX)

static void veth_setup(struct net_device *dev)
{
	ether_setup(dev);

	dev->priv_flags &= ~IFF_TX_SKB_SHARING;
	dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
	dev->priv_flags |= IFF_NO_QUEUE;
	dev->priv_flags |= IFF_PHONY_HEADROOM;

	dev->netdev_ops = &veth_netdev_ops;
	dev->ethtool_ops = &veth_ethtool_ops;
	dev->features |= NETIF_F_LLTX;
	dev->features |= VETH_FEATURES;
	dev->vlan_features = dev->features &
			     ~(NETIF_F_HW_VLAN_CTAG_TX |
			       NETIF_F_HW_VLAN_STAG_TX |
			       NETIF_F_HW_VLAN_CTAG_RX |
			       NETIF_F_HW_VLAN_STAG_RX);
	dev->needs_free_netdev = true;
	dev->priv_destructor = veth_dev_free;
	dev->max_mtu = ETH_MAX_MTU;

	dev->hw_features = VETH_FEATURES;
	dev->hw_enc_features = VETH_FEATURES;
	dev->mpls_features = NETIF_F_HW_CSUM | NETIF_F_GSO_SOFTWARE;
}

/*
 * netlink interface
 */

static int veth_validate(struct nlattr *tb[], struct nlattr *data[],
			 struct netlink_ext_ack *extack)
{
	if (tb[IFLA_ADDRESS]) {
		if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
			return -EINVAL;
		if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
			return -EADDRNOTAVAIL;
	}
	if (tb[IFLA_MTU]) {
		if (!is_valid_veth_mtu(nla_get_u32(tb[IFLA_MTU])))
			return -EINVAL;
	}
	return 0;
}

static struct rtnl_link_ops veth_link_ops;
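/* Typical user-space usage of the newlink path below (illustrative
 * iproute2 commands; the interface names are arbitrary):
 *
 *	# create a connected pair in one rtnetlink request
 *	ip link add veth0 type veth peer name veth1
 *
 *	# optionally move one end into another network namespace
 *	ip link set veth1 netns mynamespace
 *
 * The "peer" attribute arrives as VETH_INFO_PEER, which is why
 * veth_newlink() creates and registers the peer device first.
 */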
static int veth_newlink(struct net *src_net, struct net_device *dev,
			struct nlattr *tb[], struct nlattr *data[],
			struct netlink_ext_ack *extack)
{
	int err;
	struct net_device *peer;
	struct veth_priv *priv;
	char ifname[IFNAMSIZ];
	struct nlattr *peer_tb[IFLA_MAX + 1], **tbp;
	unsigned char name_assign_type;
	struct ifinfomsg *ifmp;
	struct net *net;

	/*
	 * create and register peer first
	 */
	if (data != NULL && data[VETH_INFO_PEER] != NULL) {
		struct nlattr *nla_peer;

		nla_peer = data[VETH_INFO_PEER];
		ifmp = nla_data(nla_peer);
		err = rtnl_nla_parse_ifla(peer_tb,
					  nla_data(nla_peer) + sizeof(struct ifinfomsg),
					  nla_len(nla_peer) - sizeof(struct ifinfomsg),
					  NULL);
		if (err < 0)
			return err;

		err = veth_validate(peer_tb, NULL, extack);
		if (err < 0)
			return err;

		tbp = peer_tb;
	} else {
		ifmp = NULL;
		tbp = tb;
	}

	if (ifmp && tbp[IFLA_IFNAME]) {
		nla_strlcpy(ifname, tbp[IFLA_IFNAME], IFNAMSIZ);
		name_assign_type = NET_NAME_USER;
	} else {
		snprintf(ifname, IFNAMSIZ, DRV_NAME "%%d");
		name_assign_type = NET_NAME_ENUM;
	}

	net = rtnl_link_get_net(src_net, tbp);
	if (IS_ERR(net))
		return PTR_ERR(net);

	peer = rtnl_create_link(net, ifname, name_assign_type,
				&veth_link_ops, tbp);
	if (IS_ERR(peer)) {
		put_net(net);
		return PTR_ERR(peer);
	}

	if (!ifmp || !tbp[IFLA_ADDRESS])
		eth_hw_addr_random(peer);

	if (ifmp && (dev->ifindex != 0))
		peer->ifindex = ifmp->ifi_index;

	peer->gso_max_size = dev->gso_max_size;
	peer->gso_max_segs = dev->gso_max_segs;

	err = register_netdevice(peer);
	put_net(net);
	net = NULL;
	if (err < 0)
		goto err_register_peer;

	netif_carrier_off(peer);

	err = rtnl_configure_link(peer, ifmp);
	if (err < 0)
		goto err_configure_peer;

	/*
	 * register dev last
	 *
	 * note that, since the peer device is already registered, this
	 * device's name may need to be re-allocated
	 */

	if (tb[IFLA_ADDRESS] == NULL)
		eth_hw_addr_random(dev);

	if (tb[IFLA_IFNAME])
		nla_strlcpy(dev->name, tb[IFLA_IFNAME], IFNAMSIZ);
	else
		snprintf(dev->name, IFNAMSIZ, DRV_NAME "%%d");

	err = register_netdevice(dev);
	if (err < 0)
		goto err_register_dev;

	netif_carrier_off(dev);

	/*
	 * tie the devices together
	 */

	priv = netdev_priv(dev);
	rcu_assign_pointer(priv->peer, peer);

	priv = netdev_priv(peer);
	rcu_assign_pointer(priv->peer, dev);

	return 0;

err_register_dev:
	/* nothing to do */
err_configure_peer:
	unregister_netdevice(peer);
	return err;

err_register_peer:
	free_netdev(peer);
	return err;
}

static void veth_dellink(struct net_device *dev, struct list_head *head)
{
	struct veth_priv *priv;
	struct net_device *peer;

	priv = netdev_priv(dev);
	peer = rtnl_dereference(priv->peer);

	/* Note : dellink() is called from default_device_exit_batch(),
	 * before a rcu_synchronize() point. The devices are guaranteed
	 * not to be freed before one RCU grace period.
	 */
	RCU_INIT_POINTER(priv->peer, NULL);
	unregister_netdevice_queue(dev, head);

	if (peer) {
		priv = netdev_priv(peer);
		RCU_INIT_POINTER(priv->peer, NULL);
		unregister_netdevice_queue(peer, head);
	}
}

static const struct nla_policy veth_policy[VETH_INFO_MAX + 1] = {
	[VETH_INFO_PEER]	= { .len = sizeof(struct ifinfomsg) },
};

static struct net *veth_get_link_net(const struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);
	struct net_device *peer = rtnl_dereference(priv->peer);

	return peer ? dev_net(peer) : dev_net(dev);
}

static struct rtnl_link_ops veth_link_ops = {
	.kind		= DRV_NAME,
	.priv_size	= sizeof(struct veth_priv),
	.setup		= veth_setup,
	.validate	= veth_validate,
	.newlink	= veth_newlink,
	.dellink	= veth_dellink,
	.policy		= veth_policy,
	.maxtype	= VETH_INFO_MAX,
	.get_link_net	= veth_get_link_net,
};

/*
 * init/fini
 */

static __init int veth_init(void)
{
	return rtnl_link_register(&veth_link_ops);
}

static __exit void veth_exit(void)
{
	rtnl_link_unregister(&veth_link_ops);
}

module_init(veth_init);
module_exit(veth_exit);

MODULE_DESCRIPTION("Virtual Ethernet Tunnel");
MODULE_LICENSE("GPL v2");
MODULE_ALIAS_RTNL_LINK(DRV_NAME);