// SPDX-License-Identifier: GPL-2.0-only
/*
 *  drivers/net/veth.c
 *
 *  Copyright (C) 2007 OpenVZ http://openvz.org, SWsoft Inc
 *
 * Author: Pavel Emelianov <xemul@openvz.org>
 * Ethtool interface from: Eric W. Biederman <ebiederm@xmission.com>
 *
 */

#include <linux/netdevice.h>
#include <linux/slab.h>
#include <linux/ethtool.h>
#include <linux/etherdevice.h>
#include <linux/u64_stats_sync.h>

#include <net/rtnetlink.h>
#include <net/dst.h>
#include <net/xfrm.h>
#include <net/xdp.h>
#include <linux/veth.h>
#include <linux/module.h>
#include <linux/bpf.h>
#include <linux/filter.h>
#include <linux/ptr_ring.h>
#include <linux/bpf_trace.h>
#include <linux/net_tstamp.h>

#define DRV_NAME	"veth"
#define DRV_VERSION	"1.0"

#define VETH_XDP_FLAG		BIT(0)
#define VETH_RING_SIZE		256
#define VETH_XDP_HEADROOM	(XDP_PACKET_HEADROOM + NET_IP_ALIGN)

/* Separating two types of XDP xmit */
#define VETH_XDP_TX		BIT(0)
#define VETH_XDP_REDIR		BIT(1)

#define VETH_XDP_TX_BULK_SIZE	16

struct veth_rq_stats {
	u64	xdp_packets;
	u64	xdp_bytes;
	u64	xdp_drops;
	struct u64_stats_sync	syncp;
};

struct veth_rq {
	struct napi_struct	xdp_napi;
	struct net_device	*dev;
	struct bpf_prog __rcu	*xdp_prog;
	struct xdp_mem_info	xdp_mem;
	struct veth_rq_stats	stats;
	bool			rx_notify_masked;
	struct ptr_ring		xdp_ring;
	struct xdp_rxq_info	xdp_rxq;
};

struct veth_priv {
	struct net_device __rcu	*peer;
	atomic64_t		dropped;
	struct bpf_prog		*_xdp_prog;
	struct veth_rq		*rq;
	unsigned int		requested_headroom;
};

struct veth_xdp_tx_bq {
	struct xdp_frame *q[VETH_XDP_TX_BULK_SIZE];
	unsigned int count;
};

/*
 * ethtool interface
 */

struct veth_q_stat_desc {
	char	desc[ETH_GSTRING_LEN];
	size_t	offset;
};

#define VETH_RQ_STAT(m)	offsetof(struct veth_rq_stats, m)

static const struct veth_q_stat_desc veth_rq_stats_desc[] = {
	{ "xdp_packets",	VETH_RQ_STAT(xdp_packets) },
	{ "xdp_bytes",		VETH_RQ_STAT(xdp_bytes) },
	{ "xdp_drops",		VETH_RQ_STAT(xdp_drops) },
};

#define VETH_RQ_STATS_LEN	ARRAY_SIZE(veth_rq_stats_desc)

static struct {
	const char string[ETH_GSTRING_LEN];
} ethtool_stats_keys[] = {
	{ "peer_ifindex" },
};

static int veth_get_link_ksettings(struct net_device *dev,
				   struct ethtool_link_ksettings *cmd)
{
	cmd->base.speed		= SPEED_10000;
	cmd->base.duplex	= DUPLEX_FULL;
	cmd->base.port		= PORT_TP;
	cmd->base.autoneg	= AUTONEG_DISABLE;
	return 0;
}

static void veth_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info)
{
	strlcpy(info->driver, DRV_NAME, sizeof(info->driver));
	strlcpy(info->version, DRV_VERSION, sizeof(info->version));
}

static void veth_get_strings(struct net_device *dev, u32 stringset, u8 *buf)
{
	char *p = (char *)buf;
	int i, j;

	switch (stringset) {
	case ETH_SS_STATS:
		memcpy(p, &ethtool_stats_keys, sizeof(ethtool_stats_keys));
		p += sizeof(ethtool_stats_keys);
		for (i = 0; i < dev->real_num_rx_queues; i++) {
			for (j = 0; j < VETH_RQ_STATS_LEN; j++) {
				snprintf(p, ETH_GSTRING_LEN,
					 "rx_queue_%u_%.11s",
					 i, veth_rq_stats_desc[j].desc);
				p += ETH_GSTRING_LEN;
			}
		}
		break;
	}
}

static int veth_get_sset_count(struct net_device *dev, int sset)
{
	switch (sset) {
	case ETH_SS_STATS:
		return ARRAY_SIZE(ethtool_stats_keys) +
		       VETH_RQ_STATS_LEN * dev->real_num_rx_queues;
	default:
		return -EOPNOTSUPP;
	}
}

static void veth_get_ethtool_stats(struct net_device *dev,
				   struct ethtool_stats *stats, u64 *data)
{
	struct veth_priv *priv = netdev_priv(dev);
	struct net_device *peer = rtnl_dereference(priv->peer);
	int i, j, idx;

	data[0] = peer ? peer->ifindex : 0;
	idx = 1;
	for (i = 0; i < dev->real_num_rx_queues; i++) {
		const struct veth_rq_stats *rq_stats = &priv->rq[i].stats;
		const void *stats_base = (void *)rq_stats;
		unsigned int start;
		size_t offset;

		do {
			start = u64_stats_fetch_begin_irq(&rq_stats->syncp);
			for (j = 0; j < VETH_RQ_STATS_LEN; j++) {
				offset = veth_rq_stats_desc[j].offset;
				data[idx + j] = *(u64 *)(stats_base + offset);
			}
		} while (u64_stats_fetch_retry_irq(&rq_stats->syncp, start));
		idx += VETH_RQ_STATS_LEN;
	}
}

static const struct ethtool_ops veth_ethtool_ops = {
	.get_drvinfo		= veth_get_drvinfo,
	.get_link		= ethtool_op_get_link,
	.get_strings		= veth_get_strings,
	.get_sset_count		= veth_get_sset_count,
	.get_ethtool_stats	= veth_get_ethtool_stats,
	.get_link_ksettings	= veth_get_link_ksettings,
	.get_ts_info		= ethtool_op_get_ts_info,
};

/* general routines */

static bool veth_is_xdp_frame(void *ptr)
{
	return (unsigned long)ptr & VETH_XDP_FLAG;
}

static void *veth_ptr_to_xdp(void *ptr)
{
	return (void *)((unsigned long)ptr & ~VETH_XDP_FLAG);
}

static void *veth_xdp_to_ptr(void *ptr)
{
	return (void *)((unsigned long)ptr | VETH_XDP_FLAG);
}

static void veth_ptr_free(void *ptr)
{
	if (veth_is_xdp_frame(ptr))
		xdp_return_frame(veth_ptr_to_xdp(ptr));
	else
		kfree_skb(ptr);
}

static void __veth_xdp_flush(struct veth_rq *rq)
{
	/* Write ptr_ring before reading rx_notify_masked */
	smp_mb();
	if (!rq->rx_notify_masked) {
		rq->rx_notify_masked = true;
		napi_schedule(&rq->xdp_napi);
	}
}

static int veth_xdp_rx(struct veth_rq *rq, struct sk_buff *skb)
{
	if (unlikely(ptr_ring_produce(&rq->xdp_ring, skb))) {
		dev_kfree_skb_any(skb);
		return NET_RX_DROP;
	}

	return NET_RX_SUCCESS;
}

static int veth_forward_skb(struct net_device *dev, struct sk_buff *skb,
			    struct veth_rq *rq, bool xdp)
{
	return __dev_forward_skb(dev, skb) ?: xdp ?
		veth_xdp_rx(rq, skb) :
		netif_rx(skb);
}
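
/* Transmit path: a veth send is the peer's receive. The skb is handed
 * directly to the peer device; when the peer has an XDP program attached,
 * the skb is queued on the peer's per-queue ptr_ring and processed from
 * NAPI context instead of going through netif_rx().
 */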
static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct veth_priv *rcv_priv, *priv = netdev_priv(dev);
	struct veth_rq *rq = NULL;
	struct net_device *rcv;
	int length = skb->len;
	bool rcv_xdp = false;
	int rxq;

	rcu_read_lock();
	rcv = rcu_dereference(priv->peer);
	if (unlikely(!rcv)) {
		kfree_skb(skb);
		goto drop;
	}

	rcv_priv = netdev_priv(rcv);
	rxq = skb_get_queue_mapping(skb);
	if (rxq < rcv->real_num_rx_queues) {
		rq = &rcv_priv->rq[rxq];
		rcv_xdp = rcu_access_pointer(rq->xdp_prog);
		if (rcv_xdp)
			skb_record_rx_queue(skb, rxq);
	}

	skb_tx_timestamp(skb);
	if (likely(veth_forward_skb(rcv, skb, rq, rcv_xdp) == NET_RX_SUCCESS)) {
		if (!rcv_xdp)
			dev_lstats_add(dev, length);
	} else {
drop:
		atomic64_inc(&priv->dropped);
	}

	if (rcv_xdp)
		__veth_xdp_flush(rq);

	rcu_read_unlock();

	return NETDEV_TX_OK;
}

static u64 veth_stats_tx(struct net_device *dev, u64 *packets, u64 *bytes)
{
	struct veth_priv *priv = netdev_priv(dev);

	dev_lstats_read(dev, packets, bytes);
	return atomic64_read(&priv->dropped);
}

static void veth_stats_rx(struct veth_rq_stats *result, struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);
	int i;

	result->xdp_packets = 0;
	result->xdp_bytes = 0;
	result->xdp_drops = 0;
	for (i = 0; i < dev->num_rx_queues; i++) {
		struct veth_rq_stats *stats = &priv->rq[i].stats;
		u64 packets, bytes, drops;
		unsigned int start;

		do {
			start = u64_stats_fetch_begin_irq(&stats->syncp);
			packets = stats->xdp_packets;
			bytes = stats->xdp_bytes;
			drops = stats->xdp_drops;
		} while (u64_stats_fetch_retry_irq(&stats->syncp, start));
		result->xdp_packets += packets;
		result->xdp_bytes += bytes;
		result->xdp_drops += drops;
	}
}

static void veth_get_stats64(struct net_device *dev,
			     struct rtnl_link_stats64 *tot)
{
	struct veth_priv *priv = netdev_priv(dev);
	struct net_device *peer;
	struct veth_rq_stats rx;
	u64 packets, bytes;

	tot->tx_dropped = veth_stats_tx(dev, &packets, &bytes);
	tot->tx_bytes = bytes;
	tot->tx_packets = packets;

	veth_stats_rx(&rx, dev);
	tot->rx_dropped = rx.xdp_drops;
	tot->rx_bytes = rx.xdp_bytes;
	tot->rx_packets = rx.xdp_packets;

	rcu_read_lock();
	peer = rcu_dereference(priv->peer);
	if (peer) {
		tot->rx_dropped += veth_stats_tx(peer, &packets, &bytes);
		tot->rx_bytes += bytes;
		tot->rx_packets += packets;

		veth_stats_rx(&rx, peer);
		tot->tx_bytes += rx.xdp_bytes;
		tot->tx_packets += rx.xdp_packets;
	}
	rcu_read_unlock();
}

/* fake multicast ability */
static void veth_set_multicast_list(struct net_device *dev)
{
}

static struct sk_buff *veth_build_skb(void *head, int headroom, int len,
				      int buflen)
{
	struct sk_buff *skb;

	if (!buflen) {
		buflen = SKB_DATA_ALIGN(headroom + len) +
			 SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
	}
	skb = build_skb(head, buflen);
	if (!skb)
		return NULL;

	skb_reserve(skb, headroom);
	skb_put(skb, len);

	return skb;
}

static int veth_select_rxq(struct net_device *dev)
{
	return smp_processor_id() % dev->real_num_rx_queues;
}

static int veth_xdp_xmit(struct net_device *dev, int n,
			 struct xdp_frame **frames, u32 flags)
{
	struct veth_priv *rcv_priv, *priv = netdev_priv(dev);
	struct net_device *rcv;
	int i, ret, drops = n;
	unsigned int max_len;
	struct veth_rq *rq;

	rcu_read_lock();
	if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK)) {
		ret = -EINVAL;
		goto drop;
	}

	rcv = rcu_dereference(priv->peer);
	if (unlikely(!rcv)) {
		ret = -ENXIO;
		goto drop;
	}

	rcv_priv = netdev_priv(rcv);
	rq = &rcv_priv->rq[veth_select_rxq(rcv)];
	/* Non-NULL xdp_prog ensures that xdp_ring is initialized on receive
	 * side. This means an XDP program is loaded on the peer and the peer
	 * device is up.
	 */
	if (!rcu_access_pointer(rq->xdp_prog)) {
		ret = -ENXIO;
		goto drop;
	}

	drops = 0;
	max_len = rcv->mtu + rcv->hard_header_len + VLAN_HLEN;

	spin_lock(&rq->xdp_ring.producer_lock);
	for (i = 0; i < n; i++) {
		struct xdp_frame *frame = frames[i];
		void *ptr = veth_xdp_to_ptr(frame);

		if (unlikely(frame->len > max_len ||
			     __ptr_ring_produce(&rq->xdp_ring, ptr))) {
			xdp_return_frame_rx_napi(frame);
			drops++;
		}
	}
	spin_unlock(&rq->xdp_ring.producer_lock);

	if (flags & XDP_XMIT_FLUSH)
		__veth_xdp_flush(rq);

	if (likely(!drops)) {
		rcu_read_unlock();
		return n;
	}

	ret = n - drops;
drop:
	rcu_read_unlock();
	atomic64_add(drops, &priv->dropped);

	return ret;
}

static void veth_xdp_flush_bq(struct net_device *dev, struct veth_xdp_tx_bq *bq)
{
	int sent, i, err = 0;

	sent = veth_xdp_xmit(dev, bq->count, bq->q, 0);
	if (sent < 0) {
		err = sent;
		sent = 0;
		for (i = 0; i < bq->count; i++)
			xdp_return_frame(bq->q[i]);
	}
	trace_xdp_bulk_tx(dev, sent, bq->count - sent, err);

	bq->count = 0;
}

static void veth_xdp_flush(struct net_device *dev, struct veth_xdp_tx_bq *bq)
{
	struct veth_priv *rcv_priv, *priv = netdev_priv(dev);
	struct net_device *rcv;
	struct veth_rq *rq;

	rcu_read_lock();
	veth_xdp_flush_bq(dev, bq);
	rcv = rcu_dereference(priv->peer);
	if (unlikely(!rcv))
		goto out;

	rcv_priv = netdev_priv(rcv);
	rq = &rcv_priv->rq[veth_select_rxq(rcv)];
	/* A non-NULL xdp_prog means the xdp_ring is initialized on the
	 * receive side; bail out otherwise.
	 */
	if (unlikely(!rcu_access_pointer(rq->xdp_prog)))
		goto out;

	__veth_xdp_flush(rq);
out:
	rcu_read_unlock();
}

static int veth_xdp_tx(struct net_device *dev, struct xdp_buff *xdp,
		       struct veth_xdp_tx_bq *bq)
{
	struct xdp_frame *frame = convert_to_xdp_frame(xdp);

	if (unlikely(!frame))
		return -EOVERFLOW;

	if (unlikely(bq->count == VETH_XDP_TX_BULK_SIZE))
		veth_xdp_flush_bq(dev, bq);

	bq->q[bq->count++] = frame;

	return 0;
}
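
/* Receive-side dispatch. Ring entries are pointer-tagged with
 * VETH_XDP_FLAG: a set low bit marks an xdp_frame queued via ndo_xdp_xmit,
 * a clear bit marks an skb queued by veth_xmit(). The two cases are
 * handled by veth_xdp_rcv_one() and veth_xdp_rcv_skb() respectively.
 */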
static struct sk_buff *veth_xdp_rcv_one(struct veth_rq *rq,
					struct xdp_frame *frame,
					unsigned int *xdp_xmit,
					struct veth_xdp_tx_bq *bq)
{
	void *hard_start = frame->data - frame->headroom;
	void *head = hard_start - sizeof(struct xdp_frame);
	int len = frame->len, delta = 0;
	struct xdp_frame orig_frame;
	struct bpf_prog *xdp_prog;
	unsigned int headroom;
	struct sk_buff *skb;

	rcu_read_lock();
	xdp_prog = rcu_dereference(rq->xdp_prog);
	if (likely(xdp_prog)) {
		struct xdp_buff xdp;
		u32 act;

		xdp.data_hard_start = hard_start;
		xdp.data = frame->data;
		xdp.data_end = frame->data + frame->len;
		xdp.data_meta = frame->data - frame->metasize;
		xdp.rxq = &rq->xdp_rxq;

		act = bpf_prog_run_xdp(xdp_prog, &xdp);

		switch (act) {
		case XDP_PASS:
			delta = frame->data - xdp.data;
			len = xdp.data_end - xdp.data;
			break;
		case XDP_TX:
			orig_frame = *frame;
			xdp.data_hard_start = head;
			xdp.rxq->mem = frame->mem;
			if (unlikely(veth_xdp_tx(rq->dev, &xdp, bq) < 0)) {
				trace_xdp_exception(rq->dev, xdp_prog, act);
				frame = &orig_frame;
				goto err_xdp;
			}
			*xdp_xmit |= VETH_XDP_TX;
			rcu_read_unlock();
			goto xdp_xmit;
		case XDP_REDIRECT:
			orig_frame = *frame;
			xdp.data_hard_start = head;
			xdp.rxq->mem = frame->mem;
			if (xdp_do_redirect(rq->dev, &xdp, xdp_prog)) {
				frame = &orig_frame;
				goto err_xdp;
			}
			*xdp_xmit |= VETH_XDP_REDIR;
			rcu_read_unlock();
			goto xdp_xmit;
		default:
			bpf_warn_invalid_xdp_action(act);
			/* fall through */
		case XDP_ABORTED:
			trace_xdp_exception(rq->dev, xdp_prog, act);
			/* fall through */
		case XDP_DROP:
			goto err_xdp;
		}
	}
	rcu_read_unlock();

	headroom = sizeof(struct xdp_frame) + frame->headroom - delta;
	skb = veth_build_skb(head, headroom, len, 0);
	if (!skb) {
		xdp_return_frame(frame);
		goto err;
	}

	xdp_release_frame(frame);
	xdp_scrub_frame(frame);
	skb->protocol = eth_type_trans(skb, rq->dev);
err:
	return skb;
err_xdp:
	rcu_read_unlock();
	xdp_return_frame(frame);
xdp_xmit:
	return NULL;
}

static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq, struct sk_buff *skb,
					unsigned int *xdp_xmit,
					struct veth_xdp_tx_bq *bq)
{
	u32 pktlen, headroom, act, metalen;
	void *orig_data, *orig_data_end;
	struct bpf_prog *xdp_prog;
	int mac_len, delta, off;
	struct xdp_buff xdp;

	skb_orphan(skb);

	rcu_read_lock();
	xdp_prog = rcu_dereference(rq->xdp_prog);
	if (unlikely(!xdp_prog)) {
		rcu_read_unlock();
		goto out;
	}

	mac_len = skb->data - skb_mac_header(skb);
	pktlen = skb->len + mac_len;
	headroom = skb_headroom(skb) - mac_len;

	if (skb_shared(skb) || skb_head_is_locked(skb) ||
	    skb_is_nonlinear(skb) || headroom < XDP_PACKET_HEADROOM) {
		struct sk_buff *nskb;
		int size, head_off;
		void *head, *start;
		struct page *page;

		size = SKB_DATA_ALIGN(VETH_XDP_HEADROOM + pktlen) +
		       SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
		if (size > PAGE_SIZE)
			goto drop;

		page = alloc_page(GFP_ATOMIC | __GFP_NOWARN);
		if (!page)
			goto drop;

		head = page_address(page);
		start = head + VETH_XDP_HEADROOM;
		if (skb_copy_bits(skb, -mac_len, start, pktlen)) {
			page_frag_free(head);
			goto drop;
		}

		nskb = veth_build_skb(head,
				      VETH_XDP_HEADROOM + mac_len, skb->len,
				      PAGE_SIZE);
		if (!nskb) {
			page_frag_free(head);
			goto drop;
		}

		skb_copy_header(nskb, skb);
		head_off = skb_headroom(nskb) - skb_headroom(skb);
		skb_headers_offset_update(nskb, head_off);
		consume_skb(skb);
		skb = nskb;
	}

	xdp.data_hard_start = skb->head;
	xdp.data = skb_mac_header(skb);
	xdp.data_end = xdp.data + pktlen;
	xdp.data_meta = xdp.data;
	xdp.rxq = &rq->xdp_rxq;
	orig_data = xdp.data;
	orig_data_end = xdp.data_end;

	act = bpf_prog_run_xdp(xdp_prog, &xdp);

	switch (act) {
	case XDP_PASS:
		break;
	case XDP_TX:
		get_page(virt_to_page(xdp.data));
		consume_skb(skb);
		xdp.rxq->mem = rq->xdp_mem;
		if (unlikely(veth_xdp_tx(rq->dev, &xdp, bq) < 0)) {
			trace_xdp_exception(rq->dev, xdp_prog, act);
			goto err_xdp;
		}
		*xdp_xmit |= VETH_XDP_TX;
		rcu_read_unlock();
		goto xdp_xmit;
	case XDP_REDIRECT:
		get_page(virt_to_page(xdp.data));
		consume_skb(skb);
		xdp.rxq->mem = rq->xdp_mem;
		if (xdp_do_redirect(rq->dev, &xdp, xdp_prog))
			goto err_xdp;
		*xdp_xmit |= VETH_XDP_REDIR;
		rcu_read_unlock();
		goto xdp_xmit;
	default:
		bpf_warn_invalid_xdp_action(act);
		/* fall through */
	case XDP_ABORTED:
		trace_xdp_exception(rq->dev, xdp_prog, act);
		/* fall through */
	case XDP_DROP:
		goto drop;
	}
	rcu_read_unlock();

	delta = orig_data - xdp.data;
	off = mac_len + delta;
	if (off > 0)
		__skb_push(skb, off);
	else if (off < 0)
		__skb_pull(skb, -off);
	skb->mac_header -= delta;
	off = xdp.data_end - orig_data_end;
	if (off != 0)
		__skb_put(skb, off);
	skb->protocol = eth_type_trans(skb, rq->dev);

	metalen = xdp.data - xdp.data_meta;
	if (metalen)
		skb_metadata_set(skb, metalen);
out:
	return skb;
drop:
	rcu_read_unlock();
	kfree_skb(skb);
	return NULL;
err_xdp:
	rcu_read_unlock();
	page_frag_free(xdp.data);
xdp_xmit:
	return NULL;
}

static int veth_xdp_rcv(struct veth_rq *rq, int budget, unsigned int *xdp_xmit,
			struct veth_xdp_tx_bq *bq)
{
	int i, done = 0, drops = 0, bytes = 0;

	for (i = 0; i < budget; i++) {
		void *ptr = __ptr_ring_consume(&rq->xdp_ring);
		unsigned int xdp_xmit_one = 0;
		struct sk_buff *skb;

		if (!ptr)
			break;

		if (veth_is_xdp_frame(ptr)) {
			struct xdp_frame *frame = veth_ptr_to_xdp(ptr);

			bytes += frame->len;
			skb = veth_xdp_rcv_one(rq, frame, &xdp_xmit_one, bq);
		} else {
			skb = ptr;
			bytes += skb->len;
			skb = veth_xdp_rcv_skb(rq, skb, &xdp_xmit_one, bq);
		}
		*xdp_xmit |= xdp_xmit_one;

		if (skb)
			napi_gro_receive(&rq->xdp_napi, skb);
		else if (!xdp_xmit_one)
			drops++;

		done++;
	}

	u64_stats_update_begin(&rq->stats.syncp);
	rq->stats.xdp_packets += done;
	rq->stats.xdp_bytes += bytes;
	rq->stats.xdp_drops += drops;
	u64_stats_update_end(&rq->stats.syncp);

	return done;
}
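
/* NAPI handler. After napi_complete_done() the producer side is unmasked
 * again; re-check the ring and reschedule to close the race with a
 * producer that enqueued between the last consume and the unmask (this
 * pairs with the barrier in __veth_xdp_flush()).
 */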
static int veth_poll(struct napi_struct *napi, int budget)
{
	struct veth_rq *rq =
		container_of(napi, struct veth_rq, xdp_napi);
	unsigned int xdp_xmit = 0;
	struct veth_xdp_tx_bq bq;
	int done;

	bq.count = 0;

	xdp_set_return_frame_no_direct();
	done = veth_xdp_rcv(rq, budget, &xdp_xmit, &bq);

	if (done < budget && napi_complete_done(napi, done)) {
		/* Write rx_notify_masked before reading ptr_ring */
		smp_store_mb(rq->rx_notify_masked, false);
		if (unlikely(!__ptr_ring_empty(&rq->xdp_ring))) {
			rq->rx_notify_masked = true;
			napi_schedule(&rq->xdp_napi);
		}
	}

	if (xdp_xmit & VETH_XDP_TX)
		veth_xdp_flush(rq->dev, &bq);
	if (xdp_xmit & VETH_XDP_REDIR)
		xdp_do_flush();
	xdp_clear_return_frame_no_direct();

	return done;
}

static int veth_napi_add(struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);
	int err, i;

	for (i = 0; i < dev->real_num_rx_queues; i++) {
		struct veth_rq *rq = &priv->rq[i];

		err = ptr_ring_init(&rq->xdp_ring, VETH_RING_SIZE, GFP_KERNEL);
		if (err)
			goto err_xdp_ring;
	}

	for (i = 0; i < dev->real_num_rx_queues; i++) {
		struct veth_rq *rq = &priv->rq[i];

		netif_napi_add(dev, &rq->xdp_napi, veth_poll, NAPI_POLL_WEIGHT);
		napi_enable(&rq->xdp_napi);
	}

	return 0;
err_xdp_ring:
	for (i--; i >= 0; i--)
		ptr_ring_cleanup(&priv->rq[i].xdp_ring, veth_ptr_free);

	return err;
}

static void veth_napi_del(struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);
	int i;

	for (i = 0; i < dev->real_num_rx_queues; i++) {
		struct veth_rq *rq = &priv->rq[i];

		napi_disable(&rq->xdp_napi);
		napi_hash_del(&rq->xdp_napi);
	}
	synchronize_net();

	for (i = 0; i < dev->real_num_rx_queues; i++) {
		struct veth_rq *rq = &priv->rq[i];

		netif_napi_del(&rq->xdp_napi);
		rq->rx_notify_masked = false;
		ptr_ring_cleanup(&rq->xdp_ring, veth_ptr_free);
	}
}

static int veth_enable_xdp(struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);
	int err, i;

	if (!xdp_rxq_info_is_reg(&priv->rq[0].xdp_rxq)) {
		for (i = 0; i < dev->real_num_rx_queues; i++) {
			struct veth_rq *rq = &priv->rq[i];

			err = xdp_rxq_info_reg(&rq->xdp_rxq, dev, i);
			if (err < 0)
				goto err_rxq_reg;

			err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq,
							 MEM_TYPE_PAGE_SHARED,
							 NULL);
			if (err < 0)
				goto err_reg_mem;

			/* Save original mem info as it can be overwritten */
			rq->xdp_mem = rq->xdp_rxq.mem;
		}

		err = veth_napi_add(dev);
		if (err)
			goto err_rxq_reg;
	}

	for (i = 0; i < dev->real_num_rx_queues; i++)
		rcu_assign_pointer(priv->rq[i].xdp_prog, priv->_xdp_prog);

	return 0;
err_reg_mem:
	xdp_rxq_info_unreg(&priv->rq[i].xdp_rxq);
err_rxq_reg:
	for (i--; i >= 0; i--)
		xdp_rxq_info_unreg(&priv->rq[i].xdp_rxq);

	return err;
}

static void veth_disable_xdp(struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);
	int i;

	for (i = 0; i < dev->real_num_rx_queues; i++)
		rcu_assign_pointer(priv->rq[i].xdp_prog, NULL);
	veth_napi_del(dev);
	for (i = 0; i < dev->real_num_rx_queues; i++) {
		struct veth_rq *rq = &priv->rq[i];

		rq->xdp_rxq.mem = rq->xdp_mem;
		xdp_rxq_info_unreg(&rq->xdp_rxq);
	}
}

static int veth_open(struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);
	struct net_device *peer = rtnl_dereference(priv->peer);
	int err;

	if (!peer)
		return -ENOTCONN;

	if (priv->_xdp_prog) {
		err = veth_enable_xdp(dev);
		if (err)
			return err;
	}

	if (peer->flags & IFF_UP) {
		netif_carrier_on(dev);
		netif_carrier_on(peer);
	}

	return 0;
}

static int veth_close(struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);
	struct net_device *peer = rtnl_dereference(priv->peer);

	netif_carrier_off(dev);
	if (peer)
		netif_carrier_off(peer);

	if (priv->_xdp_prog)
		veth_disable_xdp(dev);

	return 0;
}

static int is_valid_veth_mtu(int mtu)
{
	return mtu >= ETH_MIN_MTU && mtu <= ETH_MAX_MTU;
}

static int veth_alloc_queues(struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);
	int i;

	priv->rq = kcalloc(dev->num_rx_queues, sizeof(*priv->rq), GFP_KERNEL);
	if (!priv->rq)
		return -ENOMEM;

	for (i = 0; i < dev->num_rx_queues; i++) {
		priv->rq[i].dev = dev;
		u64_stats_init(&priv->rq[i].stats.syncp);
	}

	return 0;
}

static void veth_free_queues(struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);

	kfree(priv->rq);
}

static int veth_dev_init(struct net_device *dev)
{
	int err;

	dev->lstats = netdev_alloc_pcpu_stats(struct pcpu_lstats);
	if (!dev->lstats)
		return -ENOMEM;

	err = veth_alloc_queues(dev);
	if (err) {
		free_percpu(dev->lstats);
		return err;
	}

	return 0;
}

static void veth_dev_free(struct net_device *dev)
{
	veth_free_queues(dev);
	free_percpu(dev->lstats);
}

#ifdef CONFIG_NET_POLL_CONTROLLER
static void veth_poll_controller(struct net_device *dev)
{
	/* veth only receives frames when its peer sends one.
	 * Since it has nothing to do with disabling irqs, we are guaranteed
	 * never to have pending data when we poll for it, so
	 * there is nothing to do here.
	 *
	 * We need this though so netpoll recognizes us as an interface that
	 * supports polling, which enables bridge devices in virt setups to
	 * still use netconsole.
	 */
}
#endif	/* CONFIG_NET_POLL_CONTROLLER */

static int veth_get_iflink(const struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);
	struct net_device *peer;
	int iflink;

	rcu_read_lock();
	peer = rcu_dereference(priv->peer);
	iflink = peer ? peer->ifindex : 0;
	rcu_read_unlock();

	return iflink;
}

static netdev_features_t veth_fix_features(struct net_device *dev,
					   netdev_features_t features)
{
	struct veth_priv *priv = netdev_priv(dev);
	struct net_device *peer;

	peer = rtnl_dereference(priv->peer);
	if (peer) {
		struct veth_priv *peer_priv = netdev_priv(peer);

		if (peer_priv->_xdp_prog)
			features &= ~NETIF_F_GSO_SOFTWARE;
	}

	return features;
}

static void veth_set_rx_headroom(struct net_device *dev, int new_hr)
{
	struct veth_priv *peer_priv, *priv = netdev_priv(dev);
	struct net_device *peer;

	if (new_hr < 0)
		new_hr = 0;

	rcu_read_lock();
	peer = rcu_dereference(priv->peer);
	if (unlikely(!peer))
		goto out;

	peer_priv = netdev_priv(peer);
	priv->requested_headroom = new_hr;
	new_hr = max(priv->requested_headroom, peer_priv->requested_headroom);
	dev->needed_headroom = new_hr;
	peer->needed_headroom = new_hr;

out:
	rcu_read_unlock();
}

static int veth_xdp_set(struct net_device *dev, struct bpf_prog *prog,
			struct netlink_ext_ack *extack)
{
	struct veth_priv *priv = netdev_priv(dev);
	struct bpf_prog *old_prog;
	struct net_device *peer;
	unsigned int max_mtu;
	int err;

	old_prog = priv->_xdp_prog;
	priv->_xdp_prog = prog;
	peer = rtnl_dereference(priv->peer);

	if (prog) {
		if (!peer) {
			NL_SET_ERR_MSG_MOD(extack, "Cannot set XDP when peer is detached");
			err = -ENOTCONN;
			goto err;
		}

		max_mtu = PAGE_SIZE - VETH_XDP_HEADROOM -
			  peer->hard_header_len -
			  SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
		if (peer->mtu > max_mtu) {
			NL_SET_ERR_MSG_MOD(extack, "Peer MTU is too large to set XDP");
			err = -ERANGE;
			goto err;
		}

		if (dev->real_num_rx_queues < peer->real_num_tx_queues) {
			NL_SET_ERR_MSG_MOD(extack, "XDP expects number of rx queues not less than peer tx queues");
			err = -ENOSPC;
			goto err;
		}

		if (dev->flags & IFF_UP) {
			err = veth_enable_xdp(dev);
			if (err) {
				NL_SET_ERR_MSG_MOD(extack, "Setup for XDP failed");
				goto err;
			}
		}

		if (!old_prog) {
			peer->hw_features &= ~NETIF_F_GSO_SOFTWARE;
			peer->max_mtu = max_mtu;
		}
	}

	if (old_prog) {
		if (!prog) {
			if (dev->flags & IFF_UP)
				veth_disable_xdp(dev);

			if (peer) {
				peer->hw_features |= NETIF_F_GSO_SOFTWARE;
				peer->max_mtu = ETH_MAX_MTU;
			}
		}
		bpf_prog_put(old_prog);
	}

	if ((!!old_prog ^ !!prog) && peer)
		netdev_update_features(peer);

	return 0;
err:
	priv->_xdp_prog = old_prog;

	return err;
}

static u32 veth_xdp_query(struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);
	const struct bpf_prog *xdp_prog;

	xdp_prog = priv->_xdp_prog;
	if (xdp_prog)
		return xdp_prog->aux->id;

	return 0;
}

static int veth_xdp(struct net_device *dev, struct netdev_bpf *xdp)
{
	switch (xdp->command) {
	case XDP_SETUP_PROG:
		return veth_xdp_set(dev, xdp->prog, xdp->extack);
	case XDP_QUERY_PROG:
		xdp->prog_id = veth_xdp_query(dev);
		return 0;
	default:
		return -EINVAL;
	}
}

static const struct net_device_ops veth_netdev_ops = {
	.ndo_init		= veth_dev_init,
	.ndo_open		= veth_open,
	.ndo_stop		= veth_close,
	.ndo_start_xmit		= veth_xmit,
	.ndo_get_stats64	= veth_get_stats64,
	.ndo_set_rx_mode	= veth_set_multicast_list,
	.ndo_set_mac_address	= eth_mac_addr,
#ifdef CONFIG_NET_POLL_CONTROLLER
	.ndo_poll_controller	= veth_poll_controller,
#endif
	.ndo_get_iflink		= veth_get_iflink,
	.ndo_fix_features	= veth_fix_features,
	.ndo_features_check	= passthru_features_check,
	.ndo_set_rx_headroom	= veth_set_rx_headroom,
	.ndo_bpf		= veth_xdp,
	.ndo_xdp_xmit		= veth_xdp_xmit,
};

#define VETH_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HW_CSUM | \
		       NETIF_F_RXCSUM | NETIF_F_SCTP_CRC | NETIF_F_HIGHDMA | \
		       NETIF_F_GSO_SOFTWARE | NETIF_F_GSO_ENCAP_ALL | \
		       NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX | \
		       NETIF_F_HW_VLAN_STAG_TX | NETIF_F_HW_VLAN_STAG_RX)

static void veth_setup(struct net_device *dev)
{
	ether_setup(dev);

	dev->priv_flags &= ~IFF_TX_SKB_SHARING;
	dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
	dev->priv_flags |= IFF_NO_QUEUE;
	dev->priv_flags |= IFF_PHONY_HEADROOM;

	dev->netdev_ops = &veth_netdev_ops;
	dev->ethtool_ops = &veth_ethtool_ops;
	dev->features |= NETIF_F_LLTX;
	dev->features |= VETH_FEATURES;
	dev->vlan_features = dev->features &
			     ~(NETIF_F_HW_VLAN_CTAG_TX |
			       NETIF_F_HW_VLAN_STAG_TX |
			       NETIF_F_HW_VLAN_CTAG_RX |
			       NETIF_F_HW_VLAN_STAG_RX);
	dev->needs_free_netdev = true;
	dev->priv_destructor = veth_dev_free;
	dev->max_mtu = ETH_MAX_MTU;

	dev->hw_features = VETH_FEATURES;
	dev->hw_enc_features = VETH_FEATURES;
	dev->mpls_features = NETIF_F_HW_CSUM | NETIF_F_GSO_SOFTWARE;
}

/*
 * netlink interface
 */

static int veth_validate(struct nlattr *tb[], struct nlattr *data[],
			 struct netlink_ext_ack *extack)
{
	if (tb[IFLA_ADDRESS]) {
		if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
			return -EINVAL;
		if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
			return -EADDRNOTAVAIL;
	}
	if (tb[IFLA_MTU]) {
		if (!is_valid_veth_mtu(nla_get_u32(tb[IFLA_MTU])))
			return -EINVAL;
	}
	return 0;
}

static struct rtnl_link_ops veth_link_ops;

static int veth_newlink(struct net *src_net, struct net_device *dev,
			struct nlattr *tb[], struct nlattr *data[],
			struct netlink_ext_ack *extack)
{
	int err;
	struct net_device *peer;
	struct veth_priv *priv;
	char ifname[IFNAMSIZ];
	struct nlattr *peer_tb[IFLA_MAX + 1], **tbp;
	unsigned char name_assign_type;
	struct ifinfomsg *ifmp;
	struct net *net;

	/*
	 * create and register peer first
	 */
	if (data != NULL && data[VETH_INFO_PEER] != NULL) {
		struct nlattr *nla_peer;

		nla_peer = data[VETH_INFO_PEER];
		ifmp = nla_data(nla_peer);
		err = rtnl_nla_parse_ifla(peer_tb,
					  nla_data(nla_peer) + sizeof(struct ifinfomsg),
					  nla_len(nla_peer) - sizeof(struct ifinfomsg),
					  NULL);
		if (err < 0)
			return err;

		err = veth_validate(peer_tb, NULL, extack);
		if (err < 0)
			return err;

		tbp = peer_tb;
	} else {
		ifmp = NULL;
		tbp = tb;
	}

	if (ifmp && tbp[IFLA_IFNAME]) {
		nla_strlcpy(ifname, tbp[IFLA_IFNAME], IFNAMSIZ);
		name_assign_type = NET_NAME_USER;
	} else {
		snprintf(ifname, IFNAMSIZ, DRV_NAME "%%d");
		name_assign_type = NET_NAME_ENUM;
	}

	net = rtnl_link_get_net(src_net, tbp);
	if (IS_ERR(net))
		return PTR_ERR(net);

	peer = rtnl_create_link(net, ifname, name_assign_type,
				&veth_link_ops, tbp, extack);
	if (IS_ERR(peer)) {
		put_net(net);
		return PTR_ERR(peer);
	}

	if (!ifmp || !tbp[IFLA_ADDRESS])
		eth_hw_addr_random(peer);

	if (ifmp && (dev->ifindex != 0))
		peer->ifindex = ifmp->ifi_index;

	peer->gso_max_size = dev->gso_max_size;
	peer->gso_max_segs = dev->gso_max_segs;

	err = register_netdevice(peer);
	put_net(net);
	net = NULL;
	if (err < 0)
		goto err_register_peer;

	netif_carrier_off(peer);

	err = rtnl_configure_link(peer, ifmp);
	if (err < 0)
		goto err_configure_peer;

	/*
	 * register dev last
	 *
	 * note that, since we've just registered a new device, dev's name
	 * should be re-allocated
	 */

	if (tb[IFLA_ADDRESS] == NULL)
		eth_hw_addr_random(dev);

	if (tb[IFLA_IFNAME])
		nla_strlcpy(dev->name, tb[IFLA_IFNAME], IFNAMSIZ);
	else
		snprintf(dev->name, IFNAMSIZ, DRV_NAME "%%d");

	err = register_netdevice(dev);
	if (err < 0)
		goto err_register_dev;

	netif_carrier_off(dev);

	/*
	 * tie the devices together
	 */

	priv = netdev_priv(dev);
	rcu_assign_pointer(priv->peer, peer);

	priv = netdev_priv(peer);
	rcu_assign_pointer(priv->peer, dev);

	return 0;

err_register_dev:
	/* nothing to do */
err_configure_peer:
	unregister_netdevice(peer);
	return err;

err_register_peer:
	free_netdev(peer);
	return err;
}

static void veth_dellink(struct net_device *dev, struct list_head *head)
{
	struct veth_priv *priv;
	struct net_device *peer;

	priv = netdev_priv(dev);
	peer = rtnl_dereference(priv->peer);

	/* Note : dellink() is called from default_device_exit_batch(),
	 * before a rcu_synchronize() point. The devices are guaranteed
	 * not to be freed before one RCU grace period.
	 */
	RCU_INIT_POINTER(priv->peer, NULL);
	unregister_netdevice_queue(dev, head);

	if (peer) {
		priv = netdev_priv(peer);
		RCU_INIT_POINTER(priv->peer, NULL);
		unregister_netdevice_queue(peer, head);
	}
}

static const struct nla_policy veth_policy[VETH_INFO_MAX + 1] = {
	[VETH_INFO_PEER]	= { .len = sizeof(struct ifinfomsg) },
};

static struct net *veth_get_link_net(const struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);
	struct net_device *peer = rtnl_dereference(priv->peer);

	return peer ? dev_net(peer) : dev_net(dev);
}

static struct rtnl_link_ops veth_link_ops = {
	.kind		= DRV_NAME,
	.priv_size	= sizeof(struct veth_priv),
	.setup		= veth_setup,
	.validate	= veth_validate,
	.newlink	= veth_newlink,
	.dellink	= veth_dellink,
	.policy		= veth_policy,
	.maxtype	= VETH_INFO_MAX,
	.get_link_net	= veth_get_link_net,
};

/*
 * init/fini
 */

static __init int veth_init(void)
{
	return rtnl_link_register(&veth_link_ops);
}

static __exit void veth_exit(void)
{
	rtnl_link_unregister(&veth_link_ops);
}

module_init(veth_init);
module_exit(veth_exit);

MODULE_DESCRIPTION("Virtual Ethernet Tunnel");
MODULE_LICENSE("GPL v2");
MODULE_ALIAS_RTNL_LINK(DRV_NAME);
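
/* Usage sketch (userspace, not part of the driver): with iproute2, a pair
 * can be created and brought up as follows; the interface names are
 * arbitrary examples.
 *
 *	ip link add veth0 type veth peer name veth1
 *	ip link set veth0 up
 *	ip link set veth1 up
 *
 * Frames transmitted on one end are received on the other, so the pair
 * behaves like a virtual patch cable between namespaces or bridges.
 */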