1 /* 2 * drivers/net/veth.c 3 * 4 * Copyright (C) 2007 OpenVZ http://openvz.org, SWsoft Inc 5 * 6 * Author: Pavel Emelianov <xemul@openvz.org> 7 * Ethtool interface from: Eric W. Biederman <ebiederm@xmission.com> 8 * 9 */ 10 11 #include <linux/netdevice.h> 12 #include <linux/slab.h> 13 #include <linux/ethtool.h> 14 #include <linux/etherdevice.h> 15 #include <linux/u64_stats_sync.h> 16 17 #include <net/rtnetlink.h> 18 #include <net/dst.h> 19 #include <net/xfrm.h> 20 #include <net/xdp.h> 21 #include <linux/veth.h> 22 #include <linux/module.h> 23 #include <linux/bpf.h> 24 #include <linux/filter.h> 25 #include <linux/ptr_ring.h> 26 #include <linux/bpf_trace.h> 27 #include <linux/net_tstamp.h> 28 29 #define DRV_NAME "veth" 30 #define DRV_VERSION "1.0" 31 32 #define VETH_XDP_FLAG BIT(0) 33 #define VETH_RING_SIZE 256 34 #define VETH_XDP_HEADROOM (XDP_PACKET_HEADROOM + NET_IP_ALIGN) 35 36 /* Separating two types of XDP xmit */ 37 #define VETH_XDP_TX BIT(0) 38 #define VETH_XDP_REDIR BIT(1) 39 40 struct veth_rq_stats { 41 u64 xdp_packets; 42 u64 xdp_bytes; 43 u64 xdp_drops; 44 struct u64_stats_sync syncp; 45 }; 46 47 struct veth_rq { 48 struct napi_struct xdp_napi; 49 struct net_device *dev; 50 struct bpf_prog __rcu *xdp_prog; 51 struct xdp_mem_info xdp_mem; 52 struct veth_rq_stats stats; 53 bool rx_notify_masked; 54 struct ptr_ring xdp_ring; 55 struct xdp_rxq_info xdp_rxq; 56 }; 57 58 struct veth_priv { 59 struct net_device __rcu *peer; 60 atomic64_t dropped; 61 struct bpf_prog *_xdp_prog; 62 struct veth_rq *rq; 63 unsigned int requested_headroom; 64 }; 65 66 /* 67 * ethtool interface 68 */ 69 70 struct veth_q_stat_desc { 71 char desc[ETH_GSTRING_LEN]; 72 size_t offset; 73 }; 74 75 #define VETH_RQ_STAT(m) offsetof(struct veth_rq_stats, m) 76 77 static const struct veth_q_stat_desc veth_rq_stats_desc[] = { 78 { "xdp_packets", VETH_RQ_STAT(xdp_packets) }, 79 { "xdp_bytes", VETH_RQ_STAT(xdp_bytes) }, 80 { "xdp_drops", VETH_RQ_STAT(xdp_drops) }, 81 }; 82 83 #define VETH_RQ_STATS_LEN ARRAY_SIZE(veth_rq_stats_desc) 84 85 static struct { 86 const char string[ETH_GSTRING_LEN]; 87 } ethtool_stats_keys[] = { 88 { "peer_ifindex" }, 89 }; 90 91 static int veth_get_link_ksettings(struct net_device *dev, 92 struct ethtool_link_ksettings *cmd) 93 { 94 cmd->base.speed = SPEED_10000; 95 cmd->base.duplex = DUPLEX_FULL; 96 cmd->base.port = PORT_TP; 97 cmd->base.autoneg = AUTONEG_DISABLE; 98 return 0; 99 } 100 101 static void veth_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info) 102 { 103 strlcpy(info->driver, DRV_NAME, sizeof(info->driver)); 104 strlcpy(info->version, DRV_VERSION, sizeof(info->version)); 105 } 106 107 static void veth_get_strings(struct net_device *dev, u32 stringset, u8 *buf) 108 { 109 char *p = (char *)buf; 110 int i, j; 111 112 switch(stringset) { 113 case ETH_SS_STATS: 114 memcpy(p, ðtool_stats_keys, sizeof(ethtool_stats_keys)); 115 p += sizeof(ethtool_stats_keys); 116 for (i = 0; i < dev->real_num_rx_queues; i++) { 117 for (j = 0; j < VETH_RQ_STATS_LEN; j++) { 118 snprintf(p, ETH_GSTRING_LEN, 119 "rx_queue_%u_%.11s", 120 i, veth_rq_stats_desc[j].desc); 121 p += ETH_GSTRING_LEN; 122 } 123 } 124 break; 125 } 126 } 127 128 static int veth_get_sset_count(struct net_device *dev, int sset) 129 { 130 switch (sset) { 131 case ETH_SS_STATS: 132 return ARRAY_SIZE(ethtool_stats_keys) + 133 VETH_RQ_STATS_LEN * dev->real_num_rx_queues; 134 default: 135 return -EOPNOTSUPP; 136 } 137 } 138 139 static void veth_get_ethtool_stats(struct net_device *dev, 140 struct ethtool_stats *stats, u64 *data) 141 { 142 struct veth_priv *priv = netdev_priv(dev); 143 struct net_device *peer = rtnl_dereference(priv->peer); 144 int i, j, idx; 145 146 data[0] = peer ? peer->ifindex : 0; 147 idx = 1; 148 for (i = 0; i < dev->real_num_rx_queues; i++) { 149 const struct veth_rq_stats *rq_stats = &priv->rq[i].stats; 150 const void *stats_base = (void *)rq_stats; 151 unsigned int start; 152 size_t offset; 153 154 do { 155 start = u64_stats_fetch_begin_irq(&rq_stats->syncp); 156 for (j = 0; j < VETH_RQ_STATS_LEN; j++) { 157 offset = veth_rq_stats_desc[j].offset; 158 data[idx + j] = *(u64 *)(stats_base + offset); 159 } 160 } while (u64_stats_fetch_retry_irq(&rq_stats->syncp, start)); 161 idx += VETH_RQ_STATS_LEN; 162 } 163 } 164 165 static const struct ethtool_ops veth_ethtool_ops = { 166 .get_drvinfo = veth_get_drvinfo, 167 .get_link = ethtool_op_get_link, 168 .get_strings = veth_get_strings, 169 .get_sset_count = veth_get_sset_count, 170 .get_ethtool_stats = veth_get_ethtool_stats, 171 .get_link_ksettings = veth_get_link_ksettings, 172 .get_ts_info = ethtool_op_get_ts_info, 173 }; 174 175 /* general routines */ 176 177 static bool veth_is_xdp_frame(void *ptr) 178 { 179 return (unsigned long)ptr & VETH_XDP_FLAG; 180 } 181 182 static void *veth_ptr_to_xdp(void *ptr) 183 { 184 return (void *)((unsigned long)ptr & ~VETH_XDP_FLAG); 185 } 186 187 static void *veth_xdp_to_ptr(void *ptr) 188 { 189 return (void *)((unsigned long)ptr | VETH_XDP_FLAG); 190 } 191 192 static void veth_ptr_free(void *ptr) 193 { 194 if (veth_is_xdp_frame(ptr)) 195 xdp_return_frame(veth_ptr_to_xdp(ptr)); 196 else 197 kfree_skb(ptr); 198 } 199 200 static void __veth_xdp_flush(struct veth_rq *rq) 201 { 202 /* Write ptr_ring before reading rx_notify_masked */ 203 smp_mb(); 204 if (!rq->rx_notify_masked) { 205 rq->rx_notify_masked = true; 206 napi_schedule(&rq->xdp_napi); 207 } 208 } 209 210 static int veth_xdp_rx(struct veth_rq *rq, struct sk_buff *skb) 211 { 212 if (unlikely(ptr_ring_produce(&rq->xdp_ring, skb))) { 213 dev_kfree_skb_any(skb); 214 return NET_RX_DROP; 215 } 216 217 return NET_RX_SUCCESS; 218 } 219 220 static int veth_forward_skb(struct net_device *dev, struct sk_buff *skb, 221 struct veth_rq *rq, bool xdp) 222 { 223 return __dev_forward_skb(dev, skb) ?: xdp ? 224 veth_xdp_rx(rq, skb) : 225 netif_rx(skb); 226 } 227 228 static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev) 229 { 230 struct veth_priv *rcv_priv, *priv = netdev_priv(dev); 231 struct veth_rq *rq = NULL; 232 struct net_device *rcv; 233 int length = skb->len; 234 bool rcv_xdp = false; 235 int rxq; 236 237 rcu_read_lock(); 238 rcv = rcu_dereference(priv->peer); 239 if (unlikely(!rcv)) { 240 kfree_skb(skb); 241 goto drop; 242 } 243 244 rcv_priv = netdev_priv(rcv); 245 rxq = skb_get_queue_mapping(skb); 246 if (rxq < rcv->real_num_rx_queues) { 247 rq = &rcv_priv->rq[rxq]; 248 rcv_xdp = rcu_access_pointer(rq->xdp_prog); 249 if (rcv_xdp) 250 skb_record_rx_queue(skb, rxq); 251 } 252 253 skb_tx_timestamp(skb); 254 if (likely(veth_forward_skb(rcv, skb, rq, rcv_xdp) == NET_RX_SUCCESS)) { 255 if (!rcv_xdp) { 256 struct pcpu_lstats *stats = this_cpu_ptr(dev->lstats); 257 258 u64_stats_update_begin(&stats->syncp); 259 stats->bytes += length; 260 stats->packets++; 261 u64_stats_update_end(&stats->syncp); 262 } 263 } else { 264 drop: 265 atomic64_inc(&priv->dropped); 266 } 267 268 if (rcv_xdp) 269 __veth_xdp_flush(rq); 270 271 rcu_read_unlock(); 272 273 return NETDEV_TX_OK; 274 } 275 276 static u64 veth_stats_tx(struct pcpu_lstats *result, struct net_device *dev) 277 { 278 struct veth_priv *priv = netdev_priv(dev); 279 int cpu; 280 281 result->packets = 0; 282 result->bytes = 0; 283 for_each_possible_cpu(cpu) { 284 struct pcpu_lstats *stats = per_cpu_ptr(dev->lstats, cpu); 285 u64 packets, bytes; 286 unsigned int start; 287 288 do { 289 start = u64_stats_fetch_begin_irq(&stats->syncp); 290 packets = stats->packets; 291 bytes = stats->bytes; 292 } while (u64_stats_fetch_retry_irq(&stats->syncp, start)); 293 result->packets += packets; 294 result->bytes += bytes; 295 } 296 return atomic64_read(&priv->dropped); 297 } 298 299 static void veth_stats_rx(struct veth_rq_stats *result, struct net_device *dev) 300 { 301 struct veth_priv *priv = netdev_priv(dev); 302 int i; 303 304 result->xdp_packets = 0; 305 result->xdp_bytes = 0; 306 result->xdp_drops = 0; 307 for (i = 0; i < dev->num_rx_queues; i++) { 308 struct veth_rq_stats *stats = &priv->rq[i].stats; 309 u64 packets, bytes, drops; 310 unsigned int start; 311 312 do { 313 start = u64_stats_fetch_begin_irq(&stats->syncp); 314 packets = stats->xdp_packets; 315 bytes = stats->xdp_bytes; 316 drops = stats->xdp_drops; 317 } while (u64_stats_fetch_retry_irq(&stats->syncp, start)); 318 result->xdp_packets += packets; 319 result->xdp_bytes += bytes; 320 result->xdp_drops += drops; 321 } 322 } 323 324 static void veth_get_stats64(struct net_device *dev, 325 struct rtnl_link_stats64 *tot) 326 { 327 struct veth_priv *priv = netdev_priv(dev); 328 struct net_device *peer; 329 struct veth_rq_stats rx; 330 struct pcpu_lstats tx; 331 332 tot->tx_dropped = veth_stats_tx(&tx, dev); 333 tot->tx_bytes = tx.bytes; 334 tot->tx_packets = tx.packets; 335 336 veth_stats_rx(&rx, dev); 337 tot->rx_dropped = rx.xdp_drops; 338 tot->rx_bytes = rx.xdp_bytes; 339 tot->rx_packets = rx.xdp_packets; 340 341 rcu_read_lock(); 342 peer = rcu_dereference(priv->peer); 343 if (peer) { 344 tot->rx_dropped += veth_stats_tx(&tx, peer); 345 tot->rx_bytes += tx.bytes; 346 tot->rx_packets += tx.packets; 347 348 veth_stats_rx(&rx, peer); 349 tot->tx_bytes += rx.xdp_bytes; 350 tot->tx_packets += rx.xdp_packets; 351 } 352 rcu_read_unlock(); 353 } 354 355 /* fake multicast ability */ 356 static void veth_set_multicast_list(struct net_device *dev) 357 { 358 } 359 360 static struct sk_buff *veth_build_skb(void *head, int headroom, int len, 361 int buflen) 362 { 363 struct sk_buff *skb; 364 365 if (!buflen) { 366 buflen = SKB_DATA_ALIGN(headroom + len) + 367 SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); 368 } 369 skb = build_skb(head, buflen); 370 if (!skb) 371 return NULL; 372 373 skb_reserve(skb, headroom); 374 skb_put(skb, len); 375 376 return skb; 377 } 378 379 static int veth_select_rxq(struct net_device *dev) 380 { 381 return smp_processor_id() % dev->real_num_rx_queues; 382 } 383 384 static int veth_xdp_xmit(struct net_device *dev, int n, 385 struct xdp_frame **frames, u32 flags) 386 { 387 struct veth_priv *rcv_priv, *priv = netdev_priv(dev); 388 struct net_device *rcv; 389 int i, ret, drops = n; 390 unsigned int max_len; 391 struct veth_rq *rq; 392 393 if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK)) { 394 ret = -EINVAL; 395 goto drop; 396 } 397 398 rcv = rcu_dereference(priv->peer); 399 if (unlikely(!rcv)) { 400 ret = -ENXIO; 401 goto drop; 402 } 403 404 rcv_priv = netdev_priv(rcv); 405 rq = &rcv_priv->rq[veth_select_rxq(rcv)]; 406 /* Non-NULL xdp_prog ensures that xdp_ring is initialized on receive 407 * side. This means an XDP program is loaded on the peer and the peer 408 * device is up. 409 */ 410 if (!rcu_access_pointer(rq->xdp_prog)) { 411 ret = -ENXIO; 412 goto drop; 413 } 414 415 drops = 0; 416 max_len = rcv->mtu + rcv->hard_header_len + VLAN_HLEN; 417 418 spin_lock(&rq->xdp_ring.producer_lock); 419 for (i = 0; i < n; i++) { 420 struct xdp_frame *frame = frames[i]; 421 void *ptr = veth_xdp_to_ptr(frame); 422 423 if (unlikely(frame->len > max_len || 424 __ptr_ring_produce(&rq->xdp_ring, ptr))) { 425 xdp_return_frame_rx_napi(frame); 426 drops++; 427 } 428 } 429 spin_unlock(&rq->xdp_ring.producer_lock); 430 431 if (flags & XDP_XMIT_FLUSH) 432 __veth_xdp_flush(rq); 433 434 if (likely(!drops)) 435 return n; 436 437 ret = n - drops; 438 drop: 439 atomic64_add(drops, &priv->dropped); 440 441 return ret; 442 } 443 444 static void veth_xdp_flush(struct net_device *dev) 445 { 446 struct veth_priv *rcv_priv, *priv = netdev_priv(dev); 447 struct net_device *rcv; 448 struct veth_rq *rq; 449 450 rcu_read_lock(); 451 rcv = rcu_dereference(priv->peer); 452 if (unlikely(!rcv)) 453 goto out; 454 455 rcv_priv = netdev_priv(rcv); 456 rq = &rcv_priv->rq[veth_select_rxq(rcv)]; 457 /* xdp_ring is initialized on receive side? */ 458 if (unlikely(!rcu_access_pointer(rq->xdp_prog))) 459 goto out; 460 461 __veth_xdp_flush(rq); 462 out: 463 rcu_read_unlock(); 464 } 465 466 static int veth_xdp_tx(struct net_device *dev, struct xdp_buff *xdp) 467 { 468 struct xdp_frame *frame = convert_to_xdp_frame(xdp); 469 470 if (unlikely(!frame)) 471 return -EOVERFLOW; 472 473 return veth_xdp_xmit(dev, 1, &frame, 0); 474 } 475 476 static struct sk_buff *veth_xdp_rcv_one(struct veth_rq *rq, 477 struct xdp_frame *frame, 478 unsigned int *xdp_xmit) 479 { 480 void *hard_start = frame->data - frame->headroom; 481 void *head = hard_start - sizeof(struct xdp_frame); 482 int len = frame->len, delta = 0; 483 struct xdp_frame orig_frame; 484 struct bpf_prog *xdp_prog; 485 unsigned int headroom; 486 struct sk_buff *skb; 487 488 rcu_read_lock(); 489 xdp_prog = rcu_dereference(rq->xdp_prog); 490 if (likely(xdp_prog)) { 491 struct xdp_buff xdp; 492 u32 act; 493 494 xdp.data_hard_start = hard_start; 495 xdp.data = frame->data; 496 xdp.data_end = frame->data + frame->len; 497 xdp.data_meta = frame->data - frame->metasize; 498 xdp.rxq = &rq->xdp_rxq; 499 500 act = bpf_prog_run_xdp(xdp_prog, &xdp); 501 502 switch (act) { 503 case XDP_PASS: 504 delta = frame->data - xdp.data; 505 len = xdp.data_end - xdp.data; 506 break; 507 case XDP_TX: 508 orig_frame = *frame; 509 xdp.data_hard_start = head; 510 xdp.rxq->mem = frame->mem; 511 if (unlikely(veth_xdp_tx(rq->dev, &xdp) < 0)) { 512 trace_xdp_exception(rq->dev, xdp_prog, act); 513 frame = &orig_frame; 514 goto err_xdp; 515 } 516 *xdp_xmit |= VETH_XDP_TX; 517 rcu_read_unlock(); 518 goto xdp_xmit; 519 case XDP_REDIRECT: 520 orig_frame = *frame; 521 xdp.data_hard_start = head; 522 xdp.rxq->mem = frame->mem; 523 if (xdp_do_redirect(rq->dev, &xdp, xdp_prog)) { 524 frame = &orig_frame; 525 goto err_xdp; 526 } 527 *xdp_xmit |= VETH_XDP_REDIR; 528 rcu_read_unlock(); 529 goto xdp_xmit; 530 default: 531 bpf_warn_invalid_xdp_action(act); 532 /* fall through */ 533 case XDP_ABORTED: 534 trace_xdp_exception(rq->dev, xdp_prog, act); 535 /* fall through */ 536 case XDP_DROP: 537 goto err_xdp; 538 } 539 } 540 rcu_read_unlock(); 541 542 headroom = sizeof(struct xdp_frame) + frame->headroom - delta; 543 skb = veth_build_skb(head, headroom, len, 0); 544 if (!skb) { 545 xdp_return_frame(frame); 546 goto err; 547 } 548 549 xdp_scrub_frame(frame); 550 skb->protocol = eth_type_trans(skb, rq->dev); 551 err: 552 return skb; 553 err_xdp: 554 rcu_read_unlock(); 555 xdp_return_frame(frame); 556 xdp_xmit: 557 return NULL; 558 } 559 560 static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq, struct sk_buff *skb, 561 unsigned int *xdp_xmit) 562 { 563 u32 pktlen, headroom, act, metalen; 564 void *orig_data, *orig_data_end; 565 struct bpf_prog *xdp_prog; 566 int mac_len, delta, off; 567 struct xdp_buff xdp; 568 569 skb_orphan(skb); 570 571 rcu_read_lock(); 572 xdp_prog = rcu_dereference(rq->xdp_prog); 573 if (unlikely(!xdp_prog)) { 574 rcu_read_unlock(); 575 goto out; 576 } 577 578 mac_len = skb->data - skb_mac_header(skb); 579 pktlen = skb->len + mac_len; 580 headroom = skb_headroom(skb) - mac_len; 581 582 if (skb_shared(skb) || skb_head_is_locked(skb) || 583 skb_is_nonlinear(skb) || headroom < XDP_PACKET_HEADROOM) { 584 struct sk_buff *nskb; 585 int size, head_off; 586 void *head, *start; 587 struct page *page; 588 589 size = SKB_DATA_ALIGN(VETH_XDP_HEADROOM + pktlen) + 590 SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); 591 if (size > PAGE_SIZE) 592 goto drop; 593 594 page = alloc_page(GFP_ATOMIC | __GFP_NOWARN); 595 if (!page) 596 goto drop; 597 598 head = page_address(page); 599 start = head + VETH_XDP_HEADROOM; 600 if (skb_copy_bits(skb, -mac_len, start, pktlen)) { 601 page_frag_free(head); 602 goto drop; 603 } 604 605 nskb = veth_build_skb(head, 606 VETH_XDP_HEADROOM + mac_len, skb->len, 607 PAGE_SIZE); 608 if (!nskb) { 609 page_frag_free(head); 610 goto drop; 611 } 612 613 skb_copy_header(nskb, skb); 614 head_off = skb_headroom(nskb) - skb_headroom(skb); 615 skb_headers_offset_update(nskb, head_off); 616 consume_skb(skb); 617 skb = nskb; 618 } 619 620 xdp.data_hard_start = skb->head; 621 xdp.data = skb_mac_header(skb); 622 xdp.data_end = xdp.data + pktlen; 623 xdp.data_meta = xdp.data; 624 xdp.rxq = &rq->xdp_rxq; 625 orig_data = xdp.data; 626 orig_data_end = xdp.data_end; 627 628 act = bpf_prog_run_xdp(xdp_prog, &xdp); 629 630 switch (act) { 631 case XDP_PASS: 632 break; 633 case XDP_TX: 634 get_page(virt_to_page(xdp.data)); 635 consume_skb(skb); 636 xdp.rxq->mem = rq->xdp_mem; 637 if (unlikely(veth_xdp_tx(rq->dev, &xdp) < 0)) { 638 trace_xdp_exception(rq->dev, xdp_prog, act); 639 goto err_xdp; 640 } 641 *xdp_xmit |= VETH_XDP_TX; 642 rcu_read_unlock(); 643 goto xdp_xmit; 644 case XDP_REDIRECT: 645 get_page(virt_to_page(xdp.data)); 646 consume_skb(skb); 647 xdp.rxq->mem = rq->xdp_mem; 648 if (xdp_do_redirect(rq->dev, &xdp, xdp_prog)) 649 goto err_xdp; 650 *xdp_xmit |= VETH_XDP_REDIR; 651 rcu_read_unlock(); 652 goto xdp_xmit; 653 default: 654 bpf_warn_invalid_xdp_action(act); 655 /* fall through */ 656 case XDP_ABORTED: 657 trace_xdp_exception(rq->dev, xdp_prog, act); 658 /* fall through */ 659 case XDP_DROP: 660 goto drop; 661 } 662 rcu_read_unlock(); 663 664 delta = orig_data - xdp.data; 665 off = mac_len + delta; 666 if (off > 0) 667 __skb_push(skb, off); 668 else if (off < 0) 669 __skb_pull(skb, -off); 670 skb->mac_header -= delta; 671 off = xdp.data_end - orig_data_end; 672 if (off != 0) 673 __skb_put(skb, off); 674 skb->protocol = eth_type_trans(skb, rq->dev); 675 676 metalen = xdp.data - xdp.data_meta; 677 if (metalen) 678 skb_metadata_set(skb, metalen); 679 out: 680 return skb; 681 drop: 682 rcu_read_unlock(); 683 kfree_skb(skb); 684 return NULL; 685 err_xdp: 686 rcu_read_unlock(); 687 page_frag_free(xdp.data); 688 xdp_xmit: 689 return NULL; 690 } 691 692 static int veth_xdp_rcv(struct veth_rq *rq, int budget, unsigned int *xdp_xmit) 693 { 694 int i, done = 0, drops = 0, bytes = 0; 695 696 for (i = 0; i < budget; i++) { 697 void *ptr = __ptr_ring_consume(&rq->xdp_ring); 698 unsigned int xdp_xmit_one = 0; 699 struct sk_buff *skb; 700 701 if (!ptr) 702 break; 703 704 if (veth_is_xdp_frame(ptr)) { 705 struct xdp_frame *frame = veth_ptr_to_xdp(ptr); 706 707 bytes += frame->len; 708 skb = veth_xdp_rcv_one(rq, frame, &xdp_xmit_one); 709 } else { 710 skb = ptr; 711 bytes += skb->len; 712 skb = veth_xdp_rcv_skb(rq, skb, &xdp_xmit_one); 713 } 714 *xdp_xmit |= xdp_xmit_one; 715 716 if (skb) 717 napi_gro_receive(&rq->xdp_napi, skb); 718 else if (!xdp_xmit_one) 719 drops++; 720 721 done++; 722 } 723 724 u64_stats_update_begin(&rq->stats.syncp); 725 rq->stats.xdp_packets += done; 726 rq->stats.xdp_bytes += bytes; 727 rq->stats.xdp_drops += drops; 728 u64_stats_update_end(&rq->stats.syncp); 729 730 return done; 731 } 732 733 static int veth_poll(struct napi_struct *napi, int budget) 734 { 735 struct veth_rq *rq = 736 container_of(napi, struct veth_rq, xdp_napi); 737 unsigned int xdp_xmit = 0; 738 int done; 739 740 xdp_set_return_frame_no_direct(); 741 done = veth_xdp_rcv(rq, budget, &xdp_xmit); 742 743 if (done < budget && napi_complete_done(napi, done)) { 744 /* Write rx_notify_masked before reading ptr_ring */ 745 smp_store_mb(rq->rx_notify_masked, false); 746 if (unlikely(!__ptr_ring_empty(&rq->xdp_ring))) { 747 rq->rx_notify_masked = true; 748 napi_schedule(&rq->xdp_napi); 749 } 750 } 751 752 if (xdp_xmit & VETH_XDP_TX) 753 veth_xdp_flush(rq->dev); 754 if (xdp_xmit & VETH_XDP_REDIR) 755 xdp_do_flush_map(); 756 xdp_clear_return_frame_no_direct(); 757 758 return done; 759 } 760 761 static int veth_napi_add(struct net_device *dev) 762 { 763 struct veth_priv *priv = netdev_priv(dev); 764 int err, i; 765 766 for (i = 0; i < dev->real_num_rx_queues; i++) { 767 struct veth_rq *rq = &priv->rq[i]; 768 769 err = ptr_ring_init(&rq->xdp_ring, VETH_RING_SIZE, GFP_KERNEL); 770 if (err) 771 goto err_xdp_ring; 772 } 773 774 for (i = 0; i < dev->real_num_rx_queues; i++) { 775 struct veth_rq *rq = &priv->rq[i]; 776 777 netif_napi_add(dev, &rq->xdp_napi, veth_poll, NAPI_POLL_WEIGHT); 778 napi_enable(&rq->xdp_napi); 779 } 780 781 return 0; 782 err_xdp_ring: 783 for (i--; i >= 0; i--) 784 ptr_ring_cleanup(&priv->rq[i].xdp_ring, veth_ptr_free); 785 786 return err; 787 } 788 789 static void veth_napi_del(struct net_device *dev) 790 { 791 struct veth_priv *priv = netdev_priv(dev); 792 int i; 793 794 for (i = 0; i < dev->real_num_rx_queues; i++) { 795 struct veth_rq *rq = &priv->rq[i]; 796 797 napi_disable(&rq->xdp_napi); 798 napi_hash_del(&rq->xdp_napi); 799 } 800 synchronize_net(); 801 802 for (i = 0; i < dev->real_num_rx_queues; i++) { 803 struct veth_rq *rq = &priv->rq[i]; 804 805 netif_napi_del(&rq->xdp_napi); 806 rq->rx_notify_masked = false; 807 ptr_ring_cleanup(&rq->xdp_ring, veth_ptr_free); 808 } 809 } 810 811 static int veth_enable_xdp(struct net_device *dev) 812 { 813 struct veth_priv *priv = netdev_priv(dev); 814 int err, i; 815 816 if (!xdp_rxq_info_is_reg(&priv->rq[0].xdp_rxq)) { 817 for (i = 0; i < dev->real_num_rx_queues; i++) { 818 struct veth_rq *rq = &priv->rq[i]; 819 820 err = xdp_rxq_info_reg(&rq->xdp_rxq, dev, i); 821 if (err < 0) 822 goto err_rxq_reg; 823 824 err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq, 825 MEM_TYPE_PAGE_SHARED, 826 NULL); 827 if (err < 0) 828 goto err_reg_mem; 829 830 /* Save original mem info as it can be overwritten */ 831 rq->xdp_mem = rq->xdp_rxq.mem; 832 } 833 834 err = veth_napi_add(dev); 835 if (err) 836 goto err_rxq_reg; 837 } 838 839 for (i = 0; i < dev->real_num_rx_queues; i++) 840 rcu_assign_pointer(priv->rq[i].xdp_prog, priv->_xdp_prog); 841 842 return 0; 843 err_reg_mem: 844 xdp_rxq_info_unreg(&priv->rq[i].xdp_rxq); 845 err_rxq_reg: 846 for (i--; i >= 0; i--) 847 xdp_rxq_info_unreg(&priv->rq[i].xdp_rxq); 848 849 return err; 850 } 851 852 static void veth_disable_xdp(struct net_device *dev) 853 { 854 struct veth_priv *priv = netdev_priv(dev); 855 int i; 856 857 for (i = 0; i < dev->real_num_rx_queues; i++) 858 rcu_assign_pointer(priv->rq[i].xdp_prog, NULL); 859 veth_napi_del(dev); 860 for (i = 0; i < dev->real_num_rx_queues; i++) { 861 struct veth_rq *rq = &priv->rq[i]; 862 863 rq->xdp_rxq.mem = rq->xdp_mem; 864 xdp_rxq_info_unreg(&rq->xdp_rxq); 865 } 866 } 867 868 static int veth_open(struct net_device *dev) 869 { 870 struct veth_priv *priv = netdev_priv(dev); 871 struct net_device *peer = rtnl_dereference(priv->peer); 872 int err; 873 874 if (!peer) 875 return -ENOTCONN; 876 877 if (priv->_xdp_prog) { 878 err = veth_enable_xdp(dev); 879 if (err) 880 return err; 881 } 882 883 if (peer->flags & IFF_UP) { 884 netif_carrier_on(dev); 885 netif_carrier_on(peer); 886 } 887 888 return 0; 889 } 890 891 static int veth_close(struct net_device *dev) 892 { 893 struct veth_priv *priv = netdev_priv(dev); 894 struct net_device *peer = rtnl_dereference(priv->peer); 895 896 netif_carrier_off(dev); 897 if (peer) 898 netif_carrier_off(peer); 899 900 if (priv->_xdp_prog) 901 veth_disable_xdp(dev); 902 903 return 0; 904 } 905 906 static int is_valid_veth_mtu(int mtu) 907 { 908 return mtu >= ETH_MIN_MTU && mtu <= ETH_MAX_MTU; 909 } 910 911 static int veth_alloc_queues(struct net_device *dev) 912 { 913 struct veth_priv *priv = netdev_priv(dev); 914 int i; 915 916 priv->rq = kcalloc(dev->num_rx_queues, sizeof(*priv->rq), GFP_KERNEL); 917 if (!priv->rq) 918 return -ENOMEM; 919 920 for (i = 0; i < dev->num_rx_queues; i++) { 921 priv->rq[i].dev = dev; 922 u64_stats_init(&priv->rq[i].stats.syncp); 923 } 924 925 return 0; 926 } 927 928 static void veth_free_queues(struct net_device *dev) 929 { 930 struct veth_priv *priv = netdev_priv(dev); 931 932 kfree(priv->rq); 933 } 934 935 static int veth_dev_init(struct net_device *dev) 936 { 937 int err; 938 939 dev->lstats = netdev_alloc_pcpu_stats(struct pcpu_lstats); 940 if (!dev->lstats) 941 return -ENOMEM; 942 943 err = veth_alloc_queues(dev); 944 if (err) { 945 free_percpu(dev->lstats); 946 return err; 947 } 948 949 return 0; 950 } 951 952 static void veth_dev_free(struct net_device *dev) 953 { 954 veth_free_queues(dev); 955 free_percpu(dev->lstats); 956 } 957 958 #ifdef CONFIG_NET_POLL_CONTROLLER 959 static void veth_poll_controller(struct net_device *dev) 960 { 961 /* veth only receives frames when its peer sends one 962 * Since it has nothing to do with disabling irqs, we are guaranteed 963 * never to have pending data when we poll for it so 964 * there is nothing to do here. 965 * 966 * We need this though so netpoll recognizes us as an interface that 967 * supports polling, which enables bridge devices in virt setups to 968 * still use netconsole 969 */ 970 } 971 #endif /* CONFIG_NET_POLL_CONTROLLER */ 972 973 static int veth_get_iflink(const struct net_device *dev) 974 { 975 struct veth_priv *priv = netdev_priv(dev); 976 struct net_device *peer; 977 int iflink; 978 979 rcu_read_lock(); 980 peer = rcu_dereference(priv->peer); 981 iflink = peer ? peer->ifindex : 0; 982 rcu_read_unlock(); 983 984 return iflink; 985 } 986 987 static netdev_features_t veth_fix_features(struct net_device *dev, 988 netdev_features_t features) 989 { 990 struct veth_priv *priv = netdev_priv(dev); 991 struct net_device *peer; 992 993 peer = rtnl_dereference(priv->peer); 994 if (peer) { 995 struct veth_priv *peer_priv = netdev_priv(peer); 996 997 if (peer_priv->_xdp_prog) 998 features &= ~NETIF_F_GSO_SOFTWARE; 999 } 1000 1001 return features; 1002 } 1003 1004 static void veth_set_rx_headroom(struct net_device *dev, int new_hr) 1005 { 1006 struct veth_priv *peer_priv, *priv = netdev_priv(dev); 1007 struct net_device *peer; 1008 1009 if (new_hr < 0) 1010 new_hr = 0; 1011 1012 rcu_read_lock(); 1013 peer = rcu_dereference(priv->peer); 1014 if (unlikely(!peer)) 1015 goto out; 1016 1017 peer_priv = netdev_priv(peer); 1018 priv->requested_headroom = new_hr; 1019 new_hr = max(priv->requested_headroom, peer_priv->requested_headroom); 1020 dev->needed_headroom = new_hr; 1021 peer->needed_headroom = new_hr; 1022 1023 out: 1024 rcu_read_unlock(); 1025 } 1026 1027 static int veth_xdp_set(struct net_device *dev, struct bpf_prog *prog, 1028 struct netlink_ext_ack *extack) 1029 { 1030 struct veth_priv *priv = netdev_priv(dev); 1031 struct bpf_prog *old_prog; 1032 struct net_device *peer; 1033 unsigned int max_mtu; 1034 int err; 1035 1036 old_prog = priv->_xdp_prog; 1037 priv->_xdp_prog = prog; 1038 peer = rtnl_dereference(priv->peer); 1039 1040 if (prog) { 1041 if (!peer) { 1042 NL_SET_ERR_MSG_MOD(extack, "Cannot set XDP when peer is detached"); 1043 err = -ENOTCONN; 1044 goto err; 1045 } 1046 1047 max_mtu = PAGE_SIZE - VETH_XDP_HEADROOM - 1048 peer->hard_header_len - 1049 SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); 1050 if (peer->mtu > max_mtu) { 1051 NL_SET_ERR_MSG_MOD(extack, "Peer MTU is too large to set XDP"); 1052 err = -ERANGE; 1053 goto err; 1054 } 1055 1056 if (dev->real_num_rx_queues < peer->real_num_tx_queues) { 1057 NL_SET_ERR_MSG_MOD(extack, "XDP expects number of rx queues not less than peer tx queues"); 1058 err = -ENOSPC; 1059 goto err; 1060 } 1061 1062 if (dev->flags & IFF_UP) { 1063 err = veth_enable_xdp(dev); 1064 if (err) { 1065 NL_SET_ERR_MSG_MOD(extack, "Setup for XDP failed"); 1066 goto err; 1067 } 1068 } 1069 1070 if (!old_prog) { 1071 peer->hw_features &= ~NETIF_F_GSO_SOFTWARE; 1072 peer->max_mtu = max_mtu; 1073 } 1074 } 1075 1076 if (old_prog) { 1077 if (!prog) { 1078 if (dev->flags & IFF_UP) 1079 veth_disable_xdp(dev); 1080 1081 if (peer) { 1082 peer->hw_features |= NETIF_F_GSO_SOFTWARE; 1083 peer->max_mtu = ETH_MAX_MTU; 1084 } 1085 } 1086 bpf_prog_put(old_prog); 1087 } 1088 1089 if ((!!old_prog ^ !!prog) && peer) 1090 netdev_update_features(peer); 1091 1092 return 0; 1093 err: 1094 priv->_xdp_prog = old_prog; 1095 1096 return err; 1097 } 1098 1099 static u32 veth_xdp_query(struct net_device *dev) 1100 { 1101 struct veth_priv *priv = netdev_priv(dev); 1102 const struct bpf_prog *xdp_prog; 1103 1104 xdp_prog = priv->_xdp_prog; 1105 if (xdp_prog) 1106 return xdp_prog->aux->id; 1107 1108 return 0; 1109 } 1110 1111 static int veth_xdp(struct net_device *dev, struct netdev_bpf *xdp) 1112 { 1113 switch (xdp->command) { 1114 case XDP_SETUP_PROG: 1115 return veth_xdp_set(dev, xdp->prog, xdp->extack); 1116 case XDP_QUERY_PROG: 1117 xdp->prog_id = veth_xdp_query(dev); 1118 return 0; 1119 default: 1120 return -EINVAL; 1121 } 1122 } 1123 1124 static const struct net_device_ops veth_netdev_ops = { 1125 .ndo_init = veth_dev_init, 1126 .ndo_open = veth_open, 1127 .ndo_stop = veth_close, 1128 .ndo_start_xmit = veth_xmit, 1129 .ndo_get_stats64 = veth_get_stats64, 1130 .ndo_set_rx_mode = veth_set_multicast_list, 1131 .ndo_set_mac_address = eth_mac_addr, 1132 #ifdef CONFIG_NET_POLL_CONTROLLER 1133 .ndo_poll_controller = veth_poll_controller, 1134 #endif 1135 .ndo_get_iflink = veth_get_iflink, 1136 .ndo_fix_features = veth_fix_features, 1137 .ndo_features_check = passthru_features_check, 1138 .ndo_set_rx_headroom = veth_set_rx_headroom, 1139 .ndo_bpf = veth_xdp, 1140 .ndo_xdp_xmit = veth_xdp_xmit, 1141 }; 1142 1143 #define VETH_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HW_CSUM | \ 1144 NETIF_F_RXCSUM | NETIF_F_SCTP_CRC | NETIF_F_HIGHDMA | \ 1145 NETIF_F_GSO_SOFTWARE | NETIF_F_GSO_ENCAP_ALL | \ 1146 NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX | \ 1147 NETIF_F_HW_VLAN_STAG_TX | NETIF_F_HW_VLAN_STAG_RX ) 1148 1149 static void veth_setup(struct net_device *dev) 1150 { 1151 ether_setup(dev); 1152 1153 dev->priv_flags &= ~IFF_TX_SKB_SHARING; 1154 dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; 1155 dev->priv_flags |= IFF_NO_QUEUE; 1156 dev->priv_flags |= IFF_PHONY_HEADROOM; 1157 1158 dev->netdev_ops = &veth_netdev_ops; 1159 dev->ethtool_ops = &veth_ethtool_ops; 1160 dev->features |= NETIF_F_LLTX; 1161 dev->features |= VETH_FEATURES; 1162 dev->vlan_features = dev->features & 1163 ~(NETIF_F_HW_VLAN_CTAG_TX | 1164 NETIF_F_HW_VLAN_STAG_TX | 1165 NETIF_F_HW_VLAN_CTAG_RX | 1166 NETIF_F_HW_VLAN_STAG_RX); 1167 dev->needs_free_netdev = true; 1168 dev->priv_destructor = veth_dev_free; 1169 dev->max_mtu = ETH_MAX_MTU; 1170 1171 dev->hw_features = VETH_FEATURES; 1172 dev->hw_enc_features = VETH_FEATURES; 1173 dev->mpls_features = NETIF_F_HW_CSUM | NETIF_F_GSO_SOFTWARE; 1174 } 1175 1176 /* 1177 * netlink interface 1178 */ 1179 1180 static int veth_validate(struct nlattr *tb[], struct nlattr *data[], 1181 struct netlink_ext_ack *extack) 1182 { 1183 if (tb[IFLA_ADDRESS]) { 1184 if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) 1185 return -EINVAL; 1186 if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) 1187 return -EADDRNOTAVAIL; 1188 } 1189 if (tb[IFLA_MTU]) { 1190 if (!is_valid_veth_mtu(nla_get_u32(tb[IFLA_MTU]))) 1191 return -EINVAL; 1192 } 1193 return 0; 1194 } 1195 1196 static struct rtnl_link_ops veth_link_ops; 1197 1198 static int veth_newlink(struct net *src_net, struct net_device *dev, 1199 struct nlattr *tb[], struct nlattr *data[], 1200 struct netlink_ext_ack *extack) 1201 { 1202 int err; 1203 struct net_device *peer; 1204 struct veth_priv *priv; 1205 char ifname[IFNAMSIZ]; 1206 struct nlattr *peer_tb[IFLA_MAX + 1], **tbp; 1207 unsigned char name_assign_type; 1208 struct ifinfomsg *ifmp; 1209 struct net *net; 1210 1211 /* 1212 * create and register peer first 1213 */ 1214 if (data != NULL && data[VETH_INFO_PEER] != NULL) { 1215 struct nlattr *nla_peer; 1216 1217 nla_peer = data[VETH_INFO_PEER]; 1218 ifmp = nla_data(nla_peer); 1219 err = rtnl_nla_parse_ifla(peer_tb, 1220 nla_data(nla_peer) + sizeof(struct ifinfomsg), 1221 nla_len(nla_peer) - sizeof(struct ifinfomsg), 1222 NULL); 1223 if (err < 0) 1224 return err; 1225 1226 err = veth_validate(peer_tb, NULL, extack); 1227 if (err < 0) 1228 return err; 1229 1230 tbp = peer_tb; 1231 } else { 1232 ifmp = NULL; 1233 tbp = tb; 1234 } 1235 1236 if (ifmp && tbp[IFLA_IFNAME]) { 1237 nla_strlcpy(ifname, tbp[IFLA_IFNAME], IFNAMSIZ); 1238 name_assign_type = NET_NAME_USER; 1239 } else { 1240 snprintf(ifname, IFNAMSIZ, DRV_NAME "%%d"); 1241 name_assign_type = NET_NAME_ENUM; 1242 } 1243 1244 net = rtnl_link_get_net(src_net, tbp); 1245 if (IS_ERR(net)) 1246 return PTR_ERR(net); 1247 1248 peer = rtnl_create_link(net, ifname, name_assign_type, 1249 &veth_link_ops, tbp, extack); 1250 if (IS_ERR(peer)) { 1251 put_net(net); 1252 return PTR_ERR(peer); 1253 } 1254 1255 if (!ifmp || !tbp[IFLA_ADDRESS]) 1256 eth_hw_addr_random(peer); 1257 1258 if (ifmp && (dev->ifindex != 0)) 1259 peer->ifindex = ifmp->ifi_index; 1260 1261 peer->gso_max_size = dev->gso_max_size; 1262 peer->gso_max_segs = dev->gso_max_segs; 1263 1264 err = register_netdevice(peer); 1265 put_net(net); 1266 net = NULL; 1267 if (err < 0) 1268 goto err_register_peer; 1269 1270 netif_carrier_off(peer); 1271 1272 err = rtnl_configure_link(peer, ifmp); 1273 if (err < 0) 1274 goto err_configure_peer; 1275 1276 /* 1277 * register dev last 1278 * 1279 * note, that since we've registered new device the dev's name 1280 * should be re-allocated 1281 */ 1282 1283 if (tb[IFLA_ADDRESS] == NULL) 1284 eth_hw_addr_random(dev); 1285 1286 if (tb[IFLA_IFNAME]) 1287 nla_strlcpy(dev->name, tb[IFLA_IFNAME], IFNAMSIZ); 1288 else 1289 snprintf(dev->name, IFNAMSIZ, DRV_NAME "%%d"); 1290 1291 err = register_netdevice(dev); 1292 if (err < 0) 1293 goto err_register_dev; 1294 1295 netif_carrier_off(dev); 1296 1297 /* 1298 * tie the deviced together 1299 */ 1300 1301 priv = netdev_priv(dev); 1302 rcu_assign_pointer(priv->peer, peer); 1303 1304 priv = netdev_priv(peer); 1305 rcu_assign_pointer(priv->peer, dev); 1306 1307 return 0; 1308 1309 err_register_dev: 1310 /* nothing to do */ 1311 err_configure_peer: 1312 unregister_netdevice(peer); 1313 return err; 1314 1315 err_register_peer: 1316 free_netdev(peer); 1317 return err; 1318 } 1319 1320 static void veth_dellink(struct net_device *dev, struct list_head *head) 1321 { 1322 struct veth_priv *priv; 1323 struct net_device *peer; 1324 1325 priv = netdev_priv(dev); 1326 peer = rtnl_dereference(priv->peer); 1327 1328 /* Note : dellink() is called from default_device_exit_batch(), 1329 * before a rcu_synchronize() point. The devices are guaranteed 1330 * not being freed before one RCU grace period. 1331 */ 1332 RCU_INIT_POINTER(priv->peer, NULL); 1333 unregister_netdevice_queue(dev, head); 1334 1335 if (peer) { 1336 priv = netdev_priv(peer); 1337 RCU_INIT_POINTER(priv->peer, NULL); 1338 unregister_netdevice_queue(peer, head); 1339 } 1340 } 1341 1342 static const struct nla_policy veth_policy[VETH_INFO_MAX + 1] = { 1343 [VETH_INFO_PEER] = { .len = sizeof(struct ifinfomsg) }, 1344 }; 1345 1346 static struct net *veth_get_link_net(const struct net_device *dev) 1347 { 1348 struct veth_priv *priv = netdev_priv(dev); 1349 struct net_device *peer = rtnl_dereference(priv->peer); 1350 1351 return peer ? dev_net(peer) : dev_net(dev); 1352 } 1353 1354 static struct rtnl_link_ops veth_link_ops = { 1355 .kind = DRV_NAME, 1356 .priv_size = sizeof(struct veth_priv), 1357 .setup = veth_setup, 1358 .validate = veth_validate, 1359 .newlink = veth_newlink, 1360 .dellink = veth_dellink, 1361 .policy = veth_policy, 1362 .maxtype = VETH_INFO_MAX, 1363 .get_link_net = veth_get_link_net, 1364 }; 1365 1366 /* 1367 * init/fini 1368 */ 1369 1370 static __init int veth_init(void) 1371 { 1372 return rtnl_link_register(&veth_link_ops); 1373 } 1374 1375 static __exit void veth_exit(void) 1376 { 1377 rtnl_link_unregister(&veth_link_ops); 1378 } 1379 1380 module_init(veth_init); 1381 module_exit(veth_exit); 1382 1383 MODULE_DESCRIPTION("Virtual Ethernet Tunnel"); 1384 MODULE_LICENSE("GPL v2"); 1385 MODULE_ALIAS_RTNL_LINK(DRV_NAME); 1386