1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * drivers/net/veth.c 4 * 5 * Copyright (C) 2007 OpenVZ http://openvz.org, SWsoft Inc 6 * 7 * Author: Pavel Emelianov <xemul@openvz.org> 8 * Ethtool interface from: Eric W. Biederman <ebiederm@xmission.com> 9 * 10 */ 11 12 #include <linux/netdevice.h> 13 #include <linux/slab.h> 14 #include <linux/ethtool.h> 15 #include <linux/etherdevice.h> 16 #include <linux/u64_stats_sync.h> 17 18 #include <net/rtnetlink.h> 19 #include <net/dst.h> 20 #include <net/xfrm.h> 21 #include <net/xdp.h> 22 #include <linux/veth.h> 23 #include <linux/module.h> 24 #include <linux/bpf.h> 25 #include <linux/filter.h> 26 #include <linux/ptr_ring.h> 27 #include <linux/bpf_trace.h> 28 #include <linux/net_tstamp.h> 29 30 #define DRV_NAME "veth" 31 #define DRV_VERSION "1.0" 32 33 #define VETH_XDP_FLAG BIT(0) 34 #define VETH_RING_SIZE 256 35 #define VETH_XDP_HEADROOM (XDP_PACKET_HEADROOM + NET_IP_ALIGN) 36 37 #define VETH_XDP_TX_BULK_SIZE 16 38 #define VETH_XDP_BATCH 16 39 40 struct veth_stats { 41 u64 rx_drops; 42 /* xdp */ 43 u64 xdp_packets; 44 u64 xdp_bytes; 45 u64 xdp_redirect; 46 u64 xdp_drops; 47 u64 xdp_tx; 48 u64 xdp_tx_err; 49 u64 peer_tq_xdp_xmit; 50 u64 peer_tq_xdp_xmit_err; 51 }; 52 53 struct veth_rq_stats { 54 struct veth_stats vs; 55 struct u64_stats_sync syncp; 56 }; 57 58 struct veth_rq { 59 struct napi_struct xdp_napi; 60 struct napi_struct __rcu *napi; /* points to xdp_napi when the latter is initialized */ 61 struct net_device *dev; 62 struct bpf_prog __rcu *xdp_prog; 63 struct xdp_mem_info xdp_mem; 64 struct veth_rq_stats stats; 65 bool rx_notify_masked; 66 struct ptr_ring xdp_ring; 67 struct xdp_rxq_info xdp_rxq; 68 }; 69 70 struct veth_priv { 71 struct net_device __rcu *peer; 72 atomic64_t dropped; 73 struct bpf_prog *_xdp_prog; 74 struct veth_rq *rq; 75 unsigned int requested_headroom; 76 }; 77 78 struct veth_xdp_tx_bq { 79 struct xdp_frame *q[VETH_XDP_TX_BULK_SIZE]; 80 unsigned int count; 81 }; 82 83 /* 84 * ethtool interface 85 */ 86 87 struct veth_q_stat_desc { 88 char desc[ETH_GSTRING_LEN]; 89 size_t offset; 90 }; 91 92 #define VETH_RQ_STAT(m) offsetof(struct veth_stats, m) 93 94 static const struct veth_q_stat_desc veth_rq_stats_desc[] = { 95 { "xdp_packets", VETH_RQ_STAT(xdp_packets) }, 96 { "xdp_bytes", VETH_RQ_STAT(xdp_bytes) }, 97 { "drops", VETH_RQ_STAT(rx_drops) }, 98 { "xdp_redirect", VETH_RQ_STAT(xdp_redirect) }, 99 { "xdp_drops", VETH_RQ_STAT(xdp_drops) }, 100 { "xdp_tx", VETH_RQ_STAT(xdp_tx) }, 101 { "xdp_tx_errors", VETH_RQ_STAT(xdp_tx_err) }, 102 }; 103 104 #define VETH_RQ_STATS_LEN ARRAY_SIZE(veth_rq_stats_desc) 105 106 static const struct veth_q_stat_desc veth_tq_stats_desc[] = { 107 { "xdp_xmit", VETH_RQ_STAT(peer_tq_xdp_xmit) }, 108 { "xdp_xmit_errors", VETH_RQ_STAT(peer_tq_xdp_xmit_err) }, 109 }; 110 111 #define VETH_TQ_STATS_LEN ARRAY_SIZE(veth_tq_stats_desc) 112 113 static struct { 114 const char string[ETH_GSTRING_LEN]; 115 } ethtool_stats_keys[] = { 116 { "peer_ifindex" }, 117 }; 118 119 static int veth_get_link_ksettings(struct net_device *dev, 120 struct ethtool_link_ksettings *cmd) 121 { 122 cmd->base.speed = SPEED_10000; 123 cmd->base.duplex = DUPLEX_FULL; 124 cmd->base.port = PORT_TP; 125 cmd->base.autoneg = AUTONEG_DISABLE; 126 return 0; 127 } 128 129 static void veth_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info) 130 { 131 strlcpy(info->driver, DRV_NAME, sizeof(info->driver)); 132 strlcpy(info->version, DRV_VERSION, sizeof(info->version)); 133 } 134 135 static void veth_get_strings(struct net_device *dev, u32 stringset, u8 *buf) 136 { 137 u8 *p = buf; 138 int i, j; 139 140 switch(stringset) { 141 case ETH_SS_STATS: 142 memcpy(p, ðtool_stats_keys, sizeof(ethtool_stats_keys)); 143 p += sizeof(ethtool_stats_keys); 144 for (i = 0; i < dev->real_num_rx_queues; i++) 145 for (j = 0; j < VETH_RQ_STATS_LEN; j++) 146 ethtool_sprintf(&p, "rx_queue_%u_%.18s", 147 i, veth_rq_stats_desc[j].desc); 148 149 for (i = 0; i < dev->real_num_tx_queues; i++) 150 for (j = 0; j < VETH_TQ_STATS_LEN; j++) 151 ethtool_sprintf(&p, "tx_queue_%u_%.18s", 152 i, veth_tq_stats_desc[j].desc); 153 break; 154 } 155 } 156 157 static int veth_get_sset_count(struct net_device *dev, int sset) 158 { 159 switch (sset) { 160 case ETH_SS_STATS: 161 return ARRAY_SIZE(ethtool_stats_keys) + 162 VETH_RQ_STATS_LEN * dev->real_num_rx_queues + 163 VETH_TQ_STATS_LEN * dev->real_num_tx_queues; 164 default: 165 return -EOPNOTSUPP; 166 } 167 } 168 169 static void veth_get_ethtool_stats(struct net_device *dev, 170 struct ethtool_stats *stats, u64 *data) 171 { 172 struct veth_priv *rcv_priv, *priv = netdev_priv(dev); 173 struct net_device *peer = rtnl_dereference(priv->peer); 174 int i, j, idx; 175 176 data[0] = peer ? peer->ifindex : 0; 177 idx = 1; 178 for (i = 0; i < dev->real_num_rx_queues; i++) { 179 const struct veth_rq_stats *rq_stats = &priv->rq[i].stats; 180 const void *stats_base = (void *)&rq_stats->vs; 181 unsigned int start; 182 size_t offset; 183 184 do { 185 start = u64_stats_fetch_begin_irq(&rq_stats->syncp); 186 for (j = 0; j < VETH_RQ_STATS_LEN; j++) { 187 offset = veth_rq_stats_desc[j].offset; 188 data[idx + j] = *(u64 *)(stats_base + offset); 189 } 190 } while (u64_stats_fetch_retry_irq(&rq_stats->syncp, start)); 191 idx += VETH_RQ_STATS_LEN; 192 } 193 194 if (!peer) 195 return; 196 197 rcv_priv = netdev_priv(peer); 198 for (i = 0; i < peer->real_num_rx_queues; i++) { 199 const struct veth_rq_stats *rq_stats = &rcv_priv->rq[i].stats; 200 const void *base = (void *)&rq_stats->vs; 201 unsigned int start, tx_idx = idx; 202 size_t offset; 203 204 tx_idx += (i % dev->real_num_tx_queues) * VETH_TQ_STATS_LEN; 205 do { 206 start = u64_stats_fetch_begin_irq(&rq_stats->syncp); 207 for (j = 0; j < VETH_TQ_STATS_LEN; j++) { 208 offset = veth_tq_stats_desc[j].offset; 209 data[tx_idx + j] += *(u64 *)(base + offset); 210 } 211 } while (u64_stats_fetch_retry_irq(&rq_stats->syncp, start)); 212 } 213 } 214 215 static void veth_get_channels(struct net_device *dev, 216 struct ethtool_channels *channels) 217 { 218 channels->tx_count = dev->real_num_tx_queues; 219 channels->rx_count = dev->real_num_rx_queues; 220 channels->max_tx = dev->num_tx_queues; 221 channels->max_rx = dev->num_rx_queues; 222 } 223 224 static int veth_set_channels(struct net_device *dev, 225 struct ethtool_channels *ch); 226 227 static const struct ethtool_ops veth_ethtool_ops = { 228 .get_drvinfo = veth_get_drvinfo, 229 .get_link = ethtool_op_get_link, 230 .get_strings = veth_get_strings, 231 .get_sset_count = veth_get_sset_count, 232 .get_ethtool_stats = veth_get_ethtool_stats, 233 .get_link_ksettings = veth_get_link_ksettings, 234 .get_ts_info = ethtool_op_get_ts_info, 235 .get_channels = veth_get_channels, 236 .set_channels = veth_set_channels, 237 }; 238 239 /* general routines */ 240 241 static bool veth_is_xdp_frame(void *ptr) 242 { 243 return (unsigned long)ptr & VETH_XDP_FLAG; 244 } 245 246 static struct xdp_frame *veth_ptr_to_xdp(void *ptr) 247 { 248 return (void *)((unsigned long)ptr & ~VETH_XDP_FLAG); 249 } 250 251 static void *veth_xdp_to_ptr(struct xdp_frame *xdp) 252 { 253 return (void *)((unsigned long)xdp | VETH_XDP_FLAG); 254 } 255 256 static void veth_ptr_free(void *ptr) 257 { 258 if (veth_is_xdp_frame(ptr)) 259 xdp_return_frame(veth_ptr_to_xdp(ptr)); 260 else 261 kfree_skb(ptr); 262 } 263 264 static void __veth_xdp_flush(struct veth_rq *rq) 265 { 266 /* Write ptr_ring before reading rx_notify_masked */ 267 smp_mb(); 268 if (!rq->rx_notify_masked) { 269 rq->rx_notify_masked = true; 270 napi_schedule(&rq->xdp_napi); 271 } 272 } 273 274 static int veth_xdp_rx(struct veth_rq *rq, struct sk_buff *skb) 275 { 276 if (unlikely(ptr_ring_produce(&rq->xdp_ring, skb))) { 277 dev_kfree_skb_any(skb); 278 return NET_RX_DROP; 279 } 280 281 return NET_RX_SUCCESS; 282 } 283 284 static int veth_forward_skb(struct net_device *dev, struct sk_buff *skb, 285 struct veth_rq *rq, bool xdp) 286 { 287 return __dev_forward_skb(dev, skb) ?: xdp ? 288 veth_xdp_rx(rq, skb) : 289 netif_rx(skb); 290 } 291 292 /* return true if the specified skb has chances of GRO aggregation 293 * Don't strive for accuracy, but try to avoid GRO overhead in the most 294 * common scenarios. 295 * When XDP is enabled, all traffic is considered eligible, as the xmit 296 * device has TSO off. 297 * When TSO is enabled on the xmit device, we are likely interested only 298 * in UDP aggregation, explicitly check for that if the skb is suspected 299 * - the sock_wfree destructor is used by UDP, ICMP and XDP sockets - 300 * to belong to locally generated UDP traffic. 301 */ 302 static bool veth_skb_is_eligible_for_gro(const struct net_device *dev, 303 const struct net_device *rcv, 304 const struct sk_buff *skb) 305 { 306 return !(dev->features & NETIF_F_ALL_TSO) || 307 (skb->destructor == sock_wfree && 308 rcv->features & (NETIF_F_GRO_FRAGLIST | NETIF_F_GRO_UDP_FWD)); 309 } 310 311 static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev) 312 { 313 struct veth_priv *rcv_priv, *priv = netdev_priv(dev); 314 struct veth_rq *rq = NULL; 315 struct net_device *rcv; 316 int length = skb->len; 317 bool use_napi = false; 318 int rxq; 319 320 rcu_read_lock(); 321 rcv = rcu_dereference(priv->peer); 322 if (unlikely(!rcv)) { 323 kfree_skb(skb); 324 goto drop; 325 } 326 327 rcv_priv = netdev_priv(rcv); 328 rxq = skb_get_queue_mapping(skb); 329 if (rxq < rcv->real_num_rx_queues) { 330 rq = &rcv_priv->rq[rxq]; 331 332 /* The napi pointer is available when an XDP program is 333 * attached or when GRO is enabled 334 * Don't bother with napi/GRO if the skb can't be aggregated 335 */ 336 use_napi = rcu_access_pointer(rq->napi) && 337 veth_skb_is_eligible_for_gro(dev, rcv, skb); 338 } 339 340 skb_tx_timestamp(skb); 341 if (likely(veth_forward_skb(rcv, skb, rq, use_napi) == NET_RX_SUCCESS)) { 342 if (!use_napi) 343 dev_lstats_add(dev, length); 344 } else { 345 drop: 346 atomic64_inc(&priv->dropped); 347 } 348 349 if (use_napi) 350 __veth_xdp_flush(rq); 351 352 rcu_read_unlock(); 353 354 return NETDEV_TX_OK; 355 } 356 357 static u64 veth_stats_tx(struct net_device *dev, u64 *packets, u64 *bytes) 358 { 359 struct veth_priv *priv = netdev_priv(dev); 360 361 dev_lstats_read(dev, packets, bytes); 362 return atomic64_read(&priv->dropped); 363 } 364 365 static void veth_stats_rx(struct veth_stats *result, struct net_device *dev) 366 { 367 struct veth_priv *priv = netdev_priv(dev); 368 int i; 369 370 result->peer_tq_xdp_xmit_err = 0; 371 result->xdp_packets = 0; 372 result->xdp_tx_err = 0; 373 result->xdp_bytes = 0; 374 result->rx_drops = 0; 375 for (i = 0; i < dev->num_rx_queues; i++) { 376 u64 packets, bytes, drops, xdp_tx_err, peer_tq_xdp_xmit_err; 377 struct veth_rq_stats *stats = &priv->rq[i].stats; 378 unsigned int start; 379 380 do { 381 start = u64_stats_fetch_begin_irq(&stats->syncp); 382 peer_tq_xdp_xmit_err = stats->vs.peer_tq_xdp_xmit_err; 383 xdp_tx_err = stats->vs.xdp_tx_err; 384 packets = stats->vs.xdp_packets; 385 bytes = stats->vs.xdp_bytes; 386 drops = stats->vs.rx_drops; 387 } while (u64_stats_fetch_retry_irq(&stats->syncp, start)); 388 result->peer_tq_xdp_xmit_err += peer_tq_xdp_xmit_err; 389 result->xdp_tx_err += xdp_tx_err; 390 result->xdp_packets += packets; 391 result->xdp_bytes += bytes; 392 result->rx_drops += drops; 393 } 394 } 395 396 static void veth_get_stats64(struct net_device *dev, 397 struct rtnl_link_stats64 *tot) 398 { 399 struct veth_priv *priv = netdev_priv(dev); 400 struct net_device *peer; 401 struct veth_stats rx; 402 u64 packets, bytes; 403 404 tot->tx_dropped = veth_stats_tx(dev, &packets, &bytes); 405 tot->tx_bytes = bytes; 406 tot->tx_packets = packets; 407 408 veth_stats_rx(&rx, dev); 409 tot->tx_dropped += rx.xdp_tx_err; 410 tot->rx_dropped = rx.rx_drops + rx.peer_tq_xdp_xmit_err; 411 tot->rx_bytes = rx.xdp_bytes; 412 tot->rx_packets = rx.xdp_packets; 413 414 rcu_read_lock(); 415 peer = rcu_dereference(priv->peer); 416 if (peer) { 417 veth_stats_tx(peer, &packets, &bytes); 418 tot->rx_bytes += bytes; 419 tot->rx_packets += packets; 420 421 veth_stats_rx(&rx, peer); 422 tot->tx_dropped += rx.peer_tq_xdp_xmit_err; 423 tot->rx_dropped += rx.xdp_tx_err; 424 tot->tx_bytes += rx.xdp_bytes; 425 tot->tx_packets += rx.xdp_packets; 426 } 427 rcu_read_unlock(); 428 } 429 430 /* fake multicast ability */ 431 static void veth_set_multicast_list(struct net_device *dev) 432 { 433 } 434 435 static struct sk_buff *veth_build_skb(void *head, int headroom, int len, 436 int buflen) 437 { 438 struct sk_buff *skb; 439 440 skb = build_skb(head, buflen); 441 if (!skb) 442 return NULL; 443 444 skb_reserve(skb, headroom); 445 skb_put(skb, len); 446 447 return skb; 448 } 449 450 static int veth_select_rxq(struct net_device *dev) 451 { 452 return smp_processor_id() % dev->real_num_rx_queues; 453 } 454 455 static struct net_device *veth_peer_dev(struct net_device *dev) 456 { 457 struct veth_priv *priv = netdev_priv(dev); 458 459 /* Callers must be under RCU read side. */ 460 return rcu_dereference(priv->peer); 461 } 462 463 static int veth_xdp_xmit(struct net_device *dev, int n, 464 struct xdp_frame **frames, 465 u32 flags, bool ndo_xmit) 466 { 467 struct veth_priv *rcv_priv, *priv = netdev_priv(dev); 468 int i, ret = -ENXIO, nxmit = 0; 469 struct net_device *rcv; 470 unsigned int max_len; 471 struct veth_rq *rq; 472 473 if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK)) 474 return -EINVAL; 475 476 rcu_read_lock(); 477 rcv = rcu_dereference(priv->peer); 478 if (unlikely(!rcv)) 479 goto out; 480 481 rcv_priv = netdev_priv(rcv); 482 rq = &rcv_priv->rq[veth_select_rxq(rcv)]; 483 /* The napi pointer is set if NAPI is enabled, which ensures that 484 * xdp_ring is initialized on receive side and the peer device is up. 485 */ 486 if (!rcu_access_pointer(rq->napi)) 487 goto out; 488 489 max_len = rcv->mtu + rcv->hard_header_len + VLAN_HLEN; 490 491 spin_lock(&rq->xdp_ring.producer_lock); 492 for (i = 0; i < n; i++) { 493 struct xdp_frame *frame = frames[i]; 494 void *ptr = veth_xdp_to_ptr(frame); 495 496 if (unlikely(frame->len > max_len || 497 __ptr_ring_produce(&rq->xdp_ring, ptr))) 498 break; 499 nxmit++; 500 } 501 spin_unlock(&rq->xdp_ring.producer_lock); 502 503 if (flags & XDP_XMIT_FLUSH) 504 __veth_xdp_flush(rq); 505 506 ret = nxmit; 507 if (ndo_xmit) { 508 u64_stats_update_begin(&rq->stats.syncp); 509 rq->stats.vs.peer_tq_xdp_xmit += nxmit; 510 rq->stats.vs.peer_tq_xdp_xmit_err += n - nxmit; 511 u64_stats_update_end(&rq->stats.syncp); 512 } 513 514 out: 515 rcu_read_unlock(); 516 517 return ret; 518 } 519 520 static int veth_ndo_xdp_xmit(struct net_device *dev, int n, 521 struct xdp_frame **frames, u32 flags) 522 { 523 int err; 524 525 err = veth_xdp_xmit(dev, n, frames, flags, true); 526 if (err < 0) { 527 struct veth_priv *priv = netdev_priv(dev); 528 529 atomic64_add(n, &priv->dropped); 530 } 531 532 return err; 533 } 534 535 static void veth_xdp_flush_bq(struct veth_rq *rq, struct veth_xdp_tx_bq *bq) 536 { 537 int sent, i, err = 0, drops; 538 539 sent = veth_xdp_xmit(rq->dev, bq->count, bq->q, 0, false); 540 if (sent < 0) { 541 err = sent; 542 sent = 0; 543 } 544 545 for (i = sent; unlikely(i < bq->count); i++) 546 xdp_return_frame(bq->q[i]); 547 548 drops = bq->count - sent; 549 trace_xdp_bulk_tx(rq->dev, sent, drops, err); 550 551 u64_stats_update_begin(&rq->stats.syncp); 552 rq->stats.vs.xdp_tx += sent; 553 rq->stats.vs.xdp_tx_err += drops; 554 u64_stats_update_end(&rq->stats.syncp); 555 556 bq->count = 0; 557 } 558 559 static void veth_xdp_flush(struct veth_rq *rq, struct veth_xdp_tx_bq *bq) 560 { 561 struct veth_priv *rcv_priv, *priv = netdev_priv(rq->dev); 562 struct net_device *rcv; 563 struct veth_rq *rcv_rq; 564 565 rcu_read_lock(); 566 veth_xdp_flush_bq(rq, bq); 567 rcv = rcu_dereference(priv->peer); 568 if (unlikely(!rcv)) 569 goto out; 570 571 rcv_priv = netdev_priv(rcv); 572 rcv_rq = &rcv_priv->rq[veth_select_rxq(rcv)]; 573 /* xdp_ring is initialized on receive side? */ 574 if (unlikely(!rcu_access_pointer(rcv_rq->xdp_prog))) 575 goto out; 576 577 __veth_xdp_flush(rcv_rq); 578 out: 579 rcu_read_unlock(); 580 } 581 582 static int veth_xdp_tx(struct veth_rq *rq, struct xdp_buff *xdp, 583 struct veth_xdp_tx_bq *bq) 584 { 585 struct xdp_frame *frame = xdp_convert_buff_to_frame(xdp); 586 587 if (unlikely(!frame)) 588 return -EOVERFLOW; 589 590 if (unlikely(bq->count == VETH_XDP_TX_BULK_SIZE)) 591 veth_xdp_flush_bq(rq, bq); 592 593 bq->q[bq->count++] = frame; 594 595 return 0; 596 } 597 598 static struct xdp_frame *veth_xdp_rcv_one(struct veth_rq *rq, 599 struct xdp_frame *frame, 600 struct veth_xdp_tx_bq *bq, 601 struct veth_stats *stats) 602 { 603 struct xdp_frame orig_frame; 604 struct bpf_prog *xdp_prog; 605 606 rcu_read_lock(); 607 xdp_prog = rcu_dereference(rq->xdp_prog); 608 if (likely(xdp_prog)) { 609 struct xdp_buff xdp; 610 u32 act; 611 612 xdp_convert_frame_to_buff(frame, &xdp); 613 xdp.rxq = &rq->xdp_rxq; 614 615 act = bpf_prog_run_xdp(xdp_prog, &xdp); 616 617 switch (act) { 618 case XDP_PASS: 619 if (xdp_update_frame_from_buff(&xdp, frame)) 620 goto err_xdp; 621 break; 622 case XDP_TX: 623 orig_frame = *frame; 624 xdp.rxq->mem = frame->mem; 625 if (unlikely(veth_xdp_tx(rq, &xdp, bq) < 0)) { 626 trace_xdp_exception(rq->dev, xdp_prog, act); 627 frame = &orig_frame; 628 stats->rx_drops++; 629 goto err_xdp; 630 } 631 stats->xdp_tx++; 632 rcu_read_unlock(); 633 goto xdp_xmit; 634 case XDP_REDIRECT: 635 orig_frame = *frame; 636 xdp.rxq->mem = frame->mem; 637 if (xdp_do_redirect(rq->dev, &xdp, xdp_prog)) { 638 frame = &orig_frame; 639 stats->rx_drops++; 640 goto err_xdp; 641 } 642 stats->xdp_redirect++; 643 rcu_read_unlock(); 644 goto xdp_xmit; 645 default: 646 bpf_warn_invalid_xdp_action(rq->dev, xdp_prog, act); 647 fallthrough; 648 case XDP_ABORTED: 649 trace_xdp_exception(rq->dev, xdp_prog, act); 650 fallthrough; 651 case XDP_DROP: 652 stats->xdp_drops++; 653 goto err_xdp; 654 } 655 } 656 rcu_read_unlock(); 657 658 return frame; 659 err_xdp: 660 rcu_read_unlock(); 661 xdp_return_frame(frame); 662 xdp_xmit: 663 return NULL; 664 } 665 666 /* frames array contains VETH_XDP_BATCH at most */ 667 static void veth_xdp_rcv_bulk_skb(struct veth_rq *rq, void **frames, 668 int n_xdpf, struct veth_xdp_tx_bq *bq, 669 struct veth_stats *stats) 670 { 671 void *skbs[VETH_XDP_BATCH]; 672 int i; 673 674 if (xdp_alloc_skb_bulk(skbs, n_xdpf, 675 GFP_ATOMIC | __GFP_ZERO) < 0) { 676 for (i = 0; i < n_xdpf; i++) 677 xdp_return_frame(frames[i]); 678 stats->rx_drops += n_xdpf; 679 680 return; 681 } 682 683 for (i = 0; i < n_xdpf; i++) { 684 struct sk_buff *skb = skbs[i]; 685 686 skb = __xdp_build_skb_from_frame(frames[i], skb, 687 rq->dev); 688 if (!skb) { 689 xdp_return_frame(frames[i]); 690 stats->rx_drops++; 691 continue; 692 } 693 napi_gro_receive(&rq->xdp_napi, skb); 694 } 695 } 696 697 static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq, 698 struct sk_buff *skb, 699 struct veth_xdp_tx_bq *bq, 700 struct veth_stats *stats) 701 { 702 u32 pktlen, headroom, act, metalen, frame_sz; 703 void *orig_data, *orig_data_end; 704 struct bpf_prog *xdp_prog; 705 int mac_len, delta, off; 706 struct xdp_buff xdp; 707 708 skb_prepare_for_gro(skb); 709 710 rcu_read_lock(); 711 xdp_prog = rcu_dereference(rq->xdp_prog); 712 if (unlikely(!xdp_prog)) { 713 rcu_read_unlock(); 714 goto out; 715 } 716 717 mac_len = skb->data - skb_mac_header(skb); 718 pktlen = skb->len + mac_len; 719 headroom = skb_headroom(skb) - mac_len; 720 721 if (skb_shared(skb) || skb_head_is_locked(skb) || 722 skb_is_nonlinear(skb) || headroom < XDP_PACKET_HEADROOM) { 723 struct sk_buff *nskb; 724 int size, head_off; 725 void *head, *start; 726 struct page *page; 727 728 size = SKB_DATA_ALIGN(VETH_XDP_HEADROOM + pktlen) + 729 SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); 730 if (size > PAGE_SIZE) 731 goto drop; 732 733 page = alloc_page(GFP_ATOMIC | __GFP_NOWARN); 734 if (!page) 735 goto drop; 736 737 head = page_address(page); 738 start = head + VETH_XDP_HEADROOM; 739 if (skb_copy_bits(skb, -mac_len, start, pktlen)) { 740 page_frag_free(head); 741 goto drop; 742 } 743 744 nskb = veth_build_skb(head, VETH_XDP_HEADROOM + mac_len, 745 skb->len, PAGE_SIZE); 746 if (!nskb) { 747 page_frag_free(head); 748 goto drop; 749 } 750 751 skb_copy_header(nskb, skb); 752 head_off = skb_headroom(nskb) - skb_headroom(skb); 753 skb_headers_offset_update(nskb, head_off); 754 consume_skb(skb); 755 skb = nskb; 756 } 757 758 /* SKB "head" area always have tailroom for skb_shared_info */ 759 frame_sz = skb_end_pointer(skb) - skb->head; 760 frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); 761 xdp_init_buff(&xdp, frame_sz, &rq->xdp_rxq); 762 xdp_prepare_buff(&xdp, skb->head, skb->mac_header, pktlen, true); 763 764 orig_data = xdp.data; 765 orig_data_end = xdp.data_end; 766 767 act = bpf_prog_run_xdp(xdp_prog, &xdp); 768 769 switch (act) { 770 case XDP_PASS: 771 break; 772 case XDP_TX: 773 get_page(virt_to_page(xdp.data)); 774 consume_skb(skb); 775 xdp.rxq->mem = rq->xdp_mem; 776 if (unlikely(veth_xdp_tx(rq, &xdp, bq) < 0)) { 777 trace_xdp_exception(rq->dev, xdp_prog, act); 778 stats->rx_drops++; 779 goto err_xdp; 780 } 781 stats->xdp_tx++; 782 rcu_read_unlock(); 783 goto xdp_xmit; 784 case XDP_REDIRECT: 785 get_page(virt_to_page(xdp.data)); 786 consume_skb(skb); 787 xdp.rxq->mem = rq->xdp_mem; 788 if (xdp_do_redirect(rq->dev, &xdp, xdp_prog)) { 789 stats->rx_drops++; 790 goto err_xdp; 791 } 792 stats->xdp_redirect++; 793 rcu_read_unlock(); 794 goto xdp_xmit; 795 default: 796 bpf_warn_invalid_xdp_action(rq->dev, xdp_prog, act); 797 fallthrough; 798 case XDP_ABORTED: 799 trace_xdp_exception(rq->dev, xdp_prog, act); 800 fallthrough; 801 case XDP_DROP: 802 stats->xdp_drops++; 803 goto xdp_drop; 804 } 805 rcu_read_unlock(); 806 807 /* check if bpf_xdp_adjust_head was used */ 808 delta = orig_data - xdp.data; 809 off = mac_len + delta; 810 if (off > 0) 811 __skb_push(skb, off); 812 else if (off < 0) 813 __skb_pull(skb, -off); 814 skb->mac_header -= delta; 815 816 /* check if bpf_xdp_adjust_tail was used */ 817 off = xdp.data_end - orig_data_end; 818 if (off != 0) 819 __skb_put(skb, off); /* positive on grow, negative on shrink */ 820 skb->protocol = eth_type_trans(skb, rq->dev); 821 822 metalen = xdp.data - xdp.data_meta; 823 if (metalen) 824 skb_metadata_set(skb, metalen); 825 out: 826 return skb; 827 drop: 828 stats->rx_drops++; 829 xdp_drop: 830 rcu_read_unlock(); 831 kfree_skb(skb); 832 return NULL; 833 err_xdp: 834 rcu_read_unlock(); 835 page_frag_free(xdp.data); 836 xdp_xmit: 837 return NULL; 838 } 839 840 static int veth_xdp_rcv(struct veth_rq *rq, int budget, 841 struct veth_xdp_tx_bq *bq, 842 struct veth_stats *stats) 843 { 844 int i, done = 0, n_xdpf = 0; 845 void *xdpf[VETH_XDP_BATCH]; 846 847 for (i = 0; i < budget; i++) { 848 void *ptr = __ptr_ring_consume(&rq->xdp_ring); 849 850 if (!ptr) 851 break; 852 853 if (veth_is_xdp_frame(ptr)) { 854 /* ndo_xdp_xmit */ 855 struct xdp_frame *frame = veth_ptr_to_xdp(ptr); 856 857 stats->xdp_bytes += frame->len; 858 frame = veth_xdp_rcv_one(rq, frame, bq, stats); 859 if (frame) { 860 /* XDP_PASS */ 861 xdpf[n_xdpf++] = frame; 862 if (n_xdpf == VETH_XDP_BATCH) { 863 veth_xdp_rcv_bulk_skb(rq, xdpf, n_xdpf, 864 bq, stats); 865 n_xdpf = 0; 866 } 867 } 868 } else { 869 /* ndo_start_xmit */ 870 struct sk_buff *skb = ptr; 871 872 stats->xdp_bytes += skb->len; 873 skb = veth_xdp_rcv_skb(rq, skb, bq, stats); 874 if (skb) { 875 if (skb_shared(skb) || skb_unclone(skb, GFP_ATOMIC)) 876 netif_receive_skb(skb); 877 else 878 napi_gro_receive(&rq->xdp_napi, skb); 879 } 880 } 881 done++; 882 } 883 884 if (n_xdpf) 885 veth_xdp_rcv_bulk_skb(rq, xdpf, n_xdpf, bq, stats); 886 887 u64_stats_update_begin(&rq->stats.syncp); 888 rq->stats.vs.xdp_redirect += stats->xdp_redirect; 889 rq->stats.vs.xdp_bytes += stats->xdp_bytes; 890 rq->stats.vs.xdp_drops += stats->xdp_drops; 891 rq->stats.vs.rx_drops += stats->rx_drops; 892 rq->stats.vs.xdp_packets += done; 893 u64_stats_update_end(&rq->stats.syncp); 894 895 return done; 896 } 897 898 static int veth_poll(struct napi_struct *napi, int budget) 899 { 900 struct veth_rq *rq = 901 container_of(napi, struct veth_rq, xdp_napi); 902 struct veth_stats stats = {}; 903 struct veth_xdp_tx_bq bq; 904 int done; 905 906 bq.count = 0; 907 908 xdp_set_return_frame_no_direct(); 909 done = veth_xdp_rcv(rq, budget, &bq, &stats); 910 911 if (done < budget && napi_complete_done(napi, done)) { 912 /* Write rx_notify_masked before reading ptr_ring */ 913 smp_store_mb(rq->rx_notify_masked, false); 914 if (unlikely(!__ptr_ring_empty(&rq->xdp_ring))) { 915 rq->rx_notify_masked = true; 916 napi_schedule(&rq->xdp_napi); 917 } 918 } 919 920 if (stats.xdp_tx > 0) 921 veth_xdp_flush(rq, &bq); 922 if (stats.xdp_redirect > 0) 923 xdp_do_flush(); 924 xdp_clear_return_frame_no_direct(); 925 926 return done; 927 } 928 929 static int __veth_napi_enable_range(struct net_device *dev, int start, int end) 930 { 931 struct veth_priv *priv = netdev_priv(dev); 932 int err, i; 933 934 for (i = start; i < end; i++) { 935 struct veth_rq *rq = &priv->rq[i]; 936 937 err = ptr_ring_init(&rq->xdp_ring, VETH_RING_SIZE, GFP_KERNEL); 938 if (err) 939 goto err_xdp_ring; 940 } 941 942 for (i = start; i < end; i++) { 943 struct veth_rq *rq = &priv->rq[i]; 944 945 napi_enable(&rq->xdp_napi); 946 rcu_assign_pointer(priv->rq[i].napi, &priv->rq[i].xdp_napi); 947 } 948 949 return 0; 950 951 err_xdp_ring: 952 for (i--; i >= start; i--) 953 ptr_ring_cleanup(&priv->rq[i].xdp_ring, veth_ptr_free); 954 955 return err; 956 } 957 958 static int __veth_napi_enable(struct net_device *dev) 959 { 960 return __veth_napi_enable_range(dev, 0, dev->real_num_rx_queues); 961 } 962 963 static void veth_napi_del_range(struct net_device *dev, int start, int end) 964 { 965 struct veth_priv *priv = netdev_priv(dev); 966 int i; 967 968 for (i = start; i < end; i++) { 969 struct veth_rq *rq = &priv->rq[i]; 970 971 rcu_assign_pointer(priv->rq[i].napi, NULL); 972 napi_disable(&rq->xdp_napi); 973 __netif_napi_del(&rq->xdp_napi); 974 } 975 synchronize_net(); 976 977 for (i = start; i < end; i++) { 978 struct veth_rq *rq = &priv->rq[i]; 979 980 rq->rx_notify_masked = false; 981 ptr_ring_cleanup(&rq->xdp_ring, veth_ptr_free); 982 } 983 } 984 985 static void veth_napi_del(struct net_device *dev) 986 { 987 veth_napi_del_range(dev, 0, dev->real_num_rx_queues); 988 } 989 990 static bool veth_gro_requested(const struct net_device *dev) 991 { 992 return !!(dev->wanted_features & NETIF_F_GRO); 993 } 994 995 static int veth_enable_xdp_range(struct net_device *dev, int start, int end, 996 bool napi_already_on) 997 { 998 struct veth_priv *priv = netdev_priv(dev); 999 int err, i; 1000 1001 for (i = start; i < end; i++) { 1002 struct veth_rq *rq = &priv->rq[i]; 1003 1004 if (!napi_already_on) 1005 netif_napi_add(dev, &rq->xdp_napi, veth_poll, NAPI_POLL_WEIGHT); 1006 err = xdp_rxq_info_reg(&rq->xdp_rxq, dev, i, rq->xdp_napi.napi_id); 1007 if (err < 0) 1008 goto err_rxq_reg; 1009 1010 err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq, 1011 MEM_TYPE_PAGE_SHARED, 1012 NULL); 1013 if (err < 0) 1014 goto err_reg_mem; 1015 1016 /* Save original mem info as it can be overwritten */ 1017 rq->xdp_mem = rq->xdp_rxq.mem; 1018 } 1019 return 0; 1020 1021 err_reg_mem: 1022 xdp_rxq_info_unreg(&priv->rq[i].xdp_rxq); 1023 err_rxq_reg: 1024 for (i--; i >= start; i--) { 1025 struct veth_rq *rq = &priv->rq[i]; 1026 1027 xdp_rxq_info_unreg(&rq->xdp_rxq); 1028 if (!napi_already_on) 1029 netif_napi_del(&rq->xdp_napi); 1030 } 1031 1032 return err; 1033 } 1034 1035 static void veth_disable_xdp_range(struct net_device *dev, int start, int end, 1036 bool delete_napi) 1037 { 1038 struct veth_priv *priv = netdev_priv(dev); 1039 int i; 1040 1041 for (i = start; i < end; i++) { 1042 struct veth_rq *rq = &priv->rq[i]; 1043 1044 rq->xdp_rxq.mem = rq->xdp_mem; 1045 xdp_rxq_info_unreg(&rq->xdp_rxq); 1046 1047 if (delete_napi) 1048 netif_napi_del(&rq->xdp_napi); 1049 } 1050 } 1051 1052 static int veth_enable_xdp(struct net_device *dev) 1053 { 1054 bool napi_already_on = veth_gro_requested(dev) && (dev->flags & IFF_UP); 1055 struct veth_priv *priv = netdev_priv(dev); 1056 int err, i; 1057 1058 if (!xdp_rxq_info_is_reg(&priv->rq[0].xdp_rxq)) { 1059 err = veth_enable_xdp_range(dev, 0, dev->real_num_rx_queues, napi_already_on); 1060 if (err) 1061 return err; 1062 1063 if (!napi_already_on) { 1064 err = __veth_napi_enable(dev); 1065 if (err) { 1066 veth_disable_xdp_range(dev, 0, dev->real_num_rx_queues, true); 1067 return err; 1068 } 1069 1070 if (!veth_gro_requested(dev)) { 1071 /* user-space did not require GRO, but adding XDP 1072 * is supposed to get GRO working 1073 */ 1074 dev->features |= NETIF_F_GRO; 1075 netdev_features_change(dev); 1076 } 1077 } 1078 } 1079 1080 for (i = 0; i < dev->real_num_rx_queues; i++) { 1081 rcu_assign_pointer(priv->rq[i].xdp_prog, priv->_xdp_prog); 1082 rcu_assign_pointer(priv->rq[i].napi, &priv->rq[i].xdp_napi); 1083 } 1084 1085 return 0; 1086 } 1087 1088 static void veth_disable_xdp(struct net_device *dev) 1089 { 1090 struct veth_priv *priv = netdev_priv(dev); 1091 int i; 1092 1093 for (i = 0; i < dev->real_num_rx_queues; i++) 1094 rcu_assign_pointer(priv->rq[i].xdp_prog, NULL); 1095 1096 if (!netif_running(dev) || !veth_gro_requested(dev)) { 1097 veth_napi_del(dev); 1098 1099 /* if user-space did not require GRO, since adding XDP 1100 * enabled it, clear it now 1101 */ 1102 if (!veth_gro_requested(dev) && netif_running(dev)) { 1103 dev->features &= ~NETIF_F_GRO; 1104 netdev_features_change(dev); 1105 } 1106 } 1107 1108 veth_disable_xdp_range(dev, 0, dev->real_num_rx_queues, false); 1109 } 1110 1111 static int veth_napi_enable_range(struct net_device *dev, int start, int end) 1112 { 1113 struct veth_priv *priv = netdev_priv(dev); 1114 int err, i; 1115 1116 for (i = start; i < end; i++) { 1117 struct veth_rq *rq = &priv->rq[i]; 1118 1119 netif_napi_add(dev, &rq->xdp_napi, veth_poll, NAPI_POLL_WEIGHT); 1120 } 1121 1122 err = __veth_napi_enable_range(dev, start, end); 1123 if (err) { 1124 for (i = start; i < end; i++) { 1125 struct veth_rq *rq = &priv->rq[i]; 1126 1127 netif_napi_del(&rq->xdp_napi); 1128 } 1129 return err; 1130 } 1131 return err; 1132 } 1133 1134 static int veth_napi_enable(struct net_device *dev) 1135 { 1136 return veth_napi_enable_range(dev, 0, dev->real_num_rx_queues); 1137 } 1138 1139 static void veth_disable_range_safe(struct net_device *dev, int start, int end) 1140 { 1141 struct veth_priv *priv = netdev_priv(dev); 1142 1143 if (start >= end) 1144 return; 1145 1146 if (priv->_xdp_prog) { 1147 veth_napi_del_range(dev, start, end); 1148 veth_disable_xdp_range(dev, start, end, false); 1149 } else if (veth_gro_requested(dev)) { 1150 veth_napi_del_range(dev, start, end); 1151 } 1152 } 1153 1154 static int veth_enable_range_safe(struct net_device *dev, int start, int end) 1155 { 1156 struct veth_priv *priv = netdev_priv(dev); 1157 int err; 1158 1159 if (start >= end) 1160 return 0; 1161 1162 if (priv->_xdp_prog) { 1163 /* these channels are freshly initialized, napi is not on there even 1164 * when GRO is requeste 1165 */ 1166 err = veth_enable_xdp_range(dev, start, end, false); 1167 if (err) 1168 return err; 1169 1170 err = __veth_napi_enable_range(dev, start, end); 1171 if (err) { 1172 /* on error always delete the newly added napis */ 1173 veth_disable_xdp_range(dev, start, end, true); 1174 return err; 1175 } 1176 } else if (veth_gro_requested(dev)) { 1177 return veth_napi_enable_range(dev, start, end); 1178 } 1179 return 0; 1180 } 1181 1182 static int veth_set_channels(struct net_device *dev, 1183 struct ethtool_channels *ch) 1184 { 1185 struct veth_priv *priv = netdev_priv(dev); 1186 unsigned int old_rx_count, new_rx_count; 1187 struct veth_priv *peer_priv; 1188 struct net_device *peer; 1189 int err; 1190 1191 /* sanity check. Upper bounds are already enforced by the caller */ 1192 if (!ch->rx_count || !ch->tx_count) 1193 return -EINVAL; 1194 1195 /* avoid braking XDP, if that is enabled */ 1196 peer = rtnl_dereference(priv->peer); 1197 peer_priv = peer ? netdev_priv(peer) : NULL; 1198 if (priv->_xdp_prog && peer && ch->rx_count < peer->real_num_tx_queues) 1199 return -EINVAL; 1200 1201 if (peer && peer_priv && peer_priv->_xdp_prog && ch->tx_count > peer->real_num_rx_queues) 1202 return -EINVAL; 1203 1204 old_rx_count = dev->real_num_rx_queues; 1205 new_rx_count = ch->rx_count; 1206 if (netif_running(dev)) { 1207 /* turn device off */ 1208 netif_carrier_off(dev); 1209 if (peer) 1210 netif_carrier_off(peer); 1211 1212 /* try to allocate new resurces, as needed*/ 1213 err = veth_enable_range_safe(dev, old_rx_count, new_rx_count); 1214 if (err) 1215 goto out; 1216 } 1217 1218 err = netif_set_real_num_rx_queues(dev, ch->rx_count); 1219 if (err) 1220 goto revert; 1221 1222 err = netif_set_real_num_tx_queues(dev, ch->tx_count); 1223 if (err) { 1224 int err2 = netif_set_real_num_rx_queues(dev, old_rx_count); 1225 1226 /* this error condition could happen only if rx and tx change 1227 * in opposite directions (e.g. tx nr raises, rx nr decreases) 1228 * and we can't do anything to fully restore the original 1229 * status 1230 */ 1231 if (err2) 1232 pr_warn("Can't restore rx queues config %d -> %d %d", 1233 new_rx_count, old_rx_count, err2); 1234 else 1235 goto revert; 1236 } 1237 1238 out: 1239 if (netif_running(dev)) { 1240 /* note that we need to swap the arguments WRT the enable part 1241 * to identify the range we have to disable 1242 */ 1243 veth_disable_range_safe(dev, new_rx_count, old_rx_count); 1244 netif_carrier_on(dev); 1245 if (peer) 1246 netif_carrier_on(peer); 1247 } 1248 return err; 1249 1250 revert: 1251 new_rx_count = old_rx_count; 1252 old_rx_count = ch->rx_count; 1253 goto out; 1254 } 1255 1256 static int veth_open(struct net_device *dev) 1257 { 1258 struct veth_priv *priv = netdev_priv(dev); 1259 struct net_device *peer = rtnl_dereference(priv->peer); 1260 int err; 1261 1262 if (!peer) 1263 return -ENOTCONN; 1264 1265 if (priv->_xdp_prog) { 1266 err = veth_enable_xdp(dev); 1267 if (err) 1268 return err; 1269 } else if (veth_gro_requested(dev)) { 1270 err = veth_napi_enable(dev); 1271 if (err) 1272 return err; 1273 } 1274 1275 if (peer->flags & IFF_UP) { 1276 netif_carrier_on(dev); 1277 netif_carrier_on(peer); 1278 } 1279 1280 return 0; 1281 } 1282 1283 static int veth_close(struct net_device *dev) 1284 { 1285 struct veth_priv *priv = netdev_priv(dev); 1286 struct net_device *peer = rtnl_dereference(priv->peer); 1287 1288 netif_carrier_off(dev); 1289 if (peer) 1290 netif_carrier_off(peer); 1291 1292 if (priv->_xdp_prog) 1293 veth_disable_xdp(dev); 1294 else if (veth_gro_requested(dev)) 1295 veth_napi_del(dev); 1296 1297 return 0; 1298 } 1299 1300 static int is_valid_veth_mtu(int mtu) 1301 { 1302 return mtu >= ETH_MIN_MTU && mtu <= ETH_MAX_MTU; 1303 } 1304 1305 static int veth_alloc_queues(struct net_device *dev) 1306 { 1307 struct veth_priv *priv = netdev_priv(dev); 1308 int i; 1309 1310 priv->rq = kcalloc(dev->num_rx_queues, sizeof(*priv->rq), GFP_KERNEL); 1311 if (!priv->rq) 1312 return -ENOMEM; 1313 1314 for (i = 0; i < dev->num_rx_queues; i++) { 1315 priv->rq[i].dev = dev; 1316 u64_stats_init(&priv->rq[i].stats.syncp); 1317 } 1318 1319 return 0; 1320 } 1321 1322 static void veth_free_queues(struct net_device *dev) 1323 { 1324 struct veth_priv *priv = netdev_priv(dev); 1325 1326 kfree(priv->rq); 1327 } 1328 1329 static int veth_dev_init(struct net_device *dev) 1330 { 1331 int err; 1332 1333 dev->lstats = netdev_alloc_pcpu_stats(struct pcpu_lstats); 1334 if (!dev->lstats) 1335 return -ENOMEM; 1336 1337 err = veth_alloc_queues(dev); 1338 if (err) { 1339 free_percpu(dev->lstats); 1340 return err; 1341 } 1342 1343 return 0; 1344 } 1345 1346 static void veth_dev_free(struct net_device *dev) 1347 { 1348 veth_free_queues(dev); 1349 free_percpu(dev->lstats); 1350 } 1351 1352 #ifdef CONFIG_NET_POLL_CONTROLLER 1353 static void veth_poll_controller(struct net_device *dev) 1354 { 1355 /* veth only receives frames when its peer sends one 1356 * Since it has nothing to do with disabling irqs, we are guaranteed 1357 * never to have pending data when we poll for it so 1358 * there is nothing to do here. 1359 * 1360 * We need this though so netpoll recognizes us as an interface that 1361 * supports polling, which enables bridge devices in virt setups to 1362 * still use netconsole 1363 */ 1364 } 1365 #endif /* CONFIG_NET_POLL_CONTROLLER */ 1366 1367 static int veth_get_iflink(const struct net_device *dev) 1368 { 1369 struct veth_priv *priv = netdev_priv(dev); 1370 struct net_device *peer; 1371 int iflink; 1372 1373 rcu_read_lock(); 1374 peer = rcu_dereference(priv->peer); 1375 iflink = peer ? peer->ifindex : 0; 1376 rcu_read_unlock(); 1377 1378 return iflink; 1379 } 1380 1381 static netdev_features_t veth_fix_features(struct net_device *dev, 1382 netdev_features_t features) 1383 { 1384 struct veth_priv *priv = netdev_priv(dev); 1385 struct net_device *peer; 1386 1387 peer = rtnl_dereference(priv->peer); 1388 if (peer) { 1389 struct veth_priv *peer_priv = netdev_priv(peer); 1390 1391 if (peer_priv->_xdp_prog) 1392 features &= ~NETIF_F_GSO_SOFTWARE; 1393 } 1394 if (priv->_xdp_prog) 1395 features |= NETIF_F_GRO; 1396 1397 return features; 1398 } 1399 1400 static int veth_set_features(struct net_device *dev, 1401 netdev_features_t features) 1402 { 1403 netdev_features_t changed = features ^ dev->features; 1404 struct veth_priv *priv = netdev_priv(dev); 1405 int err; 1406 1407 if (!(changed & NETIF_F_GRO) || !(dev->flags & IFF_UP) || priv->_xdp_prog) 1408 return 0; 1409 1410 if (features & NETIF_F_GRO) { 1411 err = veth_napi_enable(dev); 1412 if (err) 1413 return err; 1414 } else { 1415 veth_napi_del(dev); 1416 } 1417 return 0; 1418 } 1419 1420 static void veth_set_rx_headroom(struct net_device *dev, int new_hr) 1421 { 1422 struct veth_priv *peer_priv, *priv = netdev_priv(dev); 1423 struct net_device *peer; 1424 1425 if (new_hr < 0) 1426 new_hr = 0; 1427 1428 rcu_read_lock(); 1429 peer = rcu_dereference(priv->peer); 1430 if (unlikely(!peer)) 1431 goto out; 1432 1433 peer_priv = netdev_priv(peer); 1434 priv->requested_headroom = new_hr; 1435 new_hr = max(priv->requested_headroom, peer_priv->requested_headroom); 1436 dev->needed_headroom = new_hr; 1437 peer->needed_headroom = new_hr; 1438 1439 out: 1440 rcu_read_unlock(); 1441 } 1442 1443 static int veth_xdp_set(struct net_device *dev, struct bpf_prog *prog, 1444 struct netlink_ext_ack *extack) 1445 { 1446 struct veth_priv *priv = netdev_priv(dev); 1447 struct bpf_prog *old_prog; 1448 struct net_device *peer; 1449 unsigned int max_mtu; 1450 int err; 1451 1452 old_prog = priv->_xdp_prog; 1453 priv->_xdp_prog = prog; 1454 peer = rtnl_dereference(priv->peer); 1455 1456 if (prog) { 1457 if (!peer) { 1458 NL_SET_ERR_MSG_MOD(extack, "Cannot set XDP when peer is detached"); 1459 err = -ENOTCONN; 1460 goto err; 1461 } 1462 1463 max_mtu = PAGE_SIZE - VETH_XDP_HEADROOM - 1464 peer->hard_header_len - 1465 SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); 1466 if (peer->mtu > max_mtu) { 1467 NL_SET_ERR_MSG_MOD(extack, "Peer MTU is too large to set XDP"); 1468 err = -ERANGE; 1469 goto err; 1470 } 1471 1472 if (dev->real_num_rx_queues < peer->real_num_tx_queues) { 1473 NL_SET_ERR_MSG_MOD(extack, "XDP expects number of rx queues not less than peer tx queues"); 1474 err = -ENOSPC; 1475 goto err; 1476 } 1477 1478 if (dev->flags & IFF_UP) { 1479 err = veth_enable_xdp(dev); 1480 if (err) { 1481 NL_SET_ERR_MSG_MOD(extack, "Setup for XDP failed"); 1482 goto err; 1483 } 1484 } 1485 1486 if (!old_prog) { 1487 peer->hw_features &= ~NETIF_F_GSO_SOFTWARE; 1488 peer->max_mtu = max_mtu; 1489 } 1490 } 1491 1492 if (old_prog) { 1493 if (!prog) { 1494 if (dev->flags & IFF_UP) 1495 veth_disable_xdp(dev); 1496 1497 if (peer) { 1498 peer->hw_features |= NETIF_F_GSO_SOFTWARE; 1499 peer->max_mtu = ETH_MAX_MTU; 1500 } 1501 } 1502 bpf_prog_put(old_prog); 1503 } 1504 1505 if ((!!old_prog ^ !!prog) && peer) 1506 netdev_update_features(peer); 1507 1508 return 0; 1509 err: 1510 priv->_xdp_prog = old_prog; 1511 1512 return err; 1513 } 1514 1515 static int veth_xdp(struct net_device *dev, struct netdev_bpf *xdp) 1516 { 1517 switch (xdp->command) { 1518 case XDP_SETUP_PROG: 1519 return veth_xdp_set(dev, xdp->prog, xdp->extack); 1520 default: 1521 return -EINVAL; 1522 } 1523 } 1524 1525 static const struct net_device_ops veth_netdev_ops = { 1526 .ndo_init = veth_dev_init, 1527 .ndo_open = veth_open, 1528 .ndo_stop = veth_close, 1529 .ndo_start_xmit = veth_xmit, 1530 .ndo_get_stats64 = veth_get_stats64, 1531 .ndo_set_rx_mode = veth_set_multicast_list, 1532 .ndo_set_mac_address = eth_mac_addr, 1533 #ifdef CONFIG_NET_POLL_CONTROLLER 1534 .ndo_poll_controller = veth_poll_controller, 1535 #endif 1536 .ndo_get_iflink = veth_get_iflink, 1537 .ndo_fix_features = veth_fix_features, 1538 .ndo_set_features = veth_set_features, 1539 .ndo_features_check = passthru_features_check, 1540 .ndo_set_rx_headroom = veth_set_rx_headroom, 1541 .ndo_bpf = veth_xdp, 1542 .ndo_xdp_xmit = veth_ndo_xdp_xmit, 1543 .ndo_get_peer_dev = veth_peer_dev, 1544 }; 1545 1546 #define VETH_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HW_CSUM | \ 1547 NETIF_F_RXCSUM | NETIF_F_SCTP_CRC | NETIF_F_HIGHDMA | \ 1548 NETIF_F_GSO_SOFTWARE | NETIF_F_GSO_ENCAP_ALL | \ 1549 NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX | \ 1550 NETIF_F_HW_VLAN_STAG_TX | NETIF_F_HW_VLAN_STAG_RX ) 1551 1552 static void veth_setup(struct net_device *dev) 1553 { 1554 ether_setup(dev); 1555 1556 dev->priv_flags &= ~IFF_TX_SKB_SHARING; 1557 dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; 1558 dev->priv_flags |= IFF_NO_QUEUE; 1559 dev->priv_flags |= IFF_PHONY_HEADROOM; 1560 1561 dev->netdev_ops = &veth_netdev_ops; 1562 dev->ethtool_ops = &veth_ethtool_ops; 1563 dev->features |= NETIF_F_LLTX; 1564 dev->features |= VETH_FEATURES; 1565 dev->vlan_features = dev->features & 1566 ~(NETIF_F_HW_VLAN_CTAG_TX | 1567 NETIF_F_HW_VLAN_STAG_TX | 1568 NETIF_F_HW_VLAN_CTAG_RX | 1569 NETIF_F_HW_VLAN_STAG_RX); 1570 dev->needs_free_netdev = true; 1571 dev->priv_destructor = veth_dev_free; 1572 dev->max_mtu = ETH_MAX_MTU; 1573 1574 dev->hw_features = VETH_FEATURES; 1575 dev->hw_enc_features = VETH_FEATURES; 1576 dev->mpls_features = NETIF_F_HW_CSUM | NETIF_F_GSO_SOFTWARE; 1577 } 1578 1579 /* 1580 * netlink interface 1581 */ 1582 1583 static int veth_validate(struct nlattr *tb[], struct nlattr *data[], 1584 struct netlink_ext_ack *extack) 1585 { 1586 if (tb[IFLA_ADDRESS]) { 1587 if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) 1588 return -EINVAL; 1589 if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) 1590 return -EADDRNOTAVAIL; 1591 } 1592 if (tb[IFLA_MTU]) { 1593 if (!is_valid_veth_mtu(nla_get_u32(tb[IFLA_MTU]))) 1594 return -EINVAL; 1595 } 1596 return 0; 1597 } 1598 1599 static struct rtnl_link_ops veth_link_ops; 1600 1601 static void veth_disable_gro(struct net_device *dev) 1602 { 1603 dev->features &= ~NETIF_F_GRO; 1604 dev->wanted_features &= ~NETIF_F_GRO; 1605 netdev_update_features(dev); 1606 } 1607 1608 static int veth_init_queues(struct net_device *dev, struct nlattr *tb[]) 1609 { 1610 int err; 1611 1612 if (!tb[IFLA_NUM_TX_QUEUES] && dev->num_tx_queues > 1) { 1613 err = netif_set_real_num_tx_queues(dev, 1); 1614 if (err) 1615 return err; 1616 } 1617 if (!tb[IFLA_NUM_RX_QUEUES] && dev->num_rx_queues > 1) { 1618 err = netif_set_real_num_rx_queues(dev, 1); 1619 if (err) 1620 return err; 1621 } 1622 return 0; 1623 } 1624 1625 static int veth_newlink(struct net *src_net, struct net_device *dev, 1626 struct nlattr *tb[], struct nlattr *data[], 1627 struct netlink_ext_ack *extack) 1628 { 1629 int err; 1630 struct net_device *peer; 1631 struct veth_priv *priv; 1632 char ifname[IFNAMSIZ]; 1633 struct nlattr *peer_tb[IFLA_MAX + 1], **tbp; 1634 unsigned char name_assign_type; 1635 struct ifinfomsg *ifmp; 1636 struct net *net; 1637 1638 /* 1639 * create and register peer first 1640 */ 1641 if (data != NULL && data[VETH_INFO_PEER] != NULL) { 1642 struct nlattr *nla_peer; 1643 1644 nla_peer = data[VETH_INFO_PEER]; 1645 ifmp = nla_data(nla_peer); 1646 err = rtnl_nla_parse_ifla(peer_tb, 1647 nla_data(nla_peer) + sizeof(struct ifinfomsg), 1648 nla_len(nla_peer) - sizeof(struct ifinfomsg), 1649 NULL); 1650 if (err < 0) 1651 return err; 1652 1653 err = veth_validate(peer_tb, NULL, extack); 1654 if (err < 0) 1655 return err; 1656 1657 tbp = peer_tb; 1658 } else { 1659 ifmp = NULL; 1660 tbp = tb; 1661 } 1662 1663 if (ifmp && tbp[IFLA_IFNAME]) { 1664 nla_strscpy(ifname, tbp[IFLA_IFNAME], IFNAMSIZ); 1665 name_assign_type = NET_NAME_USER; 1666 } else { 1667 snprintf(ifname, IFNAMSIZ, DRV_NAME "%%d"); 1668 name_assign_type = NET_NAME_ENUM; 1669 } 1670 1671 net = rtnl_link_get_net(src_net, tbp); 1672 if (IS_ERR(net)) 1673 return PTR_ERR(net); 1674 1675 peer = rtnl_create_link(net, ifname, name_assign_type, 1676 &veth_link_ops, tbp, extack); 1677 if (IS_ERR(peer)) { 1678 put_net(net); 1679 return PTR_ERR(peer); 1680 } 1681 1682 if (!ifmp || !tbp[IFLA_ADDRESS]) 1683 eth_hw_addr_random(peer); 1684 1685 if (ifmp && (dev->ifindex != 0)) 1686 peer->ifindex = ifmp->ifi_index; 1687 1688 netif_set_gso_max_size(peer, dev->gso_max_size); 1689 netif_set_gso_max_segs(peer, dev->gso_max_segs); 1690 1691 err = register_netdevice(peer); 1692 put_net(net); 1693 net = NULL; 1694 if (err < 0) 1695 goto err_register_peer; 1696 1697 /* keep GRO disabled by default to be consistent with the established 1698 * veth behavior 1699 */ 1700 veth_disable_gro(peer); 1701 netif_carrier_off(peer); 1702 1703 err = rtnl_configure_link(peer, ifmp); 1704 if (err < 0) 1705 goto err_configure_peer; 1706 1707 /* 1708 * register dev last 1709 * 1710 * note, that since we've registered new device the dev's name 1711 * should be re-allocated 1712 */ 1713 1714 if (tb[IFLA_ADDRESS] == NULL) 1715 eth_hw_addr_random(dev); 1716 1717 if (tb[IFLA_IFNAME]) 1718 nla_strscpy(dev->name, tb[IFLA_IFNAME], IFNAMSIZ); 1719 else 1720 snprintf(dev->name, IFNAMSIZ, DRV_NAME "%%d"); 1721 1722 err = register_netdevice(dev); 1723 if (err < 0) 1724 goto err_register_dev; 1725 1726 netif_carrier_off(dev); 1727 1728 /* 1729 * tie the deviced together 1730 */ 1731 1732 priv = netdev_priv(dev); 1733 rcu_assign_pointer(priv->peer, peer); 1734 err = veth_init_queues(dev, tb); 1735 if (err) 1736 goto err_queues; 1737 1738 priv = netdev_priv(peer); 1739 rcu_assign_pointer(priv->peer, dev); 1740 err = veth_init_queues(peer, tb); 1741 if (err) 1742 goto err_queues; 1743 1744 veth_disable_gro(dev); 1745 return 0; 1746 1747 err_queues: 1748 unregister_netdevice(dev); 1749 err_register_dev: 1750 /* nothing to do */ 1751 err_configure_peer: 1752 unregister_netdevice(peer); 1753 return err; 1754 1755 err_register_peer: 1756 free_netdev(peer); 1757 return err; 1758 } 1759 1760 static void veth_dellink(struct net_device *dev, struct list_head *head) 1761 { 1762 struct veth_priv *priv; 1763 struct net_device *peer; 1764 1765 priv = netdev_priv(dev); 1766 peer = rtnl_dereference(priv->peer); 1767 1768 /* Note : dellink() is called from default_device_exit_batch(), 1769 * before a rcu_synchronize() point. The devices are guaranteed 1770 * not being freed before one RCU grace period. 1771 */ 1772 RCU_INIT_POINTER(priv->peer, NULL); 1773 unregister_netdevice_queue(dev, head); 1774 1775 if (peer) { 1776 priv = netdev_priv(peer); 1777 RCU_INIT_POINTER(priv->peer, NULL); 1778 unregister_netdevice_queue(peer, head); 1779 } 1780 } 1781 1782 static const struct nla_policy veth_policy[VETH_INFO_MAX + 1] = { 1783 [VETH_INFO_PEER] = { .len = sizeof(struct ifinfomsg) }, 1784 }; 1785 1786 static struct net *veth_get_link_net(const struct net_device *dev) 1787 { 1788 struct veth_priv *priv = netdev_priv(dev); 1789 struct net_device *peer = rtnl_dereference(priv->peer); 1790 1791 return peer ? dev_net(peer) : dev_net(dev); 1792 } 1793 1794 static unsigned int veth_get_num_queues(void) 1795 { 1796 /* enforce the same queue limit as rtnl_create_link */ 1797 int queues = num_possible_cpus(); 1798 1799 if (queues > 4096) 1800 queues = 4096; 1801 return queues; 1802 } 1803 1804 static struct rtnl_link_ops veth_link_ops = { 1805 .kind = DRV_NAME, 1806 .priv_size = sizeof(struct veth_priv), 1807 .setup = veth_setup, 1808 .validate = veth_validate, 1809 .newlink = veth_newlink, 1810 .dellink = veth_dellink, 1811 .policy = veth_policy, 1812 .maxtype = VETH_INFO_MAX, 1813 .get_link_net = veth_get_link_net, 1814 .get_num_tx_queues = veth_get_num_queues, 1815 .get_num_rx_queues = veth_get_num_queues, 1816 }; 1817 1818 /* 1819 * init/fini 1820 */ 1821 1822 static __init int veth_init(void) 1823 { 1824 return rtnl_link_register(&veth_link_ops); 1825 } 1826 1827 static __exit void veth_exit(void) 1828 { 1829 rtnl_link_unregister(&veth_link_ops); 1830 } 1831 1832 module_init(veth_init); 1833 module_exit(veth_exit); 1834 1835 MODULE_DESCRIPTION("Virtual Ethernet Tunnel"); 1836 MODULE_LICENSE("GPL v2"); 1837 MODULE_ALIAS_RTNL_LINK(DRV_NAME); 1838