1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * drivers/net/veth.c 4 * 5 * Copyright (C) 2007 OpenVZ http://openvz.org, SWsoft Inc 6 * 7 * Author: Pavel Emelianov <xemul@openvz.org> 8 * Ethtool interface from: Eric W. Biederman <ebiederm@xmission.com> 9 * 10 */ 11 12 #include <linux/netdevice.h> 13 #include <linux/slab.h> 14 #include <linux/ethtool.h> 15 #include <linux/etherdevice.h> 16 #include <linux/u64_stats_sync.h> 17 18 #include <net/rtnetlink.h> 19 #include <net/dst.h> 20 #include <net/xfrm.h> 21 #include <net/xdp.h> 22 #include <linux/veth.h> 23 #include <linux/module.h> 24 #include <linux/bpf.h> 25 #include <linux/filter.h> 26 #include <linux/ptr_ring.h> 27 #include <linux/bpf_trace.h> 28 #include <linux/net_tstamp.h> 29 30 #define DRV_NAME "veth" 31 #define DRV_VERSION "1.0" 32 33 #define VETH_XDP_FLAG BIT(0) 34 #define VETH_RING_SIZE 256 35 #define VETH_XDP_HEADROOM (XDP_PACKET_HEADROOM + NET_IP_ALIGN) 36 37 #define VETH_XDP_TX_BULK_SIZE 16 38 #define VETH_XDP_BATCH 16 39 40 struct veth_stats { 41 u64 rx_drops; 42 /* xdp */ 43 u64 xdp_packets; 44 u64 xdp_bytes; 45 u64 xdp_redirect; 46 u64 xdp_drops; 47 u64 xdp_tx; 48 u64 xdp_tx_err; 49 u64 peer_tq_xdp_xmit; 50 u64 peer_tq_xdp_xmit_err; 51 }; 52 53 struct veth_rq_stats { 54 struct veth_stats vs; 55 struct u64_stats_sync syncp; 56 }; 57 58 struct veth_rq { 59 struct napi_struct xdp_napi; 60 struct napi_struct __rcu *napi; /* points to xdp_napi when the latter is initialized */ 61 struct net_device *dev; 62 struct bpf_prog __rcu *xdp_prog; 63 struct xdp_mem_info xdp_mem; 64 struct veth_rq_stats stats; 65 bool rx_notify_masked; 66 struct ptr_ring xdp_ring; 67 struct xdp_rxq_info xdp_rxq; 68 }; 69 70 struct veth_priv { 71 struct net_device __rcu *peer; 72 atomic64_t dropped; 73 struct bpf_prog *_xdp_prog; 74 struct veth_rq *rq; 75 unsigned int requested_headroom; 76 }; 77 78 struct veth_xdp_tx_bq { 79 struct xdp_frame *q[VETH_XDP_TX_BULK_SIZE]; 80 unsigned int count; 81 }; 82 83 /* 84 * ethtool interface 85 */ 86 87 struct veth_q_stat_desc { 88 char desc[ETH_GSTRING_LEN]; 89 size_t offset; 90 }; 91 92 #define VETH_RQ_STAT(m) offsetof(struct veth_stats, m) 93 94 static const struct veth_q_stat_desc veth_rq_stats_desc[] = { 95 { "xdp_packets", VETH_RQ_STAT(xdp_packets) }, 96 { "xdp_bytes", VETH_RQ_STAT(xdp_bytes) }, 97 { "drops", VETH_RQ_STAT(rx_drops) }, 98 { "xdp_redirect", VETH_RQ_STAT(xdp_redirect) }, 99 { "xdp_drops", VETH_RQ_STAT(xdp_drops) }, 100 { "xdp_tx", VETH_RQ_STAT(xdp_tx) }, 101 { "xdp_tx_errors", VETH_RQ_STAT(xdp_tx_err) }, 102 }; 103 104 #define VETH_RQ_STATS_LEN ARRAY_SIZE(veth_rq_stats_desc) 105 106 static const struct veth_q_stat_desc veth_tq_stats_desc[] = { 107 { "xdp_xmit", VETH_RQ_STAT(peer_tq_xdp_xmit) }, 108 { "xdp_xmit_errors", VETH_RQ_STAT(peer_tq_xdp_xmit_err) }, 109 }; 110 111 #define VETH_TQ_STATS_LEN ARRAY_SIZE(veth_tq_stats_desc) 112 113 static struct { 114 const char string[ETH_GSTRING_LEN]; 115 } ethtool_stats_keys[] = { 116 { "peer_ifindex" }, 117 }; 118 119 static int veth_get_link_ksettings(struct net_device *dev, 120 struct ethtool_link_ksettings *cmd) 121 { 122 cmd->base.speed = SPEED_10000; 123 cmd->base.duplex = DUPLEX_FULL; 124 cmd->base.port = PORT_TP; 125 cmd->base.autoneg = AUTONEG_DISABLE; 126 return 0; 127 } 128 129 static void veth_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info) 130 { 131 strlcpy(info->driver, DRV_NAME, sizeof(info->driver)); 132 strlcpy(info->version, DRV_VERSION, sizeof(info->version)); 133 } 134 135 static void veth_get_strings(struct net_device *dev, u32 stringset, u8 *buf) 136 { 137 u8 *p = buf; 138 int i, j; 139 140 switch(stringset) { 141 case ETH_SS_STATS: 142 memcpy(p, ðtool_stats_keys, sizeof(ethtool_stats_keys)); 143 p += sizeof(ethtool_stats_keys); 144 for (i = 0; i < dev->real_num_rx_queues; i++) 145 for (j = 0; j < VETH_RQ_STATS_LEN; j++) 146 ethtool_sprintf(&p, "rx_queue_%u_%.18s", 147 i, veth_rq_stats_desc[j].desc); 148 149 for (i = 0; i < dev->real_num_tx_queues; i++) 150 for (j = 0; j < VETH_TQ_STATS_LEN; j++) 151 ethtool_sprintf(&p, "tx_queue_%u_%.18s", 152 i, veth_tq_stats_desc[j].desc); 153 break; 154 } 155 } 156 157 static int veth_get_sset_count(struct net_device *dev, int sset) 158 { 159 switch (sset) { 160 case ETH_SS_STATS: 161 return ARRAY_SIZE(ethtool_stats_keys) + 162 VETH_RQ_STATS_LEN * dev->real_num_rx_queues + 163 VETH_TQ_STATS_LEN * dev->real_num_tx_queues; 164 default: 165 return -EOPNOTSUPP; 166 } 167 } 168 169 static void veth_get_ethtool_stats(struct net_device *dev, 170 struct ethtool_stats *stats, u64 *data) 171 { 172 struct veth_priv *rcv_priv, *priv = netdev_priv(dev); 173 struct net_device *peer = rtnl_dereference(priv->peer); 174 int i, j, idx; 175 176 data[0] = peer ? peer->ifindex : 0; 177 idx = 1; 178 for (i = 0; i < dev->real_num_rx_queues; i++) { 179 const struct veth_rq_stats *rq_stats = &priv->rq[i].stats; 180 const void *stats_base = (void *)&rq_stats->vs; 181 unsigned int start; 182 size_t offset; 183 184 do { 185 start = u64_stats_fetch_begin_irq(&rq_stats->syncp); 186 for (j = 0; j < VETH_RQ_STATS_LEN; j++) { 187 offset = veth_rq_stats_desc[j].offset; 188 data[idx + j] = *(u64 *)(stats_base + offset); 189 } 190 } while (u64_stats_fetch_retry_irq(&rq_stats->syncp, start)); 191 idx += VETH_RQ_STATS_LEN; 192 } 193 194 if (!peer) 195 return; 196 197 rcv_priv = netdev_priv(peer); 198 for (i = 0; i < peer->real_num_rx_queues; i++) { 199 const struct veth_rq_stats *rq_stats = &rcv_priv->rq[i].stats; 200 const void *base = (void *)&rq_stats->vs; 201 unsigned int start, tx_idx = idx; 202 size_t offset; 203 204 tx_idx += (i % dev->real_num_tx_queues) * VETH_TQ_STATS_LEN; 205 do { 206 start = u64_stats_fetch_begin_irq(&rq_stats->syncp); 207 for (j = 0; j < VETH_TQ_STATS_LEN; j++) { 208 offset = veth_tq_stats_desc[j].offset; 209 data[tx_idx + j] += *(u64 *)(base + offset); 210 } 211 } while (u64_stats_fetch_retry_irq(&rq_stats->syncp, start)); 212 } 213 } 214 215 static void veth_get_channels(struct net_device *dev, 216 struct ethtool_channels *channels) 217 { 218 channels->tx_count = dev->real_num_tx_queues; 219 channels->rx_count = dev->real_num_rx_queues; 220 channels->max_tx = dev->num_tx_queues; 221 channels->max_rx = dev->num_rx_queues; 222 } 223 224 static int veth_set_channels(struct net_device *dev, 225 struct ethtool_channels *ch); 226 227 static const struct ethtool_ops veth_ethtool_ops = { 228 .get_drvinfo = veth_get_drvinfo, 229 .get_link = ethtool_op_get_link, 230 .get_strings = veth_get_strings, 231 .get_sset_count = veth_get_sset_count, 232 .get_ethtool_stats = veth_get_ethtool_stats, 233 .get_link_ksettings = veth_get_link_ksettings, 234 .get_ts_info = ethtool_op_get_ts_info, 235 .get_channels = veth_get_channels, 236 .set_channels = veth_set_channels, 237 }; 238 239 /* general routines */ 240 241 static bool veth_is_xdp_frame(void *ptr) 242 { 243 return (unsigned long)ptr & VETH_XDP_FLAG; 244 } 245 246 static struct xdp_frame *veth_ptr_to_xdp(void *ptr) 247 { 248 return (void *)((unsigned long)ptr & ~VETH_XDP_FLAG); 249 } 250 251 static void *veth_xdp_to_ptr(struct xdp_frame *xdp) 252 { 253 return (void *)((unsigned long)xdp | VETH_XDP_FLAG); 254 } 255 256 static void veth_ptr_free(void *ptr) 257 { 258 if (veth_is_xdp_frame(ptr)) 259 xdp_return_frame(veth_ptr_to_xdp(ptr)); 260 else 261 kfree_skb(ptr); 262 } 263 264 static void __veth_xdp_flush(struct veth_rq *rq) 265 { 266 /* Write ptr_ring before reading rx_notify_masked */ 267 smp_mb(); 268 if (!READ_ONCE(rq->rx_notify_masked) && 269 napi_schedule_prep(&rq->xdp_napi)) { 270 WRITE_ONCE(rq->rx_notify_masked, true); 271 __napi_schedule(&rq->xdp_napi); 272 } 273 } 274 275 static int veth_xdp_rx(struct veth_rq *rq, struct sk_buff *skb) 276 { 277 if (unlikely(ptr_ring_produce(&rq->xdp_ring, skb))) { 278 dev_kfree_skb_any(skb); 279 return NET_RX_DROP; 280 } 281 282 return NET_RX_SUCCESS; 283 } 284 285 static int veth_forward_skb(struct net_device *dev, struct sk_buff *skb, 286 struct veth_rq *rq, bool xdp) 287 { 288 return __dev_forward_skb(dev, skb) ?: xdp ? 289 veth_xdp_rx(rq, skb) : 290 netif_rx(skb); 291 } 292 293 /* return true if the specified skb has chances of GRO aggregation 294 * Don't strive for accuracy, but try to avoid GRO overhead in the most 295 * common scenarios. 296 * When XDP is enabled, all traffic is considered eligible, as the xmit 297 * device has TSO off. 298 * When TSO is enabled on the xmit device, we are likely interested only 299 * in UDP aggregation, explicitly check for that if the skb is suspected 300 * - the sock_wfree destructor is used by UDP, ICMP and XDP sockets - 301 * to belong to locally generated UDP traffic. 302 */ 303 static bool veth_skb_is_eligible_for_gro(const struct net_device *dev, 304 const struct net_device *rcv, 305 const struct sk_buff *skb) 306 { 307 return !(dev->features & NETIF_F_ALL_TSO) || 308 (skb->destructor == sock_wfree && 309 rcv->features & (NETIF_F_GRO_FRAGLIST | NETIF_F_GRO_UDP_FWD)); 310 } 311 312 static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev) 313 { 314 struct veth_priv *rcv_priv, *priv = netdev_priv(dev); 315 struct veth_rq *rq = NULL; 316 struct net_device *rcv; 317 int length = skb->len; 318 bool use_napi = false; 319 int rxq; 320 321 rcu_read_lock(); 322 rcv = rcu_dereference(priv->peer); 323 if (unlikely(!rcv)) { 324 kfree_skb(skb); 325 goto drop; 326 } 327 328 rcv_priv = netdev_priv(rcv); 329 rxq = skb_get_queue_mapping(skb); 330 if (rxq < rcv->real_num_rx_queues) { 331 rq = &rcv_priv->rq[rxq]; 332 333 /* The napi pointer is available when an XDP program is 334 * attached or when GRO is enabled 335 * Don't bother with napi/GRO if the skb can't be aggregated 336 */ 337 use_napi = rcu_access_pointer(rq->napi) && 338 veth_skb_is_eligible_for_gro(dev, rcv, skb); 339 } 340 341 skb_tx_timestamp(skb); 342 if (likely(veth_forward_skb(rcv, skb, rq, use_napi) == NET_RX_SUCCESS)) { 343 if (!use_napi) 344 dev_lstats_add(dev, length); 345 } else { 346 drop: 347 atomic64_inc(&priv->dropped); 348 } 349 350 if (use_napi) 351 __veth_xdp_flush(rq); 352 353 rcu_read_unlock(); 354 355 return NETDEV_TX_OK; 356 } 357 358 static u64 veth_stats_tx(struct net_device *dev, u64 *packets, u64 *bytes) 359 { 360 struct veth_priv *priv = netdev_priv(dev); 361 362 dev_lstats_read(dev, packets, bytes); 363 return atomic64_read(&priv->dropped); 364 } 365 366 static void veth_stats_rx(struct veth_stats *result, struct net_device *dev) 367 { 368 struct veth_priv *priv = netdev_priv(dev); 369 int i; 370 371 result->peer_tq_xdp_xmit_err = 0; 372 result->xdp_packets = 0; 373 result->xdp_tx_err = 0; 374 result->xdp_bytes = 0; 375 result->rx_drops = 0; 376 for (i = 0; i < dev->num_rx_queues; i++) { 377 u64 packets, bytes, drops, xdp_tx_err, peer_tq_xdp_xmit_err; 378 struct veth_rq_stats *stats = &priv->rq[i].stats; 379 unsigned int start; 380 381 do { 382 start = u64_stats_fetch_begin_irq(&stats->syncp); 383 peer_tq_xdp_xmit_err = stats->vs.peer_tq_xdp_xmit_err; 384 xdp_tx_err = stats->vs.xdp_tx_err; 385 packets = stats->vs.xdp_packets; 386 bytes = stats->vs.xdp_bytes; 387 drops = stats->vs.rx_drops; 388 } while (u64_stats_fetch_retry_irq(&stats->syncp, start)); 389 result->peer_tq_xdp_xmit_err += peer_tq_xdp_xmit_err; 390 result->xdp_tx_err += xdp_tx_err; 391 result->xdp_packets += packets; 392 result->xdp_bytes += bytes; 393 result->rx_drops += drops; 394 } 395 } 396 397 static void veth_get_stats64(struct net_device *dev, 398 struct rtnl_link_stats64 *tot) 399 { 400 struct veth_priv *priv = netdev_priv(dev); 401 struct net_device *peer; 402 struct veth_stats rx; 403 u64 packets, bytes; 404 405 tot->tx_dropped = veth_stats_tx(dev, &packets, &bytes); 406 tot->tx_bytes = bytes; 407 tot->tx_packets = packets; 408 409 veth_stats_rx(&rx, dev); 410 tot->tx_dropped += rx.xdp_tx_err; 411 tot->rx_dropped = rx.rx_drops + rx.peer_tq_xdp_xmit_err; 412 tot->rx_bytes = rx.xdp_bytes; 413 tot->rx_packets = rx.xdp_packets; 414 415 rcu_read_lock(); 416 peer = rcu_dereference(priv->peer); 417 if (peer) { 418 veth_stats_tx(peer, &packets, &bytes); 419 tot->rx_bytes += bytes; 420 tot->rx_packets += packets; 421 422 veth_stats_rx(&rx, peer); 423 tot->tx_dropped += rx.peer_tq_xdp_xmit_err; 424 tot->rx_dropped += rx.xdp_tx_err; 425 tot->tx_bytes += rx.xdp_bytes; 426 tot->tx_packets += rx.xdp_packets; 427 } 428 rcu_read_unlock(); 429 } 430 431 /* fake multicast ability */ 432 static void veth_set_multicast_list(struct net_device *dev) 433 { 434 } 435 436 static struct sk_buff *veth_build_skb(void *head, int headroom, int len, 437 int buflen) 438 { 439 struct sk_buff *skb; 440 441 skb = build_skb(head, buflen); 442 if (!skb) 443 return NULL; 444 445 skb_reserve(skb, headroom); 446 skb_put(skb, len); 447 448 return skb; 449 } 450 451 static int veth_select_rxq(struct net_device *dev) 452 { 453 return smp_processor_id() % dev->real_num_rx_queues; 454 } 455 456 static struct net_device *veth_peer_dev(struct net_device *dev) 457 { 458 struct veth_priv *priv = netdev_priv(dev); 459 460 /* Callers must be under RCU read side. */ 461 return rcu_dereference(priv->peer); 462 } 463 464 static int veth_xdp_xmit(struct net_device *dev, int n, 465 struct xdp_frame **frames, 466 u32 flags, bool ndo_xmit) 467 { 468 struct veth_priv *rcv_priv, *priv = netdev_priv(dev); 469 int i, ret = -ENXIO, nxmit = 0; 470 struct net_device *rcv; 471 unsigned int max_len; 472 struct veth_rq *rq; 473 474 if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK)) 475 return -EINVAL; 476 477 rcu_read_lock(); 478 rcv = rcu_dereference(priv->peer); 479 if (unlikely(!rcv)) 480 goto out; 481 482 rcv_priv = netdev_priv(rcv); 483 rq = &rcv_priv->rq[veth_select_rxq(rcv)]; 484 /* The napi pointer is set if NAPI is enabled, which ensures that 485 * xdp_ring is initialized on receive side and the peer device is up. 486 */ 487 if (!rcu_access_pointer(rq->napi)) 488 goto out; 489 490 max_len = rcv->mtu + rcv->hard_header_len + VLAN_HLEN; 491 492 spin_lock(&rq->xdp_ring.producer_lock); 493 for (i = 0; i < n; i++) { 494 struct xdp_frame *frame = frames[i]; 495 void *ptr = veth_xdp_to_ptr(frame); 496 497 if (unlikely(frame->len > max_len || 498 __ptr_ring_produce(&rq->xdp_ring, ptr))) 499 break; 500 nxmit++; 501 } 502 spin_unlock(&rq->xdp_ring.producer_lock); 503 504 if (flags & XDP_XMIT_FLUSH) 505 __veth_xdp_flush(rq); 506 507 ret = nxmit; 508 if (ndo_xmit) { 509 u64_stats_update_begin(&rq->stats.syncp); 510 rq->stats.vs.peer_tq_xdp_xmit += nxmit; 511 rq->stats.vs.peer_tq_xdp_xmit_err += n - nxmit; 512 u64_stats_update_end(&rq->stats.syncp); 513 } 514 515 out: 516 rcu_read_unlock(); 517 518 return ret; 519 } 520 521 static int veth_ndo_xdp_xmit(struct net_device *dev, int n, 522 struct xdp_frame **frames, u32 flags) 523 { 524 int err; 525 526 err = veth_xdp_xmit(dev, n, frames, flags, true); 527 if (err < 0) { 528 struct veth_priv *priv = netdev_priv(dev); 529 530 atomic64_add(n, &priv->dropped); 531 } 532 533 return err; 534 } 535 536 static void veth_xdp_flush_bq(struct veth_rq *rq, struct veth_xdp_tx_bq *bq) 537 { 538 int sent, i, err = 0, drops; 539 540 sent = veth_xdp_xmit(rq->dev, bq->count, bq->q, 0, false); 541 if (sent < 0) { 542 err = sent; 543 sent = 0; 544 } 545 546 for (i = sent; unlikely(i < bq->count); i++) 547 xdp_return_frame(bq->q[i]); 548 549 drops = bq->count - sent; 550 trace_xdp_bulk_tx(rq->dev, sent, drops, err); 551 552 u64_stats_update_begin(&rq->stats.syncp); 553 rq->stats.vs.xdp_tx += sent; 554 rq->stats.vs.xdp_tx_err += drops; 555 u64_stats_update_end(&rq->stats.syncp); 556 557 bq->count = 0; 558 } 559 560 static void veth_xdp_flush(struct veth_rq *rq, struct veth_xdp_tx_bq *bq) 561 { 562 struct veth_priv *rcv_priv, *priv = netdev_priv(rq->dev); 563 struct net_device *rcv; 564 struct veth_rq *rcv_rq; 565 566 rcu_read_lock(); 567 veth_xdp_flush_bq(rq, bq); 568 rcv = rcu_dereference(priv->peer); 569 if (unlikely(!rcv)) 570 goto out; 571 572 rcv_priv = netdev_priv(rcv); 573 rcv_rq = &rcv_priv->rq[veth_select_rxq(rcv)]; 574 /* xdp_ring is initialized on receive side? */ 575 if (unlikely(!rcu_access_pointer(rcv_rq->xdp_prog))) 576 goto out; 577 578 __veth_xdp_flush(rcv_rq); 579 out: 580 rcu_read_unlock(); 581 } 582 583 static int veth_xdp_tx(struct veth_rq *rq, struct xdp_buff *xdp, 584 struct veth_xdp_tx_bq *bq) 585 { 586 struct xdp_frame *frame = xdp_convert_buff_to_frame(xdp); 587 588 if (unlikely(!frame)) 589 return -EOVERFLOW; 590 591 if (unlikely(bq->count == VETH_XDP_TX_BULK_SIZE)) 592 veth_xdp_flush_bq(rq, bq); 593 594 bq->q[bq->count++] = frame; 595 596 return 0; 597 } 598 599 static struct xdp_frame *veth_xdp_rcv_one(struct veth_rq *rq, 600 struct xdp_frame *frame, 601 struct veth_xdp_tx_bq *bq, 602 struct veth_stats *stats) 603 { 604 struct xdp_frame orig_frame; 605 struct bpf_prog *xdp_prog; 606 607 rcu_read_lock(); 608 xdp_prog = rcu_dereference(rq->xdp_prog); 609 if (likely(xdp_prog)) { 610 struct xdp_buff xdp; 611 u32 act; 612 613 xdp_convert_frame_to_buff(frame, &xdp); 614 xdp.rxq = &rq->xdp_rxq; 615 616 act = bpf_prog_run_xdp(xdp_prog, &xdp); 617 618 switch (act) { 619 case XDP_PASS: 620 if (xdp_update_frame_from_buff(&xdp, frame)) 621 goto err_xdp; 622 break; 623 case XDP_TX: 624 orig_frame = *frame; 625 xdp.rxq->mem = frame->mem; 626 if (unlikely(veth_xdp_tx(rq, &xdp, bq) < 0)) { 627 trace_xdp_exception(rq->dev, xdp_prog, act); 628 frame = &orig_frame; 629 stats->rx_drops++; 630 goto err_xdp; 631 } 632 stats->xdp_tx++; 633 rcu_read_unlock(); 634 goto xdp_xmit; 635 case XDP_REDIRECT: 636 orig_frame = *frame; 637 xdp.rxq->mem = frame->mem; 638 if (xdp_do_redirect(rq->dev, &xdp, xdp_prog)) { 639 frame = &orig_frame; 640 stats->rx_drops++; 641 goto err_xdp; 642 } 643 stats->xdp_redirect++; 644 rcu_read_unlock(); 645 goto xdp_xmit; 646 default: 647 bpf_warn_invalid_xdp_action(rq->dev, xdp_prog, act); 648 fallthrough; 649 case XDP_ABORTED: 650 trace_xdp_exception(rq->dev, xdp_prog, act); 651 fallthrough; 652 case XDP_DROP: 653 stats->xdp_drops++; 654 goto err_xdp; 655 } 656 } 657 rcu_read_unlock(); 658 659 return frame; 660 err_xdp: 661 rcu_read_unlock(); 662 xdp_return_frame(frame); 663 xdp_xmit: 664 return NULL; 665 } 666 667 /* frames array contains VETH_XDP_BATCH at most */ 668 static void veth_xdp_rcv_bulk_skb(struct veth_rq *rq, void **frames, 669 int n_xdpf, struct veth_xdp_tx_bq *bq, 670 struct veth_stats *stats) 671 { 672 void *skbs[VETH_XDP_BATCH]; 673 int i; 674 675 if (xdp_alloc_skb_bulk(skbs, n_xdpf, 676 GFP_ATOMIC | __GFP_ZERO) < 0) { 677 for (i = 0; i < n_xdpf; i++) 678 xdp_return_frame(frames[i]); 679 stats->rx_drops += n_xdpf; 680 681 return; 682 } 683 684 for (i = 0; i < n_xdpf; i++) { 685 struct sk_buff *skb = skbs[i]; 686 687 skb = __xdp_build_skb_from_frame(frames[i], skb, 688 rq->dev); 689 if (!skb) { 690 xdp_return_frame(frames[i]); 691 stats->rx_drops++; 692 continue; 693 } 694 napi_gro_receive(&rq->xdp_napi, skb); 695 } 696 } 697 698 static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq, 699 struct sk_buff *skb, 700 struct veth_xdp_tx_bq *bq, 701 struct veth_stats *stats) 702 { 703 u32 pktlen, headroom, act, metalen, frame_sz; 704 void *orig_data, *orig_data_end; 705 struct bpf_prog *xdp_prog; 706 int mac_len, delta, off; 707 struct xdp_buff xdp; 708 709 skb_prepare_for_gro(skb); 710 711 rcu_read_lock(); 712 xdp_prog = rcu_dereference(rq->xdp_prog); 713 if (unlikely(!xdp_prog)) { 714 rcu_read_unlock(); 715 goto out; 716 } 717 718 mac_len = skb->data - skb_mac_header(skb); 719 pktlen = skb->len + mac_len; 720 headroom = skb_headroom(skb) - mac_len; 721 722 if (skb_shared(skb) || skb_head_is_locked(skb) || 723 skb_is_nonlinear(skb) || headroom < XDP_PACKET_HEADROOM) { 724 struct sk_buff *nskb; 725 int size, head_off; 726 void *head, *start; 727 struct page *page; 728 729 size = SKB_DATA_ALIGN(VETH_XDP_HEADROOM + pktlen) + 730 SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); 731 if (size > PAGE_SIZE) 732 goto drop; 733 734 page = alloc_page(GFP_ATOMIC | __GFP_NOWARN); 735 if (!page) 736 goto drop; 737 738 head = page_address(page); 739 start = head + VETH_XDP_HEADROOM; 740 if (skb_copy_bits(skb, -mac_len, start, pktlen)) { 741 page_frag_free(head); 742 goto drop; 743 } 744 745 nskb = veth_build_skb(head, VETH_XDP_HEADROOM + mac_len, 746 skb->len, PAGE_SIZE); 747 if (!nskb) { 748 page_frag_free(head); 749 goto drop; 750 } 751 752 skb_copy_header(nskb, skb); 753 head_off = skb_headroom(nskb) - skb_headroom(skb); 754 skb_headers_offset_update(nskb, head_off); 755 consume_skb(skb); 756 skb = nskb; 757 } 758 759 /* SKB "head" area always have tailroom for skb_shared_info */ 760 frame_sz = skb_end_pointer(skb) - skb->head; 761 frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); 762 xdp_init_buff(&xdp, frame_sz, &rq->xdp_rxq); 763 xdp_prepare_buff(&xdp, skb->head, skb->mac_header, pktlen, true); 764 765 orig_data = xdp.data; 766 orig_data_end = xdp.data_end; 767 768 act = bpf_prog_run_xdp(xdp_prog, &xdp); 769 770 switch (act) { 771 case XDP_PASS: 772 break; 773 case XDP_TX: 774 get_page(virt_to_page(xdp.data)); 775 consume_skb(skb); 776 xdp.rxq->mem = rq->xdp_mem; 777 if (unlikely(veth_xdp_tx(rq, &xdp, bq) < 0)) { 778 trace_xdp_exception(rq->dev, xdp_prog, act); 779 stats->rx_drops++; 780 goto err_xdp; 781 } 782 stats->xdp_tx++; 783 rcu_read_unlock(); 784 goto xdp_xmit; 785 case XDP_REDIRECT: 786 get_page(virt_to_page(xdp.data)); 787 consume_skb(skb); 788 xdp.rxq->mem = rq->xdp_mem; 789 if (xdp_do_redirect(rq->dev, &xdp, xdp_prog)) { 790 stats->rx_drops++; 791 goto err_xdp; 792 } 793 stats->xdp_redirect++; 794 rcu_read_unlock(); 795 goto xdp_xmit; 796 default: 797 bpf_warn_invalid_xdp_action(rq->dev, xdp_prog, act); 798 fallthrough; 799 case XDP_ABORTED: 800 trace_xdp_exception(rq->dev, xdp_prog, act); 801 fallthrough; 802 case XDP_DROP: 803 stats->xdp_drops++; 804 goto xdp_drop; 805 } 806 rcu_read_unlock(); 807 808 /* check if bpf_xdp_adjust_head was used */ 809 delta = orig_data - xdp.data; 810 off = mac_len + delta; 811 if (off > 0) 812 __skb_push(skb, off); 813 else if (off < 0) 814 __skb_pull(skb, -off); 815 skb->mac_header -= delta; 816 817 /* check if bpf_xdp_adjust_tail was used */ 818 off = xdp.data_end - orig_data_end; 819 if (off != 0) 820 __skb_put(skb, off); /* positive on grow, negative on shrink */ 821 skb->protocol = eth_type_trans(skb, rq->dev); 822 823 metalen = xdp.data - xdp.data_meta; 824 if (metalen) 825 skb_metadata_set(skb, metalen); 826 out: 827 return skb; 828 drop: 829 stats->rx_drops++; 830 xdp_drop: 831 rcu_read_unlock(); 832 kfree_skb(skb); 833 return NULL; 834 err_xdp: 835 rcu_read_unlock(); 836 page_frag_free(xdp.data); 837 xdp_xmit: 838 return NULL; 839 } 840 841 static int veth_xdp_rcv(struct veth_rq *rq, int budget, 842 struct veth_xdp_tx_bq *bq, 843 struct veth_stats *stats) 844 { 845 int i, done = 0, n_xdpf = 0; 846 void *xdpf[VETH_XDP_BATCH]; 847 848 for (i = 0; i < budget; i++) { 849 void *ptr = __ptr_ring_consume(&rq->xdp_ring); 850 851 if (!ptr) 852 break; 853 854 if (veth_is_xdp_frame(ptr)) { 855 /* ndo_xdp_xmit */ 856 struct xdp_frame *frame = veth_ptr_to_xdp(ptr); 857 858 stats->xdp_bytes += frame->len; 859 frame = veth_xdp_rcv_one(rq, frame, bq, stats); 860 if (frame) { 861 /* XDP_PASS */ 862 xdpf[n_xdpf++] = frame; 863 if (n_xdpf == VETH_XDP_BATCH) { 864 veth_xdp_rcv_bulk_skb(rq, xdpf, n_xdpf, 865 bq, stats); 866 n_xdpf = 0; 867 } 868 } 869 } else { 870 /* ndo_start_xmit */ 871 struct sk_buff *skb = ptr; 872 873 stats->xdp_bytes += skb->len; 874 skb = veth_xdp_rcv_skb(rq, skb, bq, stats); 875 if (skb) { 876 if (skb_shared(skb) || skb_unclone(skb, GFP_ATOMIC)) 877 netif_receive_skb(skb); 878 else 879 napi_gro_receive(&rq->xdp_napi, skb); 880 } 881 } 882 done++; 883 } 884 885 if (n_xdpf) 886 veth_xdp_rcv_bulk_skb(rq, xdpf, n_xdpf, bq, stats); 887 888 u64_stats_update_begin(&rq->stats.syncp); 889 rq->stats.vs.xdp_redirect += stats->xdp_redirect; 890 rq->stats.vs.xdp_bytes += stats->xdp_bytes; 891 rq->stats.vs.xdp_drops += stats->xdp_drops; 892 rq->stats.vs.rx_drops += stats->rx_drops; 893 rq->stats.vs.xdp_packets += done; 894 u64_stats_update_end(&rq->stats.syncp); 895 896 return done; 897 } 898 899 static int veth_poll(struct napi_struct *napi, int budget) 900 { 901 struct veth_rq *rq = 902 container_of(napi, struct veth_rq, xdp_napi); 903 struct veth_stats stats = {}; 904 struct veth_xdp_tx_bq bq; 905 int done; 906 907 bq.count = 0; 908 909 xdp_set_return_frame_no_direct(); 910 done = veth_xdp_rcv(rq, budget, &bq, &stats); 911 912 if (done < budget && napi_complete_done(napi, done)) { 913 /* Write rx_notify_masked before reading ptr_ring */ 914 smp_store_mb(rq->rx_notify_masked, false); 915 if (unlikely(!__ptr_ring_empty(&rq->xdp_ring))) { 916 if (napi_schedule_prep(&rq->xdp_napi)) { 917 WRITE_ONCE(rq->rx_notify_masked, true); 918 __napi_schedule(&rq->xdp_napi); 919 } 920 } 921 } 922 923 if (stats.xdp_tx > 0) 924 veth_xdp_flush(rq, &bq); 925 if (stats.xdp_redirect > 0) 926 xdp_do_flush(); 927 xdp_clear_return_frame_no_direct(); 928 929 return done; 930 } 931 932 static int __veth_napi_enable_range(struct net_device *dev, int start, int end) 933 { 934 struct veth_priv *priv = netdev_priv(dev); 935 int err, i; 936 937 for (i = start; i < end; i++) { 938 struct veth_rq *rq = &priv->rq[i]; 939 940 err = ptr_ring_init(&rq->xdp_ring, VETH_RING_SIZE, GFP_KERNEL); 941 if (err) 942 goto err_xdp_ring; 943 } 944 945 for (i = start; i < end; i++) { 946 struct veth_rq *rq = &priv->rq[i]; 947 948 napi_enable(&rq->xdp_napi); 949 rcu_assign_pointer(priv->rq[i].napi, &priv->rq[i].xdp_napi); 950 } 951 952 return 0; 953 954 err_xdp_ring: 955 for (i--; i >= start; i--) 956 ptr_ring_cleanup(&priv->rq[i].xdp_ring, veth_ptr_free); 957 958 return err; 959 } 960 961 static int __veth_napi_enable(struct net_device *dev) 962 { 963 return __veth_napi_enable_range(dev, 0, dev->real_num_rx_queues); 964 } 965 966 static void veth_napi_del_range(struct net_device *dev, int start, int end) 967 { 968 struct veth_priv *priv = netdev_priv(dev); 969 int i; 970 971 for (i = start; i < end; i++) { 972 struct veth_rq *rq = &priv->rq[i]; 973 974 rcu_assign_pointer(priv->rq[i].napi, NULL); 975 napi_disable(&rq->xdp_napi); 976 __netif_napi_del(&rq->xdp_napi); 977 } 978 synchronize_net(); 979 980 for (i = start; i < end; i++) { 981 struct veth_rq *rq = &priv->rq[i]; 982 983 rq->rx_notify_masked = false; 984 ptr_ring_cleanup(&rq->xdp_ring, veth_ptr_free); 985 } 986 } 987 988 static void veth_napi_del(struct net_device *dev) 989 { 990 veth_napi_del_range(dev, 0, dev->real_num_rx_queues); 991 } 992 993 static bool veth_gro_requested(const struct net_device *dev) 994 { 995 return !!(dev->wanted_features & NETIF_F_GRO); 996 } 997 998 static int veth_enable_xdp_range(struct net_device *dev, int start, int end, 999 bool napi_already_on) 1000 { 1001 struct veth_priv *priv = netdev_priv(dev); 1002 int err, i; 1003 1004 for (i = start; i < end; i++) { 1005 struct veth_rq *rq = &priv->rq[i]; 1006 1007 if (!napi_already_on) 1008 netif_napi_add(dev, &rq->xdp_napi, veth_poll, NAPI_POLL_WEIGHT); 1009 err = xdp_rxq_info_reg(&rq->xdp_rxq, dev, i, rq->xdp_napi.napi_id); 1010 if (err < 0) 1011 goto err_rxq_reg; 1012 1013 err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq, 1014 MEM_TYPE_PAGE_SHARED, 1015 NULL); 1016 if (err < 0) 1017 goto err_reg_mem; 1018 1019 /* Save original mem info as it can be overwritten */ 1020 rq->xdp_mem = rq->xdp_rxq.mem; 1021 } 1022 return 0; 1023 1024 err_reg_mem: 1025 xdp_rxq_info_unreg(&priv->rq[i].xdp_rxq); 1026 err_rxq_reg: 1027 for (i--; i >= start; i--) { 1028 struct veth_rq *rq = &priv->rq[i]; 1029 1030 xdp_rxq_info_unreg(&rq->xdp_rxq); 1031 if (!napi_already_on) 1032 netif_napi_del(&rq->xdp_napi); 1033 } 1034 1035 return err; 1036 } 1037 1038 static void veth_disable_xdp_range(struct net_device *dev, int start, int end, 1039 bool delete_napi) 1040 { 1041 struct veth_priv *priv = netdev_priv(dev); 1042 int i; 1043 1044 for (i = start; i < end; i++) { 1045 struct veth_rq *rq = &priv->rq[i]; 1046 1047 rq->xdp_rxq.mem = rq->xdp_mem; 1048 xdp_rxq_info_unreg(&rq->xdp_rxq); 1049 1050 if (delete_napi) 1051 netif_napi_del(&rq->xdp_napi); 1052 } 1053 } 1054 1055 static int veth_enable_xdp(struct net_device *dev) 1056 { 1057 bool napi_already_on = veth_gro_requested(dev) && (dev->flags & IFF_UP); 1058 struct veth_priv *priv = netdev_priv(dev); 1059 int err, i; 1060 1061 if (!xdp_rxq_info_is_reg(&priv->rq[0].xdp_rxq)) { 1062 err = veth_enable_xdp_range(dev, 0, dev->real_num_rx_queues, napi_already_on); 1063 if (err) 1064 return err; 1065 1066 if (!napi_already_on) { 1067 err = __veth_napi_enable(dev); 1068 if (err) { 1069 veth_disable_xdp_range(dev, 0, dev->real_num_rx_queues, true); 1070 return err; 1071 } 1072 1073 if (!veth_gro_requested(dev)) { 1074 /* user-space did not require GRO, but adding XDP 1075 * is supposed to get GRO working 1076 */ 1077 dev->features |= NETIF_F_GRO; 1078 netdev_features_change(dev); 1079 } 1080 } 1081 } 1082 1083 for (i = 0; i < dev->real_num_rx_queues; i++) { 1084 rcu_assign_pointer(priv->rq[i].xdp_prog, priv->_xdp_prog); 1085 rcu_assign_pointer(priv->rq[i].napi, &priv->rq[i].xdp_napi); 1086 } 1087 1088 return 0; 1089 } 1090 1091 static void veth_disable_xdp(struct net_device *dev) 1092 { 1093 struct veth_priv *priv = netdev_priv(dev); 1094 int i; 1095 1096 for (i = 0; i < dev->real_num_rx_queues; i++) 1097 rcu_assign_pointer(priv->rq[i].xdp_prog, NULL); 1098 1099 if (!netif_running(dev) || !veth_gro_requested(dev)) { 1100 veth_napi_del(dev); 1101 1102 /* if user-space did not require GRO, since adding XDP 1103 * enabled it, clear it now 1104 */ 1105 if (!veth_gro_requested(dev) && netif_running(dev)) { 1106 dev->features &= ~NETIF_F_GRO; 1107 netdev_features_change(dev); 1108 } 1109 } 1110 1111 veth_disable_xdp_range(dev, 0, dev->real_num_rx_queues, false); 1112 } 1113 1114 static int veth_napi_enable_range(struct net_device *dev, int start, int end) 1115 { 1116 struct veth_priv *priv = netdev_priv(dev); 1117 int err, i; 1118 1119 for (i = start; i < end; i++) { 1120 struct veth_rq *rq = &priv->rq[i]; 1121 1122 netif_napi_add(dev, &rq->xdp_napi, veth_poll, NAPI_POLL_WEIGHT); 1123 } 1124 1125 err = __veth_napi_enable_range(dev, start, end); 1126 if (err) { 1127 for (i = start; i < end; i++) { 1128 struct veth_rq *rq = &priv->rq[i]; 1129 1130 netif_napi_del(&rq->xdp_napi); 1131 } 1132 return err; 1133 } 1134 return err; 1135 } 1136 1137 static int veth_napi_enable(struct net_device *dev) 1138 { 1139 return veth_napi_enable_range(dev, 0, dev->real_num_rx_queues); 1140 } 1141 1142 static void veth_disable_range_safe(struct net_device *dev, int start, int end) 1143 { 1144 struct veth_priv *priv = netdev_priv(dev); 1145 1146 if (start >= end) 1147 return; 1148 1149 if (priv->_xdp_prog) { 1150 veth_napi_del_range(dev, start, end); 1151 veth_disable_xdp_range(dev, start, end, false); 1152 } else if (veth_gro_requested(dev)) { 1153 veth_napi_del_range(dev, start, end); 1154 } 1155 } 1156 1157 static int veth_enable_range_safe(struct net_device *dev, int start, int end) 1158 { 1159 struct veth_priv *priv = netdev_priv(dev); 1160 int err; 1161 1162 if (start >= end) 1163 return 0; 1164 1165 if (priv->_xdp_prog) { 1166 /* these channels are freshly initialized, napi is not on there even 1167 * when GRO is requeste 1168 */ 1169 err = veth_enable_xdp_range(dev, start, end, false); 1170 if (err) 1171 return err; 1172 1173 err = __veth_napi_enable_range(dev, start, end); 1174 if (err) { 1175 /* on error always delete the newly added napis */ 1176 veth_disable_xdp_range(dev, start, end, true); 1177 return err; 1178 } 1179 } else if (veth_gro_requested(dev)) { 1180 return veth_napi_enable_range(dev, start, end); 1181 } 1182 return 0; 1183 } 1184 1185 static int veth_set_channels(struct net_device *dev, 1186 struct ethtool_channels *ch) 1187 { 1188 struct veth_priv *priv = netdev_priv(dev); 1189 unsigned int old_rx_count, new_rx_count; 1190 struct veth_priv *peer_priv; 1191 struct net_device *peer; 1192 int err; 1193 1194 /* sanity check. Upper bounds are already enforced by the caller */ 1195 if (!ch->rx_count || !ch->tx_count) 1196 return -EINVAL; 1197 1198 /* avoid braking XDP, if that is enabled */ 1199 peer = rtnl_dereference(priv->peer); 1200 peer_priv = peer ? netdev_priv(peer) : NULL; 1201 if (priv->_xdp_prog && peer && ch->rx_count < peer->real_num_tx_queues) 1202 return -EINVAL; 1203 1204 if (peer && peer_priv && peer_priv->_xdp_prog && ch->tx_count > peer->real_num_rx_queues) 1205 return -EINVAL; 1206 1207 old_rx_count = dev->real_num_rx_queues; 1208 new_rx_count = ch->rx_count; 1209 if (netif_running(dev)) { 1210 /* turn device off */ 1211 netif_carrier_off(dev); 1212 if (peer) 1213 netif_carrier_off(peer); 1214 1215 /* try to allocate new resurces, as needed*/ 1216 err = veth_enable_range_safe(dev, old_rx_count, new_rx_count); 1217 if (err) 1218 goto out; 1219 } 1220 1221 err = netif_set_real_num_rx_queues(dev, ch->rx_count); 1222 if (err) 1223 goto revert; 1224 1225 err = netif_set_real_num_tx_queues(dev, ch->tx_count); 1226 if (err) { 1227 int err2 = netif_set_real_num_rx_queues(dev, old_rx_count); 1228 1229 /* this error condition could happen only if rx and tx change 1230 * in opposite directions (e.g. tx nr raises, rx nr decreases) 1231 * and we can't do anything to fully restore the original 1232 * status 1233 */ 1234 if (err2) 1235 pr_warn("Can't restore rx queues config %d -> %d %d", 1236 new_rx_count, old_rx_count, err2); 1237 else 1238 goto revert; 1239 } 1240 1241 out: 1242 if (netif_running(dev)) { 1243 /* note that we need to swap the arguments WRT the enable part 1244 * to identify the range we have to disable 1245 */ 1246 veth_disable_range_safe(dev, new_rx_count, old_rx_count); 1247 netif_carrier_on(dev); 1248 if (peer) 1249 netif_carrier_on(peer); 1250 } 1251 return err; 1252 1253 revert: 1254 new_rx_count = old_rx_count; 1255 old_rx_count = ch->rx_count; 1256 goto out; 1257 } 1258 1259 static int veth_open(struct net_device *dev) 1260 { 1261 struct veth_priv *priv = netdev_priv(dev); 1262 struct net_device *peer = rtnl_dereference(priv->peer); 1263 int err; 1264 1265 if (!peer) 1266 return -ENOTCONN; 1267 1268 if (priv->_xdp_prog) { 1269 err = veth_enable_xdp(dev); 1270 if (err) 1271 return err; 1272 } else if (veth_gro_requested(dev)) { 1273 err = veth_napi_enable(dev); 1274 if (err) 1275 return err; 1276 } 1277 1278 if (peer->flags & IFF_UP) { 1279 netif_carrier_on(dev); 1280 netif_carrier_on(peer); 1281 } 1282 1283 return 0; 1284 } 1285 1286 static int veth_close(struct net_device *dev) 1287 { 1288 struct veth_priv *priv = netdev_priv(dev); 1289 struct net_device *peer = rtnl_dereference(priv->peer); 1290 1291 netif_carrier_off(dev); 1292 if (peer) 1293 netif_carrier_off(peer); 1294 1295 if (priv->_xdp_prog) 1296 veth_disable_xdp(dev); 1297 else if (veth_gro_requested(dev)) 1298 veth_napi_del(dev); 1299 1300 return 0; 1301 } 1302 1303 static int is_valid_veth_mtu(int mtu) 1304 { 1305 return mtu >= ETH_MIN_MTU && mtu <= ETH_MAX_MTU; 1306 } 1307 1308 static int veth_alloc_queues(struct net_device *dev) 1309 { 1310 struct veth_priv *priv = netdev_priv(dev); 1311 int i; 1312 1313 priv->rq = kcalloc(dev->num_rx_queues, sizeof(*priv->rq), GFP_KERNEL); 1314 if (!priv->rq) 1315 return -ENOMEM; 1316 1317 for (i = 0; i < dev->num_rx_queues; i++) { 1318 priv->rq[i].dev = dev; 1319 u64_stats_init(&priv->rq[i].stats.syncp); 1320 } 1321 1322 return 0; 1323 } 1324 1325 static void veth_free_queues(struct net_device *dev) 1326 { 1327 struct veth_priv *priv = netdev_priv(dev); 1328 1329 kfree(priv->rq); 1330 } 1331 1332 static int veth_dev_init(struct net_device *dev) 1333 { 1334 int err; 1335 1336 dev->lstats = netdev_alloc_pcpu_stats(struct pcpu_lstats); 1337 if (!dev->lstats) 1338 return -ENOMEM; 1339 1340 err = veth_alloc_queues(dev); 1341 if (err) { 1342 free_percpu(dev->lstats); 1343 return err; 1344 } 1345 1346 return 0; 1347 } 1348 1349 static void veth_dev_free(struct net_device *dev) 1350 { 1351 veth_free_queues(dev); 1352 free_percpu(dev->lstats); 1353 } 1354 1355 #ifdef CONFIG_NET_POLL_CONTROLLER 1356 static void veth_poll_controller(struct net_device *dev) 1357 { 1358 /* veth only receives frames when its peer sends one 1359 * Since it has nothing to do with disabling irqs, we are guaranteed 1360 * never to have pending data when we poll for it so 1361 * there is nothing to do here. 1362 * 1363 * We need this though so netpoll recognizes us as an interface that 1364 * supports polling, which enables bridge devices in virt setups to 1365 * still use netconsole 1366 */ 1367 } 1368 #endif /* CONFIG_NET_POLL_CONTROLLER */ 1369 1370 static int veth_get_iflink(const struct net_device *dev) 1371 { 1372 struct veth_priv *priv = netdev_priv(dev); 1373 struct net_device *peer; 1374 int iflink; 1375 1376 rcu_read_lock(); 1377 peer = rcu_dereference(priv->peer); 1378 iflink = peer ? peer->ifindex : 0; 1379 rcu_read_unlock(); 1380 1381 return iflink; 1382 } 1383 1384 static netdev_features_t veth_fix_features(struct net_device *dev, 1385 netdev_features_t features) 1386 { 1387 struct veth_priv *priv = netdev_priv(dev); 1388 struct net_device *peer; 1389 1390 peer = rtnl_dereference(priv->peer); 1391 if (peer) { 1392 struct veth_priv *peer_priv = netdev_priv(peer); 1393 1394 if (peer_priv->_xdp_prog) 1395 features &= ~NETIF_F_GSO_SOFTWARE; 1396 } 1397 if (priv->_xdp_prog) 1398 features |= NETIF_F_GRO; 1399 1400 return features; 1401 } 1402 1403 static int veth_set_features(struct net_device *dev, 1404 netdev_features_t features) 1405 { 1406 netdev_features_t changed = features ^ dev->features; 1407 struct veth_priv *priv = netdev_priv(dev); 1408 int err; 1409 1410 if (!(changed & NETIF_F_GRO) || !(dev->flags & IFF_UP) || priv->_xdp_prog) 1411 return 0; 1412 1413 if (features & NETIF_F_GRO) { 1414 err = veth_napi_enable(dev); 1415 if (err) 1416 return err; 1417 } else { 1418 veth_napi_del(dev); 1419 } 1420 return 0; 1421 } 1422 1423 static void veth_set_rx_headroom(struct net_device *dev, int new_hr) 1424 { 1425 struct veth_priv *peer_priv, *priv = netdev_priv(dev); 1426 struct net_device *peer; 1427 1428 if (new_hr < 0) 1429 new_hr = 0; 1430 1431 rcu_read_lock(); 1432 peer = rcu_dereference(priv->peer); 1433 if (unlikely(!peer)) 1434 goto out; 1435 1436 peer_priv = netdev_priv(peer); 1437 priv->requested_headroom = new_hr; 1438 new_hr = max(priv->requested_headroom, peer_priv->requested_headroom); 1439 dev->needed_headroom = new_hr; 1440 peer->needed_headroom = new_hr; 1441 1442 out: 1443 rcu_read_unlock(); 1444 } 1445 1446 static int veth_xdp_set(struct net_device *dev, struct bpf_prog *prog, 1447 struct netlink_ext_ack *extack) 1448 { 1449 struct veth_priv *priv = netdev_priv(dev); 1450 struct bpf_prog *old_prog; 1451 struct net_device *peer; 1452 unsigned int max_mtu; 1453 int err; 1454 1455 old_prog = priv->_xdp_prog; 1456 priv->_xdp_prog = prog; 1457 peer = rtnl_dereference(priv->peer); 1458 1459 if (prog) { 1460 if (!peer) { 1461 NL_SET_ERR_MSG_MOD(extack, "Cannot set XDP when peer is detached"); 1462 err = -ENOTCONN; 1463 goto err; 1464 } 1465 1466 max_mtu = PAGE_SIZE - VETH_XDP_HEADROOM - 1467 peer->hard_header_len - 1468 SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); 1469 if (peer->mtu > max_mtu) { 1470 NL_SET_ERR_MSG_MOD(extack, "Peer MTU is too large to set XDP"); 1471 err = -ERANGE; 1472 goto err; 1473 } 1474 1475 if (dev->real_num_rx_queues < peer->real_num_tx_queues) { 1476 NL_SET_ERR_MSG_MOD(extack, "XDP expects number of rx queues not less than peer tx queues"); 1477 err = -ENOSPC; 1478 goto err; 1479 } 1480 1481 if (dev->flags & IFF_UP) { 1482 err = veth_enable_xdp(dev); 1483 if (err) { 1484 NL_SET_ERR_MSG_MOD(extack, "Setup for XDP failed"); 1485 goto err; 1486 } 1487 } 1488 1489 if (!old_prog) { 1490 peer->hw_features &= ~NETIF_F_GSO_SOFTWARE; 1491 peer->max_mtu = max_mtu; 1492 } 1493 } 1494 1495 if (old_prog) { 1496 if (!prog) { 1497 if (dev->flags & IFF_UP) 1498 veth_disable_xdp(dev); 1499 1500 if (peer) { 1501 peer->hw_features |= NETIF_F_GSO_SOFTWARE; 1502 peer->max_mtu = ETH_MAX_MTU; 1503 } 1504 } 1505 bpf_prog_put(old_prog); 1506 } 1507 1508 if ((!!old_prog ^ !!prog) && peer) 1509 netdev_update_features(peer); 1510 1511 return 0; 1512 err: 1513 priv->_xdp_prog = old_prog; 1514 1515 return err; 1516 } 1517 1518 static int veth_xdp(struct net_device *dev, struct netdev_bpf *xdp) 1519 { 1520 switch (xdp->command) { 1521 case XDP_SETUP_PROG: 1522 return veth_xdp_set(dev, xdp->prog, xdp->extack); 1523 default: 1524 return -EINVAL; 1525 } 1526 } 1527 1528 static const struct net_device_ops veth_netdev_ops = { 1529 .ndo_init = veth_dev_init, 1530 .ndo_open = veth_open, 1531 .ndo_stop = veth_close, 1532 .ndo_start_xmit = veth_xmit, 1533 .ndo_get_stats64 = veth_get_stats64, 1534 .ndo_set_rx_mode = veth_set_multicast_list, 1535 .ndo_set_mac_address = eth_mac_addr, 1536 #ifdef CONFIG_NET_POLL_CONTROLLER 1537 .ndo_poll_controller = veth_poll_controller, 1538 #endif 1539 .ndo_get_iflink = veth_get_iflink, 1540 .ndo_fix_features = veth_fix_features, 1541 .ndo_set_features = veth_set_features, 1542 .ndo_features_check = passthru_features_check, 1543 .ndo_set_rx_headroom = veth_set_rx_headroom, 1544 .ndo_bpf = veth_xdp, 1545 .ndo_xdp_xmit = veth_ndo_xdp_xmit, 1546 .ndo_get_peer_dev = veth_peer_dev, 1547 }; 1548 1549 #define VETH_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HW_CSUM | \ 1550 NETIF_F_RXCSUM | NETIF_F_SCTP_CRC | NETIF_F_HIGHDMA | \ 1551 NETIF_F_GSO_SOFTWARE | NETIF_F_GSO_ENCAP_ALL | \ 1552 NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX | \ 1553 NETIF_F_HW_VLAN_STAG_TX | NETIF_F_HW_VLAN_STAG_RX ) 1554 1555 static void veth_setup(struct net_device *dev) 1556 { 1557 ether_setup(dev); 1558 1559 dev->priv_flags &= ~IFF_TX_SKB_SHARING; 1560 dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; 1561 dev->priv_flags |= IFF_NO_QUEUE; 1562 dev->priv_flags |= IFF_PHONY_HEADROOM; 1563 1564 dev->netdev_ops = &veth_netdev_ops; 1565 dev->ethtool_ops = &veth_ethtool_ops; 1566 dev->features |= NETIF_F_LLTX; 1567 dev->features |= VETH_FEATURES; 1568 dev->vlan_features = dev->features & 1569 ~(NETIF_F_HW_VLAN_CTAG_TX | 1570 NETIF_F_HW_VLAN_STAG_TX | 1571 NETIF_F_HW_VLAN_CTAG_RX | 1572 NETIF_F_HW_VLAN_STAG_RX); 1573 dev->needs_free_netdev = true; 1574 dev->priv_destructor = veth_dev_free; 1575 dev->max_mtu = ETH_MAX_MTU; 1576 1577 dev->hw_features = VETH_FEATURES; 1578 dev->hw_enc_features = VETH_FEATURES; 1579 dev->mpls_features = NETIF_F_HW_CSUM | NETIF_F_GSO_SOFTWARE; 1580 } 1581 1582 /* 1583 * netlink interface 1584 */ 1585 1586 static int veth_validate(struct nlattr *tb[], struct nlattr *data[], 1587 struct netlink_ext_ack *extack) 1588 { 1589 if (tb[IFLA_ADDRESS]) { 1590 if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) 1591 return -EINVAL; 1592 if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) 1593 return -EADDRNOTAVAIL; 1594 } 1595 if (tb[IFLA_MTU]) { 1596 if (!is_valid_veth_mtu(nla_get_u32(tb[IFLA_MTU]))) 1597 return -EINVAL; 1598 } 1599 return 0; 1600 } 1601 1602 static struct rtnl_link_ops veth_link_ops; 1603 1604 static void veth_disable_gro(struct net_device *dev) 1605 { 1606 dev->features &= ~NETIF_F_GRO; 1607 dev->wanted_features &= ~NETIF_F_GRO; 1608 netdev_update_features(dev); 1609 } 1610 1611 static int veth_init_queues(struct net_device *dev, struct nlattr *tb[]) 1612 { 1613 int err; 1614 1615 if (!tb[IFLA_NUM_TX_QUEUES] && dev->num_tx_queues > 1) { 1616 err = netif_set_real_num_tx_queues(dev, 1); 1617 if (err) 1618 return err; 1619 } 1620 if (!tb[IFLA_NUM_RX_QUEUES] && dev->num_rx_queues > 1) { 1621 err = netif_set_real_num_rx_queues(dev, 1); 1622 if (err) 1623 return err; 1624 } 1625 return 0; 1626 } 1627 1628 static int veth_newlink(struct net *src_net, struct net_device *dev, 1629 struct nlattr *tb[], struct nlattr *data[], 1630 struct netlink_ext_ack *extack) 1631 { 1632 int err; 1633 struct net_device *peer; 1634 struct veth_priv *priv; 1635 char ifname[IFNAMSIZ]; 1636 struct nlattr *peer_tb[IFLA_MAX + 1], **tbp; 1637 unsigned char name_assign_type; 1638 struct ifinfomsg *ifmp; 1639 struct net *net; 1640 1641 /* 1642 * create and register peer first 1643 */ 1644 if (data != NULL && data[VETH_INFO_PEER] != NULL) { 1645 struct nlattr *nla_peer; 1646 1647 nla_peer = data[VETH_INFO_PEER]; 1648 ifmp = nla_data(nla_peer); 1649 err = rtnl_nla_parse_ifla(peer_tb, 1650 nla_data(nla_peer) + sizeof(struct ifinfomsg), 1651 nla_len(nla_peer) - sizeof(struct ifinfomsg), 1652 NULL); 1653 if (err < 0) 1654 return err; 1655 1656 err = veth_validate(peer_tb, NULL, extack); 1657 if (err < 0) 1658 return err; 1659 1660 tbp = peer_tb; 1661 } else { 1662 ifmp = NULL; 1663 tbp = tb; 1664 } 1665 1666 if (ifmp && tbp[IFLA_IFNAME]) { 1667 nla_strscpy(ifname, tbp[IFLA_IFNAME], IFNAMSIZ); 1668 name_assign_type = NET_NAME_USER; 1669 } else { 1670 snprintf(ifname, IFNAMSIZ, DRV_NAME "%%d"); 1671 name_assign_type = NET_NAME_ENUM; 1672 } 1673 1674 net = rtnl_link_get_net(src_net, tbp); 1675 if (IS_ERR(net)) 1676 return PTR_ERR(net); 1677 1678 peer = rtnl_create_link(net, ifname, name_assign_type, 1679 &veth_link_ops, tbp, extack); 1680 if (IS_ERR(peer)) { 1681 put_net(net); 1682 return PTR_ERR(peer); 1683 } 1684 1685 if (!ifmp || !tbp[IFLA_ADDRESS]) 1686 eth_hw_addr_random(peer); 1687 1688 if (ifmp && (dev->ifindex != 0)) 1689 peer->ifindex = ifmp->ifi_index; 1690 1691 netif_set_gso_max_size(peer, dev->gso_max_size); 1692 netif_set_gso_max_segs(peer, dev->gso_max_segs); 1693 1694 err = register_netdevice(peer); 1695 put_net(net); 1696 net = NULL; 1697 if (err < 0) 1698 goto err_register_peer; 1699 1700 /* keep GRO disabled by default to be consistent with the established 1701 * veth behavior 1702 */ 1703 veth_disable_gro(peer); 1704 netif_carrier_off(peer); 1705 1706 err = rtnl_configure_link(peer, ifmp); 1707 if (err < 0) 1708 goto err_configure_peer; 1709 1710 /* 1711 * register dev last 1712 * 1713 * note, that since we've registered new device the dev's name 1714 * should be re-allocated 1715 */ 1716 1717 if (tb[IFLA_ADDRESS] == NULL) 1718 eth_hw_addr_random(dev); 1719 1720 if (tb[IFLA_IFNAME]) 1721 nla_strscpy(dev->name, tb[IFLA_IFNAME], IFNAMSIZ); 1722 else 1723 snprintf(dev->name, IFNAMSIZ, DRV_NAME "%%d"); 1724 1725 err = register_netdevice(dev); 1726 if (err < 0) 1727 goto err_register_dev; 1728 1729 netif_carrier_off(dev); 1730 1731 /* 1732 * tie the deviced together 1733 */ 1734 1735 priv = netdev_priv(dev); 1736 rcu_assign_pointer(priv->peer, peer); 1737 err = veth_init_queues(dev, tb); 1738 if (err) 1739 goto err_queues; 1740 1741 priv = netdev_priv(peer); 1742 rcu_assign_pointer(priv->peer, dev); 1743 err = veth_init_queues(peer, tb); 1744 if (err) 1745 goto err_queues; 1746 1747 veth_disable_gro(dev); 1748 return 0; 1749 1750 err_queues: 1751 unregister_netdevice(dev); 1752 err_register_dev: 1753 /* nothing to do */ 1754 err_configure_peer: 1755 unregister_netdevice(peer); 1756 return err; 1757 1758 err_register_peer: 1759 free_netdev(peer); 1760 return err; 1761 } 1762 1763 static void veth_dellink(struct net_device *dev, struct list_head *head) 1764 { 1765 struct veth_priv *priv; 1766 struct net_device *peer; 1767 1768 priv = netdev_priv(dev); 1769 peer = rtnl_dereference(priv->peer); 1770 1771 /* Note : dellink() is called from default_device_exit_batch(), 1772 * before a rcu_synchronize() point. The devices are guaranteed 1773 * not being freed before one RCU grace period. 1774 */ 1775 RCU_INIT_POINTER(priv->peer, NULL); 1776 unregister_netdevice_queue(dev, head); 1777 1778 if (peer) { 1779 priv = netdev_priv(peer); 1780 RCU_INIT_POINTER(priv->peer, NULL); 1781 unregister_netdevice_queue(peer, head); 1782 } 1783 } 1784 1785 static const struct nla_policy veth_policy[VETH_INFO_MAX + 1] = { 1786 [VETH_INFO_PEER] = { .len = sizeof(struct ifinfomsg) }, 1787 }; 1788 1789 static struct net *veth_get_link_net(const struct net_device *dev) 1790 { 1791 struct veth_priv *priv = netdev_priv(dev); 1792 struct net_device *peer = rtnl_dereference(priv->peer); 1793 1794 return peer ? dev_net(peer) : dev_net(dev); 1795 } 1796 1797 static unsigned int veth_get_num_queues(void) 1798 { 1799 /* enforce the same queue limit as rtnl_create_link */ 1800 int queues = num_possible_cpus(); 1801 1802 if (queues > 4096) 1803 queues = 4096; 1804 return queues; 1805 } 1806 1807 static struct rtnl_link_ops veth_link_ops = { 1808 .kind = DRV_NAME, 1809 .priv_size = sizeof(struct veth_priv), 1810 .setup = veth_setup, 1811 .validate = veth_validate, 1812 .newlink = veth_newlink, 1813 .dellink = veth_dellink, 1814 .policy = veth_policy, 1815 .maxtype = VETH_INFO_MAX, 1816 .get_link_net = veth_get_link_net, 1817 .get_num_tx_queues = veth_get_num_queues, 1818 .get_num_rx_queues = veth_get_num_queues, 1819 }; 1820 1821 /* 1822 * init/fini 1823 */ 1824 1825 static __init int veth_init(void) 1826 { 1827 return rtnl_link_register(&veth_link_ops); 1828 } 1829 1830 static __exit void veth_exit(void) 1831 { 1832 rtnl_link_unregister(&veth_link_ops); 1833 } 1834 1835 module_init(veth_init); 1836 module_exit(veth_exit); 1837 1838 MODULE_DESCRIPTION("Virtual Ethernet Tunnel"); 1839 MODULE_LICENSE("GPL v2"); 1840 MODULE_ALIAS_RTNL_LINK(DRV_NAME); 1841