1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * drivers/net/veth.c 4 * 5 * Copyright (C) 2007 OpenVZ http://openvz.org, SWsoft Inc 6 * 7 * Author: Pavel Emelianov <xemul@openvz.org> 8 * Ethtool interface from: Eric W. Biederman <ebiederm@xmission.com> 9 * 10 */ 11 12 #include <linux/netdevice.h> 13 #include <linux/slab.h> 14 #include <linux/ethtool.h> 15 #include <linux/etherdevice.h> 16 #include <linux/u64_stats_sync.h> 17 18 #include <net/rtnetlink.h> 19 #include <net/dst.h> 20 #include <net/xfrm.h> 21 #include <net/xdp.h> 22 #include <linux/veth.h> 23 #include <linux/module.h> 24 #include <linux/bpf.h> 25 #include <linux/filter.h> 26 #include <linux/ptr_ring.h> 27 #include <linux/bpf_trace.h> 28 #include <linux/net_tstamp.h> 29 30 #define DRV_NAME "veth" 31 #define DRV_VERSION "1.0" 32 33 #define VETH_XDP_FLAG BIT(0) 34 #define VETH_RING_SIZE 256 35 #define VETH_XDP_HEADROOM (XDP_PACKET_HEADROOM + NET_IP_ALIGN) 36 37 #define VETH_XDP_TX_BULK_SIZE 16 38 #define VETH_XDP_BATCH 16 39 40 struct veth_stats { 41 u64 rx_drops; 42 /* xdp */ 43 u64 xdp_packets; 44 u64 xdp_bytes; 45 u64 xdp_redirect; 46 u64 xdp_drops; 47 u64 xdp_tx; 48 u64 xdp_tx_err; 49 u64 peer_tq_xdp_xmit; 50 u64 peer_tq_xdp_xmit_err; 51 }; 52 53 struct veth_rq_stats { 54 struct veth_stats vs; 55 struct u64_stats_sync syncp; 56 }; 57 58 struct veth_rq { 59 struct napi_struct xdp_napi; 60 struct napi_struct __rcu *napi; /* points to xdp_napi when the latter is initialized */ 61 struct net_device *dev; 62 struct bpf_prog __rcu *xdp_prog; 63 struct xdp_mem_info xdp_mem; 64 struct veth_rq_stats stats; 65 bool rx_notify_masked; 66 struct ptr_ring xdp_ring; 67 struct xdp_rxq_info xdp_rxq; 68 }; 69 70 struct veth_priv { 71 struct net_device __rcu *peer; 72 atomic64_t dropped; 73 struct bpf_prog *_xdp_prog; 74 struct veth_rq *rq; 75 unsigned int requested_headroom; 76 }; 77 78 struct veth_xdp_tx_bq { 79 struct xdp_frame *q[VETH_XDP_TX_BULK_SIZE]; 80 unsigned int count; 81 }; 82 83 /* 84 * ethtool interface 85 */ 86 87 struct veth_q_stat_desc { 88 char desc[ETH_GSTRING_LEN]; 89 size_t offset; 90 }; 91 92 #define VETH_RQ_STAT(m) offsetof(struct veth_stats, m) 93 94 static const struct veth_q_stat_desc veth_rq_stats_desc[] = { 95 { "xdp_packets", VETH_RQ_STAT(xdp_packets) }, 96 { "xdp_bytes", VETH_RQ_STAT(xdp_bytes) }, 97 { "drops", VETH_RQ_STAT(rx_drops) }, 98 { "xdp_redirect", VETH_RQ_STAT(xdp_redirect) }, 99 { "xdp_drops", VETH_RQ_STAT(xdp_drops) }, 100 { "xdp_tx", VETH_RQ_STAT(xdp_tx) }, 101 { "xdp_tx_errors", VETH_RQ_STAT(xdp_tx_err) }, 102 }; 103 104 #define VETH_RQ_STATS_LEN ARRAY_SIZE(veth_rq_stats_desc) 105 106 static const struct veth_q_stat_desc veth_tq_stats_desc[] = { 107 { "xdp_xmit", VETH_RQ_STAT(peer_tq_xdp_xmit) }, 108 { "xdp_xmit_errors", VETH_RQ_STAT(peer_tq_xdp_xmit_err) }, 109 }; 110 111 #define VETH_TQ_STATS_LEN ARRAY_SIZE(veth_tq_stats_desc) 112 113 static struct { 114 const char string[ETH_GSTRING_LEN]; 115 } ethtool_stats_keys[] = { 116 { "peer_ifindex" }, 117 }; 118 119 static int veth_get_link_ksettings(struct net_device *dev, 120 struct ethtool_link_ksettings *cmd) 121 { 122 cmd->base.speed = SPEED_10000; 123 cmd->base.duplex = DUPLEX_FULL; 124 cmd->base.port = PORT_TP; 125 cmd->base.autoneg = AUTONEG_DISABLE; 126 return 0; 127 } 128 129 static void veth_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info) 130 { 131 strlcpy(info->driver, DRV_NAME, sizeof(info->driver)); 132 strlcpy(info->version, DRV_VERSION, sizeof(info->version)); 133 } 134 135 static void veth_get_strings(struct net_device *dev, u32 stringset, u8 *buf) 136 { 137 char *p = (char *)buf; 138 int i, j; 139 140 switch(stringset) { 141 case ETH_SS_STATS: 142 memcpy(p, ðtool_stats_keys, sizeof(ethtool_stats_keys)); 143 p += sizeof(ethtool_stats_keys); 144 for (i = 0; i < dev->real_num_rx_queues; i++) { 145 for (j = 0; j < VETH_RQ_STATS_LEN; j++) { 146 snprintf(p, ETH_GSTRING_LEN, 147 "rx_queue_%u_%.18s", 148 i, veth_rq_stats_desc[j].desc); 149 p += ETH_GSTRING_LEN; 150 } 151 } 152 for (i = 0; i < dev->real_num_tx_queues; i++) { 153 for (j = 0; j < VETH_TQ_STATS_LEN; j++) { 154 snprintf(p, ETH_GSTRING_LEN, 155 "tx_queue_%u_%.18s", 156 i, veth_tq_stats_desc[j].desc); 157 p += ETH_GSTRING_LEN; 158 } 159 } 160 break; 161 } 162 } 163 164 static int veth_get_sset_count(struct net_device *dev, int sset) 165 { 166 switch (sset) { 167 case ETH_SS_STATS: 168 return ARRAY_SIZE(ethtool_stats_keys) + 169 VETH_RQ_STATS_LEN * dev->real_num_rx_queues + 170 VETH_TQ_STATS_LEN * dev->real_num_tx_queues; 171 default: 172 return -EOPNOTSUPP; 173 } 174 } 175 176 static void veth_get_ethtool_stats(struct net_device *dev, 177 struct ethtool_stats *stats, u64 *data) 178 { 179 struct veth_priv *rcv_priv, *priv = netdev_priv(dev); 180 struct net_device *peer = rtnl_dereference(priv->peer); 181 int i, j, idx; 182 183 data[0] = peer ? peer->ifindex : 0; 184 idx = 1; 185 for (i = 0; i < dev->real_num_rx_queues; i++) { 186 const struct veth_rq_stats *rq_stats = &priv->rq[i].stats; 187 const void *stats_base = (void *)&rq_stats->vs; 188 unsigned int start; 189 size_t offset; 190 191 do { 192 start = u64_stats_fetch_begin_irq(&rq_stats->syncp); 193 for (j = 0; j < VETH_RQ_STATS_LEN; j++) { 194 offset = veth_rq_stats_desc[j].offset; 195 data[idx + j] = *(u64 *)(stats_base + offset); 196 } 197 } while (u64_stats_fetch_retry_irq(&rq_stats->syncp, start)); 198 idx += VETH_RQ_STATS_LEN; 199 } 200 201 if (!peer) 202 return; 203 204 rcv_priv = netdev_priv(peer); 205 for (i = 0; i < peer->real_num_rx_queues; i++) { 206 const struct veth_rq_stats *rq_stats = &rcv_priv->rq[i].stats; 207 const void *base = (void *)&rq_stats->vs; 208 unsigned int start, tx_idx = idx; 209 size_t offset; 210 211 tx_idx += (i % dev->real_num_tx_queues) * VETH_TQ_STATS_LEN; 212 do { 213 start = u64_stats_fetch_begin_irq(&rq_stats->syncp); 214 for (j = 0; j < VETH_TQ_STATS_LEN; j++) { 215 offset = veth_tq_stats_desc[j].offset; 216 data[tx_idx + j] += *(u64 *)(base + offset); 217 } 218 } while (u64_stats_fetch_retry_irq(&rq_stats->syncp, start)); 219 } 220 } 221 222 static void veth_get_channels(struct net_device *dev, 223 struct ethtool_channels *channels) 224 { 225 channels->tx_count = dev->real_num_tx_queues; 226 channels->rx_count = dev->real_num_rx_queues; 227 channels->max_tx = dev->num_tx_queues; 228 channels->max_rx = dev->num_rx_queues; 229 } 230 231 static int veth_set_channels(struct net_device *dev, 232 struct ethtool_channels *ch); 233 234 static const struct ethtool_ops veth_ethtool_ops = { 235 .get_drvinfo = veth_get_drvinfo, 236 .get_link = ethtool_op_get_link, 237 .get_strings = veth_get_strings, 238 .get_sset_count = veth_get_sset_count, 239 .get_ethtool_stats = veth_get_ethtool_stats, 240 .get_link_ksettings = veth_get_link_ksettings, 241 .get_ts_info = ethtool_op_get_ts_info, 242 .get_channels = veth_get_channels, 243 .set_channels = veth_set_channels, 244 }; 245 246 /* general routines */ 247 248 static bool veth_is_xdp_frame(void *ptr) 249 { 250 return (unsigned long)ptr & VETH_XDP_FLAG; 251 } 252 253 static struct xdp_frame *veth_ptr_to_xdp(void *ptr) 254 { 255 return (void *)((unsigned long)ptr & ~VETH_XDP_FLAG); 256 } 257 258 static void *veth_xdp_to_ptr(struct xdp_frame *xdp) 259 { 260 return (void *)((unsigned long)xdp | VETH_XDP_FLAG); 261 } 262 263 static void veth_ptr_free(void *ptr) 264 { 265 if (veth_is_xdp_frame(ptr)) 266 xdp_return_frame(veth_ptr_to_xdp(ptr)); 267 else 268 kfree_skb(ptr); 269 } 270 271 static void __veth_xdp_flush(struct veth_rq *rq) 272 { 273 /* Write ptr_ring before reading rx_notify_masked */ 274 smp_mb(); 275 if (!rq->rx_notify_masked) { 276 rq->rx_notify_masked = true; 277 napi_schedule(&rq->xdp_napi); 278 } 279 } 280 281 static int veth_xdp_rx(struct veth_rq *rq, struct sk_buff *skb) 282 { 283 if (unlikely(ptr_ring_produce(&rq->xdp_ring, skb))) { 284 dev_kfree_skb_any(skb); 285 return NET_RX_DROP; 286 } 287 288 return NET_RX_SUCCESS; 289 } 290 291 static int veth_forward_skb(struct net_device *dev, struct sk_buff *skb, 292 struct veth_rq *rq, bool xdp) 293 { 294 return __dev_forward_skb(dev, skb) ?: xdp ? 295 veth_xdp_rx(rq, skb) : 296 netif_rx(skb); 297 } 298 299 /* return true if the specified skb has chances of GRO aggregation 300 * Don't strive for accuracy, but try to avoid GRO overhead in the most 301 * common scenarios. 302 * When XDP is enabled, all traffic is considered eligible, as the xmit 303 * device has TSO off. 304 * When TSO is enabled on the xmit device, we are likely interested only 305 * in UDP aggregation, explicitly check for that if the skb is suspected 306 * - the sock_wfree destructor is used by UDP, ICMP and XDP sockets - 307 * to belong to locally generated UDP traffic. 308 */ 309 static bool veth_skb_is_eligible_for_gro(const struct net_device *dev, 310 const struct net_device *rcv, 311 const struct sk_buff *skb) 312 { 313 return !(dev->features & NETIF_F_ALL_TSO) || 314 (skb->destructor == sock_wfree && 315 rcv->features & (NETIF_F_GRO_FRAGLIST | NETIF_F_GRO_UDP_FWD)); 316 } 317 318 static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev) 319 { 320 struct veth_priv *rcv_priv, *priv = netdev_priv(dev); 321 struct veth_rq *rq = NULL; 322 struct net_device *rcv; 323 int length = skb->len; 324 bool use_napi = false; 325 int rxq; 326 327 rcu_read_lock(); 328 rcv = rcu_dereference(priv->peer); 329 if (unlikely(!rcv)) { 330 kfree_skb(skb); 331 goto drop; 332 } 333 334 rcv_priv = netdev_priv(rcv); 335 rxq = skb_get_queue_mapping(skb); 336 if (rxq < rcv->real_num_rx_queues) { 337 rq = &rcv_priv->rq[rxq]; 338 339 /* The napi pointer is available when an XDP program is 340 * attached or when GRO is enabled 341 * Don't bother with napi/GRO if the skb can't be aggregated 342 */ 343 use_napi = rcu_access_pointer(rq->napi) && 344 veth_skb_is_eligible_for_gro(dev, rcv, skb); 345 skb_record_rx_queue(skb, rxq); 346 } 347 348 skb_tx_timestamp(skb); 349 if (likely(veth_forward_skb(rcv, skb, rq, use_napi) == NET_RX_SUCCESS)) { 350 if (!use_napi) 351 dev_lstats_add(dev, length); 352 } else { 353 drop: 354 atomic64_inc(&priv->dropped); 355 } 356 357 if (use_napi) 358 __veth_xdp_flush(rq); 359 360 rcu_read_unlock(); 361 362 return NETDEV_TX_OK; 363 } 364 365 static u64 veth_stats_tx(struct net_device *dev, u64 *packets, u64 *bytes) 366 { 367 struct veth_priv *priv = netdev_priv(dev); 368 369 dev_lstats_read(dev, packets, bytes); 370 return atomic64_read(&priv->dropped); 371 } 372 373 static void veth_stats_rx(struct veth_stats *result, struct net_device *dev) 374 { 375 struct veth_priv *priv = netdev_priv(dev); 376 int i; 377 378 result->peer_tq_xdp_xmit_err = 0; 379 result->xdp_packets = 0; 380 result->xdp_tx_err = 0; 381 result->xdp_bytes = 0; 382 result->rx_drops = 0; 383 for (i = 0; i < dev->num_rx_queues; i++) { 384 u64 packets, bytes, drops, xdp_tx_err, peer_tq_xdp_xmit_err; 385 struct veth_rq_stats *stats = &priv->rq[i].stats; 386 unsigned int start; 387 388 do { 389 start = u64_stats_fetch_begin_irq(&stats->syncp); 390 peer_tq_xdp_xmit_err = stats->vs.peer_tq_xdp_xmit_err; 391 xdp_tx_err = stats->vs.xdp_tx_err; 392 packets = stats->vs.xdp_packets; 393 bytes = stats->vs.xdp_bytes; 394 drops = stats->vs.rx_drops; 395 } while (u64_stats_fetch_retry_irq(&stats->syncp, start)); 396 result->peer_tq_xdp_xmit_err += peer_tq_xdp_xmit_err; 397 result->xdp_tx_err += xdp_tx_err; 398 result->xdp_packets += packets; 399 result->xdp_bytes += bytes; 400 result->rx_drops += drops; 401 } 402 } 403 404 static void veth_get_stats64(struct net_device *dev, 405 struct rtnl_link_stats64 *tot) 406 { 407 struct veth_priv *priv = netdev_priv(dev); 408 struct net_device *peer; 409 struct veth_stats rx; 410 u64 packets, bytes; 411 412 tot->tx_dropped = veth_stats_tx(dev, &packets, &bytes); 413 tot->tx_bytes = bytes; 414 tot->tx_packets = packets; 415 416 veth_stats_rx(&rx, dev); 417 tot->tx_dropped += rx.xdp_tx_err; 418 tot->rx_dropped = rx.rx_drops + rx.peer_tq_xdp_xmit_err; 419 tot->rx_bytes = rx.xdp_bytes; 420 tot->rx_packets = rx.xdp_packets; 421 422 rcu_read_lock(); 423 peer = rcu_dereference(priv->peer); 424 if (peer) { 425 veth_stats_tx(peer, &packets, &bytes); 426 tot->rx_bytes += bytes; 427 tot->rx_packets += packets; 428 429 veth_stats_rx(&rx, peer); 430 tot->tx_dropped += rx.peer_tq_xdp_xmit_err; 431 tot->rx_dropped += rx.xdp_tx_err; 432 tot->tx_bytes += rx.xdp_bytes; 433 tot->tx_packets += rx.xdp_packets; 434 } 435 rcu_read_unlock(); 436 } 437 438 /* fake multicast ability */ 439 static void veth_set_multicast_list(struct net_device *dev) 440 { 441 } 442 443 static struct sk_buff *veth_build_skb(void *head, int headroom, int len, 444 int buflen) 445 { 446 struct sk_buff *skb; 447 448 skb = build_skb(head, buflen); 449 if (!skb) 450 return NULL; 451 452 skb_reserve(skb, headroom); 453 skb_put(skb, len); 454 455 return skb; 456 } 457 458 static int veth_select_rxq(struct net_device *dev) 459 { 460 return smp_processor_id() % dev->real_num_rx_queues; 461 } 462 463 static struct net_device *veth_peer_dev(struct net_device *dev) 464 { 465 struct veth_priv *priv = netdev_priv(dev); 466 467 /* Callers must be under RCU read side. */ 468 return rcu_dereference(priv->peer); 469 } 470 471 static int veth_xdp_xmit(struct net_device *dev, int n, 472 struct xdp_frame **frames, 473 u32 flags, bool ndo_xmit) 474 { 475 struct veth_priv *rcv_priv, *priv = netdev_priv(dev); 476 int i, ret = -ENXIO, nxmit = 0; 477 struct net_device *rcv; 478 unsigned int max_len; 479 struct veth_rq *rq; 480 481 if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK)) 482 return -EINVAL; 483 484 rcu_read_lock(); 485 rcv = rcu_dereference(priv->peer); 486 if (unlikely(!rcv)) 487 goto out; 488 489 rcv_priv = netdev_priv(rcv); 490 rq = &rcv_priv->rq[veth_select_rxq(rcv)]; 491 /* The napi pointer is set if NAPI is enabled, which ensures that 492 * xdp_ring is initialized on receive side and the peer device is up. 493 */ 494 if (!rcu_access_pointer(rq->napi)) 495 goto out; 496 497 max_len = rcv->mtu + rcv->hard_header_len + VLAN_HLEN; 498 499 spin_lock(&rq->xdp_ring.producer_lock); 500 for (i = 0; i < n; i++) { 501 struct xdp_frame *frame = frames[i]; 502 void *ptr = veth_xdp_to_ptr(frame); 503 504 if (unlikely(frame->len > max_len || 505 __ptr_ring_produce(&rq->xdp_ring, ptr))) 506 break; 507 nxmit++; 508 } 509 spin_unlock(&rq->xdp_ring.producer_lock); 510 511 if (flags & XDP_XMIT_FLUSH) 512 __veth_xdp_flush(rq); 513 514 ret = nxmit; 515 if (ndo_xmit) { 516 u64_stats_update_begin(&rq->stats.syncp); 517 rq->stats.vs.peer_tq_xdp_xmit += nxmit; 518 rq->stats.vs.peer_tq_xdp_xmit_err += n - nxmit; 519 u64_stats_update_end(&rq->stats.syncp); 520 } 521 522 out: 523 rcu_read_unlock(); 524 525 return ret; 526 } 527 528 static int veth_ndo_xdp_xmit(struct net_device *dev, int n, 529 struct xdp_frame **frames, u32 flags) 530 { 531 int err; 532 533 err = veth_xdp_xmit(dev, n, frames, flags, true); 534 if (err < 0) { 535 struct veth_priv *priv = netdev_priv(dev); 536 537 atomic64_add(n, &priv->dropped); 538 } 539 540 return err; 541 } 542 543 static void veth_xdp_flush_bq(struct veth_rq *rq, struct veth_xdp_tx_bq *bq) 544 { 545 int sent, i, err = 0, drops; 546 547 sent = veth_xdp_xmit(rq->dev, bq->count, bq->q, 0, false); 548 if (sent < 0) { 549 err = sent; 550 sent = 0; 551 } 552 553 for (i = sent; unlikely(i < bq->count); i++) 554 xdp_return_frame(bq->q[i]); 555 556 drops = bq->count - sent; 557 trace_xdp_bulk_tx(rq->dev, sent, drops, err); 558 559 u64_stats_update_begin(&rq->stats.syncp); 560 rq->stats.vs.xdp_tx += sent; 561 rq->stats.vs.xdp_tx_err += drops; 562 u64_stats_update_end(&rq->stats.syncp); 563 564 bq->count = 0; 565 } 566 567 static void veth_xdp_flush(struct veth_rq *rq, struct veth_xdp_tx_bq *bq) 568 { 569 struct veth_priv *rcv_priv, *priv = netdev_priv(rq->dev); 570 struct net_device *rcv; 571 struct veth_rq *rcv_rq; 572 573 rcu_read_lock(); 574 veth_xdp_flush_bq(rq, bq); 575 rcv = rcu_dereference(priv->peer); 576 if (unlikely(!rcv)) 577 goto out; 578 579 rcv_priv = netdev_priv(rcv); 580 rcv_rq = &rcv_priv->rq[veth_select_rxq(rcv)]; 581 /* xdp_ring is initialized on receive side? */ 582 if (unlikely(!rcu_access_pointer(rcv_rq->xdp_prog))) 583 goto out; 584 585 __veth_xdp_flush(rcv_rq); 586 out: 587 rcu_read_unlock(); 588 } 589 590 static int veth_xdp_tx(struct veth_rq *rq, struct xdp_buff *xdp, 591 struct veth_xdp_tx_bq *bq) 592 { 593 struct xdp_frame *frame = xdp_convert_buff_to_frame(xdp); 594 595 if (unlikely(!frame)) 596 return -EOVERFLOW; 597 598 if (unlikely(bq->count == VETH_XDP_TX_BULK_SIZE)) 599 veth_xdp_flush_bq(rq, bq); 600 601 bq->q[bq->count++] = frame; 602 603 return 0; 604 } 605 606 static struct xdp_frame *veth_xdp_rcv_one(struct veth_rq *rq, 607 struct xdp_frame *frame, 608 struct veth_xdp_tx_bq *bq, 609 struct veth_stats *stats) 610 { 611 struct xdp_frame orig_frame; 612 struct bpf_prog *xdp_prog; 613 614 rcu_read_lock(); 615 xdp_prog = rcu_dereference(rq->xdp_prog); 616 if (likely(xdp_prog)) { 617 struct xdp_buff xdp; 618 u32 act; 619 620 xdp_convert_frame_to_buff(frame, &xdp); 621 xdp.rxq = &rq->xdp_rxq; 622 623 act = bpf_prog_run_xdp(xdp_prog, &xdp); 624 625 switch (act) { 626 case XDP_PASS: 627 if (xdp_update_frame_from_buff(&xdp, frame)) 628 goto err_xdp; 629 break; 630 case XDP_TX: 631 orig_frame = *frame; 632 xdp.rxq->mem = frame->mem; 633 if (unlikely(veth_xdp_tx(rq, &xdp, bq) < 0)) { 634 trace_xdp_exception(rq->dev, xdp_prog, act); 635 frame = &orig_frame; 636 stats->rx_drops++; 637 goto err_xdp; 638 } 639 stats->xdp_tx++; 640 rcu_read_unlock(); 641 goto xdp_xmit; 642 case XDP_REDIRECT: 643 orig_frame = *frame; 644 xdp.rxq->mem = frame->mem; 645 if (xdp_do_redirect(rq->dev, &xdp, xdp_prog)) { 646 frame = &orig_frame; 647 stats->rx_drops++; 648 goto err_xdp; 649 } 650 stats->xdp_redirect++; 651 rcu_read_unlock(); 652 goto xdp_xmit; 653 default: 654 bpf_warn_invalid_xdp_action(act); 655 fallthrough; 656 case XDP_ABORTED: 657 trace_xdp_exception(rq->dev, xdp_prog, act); 658 fallthrough; 659 case XDP_DROP: 660 stats->xdp_drops++; 661 goto err_xdp; 662 } 663 } 664 rcu_read_unlock(); 665 666 return frame; 667 err_xdp: 668 rcu_read_unlock(); 669 xdp_return_frame(frame); 670 xdp_xmit: 671 return NULL; 672 } 673 674 /* frames array contains VETH_XDP_BATCH at most */ 675 static void veth_xdp_rcv_bulk_skb(struct veth_rq *rq, void **frames, 676 int n_xdpf, struct veth_xdp_tx_bq *bq, 677 struct veth_stats *stats) 678 { 679 void *skbs[VETH_XDP_BATCH]; 680 int i; 681 682 if (xdp_alloc_skb_bulk(skbs, n_xdpf, 683 GFP_ATOMIC | __GFP_ZERO) < 0) { 684 for (i = 0; i < n_xdpf; i++) 685 xdp_return_frame(frames[i]); 686 stats->rx_drops += n_xdpf; 687 688 return; 689 } 690 691 for (i = 0; i < n_xdpf; i++) { 692 struct sk_buff *skb = skbs[i]; 693 694 skb = __xdp_build_skb_from_frame(frames[i], skb, 695 rq->dev); 696 if (!skb) { 697 xdp_return_frame(frames[i]); 698 stats->rx_drops++; 699 continue; 700 } 701 napi_gro_receive(&rq->xdp_napi, skb); 702 } 703 } 704 705 static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq, 706 struct sk_buff *skb, 707 struct veth_xdp_tx_bq *bq, 708 struct veth_stats *stats) 709 { 710 u32 pktlen, headroom, act, metalen, frame_sz; 711 void *orig_data, *orig_data_end; 712 struct bpf_prog *xdp_prog; 713 int mac_len, delta, off; 714 struct xdp_buff xdp; 715 716 skb_prepare_for_gro(skb); 717 718 rcu_read_lock(); 719 xdp_prog = rcu_dereference(rq->xdp_prog); 720 if (unlikely(!xdp_prog)) { 721 rcu_read_unlock(); 722 goto out; 723 } 724 725 mac_len = skb->data - skb_mac_header(skb); 726 pktlen = skb->len + mac_len; 727 headroom = skb_headroom(skb) - mac_len; 728 729 if (skb_shared(skb) || skb_head_is_locked(skb) || 730 skb_is_nonlinear(skb) || headroom < XDP_PACKET_HEADROOM) { 731 struct sk_buff *nskb; 732 int size, head_off; 733 void *head, *start; 734 struct page *page; 735 736 size = SKB_DATA_ALIGN(VETH_XDP_HEADROOM + pktlen) + 737 SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); 738 if (size > PAGE_SIZE) 739 goto drop; 740 741 page = alloc_page(GFP_ATOMIC | __GFP_NOWARN); 742 if (!page) 743 goto drop; 744 745 head = page_address(page); 746 start = head + VETH_XDP_HEADROOM; 747 if (skb_copy_bits(skb, -mac_len, start, pktlen)) { 748 page_frag_free(head); 749 goto drop; 750 } 751 752 nskb = veth_build_skb(head, VETH_XDP_HEADROOM + mac_len, 753 skb->len, PAGE_SIZE); 754 if (!nskb) { 755 page_frag_free(head); 756 goto drop; 757 } 758 759 skb_copy_header(nskb, skb); 760 head_off = skb_headroom(nskb) - skb_headroom(skb); 761 skb_headers_offset_update(nskb, head_off); 762 consume_skb(skb); 763 skb = nskb; 764 } 765 766 /* SKB "head" area always have tailroom for skb_shared_info */ 767 frame_sz = skb_end_pointer(skb) - skb->head; 768 frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); 769 xdp_init_buff(&xdp, frame_sz, &rq->xdp_rxq); 770 xdp_prepare_buff(&xdp, skb->head, skb->mac_header, pktlen, true); 771 772 orig_data = xdp.data; 773 orig_data_end = xdp.data_end; 774 775 act = bpf_prog_run_xdp(xdp_prog, &xdp); 776 777 switch (act) { 778 case XDP_PASS: 779 break; 780 case XDP_TX: 781 get_page(virt_to_page(xdp.data)); 782 consume_skb(skb); 783 xdp.rxq->mem = rq->xdp_mem; 784 if (unlikely(veth_xdp_tx(rq, &xdp, bq) < 0)) { 785 trace_xdp_exception(rq->dev, xdp_prog, act); 786 stats->rx_drops++; 787 goto err_xdp; 788 } 789 stats->xdp_tx++; 790 rcu_read_unlock(); 791 goto xdp_xmit; 792 case XDP_REDIRECT: 793 get_page(virt_to_page(xdp.data)); 794 consume_skb(skb); 795 xdp.rxq->mem = rq->xdp_mem; 796 if (xdp_do_redirect(rq->dev, &xdp, xdp_prog)) { 797 stats->rx_drops++; 798 goto err_xdp; 799 } 800 stats->xdp_redirect++; 801 rcu_read_unlock(); 802 goto xdp_xmit; 803 default: 804 bpf_warn_invalid_xdp_action(act); 805 fallthrough; 806 case XDP_ABORTED: 807 trace_xdp_exception(rq->dev, xdp_prog, act); 808 fallthrough; 809 case XDP_DROP: 810 stats->xdp_drops++; 811 goto xdp_drop; 812 } 813 rcu_read_unlock(); 814 815 /* check if bpf_xdp_adjust_head was used */ 816 delta = orig_data - xdp.data; 817 off = mac_len + delta; 818 if (off > 0) 819 __skb_push(skb, off); 820 else if (off < 0) 821 __skb_pull(skb, -off); 822 skb->mac_header -= delta; 823 824 /* check if bpf_xdp_adjust_tail was used */ 825 off = xdp.data_end - orig_data_end; 826 if (off != 0) 827 __skb_put(skb, off); /* positive on grow, negative on shrink */ 828 skb->protocol = eth_type_trans(skb, rq->dev); 829 830 metalen = xdp.data - xdp.data_meta; 831 if (metalen) 832 skb_metadata_set(skb, metalen); 833 out: 834 return skb; 835 drop: 836 stats->rx_drops++; 837 xdp_drop: 838 rcu_read_unlock(); 839 kfree_skb(skb); 840 return NULL; 841 err_xdp: 842 rcu_read_unlock(); 843 page_frag_free(xdp.data); 844 xdp_xmit: 845 return NULL; 846 } 847 848 static int veth_xdp_rcv(struct veth_rq *rq, int budget, 849 struct veth_xdp_tx_bq *bq, 850 struct veth_stats *stats) 851 { 852 int i, done = 0, n_xdpf = 0; 853 void *xdpf[VETH_XDP_BATCH]; 854 855 for (i = 0; i < budget; i++) { 856 void *ptr = __ptr_ring_consume(&rq->xdp_ring); 857 858 if (!ptr) 859 break; 860 861 if (veth_is_xdp_frame(ptr)) { 862 /* ndo_xdp_xmit */ 863 struct xdp_frame *frame = veth_ptr_to_xdp(ptr); 864 865 stats->xdp_bytes += frame->len; 866 frame = veth_xdp_rcv_one(rq, frame, bq, stats); 867 if (frame) { 868 /* XDP_PASS */ 869 xdpf[n_xdpf++] = frame; 870 if (n_xdpf == VETH_XDP_BATCH) { 871 veth_xdp_rcv_bulk_skb(rq, xdpf, n_xdpf, 872 bq, stats); 873 n_xdpf = 0; 874 } 875 } 876 } else { 877 /* ndo_start_xmit */ 878 struct sk_buff *skb = ptr; 879 880 stats->xdp_bytes += skb->len; 881 skb = veth_xdp_rcv_skb(rq, skb, bq, stats); 882 if (skb) { 883 if (skb_shared(skb) || skb_unclone(skb, GFP_ATOMIC)) 884 netif_receive_skb(skb); 885 else 886 napi_gro_receive(&rq->xdp_napi, skb); 887 } 888 } 889 done++; 890 } 891 892 if (n_xdpf) 893 veth_xdp_rcv_bulk_skb(rq, xdpf, n_xdpf, bq, stats); 894 895 u64_stats_update_begin(&rq->stats.syncp); 896 rq->stats.vs.xdp_redirect += stats->xdp_redirect; 897 rq->stats.vs.xdp_bytes += stats->xdp_bytes; 898 rq->stats.vs.xdp_drops += stats->xdp_drops; 899 rq->stats.vs.rx_drops += stats->rx_drops; 900 rq->stats.vs.xdp_packets += done; 901 u64_stats_update_end(&rq->stats.syncp); 902 903 return done; 904 } 905 906 static int veth_poll(struct napi_struct *napi, int budget) 907 { 908 struct veth_rq *rq = 909 container_of(napi, struct veth_rq, xdp_napi); 910 struct veth_stats stats = {}; 911 struct veth_xdp_tx_bq bq; 912 int done; 913 914 bq.count = 0; 915 916 xdp_set_return_frame_no_direct(); 917 done = veth_xdp_rcv(rq, budget, &bq, &stats); 918 919 if (done < budget && napi_complete_done(napi, done)) { 920 /* Write rx_notify_masked before reading ptr_ring */ 921 smp_store_mb(rq->rx_notify_masked, false); 922 if (unlikely(!__ptr_ring_empty(&rq->xdp_ring))) { 923 rq->rx_notify_masked = true; 924 napi_schedule(&rq->xdp_napi); 925 } 926 } 927 928 if (stats.xdp_tx > 0) 929 veth_xdp_flush(rq, &bq); 930 if (stats.xdp_redirect > 0) 931 xdp_do_flush(); 932 xdp_clear_return_frame_no_direct(); 933 934 return done; 935 } 936 937 static int __veth_napi_enable_range(struct net_device *dev, int start, int end) 938 { 939 struct veth_priv *priv = netdev_priv(dev); 940 int err, i; 941 942 for (i = start; i < end; i++) { 943 struct veth_rq *rq = &priv->rq[i]; 944 945 err = ptr_ring_init(&rq->xdp_ring, VETH_RING_SIZE, GFP_KERNEL); 946 if (err) 947 goto err_xdp_ring; 948 } 949 950 for (i = start; i < end; i++) { 951 struct veth_rq *rq = &priv->rq[i]; 952 953 napi_enable(&rq->xdp_napi); 954 rcu_assign_pointer(priv->rq[i].napi, &priv->rq[i].xdp_napi); 955 } 956 957 return 0; 958 959 err_xdp_ring: 960 for (i--; i >= start; i--) 961 ptr_ring_cleanup(&priv->rq[i].xdp_ring, veth_ptr_free); 962 963 return err; 964 } 965 966 static int __veth_napi_enable(struct net_device *dev) 967 { 968 return __veth_napi_enable_range(dev, 0, dev->real_num_rx_queues); 969 } 970 971 static void veth_napi_del_range(struct net_device *dev, int start, int end) 972 { 973 struct veth_priv *priv = netdev_priv(dev); 974 int i; 975 976 for (i = start; i < end; i++) { 977 struct veth_rq *rq = &priv->rq[i]; 978 979 rcu_assign_pointer(priv->rq[i].napi, NULL); 980 napi_disable(&rq->xdp_napi); 981 __netif_napi_del(&rq->xdp_napi); 982 } 983 synchronize_net(); 984 985 for (i = start; i < end; i++) { 986 struct veth_rq *rq = &priv->rq[i]; 987 988 rq->rx_notify_masked = false; 989 ptr_ring_cleanup(&rq->xdp_ring, veth_ptr_free); 990 } 991 } 992 993 static void veth_napi_del(struct net_device *dev) 994 { 995 veth_napi_del_range(dev, 0, dev->real_num_rx_queues); 996 } 997 998 static bool veth_gro_requested(const struct net_device *dev) 999 { 1000 return !!(dev->wanted_features & NETIF_F_GRO); 1001 } 1002 1003 static int veth_enable_xdp_range(struct net_device *dev, int start, int end, 1004 bool napi_already_on) 1005 { 1006 struct veth_priv *priv = netdev_priv(dev); 1007 int err, i; 1008 1009 for (i = start; i < end; i++) { 1010 struct veth_rq *rq = &priv->rq[i]; 1011 1012 if (!napi_already_on) 1013 netif_napi_add(dev, &rq->xdp_napi, veth_poll, NAPI_POLL_WEIGHT); 1014 err = xdp_rxq_info_reg(&rq->xdp_rxq, dev, i, rq->xdp_napi.napi_id); 1015 if (err < 0) 1016 goto err_rxq_reg; 1017 1018 err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq, 1019 MEM_TYPE_PAGE_SHARED, 1020 NULL); 1021 if (err < 0) 1022 goto err_reg_mem; 1023 1024 /* Save original mem info as it can be overwritten */ 1025 rq->xdp_mem = rq->xdp_rxq.mem; 1026 } 1027 return 0; 1028 1029 err_reg_mem: 1030 xdp_rxq_info_unreg(&priv->rq[i].xdp_rxq); 1031 err_rxq_reg: 1032 for (i--; i >= start; i--) { 1033 struct veth_rq *rq = &priv->rq[i]; 1034 1035 xdp_rxq_info_unreg(&rq->xdp_rxq); 1036 if (!napi_already_on) 1037 netif_napi_del(&rq->xdp_napi); 1038 } 1039 1040 return err; 1041 } 1042 1043 static void veth_disable_xdp_range(struct net_device *dev, int start, int end, 1044 bool delete_napi) 1045 { 1046 struct veth_priv *priv = netdev_priv(dev); 1047 int i; 1048 1049 for (i = start; i < end; i++) { 1050 struct veth_rq *rq = &priv->rq[i]; 1051 1052 rq->xdp_rxq.mem = rq->xdp_mem; 1053 xdp_rxq_info_unreg(&rq->xdp_rxq); 1054 1055 if (delete_napi) 1056 netif_napi_del(&rq->xdp_napi); 1057 } 1058 } 1059 1060 static int veth_enable_xdp(struct net_device *dev) 1061 { 1062 bool napi_already_on = veth_gro_requested(dev) && (dev->flags & IFF_UP); 1063 struct veth_priv *priv = netdev_priv(dev); 1064 int err, i; 1065 1066 if (!xdp_rxq_info_is_reg(&priv->rq[0].xdp_rxq)) { 1067 err = veth_enable_xdp_range(dev, 0, dev->real_num_rx_queues, napi_already_on); 1068 if (err) 1069 return err; 1070 1071 if (!napi_already_on) { 1072 err = __veth_napi_enable(dev); 1073 if (err) { 1074 veth_disable_xdp_range(dev, 0, dev->real_num_rx_queues, true); 1075 return err; 1076 } 1077 1078 if (!veth_gro_requested(dev)) { 1079 /* user-space did not require GRO, but adding XDP 1080 * is supposed to get GRO working 1081 */ 1082 dev->features |= NETIF_F_GRO; 1083 netdev_features_change(dev); 1084 } 1085 } 1086 } 1087 1088 for (i = 0; i < dev->real_num_rx_queues; i++) { 1089 rcu_assign_pointer(priv->rq[i].xdp_prog, priv->_xdp_prog); 1090 rcu_assign_pointer(priv->rq[i].napi, &priv->rq[i].xdp_napi); 1091 } 1092 1093 return 0; 1094 } 1095 1096 static void veth_disable_xdp(struct net_device *dev) 1097 { 1098 struct veth_priv *priv = netdev_priv(dev); 1099 int i; 1100 1101 for (i = 0; i < dev->real_num_rx_queues; i++) 1102 rcu_assign_pointer(priv->rq[i].xdp_prog, NULL); 1103 1104 if (!netif_running(dev) || !veth_gro_requested(dev)) { 1105 veth_napi_del(dev); 1106 1107 /* if user-space did not require GRO, since adding XDP 1108 * enabled it, clear it now 1109 */ 1110 if (!veth_gro_requested(dev) && netif_running(dev)) { 1111 dev->features &= ~NETIF_F_GRO; 1112 netdev_features_change(dev); 1113 } 1114 } 1115 1116 veth_disable_xdp_range(dev, 0, dev->real_num_rx_queues, false); 1117 } 1118 1119 static int veth_napi_enable_range(struct net_device *dev, int start, int end) 1120 { 1121 struct veth_priv *priv = netdev_priv(dev); 1122 int err, i; 1123 1124 for (i = start; i < end; i++) { 1125 struct veth_rq *rq = &priv->rq[i]; 1126 1127 netif_napi_add(dev, &rq->xdp_napi, veth_poll, NAPI_POLL_WEIGHT); 1128 } 1129 1130 err = __veth_napi_enable_range(dev, start, end); 1131 if (err) { 1132 for (i = start; i < end; i++) { 1133 struct veth_rq *rq = &priv->rq[i]; 1134 1135 netif_napi_del(&rq->xdp_napi); 1136 } 1137 return err; 1138 } 1139 return err; 1140 } 1141 1142 static int veth_napi_enable(struct net_device *dev) 1143 { 1144 return veth_napi_enable_range(dev, 0, dev->real_num_rx_queues); 1145 } 1146 1147 static void veth_disable_range_safe(struct net_device *dev, int start, int end) 1148 { 1149 struct veth_priv *priv = netdev_priv(dev); 1150 1151 if (start >= end) 1152 return; 1153 1154 if (priv->_xdp_prog) { 1155 veth_napi_del_range(dev, start, end); 1156 veth_disable_xdp_range(dev, start, end, false); 1157 } else if (veth_gro_requested(dev)) { 1158 veth_napi_del_range(dev, start, end); 1159 } 1160 } 1161 1162 static int veth_enable_range_safe(struct net_device *dev, int start, int end) 1163 { 1164 struct veth_priv *priv = netdev_priv(dev); 1165 int err; 1166 1167 if (start >= end) 1168 return 0; 1169 1170 if (priv->_xdp_prog) { 1171 /* these channels are freshly initialized, napi is not on there even 1172 * when GRO is requeste 1173 */ 1174 err = veth_enable_xdp_range(dev, start, end, false); 1175 if (err) 1176 return err; 1177 1178 err = __veth_napi_enable_range(dev, start, end); 1179 if (err) { 1180 /* on error always delete the newly added napis */ 1181 veth_disable_xdp_range(dev, start, end, true); 1182 return err; 1183 } 1184 } else if (veth_gro_requested(dev)) { 1185 return veth_napi_enable_range(dev, start, end); 1186 } 1187 return 0; 1188 } 1189 1190 static int veth_set_channels(struct net_device *dev, 1191 struct ethtool_channels *ch) 1192 { 1193 struct veth_priv *priv = netdev_priv(dev); 1194 unsigned int old_rx_count, new_rx_count; 1195 struct veth_priv *peer_priv; 1196 struct net_device *peer; 1197 int err; 1198 1199 /* sanity check. Upper bounds are already enforced by the caller */ 1200 if (!ch->rx_count || !ch->tx_count) 1201 return -EINVAL; 1202 1203 /* avoid braking XDP, if that is enabled */ 1204 peer = rtnl_dereference(priv->peer); 1205 peer_priv = peer ? netdev_priv(peer) : NULL; 1206 if (priv->_xdp_prog && peer && ch->rx_count < peer->real_num_tx_queues) 1207 return -EINVAL; 1208 1209 if (peer && peer_priv && peer_priv->_xdp_prog && ch->tx_count > peer->real_num_rx_queues) 1210 return -EINVAL; 1211 1212 old_rx_count = dev->real_num_rx_queues; 1213 new_rx_count = ch->rx_count; 1214 if (netif_running(dev)) { 1215 /* turn device off */ 1216 netif_carrier_off(dev); 1217 if (peer) 1218 netif_carrier_off(peer); 1219 1220 /* try to allocate new resurces, as needed*/ 1221 err = veth_enable_range_safe(dev, old_rx_count, new_rx_count); 1222 if (err) 1223 goto out; 1224 } 1225 1226 err = netif_set_real_num_rx_queues(dev, ch->rx_count); 1227 if (err) 1228 goto revert; 1229 1230 err = netif_set_real_num_tx_queues(dev, ch->tx_count); 1231 if (err) { 1232 int err2 = netif_set_real_num_rx_queues(dev, old_rx_count); 1233 1234 /* this error condition could happen only if rx and tx change 1235 * in opposite directions (e.g. tx nr raises, rx nr decreases) 1236 * and we can't do anything to fully restore the original 1237 * status 1238 */ 1239 if (err2) 1240 pr_warn("Can't restore rx queues config %d -> %d %d", 1241 new_rx_count, old_rx_count, err2); 1242 else 1243 goto revert; 1244 } 1245 1246 out: 1247 if (netif_running(dev)) { 1248 /* note that we need to swap the arguments WRT the enable part 1249 * to identify the range we have to disable 1250 */ 1251 veth_disable_range_safe(dev, new_rx_count, old_rx_count); 1252 netif_carrier_on(dev); 1253 if (peer) 1254 netif_carrier_on(peer); 1255 } 1256 return err; 1257 1258 revert: 1259 new_rx_count = old_rx_count; 1260 old_rx_count = ch->rx_count; 1261 goto out; 1262 } 1263 1264 static int veth_open(struct net_device *dev) 1265 { 1266 struct veth_priv *priv = netdev_priv(dev); 1267 struct net_device *peer = rtnl_dereference(priv->peer); 1268 int err; 1269 1270 if (!peer) 1271 return -ENOTCONN; 1272 1273 if (priv->_xdp_prog) { 1274 err = veth_enable_xdp(dev); 1275 if (err) 1276 return err; 1277 } else if (veth_gro_requested(dev)) { 1278 err = veth_napi_enable(dev); 1279 if (err) 1280 return err; 1281 } 1282 1283 if (peer->flags & IFF_UP) { 1284 netif_carrier_on(dev); 1285 netif_carrier_on(peer); 1286 } 1287 1288 return 0; 1289 } 1290 1291 static int veth_close(struct net_device *dev) 1292 { 1293 struct veth_priv *priv = netdev_priv(dev); 1294 struct net_device *peer = rtnl_dereference(priv->peer); 1295 1296 netif_carrier_off(dev); 1297 if (peer) 1298 netif_carrier_off(peer); 1299 1300 if (priv->_xdp_prog) 1301 veth_disable_xdp(dev); 1302 else if (veth_gro_requested(dev)) 1303 veth_napi_del(dev); 1304 1305 return 0; 1306 } 1307 1308 static int is_valid_veth_mtu(int mtu) 1309 { 1310 return mtu >= ETH_MIN_MTU && mtu <= ETH_MAX_MTU; 1311 } 1312 1313 static int veth_alloc_queues(struct net_device *dev) 1314 { 1315 struct veth_priv *priv = netdev_priv(dev); 1316 int i; 1317 1318 priv->rq = kcalloc(dev->num_rx_queues, sizeof(*priv->rq), GFP_KERNEL); 1319 if (!priv->rq) 1320 return -ENOMEM; 1321 1322 for (i = 0; i < dev->num_rx_queues; i++) { 1323 priv->rq[i].dev = dev; 1324 u64_stats_init(&priv->rq[i].stats.syncp); 1325 } 1326 1327 return 0; 1328 } 1329 1330 static void veth_free_queues(struct net_device *dev) 1331 { 1332 struct veth_priv *priv = netdev_priv(dev); 1333 1334 kfree(priv->rq); 1335 } 1336 1337 static int veth_dev_init(struct net_device *dev) 1338 { 1339 int err; 1340 1341 dev->lstats = netdev_alloc_pcpu_stats(struct pcpu_lstats); 1342 if (!dev->lstats) 1343 return -ENOMEM; 1344 1345 err = veth_alloc_queues(dev); 1346 if (err) { 1347 free_percpu(dev->lstats); 1348 return err; 1349 } 1350 1351 return 0; 1352 } 1353 1354 static void veth_dev_free(struct net_device *dev) 1355 { 1356 veth_free_queues(dev); 1357 free_percpu(dev->lstats); 1358 } 1359 1360 #ifdef CONFIG_NET_POLL_CONTROLLER 1361 static void veth_poll_controller(struct net_device *dev) 1362 { 1363 /* veth only receives frames when its peer sends one 1364 * Since it has nothing to do with disabling irqs, we are guaranteed 1365 * never to have pending data when we poll for it so 1366 * there is nothing to do here. 1367 * 1368 * We need this though so netpoll recognizes us as an interface that 1369 * supports polling, which enables bridge devices in virt setups to 1370 * still use netconsole 1371 */ 1372 } 1373 #endif /* CONFIG_NET_POLL_CONTROLLER */ 1374 1375 static int veth_get_iflink(const struct net_device *dev) 1376 { 1377 struct veth_priv *priv = netdev_priv(dev); 1378 struct net_device *peer; 1379 int iflink; 1380 1381 rcu_read_lock(); 1382 peer = rcu_dereference(priv->peer); 1383 iflink = peer ? peer->ifindex : 0; 1384 rcu_read_unlock(); 1385 1386 return iflink; 1387 } 1388 1389 static netdev_features_t veth_fix_features(struct net_device *dev, 1390 netdev_features_t features) 1391 { 1392 struct veth_priv *priv = netdev_priv(dev); 1393 struct net_device *peer; 1394 1395 peer = rtnl_dereference(priv->peer); 1396 if (peer) { 1397 struct veth_priv *peer_priv = netdev_priv(peer); 1398 1399 if (peer_priv->_xdp_prog) 1400 features &= ~NETIF_F_GSO_SOFTWARE; 1401 } 1402 if (priv->_xdp_prog) 1403 features |= NETIF_F_GRO; 1404 1405 return features; 1406 } 1407 1408 static int veth_set_features(struct net_device *dev, 1409 netdev_features_t features) 1410 { 1411 netdev_features_t changed = features ^ dev->features; 1412 struct veth_priv *priv = netdev_priv(dev); 1413 int err; 1414 1415 if (!(changed & NETIF_F_GRO) || !(dev->flags & IFF_UP) || priv->_xdp_prog) 1416 return 0; 1417 1418 if (features & NETIF_F_GRO) { 1419 err = veth_napi_enable(dev); 1420 if (err) 1421 return err; 1422 } else { 1423 veth_napi_del(dev); 1424 } 1425 return 0; 1426 } 1427 1428 static void veth_set_rx_headroom(struct net_device *dev, int new_hr) 1429 { 1430 struct veth_priv *peer_priv, *priv = netdev_priv(dev); 1431 struct net_device *peer; 1432 1433 if (new_hr < 0) 1434 new_hr = 0; 1435 1436 rcu_read_lock(); 1437 peer = rcu_dereference(priv->peer); 1438 if (unlikely(!peer)) 1439 goto out; 1440 1441 peer_priv = netdev_priv(peer); 1442 priv->requested_headroom = new_hr; 1443 new_hr = max(priv->requested_headroom, peer_priv->requested_headroom); 1444 dev->needed_headroom = new_hr; 1445 peer->needed_headroom = new_hr; 1446 1447 out: 1448 rcu_read_unlock(); 1449 } 1450 1451 static int veth_xdp_set(struct net_device *dev, struct bpf_prog *prog, 1452 struct netlink_ext_ack *extack) 1453 { 1454 struct veth_priv *priv = netdev_priv(dev); 1455 struct bpf_prog *old_prog; 1456 struct net_device *peer; 1457 unsigned int max_mtu; 1458 int err; 1459 1460 old_prog = priv->_xdp_prog; 1461 priv->_xdp_prog = prog; 1462 peer = rtnl_dereference(priv->peer); 1463 1464 if (prog) { 1465 if (!peer) { 1466 NL_SET_ERR_MSG_MOD(extack, "Cannot set XDP when peer is detached"); 1467 err = -ENOTCONN; 1468 goto err; 1469 } 1470 1471 max_mtu = PAGE_SIZE - VETH_XDP_HEADROOM - 1472 peer->hard_header_len - 1473 SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); 1474 if (peer->mtu > max_mtu) { 1475 NL_SET_ERR_MSG_MOD(extack, "Peer MTU is too large to set XDP"); 1476 err = -ERANGE; 1477 goto err; 1478 } 1479 1480 if (dev->real_num_rx_queues < peer->real_num_tx_queues) { 1481 NL_SET_ERR_MSG_MOD(extack, "XDP expects number of rx queues not less than peer tx queues"); 1482 err = -ENOSPC; 1483 goto err; 1484 } 1485 1486 if (dev->flags & IFF_UP) { 1487 err = veth_enable_xdp(dev); 1488 if (err) { 1489 NL_SET_ERR_MSG_MOD(extack, "Setup for XDP failed"); 1490 goto err; 1491 } 1492 } 1493 1494 if (!old_prog) { 1495 peer->hw_features &= ~NETIF_F_GSO_SOFTWARE; 1496 peer->max_mtu = max_mtu; 1497 } 1498 } 1499 1500 if (old_prog) { 1501 if (!prog) { 1502 if (dev->flags & IFF_UP) 1503 veth_disable_xdp(dev); 1504 1505 if (peer) { 1506 peer->hw_features |= NETIF_F_GSO_SOFTWARE; 1507 peer->max_mtu = ETH_MAX_MTU; 1508 } 1509 } 1510 bpf_prog_put(old_prog); 1511 } 1512 1513 if ((!!old_prog ^ !!prog) && peer) 1514 netdev_update_features(peer); 1515 1516 return 0; 1517 err: 1518 priv->_xdp_prog = old_prog; 1519 1520 return err; 1521 } 1522 1523 static int veth_xdp(struct net_device *dev, struct netdev_bpf *xdp) 1524 { 1525 switch (xdp->command) { 1526 case XDP_SETUP_PROG: 1527 return veth_xdp_set(dev, xdp->prog, xdp->extack); 1528 default: 1529 return -EINVAL; 1530 } 1531 } 1532 1533 static const struct net_device_ops veth_netdev_ops = { 1534 .ndo_init = veth_dev_init, 1535 .ndo_open = veth_open, 1536 .ndo_stop = veth_close, 1537 .ndo_start_xmit = veth_xmit, 1538 .ndo_get_stats64 = veth_get_stats64, 1539 .ndo_set_rx_mode = veth_set_multicast_list, 1540 .ndo_set_mac_address = eth_mac_addr, 1541 #ifdef CONFIG_NET_POLL_CONTROLLER 1542 .ndo_poll_controller = veth_poll_controller, 1543 #endif 1544 .ndo_get_iflink = veth_get_iflink, 1545 .ndo_fix_features = veth_fix_features, 1546 .ndo_set_features = veth_set_features, 1547 .ndo_features_check = passthru_features_check, 1548 .ndo_set_rx_headroom = veth_set_rx_headroom, 1549 .ndo_bpf = veth_xdp, 1550 .ndo_xdp_xmit = veth_ndo_xdp_xmit, 1551 .ndo_get_peer_dev = veth_peer_dev, 1552 }; 1553 1554 #define VETH_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HW_CSUM | \ 1555 NETIF_F_RXCSUM | NETIF_F_SCTP_CRC | NETIF_F_HIGHDMA | \ 1556 NETIF_F_GSO_SOFTWARE | NETIF_F_GSO_ENCAP_ALL | \ 1557 NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX | \ 1558 NETIF_F_HW_VLAN_STAG_TX | NETIF_F_HW_VLAN_STAG_RX ) 1559 1560 static void veth_setup(struct net_device *dev) 1561 { 1562 ether_setup(dev); 1563 1564 dev->priv_flags &= ~IFF_TX_SKB_SHARING; 1565 dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; 1566 dev->priv_flags |= IFF_NO_QUEUE; 1567 dev->priv_flags |= IFF_PHONY_HEADROOM; 1568 1569 dev->netdev_ops = &veth_netdev_ops; 1570 dev->ethtool_ops = &veth_ethtool_ops; 1571 dev->features |= NETIF_F_LLTX; 1572 dev->features |= VETH_FEATURES; 1573 dev->vlan_features = dev->features & 1574 ~(NETIF_F_HW_VLAN_CTAG_TX | 1575 NETIF_F_HW_VLAN_STAG_TX | 1576 NETIF_F_HW_VLAN_CTAG_RX | 1577 NETIF_F_HW_VLAN_STAG_RX); 1578 dev->needs_free_netdev = true; 1579 dev->priv_destructor = veth_dev_free; 1580 dev->max_mtu = ETH_MAX_MTU; 1581 1582 dev->hw_features = VETH_FEATURES; 1583 dev->hw_enc_features = VETH_FEATURES; 1584 dev->mpls_features = NETIF_F_HW_CSUM | NETIF_F_GSO_SOFTWARE; 1585 } 1586 1587 /* 1588 * netlink interface 1589 */ 1590 1591 static int veth_validate(struct nlattr *tb[], struct nlattr *data[], 1592 struct netlink_ext_ack *extack) 1593 { 1594 if (tb[IFLA_ADDRESS]) { 1595 if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) 1596 return -EINVAL; 1597 if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) 1598 return -EADDRNOTAVAIL; 1599 } 1600 if (tb[IFLA_MTU]) { 1601 if (!is_valid_veth_mtu(nla_get_u32(tb[IFLA_MTU]))) 1602 return -EINVAL; 1603 } 1604 return 0; 1605 } 1606 1607 static struct rtnl_link_ops veth_link_ops; 1608 1609 static void veth_disable_gro(struct net_device *dev) 1610 { 1611 dev->features &= ~NETIF_F_GRO; 1612 dev->wanted_features &= ~NETIF_F_GRO; 1613 netdev_update_features(dev); 1614 } 1615 1616 static int veth_init_queues(struct net_device *dev, struct nlattr *tb[]) 1617 { 1618 int err; 1619 1620 if (!tb[IFLA_NUM_TX_QUEUES] && dev->num_tx_queues > 1) { 1621 err = netif_set_real_num_tx_queues(dev, 1); 1622 if (err) 1623 return err; 1624 } 1625 if (!tb[IFLA_NUM_RX_QUEUES] && dev->num_rx_queues > 1) { 1626 err = netif_set_real_num_rx_queues(dev, 1); 1627 if (err) 1628 return err; 1629 } 1630 return 0; 1631 } 1632 1633 static int veth_newlink(struct net *src_net, struct net_device *dev, 1634 struct nlattr *tb[], struct nlattr *data[], 1635 struct netlink_ext_ack *extack) 1636 { 1637 int err; 1638 struct net_device *peer; 1639 struct veth_priv *priv; 1640 char ifname[IFNAMSIZ]; 1641 struct nlattr *peer_tb[IFLA_MAX + 1], **tbp; 1642 unsigned char name_assign_type; 1643 struct ifinfomsg *ifmp; 1644 struct net *net; 1645 1646 /* 1647 * create and register peer first 1648 */ 1649 if (data != NULL && data[VETH_INFO_PEER] != NULL) { 1650 struct nlattr *nla_peer; 1651 1652 nla_peer = data[VETH_INFO_PEER]; 1653 ifmp = nla_data(nla_peer); 1654 err = rtnl_nla_parse_ifla(peer_tb, 1655 nla_data(nla_peer) + sizeof(struct ifinfomsg), 1656 nla_len(nla_peer) - sizeof(struct ifinfomsg), 1657 NULL); 1658 if (err < 0) 1659 return err; 1660 1661 err = veth_validate(peer_tb, NULL, extack); 1662 if (err < 0) 1663 return err; 1664 1665 tbp = peer_tb; 1666 } else { 1667 ifmp = NULL; 1668 tbp = tb; 1669 } 1670 1671 if (ifmp && tbp[IFLA_IFNAME]) { 1672 nla_strscpy(ifname, tbp[IFLA_IFNAME], IFNAMSIZ); 1673 name_assign_type = NET_NAME_USER; 1674 } else { 1675 snprintf(ifname, IFNAMSIZ, DRV_NAME "%%d"); 1676 name_assign_type = NET_NAME_ENUM; 1677 } 1678 1679 net = rtnl_link_get_net(src_net, tbp); 1680 if (IS_ERR(net)) 1681 return PTR_ERR(net); 1682 1683 peer = rtnl_create_link(net, ifname, name_assign_type, 1684 &veth_link_ops, tbp, extack); 1685 if (IS_ERR(peer)) { 1686 put_net(net); 1687 return PTR_ERR(peer); 1688 } 1689 1690 if (!ifmp || !tbp[IFLA_ADDRESS]) 1691 eth_hw_addr_random(peer); 1692 1693 if (ifmp && (dev->ifindex != 0)) 1694 peer->ifindex = ifmp->ifi_index; 1695 1696 peer->gso_max_size = dev->gso_max_size; 1697 peer->gso_max_segs = dev->gso_max_segs; 1698 1699 err = register_netdevice(peer); 1700 put_net(net); 1701 net = NULL; 1702 if (err < 0) 1703 goto err_register_peer; 1704 1705 /* keep GRO disabled by default to be consistent with the established 1706 * veth behavior 1707 */ 1708 veth_disable_gro(peer); 1709 netif_carrier_off(peer); 1710 1711 err = rtnl_configure_link(peer, ifmp); 1712 if (err < 0) 1713 goto err_configure_peer; 1714 1715 /* 1716 * register dev last 1717 * 1718 * note, that since we've registered new device the dev's name 1719 * should be re-allocated 1720 */ 1721 1722 if (tb[IFLA_ADDRESS] == NULL) 1723 eth_hw_addr_random(dev); 1724 1725 if (tb[IFLA_IFNAME]) 1726 nla_strscpy(dev->name, tb[IFLA_IFNAME], IFNAMSIZ); 1727 else 1728 snprintf(dev->name, IFNAMSIZ, DRV_NAME "%%d"); 1729 1730 err = register_netdevice(dev); 1731 if (err < 0) 1732 goto err_register_dev; 1733 1734 netif_carrier_off(dev); 1735 1736 /* 1737 * tie the deviced together 1738 */ 1739 1740 priv = netdev_priv(dev); 1741 rcu_assign_pointer(priv->peer, peer); 1742 err = veth_init_queues(dev, tb); 1743 if (err) 1744 goto err_queues; 1745 1746 priv = netdev_priv(peer); 1747 rcu_assign_pointer(priv->peer, dev); 1748 err = veth_init_queues(peer, tb); 1749 if (err) 1750 goto err_queues; 1751 1752 veth_disable_gro(dev); 1753 return 0; 1754 1755 err_queues: 1756 unregister_netdevice(dev); 1757 err_register_dev: 1758 /* nothing to do */ 1759 err_configure_peer: 1760 unregister_netdevice(peer); 1761 return err; 1762 1763 err_register_peer: 1764 free_netdev(peer); 1765 return err; 1766 } 1767 1768 static void veth_dellink(struct net_device *dev, struct list_head *head) 1769 { 1770 struct veth_priv *priv; 1771 struct net_device *peer; 1772 1773 priv = netdev_priv(dev); 1774 peer = rtnl_dereference(priv->peer); 1775 1776 /* Note : dellink() is called from default_device_exit_batch(), 1777 * before a rcu_synchronize() point. The devices are guaranteed 1778 * not being freed before one RCU grace period. 1779 */ 1780 RCU_INIT_POINTER(priv->peer, NULL); 1781 unregister_netdevice_queue(dev, head); 1782 1783 if (peer) { 1784 priv = netdev_priv(peer); 1785 RCU_INIT_POINTER(priv->peer, NULL); 1786 unregister_netdevice_queue(peer, head); 1787 } 1788 } 1789 1790 static const struct nla_policy veth_policy[VETH_INFO_MAX + 1] = { 1791 [VETH_INFO_PEER] = { .len = sizeof(struct ifinfomsg) }, 1792 }; 1793 1794 static struct net *veth_get_link_net(const struct net_device *dev) 1795 { 1796 struct veth_priv *priv = netdev_priv(dev); 1797 struct net_device *peer = rtnl_dereference(priv->peer); 1798 1799 return peer ? dev_net(peer) : dev_net(dev); 1800 } 1801 1802 static unsigned int veth_get_num_queues(void) 1803 { 1804 /* enforce the same queue limit as rtnl_create_link */ 1805 int queues = num_possible_cpus(); 1806 1807 if (queues > 4096) 1808 queues = 4096; 1809 return queues; 1810 } 1811 1812 static struct rtnl_link_ops veth_link_ops = { 1813 .kind = DRV_NAME, 1814 .priv_size = sizeof(struct veth_priv), 1815 .setup = veth_setup, 1816 .validate = veth_validate, 1817 .newlink = veth_newlink, 1818 .dellink = veth_dellink, 1819 .policy = veth_policy, 1820 .maxtype = VETH_INFO_MAX, 1821 .get_link_net = veth_get_link_net, 1822 .get_num_tx_queues = veth_get_num_queues, 1823 .get_num_rx_queues = veth_get_num_queues, 1824 }; 1825 1826 /* 1827 * init/fini 1828 */ 1829 1830 static __init int veth_init(void) 1831 { 1832 return rtnl_link_register(&veth_link_ops); 1833 } 1834 1835 static __exit void veth_exit(void) 1836 { 1837 rtnl_link_unregister(&veth_link_ops); 1838 } 1839 1840 module_init(veth_init); 1841 module_exit(veth_exit); 1842 1843 MODULE_DESCRIPTION("Virtual Ethernet Tunnel"); 1844 MODULE_LICENSE("GPL v2"); 1845 MODULE_ALIAS_RTNL_LINK(DRV_NAME); 1846