1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * drivers/net/veth.c 4 * 5 * Copyright (C) 2007 OpenVZ http://openvz.org, SWsoft Inc 6 * 7 * Author: Pavel Emelianov <xemul@openvz.org> 8 * Ethtool interface from: Eric W. Biederman <ebiederm@xmission.com> 9 * 10 */ 11 12 #include <linux/netdevice.h> 13 #include <linux/slab.h> 14 #include <linux/ethtool.h> 15 #include <linux/etherdevice.h> 16 #include <linux/u64_stats_sync.h> 17 18 #include <net/rtnetlink.h> 19 #include <net/dst.h> 20 #include <net/xfrm.h> 21 #include <net/xdp.h> 22 #include <linux/veth.h> 23 #include <linux/module.h> 24 #include <linux/bpf.h> 25 #include <linux/filter.h> 26 #include <linux/ptr_ring.h> 27 #include <linux/bpf_trace.h> 28 #include <linux/net_tstamp.h> 29 30 #define DRV_NAME "veth" 31 #define DRV_VERSION "1.0" 32 33 #define VETH_XDP_FLAG BIT(0) 34 #define VETH_RING_SIZE 256 35 #define VETH_XDP_HEADROOM (XDP_PACKET_HEADROOM + NET_IP_ALIGN) 36 37 #define VETH_XDP_TX_BULK_SIZE 16 38 #define VETH_XDP_BATCH 16 39 40 struct veth_stats { 41 u64 rx_drops; 42 /* xdp */ 43 u64 xdp_packets; 44 u64 xdp_bytes; 45 u64 xdp_redirect; 46 u64 xdp_drops; 47 u64 xdp_tx; 48 u64 xdp_tx_err; 49 u64 peer_tq_xdp_xmit; 50 u64 peer_tq_xdp_xmit_err; 51 }; 52 53 struct veth_rq_stats { 54 struct veth_stats vs; 55 struct u64_stats_sync syncp; 56 }; 57 58 struct veth_rq { 59 struct napi_struct xdp_napi; 60 struct napi_struct __rcu *napi; /* points to xdp_napi when the latter is initialized */ 61 struct net_device *dev; 62 struct bpf_prog __rcu *xdp_prog; 63 struct xdp_mem_info xdp_mem; 64 struct veth_rq_stats stats; 65 bool rx_notify_masked; 66 struct ptr_ring xdp_ring; 67 struct xdp_rxq_info xdp_rxq; 68 }; 69 70 struct veth_priv { 71 struct net_device __rcu *peer; 72 atomic64_t dropped; 73 struct bpf_prog *_xdp_prog; 74 struct veth_rq *rq; 75 unsigned int requested_headroom; 76 }; 77 78 struct veth_xdp_tx_bq { 79 struct xdp_frame *q[VETH_XDP_TX_BULK_SIZE]; 80 unsigned int count; 81 }; 82 83 /* 84 * ethtool interface 85 */ 86 87 struct veth_q_stat_desc { 88 char desc[ETH_GSTRING_LEN]; 89 size_t offset; 90 }; 91 92 #define VETH_RQ_STAT(m) offsetof(struct veth_stats, m) 93 94 static const struct veth_q_stat_desc veth_rq_stats_desc[] = { 95 { "xdp_packets", VETH_RQ_STAT(xdp_packets) }, 96 { "xdp_bytes", VETH_RQ_STAT(xdp_bytes) }, 97 { "drops", VETH_RQ_STAT(rx_drops) }, 98 { "xdp_redirect", VETH_RQ_STAT(xdp_redirect) }, 99 { "xdp_drops", VETH_RQ_STAT(xdp_drops) }, 100 { "xdp_tx", VETH_RQ_STAT(xdp_tx) }, 101 { "xdp_tx_errors", VETH_RQ_STAT(xdp_tx_err) }, 102 }; 103 104 #define VETH_RQ_STATS_LEN ARRAY_SIZE(veth_rq_stats_desc) 105 106 static const struct veth_q_stat_desc veth_tq_stats_desc[] = { 107 { "xdp_xmit", VETH_RQ_STAT(peer_tq_xdp_xmit) }, 108 { "xdp_xmit_errors", VETH_RQ_STAT(peer_tq_xdp_xmit_err) }, 109 }; 110 111 #define VETH_TQ_STATS_LEN ARRAY_SIZE(veth_tq_stats_desc) 112 113 static struct { 114 const char string[ETH_GSTRING_LEN]; 115 } ethtool_stats_keys[] = { 116 { "peer_ifindex" }, 117 }; 118 119 struct veth_xdp_buff { 120 struct xdp_buff xdp; 121 struct sk_buff *skb; 122 }; 123 124 static int veth_get_link_ksettings(struct net_device *dev, 125 struct ethtool_link_ksettings *cmd) 126 { 127 cmd->base.speed = SPEED_10000; 128 cmd->base.duplex = DUPLEX_FULL; 129 cmd->base.port = PORT_TP; 130 cmd->base.autoneg = AUTONEG_DISABLE; 131 return 0; 132 } 133 134 static void veth_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info) 135 { 136 strscpy(info->driver, DRV_NAME, sizeof(info->driver)); 137 strscpy(info->version, DRV_VERSION, sizeof(info->version)); 138 } 139 140 static void veth_get_strings(struct net_device *dev, u32 stringset, u8 *buf) 141 { 142 u8 *p = buf; 143 int i, j; 144 145 switch(stringset) { 146 case ETH_SS_STATS: 147 memcpy(p, ðtool_stats_keys, sizeof(ethtool_stats_keys)); 148 p += sizeof(ethtool_stats_keys); 149 for (i = 0; i < dev->real_num_rx_queues; i++) 150 for (j = 0; j < VETH_RQ_STATS_LEN; j++) 151 ethtool_sprintf(&p, "rx_queue_%u_%.18s", 152 i, veth_rq_stats_desc[j].desc); 153 154 for (i = 0; i < dev->real_num_tx_queues; i++) 155 for (j = 0; j < VETH_TQ_STATS_LEN; j++) 156 ethtool_sprintf(&p, "tx_queue_%u_%.18s", 157 i, veth_tq_stats_desc[j].desc); 158 break; 159 } 160 } 161 162 static int veth_get_sset_count(struct net_device *dev, int sset) 163 { 164 switch (sset) { 165 case ETH_SS_STATS: 166 return ARRAY_SIZE(ethtool_stats_keys) + 167 VETH_RQ_STATS_LEN * dev->real_num_rx_queues + 168 VETH_TQ_STATS_LEN * dev->real_num_tx_queues; 169 default: 170 return -EOPNOTSUPP; 171 } 172 } 173 174 static void veth_get_ethtool_stats(struct net_device *dev, 175 struct ethtool_stats *stats, u64 *data) 176 { 177 struct veth_priv *rcv_priv, *priv = netdev_priv(dev); 178 struct net_device *peer = rtnl_dereference(priv->peer); 179 int i, j, idx; 180 181 data[0] = peer ? peer->ifindex : 0; 182 idx = 1; 183 for (i = 0; i < dev->real_num_rx_queues; i++) { 184 const struct veth_rq_stats *rq_stats = &priv->rq[i].stats; 185 const void *stats_base = (void *)&rq_stats->vs; 186 unsigned int start; 187 size_t offset; 188 189 do { 190 start = u64_stats_fetch_begin(&rq_stats->syncp); 191 for (j = 0; j < VETH_RQ_STATS_LEN; j++) { 192 offset = veth_rq_stats_desc[j].offset; 193 data[idx + j] = *(u64 *)(stats_base + offset); 194 } 195 } while (u64_stats_fetch_retry(&rq_stats->syncp, start)); 196 idx += VETH_RQ_STATS_LEN; 197 } 198 199 if (!peer) 200 return; 201 202 rcv_priv = netdev_priv(peer); 203 for (i = 0; i < peer->real_num_rx_queues; i++) { 204 const struct veth_rq_stats *rq_stats = &rcv_priv->rq[i].stats; 205 const void *base = (void *)&rq_stats->vs; 206 unsigned int start, tx_idx = idx; 207 size_t offset; 208 209 tx_idx += (i % dev->real_num_tx_queues) * VETH_TQ_STATS_LEN; 210 do { 211 start = u64_stats_fetch_begin(&rq_stats->syncp); 212 for (j = 0; j < VETH_TQ_STATS_LEN; j++) { 213 offset = veth_tq_stats_desc[j].offset; 214 data[tx_idx + j] += *(u64 *)(base + offset); 215 } 216 } while (u64_stats_fetch_retry(&rq_stats->syncp, start)); 217 } 218 } 219 220 static void veth_get_channels(struct net_device *dev, 221 struct ethtool_channels *channels) 222 { 223 channels->tx_count = dev->real_num_tx_queues; 224 channels->rx_count = dev->real_num_rx_queues; 225 channels->max_tx = dev->num_tx_queues; 226 channels->max_rx = dev->num_rx_queues; 227 } 228 229 static int veth_set_channels(struct net_device *dev, 230 struct ethtool_channels *ch); 231 232 static const struct ethtool_ops veth_ethtool_ops = { 233 .get_drvinfo = veth_get_drvinfo, 234 .get_link = ethtool_op_get_link, 235 .get_strings = veth_get_strings, 236 .get_sset_count = veth_get_sset_count, 237 .get_ethtool_stats = veth_get_ethtool_stats, 238 .get_link_ksettings = veth_get_link_ksettings, 239 .get_ts_info = ethtool_op_get_ts_info, 240 .get_channels = veth_get_channels, 241 .set_channels = veth_set_channels, 242 }; 243 244 /* general routines */ 245 246 static bool veth_is_xdp_frame(void *ptr) 247 { 248 return (unsigned long)ptr & VETH_XDP_FLAG; 249 } 250 251 static struct xdp_frame *veth_ptr_to_xdp(void *ptr) 252 { 253 return (void *)((unsigned long)ptr & ~VETH_XDP_FLAG); 254 } 255 256 static void *veth_xdp_to_ptr(struct xdp_frame *xdp) 257 { 258 return (void *)((unsigned long)xdp | VETH_XDP_FLAG); 259 } 260 261 static void veth_ptr_free(void *ptr) 262 { 263 if (veth_is_xdp_frame(ptr)) 264 xdp_return_frame(veth_ptr_to_xdp(ptr)); 265 else 266 kfree_skb(ptr); 267 } 268 269 static void __veth_xdp_flush(struct veth_rq *rq) 270 { 271 /* Write ptr_ring before reading rx_notify_masked */ 272 smp_mb(); 273 if (!READ_ONCE(rq->rx_notify_masked) && 274 napi_schedule_prep(&rq->xdp_napi)) { 275 WRITE_ONCE(rq->rx_notify_masked, true); 276 __napi_schedule(&rq->xdp_napi); 277 } 278 } 279 280 static int veth_xdp_rx(struct veth_rq *rq, struct sk_buff *skb) 281 { 282 if (unlikely(ptr_ring_produce(&rq->xdp_ring, skb))) { 283 dev_kfree_skb_any(skb); 284 return NET_RX_DROP; 285 } 286 287 return NET_RX_SUCCESS; 288 } 289 290 static int veth_forward_skb(struct net_device *dev, struct sk_buff *skb, 291 struct veth_rq *rq, bool xdp) 292 { 293 return __dev_forward_skb(dev, skb) ?: xdp ? 294 veth_xdp_rx(rq, skb) : 295 __netif_rx(skb); 296 } 297 298 /* return true if the specified skb has chances of GRO aggregation 299 * Don't strive for accuracy, but try to avoid GRO overhead in the most 300 * common scenarios. 301 * When XDP is enabled, all traffic is considered eligible, as the xmit 302 * device has TSO off. 303 * When TSO is enabled on the xmit device, we are likely interested only 304 * in UDP aggregation, explicitly check for that if the skb is suspected 305 * - the sock_wfree destructor is used by UDP, ICMP and XDP sockets - 306 * to belong to locally generated UDP traffic. 307 */ 308 static bool veth_skb_is_eligible_for_gro(const struct net_device *dev, 309 const struct net_device *rcv, 310 const struct sk_buff *skb) 311 { 312 return !(dev->features & NETIF_F_ALL_TSO) || 313 (skb->destructor == sock_wfree && 314 rcv->features & (NETIF_F_GRO_FRAGLIST | NETIF_F_GRO_UDP_FWD)); 315 } 316 317 static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev) 318 { 319 struct veth_priv *rcv_priv, *priv = netdev_priv(dev); 320 struct veth_rq *rq = NULL; 321 struct net_device *rcv; 322 int length = skb->len; 323 bool use_napi = false; 324 int rxq; 325 326 rcu_read_lock(); 327 rcv = rcu_dereference(priv->peer); 328 if (unlikely(!rcv) || !pskb_may_pull(skb, ETH_HLEN)) { 329 kfree_skb(skb); 330 goto drop; 331 } 332 333 rcv_priv = netdev_priv(rcv); 334 rxq = skb_get_queue_mapping(skb); 335 if (rxq < rcv->real_num_rx_queues) { 336 rq = &rcv_priv->rq[rxq]; 337 338 /* The napi pointer is available when an XDP program is 339 * attached or when GRO is enabled 340 * Don't bother with napi/GRO if the skb can't be aggregated 341 */ 342 use_napi = rcu_access_pointer(rq->napi) && 343 veth_skb_is_eligible_for_gro(dev, rcv, skb); 344 } 345 346 skb_tx_timestamp(skb); 347 if (likely(veth_forward_skb(rcv, skb, rq, use_napi) == NET_RX_SUCCESS)) { 348 if (!use_napi) 349 dev_lstats_add(dev, length); 350 } else { 351 drop: 352 atomic64_inc(&priv->dropped); 353 } 354 355 if (use_napi) 356 __veth_xdp_flush(rq); 357 358 rcu_read_unlock(); 359 360 return NETDEV_TX_OK; 361 } 362 363 static u64 veth_stats_tx(struct net_device *dev, u64 *packets, u64 *bytes) 364 { 365 struct veth_priv *priv = netdev_priv(dev); 366 367 dev_lstats_read(dev, packets, bytes); 368 return atomic64_read(&priv->dropped); 369 } 370 371 static void veth_stats_rx(struct veth_stats *result, struct net_device *dev) 372 { 373 struct veth_priv *priv = netdev_priv(dev); 374 int i; 375 376 result->peer_tq_xdp_xmit_err = 0; 377 result->xdp_packets = 0; 378 result->xdp_tx_err = 0; 379 result->xdp_bytes = 0; 380 result->rx_drops = 0; 381 for (i = 0; i < dev->num_rx_queues; i++) { 382 u64 packets, bytes, drops, xdp_tx_err, peer_tq_xdp_xmit_err; 383 struct veth_rq_stats *stats = &priv->rq[i].stats; 384 unsigned int start; 385 386 do { 387 start = u64_stats_fetch_begin(&stats->syncp); 388 peer_tq_xdp_xmit_err = stats->vs.peer_tq_xdp_xmit_err; 389 xdp_tx_err = stats->vs.xdp_tx_err; 390 packets = stats->vs.xdp_packets; 391 bytes = stats->vs.xdp_bytes; 392 drops = stats->vs.rx_drops; 393 } while (u64_stats_fetch_retry(&stats->syncp, start)); 394 result->peer_tq_xdp_xmit_err += peer_tq_xdp_xmit_err; 395 result->xdp_tx_err += xdp_tx_err; 396 result->xdp_packets += packets; 397 result->xdp_bytes += bytes; 398 result->rx_drops += drops; 399 } 400 } 401 402 static void veth_get_stats64(struct net_device *dev, 403 struct rtnl_link_stats64 *tot) 404 { 405 struct veth_priv *priv = netdev_priv(dev); 406 struct net_device *peer; 407 struct veth_stats rx; 408 u64 packets, bytes; 409 410 tot->tx_dropped = veth_stats_tx(dev, &packets, &bytes); 411 tot->tx_bytes = bytes; 412 tot->tx_packets = packets; 413 414 veth_stats_rx(&rx, dev); 415 tot->tx_dropped += rx.xdp_tx_err; 416 tot->rx_dropped = rx.rx_drops + rx.peer_tq_xdp_xmit_err; 417 tot->rx_bytes = rx.xdp_bytes; 418 tot->rx_packets = rx.xdp_packets; 419 420 rcu_read_lock(); 421 peer = rcu_dereference(priv->peer); 422 if (peer) { 423 veth_stats_tx(peer, &packets, &bytes); 424 tot->rx_bytes += bytes; 425 tot->rx_packets += packets; 426 427 veth_stats_rx(&rx, peer); 428 tot->tx_dropped += rx.peer_tq_xdp_xmit_err; 429 tot->rx_dropped += rx.xdp_tx_err; 430 tot->tx_bytes += rx.xdp_bytes; 431 tot->tx_packets += rx.xdp_packets; 432 } 433 rcu_read_unlock(); 434 } 435 436 /* fake multicast ability */ 437 static void veth_set_multicast_list(struct net_device *dev) 438 { 439 } 440 441 static int veth_select_rxq(struct net_device *dev) 442 { 443 return smp_processor_id() % dev->real_num_rx_queues; 444 } 445 446 static struct net_device *veth_peer_dev(struct net_device *dev) 447 { 448 struct veth_priv *priv = netdev_priv(dev); 449 450 /* Callers must be under RCU read side. */ 451 return rcu_dereference(priv->peer); 452 } 453 454 static int veth_xdp_xmit(struct net_device *dev, int n, 455 struct xdp_frame **frames, 456 u32 flags, bool ndo_xmit) 457 { 458 struct veth_priv *rcv_priv, *priv = netdev_priv(dev); 459 int i, ret = -ENXIO, nxmit = 0; 460 struct net_device *rcv; 461 unsigned int max_len; 462 struct veth_rq *rq; 463 464 if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK)) 465 return -EINVAL; 466 467 rcu_read_lock(); 468 rcv = rcu_dereference(priv->peer); 469 if (unlikely(!rcv)) 470 goto out; 471 472 rcv_priv = netdev_priv(rcv); 473 rq = &rcv_priv->rq[veth_select_rxq(rcv)]; 474 /* The napi pointer is set if NAPI is enabled, which ensures that 475 * xdp_ring is initialized on receive side and the peer device is up. 476 */ 477 if (!rcu_access_pointer(rq->napi)) 478 goto out; 479 480 max_len = rcv->mtu + rcv->hard_header_len + VLAN_HLEN; 481 482 spin_lock(&rq->xdp_ring.producer_lock); 483 for (i = 0; i < n; i++) { 484 struct xdp_frame *frame = frames[i]; 485 void *ptr = veth_xdp_to_ptr(frame); 486 487 if (unlikely(xdp_get_frame_len(frame) > max_len || 488 __ptr_ring_produce(&rq->xdp_ring, ptr))) 489 break; 490 nxmit++; 491 } 492 spin_unlock(&rq->xdp_ring.producer_lock); 493 494 if (flags & XDP_XMIT_FLUSH) 495 __veth_xdp_flush(rq); 496 497 ret = nxmit; 498 if (ndo_xmit) { 499 u64_stats_update_begin(&rq->stats.syncp); 500 rq->stats.vs.peer_tq_xdp_xmit += nxmit; 501 rq->stats.vs.peer_tq_xdp_xmit_err += n - nxmit; 502 u64_stats_update_end(&rq->stats.syncp); 503 } 504 505 out: 506 rcu_read_unlock(); 507 508 return ret; 509 } 510 511 static int veth_ndo_xdp_xmit(struct net_device *dev, int n, 512 struct xdp_frame **frames, u32 flags) 513 { 514 int err; 515 516 err = veth_xdp_xmit(dev, n, frames, flags, true); 517 if (err < 0) { 518 struct veth_priv *priv = netdev_priv(dev); 519 520 atomic64_add(n, &priv->dropped); 521 } 522 523 return err; 524 } 525 526 static void veth_xdp_flush_bq(struct veth_rq *rq, struct veth_xdp_tx_bq *bq) 527 { 528 int sent, i, err = 0, drops; 529 530 sent = veth_xdp_xmit(rq->dev, bq->count, bq->q, 0, false); 531 if (sent < 0) { 532 err = sent; 533 sent = 0; 534 } 535 536 for (i = sent; unlikely(i < bq->count); i++) 537 xdp_return_frame(bq->q[i]); 538 539 drops = bq->count - sent; 540 trace_xdp_bulk_tx(rq->dev, sent, drops, err); 541 542 u64_stats_update_begin(&rq->stats.syncp); 543 rq->stats.vs.xdp_tx += sent; 544 rq->stats.vs.xdp_tx_err += drops; 545 u64_stats_update_end(&rq->stats.syncp); 546 547 bq->count = 0; 548 } 549 550 static void veth_xdp_flush(struct veth_rq *rq, struct veth_xdp_tx_bq *bq) 551 { 552 struct veth_priv *rcv_priv, *priv = netdev_priv(rq->dev); 553 struct net_device *rcv; 554 struct veth_rq *rcv_rq; 555 556 rcu_read_lock(); 557 veth_xdp_flush_bq(rq, bq); 558 rcv = rcu_dereference(priv->peer); 559 if (unlikely(!rcv)) 560 goto out; 561 562 rcv_priv = netdev_priv(rcv); 563 rcv_rq = &rcv_priv->rq[veth_select_rxq(rcv)]; 564 /* xdp_ring is initialized on receive side? */ 565 if (unlikely(!rcu_access_pointer(rcv_rq->xdp_prog))) 566 goto out; 567 568 __veth_xdp_flush(rcv_rq); 569 out: 570 rcu_read_unlock(); 571 } 572 573 static int veth_xdp_tx(struct veth_rq *rq, struct xdp_buff *xdp, 574 struct veth_xdp_tx_bq *bq) 575 { 576 struct xdp_frame *frame = xdp_convert_buff_to_frame(xdp); 577 578 if (unlikely(!frame)) 579 return -EOVERFLOW; 580 581 if (unlikely(bq->count == VETH_XDP_TX_BULK_SIZE)) 582 veth_xdp_flush_bq(rq, bq); 583 584 bq->q[bq->count++] = frame; 585 586 return 0; 587 } 588 589 static struct xdp_frame *veth_xdp_rcv_one(struct veth_rq *rq, 590 struct xdp_frame *frame, 591 struct veth_xdp_tx_bq *bq, 592 struct veth_stats *stats) 593 { 594 struct xdp_frame orig_frame; 595 struct bpf_prog *xdp_prog; 596 597 rcu_read_lock(); 598 xdp_prog = rcu_dereference(rq->xdp_prog); 599 if (likely(xdp_prog)) { 600 struct veth_xdp_buff vxbuf; 601 struct xdp_buff *xdp = &vxbuf.xdp; 602 u32 act; 603 604 xdp_convert_frame_to_buff(frame, xdp); 605 xdp->rxq = &rq->xdp_rxq; 606 vxbuf.skb = NULL; 607 608 act = bpf_prog_run_xdp(xdp_prog, xdp); 609 610 switch (act) { 611 case XDP_PASS: 612 if (xdp_update_frame_from_buff(xdp, frame)) 613 goto err_xdp; 614 break; 615 case XDP_TX: 616 orig_frame = *frame; 617 xdp->rxq->mem = frame->mem; 618 if (unlikely(veth_xdp_tx(rq, xdp, bq) < 0)) { 619 trace_xdp_exception(rq->dev, xdp_prog, act); 620 frame = &orig_frame; 621 stats->rx_drops++; 622 goto err_xdp; 623 } 624 stats->xdp_tx++; 625 rcu_read_unlock(); 626 goto xdp_xmit; 627 case XDP_REDIRECT: 628 orig_frame = *frame; 629 xdp->rxq->mem = frame->mem; 630 if (xdp_do_redirect(rq->dev, xdp, xdp_prog)) { 631 frame = &orig_frame; 632 stats->rx_drops++; 633 goto err_xdp; 634 } 635 stats->xdp_redirect++; 636 rcu_read_unlock(); 637 goto xdp_xmit; 638 default: 639 bpf_warn_invalid_xdp_action(rq->dev, xdp_prog, act); 640 fallthrough; 641 case XDP_ABORTED: 642 trace_xdp_exception(rq->dev, xdp_prog, act); 643 fallthrough; 644 case XDP_DROP: 645 stats->xdp_drops++; 646 goto err_xdp; 647 } 648 } 649 rcu_read_unlock(); 650 651 return frame; 652 err_xdp: 653 rcu_read_unlock(); 654 xdp_return_frame(frame); 655 xdp_xmit: 656 return NULL; 657 } 658 659 /* frames array contains VETH_XDP_BATCH at most */ 660 static void veth_xdp_rcv_bulk_skb(struct veth_rq *rq, void **frames, 661 int n_xdpf, struct veth_xdp_tx_bq *bq, 662 struct veth_stats *stats) 663 { 664 void *skbs[VETH_XDP_BATCH]; 665 int i; 666 667 if (xdp_alloc_skb_bulk(skbs, n_xdpf, 668 GFP_ATOMIC | __GFP_ZERO) < 0) { 669 for (i = 0; i < n_xdpf; i++) 670 xdp_return_frame(frames[i]); 671 stats->rx_drops += n_xdpf; 672 673 return; 674 } 675 676 for (i = 0; i < n_xdpf; i++) { 677 struct sk_buff *skb = skbs[i]; 678 679 skb = __xdp_build_skb_from_frame(frames[i], skb, 680 rq->dev); 681 if (!skb) { 682 xdp_return_frame(frames[i]); 683 stats->rx_drops++; 684 continue; 685 } 686 napi_gro_receive(&rq->xdp_napi, skb); 687 } 688 } 689 690 static void veth_xdp_get(struct xdp_buff *xdp) 691 { 692 struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp); 693 int i; 694 695 get_page(virt_to_page(xdp->data)); 696 if (likely(!xdp_buff_has_frags(xdp))) 697 return; 698 699 for (i = 0; i < sinfo->nr_frags; i++) 700 __skb_frag_ref(&sinfo->frags[i]); 701 } 702 703 static int veth_convert_skb_to_xdp_buff(struct veth_rq *rq, 704 struct xdp_buff *xdp, 705 struct sk_buff **pskb) 706 { 707 struct sk_buff *skb = *pskb; 708 u32 frame_sz; 709 710 if (skb_shared(skb) || skb_head_is_locked(skb) || 711 skb_shinfo(skb)->nr_frags) { 712 u32 size, len, max_head_size, off; 713 struct sk_buff *nskb; 714 struct page *page; 715 int i, head_off; 716 717 /* We need a private copy of the skb and data buffers since 718 * the ebpf program can modify it. We segment the original skb 719 * into order-0 pages without linearize it. 720 * 721 * Make sure we have enough space for linear and paged area 722 */ 723 max_head_size = SKB_WITH_OVERHEAD(PAGE_SIZE - 724 VETH_XDP_HEADROOM); 725 if (skb->len > PAGE_SIZE * MAX_SKB_FRAGS + max_head_size) 726 goto drop; 727 728 /* Allocate skb head */ 729 page = alloc_page(GFP_ATOMIC | __GFP_NOWARN); 730 if (!page) 731 goto drop; 732 733 nskb = build_skb(page_address(page), PAGE_SIZE); 734 if (!nskb) { 735 put_page(page); 736 goto drop; 737 } 738 739 skb_reserve(nskb, VETH_XDP_HEADROOM); 740 size = min_t(u32, skb->len, max_head_size); 741 if (skb_copy_bits(skb, 0, nskb->data, size)) { 742 consume_skb(nskb); 743 goto drop; 744 } 745 skb_put(nskb, size); 746 747 skb_copy_header(nskb, skb); 748 head_off = skb_headroom(nskb) - skb_headroom(skb); 749 skb_headers_offset_update(nskb, head_off); 750 751 /* Allocate paged area of new skb */ 752 off = size; 753 len = skb->len - off; 754 755 for (i = 0; i < MAX_SKB_FRAGS && off < skb->len; i++) { 756 page = alloc_page(GFP_ATOMIC | __GFP_NOWARN); 757 if (!page) { 758 consume_skb(nskb); 759 goto drop; 760 } 761 762 size = min_t(u32, len, PAGE_SIZE); 763 skb_add_rx_frag(nskb, i, page, 0, size, PAGE_SIZE); 764 if (skb_copy_bits(skb, off, page_address(page), 765 size)) { 766 consume_skb(nskb); 767 goto drop; 768 } 769 770 len -= size; 771 off += size; 772 } 773 774 consume_skb(skb); 775 skb = nskb; 776 } else if (skb_headroom(skb) < XDP_PACKET_HEADROOM && 777 pskb_expand_head(skb, VETH_XDP_HEADROOM, 0, GFP_ATOMIC)) { 778 goto drop; 779 } 780 781 /* SKB "head" area always have tailroom for skb_shared_info */ 782 frame_sz = skb_end_pointer(skb) - skb->head; 783 frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); 784 xdp_init_buff(xdp, frame_sz, &rq->xdp_rxq); 785 xdp_prepare_buff(xdp, skb->head, skb_headroom(skb), 786 skb_headlen(skb), true); 787 788 if (skb_is_nonlinear(skb)) { 789 skb_shinfo(skb)->xdp_frags_size = skb->data_len; 790 xdp_buff_set_frags_flag(xdp); 791 } else { 792 xdp_buff_clear_frags_flag(xdp); 793 } 794 *pskb = skb; 795 796 return 0; 797 drop: 798 consume_skb(skb); 799 *pskb = NULL; 800 801 return -ENOMEM; 802 } 803 804 static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq, 805 struct sk_buff *skb, 806 struct veth_xdp_tx_bq *bq, 807 struct veth_stats *stats) 808 { 809 void *orig_data, *orig_data_end; 810 struct bpf_prog *xdp_prog; 811 struct veth_xdp_buff vxbuf; 812 struct xdp_buff *xdp = &vxbuf.xdp; 813 u32 act, metalen; 814 int off; 815 816 skb_prepare_for_gro(skb); 817 818 rcu_read_lock(); 819 xdp_prog = rcu_dereference(rq->xdp_prog); 820 if (unlikely(!xdp_prog)) { 821 rcu_read_unlock(); 822 goto out; 823 } 824 825 __skb_push(skb, skb->data - skb_mac_header(skb)); 826 if (veth_convert_skb_to_xdp_buff(rq, xdp, &skb)) 827 goto drop; 828 vxbuf.skb = skb; 829 830 orig_data = xdp->data; 831 orig_data_end = xdp->data_end; 832 833 act = bpf_prog_run_xdp(xdp_prog, xdp); 834 835 switch (act) { 836 case XDP_PASS: 837 break; 838 case XDP_TX: 839 veth_xdp_get(xdp); 840 consume_skb(skb); 841 xdp->rxq->mem = rq->xdp_mem; 842 if (unlikely(veth_xdp_tx(rq, xdp, bq) < 0)) { 843 trace_xdp_exception(rq->dev, xdp_prog, act); 844 stats->rx_drops++; 845 goto err_xdp; 846 } 847 stats->xdp_tx++; 848 rcu_read_unlock(); 849 goto xdp_xmit; 850 case XDP_REDIRECT: 851 veth_xdp_get(xdp); 852 consume_skb(skb); 853 xdp->rxq->mem = rq->xdp_mem; 854 if (xdp_do_redirect(rq->dev, xdp, xdp_prog)) { 855 stats->rx_drops++; 856 goto err_xdp; 857 } 858 stats->xdp_redirect++; 859 rcu_read_unlock(); 860 goto xdp_xmit; 861 default: 862 bpf_warn_invalid_xdp_action(rq->dev, xdp_prog, act); 863 fallthrough; 864 case XDP_ABORTED: 865 trace_xdp_exception(rq->dev, xdp_prog, act); 866 fallthrough; 867 case XDP_DROP: 868 stats->xdp_drops++; 869 goto xdp_drop; 870 } 871 rcu_read_unlock(); 872 873 /* check if bpf_xdp_adjust_head was used */ 874 off = orig_data - xdp->data; 875 if (off > 0) 876 __skb_push(skb, off); 877 else if (off < 0) 878 __skb_pull(skb, -off); 879 880 skb_reset_mac_header(skb); 881 882 /* check if bpf_xdp_adjust_tail was used */ 883 off = xdp->data_end - orig_data_end; 884 if (off != 0) 885 __skb_put(skb, off); /* positive on grow, negative on shrink */ 886 887 /* XDP frag metadata (e.g. nr_frags) are updated in eBPF helpers 888 * (e.g. bpf_xdp_adjust_tail), we need to update data_len here. 889 */ 890 if (xdp_buff_has_frags(xdp)) 891 skb->data_len = skb_shinfo(skb)->xdp_frags_size; 892 else 893 skb->data_len = 0; 894 895 skb->protocol = eth_type_trans(skb, rq->dev); 896 897 metalen = xdp->data - xdp->data_meta; 898 if (metalen) 899 skb_metadata_set(skb, metalen); 900 out: 901 return skb; 902 drop: 903 stats->rx_drops++; 904 xdp_drop: 905 rcu_read_unlock(); 906 kfree_skb(skb); 907 return NULL; 908 err_xdp: 909 rcu_read_unlock(); 910 xdp_return_buff(xdp); 911 xdp_xmit: 912 return NULL; 913 } 914 915 static int veth_xdp_rcv(struct veth_rq *rq, int budget, 916 struct veth_xdp_tx_bq *bq, 917 struct veth_stats *stats) 918 { 919 int i, done = 0, n_xdpf = 0; 920 void *xdpf[VETH_XDP_BATCH]; 921 922 for (i = 0; i < budget; i++) { 923 void *ptr = __ptr_ring_consume(&rq->xdp_ring); 924 925 if (!ptr) 926 break; 927 928 if (veth_is_xdp_frame(ptr)) { 929 /* ndo_xdp_xmit */ 930 struct xdp_frame *frame = veth_ptr_to_xdp(ptr); 931 932 stats->xdp_bytes += xdp_get_frame_len(frame); 933 frame = veth_xdp_rcv_one(rq, frame, bq, stats); 934 if (frame) { 935 /* XDP_PASS */ 936 xdpf[n_xdpf++] = frame; 937 if (n_xdpf == VETH_XDP_BATCH) { 938 veth_xdp_rcv_bulk_skb(rq, xdpf, n_xdpf, 939 bq, stats); 940 n_xdpf = 0; 941 } 942 } 943 } else { 944 /* ndo_start_xmit */ 945 struct sk_buff *skb = ptr; 946 947 stats->xdp_bytes += skb->len; 948 skb = veth_xdp_rcv_skb(rq, skb, bq, stats); 949 if (skb) { 950 if (skb_shared(skb) || skb_unclone(skb, GFP_ATOMIC)) 951 netif_receive_skb(skb); 952 else 953 napi_gro_receive(&rq->xdp_napi, skb); 954 } 955 } 956 done++; 957 } 958 959 if (n_xdpf) 960 veth_xdp_rcv_bulk_skb(rq, xdpf, n_xdpf, bq, stats); 961 962 u64_stats_update_begin(&rq->stats.syncp); 963 rq->stats.vs.xdp_redirect += stats->xdp_redirect; 964 rq->stats.vs.xdp_bytes += stats->xdp_bytes; 965 rq->stats.vs.xdp_drops += stats->xdp_drops; 966 rq->stats.vs.rx_drops += stats->rx_drops; 967 rq->stats.vs.xdp_packets += done; 968 u64_stats_update_end(&rq->stats.syncp); 969 970 return done; 971 } 972 973 static int veth_poll(struct napi_struct *napi, int budget) 974 { 975 struct veth_rq *rq = 976 container_of(napi, struct veth_rq, xdp_napi); 977 struct veth_stats stats = {}; 978 struct veth_xdp_tx_bq bq; 979 int done; 980 981 bq.count = 0; 982 983 xdp_set_return_frame_no_direct(); 984 done = veth_xdp_rcv(rq, budget, &bq, &stats); 985 986 if (stats.xdp_redirect > 0) 987 xdp_do_flush(); 988 989 if (done < budget && napi_complete_done(napi, done)) { 990 /* Write rx_notify_masked before reading ptr_ring */ 991 smp_store_mb(rq->rx_notify_masked, false); 992 if (unlikely(!__ptr_ring_empty(&rq->xdp_ring))) { 993 if (napi_schedule_prep(&rq->xdp_napi)) { 994 WRITE_ONCE(rq->rx_notify_masked, true); 995 __napi_schedule(&rq->xdp_napi); 996 } 997 } 998 } 999 1000 if (stats.xdp_tx > 0) 1001 veth_xdp_flush(rq, &bq); 1002 xdp_clear_return_frame_no_direct(); 1003 1004 return done; 1005 } 1006 1007 static int __veth_napi_enable_range(struct net_device *dev, int start, int end) 1008 { 1009 struct veth_priv *priv = netdev_priv(dev); 1010 int err, i; 1011 1012 for (i = start; i < end; i++) { 1013 struct veth_rq *rq = &priv->rq[i]; 1014 1015 err = ptr_ring_init(&rq->xdp_ring, VETH_RING_SIZE, GFP_KERNEL); 1016 if (err) 1017 goto err_xdp_ring; 1018 } 1019 1020 for (i = start; i < end; i++) { 1021 struct veth_rq *rq = &priv->rq[i]; 1022 1023 napi_enable(&rq->xdp_napi); 1024 rcu_assign_pointer(priv->rq[i].napi, &priv->rq[i].xdp_napi); 1025 } 1026 1027 return 0; 1028 1029 err_xdp_ring: 1030 for (i--; i >= start; i--) 1031 ptr_ring_cleanup(&priv->rq[i].xdp_ring, veth_ptr_free); 1032 1033 return err; 1034 } 1035 1036 static int __veth_napi_enable(struct net_device *dev) 1037 { 1038 return __veth_napi_enable_range(dev, 0, dev->real_num_rx_queues); 1039 } 1040 1041 static void veth_napi_del_range(struct net_device *dev, int start, int end) 1042 { 1043 struct veth_priv *priv = netdev_priv(dev); 1044 int i; 1045 1046 for (i = start; i < end; i++) { 1047 struct veth_rq *rq = &priv->rq[i]; 1048 1049 rcu_assign_pointer(priv->rq[i].napi, NULL); 1050 napi_disable(&rq->xdp_napi); 1051 __netif_napi_del(&rq->xdp_napi); 1052 } 1053 synchronize_net(); 1054 1055 for (i = start; i < end; i++) { 1056 struct veth_rq *rq = &priv->rq[i]; 1057 1058 rq->rx_notify_masked = false; 1059 ptr_ring_cleanup(&rq->xdp_ring, veth_ptr_free); 1060 } 1061 } 1062 1063 static void veth_napi_del(struct net_device *dev) 1064 { 1065 veth_napi_del_range(dev, 0, dev->real_num_rx_queues); 1066 } 1067 1068 static bool veth_gro_requested(const struct net_device *dev) 1069 { 1070 return !!(dev->wanted_features & NETIF_F_GRO); 1071 } 1072 1073 static int veth_enable_xdp_range(struct net_device *dev, int start, int end, 1074 bool napi_already_on) 1075 { 1076 struct veth_priv *priv = netdev_priv(dev); 1077 int err, i; 1078 1079 for (i = start; i < end; i++) { 1080 struct veth_rq *rq = &priv->rq[i]; 1081 1082 if (!napi_already_on) 1083 netif_napi_add(dev, &rq->xdp_napi, veth_poll); 1084 err = xdp_rxq_info_reg(&rq->xdp_rxq, dev, i, rq->xdp_napi.napi_id); 1085 if (err < 0) 1086 goto err_rxq_reg; 1087 1088 err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq, 1089 MEM_TYPE_PAGE_SHARED, 1090 NULL); 1091 if (err < 0) 1092 goto err_reg_mem; 1093 1094 /* Save original mem info as it can be overwritten */ 1095 rq->xdp_mem = rq->xdp_rxq.mem; 1096 } 1097 return 0; 1098 1099 err_reg_mem: 1100 xdp_rxq_info_unreg(&priv->rq[i].xdp_rxq); 1101 err_rxq_reg: 1102 for (i--; i >= start; i--) { 1103 struct veth_rq *rq = &priv->rq[i]; 1104 1105 xdp_rxq_info_unreg(&rq->xdp_rxq); 1106 if (!napi_already_on) 1107 netif_napi_del(&rq->xdp_napi); 1108 } 1109 1110 return err; 1111 } 1112 1113 static void veth_disable_xdp_range(struct net_device *dev, int start, int end, 1114 bool delete_napi) 1115 { 1116 struct veth_priv *priv = netdev_priv(dev); 1117 int i; 1118 1119 for (i = start; i < end; i++) { 1120 struct veth_rq *rq = &priv->rq[i]; 1121 1122 rq->xdp_rxq.mem = rq->xdp_mem; 1123 xdp_rxq_info_unreg(&rq->xdp_rxq); 1124 1125 if (delete_napi) 1126 netif_napi_del(&rq->xdp_napi); 1127 } 1128 } 1129 1130 static int veth_enable_xdp(struct net_device *dev) 1131 { 1132 bool napi_already_on = veth_gro_requested(dev) && (dev->flags & IFF_UP); 1133 struct veth_priv *priv = netdev_priv(dev); 1134 int err, i; 1135 1136 if (!xdp_rxq_info_is_reg(&priv->rq[0].xdp_rxq)) { 1137 err = veth_enable_xdp_range(dev, 0, dev->real_num_rx_queues, napi_already_on); 1138 if (err) 1139 return err; 1140 1141 if (!napi_already_on) { 1142 err = __veth_napi_enable(dev); 1143 if (err) { 1144 veth_disable_xdp_range(dev, 0, dev->real_num_rx_queues, true); 1145 return err; 1146 } 1147 1148 if (!veth_gro_requested(dev)) { 1149 /* user-space did not require GRO, but adding XDP 1150 * is supposed to get GRO working 1151 */ 1152 dev->features |= NETIF_F_GRO; 1153 netdev_features_change(dev); 1154 } 1155 } 1156 } 1157 1158 for (i = 0; i < dev->real_num_rx_queues; i++) { 1159 rcu_assign_pointer(priv->rq[i].xdp_prog, priv->_xdp_prog); 1160 rcu_assign_pointer(priv->rq[i].napi, &priv->rq[i].xdp_napi); 1161 } 1162 1163 return 0; 1164 } 1165 1166 static void veth_disable_xdp(struct net_device *dev) 1167 { 1168 struct veth_priv *priv = netdev_priv(dev); 1169 int i; 1170 1171 for (i = 0; i < dev->real_num_rx_queues; i++) 1172 rcu_assign_pointer(priv->rq[i].xdp_prog, NULL); 1173 1174 if (!netif_running(dev) || !veth_gro_requested(dev)) { 1175 veth_napi_del(dev); 1176 1177 /* if user-space did not require GRO, since adding XDP 1178 * enabled it, clear it now 1179 */ 1180 if (!veth_gro_requested(dev) && netif_running(dev)) { 1181 dev->features &= ~NETIF_F_GRO; 1182 netdev_features_change(dev); 1183 } 1184 } 1185 1186 veth_disable_xdp_range(dev, 0, dev->real_num_rx_queues, false); 1187 } 1188 1189 static int veth_napi_enable_range(struct net_device *dev, int start, int end) 1190 { 1191 struct veth_priv *priv = netdev_priv(dev); 1192 int err, i; 1193 1194 for (i = start; i < end; i++) { 1195 struct veth_rq *rq = &priv->rq[i]; 1196 1197 netif_napi_add(dev, &rq->xdp_napi, veth_poll); 1198 } 1199 1200 err = __veth_napi_enable_range(dev, start, end); 1201 if (err) { 1202 for (i = start; i < end; i++) { 1203 struct veth_rq *rq = &priv->rq[i]; 1204 1205 netif_napi_del(&rq->xdp_napi); 1206 } 1207 return err; 1208 } 1209 return err; 1210 } 1211 1212 static int veth_napi_enable(struct net_device *dev) 1213 { 1214 return veth_napi_enable_range(dev, 0, dev->real_num_rx_queues); 1215 } 1216 1217 static void veth_disable_range_safe(struct net_device *dev, int start, int end) 1218 { 1219 struct veth_priv *priv = netdev_priv(dev); 1220 1221 if (start >= end) 1222 return; 1223 1224 if (priv->_xdp_prog) { 1225 veth_napi_del_range(dev, start, end); 1226 veth_disable_xdp_range(dev, start, end, false); 1227 } else if (veth_gro_requested(dev)) { 1228 veth_napi_del_range(dev, start, end); 1229 } 1230 } 1231 1232 static int veth_enable_range_safe(struct net_device *dev, int start, int end) 1233 { 1234 struct veth_priv *priv = netdev_priv(dev); 1235 int err; 1236 1237 if (start >= end) 1238 return 0; 1239 1240 if (priv->_xdp_prog) { 1241 /* these channels are freshly initialized, napi is not on there even 1242 * when GRO is requeste 1243 */ 1244 err = veth_enable_xdp_range(dev, start, end, false); 1245 if (err) 1246 return err; 1247 1248 err = __veth_napi_enable_range(dev, start, end); 1249 if (err) { 1250 /* on error always delete the newly added napis */ 1251 veth_disable_xdp_range(dev, start, end, true); 1252 return err; 1253 } 1254 } else if (veth_gro_requested(dev)) { 1255 return veth_napi_enable_range(dev, start, end); 1256 } 1257 return 0; 1258 } 1259 1260 static int veth_set_channels(struct net_device *dev, 1261 struct ethtool_channels *ch) 1262 { 1263 struct veth_priv *priv = netdev_priv(dev); 1264 unsigned int old_rx_count, new_rx_count; 1265 struct veth_priv *peer_priv; 1266 struct net_device *peer; 1267 int err; 1268 1269 /* sanity check. Upper bounds are already enforced by the caller */ 1270 if (!ch->rx_count || !ch->tx_count) 1271 return -EINVAL; 1272 1273 /* avoid braking XDP, if that is enabled */ 1274 peer = rtnl_dereference(priv->peer); 1275 peer_priv = peer ? netdev_priv(peer) : NULL; 1276 if (priv->_xdp_prog && peer && ch->rx_count < peer->real_num_tx_queues) 1277 return -EINVAL; 1278 1279 if (peer && peer_priv && peer_priv->_xdp_prog && ch->tx_count > peer->real_num_rx_queues) 1280 return -EINVAL; 1281 1282 old_rx_count = dev->real_num_rx_queues; 1283 new_rx_count = ch->rx_count; 1284 if (netif_running(dev)) { 1285 /* turn device off */ 1286 netif_carrier_off(dev); 1287 if (peer) 1288 netif_carrier_off(peer); 1289 1290 /* try to allocate new resurces, as needed*/ 1291 err = veth_enable_range_safe(dev, old_rx_count, new_rx_count); 1292 if (err) 1293 goto out; 1294 } 1295 1296 err = netif_set_real_num_rx_queues(dev, ch->rx_count); 1297 if (err) 1298 goto revert; 1299 1300 err = netif_set_real_num_tx_queues(dev, ch->tx_count); 1301 if (err) { 1302 int err2 = netif_set_real_num_rx_queues(dev, old_rx_count); 1303 1304 /* this error condition could happen only if rx and tx change 1305 * in opposite directions (e.g. tx nr raises, rx nr decreases) 1306 * and we can't do anything to fully restore the original 1307 * status 1308 */ 1309 if (err2) 1310 pr_warn("Can't restore rx queues config %d -> %d %d", 1311 new_rx_count, old_rx_count, err2); 1312 else 1313 goto revert; 1314 } 1315 1316 out: 1317 if (netif_running(dev)) { 1318 /* note that we need to swap the arguments WRT the enable part 1319 * to identify the range we have to disable 1320 */ 1321 veth_disable_range_safe(dev, new_rx_count, old_rx_count); 1322 netif_carrier_on(dev); 1323 if (peer) 1324 netif_carrier_on(peer); 1325 } 1326 return err; 1327 1328 revert: 1329 new_rx_count = old_rx_count; 1330 old_rx_count = ch->rx_count; 1331 goto out; 1332 } 1333 1334 static int veth_open(struct net_device *dev) 1335 { 1336 struct veth_priv *priv = netdev_priv(dev); 1337 struct net_device *peer = rtnl_dereference(priv->peer); 1338 int err; 1339 1340 if (!peer) 1341 return -ENOTCONN; 1342 1343 if (priv->_xdp_prog) { 1344 err = veth_enable_xdp(dev); 1345 if (err) 1346 return err; 1347 } else if (veth_gro_requested(dev)) { 1348 err = veth_napi_enable(dev); 1349 if (err) 1350 return err; 1351 } 1352 1353 if (peer->flags & IFF_UP) { 1354 netif_carrier_on(dev); 1355 netif_carrier_on(peer); 1356 } 1357 1358 return 0; 1359 } 1360 1361 static int veth_close(struct net_device *dev) 1362 { 1363 struct veth_priv *priv = netdev_priv(dev); 1364 struct net_device *peer = rtnl_dereference(priv->peer); 1365 1366 netif_carrier_off(dev); 1367 if (peer) 1368 netif_carrier_off(peer); 1369 1370 if (priv->_xdp_prog) 1371 veth_disable_xdp(dev); 1372 else if (veth_gro_requested(dev)) 1373 veth_napi_del(dev); 1374 1375 return 0; 1376 } 1377 1378 static int is_valid_veth_mtu(int mtu) 1379 { 1380 return mtu >= ETH_MIN_MTU && mtu <= ETH_MAX_MTU; 1381 } 1382 1383 static int veth_alloc_queues(struct net_device *dev) 1384 { 1385 struct veth_priv *priv = netdev_priv(dev); 1386 int i; 1387 1388 priv->rq = kcalloc(dev->num_rx_queues, sizeof(*priv->rq), GFP_KERNEL_ACCOUNT); 1389 if (!priv->rq) 1390 return -ENOMEM; 1391 1392 for (i = 0; i < dev->num_rx_queues; i++) { 1393 priv->rq[i].dev = dev; 1394 u64_stats_init(&priv->rq[i].stats.syncp); 1395 } 1396 1397 return 0; 1398 } 1399 1400 static void veth_free_queues(struct net_device *dev) 1401 { 1402 struct veth_priv *priv = netdev_priv(dev); 1403 1404 kfree(priv->rq); 1405 } 1406 1407 static int veth_dev_init(struct net_device *dev) 1408 { 1409 int err; 1410 1411 dev->lstats = netdev_alloc_pcpu_stats(struct pcpu_lstats); 1412 if (!dev->lstats) 1413 return -ENOMEM; 1414 1415 err = veth_alloc_queues(dev); 1416 if (err) { 1417 free_percpu(dev->lstats); 1418 return err; 1419 } 1420 1421 return 0; 1422 } 1423 1424 static void veth_dev_free(struct net_device *dev) 1425 { 1426 veth_free_queues(dev); 1427 free_percpu(dev->lstats); 1428 } 1429 1430 #ifdef CONFIG_NET_POLL_CONTROLLER 1431 static void veth_poll_controller(struct net_device *dev) 1432 { 1433 /* veth only receives frames when its peer sends one 1434 * Since it has nothing to do with disabling irqs, we are guaranteed 1435 * never to have pending data when we poll for it so 1436 * there is nothing to do here. 1437 * 1438 * We need this though so netpoll recognizes us as an interface that 1439 * supports polling, which enables bridge devices in virt setups to 1440 * still use netconsole 1441 */ 1442 } 1443 #endif /* CONFIG_NET_POLL_CONTROLLER */ 1444 1445 static int veth_get_iflink(const struct net_device *dev) 1446 { 1447 struct veth_priv *priv = netdev_priv(dev); 1448 struct net_device *peer; 1449 int iflink; 1450 1451 rcu_read_lock(); 1452 peer = rcu_dereference(priv->peer); 1453 iflink = peer ? peer->ifindex : 0; 1454 rcu_read_unlock(); 1455 1456 return iflink; 1457 } 1458 1459 static netdev_features_t veth_fix_features(struct net_device *dev, 1460 netdev_features_t features) 1461 { 1462 struct veth_priv *priv = netdev_priv(dev); 1463 struct net_device *peer; 1464 1465 peer = rtnl_dereference(priv->peer); 1466 if (peer) { 1467 struct veth_priv *peer_priv = netdev_priv(peer); 1468 1469 if (peer_priv->_xdp_prog) 1470 features &= ~NETIF_F_GSO_SOFTWARE; 1471 } 1472 if (priv->_xdp_prog) 1473 features |= NETIF_F_GRO; 1474 1475 return features; 1476 } 1477 1478 static int veth_set_features(struct net_device *dev, 1479 netdev_features_t features) 1480 { 1481 netdev_features_t changed = features ^ dev->features; 1482 struct veth_priv *priv = netdev_priv(dev); 1483 int err; 1484 1485 if (!(changed & NETIF_F_GRO) || !(dev->flags & IFF_UP) || priv->_xdp_prog) 1486 return 0; 1487 1488 if (features & NETIF_F_GRO) { 1489 err = veth_napi_enable(dev); 1490 if (err) 1491 return err; 1492 } else { 1493 veth_napi_del(dev); 1494 } 1495 return 0; 1496 } 1497 1498 static void veth_set_rx_headroom(struct net_device *dev, int new_hr) 1499 { 1500 struct veth_priv *peer_priv, *priv = netdev_priv(dev); 1501 struct net_device *peer; 1502 1503 if (new_hr < 0) 1504 new_hr = 0; 1505 1506 rcu_read_lock(); 1507 peer = rcu_dereference(priv->peer); 1508 if (unlikely(!peer)) 1509 goto out; 1510 1511 peer_priv = netdev_priv(peer); 1512 priv->requested_headroom = new_hr; 1513 new_hr = max(priv->requested_headroom, peer_priv->requested_headroom); 1514 dev->needed_headroom = new_hr; 1515 peer->needed_headroom = new_hr; 1516 1517 out: 1518 rcu_read_unlock(); 1519 } 1520 1521 static int veth_xdp_set(struct net_device *dev, struct bpf_prog *prog, 1522 struct netlink_ext_ack *extack) 1523 { 1524 struct veth_priv *priv = netdev_priv(dev); 1525 struct bpf_prog *old_prog; 1526 struct net_device *peer; 1527 unsigned int max_mtu; 1528 int err; 1529 1530 old_prog = priv->_xdp_prog; 1531 priv->_xdp_prog = prog; 1532 peer = rtnl_dereference(priv->peer); 1533 1534 if (prog) { 1535 if (!peer) { 1536 NL_SET_ERR_MSG_MOD(extack, "Cannot set XDP when peer is detached"); 1537 err = -ENOTCONN; 1538 goto err; 1539 } 1540 1541 max_mtu = SKB_WITH_OVERHEAD(PAGE_SIZE - VETH_XDP_HEADROOM) - 1542 peer->hard_header_len; 1543 /* Allow increasing the max_mtu if the program supports 1544 * XDP fragments. 1545 */ 1546 if (prog->aux->xdp_has_frags) 1547 max_mtu += PAGE_SIZE * MAX_SKB_FRAGS; 1548 1549 if (peer->mtu > max_mtu) { 1550 NL_SET_ERR_MSG_MOD(extack, "Peer MTU is too large to set XDP"); 1551 err = -ERANGE; 1552 goto err; 1553 } 1554 1555 if (dev->real_num_rx_queues < peer->real_num_tx_queues) { 1556 NL_SET_ERR_MSG_MOD(extack, "XDP expects number of rx queues not less than peer tx queues"); 1557 err = -ENOSPC; 1558 goto err; 1559 } 1560 1561 if (dev->flags & IFF_UP) { 1562 err = veth_enable_xdp(dev); 1563 if (err) { 1564 NL_SET_ERR_MSG_MOD(extack, "Setup for XDP failed"); 1565 goto err; 1566 } 1567 } 1568 1569 if (!old_prog) { 1570 peer->hw_features &= ~NETIF_F_GSO_SOFTWARE; 1571 peer->max_mtu = max_mtu; 1572 } 1573 } 1574 1575 if (old_prog) { 1576 if (!prog) { 1577 if (dev->flags & IFF_UP) 1578 veth_disable_xdp(dev); 1579 1580 if (peer) { 1581 peer->hw_features |= NETIF_F_GSO_SOFTWARE; 1582 peer->max_mtu = ETH_MAX_MTU; 1583 } 1584 } 1585 bpf_prog_put(old_prog); 1586 } 1587 1588 if ((!!old_prog ^ !!prog) && peer) 1589 netdev_update_features(peer); 1590 1591 return 0; 1592 err: 1593 priv->_xdp_prog = old_prog; 1594 1595 return err; 1596 } 1597 1598 static int veth_xdp(struct net_device *dev, struct netdev_bpf *xdp) 1599 { 1600 switch (xdp->command) { 1601 case XDP_SETUP_PROG: 1602 return veth_xdp_set(dev, xdp->prog, xdp->extack); 1603 default: 1604 return -EINVAL; 1605 } 1606 } 1607 1608 static int veth_xdp_rx_timestamp(const struct xdp_md *ctx, u64 *timestamp) 1609 { 1610 struct veth_xdp_buff *_ctx = (void *)ctx; 1611 1612 if (!_ctx->skb) 1613 return -EOPNOTSUPP; 1614 1615 *timestamp = skb_hwtstamps(_ctx->skb)->hwtstamp; 1616 return 0; 1617 } 1618 1619 static int veth_xdp_rx_hash(const struct xdp_md *ctx, u32 *hash) 1620 { 1621 struct veth_xdp_buff *_ctx = (void *)ctx; 1622 1623 if (!_ctx->skb) 1624 return -EOPNOTSUPP; 1625 1626 *hash = skb_get_hash(_ctx->skb); 1627 return 0; 1628 } 1629 1630 static const struct net_device_ops veth_netdev_ops = { 1631 .ndo_init = veth_dev_init, 1632 .ndo_open = veth_open, 1633 .ndo_stop = veth_close, 1634 .ndo_start_xmit = veth_xmit, 1635 .ndo_get_stats64 = veth_get_stats64, 1636 .ndo_set_rx_mode = veth_set_multicast_list, 1637 .ndo_set_mac_address = eth_mac_addr, 1638 #ifdef CONFIG_NET_POLL_CONTROLLER 1639 .ndo_poll_controller = veth_poll_controller, 1640 #endif 1641 .ndo_get_iflink = veth_get_iflink, 1642 .ndo_fix_features = veth_fix_features, 1643 .ndo_set_features = veth_set_features, 1644 .ndo_features_check = passthru_features_check, 1645 .ndo_set_rx_headroom = veth_set_rx_headroom, 1646 .ndo_bpf = veth_xdp, 1647 .ndo_xdp_xmit = veth_ndo_xdp_xmit, 1648 .ndo_get_peer_dev = veth_peer_dev, 1649 }; 1650 1651 static const struct xdp_metadata_ops veth_xdp_metadata_ops = { 1652 .xmo_rx_timestamp = veth_xdp_rx_timestamp, 1653 .xmo_rx_hash = veth_xdp_rx_hash, 1654 }; 1655 1656 #define VETH_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HW_CSUM | \ 1657 NETIF_F_RXCSUM | NETIF_F_SCTP_CRC | NETIF_F_HIGHDMA | \ 1658 NETIF_F_GSO_SOFTWARE | NETIF_F_GSO_ENCAP_ALL | \ 1659 NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX | \ 1660 NETIF_F_HW_VLAN_STAG_TX | NETIF_F_HW_VLAN_STAG_RX ) 1661 1662 static void veth_setup(struct net_device *dev) 1663 { 1664 ether_setup(dev); 1665 1666 dev->priv_flags &= ~IFF_TX_SKB_SHARING; 1667 dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; 1668 dev->priv_flags |= IFF_NO_QUEUE; 1669 dev->priv_flags |= IFF_PHONY_HEADROOM; 1670 1671 dev->netdev_ops = &veth_netdev_ops; 1672 dev->xdp_metadata_ops = &veth_xdp_metadata_ops; 1673 dev->ethtool_ops = &veth_ethtool_ops; 1674 dev->features |= NETIF_F_LLTX; 1675 dev->features |= VETH_FEATURES; 1676 dev->vlan_features = dev->features & 1677 ~(NETIF_F_HW_VLAN_CTAG_TX | 1678 NETIF_F_HW_VLAN_STAG_TX | 1679 NETIF_F_HW_VLAN_CTAG_RX | 1680 NETIF_F_HW_VLAN_STAG_RX); 1681 dev->needs_free_netdev = true; 1682 dev->priv_destructor = veth_dev_free; 1683 dev->max_mtu = ETH_MAX_MTU; 1684 1685 dev->hw_features = VETH_FEATURES; 1686 dev->hw_enc_features = VETH_FEATURES; 1687 dev->mpls_features = NETIF_F_HW_CSUM | NETIF_F_GSO_SOFTWARE; 1688 netif_set_tso_max_size(dev, GSO_MAX_SIZE); 1689 } 1690 1691 /* 1692 * netlink interface 1693 */ 1694 1695 static int veth_validate(struct nlattr *tb[], struct nlattr *data[], 1696 struct netlink_ext_ack *extack) 1697 { 1698 if (tb[IFLA_ADDRESS]) { 1699 if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) 1700 return -EINVAL; 1701 if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) 1702 return -EADDRNOTAVAIL; 1703 } 1704 if (tb[IFLA_MTU]) { 1705 if (!is_valid_veth_mtu(nla_get_u32(tb[IFLA_MTU]))) 1706 return -EINVAL; 1707 } 1708 return 0; 1709 } 1710 1711 static struct rtnl_link_ops veth_link_ops; 1712 1713 static void veth_disable_gro(struct net_device *dev) 1714 { 1715 dev->features &= ~NETIF_F_GRO; 1716 dev->wanted_features &= ~NETIF_F_GRO; 1717 netdev_update_features(dev); 1718 } 1719 1720 static int veth_init_queues(struct net_device *dev, struct nlattr *tb[]) 1721 { 1722 int err; 1723 1724 if (!tb[IFLA_NUM_TX_QUEUES] && dev->num_tx_queues > 1) { 1725 err = netif_set_real_num_tx_queues(dev, 1); 1726 if (err) 1727 return err; 1728 } 1729 if (!tb[IFLA_NUM_RX_QUEUES] && dev->num_rx_queues > 1) { 1730 err = netif_set_real_num_rx_queues(dev, 1); 1731 if (err) 1732 return err; 1733 } 1734 return 0; 1735 } 1736 1737 static int veth_newlink(struct net *src_net, struct net_device *dev, 1738 struct nlattr *tb[], struct nlattr *data[], 1739 struct netlink_ext_ack *extack) 1740 { 1741 int err; 1742 struct net_device *peer; 1743 struct veth_priv *priv; 1744 char ifname[IFNAMSIZ]; 1745 struct nlattr *peer_tb[IFLA_MAX + 1], **tbp; 1746 unsigned char name_assign_type; 1747 struct ifinfomsg *ifmp; 1748 struct net *net; 1749 1750 /* 1751 * create and register peer first 1752 */ 1753 if (data != NULL && data[VETH_INFO_PEER] != NULL) { 1754 struct nlattr *nla_peer; 1755 1756 nla_peer = data[VETH_INFO_PEER]; 1757 ifmp = nla_data(nla_peer); 1758 err = rtnl_nla_parse_ifla(peer_tb, 1759 nla_data(nla_peer) + sizeof(struct ifinfomsg), 1760 nla_len(nla_peer) - sizeof(struct ifinfomsg), 1761 NULL); 1762 if (err < 0) 1763 return err; 1764 1765 err = veth_validate(peer_tb, NULL, extack); 1766 if (err < 0) 1767 return err; 1768 1769 tbp = peer_tb; 1770 } else { 1771 ifmp = NULL; 1772 tbp = tb; 1773 } 1774 1775 if (ifmp && tbp[IFLA_IFNAME]) { 1776 nla_strscpy(ifname, tbp[IFLA_IFNAME], IFNAMSIZ); 1777 name_assign_type = NET_NAME_USER; 1778 } else { 1779 snprintf(ifname, IFNAMSIZ, DRV_NAME "%%d"); 1780 name_assign_type = NET_NAME_ENUM; 1781 } 1782 1783 net = rtnl_link_get_net(src_net, tbp); 1784 if (IS_ERR(net)) 1785 return PTR_ERR(net); 1786 1787 peer = rtnl_create_link(net, ifname, name_assign_type, 1788 &veth_link_ops, tbp, extack); 1789 if (IS_ERR(peer)) { 1790 put_net(net); 1791 return PTR_ERR(peer); 1792 } 1793 1794 if (!ifmp || !tbp[IFLA_ADDRESS]) 1795 eth_hw_addr_random(peer); 1796 1797 if (ifmp && (dev->ifindex != 0)) 1798 peer->ifindex = ifmp->ifi_index; 1799 1800 netif_inherit_tso_max(peer, dev); 1801 1802 err = register_netdevice(peer); 1803 put_net(net); 1804 net = NULL; 1805 if (err < 0) 1806 goto err_register_peer; 1807 1808 /* keep GRO disabled by default to be consistent with the established 1809 * veth behavior 1810 */ 1811 veth_disable_gro(peer); 1812 netif_carrier_off(peer); 1813 1814 err = rtnl_configure_link(peer, ifmp, 0, NULL); 1815 if (err < 0) 1816 goto err_configure_peer; 1817 1818 /* 1819 * register dev last 1820 * 1821 * note, that since we've registered new device the dev's name 1822 * should be re-allocated 1823 */ 1824 1825 if (tb[IFLA_ADDRESS] == NULL) 1826 eth_hw_addr_random(dev); 1827 1828 if (tb[IFLA_IFNAME]) 1829 nla_strscpy(dev->name, tb[IFLA_IFNAME], IFNAMSIZ); 1830 else 1831 snprintf(dev->name, IFNAMSIZ, DRV_NAME "%%d"); 1832 1833 err = register_netdevice(dev); 1834 if (err < 0) 1835 goto err_register_dev; 1836 1837 netif_carrier_off(dev); 1838 1839 /* 1840 * tie the deviced together 1841 */ 1842 1843 priv = netdev_priv(dev); 1844 rcu_assign_pointer(priv->peer, peer); 1845 err = veth_init_queues(dev, tb); 1846 if (err) 1847 goto err_queues; 1848 1849 priv = netdev_priv(peer); 1850 rcu_assign_pointer(priv->peer, dev); 1851 err = veth_init_queues(peer, tb); 1852 if (err) 1853 goto err_queues; 1854 1855 veth_disable_gro(dev); 1856 return 0; 1857 1858 err_queues: 1859 unregister_netdevice(dev); 1860 err_register_dev: 1861 /* nothing to do */ 1862 err_configure_peer: 1863 unregister_netdevice(peer); 1864 return err; 1865 1866 err_register_peer: 1867 free_netdev(peer); 1868 return err; 1869 } 1870 1871 static void veth_dellink(struct net_device *dev, struct list_head *head) 1872 { 1873 struct veth_priv *priv; 1874 struct net_device *peer; 1875 1876 priv = netdev_priv(dev); 1877 peer = rtnl_dereference(priv->peer); 1878 1879 /* Note : dellink() is called from default_device_exit_batch(), 1880 * before a rcu_synchronize() point. The devices are guaranteed 1881 * not being freed before one RCU grace period. 1882 */ 1883 RCU_INIT_POINTER(priv->peer, NULL); 1884 unregister_netdevice_queue(dev, head); 1885 1886 if (peer) { 1887 priv = netdev_priv(peer); 1888 RCU_INIT_POINTER(priv->peer, NULL); 1889 unregister_netdevice_queue(peer, head); 1890 } 1891 } 1892 1893 static const struct nla_policy veth_policy[VETH_INFO_MAX + 1] = { 1894 [VETH_INFO_PEER] = { .len = sizeof(struct ifinfomsg) }, 1895 }; 1896 1897 static struct net *veth_get_link_net(const struct net_device *dev) 1898 { 1899 struct veth_priv *priv = netdev_priv(dev); 1900 struct net_device *peer = rtnl_dereference(priv->peer); 1901 1902 return peer ? dev_net(peer) : dev_net(dev); 1903 } 1904 1905 static unsigned int veth_get_num_queues(void) 1906 { 1907 /* enforce the same queue limit as rtnl_create_link */ 1908 int queues = num_possible_cpus(); 1909 1910 if (queues > 4096) 1911 queues = 4096; 1912 return queues; 1913 } 1914 1915 static struct rtnl_link_ops veth_link_ops = { 1916 .kind = DRV_NAME, 1917 .priv_size = sizeof(struct veth_priv), 1918 .setup = veth_setup, 1919 .validate = veth_validate, 1920 .newlink = veth_newlink, 1921 .dellink = veth_dellink, 1922 .policy = veth_policy, 1923 .maxtype = VETH_INFO_MAX, 1924 .get_link_net = veth_get_link_net, 1925 .get_num_tx_queues = veth_get_num_queues, 1926 .get_num_rx_queues = veth_get_num_queues, 1927 }; 1928 1929 /* 1930 * init/fini 1931 */ 1932 1933 static __init int veth_init(void) 1934 { 1935 return rtnl_link_register(&veth_link_ops); 1936 } 1937 1938 static __exit void veth_exit(void) 1939 { 1940 rtnl_link_unregister(&veth_link_ops); 1941 } 1942 1943 module_init(veth_init); 1944 module_exit(veth_exit); 1945 1946 MODULE_DESCRIPTION("Virtual Ethernet Tunnel"); 1947 MODULE_LICENSE("GPL v2"); 1948 MODULE_ALIAS_RTNL_LINK(DRV_NAME); 1949