// SPDX-License-Identifier: GPL-2.0-only
/*
 *  drivers/net/veth.c
 *
 *  Copyright (C) 2007 OpenVZ http://openvz.org, SWsoft Inc
 *
 * Author: Pavel Emelianov <xemul@openvz.org>
 * Ethtool interface from: Eric W. Biederman <ebiederm@xmission.com>
 *
 */

#include <linux/netdevice.h>
#include <linux/slab.h>
#include <linux/ethtool.h>
#include <linux/etherdevice.h>
#include <linux/u64_stats_sync.h>

#include <net/rtnetlink.h>
#include <net/dst.h>
#include <net/xfrm.h>
#include <net/xdp.h>
#include <linux/veth.h>
#include <linux/module.h>
#include <linux/bpf.h>
#include <linux/filter.h>
#include <linux/ptr_ring.h>
#include <linux/bpf_trace.h>
#include <linux/net_tstamp.h>

#define DRV_NAME	"veth"
#define DRV_VERSION	"1.0"

/* Low pointer bit used to tag ptr_ring entries that are xdp_frames rather
 * than sk_buffs (see veth_is_xdp_frame() / veth_xdp_to_ptr()).
 */
#define VETH_XDP_FLAG		BIT(0)
/* Number of entries each per-queue xdp_ring is created with */
#define VETH_RING_SIZE		256
#define VETH_XDP_HEADROOM	(XDP_PACKET_HEADROOM + NET_IP_ALIGN)

/* Capacity of the XDP_TX bulk queue (struct veth_xdp_tx_bq) */
#define VETH_XDP_TX_BULK_SIZE	16
/* Max xdp_frames gathered before they are converted to skbs in one go */
#define VETH_XDP_BATCH		16

/* Per-queue counters. The ethtool descriptor tables address these fields by
 * offsetof(), so the field names are part of that mapping.
 */
struct veth_stats {
	u64	rx_drops;
	/* xdp */
	u64	xdp_packets;
	u64	xdp_bytes;
	u64	xdp_redirect;
	u64	xdp_drops;
	u64	xdp_tx;
	u64	xdp_tx_err;
	/* updated by veth_xdp_xmit() on the receiving queue when called as ndo_xdp_xmit */
	u64	peer_tq_xdp_xmit;
	u64	peer_tq_xdp_xmit_err;
};

/* veth_stats plus the u64_stats_sync used to read/update them consistently */
struct veth_rq_stats {
	struct veth_stats	vs;
	struct u64_stats_sync	syncp;
};

/* Per receive-queue state */
struct veth_rq {
	struct napi_struct	xdp_napi;
	struct napi_struct __rcu *napi; /* points to xdp_napi when the latter is initialized */
	struct net_device	*dev;
	struct bpf_prog __rcu	*xdp_prog;
	/* copy of xdp_rxq.mem saved at registration; xdp_rxq.mem may be overwritten */
	struct xdp_mem_info	xdp_mem;
	struct veth_rq_stats	stats;
	/* true while a NAPI poll has been scheduled and producers need not kick it again */
	bool			rx_notify_masked;
	/* holds sk_buff pointers and VETH_XDP_FLAG-tagged xdp_frame pointers */
	struct ptr_ring		xdp_ring;
	struct xdp_rxq_info	xdp_rxq;
};

/* Per-device private data */
struct veth_priv {
	struct net_device __rcu	*peer;
	/* tx drop counter, reported through veth_stats_tx() */
	atomic64_t		dropped;
	/* program that veth_enable_xdp() publishes to every rq->xdp_prog */
	struct bpf_prog		*_xdp_prog;
	struct veth_rq		*rq;
	unsigned int		requested_headroom;
};

/* Frames queued for XDP_TX during one poll; flushed by veth_xdp_flush_bq() */
struct veth_xdp_tx_bq {
	struct xdp_frame *q[VETH_XDP_TX_BULK_SIZE];
	unsigned int count;
};

/*
 * ethtool interface
85 */ 86 87 struct veth_q_stat_desc { 88 char desc[ETH_GSTRING_LEN]; 89 size_t offset; 90 }; 91 92 #define VETH_RQ_STAT(m) offsetof(struct veth_stats, m) 93 94 static const struct veth_q_stat_desc veth_rq_stats_desc[] = { 95 { "xdp_packets", VETH_RQ_STAT(xdp_packets) }, 96 { "xdp_bytes", VETH_RQ_STAT(xdp_bytes) }, 97 { "drops", VETH_RQ_STAT(rx_drops) }, 98 { "xdp_redirect", VETH_RQ_STAT(xdp_redirect) }, 99 { "xdp_drops", VETH_RQ_STAT(xdp_drops) }, 100 { "xdp_tx", VETH_RQ_STAT(xdp_tx) }, 101 { "xdp_tx_errors", VETH_RQ_STAT(xdp_tx_err) }, 102 }; 103 104 #define VETH_RQ_STATS_LEN ARRAY_SIZE(veth_rq_stats_desc) 105 106 static const struct veth_q_stat_desc veth_tq_stats_desc[] = { 107 { "xdp_xmit", VETH_RQ_STAT(peer_tq_xdp_xmit) }, 108 { "xdp_xmit_errors", VETH_RQ_STAT(peer_tq_xdp_xmit_err) }, 109 }; 110 111 #define VETH_TQ_STATS_LEN ARRAY_SIZE(veth_tq_stats_desc) 112 113 static struct { 114 const char string[ETH_GSTRING_LEN]; 115 } ethtool_stats_keys[] = { 116 { "peer_ifindex" }, 117 }; 118 119 struct veth_xdp_buff { 120 struct xdp_buff xdp; 121 struct sk_buff *skb; 122 }; 123 124 static int veth_get_link_ksettings(struct net_device *dev, 125 struct ethtool_link_ksettings *cmd) 126 { 127 cmd->base.speed = SPEED_10000; 128 cmd->base.duplex = DUPLEX_FULL; 129 cmd->base.port = PORT_TP; 130 cmd->base.autoneg = AUTONEG_DISABLE; 131 return 0; 132 } 133 134 static void veth_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info) 135 { 136 strscpy(info->driver, DRV_NAME, sizeof(info->driver)); 137 strscpy(info->version, DRV_VERSION, sizeof(info->version)); 138 } 139 140 static void veth_get_strings(struct net_device *dev, u32 stringset, u8 *buf) 141 { 142 u8 *p = buf; 143 int i, j; 144 145 switch(stringset) { 146 case ETH_SS_STATS: 147 memcpy(p, ðtool_stats_keys, sizeof(ethtool_stats_keys)); 148 p += sizeof(ethtool_stats_keys); 149 for (i = 0; i < dev->real_num_rx_queues; i++) 150 for (j = 0; j < VETH_RQ_STATS_LEN; j++) 151 ethtool_sprintf(&p, "rx_queue_%u_%.18s", 
152 i, veth_rq_stats_desc[j].desc); 153 154 for (i = 0; i < dev->real_num_tx_queues; i++) 155 for (j = 0; j < VETH_TQ_STATS_LEN; j++) 156 ethtool_sprintf(&p, "tx_queue_%u_%.18s", 157 i, veth_tq_stats_desc[j].desc); 158 break; 159 } 160 } 161 162 static int veth_get_sset_count(struct net_device *dev, int sset) 163 { 164 switch (sset) { 165 case ETH_SS_STATS: 166 return ARRAY_SIZE(ethtool_stats_keys) + 167 VETH_RQ_STATS_LEN * dev->real_num_rx_queues + 168 VETH_TQ_STATS_LEN * dev->real_num_tx_queues; 169 default: 170 return -EOPNOTSUPP; 171 } 172 } 173 174 static void veth_get_ethtool_stats(struct net_device *dev, 175 struct ethtool_stats *stats, u64 *data) 176 { 177 struct veth_priv *rcv_priv, *priv = netdev_priv(dev); 178 struct net_device *peer = rtnl_dereference(priv->peer); 179 int i, j, idx; 180 181 data[0] = peer ? peer->ifindex : 0; 182 idx = 1; 183 for (i = 0; i < dev->real_num_rx_queues; i++) { 184 const struct veth_rq_stats *rq_stats = &priv->rq[i].stats; 185 const void *stats_base = (void *)&rq_stats->vs; 186 unsigned int start; 187 size_t offset; 188 189 do { 190 start = u64_stats_fetch_begin(&rq_stats->syncp); 191 for (j = 0; j < VETH_RQ_STATS_LEN; j++) { 192 offset = veth_rq_stats_desc[j].offset; 193 data[idx + j] = *(u64 *)(stats_base + offset); 194 } 195 } while (u64_stats_fetch_retry(&rq_stats->syncp, start)); 196 idx += VETH_RQ_STATS_LEN; 197 } 198 199 if (!peer) 200 return; 201 202 rcv_priv = netdev_priv(peer); 203 for (i = 0; i < peer->real_num_rx_queues; i++) { 204 const struct veth_rq_stats *rq_stats = &rcv_priv->rq[i].stats; 205 const void *base = (void *)&rq_stats->vs; 206 unsigned int start, tx_idx = idx; 207 size_t offset; 208 209 tx_idx += (i % dev->real_num_tx_queues) * VETH_TQ_STATS_LEN; 210 do { 211 start = u64_stats_fetch_begin(&rq_stats->syncp); 212 for (j = 0; j < VETH_TQ_STATS_LEN; j++) { 213 offset = veth_tq_stats_desc[j].offset; 214 data[tx_idx + j] += *(u64 *)(base + offset); 215 } 216 } while 
(u64_stats_fetch_retry(&rq_stats->syncp, start)); 217 } 218 } 219 220 static void veth_get_channels(struct net_device *dev, 221 struct ethtool_channels *channels) 222 { 223 channels->tx_count = dev->real_num_tx_queues; 224 channels->rx_count = dev->real_num_rx_queues; 225 channels->max_tx = dev->num_tx_queues; 226 channels->max_rx = dev->num_rx_queues; 227 } 228 229 static int veth_set_channels(struct net_device *dev, 230 struct ethtool_channels *ch); 231 232 static const struct ethtool_ops veth_ethtool_ops = { 233 .get_drvinfo = veth_get_drvinfo, 234 .get_link = ethtool_op_get_link, 235 .get_strings = veth_get_strings, 236 .get_sset_count = veth_get_sset_count, 237 .get_ethtool_stats = veth_get_ethtool_stats, 238 .get_link_ksettings = veth_get_link_ksettings, 239 .get_ts_info = ethtool_op_get_ts_info, 240 .get_channels = veth_get_channels, 241 .set_channels = veth_set_channels, 242 }; 243 244 /* general routines */ 245 246 static bool veth_is_xdp_frame(void *ptr) 247 { 248 return (unsigned long)ptr & VETH_XDP_FLAG; 249 } 250 251 static struct xdp_frame *veth_ptr_to_xdp(void *ptr) 252 { 253 return (void *)((unsigned long)ptr & ~VETH_XDP_FLAG); 254 } 255 256 static void *veth_xdp_to_ptr(struct xdp_frame *xdp) 257 { 258 return (void *)((unsigned long)xdp | VETH_XDP_FLAG); 259 } 260 261 static void veth_ptr_free(void *ptr) 262 { 263 if (veth_is_xdp_frame(ptr)) 264 xdp_return_frame(veth_ptr_to_xdp(ptr)); 265 else 266 kfree_skb(ptr); 267 } 268 269 static void __veth_xdp_flush(struct veth_rq *rq) 270 { 271 /* Write ptr_ring before reading rx_notify_masked */ 272 smp_mb(); 273 if (!READ_ONCE(rq->rx_notify_masked) && 274 napi_schedule_prep(&rq->xdp_napi)) { 275 WRITE_ONCE(rq->rx_notify_masked, true); 276 __napi_schedule(&rq->xdp_napi); 277 } 278 } 279 280 static int veth_xdp_rx(struct veth_rq *rq, struct sk_buff *skb) 281 { 282 if (unlikely(ptr_ring_produce(&rq->xdp_ring, skb))) { 283 dev_kfree_skb_any(skb); 284 return NET_RX_DROP; 285 } 286 287 return NET_RX_SUCCESS; 288 
} 289 290 static int veth_forward_skb(struct net_device *dev, struct sk_buff *skb, 291 struct veth_rq *rq, bool xdp) 292 { 293 return __dev_forward_skb(dev, skb) ?: xdp ? 294 veth_xdp_rx(rq, skb) : 295 __netif_rx(skb); 296 } 297 298 /* return true if the specified skb has chances of GRO aggregation 299 * Don't strive for accuracy, but try to avoid GRO overhead in the most 300 * common scenarios. 301 * When XDP is enabled, all traffic is considered eligible, as the xmit 302 * device has TSO off. 303 * When TSO is enabled on the xmit device, we are likely interested only 304 * in UDP aggregation, explicitly check for that if the skb is suspected 305 * - the sock_wfree destructor is used by UDP, ICMP and XDP sockets - 306 * to belong to locally generated UDP traffic. 307 */ 308 static bool veth_skb_is_eligible_for_gro(const struct net_device *dev, 309 const struct net_device *rcv, 310 const struct sk_buff *skb) 311 { 312 return !(dev->features & NETIF_F_ALL_TSO) || 313 (skb->destructor == sock_wfree && 314 rcv->features & (NETIF_F_GRO_FRAGLIST | NETIF_F_GRO_UDP_FWD)); 315 } 316 317 static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev) 318 { 319 struct veth_priv *rcv_priv, *priv = netdev_priv(dev); 320 struct veth_rq *rq = NULL; 321 struct net_device *rcv; 322 int length = skb->len; 323 bool use_napi = false; 324 int rxq; 325 326 rcu_read_lock(); 327 rcv = rcu_dereference(priv->peer); 328 if (unlikely(!rcv) || !pskb_may_pull(skb, ETH_HLEN)) { 329 kfree_skb(skb); 330 goto drop; 331 } 332 333 rcv_priv = netdev_priv(rcv); 334 rxq = skb_get_queue_mapping(skb); 335 if (rxq < rcv->real_num_rx_queues) { 336 rq = &rcv_priv->rq[rxq]; 337 338 /* The napi pointer is available when an XDP program is 339 * attached or when GRO is enabled 340 * Don't bother with napi/GRO if the skb can't be aggregated 341 */ 342 use_napi = rcu_access_pointer(rq->napi) && 343 veth_skb_is_eligible_for_gro(dev, rcv, skb); 344 } 345 346 skb_tx_timestamp(skb); 347 if 
(likely(veth_forward_skb(rcv, skb, rq, use_napi) == NET_RX_SUCCESS)) { 348 if (!use_napi) 349 dev_lstats_add(dev, length); 350 } else { 351 drop: 352 atomic64_inc(&priv->dropped); 353 } 354 355 if (use_napi) 356 __veth_xdp_flush(rq); 357 358 rcu_read_unlock(); 359 360 return NETDEV_TX_OK; 361 } 362 363 static u64 veth_stats_tx(struct net_device *dev, u64 *packets, u64 *bytes) 364 { 365 struct veth_priv *priv = netdev_priv(dev); 366 367 dev_lstats_read(dev, packets, bytes); 368 return atomic64_read(&priv->dropped); 369 } 370 371 static void veth_stats_rx(struct veth_stats *result, struct net_device *dev) 372 { 373 struct veth_priv *priv = netdev_priv(dev); 374 int i; 375 376 result->peer_tq_xdp_xmit_err = 0; 377 result->xdp_packets = 0; 378 result->xdp_tx_err = 0; 379 result->xdp_bytes = 0; 380 result->rx_drops = 0; 381 for (i = 0; i < dev->num_rx_queues; i++) { 382 u64 packets, bytes, drops, xdp_tx_err, peer_tq_xdp_xmit_err; 383 struct veth_rq_stats *stats = &priv->rq[i].stats; 384 unsigned int start; 385 386 do { 387 start = u64_stats_fetch_begin(&stats->syncp); 388 peer_tq_xdp_xmit_err = stats->vs.peer_tq_xdp_xmit_err; 389 xdp_tx_err = stats->vs.xdp_tx_err; 390 packets = stats->vs.xdp_packets; 391 bytes = stats->vs.xdp_bytes; 392 drops = stats->vs.rx_drops; 393 } while (u64_stats_fetch_retry(&stats->syncp, start)); 394 result->peer_tq_xdp_xmit_err += peer_tq_xdp_xmit_err; 395 result->xdp_tx_err += xdp_tx_err; 396 result->xdp_packets += packets; 397 result->xdp_bytes += bytes; 398 result->rx_drops += drops; 399 } 400 } 401 402 static void veth_get_stats64(struct net_device *dev, 403 struct rtnl_link_stats64 *tot) 404 { 405 struct veth_priv *priv = netdev_priv(dev); 406 struct net_device *peer; 407 struct veth_stats rx; 408 u64 packets, bytes; 409 410 tot->tx_dropped = veth_stats_tx(dev, &packets, &bytes); 411 tot->tx_bytes = bytes; 412 tot->tx_packets = packets; 413 414 veth_stats_rx(&rx, dev); 415 tot->tx_dropped += rx.xdp_tx_err; 416 tot->rx_dropped = 
rx.rx_drops + rx.peer_tq_xdp_xmit_err; 417 tot->rx_bytes = rx.xdp_bytes; 418 tot->rx_packets = rx.xdp_packets; 419 420 rcu_read_lock(); 421 peer = rcu_dereference(priv->peer); 422 if (peer) { 423 veth_stats_tx(peer, &packets, &bytes); 424 tot->rx_bytes += bytes; 425 tot->rx_packets += packets; 426 427 veth_stats_rx(&rx, peer); 428 tot->tx_dropped += rx.peer_tq_xdp_xmit_err; 429 tot->rx_dropped += rx.xdp_tx_err; 430 tot->tx_bytes += rx.xdp_bytes; 431 tot->tx_packets += rx.xdp_packets; 432 } 433 rcu_read_unlock(); 434 } 435 436 /* fake multicast ability */ 437 static void veth_set_multicast_list(struct net_device *dev) 438 { 439 } 440 441 static int veth_select_rxq(struct net_device *dev) 442 { 443 return smp_processor_id() % dev->real_num_rx_queues; 444 } 445 446 static struct net_device *veth_peer_dev(struct net_device *dev) 447 { 448 struct veth_priv *priv = netdev_priv(dev); 449 450 /* Callers must be under RCU read side. */ 451 return rcu_dereference(priv->peer); 452 } 453 454 static int veth_xdp_xmit(struct net_device *dev, int n, 455 struct xdp_frame **frames, 456 u32 flags, bool ndo_xmit) 457 { 458 struct veth_priv *rcv_priv, *priv = netdev_priv(dev); 459 int i, ret = -ENXIO, nxmit = 0; 460 struct net_device *rcv; 461 unsigned int max_len; 462 struct veth_rq *rq; 463 464 if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK)) 465 return -EINVAL; 466 467 rcu_read_lock(); 468 rcv = rcu_dereference(priv->peer); 469 if (unlikely(!rcv)) 470 goto out; 471 472 rcv_priv = netdev_priv(rcv); 473 rq = &rcv_priv->rq[veth_select_rxq(rcv)]; 474 /* The napi pointer is set if NAPI is enabled, which ensures that 475 * xdp_ring is initialized on receive side and the peer device is up. 
476 */ 477 if (!rcu_access_pointer(rq->napi)) 478 goto out; 479 480 max_len = rcv->mtu + rcv->hard_header_len + VLAN_HLEN; 481 482 spin_lock(&rq->xdp_ring.producer_lock); 483 for (i = 0; i < n; i++) { 484 struct xdp_frame *frame = frames[i]; 485 void *ptr = veth_xdp_to_ptr(frame); 486 487 if (unlikely(xdp_get_frame_len(frame) > max_len || 488 __ptr_ring_produce(&rq->xdp_ring, ptr))) 489 break; 490 nxmit++; 491 } 492 spin_unlock(&rq->xdp_ring.producer_lock); 493 494 if (flags & XDP_XMIT_FLUSH) 495 __veth_xdp_flush(rq); 496 497 ret = nxmit; 498 if (ndo_xmit) { 499 u64_stats_update_begin(&rq->stats.syncp); 500 rq->stats.vs.peer_tq_xdp_xmit += nxmit; 501 rq->stats.vs.peer_tq_xdp_xmit_err += n - nxmit; 502 u64_stats_update_end(&rq->stats.syncp); 503 } 504 505 out: 506 rcu_read_unlock(); 507 508 return ret; 509 } 510 511 static int veth_ndo_xdp_xmit(struct net_device *dev, int n, 512 struct xdp_frame **frames, u32 flags) 513 { 514 int err; 515 516 err = veth_xdp_xmit(dev, n, frames, flags, true); 517 if (err < 0) { 518 struct veth_priv *priv = netdev_priv(dev); 519 520 atomic64_add(n, &priv->dropped); 521 } 522 523 return err; 524 } 525 526 static void veth_xdp_flush_bq(struct veth_rq *rq, struct veth_xdp_tx_bq *bq) 527 { 528 int sent, i, err = 0, drops; 529 530 sent = veth_xdp_xmit(rq->dev, bq->count, bq->q, 0, false); 531 if (sent < 0) { 532 err = sent; 533 sent = 0; 534 } 535 536 for (i = sent; unlikely(i < bq->count); i++) 537 xdp_return_frame(bq->q[i]); 538 539 drops = bq->count - sent; 540 trace_xdp_bulk_tx(rq->dev, sent, drops, err); 541 542 u64_stats_update_begin(&rq->stats.syncp); 543 rq->stats.vs.xdp_tx += sent; 544 rq->stats.vs.xdp_tx_err += drops; 545 u64_stats_update_end(&rq->stats.syncp); 546 547 bq->count = 0; 548 } 549 550 static void veth_xdp_flush(struct veth_rq *rq, struct veth_xdp_tx_bq *bq) 551 { 552 struct veth_priv *rcv_priv, *priv = netdev_priv(rq->dev); 553 struct net_device *rcv; 554 struct veth_rq *rcv_rq; 555 556 rcu_read_lock(); 557 
veth_xdp_flush_bq(rq, bq); 558 rcv = rcu_dereference(priv->peer); 559 if (unlikely(!rcv)) 560 goto out; 561 562 rcv_priv = netdev_priv(rcv); 563 rcv_rq = &rcv_priv->rq[veth_select_rxq(rcv)]; 564 /* xdp_ring is initialized on receive side? */ 565 if (unlikely(!rcu_access_pointer(rcv_rq->xdp_prog))) 566 goto out; 567 568 __veth_xdp_flush(rcv_rq); 569 out: 570 rcu_read_unlock(); 571 } 572 573 static int veth_xdp_tx(struct veth_rq *rq, struct xdp_buff *xdp, 574 struct veth_xdp_tx_bq *bq) 575 { 576 struct xdp_frame *frame = xdp_convert_buff_to_frame(xdp); 577 578 if (unlikely(!frame)) 579 return -EOVERFLOW; 580 581 if (unlikely(bq->count == VETH_XDP_TX_BULK_SIZE)) 582 veth_xdp_flush_bq(rq, bq); 583 584 bq->q[bq->count++] = frame; 585 586 return 0; 587 } 588 589 static struct xdp_frame *veth_xdp_rcv_one(struct veth_rq *rq, 590 struct xdp_frame *frame, 591 struct veth_xdp_tx_bq *bq, 592 struct veth_stats *stats) 593 { 594 struct xdp_frame orig_frame; 595 struct bpf_prog *xdp_prog; 596 597 rcu_read_lock(); 598 xdp_prog = rcu_dereference(rq->xdp_prog); 599 if (likely(xdp_prog)) { 600 struct veth_xdp_buff vxbuf; 601 struct xdp_buff *xdp = &vxbuf.xdp; 602 u32 act; 603 604 xdp_convert_frame_to_buff(frame, xdp); 605 xdp->rxq = &rq->xdp_rxq; 606 vxbuf.skb = NULL; 607 608 act = bpf_prog_run_xdp(xdp_prog, xdp); 609 610 switch (act) { 611 case XDP_PASS: 612 if (xdp_update_frame_from_buff(xdp, frame)) 613 goto err_xdp; 614 break; 615 case XDP_TX: 616 orig_frame = *frame; 617 xdp->rxq->mem = frame->mem; 618 if (unlikely(veth_xdp_tx(rq, xdp, bq) < 0)) { 619 trace_xdp_exception(rq->dev, xdp_prog, act); 620 frame = &orig_frame; 621 stats->rx_drops++; 622 goto err_xdp; 623 } 624 stats->xdp_tx++; 625 rcu_read_unlock(); 626 goto xdp_xmit; 627 case XDP_REDIRECT: 628 orig_frame = *frame; 629 xdp->rxq->mem = frame->mem; 630 if (xdp_do_redirect(rq->dev, xdp, xdp_prog)) { 631 frame = &orig_frame; 632 stats->rx_drops++; 633 goto err_xdp; 634 } 635 stats->xdp_redirect++; 636 rcu_read_unlock(); 
637 goto xdp_xmit; 638 default: 639 bpf_warn_invalid_xdp_action(rq->dev, xdp_prog, act); 640 fallthrough; 641 case XDP_ABORTED: 642 trace_xdp_exception(rq->dev, xdp_prog, act); 643 fallthrough; 644 case XDP_DROP: 645 stats->xdp_drops++; 646 goto err_xdp; 647 } 648 } 649 rcu_read_unlock(); 650 651 return frame; 652 err_xdp: 653 rcu_read_unlock(); 654 xdp_return_frame(frame); 655 xdp_xmit: 656 return NULL; 657 } 658 659 /* frames array contains VETH_XDP_BATCH at most */ 660 static void veth_xdp_rcv_bulk_skb(struct veth_rq *rq, void **frames, 661 int n_xdpf, struct veth_xdp_tx_bq *bq, 662 struct veth_stats *stats) 663 { 664 void *skbs[VETH_XDP_BATCH]; 665 int i; 666 667 if (xdp_alloc_skb_bulk(skbs, n_xdpf, 668 GFP_ATOMIC | __GFP_ZERO) < 0) { 669 for (i = 0; i < n_xdpf; i++) 670 xdp_return_frame(frames[i]); 671 stats->rx_drops += n_xdpf; 672 673 return; 674 } 675 676 for (i = 0; i < n_xdpf; i++) { 677 struct sk_buff *skb = skbs[i]; 678 679 skb = __xdp_build_skb_from_frame(frames[i], skb, 680 rq->dev); 681 if (!skb) { 682 xdp_return_frame(frames[i]); 683 stats->rx_drops++; 684 continue; 685 } 686 napi_gro_receive(&rq->xdp_napi, skb); 687 } 688 } 689 690 static void veth_xdp_get(struct xdp_buff *xdp) 691 { 692 struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp); 693 int i; 694 695 get_page(virt_to_page(xdp->data)); 696 if (likely(!xdp_buff_has_frags(xdp))) 697 return; 698 699 for (i = 0; i < sinfo->nr_frags; i++) 700 __skb_frag_ref(&sinfo->frags[i]); 701 } 702 703 static int veth_convert_skb_to_xdp_buff(struct veth_rq *rq, 704 struct xdp_buff *xdp, 705 struct sk_buff **pskb) 706 { 707 struct sk_buff *skb = *pskb; 708 u32 frame_sz; 709 710 if (skb_shared(skb) || skb_head_is_locked(skb) || 711 skb_shinfo(skb)->nr_frags || 712 skb_headroom(skb) < XDP_PACKET_HEADROOM) { 713 u32 size, len, max_head_size, off; 714 struct sk_buff *nskb; 715 struct page *page; 716 int i, head_off; 717 718 /* We need a private copy of the skb and data buffers since 719 * the ebpf 
program can modify it. We segment the original skb 720 * into order-0 pages without linearize it. 721 * 722 * Make sure we have enough space for linear and paged area 723 */ 724 max_head_size = SKB_WITH_OVERHEAD(PAGE_SIZE - 725 VETH_XDP_HEADROOM); 726 if (skb->len > PAGE_SIZE * MAX_SKB_FRAGS + max_head_size) 727 goto drop; 728 729 /* Allocate skb head */ 730 page = alloc_page(GFP_ATOMIC | __GFP_NOWARN); 731 if (!page) 732 goto drop; 733 734 nskb = build_skb(page_address(page), PAGE_SIZE); 735 if (!nskb) { 736 put_page(page); 737 goto drop; 738 } 739 740 skb_reserve(nskb, VETH_XDP_HEADROOM); 741 size = min_t(u32, skb->len, max_head_size); 742 if (skb_copy_bits(skb, 0, nskb->data, size)) { 743 consume_skb(nskb); 744 goto drop; 745 } 746 skb_put(nskb, size); 747 748 skb_copy_header(nskb, skb); 749 head_off = skb_headroom(nskb) - skb_headroom(skb); 750 skb_headers_offset_update(nskb, head_off); 751 752 /* Allocate paged area of new skb */ 753 off = size; 754 len = skb->len - off; 755 756 for (i = 0; i < MAX_SKB_FRAGS && off < skb->len; i++) { 757 page = alloc_page(GFP_ATOMIC | __GFP_NOWARN); 758 if (!page) { 759 consume_skb(nskb); 760 goto drop; 761 } 762 763 size = min_t(u32, len, PAGE_SIZE); 764 skb_add_rx_frag(nskb, i, page, 0, size, PAGE_SIZE); 765 if (skb_copy_bits(skb, off, page_address(page), 766 size)) { 767 consume_skb(nskb); 768 goto drop; 769 } 770 771 len -= size; 772 off += size; 773 } 774 775 consume_skb(skb); 776 skb = nskb; 777 } 778 779 /* SKB "head" area always have tailroom for skb_shared_info */ 780 frame_sz = skb_end_pointer(skb) - skb->head; 781 frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); 782 xdp_init_buff(xdp, frame_sz, &rq->xdp_rxq); 783 xdp_prepare_buff(xdp, skb->head, skb_headroom(skb), 784 skb_headlen(skb), true); 785 786 if (skb_is_nonlinear(skb)) { 787 skb_shinfo(skb)->xdp_frags_size = skb->data_len; 788 xdp_buff_set_frags_flag(xdp); 789 } else { 790 xdp_buff_clear_frags_flag(xdp); 791 } 792 *pskb = skb; 793 794 return 0; 
795 drop: 796 consume_skb(skb); 797 *pskb = NULL; 798 799 return -ENOMEM; 800 } 801 802 static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq, 803 struct sk_buff *skb, 804 struct veth_xdp_tx_bq *bq, 805 struct veth_stats *stats) 806 { 807 void *orig_data, *orig_data_end; 808 struct bpf_prog *xdp_prog; 809 struct veth_xdp_buff vxbuf; 810 struct xdp_buff *xdp = &vxbuf.xdp; 811 u32 act, metalen; 812 int off; 813 814 skb_prepare_for_gro(skb); 815 816 rcu_read_lock(); 817 xdp_prog = rcu_dereference(rq->xdp_prog); 818 if (unlikely(!xdp_prog)) { 819 rcu_read_unlock(); 820 goto out; 821 } 822 823 __skb_push(skb, skb->data - skb_mac_header(skb)); 824 if (veth_convert_skb_to_xdp_buff(rq, xdp, &skb)) 825 goto drop; 826 vxbuf.skb = skb; 827 828 orig_data = xdp->data; 829 orig_data_end = xdp->data_end; 830 831 act = bpf_prog_run_xdp(xdp_prog, xdp); 832 833 switch (act) { 834 case XDP_PASS: 835 break; 836 case XDP_TX: 837 veth_xdp_get(xdp); 838 consume_skb(skb); 839 xdp->rxq->mem = rq->xdp_mem; 840 if (unlikely(veth_xdp_tx(rq, xdp, bq) < 0)) { 841 trace_xdp_exception(rq->dev, xdp_prog, act); 842 stats->rx_drops++; 843 goto err_xdp; 844 } 845 stats->xdp_tx++; 846 rcu_read_unlock(); 847 goto xdp_xmit; 848 case XDP_REDIRECT: 849 veth_xdp_get(xdp); 850 consume_skb(skb); 851 xdp->rxq->mem = rq->xdp_mem; 852 if (xdp_do_redirect(rq->dev, xdp, xdp_prog)) { 853 stats->rx_drops++; 854 goto err_xdp; 855 } 856 stats->xdp_redirect++; 857 rcu_read_unlock(); 858 goto xdp_xmit; 859 default: 860 bpf_warn_invalid_xdp_action(rq->dev, xdp_prog, act); 861 fallthrough; 862 case XDP_ABORTED: 863 trace_xdp_exception(rq->dev, xdp_prog, act); 864 fallthrough; 865 case XDP_DROP: 866 stats->xdp_drops++; 867 goto xdp_drop; 868 } 869 rcu_read_unlock(); 870 871 /* check if bpf_xdp_adjust_head was used */ 872 off = orig_data - xdp->data; 873 if (off > 0) 874 __skb_push(skb, off); 875 else if (off < 0) 876 __skb_pull(skb, -off); 877 878 skb_reset_mac_header(skb); 879 880 /* check if bpf_xdp_adjust_tail was 
used */ 881 off = xdp->data_end - orig_data_end; 882 if (off != 0) 883 __skb_put(skb, off); /* positive on grow, negative on shrink */ 884 885 /* XDP frag metadata (e.g. nr_frags) are updated in eBPF helpers 886 * (e.g. bpf_xdp_adjust_tail), we need to update data_len here. 887 */ 888 if (xdp_buff_has_frags(xdp)) 889 skb->data_len = skb_shinfo(skb)->xdp_frags_size; 890 else 891 skb->data_len = 0; 892 893 skb->protocol = eth_type_trans(skb, rq->dev); 894 895 metalen = xdp->data - xdp->data_meta; 896 if (metalen) 897 skb_metadata_set(skb, metalen); 898 out: 899 return skb; 900 drop: 901 stats->rx_drops++; 902 xdp_drop: 903 rcu_read_unlock(); 904 kfree_skb(skb); 905 return NULL; 906 err_xdp: 907 rcu_read_unlock(); 908 xdp_return_buff(xdp); 909 xdp_xmit: 910 return NULL; 911 } 912 913 static int veth_xdp_rcv(struct veth_rq *rq, int budget, 914 struct veth_xdp_tx_bq *bq, 915 struct veth_stats *stats) 916 { 917 int i, done = 0, n_xdpf = 0; 918 void *xdpf[VETH_XDP_BATCH]; 919 920 for (i = 0; i < budget; i++) { 921 void *ptr = __ptr_ring_consume(&rq->xdp_ring); 922 923 if (!ptr) 924 break; 925 926 if (veth_is_xdp_frame(ptr)) { 927 /* ndo_xdp_xmit */ 928 struct xdp_frame *frame = veth_ptr_to_xdp(ptr); 929 930 stats->xdp_bytes += xdp_get_frame_len(frame); 931 frame = veth_xdp_rcv_one(rq, frame, bq, stats); 932 if (frame) { 933 /* XDP_PASS */ 934 xdpf[n_xdpf++] = frame; 935 if (n_xdpf == VETH_XDP_BATCH) { 936 veth_xdp_rcv_bulk_skb(rq, xdpf, n_xdpf, 937 bq, stats); 938 n_xdpf = 0; 939 } 940 } 941 } else { 942 /* ndo_start_xmit */ 943 struct sk_buff *skb = ptr; 944 945 stats->xdp_bytes += skb->len; 946 skb = veth_xdp_rcv_skb(rq, skb, bq, stats); 947 if (skb) { 948 if (skb_shared(skb) || skb_unclone(skb, GFP_ATOMIC)) 949 netif_receive_skb(skb); 950 else 951 napi_gro_receive(&rq->xdp_napi, skb); 952 } 953 } 954 done++; 955 } 956 957 if (n_xdpf) 958 veth_xdp_rcv_bulk_skb(rq, xdpf, n_xdpf, bq, stats); 959 960 u64_stats_update_begin(&rq->stats.syncp); 961 rq->stats.vs.xdp_redirect 
+= stats->xdp_redirect; 962 rq->stats.vs.xdp_bytes += stats->xdp_bytes; 963 rq->stats.vs.xdp_drops += stats->xdp_drops; 964 rq->stats.vs.rx_drops += stats->rx_drops; 965 rq->stats.vs.xdp_packets += done; 966 u64_stats_update_end(&rq->stats.syncp); 967 968 return done; 969 } 970 971 static int veth_poll(struct napi_struct *napi, int budget) 972 { 973 struct veth_rq *rq = 974 container_of(napi, struct veth_rq, xdp_napi); 975 struct veth_stats stats = {}; 976 struct veth_xdp_tx_bq bq; 977 int done; 978 979 bq.count = 0; 980 981 xdp_set_return_frame_no_direct(); 982 done = veth_xdp_rcv(rq, budget, &bq, &stats); 983 984 if (stats.xdp_redirect > 0) 985 xdp_do_flush(); 986 987 if (done < budget && napi_complete_done(napi, done)) { 988 /* Write rx_notify_masked before reading ptr_ring */ 989 smp_store_mb(rq->rx_notify_masked, false); 990 if (unlikely(!__ptr_ring_empty(&rq->xdp_ring))) { 991 if (napi_schedule_prep(&rq->xdp_napi)) { 992 WRITE_ONCE(rq->rx_notify_masked, true); 993 __napi_schedule(&rq->xdp_napi); 994 } 995 } 996 } 997 998 if (stats.xdp_tx > 0) 999 veth_xdp_flush(rq, &bq); 1000 xdp_clear_return_frame_no_direct(); 1001 1002 return done; 1003 } 1004 1005 static int __veth_napi_enable_range(struct net_device *dev, int start, int end) 1006 { 1007 struct veth_priv *priv = netdev_priv(dev); 1008 int err, i; 1009 1010 for (i = start; i < end; i++) { 1011 struct veth_rq *rq = &priv->rq[i]; 1012 1013 err = ptr_ring_init(&rq->xdp_ring, VETH_RING_SIZE, GFP_KERNEL); 1014 if (err) 1015 goto err_xdp_ring; 1016 } 1017 1018 for (i = start; i < end; i++) { 1019 struct veth_rq *rq = &priv->rq[i]; 1020 1021 napi_enable(&rq->xdp_napi); 1022 rcu_assign_pointer(priv->rq[i].napi, &priv->rq[i].xdp_napi); 1023 } 1024 1025 return 0; 1026 1027 err_xdp_ring: 1028 for (i--; i >= start; i--) 1029 ptr_ring_cleanup(&priv->rq[i].xdp_ring, veth_ptr_free); 1030 1031 return err; 1032 } 1033 1034 static int __veth_napi_enable(struct net_device *dev) 1035 { 1036 return __veth_napi_enable_range(dev, 
0, dev->real_num_rx_queues); 1037 } 1038 1039 static void veth_napi_del_range(struct net_device *dev, int start, int end) 1040 { 1041 struct veth_priv *priv = netdev_priv(dev); 1042 int i; 1043 1044 for (i = start; i < end; i++) { 1045 struct veth_rq *rq = &priv->rq[i]; 1046 1047 rcu_assign_pointer(priv->rq[i].napi, NULL); 1048 napi_disable(&rq->xdp_napi); 1049 __netif_napi_del(&rq->xdp_napi); 1050 } 1051 synchronize_net(); 1052 1053 for (i = start; i < end; i++) { 1054 struct veth_rq *rq = &priv->rq[i]; 1055 1056 rq->rx_notify_masked = false; 1057 ptr_ring_cleanup(&rq->xdp_ring, veth_ptr_free); 1058 } 1059 } 1060 1061 static void veth_napi_del(struct net_device *dev) 1062 { 1063 veth_napi_del_range(dev, 0, dev->real_num_rx_queues); 1064 } 1065 1066 static bool veth_gro_requested(const struct net_device *dev) 1067 { 1068 return !!(dev->wanted_features & NETIF_F_GRO); 1069 } 1070 1071 static int veth_enable_xdp_range(struct net_device *dev, int start, int end, 1072 bool napi_already_on) 1073 { 1074 struct veth_priv *priv = netdev_priv(dev); 1075 int err, i; 1076 1077 for (i = start; i < end; i++) { 1078 struct veth_rq *rq = &priv->rq[i]; 1079 1080 if (!napi_already_on) 1081 netif_napi_add(dev, &rq->xdp_napi, veth_poll); 1082 err = xdp_rxq_info_reg(&rq->xdp_rxq, dev, i, rq->xdp_napi.napi_id); 1083 if (err < 0) 1084 goto err_rxq_reg; 1085 1086 err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq, 1087 MEM_TYPE_PAGE_SHARED, 1088 NULL); 1089 if (err < 0) 1090 goto err_reg_mem; 1091 1092 /* Save original mem info as it can be overwritten */ 1093 rq->xdp_mem = rq->xdp_rxq.mem; 1094 } 1095 return 0; 1096 1097 err_reg_mem: 1098 xdp_rxq_info_unreg(&priv->rq[i].xdp_rxq); 1099 err_rxq_reg: 1100 for (i--; i >= start; i--) { 1101 struct veth_rq *rq = &priv->rq[i]; 1102 1103 xdp_rxq_info_unreg(&rq->xdp_rxq); 1104 if (!napi_already_on) 1105 netif_napi_del(&rq->xdp_napi); 1106 } 1107 1108 return err; 1109 } 1110 1111 static void veth_disable_xdp_range(struct net_device *dev, int start, 
int end, 1112 bool delete_napi) 1113 { 1114 struct veth_priv *priv = netdev_priv(dev); 1115 int i; 1116 1117 for (i = start; i < end; i++) { 1118 struct veth_rq *rq = &priv->rq[i]; 1119 1120 rq->xdp_rxq.mem = rq->xdp_mem; 1121 xdp_rxq_info_unreg(&rq->xdp_rxq); 1122 1123 if (delete_napi) 1124 netif_napi_del(&rq->xdp_napi); 1125 } 1126 } 1127 1128 static int veth_enable_xdp(struct net_device *dev) 1129 { 1130 bool napi_already_on = veth_gro_requested(dev) && (dev->flags & IFF_UP); 1131 struct veth_priv *priv = netdev_priv(dev); 1132 int err, i; 1133 1134 if (!xdp_rxq_info_is_reg(&priv->rq[0].xdp_rxq)) { 1135 err = veth_enable_xdp_range(dev, 0, dev->real_num_rx_queues, napi_already_on); 1136 if (err) 1137 return err; 1138 1139 if (!napi_already_on) { 1140 err = __veth_napi_enable(dev); 1141 if (err) { 1142 veth_disable_xdp_range(dev, 0, dev->real_num_rx_queues, true); 1143 return err; 1144 } 1145 1146 if (!veth_gro_requested(dev)) { 1147 /* user-space did not require GRO, but adding XDP 1148 * is supposed to get GRO working 1149 */ 1150 dev->features |= NETIF_F_GRO; 1151 netdev_features_change(dev); 1152 } 1153 } 1154 } 1155 1156 for (i = 0; i < dev->real_num_rx_queues; i++) { 1157 rcu_assign_pointer(priv->rq[i].xdp_prog, priv->_xdp_prog); 1158 rcu_assign_pointer(priv->rq[i].napi, &priv->rq[i].xdp_napi); 1159 } 1160 1161 return 0; 1162 } 1163 1164 static void veth_disable_xdp(struct net_device *dev) 1165 { 1166 struct veth_priv *priv = netdev_priv(dev); 1167 int i; 1168 1169 for (i = 0; i < dev->real_num_rx_queues; i++) 1170 rcu_assign_pointer(priv->rq[i].xdp_prog, NULL); 1171 1172 if (!netif_running(dev) || !veth_gro_requested(dev)) { 1173 veth_napi_del(dev); 1174 1175 /* if user-space did not require GRO, since adding XDP 1176 * enabled it, clear it now 1177 */ 1178 if (!veth_gro_requested(dev) && netif_running(dev)) { 1179 dev->features &= ~NETIF_F_GRO; 1180 netdev_features_change(dev); 1181 } 1182 } 1183 1184 veth_disable_xdp_range(dev, 0, dev->real_num_rx_queues, 
false); 1185 } 1186 1187 static int veth_napi_enable_range(struct net_device *dev, int start, int end) 1188 { 1189 struct veth_priv *priv = netdev_priv(dev); 1190 int err, i; 1191 1192 for (i = start; i < end; i++) { 1193 struct veth_rq *rq = &priv->rq[i]; 1194 1195 netif_napi_add(dev, &rq->xdp_napi, veth_poll); 1196 } 1197 1198 err = __veth_napi_enable_range(dev, start, end); 1199 if (err) { 1200 for (i = start; i < end; i++) { 1201 struct veth_rq *rq = &priv->rq[i]; 1202 1203 netif_napi_del(&rq->xdp_napi); 1204 } 1205 return err; 1206 } 1207 return err; 1208 } 1209 1210 static int veth_napi_enable(struct net_device *dev) 1211 { 1212 return veth_napi_enable_range(dev, 0, dev->real_num_rx_queues); 1213 } 1214 1215 static void veth_disable_range_safe(struct net_device *dev, int start, int end) 1216 { 1217 struct veth_priv *priv = netdev_priv(dev); 1218 1219 if (start >= end) 1220 return; 1221 1222 if (priv->_xdp_prog) { 1223 veth_napi_del_range(dev, start, end); 1224 veth_disable_xdp_range(dev, start, end, false); 1225 } else if (veth_gro_requested(dev)) { 1226 veth_napi_del_range(dev, start, end); 1227 } 1228 } 1229 1230 static int veth_enable_range_safe(struct net_device *dev, int start, int end) 1231 { 1232 struct veth_priv *priv = netdev_priv(dev); 1233 int err; 1234 1235 if (start >= end) 1236 return 0; 1237 1238 if (priv->_xdp_prog) { 1239 /* these channels are freshly initialized, napi is not on there even 1240 * when GRO is requeste 1241 */ 1242 err = veth_enable_xdp_range(dev, start, end, false); 1243 if (err) 1244 return err; 1245 1246 err = __veth_napi_enable_range(dev, start, end); 1247 if (err) { 1248 /* on error always delete the newly added napis */ 1249 veth_disable_xdp_range(dev, start, end, true); 1250 return err; 1251 } 1252 } else if (veth_gro_requested(dev)) { 1253 return veth_napi_enable_range(dev, start, end); 1254 } 1255 return 0; 1256 } 1257 1258 static void veth_set_xdp_features(struct net_device *dev) 1259 { 1260 struct veth_priv *priv = 
netdev_priv(dev); 1261 struct net_device *peer; 1262 1263 peer = rtnl_dereference(priv->peer); 1264 if (peer && peer->real_num_tx_queues <= dev->real_num_rx_queues) { 1265 struct veth_priv *priv_peer = netdev_priv(peer); 1266 xdp_features_t val = NETDEV_XDP_ACT_BASIC | 1267 NETDEV_XDP_ACT_REDIRECT | 1268 NETDEV_XDP_ACT_RX_SG; 1269 1270 if (priv_peer->_xdp_prog || veth_gro_requested(peer)) 1271 val |= NETDEV_XDP_ACT_NDO_XMIT | 1272 NETDEV_XDP_ACT_NDO_XMIT_SG; 1273 xdp_set_features_flag(dev, val); 1274 } else { 1275 xdp_clear_features_flag(dev); 1276 } 1277 } 1278 1279 static int veth_set_channels(struct net_device *dev, 1280 struct ethtool_channels *ch) 1281 { 1282 struct veth_priv *priv = netdev_priv(dev); 1283 unsigned int old_rx_count, new_rx_count; 1284 struct veth_priv *peer_priv; 1285 struct net_device *peer; 1286 int err; 1287 1288 /* sanity check. Upper bounds are already enforced by the caller */ 1289 if (!ch->rx_count || !ch->tx_count) 1290 return -EINVAL; 1291 1292 /* avoid braking XDP, if that is enabled */ 1293 peer = rtnl_dereference(priv->peer); 1294 peer_priv = peer ? 
netdev_priv(peer) : NULL; 1295 if (priv->_xdp_prog && peer && ch->rx_count < peer->real_num_tx_queues) 1296 return -EINVAL; 1297 1298 if (peer && peer_priv && peer_priv->_xdp_prog && ch->tx_count > peer->real_num_rx_queues) 1299 return -EINVAL; 1300 1301 old_rx_count = dev->real_num_rx_queues; 1302 new_rx_count = ch->rx_count; 1303 if (netif_running(dev)) { 1304 /* turn device off */ 1305 netif_carrier_off(dev); 1306 if (peer) 1307 netif_carrier_off(peer); 1308 1309 /* try to allocate new resurces, as needed*/ 1310 err = veth_enable_range_safe(dev, old_rx_count, new_rx_count); 1311 if (err) 1312 goto out; 1313 } 1314 1315 err = netif_set_real_num_rx_queues(dev, ch->rx_count); 1316 if (err) 1317 goto revert; 1318 1319 err = netif_set_real_num_tx_queues(dev, ch->tx_count); 1320 if (err) { 1321 int err2 = netif_set_real_num_rx_queues(dev, old_rx_count); 1322 1323 /* this error condition could happen only if rx and tx change 1324 * in opposite directions (e.g. tx nr raises, rx nr decreases) 1325 * and we can't do anything to fully restore the original 1326 * status 1327 */ 1328 if (err2) 1329 pr_warn("Can't restore rx queues config %d -> %d %d", 1330 new_rx_count, old_rx_count, err2); 1331 else 1332 goto revert; 1333 } 1334 1335 out: 1336 if (netif_running(dev)) { 1337 /* note that we need to swap the arguments WRT the enable part 1338 * to identify the range we have to disable 1339 */ 1340 veth_disable_range_safe(dev, new_rx_count, old_rx_count); 1341 netif_carrier_on(dev); 1342 if (peer) 1343 netif_carrier_on(peer); 1344 } 1345 1346 /* update XDP supported features */ 1347 veth_set_xdp_features(dev); 1348 if (peer) 1349 veth_set_xdp_features(peer); 1350 1351 return err; 1352 1353 revert: 1354 new_rx_count = old_rx_count; 1355 old_rx_count = ch->rx_count; 1356 goto out; 1357 } 1358 1359 static int veth_open(struct net_device *dev) 1360 { 1361 struct veth_priv *priv = netdev_priv(dev); 1362 struct net_device *peer = rtnl_dereference(priv->peer); 1363 int err; 1364 1365 
if (!peer) 1366 return -ENOTCONN; 1367 1368 if (priv->_xdp_prog) { 1369 err = veth_enable_xdp(dev); 1370 if (err) 1371 return err; 1372 } else if (veth_gro_requested(dev)) { 1373 err = veth_napi_enable(dev); 1374 if (err) 1375 return err; 1376 } 1377 1378 if (peer->flags & IFF_UP) { 1379 netif_carrier_on(dev); 1380 netif_carrier_on(peer); 1381 } 1382 1383 return 0; 1384 } 1385 1386 static int veth_close(struct net_device *dev) 1387 { 1388 struct veth_priv *priv = netdev_priv(dev); 1389 struct net_device *peer = rtnl_dereference(priv->peer); 1390 1391 netif_carrier_off(dev); 1392 if (peer) 1393 netif_carrier_off(peer); 1394 1395 if (priv->_xdp_prog) 1396 veth_disable_xdp(dev); 1397 else if (veth_gro_requested(dev)) 1398 veth_napi_del(dev); 1399 1400 return 0; 1401 } 1402 1403 static int is_valid_veth_mtu(int mtu) 1404 { 1405 return mtu >= ETH_MIN_MTU && mtu <= ETH_MAX_MTU; 1406 } 1407 1408 static int veth_alloc_queues(struct net_device *dev) 1409 { 1410 struct veth_priv *priv = netdev_priv(dev); 1411 int i; 1412 1413 priv->rq = kcalloc(dev->num_rx_queues, sizeof(*priv->rq), GFP_KERNEL_ACCOUNT); 1414 if (!priv->rq) 1415 return -ENOMEM; 1416 1417 for (i = 0; i < dev->num_rx_queues; i++) { 1418 priv->rq[i].dev = dev; 1419 u64_stats_init(&priv->rq[i].stats.syncp); 1420 } 1421 1422 return 0; 1423 } 1424 1425 static void veth_free_queues(struct net_device *dev) 1426 { 1427 struct veth_priv *priv = netdev_priv(dev); 1428 1429 kfree(priv->rq); 1430 } 1431 1432 static int veth_dev_init(struct net_device *dev) 1433 { 1434 int err; 1435 1436 dev->lstats = netdev_alloc_pcpu_stats(struct pcpu_lstats); 1437 if (!dev->lstats) 1438 return -ENOMEM; 1439 1440 err = veth_alloc_queues(dev); 1441 if (err) { 1442 free_percpu(dev->lstats); 1443 return err; 1444 } 1445 1446 return 0; 1447 } 1448 1449 static void veth_dev_free(struct net_device *dev) 1450 { 1451 veth_free_queues(dev); 1452 free_percpu(dev->lstats); 1453 } 1454 1455 #ifdef CONFIG_NET_POLL_CONTROLLER 1456 static void 
veth_poll_controller(struct net_device *dev) 1457 { 1458 /* veth only receives frames when its peer sends one 1459 * Since it has nothing to do with disabling irqs, we are guaranteed 1460 * never to have pending data when we poll for it so 1461 * there is nothing to do here. 1462 * 1463 * We need this though so netpoll recognizes us as an interface that 1464 * supports polling, which enables bridge devices in virt setups to 1465 * still use netconsole 1466 */ 1467 } 1468 #endif /* CONFIG_NET_POLL_CONTROLLER */ 1469 1470 static int veth_get_iflink(const struct net_device *dev) 1471 { 1472 struct veth_priv *priv = netdev_priv(dev); 1473 struct net_device *peer; 1474 int iflink; 1475 1476 rcu_read_lock(); 1477 peer = rcu_dereference(priv->peer); 1478 iflink = peer ? peer->ifindex : 0; 1479 rcu_read_unlock(); 1480 1481 return iflink; 1482 } 1483 1484 static netdev_features_t veth_fix_features(struct net_device *dev, 1485 netdev_features_t features) 1486 { 1487 struct veth_priv *priv = netdev_priv(dev); 1488 struct net_device *peer; 1489 1490 peer = rtnl_dereference(priv->peer); 1491 if (peer) { 1492 struct veth_priv *peer_priv = netdev_priv(peer); 1493 1494 if (peer_priv->_xdp_prog) 1495 features &= ~NETIF_F_GSO_SOFTWARE; 1496 } 1497 if (priv->_xdp_prog) 1498 features |= NETIF_F_GRO; 1499 1500 return features; 1501 } 1502 1503 static int veth_set_features(struct net_device *dev, 1504 netdev_features_t features) 1505 { 1506 netdev_features_t changed = features ^ dev->features; 1507 struct veth_priv *priv = netdev_priv(dev); 1508 struct net_device *peer; 1509 int err; 1510 1511 if (!(changed & NETIF_F_GRO) || !(dev->flags & IFF_UP) || priv->_xdp_prog) 1512 return 0; 1513 1514 peer = rtnl_dereference(priv->peer); 1515 if (features & NETIF_F_GRO) { 1516 err = veth_napi_enable(dev); 1517 if (err) 1518 return err; 1519 1520 if (peer) 1521 xdp_features_set_redirect_target(peer, true); 1522 } else { 1523 if (peer) 1524 xdp_features_clear_redirect_target(peer); 1525 
veth_napi_del(dev); 1526 } 1527 return 0; 1528 } 1529 1530 static void veth_set_rx_headroom(struct net_device *dev, int new_hr) 1531 { 1532 struct veth_priv *peer_priv, *priv = netdev_priv(dev); 1533 struct net_device *peer; 1534 1535 if (new_hr < 0) 1536 new_hr = 0; 1537 1538 rcu_read_lock(); 1539 peer = rcu_dereference(priv->peer); 1540 if (unlikely(!peer)) 1541 goto out; 1542 1543 peer_priv = netdev_priv(peer); 1544 priv->requested_headroom = new_hr; 1545 new_hr = max(priv->requested_headroom, peer_priv->requested_headroom); 1546 dev->needed_headroom = new_hr; 1547 peer->needed_headroom = new_hr; 1548 1549 out: 1550 rcu_read_unlock(); 1551 } 1552 1553 static int veth_xdp_set(struct net_device *dev, struct bpf_prog *prog, 1554 struct netlink_ext_ack *extack) 1555 { 1556 struct veth_priv *priv = netdev_priv(dev); 1557 struct bpf_prog *old_prog; 1558 struct net_device *peer; 1559 unsigned int max_mtu; 1560 int err; 1561 1562 old_prog = priv->_xdp_prog; 1563 priv->_xdp_prog = prog; 1564 peer = rtnl_dereference(priv->peer); 1565 1566 if (prog) { 1567 if (!peer) { 1568 NL_SET_ERR_MSG_MOD(extack, "Cannot set XDP when peer is detached"); 1569 err = -ENOTCONN; 1570 goto err; 1571 } 1572 1573 max_mtu = SKB_WITH_OVERHEAD(PAGE_SIZE - VETH_XDP_HEADROOM) - 1574 peer->hard_header_len; 1575 /* Allow increasing the max_mtu if the program supports 1576 * XDP fragments. 
1577 */ 1578 if (prog->aux->xdp_has_frags) 1579 max_mtu += PAGE_SIZE * MAX_SKB_FRAGS; 1580 1581 if (peer->mtu > max_mtu) { 1582 NL_SET_ERR_MSG_MOD(extack, "Peer MTU is too large to set XDP"); 1583 err = -ERANGE; 1584 goto err; 1585 } 1586 1587 if (dev->real_num_rx_queues < peer->real_num_tx_queues) { 1588 NL_SET_ERR_MSG_MOD(extack, "XDP expects number of rx queues not less than peer tx queues"); 1589 err = -ENOSPC; 1590 goto err; 1591 } 1592 1593 if (dev->flags & IFF_UP) { 1594 err = veth_enable_xdp(dev); 1595 if (err) { 1596 NL_SET_ERR_MSG_MOD(extack, "Setup for XDP failed"); 1597 goto err; 1598 } 1599 } 1600 1601 if (!old_prog) { 1602 peer->hw_features &= ~NETIF_F_GSO_SOFTWARE; 1603 peer->max_mtu = max_mtu; 1604 } 1605 1606 xdp_features_set_redirect_target(peer, true); 1607 } 1608 1609 if (old_prog) { 1610 if (!prog) { 1611 if (peer && !veth_gro_requested(dev)) 1612 xdp_features_clear_redirect_target(peer); 1613 1614 if (dev->flags & IFF_UP) 1615 veth_disable_xdp(dev); 1616 1617 if (peer) { 1618 peer->hw_features |= NETIF_F_GSO_SOFTWARE; 1619 peer->max_mtu = ETH_MAX_MTU; 1620 } 1621 } 1622 bpf_prog_put(old_prog); 1623 } 1624 1625 if ((!!old_prog ^ !!prog) && peer) 1626 netdev_update_features(peer); 1627 1628 return 0; 1629 err: 1630 priv->_xdp_prog = old_prog; 1631 1632 return err; 1633 } 1634 1635 static int veth_xdp(struct net_device *dev, struct netdev_bpf *xdp) 1636 { 1637 switch (xdp->command) { 1638 case XDP_SETUP_PROG: 1639 return veth_xdp_set(dev, xdp->prog, xdp->extack); 1640 default: 1641 return -EINVAL; 1642 } 1643 } 1644 1645 static int veth_xdp_rx_timestamp(const struct xdp_md *ctx, u64 *timestamp) 1646 { 1647 struct veth_xdp_buff *_ctx = (void *)ctx; 1648 1649 if (!_ctx->skb) 1650 return -ENODATA; 1651 1652 *timestamp = skb_hwtstamps(_ctx->skb)->hwtstamp; 1653 return 0; 1654 } 1655 1656 static int veth_xdp_rx_hash(const struct xdp_md *ctx, u32 *hash, 1657 enum xdp_rss_hash_type *rss_type) 1658 { 1659 struct veth_xdp_buff *_ctx = (void *)ctx; 1660 
struct sk_buff *skb = _ctx->skb; 1661 1662 if (!skb) 1663 return -ENODATA; 1664 1665 *hash = skb_get_hash(skb); 1666 *rss_type = skb->l4_hash ? XDP_RSS_TYPE_L4_ANY : XDP_RSS_TYPE_NONE; 1667 1668 return 0; 1669 } 1670 1671 static const struct net_device_ops veth_netdev_ops = { 1672 .ndo_init = veth_dev_init, 1673 .ndo_open = veth_open, 1674 .ndo_stop = veth_close, 1675 .ndo_start_xmit = veth_xmit, 1676 .ndo_get_stats64 = veth_get_stats64, 1677 .ndo_set_rx_mode = veth_set_multicast_list, 1678 .ndo_set_mac_address = eth_mac_addr, 1679 #ifdef CONFIG_NET_POLL_CONTROLLER 1680 .ndo_poll_controller = veth_poll_controller, 1681 #endif 1682 .ndo_get_iflink = veth_get_iflink, 1683 .ndo_fix_features = veth_fix_features, 1684 .ndo_set_features = veth_set_features, 1685 .ndo_features_check = passthru_features_check, 1686 .ndo_set_rx_headroom = veth_set_rx_headroom, 1687 .ndo_bpf = veth_xdp, 1688 .ndo_xdp_xmit = veth_ndo_xdp_xmit, 1689 .ndo_get_peer_dev = veth_peer_dev, 1690 }; 1691 1692 static const struct xdp_metadata_ops veth_xdp_metadata_ops = { 1693 .xmo_rx_timestamp = veth_xdp_rx_timestamp, 1694 .xmo_rx_hash = veth_xdp_rx_hash, 1695 }; 1696 1697 #define VETH_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HW_CSUM | \ 1698 NETIF_F_RXCSUM | NETIF_F_SCTP_CRC | NETIF_F_HIGHDMA | \ 1699 NETIF_F_GSO_SOFTWARE | NETIF_F_GSO_ENCAP_ALL | \ 1700 NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX | \ 1701 NETIF_F_HW_VLAN_STAG_TX | NETIF_F_HW_VLAN_STAG_RX ) 1702 1703 static void veth_setup(struct net_device *dev) 1704 { 1705 ether_setup(dev); 1706 1707 dev->priv_flags &= ~IFF_TX_SKB_SHARING; 1708 dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; 1709 dev->priv_flags |= IFF_NO_QUEUE; 1710 dev->priv_flags |= IFF_PHONY_HEADROOM; 1711 1712 dev->netdev_ops = &veth_netdev_ops; 1713 dev->xdp_metadata_ops = &veth_xdp_metadata_ops; 1714 dev->ethtool_ops = &veth_ethtool_ops; 1715 dev->features |= NETIF_F_LLTX; 1716 dev->features |= VETH_FEATURES; 1717 dev->vlan_features = dev->features & 1718 
~(NETIF_F_HW_VLAN_CTAG_TX | 1719 NETIF_F_HW_VLAN_STAG_TX | 1720 NETIF_F_HW_VLAN_CTAG_RX | 1721 NETIF_F_HW_VLAN_STAG_RX); 1722 dev->needs_free_netdev = true; 1723 dev->priv_destructor = veth_dev_free; 1724 dev->max_mtu = ETH_MAX_MTU; 1725 1726 dev->hw_features = VETH_FEATURES; 1727 dev->hw_enc_features = VETH_FEATURES; 1728 dev->mpls_features = NETIF_F_HW_CSUM | NETIF_F_GSO_SOFTWARE; 1729 netif_set_tso_max_size(dev, GSO_MAX_SIZE); 1730 } 1731 1732 /* 1733 * netlink interface 1734 */ 1735 1736 static int veth_validate(struct nlattr *tb[], struct nlattr *data[], 1737 struct netlink_ext_ack *extack) 1738 { 1739 if (tb[IFLA_ADDRESS]) { 1740 if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) 1741 return -EINVAL; 1742 if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) 1743 return -EADDRNOTAVAIL; 1744 } 1745 if (tb[IFLA_MTU]) { 1746 if (!is_valid_veth_mtu(nla_get_u32(tb[IFLA_MTU]))) 1747 return -EINVAL; 1748 } 1749 return 0; 1750 } 1751 1752 static struct rtnl_link_ops veth_link_ops; 1753 1754 static void veth_disable_gro(struct net_device *dev) 1755 { 1756 dev->features &= ~NETIF_F_GRO; 1757 dev->wanted_features &= ~NETIF_F_GRO; 1758 netdev_update_features(dev); 1759 } 1760 1761 static int veth_init_queues(struct net_device *dev, struct nlattr *tb[]) 1762 { 1763 int err; 1764 1765 if (!tb[IFLA_NUM_TX_QUEUES] && dev->num_tx_queues > 1) { 1766 err = netif_set_real_num_tx_queues(dev, 1); 1767 if (err) 1768 return err; 1769 } 1770 if (!tb[IFLA_NUM_RX_QUEUES] && dev->num_rx_queues > 1) { 1771 err = netif_set_real_num_rx_queues(dev, 1); 1772 if (err) 1773 return err; 1774 } 1775 return 0; 1776 } 1777 1778 static int veth_newlink(struct net *src_net, struct net_device *dev, 1779 struct nlattr *tb[], struct nlattr *data[], 1780 struct netlink_ext_ack *extack) 1781 { 1782 int err; 1783 struct net_device *peer; 1784 struct veth_priv *priv; 1785 char ifname[IFNAMSIZ]; 1786 struct nlattr *peer_tb[IFLA_MAX + 1], **tbp; 1787 unsigned char name_assign_type; 1788 struct ifinfomsg *ifmp; 1789 
struct net *net; 1790 1791 /* 1792 * create and register peer first 1793 */ 1794 if (data != NULL && data[VETH_INFO_PEER] != NULL) { 1795 struct nlattr *nla_peer; 1796 1797 nla_peer = data[VETH_INFO_PEER]; 1798 ifmp = nla_data(nla_peer); 1799 err = rtnl_nla_parse_ifla(peer_tb, 1800 nla_data(nla_peer) + sizeof(struct ifinfomsg), 1801 nla_len(nla_peer) - sizeof(struct ifinfomsg), 1802 NULL); 1803 if (err < 0) 1804 return err; 1805 1806 err = veth_validate(peer_tb, NULL, extack); 1807 if (err < 0) 1808 return err; 1809 1810 tbp = peer_tb; 1811 } else { 1812 ifmp = NULL; 1813 tbp = tb; 1814 } 1815 1816 if (ifmp && tbp[IFLA_IFNAME]) { 1817 nla_strscpy(ifname, tbp[IFLA_IFNAME], IFNAMSIZ); 1818 name_assign_type = NET_NAME_USER; 1819 } else { 1820 snprintf(ifname, IFNAMSIZ, DRV_NAME "%%d"); 1821 name_assign_type = NET_NAME_ENUM; 1822 } 1823 1824 net = rtnl_link_get_net(src_net, tbp); 1825 if (IS_ERR(net)) 1826 return PTR_ERR(net); 1827 1828 peer = rtnl_create_link(net, ifname, name_assign_type, 1829 &veth_link_ops, tbp, extack); 1830 if (IS_ERR(peer)) { 1831 put_net(net); 1832 return PTR_ERR(peer); 1833 } 1834 1835 if (!ifmp || !tbp[IFLA_ADDRESS]) 1836 eth_hw_addr_random(peer); 1837 1838 if (ifmp && (dev->ifindex != 0)) 1839 peer->ifindex = ifmp->ifi_index; 1840 1841 netif_inherit_tso_max(peer, dev); 1842 1843 err = register_netdevice(peer); 1844 put_net(net); 1845 net = NULL; 1846 if (err < 0) 1847 goto err_register_peer; 1848 1849 /* keep GRO disabled by default to be consistent with the established 1850 * veth behavior 1851 */ 1852 veth_disable_gro(peer); 1853 netif_carrier_off(peer); 1854 1855 err = rtnl_configure_link(peer, ifmp, 0, NULL); 1856 if (err < 0) 1857 goto err_configure_peer; 1858 1859 /* 1860 * register dev last 1861 * 1862 * note, that since we've registered new device the dev's name 1863 * should be re-allocated 1864 */ 1865 1866 if (tb[IFLA_ADDRESS] == NULL) 1867 eth_hw_addr_random(dev); 1868 1869 if (tb[IFLA_IFNAME]) 1870 nla_strscpy(dev->name, 
tb[IFLA_IFNAME], IFNAMSIZ); 1871 else 1872 snprintf(dev->name, IFNAMSIZ, DRV_NAME "%%d"); 1873 1874 err = register_netdevice(dev); 1875 if (err < 0) 1876 goto err_register_dev; 1877 1878 netif_carrier_off(dev); 1879 1880 /* 1881 * tie the deviced together 1882 */ 1883 1884 priv = netdev_priv(dev); 1885 rcu_assign_pointer(priv->peer, peer); 1886 err = veth_init_queues(dev, tb); 1887 if (err) 1888 goto err_queues; 1889 1890 priv = netdev_priv(peer); 1891 rcu_assign_pointer(priv->peer, dev); 1892 err = veth_init_queues(peer, tb); 1893 if (err) 1894 goto err_queues; 1895 1896 veth_disable_gro(dev); 1897 /* update XDP supported features */ 1898 veth_set_xdp_features(dev); 1899 veth_set_xdp_features(peer); 1900 1901 return 0; 1902 1903 err_queues: 1904 unregister_netdevice(dev); 1905 err_register_dev: 1906 /* nothing to do */ 1907 err_configure_peer: 1908 unregister_netdevice(peer); 1909 return err; 1910 1911 err_register_peer: 1912 free_netdev(peer); 1913 return err; 1914 } 1915 1916 static void veth_dellink(struct net_device *dev, struct list_head *head) 1917 { 1918 struct veth_priv *priv; 1919 struct net_device *peer; 1920 1921 priv = netdev_priv(dev); 1922 peer = rtnl_dereference(priv->peer); 1923 1924 /* Note : dellink() is called from default_device_exit_batch(), 1925 * before a rcu_synchronize() point. The devices are guaranteed 1926 * not being freed before one RCU grace period. 
1927 */ 1928 RCU_INIT_POINTER(priv->peer, NULL); 1929 unregister_netdevice_queue(dev, head); 1930 1931 if (peer) { 1932 priv = netdev_priv(peer); 1933 RCU_INIT_POINTER(priv->peer, NULL); 1934 unregister_netdevice_queue(peer, head); 1935 } 1936 } 1937 1938 static const struct nla_policy veth_policy[VETH_INFO_MAX + 1] = { 1939 [VETH_INFO_PEER] = { .len = sizeof(struct ifinfomsg) }, 1940 }; 1941 1942 static struct net *veth_get_link_net(const struct net_device *dev) 1943 { 1944 struct veth_priv *priv = netdev_priv(dev); 1945 struct net_device *peer = rtnl_dereference(priv->peer); 1946 1947 return peer ? dev_net(peer) : dev_net(dev); 1948 } 1949 1950 static unsigned int veth_get_num_queues(void) 1951 { 1952 /* enforce the same queue limit as rtnl_create_link */ 1953 int queues = num_possible_cpus(); 1954 1955 if (queues > 4096) 1956 queues = 4096; 1957 return queues; 1958 } 1959 1960 static struct rtnl_link_ops veth_link_ops = { 1961 .kind = DRV_NAME, 1962 .priv_size = sizeof(struct veth_priv), 1963 .setup = veth_setup, 1964 .validate = veth_validate, 1965 .newlink = veth_newlink, 1966 .dellink = veth_dellink, 1967 .policy = veth_policy, 1968 .maxtype = VETH_INFO_MAX, 1969 .get_link_net = veth_get_link_net, 1970 .get_num_tx_queues = veth_get_num_queues, 1971 .get_num_rx_queues = veth_get_num_queues, 1972 }; 1973 1974 /* 1975 * init/fini 1976 */ 1977 1978 static __init int veth_init(void) 1979 { 1980 return rtnl_link_register(&veth_link_ops); 1981 } 1982 1983 static __exit void veth_exit(void) 1984 { 1985 rtnl_link_unregister(&veth_link_ops); 1986 } 1987 1988 module_init(veth_init); 1989 module_exit(veth_exit); 1990 1991 MODULE_DESCRIPTION("Virtual Ethernet Tunnel"); 1992 MODULE_LICENSE("GPL v2"); 1993 MODULE_ALIAS_RTNL_LINK(DRV_NAME); 1994