1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * drivers/net/veth.c 4 * 5 * Copyright (C) 2007 OpenVZ http://openvz.org, SWsoft Inc 6 * 7 * Author: Pavel Emelianov <xemul@openvz.org> 8 * Ethtool interface from: Eric W. Biederman <ebiederm@xmission.com> 9 * 10 */ 11 12 #include <linux/netdevice.h> 13 #include <linux/slab.h> 14 #include <linux/ethtool.h> 15 #include <linux/etherdevice.h> 16 #include <linux/u64_stats_sync.h> 17 18 #include <net/rtnetlink.h> 19 #include <net/dst.h> 20 #include <net/xfrm.h> 21 #include <net/xdp.h> 22 #include <linux/veth.h> 23 #include <linux/module.h> 24 #include <linux/bpf.h> 25 #include <linux/filter.h> 26 #include <linux/ptr_ring.h> 27 #include <linux/bpf_trace.h> 28 #include <linux/net_tstamp.h> 29 #include <net/page_pool/helpers.h> 30 31 #define DRV_NAME "veth" 32 #define DRV_VERSION "1.0" 33 34 #define VETH_XDP_FLAG BIT(0) 35 #define VETH_RING_SIZE 256 36 #define VETH_XDP_HEADROOM (XDP_PACKET_HEADROOM + NET_IP_ALIGN) 37 38 #define VETH_XDP_TX_BULK_SIZE 16 39 #define VETH_XDP_BATCH 16 40 41 struct veth_stats { 42 u64 rx_drops; 43 /* xdp */ 44 u64 xdp_packets; 45 u64 xdp_bytes; 46 u64 xdp_redirect; 47 u64 xdp_drops; 48 u64 xdp_tx; 49 u64 xdp_tx_err; 50 u64 peer_tq_xdp_xmit; 51 u64 peer_tq_xdp_xmit_err; 52 }; 53 54 struct veth_rq_stats { 55 struct veth_stats vs; 56 struct u64_stats_sync syncp; 57 }; 58 59 struct veth_rq { 60 struct napi_struct xdp_napi; 61 struct napi_struct __rcu *napi; /* points to xdp_napi when the latter is initialized */ 62 struct net_device *dev; 63 struct bpf_prog __rcu *xdp_prog; 64 struct xdp_mem_info xdp_mem; 65 struct veth_rq_stats stats; 66 bool rx_notify_masked; 67 struct ptr_ring xdp_ring; 68 struct xdp_rxq_info xdp_rxq; 69 struct page_pool *page_pool; 70 }; 71 72 struct veth_priv { 73 struct net_device __rcu *peer; 74 atomic64_t dropped; 75 struct bpf_prog *_xdp_prog; 76 struct veth_rq *rq; 77 unsigned int requested_headroom; 78 }; 79 80 struct veth_xdp_tx_bq { 81 struct xdp_frame 
*q[VETH_XDP_TX_BULK_SIZE]; 82 unsigned int count; 83 }; 84 85 /* 86 * ethtool interface 87 */ 88 89 struct veth_q_stat_desc { 90 char desc[ETH_GSTRING_LEN]; 91 size_t offset; 92 }; 93 94 #define VETH_RQ_STAT(m) offsetof(struct veth_stats, m) 95 96 static const struct veth_q_stat_desc veth_rq_stats_desc[] = { 97 { "xdp_packets", VETH_RQ_STAT(xdp_packets) }, 98 { "xdp_bytes", VETH_RQ_STAT(xdp_bytes) }, 99 { "drops", VETH_RQ_STAT(rx_drops) }, 100 { "xdp_redirect", VETH_RQ_STAT(xdp_redirect) }, 101 { "xdp_drops", VETH_RQ_STAT(xdp_drops) }, 102 { "xdp_tx", VETH_RQ_STAT(xdp_tx) }, 103 { "xdp_tx_errors", VETH_RQ_STAT(xdp_tx_err) }, 104 }; 105 106 #define VETH_RQ_STATS_LEN ARRAY_SIZE(veth_rq_stats_desc) 107 108 static const struct veth_q_stat_desc veth_tq_stats_desc[] = { 109 { "xdp_xmit", VETH_RQ_STAT(peer_tq_xdp_xmit) }, 110 { "xdp_xmit_errors", VETH_RQ_STAT(peer_tq_xdp_xmit_err) }, 111 }; 112 113 #define VETH_TQ_STATS_LEN ARRAY_SIZE(veth_tq_stats_desc) 114 115 static struct { 116 const char string[ETH_GSTRING_LEN]; 117 } ethtool_stats_keys[] = { 118 { "peer_ifindex" }, 119 }; 120 121 struct veth_xdp_buff { 122 struct xdp_buff xdp; 123 struct sk_buff *skb; 124 }; 125 126 static int veth_get_link_ksettings(struct net_device *dev, 127 struct ethtool_link_ksettings *cmd) 128 { 129 cmd->base.speed = SPEED_10000; 130 cmd->base.duplex = DUPLEX_FULL; 131 cmd->base.port = PORT_TP; 132 cmd->base.autoneg = AUTONEG_DISABLE; 133 return 0; 134 } 135 136 static void veth_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info) 137 { 138 strscpy(info->driver, DRV_NAME, sizeof(info->driver)); 139 strscpy(info->version, DRV_VERSION, sizeof(info->version)); 140 } 141 142 static void veth_get_strings(struct net_device *dev, u32 stringset, u8 *buf) 143 { 144 u8 *p = buf; 145 int i, j; 146 147 switch(stringset) { 148 case ETH_SS_STATS: 149 memcpy(p, ðtool_stats_keys, sizeof(ethtool_stats_keys)); 150 p += sizeof(ethtool_stats_keys); 151 for (i = 0; i < dev->real_num_rx_queues; i++) 
152 for (j = 0; j < VETH_RQ_STATS_LEN; j++) 153 ethtool_sprintf(&p, "rx_queue_%u_%.18s", 154 i, veth_rq_stats_desc[j].desc); 155 156 for (i = 0; i < dev->real_num_tx_queues; i++) 157 for (j = 0; j < VETH_TQ_STATS_LEN; j++) 158 ethtool_sprintf(&p, "tx_queue_%u_%.18s", 159 i, veth_tq_stats_desc[j].desc); 160 161 page_pool_ethtool_stats_get_strings(p); 162 break; 163 } 164 } 165 166 static int veth_get_sset_count(struct net_device *dev, int sset) 167 { 168 switch (sset) { 169 case ETH_SS_STATS: 170 return ARRAY_SIZE(ethtool_stats_keys) + 171 VETH_RQ_STATS_LEN * dev->real_num_rx_queues + 172 VETH_TQ_STATS_LEN * dev->real_num_tx_queues + 173 page_pool_ethtool_stats_get_count(); 174 default: 175 return -EOPNOTSUPP; 176 } 177 } 178 179 static void veth_get_page_pool_stats(struct net_device *dev, u64 *data) 180 { 181 #ifdef CONFIG_PAGE_POOL_STATS 182 struct veth_priv *priv = netdev_priv(dev); 183 struct page_pool_stats pp_stats = {}; 184 int i; 185 186 for (i = 0; i < dev->real_num_rx_queues; i++) { 187 if (!priv->rq[i].page_pool) 188 continue; 189 page_pool_get_stats(priv->rq[i].page_pool, &pp_stats); 190 } 191 page_pool_ethtool_stats_get(data, &pp_stats); 192 #endif /* CONFIG_PAGE_POOL_STATS */ 193 } 194 195 static void veth_get_ethtool_stats(struct net_device *dev, 196 struct ethtool_stats *stats, u64 *data) 197 { 198 struct veth_priv *rcv_priv, *priv = netdev_priv(dev); 199 struct net_device *peer = rtnl_dereference(priv->peer); 200 int i, j, idx, pp_idx; 201 202 data[0] = peer ? 
peer->ifindex : 0; 203 idx = 1; 204 for (i = 0; i < dev->real_num_rx_queues; i++) { 205 const struct veth_rq_stats *rq_stats = &priv->rq[i].stats; 206 const void *stats_base = (void *)&rq_stats->vs; 207 unsigned int start; 208 size_t offset; 209 210 do { 211 start = u64_stats_fetch_begin(&rq_stats->syncp); 212 for (j = 0; j < VETH_RQ_STATS_LEN; j++) { 213 offset = veth_rq_stats_desc[j].offset; 214 data[idx + j] = *(u64 *)(stats_base + offset); 215 } 216 } while (u64_stats_fetch_retry(&rq_stats->syncp, start)); 217 idx += VETH_RQ_STATS_LEN; 218 } 219 pp_idx = idx; 220 221 if (!peer) 222 goto page_pool_stats; 223 224 rcv_priv = netdev_priv(peer); 225 for (i = 0; i < peer->real_num_rx_queues; i++) { 226 const struct veth_rq_stats *rq_stats = &rcv_priv->rq[i].stats; 227 const void *base = (void *)&rq_stats->vs; 228 unsigned int start, tx_idx = idx; 229 size_t offset; 230 231 tx_idx += (i % dev->real_num_tx_queues) * VETH_TQ_STATS_LEN; 232 do { 233 start = u64_stats_fetch_begin(&rq_stats->syncp); 234 for (j = 0; j < VETH_TQ_STATS_LEN; j++) { 235 offset = veth_tq_stats_desc[j].offset; 236 data[tx_idx + j] += *(u64 *)(base + offset); 237 } 238 } while (u64_stats_fetch_retry(&rq_stats->syncp, start)); 239 } 240 pp_idx = idx + dev->real_num_tx_queues * VETH_TQ_STATS_LEN; 241 242 page_pool_stats: 243 veth_get_page_pool_stats(dev, &data[pp_idx]); 244 } 245 246 static void veth_get_channels(struct net_device *dev, 247 struct ethtool_channels *channels) 248 { 249 channels->tx_count = dev->real_num_tx_queues; 250 channels->rx_count = dev->real_num_rx_queues; 251 channels->max_tx = dev->num_tx_queues; 252 channels->max_rx = dev->num_rx_queues; 253 } 254 255 static int veth_set_channels(struct net_device *dev, 256 struct ethtool_channels *ch); 257 258 static const struct ethtool_ops veth_ethtool_ops = { 259 .get_drvinfo = veth_get_drvinfo, 260 .get_link = ethtool_op_get_link, 261 .get_strings = veth_get_strings, 262 .get_sset_count = veth_get_sset_count, 263 .get_ethtool_stats = 
veth_get_ethtool_stats, 264 .get_link_ksettings = veth_get_link_ksettings, 265 .get_ts_info = ethtool_op_get_ts_info, 266 .get_channels = veth_get_channels, 267 .set_channels = veth_set_channels, 268 }; 269 270 /* general routines */ 271 272 static bool veth_is_xdp_frame(void *ptr) 273 { 274 return (unsigned long)ptr & VETH_XDP_FLAG; 275 } 276 277 static struct xdp_frame *veth_ptr_to_xdp(void *ptr) 278 { 279 return (void *)((unsigned long)ptr & ~VETH_XDP_FLAG); 280 } 281 282 static void *veth_xdp_to_ptr(struct xdp_frame *xdp) 283 { 284 return (void *)((unsigned long)xdp | VETH_XDP_FLAG); 285 } 286 287 static void veth_ptr_free(void *ptr) 288 { 289 if (veth_is_xdp_frame(ptr)) 290 xdp_return_frame(veth_ptr_to_xdp(ptr)); 291 else 292 kfree_skb(ptr); 293 } 294 295 static void __veth_xdp_flush(struct veth_rq *rq) 296 { 297 /* Write ptr_ring before reading rx_notify_masked */ 298 smp_mb(); 299 if (!READ_ONCE(rq->rx_notify_masked) && 300 napi_schedule_prep(&rq->xdp_napi)) { 301 WRITE_ONCE(rq->rx_notify_masked, true); 302 __napi_schedule(&rq->xdp_napi); 303 } 304 } 305 306 static int veth_xdp_rx(struct veth_rq *rq, struct sk_buff *skb) 307 { 308 if (unlikely(ptr_ring_produce(&rq->xdp_ring, skb))) { 309 dev_kfree_skb_any(skb); 310 return NET_RX_DROP; 311 } 312 313 return NET_RX_SUCCESS; 314 } 315 316 static int veth_forward_skb(struct net_device *dev, struct sk_buff *skb, 317 struct veth_rq *rq, bool xdp) 318 { 319 return __dev_forward_skb(dev, skb) ?: xdp ? 320 veth_xdp_rx(rq, skb) : 321 __netif_rx(skb); 322 } 323 324 /* return true if the specified skb has chances of GRO aggregation 325 * Don't strive for accuracy, but try to avoid GRO overhead in the most 326 * common scenarios. 327 * When XDP is enabled, all traffic is considered eligible, as the xmit 328 * device has TSO off. 
 * When TSO is enabled on the xmit device, we are likely interested only
 * in UDP aggregation, explicitly check for that if the skb is suspected
 * - the sock_wfree destructor is used by UDP, ICMP and XDP sockets -
 * to belong to locally generated UDP traffic.
 */
static bool veth_skb_is_eligible_for_gro(const struct net_device *dev,
					 const struct net_device *rcv,
					 const struct sk_buff *skb)
{
	return !(dev->features & NETIF_F_ALL_TSO) ||
		(skb->destructor == sock_wfree &&
		 rcv->features & (NETIF_F_GRO_FRAGLIST | NETIF_F_GRO_UDP_FWD));
}

/*
 * Transmit handler: hand @skb over to the peer device.
 *
 * The skb is dropped and counted in priv->dropped when there is no peer,
 * when the Ethernet header cannot be pulled, or when forwarding fails.
 * If the peer rx queue selected by the skb's queue mapping has a napi
 * pointer and the skb looks GRO-eligible, the skb goes to that queue's
 * ring and its NAPI is kicked; otherwise it is delivered directly and the
 * software tx counters are bumped here.
 *
 * Returns NETDEV_TX_OK, or NET_XMIT_DROP when the skb was dropped.
 */
static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct veth_priv *rcv_priv, *priv = netdev_priv(dev);
	struct veth_rq *rq = NULL;
	int ret = NETDEV_TX_OK;
	struct net_device *rcv;
	/* sampled up front; the skb is handed off by veth_forward_skb() */
	int length = skb->len;
	bool use_napi = false;
	int rxq;

	rcu_read_lock();
	rcv = rcu_dereference(priv->peer);
	if (unlikely(!rcv) || !pskb_may_pull(skb, ETH_HLEN)) {
		kfree_skb(skb);
		goto drop;
	}

	rcv_priv = netdev_priv(rcv);
	rxq = skb_get_queue_mapping(skb);
	if (rxq < rcv->real_num_rx_queues) {
		rq = &rcv_priv->rq[rxq];

		/* The napi pointer is available when an XDP program is
		 * attached or when GRO is enabled
		 * Don't bother with napi/GRO if the skb can't be aggregated
		 */
		use_napi = rcu_access_pointer(rq->napi) &&
			   veth_skb_is_eligible_for_gro(dev, rcv, skb);
	}

	skb_tx_timestamp(skb);
	if (likely(veth_forward_skb(rcv, skb, rq, use_napi) == NET_RX_SUCCESS)) {
		if (!use_napi)
			dev_sw_netstats_tx_add(dev, 1, length);
		else
			__veth_xdp_flush(rq);
	} else {
drop:
		atomic64_inc(&priv->dropped);
		ret = NET_XMIT_DROP;
	}

	rcu_read_unlock();

	return ret;
}

/*
 * Sum a subset of the per-queue counters of @dev into @result.
 *
 * Only peer_tq_xdp_xmit_err, xdp_tx_err, xdp_packets, xdp_bytes and
 * rx_drops are reset and accumulated; the other fields of @result are not
 * touched. The walk covers num_rx_queues, not just the active queues.
 */
static void veth_stats_rx(struct veth_stats *result, struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);
	int i;

	result->peer_tq_xdp_xmit_err = 0;
	result->xdp_packets = 0;
	result->xdp_tx_err = 0;
	result->xdp_bytes = 0;
	result->rx_drops = 0;
	for (i = 0; i < dev->num_rx_queues; i++) {
		u64 packets, bytes, drops, xdp_tx_err, peer_tq_xdp_xmit_err;
		struct veth_rq_stats *stats = &priv->rq[i].stats;
		unsigned int start;

		/* take a consistent snapshot, then accumulate outside the
		 * retry loop
		 */
		do {
			start = u64_stats_fetch_begin(&stats->syncp);
			peer_tq_xdp_xmit_err = stats->vs.peer_tq_xdp_xmit_err;
			xdp_tx_err = stats->vs.xdp_tx_err;
			packets = stats->vs.xdp_packets;
			bytes = stats->vs.xdp_bytes;
			drops = stats->vs.rx_drops;
		} while (u64_stats_fetch_retry(&stats->syncp, start));
		result->peer_tq_xdp_xmit_err += peer_tq_xdp_xmit_err;
		result->xdp_tx_err += xdp_tx_err;
		result->xdp_packets += packets;
		result->xdp_bytes += bytes;
		result->rx_drops += drops;
	}
}

/*
 * Build the device's 64-bit link statistics from three sources: the
 * device's own software counters and priv->dropped, the device's rx-queue
 * counters, and (when a peer exists) the peer's software tx counters and
 * rx-queue counters, which are mapped onto this device's rx/tx totals.
 */
static void veth_get_stats64(struct net_device *dev,
			     struct rtnl_link_stats64 *tot)
{
	struct veth_priv *priv = netdev_priv(dev);
	struct net_device *peer;
	struct veth_stats rx;

	tot->tx_dropped = atomic64_read(&priv->dropped);
	dev_fetch_sw_netstats(tot, dev->tstats);

	veth_stats_rx(&rx, dev);
	tot->tx_dropped += rx.xdp_tx_err;
	tot->rx_dropped = rx.rx_drops + rx.peer_tq_xdp_xmit_err;
	tot->rx_bytes += rx.xdp_bytes;
	tot->rx_packets += rx.xdp_packets;

	rcu_read_lock();
	peer = rcu_dereference(priv->peer);
	if (peer) {
		struct rtnl_link_stats64 tot_peer = {};

		/* what the peer transmitted is what this device received */
		dev_fetch_sw_netstats(&tot_peer, peer->tstats);
		tot->rx_bytes += tot_peer.tx_bytes;
		tot->rx_packets += tot_peer.tx_packets;

		veth_stats_rx(&rx, peer);
		tot->tx_dropped += rx.peer_tq_xdp_xmit_err;
		tot->rx_dropped += rx.xdp_tx_err;
		tot->tx_bytes += rx.xdp_bytes;
		tot->tx_packets += rx.xdp_packets;
	}
	rcu_read_unlock();
}

/* fake multicast ability */
static void veth_set_multicast_list(struct net_device *dev)
{
}

/* Pick an rx queue index from the current CPU id, modulo the active queues. */
static int veth_select_rxq(struct net_device *dev)
{
	return smp_processor_id() % dev->real_num_rx_queues;
}

/* Return the peer of @dev (may be NULL). */
static struct net_device *veth_peer_dev(struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);

	/* Callers must be under RCU read side. */
	return rcu_dereference(priv->peer);
}

/*
 * Queue up to @n XDP frames on one rx queue of the peer.
 *
 * Queueing stops at the first frame that is longer than the peer's
 * mtu + hard_header_len + VLAN_HLEN or that the ring does not accept;
 * frames that were not queued are left to the caller. With
 * XDP_XMIT_FLUSH the peer queue's NAPI is kicked afterwards. When
 * @ndo_xmit is true the peer_tq_* counters of the chosen queue are
 * updated.
 *
 * Returns the number of frames queued, -EINVAL for unknown @flags, or
 * -ENXIO when there is no peer or the chosen queue has no napi pointer.
 */
static int veth_xdp_xmit(struct net_device *dev, int n,
			 struct xdp_frame **frames,
			 u32 flags, bool ndo_xmit)
{
	struct veth_priv *rcv_priv, *priv = netdev_priv(dev);
	int i, ret = -ENXIO, nxmit = 0;
	struct net_device *rcv;
	unsigned int max_len;
	struct veth_rq *rq;

	if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK))
		return -EINVAL;

	rcu_read_lock();
	rcv = rcu_dereference(priv->peer);
	if (unlikely(!rcv))
		goto out;

	rcv_priv = netdev_priv(rcv);
	rq = &rcv_priv->rq[veth_select_rxq(rcv)];
	/* The napi pointer is set if NAPI is enabled, which ensures that
	 * xdp_ring is initialized on receive side and the peer device is up.
	 */
	if (!rcu_access_pointer(rq->napi))
		goto out;

	max_len = rcv->mtu + rcv->hard_header_len + VLAN_HLEN;

	/* one lock round-trip for the whole batch */
	spin_lock(&rq->xdp_ring.producer_lock);
	for (i = 0; i < n; i++) {
		struct xdp_frame *frame = frames[i];
		void *ptr = veth_xdp_to_ptr(frame);

		if (unlikely(xdp_get_frame_len(frame) > max_len ||
			     __ptr_ring_produce(&rq->xdp_ring, ptr)))
			break;
		nxmit++;
	}
	spin_unlock(&rq->xdp_ring.producer_lock);

	if (flags & XDP_XMIT_FLUSH)
		__veth_xdp_flush(rq);

	ret = nxmit;
	if (ndo_xmit) {
		u64_stats_update_begin(&rq->stats.syncp);
		rq->stats.vs.peer_tq_xdp_xmit += nxmit;
		rq->stats.vs.peer_tq_xdp_xmit_err += n - nxmit;
		u64_stats_update_end(&rq->stats.syncp);
	}

out:
	rcu_read_unlock();

	return ret;
}

/*
 * XDP transmit entry point taking the standard four arguments: forwards to
 * veth_xdp_xmit() with stats accounting on, and counts all @n frames as
 * dropped when that call fails outright.
 */
static int veth_ndo_xdp_xmit(struct net_device *dev, int n,
			     struct xdp_frame **frames, u32 flags)
{
	int err;

	err = veth_xdp_xmit(dev, n, frames, flags, true);
	if (err < 0) {
		struct veth_priv *priv = netdev_priv(dev);

		atomic64_add(n, &priv->dropped);
	}

	return err;
}

/*
 * Send the frames collected in @bq through veth_xdp_xmit() (no flush flag,
 * no peer_tq accounting). Frames that were not accepted are returned, the
 * xdp_tx / xdp_tx_err counters of @rq are updated and @bq is emptied.
 */
static void veth_xdp_flush_bq(struct veth_rq *rq, struct veth_xdp_tx_bq *bq)
{
	int sent, i, err = 0, drops;

	sent = veth_xdp_xmit(rq->dev, bq->count, bq->q, 0, false);
	if (sent < 0) {
		err = sent;
		sent = 0;
	}

	/* release whatever the peer did not take */
	for (i = sent; unlikely(i < bq->count); i++)
		xdp_return_frame(bq->q[i]);

	drops = bq->count - sent;
	trace_xdp_bulk_tx(rq->dev, sent, drops, err);

	u64_stats_update_begin(&rq->stats.syncp);
	rq->stats.vs.xdp_tx += sent;
	rq->stats.vs.xdp_tx_err += drops;
	u64_stats_update_end(&rq->stats.syncp);

	bq->count = 0;
}

/*
 * Flush the pending XDP_TX batch, then kick the NAPI of the peer rx queue
 * selected for the current CPU, provided the peer exists and that queue
 * has an XDP program attached.
 */
static void veth_xdp_flush(struct veth_rq *rq, struct veth_xdp_tx_bq *bq)
{
	struct veth_priv *rcv_priv, *priv = netdev_priv(rq->dev);
	struct net_device *rcv;
	struct veth_rq *rcv_rq;

	rcu_read_lock();
	veth_xdp_flush_bq(rq, bq);
	rcv = rcu_dereference(priv->peer);
	if (unlikely(!rcv))
		goto out;

	rcv_priv = netdev_priv(rcv);
	rcv_rq = &rcv_priv->rq[veth_select_rxq(rcv)];
	/* xdp_ring is initialized on receive side? */
	if (unlikely(!rcu_access_pointer(rcv_rq->xdp_prog)))
		goto out;

	__veth_xdp_flush(rcv_rq);
out:
	rcu_read_unlock();
}

/*
 * Convert @xdp to a frame and append it to the XDP_TX batch, flushing the
 * batch first when it is already full.
 *
 * Returns 0, or -EOVERFLOW when the buffer cannot be converted.
 */
static int veth_xdp_tx(struct veth_rq *rq, struct xdp_buff *xdp,
		       struct veth_xdp_tx_bq *bq)
{
	struct xdp_frame *frame = xdp_convert_buff_to_frame(xdp);

	if (unlikely(!frame))
		return -EOVERFLOW;

	if (unlikely(bq->count == VETH_XDP_TX_BULK_SIZE))
		veth_xdp_flush_bq(rq, bq);

	bq->q[bq->count++] = frame;

	return 0;
}

/*
 * Run the queue's XDP program (if any) on a frame taken from the ring.
 *
 * Returns the frame when it should continue up the stack (no program
 * attached, or XDP_PASS), NULL when it was consumed: queued for XDP_TX,
 * redirected, or dropped and returned. @stats is updated for drops, tx
 * and redirects.
 */
static struct xdp_frame *veth_xdp_rcv_one(struct veth_rq *rq,
					  struct xdp_frame *frame,
					  struct veth_xdp_tx_bq *bq,
					  struct veth_stats *stats)
{
	struct xdp_frame orig_frame;
	struct bpf_prog *xdp_prog;

	rcu_read_lock();
	xdp_prog = rcu_dereference(rq->xdp_prog);
	if (likely(xdp_prog)) {
		struct veth_xdp_buff vxbuf;
		struct xdp_buff *xdp = &vxbuf.xdp;
		u32 act;

		xdp_convert_frame_to_buff(frame, xdp);
		xdp->rxq = &rq->xdp_rxq;
		vxbuf.skb = NULL;

		act = bpf_prog_run_xdp(xdp_prog, xdp);

		switch (act) {
		case XDP_PASS:
			if (xdp_update_frame_from_buff(xdp, frame))
				goto err_xdp;
			break;
		case XDP_TX:
			/* keep a copy so the error path can still return
			 * the frame
			 */
			orig_frame = *frame;
			xdp->rxq->mem = frame->mem;
			if (unlikely(veth_xdp_tx(rq, xdp, bq) < 0)) {
				trace_xdp_exception(rq->dev, xdp_prog, act);
				frame = &orig_frame;
				stats->rx_drops++;
				goto err_xdp;
			}
			stats->xdp_tx++;
			rcu_read_unlock();
			goto xdp_xmit;
		case XDP_REDIRECT:
			orig_frame = *frame;
			xdp->rxq->mem = frame->mem;
			if (xdp_do_redirect(rq->dev, xdp, xdp_prog)) {
				frame = &orig_frame;
				stats->rx_drops++;
				goto err_xdp;
			}
			stats->xdp_redirect++;
			rcu_read_unlock();
			goto xdp_xmit;
		default:
			bpf_warn_invalid_xdp_action(rq->dev, xdp_prog, act);
			fallthrough;
		case XDP_ABORTED:
			trace_xdp_exception(rq->dev, xdp_prog, act);
			fallthrough;
		case XDP_DROP:
			stats->xdp_drops++;
			goto err_xdp;
		}
	}
	rcu_read_unlock();

	return frame;
err_xdp:
	rcu_read_unlock();
	xdp_return_frame(frame);
xdp_xmit:
	return NULL;
}

/* frames array contains VETH_XDP_BATCH at most */
/*
 * Turn a batch of passed frames into skbs and feed them to GRO. If the
 * bulk skb allocation fails the whole batch is returned and counted as
 * rx_drops; a frame that cannot be built into an skb is returned and
 * counted individually.
 */
static void veth_xdp_rcv_bulk_skb(struct veth_rq *rq, void **frames,
				  int n_xdpf, struct veth_xdp_tx_bq *bq,
				  struct veth_stats *stats)
{
	void *skbs[VETH_XDP_BATCH];
	int i;

	if (xdp_alloc_skb_bulk(skbs, n_xdpf,
			       GFP_ATOMIC | __GFP_ZERO) < 0) {
		for (i = 0; i < n_xdpf; i++)
			xdp_return_frame(frames[i]);
		stats->rx_drops += n_xdpf;

		return;
	}

	for (i = 0; i < n_xdpf; i++) {
		struct sk_buff *skb = skbs[i];

		skb = __xdp_build_skb_from_frame(frames[i], skb,
						 rq->dev);
		if (!skb) {
			xdp_return_frame(frames[i]);
			stats->rx_drops++;
			continue;
		}
		napi_gro_receive(&rq->xdp_napi, skb);
	}
}

/*
 * Take an extra reference on the page holding the buffer's data and, when
 * the buffer has frags, on each frag.
 */
static void veth_xdp_get(struct xdp_buff *xdp)
{
	struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp);
	int i;

	get_page(virt_to_page(xdp->data));
	if (likely(!xdp_buff_has_frags(xdp)))
		return;

	for (i = 0; i < sinfo->nr_frags; i++)
		__skb_frag_ref(&sinfo->frags[i]);
}

/*
 * Describe *@pskb as an xdp_buff in @xdp.
 *
 * When the skb is shared, has a locked head, carries frags or lacks
 * XDP_PACKET_HEADROOM of headroom, its contents are first copied into a
 * new skb built from page-pool pages; the original is consumed and *@pskb
 * is updated to the copy.
 *
 * Returns 0 on success. On failure returns -ENOMEM, the skb is consumed
 * and *@pskb is set to NULL.
 */
static int veth_convert_skb_to_xdp_buff(struct veth_rq *rq,
					struct xdp_buff *xdp,
					struct sk_buff **pskb)
{
	struct sk_buff *skb = *pskb;
	u32 frame_sz;

	if (skb_shared(skb) || skb_head_is_locked(skb) ||
	    skb_shinfo(skb)->nr_frags ||
	    skb_headroom(skb) < XDP_PACKET_HEADROOM) {
		u32 size, len, max_head_size, off;
		struct sk_buff *nskb;
		struct page *page;
		int i, head_off;

		/* We need a private copy of the skb and data buffers since
		 * the ebpf program can modify it. We segment the original skb
		 * into order-0 pages without linearizing it.
		 *
		 * Make sure we have enough space for linear and paged area
		 */
		max_head_size = SKB_WITH_OVERHEAD(PAGE_SIZE -
						  VETH_XDP_HEADROOM);
		if (skb->len > PAGE_SIZE * MAX_SKB_FRAGS + max_head_size)
			goto drop;

		/* Allocate skb head */
		page = page_pool_dev_alloc_pages(rq->page_pool);
		if (!page)
			goto drop;

		nskb = napi_build_skb(page_address(page), PAGE_SIZE);
		if (!nskb) {
			page_pool_put_full_page(rq->page_pool, page, true);
			goto drop;
		}

		skb_reserve(nskb, VETH_XDP_HEADROOM);
		skb_copy_header(nskb, skb);
		skb_mark_for_recycle(nskb);

		/* linear part: as much as fits into the head page */
		size = min_t(u32, skb->len, max_head_size);
		if (skb_copy_bits(skb, 0, nskb->data, size)) {
			consume_skb(nskb);
			goto drop;
		}
		skb_put(nskb, size);

		head_off = skb_headroom(nskb) - skb_headroom(skb);
		skb_headers_offset_update(nskb, head_off);

		/* Allocate paged area of new skb */
		off = size;
		len = skb->len - off;

		for (i = 0; i < MAX_SKB_FRAGS && off < skb->len; i++) {
			page = page_pool_dev_alloc_pages(rq->page_pool);
			if (!page) {
				consume_skb(nskb);
				goto drop;
			}

			size = min_t(u32, len, PAGE_SIZE);
			skb_add_rx_frag(nskb, i, page, 0, size, PAGE_SIZE);
			if (skb_copy_bits(skb, off, page_address(page),
					  size)) {
				consume_skb(nskb);
				goto drop;
			}

			len -= size;
			off += size;
		}

		consume_skb(skb);
		skb = nskb;
	}

	/* SKB "head" area always has tailroom for skb_shared_info */
	frame_sz = skb_end_pointer(skb) - skb->head;
	frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
	xdp_init_buff(xdp, frame_sz, &rq->xdp_rxq);
	xdp_prepare_buff(xdp, skb->head, skb_headroom(skb),
			 skb_headlen(skb), true);

	if (skb_is_nonlinear(skb)) {
		skb_shinfo(skb)->xdp_frags_size = skb->data_len;
		xdp_buff_set_frags_flag(xdp);
	} else {
		xdp_buff_clear_frags_flag(xdp);
	}
	*pskb = skb;

	return 0;
drop:
	consume_skb(skb);
	*pskb = NULL;

	return -ENOMEM;
}

/*
 * Run the queue's XDP program (if any) on an skb taken from the ring.
 *
 * Returns the skb to pass up the stack: the original when no program is
 * attached, or the (possibly re-allocated and adjusted) skb on XDP_PASS.
 * Returns NULL when the packet was consumed by XDP_TX / XDP_REDIRECT or
 * dropped. @stats is updated accordingly.
 */
static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq,
					struct sk_buff *skb,
					struct veth_xdp_tx_bq *bq,
					struct veth_stats *stats)
{
	void *orig_data, *orig_data_end;
	struct bpf_prog *xdp_prog;
	struct veth_xdp_buff vxbuf;
	struct xdp_buff *xdp = &vxbuf.xdp;
	u32 act, metalen;
	int off;

	skb_prepare_for_gro(skb);

	rcu_read_lock();
	xdp_prog = rcu_dereference(rq->xdp_prog);
	if (unlikely(!xdp_prog)) {
		rcu_read_unlock();
		goto out;
	}

	/* expose the MAC header to the program */
	__skb_push(skb, skb->data - skb_mac_header(skb));
	if (veth_convert_skb_to_xdp_buff(rq, xdp, &skb))
		goto drop;
	vxbuf.skb = skb;

	orig_data = xdp->data;
	orig_data_end = xdp->data_end;

	act = bpf_prog_run_xdp(xdp_prog, xdp);

	switch (act) {
	case XDP_PASS:
		break;
	case XDP_TX:
		/* take page references for the xdp_buff before the skb
		 * itself is released
		 */
		veth_xdp_get(xdp);
		consume_skb(skb);
		xdp->rxq->mem = rq->xdp_mem;
		if (unlikely(veth_xdp_tx(rq, xdp, bq) < 0)) {
			trace_xdp_exception(rq->dev, xdp_prog, act);
			stats->rx_drops++;
			goto err_xdp;
		}
		stats->xdp_tx++;
		rcu_read_unlock();
		goto xdp_xmit;
	case XDP_REDIRECT:
		veth_xdp_get(xdp);
		consume_skb(skb);
		xdp->rxq->mem = rq->xdp_mem;
		if (xdp_do_redirect(rq->dev, xdp, xdp_prog)) {
			stats->rx_drops++;
			goto err_xdp;
		}
		stats->xdp_redirect++;
		rcu_read_unlock();
		goto xdp_xmit;
	default:
		bpf_warn_invalid_xdp_action(rq->dev, xdp_prog, act);
		fallthrough;
	case XDP_ABORTED:
		trace_xdp_exception(rq->dev, xdp_prog, act);
		fallthrough;
	case XDP_DROP:
		stats->xdp_drops++;
		goto xdp_drop;
	}
	rcu_read_unlock();

	/* check if bpf_xdp_adjust_head was used */
	off = orig_data - xdp->data;
	if (off > 0)
		__skb_push(skb, off);
	else if (off < 0)
		__skb_pull(skb, -off);

	skb_reset_mac_header(skb);

	/* check if bpf_xdp_adjust_tail was used */
	off = xdp->data_end - orig_data_end;
	if (off != 0)
		__skb_put(skb, off); /* positive on grow, negative on shrink */

	/* XDP frag metadata (e.g. nr_frags) are updated in eBPF helpers
	 * (e.g. bpf_xdp_adjust_tail), we need to update data_len here.
	 */
	if (xdp_buff_has_frags(xdp))
		skb->data_len = skb_shinfo(skb)->xdp_frags_size;
	else
		skb->data_len = 0;

	skb->protocol = eth_type_trans(skb, rq->dev);

	metalen = xdp->data - xdp->data_meta;
	if (metalen)
		skb_metadata_set(skb, metalen);
out:
	return skb;
drop:
	stats->rx_drops++;
xdp_drop:
	rcu_read_unlock();
	kfree_skb(skb);
	return NULL;
err_xdp:
	rcu_read_unlock();
	xdp_return_buff(xdp);
xdp_xmit:
	return NULL;
}

/*
 * Consume up to @budget entries from the queue's ring and process each as
 * an xdp_frame or an skb, depending on its tag bit. Passed frames are
 * batched (VETH_XDP_BATCH at a time) into veth_xdp_rcv_bulk_skb(); passed
 * skbs are delivered immediately. The per-call counters in @stats are
 * folded into the queue's stats before returning.
 *
 * Returns the number of ring entries consumed.
 */
static int veth_xdp_rcv(struct veth_rq *rq, int budget,
			struct veth_xdp_tx_bq *bq,
			struct veth_stats *stats)
{
	int i, done = 0, n_xdpf = 0;
	void *xdpf[VETH_XDP_BATCH];

	for (i = 0; i < budget; i++) {
		void *ptr = __ptr_ring_consume(&rq->xdp_ring);

		if (!ptr)
			break;

		if (veth_is_xdp_frame(ptr)) {
			/* ndo_xdp_xmit */
			struct xdp_frame *frame = veth_ptr_to_xdp(ptr);

			stats->xdp_bytes += xdp_get_frame_len(frame);
			frame = veth_xdp_rcv_one(rq, frame, bq, stats);
			if (frame) {
				/* XDP_PASS */
				xdpf[n_xdpf++] = frame;
				if (n_xdpf == VETH_XDP_BATCH) {
					veth_xdp_rcv_bulk_skb(rq, xdpf, n_xdpf,
							      bq, stats);
					n_xdpf = 0;
				}
			}
		} else {
			/* ndo_start_xmit */
			struct sk_buff *skb = ptr;

			stats->xdp_bytes += skb->len;
			skb = veth_xdp_rcv_skb(rq, skb, bq, stats);
			if (skb) {
				/* shared skbs, and skbs whose unclone
				 * fails, bypass GRO
				 */
				if (skb_shared(skb) || skb_unclone(skb, GFP_ATOMIC))
					netif_receive_skb(skb);
				else
					napi_gro_receive(&rq->xdp_napi, skb);
			}
		}
		done++;
	}

	/* deliver the last, partially filled batch */
	if (n_xdpf)
		veth_xdp_rcv_bulk_skb(rq, xdpf, n_xdpf, bq, stats);

	u64_stats_update_begin(&rq->stats.syncp);
	rq->stats.vs.xdp_redirect += stats->xdp_redirect;
	rq->stats.vs.xdp_bytes += stats->xdp_bytes;
	rq->stats.vs.xdp_drops += stats->xdp_drops;
	rq->stats.vs.rx_drops += stats->rx_drops;
	rq->stats.vs.xdp_packets += done;
	u64_stats_update_end(&rq->stats.syncp);

	return done;
}

/*
 * NAPI poll callback: process up to @budget ring entries, flush pending
 * redirects, complete NAPI when under budget (re-scheduling if the ring
 * filled up in the meantime) and finally flush the XDP_TX batch.
 *
 * Returns the number of entries processed.
 */
static int veth_poll(struct napi_struct *napi, int budget)
{
	struct veth_rq *rq =
		container_of(napi, struct veth_rq, xdp_napi);
	struct veth_stats stats = {};
	struct veth_xdp_tx_bq bq;
	int done;

	bq.count = 0;

	xdp_set_return_frame_no_direct();
	done = veth_xdp_rcv(rq, budget, &bq, &stats);

	if (stats.xdp_redirect > 0)
		xdp_do_flush();

	if (done < budget && napi_complete_done(napi, done)) {
		/* Write rx_notify_masked before reading ptr_ring */
		smp_store_mb(rq->rx_notify_masked, false);
		if (unlikely(!__ptr_ring_empty(&rq->xdp_ring))) {
			if (napi_schedule_prep(&rq->xdp_napi)) {
				WRITE_ONCE(rq->rx_notify_masked, true);
				__napi_schedule(&rq->xdp_napi);
			}
		}
	}

	if (stats.xdp_tx > 0)
		veth_xdp_flush(rq, &bq);
	xdp_clear_return_frame_no_direct();

	return done;
}

/*
 * Create the queue's page pool (order-0 pages, VETH_RING_SIZE entries, no
 * NUMA preference). On failure rq->page_pool is left NULL and the error
 * code is returned; returns 0 on success.
 */
static int veth_create_page_pool(struct veth_rq *rq)
{
	struct page_pool_params pp_params = {
		.order = 0,
		.pool_size = VETH_RING_SIZE,
		.nid = NUMA_NO_NODE,
		.dev = &rq->dev->dev,
	};

	rq->page_pool = page_pool_create(&pp_params);
	if (IS_ERR(rq->page_pool)) {
		int err = PTR_ERR(rq->page_pool);

		rq->page_pool = NULL;
		return err;
	}

	return 0;
}

/*
 * For rx queues [@start, @end): create the page pools, initialise the
 * rings, then enable NAPI and publish the napi pointers. On failure
 * everything set up so far for the range is torn down again.
 *
 * Returns 0 or a negative error code.
 */
static int __veth_napi_enable_range(struct net_device *dev, int start, int end)
{
	struct veth_priv *priv = netdev_priv(dev);
	int err, i;

	for (i = start; i < end; i++) {
		err = veth_create_page_pool(&priv->rq[i]);
		if (err)
			goto err_page_pool;
	}

	for (i = start; i < end; i++) {
		struct veth_rq *rq = &priv->rq[i];

		err = ptr_ring_init(&rq->xdp_ring, VETH_RING_SIZE, GFP_KERNEL);
		if (err)
			goto err_xdp_ring;
	}

	for (i = start; i < end; i++) {
		struct veth_rq *rq = &priv->rq[i];

		napi_enable(&rq->xdp_napi);
		rcu_assign_pointer(priv->rq[i].napi, &priv->rq[i].xdp_napi);
	}

	return 0;

err_xdp_ring:
	for (i--; i >= start; i--)
		ptr_ring_cleanup(&priv->rq[i].xdp_ring, veth_ptr_free);
	/* all page pools of the range exist at this point */
	i = end;
err_page_pool:
	for (i--; i >= start; i--) {
		page_pool_destroy(priv->rq[i].page_pool);
		priv->rq[i].page_pool = NULL;
	}

	return err;
}

/* __veth_napi_enable_range() over all active rx queues. */
static int __veth_napi_enable(struct net_device *dev)
{
	return __veth_napi_enable_range(dev, 0, dev->real_num_rx_queues);
}

/*
 * Tear down NAPI for rx queues [@start, @end): unpublish the napi
 * pointers, disable and delete the NAPI instances, wait with
 * synchronize_net(), then free whatever is left in the rings and destroy
 * the page pools.
 */
static void veth_napi_del_range(struct net_device *dev, int start, int end)
{
	struct veth_priv *priv = netdev_priv(dev);
	int i;

	for (i = start; i < end; i++) {
		struct veth_rq *rq = &priv->rq[i];

		rcu_assign_pointer(priv->rq[i].napi, NULL);
		napi_disable(&rq->xdp_napi);
		__netif_napi_del(&rq->xdp_napi);
	}
	synchronize_net();

	for (i = start; i < end; i++) {
		struct veth_rq *rq = &priv->rq[i];

		rq->rx_notify_masked = false;
		ptr_ring_cleanup(&rq->xdp_ring, veth_ptr_free);
	}

	for (i = start; i < end; i++) {
		page_pool_destroy(priv->rq[i].page_pool);
		priv->rq[i].page_pool = NULL;
	}
}

/* veth_napi_del_range() over all active rx queues. */
static void veth_napi_del(struct net_device *dev)
{
	veth_napi_del_range(dev, 0, dev->real_num_rx_queues);
}

/* True when GRO is among the features requested for @dev. */
static bool veth_gro_requested(const struct net_device *dev)
{
	return !!(dev->wanted_features & NETIF_F_GRO);
}

/*
 * Register the XDP rxq info and memory model for rx queues
 * [@start, @end), adding the NAPI instances first unless
 * @napi_already_on.
 *
 * Returns 0 or a negative error code; on failure the queues before the
 * failing one are unregistered again.
 *
 * NOTE(review): on failure at index i, the NAPI instance added for that
 * same index is not deleted here (the unwind loop starts at i - 1).
 * Confirm whether a caller cleans it up.
 */
static int veth_enable_xdp_range(struct net_device *dev, int start, int end,
				 bool napi_already_on)
{
	struct veth_priv *priv = netdev_priv(dev);
	int err, i;

	for (i = start; i < end; i++) {
		struct veth_rq *rq = &priv->rq[i];

		if (!napi_already_on)
			netif_napi_add(dev, &rq->xdp_napi, veth_poll);
		err = xdp_rxq_info_reg(&rq->xdp_rxq, dev, i, rq->xdp_napi.napi_id);
		if (err < 0)
			goto err_rxq_reg;

		err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq,
						 MEM_TYPE_PAGE_SHARED,
						 NULL);
		if (err < 0)
			goto err_reg_mem;

		/* Save original mem info as it can be overwritten */
		rq->xdp_mem = rq->xdp_rxq.mem;
	}
	return 0;

err_reg_mem:
	xdp_rxq_info_unreg(&priv->rq[i].xdp_rxq);
err_rxq_reg:
	for (i--; i >= start; i--) {
		struct veth_rq *rq = &priv->rq[i];

		xdp_rxq_info_unreg(&rq->xdp_rxq);
		if (!napi_already_on)
			netif_napi_del(&rq->xdp_napi);
	}

	return err;
}

/*
 * Unregister the XDP rxq info for rx queues [@start, @end), restoring the
 * saved mem info first, and delete the NAPI instances when @delete_napi.
 */
static void veth_disable_xdp_range(struct net_device *dev, int start, int end,
				   bool delete_napi)
{
	struct veth_priv *priv = netdev_priv(dev);
	int i;

	for (i = start; i < end; i++) {
		struct veth_rq *rq = &priv->rq[i];

		rq->xdp_rxq.mem = rq->xdp_mem;
		xdp_rxq_info_unreg(&rq->xdp_rxq);

		if (delete_napi)
			netif_napi_del(&rq->xdp_napi);
	}
}

/*
 * Enable XDP on all active rx queues: register rxq info (and bring up
 * NAPI unless GRO already did so on a running device) if queue 0 is not
 * registered yet, then publish the program and napi pointers.
 *
 * Returns 0 or a negative error code.
 */
static int veth_enable_xdp(struct net_device *dev)
{
	bool napi_already_on = veth_gro_requested(dev) && (dev->flags & IFF_UP);
	struct veth_priv *priv = netdev_priv(dev);
	int err, i;

	if (!xdp_rxq_info_is_reg(&priv->rq[0].xdp_rxq)) {
		err = veth_enable_xdp_range(dev, 0, dev->real_num_rx_queues, napi_already_on);
		if (err)
			return err;

		if (!napi_already_on) {
			err = __veth_napi_enable(dev);
			if (err) {
				veth_disable_xdp_range(dev, 0, dev->real_num_rx_queues, true);
				return err;
			}
		}
	}

	for (i = 0; i < dev->real_num_rx_queues; i++) {
		rcu_assign_pointer(priv->rq[i].xdp_prog, priv->_xdp_prog);
		rcu_assign_pointer(priv->rq[i].napi, &priv->rq[i].xdp_napi);
	}

	return 0;
}

/*
 * Disable XDP on all active rx queues: clear the program pointers, tear
 * down NAPI unless a running device still wants it for GRO, and
 * unregister the rxq info.
 */
static void veth_disable_xdp(struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);
	int i;

	for (i = 0; i < dev->real_num_rx_queues; i++)
		rcu_assign_pointer(priv->rq[i].xdp_prog, NULL);

	if (!netif_running(dev) || !veth_gro_requested(dev))
		veth_napi_del(dev);

	veth_disable_xdp_range(dev, 0, dev->real_num_rx_queues, false);
}

/*
 * Add the NAPI instances for rx queues [@start, @end) and enable them;
 * the instances are deleted again if enabling fails.
 *
 * Returns 0 or a negative error code.
 */
static int veth_napi_enable_range(struct net_device *dev, int start, int end)
{
	struct veth_priv *priv = netdev_priv(dev);
	int err, i;

	for (i = start; i < end; i++) {
		struct veth_rq *rq = &priv->rq[i];

		netif_napi_add(dev, &rq->xdp_napi, veth_poll);
	}

	err = __veth_napi_enable_range(dev, start, end);
	if (err) {
		for (i = start; i < end; i++) {
			struct veth_rq *rq = &priv->rq[i];

			netif_napi_del(&rq->xdp_napi);
		}
		return err;
	}
	return err;
}

/* veth_napi_enable_range() over all active rx queues. */
static int veth_napi_enable(struct net_device *dev)
{
	return veth_napi_enable_range(dev, 0, dev->real_num_rx_queues);
}

/*
 * Tear down rx queues [@start, @end) according to the device's current
 * mode: NAPI plus XDP rxq info when a program is attached, NAPI only when
 * just GRO is requested, nothing otherwise. An empty range is a no-op.
 */
static void veth_disable_range_safe(struct net_device *dev, int start, int end)
{
	struct veth_priv *priv = netdev_priv(dev);

	if (start >= end)
		return;

	if (priv->_xdp_prog) {
		veth_napi_del_range(dev, start, end);
		veth_disable_xdp_range(dev, start, end, false);
	} else if (veth_gro_requested(dev)) {
		veth_napi_del_range(dev, start, end);
	}
}

static int veth_enable_range_safe(struct net_device *dev, int start, int end)
{
	struct veth_priv *priv = netdev_priv(dev);
	int err;

	if (start >= end)
		return 0;

	if (priv->_xdp_prog) {
		/* these channels are freshly initialized, napi is not on there even
		 * when
GRO is requeste 1282 */ 1283 err = veth_enable_xdp_range(dev, start, end, false); 1284 if (err) 1285 return err; 1286 1287 err = __veth_napi_enable_range(dev, start, end); 1288 if (err) { 1289 /* on error always delete the newly added napis */ 1290 veth_disable_xdp_range(dev, start, end, true); 1291 return err; 1292 } 1293 } else if (veth_gro_requested(dev)) { 1294 return veth_napi_enable_range(dev, start, end); 1295 } 1296 return 0; 1297 } 1298 1299 static void veth_set_xdp_features(struct net_device *dev) 1300 { 1301 struct veth_priv *priv = netdev_priv(dev); 1302 struct net_device *peer; 1303 1304 peer = rtnl_dereference(priv->peer); 1305 if (peer && peer->real_num_tx_queues <= dev->real_num_rx_queues) { 1306 struct veth_priv *priv_peer = netdev_priv(peer); 1307 xdp_features_t val = NETDEV_XDP_ACT_BASIC | 1308 NETDEV_XDP_ACT_REDIRECT | 1309 NETDEV_XDP_ACT_RX_SG; 1310 1311 if (priv_peer->_xdp_prog || veth_gro_requested(peer)) 1312 val |= NETDEV_XDP_ACT_NDO_XMIT | 1313 NETDEV_XDP_ACT_NDO_XMIT_SG; 1314 xdp_set_features_flag(dev, val); 1315 } else { 1316 xdp_clear_features_flag(dev); 1317 } 1318 } 1319 1320 static int veth_set_channels(struct net_device *dev, 1321 struct ethtool_channels *ch) 1322 { 1323 struct veth_priv *priv = netdev_priv(dev); 1324 unsigned int old_rx_count, new_rx_count; 1325 struct veth_priv *peer_priv; 1326 struct net_device *peer; 1327 int err; 1328 1329 /* sanity check. Upper bounds are already enforced by the caller */ 1330 if (!ch->rx_count || !ch->tx_count) 1331 return -EINVAL; 1332 1333 /* avoid braking XDP, if that is enabled */ 1334 peer = rtnl_dereference(priv->peer); 1335 peer_priv = peer ? 
netdev_priv(peer) : NULL; 1336 if (priv->_xdp_prog && peer && ch->rx_count < peer->real_num_tx_queues) 1337 return -EINVAL; 1338 1339 if (peer && peer_priv && peer_priv->_xdp_prog && ch->tx_count > peer->real_num_rx_queues) 1340 return -EINVAL; 1341 1342 old_rx_count = dev->real_num_rx_queues; 1343 new_rx_count = ch->rx_count; 1344 if (netif_running(dev)) { 1345 /* turn device off */ 1346 netif_carrier_off(dev); 1347 if (peer) 1348 netif_carrier_off(peer); 1349 1350 /* try to allocate new resurces, as needed*/ 1351 err = veth_enable_range_safe(dev, old_rx_count, new_rx_count); 1352 if (err) 1353 goto out; 1354 } 1355 1356 err = netif_set_real_num_rx_queues(dev, ch->rx_count); 1357 if (err) 1358 goto revert; 1359 1360 err = netif_set_real_num_tx_queues(dev, ch->tx_count); 1361 if (err) { 1362 int err2 = netif_set_real_num_rx_queues(dev, old_rx_count); 1363 1364 /* this error condition could happen only if rx and tx change 1365 * in opposite directions (e.g. tx nr raises, rx nr decreases) 1366 * and we can't do anything to fully restore the original 1367 * status 1368 */ 1369 if (err2) 1370 pr_warn("Can't restore rx queues config %d -> %d %d", 1371 new_rx_count, old_rx_count, err2); 1372 else 1373 goto revert; 1374 } 1375 1376 out: 1377 if (netif_running(dev)) { 1378 /* note that we need to swap the arguments WRT the enable part 1379 * to identify the range we have to disable 1380 */ 1381 veth_disable_range_safe(dev, new_rx_count, old_rx_count); 1382 netif_carrier_on(dev); 1383 if (peer) 1384 netif_carrier_on(peer); 1385 } 1386 1387 /* update XDP supported features */ 1388 veth_set_xdp_features(dev); 1389 if (peer) 1390 veth_set_xdp_features(peer); 1391 1392 return err; 1393 1394 revert: 1395 new_rx_count = old_rx_count; 1396 old_rx_count = ch->rx_count; 1397 goto out; 1398 } 1399 1400 static int veth_open(struct net_device *dev) 1401 { 1402 struct veth_priv *priv = netdev_priv(dev); 1403 struct net_device *peer = rtnl_dereference(priv->peer); 1404 int err; 1405 1406 
if (!peer) 1407 return -ENOTCONN; 1408 1409 if (priv->_xdp_prog) { 1410 err = veth_enable_xdp(dev); 1411 if (err) 1412 return err; 1413 } else if (veth_gro_requested(dev)) { 1414 err = veth_napi_enable(dev); 1415 if (err) 1416 return err; 1417 } 1418 1419 if (peer->flags & IFF_UP) { 1420 netif_carrier_on(dev); 1421 netif_carrier_on(peer); 1422 } 1423 1424 veth_set_xdp_features(dev); 1425 1426 return 0; 1427 } 1428 1429 static int veth_close(struct net_device *dev) 1430 { 1431 struct veth_priv *priv = netdev_priv(dev); 1432 struct net_device *peer = rtnl_dereference(priv->peer); 1433 1434 netif_carrier_off(dev); 1435 if (peer) 1436 netif_carrier_off(peer); 1437 1438 if (priv->_xdp_prog) 1439 veth_disable_xdp(dev); 1440 else if (veth_gro_requested(dev)) 1441 veth_napi_del(dev); 1442 1443 return 0; 1444 } 1445 1446 static int is_valid_veth_mtu(int mtu) 1447 { 1448 return mtu >= ETH_MIN_MTU && mtu <= ETH_MAX_MTU; 1449 } 1450 1451 static int veth_alloc_queues(struct net_device *dev) 1452 { 1453 struct veth_priv *priv = netdev_priv(dev); 1454 int i; 1455 1456 priv->rq = kvcalloc(dev->num_rx_queues, sizeof(*priv->rq), 1457 GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL); 1458 if (!priv->rq) 1459 return -ENOMEM; 1460 1461 for (i = 0; i < dev->num_rx_queues; i++) { 1462 priv->rq[i].dev = dev; 1463 u64_stats_init(&priv->rq[i].stats.syncp); 1464 } 1465 1466 return 0; 1467 } 1468 1469 static void veth_free_queues(struct net_device *dev) 1470 { 1471 struct veth_priv *priv = netdev_priv(dev); 1472 1473 kvfree(priv->rq); 1474 } 1475 1476 static int veth_dev_init(struct net_device *dev) 1477 { 1478 return veth_alloc_queues(dev); 1479 } 1480 1481 static void veth_dev_free(struct net_device *dev) 1482 { 1483 veth_free_queues(dev); 1484 } 1485 1486 #ifdef CONFIG_NET_POLL_CONTROLLER 1487 static void veth_poll_controller(struct net_device *dev) 1488 { 1489 /* veth only receives frames when its peer sends one 1490 * Since it has nothing to do with disabling irqs, we are guaranteed 1491 * 
never to have pending data when we poll for it so 1492 * there is nothing to do here. 1493 * 1494 * We need this though so netpoll recognizes us as an interface that 1495 * supports polling, which enables bridge devices in virt setups to 1496 * still use netconsole 1497 */ 1498 } 1499 #endif /* CONFIG_NET_POLL_CONTROLLER */ 1500 1501 static int veth_get_iflink(const struct net_device *dev) 1502 { 1503 struct veth_priv *priv = netdev_priv(dev); 1504 struct net_device *peer; 1505 int iflink; 1506 1507 rcu_read_lock(); 1508 peer = rcu_dereference(priv->peer); 1509 iflink = peer ? peer->ifindex : 0; 1510 rcu_read_unlock(); 1511 1512 return iflink; 1513 } 1514 1515 static netdev_features_t veth_fix_features(struct net_device *dev, 1516 netdev_features_t features) 1517 { 1518 struct veth_priv *priv = netdev_priv(dev); 1519 struct net_device *peer; 1520 1521 peer = rtnl_dereference(priv->peer); 1522 if (peer) { 1523 struct veth_priv *peer_priv = netdev_priv(peer); 1524 1525 if (peer_priv->_xdp_prog) 1526 features &= ~NETIF_F_GSO_SOFTWARE; 1527 } 1528 1529 return features; 1530 } 1531 1532 static int veth_set_features(struct net_device *dev, 1533 netdev_features_t features) 1534 { 1535 netdev_features_t changed = features ^ dev->features; 1536 struct veth_priv *priv = netdev_priv(dev); 1537 struct net_device *peer; 1538 int err; 1539 1540 if (!(changed & NETIF_F_GRO) || !(dev->flags & IFF_UP) || priv->_xdp_prog) 1541 return 0; 1542 1543 peer = rtnl_dereference(priv->peer); 1544 if (features & NETIF_F_GRO) { 1545 err = veth_napi_enable(dev); 1546 if (err) 1547 return err; 1548 1549 if (peer) 1550 xdp_features_set_redirect_target(peer, true); 1551 } else { 1552 if (peer) 1553 xdp_features_clear_redirect_target(peer); 1554 veth_napi_del(dev); 1555 } 1556 return 0; 1557 } 1558 1559 static void veth_set_rx_headroom(struct net_device *dev, int new_hr) 1560 { 1561 struct veth_priv *peer_priv, *priv = netdev_priv(dev); 1562 struct net_device *peer; 1563 1564 if (new_hr < 0) 1565 
new_hr = 0; 1566 1567 rcu_read_lock(); 1568 peer = rcu_dereference(priv->peer); 1569 if (unlikely(!peer)) 1570 goto out; 1571 1572 peer_priv = netdev_priv(peer); 1573 priv->requested_headroom = new_hr; 1574 new_hr = max(priv->requested_headroom, peer_priv->requested_headroom); 1575 dev->needed_headroom = new_hr; 1576 peer->needed_headroom = new_hr; 1577 1578 out: 1579 rcu_read_unlock(); 1580 } 1581 1582 static int veth_xdp_set(struct net_device *dev, struct bpf_prog *prog, 1583 struct netlink_ext_ack *extack) 1584 { 1585 struct veth_priv *priv = netdev_priv(dev); 1586 struct bpf_prog *old_prog; 1587 struct net_device *peer; 1588 unsigned int max_mtu; 1589 int err; 1590 1591 old_prog = priv->_xdp_prog; 1592 priv->_xdp_prog = prog; 1593 peer = rtnl_dereference(priv->peer); 1594 1595 if (prog) { 1596 if (!peer) { 1597 NL_SET_ERR_MSG_MOD(extack, "Cannot set XDP when peer is detached"); 1598 err = -ENOTCONN; 1599 goto err; 1600 } 1601 1602 max_mtu = SKB_WITH_OVERHEAD(PAGE_SIZE - VETH_XDP_HEADROOM) - 1603 peer->hard_header_len; 1604 /* Allow increasing the max_mtu if the program supports 1605 * XDP fragments. 
1606 */ 1607 if (prog->aux->xdp_has_frags) 1608 max_mtu += PAGE_SIZE * MAX_SKB_FRAGS; 1609 1610 if (peer->mtu > max_mtu) { 1611 NL_SET_ERR_MSG_MOD(extack, "Peer MTU is too large to set XDP"); 1612 err = -ERANGE; 1613 goto err; 1614 } 1615 1616 if (dev->real_num_rx_queues < peer->real_num_tx_queues) { 1617 NL_SET_ERR_MSG_MOD(extack, "XDP expects number of rx queues not less than peer tx queues"); 1618 err = -ENOSPC; 1619 goto err; 1620 } 1621 1622 if (dev->flags & IFF_UP) { 1623 err = veth_enable_xdp(dev); 1624 if (err) { 1625 NL_SET_ERR_MSG_MOD(extack, "Setup for XDP failed"); 1626 goto err; 1627 } 1628 } 1629 1630 if (!old_prog) { 1631 peer->hw_features &= ~NETIF_F_GSO_SOFTWARE; 1632 peer->max_mtu = max_mtu; 1633 } 1634 1635 xdp_features_set_redirect_target(peer, true); 1636 } 1637 1638 if (old_prog) { 1639 if (!prog) { 1640 if (peer && !veth_gro_requested(dev)) 1641 xdp_features_clear_redirect_target(peer); 1642 1643 if (dev->flags & IFF_UP) 1644 veth_disable_xdp(dev); 1645 1646 if (peer) { 1647 peer->hw_features |= NETIF_F_GSO_SOFTWARE; 1648 peer->max_mtu = ETH_MAX_MTU; 1649 } 1650 } 1651 bpf_prog_put(old_prog); 1652 } 1653 1654 if ((!!old_prog ^ !!prog) && peer) 1655 netdev_update_features(peer); 1656 1657 return 0; 1658 err: 1659 priv->_xdp_prog = old_prog; 1660 1661 return err; 1662 } 1663 1664 static int veth_xdp(struct net_device *dev, struct netdev_bpf *xdp) 1665 { 1666 switch (xdp->command) { 1667 case XDP_SETUP_PROG: 1668 return veth_xdp_set(dev, xdp->prog, xdp->extack); 1669 default: 1670 return -EINVAL; 1671 } 1672 } 1673 1674 static int veth_xdp_rx_timestamp(const struct xdp_md *ctx, u64 *timestamp) 1675 { 1676 struct veth_xdp_buff *_ctx = (void *)ctx; 1677 1678 if (!_ctx->skb) 1679 return -ENODATA; 1680 1681 *timestamp = skb_hwtstamps(_ctx->skb)->hwtstamp; 1682 return 0; 1683 } 1684 1685 static int veth_xdp_rx_hash(const struct xdp_md *ctx, u32 *hash, 1686 enum xdp_rss_hash_type *rss_type) 1687 { 1688 struct veth_xdp_buff *_ctx = (void *)ctx; 1689 
struct sk_buff *skb = _ctx->skb; 1690 1691 if (!skb) 1692 return -ENODATA; 1693 1694 *hash = skb_get_hash(skb); 1695 *rss_type = skb->l4_hash ? XDP_RSS_TYPE_L4_ANY : XDP_RSS_TYPE_NONE; 1696 1697 return 0; 1698 } 1699 1700 static const struct net_device_ops veth_netdev_ops = { 1701 .ndo_init = veth_dev_init, 1702 .ndo_open = veth_open, 1703 .ndo_stop = veth_close, 1704 .ndo_start_xmit = veth_xmit, 1705 .ndo_get_stats64 = veth_get_stats64, 1706 .ndo_set_rx_mode = veth_set_multicast_list, 1707 .ndo_set_mac_address = eth_mac_addr, 1708 #ifdef CONFIG_NET_POLL_CONTROLLER 1709 .ndo_poll_controller = veth_poll_controller, 1710 #endif 1711 .ndo_get_iflink = veth_get_iflink, 1712 .ndo_fix_features = veth_fix_features, 1713 .ndo_set_features = veth_set_features, 1714 .ndo_features_check = passthru_features_check, 1715 .ndo_set_rx_headroom = veth_set_rx_headroom, 1716 .ndo_bpf = veth_xdp, 1717 .ndo_xdp_xmit = veth_ndo_xdp_xmit, 1718 .ndo_get_peer_dev = veth_peer_dev, 1719 }; 1720 1721 static const struct xdp_metadata_ops veth_xdp_metadata_ops = { 1722 .xmo_rx_timestamp = veth_xdp_rx_timestamp, 1723 .xmo_rx_hash = veth_xdp_rx_hash, 1724 }; 1725 1726 #define VETH_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HW_CSUM | \ 1727 NETIF_F_RXCSUM | NETIF_F_SCTP_CRC | NETIF_F_HIGHDMA | \ 1728 NETIF_F_GSO_SOFTWARE | NETIF_F_GSO_ENCAP_ALL | \ 1729 NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX | \ 1730 NETIF_F_HW_VLAN_STAG_TX | NETIF_F_HW_VLAN_STAG_RX ) 1731 1732 static void veth_setup(struct net_device *dev) 1733 { 1734 ether_setup(dev); 1735 1736 dev->priv_flags &= ~IFF_TX_SKB_SHARING; 1737 dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; 1738 dev->priv_flags |= IFF_NO_QUEUE; 1739 dev->priv_flags |= IFF_PHONY_HEADROOM; 1740 1741 dev->netdev_ops = &veth_netdev_ops; 1742 dev->xdp_metadata_ops = &veth_xdp_metadata_ops; 1743 dev->ethtool_ops = &veth_ethtool_ops; 1744 dev->features |= NETIF_F_LLTX; 1745 dev->features |= VETH_FEATURES; 1746 dev->vlan_features = dev->features & 1747 
~(NETIF_F_HW_VLAN_CTAG_TX | 1748 NETIF_F_HW_VLAN_STAG_TX | 1749 NETIF_F_HW_VLAN_CTAG_RX | 1750 NETIF_F_HW_VLAN_STAG_RX); 1751 dev->needs_free_netdev = true; 1752 dev->priv_destructor = veth_dev_free; 1753 dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; 1754 dev->max_mtu = ETH_MAX_MTU; 1755 1756 dev->hw_features = VETH_FEATURES; 1757 dev->hw_enc_features = VETH_FEATURES; 1758 dev->mpls_features = NETIF_F_HW_CSUM | NETIF_F_GSO_SOFTWARE; 1759 netif_set_tso_max_size(dev, GSO_MAX_SIZE); 1760 } 1761 1762 /* 1763 * netlink interface 1764 */ 1765 1766 static int veth_validate(struct nlattr *tb[], struct nlattr *data[], 1767 struct netlink_ext_ack *extack) 1768 { 1769 if (tb[IFLA_ADDRESS]) { 1770 if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) 1771 return -EINVAL; 1772 if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) 1773 return -EADDRNOTAVAIL; 1774 } 1775 if (tb[IFLA_MTU]) { 1776 if (!is_valid_veth_mtu(nla_get_u32(tb[IFLA_MTU]))) 1777 return -EINVAL; 1778 } 1779 return 0; 1780 } 1781 1782 static struct rtnl_link_ops veth_link_ops; 1783 1784 static void veth_disable_gro(struct net_device *dev) 1785 { 1786 dev->features &= ~NETIF_F_GRO; 1787 dev->wanted_features &= ~NETIF_F_GRO; 1788 netdev_update_features(dev); 1789 } 1790 1791 static int veth_init_queues(struct net_device *dev, struct nlattr *tb[]) 1792 { 1793 int err; 1794 1795 if (!tb[IFLA_NUM_TX_QUEUES] && dev->num_tx_queues > 1) { 1796 err = netif_set_real_num_tx_queues(dev, 1); 1797 if (err) 1798 return err; 1799 } 1800 if (!tb[IFLA_NUM_RX_QUEUES] && dev->num_rx_queues > 1) { 1801 err = netif_set_real_num_rx_queues(dev, 1); 1802 if (err) 1803 return err; 1804 } 1805 return 0; 1806 } 1807 1808 static int veth_newlink(struct net *src_net, struct net_device *dev, 1809 struct nlattr *tb[], struct nlattr *data[], 1810 struct netlink_ext_ack *extack) 1811 { 1812 int err; 1813 struct net_device *peer; 1814 struct veth_priv *priv; 1815 char ifname[IFNAMSIZ]; 1816 struct nlattr *peer_tb[IFLA_MAX + 1], **tbp; 1817 unsigned char 
name_assign_type; 1818 struct ifinfomsg *ifmp; 1819 struct net *net; 1820 1821 /* 1822 * create and register peer first 1823 */ 1824 if (data != NULL && data[VETH_INFO_PEER] != NULL) { 1825 struct nlattr *nla_peer; 1826 1827 nla_peer = data[VETH_INFO_PEER]; 1828 ifmp = nla_data(nla_peer); 1829 err = rtnl_nla_parse_ifinfomsg(peer_tb, nla_peer, extack); 1830 if (err < 0) 1831 return err; 1832 1833 err = veth_validate(peer_tb, NULL, extack); 1834 if (err < 0) 1835 return err; 1836 1837 tbp = peer_tb; 1838 } else { 1839 ifmp = NULL; 1840 tbp = tb; 1841 } 1842 1843 if (ifmp && tbp[IFLA_IFNAME]) { 1844 nla_strscpy(ifname, tbp[IFLA_IFNAME], IFNAMSIZ); 1845 name_assign_type = NET_NAME_USER; 1846 } else { 1847 snprintf(ifname, IFNAMSIZ, DRV_NAME "%%d"); 1848 name_assign_type = NET_NAME_ENUM; 1849 } 1850 1851 net = rtnl_link_get_net(src_net, tbp); 1852 if (IS_ERR(net)) 1853 return PTR_ERR(net); 1854 1855 peer = rtnl_create_link(net, ifname, name_assign_type, 1856 &veth_link_ops, tbp, extack); 1857 if (IS_ERR(peer)) { 1858 put_net(net); 1859 return PTR_ERR(peer); 1860 } 1861 1862 if (!ifmp || !tbp[IFLA_ADDRESS]) 1863 eth_hw_addr_random(peer); 1864 1865 if (ifmp && (dev->ifindex != 0)) 1866 peer->ifindex = ifmp->ifi_index; 1867 1868 netif_inherit_tso_max(peer, dev); 1869 1870 err = register_netdevice(peer); 1871 put_net(net); 1872 net = NULL; 1873 if (err < 0) 1874 goto err_register_peer; 1875 1876 /* keep GRO disabled by default to be consistent with the established 1877 * veth behavior 1878 */ 1879 veth_disable_gro(peer); 1880 netif_carrier_off(peer); 1881 1882 err = rtnl_configure_link(peer, ifmp, 0, NULL); 1883 if (err < 0) 1884 goto err_configure_peer; 1885 1886 /* 1887 * register dev last 1888 * 1889 * note, that since we've registered new device the dev's name 1890 * should be re-allocated 1891 */ 1892 1893 if (tb[IFLA_ADDRESS] == NULL) 1894 eth_hw_addr_random(dev); 1895 1896 if (tb[IFLA_IFNAME]) 1897 nla_strscpy(dev->name, tb[IFLA_IFNAME], IFNAMSIZ); 1898 else 1899 
snprintf(dev->name, IFNAMSIZ, DRV_NAME "%%d"); 1900 1901 err = register_netdevice(dev); 1902 if (err < 0) 1903 goto err_register_dev; 1904 1905 netif_carrier_off(dev); 1906 1907 /* 1908 * tie the deviced together 1909 */ 1910 1911 priv = netdev_priv(dev); 1912 rcu_assign_pointer(priv->peer, peer); 1913 err = veth_init_queues(dev, tb); 1914 if (err) 1915 goto err_queues; 1916 1917 priv = netdev_priv(peer); 1918 rcu_assign_pointer(priv->peer, dev); 1919 err = veth_init_queues(peer, tb); 1920 if (err) 1921 goto err_queues; 1922 1923 veth_disable_gro(dev); 1924 /* update XDP supported features */ 1925 veth_set_xdp_features(dev); 1926 veth_set_xdp_features(peer); 1927 1928 return 0; 1929 1930 err_queues: 1931 unregister_netdevice(dev); 1932 err_register_dev: 1933 /* nothing to do */ 1934 err_configure_peer: 1935 unregister_netdevice(peer); 1936 return err; 1937 1938 err_register_peer: 1939 free_netdev(peer); 1940 return err; 1941 } 1942 1943 static void veth_dellink(struct net_device *dev, struct list_head *head) 1944 { 1945 struct veth_priv *priv; 1946 struct net_device *peer; 1947 1948 priv = netdev_priv(dev); 1949 peer = rtnl_dereference(priv->peer); 1950 1951 /* Note : dellink() is called from default_device_exit_batch(), 1952 * before a rcu_synchronize() point. The devices are guaranteed 1953 * not being freed before one RCU grace period. 1954 */ 1955 RCU_INIT_POINTER(priv->peer, NULL); 1956 unregister_netdevice_queue(dev, head); 1957 1958 if (peer) { 1959 priv = netdev_priv(peer); 1960 RCU_INIT_POINTER(priv->peer, NULL); 1961 unregister_netdevice_queue(peer, head); 1962 } 1963 } 1964 1965 static const struct nla_policy veth_policy[VETH_INFO_MAX + 1] = { 1966 [VETH_INFO_PEER] = { .len = sizeof(struct ifinfomsg) }, 1967 }; 1968 1969 static struct net *veth_get_link_net(const struct net_device *dev) 1970 { 1971 struct veth_priv *priv = netdev_priv(dev); 1972 struct net_device *peer = rtnl_dereference(priv->peer); 1973 1974 return peer ? 
dev_net(peer) : dev_net(dev); 1975 } 1976 1977 static unsigned int veth_get_num_queues(void) 1978 { 1979 /* enforce the same queue limit as rtnl_create_link */ 1980 int queues = num_possible_cpus(); 1981 1982 if (queues > 4096) 1983 queues = 4096; 1984 return queues; 1985 } 1986 1987 static struct rtnl_link_ops veth_link_ops = { 1988 .kind = DRV_NAME, 1989 .priv_size = sizeof(struct veth_priv), 1990 .setup = veth_setup, 1991 .validate = veth_validate, 1992 .newlink = veth_newlink, 1993 .dellink = veth_dellink, 1994 .policy = veth_policy, 1995 .maxtype = VETH_INFO_MAX, 1996 .get_link_net = veth_get_link_net, 1997 .get_num_tx_queues = veth_get_num_queues, 1998 .get_num_rx_queues = veth_get_num_queues, 1999 }; 2000 2001 /* 2002 * init/fini 2003 */ 2004 2005 static __init int veth_init(void) 2006 { 2007 return rtnl_link_register(&veth_link_ops); 2008 } 2009 2010 static __exit void veth_exit(void) 2011 { 2012 rtnl_link_unregister(&veth_link_ops); 2013 } 2014 2015 module_init(veth_init); 2016 module_exit(veth_exit); 2017 2018 MODULE_DESCRIPTION("Virtual Ethernet Tunnel"); 2019 MODULE_LICENSE("GPL v2"); 2020 MODULE_ALIAS_RTNL_LINK(DRV_NAME); 2021