1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * drivers/net/veth.c 4 * 5 * Copyright (C) 2007 OpenVZ http://openvz.org, SWsoft Inc 6 * 7 * Author: Pavel Emelianov <xemul@openvz.org> 8 * Ethtool interface from: Eric W. Biederman <ebiederm@xmission.com> 9 * 10 */ 11 12 #include <linux/netdevice.h> 13 #include <linux/slab.h> 14 #include <linux/ethtool.h> 15 #include <linux/etherdevice.h> 16 #include <linux/u64_stats_sync.h> 17 18 #include <net/rtnetlink.h> 19 #include <net/dst.h> 20 #include <net/xfrm.h> 21 #include <net/xdp.h> 22 #include <linux/veth.h> 23 #include <linux/module.h> 24 #include <linux/bpf.h> 25 #include <linux/filter.h> 26 #include <linux/ptr_ring.h> 27 #include <linux/bpf_trace.h> 28 #include <linux/net_tstamp.h> 29 #include <net/page_pool/helpers.h> 30 31 #define DRV_NAME "veth" 32 #define DRV_VERSION "1.0" 33 34 #define VETH_XDP_FLAG BIT(0) 35 #define VETH_RING_SIZE 256 36 #define VETH_XDP_HEADROOM (XDP_PACKET_HEADROOM + NET_IP_ALIGN) 37 38 #define VETH_XDP_TX_BULK_SIZE 16 39 #define VETH_XDP_BATCH 16 40 41 struct veth_stats { 42 u64 rx_drops; 43 /* xdp */ 44 u64 xdp_packets; 45 u64 xdp_bytes; 46 u64 xdp_redirect; 47 u64 xdp_drops; 48 u64 xdp_tx; 49 u64 xdp_tx_err; 50 u64 peer_tq_xdp_xmit; 51 u64 peer_tq_xdp_xmit_err; 52 }; 53 54 struct veth_rq_stats { 55 struct veth_stats vs; 56 struct u64_stats_sync syncp; 57 }; 58 59 struct veth_rq { 60 struct napi_struct xdp_napi; 61 struct napi_struct __rcu *napi; /* points to xdp_napi when the latter is initialized */ 62 struct net_device *dev; 63 struct bpf_prog __rcu *xdp_prog; 64 struct xdp_mem_info xdp_mem; 65 struct veth_rq_stats stats; 66 bool rx_notify_masked; 67 struct ptr_ring xdp_ring; 68 struct xdp_rxq_info xdp_rxq; 69 struct page_pool *page_pool; 70 }; 71 72 struct veth_priv { 73 struct net_device __rcu *peer; 74 atomic64_t dropped; 75 struct bpf_prog *_xdp_prog; 76 struct veth_rq *rq; 77 unsigned int requested_headroom; 78 }; 79 80 struct veth_xdp_tx_bq { 81 struct xdp_frame 
*q[VETH_XDP_TX_BULK_SIZE]; 82 unsigned int count; 83 }; 84 85 /* 86 * ethtool interface 87 */ 88 89 struct veth_q_stat_desc { 90 char desc[ETH_GSTRING_LEN]; 91 size_t offset; 92 }; 93 94 #define VETH_RQ_STAT(m) offsetof(struct veth_stats, m) 95 96 static const struct veth_q_stat_desc veth_rq_stats_desc[] = { 97 { "xdp_packets", VETH_RQ_STAT(xdp_packets) }, 98 { "xdp_bytes", VETH_RQ_STAT(xdp_bytes) }, 99 { "drops", VETH_RQ_STAT(rx_drops) }, 100 { "xdp_redirect", VETH_RQ_STAT(xdp_redirect) }, 101 { "xdp_drops", VETH_RQ_STAT(xdp_drops) }, 102 { "xdp_tx", VETH_RQ_STAT(xdp_tx) }, 103 { "xdp_tx_errors", VETH_RQ_STAT(xdp_tx_err) }, 104 }; 105 106 #define VETH_RQ_STATS_LEN ARRAY_SIZE(veth_rq_stats_desc) 107 108 static const struct veth_q_stat_desc veth_tq_stats_desc[] = { 109 { "xdp_xmit", VETH_RQ_STAT(peer_tq_xdp_xmit) }, 110 { "xdp_xmit_errors", VETH_RQ_STAT(peer_tq_xdp_xmit_err) }, 111 }; 112 113 #define VETH_TQ_STATS_LEN ARRAY_SIZE(veth_tq_stats_desc) 114 115 static struct { 116 const char string[ETH_GSTRING_LEN]; 117 } ethtool_stats_keys[] = { 118 { "peer_ifindex" }, 119 }; 120 121 struct veth_xdp_buff { 122 struct xdp_buff xdp; 123 struct sk_buff *skb; 124 }; 125 126 static int veth_get_link_ksettings(struct net_device *dev, 127 struct ethtool_link_ksettings *cmd) 128 { 129 cmd->base.speed = SPEED_10000; 130 cmd->base.duplex = DUPLEX_FULL; 131 cmd->base.port = PORT_TP; 132 cmd->base.autoneg = AUTONEG_DISABLE; 133 return 0; 134 } 135 136 static void veth_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info) 137 { 138 strscpy(info->driver, DRV_NAME, sizeof(info->driver)); 139 strscpy(info->version, DRV_VERSION, sizeof(info->version)); 140 } 141 142 static void veth_get_strings(struct net_device *dev, u32 stringset, u8 *buf) 143 { 144 u8 *p = buf; 145 int i, j; 146 147 switch(stringset) { 148 case ETH_SS_STATS: 149 memcpy(p, ðtool_stats_keys, sizeof(ethtool_stats_keys)); 150 p += sizeof(ethtool_stats_keys); 151 for (i = 0; i < dev->real_num_rx_queues; i++) 
152 for (j = 0; j < VETH_RQ_STATS_LEN; j++) 153 ethtool_sprintf(&p, "rx_queue_%u_%.18s", 154 i, veth_rq_stats_desc[j].desc); 155 156 for (i = 0; i < dev->real_num_tx_queues; i++) 157 for (j = 0; j < VETH_TQ_STATS_LEN; j++) 158 ethtool_sprintf(&p, "tx_queue_%u_%.18s", 159 i, veth_tq_stats_desc[j].desc); 160 161 page_pool_ethtool_stats_get_strings(p); 162 break; 163 } 164 } 165 166 static int veth_get_sset_count(struct net_device *dev, int sset) 167 { 168 switch (sset) { 169 case ETH_SS_STATS: 170 return ARRAY_SIZE(ethtool_stats_keys) + 171 VETH_RQ_STATS_LEN * dev->real_num_rx_queues + 172 VETH_TQ_STATS_LEN * dev->real_num_tx_queues + 173 page_pool_ethtool_stats_get_count(); 174 default: 175 return -EOPNOTSUPP; 176 } 177 } 178 179 static void veth_get_page_pool_stats(struct net_device *dev, u64 *data) 180 { 181 #ifdef CONFIG_PAGE_POOL_STATS 182 struct veth_priv *priv = netdev_priv(dev); 183 struct page_pool_stats pp_stats = {}; 184 int i; 185 186 for (i = 0; i < dev->real_num_rx_queues; i++) { 187 if (!priv->rq[i].page_pool) 188 continue; 189 page_pool_get_stats(priv->rq[i].page_pool, &pp_stats); 190 } 191 page_pool_ethtool_stats_get(data, &pp_stats); 192 #endif /* CONFIG_PAGE_POOL_STATS */ 193 } 194 195 static void veth_get_ethtool_stats(struct net_device *dev, 196 struct ethtool_stats *stats, u64 *data) 197 { 198 struct veth_priv *rcv_priv, *priv = netdev_priv(dev); 199 struct net_device *peer = rtnl_dereference(priv->peer); 200 int i, j, idx, pp_idx; 201 202 data[0] = peer ? 
peer->ifindex : 0; 203 idx = 1; 204 for (i = 0; i < dev->real_num_rx_queues; i++) { 205 const struct veth_rq_stats *rq_stats = &priv->rq[i].stats; 206 const void *stats_base = (void *)&rq_stats->vs; 207 unsigned int start; 208 size_t offset; 209 210 do { 211 start = u64_stats_fetch_begin(&rq_stats->syncp); 212 for (j = 0; j < VETH_RQ_STATS_LEN; j++) { 213 offset = veth_rq_stats_desc[j].offset; 214 data[idx + j] = *(u64 *)(stats_base + offset); 215 } 216 } while (u64_stats_fetch_retry(&rq_stats->syncp, start)); 217 idx += VETH_RQ_STATS_LEN; 218 } 219 pp_idx = idx; 220 221 if (!peer) 222 goto page_pool_stats; 223 224 rcv_priv = netdev_priv(peer); 225 for (i = 0; i < peer->real_num_rx_queues; i++) { 226 const struct veth_rq_stats *rq_stats = &rcv_priv->rq[i].stats; 227 const void *base = (void *)&rq_stats->vs; 228 unsigned int start, tx_idx = idx; 229 size_t offset; 230 231 tx_idx += (i % dev->real_num_tx_queues) * VETH_TQ_STATS_LEN; 232 do { 233 start = u64_stats_fetch_begin(&rq_stats->syncp); 234 for (j = 0; j < VETH_TQ_STATS_LEN; j++) { 235 offset = veth_tq_stats_desc[j].offset; 236 data[tx_idx + j] += *(u64 *)(base + offset); 237 } 238 } while (u64_stats_fetch_retry(&rq_stats->syncp, start)); 239 pp_idx = tx_idx + VETH_TQ_STATS_LEN; 240 } 241 242 page_pool_stats: 243 veth_get_page_pool_stats(dev, &data[pp_idx]); 244 } 245 246 static void veth_get_channels(struct net_device *dev, 247 struct ethtool_channels *channels) 248 { 249 channels->tx_count = dev->real_num_tx_queues; 250 channels->rx_count = dev->real_num_rx_queues; 251 channels->max_tx = dev->num_tx_queues; 252 channels->max_rx = dev->num_rx_queues; 253 } 254 255 static int veth_set_channels(struct net_device *dev, 256 struct ethtool_channels *ch); 257 258 static const struct ethtool_ops veth_ethtool_ops = { 259 .get_drvinfo = veth_get_drvinfo, 260 .get_link = ethtool_op_get_link, 261 .get_strings = veth_get_strings, 262 .get_sset_count = veth_get_sset_count, 263 .get_ethtool_stats = veth_get_ethtool_stats, 
264 .get_link_ksettings = veth_get_link_ksettings, 265 .get_ts_info = ethtool_op_get_ts_info, 266 .get_channels = veth_get_channels, 267 .set_channels = veth_set_channels, 268 }; 269 270 /* general routines */ 271 272 static bool veth_is_xdp_frame(void *ptr) 273 { 274 return (unsigned long)ptr & VETH_XDP_FLAG; 275 } 276 277 static struct xdp_frame *veth_ptr_to_xdp(void *ptr) 278 { 279 return (void *)((unsigned long)ptr & ~VETH_XDP_FLAG); 280 } 281 282 static void *veth_xdp_to_ptr(struct xdp_frame *xdp) 283 { 284 return (void *)((unsigned long)xdp | VETH_XDP_FLAG); 285 } 286 287 static void veth_ptr_free(void *ptr) 288 { 289 if (veth_is_xdp_frame(ptr)) 290 xdp_return_frame(veth_ptr_to_xdp(ptr)); 291 else 292 kfree_skb(ptr); 293 } 294 295 static void __veth_xdp_flush(struct veth_rq *rq) 296 { 297 /* Write ptr_ring before reading rx_notify_masked */ 298 smp_mb(); 299 if (!READ_ONCE(rq->rx_notify_masked) && 300 napi_schedule_prep(&rq->xdp_napi)) { 301 WRITE_ONCE(rq->rx_notify_masked, true); 302 __napi_schedule(&rq->xdp_napi); 303 } 304 } 305 306 static int veth_xdp_rx(struct veth_rq *rq, struct sk_buff *skb) 307 { 308 if (unlikely(ptr_ring_produce(&rq->xdp_ring, skb))) { 309 dev_kfree_skb_any(skb); 310 return NET_RX_DROP; 311 } 312 313 return NET_RX_SUCCESS; 314 } 315 316 static int veth_forward_skb(struct net_device *dev, struct sk_buff *skb, 317 struct veth_rq *rq, bool xdp) 318 { 319 return __dev_forward_skb(dev, skb) ?: xdp ? 320 veth_xdp_rx(rq, skb) : 321 __netif_rx(skb); 322 } 323 324 /* return true if the specified skb has chances of GRO aggregation 325 * Don't strive for accuracy, but try to avoid GRO overhead in the most 326 * common scenarios. 327 * When XDP is enabled, all traffic is considered eligible, as the xmit 328 * device has TSO off. 
329 * When TSO is enabled on the xmit device, we are likely interested only 330 * in UDP aggregation, explicitly check for that if the skb is suspected 331 * - the sock_wfree destructor is used by UDP, ICMP and XDP sockets - 332 * to belong to locally generated UDP traffic. 333 */ 334 static bool veth_skb_is_eligible_for_gro(const struct net_device *dev, 335 const struct net_device *rcv, 336 const struct sk_buff *skb) 337 { 338 return !(dev->features & NETIF_F_ALL_TSO) || 339 (skb->destructor == sock_wfree && 340 rcv->features & (NETIF_F_GRO_FRAGLIST | NETIF_F_GRO_UDP_FWD)); 341 } 342 343 static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev) 344 { 345 struct veth_priv *rcv_priv, *priv = netdev_priv(dev); 346 struct veth_rq *rq = NULL; 347 struct net_device *rcv; 348 int length = skb->len; 349 bool use_napi = false; 350 int rxq; 351 352 rcu_read_lock(); 353 rcv = rcu_dereference(priv->peer); 354 if (unlikely(!rcv) || !pskb_may_pull(skb, ETH_HLEN)) { 355 kfree_skb(skb); 356 goto drop; 357 } 358 359 rcv_priv = netdev_priv(rcv); 360 rxq = skb_get_queue_mapping(skb); 361 if (rxq < rcv->real_num_rx_queues) { 362 rq = &rcv_priv->rq[rxq]; 363 364 /* The napi pointer is available when an XDP program is 365 * attached or when GRO is enabled 366 * Don't bother with napi/GRO if the skb can't be aggregated 367 */ 368 use_napi = rcu_access_pointer(rq->napi) && 369 veth_skb_is_eligible_for_gro(dev, rcv, skb); 370 } 371 372 skb_tx_timestamp(skb); 373 if (likely(veth_forward_skb(rcv, skb, rq, use_napi) == NET_RX_SUCCESS)) { 374 if (!use_napi) 375 dev_lstats_add(dev, length); 376 else 377 __veth_xdp_flush(rq); 378 } else { 379 drop: 380 atomic64_inc(&priv->dropped); 381 } 382 383 rcu_read_unlock(); 384 385 return NETDEV_TX_OK; 386 } 387 388 static u64 veth_stats_tx(struct net_device *dev, u64 *packets, u64 *bytes) 389 { 390 struct veth_priv *priv = netdev_priv(dev); 391 392 dev_lstats_read(dev, packets, bytes); 393 return atomic64_read(&priv->dropped); 394 } 395 
/* Sum the XDP rx counters of all queues of @dev into @result. */
static void veth_stats_rx(struct veth_stats *result, struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);
	int i;

	result->peer_tq_xdp_xmit_err = 0;
	result->xdp_packets = 0;
	result->xdp_tx_err = 0;
	result->xdp_bytes = 0;
	result->rx_drops = 0;
	for (i = 0; i < dev->num_rx_queues; i++) {
		u64 packets, bytes, drops, xdp_tx_err, peer_tq_xdp_xmit_err;
		struct veth_rq_stats *stats = &priv->rq[i].stats;
		unsigned int start;

		do {
			start = u64_stats_fetch_begin(&stats->syncp);
			peer_tq_xdp_xmit_err = stats->vs.peer_tq_xdp_xmit_err;
			xdp_tx_err = stats->vs.xdp_tx_err;
			packets = stats->vs.xdp_packets;
			bytes = stats->vs.xdp_bytes;
			drops = stats->vs.rx_drops;
		} while (u64_stats_fetch_retry(&stats->syncp, start));
		result->peer_tq_xdp_xmit_err += peer_tq_xdp_xmit_err;
		result->xdp_tx_err += xdp_tx_err;
		result->xdp_packets += packets;
		result->xdp_bytes += bytes;
		result->rx_drops += drops;
	}
}

static void veth_get_stats64(struct net_device *dev,
			     struct rtnl_link_stats64 *tot)
{
	struct veth_priv *priv = netdev_priv(dev);
	struct net_device *peer;
	struct veth_stats rx;
	u64 packets, bytes;

	tot->tx_dropped = veth_stats_tx(dev, &packets, &bytes);
	tot->tx_bytes = bytes;
	tot->tx_packets = packets;

	veth_stats_rx(&rx, dev);
	tot->tx_dropped += rx.xdp_tx_err;
	tot->rx_dropped = rx.rx_drops + rx.peer_tq_xdp_xmit_err;
	tot->rx_bytes = rx.xdp_bytes;
	tot->rx_packets = rx.xdp_packets;

	/* our rx is the peer's tx and vice versa: fold in peer counters */
	rcu_read_lock();
	peer = rcu_dereference(priv->peer);
	if (peer) {
		veth_stats_tx(peer, &packets, &bytes);
		tot->rx_bytes += bytes;
		tot->rx_packets += packets;

		veth_stats_rx(&rx, peer);
		tot->tx_dropped += rx.peer_tq_xdp_xmit_err;
		tot->rx_dropped += rx.xdp_tx_err;
		tot->tx_bytes += rx.xdp_bytes;
		tot->tx_packets += rx.xdp_packets;
	}
	rcu_read_unlock();
}

/* fake multicast ability */
static void veth_set_multicast_list(struct net_device *dev)
{
}

/* Pick a peer rx queue for ndo_xdp_xmit traffic, spread by CPU id. */
static int veth_select_rxq(struct net_device *dev)
{
	return smp_processor_id() % dev->real_num_rx_queues;
}

static struct net_device *veth_peer_dev(struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);

	/* Callers must be under RCU read side. */
	return rcu_dereference(priv->peer);
}

/* Push up to @n xdp_frames into the peer's rx ring.  Returns the number
 * actually queued, or a negative errno.  @ndo_xmit selects whether the
 * peer_tq_xdp_xmit* counters are updated (true for the ndo entry point,
 * false for internal XDP_TX flushing).
 */
static int veth_xdp_xmit(struct net_device *dev, int n,
			 struct xdp_frame **frames,
			 u32 flags, bool ndo_xmit)
{
	struct veth_priv *rcv_priv, *priv = netdev_priv(dev);
	int i, ret = -ENXIO, nxmit = 0;
	struct net_device *rcv;
	unsigned int max_len;
	struct veth_rq *rq;

	if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK))
		return -EINVAL;

	rcu_read_lock();
	rcv = rcu_dereference(priv->peer);
	if (unlikely(!rcv))
		goto out;

	rcv_priv = netdev_priv(rcv);
	rq = &rcv_priv->rq[veth_select_rxq(rcv)];
	/* The napi pointer is set if NAPI is enabled, which ensures that
	 * xdp_ring is initialized on receive side and the peer device is up.
	 */
	if (!rcu_access_pointer(rq->napi))
		goto out;

	max_len = rcv->mtu + rcv->hard_header_len + VLAN_HLEN;

	spin_lock(&rq->xdp_ring.producer_lock);
	for (i = 0; i < n; i++) {
		struct xdp_frame *frame = frames[i];
		void *ptr = veth_xdp_to_ptr(frame);

		/* stop at the first oversized frame or full ring */
		if (unlikely(xdp_get_frame_len(frame) > max_len ||
			     __ptr_ring_produce(&rq->xdp_ring, ptr)))
			break;
		nxmit++;
	}
	spin_unlock(&rq->xdp_ring.producer_lock);

	if (flags & XDP_XMIT_FLUSH)
		__veth_xdp_flush(rq);

	ret = nxmit;
	if (ndo_xmit) {
		u64_stats_update_begin(&rq->stats.syncp);
		rq->stats.vs.peer_tq_xdp_xmit += nxmit;
		rq->stats.vs.peer_tq_xdp_xmit_err += n - nxmit;
		u64_stats_update_end(&rq->stats.syncp);
	}

out:
	rcu_read_unlock();

	return ret;
}

static int veth_ndo_xdp_xmit(struct net_device *dev, int n,
			     struct xdp_frame **frames, u32 flags)
{
	int err;

	err = veth_xdp_xmit(dev, n, frames, flags, true);
	if (err < 0) {
		struct veth_priv *priv = netdev_priv(dev);

		/* on hard failure all @n frames count as dropped */
		atomic64_add(n, &priv->dropped);
	}

	return err;
}

/* Drain the XDP_TX bulk queue into the peer, returning unsent frames. */
static void veth_xdp_flush_bq(struct veth_rq *rq, struct veth_xdp_tx_bq *bq)
{
	int sent, i, err = 0, drops;

	sent = veth_xdp_xmit(rq->dev, bq->count, bq->q, 0, false);
	if (sent < 0) {
		err = sent;
		sent = 0;
	}

	for (i = sent; unlikely(i < bq->count); i++)
		xdp_return_frame(bq->q[i]);

	drops = bq->count - sent;
	trace_xdp_bulk_tx(rq->dev, sent, drops, err);

	u64_stats_update_begin(&rq->stats.syncp);
	rq->stats.vs.xdp_tx += sent;
	rq->stats.vs.xdp_tx_err += drops;
	u64_stats_update_end(&rq->stats.syncp);

	bq->count = 0;
}

/* Flush the bulk queue and kick the peer's NAPI so it consumes the ring. */
static void veth_xdp_flush(struct veth_rq *rq, struct veth_xdp_tx_bq *bq)
{
	struct veth_priv *rcv_priv, *priv = netdev_priv(rq->dev);
	struct net_device *rcv;
	struct veth_rq *rcv_rq;

	rcu_read_lock();
	veth_xdp_flush_bq(rq, bq);
	rcv = rcu_dereference(priv->peer);
	if (unlikely(!rcv))
		goto out;

	rcv_priv = netdev_priv(rcv);
	rcv_rq = &rcv_priv->rq[veth_select_rxq(rcv)];
	/* xdp_ring is initialized on receive side? */
	if (unlikely(!rcu_access_pointer(rcv_rq->xdp_prog)))
		goto out;

	__veth_xdp_flush(rcv_rq);
out:
	rcu_read_unlock();
}

/* Queue one XDP_TX buffer on the bulk queue, flushing it when full. */
static int veth_xdp_tx(struct veth_rq *rq, struct xdp_buff *xdp,
		       struct veth_xdp_tx_bq *bq)
{
	struct xdp_frame *frame = xdp_convert_buff_to_frame(xdp);

	if (unlikely(!frame))
		return -EOVERFLOW;

	if (unlikely(bq->count == VETH_XDP_TX_BULK_SIZE))
		veth_xdp_flush_bq(rq, bq);

	bq->q[bq->count++] = frame;

	return 0;
}

/* Run the attached XDP program on one xdp_frame consumed from the ring.
 * Returns the frame on XDP_PASS (caller builds an skb from it), NULL when
 * the frame was consumed (TX/REDIRECT) or dropped.
 */
static struct xdp_frame *veth_xdp_rcv_one(struct veth_rq *rq,
					  struct xdp_frame *frame,
					  struct veth_xdp_tx_bq *bq,
					  struct veth_stats *stats)
{
	struct xdp_frame orig_frame;
	struct bpf_prog *xdp_prog;

	rcu_read_lock();
	xdp_prog = rcu_dereference(rq->xdp_prog);
	if (likely(xdp_prog)) {
		struct veth_xdp_buff vxbuf;
		struct xdp_buff *xdp = &vxbuf.xdp;
		u32 act;

		xdp_convert_frame_to_buff(frame, xdp);
		xdp->rxq = &rq->xdp_rxq;
		vxbuf.skb = NULL;

		act = bpf_prog_run_xdp(xdp_prog, xdp);

		switch (act) {
		case XDP_PASS:
			if (xdp_update_frame_from_buff(xdp, frame))
				goto err_xdp;
			break;
		case XDP_TX:
			/* keep a copy so the original frame can be freed if
			 * the bulk-tx path fails
			 */
			orig_frame = *frame;
			xdp->rxq->mem = frame->mem;
			if (unlikely(veth_xdp_tx(rq, xdp, bq) < 0)) {
				trace_xdp_exception(rq->dev, xdp_prog, act);
				frame = &orig_frame;
				stats->rx_drops++;
				goto err_xdp;
			}
			stats->xdp_tx++;
			rcu_read_unlock();
			goto xdp_xmit;
		case XDP_REDIRECT:
			orig_frame = *frame;
			xdp->rxq->mem = frame->mem;
			if (xdp_do_redirect(rq->dev, xdp, xdp_prog)) {
				frame = &orig_frame;
				stats->rx_drops++;
				goto err_xdp;
			}
			stats->xdp_redirect++;
			rcu_read_unlock();
			goto xdp_xmit;
		default:
			bpf_warn_invalid_xdp_action(rq->dev, xdp_prog, act);
			fallthrough;
		case XDP_ABORTED:
			trace_xdp_exception(rq->dev, xdp_prog, act);
			fallthrough;
		case XDP_DROP:
			stats->xdp_drops++;
			goto err_xdp;
		}
	}
	rcu_read_unlock();

	return frame;
err_xdp:
	rcu_read_unlock();
	xdp_return_frame(frame);
xdp_xmit:
	return NULL;
}

/* frames array contains VETH_XDP_BATCH at most */
static void veth_xdp_rcv_bulk_skb(struct veth_rq *rq, void **frames,
				  int n_xdpf, struct veth_xdp_tx_bq *bq,
				  struct veth_stats *stats)
{
	void *skbs[VETH_XDP_BATCH];
	int i;

	/* bulk-allocate the skb heads; on failure drop the whole batch */
	if (xdp_alloc_skb_bulk(skbs, n_xdpf,
			       GFP_ATOMIC | __GFP_ZERO) < 0) {
		for (i = 0; i < n_xdpf; i++)
			xdp_return_frame(frames[i]);
		stats->rx_drops += n_xdpf;

		return;
	}

	for (i = 0; i < n_xdpf; i++) {
		struct sk_buff *skb = skbs[i];

		skb = __xdp_build_skb_from_frame(frames[i], skb,
						 rq->dev);
		if (!skb) {
			xdp_return_frame(frames[i]);
			stats->rx_drops++;
			continue;
		}
		napi_gro_receive(&rq->xdp_napi, skb);
	}
}

/* Take an extra reference on every page backing @xdp (head + frags). */
static void veth_xdp_get(struct xdp_buff *xdp)
{
	struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp);
	int i;

	get_page(virt_to_page(xdp->data));
	if (likely(!xdp_buff_has_frags(xdp)))
		return;

	for (i = 0; i < sinfo->nr_frags; i++)
		__skb_frag_ref(&sinfo->frags[i]);
}

/* Turn *pskb into an xdp_buff the XDP program may modify, reallocating it
 * from the queue's page_pool when the skb is shared, cloned, fragmented or
 * lacks XDP headroom.  On success *pskb may point to a new skb; on failure
 * the skb is consumed and *pskb is set to NULL.
 */
static int veth_convert_skb_to_xdp_buff(struct veth_rq *rq,
					struct xdp_buff *xdp,
					struct sk_buff **pskb)
{
	struct sk_buff *skb = *pskb;
	u32 frame_sz;

	if (skb_shared(skb) || skb_head_is_locked(skb) ||
	    skb_shinfo(skb)->nr_frags ||
	    skb_headroom(skb) < XDP_PACKET_HEADROOM) {
		u32 size, len, max_head_size, off;
		struct sk_buff *nskb;
		struct page *page;
		int i, head_off;

		/* We need a private copy of the skb and data buffers since
		 * the ebpf program can modify it. We segment the original skb
		 * into order-0 pages without linearize it.
		 *
		 * Make sure we have enough space for linear and paged area
		 */
		max_head_size = SKB_WITH_OVERHEAD(PAGE_SIZE -
						  VETH_XDP_HEADROOM);
		if (skb->len > PAGE_SIZE * MAX_SKB_FRAGS + max_head_size)
			goto drop;

		/* Allocate skb head */
		page = page_pool_dev_alloc_pages(rq->page_pool);
		if (!page)
			goto drop;

		nskb = napi_build_skb(page_address(page), PAGE_SIZE);
		if (!nskb) {
			page_pool_put_full_page(rq->page_pool, page, true);
			goto drop;
		}

		skb_reserve(nskb, VETH_XDP_HEADROOM);
		skb_copy_header(nskb, skb);
		skb_mark_for_recycle(nskb);

		size = min_t(u32, skb->len, max_head_size);
		if (skb_copy_bits(skb, 0, nskb->data, size)) {
			consume_skb(nskb);
			goto drop;
		}
		skb_put(nskb, size);

		head_off = skb_headroom(nskb) - skb_headroom(skb);
		skb_headers_offset_update(nskb, head_off);

		/* Allocate paged area of new skb */
		off = size;
		len = skb->len - off;

		for (i = 0; i < MAX_SKB_FRAGS && off < skb->len; i++) {
			page = page_pool_dev_alloc_pages(rq->page_pool);
			if (!page) {
				consume_skb(nskb);
				goto drop;
			}

			size = min_t(u32, len, PAGE_SIZE);
			skb_add_rx_frag(nskb, i, page, 0, size, PAGE_SIZE);
			if (skb_copy_bits(skb, off, page_address(page),
					  size)) {
				consume_skb(nskb);
				goto drop;
			}

			len -= size;
			off += size;
		}

		consume_skb(skb);
		skb = nskb;
	}

	/* SKB "head" area always have tailroom for skb_shared_info */
	frame_sz = skb_end_pointer(skb) - skb->head;
	frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
	xdp_init_buff(xdp, frame_sz, &rq->xdp_rxq);
	xdp_prepare_buff(xdp, skb->head, skb_headroom(skb),
			 skb_headlen(skb), true);

	if (skb_is_nonlinear(skb)) {
		skb_shinfo(skb)->xdp_frags_size = skb->data_len;
		xdp_buff_set_frags_flag(xdp);
	} else {
		xdp_buff_clear_frags_flag(xdp);
	}
	*pskb = skb;

	return 0;
drop:
	consume_skb(skb);
	*pskb = NULL;

	return -ENOMEM;
}

/* Run the attached XDP program on an skb consumed from the ring.  Returns
 * the (possibly adjusted) skb on XDP_PASS or when no program is attached,
 * NULL when the packet was consumed (TX/REDIRECT) or dropped.
 */
static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq,
					struct sk_buff *skb,
					struct veth_xdp_tx_bq *bq,
					struct veth_stats *stats)
{
	void *orig_data, *orig_data_end;
	struct bpf_prog *xdp_prog;
	struct veth_xdp_buff vxbuf;
	struct xdp_buff *xdp = &vxbuf.xdp;
	u32 act, metalen;
	int off;

	skb_prepare_for_gro(skb);

	rcu_read_lock();
	xdp_prog = rcu_dereference(rq->xdp_prog);
	if (unlikely(!xdp_prog)) {
		rcu_read_unlock();
		goto out;
	}

	__skb_push(skb, skb->data - skb_mac_header(skb));
	if (veth_convert_skb_to_xdp_buff(rq, xdp, &skb))
		goto drop;
	vxbuf.skb = skb;

	orig_data = xdp->data;
	orig_data_end = xdp->data_end;

	act = bpf_prog_run_xdp(xdp_prog, xdp);

	switch (act) {
	case XDP_PASS:
		break;
	case XDP_TX:
		/* the pages now travel as an xdp_frame: take refs and drop
		 * the skb wrapper
		 */
		veth_xdp_get(xdp);
		consume_skb(skb);
		xdp->rxq->mem = rq->xdp_mem;
		if (unlikely(veth_xdp_tx(rq, xdp, bq) < 0)) {
			trace_xdp_exception(rq->dev, xdp_prog, act);
			stats->rx_drops++;
			goto err_xdp;
		}
		stats->xdp_tx++;
		rcu_read_unlock();
		goto xdp_xmit;
	case XDP_REDIRECT:
		veth_xdp_get(xdp);
		consume_skb(skb);
		xdp->rxq->mem = rq->xdp_mem;
		if (xdp_do_redirect(rq->dev, xdp, xdp_prog)) {
			stats->rx_drops++;
			goto err_xdp;
		}
		stats->xdp_redirect++;
		rcu_read_unlock();
		goto xdp_xmit;
	default:
		bpf_warn_invalid_xdp_action(rq->dev, xdp_prog, act);
		fallthrough;
	case XDP_ABORTED:
		trace_xdp_exception(rq->dev, xdp_prog, act);
		fallthrough;
	case XDP_DROP:
		stats->xdp_drops++;
		goto xdp_drop;
	}
	rcu_read_unlock();

	/* check if bpf_xdp_adjust_head was used */
	off = orig_data - xdp->data;
	if (off > 0)
		__skb_push(skb, off);
	else if (off < 0)
		__skb_pull(skb, -off);

	skb_reset_mac_header(skb);

	/* check if bpf_xdp_adjust_tail was used */
	off = xdp->data_end - orig_data_end;
	if (off != 0)
		__skb_put(skb, off); /* positive on grow, negative on shrink */

	/* XDP frag metadata (e.g. nr_frags) are updated in eBPF helpers
	 * (e.g. bpf_xdp_adjust_tail), we need to update data_len here.
	 */
	if (xdp_buff_has_frags(xdp))
		skb->data_len = skb_shinfo(skb)->xdp_frags_size;
	else
		skb->data_len = 0;

	skb->protocol = eth_type_trans(skb, rq->dev);

	metalen = xdp->data - xdp->data_meta;
	if (metalen)
		skb_metadata_set(skb, metalen);
out:
	return skb;
drop:
	stats->rx_drops++;
xdp_drop:
	rcu_read_unlock();
	kfree_skb(skb);
	return NULL;
err_xdp:
	rcu_read_unlock();
	xdp_return_buff(xdp);
xdp_xmit:
	return NULL;
}

/* NAPI consumer: pull up to @budget entries off the ring, running XDP on
 * each; passed xdp_frames are batched into skbs VETH_XDP_BATCH at a time.
 */
static int veth_xdp_rcv(struct veth_rq *rq, int budget,
			struct veth_xdp_tx_bq *bq,
			struct veth_stats *stats)
{
	int i, done = 0, n_xdpf = 0;
	void *xdpf[VETH_XDP_BATCH];

	for (i = 0; i < budget; i++) {
		void *ptr = __ptr_ring_consume(&rq->xdp_ring);

		if (!ptr)
			break;

		if (veth_is_xdp_frame(ptr)) {
			/* ndo_xdp_xmit */
			struct xdp_frame *frame = veth_ptr_to_xdp(ptr);

			stats->xdp_bytes += xdp_get_frame_len(frame);
			frame = veth_xdp_rcv_one(rq, frame, bq, stats);
			if (frame) {
				/* XDP_PASS */
				xdpf[n_xdpf++] = frame;
				if (n_xdpf == VETH_XDP_BATCH) {
					veth_xdp_rcv_bulk_skb(rq, xdpf, n_xdpf,
							      bq, stats);
					n_xdpf = 0;
				}
			}
		} else {
			/* ndo_start_xmit */
			struct sk_buff *skb = ptr;

			stats->xdp_bytes += skb->len;
			skb = veth_xdp_rcv_skb(rq, skb, bq, stats);
			if (skb) {
				if (skb_shared(skb) || skb_unclone(skb, GFP_ATOMIC))
					netif_receive_skb(skb);
				else
					napi_gro_receive(&rq->xdp_napi, skb);
			}
		}
		done++;
	}

	if (n_xdpf)
		veth_xdp_rcv_bulk_skb(rq, xdpf, n_xdpf, bq,
stats); 986 987 u64_stats_update_begin(&rq->stats.syncp); 988 rq->stats.vs.xdp_redirect += stats->xdp_redirect; 989 rq->stats.vs.xdp_bytes += stats->xdp_bytes; 990 rq->stats.vs.xdp_drops += stats->xdp_drops; 991 rq->stats.vs.rx_drops += stats->rx_drops; 992 rq->stats.vs.xdp_packets += done; 993 u64_stats_update_end(&rq->stats.syncp); 994 995 return done; 996 } 997 998 static int veth_poll(struct napi_struct *napi, int budget) 999 { 1000 struct veth_rq *rq = 1001 container_of(napi, struct veth_rq, xdp_napi); 1002 struct veth_stats stats = {}; 1003 struct veth_xdp_tx_bq bq; 1004 int done; 1005 1006 bq.count = 0; 1007 1008 xdp_set_return_frame_no_direct(); 1009 done = veth_xdp_rcv(rq, budget, &bq, &stats); 1010 1011 if (stats.xdp_redirect > 0) 1012 xdp_do_flush(); 1013 1014 if (done < budget && napi_complete_done(napi, done)) { 1015 /* Write rx_notify_masked before reading ptr_ring */ 1016 smp_store_mb(rq->rx_notify_masked, false); 1017 if (unlikely(!__ptr_ring_empty(&rq->xdp_ring))) { 1018 if (napi_schedule_prep(&rq->xdp_napi)) { 1019 WRITE_ONCE(rq->rx_notify_masked, true); 1020 __napi_schedule(&rq->xdp_napi); 1021 } 1022 } 1023 } 1024 1025 if (stats.xdp_tx > 0) 1026 veth_xdp_flush(rq, &bq); 1027 xdp_clear_return_frame_no_direct(); 1028 1029 return done; 1030 } 1031 1032 static int veth_create_page_pool(struct veth_rq *rq) 1033 { 1034 struct page_pool_params pp_params = { 1035 .order = 0, 1036 .pool_size = VETH_RING_SIZE, 1037 .nid = NUMA_NO_NODE, 1038 .dev = &rq->dev->dev, 1039 }; 1040 1041 rq->page_pool = page_pool_create(&pp_params); 1042 if (IS_ERR(rq->page_pool)) { 1043 int err = PTR_ERR(rq->page_pool); 1044 1045 rq->page_pool = NULL; 1046 return err; 1047 } 1048 1049 return 0; 1050 } 1051 1052 static int __veth_napi_enable_range(struct net_device *dev, int start, int end) 1053 { 1054 struct veth_priv *priv = netdev_priv(dev); 1055 int err, i; 1056 1057 for (i = start; i < end; i++) { 1058 err = veth_create_page_pool(&priv->rq[i]); 1059 if (err) 1060 goto 
err_page_pool; 1061 } 1062 1063 for (i = start; i < end; i++) { 1064 struct veth_rq *rq = &priv->rq[i]; 1065 1066 err = ptr_ring_init(&rq->xdp_ring, VETH_RING_SIZE, GFP_KERNEL); 1067 if (err) 1068 goto err_xdp_ring; 1069 } 1070 1071 for (i = start; i < end; i++) { 1072 struct veth_rq *rq = &priv->rq[i]; 1073 1074 napi_enable(&rq->xdp_napi); 1075 rcu_assign_pointer(priv->rq[i].napi, &priv->rq[i].xdp_napi); 1076 } 1077 1078 return 0; 1079 1080 err_xdp_ring: 1081 for (i--; i >= start; i--) 1082 ptr_ring_cleanup(&priv->rq[i].xdp_ring, veth_ptr_free); 1083 i = end; 1084 err_page_pool: 1085 for (i--; i >= start; i--) { 1086 page_pool_destroy(priv->rq[i].page_pool); 1087 priv->rq[i].page_pool = NULL; 1088 } 1089 1090 return err; 1091 } 1092 1093 static int __veth_napi_enable(struct net_device *dev) 1094 { 1095 return __veth_napi_enable_range(dev, 0, dev->real_num_rx_queues); 1096 } 1097 1098 static void veth_napi_del_range(struct net_device *dev, int start, int end) 1099 { 1100 struct veth_priv *priv = netdev_priv(dev); 1101 int i; 1102 1103 for (i = start; i < end; i++) { 1104 struct veth_rq *rq = &priv->rq[i]; 1105 1106 rcu_assign_pointer(priv->rq[i].napi, NULL); 1107 napi_disable(&rq->xdp_napi); 1108 __netif_napi_del(&rq->xdp_napi); 1109 } 1110 synchronize_net(); 1111 1112 for (i = start; i < end; i++) { 1113 struct veth_rq *rq = &priv->rq[i]; 1114 1115 rq->rx_notify_masked = false; 1116 ptr_ring_cleanup(&rq->xdp_ring, veth_ptr_free); 1117 } 1118 1119 for (i = start; i < end; i++) { 1120 page_pool_destroy(priv->rq[i].page_pool); 1121 priv->rq[i].page_pool = NULL; 1122 } 1123 } 1124 1125 static void veth_napi_del(struct net_device *dev) 1126 { 1127 veth_napi_del_range(dev, 0, dev->real_num_rx_queues); 1128 } 1129 1130 static bool veth_gro_requested(const struct net_device *dev) 1131 { 1132 return !!(dev->wanted_features & NETIF_F_GRO); 1133 } 1134 1135 static int veth_enable_xdp_range(struct net_device *dev, int start, int end, 1136 bool napi_already_on) 1137 { 1138 
	struct veth_priv *priv = netdev_priv(dev);
	int err, i;

	for (i = start; i < end; i++) {
		struct veth_rq *rq = &priv->rq[i];

		if (!napi_already_on)
			netif_napi_add(dev, &rq->xdp_napi, veth_poll);
		err = xdp_rxq_info_reg(&rq->xdp_rxq, dev, i, rq->xdp_napi.napi_id);
		if (err < 0)
			goto err_rxq_reg;

		err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq,
						 MEM_TYPE_PAGE_SHARED,
						 NULL);
		if (err < 0)
			goto err_reg_mem;

		/* Save original mem info as it can be overwritten */
		rq->xdp_mem = rq->xdp_rxq.mem;
	}
	return 0;

err_reg_mem:
	/* The failed index has a registered rxq but no mem model yet */
	xdp_rxq_info_unreg(&priv->rq[i].xdp_rxq);
err_rxq_reg:
	for (i--; i >= start; i--) {
		struct veth_rq *rq = &priv->rq[i];

		xdp_rxq_info_unreg(&rq->xdp_rxq);
		if (!napi_already_on)
			netif_napi_del(&rq->xdp_napi);
	}

	return err;
}

/* Undo veth_enable_xdp_range() for [start, end); restore the saved mem
 * info before unregistering, and delete NAPI only when asked to.
 */
static void veth_disable_xdp_range(struct net_device *dev, int start, int end,
				   bool delete_napi)
{
	struct veth_priv *priv = netdev_priv(dev);
	int i;

	for (i = start; i < end; i++) {
		struct veth_rq *rq = &priv->rq[i];

		rq->xdp_rxq.mem = rq->xdp_mem;
		xdp_rxq_info_unreg(&rq->xdp_rxq);

		if (delete_napi)
			netif_napi_del(&rq->xdp_napi);
	}
}

/* Attach the device's XDP program to every RX queue, bringing up NAPI and
 * rxq registration on first use (queue 0 not registered yet).
 */
static int veth_enable_xdp(struct net_device *dev)
{
	bool napi_already_on = veth_gro_requested(dev) && (dev->flags & IFF_UP);
	struct veth_priv *priv = netdev_priv(dev);
	int err, i;

	if (!xdp_rxq_info_is_reg(&priv->rq[0].xdp_rxq)) {
		err = veth_enable_xdp_range(dev, 0, dev->real_num_rx_queues, napi_already_on);
		if (err)
			return err;

		if (!napi_already_on) {
			err = __veth_napi_enable(dev);
			if (err) {
				veth_disable_xdp_range(dev, 0, dev->real_num_rx_queues, true);
				return err;
			}

			if (!veth_gro_requested(dev)) {
				/* user-space did not require GRO, but adding XDP
				 * is
supposed to get GRO working
				 */
				dev->features |= NETIF_F_GRO;
				netdev_features_change(dev);
			}
		}
	}

	for (i = 0; i < dev->real_num_rx_queues; i++) {
		rcu_assign_pointer(priv->rq[i].xdp_prog, priv->_xdp_prog);
		rcu_assign_pointer(priv->rq[i].napi, &priv->rq[i].xdp_napi);
	}

	return 0;
}

/* Detach the XDP program from every RX queue and tear down the NAPI/rxq
 * state, unless the device is running with user-requested GRO, in which
 * case NAPI must stay alive.
 */
static void veth_disable_xdp(struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);
	int i;

	for (i = 0; i < dev->real_num_rx_queues; i++)
		rcu_assign_pointer(priv->rq[i].xdp_prog, NULL);

	if (!netif_running(dev) || !veth_gro_requested(dev)) {
		veth_napi_del(dev);

		/* if user-space did not require GRO, since adding XDP
		 * enabled it, clear it now
		 */
		if (!veth_gro_requested(dev) && netif_running(dev)) {
			dev->features &= ~NETIF_F_GRO;
			netdev_features_change(dev);
		}
	}

	veth_disable_xdp_range(dev, 0, dev->real_num_rx_queues, false);
}

/* Add and enable NAPI for queues [start, end); on failure the freshly
 * added NAPI instances are deleted again.
 */
static int veth_napi_enable_range(struct net_device *dev, int start, int end)
{
	struct veth_priv *priv = netdev_priv(dev);
	int err, i;

	for (i = start; i < end; i++) {
		struct veth_rq *rq = &priv->rq[i];

		netif_napi_add(dev, &rq->xdp_napi, veth_poll);
	}

	err = __veth_napi_enable_range(dev, start, end);
	if (err) {
		for (i = start; i < end; i++) {
			struct veth_rq *rq = &priv->rq[i];

			netif_napi_del(&rq->xdp_napi);
		}
		return err;
	}
	return err;
}

static int veth_napi_enable(struct net_device *dev)
{
	return veth_napi_enable_range(dev, 0, dev->real_num_rx_queues);
}

/* Disable queues [start, end) for a channel-count change, matching
 * whichever setup path (XDP or plain GRO) is currently active.
 */
static void veth_disable_range_safe(struct net_device *dev, int start, int end)
{
	struct veth_priv *priv = netdev_priv(dev);

	if (start >= end)
		return;

	if (priv->_xdp_prog) {
		veth_napi_del_range(dev, start, end);
		veth_disable_xdp_range(dev,
start, end, false);
	} else if (veth_gro_requested(dev)) {
		veth_napi_del_range(dev, start, end);
	}
}

/* Enable queues [start, end) for a channel-count change, mirroring
 * veth_disable_range_safe().
 */
static int veth_enable_range_safe(struct net_device *dev, int start, int end)
{
	struct veth_priv *priv = netdev_priv(dev);
	int err;

	if (start >= end)
		return 0;

	if (priv->_xdp_prog) {
		/* these channels are freshly initialized, napi is not on there even
		 * when GRO is requested
		 */
		err = veth_enable_xdp_range(dev, start, end, false);
		if (err)
			return err;

		err = __veth_napi_enable_range(dev, start, end);
		if (err) {
			/* on error always delete the newly added napis */
			veth_disable_xdp_range(dev, start, end, true);
			return err;
		}
	} else if (veth_gro_requested(dev)) {
		return veth_napi_enable_range(dev, start, end);
	}
	return 0;
}

/* Advertise XDP feature flags: basic XDP is only possible when every peer
 * TX queue maps onto one of our RX queues; NDO_XMIT additionally needs the
 * peer to have NAPI running (its own XDP program or GRO).
 */
static void veth_set_xdp_features(struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);
	struct net_device *peer;

	peer = rtnl_dereference(priv->peer);
	if (peer && peer->real_num_tx_queues <= dev->real_num_rx_queues) {
		struct veth_priv *priv_peer = netdev_priv(peer);
		xdp_features_t val = NETDEV_XDP_ACT_BASIC |
				     NETDEV_XDP_ACT_REDIRECT |
				     NETDEV_XDP_ACT_RX_SG;

		if (priv_peer->_xdp_prog || veth_gro_requested(peer))
			val |= NETDEV_XDP_ACT_NDO_XMIT |
			       NETDEV_XDP_ACT_NDO_XMIT_SG;
		xdp_set_features_flag(dev, val);
	} else {
		xdp_clear_features_flag(dev);
	}
}

/* ethtool set_channels: resize the RX/TX queue counts, allocating the new
 * RX ranges before the switch and releasing the old ones after, so a
 * running device never loses its active queues mid-change.
 */
static int veth_set_channels(struct net_device *dev,
			     struct ethtool_channels *ch)
{
	struct veth_priv *priv = netdev_priv(dev);
	unsigned int old_rx_count, new_rx_count;
	struct veth_priv *peer_priv;
	struct net_device *peer;
	int err;

	/* sanity check.
 Upper bounds are already enforced by the caller */
	if (!ch->rx_count || !ch->tx_count)
		return -EINVAL;

	/* avoid breaking XDP, if that is enabled */
	peer = rtnl_dereference(priv->peer);
	peer_priv = peer ? netdev_priv(peer) : NULL;
	if (priv->_xdp_prog && peer && ch->rx_count < peer->real_num_tx_queues)
		return -EINVAL;

	if (peer && peer_priv && peer_priv->_xdp_prog && ch->tx_count > peer->real_num_rx_queues)
		return -EINVAL;

	old_rx_count = dev->real_num_rx_queues;
	new_rx_count = ch->rx_count;
	if (netif_running(dev)) {
		/* turn device off */
		netif_carrier_off(dev);
		if (peer)
			netif_carrier_off(peer);

		/* try to allocate new resources, as needed */
		err = veth_enable_range_safe(dev, old_rx_count, new_rx_count);
		if (err)
			goto out;
	}

	err = netif_set_real_num_rx_queues(dev, ch->rx_count);
	if (err)
		goto revert;

	err = netif_set_real_num_tx_queues(dev, ch->tx_count);
	if (err) {
		int err2 = netif_set_real_num_rx_queues(dev, old_rx_count);

		/* this error condition could happen only if rx and tx change
		 * in opposite directions (e.g.
 tx nr raises, rx nr decreases)
		 * and we can't do anything to fully restore the original
		 * status
		 */
		if (err2)
			pr_warn("Can't restore rx queues config %d -> %d %d",
				new_rx_count, old_rx_count, err2);
		else
			goto revert;
	}

out:
	if (netif_running(dev)) {
		/* note that we need to swap the arguments WRT the enable part
		 * to identify the range we have to disable
		 */
		veth_disable_range_safe(dev, new_rx_count, old_rx_count);
		netif_carrier_on(dev);
		if (peer)
			netif_carrier_on(peer);
	}

	/* update XDP supported features */
	veth_set_xdp_features(dev);
	if (peer)
		veth_set_xdp_features(peer);

	return err;

revert:
	/* Swap the counts so the shared 'out' path tears down the ranges we
	 * just enabled instead of the pre-existing ones.
	 */
	new_rx_count = old_rx_count;
	old_rx_count = ch->rx_count;
	goto out;
}

/* ndo_open: refuse to come up without a peer; bring up XDP or GRO NAPI as
 * configured, and raise carrier on both ends if the peer is already UP.
 */
static int veth_open(struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);
	struct net_device *peer = rtnl_dereference(priv->peer);
	int err;

	if (!peer)
		return -ENOTCONN;

	if (priv->_xdp_prog) {
		err = veth_enable_xdp(dev);
		if (err)
			return err;
	} else if (veth_gro_requested(dev)) {
		err = veth_napi_enable(dev);
		if (err)
			return err;
	}

	if (peer->flags & IFF_UP) {
		netif_carrier_on(dev);
		netif_carrier_on(peer);
	}

	return 0;
}

/* ndo_stop: drop carrier on both ends and undo whatever veth_open() set up */
static int veth_close(struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);
	struct net_device *peer = rtnl_dereference(priv->peer);

	netif_carrier_off(dev);
	if (peer)
		netif_carrier_off(peer);

	if (priv->_xdp_prog)
		veth_disable_xdp(dev);
	else if (veth_gro_requested(dev))
		veth_napi_del(dev);

	return 0;
}

static int is_valid_veth_mtu(int mtu)
{
	return mtu >= ETH_MIN_MTU && mtu <= ETH_MAX_MTU;
}

/* Allocate and zero the per-RX-queue state array */
static int veth_alloc_queues(struct net_device *dev)
{
struct veth_priv *priv = netdev_priv(dev); 1475 int i; 1476 1477 priv->rq = kcalloc(dev->num_rx_queues, sizeof(*priv->rq), GFP_KERNEL_ACCOUNT); 1478 if (!priv->rq) 1479 return -ENOMEM; 1480 1481 for (i = 0; i < dev->num_rx_queues; i++) { 1482 priv->rq[i].dev = dev; 1483 u64_stats_init(&priv->rq[i].stats.syncp); 1484 } 1485 1486 return 0; 1487 } 1488 1489 static void veth_free_queues(struct net_device *dev) 1490 { 1491 struct veth_priv *priv = netdev_priv(dev); 1492 1493 kfree(priv->rq); 1494 } 1495 1496 static int veth_dev_init(struct net_device *dev) 1497 { 1498 int err; 1499 1500 dev->lstats = netdev_alloc_pcpu_stats(struct pcpu_lstats); 1501 if (!dev->lstats) 1502 return -ENOMEM; 1503 1504 err = veth_alloc_queues(dev); 1505 if (err) { 1506 free_percpu(dev->lstats); 1507 return err; 1508 } 1509 1510 return 0; 1511 } 1512 1513 static void veth_dev_free(struct net_device *dev) 1514 { 1515 veth_free_queues(dev); 1516 free_percpu(dev->lstats); 1517 } 1518 1519 #ifdef CONFIG_NET_POLL_CONTROLLER 1520 static void veth_poll_controller(struct net_device *dev) 1521 { 1522 /* veth only receives frames when its peer sends one 1523 * Since it has nothing to do with disabling irqs, we are guaranteed 1524 * never to have pending data when we poll for it so 1525 * there is nothing to do here. 1526 * 1527 * We need this though so netpoll recognizes us as an interface that 1528 * supports polling, which enables bridge devices in virt setups to 1529 * still use netconsole 1530 */ 1531 } 1532 #endif /* CONFIG_NET_POLL_CONTROLLER */ 1533 1534 static int veth_get_iflink(const struct net_device *dev) 1535 { 1536 struct veth_priv *priv = netdev_priv(dev); 1537 struct net_device *peer; 1538 int iflink; 1539 1540 rcu_read_lock(); 1541 peer = rcu_dereference(priv->peer); 1542 iflink = peer ? 
 peer->ifindex : 0;
	rcu_read_unlock();

	return iflink;
}

/* ndo_fix_features: a peer running XDP cannot accept GSO skbs, and a local
 * XDP program forces GRO on (NAPI must be running to feed the program).
 */
static netdev_features_t veth_fix_features(struct net_device *dev,
					   netdev_features_t features)
{
	struct veth_priv *priv = netdev_priv(dev);
	struct net_device *peer;

	peer = rtnl_dereference(priv->peer);
	if (peer) {
		struct veth_priv *peer_priv = netdev_priv(peer);

		if (peer_priv->_xdp_prog)
			features &= ~NETIF_F_GSO_SOFTWARE;
	}
	if (priv->_xdp_prog)
		features |= NETIF_F_GRO;

	return features;
}

/* ndo_set_features: react to GRO toggles on a running, non-XDP device by
 * starting/stopping NAPI and updating the peer's redirect-target flag.
 * With XDP attached, NAPI lifecycle is owned by the XDP paths instead.
 */
static int veth_set_features(struct net_device *dev,
			     netdev_features_t features)
{
	netdev_features_t changed = features ^ dev->features;
	struct veth_priv *priv = netdev_priv(dev);
	struct net_device *peer;
	int err;

	if (!(changed & NETIF_F_GRO) || !(dev->flags & IFF_UP) || priv->_xdp_prog)
		return 0;

	peer = rtnl_dereference(priv->peer);
	if (features & NETIF_F_GRO) {
		err = veth_napi_enable(dev);
		if (err)
			return err;

		if (peer)
			xdp_features_set_redirect_target(peer, true);
	} else {
		if (peer)
			xdp_features_clear_redirect_target(peer);
		veth_napi_del(dev);
	}
	return 0;
}

/* ndo_set_rx_headroom: both ends share the larger of the two requested
 * headrooms so either direction can be transmitted without reallocation.
 */
static void veth_set_rx_headroom(struct net_device *dev, int new_hr)
{
	struct veth_priv *peer_priv, *priv = netdev_priv(dev);
	struct net_device *peer;

	if (new_hr < 0)
		new_hr = 0;

	rcu_read_lock();
	peer = rcu_dereference(priv->peer);
	if (unlikely(!peer))
		goto out;

	peer_priv = netdev_priv(peer);
	priv->requested_headroom = new_hr;
	new_hr = max(priv->requested_headroom, peer_priv->requested_headroom);
	dev->needed_headroom = new_hr;
	peer->needed_headroom = new_hr;

out:
	rcu_read_unlock();
}

/* Install (or remove, prog == NULL) the device's XDP program, validating
 * peer MTU and queue layout, and adjusting peer GSO/MTU limits.
 */
static int veth_xdp_set(struct net_device *dev, struct bpf_prog *prog,
			struct netlink_ext_ack *extack)
{
	struct veth_priv *priv = netdev_priv(dev);
	struct bpf_prog *old_prog;
	struct net_device *peer;
	unsigned int max_mtu;
	int err;

	/* Install the new program optimistically; the err label restores
	 * old_prog on any validation or setup failure.
	 */
	old_prog = priv->_xdp_prog;
	priv->_xdp_prog = prog;
	peer = rtnl_dereference(priv->peer);

	if (prog) {
		if (!peer) {
			NL_SET_ERR_MSG_MOD(extack, "Cannot set XDP when peer is detached");
			err = -ENOTCONN;
			goto err;
		}

		/* Largest peer MTU for which a frame still fits in one page
		 * together with the XDP headroom and skb overhead.
		 */
		max_mtu = SKB_WITH_OVERHEAD(PAGE_SIZE - VETH_XDP_HEADROOM) -
			  peer->hard_header_len;
		/* Allow increasing the max_mtu if the program supports
		 * XDP fragments.
		 */
		if (prog->aux->xdp_has_frags)
			max_mtu += PAGE_SIZE * MAX_SKB_FRAGS;

		if (peer->mtu > max_mtu) {
			NL_SET_ERR_MSG_MOD(extack, "Peer MTU is too large to set XDP");
			err = -ERANGE;
			goto err;
		}

		if (dev->real_num_rx_queues < peer->real_num_tx_queues) {
			NL_SET_ERR_MSG_MOD(extack, "XDP expects number of rx queues not less than peer tx queues");
			err = -ENOSPC;
			goto err;
		}

		if (dev->flags & IFF_UP) {
			err = veth_enable_xdp(dev);
			if (err) {
				NL_SET_ERR_MSG_MOD(extack, "Setup for XDP failed");
				goto err;
			}
		}

		/* First program: clamp the peer so it cannot hand us GSO
		 * skbs or frames larger than the program can see.
		 */
		if (!old_prog) {
			peer->hw_features &= ~NETIF_F_GSO_SOFTWARE;
			peer->max_mtu = max_mtu;
		}

		xdp_features_set_redirect_target(peer, true);
	}

	if (old_prog) {
		if (!prog) {
			/* Last program removed: undo the peer clamps, unless
			 * GRO keeps us a valid redirect target.
			 */
			if (peer && !veth_gro_requested(dev))
				xdp_features_clear_redirect_target(peer);

			if (dev->flags & IFF_UP)
				veth_disable_xdp(dev);

			if (peer) {
				peer->hw_features |= NETIF_F_GSO_SOFTWARE;
				peer->max_mtu = ETH_MAX_MTU;
			}
		}
		bpf_prog_put(old_prog);
	}

	/* attach/detach transitions change the peer's usable feature set */
	if ((!!old_prog ^ !!prog) && peer)
		netdev_update_features(peer);

	return 0;
err:
	priv->_xdp_prog = old_prog;

	return err;
}

/* ndo_bpf entry point: only program setup is supported */
static int veth_xdp(struct net_device *dev, struct netdev_bpf *xdp)
{
	switch (xdp->command) {
	case XDP_SETUP_PROG:
		return veth_xdp_set(dev, xdp->prog, xdp->extack);
	default:
		return -EINVAL;
	}
}

/* XDP metadata kfunc: hardware RX timestamp, available only when the
 * frame entered as an skb (veth itself produces no hardware timestamps).
 */
static int veth_xdp_rx_timestamp(const struct xdp_md *ctx, u64 *timestamp)
{
	struct veth_xdp_buff *_ctx = (void *)ctx;

	if (!_ctx->skb)
		return -ENODATA;

	*timestamp = skb_hwtstamps(_ctx->skb)->hwtstamp;
	return 0;
}

/* XDP metadata kfunc: RX hash taken from the originating skb */
static int veth_xdp_rx_hash(const struct xdp_md *ctx, u32 *hash,
			    enum xdp_rss_hash_type *rss_type)
{
	struct veth_xdp_buff *_ctx = (void *)ctx;
	struct sk_buff *skb = _ctx->skb;

	if (!skb)
		return -ENODATA;

	*hash = skb_get_hash(skb);
	/* skb only records "L4 or not"; finer hash types are unknown here */
	*rss_type = skb->l4_hash ? XDP_RSS_TYPE_L4_ANY : XDP_RSS_TYPE_NONE;

	return 0;
}

static const struct net_device_ops veth_netdev_ops = {
	.ndo_init		= veth_dev_init,
	.ndo_open		= veth_open,
	.ndo_stop		= veth_close,
	.ndo_start_xmit		= veth_xmit,
	.ndo_get_stats64	= veth_get_stats64,
	.ndo_set_rx_mode	= veth_set_multicast_list,
	.ndo_set_mac_address	= eth_mac_addr,
#ifdef CONFIG_NET_POLL_CONTROLLER
	.ndo_poll_controller	= veth_poll_controller,
#endif
	.ndo_get_iflink		= veth_get_iflink,
	.ndo_fix_features	= veth_fix_features,
	.ndo_set_features	= veth_set_features,
	.ndo_features_check	= passthru_features_check,
	.ndo_set_rx_headroom	= veth_set_rx_headroom,
	.ndo_bpf		= veth_xdp,
	.ndo_xdp_xmit		= veth_ndo_xdp_xmit,
	.ndo_get_peer_dev	= veth_peer_dev,
};

static const struct xdp_metadata_ops veth_xdp_metadata_ops = {
	.xmo_rx_timestamp		= veth_xdp_rx_timestamp,
	.xmo_rx_hash			= veth_xdp_rx_hash,
};

#define VETH_FEATURES	(NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HW_CSUM | \
			 NETIF_F_RXCSUM | NETIF_F_SCTP_CRC | NETIF_F_HIGHDMA | \
			 NETIF_F_GSO_SOFTWARE | NETIF_F_GSO_ENCAP_ALL | \
			 NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX | \
			 NETIF_F_HW_VLAN_STAG_TX | NETIF_F_HW_VLAN_STAG_RX)

/* rtnl_link_ops setup callback: configure defaults for a new veth netdev */
static void veth_setup(struct net_device *dev)
{
	ether_setup(dev);

	dev->priv_flags &= ~IFF_TX_SKB_SHARING;
	dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
	dev->priv_flags |= IFF_NO_QUEUE;
	dev->priv_flags |= IFF_PHONY_HEADROOM;

	dev->netdev_ops = &veth_netdev_ops;
	dev->xdp_metadata_ops = &veth_xdp_metadata_ops;
	dev->ethtool_ops = &veth_ethtool_ops;
	dev->features |= NETIF_F_LLTX;
	dev->features |= VETH_FEATURES;
	/* VLAN offloads are not propagated to stacked VLAN devices */
	dev->vlan_features = dev->features &
			     ~(NETIF_F_HW_VLAN_CTAG_TX |
			       NETIF_F_HW_VLAN_STAG_TX |
			       NETIF_F_HW_VLAN_CTAG_RX |
			       NETIF_F_HW_VLAN_STAG_RX);
	dev->needs_free_netdev = true;
	dev->priv_destructor = veth_dev_free;
	dev->max_mtu = ETH_MAX_MTU;

	dev->hw_features = VETH_FEATURES;
	dev->hw_enc_features = VETH_FEATURES;
	dev->mpls_features = NETIF_F_HW_CSUM | NETIF_F_GSO_SOFTWARE;
	netif_set_tso_max_size(dev, GSO_MAX_SIZE);
}

/*
 * netlink interface
 */

/* Validate IFLA attributes for a veth link (address format and MTU range) */
static int veth_validate(struct nlattr *tb[], struct nlattr *data[],
			 struct netlink_ext_ack *extack)
{
	if (tb[IFLA_ADDRESS]) {
		if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
			return -EINVAL;
		if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
			return -EADDRNOTAVAIL;
	}
	if (tb[IFLA_MTU]) {
		if (!is_valid_veth_mtu(nla_get_u32(tb[IFLA_MTU])))
			return -EINVAL;
	}
	return 0;
}

static struct rtnl_link_ops veth_link_ops;

/* Turn GRO off in both active and wanted features and notify the stack */
static void veth_disable_gro(struct net_device *dev)
{
	dev->features &= ~NETIF_F_GRO;
	dev->wanted_features &= ~NETIF_F_GRO;
	netdev_update_features(dev);
}

/* Default a freshly created device to a single real queue per direction
 * unless the user explicitly asked for more via IFLA_NUM_*_QUEUES.
 */
static int veth_init_queues(struct net_device *dev, struct nlattr *tb[])
{
	int err;

	if (!tb[IFLA_NUM_TX_QUEUES] && dev->num_tx_queues > 1) {
		err = netif_set_real_num_tx_queues(dev, 1);
		if (err)
return err; 1833 } 1834 if (!tb[IFLA_NUM_RX_QUEUES] && dev->num_rx_queues > 1) { 1835 err = netif_set_real_num_rx_queues(dev, 1); 1836 if (err) 1837 return err; 1838 } 1839 return 0; 1840 } 1841 1842 static int veth_newlink(struct net *src_net, struct net_device *dev, 1843 struct nlattr *tb[], struct nlattr *data[], 1844 struct netlink_ext_ack *extack) 1845 { 1846 int err; 1847 struct net_device *peer; 1848 struct veth_priv *priv; 1849 char ifname[IFNAMSIZ]; 1850 struct nlattr *peer_tb[IFLA_MAX + 1], **tbp; 1851 unsigned char name_assign_type; 1852 struct ifinfomsg *ifmp; 1853 struct net *net; 1854 1855 /* 1856 * create and register peer first 1857 */ 1858 if (data != NULL && data[VETH_INFO_PEER] != NULL) { 1859 struct nlattr *nla_peer; 1860 1861 nla_peer = data[VETH_INFO_PEER]; 1862 ifmp = nla_data(nla_peer); 1863 err = rtnl_nla_parse_ifinfomsg(peer_tb, nla_peer, extack); 1864 if (err < 0) 1865 return err; 1866 1867 err = veth_validate(peer_tb, NULL, extack); 1868 if (err < 0) 1869 return err; 1870 1871 tbp = peer_tb; 1872 } else { 1873 ifmp = NULL; 1874 tbp = tb; 1875 } 1876 1877 if (ifmp && tbp[IFLA_IFNAME]) { 1878 nla_strscpy(ifname, tbp[IFLA_IFNAME], IFNAMSIZ); 1879 name_assign_type = NET_NAME_USER; 1880 } else { 1881 snprintf(ifname, IFNAMSIZ, DRV_NAME "%%d"); 1882 name_assign_type = NET_NAME_ENUM; 1883 } 1884 1885 net = rtnl_link_get_net(src_net, tbp); 1886 if (IS_ERR(net)) 1887 return PTR_ERR(net); 1888 1889 peer = rtnl_create_link(net, ifname, name_assign_type, 1890 &veth_link_ops, tbp, extack); 1891 if (IS_ERR(peer)) { 1892 put_net(net); 1893 return PTR_ERR(peer); 1894 } 1895 1896 if (!ifmp || !tbp[IFLA_ADDRESS]) 1897 eth_hw_addr_random(peer); 1898 1899 if (ifmp && (dev->ifindex != 0)) 1900 peer->ifindex = ifmp->ifi_index; 1901 1902 netif_inherit_tso_max(peer, dev); 1903 1904 err = register_netdevice(peer); 1905 put_net(net); 1906 net = NULL; 1907 if (err < 0) 1908 goto err_register_peer; 1909 1910 /* keep GRO disabled by default to be consistent with the 
 established
	 * veth behavior
	 */
	veth_disable_gro(peer);
	netif_carrier_off(peer);

	err = rtnl_configure_link(peer, ifmp, 0, NULL);
	if (err < 0)
		goto err_configure_peer;

	/*
	 * register dev last
	 *
	 * note, that since we've registered new device the dev's name
	 * should be re-allocated
	 */

	if (tb[IFLA_ADDRESS] == NULL)
		eth_hw_addr_random(dev);

	if (tb[IFLA_IFNAME])
		nla_strscpy(dev->name, tb[IFLA_IFNAME], IFNAMSIZ);
	else
		snprintf(dev->name, IFNAMSIZ, DRV_NAME "%%d");

	err = register_netdevice(dev);
	if (err < 0)
		goto err_register_dev;

	netif_carrier_off(dev);

	/*
	 * tie the devices together
	 */

	priv = netdev_priv(dev);
	rcu_assign_pointer(priv->peer, peer);
	err = veth_init_queues(dev, tb);
	if (err)
		goto err_queues;

	priv = netdev_priv(peer);
	rcu_assign_pointer(priv->peer, dev);
	err = veth_init_queues(peer, tb);
	if (err)
		goto err_queues;

	veth_disable_gro(dev);
	/* update XDP supported features */
	veth_set_xdp_features(dev);
	veth_set_xdp_features(peer);

	return 0;

err_queues:
	unregister_netdevice(dev);
err_register_dev:
	/* nothing to do */
err_configure_peer:
	unregister_netdevice(peer);
	return err;

err_register_peer:
	free_netdev(peer);
	return err;
}

/* rtnl dellink handler: queue both ends for unregistration and sever the
 * peer pointers; freeing is deferred past the RCU grace period.
 */
static void veth_dellink(struct net_device *dev, struct list_head *head)
{
	struct veth_priv *priv;
	struct net_device *peer;

	priv = netdev_priv(dev);
	peer = rtnl_dereference(priv->peer);

	/* Note : dellink() is called from default_device_exit_batch(),
	 * before a rcu_synchronize() point. The devices are guaranteed
	 * not being freed before one RCU grace period.
	 */
	RCU_INIT_POINTER(priv->peer, NULL);
	unregister_netdevice_queue(dev, head);

	if (peer) {
		priv = netdev_priv(peer);
		RCU_INIT_POINTER(priv->peer, NULL);
		unregister_netdevice_queue(peer, head);
	}
}

static const struct nla_policy veth_policy[VETH_INFO_MAX + 1] = {
	[VETH_INFO_PEER]	= { .len = sizeof(struct ifinfomsg) },
};

/* A veth device logically lives in its peer's netns (that's where traffic
 * ends up); fall back to our own netns when the peer is gone.
 */
static struct net *veth_get_link_net(const struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);
	struct net_device *peer = rtnl_dereference(priv->peer);

	return peer ? dev_net(peer) : dev_net(dev);
}

static unsigned int veth_get_num_queues(void)
{
	/* enforce the same queue limit as rtnl_create_link */
	int queues = num_possible_cpus();

	if (queues > 4096)
		queues = 4096;
	return queues;
}

static struct rtnl_link_ops veth_link_ops = {
	.kind			= DRV_NAME,
	.priv_size		= sizeof(struct veth_priv),
	.setup			= veth_setup,
	.validate		= veth_validate,
	.newlink		= veth_newlink,
	.dellink		= veth_dellink,
	.policy			= veth_policy,
	.maxtype		= VETH_INFO_MAX,
	.get_link_net		= veth_get_link_net,
	.get_num_tx_queues	= veth_get_num_queues,
	.get_num_rx_queues	= veth_get_num_queues,
};

/*
 * init/fini
 */

static __init int veth_init(void)
{
	return rtnl_link_register(&veth_link_ops);
}

static __exit void veth_exit(void)
{
	rtnl_link_unregister(&veth_link_ops);
}

module_init(veth_init);
module_exit(veth_exit);

MODULE_DESCRIPTION("Virtual Ethernet Tunnel");
MODULE_LICENSE("GPL v2");
MODULE_ALIAS_RTNL_LINK(DRV_NAME);