1  // SPDX-License-Identifier: GPL-2.0-only
2  /*
3   *  drivers/net/veth.c
4   *
5   *  Copyright (C) 2007 OpenVZ http://openvz.org, SWsoft Inc
6   *
7   * Author: Pavel Emelianov <xemul@openvz.org>
8   * Ethtool interface from: Eric W. Biederman <ebiederm@xmission.com>
9   *
10   */
11  
12  #include <linux/netdevice.h>
13  #include <linux/slab.h>
14  #include <linux/ethtool.h>
15  #include <linux/etherdevice.h>
16  #include <linux/u64_stats_sync.h>
17  
18  #include <net/rtnetlink.h>
19  #include <net/dst.h>
20  #include <net/xfrm.h>
21  #include <net/xdp.h>
22  #include <linux/veth.h>
23  #include <linux/module.h>
24  #include <linux/bpf.h>
25  #include <linux/filter.h>
26  #include <linux/ptr_ring.h>
27  #include <linux/bpf_trace.h>
28  #include <linux/net_tstamp.h>
29  #include <net/page_pool/helpers.h>
30  
31  #define DRV_NAME	"veth"
32  #define DRV_VERSION	"1.0"
33  
34  #define VETH_XDP_FLAG		BIT(0)
35  #define VETH_RING_SIZE		256
36  #define VETH_XDP_HEADROOM	(XDP_PACKET_HEADROOM + NET_IP_ALIGN)
37  
38  #define VETH_XDP_TX_BULK_SIZE	16
39  #define VETH_XDP_BATCH		16
40  
41  struct veth_stats {
42  	u64	rx_drops;
43  	/* xdp */
44  	u64	xdp_packets;
45  	u64	xdp_bytes;
46  	u64	xdp_redirect;
47  	u64	xdp_drops;
48  	u64	xdp_tx;
49  	u64	xdp_tx_err;
50  	u64	peer_tq_xdp_xmit;
51  	u64	peer_tq_xdp_xmit_err;
52  };
53  
54  struct veth_rq_stats {
55  	struct veth_stats	vs;
56  	struct u64_stats_sync	syncp;
57  };
58  
59  struct veth_rq {
60  	struct napi_struct	xdp_napi;
61  	struct napi_struct __rcu *napi; /* points to xdp_napi when the latter is initialized */
62  	struct net_device	*dev;
63  	struct bpf_prog __rcu	*xdp_prog;
64  	struct xdp_mem_info	xdp_mem;
65  	struct veth_rq_stats	stats;
66  	bool			rx_notify_masked;
67  	struct ptr_ring		xdp_ring;
68  	struct xdp_rxq_info	xdp_rxq;
69  	struct page_pool	*page_pool;
70  };
71  
72  struct veth_priv {
73  	struct net_device __rcu	*peer;
74  	atomic64_t		dropped;
75  	struct bpf_prog		*_xdp_prog;
76  	struct veth_rq		*rq;
77  	unsigned int		requested_headroom;
78  };
79  
80  struct veth_xdp_tx_bq {
81  	struct xdp_frame *q[VETH_XDP_TX_BULK_SIZE];
82  	unsigned int count;
83  };
84  
85  /*
86   * ethtool interface
87   */
88  
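/* The per-queue counters below are exported via "ethtool -S <dev>",
 * together with the peer ifindex and, when CONFIG_PAGE_POOL_STATS is
 * enabled, the page_pool statistics appended by veth_get_page_pool_stats().
 */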
89  struct veth_q_stat_desc {
90  	char	desc[ETH_GSTRING_LEN];
91  	size_t	offset;
92  };
93  
94  #define VETH_RQ_STAT(m)	offsetof(struct veth_stats, m)
95  
96  static const struct veth_q_stat_desc veth_rq_stats_desc[] = {
97  	{ "xdp_packets",	VETH_RQ_STAT(xdp_packets) },
98  	{ "xdp_bytes",		VETH_RQ_STAT(xdp_bytes) },
99  	{ "drops",		VETH_RQ_STAT(rx_drops) },
100  	{ "xdp_redirect",	VETH_RQ_STAT(xdp_redirect) },
101  	{ "xdp_drops",		VETH_RQ_STAT(xdp_drops) },
102  	{ "xdp_tx",		VETH_RQ_STAT(xdp_tx) },
103  	{ "xdp_tx_errors",	VETH_RQ_STAT(xdp_tx_err) },
104  };
105  
106  #define VETH_RQ_STATS_LEN	ARRAY_SIZE(veth_rq_stats_desc)
107  
108  static const struct veth_q_stat_desc veth_tq_stats_desc[] = {
109  	{ "xdp_xmit",		VETH_RQ_STAT(peer_tq_xdp_xmit) },
110  	{ "xdp_xmit_errors",	VETH_RQ_STAT(peer_tq_xdp_xmit_err) },
111  };
112  
113  #define VETH_TQ_STATS_LEN	ARRAY_SIZE(veth_tq_stats_desc)
114  
115  static struct {
116  	const char string[ETH_GSTRING_LEN];
117  } ethtool_stats_keys[] = {
118  	{ "peer_ifindex" },
119  };
120  
121  struct veth_xdp_buff {
122  	struct xdp_buff xdp;
123  	struct sk_buff *skb;
124  };
125  
126  static int veth_get_link_ksettings(struct net_device *dev,
127  				   struct ethtool_link_ksettings *cmd)
128  {
129  	cmd->base.speed		= SPEED_10000;
130  	cmd->base.duplex	= DUPLEX_FULL;
131  	cmd->base.port		= PORT_TP;
132  	cmd->base.autoneg	= AUTONEG_DISABLE;
133  	return 0;
134  }
135  
136  static void veth_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info)
137  {
138  	strscpy(info->driver, DRV_NAME, sizeof(info->driver));
139  	strscpy(info->version, DRV_VERSION, sizeof(info->version));
140  }
141  
142  static void veth_get_strings(struct net_device *dev, u32 stringset, u8 *buf)
143  {
144  	u8 *p = buf;
145  	int i, j;
146  
147  	switch (stringset) {
148  	case ETH_SS_STATS:
149  		memcpy(p, &ethtool_stats_keys, sizeof(ethtool_stats_keys));
150  		p += sizeof(ethtool_stats_keys);
151  		for (i = 0; i < dev->real_num_rx_queues; i++)
152  			for (j = 0; j < VETH_RQ_STATS_LEN; j++)
153  				ethtool_sprintf(&p, "rx_queue_%u_%.18s",
154  						i, veth_rq_stats_desc[j].desc);
155  
156  		for (i = 0; i < dev->real_num_tx_queues; i++)
157  			for (j = 0; j < VETH_TQ_STATS_LEN; j++)
158  				ethtool_sprintf(&p, "tx_queue_%u_%.18s",
159  						i, veth_tq_stats_desc[j].desc);
160  
161  		page_pool_ethtool_stats_get_strings(p);
162  		break;
163  	}
164  }
165  
166  static int veth_get_sset_count(struct net_device *dev, int sset)
167  {
168  	switch (sset) {
169  	case ETH_SS_STATS:
170  		return ARRAY_SIZE(ethtool_stats_keys) +
171  		       VETH_RQ_STATS_LEN * dev->real_num_rx_queues +
172  		       VETH_TQ_STATS_LEN * dev->real_num_tx_queues +
173  		       page_pool_ethtool_stats_get_count();
174  	default:
175  		return -EOPNOTSUPP;
176  	}
177  }
178  
179  static void veth_get_page_pool_stats(struct net_device *dev, u64 *data)
180  {
181  #ifdef CONFIG_PAGE_POOL_STATS
182  	struct veth_priv *priv = netdev_priv(dev);
183  	struct page_pool_stats pp_stats = {};
184  	int i;
185  
186  	for (i = 0; i < dev->real_num_rx_queues; i++) {
187  		if (!priv->rq[i].page_pool)
188  			continue;
189  		page_pool_get_stats(priv->rq[i].page_pool, &pp_stats);
190  	}
191  	page_pool_ethtool_stats_get(data, &pp_stats);
192  #endif /* CONFIG_PAGE_POOL_STATS */
193  }
194  
195  static void veth_get_ethtool_stats(struct net_device *dev,
196  		struct ethtool_stats *stats, u64 *data)
197  {
198  	struct veth_priv *rcv_priv, *priv = netdev_priv(dev);
199  	struct net_device *peer = rtnl_dereference(priv->peer);
200  	int i, j, idx, pp_idx;
201  
202  	data[0] = peer ? peer->ifindex : 0;
203  	idx = 1;
204  	for (i = 0; i < dev->real_num_rx_queues; i++) {
205  		const struct veth_rq_stats *rq_stats = &priv->rq[i].stats;
206  		const void *stats_base = (void *)&rq_stats->vs;
207  		unsigned int start;
208  		size_t offset;
209  
210  		do {
211  			start = u64_stats_fetch_begin(&rq_stats->syncp);
212  			for (j = 0; j < VETH_RQ_STATS_LEN; j++) {
213  				offset = veth_rq_stats_desc[j].offset;
214  				data[idx + j] = *(u64 *)(stats_base + offset);
215  			}
216  		} while (u64_stats_fetch_retry(&rq_stats->syncp, start));
217  		idx += VETH_RQ_STATS_LEN;
218  	}
219  	pp_idx = idx;
220  
221  	if (!peer)
222  		goto page_pool_stats;
223  
224  	rcv_priv = netdev_priv(peer);
225  	for (i = 0; i < peer->real_num_rx_queues; i++) {
226  		const struct veth_rq_stats *rq_stats = &rcv_priv->rq[i].stats;
227  		const void *base = (void *)&rq_stats->vs;
228  		unsigned int start, tx_idx = idx;
229  		size_t offset;
230  
231  		tx_idx += (i % dev->real_num_tx_queues) * VETH_TQ_STATS_LEN;
232  		do {
233  			start = u64_stats_fetch_begin(&rq_stats->syncp);
234  			for (j = 0; j < VETH_TQ_STATS_LEN; j++) {
235  				offset = veth_tq_stats_desc[j].offset;
236  				data[tx_idx + j] += *(u64 *)(base + offset);
237  			}
238  		} while (u64_stats_fetch_retry(&rq_stats->syncp, start));
239  	}
240  	pp_idx = idx + dev->real_num_tx_queues * VETH_TQ_STATS_LEN;
241  
242  page_pool_stats:
243  	veth_get_page_pool_stats(dev, &data[pp_idx]);
244  }
245  
246  static void veth_get_channels(struct net_device *dev,
247  			      struct ethtool_channels *channels)
248  {
249  	channels->tx_count = dev->real_num_tx_queues;
250  	channels->rx_count = dev->real_num_rx_queues;
251  	channels->max_tx = dev->num_tx_queues;
252  	channels->max_rx = dev->num_rx_queues;
253  }
254  
255  static int veth_set_channels(struct net_device *dev,
256  			     struct ethtool_channels *ch);
257  
258  static const struct ethtool_ops veth_ethtool_ops = {
259  	.get_drvinfo		= veth_get_drvinfo,
260  	.get_link		= ethtool_op_get_link,
261  	.get_strings		= veth_get_strings,
262  	.get_sset_count		= veth_get_sset_count,
263  	.get_ethtool_stats	= veth_get_ethtool_stats,
264  	.get_link_ksettings	= veth_get_link_ksettings,
265  	.get_ts_info		= ethtool_op_get_ts_info,
266  	.get_channels		= veth_get_channels,
267  	.set_channels		= veth_set_channels,
268  };
269  
270  /* general routines */
271  
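/* Entries in the per-queue xdp_ring are either sk_buffs queued by
 * veth_xmit() or xdp_frames queued by veth_xdp_xmit(). xdp_frame
 * pointers are tagged with VETH_XDP_FLAG in their low bit so the
 * consumer can tell the two apart and free each one appropriately.
 */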
272  static bool veth_is_xdp_frame(void *ptr)
273  {
274  	return (unsigned long)ptr & VETH_XDP_FLAG;
275  }
276  
277  static struct xdp_frame *veth_ptr_to_xdp(void *ptr)
278  {
279  	return (void *)((unsigned long)ptr & ~VETH_XDP_FLAG);
280  }
281  
282  static void *veth_xdp_to_ptr(struct xdp_frame *xdp)
283  {
284  	return (void *)((unsigned long)xdp | VETH_XDP_FLAG);
285  }
286  
287  static void veth_ptr_free(void *ptr)
288  {
289  	if (veth_is_xdp_frame(ptr))
290  		xdp_return_frame(veth_ptr_to_xdp(ptr));
291  	else
292  		kfree_skb(ptr);
293  }
294  
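/* Kick the receive queue's NAPI unless it is already scheduled.
 * rx_notify_masked suppresses redundant wakeups; the barrier below
 * pairs with the smp_store_mb() in veth_poll().
 */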
295  static void __veth_xdp_flush(struct veth_rq *rq)
296  {
297  	/* Write ptr_ring before reading rx_notify_masked */
298  	smp_mb();
299  	if (!READ_ONCE(rq->rx_notify_masked) &&
300  	    napi_schedule_prep(&rq->xdp_napi)) {
301  		WRITE_ONCE(rq->rx_notify_masked, true);
302  		__napi_schedule(&rq->xdp_napi);
303  	}
304  }
305  
306  static int veth_xdp_rx(struct veth_rq *rq, struct sk_buff *skb)
307  {
308  	if (unlikely(ptr_ring_produce(&rq->xdp_ring, skb))) {
309  		dev_kfree_skb_any(skb);
310  		return NET_RX_DROP;
311  	}
312  
313  	return NET_RX_SUCCESS;
314  }
315  
316  static int veth_forward_skb(struct net_device *dev, struct sk_buff *skb,
317  			    struct veth_rq *rq, bool xdp)
318  {
319  	return __dev_forward_skb(dev, skb) ?: xdp ?
320  		veth_xdp_rx(rq, skb) :
321  		__netif_rx(skb);
322  }
323  
324  /* Return true if the specified skb has a chance of GRO aggregation.
325   * Don't strive for accuracy, but try to avoid GRO overhead in the most
326   * common scenarios.
327   * When XDP is enabled, all traffic is considered eligible, as the xmit
328   * device has TSO off.
329   * When TSO is enabled on the xmit device, we are likely interested only
330   * in UDP aggregation; explicitly check for that if the skb is suspected
331   * to belong to locally generated UDP traffic (the sock_wfree destructor
332   * is used by UDP, ICMP and XDP sockets).
333   */
334  static bool veth_skb_is_eligible_for_gro(const struct net_device *dev,
335  					 const struct net_device *rcv,
336  					 const struct sk_buff *skb)
337  {
338  	return !(dev->features & NETIF_F_ALL_TSO) ||
339  		(skb->destructor == sock_wfree &&
340  		 rcv->features & (NETIF_F_GRO_FRAGLIST | NETIF_F_GRO_UDP_FWD));
341  }
342  
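/* Transmit is a direct hand-off to the peer device: when the peer has
 * NAPI active (XDP attached or GRO enabled) and the skb is eligible,
 * the skb is queued on the peer rx queue's xdp_ring and handled by
 * veth_poll(); otherwise it is delivered immediately via __netif_rx().
 */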
343  static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev)
344  {
345  	struct veth_priv *rcv_priv, *priv = netdev_priv(dev);
346  	struct veth_rq *rq = NULL;
347  	int ret = NETDEV_TX_OK;
348  	struct net_device *rcv;
349  	int length = skb->len;
350  	bool use_napi = false;
351  	int rxq;
352  
353  	rcu_read_lock();
354  	rcv = rcu_dereference(priv->peer);
355  	if (unlikely(!rcv) || !pskb_may_pull(skb, ETH_HLEN)) {
356  		kfree_skb(skb);
357  		goto drop;
358  	}
359  
360  	rcv_priv = netdev_priv(rcv);
361  	rxq = skb_get_queue_mapping(skb);
362  	if (rxq < rcv->real_num_rx_queues) {
363  		rq = &rcv_priv->rq[rxq];
364  
365  		/* The napi pointer is available when an XDP program is
366  		 * attached or when GRO is enabled.
367  		 * Don't bother with napi/GRO if the skb can't be aggregated.
368  		 */
369  		use_napi = rcu_access_pointer(rq->napi) &&
370  			   veth_skb_is_eligible_for_gro(dev, rcv, skb);
371  	}
372  
373  	skb_tx_timestamp(skb);
374  	if (likely(veth_forward_skb(rcv, skb, rq, use_napi) == NET_RX_SUCCESS)) {
375  		if (!use_napi)
376  			dev_sw_netstats_tx_add(dev, 1, length);
377  		else
378  			__veth_xdp_flush(rq);
379  	} else {
380  drop:
381  		atomic64_inc(&priv->dropped);
382  		ret = NET_XMIT_DROP;
383  	}
384  
385  	rcu_read_unlock();
386  
387  	return ret;
388  }
389  
390  static void veth_stats_rx(struct veth_stats *result, struct net_device *dev)
391  {
392  	struct veth_priv *priv = netdev_priv(dev);
393  	int i;
394  
395  	result->peer_tq_xdp_xmit_err = 0;
396  	result->xdp_packets = 0;
397  	result->xdp_tx_err = 0;
398  	result->xdp_bytes = 0;
399  	result->rx_drops = 0;
400  	for (i = 0; i < dev->num_rx_queues; i++) {
401  		u64 packets, bytes, drops, xdp_tx_err, peer_tq_xdp_xmit_err;
402  		struct veth_rq_stats *stats = &priv->rq[i].stats;
403  		unsigned int start;
404  
405  		do {
406  			start = u64_stats_fetch_begin(&stats->syncp);
407  			peer_tq_xdp_xmit_err = stats->vs.peer_tq_xdp_xmit_err;
408  			xdp_tx_err = stats->vs.xdp_tx_err;
409  			packets = stats->vs.xdp_packets;
410  			bytes = stats->vs.xdp_bytes;
411  			drops = stats->vs.rx_drops;
412  		} while (u64_stats_fetch_retry(&stats->syncp, start));
413  		result->peer_tq_xdp_xmit_err += peer_tq_xdp_xmit_err;
414  		result->xdp_tx_err += xdp_tx_err;
415  		result->xdp_packets += packets;
416  		result->xdp_bytes += bytes;
417  		result->rx_drops += drops;
418  	}
419  }
420  
421  static void veth_get_stats64(struct net_device *dev,
422  			     struct rtnl_link_stats64 *tot)
423  {
424  	struct veth_priv *priv = netdev_priv(dev);
425  	struct net_device *peer;
426  	struct veth_stats rx;
427  
428  	tot->tx_dropped = atomic64_read(&priv->dropped);
429  	dev_fetch_sw_netstats(tot, dev->tstats);
430  
431  	veth_stats_rx(&rx, dev);
432  	tot->tx_dropped += rx.xdp_tx_err;
433  	tot->rx_dropped = rx.rx_drops + rx.peer_tq_xdp_xmit_err;
434  	tot->rx_bytes += rx.xdp_bytes;
435  	tot->rx_packets += rx.xdp_packets;
436  
437  	rcu_read_lock();
438  	peer = rcu_dereference(priv->peer);
439  	if (peer) {
440  		struct rtnl_link_stats64 tot_peer = {};
441  
442  		dev_fetch_sw_netstats(&tot_peer, peer->tstats);
443  		tot->rx_bytes += tot_peer.tx_bytes;
444  		tot->rx_packets += tot_peer.tx_packets;
445  
446  		veth_stats_rx(&rx, peer);
447  		tot->tx_dropped += rx.peer_tq_xdp_xmit_err;
448  		tot->rx_dropped += rx.xdp_tx_err;
449  		tot->tx_bytes += rx.xdp_bytes;
450  		tot->tx_packets += rx.xdp_packets;
451  	}
452  	rcu_read_unlock();
453  }
454  
455  /* fake multicast ability */
456  static void veth_set_multicast_list(struct net_device *dev)
457  {
458  }
459  
460  static int veth_select_rxq(struct net_device *dev)
461  {
462  	return smp_processor_id() % dev->real_num_rx_queues;
463  }
464  
465  static struct net_device *veth_peer_dev(struct net_device *dev)
466  {
467  	struct veth_priv *priv = netdev_priv(dev);
468  
469  	/* Callers must be under RCU read side. */
470  	return rcu_dereference(priv->peer);
471  }
472  
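/* Push xdp_frames onto the selected peer rx queue's xdp_ring. Frames
 * larger than the peer MTU (plus hard header and VLAN length) are
 * rejected. Returns the number of frames queued; the caller remains
 * responsible for any frames that were not accepted.
 */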
473  static int veth_xdp_xmit(struct net_device *dev, int n,
474  			 struct xdp_frame **frames,
475  			 u32 flags, bool ndo_xmit)
476  {
477  	struct veth_priv *rcv_priv, *priv = netdev_priv(dev);
478  	int i, ret = -ENXIO, nxmit = 0;
479  	struct net_device *rcv;
480  	unsigned int max_len;
481  	struct veth_rq *rq;
482  
483  	if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK))
484  		return -EINVAL;
485  
486  	rcu_read_lock();
487  	rcv = rcu_dereference(priv->peer);
488  	if (unlikely(!rcv))
489  		goto out;
490  
491  	rcv_priv = netdev_priv(rcv);
492  	rq = &rcv_priv->rq[veth_select_rxq(rcv)];
493  	/* The napi pointer is set if NAPI is enabled, which ensures that
494  	 * xdp_ring is initialized on the receive side and the peer device is up.
495  	 */
496  	if (!rcu_access_pointer(rq->napi))
497  		goto out;
498  
499  	max_len = rcv->mtu + rcv->hard_header_len + VLAN_HLEN;
500  
501  	spin_lock(&rq->xdp_ring.producer_lock);
502  	for (i = 0; i < n; i++) {
503  		struct xdp_frame *frame = frames[i];
504  		void *ptr = veth_xdp_to_ptr(frame);
505  
506  		if (unlikely(xdp_get_frame_len(frame) > max_len ||
507  			     __ptr_ring_produce(&rq->xdp_ring, ptr)))
508  			break;
509  		nxmit++;
510  	}
511  	spin_unlock(&rq->xdp_ring.producer_lock);
512  
513  	if (flags & XDP_XMIT_FLUSH)
514  		__veth_xdp_flush(rq);
515  
516  	ret = nxmit;
517  	if (ndo_xmit) {
518  		u64_stats_update_begin(&rq->stats.syncp);
519  		rq->stats.vs.peer_tq_xdp_xmit += nxmit;
520  		rq->stats.vs.peer_tq_xdp_xmit_err += n - nxmit;
521  		u64_stats_update_end(&rq->stats.syncp);
522  	}
523  
524  out:
525  	rcu_read_unlock();
526  
527  	return ret;
528  }
529  
530  static int veth_ndo_xdp_xmit(struct net_device *dev, int n,
531  			     struct xdp_frame **frames, u32 flags)
532  {
533  	int err;
534  
535  	err = veth_xdp_xmit(dev, n, frames, flags, true);
536  	if (err < 0) {
537  		struct veth_priv *priv = netdev_priv(dev);
538  
539  		atomic64_add(n, &priv->dropped);
540  	}
541  
542  	return err;
543  }
544  
545  static void veth_xdp_flush_bq(struct veth_rq *rq, struct veth_xdp_tx_bq *bq)
546  {
547  	int sent, i, err = 0, drops;
548  
549  	sent = veth_xdp_xmit(rq->dev, bq->count, bq->q, 0, false);
550  	if (sent < 0) {
551  		err = sent;
552  		sent = 0;
553  	}
554  
555  	for (i = sent; unlikely(i < bq->count); i++)
556  		xdp_return_frame(bq->q[i]);
557  
558  	drops = bq->count - sent;
559  	trace_xdp_bulk_tx(rq->dev, sent, drops, err);
560  
561  	u64_stats_update_begin(&rq->stats.syncp);
562  	rq->stats.vs.xdp_tx += sent;
563  	rq->stats.vs.xdp_tx_err += drops;
564  	u64_stats_update_end(&rq->stats.syncp);
565  
566  	bq->count = 0;
567  }
568  
569  static void veth_xdp_flush(struct veth_rq *rq, struct veth_xdp_tx_bq *bq)
570  {
571  	struct veth_priv *rcv_priv, *priv = netdev_priv(rq->dev);
572  	struct net_device *rcv;
573  	struct veth_rq *rcv_rq;
574  
575  	rcu_read_lock();
576  	veth_xdp_flush_bq(rq, bq);
577  	rcv = rcu_dereference(priv->peer);
578  	if (unlikely(!rcv))
579  		goto out;
580  
581  	rcv_priv = netdev_priv(rcv);
582  	rcv_rq = &rcv_priv->rq[veth_select_rxq(rcv)];
583  	/* Is the xdp_ring initialized on the receive side? */
584  	if (unlikely(!rcu_access_pointer(rcv_rq->xdp_prog)))
585  		goto out;
586  
587  	__veth_xdp_flush(rcv_rq);
588  out:
589  	rcu_read_unlock();
590  }
591  
592  static int veth_xdp_tx(struct veth_rq *rq, struct xdp_buff *xdp,
593  		       struct veth_xdp_tx_bq *bq)
594  {
595  	struct xdp_frame *frame = xdp_convert_buff_to_frame(xdp);
596  
597  	if (unlikely(!frame))
598  		return -EOVERFLOW;
599  
600  	if (unlikely(bq->count == VETH_XDP_TX_BULK_SIZE))
601  		veth_xdp_flush_bq(rq, bq);
602  
603  	bq->q[bq->count++] = frame;
604  
605  	return 0;
606  }
607  
608  static struct xdp_frame *veth_xdp_rcv_one(struct veth_rq *rq,
609  					  struct xdp_frame *frame,
610  					  struct veth_xdp_tx_bq *bq,
611  					  struct veth_stats *stats)
612  {
613  	struct xdp_frame orig_frame;
614  	struct bpf_prog *xdp_prog;
615  
616  	rcu_read_lock();
617  	xdp_prog = rcu_dereference(rq->xdp_prog);
618  	if (likely(xdp_prog)) {
619  		struct veth_xdp_buff vxbuf;
620  		struct xdp_buff *xdp = &vxbuf.xdp;
621  		u32 act;
622  
623  		xdp_convert_frame_to_buff(frame, xdp);
624  		xdp->rxq = &rq->xdp_rxq;
625  		vxbuf.skb = NULL;
626  
627  		act = bpf_prog_run_xdp(xdp_prog, xdp);
628  
629  		switch (act) {
630  		case XDP_PASS:
631  			if (xdp_update_frame_from_buff(xdp, frame))
632  				goto err_xdp;
633  			break;
634  		case XDP_TX:
635  			orig_frame = *frame;
636  			xdp->rxq->mem = frame->mem;
637  			if (unlikely(veth_xdp_tx(rq, xdp, bq) < 0)) {
638  				trace_xdp_exception(rq->dev, xdp_prog, act);
639  				frame = &orig_frame;
640  				stats->rx_drops++;
641  				goto err_xdp;
642  			}
643  			stats->xdp_tx++;
644  			rcu_read_unlock();
645  			goto xdp_xmit;
646  		case XDP_REDIRECT:
647  			orig_frame = *frame;
648  			xdp->rxq->mem = frame->mem;
649  			if (xdp_do_redirect(rq->dev, xdp, xdp_prog)) {
650  				frame = &orig_frame;
651  				stats->rx_drops++;
652  				goto err_xdp;
653  			}
654  			stats->xdp_redirect++;
655  			rcu_read_unlock();
656  			goto xdp_xmit;
657  		default:
658  			bpf_warn_invalid_xdp_action(rq->dev, xdp_prog, act);
659  			fallthrough;
660  		case XDP_ABORTED:
661  			trace_xdp_exception(rq->dev, xdp_prog, act);
662  			fallthrough;
663  		case XDP_DROP:
664  			stats->xdp_drops++;
665  			goto err_xdp;
666  		}
667  	}
668  	rcu_read_unlock();
669  
670  	return frame;
671  err_xdp:
672  	rcu_read_unlock();
673  	xdp_return_frame(frame);
674  xdp_xmit:
675  	return NULL;
676  }
677  
678  /* the frames array contains at most VETH_XDP_BATCH entries */
679  static void veth_xdp_rcv_bulk_skb(struct veth_rq *rq, void **frames,
680  				  int n_xdpf, struct veth_xdp_tx_bq *bq,
681  				  struct veth_stats *stats)
682  {
683  	void *skbs[VETH_XDP_BATCH];
684  	int i;
685  
686  	if (xdp_alloc_skb_bulk(skbs, n_xdpf,
687  			       GFP_ATOMIC | __GFP_ZERO) < 0) {
688  		for (i = 0; i < n_xdpf; i++)
689  			xdp_return_frame(frames[i]);
690  		stats->rx_drops += n_xdpf;
691  
692  		return;
693  	}
694  
695  	for (i = 0; i < n_xdpf; i++) {
696  		struct sk_buff *skb = skbs[i];
697  
698  		skb = __xdp_build_skb_from_frame(frames[i], skb,
699  						 rq->dev);
700  		if (!skb) {
701  			xdp_return_frame(frames[i]);
702  			stats->rx_drops++;
703  			continue;
704  		}
705  		napi_gro_receive(&rq->xdp_napi, skb);
706  	}
707  }
708  
709  static void veth_xdp_get(struct xdp_buff *xdp)
710  {
711  	struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp);
712  	int i;
713  
714  	get_page(virt_to_page(xdp->data));
715  	if (likely(!xdp_buff_has_frags(xdp)))
716  		return;
717  
718  	for (i = 0; i < sinfo->nr_frags; i++)
719  		__skb_frag_ref(&sinfo->frags[i]);
720  }
721  
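/* Build an xdp_buff around the skb so an XDP program can run on it.
 * If the skb head is shared or cloned, carries frags, or lacks
 * XDP_PACKET_HEADROOM, the data is first copied into freshly allocated
 * page_pool pages so the program may modify it freely.
 */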
722  static int veth_convert_skb_to_xdp_buff(struct veth_rq *rq,
723  					struct xdp_buff *xdp,
724  					struct sk_buff **pskb)
725  {
726  	struct sk_buff *skb = *pskb;
727  	u32 frame_sz;
728  
729  	if (skb_shared(skb) || skb_head_is_locked(skb) ||
730  	    skb_shinfo(skb)->nr_frags ||
731  	    skb_headroom(skb) < XDP_PACKET_HEADROOM) {
732  		u32 size, len, max_head_size, off;
733  		struct sk_buff *nskb;
734  		struct page *page;
735  		int i, head_off;
736  
737  		/* We need a private copy of the skb and data buffers since
738  		 * the ebpf program can modify it. We segment the original skb
739  		 * into order-0 pages without linearizing it.
740  		 *
741  		 * Make sure we have enough space for the linear and paged areas.
742  		 */
743  		max_head_size = SKB_WITH_OVERHEAD(PAGE_SIZE -
744  						  VETH_XDP_HEADROOM);
745  		if (skb->len > PAGE_SIZE * MAX_SKB_FRAGS + max_head_size)
746  			goto drop;
747  
748  		/* Allocate skb head */
749  		page = page_pool_dev_alloc_pages(rq->page_pool);
750  		if (!page)
751  			goto drop;
752  
753  		nskb = napi_build_skb(page_address(page), PAGE_SIZE);
754  		if (!nskb) {
755  			page_pool_put_full_page(rq->page_pool, page, true);
756  			goto drop;
757  		}
758  
759  		skb_reserve(nskb, VETH_XDP_HEADROOM);
760  		skb_copy_header(nskb, skb);
761  		skb_mark_for_recycle(nskb);
762  
763  		size = min_t(u32, skb->len, max_head_size);
764  		if (skb_copy_bits(skb, 0, nskb->data, size)) {
765  			consume_skb(nskb);
766  			goto drop;
767  		}
768  		skb_put(nskb, size);
769  
770  		head_off = skb_headroom(nskb) - skb_headroom(skb);
771  		skb_headers_offset_update(nskb, head_off);
772  
773  		/* Allocate paged area of new skb */
774  		off = size;
775  		len = skb->len - off;
776  
777  		for (i = 0; i < MAX_SKB_FRAGS && off < skb->len; i++) {
778  			page = page_pool_dev_alloc_pages(rq->page_pool);
779  			if (!page) {
780  				consume_skb(nskb);
781  				goto drop;
782  			}
783  
784  			size = min_t(u32, len, PAGE_SIZE);
785  			skb_add_rx_frag(nskb, i, page, 0, size, PAGE_SIZE);
786  			if (skb_copy_bits(skb, off, page_address(page),
787  					  size)) {
788  				consume_skb(nskb);
789  				goto drop;
790  			}
791  
792  			len -= size;
793  			off += size;
794  		}
795  
796  		consume_skb(skb);
797  		skb = nskb;
798  	}
799  
800  	/* SKB "head" area always has tailroom for skb_shared_info */
801  	frame_sz = skb_end_pointer(skb) - skb->head;
802  	frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
803  	xdp_init_buff(xdp, frame_sz, &rq->xdp_rxq);
804  	xdp_prepare_buff(xdp, skb->head, skb_headroom(skb),
805  			 skb_headlen(skb), true);
806  
807  	if (skb_is_nonlinear(skb)) {
808  		skb_shinfo(skb)->xdp_frags_size = skb->data_len;
809  		xdp_buff_set_frags_flag(xdp);
810  	} else {
811  		xdp_buff_clear_frags_flag(xdp);
812  	}
813  	*pskb = skb;
814  
815  	return 0;
816  drop:
817  	consume_skb(skb);
818  	*pskb = NULL;
819  
820  	return -ENOMEM;
821  }
822  
823  static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq,
824  					struct sk_buff *skb,
825  					struct veth_xdp_tx_bq *bq,
826  					struct veth_stats *stats)
827  {
828  	void *orig_data, *orig_data_end;
829  	struct bpf_prog *xdp_prog;
830  	struct veth_xdp_buff vxbuf;
831  	struct xdp_buff *xdp = &vxbuf.xdp;
832  	u32 act, metalen;
833  	int off;
834  
835  	skb_prepare_for_gro(skb);
836  
837  	rcu_read_lock();
838  	xdp_prog = rcu_dereference(rq->xdp_prog);
839  	if (unlikely(!xdp_prog)) {
840  		rcu_read_unlock();
841  		goto out;
842  	}
843  
844  	__skb_push(skb, skb->data - skb_mac_header(skb));
845  	if (veth_convert_skb_to_xdp_buff(rq, xdp, &skb))
846  		goto drop;
847  	vxbuf.skb = skb;
848  
849  	orig_data = xdp->data;
850  	orig_data_end = xdp->data_end;
851  
852  	act = bpf_prog_run_xdp(xdp_prog, xdp);
853  
854  	switch (act) {
855  	case XDP_PASS:
856  		break;
857  	case XDP_TX:
858  		veth_xdp_get(xdp);
859  		consume_skb(skb);
860  		xdp->rxq->mem = rq->xdp_mem;
861  		if (unlikely(veth_xdp_tx(rq, xdp, bq) < 0)) {
862  			trace_xdp_exception(rq->dev, xdp_prog, act);
863  			stats->rx_drops++;
864  			goto err_xdp;
865  		}
866  		stats->xdp_tx++;
867  		rcu_read_unlock();
868  		goto xdp_xmit;
869  	case XDP_REDIRECT:
870  		veth_xdp_get(xdp);
871  		consume_skb(skb);
872  		xdp->rxq->mem = rq->xdp_mem;
873  		if (xdp_do_redirect(rq->dev, xdp, xdp_prog)) {
874  			stats->rx_drops++;
875  			goto err_xdp;
876  		}
877  		stats->xdp_redirect++;
878  		rcu_read_unlock();
879  		goto xdp_xmit;
880  	default:
881  		bpf_warn_invalid_xdp_action(rq->dev, xdp_prog, act);
882  		fallthrough;
883  	case XDP_ABORTED:
884  		trace_xdp_exception(rq->dev, xdp_prog, act);
885  		fallthrough;
886  	case XDP_DROP:
887  		stats->xdp_drops++;
888  		goto xdp_drop;
889  	}
890  	rcu_read_unlock();
891  
892  	/* check if bpf_xdp_adjust_head was used */
893  	off = orig_data - xdp->data;
894  	if (off > 0)
895  		__skb_push(skb, off);
896  	else if (off < 0)
897  		__skb_pull(skb, -off);
898  
899  	skb_reset_mac_header(skb);
900  
901  	/* check if bpf_xdp_adjust_tail was used */
902  	off = xdp->data_end - orig_data_end;
903  	if (off != 0)
904  		__skb_put(skb, off); /* positive on grow, negative on shrink */
905  
906  	/* XDP frag metadata (e.g. nr_frags) is updated by eBPF helpers
907  	 * (e.g. bpf_xdp_adjust_tail), so we need to update data_len here.
908  	 */
909  	if (xdp_buff_has_frags(xdp))
910  		skb->data_len = skb_shinfo(skb)->xdp_frags_size;
911  	else
912  		skb->data_len = 0;
913  
914  	skb->protocol = eth_type_trans(skb, rq->dev);
915  
916  	metalen = xdp->data - xdp->data_meta;
917  	if (metalen)
918  		skb_metadata_set(skb, metalen);
919  out:
920  	return skb;
921  drop:
922  	stats->rx_drops++;
923  xdp_drop:
924  	rcu_read_unlock();
925  	kfree_skb(skb);
926  	return NULL;
927  err_xdp:
928  	rcu_read_unlock();
929  	xdp_return_buff(xdp);
930  xdp_xmit:
931  	return NULL;
932  }
933  
934  static int veth_xdp_rcv(struct veth_rq *rq, int budget,
935  			struct veth_xdp_tx_bq *bq,
936  			struct veth_stats *stats)
937  {
938  	int i, done = 0, n_xdpf = 0;
939  	void *xdpf[VETH_XDP_BATCH];
940  
941  	for (i = 0; i < budget; i++) {
942  		void *ptr = __ptr_ring_consume(&rq->xdp_ring);
943  
944  		if (!ptr)
945  			break;
946  
947  		if (veth_is_xdp_frame(ptr)) {
948  			/* ndo_xdp_xmit */
949  			struct xdp_frame *frame = veth_ptr_to_xdp(ptr);
950  
951  			stats->xdp_bytes += xdp_get_frame_len(frame);
952  			frame = veth_xdp_rcv_one(rq, frame, bq, stats);
953  			if (frame) {
954  				/* XDP_PASS */
955  				xdpf[n_xdpf++] = frame;
956  				if (n_xdpf == VETH_XDP_BATCH) {
957  					veth_xdp_rcv_bulk_skb(rq, xdpf, n_xdpf,
958  							      bq, stats);
959  					n_xdpf = 0;
960  				}
961  			}
962  		} else {
963  			/* ndo_start_xmit */
964  			struct sk_buff *skb = ptr;
965  
966  			stats->xdp_bytes += skb->len;
967  			skb = veth_xdp_rcv_skb(rq, skb, bq, stats);
968  			if (skb) {
969  				if (skb_shared(skb) || skb_unclone(skb, GFP_ATOMIC))
970  					netif_receive_skb(skb);
971  				else
972  					napi_gro_receive(&rq->xdp_napi, skb);
973  			}
974  		}
975  		done++;
976  	}
977  
978  	if (n_xdpf)
979  		veth_xdp_rcv_bulk_skb(rq, xdpf, n_xdpf, bq, stats);
980  
981  	u64_stats_update_begin(&rq->stats.syncp);
982  	rq->stats.vs.xdp_redirect += stats->xdp_redirect;
983  	rq->stats.vs.xdp_bytes += stats->xdp_bytes;
984  	rq->stats.vs.xdp_drops += stats->xdp_drops;
985  	rq->stats.vs.rx_drops += stats->rx_drops;
986  	rq->stats.vs.xdp_packets += done;
987  	u64_stats_update_end(&rq->stats.syncp);
988  
989  	return done;
990  }
991  
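/* NAPI handler: drain up to @budget entries from the xdp_ring, then
 * flush any pending XDP redirects and bulked XDP_TX frames. If the
 * ring is not empty after napi_complete_done(), reschedule.
 */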
992  static int veth_poll(struct napi_struct *napi, int budget)
993  {
994  	struct veth_rq *rq =
995  		container_of(napi, struct veth_rq, xdp_napi);
996  	struct veth_stats stats = {};
997  	struct veth_xdp_tx_bq bq;
998  	int done;
999  
1000  	bq.count = 0;
1001  
1002  	xdp_set_return_frame_no_direct();
1003  	done = veth_xdp_rcv(rq, budget, &bq, &stats);
1004  
1005  	if (stats.xdp_redirect > 0)
1006  		xdp_do_flush();
1007  
1008  	if (done < budget && napi_complete_done(napi, done)) {
1009  		/* Write rx_notify_masked before reading ptr_ring */
1010  		smp_store_mb(rq->rx_notify_masked, false);
1011  		if (unlikely(!__ptr_ring_empty(&rq->xdp_ring))) {
1012  			if (napi_schedule_prep(&rq->xdp_napi)) {
1013  				WRITE_ONCE(rq->rx_notify_masked, true);
1014  				__napi_schedule(&rq->xdp_napi);
1015  			}
1016  		}
1017  	}
1018  
1019  	if (stats.xdp_tx > 0)
1020  		veth_xdp_flush(rq, &bq);
1021  	xdp_clear_return_frame_no_direct();
1022  
1023  	return done;
1024  }
1025  
1026  static int veth_create_page_pool(struct veth_rq *rq)
1027  {
1028  	struct page_pool_params pp_params = {
1029  		.order = 0,
1030  		.pool_size = VETH_RING_SIZE,
1031  		.nid = NUMA_NO_NODE,
1032  		.dev = &rq->dev->dev,
1033  	};
1034  
1035  	rq->page_pool = page_pool_create(&pp_params);
1036  	if (IS_ERR(rq->page_pool)) {
1037  		int err = PTR_ERR(rq->page_pool);
1038  
1039  		rq->page_pool = NULL;
1040  		return err;
1041  	}
1042  
1043  	return 0;
1044  }
1045  
1046  static int __veth_napi_enable_range(struct net_device *dev, int start, int end)
1047  {
1048  	struct veth_priv *priv = netdev_priv(dev);
1049  	int err, i;
1050  
1051  	for (i = start; i < end; i++) {
1052  		err = veth_create_page_pool(&priv->rq[i]);
1053  		if (err)
1054  			goto err_page_pool;
1055  	}
1056  
1057  	for (i = start; i < end; i++) {
1058  		struct veth_rq *rq = &priv->rq[i];
1059  
1060  		err = ptr_ring_init(&rq->xdp_ring, VETH_RING_SIZE, GFP_KERNEL);
1061  		if (err)
1062  			goto err_xdp_ring;
1063  	}
1064  
1065  	for (i = start; i < end; i++) {
1066  		struct veth_rq *rq = &priv->rq[i];
1067  
1068  		napi_enable(&rq->xdp_napi);
1069  		rcu_assign_pointer(priv->rq[i].napi, &priv->rq[i].xdp_napi);
1070  	}
1071  
1072  	return 0;
1073  
1074  err_xdp_ring:
1075  	for (i--; i >= start; i--)
1076  		ptr_ring_cleanup(&priv->rq[i].xdp_ring, veth_ptr_free);
1077  	i = end;
1078  err_page_pool:
1079  	for (i--; i >= start; i--) {
1080  		page_pool_destroy(priv->rq[i].page_pool);
1081  		priv->rq[i].page_pool = NULL;
1082  	}
1083  
1084  	return err;
1085  }
1086  
1087  static int __veth_napi_enable(struct net_device *dev)
1088  {
1089  	return __veth_napi_enable_range(dev, 0, dev->real_num_rx_queues);
1090  }
1091  
1092  static void veth_napi_del_range(struct net_device *dev, int start, int end)
1093  {
1094  	struct veth_priv *priv = netdev_priv(dev);
1095  	int i;
1096  
1097  	for (i = start; i < end; i++) {
1098  		struct veth_rq *rq = &priv->rq[i];
1099  
1100  		rcu_assign_pointer(priv->rq[i].napi, NULL);
1101  		napi_disable(&rq->xdp_napi);
1102  		__netif_napi_del(&rq->xdp_napi);
1103  	}
1104  	synchronize_net();
1105  
1106  	for (i = start; i < end; i++) {
1107  		struct veth_rq *rq = &priv->rq[i];
1108  
1109  		rq->rx_notify_masked = false;
1110  		ptr_ring_cleanup(&rq->xdp_ring, veth_ptr_free);
1111  	}
1112  
1113  	for (i = start; i < end; i++) {
1114  		page_pool_destroy(priv->rq[i].page_pool);
1115  		priv->rq[i].page_pool = NULL;
1116  	}
1117  }
1118  
1119  static void veth_napi_del(struct net_device *dev)
1120  {
1121  	veth_napi_del_range(dev, 0, dev->real_num_rx_queues);
1122  }
1123  
1124  static bool veth_gro_requested(const struct net_device *dev)
1125  {
1126  	return !!(dev->wanted_features & NETIF_F_GRO);
1127  }
1128  
1129  static int veth_enable_xdp_range(struct net_device *dev, int start, int end,
1130  				 bool napi_already_on)
1131  {
1132  	struct veth_priv *priv = netdev_priv(dev);
1133  	int err, i;
1134  
1135  	for (i = start; i < end; i++) {
1136  		struct veth_rq *rq = &priv->rq[i];
1137  
1138  		if (!napi_already_on)
1139  			netif_napi_add(dev, &rq->xdp_napi, veth_poll);
1140  		err = xdp_rxq_info_reg(&rq->xdp_rxq, dev, i, rq->xdp_napi.napi_id);
1141  		if (err < 0)
1142  			goto err_rxq_reg;
1143  
1144  		err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq,
1145  						 MEM_TYPE_PAGE_SHARED,
1146  						 NULL);
1147  		if (err < 0)
1148  			goto err_reg_mem;
1149  
1150  		/* Save original mem info as it can be overwritten */
1151  		rq->xdp_mem = rq->xdp_rxq.mem;
1152  	}
1153  	return 0;
1154  
1155  err_reg_mem:
1156  	xdp_rxq_info_unreg(&priv->rq[i].xdp_rxq);
1157  err_rxq_reg:
1158  	for (i--; i >= start; i--) {
1159  		struct veth_rq *rq = &priv->rq[i];
1160  
1161  		xdp_rxq_info_unreg(&rq->xdp_rxq);
1162  		if (!napi_already_on)
1163  			netif_napi_del(&rq->xdp_napi);
1164  	}
1165  
1166  	return err;
1167  }
1168  
1169  static void veth_disable_xdp_range(struct net_device *dev, int start, int end,
1170  				   bool delete_napi)
1171  {
1172  	struct veth_priv *priv = netdev_priv(dev);
1173  	int i;
1174  
1175  	for (i = start; i < end; i++) {
1176  		struct veth_rq *rq = &priv->rq[i];
1177  
1178  		rq->xdp_rxq.mem = rq->xdp_mem;
1179  		xdp_rxq_info_unreg(&rq->xdp_rxq);
1180  
1181  		if (delete_napi)
1182  			netif_napi_del(&rq->xdp_napi);
1183  	}
1184  }
1185  
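/* Register xdp_rxq info for every real rx queue and make sure NAPI is
 * running (reusing the instances already enabled when GRO is on), then
 * publish the XDP program on each queue.
 */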
1186  static int veth_enable_xdp(struct net_device *dev)
1187  {
1188  	bool napi_already_on = veth_gro_requested(dev) && (dev->flags & IFF_UP);
1189  	struct veth_priv *priv = netdev_priv(dev);
1190  	int err, i;
1191  
1192  	if (!xdp_rxq_info_is_reg(&priv->rq[0].xdp_rxq)) {
1193  		err = veth_enable_xdp_range(dev, 0, dev->real_num_rx_queues, napi_already_on);
1194  		if (err)
1195  			return err;
1196  
1197  		if (!napi_already_on) {
1198  			err = __veth_napi_enable(dev);
1199  			if (err) {
1200  				veth_disable_xdp_range(dev, 0, dev->real_num_rx_queues, true);
1201  				return err;
1202  			}
1203  		}
1204  	}
1205  
1206  	for (i = 0; i < dev->real_num_rx_queues; i++) {
1207  		rcu_assign_pointer(priv->rq[i].xdp_prog, priv->_xdp_prog);
1208  		rcu_assign_pointer(priv->rq[i].napi, &priv->rq[i].xdp_napi);
1209  	}
1210  
1211  	return 0;
1212  }
1213  
1214  static void veth_disable_xdp(struct net_device *dev)
1215  {
1216  	struct veth_priv *priv = netdev_priv(dev);
1217  	int i;
1218  
1219  	for (i = 0; i < dev->real_num_rx_queues; i++)
1220  		rcu_assign_pointer(priv->rq[i].xdp_prog, NULL);
1221  
1222  	if (!netif_running(dev) || !veth_gro_requested(dev))
1223  		veth_napi_del(dev);
1224  
1225  	veth_disable_xdp_range(dev, 0, dev->real_num_rx_queues, false);
1226  }
1227  
1228  static int veth_napi_enable_range(struct net_device *dev, int start, int end)
1229  {
1230  	struct veth_priv *priv = netdev_priv(dev);
1231  	int err, i;
1232  
1233  	for (i = start; i < end; i++) {
1234  		struct veth_rq *rq = &priv->rq[i];
1235  
1236  		netif_napi_add(dev, &rq->xdp_napi, veth_poll);
1237  	}
1238  
1239  	err = __veth_napi_enable_range(dev, start, end);
1240  	if (err) {
1241  		for (i = start; i < end; i++) {
1242  			struct veth_rq *rq = &priv->rq[i];
1243  
1244  			netif_napi_del(&rq->xdp_napi);
1245  		}
1246  		return err;
1247  	}
1248  	return err;
1249  }
1250  
1251  static int veth_napi_enable(struct net_device *dev)
1252  {
1253  	return veth_napi_enable_range(dev, 0, dev->real_num_rx_queues);
1254  }
1255  
1256  static void veth_disable_range_safe(struct net_device *dev, int start, int end)
1257  {
1258  	struct veth_priv *priv = netdev_priv(dev);
1259  
1260  	if (start >= end)
1261  		return;
1262  
1263  	if (priv->_xdp_prog) {
1264  		veth_napi_del_range(dev, start, end);
1265  		veth_disable_xdp_range(dev, start, end, false);
1266  	} else if (veth_gro_requested(dev)) {
1267  		veth_napi_del_range(dev, start, end);
1268  	}
1269  }
1270  
1271  static int veth_enable_range_safe(struct net_device *dev, int start, int end)
1272  {
1273  	struct veth_priv *priv = netdev_priv(dev);
1274  	int err;
1275  
1276  	if (start >= end)
1277  		return 0;
1278  
1279  	if (priv->_xdp_prog) {
1280  		/* these channels are freshly initialized, napi is not on them even
1281  		 * when GRO is requested
1282  		 */
1283  		err = veth_enable_xdp_range(dev, start, end, false);
1284  		if (err)
1285  			return err;
1286  
1287  		err = __veth_napi_enable_range(dev, start, end);
1288  		if (err) {
1289  			/* on error always delete the newly added napis */
1290  			veth_disable_xdp_range(dev, start, end, true);
1291  			return err;
1292  		}
1293  	} else if (veth_gro_requested(dev)) {
1294  		return veth_napi_enable_range(dev, start, end);
1295  	}
1296  	return 0;
1297  }
1298  
1299  static void veth_set_xdp_features(struct net_device *dev)
1300  {
1301  	struct veth_priv *priv = netdev_priv(dev);
1302  	struct net_device *peer;
1303  
1304  	peer = rtnl_dereference(priv->peer);
1305  	if (peer && peer->real_num_tx_queues <= dev->real_num_rx_queues) {
1306  		struct veth_priv *priv_peer = netdev_priv(peer);
1307  		xdp_features_t val = NETDEV_XDP_ACT_BASIC |
1308  				     NETDEV_XDP_ACT_REDIRECT |
1309  				     NETDEV_XDP_ACT_RX_SG;
1310  
1311  		if (priv_peer->_xdp_prog || veth_gro_requested(peer))
1312  			val |= NETDEV_XDP_ACT_NDO_XMIT |
1313  			       NETDEV_XDP_ACT_NDO_XMIT_SG;
1314  		xdp_set_features_flag(dev, val);
1315  	} else {
1316  		xdp_clear_features_flag(dev);
1317  	}
1318  }
1319  
1320  static int veth_set_channels(struct net_device *dev,
1321  			     struct ethtool_channels *ch)
1322  {
1323  	struct veth_priv *priv = netdev_priv(dev);
1324  	unsigned int old_rx_count, new_rx_count;
1325  	struct veth_priv *peer_priv;
1326  	struct net_device *peer;
1327  	int err;
1328  
1329  	/* sanity check. Upper bounds are already enforced by the caller */
1330  	if (!ch->rx_count || !ch->tx_count)
1331  		return -EINVAL;
1332  
1333  	/* avoid breaking XDP, if that is enabled */
1334  	peer = rtnl_dereference(priv->peer);
1335  	peer_priv = peer ? netdev_priv(peer) : NULL;
1336  	if (priv->_xdp_prog && peer && ch->rx_count < peer->real_num_tx_queues)
1337  		return -EINVAL;
1338  
1339  	if (peer && peer_priv && peer_priv->_xdp_prog && ch->tx_count > peer->real_num_rx_queues)
1340  		return -EINVAL;
1341  
1342  	old_rx_count = dev->real_num_rx_queues;
1343  	new_rx_count = ch->rx_count;
1344  	if (netif_running(dev)) {
1345  		/* turn device off */
1346  		netif_carrier_off(dev);
1347  		if (peer)
1348  			netif_carrier_off(peer);
1349  
1350  		/* try to allocate new resources, as needed */
1351  		err = veth_enable_range_safe(dev, old_rx_count, new_rx_count);
1352  		if (err)
1353  			goto out;
1354  	}
1355  
1356  	err = netif_set_real_num_rx_queues(dev, ch->rx_count);
1357  	if (err)
1358  		goto revert;
1359  
1360  	err = netif_set_real_num_tx_queues(dev, ch->tx_count);
1361  	if (err) {
1362  		int err2 = netif_set_real_num_rx_queues(dev, old_rx_count);
1363  
1364  		/* this error condition could happen only if rx and tx change
1365  		 * in opposite directions (e.g. the tx count rises while rx decreases)
1366  		 * and we can't do anything to fully restore the original
1367  		 * status
1368  		 */
1369  		if (err2)
1370  			pr_warn("Can't restore rx queues config %d -> %d %d",
1371  				new_rx_count, old_rx_count, err2);
1372  		else
1373  			goto revert;
1374  	}
1375  
1376  out:
1377  	if (netif_running(dev)) {
1378  		/* note that we need to swap the arguments WRT the enable part
1379  		 * to identify the range we have to disable
1380  		 */
1381  		veth_disable_range_safe(dev, new_rx_count, old_rx_count);
1382  		netif_carrier_on(dev);
1383  		if (peer)
1384  			netif_carrier_on(peer);
1385  	}
1386  
1387  	/* update XDP supported features */
1388  	veth_set_xdp_features(dev);
1389  	if (peer)
1390  		veth_set_xdp_features(peer);
1391  
1392  	return err;
1393  
1394  revert:
1395  	new_rx_count = old_rx_count;
1396  	old_rx_count = ch->rx_count;
1397  	goto out;
1398  }
1399  
1400  static int veth_open(struct net_device *dev)
1401  {
1402  	struct veth_priv *priv = netdev_priv(dev);
1403  	struct net_device *peer = rtnl_dereference(priv->peer);
1404  	int err;
1405  
1406  	if (!peer)
1407  		return -ENOTCONN;
1408  
1409  	if (priv->_xdp_prog) {
1410  		err = veth_enable_xdp(dev);
1411  		if (err)
1412  			return err;
1413  	} else if (veth_gro_requested(dev)) {
1414  		err = veth_napi_enable(dev);
1415  		if (err)
1416  			return err;
1417  	}
1418  
1419  	if (peer->flags & IFF_UP) {
1420  		netif_carrier_on(dev);
1421  		netif_carrier_on(peer);
1422  	}
1423  
1424  	veth_set_xdp_features(dev);
1425  
1426  	return 0;
1427  }
1428  
1429  static int veth_close(struct net_device *dev)
1430  {
1431  	struct veth_priv *priv = netdev_priv(dev);
1432  	struct net_device *peer = rtnl_dereference(priv->peer);
1433  
1434  	netif_carrier_off(dev);
1435  	if (peer)
1436  		netif_carrier_off(peer);
1437  
1438  	if (priv->_xdp_prog)
1439  		veth_disable_xdp(dev);
1440  	else if (veth_gro_requested(dev))
1441  		veth_napi_del(dev);
1442  
1443  	return 0;
1444  }
1445  
1446  static int is_valid_veth_mtu(int mtu)
1447  {
1448  	return mtu >= ETH_MIN_MTU && mtu <= ETH_MAX_MTU;
1449  }
1450  
1451  static int veth_alloc_queues(struct net_device *dev)
1452  {
1453  	struct veth_priv *priv = netdev_priv(dev);
1454  	int i;
1455  
1456  	priv->rq = kvcalloc(dev->num_rx_queues, sizeof(*priv->rq),
1457  			    GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL);
1458  	if (!priv->rq)
1459  		return -ENOMEM;
1460  
1461  	for (i = 0; i < dev->num_rx_queues; i++) {
1462  		priv->rq[i].dev = dev;
1463  		u64_stats_init(&priv->rq[i].stats.syncp);
1464  	}
1465  
1466  	return 0;
1467  }
1468  
1469  static void veth_free_queues(struct net_device *dev)
1470  {
1471  	struct veth_priv *priv = netdev_priv(dev);
1472  
1473  	kvfree(priv->rq);
1474  }
1475  
1476  static int veth_dev_init(struct net_device *dev)
1477  {
1478  	netdev_lockdep_set_classes(dev);
1479  	return veth_alloc_queues(dev);
1480  }
1481  
1482  static void veth_dev_free(struct net_device *dev)
1483  {
1484  	veth_free_queues(dev);
1485  }
1486  
1487  #ifdef CONFIG_NET_POLL_CONTROLLER
1488  static void veth_poll_controller(struct net_device *dev)
1489  {
1490  	/* veth only receives frames when its peer sends one.
1491  	 * Since it has nothing to do with disabling irqs, we are guaranteed
1492  	 * never to have pending data when we poll for it, so
1493  	 * there is nothing to do here.
1494  	 *
1495  	 * We need this though so netpoll recognizes us as an interface that
1496  	 * supports polling, which enables bridge devices in virt setups to
1497  	 * still use netconsole.
1498  	 */
1499  }
1500  #endif	/* CONFIG_NET_POLL_CONTROLLER */
1501  
1502  static int veth_get_iflink(const struct net_device *dev)
1503  {
1504  	struct veth_priv *priv = netdev_priv(dev);
1505  	struct net_device *peer;
1506  	int iflink;
1507  
1508  	rcu_read_lock();
1509  	peer = rcu_dereference(priv->peer);
1510  	iflink = peer ? peer->ifindex : 0;
1511  	rcu_read_unlock();
1512  
1513  	return iflink;
1514  }
1515  
1516  static netdev_features_t veth_fix_features(struct net_device *dev,
1517  					   netdev_features_t features)
1518  {
1519  	struct veth_priv *priv = netdev_priv(dev);
1520  	struct net_device *peer;
1521  
1522  	peer = rtnl_dereference(priv->peer);
1523  	if (peer) {
1524  		struct veth_priv *peer_priv = netdev_priv(peer);
1525  
1526  		if (peer_priv->_xdp_prog)
1527  			features &= ~NETIF_F_GSO_SOFTWARE;
1528  	}
1529  
1530  	return features;
1531  }
1532  
1533  static int veth_set_features(struct net_device *dev,
1534  			     netdev_features_t features)
1535  {
1536  	netdev_features_t changed = features ^ dev->features;
1537  	struct veth_priv *priv = netdev_priv(dev);
1538  	struct net_device *peer;
1539  	int err;
1540  
1541  	if (!(changed & NETIF_F_GRO) || !(dev->flags & IFF_UP) || priv->_xdp_prog)
1542  		return 0;
1543  
1544  	peer = rtnl_dereference(priv->peer);
1545  	if (features & NETIF_F_GRO) {
1546  		err = veth_napi_enable(dev);
1547  		if (err)
1548  			return err;
1549  
1550  		if (peer)
1551  			xdp_features_set_redirect_target(peer, true);
1552  	} else {
1553  		if (peer)
1554  			xdp_features_clear_redirect_target(peer);
1555  		veth_napi_del(dev);
1556  	}
1557  	return 0;
1558  }
1559  
1560  static void veth_set_rx_headroom(struct net_device *dev, int new_hr)
1561  {
1562  	struct veth_priv *peer_priv, *priv = netdev_priv(dev);
1563  	struct net_device *peer;
1564  
1565  	if (new_hr < 0)
1566  		new_hr = 0;
1567  
1568  	rcu_read_lock();
1569  	peer = rcu_dereference(priv->peer);
1570  	if (unlikely(!peer))
1571  		goto out;
1572  
1573  	peer_priv = netdev_priv(peer);
1574  	priv->requested_headroom = new_hr;
1575  	new_hr = max(priv->requested_headroom, peer_priv->requested_headroom);
1576  	dev->needed_headroom = new_hr;
1577  	peer->needed_headroom = new_hr;
1578  
1579  out:
1580  	rcu_read_unlock();
1581  }
1582  
1583  static int veth_xdp_set(struct net_device *dev, struct bpf_prog *prog,
1584  			struct netlink_ext_ack *extack)
1585  {
1586  	struct veth_priv *priv = netdev_priv(dev);
1587  	struct bpf_prog *old_prog;
1588  	struct net_device *peer;
1589  	unsigned int max_mtu;
1590  	int err;
1591  
1592  	old_prog = priv->_xdp_prog;
1593  	priv->_xdp_prog = prog;
1594  	peer = rtnl_dereference(priv->peer);
1595  
1596  	if (prog) {
1597  		if (!peer) {
1598  			NL_SET_ERR_MSG_MOD(extack, "Cannot set XDP when peer is detached");
1599  			err = -ENOTCONN;
1600  			goto err;
1601  		}
1602  
1603  		max_mtu = SKB_WITH_OVERHEAD(PAGE_SIZE - VETH_XDP_HEADROOM) -
1604  			  peer->hard_header_len;
1605  		/* Allow increasing the max_mtu if the program supports
1606  		 * XDP fragments.
1607  		 */
1608  		if (prog->aux->xdp_has_frags)
1609  			max_mtu += PAGE_SIZE * MAX_SKB_FRAGS;
1610  
1611  		if (peer->mtu > max_mtu) {
1612  			NL_SET_ERR_MSG_MOD(extack, "Peer MTU is too large to set XDP");
1613  			err = -ERANGE;
1614  			goto err;
1615  		}
1616  
1617  		if (dev->real_num_rx_queues < peer->real_num_tx_queues) {
1618  			NL_SET_ERR_MSG_MOD(extack, "XDP expects number of rx queues not less than peer tx queues");
1619  			err = -ENOSPC;
1620  			goto err;
1621  		}
1622  
1623  		if (dev->flags & IFF_UP) {
1624  			err = veth_enable_xdp(dev);
1625  			if (err) {
1626  				NL_SET_ERR_MSG_MOD(extack, "Setup for XDP failed");
1627  				goto err;
1628  			}
1629  		}
1630  
1631  		if (!old_prog) {
1632  			peer->hw_features &= ~NETIF_F_GSO_SOFTWARE;
1633  			peer->max_mtu = max_mtu;
1634  		}
1635  
1636  		xdp_features_set_redirect_target(peer, true);
1637  	}
1638  
1639  	if (old_prog) {
1640  		if (!prog) {
1641  			if (peer && !veth_gro_requested(dev))
1642  				xdp_features_clear_redirect_target(peer);
1643  
1644  			if (dev->flags & IFF_UP)
1645  				veth_disable_xdp(dev);
1646  
1647  			if (peer) {
1648  				peer->hw_features |= NETIF_F_GSO_SOFTWARE;
1649  				peer->max_mtu = ETH_MAX_MTU;
1650  			}
1651  		}
1652  		bpf_prog_put(old_prog);
1653  	}
1654  
1655  	if ((!!old_prog ^ !!prog) && peer)
1656  		netdev_update_features(peer);
1657  
1658  	return 0;
1659  err:
1660  	priv->_xdp_prog = old_prog;
1661  
1662  	return err;
1663  }
1664  
1665  static int veth_xdp(struct net_device *dev, struct netdev_bpf *xdp)
1666  {
1667  	switch (xdp->command) {
1668  	case XDP_SETUP_PROG:
1669  		return veth_xdp_set(dev, xdp->prog, xdp->extack);
1670  	default:
1671  		return -EINVAL;
1672  	}
1673  }
1674  
1675  static int veth_xdp_rx_timestamp(const struct xdp_md *ctx, u64 *timestamp)
1676  {
1677  	struct veth_xdp_buff *_ctx = (void *)ctx;
1678  
1679  	if (!_ctx->skb)
1680  		return -ENODATA;
1681  
1682  	*timestamp = skb_hwtstamps(_ctx->skb)->hwtstamp;
1683  	return 0;
1684  }
1685  
1686  static int veth_xdp_rx_hash(const struct xdp_md *ctx, u32 *hash,
1687  			    enum xdp_rss_hash_type *rss_type)
1688  {
1689  	struct veth_xdp_buff *_ctx = (void *)ctx;
1690  	struct sk_buff *skb = _ctx->skb;
1691  
1692  	if (!skb)
1693  		return -ENODATA;
1694  
1695  	*hash = skb_get_hash(skb);
1696  	*rss_type = skb->l4_hash ? XDP_RSS_TYPE_L4_ANY : XDP_RSS_TYPE_NONE;
1697  
1698  	return 0;
1699  }
1700  
1701  static const struct net_device_ops veth_netdev_ops = {
1702  	.ndo_init            = veth_dev_init,
1703  	.ndo_open            = veth_open,
1704  	.ndo_stop            = veth_close,
1705  	.ndo_start_xmit      = veth_xmit,
1706  	.ndo_get_stats64     = veth_get_stats64,
1707  	.ndo_set_rx_mode     = veth_set_multicast_list,
1708  	.ndo_set_mac_address = eth_mac_addr,
1709  #ifdef CONFIG_NET_POLL_CONTROLLER
1710  	.ndo_poll_controller	= veth_poll_controller,
1711  #endif
1712  	.ndo_get_iflink		= veth_get_iflink,
1713  	.ndo_fix_features	= veth_fix_features,
1714  	.ndo_set_features	= veth_set_features,
1715  	.ndo_features_check	= passthru_features_check,
1716  	.ndo_set_rx_headroom	= veth_set_rx_headroom,
1717  	.ndo_bpf		= veth_xdp,
1718  	.ndo_xdp_xmit		= veth_ndo_xdp_xmit,
1719  	.ndo_get_peer_dev	= veth_peer_dev,
1720  };
1721  
1722  static const struct xdp_metadata_ops veth_xdp_metadata_ops = {
1723  	.xmo_rx_timestamp		= veth_xdp_rx_timestamp,
1724  	.xmo_rx_hash			= veth_xdp_rx_hash,
1725  };
1726  
1727  #define VETH_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HW_CSUM | \
1728  		       NETIF_F_RXCSUM | NETIF_F_SCTP_CRC | NETIF_F_HIGHDMA | \
1729  		       NETIF_F_GSO_SOFTWARE | NETIF_F_GSO_ENCAP_ALL | \
1730  		       NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX | \
1731  		       NETIF_F_HW_VLAN_STAG_TX | NETIF_F_HW_VLAN_STAG_RX )
1732  
1733  static void veth_setup(struct net_device *dev)
1734  {
1735  	ether_setup(dev);
1736  
1737  	dev->priv_flags &= ~IFF_TX_SKB_SHARING;
1738  	dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
1739  	dev->priv_flags |= IFF_NO_QUEUE;
1740  	dev->priv_flags |= IFF_PHONY_HEADROOM;
1741  
1742  	dev->netdev_ops = &veth_netdev_ops;
1743  	dev->xdp_metadata_ops = &veth_xdp_metadata_ops;
1744  	dev->ethtool_ops = &veth_ethtool_ops;
1745  	dev->features |= NETIF_F_LLTX;
1746  	dev->features |= VETH_FEATURES;
1747  	dev->vlan_features = dev->features &
1748  			     ~(NETIF_F_HW_VLAN_CTAG_TX |
1749  			       NETIF_F_HW_VLAN_STAG_TX |
1750  			       NETIF_F_HW_VLAN_CTAG_RX |
1751  			       NETIF_F_HW_VLAN_STAG_RX);
1752  	dev->needs_free_netdev = true;
1753  	dev->priv_destructor = veth_dev_free;
1754  	dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS;
1755  	dev->max_mtu = ETH_MAX_MTU;
1756  
1757  	dev->hw_features = VETH_FEATURES;
1758  	dev->hw_enc_features = VETH_FEATURES;
1759  	dev->mpls_features = NETIF_F_HW_CSUM | NETIF_F_GSO_SOFTWARE;
1760  	netif_set_tso_max_size(dev, GSO_MAX_SIZE);
1761  }
1762  
1763  /*
1764   * netlink interface
1765   */
1766  
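/* A veth pair is normally created from user space through this
 * rtnetlink interface, e.g.:
 *
 *	ip link add veth0 type veth peer name veth1
 *
 * veth_newlink() registers the peer device first, then the device
 * itself, and ties the two together through priv->peer.
 */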
1767  static int veth_validate(struct nlattr *tb[], struct nlattr *data[],
1768  			 struct netlink_ext_ack *extack)
1769  {
1770  	if (tb[IFLA_ADDRESS]) {
1771  		if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
1772  			return -EINVAL;
1773  		if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
1774  			return -EADDRNOTAVAIL;
1775  	}
1776  	if (tb[IFLA_MTU]) {
1777  		if (!is_valid_veth_mtu(nla_get_u32(tb[IFLA_MTU])))
1778  			return -EINVAL;
1779  	}
1780  	return 0;
1781  }
1782  
1783  static struct rtnl_link_ops veth_link_ops;
1784  
1785  static void veth_disable_gro(struct net_device *dev)
1786  {
1787  	dev->features &= ~NETIF_F_GRO;
1788  	dev->wanted_features &= ~NETIF_F_GRO;
1789  	netdev_update_features(dev);
1790  }
1791  
1792  static int veth_init_queues(struct net_device *dev, struct nlattr *tb[])
1793  {
1794  	int err;
1795  
1796  	if (!tb[IFLA_NUM_TX_QUEUES] && dev->num_tx_queues > 1) {
1797  		err = netif_set_real_num_tx_queues(dev, 1);
1798  		if (err)
1799  			return err;
1800  	}
1801  	if (!tb[IFLA_NUM_RX_QUEUES] && dev->num_rx_queues > 1) {
1802  		err = netif_set_real_num_rx_queues(dev, 1);
1803  		if (err)
1804  			return err;
1805  	}
1806  	return 0;
1807  }
1808  
1809  static int veth_newlink(struct net *src_net, struct net_device *dev,
1810  			struct nlattr *tb[], struct nlattr *data[],
1811  			struct netlink_ext_ack *extack)
1812  {
1813  	int err;
1814  	struct net_device *peer;
1815  	struct veth_priv *priv;
1816  	char ifname[IFNAMSIZ];
1817  	struct nlattr *peer_tb[IFLA_MAX + 1], **tbp;
1818  	unsigned char name_assign_type;
1819  	struct ifinfomsg *ifmp;
1820  	struct net *net;
1821  
1822  	/*
1823  	 * create and register peer first
1824  	 */
1825  	if (data != NULL && data[VETH_INFO_PEER] != NULL) {
1826  		struct nlattr *nla_peer;
1827  
1828  		nla_peer = data[VETH_INFO_PEER];
1829  		ifmp = nla_data(nla_peer);
1830  		err = rtnl_nla_parse_ifinfomsg(peer_tb, nla_peer, extack);
1831  		if (err < 0)
1832  			return err;
1833  
1834  		err = veth_validate(peer_tb, NULL, extack);
1835  		if (err < 0)
1836  			return err;
1837  
1838  		tbp = peer_tb;
1839  	} else {
1840  		ifmp = NULL;
1841  		tbp = tb;
1842  	}
1843  
1844  	if (ifmp && tbp[IFLA_IFNAME]) {
1845  		nla_strscpy(ifname, tbp[IFLA_IFNAME], IFNAMSIZ);
1846  		name_assign_type = NET_NAME_USER;
1847  	} else {
1848  		snprintf(ifname, IFNAMSIZ, DRV_NAME "%%d");
1849  		name_assign_type = NET_NAME_ENUM;
1850  	}
1851  
1852  	net = rtnl_link_get_net(src_net, tbp);
1853  	if (IS_ERR(net))
1854  		return PTR_ERR(net);
1855  
1856  	peer = rtnl_create_link(net, ifname, name_assign_type,
1857  				&veth_link_ops, tbp, extack);
1858  	if (IS_ERR(peer)) {
1859  		put_net(net);
1860  		return PTR_ERR(peer);
1861  	}
1862  
1863  	if (!ifmp || !tbp[IFLA_ADDRESS])
1864  		eth_hw_addr_random(peer);
1865  
1866  	if (ifmp && (dev->ifindex != 0))
1867  		peer->ifindex = ifmp->ifi_index;
1868  
1869  	netif_inherit_tso_max(peer, dev);
1870  
1871  	err = register_netdevice(peer);
1872  	put_net(net);
1873  	net = NULL;
1874  	if (err < 0)
1875  		goto err_register_peer;
1876  
1877  	/* keep GRO disabled by default to be consistent with the established
1878  	 * veth behavior
1879  	 */
1880  	veth_disable_gro(peer);
1881  	netif_carrier_off(peer);
1882  
1883  	err = rtnl_configure_link(peer, ifmp, 0, NULL);
1884  	if (err < 0)
1885  		goto err_configure_peer;
1886  
1887  	/*
1888  	 * register dev last
1889  	 *
1890  	 * Note that, since we've registered a new device, the dev's name
1891  	 * should be re-allocated
1892  	 */
1893  
1894  	if (tb[IFLA_ADDRESS] == NULL)
1895  		eth_hw_addr_random(dev);
1896  
1897  	if (tb[IFLA_IFNAME])
1898  		nla_strscpy(dev->name, tb[IFLA_IFNAME], IFNAMSIZ);
1899  	else
1900  		snprintf(dev->name, IFNAMSIZ, DRV_NAME "%%d");
1901  
1902  	err = register_netdevice(dev);
1903  	if (err < 0)
1904  		goto err_register_dev;
1905  
1906  	netif_carrier_off(dev);
1907  
1908  	/*
1909  	 * tie the devices together
1910  	 */
1911  
1912  	priv = netdev_priv(dev);
1913  	rcu_assign_pointer(priv->peer, peer);
1914  	err = veth_init_queues(dev, tb);
1915  	if (err)
1916  		goto err_queues;
1917  
1918  	priv = netdev_priv(peer);
1919  	rcu_assign_pointer(priv->peer, dev);
1920  	err = veth_init_queues(peer, tb);
1921  	if (err)
1922  		goto err_queues;
1923  
1924  	veth_disable_gro(dev);
1925  	/* update XDP supported features */
1926  	veth_set_xdp_features(dev);
1927  	veth_set_xdp_features(peer);
1928  
1929  	return 0;
1930  
1931  err_queues:
1932  	unregister_netdevice(dev);
1933  err_register_dev:
1934  	/* nothing to do */
1935  err_configure_peer:
1936  	unregister_netdevice(peer);
1937  	return err;
1938  
1939  err_register_peer:
1940  	free_netdev(peer);
1941  	return err;
1942  }
1943  
1944  static void veth_dellink(struct net_device *dev, struct list_head *head)
1945  {
1946  	struct veth_priv *priv;
1947  	struct net_device *peer;
1948  
1949  	priv = netdev_priv(dev);
1950  	peer = rtnl_dereference(priv->peer);
1951  
1952  	/* Note : dellink() is called from default_device_exit_batch(),
1953  	 * before a rcu_synchronize() point. The devices are guaranteed
1954  	 * not being freed before one RCU grace period.
1955  	 */
1956  	RCU_INIT_POINTER(priv->peer, NULL);
1957  	unregister_netdevice_queue(dev, head);
1958  
1959  	if (peer) {
1960  		priv = netdev_priv(peer);
1961  		RCU_INIT_POINTER(priv->peer, NULL);
1962  		unregister_netdevice_queue(peer, head);
1963  	}
1964  }
1965  
1966  static const struct nla_policy veth_policy[VETH_INFO_MAX + 1] = {
1967  	[VETH_INFO_PEER]	= { .len = sizeof(struct ifinfomsg) },
1968  };
1969  
1970  static struct net *veth_get_link_net(const struct net_device *dev)
1971  {
1972  	struct veth_priv *priv = netdev_priv(dev);
1973  	struct net_device *peer = rtnl_dereference(priv->peer);
1974  
1975  	return peer ? dev_net(peer) : dev_net(dev);
1976  }
1977  
1978  static unsigned int veth_get_num_queues(void)
1979  {
1980  	/* enforce the same queue limit as rtnl_create_link */
1981  	int queues = num_possible_cpus();
1982  
1983  	if (queues > 4096)
1984  		queues = 4096;
1985  	return queues;
1986  }
1987  
1988  static struct rtnl_link_ops veth_link_ops = {
1989  	.kind		= DRV_NAME,
1990  	.priv_size	= sizeof(struct veth_priv),
1991  	.setup		= veth_setup,
1992  	.validate	= veth_validate,
1993  	.newlink	= veth_newlink,
1994  	.dellink	= veth_dellink,
1995  	.policy		= veth_policy,
1996  	.maxtype	= VETH_INFO_MAX,
1997  	.get_link_net	= veth_get_link_net,
1998  	.get_num_tx_queues	= veth_get_num_queues,
1999  	.get_num_rx_queues	= veth_get_num_queues,
2000  };
2001  
2002  /*
2003   * init/fini
2004   */
2005  
2006  static __init int veth_init(void)
2007  {
2008  	return rtnl_link_register(&veth_link_ops);
2009  }
2010  
2011  static __exit void veth_exit(void)
2012  {
2013  	rtnl_link_unregister(&veth_link_ops);
2014  }
2015  
2016  module_init(veth_init);
2017  module_exit(veth_exit);
2018  
2019  MODULE_DESCRIPTION("Virtual Ethernet Tunnel");
2020  MODULE_LICENSE("GPL v2");
2021  MODULE_ALIAS_RTNL_LINK(DRV_NAME);
2022