xref: /openbmc/linux/drivers/net/veth.c (revision 2e6ae11dd0d1c37f44cec51a58fb2092e55ed0f5)
1 /*
2  *  drivers/net/veth.c
3  *
4  *  Copyright (C) 2007 OpenVZ http://openvz.org, SWsoft Inc
5  *
6  * Author: Pavel Emelianov <xemul@openvz.org>
7  * Ethtool interface from: Eric W. Biederman <ebiederm@xmission.com>
8  *
9  */
10 
11 #include <linux/netdevice.h>
12 #include <linux/slab.h>
13 #include <linux/ethtool.h>
14 #include <linux/etherdevice.h>
15 #include <linux/u64_stats_sync.h>
16 
17 #include <net/rtnetlink.h>
18 #include <net/dst.h>
19 #include <net/xfrm.h>
20 #include <net/xdp.h>
21 #include <linux/veth.h>
22 #include <linux/module.h>
23 #include <linux/bpf.h>
24 #include <linux/filter.h>
25 #include <linux/ptr_ring.h>
26 #include <linux/bpf_trace.h>
27 
/* Driver name and version reported through ethtool and used as link kind */
#define DRV_NAME	"veth"
#define DRV_VERSION	"1.0"

/* Low pointer bit used to tag ptr_ring entries that are xdp_frames rather
 * than sk_buffs (see veth_is_xdp_frame() and veth_xdp_to_ptr()).
 */
#define VETH_XDP_FLAG		BIT(0)
/* Number of entries in each per-queue xdp_ring */
#define VETH_RING_SIZE		256
/* Headroom left in front of the packet when it is copied into a new page
 * for XDP processing.
 */
#define VETH_XDP_HEADROOM	(XDP_PACKET_HEADROOM + NET_IP_ALIGN)

/* Separating two types of XDP xmit */
#define VETH_XDP_TX		BIT(0)
#define VETH_XDP_REDIR		BIT(1)
38 
/* Per-CPU transmit counters, updated in veth_xmit() and summed in
 * veth_stats_one().
 */
struct pcpu_vstats {
	u64			packets;	/* packets successfully forwarded */
	u64			bytes;		/* bytes successfully forwarded */
	struct u64_stats_sync	syncp;		/* guards consistent 64-bit reads */
};
44 
/* Per receive-queue state used when an XDP program is attached */
struct veth_rq {
	struct napi_struct	xdp_napi;	/* NAPI context polled by veth_poll() */
	struct net_device	*dev;		/* device this queue belongs to */
	struct bpf_prog __rcu	*xdp_prog;	/* program run on this queue, NULL if none */
	struct xdp_mem_info	xdp_mem;	/* saved copy of xdp_rxq.mem */
	bool			rx_notify_masked; /* set while NAPI has been scheduled */
	struct ptr_ring		xdp_ring;	/* queue of skb / tagged xdp_frame pointers */
	struct xdp_rxq_info	xdp_rxq;	/* rxq info handed to the XDP program */
};
54 
/* Private data of one end of a veth pair */
struct veth_priv {
	struct net_device __rcu	*peer;		/* other end of the pair, NULL once detached */
	atomic64_t		dropped;	/* packets dropped in veth_xmit() */
	struct bpf_prog		*_xdp_prog;	/* program set via veth_xdp_set() */
	struct veth_rq		*rq;		/* array of dev->num_rx_queues queues */
	unsigned int		requested_headroom; /* last value given to veth_set_rx_headroom() */
};
62 
63 /*
64  * ethtool interface
65  */
66 
/* Names of the statistics exported through ethtool; the order must match
 * the data[] slots filled in by veth_get_ethtool_stats().
 */
static struct {
	const char string[ETH_GSTRING_LEN];
} ethtool_stats_keys[] = {
	{ "peer_ifindex" },
};
72 
73 static int veth_get_link_ksettings(struct net_device *dev,
74 				   struct ethtool_link_ksettings *cmd)
75 {
76 	cmd->base.speed		= SPEED_10000;
77 	cmd->base.duplex	= DUPLEX_FULL;
78 	cmd->base.port		= PORT_TP;
79 	cmd->base.autoneg	= AUTONEG_DISABLE;
80 	return 0;
81 }
82 
83 static void veth_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info)
84 {
85 	strlcpy(info->driver, DRV_NAME, sizeof(info->driver));
86 	strlcpy(info->version, DRV_VERSION, sizeof(info->version));
87 }
88 
89 static void veth_get_strings(struct net_device *dev, u32 stringset, u8 *buf)
90 {
91 	switch(stringset) {
92 	case ETH_SS_STATS:
93 		memcpy(buf, &ethtool_stats_keys, sizeof(ethtool_stats_keys));
94 		break;
95 	}
96 }
97 
98 static int veth_get_sset_count(struct net_device *dev, int sset)
99 {
100 	switch (sset) {
101 	case ETH_SS_STATS:
102 		return ARRAY_SIZE(ethtool_stats_keys);
103 	default:
104 		return -EOPNOTSUPP;
105 	}
106 }
107 
108 static void veth_get_ethtool_stats(struct net_device *dev,
109 		struct ethtool_stats *stats, u64 *data)
110 {
111 	struct veth_priv *priv = netdev_priv(dev);
112 	struct net_device *peer = rtnl_dereference(priv->peer);
113 
114 	data[0] = peer ? peer->ifindex : 0;
115 }
116 
/* ethtool callbacks; link state uses the generic carrier-based helper */
static const struct ethtool_ops veth_ethtool_ops = {
	.get_drvinfo		= veth_get_drvinfo,
	.get_link		= ethtool_op_get_link,
	.get_strings		= veth_get_strings,
	.get_sset_count		= veth_get_sset_count,
	.get_ethtool_stats	= veth_get_ethtool_stats,
	.get_link_ksettings	= veth_get_link_ksettings,
};
125 
126 /* general routines */
127 
128 static bool veth_is_xdp_frame(void *ptr)
129 {
130 	return (unsigned long)ptr & VETH_XDP_FLAG;
131 }
132 
133 static void *veth_ptr_to_xdp(void *ptr)
134 {
135 	return (void *)((unsigned long)ptr & ~VETH_XDP_FLAG);
136 }
137 
138 static void *veth_xdp_to_ptr(void *ptr)
139 {
140 	return (void *)((unsigned long)ptr | VETH_XDP_FLAG);
141 }
142 
/* Release one ring entry, whichever kind it is: tagged entries are returned
 * as xdp_frames, untagged ones are freed as skbs.
 */
static void veth_ptr_free(void *ptr)
{
	if (!veth_is_xdp_frame(ptr)) {
		kfree_skb(ptr);
		return;
	}

	xdp_return_frame(veth_ptr_to_xdp(ptr));
}
150 
151 static void __veth_xdp_flush(struct veth_rq *rq)
152 {
153 	/* Write ptr_ring before reading rx_notify_masked */
154 	smp_mb();
155 	if (!rq->rx_notify_masked) {
156 		rq->rx_notify_masked = true;
157 		napi_schedule(&rq->xdp_napi);
158 	}
159 }
160 
161 static int veth_xdp_rx(struct veth_rq *rq, struct sk_buff *skb)
162 {
163 	if (unlikely(ptr_ring_produce(&rq->xdp_ring, skb))) {
164 		dev_kfree_skb_any(skb);
165 		return NET_RX_DROP;
166 	}
167 
168 	return NET_RX_SUCCESS;
169 }
170 
171 static int veth_forward_skb(struct net_device *dev, struct sk_buff *skb,
172 			    struct veth_rq *rq, bool xdp)
173 {
174 	return __dev_forward_skb(dev, skb) ?: xdp ?
175 		veth_xdp_rx(rq, skb) :
176 		netif_rx(skb);
177 }
178 
179 static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev)
180 {
181 	struct veth_priv *rcv_priv, *priv = netdev_priv(dev);
182 	struct veth_rq *rq = NULL;
183 	struct net_device *rcv;
184 	int length = skb->len;
185 	bool rcv_xdp = false;
186 	int rxq;
187 
188 	rcu_read_lock();
189 	rcv = rcu_dereference(priv->peer);
190 	if (unlikely(!rcv)) {
191 		kfree_skb(skb);
192 		goto drop;
193 	}
194 
195 	rcv_priv = netdev_priv(rcv);
196 	rxq = skb_get_queue_mapping(skb);
197 	if (rxq < rcv->real_num_rx_queues) {
198 		rq = &rcv_priv->rq[rxq];
199 		rcv_xdp = rcu_access_pointer(rq->xdp_prog);
200 		if (rcv_xdp)
201 			skb_record_rx_queue(skb, rxq);
202 	}
203 
204 	if (likely(veth_forward_skb(rcv, skb, rq, rcv_xdp) == NET_RX_SUCCESS)) {
205 		struct pcpu_vstats *stats = this_cpu_ptr(dev->vstats);
206 
207 		u64_stats_update_begin(&stats->syncp);
208 		stats->bytes += length;
209 		stats->packets++;
210 		u64_stats_update_end(&stats->syncp);
211 	} else {
212 drop:
213 		atomic64_inc(&priv->dropped);
214 	}
215 
216 	if (rcv_xdp)
217 		__veth_xdp_flush(rq);
218 
219 	rcu_read_unlock();
220 
221 	return NETDEV_TX_OK;
222 }
223 
224 static u64 veth_stats_one(struct pcpu_vstats *result, struct net_device *dev)
225 {
226 	struct veth_priv *priv = netdev_priv(dev);
227 	int cpu;
228 
229 	result->packets = 0;
230 	result->bytes = 0;
231 	for_each_possible_cpu(cpu) {
232 		struct pcpu_vstats *stats = per_cpu_ptr(dev->vstats, cpu);
233 		u64 packets, bytes;
234 		unsigned int start;
235 
236 		do {
237 			start = u64_stats_fetch_begin_irq(&stats->syncp);
238 			packets = stats->packets;
239 			bytes = stats->bytes;
240 		} while (u64_stats_fetch_retry_irq(&stats->syncp, start));
241 		result->packets += packets;
242 		result->bytes += bytes;
243 	}
244 	return atomic64_read(&priv->dropped);
245 }
246 
247 static void veth_get_stats64(struct net_device *dev,
248 			     struct rtnl_link_stats64 *tot)
249 {
250 	struct veth_priv *priv = netdev_priv(dev);
251 	struct net_device *peer;
252 	struct pcpu_vstats one;
253 
254 	tot->tx_dropped = veth_stats_one(&one, dev);
255 	tot->tx_bytes = one.bytes;
256 	tot->tx_packets = one.packets;
257 
258 	rcu_read_lock();
259 	peer = rcu_dereference(priv->peer);
260 	if (peer) {
261 		tot->rx_dropped = veth_stats_one(&one, peer);
262 		tot->rx_bytes = one.bytes;
263 		tot->rx_packets = one.packets;
264 	}
265 	rcu_read_unlock();
266 }
267 
/* Fake multicast ability: installed as ndo_set_rx_mode and intentionally
 * does nothing.
 */
static void veth_set_multicast_list(struct net_device *dev)
{
}
272 
273 static struct sk_buff *veth_build_skb(void *head, int headroom, int len,
274 				      int buflen)
275 {
276 	struct sk_buff *skb;
277 
278 	if (!buflen) {
279 		buflen = SKB_DATA_ALIGN(headroom + len) +
280 			 SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
281 	}
282 	skb = build_skb(head, buflen);
283 	if (!skb)
284 		return NULL;
285 
286 	skb_reserve(skb, headroom);
287 	skb_put(skb, len);
288 
289 	return skb;
290 }
291 
292 static int veth_select_rxq(struct net_device *dev)
293 {
294 	return smp_processor_id() % dev->real_num_rx_queues;
295 }
296 
297 static int veth_xdp_xmit(struct net_device *dev, int n,
298 			 struct xdp_frame **frames, u32 flags)
299 {
300 	struct veth_priv *rcv_priv, *priv = netdev_priv(dev);
301 	struct net_device *rcv;
302 	unsigned int max_len;
303 	struct veth_rq *rq;
304 	int i, drops = 0;
305 
306 	if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK))
307 		return -EINVAL;
308 
309 	rcv = rcu_dereference(priv->peer);
310 	if (unlikely(!rcv))
311 		return -ENXIO;
312 
313 	rcv_priv = netdev_priv(rcv);
314 	rq = &rcv_priv->rq[veth_select_rxq(rcv)];
315 	/* Non-NULL xdp_prog ensures that xdp_ring is initialized on receive
316 	 * side. This means an XDP program is loaded on the peer and the peer
317 	 * device is up.
318 	 */
319 	if (!rcu_access_pointer(rq->xdp_prog))
320 		return -ENXIO;
321 
322 	max_len = rcv->mtu + rcv->hard_header_len + VLAN_HLEN;
323 
324 	spin_lock(&rq->xdp_ring.producer_lock);
325 	for (i = 0; i < n; i++) {
326 		struct xdp_frame *frame = frames[i];
327 		void *ptr = veth_xdp_to_ptr(frame);
328 
329 		if (unlikely(frame->len > max_len ||
330 			     __ptr_ring_produce(&rq->xdp_ring, ptr))) {
331 			xdp_return_frame_rx_napi(frame);
332 			drops++;
333 		}
334 	}
335 	spin_unlock(&rq->xdp_ring.producer_lock);
336 
337 	if (flags & XDP_XMIT_FLUSH)
338 		__veth_xdp_flush(rq);
339 
340 	return n - drops;
341 }
342 
343 static void veth_xdp_flush(struct net_device *dev)
344 {
345 	struct veth_priv *rcv_priv, *priv = netdev_priv(dev);
346 	struct net_device *rcv;
347 	struct veth_rq *rq;
348 
349 	rcu_read_lock();
350 	rcv = rcu_dereference(priv->peer);
351 	if (unlikely(!rcv))
352 		goto out;
353 
354 	rcv_priv = netdev_priv(rcv);
355 	rq = &rcv_priv->rq[veth_select_rxq(rcv)];
356 	/* xdp_ring is initialized on receive side? */
357 	if (unlikely(!rcu_access_pointer(rq->xdp_prog)))
358 		goto out;
359 
360 	__veth_xdp_flush(rq);
361 out:
362 	rcu_read_unlock();
363 }
364 
365 static int veth_xdp_tx(struct net_device *dev, struct xdp_buff *xdp)
366 {
367 	struct xdp_frame *frame = convert_to_xdp_frame(xdp);
368 
369 	if (unlikely(!frame))
370 		return -EOVERFLOW;
371 
372 	return veth_xdp_xmit(dev, 1, &frame, 0);
373 }
374 
/* Run the XDP program of @rq on one xdp_frame consumed from the ring.
 *
 * @rq:       receive queue the frame was taken from
 * @frame:    frame to process
 * @xdp_xmit: VETH_XDP_TX or VETH_XDP_REDIR is OR-ed in when the frame was
 *            handed to the TX or redirect path, so the caller knows which
 *            flush is still pending
 *
 * Returns an skb built on the frame's buffer when the packet continues up
 * the stack (XDP_PASS, or no program attached). Returns NULL when the frame
 * was transmitted, redirected or dropped, or when the skb could not be
 * built.
 */
static struct sk_buff *veth_xdp_rcv_one(struct veth_rq *rq,
					struct xdp_frame *frame,
					unsigned int *xdp_xmit)
{
	void *hard_start = frame->data - frame->headroom;
	/* NOTE(review): assumes the xdp_frame struct sits directly in front
	 * of the headroom, so head is the true start of the buffer - confirm.
	 */
	void *head = hard_start - sizeof(struct xdp_frame);
	int len = frame->len, delta = 0;
	struct xdp_frame orig_frame;
	struct bpf_prog *xdp_prog;
	unsigned int headroom;
	struct sk_buff *skb;

	rcu_read_lock();
	xdp_prog = rcu_dereference(rq->xdp_prog);
	if (likely(xdp_prog)) {
		struct xdp_buff xdp;
		u32 act;

		xdp.data_hard_start = hard_start;
		xdp.data = frame->data;
		xdp.data_end = frame->data + frame->len;
		xdp.data_meta = frame->data - frame->metasize;
		xdp.rxq = &rq->xdp_rxq;

		act = bpf_prog_run_xdp(xdp_prog, &xdp);

		switch (act) {
		case XDP_PASS:
			/* Pick up any head/tail adjustment made by the program */
			delta = frame->data - xdp.data;
			len = xdp.data_end - xdp.data;
			break;
		case XDP_TX:
			/* Keep a copy of the frame so it can still be returned
			 * if the transmit fails; presumably the conversion
			 * below rewrites the frame area at head.
			 */
			orig_frame = *frame;
			xdp.data_hard_start = head;
			xdp.rxq->mem = frame->mem;
			if (unlikely(veth_xdp_tx(rq->dev, &xdp) < 0)) {
				trace_xdp_exception(rq->dev, xdp_prog, act);
				frame = &orig_frame;
				goto err_xdp;
			}
			*xdp_xmit |= VETH_XDP_TX;
			rcu_read_unlock();
			goto xdp_xmit;
		case XDP_REDIRECT:
			/* Same frame-copy precaution as for XDP_TX */
			orig_frame = *frame;
			xdp.data_hard_start = head;
			xdp.rxq->mem = frame->mem;
			if (xdp_do_redirect(rq->dev, &xdp, xdp_prog)) {
				frame = &orig_frame;
				goto err_xdp;
			}
			*xdp_xmit |= VETH_XDP_REDIR;
			rcu_read_unlock();
			goto xdp_xmit;
		default:
			bpf_warn_invalid_xdp_action(act);
			/* fall through */
		case XDP_ABORTED:
			trace_xdp_exception(rq->dev, xdp_prog, act);
			/* fall through */
		case XDP_DROP:
			goto err_xdp;
		}
	}
	rcu_read_unlock();

	/* Build the skb in place over the frame's own buffer */
	headroom = sizeof(struct xdp_frame) + frame->headroom - delta;
	skb = veth_build_skb(head, headroom, len, 0);
	if (!skb) {
		xdp_return_frame(frame);
		goto err;
	}

	xdp_scrub_frame(frame);
	skb->protocol = eth_type_trans(skb, rq->dev);
err:
	/* skb is NULL here when veth_build_skb() failed */
	return skb;
err_xdp:
	rcu_read_unlock();
	xdp_return_frame(frame);
xdp_xmit:
	return NULL;
}
456 
457 static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq, struct sk_buff *skb,
458 					unsigned int *xdp_xmit)
459 {
460 	u32 pktlen, headroom, act, metalen;
461 	void *orig_data, *orig_data_end;
462 	struct bpf_prog *xdp_prog;
463 	int mac_len, delta, off;
464 	struct xdp_buff xdp;
465 
466 	rcu_read_lock();
467 	xdp_prog = rcu_dereference(rq->xdp_prog);
468 	if (unlikely(!xdp_prog)) {
469 		rcu_read_unlock();
470 		goto out;
471 	}
472 
473 	mac_len = skb->data - skb_mac_header(skb);
474 	pktlen = skb->len + mac_len;
475 	headroom = skb_headroom(skb) - mac_len;
476 
477 	if (skb_shared(skb) || skb_head_is_locked(skb) ||
478 	    skb_is_nonlinear(skb) || headroom < XDP_PACKET_HEADROOM) {
479 		struct sk_buff *nskb;
480 		int size, head_off;
481 		void *head, *start;
482 		struct page *page;
483 
484 		size = SKB_DATA_ALIGN(VETH_XDP_HEADROOM + pktlen) +
485 		       SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
486 		if (size > PAGE_SIZE)
487 			goto drop;
488 
489 		page = alloc_page(GFP_ATOMIC | __GFP_NOWARN);
490 		if (!page)
491 			goto drop;
492 
493 		head = page_address(page);
494 		start = head + VETH_XDP_HEADROOM;
495 		if (skb_copy_bits(skb, -mac_len, start, pktlen)) {
496 			page_frag_free(head);
497 			goto drop;
498 		}
499 
500 		nskb = veth_build_skb(head,
501 				      VETH_XDP_HEADROOM + mac_len, skb->len,
502 				      PAGE_SIZE);
503 		if (!nskb) {
504 			page_frag_free(head);
505 			goto drop;
506 		}
507 
508 		skb_copy_header(nskb, skb);
509 		head_off = skb_headroom(nskb) - skb_headroom(skb);
510 		skb_headers_offset_update(nskb, head_off);
511 		if (skb->sk)
512 			skb_set_owner_w(nskb, skb->sk);
513 		consume_skb(skb);
514 		skb = nskb;
515 	}
516 
517 	xdp.data_hard_start = skb->head;
518 	xdp.data = skb_mac_header(skb);
519 	xdp.data_end = xdp.data + pktlen;
520 	xdp.data_meta = xdp.data;
521 	xdp.rxq = &rq->xdp_rxq;
522 	orig_data = xdp.data;
523 	orig_data_end = xdp.data_end;
524 
525 	act = bpf_prog_run_xdp(xdp_prog, &xdp);
526 
527 	switch (act) {
528 	case XDP_PASS:
529 		break;
530 	case XDP_TX:
531 		get_page(virt_to_page(xdp.data));
532 		consume_skb(skb);
533 		xdp.rxq->mem = rq->xdp_mem;
534 		if (unlikely(veth_xdp_tx(rq->dev, &xdp) < 0)) {
535 			trace_xdp_exception(rq->dev, xdp_prog, act);
536 			goto err_xdp;
537 		}
538 		*xdp_xmit |= VETH_XDP_TX;
539 		rcu_read_unlock();
540 		goto xdp_xmit;
541 	case XDP_REDIRECT:
542 		get_page(virt_to_page(xdp.data));
543 		consume_skb(skb);
544 		xdp.rxq->mem = rq->xdp_mem;
545 		if (xdp_do_redirect(rq->dev, &xdp, xdp_prog))
546 			goto err_xdp;
547 		*xdp_xmit |= VETH_XDP_REDIR;
548 		rcu_read_unlock();
549 		goto xdp_xmit;
550 	default:
551 		bpf_warn_invalid_xdp_action(act);
552 	case XDP_ABORTED:
553 		trace_xdp_exception(rq->dev, xdp_prog, act);
554 	case XDP_DROP:
555 		goto drop;
556 	}
557 	rcu_read_unlock();
558 
559 	delta = orig_data - xdp.data;
560 	off = mac_len + delta;
561 	if (off > 0)
562 		__skb_push(skb, off);
563 	else if (off < 0)
564 		__skb_pull(skb, -off);
565 	skb->mac_header -= delta;
566 	off = xdp.data_end - orig_data_end;
567 	if (off != 0)
568 		__skb_put(skb, off);
569 	skb->protocol = eth_type_trans(skb, rq->dev);
570 
571 	metalen = xdp.data - xdp.data_meta;
572 	if (metalen)
573 		skb_metadata_set(skb, metalen);
574 out:
575 	return skb;
576 drop:
577 	rcu_read_unlock();
578 	kfree_skb(skb);
579 	return NULL;
580 err_xdp:
581 	rcu_read_unlock();
582 	page_frag_free(xdp.data);
583 xdp_xmit:
584 	return NULL;
585 }
586 
587 static int veth_xdp_rcv(struct veth_rq *rq, int budget, unsigned int *xdp_xmit)
588 {
589 	int i, done = 0;
590 
591 	for (i = 0; i < budget; i++) {
592 		void *ptr = __ptr_ring_consume(&rq->xdp_ring);
593 		struct sk_buff *skb;
594 
595 		if (!ptr)
596 			break;
597 
598 		if (veth_is_xdp_frame(ptr)) {
599 			skb = veth_xdp_rcv_one(rq, veth_ptr_to_xdp(ptr),
600 					       xdp_xmit);
601 		} else {
602 			skb = veth_xdp_rcv_skb(rq, ptr, xdp_xmit);
603 		}
604 
605 		if (skb)
606 			napi_gro_receive(&rq->xdp_napi, skb);
607 
608 		done++;
609 	}
610 
611 	return done;
612 }
613 
/* NAPI poll handler for one receive queue.
 *
 * Processes up to @budget ring entries, then issues whichever flushes the
 * processed packets requested (peer TX and/or redirect maps).
 *
 * Returns the number of entries processed.
 */
static int veth_poll(struct napi_struct *napi, int budget)
{
	struct veth_rq *rq =
		container_of(napi, struct veth_rq, xdp_napi);
	unsigned int xdp_xmit = 0;
	int done;

	xdp_set_return_frame_no_direct();
	done = veth_xdp_rcv(rq, budget, &xdp_xmit);

	if (done < budget && napi_complete_done(napi, done)) {
		/* Write rx_notify_masked before reading ptr_ring */
		smp_store_mb(rq->rx_notify_masked, false);
		/* An entry may have been produced after the last consume but
		 * before the flag was cleared; reschedule so it is not lost.
		 */
		if (unlikely(!__ptr_ring_empty(&rq->xdp_ring))) {
			rq->rx_notify_masked = true;
			napi_schedule(&rq->xdp_napi);
		}
	}

	if (xdp_xmit & VETH_XDP_TX)
		veth_xdp_flush(rq->dev);
	if (xdp_xmit & VETH_XDP_REDIR)
		xdp_do_flush_map();
	xdp_clear_return_frame_no_direct();

	return done;
}
641 
642 static int veth_napi_add(struct net_device *dev)
643 {
644 	struct veth_priv *priv = netdev_priv(dev);
645 	int err, i;
646 
647 	for (i = 0; i < dev->real_num_rx_queues; i++) {
648 		struct veth_rq *rq = &priv->rq[i];
649 
650 		err = ptr_ring_init(&rq->xdp_ring, VETH_RING_SIZE, GFP_KERNEL);
651 		if (err)
652 			goto err_xdp_ring;
653 	}
654 
655 	for (i = 0; i < dev->real_num_rx_queues; i++) {
656 		struct veth_rq *rq = &priv->rq[i];
657 
658 		netif_napi_add(dev, &rq->xdp_napi, veth_poll, NAPI_POLL_WEIGHT);
659 		napi_enable(&rq->xdp_napi);
660 	}
661 
662 	return 0;
663 err_xdp_ring:
664 	for (i--; i >= 0; i--)
665 		ptr_ring_cleanup(&priv->rq[i].xdp_ring, veth_ptr_free);
666 
667 	return err;
668 }
669 
/* Tear down NAPI and the XDP ring on every active rx queue.
 *
 * All NAPI instances are disabled first; only after synchronize_net() are
 * they deleted and the rings drained, with leftover entries released
 * through veth_ptr_free().
 */
static void veth_napi_del(struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);
	int i;

	for (i = 0; i < dev->real_num_rx_queues; i++) {
		struct veth_rq *rq = &priv->rq[i];

		napi_disable(&rq->xdp_napi);
		napi_hash_del(&rq->xdp_napi);
	}
	synchronize_net();

	for (i = 0; i < dev->real_num_rx_queues; i++) {
		struct veth_rq *rq = &priv->rq[i];

		netif_napi_del(&rq->xdp_napi);
		/* Reset so a later enable starts with notifications unmasked */
		rq->rx_notify_masked = false;
		ptr_ring_cleanup(&rq->xdp_ring, veth_ptr_free);
	}
}
691 
/* Make priv->_xdp_prog active on every rx queue of @dev.
 *
 * If the rxq info of queue 0 is not registered yet, rxq info and memory
 * model are registered for every queue and NAPI is set up. In either case
 * the program pointer is then published on each queue.
 *
 * Returns 0 on success or a negative error, in which case every rxq info
 * registered by this call has been unregistered again.
 */
static int veth_enable_xdp(struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);
	int err, i;

	if (!xdp_rxq_info_is_reg(&priv->rq[0].xdp_rxq)) {
		for (i = 0; i < dev->real_num_rx_queues; i++) {
			struct veth_rq *rq = &priv->rq[i];

			err = xdp_rxq_info_reg(&rq->xdp_rxq, dev, i);
			if (err < 0)
				goto err_rxq_reg;

			err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq,
							 MEM_TYPE_PAGE_SHARED,
							 NULL);
			if (err < 0)
				goto err_reg_mem;

			/* Save original mem info as it can be overwritten */
			rq->xdp_mem = rq->xdp_rxq.mem;
		}

		/* On failure here i equals the queue count, so the unwind
		 * below unregisters every queue.
		 */
		err = veth_napi_add(dev);
		if (err)
			goto err_rxq_reg;
	}

	for (i = 0; i < dev->real_num_rx_queues; i++)
		rcu_assign_pointer(priv->rq[i].xdp_prog, priv->_xdp_prog);

	return 0;
err_reg_mem:
	/* Queue i was registered but its mem model was not */
	xdp_rxq_info_unreg(&priv->rq[i].xdp_rxq);
err_rxq_reg:
	for (i--; i >= 0; i--)
		xdp_rxq_info_unreg(&priv->rq[i].xdp_rxq);

	return err;
}
732 
733 static void veth_disable_xdp(struct net_device *dev)
734 {
735 	struct veth_priv *priv = netdev_priv(dev);
736 	int i;
737 
738 	for (i = 0; i < dev->real_num_rx_queues; i++)
739 		rcu_assign_pointer(priv->rq[i].xdp_prog, NULL);
740 	veth_napi_del(dev);
741 	for (i = 0; i < dev->real_num_rx_queues; i++) {
742 		struct veth_rq *rq = &priv->rq[i];
743 
744 		rq->xdp_rxq.mem = rq->xdp_mem;
745 		xdp_rxq_info_unreg(&rq->xdp_rxq);
746 	}
747 }
748 
749 static int veth_open(struct net_device *dev)
750 {
751 	struct veth_priv *priv = netdev_priv(dev);
752 	struct net_device *peer = rtnl_dereference(priv->peer);
753 	int err;
754 
755 	if (!peer)
756 		return -ENOTCONN;
757 
758 	if (priv->_xdp_prog) {
759 		err = veth_enable_xdp(dev);
760 		if (err)
761 			return err;
762 	}
763 
764 	if (peer->flags & IFF_UP) {
765 		netif_carrier_on(dev);
766 		netif_carrier_on(peer);
767 	}
768 
769 	return 0;
770 }
771 
772 static int veth_close(struct net_device *dev)
773 {
774 	struct veth_priv *priv = netdev_priv(dev);
775 	struct net_device *peer = rtnl_dereference(priv->peer);
776 
777 	netif_carrier_off(dev);
778 	if (peer)
779 		netif_carrier_off(peer);
780 
781 	if (priv->_xdp_prog)
782 		veth_disable_xdp(dev);
783 
784 	return 0;
785 }
786 
787 static int is_valid_veth_mtu(int mtu)
788 {
789 	return mtu >= ETH_MIN_MTU && mtu <= ETH_MAX_MTU;
790 }
791 
792 static int veth_alloc_queues(struct net_device *dev)
793 {
794 	struct veth_priv *priv = netdev_priv(dev);
795 	int i;
796 
797 	priv->rq = kcalloc(dev->num_rx_queues, sizeof(*priv->rq), GFP_KERNEL);
798 	if (!priv->rq)
799 		return -ENOMEM;
800 
801 	for (i = 0; i < dev->num_rx_queues; i++)
802 		priv->rq[i].dev = dev;
803 
804 	return 0;
805 }
806 
807 static void veth_free_queues(struct net_device *dev)
808 {
809 	struct veth_priv *priv = netdev_priv(dev);
810 
811 	kfree(priv->rq);
812 }
813 
814 static int veth_dev_init(struct net_device *dev)
815 {
816 	int err;
817 
818 	dev->vstats = netdev_alloc_pcpu_stats(struct pcpu_vstats);
819 	if (!dev->vstats)
820 		return -ENOMEM;
821 
822 	err = veth_alloc_queues(dev);
823 	if (err) {
824 		free_percpu(dev->vstats);
825 		return err;
826 	}
827 
828 	return 0;
829 }
830 
/* priv_destructor: release what veth_dev_init() allocated (rx queue array
 * and per-CPU stats).
 */
static void veth_dev_free(struct net_device *dev)
{
	veth_free_queues(dev);
	free_percpu(dev->vstats);
}
836 
#ifdef CONFIG_NET_POLL_CONTROLLER
/* netpoll hook; intentionally empty. */
static void veth_poll_controller(struct net_device *dev)
{
	/* veth only receives frames when its peer sends one.
	 * Since it has nothing to do with disabling irqs, we are guaranteed
	 * never to have pending data when we poll for it, so
	 * there is nothing to do here.
	 *
	 * We need this though so netpoll recognizes us as an interface that
	 * supports polling, which enables bridge devices in virt setups to
	 * still use netconsole.
	 */
}
#endif	/* CONFIG_NET_POLL_CONTROLLER */
851 
852 static int veth_get_iflink(const struct net_device *dev)
853 {
854 	struct veth_priv *priv = netdev_priv(dev);
855 	struct net_device *peer;
856 	int iflink;
857 
858 	rcu_read_lock();
859 	peer = rcu_dereference(priv->peer);
860 	iflink = peer ? peer->ifindex : 0;
861 	rcu_read_unlock();
862 
863 	return iflink;
864 }
865 
866 static netdev_features_t veth_fix_features(struct net_device *dev,
867 					   netdev_features_t features)
868 {
869 	struct veth_priv *priv = netdev_priv(dev);
870 	struct net_device *peer;
871 
872 	peer = rtnl_dereference(priv->peer);
873 	if (peer) {
874 		struct veth_priv *peer_priv = netdev_priv(peer);
875 
876 		if (peer_priv->_xdp_prog)
877 			features &= ~NETIF_F_GSO_SOFTWARE;
878 	}
879 
880 	return features;
881 }
882 
883 static void veth_set_rx_headroom(struct net_device *dev, int new_hr)
884 {
885 	struct veth_priv *peer_priv, *priv = netdev_priv(dev);
886 	struct net_device *peer;
887 
888 	if (new_hr < 0)
889 		new_hr = 0;
890 
891 	rcu_read_lock();
892 	peer = rcu_dereference(priv->peer);
893 	if (unlikely(!peer))
894 		goto out;
895 
896 	peer_priv = netdev_priv(peer);
897 	priv->requested_headroom = new_hr;
898 	new_hr = max(priv->requested_headroom, peer_priv->requested_headroom);
899 	dev->needed_headroom = new_hr;
900 	peer->needed_headroom = new_hr;
901 
902 out:
903 	rcu_read_unlock();
904 }
905 
/* Attach, replace or detach the XDP program of @dev.
 *
 * @dev:    device to configure
 * @prog:   new program, or NULL to detach
 * @extack: netlink extended ack used to report the failure reason
 *
 * Attaching requires a peer whose MTU fits a single page together with the
 * XDP headroom, and at least as many rx queues on @dev as the peer has tx
 * queues. While a program is attached the peer loses software GSO and its
 * max_mtu is capped; both are restored on detach.
 *
 * Uses rtnl_dereference(), so RTNL is expected to be held by the caller.
 *
 * Returns 0 on success. On error the previous program stays in place and
 * -ENOTCONN, -ERANGE, -ENOSPC or the error from veth_enable_xdp() is
 * returned.
 */
static int veth_xdp_set(struct net_device *dev, struct bpf_prog *prog,
			struct netlink_ext_ack *extack)
{
	struct veth_priv *priv = netdev_priv(dev);
	struct bpf_prog *old_prog;
	struct net_device *peer;
	unsigned int max_mtu;
	int err;

	/* Install the new program first; veth_enable_xdp() reads it from priv */
	old_prog = priv->_xdp_prog;
	priv->_xdp_prog = prog;
	peer = rtnl_dereference(priv->peer);

	if (prog) {
		if (!peer) {
			NL_SET_ERR_MSG_MOD(extack, "Cannot set XDP when peer is detached");
			err = -ENOTCONN;
			goto err;
		}

		/* Largest peer MTU whose packets still fit in one page */
		max_mtu = PAGE_SIZE - VETH_XDP_HEADROOM -
			  peer->hard_header_len -
			  SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
		if (peer->mtu > max_mtu) {
			NL_SET_ERR_MSG_MOD(extack, "Peer MTU is too large to set XDP");
			err = -ERANGE;
			goto err;
		}

		if (dev->real_num_rx_queues < peer->real_num_tx_queues) {
			NL_SET_ERR_MSG_MOD(extack, "XDP expects number of rx queues not less than peer tx queues");
			err = -ENOSPC;
			goto err;
		}

		/* A device that is down gets XDP enabled later in veth_open() */
		if (dev->flags & IFF_UP) {
			err = veth_enable_xdp(dev);
			if (err) {
				NL_SET_ERR_MSG_MOD(extack, "Setup for XDP failed");
				goto err;
			}
		}

		/* First attach: restrict the peer */
		if (!old_prog) {
			peer->hw_features &= ~NETIF_F_GSO_SOFTWARE;
			peer->max_mtu = max_mtu;
		}
	}

	if (old_prog) {
		if (!prog) {
			/* Detach: tear XDP down and lift the peer restrictions */
			if (dev->flags & IFF_UP)
				veth_disable_xdp(dev);

			if (peer) {
				peer->hw_features |= NETIF_F_GSO_SOFTWARE;
				peer->max_mtu = ETH_MAX_MTU;
			}
		}
		bpf_prog_put(old_prog);
	}

	/* Re-evaluate peer features only on an attach or detach transition */
	if ((!!old_prog ^ !!prog) && peer)
		netdev_update_features(peer);

	return 0;
err:
	priv->_xdp_prog = old_prog;

	return err;
}
977 
978 static u32 veth_xdp_query(struct net_device *dev)
979 {
980 	struct veth_priv *priv = netdev_priv(dev);
981 	const struct bpf_prog *xdp_prog;
982 
983 	xdp_prog = priv->_xdp_prog;
984 	if (xdp_prog)
985 		return xdp_prog->aux->id;
986 
987 	return 0;
988 }
989 
990 static int veth_xdp(struct net_device *dev, struct netdev_bpf *xdp)
991 {
992 	switch (xdp->command) {
993 	case XDP_SETUP_PROG:
994 		return veth_xdp_set(dev, xdp->prog, xdp->extack);
995 	case XDP_QUERY_PROG:
996 		xdp->prog_id = veth_xdp_query(dev);
997 		return 0;
998 	default:
999 		return -EINVAL;
1000 	}
1001 }
1002 
/* net_device callbacks for both ends of a veth pair */
static const struct net_device_ops veth_netdev_ops = {
	.ndo_init            = veth_dev_init,
	.ndo_open            = veth_open,
	.ndo_stop            = veth_close,
	.ndo_start_xmit      = veth_xmit,
	.ndo_get_stats64     = veth_get_stats64,
	.ndo_set_rx_mode     = veth_set_multicast_list,
	.ndo_set_mac_address = eth_mac_addr,
#ifdef CONFIG_NET_POLL_CONTROLLER
	.ndo_poll_controller	= veth_poll_controller,
#endif
	.ndo_get_iflink		= veth_get_iflink,
	.ndo_fix_features	= veth_fix_features,
	.ndo_features_check	= passthru_features_check,
	.ndo_set_rx_headroom	= veth_set_rx_headroom,
	.ndo_bpf		= veth_xdp,
	.ndo_xdp_xmit		= veth_xdp_xmit,
};
1021 
/* Offload features advertised by veth; used by veth_setup() for features,
 * hw_features and hw_enc_features.
 */
#define VETH_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HW_CSUM | \
		       NETIF_F_RXCSUM | NETIF_F_SCTP_CRC | NETIF_F_HIGHDMA | \
		       NETIF_F_GSO_SOFTWARE | NETIF_F_GSO_ENCAP_ALL | \
		       NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX | \
		       NETIF_F_HW_VLAN_STAG_TX | NETIF_F_HW_VLAN_STAG_RX )
1027 
1028 static void veth_setup(struct net_device *dev)
1029 {
1030 	ether_setup(dev);
1031 
1032 	dev->priv_flags &= ~IFF_TX_SKB_SHARING;
1033 	dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
1034 	dev->priv_flags |= IFF_NO_QUEUE;
1035 	dev->priv_flags |= IFF_PHONY_HEADROOM;
1036 
1037 	dev->netdev_ops = &veth_netdev_ops;
1038 	dev->ethtool_ops = &veth_ethtool_ops;
1039 	dev->features |= NETIF_F_LLTX;
1040 	dev->features |= VETH_FEATURES;
1041 	dev->vlan_features = dev->features &
1042 			     ~(NETIF_F_HW_VLAN_CTAG_TX |
1043 			       NETIF_F_HW_VLAN_STAG_TX |
1044 			       NETIF_F_HW_VLAN_CTAG_RX |
1045 			       NETIF_F_HW_VLAN_STAG_RX);
1046 	dev->needs_free_netdev = true;
1047 	dev->priv_destructor = veth_dev_free;
1048 	dev->max_mtu = ETH_MAX_MTU;
1049 
1050 	dev->hw_features = VETH_FEATURES;
1051 	dev->hw_enc_features = VETH_FEATURES;
1052 	dev->mpls_features = NETIF_F_HW_CSUM | NETIF_F_GSO_SOFTWARE;
1053 }
1054 
1055 /*
1056  * netlink interface
1057  */
1058 
1059 static int veth_validate(struct nlattr *tb[], struct nlattr *data[],
1060 			 struct netlink_ext_ack *extack)
1061 {
1062 	if (tb[IFLA_ADDRESS]) {
1063 		if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
1064 			return -EINVAL;
1065 		if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
1066 			return -EADDRNOTAVAIL;
1067 	}
1068 	if (tb[IFLA_MTU]) {
1069 		if (!is_valid_veth_mtu(nla_get_u32(tb[IFLA_MTU])))
1070 			return -EINVAL;
1071 	}
1072 	return 0;
1073 }
1074 
/* Forward declaration: veth_newlink() needs the ops before they are defined */
static struct rtnl_link_ops veth_link_ops;
1076 
/* rtnl_link_ops newlink callback: create and register a veth pair.
 *
 * @src_net: namespace the request came from
 * @dev:     already allocated device for the local end
 * @tb:      link attributes for @dev
 * @data:    veth-specific attributes; VETH_INFO_PEER optionally carries an
 *           ifinfomsg followed by link attributes for the peer
 * @extack:  netlink extended ack
 *
 * The peer is created and registered first, then @dev, and finally the two
 * are tied together through their priv->peer pointers.
 *
 * Returns 0 on success or a negative error; on failure the peer is
 * unregistered or freed again.
 */
static int veth_newlink(struct net *src_net, struct net_device *dev,
			struct nlattr *tb[], struct nlattr *data[],
			struct netlink_ext_ack *extack)
{
	int err;
	struct net_device *peer;
	struct veth_priv *priv;
	char ifname[IFNAMSIZ];
	struct nlattr *peer_tb[IFLA_MAX + 1], **tbp;
	unsigned char name_assign_type;
	struct ifinfomsg *ifmp;
	struct net *net;

	/*
	 * create and register peer first
	 */
	if (data != NULL && data[VETH_INFO_PEER] != NULL) {
		struct nlattr *nla_peer;

		nla_peer = data[VETH_INFO_PEER];
		ifmp = nla_data(nla_peer);
		/* The peer's attributes follow the ifinfomsg header */
		err = rtnl_nla_parse_ifla(peer_tb,
					  nla_data(nla_peer) + sizeof(struct ifinfomsg),
					  nla_len(nla_peer) - sizeof(struct ifinfomsg),
					  NULL);
		if (err < 0)
			return err;

		err = veth_validate(peer_tb, NULL, extack);
		if (err < 0)
			return err;

		tbp = peer_tb;
	} else {
		/* No peer description: fall back to the attributes of @dev */
		ifmp = NULL;
		tbp = tb;
	}

	if (ifmp && tbp[IFLA_IFNAME]) {
		nla_strlcpy(ifname, tbp[IFLA_IFNAME], IFNAMSIZ);
		name_assign_type = NET_NAME_USER;
	} else {
		snprintf(ifname, IFNAMSIZ, DRV_NAME "%%d");
		name_assign_type = NET_NAME_ENUM;
	}

	net = rtnl_link_get_net(src_net, tbp);
	if (IS_ERR(net))
		return PTR_ERR(net);

	peer = rtnl_create_link(net, ifname, name_assign_type,
				&veth_link_ops, tbp);
	if (IS_ERR(peer)) {
		put_net(net);
		return PTR_ERR(peer);
	}

	if (!ifmp || !tbp[IFLA_ADDRESS])
		eth_hw_addr_random(peer);

	if (ifmp && (dev->ifindex != 0))
		peer->ifindex = ifmp->ifi_index;

	peer->gso_max_size = dev->gso_max_size;
	peer->gso_max_segs = dev->gso_max_segs;

	err = register_netdevice(peer);
	/* The namespace reference is dropped whether or not this succeeded */
	put_net(net);
	net = NULL;
	if (err < 0)
		goto err_register_peer;

	netif_carrier_off(peer);

	err = rtnl_configure_link(peer, ifmp);
	if (err < 0)
		goto err_configure_peer;

	/*
	 * register dev last
	 *
	 * note, that since we've registered new device the dev's name
	 * should be re-allocated
	 */

	if (tb[IFLA_ADDRESS] == NULL)
		eth_hw_addr_random(dev);

	if (tb[IFLA_IFNAME])
		nla_strlcpy(dev->name, tb[IFLA_IFNAME], IFNAMSIZ);
	else
		snprintf(dev->name, IFNAMSIZ, DRV_NAME "%%d");

	err = register_netdevice(dev);
	if (err < 0)
		goto err_register_dev;

	netif_carrier_off(dev);

	/*
	 * tie the devices together
	 */

	priv = netdev_priv(dev);
	rcu_assign_pointer(priv->peer, peer);

	priv = netdev_priv(peer);
	rcu_assign_pointer(priv->peer, dev);

	return 0;

err_register_dev:
	/* nothing to do */
err_configure_peer:
	/* The peer is registered at this point, so it must be unregistered */
	unregister_netdevice(peer);
	return err;

err_register_peer:
	/* Registration itself failed: the peer only needs to be freed */
	free_netdev(peer);
	return err;
}
1198 
1199 static void veth_dellink(struct net_device *dev, struct list_head *head)
1200 {
1201 	struct veth_priv *priv;
1202 	struct net_device *peer;
1203 
1204 	priv = netdev_priv(dev);
1205 	peer = rtnl_dereference(priv->peer);
1206 
1207 	/* Note : dellink() is called from default_device_exit_batch(),
1208 	 * before a rcu_synchronize() point. The devices are guaranteed
1209 	 * not being freed before one RCU grace period.
1210 	 */
1211 	RCU_INIT_POINTER(priv->peer, NULL);
1212 	unregister_netdevice_queue(dev, head);
1213 
1214 	if (peer) {
1215 		priv = netdev_priv(peer);
1216 		RCU_INIT_POINTER(priv->peer, NULL);
1217 		unregister_netdevice_queue(peer, head);
1218 	}
1219 }
1220 
/* Netlink policy: VETH_INFO_PEER carries a struct ifinfomsg, which
 * veth_newlink() expects to be followed by the peer's link attributes.
 */
static const struct nla_policy veth_policy[VETH_INFO_MAX + 1] = {
	[VETH_INFO_PEER]	= { .len = sizeof(struct ifinfomsg) },
};
1224 
1225 static struct net *veth_get_link_net(const struct net_device *dev)
1226 {
1227 	struct veth_priv *priv = netdev_priv(dev);
1228 	struct net_device *peer = rtnl_dereference(priv->peer);
1229 
1230 	return peer ? dev_net(peer) : dev_net(dev);
1231 }
1232 
/* rtnetlink operations for links of kind "veth" */
static struct rtnl_link_ops veth_link_ops = {
	.kind		= DRV_NAME,
	.priv_size	= sizeof(struct veth_priv),
	.setup		= veth_setup,
	.validate	= veth_validate,
	.newlink	= veth_newlink,
	.dellink	= veth_dellink,
	.policy		= veth_policy,
	.maxtype	= VETH_INFO_MAX,
	.get_link_net	= veth_get_link_net,
};
1244 
1245 /*
1246  * init/fini
1247  */
1248 
/* Module init: register the veth link type; returns the registration result */
static __init int veth_init(void)
{
	return rtnl_link_register(&veth_link_ops);
}
1253 
/* Module exit: unregister the veth link type */
static __exit void veth_exit(void)
{
	rtnl_link_unregister(&veth_link_ops);
}
1258 
/* Module entry points and metadata */
module_init(veth_init);
module_exit(veth_exit);

MODULE_DESCRIPTION("Virtual Ethernet Tunnel");
MODULE_LICENSE("GPL v2");
/* Lets the module be auto-loaded when a link of kind "veth" is requested */
MODULE_ALIAS_RTNL_LINK(DRV_NAME);
1265