xref: /openbmc/linux/net/ipv6/seg6_local.c (revision a1dff44b354c0e2721aeae075a287d07daf1c76b)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  *  SR-IPv6 implementation
4  *
5  *  Authors:
6  *  David Lebrun <david.lebrun@uclouvain.be>
7  *  eBPF support: Mathieu Xhonneux <m.xhonneux@gmail.com>
8  */
9 
10 #include <linux/types.h>
11 #include <linux/skbuff.h>
12 #include <linux/net.h>
13 #include <linux/module.h>
14 #include <net/ip.h>
15 #include <net/lwtunnel.h>
16 #include <net/netevent.h>
17 #include <net/netns/generic.h>
18 #include <net/ip6_fib.h>
19 #include <net/route.h>
20 #include <net/seg6.h>
21 #include <linux/seg6.h>
22 #include <linux/seg6_local.h>
23 #include <net/addrconf.h>
24 #include <net/ip6_route.h>
25 #include <net/dst_cache.h>
26 #include <net/ip_tunnels.h>
27 #ifdef CONFIG_IPV6_SEG6_HMAC
28 #include <net/seg6_hmac.h>
29 #endif
30 #include <net/seg6_local.h>
31 #include <linux/etherdevice.h>
32 #include <linux/bpf.h>
33 
/* Turn a SEG6_LOCAL_* attribute index into its bit in an attribute mask. */
#define SEG6_F_ATTR(i)		BIT(i)

/* Defined further down; the callback types below only need the tag. */
struct seg6_local_lwt;
37 
/* Per-behavior hooks that let a behavior customize how one of its instances
 * is created (build_state) and destroyed (destroy_state).
 */
struct seg6_local_lwtunnel_ops {
	int (*build_state)(struct seg6_local_lwt *slwt, const void *cfg,
			   struct netlink_ext_ack *extack);
	void (*destroy_state)(struct seg6_local_lwt *slwt);
};
44 
45 struct seg6_action_desc {
46 	int action;
47 	unsigned long attrs;
48 
49 	/* The optattrs field is used for specifying all the optional
50 	 * attributes supported by a specific behavior.
51 	 * It means that if one of these attributes is not provided in the
52 	 * netlink message during the behavior creation, no errors will be
53 	 * returned to the userspace.
54 	 *
55 	 * Each attribute can be only of two types (mutually exclusive):
56 	 * 1) required or 2) optional.
57 	 * Every user MUST obey to this rule! If you set an attribute as
58 	 * required the same attribute CANNOT be set as optional and vice
59 	 * versa.
60 	 */
61 	unsigned long optattrs;
62 
63 	int (*input)(struct sk_buff *skb, struct seg6_local_lwt *slwt);
64 	int static_headroom;
65 
66 	struct seg6_local_lwtunnel_ops slwt_ops;
67 };
68 
/* eBPF program attached to a behavior instance, together with its name. */
struct bpf_lwt_prog {
	struct bpf_prog *prog;
	char *name;
};
73 
74 enum seg6_end_dt_mode {
75 	DT_INVALID_MODE	= -EINVAL,
76 	DT_LEGACY_MODE	= 0,
77 	DT_VRF_MODE	= 1,
78 };
79 
80 struct seg6_end_dt_info {
81 	enum seg6_end_dt_mode mode;
82 
83 	struct net *net;
84 	/* VRF device associated to the routing table used by the SRv6
85 	 * End.DT4/DT6 behavior for routing IPv4/IPv6 packets.
86 	 */
87 	int vrf_ifindex;
88 	int vrf_table;
89 
90 	/* tunneled packet proto and family (IPv4 or IPv6) */
91 	__be16 proto;
92 	u16 family;
93 	int hdrlen;
94 };
95 
96 struct pcpu_seg6_local_counters {
97 	u64_stats_t packets;
98 	u64_stats_t bytes;
99 	u64_stats_t errors;
100 
101 	struct u64_stats_sync syncp;
102 };
103 
104 /* This struct groups all the SRv6 Behavior counters supported so far.
105  *
106  * put_nla_counters() makes use of this data structure to collect all counter
107  * values after the per-CPU counter evaluation has been performed.
108  * Finally, each counter value (in seg6_local_counters) is stored in the
109  * corresponding netlink attribute and sent to user space.
110  *
111  * NB: we don't want to expose this structure to user space!
112  */
113 struct seg6_local_counters {
114 	__u64 packets;
115 	__u64 bytes;
116 	__u64 errors;
117 };
118 
/* Allocate the per-CPU counters; __GFP_ZERO is always OR-ed into @__gfp. */
#define seg6_local_alloc_pcpu_counters(__gfp)				\
	__netdev_alloc_pcpu_stats(struct pcpu_seg6_local_counters,	\
				  ((__gfp) | __GFP_ZERO))

/* attribute bit of the (optional) counters attribute */
#define SEG6_F_LOCAL_COUNTERS	SEG6_F_ATTR(SEG6_LOCAL_COUNTERS)
124 
125 struct seg6_local_lwt {
126 	int action;
127 	struct ipv6_sr_hdr *srh;
128 	int table;
129 	struct in_addr nh4;
130 	struct in6_addr nh6;
131 	int iif;
132 	int oif;
133 	struct bpf_lwt_prog bpf;
134 #ifdef CONFIG_NET_L3_MASTER_DEV
135 	struct seg6_end_dt_info dt_info;
136 #endif
137 	struct pcpu_seg6_local_counters __percpu *pcpu_counters;
138 
139 	int headroom;
140 	struct seg6_action_desc *desc;
141 	/* unlike the required attrs, we have to track the optional attributes
142 	 * that have been effectively parsed.
143 	 */
144 	unsigned long parsed_optattrs;
145 };
146 
147 static struct seg6_local_lwt *seg6_local_lwtunnel(struct lwtunnel_state *lwt)
148 {
149 	return (struct seg6_local_lwt *)lwt->data;
150 }
151 
152 static struct ipv6_sr_hdr *get_srh(struct sk_buff *skb, int flags)
153 {
154 	struct ipv6_sr_hdr *srh;
155 	int len, srhoff = 0;
156 
157 	if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, &flags) < 0)
158 		return NULL;
159 
160 	if (!pskb_may_pull(skb, srhoff + sizeof(*srh)))
161 		return NULL;
162 
163 	srh = (struct ipv6_sr_hdr *)(skb->data + srhoff);
164 
165 	len = (srh->hdrlen + 1) << 3;
166 
167 	if (!pskb_may_pull(skb, srhoff + len))
168 		return NULL;
169 
170 	/* note that pskb_may_pull may change pointers in header;
171 	 * for this reason it is necessary to reload them when needed.
172 	 */
173 	srh = (struct ipv6_sr_hdr *)(skb->data + srhoff);
174 
175 	if (!seg6_validate_srh(srh, len, true))
176 		return NULL;
177 
178 	return srh;
179 }
180 
181 static struct ipv6_sr_hdr *get_and_validate_srh(struct sk_buff *skb)
182 {
183 	struct ipv6_sr_hdr *srh;
184 
185 	srh = get_srh(skb, IP6_FH_F_SKIP_RH);
186 	if (!srh)
187 		return NULL;
188 
189 #ifdef CONFIG_IPV6_SEG6_HMAC
190 	if (!seg6_hmac_validate_skb(skb))
191 		return NULL;
192 #endif
193 
194 	return srh;
195 }
196 
197 static bool decap_and_validate(struct sk_buff *skb, int proto)
198 {
199 	struct ipv6_sr_hdr *srh;
200 	unsigned int off = 0;
201 
202 	srh = get_srh(skb, 0);
203 	if (srh && srh->segments_left > 0)
204 		return false;
205 
206 #ifdef CONFIG_IPV6_SEG6_HMAC
207 	if (srh && !seg6_hmac_validate_skb(skb))
208 		return false;
209 #endif
210 
211 	if (ipv6_find_hdr(skb, &off, proto, NULL, NULL) < 0)
212 		return false;
213 
214 	if (!pskb_pull(skb, off))
215 		return false;
216 
217 	skb_postpull_rcsum(skb, skb_network_header(skb), off);
218 
219 	skb_reset_network_header(skb);
220 	skb_reset_transport_header(skb);
221 	if (iptunnel_pull_offloads(skb))
222 		return false;
223 
224 	return true;
225 }
226 
227 static void advance_nextseg(struct ipv6_sr_hdr *srh, struct in6_addr *daddr)
228 {
229 	struct in6_addr *addr;
230 
231 	srh->segments_left--;
232 	addr = srh->segments + srh->segments_left;
233 	*daddr = *addr;
234 }
235 
236 static int
237 seg6_lookup_any_nexthop(struct sk_buff *skb, struct in6_addr *nhaddr,
238 			u32 tbl_id, bool local_delivery)
239 {
240 	struct net *net = dev_net(skb->dev);
241 	struct ipv6hdr *hdr = ipv6_hdr(skb);
242 	int flags = RT6_LOOKUP_F_HAS_SADDR;
243 	struct dst_entry *dst = NULL;
244 	struct rt6_info *rt;
245 	struct flowi6 fl6;
246 	int dev_flags = 0;
247 
248 	fl6.flowi6_iif = skb->dev->ifindex;
249 	fl6.daddr = nhaddr ? *nhaddr : hdr->daddr;
250 	fl6.saddr = hdr->saddr;
251 	fl6.flowlabel = ip6_flowinfo(hdr);
252 	fl6.flowi6_mark = skb->mark;
253 	fl6.flowi6_proto = hdr->nexthdr;
254 
255 	if (nhaddr)
256 		fl6.flowi6_flags = FLOWI_FLAG_KNOWN_NH;
257 
258 	if (!tbl_id) {
259 		dst = ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags);
260 	} else {
261 		struct fib6_table *table;
262 
263 		table = fib6_get_table(net, tbl_id);
264 		if (!table)
265 			goto out;
266 
267 		rt = ip6_pol_route(net, table, 0, &fl6, skb, flags);
268 		dst = &rt->dst;
269 	}
270 
271 	/* we want to discard traffic destined for local packet processing,
272 	 * if @local_delivery is set to false.
273 	 */
274 	if (!local_delivery)
275 		dev_flags |= IFF_LOOPBACK;
276 
277 	if (dst && (dst->dev->flags & dev_flags) && !dst->error) {
278 		dst_release(dst);
279 		dst = NULL;
280 	}
281 
282 out:
283 	if (!dst) {
284 		rt = net->ipv6.ip6_blk_hole_entry;
285 		dst = &rt->dst;
286 		dst_hold(dst);
287 	}
288 
289 	skb_dst_drop(skb);
290 	skb_dst_set(skb, dst);
291 	return dst->error;
292 }
293 
294 int seg6_lookup_nexthop(struct sk_buff *skb,
295 			struct in6_addr *nhaddr, u32 tbl_id)
296 {
297 	return seg6_lookup_any_nexthop(skb, nhaddr, tbl_id, false);
298 }
299 
300 /* regular endpoint function */
301 static int input_action_end(struct sk_buff *skb, struct seg6_local_lwt *slwt)
302 {
303 	struct ipv6_sr_hdr *srh;
304 
305 	srh = get_and_validate_srh(skb);
306 	if (!srh)
307 		goto drop;
308 
309 	advance_nextseg(srh, &ipv6_hdr(skb)->daddr);
310 
311 	seg6_lookup_nexthop(skb, NULL, 0);
312 
313 	return dst_input(skb);
314 
315 drop:
316 	kfree_skb(skb);
317 	return -EINVAL;
318 }
319 
320 /* regular endpoint, and forward to specified nexthop */
321 static int input_action_end_x(struct sk_buff *skb, struct seg6_local_lwt *slwt)
322 {
323 	struct ipv6_sr_hdr *srh;
324 
325 	srh = get_and_validate_srh(skb);
326 	if (!srh)
327 		goto drop;
328 
329 	advance_nextseg(srh, &ipv6_hdr(skb)->daddr);
330 
331 	seg6_lookup_nexthop(skb, &slwt->nh6, 0);
332 
333 	return dst_input(skb);
334 
335 drop:
336 	kfree_skb(skb);
337 	return -EINVAL;
338 }
339 
340 static int input_action_end_t(struct sk_buff *skb, struct seg6_local_lwt *slwt)
341 {
342 	struct ipv6_sr_hdr *srh;
343 
344 	srh = get_and_validate_srh(skb);
345 	if (!srh)
346 		goto drop;
347 
348 	advance_nextseg(srh, &ipv6_hdr(skb)->daddr);
349 
350 	seg6_lookup_nexthop(skb, NULL, slwt->table);
351 
352 	return dst_input(skb);
353 
354 drop:
355 	kfree_skb(skb);
356 	return -EINVAL;
357 }
358 
359 /* decapsulate and forward inner L2 frame on specified interface */
360 static int input_action_end_dx2(struct sk_buff *skb,
361 				struct seg6_local_lwt *slwt)
362 {
363 	struct net *net = dev_net(skb->dev);
364 	struct net_device *odev;
365 	struct ethhdr *eth;
366 
367 	if (!decap_and_validate(skb, IPPROTO_ETHERNET))
368 		goto drop;
369 
370 	if (!pskb_may_pull(skb, ETH_HLEN))
371 		goto drop;
372 
373 	skb_reset_mac_header(skb);
374 	eth = (struct ethhdr *)skb->data;
375 
376 	/* To determine the frame's protocol, we assume it is 802.3. This avoids
377 	 * a call to eth_type_trans(), which is not really relevant for our
378 	 * use case.
379 	 */
380 	if (!eth_proto_is_802_3(eth->h_proto))
381 		goto drop;
382 
383 	odev = dev_get_by_index_rcu(net, slwt->oif);
384 	if (!odev)
385 		goto drop;
386 
387 	/* As we accept Ethernet frames, make sure the egress device is of
388 	 * the correct type.
389 	 */
390 	if (odev->type != ARPHRD_ETHER)
391 		goto drop;
392 
393 	if (!(odev->flags & IFF_UP) || !netif_carrier_ok(odev))
394 		goto drop;
395 
396 	skb_orphan(skb);
397 
398 	if (skb_warn_if_lro(skb))
399 		goto drop;
400 
401 	skb_forward_csum(skb);
402 
403 	if (skb->len - ETH_HLEN > odev->mtu)
404 		goto drop;
405 
406 	skb->dev = odev;
407 	skb->protocol = eth->h_proto;
408 
409 	return dev_queue_xmit(skb);
410 
411 drop:
412 	kfree_skb(skb);
413 	return -EINVAL;
414 }
415 
416 /* decapsulate and forward to specified nexthop */
417 static int input_action_end_dx6(struct sk_buff *skb,
418 				struct seg6_local_lwt *slwt)
419 {
420 	struct in6_addr *nhaddr = NULL;
421 
422 	/* this function accepts IPv6 encapsulated packets, with either
423 	 * an SRH with SL=0, or no SRH.
424 	 */
425 
426 	if (!decap_and_validate(skb, IPPROTO_IPV6))
427 		goto drop;
428 
429 	if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
430 		goto drop;
431 
432 	/* The inner packet is not associated to any local interface,
433 	 * so we do not call netif_rx().
434 	 *
435 	 * If slwt->nh6 is set to ::, then lookup the nexthop for the
436 	 * inner packet's DA. Otherwise, use the specified nexthop.
437 	 */
438 
439 	if (!ipv6_addr_any(&slwt->nh6))
440 		nhaddr = &slwt->nh6;
441 
442 	skb_set_transport_header(skb, sizeof(struct ipv6hdr));
443 
444 	seg6_lookup_nexthop(skb, nhaddr, 0);
445 
446 	return dst_input(skb);
447 drop:
448 	kfree_skb(skb);
449 	return -EINVAL;
450 }
451 
452 static int input_action_end_dx4(struct sk_buff *skb,
453 				struct seg6_local_lwt *slwt)
454 {
455 	struct iphdr *iph;
456 	__be32 nhaddr;
457 	int err;
458 
459 	if (!decap_and_validate(skb, IPPROTO_IPIP))
460 		goto drop;
461 
462 	if (!pskb_may_pull(skb, sizeof(struct iphdr)))
463 		goto drop;
464 
465 	skb->protocol = htons(ETH_P_IP);
466 
467 	iph = ip_hdr(skb);
468 
469 	nhaddr = slwt->nh4.s_addr ?: iph->daddr;
470 
471 	skb_dst_drop(skb);
472 
473 	skb_set_transport_header(skb, sizeof(struct iphdr));
474 
475 	err = ip_route_input(skb, nhaddr, iph->saddr, 0, skb->dev);
476 	if (err)
477 		goto drop;
478 
479 	return dst_input(skb);
480 
481 drop:
482 	kfree_skb(skb);
483 	return -EINVAL;
484 }
485 
486 #ifdef CONFIG_NET_L3_MASTER_DEV
487 static struct net *fib6_config_get_net(const struct fib6_config *fib6_cfg)
488 {
489 	const struct nl_info *nli = &fib6_cfg->fc_nlinfo;
490 
491 	return nli->nl_net;
492 }
493 
494 static int __seg6_end_dt_vrf_build(struct seg6_local_lwt *slwt, const void *cfg,
495 				   u16 family, struct netlink_ext_ack *extack)
496 {
497 	struct seg6_end_dt_info *info = &slwt->dt_info;
498 	int vrf_ifindex;
499 	struct net *net;
500 
501 	net = fib6_config_get_net(cfg);
502 
503 	/* note that vrf_table was already set by parse_nla_vrftable() */
504 	vrf_ifindex = l3mdev_ifindex_lookup_by_table_id(L3MDEV_TYPE_VRF, net,
505 							info->vrf_table);
506 	if (vrf_ifindex < 0) {
507 		if (vrf_ifindex == -EPERM) {
508 			NL_SET_ERR_MSG(extack,
509 				       "Strict mode for VRF is disabled");
510 		} else if (vrf_ifindex == -ENODEV) {
511 			NL_SET_ERR_MSG(extack,
512 				       "Table has no associated VRF device");
513 		} else {
514 			pr_debug("seg6local: SRv6 End.DT* creation error=%d\n",
515 				 vrf_ifindex);
516 		}
517 
518 		return vrf_ifindex;
519 	}
520 
521 	info->net = net;
522 	info->vrf_ifindex = vrf_ifindex;
523 
524 	switch (family) {
525 	case AF_INET:
526 		info->proto = htons(ETH_P_IP);
527 		info->hdrlen = sizeof(struct iphdr);
528 		break;
529 	case AF_INET6:
530 		info->proto = htons(ETH_P_IPV6);
531 		info->hdrlen = sizeof(struct ipv6hdr);
532 		break;
533 	default:
534 		return -EINVAL;
535 	}
536 
537 	info->family = family;
538 	info->mode = DT_VRF_MODE;
539 
540 	return 0;
541 }
542 
543 /* The SRv6 End.DT4/DT6 behavior extracts the inner (IPv4/IPv6) packet and
544  * routes the IPv4/IPv6 packet by looking at the configured routing table.
545  *
546  * In the SRv6 End.DT4/DT6 use case, we can receive traffic (IPv6+Segment
547  * Routing Header packets) from several interfaces and the outer IPv6
548  * destination address (DA) is used for retrieving the specific instance of the
549  * End.DT4/DT6 behavior that should process the packets.
550  *
551  * However, the inner IPv4/IPv6 packet is not really bound to any receiving
552  * interface and thus the End.DT4/DT6 sets the VRF (associated with the
553  * corresponding routing table) as the *receiving* interface.
554  * In other words, the End.DT4/DT6 processes a packet as if it has been received
555  * directly by the VRF (and not by one of its slave devices, if any).
556  * In this way, the VRF interface is used for routing the IPv4/IPv6 packet in
557  * according to the routing table configured by the End.DT4/DT6 instance.
558  *
559  * This design allows you to get some interesting features like:
560  *  1) the statistics on rx packets;
561  *  2) the possibility to install a packet sniffer on the receiving interface
562  *     (the VRF one) for looking at the incoming packets;
563  *  3) the possibility to leverage the netfilter prerouting hook for the inner
564  *     IPv4 packet.
565  *
566  * This function returns:
567  *  - the sk_buff* when the VRF rcv handler has processed the packet correctly;
568  *  - NULL when the skb is consumed by the VRF rcv handler;
569  *  - a pointer which encodes a negative error number in case of error.
570  *    Note that in this case, the function takes care of freeing the skb.
571  */
572 static struct sk_buff *end_dt_vrf_rcv(struct sk_buff *skb, u16 family,
573 				      struct net_device *dev)
574 {
575 	/* based on l3mdev_ip_rcv; we are only interested in the master */
576 	if (unlikely(!netif_is_l3_master(dev) && !netif_has_l3_rx_handler(dev)))
577 		goto drop;
578 
579 	if (unlikely(!dev->l3mdev_ops->l3mdev_l3_rcv))
580 		goto drop;
581 
582 	/* the decap packet IPv4/IPv6 does not come with any mac header info.
583 	 * We must unset the mac header to allow the VRF device to rebuild it,
584 	 * just in case there is a sniffer attached on the device.
585 	 */
586 	skb_unset_mac_header(skb);
587 
588 	skb = dev->l3mdev_ops->l3mdev_l3_rcv(dev, skb, family);
589 	if (!skb)
590 		/* the skb buffer was consumed by the handler */
591 		return NULL;
592 
593 	/* when a packet is received by a VRF or by one of its slaves, the
594 	 * master device reference is set into the skb.
595 	 */
596 	if (unlikely(skb->dev != dev || skb->skb_iif != dev->ifindex))
597 		goto drop;
598 
599 	return skb;
600 
601 drop:
602 	kfree_skb(skb);
603 	return ERR_PTR(-EINVAL);
604 }
605 
606 static struct net_device *end_dt_get_vrf_rcu(struct sk_buff *skb,
607 					     struct seg6_end_dt_info *info)
608 {
609 	int vrf_ifindex = info->vrf_ifindex;
610 	struct net *net = info->net;
611 
612 	if (unlikely(vrf_ifindex < 0))
613 		goto error;
614 
615 	if (unlikely(!net_eq(dev_net(skb->dev), net)))
616 		goto error;
617 
618 	return dev_get_by_index_rcu(net, vrf_ifindex);
619 
620 error:
621 	return NULL;
622 }
623 
624 static struct sk_buff *end_dt_vrf_core(struct sk_buff *skb,
625 				       struct seg6_local_lwt *slwt)
626 {
627 	struct seg6_end_dt_info *info = &slwt->dt_info;
628 	struct net_device *vrf;
629 
630 	vrf = end_dt_get_vrf_rcu(skb, info);
631 	if (unlikely(!vrf))
632 		goto drop;
633 
634 	skb->protocol = info->proto;
635 
636 	skb_dst_drop(skb);
637 
638 	skb_set_transport_header(skb, info->hdrlen);
639 
640 	return end_dt_vrf_rcv(skb, info->family, vrf);
641 
642 drop:
643 	kfree_skb(skb);
644 	return ERR_PTR(-EINVAL);
645 }
646 
647 static int input_action_end_dt4(struct sk_buff *skb,
648 				struct seg6_local_lwt *slwt)
649 {
650 	struct iphdr *iph;
651 	int err;
652 
653 	if (!decap_and_validate(skb, IPPROTO_IPIP))
654 		goto drop;
655 
656 	if (!pskb_may_pull(skb, sizeof(struct iphdr)))
657 		goto drop;
658 
659 	skb = end_dt_vrf_core(skb, slwt);
660 	if (!skb)
661 		/* packet has been processed and consumed by the VRF */
662 		return 0;
663 
664 	if (IS_ERR(skb))
665 		return PTR_ERR(skb);
666 
667 	iph = ip_hdr(skb);
668 
669 	err = ip_route_input(skb, iph->daddr, iph->saddr, 0, skb->dev);
670 	if (unlikely(err))
671 		goto drop;
672 
673 	return dst_input(skb);
674 
675 drop:
676 	kfree_skb(skb);
677 	return -EINVAL;
678 }
679 
680 static int seg6_end_dt4_build(struct seg6_local_lwt *slwt, const void *cfg,
681 			      struct netlink_ext_ack *extack)
682 {
683 	return __seg6_end_dt_vrf_build(slwt, cfg, AF_INET, extack);
684 }
685 
686 static enum
687 seg6_end_dt_mode seg6_end_dt6_parse_mode(struct seg6_local_lwt *slwt)
688 {
689 	unsigned long parsed_optattrs = slwt->parsed_optattrs;
690 	bool legacy, vrfmode;
691 
692 	legacy	= !!(parsed_optattrs & SEG6_F_ATTR(SEG6_LOCAL_TABLE));
693 	vrfmode	= !!(parsed_optattrs & SEG6_F_ATTR(SEG6_LOCAL_VRFTABLE));
694 
695 	if (!(legacy ^ vrfmode))
696 		/* both are absent or present: invalid DT6 mode */
697 		return DT_INVALID_MODE;
698 
699 	return legacy ? DT_LEGACY_MODE : DT_VRF_MODE;
700 }
701 
702 static enum seg6_end_dt_mode seg6_end_dt6_get_mode(struct seg6_local_lwt *slwt)
703 {
704 	struct seg6_end_dt_info *info = &slwt->dt_info;
705 
706 	return info->mode;
707 }
708 
709 static int seg6_end_dt6_build(struct seg6_local_lwt *slwt, const void *cfg,
710 			      struct netlink_ext_ack *extack)
711 {
712 	enum seg6_end_dt_mode mode = seg6_end_dt6_parse_mode(slwt);
713 	struct seg6_end_dt_info *info = &slwt->dt_info;
714 
715 	switch (mode) {
716 	case DT_LEGACY_MODE:
717 		info->mode = DT_LEGACY_MODE;
718 		return 0;
719 	case DT_VRF_MODE:
720 		return __seg6_end_dt_vrf_build(slwt, cfg, AF_INET6, extack);
721 	default:
722 		NL_SET_ERR_MSG(extack, "table or vrftable must be specified");
723 		return -EINVAL;
724 	}
725 }
726 #endif
727 
728 static int input_action_end_dt6(struct sk_buff *skb,
729 				struct seg6_local_lwt *slwt)
730 {
731 	if (!decap_and_validate(skb, IPPROTO_IPV6))
732 		goto drop;
733 
734 	if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
735 		goto drop;
736 
737 #ifdef CONFIG_NET_L3_MASTER_DEV
738 	if (seg6_end_dt6_get_mode(slwt) == DT_LEGACY_MODE)
739 		goto legacy_mode;
740 
741 	/* DT6_VRF_MODE */
742 	skb = end_dt_vrf_core(skb, slwt);
743 	if (!skb)
744 		/* packet has been processed and consumed by the VRF */
745 		return 0;
746 
747 	if (IS_ERR(skb))
748 		return PTR_ERR(skb);
749 
750 	/* note: this time we do not need to specify the table because the VRF
751 	 * takes care of selecting the correct table.
752 	 */
753 	seg6_lookup_any_nexthop(skb, NULL, 0, true);
754 
755 	return dst_input(skb);
756 
757 legacy_mode:
758 #endif
759 	skb_set_transport_header(skb, sizeof(struct ipv6hdr));
760 
761 	seg6_lookup_any_nexthop(skb, NULL, slwt->table, true);
762 
763 	return dst_input(skb);
764 
765 drop:
766 	kfree_skb(skb);
767 	return -EINVAL;
768 }
769 
770 /* push an SRH on top of the current one */
771 static int input_action_end_b6(struct sk_buff *skb, struct seg6_local_lwt *slwt)
772 {
773 	struct ipv6_sr_hdr *srh;
774 	int err = -EINVAL;
775 
776 	srh = get_and_validate_srh(skb);
777 	if (!srh)
778 		goto drop;
779 
780 	err = seg6_do_srh_inline(skb, slwt->srh);
781 	if (err)
782 		goto drop;
783 
784 	ipv6_hdr(skb)->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
785 	skb_set_transport_header(skb, sizeof(struct ipv6hdr));
786 
787 	seg6_lookup_nexthop(skb, NULL, 0);
788 
789 	return dst_input(skb);
790 
791 drop:
792 	kfree_skb(skb);
793 	return err;
794 }
795 
796 /* encapsulate within an outer IPv6 header and a specified SRH */
797 static int input_action_end_b6_encap(struct sk_buff *skb,
798 				     struct seg6_local_lwt *slwt)
799 {
800 	struct ipv6_sr_hdr *srh;
801 	int err = -EINVAL;
802 
803 	srh = get_and_validate_srh(skb);
804 	if (!srh)
805 		goto drop;
806 
807 	advance_nextseg(srh, &ipv6_hdr(skb)->daddr);
808 
809 	skb_reset_inner_headers(skb);
810 	skb->encapsulation = 1;
811 
812 	err = seg6_do_srh_encap(skb, slwt->srh, IPPROTO_IPV6);
813 	if (err)
814 		goto drop;
815 
816 	ipv6_hdr(skb)->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
817 	skb_set_transport_header(skb, sizeof(struct ipv6hdr));
818 
819 	seg6_lookup_nexthop(skb, NULL, 0);
820 
821 	return dst_input(skb);
822 
823 drop:
824 	kfree_skb(skb);
825 	return err;
826 }
827 
828 DEFINE_PER_CPU(struct seg6_bpf_srh_state, seg6_bpf_srh_states);
829 
830 bool seg6_bpf_has_valid_srh(struct sk_buff *skb)
831 {
832 	struct seg6_bpf_srh_state *srh_state =
833 		this_cpu_ptr(&seg6_bpf_srh_states);
834 	struct ipv6_sr_hdr *srh = srh_state->srh;
835 
836 	if (unlikely(srh == NULL))
837 		return false;
838 
839 	if (unlikely(!srh_state->valid)) {
840 		if ((srh_state->hdrlen & 7) != 0)
841 			return false;
842 
843 		srh->hdrlen = (u8)(srh_state->hdrlen >> 3);
844 		if (!seg6_validate_srh(srh, (srh->hdrlen + 1) << 3, true))
845 			return false;
846 
847 		srh_state->valid = true;
848 	}
849 
850 	return true;
851 }
852 
853 static int input_action_end_bpf(struct sk_buff *skb,
854 				struct seg6_local_lwt *slwt)
855 {
856 	struct seg6_bpf_srh_state *srh_state =
857 		this_cpu_ptr(&seg6_bpf_srh_states);
858 	struct ipv6_sr_hdr *srh;
859 	int ret;
860 
861 	srh = get_and_validate_srh(skb);
862 	if (!srh) {
863 		kfree_skb(skb);
864 		return -EINVAL;
865 	}
866 	advance_nextseg(srh, &ipv6_hdr(skb)->daddr);
867 
868 	/* preempt_disable is needed to protect the per-CPU buffer srh_state,
869 	 * which is also accessed by the bpf_lwt_seg6_* helpers
870 	 */
871 	preempt_disable();
872 	srh_state->srh = srh;
873 	srh_state->hdrlen = srh->hdrlen << 3;
874 	srh_state->valid = true;
875 
876 	rcu_read_lock();
877 	bpf_compute_data_pointers(skb);
878 	ret = bpf_prog_run_save_cb(slwt->bpf.prog, skb);
879 	rcu_read_unlock();
880 
881 	switch (ret) {
882 	case BPF_OK:
883 	case BPF_REDIRECT:
884 		break;
885 	case BPF_DROP:
886 		goto drop;
887 	default:
888 		pr_warn_once("bpf-seg6local: Illegal return value %u\n", ret);
889 		goto drop;
890 	}
891 
892 	if (srh_state->srh && !seg6_bpf_has_valid_srh(skb))
893 		goto drop;
894 
895 	preempt_enable();
896 	if (ret != BPF_REDIRECT)
897 		seg6_lookup_nexthop(skb, NULL, 0);
898 
899 	return dst_input(skb);
900 
901 drop:
902 	preempt_enable();
903 	kfree_skb(skb);
904 	return -EINVAL;
905 }
906 
907 static struct seg6_action_desc seg6_action_table[] = {
908 	{
909 		.action		= SEG6_LOCAL_ACTION_END,
910 		.attrs		= 0,
911 		.optattrs	= SEG6_F_LOCAL_COUNTERS,
912 		.input		= input_action_end,
913 	},
914 	{
915 		.action		= SEG6_LOCAL_ACTION_END_X,
916 		.attrs		= SEG6_F_ATTR(SEG6_LOCAL_NH6),
917 		.optattrs	= SEG6_F_LOCAL_COUNTERS,
918 		.input		= input_action_end_x,
919 	},
920 	{
921 		.action		= SEG6_LOCAL_ACTION_END_T,
922 		.attrs		= SEG6_F_ATTR(SEG6_LOCAL_TABLE),
923 		.optattrs	= SEG6_F_LOCAL_COUNTERS,
924 		.input		= input_action_end_t,
925 	},
926 	{
927 		.action		= SEG6_LOCAL_ACTION_END_DX2,
928 		.attrs		= SEG6_F_ATTR(SEG6_LOCAL_OIF),
929 		.optattrs	= SEG6_F_LOCAL_COUNTERS,
930 		.input		= input_action_end_dx2,
931 	},
932 	{
933 		.action		= SEG6_LOCAL_ACTION_END_DX6,
934 		.attrs		= SEG6_F_ATTR(SEG6_LOCAL_NH6),
935 		.optattrs	= SEG6_F_LOCAL_COUNTERS,
936 		.input		= input_action_end_dx6,
937 	},
938 	{
939 		.action		= SEG6_LOCAL_ACTION_END_DX4,
940 		.attrs		= SEG6_F_ATTR(SEG6_LOCAL_NH4),
941 		.optattrs	= SEG6_F_LOCAL_COUNTERS,
942 		.input		= input_action_end_dx4,
943 	},
944 	{
945 		.action		= SEG6_LOCAL_ACTION_END_DT4,
946 		.attrs		= SEG6_F_ATTR(SEG6_LOCAL_VRFTABLE),
947 		.optattrs	= SEG6_F_LOCAL_COUNTERS,
948 #ifdef CONFIG_NET_L3_MASTER_DEV
949 		.input		= input_action_end_dt4,
950 		.slwt_ops	= {
951 					.build_state = seg6_end_dt4_build,
952 				  },
953 #endif
954 	},
955 	{
956 		.action		= SEG6_LOCAL_ACTION_END_DT6,
957 #ifdef CONFIG_NET_L3_MASTER_DEV
958 		.attrs		= 0,
959 		.optattrs	= SEG6_F_LOCAL_COUNTERS		|
960 				  SEG6_F_ATTR(SEG6_LOCAL_TABLE) |
961 				  SEG6_F_ATTR(SEG6_LOCAL_VRFTABLE),
962 		.slwt_ops	= {
963 					.build_state = seg6_end_dt6_build,
964 				  },
965 #else
966 		.attrs		= SEG6_F_ATTR(SEG6_LOCAL_TABLE),
967 		.optattrs	= SEG6_F_LOCAL_COUNTERS,
968 #endif
969 		.input		= input_action_end_dt6,
970 	},
971 	{
972 		.action		= SEG6_LOCAL_ACTION_END_B6,
973 		.attrs		= SEG6_F_ATTR(SEG6_LOCAL_SRH),
974 		.optattrs	= SEG6_F_LOCAL_COUNTERS,
975 		.input		= input_action_end_b6,
976 	},
977 	{
978 		.action		= SEG6_LOCAL_ACTION_END_B6_ENCAP,
979 		.attrs		= SEG6_F_ATTR(SEG6_LOCAL_SRH),
980 		.optattrs	= SEG6_F_LOCAL_COUNTERS,
981 		.input		= input_action_end_b6_encap,
982 		.static_headroom	= sizeof(struct ipv6hdr),
983 	},
984 	{
985 		.action		= SEG6_LOCAL_ACTION_END_BPF,
986 		.attrs		= SEG6_F_ATTR(SEG6_LOCAL_BPF),
987 		.optattrs	= SEG6_F_LOCAL_COUNTERS,
988 		.input		= input_action_end_bpf,
989 	},
990 
991 };
992 
993 static struct seg6_action_desc *__get_action_desc(int action)
994 {
995 	struct seg6_action_desc *desc;
996 	int i, count;
997 
998 	count = ARRAY_SIZE(seg6_action_table);
999 	for (i = 0; i < count; i++) {
1000 		desc = &seg6_action_table[i];
1001 		if (desc->action == action)
1002 			return desc;
1003 	}
1004 
1005 	return NULL;
1006 }
1007 
1008 static bool seg6_lwtunnel_counters_enabled(struct seg6_local_lwt *slwt)
1009 {
1010 	return slwt->parsed_optattrs & SEG6_F_LOCAL_COUNTERS;
1011 }
1012 
1013 static void seg6_local_update_counters(struct seg6_local_lwt *slwt,
1014 				       unsigned int len, int err)
1015 {
1016 	struct pcpu_seg6_local_counters *pcounters;
1017 
1018 	pcounters = this_cpu_ptr(slwt->pcpu_counters);
1019 	u64_stats_update_begin(&pcounters->syncp);
1020 
1021 	if (likely(!err)) {
1022 		u64_stats_inc(&pcounters->packets);
1023 		u64_stats_add(&pcounters->bytes, len);
1024 	} else {
1025 		u64_stats_inc(&pcounters->errors);
1026 	}
1027 
1028 	u64_stats_update_end(&pcounters->syncp);
1029 }
1030 
1031 static int seg6_local_input(struct sk_buff *skb)
1032 {
1033 	struct dst_entry *orig_dst = skb_dst(skb);
1034 	struct seg6_action_desc *desc;
1035 	struct seg6_local_lwt *slwt;
1036 	unsigned int len = skb->len;
1037 	int rc;
1038 
1039 	if (skb->protocol != htons(ETH_P_IPV6)) {
1040 		kfree_skb(skb);
1041 		return -EINVAL;
1042 	}
1043 
1044 	slwt = seg6_local_lwtunnel(orig_dst->lwtstate);
1045 	desc = slwt->desc;
1046 
1047 	rc = desc->input(skb, slwt);
1048 
1049 	if (!seg6_lwtunnel_counters_enabled(slwt))
1050 		return rc;
1051 
1052 	seg6_local_update_counters(slwt, len, rc);
1053 
1054 	return rc;
1055 }
1056 
1057 static const struct nla_policy seg6_local_policy[SEG6_LOCAL_MAX + 1] = {
1058 	[SEG6_LOCAL_ACTION]	= { .type = NLA_U32 },
1059 	[SEG6_LOCAL_SRH]	= { .type = NLA_BINARY },
1060 	[SEG6_LOCAL_TABLE]	= { .type = NLA_U32 },
1061 	[SEG6_LOCAL_VRFTABLE]	= { .type = NLA_U32 },
1062 	[SEG6_LOCAL_NH4]	= { .type = NLA_BINARY,
1063 				    .len = sizeof(struct in_addr) },
1064 	[SEG6_LOCAL_NH6]	= { .type = NLA_BINARY,
1065 				    .len = sizeof(struct in6_addr) },
1066 	[SEG6_LOCAL_IIF]	= { .type = NLA_U32 },
1067 	[SEG6_LOCAL_OIF]	= { .type = NLA_U32 },
1068 	[SEG6_LOCAL_BPF]	= { .type = NLA_NESTED },
1069 	[SEG6_LOCAL_COUNTERS]	= { .type = NLA_NESTED },
1070 };
1071 
1072 static int parse_nla_srh(struct nlattr **attrs, struct seg6_local_lwt *slwt)
1073 {
1074 	struct ipv6_sr_hdr *srh;
1075 	int len;
1076 
1077 	srh = nla_data(attrs[SEG6_LOCAL_SRH]);
1078 	len = nla_len(attrs[SEG6_LOCAL_SRH]);
1079 
1080 	/* SRH must contain at least one segment */
1081 	if (len < sizeof(*srh) + sizeof(struct in6_addr))
1082 		return -EINVAL;
1083 
1084 	if (!seg6_validate_srh(srh, len, false))
1085 		return -EINVAL;
1086 
1087 	slwt->srh = kmemdup(srh, len, GFP_KERNEL);
1088 	if (!slwt->srh)
1089 		return -ENOMEM;
1090 
1091 	slwt->headroom += len;
1092 
1093 	return 0;
1094 }
1095 
1096 static int put_nla_srh(struct sk_buff *skb, struct seg6_local_lwt *slwt)
1097 {
1098 	struct ipv6_sr_hdr *srh;
1099 	struct nlattr *nla;
1100 	int len;
1101 
1102 	srh = slwt->srh;
1103 	len = (srh->hdrlen + 1) << 3;
1104 
1105 	nla = nla_reserve(skb, SEG6_LOCAL_SRH, len);
1106 	if (!nla)
1107 		return -EMSGSIZE;
1108 
1109 	memcpy(nla_data(nla), srh, len);
1110 
1111 	return 0;
1112 }
1113 
1114 static int cmp_nla_srh(struct seg6_local_lwt *a, struct seg6_local_lwt *b)
1115 {
1116 	int len = (a->srh->hdrlen + 1) << 3;
1117 
1118 	if (len != ((b->srh->hdrlen + 1) << 3))
1119 		return 1;
1120 
1121 	return memcmp(a->srh, b->srh, len);
1122 }
1123 
1124 static void destroy_attr_srh(struct seg6_local_lwt *slwt)
1125 {
1126 	kfree(slwt->srh);
1127 }
1128 
1129 static int parse_nla_table(struct nlattr **attrs, struct seg6_local_lwt *slwt)
1130 {
1131 	slwt->table = nla_get_u32(attrs[SEG6_LOCAL_TABLE]);
1132 
1133 	return 0;
1134 }
1135 
1136 static int put_nla_table(struct sk_buff *skb, struct seg6_local_lwt *slwt)
1137 {
1138 	if (nla_put_u32(skb, SEG6_LOCAL_TABLE, slwt->table))
1139 		return -EMSGSIZE;
1140 
1141 	return 0;
1142 }
1143 
1144 static int cmp_nla_table(struct seg6_local_lwt *a, struct seg6_local_lwt *b)
1145 {
1146 	if (a->table != b->table)
1147 		return 1;
1148 
1149 	return 0;
1150 }
1151 
1152 static struct
1153 seg6_end_dt_info *seg6_possible_end_dt_info(struct seg6_local_lwt *slwt)
1154 {
1155 #ifdef CONFIG_NET_L3_MASTER_DEV
1156 	return &slwt->dt_info;
1157 #else
1158 	return ERR_PTR(-EOPNOTSUPP);
1159 #endif
1160 }
1161 
1162 static int parse_nla_vrftable(struct nlattr **attrs,
1163 			      struct seg6_local_lwt *slwt)
1164 {
1165 	struct seg6_end_dt_info *info = seg6_possible_end_dt_info(slwt);
1166 
1167 	if (IS_ERR(info))
1168 		return PTR_ERR(info);
1169 
1170 	info->vrf_table = nla_get_u32(attrs[SEG6_LOCAL_VRFTABLE]);
1171 
1172 	return 0;
1173 }
1174 
1175 static int put_nla_vrftable(struct sk_buff *skb, struct seg6_local_lwt *slwt)
1176 {
1177 	struct seg6_end_dt_info *info = seg6_possible_end_dt_info(slwt);
1178 
1179 	if (IS_ERR(info))
1180 		return PTR_ERR(info);
1181 
1182 	if (nla_put_u32(skb, SEG6_LOCAL_VRFTABLE, info->vrf_table))
1183 		return -EMSGSIZE;
1184 
1185 	return 0;
1186 }
1187 
1188 static int cmp_nla_vrftable(struct seg6_local_lwt *a, struct seg6_local_lwt *b)
1189 {
1190 	struct seg6_end_dt_info *info_a = seg6_possible_end_dt_info(a);
1191 	struct seg6_end_dt_info *info_b = seg6_possible_end_dt_info(b);
1192 
1193 	if (info_a->vrf_table != info_b->vrf_table)
1194 		return 1;
1195 
1196 	return 0;
1197 }
1198 
1199 static int parse_nla_nh4(struct nlattr **attrs, struct seg6_local_lwt *slwt)
1200 {
1201 	memcpy(&slwt->nh4, nla_data(attrs[SEG6_LOCAL_NH4]),
1202 	       sizeof(struct in_addr));
1203 
1204 	return 0;
1205 }
1206 
1207 static int put_nla_nh4(struct sk_buff *skb, struct seg6_local_lwt *slwt)
1208 {
1209 	struct nlattr *nla;
1210 
1211 	nla = nla_reserve(skb, SEG6_LOCAL_NH4, sizeof(struct in_addr));
1212 	if (!nla)
1213 		return -EMSGSIZE;
1214 
1215 	memcpy(nla_data(nla), &slwt->nh4, sizeof(struct in_addr));
1216 
1217 	return 0;
1218 }
1219 
1220 static int cmp_nla_nh4(struct seg6_local_lwt *a, struct seg6_local_lwt *b)
1221 {
1222 	return memcmp(&a->nh4, &b->nh4, sizeof(struct in_addr));
1223 }
1224 
1225 static int parse_nla_nh6(struct nlattr **attrs, struct seg6_local_lwt *slwt)
1226 {
1227 	memcpy(&slwt->nh6, nla_data(attrs[SEG6_LOCAL_NH6]),
1228 	       sizeof(struct in6_addr));
1229 
1230 	return 0;
1231 }
1232 
1233 static int put_nla_nh6(struct sk_buff *skb, struct seg6_local_lwt *slwt)
1234 {
1235 	struct nlattr *nla;
1236 
1237 	nla = nla_reserve(skb, SEG6_LOCAL_NH6, sizeof(struct in6_addr));
1238 	if (!nla)
1239 		return -EMSGSIZE;
1240 
1241 	memcpy(nla_data(nla), &slwt->nh6, sizeof(struct in6_addr));
1242 
1243 	return 0;
1244 }
1245 
1246 static int cmp_nla_nh6(struct seg6_local_lwt *a, struct seg6_local_lwt *b)
1247 {
1248 	return memcmp(&a->nh6, &b->nh6, sizeof(struct in6_addr));
1249 }
1250 
1251 static int parse_nla_iif(struct nlattr **attrs, struct seg6_local_lwt *slwt)
1252 {
1253 	slwt->iif = nla_get_u32(attrs[SEG6_LOCAL_IIF]);
1254 
1255 	return 0;
1256 }
1257 
1258 static int put_nla_iif(struct sk_buff *skb, struct seg6_local_lwt *slwt)
1259 {
1260 	if (nla_put_u32(skb, SEG6_LOCAL_IIF, slwt->iif))
1261 		return -EMSGSIZE;
1262 
1263 	return 0;
1264 }
1265 
1266 static int cmp_nla_iif(struct seg6_local_lwt *a, struct seg6_local_lwt *b)
1267 {
1268 	if (a->iif != b->iif)
1269 		return 1;
1270 
1271 	return 0;
1272 }
1273 
1274 static int parse_nla_oif(struct nlattr **attrs, struct seg6_local_lwt *slwt)
1275 {
1276 	slwt->oif = nla_get_u32(attrs[SEG6_LOCAL_OIF]);
1277 
1278 	return 0;
1279 }
1280 
1281 static int put_nla_oif(struct sk_buff *skb, struct seg6_local_lwt *slwt)
1282 {
1283 	if (nla_put_u32(skb, SEG6_LOCAL_OIF, slwt->oif))
1284 		return -EMSGSIZE;
1285 
1286 	return 0;
1287 }
1288 
1289 static int cmp_nla_oif(struct seg6_local_lwt *a, struct seg6_local_lwt *b)
1290 {
1291 	if (a->oif != b->oif)
1292 		return 1;
1293 
1294 	return 0;
1295 }
1296 
1297 #define MAX_PROG_NAME 256
1298 static const struct nla_policy bpf_prog_policy[SEG6_LOCAL_BPF_PROG_MAX + 1] = {
1299 	[SEG6_LOCAL_BPF_PROG]	   = { .type = NLA_U32, },
1300 	[SEG6_LOCAL_BPF_PROG_NAME] = { .type = NLA_NUL_STRING,
1301 				       .len = MAX_PROG_NAME },
1302 };
1303 
1304 static int parse_nla_bpf(struct nlattr **attrs, struct seg6_local_lwt *slwt)
1305 {
1306 	struct nlattr *tb[SEG6_LOCAL_BPF_PROG_MAX + 1];
1307 	struct bpf_prog *p;
1308 	int ret;
1309 	u32 fd;
1310 
1311 	ret = nla_parse_nested_deprecated(tb, SEG6_LOCAL_BPF_PROG_MAX,
1312 					  attrs[SEG6_LOCAL_BPF],
1313 					  bpf_prog_policy, NULL);
1314 	if (ret < 0)
1315 		return ret;
1316 
1317 	if (!tb[SEG6_LOCAL_BPF_PROG] || !tb[SEG6_LOCAL_BPF_PROG_NAME])
1318 		return -EINVAL;
1319 
1320 	slwt->bpf.name = nla_memdup(tb[SEG6_LOCAL_BPF_PROG_NAME], GFP_KERNEL);
1321 	if (!slwt->bpf.name)
1322 		return -ENOMEM;
1323 
1324 	fd = nla_get_u32(tb[SEG6_LOCAL_BPF_PROG]);
1325 	p = bpf_prog_get_type(fd, BPF_PROG_TYPE_LWT_SEG6LOCAL);
1326 	if (IS_ERR(p)) {
1327 		kfree(slwt->bpf.name);
1328 		return PTR_ERR(p);
1329 	}
1330 
1331 	slwt->bpf.prog = p;
1332 	return 0;
1333 }
1334 
1335 static int put_nla_bpf(struct sk_buff *skb, struct seg6_local_lwt *slwt)
1336 {
1337 	struct nlattr *nest;
1338 
1339 	if (!slwt->bpf.prog)
1340 		return 0;
1341 
1342 	nest = nla_nest_start_noflag(skb, SEG6_LOCAL_BPF);
1343 	if (!nest)
1344 		return -EMSGSIZE;
1345 
1346 	if (nla_put_u32(skb, SEG6_LOCAL_BPF_PROG, slwt->bpf.prog->aux->id))
1347 		return -EMSGSIZE;
1348 
1349 	if (slwt->bpf.name &&
1350 	    nla_put_string(skb, SEG6_LOCAL_BPF_PROG_NAME, slwt->bpf.name))
1351 		return -EMSGSIZE;
1352 
1353 	return nla_nest_end(skb, nest);
1354 }
1355 
1356 static int cmp_nla_bpf(struct seg6_local_lwt *a, struct seg6_local_lwt *b)
1357 {
1358 	if (!a->bpf.name && !b->bpf.name)
1359 		return 0;
1360 
1361 	if (!a->bpf.name || !b->bpf.name)
1362 		return 1;
1363 
1364 	return strcmp(a->bpf.name, b->bpf.name);
1365 }
1366 
1367 static void destroy_attr_bpf(struct seg6_local_lwt *slwt)
1368 {
1369 	kfree(slwt->bpf.name);
1370 	if (slwt->bpf.prog)
1371 		bpf_prog_put(slwt->bpf.prog);
1372 }
1373 
1374 static const struct
1375 nla_policy seg6_local_counters_policy[SEG6_LOCAL_CNT_MAX + 1] = {
1376 	[SEG6_LOCAL_CNT_PACKETS]	= { .type = NLA_U64 },
1377 	[SEG6_LOCAL_CNT_BYTES]		= { .type = NLA_U64 },
1378 	[SEG6_LOCAL_CNT_ERRORS]		= { .type = NLA_U64 },
1379 };
1380 
1381 static int parse_nla_counters(struct nlattr **attrs,
1382 			      struct seg6_local_lwt *slwt)
1383 {
1384 	struct pcpu_seg6_local_counters __percpu *pcounters;
1385 	struct nlattr *tb[SEG6_LOCAL_CNT_MAX + 1];
1386 	int ret;
1387 
1388 	ret = nla_parse_nested_deprecated(tb, SEG6_LOCAL_CNT_MAX,
1389 					  attrs[SEG6_LOCAL_COUNTERS],
1390 					  seg6_local_counters_policy, NULL);
1391 	if (ret < 0)
1392 		return ret;
1393 
1394 	/* basic support for SRv6 Behavior counters requires at least:
1395 	 * packets, bytes and errors.
1396 	 */
1397 	if (!tb[SEG6_LOCAL_CNT_PACKETS] || !tb[SEG6_LOCAL_CNT_BYTES] ||
1398 	    !tb[SEG6_LOCAL_CNT_ERRORS])
1399 		return -EINVAL;
1400 
1401 	/* counters are always zero initialized */
1402 	pcounters = seg6_local_alloc_pcpu_counters(GFP_KERNEL);
1403 	if (!pcounters)
1404 		return -ENOMEM;
1405 
1406 	slwt->pcpu_counters = pcounters;
1407 
1408 	return 0;
1409 }
1410 
1411 static int seg6_local_fill_nla_counters(struct sk_buff *skb,
1412 					struct seg6_local_counters *counters)
1413 {
1414 	if (nla_put_u64_64bit(skb, SEG6_LOCAL_CNT_PACKETS, counters->packets,
1415 			      SEG6_LOCAL_CNT_PAD))
1416 		return -EMSGSIZE;
1417 
1418 	if (nla_put_u64_64bit(skb, SEG6_LOCAL_CNT_BYTES, counters->bytes,
1419 			      SEG6_LOCAL_CNT_PAD))
1420 		return -EMSGSIZE;
1421 
1422 	if (nla_put_u64_64bit(skb, SEG6_LOCAL_CNT_ERRORS, counters->errors,
1423 			      SEG6_LOCAL_CNT_PAD))
1424 		return -EMSGSIZE;
1425 
1426 	return 0;
1427 }
1428 
1429 static int put_nla_counters(struct sk_buff *skb, struct seg6_local_lwt *slwt)
1430 {
1431 	struct seg6_local_counters counters = { 0, 0, 0 };
1432 	struct nlattr *nest;
1433 	int rc, i;
1434 
1435 	nest = nla_nest_start(skb, SEG6_LOCAL_COUNTERS);
1436 	if (!nest)
1437 		return -EMSGSIZE;
1438 
1439 	for_each_possible_cpu(i) {
1440 		struct pcpu_seg6_local_counters *pcounters;
1441 		u64 packets, bytes, errors;
1442 		unsigned int start;
1443 
1444 		pcounters = per_cpu_ptr(slwt->pcpu_counters, i);
1445 		do {
1446 			start = u64_stats_fetch_begin_irq(&pcounters->syncp);
1447 
1448 			packets = u64_stats_read(&pcounters->packets);
1449 			bytes = u64_stats_read(&pcounters->bytes);
1450 			errors = u64_stats_read(&pcounters->errors);
1451 
1452 		} while (u64_stats_fetch_retry_irq(&pcounters->syncp, start));
1453 
1454 		counters.packets += packets;
1455 		counters.bytes += bytes;
1456 		counters.errors += errors;
1457 	}
1458 
1459 	rc = seg6_local_fill_nla_counters(skb, &counters);
1460 	if (rc < 0) {
1461 		nla_nest_cancel(skb, nest);
1462 		return rc;
1463 	}
1464 
1465 	return nla_nest_end(skb, nest);
1466 }
1467 
1468 static int cmp_nla_counters(struct seg6_local_lwt *a, struct seg6_local_lwt *b)
1469 {
1470 	/* a and b are equal if both have pcpu_counters set or not */
1471 	return (!!((unsigned long)a->pcpu_counters)) ^
1472 		(!!((unsigned long)b->pcpu_counters));
1473 }
1474 
1475 static void destroy_attr_counters(struct seg6_local_lwt *slwt)
1476 {
1477 	free_percpu(slwt->pcpu_counters);
1478 }
1479 
/* set of callbacks implementing a single seg6local attribute; one instance
 * per attribute ID is stored in the seg6_action_params table.
 */
struct seg6_action_param {
	/* read the attribute from the netlink attribute table @tb and store
	 * it in the tunnel state @state.
	 */
	int (*parse)(struct nlattr **tb, struct seg6_local_lwt *state);

	/* dump the attribute stored in the tunnel state @state into @msg */
	int (*put)(struct sk_buff *msg, struct seg6_local_lwt *state);

	/* compare the attribute of two tunnel states; 0 means equal */
	int (*cmp)(struct seg6_local_lwt *lhs, struct seg6_local_lwt *rhs);

	/* optional: release the resources previously acquired by the
	 * corresponding parse() callback.
	 */
	void (*destroy)(struct seg6_local_lwt *state);
};
1491 
1492 static struct seg6_action_param seg6_action_params[SEG6_LOCAL_MAX + 1] = {
1493 	[SEG6_LOCAL_SRH]	= { .parse = parse_nla_srh,
1494 				    .put = put_nla_srh,
1495 				    .cmp = cmp_nla_srh,
1496 				    .destroy = destroy_attr_srh },
1497 
1498 	[SEG6_LOCAL_TABLE]	= { .parse = parse_nla_table,
1499 				    .put = put_nla_table,
1500 				    .cmp = cmp_nla_table },
1501 
1502 	[SEG6_LOCAL_NH4]	= { .parse = parse_nla_nh4,
1503 				    .put = put_nla_nh4,
1504 				    .cmp = cmp_nla_nh4 },
1505 
1506 	[SEG6_LOCAL_NH6]	= { .parse = parse_nla_nh6,
1507 				    .put = put_nla_nh6,
1508 				    .cmp = cmp_nla_nh6 },
1509 
1510 	[SEG6_LOCAL_IIF]	= { .parse = parse_nla_iif,
1511 				    .put = put_nla_iif,
1512 				    .cmp = cmp_nla_iif },
1513 
1514 	[SEG6_LOCAL_OIF]	= { .parse = parse_nla_oif,
1515 				    .put = put_nla_oif,
1516 				    .cmp = cmp_nla_oif },
1517 
1518 	[SEG6_LOCAL_BPF]	= { .parse = parse_nla_bpf,
1519 				    .put = put_nla_bpf,
1520 				    .cmp = cmp_nla_bpf,
1521 				    .destroy = destroy_attr_bpf },
1522 
1523 	[SEG6_LOCAL_VRFTABLE]	= { .parse = parse_nla_vrftable,
1524 				    .put = put_nla_vrftable,
1525 				    .cmp = cmp_nla_vrftable },
1526 
1527 	[SEG6_LOCAL_COUNTERS]	= { .parse = parse_nla_counters,
1528 				    .put = put_nla_counters,
1529 				    .cmp = cmp_nla_counters,
1530 				    .destroy = destroy_attr_counters },
1531 };
1532 
1533 /* call the destroy() callback (if available) for each set attribute in
1534  * @parsed_attrs, starting from the first attribute up to the @max_parsed
1535  * (excluded) attribute.
1536  */
1537 static void __destroy_attrs(unsigned long parsed_attrs, int max_parsed,
1538 			    struct seg6_local_lwt *slwt)
1539 {
1540 	struct seg6_action_param *param;
1541 	int i;
1542 
1543 	/* Every required seg6local attribute is identified by an ID which is
1544 	 * encoded as a flag (i.e: 1 << ID) in the 'attrs' bitmask;
1545 	 *
1546 	 * We scan the 'parsed_attrs' bitmask, starting from the first attribute
1547 	 * up to the @max_parsed (excluded) attribute.
1548 	 * For each set attribute, we retrieve the corresponding destroy()
1549 	 * callback. If the callback is not available, then we skip to the next
1550 	 * attribute; otherwise, we call the destroy() callback.
1551 	 */
1552 	for (i = 0; i < max_parsed; ++i) {
1553 		if (!(parsed_attrs & SEG6_F_ATTR(i)))
1554 			continue;
1555 
1556 		param = &seg6_action_params[i];
1557 
1558 		if (param->destroy)
1559 			param->destroy(slwt);
1560 	}
1561 }
1562 
1563 /* release all the resources that may have been acquired during parsing
1564  * operations.
1565  */
1566 static void destroy_attrs(struct seg6_local_lwt *slwt)
1567 {
1568 	unsigned long attrs = slwt->desc->attrs | slwt->parsed_optattrs;
1569 
1570 	__destroy_attrs(attrs, SEG6_LOCAL_MAX + 1, slwt);
1571 }
1572 
1573 static int parse_nla_optional_attrs(struct nlattr **attrs,
1574 				    struct seg6_local_lwt *slwt)
1575 {
1576 	struct seg6_action_desc *desc = slwt->desc;
1577 	unsigned long parsed_optattrs = 0;
1578 	struct seg6_action_param *param;
1579 	int err, i;
1580 
1581 	for (i = 0; i < SEG6_LOCAL_MAX + 1; ++i) {
1582 		if (!(desc->optattrs & SEG6_F_ATTR(i)) || !attrs[i])
1583 			continue;
1584 
1585 		/* once here, the i-th attribute is provided by the
1586 		 * userspace AND it is identified optional as well.
1587 		 */
1588 		param = &seg6_action_params[i];
1589 
1590 		err = param->parse(attrs, slwt);
1591 		if (err < 0)
1592 			goto parse_optattrs_err;
1593 
1594 		/* current attribute has been correctly parsed */
1595 		parsed_optattrs |= SEG6_F_ATTR(i);
1596 	}
1597 
1598 	/* store in the tunnel state all the optional attributed successfully
1599 	 * parsed.
1600 	 */
1601 	slwt->parsed_optattrs = parsed_optattrs;
1602 
1603 	return 0;
1604 
1605 parse_optattrs_err:
1606 	__destroy_attrs(parsed_optattrs, i, slwt);
1607 
1608 	return err;
1609 }
1610 
1611 /* call the custom constructor of the behavior during its initialization phase
1612  * and after that all its attributes have been parsed successfully.
1613  */
1614 static int
1615 seg6_local_lwtunnel_build_state(struct seg6_local_lwt *slwt, const void *cfg,
1616 				struct netlink_ext_ack *extack)
1617 {
1618 	struct seg6_action_desc *desc = slwt->desc;
1619 	struct seg6_local_lwtunnel_ops *ops;
1620 
1621 	ops = &desc->slwt_ops;
1622 	if (!ops->build_state)
1623 		return 0;
1624 
1625 	return ops->build_state(slwt, cfg, extack);
1626 }
1627 
1628 /* call the custom destructor of the behavior which is invoked before the
1629  * tunnel is going to be destroyed.
1630  */
1631 static void seg6_local_lwtunnel_destroy_state(struct seg6_local_lwt *slwt)
1632 {
1633 	struct seg6_action_desc *desc = slwt->desc;
1634 	struct seg6_local_lwtunnel_ops *ops;
1635 
1636 	ops = &desc->slwt_ops;
1637 	if (!ops->destroy_state)
1638 		return;
1639 
1640 	ops->destroy_state(slwt);
1641 }
1642 
1643 static int parse_nla_action(struct nlattr **attrs, struct seg6_local_lwt *slwt)
1644 {
1645 	struct seg6_action_param *param;
1646 	struct seg6_action_desc *desc;
1647 	unsigned long invalid_attrs;
1648 	int i, err;
1649 
1650 	desc = __get_action_desc(slwt->action);
1651 	if (!desc)
1652 		return -EINVAL;
1653 
1654 	if (!desc->input)
1655 		return -EOPNOTSUPP;
1656 
1657 	slwt->desc = desc;
1658 	slwt->headroom += desc->static_headroom;
1659 
1660 	/* Forcing the desc->optattrs *set* and the desc->attrs *set* to be
1661 	 * disjoined, this allow us to release acquired resources by optional
1662 	 * attributes and by required attributes independently from each other
1663 	 * without any interference.
1664 	 * In other terms, we are sure that we do not release some the acquired
1665 	 * resources twice.
1666 	 *
1667 	 * Note that if an attribute is configured both as required and as
1668 	 * optional, it means that the user has messed something up in the
1669 	 * seg6_action_table. Therefore, this check is required for SRv6
1670 	 * behaviors to work properly.
1671 	 */
1672 	invalid_attrs = desc->attrs & desc->optattrs;
1673 	if (invalid_attrs) {
1674 		WARN_ONCE(1,
1675 			  "An attribute cannot be both required AND optional");
1676 		return -EINVAL;
1677 	}
1678 
1679 	/* parse the required attributes */
1680 	for (i = 0; i < SEG6_LOCAL_MAX + 1; i++) {
1681 		if (desc->attrs & SEG6_F_ATTR(i)) {
1682 			if (!attrs[i])
1683 				return -EINVAL;
1684 
1685 			param = &seg6_action_params[i];
1686 
1687 			err = param->parse(attrs, slwt);
1688 			if (err < 0)
1689 				goto parse_attrs_err;
1690 		}
1691 	}
1692 
1693 	/* parse the optional attributes, if any */
1694 	err = parse_nla_optional_attrs(attrs, slwt);
1695 	if (err < 0)
1696 		goto parse_attrs_err;
1697 
1698 	return 0;
1699 
1700 parse_attrs_err:
1701 	/* release any resource that may have been acquired during the i-1
1702 	 * parse() operations.
1703 	 */
1704 	__destroy_attrs(desc->attrs, i, slwt);
1705 
1706 	return err;
1707 }
1708 
1709 static int seg6_local_build_state(struct net *net, struct nlattr *nla,
1710 				  unsigned int family, const void *cfg,
1711 				  struct lwtunnel_state **ts,
1712 				  struct netlink_ext_ack *extack)
1713 {
1714 	struct nlattr *tb[SEG6_LOCAL_MAX + 1];
1715 	struct lwtunnel_state *newts;
1716 	struct seg6_local_lwt *slwt;
1717 	int err;
1718 
1719 	if (family != AF_INET6)
1720 		return -EINVAL;
1721 
1722 	err = nla_parse_nested_deprecated(tb, SEG6_LOCAL_MAX, nla,
1723 					  seg6_local_policy, extack);
1724 
1725 	if (err < 0)
1726 		return err;
1727 
1728 	if (!tb[SEG6_LOCAL_ACTION])
1729 		return -EINVAL;
1730 
1731 	newts = lwtunnel_state_alloc(sizeof(*slwt));
1732 	if (!newts)
1733 		return -ENOMEM;
1734 
1735 	slwt = seg6_local_lwtunnel(newts);
1736 	slwt->action = nla_get_u32(tb[SEG6_LOCAL_ACTION]);
1737 
1738 	err = parse_nla_action(tb, slwt);
1739 	if (err < 0)
1740 		goto out_free;
1741 
1742 	err = seg6_local_lwtunnel_build_state(slwt, cfg, extack);
1743 	if (err < 0)
1744 		goto out_destroy_attrs;
1745 
1746 	newts->type = LWTUNNEL_ENCAP_SEG6_LOCAL;
1747 	newts->flags = LWTUNNEL_STATE_INPUT_REDIRECT;
1748 	newts->headroom = slwt->headroom;
1749 
1750 	*ts = newts;
1751 
1752 	return 0;
1753 
1754 out_destroy_attrs:
1755 	destroy_attrs(slwt);
1756 out_free:
1757 	kfree(newts);
1758 	return err;
1759 }
1760 
/* destroy_state() callback of the seg6local encap ops: run the custom
 * destructor of the behavior first, then release the resources held by the
 * parsed attributes.
 */
static void seg6_local_destroy_state(struct lwtunnel_state *lwt)
{
	struct seg6_local_lwt *state = seg6_local_lwtunnel(lwt);

	seg6_local_lwtunnel_destroy_state(state);
	destroy_attrs(state);
}
1771 
1772 static int seg6_local_fill_encap(struct sk_buff *skb,
1773 				 struct lwtunnel_state *lwt)
1774 {
1775 	struct seg6_local_lwt *slwt = seg6_local_lwtunnel(lwt);
1776 	struct seg6_action_param *param;
1777 	unsigned long attrs;
1778 	int i, err;
1779 
1780 	if (nla_put_u32(skb, SEG6_LOCAL_ACTION, slwt->action))
1781 		return -EMSGSIZE;
1782 
1783 	attrs = slwt->desc->attrs | slwt->parsed_optattrs;
1784 
1785 	for (i = 0; i < SEG6_LOCAL_MAX + 1; i++) {
1786 		if (attrs & SEG6_F_ATTR(i)) {
1787 			param = &seg6_action_params[i];
1788 			err = param->put(skb, slwt);
1789 			if (err < 0)
1790 				return err;
1791 		}
1792 	}
1793 
1794 	return 0;
1795 }
1796 
1797 static int seg6_local_get_encap_size(struct lwtunnel_state *lwt)
1798 {
1799 	struct seg6_local_lwt *slwt = seg6_local_lwtunnel(lwt);
1800 	unsigned long attrs;
1801 	int nlsize;
1802 
1803 	nlsize = nla_total_size(4); /* action */
1804 
1805 	attrs = slwt->desc->attrs | slwt->parsed_optattrs;
1806 
1807 	if (attrs & SEG6_F_ATTR(SEG6_LOCAL_SRH))
1808 		nlsize += nla_total_size((slwt->srh->hdrlen + 1) << 3);
1809 
1810 	if (attrs & SEG6_F_ATTR(SEG6_LOCAL_TABLE))
1811 		nlsize += nla_total_size(4);
1812 
1813 	if (attrs & SEG6_F_ATTR(SEG6_LOCAL_NH4))
1814 		nlsize += nla_total_size(4);
1815 
1816 	if (attrs & SEG6_F_ATTR(SEG6_LOCAL_NH6))
1817 		nlsize += nla_total_size(16);
1818 
1819 	if (attrs & SEG6_F_ATTR(SEG6_LOCAL_IIF))
1820 		nlsize += nla_total_size(4);
1821 
1822 	if (attrs & SEG6_F_ATTR(SEG6_LOCAL_OIF))
1823 		nlsize += nla_total_size(4);
1824 
1825 	if (attrs & SEG6_F_ATTR(SEG6_LOCAL_BPF))
1826 		nlsize += nla_total_size(sizeof(struct nlattr)) +
1827 		       nla_total_size(MAX_PROG_NAME) +
1828 		       nla_total_size(4);
1829 
1830 	if (attrs & SEG6_F_ATTR(SEG6_LOCAL_VRFTABLE))
1831 		nlsize += nla_total_size(4);
1832 
1833 	if (attrs & SEG6_F_LOCAL_COUNTERS)
1834 		nlsize += nla_total_size(0) + /* nest SEG6_LOCAL_COUNTERS */
1835 			  /* SEG6_LOCAL_CNT_PACKETS */
1836 			  nla_total_size_64bit(sizeof(__u64)) +
1837 			  /* SEG6_LOCAL_CNT_BYTES */
1838 			  nla_total_size_64bit(sizeof(__u64)) +
1839 			  /* SEG6_LOCAL_CNT_ERRORS */
1840 			  nla_total_size_64bit(sizeof(__u64));
1841 
1842 	return nlsize;
1843 }
1844 
1845 static int seg6_local_cmp_encap(struct lwtunnel_state *a,
1846 				struct lwtunnel_state *b)
1847 {
1848 	struct seg6_local_lwt *slwt_a, *slwt_b;
1849 	struct seg6_action_param *param;
1850 	unsigned long attrs_a, attrs_b;
1851 	int i;
1852 
1853 	slwt_a = seg6_local_lwtunnel(a);
1854 	slwt_b = seg6_local_lwtunnel(b);
1855 
1856 	if (slwt_a->action != slwt_b->action)
1857 		return 1;
1858 
1859 	attrs_a = slwt_a->desc->attrs | slwt_a->parsed_optattrs;
1860 	attrs_b = slwt_b->desc->attrs | slwt_b->parsed_optattrs;
1861 
1862 	if (attrs_a != attrs_b)
1863 		return 1;
1864 
1865 	for (i = 0; i < SEG6_LOCAL_MAX + 1; i++) {
1866 		if (attrs_a & SEG6_F_ATTR(i)) {
1867 			param = &seg6_action_params[i];
1868 			if (param->cmp(slwt_a, slwt_b))
1869 				return 1;
1870 		}
1871 	}
1872 
1873 	return 0;
1874 }
1875 
1876 static const struct lwtunnel_encap_ops seg6_local_ops = {
1877 	.build_state	= seg6_local_build_state,
1878 	.destroy_state	= seg6_local_destroy_state,
1879 	.input		= seg6_local_input,
1880 	.fill_encap	= seg6_local_fill_encap,
1881 	.get_encap_size	= seg6_local_get_encap_size,
1882 	.cmp_encap	= seg6_local_cmp_encap,
1883 	.owner		= THIS_MODULE,
1884 };
1885 
1886 int __init seg6_local_init(void)
1887 {
1888 	/* If the max total number of defined attributes is reached, then your
1889 	 * kernel build stops here.
1890 	 *
1891 	 * This check is required to avoid arithmetic overflows when processing
1892 	 * behavior attributes and the maximum number of defined attributes
1893 	 * exceeds the allowed value.
1894 	 */
1895 	BUILD_BUG_ON(SEG6_LOCAL_MAX + 1 > BITS_PER_TYPE(unsigned long));
1896 
1897 	return lwtunnel_encap_add_ops(&seg6_local_ops,
1898 				      LWTUNNEL_ENCAP_SEG6_LOCAL);
1899 }
1900 
1901 void seg6_local_exit(void)
1902 {
1903 	lwtunnel_encap_del_ops(&seg6_local_ops, LWTUNNEL_ENCAP_SEG6_LOCAL);
1904 }
1905