1 // SPDX-License-Identifier: GPL-2.0
2 // Copyright (c) 2022 Meta
3 
4 #include <stddef.h>
5 #include <stdint.h>
6 #include <stdbool.h>
7 #include <linux/bpf.h>
8 #include <linux/stddef.h>
9 #include <linux/pkt_cls.h>
10 #include <linux/if_ether.h>
11 #include <linux/in.h>
12 #include <linux/ip.h>
13 #include <linux/ipv6.h>
14 #include <linux/tcp.h>
15 #include <linux/udp.h>
16 #include <bpf/bpf_helpers.h>
17 #include <bpf/bpf_endian.h>
18 #include <sys/socket.h>
19 
/* veth_src --- veth_src_fwd --- veth_dst_fwd --- veth_dst
21  *           |                                 |
22  *  ns_src   |              ns_fwd             |   ns_dst
23  *
24  * ns_src and ns_dst: ENDHOST namespace
 *            ns_fwd: Forwarding namespace
26  */
27 
/* Turn an integer-typed __sk_buff field (skb->data, skb->data_end) into a
 * pointer.  The whole expansion is parenthesized so it can be used inside
 * any larger expression.
 */
#define ctx_ptr(field)		((void *)(long)(field))

/* IPv4 source addresses compared against iph->saddr to tell which end-host
 * namespace sent the packet.
 */
#define ip4_src			__bpf_htonl(0xac100164) /* 172.16.1.100 */
#define ip4_dst			__bpf_htonl(0xac100264) /* 172.16.2.100 */

/* IPv6 counterparts.  These expand to brace initializers, so use them as
 * (struct in6_addr)ip6_src to obtain a value.
 */
#define ip6_src			{ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \
				  0x00, 0x01, 0xde, 0xad, 0xbe, 0xef, 0xca, 0xfe }
#define ip6_dst			{ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \
				  0x00, 0x02, 0xde, 0xad, 0xbe, 0xef, 0xca, 0xfe }

/* True when the two addresses are equal in all four 32-bit words.
 * Both arguments are parenthesized so that expressions such as *ptr can be
 * passed; note that each argument is evaluated up to four times.
 */
#define v6_equal(a, b)		((a).s6_addr32[0] == (b).s6_addr32[0] && \
				 (a).s6_addr32[1] == (b).s6_addr32[1] && \
				 (a).s6_addr32[2] == (b).s6_addr32[2] && \
				 (a).s6_addr32[3] == (b).s6_addr32[3])
42 
/* Interface indexes handed to bpf_redirect_neigh() by ingress_fwdns_prio101:
 * traffic sent by ns_src is redirected to IFINDEX_DST and traffic sent by
 * ns_dst to IFINDEX_SRC.
 * NOTE(review): never assigned in this file; presumably filled in by the
 * loader before the programs run -- confirm.
 */
volatile const __u32 IFINDEX_SRC;
volatile const __u32 IFINDEX_DST;

/* Marker values written into skb->tstamp so that the next hook on the path
 * can tell which program stamped the skb last.
 */
#define EGRESS_ENDHOST_MAGIC	0x0b9fbeef /* written by egress_host */
#define INGRESS_FWDNS_MAGIC	0x1b9fbeef /* written by ingress_fwdns_prio101 */
#define EGRESS_FWDNS_MAGIC	0x2b9fbeef /* written by egress_fwdns_prio101 */
49 
/* Second index of dtimes[][] and errs[][]: one counter slot per tc program
 * in this file, plus SET_DTIME for unexpected bpf_skb_set_tstamp() results.
 */
enum {
	INGRESS_FWDNS_P100,	/* ingress_fwdns_prio100 */
	INGRESS_FWDNS_P101,	/* ingress_fwdns_prio101 */
	EGRESS_FWDNS_P100,	/* egress_fwdns_prio100 */
	EGRESS_FWDNS_P101,	/* egress_fwdns_prio101 */
	INGRESS_ENDHOST,	/* ingress_host */
	EGRESS_ENDHOST,		/* egress_host */
	SET_DTIME,		/* bpf_skb_set_tstamp() did not behave as expected */
	__MAX_CNT,		/* number of counter slots, not a counter itself */
};
60 
/* Values of the global 'test'.  The value selects the row of dtimes[][] and
 * errs[][], the L4 protocol being matched (get_proto()) and the port
 * (50000 + test) matched by skb_get_type().
 *
 * Tests numbered below TCP_IP4_RT_FWD are forwarded with
 * bpf_redirect_neigh() (see bpf_fwd()); for the *_RT_FWD tests
 * ingress_fwdns_prio101 returns TC_ACT_OK instead.
 */
enum {
	TCP_IP6_CLEAR_DTIME,	/* ns_fwd prio100 programs zero skb->tstamp */
	TCP_IP4,
	TCP_IP6,
	UDP_IP4,
	UDP_IP6,
	TCP_IP4_RT_FWD,
	TCP_IP6_RT_FWD,
	UDP_IP4_RT_FWD,
	UDP_IP6_RT_FWD,
	UKN_TEST,		/* row used when 'test' is out of range */
	__NR_TESTS,		/* number of rows, not a test */
};
74 
/* Sender namespace as stored in the second byte of skb_get_type()'s return
 * value.  Values start at 1 so that 0 can mean "sender not recognized".
 */
enum {
	SRC_NS = 1,	/* source address is ip4_src / ip6_src */
	DST_NS,		/* source address is ip4_dst / ip6_dst */
};
79 
/* Per-test, per-hook counters.  dtimes[][] is bumped by inc_dtimes() when a
 * hook sees the delivery time it expects, errs[][] by inc_errs() when it
 * does not.
 * NOTE(review): nothing in this file reads these counters; presumably the
 * userspace side of the test inspects them -- confirm.
 */
__u32 dtimes[__NR_TESTS][__MAX_CNT] = {};
__u32 errs[__NR_TESTS][__MAX_CNT] = {};
/* Id of the test currently running (one of the *_IP4 / *_IP6 enum values).
 * NOTE(review): only read in this file; presumably set from userspace.
 */
__u32 test = 0;
83 
84 static void inc_dtimes(__u32 idx)
85 {
86 	if (test < __NR_TESTS)
87 		dtimes[test][idx]++;
88 	else
89 		dtimes[UKN_TEST][idx]++;
90 }
91 
92 static void inc_errs(__u32 idx)
93 {
94 	if (test < __NR_TESTS)
95 		errs[test][idx]++;
96 	else
97 		errs[UKN_TEST][idx]++;
98 }
99 
/* Extract the inet protocol (lowest byte) from a skb_get_type() result. */
static int skb_proto(int type)
{
	const int proto_mask = 0xff;

	return type & proto_mask;
}
104 
/* Extract the sender netns (second byte) from a skb_get_type() result. */
static int skb_ns(int type)
{
	int upper = type >> 8;

	return upper & 0xff;
}
109 
110 static bool fwdns_clear_dtime(void)
111 {
112 	return test == TCP_IP6_CLEAR_DTIME;
113 }
114 
115 static bool bpf_fwd(void)
116 {
117 	return test < TCP_IP4_RT_FWD;
118 }
119 
120 static __u8 get_proto(void)
121 {
122 	switch (test) {
123 	case UDP_IP4:
124 	case UDP_IP6:
125 	case UDP_IP4_RT_FWD:
126 	case UDP_IP6_RT_FWD:
127 		return IPPROTO_UDP;
128 	default:
129 		return IPPROTO_TCP;
130 	}
131 }
132 
133 /* -1: parse error: TC_ACT_SHOT
134  *  0: not testing traffic: TC_ACT_OK
135  * >0: first byte is the inet_proto, second byte has the netns
136  *     of the sender
137  */
138 static int skb_get_type(struct __sk_buff *skb)
139 {
140 	__u16 dst_ns_port = __bpf_htons(50000 + test);
141 	void *data_end = ctx_ptr(skb->data_end);
142 	void *data = ctx_ptr(skb->data);
143 	__u8 inet_proto = 0, ns = 0;
144 	struct ipv6hdr *ip6h;
145 	__u16 sport, dport;
146 	struct iphdr *iph;
147 	struct tcphdr *th;
148 	struct udphdr *uh;
149 	void *trans;
150 
151 	switch (skb->protocol) {
152 	case __bpf_htons(ETH_P_IP):
153 		iph = data + sizeof(struct ethhdr);
154 		if (iph + 1 > data_end)
155 			return -1;
156 		if (iph->saddr == ip4_src)
157 			ns = SRC_NS;
158 		else if (iph->saddr == ip4_dst)
159 			ns = DST_NS;
160 		inet_proto = iph->protocol;
161 		trans = iph + 1;
162 		break;
163 	case __bpf_htons(ETH_P_IPV6):
164 		ip6h = data + sizeof(struct ethhdr);
165 		if (ip6h + 1 > data_end)
166 			return -1;
167 		if (v6_equal(ip6h->saddr, (struct in6_addr)ip6_src))
168 			ns = SRC_NS;
169 		else if (v6_equal(ip6h->saddr, (struct in6_addr)ip6_dst))
170 			ns = DST_NS;
171 		inet_proto = ip6h->nexthdr;
172 		trans = ip6h + 1;
173 		break;
174 	default:
175 		return 0;
176 	}
177 
178 	/* skb is not from src_ns or dst_ns.
179 	 * skb is not the testing IPPROTO.
180 	 */
181 	if (!ns || inet_proto != get_proto())
182 		return 0;
183 
184 	switch (inet_proto) {
185 	case IPPROTO_TCP:
186 		th = trans;
187 		if (th + 1 > data_end)
188 			return -1;
189 		sport = th->source;
190 		dport = th->dest;
191 		break;
192 	case IPPROTO_UDP:
193 		uh = trans;
194 		if (uh + 1 > data_end)
195 			return -1;
196 		sport = uh->source;
197 		dport = uh->dest;
198 		break;
199 	default:
200 		return 0;
201 	}
202 
203 	/* The skb is the testing traffic */
204 	if ((ns == SRC_NS && dport == dst_ns_port) ||
205 	    (ns == DST_NS && sport == dst_ns_port))
206 		return (ns << 8 | inet_proto);
207 
208 	return 0;
209 }
210 
211 /* format: direction@iface@netns
212  * egress@veth_(src|dst)@ns_(src|dst)
213  */
214 SEC("tc")
215 int egress_host(struct __sk_buff *skb)
216 {
217 	int skb_type;
218 
219 	skb_type = skb_get_type(skb);
220 	if (skb_type == -1)
221 		return TC_ACT_SHOT;
222 	if (!skb_type)
223 		return TC_ACT_OK;
224 
225 	if (skb_proto(skb_type) == IPPROTO_TCP) {
226 		if (skb->tstamp_type == BPF_SKB_TSTAMP_DELIVERY_MONO &&
227 		    skb->tstamp)
228 			inc_dtimes(EGRESS_ENDHOST);
229 		else
230 			inc_errs(EGRESS_ENDHOST);
231 	} else {
232 		if (skb->tstamp_type == BPF_SKB_TSTAMP_UNSPEC &&
233 		    skb->tstamp)
234 			inc_dtimes(EGRESS_ENDHOST);
235 		else
236 			inc_errs(EGRESS_ENDHOST);
237 	}
238 
239 	skb->tstamp = EGRESS_ENDHOST_MAGIC;
240 
241 	return TC_ACT_OK;
242 }
243 
244 /* ingress@veth_(src|dst)@ns_(src|dst) */
245 SEC("tc")
246 int ingress_host(struct __sk_buff *skb)
247 {
248 	int skb_type;
249 
250 	skb_type = skb_get_type(skb);
251 	if (skb_type == -1)
252 		return TC_ACT_SHOT;
253 	if (!skb_type)
254 		return TC_ACT_OK;
255 
256 	if (skb->tstamp_type == BPF_SKB_TSTAMP_DELIVERY_MONO &&
257 	    skb->tstamp == EGRESS_FWDNS_MAGIC)
258 		inc_dtimes(INGRESS_ENDHOST);
259 	else
260 		inc_errs(INGRESS_ENDHOST);
261 
262 	return TC_ACT_OK;
263 }
264 
265 /* ingress@veth_(src|dst)_fwd@ns_fwd priority 100 */
266 SEC("tc")
267 int ingress_fwdns_prio100(struct __sk_buff *skb)
268 {
269 	int skb_type;
270 
271 	skb_type = skb_get_type(skb);
272 	if (skb_type == -1)
273 		return TC_ACT_SHOT;
274 	if (!skb_type)
275 		return TC_ACT_OK;
276 
277 	/* delivery_time is only available to the ingress
278 	 * if the tc-bpf checks the skb->tstamp_type.
279 	 */
280 	if (skb->tstamp == EGRESS_ENDHOST_MAGIC)
281 		inc_errs(INGRESS_FWDNS_P100);
282 
283 	if (fwdns_clear_dtime())
284 		skb->tstamp = 0;
285 
286 	return TC_ACT_UNSPEC;
287 }
288 
289 /* egress@veth_(src|dst)_fwd@ns_fwd priority 100 */
290 SEC("tc")
291 int egress_fwdns_prio100(struct __sk_buff *skb)
292 {
293 	int skb_type;
294 
295 	skb_type = skb_get_type(skb);
296 	if (skb_type == -1)
297 		return TC_ACT_SHOT;
298 	if (!skb_type)
299 		return TC_ACT_OK;
300 
301 	/* delivery_time is always available to egress even
302 	 * the tc-bpf did not use the tstamp_type.
303 	 */
304 	if (skb->tstamp == INGRESS_FWDNS_MAGIC)
305 		inc_dtimes(EGRESS_FWDNS_P100);
306 	else
307 		inc_errs(EGRESS_FWDNS_P100);
308 
309 	if (fwdns_clear_dtime())
310 		skb->tstamp = 0;
311 
312 	return TC_ACT_UNSPEC;
313 }
314 
315 /* ingress@veth_(src|dst)_fwd@ns_fwd priority 101 */
316 SEC("tc")
317 int ingress_fwdns_prio101(struct __sk_buff *skb)
318 {
319 	__u64 expected_dtime = EGRESS_ENDHOST_MAGIC;
320 	int skb_type;
321 
322 	skb_type = skb_get_type(skb);
323 	if (skb_type == -1 || !skb_type)
324 		/* Should have handled in prio100 */
325 		return TC_ACT_SHOT;
326 
327 	if (skb_proto(skb_type) == IPPROTO_UDP)
328 		expected_dtime = 0;
329 
330 	if (skb->tstamp_type) {
331 		if (fwdns_clear_dtime() ||
332 		    skb->tstamp_type != BPF_SKB_TSTAMP_DELIVERY_MONO ||
333 		    skb->tstamp != expected_dtime)
334 			inc_errs(INGRESS_FWDNS_P101);
335 		else
336 			inc_dtimes(INGRESS_FWDNS_P101);
337 	} else {
338 		if (!fwdns_clear_dtime() && expected_dtime)
339 			inc_errs(INGRESS_FWDNS_P101);
340 	}
341 
342 	if (skb->tstamp_type == BPF_SKB_TSTAMP_DELIVERY_MONO) {
343 		skb->tstamp = INGRESS_FWDNS_MAGIC;
344 	} else {
345 		if (bpf_skb_set_tstamp(skb, INGRESS_FWDNS_MAGIC,
346 				       BPF_SKB_TSTAMP_DELIVERY_MONO))
347 			inc_errs(SET_DTIME);
348 		if (!bpf_skb_set_tstamp(skb, INGRESS_FWDNS_MAGIC,
349 					BPF_SKB_TSTAMP_UNSPEC))
350 			inc_errs(SET_DTIME);
351 	}
352 
353 	if (skb_ns(skb_type) == SRC_NS)
354 		return bpf_fwd() ?
355 			bpf_redirect_neigh(IFINDEX_DST, NULL, 0, 0) : TC_ACT_OK;
356 	else
357 		return bpf_fwd() ?
358 			bpf_redirect_neigh(IFINDEX_SRC, NULL, 0, 0) : TC_ACT_OK;
359 }
360 
361 /* egress@veth_(src|dst)_fwd@ns_fwd priority 101 */
362 SEC("tc")
363 int egress_fwdns_prio101(struct __sk_buff *skb)
364 {
365 	int skb_type;
366 
367 	skb_type = skb_get_type(skb);
368 	if (skb_type == -1 || !skb_type)
369 		/* Should have handled in prio100 */
370 		return TC_ACT_SHOT;
371 
372 	if (skb->tstamp_type) {
373 		if (fwdns_clear_dtime() ||
374 		    skb->tstamp_type != BPF_SKB_TSTAMP_DELIVERY_MONO ||
375 		    skb->tstamp != INGRESS_FWDNS_MAGIC)
376 			inc_errs(EGRESS_FWDNS_P101);
377 		else
378 			inc_dtimes(EGRESS_FWDNS_P101);
379 	} else {
380 		if (!fwdns_clear_dtime())
381 			inc_errs(EGRESS_FWDNS_P101);
382 	}
383 
384 	if (skb->tstamp_type == BPF_SKB_TSTAMP_DELIVERY_MONO) {
385 		skb->tstamp = EGRESS_FWDNS_MAGIC;
386 	} else {
387 		if (bpf_skb_set_tstamp(skb, EGRESS_FWDNS_MAGIC,
388 				       BPF_SKB_TSTAMP_DELIVERY_MONO))
389 			inc_errs(SET_DTIME);
390 		if (!bpf_skb_set_tstamp(skb, INGRESS_FWDNS_MAGIC,
391 					BPF_SKB_TSTAMP_UNSPEC))
392 			inc_errs(SET_DTIME);
393 	}
394 
395 	return TC_ACT_OK;
396 }
397 
/* License string ("GPL") declared through SEC("license"). */
char __license[] SEC("license") = "GPL";
399