// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
// Copyright (c) 2019, 2020 Cloudflare

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

#include <linux/bpf.h>
#include <linux/icmp.h>
#include <linux/icmpv6.h>
#include <linux/if_ether.h>
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/pkt_cls.h>
#include <linux/tcp.h>
#include <linux/udp.h>

#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>

#include "test_cls_redirect.h"
#include "bpf_kfuncs.h"

#define offsetofend(TYPE, MEMBER) \
	(offsetof(TYPE, MEMBER) + sizeof((((TYPE *)0)->MEMBER)))

#define IP_OFFSET_MASK (0x1FFF)
#define IP_MF (0x2000)

char _license[] SEC("license") = "Dual BSD/GPL";

/**
 * Destination port and IP used for UDP encapsulation.
 */
volatile const __be16 ENCAPSULATION_PORT;
volatile const __be32 ENCAPSULATION_IP;

typedef struct {
	uint64_t processed_packets_total;
	uint64_t l3_protocol_packets_total_ipv4;
	uint64_t l3_protocol_packets_total_ipv6;
	uint64_t l4_protocol_packets_total_tcp;
	uint64_t l4_protocol_packets_total_udp;
	uint64_t accepted_packets_total_syn;
	uint64_t accepted_packets_total_syn_cookies;
	uint64_t accepted_packets_total_last_hop;
	uint64_t accepted_packets_total_icmp_echo_request;
	uint64_t accepted_packets_total_established;
	uint64_t forwarded_packets_total_gue;
	uint64_t forwarded_packets_total_gre;

	uint64_t errors_total_unknown_l3_proto;
	uint64_t errors_total_unknown_l4_proto;
	uint64_t errors_total_malformed_ip;
	uint64_t errors_total_fragmented_ip;
	uint64_t errors_total_malformed_icmp;
	uint64_t errors_total_unwanted_icmp;
	uint64_t errors_total_malformed_icmp_pkt_too_big;
	uint64_t errors_total_malformed_tcp;
	uint64_t errors_total_malformed_udp;
	uint64_t errors_total_icmp_echo_replies;
	uint64_t errors_total_malformed_encapsulation;
	uint64_t errors_total_encap_adjust_failed;
	uint64_t errors_total_encap_buffer_too_small;
	uint64_t errors_total_redirect_loop;
	uint64_t errors_total_encap_mtu_violate;
} metrics_t;

typedef enum {
	INVALID = 0,
	UNKNOWN,
	ECHO_REQUEST,
	SYN,
	SYN_COOKIE,
	ESTABLISHED,
} verdict_t;

typedef struct {
	uint16_t src, dst;
} flow_ports_t;

_Static_assert(
	sizeof(flow_ports_t) !=
		offsetofend(struct bpf_sock_tuple, ipv4.dport) -
			offsetof(struct bpf_sock_tuple, ipv4.sport) - 1,
	"flow_ports_t must match sport and dport in struct bpf_sock_tuple");
_Static_assert(
	sizeof(flow_ports_t) !=
		offsetofend(struct bpf_sock_tuple, ipv6.dport) -
			offsetof(struct bpf_sock_tuple, ipv6.sport) - 1,
	"flow_ports_t must match sport and dport in struct bpf_sock_tuple");

struct iphdr_info {
	void *hdr;
	__u64 len;
};

typedef int ret_t;

/* This is a bit of a hack. We need a return value which allows us to
 * indicate that the regular flow of the program should continue,
 * while allowing functions to use XDP_PASS and XDP_DROP, etc.
 */
static const ret_t CONTINUE_PROCESSING = -1;

/* Convenience macro to call functions which return ret_t.
 */
#define MAYBE_RETURN(x)                           \
	do {                                      \
		ret_t __ret = x;                  \
		if (__ret != CONTINUE_PROCESSING) \
			return __ret;             \
	} while (0)
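
/* Illustrative use (this mirrors the call sites in get_next_hop() and
 * cls_redirect() below; it is not additional logic):
 *
 *	MAYBE_RETURN(skip_next_hops(offset, encap->unigue.next_hop));
 *
 * A verdict such as TC_ACT_SHOT propagates out of the caller immediately,
 * while CONTINUE_PROCESSING falls through to the next statement.
 */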

static bool ipv4_is_fragment(const struct iphdr *ip)
{
	uint16_t frag_off = ip->frag_off & bpf_htons(IP_OFFSET_MASK);
	return (ip->frag_off & bpf_htons(IP_MF)) != 0 || frag_off > 0;
}

static int pkt_parse_ipv4(struct bpf_dynptr *dynptr, __u64 *offset, struct iphdr *iphdr)
{
	if (bpf_dynptr_read(iphdr, sizeof(*iphdr), dynptr, *offset, 0))
		return -1;

	*offset += sizeof(*iphdr);

	if (iphdr->ihl < 5)
		return -1;

	/* skip ipv4 options */
	*offset += (iphdr->ihl - 5) * 4;

	return 0;
}

/* Parse the L4 ports from a packet, assuming a layout like TCP or UDP. */
static bool pkt_parse_icmp_l4_ports(struct bpf_dynptr *dynptr, __u64 *offset, flow_ports_t *ports)
{
	if (bpf_dynptr_read(ports, sizeof(*ports), dynptr, *offset, 0))
		return false;

	*offset += sizeof(*ports);

	/* Ports in the L4 headers are reversed, since we are parsing an ICMP
	 * payload which is going towards the eyeball.
	 */
	uint16_t dst = ports->src;
	ports->src = ports->dst;
	ports->dst = dst;
	return true;
}

static uint16_t pkt_checksum_fold(uint32_t csum)
{
	/* The highest reasonable value for an IPv4 header
	 * checksum requires two folds, so we just do that always.
	 */
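	/* Worked example (illustrative only): folding csum = 0x7ffff gives
	 * (0xffff + 0x7) = 0x10006 after the first fold and
	 * (0x0006 + 0x1) = 0x0007 after the second; ~0x0007 = 0xfff8.
	 * A single fold would not be enough here, hence two.
	 */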
	csum = (csum & 0xffff) + (csum >> 16);
	csum = (csum & 0xffff) + (csum >> 16);
	return (uint16_t)~csum;
}

static void pkt_ipv4_checksum(struct iphdr *iph)
{
	iph->check = 0;

	/* An IP header without options is 20 bytes. Two of those
	 * are the checksum, which we always set to zero. Hence,
	 * the maximum accumulated value is 18 / 2 * 0xffff = 0x8fff7,
	 * which fits in 32 bit.
	 */
	_Static_assert(sizeof(struct iphdr) == 20, "iphdr must be 20 bytes");
	uint32_t acc = 0;
	uint16_t *ipw = (uint16_t *)iph;

	for (size_t i = 0; i < sizeof(struct iphdr) / 2; i++)
		acc += ipw[i];

	iph->check = pkt_checksum_fold(acc);
}

static bool pkt_skip_ipv6_extension_headers(struct bpf_dynptr *dynptr, __u64 *offset,
					    const struct ipv6hdr *ipv6, uint8_t *upper_proto,
					    bool *is_fragment)
{
	/* We understand five extension headers.
	 * https://tools.ietf.org/html/rfc8200#section-4.1 states that all
	 * headers should occur once, except Destination Options, which may
	 * occur twice. Hence we give up after 6 headers.
	 */
	struct {
		uint8_t next;
		uint8_t len;
	} exthdr = {
		.next = ipv6->nexthdr,
	};
	*is_fragment = false;

	for (int i = 0; i < 6; i++) {
		switch (exthdr.next) {
		case IPPROTO_FRAGMENT:
			*is_fragment = true;
			/* NB: We don't check that hdrlen == 0 as per spec. */
			/* fallthrough; */

		case IPPROTO_HOPOPTS:
		case IPPROTO_ROUTING:
		case IPPROTO_DSTOPTS:
		case IPPROTO_MH:
			if (bpf_dynptr_read(&exthdr, sizeof(exthdr), dynptr, *offset, 0))
				return false;

			/* hdrlen is in 8-octet units, and excludes the first 8 octets. */
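			/* For example (illustrative only): a Routing header
			 * with exthdr.len == 2 occupies (2 + 1) * 8 == 24
			 * bytes, so the offset advances past all of it.
			 */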
			*offset += (exthdr.len + 1) * 8;

			/* Decode next header */
			break;

		default:
			/* The next header is not one of the known extension
			 * headers, treat it as the upper layer header.
			 *
			 * This handles IPPROTO_NONE.
			 *
			 * Encapsulating Security Payload (50) and Authentication
			 * Header (51) also end up here (and will trigger an
			 * unknown proto error later). They have a custom header
			 * format and seem too esoteric to care about.
			 */
			*upper_proto = exthdr.next;
			return true;
		}
	}

	/* We never found an upper layer header. */
	return false;
}

static int pkt_parse_ipv6(struct bpf_dynptr *dynptr, __u64 *offset, struct ipv6hdr *ipv6,
			  uint8_t *proto, bool *is_fragment)
{
	if (bpf_dynptr_read(ipv6, sizeof(*ipv6), dynptr, *offset, 0))
		return -1;

	*offset += sizeof(*ipv6);

	if (!pkt_skip_ipv6_extension_headers(dynptr, offset, ipv6, proto, is_fragment))
		return -1;

	return 0;
}

/* Global metrics, per CPU
 */
struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__uint(max_entries, 1);
	__type(key, unsigned int);
	__type(value, metrics_t);
} metrics_map SEC(".maps");

static metrics_t *get_global_metrics(void)
{
	uint64_t key = 0;
	return bpf_map_lookup_elem(&metrics_map, &key);
}

static ret_t accept_locally(struct __sk_buff *skb, encap_headers_t *encap)
{
	const int payload_off =
		sizeof(*encap) +
		sizeof(struct in_addr) * encap->unigue.hop_count;
	int32_t encap_overhead = payload_off - sizeof(struct ethhdr);

	/* Change the ethertype if the encapsulated packet is IPv6 */
	if (encap->gue.proto_ctype == IPPROTO_IPV6)
		encap->eth.h_proto = bpf_htons(ETH_P_IPV6);

	if (bpf_skb_adjust_room(skb, -encap_overhead, BPF_ADJ_ROOM_MAC,
				BPF_F_ADJ_ROOM_FIXED_GSO |
				BPF_F_ADJ_ROOM_NO_CSUM_RESET) ||
	    bpf_csum_level(skb, BPF_CSUM_LEVEL_DEC))
		return TC_ACT_SHOT;

	return bpf_redirect(skb->ifindex, BPF_F_INGRESS);
}

static ret_t forward_with_gre(struct __sk_buff *skb, struct bpf_dynptr *dynptr,
			      encap_headers_t *encap, struct in_addr *next_hop,
			      metrics_t *metrics)
{
	const int payload_off =
		sizeof(*encap) +
		sizeof(struct in_addr) * encap->unigue.hop_count;
	int32_t encap_overhead =
		payload_off - sizeof(struct ethhdr) - sizeof(struct iphdr);
	int32_t delta = sizeof(struct gre_base_hdr) - encap_overhead;
	__u8 encap_buffer[sizeof(encap_gre_t)] = {};
	uint16_t proto = ETH_P_IP;
	uint32_t mtu_len = 0;
	encap_gre_t *encap_gre;

	metrics->forwarded_packets_total_gre++;

	/* Loop protection: the inner packet's TTL is decremented as a safeguard
	 * against any forwarding loop. As the only interesting field is the TTL
	 * (hop limit for IPv6), it is easier to use bpf_skb_load_bytes/bpf_skb_store_bytes,
	 * as they handle split packets if needed (no need for the data to be
	 * in the linear section).
	 */
	if (encap->gue.proto_ctype == IPPROTO_IPV6) {
		proto = ETH_P_IPV6;
		uint8_t ttl;
		int rc;

		rc = bpf_skb_load_bytes(
			skb, payload_off + offsetof(struct ipv6hdr, hop_limit),
			&ttl, 1);
		if (rc != 0) {
			metrics->errors_total_malformed_encapsulation++;
			return TC_ACT_SHOT;
		}

		if (ttl == 0) {
			metrics->errors_total_redirect_loop++;
			return TC_ACT_SHOT;
		}

		ttl--;
		rc = bpf_skb_store_bytes(
			skb, payload_off + offsetof(struct ipv6hdr, hop_limit),
			&ttl, 1, 0);
		if (rc != 0) {
			metrics->errors_total_malformed_encapsulation++;
			return TC_ACT_SHOT;
		}
	} else {
		uint8_t ttl;
		int rc;

		rc = bpf_skb_load_bytes(
			skb, payload_off + offsetof(struct iphdr, ttl), &ttl,
			1);
		if (rc != 0) {
			metrics->errors_total_malformed_encapsulation++;
			return TC_ACT_SHOT;
		}

		if (ttl == 0) {
			metrics->errors_total_redirect_loop++;
			return TC_ACT_SHOT;
		}

		/* IPv4 also has a checksum to patch. While the TTL is only one byte,
		 * this helper only works with 2- and 4-byte sizes (the result is
		 * the same either way).
		 */
		rc = bpf_l3_csum_replace(
			skb, payload_off + offsetof(struct iphdr, check), ttl,
			ttl - 1, 2);
		if (rc != 0) {
			metrics->errors_total_malformed_encapsulation++;
			return TC_ACT_SHOT;
		}

		ttl--;
		rc = bpf_skb_store_bytes(
			skb, payload_off + offsetof(struct iphdr, ttl), &ttl, 1,
			0);
		if (rc != 0) {
			metrics->errors_total_malformed_encapsulation++;
			return TC_ACT_SHOT;
		}
	}

	if (bpf_check_mtu(skb, skb->ifindex, &mtu_len, delta, 0)) {
		metrics->errors_total_encap_mtu_violate++;
		return TC_ACT_SHOT;
	}

	if (bpf_skb_adjust_room(skb, delta, BPF_ADJ_ROOM_NET,
				BPF_F_ADJ_ROOM_FIXED_GSO |
				BPF_F_ADJ_ROOM_NO_CSUM_RESET) ||
	    bpf_csum_level(skb, BPF_CSUM_LEVEL_INC)) {
		metrics->errors_total_encap_adjust_failed++;
		return TC_ACT_SHOT;
	}

	if (bpf_skb_pull_data(skb, sizeof(encap_gre_t))) {
		metrics->errors_total_encap_buffer_too_small++;
		return TC_ACT_SHOT;
	}

	encap_gre = bpf_dynptr_slice_rdwr(dynptr, 0, encap_buffer, sizeof(encap_buffer));
	if (!encap_gre) {
		metrics->errors_total_encap_buffer_too_small++;
		return TC_ACT_SHOT;
	}

	encap_gre->ip.protocol = IPPROTO_GRE;
	encap_gre->ip.daddr = next_hop->s_addr;
	encap_gre->ip.saddr = ENCAPSULATION_IP;
	encap_gre->ip.tot_len =
		bpf_htons(bpf_ntohs(encap_gre->ip.tot_len) + delta);
	encap_gre->gre.flags = 0;
	encap_gre->gre.protocol = bpf_htons(proto);
	pkt_ipv4_checksum((void *)&encap_gre->ip);

	if (encap_gre == encap_buffer)
		bpf_dynptr_write(dynptr, 0, encap_buffer, sizeof(encap_buffer), 0);

	return bpf_redirect(skb->ifindex, 0);
}

static ret_t forward_to_next_hop(struct __sk_buff *skb, struct bpf_dynptr *dynptr,
				 encap_headers_t *encap, struct in_addr *next_hop,
				 metrics_t *metrics)
{
	/* swap L2 addresses */
	/* This assumes that packets are received from a router.
	 * So just swapping the MAC addresses here will make the packet go back to
	 * the router, which will send it to the appropriate machine.
	 */
	unsigned char temp[ETH_ALEN];
	memcpy(temp, encap->eth.h_dest, sizeof(temp));
	memcpy(encap->eth.h_dest, encap->eth.h_source,
	       sizeof(encap->eth.h_dest));
	memcpy(encap->eth.h_source, temp, sizeof(encap->eth.h_source));

	if (encap->unigue.next_hop == encap->unigue.hop_count - 1 &&
	    encap->unigue.last_hop_gre) {
		return forward_with_gre(skb, dynptr, encap, next_hop, metrics);
	}

	metrics->forwarded_packets_total_gue++;
	uint32_t old_saddr = encap->ip.saddr;
	encap->ip.saddr = encap->ip.daddr;
	encap->ip.daddr = next_hop->s_addr;
	if (encap->unigue.next_hop < encap->unigue.hop_count) {
		encap->unigue.next_hop++;
	}

	/* Remove ip->saddr, add next_hop->s_addr */
	const uint64_t off = offsetof(typeof(*encap), ip.check);
	int ret = bpf_l3_csum_replace(skb, off, old_saddr, next_hop->s_addr, 4);
	if (ret < 0) {
		return TC_ACT_SHOT;
	}

	return bpf_redirect(skb->ifindex, 0);
}

static ret_t skip_next_hops(__u64 *offset, int n)
{
	switch (n) {
	case 1:
		*offset += sizeof(struct in_addr);
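		/* Deliberate fall through to the n == 0 case: one hop has
		 * been skipped, processing continues.
		 */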
	case 0:
		return CONTINUE_PROCESSING;

	default:
		return TC_ACT_SHOT;
	}
}

/* Get the next hop from the GLB header.
 *
 * Sets next_hop->s_addr to 0 if there are no more hops left.
 * *offset points just after the variable length GLB header
 * iff the call is successful.
 */
static ret_t get_next_hop(struct bpf_dynptr *dynptr, __u64 *offset, encap_headers_t *encap,
			  struct in_addr *next_hop)
{
	if (encap->unigue.next_hop > encap->unigue.hop_count)
		return TC_ACT_SHOT;

	/* Skip "used" next hops. */
	MAYBE_RETURN(skip_next_hops(offset, encap->unigue.next_hop));

	if (encap->unigue.next_hop == encap->unigue.hop_count) {
		/* No more next hops, we are at the end of the GLB header. */
		next_hop->s_addr = 0;
		return CONTINUE_PROCESSING;
	}

	if (bpf_dynptr_read(next_hop, sizeof(*next_hop), dynptr, *offset, 0))
		return TC_ACT_SHOT;

	*offset += sizeof(*next_hop);

	/* Skip the remaining next hops (may be zero). */
	return skip_next_hops(offset, encap->unigue.hop_count - encap->unigue.next_hop - 1);
}

/* Fill a bpf_sock_tuple to be used with the socket lookup functions.
 * This is a kludge that lets us work around verifier limitations:
 *
 *    fill_tuple(&t, foo, sizeof(struct iphdr), 123, 321)
 *
 * clang will substitute a constant for sizeof, which allows the verifier
 * to track its value. Based on this, it can figure out the constant
 * return value, and calling code works while still being "generic" to
 * IPv4 and IPv6.
 */
static uint64_t fill_tuple(struct bpf_sock_tuple *tuple, void *iph,
			   uint64_t iphlen, uint16_t sport, uint16_t dport)
{
	switch (iphlen) {
	case sizeof(struct iphdr): {
		struct iphdr *ipv4 = (struct iphdr *)iph;
		tuple->ipv4.daddr = ipv4->daddr;
		tuple->ipv4.saddr = ipv4->saddr;
		tuple->ipv4.sport = sport;
		tuple->ipv4.dport = dport;
		return sizeof(tuple->ipv4);
	}

	case sizeof(struct ipv6hdr): {
		struct ipv6hdr *ipv6 = (struct ipv6hdr *)iph;
		memcpy(&tuple->ipv6.daddr, &ipv6->daddr,
		       sizeof(tuple->ipv6.daddr));
		memcpy(&tuple->ipv6.saddr, &ipv6->saddr,
		       sizeof(tuple->ipv6.saddr));
		tuple->ipv6.sport = sport;
		tuple->ipv6.dport = dport;
		return sizeof(tuple->ipv6);
	}

	default:
		return 0;
	}
}
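
/* Illustrative call sequence (this mirrors process_tcp() further down; it is
 * not additional logic):
 *
 *	tuplen = fill_tuple(&tuple, info->hdr, info->len, tcp.source, tcp.dest);
 *	return classify_tcp(skb, &tuple, tuplen, info->hdr, &tcp);
 *
 * Because info->len is a compile-time constant at every call site, the
 * verifier can track tuplen and accept it as the tuple size passed to the
 * socket lookup helpers.
 */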

static verdict_t classify_tcp(struct __sk_buff *skb, struct bpf_sock_tuple *tuple,
			      uint64_t tuplen, void *iph, struct tcphdr *tcp)
{
	struct bpf_sock *sk =
		bpf_skc_lookup_tcp(skb, tuple, tuplen, BPF_F_CURRENT_NETNS, 0);

	if (sk == NULL)
		return UNKNOWN;

	if (sk->state != BPF_TCP_LISTEN) {
		bpf_sk_release(sk);
		return ESTABLISHED;
	}

	if (iph != NULL && tcp != NULL) {
		/* Kludge: we've run out of arguments, but need the length of the ip header. */
		uint64_t iphlen = sizeof(struct iphdr);

		if (tuplen == sizeof(tuple->ipv6))
			iphlen = sizeof(struct ipv6hdr);

		if (bpf_tcp_check_syncookie(sk, iph, iphlen, tcp,
					    sizeof(*tcp)) == 0) {
			bpf_sk_release(sk);
			return SYN_COOKIE;
		}
	}

	bpf_sk_release(sk);
	return UNKNOWN;
}

static verdict_t classify_udp(struct __sk_buff *skb, struct bpf_sock_tuple *tuple, uint64_t tuplen)
{
	struct bpf_sock *sk =
		bpf_sk_lookup_udp(skb, tuple, tuplen, BPF_F_CURRENT_NETNS, 0);

	if (sk == NULL)
		return UNKNOWN;

	if (sk->state == BPF_TCP_ESTABLISHED) {
		bpf_sk_release(sk);
		return ESTABLISHED;
	}

	bpf_sk_release(sk);
	return UNKNOWN;
}

static verdict_t classify_icmp(struct __sk_buff *skb, uint8_t proto, struct bpf_sock_tuple *tuple,
			       uint64_t tuplen, metrics_t *metrics)
{
	switch (proto) {
	case IPPROTO_TCP:
		return classify_tcp(skb, tuple, tuplen, NULL, NULL);

	case IPPROTO_UDP:
		return classify_udp(skb, tuple, tuplen);

	default:
		metrics->errors_total_malformed_icmp++;
		return INVALID;
	}
}

static verdict_t process_icmpv4(struct __sk_buff *skb, struct bpf_dynptr *dynptr, __u64 *offset,
				metrics_t *metrics)
{
	struct icmphdr icmp;
	struct iphdr ipv4;

	if (bpf_dynptr_read(&icmp, sizeof(icmp), dynptr, *offset, 0)) {
		metrics->errors_total_malformed_icmp++;
		return INVALID;
	}

	*offset += sizeof(icmp);

	/* We should never receive encapsulated echo replies. */
	if (icmp.type == ICMP_ECHOREPLY) {
		metrics->errors_total_icmp_echo_replies++;
		return INVALID;
	}

	if (icmp.type == ICMP_ECHO)
		return ECHO_REQUEST;

	if (icmp.type != ICMP_DEST_UNREACH || icmp.code != ICMP_FRAG_NEEDED) {
		metrics->errors_total_unwanted_icmp++;
		return INVALID;
	}

	if (pkt_parse_ipv4(dynptr, offset, &ipv4)) {
		metrics->errors_total_malformed_icmp_pkt_too_big++;
		return INVALID;
	}

	/* The source address in the outer IP header is from the entity that
	 * originated the ICMP message. Use the original IP header to restore
	 * the correct flow tuple.
	 */
	struct bpf_sock_tuple tuple;
	tuple.ipv4.saddr = ipv4.daddr;
	tuple.ipv4.daddr = ipv4.saddr;

	if (!pkt_parse_icmp_l4_ports(dynptr, offset, (flow_ports_t *)&tuple.ipv4.sport)) {
		metrics->errors_total_malformed_icmp_pkt_too_big++;
		return INVALID;
	}

	return classify_icmp(skb, ipv4.protocol, &tuple,
			     sizeof(tuple.ipv4), metrics);
}

static verdict_t process_icmpv6(struct bpf_dynptr *dynptr, __u64 *offset, struct __sk_buff *skb,
				metrics_t *metrics)
{
	struct bpf_sock_tuple tuple;
	struct ipv6hdr ipv6;
	struct icmp6hdr icmp6;
	bool is_fragment;
	uint8_t l4_proto;

	if (bpf_dynptr_read(&icmp6, sizeof(icmp6), dynptr, *offset, 0)) {
		metrics->errors_total_malformed_icmp++;
		return INVALID;
	}

	/* Advance past the ICMPv6 header so the embedded packet can be
	 * parsed, mirroring process_icmpv4() above.
	 */
	*offset += sizeof(icmp6);

	/* We should never receive encapsulated echo replies. */
	if (icmp6.icmp6_type == ICMPV6_ECHO_REPLY) {
		metrics->errors_total_icmp_echo_replies++;
		return INVALID;
	}

	if (icmp6.icmp6_type == ICMPV6_ECHO_REQUEST) {
		return ECHO_REQUEST;
	}

	if (icmp6.icmp6_type != ICMPV6_PKT_TOOBIG) {
		metrics->errors_total_unwanted_icmp++;
		return INVALID;
	}

	if (pkt_parse_ipv6(dynptr, offset, &ipv6, &l4_proto, &is_fragment)) {
		metrics->errors_total_malformed_icmp_pkt_too_big++;
		return INVALID;
	}

	if (is_fragment) {
		metrics->errors_total_fragmented_ip++;
		return INVALID;
	}

	/* Swap source and dest addresses. */
	memcpy(&tuple.ipv6.saddr, &ipv6.daddr, sizeof(tuple.ipv6.saddr));
	memcpy(&tuple.ipv6.daddr, &ipv6.saddr, sizeof(tuple.ipv6.daddr));

	if (!pkt_parse_icmp_l4_ports(dynptr, offset, (flow_ports_t *)&tuple.ipv6.sport)) {
		metrics->errors_total_malformed_icmp_pkt_too_big++;
		return INVALID;
	}

	return classify_icmp(skb, l4_proto, &tuple, sizeof(tuple.ipv6),
			     metrics);
}

static verdict_t process_tcp(struct bpf_dynptr *dynptr, __u64 *offset, struct __sk_buff *skb,
			     struct iphdr_info *info, metrics_t *metrics)
{
	struct bpf_sock_tuple tuple;
	struct tcphdr tcp;
	uint64_t tuplen;

	metrics->l4_protocol_packets_total_tcp++;

	if (bpf_dynptr_read(&tcp, sizeof(tcp), dynptr, *offset, 0)) {
		metrics->errors_total_malformed_tcp++;
		return INVALID;
	}

	*offset += sizeof(tcp);

	if (tcp.syn)
		return SYN;

	tuplen = fill_tuple(&tuple, info->hdr, info->len, tcp.source, tcp.dest);
	return classify_tcp(skb, &tuple, tuplen, info->hdr, &tcp);
}

static verdict_t process_udp(struct bpf_dynptr *dynptr, __u64 *offset, struct __sk_buff *skb,
			     struct iphdr_info *info, metrics_t *metrics)
{
	struct bpf_sock_tuple tuple;
	struct udphdr udph;
	uint64_t tuplen;

	metrics->l4_protocol_packets_total_udp++;

	if (bpf_dynptr_read(&udph, sizeof(udph), dynptr, *offset, 0)) {
		metrics->errors_total_malformed_udp++;
		return INVALID;
	}
	*offset += sizeof(udph);

	tuplen = fill_tuple(&tuple, info->hdr, info->len, udph.source, udph.dest);
	return classify_udp(skb, &tuple, tuplen);
}

static verdict_t process_ipv4(struct __sk_buff *skb, struct bpf_dynptr *dynptr,
			      __u64 *offset, metrics_t *metrics)
{
	struct iphdr ipv4;
	struct iphdr_info info = {
		.hdr = &ipv4,
		.len = sizeof(ipv4),
	};

	metrics->l3_protocol_packets_total_ipv4++;

	if (pkt_parse_ipv4(dynptr, offset, &ipv4)) {
		metrics->errors_total_malformed_ip++;
		return INVALID;
	}

	if (ipv4.version != 4) {
		metrics->errors_total_malformed_ip++;
		return INVALID;
	}

	if (ipv4_is_fragment(&ipv4)) {
		metrics->errors_total_fragmented_ip++;
		return INVALID;
	}

	switch (ipv4.protocol) {
	case IPPROTO_ICMP:
		return process_icmpv4(skb, dynptr, offset, metrics);

	case IPPROTO_TCP:
		return process_tcp(dynptr, offset, skb, &info, metrics);

	case IPPROTO_UDP:
		return process_udp(dynptr, offset, skb, &info, metrics);

	default:
		metrics->errors_total_unknown_l4_proto++;
		return INVALID;
	}
}

static verdict_t process_ipv6(struct __sk_buff *skb, struct bpf_dynptr *dynptr,
			      __u64 *offset, metrics_t *metrics)
{
	struct ipv6hdr ipv6;
	struct iphdr_info info = {
		.hdr = &ipv6,
		.len = sizeof(ipv6),
	};
	uint8_t l4_proto;
	bool is_fragment;

	metrics->l3_protocol_packets_total_ipv6++;

	if (pkt_parse_ipv6(dynptr, offset, &ipv6, &l4_proto, &is_fragment)) {
		metrics->errors_total_malformed_ip++;
		return INVALID;
	}

	if (ipv6.version != 6) {
		metrics->errors_total_malformed_ip++;
		return INVALID;
	}

	if (is_fragment) {
		metrics->errors_total_fragmented_ip++;
		return INVALID;
	}

	switch (l4_proto) {
	case IPPROTO_ICMPV6:
		return process_icmpv6(dynptr, offset, skb, metrics);

	case IPPROTO_TCP:
		return process_tcp(dynptr, offset, skb, &info, metrics);

	case IPPROTO_UDP:
		return process_udp(dynptr, offset, skb, &info, metrics);

	default:
		metrics->errors_total_unknown_l4_proto++;
		return INVALID;
	}
}

SEC("tc")
int cls_redirect(struct __sk_buff *skb)
{
	__u8 encap_buffer[sizeof(encap_headers_t)] = {};
	struct bpf_dynptr dynptr;
	struct in_addr next_hop;
	/* Tracks offset of the dynptr. This will be unnecessary once
	 * bpf_dynptr_advance() is available.
	 */
	__u64 off = 0;
	ret_t ret;

	bpf_dynptr_from_skb(skb, 0, &dynptr);

	metrics_t *metrics = get_global_metrics();
	if (metrics == NULL)
		return TC_ACT_SHOT;

	metrics->processed_packets_total++;

	/* Pass bogus packets as long as we're not sure they're
	 * destined for us.
	 */
	if (skb->protocol != bpf_htons(ETH_P_IP))
		return TC_ACT_OK;

	encap_headers_t *encap;

	/* Make sure that all encapsulation headers are available in
	 * the linear portion of the skb. This makes it easy to manipulate them.
	 */
	if (bpf_skb_pull_data(skb, sizeof(*encap)))
		return TC_ACT_OK;

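	/* bpf_dynptr_slice_rdwr() may return a pointer directly into the skb's
	 * linear data, or into encap_buffer if the requested slice is not
	 * contiguous. In the latter case, changes only take effect once the
	 * buffer is written back with bpf_dynptr_write(), hence the
	 * conditional write-back at the end of this function (and in
	 * forward_with_gre()).
	 */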
	encap = bpf_dynptr_slice_rdwr(&dynptr, 0, encap_buffer, sizeof(encap_buffer));
	if (!encap)
		return TC_ACT_OK;

	off += sizeof(*encap);

	if (encap->ip.ihl != 5)
		/* We never have any options. */
		return TC_ACT_OK;

	if (encap->ip.daddr != ENCAPSULATION_IP ||
	    encap->ip.protocol != IPPROTO_UDP)
		return TC_ACT_OK;

	/* TODO Check UDP length? */
	if (encap->udp.dest != ENCAPSULATION_PORT)
		return TC_ACT_OK;

	/* We now know that the packet is destined to us, so we can
	 * drop bogus ones.
	 */
	if (ipv4_is_fragment((void *)&encap->ip)) {
		metrics->errors_total_fragmented_ip++;
		return TC_ACT_SHOT;
	}

	if (encap->gue.variant != 0) {
		metrics->errors_total_malformed_encapsulation++;
		return TC_ACT_SHOT;
	}

	if (encap->gue.control != 0) {
		metrics->errors_total_malformed_encapsulation++;
		return TC_ACT_SHOT;
	}

	if (encap->gue.flags != 0) {
		metrics->errors_total_malformed_encapsulation++;
		return TC_ACT_SHOT;
	}

	if (encap->gue.hlen !=
	    sizeof(encap->unigue) / 4 + encap->unigue.hop_count) {
		metrics->errors_total_malformed_encapsulation++;
		return TC_ACT_SHOT;
	}

	if (encap->unigue.version != 0) {
		metrics->errors_total_malformed_encapsulation++;
		return TC_ACT_SHOT;
	}

	if (encap->unigue.reserved != 0)
		return TC_ACT_SHOT;

	MAYBE_RETURN(get_next_hop(&dynptr, &off, encap, &next_hop));

	if (next_hop.s_addr == 0) {
		metrics->accepted_packets_total_last_hop++;
		return accept_locally(skb, encap);
	}

	verdict_t verdict;
	switch (encap->gue.proto_ctype) {
	case IPPROTO_IPIP:
		verdict = process_ipv4(skb, &dynptr, &off, metrics);
		break;

	case IPPROTO_IPV6:
		verdict = process_ipv6(skb, &dynptr, &off, metrics);
		break;

	default:
		metrics->errors_total_unknown_l3_proto++;
		return TC_ACT_SHOT;
	}

	switch (verdict) {
	case INVALID:
		/* metrics have already been bumped */
		return TC_ACT_SHOT;

	case UNKNOWN:
		return forward_to_next_hop(skb, &dynptr, encap, &next_hop, metrics);

	case ECHO_REQUEST:
		metrics->accepted_packets_total_icmp_echo_request++;
		break;

	case SYN:
		if (encap->unigue.forward_syn) {
			return forward_to_next_hop(skb, &dynptr, encap, &next_hop,
						   metrics);
		}

		metrics->accepted_packets_total_syn++;
		break;

	case SYN_COOKIE:
		metrics->accepted_packets_total_syn_cookies++;
		break;

	case ESTABLISHED:
		metrics->accepted_packets_total_established++;
		break;
	}

	ret = accept_locally(skb, encap);

	if (encap == encap_buffer)
		bpf_dynptr_write(&dynptr, 0, encap_buffer, sizeof(encap_buffer), 0);

	return ret;
}