// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
// Copyright (c) 2019, 2020 Cloudflare

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

#include <linux/bpf.h>
#include <linux/icmp.h>
#include <linux/icmpv6.h>
#include <linux/if_ether.h>
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/pkt_cls.h>
#include <linux/tcp.h>
#include <linux/udp.h>

#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>

#include "test_cls_redirect.h"
#include "bpf_kfuncs.h"

#define offsetofend(TYPE, MEMBER) \
	(offsetof(TYPE, MEMBER) + sizeof((((TYPE *)0)->MEMBER)))
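
/* offsetofend() yields the offset of the first byte past MEMBER. For
 * example, with the 4-byte addresses in struct bpf_sock_tuple,
 * offsetofend(struct bpf_sock_tuple, ipv4.saddr) equals
 * offsetof(struct bpf_sock_tuple, ipv4.saddr) + 4.
 */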

#define IP_OFFSET_MASK (0x1FFF)
#define IP_MF (0x2000)

char _license[] SEC("license") = "Dual BSD/GPL";

/**
 * Destination port and IP used for UDP encapsulation.
 */
volatile const __be16 ENCAPSULATION_PORT;
volatile const __be32 ENCAPSULATION_IP;

typedef struct {
	uint64_t processed_packets_total;
	uint64_t l3_protocol_packets_total_ipv4;
	uint64_t l3_protocol_packets_total_ipv6;
	uint64_t l4_protocol_packets_total_tcp;
	uint64_t l4_protocol_packets_total_udp;
	uint64_t accepted_packets_total_syn;
	uint64_t accepted_packets_total_syn_cookies;
	uint64_t accepted_packets_total_last_hop;
	uint64_t accepted_packets_total_icmp_echo_request;
	uint64_t accepted_packets_total_established;
	uint64_t forwarded_packets_total_gue;
	uint64_t forwarded_packets_total_gre;

	uint64_t errors_total_unknown_l3_proto;
	uint64_t errors_total_unknown_l4_proto;
	uint64_t errors_total_malformed_ip;
	uint64_t errors_total_fragmented_ip;
	uint64_t errors_total_malformed_icmp;
	uint64_t errors_total_unwanted_icmp;
	uint64_t errors_total_malformed_icmp_pkt_too_big;
	uint64_t errors_total_malformed_tcp;
	uint64_t errors_total_malformed_udp;
	uint64_t errors_total_icmp_echo_replies;
	uint64_t errors_total_malformed_encapsulation;
	uint64_t errors_total_encap_adjust_failed;
	uint64_t errors_total_encap_buffer_too_small;
	uint64_t errors_total_redirect_loop;
	uint64_t errors_total_encap_mtu_violate;
} metrics_t;

typedef enum {
	INVALID = 0,
	UNKNOWN,
	ECHO_REQUEST,
	SYN,
	SYN_COOKIE,
	ESTABLISHED,
} verdict_t;

typedef struct {
	uint16_t src, dst;
} flow_ports_t;

_Static_assert(
	sizeof(flow_ports_t) !=
		offsetofend(struct bpf_sock_tuple, ipv4.dport) -
			offsetof(struct bpf_sock_tuple, ipv4.sport) - 1,
	"flow_ports_t must match sport and dport in struct bpf_sock_tuple");
_Static_assert(
	sizeof(flow_ports_t) !=
		offsetofend(struct bpf_sock_tuple, ipv6.dport) -
			offsetof(struct bpf_sock_tuple, ipv6.sport) - 1,
	"flow_ports_t must match sport and dport in struct bpf_sock_tuple");

struct iphdr_info {
	void *hdr;
	__u64 len;
};

typedef int ret_t;

/* This is a bit of a hack. We need a return value which allows us to
 * indicate that the regular flow of the program should continue,
 * while allowing functions to use TC_ACT_OK, TC_ACT_SHOT, etc.
 */
static const ret_t CONTINUE_PROCESSING = -1;

/* Convenience macro to call functions which return ret_t.
 */
#define MAYBE_RETURN(x)                           \
	do {                                      \
		ret_t __ret = x;                  \
		if (__ret != CONTINUE_PROCESSING) \
			return __ret;             \
	} while (0)
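
/* For example, the main program delegates to a helper and only keeps
 * going when the helper did not produce a final verdict:
 *
 *	MAYBE_RETURN(get_next_hop(&dynptr, &off, encap, &next_hop));
 *	... reached only if get_next_hop() returned CONTINUE_PROCESSING.
 */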

static bool ipv4_is_fragment(const struct iphdr *ip)
{
	uint16_t frag_off = ip->frag_off & bpf_htons(IP_OFFSET_MASK);
	return (ip->frag_off & bpf_htons(IP_MF)) != 0 || frag_off > 0;
}
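
/* Note on ipv4_is_fragment(): frag_off carries the MF flag and the
 * 13-bit fragment offset in network byte order, hence the bpf_htons()
 * on the host-order masks. The first fragment (offset zero, MF set) is
 * caught by the MF test, later fragments by the non-zero offset.
 */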

static int pkt_parse_ipv4(struct bpf_dynptr *dynptr, __u64 *offset, struct iphdr *iphdr)
{
	if (bpf_dynptr_read(iphdr, sizeof(*iphdr), dynptr, *offset, 0))
		return -1;

	*offset += sizeof(*iphdr);

	if (iphdr->ihl < 5)
		return -1;

	/* skip ipv4 options */
	*offset += (iphdr->ihl - 5) * 4;

	return 0;
}

/* Parse the L4 ports from a packet, assuming a layout like TCP or UDP. */
static bool pkt_parse_icmp_l4_ports(struct bpf_dynptr *dynptr, __u64 *offset, flow_ports_t *ports)
{
	if (bpf_dynptr_read(ports, sizeof(*ports), dynptr, *offset, 0))
		return false;

	*offset += sizeof(*ports);

	/* Ports in the L4 headers are reversed, since we are parsing an ICMP
	 * payload which is going towards the eyeball.
	 */
	uint16_t dst = ports->src;
	ports->src = ports->dst;
	ports->dst = dst;
	return true;
}

static uint16_t pkt_checksum_fold(uint32_t csum)
{
	/* The highest reasonable value for an IPv4 header
	 * checksum requires two folds, so we just do that always.
	 */
	csum = (csum & 0xffff) + (csum >> 16);
	csum = (csum & 0xffff) + (csum >> 16);
	return (uint16_t)~csum;
}
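
/* Worked example: folding the maximal header sum 0x8fff7 gives
 * 0xfff7 + 0x8 = 0xffff after the first fold, stays 0xffff after the
 * second, and the one's complement yields a checksum of 0x0000.
 */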

static void pkt_ipv4_checksum(struct iphdr *iph)
{
	iph->check = 0;

	/* An IP header without options is 20 bytes. Two of those
	 * are the checksum, which we always set to zero. Hence,
	 * the maximum accumulated value is 18 / 2 * 0xffff = 0x8fff7,
	 * which fits in 32 bit.
	 */
	_Static_assert(sizeof(struct iphdr) == 20, "iphdr must be 20 bytes");
	uint32_t acc = 0;
	uint16_t *ipw = (uint16_t *)iph;

	for (size_t i = 0; i < sizeof(struct iphdr) / 2; i++)
		acc += ipw[i];

	iph->check = pkt_checksum_fold(acc);
}

static bool pkt_skip_ipv6_extension_headers(struct bpf_dynptr *dynptr, __u64 *offset,
					    const struct ipv6hdr *ipv6, uint8_t *upper_proto,
					    bool *is_fragment)
{
	/* We understand five extension headers.
	 * https://tools.ietf.org/html/rfc8200#section-4.1 states that all
	 * headers should occur once, except Destination Options, which may
	 * occur twice. Hence we give up after 6 headers.
	 */
	struct {
		uint8_t next;
		uint8_t len;
	} exthdr = {
		.next = ipv6->nexthdr,
	};
	*is_fragment = false;

	for (int i = 0; i < 6; i++) {
		switch (exthdr.next) {
		case IPPROTO_FRAGMENT:
			*is_fragment = true;
			/* NB: We don't check that hdrlen == 0 as per spec. */
			/* fallthrough; */

		case IPPROTO_HOPOPTS:
		case IPPROTO_ROUTING:
		case IPPROTO_DSTOPTS:
		case IPPROTO_MH:
			if (bpf_dynptr_read(&exthdr, sizeof(exthdr), dynptr, *offset, 0))
				return false;

			/* hdrlen is in 8-octet units, and excludes the first 8 octets. */
			*offset += (exthdr.len + 1) * 8;

			/* Decode next header */
			break;

		default:
			/* The next header is not one of the known extension
			 * headers, treat it as the upper layer header.
			 *
			 * This handles IPPROTO_NONE.
			 *
			 * Encapsulating Security Payload (50) and Authentication
			 * Header (51) also end up here (and will trigger an
			 * unknown proto error later). They have a custom header
			 * format and seem too esoteric to care about.
			 */
			*upper_proto = exthdr.next;
			return true;
		}
	}

	/* We never found an upper layer header. */
	return false;
}
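
/* Example walk: for nexthdr == IPPROTO_HOPOPTS followed by a TCP
 * header, the first iteration reads the Hop-by-Hop header, advances
 * *offset by (len + 1) * 8 octets and loads its next field; the second
 * iteration hits the default case with exthdr.next == IPPROTO_TCP and
 * reports TCP as the upper layer protocol.
 */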

static int pkt_parse_ipv6(struct bpf_dynptr *dynptr, __u64 *offset, struct ipv6hdr *ipv6,
			  uint8_t *proto, bool *is_fragment)
{
	if (bpf_dynptr_read(ipv6, sizeof(*ipv6), dynptr, *offset, 0))
		return -1;

	*offset += sizeof(*ipv6);

	if (!pkt_skip_ipv6_extension_headers(dynptr, offset, ipv6, proto, is_fragment))
		return -1;

	return 0;
}

/* Global metrics, per CPU
 */
struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__uint(max_entries, 1);
	__type(key, unsigned int);
	__type(value, metrics_t);
} metrics_map SEC(".maps");

static metrics_t *get_global_metrics(void)
{
	uint64_t key = 0;
	return bpf_map_lookup_elem(&metrics_map, &key);
}
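
/* Note: metrics_map is a BPF_MAP_TYPE_PERCPU_ARRAY, so each CPU bumps
 * its own copy of the counters without needing atomics; a reader in
 * user space is expected to sum the per-CPU values.
 */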

static ret_t accept_locally(struct __sk_buff *skb, encap_headers_t *encap)
{
	const int payload_off =
		sizeof(*encap) +
		sizeof(struct in_addr) * encap->unigue.hop_count;
	int32_t encap_overhead = payload_off - sizeof(struct ethhdr);

	/* Change the ethertype if the encapsulated packet is IPv6. */
	if (encap->gue.proto_ctype == IPPROTO_IPV6)
		encap->eth.h_proto = bpf_htons(ETH_P_IPV6);

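	/* Shrink the packet by encap_overhead bytes at the MAC layer,
	 * dropping the outer IP/UDP/GUE headers in place, then hand the
	 * decapsulated packet back to the ingress path of the same
	 * device.
	 */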
	if (bpf_skb_adjust_room(skb, -encap_overhead, BPF_ADJ_ROOM_MAC,
				BPF_F_ADJ_ROOM_FIXED_GSO |
				BPF_F_ADJ_ROOM_NO_CSUM_RESET) ||
	    bpf_csum_level(skb, BPF_CSUM_LEVEL_DEC))
		return TC_ACT_SHOT;

	return bpf_redirect(skb->ifindex, BPF_F_INGRESS);
}

static ret_t forward_with_gre(struct __sk_buff *skb, struct bpf_dynptr *dynptr,
			      encap_headers_t *encap, struct in_addr *next_hop,
			      metrics_t *metrics)
{
	const int payload_off =
		sizeof(*encap) +
		sizeof(struct in_addr) * encap->unigue.hop_count;
	int32_t encap_overhead =
		payload_off - sizeof(struct ethhdr) - sizeof(struct iphdr);
	int32_t delta = sizeof(struct gre_base_hdr) - encap_overhead;
	__u8 encap_buffer[sizeof(encap_gre_t)] = {};
	uint16_t proto = ETH_P_IP;
	uint32_t mtu_len = 0;
	encap_gre_t *encap_gre;

	metrics->forwarded_packets_total_gre++;

	/* Loop protection: the inner packet's TTL is decremented as a safeguard
	 * against any forwarding loop. As the only interesting field is the TTL
	 * (hop limit for IPv6), it is easier to use bpf_skb_load_bytes/
	 * bpf_skb_store_bytes, as they handle split packets if needed (no need
	 * for the data to be in the linear section).
	 */
	if (encap->gue.proto_ctype == IPPROTO_IPV6) {
		proto = ETH_P_IPV6;
		uint8_t ttl;
		int rc;

		rc = bpf_skb_load_bytes(
			skb, payload_off + offsetof(struct ipv6hdr, hop_limit),
			&ttl, 1);
		if (rc != 0) {
			metrics->errors_total_malformed_encapsulation++;
			return TC_ACT_SHOT;
		}

		if (ttl == 0) {
			metrics->errors_total_redirect_loop++;
			return TC_ACT_SHOT;
		}

		ttl--;
		rc = bpf_skb_store_bytes(
			skb, payload_off + offsetof(struct ipv6hdr, hop_limit),
			&ttl, 1, 0);
		if (rc != 0) {
			metrics->errors_total_malformed_encapsulation++;
			return TC_ACT_SHOT;
		}
	} else {
		uint8_t ttl;
		int rc;

		rc = bpf_skb_load_bytes(
			skb, payload_off + offsetof(struct iphdr, ttl), &ttl,
			1);
		if (rc != 0) {
			metrics->errors_total_malformed_encapsulation++;
			return TC_ACT_SHOT;
		}

		if (ttl == 0) {
			metrics->errors_total_redirect_loop++;
			return TC_ACT_SHOT;
		}

		/* IPv4 also has a checksum to patch. While the TTL is only one
		 * byte, bpf_l3_csum_replace() only works with 2- and 4-byte
		 * sizes (the result is the same either way).
		 */
		rc = bpf_l3_csum_replace(
			skb, payload_off + offsetof(struct iphdr, check), ttl,
			ttl - 1, 2);
		if (rc != 0) {
			metrics->errors_total_malformed_encapsulation++;
			return TC_ACT_SHOT;
		}

		ttl--;
		rc = bpf_skb_store_bytes(
			skb, payload_off + offsetof(struct iphdr, ttl), &ttl, 1,
			0);
		if (rc != 0) {
			metrics->errors_total_malformed_encapsulation++;
			return TC_ACT_SHOT;
		}
	}

	if (bpf_check_mtu(skb, skb->ifindex, &mtu_len, delta, 0)) {
		metrics->errors_total_encap_mtu_violate++;
		return TC_ACT_SHOT;
	}

	if (bpf_skb_adjust_room(skb, delta, BPF_ADJ_ROOM_NET,
				BPF_F_ADJ_ROOM_FIXED_GSO |
				BPF_F_ADJ_ROOM_NO_CSUM_RESET) ||
	    bpf_csum_level(skb, BPF_CSUM_LEVEL_INC)) {
		metrics->errors_total_encap_adjust_failed++;
		return TC_ACT_SHOT;
	}

	if (bpf_skb_pull_data(skb, sizeof(encap_gre_t))) {
		metrics->errors_total_encap_buffer_too_small++;
		return TC_ACT_SHOT;
	}

	encap_gre = bpf_dynptr_slice_rdwr(dynptr, 0, encap_buffer, sizeof(encap_buffer));
	if (!encap_gre) {
		metrics->errors_total_encap_buffer_too_small++;
		return TC_ACT_SHOT;
	}
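
	/* bpf_dynptr_slice_rdwr() returns a pointer into the packet when
	 * the requested bytes are contiguous, or copies them into
	 * encap_buffer and returns that instead. In the buffer case,
	 * modifications must be flushed back with bpf_dynptr_write(),
	 * which is what the encap_gre == encap_buffer check below does.
	 */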

	encap_gre->ip.protocol = IPPROTO_GRE;
	encap_gre->ip.daddr = next_hop->s_addr;
	encap_gre->ip.saddr = ENCAPSULATION_IP;
	encap_gre->ip.tot_len =
		bpf_htons(bpf_ntohs(encap_gre->ip.tot_len) + delta);
	encap_gre->gre.flags = 0;
	encap_gre->gre.protocol = bpf_htons(proto);
	pkt_ipv4_checksum((void *)&encap_gre->ip);

	if (encap_gre == encap_buffer)
		bpf_dynptr_write(dynptr, 0, encap_buffer, sizeof(encap_buffer), 0);

	return bpf_redirect(skb->ifindex, 0);
}

static ret_t forward_to_next_hop(struct __sk_buff *skb, struct bpf_dynptr *dynptr,
				 encap_headers_t *encap, struct in_addr *next_hop,
				 metrics_t *metrics)
{
	/* Swap the L2 addresses. This assumes that packets are received
	 * from a router, so just swapping the MAC addresses here will
	 * make the packet go back to the router, which will send it to
	 * the appropriate machine.
	 */
	unsigned char temp[ETH_ALEN];
	memcpy(temp, encap->eth.h_dest, sizeof(temp));
	memcpy(encap->eth.h_dest, encap->eth.h_source,
	       sizeof(encap->eth.h_dest));
	memcpy(encap->eth.h_source, temp, sizeof(encap->eth.h_source));

	if (encap->unigue.next_hop == encap->unigue.hop_count - 1 &&
	    encap->unigue.last_hop_gre) {
		return forward_with_gre(skb, dynptr, encap, next_hop, metrics);
	}

	metrics->forwarded_packets_total_gue++;
	uint32_t old_saddr = encap->ip.saddr;
	encap->ip.saddr = encap->ip.daddr;
	encap->ip.daddr = next_hop->s_addr;
	if (encap->unigue.next_hop < encap->unigue.hop_count) {
		encap->unigue.next_hop++;
	}

	/* Remove ip->saddr, add next_hop->s_addr. The Internet checksum
	 * is position independent, so moving the old daddr into saddr
	 * leaves the sum unchanged; only the replacement of the old
	 * source address needs an incremental fixup.
	 */
	const uint64_t off = offsetof(typeof(*encap), ip.check);
	int ret = bpf_l3_csum_replace(skb, off, old_saddr, next_hop->s_addr, 4);
	if (ret < 0) {
		return TC_ACT_SHOT;
	}

	return bpf_redirect(skb->ifindex, 0);
}

static ret_t skip_next_hops(__u64 *offset, int n)
{
	switch (n) {
	case 1:
		*offset += sizeof(struct in_addr);
		/* fallthrough */
	case 0:
		return CONTINUE_PROCESSING;

	default:
		return TC_ACT_SHOT;
	}
}
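
/* Note: skip_next_hops() can skip at most one hop per call, so
 * get_next_hop() below only supports small hop lists; a packet whose
 * header would require skipping two or more hops at once is dropped
 * with TC_ACT_SHOT.
 */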

/* Get the next hop from the GLB header.
 *
 * Sets next_hop->s_addr to 0 if there are no more hops left.
 * *offset points just past the variable length GLB header
 * iff the call is successful.
 */
static ret_t get_next_hop(struct bpf_dynptr *dynptr, __u64 *offset, encap_headers_t *encap,
			  struct in_addr *next_hop)
{
	if (encap->unigue.next_hop > encap->unigue.hop_count)
		return TC_ACT_SHOT;

	/* Skip "used" next hops. */
	MAYBE_RETURN(skip_next_hops(offset, encap->unigue.next_hop));

	if (encap->unigue.next_hop == encap->unigue.hop_count) {
		/* No more next hops, we are at the end of the GLB header. */
		next_hop->s_addr = 0;
		return CONTINUE_PROCESSING;
	}

	if (bpf_dynptr_read(next_hop, sizeof(*next_hop), dynptr, *offset, 0))
		return TC_ACT_SHOT;

	*offset += sizeof(*next_hop);

	/* Skip the remaining next hops (may be zero). */
	return skip_next_hops(offset, encap->unigue.hop_count - encap->unigue.next_hop - 1);
}

/* Fill a bpf_sock_tuple to be used with the socket lookup functions.
 * This is a kludge that lets us work around verifier limitations:
 *
 *    fill_tuple(&t, foo, sizeof(struct iphdr), 123, 321)
 *
 * clang will substitute a constant for sizeof, which allows the verifier
 * to track its value. Based on this, it can figure out the constant
 * return value, and calling code works while still being "generic" to
 * IPv4 and IPv6.
 */
static uint64_t fill_tuple(struct bpf_sock_tuple *tuple, void *iph,
			   uint64_t iphlen, uint16_t sport, uint16_t dport)
{
	switch (iphlen) {
	case sizeof(struct iphdr): {
		struct iphdr *ipv4 = (struct iphdr *)iph;
		tuple->ipv4.daddr = ipv4->daddr;
		tuple->ipv4.saddr = ipv4->saddr;
		tuple->ipv4.sport = sport;
		tuple->ipv4.dport = dport;
		return sizeof(tuple->ipv4);
	}

	case sizeof(struct ipv6hdr): {
		struct ipv6hdr *ipv6 = (struct ipv6hdr *)iph;
		memcpy(&tuple->ipv6.daddr, &ipv6->daddr,
		       sizeof(tuple->ipv6.daddr));
		memcpy(&tuple->ipv6.saddr, &ipv6->saddr,
		       sizeof(tuple->ipv6.saddr));
		tuple->ipv6.sport = sport;
		tuple->ipv6.dport = dport;
		return sizeof(tuple->ipv6);
	}

	default:
		return 0;
	}
}
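
/* For example, process_tcp() below calls
 * fill_tuple(&tuple, info->hdr, info->len, tcp.source, tcp.dest) with
 * info->len == sizeof(struct iphdr) for IPv4, and passes the returned
 * sizeof(tuple->ipv4) on to the socket lookup helpers as the tuple
 * length.
 */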

static verdict_t classify_tcp(struct __sk_buff *skb, struct bpf_sock_tuple *tuple,
			      uint64_t tuplen, void *iph, struct tcphdr *tcp)
{
	struct bpf_sock *sk =
		bpf_skc_lookup_tcp(skb, tuple, tuplen, BPF_F_CURRENT_NETNS, 0);

	if (sk == NULL)
		return UNKNOWN;

	if (sk->state != BPF_TCP_LISTEN) {
		bpf_sk_release(sk);
		return ESTABLISHED;
	}

	if (iph != NULL && tcp != NULL) {
		/* Kludge: we've run out of arguments, but need the length of the ip header. */
		uint64_t iphlen = sizeof(struct iphdr);

		if (tuplen == sizeof(tuple->ipv6))
			iphlen = sizeof(struct ipv6hdr);

		if (bpf_tcp_check_syncookie(sk, iph, iphlen, tcp,
					    sizeof(*tcp)) == 0) {
			bpf_sk_release(sk);
			return SYN_COOKIE;
		}
	}

	bpf_sk_release(sk);
	return UNKNOWN;
}

static verdict_t classify_udp(struct __sk_buff *skb, struct bpf_sock_tuple *tuple, uint64_t tuplen)
{
	struct bpf_sock *sk =
		bpf_sk_lookup_udp(skb, tuple, tuplen, BPF_F_CURRENT_NETNS, 0);

	if (sk == NULL)
		return UNKNOWN;

	if (sk->state == BPF_TCP_ESTABLISHED) {
		bpf_sk_release(sk);
		return ESTABLISHED;
	}

	bpf_sk_release(sk);
	return UNKNOWN;
}

static verdict_t classify_icmp(struct __sk_buff *skb, uint8_t proto, struct bpf_sock_tuple *tuple,
			       uint64_t tuplen, metrics_t *metrics)
{
	switch (proto) {
	case IPPROTO_TCP:
		return classify_tcp(skb, tuple, tuplen, NULL, NULL);

	case IPPROTO_UDP:
		return classify_udp(skb, tuple, tuplen);

	default:
		metrics->errors_total_malformed_icmp++;
		return INVALID;
	}
}

static verdict_t process_icmpv4(struct __sk_buff *skb, struct bpf_dynptr *dynptr, __u64 *offset,
				metrics_t *metrics)
{
	struct icmphdr icmp;
	struct iphdr ipv4;

	if (bpf_dynptr_read(&icmp, sizeof(icmp), dynptr, *offset, 0)) {
		metrics->errors_total_malformed_icmp++;
		return INVALID;
	}

	*offset += sizeof(icmp);

	/* We should never receive encapsulated echo replies. */
	if (icmp.type == ICMP_ECHOREPLY) {
		metrics->errors_total_icmp_echo_replies++;
		return INVALID;
	}

	if (icmp.type == ICMP_ECHO)
		return ECHO_REQUEST;

	if (icmp.type != ICMP_DEST_UNREACH || icmp.code != ICMP_FRAG_NEEDED) {
		metrics->errors_total_unwanted_icmp++;
		return INVALID;
	}

	if (pkt_parse_ipv4(dynptr, offset, &ipv4)) {
		metrics->errors_total_malformed_icmp_pkt_too_big++;
		return INVALID;
	}

	/* The source address in the outer IP header is from the entity that
	 * originated the ICMP message. Use the original IP header to restore
	 * the correct flow tuple.
	 */
	struct bpf_sock_tuple tuple;
	tuple.ipv4.saddr = ipv4.daddr;
	tuple.ipv4.daddr = ipv4.saddr;

	if (!pkt_parse_icmp_l4_ports(dynptr, offset, (flow_ports_t *)&tuple.ipv4.sport)) {
		metrics->errors_total_malformed_icmp_pkt_too_big++;
		return INVALID;
	}

	return classify_icmp(skb, ipv4.protocol, &tuple,
			     sizeof(tuple.ipv4), metrics);
}

static verdict_t process_icmpv6(struct bpf_dynptr *dynptr, __u64 *offset, struct __sk_buff *skb,
				metrics_t *metrics)
{
	struct bpf_sock_tuple tuple;
	struct ipv6hdr ipv6;
	struct icmp6hdr icmp6;
	bool is_fragment;
	uint8_t l4_proto;

	if (bpf_dynptr_read(&icmp6, sizeof(icmp6), dynptr, *offset, 0)) {
		metrics->errors_total_malformed_icmp++;
		return INVALID;
	}

	/* Advance past the ICMPv6 header so the embedded original
	 * packet is parsed from the right position, mirroring
	 * process_icmpv4() above.
	 */
	*offset += sizeof(icmp6);

	/* We should never receive encapsulated echo replies. */
	if (icmp6.icmp6_type == ICMPV6_ECHO_REPLY) {
		metrics->errors_total_icmp_echo_replies++;
		return INVALID;
	}

	if (icmp6.icmp6_type == ICMPV6_ECHO_REQUEST) {
		return ECHO_REQUEST;
	}

	if (icmp6.icmp6_type != ICMPV6_PKT_TOOBIG) {
		metrics->errors_total_unwanted_icmp++;
		return INVALID;
	}

	if (pkt_parse_ipv6(dynptr, offset, &ipv6, &l4_proto, &is_fragment)) {
		metrics->errors_total_malformed_icmp_pkt_too_big++;
		return INVALID;
	}

	if (is_fragment) {
		metrics->errors_total_fragmented_ip++;
		return INVALID;
	}

	/* Swap source and dest addresses. */
	memcpy(&tuple.ipv6.saddr, &ipv6.daddr, sizeof(tuple.ipv6.saddr));
	memcpy(&tuple.ipv6.daddr, &ipv6.saddr, sizeof(tuple.ipv6.daddr));

	if (!pkt_parse_icmp_l4_ports(dynptr, offset, (flow_ports_t *)&tuple.ipv6.sport)) {
		metrics->errors_total_malformed_icmp_pkt_too_big++;
		return INVALID;
	}

	return classify_icmp(skb, l4_proto, &tuple, sizeof(tuple.ipv6),
			     metrics);
}

static verdict_t process_tcp(struct bpf_dynptr *dynptr, __u64 *offset, struct __sk_buff *skb,
			     struct iphdr_info *info, metrics_t *metrics)
{
	struct bpf_sock_tuple tuple;
	struct tcphdr tcp;
	uint64_t tuplen;

	metrics->l4_protocol_packets_total_tcp++;

	if (bpf_dynptr_read(&tcp, sizeof(tcp), dynptr, *offset, 0)) {
		metrics->errors_total_malformed_tcp++;
		return INVALID;
	}

	*offset += sizeof(tcp);

	if (tcp.syn)
		return SYN;

	tuplen = fill_tuple(&tuple, info->hdr, info->len, tcp.source, tcp.dest);
	return classify_tcp(skb, &tuple, tuplen, info->hdr, &tcp);
}

static verdict_t process_udp(struct bpf_dynptr *dynptr, __u64 *offset, struct __sk_buff *skb,
			     struct iphdr_info *info, metrics_t *metrics)
{
	struct bpf_sock_tuple tuple;
	struct udphdr udph;
	uint64_t tuplen;

	metrics->l4_protocol_packets_total_udp++;

	if (bpf_dynptr_read(&udph, sizeof(udph), dynptr, *offset, 0)) {
		metrics->errors_total_malformed_udp++;
		return INVALID;
	}
	*offset += sizeof(udph);

	tuplen = fill_tuple(&tuple, info->hdr, info->len, udph.source, udph.dest);
	return classify_udp(skb, &tuple, tuplen);
}

static verdict_t process_ipv4(struct __sk_buff *skb, struct bpf_dynptr *dynptr,
			      __u64 *offset, metrics_t *metrics)
{
	struct iphdr ipv4;
	struct iphdr_info info = {
		.hdr = &ipv4,
		.len = sizeof(ipv4),
	};

	metrics->l3_protocol_packets_total_ipv4++;

	if (pkt_parse_ipv4(dynptr, offset, &ipv4)) {
		metrics->errors_total_malformed_ip++;
		return INVALID;
	}

	if (ipv4.version != 4) {
		metrics->errors_total_malformed_ip++;
		return INVALID;
	}

	if (ipv4_is_fragment(&ipv4)) {
		metrics->errors_total_fragmented_ip++;
		return INVALID;
	}

	switch (ipv4.protocol) {
	case IPPROTO_ICMP:
		return process_icmpv4(skb, dynptr, offset, metrics);

	case IPPROTO_TCP:
		return process_tcp(dynptr, offset, skb, &info, metrics);

	case IPPROTO_UDP:
		return process_udp(dynptr, offset, skb, &info, metrics);

	default:
		metrics->errors_total_unknown_l4_proto++;
		return INVALID;
	}
}

static verdict_t process_ipv6(struct __sk_buff *skb, struct bpf_dynptr *dynptr,
			      __u64 *offset, metrics_t *metrics)
{
	struct ipv6hdr ipv6;
	struct iphdr_info info = {
		.hdr = &ipv6,
		.len = sizeof(ipv6),
	};
	uint8_t l4_proto;
	bool is_fragment;

	metrics->l3_protocol_packets_total_ipv6++;

	if (pkt_parse_ipv6(dynptr, offset, &ipv6, &l4_proto, &is_fragment)) {
		metrics->errors_total_malformed_ip++;
		return INVALID;
	}

	if (ipv6.version != 6) {
		metrics->errors_total_malformed_ip++;
		return INVALID;
	}

	if (is_fragment) {
		metrics->errors_total_fragmented_ip++;
		return INVALID;
	}

	switch (l4_proto) {
	case IPPROTO_ICMPV6:
		return process_icmpv6(dynptr, offset, skb, metrics);

	case IPPROTO_TCP:
		return process_tcp(dynptr, offset, skb, &info, metrics);

	case IPPROTO_UDP:
		return process_udp(dynptr, offset, skb, &info, metrics);

	default:
		metrics->errors_total_unknown_l4_proto++;
		return INVALID;
	}
}

SEC("tc")
int cls_redirect(struct __sk_buff *skb)
{
	__u8 encap_buffer[sizeof(encap_headers_t)] = {};
	struct bpf_dynptr dynptr;
	struct in_addr next_hop;
	/* Tracks offset of the dynptr. This will be unnecessary once
	 * bpf_dynptr_advance() is available.
	 */
	__u64 off = 0;
	ret_t ret;

	bpf_dynptr_from_skb(skb, 0, &dynptr);

	metrics_t *metrics = get_global_metrics();
	if (metrics == NULL)
		return TC_ACT_SHOT;

	metrics->processed_packets_total++;

	/* Pass bogus packets as long as we're not sure they're
	 * destined for us.
	 */
	if (skb->protocol != bpf_htons(ETH_P_IP))
		return TC_ACT_OK;

	encap_headers_t *encap;

	/* Make sure that all encapsulation headers are available in
	 * the linear portion of the skb. This makes it easy to manipulate them.
	 */
	if (bpf_skb_pull_data(skb, sizeof(*encap)))
		return TC_ACT_OK;

	encap = bpf_dynptr_slice_rdwr(&dynptr, 0, encap_buffer, sizeof(encap_buffer));
	if (!encap)
		return TC_ACT_OK;

	off += sizeof(*encap);

	if (encap->ip.ihl != 5)
		/* We never have any options. */
		return TC_ACT_OK;

	if (encap->ip.daddr != ENCAPSULATION_IP ||
	    encap->ip.protocol != IPPROTO_UDP)
		return TC_ACT_OK;

	/* TODO Check UDP length? */
	if (encap->udp.dest != ENCAPSULATION_PORT)
		return TC_ACT_OK;

	/* We now know that the packet is destined for us, so we can
	 * drop bogus ones.
	 */
	if (ipv4_is_fragment((void *)&encap->ip)) {
		metrics->errors_total_fragmented_ip++;
		return TC_ACT_SHOT;
	}

	if (encap->gue.variant != 0) {
		metrics->errors_total_malformed_encapsulation++;
		return TC_ACT_SHOT;
	}

	if (encap->gue.control != 0) {
		metrics->errors_total_malformed_encapsulation++;
		return TC_ACT_SHOT;
	}

	if (encap->gue.flags != 0) {
		metrics->errors_total_malformed_encapsulation++;
		return TC_ACT_SHOT;
	}

	if (encap->gue.hlen !=
	    sizeof(encap->unigue) / 4 + encap->unigue.hop_count) {
		metrics->errors_total_malformed_encapsulation++;
		return TC_ACT_SHOT;
	}

	if (encap->unigue.version != 0) {
		metrics->errors_total_malformed_encapsulation++;
		return TC_ACT_SHOT;
	}

	if (encap->unigue.reserved != 0)
		return TC_ACT_SHOT;

	MAYBE_RETURN(get_next_hop(&dynptr, &off, encap, &next_hop));

	if (next_hop.s_addr == 0) {
		metrics->accepted_packets_total_last_hop++;
		return accept_locally(skb, encap);
	}

	verdict_t verdict;
	switch (encap->gue.proto_ctype) {
	case IPPROTO_IPIP:
		verdict = process_ipv4(skb, &dynptr, &off, metrics);
		break;

	case IPPROTO_IPV6:
		verdict = process_ipv6(skb, &dynptr, &off, metrics);
		break;

	default:
		metrics->errors_total_unknown_l3_proto++;
		return TC_ACT_SHOT;
	}

	switch (verdict) {
	case INVALID:
		/* metrics have already been bumped */
		return TC_ACT_SHOT;

	case UNKNOWN:
		return forward_to_next_hop(skb, &dynptr, encap, &next_hop, metrics);

	case ECHO_REQUEST:
		metrics->accepted_packets_total_icmp_echo_request++;
		break;

	case SYN:
		if (encap->unigue.forward_syn) {
			return forward_to_next_hop(skb, &dynptr, encap, &next_hop,
						   metrics);
		}

		metrics->accepted_packets_total_syn++;
		break;

	case SYN_COOKIE:
		metrics->accepted_packets_total_syn_cookies++;
		break;

	case ESTABLISHED:
		metrics->accepted_packets_total_established++;
		break;
	}

	ret = accept_locally(skb, encap);

	if (encap == encap_buffer)
		bpf_dynptr_write(&dynptr, 0, encap_buffer, sizeof(encap_buffer), 0);

	return ret;
}