1 // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
2 // Copyright (c) 2019, 2020 Cloudflare
3
4 #include <stdbool.h>
5 #include <stddef.h>
6 #include <stdint.h>
7 #include <string.h>
8
9 #include <linux/bpf.h>
10 #include <linux/icmp.h>
11 #include <linux/icmpv6.h>
12 #include <linux/if_ether.h>
13 #include <linux/in.h>
14 #include <linux/ip.h>
15 #include <linux/ipv6.h>
16 #include <linux/pkt_cls.h>
17 #include <linux/tcp.h>
18 #include <linux/udp.h>
19
20 #include <bpf/bpf_helpers.h>
21 #include <bpf/bpf_endian.h>
22
23 #include "test_cls_redirect.h"
24
/* INLINING is applied to most helper functions in this file. It expands to
 * __noinline when SUBPROGS is defined and to __always_inline otherwise, so
 * the same source can be built with the helpers either kept as separate
 * functions or inlined into their callers.
 */
#ifdef SUBPROGS
#define INLINING __noinline
#else
#define INLINING __always_inline
#endif

/* Offset of the first byte after MEMBER inside TYPE. */
#define offsetofend(TYPE, MEMBER) \
	(offsetof(TYPE, MEMBER) + sizeof((((TYPE *)0)->MEMBER)))

/* Bit masks for iphdr.frag_off (host byte order; ipv4_is_fragment() converts
 * them with bpf_htons() before use): the fragment offset bits and the
 * "more fragments" flag.
 */
#define IP_OFFSET_MASK (0x1FFF)
#define IP_MF (0x2000)
36
/* License string, placed in the "license" section. */
char _license[] SEC("license") = "Dual BSD/GPL";

/**
 * Destination port and IP used for UDP encapsulation.
 *
 * cls_redirect() compares both values directly against the outer UDP
 * destination port and outer IPv4 destination address, so they are kept in
 * network byte order. ENCAPSULATION_IP is also written as the source address
 * of the outer header built by forward_with_gre().
 */
volatile const __be16 ENCAPSULATION_PORT;
volatile const __be32 ENCAPSULATION_IP;
44
/* Counters stored in metrics_map. The program only ever increments them. */
typedef struct {
	/* Bumped by cls_redirect() once the metrics lookup succeeded. */
	uint64_t processed_packets_total;
	/* Inner packets by L3 protocol (process_ipv4 / process_ipv6). */
	uint64_t l3_protocol_packets_total_ipv4;
	uint64_t l3_protocol_packets_total_ipv6;
	/* Inner packets by L4 protocol (process_tcp / process_udp). */
	uint64_t l4_protocol_packets_total_tcp;
	uint64_t l4_protocol_packets_total_udp;
	/* Packets passed to accept_locally(), by the reason decided in
	 * cls_redirect().
	 */
	uint64_t accepted_packets_total_syn;
	uint64_t accepted_packets_total_syn_cookies;
	uint64_t accepted_packets_total_last_hop;
	uint64_t accepted_packets_total_icmp_echo_request;
	uint64_t accepted_packets_total_established;
	/* Packets forwarded to the next hop, by outgoing encapsulation. */
	uint64_t forwarded_packets_total_gue;
	uint64_t forwarded_packets_total_gre;

	/* Error counters, one per failure reason. */
	uint64_t errors_total_unknown_l3_proto;
	uint64_t errors_total_unknown_l4_proto;
	uint64_t errors_total_malformed_ip;
	uint64_t errors_total_fragmented_ip;
	uint64_t errors_total_malformed_icmp;
	uint64_t errors_total_unwanted_icmp;
	uint64_t errors_total_malformed_icmp_pkt_too_big;
	uint64_t errors_total_malformed_tcp;
	uint64_t errors_total_malformed_udp;
	uint64_t errors_total_icmp_echo_replies;
	uint64_t errors_total_malformed_encapsulation;
	uint64_t errors_total_encap_adjust_failed;
	uint64_t errors_total_encap_buffer_too_small;
	uint64_t errors_total_redirect_loop;
	uint64_t errors_total_encap_mtu_violate;
} metrics_t;
75
/* Result of classifying the encapsulated packet. cls_redirect() maps each
 * value to an action in its final switch.
 */
typedef enum {
	/* Packet is dropped; the function returning this has already bumped
	 * an error metric.
	 */
	INVALID = 0,
	/* Flow was not recognised locally; the packet is forwarded to the
	 * next hop.
	 */
	UNKNOWN,
	/* ICMP / ICMPv6 echo request; accepted locally. */
	ECHO_REQUEST,
	/* TCP segment with the SYN flag set; accepted locally unless the
	 * header's forward_syn flag is set.
	 */
	SYN,
	/* Listening socket found and bpf_tcp_check_syncookie() returned 0. */
	SYN_COOKIE,
	/* A non-listening TCP socket, or a UDP socket in state
	 * BPF_TCP_ESTABLISHED, matches the flow.
	 */
	ESTABLISHED,
} verdict_t;
84
85 typedef struct {
86 uint16_t src, dst;
87 } flow_ports_t;
88
89 _Static_assert(
90 sizeof(flow_ports_t) !=
91 offsetofend(struct bpf_sock_tuple, ipv4.dport) -
92 offsetof(struct bpf_sock_tuple, ipv4.sport) - 1,
93 "flow_ports_t must match sport and dport in struct bpf_sock_tuple");
94 _Static_assert(
95 sizeof(flow_ports_t) !=
96 offsetofend(struct bpf_sock_tuple, ipv6.dport) -
97 offsetof(struct bpf_sock_tuple, ipv6.sport) - 1,
98 "flow_ports_t must match sport and dport in struct bpf_sock_tuple");
99
/* Return type of helpers that either finish packet processing with an
 * action code or ask their caller to carry on.
 */
typedef int ret_t;

/* Special ret_t value meaning "no decision yet, keep processing". It lets
 * helpers return action codes such as TC_ACT_SHOT directly while still
 * being able to signal that the regular flow of the program should
 * continue.
 */
static const ret_t CONTINUE_PROCESSING = -1;

/* Evaluate x, an expression of type ret_t, once. If the result is anything
 * other than CONTINUE_PROCESSING, return it from the enclosing function.
 */
#define MAYBE_RETURN(x)                                    \
	do {                                               \
		ret_t maybe_ret_ = x;                      \
		if (maybe_ret_ == CONTINUE_PROCESSING)     \
			break;                             \
		return maybe_ret_;                         \
	} while (0)
116
/* Linux packet pointers are either aligned to NET_IP_ALIGN (aka 2 bytes),
 * or not aligned if the arch supports efficient unaligned access.
 *
 * Since the verifier ensures that eBPF packet accesses follow these rules,
 * we can tell LLVM to emit code as if we always had a larger alignment.
 * It will yell at us if we end up on a platform where this is not valid.
 */
typedef uint8_t *net_ptr __attribute__((align_value(8)));

/* Parsing cursor over a packet.
 *
 * skb:  the packet being processed; used for helper calls and to read
 *       skb->data / skb->len.
 * head: current read position; advanced by buf_copy(), buf_skip() and
 *       buf_assign().
 * tail: end of the directly accessible data; the users in this file
 *       initialise it from skb->data_end.
 */
typedef struct buf {
	struct __sk_buff *skb;
	net_ptr head;
	/* NB: tail mustn't have alignment other than 1, otherwise
	 * LLVM will go and eliminate code, e.g. when checking packet lengths.
	 */
	uint8_t *const tail;
} buf_t;
134
/* Return the current read position of buf as a byte offset from skb->data,
 * i.e. buf->head - buf->skb->data.
 */
static __always_inline size_t buf_off(const buf_t *buf)
{
	/* Clang seems to optimize constructs like
	 *     a - b + c
	 * if c is known:
	 *     r? = c
	 *     r? -= b
	 *     r? += a
	 *
	 * This is a problem if a and b are packet pointers,
	 * since the verifier allows subtracting two pointers to
	 * get a scalar, but not a scalar and a pointer.
	 *
	 * Use inline asm to break this optimization.
	 */
	size_t off = (size_t)buf->head;
	/* off -= skb->data, done in asm so the subtraction stays a single
	 * instruction on the value loaded from head.
	 */
	asm("%0 -= %1" : "+r"(off) : "r"(buf->skb->data));
	return off;
}
154
/* Copy len bytes from the current position of buf into dst using
 * bpf_skb_load_bytes() and advance buf past them.
 *
 * Returns true on success. If the helper reports an error, false is
 * returned and the position of buf is left unchanged.
 */
static __always_inline bool buf_copy(buf_t *buf, void *dst, size_t len)
{
	const bool loaded =
		bpf_skb_load_bytes(buf->skb, buf_off(buf), dst, len) == 0;

	if (loaded) {
		buf->head += len;
	}

	return loaded;
}
164
/* Advance buf by len bytes without reading them.
 *
 * The new position is validated against skb->len rather than buf->tail, so
 * skipping into the non-linear part of the packet is allowed. Returns false,
 * leaving buf unchanged, if the skip would move past skb->len.
 */
static __always_inline bool buf_skip(buf_t *buf, const size_t len)
{
	const size_t new_off = buf_off(buf) + len;

	if (new_off <= buf->skb->len) {
		buf->head += len;
		return true;
	}

	return false;
}
175
176 /* Returns a pointer to the start of buf, or NULL if len is
177 * larger than the remaining data. Consumes len bytes on a successful
178 * call.
179 *
180 * If scratch is not NULL, the function will attempt to load non-linear
181 * data via bpf_skb_load_bytes. On success, scratch is returned.
182 */
buf_assign(buf_t * buf,const size_t len,void * scratch)183 static __always_inline void *buf_assign(buf_t *buf, const size_t len, void *scratch)
184 {
185 if (buf->head + len > buf->tail) {
186 if (scratch == NULL) {
187 return NULL;
188 }
189
190 return buf_copy(buf, scratch, len) ? scratch : NULL;
191 }
192
193 void *ptr = buf->head;
194 buf->head += len;
195 return ptr;
196 }
197
pkt_skip_ipv4_options(buf_t * buf,const struct iphdr * ipv4)198 static INLINING bool pkt_skip_ipv4_options(buf_t *buf, const struct iphdr *ipv4)
199 {
200 if (ipv4->ihl <= 5) {
201 return true;
202 }
203
204 return buf_skip(buf, (ipv4->ihl - 5) * 4);
205 }
206
ipv4_is_fragment(const struct iphdr * ip)207 static INLINING bool ipv4_is_fragment(const struct iphdr *ip)
208 {
209 uint16_t frag_off = ip->frag_off & bpf_htons(IP_OFFSET_MASK);
210 return (ip->frag_off & bpf_htons(IP_MF)) != 0 || frag_off > 0;
211 }
212
/* Consume an IPv4 header, including options, from pkt.
 *
 * The fixed header is obtained with buf_assign(), so the result either
 * points into the packet or at scratch. Returns NULL if the header is not
 * available, if ihl is below the minimum of 5 words, or if the options
 * cannot be skipped.
 */
static __always_inline struct iphdr *pkt_parse_ipv4(buf_t *pkt, struct iphdr *scratch)
{
	struct iphdr *hdr = buf_assign(pkt, sizeof(*hdr), scratch);

	if (hdr == NULL || hdr->ihl < 5 || !pkt_skip_ipv4_options(pkt, hdr)) {
		return NULL;
	}

	return hdr;
}
230
231 /* Parse the L4 ports from a packet, assuming a layout like TCP or UDP. */
pkt_parse_icmp_l4_ports(buf_t * pkt,flow_ports_t * ports)232 static INLINING bool pkt_parse_icmp_l4_ports(buf_t *pkt, flow_ports_t *ports)
233 {
234 if (!buf_copy(pkt, ports, sizeof(*ports))) {
235 return false;
236 }
237
238 /* Ports in the L4 headers are reversed, since we are parsing an ICMP
239 * payload which is going towards the eyeball.
240 */
241 uint16_t dst = ports->src;
242 ports->src = ports->dst;
243 ports->dst = dst;
244 return true;
245 }
246
pkt_checksum_fold(uint32_t csum)247 static INLINING uint16_t pkt_checksum_fold(uint32_t csum)
248 {
249 /* The highest reasonable value for an IPv4 header
250 * checksum requires two folds, so we just do that always.
251 */
252 csum = (csum & 0xffff) + (csum >> 16);
253 csum = (csum & 0xffff) + (csum >> 16);
254 return (uint16_t)~csum;
255 }
256
pkt_ipv4_checksum(struct iphdr * iph)257 static INLINING void pkt_ipv4_checksum(struct iphdr *iph)
258 {
259 iph->check = 0;
260
261 /* An IP header without options is 20 bytes. Two of those
262 * are the checksum, which we always set to zero. Hence,
263 * the maximum accumulated value is 18 / 2 * 0xffff = 0x8fff7,
264 * which fits in 32 bit.
265 */
266 _Static_assert(sizeof(struct iphdr) == 20, "iphdr must be 20 bytes");
267 uint32_t acc = 0;
268 uint16_t *ipw = (uint16_t *)iph;
269
270 #pragma clang loop unroll(full)
271 for (size_t i = 0; i < sizeof(struct iphdr) / 2; i++) {
272 acc += ipw[i];
273 }
274
275 iph->check = pkt_checksum_fold(acc);
276 }
277
278 static INLINING
pkt_skip_ipv6_extension_headers(buf_t * pkt,const struct ipv6hdr * ipv6,uint8_t * upper_proto,bool * is_fragment)279 bool pkt_skip_ipv6_extension_headers(buf_t *pkt,
280 const struct ipv6hdr *ipv6,
281 uint8_t *upper_proto,
282 bool *is_fragment)
283 {
284 /* We understand five extension headers.
285 * https://tools.ietf.org/html/rfc8200#section-4.1 states that all
286 * headers should occur once, except Destination Options, which may
287 * occur twice. Hence we give up after 6 headers.
288 */
289 struct {
290 uint8_t next;
291 uint8_t len;
292 } exthdr = {
293 .next = ipv6->nexthdr,
294 };
295 *is_fragment = false;
296
297 #pragma clang loop unroll(full)
298 for (int i = 0; i < 6; i++) {
299 switch (exthdr.next) {
300 case IPPROTO_FRAGMENT:
301 *is_fragment = true;
302 /* NB: We don't check that hdrlen == 0 as per spec. */
303 /* fallthrough; */
304
305 case IPPROTO_HOPOPTS:
306 case IPPROTO_ROUTING:
307 case IPPROTO_DSTOPTS:
308 case IPPROTO_MH:
309 if (!buf_copy(pkt, &exthdr, sizeof(exthdr))) {
310 return false;
311 }
312
313 /* hdrlen is in 8-octet units, and excludes the first 8 octets. */
314 if (!buf_skip(pkt,
315 (exthdr.len + 1) * 8 - sizeof(exthdr))) {
316 return false;
317 }
318
319 /* Decode next header */
320 break;
321
322 default:
323 /* The next header is not one of the known extension
324 * headers, treat it as the upper layer header.
325 *
326 * This handles IPPROTO_NONE.
327 *
328 * Encapsulating Security Payload (50) and Authentication
329 * Header (51) also end up here (and will trigger an
330 * unknown proto error later). They have a custom header
331 * format and seem too esoteric to care about.
332 */
333 *upper_proto = exthdr.next;
334 return true;
335 }
336 }
337
338 /* We never found an upper layer header. */
339 return false;
340 }
341
342 /* This function has to be inlined, because the verifier otherwise rejects it
343 * due to returning a pointer to the stack. This is technically correct, since
344 * scratch is allocated on the stack. However, this usage should be safe since
345 * it's the callers stack after all.
346 */
347 static __always_inline struct ipv6hdr *
pkt_parse_ipv6(buf_t * pkt,struct ipv6hdr * scratch,uint8_t * proto,bool * is_fragment)348 pkt_parse_ipv6(buf_t *pkt, struct ipv6hdr *scratch, uint8_t *proto,
349 bool *is_fragment)
350 {
351 struct ipv6hdr *ipv6 = buf_assign(pkt, sizeof(*ipv6), scratch);
352 if (ipv6 == NULL) {
353 return NULL;
354 }
355
356 if (!pkt_skip_ipv6_extension_headers(pkt, ipv6, proto, is_fragment)) {
357 return NULL;
358 }
359
360 return ipv6;
361 }
362
/* Global metrics, per CPU.
 *
 * A per-CPU array with a single entry (key 0) that holds one metrics_t;
 * get_global_metrics() looks it up.
 */
struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__uint(max_entries, 1);
	__type(key, unsigned int);
	__type(value, metrics_t);
} metrics_map SEC(".maps");
371
get_global_metrics(void)372 static INLINING metrics_t *get_global_metrics(void)
373 {
374 uint64_t key = 0;
375 return bpf_map_lookup_elem(&metrics_map, &key);
376 }
377
accept_locally(struct __sk_buff * skb,encap_headers_t * encap)378 static INLINING ret_t accept_locally(struct __sk_buff *skb, encap_headers_t *encap)
379 {
380 const int payload_off =
381 sizeof(*encap) +
382 sizeof(struct in_addr) * encap->unigue.hop_count;
383 int32_t encap_overhead = payload_off - sizeof(struct ethhdr);
384
385 // Changing the ethertype if the encapsulated packet is ipv6
386 if (encap->gue.proto_ctype == IPPROTO_IPV6) {
387 encap->eth.h_proto = bpf_htons(ETH_P_IPV6);
388 }
389
390 if (bpf_skb_adjust_room(skb, -encap_overhead, BPF_ADJ_ROOM_MAC,
391 BPF_F_ADJ_ROOM_FIXED_GSO |
392 BPF_F_ADJ_ROOM_NO_CSUM_RESET) ||
393 bpf_csum_level(skb, BPF_CSUM_LEVEL_DEC))
394 return TC_ACT_SHOT;
395
396 return bpf_redirect(skb->ifindex, BPF_F_INGRESS);
397 }
398
/* Forward the inner packet to next_hop using GRE instead of the incoming
 * encapsulation.
 *
 * Steps, in order:
 *  1. decrement the TTL / hop limit of the inner packet, dropping it when
 *     it is already 0;
 *  2. check the resulting size with bpf_check_mtu();
 *  3. resize the headers by delta with bpf_skb_adjust_room() and raise the
 *     checksum level;
 *  4. pull and re-acquire the headers as encap_gre_t, fill in the outer IP
 *     and GRE fields and recompute the outer IP checksum;
 *  5. redirect the packet out of the interface it arrived on.
 *
 * Every failure bumps a metric and returns TC_ACT_SHOT; otherwise the result
 * of bpf_redirect() is returned.
 */
static INLINING ret_t forward_with_gre(struct __sk_buff *skb, encap_headers_t *encap,
				       struct in_addr *next_hop, metrics_t *metrics)
{
	metrics->forwarded_packets_total_gre++;

	/* Offset of the inner packet: fixed headers plus the hop list. */
	const int payload_off =
		sizeof(*encap) +
		sizeof(struct in_addr) * encap->unigue.hop_count;
	/* Bytes in front of the inner packet, not counting the Ethernet and
	 * outer IPv4 headers.
	 */
	int32_t encap_overhead =
		payload_off - sizeof(struct ethhdr) - sizeof(struct iphdr);
	/* Size change needed so that only a GRE base header remains there. */
	int32_t delta = sizeof(struct gre_base_hdr) - encap_overhead;
	uint16_t proto = ETH_P_IP;
	uint32_t mtu_len = 0;

	/* Loop protection: the inner packet's TTL is decremented as a safeguard
	 * against any forwarding loop. As the only interesting field is the TTL
	 * hop limit for IPv6, it is easier to use bpf_skb_load_bytes/bpf_skb_store_bytes
	 * as they handle the split packets if needed (no need for the data to be
	 * in the linear section).
	 */
	if (encap->gue.proto_ctype == IPPROTO_IPV6) {
		proto = ETH_P_IPV6;
		uint8_t ttl;
		int rc;

		rc = bpf_skb_load_bytes(
			skb, payload_off + offsetof(struct ipv6hdr, hop_limit),
			&ttl, 1);
		if (rc != 0) {
			metrics->errors_total_malformed_encapsulation++;
			return TC_ACT_SHOT;
		}

		if (ttl == 0) {
			metrics->errors_total_redirect_loop++;
			return TC_ACT_SHOT;
		}

		ttl--;
		rc = bpf_skb_store_bytes(
			skb, payload_off + offsetof(struct ipv6hdr, hop_limit),
			&ttl, 1, 0);
		if (rc != 0) {
			metrics->errors_total_malformed_encapsulation++;
			return TC_ACT_SHOT;
		}
	} else {
		uint8_t ttl;
		int rc;

		rc = bpf_skb_load_bytes(
			skb, payload_off + offsetof(struct iphdr, ttl), &ttl,
			1);
		if (rc != 0) {
			metrics->errors_total_malformed_encapsulation++;
			return TC_ACT_SHOT;
		}

		if (ttl == 0) {
			metrics->errors_total_redirect_loop++;
			return TC_ACT_SHOT;
		}

		/* IPv4 also has a checksum to patch. While the TTL is only one byte,
		 * this function only works for 2 and 4 bytes arguments (the result is
		 * the same).
		 */
		rc = bpf_l3_csum_replace(
			skb, payload_off + offsetof(struct iphdr, check), ttl,
			ttl - 1, 2);
		if (rc != 0) {
			metrics->errors_total_malformed_encapsulation++;
			return TC_ACT_SHOT;
		}

		ttl--;
		rc = bpf_skb_store_bytes(
			skb, payload_off + offsetof(struct iphdr, ttl), &ttl, 1,
			0);
		if (rc != 0) {
			metrics->errors_total_malformed_encapsulation++;
			return TC_ACT_SHOT;
		}
	}

	/* Refuse to forward if the resized packet violates the MTU check. */
	if (bpf_check_mtu(skb, skb->ifindex, &mtu_len, delta, 0)) {
		metrics->errors_total_encap_mtu_violate++;
		return TC_ACT_SHOT;
	}

	if (bpf_skb_adjust_room(skb, delta, BPF_ADJ_ROOM_NET,
				BPF_F_ADJ_ROOM_FIXED_GSO |
				BPF_F_ADJ_ROOM_NO_CSUM_RESET) ||
	    bpf_csum_level(skb, BPF_CSUM_LEVEL_INC)) {
		metrics->errors_total_encap_adjust_failed++;
		return TC_ACT_SHOT;
	}

	/* Make sure the new headers are in the linear part again. */
	if (bpf_skb_pull_data(skb, sizeof(encap_gre_t))) {
		metrics->errors_total_encap_buffer_too_small++;
		return TC_ACT_SHOT;
	}

	/* Build a fresh cursor from the current skb->data / skb->data_end
	 * instead of reusing the encap pointer after the packet was resized.
	 */
	buf_t pkt = {
		.skb = skb,
		.head = (uint8_t *)(long)skb->data,
		.tail = (uint8_t *)(long)skb->data_end,
	};

	encap_gre_t *encap_gre = buf_assign(&pkt, sizeof(encap_gre_t), NULL);
	if (encap_gre == NULL) {
		metrics->errors_total_encap_buffer_too_small++;
		return TC_ACT_SHOT;
	}

	encap_gre->ip.protocol = IPPROTO_GRE;
	encap_gre->ip.daddr = next_hop->s_addr;
	encap_gre->ip.saddr = ENCAPSULATION_IP;
	/* The total length changes by the same delta as the packet. */
	encap_gre->ip.tot_len =
		bpf_htons(bpf_ntohs(encap_gre->ip.tot_len) + delta);
	encap_gre->gre.flags = 0;
	encap_gre->gre.protocol = bpf_htons(proto);
	pkt_ipv4_checksum((void *)&encap_gre->ip);

	return bpf_redirect(skb->ifindex, 0);
}
525
forward_to_next_hop(struct __sk_buff * skb,encap_headers_t * encap,struct in_addr * next_hop,metrics_t * metrics)526 static INLINING ret_t forward_to_next_hop(struct __sk_buff *skb, encap_headers_t *encap,
527 struct in_addr *next_hop, metrics_t *metrics)
528 {
529 /* swap L2 addresses */
530 /* This assumes that packets are received from a router.
531 * So just swapping the MAC addresses here will make the packet go back to
532 * the router, which will send it to the appropriate machine.
533 */
534 unsigned char temp[ETH_ALEN];
535 memcpy(temp, encap->eth.h_dest, sizeof(temp));
536 memcpy(encap->eth.h_dest, encap->eth.h_source,
537 sizeof(encap->eth.h_dest));
538 memcpy(encap->eth.h_source, temp, sizeof(encap->eth.h_source));
539
540 if (encap->unigue.next_hop == encap->unigue.hop_count - 1 &&
541 encap->unigue.last_hop_gre) {
542 return forward_with_gre(skb, encap, next_hop, metrics);
543 }
544
545 metrics->forwarded_packets_total_gue++;
546 uint32_t old_saddr = encap->ip.saddr;
547 encap->ip.saddr = encap->ip.daddr;
548 encap->ip.daddr = next_hop->s_addr;
549 if (encap->unigue.next_hop < encap->unigue.hop_count) {
550 encap->unigue.next_hop++;
551 }
552
553 /* Remove ip->saddr, add next_hop->s_addr */
554 const uint64_t off = offsetof(typeof(*encap), ip.check);
555 int ret = bpf_l3_csum_replace(skb, off, old_saddr, next_hop->s_addr, 4);
556 if (ret < 0) {
557 return TC_ACT_SHOT;
558 }
559
560 return bpf_redirect(skb->ifindex, 0);
561 }
562
skip_next_hops(buf_t * pkt,int n)563 static INLINING ret_t skip_next_hops(buf_t *pkt, int n)
564 {
565 switch (n) {
566 case 1:
567 if (!buf_skip(pkt, sizeof(struct in_addr)))
568 return TC_ACT_SHOT;
569 case 0:
570 return CONTINUE_PROCESSING;
571
572 default:
573 return TC_ACT_SHOT;
574 }
575 }
576
577 /* Get the next hop from the GLB header.
578 *
579 * Sets next_hop->s_addr to 0 if there are no more hops left.
580 * pkt is positioned just after the variable length GLB header
581 * iff the call is successful.
582 */
get_next_hop(buf_t * pkt,encap_headers_t * encap,struct in_addr * next_hop)583 static INLINING ret_t get_next_hop(buf_t *pkt, encap_headers_t *encap,
584 struct in_addr *next_hop)
585 {
586 if (encap->unigue.next_hop > encap->unigue.hop_count) {
587 return TC_ACT_SHOT;
588 }
589
590 /* Skip "used" next hops. */
591 MAYBE_RETURN(skip_next_hops(pkt, encap->unigue.next_hop));
592
593 if (encap->unigue.next_hop == encap->unigue.hop_count) {
594 /* No more next hops, we are at the end of the GLB header. */
595 next_hop->s_addr = 0;
596 return CONTINUE_PROCESSING;
597 }
598
599 if (!buf_copy(pkt, next_hop, sizeof(*next_hop))) {
600 return TC_ACT_SHOT;
601 }
602
603 /* Skip the remaining next hops (may be zero). */
604 return skip_next_hops(pkt, encap->unigue.hop_count -
605 encap->unigue.next_hop - 1);
606 }
607
608 /* Fill a bpf_sock_tuple to be used with the socket lookup functions.
609 * This is a kludge that let's us work around verifier limitations:
610 *
611 * fill_tuple(&t, foo, sizeof(struct iphdr), 123, 321)
612 *
613 * clang will substitute a constant for sizeof, which allows the verifier
614 * to track its value. Based on this, it can figure out the constant
615 * return value, and calling code works while still being "generic" to
616 * IPv4 and IPv6.
617 */
fill_tuple(struct bpf_sock_tuple * tuple,void * iph,uint64_t iphlen,uint16_t sport,uint16_t dport)618 static INLINING uint64_t fill_tuple(struct bpf_sock_tuple *tuple, void *iph,
619 uint64_t iphlen, uint16_t sport, uint16_t dport)
620 {
621 switch (iphlen) {
622 case sizeof(struct iphdr): {
623 struct iphdr *ipv4 = (struct iphdr *)iph;
624 tuple->ipv4.daddr = ipv4->daddr;
625 tuple->ipv4.saddr = ipv4->saddr;
626 tuple->ipv4.sport = sport;
627 tuple->ipv4.dport = dport;
628 return sizeof(tuple->ipv4);
629 }
630
631 case sizeof(struct ipv6hdr): {
632 struct ipv6hdr *ipv6 = (struct ipv6hdr *)iph;
633 memcpy(&tuple->ipv6.daddr, &ipv6->daddr,
634 sizeof(tuple->ipv6.daddr));
635 memcpy(&tuple->ipv6.saddr, &ipv6->saddr,
636 sizeof(tuple->ipv6.saddr));
637 tuple->ipv6.sport = sport;
638 tuple->ipv6.dport = dport;
639 return sizeof(tuple->ipv6);
640 }
641
642 default:
643 return 0;
644 }
645 }
646
classify_tcp(struct __sk_buff * skb,struct bpf_sock_tuple * tuple,uint64_t tuplen,void * iph,struct tcphdr * tcp)647 static INLINING verdict_t classify_tcp(struct __sk_buff *skb,
648 struct bpf_sock_tuple *tuple, uint64_t tuplen,
649 void *iph, struct tcphdr *tcp)
650 {
651 struct bpf_sock *sk =
652 bpf_skc_lookup_tcp(skb, tuple, tuplen, BPF_F_CURRENT_NETNS, 0);
653 if (sk == NULL) {
654 return UNKNOWN;
655 }
656
657 if (sk->state != BPF_TCP_LISTEN) {
658 bpf_sk_release(sk);
659 return ESTABLISHED;
660 }
661
662 if (iph != NULL && tcp != NULL) {
663 /* Kludge: we've run out of arguments, but need the length of the ip header. */
664 uint64_t iphlen = sizeof(struct iphdr);
665 if (tuplen == sizeof(tuple->ipv6)) {
666 iphlen = sizeof(struct ipv6hdr);
667 }
668
669 if (bpf_tcp_check_syncookie(sk, iph, iphlen, tcp,
670 sizeof(*tcp)) == 0) {
671 bpf_sk_release(sk);
672 return SYN_COOKIE;
673 }
674 }
675
676 bpf_sk_release(sk);
677 return UNKNOWN;
678 }
679
classify_udp(struct __sk_buff * skb,struct bpf_sock_tuple * tuple,uint64_t tuplen)680 static INLINING verdict_t classify_udp(struct __sk_buff *skb,
681 struct bpf_sock_tuple *tuple, uint64_t tuplen)
682 {
683 struct bpf_sock *sk =
684 bpf_sk_lookup_udp(skb, tuple, tuplen, BPF_F_CURRENT_NETNS, 0);
685 if (sk == NULL) {
686 return UNKNOWN;
687 }
688
689 if (sk->state == BPF_TCP_ESTABLISHED) {
690 bpf_sk_release(sk);
691 return ESTABLISHED;
692 }
693
694 bpf_sk_release(sk);
695 return UNKNOWN;
696 }
697
classify_icmp(struct __sk_buff * skb,uint8_t proto,struct bpf_sock_tuple * tuple,uint64_t tuplen,metrics_t * metrics)698 static INLINING verdict_t classify_icmp(struct __sk_buff *skb, uint8_t proto,
699 struct bpf_sock_tuple *tuple, uint64_t tuplen,
700 metrics_t *metrics)
701 {
702 switch (proto) {
703 case IPPROTO_TCP:
704 return classify_tcp(skb, tuple, tuplen, NULL, NULL);
705
706 case IPPROTO_UDP:
707 return classify_udp(skb, tuple, tuplen);
708
709 default:
710 metrics->errors_total_malformed_icmp++;
711 return INVALID;
712 }
713 }
714
process_icmpv4(buf_t * pkt,metrics_t * metrics)715 static INLINING verdict_t process_icmpv4(buf_t *pkt, metrics_t *metrics)
716 {
717 struct icmphdr icmp;
718 if (!buf_copy(pkt, &icmp, sizeof(icmp))) {
719 metrics->errors_total_malformed_icmp++;
720 return INVALID;
721 }
722
723 /* We should never receive encapsulated echo replies. */
724 if (icmp.type == ICMP_ECHOREPLY) {
725 metrics->errors_total_icmp_echo_replies++;
726 return INVALID;
727 }
728
729 if (icmp.type == ICMP_ECHO) {
730 return ECHO_REQUEST;
731 }
732
733 if (icmp.type != ICMP_DEST_UNREACH || icmp.code != ICMP_FRAG_NEEDED) {
734 metrics->errors_total_unwanted_icmp++;
735 return INVALID;
736 }
737
738 struct iphdr _ip4;
739 const struct iphdr *ipv4 = pkt_parse_ipv4(pkt, &_ip4);
740 if (ipv4 == NULL) {
741 metrics->errors_total_malformed_icmp_pkt_too_big++;
742 return INVALID;
743 }
744
745 /* The source address in the outer IP header is from the entity that
746 * originated the ICMP message. Use the original IP header to restore
747 * the correct flow tuple.
748 */
749 struct bpf_sock_tuple tuple;
750 tuple.ipv4.saddr = ipv4->daddr;
751 tuple.ipv4.daddr = ipv4->saddr;
752
753 if (!pkt_parse_icmp_l4_ports(pkt, (flow_ports_t *)&tuple.ipv4.sport)) {
754 metrics->errors_total_malformed_icmp_pkt_too_big++;
755 return INVALID;
756 }
757
758 return classify_icmp(pkt->skb, ipv4->protocol, &tuple,
759 sizeof(tuple.ipv4), metrics);
760 }
761
process_icmpv6(buf_t * pkt,metrics_t * metrics)762 static INLINING verdict_t process_icmpv6(buf_t *pkt, metrics_t *metrics)
763 {
764 struct icmp6hdr icmp6;
765 if (!buf_copy(pkt, &icmp6, sizeof(icmp6))) {
766 metrics->errors_total_malformed_icmp++;
767 return INVALID;
768 }
769
770 /* We should never receive encapsulated echo replies. */
771 if (icmp6.icmp6_type == ICMPV6_ECHO_REPLY) {
772 metrics->errors_total_icmp_echo_replies++;
773 return INVALID;
774 }
775
776 if (icmp6.icmp6_type == ICMPV6_ECHO_REQUEST) {
777 return ECHO_REQUEST;
778 }
779
780 if (icmp6.icmp6_type != ICMPV6_PKT_TOOBIG) {
781 metrics->errors_total_unwanted_icmp++;
782 return INVALID;
783 }
784
785 bool is_fragment;
786 uint8_t l4_proto;
787 struct ipv6hdr _ipv6;
788 const struct ipv6hdr *ipv6 =
789 pkt_parse_ipv6(pkt, &_ipv6, &l4_proto, &is_fragment);
790 if (ipv6 == NULL) {
791 metrics->errors_total_malformed_icmp_pkt_too_big++;
792 return INVALID;
793 }
794
795 if (is_fragment) {
796 metrics->errors_total_fragmented_ip++;
797 return INVALID;
798 }
799
800 /* Swap source and dest addresses. */
801 struct bpf_sock_tuple tuple;
802 memcpy(&tuple.ipv6.saddr, &ipv6->daddr, sizeof(tuple.ipv6.saddr));
803 memcpy(&tuple.ipv6.daddr, &ipv6->saddr, sizeof(tuple.ipv6.daddr));
804
805 if (!pkt_parse_icmp_l4_ports(pkt, (flow_ports_t *)&tuple.ipv6.sport)) {
806 metrics->errors_total_malformed_icmp_pkt_too_big++;
807 return INVALID;
808 }
809
810 return classify_icmp(pkt->skb, l4_proto, &tuple, sizeof(tuple.ipv6),
811 metrics);
812 }
813
process_tcp(buf_t * pkt,void * iph,uint64_t iphlen,metrics_t * metrics)814 static INLINING verdict_t process_tcp(buf_t *pkt, void *iph, uint64_t iphlen,
815 metrics_t *metrics)
816 {
817 metrics->l4_protocol_packets_total_tcp++;
818
819 struct tcphdr _tcp;
820 struct tcphdr *tcp = buf_assign(pkt, sizeof(_tcp), &_tcp);
821 if (tcp == NULL) {
822 metrics->errors_total_malformed_tcp++;
823 return INVALID;
824 }
825
826 if (tcp->syn) {
827 return SYN;
828 }
829
830 struct bpf_sock_tuple tuple;
831 uint64_t tuplen =
832 fill_tuple(&tuple, iph, iphlen, tcp->source, tcp->dest);
833 return classify_tcp(pkt->skb, &tuple, tuplen, iph, tcp);
834 }
835
process_udp(buf_t * pkt,void * iph,uint64_t iphlen,metrics_t * metrics)836 static INLINING verdict_t process_udp(buf_t *pkt, void *iph, uint64_t iphlen,
837 metrics_t *metrics)
838 {
839 metrics->l4_protocol_packets_total_udp++;
840
841 struct udphdr _udp;
842 struct udphdr *udph = buf_assign(pkt, sizeof(_udp), &_udp);
843 if (udph == NULL) {
844 metrics->errors_total_malformed_udp++;
845 return INVALID;
846 }
847
848 struct bpf_sock_tuple tuple;
849 uint64_t tuplen =
850 fill_tuple(&tuple, iph, iphlen, udph->source, udph->dest);
851 return classify_udp(pkt->skb, &tuple, tuplen);
852 }
853
process_ipv4(buf_t * pkt,metrics_t * metrics)854 static INLINING verdict_t process_ipv4(buf_t *pkt, metrics_t *metrics)
855 {
856 metrics->l3_protocol_packets_total_ipv4++;
857
858 struct iphdr _ip4;
859 struct iphdr *ipv4 = pkt_parse_ipv4(pkt, &_ip4);
860 if (ipv4 == NULL) {
861 metrics->errors_total_malformed_ip++;
862 return INVALID;
863 }
864
865 if (ipv4->version != 4) {
866 metrics->errors_total_malformed_ip++;
867 return INVALID;
868 }
869
870 if (ipv4_is_fragment(ipv4)) {
871 metrics->errors_total_fragmented_ip++;
872 return INVALID;
873 }
874
875 switch (ipv4->protocol) {
876 case IPPROTO_ICMP:
877 return process_icmpv4(pkt, metrics);
878
879 case IPPROTO_TCP:
880 return process_tcp(pkt, ipv4, sizeof(*ipv4), metrics);
881
882 case IPPROTO_UDP:
883 return process_udp(pkt, ipv4, sizeof(*ipv4), metrics);
884
885 default:
886 metrics->errors_total_unknown_l4_proto++;
887 return INVALID;
888 }
889 }
890
process_ipv6(buf_t * pkt,metrics_t * metrics)891 static INLINING verdict_t process_ipv6(buf_t *pkt, metrics_t *metrics)
892 {
893 metrics->l3_protocol_packets_total_ipv6++;
894
895 uint8_t l4_proto;
896 bool is_fragment;
897 struct ipv6hdr _ipv6;
898 struct ipv6hdr *ipv6 =
899 pkt_parse_ipv6(pkt, &_ipv6, &l4_proto, &is_fragment);
900 if (ipv6 == NULL) {
901 metrics->errors_total_malformed_ip++;
902 return INVALID;
903 }
904
905 if (ipv6->version != 6) {
906 metrics->errors_total_malformed_ip++;
907 return INVALID;
908 }
909
910 if (is_fragment) {
911 metrics->errors_total_fragmented_ip++;
912 return INVALID;
913 }
914
915 switch (l4_proto) {
916 case IPPROTO_ICMPV6:
917 return process_icmpv6(pkt, metrics);
918
919 case IPPROTO_TCP:
920 return process_tcp(pkt, ipv6, sizeof(*ipv6), metrics);
921
922 case IPPROTO_UDP:
923 return process_udp(pkt, ipv6, sizeof(*ipv6), metrics);
924
925 default:
926 metrics->errors_total_unknown_l4_proto++;
927 return INVALID;
928 }
929 }
930
931 SEC("tc")
cls_redirect(struct __sk_buff * skb)932 int cls_redirect(struct __sk_buff *skb)
933 {
934 metrics_t *metrics = get_global_metrics();
935 if (metrics == NULL) {
936 return TC_ACT_SHOT;
937 }
938
939 metrics->processed_packets_total++;
940
941 /* Pass bogus packets as long as we're not sure they're
942 * destined for us.
943 */
944 if (skb->protocol != bpf_htons(ETH_P_IP)) {
945 return TC_ACT_OK;
946 }
947
948 encap_headers_t *encap;
949
950 /* Make sure that all encapsulation headers are available in
951 * the linear portion of the skb. This makes it easy to manipulate them.
952 */
953 if (bpf_skb_pull_data(skb, sizeof(*encap))) {
954 return TC_ACT_OK;
955 }
956
957 buf_t pkt = {
958 .skb = skb,
959 .head = (uint8_t *)(long)skb->data,
960 .tail = (uint8_t *)(long)skb->data_end,
961 };
962
963 encap = buf_assign(&pkt, sizeof(*encap), NULL);
964 if (encap == NULL) {
965 return TC_ACT_OK;
966 }
967
968 if (encap->ip.ihl != 5) {
969 /* We never have any options. */
970 return TC_ACT_OK;
971 }
972
973 if (encap->ip.daddr != ENCAPSULATION_IP ||
974 encap->ip.protocol != IPPROTO_UDP) {
975 return TC_ACT_OK;
976 }
977
978 /* TODO Check UDP length? */
979 if (encap->udp.dest != ENCAPSULATION_PORT) {
980 return TC_ACT_OK;
981 }
982
983 /* We now know that the packet is destined to us, we can
984 * drop bogus ones.
985 */
986 if (ipv4_is_fragment((void *)&encap->ip)) {
987 metrics->errors_total_fragmented_ip++;
988 return TC_ACT_SHOT;
989 }
990
991 if (encap->gue.variant != 0) {
992 metrics->errors_total_malformed_encapsulation++;
993 return TC_ACT_SHOT;
994 }
995
996 if (encap->gue.control != 0) {
997 metrics->errors_total_malformed_encapsulation++;
998 return TC_ACT_SHOT;
999 }
1000
1001 if (encap->gue.flags != 0) {
1002 metrics->errors_total_malformed_encapsulation++;
1003 return TC_ACT_SHOT;
1004 }
1005
1006 if (encap->gue.hlen !=
1007 sizeof(encap->unigue) / 4 + encap->unigue.hop_count) {
1008 metrics->errors_total_malformed_encapsulation++;
1009 return TC_ACT_SHOT;
1010 }
1011
1012 if (encap->unigue.version != 0) {
1013 metrics->errors_total_malformed_encapsulation++;
1014 return TC_ACT_SHOT;
1015 }
1016
1017 if (encap->unigue.reserved != 0) {
1018 return TC_ACT_SHOT;
1019 }
1020
1021 struct in_addr next_hop;
1022 MAYBE_RETURN(get_next_hop(&pkt, encap, &next_hop));
1023
1024 if (next_hop.s_addr == 0) {
1025 metrics->accepted_packets_total_last_hop++;
1026 return accept_locally(skb, encap);
1027 }
1028
1029 verdict_t verdict;
1030 switch (encap->gue.proto_ctype) {
1031 case IPPROTO_IPIP:
1032 verdict = process_ipv4(&pkt, metrics);
1033 break;
1034
1035 case IPPROTO_IPV6:
1036 verdict = process_ipv6(&pkt, metrics);
1037 break;
1038
1039 default:
1040 metrics->errors_total_unknown_l3_proto++;
1041 return TC_ACT_SHOT;
1042 }
1043
1044 switch (verdict) {
1045 case INVALID:
1046 /* metrics have already been bumped */
1047 return TC_ACT_SHOT;
1048
1049 case UNKNOWN:
1050 return forward_to_next_hop(skb, encap, &next_hop, metrics);
1051
1052 case ECHO_REQUEST:
1053 metrics->accepted_packets_total_icmp_echo_request++;
1054 break;
1055
1056 case SYN:
1057 if (encap->unigue.forward_syn) {
1058 return forward_to_next_hop(skb, encap, &next_hop,
1059 metrics);
1060 }
1061
1062 metrics->accepted_packets_total_syn++;
1063 break;
1064
1065 case SYN_COOKIE:
1066 metrics->accepted_packets_total_syn_cookies++;
1067 break;
1068
1069 case ESTABLISHED:
1070 metrics->accepted_packets_total_established++;
1071 break;
1072 }
1073
1074 return accept_locally(skb, encap);
1075 }
1076