123458901SLorenz Bauer // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause 223458901SLorenz Bauer // Copyright (c) 2019, 2020 Cloudflare 323458901SLorenz Bauer 423458901SLorenz Bauer #include <stdbool.h> 523458901SLorenz Bauer #include <stddef.h> 623458901SLorenz Bauer #include <stdint.h> 723458901SLorenz Bauer #include <string.h> 823458901SLorenz Bauer 923458901SLorenz Bauer #include <linux/bpf.h> 1023458901SLorenz Bauer #include <linux/icmp.h> 1123458901SLorenz Bauer #include <linux/icmpv6.h> 1223458901SLorenz Bauer #include <linux/if_ether.h> 1323458901SLorenz Bauer #include <linux/in.h> 1423458901SLorenz Bauer #include <linux/ip.h> 1523458901SLorenz Bauer #include <linux/ipv6.h> 1623458901SLorenz Bauer #include <linux/pkt_cls.h> 1723458901SLorenz Bauer #include <linux/tcp.h> 1823458901SLorenz Bauer #include <linux/udp.h> 1923458901SLorenz Bauer 2023458901SLorenz Bauer #include <bpf/bpf_helpers.h> 2123458901SLorenz Bauer #include <bpf/bpf_endian.h> 2223458901SLorenz Bauer 2323458901SLorenz Bauer #include "test_cls_redirect.h" 2423458901SLorenz Bauer 2523458901SLorenz Bauer #define offsetofend(TYPE, MEMBER) \ 2623458901SLorenz Bauer (offsetof(TYPE, MEMBER) + sizeof((((TYPE *)0)->MEMBER))) 2723458901SLorenz Bauer 2823458901SLorenz Bauer #define IP_OFFSET_MASK (0x1FFF) 2923458901SLorenz Bauer #define IP_MF (0x2000) 3023458901SLorenz Bauer 3123458901SLorenz Bauer char _license[] SEC("license") = "Dual BSD/GPL"; 3223458901SLorenz Bauer 3323458901SLorenz Bauer /** 3423458901SLorenz Bauer * Destination port and IP used for UDP encapsulation. 3523458901SLorenz Bauer */ 3623458901SLorenz Bauer static volatile const __be16 ENCAPSULATION_PORT; 3723458901SLorenz Bauer static volatile const __be32 ENCAPSULATION_IP; 3823458901SLorenz Bauer 3923458901SLorenz Bauer typedef struct { 4023458901SLorenz Bauer uint64_t processed_packets_total; 4123458901SLorenz Bauer uint64_t l3_protocol_packets_total_ipv4; 4223458901SLorenz Bauer uint64_t l3_protocol_packets_total_ipv6; 4323458901SLorenz Bauer uint64_t l4_protocol_packets_total_tcp; 4423458901SLorenz Bauer uint64_t l4_protocol_packets_total_udp; 4523458901SLorenz Bauer uint64_t accepted_packets_total_syn; 4623458901SLorenz Bauer uint64_t accepted_packets_total_syn_cookies; 4723458901SLorenz Bauer uint64_t accepted_packets_total_last_hop; 4823458901SLorenz Bauer uint64_t accepted_packets_total_icmp_echo_request; 4923458901SLorenz Bauer uint64_t accepted_packets_total_established; 5023458901SLorenz Bauer uint64_t forwarded_packets_total_gue; 5123458901SLorenz Bauer uint64_t forwarded_packets_total_gre; 5223458901SLorenz Bauer 5323458901SLorenz Bauer uint64_t errors_total_unknown_l3_proto; 5423458901SLorenz Bauer uint64_t errors_total_unknown_l4_proto; 5523458901SLorenz Bauer uint64_t errors_total_malformed_ip; 5623458901SLorenz Bauer uint64_t errors_total_fragmented_ip; 5723458901SLorenz Bauer uint64_t errors_total_malformed_icmp; 5823458901SLorenz Bauer uint64_t errors_total_unwanted_icmp; 5923458901SLorenz Bauer uint64_t errors_total_malformed_icmp_pkt_too_big; 6023458901SLorenz Bauer uint64_t errors_total_malformed_tcp; 6123458901SLorenz Bauer uint64_t errors_total_malformed_udp; 6223458901SLorenz Bauer uint64_t errors_total_icmp_echo_replies; 6323458901SLorenz Bauer uint64_t errors_total_malformed_encapsulation; 6423458901SLorenz Bauer uint64_t errors_total_encap_adjust_failed; 6523458901SLorenz Bauer uint64_t errors_total_encap_buffer_too_small; 6623458901SLorenz Bauer uint64_t errors_total_redirect_loop; 6723458901SLorenz Bauer } metrics_t; 6823458901SLorenz Bauer 6923458901SLorenz Bauer typedef enum { 7023458901SLorenz Bauer INVALID = 0, 7123458901SLorenz Bauer UNKNOWN, 7223458901SLorenz Bauer ECHO_REQUEST, 7323458901SLorenz Bauer SYN, 7423458901SLorenz Bauer SYN_COOKIE, 7523458901SLorenz Bauer ESTABLISHED, 7623458901SLorenz Bauer } verdict_t; 7723458901SLorenz Bauer 7823458901SLorenz Bauer typedef struct { 7923458901SLorenz Bauer uint16_t src, dst; 8023458901SLorenz Bauer } flow_ports_t; 8123458901SLorenz Bauer 8223458901SLorenz Bauer _Static_assert( 8323458901SLorenz Bauer sizeof(flow_ports_t) != 8423458901SLorenz Bauer offsetofend(struct bpf_sock_tuple, ipv4.dport) - 8523458901SLorenz Bauer offsetof(struct bpf_sock_tuple, ipv4.sport) - 1, 8623458901SLorenz Bauer "flow_ports_t must match sport and dport in struct bpf_sock_tuple"); 8723458901SLorenz Bauer _Static_assert( 8823458901SLorenz Bauer sizeof(flow_ports_t) != 8923458901SLorenz Bauer offsetofend(struct bpf_sock_tuple, ipv6.dport) - 9023458901SLorenz Bauer offsetof(struct bpf_sock_tuple, ipv6.sport) - 1, 9123458901SLorenz Bauer "flow_ports_t must match sport and dport in struct bpf_sock_tuple"); 9223458901SLorenz Bauer 9323458901SLorenz Bauer typedef int ret_t; 9423458901SLorenz Bauer 9523458901SLorenz Bauer /* This is a bit of a hack. We need a return value which allows us to 9623458901SLorenz Bauer * indicate that the regular flow of the program should continue, 9723458901SLorenz Bauer * while allowing functions to use XDP_PASS and XDP_DROP, etc. 9823458901SLorenz Bauer */ 9923458901SLorenz Bauer static const ret_t CONTINUE_PROCESSING = -1; 10023458901SLorenz Bauer 10123458901SLorenz Bauer /* Convenience macro to call functions which return ret_t. 10223458901SLorenz Bauer */ 10323458901SLorenz Bauer #define MAYBE_RETURN(x) \ 10423458901SLorenz Bauer do { \ 10523458901SLorenz Bauer ret_t __ret = x; \ 10623458901SLorenz Bauer if (__ret != CONTINUE_PROCESSING) \ 10723458901SLorenz Bauer return __ret; \ 10823458901SLorenz Bauer } while (0) 10923458901SLorenz Bauer 11023458901SLorenz Bauer /* Linux packet pointers are either aligned to NET_IP_ALIGN (aka 2 bytes), 11123458901SLorenz Bauer * or not aligned if the arch supports efficient unaligned access. 11223458901SLorenz Bauer * 11323458901SLorenz Bauer * Since the verifier ensures that eBPF packet accesses follow these rules, 11423458901SLorenz Bauer * we can tell LLVM to emit code as if we always had a larger alignment. 11523458901SLorenz Bauer * It will yell at us if we end up on a platform where this is not valid. 11623458901SLorenz Bauer */ 11723458901SLorenz Bauer typedef uint8_t *net_ptr __attribute__((align_value(8))); 11823458901SLorenz Bauer 11923458901SLorenz Bauer typedef struct buf { 12023458901SLorenz Bauer struct __sk_buff *skb; 12123458901SLorenz Bauer net_ptr head; 12223458901SLorenz Bauer /* NB: tail musn't have alignment other than 1, otherwise 12323458901SLorenz Bauer * LLVM will go and eliminate code, e.g. when checking packet lengths. 12423458901SLorenz Bauer */ 12523458901SLorenz Bauer uint8_t *const tail; 12623458901SLorenz Bauer } buf_t; 12723458901SLorenz Bauer 12823458901SLorenz Bauer static size_t buf_off(const buf_t *buf) 12923458901SLorenz Bauer { 13023458901SLorenz Bauer /* Clang seems to optimize constructs like 13123458901SLorenz Bauer * a - b + c 13223458901SLorenz Bauer * if c is known: 13323458901SLorenz Bauer * r? = c 13423458901SLorenz Bauer * r? -= b 13523458901SLorenz Bauer * r? += a 13623458901SLorenz Bauer * 13723458901SLorenz Bauer * This is a problem if a and b are packet pointers, 13823458901SLorenz Bauer * since the verifier allows subtracting two pointers to 13923458901SLorenz Bauer * get a scalar, but not a scalar and a pointer. 14023458901SLorenz Bauer * 14123458901SLorenz Bauer * Use inline asm to break this optimization. 14223458901SLorenz Bauer */ 14323458901SLorenz Bauer size_t off = (size_t)buf->head; 14423458901SLorenz Bauer asm("%0 -= %1" : "+r"(off) : "r"(buf->skb->data)); 14523458901SLorenz Bauer return off; 14623458901SLorenz Bauer } 14723458901SLorenz Bauer 14823458901SLorenz Bauer static bool buf_copy(buf_t *buf, void *dst, size_t len) 14923458901SLorenz Bauer { 15023458901SLorenz Bauer if (bpf_skb_load_bytes(buf->skb, buf_off(buf), dst, len)) { 15123458901SLorenz Bauer return false; 15223458901SLorenz Bauer } 15323458901SLorenz Bauer 15423458901SLorenz Bauer buf->head += len; 15523458901SLorenz Bauer return true; 15623458901SLorenz Bauer } 15723458901SLorenz Bauer 15823458901SLorenz Bauer static bool buf_skip(buf_t *buf, const size_t len) 15923458901SLorenz Bauer { 16023458901SLorenz Bauer /* Check whether off + len is valid in the non-linear part. */ 16123458901SLorenz Bauer if (buf_off(buf) + len > buf->skb->len) { 16223458901SLorenz Bauer return false; 16323458901SLorenz Bauer } 16423458901SLorenz Bauer 16523458901SLorenz Bauer buf->head += len; 16623458901SLorenz Bauer return true; 16723458901SLorenz Bauer } 16823458901SLorenz Bauer 16923458901SLorenz Bauer /* Returns a pointer to the start of buf, or NULL if len is 17023458901SLorenz Bauer * larger than the remaining data. Consumes len bytes on a successful 17123458901SLorenz Bauer * call. 17223458901SLorenz Bauer * 17323458901SLorenz Bauer * If scratch is not NULL, the function will attempt to load non-linear 17423458901SLorenz Bauer * data via bpf_skb_load_bytes. On success, scratch is returned. 17523458901SLorenz Bauer */ 17623458901SLorenz Bauer static void *buf_assign(buf_t *buf, const size_t len, void *scratch) 17723458901SLorenz Bauer { 17823458901SLorenz Bauer if (buf->head + len > buf->tail) { 17923458901SLorenz Bauer if (scratch == NULL) { 18023458901SLorenz Bauer return NULL; 18123458901SLorenz Bauer } 18223458901SLorenz Bauer 18323458901SLorenz Bauer return buf_copy(buf, scratch, len) ? scratch : NULL; 18423458901SLorenz Bauer } 18523458901SLorenz Bauer 18623458901SLorenz Bauer void *ptr = buf->head; 18723458901SLorenz Bauer buf->head += len; 18823458901SLorenz Bauer return ptr; 18923458901SLorenz Bauer } 19023458901SLorenz Bauer 19123458901SLorenz Bauer static bool pkt_skip_ipv4_options(buf_t *buf, const struct iphdr *ipv4) 19223458901SLorenz Bauer { 19323458901SLorenz Bauer if (ipv4->ihl <= 5) { 19423458901SLorenz Bauer return true; 19523458901SLorenz Bauer } 19623458901SLorenz Bauer 19723458901SLorenz Bauer return buf_skip(buf, (ipv4->ihl - 5) * 4); 19823458901SLorenz Bauer } 19923458901SLorenz Bauer 20023458901SLorenz Bauer static bool ipv4_is_fragment(const struct iphdr *ip) 20123458901SLorenz Bauer { 20223458901SLorenz Bauer uint16_t frag_off = ip->frag_off & bpf_htons(IP_OFFSET_MASK); 20323458901SLorenz Bauer return (ip->frag_off & bpf_htons(IP_MF)) != 0 || frag_off > 0; 20423458901SLorenz Bauer } 20523458901SLorenz Bauer 20623458901SLorenz Bauer static struct iphdr *pkt_parse_ipv4(buf_t *pkt, struct iphdr *scratch) 20723458901SLorenz Bauer { 20823458901SLorenz Bauer struct iphdr *ipv4 = buf_assign(pkt, sizeof(*ipv4), scratch); 20923458901SLorenz Bauer if (ipv4 == NULL) { 21023458901SLorenz Bauer return NULL; 21123458901SLorenz Bauer } 21223458901SLorenz Bauer 21323458901SLorenz Bauer if (ipv4->ihl < 5) { 21423458901SLorenz Bauer return NULL; 21523458901SLorenz Bauer } 21623458901SLorenz Bauer 21723458901SLorenz Bauer if (!pkt_skip_ipv4_options(pkt, ipv4)) { 21823458901SLorenz Bauer return NULL; 21923458901SLorenz Bauer } 22023458901SLorenz Bauer 22123458901SLorenz Bauer return ipv4; 22223458901SLorenz Bauer } 22323458901SLorenz Bauer 22423458901SLorenz Bauer /* Parse the L4 ports from a packet, assuming a layout like TCP or UDP. */ 22523458901SLorenz Bauer static bool pkt_parse_icmp_l4_ports(buf_t *pkt, flow_ports_t *ports) 22623458901SLorenz Bauer { 22723458901SLorenz Bauer if (!buf_copy(pkt, ports, sizeof(*ports))) { 22823458901SLorenz Bauer return false; 22923458901SLorenz Bauer } 23023458901SLorenz Bauer 23123458901SLorenz Bauer /* Ports in the L4 headers are reversed, since we are parsing an ICMP 23223458901SLorenz Bauer * payload which is going towards the eyeball. 23323458901SLorenz Bauer */ 23423458901SLorenz Bauer uint16_t dst = ports->src; 23523458901SLorenz Bauer ports->src = ports->dst; 23623458901SLorenz Bauer ports->dst = dst; 23723458901SLorenz Bauer return true; 23823458901SLorenz Bauer } 23923458901SLorenz Bauer 24023458901SLorenz Bauer static uint16_t pkt_checksum_fold(uint32_t csum) 24123458901SLorenz Bauer { 24223458901SLorenz Bauer /* The highest reasonable value for an IPv4 header 24323458901SLorenz Bauer * checksum requires two folds, so we just do that always. 24423458901SLorenz Bauer */ 24523458901SLorenz Bauer csum = (csum & 0xffff) + (csum >> 16); 24623458901SLorenz Bauer csum = (csum & 0xffff) + (csum >> 16); 24723458901SLorenz Bauer return (uint16_t)~csum; 24823458901SLorenz Bauer } 24923458901SLorenz Bauer 25023458901SLorenz Bauer static void pkt_ipv4_checksum(struct iphdr *iph) 25123458901SLorenz Bauer { 25223458901SLorenz Bauer iph->check = 0; 25323458901SLorenz Bauer 25423458901SLorenz Bauer /* An IP header without options is 20 bytes. Two of those 25523458901SLorenz Bauer * are the checksum, which we always set to zero. Hence, 25623458901SLorenz Bauer * the maximum accumulated value is 18 / 2 * 0xffff = 0x8fff7, 25723458901SLorenz Bauer * which fits in 32 bit. 25823458901SLorenz Bauer */ 25923458901SLorenz Bauer _Static_assert(sizeof(struct iphdr) == 20, "iphdr must be 20 bytes"); 26023458901SLorenz Bauer uint32_t acc = 0; 26123458901SLorenz Bauer uint16_t *ipw = (uint16_t *)iph; 26223458901SLorenz Bauer 26323458901SLorenz Bauer #pragma clang loop unroll(full) 26423458901SLorenz Bauer for (size_t i = 0; i < sizeof(struct iphdr) / 2; i++) { 26523458901SLorenz Bauer acc += ipw[i]; 26623458901SLorenz Bauer } 26723458901SLorenz Bauer 26823458901SLorenz Bauer iph->check = pkt_checksum_fold(acc); 26923458901SLorenz Bauer } 27023458901SLorenz Bauer 27123458901SLorenz Bauer static bool pkt_skip_ipv6_extension_headers(buf_t *pkt, 27223458901SLorenz Bauer const struct ipv6hdr *ipv6, 27323458901SLorenz Bauer uint8_t *upper_proto, 27423458901SLorenz Bauer bool *is_fragment) 27523458901SLorenz Bauer { 27623458901SLorenz Bauer /* We understand five extension headers. 27723458901SLorenz Bauer * https://tools.ietf.org/html/rfc8200#section-4.1 states that all 27823458901SLorenz Bauer * headers should occur once, except Destination Options, which may 27923458901SLorenz Bauer * occur twice. Hence we give up after 6 headers. 28023458901SLorenz Bauer */ 28123458901SLorenz Bauer struct { 28223458901SLorenz Bauer uint8_t next; 28323458901SLorenz Bauer uint8_t len; 28423458901SLorenz Bauer } exthdr = { 28523458901SLorenz Bauer .next = ipv6->nexthdr, 28623458901SLorenz Bauer }; 28723458901SLorenz Bauer *is_fragment = false; 28823458901SLorenz Bauer 28923458901SLorenz Bauer #pragma clang loop unroll(full) 29023458901SLorenz Bauer for (int i = 0; i < 6; i++) { 29123458901SLorenz Bauer switch (exthdr.next) { 29223458901SLorenz Bauer case IPPROTO_FRAGMENT: 29323458901SLorenz Bauer *is_fragment = true; 29423458901SLorenz Bauer /* NB: We don't check that hdrlen == 0 as per spec. */ 29523458901SLorenz Bauer /* fallthrough; */ 29623458901SLorenz Bauer 29723458901SLorenz Bauer case IPPROTO_HOPOPTS: 29823458901SLorenz Bauer case IPPROTO_ROUTING: 29923458901SLorenz Bauer case IPPROTO_DSTOPTS: 30023458901SLorenz Bauer case IPPROTO_MH: 30123458901SLorenz Bauer if (!buf_copy(pkt, &exthdr, sizeof(exthdr))) { 30223458901SLorenz Bauer return false; 30323458901SLorenz Bauer } 30423458901SLorenz Bauer 30523458901SLorenz Bauer /* hdrlen is in 8-octet units, and excludes the first 8 octets. */ 30623458901SLorenz Bauer if (!buf_skip(pkt, 30723458901SLorenz Bauer (exthdr.len + 1) * 8 - sizeof(exthdr))) { 30823458901SLorenz Bauer return false; 30923458901SLorenz Bauer } 31023458901SLorenz Bauer 31123458901SLorenz Bauer /* Decode next header */ 31223458901SLorenz Bauer break; 31323458901SLorenz Bauer 31423458901SLorenz Bauer default: 31523458901SLorenz Bauer /* The next header is not one of the known extension 31623458901SLorenz Bauer * headers, treat it as the upper layer header. 31723458901SLorenz Bauer * 31823458901SLorenz Bauer * This handles IPPROTO_NONE. 31923458901SLorenz Bauer * 32023458901SLorenz Bauer * Encapsulating Security Payload (50) and Authentication 32123458901SLorenz Bauer * Header (51) also end up here (and will trigger an 32223458901SLorenz Bauer * unknown proto error later). They have a custom header 32323458901SLorenz Bauer * format and seem too esoteric to care about. 32423458901SLorenz Bauer */ 32523458901SLorenz Bauer *upper_proto = exthdr.next; 32623458901SLorenz Bauer return true; 32723458901SLorenz Bauer } 32823458901SLorenz Bauer } 32923458901SLorenz Bauer 33023458901SLorenz Bauer /* We never found an upper layer header. */ 33123458901SLorenz Bauer return false; 33223458901SLorenz Bauer } 33323458901SLorenz Bauer 33423458901SLorenz Bauer /* This function has to be inlined, because the verifier otherwise rejects it 33523458901SLorenz Bauer * due to returning a pointer to the stack. This is technically correct, since 33623458901SLorenz Bauer * scratch is allocated on the stack. However, this usage should be safe since 33723458901SLorenz Bauer * it's the callers stack after all. 33823458901SLorenz Bauer */ 33923458901SLorenz Bauer static inline __attribute__((__always_inline__)) struct ipv6hdr * 34023458901SLorenz Bauer pkt_parse_ipv6(buf_t *pkt, struct ipv6hdr *scratch, uint8_t *proto, 34123458901SLorenz Bauer bool *is_fragment) 34223458901SLorenz Bauer { 34323458901SLorenz Bauer struct ipv6hdr *ipv6 = buf_assign(pkt, sizeof(*ipv6), scratch); 34423458901SLorenz Bauer if (ipv6 == NULL) { 34523458901SLorenz Bauer return NULL; 34623458901SLorenz Bauer } 34723458901SLorenz Bauer 34823458901SLorenz Bauer if (!pkt_skip_ipv6_extension_headers(pkt, ipv6, proto, is_fragment)) { 34923458901SLorenz Bauer return NULL; 35023458901SLorenz Bauer } 35123458901SLorenz Bauer 35223458901SLorenz Bauer return ipv6; 35323458901SLorenz Bauer } 35423458901SLorenz Bauer 35523458901SLorenz Bauer /* Global metrics, per CPU 35623458901SLorenz Bauer */ 35723458901SLorenz Bauer struct bpf_map_def metrics_map SEC("maps") = { 35823458901SLorenz Bauer .type = BPF_MAP_TYPE_PERCPU_ARRAY, 35923458901SLorenz Bauer .key_size = sizeof(unsigned int), 36023458901SLorenz Bauer .value_size = sizeof(metrics_t), 36123458901SLorenz Bauer .max_entries = 1, 36223458901SLorenz Bauer }; 36323458901SLorenz Bauer 36423458901SLorenz Bauer static metrics_t *get_global_metrics(void) 36523458901SLorenz Bauer { 36623458901SLorenz Bauer uint64_t key = 0; 36723458901SLorenz Bauer return bpf_map_lookup_elem(&metrics_map, &key); 36823458901SLorenz Bauer } 36923458901SLorenz Bauer 37023458901SLorenz Bauer static ret_t accept_locally(struct __sk_buff *skb, encap_headers_t *encap) 37123458901SLorenz Bauer { 37223458901SLorenz Bauer const int payload_off = 37323458901SLorenz Bauer sizeof(*encap) + 37423458901SLorenz Bauer sizeof(struct in_addr) * encap->unigue.hop_count; 37523458901SLorenz Bauer int32_t encap_overhead = payload_off - sizeof(struct ethhdr); 37623458901SLorenz Bauer 37723458901SLorenz Bauer // Changing the ethertype if the encapsulated packet is ipv6 37823458901SLorenz Bauer if (encap->gue.proto_ctype == IPPROTO_IPV6) { 37923458901SLorenz Bauer encap->eth.h_proto = bpf_htons(ETH_P_IPV6); 38023458901SLorenz Bauer } 38123458901SLorenz Bauer 38223458901SLorenz Bauer if (bpf_skb_adjust_room(skb, -encap_overhead, BPF_ADJ_ROOM_MAC, 383c4ba153bSDaniel Borkmann BPF_F_ADJ_ROOM_FIXED_GSO | 384c4ba153bSDaniel Borkmann BPF_F_ADJ_ROOM_NO_CSUM_RESET) || 385c4ba153bSDaniel Borkmann bpf_csum_level(skb, BPF_CSUM_LEVEL_DEC)) 38623458901SLorenz Bauer return TC_ACT_SHOT; 38723458901SLorenz Bauer 38823458901SLorenz Bauer return bpf_redirect(skb->ifindex, BPF_F_INGRESS); 38923458901SLorenz Bauer } 39023458901SLorenz Bauer 39123458901SLorenz Bauer static ret_t forward_with_gre(struct __sk_buff *skb, encap_headers_t *encap, 39223458901SLorenz Bauer struct in_addr *next_hop, metrics_t *metrics) 39323458901SLorenz Bauer { 39423458901SLorenz Bauer metrics->forwarded_packets_total_gre++; 39523458901SLorenz Bauer 39623458901SLorenz Bauer const int payload_off = 39723458901SLorenz Bauer sizeof(*encap) + 39823458901SLorenz Bauer sizeof(struct in_addr) * encap->unigue.hop_count; 39923458901SLorenz Bauer int32_t encap_overhead = 40023458901SLorenz Bauer payload_off - sizeof(struct ethhdr) - sizeof(struct iphdr); 40123458901SLorenz Bauer int32_t delta = sizeof(struct gre_base_hdr) - encap_overhead; 40223458901SLorenz Bauer uint16_t proto = ETH_P_IP; 40323458901SLorenz Bauer 40423458901SLorenz Bauer /* Loop protection: the inner packet's TTL is decremented as a safeguard 40523458901SLorenz Bauer * against any forwarding loop. As the only interesting field is the TTL 40623458901SLorenz Bauer * hop limit for IPv6, it is easier to use bpf_skb_load_bytes/bpf_skb_store_bytes 40723458901SLorenz Bauer * as they handle the split packets if needed (no need for the data to be 40823458901SLorenz Bauer * in the linear section). 40923458901SLorenz Bauer */ 41023458901SLorenz Bauer if (encap->gue.proto_ctype == IPPROTO_IPV6) { 41123458901SLorenz Bauer proto = ETH_P_IPV6; 41223458901SLorenz Bauer uint8_t ttl; 41323458901SLorenz Bauer int rc; 41423458901SLorenz Bauer 41523458901SLorenz Bauer rc = bpf_skb_load_bytes( 41623458901SLorenz Bauer skb, payload_off + offsetof(struct ipv6hdr, hop_limit), 41723458901SLorenz Bauer &ttl, 1); 41823458901SLorenz Bauer if (rc != 0) { 41923458901SLorenz Bauer metrics->errors_total_malformed_encapsulation++; 42023458901SLorenz Bauer return TC_ACT_SHOT; 42123458901SLorenz Bauer } 42223458901SLorenz Bauer 42323458901SLorenz Bauer if (ttl == 0) { 42423458901SLorenz Bauer metrics->errors_total_redirect_loop++; 42523458901SLorenz Bauer return TC_ACT_SHOT; 42623458901SLorenz Bauer } 42723458901SLorenz Bauer 42823458901SLorenz Bauer ttl--; 42923458901SLorenz Bauer rc = bpf_skb_store_bytes( 43023458901SLorenz Bauer skb, payload_off + offsetof(struct ipv6hdr, hop_limit), 43123458901SLorenz Bauer &ttl, 1, 0); 43223458901SLorenz Bauer if (rc != 0) { 43323458901SLorenz Bauer metrics->errors_total_malformed_encapsulation++; 43423458901SLorenz Bauer return TC_ACT_SHOT; 43523458901SLorenz Bauer } 43623458901SLorenz Bauer } else { 43723458901SLorenz Bauer uint8_t ttl; 43823458901SLorenz Bauer int rc; 43923458901SLorenz Bauer 44023458901SLorenz Bauer rc = bpf_skb_load_bytes( 44123458901SLorenz Bauer skb, payload_off + offsetof(struct iphdr, ttl), &ttl, 44223458901SLorenz Bauer 1); 44323458901SLorenz Bauer if (rc != 0) { 44423458901SLorenz Bauer metrics->errors_total_malformed_encapsulation++; 44523458901SLorenz Bauer return TC_ACT_SHOT; 44623458901SLorenz Bauer } 44723458901SLorenz Bauer 44823458901SLorenz Bauer if (ttl == 0) { 44923458901SLorenz Bauer metrics->errors_total_redirect_loop++; 45023458901SLorenz Bauer return TC_ACT_SHOT; 45123458901SLorenz Bauer } 45223458901SLorenz Bauer 45323458901SLorenz Bauer /* IPv4 also has a checksum to patch. While the TTL is only one byte, 45423458901SLorenz Bauer * this function only works for 2 and 4 bytes arguments (the result is 45523458901SLorenz Bauer * the same). 45623458901SLorenz Bauer */ 45723458901SLorenz Bauer rc = bpf_l3_csum_replace( 45823458901SLorenz Bauer skb, payload_off + offsetof(struct iphdr, check), ttl, 45923458901SLorenz Bauer ttl - 1, 2); 46023458901SLorenz Bauer if (rc != 0) { 46123458901SLorenz Bauer metrics->errors_total_malformed_encapsulation++; 46223458901SLorenz Bauer return TC_ACT_SHOT; 46323458901SLorenz Bauer } 46423458901SLorenz Bauer 46523458901SLorenz Bauer ttl--; 46623458901SLorenz Bauer rc = bpf_skb_store_bytes( 46723458901SLorenz Bauer skb, payload_off + offsetof(struct iphdr, ttl), &ttl, 1, 46823458901SLorenz Bauer 0); 46923458901SLorenz Bauer if (rc != 0) { 47023458901SLorenz Bauer metrics->errors_total_malformed_encapsulation++; 47123458901SLorenz Bauer return TC_ACT_SHOT; 47223458901SLorenz Bauer } 47323458901SLorenz Bauer } 47423458901SLorenz Bauer 47523458901SLorenz Bauer if (bpf_skb_adjust_room(skb, delta, BPF_ADJ_ROOM_NET, 476c4ba153bSDaniel Borkmann BPF_F_ADJ_ROOM_FIXED_GSO | 477c4ba153bSDaniel Borkmann BPF_F_ADJ_ROOM_NO_CSUM_RESET) || 478c4ba153bSDaniel Borkmann bpf_csum_level(skb, BPF_CSUM_LEVEL_INC)) { 47923458901SLorenz Bauer metrics->errors_total_encap_adjust_failed++; 48023458901SLorenz Bauer return TC_ACT_SHOT; 48123458901SLorenz Bauer } 48223458901SLorenz Bauer 48323458901SLorenz Bauer if (bpf_skb_pull_data(skb, sizeof(encap_gre_t))) { 48423458901SLorenz Bauer metrics->errors_total_encap_buffer_too_small++; 48523458901SLorenz Bauer return TC_ACT_SHOT; 48623458901SLorenz Bauer } 48723458901SLorenz Bauer 48823458901SLorenz Bauer buf_t pkt = { 48923458901SLorenz Bauer .skb = skb, 49023458901SLorenz Bauer .head = (uint8_t *)(long)skb->data, 49123458901SLorenz Bauer .tail = (uint8_t *)(long)skb->data_end, 49223458901SLorenz Bauer }; 49323458901SLorenz Bauer 49423458901SLorenz Bauer encap_gre_t *encap_gre = buf_assign(&pkt, sizeof(encap_gre_t), NULL); 49523458901SLorenz Bauer if (encap_gre == NULL) { 49623458901SLorenz Bauer metrics->errors_total_encap_buffer_too_small++; 49723458901SLorenz Bauer return TC_ACT_SHOT; 49823458901SLorenz Bauer } 49923458901SLorenz Bauer 50023458901SLorenz Bauer encap_gre->ip.protocol = IPPROTO_GRE; 50123458901SLorenz Bauer encap_gre->ip.daddr = next_hop->s_addr; 50223458901SLorenz Bauer encap_gre->ip.saddr = ENCAPSULATION_IP; 50323458901SLorenz Bauer encap_gre->ip.tot_len = 50423458901SLorenz Bauer bpf_htons(bpf_ntohs(encap_gre->ip.tot_len) + delta); 50523458901SLorenz Bauer encap_gre->gre.flags = 0; 50623458901SLorenz Bauer encap_gre->gre.protocol = bpf_htons(proto); 50723458901SLorenz Bauer pkt_ipv4_checksum((void *)&encap_gre->ip); 50823458901SLorenz Bauer 50923458901SLorenz Bauer return bpf_redirect(skb->ifindex, 0); 51023458901SLorenz Bauer } 51123458901SLorenz Bauer 51223458901SLorenz Bauer static ret_t forward_to_next_hop(struct __sk_buff *skb, encap_headers_t *encap, 51323458901SLorenz Bauer struct in_addr *next_hop, metrics_t *metrics) 51423458901SLorenz Bauer { 51523458901SLorenz Bauer /* swap L2 addresses */ 51623458901SLorenz Bauer /* This assumes that packets are received from a router. 51723458901SLorenz Bauer * So just swapping the MAC addresses here will make the packet go back to 51823458901SLorenz Bauer * the router, which will send it to the appropriate machine. 51923458901SLorenz Bauer */ 52023458901SLorenz Bauer unsigned char temp[ETH_ALEN]; 52123458901SLorenz Bauer memcpy(temp, encap->eth.h_dest, sizeof(temp)); 52223458901SLorenz Bauer memcpy(encap->eth.h_dest, encap->eth.h_source, 52323458901SLorenz Bauer sizeof(encap->eth.h_dest)); 52423458901SLorenz Bauer memcpy(encap->eth.h_source, temp, sizeof(encap->eth.h_source)); 52523458901SLorenz Bauer 52623458901SLorenz Bauer if (encap->unigue.next_hop == encap->unigue.hop_count - 1 && 52723458901SLorenz Bauer encap->unigue.last_hop_gre) { 52823458901SLorenz Bauer return forward_with_gre(skb, encap, next_hop, metrics); 52923458901SLorenz Bauer } 53023458901SLorenz Bauer 53123458901SLorenz Bauer metrics->forwarded_packets_total_gue++; 53223458901SLorenz Bauer uint32_t old_saddr = encap->ip.saddr; 53323458901SLorenz Bauer encap->ip.saddr = encap->ip.daddr; 53423458901SLorenz Bauer encap->ip.daddr = next_hop->s_addr; 53523458901SLorenz Bauer if (encap->unigue.next_hop < encap->unigue.hop_count) { 53623458901SLorenz Bauer encap->unigue.next_hop++; 53723458901SLorenz Bauer } 53823458901SLorenz Bauer 53923458901SLorenz Bauer /* Remove ip->saddr, add next_hop->s_addr */ 54023458901SLorenz Bauer const uint64_t off = offsetof(typeof(*encap), ip.check); 54123458901SLorenz Bauer int ret = bpf_l3_csum_replace(skb, off, old_saddr, next_hop->s_addr, 4); 54223458901SLorenz Bauer if (ret < 0) { 54323458901SLorenz Bauer return TC_ACT_SHOT; 54423458901SLorenz Bauer } 54523458901SLorenz Bauer 54623458901SLorenz Bauer return bpf_redirect(skb->ifindex, 0); 54723458901SLorenz Bauer } 54823458901SLorenz Bauer 54923458901SLorenz Bauer static ret_t skip_next_hops(buf_t *pkt, int n) 55023458901SLorenz Bauer { 55123458901SLorenz Bauer switch (n) { 55223458901SLorenz Bauer case 1: 55323458901SLorenz Bauer if (!buf_skip(pkt, sizeof(struct in_addr))) 55423458901SLorenz Bauer return TC_ACT_SHOT; 55523458901SLorenz Bauer case 0: 55623458901SLorenz Bauer return CONTINUE_PROCESSING; 55723458901SLorenz Bauer 55823458901SLorenz Bauer default: 55923458901SLorenz Bauer return TC_ACT_SHOT; 56023458901SLorenz Bauer } 56123458901SLorenz Bauer } 56223458901SLorenz Bauer 56323458901SLorenz Bauer /* Get the next hop from the GLB header. 56423458901SLorenz Bauer * 56523458901SLorenz Bauer * Sets next_hop->s_addr to 0 if there are no more hops left. 56623458901SLorenz Bauer * pkt is positioned just after the variable length GLB header 56723458901SLorenz Bauer * iff the call is successful. 56823458901SLorenz Bauer */ 56923458901SLorenz Bauer static ret_t get_next_hop(buf_t *pkt, encap_headers_t *encap, 57023458901SLorenz Bauer struct in_addr *next_hop) 57123458901SLorenz Bauer { 57223458901SLorenz Bauer if (encap->unigue.next_hop > encap->unigue.hop_count) { 57323458901SLorenz Bauer return TC_ACT_SHOT; 57423458901SLorenz Bauer } 57523458901SLorenz Bauer 57623458901SLorenz Bauer /* Skip "used" next hops. */ 57723458901SLorenz Bauer MAYBE_RETURN(skip_next_hops(pkt, encap->unigue.next_hop)); 57823458901SLorenz Bauer 57923458901SLorenz Bauer if (encap->unigue.next_hop == encap->unigue.hop_count) { 58023458901SLorenz Bauer /* No more next hops, we are at the end of the GLB header. */ 58123458901SLorenz Bauer next_hop->s_addr = 0; 58223458901SLorenz Bauer return CONTINUE_PROCESSING; 58323458901SLorenz Bauer } 58423458901SLorenz Bauer 58523458901SLorenz Bauer if (!buf_copy(pkt, next_hop, sizeof(*next_hop))) { 58623458901SLorenz Bauer return TC_ACT_SHOT; 58723458901SLorenz Bauer } 58823458901SLorenz Bauer 58923458901SLorenz Bauer /* Skip the remainig next hops (may be zero). */ 59023458901SLorenz Bauer return skip_next_hops(pkt, encap->unigue.hop_count - 59123458901SLorenz Bauer encap->unigue.next_hop - 1); 59223458901SLorenz Bauer } 59323458901SLorenz Bauer 59423458901SLorenz Bauer /* Fill a bpf_sock_tuple to be used with the socket lookup functions. 59523458901SLorenz Bauer * This is a kludge that let's us work around verifier limitations: 59623458901SLorenz Bauer * 59723458901SLorenz Bauer * fill_tuple(&t, foo, sizeof(struct iphdr), 123, 321) 59823458901SLorenz Bauer * 59923458901SLorenz Bauer * clang will substitue a costant for sizeof, which allows the verifier 60023458901SLorenz Bauer * to track it's value. Based on this, it can figure out the constant 60123458901SLorenz Bauer * return value, and calling code works while still being "generic" to 60223458901SLorenz Bauer * IPv4 and IPv6. 60323458901SLorenz Bauer */ 60423458901SLorenz Bauer static uint64_t fill_tuple(struct bpf_sock_tuple *tuple, void *iph, 60523458901SLorenz Bauer uint64_t iphlen, uint16_t sport, uint16_t dport) 60623458901SLorenz Bauer { 60723458901SLorenz Bauer switch (iphlen) { 60823458901SLorenz Bauer case sizeof(struct iphdr): { 60923458901SLorenz Bauer struct iphdr *ipv4 = (struct iphdr *)iph; 61023458901SLorenz Bauer tuple->ipv4.daddr = ipv4->daddr; 61123458901SLorenz Bauer tuple->ipv4.saddr = ipv4->saddr; 61223458901SLorenz Bauer tuple->ipv4.sport = sport; 61323458901SLorenz Bauer tuple->ipv4.dport = dport; 61423458901SLorenz Bauer return sizeof(tuple->ipv4); 61523458901SLorenz Bauer } 61623458901SLorenz Bauer 61723458901SLorenz Bauer case sizeof(struct ipv6hdr): { 61823458901SLorenz Bauer struct ipv6hdr *ipv6 = (struct ipv6hdr *)iph; 61923458901SLorenz Bauer memcpy(&tuple->ipv6.daddr, &ipv6->daddr, 62023458901SLorenz Bauer sizeof(tuple->ipv6.daddr)); 62123458901SLorenz Bauer memcpy(&tuple->ipv6.saddr, &ipv6->saddr, 62223458901SLorenz Bauer sizeof(tuple->ipv6.saddr)); 62323458901SLorenz Bauer tuple->ipv6.sport = sport; 62423458901SLorenz Bauer tuple->ipv6.dport = dport; 62523458901SLorenz Bauer return sizeof(tuple->ipv6); 62623458901SLorenz Bauer } 62723458901SLorenz Bauer 62823458901SLorenz Bauer default: 62923458901SLorenz Bauer return 0; 63023458901SLorenz Bauer } 63123458901SLorenz Bauer } 63223458901SLorenz Bauer 63323458901SLorenz Bauer static verdict_t classify_tcp(struct __sk_buff *skb, 63423458901SLorenz Bauer struct bpf_sock_tuple *tuple, uint64_t tuplen, 63523458901SLorenz Bauer void *iph, struct tcphdr *tcp) 63623458901SLorenz Bauer { 63723458901SLorenz Bauer struct bpf_sock *sk = 63823458901SLorenz Bauer bpf_skc_lookup_tcp(skb, tuple, tuplen, BPF_F_CURRENT_NETNS, 0); 63923458901SLorenz Bauer if (sk == NULL) { 64023458901SLorenz Bauer return UNKNOWN; 64123458901SLorenz Bauer } 64223458901SLorenz Bauer 64323458901SLorenz Bauer if (sk->state != BPF_TCP_LISTEN) { 64423458901SLorenz Bauer bpf_sk_release(sk); 64523458901SLorenz Bauer return ESTABLISHED; 64623458901SLorenz Bauer } 64723458901SLorenz Bauer 64823458901SLorenz Bauer if (iph != NULL && tcp != NULL) { 64923458901SLorenz Bauer /* Kludge: we've run out of arguments, but need the length of the ip header. */ 65023458901SLorenz Bauer uint64_t iphlen = sizeof(struct iphdr); 65123458901SLorenz Bauer if (tuplen == sizeof(tuple->ipv6)) { 65223458901SLorenz Bauer iphlen = sizeof(struct ipv6hdr); 65323458901SLorenz Bauer } 65423458901SLorenz Bauer 65523458901SLorenz Bauer if (bpf_tcp_check_syncookie(sk, iph, iphlen, tcp, 65623458901SLorenz Bauer sizeof(*tcp)) == 0) { 65723458901SLorenz Bauer bpf_sk_release(sk); 65823458901SLorenz Bauer return SYN_COOKIE; 65923458901SLorenz Bauer } 66023458901SLorenz Bauer } 66123458901SLorenz Bauer 66223458901SLorenz Bauer bpf_sk_release(sk); 66323458901SLorenz Bauer return UNKNOWN; 66423458901SLorenz Bauer } 66523458901SLorenz Bauer 66623458901SLorenz Bauer static verdict_t classify_udp(struct __sk_buff *skb, 66723458901SLorenz Bauer struct bpf_sock_tuple *tuple, uint64_t tuplen) 66823458901SLorenz Bauer { 66923458901SLorenz Bauer struct bpf_sock *sk = 67023458901SLorenz Bauer bpf_sk_lookup_udp(skb, tuple, tuplen, BPF_F_CURRENT_NETNS, 0); 67123458901SLorenz Bauer if (sk == NULL) { 67223458901SLorenz Bauer return UNKNOWN; 67323458901SLorenz Bauer } 67423458901SLorenz Bauer 67523458901SLorenz Bauer if (sk->state == BPF_TCP_ESTABLISHED) { 67623458901SLorenz Bauer bpf_sk_release(sk); 67723458901SLorenz Bauer return ESTABLISHED; 67823458901SLorenz Bauer } 67923458901SLorenz Bauer 68023458901SLorenz Bauer bpf_sk_release(sk); 68123458901SLorenz Bauer return UNKNOWN; 68223458901SLorenz Bauer } 68323458901SLorenz Bauer 68423458901SLorenz Bauer static verdict_t classify_icmp(struct __sk_buff *skb, uint8_t proto, 68523458901SLorenz Bauer struct bpf_sock_tuple *tuple, uint64_t tuplen, 68623458901SLorenz Bauer metrics_t *metrics) 68723458901SLorenz Bauer { 68823458901SLorenz Bauer switch (proto) { 68923458901SLorenz Bauer case IPPROTO_TCP: 69023458901SLorenz Bauer return classify_tcp(skb, tuple, tuplen, NULL, NULL); 69123458901SLorenz Bauer 69223458901SLorenz Bauer case IPPROTO_UDP: 69323458901SLorenz Bauer return classify_udp(skb, tuple, tuplen); 69423458901SLorenz Bauer 69523458901SLorenz Bauer default: 69623458901SLorenz Bauer metrics->errors_total_malformed_icmp++; 69723458901SLorenz Bauer return INVALID; 69823458901SLorenz Bauer } 69923458901SLorenz Bauer } 70023458901SLorenz Bauer 70123458901SLorenz Bauer static verdict_t process_icmpv4(buf_t *pkt, metrics_t *metrics) 70223458901SLorenz Bauer { 70323458901SLorenz Bauer struct icmphdr icmp; 70423458901SLorenz Bauer if (!buf_copy(pkt, &icmp, sizeof(icmp))) { 70523458901SLorenz Bauer metrics->errors_total_malformed_icmp++; 70623458901SLorenz Bauer return INVALID; 70723458901SLorenz Bauer } 70823458901SLorenz Bauer 70923458901SLorenz Bauer /* We should never receive encapsulated echo replies. */ 71023458901SLorenz Bauer if (icmp.type == ICMP_ECHOREPLY) { 71123458901SLorenz Bauer metrics->errors_total_icmp_echo_replies++; 71223458901SLorenz Bauer return INVALID; 71323458901SLorenz Bauer } 71423458901SLorenz Bauer 71523458901SLorenz Bauer if (icmp.type == ICMP_ECHO) { 71623458901SLorenz Bauer return ECHO_REQUEST; 71723458901SLorenz Bauer } 71823458901SLorenz Bauer 71923458901SLorenz Bauer if (icmp.type != ICMP_DEST_UNREACH || icmp.code != ICMP_FRAG_NEEDED) { 72023458901SLorenz Bauer metrics->errors_total_unwanted_icmp++; 72123458901SLorenz Bauer return INVALID; 72223458901SLorenz Bauer } 72323458901SLorenz Bauer 72423458901SLorenz Bauer struct iphdr _ip4; 72523458901SLorenz Bauer const struct iphdr *ipv4 = pkt_parse_ipv4(pkt, &_ip4); 72623458901SLorenz Bauer if (ipv4 == NULL) { 72723458901SLorenz Bauer metrics->errors_total_malformed_icmp_pkt_too_big++; 72823458901SLorenz Bauer return INVALID; 72923458901SLorenz Bauer } 73023458901SLorenz Bauer 73123458901SLorenz Bauer /* The source address in the outer IP header is from the entity that 73223458901SLorenz Bauer * originated the ICMP message. Use the original IP header to restore 73323458901SLorenz Bauer * the correct flow tuple. 73423458901SLorenz Bauer */ 73523458901SLorenz Bauer struct bpf_sock_tuple tuple; 73623458901SLorenz Bauer tuple.ipv4.saddr = ipv4->daddr; 73723458901SLorenz Bauer tuple.ipv4.daddr = ipv4->saddr; 73823458901SLorenz Bauer 73923458901SLorenz Bauer if (!pkt_parse_icmp_l4_ports(pkt, (flow_ports_t *)&tuple.ipv4.sport)) { 74023458901SLorenz Bauer metrics->errors_total_malformed_icmp_pkt_too_big++; 74123458901SLorenz Bauer return INVALID; 74223458901SLorenz Bauer } 74323458901SLorenz Bauer 74423458901SLorenz Bauer return classify_icmp(pkt->skb, ipv4->protocol, &tuple, 74523458901SLorenz Bauer sizeof(tuple.ipv4), metrics); 74623458901SLorenz Bauer } 74723458901SLorenz Bauer 74823458901SLorenz Bauer static verdict_t process_icmpv6(buf_t *pkt, metrics_t *metrics) 74923458901SLorenz Bauer { 75023458901SLorenz Bauer struct icmp6hdr icmp6; 75123458901SLorenz Bauer if (!buf_copy(pkt, &icmp6, sizeof(icmp6))) { 75223458901SLorenz Bauer metrics->errors_total_malformed_icmp++; 75323458901SLorenz Bauer return INVALID; 75423458901SLorenz Bauer } 75523458901SLorenz Bauer 75623458901SLorenz Bauer /* We should never receive encapsulated echo replies. */ 75723458901SLorenz Bauer if (icmp6.icmp6_type == ICMPV6_ECHO_REPLY) { 75823458901SLorenz Bauer metrics->errors_total_icmp_echo_replies++; 75923458901SLorenz Bauer return INVALID; 76023458901SLorenz Bauer } 76123458901SLorenz Bauer 76223458901SLorenz Bauer if (icmp6.icmp6_type == ICMPV6_ECHO_REQUEST) { 76323458901SLorenz Bauer return ECHO_REQUEST; 76423458901SLorenz Bauer } 76523458901SLorenz Bauer 76623458901SLorenz Bauer if (icmp6.icmp6_type != ICMPV6_PKT_TOOBIG) { 76723458901SLorenz Bauer metrics->errors_total_unwanted_icmp++; 76823458901SLorenz Bauer return INVALID; 76923458901SLorenz Bauer } 77023458901SLorenz Bauer 77123458901SLorenz Bauer bool is_fragment; 77223458901SLorenz Bauer uint8_t l4_proto; 77323458901SLorenz Bauer struct ipv6hdr _ipv6; 77423458901SLorenz Bauer const struct ipv6hdr *ipv6 = 77523458901SLorenz Bauer pkt_parse_ipv6(pkt, &_ipv6, &l4_proto, &is_fragment); 77623458901SLorenz Bauer if (ipv6 == NULL) { 77723458901SLorenz Bauer metrics->errors_total_malformed_icmp_pkt_too_big++; 77823458901SLorenz Bauer return INVALID; 77923458901SLorenz Bauer } 78023458901SLorenz Bauer 78123458901SLorenz Bauer if (is_fragment) { 78223458901SLorenz Bauer metrics->errors_total_fragmented_ip++; 78323458901SLorenz Bauer return INVALID; 78423458901SLorenz Bauer } 78523458901SLorenz Bauer 78623458901SLorenz Bauer /* Swap source and dest addresses. */ 78723458901SLorenz Bauer struct bpf_sock_tuple tuple; 78823458901SLorenz Bauer memcpy(&tuple.ipv6.saddr, &ipv6->daddr, sizeof(tuple.ipv6.saddr)); 78923458901SLorenz Bauer memcpy(&tuple.ipv6.daddr, &ipv6->saddr, sizeof(tuple.ipv6.daddr)); 79023458901SLorenz Bauer 79123458901SLorenz Bauer if (!pkt_parse_icmp_l4_ports(pkt, (flow_ports_t *)&tuple.ipv6.sport)) { 79223458901SLorenz Bauer metrics->errors_total_malformed_icmp_pkt_too_big++; 79323458901SLorenz Bauer return INVALID; 79423458901SLorenz Bauer } 79523458901SLorenz Bauer 79623458901SLorenz Bauer return classify_icmp(pkt->skb, l4_proto, &tuple, sizeof(tuple.ipv6), 79723458901SLorenz Bauer metrics); 79823458901SLorenz Bauer } 79923458901SLorenz Bauer 80023458901SLorenz Bauer static verdict_t process_tcp(buf_t *pkt, void *iph, uint64_t iphlen, 80123458901SLorenz Bauer metrics_t *metrics) 80223458901SLorenz Bauer { 80323458901SLorenz Bauer metrics->l4_protocol_packets_total_tcp++; 80423458901SLorenz Bauer 80523458901SLorenz Bauer struct tcphdr _tcp; 80623458901SLorenz Bauer struct tcphdr *tcp = buf_assign(pkt, sizeof(_tcp), &_tcp); 80723458901SLorenz Bauer if (tcp == NULL) { 80823458901SLorenz Bauer metrics->errors_total_malformed_tcp++; 80923458901SLorenz Bauer return INVALID; 81023458901SLorenz Bauer } 81123458901SLorenz Bauer 81223458901SLorenz Bauer if (tcp->syn) { 81323458901SLorenz Bauer return SYN; 81423458901SLorenz Bauer } 81523458901SLorenz Bauer 81623458901SLorenz Bauer struct bpf_sock_tuple tuple; 81723458901SLorenz Bauer uint64_t tuplen = 81823458901SLorenz Bauer fill_tuple(&tuple, iph, iphlen, tcp->source, tcp->dest); 81923458901SLorenz Bauer return classify_tcp(pkt->skb, &tuple, tuplen, iph, tcp); 82023458901SLorenz Bauer } 82123458901SLorenz Bauer 82223458901SLorenz Bauer static verdict_t process_udp(buf_t *pkt, void *iph, uint64_t iphlen, 82323458901SLorenz Bauer metrics_t *metrics) 82423458901SLorenz Bauer { 82523458901SLorenz Bauer metrics->l4_protocol_packets_total_udp++; 82623458901SLorenz Bauer 82723458901SLorenz Bauer struct udphdr _udp; 82823458901SLorenz Bauer struct udphdr *udph = buf_assign(pkt, sizeof(_udp), &_udp); 82923458901SLorenz Bauer if (udph == NULL) { 83023458901SLorenz Bauer metrics->errors_total_malformed_udp++; 83123458901SLorenz Bauer return INVALID; 83223458901SLorenz Bauer } 83323458901SLorenz Bauer 83423458901SLorenz Bauer struct bpf_sock_tuple tuple; 83523458901SLorenz Bauer uint64_t tuplen = 83623458901SLorenz Bauer fill_tuple(&tuple, iph, iphlen, udph->source, udph->dest); 83723458901SLorenz Bauer return classify_udp(pkt->skb, &tuple, tuplen); 83823458901SLorenz Bauer } 83923458901SLorenz Bauer 84023458901SLorenz Bauer static verdict_t process_ipv4(buf_t *pkt, metrics_t *metrics) 84123458901SLorenz Bauer { 84223458901SLorenz Bauer metrics->l3_protocol_packets_total_ipv4++; 84323458901SLorenz Bauer 84423458901SLorenz Bauer struct iphdr _ip4; 84523458901SLorenz Bauer struct iphdr *ipv4 = pkt_parse_ipv4(pkt, &_ip4); 84623458901SLorenz Bauer if (ipv4 == NULL) { 84723458901SLorenz Bauer metrics->errors_total_malformed_ip++; 84823458901SLorenz Bauer return INVALID; 84923458901SLorenz Bauer } 85023458901SLorenz Bauer 85123458901SLorenz Bauer if (ipv4->version != 4) { 85223458901SLorenz Bauer metrics->errors_total_malformed_ip++; 85323458901SLorenz Bauer return INVALID; 85423458901SLorenz Bauer } 85523458901SLorenz Bauer 85623458901SLorenz Bauer if (ipv4_is_fragment(ipv4)) { 85723458901SLorenz Bauer metrics->errors_total_fragmented_ip++; 85823458901SLorenz Bauer return INVALID; 85923458901SLorenz Bauer } 86023458901SLorenz Bauer 86123458901SLorenz Bauer switch (ipv4->protocol) { 86223458901SLorenz Bauer case IPPROTO_ICMP: 86323458901SLorenz Bauer return process_icmpv4(pkt, metrics); 86423458901SLorenz Bauer 86523458901SLorenz Bauer case IPPROTO_TCP: 86623458901SLorenz Bauer return process_tcp(pkt, ipv4, sizeof(*ipv4), metrics); 86723458901SLorenz Bauer 86823458901SLorenz Bauer case IPPROTO_UDP: 86923458901SLorenz Bauer return process_udp(pkt, ipv4, sizeof(*ipv4), metrics); 87023458901SLorenz Bauer 87123458901SLorenz Bauer default: 87223458901SLorenz Bauer metrics->errors_total_unknown_l4_proto++; 87323458901SLorenz Bauer return INVALID; 87423458901SLorenz Bauer } 87523458901SLorenz Bauer } 87623458901SLorenz Bauer 87723458901SLorenz Bauer static verdict_t process_ipv6(buf_t *pkt, metrics_t *metrics) 87823458901SLorenz Bauer { 87923458901SLorenz Bauer metrics->l3_protocol_packets_total_ipv6++; 88023458901SLorenz Bauer 88123458901SLorenz Bauer uint8_t l4_proto; 88223458901SLorenz Bauer bool is_fragment; 88323458901SLorenz Bauer struct ipv6hdr _ipv6; 88423458901SLorenz Bauer struct ipv6hdr *ipv6 = 88523458901SLorenz Bauer pkt_parse_ipv6(pkt, &_ipv6, &l4_proto, &is_fragment); 88623458901SLorenz Bauer if (ipv6 == NULL) { 88723458901SLorenz Bauer metrics->errors_total_malformed_ip++; 88823458901SLorenz Bauer return INVALID; 88923458901SLorenz Bauer } 89023458901SLorenz Bauer 89123458901SLorenz Bauer if (ipv6->version != 6) { 89223458901SLorenz Bauer metrics->errors_total_malformed_ip++; 89323458901SLorenz Bauer return INVALID; 89423458901SLorenz Bauer } 89523458901SLorenz Bauer 89623458901SLorenz Bauer if (is_fragment) { 89723458901SLorenz Bauer metrics->errors_total_fragmented_ip++; 89823458901SLorenz Bauer return INVALID; 89923458901SLorenz Bauer } 90023458901SLorenz Bauer 90123458901SLorenz Bauer switch (l4_proto) { 90223458901SLorenz Bauer case IPPROTO_ICMPV6: 90323458901SLorenz Bauer return process_icmpv6(pkt, metrics); 90423458901SLorenz Bauer 90523458901SLorenz Bauer case IPPROTO_TCP: 90623458901SLorenz Bauer return process_tcp(pkt, ipv6, sizeof(*ipv6), metrics); 90723458901SLorenz Bauer 90823458901SLorenz Bauer case IPPROTO_UDP: 90923458901SLorenz Bauer return process_udp(pkt, ipv6, sizeof(*ipv6), metrics); 91023458901SLorenz Bauer 91123458901SLorenz Bauer default: 91223458901SLorenz Bauer metrics->errors_total_unknown_l4_proto++; 91323458901SLorenz Bauer return INVALID; 91423458901SLorenz Bauer } 91523458901SLorenz Bauer } 91623458901SLorenz Bauer 91723458901SLorenz Bauer SEC("classifier/cls_redirect") 91823458901SLorenz Bauer int cls_redirect(struct __sk_buff *skb) 91923458901SLorenz Bauer { 92023458901SLorenz Bauer metrics_t *metrics = get_global_metrics(); 92123458901SLorenz Bauer if (metrics == NULL) { 92223458901SLorenz Bauer return TC_ACT_SHOT; 92323458901SLorenz Bauer } 92423458901SLorenz Bauer 92523458901SLorenz Bauer metrics->processed_packets_total++; 92623458901SLorenz Bauer 92723458901SLorenz Bauer /* Pass bogus packets as long as we're not sure they're 92823458901SLorenz Bauer * destined for us. 92923458901SLorenz Bauer */ 93023458901SLorenz Bauer if (skb->protocol != bpf_htons(ETH_P_IP)) { 93123458901SLorenz Bauer return TC_ACT_OK; 93223458901SLorenz Bauer } 93323458901SLorenz Bauer 93423458901SLorenz Bauer encap_headers_t *encap; 93523458901SLorenz Bauer 93623458901SLorenz Bauer /* Make sure that all encapsulation headers are available in 93723458901SLorenz Bauer * the linear portion of the skb. This makes it easy to manipulate them. 93823458901SLorenz Bauer */ 93923458901SLorenz Bauer if (bpf_skb_pull_data(skb, sizeof(*encap))) { 94023458901SLorenz Bauer return TC_ACT_OK; 94123458901SLorenz Bauer } 94223458901SLorenz Bauer 94323458901SLorenz Bauer buf_t pkt = { 94423458901SLorenz Bauer .skb = skb, 94523458901SLorenz Bauer .head = (uint8_t *)(long)skb->data, 94623458901SLorenz Bauer .tail = (uint8_t *)(long)skb->data_end, 94723458901SLorenz Bauer }; 94823458901SLorenz Bauer 94923458901SLorenz Bauer encap = buf_assign(&pkt, sizeof(*encap), NULL); 95023458901SLorenz Bauer if (encap == NULL) { 95123458901SLorenz Bauer return TC_ACT_OK; 95223458901SLorenz Bauer } 95323458901SLorenz Bauer 95423458901SLorenz Bauer if (encap->ip.ihl != 5) { 95523458901SLorenz Bauer /* We never have any options. */ 95623458901SLorenz Bauer return TC_ACT_OK; 95723458901SLorenz Bauer } 95823458901SLorenz Bauer 95923458901SLorenz Bauer if (encap->ip.daddr != ENCAPSULATION_IP || 96023458901SLorenz Bauer encap->ip.protocol != IPPROTO_UDP) { 96123458901SLorenz Bauer return TC_ACT_OK; 96223458901SLorenz Bauer } 96323458901SLorenz Bauer 96423458901SLorenz Bauer /* TODO Check UDP length? */ 96523458901SLorenz Bauer if (encap->udp.dest != ENCAPSULATION_PORT) { 96623458901SLorenz Bauer return TC_ACT_OK; 96723458901SLorenz Bauer } 96823458901SLorenz Bauer 96923458901SLorenz Bauer /* We now know that the packet is destined to us, we can 97023458901SLorenz Bauer * drop bogus ones. 97123458901SLorenz Bauer */ 97223458901SLorenz Bauer if (ipv4_is_fragment((void *)&encap->ip)) { 97323458901SLorenz Bauer metrics->errors_total_fragmented_ip++; 97423458901SLorenz Bauer return TC_ACT_SHOT; 97523458901SLorenz Bauer } 97623458901SLorenz Bauer 97723458901SLorenz Bauer if (encap->gue.variant != 0) { 97823458901SLorenz Bauer metrics->errors_total_malformed_encapsulation++; 97923458901SLorenz Bauer return TC_ACT_SHOT; 98023458901SLorenz Bauer } 98123458901SLorenz Bauer 98223458901SLorenz Bauer if (encap->gue.control != 0) { 98323458901SLorenz Bauer metrics->errors_total_malformed_encapsulation++; 98423458901SLorenz Bauer return TC_ACT_SHOT; 98523458901SLorenz Bauer } 98623458901SLorenz Bauer 98723458901SLorenz Bauer if (encap->gue.flags != 0) { 98823458901SLorenz Bauer metrics->errors_total_malformed_encapsulation++; 98923458901SLorenz Bauer return TC_ACT_SHOT; 99023458901SLorenz Bauer } 99123458901SLorenz Bauer 99223458901SLorenz Bauer if (encap->gue.hlen != 99323458901SLorenz Bauer sizeof(encap->unigue) / 4 + encap->unigue.hop_count) { 99423458901SLorenz Bauer metrics->errors_total_malformed_encapsulation++; 99523458901SLorenz Bauer return TC_ACT_SHOT; 99623458901SLorenz Bauer } 99723458901SLorenz Bauer 99823458901SLorenz Bauer if (encap->unigue.version != 0) { 99923458901SLorenz Bauer metrics->errors_total_malformed_encapsulation++; 100023458901SLorenz Bauer return TC_ACT_SHOT; 100123458901SLorenz Bauer } 100223458901SLorenz Bauer 100323458901SLorenz Bauer if (encap->unigue.reserved != 0) { 100423458901SLorenz Bauer return TC_ACT_SHOT; 100523458901SLorenz Bauer } 100623458901SLorenz Bauer 100723458901SLorenz Bauer struct in_addr next_hop; 100823458901SLorenz Bauer MAYBE_RETURN(get_next_hop(&pkt, encap, &next_hop)); 100923458901SLorenz Bauer 101023458901SLorenz Bauer if (next_hop.s_addr == 0) { 101123458901SLorenz Bauer metrics->accepted_packets_total_last_hop++; 101223458901SLorenz Bauer return accept_locally(skb, encap); 101323458901SLorenz Bauer } 101423458901SLorenz Bauer 101523458901SLorenz Bauer verdict_t verdict; 101623458901SLorenz Bauer switch (encap->gue.proto_ctype) { 101723458901SLorenz Bauer case IPPROTO_IPIP: 101823458901SLorenz Bauer verdict = process_ipv4(&pkt, metrics); 101923458901SLorenz Bauer break; 102023458901SLorenz Bauer 102123458901SLorenz Bauer case IPPROTO_IPV6: 102223458901SLorenz Bauer verdict = process_ipv6(&pkt, metrics); 102323458901SLorenz Bauer break; 102423458901SLorenz Bauer 102523458901SLorenz Bauer default: 102623458901SLorenz Bauer metrics->errors_total_unknown_l3_proto++; 102723458901SLorenz Bauer return TC_ACT_SHOT; 102823458901SLorenz Bauer } 102923458901SLorenz Bauer 103023458901SLorenz Bauer switch (verdict) { 103123458901SLorenz Bauer case INVALID: 103223458901SLorenz Bauer /* metrics have already been bumped */ 103323458901SLorenz Bauer return TC_ACT_SHOT; 103423458901SLorenz Bauer 103523458901SLorenz Bauer case UNKNOWN: 103623458901SLorenz Bauer return forward_to_next_hop(skb, encap, &next_hop, metrics); 103723458901SLorenz Bauer 103823458901SLorenz Bauer case ECHO_REQUEST: 103923458901SLorenz Bauer metrics->accepted_packets_total_icmp_echo_request++; 104023458901SLorenz Bauer break; 104123458901SLorenz Bauer 104223458901SLorenz Bauer case SYN: 104323458901SLorenz Bauer if (encap->unigue.forward_syn) { 104423458901SLorenz Bauer return forward_to_next_hop(skb, encap, &next_hop, 104523458901SLorenz Bauer metrics); 104623458901SLorenz Bauer } 104723458901SLorenz Bauer 104823458901SLorenz Bauer metrics->accepted_packets_total_syn++; 104923458901SLorenz Bauer break; 105023458901SLorenz Bauer 105123458901SLorenz Bauer case SYN_COOKIE: 105223458901SLorenz Bauer metrics->accepted_packets_total_syn_cookies++; 105323458901SLorenz Bauer break; 105423458901SLorenz Bauer 105523458901SLorenz Bauer case ESTABLISHED: 105623458901SLorenz Bauer metrics->accepted_packets_total_established++; 105723458901SLorenz Bauer break; 105823458901SLorenz Bauer } 105923458901SLorenz Bauer 106023458901SLorenz Bauer return accept_locally(skb, encap); 106123458901SLorenz Bauer } 1062