1 // SPDX-License-Identifier: LGPL-2.1 OR BSD-2-Clause 2 /* Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. */ 3 4 #include "vmlinux.h" 5 6 #include <bpf/bpf_helpers.h> 7 #include <bpf/bpf_endian.h> 8 #include <asm/errno.h> 9 10 #define TC_ACT_OK 0 11 #define TC_ACT_SHOT 2 12 13 #define NSEC_PER_SEC 1000000000L 14 15 #define ETH_ALEN 6 16 #define ETH_P_IP 0x0800 17 #define ETH_P_IPV6 0x86DD 18 19 #define tcp_flag_word(tp) (((union tcp_word_hdr *)(tp))->words[3]) 20 21 #define IP_DF 0x4000 22 #define IP_MF 0x2000 23 #define IP_OFFSET 0x1fff 24 25 #define NEXTHDR_TCP 6 26 27 #define TCPOPT_NOP 1 28 #define TCPOPT_EOL 0 29 #define TCPOPT_MSS 2 30 #define TCPOPT_WINDOW 3 31 #define TCPOPT_SACK_PERM 4 32 #define TCPOPT_TIMESTAMP 8 33 34 #define TCPOLEN_MSS 4 35 #define TCPOLEN_WINDOW 3 36 #define TCPOLEN_SACK_PERM 2 37 #define TCPOLEN_TIMESTAMP 10 38 39 #define TCP_TS_HZ 1000 40 #define TS_OPT_WSCALE_MASK 0xf 41 #define TS_OPT_SACK (1 << 4) 42 #define TS_OPT_ECN (1 << 5) 43 #define TSBITS 6 44 #define TSMASK (((__u32)1 << TSBITS) - 1) 45 #define TCP_MAX_WSCALE 14U 46 47 #define IPV4_MAXLEN 60 48 #define TCP_MAXLEN 60 49 50 #define DEFAULT_MSS4 1460 51 #define DEFAULT_MSS6 1440 52 #define DEFAULT_WSCALE 7 53 #define DEFAULT_TTL 64 54 #define MAX_ALLOWED_PORTS 8 55 56 #define swap(a, b) \ 57 do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0) 58 59 #define __get_unaligned_t(type, ptr) ({ \ 60 const struct { type x; } __attribute__((__packed__)) *__pptr = (typeof(__pptr))(ptr); \ 61 __pptr->x; \ 62 }) 63 64 #define get_unaligned(ptr) __get_unaligned_t(typeof(*(ptr)), (ptr)) 65 66 struct { 67 __uint(type, BPF_MAP_TYPE_ARRAY); 68 __type(key, __u32); 69 __type(value, __u64); 70 __uint(max_entries, 2); 71 } values SEC(".maps"); 72 73 struct { 74 __uint(type, BPF_MAP_TYPE_ARRAY); 75 __type(key, __u32); 76 __type(value, __u16); 77 __uint(max_entries, MAX_ALLOWED_PORTS); 78 } allowed_ports SEC(".maps"); 79 80 /* Some symbols defined in net/netfilter/nf_conntrack_bpf.c are unavailable in 81 * vmlinux.h if CONFIG_NF_CONNTRACK=m, so they are redefined locally. 82 */ 83 84 struct bpf_ct_opts___local { 85 s32 netns_id; 86 s32 error; 87 u8 l4proto; 88 u8 dir; 89 u8 reserved[2]; 90 } __attribute__((preserve_access_index)); 91 92 #define BPF_F_CURRENT_NETNS (-1) 93 94 extern struct nf_conn *bpf_xdp_ct_lookup(struct xdp_md *xdp_ctx, 95 struct bpf_sock_tuple *bpf_tuple, 96 __u32 len_tuple, 97 struct bpf_ct_opts___local *opts, 98 __u32 len_opts) __ksym; 99 100 extern struct nf_conn *bpf_skb_ct_lookup(struct __sk_buff *skb_ctx, 101 struct bpf_sock_tuple *bpf_tuple, 102 u32 len_tuple, 103 struct bpf_ct_opts___local *opts, 104 u32 len_opts) __ksym; 105 106 extern void bpf_ct_release(struct nf_conn *ct) __ksym; 107 108 static __always_inline void swap_eth_addr(__u8 *a, __u8 *b) 109 { 110 __u8 tmp[ETH_ALEN]; 111 112 __builtin_memcpy(tmp, a, ETH_ALEN); 113 __builtin_memcpy(a, b, ETH_ALEN); 114 __builtin_memcpy(b, tmp, ETH_ALEN); 115 } 116 117 static __always_inline __u16 csum_fold(__u32 csum) 118 { 119 csum = (csum & 0xffff) + (csum >> 16); 120 csum = (csum & 0xffff) + (csum >> 16); 121 return (__u16)~csum; 122 } 123 124 static __always_inline __u16 csum_tcpudp_magic(__be32 saddr, __be32 daddr, 125 __u32 len, __u8 proto, 126 __u32 csum) 127 { 128 __u64 s = csum; 129 130 s += (__u32)saddr; 131 s += (__u32)daddr; 132 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ 133 s += proto + len; 134 #elif __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ 135 s += (proto + len) << 8; 136 #else 137 #error Unknown endian 138 #endif 139 s = (s & 0xffffffff) + (s >> 32); 140 s = (s & 0xffffffff) + (s >> 32); 141 142 return csum_fold((__u32)s); 143 } 144 145 static __always_inline __u16 csum_ipv6_magic(const struct in6_addr *saddr, 146 const struct in6_addr *daddr, 147 __u32 len, __u8 proto, __u32 csum) 148 { 149 __u64 sum = csum; 150 int i; 151 152 #pragma unroll 153 for (i = 0; i < 4; i++) 154 sum += (__u32)saddr->in6_u.u6_addr32[i]; 155 156 #pragma unroll 157 for (i = 0; i < 4; i++) 158 sum += (__u32)daddr->in6_u.u6_addr32[i]; 159 160 /* Don't combine additions to avoid 32-bit overflow. */ 161 sum += bpf_htonl(len); 162 sum += bpf_htonl(proto); 163 164 sum = (sum & 0xffffffff) + (sum >> 32); 165 sum = (sum & 0xffffffff) + (sum >> 32); 166 167 return csum_fold((__u32)sum); 168 } 169 170 static __always_inline __u64 tcp_clock_ns(void) 171 { 172 return bpf_ktime_get_ns(); 173 } 174 175 static __always_inline __u32 tcp_ns_to_ts(__u64 ns) 176 { 177 return ns / (NSEC_PER_SEC / TCP_TS_HZ); 178 } 179 180 static __always_inline __u32 tcp_time_stamp_raw(void) 181 { 182 return tcp_ns_to_ts(tcp_clock_ns()); 183 } 184 185 struct tcpopt_context { 186 __u8 *ptr; 187 __u8 *end; 188 void *data_end; 189 __be32 *tsecr; 190 __u8 wscale; 191 bool option_timestamp; 192 bool option_sack; 193 }; 194 195 static int tscookie_tcpopt_parse(struct tcpopt_context *ctx) 196 { 197 __u8 opcode, opsize; 198 199 if (ctx->ptr >= ctx->end) 200 return 1; 201 if (ctx->ptr >= ctx->data_end) 202 return 1; 203 204 opcode = ctx->ptr[0]; 205 206 if (opcode == TCPOPT_EOL) 207 return 1; 208 if (opcode == TCPOPT_NOP) { 209 ++ctx->ptr; 210 return 0; 211 } 212 213 if (ctx->ptr + 1 >= ctx->end) 214 return 1; 215 if (ctx->ptr + 1 >= ctx->data_end) 216 return 1; 217 opsize = ctx->ptr[1]; 218 if (opsize < 2) 219 return 1; 220 221 if (ctx->ptr + opsize > ctx->end) 222 return 1; 223 224 switch (opcode) { 225 case TCPOPT_WINDOW: 226 if (opsize == TCPOLEN_WINDOW && ctx->ptr + TCPOLEN_WINDOW <= ctx->data_end) 227 ctx->wscale = ctx->ptr[2] < TCP_MAX_WSCALE ? ctx->ptr[2] : TCP_MAX_WSCALE; 228 break; 229 case TCPOPT_TIMESTAMP: 230 if (opsize == TCPOLEN_TIMESTAMP && ctx->ptr + TCPOLEN_TIMESTAMP <= ctx->data_end) { 231 ctx->option_timestamp = true; 232 /* Client's tsval becomes our tsecr. */ 233 *ctx->tsecr = get_unaligned((__be32 *)(ctx->ptr + 2)); 234 } 235 break; 236 case TCPOPT_SACK_PERM: 237 if (opsize == TCPOLEN_SACK_PERM) 238 ctx->option_sack = true; 239 break; 240 } 241 242 ctx->ptr += opsize; 243 244 return 0; 245 } 246 247 static int tscookie_tcpopt_parse_batch(__u32 index, void *context) 248 { 249 int i; 250 251 for (i = 0; i < 7; i++) 252 if (tscookie_tcpopt_parse(context)) 253 return 1; 254 return 0; 255 } 256 257 static __always_inline bool tscookie_init(struct tcphdr *tcp_header, 258 __u16 tcp_len, __be32 *tsval, 259 __be32 *tsecr, void *data_end) 260 { 261 struct tcpopt_context loop_ctx = { 262 .ptr = (__u8 *)(tcp_header + 1), 263 .end = (__u8 *)tcp_header + tcp_len, 264 .data_end = data_end, 265 .tsecr = tsecr, 266 .wscale = TS_OPT_WSCALE_MASK, 267 .option_timestamp = false, 268 .option_sack = false, 269 }; 270 u32 cookie; 271 272 bpf_loop(6, tscookie_tcpopt_parse_batch, &loop_ctx, 0); 273 274 if (!loop_ctx.option_timestamp) 275 return false; 276 277 cookie = tcp_time_stamp_raw() & ~TSMASK; 278 cookie |= loop_ctx.wscale & TS_OPT_WSCALE_MASK; 279 if (loop_ctx.option_sack) 280 cookie |= TS_OPT_SACK; 281 if (tcp_header->ece && tcp_header->cwr) 282 cookie |= TS_OPT_ECN; 283 *tsval = bpf_htonl(cookie); 284 285 return true; 286 } 287 288 static __always_inline void values_get_tcpipopts(__u16 *mss, __u8 *wscale, 289 __u8 *ttl, bool ipv6) 290 { 291 __u32 key = 0; 292 __u64 *value; 293 294 value = bpf_map_lookup_elem(&values, &key); 295 if (value && *value != 0) { 296 if (ipv6) 297 *mss = (*value >> 32) & 0xffff; 298 else 299 *mss = *value & 0xffff; 300 *wscale = (*value >> 16) & 0xf; 301 *ttl = (*value >> 24) & 0xff; 302 return; 303 } 304 305 *mss = ipv6 ? DEFAULT_MSS6 : DEFAULT_MSS4; 306 *wscale = DEFAULT_WSCALE; 307 *ttl = DEFAULT_TTL; 308 } 309 310 static __always_inline void values_inc_synacks(void) 311 { 312 __u32 key = 1; 313 __u32 *value; 314 315 value = bpf_map_lookup_elem(&values, &key); 316 if (value) 317 __sync_fetch_and_add(value, 1); 318 } 319 320 static __always_inline bool check_port_allowed(__u16 port) 321 { 322 __u32 i; 323 324 for (i = 0; i < MAX_ALLOWED_PORTS; i++) { 325 __u32 key = i; 326 __u16 *value; 327 328 value = bpf_map_lookup_elem(&allowed_ports, &key); 329 330 if (!value) 331 break; 332 /* 0 is a terminator value. Check it first to avoid matching on 333 * a forbidden port == 0 and returning true. 334 */ 335 if (*value == 0) 336 break; 337 338 if (*value == port) 339 return true; 340 } 341 342 return false; 343 } 344 345 struct header_pointers { 346 struct ethhdr *eth; 347 struct iphdr *ipv4; 348 struct ipv6hdr *ipv6; 349 struct tcphdr *tcp; 350 __u16 tcp_len; 351 }; 352 353 static __always_inline int tcp_dissect(void *data, void *data_end, 354 struct header_pointers *hdr) 355 { 356 hdr->eth = data; 357 if (hdr->eth + 1 > data_end) 358 return XDP_DROP; 359 360 switch (bpf_ntohs(hdr->eth->h_proto)) { 361 case ETH_P_IP: 362 hdr->ipv6 = NULL; 363 364 hdr->ipv4 = (void *)hdr->eth + sizeof(*hdr->eth); 365 if (hdr->ipv4 + 1 > data_end) 366 return XDP_DROP; 367 if (hdr->ipv4->ihl * 4 < sizeof(*hdr->ipv4)) 368 return XDP_DROP; 369 if (hdr->ipv4->version != 4) 370 return XDP_DROP; 371 372 if (hdr->ipv4->protocol != IPPROTO_TCP) 373 return XDP_PASS; 374 375 hdr->tcp = (void *)hdr->ipv4 + hdr->ipv4->ihl * 4; 376 break; 377 case ETH_P_IPV6: 378 hdr->ipv4 = NULL; 379 380 hdr->ipv6 = (void *)hdr->eth + sizeof(*hdr->eth); 381 if (hdr->ipv6 + 1 > data_end) 382 return XDP_DROP; 383 if (hdr->ipv6->version != 6) 384 return XDP_DROP; 385 386 /* XXX: Extension headers are not supported and could circumvent 387 * XDP SYN flood protection. 388 */ 389 if (hdr->ipv6->nexthdr != NEXTHDR_TCP) 390 return XDP_PASS; 391 392 hdr->tcp = (void *)hdr->ipv6 + sizeof(*hdr->ipv6); 393 break; 394 default: 395 /* XXX: VLANs will circumvent XDP SYN flood protection. */ 396 return XDP_PASS; 397 } 398 399 if (hdr->tcp + 1 > data_end) 400 return XDP_DROP; 401 hdr->tcp_len = hdr->tcp->doff * 4; 402 if (hdr->tcp_len < sizeof(*hdr->tcp)) 403 return XDP_DROP; 404 405 return XDP_TX; 406 } 407 408 static __always_inline int tcp_lookup(void *ctx, struct header_pointers *hdr, bool xdp) 409 { 410 struct bpf_ct_opts___local ct_lookup_opts = { 411 .netns_id = BPF_F_CURRENT_NETNS, 412 .l4proto = IPPROTO_TCP, 413 }; 414 struct bpf_sock_tuple tup = {}; 415 struct nf_conn *ct; 416 __u32 tup_size; 417 418 if (hdr->ipv4) { 419 /* TCP doesn't normally use fragments, and XDP can't reassemble 420 * them. 421 */ 422 if ((hdr->ipv4->frag_off & bpf_htons(IP_DF | IP_MF | IP_OFFSET)) != bpf_htons(IP_DF)) 423 return XDP_DROP; 424 425 tup.ipv4.saddr = hdr->ipv4->saddr; 426 tup.ipv4.daddr = hdr->ipv4->daddr; 427 tup.ipv4.sport = hdr->tcp->source; 428 tup.ipv4.dport = hdr->tcp->dest; 429 tup_size = sizeof(tup.ipv4); 430 } else if (hdr->ipv6) { 431 __builtin_memcpy(tup.ipv6.saddr, &hdr->ipv6->saddr, sizeof(tup.ipv6.saddr)); 432 __builtin_memcpy(tup.ipv6.daddr, &hdr->ipv6->daddr, sizeof(tup.ipv6.daddr)); 433 tup.ipv6.sport = hdr->tcp->source; 434 tup.ipv6.dport = hdr->tcp->dest; 435 tup_size = sizeof(tup.ipv6); 436 } else { 437 /* The verifier can't track that either ipv4 or ipv6 is not 438 * NULL. 439 */ 440 return XDP_ABORTED; 441 } 442 if (xdp) 443 ct = bpf_xdp_ct_lookup(ctx, &tup, tup_size, &ct_lookup_opts, sizeof(ct_lookup_opts)); 444 else 445 ct = bpf_skb_ct_lookup(ctx, &tup, tup_size, &ct_lookup_opts, sizeof(ct_lookup_opts)); 446 if (ct) { 447 unsigned long status = ct->status; 448 449 bpf_ct_release(ct); 450 if (status & IPS_CONFIRMED_BIT) 451 return XDP_PASS; 452 } else if (ct_lookup_opts.error != -ENOENT) { 453 return XDP_ABORTED; 454 } 455 456 /* error == -ENOENT || !(status & IPS_CONFIRMED_BIT) */ 457 return XDP_TX; 458 } 459 460 static __always_inline __u8 tcp_mkoptions(__be32 *buf, __be32 *tsopt, __u16 mss, 461 __u8 wscale) 462 { 463 __be32 *start = buf; 464 465 *buf++ = bpf_htonl((TCPOPT_MSS << 24) | (TCPOLEN_MSS << 16) | mss); 466 467 if (!tsopt) 468 return buf - start; 469 470 if (tsopt[0] & bpf_htonl(1 << 4)) 471 *buf++ = bpf_htonl((TCPOPT_SACK_PERM << 24) | 472 (TCPOLEN_SACK_PERM << 16) | 473 (TCPOPT_TIMESTAMP << 8) | 474 TCPOLEN_TIMESTAMP); 475 else 476 *buf++ = bpf_htonl((TCPOPT_NOP << 24) | 477 (TCPOPT_NOP << 16) | 478 (TCPOPT_TIMESTAMP << 8) | 479 TCPOLEN_TIMESTAMP); 480 *buf++ = tsopt[0]; 481 *buf++ = tsopt[1]; 482 483 if ((tsopt[0] & bpf_htonl(0xf)) != bpf_htonl(0xf)) 484 *buf++ = bpf_htonl((TCPOPT_NOP << 24) | 485 (TCPOPT_WINDOW << 16) | 486 (TCPOLEN_WINDOW << 8) | 487 wscale); 488 489 return buf - start; 490 } 491 492 static __always_inline void tcp_gen_synack(struct tcphdr *tcp_header, 493 __u32 cookie, __be32 *tsopt, 494 __u16 mss, __u8 wscale) 495 { 496 void *tcp_options; 497 498 tcp_flag_word(tcp_header) = TCP_FLAG_SYN | TCP_FLAG_ACK; 499 if (tsopt && (tsopt[0] & bpf_htonl(1 << 5))) 500 tcp_flag_word(tcp_header) |= TCP_FLAG_ECE; 501 tcp_header->doff = 5; /* doff is part of tcp_flag_word. */ 502 swap(tcp_header->source, tcp_header->dest); 503 tcp_header->ack_seq = bpf_htonl(bpf_ntohl(tcp_header->seq) + 1); 504 tcp_header->seq = bpf_htonl(cookie); 505 tcp_header->window = 0; 506 tcp_header->urg_ptr = 0; 507 tcp_header->check = 0; /* Calculate checksum later. */ 508 509 tcp_options = (void *)(tcp_header + 1); 510 tcp_header->doff += tcp_mkoptions(tcp_options, tsopt, mss, wscale); 511 } 512 513 static __always_inline void tcpv4_gen_synack(struct header_pointers *hdr, 514 __u32 cookie, __be32 *tsopt) 515 { 516 __u8 wscale; 517 __u16 mss; 518 __u8 ttl; 519 520 values_get_tcpipopts(&mss, &wscale, &ttl, false); 521 522 swap_eth_addr(hdr->eth->h_source, hdr->eth->h_dest); 523 524 swap(hdr->ipv4->saddr, hdr->ipv4->daddr); 525 hdr->ipv4->check = 0; /* Calculate checksum later. */ 526 hdr->ipv4->tos = 0; 527 hdr->ipv4->id = 0; 528 hdr->ipv4->ttl = ttl; 529 530 tcp_gen_synack(hdr->tcp, cookie, tsopt, mss, wscale); 531 532 hdr->tcp_len = hdr->tcp->doff * 4; 533 hdr->ipv4->tot_len = bpf_htons(sizeof(*hdr->ipv4) + hdr->tcp_len); 534 } 535 536 static __always_inline void tcpv6_gen_synack(struct header_pointers *hdr, 537 __u32 cookie, __be32 *tsopt) 538 { 539 __u8 wscale; 540 __u16 mss; 541 __u8 ttl; 542 543 values_get_tcpipopts(&mss, &wscale, &ttl, true); 544 545 swap_eth_addr(hdr->eth->h_source, hdr->eth->h_dest); 546 547 swap(hdr->ipv6->saddr, hdr->ipv6->daddr); 548 *(__be32 *)hdr->ipv6 = bpf_htonl(0x60000000); 549 hdr->ipv6->hop_limit = ttl; 550 551 tcp_gen_synack(hdr->tcp, cookie, tsopt, mss, wscale); 552 553 hdr->tcp_len = hdr->tcp->doff * 4; 554 hdr->ipv6->payload_len = bpf_htons(hdr->tcp_len); 555 } 556 557 static __always_inline int syncookie_handle_syn(struct header_pointers *hdr, 558 void *ctx, 559 void *data, void *data_end, 560 bool xdp) 561 { 562 __u32 old_pkt_size, new_pkt_size; 563 /* Unlike clang 10, clang 11 and 12 generate code that doesn't pass the 564 * BPF verifier if tsopt is not volatile. Volatile forces it to store 565 * the pointer value and use it directly, otherwise tcp_mkoptions is 566 * (mis)compiled like this: 567 * if (!tsopt) 568 * return buf - start; 569 * reg = stored_return_value_of_tscookie_init; 570 * if (reg) 571 * tsopt = tsopt_buf; 572 * else 573 * tsopt = NULL; 574 * ... 575 * *buf++ = tsopt[1]; 576 * It creates a dead branch where tsopt is assigned NULL, but the 577 * verifier can't prove it's dead and blocks the program. 578 */ 579 __be32 * volatile tsopt = NULL; 580 __be32 tsopt_buf[2] = {}; 581 __u16 ip_len; 582 __u32 cookie; 583 __s64 value; 584 585 /* Checksum is not yet verified, but both checksum failure and TCP 586 * header checks return XDP_DROP, so the order doesn't matter. 587 */ 588 if (hdr->tcp->fin || hdr->tcp->rst) 589 return XDP_DROP; 590 591 /* Issue SYN cookies on allowed ports, drop SYN packets on blocked 592 * ports. 593 */ 594 if (!check_port_allowed(bpf_ntohs(hdr->tcp->dest))) 595 return XDP_DROP; 596 597 if (hdr->ipv4) { 598 /* Check the IPv4 and TCP checksums before creating a SYNACK. */ 599 value = bpf_csum_diff(0, 0, (void *)hdr->ipv4, hdr->ipv4->ihl * 4, 0); 600 if (value < 0) 601 return XDP_ABORTED; 602 if (csum_fold(value) != 0) 603 return XDP_DROP; /* Bad IPv4 checksum. */ 604 605 value = bpf_csum_diff(0, 0, (void *)hdr->tcp, hdr->tcp_len, 0); 606 if (value < 0) 607 return XDP_ABORTED; 608 if (csum_tcpudp_magic(hdr->ipv4->saddr, hdr->ipv4->daddr, 609 hdr->tcp_len, IPPROTO_TCP, value) != 0) 610 return XDP_DROP; /* Bad TCP checksum. */ 611 612 ip_len = sizeof(*hdr->ipv4); 613 614 value = bpf_tcp_raw_gen_syncookie_ipv4(hdr->ipv4, hdr->tcp, 615 hdr->tcp_len); 616 } else if (hdr->ipv6) { 617 /* Check the TCP checksum before creating a SYNACK. */ 618 value = bpf_csum_diff(0, 0, (void *)hdr->tcp, hdr->tcp_len, 0); 619 if (value < 0) 620 return XDP_ABORTED; 621 if (csum_ipv6_magic(&hdr->ipv6->saddr, &hdr->ipv6->daddr, 622 hdr->tcp_len, IPPROTO_TCP, value) != 0) 623 return XDP_DROP; /* Bad TCP checksum. */ 624 625 ip_len = sizeof(*hdr->ipv6); 626 627 value = bpf_tcp_raw_gen_syncookie_ipv6(hdr->ipv6, hdr->tcp, 628 hdr->tcp_len); 629 } else { 630 return XDP_ABORTED; 631 } 632 633 if (value < 0) 634 return XDP_ABORTED; 635 cookie = (__u32)value; 636 637 if (tscookie_init((void *)hdr->tcp, hdr->tcp_len, 638 &tsopt_buf[0], &tsopt_buf[1], data_end)) 639 tsopt = tsopt_buf; 640 641 /* Check that there is enough space for a SYNACK. It also covers 642 * the check that the destination of the __builtin_memmove below 643 * doesn't overflow. 644 */ 645 if (data + sizeof(*hdr->eth) + ip_len + TCP_MAXLEN > data_end) 646 return XDP_ABORTED; 647 648 if (hdr->ipv4) { 649 if (hdr->ipv4->ihl * 4 > sizeof(*hdr->ipv4)) { 650 struct tcphdr *new_tcp_header; 651 652 new_tcp_header = data + sizeof(*hdr->eth) + sizeof(*hdr->ipv4); 653 __builtin_memmove(new_tcp_header, hdr->tcp, sizeof(*hdr->tcp)); 654 hdr->tcp = new_tcp_header; 655 656 hdr->ipv4->ihl = sizeof(*hdr->ipv4) / 4; 657 } 658 659 tcpv4_gen_synack(hdr, cookie, tsopt); 660 } else if (hdr->ipv6) { 661 tcpv6_gen_synack(hdr, cookie, tsopt); 662 } else { 663 return XDP_ABORTED; 664 } 665 666 /* Recalculate checksums. */ 667 hdr->tcp->check = 0; 668 value = bpf_csum_diff(0, 0, (void *)hdr->tcp, hdr->tcp_len, 0); 669 if (value < 0) 670 return XDP_ABORTED; 671 if (hdr->ipv4) { 672 hdr->tcp->check = csum_tcpudp_magic(hdr->ipv4->saddr, 673 hdr->ipv4->daddr, 674 hdr->tcp_len, 675 IPPROTO_TCP, 676 value); 677 678 hdr->ipv4->check = 0; 679 value = bpf_csum_diff(0, 0, (void *)hdr->ipv4, sizeof(*hdr->ipv4), 0); 680 if (value < 0) 681 return XDP_ABORTED; 682 hdr->ipv4->check = csum_fold(value); 683 } else if (hdr->ipv6) { 684 hdr->tcp->check = csum_ipv6_magic(&hdr->ipv6->saddr, 685 &hdr->ipv6->daddr, 686 hdr->tcp_len, 687 IPPROTO_TCP, 688 value); 689 } else { 690 return XDP_ABORTED; 691 } 692 693 /* Set the new packet size. */ 694 old_pkt_size = data_end - data; 695 new_pkt_size = sizeof(*hdr->eth) + ip_len + hdr->tcp->doff * 4; 696 if (xdp) { 697 if (bpf_xdp_adjust_tail(ctx, new_pkt_size - old_pkt_size)) 698 return XDP_ABORTED; 699 } else { 700 if (bpf_skb_change_tail(ctx, new_pkt_size, 0)) 701 return XDP_ABORTED; 702 } 703 704 values_inc_synacks(); 705 706 return XDP_TX; 707 } 708 709 static __always_inline int syncookie_handle_ack(struct header_pointers *hdr) 710 { 711 int err; 712 713 if (hdr->tcp->rst) 714 return XDP_DROP; 715 716 if (hdr->ipv4) 717 err = bpf_tcp_raw_check_syncookie_ipv4(hdr->ipv4, hdr->tcp); 718 else if (hdr->ipv6) 719 err = bpf_tcp_raw_check_syncookie_ipv6(hdr->ipv6, hdr->tcp); 720 else 721 return XDP_ABORTED; 722 if (err) 723 return XDP_DROP; 724 725 return XDP_PASS; 726 } 727 728 static __always_inline int syncookie_part1(void *ctx, void *data, void *data_end, 729 struct header_pointers *hdr, bool xdp) 730 { 731 int ret; 732 733 ret = tcp_dissect(data, data_end, hdr); 734 if (ret != XDP_TX) 735 return ret; 736 737 ret = tcp_lookup(ctx, hdr, xdp); 738 if (ret != XDP_TX) 739 return ret; 740 741 /* Packet is TCP and doesn't belong to an established connection. */ 742 743 if ((hdr->tcp->syn ^ hdr->tcp->ack) != 1) 744 return XDP_DROP; 745 746 /* Grow the TCP header to TCP_MAXLEN to be able to pass any hdr->tcp_len 747 * to bpf_tcp_raw_gen_syncookie_ipv{4,6} and pass the verifier. 748 */ 749 if (xdp) { 750 if (bpf_xdp_adjust_tail(ctx, TCP_MAXLEN - hdr->tcp_len)) 751 return XDP_ABORTED; 752 } else { 753 /* Without volatile the verifier throws this error: 754 * R9 32-bit pointer arithmetic prohibited 755 */ 756 volatile u64 old_len = data_end - data; 757 758 if (bpf_skb_change_tail(ctx, old_len + TCP_MAXLEN - hdr->tcp_len, 0)) 759 return XDP_ABORTED; 760 } 761 762 return XDP_TX; 763 } 764 765 static __always_inline int syncookie_part2(void *ctx, void *data, void *data_end, 766 struct header_pointers *hdr, bool xdp) 767 { 768 if (hdr->ipv4) { 769 hdr->eth = data; 770 hdr->ipv4 = (void *)hdr->eth + sizeof(*hdr->eth); 771 /* IPV4_MAXLEN is needed when calculating checksum. 772 * At least sizeof(struct iphdr) is needed here to access ihl. 773 */ 774 if ((void *)hdr->ipv4 + IPV4_MAXLEN > data_end) 775 return XDP_ABORTED; 776 hdr->tcp = (void *)hdr->ipv4 + hdr->ipv4->ihl * 4; 777 } else if (hdr->ipv6) { 778 hdr->eth = data; 779 hdr->ipv6 = (void *)hdr->eth + sizeof(*hdr->eth); 780 hdr->tcp = (void *)hdr->ipv6 + sizeof(*hdr->ipv6); 781 } else { 782 return XDP_ABORTED; 783 } 784 785 if ((void *)hdr->tcp + TCP_MAXLEN > data_end) 786 return XDP_ABORTED; 787 788 /* We run out of registers, tcp_len gets spilled to the stack, and the 789 * verifier forgets its min and max values checked above in tcp_dissect. 790 */ 791 hdr->tcp_len = hdr->tcp->doff * 4; 792 if (hdr->tcp_len < sizeof(*hdr->tcp)) 793 return XDP_ABORTED; 794 795 return hdr->tcp->syn ? syncookie_handle_syn(hdr, ctx, data, data_end, xdp) : 796 syncookie_handle_ack(hdr); 797 } 798 799 SEC("xdp") 800 int syncookie_xdp(struct xdp_md *ctx) 801 { 802 void *data_end = (void *)(long)ctx->data_end; 803 void *data = (void *)(long)ctx->data; 804 struct header_pointers hdr; 805 int ret; 806 807 ret = syncookie_part1(ctx, data, data_end, &hdr, true); 808 if (ret != XDP_TX) 809 return ret; 810 811 data_end = (void *)(long)ctx->data_end; 812 data = (void *)(long)ctx->data; 813 814 return syncookie_part2(ctx, data, data_end, &hdr, true); 815 } 816 817 SEC("tc") 818 int syncookie_tc(struct __sk_buff *skb) 819 { 820 void *data_end = (void *)(long)skb->data_end; 821 void *data = (void *)(long)skb->data; 822 struct header_pointers hdr; 823 int ret; 824 825 ret = syncookie_part1(skb, data, data_end, &hdr, false); 826 if (ret != XDP_TX) 827 return ret == XDP_PASS ? TC_ACT_OK : TC_ACT_SHOT; 828 829 data_end = (void *)(long)skb->data_end; 830 data = (void *)(long)skb->data; 831 832 ret = syncookie_part2(skb, data, data_end, &hdr, false); 833 switch (ret) { 834 case XDP_PASS: 835 return TC_ACT_OK; 836 case XDP_TX: 837 return bpf_redirect(skb->ifindex, 0); 838 default: 839 return TC_ACT_SHOT; 840 } 841 } 842 843 char _license[] SEC("license") = "GPL"; 844