1 // SPDX-License-Identifier: LGPL-2.1 OR BSD-2-Clause 2 /* Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. */ 3 4 #include "vmlinux.h" 5 6 #include <bpf/bpf_helpers.h> 7 #include <bpf/bpf_endian.h> 8 #include <asm/errno.h> 9 10 #define TC_ACT_OK 0 11 #define TC_ACT_SHOT 2 12 13 #define NSEC_PER_SEC 1000000000L 14 15 #define ETH_ALEN 6 16 #define ETH_P_IP 0x0800 17 #define ETH_P_IPV6 0x86DD 18 19 #define tcp_flag_word(tp) (((union tcp_word_hdr *)(tp))->words[3]) 20 21 #define IP_DF 0x4000 22 #define IP_MF 0x2000 23 #define IP_OFFSET 0x1fff 24 25 #define NEXTHDR_TCP 6 26 27 #define TCPOPT_NOP 1 28 #define TCPOPT_EOL 0 29 #define TCPOPT_MSS 2 30 #define TCPOPT_WINDOW 3 31 #define TCPOPT_SACK_PERM 4 32 #define TCPOPT_TIMESTAMP 8 33 34 #define TCPOLEN_MSS 4 35 #define TCPOLEN_WINDOW 3 36 #define TCPOLEN_SACK_PERM 2 37 #define TCPOLEN_TIMESTAMP 10 38 39 #define TCP_TS_HZ 1000 40 #define TS_OPT_WSCALE_MASK 0xf 41 #define TS_OPT_SACK (1 << 4) 42 #define TS_OPT_ECN (1 << 5) 43 #define TSBITS 6 44 #define TSMASK (((__u32)1 << TSBITS) - 1) 45 #define TCP_MAX_WSCALE 14U 46 47 #define IPV4_MAXLEN 60 48 #define TCP_MAXLEN 60 49 50 #define DEFAULT_MSS4 1460 51 #define DEFAULT_MSS6 1440 52 #define DEFAULT_WSCALE 7 53 #define DEFAULT_TTL 64 54 #define MAX_ALLOWED_PORTS 8 55 56 #define swap(a, b) \ 57 do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0) 58 59 #define __get_unaligned_t(type, ptr) ({ \ 60 const struct { type x; } __attribute__((__packed__)) *__pptr = (typeof(__pptr))(ptr); \ 61 __pptr->x; \ 62 }) 63 64 #define get_unaligned(ptr) __get_unaligned_t(typeof(*(ptr)), (ptr)) 65 66 struct { 67 __uint(type, BPF_MAP_TYPE_ARRAY); 68 __type(key, __u32); 69 __type(value, __u64); 70 __uint(max_entries, 2); 71 } values SEC(".maps"); 72 73 struct { 74 __uint(type, BPF_MAP_TYPE_ARRAY); 75 __type(key, __u32); 76 __type(value, __u16); 77 __uint(max_entries, MAX_ALLOWED_PORTS); 78 } allowed_ports SEC(".maps"); 79 80 extern struct nf_conn *bpf_xdp_ct_lookup(struct xdp_md *xdp_ctx, 81 struct bpf_sock_tuple *bpf_tuple, 82 __u32 len_tuple, 83 struct bpf_ct_opts *opts, 84 __u32 len_opts) __ksym; 85 86 extern struct nf_conn *bpf_skb_ct_lookup(struct __sk_buff *skb_ctx, 87 struct bpf_sock_tuple *bpf_tuple, 88 u32 len_tuple, 89 struct bpf_ct_opts *opts, 90 u32 len_opts) __ksym; 91 92 extern void bpf_ct_release(struct nf_conn *ct) __ksym; 93 94 static __always_inline void swap_eth_addr(__u8 *a, __u8 *b) 95 { 96 __u8 tmp[ETH_ALEN]; 97 98 __builtin_memcpy(tmp, a, ETH_ALEN); 99 __builtin_memcpy(a, b, ETH_ALEN); 100 __builtin_memcpy(b, tmp, ETH_ALEN); 101 } 102 103 static __always_inline __u16 csum_fold(__u32 csum) 104 { 105 csum = (csum & 0xffff) + (csum >> 16); 106 csum = (csum & 0xffff) + (csum >> 16); 107 return (__u16)~csum; 108 } 109 110 static __always_inline __u16 csum_tcpudp_magic(__be32 saddr, __be32 daddr, 111 __u32 len, __u8 proto, 112 __u32 csum) 113 { 114 __u64 s = csum; 115 116 s += (__u32)saddr; 117 s += (__u32)daddr; 118 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ 119 s += proto + len; 120 #elif __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ 121 s += (proto + len) << 8; 122 #else 123 #error Unknown endian 124 #endif 125 s = (s & 0xffffffff) + (s >> 32); 126 s = (s & 0xffffffff) + (s >> 32); 127 128 return csum_fold((__u32)s); 129 } 130 131 static __always_inline __u16 csum_ipv6_magic(const struct in6_addr *saddr, 132 const struct in6_addr *daddr, 133 __u32 len, __u8 proto, __u32 csum) 134 { 135 __u64 sum = csum; 136 int i; 137 138 #pragma unroll 139 for (i = 0; i < 4; i++) 140 sum += (__u32)saddr->in6_u.u6_addr32[i]; 141 142 #pragma unroll 143 for (i = 0; i < 4; i++) 144 sum += (__u32)daddr->in6_u.u6_addr32[i]; 145 146 /* Don't combine additions to avoid 32-bit overflow. */ 147 sum += bpf_htonl(len); 148 sum += bpf_htonl(proto); 149 150 sum = (sum & 0xffffffff) + (sum >> 32); 151 sum = (sum & 0xffffffff) + (sum >> 32); 152 153 return csum_fold((__u32)sum); 154 } 155 156 static __always_inline __u64 tcp_clock_ns(void) 157 { 158 return bpf_ktime_get_ns(); 159 } 160 161 static __always_inline __u32 tcp_ns_to_ts(__u64 ns) 162 { 163 return ns / (NSEC_PER_SEC / TCP_TS_HZ); 164 } 165 166 static __always_inline __u32 tcp_time_stamp_raw(void) 167 { 168 return tcp_ns_to_ts(tcp_clock_ns()); 169 } 170 171 struct tcpopt_context { 172 __u8 *ptr; 173 __u8 *end; 174 void *data_end; 175 __be32 *tsecr; 176 __u8 wscale; 177 bool option_timestamp; 178 bool option_sack; 179 }; 180 181 static int tscookie_tcpopt_parse(struct tcpopt_context *ctx) 182 { 183 __u8 opcode, opsize; 184 185 if (ctx->ptr >= ctx->end) 186 return 1; 187 if (ctx->ptr >= ctx->data_end) 188 return 1; 189 190 opcode = ctx->ptr[0]; 191 192 if (opcode == TCPOPT_EOL) 193 return 1; 194 if (opcode == TCPOPT_NOP) { 195 ++ctx->ptr; 196 return 0; 197 } 198 199 if (ctx->ptr + 1 >= ctx->end) 200 return 1; 201 if (ctx->ptr + 1 >= ctx->data_end) 202 return 1; 203 opsize = ctx->ptr[1]; 204 if (opsize < 2) 205 return 1; 206 207 if (ctx->ptr + opsize > ctx->end) 208 return 1; 209 210 switch (opcode) { 211 case TCPOPT_WINDOW: 212 if (opsize == TCPOLEN_WINDOW && ctx->ptr + TCPOLEN_WINDOW <= ctx->data_end) 213 ctx->wscale = ctx->ptr[2] < TCP_MAX_WSCALE ? ctx->ptr[2] : TCP_MAX_WSCALE; 214 break; 215 case TCPOPT_TIMESTAMP: 216 if (opsize == TCPOLEN_TIMESTAMP && ctx->ptr + TCPOLEN_TIMESTAMP <= ctx->data_end) { 217 ctx->option_timestamp = true; 218 /* Client's tsval becomes our tsecr. */ 219 *ctx->tsecr = get_unaligned((__be32 *)(ctx->ptr + 2)); 220 } 221 break; 222 case TCPOPT_SACK_PERM: 223 if (opsize == TCPOLEN_SACK_PERM) 224 ctx->option_sack = true; 225 break; 226 } 227 228 ctx->ptr += opsize; 229 230 return 0; 231 } 232 233 static int tscookie_tcpopt_parse_batch(__u32 index, void *context) 234 { 235 int i; 236 237 for (i = 0; i < 7; i++) 238 if (tscookie_tcpopt_parse(context)) 239 return 1; 240 return 0; 241 } 242 243 static __always_inline bool tscookie_init(struct tcphdr *tcp_header, 244 __u16 tcp_len, __be32 *tsval, 245 __be32 *tsecr, void *data_end) 246 { 247 struct tcpopt_context loop_ctx = { 248 .ptr = (__u8 *)(tcp_header + 1), 249 .end = (__u8 *)tcp_header + tcp_len, 250 .data_end = data_end, 251 .tsecr = tsecr, 252 .wscale = TS_OPT_WSCALE_MASK, 253 .option_timestamp = false, 254 .option_sack = false, 255 }; 256 u32 cookie; 257 258 bpf_loop(6, tscookie_tcpopt_parse_batch, &loop_ctx, 0); 259 260 if (!loop_ctx.option_timestamp) 261 return false; 262 263 cookie = tcp_time_stamp_raw() & ~TSMASK; 264 cookie |= loop_ctx.wscale & TS_OPT_WSCALE_MASK; 265 if (loop_ctx.option_sack) 266 cookie |= TS_OPT_SACK; 267 if (tcp_header->ece && tcp_header->cwr) 268 cookie |= TS_OPT_ECN; 269 *tsval = bpf_htonl(cookie); 270 271 return true; 272 } 273 274 static __always_inline void values_get_tcpipopts(__u16 *mss, __u8 *wscale, 275 __u8 *ttl, bool ipv6) 276 { 277 __u32 key = 0; 278 __u64 *value; 279 280 value = bpf_map_lookup_elem(&values, &key); 281 if (value && *value != 0) { 282 if (ipv6) 283 *mss = (*value >> 32) & 0xffff; 284 else 285 *mss = *value & 0xffff; 286 *wscale = (*value >> 16) & 0xf; 287 *ttl = (*value >> 24) & 0xff; 288 return; 289 } 290 291 *mss = ipv6 ? DEFAULT_MSS6 : DEFAULT_MSS4; 292 *wscale = DEFAULT_WSCALE; 293 *ttl = DEFAULT_TTL; 294 } 295 296 static __always_inline void values_inc_synacks(void) 297 { 298 __u32 key = 1; 299 __u32 *value; 300 301 value = bpf_map_lookup_elem(&values, &key); 302 if (value) 303 __sync_fetch_and_add(value, 1); 304 } 305 306 static __always_inline bool check_port_allowed(__u16 port) 307 { 308 __u32 i; 309 310 for (i = 0; i < MAX_ALLOWED_PORTS; i++) { 311 __u32 key = i; 312 __u16 *value; 313 314 value = bpf_map_lookup_elem(&allowed_ports, &key); 315 316 if (!value) 317 break; 318 /* 0 is a terminator value. Check it first to avoid matching on 319 * a forbidden port == 0 and returning true. 320 */ 321 if (*value == 0) 322 break; 323 324 if (*value == port) 325 return true; 326 } 327 328 return false; 329 } 330 331 struct header_pointers { 332 struct ethhdr *eth; 333 struct iphdr *ipv4; 334 struct ipv6hdr *ipv6; 335 struct tcphdr *tcp; 336 __u16 tcp_len; 337 }; 338 339 static __always_inline int tcp_dissect(void *data, void *data_end, 340 struct header_pointers *hdr) 341 { 342 hdr->eth = data; 343 if (hdr->eth + 1 > data_end) 344 return XDP_DROP; 345 346 switch (bpf_ntohs(hdr->eth->h_proto)) { 347 case ETH_P_IP: 348 hdr->ipv6 = NULL; 349 350 hdr->ipv4 = (void *)hdr->eth + sizeof(*hdr->eth); 351 if (hdr->ipv4 + 1 > data_end) 352 return XDP_DROP; 353 if (hdr->ipv4->ihl * 4 < sizeof(*hdr->ipv4)) 354 return XDP_DROP; 355 if (hdr->ipv4->version != 4) 356 return XDP_DROP; 357 358 if (hdr->ipv4->protocol != IPPROTO_TCP) 359 return XDP_PASS; 360 361 hdr->tcp = (void *)hdr->ipv4 + hdr->ipv4->ihl * 4; 362 break; 363 case ETH_P_IPV6: 364 hdr->ipv4 = NULL; 365 366 hdr->ipv6 = (void *)hdr->eth + sizeof(*hdr->eth); 367 if (hdr->ipv6 + 1 > data_end) 368 return XDP_DROP; 369 if (hdr->ipv6->version != 6) 370 return XDP_DROP; 371 372 /* XXX: Extension headers are not supported and could circumvent 373 * XDP SYN flood protection. 374 */ 375 if (hdr->ipv6->nexthdr != NEXTHDR_TCP) 376 return XDP_PASS; 377 378 hdr->tcp = (void *)hdr->ipv6 + sizeof(*hdr->ipv6); 379 break; 380 default: 381 /* XXX: VLANs will circumvent XDP SYN flood protection. */ 382 return XDP_PASS; 383 } 384 385 if (hdr->tcp + 1 > data_end) 386 return XDP_DROP; 387 hdr->tcp_len = hdr->tcp->doff * 4; 388 if (hdr->tcp_len < sizeof(*hdr->tcp)) 389 return XDP_DROP; 390 391 return XDP_TX; 392 } 393 394 static __always_inline int tcp_lookup(void *ctx, struct header_pointers *hdr, bool xdp) 395 { 396 struct bpf_ct_opts ct_lookup_opts = { 397 .netns_id = BPF_F_CURRENT_NETNS, 398 .l4proto = IPPROTO_TCP, 399 }; 400 struct bpf_sock_tuple tup = {}; 401 struct nf_conn *ct; 402 __u32 tup_size; 403 404 if (hdr->ipv4) { 405 /* TCP doesn't normally use fragments, and XDP can't reassemble 406 * them. 407 */ 408 if ((hdr->ipv4->frag_off & bpf_htons(IP_DF | IP_MF | IP_OFFSET)) != bpf_htons(IP_DF)) 409 return XDP_DROP; 410 411 tup.ipv4.saddr = hdr->ipv4->saddr; 412 tup.ipv4.daddr = hdr->ipv4->daddr; 413 tup.ipv4.sport = hdr->tcp->source; 414 tup.ipv4.dport = hdr->tcp->dest; 415 tup_size = sizeof(tup.ipv4); 416 } else if (hdr->ipv6) { 417 __builtin_memcpy(tup.ipv6.saddr, &hdr->ipv6->saddr, sizeof(tup.ipv6.saddr)); 418 __builtin_memcpy(tup.ipv6.daddr, &hdr->ipv6->daddr, sizeof(tup.ipv6.daddr)); 419 tup.ipv6.sport = hdr->tcp->source; 420 tup.ipv6.dport = hdr->tcp->dest; 421 tup_size = sizeof(tup.ipv6); 422 } else { 423 /* The verifier can't track that either ipv4 or ipv6 is not 424 * NULL. 425 */ 426 return XDP_ABORTED; 427 } 428 if (xdp) 429 ct = bpf_xdp_ct_lookup(ctx, &tup, tup_size, &ct_lookup_opts, sizeof(ct_lookup_opts)); 430 else 431 ct = bpf_skb_ct_lookup(ctx, &tup, tup_size, &ct_lookup_opts, sizeof(ct_lookup_opts)); 432 if (ct) { 433 unsigned long status = ct->status; 434 435 bpf_ct_release(ct); 436 if (status & IPS_CONFIRMED_BIT) 437 return XDP_PASS; 438 } else if (ct_lookup_opts.error != -ENOENT) { 439 return XDP_ABORTED; 440 } 441 442 /* error == -ENOENT || !(status & IPS_CONFIRMED_BIT) */ 443 return XDP_TX; 444 } 445 446 static __always_inline __u8 tcp_mkoptions(__be32 *buf, __be32 *tsopt, __u16 mss, 447 __u8 wscale) 448 { 449 __be32 *start = buf; 450 451 *buf++ = bpf_htonl((TCPOPT_MSS << 24) | (TCPOLEN_MSS << 16) | mss); 452 453 if (!tsopt) 454 return buf - start; 455 456 if (tsopt[0] & bpf_htonl(1 << 4)) 457 *buf++ = bpf_htonl((TCPOPT_SACK_PERM << 24) | 458 (TCPOLEN_SACK_PERM << 16) | 459 (TCPOPT_TIMESTAMP << 8) | 460 TCPOLEN_TIMESTAMP); 461 else 462 *buf++ = bpf_htonl((TCPOPT_NOP << 24) | 463 (TCPOPT_NOP << 16) | 464 (TCPOPT_TIMESTAMP << 8) | 465 TCPOLEN_TIMESTAMP); 466 *buf++ = tsopt[0]; 467 *buf++ = tsopt[1]; 468 469 if ((tsopt[0] & bpf_htonl(0xf)) != bpf_htonl(0xf)) 470 *buf++ = bpf_htonl((TCPOPT_NOP << 24) | 471 (TCPOPT_WINDOW << 16) | 472 (TCPOLEN_WINDOW << 8) | 473 wscale); 474 475 return buf - start; 476 } 477 478 static __always_inline void tcp_gen_synack(struct tcphdr *tcp_header, 479 __u32 cookie, __be32 *tsopt, 480 __u16 mss, __u8 wscale) 481 { 482 void *tcp_options; 483 484 tcp_flag_word(tcp_header) = TCP_FLAG_SYN | TCP_FLAG_ACK; 485 if (tsopt && (tsopt[0] & bpf_htonl(1 << 5))) 486 tcp_flag_word(tcp_header) |= TCP_FLAG_ECE; 487 tcp_header->doff = 5; /* doff is part of tcp_flag_word. */ 488 swap(tcp_header->source, tcp_header->dest); 489 tcp_header->ack_seq = bpf_htonl(bpf_ntohl(tcp_header->seq) + 1); 490 tcp_header->seq = bpf_htonl(cookie); 491 tcp_header->window = 0; 492 tcp_header->urg_ptr = 0; 493 tcp_header->check = 0; /* Calculate checksum later. */ 494 495 tcp_options = (void *)(tcp_header + 1); 496 tcp_header->doff += tcp_mkoptions(tcp_options, tsopt, mss, wscale); 497 } 498 499 static __always_inline void tcpv4_gen_synack(struct header_pointers *hdr, 500 __u32 cookie, __be32 *tsopt) 501 { 502 __u8 wscale; 503 __u16 mss; 504 __u8 ttl; 505 506 values_get_tcpipopts(&mss, &wscale, &ttl, false); 507 508 swap_eth_addr(hdr->eth->h_source, hdr->eth->h_dest); 509 510 swap(hdr->ipv4->saddr, hdr->ipv4->daddr); 511 hdr->ipv4->check = 0; /* Calculate checksum later. */ 512 hdr->ipv4->tos = 0; 513 hdr->ipv4->id = 0; 514 hdr->ipv4->ttl = ttl; 515 516 tcp_gen_synack(hdr->tcp, cookie, tsopt, mss, wscale); 517 518 hdr->tcp_len = hdr->tcp->doff * 4; 519 hdr->ipv4->tot_len = bpf_htons(sizeof(*hdr->ipv4) + hdr->tcp_len); 520 } 521 522 static __always_inline void tcpv6_gen_synack(struct header_pointers *hdr, 523 __u32 cookie, __be32 *tsopt) 524 { 525 __u8 wscale; 526 __u16 mss; 527 __u8 ttl; 528 529 values_get_tcpipopts(&mss, &wscale, &ttl, true); 530 531 swap_eth_addr(hdr->eth->h_source, hdr->eth->h_dest); 532 533 swap(hdr->ipv6->saddr, hdr->ipv6->daddr); 534 *(__be32 *)hdr->ipv6 = bpf_htonl(0x60000000); 535 hdr->ipv6->hop_limit = ttl; 536 537 tcp_gen_synack(hdr->tcp, cookie, tsopt, mss, wscale); 538 539 hdr->tcp_len = hdr->tcp->doff * 4; 540 hdr->ipv6->payload_len = bpf_htons(hdr->tcp_len); 541 } 542 543 static __always_inline int syncookie_handle_syn(struct header_pointers *hdr, 544 void *ctx, 545 void *data, void *data_end, 546 bool xdp) 547 { 548 __u32 old_pkt_size, new_pkt_size; 549 /* Unlike clang 10, clang 11 and 12 generate code that doesn't pass the 550 * BPF verifier if tsopt is not volatile. Volatile forces it to store 551 * the pointer value and use it directly, otherwise tcp_mkoptions is 552 * (mis)compiled like this: 553 * if (!tsopt) 554 * return buf - start; 555 * reg = stored_return_value_of_tscookie_init; 556 * if (reg) 557 * tsopt = tsopt_buf; 558 * else 559 * tsopt = NULL; 560 * ... 561 * *buf++ = tsopt[1]; 562 * It creates a dead branch where tsopt is assigned NULL, but the 563 * verifier can't prove it's dead and blocks the program. 564 */ 565 __be32 * volatile tsopt = NULL; 566 __be32 tsopt_buf[2] = {}; 567 __u16 ip_len; 568 __u32 cookie; 569 __s64 value; 570 571 /* Checksum is not yet verified, but both checksum failure and TCP 572 * header checks return XDP_DROP, so the order doesn't matter. 573 */ 574 if (hdr->tcp->fin || hdr->tcp->rst) 575 return XDP_DROP; 576 577 /* Issue SYN cookies on allowed ports, drop SYN packets on blocked 578 * ports. 579 */ 580 if (!check_port_allowed(bpf_ntohs(hdr->tcp->dest))) 581 return XDP_DROP; 582 583 if (hdr->ipv4) { 584 /* Check the IPv4 and TCP checksums before creating a SYNACK. */ 585 value = bpf_csum_diff(0, 0, (void *)hdr->ipv4, hdr->ipv4->ihl * 4, 0); 586 if (value < 0) 587 return XDP_ABORTED; 588 if (csum_fold(value) != 0) 589 return XDP_DROP; /* Bad IPv4 checksum. */ 590 591 value = bpf_csum_diff(0, 0, (void *)hdr->tcp, hdr->tcp_len, 0); 592 if (value < 0) 593 return XDP_ABORTED; 594 if (csum_tcpudp_magic(hdr->ipv4->saddr, hdr->ipv4->daddr, 595 hdr->tcp_len, IPPROTO_TCP, value) != 0) 596 return XDP_DROP; /* Bad TCP checksum. */ 597 598 ip_len = sizeof(*hdr->ipv4); 599 600 value = bpf_tcp_raw_gen_syncookie_ipv4(hdr->ipv4, hdr->tcp, 601 hdr->tcp_len); 602 } else if (hdr->ipv6) { 603 /* Check the TCP checksum before creating a SYNACK. */ 604 value = bpf_csum_diff(0, 0, (void *)hdr->tcp, hdr->tcp_len, 0); 605 if (value < 0) 606 return XDP_ABORTED; 607 if (csum_ipv6_magic(&hdr->ipv6->saddr, &hdr->ipv6->daddr, 608 hdr->tcp_len, IPPROTO_TCP, value) != 0) 609 return XDP_DROP; /* Bad TCP checksum. */ 610 611 ip_len = sizeof(*hdr->ipv6); 612 613 value = bpf_tcp_raw_gen_syncookie_ipv6(hdr->ipv6, hdr->tcp, 614 hdr->tcp_len); 615 } else { 616 return XDP_ABORTED; 617 } 618 619 if (value < 0) 620 return XDP_ABORTED; 621 cookie = (__u32)value; 622 623 if (tscookie_init((void *)hdr->tcp, hdr->tcp_len, 624 &tsopt_buf[0], &tsopt_buf[1], data_end)) 625 tsopt = tsopt_buf; 626 627 /* Check that there is enough space for a SYNACK. It also covers 628 * the check that the destination of the __builtin_memmove below 629 * doesn't overflow. 630 */ 631 if (data + sizeof(*hdr->eth) + ip_len + TCP_MAXLEN > data_end) 632 return XDP_ABORTED; 633 634 if (hdr->ipv4) { 635 if (hdr->ipv4->ihl * 4 > sizeof(*hdr->ipv4)) { 636 struct tcphdr *new_tcp_header; 637 638 new_tcp_header = data + sizeof(*hdr->eth) + sizeof(*hdr->ipv4); 639 __builtin_memmove(new_tcp_header, hdr->tcp, sizeof(*hdr->tcp)); 640 hdr->tcp = new_tcp_header; 641 642 hdr->ipv4->ihl = sizeof(*hdr->ipv4) / 4; 643 } 644 645 tcpv4_gen_synack(hdr, cookie, tsopt); 646 } else if (hdr->ipv6) { 647 tcpv6_gen_synack(hdr, cookie, tsopt); 648 } else { 649 return XDP_ABORTED; 650 } 651 652 /* Recalculate checksums. */ 653 hdr->tcp->check = 0; 654 value = bpf_csum_diff(0, 0, (void *)hdr->tcp, hdr->tcp_len, 0); 655 if (value < 0) 656 return XDP_ABORTED; 657 if (hdr->ipv4) { 658 hdr->tcp->check = csum_tcpudp_magic(hdr->ipv4->saddr, 659 hdr->ipv4->daddr, 660 hdr->tcp_len, 661 IPPROTO_TCP, 662 value); 663 664 hdr->ipv4->check = 0; 665 value = bpf_csum_diff(0, 0, (void *)hdr->ipv4, sizeof(*hdr->ipv4), 0); 666 if (value < 0) 667 return XDP_ABORTED; 668 hdr->ipv4->check = csum_fold(value); 669 } else if (hdr->ipv6) { 670 hdr->tcp->check = csum_ipv6_magic(&hdr->ipv6->saddr, 671 &hdr->ipv6->daddr, 672 hdr->tcp_len, 673 IPPROTO_TCP, 674 value); 675 } else { 676 return XDP_ABORTED; 677 } 678 679 /* Set the new packet size. */ 680 old_pkt_size = data_end - data; 681 new_pkt_size = sizeof(*hdr->eth) + ip_len + hdr->tcp->doff * 4; 682 if (xdp) { 683 if (bpf_xdp_adjust_tail(ctx, new_pkt_size - old_pkt_size)) 684 return XDP_ABORTED; 685 } else { 686 if (bpf_skb_change_tail(ctx, new_pkt_size, 0)) 687 return XDP_ABORTED; 688 } 689 690 values_inc_synacks(); 691 692 return XDP_TX; 693 } 694 695 static __always_inline int syncookie_handle_ack(struct header_pointers *hdr) 696 { 697 int err; 698 699 if (hdr->tcp->rst) 700 return XDP_DROP; 701 702 if (hdr->ipv4) 703 err = bpf_tcp_raw_check_syncookie_ipv4(hdr->ipv4, hdr->tcp); 704 else if (hdr->ipv6) 705 err = bpf_tcp_raw_check_syncookie_ipv6(hdr->ipv6, hdr->tcp); 706 else 707 return XDP_ABORTED; 708 if (err) 709 return XDP_DROP; 710 711 return XDP_PASS; 712 } 713 714 static __always_inline int syncookie_part1(void *ctx, void *data, void *data_end, 715 struct header_pointers *hdr, bool xdp) 716 { 717 struct bpf_ct_opts ct_lookup_opts = { 718 .netns_id = BPF_F_CURRENT_NETNS, 719 .l4proto = IPPROTO_TCP, 720 }; 721 int ret; 722 723 ret = tcp_dissect(data, data_end, hdr); 724 if (ret != XDP_TX) 725 return ret; 726 727 ret = tcp_lookup(ctx, hdr, xdp); 728 if (ret != XDP_TX) 729 return ret; 730 731 /* Packet is TCP and doesn't belong to an established connection. */ 732 733 if ((hdr->tcp->syn ^ hdr->tcp->ack) != 1) 734 return XDP_DROP; 735 736 /* Grow the TCP header to TCP_MAXLEN to be able to pass any hdr->tcp_len 737 * to bpf_tcp_raw_gen_syncookie_ipv{4,6} and pass the verifier. 738 */ 739 if (xdp) { 740 if (bpf_xdp_adjust_tail(ctx, TCP_MAXLEN - hdr->tcp_len)) 741 return XDP_ABORTED; 742 } else { 743 /* Without volatile the verifier throws this error: 744 * R9 32-bit pointer arithmetic prohibited 745 */ 746 volatile u64 old_len = data_end - data; 747 748 if (bpf_skb_change_tail(ctx, old_len + TCP_MAXLEN - hdr->tcp_len, 0)) 749 return XDP_ABORTED; 750 } 751 752 return XDP_TX; 753 } 754 755 static __always_inline int syncookie_part2(void *ctx, void *data, void *data_end, 756 struct header_pointers *hdr, bool xdp) 757 { 758 if (hdr->ipv4) { 759 hdr->eth = data; 760 hdr->ipv4 = (void *)hdr->eth + sizeof(*hdr->eth); 761 /* IPV4_MAXLEN is needed when calculating checksum. 762 * At least sizeof(struct iphdr) is needed here to access ihl. 763 */ 764 if ((void *)hdr->ipv4 + IPV4_MAXLEN > data_end) 765 return XDP_ABORTED; 766 hdr->tcp = (void *)hdr->ipv4 + hdr->ipv4->ihl * 4; 767 } else if (hdr->ipv6) { 768 hdr->eth = data; 769 hdr->ipv6 = (void *)hdr->eth + sizeof(*hdr->eth); 770 hdr->tcp = (void *)hdr->ipv6 + sizeof(*hdr->ipv6); 771 } else { 772 return XDP_ABORTED; 773 } 774 775 if ((void *)hdr->tcp + TCP_MAXLEN > data_end) 776 return XDP_ABORTED; 777 778 /* We run out of registers, tcp_len gets spilled to the stack, and the 779 * verifier forgets its min and max values checked above in tcp_dissect. 780 */ 781 hdr->tcp_len = hdr->tcp->doff * 4; 782 if (hdr->tcp_len < sizeof(*hdr->tcp)) 783 return XDP_ABORTED; 784 785 return hdr->tcp->syn ? syncookie_handle_syn(hdr, ctx, data, data_end, xdp) : 786 syncookie_handle_ack(hdr); 787 } 788 789 SEC("xdp") 790 int syncookie_xdp(struct xdp_md *ctx) 791 { 792 void *data_end = (void *)(long)ctx->data_end; 793 void *data = (void *)(long)ctx->data; 794 struct header_pointers hdr; 795 int ret; 796 797 ret = syncookie_part1(ctx, data, data_end, &hdr, true); 798 if (ret != XDP_TX) 799 return ret; 800 801 data_end = (void *)(long)ctx->data_end; 802 data = (void *)(long)ctx->data; 803 804 return syncookie_part2(ctx, data, data_end, &hdr, true); 805 } 806 807 SEC("tc") 808 int syncookie_tc(struct __sk_buff *skb) 809 { 810 void *data_end = (void *)(long)skb->data_end; 811 void *data = (void *)(long)skb->data; 812 struct header_pointers hdr; 813 int ret; 814 815 ret = syncookie_part1(skb, data, data_end, &hdr, false); 816 if (ret != XDP_TX) 817 return ret == XDP_PASS ? TC_ACT_OK : TC_ACT_SHOT; 818 819 data_end = (void *)(long)skb->data_end; 820 data = (void *)(long)skb->data; 821 822 ret = syncookie_part2(skb, data, data_end, &hdr, false); 823 switch (ret) { 824 case XDP_PASS: 825 return TC_ACT_OK; 826 case XDP_TX: 827 return bpf_redirect(skb->ifindex, 0); 828 default: 829 return TC_ACT_SHOT; 830 } 831 } 832 833 char _license[] SEC("license") = "GPL"; 834