1 // SPDX-License-Identifier: LGPL-2.1 OR BSD-2-Clause
2 /* Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. */
3 
4 #include "vmlinux.h"
5 
6 #include <bpf/bpf_helpers.h>
7 #include <bpf/bpf_endian.h>
8 #include <asm/errno.h>
9 
/* Verdicts returned by the "tc" entry point. */
#define TC_ACT_OK 0
#define TC_ACT_SHOT 2

/* Used by tcp_ns_to_ts() to convert nanoseconds to timestamp ticks. */
#define NSEC_PER_SEC 1000000000L

/* Ethernet address length in bytes and the EtherTypes handled by tcp_dissect(). */
#define ETH_ALEN 6
#define ETH_P_IP 0x0800
#define ETH_P_IPV6 0x86DD

/* The 32-bit word of the TCP header that contains doff and the flag bits. */
#define tcp_flag_word(tp) (((union tcp_word_hdr *)(tp))->words[3])

/* Masks for the IPv4 frag_off field, in host byte order (callers apply
 * bpf_htons() before comparing against the header).
 */
#define IP_DF 0x4000
#define IP_MF 0x2000
#define IP_OFFSET 0x1fff

/* Value compared against the IPv6 nexthdr field to recognize TCP. */
#define NEXTHDR_TCP 6

/* TCP option kinds matched by the option parser and emitted in SYNACKs. */
#define TCPOPT_NOP 1
#define TCPOPT_EOL 0
#define TCPOPT_MSS 2
#define TCPOPT_WINDOW 3
#define TCPOPT_SACK_PERM 4
#define TCPOPT_TIMESTAMP 8

/* Expected total lengths (kind + length + data) of the options above. */
#define TCPOLEN_MSS 4
#define TCPOLEN_WINDOW 3
#define TCPOLEN_SACK_PERM 2
#define TCPOLEN_TIMESTAMP 10

/* Timestamp cookie layout: the clock runs at TCP_TS_HZ ticks per second and
 * the low TSBITS bits of the timestamp are replaced by the client's window
 * scale, SACK and ECN indications (see tscookie_init()).
 */
#define TCP_TS_HZ 1000
#define TS_OPT_WSCALE_MASK 0xf
#define TS_OPT_SACK (1 << 4)
#define TS_OPT_ECN (1 << 5)
#define TSBITS 6
#define TSMASK (((__u32)1 << TSBITS) - 1)
/* Upper bound applied to the window scale read from the client's SYN. */
#define TCP_MAX_WSCALE 14U

/* Header sizes in bytes used for packet bounds checks. */
#define IPV4_MAXLEN 60
#define TCP_MAXLEN 60

/* Fallbacks used by values_get_tcpipopts() when the "values" map has no
 * configuration in slot 0.
 */
#define DEFAULT_MSS4 1460
#define DEFAULT_MSS6 1440
#define DEFAULT_WSCALE 7
#define DEFAULT_TTL 64
/* Number of slots in the "allowed_ports" map. */
#define MAX_ALLOWED_PORTS 8
55 
/* Exchange two lvalues of the same type through a temporary. Each argument is
 * evaluated more than once, so do not pass expressions with side effects.
 */
#define swap(a, b) \
	do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0)
58 
/* Read a value of the given type from ptr through a packed struct, so the
 * compiler does not assume natural alignment for the access.
 */
#define __get_unaligned_t(type, ptr) ({						\
	const struct { type x; } __attribute__((__packed__)) *__pptr = (typeof(__pptr))(ptr); \
	__pptr->x;								\
})

/* Unaligned read of *ptr, with the type taken from the pointee. */
#define get_unaligned(ptr) __get_unaligned_t(typeof(*(ptr)), (ptr))
65 
/* Two 64-bit slots:
 *   key 0 - packed configuration read by values_get_tcpipopts()
 *           (bits 0-15 IPv4 MSS, 16-19 window scale, 24-31 TTL,
 *           32-47 IPv6 MSS); an all-zero value selects the defaults.
 *   key 1 - counter incremented by values_inc_synacks() for each SYNACK
 *           generated.
 */
struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__type(key, __u32);
	__type(value, __u64);
	__uint(max_entries, 2);
} values SEC(".maps");
72 
/* List of destination ports (host byte order) for which SYN cookies are
 * issued. check_port_allowed() scans the slots in order and stops at the
 * first entry equal to 0, which acts as a terminator.
 */
struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__type(key, __u32);
	__type(value, __u16);
	__uint(max_entries, MAX_ALLOWED_PORTS);
} allowed_ports SEC(".maps");
79 
/* Conntrack lookup/release functions declared as kernel symbols (__ksym).
 * tcp_lookup() calls the XDP or skb variant depending on the program type and
 * releases every non-NULL result with bpf_ct_release().
 */
extern struct nf_conn *bpf_xdp_ct_lookup(struct xdp_md *xdp_ctx,
					 struct bpf_sock_tuple *bpf_tuple,
					 __u32 len_tuple,
					 struct bpf_ct_opts *opts,
					 __u32 len_opts) __ksym;

extern struct nf_conn *bpf_skb_ct_lookup(struct __sk_buff *skb_ctx,
					 struct bpf_sock_tuple *bpf_tuple,
					 u32 len_tuple,
					 struct bpf_ct_opts *opts,
					 u32 len_opts) __ksym;

extern void bpf_ct_release(struct nf_conn *ct) __ksym;
93 
94 static __always_inline void swap_eth_addr(__u8 *a, __u8 *b)
95 {
96 	__u8 tmp[ETH_ALEN];
97 
98 	__builtin_memcpy(tmp, a, ETH_ALEN);
99 	__builtin_memcpy(a, b, ETH_ALEN);
100 	__builtin_memcpy(b, tmp, ETH_ALEN);
101 }
102 
103 static __always_inline __u16 csum_fold(__u32 csum)
104 {
105 	csum = (csum & 0xffff) + (csum >> 16);
106 	csum = (csum & 0xffff) + (csum >> 16);
107 	return (__u16)~csum;
108 }
109 
110 static __always_inline __u16 csum_tcpudp_magic(__be32 saddr, __be32 daddr,
111 					       __u32 len, __u8 proto,
112 					       __u32 csum)
113 {
114 	__u64 s = csum;
115 
116 	s += (__u32)saddr;
117 	s += (__u32)daddr;
118 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
119 	s += proto + len;
120 #elif __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
121 	s += (proto + len) << 8;
122 #else
123 #error Unknown endian
124 #endif
125 	s = (s & 0xffffffff) + (s >> 32);
126 	s = (s & 0xffffffff) + (s >> 32);
127 
128 	return csum_fold((__u32)s);
129 }
130 
131 static __always_inline __u16 csum_ipv6_magic(const struct in6_addr *saddr,
132 					     const struct in6_addr *daddr,
133 					     __u32 len, __u8 proto, __u32 csum)
134 {
135 	__u64 sum = csum;
136 	int i;
137 
138 #pragma unroll
139 	for (i = 0; i < 4; i++)
140 		sum += (__u32)saddr->in6_u.u6_addr32[i];
141 
142 #pragma unroll
143 	for (i = 0; i < 4; i++)
144 		sum += (__u32)daddr->in6_u.u6_addr32[i];
145 
146 	/* Don't combine additions to avoid 32-bit overflow. */
147 	sum += bpf_htonl(len);
148 	sum += bpf_htonl(proto);
149 
150 	sum = (sum & 0xffffffff) + (sum >> 32);
151 	sum = (sum & 0xffffffff) + (sum >> 32);
152 
153 	return csum_fold((__u32)sum);
154 }
155 
156 static __always_inline __u64 tcp_clock_ns(void)
157 {
158 	return bpf_ktime_get_ns();
159 }
160 
161 static __always_inline __u32 tcp_ns_to_ts(__u64 ns)
162 {
163 	return ns / (NSEC_PER_SEC / TCP_TS_HZ);
164 }
165 
166 static __always_inline __u32 tcp_time_stamp_raw(void)
167 {
168 	return tcp_ns_to_ts(tcp_clock_ns());
169 }
170 
/* State shared between tscookie_init() and the TCP option parsing callback. */
struct tcpopt_context {
	__u8 *ptr;		/* next option byte to parse */
	__u8 *end;		/* end of the TCP header (header start + tcp_len) */
	void *data_end;		/* end of the packet data */
	__be32 *tsecr;		/* where the client's tsval is stored */
	__u8 wscale;		/* window scale seen so far (clamped) */
	bool option_timestamp;	/* a well-formed timestamp option was seen */
	bool option_sack;	/* a well-formed SACK-permitted option was seen */
};
180 
/* Parse the single TCP option located at ctx->ptr and advance ctx->ptr past
 * it.
 *
 * Records the window scale (clamped to TCP_MAX_WSCALE), whether the
 * SACK-permitted option is present and, for a well-formed timestamp option,
 * stores the client's tsval into *ctx->tsecr. Options with an unexpected
 * length and unknown options are skipped.
 *
 * Every access is checked against both ctx->end (the TCP header length) and
 * ctx->data_end (the packet end).
 *
 * Returns 0 when parsing may continue with the next option and 1 when it must
 * stop (options exhausted, EOL found, or a truncated/malformed option).
 */
static int tscookie_tcpopt_parse(struct tcpopt_context *ctx)
{
	__u8 opcode, opsize;

	if (ctx->ptr >= ctx->end)
		return 1;
	if (ctx->ptr >= ctx->data_end)
		return 1;

	opcode = ctx->ptr[0];

	if (opcode == TCPOPT_EOL)
		return 1;
	/* NOP is a single byte without a length field. */
	if (opcode == TCPOPT_NOP) {
		++ctx->ptr;
		return 0;
	}

	/* All remaining options carry a length byte after the kind byte. */
	if (ctx->ptr + 1 >= ctx->end)
		return 1;
	if (ctx->ptr + 1 >= ctx->data_end)
		return 1;
	opsize = ctx->ptr[1];
	/* The length covers the kind and length bytes themselves. */
	if (opsize < 2)
		return 1;

	/* The option must fit inside the TCP header. */
	if (ctx->ptr + opsize > ctx->end)
		return 1;

	switch (opcode) {
	case TCPOPT_WINDOW:
		if (opsize == TCPOLEN_WINDOW && ctx->ptr + TCPOLEN_WINDOW <= ctx->data_end)
			ctx->wscale = ctx->ptr[2] < TCP_MAX_WSCALE ? ctx->ptr[2] : TCP_MAX_WSCALE;
		break;
	case TCPOPT_TIMESTAMP:
		if (opsize == TCPOLEN_TIMESTAMP && ctx->ptr + TCPOLEN_TIMESTAMP <= ctx->data_end) {
			ctx->option_timestamp = true;
			/* Client's tsval becomes our tsecr. */
			*ctx->tsecr = get_unaligned((__be32 *)(ctx->ptr + 2));
		}
		break;
	case TCPOPT_SACK_PERM:
		if (opsize == TCPOLEN_SACK_PERM)
			ctx->option_sack = true;
		break;
	}

	ctx->ptr += opsize;

	return 0;
}
232 
233 static int tscookie_tcpopt_parse_batch(__u32 index, void *context)
234 {
235 	int i;
236 
237 	for (i = 0; i < 7; i++)
238 		if (tscookie_tcpopt_parse(context))
239 			return 1;
240 	return 0;
241 }
242 
243 static __always_inline bool tscookie_init(struct tcphdr *tcp_header,
244 					  __u16 tcp_len, __be32 *tsval,
245 					  __be32 *tsecr, void *data_end)
246 {
247 	struct tcpopt_context loop_ctx = {
248 		.ptr = (__u8 *)(tcp_header + 1),
249 		.end = (__u8 *)tcp_header + tcp_len,
250 		.data_end = data_end,
251 		.tsecr = tsecr,
252 		.wscale = TS_OPT_WSCALE_MASK,
253 		.option_timestamp = false,
254 		.option_sack = false,
255 	};
256 	u32 cookie;
257 
258 	bpf_loop(6, tscookie_tcpopt_parse_batch, &loop_ctx, 0);
259 
260 	if (!loop_ctx.option_timestamp)
261 		return false;
262 
263 	cookie = tcp_time_stamp_raw() & ~TSMASK;
264 	cookie |= loop_ctx.wscale & TS_OPT_WSCALE_MASK;
265 	if (loop_ctx.option_sack)
266 		cookie |= TS_OPT_SACK;
267 	if (tcp_header->ece && tcp_header->cwr)
268 		cookie |= TS_OPT_ECN;
269 	*tsval = bpf_htonl(cookie);
270 
271 	return true;
272 }
273 
274 static __always_inline void values_get_tcpipopts(__u16 *mss, __u8 *wscale,
275 						 __u8 *ttl, bool ipv6)
276 {
277 	__u32 key = 0;
278 	__u64 *value;
279 
280 	value = bpf_map_lookup_elem(&values, &key);
281 	if (value && *value != 0) {
282 		if (ipv6)
283 			*mss = (*value >> 32) & 0xffff;
284 		else
285 			*mss = *value & 0xffff;
286 		*wscale = (*value >> 16) & 0xf;
287 		*ttl = (*value >> 24) & 0xff;
288 		return;
289 	}
290 
291 	*mss = ipv6 ? DEFAULT_MSS6 : DEFAULT_MSS4;
292 	*wscale = DEFAULT_WSCALE;
293 	*ttl = DEFAULT_TTL;
294 }
295 
296 static __always_inline void values_inc_synacks(void)
297 {
298 	__u32 key = 1;
299 	__u32 *value;
300 
301 	value = bpf_map_lookup_elem(&values, &key);
302 	if (value)
303 		__sync_fetch_and_add(value, 1);
304 }
305 
306 static __always_inline bool check_port_allowed(__u16 port)
307 {
308 	__u32 i;
309 
310 	for (i = 0; i < MAX_ALLOWED_PORTS; i++) {
311 		__u32 key = i;
312 		__u16 *value;
313 
314 		value = bpf_map_lookup_elem(&allowed_ports, &key);
315 
316 		if (!value)
317 			break;
318 		/* 0 is a terminator value. Check it first to avoid matching on
319 		 * a forbidden port == 0 and returning true.
320 		 */
321 		if (*value == 0)
322 			break;
323 
324 		if (*value == port)
325 			return true;
326 	}
327 
328 	return false;
329 }
330 
/* Pointers into the packet filled in by tcp_dissect(). Exactly one of ipv4
 * and ipv6 is set for a dissected TCP packet; the other one is NULL.
 */
struct header_pointers {
	struct ethhdr *eth;	/* Ethernet header at the start of the packet */
	struct iphdr *ipv4;	/* IPv4 header, or NULL for IPv6 packets */
	struct ipv6hdr *ipv6;	/* IPv6 header, or NULL for IPv4 packets */
	struct tcphdr *tcp;	/* TCP header */
	__u16 tcp_len;		/* TCP header length in bytes (doff * 4) */
};
338 
/* Locate the Ethernet, IPv4/IPv6 and TCP headers in [data, data_end) and
 * record them in @hdr.
 *
 * Returns:
 *   XDP_DROP - a header is truncated or malformed (bad version, IPv4 ihl or
 *              TCP doff below the minimum header size);
 *   XDP_PASS - the packet is not IPv4/IPv6 or does not carry TCP directly;
 *   XDP_TX   - the packet is TCP and @hdr is fully populated. Callers treat
 *              this value as "keep processing".
 */
static __always_inline int tcp_dissect(void *data, void *data_end,
				       struct header_pointers *hdr)
{
	hdr->eth = data;
	if (hdr->eth + 1 > data_end)
		return XDP_DROP;

	switch (bpf_ntohs(hdr->eth->h_proto)) {
	case ETH_P_IP:
		hdr->ipv6 = NULL;

		hdr->ipv4 = (void *)hdr->eth + sizeof(*hdr->eth);
		if (hdr->ipv4 + 1 > data_end)
			return XDP_DROP;
		/* ihl counts 32-bit words and must cover the fixed header. */
		if (hdr->ipv4->ihl * 4 < sizeof(*hdr->ipv4))
			return XDP_DROP;
		if (hdr->ipv4->version != 4)
			return XDP_DROP;

		if (hdr->ipv4->protocol != IPPROTO_TCP)
			return XDP_PASS;

		/* TCP starts after the IPv4 header including its options. */
		hdr->tcp = (void *)hdr->ipv4 + hdr->ipv4->ihl * 4;
		break;
	case ETH_P_IPV6:
		hdr->ipv4 = NULL;

		hdr->ipv6 = (void *)hdr->eth + sizeof(*hdr->eth);
		if (hdr->ipv6 + 1 > data_end)
			return XDP_DROP;
		if (hdr->ipv6->version != 6)
			return XDP_DROP;

		/* XXX: Extension headers are not supported and could circumvent
		 * XDP SYN flood protection.
		 */
		if (hdr->ipv6->nexthdr != NEXTHDR_TCP)
			return XDP_PASS;

		hdr->tcp = (void *)hdr->ipv6 + sizeof(*hdr->ipv6);
		break;
	default:
		/* XXX: VLANs will circumvent XDP SYN flood protection. */
		return XDP_PASS;
	}

	if (hdr->tcp + 1 > data_end)
		return XDP_DROP;
	/* doff counts 32-bit words and must cover the fixed TCP header. */
	hdr->tcp_len = hdr->tcp->doff * 4;
	if (hdr->tcp_len < sizeof(*hdr->tcp))
		return XDP_DROP;

	return XDP_TX;
}
393 
394 static __always_inline int tcp_lookup(void *ctx, struct header_pointers *hdr, bool xdp)
395 {
396 	struct bpf_ct_opts ct_lookup_opts = {
397 		.netns_id = BPF_F_CURRENT_NETNS,
398 		.l4proto = IPPROTO_TCP,
399 	};
400 	struct bpf_sock_tuple tup = {};
401 	struct nf_conn *ct;
402 	__u32 tup_size;
403 
404 	if (hdr->ipv4) {
405 		/* TCP doesn't normally use fragments, and XDP can't reassemble
406 		 * them.
407 		 */
408 		if ((hdr->ipv4->frag_off & bpf_htons(IP_DF | IP_MF | IP_OFFSET)) != bpf_htons(IP_DF))
409 			return XDP_DROP;
410 
411 		tup.ipv4.saddr = hdr->ipv4->saddr;
412 		tup.ipv4.daddr = hdr->ipv4->daddr;
413 		tup.ipv4.sport = hdr->tcp->source;
414 		tup.ipv4.dport = hdr->tcp->dest;
415 		tup_size = sizeof(tup.ipv4);
416 	} else if (hdr->ipv6) {
417 		__builtin_memcpy(tup.ipv6.saddr, &hdr->ipv6->saddr, sizeof(tup.ipv6.saddr));
418 		__builtin_memcpy(tup.ipv6.daddr, &hdr->ipv6->daddr, sizeof(tup.ipv6.daddr));
419 		tup.ipv6.sport = hdr->tcp->source;
420 		tup.ipv6.dport = hdr->tcp->dest;
421 		tup_size = sizeof(tup.ipv6);
422 	} else {
423 		/* The verifier can't track that either ipv4 or ipv6 is not
424 		 * NULL.
425 		 */
426 		return XDP_ABORTED;
427 	}
428 	if (xdp)
429 		ct = bpf_xdp_ct_lookup(ctx, &tup, tup_size, &ct_lookup_opts, sizeof(ct_lookup_opts));
430 	else
431 		ct = bpf_skb_ct_lookup(ctx, &tup, tup_size, &ct_lookup_opts, sizeof(ct_lookup_opts));
432 	if (ct) {
433 		unsigned long status = ct->status;
434 
435 		bpf_ct_release(ct);
436 		if (status & IPS_CONFIRMED_BIT)
437 			return XDP_PASS;
438 	} else if (ct_lookup_opts.error != -ENOENT) {
439 		return XDP_ABORTED;
440 	}
441 
442 	/* error == -ENOENT || !(status & IPS_CONFIRMED_BIT) */
443 	return XDP_TX;
444 }
445 
/* Write the TCP options of a SYNACK into @buf and return how many 32-bit
 * words were written.
 *
 * An MSS option is always emitted. When @tsopt is non-NULL it points to two
 * words (tsval, tsecr) and the following are appended:
 *   - SACK-permitted + timestamp header if bit 4 of tsval (the value of
 *     TS_OPT_SACK) is set, otherwise NOP NOP + timestamp header;
 *   - tsval and tsecr;
 *   - NOP + window scale option with @wscale, unless the low four bits of
 *     tsval are all ones (the value of TS_OPT_WSCALE_MASK).
 *
 * The caller must make sure @buf has room for up to five words.
 */
static __always_inline __u8 tcp_mkoptions(__be32 *buf, __be32 *tsopt, __u16 mss,
					  __u8 wscale)
{
	__be32 *start = buf;

	*buf++ = bpf_htonl((TCPOPT_MSS << 24) | (TCPOLEN_MSS << 16) | mss);

	/* Without a timestamp cookie only the MSS option is sent. */
	if (!tsopt)
		return buf - start;

	if (tsopt[0] & bpf_htonl(1 << 4))
		*buf++ = bpf_htonl((TCPOPT_SACK_PERM << 24) |
				   (TCPOLEN_SACK_PERM << 16) |
				   (TCPOPT_TIMESTAMP << 8) |
				   TCPOLEN_TIMESTAMP);
	else
		*buf++ = bpf_htonl((TCPOPT_NOP << 24) |
				   (TCPOPT_NOP << 16) |
				   (TCPOPT_TIMESTAMP << 8) |
				   TCPOLEN_TIMESTAMP);
	*buf++ = tsopt[0];
	*buf++ = tsopt[1];

	if ((tsopt[0] & bpf_htonl(0xf)) != bpf_htonl(0xf))
		*buf++ = bpf_htonl((TCPOPT_NOP << 24) |
				   (TCPOPT_WINDOW << 16) |
				   (TCPOLEN_WINDOW << 8) |
				   wscale);

	return buf - start;
}
477 
478 static __always_inline void tcp_gen_synack(struct tcphdr *tcp_header,
479 					   __u32 cookie, __be32 *tsopt,
480 					   __u16 mss, __u8 wscale)
481 {
482 	void *tcp_options;
483 
484 	tcp_flag_word(tcp_header) = TCP_FLAG_SYN | TCP_FLAG_ACK;
485 	if (tsopt && (tsopt[0] & bpf_htonl(1 << 5)))
486 		tcp_flag_word(tcp_header) |= TCP_FLAG_ECE;
487 	tcp_header->doff = 5; /* doff is part of tcp_flag_word. */
488 	swap(tcp_header->source, tcp_header->dest);
489 	tcp_header->ack_seq = bpf_htonl(bpf_ntohl(tcp_header->seq) + 1);
490 	tcp_header->seq = bpf_htonl(cookie);
491 	tcp_header->window = 0;
492 	tcp_header->urg_ptr = 0;
493 	tcp_header->check = 0; /* Calculate checksum later. */
494 
495 	tcp_options = (void *)(tcp_header + 1);
496 	tcp_header->doff += tcp_mkoptions(tcp_options, tsopt, mss, wscale);
497 }
498 
499 static __always_inline void tcpv4_gen_synack(struct header_pointers *hdr,
500 					     __u32 cookie, __be32 *tsopt)
501 {
502 	__u8 wscale;
503 	__u16 mss;
504 	__u8 ttl;
505 
506 	values_get_tcpipopts(&mss, &wscale, &ttl, false);
507 
508 	swap_eth_addr(hdr->eth->h_source, hdr->eth->h_dest);
509 
510 	swap(hdr->ipv4->saddr, hdr->ipv4->daddr);
511 	hdr->ipv4->check = 0; /* Calculate checksum later. */
512 	hdr->ipv4->tos = 0;
513 	hdr->ipv4->id = 0;
514 	hdr->ipv4->ttl = ttl;
515 
516 	tcp_gen_synack(hdr->tcp, cookie, tsopt, mss, wscale);
517 
518 	hdr->tcp_len = hdr->tcp->doff * 4;
519 	hdr->ipv4->tot_len = bpf_htons(sizeof(*hdr->ipv4) + hdr->tcp_len);
520 }
521 
522 static __always_inline void tcpv6_gen_synack(struct header_pointers *hdr,
523 					     __u32 cookie, __be32 *tsopt)
524 {
525 	__u8 wscale;
526 	__u16 mss;
527 	__u8 ttl;
528 
529 	values_get_tcpipopts(&mss, &wscale, &ttl, true);
530 
531 	swap_eth_addr(hdr->eth->h_source, hdr->eth->h_dest);
532 
533 	swap(hdr->ipv6->saddr, hdr->ipv6->daddr);
534 	*(__be32 *)hdr->ipv6 = bpf_htonl(0x60000000);
535 	hdr->ipv6->hop_limit = ttl;
536 
537 	tcp_gen_synack(hdr->tcp, cookie, tsopt, mss, wscale);
538 
539 	hdr->tcp_len = hdr->tcp->doff * 4;
540 	hdr->ipv6->payload_len = bpf_htons(hdr->tcp_len);
541 }
542 
/* Handle a SYN that does not belong to a confirmed connection: validate it
 * and rewrite the packet in place into a SYNACK carrying a SYN cookie.
 *
 * @hdr must point at headers inside [data, data_end); @ctx is the program
 * context (struct xdp_md * when @xdp is true, struct __sk_buff * otherwise).
 *
 * Returns:
 *   XDP_DROP    - FIN or RST set, destination port not allowed, or a bad
 *                 IPv4/TCP checksum;
 *   XDP_ABORTED - a helper failed, there is not enough room for the reply, or
 *                 neither IP header pointer is set;
 *   XDP_TX      - the packet now holds the SYNACK and should be sent back.
 */
static __always_inline int syncookie_handle_syn(struct header_pointers *hdr,
						void *ctx,
						void *data, void *data_end,
						bool xdp)
{
	__u32 old_pkt_size, new_pkt_size;
	/* Unlike clang 10, clang 11 and 12 generate code that doesn't pass the
	 * BPF verifier if tsopt is not volatile. Volatile forces it to store
	 * the pointer value and use it directly, otherwise tcp_mkoptions is
	 * (mis)compiled like this:
	 *   if (!tsopt)
	 *       return buf - start;
	 *   reg = stored_return_value_of_tscookie_init;
	 *   if (reg)
	 *       tsopt = tsopt_buf;
	 *   else
	 *       tsopt = NULL;
	 *   ...
	 *   *buf++ = tsopt[1];
	 * It creates a dead branch where tsopt is assigned NULL, but the
	 * verifier can't prove it's dead and blocks the program.
	 */
	__be32 * volatile tsopt = NULL;
	__be32 tsopt_buf[2] = {};
	__u16 ip_len;
	__u32 cookie;
	__s64 value;

	/* Checksum is not yet verified, but both checksum failure and TCP
	 * header checks return XDP_DROP, so the order doesn't matter.
	 */
	if (hdr->tcp->fin || hdr->tcp->rst)
		return XDP_DROP;

	/* Issue SYN cookies on allowed ports, drop SYN packets on blocked
	 * ports.
	 */
	if (!check_port_allowed(bpf_ntohs(hdr->tcp->dest)))
		return XDP_DROP;

	if (hdr->ipv4) {
		/* Check the IPv4 and TCP checksums before creating a SYNACK. */
		value = bpf_csum_diff(0, 0, (void *)hdr->ipv4, hdr->ipv4->ihl * 4, 0);
		if (value < 0)
			return XDP_ABORTED;
		if (csum_fold(value) != 0)
			return XDP_DROP; /* Bad IPv4 checksum. */

		value = bpf_csum_diff(0, 0, (void *)hdr->tcp, hdr->tcp_len, 0);
		if (value < 0)
			return XDP_ABORTED;
		if (csum_tcpudp_magic(hdr->ipv4->saddr, hdr->ipv4->daddr,
				      hdr->tcp_len, IPPROTO_TCP, value) != 0)
			return XDP_DROP; /* Bad TCP checksum. */

		/* The reply always uses an IPv4 header without options. */
		ip_len = sizeof(*hdr->ipv4);

		value = bpf_tcp_raw_gen_syncookie_ipv4(hdr->ipv4, hdr->tcp,
						       hdr->tcp_len);
	} else if (hdr->ipv6) {
		/* Check the TCP checksum before creating a SYNACK. */
		value = bpf_csum_diff(0, 0, (void *)hdr->tcp, hdr->tcp_len, 0);
		if (value < 0)
			return XDP_ABORTED;
		if (csum_ipv6_magic(&hdr->ipv6->saddr, &hdr->ipv6->daddr,
				    hdr->tcp_len, IPPROTO_TCP, value) != 0)
			return XDP_DROP; /* Bad TCP checksum. */

		ip_len = sizeof(*hdr->ipv6);

		value = bpf_tcp_raw_gen_syncookie_ipv6(hdr->ipv6, hdr->tcp,
						       hdr->tcp_len);
	} else {
		return XDP_ABORTED;
	}

	/* A negative value from the cookie helper is an error; otherwise the
	 * low 32 bits are used as the cookie.
	 */
	if (value < 0)
		return XDP_ABORTED;
	cookie = (__u32)value;

	/* tsopt stays NULL unless the client sent a timestamp option. */
	if (tscookie_init((void *)hdr->tcp, hdr->tcp_len,
			  &tsopt_buf[0], &tsopt_buf[1], data_end))
		tsopt = tsopt_buf;

	/* Check that there is enough space for a SYNACK. It also covers
	 * the check that the destination of the __builtin_memmove below
	 * doesn't overflow.
	 */
	if (data + sizeof(*hdr->eth) + ip_len + TCP_MAXLEN > data_end)
		return XDP_ABORTED;

	if (hdr->ipv4) {
		if (hdr->ipv4->ihl * 4 > sizeof(*hdr->ipv4)) {
			struct tcphdr *new_tcp_header;

			/* IPv4 options are not echoed back: move the fixed
			 * TCP header right behind the fixed IPv4 header. The
			 * TCP options are regenerated afterwards.
			 */
			new_tcp_header = data + sizeof(*hdr->eth) + sizeof(*hdr->ipv4);
			__builtin_memmove(new_tcp_header, hdr->tcp, sizeof(*hdr->tcp));
			hdr->tcp = new_tcp_header;

			hdr->ipv4->ihl = sizeof(*hdr->ipv4) / 4;
		}

		tcpv4_gen_synack(hdr, cookie, tsopt);
	} else if (hdr->ipv6) {
		tcpv6_gen_synack(hdr, cookie, tsopt);
	} else {
		return XDP_ABORTED;
	}

	/* Recalculate checksums. */
	hdr->tcp->check = 0;
	value = bpf_csum_diff(0, 0, (void *)hdr->tcp, hdr->tcp_len, 0);
	if (value < 0)
		return XDP_ABORTED;
	if (hdr->ipv4) {
		hdr->tcp->check = csum_tcpudp_magic(hdr->ipv4->saddr,
						    hdr->ipv4->daddr,
						    hdr->tcp_len,
						    IPPROTO_TCP,
						    value);

		hdr->ipv4->check = 0;
		value = bpf_csum_diff(0, 0, (void *)hdr->ipv4, sizeof(*hdr->ipv4), 0);
		if (value < 0)
			return XDP_ABORTED;
		hdr->ipv4->check = csum_fold(value);
	} else if (hdr->ipv6) {
		hdr->tcp->check = csum_ipv6_magic(&hdr->ipv6->saddr,
						  &hdr->ipv6->daddr,
						  hdr->tcp_len,
						  IPPROTO_TCP,
						  value);
	} else {
		return XDP_ABORTED;
	}

	/* Set the new packet size. */
	old_pkt_size = data_end - data;
	new_pkt_size = sizeof(*hdr->eth) + ip_len + hdr->tcp->doff * 4;
	if (xdp) {
		/* The XDP helper takes a delta relative to the current size. */
		if (bpf_xdp_adjust_tail(ctx, new_pkt_size - old_pkt_size))
			return XDP_ABORTED;
	} else {
		/* The skb helper takes the absolute new length. */
		if (bpf_skb_change_tail(ctx, new_pkt_size, 0))
			return XDP_ABORTED;
	}

	values_inc_synacks();

	return XDP_TX;
}
694 
695 static __always_inline int syncookie_handle_ack(struct header_pointers *hdr)
696 {
697 	int err;
698 
699 	if (hdr->tcp->rst)
700 		return XDP_DROP;
701 
702 	if (hdr->ipv4)
703 		err = bpf_tcp_raw_check_syncookie_ipv4(hdr->ipv4, hdr->tcp);
704 	else if (hdr->ipv6)
705 		err = bpf_tcp_raw_check_syncookie_ipv6(hdr->ipv6, hdr->tcp);
706 	else
707 		return XDP_ABORTED;
708 	if (err)
709 		return XDP_DROP;
710 
711 	return XDP_PASS;
712 }
713 
714 static __always_inline int syncookie_part1(void *ctx, void *data, void *data_end,
715 					   struct header_pointers *hdr, bool xdp)
716 {
717 	struct bpf_ct_opts ct_lookup_opts = {
718 		.netns_id = BPF_F_CURRENT_NETNS,
719 		.l4proto = IPPROTO_TCP,
720 	};
721 	int ret;
722 
723 	ret = tcp_dissect(data, data_end, hdr);
724 	if (ret != XDP_TX)
725 		return ret;
726 
727 	ret = tcp_lookup(ctx, hdr, xdp);
728 	if (ret != XDP_TX)
729 		return ret;
730 
731 	/* Packet is TCP and doesn't belong to an established connection. */
732 
733 	if ((hdr->tcp->syn ^ hdr->tcp->ack) != 1)
734 		return XDP_DROP;
735 
736 	/* Grow the TCP header to TCP_MAXLEN to be able to pass any hdr->tcp_len
737 	 * to bpf_tcp_raw_gen_syncookie_ipv{4,6} and pass the verifier.
738 	 */
739 	if (xdp) {
740 		if (bpf_xdp_adjust_tail(ctx, TCP_MAXLEN - hdr->tcp_len))
741 			return XDP_ABORTED;
742 	} else {
743 		/* Without volatile the verifier throws this error:
744 		 * R9 32-bit pointer arithmetic prohibited
745 		 */
746 		volatile u64 old_len = data_end - data;
747 
748 		if (bpf_skb_change_tail(ctx, old_len + TCP_MAXLEN - hdr->tcp_len, 0))
749 			return XDP_ABORTED;
750 	}
751 
752 	return XDP_TX;
753 }
754 
/* Second stage of packet processing, run after the packet was resized by
 * syncookie_part1() and the caller reloaded @data and @data_end.
 *
 * Re-derives the header pointers in @hdr from the new @data (which of ipv4
 * and ipv6 is set is taken over from the first stage), re-checks the bounds
 * and dispatches to the SYN or ACK handler.
 *
 * Returns XDP_ABORTED when a bounds check fails or neither IP header pointer
 * is set, otherwise the verdict of syncookie_handle_syn() or
 * syncookie_handle_ack().
 */
static __always_inline int syncookie_part2(void *ctx, void *data, void *data_end,
					   struct header_pointers *hdr, bool xdp)
{
	if (hdr->ipv4) {
		hdr->eth = data;
		hdr->ipv4 = (void *)hdr->eth + sizeof(*hdr->eth);
		/* IPV4_MAXLEN is needed when calculating checksum.
		 * At least sizeof(struct iphdr) is needed here to access ihl.
		 */
		if ((void *)hdr->ipv4 + IPV4_MAXLEN > data_end)
			return XDP_ABORTED;
		hdr->tcp = (void *)hdr->ipv4 + hdr->ipv4->ihl * 4;
	} else if (hdr->ipv6) {
		hdr->eth = data;
		hdr->ipv6 = (void *)hdr->eth + sizeof(*hdr->eth);
		hdr->tcp = (void *)hdr->ipv6 + sizeof(*hdr->ipv6);
	} else {
		return XDP_ABORTED;
	}

	/* Require room for a maximum-size TCP header; this also covers the
	 * IPv6 header that precedes it.
	 */
	if ((void *)hdr->tcp + TCP_MAXLEN > data_end)
		return XDP_ABORTED;

	/* We run out of registers, tcp_len gets spilled to the stack, and the
	 * verifier forgets its min and max values checked above in tcp_dissect.
	 */
	hdr->tcp_len = hdr->tcp->doff * 4;
	if (hdr->tcp_len < sizeof(*hdr->tcp))
		return XDP_ABORTED;

	/* syncookie_part1() let through only packets with exactly one of SYN
	 * and ACK set.
	 */
	return hdr->tcp->syn ? syncookie_handle_syn(hdr, ctx, data, data_end, xdp) :
			       syncookie_handle_ack(hdr);
}
788 
789 SEC("xdp")
790 int syncookie_xdp(struct xdp_md *ctx)
791 {
792 	void *data_end = (void *)(long)ctx->data_end;
793 	void *data = (void *)(long)ctx->data;
794 	struct header_pointers hdr;
795 	int ret;
796 
797 	ret = syncookie_part1(ctx, data, data_end, &hdr, true);
798 	if (ret != XDP_TX)
799 		return ret;
800 
801 	data_end = (void *)(long)ctx->data_end;
802 	data = (void *)(long)ctx->data;
803 
804 	return syncookie_part2(ctx, data, data_end, &hdr, true);
805 }
806 
807 SEC("tc")
808 int syncookie_tc(struct __sk_buff *skb)
809 {
810 	void *data_end = (void *)(long)skb->data_end;
811 	void *data = (void *)(long)skb->data;
812 	struct header_pointers hdr;
813 	int ret;
814 
815 	ret = syncookie_part1(skb, data, data_end, &hdr, false);
816 	if (ret != XDP_TX)
817 		return ret == XDP_PASS ? TC_ACT_OK : TC_ACT_SHOT;
818 
819 	data_end = (void *)(long)skb->data_end;
820 	data = (void *)(long)skb->data;
821 
822 	ret = syncookie_part2(skb, data, data_end, &hdr, false);
823 	switch (ret) {
824 	case XDP_PASS:
825 		return TC_ACT_OK;
826 	case XDP_TX:
827 		return bpf_redirect(skb->ifindex, 0);
828 	default:
829 		return TC_ACT_SHOT;
830 	}
831 }
832 
/* License string placed in the "license" section of the object file.
 * NOTE(review): presumably declared as GPL so that GPL-only helpers can be
 * used, while the SPDX tag at the top of the file covers the source text -
 * confirm.
 */
char _license[] SEC("license") = "GPL";
834