1 // SPDX-License-Identifier: GPL-2.0-only
2 /* (C) 1999-2001 Paul `Rusty' Russell
3  * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
4  * (C) 2002-2013 Jozsef Kadlecsik <kadlec@netfilter.org>
5  * (C) 2006-2012 Patrick McHardy <kaber@trash.net>
6  */
7 
8 #include <linux/types.h>
9 #include <linux/timer.h>
10 #include <linux/module.h>
11 #include <linux/in.h>
12 #include <linux/tcp.h>
13 #include <linux/spinlock.h>
14 #include <linux/skbuff.h>
15 #include <linux/ipv6.h>
16 #include <net/ip6_checksum.h>
17 #include <asm/unaligned.h>
18 
19 #include <net/tcp.h>
20 
21 #include <linux/netfilter.h>
22 #include <linux/netfilter_ipv4.h>
23 #include <linux/netfilter_ipv6.h>
24 #include <net/netfilter/nf_conntrack.h>
25 #include <net/netfilter/nf_conntrack_l4proto.h>
26 #include <net/netfilter/nf_conntrack_ecache.h>
27 #include <net/netfilter/nf_conntrack_seqadj.h>
28 #include <net/netfilter/nf_conntrack_synproxy.h>
29 #include <net/netfilter/nf_conntrack_timeout.h>
30 #include <net/netfilter/nf_log.h>
31 #include <net/netfilter/ipv4/nf_conntrack_ipv4.h>
32 #include <net/netfilter/ipv6/nf_conntrack_ipv6.h>
33 
34   /* FIXME: Examine ipfilter's timeouts and conntrack transitions more
35      closely.  They're more complex. --RR */
36 
/*
 * Printable names of the tracked TCP states. The table is indexed with the
 * numeric connection state (see tcp_print_conntrack() and the "ignored"
 * log message in the packet handler).
 */
static const char *const tcp_conntrack_names[] = {
	[0] = "NONE",
	[1] = "SYN_SENT",
	[2] = "SYN_RECV",
	[3] = "ESTABLISHED",
	[4] = "FIN_WAIT",
	[5] = "CLOSE_WAIT",
	[6] = "LAST_ACK",
	[7] = "TIME_WAIT",
	[8] = "CLOSE",
	[9] = "SYN_SENT2",
};
49 
50 #define SECS * HZ
51 #define MINS * 60 SECS
52 #define HOURS * 60 MINS
53 #define DAYS * 24 HOURS
54 
55 static const unsigned int tcp_timeouts[TCP_CONNTRACK_TIMEOUT_MAX] = {
56 	[TCP_CONNTRACK_SYN_SENT]	= 2 MINS,
57 	[TCP_CONNTRACK_SYN_RECV]	= 60 SECS,
58 	[TCP_CONNTRACK_ESTABLISHED]	= 5 DAYS,
59 	[TCP_CONNTRACK_FIN_WAIT]	= 2 MINS,
60 	[TCP_CONNTRACK_CLOSE_WAIT]	= 60 SECS,
61 	[TCP_CONNTRACK_LAST_ACK]	= 30 SECS,
62 	[TCP_CONNTRACK_TIME_WAIT]	= 2 MINS,
63 	[TCP_CONNTRACK_CLOSE]		= 10 SECS,
64 	[TCP_CONNTRACK_SYN_SENT2]	= 2 MINS,
65 /* RFC1122 says the R2 limit should be at least 100 seconds.
66    Linux uses 15 packets as limit, which corresponds
67    to ~13-30min depending on RTO. */
68 	[TCP_CONNTRACK_RETRANS]		= 5 MINS,
69 	[TCP_CONNTRACK_UNACK]		= 5 MINS,
70 };
71 
/*
 * Three-letter shorthands for the connection states; they keep the rows of
 * the state transition table below short enough to read.
 */
#define sNO	TCP_CONNTRACK_NONE
#define sSS	TCP_CONNTRACK_SYN_SENT
#define sSR	TCP_CONNTRACK_SYN_RECV
#define sES	TCP_CONNTRACK_ESTABLISHED
#define sFW	TCP_CONNTRACK_FIN_WAIT
#define sCW	TCP_CONNTRACK_CLOSE_WAIT
#define sLA	TCP_CONNTRACK_LAST_ACK
#define sTW	TCP_CONNTRACK_TIME_WAIT
#define sCL	TCP_CONNTRACK_CLOSE
#define sS2	TCP_CONNTRACK_SYN_SENT2

/* Pseudo states used as table results: "invalid" and "ignore". */
#define sIV	TCP_CONNTRACK_MAX
#define sIG	TCP_CONNTRACK_IGNORE
84 
/*
 * Classification of a segment by its RST/SYN/FIN/ACK flags. The value is
 * the row index into the state transition table, so the order matters.
 */
enum tcp_bit_set {
	TCP_SYN_SET,		/* SYN without ACK */
	TCP_SYNACK_SET,		/* SYN together with ACK */
	TCP_FIN_SET,		/* FIN */
	TCP_ACK_SET,		/* plain ACK */
	TCP_RST_SET,		/* RST */
	TCP_NONE_SET,		/* none of the above */
};
94 
95 /*
96  * The TCP state transition table needs a few words...
97  *
98  * We are the man in the middle. All the packets go through us
99  * but might get lost in transit to the destination.
100  * It is assumed that the destinations can't receive segments
101  * we haven't seen.
102  *
103  * The checked segment is in window, but our windows are *not*
104  * equivalent with the ones of the sender/receiver. We always
105  * try to guess the state of the current sender.
106  *
107  * The meaning of the states are:
108  *
109  * NONE:	initial state
110  * SYN_SENT:	SYN-only packet seen
111  * SYN_SENT2:	SYN-only packet seen from reply dir, simultaneous open
112  * SYN_RECV:	SYN-ACK packet seen
113  * ESTABLISHED:	ACK packet seen
114  * FIN_WAIT:	FIN packet seen
115  * CLOSE_WAIT:	ACK seen (after FIN)
116  * LAST_ACK:	FIN seen (after FIN)
117  * TIME_WAIT:	last ACK seen
118  * CLOSE:	closed connection (RST)
119  *
120  * Packets marked as IGNORED (sIG):
121  *	if they may be either invalid or valid
122  *	and the receiver may send back a connection
123  *	closing RST or a SYN/ACK.
124  *
125  * Packets marked as INVALID (sIV):
126  *	if we regard them as truly invalid packets
127  */
128 static const u8 tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = {
129 	{
130 /* ORIGINAL */
131 /* 	     sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2	*/
132 /*syn*/	   { sSS, sSS, sIG, sIG, sIG, sIG, sIG, sSS, sSS, sS2 },
133 /*
134  *	sNO -> sSS	Initialize a new connection
135  *	sSS -> sSS	Retransmitted SYN
136  *	sS2 -> sS2	Late retransmitted SYN
137  *	sSR -> sIG
138  *	sES -> sIG	Error: SYNs in window outside the SYN_SENT state
139  *			are errors. Receiver will reply with RST
140  *			and close the connection.
141  *			Or we are not in sync and hold a dead connection.
142  *	sFW -> sIG
143  *	sCW -> sIG
144  *	sLA -> sIG
145  *	sTW -> sSS	Reopened connection (RFC 1122).
146  *	sCL -> sSS
147  */
148 /* 	     sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2	*/
149 /*synack*/ { sIV, sIV, sSR, sIV, sIV, sIV, sIV, sIV, sIV, sSR },
150 /*
151  *	sNO -> sIV	Too late and no reason to do anything
152  *	sSS -> sIV	Client can't send SYN and then SYN/ACK
153  *	sS2 -> sSR	SYN/ACK sent to SYN2 in simultaneous open
154  *	sSR -> sSR	Late retransmitted SYN/ACK in simultaneous open
155  *	sES -> sIV	Invalid SYN/ACK packets sent by the client
156  *	sFW -> sIV
157  *	sCW -> sIV
158  *	sLA -> sIV
159  *	sTW -> sIV
160  *	sCL -> sIV
161  */
162 /* 	     sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2	*/
163 /*fin*/    { sIV, sIV, sFW, sFW, sLA, sLA, sLA, sTW, sCL, sIV },
164 /*
165  *	sNO -> sIV	Too late and no reason to do anything...
166  *	sSS -> sIV	Client migth not send FIN in this state:
167  *			we enforce waiting for a SYN/ACK reply first.
168  *	sS2 -> sIV
169  *	sSR -> sFW	Close started.
170  *	sES -> sFW
171  *	sFW -> sLA	FIN seen in both directions, waiting for
172  *			the last ACK.
173  *			Migth be a retransmitted FIN as well...
174  *	sCW -> sLA
175  *	sLA -> sLA	Retransmitted FIN. Remain in the same state.
176  *	sTW -> sTW
177  *	sCL -> sCL
178  */
179 /* 	     sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2	*/
180 /*ack*/	   { sES, sIV, sES, sES, sCW, sCW, sTW, sTW, sCL, sIV },
181 /*
182  *	sNO -> sES	Assumed.
183  *	sSS -> sIV	ACK is invalid: we haven't seen a SYN/ACK yet.
184  *	sS2 -> sIV
185  *	sSR -> sES	Established state is reached.
186  *	sES -> sES	:-)
187  *	sFW -> sCW	Normal close request answered by ACK.
188  *	sCW -> sCW
189  *	sLA -> sTW	Last ACK detected (RFC5961 challenged)
190  *	sTW -> sTW	Retransmitted last ACK. Remain in the same state.
191  *	sCL -> sCL
192  */
193 /* 	     sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2	*/
194 /*rst*/    { sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL },
195 /*none*/   { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV }
196 	},
197 	{
198 /* REPLY */
199 /* 	     sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2	*/
200 /*syn*/	   { sIV, sS2, sIV, sIV, sIV, sIV, sIV, sSS, sIV, sS2 },
201 /*
202  *	sNO -> sIV	Never reached.
203  *	sSS -> sS2	Simultaneous open
204  *	sS2 -> sS2	Retransmitted simultaneous SYN
205  *	sSR -> sIV	Invalid SYN packets sent by the server
206  *	sES -> sIV
207  *	sFW -> sIV
208  *	sCW -> sIV
209  *	sLA -> sIV
210  *	sTW -> sSS	Reopened connection, but server may have switched role
211  *	sCL -> sIV
212  */
213 /* 	     sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2	*/
214 /*synack*/ { sIV, sSR, sIG, sIG, sIG, sIG, sIG, sIG, sIG, sSR },
215 /*
216  *	sSS -> sSR	Standard open.
217  *	sS2 -> sSR	Simultaneous open
218  *	sSR -> sIG	Retransmitted SYN/ACK, ignore it.
219  *	sES -> sIG	Late retransmitted SYN/ACK?
220  *	sFW -> sIG	Might be SYN/ACK answering ignored SYN
221  *	sCW -> sIG
222  *	sLA -> sIG
223  *	sTW -> sIG
224  *	sCL -> sIG
225  */
226 /* 	     sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2	*/
227 /*fin*/    { sIV, sIV, sFW, sFW, sLA, sLA, sLA, sTW, sCL, sIV },
228 /*
229  *	sSS -> sIV	Server might not send FIN in this state.
230  *	sS2 -> sIV
231  *	sSR -> sFW	Close started.
232  *	sES -> sFW
233  *	sFW -> sLA	FIN seen in both directions.
234  *	sCW -> sLA
235  *	sLA -> sLA	Retransmitted FIN.
236  *	sTW -> sTW
237  *	sCL -> sCL
238  */
239 /* 	     sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2	*/
240 /*ack*/	   { sIV, sIG, sSR, sES, sCW, sCW, sTW, sTW, sCL, sIG },
241 /*
242  *	sSS -> sIG	Might be a half-open connection.
243  *	sS2 -> sIG
244  *	sSR -> sSR	Might answer late resent SYN.
245  *	sES -> sES	:-)
246  *	sFW -> sCW	Normal close request answered by ACK.
247  *	sCW -> sCW
248  *	sLA -> sTW	Last ACK detected (RFC5961 challenged)
249  *	sTW -> sTW	Retransmitted last ACK.
250  *	sCL -> sCL
251  */
252 /* 	     sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2	*/
253 /*rst*/    { sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL },
254 /*none*/   { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV }
255 	}
256 };
257 
#ifdef CONFIG_NF_CONNTRACK_PROCFS
/*
 * Emit the TCP-private part of a conntrack entry: the state name followed
 * by a space. Entries with IPS_OFFLOAD_BIT set print nothing.
 */
static void tcp_print_conntrack(struct seq_file *s, struct nf_conn *ct)
{
	const char *state_name;

	if (test_bit(IPS_OFFLOAD_BIT, &ct->status))
		return;

	state_name = tcp_conntrack_names[ct->proto.tcp.state];
	seq_printf(s, "%s ", state_name);
}
#endif
268 
269 static unsigned int get_conntrack_index(const struct tcphdr *tcph)
270 {
271 	if (tcph->rst) return TCP_RST_SET;
272 	else if (tcph->syn) return (tcph->ack ? TCP_SYNACK_SET : TCP_SYN_SET);
273 	else if (tcph->fin) return TCP_FIN_SET;
274 	else if (tcph->ack) return TCP_ACK_SET;
275 	else return TCP_NONE_SET;
276 }
277 
278 /* TCP connection tracking based on 'Real Stateful TCP Packet Filtering
279    in IP Filter' by Guido van Rooij.
280 
281    http://www.sane.nl/events/sane2000/papers.html
282    http://www.darkart.com/mirrors/www.obfuscation.org/ipf/
283 
284    The boundaries and the conditions are changed according to RFC793:
285    the packet must intersect the window (i.e. segments may be
286    after the right or before the left edge) and thus receivers may ACK
287    segments after the right edge of the window.
288 
289 	td_maxend = max(sack + max(win,1)) seen in reply packets
290 	td_maxwin = max(max(win, 1)) + (sack - ack) seen in sent packets
291 	td_maxwin += seq + len - sender.td_maxend
292 			if seq + len > sender.td_maxend
293 	td_end    = max(seq + len) seen in sent packets
294 
295    I.   Upper bound for valid data:	seq <= sender.td_maxend
296    II.  Lower bound for valid data:	seq + len >= sender.td_end - receiver.td_maxwin
297    III.	Upper bound for valid (s)ack:   sack <= receiver.td_end
298    IV.	Lower bound for valid (s)ack:	sack >= receiver.td_end - MAXACKWINDOW
299 
300    where sack is the highest right edge of sack block found in the packet
301    or ack in the case of packet without SACK option.
302 
   The upper bound limit for a valid (s)ack is not ignored -
   we don't have to deal with fragments.
305 */
306 
/*
 * Sequence number just past the end of a segment.
 *
 * @seq:     sequence number of the segment (host byte order)
 * @len:     total packet length
 * @dataoff: offset of the TCP header inside the packet
 * @tcph:    TCP header; doff, syn and fin are read
 *
 * The payload length is len minus dataoff minus the TCP header length
 * (doff * 4); SYN and FIN each occupy one extra sequence number.
 * The result wraps modulo 2^32.
 */
static inline __u32 segment_seq_plus_len(__u32 seq,
					 size_t len,
					 unsigned int dataoff,
					 const struct tcphdr *tcph)
{
	/* XXX Should I use payload length field in IP/IPv6 header ?
	 * - YK */
	__u32 seg_end = seq + len - dataoff - tcph->doff * 4;

	if (tcph->syn)
		seg_end++;
	if (tcph->fin)
		seg_end++;

	return seg_end;
}
317 
/* Fixme: what about big packets? */
#define MAXACKWINCONST			66000

/*
 * Width of the window used for the lower bound of a valid (s)ack:
 * the sender's td_maxwin, but never less than MAXACKWINCONST.
 */
#define MAXACKWINDOW(sender)						\
	(MAXACKWINCONST < (sender)->td_maxwin ? (sender)->td_maxwin	\
					      : MAXACKWINCONST)
323 
324 /*
325  * Simplified tcp_parse_options routine from tcp_input.c
326  */
327 static void tcp_options(const struct sk_buff *skb,
328 			unsigned int dataoff,
329 			const struct tcphdr *tcph,
330 			struct ip_ct_tcp_state *state)
331 {
332 	unsigned char buff[(15 * 4) - sizeof(struct tcphdr)];
333 	const unsigned char *ptr;
334 	int length = (tcph->doff*4) - sizeof(struct tcphdr);
335 
336 	if (!length)
337 		return;
338 
339 	ptr = skb_header_pointer(skb, dataoff + sizeof(struct tcphdr),
340 				 length, buff);
341 	if (!ptr)
342 		return;
343 
344 	state->td_scale =
345 	state->flags = 0;
346 
347 	while (length > 0) {
348 		int opcode=*ptr++;
349 		int opsize;
350 
351 		switch (opcode) {
352 		case TCPOPT_EOL:
353 			return;
354 		case TCPOPT_NOP:	/* Ref: RFC 793 section 3.1 */
355 			length--;
356 			continue;
357 		default:
358 			if (length < 2)
359 				return;
360 			opsize=*ptr++;
361 			if (opsize < 2) /* "silly options" */
362 				return;
363 			if (opsize > length)
364 				return;	/* don't parse partial options */
365 
366 			if (opcode == TCPOPT_SACK_PERM
367 			    && opsize == TCPOLEN_SACK_PERM)
368 				state->flags |= IP_CT_TCP_FLAG_SACK_PERM;
369 			else if (opcode == TCPOPT_WINDOW
370 				 && opsize == TCPOLEN_WINDOW) {
371 				state->td_scale = *(u_int8_t *)ptr;
372 
373 				if (state->td_scale > TCP_MAX_WSCALE)
374 					state->td_scale = TCP_MAX_WSCALE;
375 
376 				state->flags |=
377 					IP_CT_TCP_FLAG_WINDOW_SCALE;
378 			}
379 			ptr += opsize - 2;
380 			length -= opsize;
381 		}
382 	}
383 }
384 
385 static void tcp_sack(const struct sk_buff *skb, unsigned int dataoff,
386                      const struct tcphdr *tcph, __u32 *sack)
387 {
388 	unsigned char buff[(15 * 4) - sizeof(struct tcphdr)];
389 	const unsigned char *ptr;
390 	int length = (tcph->doff*4) - sizeof(struct tcphdr);
391 	__u32 tmp;
392 
393 	if (!length)
394 		return;
395 
396 	ptr = skb_header_pointer(skb, dataoff + sizeof(struct tcphdr),
397 				 length, buff);
398 	if (!ptr)
399 		return;
400 
401 	/* Fast path for timestamp-only option */
402 	if (length == TCPOLEN_TSTAMP_ALIGNED
403 	    && *(__be32 *)ptr == htonl((TCPOPT_NOP << 24)
404 				       | (TCPOPT_NOP << 16)
405 				       | (TCPOPT_TIMESTAMP << 8)
406 				       | TCPOLEN_TIMESTAMP))
407 		return;
408 
409 	while (length > 0) {
410 		int opcode = *ptr++;
411 		int opsize, i;
412 
413 		switch (opcode) {
414 		case TCPOPT_EOL:
415 			return;
416 		case TCPOPT_NOP:	/* Ref: RFC 793 section 3.1 */
417 			length--;
418 			continue;
419 		default:
420 			if (length < 2)
421 				return;
422 			opsize = *ptr++;
423 			if (opsize < 2) /* "silly options" */
424 				return;
425 			if (opsize > length)
426 				return;	/* don't parse partial options */
427 
428 			if (opcode == TCPOPT_SACK
429 			    && opsize >= (TCPOLEN_SACK_BASE
430 					  + TCPOLEN_SACK_PERBLOCK)
431 			    && !((opsize - TCPOLEN_SACK_BASE)
432 				 % TCPOLEN_SACK_PERBLOCK)) {
433 				for (i = 0;
434 				     i < (opsize - TCPOLEN_SACK_BASE);
435 				     i += TCPOLEN_SACK_PERBLOCK) {
436 					tmp = get_unaligned_be32((__be32 *)(ptr+i)+1);
437 
438 					if (after(tmp, *sack))
439 						*sack = tmp;
440 				}
441 				return;
442 			}
443 			ptr += opsize - 2;
444 			length -= opsize;
445 		}
446 	}
447 }
448 
449 static bool tcp_in_window(struct nf_conn *ct,
450 			  enum ip_conntrack_dir dir,
451 			  unsigned int index,
452 			  const struct sk_buff *skb,
453 			  unsigned int dataoff,
454 			  const struct tcphdr *tcph,
455 			  const struct nf_hook_state *hook_state)
456 {
457 	struct ip_ct_tcp *state = &ct->proto.tcp;
458 	struct net *net = nf_ct_net(ct);
459 	struct nf_tcp_net *tn = nf_tcp_pernet(net);
460 	struct ip_ct_tcp_state *sender = &state->seen[dir];
461 	struct ip_ct_tcp_state *receiver = &state->seen[!dir];
462 	const struct nf_conntrack_tuple *tuple = &ct->tuplehash[dir].tuple;
463 	__u32 seq, ack, sack, end, win, swin;
464 	u16 win_raw;
465 	s32 receiver_offset;
466 	bool res, in_recv_win;
467 
468 	/*
469 	 * Get the required data from the packet.
470 	 */
471 	seq = ntohl(tcph->seq);
472 	ack = sack = ntohl(tcph->ack_seq);
473 	win_raw = ntohs(tcph->window);
474 	win = win_raw;
475 	end = segment_seq_plus_len(seq, skb->len, dataoff, tcph);
476 
477 	if (receiver->flags & IP_CT_TCP_FLAG_SACK_PERM)
478 		tcp_sack(skb, dataoff, tcph, &sack);
479 
480 	/* Take into account NAT sequence number mangling */
481 	receiver_offset = nf_ct_seq_offset(ct, !dir, ack - 1);
482 	ack -= receiver_offset;
483 	sack -= receiver_offset;
484 
485 	pr_debug("tcp_in_window: START\n");
486 	pr_debug("tcp_in_window: ");
487 	nf_ct_dump_tuple(tuple);
488 	pr_debug("seq=%u ack=%u+(%d) sack=%u+(%d) win=%u end=%u\n",
489 		 seq, ack, receiver_offset, sack, receiver_offset, win, end);
490 	pr_debug("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i "
491 		 "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
492 		 sender->td_end, sender->td_maxend, sender->td_maxwin,
493 		 sender->td_scale,
494 		 receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
495 		 receiver->td_scale);
496 
497 	if (sender->td_maxwin == 0) {
498 		/*
499 		 * Initialize sender data.
500 		 */
501 		if (tcph->syn) {
502 			/*
503 			 * SYN-ACK in reply to a SYN
504 			 * or SYN from reply direction in simultaneous open.
505 			 */
506 			sender->td_end =
507 			sender->td_maxend = end;
508 			sender->td_maxwin = (win == 0 ? 1 : win);
509 
510 			tcp_options(skb, dataoff, tcph, sender);
511 			/*
512 			 * RFC 1323:
513 			 * Both sides must send the Window Scale option
514 			 * to enable window scaling in either direction.
515 			 */
516 			if (!(sender->flags & IP_CT_TCP_FLAG_WINDOW_SCALE
517 			      && receiver->flags & IP_CT_TCP_FLAG_WINDOW_SCALE))
518 				sender->td_scale =
519 				receiver->td_scale = 0;
520 			if (!tcph->ack)
521 				/* Simultaneous open */
522 				return true;
523 		} else {
524 			/*
525 			 * We are in the middle of a connection,
526 			 * its history is lost for us.
527 			 * Let's try to use the data from the packet.
528 			 */
529 			sender->td_end = end;
530 			swin = win << sender->td_scale;
531 			sender->td_maxwin = (swin == 0 ? 1 : swin);
532 			sender->td_maxend = end + sender->td_maxwin;
533 			if (receiver->td_maxwin == 0) {
534 				/* We haven't seen traffic in the other
535 				 * direction yet but we have to tweak window
536 				 * tracking to pass III and IV until that
537 				 * happens.
538 				 */
539 				receiver->td_end = receiver->td_maxend = sack;
540 			} else if (sack == receiver->td_end + 1) {
541 				/* Likely a reply to a keepalive.
542 				 * Needed for III.
543 				 */
544 				receiver->td_end++;
545 			}
546 
547 		}
548 	} else if (((state->state == TCP_CONNTRACK_SYN_SENT
549 		     && dir == IP_CT_DIR_ORIGINAL)
550 		   || (state->state == TCP_CONNTRACK_SYN_RECV
551 		     && dir == IP_CT_DIR_REPLY))
552 		   && after(end, sender->td_end)) {
553 		/*
554 		 * RFC 793: "if a TCP is reinitialized ... then it need
555 		 * not wait at all; it must only be sure to use sequence
556 		 * numbers larger than those recently used."
557 		 */
558 		sender->td_end =
559 		sender->td_maxend = end;
560 		sender->td_maxwin = (win == 0 ? 1 : win);
561 
562 		tcp_options(skb, dataoff, tcph, sender);
563 	}
564 
565 	if (!(tcph->ack)) {
566 		/*
567 		 * If there is no ACK, just pretend it was set and OK.
568 		 */
569 		ack = sack = receiver->td_end;
570 	} else if (((tcp_flag_word(tcph) & (TCP_FLAG_ACK|TCP_FLAG_RST)) ==
571 		    (TCP_FLAG_ACK|TCP_FLAG_RST))
572 		   && (ack == 0)) {
573 		/*
574 		 * Broken TCP stacks, that set ACK in RST packets as well
575 		 * with zero ack value.
576 		 */
577 		ack = sack = receiver->td_end;
578 	}
579 
580 	if (tcph->rst && seq == 0 && state->state == TCP_CONNTRACK_SYN_SENT)
581 		/*
582 		 * RST sent answering SYN.
583 		 */
584 		seq = end = sender->td_end;
585 
586 	pr_debug("tcp_in_window: ");
587 	nf_ct_dump_tuple(tuple);
588 	pr_debug("seq=%u ack=%u+(%d) sack=%u+(%d) win=%u end=%u\n",
589 		 seq, ack, receiver_offset, sack, receiver_offset, win, end);
590 	pr_debug("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i "
591 		 "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
592 		 sender->td_end, sender->td_maxend, sender->td_maxwin,
593 		 sender->td_scale,
594 		 receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
595 		 receiver->td_scale);
596 
597 	/* Is the ending sequence in the receive window (if available)? */
598 	in_recv_win = !receiver->td_maxwin ||
599 		      after(end, sender->td_end - receiver->td_maxwin - 1);
600 
601 	pr_debug("tcp_in_window: I=%i II=%i III=%i IV=%i\n",
602 		 before(seq, sender->td_maxend + 1),
603 		 (in_recv_win ? 1 : 0),
604 		 before(sack, receiver->td_end + 1),
605 		 after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1));
606 
607 	if (before(seq, sender->td_maxend + 1) &&
608 	    in_recv_win &&
609 	    before(sack, receiver->td_end + 1) &&
610 	    after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1)) {
611 		/*
612 		 * Take into account window scaling (RFC 1323).
613 		 */
614 		if (!tcph->syn)
615 			win <<= sender->td_scale;
616 
617 		/*
618 		 * Update sender data.
619 		 */
620 		swin = win + (sack - ack);
621 		if (sender->td_maxwin < swin)
622 			sender->td_maxwin = swin;
623 		if (after(end, sender->td_end)) {
624 			sender->td_end = end;
625 			sender->flags |= IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED;
626 		}
627 		if (tcph->ack) {
628 			if (!(sender->flags & IP_CT_TCP_FLAG_MAXACK_SET)) {
629 				sender->td_maxack = ack;
630 				sender->flags |= IP_CT_TCP_FLAG_MAXACK_SET;
631 			} else if (after(ack, sender->td_maxack))
632 				sender->td_maxack = ack;
633 		}
634 
635 		/*
636 		 * Update receiver data.
637 		 */
638 		if (receiver->td_maxwin != 0 && after(end, sender->td_maxend))
639 			receiver->td_maxwin += end - sender->td_maxend;
640 		if (after(sack + win, receiver->td_maxend - 1)) {
641 			receiver->td_maxend = sack + win;
642 			if (win == 0)
643 				receiver->td_maxend++;
644 		}
645 		if (ack == receiver->td_end)
646 			receiver->flags &= ~IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED;
647 
648 		/*
649 		 * Check retransmissions.
650 		 */
651 		if (index == TCP_ACK_SET) {
652 			if (state->last_dir == dir
653 			    && state->last_seq == seq
654 			    && state->last_ack == ack
655 			    && state->last_end == end
656 			    && state->last_win == win_raw)
657 				state->retrans++;
658 			else {
659 				state->last_dir = dir;
660 				state->last_seq = seq;
661 				state->last_ack = ack;
662 				state->last_end = end;
663 				state->last_win = win_raw;
664 				state->retrans = 0;
665 			}
666 		}
667 		res = true;
668 	} else {
669 		res = false;
670 		if (sender->flags & IP_CT_TCP_FLAG_BE_LIBERAL ||
671 		    tn->tcp_be_liberal)
672 			res = true;
673 		if (!res) {
674 			nf_ct_l4proto_log_invalid(skb, ct, hook_state,
675 			"%s",
676 			before(seq, sender->td_maxend + 1) ?
677 			in_recv_win ?
678 			before(sack, receiver->td_end + 1) ?
679 			after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1) ? "BUG"
680 			: "ACK is under the lower bound (possible overly delayed ACK)"
681 			: "ACK is over the upper bound (ACKed data not seen yet)"
682 			: "SEQ is under the lower bound (already ACKed data retransmitted)"
683 			: "SEQ is over the upper bound (over the window of the receiver)");
684 		}
685 	}
686 
687 	pr_debug("tcp_in_window: res=%u sender end=%u maxend=%u maxwin=%u "
688 		 "receiver end=%u maxend=%u maxwin=%u\n",
689 		 res, sender->td_end, sender->td_maxend, sender->td_maxwin,
690 		 receiver->td_end, receiver->td_maxend, receiver->td_maxwin);
691 
692 	return res;
693 }
694 
695 /* table of valid flag combinations - PUSH, ECE and CWR are always valid */
696 static const u8 tcp_valid_flags[(TCPHDR_FIN|TCPHDR_SYN|TCPHDR_RST|TCPHDR_ACK|
697 				 TCPHDR_URG) + 1] =
698 {
699 	[TCPHDR_SYN]				= 1,
700 	[TCPHDR_SYN|TCPHDR_URG]			= 1,
701 	[TCPHDR_SYN|TCPHDR_ACK]			= 1,
702 	[TCPHDR_RST]				= 1,
703 	[TCPHDR_RST|TCPHDR_ACK]			= 1,
704 	[TCPHDR_FIN|TCPHDR_ACK]			= 1,
705 	[TCPHDR_FIN|TCPHDR_ACK|TCPHDR_URG]	= 1,
706 	[TCPHDR_ACK]				= 1,
707 	[TCPHDR_ACK|TCPHDR_URG]			= 1,
708 };
709 
710 static void tcp_error_log(const struct sk_buff *skb,
711 			  const struct nf_hook_state *state,
712 			  const char *msg)
713 {
714 	nf_l4proto_log_invalid(skb, state, IPPROTO_TCP, "%s", msg);
715 }
716 
717 /* Protect conntrack agaist broken packets. Code taken from ipt_unclean.c.  */
718 static bool tcp_error(const struct tcphdr *th,
719 		      struct sk_buff *skb,
720 		      unsigned int dataoff,
721 		      const struct nf_hook_state *state)
722 {
723 	unsigned int tcplen = skb->len - dataoff;
724 	u8 tcpflags;
725 
726 	/* Not whole TCP header or malformed packet */
727 	if (th->doff*4 < sizeof(struct tcphdr) || tcplen < th->doff*4) {
728 		tcp_error_log(skb, state, "truncated packet");
729 		return true;
730 	}
731 
732 	/* Checksum invalid? Ignore.
733 	 * We skip checking packets on the outgoing path
734 	 * because the checksum is assumed to be correct.
735 	 */
736 	/* FIXME: Source route IP option packets --RR */
737 	if (state->net->ct.sysctl_checksum &&
738 	    state->hook == NF_INET_PRE_ROUTING &&
739 	    nf_checksum(skb, state->hook, dataoff, IPPROTO_TCP, state->pf)) {
740 		tcp_error_log(skb, state, "bad checksum");
741 		return true;
742 	}
743 
744 	/* Check TCP flags. */
745 	tcpflags = (tcp_flag_byte(th) & ~(TCPHDR_ECE|TCPHDR_CWR|TCPHDR_PSH));
746 	if (!tcp_valid_flags[tcpflags]) {
747 		tcp_error_log(skb, state, "invalid tcp flag combination");
748 		return true;
749 	}
750 
751 	return false;
752 }
753 
754 static noinline bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb,
755 			     unsigned int dataoff,
756 			     const struct tcphdr *th)
757 {
758 	enum tcp_conntrack new_state;
759 	struct net *net = nf_ct_net(ct);
760 	const struct nf_tcp_net *tn = nf_tcp_pernet(net);
761 	const struct ip_ct_tcp_state *sender = &ct->proto.tcp.seen[0];
762 	const struct ip_ct_tcp_state *receiver = &ct->proto.tcp.seen[1];
763 
764 	/* Don't need lock here: this conntrack not in circulation yet */
765 	new_state = tcp_conntracks[0][get_conntrack_index(th)][TCP_CONNTRACK_NONE];
766 
767 	/* Invalid: delete conntrack */
768 	if (new_state >= TCP_CONNTRACK_MAX) {
769 		pr_debug("nf_ct_tcp: invalid new deleting.\n");
770 		return false;
771 	}
772 
773 	if (new_state == TCP_CONNTRACK_SYN_SENT) {
774 		memset(&ct->proto.tcp, 0, sizeof(ct->proto.tcp));
775 		/* SYN packet */
776 		ct->proto.tcp.seen[0].td_end =
777 			segment_seq_plus_len(ntohl(th->seq), skb->len,
778 					     dataoff, th);
779 		ct->proto.tcp.seen[0].td_maxwin = ntohs(th->window);
780 		if (ct->proto.tcp.seen[0].td_maxwin == 0)
781 			ct->proto.tcp.seen[0].td_maxwin = 1;
782 		ct->proto.tcp.seen[0].td_maxend =
783 			ct->proto.tcp.seen[0].td_end;
784 
785 		tcp_options(skb, dataoff, th, &ct->proto.tcp.seen[0]);
786 	} else if (tn->tcp_loose == 0) {
787 		/* Don't try to pick up connections. */
788 		return false;
789 	} else {
790 		memset(&ct->proto.tcp, 0, sizeof(ct->proto.tcp));
791 		/*
792 		 * We are in the middle of a connection,
793 		 * its history is lost for us.
794 		 * Let's try to use the data from the packet.
795 		 */
796 		ct->proto.tcp.seen[0].td_end =
797 			segment_seq_plus_len(ntohl(th->seq), skb->len,
798 					     dataoff, th);
799 		ct->proto.tcp.seen[0].td_maxwin = ntohs(th->window);
800 		if (ct->proto.tcp.seen[0].td_maxwin == 0)
801 			ct->proto.tcp.seen[0].td_maxwin = 1;
802 		ct->proto.tcp.seen[0].td_maxend =
803 			ct->proto.tcp.seen[0].td_end +
804 			ct->proto.tcp.seen[0].td_maxwin;
805 
806 		/* We assume SACK and liberal window checking to handle
807 		 * window scaling */
808 		ct->proto.tcp.seen[0].flags =
809 		ct->proto.tcp.seen[1].flags = IP_CT_TCP_FLAG_SACK_PERM |
810 					      IP_CT_TCP_FLAG_BE_LIBERAL;
811 	}
812 
813 	/* tcp_packet will set them */
814 	ct->proto.tcp.last_index = TCP_NONE_SET;
815 
816 	pr_debug("%s: sender end=%u maxend=%u maxwin=%u scale=%i "
817 		 "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
818 		 __func__,
819 		 sender->td_end, sender->td_maxend, sender->td_maxwin,
820 		 sender->td_scale,
821 		 receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
822 		 receiver->td_scale);
823 	return true;
824 }
825 
826 /* Returns verdict for packet, or -1 for invalid. */
827 int nf_conntrack_tcp_packet(struct nf_conn *ct,
828 			    struct sk_buff *skb,
829 			    unsigned int dataoff,
830 			    enum ip_conntrack_info ctinfo,
831 			    const struct nf_hook_state *state)
832 {
833 	struct net *net = nf_ct_net(ct);
834 	struct nf_tcp_net *tn = nf_tcp_pernet(net);
835 	struct nf_conntrack_tuple *tuple;
836 	enum tcp_conntrack new_state, old_state;
837 	unsigned int index, *timeouts;
838 	enum ip_conntrack_dir dir;
839 	const struct tcphdr *th;
840 	struct tcphdr _tcph;
841 	unsigned long timeout;
842 
843 	th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph);
844 	if (th == NULL)
845 		return -NF_ACCEPT;
846 
847 	if (tcp_error(th, skb, dataoff, state))
848 		return -NF_ACCEPT;
849 
850 	if (!nf_ct_is_confirmed(ct) && !tcp_new(ct, skb, dataoff, th))
851 		return -NF_ACCEPT;
852 
853 	spin_lock_bh(&ct->lock);
854 	old_state = ct->proto.tcp.state;
855 	dir = CTINFO2DIR(ctinfo);
856 	index = get_conntrack_index(th);
857 	new_state = tcp_conntracks[dir][index][old_state];
858 	tuple = &ct->tuplehash[dir].tuple;
859 
860 	switch (new_state) {
861 	case TCP_CONNTRACK_SYN_SENT:
862 		if (old_state < TCP_CONNTRACK_TIME_WAIT)
863 			break;
864 		/* RFC 1122: "When a connection is closed actively,
865 		 * it MUST linger in TIME-WAIT state for a time 2xMSL
866 		 * (Maximum Segment Lifetime). However, it MAY accept
867 		 * a new SYN from the remote TCP to reopen the connection
868 		 * directly from TIME-WAIT state, if..."
869 		 * We ignore the conditions because we are in the
870 		 * TIME-WAIT state anyway.
871 		 *
872 		 * Handle aborted connections: we and the server
873 		 * think there is an existing connection but the client
874 		 * aborts it and starts a new one.
875 		 */
876 		if (((ct->proto.tcp.seen[dir].flags
877 		      | ct->proto.tcp.seen[!dir].flags)
878 		     & IP_CT_TCP_FLAG_CLOSE_INIT)
879 		    || (ct->proto.tcp.last_dir == dir
880 		        && ct->proto.tcp.last_index == TCP_RST_SET)) {
881 			/* Attempt to reopen a closed/aborted connection.
882 			 * Delete this connection and look up again. */
883 			spin_unlock_bh(&ct->lock);
884 
885 			/* Only repeat if we can actually remove the timer.
886 			 * Destruction may already be in progress in process
887 			 * context and we must give it a chance to terminate.
888 			 */
889 			if (nf_ct_kill(ct))
890 				return -NF_REPEAT;
891 			return NF_DROP;
892 		}
893 		fallthrough;
894 	case TCP_CONNTRACK_IGNORE:
895 		/* Ignored packets:
896 		 *
897 		 * Our connection entry may be out of sync, so ignore
898 		 * packets which may signal the real connection between
899 		 * the client and the server.
900 		 *
901 		 * a) SYN in ORIGINAL
902 		 * b) SYN/ACK in REPLY
903 		 * c) ACK in reply direction after initial SYN in original.
904 		 *
905 		 * If the ignored packet is invalid, the receiver will send
906 		 * a RST we'll catch below.
907 		 */
908 		if (index == TCP_SYNACK_SET
909 		    && ct->proto.tcp.last_index == TCP_SYN_SET
910 		    && ct->proto.tcp.last_dir != dir
911 		    && ntohl(th->ack_seq) == ct->proto.tcp.last_end) {
912 			/* b) This SYN/ACK acknowledges a SYN that we earlier
913 			 * ignored as invalid. This means that the client and
914 			 * the server are both in sync, while the firewall is
915 			 * not. We get in sync from the previously annotated
916 			 * values.
917 			 */
918 			old_state = TCP_CONNTRACK_SYN_SENT;
919 			new_state = TCP_CONNTRACK_SYN_RECV;
920 			ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_end =
921 				ct->proto.tcp.last_end;
922 			ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_maxend =
923 				ct->proto.tcp.last_end;
924 			ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_maxwin =
925 				ct->proto.tcp.last_win == 0 ?
926 					1 : ct->proto.tcp.last_win;
927 			ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_scale =
928 				ct->proto.tcp.last_wscale;
929 			ct->proto.tcp.last_flags &= ~IP_CT_EXP_CHALLENGE_ACK;
930 			ct->proto.tcp.seen[ct->proto.tcp.last_dir].flags =
931 				ct->proto.tcp.last_flags;
932 			memset(&ct->proto.tcp.seen[dir], 0,
933 			       sizeof(struct ip_ct_tcp_state));
934 			break;
935 		}
936 		ct->proto.tcp.last_index = index;
937 		ct->proto.tcp.last_dir = dir;
938 		ct->proto.tcp.last_seq = ntohl(th->seq);
939 		ct->proto.tcp.last_end =
940 		    segment_seq_plus_len(ntohl(th->seq), skb->len, dataoff, th);
941 		ct->proto.tcp.last_win = ntohs(th->window);
942 
943 		/* a) This is a SYN in ORIGINAL. The client and the server
944 		 * may be in sync but we are not. In that case, we annotate
945 		 * the TCP options and let the packet go through. If it is a
946 		 * valid SYN packet, the server will reply with a SYN/ACK, and
947 		 * then we'll get in sync. Otherwise, the server potentially
948 		 * responds with a challenge ACK if implementing RFC5961.
949 		 */
950 		if (index == TCP_SYN_SET && dir == IP_CT_DIR_ORIGINAL) {
951 			struct ip_ct_tcp_state seen = {};
952 
953 			ct->proto.tcp.last_flags =
954 			ct->proto.tcp.last_wscale = 0;
955 			tcp_options(skb, dataoff, th, &seen);
956 			if (seen.flags & IP_CT_TCP_FLAG_WINDOW_SCALE) {
957 				ct->proto.tcp.last_flags |=
958 					IP_CT_TCP_FLAG_WINDOW_SCALE;
959 				ct->proto.tcp.last_wscale = seen.td_scale;
960 			}
961 			if (seen.flags & IP_CT_TCP_FLAG_SACK_PERM) {
962 				ct->proto.tcp.last_flags |=
963 					IP_CT_TCP_FLAG_SACK_PERM;
964 			}
			/* Mark the potential for RFC5961 challenge ACK,
			 * this poses a special problem for LAST_ACK state
			 * as ACK is interpreted as ACKing last FIN.
			 */
969 			if (old_state == TCP_CONNTRACK_LAST_ACK)
970 				ct->proto.tcp.last_flags |=
971 					IP_CT_EXP_CHALLENGE_ACK;
972 		}
973 		spin_unlock_bh(&ct->lock);
974 		nf_ct_l4proto_log_invalid(skb, ct, state,
975 					  "packet (index %d) in dir %d ignored, state %s",
976 					  index, dir,
977 					  tcp_conntrack_names[old_state]);
978 		return NF_ACCEPT;
979 	case TCP_CONNTRACK_MAX:
980 		/* Special case for SYN proxy: when the SYN to the server or
981 		 * the SYN/ACK from the server is lost, the client may transmit
982 		 * a keep-alive packet while in SYN_SENT state. This needs to
983 		 * be associated with the original conntrack entry in order to
984 		 * generate a new SYN with the correct sequence number.
985 		 */
986 		if (nfct_synproxy(ct) && old_state == TCP_CONNTRACK_SYN_SENT &&
987 		    index == TCP_ACK_SET && dir == IP_CT_DIR_ORIGINAL &&
988 		    ct->proto.tcp.last_dir == IP_CT_DIR_ORIGINAL &&
989 		    ct->proto.tcp.seen[dir].td_end - 1 == ntohl(th->seq)) {
990 			pr_debug("nf_ct_tcp: SYN proxy client keep alive\n");
991 			spin_unlock_bh(&ct->lock);
992 			return NF_ACCEPT;
993 		}
994 
995 		/* Invalid packet */
996 		pr_debug("nf_ct_tcp: Invalid dir=%i index=%u ostate=%u\n",
997 			 dir, get_conntrack_index(th), old_state);
998 		spin_unlock_bh(&ct->lock);
999 		nf_ct_l4proto_log_invalid(skb, ct, state, "invalid state");
1000 		return -NF_ACCEPT;
1001 	case TCP_CONNTRACK_TIME_WAIT:
		/* RFC5961 compliance causes the stack to send a "challenge-ACK"
		 * e.g. in response to spurious SYNs.  Conntrack MUST
		 * not believe this ACK is acking last FIN.
		 */
1006 		if (old_state == TCP_CONNTRACK_LAST_ACK &&
1007 		    index == TCP_ACK_SET &&
1008 		    ct->proto.tcp.last_dir != dir &&
1009 		    ct->proto.tcp.last_index == TCP_SYN_SET &&
1010 		    (ct->proto.tcp.last_flags & IP_CT_EXP_CHALLENGE_ACK)) {
1011 			/* Detected RFC5961 challenge ACK */
1012 			ct->proto.tcp.last_flags &= ~IP_CT_EXP_CHALLENGE_ACK;
1013 			spin_unlock_bh(&ct->lock);
1014 			nf_ct_l4proto_log_invalid(skb, ct, state, "challenge-ack ignored");
1015 			return NF_ACCEPT; /* Don't change state */
1016 		}
1017 		break;
1018 	case TCP_CONNTRACK_SYN_SENT2:
1019 		/* tcp_conntracks table is not smart enough to handle
1020 		 * simultaneous open.
1021 		 */
1022 		ct->proto.tcp.last_flags |= IP_CT_TCP_SIMULTANEOUS_OPEN;
1023 		break;
1024 	case TCP_CONNTRACK_SYN_RECV:
1025 		if (dir == IP_CT_DIR_REPLY && index == TCP_ACK_SET &&
1026 		    ct->proto.tcp.last_flags & IP_CT_TCP_SIMULTANEOUS_OPEN)
1027 			new_state = TCP_CONNTRACK_ESTABLISHED;
1028 		break;
1029 	case TCP_CONNTRACK_CLOSE:
1030 		if (index != TCP_RST_SET)
1031 			break;
1032 
1033 		if (ct->proto.tcp.seen[!dir].flags & IP_CT_TCP_FLAG_MAXACK_SET) {
1034 			u32 seq = ntohl(th->seq);
1035 
1036 			if (before(seq, ct->proto.tcp.seen[!dir].td_maxack)) {
1037 				/* Invalid RST  */
1038 				spin_unlock_bh(&ct->lock);
1039 				nf_ct_l4proto_log_invalid(skb, ct, state, "invalid rst");
1040 				return -NF_ACCEPT;
1041 			}
1042 
1043 			if (!nf_conntrack_tcp_established(ct) ||
1044 			    seq == ct->proto.tcp.seen[!dir].td_maxack)
1045 				break;
1046 
1047 			/* Check if rst is part of train, such as
1048 			 *   foo:80 > bar:4379: P, 235946583:235946602(19) ack 42
1049 			 *   foo:80 > bar:4379: R, 235946602:235946602(0)  ack 42
1050 			 */
1051 			if (ct->proto.tcp.last_index == TCP_ACK_SET &&
1052 			    ct->proto.tcp.last_dir == dir &&
1053 			    seq == ct->proto.tcp.last_end)
1054 				break;
1055 
1056 			/* ... RST sequence number doesn't match exactly, keep
1057 			 * established state to allow a possible challenge ACK.
1058 			 */
1059 			new_state = old_state;
1060 		}
1061 		if (((test_bit(IPS_SEEN_REPLY_BIT, &ct->status)
1062 			 && ct->proto.tcp.last_index == TCP_SYN_SET)
1063 			|| (!test_bit(IPS_ASSURED_BIT, &ct->status)
1064 			    && ct->proto.tcp.last_index == TCP_ACK_SET))
1065 		    && ntohl(th->ack_seq) == ct->proto.tcp.last_end) {
1066 			/* RST sent to invalid SYN or ACK we had let through
1067 			 * at a) and c) above:
1068 			 *
1069 			 * a) SYN was in window then
1070 			 * c) we hold a half-open connection.
1071 			 *
1072 			 * Delete our connection entry.
1073 			 * We skip window checking, because packet might ACK
1074 			 * segments we ignored. */
1075 			goto in_window;
1076 		}
1077 		break;
1078 	default:
1079 		/* Keep compilers happy. */
1080 		break;
1081 	}
1082 
1083 	if (!tcp_in_window(ct, dir, index,
1084 			   skb, dataoff, th, state)) {
1085 		spin_unlock_bh(&ct->lock);
1086 		return -NF_ACCEPT;
1087 	}
1088      in_window:
1089 	/* From now on we have got in-window packets */
1090 	ct->proto.tcp.last_index = index;
1091 	ct->proto.tcp.last_dir = dir;
1092 
1093 	pr_debug("tcp_conntracks: ");
1094 	nf_ct_dump_tuple(tuple);
1095 	pr_debug("syn=%i ack=%i fin=%i rst=%i old=%i new=%i\n",
1096 		 (th->syn ? 1 : 0), (th->ack ? 1 : 0),
1097 		 (th->fin ? 1 : 0), (th->rst ? 1 : 0),
1098 		 old_state, new_state);
1099 
1100 	ct->proto.tcp.state = new_state;
1101 	if (old_state != new_state
1102 	    && new_state == TCP_CONNTRACK_FIN_WAIT)
1103 		ct->proto.tcp.seen[dir].flags |= IP_CT_TCP_FLAG_CLOSE_INIT;
1104 
1105 	timeouts = nf_ct_timeout_lookup(ct);
1106 	if (!timeouts)
1107 		timeouts = tn->timeouts;
1108 
1109 	if (ct->proto.tcp.retrans >= tn->tcp_max_retrans &&
1110 	    timeouts[new_state] > timeouts[TCP_CONNTRACK_RETRANS])
1111 		timeout = timeouts[TCP_CONNTRACK_RETRANS];
1112 	else if (unlikely(index == TCP_RST_SET))
1113 		timeout = timeouts[TCP_CONNTRACK_CLOSE];
1114 	else if ((ct->proto.tcp.seen[0].flags | ct->proto.tcp.seen[1].flags) &
1115 		 IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED &&
1116 		 timeouts[new_state] > timeouts[TCP_CONNTRACK_UNACK])
1117 		timeout = timeouts[TCP_CONNTRACK_UNACK];
1118 	else if (ct->proto.tcp.last_win == 0 &&
1119 		 timeouts[new_state] > timeouts[TCP_CONNTRACK_RETRANS])
1120 		timeout = timeouts[TCP_CONNTRACK_RETRANS];
1121 	else
1122 		timeout = timeouts[new_state];
1123 	spin_unlock_bh(&ct->lock);
1124 
1125 	if (new_state != old_state)
1126 		nf_conntrack_event_cache(IPCT_PROTOINFO, ct);
1127 
1128 	if (!test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
1129 		/* If only reply is a RST, we can consider ourselves not to
1130 		   have an established connection: this is a fairly common
1131 		   problem case, so we can delete the conntrack
1132 		   immediately.  --RR */
1133 		if (th->rst) {
1134 			nf_ct_kill_acct(ct, ctinfo, skb);
1135 			return NF_ACCEPT;
1136 		}
1137 		/* ESTABLISHED without SEEN_REPLY, i.e. mid-connection
1138 		 * pickup with loose=1. Avoid large ESTABLISHED timeout.
1139 		 */
1140 		if (new_state == TCP_CONNTRACK_ESTABLISHED &&
1141 		    timeout > timeouts[TCP_CONNTRACK_UNACK])
1142 			timeout = timeouts[TCP_CONNTRACK_UNACK];
1143 	} else if (!test_bit(IPS_ASSURED_BIT, &ct->status)
1144 		   && (old_state == TCP_CONNTRACK_SYN_RECV
1145 		       || old_state == TCP_CONNTRACK_ESTABLISHED)
1146 		   && new_state == TCP_CONNTRACK_ESTABLISHED) {
1147 		/* Set ASSURED if we see valid ack in ESTABLISHED
1148 		   after SYN_RECV or a valid answer for a picked up
1149 		   connection. */
1150 		set_bit(IPS_ASSURED_BIT, &ct->status);
1151 		nf_conntrack_event_cache(IPCT_ASSURED, ct);
1152 	}
1153 	nf_ct_refresh_acct(ct, ctinfo, skb, timeout);
1154 
1155 	return NF_ACCEPT;
1156 }
1157 
1158 static bool tcp_can_early_drop(const struct nf_conn *ct)
1159 {
1160 	switch (ct->proto.tcp.state) {
1161 	case TCP_CONNTRACK_FIN_WAIT:
1162 	case TCP_CONNTRACK_LAST_ACK:
1163 	case TCP_CONNTRACK_TIME_WAIT:
1164 	case TCP_CONNTRACK_CLOSE:
1165 	case TCP_CONNTRACK_CLOSE_WAIT:
1166 		return true;
1167 	default:
1168 		break;
1169 	}
1170 
1171 	return false;
1172 }
1173 
1174 #if IS_ENABLED(CONFIG_NF_CT_NETLINK)
1175 
1176 #include <linux/netfilter/nfnetlink.h>
1177 #include <linux/netfilter/nfnetlink_conntrack.h>
1178 
1179 static int tcp_to_nlattr(struct sk_buff *skb, struct nlattr *nla,
1180 			 struct nf_conn *ct, bool destroy)
1181 {
1182 	struct nlattr *nest_parms;
1183 	struct nf_ct_tcp_flags tmp = {};
1184 
1185 	spin_lock_bh(&ct->lock);
1186 	nest_parms = nla_nest_start(skb, CTA_PROTOINFO_TCP);
1187 	if (!nest_parms)
1188 		goto nla_put_failure;
1189 
1190 	if (nla_put_u8(skb, CTA_PROTOINFO_TCP_STATE, ct->proto.tcp.state))
1191 		goto nla_put_failure;
1192 
1193 	if (destroy)
1194 		goto skip_state;
1195 
1196 	if (nla_put_u8(skb, CTA_PROTOINFO_TCP_WSCALE_ORIGINAL,
1197 		       ct->proto.tcp.seen[0].td_scale) ||
1198 	    nla_put_u8(skb, CTA_PROTOINFO_TCP_WSCALE_REPLY,
1199 		       ct->proto.tcp.seen[1].td_scale))
1200 		goto nla_put_failure;
1201 
1202 	tmp.flags = ct->proto.tcp.seen[0].flags;
1203 	if (nla_put(skb, CTA_PROTOINFO_TCP_FLAGS_ORIGINAL,
1204 		    sizeof(struct nf_ct_tcp_flags), &tmp))
1205 		goto nla_put_failure;
1206 
1207 	tmp.flags = ct->proto.tcp.seen[1].flags;
1208 	if (nla_put(skb, CTA_PROTOINFO_TCP_FLAGS_REPLY,
1209 		    sizeof(struct nf_ct_tcp_flags), &tmp))
1210 		goto nla_put_failure;
1211 skip_state:
1212 	spin_unlock_bh(&ct->lock);
1213 	nla_nest_end(skb, nest_parms);
1214 
1215 	return 0;
1216 
1217 nla_put_failure:
1218 	spin_unlock_bh(&ct->lock);
1219 	return -1;
1220 }
1221 
/*
 * Netlink policy for the attributes nested inside CTA_PROTOINFO_TCP; it is
 * handed to nla_parse_nested_deprecated() by nlattr_to_tcp().
 *
 * State and both window-scale attributes are single bytes (NLA_U8).  The
 * per-direction flag attributes carry no explicit type, only a length of
 * sizeof(struct nf_ct_tcp_flags).
 */
static const struct nla_policy tcp_nla_policy[CTA_PROTOINFO_TCP_MAX+1] = {
	[CTA_PROTOINFO_TCP_STATE]	    = { .type = NLA_U8 },
	[CTA_PROTOINFO_TCP_WSCALE_ORIGINAL] = { .type = NLA_U8 },
	[CTA_PROTOINFO_TCP_WSCALE_REPLY]    = { .type = NLA_U8 },
	[CTA_PROTOINFO_TCP_FLAGS_ORIGINAL]  = { .len = sizeof(struct nf_ct_tcp_flags) },
	[CTA_PROTOINFO_TCP_FLAGS_REPLY]	    = { .len = sizeof(struct nf_ct_tcp_flags) },
};
1229 
/*
 * Size published as .nlattr_size in nf_conntrack_l4proto_tcp: the aligned
 * sizes of two one-byte attributes plus two struct nf_ct_tcp_flags
 * attributes.
 *
 * NOTE(review): tcp_to_nlattr() emits a nest header, the state byte, two
 * window-scale bytes and two flag attributes, i.e. more than is summed
 * here -- confirm that the consumer of .nlattr_size accounts for the
 * difference.
 */
#define TCP_NLATTR_SIZE	( \
	NLA_ALIGN(NLA_HDRLEN + 1) + \
	NLA_ALIGN(NLA_HDRLEN + 1) + \
	NLA_ALIGN(NLA_HDRLEN + sizeof(struct nf_ct_tcp_flags)) + \
	NLA_ALIGN(NLA_HDRLEN + sizeof(struct nf_ct_tcp_flags)))
1235 
1236 static int nlattr_to_tcp(struct nlattr *cda[], struct nf_conn *ct)
1237 {
1238 	struct nlattr *pattr = cda[CTA_PROTOINFO_TCP];
1239 	struct nlattr *tb[CTA_PROTOINFO_TCP_MAX+1];
1240 	int err;
1241 
1242 	/* updates could not contain anything about the private
1243 	 * protocol info, in that case skip the parsing */
1244 	if (!pattr)
1245 		return 0;
1246 
1247 	err = nla_parse_nested_deprecated(tb, CTA_PROTOINFO_TCP_MAX, pattr,
1248 					  tcp_nla_policy, NULL);
1249 	if (err < 0)
1250 		return err;
1251 
1252 	if (tb[CTA_PROTOINFO_TCP_STATE] &&
1253 	    nla_get_u8(tb[CTA_PROTOINFO_TCP_STATE]) >= TCP_CONNTRACK_MAX)
1254 		return -EINVAL;
1255 
1256 	spin_lock_bh(&ct->lock);
1257 	if (tb[CTA_PROTOINFO_TCP_STATE])
1258 		ct->proto.tcp.state = nla_get_u8(tb[CTA_PROTOINFO_TCP_STATE]);
1259 
1260 	if (tb[CTA_PROTOINFO_TCP_FLAGS_ORIGINAL]) {
1261 		struct nf_ct_tcp_flags *attr =
1262 			nla_data(tb[CTA_PROTOINFO_TCP_FLAGS_ORIGINAL]);
1263 		ct->proto.tcp.seen[0].flags &= ~attr->mask;
1264 		ct->proto.tcp.seen[0].flags |= attr->flags & attr->mask;
1265 	}
1266 
1267 	if (tb[CTA_PROTOINFO_TCP_FLAGS_REPLY]) {
1268 		struct nf_ct_tcp_flags *attr =
1269 			nla_data(tb[CTA_PROTOINFO_TCP_FLAGS_REPLY]);
1270 		ct->proto.tcp.seen[1].flags &= ~attr->mask;
1271 		ct->proto.tcp.seen[1].flags |= attr->flags & attr->mask;
1272 	}
1273 
1274 	if (tb[CTA_PROTOINFO_TCP_WSCALE_ORIGINAL] &&
1275 	    tb[CTA_PROTOINFO_TCP_WSCALE_REPLY] &&
1276 	    ct->proto.tcp.seen[0].flags & IP_CT_TCP_FLAG_WINDOW_SCALE &&
1277 	    ct->proto.tcp.seen[1].flags & IP_CT_TCP_FLAG_WINDOW_SCALE) {
1278 		ct->proto.tcp.seen[0].td_scale =
1279 			nla_get_u8(tb[CTA_PROTOINFO_TCP_WSCALE_ORIGINAL]);
1280 		ct->proto.tcp.seen[1].td_scale =
1281 			nla_get_u8(tb[CTA_PROTOINFO_TCP_WSCALE_REPLY]);
1282 	}
1283 	spin_unlock_bh(&ct->lock);
1284 
1285 	return 0;
1286 }
1287 
1288 static unsigned int tcp_nlattr_tuple_size(void)
1289 {
1290 	static unsigned int size __read_mostly;
1291 
1292 	if (!size)
1293 		size = nla_policy_len(nf_ct_port_nla_policy, CTA_PROTO_MAX + 1);
1294 
1295 	return size;
1296 }
1297 #endif
1298 
1299 #ifdef CONFIG_NF_CONNTRACK_TIMEOUT
1300 
1301 #include <linux/netfilter/nfnetlink.h>
1302 #include <linux/netfilter/nfnetlink_cttimeout.h>
1303 
1304 static int tcp_timeout_nlattr_to_obj(struct nlattr *tb[],
1305 				     struct net *net, void *data)
1306 {
1307 	struct nf_tcp_net *tn = nf_tcp_pernet(net);
1308 	unsigned int *timeouts = data;
1309 	int i;
1310 
1311 	if (!timeouts)
1312 		timeouts = tn->timeouts;
1313 	/* set default TCP timeouts. */
1314 	for (i=0; i<TCP_CONNTRACK_TIMEOUT_MAX; i++)
1315 		timeouts[i] = tn->timeouts[i];
1316 
1317 	if (tb[CTA_TIMEOUT_TCP_SYN_SENT]) {
1318 		timeouts[TCP_CONNTRACK_SYN_SENT] =
1319 			ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_SYN_SENT]))*HZ;
1320 	}
1321 
1322 	if (tb[CTA_TIMEOUT_TCP_SYN_RECV]) {
1323 		timeouts[TCP_CONNTRACK_SYN_RECV] =
1324 			ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_SYN_RECV]))*HZ;
1325 	}
1326 	if (tb[CTA_TIMEOUT_TCP_ESTABLISHED]) {
1327 		timeouts[TCP_CONNTRACK_ESTABLISHED] =
1328 			ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_ESTABLISHED]))*HZ;
1329 	}
1330 	if (tb[CTA_TIMEOUT_TCP_FIN_WAIT]) {
1331 		timeouts[TCP_CONNTRACK_FIN_WAIT] =
1332 			ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_FIN_WAIT]))*HZ;
1333 	}
1334 	if (tb[CTA_TIMEOUT_TCP_CLOSE_WAIT]) {
1335 		timeouts[TCP_CONNTRACK_CLOSE_WAIT] =
1336 			ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_CLOSE_WAIT]))*HZ;
1337 	}
1338 	if (tb[CTA_TIMEOUT_TCP_LAST_ACK]) {
1339 		timeouts[TCP_CONNTRACK_LAST_ACK] =
1340 			ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_LAST_ACK]))*HZ;
1341 	}
1342 	if (tb[CTA_TIMEOUT_TCP_TIME_WAIT]) {
1343 		timeouts[TCP_CONNTRACK_TIME_WAIT] =
1344 			ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_TIME_WAIT]))*HZ;
1345 	}
1346 	if (tb[CTA_TIMEOUT_TCP_CLOSE]) {
1347 		timeouts[TCP_CONNTRACK_CLOSE] =
1348 			ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_CLOSE]))*HZ;
1349 	}
1350 	if (tb[CTA_TIMEOUT_TCP_SYN_SENT2]) {
1351 		timeouts[TCP_CONNTRACK_SYN_SENT2] =
1352 			ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_SYN_SENT2]))*HZ;
1353 	}
1354 	if (tb[CTA_TIMEOUT_TCP_RETRANS]) {
1355 		timeouts[TCP_CONNTRACK_RETRANS] =
1356 			ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_RETRANS]))*HZ;
1357 	}
1358 	if (tb[CTA_TIMEOUT_TCP_UNACK]) {
1359 		timeouts[TCP_CONNTRACK_UNACK] =
1360 			ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_UNACK]))*HZ;
1361 	}
1362 
1363 	timeouts[CTA_TIMEOUT_TCP_UNSPEC] = timeouts[CTA_TIMEOUT_TCP_SYN_SENT];
1364 	return 0;
1365 }
1366 
1367 static int
1368 tcp_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data)
1369 {
1370 	const unsigned int *timeouts = data;
1371 
1372 	if (nla_put_be32(skb, CTA_TIMEOUT_TCP_SYN_SENT,
1373 			htonl(timeouts[TCP_CONNTRACK_SYN_SENT] / HZ)) ||
1374 	    nla_put_be32(skb, CTA_TIMEOUT_TCP_SYN_RECV,
1375 			 htonl(timeouts[TCP_CONNTRACK_SYN_RECV] / HZ)) ||
1376 	    nla_put_be32(skb, CTA_TIMEOUT_TCP_ESTABLISHED,
1377 			 htonl(timeouts[TCP_CONNTRACK_ESTABLISHED] / HZ)) ||
1378 	    nla_put_be32(skb, CTA_TIMEOUT_TCP_FIN_WAIT,
1379 			 htonl(timeouts[TCP_CONNTRACK_FIN_WAIT] / HZ)) ||
1380 	    nla_put_be32(skb, CTA_TIMEOUT_TCP_CLOSE_WAIT,
1381 			 htonl(timeouts[TCP_CONNTRACK_CLOSE_WAIT] / HZ)) ||
1382 	    nla_put_be32(skb, CTA_TIMEOUT_TCP_LAST_ACK,
1383 			 htonl(timeouts[TCP_CONNTRACK_LAST_ACK] / HZ)) ||
1384 	    nla_put_be32(skb, CTA_TIMEOUT_TCP_TIME_WAIT,
1385 			 htonl(timeouts[TCP_CONNTRACK_TIME_WAIT] / HZ)) ||
1386 	    nla_put_be32(skb, CTA_TIMEOUT_TCP_CLOSE,
1387 			 htonl(timeouts[TCP_CONNTRACK_CLOSE] / HZ)) ||
1388 	    nla_put_be32(skb, CTA_TIMEOUT_TCP_SYN_SENT2,
1389 			 htonl(timeouts[TCP_CONNTRACK_SYN_SENT2] / HZ)) ||
1390 	    nla_put_be32(skb, CTA_TIMEOUT_TCP_RETRANS,
1391 			 htonl(timeouts[TCP_CONNTRACK_RETRANS] / HZ)) ||
1392 	    nla_put_be32(skb, CTA_TIMEOUT_TCP_UNACK,
1393 			 htonl(timeouts[TCP_CONNTRACK_UNACK] / HZ)))
1394 		goto nla_put_failure;
1395 	return 0;
1396 
1397 nla_put_failure:
1398 	return -ENOSPC;
1399 }
1400 
/*
 * Netlink policy for the CTA_TIMEOUT_TCP_* attributes, published as
 * .ctnl_timeout.nla_policy in nf_conntrack_l4proto_tcp.  Every timeout is a
 * 32-bit value; tcp_timeout_nlattr_to_obj() reads them as big-endian and
 * scales them by HZ.
 */
static const struct nla_policy tcp_timeout_nla_policy[CTA_TIMEOUT_TCP_MAX+1] = {
	[CTA_TIMEOUT_TCP_SYN_SENT]	= { .type = NLA_U32 },
	[CTA_TIMEOUT_TCP_SYN_RECV]	= { .type = NLA_U32 },
	[CTA_TIMEOUT_TCP_ESTABLISHED]	= { .type = NLA_U32 },
	[CTA_TIMEOUT_TCP_FIN_WAIT]	= { .type = NLA_U32 },
	[CTA_TIMEOUT_TCP_CLOSE_WAIT]	= { .type = NLA_U32 },
	[CTA_TIMEOUT_TCP_LAST_ACK]	= { .type = NLA_U32 },
	[CTA_TIMEOUT_TCP_TIME_WAIT]	= { .type = NLA_U32 },
	[CTA_TIMEOUT_TCP_CLOSE]		= { .type = NLA_U32 },
	[CTA_TIMEOUT_TCP_SYN_SENT2]	= { .type = NLA_U32 },
	[CTA_TIMEOUT_TCP_RETRANS]	= { .type = NLA_U32 },
	[CTA_TIMEOUT_TCP_UNACK]		= { .type = NLA_U32 },
};
1414 #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
1415 
1416 void nf_conntrack_tcp_init_net(struct net *net)
1417 {
1418 	struct nf_tcp_net *tn = nf_tcp_pernet(net);
1419 	int i;
1420 
1421 	for (i = 0; i < TCP_CONNTRACK_TIMEOUT_MAX; i++)
1422 		tn->timeouts[i] = tcp_timeouts[i];
1423 
1424 	/* timeouts[0] is unused, make it same as SYN_SENT so
1425 	 * ->timeouts[0] contains 'new' timeout, like udp or icmp.
1426 	 */
1427 	tn->timeouts[0] = tcp_timeouts[TCP_CONNTRACK_SYN_SENT];
1428 
1429 	/* If it is set to zero, we disable picking up already established
1430 	 * connections.
1431 	 */
1432 	tn->tcp_loose = 1;
1433 
1434 	/* "Be conservative in what you do,
1435 	 *  be liberal in what you accept from others."
1436 	 * If it's non-zero, we mark only out of window RST segments as INVALID.
1437 	 */
1438 	tn->tcp_be_liberal = 0;
1439 
1440 	/* Max number of the retransmitted packets without receiving an (acceptable)
1441 	 * ACK from the destination. If this number is reached, a shorter timer
1442 	 * will be started.
1443 	 */
1444 	tn->tcp_max_retrans = 3;
1445 
1446 #if IS_ENABLED(CONFIG_NF_FLOW_TABLE)
1447 	tn->offload_timeout = 30 * HZ;
1448 	tn->offload_pickup = 120 * HZ;
1449 #endif
1450 }
1451 
/*
 * Layer-4 protocol descriptor for TCP (IPPROTO_TCP).  It wires up the
 * callbacks defined in this file; which ones are present depends on the
 * kernel configuration:
 *  - CONFIG_NF_CONNTRACK_PROCFS:  textual state printing
 *  - CONFIG_NF_CT_NETLINK:        netlink dump/parse of TCP state and of
 *                                 port-based tuples
 *  - CONFIG_NF_CONNTRACK_TIMEOUT: timeout-policy objects holding
 *                                 TCP_CONNTRACK_TIMEOUT_MAX unsigned ints
 */
const struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp =
{
	.l4proto 		= IPPROTO_TCP,
#ifdef CONFIG_NF_CONNTRACK_PROCFS
	.print_conntrack 	= tcp_print_conntrack,
#endif
	.can_early_drop		= tcp_can_early_drop,
#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
	.to_nlattr		= tcp_to_nlattr,
	.from_nlattr		= nlattr_to_tcp,
	.tuple_to_nlattr	= nf_ct_port_tuple_to_nlattr,
	.nlattr_to_tuple	= nf_ct_port_nlattr_to_tuple,
	.nlattr_tuple_size	= tcp_nlattr_tuple_size,
	.nlattr_size		= TCP_NLATTR_SIZE,
	.nla_policy		= nf_ct_port_nla_policy,
#endif
#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
	.ctnl_timeout		= {
		.nlattr_to_obj	= tcp_timeout_nlattr_to_obj,
		.obj_to_nlattr	= tcp_timeout_obj_to_nlattr,
		.nlattr_max	= CTA_TIMEOUT_TCP_MAX,
		/* one unsigned int per timeout slot */
		.obj_size	= sizeof(unsigned int) *
					TCP_CONNTRACK_TIMEOUT_MAX,
		.nla_policy	= tcp_timeout_nla_policy,
	},
#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
};
1479