1 // SPDX-License-Identifier: GPL-2.0-only
2 /* (C) 1999-2001 Paul `Rusty' Russell
3  * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
4  * (C) 2002-2013 Jozsef Kadlecsik <kadlec@netfilter.org>
5  * (C) 2006-2012 Patrick McHardy <kaber@trash.net>
6  */
7 
8 #include <linux/types.h>
9 #include <linux/timer.h>
10 #include <linux/module.h>
11 #include <linux/in.h>
12 #include <linux/tcp.h>
13 #include <linux/spinlock.h>
14 #include <linux/skbuff.h>
15 #include <linux/ipv6.h>
16 #include <net/ip6_checksum.h>
17 #include <asm/unaligned.h>
18 
19 #include <net/tcp.h>
20 
21 #include <linux/netfilter.h>
22 #include <linux/netfilter_ipv4.h>
23 #include <linux/netfilter_ipv6.h>
24 #include <net/netfilter/nf_conntrack.h>
25 #include <net/netfilter/nf_conntrack_l4proto.h>
26 #include <net/netfilter/nf_conntrack_ecache.h>
27 #include <net/netfilter/nf_conntrack_seqadj.h>
28 #include <net/netfilter/nf_conntrack_synproxy.h>
29 #include <net/netfilter/nf_conntrack_timeout.h>
30 #include <net/netfilter/nf_log.h>
31 #include <net/netfilter/ipv4/nf_conntrack_ipv4.h>
32 #include <net/netfilter/ipv6/nf_conntrack_ipv6.h>
33 
34   /* FIXME: Examine ipfilter's timeouts and conntrack transitions more
35      closely.  They're more complex. --RR */
36 
37 static const char *const tcp_conntrack_names[] = {
38 	"NONE",
39 	"SYN_SENT",
40 	"SYN_RECV",
41 	"ESTABLISHED",
42 	"FIN_WAIT",
43 	"CLOSE_WAIT",
44 	"LAST_ACK",
45 	"TIME_WAIT",
46 	"CLOSE",
47 	"SYN_SENT2",
48 };
49 
50 #define SECS * HZ
51 #define MINS * 60 SECS
52 #define HOURS * 60 MINS
53 #define DAYS * 24 HOURS
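/* Illustrative expansion only (nothing below depends on these examples):
 * the postfix macros above turn "<n> UNIT" into a jiffies count, e.g.
 *
 *	2 MINS  ==>  2 * 60 * HZ
 *	5 DAYS  ==>  5 * 24 * 60 * 60 * HZ
 *
 * so each entry in the timeout table below is a plain jiffies value.
 */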
54 
55 static const unsigned int tcp_timeouts[TCP_CONNTRACK_TIMEOUT_MAX] = {
56 	[TCP_CONNTRACK_SYN_SENT]	= 2 MINS,
57 	[TCP_CONNTRACK_SYN_RECV]	= 60 SECS,
58 	[TCP_CONNTRACK_ESTABLISHED]	= 5 DAYS,
59 	[TCP_CONNTRACK_FIN_WAIT]	= 2 MINS,
60 	[TCP_CONNTRACK_CLOSE_WAIT]	= 60 SECS,
61 	[TCP_CONNTRACK_LAST_ACK]	= 30 SECS,
62 	[TCP_CONNTRACK_TIME_WAIT]	= 2 MINS,
63 	[TCP_CONNTRACK_CLOSE]		= 10 SECS,
64 	[TCP_CONNTRACK_SYN_SENT2]	= 2 MINS,
65 /* RFC1122 says the R2 limit should be at least 100 seconds.
66    Linux uses 15 packets as limit, which corresponds
67    to ~13-30min depending on RTO. */
68 	[TCP_CONNTRACK_RETRANS]		= 5 MINS,
69 	[TCP_CONNTRACK_UNACK]		= 5 MINS,
70 };
71 
72 #define sNO TCP_CONNTRACK_NONE
73 #define sSS TCP_CONNTRACK_SYN_SENT
74 #define sSR TCP_CONNTRACK_SYN_RECV
75 #define sES TCP_CONNTRACK_ESTABLISHED
76 #define sFW TCP_CONNTRACK_FIN_WAIT
77 #define sCW TCP_CONNTRACK_CLOSE_WAIT
78 #define sLA TCP_CONNTRACK_LAST_ACK
79 #define sTW TCP_CONNTRACK_TIME_WAIT
80 #define sCL TCP_CONNTRACK_CLOSE
81 #define sS2 TCP_CONNTRACK_SYN_SENT2
82 #define sIV TCP_CONNTRACK_MAX
83 #define sIG TCP_CONNTRACK_IGNORE
84 
85 /* Which of RST/SYN/SYN-ACK/FIN/ACK (or none of them) is set in the segment. */
86 enum tcp_bit_set {
87 	TCP_SYN_SET,
88 	TCP_SYNACK_SET,
89 	TCP_FIN_SET,
90 	TCP_ACK_SET,
91 	TCP_RST_SET,
92 	TCP_NONE_SET,
93 };
94 
95 /*
96  * The TCP state transition table needs a few words...
97  *
98  * We are the man in the middle. All the packets go through us
99  * but might get lost in transit to the destination.
100  * It is assumed that the destinations can't receive segments
101  * we haven't seen.
102  *
103  * The checked segment is in window, but our windows are *not*
104  * equivalent with the ones of the sender/receiver. We always
105  * try to guess the state of the current sender.
106  *
107  * The meaning of the states are:
108  *
109  * NONE:	initial state
110  * SYN_SENT:	SYN-only packet seen
111  * SYN_SENT2:	SYN-only packet seen from reply dir, simultaneous open
112  * SYN_RECV:	SYN-ACK packet seen
113  * ESTABLISHED:	ACK packet seen
114  * FIN_WAIT:	FIN packet seen
115  * CLOSE_WAIT:	ACK seen (after FIN)
116  * LAST_ACK:	FIN seen (after FIN)
117  * TIME_WAIT:	last ACK seen
118  * CLOSE:	closed connection (RST)
119  *
120  * Packets marked as IGNORED (sIG):
121  *	they may be either invalid or valid
122  *	and the receiver may send back a connection
123  *	closing RST or a SYN/ACK.
124  *
125  * Packets marked as INVALID (sIV):
126  *	we regard them as truly invalid packets
127  */
128 static const u8 tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = {
129 	{
130 /* ORIGINAL */
131 /* 	     sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2	*/
132 /*syn*/	   { sSS, sSS, sIG, sIG, sIG, sIG, sIG, sSS, sSS, sS2 },
133 /*
134  *	sNO -> sSS	Initialize a new connection
135  *	sSS -> sSS	Retransmitted SYN
136  *	sS2 -> sS2	Late retransmitted SYN
137  *	sSR -> sIG
138  *	sES -> sIG	Error: SYNs in window outside the SYN_SENT state
139  *			are errors. Receiver will reply with RST
140  *			and close the connection.
141  *			Or we are not in sync and hold a dead connection.
142  *	sFW -> sIG
143  *	sCW -> sIG
144  *	sLA -> sIG
145  *	sTW -> sSS	Reopened connection (RFC 1122).
146  *	sCL -> sSS
147  */
148 /* 	     sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2	*/
149 /*synack*/ { sIV, sIV, sSR, sIV, sIV, sIV, sIV, sIV, sIV, sSR },
150 /*
151  *	sNO -> sIV	Too late and no reason to do anything
152  *	sSS -> sIV	Client can't send SYN and then SYN/ACK
153  *	sS2 -> sSR	SYN/ACK sent to SYN2 in simultaneous open
154  *	sSR -> sSR	Late retransmitted SYN/ACK in simultaneous open
155  *	sES -> sIV	Invalid SYN/ACK packets sent by the client
156  *	sFW -> sIV
157  *	sCW -> sIV
158  *	sLA -> sIV
159  *	sTW -> sIV
160  *	sCL -> sIV
161  */
162 /* 	     sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2	*/
163 /*fin*/    { sIV, sIV, sFW, sFW, sLA, sLA, sLA, sTW, sCL, sIV },
164 /*
165  *	sNO -> sIV	Too late and no reason to do anything...
166  *	sSS -> sIV	Client might not send FIN in this state:
167  *			we enforce waiting for a SYN/ACK reply first.
168  *	sS2 -> sIV
169  *	sSR -> sFW	Close started.
170  *	sES -> sFW
171  *	sFW -> sLA	FIN seen in both directions, waiting for
172  *			the last ACK.
173  *			Might be a retransmitted FIN as well...
174  *	sCW -> sLA
175  *	sLA -> sLA	Retransmitted FIN. Remain in the same state.
176  *	sTW -> sTW
177  *	sCL -> sCL
178  */
179 /* 	     sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2	*/
180 /*ack*/	   { sES, sIV, sES, sES, sCW, sCW, sTW, sTW, sCL, sIV },
181 /*
182  *	sNO -> sES	Assumed.
183  *	sSS -> sIV	ACK is invalid: we haven't seen a SYN/ACK yet.
184  *	sS2 -> sIV
185  *	sSR -> sES	Established state is reached.
186  *	sES -> sES	:-)
187  *	sFW -> sCW	Normal close request answered by ACK.
188  *	sCW -> sCW
189  *	sLA -> sTW	Last ACK detected (or an RFC5961 challenge ACK)
190  *	sTW -> sTW	Retransmitted last ACK. Remain in the same state.
191  *	sCL -> sCL
192  */
193 /* 	     sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2	*/
194 /*rst*/    { sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL },
195 /*none*/   { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV }
196 	},
197 	{
198 /* REPLY */
199 /* 	     sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2	*/
200 /*syn*/	   { sIV, sS2, sIV, sIV, sIV, sIV, sIV, sSS, sIV, sS2 },
201 /*
202  *	sNO -> sIV	Never reached.
203  *	sSS -> sS2	Simultaneous open
204  *	sS2 -> sS2	Retransmitted simultaneous SYN
205  *	sSR -> sIV	Invalid SYN packets sent by the server
206  *	sES -> sIV
207  *	sFW -> sIV
208  *	sCW -> sIV
209  *	sLA -> sIV
210  *	sTW -> sSS	Reopened connection, but server may have switched role
211  *	sCL -> sIV
212  */
213 /* 	     sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2	*/
214 /*synack*/ { sIV, sSR, sIG, sIG, sIG, sIG, sIG, sIG, sIG, sSR },
215 /*
216  *	sSS -> sSR	Standard open.
217  *	sS2 -> sSR	Simultaneous open
218  *	sSR -> sIG	Retransmitted SYN/ACK, ignore it.
219  *	sES -> sIG	Late retransmitted SYN/ACK?
220  *	sFW -> sIG	Might be SYN/ACK answering ignored SYN
221  *	sCW -> sIG
222  *	sLA -> sIG
223  *	sTW -> sIG
224  *	sCL -> sIG
225  */
226 /* 	     sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2	*/
227 /*fin*/    { sIV, sIV, sFW, sFW, sLA, sLA, sLA, sTW, sCL, sIV },
228 /*
229  *	sSS -> sIV	Server might not send FIN in this state.
230  *	sS2 -> sIV
231  *	sSR -> sFW	Close started.
232  *	sES -> sFW
233  *	sFW -> sLA	FIN seen in both directions.
234  *	sCW -> sLA
235  *	sLA -> sLA	Retransmitted FIN.
236  *	sTW -> sTW
237  *	sCL -> sCL
238  */
239 /* 	     sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2	*/
240 /*ack*/	   { sIV, sIG, sSR, sES, sCW, sCW, sTW, sTW, sCL, sIG },
241 /*
242  *	sSS -> sIG	Might be a half-open connection.
243  *	sS2 -> sIG
244  *	sSR -> sSR	Might answer late resent SYN.
245  *	sES -> sES	:-)
246  *	sFW -> sCW	Normal close request answered by ACK.
247  *	sCW -> sCW
248  *	sLA -> sTW	Last ACK detected (or an RFC5961 challenge ACK)
249  *	sTW -> sTW	Retransmitted last ACK.
250  *	sCL -> sCL
251  */
252 /* 	     sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2	*/
253 /*rst*/    { sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL },
254 /*none*/   { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV }
255 	}
256 };
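/* Illustrative lookup, mirroring how nf_conntrack_tcp_packet() below indexes
 * the table:
 *
 *	new_state = tcp_conntracks[dir][get_conntrack_index(th)][old_state];
 *
 * e.g. a SYN/ACK seen in the REPLY direction while the connection is in
 * SYN_SENT selects tcp_conntracks[IP_CT_DIR_REPLY][TCP_SYNACK_SET][sSS],
 * which is sSR (standard open).
 */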
257 
258 #ifdef CONFIG_NF_CONNTRACK_PROCFS
259 /* Print out the private part of the conntrack. */
260 static void tcp_print_conntrack(struct seq_file *s, struct nf_conn *ct)
261 {
262 	if (test_bit(IPS_OFFLOAD_BIT, &ct->status))
263 		return;
264 
265 	seq_printf(s, "%s ", tcp_conntrack_names[ct->proto.tcp.state]);
266 }
267 #endif
268 
269 static unsigned int get_conntrack_index(const struct tcphdr *tcph)
270 {
271 	if (tcph->rst) return TCP_RST_SET;
272 	else if (tcph->syn) return (tcph->ack ? TCP_SYNACK_SET : TCP_SYN_SET);
273 	else if (tcph->fin) return TCP_FIN_SET;
274 	else if (tcph->ack) return TCP_ACK_SET;
275 	else return TCP_NONE_SET;
276 }
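/* Note that the tests above are ordered: a FIN|ACK segment maps to
 * TCP_FIN_SET and a RST|ACK segment to TCP_RST_SET, since RST, SYN and FIN
 * take precedence over the ACK bit.
 */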
277 
278 /* TCP connection tracking based on 'Real Stateful TCP Packet Filtering
279    in IP Filter' by Guido van Rooij.
280 
281    http://www.sane.nl/events/sane2000/papers.html
282    http://www.darkart.com/mirrors/www.obfuscation.org/ipf/
283 
284    The boundaries and the conditions are changed according to RFC793:
285    the packet must intersect the window (i.e. segments may be
286    after the right or before the left edge) and thus receivers may ACK
287    segments after the right edge of the window.
288 
289 	td_maxend = max(sack + max(win,1)) seen in reply packets
290 	td_maxwin = max(max(win, 1)) + (sack - ack) seen in sent packets
291 	td_maxwin += seq + len - sender.td_maxend
292 			if seq + len > sender.td_maxend
293 	td_end    = max(seq + len) seen in sent packets
294 
295    I.   Upper bound for valid data:	seq <= sender.td_maxend
296    II.  Lower bound for valid data:	seq + len >= sender.td_end - receiver.td_maxwin
297    III.	Upper bound for valid (s)ack:   sack <= receiver.td_end
298    IV.	Lower bound for valid (s)ack:	sack >= receiver.td_end - MAXACKWINDOW
299 
300    where sack is the highest right edge of sack block found in the packet
301    or ack in the case of packet without SACK option.
302 
303    The upper bound limit for a valid (s)ack is not ignored -
304    we don't have to deal with fragments.
305 */
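/* Worked example with made-up numbers (illustration only): suppose
 * sender.td_end = 1500, sender.td_maxend = 2000, receiver.td_end = 900,
 * receiver.td_maxwin = 500 and no SACK option is present.  A segment with
 * seq = 1400, len = 100 and ack = 900 then satisfies all four checks:
 *
 *	I.   1400 <= 2000
 *	II.  1400 + 100 >= 1500 - 500
 *	III. 900 <= 900
 *	IV.  900 >= 900 - MAXACKWINDOW(sender)
 *
 * whereas e.g. seq = 2100 would already violate I.
 */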
306 
307 static inline __u32 segment_seq_plus_len(__u32 seq,
308 					 size_t len,
309 					 unsigned int dataoff,
310 					 const struct tcphdr *tcph)
311 {
312 	/* XXX Should I use payload length field in IP/IPv6 header ?
313 	 * - YK */
314 	return (seq + len - dataoff - tcph->doff*4
315 		+ (tcph->syn ? 1 : 0) + (tcph->fin ? 1 : 0));
316 }
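/* For instance, a bare SYN carrying no payload ends one past its sequence
 * number: skb->len - dataoff equals tcph->doff * 4, so the helper above
 * returns seq + 1, accounting for the sequence space consumed by the SYN
 * flag itself (and likewise +1 for FIN).
 */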
317 
318 /* Fixme: what about big packets? */
319 #define MAXACKWINCONST			66000
320 #define MAXACKWINDOW(sender)						\
321 	((sender)->td_maxwin > MAXACKWINCONST ? (sender)->td_maxwin	\
322 					      : MAXACKWINCONST)
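/* The 66000 floor is presumably a little more than the largest unscaled TCP
 * window (65535): it keeps check IV from becoming overly strict while the
 * sender has only advertised small windows; once larger windows have been
 * observed, td_maxwin itself is used.
 */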
323 
324 /*
325  * Simplified tcp_parse_options routine from tcp_input.c
326  */
327 static void tcp_options(const struct sk_buff *skb,
328 			unsigned int dataoff,
329 			const struct tcphdr *tcph,
330 			struct ip_ct_tcp_state *state)
331 {
332 	unsigned char buff[(15 * 4) - sizeof(struct tcphdr)];
333 	const unsigned char *ptr;
334 	int length = (tcph->doff*4) - sizeof(struct tcphdr);
335 
336 	if (!length)
337 		return;
338 
339 	ptr = skb_header_pointer(skb, dataoff + sizeof(struct tcphdr),
340 				 length, buff);
341 	if (!ptr)
342 		return;
343 
344 	state->td_scale =
345 	state->flags = 0;
346 
347 	while (length > 0) {
348 		int opcode = *ptr++;
349 		int opsize;
350 
351 		switch (opcode) {
352 		case TCPOPT_EOL:
353 			return;
354 		case TCPOPT_NOP:	/* Ref: RFC 793 section 3.1 */
355 			length--;
356 			continue;
357 		default:
358 			if (length < 2)
359 				return;
360 			opsize = *ptr++;
361 			if (opsize < 2) /* "silly options" */
362 				return;
363 			if (opsize > length)
364 				return;	/* don't parse partial options */
365 
366 			if (opcode == TCPOPT_SACK_PERM
367 			    && opsize == TCPOLEN_SACK_PERM)
368 				state->flags |= IP_CT_TCP_FLAG_SACK_PERM;
369 			else if (opcode == TCPOPT_WINDOW
370 				 && opsize == TCPOLEN_WINDOW) {
371 				state->td_scale = *(u_int8_t *)ptr;
372 
373 				if (state->td_scale > TCP_MAX_WSCALE)
374 					state->td_scale = TCP_MAX_WSCALE;
375 
376 				state->flags |=
377 					IP_CT_TCP_FLAG_WINDOW_SCALE;
378 			}
379 			ptr += opsize - 2;
380 			length -= opsize;
381 		}
382 	}
383 }
384 
385 static void tcp_sack(const struct sk_buff *skb, unsigned int dataoff,
386                      const struct tcphdr *tcph, __u32 *sack)
387 {
388 	unsigned char buff[(15 * 4) - sizeof(struct tcphdr)];
389 	const unsigned char *ptr;
390 	int length = (tcph->doff*4) - sizeof(struct tcphdr);
391 	__u32 tmp;
392 
393 	if (!length)
394 		return;
395 
396 	ptr = skb_header_pointer(skb, dataoff + sizeof(struct tcphdr),
397 				 length, buff);
398 	if (!ptr)
399 		return;
400 
401 	/* Fast path for timestamp-only option */
402 	if (length == TCPOLEN_TSTAMP_ALIGNED
403 	    && *(__be32 *)ptr == htonl((TCPOPT_NOP << 24)
404 				       | (TCPOPT_NOP << 16)
405 				       | (TCPOPT_TIMESTAMP << 8)
406 				       | TCPOLEN_TIMESTAMP))
407 		return;
408 
409 	while (length > 0) {
410 		int opcode = *ptr++;
411 		int opsize, i;
412 
413 		switch (opcode) {
414 		case TCPOPT_EOL:
415 			return;
416 		case TCPOPT_NOP:	/* Ref: RFC 793 section 3.1 */
417 			length--;
418 			continue;
419 		default:
420 			if (length < 2)
421 				return;
422 			opsize = *ptr++;
423 			if (opsize < 2) /* "silly options" */
424 				return;
425 			if (opsize > length)
426 				return;	/* don't parse partial options */
427 
428 			if (opcode == TCPOPT_SACK
429 			    && opsize >= (TCPOLEN_SACK_BASE
430 					  + TCPOLEN_SACK_PERBLOCK)
431 			    && !((opsize - TCPOLEN_SACK_BASE)
432 				 % TCPOLEN_SACK_PERBLOCK)) {
433 				for (i = 0;
434 				     i < (opsize - TCPOLEN_SACK_BASE);
435 				     i += TCPOLEN_SACK_PERBLOCK) {
436 					tmp = get_unaligned_be32((__be32 *)(ptr+i)+1);
437 
438 					if (after(tmp, *sack))
439 						*sack = tmp;
440 				}
441 				return;
442 			}
443 			ptr += opsize - 2;
444 			length -= opsize;
445 		}
446 	}
447 }
448 
449 static bool tcp_in_window(struct nf_conn *ct,
450 			  enum ip_conntrack_dir dir,
451 			  unsigned int index,
452 			  const struct sk_buff *skb,
453 			  unsigned int dataoff,
454 			  const struct tcphdr *tcph,
455 			  const struct nf_hook_state *hook_state)
456 {
457 	struct ip_ct_tcp *state = &ct->proto.tcp;
458 	struct net *net = nf_ct_net(ct);
459 	struct nf_tcp_net *tn = nf_tcp_pernet(net);
460 	struct ip_ct_tcp_state *sender = &state->seen[dir];
461 	struct ip_ct_tcp_state *receiver = &state->seen[!dir];
462 	const struct nf_conntrack_tuple *tuple = &ct->tuplehash[dir].tuple;
463 	__u32 seq, ack, sack, end, win, swin;
464 	u16 win_raw;
465 	s32 receiver_offset;
466 	bool res, in_recv_win;
467 
468 	/*
469 	 * Get the required data from the packet.
470 	 */
471 	seq = ntohl(tcph->seq);
472 	ack = sack = ntohl(tcph->ack_seq);
473 	win_raw = ntohs(tcph->window);
474 	win = win_raw;
475 	end = segment_seq_plus_len(seq, skb->len, dataoff, tcph);
476 
477 	if (receiver->flags & IP_CT_TCP_FLAG_SACK_PERM)
478 		tcp_sack(skb, dataoff, tcph, &sack);
479 
480 	/* Take into account NAT sequence number mangling */
481 	receiver_offset = nf_ct_seq_offset(ct, !dir, ack - 1);
482 	ack -= receiver_offset;
483 	sack -= receiver_offset;
484 
485 	pr_debug("tcp_in_window: START\n");
486 	pr_debug("tcp_in_window: ");
487 	nf_ct_dump_tuple(tuple);
488 	pr_debug("seq=%u ack=%u+(%d) sack=%u+(%d) win=%u end=%u\n",
489 		 seq, ack, receiver_offset, sack, receiver_offset, win, end);
490 	pr_debug("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i "
491 		 "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
492 		 sender->td_end, sender->td_maxend, sender->td_maxwin,
493 		 sender->td_scale,
494 		 receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
495 		 receiver->td_scale);
496 
497 	if (sender->td_maxwin == 0) {
498 		/*
499 		 * Initialize sender data.
500 		 */
501 		if (tcph->syn) {
502 			/*
503 			 * SYN-ACK in reply to a SYN
504 			 * or SYN from reply direction in simultaneous open.
505 			 */
506 			sender->td_end =
507 			sender->td_maxend = end;
508 			sender->td_maxwin = (win == 0 ? 1 : win);
509 
510 			tcp_options(skb, dataoff, tcph, sender);
511 			/*
512 			 * RFC 1323:
513 			 * Both sides must send the Window Scale option
514 			 * to enable window scaling in either direction.
515 			 */
516 			if (!(sender->flags & IP_CT_TCP_FLAG_WINDOW_SCALE
517 			      && receiver->flags & IP_CT_TCP_FLAG_WINDOW_SCALE))
518 				sender->td_scale =
519 				receiver->td_scale = 0;
520 			if (!tcph->ack)
521 				/* Simultaneous open */
522 				return true;
523 		} else {
524 			/*
525 			 * We are in the middle of a connection,
526 			 * its history is lost for us.
527 			 * Let's try to use the data from the packet.
528 			 */
529 			sender->td_end = end;
530 			swin = win << sender->td_scale;
531 			sender->td_maxwin = (swin == 0 ? 1 : swin);
532 			sender->td_maxend = end + sender->td_maxwin;
533 			if (receiver->td_maxwin == 0) {
534 				/* We haven't seen traffic in the other
535 				 * direction yet but we have to tweak window
536 				 * tracking to pass III and IV until that
537 				 * happens.
538 				 */
539 				receiver->td_end = receiver->td_maxend = sack;
540 			} else if (sack == receiver->td_end + 1) {
541 				/* Likely a reply to a keepalive.
542 				 * Needed for III.
543 				 */
544 				receiver->td_end++;
545 			}
546 
547 		}
548 	} else if (((state->state == TCP_CONNTRACK_SYN_SENT
549 		     && dir == IP_CT_DIR_ORIGINAL)
550 		   || (state->state == TCP_CONNTRACK_SYN_RECV
551 		     && dir == IP_CT_DIR_REPLY))
552 		   && after(end, sender->td_end)) {
553 		/*
554 		 * RFC 793: "if a TCP is reinitialized ... then it need
555 		 * not wait at all; it must only be sure to use sequence
556 		 * numbers larger than those recently used."
557 		 */
558 		sender->td_end =
559 		sender->td_maxend = end;
560 		sender->td_maxwin = (win == 0 ? 1 : win);
561 
562 		tcp_options(skb, dataoff, tcph, sender);
563 	}
564 
565 	if (!(tcph->ack)) {
566 		/*
567 		 * If there is no ACK, just pretend it was set and OK.
568 		 */
569 		ack = sack = receiver->td_end;
570 	} else if (((tcp_flag_word(tcph) & (TCP_FLAG_ACK|TCP_FLAG_RST)) ==
571 		    (TCP_FLAG_ACK|TCP_FLAG_RST))
572 		   && (ack == 0)) {
573 		/*
574 		 * Broken TCP stacks that set ACK in RST packets as well,
575 		 * with a zero ack value.
576 		 */
577 		ack = sack = receiver->td_end;
578 	}
579 
580 	if (tcph->rst && seq == 0 && state->state == TCP_CONNTRACK_SYN_SENT)
581 		/*
582 		 * RST sent answering SYN.
583 		 */
584 		seq = end = sender->td_end;
585 
586 	pr_debug("tcp_in_window: ");
587 	nf_ct_dump_tuple(tuple);
588 	pr_debug("seq=%u ack=%u+(%d) sack=%u+(%d) win=%u end=%u\n",
589 		 seq, ack, receiver_offset, sack, receiver_offset, win, end);
590 	pr_debug("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i "
591 		 "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
592 		 sender->td_end, sender->td_maxend, sender->td_maxwin,
593 		 sender->td_scale,
594 		 receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
595 		 receiver->td_scale);
596 
597 	/* Is the ending sequence in the receive window (if available)? */
598 	in_recv_win = !receiver->td_maxwin ||
599 		      after(end, sender->td_end - receiver->td_maxwin - 1);
600 
601 	pr_debug("tcp_in_window: I=%i II=%i III=%i IV=%i\n",
602 		 before(seq, sender->td_maxend + 1),
603 		 (in_recv_win ? 1 : 0),
604 		 before(sack, receiver->td_end + 1),
605 		 after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1));
606 
607 	if (before(seq, sender->td_maxend + 1) &&
608 	    in_recv_win &&
609 	    before(sack, receiver->td_end + 1) &&
610 	    after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1)) {
611 		/*
612 		 * Take into account window scaling (RFC 1323).
613 		 */
614 		if (!tcph->syn)
615 			win <<= sender->td_scale;
616 
617 		/*
618 		 * Update sender data.
619 		 */
620 		swin = win + (sack - ack);
621 		if (sender->td_maxwin < swin)
622 			sender->td_maxwin = swin;
623 		if (after(end, sender->td_end)) {
624 			sender->td_end = end;
625 			sender->flags |= IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED;
626 		}
627 		if (tcph->ack) {
628 			if (!(sender->flags & IP_CT_TCP_FLAG_MAXACK_SET)) {
629 				sender->td_maxack = ack;
630 				sender->flags |= IP_CT_TCP_FLAG_MAXACK_SET;
631 			} else if (after(ack, sender->td_maxack))
632 				sender->td_maxack = ack;
633 		}
634 
635 		/*
636 		 * Update receiver data.
637 		 */
638 		if (receiver->td_maxwin != 0 && after(end, sender->td_maxend))
639 			receiver->td_maxwin += end - sender->td_maxend;
640 		if (after(sack + win, receiver->td_maxend - 1)) {
641 			receiver->td_maxend = sack + win;
642 			if (win == 0)
643 				receiver->td_maxend++;
644 		}
645 		if (ack == receiver->td_end)
646 			receiver->flags &= ~IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED;
647 
648 		/*
649 		 * Check retransmissions.
650 		 */
651 		if (index == TCP_ACK_SET) {
652 			if (state->last_dir == dir
653 			    && state->last_seq == seq
654 			    && state->last_ack == ack
655 			    && state->last_end == end
656 			    && state->last_win == win_raw)
657 				state->retrans++;
658 			else {
659 				state->last_dir = dir;
660 				state->last_seq = seq;
661 				state->last_ack = ack;
662 				state->last_end = end;
663 				state->last_win = win_raw;
664 				state->retrans = 0;
665 			}
666 		}
667 		res = true;
668 	} else {
669 		res = false;
670 		if (sender->flags & IP_CT_TCP_FLAG_BE_LIBERAL ||
671 		    tn->tcp_be_liberal)
672 			res = true;
673 		if (!res) {
674 			nf_ct_l4proto_log_invalid(skb, ct, hook_state,
675 			"%s",
676 			before(seq, sender->td_maxend + 1) ?
677 			in_recv_win ?
678 			before(sack, receiver->td_end + 1) ?
679 			after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1) ? "BUG"
680 			: "ACK is under the lower bound (possible overly delayed ACK)"
681 			: "ACK is over the upper bound (ACKed data not seen yet)"
682 			: "SEQ is under the lower bound (already ACKed data retransmitted)"
683 			: "SEQ is over the upper bound (over the window of the receiver)");
684 		}
685 	}
686 
687 	pr_debug("tcp_in_window: res=%u sender end=%u maxend=%u maxwin=%u "
688 		 "receiver end=%u maxend=%u maxwin=%u\n",
689 		 res, sender->td_end, sender->td_maxend, sender->td_maxwin,
690 		 receiver->td_end, receiver->td_maxend, receiver->td_maxwin);
691 
692 	return res;
693 }
694 
695 /* table of valid flag combinations - PUSH, ECE and CWR are always valid */
696 static const u8 tcp_valid_flags[(TCPHDR_FIN|TCPHDR_SYN|TCPHDR_RST|TCPHDR_ACK|
697 				 TCPHDR_URG) + 1] =
698 {
699 	[TCPHDR_SYN]				= 1,
700 	[TCPHDR_SYN|TCPHDR_URG]			= 1,
701 	[TCPHDR_SYN|TCPHDR_ACK]			= 1,
702 	[TCPHDR_RST]				= 1,
703 	[TCPHDR_RST|TCPHDR_ACK]			= 1,
704 	[TCPHDR_FIN|TCPHDR_ACK]			= 1,
705 	[TCPHDR_FIN|TCPHDR_ACK|TCPHDR_URG]	= 1,
706 	[TCPHDR_ACK]				= 1,
707 	[TCPHDR_ACK|TCPHDR_URG]			= 1,
708 };
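/* Example: a segment carrying SYN|FIN (or SYN|RST) indexes a zero entry in
 * the table above, so tcp_error() below rejects it as an invalid flag
 * combination; PSH, ECE and CWR are masked off before the lookup and thus
 * never affect the result.
 */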
709 
710 static void tcp_error_log(const struct sk_buff *skb,
711 			  const struct nf_hook_state *state,
712 			  const char *msg)
713 {
714 	nf_l4proto_log_invalid(skb, state, IPPROTO_TCP, "%s", msg);
715 }
716 
717 /* Protect conntrack against broken packets. Code taken from ipt_unclean.c.  */
718 static bool tcp_error(const struct tcphdr *th,
719 		      struct sk_buff *skb,
720 		      unsigned int dataoff,
721 		      const struct nf_hook_state *state)
722 {
723 	unsigned int tcplen = skb->len - dataoff;
724 	u8 tcpflags;
725 
726 	/* Not whole TCP header or malformed packet */
727 	if (th->doff*4 < sizeof(struct tcphdr) || tcplen < th->doff*4) {
728 		tcp_error_log(skb, state, "truncated packet");
729 		return true;
730 	}
731 
732 	/* Checksum invalid? Ignore.
733 	 * We skip checking packets on the outgoing path
734 	 * because the checksum is assumed to be correct.
735 	 */
736 	/* FIXME: Source route IP option packets --RR */
737 	if (state->net->ct.sysctl_checksum &&
738 	    state->hook == NF_INET_PRE_ROUTING &&
739 	    nf_checksum(skb, state->hook, dataoff, IPPROTO_TCP, state->pf)) {
740 		tcp_error_log(skb, state, "bad checksum");
741 		return true;
742 	}
743 
744 	/* Check TCP flags. */
745 	tcpflags = (tcp_flag_byte(th) & ~(TCPHDR_ECE|TCPHDR_CWR|TCPHDR_PSH));
746 	if (!tcp_valid_flags[tcpflags]) {
747 		tcp_error_log(skb, state, "invalid tcp flag combination");
748 		return true;
749 	}
750 
751 	return false;
752 }
753 
754 static noinline bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb,
755 			     unsigned int dataoff,
756 			     const struct tcphdr *th)
757 {
758 	enum tcp_conntrack new_state;
759 	struct net *net = nf_ct_net(ct);
760 	const struct nf_tcp_net *tn = nf_tcp_pernet(net);
761 	const struct ip_ct_tcp_state *sender = &ct->proto.tcp.seen[0];
762 	const struct ip_ct_tcp_state *receiver = &ct->proto.tcp.seen[1];
763 
764 	/* Don't need lock here: this conntrack not in circulation yet */
765 	new_state = tcp_conntracks[0][get_conntrack_index(th)][TCP_CONNTRACK_NONE];
766 
767 	/* Invalid: delete conntrack */
768 	if (new_state >= TCP_CONNTRACK_MAX) {
769 		pr_debug("nf_ct_tcp: invalid new packet, deleting.\n");
770 		return false;
771 	}
772 
773 	if (new_state == TCP_CONNTRACK_SYN_SENT) {
774 		memset(&ct->proto.tcp, 0, sizeof(ct->proto.tcp));
775 		/* SYN packet */
776 		ct->proto.tcp.seen[0].td_end =
777 			segment_seq_plus_len(ntohl(th->seq), skb->len,
778 					     dataoff, th);
779 		ct->proto.tcp.seen[0].td_maxwin = ntohs(th->window);
780 		if (ct->proto.tcp.seen[0].td_maxwin == 0)
781 			ct->proto.tcp.seen[0].td_maxwin = 1;
782 		ct->proto.tcp.seen[0].td_maxend =
783 			ct->proto.tcp.seen[0].td_end;
784 
785 		tcp_options(skb, dataoff, th, &ct->proto.tcp.seen[0]);
786 	} else if (tn->tcp_loose == 0) {
787 		/* Don't try to pick up connections. */
788 		return false;
789 	} else {
790 		memset(&ct->proto.tcp, 0, sizeof(ct->proto.tcp));
791 		/*
792 		 * We are in the middle of a connection,
793 		 * its history is lost for us.
794 		 * Let's try to use the data from the packet.
795 		 */
796 		ct->proto.tcp.seen[0].td_end =
797 			segment_seq_plus_len(ntohl(th->seq), skb->len,
798 					     dataoff, th);
799 		ct->proto.tcp.seen[0].td_maxwin = ntohs(th->window);
800 		if (ct->proto.tcp.seen[0].td_maxwin == 0)
801 			ct->proto.tcp.seen[0].td_maxwin = 1;
802 		ct->proto.tcp.seen[0].td_maxend =
803 			ct->proto.tcp.seen[0].td_end +
804 			ct->proto.tcp.seen[0].td_maxwin;
805 
806 		/* We assume SACK and liberal window checking to handle
807 		 * window scaling */
808 		ct->proto.tcp.seen[0].flags =
809 		ct->proto.tcp.seen[1].flags = IP_CT_TCP_FLAG_SACK_PERM |
810 					      IP_CT_TCP_FLAG_BE_LIBERAL;
811 	}
812 
813 	/* tcp_packet will set them */
814 	ct->proto.tcp.last_index = TCP_NONE_SET;
815 
816 	pr_debug("%s: sender end=%u maxend=%u maxwin=%u scale=%i "
817 		 "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
818 		 __func__,
819 		 sender->td_end, sender->td_maxend, sender->td_maxwin,
820 		 sender->td_scale,
821 		 receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
822 		 receiver->td_scale);
823 	return true;
824 }
825 
826 static bool tcp_can_early_drop(const struct nf_conn *ct)
827 {
828 	switch (ct->proto.tcp.state) {
829 	case TCP_CONNTRACK_FIN_WAIT:
830 	case TCP_CONNTRACK_LAST_ACK:
831 	case TCP_CONNTRACK_TIME_WAIT:
832 	case TCP_CONNTRACK_CLOSE:
833 	case TCP_CONNTRACK_CLOSE_WAIT:
834 		return true;
835 	default:
836 		break;
837 	}
838 
839 	return false;
840 }
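/* All of the states above are on the shutdown path, so under table pressure
 * such entries are reasonable candidates for early eviction (this callback
 * is consulted by the generic conntrack early-drop/offload logic elsewhere).
 */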
841 
842 /* Returns verdict for packet, or -1 for invalid. */
843 int nf_conntrack_tcp_packet(struct nf_conn *ct,
844 			    struct sk_buff *skb,
845 			    unsigned int dataoff,
846 			    enum ip_conntrack_info ctinfo,
847 			    const struct nf_hook_state *state)
848 {
849 	struct net *net = nf_ct_net(ct);
850 	struct nf_tcp_net *tn = nf_tcp_pernet(net);
851 	struct nf_conntrack_tuple *tuple;
852 	enum tcp_conntrack new_state, old_state;
853 	unsigned int index, *timeouts;
854 	enum ip_conntrack_dir dir;
855 	const struct tcphdr *th;
856 	struct tcphdr _tcph;
857 	unsigned long timeout;
858 
859 	th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph);
860 	if (th == NULL)
861 		return -NF_ACCEPT;
862 
863 	if (tcp_error(th, skb, dataoff, state))
864 		return -NF_ACCEPT;
865 
866 	if (!nf_ct_is_confirmed(ct) && !tcp_new(ct, skb, dataoff, th))
867 		return -NF_ACCEPT;
868 
869 	spin_lock_bh(&ct->lock);
870 	old_state = ct->proto.tcp.state;
871 	dir = CTINFO2DIR(ctinfo);
872 	index = get_conntrack_index(th);
873 	new_state = tcp_conntracks[dir][index][old_state];
874 	tuple = &ct->tuplehash[dir].tuple;
875 
876 	switch (new_state) {
877 	case TCP_CONNTRACK_SYN_SENT:
878 		if (old_state < TCP_CONNTRACK_TIME_WAIT)
879 			break;
880 		/* RFC 1122: "When a connection is closed actively,
881 		 * it MUST linger in TIME-WAIT state for a time 2xMSL
882 		 * (Maximum Segment Lifetime). However, it MAY accept
883 		 * a new SYN from the remote TCP to reopen the connection
884 		 * directly from TIME-WAIT state, if..."
885 		 * We ignore the conditions because we are in the
886 		 * TIME-WAIT state anyway.
887 		 *
888 		 * Handle aborted connections: we and the server
889 		 * think there is an existing connection but the client
890 		 * aborts it and starts a new one.
891 		 */
892 		if (((ct->proto.tcp.seen[dir].flags
893 		      | ct->proto.tcp.seen[!dir].flags)
894 		     & IP_CT_TCP_FLAG_CLOSE_INIT)
895 		    || (ct->proto.tcp.last_dir == dir
896 		        && ct->proto.tcp.last_index == TCP_RST_SET)) {
897 			/* Attempt to reopen a closed/aborted connection.
898 			 * Delete this connection and look up again. */
899 			spin_unlock_bh(&ct->lock);
900 
901 			/* Only repeat if we can actually remove the timer.
902 			 * Destruction may already be in progress in process
903 			 * context and we must give it a chance to terminate.
904 			 */
905 			if (nf_ct_kill(ct))
906 				return -NF_REPEAT;
907 			return NF_DROP;
908 		}
909 		fallthrough;
910 	case TCP_CONNTRACK_IGNORE:
911 		/* Ignored packets:
912 		 *
913 		 * Our connection entry may be out of sync, so ignore
914 		 * packets which may signal the real connection between
915 		 * the client and the server.
916 		 *
917 		 * a) SYN in ORIGINAL
918 		 * b) SYN/ACK in REPLY
919 		 * c) ACK in reply direction after initial SYN in original.
920 		 *
921 		 * If the ignored packet is invalid, the receiver will send
922 		 * a RST we'll catch below.
923 		 */
924 		if (index == TCP_SYNACK_SET
925 		    && ct->proto.tcp.last_index == TCP_SYN_SET
926 		    && ct->proto.tcp.last_dir != dir
927 		    && ntohl(th->ack_seq) == ct->proto.tcp.last_end) {
928 			/* b) This SYN/ACK acknowledges a SYN that we earlier
929 			 * ignored as invalid. This means that the client and
930 			 * the server are both in sync, while the firewall is
931 			 * not. We get in sync from the previously annotated
932 			 * values.
933 			 */
934 			old_state = TCP_CONNTRACK_SYN_SENT;
935 			new_state = TCP_CONNTRACK_SYN_RECV;
936 			ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_end =
937 				ct->proto.tcp.last_end;
938 			ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_maxend =
939 				ct->proto.tcp.last_end;
940 			ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_maxwin =
941 				ct->proto.tcp.last_win == 0 ?
942 					1 : ct->proto.tcp.last_win;
943 			ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_scale =
944 				ct->proto.tcp.last_wscale;
945 			ct->proto.tcp.last_flags &= ~IP_CT_EXP_CHALLENGE_ACK;
946 			ct->proto.tcp.seen[ct->proto.tcp.last_dir].flags =
947 				ct->proto.tcp.last_flags;
948 			memset(&ct->proto.tcp.seen[dir], 0,
949 			       sizeof(struct ip_ct_tcp_state));
950 			break;
951 		}
952 		ct->proto.tcp.last_index = index;
953 		ct->proto.tcp.last_dir = dir;
954 		ct->proto.tcp.last_seq = ntohl(th->seq);
955 		ct->proto.tcp.last_end =
956 		    segment_seq_plus_len(ntohl(th->seq), skb->len, dataoff, th);
957 		ct->proto.tcp.last_win = ntohs(th->window);
958 
959 		/* a) This is a SYN in ORIGINAL. The client and the server
960 		 * may be in sync but we are not. In that case, we annotate
961 		 * the TCP options and let the packet go through. If it is a
962 		 * valid SYN packet, the server will reply with a SYN/ACK, and
963 		 * then we'll get in sync. Otherwise, the server potentially
964 		 * responds with a challenge ACK if implementing RFC5961.
965 		 */
966 		if (index == TCP_SYN_SET && dir == IP_CT_DIR_ORIGINAL) {
967 			struct ip_ct_tcp_state seen = {};
968 
969 			ct->proto.tcp.last_flags =
970 			ct->proto.tcp.last_wscale = 0;
971 			tcp_options(skb, dataoff, th, &seen);
972 			if (seen.flags & IP_CT_TCP_FLAG_WINDOW_SCALE) {
973 				ct->proto.tcp.last_flags |=
974 					IP_CT_TCP_FLAG_WINDOW_SCALE;
975 				ct->proto.tcp.last_wscale = seen.td_scale;
976 			}
977 			if (seen.flags & IP_CT_TCP_FLAG_SACK_PERM) {
978 				ct->proto.tcp.last_flags |=
979 					IP_CT_TCP_FLAG_SACK_PERM;
980 			}
981 			/* Mark the potential for an RFC5961 challenge ACK;
982 			 * this poses a special problem for the LAST_ACK state,
983 			 * as the ACK is interpreted as ACKing the last FIN.
984 			 */
985 			if (old_state == TCP_CONNTRACK_LAST_ACK)
986 				ct->proto.tcp.last_flags |=
987 					IP_CT_EXP_CHALLENGE_ACK;
988 		}
989 		spin_unlock_bh(&ct->lock);
990 		nf_ct_l4proto_log_invalid(skb, ct, state,
991 					  "packet (index %d) in dir %d ignored, state %s",
992 					  index, dir,
993 					  tcp_conntrack_names[old_state]);
994 		return NF_ACCEPT;
995 	case TCP_CONNTRACK_MAX:
996 		/* Special case for SYN proxy: when the SYN to the server or
997 		 * the SYN/ACK from the server is lost, the client may transmit
998 		 * a keep-alive packet while in SYN_SENT state. This needs to
999 		 * be associated with the original conntrack entry in order to
1000 		 * generate a new SYN with the correct sequence number.
1001 		 */
1002 		if (nfct_synproxy(ct) && old_state == TCP_CONNTRACK_SYN_SENT &&
1003 		    index == TCP_ACK_SET && dir == IP_CT_DIR_ORIGINAL &&
1004 		    ct->proto.tcp.last_dir == IP_CT_DIR_ORIGINAL &&
1005 		    ct->proto.tcp.seen[dir].td_end - 1 == ntohl(th->seq)) {
1006 			pr_debug("nf_ct_tcp: SYN proxy client keep alive\n");
1007 			spin_unlock_bh(&ct->lock);
1008 			return NF_ACCEPT;
1009 		}
1010 
1011 		/* Invalid packet */
1012 		pr_debug("nf_ct_tcp: Invalid dir=%i index=%u ostate=%u\n",
1013 			 dir, get_conntrack_index(th), old_state);
1014 		spin_unlock_bh(&ct->lock);
1015 		nf_ct_l4proto_log_invalid(skb, ct, state, "invalid state");
1016 		return -NF_ACCEPT;
1017 	case TCP_CONNTRACK_TIME_WAIT:
1018 		/* RFC5961 compliance causes the stack to send a "challenge ACK",
1019 		 * e.g. in response to spurious SYNs.  Conntrack MUST
1020 		 * not believe this ACK is acking the last FIN.
1021 		 */
1022 		if (old_state == TCP_CONNTRACK_LAST_ACK &&
1023 		    index == TCP_ACK_SET &&
1024 		    ct->proto.tcp.last_dir != dir &&
1025 		    ct->proto.tcp.last_index == TCP_SYN_SET &&
1026 		    (ct->proto.tcp.last_flags & IP_CT_EXP_CHALLENGE_ACK)) {
1027 			/* Detected RFC5961 challenge ACK */
1028 			ct->proto.tcp.last_flags &= ~IP_CT_EXP_CHALLENGE_ACK;
1029 			spin_unlock_bh(&ct->lock);
1030 			nf_ct_l4proto_log_invalid(skb, ct, state, "challenge-ack ignored");
1031 			return NF_ACCEPT; /* Don't change state */
1032 		}
1033 		break;
1034 	case TCP_CONNTRACK_SYN_SENT2:
1035 		/* tcp_conntracks table is not smart enough to handle
1036 		 * simultaneous open.
1037 		 */
1038 		ct->proto.tcp.last_flags |= IP_CT_TCP_SIMULTANEOUS_OPEN;
1039 		break;
1040 	case TCP_CONNTRACK_SYN_RECV:
1041 		if (dir == IP_CT_DIR_REPLY && index == TCP_ACK_SET &&
1042 		    ct->proto.tcp.last_flags & IP_CT_TCP_SIMULTANEOUS_OPEN)
1043 			new_state = TCP_CONNTRACK_ESTABLISHED;
1044 		break;
1045 	case TCP_CONNTRACK_CLOSE:
1046 		if (index != TCP_RST_SET)
1047 			break;
1048 
1049 		/* If we are closing, tuple might have been re-used already.
1050 		 * last_index, last_ack, and all other ct fields used for
1051 		 * sequence/window validation are outdated in that case.
1052 		 *
1053 		 * As the conntrack can already be expired by GC under pressure,
1054 		 * just skip validation checks.
1055 		 */
1056 		if (tcp_can_early_drop(ct))
1057 			goto in_window;
1058 
1059 		/* td_maxack might be outdated if we let a SYN through earlier */
1060 		if ((ct->proto.tcp.seen[!dir].flags & IP_CT_TCP_FLAG_MAXACK_SET) &&
1061 		    ct->proto.tcp.last_index != TCP_SYN_SET) {
1062 			u32 seq = ntohl(th->seq);
1063 
1064 			/* If we are not in established state and SEQ=0 this is most
1065 			 * likely an answer to a SYN we let go through above (last_index
1066 			 * can be updated due to out-of-order ACKs).
1067 			 */
1068 			if (seq == 0 && !nf_conntrack_tcp_established(ct))
1069 				break;
1070 
1071 			if (before(seq, ct->proto.tcp.seen[!dir].td_maxack) &&
1072 			    !tn->tcp_ignore_invalid_rst) {
1073 				/* Invalid RST  */
1074 				spin_unlock_bh(&ct->lock);
1075 				nf_ct_l4proto_log_invalid(skb, ct, state, "invalid rst");
1076 				return -NF_ACCEPT;
1077 			}
1078 
1079 			if (!nf_conntrack_tcp_established(ct) ||
1080 			    seq == ct->proto.tcp.seen[!dir].td_maxack)
1081 				break;
1082 
1083 			/* Check if the RST is part of a packet train, such as
1084 			 *   foo:80 > bar:4379: P, 235946583:235946602(19) ack 42
1085 			 *   foo:80 > bar:4379: R, 235946602:235946602(0)  ack 42
1086 			 */
1087 			if (ct->proto.tcp.last_index == TCP_ACK_SET &&
1088 			    ct->proto.tcp.last_dir == dir &&
1089 			    seq == ct->proto.tcp.last_end)
1090 				break;
1091 
1092 			/* ... RST sequence number doesn't match exactly, keep
1093 			 * established state to allow a possible challenge ACK.
1094 			 */
1095 			new_state = old_state;
1096 		}
1097 		if (((test_bit(IPS_SEEN_REPLY_BIT, &ct->status)
1098 			 && ct->proto.tcp.last_index == TCP_SYN_SET)
1099 			|| (!test_bit(IPS_ASSURED_BIT, &ct->status)
1100 			    && ct->proto.tcp.last_index == TCP_ACK_SET))
1101 		    && ntohl(th->ack_seq) == ct->proto.tcp.last_end) {
1102 			/* RST sent to invalid SYN or ACK we had let through
1103 			 * at a) and c) above:
1104 			 *
1105 			 * a) SYN was in window then
1106 			 * c) we hold a half-open connection.
1107 			 *
1108 			 * Delete our connection entry.
1109 			 * We skip window checking, because packet might ACK
1110 			 * segments we ignored. */
1111 			goto in_window;
1112 		}
1113 		break;
1114 	default:
1115 		/* Keep compilers happy. */
1116 		break;
1117 	}
1118 
1119 	if (!tcp_in_window(ct, dir, index,
1120 			   skb, dataoff, th, state)) {
1121 		spin_unlock_bh(&ct->lock);
1122 		return -NF_ACCEPT;
1123 	}
1124      in_window:
1125 	/* From now on we have got in-window packets */
1126 	ct->proto.tcp.last_index = index;
1127 	ct->proto.tcp.last_dir = dir;
1128 
1129 	pr_debug("tcp_conntracks: ");
1130 	nf_ct_dump_tuple(tuple);
1131 	pr_debug("syn=%i ack=%i fin=%i rst=%i old=%i new=%i\n",
1132 		 (th->syn ? 1 : 0), (th->ack ? 1 : 0),
1133 		 (th->fin ? 1 : 0), (th->rst ? 1 : 0),
1134 		 old_state, new_state);
1135 
1136 	ct->proto.tcp.state = new_state;
1137 	if (old_state != new_state
1138 	    && new_state == TCP_CONNTRACK_FIN_WAIT)
1139 		ct->proto.tcp.seen[dir].flags |= IP_CT_TCP_FLAG_CLOSE_INIT;
1140 
1141 	timeouts = nf_ct_timeout_lookup(ct);
1142 	if (!timeouts)
1143 		timeouts = tn->timeouts;
1144 
1145 	if (ct->proto.tcp.retrans >= tn->tcp_max_retrans &&
1146 	    timeouts[new_state] > timeouts[TCP_CONNTRACK_RETRANS])
1147 		timeout = timeouts[TCP_CONNTRACK_RETRANS];
1148 	else if (unlikely(index == TCP_RST_SET))
1149 		timeout = timeouts[TCP_CONNTRACK_CLOSE];
1150 	else if ((ct->proto.tcp.seen[0].flags | ct->proto.tcp.seen[1].flags) &
1151 		 IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED &&
1152 		 timeouts[new_state] > timeouts[TCP_CONNTRACK_UNACK])
1153 		timeout = timeouts[TCP_CONNTRACK_UNACK];
1154 	else if (ct->proto.tcp.last_win == 0 &&
1155 		 timeouts[new_state] > timeouts[TCP_CONNTRACK_RETRANS])
1156 		timeout = timeouts[TCP_CONNTRACK_RETRANS];
1157 	else
1158 		timeout = timeouts[new_state];
1159 	spin_unlock_bh(&ct->lock);
1160 
1161 	if (new_state != old_state)
1162 		nf_conntrack_event_cache(IPCT_PROTOINFO, ct);
1163 
1164 	if (!test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
1165 		/* If the only reply is a RST, we can consider ourselves not to
1166 		   have an established connection: this is a fairly common
1167 		   problem case, so we can delete the conntrack
1168 		   immediately.  --RR */
1169 		if (th->rst) {
1170 			nf_ct_kill_acct(ct, ctinfo, skb);
1171 			return NF_ACCEPT;
1172 		}
1173 
1174 		if (index == TCP_SYN_SET && old_state == TCP_CONNTRACK_SYN_SENT) {
1175 			/* do not renew timeout on SYN retransmit.
1176 			 *
1177 			 * Else port reuse by client or NAT middlebox can keep
1178 			 * entry alive indefinitely (including nat info).
1179 			 */
1180 			return NF_ACCEPT;
1181 		}
1182 
1183 		/* ESTABLISHED without SEEN_REPLY, i.e. mid-connection
1184 		 * pickup with loose=1. Avoid large ESTABLISHED timeout.
1185 		 */
1186 		if (new_state == TCP_CONNTRACK_ESTABLISHED &&
1187 		    timeout > timeouts[TCP_CONNTRACK_UNACK])
1188 			timeout = timeouts[TCP_CONNTRACK_UNACK];
1189 	} else if (!test_bit(IPS_ASSURED_BIT, &ct->status)
1190 		   && (old_state == TCP_CONNTRACK_SYN_RECV
1191 		       || old_state == TCP_CONNTRACK_ESTABLISHED)
1192 		   && new_state == TCP_CONNTRACK_ESTABLISHED) {
1193 		/* Set ASSURED if we see valid ack in ESTABLISHED
1194 		   after SYN_RECV or a valid answer for a picked up
1195 		   connection. */
1196 		set_bit(IPS_ASSURED_BIT, &ct->status);
1197 		nf_conntrack_event_cache(IPCT_ASSURED, ct);
1198 	}
1199 	nf_ct_refresh_acct(ct, ctinfo, skb, timeout);
1200 
1201 	return NF_ACCEPT;
1202 }
1203 
1204 #if IS_ENABLED(CONFIG_NF_CT_NETLINK)
1205 
1206 #include <linux/netfilter/nfnetlink.h>
1207 #include <linux/netfilter/nfnetlink_conntrack.h>
1208 
1209 static int tcp_to_nlattr(struct sk_buff *skb, struct nlattr *nla,
1210 			 struct nf_conn *ct, bool destroy)
1211 {
1212 	struct nlattr *nest_parms;
1213 	struct nf_ct_tcp_flags tmp = {};
1214 
1215 	spin_lock_bh(&ct->lock);
1216 	nest_parms = nla_nest_start(skb, CTA_PROTOINFO_TCP);
1217 	if (!nest_parms)
1218 		goto nla_put_failure;
1219 
1220 	if (nla_put_u8(skb, CTA_PROTOINFO_TCP_STATE, ct->proto.tcp.state))
1221 		goto nla_put_failure;
1222 
1223 	if (destroy)
1224 		goto skip_state;
1225 
1226 	if (nla_put_u8(skb, CTA_PROTOINFO_TCP_WSCALE_ORIGINAL,
1227 		       ct->proto.tcp.seen[0].td_scale) ||
1228 	    nla_put_u8(skb, CTA_PROTOINFO_TCP_WSCALE_REPLY,
1229 		       ct->proto.tcp.seen[1].td_scale))
1230 		goto nla_put_failure;
1231 
1232 	tmp.flags = ct->proto.tcp.seen[0].flags;
1233 	if (nla_put(skb, CTA_PROTOINFO_TCP_FLAGS_ORIGINAL,
1234 		    sizeof(struct nf_ct_tcp_flags), &tmp))
1235 		goto nla_put_failure;
1236 
1237 	tmp.flags = ct->proto.tcp.seen[1].flags;
1238 	if (nla_put(skb, CTA_PROTOINFO_TCP_FLAGS_REPLY,
1239 		    sizeof(struct nf_ct_tcp_flags), &tmp))
1240 		goto nla_put_failure;
1241 skip_state:
1242 	spin_unlock_bh(&ct->lock);
1243 	nla_nest_end(skb, nest_parms);
1244 
1245 	return 0;
1246 
1247 nla_put_failure:
1248 	spin_unlock_bh(&ct->lock);
1249 	return -1;
1250 }
1251 
1252 static const struct nla_policy tcp_nla_policy[CTA_PROTOINFO_TCP_MAX+1] = {
1253 	[CTA_PROTOINFO_TCP_STATE]	    = { .type = NLA_U8 },
1254 	[CTA_PROTOINFO_TCP_WSCALE_ORIGINAL] = { .type = NLA_U8 },
1255 	[CTA_PROTOINFO_TCP_WSCALE_REPLY]    = { .type = NLA_U8 },
1256 	[CTA_PROTOINFO_TCP_FLAGS_ORIGINAL]  = { .len = sizeof(struct nf_ct_tcp_flags) },
1257 	[CTA_PROTOINFO_TCP_FLAGS_REPLY]	    = { .len = sizeof(struct nf_ct_tcp_flags) },
1258 };
1259 
1260 #define TCP_NLATTR_SIZE	( \
1261 	NLA_ALIGN(NLA_HDRLEN + 1) + \
1262 	NLA_ALIGN(NLA_HDRLEN + 1) + \
1263 	NLA_ALIGN(NLA_HDRLEN + sizeof(struct nf_ct_tcp_flags)) + \
1264 	NLA_ALIGN(NLA_HDRLEN + sizeof(struct nf_ct_tcp_flags)))
1265 
1266 static int nlattr_to_tcp(struct nlattr *cda[], struct nf_conn *ct)
1267 {
1268 	struct nlattr *pattr = cda[CTA_PROTOINFO_TCP];
1269 	struct nlattr *tb[CTA_PROTOINFO_TCP_MAX+1];
1270 	int err;
1271 
1272 	/* updates might not contain anything about the private
1273 	 * protocol info; in that case skip the parsing */
1274 	if (!pattr)
1275 		return 0;
1276 
1277 	err = nla_parse_nested_deprecated(tb, CTA_PROTOINFO_TCP_MAX, pattr,
1278 					  tcp_nla_policy, NULL);
1279 	if (err < 0)
1280 		return err;
1281 
1282 	if (tb[CTA_PROTOINFO_TCP_STATE] &&
1283 	    nla_get_u8(tb[CTA_PROTOINFO_TCP_STATE]) >= TCP_CONNTRACK_MAX)
1284 		return -EINVAL;
1285 
1286 	spin_lock_bh(&ct->lock);
1287 	if (tb[CTA_PROTOINFO_TCP_STATE])
1288 		ct->proto.tcp.state = nla_get_u8(tb[CTA_PROTOINFO_TCP_STATE]);
1289 
1290 	if (tb[CTA_PROTOINFO_TCP_FLAGS_ORIGINAL]) {
1291 		struct nf_ct_tcp_flags *attr =
1292 			nla_data(tb[CTA_PROTOINFO_TCP_FLAGS_ORIGINAL]);
1293 		ct->proto.tcp.seen[0].flags &= ~attr->mask;
1294 		ct->proto.tcp.seen[0].flags |= attr->flags & attr->mask;
1295 	}
1296 
1297 	if (tb[CTA_PROTOINFO_TCP_FLAGS_REPLY]) {
1298 		struct nf_ct_tcp_flags *attr =
1299 			nla_data(tb[CTA_PROTOINFO_TCP_FLAGS_REPLY]);
1300 		ct->proto.tcp.seen[1].flags &= ~attr->mask;
1301 		ct->proto.tcp.seen[1].flags |= attr->flags & attr->mask;
1302 	}
1303 
1304 	if (tb[CTA_PROTOINFO_TCP_WSCALE_ORIGINAL] &&
1305 	    tb[CTA_PROTOINFO_TCP_WSCALE_REPLY] &&
1306 	    ct->proto.tcp.seen[0].flags & IP_CT_TCP_FLAG_WINDOW_SCALE &&
1307 	    ct->proto.tcp.seen[1].flags & IP_CT_TCP_FLAG_WINDOW_SCALE) {
1308 		ct->proto.tcp.seen[0].td_scale =
1309 			nla_get_u8(tb[CTA_PROTOINFO_TCP_WSCALE_ORIGINAL]);
1310 		ct->proto.tcp.seen[1].td_scale =
1311 			nla_get_u8(tb[CTA_PROTOINFO_TCP_WSCALE_REPLY]);
1312 	}
1313 	spin_unlock_bh(&ct->lock);
1314 
1315 	return 0;
1316 }
1317 
1318 static unsigned int tcp_nlattr_tuple_size(void)
1319 {
1320 	static unsigned int size __read_mostly;
1321 
1322 	if (!size)
1323 		size = nla_policy_len(nf_ct_port_nla_policy, CTA_PROTO_MAX + 1);
1324 
1325 	return size;
1326 }
1327 #endif
1328 
1329 #ifdef CONFIG_NF_CONNTRACK_TIMEOUT
1330 
1331 #include <linux/netfilter/nfnetlink.h>
1332 #include <linux/netfilter/nfnetlink_cttimeout.h>
1333 
1334 static int tcp_timeout_nlattr_to_obj(struct nlattr *tb[],
1335 				     struct net *net, void *data)
1336 {
1337 	struct nf_tcp_net *tn = nf_tcp_pernet(net);
1338 	unsigned int *timeouts = data;
1339 	int i;
1340 
1341 	if (!timeouts)
1342 		timeouts = tn->timeouts;
1343 	/* set default TCP timeouts. */
1344 	for (i = 0; i < TCP_CONNTRACK_TIMEOUT_MAX; i++)
1345 		timeouts[i] = tn->timeouts[i];
1346 
1347 	if (tb[CTA_TIMEOUT_TCP_SYN_SENT]) {
1348 		timeouts[TCP_CONNTRACK_SYN_SENT] =
1349 			ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_SYN_SENT]))*HZ;
1350 	}
1351 
1352 	if (tb[CTA_TIMEOUT_TCP_SYN_RECV]) {
1353 		timeouts[TCP_CONNTRACK_SYN_RECV] =
1354 			ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_SYN_RECV]))*HZ;
1355 	}
1356 	if (tb[CTA_TIMEOUT_TCP_ESTABLISHED]) {
1357 		timeouts[TCP_CONNTRACK_ESTABLISHED] =
1358 			ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_ESTABLISHED]))*HZ;
1359 	}
1360 	if (tb[CTA_TIMEOUT_TCP_FIN_WAIT]) {
1361 		timeouts[TCP_CONNTRACK_FIN_WAIT] =
1362 			ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_FIN_WAIT]))*HZ;
1363 	}
1364 	if (tb[CTA_TIMEOUT_TCP_CLOSE_WAIT]) {
1365 		timeouts[TCP_CONNTRACK_CLOSE_WAIT] =
1366 			ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_CLOSE_WAIT]))*HZ;
1367 	}
1368 	if (tb[CTA_TIMEOUT_TCP_LAST_ACK]) {
1369 		timeouts[TCP_CONNTRACK_LAST_ACK] =
1370 			ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_LAST_ACK]))*HZ;
1371 	}
1372 	if (tb[CTA_TIMEOUT_TCP_TIME_WAIT]) {
1373 		timeouts[TCP_CONNTRACK_TIME_WAIT] =
1374 			ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_TIME_WAIT]))*HZ;
1375 	}
1376 	if (tb[CTA_TIMEOUT_TCP_CLOSE]) {
1377 		timeouts[TCP_CONNTRACK_CLOSE] =
1378 			ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_CLOSE]))*HZ;
1379 	}
1380 	if (tb[CTA_TIMEOUT_TCP_SYN_SENT2]) {
1381 		timeouts[TCP_CONNTRACK_SYN_SENT2] =
1382 			ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_SYN_SENT2]))*HZ;
1383 	}
1384 	if (tb[CTA_TIMEOUT_TCP_RETRANS]) {
1385 		timeouts[TCP_CONNTRACK_RETRANS] =
1386 			ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_RETRANS]))*HZ;
1387 	}
1388 	if (tb[CTA_TIMEOUT_TCP_UNACK]) {
1389 		timeouts[TCP_CONNTRACK_UNACK] =
1390 			ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_UNACK]))*HZ;
1391 	}
1392 
1393 	timeouts[CTA_TIMEOUT_TCP_UNSPEC] = timeouts[CTA_TIMEOUT_TCP_SYN_SENT];
1394 	return 0;
1395 }
1396 
1397 static int
1398 tcp_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data)
1399 {
1400 	const unsigned int *timeouts = data;
1401 
1402 	if (nla_put_be32(skb, CTA_TIMEOUT_TCP_SYN_SENT,
1403 			htonl(timeouts[TCP_CONNTRACK_SYN_SENT] / HZ)) ||
1404 	    nla_put_be32(skb, CTA_TIMEOUT_TCP_SYN_RECV,
1405 			 htonl(timeouts[TCP_CONNTRACK_SYN_RECV] / HZ)) ||
1406 	    nla_put_be32(skb, CTA_TIMEOUT_TCP_ESTABLISHED,
1407 			 htonl(timeouts[TCP_CONNTRACK_ESTABLISHED] / HZ)) ||
1408 	    nla_put_be32(skb, CTA_TIMEOUT_TCP_FIN_WAIT,
1409 			 htonl(timeouts[TCP_CONNTRACK_FIN_WAIT] / HZ)) ||
1410 	    nla_put_be32(skb, CTA_TIMEOUT_TCP_CLOSE_WAIT,
1411 			 htonl(timeouts[TCP_CONNTRACK_CLOSE_WAIT] / HZ)) ||
1412 	    nla_put_be32(skb, CTA_TIMEOUT_TCP_LAST_ACK,
1413 			 htonl(timeouts[TCP_CONNTRACK_LAST_ACK] / HZ)) ||
1414 	    nla_put_be32(skb, CTA_TIMEOUT_TCP_TIME_WAIT,
1415 			 htonl(timeouts[TCP_CONNTRACK_TIME_WAIT] / HZ)) ||
1416 	    nla_put_be32(skb, CTA_TIMEOUT_TCP_CLOSE,
1417 			 htonl(timeouts[TCP_CONNTRACK_CLOSE] / HZ)) ||
1418 	    nla_put_be32(skb, CTA_TIMEOUT_TCP_SYN_SENT2,
1419 			 htonl(timeouts[TCP_CONNTRACK_SYN_SENT2] / HZ)) ||
1420 	    nla_put_be32(skb, CTA_TIMEOUT_TCP_RETRANS,
1421 			 htonl(timeouts[TCP_CONNTRACK_RETRANS] / HZ)) ||
1422 	    nla_put_be32(skb, CTA_TIMEOUT_TCP_UNACK,
1423 			 htonl(timeouts[TCP_CONNTRACK_UNACK] / HZ)))
1424 		goto nla_put_failure;
1425 	return 0;
1426 
1427 nla_put_failure:
1428 	return -ENOSPC;
1429 }
1430 
1431 static const struct nla_policy tcp_timeout_nla_policy[CTA_TIMEOUT_TCP_MAX+1] = {
1432 	[CTA_TIMEOUT_TCP_SYN_SENT]	= { .type = NLA_U32 },
1433 	[CTA_TIMEOUT_TCP_SYN_RECV]	= { .type = NLA_U32 },
1434 	[CTA_TIMEOUT_TCP_ESTABLISHED]	= { .type = NLA_U32 },
1435 	[CTA_TIMEOUT_TCP_FIN_WAIT]	= { .type = NLA_U32 },
1436 	[CTA_TIMEOUT_TCP_CLOSE_WAIT]	= { .type = NLA_U32 },
1437 	[CTA_TIMEOUT_TCP_LAST_ACK]	= { .type = NLA_U32 },
1438 	[CTA_TIMEOUT_TCP_TIME_WAIT]	= { .type = NLA_U32 },
1439 	[CTA_TIMEOUT_TCP_CLOSE]		= { .type = NLA_U32 },
1440 	[CTA_TIMEOUT_TCP_SYN_SENT2]	= { .type = NLA_U32 },
1441 	[CTA_TIMEOUT_TCP_RETRANS]	= { .type = NLA_U32 },
1442 	[CTA_TIMEOUT_TCP_UNACK]		= { .type = NLA_U32 },
1443 };
1444 #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
1445 
1446 void nf_conntrack_tcp_init_net(struct net *net)
1447 {
1448 	struct nf_tcp_net *tn = nf_tcp_pernet(net);
1449 	int i;
1450 
1451 	for (i = 0; i < TCP_CONNTRACK_TIMEOUT_MAX; i++)
1452 		tn->timeouts[i] = tcp_timeouts[i];
1453 
1454 	/* timeouts[0] is unused, make it the same as SYN_SENT so that
1455 	 * ->timeouts[0] contains the 'new' timeout, like udp or icmp.
1456 	 */
1457 	tn->timeouts[0] = tcp_timeouts[TCP_CONNTRACK_SYN_SENT];
1458 
1459 	/* If it is set to zero, we disable picking up already established
1460 	 * connections.
1461 	 */
1462 	tn->tcp_loose = 1;
1463 
1464 	/* "Be conservative in what you do,
1465 	 *  be liberal in what you accept from others."
1466 	 * If it's non-zero, we mark only out-of-window RST segments as INVALID.
1467 	 */
1468 	tn->tcp_be_liberal = 0;
1469 
1470 	/* If it's non-zero, we turn off RST sequence number check */
1471 	tn->tcp_ignore_invalid_rst = 0;
1472 
1473 	/* Max number of retransmitted packets without receiving an (acceptable)
1474 	 * ACK from the destination. If this number is reached, a shorter timer
1475 	 * will be started.
1476 	 */
1477 	tn->tcp_max_retrans = 3;
1478 
1479 #if IS_ENABLED(CONFIG_NF_FLOW_TABLE)
1480 	tn->offload_timeout = 30 * HZ;
1481 #endif
1482 }
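/* Note: the per-netns defaults set above (tcp_loose, tcp_be_liberal,
 * tcp_ignore_invalid_rst, tcp_max_retrans and the timeout table) are
 * typically overridable at runtime, e.g. via the nf_conntrack_tcp_* sysctls
 * registered elsewhere in the conntrack code; the values here are only the
 * initial defaults.
 */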
1483 
1484 const struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp =
1485 {
1486 	.l4proto 		= IPPROTO_TCP,
1487 #ifdef CONFIG_NF_CONNTRACK_PROCFS
1488 	.print_conntrack 	= tcp_print_conntrack,
1489 #endif
1490 	.can_early_drop		= tcp_can_early_drop,
1491 #if IS_ENABLED(CONFIG_NF_CT_NETLINK)
1492 	.to_nlattr		= tcp_to_nlattr,
1493 	.from_nlattr		= nlattr_to_tcp,
1494 	.tuple_to_nlattr	= nf_ct_port_tuple_to_nlattr,
1495 	.nlattr_to_tuple	= nf_ct_port_nlattr_to_tuple,
1496 	.nlattr_tuple_size	= tcp_nlattr_tuple_size,
1497 	.nlattr_size		= TCP_NLATTR_SIZE,
1498 	.nla_policy		= nf_ct_port_nla_policy,
1499 #endif
1500 #ifdef CONFIG_NF_CONNTRACK_TIMEOUT
1501 	.ctnl_timeout		= {
1502 		.nlattr_to_obj	= tcp_timeout_nlattr_to_obj,
1503 		.obj_to_nlattr	= tcp_timeout_obj_to_nlattr,
1504 		.nlattr_max	= CTA_TIMEOUT_TCP_MAX,
1505 		.obj_size	= sizeof(unsigned int) *
1506 					TCP_CONNTRACK_TIMEOUT_MAX,
1507 		.nla_policy	= tcp_timeout_nla_policy,
1508 	},
1509 #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
1510 };
1511