1 // SPDX-License-Identifier: GPL-2.0-only
2 /* (C) 1999-2001 Paul `Rusty' Russell
3  * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
4  * (C) 2002-2013 Jozsef Kadlecsik <kadlec@netfilter.org>
5  * (C) 2006-2012 Patrick McHardy <kaber@trash.net>
6  */
7 
8 #include <linux/types.h>
9 #include <linux/timer.h>
10 #include <linux/module.h>
11 #include <linux/in.h>
12 #include <linux/tcp.h>
13 #include <linux/spinlock.h>
14 #include <linux/skbuff.h>
15 #include <linux/ipv6.h>
16 #include <net/ip6_checksum.h>
17 #include <asm/unaligned.h>
18 
19 #include <net/tcp.h>
20 
21 #include <linux/netfilter.h>
22 #include <linux/netfilter_ipv4.h>
23 #include <linux/netfilter_ipv6.h>
24 #include <net/netfilter/nf_conntrack.h>
25 #include <net/netfilter/nf_conntrack_l4proto.h>
26 #include <net/netfilter/nf_conntrack_ecache.h>
27 #include <net/netfilter/nf_conntrack_seqadj.h>
28 #include <net/netfilter/nf_conntrack_synproxy.h>
29 #include <net/netfilter/nf_conntrack_timeout.h>
30 #include <net/netfilter/nf_log.h>
31 #include <net/netfilter/ipv4/nf_conntrack_ipv4.h>
32 #include <net/netfilter/ipv6/nf_conntrack_ipv6.h>
33 
34   /* FIXME: Examine ipfilter's timeouts and conntrack transitions more
35      closely.  They're more complex. --RR */
36 
37 static const char *const tcp_conntrack_names[] = {
38 	"NONE",
39 	"SYN_SENT",
40 	"SYN_RECV",
41 	"ESTABLISHED",
42 	"FIN_WAIT",
43 	"CLOSE_WAIT",
44 	"LAST_ACK",
45 	"TIME_WAIT",
46 	"CLOSE",
47 	"SYN_SENT2",
48 };
49 
50 #define SECS * HZ
51 #define MINS * 60 SECS
52 #define HOURS * 60 MINS
53 #define DAYS * 24 HOURS
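/* Plain macro expansion, so e.g. "2 MINS" below expands to "2 * 60 * HZ":
 * all timeouts in the following table are expressed in jiffies.
 */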
54 
55 static const unsigned int tcp_timeouts[TCP_CONNTRACK_TIMEOUT_MAX] = {
56 	[TCP_CONNTRACK_SYN_SENT]	= 2 MINS,
57 	[TCP_CONNTRACK_SYN_RECV]	= 60 SECS,
58 	[TCP_CONNTRACK_ESTABLISHED]	= 5 DAYS,
59 	[TCP_CONNTRACK_FIN_WAIT]	= 2 MINS,
60 	[TCP_CONNTRACK_CLOSE_WAIT]	= 60 SECS,
61 	[TCP_CONNTRACK_LAST_ACK]	= 30 SECS,
62 	[TCP_CONNTRACK_TIME_WAIT]	= 2 MINS,
63 	[TCP_CONNTRACK_CLOSE]		= 10 SECS,
64 	[TCP_CONNTRACK_SYN_SENT2]	= 2 MINS,
65 /* RFC1122 says the R2 limit should be at least 100 seconds.
66    Linux uses 15 packets as limit, which corresponds
67    to ~13-30min depending on RTO. */
68 	[TCP_CONNTRACK_RETRANS]		= 5 MINS,
69 	[TCP_CONNTRACK_UNACK]		= 5 MINS,
70 };
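/* These are only the compiled-in defaults; the per-netns copy made in
 * nf_conntrack_tcp_init_net() is what is actually used, and it is normally
 * tunable at runtime through the nf_conntrack_tcp_timeout_* sysctls
 * registered by nf_conntrack_standalone.c, e.g. (illustrative values):
 *
 *	sysctl -w net.netfilter.nf_conntrack_tcp_timeout_established=86400
 *	sysctl -w net.netfilter.nf_conntrack_tcp_timeout_fin_wait=60
 */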
71 
72 #define sNO TCP_CONNTRACK_NONE
73 #define sSS TCP_CONNTRACK_SYN_SENT
74 #define sSR TCP_CONNTRACK_SYN_RECV
75 #define sES TCP_CONNTRACK_ESTABLISHED
76 #define sFW TCP_CONNTRACK_FIN_WAIT
77 #define sCW TCP_CONNTRACK_CLOSE_WAIT
78 #define sLA TCP_CONNTRACK_LAST_ACK
79 #define sTW TCP_CONNTRACK_TIME_WAIT
80 #define sCL TCP_CONNTRACK_CLOSE
81 #define sS2 TCP_CONNTRACK_SYN_SENT2
82 #define sIV TCP_CONNTRACK_MAX
83 #define sIG TCP_CONNTRACK_IGNORE
84 
85 /* What TCP flags are set from RST/SYN/FIN/ACK. */
86 enum tcp_bit_set {
87 	TCP_SYN_SET,
88 	TCP_SYNACK_SET,
89 	TCP_FIN_SET,
90 	TCP_ACK_SET,
91 	TCP_RST_SET,
92 	TCP_NONE_SET,
93 };
94 
95 /*
96  * The TCP state transition table needs a few words...
97  *
98  * We are the man in the middle. All the packets go through us
99  * but might get lost in transit to the destination.
100  * It is assumed that the destinations can't receive segments
101  * we haven't seen.
102  *
103  * The checked segment is in window, but our windows are *not*
104  * equivalent to those of the sender/receiver. We always
105  * try to guess the state of the current sender.
106  *
107  * The meaning of the states are:
108  *
109  * NONE:	initial state
110  * SYN_SENT:	SYN-only packet seen
111  * SYN_SENT2:	SYN-only packet seen from reply dir, simultaneous open
112  * SYN_RECV:	SYN-ACK packet seen
113  * ESTABLISHED:	ACK packet seen
114  * FIN_WAIT:	FIN packet seen
115  * CLOSE_WAIT:	ACK seen (after FIN)
116  * LAST_ACK:	FIN seen (after FIN)
117  * TIME_WAIT:	last ACK seen
118  * CLOSE:	closed connection (RST)
119  *
120  * Packets marked as IGNORED (sIG):
121  *	if they may be either invalid or valid
122  *	and the receiver may send back a
123  *	connection-closing RST or a SYN/ACK.
124  *
125  * Packets marked as INVALID (sIV):
126  *	if we regard them as truly invalid packets
127  */
128 static const u8 tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = {
129 	{
130 /* ORIGINAL */
131 /* 	     sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2	*/
132 /*syn*/	   { sSS, sSS, sIG, sIG, sIG, sIG, sIG, sSS, sSS, sS2 },
133 /*
134  *	sNO -> sSS	Initialize a new connection
135  *	sSS -> sSS	Retransmitted SYN
136  *	sS2 -> sS2	Late retransmitted SYN
137  *	sSR -> sIG
138  *	sES -> sIG	Error: SYNs in window outside the SYN_SENT state
139  *			are errors. Receiver will reply with RST
140  *			and close the connection.
141  *			Or we are not in sync and hold a dead connection.
142  *	sFW -> sIG
143  *	sCW -> sIG
144  *	sLA -> sIG
145  *	sTW -> sSS	Reopened connection (RFC 1122).
146  *	sCL -> sSS
147  */
148 /* 	     sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2	*/
149 /*synack*/ { sIV, sIV, sSR, sIV, sIV, sIV, sIV, sIV, sIV, sSR },
150 /*
151  *	sNO -> sIV	Too late and no reason to do anything
152  *	sSS -> sIV	Client can't send SYN and then SYN/ACK
153  *	sS2 -> sSR	SYN/ACK sent to SYN2 in simultaneous open
154  *	sSR -> sSR	Late retransmitted SYN/ACK in simultaneous open
155  *	sES -> sIV	Invalid SYN/ACK packets sent by the client
156  *	sFW -> sIV
157  *	sCW -> sIV
158  *	sLA -> sIV
159  *	sTW -> sIV
160  *	sCL -> sIV
161  */
162 /* 	     sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2	*/
163 /*fin*/    { sIV, sIV, sFW, sFW, sLA, sLA, sLA, sTW, sCL, sIV },
164 /*
165  *	sNO -> sIV	Too late and no reason to do anything...
166  *	sSS -> sIV	Client might not send FIN in this state:
167  *			we enforce waiting for a SYN/ACK reply first.
168  *	sS2 -> sIV
169  *	sSR -> sFW	Close started.
170  *	sES -> sFW
171  *	sFW -> sLA	FIN seen in both directions, waiting for
172  *			the last ACK.
173  *			Might be a retransmitted FIN as well...
174  *	sCW -> sLA
175  *	sLA -> sLA	Retransmitted FIN. Remain in the same state.
176  *	sTW -> sTW
177  *	sCL -> sCL
178  */
179 /* 	     sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2	*/
180 /*ack*/	   { sES, sIV, sES, sES, sCW, sCW, sTW, sTW, sCL, sIV },
181 /*
182  *	sNO -> sES	Assumed.
183  *	sSS -> sIV	ACK is invalid: we haven't seen a SYN/ACK yet.
184  *	sS2 -> sIV
185  *	sSR -> sES	Established state is reached.
186  *	sES -> sES	:-)
187  *	sFW -> sCW	Normal close request answered by ACK.
188  *	sCW -> sCW
189  *	sLA -> sTW	Last ACK detected (RFC5961 challenged)
190  *	sTW -> sTW	Retransmitted last ACK. Remain in the same state.
191  *	sCL -> sCL
192  */
193 /* 	     sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2	*/
194 /*rst*/    { sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL },
195 /*none*/   { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV }
196 	},
197 	{
198 /* REPLY */
199 /* 	     sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2	*/
200 /*syn*/	   { sIV, sS2, sIV, sIV, sIV, sIV, sIV, sSS, sIV, sS2 },
201 /*
202  *	sNO -> sIV	Never reached.
203  *	sSS -> sS2	Simultaneous open
204  *	sS2 -> sS2	Retransmitted simultaneous SYN
205  *	sSR -> sIV	Invalid SYN packets sent by the server
206  *	sES -> sIV
207  *	sFW -> sIV
208  *	sCW -> sIV
209  *	sLA -> sIV
210  *	sTW -> sSS	Reopened connection, but server may have switched role
211  *	sCL -> sIV
212  */
213 /* 	     sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2	*/
214 /*synack*/ { sIV, sSR, sIG, sIG, sIG, sIG, sIG, sIG, sIG, sSR },
215 /*
216  *	sSS -> sSR	Standard open.
217  *	sS2 -> sSR	Simultaneous open
218  *	sSR -> sIG	Retransmitted SYN/ACK, ignore it.
219  *	sES -> sIG	Late retransmitted SYN/ACK?
220  *	sFW -> sIG	Might be SYN/ACK answering ignored SYN
221  *	sCW -> sIG
222  *	sLA -> sIG
223  *	sTW -> sIG
224  *	sCL -> sIG
225  */
226 /* 	     sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2	*/
227 /*fin*/    { sIV, sIV, sFW, sFW, sLA, sLA, sLA, sTW, sCL, sIV },
228 /*
229  *	sSS -> sIV	Server might not send FIN in this state.
230  *	sS2 -> sIV
231  *	sSR -> sFW	Close started.
232  *	sES -> sFW
233  *	sFW -> sLA	FIN seen in both directions.
234  *	sCW -> sLA
235  *	sLA -> sLA	Retransmitted FIN.
236  *	sTW -> sTW
237  *	sCL -> sCL
238  */
239 /* 	     sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2	*/
240 /*ack*/	   { sIV, sIG, sSR, sES, sCW, sCW, sTW, sTW, sCL, sIG },
241 /*
242  *	sSS -> sIG	Might be a half-open connection.
243  *	sS2 -> sIG
244  *	sSR -> sSR	Might answer late resent SYN.
245  *	sES -> sES	:-)
246  *	sFW -> sCW	Normal close request answered by ACK.
247  *	sCW -> sCW
248  *	sLA -> sTW	Last ACK detected (RFC5961 challenged)
249  *	sTW -> sTW	Retransmitted last ACK.
250  *	sCL -> sCL
251  */
252 /* 	     sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2	*/
253 /*rst*/    { sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL },
254 /*none*/   { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV }
255 	}
256 };
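/* The table is consulted as tcp_conntracks[dir][flag index][current state],
 * exactly as tcp_new() and nf_conntrack_tcp_packet() do below:
 *
 *	old_state = ct->proto.tcp.state;
 *	index     = get_conntrack_index(th);
 *	new_state = tcp_conntracks[dir][index][old_state];
 *
 * For example, a SYN/ACK seen in the REPLY direction while the connection
 * is in SYN_SENT yields tcp_conntracks[1][TCP_SYNACK_SET][sSS] == sSR.
 */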
257 
258 #ifdef CONFIG_NF_CONNTRACK_PROCFS
259 /* Print out the private part of the conntrack. */
260 static void tcp_print_conntrack(struct seq_file *s, struct nf_conn *ct)
261 {
262 	if (test_bit(IPS_OFFLOAD_BIT, &ct->status))
263 		return;
264 
265 	seq_printf(s, "%s ", tcp_conntrack_names[ct->proto.tcp.state]);
266 }
267 #endif
268 
269 static unsigned int get_conntrack_index(const struct tcphdr *tcph)
270 {
271 	if (tcph->rst) return TCP_RST_SET;
272 	else if (tcph->syn) return (tcph->ack ? TCP_SYNACK_SET : TCP_SYN_SET);
273 	else if (tcph->fin) return TCP_FIN_SET;
274 	else if (tcph->ack) return TCP_ACK_SET;
275 	else return TCP_NONE_SET;
276 }
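/* Note the precedence above: RST wins over all other flags, so a RST|ACK
 * segment maps to TCP_RST_SET, a SYN|ACK to TCP_SYNACK_SET and a plain
 * (possibly data-carrying) ACK to TCP_ACK_SET.
 */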
277 
278 /* TCP connection tracking based on 'Real Stateful TCP Packet Filtering
279    in IP Filter' by Guido van Rooij.
280 
281    http://www.sane.nl/events/sane2000/papers.html
282    http://www.darkart.com/mirrors/www.obfuscation.org/ipf/
283 
284    The boundaries and the conditions are changed according to RFC793:
285    the packet must intersect the window (i.e. segments may be
286    after the right or before the left edge) and thus receivers may ACK
287    segments after the right edge of the window.
288 
289 	td_maxend = max(sack + max(win,1)) seen in reply packets
290 	td_maxwin = max(max(win, 1)) + (sack - ack) seen in sent packets
291 	td_maxwin += seq + len - sender.td_maxend
292 			if seq + len > sender.td_maxend
293 	td_end    = max(seq + len) seen in sent packets
294 
295    I.   Upper bound for valid data:	seq <= sender.td_maxend
296    II.  Lower bound for valid data:	seq + len >= sender.td_end - receiver.td_maxwin
297    III.	Upper bound for valid (s)ack:   sack <= receiver.td_end
298    IV.	Lower bound for valid (s)ack:	sack >= receiver.td_end - MAXACKWINDOW
299 
300    where sack is the highest right edge of sack block found in the packet
301    or ack in the case of packet without SACK option.
302 
303    The upper bound limit for a valid (s)ack is not ignored -
304    we don't have to deal with fragments.
305 */
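/* A worked example of rules I.-IV. with made-up numbers (illustration
 * only): assume sender.td_maxend = 1000, sender.td_end = 900,
 * receiver.td_end = 2000, receiver.td_maxwin = 500 and
 * MAXACKWINDOW(sender) = 66000.  A segment with seq = 950,
 * seq + len = 980 and (s)ack = 1900 passes all four checks:
 *
 *	I.   950  <= 1000
 *	II.  980  >= 900 - 500 = 400
 *	III. 1900 <= 2000
 *	IV.  1900 >= 2000 - 66000
 *
 * and is therefore accepted as in-window by tcp_in_window() below.
 */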
306 
307 static inline __u32 segment_seq_plus_len(__u32 seq,
308 					 size_t len,
309 					 unsigned int dataoff,
310 					 const struct tcphdr *tcph)
311 {
312 	/* XXX Should I use payload length field in IP/IPv6 header ?
313 	 * - YK */
314 	return (seq + len - dataoff - tcph->doff*4
315 		+ (tcph->syn ? 1 : 0) + (tcph->fin ? 1 : 0));
316 }
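/* I.e. this returns the sequence number just past the segment: seq plus
 * the TCP payload length, plus one for SYN and one for FIN since both
 * consume sequence space.  For a pure SYN with no payload the result is
 * simply seq + 1.
 */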
317 
318 /* Fixme: what about big packets? */
319 #define MAXACKWINCONST			66000
320 #define MAXACKWINDOW(sender)						\
321 	((sender)->td_maxwin > MAXACKWINCONST ? (sender)->td_maxwin	\
322 					      : MAXACKWINCONST)
323 
324 /*
325  * Simplified tcp_parse_options routine from tcp_input.c
326  */
327 static void tcp_options(const struct sk_buff *skb,
328 			unsigned int dataoff,
329 			const struct tcphdr *tcph,
330 			struct ip_ct_tcp_state *state)
331 {
332 	unsigned char buff[(15 * 4) - sizeof(struct tcphdr)];
333 	const unsigned char *ptr;
334 	int length = (tcph->doff*4) - sizeof(struct tcphdr);
335 
336 	if (!length)
337 		return;
338 
339 	ptr = skb_header_pointer(skb, dataoff + sizeof(struct tcphdr),
340 				 length, buff);
341 	if (!ptr)
342 		return;
343 
344 	state->td_scale = 0;
345 	state->flags &= IP_CT_TCP_FLAG_BE_LIBERAL;
346 
347 	while (length > 0) {
348 		int opcode = *ptr++;
349 		int opsize;
350 
351 		switch (opcode) {
352 		case TCPOPT_EOL:
353 			return;
354 		case TCPOPT_NOP:	/* Ref: RFC 793 section 3.1 */
355 			length--;
356 			continue;
357 		default:
358 			if (length < 2)
359 				return;
360 			opsize = *ptr++;
361 			if (opsize < 2) /* "silly options" */
362 				return;
363 			if (opsize > length)
364 				return;	/* don't parse partial options */
365 
366 			if (opcode == TCPOPT_SACK_PERM
367 			    && opsize == TCPOLEN_SACK_PERM)
368 				state->flags |= IP_CT_TCP_FLAG_SACK_PERM;
369 			else if (opcode == TCPOPT_WINDOW
370 				 && opsize == TCPOLEN_WINDOW) {
371 				state->td_scale = *(u_int8_t *)ptr;
372 
373 				if (state->td_scale > TCP_MAX_WSCALE)
374 					state->td_scale = TCP_MAX_WSCALE;
375 
376 				state->flags |=
377 					IP_CT_TCP_FLAG_WINDOW_SCALE;
378 			}
379 			ptr += opsize - 2;
380 			length -= opsize;
381 		}
382 	}
383 }
384 
385 static void tcp_sack(const struct sk_buff *skb, unsigned int dataoff,
386                      const struct tcphdr *tcph, __u32 *sack)
387 {
388 	unsigned char buff[(15 * 4) - sizeof(struct tcphdr)];
389 	const unsigned char *ptr;
390 	int length = (tcph->doff*4) - sizeof(struct tcphdr);
391 	__u32 tmp;
392 
393 	if (!length)
394 		return;
395 
396 	ptr = skb_header_pointer(skb, dataoff + sizeof(struct tcphdr),
397 				 length, buff);
398 	if (!ptr)
399 		return;
400 
401 	/* Fast path for timestamp-only option */
402 	if (length == TCPOLEN_TSTAMP_ALIGNED
403 	    && *(__be32 *)ptr == htonl((TCPOPT_NOP << 24)
404 				       | (TCPOPT_NOP << 16)
405 				       | (TCPOPT_TIMESTAMP << 8)
406 				       | TCPOLEN_TIMESTAMP))
407 		return;
408 
409 	while (length > 0) {
410 		int opcode = *ptr++;
411 		int opsize, i;
412 
413 		switch (opcode) {
414 		case TCPOPT_EOL:
415 			return;
416 		case TCPOPT_NOP:	/* Ref: RFC 793 section 3.1 */
417 			length--;
418 			continue;
419 		default:
420 			if (length < 2)
421 				return;
422 			opsize = *ptr++;
423 			if (opsize < 2) /* "silly options" */
424 				return;
425 			if (opsize > length)
426 				return;	/* don't parse partial options */
427 
428 			if (opcode == TCPOPT_SACK
429 			    && opsize >= (TCPOLEN_SACK_BASE
430 					  + TCPOLEN_SACK_PERBLOCK)
431 			    && !((opsize - TCPOLEN_SACK_BASE)
432 				 % TCPOLEN_SACK_PERBLOCK)) {
433 				for (i = 0;
434 				     i < (opsize - TCPOLEN_SACK_BASE);
435 				     i += TCPOLEN_SACK_PERBLOCK) {
436 					tmp = get_unaligned_be32((__be32 *)(ptr+i)+1);
437 
438 					if (after(tmp, *sack))
439 						*sack = tmp;
440 				}
441 				return;
442 			}
443 			ptr += opsize - 2;
444 			length -= opsize;
445 		}
446 	}
447 }
448 
449 static void tcp_init_sender(struct ip_ct_tcp_state *sender,
450 			    struct ip_ct_tcp_state *receiver,
451 			    const struct sk_buff *skb,
452 			    unsigned int dataoff,
453 			    const struct tcphdr *tcph,
454 			    u32 end, u32 win)
455 {
456 	/* SYN-ACK in reply to a SYN
457 	 * or SYN from reply direction in simultaneous open.
458 	 */
459 	sender->td_end =
460 	sender->td_maxend = end;
461 	sender->td_maxwin = (win == 0 ? 1 : win);
462 
463 	tcp_options(skb, dataoff, tcph, sender);
464 	/* RFC 1323:
465 	 * Both sides must send the Window Scale option
466 	 * to enable window scaling in either direction.
467 	 */
468 	if (!(sender->flags & IP_CT_TCP_FLAG_WINDOW_SCALE &&
469 	      receiver->flags & IP_CT_TCP_FLAG_WINDOW_SCALE)) {
470 		sender->td_scale = 0;
471 		receiver->td_scale = 0;
472 	}
473 }
474 
475 static bool tcp_in_window(struct nf_conn *ct,
476 			  enum ip_conntrack_dir dir,
477 			  unsigned int index,
478 			  const struct sk_buff *skb,
479 			  unsigned int dataoff,
480 			  const struct tcphdr *tcph,
481 			  const struct nf_hook_state *hook_state)
482 {
483 	struct ip_ct_tcp *state = &ct->proto.tcp;
484 	struct net *net = nf_ct_net(ct);
485 	struct nf_tcp_net *tn = nf_tcp_pernet(net);
486 	struct ip_ct_tcp_state *sender = &state->seen[dir];
487 	struct ip_ct_tcp_state *receiver = &state->seen[!dir];
488 	__u32 seq, ack, sack, end, win, swin;
489 	u16 win_raw;
490 	s32 receiver_offset;
491 	bool res, in_recv_win;
492 
493 	/*
494 	 * Get the required data from the packet.
495 	 */
496 	seq = ntohl(tcph->seq);
497 	ack = sack = ntohl(tcph->ack_seq);
498 	win_raw = ntohs(tcph->window);
499 	win = win_raw;
500 	end = segment_seq_plus_len(seq, skb->len, dataoff, tcph);
501 
502 	if (receiver->flags & IP_CT_TCP_FLAG_SACK_PERM)
503 		tcp_sack(skb, dataoff, tcph, &sack);
504 
505 	/* Take into account NAT sequence number mangling */
506 	receiver_offset = nf_ct_seq_offset(ct, !dir, ack - 1);
507 	ack -= receiver_offset;
508 	sack -= receiver_offset;
509 
510 	if (sender->td_maxwin == 0) {
511 		/*
512 		 * Initialize sender data.
513 		 */
514 		if (tcph->syn) {
515 			tcp_init_sender(sender, receiver,
516 					skb, dataoff, tcph,
517 					end, win);
518 			if (!tcph->ack)
519 				/* Simultaneous open */
520 				return true;
521 		} else {
522 			/*
523 			 * We are in the middle of a connection,
524 			 * its history is lost for us.
525 			 * Let's try to use the data from the packet.
526 			 */
527 			sender->td_end = end;
528 			swin = win << sender->td_scale;
529 			sender->td_maxwin = (swin == 0 ? 1 : swin);
530 			sender->td_maxend = end + sender->td_maxwin;
531 			if (receiver->td_maxwin == 0) {
532 				/* We haven't seen traffic in the other
533 				 * direction yet but we have to tweak window
534 				 * tracking to pass III and IV until that
535 				 * happens.
536 				 */
537 				receiver->td_end = receiver->td_maxend = sack;
538 			} else if (sack == receiver->td_end + 1) {
539 				/* Likely a reply to a keepalive.
540 				 * Needed for III.
541 				 */
542 				receiver->td_end++;
543 			}
544 
545 		}
546 	} else if (tcph->syn &&
547 		   after(end, sender->td_end) &&
548 		   (state->state == TCP_CONNTRACK_SYN_SENT ||
549 		    state->state == TCP_CONNTRACK_SYN_RECV)) {
550 		/*
551 		 * RFC 793: "if a TCP is reinitialized ... then it need
552 		 * not wait at all; it must only be sure to use sequence
553 		 * numbers larger than those recently used."
554 		 *
555 		 * Re-init state for this direction, just like for the first
556 		 * syn(-ack) reply, it might differ in seq, ack or tcp options.
557 		 */
558 		tcp_init_sender(sender, receiver,
559 				skb, dataoff, tcph,
560 				end, win);
561 
562 		if (dir == IP_CT_DIR_REPLY && !tcph->ack)
563 			return true;
564 	}
565 
566 	if (!(tcph->ack)) {
567 		/*
568 		 * If there is no ACK, just pretend it was set and OK.
569 		 */
570 		ack = sack = receiver->td_end;
571 	} else if (((tcp_flag_word(tcph) & (TCP_FLAG_ACK|TCP_FLAG_RST)) ==
572 		    (TCP_FLAG_ACK|TCP_FLAG_RST))
573 		   && (ack == 0)) {
574 		/*
575 		 * Broken TCP stacks, that set ACK in RST packets as well
576 		 * with zero ack value.
577 		 */
578 		ack = sack = receiver->td_end;
579 	}
580 
581 	if (tcph->rst && seq == 0 && state->state == TCP_CONNTRACK_SYN_SENT)
582 		/*
583 		 * RST sent answering SYN.
584 		 */
585 		seq = end = sender->td_end;
586 
587 	/* Is the ending sequence in the receive window (if available)? */
588 	in_recv_win = !receiver->td_maxwin ||
589 		      after(end, sender->td_end - receiver->td_maxwin - 1);
590 
591 	if (before(seq, sender->td_maxend + 1) &&
592 	    in_recv_win &&
593 	    before(sack, receiver->td_end + 1) &&
594 	    after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1)) {
595 		/*
596 		 * Take into account window scaling (RFC 1323).
597 		 */
598 		if (!tcph->syn)
599 			win <<= sender->td_scale;
600 
601 		/*
602 		 * Update sender data.
603 		 */
604 		swin = win + (sack - ack);
605 		if (sender->td_maxwin < swin)
606 			sender->td_maxwin = swin;
607 		if (after(end, sender->td_end)) {
608 			sender->td_end = end;
609 			sender->flags |= IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED;
610 		}
611 		if (tcph->ack) {
612 			if (!(sender->flags & IP_CT_TCP_FLAG_MAXACK_SET)) {
613 				sender->td_maxack = ack;
614 				sender->flags |= IP_CT_TCP_FLAG_MAXACK_SET;
615 			} else if (after(ack, sender->td_maxack))
616 				sender->td_maxack = ack;
617 		}
618 
619 		/*
620 		 * Update receiver data.
621 		 */
622 		if (receiver->td_maxwin != 0 && after(end, sender->td_maxend))
623 			receiver->td_maxwin += end - sender->td_maxend;
624 		if (after(sack + win, receiver->td_maxend - 1)) {
625 			receiver->td_maxend = sack + win;
626 			if (win == 0)
627 				receiver->td_maxend++;
628 		}
629 		if (ack == receiver->td_end)
630 			receiver->flags &= ~IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED;
631 
632 		/*
633 		 * Check retransmissions.
634 		 */
635 		if (index == TCP_ACK_SET) {
636 			if (state->last_dir == dir
637 			    && state->last_seq == seq
638 			    && state->last_ack == ack
639 			    && state->last_end == end
640 			    && state->last_win == win_raw)
641 				state->retrans++;
642 			else {
643 				state->last_dir = dir;
644 				state->last_seq = seq;
645 				state->last_ack = ack;
646 				state->last_end = end;
647 				state->last_win = win_raw;
648 				state->retrans = 0;
649 			}
650 		}
651 		res = true;
652 	} else {
653 		res = false;
654 		if (sender->flags & IP_CT_TCP_FLAG_BE_LIBERAL ||
655 		    tn->tcp_be_liberal)
656 			res = true;
657 		if (!res) {
658 			nf_ct_l4proto_log_invalid(skb, ct, hook_state,
659 			"%s",
660 			before(seq, sender->td_maxend + 1) ?
661 			in_recv_win ?
662 			before(sack, receiver->td_end + 1) ?
663 			after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1) ? "BUG"
664 			: "ACK is under the lower bound (possible overly delayed ACK)"
665 			: "ACK is over the upper bound (ACKed data not seen yet)"
666 			: "SEQ is under the lower bound (already ACKed data retransmitted)"
667 			: "SEQ is over the upper bound (over the window of the receiver)");
668 		}
669 	}
670 
671 	return res;
672 }
673 
674 /* table of valid flag combinations - PUSH, ECE and CWR are always valid */
675 static const u8 tcp_valid_flags[(TCPHDR_FIN|TCPHDR_SYN|TCPHDR_RST|TCPHDR_ACK|
676 				 TCPHDR_URG) + 1] =
677 {
678 	[TCPHDR_SYN]				= 1,
679 	[TCPHDR_SYN|TCPHDR_URG]			= 1,
680 	[TCPHDR_SYN|TCPHDR_ACK]			= 1,
681 	[TCPHDR_RST]				= 1,
682 	[TCPHDR_RST|TCPHDR_ACK]			= 1,
683 	[TCPHDR_FIN|TCPHDR_ACK]			= 1,
684 	[TCPHDR_FIN|TCPHDR_ACK|TCPHDR_URG]	= 1,
685 	[TCPHDR_ACK]				= 1,
686 	[TCPHDR_ACK|TCPHDR_URG]			= 1,
687 };
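/* tcp_error() below masks out PSH, ECE and CWR and then indexes this table
 * with the remaining flag bits, so any combination not listed is rejected.
 * For example a SYN|FIN segment, a FIN without ACK, or a "null" packet with
 * no flags at all yields tcp_valid_flags[flags] == 0 and is logged and
 * treated as invalid.
 */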
688 
689 static void tcp_error_log(const struct sk_buff *skb,
690 			  const struct nf_hook_state *state,
691 			  const char *msg)
692 {
693 	nf_l4proto_log_invalid(skb, state, IPPROTO_TCP, "%s", msg);
694 }
695 
696 /* Protect conntrack against broken packets. Code taken from ipt_unclean.c.  */
697 static bool tcp_error(const struct tcphdr *th,
698 		      struct sk_buff *skb,
699 		      unsigned int dataoff,
700 		      const struct nf_hook_state *state)
701 {
702 	unsigned int tcplen = skb->len - dataoff;
703 	u8 tcpflags;
704 
705 	/* Not whole TCP header or malformed packet */
706 	if (th->doff*4 < sizeof(struct tcphdr) || tcplen < th->doff*4) {
707 		tcp_error_log(skb, state, "truncated packet");
708 		return true;
709 	}
710 
711 	/* Checksum invalid? Ignore.
712 	 * We skip checking packets on the outgoing path
713 	 * because the checksum is assumed to be correct.
714 	 */
715 	/* FIXME: Source route IP option packets --RR */
716 	if (state->net->ct.sysctl_checksum &&
717 	    state->hook == NF_INET_PRE_ROUTING &&
718 	    nf_checksum(skb, state->hook, dataoff, IPPROTO_TCP, state->pf)) {
719 		tcp_error_log(skb, state, "bad checksum");
720 		return true;
721 	}
722 
723 	/* Check TCP flags. */
724 	tcpflags = (tcp_flag_byte(th) & ~(TCPHDR_ECE|TCPHDR_CWR|TCPHDR_PSH));
725 	if (!tcp_valid_flags[tcpflags]) {
726 		tcp_error_log(skb, state, "invalid tcp flag combination");
727 		return true;
728 	}
729 
730 	return false;
731 }
732 
733 static noinline bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb,
734 			     unsigned int dataoff,
735 			     const struct tcphdr *th)
736 {
737 	enum tcp_conntrack new_state;
738 	struct net *net = nf_ct_net(ct);
739 	const struct nf_tcp_net *tn = nf_tcp_pernet(net);
740 
741 	/* Don't need lock here: this conntrack not in circulation yet */
742 	new_state = tcp_conntracks[0][get_conntrack_index(th)][TCP_CONNTRACK_NONE];
743 
744 	/* Invalid: delete conntrack */
745 	if (new_state >= TCP_CONNTRACK_MAX) {
746 		pr_debug("nf_ct_tcp: invalid new state, deleting.\n");
747 		return false;
748 	}
749 
750 	if (new_state == TCP_CONNTRACK_SYN_SENT) {
751 		memset(&ct->proto.tcp, 0, sizeof(ct->proto.tcp));
752 		/* SYN packet */
753 		ct->proto.tcp.seen[0].td_end =
754 			segment_seq_plus_len(ntohl(th->seq), skb->len,
755 					     dataoff, th);
756 		ct->proto.tcp.seen[0].td_maxwin = ntohs(th->window);
757 		if (ct->proto.tcp.seen[0].td_maxwin == 0)
758 			ct->proto.tcp.seen[0].td_maxwin = 1;
759 		ct->proto.tcp.seen[0].td_maxend =
760 			ct->proto.tcp.seen[0].td_end;
761 
762 		tcp_options(skb, dataoff, th, &ct->proto.tcp.seen[0]);
763 	} else if (tn->tcp_loose == 0) {
764 		/* Don't try to pick up connections. */
765 		return false;
766 	} else {
767 		memset(&ct->proto.tcp, 0, sizeof(ct->proto.tcp));
768 		/*
769 		 * We are in the middle of a connection,
770 		 * its history is lost for us.
771 		 * Let's try to use the data from the packet.
772 		 */
773 		ct->proto.tcp.seen[0].td_end =
774 			segment_seq_plus_len(ntohl(th->seq), skb->len,
775 					     dataoff, th);
776 		ct->proto.tcp.seen[0].td_maxwin = ntohs(th->window);
777 		if (ct->proto.tcp.seen[0].td_maxwin == 0)
778 			ct->proto.tcp.seen[0].td_maxwin = 1;
779 		ct->proto.tcp.seen[0].td_maxend =
780 			ct->proto.tcp.seen[0].td_end +
781 			ct->proto.tcp.seen[0].td_maxwin;
782 
783 		/* We assume SACK and liberal window checking to handle
784 		 * window scaling */
785 		ct->proto.tcp.seen[0].flags =
786 		ct->proto.tcp.seen[1].flags = IP_CT_TCP_FLAG_SACK_PERM |
787 					      IP_CT_TCP_FLAG_BE_LIBERAL;
788 	}
789 
790 	/* tcp_packet will set them */
791 	ct->proto.tcp.last_index = TCP_NONE_SET;
792 	return true;
793 }
794 
795 static bool tcp_can_early_drop(const struct nf_conn *ct)
796 {
797 	switch (ct->proto.tcp.state) {
798 	case TCP_CONNTRACK_FIN_WAIT:
799 	case TCP_CONNTRACK_LAST_ACK:
800 	case TCP_CONNTRACK_TIME_WAIT:
801 	case TCP_CONNTRACK_CLOSE:
802 	case TCP_CONNTRACK_CLOSE_WAIT:
803 		return true;
804 	default:
805 		break;
806 	}
807 
808 	return false;
809 }
810 
811 static void nf_ct_tcp_state_reset(struct ip_ct_tcp_state *state)
812 {
813 	state->td_end		= 0;
814 	state->td_maxend	= 0;
815 	state->td_maxwin	= 0;
816 	state->td_maxack	= 0;
817 	state->td_scale		= 0;
818 	state->flags		&= IP_CT_TCP_FLAG_BE_LIBERAL;
819 }
820 
821 /* Returns verdict for packet, or -1 for invalid. */
822 int nf_conntrack_tcp_packet(struct nf_conn *ct,
823 			    struct sk_buff *skb,
824 			    unsigned int dataoff,
825 			    enum ip_conntrack_info ctinfo,
826 			    const struct nf_hook_state *state)
827 {
828 	struct net *net = nf_ct_net(ct);
829 	struct nf_tcp_net *tn = nf_tcp_pernet(net);
830 	struct nf_conntrack_tuple *tuple;
831 	enum tcp_conntrack new_state, old_state;
832 	unsigned int index, *timeouts;
833 	enum ip_conntrack_dir dir;
834 	const struct tcphdr *th;
835 	struct tcphdr _tcph;
836 	unsigned long timeout;
837 
838 	th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph);
839 	if (th == NULL)
840 		return -NF_ACCEPT;
841 
842 	if (tcp_error(th, skb, dataoff, state))
843 		return -NF_ACCEPT;
844 
845 	if (!nf_ct_is_confirmed(ct) && !tcp_new(ct, skb, dataoff, th))
846 		return -NF_ACCEPT;
847 
848 	spin_lock_bh(&ct->lock);
849 	old_state = ct->proto.tcp.state;
850 	dir = CTINFO2DIR(ctinfo);
851 	index = get_conntrack_index(th);
852 	new_state = tcp_conntracks[dir][index][old_state];
853 	tuple = &ct->tuplehash[dir].tuple;
854 
855 	switch (new_state) {
856 	case TCP_CONNTRACK_SYN_SENT:
857 		if (old_state < TCP_CONNTRACK_TIME_WAIT)
858 			break;
859 		/* RFC 1122: "When a connection is closed actively,
860 		 * it MUST linger in TIME-WAIT state for a time 2xMSL
861 		 * (Maximum Segment Lifetime). However, it MAY accept
862 		 * a new SYN from the remote TCP to reopen the connection
863 		 * directly from TIME-WAIT state, if..."
864 		 * We ignore the conditions because we are in the
865 		 * TIME-WAIT state anyway.
866 		 *
867 		 * Handle aborted connections: we and the server
868 		 * think there is an existing connection but the client
869 		 * aborts it and starts a new one.
870 		 */
871 		if (((ct->proto.tcp.seen[dir].flags
872 		      | ct->proto.tcp.seen[!dir].flags)
873 		     & IP_CT_TCP_FLAG_CLOSE_INIT)
874 		    || (ct->proto.tcp.last_dir == dir
875 		        && ct->proto.tcp.last_index == TCP_RST_SET)) {
876 			/* Attempt to reopen a closed/aborted connection.
877 			 * Delete this connection and look up again. */
878 			spin_unlock_bh(&ct->lock);
879 
880 			/* Only repeat if we can actually remove the timer.
881 			 * Destruction may already be in progress in process
882 			 * context and we must give it a chance to terminate.
883 			 */
884 			if (nf_ct_kill(ct))
885 				return -NF_REPEAT;
886 			return NF_DROP;
887 		}
888 		fallthrough;
889 	case TCP_CONNTRACK_IGNORE:
890 		/* Ignored packets:
891 		 *
892 		 * Our connection entry may be out of sync, so ignore
893 		 * packets which may signal the real connection between
894 		 * the client and the server.
895 		 *
896 		 * a) SYN in ORIGINAL
897 		 * b) SYN/ACK in REPLY
898 		 * c) ACK in reply direction after initial SYN in original.
899 		 *
900 		 * If the ignored packet is invalid, the receiver will send
901 		 * a RST we'll catch below.
902 		 */
903 		if (index == TCP_SYNACK_SET
904 		    && ct->proto.tcp.last_index == TCP_SYN_SET
905 		    && ct->proto.tcp.last_dir != dir
906 		    && ntohl(th->ack_seq) == ct->proto.tcp.last_end) {
907 			/* b) This SYN/ACK acknowledges a SYN that we earlier
908 			 * ignored as invalid. This means that the client and
909 			 * the server are both in sync, while the firewall is
910 			 * not. We get in sync from the previously annotated
911 			 * values.
912 			 */
913 			old_state = TCP_CONNTRACK_SYN_SENT;
914 			new_state = TCP_CONNTRACK_SYN_RECV;
915 			ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_end =
916 				ct->proto.tcp.last_end;
917 			ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_maxend =
918 				ct->proto.tcp.last_end;
919 			ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_maxwin =
920 				ct->proto.tcp.last_win == 0 ?
921 					1 : ct->proto.tcp.last_win;
922 			ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_scale =
923 				ct->proto.tcp.last_wscale;
924 			ct->proto.tcp.last_flags &= ~IP_CT_EXP_CHALLENGE_ACK;
925 			ct->proto.tcp.seen[ct->proto.tcp.last_dir].flags =
926 				ct->proto.tcp.last_flags;
927 			nf_ct_tcp_state_reset(&ct->proto.tcp.seen[dir]);
928 			break;
929 		}
930 		ct->proto.tcp.last_index = index;
931 		ct->proto.tcp.last_dir = dir;
932 		ct->proto.tcp.last_seq = ntohl(th->seq);
933 		ct->proto.tcp.last_end =
934 		    segment_seq_plus_len(ntohl(th->seq), skb->len, dataoff, th);
935 		ct->proto.tcp.last_win = ntohs(th->window);
936 
937 		/* a) This is a SYN in ORIGINAL. The client and the server
938 		 * may be in sync but we are not. In that case, we annotate
939 		 * the TCP options and let the packet go through. If it is a
940 		 * valid SYN packet, the server will reply with a SYN/ACK, and
941 		 * then we'll get in sync. Otherwise, the server potentially
942 		 * responds with a challenge ACK if implementing RFC5961.
943 		 */
944 		if (index == TCP_SYN_SET && dir == IP_CT_DIR_ORIGINAL) {
945 			struct ip_ct_tcp_state seen = {};
946 
947 			ct->proto.tcp.last_flags =
948 			ct->proto.tcp.last_wscale = 0;
949 			tcp_options(skb, dataoff, th, &seen);
950 			if (seen.flags & IP_CT_TCP_FLAG_WINDOW_SCALE) {
951 				ct->proto.tcp.last_flags |=
952 					IP_CT_TCP_FLAG_WINDOW_SCALE;
953 				ct->proto.tcp.last_wscale = seen.td_scale;
954 			}
955 			if (seen.flags & IP_CT_TCP_FLAG_SACK_PERM) {
956 				ct->proto.tcp.last_flags |=
957 					IP_CT_TCP_FLAG_SACK_PERM;
958 			}
959 			/* Mark the potential for an RFC5961 challenge ACK;
960 			 * this poses a special problem for the LAST_ACK state,
961 			 * as the ACK is interpreted as ACKing the last FIN.
962 			 */
963 			if (old_state == TCP_CONNTRACK_LAST_ACK)
964 				ct->proto.tcp.last_flags |=
965 					IP_CT_EXP_CHALLENGE_ACK;
966 		}
967 		spin_unlock_bh(&ct->lock);
968 		nf_ct_l4proto_log_invalid(skb, ct, state,
969 					  "packet (index %d) in dir %d ignored, state %s",
970 					  index, dir,
971 					  tcp_conntrack_names[old_state]);
972 		return NF_ACCEPT;
973 	case TCP_CONNTRACK_MAX:
974 		/* Special case for SYN proxy: when the SYN to the server or
975 		 * the SYN/ACK from the server is lost, the client may transmit
976 		 * a keep-alive packet while in SYN_SENT state. This needs to
977 		 * be associated with the original conntrack entry in order to
978 		 * generate a new SYN with the correct sequence number.
979 		 */
980 		if (nfct_synproxy(ct) && old_state == TCP_CONNTRACK_SYN_SENT &&
981 		    index == TCP_ACK_SET && dir == IP_CT_DIR_ORIGINAL &&
982 		    ct->proto.tcp.last_dir == IP_CT_DIR_ORIGINAL &&
983 		    ct->proto.tcp.seen[dir].td_end - 1 == ntohl(th->seq)) {
984 			pr_debug("nf_ct_tcp: SYN proxy client keep alive\n");
985 			spin_unlock_bh(&ct->lock);
986 			return NF_ACCEPT;
987 		}
988 
989 		/* Invalid packet */
990 		spin_unlock_bh(&ct->lock);
991 		nf_ct_l4proto_log_invalid(skb, ct, state,
992 					  "packet (index %d) in dir %d invalid, state %s",
993 					  index, dir,
994 					  tcp_conntrack_names[old_state]);
995 		return -NF_ACCEPT;
996 	case TCP_CONNTRACK_TIME_WAIT:
997 		/* RFC5961 compliance causes stacks to send a "challenge ACK",
998 		 * e.g. in response to spurious SYNs.  Conntrack MUST
999 		 * not believe this ACK is acking the last FIN.
1000 		 */
1001 		if (old_state == TCP_CONNTRACK_LAST_ACK &&
1002 		    index == TCP_ACK_SET &&
1003 		    ct->proto.tcp.last_dir != dir &&
1004 		    ct->proto.tcp.last_index == TCP_SYN_SET &&
1005 		    (ct->proto.tcp.last_flags & IP_CT_EXP_CHALLENGE_ACK)) {
1006 			/* Detected RFC5961 challenge ACK */
1007 			ct->proto.tcp.last_flags &= ~IP_CT_EXP_CHALLENGE_ACK;
1008 			spin_unlock_bh(&ct->lock);
1009 			nf_ct_l4proto_log_invalid(skb, ct, state, "challenge-ack ignored");
1010 			return NF_ACCEPT; /* Don't change state */
1011 		}
1012 		break;
1013 	case TCP_CONNTRACK_SYN_SENT2:
1014 		/* tcp_conntracks table is not smart enough to handle
1015 		 * simultaneous open.
1016 		 */
1017 		ct->proto.tcp.last_flags |= IP_CT_TCP_SIMULTANEOUS_OPEN;
1018 		break;
1019 	case TCP_CONNTRACK_SYN_RECV:
1020 		if (dir == IP_CT_DIR_REPLY && index == TCP_ACK_SET &&
1021 		    ct->proto.tcp.last_flags & IP_CT_TCP_SIMULTANEOUS_OPEN)
1022 			new_state = TCP_CONNTRACK_ESTABLISHED;
1023 		break;
1024 	case TCP_CONNTRACK_CLOSE:
1025 		if (index != TCP_RST_SET)
1026 			break;
1027 
1028 		/* If we are closing, tuple might have been re-used already.
1029 		 * last_index, last_ack, and all other ct fields used for
1030 		 * sequence/window validation are outdated in that case.
1031 		 *
1032 		 * As the conntrack can already be expired by GC under pressure,
1033 		 * just skip validation checks.
1034 		 */
1035 		if (tcp_can_early_drop(ct))
1036 			goto in_window;
1037 
1038 		/* td_maxack might be outdated if we let a SYN through earlier */
1039 		if ((ct->proto.tcp.seen[!dir].flags & IP_CT_TCP_FLAG_MAXACK_SET) &&
1040 		    ct->proto.tcp.last_index != TCP_SYN_SET) {
1041 			u32 seq = ntohl(th->seq);
1042 
1043 			/* If we are not in established state and SEQ=0 this is most
1044 			 * likely an answer to a SYN we let go through above (last_index
1045 			 * can be updated due to out-of-order ACKs).
1046 			 */
1047 			if (seq == 0 && !nf_conntrack_tcp_established(ct))
1048 				break;
1049 
1050 			if (before(seq, ct->proto.tcp.seen[!dir].td_maxack) &&
1051 			    !tn->tcp_ignore_invalid_rst) {
1052 				/* Invalid RST  */
1053 				spin_unlock_bh(&ct->lock);
1054 				nf_ct_l4proto_log_invalid(skb, ct, state, "invalid rst");
1055 				return -NF_ACCEPT;
1056 			}
1057 
1058 			if (!nf_conntrack_tcp_established(ct) ||
1059 			    seq == ct->proto.tcp.seen[!dir].td_maxack)
1060 				break;
1061 
1062 			/* Check if rst is part of train, such as
1063 			 *   foo:80 > bar:4379: P, 235946583:235946602(19) ack 42
1064 			 *   foo:80 > bar:4379: R, 235946602:235946602(0)  ack 42
1065 			 */
1066 			if (ct->proto.tcp.last_index == TCP_ACK_SET &&
1067 			    ct->proto.tcp.last_dir == dir &&
1068 			    seq == ct->proto.tcp.last_end)
1069 				break;
1070 
1071 			/* ... RST sequence number doesn't match exactly, keep
1072 			 * established state to allow a possible challenge ACK.
1073 			 */
1074 			new_state = old_state;
1075 		}
1076 		if (((test_bit(IPS_SEEN_REPLY_BIT, &ct->status)
1077 			 && ct->proto.tcp.last_index == TCP_SYN_SET)
1078 			|| (!test_bit(IPS_ASSURED_BIT, &ct->status)
1079 			    && ct->proto.tcp.last_index == TCP_ACK_SET))
1080 		    && ntohl(th->ack_seq) == ct->proto.tcp.last_end) {
1081 			/* RST sent to invalid SYN or ACK we had let through
1082 			 * at a) and c) above:
1083 			 *
1084 			 * a) SYN was in window then
1085 			 * c) we hold a half-open connection.
1086 			 *
1087 			 * Delete our connection entry.
1088 			 * We skip window checking, because packet might ACK
1089 			 * segments we ignored. */
1090 			goto in_window;
1091 		}
1092 		break;
1093 	default:
1094 		/* Keep compilers happy. */
1095 		break;
1096 	}
1097 
1098 	if (!tcp_in_window(ct, dir, index,
1099 			   skb, dataoff, th, state)) {
1100 		spin_unlock_bh(&ct->lock);
1101 		return -NF_ACCEPT;
1102 	}
1103      in_window:
1104 	/* From now on we have got in-window packets */
1105 	ct->proto.tcp.last_index = index;
1106 	ct->proto.tcp.last_dir = dir;
1107 
1108 	pr_debug("tcp_conntracks: ");
1109 	nf_ct_dump_tuple(tuple);
1110 	pr_debug("syn=%i ack=%i fin=%i rst=%i old=%i new=%i\n",
1111 		 (th->syn ? 1 : 0), (th->ack ? 1 : 0),
1112 		 (th->fin ? 1 : 0), (th->rst ? 1 : 0),
1113 		 old_state, new_state);
1114 
1115 	ct->proto.tcp.state = new_state;
1116 	if (old_state != new_state
1117 	    && new_state == TCP_CONNTRACK_FIN_WAIT)
1118 		ct->proto.tcp.seen[dir].flags |= IP_CT_TCP_FLAG_CLOSE_INIT;
1119 
1120 	timeouts = nf_ct_timeout_lookup(ct);
1121 	if (!timeouts)
1122 		timeouts = tn->timeouts;
1123 
1124 	if (ct->proto.tcp.retrans >= tn->tcp_max_retrans &&
1125 	    timeouts[new_state] > timeouts[TCP_CONNTRACK_RETRANS])
1126 		timeout = timeouts[TCP_CONNTRACK_RETRANS];
1127 	else if (unlikely(index == TCP_RST_SET))
1128 		timeout = timeouts[TCP_CONNTRACK_CLOSE];
1129 	else if ((ct->proto.tcp.seen[0].flags | ct->proto.tcp.seen[1].flags) &
1130 		 IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED &&
1131 		 timeouts[new_state] > timeouts[TCP_CONNTRACK_UNACK])
1132 		timeout = timeouts[TCP_CONNTRACK_UNACK];
1133 	else if (ct->proto.tcp.last_win == 0 &&
1134 		 timeouts[new_state] > timeouts[TCP_CONNTRACK_RETRANS])
1135 		timeout = timeouts[TCP_CONNTRACK_RETRANS];
1136 	else
1137 		timeout = timeouts[new_state];
1138 	spin_unlock_bh(&ct->lock);
1139 
1140 	if (new_state != old_state)
1141 		nf_conntrack_event_cache(IPCT_PROTOINFO, ct);
1142 
1143 	if (!test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
1144 		/* If the only reply is a RST, we can consider ourselves not to
1145 		   have an established connection: this is a fairly common
1146 		   problem case, so we can delete the conntrack
1147 		   immediately.  --RR */
1148 		if (th->rst) {
1149 			nf_ct_kill_acct(ct, ctinfo, skb);
1150 			return NF_ACCEPT;
1151 		}
1152 
1153 		if (index == TCP_SYN_SET && old_state == TCP_CONNTRACK_SYN_SENT) {
1154 			/* do not renew timeout on SYN retransmit.
1155 			 *
1156 			 * Else port reuse by client or NAT middlebox can keep
1157 			 * entry alive indefinitely (including nat info).
1158 			 */
1159 			return NF_ACCEPT;
1160 		}
1161 
1162 		/* ESTABLISHED without SEEN_REPLY, i.e. mid-connection
1163 		 * pickup with loose=1. Avoid large ESTABLISHED timeout.
1164 		 */
1165 		if (new_state == TCP_CONNTRACK_ESTABLISHED &&
1166 		    timeout > timeouts[TCP_CONNTRACK_UNACK])
1167 			timeout = timeouts[TCP_CONNTRACK_UNACK];
1168 	} else if (!test_bit(IPS_ASSURED_BIT, &ct->status)
1169 		   && (old_state == TCP_CONNTRACK_SYN_RECV
1170 		       || old_state == TCP_CONNTRACK_ESTABLISHED)
1171 		   && new_state == TCP_CONNTRACK_ESTABLISHED) {
1172 		/* Set ASSURED if we see valid ack in ESTABLISHED
1173 		   after SYN_RECV or a valid answer for a picked up
1174 		   connection. */
1175 		set_bit(IPS_ASSURED_BIT, &ct->status);
1176 		nf_conntrack_event_cache(IPCT_ASSURED, ct);
1177 	}
1178 	nf_ct_refresh_acct(ct, ctinfo, skb, timeout);
1179 
1180 	return NF_ACCEPT;
1181 }
1182 
1183 #if IS_ENABLED(CONFIG_NF_CT_NETLINK)
1184 
1185 #include <linux/netfilter/nfnetlink.h>
1186 #include <linux/netfilter/nfnetlink_conntrack.h>
1187 
1188 static int tcp_to_nlattr(struct sk_buff *skb, struct nlattr *nla,
1189 			 struct nf_conn *ct, bool destroy)
1190 {
1191 	struct nlattr *nest_parms;
1192 	struct nf_ct_tcp_flags tmp = {};
1193 
1194 	spin_lock_bh(&ct->lock);
1195 	nest_parms = nla_nest_start(skb, CTA_PROTOINFO_TCP);
1196 	if (!nest_parms)
1197 		goto nla_put_failure;
1198 
1199 	if (nla_put_u8(skb, CTA_PROTOINFO_TCP_STATE, ct->proto.tcp.state))
1200 		goto nla_put_failure;
1201 
1202 	if (destroy)
1203 		goto skip_state;
1204 
1205 	if (nla_put_u8(skb, CTA_PROTOINFO_TCP_WSCALE_ORIGINAL,
1206 		       ct->proto.tcp.seen[0].td_scale) ||
1207 	    nla_put_u8(skb, CTA_PROTOINFO_TCP_WSCALE_REPLY,
1208 		       ct->proto.tcp.seen[1].td_scale))
1209 		goto nla_put_failure;
1210 
1211 	tmp.flags = ct->proto.tcp.seen[0].flags;
1212 	if (nla_put(skb, CTA_PROTOINFO_TCP_FLAGS_ORIGINAL,
1213 		    sizeof(struct nf_ct_tcp_flags), &tmp))
1214 		goto nla_put_failure;
1215 
1216 	tmp.flags = ct->proto.tcp.seen[1].flags;
1217 	if (nla_put(skb, CTA_PROTOINFO_TCP_FLAGS_REPLY,
1218 		    sizeof(struct nf_ct_tcp_flags), &tmp))
1219 		goto nla_put_failure;
1220 skip_state:
1221 	spin_unlock_bh(&ct->lock);
1222 	nla_nest_end(skb, nest_parms);
1223 
1224 	return 0;
1225 
1226 nla_put_failure:
1227 	spin_unlock_bh(&ct->lock);
1228 	return -1;
1229 }
1230 
1231 static const struct nla_policy tcp_nla_policy[CTA_PROTOINFO_TCP_MAX+1] = {
1232 	[CTA_PROTOINFO_TCP_STATE]	    = { .type = NLA_U8 },
1233 	[CTA_PROTOINFO_TCP_WSCALE_ORIGINAL] = { .type = NLA_U8 },
1234 	[CTA_PROTOINFO_TCP_WSCALE_REPLY]    = { .type = NLA_U8 },
1235 	[CTA_PROTOINFO_TCP_FLAGS_ORIGINAL]  = { .len = sizeof(struct nf_ct_tcp_flags) },
1236 	[CTA_PROTOINFO_TCP_FLAGS_REPLY]	    = { .len = sizeof(struct nf_ct_tcp_flags) },
1237 };
1238 
1239 #define TCP_NLATTR_SIZE	( \
1240 	NLA_ALIGN(NLA_HDRLEN + 1) + \
1241 	NLA_ALIGN(NLA_HDRLEN + 1) + \
1242 	NLA_ALIGN(NLA_HDRLEN + sizeof(struct nf_ct_tcp_flags)) + \
1243 	NLA_ALIGN(NLA_HDRLEN + sizeof(struct nf_ct_tcp_flags)))
1244 
1245 static int nlattr_to_tcp(struct nlattr *cda[], struct nf_conn *ct)
1246 {
1247 	struct nlattr *pattr = cda[CTA_PROTOINFO_TCP];
1248 	struct nlattr *tb[CTA_PROTOINFO_TCP_MAX+1];
1249 	int err;
1250 
1251 	/* updates might not contain anything about the private
1252 	 * protocol info; in that case skip the parsing */
1253 	if (!pattr)
1254 		return 0;
1255 
1256 	err = nla_parse_nested_deprecated(tb, CTA_PROTOINFO_TCP_MAX, pattr,
1257 					  tcp_nla_policy, NULL);
1258 	if (err < 0)
1259 		return err;
1260 
1261 	if (tb[CTA_PROTOINFO_TCP_STATE] &&
1262 	    nla_get_u8(tb[CTA_PROTOINFO_TCP_STATE]) >= TCP_CONNTRACK_MAX)
1263 		return -EINVAL;
1264 
1265 	spin_lock_bh(&ct->lock);
1266 	if (tb[CTA_PROTOINFO_TCP_STATE])
1267 		ct->proto.tcp.state = nla_get_u8(tb[CTA_PROTOINFO_TCP_STATE]);
1268 
1269 	if (tb[CTA_PROTOINFO_TCP_FLAGS_ORIGINAL]) {
1270 		struct nf_ct_tcp_flags *attr =
1271 			nla_data(tb[CTA_PROTOINFO_TCP_FLAGS_ORIGINAL]);
1272 		ct->proto.tcp.seen[0].flags &= ~attr->mask;
1273 		ct->proto.tcp.seen[0].flags |= attr->flags & attr->mask;
1274 	}
1275 
1276 	if (tb[CTA_PROTOINFO_TCP_FLAGS_REPLY]) {
1277 		struct nf_ct_tcp_flags *attr =
1278 			nla_data(tb[CTA_PROTOINFO_TCP_FLAGS_REPLY]);
1279 		ct->proto.tcp.seen[1].flags &= ~attr->mask;
1280 		ct->proto.tcp.seen[1].flags |= attr->flags & attr->mask;
1281 	}
1282 
1283 	if (tb[CTA_PROTOINFO_TCP_WSCALE_ORIGINAL] &&
1284 	    tb[CTA_PROTOINFO_TCP_WSCALE_REPLY] &&
1285 	    ct->proto.tcp.seen[0].flags & IP_CT_TCP_FLAG_WINDOW_SCALE &&
1286 	    ct->proto.tcp.seen[1].flags & IP_CT_TCP_FLAG_WINDOW_SCALE) {
1287 		ct->proto.tcp.seen[0].td_scale =
1288 			nla_get_u8(tb[CTA_PROTOINFO_TCP_WSCALE_ORIGINAL]);
1289 		ct->proto.tcp.seen[1].td_scale =
1290 			nla_get_u8(tb[CTA_PROTOINFO_TCP_WSCALE_REPLY]);
1291 	}
1292 	spin_unlock_bh(&ct->lock);
1293 
1294 	return 0;
1295 }
1296 
1297 static unsigned int tcp_nlattr_tuple_size(void)
1298 {
1299 	static unsigned int size __read_mostly;
1300 
1301 	if (!size)
1302 		size = nla_policy_len(nf_ct_port_nla_policy, CTA_PROTO_MAX + 1);
1303 
1304 	return size;
1305 }
1306 #endif
1307 
1308 #ifdef CONFIG_NF_CONNTRACK_TIMEOUT
1309 
1310 #include <linux/netfilter/nfnetlink.h>
1311 #include <linux/netfilter/nfnetlink_cttimeout.h>
1312 
1313 static int tcp_timeout_nlattr_to_obj(struct nlattr *tb[],
1314 				     struct net *net, void *data)
1315 {
1316 	struct nf_tcp_net *tn = nf_tcp_pernet(net);
1317 	unsigned int *timeouts = data;
1318 	int i;
1319 
1320 	if (!timeouts)
1321 		timeouts = tn->timeouts;
1322 	/* set default TCP timeouts. */
1323 	for (i = 0; i < TCP_CONNTRACK_TIMEOUT_MAX; i++)
1324 		timeouts[i] = tn->timeouts[i];
1325 
1326 	if (tb[CTA_TIMEOUT_TCP_SYN_SENT]) {
1327 		timeouts[TCP_CONNTRACK_SYN_SENT] =
1328 			ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_SYN_SENT]))*HZ;
1329 	}
1330 
1331 	if (tb[CTA_TIMEOUT_TCP_SYN_RECV]) {
1332 		timeouts[TCP_CONNTRACK_SYN_RECV] =
1333 			ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_SYN_RECV]))*HZ;
1334 	}
1335 	if (tb[CTA_TIMEOUT_TCP_ESTABLISHED]) {
1336 		timeouts[TCP_CONNTRACK_ESTABLISHED] =
1337 			ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_ESTABLISHED]))*HZ;
1338 	}
1339 	if (tb[CTA_TIMEOUT_TCP_FIN_WAIT]) {
1340 		timeouts[TCP_CONNTRACK_FIN_WAIT] =
1341 			ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_FIN_WAIT]))*HZ;
1342 	}
1343 	if (tb[CTA_TIMEOUT_TCP_CLOSE_WAIT]) {
1344 		timeouts[TCP_CONNTRACK_CLOSE_WAIT] =
1345 			ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_CLOSE_WAIT]))*HZ;
1346 	}
1347 	if (tb[CTA_TIMEOUT_TCP_LAST_ACK]) {
1348 		timeouts[TCP_CONNTRACK_LAST_ACK] =
1349 			ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_LAST_ACK]))*HZ;
1350 	}
1351 	if (tb[CTA_TIMEOUT_TCP_TIME_WAIT]) {
1352 		timeouts[TCP_CONNTRACK_TIME_WAIT] =
1353 			ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_TIME_WAIT]))*HZ;
1354 	}
1355 	if (tb[CTA_TIMEOUT_TCP_CLOSE]) {
1356 		timeouts[TCP_CONNTRACK_CLOSE] =
1357 			ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_CLOSE]))*HZ;
1358 	}
1359 	if (tb[CTA_TIMEOUT_TCP_SYN_SENT2]) {
1360 		timeouts[TCP_CONNTRACK_SYN_SENT2] =
1361 			ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_SYN_SENT2]))*HZ;
1362 	}
1363 	if (tb[CTA_TIMEOUT_TCP_RETRANS]) {
1364 		timeouts[TCP_CONNTRACK_RETRANS] =
1365 			ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_RETRANS]))*HZ;
1366 	}
1367 	if (tb[CTA_TIMEOUT_TCP_UNACK]) {
1368 		timeouts[TCP_CONNTRACK_UNACK] =
1369 			ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_UNACK]))*HZ;
1370 	}
1371 
1372 	timeouts[CTA_TIMEOUT_TCP_UNSPEC] = timeouts[CTA_TIMEOUT_TCP_SYN_SENT];
1373 	return 0;
1374 }
1375 
1376 static int
1377 tcp_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data)
1378 {
1379 	const unsigned int *timeouts = data;
1380 
1381 	if (nla_put_be32(skb, CTA_TIMEOUT_TCP_SYN_SENT,
1382 			htonl(timeouts[TCP_CONNTRACK_SYN_SENT] / HZ)) ||
1383 	    nla_put_be32(skb, CTA_TIMEOUT_TCP_SYN_RECV,
1384 			 htonl(timeouts[TCP_CONNTRACK_SYN_RECV] / HZ)) ||
1385 	    nla_put_be32(skb, CTA_TIMEOUT_TCP_ESTABLISHED,
1386 			 htonl(timeouts[TCP_CONNTRACK_ESTABLISHED] / HZ)) ||
1387 	    nla_put_be32(skb, CTA_TIMEOUT_TCP_FIN_WAIT,
1388 			 htonl(timeouts[TCP_CONNTRACK_FIN_WAIT] / HZ)) ||
1389 	    nla_put_be32(skb, CTA_TIMEOUT_TCP_CLOSE_WAIT,
1390 			 htonl(timeouts[TCP_CONNTRACK_CLOSE_WAIT] / HZ)) ||
1391 	    nla_put_be32(skb, CTA_TIMEOUT_TCP_LAST_ACK,
1392 			 htonl(timeouts[TCP_CONNTRACK_LAST_ACK] / HZ)) ||
1393 	    nla_put_be32(skb, CTA_TIMEOUT_TCP_TIME_WAIT,
1394 			 htonl(timeouts[TCP_CONNTRACK_TIME_WAIT] / HZ)) ||
1395 	    nla_put_be32(skb, CTA_TIMEOUT_TCP_CLOSE,
1396 			 htonl(timeouts[TCP_CONNTRACK_CLOSE] / HZ)) ||
1397 	    nla_put_be32(skb, CTA_TIMEOUT_TCP_SYN_SENT2,
1398 			 htonl(timeouts[TCP_CONNTRACK_SYN_SENT2] / HZ)) ||
1399 	    nla_put_be32(skb, CTA_TIMEOUT_TCP_RETRANS,
1400 			 htonl(timeouts[TCP_CONNTRACK_RETRANS] / HZ)) ||
1401 	    nla_put_be32(skb, CTA_TIMEOUT_TCP_UNACK,
1402 			 htonl(timeouts[TCP_CONNTRACK_UNACK] / HZ)))
1403 		goto nla_put_failure;
1404 	return 0;
1405 
1406 nla_put_failure:
1407 	return -ENOSPC;
1408 }
1409 
1410 static const struct nla_policy tcp_timeout_nla_policy[CTA_TIMEOUT_TCP_MAX+1] = {
1411 	[CTA_TIMEOUT_TCP_SYN_SENT]	= { .type = NLA_U32 },
1412 	[CTA_TIMEOUT_TCP_SYN_RECV]	= { .type = NLA_U32 },
1413 	[CTA_TIMEOUT_TCP_ESTABLISHED]	= { .type = NLA_U32 },
1414 	[CTA_TIMEOUT_TCP_FIN_WAIT]	= { .type = NLA_U32 },
1415 	[CTA_TIMEOUT_TCP_CLOSE_WAIT]	= { .type = NLA_U32 },
1416 	[CTA_TIMEOUT_TCP_LAST_ACK]	= { .type = NLA_U32 },
1417 	[CTA_TIMEOUT_TCP_TIME_WAIT]	= { .type = NLA_U32 },
1418 	[CTA_TIMEOUT_TCP_CLOSE]		= { .type = NLA_U32 },
1419 	[CTA_TIMEOUT_TCP_SYN_SENT2]	= { .type = NLA_U32 },
1420 	[CTA_TIMEOUT_TCP_RETRANS]	= { .type = NLA_U32 },
1421 	[CTA_TIMEOUT_TCP_UNACK]		= { .type = NLA_U32 },
1422 };
1423 #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
1424 
1425 void nf_conntrack_tcp_init_net(struct net *net)
1426 {
1427 	struct nf_tcp_net *tn = nf_tcp_pernet(net);
1428 	int i;
1429 
1430 	for (i = 0; i < TCP_CONNTRACK_TIMEOUT_MAX; i++)
1431 		tn->timeouts[i] = tcp_timeouts[i];
1432 
1433 	/* timeouts[0] is unused, make it the same as SYN_SENT so that
1434 	 * ->timeouts[0] contains the 'new' timeout, like udp or icmp.
1435 	 */
1436 	tn->timeouts[0] = tcp_timeouts[TCP_CONNTRACK_SYN_SENT];
1437 
1438 	/* If it is set to zero, we disable picking up already established
1439 	 * connections.
1440 	 */
1441 	tn->tcp_loose = 1;
1442 
1443 	/* "Be conservative in what you do,
1444 	 *  be liberal in what you accept from others."
1445 	 * If it's non-zero, we mark only out of window RST segments as INVALID.
1446 	 */
1447 	tn->tcp_be_liberal = 0;
1448 
1449 	/* If it's non-zero, we turn off RST sequence number check */
1450 	tn->tcp_ignore_invalid_rst = 0;
1451 
1452 	/* Max number of retransmitted packets without receiving an (acceptable)
1453 	 * ACK from the destination. If this number is reached, a shorter timer
1454 	 * will be started.
1455 	 */
1456 	tn->tcp_max_retrans = 3;
1457 
1458 #if IS_ENABLED(CONFIG_NF_FLOW_TABLE)
1459 	tn->offload_timeout = 30 * HZ;
1460 #endif
1461 }
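/* The per-netns defaults set above are normally exposed as sysctls by
 * nf_conntrack_standalone.c and can be tuned at runtime, e.g. (a sketch,
 * exact availability depends on the kernel configuration):
 *
 *	sysctl -w net.netfilter.nf_conntrack_tcp_loose=0
 *	sysctl -w net.netfilter.nf_conntrack_tcp_be_liberal=1
 *	sysctl -w net.netfilter.nf_conntrack_tcp_ignore_invalid_rst=1
 *	sysctl -w net.netfilter.nf_conntrack_tcp_max_retrans=5
 */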
1462 
1463 const struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp =
1464 {
1465 	.l4proto 		= IPPROTO_TCP,
1466 #ifdef CONFIG_NF_CONNTRACK_PROCFS
1467 	.print_conntrack 	= tcp_print_conntrack,
1468 #endif
1469 	.can_early_drop		= tcp_can_early_drop,
1470 #if IS_ENABLED(CONFIG_NF_CT_NETLINK)
1471 	.to_nlattr		= tcp_to_nlattr,
1472 	.from_nlattr		= nlattr_to_tcp,
1473 	.tuple_to_nlattr	= nf_ct_port_tuple_to_nlattr,
1474 	.nlattr_to_tuple	= nf_ct_port_nlattr_to_tuple,
1475 	.nlattr_tuple_size	= tcp_nlattr_tuple_size,
1476 	.nlattr_size		= TCP_NLATTR_SIZE,
1477 	.nla_policy		= nf_ct_port_nla_policy,
1478 #endif
1479 #ifdef CONFIG_NF_CONNTRACK_TIMEOUT
1480 	.ctnl_timeout		= {
1481 		.nlattr_to_obj	= tcp_timeout_nlattr_to_obj,
1482 		.obj_to_nlattr	= tcp_timeout_obj_to_nlattr,
1483 		.nlattr_max	= CTA_TIMEOUT_TCP_MAX,
1484 		.obj_size	= sizeof(unsigned int) *
1485 					TCP_CONNTRACK_TIMEOUT_MAX,
1486 		.nla_policy	= tcp_timeout_nla_policy,
1487 	},
1488 #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
1489 };
1490