1 // SPDX-License-Identifier: GPL-2.0-only 2 /* (C) 1999-2001 Paul `Rusty' Russell 3 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> 4 * (C) 2002-2013 Jozsef Kadlecsik <kadlec@netfilter.org> 5 * (C) 2006-2012 Patrick McHardy <kaber@trash.net> 6 */ 7 8 #include <linux/types.h> 9 #include <linux/timer.h> 10 #include <linux/module.h> 11 #include <linux/in.h> 12 #include <linux/tcp.h> 13 #include <linux/spinlock.h> 14 #include <linux/skbuff.h> 15 #include <linux/ipv6.h> 16 #include <net/ip6_checksum.h> 17 #include <asm/unaligned.h> 18 19 #include <net/tcp.h> 20 21 #include <linux/netfilter.h> 22 #include <linux/netfilter_ipv4.h> 23 #include <linux/netfilter_ipv6.h> 24 #include <net/netfilter/nf_conntrack.h> 25 #include <net/netfilter/nf_conntrack_l4proto.h> 26 #include <net/netfilter/nf_conntrack_ecache.h> 27 #include <net/netfilter/nf_conntrack_seqadj.h> 28 #include <net/netfilter/nf_conntrack_synproxy.h> 29 #include <net/netfilter/nf_conntrack_timeout.h> 30 #include <net/netfilter/nf_log.h> 31 #include <net/netfilter/ipv4/nf_conntrack_ipv4.h> 32 #include <net/netfilter/ipv6/nf_conntrack_ipv6.h> 33 34 /* FIXME: Examine ipfilter's timeouts and conntrack transitions more 35 closely. They're more complex. --RR */ 36 37 static const char *const tcp_conntrack_names[] = { 38 "NONE", 39 "SYN_SENT", 40 "SYN_RECV", 41 "ESTABLISHED", 42 "FIN_WAIT", 43 "CLOSE_WAIT", 44 "LAST_ACK", 45 "TIME_WAIT", 46 "CLOSE", 47 "SYN_SENT2", 48 }; 49 50 #define SECS * HZ 51 #define MINS * 60 SECS 52 #define HOURS * 60 MINS 53 #define DAYS * 24 HOURS 54 55 static const unsigned int tcp_timeouts[TCP_CONNTRACK_TIMEOUT_MAX] = { 56 [TCP_CONNTRACK_SYN_SENT] = 2 MINS, 57 [TCP_CONNTRACK_SYN_RECV] = 60 SECS, 58 [TCP_CONNTRACK_ESTABLISHED] = 5 DAYS, 59 [TCP_CONNTRACK_FIN_WAIT] = 2 MINS, 60 [TCP_CONNTRACK_CLOSE_WAIT] = 60 SECS, 61 [TCP_CONNTRACK_LAST_ACK] = 30 SECS, 62 [TCP_CONNTRACK_TIME_WAIT] = 2 MINS, 63 [TCP_CONNTRACK_CLOSE] = 10 SECS, 64 [TCP_CONNTRACK_SYN_SENT2] = 2 MINS, 65 /* RFC1122 says the R2 limit should be at least 100 seconds. 66 Linux uses 15 packets as limit, which corresponds 67 to ~13-30min depending on RTO. */ 68 [TCP_CONNTRACK_RETRANS] = 5 MINS, 69 [TCP_CONNTRACK_UNACK] = 5 MINS, 70 }; 71 72 #define sNO TCP_CONNTRACK_NONE 73 #define sSS TCP_CONNTRACK_SYN_SENT 74 #define sSR TCP_CONNTRACK_SYN_RECV 75 #define sES TCP_CONNTRACK_ESTABLISHED 76 #define sFW TCP_CONNTRACK_FIN_WAIT 77 #define sCW TCP_CONNTRACK_CLOSE_WAIT 78 #define sLA TCP_CONNTRACK_LAST_ACK 79 #define sTW TCP_CONNTRACK_TIME_WAIT 80 #define sCL TCP_CONNTRACK_CLOSE 81 #define sS2 TCP_CONNTRACK_SYN_SENT2 82 #define sIV TCP_CONNTRACK_MAX 83 #define sIG TCP_CONNTRACK_IGNORE 84 85 /* What TCP flags are set from RST/SYN/FIN/ACK. */ 86 enum tcp_bit_set { 87 TCP_SYN_SET, 88 TCP_SYNACK_SET, 89 TCP_FIN_SET, 90 TCP_ACK_SET, 91 TCP_RST_SET, 92 TCP_NONE_SET, 93 }; 94 95 /* 96 * The TCP state transition table needs a few words... 97 * 98 * We are the man in the middle. All the packets go through us 99 * but might get lost in transit to the destination. 100 * It is assumed that the destinations can't receive segments 101 * we haven't seen. 102 * 103 * The checked segment is in window, but our windows are *not* 104 * equivalent with the ones of the sender/receiver. We always 105 * try to guess the state of the current sender. 106 * 107 * The meaning of the states are: 108 * 109 * NONE: initial state 110 * SYN_SENT: SYN-only packet seen 111 * SYN_SENT2: SYN-only packet seen from reply dir, simultaneous open 112 * SYN_RECV: SYN-ACK packet seen 113 * ESTABLISHED: ACK packet seen 114 * FIN_WAIT: FIN packet seen 115 * CLOSE_WAIT: ACK seen (after FIN) 116 * LAST_ACK: FIN seen (after FIN) 117 * TIME_WAIT: last ACK seen 118 * CLOSE: closed connection (RST) 119 * 120 * Packets marked as IGNORED (sIG): 121 * if they may be either invalid or valid 122 * and the receiver may send back a connection 123 * closing RST or a SYN/ACK. 124 * 125 * Packets marked as INVALID (sIV): 126 * if we regard them as truly invalid packets 127 */ 128 static const u8 tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = { 129 { 130 /* ORIGINAL */ 131 /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ 132 /*syn*/ { sSS, sSS, sIG, sIG, sIG, sIG, sIG, sSS, sSS, sS2 }, 133 /* 134 * sNO -> sSS Initialize a new connection 135 * sSS -> sSS Retransmitted SYN 136 * sS2 -> sS2 Late retransmitted SYN 137 * sSR -> sIG 138 * sES -> sIG Error: SYNs in window outside the SYN_SENT state 139 * are errors. Receiver will reply with RST 140 * and close the connection. 141 * Or we are not in sync and hold a dead connection. 142 * sFW -> sIG 143 * sCW -> sIG 144 * sLA -> sIG 145 * sTW -> sSS Reopened connection (RFC 1122). 146 * sCL -> sSS 147 */ 148 /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ 149 /*synack*/ { sIV, sIV, sSR, sIV, sIV, sIV, sIV, sIV, sIV, sSR }, 150 /* 151 * sNO -> sIV Too late and no reason to do anything 152 * sSS -> sIV Client can't send SYN and then SYN/ACK 153 * sS2 -> sSR SYN/ACK sent to SYN2 in simultaneous open 154 * sSR -> sSR Late retransmitted SYN/ACK in simultaneous open 155 * sES -> sIV Invalid SYN/ACK packets sent by the client 156 * sFW -> sIV 157 * sCW -> sIV 158 * sLA -> sIV 159 * sTW -> sIV 160 * sCL -> sIV 161 */ 162 /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ 163 /*fin*/ { sIV, sIV, sFW, sFW, sLA, sLA, sLA, sTW, sCL, sIV }, 164 /* 165 * sNO -> sIV Too late and no reason to do anything... 166 * sSS -> sIV Client migth not send FIN in this state: 167 * we enforce waiting for a SYN/ACK reply first. 168 * sS2 -> sIV 169 * sSR -> sFW Close started. 170 * sES -> sFW 171 * sFW -> sLA FIN seen in both directions, waiting for 172 * the last ACK. 173 * Migth be a retransmitted FIN as well... 174 * sCW -> sLA 175 * sLA -> sLA Retransmitted FIN. Remain in the same state. 176 * sTW -> sTW 177 * sCL -> sCL 178 */ 179 /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ 180 /*ack*/ { sES, sIV, sES, sES, sCW, sCW, sTW, sTW, sCL, sIV }, 181 /* 182 * sNO -> sES Assumed. 183 * sSS -> sIV ACK is invalid: we haven't seen a SYN/ACK yet. 184 * sS2 -> sIV 185 * sSR -> sES Established state is reached. 186 * sES -> sES :-) 187 * sFW -> sCW Normal close request answered by ACK. 188 * sCW -> sCW 189 * sLA -> sTW Last ACK detected (RFC5961 challenged) 190 * sTW -> sTW Retransmitted last ACK. Remain in the same state. 191 * sCL -> sCL 192 */ 193 /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ 194 /*rst*/ { sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL }, 195 /*none*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV } 196 }, 197 { 198 /* REPLY */ 199 /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ 200 /*syn*/ { sIV, sS2, sIV, sIV, sIV, sIV, sIV, sSS, sIV, sS2 }, 201 /* 202 * sNO -> sIV Never reached. 203 * sSS -> sS2 Simultaneous open 204 * sS2 -> sS2 Retransmitted simultaneous SYN 205 * sSR -> sIV Invalid SYN packets sent by the server 206 * sES -> sIV 207 * sFW -> sIV 208 * sCW -> sIV 209 * sLA -> sIV 210 * sTW -> sSS Reopened connection, but server may have switched role 211 * sCL -> sIV 212 */ 213 /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ 214 /*synack*/ { sIV, sSR, sIG, sIG, sIG, sIG, sIG, sIG, sIG, sSR }, 215 /* 216 * sSS -> sSR Standard open. 217 * sS2 -> sSR Simultaneous open 218 * sSR -> sIG Retransmitted SYN/ACK, ignore it. 219 * sES -> sIG Late retransmitted SYN/ACK? 220 * sFW -> sIG Might be SYN/ACK answering ignored SYN 221 * sCW -> sIG 222 * sLA -> sIG 223 * sTW -> sIG 224 * sCL -> sIG 225 */ 226 /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ 227 /*fin*/ { sIV, sIV, sFW, sFW, sLA, sLA, sLA, sTW, sCL, sIV }, 228 /* 229 * sSS -> sIV Server might not send FIN in this state. 230 * sS2 -> sIV 231 * sSR -> sFW Close started. 232 * sES -> sFW 233 * sFW -> sLA FIN seen in both directions. 234 * sCW -> sLA 235 * sLA -> sLA Retransmitted FIN. 236 * sTW -> sTW 237 * sCL -> sCL 238 */ 239 /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ 240 /*ack*/ { sIV, sIG, sSR, sES, sCW, sCW, sTW, sTW, sCL, sIG }, 241 /* 242 * sSS -> sIG Might be a half-open connection. 243 * sS2 -> sIG 244 * sSR -> sSR Might answer late resent SYN. 245 * sES -> sES :-) 246 * sFW -> sCW Normal close request answered by ACK. 247 * sCW -> sCW 248 * sLA -> sTW Last ACK detected (RFC5961 challenged) 249 * sTW -> sTW Retransmitted last ACK. 250 * sCL -> sCL 251 */ 252 /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ 253 /*rst*/ { sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL }, 254 /*none*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV } 255 } 256 }; 257 258 #ifdef CONFIG_NF_CONNTRACK_PROCFS 259 /* Print out the private part of the conntrack. */ 260 static void tcp_print_conntrack(struct seq_file *s, struct nf_conn *ct) 261 { 262 if (test_bit(IPS_OFFLOAD_BIT, &ct->status)) 263 return; 264 265 seq_printf(s, "%s ", tcp_conntrack_names[ct->proto.tcp.state]); 266 } 267 #endif 268 269 static unsigned int get_conntrack_index(const struct tcphdr *tcph) 270 { 271 if (tcph->rst) return TCP_RST_SET; 272 else if (tcph->syn) return (tcph->ack ? TCP_SYNACK_SET : TCP_SYN_SET); 273 else if (tcph->fin) return TCP_FIN_SET; 274 else if (tcph->ack) return TCP_ACK_SET; 275 else return TCP_NONE_SET; 276 } 277 278 /* TCP connection tracking based on 'Real Stateful TCP Packet Filtering 279 in IP Filter' by Guido van Rooij. 280 281 http://www.sane.nl/events/sane2000/papers.html 282 http://www.darkart.com/mirrors/www.obfuscation.org/ipf/ 283 284 The boundaries and the conditions are changed according to RFC793: 285 the packet must intersect the window (i.e. segments may be 286 after the right or before the left edge) and thus receivers may ACK 287 segments after the right edge of the window. 288 289 td_maxend = max(sack + max(win,1)) seen in reply packets 290 td_maxwin = max(max(win, 1)) + (sack - ack) seen in sent packets 291 td_maxwin += seq + len - sender.td_maxend 292 if seq + len > sender.td_maxend 293 td_end = max(seq + len) seen in sent packets 294 295 I. Upper bound for valid data: seq <= sender.td_maxend 296 II. Lower bound for valid data: seq + len >= sender.td_end - receiver.td_maxwin 297 III. Upper bound for valid (s)ack: sack <= receiver.td_end 298 IV. Lower bound for valid (s)ack: sack >= receiver.td_end - MAXACKWINDOW 299 300 where sack is the highest right edge of sack block found in the packet 301 or ack in the case of packet without SACK option. 302 303 The upper bound limit for a valid (s)ack is not ignored - 304 we doesn't have to deal with fragments. 305 */ 306 307 static inline __u32 segment_seq_plus_len(__u32 seq, 308 size_t len, 309 unsigned int dataoff, 310 const struct tcphdr *tcph) 311 { 312 /* XXX Should I use payload length field in IP/IPv6 header ? 313 * - YK */ 314 return (seq + len - dataoff - tcph->doff*4 315 + (tcph->syn ? 1 : 0) + (tcph->fin ? 1 : 0)); 316 } 317 318 /* Fixme: what about big packets? */ 319 #define MAXACKWINCONST 66000 320 #define MAXACKWINDOW(sender) \ 321 ((sender)->td_maxwin > MAXACKWINCONST ? (sender)->td_maxwin \ 322 : MAXACKWINCONST) 323 324 /* 325 * Simplified tcp_parse_options routine from tcp_input.c 326 */ 327 static void tcp_options(const struct sk_buff *skb, 328 unsigned int dataoff, 329 const struct tcphdr *tcph, 330 struct ip_ct_tcp_state *state) 331 { 332 unsigned char buff[(15 * 4) - sizeof(struct tcphdr)]; 333 const unsigned char *ptr; 334 int length = (tcph->doff*4) - sizeof(struct tcphdr); 335 336 if (!length) 337 return; 338 339 ptr = skb_header_pointer(skb, dataoff + sizeof(struct tcphdr), 340 length, buff); 341 if (!ptr) 342 return; 343 344 state->td_scale = 0; 345 state->flags &= IP_CT_TCP_FLAG_BE_LIBERAL; 346 347 while (length > 0) { 348 int opcode=*ptr++; 349 int opsize; 350 351 switch (opcode) { 352 case TCPOPT_EOL: 353 return; 354 case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ 355 length--; 356 continue; 357 default: 358 if (length < 2) 359 return; 360 opsize=*ptr++; 361 if (opsize < 2) /* "silly options" */ 362 return; 363 if (opsize > length) 364 return; /* don't parse partial options */ 365 366 if (opcode == TCPOPT_SACK_PERM 367 && opsize == TCPOLEN_SACK_PERM) 368 state->flags |= IP_CT_TCP_FLAG_SACK_PERM; 369 else if (opcode == TCPOPT_WINDOW 370 && opsize == TCPOLEN_WINDOW) { 371 state->td_scale = *(u_int8_t *)ptr; 372 373 if (state->td_scale > TCP_MAX_WSCALE) 374 state->td_scale = TCP_MAX_WSCALE; 375 376 state->flags |= 377 IP_CT_TCP_FLAG_WINDOW_SCALE; 378 } 379 ptr += opsize - 2; 380 length -= opsize; 381 } 382 } 383 } 384 385 static void tcp_sack(const struct sk_buff *skb, unsigned int dataoff, 386 const struct tcphdr *tcph, __u32 *sack) 387 { 388 unsigned char buff[(15 * 4) - sizeof(struct tcphdr)]; 389 const unsigned char *ptr; 390 int length = (tcph->doff*4) - sizeof(struct tcphdr); 391 __u32 tmp; 392 393 if (!length) 394 return; 395 396 ptr = skb_header_pointer(skb, dataoff + sizeof(struct tcphdr), 397 length, buff); 398 if (!ptr) 399 return; 400 401 /* Fast path for timestamp-only option */ 402 if (length == TCPOLEN_TSTAMP_ALIGNED 403 && *(__be32 *)ptr == htonl((TCPOPT_NOP << 24) 404 | (TCPOPT_NOP << 16) 405 | (TCPOPT_TIMESTAMP << 8) 406 | TCPOLEN_TIMESTAMP)) 407 return; 408 409 while (length > 0) { 410 int opcode = *ptr++; 411 int opsize, i; 412 413 switch (opcode) { 414 case TCPOPT_EOL: 415 return; 416 case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ 417 length--; 418 continue; 419 default: 420 if (length < 2) 421 return; 422 opsize = *ptr++; 423 if (opsize < 2) /* "silly options" */ 424 return; 425 if (opsize > length) 426 return; /* don't parse partial options */ 427 428 if (opcode == TCPOPT_SACK 429 && opsize >= (TCPOLEN_SACK_BASE 430 + TCPOLEN_SACK_PERBLOCK) 431 && !((opsize - TCPOLEN_SACK_BASE) 432 % TCPOLEN_SACK_PERBLOCK)) { 433 for (i = 0; 434 i < (opsize - TCPOLEN_SACK_BASE); 435 i += TCPOLEN_SACK_PERBLOCK) { 436 tmp = get_unaligned_be32((__be32 *)(ptr+i)+1); 437 438 if (after(tmp, *sack)) 439 *sack = tmp; 440 } 441 return; 442 } 443 ptr += opsize - 2; 444 length -= opsize; 445 } 446 } 447 } 448 449 static void tcp_init_sender(struct ip_ct_tcp_state *sender, 450 struct ip_ct_tcp_state *receiver, 451 const struct sk_buff *skb, 452 unsigned int dataoff, 453 const struct tcphdr *tcph, 454 u32 end, u32 win) 455 { 456 /* SYN-ACK in reply to a SYN 457 * or SYN from reply direction in simultaneous open. 458 */ 459 sender->td_end = 460 sender->td_maxend = end; 461 sender->td_maxwin = (win == 0 ? 1 : win); 462 463 tcp_options(skb, dataoff, tcph, sender); 464 /* RFC 1323: 465 * Both sides must send the Window Scale option 466 * to enable window scaling in either direction. 467 */ 468 if (!(sender->flags & IP_CT_TCP_FLAG_WINDOW_SCALE && 469 receiver->flags & IP_CT_TCP_FLAG_WINDOW_SCALE)) { 470 sender->td_scale = 0; 471 receiver->td_scale = 0; 472 } 473 } 474 475 static bool tcp_in_window(struct nf_conn *ct, 476 enum ip_conntrack_dir dir, 477 unsigned int index, 478 const struct sk_buff *skb, 479 unsigned int dataoff, 480 const struct tcphdr *tcph, 481 const struct nf_hook_state *hook_state) 482 { 483 struct ip_ct_tcp *state = &ct->proto.tcp; 484 struct net *net = nf_ct_net(ct); 485 struct nf_tcp_net *tn = nf_tcp_pernet(net); 486 struct ip_ct_tcp_state *sender = &state->seen[dir]; 487 struct ip_ct_tcp_state *receiver = &state->seen[!dir]; 488 const struct nf_conntrack_tuple *tuple = &ct->tuplehash[dir].tuple; 489 __u32 seq, ack, sack, end, win, swin; 490 u16 win_raw; 491 s32 receiver_offset; 492 bool res, in_recv_win; 493 494 /* 495 * Get the required data from the packet. 496 */ 497 seq = ntohl(tcph->seq); 498 ack = sack = ntohl(tcph->ack_seq); 499 win_raw = ntohs(tcph->window); 500 win = win_raw; 501 end = segment_seq_plus_len(seq, skb->len, dataoff, tcph); 502 503 if (receiver->flags & IP_CT_TCP_FLAG_SACK_PERM) 504 tcp_sack(skb, dataoff, tcph, &sack); 505 506 /* Take into account NAT sequence number mangling */ 507 receiver_offset = nf_ct_seq_offset(ct, !dir, ack - 1); 508 ack -= receiver_offset; 509 sack -= receiver_offset; 510 511 pr_debug("tcp_in_window: START\n"); 512 pr_debug("tcp_in_window: "); 513 nf_ct_dump_tuple(tuple); 514 pr_debug("seq=%u ack=%u+(%d) sack=%u+(%d) win=%u end=%u\n", 515 seq, ack, receiver_offset, sack, receiver_offset, win, end); 516 pr_debug("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i " 517 "receiver end=%u maxend=%u maxwin=%u scale=%i\n", 518 sender->td_end, sender->td_maxend, sender->td_maxwin, 519 sender->td_scale, 520 receiver->td_end, receiver->td_maxend, receiver->td_maxwin, 521 receiver->td_scale); 522 523 if (sender->td_maxwin == 0) { 524 /* 525 * Initialize sender data. 526 */ 527 if (tcph->syn) { 528 tcp_init_sender(sender, receiver, 529 skb, dataoff, tcph, 530 end, win); 531 if (!tcph->ack) 532 /* Simultaneous open */ 533 return true; 534 } else { 535 /* 536 * We are in the middle of a connection, 537 * its history is lost for us. 538 * Let's try to use the data from the packet. 539 */ 540 sender->td_end = end; 541 swin = win << sender->td_scale; 542 sender->td_maxwin = (swin == 0 ? 1 : swin); 543 sender->td_maxend = end + sender->td_maxwin; 544 if (receiver->td_maxwin == 0) { 545 /* We haven't seen traffic in the other 546 * direction yet but we have to tweak window 547 * tracking to pass III and IV until that 548 * happens. 549 */ 550 receiver->td_end = receiver->td_maxend = sack; 551 } else if (sack == receiver->td_end + 1) { 552 /* Likely a reply to a keepalive. 553 * Needed for III. 554 */ 555 receiver->td_end++; 556 } 557 558 } 559 } else if (((state->state == TCP_CONNTRACK_SYN_SENT 560 && dir == IP_CT_DIR_ORIGINAL) 561 || (state->state == TCP_CONNTRACK_SYN_RECV 562 && dir == IP_CT_DIR_REPLY)) 563 && after(end, sender->td_end)) { 564 /* 565 * RFC 793: "if a TCP is reinitialized ... then it need 566 * not wait at all; it must only be sure to use sequence 567 * numbers larger than those recently used." 568 */ 569 sender->td_end = 570 sender->td_maxend = end; 571 sender->td_maxwin = (win == 0 ? 1 : win); 572 573 tcp_options(skb, dataoff, tcph, sender); 574 } else if (tcph->syn && dir == IP_CT_DIR_REPLY && 575 state->state == TCP_CONNTRACK_SYN_SENT) { 576 /* Retransmitted syn-ack, or syn (simultaneous open). 577 * 578 * Re-init state for this direction, just like for the first 579 * syn(-ack) reply, it might differ in seq, ack or tcp options. 580 */ 581 tcp_init_sender(sender, receiver, 582 skb, dataoff, tcph, 583 end, win); 584 if (!tcph->ack) 585 return true; 586 } 587 588 if (!(tcph->ack)) { 589 /* 590 * If there is no ACK, just pretend it was set and OK. 591 */ 592 ack = sack = receiver->td_end; 593 } else if (((tcp_flag_word(tcph) & (TCP_FLAG_ACK|TCP_FLAG_RST)) == 594 (TCP_FLAG_ACK|TCP_FLAG_RST)) 595 && (ack == 0)) { 596 /* 597 * Broken TCP stacks, that set ACK in RST packets as well 598 * with zero ack value. 599 */ 600 ack = sack = receiver->td_end; 601 } 602 603 if (tcph->rst && seq == 0 && state->state == TCP_CONNTRACK_SYN_SENT) 604 /* 605 * RST sent answering SYN. 606 */ 607 seq = end = sender->td_end; 608 609 pr_debug("tcp_in_window: "); 610 nf_ct_dump_tuple(tuple); 611 pr_debug("seq=%u ack=%u+(%d) sack=%u+(%d) win=%u end=%u\n", 612 seq, ack, receiver_offset, sack, receiver_offset, win, end); 613 pr_debug("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i " 614 "receiver end=%u maxend=%u maxwin=%u scale=%i\n", 615 sender->td_end, sender->td_maxend, sender->td_maxwin, 616 sender->td_scale, 617 receiver->td_end, receiver->td_maxend, receiver->td_maxwin, 618 receiver->td_scale); 619 620 /* Is the ending sequence in the receive window (if available)? */ 621 in_recv_win = !receiver->td_maxwin || 622 after(end, sender->td_end - receiver->td_maxwin - 1); 623 624 pr_debug("tcp_in_window: I=%i II=%i III=%i IV=%i\n", 625 before(seq, sender->td_maxend + 1), 626 (in_recv_win ? 1 : 0), 627 before(sack, receiver->td_end + 1), 628 after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1)); 629 630 if (before(seq, sender->td_maxend + 1) && 631 in_recv_win && 632 before(sack, receiver->td_end + 1) && 633 after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1)) { 634 /* 635 * Take into account window scaling (RFC 1323). 636 */ 637 if (!tcph->syn) 638 win <<= sender->td_scale; 639 640 /* 641 * Update sender data. 642 */ 643 swin = win + (sack - ack); 644 if (sender->td_maxwin < swin) 645 sender->td_maxwin = swin; 646 if (after(end, sender->td_end)) { 647 sender->td_end = end; 648 sender->flags |= IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED; 649 } 650 if (tcph->ack) { 651 if (!(sender->flags & IP_CT_TCP_FLAG_MAXACK_SET)) { 652 sender->td_maxack = ack; 653 sender->flags |= IP_CT_TCP_FLAG_MAXACK_SET; 654 } else if (after(ack, sender->td_maxack)) 655 sender->td_maxack = ack; 656 } 657 658 /* 659 * Update receiver data. 660 */ 661 if (receiver->td_maxwin != 0 && after(end, sender->td_maxend)) 662 receiver->td_maxwin += end - sender->td_maxend; 663 if (after(sack + win, receiver->td_maxend - 1)) { 664 receiver->td_maxend = sack + win; 665 if (win == 0) 666 receiver->td_maxend++; 667 } 668 if (ack == receiver->td_end) 669 receiver->flags &= ~IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED; 670 671 /* 672 * Check retransmissions. 673 */ 674 if (index == TCP_ACK_SET) { 675 if (state->last_dir == dir 676 && state->last_seq == seq 677 && state->last_ack == ack 678 && state->last_end == end 679 && state->last_win == win_raw) 680 state->retrans++; 681 else { 682 state->last_dir = dir; 683 state->last_seq = seq; 684 state->last_ack = ack; 685 state->last_end = end; 686 state->last_win = win_raw; 687 state->retrans = 0; 688 } 689 } 690 res = true; 691 } else { 692 res = false; 693 if (sender->flags & IP_CT_TCP_FLAG_BE_LIBERAL || 694 tn->tcp_be_liberal) 695 res = true; 696 if (!res) { 697 nf_ct_l4proto_log_invalid(skb, ct, hook_state, 698 "%s", 699 before(seq, sender->td_maxend + 1) ? 700 in_recv_win ? 701 before(sack, receiver->td_end + 1) ? 702 after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1) ? "BUG" 703 : "ACK is under the lower bound (possible overly delayed ACK)" 704 : "ACK is over the upper bound (ACKed data not seen yet)" 705 : "SEQ is under the lower bound (already ACKed data retransmitted)" 706 : "SEQ is over the upper bound (over the window of the receiver)"); 707 } 708 } 709 710 pr_debug("tcp_in_window: res=%u sender end=%u maxend=%u maxwin=%u " 711 "receiver end=%u maxend=%u maxwin=%u\n", 712 res, sender->td_end, sender->td_maxend, sender->td_maxwin, 713 receiver->td_end, receiver->td_maxend, receiver->td_maxwin); 714 715 return res; 716 } 717 718 /* table of valid flag combinations - PUSH, ECE and CWR are always valid */ 719 static const u8 tcp_valid_flags[(TCPHDR_FIN|TCPHDR_SYN|TCPHDR_RST|TCPHDR_ACK| 720 TCPHDR_URG) + 1] = 721 { 722 [TCPHDR_SYN] = 1, 723 [TCPHDR_SYN|TCPHDR_URG] = 1, 724 [TCPHDR_SYN|TCPHDR_ACK] = 1, 725 [TCPHDR_RST] = 1, 726 [TCPHDR_RST|TCPHDR_ACK] = 1, 727 [TCPHDR_FIN|TCPHDR_ACK] = 1, 728 [TCPHDR_FIN|TCPHDR_ACK|TCPHDR_URG] = 1, 729 [TCPHDR_ACK] = 1, 730 [TCPHDR_ACK|TCPHDR_URG] = 1, 731 }; 732 733 static void tcp_error_log(const struct sk_buff *skb, 734 const struct nf_hook_state *state, 735 const char *msg) 736 { 737 nf_l4proto_log_invalid(skb, state, IPPROTO_TCP, "%s", msg); 738 } 739 740 /* Protect conntrack agaist broken packets. Code taken from ipt_unclean.c. */ 741 static bool tcp_error(const struct tcphdr *th, 742 struct sk_buff *skb, 743 unsigned int dataoff, 744 const struct nf_hook_state *state) 745 { 746 unsigned int tcplen = skb->len - dataoff; 747 u8 tcpflags; 748 749 /* Not whole TCP header or malformed packet */ 750 if (th->doff*4 < sizeof(struct tcphdr) || tcplen < th->doff*4) { 751 tcp_error_log(skb, state, "truncated packet"); 752 return true; 753 } 754 755 /* Checksum invalid? Ignore. 756 * We skip checking packets on the outgoing path 757 * because the checksum is assumed to be correct. 758 */ 759 /* FIXME: Source route IP option packets --RR */ 760 if (state->net->ct.sysctl_checksum && 761 state->hook == NF_INET_PRE_ROUTING && 762 nf_checksum(skb, state->hook, dataoff, IPPROTO_TCP, state->pf)) { 763 tcp_error_log(skb, state, "bad checksum"); 764 return true; 765 } 766 767 /* Check TCP flags. */ 768 tcpflags = (tcp_flag_byte(th) & ~(TCPHDR_ECE|TCPHDR_CWR|TCPHDR_PSH)); 769 if (!tcp_valid_flags[tcpflags]) { 770 tcp_error_log(skb, state, "invalid tcp flag combination"); 771 return true; 772 } 773 774 return false; 775 } 776 777 static noinline bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb, 778 unsigned int dataoff, 779 const struct tcphdr *th) 780 { 781 enum tcp_conntrack new_state; 782 struct net *net = nf_ct_net(ct); 783 const struct nf_tcp_net *tn = nf_tcp_pernet(net); 784 const struct ip_ct_tcp_state *sender = &ct->proto.tcp.seen[0]; 785 const struct ip_ct_tcp_state *receiver = &ct->proto.tcp.seen[1]; 786 787 /* Don't need lock here: this conntrack not in circulation yet */ 788 new_state = tcp_conntracks[0][get_conntrack_index(th)][TCP_CONNTRACK_NONE]; 789 790 /* Invalid: delete conntrack */ 791 if (new_state >= TCP_CONNTRACK_MAX) { 792 pr_debug("nf_ct_tcp: invalid new deleting.\n"); 793 return false; 794 } 795 796 if (new_state == TCP_CONNTRACK_SYN_SENT) { 797 memset(&ct->proto.tcp, 0, sizeof(ct->proto.tcp)); 798 /* SYN packet */ 799 ct->proto.tcp.seen[0].td_end = 800 segment_seq_plus_len(ntohl(th->seq), skb->len, 801 dataoff, th); 802 ct->proto.tcp.seen[0].td_maxwin = ntohs(th->window); 803 if (ct->proto.tcp.seen[0].td_maxwin == 0) 804 ct->proto.tcp.seen[0].td_maxwin = 1; 805 ct->proto.tcp.seen[0].td_maxend = 806 ct->proto.tcp.seen[0].td_end; 807 808 tcp_options(skb, dataoff, th, &ct->proto.tcp.seen[0]); 809 } else if (tn->tcp_loose == 0) { 810 /* Don't try to pick up connections. */ 811 return false; 812 } else { 813 memset(&ct->proto.tcp, 0, sizeof(ct->proto.tcp)); 814 /* 815 * We are in the middle of a connection, 816 * its history is lost for us. 817 * Let's try to use the data from the packet. 818 */ 819 ct->proto.tcp.seen[0].td_end = 820 segment_seq_plus_len(ntohl(th->seq), skb->len, 821 dataoff, th); 822 ct->proto.tcp.seen[0].td_maxwin = ntohs(th->window); 823 if (ct->proto.tcp.seen[0].td_maxwin == 0) 824 ct->proto.tcp.seen[0].td_maxwin = 1; 825 ct->proto.tcp.seen[0].td_maxend = 826 ct->proto.tcp.seen[0].td_end + 827 ct->proto.tcp.seen[0].td_maxwin; 828 829 /* We assume SACK and liberal window checking to handle 830 * window scaling */ 831 ct->proto.tcp.seen[0].flags = 832 ct->proto.tcp.seen[1].flags = IP_CT_TCP_FLAG_SACK_PERM | 833 IP_CT_TCP_FLAG_BE_LIBERAL; 834 } 835 836 /* tcp_packet will set them */ 837 ct->proto.tcp.last_index = TCP_NONE_SET; 838 839 pr_debug("%s: sender end=%u maxend=%u maxwin=%u scale=%i " 840 "receiver end=%u maxend=%u maxwin=%u scale=%i\n", 841 __func__, 842 sender->td_end, sender->td_maxend, sender->td_maxwin, 843 sender->td_scale, 844 receiver->td_end, receiver->td_maxend, receiver->td_maxwin, 845 receiver->td_scale); 846 return true; 847 } 848 849 static bool tcp_can_early_drop(const struct nf_conn *ct) 850 { 851 switch (ct->proto.tcp.state) { 852 case TCP_CONNTRACK_FIN_WAIT: 853 case TCP_CONNTRACK_LAST_ACK: 854 case TCP_CONNTRACK_TIME_WAIT: 855 case TCP_CONNTRACK_CLOSE: 856 case TCP_CONNTRACK_CLOSE_WAIT: 857 return true; 858 default: 859 break; 860 } 861 862 return false; 863 } 864 865 static void nf_ct_tcp_state_reset(struct ip_ct_tcp_state *state) 866 { 867 state->td_end = 0; 868 state->td_maxend = 0; 869 state->td_maxwin = 0; 870 state->td_maxack = 0; 871 state->td_scale = 0; 872 state->flags &= IP_CT_TCP_FLAG_BE_LIBERAL; 873 } 874 875 /* Returns verdict for packet, or -1 for invalid. */ 876 int nf_conntrack_tcp_packet(struct nf_conn *ct, 877 struct sk_buff *skb, 878 unsigned int dataoff, 879 enum ip_conntrack_info ctinfo, 880 const struct nf_hook_state *state) 881 { 882 struct net *net = nf_ct_net(ct); 883 struct nf_tcp_net *tn = nf_tcp_pernet(net); 884 struct nf_conntrack_tuple *tuple; 885 enum tcp_conntrack new_state, old_state; 886 unsigned int index, *timeouts; 887 enum ip_conntrack_dir dir; 888 const struct tcphdr *th; 889 struct tcphdr _tcph; 890 unsigned long timeout; 891 892 th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph); 893 if (th == NULL) 894 return -NF_ACCEPT; 895 896 if (tcp_error(th, skb, dataoff, state)) 897 return -NF_ACCEPT; 898 899 if (!nf_ct_is_confirmed(ct) && !tcp_new(ct, skb, dataoff, th)) 900 return -NF_ACCEPT; 901 902 spin_lock_bh(&ct->lock); 903 old_state = ct->proto.tcp.state; 904 dir = CTINFO2DIR(ctinfo); 905 index = get_conntrack_index(th); 906 new_state = tcp_conntracks[dir][index][old_state]; 907 tuple = &ct->tuplehash[dir].tuple; 908 909 switch (new_state) { 910 case TCP_CONNTRACK_SYN_SENT: 911 if (old_state < TCP_CONNTRACK_TIME_WAIT) 912 break; 913 /* RFC 1122: "When a connection is closed actively, 914 * it MUST linger in TIME-WAIT state for a time 2xMSL 915 * (Maximum Segment Lifetime). However, it MAY accept 916 * a new SYN from the remote TCP to reopen the connection 917 * directly from TIME-WAIT state, if..." 918 * We ignore the conditions because we are in the 919 * TIME-WAIT state anyway. 920 * 921 * Handle aborted connections: we and the server 922 * think there is an existing connection but the client 923 * aborts it and starts a new one. 924 */ 925 if (((ct->proto.tcp.seen[dir].flags 926 | ct->proto.tcp.seen[!dir].flags) 927 & IP_CT_TCP_FLAG_CLOSE_INIT) 928 || (ct->proto.tcp.last_dir == dir 929 && ct->proto.tcp.last_index == TCP_RST_SET)) { 930 /* Attempt to reopen a closed/aborted connection. 931 * Delete this connection and look up again. */ 932 spin_unlock_bh(&ct->lock); 933 934 /* Only repeat if we can actually remove the timer. 935 * Destruction may already be in progress in process 936 * context and we must give it a chance to terminate. 937 */ 938 if (nf_ct_kill(ct)) 939 return -NF_REPEAT; 940 return NF_DROP; 941 } 942 fallthrough; 943 case TCP_CONNTRACK_IGNORE: 944 /* Ignored packets: 945 * 946 * Our connection entry may be out of sync, so ignore 947 * packets which may signal the real connection between 948 * the client and the server. 949 * 950 * a) SYN in ORIGINAL 951 * b) SYN/ACK in REPLY 952 * c) ACK in reply direction after initial SYN in original. 953 * 954 * If the ignored packet is invalid, the receiver will send 955 * a RST we'll catch below. 956 */ 957 if (index == TCP_SYNACK_SET 958 && ct->proto.tcp.last_index == TCP_SYN_SET 959 && ct->proto.tcp.last_dir != dir 960 && ntohl(th->ack_seq) == ct->proto.tcp.last_end) { 961 /* b) This SYN/ACK acknowledges a SYN that we earlier 962 * ignored as invalid. This means that the client and 963 * the server are both in sync, while the firewall is 964 * not. We get in sync from the previously annotated 965 * values. 966 */ 967 old_state = TCP_CONNTRACK_SYN_SENT; 968 new_state = TCP_CONNTRACK_SYN_RECV; 969 ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_end = 970 ct->proto.tcp.last_end; 971 ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_maxend = 972 ct->proto.tcp.last_end; 973 ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_maxwin = 974 ct->proto.tcp.last_win == 0 ? 975 1 : ct->proto.tcp.last_win; 976 ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_scale = 977 ct->proto.tcp.last_wscale; 978 ct->proto.tcp.last_flags &= ~IP_CT_EXP_CHALLENGE_ACK; 979 ct->proto.tcp.seen[ct->proto.tcp.last_dir].flags = 980 ct->proto.tcp.last_flags; 981 nf_ct_tcp_state_reset(&ct->proto.tcp.seen[dir]); 982 break; 983 } 984 ct->proto.tcp.last_index = index; 985 ct->proto.tcp.last_dir = dir; 986 ct->proto.tcp.last_seq = ntohl(th->seq); 987 ct->proto.tcp.last_end = 988 segment_seq_plus_len(ntohl(th->seq), skb->len, dataoff, th); 989 ct->proto.tcp.last_win = ntohs(th->window); 990 991 /* a) This is a SYN in ORIGINAL. The client and the server 992 * may be in sync but we are not. In that case, we annotate 993 * the TCP options and let the packet go through. If it is a 994 * valid SYN packet, the server will reply with a SYN/ACK, and 995 * then we'll get in sync. Otherwise, the server potentially 996 * responds with a challenge ACK if implementing RFC5961. 997 */ 998 if (index == TCP_SYN_SET && dir == IP_CT_DIR_ORIGINAL) { 999 struct ip_ct_tcp_state seen = {}; 1000 1001 ct->proto.tcp.last_flags = 1002 ct->proto.tcp.last_wscale = 0; 1003 tcp_options(skb, dataoff, th, &seen); 1004 if (seen.flags & IP_CT_TCP_FLAG_WINDOW_SCALE) { 1005 ct->proto.tcp.last_flags |= 1006 IP_CT_TCP_FLAG_WINDOW_SCALE; 1007 ct->proto.tcp.last_wscale = seen.td_scale; 1008 } 1009 if (seen.flags & IP_CT_TCP_FLAG_SACK_PERM) { 1010 ct->proto.tcp.last_flags |= 1011 IP_CT_TCP_FLAG_SACK_PERM; 1012 } 1013 /* Mark the potential for RFC5961 challenge ACK, 1014 * this pose a special problem for LAST_ACK state 1015 * as ACK is intrepretated as ACKing last FIN. 1016 */ 1017 if (old_state == TCP_CONNTRACK_LAST_ACK) 1018 ct->proto.tcp.last_flags |= 1019 IP_CT_EXP_CHALLENGE_ACK; 1020 } 1021 spin_unlock_bh(&ct->lock); 1022 nf_ct_l4proto_log_invalid(skb, ct, state, 1023 "packet (index %d) in dir %d ignored, state %s", 1024 index, dir, 1025 tcp_conntrack_names[old_state]); 1026 return NF_ACCEPT; 1027 case TCP_CONNTRACK_MAX: 1028 /* Special case for SYN proxy: when the SYN to the server or 1029 * the SYN/ACK from the server is lost, the client may transmit 1030 * a keep-alive packet while in SYN_SENT state. This needs to 1031 * be associated with the original conntrack entry in order to 1032 * generate a new SYN with the correct sequence number. 1033 */ 1034 if (nfct_synproxy(ct) && old_state == TCP_CONNTRACK_SYN_SENT && 1035 index == TCP_ACK_SET && dir == IP_CT_DIR_ORIGINAL && 1036 ct->proto.tcp.last_dir == IP_CT_DIR_ORIGINAL && 1037 ct->proto.tcp.seen[dir].td_end - 1 == ntohl(th->seq)) { 1038 pr_debug("nf_ct_tcp: SYN proxy client keep alive\n"); 1039 spin_unlock_bh(&ct->lock); 1040 return NF_ACCEPT; 1041 } 1042 1043 /* Invalid packet */ 1044 pr_debug("nf_ct_tcp: Invalid dir=%i index=%u ostate=%u\n", 1045 dir, get_conntrack_index(th), old_state); 1046 spin_unlock_bh(&ct->lock); 1047 nf_ct_l4proto_log_invalid(skb, ct, state, "invalid state"); 1048 return -NF_ACCEPT; 1049 case TCP_CONNTRACK_TIME_WAIT: 1050 /* RFC5961 compliance cause stack to send "challenge-ACK" 1051 * e.g. in response to spurious SYNs. Conntrack MUST 1052 * not believe this ACK is acking last FIN. 1053 */ 1054 if (old_state == TCP_CONNTRACK_LAST_ACK && 1055 index == TCP_ACK_SET && 1056 ct->proto.tcp.last_dir != dir && 1057 ct->proto.tcp.last_index == TCP_SYN_SET && 1058 (ct->proto.tcp.last_flags & IP_CT_EXP_CHALLENGE_ACK)) { 1059 /* Detected RFC5961 challenge ACK */ 1060 ct->proto.tcp.last_flags &= ~IP_CT_EXP_CHALLENGE_ACK; 1061 spin_unlock_bh(&ct->lock); 1062 nf_ct_l4proto_log_invalid(skb, ct, state, "challenge-ack ignored"); 1063 return NF_ACCEPT; /* Don't change state */ 1064 } 1065 break; 1066 case TCP_CONNTRACK_SYN_SENT2: 1067 /* tcp_conntracks table is not smart enough to handle 1068 * simultaneous open. 1069 */ 1070 ct->proto.tcp.last_flags |= IP_CT_TCP_SIMULTANEOUS_OPEN; 1071 break; 1072 case TCP_CONNTRACK_SYN_RECV: 1073 if (dir == IP_CT_DIR_REPLY && index == TCP_ACK_SET && 1074 ct->proto.tcp.last_flags & IP_CT_TCP_SIMULTANEOUS_OPEN) 1075 new_state = TCP_CONNTRACK_ESTABLISHED; 1076 break; 1077 case TCP_CONNTRACK_CLOSE: 1078 if (index != TCP_RST_SET) 1079 break; 1080 1081 /* If we are closing, tuple might have been re-used already. 1082 * last_index, last_ack, and all other ct fields used for 1083 * sequence/window validation are outdated in that case. 1084 * 1085 * As the conntrack can already be expired by GC under pressure, 1086 * just skip validation checks. 1087 */ 1088 if (tcp_can_early_drop(ct)) 1089 goto in_window; 1090 1091 /* td_maxack might be outdated if we let a SYN through earlier */ 1092 if ((ct->proto.tcp.seen[!dir].flags & IP_CT_TCP_FLAG_MAXACK_SET) && 1093 ct->proto.tcp.last_index != TCP_SYN_SET) { 1094 u32 seq = ntohl(th->seq); 1095 1096 /* If we are not in established state and SEQ=0 this is most 1097 * likely an answer to a SYN we let go through above (last_index 1098 * can be updated due to out-of-order ACKs). 1099 */ 1100 if (seq == 0 && !nf_conntrack_tcp_established(ct)) 1101 break; 1102 1103 if (before(seq, ct->proto.tcp.seen[!dir].td_maxack) && 1104 !tn->tcp_ignore_invalid_rst) { 1105 /* Invalid RST */ 1106 spin_unlock_bh(&ct->lock); 1107 nf_ct_l4proto_log_invalid(skb, ct, state, "invalid rst"); 1108 return -NF_ACCEPT; 1109 } 1110 1111 if (!nf_conntrack_tcp_established(ct) || 1112 seq == ct->proto.tcp.seen[!dir].td_maxack) 1113 break; 1114 1115 /* Check if rst is part of train, such as 1116 * foo:80 > bar:4379: P, 235946583:235946602(19) ack 42 1117 * foo:80 > bar:4379: R, 235946602:235946602(0) ack 42 1118 */ 1119 if (ct->proto.tcp.last_index == TCP_ACK_SET && 1120 ct->proto.tcp.last_dir == dir && 1121 seq == ct->proto.tcp.last_end) 1122 break; 1123 1124 /* ... RST sequence number doesn't match exactly, keep 1125 * established state to allow a possible challenge ACK. 1126 */ 1127 new_state = old_state; 1128 } 1129 if (((test_bit(IPS_SEEN_REPLY_BIT, &ct->status) 1130 && ct->proto.tcp.last_index == TCP_SYN_SET) 1131 || (!test_bit(IPS_ASSURED_BIT, &ct->status) 1132 && ct->proto.tcp.last_index == TCP_ACK_SET)) 1133 && ntohl(th->ack_seq) == ct->proto.tcp.last_end) { 1134 /* RST sent to invalid SYN or ACK we had let through 1135 * at a) and c) above: 1136 * 1137 * a) SYN was in window then 1138 * c) we hold a half-open connection. 1139 * 1140 * Delete our connection entry. 1141 * We skip window checking, because packet might ACK 1142 * segments we ignored. */ 1143 goto in_window; 1144 } 1145 break; 1146 default: 1147 /* Keep compilers happy. */ 1148 break; 1149 } 1150 1151 if (!tcp_in_window(ct, dir, index, 1152 skb, dataoff, th, state)) { 1153 spin_unlock_bh(&ct->lock); 1154 return -NF_ACCEPT; 1155 } 1156 in_window: 1157 /* From now on we have got in-window packets */ 1158 ct->proto.tcp.last_index = index; 1159 ct->proto.tcp.last_dir = dir; 1160 1161 pr_debug("tcp_conntracks: "); 1162 nf_ct_dump_tuple(tuple); 1163 pr_debug("syn=%i ack=%i fin=%i rst=%i old=%i new=%i\n", 1164 (th->syn ? 1 : 0), (th->ack ? 1 : 0), 1165 (th->fin ? 1 : 0), (th->rst ? 1 : 0), 1166 old_state, new_state); 1167 1168 ct->proto.tcp.state = new_state; 1169 if (old_state != new_state 1170 && new_state == TCP_CONNTRACK_FIN_WAIT) 1171 ct->proto.tcp.seen[dir].flags |= IP_CT_TCP_FLAG_CLOSE_INIT; 1172 1173 timeouts = nf_ct_timeout_lookup(ct); 1174 if (!timeouts) 1175 timeouts = tn->timeouts; 1176 1177 if (ct->proto.tcp.retrans >= tn->tcp_max_retrans && 1178 timeouts[new_state] > timeouts[TCP_CONNTRACK_RETRANS]) 1179 timeout = timeouts[TCP_CONNTRACK_RETRANS]; 1180 else if (unlikely(index == TCP_RST_SET)) 1181 timeout = timeouts[TCP_CONNTRACK_CLOSE]; 1182 else if ((ct->proto.tcp.seen[0].flags | ct->proto.tcp.seen[1].flags) & 1183 IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED && 1184 timeouts[new_state] > timeouts[TCP_CONNTRACK_UNACK]) 1185 timeout = timeouts[TCP_CONNTRACK_UNACK]; 1186 else if (ct->proto.tcp.last_win == 0 && 1187 timeouts[new_state] > timeouts[TCP_CONNTRACK_RETRANS]) 1188 timeout = timeouts[TCP_CONNTRACK_RETRANS]; 1189 else 1190 timeout = timeouts[new_state]; 1191 spin_unlock_bh(&ct->lock); 1192 1193 if (new_state != old_state) 1194 nf_conntrack_event_cache(IPCT_PROTOINFO, ct); 1195 1196 if (!test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { 1197 /* If only reply is a RST, we can consider ourselves not to 1198 have an established connection: this is a fairly common 1199 problem case, so we can delete the conntrack 1200 immediately. --RR */ 1201 if (th->rst) { 1202 nf_ct_kill_acct(ct, ctinfo, skb); 1203 return NF_ACCEPT; 1204 } 1205 1206 if (index == TCP_SYN_SET && old_state == TCP_CONNTRACK_SYN_SENT) { 1207 /* do not renew timeout on SYN retransmit. 1208 * 1209 * Else port reuse by client or NAT middlebox can keep 1210 * entry alive indefinitely (including nat info). 1211 */ 1212 return NF_ACCEPT; 1213 } 1214 1215 /* ESTABLISHED without SEEN_REPLY, i.e. mid-connection 1216 * pickup with loose=1. Avoid large ESTABLISHED timeout. 1217 */ 1218 if (new_state == TCP_CONNTRACK_ESTABLISHED && 1219 timeout > timeouts[TCP_CONNTRACK_UNACK]) 1220 timeout = timeouts[TCP_CONNTRACK_UNACK]; 1221 } else if (!test_bit(IPS_ASSURED_BIT, &ct->status) 1222 && (old_state == TCP_CONNTRACK_SYN_RECV 1223 || old_state == TCP_CONNTRACK_ESTABLISHED) 1224 && new_state == TCP_CONNTRACK_ESTABLISHED) { 1225 /* Set ASSURED if we see valid ack in ESTABLISHED 1226 after SYN_RECV or a valid answer for a picked up 1227 connection. */ 1228 set_bit(IPS_ASSURED_BIT, &ct->status); 1229 nf_conntrack_event_cache(IPCT_ASSURED, ct); 1230 } 1231 nf_ct_refresh_acct(ct, ctinfo, skb, timeout); 1232 1233 return NF_ACCEPT; 1234 } 1235 1236 #if IS_ENABLED(CONFIG_NF_CT_NETLINK) 1237 1238 #include <linux/netfilter/nfnetlink.h> 1239 #include <linux/netfilter/nfnetlink_conntrack.h> 1240 1241 static int tcp_to_nlattr(struct sk_buff *skb, struct nlattr *nla, 1242 struct nf_conn *ct, bool destroy) 1243 { 1244 struct nlattr *nest_parms; 1245 struct nf_ct_tcp_flags tmp = {}; 1246 1247 spin_lock_bh(&ct->lock); 1248 nest_parms = nla_nest_start(skb, CTA_PROTOINFO_TCP); 1249 if (!nest_parms) 1250 goto nla_put_failure; 1251 1252 if (nla_put_u8(skb, CTA_PROTOINFO_TCP_STATE, ct->proto.tcp.state)) 1253 goto nla_put_failure; 1254 1255 if (destroy) 1256 goto skip_state; 1257 1258 if (nla_put_u8(skb, CTA_PROTOINFO_TCP_WSCALE_ORIGINAL, 1259 ct->proto.tcp.seen[0].td_scale) || 1260 nla_put_u8(skb, CTA_PROTOINFO_TCP_WSCALE_REPLY, 1261 ct->proto.tcp.seen[1].td_scale)) 1262 goto nla_put_failure; 1263 1264 tmp.flags = ct->proto.tcp.seen[0].flags; 1265 if (nla_put(skb, CTA_PROTOINFO_TCP_FLAGS_ORIGINAL, 1266 sizeof(struct nf_ct_tcp_flags), &tmp)) 1267 goto nla_put_failure; 1268 1269 tmp.flags = ct->proto.tcp.seen[1].flags; 1270 if (nla_put(skb, CTA_PROTOINFO_TCP_FLAGS_REPLY, 1271 sizeof(struct nf_ct_tcp_flags), &tmp)) 1272 goto nla_put_failure; 1273 skip_state: 1274 spin_unlock_bh(&ct->lock); 1275 nla_nest_end(skb, nest_parms); 1276 1277 return 0; 1278 1279 nla_put_failure: 1280 spin_unlock_bh(&ct->lock); 1281 return -1; 1282 } 1283 1284 static const struct nla_policy tcp_nla_policy[CTA_PROTOINFO_TCP_MAX+1] = { 1285 [CTA_PROTOINFO_TCP_STATE] = { .type = NLA_U8 }, 1286 [CTA_PROTOINFO_TCP_WSCALE_ORIGINAL] = { .type = NLA_U8 }, 1287 [CTA_PROTOINFO_TCP_WSCALE_REPLY] = { .type = NLA_U8 }, 1288 [CTA_PROTOINFO_TCP_FLAGS_ORIGINAL] = { .len = sizeof(struct nf_ct_tcp_flags) }, 1289 [CTA_PROTOINFO_TCP_FLAGS_REPLY] = { .len = sizeof(struct nf_ct_tcp_flags) }, 1290 }; 1291 1292 #define TCP_NLATTR_SIZE ( \ 1293 NLA_ALIGN(NLA_HDRLEN + 1) + \ 1294 NLA_ALIGN(NLA_HDRLEN + 1) + \ 1295 NLA_ALIGN(NLA_HDRLEN + sizeof(struct nf_ct_tcp_flags)) + \ 1296 NLA_ALIGN(NLA_HDRLEN + sizeof(struct nf_ct_tcp_flags))) 1297 1298 static int nlattr_to_tcp(struct nlattr *cda[], struct nf_conn *ct) 1299 { 1300 struct nlattr *pattr = cda[CTA_PROTOINFO_TCP]; 1301 struct nlattr *tb[CTA_PROTOINFO_TCP_MAX+1]; 1302 int err; 1303 1304 /* updates could not contain anything about the private 1305 * protocol info, in that case skip the parsing */ 1306 if (!pattr) 1307 return 0; 1308 1309 err = nla_parse_nested_deprecated(tb, CTA_PROTOINFO_TCP_MAX, pattr, 1310 tcp_nla_policy, NULL); 1311 if (err < 0) 1312 return err; 1313 1314 if (tb[CTA_PROTOINFO_TCP_STATE] && 1315 nla_get_u8(tb[CTA_PROTOINFO_TCP_STATE]) >= TCP_CONNTRACK_MAX) 1316 return -EINVAL; 1317 1318 spin_lock_bh(&ct->lock); 1319 if (tb[CTA_PROTOINFO_TCP_STATE]) 1320 ct->proto.tcp.state = nla_get_u8(tb[CTA_PROTOINFO_TCP_STATE]); 1321 1322 if (tb[CTA_PROTOINFO_TCP_FLAGS_ORIGINAL]) { 1323 struct nf_ct_tcp_flags *attr = 1324 nla_data(tb[CTA_PROTOINFO_TCP_FLAGS_ORIGINAL]); 1325 ct->proto.tcp.seen[0].flags &= ~attr->mask; 1326 ct->proto.tcp.seen[0].flags |= attr->flags & attr->mask; 1327 } 1328 1329 if (tb[CTA_PROTOINFO_TCP_FLAGS_REPLY]) { 1330 struct nf_ct_tcp_flags *attr = 1331 nla_data(tb[CTA_PROTOINFO_TCP_FLAGS_REPLY]); 1332 ct->proto.tcp.seen[1].flags &= ~attr->mask; 1333 ct->proto.tcp.seen[1].flags |= attr->flags & attr->mask; 1334 } 1335 1336 if (tb[CTA_PROTOINFO_TCP_WSCALE_ORIGINAL] && 1337 tb[CTA_PROTOINFO_TCP_WSCALE_REPLY] && 1338 ct->proto.tcp.seen[0].flags & IP_CT_TCP_FLAG_WINDOW_SCALE && 1339 ct->proto.tcp.seen[1].flags & IP_CT_TCP_FLAG_WINDOW_SCALE) { 1340 ct->proto.tcp.seen[0].td_scale = 1341 nla_get_u8(tb[CTA_PROTOINFO_TCP_WSCALE_ORIGINAL]); 1342 ct->proto.tcp.seen[1].td_scale = 1343 nla_get_u8(tb[CTA_PROTOINFO_TCP_WSCALE_REPLY]); 1344 } 1345 spin_unlock_bh(&ct->lock); 1346 1347 return 0; 1348 } 1349 1350 static unsigned int tcp_nlattr_tuple_size(void) 1351 { 1352 static unsigned int size __read_mostly; 1353 1354 if (!size) 1355 size = nla_policy_len(nf_ct_port_nla_policy, CTA_PROTO_MAX + 1); 1356 1357 return size; 1358 } 1359 #endif 1360 1361 #ifdef CONFIG_NF_CONNTRACK_TIMEOUT 1362 1363 #include <linux/netfilter/nfnetlink.h> 1364 #include <linux/netfilter/nfnetlink_cttimeout.h> 1365 1366 static int tcp_timeout_nlattr_to_obj(struct nlattr *tb[], 1367 struct net *net, void *data) 1368 { 1369 struct nf_tcp_net *tn = nf_tcp_pernet(net); 1370 unsigned int *timeouts = data; 1371 int i; 1372 1373 if (!timeouts) 1374 timeouts = tn->timeouts; 1375 /* set default TCP timeouts. */ 1376 for (i=0; i<TCP_CONNTRACK_TIMEOUT_MAX; i++) 1377 timeouts[i] = tn->timeouts[i]; 1378 1379 if (tb[CTA_TIMEOUT_TCP_SYN_SENT]) { 1380 timeouts[TCP_CONNTRACK_SYN_SENT] = 1381 ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_SYN_SENT]))*HZ; 1382 } 1383 1384 if (tb[CTA_TIMEOUT_TCP_SYN_RECV]) { 1385 timeouts[TCP_CONNTRACK_SYN_RECV] = 1386 ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_SYN_RECV]))*HZ; 1387 } 1388 if (tb[CTA_TIMEOUT_TCP_ESTABLISHED]) { 1389 timeouts[TCP_CONNTRACK_ESTABLISHED] = 1390 ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_ESTABLISHED]))*HZ; 1391 } 1392 if (tb[CTA_TIMEOUT_TCP_FIN_WAIT]) { 1393 timeouts[TCP_CONNTRACK_FIN_WAIT] = 1394 ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_FIN_WAIT]))*HZ; 1395 } 1396 if (tb[CTA_TIMEOUT_TCP_CLOSE_WAIT]) { 1397 timeouts[TCP_CONNTRACK_CLOSE_WAIT] = 1398 ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_CLOSE_WAIT]))*HZ; 1399 } 1400 if (tb[CTA_TIMEOUT_TCP_LAST_ACK]) { 1401 timeouts[TCP_CONNTRACK_LAST_ACK] = 1402 ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_LAST_ACK]))*HZ; 1403 } 1404 if (tb[CTA_TIMEOUT_TCP_TIME_WAIT]) { 1405 timeouts[TCP_CONNTRACK_TIME_WAIT] = 1406 ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_TIME_WAIT]))*HZ; 1407 } 1408 if (tb[CTA_TIMEOUT_TCP_CLOSE]) { 1409 timeouts[TCP_CONNTRACK_CLOSE] = 1410 ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_CLOSE]))*HZ; 1411 } 1412 if (tb[CTA_TIMEOUT_TCP_SYN_SENT2]) { 1413 timeouts[TCP_CONNTRACK_SYN_SENT2] = 1414 ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_SYN_SENT2]))*HZ; 1415 } 1416 if (tb[CTA_TIMEOUT_TCP_RETRANS]) { 1417 timeouts[TCP_CONNTRACK_RETRANS] = 1418 ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_RETRANS]))*HZ; 1419 } 1420 if (tb[CTA_TIMEOUT_TCP_UNACK]) { 1421 timeouts[TCP_CONNTRACK_UNACK] = 1422 ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_UNACK]))*HZ; 1423 } 1424 1425 timeouts[CTA_TIMEOUT_TCP_UNSPEC] = timeouts[CTA_TIMEOUT_TCP_SYN_SENT]; 1426 return 0; 1427 } 1428 1429 static int 1430 tcp_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data) 1431 { 1432 const unsigned int *timeouts = data; 1433 1434 if (nla_put_be32(skb, CTA_TIMEOUT_TCP_SYN_SENT, 1435 htonl(timeouts[TCP_CONNTRACK_SYN_SENT] / HZ)) || 1436 nla_put_be32(skb, CTA_TIMEOUT_TCP_SYN_RECV, 1437 htonl(timeouts[TCP_CONNTRACK_SYN_RECV] / HZ)) || 1438 nla_put_be32(skb, CTA_TIMEOUT_TCP_ESTABLISHED, 1439 htonl(timeouts[TCP_CONNTRACK_ESTABLISHED] / HZ)) || 1440 nla_put_be32(skb, CTA_TIMEOUT_TCP_FIN_WAIT, 1441 htonl(timeouts[TCP_CONNTRACK_FIN_WAIT] / HZ)) || 1442 nla_put_be32(skb, CTA_TIMEOUT_TCP_CLOSE_WAIT, 1443 htonl(timeouts[TCP_CONNTRACK_CLOSE_WAIT] / HZ)) || 1444 nla_put_be32(skb, CTA_TIMEOUT_TCP_LAST_ACK, 1445 htonl(timeouts[TCP_CONNTRACK_LAST_ACK] / HZ)) || 1446 nla_put_be32(skb, CTA_TIMEOUT_TCP_TIME_WAIT, 1447 htonl(timeouts[TCP_CONNTRACK_TIME_WAIT] / HZ)) || 1448 nla_put_be32(skb, CTA_TIMEOUT_TCP_CLOSE, 1449 htonl(timeouts[TCP_CONNTRACK_CLOSE] / HZ)) || 1450 nla_put_be32(skb, CTA_TIMEOUT_TCP_SYN_SENT2, 1451 htonl(timeouts[TCP_CONNTRACK_SYN_SENT2] / HZ)) || 1452 nla_put_be32(skb, CTA_TIMEOUT_TCP_RETRANS, 1453 htonl(timeouts[TCP_CONNTRACK_RETRANS] / HZ)) || 1454 nla_put_be32(skb, CTA_TIMEOUT_TCP_UNACK, 1455 htonl(timeouts[TCP_CONNTRACK_UNACK] / HZ))) 1456 goto nla_put_failure; 1457 return 0; 1458 1459 nla_put_failure: 1460 return -ENOSPC; 1461 } 1462 1463 static const struct nla_policy tcp_timeout_nla_policy[CTA_TIMEOUT_TCP_MAX+1] = { 1464 [CTA_TIMEOUT_TCP_SYN_SENT] = { .type = NLA_U32 }, 1465 [CTA_TIMEOUT_TCP_SYN_RECV] = { .type = NLA_U32 }, 1466 [CTA_TIMEOUT_TCP_ESTABLISHED] = { .type = NLA_U32 }, 1467 [CTA_TIMEOUT_TCP_FIN_WAIT] = { .type = NLA_U32 }, 1468 [CTA_TIMEOUT_TCP_CLOSE_WAIT] = { .type = NLA_U32 }, 1469 [CTA_TIMEOUT_TCP_LAST_ACK] = { .type = NLA_U32 }, 1470 [CTA_TIMEOUT_TCP_TIME_WAIT] = { .type = NLA_U32 }, 1471 [CTA_TIMEOUT_TCP_CLOSE] = { .type = NLA_U32 }, 1472 [CTA_TIMEOUT_TCP_SYN_SENT2] = { .type = NLA_U32 }, 1473 [CTA_TIMEOUT_TCP_RETRANS] = { .type = NLA_U32 }, 1474 [CTA_TIMEOUT_TCP_UNACK] = { .type = NLA_U32 }, 1475 }; 1476 #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ 1477 1478 void nf_conntrack_tcp_init_net(struct net *net) 1479 { 1480 struct nf_tcp_net *tn = nf_tcp_pernet(net); 1481 int i; 1482 1483 for (i = 0; i < TCP_CONNTRACK_TIMEOUT_MAX; i++) 1484 tn->timeouts[i] = tcp_timeouts[i]; 1485 1486 /* timeouts[0] is unused, make it same as SYN_SENT so 1487 * ->timeouts[0] contains 'new' timeout, like udp or icmp. 1488 */ 1489 tn->timeouts[0] = tcp_timeouts[TCP_CONNTRACK_SYN_SENT]; 1490 1491 /* If it is set to zero, we disable picking up already established 1492 * connections. 1493 */ 1494 tn->tcp_loose = 1; 1495 1496 /* "Be conservative in what you do, 1497 * be liberal in what you accept from others." 1498 * If it's non-zero, we mark only out of window RST segments as INVALID. 1499 */ 1500 tn->tcp_be_liberal = 0; 1501 1502 /* If it's non-zero, we turn off RST sequence number check */ 1503 tn->tcp_ignore_invalid_rst = 0; 1504 1505 /* Max number of the retransmitted packets without receiving an (acceptable) 1506 * ACK from the destination. If this number is reached, a shorter timer 1507 * will be started. 1508 */ 1509 tn->tcp_max_retrans = 3; 1510 1511 #if IS_ENABLED(CONFIG_NF_FLOW_TABLE) 1512 tn->offload_timeout = 30 * HZ; 1513 #endif 1514 } 1515 1516 const struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp = 1517 { 1518 .l4proto = IPPROTO_TCP, 1519 #ifdef CONFIG_NF_CONNTRACK_PROCFS 1520 .print_conntrack = tcp_print_conntrack, 1521 #endif 1522 .can_early_drop = tcp_can_early_drop, 1523 #if IS_ENABLED(CONFIG_NF_CT_NETLINK) 1524 .to_nlattr = tcp_to_nlattr, 1525 .from_nlattr = nlattr_to_tcp, 1526 .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr, 1527 .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, 1528 .nlattr_tuple_size = tcp_nlattr_tuple_size, 1529 .nlattr_size = TCP_NLATTR_SIZE, 1530 .nla_policy = nf_ct_port_nla_policy, 1531 #endif 1532 #ifdef CONFIG_NF_CONNTRACK_TIMEOUT 1533 .ctnl_timeout = { 1534 .nlattr_to_obj = tcp_timeout_nlattr_to_obj, 1535 .obj_to_nlattr = tcp_timeout_obj_to_nlattr, 1536 .nlattr_max = CTA_TIMEOUT_TCP_MAX, 1537 .obj_size = sizeof(unsigned int) * 1538 TCP_CONNTRACK_TIMEOUT_MAX, 1539 .nla_policy = tcp_timeout_nla_policy, 1540 }, 1541 #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ 1542 }; 1543