1 /* 2 * INET An implementation of the TCP/IP protocol suite for the LINUX 3 * operating system. INET is implemented using the BSD Socket 4 * interface as the means of communication with the user level. 5 * 6 * Implementation of the Transmission Control Protocol(TCP). 7 * 8 * IPv4 specific functions 9 * 10 * 11 * code split from: 12 * linux/ipv4/tcp.c 13 * linux/ipv4/tcp_input.c 14 * linux/ipv4/tcp_output.c 15 * 16 * See tcp.c for author information 17 * 18 * This program is free software; you can redistribute it and/or 19 * modify it under the terms of the GNU General Public License 20 * as published by the Free Software Foundation; either version 21 * 2 of the License, or (at your option) any later version. 22 */ 23 24 /* 25 * Changes: 26 * David S. Miller : New socket lookup architecture. 27 * This code is dedicated to John Dyson. 28 * David S. Miller : Change semantics of established hash, 29 * half is devoted to TIME_WAIT sockets 30 * and the rest go in the other half. 31 * Andi Kleen : Add support for syncookies and fixed 32 * some bugs: ip options weren't passed to 33 * the TCP layer, missed a check for an 34 * ACK bit. 35 * Andi Kleen : Implemented fast path mtu discovery. 36 * Fixed many serious bugs in the 37 * request_sock handling and moved 38 * most of it into the af independent code. 39 * Added tail drop and some other bugfixes. 40 * Added new listen semantics. 41 * Mike McLagan : Routing by source 42 * Juan Jose Ciarlante: ip_dynaddr bits 43 * Andi Kleen: various fixes. 44 * Vitaly E. Lavrov : Transparent proxy revived after year 45 * coma. 46 * Andi Kleen : Fix new listen. 47 * Andi Kleen : Fix accept error reporting. 48 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which 49 * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind 50 * a single port at the same time. 
 */

#define pr_fmt(fmt) "TCP: " fmt

#include <linux/bottom_half.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
#include <linux/slab.h>

#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/secure_seq.h>
#include <net/busy_poll.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/inetdevice.h>

#include <crypto/hash.h>
#include <linux/scatterlist.h>

#include <trace/events/tcp.h>

#ifdef CONFIG_TCP_MD5SIG
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
#endif

struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);

/* Derive the initial send sequence number from the 4-tuple of the
 * incoming segment, via the keyed secure_tcp_seq() helper.
 */
static u32 tcp_v4_init_seq(const struct sk_buff *skb)
{
	return secure_tcp_seq(ip_hdr(skb)->daddr,
			      ip_hdr(skb)->saddr,
			      tcp_hdr(skb)->dest,
			      tcp_hdr(skb)->source);
}

/* Per-destination-pair TCP timestamp offset (keyed on the address pair). */
static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
{
	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
}

/* Decide whether TIME-WAIT socket @sktw may be recycled for a new
 * connection from @sk.  Returns 1 when recycling is allowed: in that case
 * the new socket inherits write_seq and the PAWS timestamp state from the
 * old bucket and a reference on @sktw is taken.  Returns 0 otherwise.
 */
int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
	const struct inet_timewait_sock *tw = inet_twsk(sktw);
	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
	struct tcp_sock *tp = tcp_sk(sk);
	int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;

	if (reuse == 2) {
		/* tcp_tw_reuse == 2 restricts reuse to loopback traffic.
		 * Still does not detect *everything* that goes through
		 * lo, since we require a loopback src or dst address
		 * or direct binding to 'lo' interface.
		 */
		bool loopback = false;

		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
			loopback = true;
#if IS_ENABLED(CONFIG_IPV6)
		if (tw->tw_family == AF_INET6) {
			/* s6_addr[12] == 127 checks for a v4-mapped 127/8
			 * address, i.e. ::ffff:127.x.y.z.
			 */
			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
			    (ipv6_addr_v4mapped(&tw->tw_v6_daddr) &&
			     (tw->tw_v6_daddr.s6_addr[12] == 127)) ||
			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
			    (ipv6_addr_v4mapped(&tw->tw_v6_rcv_saddr) &&
			     (tw->tw_v6_rcv_saddr.s6_addr[12] == 127)))
				loopback = true;
		} else
#endif
		{
			if (ipv4_is_loopback(tw->tw_daddr) ||
			    ipv4_is_loopback(tw->tw_rcv_saddr))
				loopback = true;
		}
		if (!loopback)
			reuse = 0;
	}

	/* With PAWS, it is safe from the viewpoint
	   of data integrity. Even without PAWS it is safe provided sequence
	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.

	   Actually, the idea is close to VJ's one, only timestamp cache is
	   held not per host, but per port pair and TW bucket is used as state
	   holder.

	   If TW bucket has been already destroyed we fall back to VJ's scheme
	   and use initial timestamp retrieved from peer table.
	 */
	if (tcptw->tw_ts_recent_stamp &&
	    (!twp || (reuse && get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
		/* Start well past the old bucket's last sequence number so the
		 * two incarnations' sequence spaces do not overlap.
		 */
		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
		if (tp->write_seq == 0)
			tp->write_seq = 1;
		tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
		sock_hold(sktw);
		return 1;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(tcp_twsk_unique);

/* Pre-connect hook: run the INET4_CONNECT BPF cgroup program after
 * validating addr_len.  Returns 0 or a negative errno.
 */
static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
			      int addr_len)
{
	/* This check is replicated from tcp_v4_connect() and intended to
	 * prevent BPF program called below from accessing bytes that are out
	 * of the bound specified by user in addr_len.
	 */
	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	sock_owned_by_me(sk);

	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
}

/* This will initiate an outgoing connection.
 * Returns 0 on success or a negative errno (-EINVAL, -EAFNOSUPPORT,
 * -ENETUNREACH, or an error from routing/hashing/tcp_connect()).
 */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	__be16 orig_sport, orig_dport;
	__be32 daddr, nexthop;
	struct flowi4 *fl4;
	struct rtable *rt;
	int err;
	struct ip_options_rcu *inet_opt;
	struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	inet_opt = rcu_dereference_protected(inet->inet_opt,
					     lockdep_sock_is_held(sk));
	if (inet_opt && inet_opt->opt.srr) {
		/* Source routing: route towards the first hop, not the
		 * final destination.
		 */
		if (!daddr)
			return -EINVAL;
		nexthop = inet_opt->opt.faddr;
	}

	orig_sport = inet->inet_sport;
	orig_dport = usin->sin_port;
	fl4 = &inet->cork.fl.u.ip4;
	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
			      IPPROTO_TCP,
			      orig_sport, orig_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		if (err == -ENETUNREACH)
			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
		return err;
	}

	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet_opt || !inet_opt->opt.srr)
		daddr = fl4->daddr;

	if (!inet->inet_saddr)
		inet->inet_saddr = fl4->saddr;
	sk_rcv_saddr_set(sk, inet->inet_saddr);

	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		if (likely(!tp->repair))
			tp->write_seq = 0;
	}

	inet->inet_dport = usin->sin_port;
	sk_daddr_set(sk, daddr);

	inet_csk(sk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;

	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and not releasing socket
	 * lock select source port, enter ourselves into the hash tables and
	 * complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(tcp_death_row, sk);
	if (err)
		goto failure;

	sk_set_txhash(sk);

	/* Re-route now that the (possibly autoselected) source port is known. */
	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
			       inet->inet_sport, inet->inet_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		rt = NULL;
		goto failure;
	}
	/* OK, now commit destination to socket.  */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->dst);
	rt = NULL;

	if (likely(!tp->repair)) {
		if (!tp->write_seq)
			tp->write_seq = secure_tcp_seq(inet->inet_saddr,
						       inet->inet_daddr,
						       inet->inet_sport,
						       usin->sin_port);
		tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
						 inet->inet_saddr,
						 inet->inet_daddr);
	}

	inet->inet_id = tp->write_seq ^ jiffies;

	if (tcp_fastopen_defer_connect(sk, &err))
		return err;
	if (err)
		goto failure;

	err = tcp_connect(sk);

	if (err)
		goto failure;

	return 0;

failure:
	/*
	 * This unhashes the socket and releases the local port,
	 * if necessary.
	 */
	tcp_set_state(sk, TCP_CLOSE);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->inet_dport = 0;
	return err;
}
EXPORT_SYMBOL(tcp_v4_connect);

/*
 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 * It can be called through tcp_release_cb() if socket was owned by user
 * at the time tcp_v4_err() was called to handle ICMP message.
 */
void tcp_v4_mtu_reduced(struct sock *sk)
{
	struct inet_sock *inet = inet_sk(sk);
	struct dst_entry *dst;
	u32 mtu;

	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
		return;
	mtu = tcp_sk(sk)->mtu_info;
	dst = inet_csk_update_pmtu(sk, mtu);
	if (!dst)
		return;

	/* Something is about to be wrong... Remember soft error
	 * for the case, if this connection will not able to recover.
	 */
	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
		sk->sk_err_soft = EMSGSIZE;

	mtu = dst_mtu(dst);

	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    ip_sk_accept_pmtu(sk) &&
	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
		tcp_sync_mss(sk, mtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}
EXPORT_SYMBOL(tcp_v4_mtu_reduced);

/* Hand an ICMP redirect to the cached route, if we still have one. */
static void do_redirect(struct sk_buff *skb, struct sock *sk)
{
	struct dst_entry *dst = __sk_dst_check(sk, 0);

	if (dst)
		dst->ops->redirect(dst, sk, skb);
}


/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
void tcp_req_err(struct sock *sk, u32 seq, bool abort)
{
	struct request_sock *req = inet_reqsk(sk);
	struct net *net = sock_net(sk);

	/* ICMPs are not backlogged, hence we cannot get
	 * an established socket here.
	 */
	if (seq != tcp_rsk(req)->snt_isn) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
	} else if (abort) {
		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
		tcp_listendrop(req->rsk_listener);
	}
	/* Drop the reference taken by the lookup that found this reqsk. */
	reqsk_put(req);
}
EXPORT_SYMBOL(tcp_req_err);

/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 *
 */

void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
{
	/* iph/th point into the payload of the ICMP message, i.e. the
	 * headers of the original (offending) outgoing segment.
	 */
	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
	struct inet_connection_sock *icsk;
	struct tcp_sock *tp;
	struct inet_sock *inet;
	const int type = icmp_hdr(icmp_skb)->type;
	const int code = icmp_hdr(icmp_skb)->code;
	struct sock *sk;
	struct sk_buff *skb;
	struct request_sock *fastopen;
	u32 seq, snd_una;
	s32 remaining;
	u32 delta_us;
	int err;
	struct net *net = dev_net(icmp_skb->dev);

	sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
				       th->dest, iph->saddr, ntohs(th->source),
				       inet_iif(icmp_skb), 0);
	if (!sk) {
		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
		return;
	}
	if (sk->sk_state == TCP_TIME_WAIT) {
		inet_twsk_put(inet_twsk(sk));
		return;
	}
	seq = ntohl(th->seq);
	if (sk->sk_state == TCP_NEW_SYN_RECV)
		/* tcp_req_err() drops the reqsk reference for us. */
		return tcp_req_err(sk, seq,
				   type == ICMP_PARAMETERPROB ||
				   type == ICMP_TIME_EXCEEDED ||
				   (type == ICMP_DEST_UNREACH &&
				    (code == ICMP_NET_UNREACH ||
				     code == ICMP_HOST_UNREACH)));

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 * We do take care of PMTU discovery (RFC1191) special case :
	 * we can receive locally generated ICMP messages while socket is held.
	 */
	if (sock_owned_by_user(sk)) {
		/* NOTE(review): only the counter is bumped here; processing
		 * continues below, with owned-socket cases routed to the
		 * deferred (tsq flag) or soft-error paths.
		 */
		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
	}
	if (sk->sk_state == TCP_CLOSE)
		goto out;

	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
		goto out;
	}

	icsk = inet_csk(sk);
	tp = tcp_sk(sk);
	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
	fastopen = tp->fastopen_rsk;
	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, snd_una, tp->snd_nxt)) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	switch (type) {
	case ICMP_REDIRECT:
		if (!sock_owned_by_user(sk))
			do_redirect(icmp_skb, sk);
		goto out;
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			/* We are not interested in TCP_LISTEN and open_requests
			 * (SYN-ACKs send out by Linux are always <576bytes so
			 * they should go through unfragmented).
			 */
			if (sk->sk_state == TCP_LISTEN)
				goto out;

			tp->mtu_info = info;
			if (!sock_owned_by_user(sk)) {
				tcp_v4_mtu_reduced(sk);
			} else {
				/* Defer to tcp_release_cb(); the extra hold is
				 * released after the deferred work runs.
				 */
				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
					sock_hold(sk);
			}
			goto out;
		}

		err = icmp_err_convert[code].errno;
		/* check if icmp_skb allows revert of backoff
		 * (see draft-zimmermann-tcp-lcd) */
		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
			break;
		if (seq != tp->snd_una || !icsk->icsk_retransmits ||
		    !icsk->icsk_backoff || fastopen)
			break;

		if (sock_owned_by_user(sk))
			break;

		icsk->icsk_backoff--;
		icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
					       TCP_TIMEOUT_INIT;
		icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);

		skb = tcp_rtx_queue_head(sk);
		BUG_ON(!skb);

		tcp_mstamp_refresh(tp);
		delta_us = (u32)(tp->tcp_mstamp - skb->skb_mstamp);
		remaining = icsk->icsk_rto -
			    usecs_to_jiffies(delta_us);

		if (remaining > 0) {
			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
						  remaining, TCP_RTO_MAX);
		} else {
			/* RTO revert clocked out retransmission.
			 * Will retransmit now */
			tcp_retransmit_timer(sk);
		}

		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->sk_state) {
	case TCP_SYN_SENT:
	case TCP_SYN_RECV:
		/* Only in fast or simultaneous open. If a fast open socket is
		 * is already accepted it is treated as a connected one below.
		 */
		if (fastopen && !fastopen->sk)
			break;

		if (!sock_owned_by_user(sk)) {
			sk->sk_err = err;

			sk->sk_error_report(sk);

			tcp_done(sk);
		} else {
			sk->sk_err_soft = err;
		}
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows to consider as hard errors
	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note, that in modern internet, where routing is unreliable
	 * and in each dark corner broken firewalls sit, sending random
	 * errors ordered by their masters even this two messages finally lose
	 * their original sense (even Linux sends invalid PORT_UNREACHs)
	 *
	 * Now we are in compliance with RFCs.
	 * --ANK (980905)
	 */

	inet = inet_sk(sk);
	if (!sock_owned_by_user(sk) && inet->recverr) {
		sk->sk_err = err;
		sk->sk_error_report(sk);
	} else	{ /* Only an error on timeout */
		sk->sk_err_soft = err;
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
}

/* Seed th->check with the pseudo-header sum and point csum_start/csum_offset
 * at the TCP checksum field so the remainder can be filled in later
 * (presumably by checksum offload or software fallback — the completion
 * happens outside this function).
 */
void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
{
	struct tcphdr *th = tcp_hdr(skb);

	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
	skb->csum_start = skb_transport_header(skb) - skb->head;
	skb->csum_offset = offsetof(struct tcphdr, check);
}

/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
{
	const struct inet_sock *inet = inet_sk(sk);

	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
}
EXPORT_SYMBOL(tcp_v4_send_check);

/*
 *	This routine will send an RST to the other tcp.
 *
 *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 *		      for reset.
 *	Answer: if a packet caused RST, it is not for a socket
 *		existing in our system, if it is matched to a socket,
 *		it is just duplicate segment or bug in other side's TCP.
 *		So that we build reply only basing on parameters
 *		arrived with segment.
 *	Exception: precedence violation. We do not implement it in any case.
 */

/* @sk may be NULL (no matching socket was found for the offending segment);
 * the reply is then built purely from @skb's own headers and route.
 */
static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
#ifdef CONFIG_TCP_MD5SIG
		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
#endif
	} rep;
	struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key = NULL;
	const __u8 *hash_location = NULL;
	unsigned char newhash[16];
	int genhash;
	struct sock *sk1 = NULL;
#endif
	struct net *net;
	struct sock *ctl_sk;

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	/* If sk not NULL, it means we did a successful lookup and incoming
	 * route had to be correct. prequeue might have dropped our dst.
	 */
	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rep, 0, sizeof(rep));
	rep.th.dest = th->source;
	rep.th.source = th->dest;
	rep.th.doff = sizeof(struct tcphdr) / 4;
	rep.th.rst = 1;

	if (th->ack) {
		rep.th.seq = th->ack_seq;
	} else {
		/* No ACK to echo: acknowledge the offending segment instead
		 * (SYN and FIN each consume one sequence number).
		 */
		rep.th.ack = 1;
		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				       skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof(arg));
	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len = sizeof(rep.th);

	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
#ifdef CONFIG_TCP_MD5SIG
	rcu_read_lock();
	hash_location = tcp_parse_md5sig_option(th);
	if (sk && sk_fullsock(sk)) {
		key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
					&ip_hdr(skb)->saddr, AF_INET);
	} else if (hash_location) {
		/*
		 * active side is lost. Try to find listening socket through
		 * source port, and then find md5 key through listening socket.
		 * we are not loose security here:
		 * Incoming packet is checked with md5 hash with finding key,
		 * no RST generated if md5 hash doesn't match.
		 */
		sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
					     ip_hdr(skb)->saddr,
					     th->source, ip_hdr(skb)->daddr,
					     ntohs(th->source), inet_iif(skb),
					     tcp_v4_sdif(skb));
		/* don't send rst if it can't find key */
		if (!sk1)
			goto out;

		key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
					&ip_hdr(skb)->saddr, AF_INET);
		if (!key)
			goto out;


		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
		if (genhash || memcmp(hash_location, newhash, 16) != 0)
			goto out;

	}

	if (key) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
				   (TCPOPT_NOP << 16) |
				   (TCPOPT_MD5SIG << 8) |
				   TCPOLEN_MD5SIG);
		/* Update length and the length the header thinks exists */
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;

	/* When socket is gone, all binding information is lost.
	 * routing might fail in this case. No choice here, if we choose to force
	 * input interface, we will misroute in case of asymmetric route.
	 */
	if (sk) {
		arg.bound_dev_if = sk->sk_bound_dev_if;
		if (sk_fullsock(sk))
			trace_tcp_send_reset(sk, skb);
	}

	/* sk here may really be a timewait sock; the aliasing below relies
	 * on the two fields sharing an offset.
	 */
	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));

	arg.tos = ip_hdr(skb)->tos;
	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
	local_bh_disable();
	ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk);
	if (sk)
		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
				   inet_twsk(sk)->tw_mark : sk->sk_mark;
	ip_send_unicast_reply(ctl_sk,
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len);

	ctl_sk->sk_mark = 0;
	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
	local_bh_enable();

#ifdef CONFIG_TCP_MD5SIG
out:
	rcu_read_unlock();
#endif
}

/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
   outside socket context is ugly, certainly. What can I do?
 */

static void tcp_v4_send_ack(const struct sock *sk,
			    struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 tsval, u32 tsecr, int oif,
			    struct tcp_md5sig_key *key,
			    int reply_flags, u8 tos)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
#ifdef CONFIG_TCP_MD5SIG
			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
#endif
			];
	} rep;
	struct net *net = sock_net(sk);
	struct ip_reply_arg arg;
	struct sock *ctl_sk;

	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof(arg));

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len = sizeof(rep.th);
	if (tsecr) {
		/* Echo the peer's timestamp (RFC 7323). */
		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				   (TCPOPT_TIMESTAMP << 8) |
				   TCPOLEN_TIMESTAMP);
		rep.opt[1] = htonl(tsval);
		rep.opt[2] = htonl(tsecr);
		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
	}

	/* Swap the send and the receive. */
	rep.th.dest = th->source;
	rep.th.source = th->dest;
	rep.th.doff = arg.iov[0].iov_len / 4;
	rep.th.seq = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack = 1;
	rep.th.window = htons(win);

#ifdef CONFIG_TCP_MD5SIG
	if (key) {
		/* MD5 option goes after the (optional) timestamp words. */
		int offset = (tsecr) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
					  (TCPOPT_NOP << 16) |
					  (TCPOPT_MD5SIG << 8) |
					  TCPOLEN_MD5SIG);
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len/4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.flags = reply_flags;
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	if (oif)
		arg.bound_dev_if = oif;
	arg.tos = tos;
	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
	local_bh_disable();
	ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk);
	if (sk)
		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
				   inet_twsk(sk)->tw_mark : sk->sk_mark;
	ip_send_unicast_reply(ctl_sk,
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len);

	ctl_sk->sk_mark = 0;
	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	local_bh_enable();
}

/* ACK a segment received for a TIME-WAIT socket, then drop the tw ref. */
static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
	struct inet_timewait_sock *tw = inet_twsk(sk);
	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);

	tcp_v4_send_ack(sk, skb,
			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
			tcp_time_stamp_raw() + tcptw->tw_ts_offset,
			tcptw->tw_ts_recent,
			tw->tw_bound_dev_if,
			tcp_twsk_md5_key(tcptw),
			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
			tw->tw_tos
			);

	inet_twsk_put(tw);
}

static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req)
{
	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
	 */
	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
					     tcp_sk(sk)->snd_nxt;

	/* RFC 7323 2.3
	 * The window field (SEG.WND) of every outgoing segment, with the
	 * exception of <SYN> segments, MUST be right-shifted by
	 * Rcv.Wind.Shift bits:
	 */
	tcp_v4_send_ack(sk, skb, seq,
			tcp_rsk(req)->rcv_nxt,
			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
			tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
			req->ts_recent,
			0,
			tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->saddr,
					  AF_INET),
			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
			ip_hdr(skb)->tos);
}

/*
 *	Send a SYN-ACK after having received a SYN.
 *	This still operates on a request_sock only, not on a big
 *	socket.
 *	Returns 0 on success, -1 on routing/allocation failure, or a
 *	net_xmit_eval()-translated transmit error.
 */
static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
			      struct flowi *fl,
			      struct request_sock *req,
			      struct tcp_fastopen_cookie *foc,
			      enum tcp_synack_type synack_type)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	struct flowi4 fl4;
	int err = -1;
	struct sk_buff *skb;

	/* First, grab a route. */
	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
		return -1;

	skb = tcp_make_synack(sk, dst, req, foc, synack_type);

	if (skb) {
		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);

		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
					    ireq->ir_rmt_addr,
					    ireq_opt_deref(ireq));
		err = net_xmit_eval(err);
	}

	return err;
}

/*
 *	IPv4 request_sock destructor.
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
}

#ifdef CONFIG_TCP_MD5SIG
/*
 * RFC2385 MD5 checksumming requires a mapping of
 * IP address->MD5 Key.
 * We need to maintain these in the sk structure.
 */

/* Find the Key structure for an address.
 * Returns the configured key whose address/prefix matches @addr with the
 * longest prefix, or NULL if none matches.
 */
struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
					 const union tcp_md5_addr *addr,
					 int family)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	const struct tcp_md5sig_info *md5sig;
	__be32 mask;
	struct tcp_md5sig_key *best_match = NULL;
	bool match;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       lockdep_sock_is_held(sk));
	if (!md5sig)
		return NULL;

	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
		if (key->family != family)
			continue;

		if (family == AF_INET) {
			mask = inet_make_mask(key->prefixlen);
			match = (key->addr.a4.s_addr & mask) ==
				(addr->a4.s_addr & mask);
#if IS_ENABLED(CONFIG_IPV6)
		} else if (family == AF_INET6) {
			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
						  key->prefixlen);
#endif
		} else {
			match = false;
		}

		/* Longest-prefix match wins. */
		if (match && (!best_match ||
			      key->prefixlen > best_match->prefixlen))
			best_match = key;
	}
	return best_match;
}
EXPORT_SYMBOL(tcp_md5_do_lookup);

/* Like tcp_md5_do_lookup(), but requires an exact address and prefixlen
 * match instead of longest-prefix matching.  Used for add/delete.
 */
static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
						      const union tcp_md5_addr *addr,
						      int family, u8 prefixlen)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	unsigned int size = sizeof(struct in_addr);
	const struct tcp_md5sig_info *md5sig;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       lockdep_sock_is_held(sk));
	if (!md5sig)
		return NULL;
#if IS_ENABLED(CONFIG_IPV6)
	if (family == AF_INET6)
		size = sizeof(struct in6_addr);
#endif
	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
		if (key->family != family)
			continue;
		if (!memcmp(&key->addr, addr, size) &&
		    key->prefixlen == prefixlen)
			return key;
	}
	return NULL;
}
/* IPv4 af-ops hook: look up the MD5 key for the peer of @addr_sk. */
struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
					 const struct sock *addr_sk)
{
	const union tcp_md5_addr *addr;

	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
	return tcp_md5_do_lookup(sk, addr, AF_INET);
}
EXPORT_SYMBOL(tcp_v4_md5_lookup);

/* This can be called on a newly created socket, from other files */
/* Install (or update in place) an MD5 key for @addr/@prefixlen on @sk.
 * Allocates the per-socket md5sig_info container on first use and
 * disables GSO for the socket (segments must carry per-segment MD5).
 * Returns 0 on success or -ENOMEM.
 *
 * NOTE(review): when a pre-existing key is updated, key->key and
 * key->keylen are rewritten while RCU readers may be hashing with the
 * key concurrently — confirm the required ordering/barriers are handled
 * by the callers or readers.
 */
int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
		   int family, u8 prefixlen, const u8 *newkey, u8 newkeylen,
		   gfp_t gfp)
{
	/* Add Key to the list */
	struct tcp_md5sig_key *key;
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_info *md5sig;

	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
	if (key) {
		/* Pre-existing entry - just update that one. */
		memcpy(key->key, newkey, newkeylen);
		key->keylen = newkeylen;
		return 0;
	}

	md5sig = rcu_dereference_protected(tp->md5sig_info,
					   lockdep_sock_is_held(sk));
	if (!md5sig) {
		md5sig = kmalloc(sizeof(*md5sig), gfp);
		if (!md5sig)
			return -ENOMEM;

		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
		INIT_HLIST_HEAD(&md5sig->head);
		rcu_assign_pointer(tp->md5sig_info, md5sig);
	}

	key = sock_kmalloc(sk, sizeof(*key), gfp);
	if (!key)
		return -ENOMEM;
	if (!tcp_alloc_md5sig_pool()) {
		sock_kfree_s(sk, key, sizeof(*key));
		return -ENOMEM;
	}

	memcpy(key->key, newkey, newkeylen);
	key->keylen = newkeylen;
	key->family = family;
	key->prefixlen = prefixlen;
	memcpy(&key->addr, addr,
	       (family == AF_INET6) ?
			      cmd.tcpm_keylen, GFP_KERNEL);
}

/* Feed the IPv4 pseudo-header plus the TCP header (with checksum
 * zeroed) into the per-cpu ahash request.  @nbytes is the TCP length
 * recorded in the pseudo-header.  Returns the crypto_ahash_update()
 * result (0 on success).
 */
static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
				   __be32 daddr, __be32 saddr,
				   const struct tcphdr *th, int nbytes)
{
	struct tcp4_pseudohdr *bp;
	struct scatterlist sg;
	struct tcphdr *_th;

	/* Build the pseudo-header in the pool's scratch area */
	bp = hp->scratch;
	bp->saddr = saddr;
	bp->daddr = daddr;
	bp->pad = 0;
	bp->protocol = IPPROTO_TCP;
	bp->len = cpu_to_be16(nbytes);

	/* Copy the TCP header right after it, with checksum zeroed,
	 * as the MD5 signature is computed over a zero checksum field.
	 */
	_th = (struct tcphdr *)(bp + 1);
	memcpy(_th, th, sizeof(*th));
	_th->check = 0;

	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
	ahash_request_set_crypt(hp->md5_req, &sg, NULL,
				sizeof(*bp) + sizeof(*th));
	return crypto_ahash_update(hp->md5_req);
}

/* Compute the MD5 signature over pseudo-header + TCP header only
 * (no payload).  On any failure the 16-byte output is zeroed and 1 is
 * returned; 0 on success.
 */
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
{
	struct tcp_md5sig_pool *hp;
	struct ahash_request *req;

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	req = hp->md5_req;

	if (crypto_ahash_init(req))
		goto clear_hash;
	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	ahash_request_set_crypt(req, NULL, md5_hash, 0);
	if (crypto_ahash_final(req))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}

/* Compute the MD5 signature over pseudo-header + TCP header + payload
 * of @skb.  Addresses come from @sk when given (established/request
 * sockets), otherwise from the skb's IP header.  On failure the output
 * is zeroed and 1 is returned; 0 on success.
 */
int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
			const struct sock *sk,
			const struct sk_buff *skb)
{
	struct tcp_md5sig_pool *hp;
	struct ahash_request *req;
	const struct tcphdr *th = tcp_hdr(skb);
	__be32 saddr, daddr;

	if (sk) { /* valid for establish/request sockets */
		saddr = sk->sk_rcv_saddr;
		daddr = sk->sk_daddr;
	} else {
		const struct iphdr *iph = ip_hdr(skb);
		saddr = iph->saddr;
		daddr = iph->daddr;
	}

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	req = hp->md5_req;

	if (crypto_ahash_init(req))
		goto clear_hash;

	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
		goto clear_hash;
	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	ahash_request_set_crypt(req, NULL, md5_hash, 0);
	if (crypto_ahash_final(req))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}
EXPORT_SYMBOL(tcp_v4_md5_hash_skb);

#endif

/* Called with rcu_read_lock() */
/* Validate the MD5 option of an incoming segment against the key
 * configured for the peer.  Returns true when the segment must be
 * dropped, false when it may proceed.  Always false when
 * CONFIG_TCP_MD5SIG is off.
 */
static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
				    const struct sk_buff *skb)
{
#ifdef CONFIG_TCP_MD5SIG
	/*
	 * This gets called for each TCP segment that arrives
	 * so we want to be efficient.
	 * We have 3 drop cases:
	 * o No MD5 hash and one expected.
	 * o MD5 hash and we're not expecting one.
	 * o MD5 hash and its wrong.
	 */
	const __u8 *hash_location = NULL;
	struct tcp_md5sig_key *hash_expected;
	const struct iphdr *iph = ip_hdr(skb);
	const struct tcphdr *th = tcp_hdr(skb);
	int genhash;
	unsigned char newhash[16];

	hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
					  AF_INET);
	hash_location = tcp_parse_md5sig_option(th);

	/* We've parsed the options - do we have a hash? */
	if (!hash_expected && !hash_location)
		return false;

	if (hash_expected && !hash_location) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
		return true;
	}

	if (!hash_expected && hash_location) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
		return true;
	}

	/* Okay, so this is hash_expected and hash_location -
	 * so we need to calculate the checksum.
	 */
	genhash = tcp_v4_md5_hash_skb(newhash,
				      hash_expected,
				      NULL, skb);

	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
				     &iph->saddr, ntohs(th->source),
				     &iph->daddr, ntohs(th->dest),
				     genhash ? " tcp_v4_calc_md5_hash failed"
					     : "");
		return true;
	}
	return false;
#endif
	return false;
}

/* IPv4 request_sock init: record the segment's addresses (reversed)
 * and stash any IP options for the eventual child socket.
 */
static void tcp_v4_init_req(struct request_sock *req,
			    const struct sock *sk_listener,
			    struct sk_buff *skb)
{
	struct inet_request_sock *ireq = inet_rsk(req);
	struct net *net = sock_net(sk_listener);

	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
}

/* Resolve the route to answer a connection request. */
static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
					  struct flowi *fl,
					  const struct request_sock *req)
{
	return inet_csk_route_req(sk, &fl->u.ip4, req);
}

struct request_sock_ops tcp_request_sock_ops __read_mostly = {
	.family		=	PF_INET,
	.obj_size	=	sizeof(struct tcp_request_sock),
	.rtx_syn_ack	=	tcp_rtx_synack,
	.send_ack	=	tcp_v4_reqsk_send_ack,
	.destructor	=	tcp_v4_reqsk_destructor,
	.send_reset	=	tcp_v4_send_reset,
	.syn_ack_timeout =	tcp_syn_ack_timeout,
};

static const struct tcp_request_sock_ops
tcp_request_sock_ipv4_ops = {
	.mss_clamp	=	TCP_MSS_DEFAULT,
#ifdef CONFIG_TCP_MD5SIG
	.req_md5_lookup	=	tcp_v4_md5_lookup,
	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
#endif
	.init_req	=	tcp_v4_init_req,
#ifdef CONFIG_SYN_COOKIES
	.cookie_init_seq =	cookie_v4_init_sequence,
#endif
	.route_req	=	tcp_v4_route_req,
	.init_seq	=	tcp_v4_init_seq,
	.init_ts_off	=	tcp_v4_init_ts_off,
	.send_synack	=	tcp_v4_send_synack,
};

/* Handle an incoming SYN on a listening socket: reject broadcast and
 * multicast destinations, then hand off to the generic af-independent
 * tcp_conn_request() with the IPv4 ops above.
 */
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	/* Never answer to SYNs send to broadcast or multicast */
	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	return tcp_conn_request(&tcp_request_sock_ops,
				&tcp_request_sock_ipv4_ops, sk, skb);

drop:
	tcp_listendrop(sk);
	return 0;
}
EXPORT_SYMBOL(tcp_v4_conn_request);


/*
 *	The three way handshake has completed - we got a valid synack -
 *	now create the new socket.
 */
struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct dst_entry *dst,
				  struct request_sock *req_unhash,
				  bool *own_req)
{
	struct inet_request_sock *ireq;
	struct inet_sock *newinet;
	struct tcp_sock *newtp;
	struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key;
#endif
	struct ip_options_rcu *inet_opt;

	if (sk_acceptq_is_full(sk))
		goto exit_overflow;

	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit_nonewsk;

	newsk->sk_gso_type = SKB_GSO_TCPV4;
	inet_sk_rx_dst_set(newsk, skb);

	/* Copy addressing and IP-level state from the request sock */
	newtp		      = tcp_sk(newsk);
	newinet		      = inet_sk(newsk);
	ireq		      = inet_rsk(req);
	sk_daddr_set(newsk, ireq->ir_rmt_addr);
	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
	newsk->sk_bound_dev_if = ireq->ir_iif;
	newinet->inet_saddr   = ireq->ir_loc_addr;
	inet_opt	      = rcu_dereference(ireq->ireq_opt);
	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
	newinet->mc_index     = inet_iif(skb);
	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
	newinet->rcv_tos      = ip_hdr(skb)->tos;
	inet_csk(newsk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
	/* Seed the IP-ID generator for this connection */
	newinet->inet_id = newtp->write_seq ^ jiffies;

	if (!dst) {
		dst = inet_csk_route_child_sock(sk, newsk, req);
		if (!dst)
			goto put_and_exit;
	} else {
		/* syncookie case : see end of cookie_v4_check() */
	}
	sk_setup_caps(newsk, dst);

	tcp_ca_openreq_child(newsk, dst);

	tcp_sync_mss(newsk, dst_mtu(dst));
	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));

	tcp_initialize_rcv_mss(newsk);

#ifdef CONFIG_TCP_MD5SIG
	/* Copy over the MD5 key from the original socket */
	key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
				AF_INET);
	if (key) {
		/*
		 * We're using one, so create a matching key
		 * on the newsk structure. If we fail to get
		 * memory, then we end up not copying the key
		 * across. Shucks.
		 */
		tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
			       AF_INET, 32, key->key, key->keylen, GFP_ATOMIC);
		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
	}
#endif

	if (__inet_inherit_port(sk, newsk) < 0)
		goto put_and_exit;
	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
	if (likely(*own_req)) {
		tcp_move_syn(newtp, req);
		/* Ownership of ireq_opt moved to newsk above */
		ireq->ireq_opt = NULL;
	} else {
		newinet->inet_opt = NULL;
	}
	return newsk;

exit_overflow:
	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
exit_nonewsk:
	dst_release(dst);
exit:
	tcp_listendrop(sk);
	return NULL;
put_and_exit:
	newinet->inet_opt = NULL;
	inet_csk_prepare_forced_close(newsk);
	tcp_done(newsk);
	goto exit;
}
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);

/* On a non-SYN segment reaching a listener, try to recover a
 * connection from a SYN cookie.  May return @sk unchanged, a new
 * socket, or NULL (invalid cookie).
 */
static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
{
#ifdef CONFIG_SYN_COOKIES
	const struct tcphdr *th = tcp_hdr(skb);

	if (!th->syn)
		sk = cookie_v4_check(sk, skb);
#endif
	return sk;
}

/* The socket must have it's spinlock held when we get
 * here, unless it is a TCP_LISTEN socket.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
1501 */ 1502 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) 1503 { 1504 struct sock *rsk; 1505 1506 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ 1507 struct dst_entry *dst = sk->sk_rx_dst; 1508 1509 sock_rps_save_rxhash(sk, skb); 1510 sk_mark_napi_id(sk, skb); 1511 if (dst) { 1512 if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif || 1513 !dst->ops->check(dst, 0)) { 1514 dst_release(dst); 1515 sk->sk_rx_dst = NULL; 1516 } 1517 } 1518 tcp_rcv_established(sk, skb); 1519 return 0; 1520 } 1521 1522 if (tcp_checksum_complete(skb)) 1523 goto csum_err; 1524 1525 if (sk->sk_state == TCP_LISTEN) { 1526 struct sock *nsk = tcp_v4_cookie_check(sk, skb); 1527 1528 if (!nsk) 1529 goto discard; 1530 if (nsk != sk) { 1531 if (tcp_child_process(sk, nsk, skb)) { 1532 rsk = nsk; 1533 goto reset; 1534 } 1535 return 0; 1536 } 1537 } else 1538 sock_rps_save_rxhash(sk, skb); 1539 1540 if (tcp_rcv_state_process(sk, skb)) { 1541 rsk = sk; 1542 goto reset; 1543 } 1544 return 0; 1545 1546 reset: 1547 tcp_v4_send_reset(rsk, skb); 1548 discard: 1549 kfree_skb(skb); 1550 /* Be careful here. If this function gets more complicated and 1551 * gcc suffers from register pressure on the x86, sk (in %ebx) 1552 * might be destroyed here. This current version compiles correctly, 1553 * but you have been warned. 
1554 */ 1555 return 0; 1556 1557 csum_err: 1558 TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); 1559 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); 1560 goto discard; 1561 } 1562 EXPORT_SYMBOL(tcp_v4_do_rcv); 1563 1564 int tcp_v4_early_demux(struct sk_buff *skb) 1565 { 1566 const struct iphdr *iph; 1567 const struct tcphdr *th; 1568 struct sock *sk; 1569 1570 if (skb->pkt_type != PACKET_HOST) 1571 return 0; 1572 1573 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr))) 1574 return 0; 1575 1576 iph = ip_hdr(skb); 1577 th = tcp_hdr(skb); 1578 1579 if (th->doff < sizeof(struct tcphdr) / 4) 1580 return 0; 1581 1582 sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo, 1583 iph->saddr, th->source, 1584 iph->daddr, ntohs(th->dest), 1585 skb->skb_iif, inet_sdif(skb)); 1586 if (sk) { 1587 skb->sk = sk; 1588 skb->destructor = sock_edemux; 1589 if (sk_fullsock(sk)) { 1590 struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst); 1591 1592 if (dst) 1593 dst = dst_check(dst, 0); 1594 if (dst && 1595 inet_sk(sk)->rx_dst_ifindex == skb->skb_iif) 1596 skb_dst_set_noref(skb, dst); 1597 } 1598 } 1599 return 0; 1600 } 1601 1602 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb) 1603 { 1604 u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf; 1605 1606 /* Only socket owner can try to collapse/prune rx queues 1607 * to reduce memory overhead, so add a little headroom here. 1608 * Few sockets backlog are possibly concurrently non empty. 1609 */ 1610 limit += 64*1024; 1611 1612 /* In case all data was pulled from skb frags (in __pskb_pull_tail()), 1613 * we can fix skb->truesize to its real value to avoid future drops. 1614 * This is valid because skb is not yet charged to the socket. 1615 * It has been noticed pure SACK packets were sometimes dropped 1616 * (if cooked by drivers without copybreak feature). 
	 */
	skb_condense(skb);

	if (unlikely(sk_add_backlog(sk, skb, limit))) {
		bh_unlock_sock(sk);
		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
		return true;
	}
	return false;
}
EXPORT_SYMBOL(tcp_add_backlog);

/* Run the socket filter on @skb, trimming it to at least the TCP
 * header; adjusts end_seq for any trimmed payload.  Returns the
 * sk_filter_trim_cap() error (non-zero means drop).
 */
int tcp_filter(struct sock *sk, struct sk_buff *skb)
{
	struct tcphdr *th = (struct tcphdr *)skb->data;
	unsigned int eaten = skb->len;
	int err;

	err = sk_filter_trim_cap(sk, skb, th->doff * 4);
	if (!err) {
		/* eaten = bytes removed by the filter trim */
		eaten -= skb->len;
		TCP_SKB_CB(skb)->end_seq -= eaten;
	}
	return err;
}
EXPORT_SYMBOL(tcp_filter);

/* Undo tcp_v4_fill_cb(): move the IP control block back into place so
 * the skb can be re-processed (e.g. after a failed request lookup).
 */
static void tcp_v4_restore_cb(struct sk_buff *skb)
{
	memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
		sizeof(struct inet_skb_parm));
}

/* Populate TCP_SKB_CB() from the headers, preserving the IP control
 * block inside it first.
 */
static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
			   const struct tcphdr *th)
{
	/* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
	 * barrier() makes sure compiler wont play fool^Waliasing games.
	 */
	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
		sizeof(struct inet_skb_parm));
	barrier();

	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
				    skb->len - th->doff * 4);
	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
	TCP_SKB_CB(skb)->sacked	 = 0;
	TCP_SKB_CB(skb)->has_rxtstamp =
			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
}

/*
 *	From tcp_input.c
 */

/* Main IPv4 TCP receive entry point (called from the IP layer).
 * Validates the headers and checksum, looks up the owning socket and
 * dispatches to the listen / request / established / time-wait paths.
 */
int tcp_v4_rcv(struct sk_buff *skb)
{
	struct net *net = dev_net(skb->dev);
	int sdif = inet_sdif(skb);
	const struct iphdr *iph;
	const struct tcphdr *th;
	bool refcounted;
	struct sock *sk;
	int ret;

	if (skb->pkt_type != PACKET_HOST)
		goto discard_it;

	/* Count it even if it's bad */
	__TCP_INC_STATS(net, TCP_MIB_INSEGS);

	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
		goto discard_it;

	th = (const struct tcphdr *)skb->data;

	if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
		goto bad_packet;
	if (!pskb_may_pull(skb, th->doff * 4))
		goto discard_it;

	/* An explanation is required here, I think.
	 * Packet length and doff are validated by header prediction,
	 * provided case of th->doff==0 is eliminated.
	 * So, we defer the checks. */

	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
		goto csum_error;

	/* pskb_may_pull() may have reallocated the header */
	th = (const struct tcphdr *)skb->data;
	iph = ip_hdr(skb);
lookup:
	sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
			       th->dest, sdif, &refcounted);
	if (!sk)
		goto no_tcp_socket;

process:
	if (sk->sk_state == TCP_TIME_WAIT)
		goto do_time_wait;

	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		struct request_sock *req = inet_reqsk(sk);
		bool req_stolen = false;
		struct sock *nsk;

		sk = req->rsk_listener;
		if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
			sk_drops_add(sk, skb);
			reqsk_put(req);
			goto discard_it;
		}
		if (unlikely(sk->sk_state != TCP_LISTEN)) {
			/* Listener is gone; drop the request and retry
			 * the lookup from scratch.
			 */
			inet_csk_reqsk_queue_drop_and_put(sk, req);
			goto lookup;
		}
		/* We own a reference on the listener, increase it again
		 * as we might lose it too soon.
		 */
		sock_hold(sk);
		refcounted = true;
		nsk = NULL;
		if (!tcp_filter(sk, skb)) {
			th = (const struct tcphdr *)skb->data;
			iph = ip_hdr(skb);
			tcp_v4_fill_cb(skb, iph, th);
			nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
		}
		if (!nsk) {
			reqsk_put(req);
			if (req_stolen) {
				/* Another cpu got exclusive access to req
				 * and created a full blown socket.
				 * Try to feed this packet to this socket
				 * instead of discarding it.
				 */
				tcp_v4_restore_cb(skb);
				sock_put(sk);
				goto lookup;
			}
			goto discard_and_relse;
		}
		if (nsk == sk) {
			reqsk_put(req);
			tcp_v4_restore_cb(skb);
		} else if (tcp_child_process(sk, nsk, skb)) {
			tcp_v4_send_reset(nsk, skb);
			goto discard_and_relse;
		} else {
			sock_put(sk);
			return 0;
		}
	}
	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
		goto discard_and_relse;
	}

	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
		goto discard_and_relse;

	if (tcp_v4_inbound_md5_hash(sk, skb))
		goto discard_and_relse;

	nf_reset(skb);

	if (tcp_filter(sk, skb))
		goto discard_and_relse;
	th = (const struct tcphdr *)skb->data;
	iph = ip_hdr(skb);
	tcp_v4_fill_cb(skb, iph, th);

	skb->dev = NULL;

	if (sk->sk_state == TCP_LISTEN) {
		ret = tcp_v4_do_rcv(sk, skb);
		goto put_and_return;
	}

	sk_incoming_cpu_update(sk);

	bh_lock_sock_nested(sk);
	tcp_segs_in(tcp_sk(sk), skb);
	ret = 0;
	if (!sock_owned_by_user(sk)) {
		ret = tcp_v4_do_rcv(sk, skb);
	} else if (tcp_add_backlog(sk, skb)) {
		/* tcp_add_backlog() already unlocked the socket */
		goto discard_and_relse;
	}
	bh_unlock_sock(sk);

put_and_return:
	if (refcounted)
		sock_put(sk);

	return ret;

no_tcp_socket:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
		goto discard_it;

	tcp_v4_fill_cb(skb, iph, th);

	if (tcp_checksum_complete(skb)) {
csum_error:
		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
bad_packet:
		__TCP_INC_STATS(net, TCP_MIB_INERRS);
	} else {
		tcp_v4_send_reset(NULL, skb);
	}

discard_it:
	/* Discard frame. */
	kfree_skb(skb);
	return 0;

discard_and_relse:
	sk_drops_add(sk, skb);
	if (refcounted)
		sock_put(sk);
	goto discard_it;

do_time_wait:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto discard_it;
	}

	tcp_v4_fill_cb(skb, iph, th);

	if (tcp_checksum_complete(skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto csum_error;
	}
	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
	case TCP_TW_SYN: {
		/* A new SYN hit a TIME_WAIT socket: see if a listener
		 * can take it over, recycling the time-wait slot.
		 */
		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
							&tcp_hashinfo, skb,
							__tcp_hdrlen(th),
							iph->saddr, th->source,
							iph->daddr, th->dest,
							inet_iif(skb),
							sdif);
		if (sk2) {
			inet_twsk_deschedule_put(inet_twsk(sk));
			sk = sk2;
			tcp_v4_restore_cb(skb);
			refcounted = false;
			goto process;
		}
	}
		/* to ACK */
		/* fall through */
	case TCP_TW_ACK:
		tcp_v4_timewait_ack(sk, skb);
		break;
	case TCP_TW_RST:
		tcp_v4_send_reset(sk, skb);
		inet_twsk_deschedule_put(inet_twsk(sk));
		goto discard_it;
	case TCP_TW_SUCCESS:;
	}
	goto discard_it;
}

static struct timewait_sock_ops tcp_timewait_sock_ops = {
	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
	.twsk_unique	= tcp_twsk_unique,
	.twsk_destructor= tcp_twsk_destructor,
};

/* Cache the skb's dst on the socket for the receive fast path,
 * remembering the incoming interface for later validation.
 */
void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);

	if (dst && dst_hold_safe(dst)) {
		sk->sk_rx_dst = dst;
		inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
	}
}
EXPORT_SYMBOL(inet_sk_rx_dst_set);

const struct inet_connection_sock_af_ops ipv4_specific = {
	.queue_xmit	   = ip_queue_xmit,
	.send_check	   = tcp_v4_send_check,
	.rebuild_header	   = inet_sk_rebuild_header,
	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
	.conn_request	   = tcp_v4_conn_request,
	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
	.net_header_len	   = sizeof(struct iphdr),
	.setsockopt	   = ip_setsockopt,
	.getsockopt	   = ip_getsockopt,
	.addr2sockaddr	   = inet_csk_addr2sockaddr,
	.sockaddr_len	   = sizeof(struct sockaddr_in),
#ifdef CONFIG_COMPAT
	.compat_setsockopt = compat_ip_setsockopt,
	.compat_getsockopt = compat_ip_getsockopt,
#endif
	.mtu_reduced	   = tcp_v4_mtu_reduced,
};
EXPORT_SYMBOL(ipv4_specific);

#ifdef CONFIG_TCP_MD5SIG
static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
	.md5_lookup	= tcp_v4_md5_lookup,
	.calc_md5_hash	= tcp_v4_md5_hash_skb,
	.md5_parse	= tcp_v4_parse_md5_keys,
};
#endif

/* NOTE: A lot of things set to zero explicitly by call to
 *       sk_alloc() so need not be done here.
 */
/* Per-socket init for IPv4 TCP: generic TCP init plus the IPv4 af-ops
 * (and MD5 ops when configured).  Always returns 0.
 */
static int tcp_v4_init_sock(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	tcp_init_sock(sk);

	icsk->icsk_af_ops = &ipv4_specific;

#ifdef CONFIG_TCP_MD5SIG
	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
#endif

	return 0;
}

/* Tear down all TCP-specific state when a socket is destroyed:
 * timers, congestion control, ULP, queues, MD5 keys, fastopen state.
 */
void tcp_v4_destroy_sock(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	trace_tcp_destroy_sock(sk);

	tcp_clear_xmit_timers(sk);

	tcp_cleanup_congestion_control(sk);

	tcp_cleanup_ulp(sk);

	/* Cleanup up the write buffer. */
	tcp_write_queue_purge(sk);

	/* Check if we want to disable active TFO */
	tcp_fastopen_active_disable_ofo_check(sk);

	/* Cleans up our, hopefully empty, out_of_order_queue. */
	skb_rbtree_purge(&tp->out_of_order_queue);

#ifdef CONFIG_TCP_MD5SIG
	/* Clean up the MD5 key list, if any */
	if (tp->md5sig_info) {
		tcp_clear_md5_list(sk);
		kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
		tp->md5sig_info = NULL;
	}
#endif

	/* Clean up a referenced TCP bind bucket.
	 */
	if (inet_csk(sk)->icsk_bind_hash)
		inet_put_port(sk);

	BUG_ON(tp->fastopen_rsk);

	/* If socket is aborted during connect operation */
	tcp_free_fastopen_req(tp);
	tcp_fastopen_destroy_cipher(sk);
	tcp_saved_syn_free(tp);

	sk_sockets_allocated_dec(sk);
}
EXPORT_SYMBOL(tcp_v4_destroy_sock);

#ifdef CONFIG_PROC_FS
/* Proc filesystem TCP sock list dumping. */

/*
 * Get next listener socket follow cur.  If cur is NULL, get first socket
 * starting from bucket given in st->bucket; when st->bucket is zero the
 * very first socket in the hash table is returned.
 */
static void *listening_get_next(struct seq_file *seq, void *cur)
{
	struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);
	struct inet_listen_hashbucket *ilb;
	struct sock *sk = cur;

	if (!sk) {
get_head:
		/* Start of a bucket: take its lock, held until the
		 * bucket is exhausted (released below or in tcp_seq_stop).
		 */
		ilb = &tcp_hashinfo.listening_hash[st->bucket];
		spin_lock(&ilb->lock);
		sk = sk_head(&ilb->head);
		st->offset = 0;
		goto get_sk;
	}
	ilb = &tcp_hashinfo.listening_hash[st->bucket];
	++st->num;
	++st->offset;

	sk = sk_next(sk);
get_sk:
	sk_for_each_from(sk) {
		if (!net_eq(sock_net(sk), net))
			continue;
		if (sk->sk_family == afinfo->family)
			return sk;
	}
	spin_unlock(&ilb->lock);
	st->offset = 0;
	if (++st->bucket < INET_LHTABLE_SIZE)
		goto get_head;
	return NULL;
}

/* Return the listener at position *pos (counting across all buckets),
 * decrementing *pos as entries are skipped.
 */
static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	st->bucket = 0;
	st->offset = 0;
	rc = listening_get_next(seq, NULL);

	while (rc && *pos) {
		rc = listening_get_next(seq, rc);
		--*pos;
	}
	return rc;
}

/* True when the current ehash bucket holds no sockets. */
static inline bool empty_bucket(const struct tcp_iter_state *st)
{
	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
}

/*
 * Get first established socket starting from bucket given in st->bucket.
 * If st->bucket is zero, the very first socket in the hash is returned.
 */
static void *established_get_first(struct seq_file *seq)
{
	struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);
	void *rc = NULL;

	st->offset = 0;
	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
		struct sock *sk;
		struct hlist_nulls_node *node;
		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);

		/* Lockless fast path for the common case of empty buckets */
		if (empty_bucket(st))
			continue;

		/* Bucket lock stays held when an entry is returned;
		 * released in established_get_next()/tcp_seq_stop().
		 */
		spin_lock_bh(lock);
		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
			if (sk->sk_family != afinfo->family ||
			    !net_eq(sock_net(sk), net)) {
				continue;
			}
			rc = sk;
			goto out;
		}
		spin_unlock_bh(lock);
	}
out:
	return rc;
}

/* Advance to the next established/time-wait socket after @cur,
 * moving on to the next bucket when the current one is exhausted.
 */
static void *established_get_next(struct seq_file *seq, void *cur)
{
	struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
	struct sock *sk = cur;
	struct hlist_nulls_node *node;
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);

	++st->num;
	++st->offset;

	sk = sk_nulls_next(sk);

	sk_nulls_for_each_from(sk, node) {
		if (sk->sk_family == afinfo->family &&
		    net_eq(sock_net(sk), net))
			return sk;
	}

	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
	++st->bucket;
	return established_get_first(seq);
}

/* Return the established socket at position @pos within the ehash. */
static void *established_get_idx(struct seq_file *seq, loff_t pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	st->bucket = 0;
	rc =
	     established_get_first(seq);

	while (rc && pos) {
		rc = established_get_next(seq, rc);
		--pos;
	}
	return rc;
}

/* Return the socket at overall position @pos: listeners first, then
 * established sockets.
 */
static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
{
	void *rc;
	struct tcp_iter_state *st = seq->private;

	st->state = TCP_SEQ_STATE_LISTENING;
	rc	  = listening_get_idx(seq, &pos);

	if (!rc) {
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		rc	  = established_get_idx(seq, pos);
	}

	return rc;
}

/* Resume iteration at the bucket/offset remembered from the previous
 * read, avoiding an O(n) rescan from the start.  st->num is preserved.
 */
static void *tcp_seek_last_pos(struct seq_file *seq)
{
	struct tcp_iter_state *st = seq->private;
	int offset = st->offset;
	int orig_num = st->num;
	void *rc = NULL;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		if (st->bucket >= INET_LHTABLE_SIZE)
			break;
		st->state = TCP_SEQ_STATE_LISTENING;
		rc = listening_get_next(seq, NULL);
		while (offset-- && rc)
			rc = listening_get_next(seq, rc);
		if (rc)
			break;
		st->bucket = 0;
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		/* Fallthrough */
	case TCP_SEQ_STATE_ESTABLISHED:
		if (st->bucket > tcp_hashinfo.ehash_mask)
			break;
		rc = established_get_first(seq);
		while (offset-- && rc)
			rc = established_get_next(seq, rc);
	}

	st->num = orig_num;

	return rc;
}

/* seq_file ->start: fast-resume when *pos matches the saved position,
 * otherwise walk from the beginning.
 */
void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	if (*pos && *pos == st->last_pos) {
		rc = tcp_seek_last_pos(seq);
		if (rc)
			goto out;
	}

	st->state = TCP_SEQ_STATE_LISTENING;
	st->num = 0;
	st->bucket = 0;
	st->offset = 0;
	rc = *pos ?
	     tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;

out:
	st->last_pos = *pos;
	return rc;
}
EXPORT_SYMBOL(tcp_seq_start);

/* seq_file ->next: advance within the current table, falling through
 * from listeners to the established hash when listeners are done.
 */
void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc = NULL;

	if (v == SEQ_START_TOKEN) {
		rc = tcp_get_idx(seq, 0);
		goto out;
	}

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		rc = listening_get_next(seq, v);
		if (!rc) {
			st->state = TCP_SEQ_STATE_ESTABLISHED;
			st->bucket = 0;
			st->offset = 0;
			rc	  = established_get_first(seq);
		}
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		rc = established_get_next(seq, v);
		break;
	}
out:
	++*pos;
	st->last_pos = *pos;
	return rc;
}
EXPORT_SYMBOL(tcp_seq_next);

/* seq_file ->stop: drop whichever bucket lock the iterator still holds. */
void tcp_seq_stop(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		if (v != SEQ_START_TOKEN)
			spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		if (v)
			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
		break;
	}
}
EXPORT_SYMBOL(tcp_seq_stop);

/* Format one SYN_RECV request sock as a /proc/net/tcp line. */
static void get_openreq4(const struct request_sock *req,
			 struct seq_file *f, int i)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	long delta = req->rsk_timer.expires - jiffies;

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
		i,
		ireq->ir_loc_addr,
		ireq->ir_num,
		ireq->ir_rmt_addr,
		ntohs(ireq->ir_rmt_port),
		TCP_SYN_RECV,
		0, 0, /* could print option size, but that is af dependent.
		       */
		1,    /* timers active (only the expire timer) */
		jiffies_delta_to_clock_t(delta),
		req->num_timeout,
		from_kuid_munged(seq_user_ns(f),
				 sock_i_uid(req->rsk_listener)),
		0,  /* non standard timer */
		0, /* open_requests have no inode */
		0,
		req);
}

/* Emit one /proc/net/tcp row for a full socket (listening or
 * established).  The "tr" column encodes which timer is pending:
 * 1 = retransmit-class timers, 4 = window probe (ICSK_TIME_PROBE0),
 * 2 = sk_timer, 0 = none.
 */
static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
{
	int timer_active;
	unsigned long timer_expires;
	const struct tcp_sock *tp = tcp_sk(sk);
	const struct inet_connection_sock *icsk = inet_csk(sk);
	const struct inet_sock *inet = inet_sk(sk);
	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
	__be32 dest = inet->inet_daddr;
	__be32 src = inet->inet_rcv_saddr;
	__u16 destp = ntohs(inet->inet_dport);
	__u16 srcp = ntohs(inet->inet_sport);
	int rx_queue;
	int state;

	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
		timer_active	= 1;
		timer_expires	= icsk->icsk_timeout;
	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
		timer_active	= 4;
		timer_expires	= icsk->icsk_timeout;
	} else if (timer_pending(&sk->sk_timer)) {
		timer_active	= 2;
		timer_expires	= sk->sk_timer.expires;
	} else {
		timer_active	= 0;
		timer_expires	= jiffies;
	}

	state = inet_sk_state_load(sk);
	if (state == TCP_LISTEN)
		rx_queue = sk->sk_ack_backlog;
	else
		/* Because we don't lock the socket,
		 * we might find a transient negative value.
		 */
		rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
		i, src, srcp, dest, destp, state,
		tp->write_seq - tp->snd_una,
		rx_queue,
		timer_active,
		jiffies_delta_to_clock_t(timer_expires - jiffies),
		icsk->icsk_retransmits,
		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
		icsk->icsk_probes_out,
		sock_i_ino(sk),
		refcount_read(&sk->sk_refcnt), sk,
		jiffies_to_clock_t(icsk->icsk_rto),
		jiffies_to_clock_t(icsk->icsk_ack.ato),
		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
		tp->snd_cwnd,
		state == TCP_LISTEN ?
		    fastopenq->max_qlen :
		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
}

/* Emit one /proc/net/tcp row for a TIME_WAIT mini-socket; most columns
 * are hard-coded zeros because a timewait sock keeps no queues/timers
 * beyond its expiry timer.
 */
static void get_timewait4_sock(const struct inet_timewait_sock *tw,
			       struct seq_file *f, int i)
{
	long delta = tw->tw_timer.expires - jiffies;
	__be32 dest, src;
	__u16 destp, srcp;

	dest  = tw->tw_daddr;
	src   = tw->tw_rcv_saddr;
	destp = ntohs(tw->tw_dport);
	srcp  = ntohs(tw->tw_sport);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
		refcount_read(&tw->tw_refcnt), tw);
}

/* Fixed /proc/net/tcp row width; every row is padded to TMPSZ - 1. */
#define TMPSZ 150

/* seq_file ->show(): print the header line for the start token,
 * otherwise dispatch on socket state to the matching row formatter.
 */
static int tcp4_seq_show(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st;
	struct sock *sk = v;

	seq_setwidth(seq, TMPSZ - 1);
	if (v == SEQ_START_TOKEN) {
		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
			   "rx_queue tr tm->when retrnsmt   uid  timeout "
			   "inode");
		goto out;
	}
	st = seq->private;

	if (sk->sk_state == TCP_TIME_WAIT)
		get_timewait4_sock(v, seq, st->num);
	else if (sk->sk_state == TCP_NEW_SYN_RECV)
		get_openreq4(v, seq,
			     st->num);
	else
		get_tcp4_sock(v, seq, st->num);
out:
	seq_pad(seq, '\n');
	return 0;
}

/* seq_file operations backing /proc/net/tcp. */
static const struct seq_operations tcp4_seq_ops = {
	.show		= tcp4_seq_show,
	.start		= tcp_seq_start,
	.next		= tcp_seq_next,
	.stop		= tcp_seq_stop,
};

static struct tcp_seq_afinfo tcp4_seq_afinfo = {
	.family		= AF_INET,
};

/* Create the per-netns /proc/net/tcp entry (read-only, 0444). */
static int __net_init tcp4_proc_init_net(struct net *net)
{
	if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
			sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
		return -ENOMEM;
	return 0;
}

/* Tear down the per-netns /proc/net/tcp entry. */
static void __net_exit tcp4_proc_exit_net(struct net *net)
{
	remove_proc_entry("tcp", net->proc_net);
}

static struct pernet_operations tcp4_net_ops = {
	.init = tcp4_proc_init_net,
	.exit = tcp4_proc_exit_net,
};

int __init tcp4_proc_init(void)
{
	return register_pernet_subsys(&tcp4_net_ops);
}

void tcp4_proc_exit(void)
{
	unregister_pernet_subsys(&tcp4_net_ops);
}
#endif /* CONFIG_PROC_FS */

/* IPv4 TCP protocol hooks registered with the socket layer. */
struct proto tcp_prot = {
	.name			= "TCP",
	.owner			= THIS_MODULE,
	.close			= tcp_close,
	.pre_connect		= tcp_v4_pre_connect,
	.connect		= tcp_v4_connect,
	.disconnect		= tcp_disconnect,
	.accept			= inet_csk_accept,
	.ioctl			= tcp_ioctl,
	.init			= tcp_v4_init_sock,
	.destroy		= tcp_v4_destroy_sock,
	.shutdown		= tcp_shutdown,
	.setsockopt		= tcp_setsockopt,
	.getsockopt		= tcp_getsockopt,
	.keepalive		= tcp_set_keepalive,
	.recvmsg		= tcp_recvmsg,
	.sendmsg		= tcp_sendmsg,
	.sendpage		= tcp_sendpage,
	.backlog_rcv		= tcp_v4_do_rcv,
	.release_cb		= tcp_release_cb,
	.hash			= inet_hash,
	.unhash			= inet_unhash,
	.get_port		= inet_csk_get_port,
	.enter_memory_pressure	= tcp_enter_memory_pressure,
	.leave_memory_pressure	= tcp_leave_memory_pressure,
	.stream_memory_free	=
tcp_stream_memory_free, 2453 .sockets_allocated = &tcp_sockets_allocated, 2454 .orphan_count = &tcp_orphan_count, 2455 .memory_allocated = &tcp_memory_allocated, 2456 .memory_pressure = &tcp_memory_pressure, 2457 .sysctl_mem = sysctl_tcp_mem, 2458 .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem), 2459 .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), 2460 .max_header = MAX_TCP_HEADER, 2461 .obj_size = sizeof(struct tcp_sock), 2462 .slab_flags = SLAB_TYPESAFE_BY_RCU, 2463 .twsk_prot = &tcp_timewait_sock_ops, 2464 .rsk_prot = &tcp_request_sock_ops, 2465 .h.hashinfo = &tcp_hashinfo, 2466 .no_autobind = true, 2467 #ifdef CONFIG_COMPAT 2468 .compat_setsockopt = compat_tcp_setsockopt, 2469 .compat_getsockopt = compat_tcp_getsockopt, 2470 #endif 2471 .diag_destroy = tcp_abort, 2472 }; 2473 EXPORT_SYMBOL(tcp_prot); 2474 2475 static void __net_exit tcp_sk_exit(struct net *net) 2476 { 2477 int cpu; 2478 2479 module_put(net->ipv4.tcp_congestion_control->owner); 2480 2481 for_each_possible_cpu(cpu) 2482 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu)); 2483 free_percpu(net->ipv4.tcp_sk); 2484 } 2485 2486 static int __net_init tcp_sk_init(struct net *net) 2487 { 2488 int res, cpu, cnt; 2489 2490 net->ipv4.tcp_sk = alloc_percpu(struct sock *); 2491 if (!net->ipv4.tcp_sk) 2492 return -ENOMEM; 2493 2494 for_each_possible_cpu(cpu) { 2495 struct sock *sk; 2496 2497 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW, 2498 IPPROTO_TCP, net); 2499 if (res) 2500 goto fail; 2501 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); 2502 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk; 2503 } 2504 2505 net->ipv4.sysctl_tcp_ecn = 2; 2506 net->ipv4.sysctl_tcp_ecn_fallback = 1; 2507 2508 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS; 2509 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD; 2510 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL; 2511 2512 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME; 2513 net->ipv4.sysctl_tcp_keepalive_probes = 
		TCP_KEEPALIVE_PROBES;
	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;

	/* Connection establishment / retry defaults. */
	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
	net->ipv4.sysctl_tcp_syncookies = 1;
	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
	net->ipv4.sysctl_tcp_orphan_retries = 0;
	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
	net->ipv4.sysctl_tcp_tw_reuse = 2;

	/* Scale TIME_WAIT and SYN backlog limits with the size of the
	 * global established hash table.
	 */
	cnt = tcp_hashinfo.ehash_mask + 1;
	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = (cnt + 1) / 2;
	net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;

	net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 256);
	net->ipv4.sysctl_tcp_sack = 1;
	net->ipv4.sysctl_tcp_window_scaling = 1;
	net->ipv4.sysctl_tcp_timestamps = 1;
	net->ipv4.sysctl_tcp_early_retrans = 3;
	net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
	net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
	net->ipv4.sysctl_tcp_retrans_collapse = 1;
	net->ipv4.sysctl_tcp_max_reordering = 300;
	net->ipv4.sysctl_tcp_dsack = 1;
	net->ipv4.sysctl_tcp_app_win = 31;
	net->ipv4.sysctl_tcp_adv_win_scale = 1;
	net->ipv4.sysctl_tcp_frto = 2;
	net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
	/* This limits the percentage of the congestion window which we
	 * will allow a single TSO frame to consume.  Building TSO frames
	 * which are too large can cause TCP streams to be bursty.
	 */
	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
	/* Default TSQ limit of four TSO segments */
	net->ipv4.sysctl_tcp_limit_output_bytes = 262144;
	/* rfc5961 challenge ack rate limiting */
	net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
	net->ipv4.sysctl_tcp_min_tso_segs = 2;
	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
	net->ipv4.sysctl_tcp_autocorking = 1;
	net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
	net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
	net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
	/* Child netns inherit init_net's rmem/wmem limits. */
	if (net != &init_net) {
		memcpy(net->ipv4.sysctl_tcp_rmem,
		       init_net.ipv4.sysctl_tcp_rmem,
		       sizeof(init_net.ipv4.sysctl_tcp_rmem));
		memcpy(net->ipv4.sysctl_tcp_wmem,
		       init_net.ipv4.sysctl_tcp_wmem,
		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
	}
	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
	spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
	atomic_set(&net->ipv4.tfo_active_disable_times, 0);

	/* Reno is always built in */
	if (!net_eq(net, &init_net) &&
	    try_module_get(init_net.ipv4.tcp_congestion_control->owner))
		net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
	else
		net->ipv4.tcp_congestion_control = &tcp_reno;

	return 0;
fail:
	tcp_sk_exit(net);

	return res;
}

/* Batched netns exit: purge IPv4 TIME_WAIT sockets once for the whole
 * batch, then destroy each netns' fastopen context.
 */
static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
{
	struct net *net;

	inet_twsk_purge(&tcp_hashinfo, AF_INET);

	list_for_each_entry(net, net_exit_list, exit_list)
		tcp_fastopen_ctx_destroy(net);
}

static struct pernet_operations __net_initdata tcp_sk_ops = {
       .init	   = tcp_sk_init,
       .exit	   = tcp_sk_exit,
       .exit_batch = tcp_sk_exit_batch,
};

void __init
/* Boot-time entry point: register the TCP pernet operations; a failure
 * here is fatal, hence the panic().
 */
tcp_v4_init(void)
{
	if (register_pernet_subsys(&tcp_sk_ops))
		panic("Failed to create the TCP control socket.\n");
}