1 /* 2 * INET An implementation of the TCP/IP protocol suite for the LINUX 3 * operating system. INET is implemented using the BSD Socket 4 * interface as the means of communication with the user level. 5 * 6 * Implementation of the Transmission Control Protocol(TCP). 7 * 8 * IPv4 specific functions 9 * 10 * 11 * code split from: 12 * linux/ipv4/tcp.c 13 * linux/ipv4/tcp_input.c 14 * linux/ipv4/tcp_output.c 15 * 16 * See tcp.c for author information 17 * 18 * This program is free software; you can redistribute it and/or 19 * modify it under the terms of the GNU General Public License 20 * as published by the Free Software Foundation; either version 21 * 2 of the License, or (at your option) any later version. 22 */ 23 24 /* 25 * Changes: 26 * David S. Miller : New socket lookup architecture. 27 * This code is dedicated to John Dyson. 28 * David S. Miller : Change semantics of established hash, 29 * half is devoted to TIME_WAIT sockets 30 * and the rest go in the other half. 31 * Andi Kleen : Add support for syncookies and fixed 32 * some bugs: ip options weren't passed to 33 * the TCP layer, missed a check for an 34 * ACK bit. 35 * Andi Kleen : Implemented fast path mtu discovery. 36 * Fixed many serious bugs in the 37 * request_sock handling and moved 38 * most of it into the af independent code. 39 * Added tail drop and some other bugfixes. 40 * Added new listen semantics. 41 * Mike McLagan : Routing by source 42 * Juan Jose Ciarlante: ip_dynaddr bits 43 * Andi Kleen: various fixes. 44 * Vitaly E. Lavrov : Transparent proxy revived after year 45 * coma. 46 * Andi Kleen : Fix new listen. 47 * Andi Kleen : Fix accept error reporting. 48 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which 49 * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind 50 * a single port at the same time. 51 */ 52 53 #define pr_fmt(fmt) "TCP: " fmt 54 55 #include <linux/bottom_half.h> 56 #include <linux/types.h> 57 #include <linux/fcntl.h> 58 #include <linux/module.h> 59 #include <linux/random.h> 60 #include <linux/cache.h> 61 #include <linux/jhash.h> 62 #include <linux/init.h> 63 #include <linux/times.h> 64 #include <linux/slab.h> 65 66 #include <net/net_namespace.h> 67 #include <net/icmp.h> 68 #include <net/inet_hashtables.h> 69 #include <net/tcp.h> 70 #include <net/transp_v6.h> 71 #include <net/ipv6.h> 72 #include <net/inet_common.h> 73 #include <net/timewait_sock.h> 74 #include <net/xfrm.h> 75 #include <net/secure_seq.h> 76 #include <net/busy_poll.h> 77 78 #include <linux/inet.h> 79 #include <linux/ipv6.h> 80 #include <linux/stddef.h> 81 #include <linux/proc_fs.h> 82 #include <linux/seq_file.h> 83 #include <linux/inetdevice.h> 84 85 #include <crypto/hash.h> 86 #include <linux/scatterlist.h> 87 88 #include <trace/events/tcp.h> 89 90 #ifdef CONFIG_TCP_MD5SIG 91 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, 92 __be32 daddr, __be32 saddr, const struct tcphdr *th); 93 #endif 94 95 struct inet_hashinfo tcp_hashinfo; 96 EXPORT_SYMBOL(tcp_hashinfo); 97 98 static u32 tcp_v4_init_seq(const struct sk_buff *skb) 99 { 100 return secure_tcp_seq(ip_hdr(skb)->daddr, 101 ip_hdr(skb)->saddr, 102 tcp_hdr(skb)->dest, 103 tcp_hdr(skb)->source); 104 } 105 106 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb) 107 { 108 return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr); 109 } 110 111 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) 112 { 113 const struct inet_timewait_sock *tw = inet_twsk(sktw); 114 const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw); 115 struct tcp_sock *tp = tcp_sk(sk); 116 int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse; 117 118 if (reuse == 2) { 119 /* Still does not detect *everything* that goes through 120 * lo, since we require a loopback src or dst address 121 * or direct binding to 'lo' interface. 122 */ 123 bool loopback = false; 124 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX) 125 loopback = true; 126 #if IS_ENABLED(CONFIG_IPV6) 127 if (tw->tw_family == AF_INET6) { 128 if (ipv6_addr_loopback(&tw->tw_v6_daddr) || 129 (ipv6_addr_v4mapped(&tw->tw_v6_daddr) && 130 (tw->tw_v6_daddr.s6_addr[12] == 127)) || 131 ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) || 132 (ipv6_addr_v4mapped(&tw->tw_v6_rcv_saddr) && 133 (tw->tw_v6_rcv_saddr.s6_addr[12] == 127))) 134 loopback = true; 135 } else 136 #endif 137 { 138 if (ipv4_is_loopback(tw->tw_daddr) || 139 ipv4_is_loopback(tw->tw_rcv_saddr)) 140 loopback = true; 141 } 142 if (!loopback) 143 reuse = 0; 144 } 145 146 /* With PAWS, it is safe from the viewpoint 147 of data integrity. Even without PAWS it is safe provided sequence 148 spaces do not overlap i.e. at data rates <= 80Mbit/sec. 149 150 Actually, the idea is close to VJ's one, only timestamp cache is 151 held not per host, but per port pair and TW bucket is used as state 152 holder. 153 154 If TW bucket has been already destroyed we fall back to VJ's scheme 155 and use initial timestamp retrieved from peer table. 156 */ 157 if (tcptw->tw_ts_recent_stamp && 158 (!twp || (reuse && time_after32(ktime_get_seconds(), 159 tcptw->tw_ts_recent_stamp)))) { 160 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2; 161 if (tp->write_seq == 0) 162 tp->write_seq = 1; 163 tp->rx_opt.ts_recent = tcptw->tw_ts_recent; 164 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp; 165 sock_hold(sktw); 166 return 1; 167 } 168 169 return 0; 170 } 171 EXPORT_SYMBOL_GPL(tcp_twsk_unique); 172 173 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr, 174 int addr_len) 175 { 176 /* This check is replicated from tcp_v4_connect() and intended to 177 * prevent BPF program called below from accessing bytes that are out 178 * of the bound specified by user in addr_len. 179 */ 180 if (addr_len < sizeof(struct sockaddr_in)) 181 return -EINVAL; 182 183 sock_owned_by_me(sk); 184 185 return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr); 186 } 187 188 /* This will initiate an outgoing connection. */ 189 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) 190 { 191 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr; 192 struct inet_sock *inet = inet_sk(sk); 193 struct tcp_sock *tp = tcp_sk(sk); 194 __be16 orig_sport, orig_dport; 195 __be32 daddr, nexthop; 196 struct flowi4 *fl4; 197 struct rtable *rt; 198 int err; 199 struct ip_options_rcu *inet_opt; 200 struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row; 201 202 if (addr_len < sizeof(struct sockaddr_in)) 203 return -EINVAL; 204 205 if (usin->sin_family != AF_INET) 206 return -EAFNOSUPPORT; 207 208 nexthop = daddr = usin->sin_addr.s_addr; 209 inet_opt = rcu_dereference_protected(inet->inet_opt, 210 lockdep_sock_is_held(sk)); 211 if (inet_opt && inet_opt->opt.srr) { 212 if (!daddr) 213 return -EINVAL; 214 nexthop = inet_opt->opt.faddr; 215 } 216 217 orig_sport = inet->inet_sport; 218 orig_dport = usin->sin_port; 219 fl4 = &inet->cork.fl.u.ip4; 220 rt = ip_route_connect(fl4, nexthop, inet->inet_saddr, 221 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if, 222 IPPROTO_TCP, 223 orig_sport, orig_dport, sk); 224 if (IS_ERR(rt)) { 225 err = PTR_ERR(rt); 226 if (err == -ENETUNREACH) 227 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES); 228 return err; 229 } 230 231 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) { 232 ip_rt_put(rt); 233 return -ENETUNREACH; 234 } 235 236 if (!inet_opt || !inet_opt->opt.srr) 237 daddr = fl4->daddr; 238 239 if (!inet->inet_saddr) 240 inet->inet_saddr = fl4->saddr; 241 sk_rcv_saddr_set(sk, inet->inet_saddr); 242 243 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) { 244 /* Reset inherited state */ 245 tp->rx_opt.ts_recent = 0; 246 tp->rx_opt.ts_recent_stamp = 0; 247 if (likely(!tp->repair)) 248 tp->write_seq = 0; 249 } 250 251 inet->inet_dport = usin->sin_port; 252 sk_daddr_set(sk, daddr); 253 254 inet_csk(sk)->icsk_ext_hdr_len = 0; 255 if (inet_opt) 256 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen; 257 258 tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT; 259 260 /* Socket identity is still unknown (sport may be zero). 261 * However we set state to SYN-SENT and not releasing socket 262 * lock select source port, enter ourselves into the hash tables and 263 * complete initialization after this. 264 */ 265 tcp_set_state(sk, TCP_SYN_SENT); 266 err = inet_hash_connect(tcp_death_row, sk); 267 if (err) 268 goto failure; 269 270 sk_set_txhash(sk); 271 272 rt = ip_route_newports(fl4, rt, orig_sport, orig_dport, 273 inet->inet_sport, inet->inet_dport, sk); 274 if (IS_ERR(rt)) { 275 err = PTR_ERR(rt); 276 rt = NULL; 277 goto failure; 278 } 279 /* OK, now commit destination to socket. */ 280 sk->sk_gso_type = SKB_GSO_TCPV4; 281 sk_setup_caps(sk, &rt->dst); 282 rt = NULL; 283 284 if (likely(!tp->repair)) { 285 if (!tp->write_seq) 286 tp->write_seq = secure_tcp_seq(inet->inet_saddr, 287 inet->inet_daddr, 288 inet->inet_sport, 289 usin->sin_port); 290 tp->tsoffset = secure_tcp_ts_off(sock_net(sk), 291 inet->inet_saddr, 292 inet->inet_daddr); 293 } 294 295 inet->inet_id = tp->write_seq ^ jiffies; 296 297 if (tcp_fastopen_defer_connect(sk, &err)) 298 return err; 299 if (err) 300 goto failure; 301 302 err = tcp_connect(sk); 303 304 if (err) 305 goto failure; 306 307 return 0; 308 309 failure: 310 /* 311 * This unhashes the socket and releases the local port, 312 * if necessary. 313 */ 314 tcp_set_state(sk, TCP_CLOSE); 315 ip_rt_put(rt); 316 sk->sk_route_caps = 0; 317 inet->inet_dport = 0; 318 return err; 319 } 320 EXPORT_SYMBOL(tcp_v4_connect); 321 322 /* 323 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191. 324 * It can be called through tcp_release_cb() if socket was owned by user 325 * at the time tcp_v4_err() was called to handle ICMP message. 326 */ 327 void tcp_v4_mtu_reduced(struct sock *sk) 328 { 329 struct inet_sock *inet = inet_sk(sk); 330 struct dst_entry *dst; 331 u32 mtu; 332 333 if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) 334 return; 335 mtu = tcp_sk(sk)->mtu_info; 336 dst = inet_csk_update_pmtu(sk, mtu); 337 if (!dst) 338 return; 339 340 /* Something is about to be wrong... Remember soft error 341 * for the case, if this connection will not able to recover. 342 */ 343 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst)) 344 sk->sk_err_soft = EMSGSIZE; 345 346 mtu = dst_mtu(dst); 347 348 if (inet->pmtudisc != IP_PMTUDISC_DONT && 349 ip_sk_accept_pmtu(sk) && 350 inet_csk(sk)->icsk_pmtu_cookie > mtu) { 351 tcp_sync_mss(sk, mtu); 352 353 /* Resend the TCP packet because it's 354 * clear that the old packet has been 355 * dropped. This is the new "fast" path mtu 356 * discovery. 357 */ 358 tcp_simple_retransmit(sk); 359 } /* else let the usual retransmit timer handle it */ 360 } 361 EXPORT_SYMBOL(tcp_v4_mtu_reduced); 362 363 static void do_redirect(struct sk_buff *skb, struct sock *sk) 364 { 365 struct dst_entry *dst = __sk_dst_check(sk, 0); 366 367 if (dst) 368 dst->ops->redirect(dst, sk, skb); 369 } 370 371 372 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */ 373 void tcp_req_err(struct sock *sk, u32 seq, bool abort) 374 { 375 struct request_sock *req = inet_reqsk(sk); 376 struct net *net = sock_net(sk); 377 378 /* ICMPs are not backlogged, hence we cannot get 379 * an established socket here. 380 */ 381 if (seq != tcp_rsk(req)->snt_isn) { 382 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS); 383 } else if (abort) { 384 /* 385 * Still in SYN_RECV, just remove it silently. 386 * There is no good way to pass the error to the newly 387 * created socket, and POSIX does not want network 388 * errors returned from accept(). 389 */ 390 inet_csk_reqsk_queue_drop(req->rsk_listener, req); 391 tcp_listendrop(req->rsk_listener); 392 } 393 reqsk_put(req); 394 } 395 EXPORT_SYMBOL(tcp_req_err); 396 397 /* 398 * This routine is called by the ICMP module when it gets some 399 * sort of error condition. If err < 0 then the socket should 400 * be closed and the error returned to the user. If err > 0 401 * it's just the icmp type << 8 | icmp code. After adjustment 402 * header points to the first 8 bytes of the tcp header. We need 403 * to find the appropriate port. 404 * 405 * The locking strategy used here is very "optimistic". When 406 * someone else accesses the socket the ICMP is just dropped 407 * and for some paths there is no check at all. 408 * A more general error queue to queue errors for later handling 409 * is probably better. 410 * 411 */ 412 413 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) 414 { 415 const struct iphdr *iph = (const struct iphdr *)icmp_skb->data; 416 struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2)); 417 struct inet_connection_sock *icsk; 418 struct tcp_sock *tp; 419 struct inet_sock *inet; 420 const int type = icmp_hdr(icmp_skb)->type; 421 const int code = icmp_hdr(icmp_skb)->code; 422 struct sock *sk; 423 struct sk_buff *skb; 424 struct request_sock *fastopen; 425 u32 seq, snd_una; 426 s32 remaining; 427 u32 delta_us; 428 int err; 429 struct net *net = dev_net(icmp_skb->dev); 430 431 sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr, 432 th->dest, iph->saddr, ntohs(th->source), 433 inet_iif(icmp_skb), 0); 434 if (!sk) { 435 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); 436 return; 437 } 438 if (sk->sk_state == TCP_TIME_WAIT) { 439 inet_twsk_put(inet_twsk(sk)); 440 return; 441 } 442 seq = ntohl(th->seq); 443 if (sk->sk_state == TCP_NEW_SYN_RECV) 444 return tcp_req_err(sk, seq, 445 type == ICMP_PARAMETERPROB || 446 type == ICMP_TIME_EXCEEDED || 447 (type == ICMP_DEST_UNREACH && 448 (code == ICMP_NET_UNREACH || 449 code == ICMP_HOST_UNREACH))); 450 451 bh_lock_sock(sk); 452 /* If too many ICMPs get dropped on busy 453 * servers this needs to be solved differently. 454 * We do take care of PMTU discovery (RFC1191) special case : 455 * we can receive locally generated ICMP messages while socket is held. 456 */ 457 if (sock_owned_by_user(sk)) { 458 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)) 459 __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS); 460 } 461 if (sk->sk_state == TCP_CLOSE) 462 goto out; 463 464 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) { 465 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); 466 goto out; 467 } 468 469 icsk = inet_csk(sk); 470 tp = tcp_sk(sk); 471 /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */ 472 fastopen = tp->fastopen_rsk; 473 snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una; 474 if (sk->sk_state != TCP_LISTEN && 475 !between(seq, snd_una, tp->snd_nxt)) { 476 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS); 477 goto out; 478 } 479 480 switch (type) { 481 case ICMP_REDIRECT: 482 if (!sock_owned_by_user(sk)) 483 do_redirect(icmp_skb, sk); 484 goto out; 485 case ICMP_SOURCE_QUENCH: 486 /* Just silently ignore these. */ 487 goto out; 488 case ICMP_PARAMETERPROB: 489 err = EPROTO; 490 break; 491 case ICMP_DEST_UNREACH: 492 if (code > NR_ICMP_UNREACH) 493 goto out; 494 495 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */ 496 /* We are not interested in TCP_LISTEN and open_requests 497 * (SYN-ACKs send out by Linux are always <576bytes so 498 * they should go through unfragmented). 499 */ 500 if (sk->sk_state == TCP_LISTEN) 501 goto out; 502 503 tp->mtu_info = info; 504 if (!sock_owned_by_user(sk)) { 505 tcp_v4_mtu_reduced(sk); 506 } else { 507 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags)) 508 sock_hold(sk); 509 } 510 goto out; 511 } 512 513 err = icmp_err_convert[code].errno; 514 /* check if icmp_skb allows revert of backoff 515 * (see draft-zimmermann-tcp-lcd) */ 516 if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH) 517 break; 518 if (seq != tp->snd_una || !icsk->icsk_retransmits || 519 !icsk->icsk_backoff || fastopen) 520 break; 521 522 if (sock_owned_by_user(sk)) 523 break; 524 525 icsk->icsk_backoff--; 526 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : 527 TCP_TIMEOUT_INIT; 528 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX); 529 530 skb = tcp_rtx_queue_head(sk); 531 BUG_ON(!skb); 532 533 tcp_mstamp_refresh(tp); 534 delta_us = (u32)(tp->tcp_mstamp - skb->skb_mstamp); 535 remaining = icsk->icsk_rto - 536 usecs_to_jiffies(delta_us); 537 538 if (remaining > 0) { 539 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, 540 remaining, TCP_RTO_MAX); 541 } else { 542 /* RTO revert clocked out retransmission. 543 * Will retransmit now */ 544 tcp_retransmit_timer(sk); 545 } 546 547 break; 548 case ICMP_TIME_EXCEEDED: 549 err = EHOSTUNREACH; 550 break; 551 default: 552 goto out; 553 } 554 555 switch (sk->sk_state) { 556 case TCP_SYN_SENT: 557 case TCP_SYN_RECV: 558 /* Only in fast or simultaneous open. If a fast open socket is 559 * is already accepted it is treated as a connected one below. 560 */ 561 if (fastopen && !fastopen->sk) 562 break; 563 564 if (!sock_owned_by_user(sk)) { 565 sk->sk_err = err; 566 567 sk->sk_error_report(sk); 568 569 tcp_done(sk); 570 } else { 571 sk->sk_err_soft = err; 572 } 573 goto out; 574 } 575 576 /* If we've already connected we will keep trying 577 * until we time out, or the user gives up. 578 * 579 * rfc1122 4.2.3.9 allows to consider as hard errors 580 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too, 581 * but it is obsoleted by pmtu discovery). 582 * 583 * Note, that in modern internet, where routing is unreliable 584 * and in each dark corner broken firewalls sit, sending random 585 * errors ordered by their masters even this two messages finally lose 586 * their original sense (even Linux sends invalid PORT_UNREACHs) 587 * 588 * Now we are in compliance with RFCs. 589 * --ANK (980905) 590 */ 591 592 inet = inet_sk(sk); 593 if (!sock_owned_by_user(sk) && inet->recverr) { 594 sk->sk_err = err; 595 sk->sk_error_report(sk); 596 } else { /* Only an error on timeout */ 597 sk->sk_err_soft = err; 598 } 599 600 out: 601 bh_unlock_sock(sk); 602 sock_put(sk); 603 } 604 605 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr) 606 { 607 struct tcphdr *th = tcp_hdr(skb); 608 609 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0); 610 skb->csum_start = skb_transport_header(skb) - skb->head; 611 skb->csum_offset = offsetof(struct tcphdr, check); 612 } 613 614 /* This routine computes an IPv4 TCP checksum. */ 615 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb) 616 { 617 const struct inet_sock *inet = inet_sk(sk); 618 619 __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr); 620 } 621 EXPORT_SYMBOL(tcp_v4_send_check); 622 623 /* 624 * This routine will send an RST to the other tcp. 625 * 626 * Someone asks: why I NEVER use socket parameters (TOS, TTL etc.) 627 * for reset. 628 * Answer: if a packet caused RST, it is not for a socket 629 * existing in our system, if it is matched to a socket, 630 * it is just duplicate segment or bug in other side's TCP. 631 * So that we build reply only basing on parameters 632 * arrived with segment. 633 * Exception: precedence violation. We do not implement it in any case. 634 */ 635 636 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) 637 { 638 const struct tcphdr *th = tcp_hdr(skb); 639 struct { 640 struct tcphdr th; 641 #ifdef CONFIG_TCP_MD5SIG 642 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)]; 643 #endif 644 } rep; 645 struct ip_reply_arg arg; 646 #ifdef CONFIG_TCP_MD5SIG 647 struct tcp_md5sig_key *key = NULL; 648 const __u8 *hash_location = NULL; 649 unsigned char newhash[16]; 650 int genhash; 651 struct sock *sk1 = NULL; 652 #endif 653 struct net *net; 654 struct sock *ctl_sk; 655 656 /* Never send a reset in response to a reset. */ 657 if (th->rst) 658 return; 659 660 /* If sk not NULL, it means we did a successful lookup and incoming 661 * route had to be correct. prequeue might have dropped our dst. 662 */ 663 if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL) 664 return; 665 666 /* Swap the send and the receive. */ 667 memset(&rep, 0, sizeof(rep)); 668 rep.th.dest = th->source; 669 rep.th.source = th->dest; 670 rep.th.doff = sizeof(struct tcphdr) / 4; 671 rep.th.rst = 1; 672 673 if (th->ack) { 674 rep.th.seq = th->ack_seq; 675 } else { 676 rep.th.ack = 1; 677 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin + 678 skb->len - (th->doff << 2)); 679 } 680 681 memset(&arg, 0, sizeof(arg)); 682 arg.iov[0].iov_base = (unsigned char *)&rep; 683 arg.iov[0].iov_len = sizeof(rep.th); 684 685 net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev); 686 #ifdef CONFIG_TCP_MD5SIG 687 rcu_read_lock(); 688 hash_location = tcp_parse_md5sig_option(th); 689 if (sk && sk_fullsock(sk)) { 690 key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *) 691 &ip_hdr(skb)->saddr, AF_INET); 692 } else if (hash_location) { 693 /* 694 * active side is lost. Try to find listening socket through 695 * source port, and then find md5 key through listening socket. 696 * we are not loose security here: 697 * Incoming packet is checked with md5 hash with finding key, 698 * no RST generated if md5 hash doesn't match. 699 */ 700 sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0, 701 ip_hdr(skb)->saddr, 702 th->source, ip_hdr(skb)->daddr, 703 ntohs(th->source), inet_iif(skb), 704 tcp_v4_sdif(skb)); 705 /* don't send rst if it can't find key */ 706 if (!sk1) 707 goto out; 708 709 key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *) 710 &ip_hdr(skb)->saddr, AF_INET); 711 if (!key) 712 goto out; 713 714 715 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb); 716 if (genhash || memcmp(hash_location, newhash, 16) != 0) 717 goto out; 718 719 } 720 721 if (key) { 722 rep.opt[0] = htonl((TCPOPT_NOP << 24) | 723 (TCPOPT_NOP << 16) | 724 (TCPOPT_MD5SIG << 8) | 725 TCPOLEN_MD5SIG); 726 /* Update length and the length the header thinks exists */ 727 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED; 728 rep.th.doff = arg.iov[0].iov_len / 4; 729 730 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1], 731 key, ip_hdr(skb)->saddr, 732 ip_hdr(skb)->daddr, &rep.th); 733 } 734 #endif 735 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr, 736 ip_hdr(skb)->saddr, /* XXX */ 737 arg.iov[0].iov_len, IPPROTO_TCP, 0); 738 arg.csumoffset = offsetof(struct tcphdr, check) / 2; 739 arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0; 740 741 /* When socket is gone, all binding information is lost. 742 * routing might fail in this case. No choice here, if we choose to force 743 * input interface, we will misroute in case of asymmetric route. 744 */ 745 if (sk) { 746 arg.bound_dev_if = sk->sk_bound_dev_if; 747 if (sk_fullsock(sk)) 748 trace_tcp_send_reset(sk, skb); 749 } 750 751 BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) != 752 offsetof(struct inet_timewait_sock, tw_bound_dev_if)); 753 754 arg.tos = ip_hdr(skb)->tos; 755 arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL); 756 local_bh_disable(); 757 ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk); 758 if (sk) 759 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? 760 inet_twsk(sk)->tw_mark : sk->sk_mark; 761 ip_send_unicast_reply(ctl_sk, 762 skb, &TCP_SKB_CB(skb)->header.h4.opt, 763 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, 764 &arg, arg.iov[0].iov_len); 765 766 ctl_sk->sk_mark = 0; 767 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS); 768 __TCP_INC_STATS(net, TCP_MIB_OUTRSTS); 769 local_bh_enable(); 770 771 #ifdef CONFIG_TCP_MD5SIG 772 out: 773 rcu_read_unlock(); 774 #endif 775 } 776 777 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states 778 outside socket context is ugly, certainly. What can I do? 779 */ 780 781 static void tcp_v4_send_ack(const struct sock *sk, 782 struct sk_buff *skb, u32 seq, u32 ack, 783 u32 win, u32 tsval, u32 tsecr, int oif, 784 struct tcp_md5sig_key *key, 785 int reply_flags, u8 tos) 786 { 787 const struct tcphdr *th = tcp_hdr(skb); 788 struct { 789 struct tcphdr th; 790 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2) 791 #ifdef CONFIG_TCP_MD5SIG 792 + (TCPOLEN_MD5SIG_ALIGNED >> 2) 793 #endif 794 ]; 795 } rep; 796 struct net *net = sock_net(sk); 797 struct ip_reply_arg arg; 798 struct sock *ctl_sk; 799 800 memset(&rep.th, 0, sizeof(struct tcphdr)); 801 memset(&arg, 0, sizeof(arg)); 802 803 arg.iov[0].iov_base = (unsigned char *)&rep; 804 arg.iov[0].iov_len = sizeof(rep.th); 805 if (tsecr) { 806 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | 807 (TCPOPT_TIMESTAMP << 8) | 808 TCPOLEN_TIMESTAMP); 809 rep.opt[1] = htonl(tsval); 810 rep.opt[2] = htonl(tsecr); 811 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED; 812 } 813 814 /* Swap the send and the receive. */ 815 rep.th.dest = th->source; 816 rep.th.source = th->dest; 817 rep.th.doff = arg.iov[0].iov_len / 4; 818 rep.th.seq = htonl(seq); 819 rep.th.ack_seq = htonl(ack); 820 rep.th.ack = 1; 821 rep.th.window = htons(win); 822 823 #ifdef CONFIG_TCP_MD5SIG 824 if (key) { 825 int offset = (tsecr) ? 3 : 0; 826 827 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) | 828 (TCPOPT_NOP << 16) | 829 (TCPOPT_MD5SIG << 8) | 830 TCPOLEN_MD5SIG); 831 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED; 832 rep.th.doff = arg.iov[0].iov_len/4; 833 834 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset], 835 key, ip_hdr(skb)->saddr, 836 ip_hdr(skb)->daddr, &rep.th); 837 } 838 #endif 839 arg.flags = reply_flags; 840 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr, 841 ip_hdr(skb)->saddr, /* XXX */ 842 arg.iov[0].iov_len, IPPROTO_TCP, 0); 843 arg.csumoffset = offsetof(struct tcphdr, check) / 2; 844 if (oif) 845 arg.bound_dev_if = oif; 846 arg.tos = tos; 847 arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL); 848 local_bh_disable(); 849 ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk); 850 if (sk) 851 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? 852 inet_twsk(sk)->tw_mark : sk->sk_mark; 853 ip_send_unicast_reply(ctl_sk, 854 skb, &TCP_SKB_CB(skb)->header.h4.opt, 855 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, 856 &arg, arg.iov[0].iov_len); 857 858 ctl_sk->sk_mark = 0; 859 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS); 860 local_bh_enable(); 861 } 862 863 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) 864 { 865 struct inet_timewait_sock *tw = inet_twsk(sk); 866 struct tcp_timewait_sock *tcptw = tcp_twsk(sk); 867 868 tcp_v4_send_ack(sk, skb, 869 tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, 870 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, 871 tcp_time_stamp_raw() + tcptw->tw_ts_offset, 872 tcptw->tw_ts_recent, 873 tw->tw_bound_dev_if, 874 tcp_twsk_md5_key(tcptw), 875 tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0, 876 tw->tw_tos 877 ); 878 879 inet_twsk_put(tw); 880 } 881 882 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, 883 struct request_sock *req) 884 { 885 /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV 886 * sk->sk_state == TCP_SYN_RECV -> for Fast Open. 887 */ 888 u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 : 889 tcp_sk(sk)->snd_nxt; 890 891 /* RFC 7323 2.3 892 * The window field (SEG.WND) of every outgoing segment, with the 893 * exception of <SYN> segments, MUST be right-shifted by 894 * Rcv.Wind.Shift bits: 895 */ 896 tcp_v4_send_ack(sk, skb, seq, 897 tcp_rsk(req)->rcv_nxt, 898 req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale, 899 tcp_time_stamp_raw() + tcp_rsk(req)->ts_off, 900 req->ts_recent, 901 0, 902 tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->saddr, 903 AF_INET), 904 inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0, 905 ip_hdr(skb)->tos); 906 } 907 908 /* 909 * Send a SYN-ACK after having received a SYN. 910 * This still operates on a request_sock only, not on a big 911 * socket. 912 */ 913 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, 914 struct flowi *fl, 915 struct request_sock *req, 916 struct tcp_fastopen_cookie *foc, 917 enum tcp_synack_type synack_type) 918 { 919 const struct inet_request_sock *ireq = inet_rsk(req); 920 struct flowi4 fl4; 921 int err = -1; 922 struct sk_buff *skb; 923 924 /* First, grab a route. */ 925 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL) 926 return -1; 927 928 skb = tcp_make_synack(sk, dst, req, foc, synack_type); 929 930 if (skb) { 931 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr); 932 933 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr, 934 ireq->ir_rmt_addr, 935 ireq_opt_deref(ireq)); 936 err = net_xmit_eval(err); 937 } 938 939 return err; 940 } 941 942 /* 943 * IPv4 request_sock destructor. 944 */ 945 static void tcp_v4_reqsk_destructor(struct request_sock *req) 946 { 947 kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1)); 948 } 949 950 #ifdef CONFIG_TCP_MD5SIG 951 /* 952 * RFC2385 MD5 checksumming requires a mapping of 953 * IP address->MD5 Key. 954 * We need to maintain these in the sk structure. 955 */ 956 957 /* Find the Key structure for an address. */ 958 struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk, 959 const union tcp_md5_addr *addr, 960 int family) 961 { 962 const struct tcp_sock *tp = tcp_sk(sk); 963 struct tcp_md5sig_key *key; 964 const struct tcp_md5sig_info *md5sig; 965 __be32 mask; 966 struct tcp_md5sig_key *best_match = NULL; 967 bool match; 968 969 /* caller either holds rcu_read_lock() or socket lock */ 970 md5sig = rcu_dereference_check(tp->md5sig_info, 971 lockdep_sock_is_held(sk)); 972 if (!md5sig) 973 return NULL; 974 975 hlist_for_each_entry_rcu(key, &md5sig->head, node) { 976 if (key->family != family) 977 continue; 978 979 if (family == AF_INET) { 980 mask = inet_make_mask(key->prefixlen); 981 match = (key->addr.a4.s_addr & mask) == 982 (addr->a4.s_addr & mask); 983 #if IS_ENABLED(CONFIG_IPV6) 984 } else if (family == AF_INET6) { 985 match = ipv6_prefix_equal(&key->addr.a6, &addr->a6, 986 key->prefixlen); 987 #endif 988 } else { 989 match = false; 990 } 991 992 if (match && (!best_match || 993 key->prefixlen > best_match->prefixlen)) 994 best_match = key; 995 } 996 return best_match; 997 } 998 EXPORT_SYMBOL(tcp_md5_do_lookup); 999 1000 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk, 1001 const union tcp_md5_addr *addr, 1002 int family, u8 prefixlen) 1003 { 1004 const struct tcp_sock *tp = tcp_sk(sk); 1005 struct tcp_md5sig_key *key; 1006 unsigned int size = sizeof(struct in_addr); 1007 const struct tcp_md5sig_info *md5sig; 1008 1009 /* caller either holds rcu_read_lock() or socket lock */ 1010 md5sig = rcu_dereference_check(tp->md5sig_info, 1011 lockdep_sock_is_held(sk)); 1012 if (!md5sig) 1013 return NULL; 1014 #if IS_ENABLED(CONFIG_IPV6) 1015 if (family == AF_INET6) 1016 size = sizeof(struct in6_addr); 1017 #endif 1018 hlist_for_each_entry_rcu(key, &md5sig->head, node) { 1019 if (key->family != family) 1020 continue; 1021 if (!memcmp(&key->addr, addr, size) && 1022 key->prefixlen == prefixlen) 1023 return key; 1024 } 1025 return NULL; 1026 } 1027 1028 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk, 1029 const struct sock *addr_sk) 1030 { 1031 const union tcp_md5_addr *addr; 1032 1033 addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr; 1034 return tcp_md5_do_lookup(sk, addr, AF_INET); 1035 } 1036 EXPORT_SYMBOL(tcp_v4_md5_lookup); 1037 1038 /* This can be called on a newly created socket, from other files */ 1039 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, 1040 int family, u8 prefixlen, const u8 *newkey, u8 newkeylen, 1041 gfp_t gfp) 1042 { 1043 /* Add Key to the list */ 1044 struct tcp_md5sig_key *key; 1045 struct tcp_sock *tp = tcp_sk(sk); 1046 struct tcp_md5sig_info *md5sig; 1047 1048 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen); 1049 if (key) { 1050 /* Pre-existing entry - just update that one. */ 1051 memcpy(key->key, newkey, newkeylen); 1052 key->keylen = newkeylen; 1053 return 0; 1054 } 1055 1056 md5sig = rcu_dereference_protected(tp->md5sig_info, 1057 lockdep_sock_is_held(sk)); 1058 if (!md5sig) { 1059 md5sig = kmalloc(sizeof(*md5sig), gfp); 1060 if (!md5sig) 1061 return -ENOMEM; 1062 1063 sk_nocaps_add(sk, NETIF_F_GSO_MASK); 1064 INIT_HLIST_HEAD(&md5sig->head); 1065 rcu_assign_pointer(tp->md5sig_info, md5sig); 1066 } 1067 1068 key = sock_kmalloc(sk, sizeof(*key), gfp); 1069 if (!key) 1070 return -ENOMEM; 1071 if (!tcp_alloc_md5sig_pool()) { 1072 sock_kfree_s(sk, key, sizeof(*key)); 1073 return -ENOMEM; 1074 } 1075 1076 memcpy(key->key, newkey, newkeylen); 1077 key->keylen = newkeylen; 1078 key->family = family; 1079 key->prefixlen = prefixlen; 1080 memcpy(&key->addr, addr, 1081 (family == AF_INET6) ? sizeof(struct in6_addr) : 1082 sizeof(struct in_addr)); 1083 hlist_add_head_rcu(&key->node, &md5sig->head); 1084 return 0; 1085 } 1086 EXPORT_SYMBOL(tcp_md5_do_add); 1087 1088 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family, 1089 u8 prefixlen) 1090 { 1091 struct tcp_md5sig_key *key; 1092 1093 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen); 1094 if (!key) 1095 return -ENOENT; 1096 hlist_del_rcu(&key->node); 1097 atomic_sub(sizeof(*key), &sk->sk_omem_alloc); 1098 kfree_rcu(key, rcu); 1099 return 0; 1100 } 1101 EXPORT_SYMBOL(tcp_md5_do_del); 1102 1103 static void tcp_clear_md5_list(struct sock *sk) 1104 { 1105 struct tcp_sock *tp = tcp_sk(sk); 1106 struct tcp_md5sig_key *key; 1107 struct hlist_node *n; 1108 struct tcp_md5sig_info *md5sig; 1109 1110 md5sig = rcu_dereference_protected(tp->md5sig_info, 1); 1111 1112 hlist_for_each_entry_safe(key, n, &md5sig->head, node) { 1113 hlist_del_rcu(&key->node); 1114 atomic_sub(sizeof(*key), &sk->sk_omem_alloc); 1115 kfree_rcu(key, rcu); 1116 } 1117 } 1118 1119 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname, 1120 char __user *optval, int optlen) 1121 { 1122 struct tcp_md5sig cmd; 1123 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr; 1124 u8 prefixlen = 32; 1125 1126 if (optlen < sizeof(cmd)) 1127 return -EINVAL; 1128 1129 if (copy_from_user(&cmd, optval, sizeof(cmd))) 1130 return -EFAULT; 1131 1132 if (sin->sin_family != AF_INET) 1133 return -EINVAL; 1134 1135 if (optname == TCP_MD5SIG_EXT && 1136 cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) { 1137 prefixlen = cmd.tcpm_prefixlen; 1138 if (prefixlen > 32) 1139 return -EINVAL; 1140 } 1141 1142 if (!cmd.tcpm_keylen) 1143 return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr, 1144 AF_INET, prefixlen); 1145 1146 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN) 1147 return -EINVAL; 1148 1149 return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr, 1150 AF_INET, prefixlen, cmd.tcpm_key, cmd.tcpm_keylen, 1151 GFP_KERNEL); 1152 } 1153 1154 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp, 1155 __be32 daddr, __be32 saddr, 1156 const struct tcphdr *th, int nbytes) 1157 { 1158 struct tcp4_pseudohdr *bp; 1159 struct scatterlist sg; 1160 struct tcphdr *_th; 1161 1162 bp = hp->scratch; 1163 bp->saddr = saddr; 1164 bp->daddr = daddr; 1165 bp->pad = 0; 1166 bp->protocol = IPPROTO_TCP; 1167 bp->len = cpu_to_be16(nbytes); 1168 1169 _th = (struct tcphdr *)(bp + 1); 1170 memcpy(_th, th, sizeof(*th)); 1171 _th->check = 0; 1172 1173 sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th)); 1174 ahash_request_set_crypt(hp->md5_req, &sg, NULL, 1175 sizeof(*bp) + sizeof(*th)); 1176 return crypto_ahash_update(hp->md5_req); 1177 } 1178 1179 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, 1180 __be32 daddr, __be32 saddr, const struct tcphdr *th) 1181 { 1182 struct tcp_md5sig_pool *hp; 1183 struct ahash_request *req; 1184 1185 hp = tcp_get_md5sig_pool(); 1186 if (!hp) 1187 goto clear_hash_noput; 1188 req = hp->md5_req; 1189 1190 if (crypto_ahash_init(req)) 1191 goto clear_hash; 1192 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2)) 1193 goto clear_hash; 1194 if (tcp_md5_hash_key(hp, key)) 1195 goto clear_hash; 1196 ahash_request_set_crypt(req, NULL, md5_hash, 0); 1197 if (crypto_ahash_final(req)) 1198 goto clear_hash; 1199 1200 tcp_put_md5sig_pool(); 1201 return 0; 1202 1203 clear_hash: 1204 tcp_put_md5sig_pool(); 1205 clear_hash_noput: 1206 memset(md5_hash, 0, 16); 1207 return 1; 1208 } 1209 1210 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key, 1211 const struct sock *sk, 1212 const struct sk_buff *skb) 1213 { 1214 struct tcp_md5sig_pool *hp; 1215 struct ahash_request *req; 1216 const struct tcphdr *th = tcp_hdr(skb); 1217 __be32 saddr, daddr; 1218 1219 if (sk) { /* valid for establish/request sockets */ 1220 saddr = sk->sk_rcv_saddr; 1221 daddr = sk->sk_daddr; 1222 } else { 1223 const struct iphdr *iph = ip_hdr(skb); 1224 saddr = iph->saddr; 1225 daddr = iph->daddr; 1226 } 1227 1228 hp = tcp_get_md5sig_pool(); 1229 if (!hp) 1230 goto clear_hash_noput; 1231 req = hp->md5_req; 1232 1233 if (crypto_ahash_init(req)) 1234 goto clear_hash; 1235 1236 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len)) 1237 goto clear_hash; 1238 if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2)) 1239 goto clear_hash; 1240 if (tcp_md5_hash_key(hp, key)) 1241 goto clear_hash; 1242 ahash_request_set_crypt(req, NULL, md5_hash, 0); 1243 if (crypto_ahash_final(req)) 1244 goto clear_hash; 1245 1246 tcp_put_md5sig_pool(); 1247 return 0; 1248 1249 clear_hash: 1250 tcp_put_md5sig_pool(); 1251 clear_hash_noput: 1252 memset(md5_hash, 0, 16); 1253 return 1; 1254 } 1255 EXPORT_SYMBOL(tcp_v4_md5_hash_skb); 1256 1257 #endif 1258 1259 /* Called with rcu_read_lock() */ 1260 static bool tcp_v4_inbound_md5_hash(const struct sock *sk, 1261 const struct sk_buff *skb) 1262 { 1263 #ifdef CONFIG_TCP_MD5SIG 1264 /* 1265 * This gets called for each TCP segment that arrives 1266 * so we want to be efficient. 1267 * We have 3 drop cases: 1268 * o No MD5 hash and one expected. 1269 * o MD5 hash and we're not expecting one. 1270 * o MD5 hash and its wrong. 1271 */ 1272 const __u8 *hash_location = NULL; 1273 struct tcp_md5sig_key *hash_expected; 1274 const struct iphdr *iph = ip_hdr(skb); 1275 const struct tcphdr *th = tcp_hdr(skb); 1276 int genhash; 1277 unsigned char newhash[16]; 1278 1279 hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr, 1280 AF_INET); 1281 hash_location = tcp_parse_md5sig_option(th); 1282 1283 /* We've parsed the options - do we have a hash? */ 1284 if (!hash_expected && !hash_location) 1285 return false; 1286 1287 if (hash_expected && !hash_location) { 1288 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND); 1289 return true; 1290 } 1291 1292 if (!hash_expected && hash_location) { 1293 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED); 1294 return true; 1295 } 1296 1297 /* Okay, so this is hash_expected and hash_location - 1298 * so we need to calculate the checksum. 1299 */ 1300 genhash = tcp_v4_md5_hash_skb(newhash, 1301 hash_expected, 1302 NULL, skb); 1303 1304 if (genhash || memcmp(hash_location, newhash, 16) != 0) { 1305 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE); 1306 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n", 1307 &iph->saddr, ntohs(th->source), 1308 &iph->daddr, ntohs(th->dest), 1309 genhash ? " tcp_v4_calc_md5_hash failed" 1310 : ""); 1311 return true; 1312 } 1313 return false; 1314 #endif 1315 return false; 1316 } 1317 1318 static void tcp_v4_init_req(struct request_sock *req, 1319 const struct sock *sk_listener, 1320 struct sk_buff *skb) 1321 { 1322 struct inet_request_sock *ireq = inet_rsk(req); 1323 struct net *net = sock_net(sk_listener); 1324 1325 sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr); 1326 sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr); 1327 RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb)); 1328 } 1329 1330 static struct dst_entry *tcp_v4_route_req(const struct sock *sk, 1331 struct flowi *fl, 1332 const struct request_sock *req) 1333 { 1334 return inet_csk_route_req(sk, &fl->u.ip4, req); 1335 } 1336 1337 struct request_sock_ops tcp_request_sock_ops __read_mostly = { 1338 .family = PF_INET, 1339 .obj_size = sizeof(struct tcp_request_sock), 1340 .rtx_syn_ack = tcp_rtx_synack, 1341 .send_ack = tcp_v4_reqsk_send_ack, 1342 .destructor = tcp_v4_reqsk_destructor, 1343 .send_reset = tcp_v4_send_reset, 1344 .syn_ack_timeout = tcp_syn_ack_timeout, 1345 }; 1346 1347 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { 1348 .mss_clamp = TCP_MSS_DEFAULT, 1349 #ifdef CONFIG_TCP_MD5SIG 1350 .req_md5_lookup = tcp_v4_md5_lookup, 1351 .calc_md5_hash = tcp_v4_md5_hash_skb, 1352 #endif 1353 .init_req = tcp_v4_init_req, 1354 #ifdef CONFIG_SYN_COOKIES 1355 .cookie_init_seq = cookie_v4_init_sequence, 1356 #endif 1357 .route_req = tcp_v4_route_req, 1358 .init_seq = tcp_v4_init_seq, 1359 .init_ts_off = tcp_v4_init_ts_off, 1360 .send_synack = tcp_v4_send_synack, 1361 }; 1362 1363 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) 1364 { 1365 /* Never answer to SYNs send to broadcast or multicast */ 1366 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) 1367 goto drop; 1368 1369 return tcp_conn_request(&tcp_request_sock_ops, 1370 &tcp_request_sock_ipv4_ops, sk, skb); 1371 1372 drop: 1373 tcp_listendrop(sk); 1374 return 0; 1375 } 1376 EXPORT_SYMBOL(tcp_v4_conn_request); 1377 1378 1379 /* 1380 * The three way handshake has completed - we got a valid synack - 1381 * now create the new socket. 1382 */ 1383 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, 1384 struct request_sock *req, 1385 struct dst_entry *dst, 1386 struct request_sock *req_unhash, 1387 bool *own_req) 1388 { 1389 struct inet_request_sock *ireq; 1390 struct inet_sock *newinet; 1391 struct tcp_sock *newtp; 1392 struct sock *newsk; 1393 #ifdef CONFIG_TCP_MD5SIG 1394 struct tcp_md5sig_key *key; 1395 #endif 1396 struct ip_options_rcu *inet_opt; 1397 1398 if (sk_acceptq_is_full(sk)) 1399 goto exit_overflow; 1400 1401 newsk = tcp_create_openreq_child(sk, req, skb); 1402 if (!newsk) 1403 goto exit_nonewsk; 1404 1405 newsk->sk_gso_type = SKB_GSO_TCPV4; 1406 inet_sk_rx_dst_set(newsk, skb); 1407 1408 newtp = tcp_sk(newsk); 1409 newinet = inet_sk(newsk); 1410 ireq = inet_rsk(req); 1411 sk_daddr_set(newsk, ireq->ir_rmt_addr); 1412 sk_rcv_saddr_set(newsk, ireq->ir_loc_addr); 1413 newsk->sk_bound_dev_if = ireq->ir_iif; 1414 newinet->inet_saddr = ireq->ir_loc_addr; 1415 inet_opt = rcu_dereference(ireq->ireq_opt); 1416 RCU_INIT_POINTER(newinet->inet_opt, inet_opt); 1417 newinet->mc_index = inet_iif(skb); 1418 newinet->mc_ttl = ip_hdr(skb)->ttl; 1419 newinet->rcv_tos = ip_hdr(skb)->tos; 1420 inet_csk(newsk)->icsk_ext_hdr_len = 0; 1421 if (inet_opt) 1422 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen; 1423 newinet->inet_id = newtp->write_seq ^ jiffies; 1424 1425 if (!dst) { 1426 dst = inet_csk_route_child_sock(sk, newsk, req); 1427 if (!dst) 1428 goto put_and_exit; 1429 } else { 1430 /* syncookie case : see end of cookie_v4_check() */ 1431 } 1432 sk_setup_caps(newsk, dst); 1433 1434 tcp_ca_openreq_child(newsk, dst); 1435 1436 tcp_sync_mss(newsk, dst_mtu(dst)); 1437 newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst)); 1438 1439 tcp_initialize_rcv_mss(newsk); 1440 1441 #ifdef CONFIG_TCP_MD5SIG 1442 /* Copy over the MD5 key from the original socket */ 1443 key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr, 1444 AF_INET); 1445 if (key) { 1446 /* 1447 * We're using one, so create a matching key 1448 * on the newsk structure. If we fail to get 1449 * memory, then we end up not copying the key 1450 * across. Shucks. 1451 */ 1452 tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr, 1453 AF_INET, 32, key->key, key->keylen, GFP_ATOMIC); 1454 sk_nocaps_add(newsk, NETIF_F_GSO_MASK); 1455 } 1456 #endif 1457 1458 if (__inet_inherit_port(sk, newsk) < 0) 1459 goto put_and_exit; 1460 *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash)); 1461 if (likely(*own_req)) { 1462 tcp_move_syn(newtp, req); 1463 ireq->ireq_opt = NULL; 1464 } else { 1465 newinet->inet_opt = NULL; 1466 } 1467 return newsk; 1468 1469 exit_overflow: 1470 NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); 1471 exit_nonewsk: 1472 dst_release(dst); 1473 exit: 1474 tcp_listendrop(sk); 1475 return NULL; 1476 put_and_exit: 1477 newinet->inet_opt = NULL; 1478 inet_csk_prepare_forced_close(newsk); 1479 tcp_done(newsk); 1480 goto exit; 1481 } 1482 EXPORT_SYMBOL(tcp_v4_syn_recv_sock); 1483 1484 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb) 1485 { 1486 #ifdef CONFIG_SYN_COOKIES 1487 const struct tcphdr *th = tcp_hdr(skb); 1488 1489 if (!th->syn) 1490 sk = cookie_v4_check(sk, skb); 1491 #endif 1492 return sk; 1493 } 1494 1495 /* The socket must have it's spinlock held when we get 1496 * here, unless it is a TCP_LISTEN socket. 1497 * 1498 * We have a potential double-lock case here, so even when 1499 * doing backlog processing we use the BH locking scheme. 1500 * This is because we cannot sleep with the original spinlock 1501 * held. 1502 */ 1503 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) 1504 { 1505 struct sock *rsk; 1506 1507 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ 1508 struct dst_entry *dst = sk->sk_rx_dst; 1509 1510 sock_rps_save_rxhash(sk, skb); 1511 sk_mark_napi_id(sk, skb); 1512 if (dst) { 1513 if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif || 1514 !dst->ops->check(dst, 0)) { 1515 dst_release(dst); 1516 sk->sk_rx_dst = NULL; 1517 } 1518 } 1519 tcp_rcv_established(sk, skb); 1520 return 0; 1521 } 1522 1523 if (tcp_checksum_complete(skb)) 1524 goto csum_err; 1525 1526 if (sk->sk_state == TCP_LISTEN) { 1527 struct sock *nsk = tcp_v4_cookie_check(sk, skb); 1528 1529 if (!nsk) 1530 goto discard; 1531 if (nsk != sk) { 1532 if (tcp_child_process(sk, nsk, skb)) { 1533 rsk = nsk; 1534 goto reset; 1535 } 1536 return 0; 1537 } 1538 } else 1539 sock_rps_save_rxhash(sk, skb); 1540 1541 if (tcp_rcv_state_process(sk, skb)) { 1542 rsk = sk; 1543 goto reset; 1544 } 1545 return 0; 1546 1547 reset: 1548 tcp_v4_send_reset(rsk, skb); 1549 discard: 1550 kfree_skb(skb); 1551 /* Be careful here. If this function gets more complicated and 1552 * gcc suffers from register pressure on the x86, sk (in %ebx) 1553 * might be destroyed here. This current version compiles correctly, 1554 * but you have been warned. 1555 */ 1556 return 0; 1557 1558 csum_err: 1559 TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); 1560 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); 1561 goto discard; 1562 } 1563 EXPORT_SYMBOL(tcp_v4_do_rcv); 1564 1565 int tcp_v4_early_demux(struct sk_buff *skb) 1566 { 1567 const struct iphdr *iph; 1568 const struct tcphdr *th; 1569 struct sock *sk; 1570 1571 if (skb->pkt_type != PACKET_HOST) 1572 return 0; 1573 1574 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr))) 1575 return 0; 1576 1577 iph = ip_hdr(skb); 1578 th = tcp_hdr(skb); 1579 1580 if (th->doff < sizeof(struct tcphdr) / 4) 1581 return 0; 1582 1583 sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo, 1584 iph->saddr, th->source, 1585 iph->daddr, ntohs(th->dest), 1586 skb->skb_iif, inet_sdif(skb)); 1587 if (sk) { 1588 skb->sk = sk; 1589 skb->destructor = sock_edemux; 1590 if (sk_fullsock(sk)) { 1591 struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst); 1592 1593 if (dst) 1594 dst = dst_check(dst, 0); 1595 if (dst && 1596 inet_sk(sk)->rx_dst_ifindex == skb->skb_iif) 1597 skb_dst_set_noref(skb, dst); 1598 } 1599 } 1600 return 0; 1601 } 1602 1603 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb) 1604 { 1605 u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf; 1606 1607 /* Only socket owner can try to collapse/prune rx queues 1608 * to reduce memory overhead, so add a little headroom here. 1609 * Few sockets backlog are possibly concurrently non empty. 1610 */ 1611 limit += 64*1024; 1612 1613 /* In case all data was pulled from skb frags (in __pskb_pull_tail()), 1614 * we can fix skb->truesize to its real value to avoid future drops. 1615 * This is valid because skb is not yet charged to the socket. 1616 * It has been noticed pure SACK packets were sometimes dropped 1617 * (if cooked by drivers without copybreak feature). 1618 */ 1619 skb_condense(skb); 1620 1621 if (unlikely(sk_add_backlog(sk, skb, limit))) { 1622 bh_unlock_sock(sk); 1623 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP); 1624 return true; 1625 } 1626 return false; 1627 } 1628 EXPORT_SYMBOL(tcp_add_backlog); 1629 1630 int tcp_filter(struct sock *sk, struct sk_buff *skb) 1631 { 1632 struct tcphdr *th = (struct tcphdr *)skb->data; 1633 unsigned int eaten = skb->len; 1634 int err; 1635 1636 err = sk_filter_trim_cap(sk, skb, th->doff * 4); 1637 if (!err) { 1638 eaten -= skb->len; 1639 TCP_SKB_CB(skb)->end_seq -= eaten; 1640 } 1641 return err; 1642 } 1643 EXPORT_SYMBOL(tcp_filter); 1644 1645 static void tcp_v4_restore_cb(struct sk_buff *skb) 1646 { 1647 memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4, 1648 sizeof(struct inet_skb_parm)); 1649 } 1650 1651 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph, 1652 const struct tcphdr *th) 1653 { 1654 /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB() 1655 * barrier() makes sure compiler wont play fool^Waliasing games. 1656 */ 1657 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb), 1658 sizeof(struct inet_skb_parm)); 1659 barrier(); 1660 1661 TCP_SKB_CB(skb)->seq = ntohl(th->seq); 1662 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + 1663 skb->len - th->doff * 4); 1664 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); 1665 TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th); 1666 TCP_SKB_CB(skb)->tcp_tw_isn = 0; 1667 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph); 1668 TCP_SKB_CB(skb)->sacked = 0; 1669 TCP_SKB_CB(skb)->has_rxtstamp = 1670 skb->tstamp || skb_hwtstamps(skb)->hwtstamp; 1671 } 1672 1673 /* 1674 * From tcp_input.c 1675 */ 1676 1677 int tcp_v4_rcv(struct sk_buff *skb) 1678 { 1679 struct net *net = dev_net(skb->dev); 1680 int sdif = inet_sdif(skb); 1681 const struct iphdr *iph; 1682 const struct tcphdr *th; 1683 bool refcounted; 1684 struct sock *sk; 1685 int ret; 1686 1687 if (skb->pkt_type != PACKET_HOST) 1688 goto discard_it; 1689 1690 /* Count it even if it's bad */ 1691 __TCP_INC_STATS(net, TCP_MIB_INSEGS); 1692 1693 if (!pskb_may_pull(skb, sizeof(struct tcphdr))) 1694 goto discard_it; 1695 1696 th = (const struct tcphdr *)skb->data; 1697 1698 if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) 1699 goto bad_packet; 1700 if (!pskb_may_pull(skb, th->doff * 4)) 1701 goto discard_it; 1702 1703 /* An explanation is required here, I think. 1704 * Packet length and doff are validated by header prediction, 1705 * provided case of th->doff==0 is eliminated. 1706 * So, we defer the checks. */ 1707 1708 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo)) 1709 goto csum_error; 1710 1711 th = (const struct tcphdr *)skb->data; 1712 iph = ip_hdr(skb); 1713 lookup: 1714 sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source, 1715 th->dest, sdif, &refcounted); 1716 if (!sk) 1717 goto no_tcp_socket; 1718 1719 process: 1720 if (sk->sk_state == TCP_TIME_WAIT) 1721 goto do_time_wait; 1722 1723 if (sk->sk_state == TCP_NEW_SYN_RECV) { 1724 struct request_sock *req = inet_reqsk(sk); 1725 bool req_stolen = false; 1726 struct sock *nsk; 1727 1728 sk = req->rsk_listener; 1729 if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) { 1730 sk_drops_add(sk, skb); 1731 reqsk_put(req); 1732 goto discard_it; 1733 } 1734 if (tcp_checksum_complete(skb)) { 1735 reqsk_put(req); 1736 goto csum_error; 1737 } 1738 if (unlikely(sk->sk_state != TCP_LISTEN)) { 1739 inet_csk_reqsk_queue_drop_and_put(sk, req); 1740 goto lookup; 1741 } 1742 /* We own a reference on the listener, increase it again 1743 * as we might lose it too soon. 1744 */ 1745 sock_hold(sk); 1746 refcounted = true; 1747 nsk = NULL; 1748 if (!tcp_filter(sk, skb)) { 1749 th = (const struct tcphdr *)skb->data; 1750 iph = ip_hdr(skb); 1751 tcp_v4_fill_cb(skb, iph, th); 1752 nsk = tcp_check_req(sk, skb, req, false, &req_stolen); 1753 } 1754 if (!nsk) { 1755 reqsk_put(req); 1756 if (req_stolen) { 1757 /* Another cpu got exclusive access to req 1758 * and created a full blown socket. 1759 * Try to feed this packet to this socket 1760 * instead of discarding it. 1761 */ 1762 tcp_v4_restore_cb(skb); 1763 sock_put(sk); 1764 goto lookup; 1765 } 1766 goto discard_and_relse; 1767 } 1768 if (nsk == sk) { 1769 reqsk_put(req); 1770 tcp_v4_restore_cb(skb); 1771 } else if (tcp_child_process(sk, nsk, skb)) { 1772 tcp_v4_send_reset(nsk, skb); 1773 goto discard_and_relse; 1774 } else { 1775 sock_put(sk); 1776 return 0; 1777 } 1778 } 1779 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) { 1780 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); 1781 goto discard_and_relse; 1782 } 1783 1784 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) 1785 goto discard_and_relse; 1786 1787 if (tcp_v4_inbound_md5_hash(sk, skb)) 1788 goto discard_and_relse; 1789 1790 nf_reset(skb); 1791 1792 if (tcp_filter(sk, skb)) 1793 goto discard_and_relse; 1794 th = (const struct tcphdr *)skb->data; 1795 iph = ip_hdr(skb); 1796 tcp_v4_fill_cb(skb, iph, th); 1797 1798 skb->dev = NULL; 1799 1800 if (sk->sk_state == TCP_LISTEN) { 1801 ret = tcp_v4_do_rcv(sk, skb); 1802 goto put_and_return; 1803 } 1804 1805 sk_incoming_cpu_update(sk); 1806 1807 bh_lock_sock_nested(sk); 1808 tcp_segs_in(tcp_sk(sk), skb); 1809 ret = 0; 1810 if (!sock_owned_by_user(sk)) { 1811 ret = tcp_v4_do_rcv(sk, skb); 1812 } else if (tcp_add_backlog(sk, skb)) { 1813 goto discard_and_relse; 1814 } 1815 bh_unlock_sock(sk); 1816 1817 put_and_return: 1818 if (refcounted) 1819 sock_put(sk); 1820 1821 return ret; 1822 1823 no_tcp_socket: 1824 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) 1825 goto discard_it; 1826 1827 tcp_v4_fill_cb(skb, iph, th); 1828 1829 if (tcp_checksum_complete(skb)) { 1830 csum_error: 1831 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS); 1832 bad_packet: 1833 __TCP_INC_STATS(net, TCP_MIB_INERRS); 1834 } else { 1835 tcp_v4_send_reset(NULL, skb); 1836 } 1837 1838 discard_it: 1839 /* Discard frame. */ 1840 kfree_skb(skb); 1841 return 0; 1842 1843 discard_and_relse: 1844 sk_drops_add(sk, skb); 1845 if (refcounted) 1846 sock_put(sk); 1847 goto discard_it; 1848 1849 do_time_wait: 1850 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { 1851 inet_twsk_put(inet_twsk(sk)); 1852 goto discard_it; 1853 } 1854 1855 tcp_v4_fill_cb(skb, iph, th); 1856 1857 if (tcp_checksum_complete(skb)) { 1858 inet_twsk_put(inet_twsk(sk)); 1859 goto csum_error; 1860 } 1861 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) { 1862 case TCP_TW_SYN: { 1863 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev), 1864 &tcp_hashinfo, skb, 1865 __tcp_hdrlen(th), 1866 iph->saddr, th->source, 1867 iph->daddr, th->dest, 1868 inet_iif(skb), 1869 sdif); 1870 if (sk2) { 1871 inet_twsk_deschedule_put(inet_twsk(sk)); 1872 sk = sk2; 1873 tcp_v4_restore_cb(skb); 1874 refcounted = false; 1875 goto process; 1876 } 1877 } 1878 /* to ACK */ 1879 /* fall through */ 1880 case TCP_TW_ACK: 1881 tcp_v4_timewait_ack(sk, skb); 1882 break; 1883 case TCP_TW_RST: 1884 tcp_v4_send_reset(sk, skb); 1885 inet_twsk_deschedule_put(inet_twsk(sk)); 1886 goto discard_it; 1887 case TCP_TW_SUCCESS:; 1888 } 1889 goto discard_it; 1890 } 1891 1892 static struct timewait_sock_ops tcp_timewait_sock_ops = { 1893 .twsk_obj_size = sizeof(struct tcp_timewait_sock), 1894 .twsk_unique = tcp_twsk_unique, 1895 .twsk_destructor= tcp_twsk_destructor, 1896 }; 1897 1898 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) 1899 { 1900 struct dst_entry *dst = skb_dst(skb); 1901 1902 if (dst && dst_hold_safe(dst)) { 1903 sk->sk_rx_dst = dst; 1904 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif; 1905 } 1906 } 1907 EXPORT_SYMBOL(inet_sk_rx_dst_set); 1908 1909 const struct inet_connection_sock_af_ops ipv4_specific = { 1910 .queue_xmit = ip_queue_xmit, 1911 .send_check = tcp_v4_send_check, 1912 .rebuild_header = inet_sk_rebuild_header, 1913 .sk_rx_dst_set = inet_sk_rx_dst_set, 1914 .conn_request = tcp_v4_conn_request, 1915 .syn_recv_sock = tcp_v4_syn_recv_sock, 1916 .net_header_len = sizeof(struct iphdr), 1917 .setsockopt = ip_setsockopt, 1918 .getsockopt = ip_getsockopt, 1919 .addr2sockaddr = inet_csk_addr2sockaddr, 1920 .sockaddr_len = sizeof(struct sockaddr_in), 1921 #ifdef CONFIG_COMPAT 1922 .compat_setsockopt = compat_ip_setsockopt, 1923 .compat_getsockopt = compat_ip_getsockopt, 1924 #endif 1925 .mtu_reduced = tcp_v4_mtu_reduced, 1926 }; 1927 EXPORT_SYMBOL(ipv4_specific); 1928 1929 #ifdef CONFIG_TCP_MD5SIG 1930 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = { 1931 .md5_lookup = tcp_v4_md5_lookup, 1932 .calc_md5_hash = tcp_v4_md5_hash_skb, 1933 .md5_parse = tcp_v4_parse_md5_keys, 1934 }; 1935 #endif 1936 1937 /* NOTE: A lot of things set to zero explicitly by call to 1938 * sk_alloc() so need not be done here. 1939 */ 1940 static int tcp_v4_init_sock(struct sock *sk) 1941 { 1942 struct inet_connection_sock *icsk = inet_csk(sk); 1943 1944 tcp_init_sock(sk); 1945 1946 icsk->icsk_af_ops = &ipv4_specific; 1947 1948 #ifdef CONFIG_TCP_MD5SIG 1949 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific; 1950 #endif 1951 1952 return 0; 1953 } 1954 1955 void tcp_v4_destroy_sock(struct sock *sk) 1956 { 1957 struct tcp_sock *tp = tcp_sk(sk); 1958 1959 trace_tcp_destroy_sock(sk); 1960 1961 tcp_clear_xmit_timers(sk); 1962 1963 tcp_cleanup_congestion_control(sk); 1964 1965 tcp_cleanup_ulp(sk); 1966 1967 /* Cleanup up the write buffer. */ 1968 tcp_write_queue_purge(sk); 1969 1970 /* Check if we want to disable active TFO */ 1971 tcp_fastopen_active_disable_ofo_check(sk); 1972 1973 /* Cleans up our, hopefully empty, out_of_order_queue. */ 1974 skb_rbtree_purge(&tp->out_of_order_queue); 1975 1976 #ifdef CONFIG_TCP_MD5SIG 1977 /* Clean up the MD5 key list, if any */ 1978 if (tp->md5sig_info) { 1979 tcp_clear_md5_list(sk); 1980 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu); 1981 tp->md5sig_info = NULL; 1982 } 1983 #endif 1984 1985 /* Clean up a referenced TCP bind bucket. */ 1986 if (inet_csk(sk)->icsk_bind_hash) 1987 inet_put_port(sk); 1988 1989 BUG_ON(tp->fastopen_rsk); 1990 1991 /* If socket is aborted during connect operation */ 1992 tcp_free_fastopen_req(tp); 1993 tcp_fastopen_destroy_cipher(sk); 1994 tcp_saved_syn_free(tp); 1995 1996 sk_sockets_allocated_dec(sk); 1997 } 1998 EXPORT_SYMBOL(tcp_v4_destroy_sock); 1999 2000 #ifdef CONFIG_PROC_FS 2001 /* Proc filesystem TCP sock list dumping. */ 2002 2003 /* 2004 * Get next listener socket follow cur. If cur is NULL, get first socket 2005 * starting from bucket given in st->bucket; when st->bucket is zero the 2006 * very first socket in the hash table is returned. 2007 */ 2008 static void *listening_get_next(struct seq_file *seq, void *cur) 2009 { 2010 struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file)); 2011 struct tcp_iter_state *st = seq->private; 2012 struct net *net = seq_file_net(seq); 2013 struct inet_listen_hashbucket *ilb; 2014 struct sock *sk = cur; 2015 2016 if (!sk) { 2017 get_head: 2018 ilb = &tcp_hashinfo.listening_hash[st->bucket]; 2019 spin_lock(&ilb->lock); 2020 sk = sk_head(&ilb->head); 2021 st->offset = 0; 2022 goto get_sk; 2023 } 2024 ilb = &tcp_hashinfo.listening_hash[st->bucket]; 2025 ++st->num; 2026 ++st->offset; 2027 2028 sk = sk_next(sk); 2029 get_sk: 2030 sk_for_each_from(sk) { 2031 if (!net_eq(sock_net(sk), net)) 2032 continue; 2033 if (sk->sk_family == afinfo->family) 2034 return sk; 2035 } 2036 spin_unlock(&ilb->lock); 2037 st->offset = 0; 2038 if (++st->bucket < INET_LHTABLE_SIZE) 2039 goto get_head; 2040 return NULL; 2041 } 2042 2043 static void *listening_get_idx(struct seq_file *seq, loff_t *pos) 2044 { 2045 struct tcp_iter_state *st = seq->private; 2046 void *rc; 2047 2048 st->bucket = 0; 2049 st->offset = 0; 2050 rc = listening_get_next(seq, NULL); 2051 2052 while (rc && *pos) { 2053 rc = listening_get_next(seq, rc); 2054 --*pos; 2055 } 2056 return rc; 2057 } 2058 2059 static inline bool empty_bucket(const struct tcp_iter_state *st) 2060 { 2061 return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain); 2062 } 2063 2064 /* 2065 * Get first established socket starting from bucket given in st->bucket. 2066 * If st->bucket is zero, the very first socket in the hash is returned. 2067 */ 2068 static void *established_get_first(struct seq_file *seq) 2069 { 2070 struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file)); 2071 struct tcp_iter_state *st = seq->private; 2072 struct net *net = seq_file_net(seq); 2073 void *rc = NULL; 2074 2075 st->offset = 0; 2076 for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) { 2077 struct sock *sk; 2078 struct hlist_nulls_node *node; 2079 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket); 2080 2081 /* Lockless fast path for the common case of empty buckets */ 2082 if (empty_bucket(st)) 2083 continue; 2084 2085 spin_lock_bh(lock); 2086 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) { 2087 if (sk->sk_family != afinfo->family || 2088 !net_eq(sock_net(sk), net)) { 2089 continue; 2090 } 2091 rc = sk; 2092 goto out; 2093 } 2094 spin_unlock_bh(lock); 2095 } 2096 out: 2097 return rc; 2098 } 2099 2100 static void *established_get_next(struct seq_file *seq, void *cur) 2101 { 2102 struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file)); 2103 struct sock *sk = cur; 2104 struct hlist_nulls_node *node; 2105 struct tcp_iter_state *st = seq->private; 2106 struct net *net = seq_file_net(seq); 2107 2108 ++st->num; 2109 ++st->offset; 2110 2111 sk = sk_nulls_next(sk); 2112 2113 sk_nulls_for_each_from(sk, node) { 2114 if (sk->sk_family == afinfo->family && 2115 net_eq(sock_net(sk), net)) 2116 return sk; 2117 } 2118 2119 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); 2120 ++st->bucket; 2121 return established_get_first(seq); 2122 } 2123 2124 static void *established_get_idx(struct seq_file *seq, loff_t pos) 2125 { 2126 struct tcp_iter_state *st = seq->private; 2127 void *rc; 2128 2129 st->bucket = 0; 2130 rc = established_get_first(seq); 2131 2132 while (rc && pos) { 2133 rc = established_get_next(seq, rc); 2134 --pos; 2135 } 2136 return rc; 2137 } 2138 2139 static void *tcp_get_idx(struct seq_file *seq, loff_t pos) 2140 { 2141 void *rc; 2142 struct tcp_iter_state *st = seq->private; 2143 2144 st->state = TCP_SEQ_STATE_LISTENING; 2145 rc = listening_get_idx(seq, &pos); 2146 2147 if (!rc) { 2148 st->state = TCP_SEQ_STATE_ESTABLISHED; 2149 rc = established_get_idx(seq, pos); 2150 } 2151 2152 return rc; 2153 } 2154 2155 static void *tcp_seek_last_pos(struct seq_file *seq) 2156 { 2157 struct tcp_iter_state *st = seq->private; 2158 int offset = st->offset; 2159 int orig_num = st->num; 2160 void *rc = NULL; 2161 2162 switch (st->state) { 2163 case TCP_SEQ_STATE_LISTENING: 2164 if (st->bucket >= INET_LHTABLE_SIZE) 2165 break; 2166 st->state = TCP_SEQ_STATE_LISTENING; 2167 rc = listening_get_next(seq, NULL); 2168 while (offset-- && rc) 2169 rc = listening_get_next(seq, rc); 2170 if (rc) 2171 break; 2172 st->bucket = 0; 2173 st->state = TCP_SEQ_STATE_ESTABLISHED; 2174 /* Fallthrough */ 2175 case TCP_SEQ_STATE_ESTABLISHED: 2176 if (st->bucket > tcp_hashinfo.ehash_mask) 2177 break; 2178 rc = established_get_first(seq); 2179 while (offset-- && rc) 2180 rc = established_get_next(seq, rc); 2181 } 2182 2183 st->num = orig_num; 2184 2185 return rc; 2186 } 2187 2188 void *tcp_seq_start(struct seq_file *seq, loff_t *pos) 2189 { 2190 struct tcp_iter_state *st = seq->private; 2191 void *rc; 2192 2193 if (*pos && *pos == st->last_pos) { 2194 rc = tcp_seek_last_pos(seq); 2195 if (rc) 2196 goto out; 2197 } 2198 2199 st->state = TCP_SEQ_STATE_LISTENING; 2200 st->num = 0; 2201 st->bucket = 0; 2202 st->offset = 0; 2203 rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; 2204 2205 out: 2206 st->last_pos = *pos; 2207 return rc; 2208 } 2209 EXPORT_SYMBOL(tcp_seq_start); 2210 2211 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) 2212 { 2213 struct tcp_iter_state *st = seq->private; 2214 void *rc = NULL; 2215 2216 if (v == SEQ_START_TOKEN) { 2217 rc = tcp_get_idx(seq, 0); 2218 goto out; 2219 } 2220 2221 switch (st->state) { 2222 case TCP_SEQ_STATE_LISTENING: 2223 rc = listening_get_next(seq, v); 2224 if (!rc) { 2225 st->state = TCP_SEQ_STATE_ESTABLISHED; 2226 st->bucket = 0; 2227 st->offset = 0; 2228 rc = established_get_first(seq); 2229 } 2230 break; 2231 case TCP_SEQ_STATE_ESTABLISHED: 2232 rc = established_get_next(seq, v); 2233 break; 2234 } 2235 out: 2236 ++*pos; 2237 st->last_pos = *pos; 2238 return rc; 2239 } 2240 EXPORT_SYMBOL(tcp_seq_next); 2241 2242 void tcp_seq_stop(struct seq_file *seq, void *v) 2243 { 2244 struct tcp_iter_state *st = seq->private; 2245 2246 switch (st->state) { 2247 case TCP_SEQ_STATE_LISTENING: 2248 if (v != SEQ_START_TOKEN) 2249 spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock); 2250 break; 2251 case TCP_SEQ_STATE_ESTABLISHED: 2252 if (v) 2253 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); 2254 break; 2255 } 2256 } 2257 EXPORT_SYMBOL(tcp_seq_stop); 2258 2259 static void get_openreq4(const struct request_sock *req, 2260 struct seq_file *f, int i) 2261 { 2262 const struct inet_request_sock *ireq = inet_rsk(req); 2263 long delta = req->rsk_timer.expires - jiffies; 2264 2265 seq_printf(f, "%4d: %08X:%04X %08X:%04X" 2266 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK", 2267 i, 2268 ireq->ir_loc_addr, 2269 ireq->ir_num, 2270 ireq->ir_rmt_addr, 2271 ntohs(ireq->ir_rmt_port), 2272 TCP_SYN_RECV, 2273 0, 0, /* could print option size, but that is af dependent. */ 2274 1, /* timers active (only the expire timer) */ 2275 jiffies_delta_to_clock_t(delta), 2276 req->num_timeout, 2277 from_kuid_munged(seq_user_ns(f), 2278 sock_i_uid(req->rsk_listener)), 2279 0, /* non standard timer */ 2280 0, /* open_requests have no inode */ 2281 0, 2282 req); 2283 } 2284 2285 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) 2286 { 2287 int timer_active; 2288 unsigned long timer_expires; 2289 const struct tcp_sock *tp = tcp_sk(sk); 2290 const struct inet_connection_sock *icsk = inet_csk(sk); 2291 const struct inet_sock *inet = inet_sk(sk); 2292 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq; 2293 __be32 dest = inet->inet_daddr; 2294 __be32 src = inet->inet_rcv_saddr; 2295 __u16 destp = ntohs(inet->inet_dport); 2296 __u16 srcp = ntohs(inet->inet_sport); 2297 int rx_queue; 2298 int state; 2299 2300 if (icsk->icsk_pending == ICSK_TIME_RETRANS || 2301 icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT || 2302 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { 2303 timer_active = 1; 2304 timer_expires = icsk->icsk_timeout; 2305 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) { 2306 timer_active = 4; 2307 timer_expires = icsk->icsk_timeout; 2308 } else if (timer_pending(&sk->sk_timer)) { 2309 timer_active = 2; 2310 timer_expires = sk->sk_timer.expires; 2311 } else { 2312 timer_active = 0; 2313 timer_expires = jiffies; 2314 } 2315 2316 state = inet_sk_state_load(sk); 2317 if (state == TCP_LISTEN) 2318 rx_queue = sk->sk_ack_backlog; 2319 else 2320 /* Because we don't lock the socket, 2321 * we might find a transient negative value. 2322 */ 2323 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0); 2324 2325 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX " 2326 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d", 2327 i, src, srcp, dest, destp, state, 2328 tp->write_seq - tp->snd_una, 2329 rx_queue, 2330 timer_active, 2331 jiffies_delta_to_clock_t(timer_expires - jiffies), 2332 icsk->icsk_retransmits, 2333 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)), 2334 icsk->icsk_probes_out, 2335 sock_i_ino(sk), 2336 refcount_read(&sk->sk_refcnt), sk, 2337 jiffies_to_clock_t(icsk->icsk_rto), 2338 jiffies_to_clock_t(icsk->icsk_ack.ato), 2339 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong, 2340 tp->snd_cwnd, 2341 state == TCP_LISTEN ? 2342 fastopenq->max_qlen : 2343 (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh)); 2344 } 2345 2346 static void get_timewait4_sock(const struct inet_timewait_sock *tw, 2347 struct seq_file *f, int i) 2348 { 2349 long delta = tw->tw_timer.expires - jiffies; 2350 __be32 dest, src; 2351 __u16 destp, srcp; 2352 2353 dest = tw->tw_daddr; 2354 src = tw->tw_rcv_saddr; 2355 destp = ntohs(tw->tw_dport); 2356 srcp = ntohs(tw->tw_sport); 2357 2358 seq_printf(f, "%4d: %08X:%04X %08X:%04X" 2359 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK", 2360 i, src, srcp, dest, destp, tw->tw_substate, 0, 0, 2361 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0, 2362 refcount_read(&tw->tw_refcnt), tw); 2363 } 2364 2365 #define TMPSZ 150 2366 2367 static int tcp4_seq_show(struct seq_file *seq, void *v) 2368 { 2369 struct tcp_iter_state *st; 2370 struct sock *sk = v; 2371 2372 seq_setwidth(seq, TMPSZ - 1); 2373 if (v == SEQ_START_TOKEN) { 2374 seq_puts(seq, " sl local_address rem_address st tx_queue " 2375 "rx_queue tr tm->when retrnsmt uid timeout " 2376 "inode"); 2377 goto out; 2378 } 2379 st = seq->private; 2380 2381 if (sk->sk_state == TCP_TIME_WAIT) 2382 get_timewait4_sock(v, seq, st->num); 2383 else if (sk->sk_state == TCP_NEW_SYN_RECV) 2384 get_openreq4(v, seq, st->num); 2385 else 2386 get_tcp4_sock(v, seq, st->num); 2387 out: 2388 seq_pad(seq, '\n'); 2389 return 0; 2390 } 2391 2392 static const struct seq_operations tcp4_seq_ops = { 2393 .show = tcp4_seq_show, 2394 .start = tcp_seq_start, 2395 .next = tcp_seq_next, 2396 .stop = tcp_seq_stop, 2397 }; 2398 2399 static struct tcp_seq_afinfo tcp4_seq_afinfo = { 2400 .family = AF_INET, 2401 }; 2402 2403 static int __net_init tcp4_proc_init_net(struct net *net) 2404 { 2405 if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops, 2406 sizeof(struct tcp_iter_state), &tcp4_seq_afinfo)) 2407 return -ENOMEM; 2408 return 0; 2409 } 2410 2411 static void __net_exit tcp4_proc_exit_net(struct net *net) 2412 { 2413 remove_proc_entry("tcp", net->proc_net); 2414 } 2415 2416 static struct pernet_operations tcp4_net_ops = { 2417 .init = tcp4_proc_init_net, 2418 .exit = tcp4_proc_exit_net, 2419 }; 2420 2421 int __init tcp4_proc_init(void) 2422 { 2423 return register_pernet_subsys(&tcp4_net_ops); 2424 } 2425 2426 void tcp4_proc_exit(void) 2427 { 2428 unregister_pernet_subsys(&tcp4_net_ops); 2429 } 2430 #endif /* CONFIG_PROC_FS */ 2431 2432 struct proto tcp_prot = { 2433 .name = "TCP", 2434 .owner = THIS_MODULE, 2435 .close = tcp_close, 2436 .pre_connect = tcp_v4_pre_connect, 2437 .connect = tcp_v4_connect, 2438 .disconnect = tcp_disconnect, 2439 .accept = inet_csk_accept, 2440 .ioctl = tcp_ioctl, 2441 .init = tcp_v4_init_sock, 2442 .destroy = tcp_v4_destroy_sock, 2443 .shutdown = tcp_shutdown, 2444 .setsockopt = tcp_setsockopt, 2445 .getsockopt = tcp_getsockopt, 2446 .keepalive = tcp_set_keepalive, 2447 .recvmsg = tcp_recvmsg, 2448 .sendmsg = tcp_sendmsg, 2449 .sendpage = tcp_sendpage, 2450 .backlog_rcv = tcp_v4_do_rcv, 2451 .release_cb = tcp_release_cb, 2452 .hash = inet_hash, 2453 .unhash = inet_unhash, 2454 .get_port = inet_csk_get_port, 2455 .enter_memory_pressure = tcp_enter_memory_pressure, 2456 .leave_memory_pressure = tcp_leave_memory_pressure, 2457 .stream_memory_free = tcp_stream_memory_free, 2458 .sockets_allocated = &tcp_sockets_allocated, 2459 .orphan_count = &tcp_orphan_count, 2460 .memory_allocated = &tcp_memory_allocated, 2461 .memory_pressure = &tcp_memory_pressure, 2462 .sysctl_mem = sysctl_tcp_mem, 2463 .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem), 2464 .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), 2465 .max_header = MAX_TCP_HEADER, 2466 .obj_size = sizeof(struct tcp_sock), 2467 .slab_flags = SLAB_TYPESAFE_BY_RCU, 2468 .twsk_prot = &tcp_timewait_sock_ops, 2469 .rsk_prot = &tcp_request_sock_ops, 2470 .h.hashinfo = &tcp_hashinfo, 2471 .no_autobind = true, 2472 #ifdef CONFIG_COMPAT 2473 .compat_setsockopt = compat_tcp_setsockopt, 2474 .compat_getsockopt = compat_tcp_getsockopt, 2475 #endif 2476 .diag_destroy = tcp_abort, 2477 }; 2478 EXPORT_SYMBOL(tcp_prot); 2479 2480 static void __net_exit tcp_sk_exit(struct net *net) 2481 { 2482 int cpu; 2483 2484 module_put(net->ipv4.tcp_congestion_control->owner); 2485 2486 for_each_possible_cpu(cpu) 2487 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu)); 2488 free_percpu(net->ipv4.tcp_sk); 2489 } 2490 2491 static int __net_init tcp_sk_init(struct net *net) 2492 { 2493 int res, cpu, cnt; 2494 2495 net->ipv4.tcp_sk = alloc_percpu(struct sock *); 2496 if (!net->ipv4.tcp_sk) 2497 return -ENOMEM; 2498 2499 for_each_possible_cpu(cpu) { 2500 struct sock *sk; 2501 2502 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW, 2503 IPPROTO_TCP, net); 2504 if (res) 2505 goto fail; 2506 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); 2507 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk; 2508 } 2509 2510 net->ipv4.sysctl_tcp_ecn = 2; 2511 net->ipv4.sysctl_tcp_ecn_fallback = 1; 2512 2513 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS; 2514 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD; 2515 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL; 2516 2517 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME; 2518 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES; 2519 net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL; 2520 2521 net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES; 2522 net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES; 2523 net->ipv4.sysctl_tcp_syncookies = 1; 2524 net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH; 2525 net->ipv4.sysctl_tcp_retries1 = TCP_RETR1; 2526 net->ipv4.sysctl_tcp_retries2 = TCP_RETR2; 2527 net->ipv4.sysctl_tcp_orphan_retries = 0; 2528 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT; 2529 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX; 2530 net->ipv4.sysctl_tcp_tw_reuse = 2; 2531 2532 cnt = tcp_hashinfo.ehash_mask + 1; 2533 net->ipv4.tcp_death_row.sysctl_max_tw_buckets = (cnt + 1) / 2; 2534 net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo; 2535 2536 net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 256); 2537 net->ipv4.sysctl_tcp_sack = 1; 2538 net->ipv4.sysctl_tcp_window_scaling = 1; 2539 net->ipv4.sysctl_tcp_timestamps = 1; 2540 net->ipv4.sysctl_tcp_early_retrans = 3; 2541 net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION; 2542 net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */ 2543 net->ipv4.sysctl_tcp_retrans_collapse = 1; 2544 net->ipv4.sysctl_tcp_max_reordering = 300; 2545 net->ipv4.sysctl_tcp_dsack = 1; 2546 net->ipv4.sysctl_tcp_app_win = 31; 2547 net->ipv4.sysctl_tcp_adv_win_scale = 1; 2548 net->ipv4.sysctl_tcp_frto = 2; 2549 net->ipv4.sysctl_tcp_moderate_rcvbuf = 1; 2550 /* This limits the percentage of the congestion window which we 2551 * will allow a single TSO frame to consume. Building TSO frames 2552 * which are too large can cause TCP streams to be bursty. 2553 */ 2554 net->ipv4.sysctl_tcp_tso_win_divisor = 3; 2555 /* Default TSQ limit of four TSO segments */ 2556 net->ipv4.sysctl_tcp_limit_output_bytes = 262144; 2557 /* rfc5961 challenge ack rate limiting */ 2558 net->ipv4.sysctl_tcp_challenge_ack_limit = 1000; 2559 net->ipv4.sysctl_tcp_min_tso_segs = 2; 2560 net->ipv4.sysctl_tcp_min_rtt_wlen = 300; 2561 net->ipv4.sysctl_tcp_autocorking = 1; 2562 net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2; 2563 net->ipv4.sysctl_tcp_pacing_ss_ratio = 200; 2564 net->ipv4.sysctl_tcp_pacing_ca_ratio = 120; 2565 if (net != &init_net) { 2566 memcpy(net->ipv4.sysctl_tcp_rmem, 2567 init_net.ipv4.sysctl_tcp_rmem, 2568 sizeof(init_net.ipv4.sysctl_tcp_rmem)); 2569 memcpy(net->ipv4.sysctl_tcp_wmem, 2570 init_net.ipv4.sysctl_tcp_wmem, 2571 sizeof(init_net.ipv4.sysctl_tcp_wmem)); 2572 } 2573 net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC; 2574 net->ipv4.sysctl_tcp_comp_sack_nr = 44; 2575 net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; 2576 spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock); 2577 net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60; 2578 atomic_set(&net->ipv4.tfo_active_disable_times, 0); 2579 2580 /* Reno is always built in */ 2581 if (!net_eq(net, &init_net) && 2582 try_module_get(init_net.ipv4.tcp_congestion_control->owner)) 2583 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control; 2584 else 2585 net->ipv4.tcp_congestion_control = &tcp_reno; 2586 2587 return 0; 2588 fail: 2589 tcp_sk_exit(net); 2590 2591 return res; 2592 } 2593 2594 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list) 2595 { 2596 struct net *net; 2597 2598 inet_twsk_purge(&tcp_hashinfo, AF_INET); 2599 2600 list_for_each_entry(net, net_exit_list, exit_list) 2601 tcp_fastopen_ctx_destroy(net); 2602 } 2603 2604 static struct pernet_operations __net_initdata tcp_sk_ops = { 2605 .init = tcp_sk_init, 2606 .exit = tcp_sk_exit, 2607 .exit_batch = tcp_sk_exit_batch, 2608 }; 2609 2610 void __init tcp_v4_init(void) 2611 { 2612 if (register_pernet_subsys(&tcp_sk_ops)) 2613 panic("Failed to create the TCP control socket.\n"); 2614 } 2615