1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * INET An implementation of the TCP/IP protocol suite for the LINUX 4 * operating system. INET is implemented using the BSD Socket 5 * interface as the means of communication with the user level. 6 * 7 * Implementation of the Transmission Control Protocol(TCP). 8 * 9 * IPv4 specific functions 10 * 11 * code split from: 12 * linux/ipv4/tcp.c 13 * linux/ipv4/tcp_input.c 14 * linux/ipv4/tcp_output.c 15 * 16 * See tcp.c for author information 17 */ 18 19 /* 20 * Changes: 21 * David S. Miller : New socket lookup architecture. 22 * This code is dedicated to John Dyson. 23 * David S. Miller : Change semantics of established hash, 24 * half is devoted to TIME_WAIT sockets 25 * and the rest go in the other half. 26 * Andi Kleen : Add support for syncookies and fixed 27 * some bugs: ip options weren't passed to 28 * the TCP layer, missed a check for an 29 * ACK bit. 30 * Andi Kleen : Implemented fast path mtu discovery. 31 * Fixed many serious bugs in the 32 * request_sock handling and moved 33 * most of it into the af independent code. 34 * Added tail drop and some other bugfixes. 35 * Added new listen semantics. 36 * Mike McLagan : Routing by source 37 * Juan Jose Ciarlante: ip_dynaddr bits 38 * Andi Kleen: various fixes. 39 * Vitaly E. Lavrov : Transparent proxy revived after year 40 * coma. 41 * Andi Kleen : Fix new listen. 42 * Andi Kleen : Fix accept error reporting. 43 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which 44 * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind 45 * a single port at the same time. 46 */ 47 48 #define pr_fmt(fmt) "TCP: " fmt 49 50 #include <linux/bottom_half.h> 51 #include <linux/types.h> 52 #include <linux/fcntl.h> 53 #include <linux/module.h> 54 #include <linux/random.h> 55 #include <linux/cache.h> 56 #include <linux/jhash.h> 57 #include <linux/init.h> 58 #include <linux/times.h> 59 #include <linux/slab.h> 60 61 #include <net/net_namespace.h> 62 #include <net/icmp.h> 63 #include <net/inet_hashtables.h> 64 #include <net/tcp.h> 65 #include <net/transp_v6.h> 66 #include <net/ipv6.h> 67 #include <net/inet_common.h> 68 #include <net/timewait_sock.h> 69 #include <net/xfrm.h> 70 #include <net/secure_seq.h> 71 #include <net/busy_poll.h> 72 73 #include <linux/inet.h> 74 #include <linux/ipv6.h> 75 #include <linux/stddef.h> 76 #include <linux/proc_fs.h> 77 #include <linux/seq_file.h> 78 #include <linux/inetdevice.h> 79 80 #include <crypto/hash.h> 81 #include <linux/scatterlist.h> 82 83 #include <trace/events/tcp.h> 84 85 #ifdef CONFIG_TCP_MD5SIG 86 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, 87 __be32 daddr, __be32 saddr, const struct tcphdr *th); 88 #endif 89 90 struct inet_hashinfo tcp_hashinfo; 91 EXPORT_SYMBOL(tcp_hashinfo); 92 93 static u32 tcp_v4_init_seq(const struct sk_buff *skb) 94 { 95 return secure_tcp_seq(ip_hdr(skb)->daddr, 96 ip_hdr(skb)->saddr, 97 tcp_hdr(skb)->dest, 98 tcp_hdr(skb)->source); 99 } 100 101 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb) 102 { 103 return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr); 104 } 105 106 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) 107 { 108 const struct inet_timewait_sock *tw = inet_twsk(sktw); 109 const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw); 110 struct tcp_sock *tp = tcp_sk(sk); 111 int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse; 112 113 if (reuse == 2) { 114 /* Still does not detect *everything* that goes 
through 115 * lo, since we require a loopback src or dst address 116 * or direct binding to 'lo' interface. 117 */ 118 bool loopback = false; 119 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX) 120 loopback = true; 121 #if IS_ENABLED(CONFIG_IPV6) 122 if (tw->tw_family == AF_INET6) { 123 if (ipv6_addr_loopback(&tw->tw_v6_daddr) || 124 ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) || 125 ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) || 126 ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr)) 127 loopback = true; 128 } else 129 #endif 130 { 131 if (ipv4_is_loopback(tw->tw_daddr) || 132 ipv4_is_loopback(tw->tw_rcv_saddr)) 133 loopback = true; 134 } 135 if (!loopback) 136 reuse = 0; 137 } 138 139 /* With PAWS, it is safe from the viewpoint 140 of data integrity. Even without PAWS it is safe provided sequence 141 spaces do not overlap i.e. at data rates <= 80Mbit/sec. 142 143 Actually, the idea is close to VJ's one, only timestamp cache is 144 held not per host, but per port pair and TW bucket is used as state 145 holder. 146 147 If TW bucket has been already destroyed we fall back to VJ's scheme 148 and use initial timestamp retrieved from peer table. 149 */ 150 if (tcptw->tw_ts_recent_stamp && 151 (!twp || (reuse && time_after32(ktime_get_seconds(), 152 tcptw->tw_ts_recent_stamp)))) { 153 /* In case of repair and re-using TIME-WAIT sockets we still 154 * want to be sure that it is safe as above but honor the 155 * sequence numbers and time stamps set as part of the repair 156 * process. 157 * 158 * Without this check re-using a TIME-WAIT socket with TCP 159 * repair would accumulate a -1 on the repair assigned 160 * sequence number. The first time it is reused the sequence 161 * is -1, the second time -2, etc. This fixes that issue 162 * without appearing to create any others. 163 */ 164 if (likely(!tp->repair)) { 165 u32 seq = tcptw->tw_snd_nxt + 65535 + 2; 166 167 if (!seq) 168 seq = 1; 169 WRITE_ONCE(tp->write_seq, seq); 170 tp->rx_opt.ts_recent = tcptw->tw_ts_recent; 171 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp; 172 } 173 sock_hold(sktw); 174 return 1; 175 } 176 177 return 0; 178 } 179 EXPORT_SYMBOL_GPL(tcp_twsk_unique); 180 181 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr, 182 int addr_len) 183 { 184 /* This check is replicated from tcp_v4_connect() and intended to 185 * prevent BPF program called below from accessing bytes that are out 186 * of the bound specified by user in addr_len. 187 */ 188 if (addr_len < sizeof(struct sockaddr_in)) 189 return -EINVAL; 190 191 sock_owned_by_me(sk); 192 193 return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr); 194 } 195 196 /* This will initiate an outgoing connection. 
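 * The socket lock is held by the caller; the local port may still be
 * zero at this point and is only chosen by inet_hash_connect() below.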
*/ 197 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) 198 { 199 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr; 200 struct inet_sock *inet = inet_sk(sk); 201 struct tcp_sock *tp = tcp_sk(sk); 202 __be16 orig_sport, orig_dport; 203 __be32 daddr, nexthop; 204 struct flowi4 *fl4; 205 struct rtable *rt; 206 int err; 207 struct ip_options_rcu *inet_opt; 208 struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row; 209 210 if (addr_len < sizeof(struct sockaddr_in)) 211 return -EINVAL; 212 213 if (usin->sin_family != AF_INET) 214 return -EAFNOSUPPORT; 215 216 nexthop = daddr = usin->sin_addr.s_addr; 217 inet_opt = rcu_dereference_protected(inet->inet_opt, 218 lockdep_sock_is_held(sk)); 219 if (inet_opt && inet_opt->opt.srr) { 220 if (!daddr) 221 return -EINVAL; 222 nexthop = inet_opt->opt.faddr; 223 } 224 225 orig_sport = inet->inet_sport; 226 orig_dport = usin->sin_port; 227 fl4 = &inet->cork.fl.u.ip4; 228 rt = ip_route_connect(fl4, nexthop, inet->inet_saddr, 229 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if, 230 IPPROTO_TCP, 231 orig_sport, orig_dport, sk); 232 if (IS_ERR(rt)) { 233 err = PTR_ERR(rt); 234 if (err == -ENETUNREACH) 235 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES); 236 return err; 237 } 238 239 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) { 240 ip_rt_put(rt); 241 return -ENETUNREACH; 242 } 243 244 if (!inet_opt || !inet_opt->opt.srr) 245 daddr = fl4->daddr; 246 247 if (!inet->inet_saddr) 248 inet->inet_saddr = fl4->saddr; 249 sk_rcv_saddr_set(sk, inet->inet_saddr); 250 251 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) { 252 /* Reset inherited state */ 253 tp->rx_opt.ts_recent = 0; 254 tp->rx_opt.ts_recent_stamp = 0; 255 if (likely(!tp->repair)) 256 WRITE_ONCE(tp->write_seq, 0); 257 } 258 259 inet->inet_dport = usin->sin_port; 260 sk_daddr_set(sk, daddr); 261 262 inet_csk(sk)->icsk_ext_hdr_len = 0; 263 if (inet_opt) 264 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen; 265 266 tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT; 267 268 /* Socket identity is still unknown (sport may be zero). 269 * However we set state to SYN-SENT and not releasing socket 270 * lock select source port, enter ourselves into the hash tables and 271 * complete initialization after this. 272 */ 273 tcp_set_state(sk, TCP_SYN_SENT); 274 err = inet_hash_connect(tcp_death_row, sk); 275 if (err) 276 goto failure; 277 278 sk_set_txhash(sk); 279 280 rt = ip_route_newports(fl4, rt, orig_sport, orig_dport, 281 inet->inet_sport, inet->inet_dport, sk); 282 if (IS_ERR(rt)) { 283 err = PTR_ERR(rt); 284 rt = NULL; 285 goto failure; 286 } 287 /* OK, now commit destination to socket. */ 288 sk->sk_gso_type = SKB_GSO_TCPV4; 289 sk_setup_caps(sk, &rt->dst); 290 rt = NULL; 291 292 if (likely(!tp->repair)) { 293 if (!tp->write_seq) 294 WRITE_ONCE(tp->write_seq, 295 secure_tcp_seq(inet->inet_saddr, 296 inet->inet_daddr, 297 inet->inet_sport, 298 usin->sin_port)); 299 tp->tsoffset = secure_tcp_ts_off(sock_net(sk), 300 inet->inet_saddr, 301 inet->inet_daddr); 302 } 303 304 inet->inet_id = prandom_u32(); 305 306 if (tcp_fastopen_defer_connect(sk, &err)) 307 return err; 308 if (err) 309 goto failure; 310 311 err = tcp_connect(sk); 312 313 if (err) 314 goto failure; 315 316 return 0; 317 318 failure: 319 /* 320 * This unhashes the socket and releases the local port, 321 * if necessary. 
	 */
	tcp_set_state(sk, TCP_CLOSE);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->inet_dport = 0;
	return err;
}
EXPORT_SYMBOL(tcp_v4_connect);

/*
 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 * It can be called through tcp_release_cb() if the socket was owned by the user
 * at the time tcp_v4_err() was called to handle the ICMP message.
 */
void tcp_v4_mtu_reduced(struct sock *sk)
{
	struct inet_sock *inet = inet_sk(sk);
	struct dst_entry *dst;
	u32 mtu;

	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
		return;
	mtu = tcp_sk(sk)->mtu_info;
	dst = inet_csk_update_pmtu(sk, mtu);
	if (!dst)
		return;

	/* Something is about to go wrong... Remember the soft error
	 * in case this connection will not be able to recover.
	 */
	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
		sk->sk_err_soft = EMSGSIZE;

	mtu = dst_mtu(dst);

	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    ip_sk_accept_pmtu(sk) &&
	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
		tcp_sync_mss(sk, mtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}
EXPORT_SYMBOL(tcp_v4_mtu_reduced);

static void do_redirect(struct sk_buff *skb, struct sock *sk)
{
	struct dst_entry *dst = __sk_dst_check(sk, 0);

	if (dst)
		dst->ops->redirect(dst, sk, skb);
}


/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
void tcp_req_err(struct sock *sk, u32 seq, bool abort)
{
	struct request_sock *req = inet_reqsk(sk);
	struct net *net = sock_net(sk);

	/* ICMPs are not backlogged, hence we cannot get
	 * an established socket here.
	 */
	if (seq != tcp_rsk(req)->snt_isn) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
	} else if (abort) {
		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
		tcp_listendrop(req->rsk_listener);
	}
	reqsk_put(req);
}
EXPORT_SYMBOL(tcp_req_err);

/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition. If err < 0 then the socket should
 * be closed and the error returned to the user. If err > 0
 * it's just the icmp type << 8 | icmp code. After adjustment
 * header points to the first 8 bytes of the tcp header. We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 *
 */
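/* Purely illustrative sketch (this helper is hypothetical and not used
 * anywhere in this file): for the one ICMP type/code pair handled as PMTU
 * discovery below, "info" carries the next-hop MTU reported by the router;
 * for every other type it is not an MTU and must not be used as one.
 * A plain "port unreachable", for instance, is instead mapped through
 * icmp_err_convert[] to ECONNREFUSED.
 */
static inline u32 tcp_v4_example_mtu_hint(int type, int code, u32 info)
{
	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
		return info;	/* next-hop MTU from the ICMP payload */
	return 0;		/* no MTU information in this message */
}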

int tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
{
	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
	struct inet_connection_sock *icsk;
	struct tcp_sock *tp;
	struct inet_sock *inet;
	const int type = icmp_hdr(icmp_skb)->type;
	const int code = icmp_hdr(icmp_skb)->code;
	struct sock *sk;
	struct sk_buff *skb;
	struct request_sock *fastopen;
	u32 seq, snd_una;
	s32 remaining;
	u32 delta_us;
	int err;
	struct net *net = dev_net(icmp_skb->dev);

	sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
				       th->dest, iph->saddr, ntohs(th->source),
				       inet_iif(icmp_skb), 0);
	if (!sk) {
		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
		return -ENOENT;
	}
	if (sk->sk_state == TCP_TIME_WAIT) {
		inet_twsk_put(inet_twsk(sk));
		return 0;
	}
	seq = ntohl(th->seq);
	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
				     type == ICMP_TIME_EXCEEDED ||
				     (type == ICMP_DEST_UNREACH &&
				      (code == ICMP_NET_UNREACH ||
				       code == ICMP_HOST_UNREACH)));
		return 0;
	}

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 * We do take care of the PMTU discovery (RFC1191) special case:
	 * we can receive locally generated ICMP messages while the socket is held.
	 */
	if (sock_owned_by_user(sk)) {
		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
	}
	if (sk->sk_state == TCP_CLOSE)
		goto out;

	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
		goto out;
	}

	icsk = inet_csk(sk);
	tp = tcp_sk(sk);
	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
	fastopen = rcu_dereference(tp->fastopen_rsk);
	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, snd_una, tp->snd_nxt)) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	switch (type) {
	case ICMP_REDIRECT:
		if (!sock_owned_by_user(sk))
			do_redirect(icmp_skb, sk);
		goto out;
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			/* We are not interested in TCP_LISTEN and open_requests
			 * (SYN-ACKs sent out by Linux are always < 576 bytes, so
			 * they should go through unfragmented).
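			 * (576 bytes is the minimum datagram size every IPv4
			 * host must be able to accept, per RFC 791.)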
509 */ 510 if (sk->sk_state == TCP_LISTEN) 511 goto out; 512 513 tp->mtu_info = info; 514 if (!sock_owned_by_user(sk)) { 515 tcp_v4_mtu_reduced(sk); 516 } else { 517 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags)) 518 sock_hold(sk); 519 } 520 goto out; 521 } 522 523 err = icmp_err_convert[code].errno; 524 /* check if icmp_skb allows revert of backoff 525 * (see draft-zimmermann-tcp-lcd) */ 526 if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH) 527 break; 528 if (seq != tp->snd_una || !icsk->icsk_retransmits || 529 !icsk->icsk_backoff || fastopen) 530 break; 531 532 if (sock_owned_by_user(sk)) 533 break; 534 535 skb = tcp_rtx_queue_head(sk); 536 if (WARN_ON_ONCE(!skb)) 537 break; 538 539 icsk->icsk_backoff--; 540 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : 541 TCP_TIMEOUT_INIT; 542 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX); 543 544 545 tcp_mstamp_refresh(tp); 546 delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb)); 547 remaining = icsk->icsk_rto - 548 usecs_to_jiffies(delta_us); 549 550 if (remaining > 0) { 551 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, 552 remaining, TCP_RTO_MAX); 553 } else { 554 /* RTO revert clocked out retransmission. 555 * Will retransmit now */ 556 tcp_retransmit_timer(sk); 557 } 558 559 break; 560 case ICMP_TIME_EXCEEDED: 561 err = EHOSTUNREACH; 562 break; 563 default: 564 goto out; 565 } 566 567 switch (sk->sk_state) { 568 case TCP_SYN_SENT: 569 case TCP_SYN_RECV: 570 /* Only in fast or simultaneous open. If a fast open socket is 571 * is already accepted it is treated as a connected one below. 572 */ 573 if (fastopen && !fastopen->sk) 574 break; 575 576 if (!sock_owned_by_user(sk)) { 577 sk->sk_err = err; 578 579 sk->sk_error_report(sk); 580 581 tcp_done(sk); 582 } else { 583 sk->sk_err_soft = err; 584 } 585 goto out; 586 } 587 588 /* If we've already connected we will keep trying 589 * until we time out, or the user gives up. 590 * 591 * rfc1122 4.2.3.9 allows to consider as hard errors 592 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too, 593 * but it is obsoleted by pmtu discovery). 594 * 595 * Note, that in modern internet, where routing is unreliable 596 * and in each dark corner broken firewalls sit, sending random 597 * errors ordered by their masters even this two messages finally lose 598 * their original sense (even Linux sends invalid PORT_UNREACHs) 599 * 600 * Now we are in compliance with RFCs. 601 * --ANK (980905) 602 */ 603 604 inet = inet_sk(sk); 605 if (!sock_owned_by_user(sk) && inet->recverr) { 606 sk->sk_err = err; 607 sk->sk_error_report(sk); 608 } else { /* Only an error on timeout */ 609 sk->sk_err_soft = err; 610 } 611 612 out: 613 bh_unlock_sock(sk); 614 sock_put(sk); 615 return 0; 616 } 617 618 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr) 619 { 620 struct tcphdr *th = tcp_hdr(skb); 621 622 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0); 623 skb->csum_start = skb_transport_header(skb) - skb->head; 624 skb->csum_offset = offsetof(struct tcphdr, check); 625 } 626 627 /* This routine computes an IPv4 TCP checksum. */ 628 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb) 629 { 630 const struct inet_sock *inet = inet_sk(sk); 631 632 __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr); 633 } 634 EXPORT_SYMBOL(tcp_v4_send_check); 635 636 /* 637 * This routine will send an RST to the other tcp. 638 * 639 * Someone asks: why I NEVER use socket parameters (TOS, TTL etc.) 640 * for reset. 
641 * Answer: if a packet caused RST, it is not for a socket 642 * existing in our system, if it is matched to a socket, 643 * it is just duplicate segment or bug in other side's TCP. 644 * So that we build reply only basing on parameters 645 * arrived with segment. 646 * Exception: precedence violation. We do not implement it in any case. 647 */ 648 649 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) 650 { 651 const struct tcphdr *th = tcp_hdr(skb); 652 struct { 653 struct tcphdr th; 654 #ifdef CONFIG_TCP_MD5SIG 655 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)]; 656 #endif 657 } rep; 658 struct ip_reply_arg arg; 659 #ifdef CONFIG_TCP_MD5SIG 660 struct tcp_md5sig_key *key = NULL; 661 const __u8 *hash_location = NULL; 662 unsigned char newhash[16]; 663 int genhash; 664 struct sock *sk1 = NULL; 665 #endif 666 u64 transmit_time = 0; 667 struct sock *ctl_sk; 668 struct net *net; 669 670 /* Never send a reset in response to a reset. */ 671 if (th->rst) 672 return; 673 674 /* If sk not NULL, it means we did a successful lookup and incoming 675 * route had to be correct. prequeue might have dropped our dst. 676 */ 677 if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL) 678 return; 679 680 /* Swap the send and the receive. */ 681 memset(&rep, 0, sizeof(rep)); 682 rep.th.dest = th->source; 683 rep.th.source = th->dest; 684 rep.th.doff = sizeof(struct tcphdr) / 4; 685 rep.th.rst = 1; 686 687 if (th->ack) { 688 rep.th.seq = th->ack_seq; 689 } else { 690 rep.th.ack = 1; 691 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin + 692 skb->len - (th->doff << 2)); 693 } 694 695 memset(&arg, 0, sizeof(arg)); 696 arg.iov[0].iov_base = (unsigned char *)&rep; 697 arg.iov[0].iov_len = sizeof(rep.th); 698 699 net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev); 700 #ifdef CONFIG_TCP_MD5SIG 701 rcu_read_lock(); 702 hash_location = tcp_parse_md5sig_option(th); 703 if (sk && sk_fullsock(sk)) { 704 key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *) 705 &ip_hdr(skb)->saddr, AF_INET); 706 } else if (hash_location) { 707 /* 708 * active side is lost. Try to find listening socket through 709 * source port, and then find md5 key through listening socket. 710 * we are not loose security here: 711 * Incoming packet is checked with md5 hash with finding key, 712 * no RST generated if md5 hash doesn't match. 713 */ 714 sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0, 715 ip_hdr(skb)->saddr, 716 th->source, ip_hdr(skb)->daddr, 717 ntohs(th->source), inet_iif(skb), 718 tcp_v4_sdif(skb)); 719 /* don't send rst if it can't find key */ 720 if (!sk1) 721 goto out; 722 723 key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *) 724 &ip_hdr(skb)->saddr, AF_INET); 725 if (!key) 726 goto out; 727 728 729 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb); 730 if (genhash || memcmp(hash_location, newhash, 16) != 0) 731 goto out; 732 733 } 734 735 if (key) { 736 rep.opt[0] = htonl((TCPOPT_NOP << 24) | 737 (TCPOPT_NOP << 16) | 738 (TCPOPT_MD5SIG << 8) | 739 TCPOLEN_MD5SIG); 740 /* Update length and the length the header thinks exists */ 741 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED; 742 rep.th.doff = arg.iov[0].iov_len / 4; 743 744 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1], 745 key, ip_hdr(skb)->saddr, 746 ip_hdr(skb)->daddr, &rep.th); 747 } 748 #endif 749 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr, 750 ip_hdr(skb)->saddr, /* XXX */ 751 arg.iov[0].iov_len, IPPROTO_TCP, 0); 752 arg.csumoffset = offsetof(struct tcphdr, check) / 2; 753 arg.flags = (sk && inet_sk_transparent(sk)) ? 
IP_REPLY_ARG_NOSRCCHECK : 0; 754 755 /* When socket is gone, all binding information is lost. 756 * routing might fail in this case. No choice here, if we choose to force 757 * input interface, we will misroute in case of asymmetric route. 758 */ 759 if (sk) { 760 arg.bound_dev_if = sk->sk_bound_dev_if; 761 if (sk_fullsock(sk)) 762 trace_tcp_send_reset(sk, skb); 763 } 764 765 BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) != 766 offsetof(struct inet_timewait_sock, tw_bound_dev_if)); 767 768 arg.tos = ip_hdr(skb)->tos; 769 arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL); 770 local_bh_disable(); 771 ctl_sk = this_cpu_read(*net->ipv4.tcp_sk); 772 if (sk) { 773 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? 774 inet_twsk(sk)->tw_mark : sk->sk_mark; 775 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ? 776 inet_twsk(sk)->tw_priority : sk->sk_priority; 777 transmit_time = tcp_transmit_time(sk); 778 } 779 ip_send_unicast_reply(ctl_sk, 780 skb, &TCP_SKB_CB(skb)->header.h4.opt, 781 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, 782 &arg, arg.iov[0].iov_len, 783 transmit_time); 784 785 ctl_sk->sk_mark = 0; 786 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS); 787 __TCP_INC_STATS(net, TCP_MIB_OUTRSTS); 788 local_bh_enable(); 789 790 #ifdef CONFIG_TCP_MD5SIG 791 out: 792 rcu_read_unlock(); 793 #endif 794 } 795 796 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states 797 outside socket context is ugly, certainly. What can I do? 798 */ 799 800 static void tcp_v4_send_ack(const struct sock *sk, 801 struct sk_buff *skb, u32 seq, u32 ack, 802 u32 win, u32 tsval, u32 tsecr, int oif, 803 struct tcp_md5sig_key *key, 804 int reply_flags, u8 tos) 805 { 806 const struct tcphdr *th = tcp_hdr(skb); 807 struct { 808 struct tcphdr th; 809 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2) 810 #ifdef CONFIG_TCP_MD5SIG 811 + (TCPOLEN_MD5SIG_ALIGNED >> 2) 812 #endif 813 ]; 814 } rep; 815 struct net *net = sock_net(sk); 816 struct ip_reply_arg arg; 817 struct sock *ctl_sk; 818 u64 transmit_time; 819 820 memset(&rep.th, 0, sizeof(struct tcphdr)); 821 memset(&arg, 0, sizeof(arg)); 822 823 arg.iov[0].iov_base = (unsigned char *)&rep; 824 arg.iov[0].iov_len = sizeof(rep.th); 825 if (tsecr) { 826 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | 827 (TCPOPT_TIMESTAMP << 8) | 828 TCPOLEN_TIMESTAMP); 829 rep.opt[1] = htonl(tsval); 830 rep.opt[2] = htonl(tsecr); 831 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED; 832 } 833 834 /* Swap the send and the receive. */ 835 rep.th.dest = th->source; 836 rep.th.source = th->dest; 837 rep.th.doff = arg.iov[0].iov_len / 4; 838 rep.th.seq = htonl(seq); 839 rep.th.ack_seq = htonl(ack); 840 rep.th.ack = 1; 841 rep.th.window = htons(win); 842 843 #ifdef CONFIG_TCP_MD5SIG 844 if (key) { 845 int offset = (tsecr) ? 3 : 0; 846 847 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) | 848 (TCPOPT_NOP << 16) | 849 (TCPOPT_MD5SIG << 8) | 850 TCPOLEN_MD5SIG); 851 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED; 852 rep.th.doff = arg.iov[0].iov_len/4; 853 854 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset], 855 key, ip_hdr(skb)->saddr, 856 ip_hdr(skb)->daddr, &rep.th); 857 } 858 #endif 859 arg.flags = reply_flags; 860 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr, 861 ip_hdr(skb)->saddr, /* XXX */ 862 arg.iov[0].iov_len, IPPROTO_TCP, 0); 863 arg.csumoffset = offsetof(struct tcphdr, check) / 2; 864 if (oif) 865 arg.bound_dev_if = oif; 866 arg.tos = tos; 867 arg.uid = sock_net_uid(net, sk_fullsock(sk) ? 
sk : NULL); 868 local_bh_disable(); 869 ctl_sk = this_cpu_read(*net->ipv4.tcp_sk); 870 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? 871 inet_twsk(sk)->tw_mark : sk->sk_mark; 872 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ? 873 inet_twsk(sk)->tw_priority : sk->sk_priority; 874 transmit_time = tcp_transmit_time(sk); 875 ip_send_unicast_reply(ctl_sk, 876 skb, &TCP_SKB_CB(skb)->header.h4.opt, 877 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, 878 &arg, arg.iov[0].iov_len, 879 transmit_time); 880 881 ctl_sk->sk_mark = 0; 882 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS); 883 local_bh_enable(); 884 } 885 886 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) 887 { 888 struct inet_timewait_sock *tw = inet_twsk(sk); 889 struct tcp_timewait_sock *tcptw = tcp_twsk(sk); 890 891 tcp_v4_send_ack(sk, skb, 892 tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, 893 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, 894 tcp_time_stamp_raw() + tcptw->tw_ts_offset, 895 tcptw->tw_ts_recent, 896 tw->tw_bound_dev_if, 897 tcp_twsk_md5_key(tcptw), 898 tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0, 899 tw->tw_tos 900 ); 901 902 inet_twsk_put(tw); 903 } 904 905 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, 906 struct request_sock *req) 907 { 908 /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV 909 * sk->sk_state == TCP_SYN_RECV -> for Fast Open. 910 */ 911 u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 : 912 tcp_sk(sk)->snd_nxt; 913 914 /* RFC 7323 2.3 915 * The window field (SEG.WND) of every outgoing segment, with the 916 * exception of <SYN> segments, MUST be right-shifted by 917 * Rcv.Wind.Shift bits: 918 */ 919 tcp_v4_send_ack(sk, skb, seq, 920 tcp_rsk(req)->rcv_nxt, 921 req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale, 922 tcp_time_stamp_raw() + tcp_rsk(req)->ts_off, 923 req->ts_recent, 924 0, 925 tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->saddr, 926 AF_INET), 927 inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0, 928 ip_hdr(skb)->tos); 929 } 930 931 /* 932 * Send a SYN-ACK after having received a SYN. 933 * This still operates on a request_sock only, not on a big 934 * socket. 935 */ 936 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, 937 struct flowi *fl, 938 struct request_sock *req, 939 struct tcp_fastopen_cookie *foc, 940 enum tcp_synack_type synack_type) 941 { 942 const struct inet_request_sock *ireq = inet_rsk(req); 943 struct flowi4 fl4; 944 int err = -1; 945 struct sk_buff *skb; 946 947 /* First, grab a route. */ 948 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL) 949 return -1; 950 951 skb = tcp_make_synack(sk, dst, req, foc, synack_type); 952 953 if (skb) { 954 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr); 955 956 rcu_read_lock(); 957 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr, 958 ireq->ir_rmt_addr, 959 rcu_dereference(ireq->ireq_opt)); 960 rcu_read_unlock(); 961 err = net_xmit_eval(err); 962 } 963 964 return err; 965 } 966 967 /* 968 * IPv4 request_sock destructor. 969 */ 970 static void tcp_v4_reqsk_destructor(struct request_sock *req) 971 { 972 kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1)); 973 } 974 975 #ifdef CONFIG_TCP_MD5SIG 976 /* 977 * RFC2385 MD5 checksumming requires a mapping of 978 * IP address->MD5 Key. 979 * We need to maintain these in the sk structure. 980 */ 981 982 DEFINE_STATIC_KEY_FALSE(tcp_md5_needed); 983 EXPORT_SYMBOL(tcp_md5_needed); 984 985 /* Find the Key structure for an address. 
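 * A key may cover an address range when it was installed with
 * TCP_MD5SIG_EXT and a prefix length; if several keys match the same
 * peer, the longest prefix wins (a /32 key overrides a /24 one).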
*/ 986 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, 987 const union tcp_md5_addr *addr, 988 int family) 989 { 990 const struct tcp_sock *tp = tcp_sk(sk); 991 struct tcp_md5sig_key *key; 992 const struct tcp_md5sig_info *md5sig; 993 __be32 mask; 994 struct tcp_md5sig_key *best_match = NULL; 995 bool match; 996 997 /* caller either holds rcu_read_lock() or socket lock */ 998 md5sig = rcu_dereference_check(tp->md5sig_info, 999 lockdep_sock_is_held(sk)); 1000 if (!md5sig) 1001 return NULL; 1002 1003 hlist_for_each_entry_rcu(key, &md5sig->head, node) { 1004 if (key->family != family) 1005 continue; 1006 1007 if (family == AF_INET) { 1008 mask = inet_make_mask(key->prefixlen); 1009 match = (key->addr.a4.s_addr & mask) == 1010 (addr->a4.s_addr & mask); 1011 #if IS_ENABLED(CONFIG_IPV6) 1012 } else if (family == AF_INET6) { 1013 match = ipv6_prefix_equal(&key->addr.a6, &addr->a6, 1014 key->prefixlen); 1015 #endif 1016 } else { 1017 match = false; 1018 } 1019 1020 if (match && (!best_match || 1021 key->prefixlen > best_match->prefixlen)) 1022 best_match = key; 1023 } 1024 return best_match; 1025 } 1026 EXPORT_SYMBOL(__tcp_md5_do_lookup); 1027 1028 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk, 1029 const union tcp_md5_addr *addr, 1030 int family, u8 prefixlen) 1031 { 1032 const struct tcp_sock *tp = tcp_sk(sk); 1033 struct tcp_md5sig_key *key; 1034 unsigned int size = sizeof(struct in_addr); 1035 const struct tcp_md5sig_info *md5sig; 1036 1037 /* caller either holds rcu_read_lock() or socket lock */ 1038 md5sig = rcu_dereference_check(tp->md5sig_info, 1039 lockdep_sock_is_held(sk)); 1040 if (!md5sig) 1041 return NULL; 1042 #if IS_ENABLED(CONFIG_IPV6) 1043 if (family == AF_INET6) 1044 size = sizeof(struct in6_addr); 1045 #endif 1046 hlist_for_each_entry_rcu(key, &md5sig->head, node) { 1047 if (key->family != family) 1048 continue; 1049 if (!memcmp(&key->addr, addr, size) && 1050 key->prefixlen == prefixlen) 1051 return key; 1052 } 1053 return NULL; 1054 } 1055 1056 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk, 1057 const struct sock *addr_sk) 1058 { 1059 const union tcp_md5_addr *addr; 1060 1061 addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr; 1062 return tcp_md5_do_lookup(sk, addr, AF_INET); 1063 } 1064 EXPORT_SYMBOL(tcp_v4_md5_lookup); 1065 1066 /* This can be called on a newly created socket, from other files */ 1067 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, 1068 int family, u8 prefixlen, const u8 *newkey, u8 newkeylen, 1069 gfp_t gfp) 1070 { 1071 /* Add Key to the list */ 1072 struct tcp_md5sig_key *key; 1073 struct tcp_sock *tp = tcp_sk(sk); 1074 struct tcp_md5sig_info *md5sig; 1075 1076 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen); 1077 if (key) { 1078 /* Pre-existing entry - just update that one. 
*/ 1079 memcpy(key->key, newkey, newkeylen); 1080 key->keylen = newkeylen; 1081 return 0; 1082 } 1083 1084 md5sig = rcu_dereference_protected(tp->md5sig_info, 1085 lockdep_sock_is_held(sk)); 1086 if (!md5sig) { 1087 md5sig = kmalloc(sizeof(*md5sig), gfp); 1088 if (!md5sig) 1089 return -ENOMEM; 1090 1091 sk_nocaps_add(sk, NETIF_F_GSO_MASK); 1092 INIT_HLIST_HEAD(&md5sig->head); 1093 rcu_assign_pointer(tp->md5sig_info, md5sig); 1094 } 1095 1096 key = sock_kmalloc(sk, sizeof(*key), gfp); 1097 if (!key) 1098 return -ENOMEM; 1099 if (!tcp_alloc_md5sig_pool()) { 1100 sock_kfree_s(sk, key, sizeof(*key)); 1101 return -ENOMEM; 1102 } 1103 1104 memcpy(key->key, newkey, newkeylen); 1105 key->keylen = newkeylen; 1106 key->family = family; 1107 key->prefixlen = prefixlen; 1108 memcpy(&key->addr, addr, 1109 (family == AF_INET6) ? sizeof(struct in6_addr) : 1110 sizeof(struct in_addr)); 1111 hlist_add_head_rcu(&key->node, &md5sig->head); 1112 return 0; 1113 } 1114 EXPORT_SYMBOL(tcp_md5_do_add); 1115 1116 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family, 1117 u8 prefixlen) 1118 { 1119 struct tcp_md5sig_key *key; 1120 1121 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen); 1122 if (!key) 1123 return -ENOENT; 1124 hlist_del_rcu(&key->node); 1125 atomic_sub(sizeof(*key), &sk->sk_omem_alloc); 1126 kfree_rcu(key, rcu); 1127 return 0; 1128 } 1129 EXPORT_SYMBOL(tcp_md5_do_del); 1130 1131 static void tcp_clear_md5_list(struct sock *sk) 1132 { 1133 struct tcp_sock *tp = tcp_sk(sk); 1134 struct tcp_md5sig_key *key; 1135 struct hlist_node *n; 1136 struct tcp_md5sig_info *md5sig; 1137 1138 md5sig = rcu_dereference_protected(tp->md5sig_info, 1); 1139 1140 hlist_for_each_entry_safe(key, n, &md5sig->head, node) { 1141 hlist_del_rcu(&key->node); 1142 atomic_sub(sizeof(*key), &sk->sk_omem_alloc); 1143 kfree_rcu(key, rcu); 1144 } 1145 } 1146 1147 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname, 1148 char __user *optval, int optlen) 1149 { 1150 struct tcp_md5sig cmd; 1151 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr; 1152 u8 prefixlen = 32; 1153 1154 if (optlen < sizeof(cmd)) 1155 return -EINVAL; 1156 1157 if (copy_from_user(&cmd, optval, sizeof(cmd))) 1158 return -EFAULT; 1159 1160 if (sin->sin_family != AF_INET) 1161 return -EINVAL; 1162 1163 if (optname == TCP_MD5SIG_EXT && 1164 cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) { 1165 prefixlen = cmd.tcpm_prefixlen; 1166 if (prefixlen > 32) 1167 return -EINVAL; 1168 } 1169 1170 if (!cmd.tcpm_keylen) 1171 return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr, 1172 AF_INET, prefixlen); 1173 1174 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN) 1175 return -EINVAL; 1176 1177 return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr, 1178 AF_INET, prefixlen, cmd.tcpm_key, cmd.tcpm_keylen, 1179 GFP_KERNEL); 1180 } 1181 1182 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp, 1183 __be32 daddr, __be32 saddr, 1184 const struct tcphdr *th, int nbytes) 1185 { 1186 struct tcp4_pseudohdr *bp; 1187 struct scatterlist sg; 1188 struct tcphdr *_th; 1189 1190 bp = hp->scratch; 1191 bp->saddr = saddr; 1192 bp->daddr = daddr; 1193 bp->pad = 0; 1194 bp->protocol = IPPROTO_TCP; 1195 bp->len = cpu_to_be16(nbytes); 1196 1197 _th = (struct tcphdr *)(bp + 1); 1198 memcpy(_th, th, sizeof(*th)); 1199 _th->check = 0; 1200 1201 sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th)); 1202 ahash_request_set_crypt(hp->md5_req, &sg, NULL, 1203 sizeof(*bp) + sizeof(*th)); 1204 return crypto_ahash_update(hp->md5_req); 
1205 } 1206 1207 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, 1208 __be32 daddr, __be32 saddr, const struct tcphdr *th) 1209 { 1210 struct tcp_md5sig_pool *hp; 1211 struct ahash_request *req; 1212 1213 hp = tcp_get_md5sig_pool(); 1214 if (!hp) 1215 goto clear_hash_noput; 1216 req = hp->md5_req; 1217 1218 if (crypto_ahash_init(req)) 1219 goto clear_hash; 1220 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2)) 1221 goto clear_hash; 1222 if (tcp_md5_hash_key(hp, key)) 1223 goto clear_hash; 1224 ahash_request_set_crypt(req, NULL, md5_hash, 0); 1225 if (crypto_ahash_final(req)) 1226 goto clear_hash; 1227 1228 tcp_put_md5sig_pool(); 1229 return 0; 1230 1231 clear_hash: 1232 tcp_put_md5sig_pool(); 1233 clear_hash_noput: 1234 memset(md5_hash, 0, 16); 1235 return 1; 1236 } 1237 1238 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key, 1239 const struct sock *sk, 1240 const struct sk_buff *skb) 1241 { 1242 struct tcp_md5sig_pool *hp; 1243 struct ahash_request *req; 1244 const struct tcphdr *th = tcp_hdr(skb); 1245 __be32 saddr, daddr; 1246 1247 if (sk) { /* valid for establish/request sockets */ 1248 saddr = sk->sk_rcv_saddr; 1249 daddr = sk->sk_daddr; 1250 } else { 1251 const struct iphdr *iph = ip_hdr(skb); 1252 saddr = iph->saddr; 1253 daddr = iph->daddr; 1254 } 1255 1256 hp = tcp_get_md5sig_pool(); 1257 if (!hp) 1258 goto clear_hash_noput; 1259 req = hp->md5_req; 1260 1261 if (crypto_ahash_init(req)) 1262 goto clear_hash; 1263 1264 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len)) 1265 goto clear_hash; 1266 if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2)) 1267 goto clear_hash; 1268 if (tcp_md5_hash_key(hp, key)) 1269 goto clear_hash; 1270 ahash_request_set_crypt(req, NULL, md5_hash, 0); 1271 if (crypto_ahash_final(req)) 1272 goto clear_hash; 1273 1274 tcp_put_md5sig_pool(); 1275 return 0; 1276 1277 clear_hash: 1278 tcp_put_md5sig_pool(); 1279 clear_hash_noput: 1280 memset(md5_hash, 0, 16); 1281 return 1; 1282 } 1283 EXPORT_SYMBOL(tcp_v4_md5_hash_skb); 1284 1285 #endif 1286 1287 /* Called with rcu_read_lock() */ 1288 static bool tcp_v4_inbound_md5_hash(const struct sock *sk, 1289 const struct sk_buff *skb) 1290 { 1291 #ifdef CONFIG_TCP_MD5SIG 1292 /* 1293 * This gets called for each TCP segment that arrives 1294 * so we want to be efficient. 1295 * We have 3 drop cases: 1296 * o No MD5 hash and one expected. 1297 * o MD5 hash and we're not expecting one. 1298 * o MD5 hash and its wrong. 1299 */ 1300 const __u8 *hash_location = NULL; 1301 struct tcp_md5sig_key *hash_expected; 1302 const struct iphdr *iph = ip_hdr(skb); 1303 const struct tcphdr *th = tcp_hdr(skb); 1304 int genhash; 1305 unsigned char newhash[16]; 1306 1307 hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr, 1308 AF_INET); 1309 hash_location = tcp_parse_md5sig_option(th); 1310 1311 /* We've parsed the options - do we have a hash? */ 1312 if (!hash_expected && !hash_location) 1313 return false; 1314 1315 if (hash_expected && !hash_location) { 1316 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND); 1317 return true; 1318 } 1319 1320 if (!hash_expected && hash_location) { 1321 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED); 1322 return true; 1323 } 1324 1325 /* Okay, so this is hash_expected and hash_location - 1326 * so we need to calculate the checksum. 
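	 * (A mismatch is counted in LINUX_MIB_TCPMD5FAILURE and the segment
	 * is dropped without generating any reply.)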
	 */
	genhash = tcp_v4_md5_hash_skb(newhash,
				      hash_expected,
				      NULL, skb);

	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
				     &iph->saddr, ntohs(th->source),
				     &iph->daddr, ntohs(th->dest),
				     genhash ? " tcp_v4_calc_md5_hash failed"
					     : "");
		return true;
	}
	return false;
#endif
	return false;
}

static void tcp_v4_init_req(struct request_sock *req,
			    const struct sock *sk_listener,
			    struct sk_buff *skb)
{
	struct inet_request_sock *ireq = inet_rsk(req);
	struct net *net = sock_net(sk_listener);

	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
}

static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
					  struct flowi *fl,
					  const struct request_sock *req)
{
	return inet_csk_route_req(sk, &fl->u.ip4, req);
}

struct request_sock_ops tcp_request_sock_ops __read_mostly = {
	.family		=	PF_INET,
	.obj_size	=	sizeof(struct tcp_request_sock),
	.rtx_syn_ack	=	tcp_rtx_synack,
	.send_ack	=	tcp_v4_reqsk_send_ack,
	.destructor	=	tcp_v4_reqsk_destructor,
	.send_reset	=	tcp_v4_send_reset,
	.syn_ack_timeout =	tcp_syn_ack_timeout,
};

static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
	.mss_clamp	=	TCP_MSS_DEFAULT,
#ifdef CONFIG_TCP_MD5SIG
	.req_md5_lookup	=	tcp_v4_md5_lookup,
	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
#endif
	.init_req	=	tcp_v4_init_req,
#ifdef CONFIG_SYN_COOKIES
	.cookie_init_seq =	cookie_v4_init_sequence,
#endif
	.route_req	=	tcp_v4_route_req,
	.init_seq	=	tcp_v4_init_seq,
	.init_ts_off	=	tcp_v4_init_ts_off,
	.send_synack	=	tcp_v4_send_synack,
};

int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	/* Never answer to SYNs sent to broadcast or multicast */
	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	return tcp_conn_request(&tcp_request_sock_ops,
				&tcp_request_sock_ipv4_ops, sk, skb);

drop:
	tcp_listendrop(sk);
	return 0;
}
EXPORT_SYMBOL(tcp_v4_conn_request);


/*
 * The three way handshake has completed - we got a valid synack -
 * now create the new socket.
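 * This is reached via icsk_af_ops->syn_recv_sock, normally from
 * tcp_check_req() when the final ACK of the handshake is received, and
 * also from the syncookie and Fast Open paths.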
1410 */ 1411 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, 1412 struct request_sock *req, 1413 struct dst_entry *dst, 1414 struct request_sock *req_unhash, 1415 bool *own_req) 1416 { 1417 struct inet_request_sock *ireq; 1418 struct inet_sock *newinet; 1419 struct tcp_sock *newtp; 1420 struct sock *newsk; 1421 #ifdef CONFIG_TCP_MD5SIG 1422 struct tcp_md5sig_key *key; 1423 #endif 1424 struct ip_options_rcu *inet_opt; 1425 1426 if (sk_acceptq_is_full(sk)) 1427 goto exit_overflow; 1428 1429 newsk = tcp_create_openreq_child(sk, req, skb); 1430 if (!newsk) 1431 goto exit_nonewsk; 1432 1433 newsk->sk_gso_type = SKB_GSO_TCPV4; 1434 inet_sk_rx_dst_set(newsk, skb); 1435 1436 newtp = tcp_sk(newsk); 1437 newinet = inet_sk(newsk); 1438 ireq = inet_rsk(req); 1439 sk_daddr_set(newsk, ireq->ir_rmt_addr); 1440 sk_rcv_saddr_set(newsk, ireq->ir_loc_addr); 1441 newsk->sk_bound_dev_if = ireq->ir_iif; 1442 newinet->inet_saddr = ireq->ir_loc_addr; 1443 inet_opt = rcu_dereference(ireq->ireq_opt); 1444 RCU_INIT_POINTER(newinet->inet_opt, inet_opt); 1445 newinet->mc_index = inet_iif(skb); 1446 newinet->mc_ttl = ip_hdr(skb)->ttl; 1447 newinet->rcv_tos = ip_hdr(skb)->tos; 1448 inet_csk(newsk)->icsk_ext_hdr_len = 0; 1449 if (inet_opt) 1450 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen; 1451 newinet->inet_id = prandom_u32(); 1452 1453 if (!dst) { 1454 dst = inet_csk_route_child_sock(sk, newsk, req); 1455 if (!dst) 1456 goto put_and_exit; 1457 } else { 1458 /* syncookie case : see end of cookie_v4_check() */ 1459 } 1460 sk_setup_caps(newsk, dst); 1461 1462 tcp_ca_openreq_child(newsk, dst); 1463 1464 tcp_sync_mss(newsk, dst_mtu(dst)); 1465 newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst)); 1466 1467 tcp_initialize_rcv_mss(newsk); 1468 1469 #ifdef CONFIG_TCP_MD5SIG 1470 /* Copy over the MD5 key from the original socket */ 1471 key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr, 1472 AF_INET); 1473 if (key) { 1474 /* 1475 * We're using one, so create a matching key 1476 * on the newsk structure. If we fail to get 1477 * memory, then we end up not copying the key 1478 * across. Shucks. 
1479 */ 1480 tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr, 1481 AF_INET, 32, key->key, key->keylen, GFP_ATOMIC); 1482 sk_nocaps_add(newsk, NETIF_F_GSO_MASK); 1483 } 1484 #endif 1485 1486 if (__inet_inherit_port(sk, newsk) < 0) 1487 goto put_and_exit; 1488 *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash)); 1489 if (likely(*own_req)) { 1490 tcp_move_syn(newtp, req); 1491 ireq->ireq_opt = NULL; 1492 } else { 1493 newinet->inet_opt = NULL; 1494 } 1495 return newsk; 1496 1497 exit_overflow: 1498 NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); 1499 exit_nonewsk: 1500 dst_release(dst); 1501 exit: 1502 tcp_listendrop(sk); 1503 return NULL; 1504 put_and_exit: 1505 newinet->inet_opt = NULL; 1506 inet_csk_prepare_forced_close(newsk); 1507 tcp_done(newsk); 1508 goto exit; 1509 } 1510 EXPORT_SYMBOL(tcp_v4_syn_recv_sock); 1511 1512 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb) 1513 { 1514 #ifdef CONFIG_SYN_COOKIES 1515 const struct tcphdr *th = tcp_hdr(skb); 1516 1517 if (!th->syn) 1518 sk = cookie_v4_check(sk, skb); 1519 #endif 1520 return sk; 1521 } 1522 1523 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph, 1524 struct tcphdr *th, u32 *cookie) 1525 { 1526 u16 mss = 0; 1527 #ifdef CONFIG_SYN_COOKIES 1528 mss = tcp_get_syncookie_mss(&tcp_request_sock_ops, 1529 &tcp_request_sock_ipv4_ops, sk, th); 1530 if (mss) { 1531 *cookie = __cookie_v4_init_sequence(iph, th, &mss); 1532 tcp_synq_overflow(sk); 1533 } 1534 #endif 1535 return mss; 1536 } 1537 1538 /* The socket must have it's spinlock held when we get 1539 * here, unless it is a TCP_LISTEN socket. 1540 * 1541 * We have a potential double-lock case here, so even when 1542 * doing backlog processing we use the BH locking scheme. 1543 * This is because we cannot sleep with the original spinlock 1544 * held. 1545 */ 1546 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) 1547 { 1548 struct sock *rsk; 1549 1550 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ 1551 struct dst_entry *dst = sk->sk_rx_dst; 1552 1553 sock_rps_save_rxhash(sk, skb); 1554 sk_mark_napi_id(sk, skb); 1555 if (dst) { 1556 if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif || 1557 !dst->ops->check(dst, 0)) { 1558 dst_release(dst); 1559 sk->sk_rx_dst = NULL; 1560 } 1561 } 1562 tcp_rcv_established(sk, skb); 1563 return 0; 1564 } 1565 1566 if (tcp_checksum_complete(skb)) 1567 goto csum_err; 1568 1569 if (sk->sk_state == TCP_LISTEN) { 1570 struct sock *nsk = tcp_v4_cookie_check(sk, skb); 1571 1572 if (!nsk) 1573 goto discard; 1574 if (nsk != sk) { 1575 if (tcp_child_process(sk, nsk, skb)) { 1576 rsk = nsk; 1577 goto reset; 1578 } 1579 return 0; 1580 } 1581 } else 1582 sock_rps_save_rxhash(sk, skb); 1583 1584 if (tcp_rcv_state_process(sk, skb)) { 1585 rsk = sk; 1586 goto reset; 1587 } 1588 return 0; 1589 1590 reset: 1591 tcp_v4_send_reset(rsk, skb); 1592 discard: 1593 kfree_skb(skb); 1594 /* Be careful here. If this function gets more complicated and 1595 * gcc suffers from register pressure on the x86, sk (in %ebx) 1596 * might be destroyed here. This current version compiles correctly, 1597 * but you have been warned. 
1598 */ 1599 return 0; 1600 1601 csum_err: 1602 TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); 1603 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); 1604 goto discard; 1605 } 1606 EXPORT_SYMBOL(tcp_v4_do_rcv); 1607 1608 int tcp_v4_early_demux(struct sk_buff *skb) 1609 { 1610 const struct iphdr *iph; 1611 const struct tcphdr *th; 1612 struct sock *sk; 1613 1614 if (skb->pkt_type != PACKET_HOST) 1615 return 0; 1616 1617 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr))) 1618 return 0; 1619 1620 iph = ip_hdr(skb); 1621 th = tcp_hdr(skb); 1622 1623 if (th->doff < sizeof(struct tcphdr) / 4) 1624 return 0; 1625 1626 sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo, 1627 iph->saddr, th->source, 1628 iph->daddr, ntohs(th->dest), 1629 skb->skb_iif, inet_sdif(skb)); 1630 if (sk) { 1631 skb->sk = sk; 1632 skb->destructor = sock_edemux; 1633 if (sk_fullsock(sk)) { 1634 struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst); 1635 1636 if (dst) 1637 dst = dst_check(dst, 0); 1638 if (dst && 1639 inet_sk(sk)->rx_dst_ifindex == skb->skb_iif) 1640 skb_dst_set_noref(skb, dst); 1641 } 1642 } 1643 return 0; 1644 } 1645 1646 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb) 1647 { 1648 u32 limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf); 1649 struct skb_shared_info *shinfo; 1650 const struct tcphdr *th; 1651 struct tcphdr *thtail; 1652 struct sk_buff *tail; 1653 unsigned int hdrlen; 1654 bool fragstolen; 1655 u32 gso_segs; 1656 int delta; 1657 1658 /* In case all data was pulled from skb frags (in __pskb_pull_tail()), 1659 * we can fix skb->truesize to its real value to avoid future drops. 1660 * This is valid because skb is not yet charged to the socket. 1661 * It has been noticed pure SACK packets were sometimes dropped 1662 * (if cooked by drivers without copybreak feature). 1663 */ 1664 skb_condense(skb); 1665 1666 skb_dst_drop(skb); 1667 1668 if (unlikely(tcp_checksum_complete(skb))) { 1669 bh_unlock_sock(sk); 1670 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); 1671 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); 1672 return true; 1673 } 1674 1675 /* Attempt coalescing to last skb in backlog, even if we are 1676 * above the limits. 1677 * This is okay because skb capacity is limited to MAX_SKB_FRAGS. 
	 */
	th = (const struct tcphdr *)skb->data;
	hdrlen = th->doff * 4;
	shinfo = skb_shinfo(skb);

	if (!shinfo->gso_size)
		shinfo->gso_size = skb->len - hdrlen;

	if (!shinfo->gso_segs)
		shinfo->gso_segs = 1;

	tail = sk->sk_backlog.tail;
	if (!tail)
		goto no_coalesce;
	thtail = (struct tcphdr *)tail->data;

	if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
	    TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
	    ((TCP_SKB_CB(tail)->tcp_flags |
	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
	    !((TCP_SKB_CB(tail)->tcp_flags &
	      TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
	    ((TCP_SKB_CB(tail)->tcp_flags ^
	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
#ifdef CONFIG_TLS_DEVICE
	    tail->decrypted != skb->decrypted ||
#endif
	    thtail->doff != th->doff ||
	    memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
		goto no_coalesce;

	__skb_pull(skb, hdrlen);
	if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
		thtail->window = th->window;

		TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;

		if (after(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))
			TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;

		/* We have to update both TCP_SKB_CB(tail)->tcp_flags and
		 * thtail->fin, so that the fast path in tcp_rcv_established()
		 * is not entered if we append a packet with a FIN.
		 * SYN, RST, URG are not present.
		 * ACK is set on both packets.
		 * PSH: we do not really care in the TCP stack,
		 *	at least for 'GRO' packets.
		 */
		thtail->fin |= th->fin;
		TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;

		if (TCP_SKB_CB(skb)->has_rxtstamp) {
			TCP_SKB_CB(tail)->has_rxtstamp = true;
			tail->tstamp = skb->tstamp;
			skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
		}

		/* Not as strict as GRO. We only need to carry the mss max value */
		skb_shinfo(tail)->gso_size = max(shinfo->gso_size,
						 skb_shinfo(tail)->gso_size);

		gso_segs = skb_shinfo(tail)->gso_segs + shinfo->gso_segs;
		skb_shinfo(tail)->gso_segs = min_t(u32, gso_segs, 0xFFFF);

		sk->sk_backlog.len += delta;
		__NET_INC_STATS(sock_net(sk),
				LINUX_MIB_TCPBACKLOGCOALESCE);
		kfree_skb_partial(skb, fragstolen);
		return false;
	}
	__skb_push(skb, hdrlen);

no_coalesce:
	/* Only the socket owner can try to collapse/prune the rx queues
	 * to reduce memory overhead, so add a little headroom here.
	 * Only a few socket backlogs are likely to be non-empty at any
	 * given time.
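	 * The 64KB added below is that headroom, on top of the
	 * rcvbuf + sndbuf budget (roughly one maximally-sized GSO packet).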
1754 */ 1755 limit += 64*1024; 1756 1757 if (unlikely(sk_add_backlog(sk, skb, limit))) { 1758 bh_unlock_sock(sk); 1759 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP); 1760 return true; 1761 } 1762 return false; 1763 } 1764 EXPORT_SYMBOL(tcp_add_backlog); 1765 1766 int tcp_filter(struct sock *sk, struct sk_buff *skb) 1767 { 1768 struct tcphdr *th = (struct tcphdr *)skb->data; 1769 1770 return sk_filter_trim_cap(sk, skb, th->doff * 4); 1771 } 1772 EXPORT_SYMBOL(tcp_filter); 1773 1774 static void tcp_v4_restore_cb(struct sk_buff *skb) 1775 { 1776 memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4, 1777 sizeof(struct inet_skb_parm)); 1778 } 1779 1780 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph, 1781 const struct tcphdr *th) 1782 { 1783 /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB() 1784 * barrier() makes sure compiler wont play fool^Waliasing games. 1785 */ 1786 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb), 1787 sizeof(struct inet_skb_parm)); 1788 barrier(); 1789 1790 TCP_SKB_CB(skb)->seq = ntohl(th->seq); 1791 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + 1792 skb->len - th->doff * 4); 1793 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); 1794 TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th); 1795 TCP_SKB_CB(skb)->tcp_tw_isn = 0; 1796 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph); 1797 TCP_SKB_CB(skb)->sacked = 0; 1798 TCP_SKB_CB(skb)->has_rxtstamp = 1799 skb->tstamp || skb_hwtstamps(skb)->hwtstamp; 1800 } 1801 1802 /* 1803 * From tcp_input.c 1804 */ 1805 1806 int tcp_v4_rcv(struct sk_buff *skb) 1807 { 1808 struct net *net = dev_net(skb->dev); 1809 struct sk_buff *skb_to_free; 1810 int sdif = inet_sdif(skb); 1811 const struct iphdr *iph; 1812 const struct tcphdr *th; 1813 bool refcounted; 1814 struct sock *sk; 1815 int ret; 1816 1817 if (skb->pkt_type != PACKET_HOST) 1818 goto discard_it; 1819 1820 /* Count it even if it's bad */ 1821 __TCP_INC_STATS(net, TCP_MIB_INSEGS); 1822 1823 if (!pskb_may_pull(skb, sizeof(struct tcphdr))) 1824 goto discard_it; 1825 1826 th = (const struct tcphdr *)skb->data; 1827 1828 if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) 1829 goto bad_packet; 1830 if (!pskb_may_pull(skb, th->doff * 4)) 1831 goto discard_it; 1832 1833 /* An explanation is required here, I think. 1834 * Packet length and doff are validated by header prediction, 1835 * provided case of th->doff==0 is eliminated. 1836 * So, we defer the checks. */ 1837 1838 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo)) 1839 goto csum_error; 1840 1841 th = (const struct tcphdr *)skb->data; 1842 iph = ip_hdr(skb); 1843 lookup: 1844 sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source, 1845 th->dest, sdif, &refcounted); 1846 if (!sk) 1847 goto no_tcp_socket; 1848 1849 process: 1850 if (sk->sk_state == TCP_TIME_WAIT) 1851 goto do_time_wait; 1852 1853 if (sk->sk_state == TCP_NEW_SYN_RECV) { 1854 struct request_sock *req = inet_reqsk(sk); 1855 bool req_stolen = false; 1856 struct sock *nsk; 1857 1858 sk = req->rsk_listener; 1859 if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) { 1860 sk_drops_add(sk, skb); 1861 reqsk_put(req); 1862 goto discard_it; 1863 } 1864 if (tcp_checksum_complete(skb)) { 1865 reqsk_put(req); 1866 goto csum_error; 1867 } 1868 if (unlikely(sk->sk_state != TCP_LISTEN)) { 1869 inet_csk_reqsk_queue_drop_and_put(sk, req); 1870 goto lookup; 1871 } 1872 /* We own a reference on the listener, increase it again 1873 * as we might lose it too soon. 
1874 */ 1875 sock_hold(sk); 1876 refcounted = true; 1877 nsk = NULL; 1878 if (!tcp_filter(sk, skb)) { 1879 th = (const struct tcphdr *)skb->data; 1880 iph = ip_hdr(skb); 1881 tcp_v4_fill_cb(skb, iph, th); 1882 nsk = tcp_check_req(sk, skb, req, false, &req_stolen); 1883 } 1884 if (!nsk) { 1885 reqsk_put(req); 1886 if (req_stolen) { 1887 /* Another cpu got exclusive access to req 1888 * and created a full blown socket. 1889 * Try to feed this packet to this socket 1890 * instead of discarding it. 1891 */ 1892 tcp_v4_restore_cb(skb); 1893 sock_put(sk); 1894 goto lookup; 1895 } 1896 goto discard_and_relse; 1897 } 1898 if (nsk == sk) { 1899 reqsk_put(req); 1900 tcp_v4_restore_cb(skb); 1901 } else if (tcp_child_process(sk, nsk, skb)) { 1902 tcp_v4_send_reset(nsk, skb); 1903 goto discard_and_relse; 1904 } else { 1905 sock_put(sk); 1906 return 0; 1907 } 1908 } 1909 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) { 1910 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); 1911 goto discard_and_relse; 1912 } 1913 1914 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) 1915 goto discard_and_relse; 1916 1917 if (tcp_v4_inbound_md5_hash(sk, skb)) 1918 goto discard_and_relse; 1919 1920 nf_reset_ct(skb); 1921 1922 if (tcp_filter(sk, skb)) 1923 goto discard_and_relse; 1924 th = (const struct tcphdr *)skb->data; 1925 iph = ip_hdr(skb); 1926 tcp_v4_fill_cb(skb, iph, th); 1927 1928 skb->dev = NULL; 1929 1930 if (sk->sk_state == TCP_LISTEN) { 1931 ret = tcp_v4_do_rcv(sk, skb); 1932 goto put_and_return; 1933 } 1934 1935 sk_incoming_cpu_update(sk); 1936 1937 bh_lock_sock_nested(sk); 1938 tcp_segs_in(tcp_sk(sk), skb); 1939 ret = 0; 1940 if (!sock_owned_by_user(sk)) { 1941 skb_to_free = sk->sk_rx_skb_cache; 1942 sk->sk_rx_skb_cache = NULL; 1943 ret = tcp_v4_do_rcv(sk, skb); 1944 } else { 1945 if (tcp_add_backlog(sk, skb)) 1946 goto discard_and_relse; 1947 skb_to_free = NULL; 1948 } 1949 bh_unlock_sock(sk); 1950 if (skb_to_free) 1951 __kfree_skb(skb_to_free); 1952 1953 put_and_return: 1954 if (refcounted) 1955 sock_put(sk); 1956 1957 return ret; 1958 1959 no_tcp_socket: 1960 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) 1961 goto discard_it; 1962 1963 tcp_v4_fill_cb(skb, iph, th); 1964 1965 if (tcp_checksum_complete(skb)) { 1966 csum_error: 1967 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS); 1968 bad_packet: 1969 __TCP_INC_STATS(net, TCP_MIB_INERRS); 1970 } else { 1971 tcp_v4_send_reset(NULL, skb); 1972 } 1973 1974 discard_it: 1975 /* Discard frame. 
int tcp_v4_rcv(struct sk_buff *skb)
{
	struct net *net = dev_net(skb->dev);
	struct sk_buff *skb_to_free;
	int sdif = inet_sdif(skb);
	const struct iphdr *iph;
	const struct tcphdr *th;
	bool refcounted;
	struct sock *sk;
	int ret;

	if (skb->pkt_type != PACKET_HOST)
		goto discard_it;

	/* Count it even if it's bad */
	__TCP_INC_STATS(net, TCP_MIB_INSEGS);

	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
		goto discard_it;

	th = (const struct tcphdr *)skb->data;

	if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
		goto bad_packet;
	if (!pskb_may_pull(skb, th->doff * 4))
		goto discard_it;

	/* An explanation is required here, I think.
	 * Packet length and doff are validated by header prediction,
	 * provided case of th->doff==0 is eliminated.
	 * So, we defer the checks. */

	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
		goto csum_error;

	th = (const struct tcphdr *)skb->data;
	iph = ip_hdr(skb);
lookup:
	sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
			       th->dest, sdif, &refcounted);
	if (!sk)
		goto no_tcp_socket;

process:
	if (sk->sk_state == TCP_TIME_WAIT)
		goto do_time_wait;

	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		struct request_sock *req = inet_reqsk(sk);
		bool req_stolen = false;
		struct sock *nsk;

		sk = req->rsk_listener;
		if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
			sk_drops_add(sk, skb);
			reqsk_put(req);
			goto discard_it;
		}
		if (tcp_checksum_complete(skb)) {
			reqsk_put(req);
			goto csum_error;
		}
		if (unlikely(sk->sk_state != TCP_LISTEN)) {
			inet_csk_reqsk_queue_drop_and_put(sk, req);
			goto lookup;
		}
		/* We own a reference on the listener, increase it again
		 * as we might lose it too soon.
		 */
		sock_hold(sk);
		refcounted = true;
		nsk = NULL;
		if (!tcp_filter(sk, skb)) {
			th = (const struct tcphdr *)skb->data;
			iph = ip_hdr(skb);
			tcp_v4_fill_cb(skb, iph, th);
			nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
		}
		if (!nsk) {
			reqsk_put(req);
			if (req_stolen) {
				/* Another cpu got exclusive access to req
				 * and created a full blown socket.
				 * Try to feed this packet to this socket
				 * instead of discarding it.
				 */
				tcp_v4_restore_cb(skb);
				sock_put(sk);
				goto lookup;
			}
			goto discard_and_relse;
		}
		if (nsk == sk) {
			reqsk_put(req);
			tcp_v4_restore_cb(skb);
		} else if (tcp_child_process(sk, nsk, skb)) {
			tcp_v4_send_reset(nsk, skb);
			goto discard_and_relse;
		} else {
			sock_put(sk);
			return 0;
		}
	}
	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
		goto discard_and_relse;
	}

	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
		goto discard_and_relse;

	if (tcp_v4_inbound_md5_hash(sk, skb))
		goto discard_and_relse;

	nf_reset_ct(skb);

	if (tcp_filter(sk, skb))
		goto discard_and_relse;
	th = (const struct tcphdr *)skb->data;
	iph = ip_hdr(skb);
	tcp_v4_fill_cb(skb, iph, th);

	skb->dev = NULL;

	if (sk->sk_state == TCP_LISTEN) {
		ret = tcp_v4_do_rcv(sk, skb);
		goto put_and_return;
	}

	sk_incoming_cpu_update(sk);

	bh_lock_sock_nested(sk);
	tcp_segs_in(tcp_sk(sk), skb);
	ret = 0;
	if (!sock_owned_by_user(sk)) {
		skb_to_free = sk->sk_rx_skb_cache;
		sk->sk_rx_skb_cache = NULL;
		ret = tcp_v4_do_rcv(sk, skb);
	} else {
		if (tcp_add_backlog(sk, skb))
			goto discard_and_relse;
		skb_to_free = NULL;
	}
	bh_unlock_sock(sk);
	if (skb_to_free)
		__kfree_skb(skb_to_free);

put_and_return:
	if (refcounted)
		sock_put(sk);

	return ret;

no_tcp_socket:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
		goto discard_it;

	tcp_v4_fill_cb(skb, iph, th);

	if (tcp_checksum_complete(skb)) {
csum_error:
		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
bad_packet:
		__TCP_INC_STATS(net, TCP_MIB_INERRS);
	} else {
		tcp_v4_send_reset(NULL, skb);
	}

discard_it:
	/* Discard frame. */
	kfree_skb(skb);
	return 0;

discard_and_relse:
	sk_drops_add(sk, skb);
	if (refcounted)
		sock_put(sk);
	goto discard_it;

do_time_wait:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto discard_it;
	}

	tcp_v4_fill_cb(skb, iph, th);

	if (tcp_checksum_complete(skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto csum_error;
	}
	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
	case TCP_TW_SYN: {
		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
							&tcp_hashinfo, skb,
							__tcp_hdrlen(th),
							iph->saddr, th->source,
							iph->daddr, th->dest,
							inet_iif(skb),
							sdif);
		if (sk2) {
			inet_twsk_deschedule_put(inet_twsk(sk));
			sk = sk2;
			tcp_v4_restore_cb(skb);
			refcounted = false;
			goto process;
		}
	}
		/* to ACK */
		/* fall through */
	case TCP_TW_ACK:
		tcp_v4_timewait_ack(sk, skb);
		break;
	case TCP_TW_RST:
		tcp_v4_send_reset(sk, skb);
		inet_twsk_deschedule_put(inet_twsk(sk));
		goto discard_it;
	case TCP_TW_SUCCESS:;
	}
	goto discard_it;
}

static struct timewait_sock_ops tcp_timewait_sock_ops = {
	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
	.twsk_unique	= tcp_twsk_unique,
	.twsk_destructor= tcp_twsk_destructor,
};

void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);

	if (dst && dst_hold_safe(dst)) {
		sk->sk_rx_dst = dst;
		inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
	}
}
EXPORT_SYMBOL(inet_sk_rx_dst_set);

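/* AF-specific operations used by TCP sockets over plain IPv4; installed as
 * icsk->icsk_af_ops by tcp_v4_init_sock() below (tcp_ipv6.c installs its
 * own table for AF_INET6 sockets).
 */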
const struct inet_connection_sock_af_ops ipv4_specific = {
	.queue_xmit	   = ip_queue_xmit,
	.send_check	   = tcp_v4_send_check,
	.rebuild_header	   = inet_sk_rebuild_header,
	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
	.conn_request	   = tcp_v4_conn_request,
	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
	.net_header_len	   = sizeof(struct iphdr),
	.setsockopt	   = ip_setsockopt,
	.getsockopt	   = ip_getsockopt,
	.addr2sockaddr	   = inet_csk_addr2sockaddr,
	.sockaddr_len	   = sizeof(struct sockaddr_in),
#ifdef CONFIG_COMPAT
	.compat_setsockopt = compat_ip_setsockopt,
	.compat_getsockopt = compat_ip_getsockopt,
#endif
	.mtu_reduced	   = tcp_v4_mtu_reduced,
};
EXPORT_SYMBOL(ipv4_specific);

#ifdef CONFIG_TCP_MD5SIG
static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
	.md5_lookup	= tcp_v4_md5_lookup,
	.calc_md5_hash	= tcp_v4_md5_hash_skb,
	.md5_parse	= tcp_v4_parse_md5_keys,
};
#endif

/* NOTE: A lot of things are set to zero explicitly by the call to
 *	 sk_alloc(), so they need not be done here.
 */
static int tcp_v4_init_sock(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	tcp_init_sock(sk);

	icsk->icsk_af_ops = &ipv4_specific;

#ifdef CONFIG_TCP_MD5SIG
	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
#endif

	return 0;
}

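/* Per-protocol ->destroy() hook, run when the socket is finally torn down;
 * it releases everything socket setup and connection processing attached to
 * the socket (timers, congestion control state, MD5 keys, bound port, ...).
 */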
void tcp_v4_destroy_sock(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	trace_tcp_destroy_sock(sk);

	tcp_clear_xmit_timers(sk);

	tcp_cleanup_congestion_control(sk);

	tcp_cleanup_ulp(sk);

	/* Clean up the write buffer. */
	tcp_write_queue_purge(sk);

	/* Check if we want to disable active TFO */
	tcp_fastopen_active_disable_ofo_check(sk);

	/* Cleans up our, hopefully empty, out_of_order_queue. */
	skb_rbtree_purge(&tp->out_of_order_queue);

#ifdef CONFIG_TCP_MD5SIG
	/* Clean up the MD5 key list, if any */
	if (tp->md5sig_info) {
		tcp_clear_md5_list(sk);
		kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
		tp->md5sig_info = NULL;
	}
#endif

	/* Clean up a referenced TCP bind bucket. */
	if (inet_csk(sk)->icsk_bind_hash)
		inet_put_port(sk);

	BUG_ON(rcu_access_pointer(tp->fastopen_rsk));

	/* If socket is aborted during connect operation */
	tcp_free_fastopen_req(tp);
	tcp_fastopen_destroy_cipher(sk);
	tcp_saved_syn_free(tp);

	sk_sockets_allocated_dec(sk);
}
EXPORT_SYMBOL(tcp_v4_destroy_sock);

#ifdef CONFIG_PROC_FS
/* Proc filesystem TCP sock list dumping. */

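/* The iterator walks the listening hash first and then the established
 * hash.  struct tcp_iter_state tracks the current bucket, the absolute
 * position (num) and the offset within the bucket, so tcp_seek_last_pos()
 * can resume a dump cheaply when userspace reads /proc/net/tcp in chunks.
 */
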
/*
 * Get the next listener socket following cur.  If cur is NULL, get the first
 * socket starting from the bucket given in st->bucket; when st->bucket is
 * zero the very first socket in the hash table is returned.
 */
static void *listening_get_next(struct seq_file *seq, void *cur)
{
	struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);
	struct inet_listen_hashbucket *ilb;
	struct hlist_nulls_node *node;
	struct sock *sk = cur;

	if (!sk) {
get_head:
		ilb = &tcp_hashinfo.listening_hash[st->bucket];
		spin_lock(&ilb->lock);
		sk = sk_nulls_head(&ilb->nulls_head);
		st->offset = 0;
		goto get_sk;
	}
	ilb = &tcp_hashinfo.listening_hash[st->bucket];
	++st->num;
	++st->offset;

	sk = sk_nulls_next(sk);
get_sk:
	sk_nulls_for_each_from(sk, node) {
		if (!net_eq(sock_net(sk), net))
			continue;
		if (sk->sk_family == afinfo->family)
			return sk;
	}
	spin_unlock(&ilb->lock);
	st->offset = 0;
	if (++st->bucket < INET_LHTABLE_SIZE)
		goto get_head;
	return NULL;
}

static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	st->bucket = 0;
	st->offset = 0;
	rc = listening_get_next(seq, NULL);

	while (rc && *pos) {
		rc = listening_get_next(seq, rc);
		--*pos;
	}
	return rc;
}

static inline bool empty_bucket(const struct tcp_iter_state *st)
{
	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
}

/*
 * Get first established socket starting from bucket given in st->bucket.
 * If st->bucket is zero, the very first socket in the hash is returned.
 */
static void *established_get_first(struct seq_file *seq)
{
	struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);
	void *rc = NULL;

	st->offset = 0;
	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
		struct sock *sk;
		struct hlist_nulls_node *node;
		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);

		/* Lockless fast path for the common case of empty buckets */
		if (empty_bucket(st))
			continue;

		spin_lock_bh(lock);
		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
			if (sk->sk_family != afinfo->family ||
			    !net_eq(sock_net(sk), net)) {
				continue;
			}
			rc = sk;
			goto out;
		}
		spin_unlock_bh(lock);
	}
out:
	return rc;
}

static void *established_get_next(struct seq_file *seq, void *cur)
{
	struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
	struct sock *sk = cur;
	struct hlist_nulls_node *node;
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);

	++st->num;
	++st->offset;

	sk = sk_nulls_next(sk);

	sk_nulls_for_each_from(sk, node) {
		if (sk->sk_family == afinfo->family &&
		    net_eq(sock_net(sk), net))
			return sk;
	}

	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
	++st->bucket;
	return established_get_first(seq);
}

static void *established_get_idx(struct seq_file *seq, loff_t pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	st->bucket = 0;
	rc = established_get_first(seq);

	while (rc && pos) {
		rc = established_get_next(seq, rc);
		--pos;
	}
	return rc;
}

static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
{
	void *rc;
	struct tcp_iter_state *st = seq->private;

	st->state = TCP_SEQ_STATE_LISTENING;
	rc = listening_get_idx(seq, &pos);

	if (!rc) {
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		rc = established_get_idx(seq, pos);
	}

	return rc;
}

static void *tcp_seek_last_pos(struct seq_file *seq)
{
	struct tcp_iter_state *st = seq->private;
	int offset = st->offset;
	int orig_num = st->num;
	void *rc = NULL;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		if (st->bucket >= INET_LHTABLE_SIZE)
			break;
		st->state = TCP_SEQ_STATE_LISTENING;
		rc = listening_get_next(seq, NULL);
		while (offset-- && rc)
			rc = listening_get_next(seq, rc);
		if (rc)
			break;
		st->bucket = 0;
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		/* Fallthrough */
	case TCP_SEQ_STATE_ESTABLISHED:
		if (st->bucket > tcp_hashinfo.ehash_mask)
			break;
		rc = established_get_first(seq);
		while (offset-- && rc)
			rc = established_get_next(seq, rc);
	}

	st->num = orig_num;

	return rc;
}

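/* seq_file start/next/stop callbacks.  They are exported because the IPv6
 * side (/proc/net/tcp6) reuses them with its own show routine.
 */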
void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	if (*pos && *pos == st->last_pos) {
		rc = tcp_seek_last_pos(seq);
		if (rc)
			goto out;
	}

	st->state = TCP_SEQ_STATE_LISTENING;
	st->num = 0;
	st->bucket = 0;
	st->offset = 0;
	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;

out:
	st->last_pos = *pos;
	return rc;
}
EXPORT_SYMBOL(tcp_seq_start);

void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc = NULL;

	if (v == SEQ_START_TOKEN) {
		rc = tcp_get_idx(seq, 0);
		goto out;
	}

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		rc = listening_get_next(seq, v);
		if (!rc) {
			st->state = TCP_SEQ_STATE_ESTABLISHED;
			st->bucket = 0;
			st->offset = 0;
			rc = established_get_first(seq);
		}
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		rc = established_get_next(seq, v);
		break;
	}
out:
	++*pos;
	st->last_pos = *pos;
	return rc;
}
EXPORT_SYMBOL(tcp_seq_next);

void tcp_seq_stop(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		if (v != SEQ_START_TOKEN)
			spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		if (v)
			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
		break;
	}
}
EXPORT_SYMBOL(tcp_seq_stop);

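/* The helpers below each format one row of /proc/net/tcp, matching the
 * header emitted by tcp4_seq_show():
 *
 *   sl  local_address rem_address   st tx_queue rx_queue tr tm->when retrnsmt   uid  timeout inode
 *
 * get_openreq4() handles request sockets, get_tcp4_sock() full sockets and
 * get_timewait4_sock() TIME-WAIT sockets; addresses and ports are printed
 * in hex.
 */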
static void get_openreq4(const struct request_sock *req,
			 struct seq_file *f, int i)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	long delta = req->rsk_timer.expires - jiffies;

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
		i,
		ireq->ir_loc_addr,
		ireq->ir_num,
		ireq->ir_rmt_addr,
		ntohs(ireq->ir_rmt_port),
		TCP_SYN_RECV,
		0, 0, /* could print option size, but that is af dependent. */
		1,    /* timers active (only the expire timer) */
		jiffies_delta_to_clock_t(delta),
		req->num_timeout,
		from_kuid_munged(seq_user_ns(f),
				 sock_i_uid(req->rsk_listener)),
		0,  /* non standard timer */
		0, /* open_requests have no inode */
		0,
		req);
}

static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
{
	int timer_active;
	unsigned long timer_expires;
	const struct tcp_sock *tp = tcp_sk(sk);
	const struct inet_connection_sock *icsk = inet_csk(sk);
	const struct inet_sock *inet = inet_sk(sk);
	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
	__be32 dest = inet->inet_daddr;
	__be32 src = inet->inet_rcv_saddr;
	__u16 destp = ntohs(inet->inet_dport);
	__u16 srcp = ntohs(inet->inet_sport);
	int rx_queue;
	int state;

	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
		timer_active	= 1;
		timer_expires	= icsk->icsk_timeout;
	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
		timer_active	= 4;
		timer_expires	= icsk->icsk_timeout;
	} else if (timer_pending(&sk->sk_timer)) {
		timer_active	= 2;
		timer_expires	= sk->sk_timer.expires;
	} else {
		timer_active	= 0;
		timer_expires	= jiffies;
	}

	state = inet_sk_state_load(sk);
	if (state == TCP_LISTEN)
		rx_queue = READ_ONCE(sk->sk_ack_backlog);
	else
		/* Because we don't lock the socket,
		 * we might find a transient negative value.
		 */
		rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
				      READ_ONCE(tp->copied_seq), 0);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
		i, src, srcp, dest, destp, state,
		READ_ONCE(tp->write_seq) - tp->snd_una,
		rx_queue,
		timer_active,
		jiffies_delta_to_clock_t(timer_expires - jiffies),
		icsk->icsk_retransmits,
		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
		icsk->icsk_probes_out,
		sock_i_ino(sk),
		refcount_read(&sk->sk_refcnt), sk,
		jiffies_to_clock_t(icsk->icsk_rto),
		jiffies_to_clock_t(icsk->icsk_ack.ato),
		(icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
		tp->snd_cwnd,
		state == TCP_LISTEN ?
		    fastopenq->max_qlen :
		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
}

static void get_timewait4_sock(const struct inet_timewait_sock *tw,
			       struct seq_file *f, int i)
{
	long delta = tw->tw_timer.expires - jiffies;
	__be32 dest, src;
	__u16 destp, srcp;

	dest  = tw->tw_daddr;
	src   = tw->tw_rcv_saddr;
	destp = ntohs(tw->tw_dport);
	srcp  = ntohs(tw->tw_sport);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
		refcount_read(&tw->tw_refcnt), tw);
}

#define TMPSZ 150

static int tcp4_seq_show(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st;
	struct sock *sk = v;

	seq_setwidth(seq, TMPSZ - 1);
	if (v == SEQ_START_TOKEN) {
		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
			   "rx_queue tr tm->when retrnsmt   uid  timeout "
			   "inode");
		goto out;
	}
	st = seq->private;

	if (sk->sk_state == TCP_TIME_WAIT)
		get_timewait4_sock(v, seq, st->num);
	else if (sk->sk_state == TCP_NEW_SYN_RECV)
		get_openreq4(v, seq, st->num);
	else
		get_tcp4_sock(v, seq, st->num);
out:
	seq_pad(seq, '\n');
	return 0;
}

static const struct seq_operations tcp4_seq_ops = {
	.show		= tcp4_seq_show,
	.start		= tcp_seq_start,
	.next		= tcp_seq_next,
	.stop		= tcp_seq_stop,
};

static struct tcp_seq_afinfo tcp4_seq_afinfo = {
	.family		= AF_INET,
};

static int __net_init tcp4_proc_init_net(struct net *net)
{
	if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
			sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
		return -ENOMEM;
	return 0;
}

static void __net_exit tcp4_proc_exit_net(struct net *net)
{
	remove_proc_entry("tcp", net->proc_net);
}

static struct pernet_operations tcp4_net_ops = {
	.init = tcp4_proc_init_net,
	.exit = tcp4_proc_exit_net,
};

int __init tcp4_proc_init(void)
{
	return register_pernet_subsys(&tcp4_net_ops);
}

void tcp4_proc_exit(void)
{
	unregister_pernet_subsys(&tcp4_net_ops);
}
#endif /* CONFIG_PROC_FS */

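/* Protocol descriptor for IPv4 TCP sockets; af_inet.c registers it for
 * SOCK_STREAM/IPPROTO_TCP and the socket layer dispatches through it.
 */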
struct proto tcp_prot = {
	.name			= "TCP",
	.owner			= THIS_MODULE,
	.close			= tcp_close,
	.pre_connect		= tcp_v4_pre_connect,
	.connect		= tcp_v4_connect,
	.disconnect		= tcp_disconnect,
	.accept			= inet_csk_accept,
	.ioctl			= tcp_ioctl,
	.init			= tcp_v4_init_sock,
	.destroy		= tcp_v4_destroy_sock,
	.shutdown		= tcp_shutdown,
	.setsockopt		= tcp_setsockopt,
	.getsockopt		= tcp_getsockopt,
	.keepalive		= tcp_set_keepalive,
	.recvmsg		= tcp_recvmsg,
	.sendmsg		= tcp_sendmsg,
	.sendpage		= tcp_sendpage,
	.backlog_rcv		= tcp_v4_do_rcv,
	.release_cb		= tcp_release_cb,
	.hash			= inet_hash,
	.unhash			= inet_unhash,
	.get_port		= inet_csk_get_port,
	.enter_memory_pressure	= tcp_enter_memory_pressure,
	.leave_memory_pressure	= tcp_leave_memory_pressure,
	.stream_memory_free	= tcp_stream_memory_free,
	.sockets_allocated	= &tcp_sockets_allocated,
	.orphan_count		= &tcp_orphan_count,
	.memory_allocated	= &tcp_memory_allocated,
	.memory_pressure	= &tcp_memory_pressure,
	.sysctl_mem		= sysctl_tcp_mem,
	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
	.max_header		= MAX_TCP_HEADER,
	.obj_size		= sizeof(struct tcp_sock),
	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
	.twsk_prot		= &tcp_timewait_sock_ops,
	.rsk_prot		= &tcp_request_sock_ops,
	.h.hashinfo		= &tcp_hashinfo,
	.no_autobind		= true,
#ifdef CONFIG_COMPAT
	.compat_setsockopt	= compat_tcp_setsockopt,
	.compat_getsockopt	= compat_tcp_getsockopt,
#endif
	.diag_destroy		= tcp_abort,
};
EXPORT_SYMBOL(tcp_prot);

static void __net_exit tcp_sk_exit(struct net *net)
{
	int cpu;

	if (net->ipv4.tcp_congestion_control)
		module_put(net->ipv4.tcp_congestion_control->owner);

	for_each_possible_cpu(cpu)
		inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
	free_percpu(net->ipv4.tcp_sk);
}

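/* Per-namespace setup: create the per-CPU control sockets used when replying
 * on behalf of packets that have no full socket, and seed every
 * ipv4.sysctl_tcp_* knob with its default.  Non-init namespaces inherit
 * tcp_rmem/tcp_wmem and the congestion control module from init_net.
 */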
static int __net_init tcp_sk_init(struct net *net)
{
	int res, cpu, cnt;

	net->ipv4.tcp_sk = alloc_percpu(struct sock *);
	if (!net->ipv4.tcp_sk)
		return -ENOMEM;

	for_each_possible_cpu(cpu) {
		struct sock *sk;

		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
					   IPPROTO_TCP, net);
		if (res)
			goto fail;
		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);

		/* Please enforce IP_DF and IPID==0 for RST and
		 * ACK sent in SYN-RECV and TIME-WAIT state.
		 */
		inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;

		*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
	}

	net->ipv4.sysctl_tcp_ecn = 2;
	net->ipv4.sysctl_tcp_ecn_fallback = 1;

	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
	net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
	net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;

	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;

	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
	net->ipv4.sysctl_tcp_syncookies = 1;
	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
	net->ipv4.sysctl_tcp_orphan_retries = 0;
	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
	net->ipv4.sysctl_tcp_tw_reuse = 2;

	cnt = tcp_hashinfo.ehash_mask + 1;
	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
	net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;

	net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
	net->ipv4.sysctl_tcp_sack = 1;
	net->ipv4.sysctl_tcp_window_scaling = 1;
	net->ipv4.sysctl_tcp_timestamps = 1;
	net->ipv4.sysctl_tcp_early_retrans = 3;
	net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
	net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */
	net->ipv4.sysctl_tcp_retrans_collapse = 1;
	net->ipv4.sysctl_tcp_max_reordering = 300;
	net->ipv4.sysctl_tcp_dsack = 1;
	net->ipv4.sysctl_tcp_app_win = 31;
	net->ipv4.sysctl_tcp_adv_win_scale = 1;
	net->ipv4.sysctl_tcp_frto = 2;
	net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
	/* This limits the percentage of the congestion window which we
	 * will allow a single TSO frame to consume.  Building TSO frames
	 * which are too large can cause TCP streams to be bursty.
	 */
	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
	/* Default TSQ limit of 16 TSO segments */
	net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
	/* rfc5961 challenge ack rate limiting */
	net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
	net->ipv4.sysctl_tcp_min_tso_segs = 2;
	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
	net->ipv4.sysctl_tcp_autocorking = 1;
	net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
	net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
	net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
	if (net != &init_net) {
		memcpy(net->ipv4.sysctl_tcp_rmem,
		       init_net.ipv4.sysctl_tcp_rmem,
		       sizeof(init_net.ipv4.sysctl_tcp_rmem));
		memcpy(net->ipv4.sysctl_tcp_wmem,
		       init_net.ipv4.sysctl_tcp_wmem,
		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
	}
	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
	spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
	atomic_set(&net->ipv4.tfo_active_disable_times, 0);

	/* Reno is always built in */
	if (!net_eq(net, &init_net) &&
	    try_module_get(init_net.ipv4.tcp_congestion_control->owner))
		net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
	else
		net->ipv4.tcp_congestion_control = &tcp_reno;

	return 0;
fail:
	tcp_sk_exit(net);

	return res;
}

static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
{
	struct net *net;

	inet_twsk_purge(&tcp_hashinfo, AF_INET);

	list_for_each_entry(net, net_exit_list, exit_list)
		tcp_fastopen_ctx_destroy(net);
}

static struct pernet_operations __net_initdata tcp_sk_ops = {
	.init	    = tcp_sk_init,
	.exit	    = tcp_sk_exit,
	.exit_batch = tcp_sk_exit_batch,
};

void __init tcp_v4_init(void)
{
	if (register_pernet_subsys(&tcp_sk_ops))
		panic("Failed to create the TCP control socket.\n");
}