1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * INET An implementation of the TCP/IP protocol suite for the LINUX 4 * operating system. INET is implemented using the BSD Socket 5 * interface as the means of communication with the user level. 6 * 7 * Implementation of the Transmission Control Protocol(TCP). 8 * 9 * IPv4 specific functions 10 * 11 * code split from: 12 * linux/ipv4/tcp.c 13 * linux/ipv4/tcp_input.c 14 * linux/ipv4/tcp_output.c 15 * 16 * See tcp.c for author information 17 */ 18 19 /* 20 * Changes: 21 * David S. Miller : New socket lookup architecture. 22 * This code is dedicated to John Dyson. 23 * David S. Miller : Change semantics of established hash, 24 * half is devoted to TIME_WAIT sockets 25 * and the rest go in the other half. 26 * Andi Kleen : Add support for syncookies and fixed 27 * some bugs: ip options weren't passed to 28 * the TCP layer, missed a check for an 29 * ACK bit. 30 * Andi Kleen : Implemented fast path mtu discovery. 31 * Fixed many serious bugs in the 32 * request_sock handling and moved 33 * most of it into the af independent code. 34 * Added tail drop and some other bugfixes. 35 * Added new listen semantics. 36 * Mike McLagan : Routing by source 37 * Juan Jose Ciarlante: ip_dynaddr bits 38 * Andi Kleen: various fixes. 39 * Vitaly E. Lavrov : Transparent proxy revived after year 40 * coma. 41 * Andi Kleen : Fix new listen. 42 * Andi Kleen : Fix accept error reporting. 43 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which 44 * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind 45 * a single port at the same time. 46 */ 47 48 #define pr_fmt(fmt) "TCP: " fmt 49 50 #include <linux/bottom_half.h> 51 #include <linux/types.h> 52 #include <linux/fcntl.h> 53 #include <linux/module.h> 54 #include <linux/random.h> 55 #include <linux/cache.h> 56 #include <linux/jhash.h> 57 #include <linux/init.h> 58 #include <linux/times.h> 59 #include <linux/slab.h> 60 61 #include <net/net_namespace.h> 62 #include <net/icmp.h> 63 #include <net/inet_hashtables.h> 64 #include <net/tcp.h> 65 #include <net/transp_v6.h> 66 #include <net/ipv6.h> 67 #include <net/inet_common.h> 68 #include <net/timewait_sock.h> 69 #include <net/xfrm.h> 70 #include <net/secure_seq.h> 71 #include <net/busy_poll.h> 72 73 #include <linux/inet.h> 74 #include <linux/ipv6.h> 75 #include <linux/stddef.h> 76 #include <linux/proc_fs.h> 77 #include <linux/seq_file.h> 78 #include <linux/inetdevice.h> 79 80 #include <crypto/hash.h> 81 #include <linux/scatterlist.h> 82 83 #include <trace/events/tcp.h> 84 85 #ifdef CONFIG_TCP_MD5SIG 86 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, 87 __be32 daddr, __be32 saddr, const struct tcphdr *th); 88 #endif 89 90 struct inet_hashinfo tcp_hashinfo; 91 EXPORT_SYMBOL(tcp_hashinfo); 92 93 static u32 tcp_v4_init_seq(const struct sk_buff *skb) 94 { 95 return secure_tcp_seq(ip_hdr(skb)->daddr, 96 ip_hdr(skb)->saddr, 97 tcp_hdr(skb)->dest, 98 tcp_hdr(skb)->source); 99 } 100 101 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb) 102 { 103 return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr); 104 } 105 106 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) 107 { 108 const struct inet_timewait_sock *tw = inet_twsk(sktw); 109 const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw); 110 struct tcp_sock *tp = tcp_sk(sk); 111 int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse; 112 113 if (reuse == 2) { 114 /* Still does not detect *everything* that goes through 115 * lo, since we require a loopback src or dst address 116 * or direct binding to 'lo' interface. 117 */ 118 bool loopback = false; 119 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX) 120 loopback = true; 121 #if IS_ENABLED(CONFIG_IPV6) 122 if (tw->tw_family == AF_INET6) { 123 if (ipv6_addr_loopback(&tw->tw_v6_daddr) || 124 (ipv6_addr_v4mapped(&tw->tw_v6_daddr) && 125 (tw->tw_v6_daddr.s6_addr[12] == 127)) || 126 ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) || 127 (ipv6_addr_v4mapped(&tw->tw_v6_rcv_saddr) && 128 (tw->tw_v6_rcv_saddr.s6_addr[12] == 127))) 129 loopback = true; 130 } else 131 #endif 132 { 133 if (ipv4_is_loopback(tw->tw_daddr) || 134 ipv4_is_loopback(tw->tw_rcv_saddr)) 135 loopback = true; 136 } 137 if (!loopback) 138 reuse = 0; 139 } 140 141 /* With PAWS, it is safe from the viewpoint 142 of data integrity. Even without PAWS it is safe provided sequence 143 spaces do not overlap i.e. at data rates <= 80Mbit/sec. 144 145 Actually, the idea is close to VJ's one, only timestamp cache is 146 held not per host, but per port pair and TW bucket is used as state 147 holder. 148 149 If TW bucket has been already destroyed we fall back to VJ's scheme 150 and use initial timestamp retrieved from peer table. 151 */ 152 if (tcptw->tw_ts_recent_stamp && 153 (!twp || (reuse && time_after32(ktime_get_seconds(), 154 tcptw->tw_ts_recent_stamp)))) { 155 /* In case of repair and re-using TIME-WAIT sockets we still 156 * want to be sure that it is safe as above but honor the 157 * sequence numbers and time stamps set as part of the repair 158 * process. 159 * 160 * Without this check re-using a TIME-WAIT socket with TCP 161 * repair would accumulate a -1 on the repair assigned 162 * sequence number. The first time it is reused the sequence 163 * is -1, the second time -2, etc. This fixes that issue 164 * without appearing to create any others. 165 */ 166 if (likely(!tp->repair)) { 167 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2; 168 if (tp->write_seq == 0) 169 tp->write_seq = 1; 170 tp->rx_opt.ts_recent = tcptw->tw_ts_recent; 171 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp; 172 } 173 sock_hold(sktw); 174 return 1; 175 } 176 177 return 0; 178 } 179 EXPORT_SYMBOL_GPL(tcp_twsk_unique); 180 181 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr, 182 int addr_len) 183 { 184 /* This check is replicated from tcp_v4_connect() and intended to 185 * prevent BPF program called below from accessing bytes that are out 186 * of the bound specified by user in addr_len. 187 */ 188 if (addr_len < sizeof(struct sockaddr_in)) 189 return -EINVAL; 190 191 sock_owned_by_me(sk); 192 193 return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr); 194 } 195 196 /* This will initiate an outgoing connection. */ 197 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) 198 { 199 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr; 200 struct inet_sock *inet = inet_sk(sk); 201 struct tcp_sock *tp = tcp_sk(sk); 202 __be16 orig_sport, orig_dport; 203 __be32 daddr, nexthop; 204 struct flowi4 *fl4; 205 struct rtable *rt; 206 int err; 207 struct ip_options_rcu *inet_opt; 208 struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row; 209 210 if (addr_len < sizeof(struct sockaddr_in)) 211 return -EINVAL; 212 213 if (usin->sin_family != AF_INET) 214 return -EAFNOSUPPORT; 215 216 nexthop = daddr = usin->sin_addr.s_addr; 217 inet_opt = rcu_dereference_protected(inet->inet_opt, 218 lockdep_sock_is_held(sk)); 219 if (inet_opt && inet_opt->opt.srr) { 220 if (!daddr) 221 return -EINVAL; 222 nexthop = inet_opt->opt.faddr; 223 } 224 225 orig_sport = inet->inet_sport; 226 orig_dport = usin->sin_port; 227 fl4 = &inet->cork.fl.u.ip4; 228 rt = ip_route_connect(fl4, nexthop, inet->inet_saddr, 229 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if, 230 IPPROTO_TCP, 231 orig_sport, orig_dport, sk); 232 if (IS_ERR(rt)) { 233 err = PTR_ERR(rt); 234 if (err == -ENETUNREACH) 235 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES); 236 return err; 237 } 238 239 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) { 240 ip_rt_put(rt); 241 return -ENETUNREACH; 242 } 243 244 if (!inet_opt || !inet_opt->opt.srr) 245 daddr = fl4->daddr; 246 247 if (!inet->inet_saddr) 248 inet->inet_saddr = fl4->saddr; 249 sk_rcv_saddr_set(sk, inet->inet_saddr); 250 251 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) { 252 /* Reset inherited state */ 253 tp->rx_opt.ts_recent = 0; 254 tp->rx_opt.ts_recent_stamp = 0; 255 if (likely(!tp->repair)) 256 tp->write_seq = 0; 257 } 258 259 inet->inet_dport = usin->sin_port; 260 sk_daddr_set(sk, daddr); 261 262 inet_csk(sk)->icsk_ext_hdr_len = 0; 263 if (inet_opt) 264 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen; 265 266 tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT; 267 268 /* Socket identity is still unknown (sport may be zero). 269 * However we set state to SYN-SENT and not releasing socket 270 * lock select source port, enter ourselves into the hash tables and 271 * complete initialization after this. 272 */ 273 tcp_set_state(sk, TCP_SYN_SENT); 274 err = inet_hash_connect(tcp_death_row, sk); 275 if (err) 276 goto failure; 277 278 sk_set_txhash(sk); 279 280 rt = ip_route_newports(fl4, rt, orig_sport, orig_dport, 281 inet->inet_sport, inet->inet_dport, sk); 282 if (IS_ERR(rt)) { 283 err = PTR_ERR(rt); 284 rt = NULL; 285 goto failure; 286 } 287 /* OK, now commit destination to socket. */ 288 sk->sk_gso_type = SKB_GSO_TCPV4; 289 sk_setup_caps(sk, &rt->dst); 290 rt = NULL; 291 292 if (likely(!tp->repair)) { 293 if (!tp->write_seq) 294 tp->write_seq = secure_tcp_seq(inet->inet_saddr, 295 inet->inet_daddr, 296 inet->inet_sport, 297 usin->sin_port); 298 tp->tsoffset = secure_tcp_ts_off(sock_net(sk), 299 inet->inet_saddr, 300 inet->inet_daddr); 301 } 302 303 inet->inet_id = tp->write_seq ^ jiffies; 304 305 if (tcp_fastopen_defer_connect(sk, &err)) 306 return err; 307 if (err) 308 goto failure; 309 310 err = tcp_connect(sk); 311 312 if (err) 313 goto failure; 314 315 return 0; 316 317 failure: 318 /* 319 * This unhashes the socket and releases the local port, 320 * if necessary. 321 */ 322 tcp_set_state(sk, TCP_CLOSE); 323 ip_rt_put(rt); 324 sk->sk_route_caps = 0; 325 inet->inet_dport = 0; 326 return err; 327 } 328 EXPORT_SYMBOL(tcp_v4_connect); 329 330 /* 331 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191. 332 * It can be called through tcp_release_cb() if socket was owned by user 333 * at the time tcp_v4_err() was called to handle ICMP message. 334 */ 335 void tcp_v4_mtu_reduced(struct sock *sk) 336 { 337 struct inet_sock *inet = inet_sk(sk); 338 struct dst_entry *dst; 339 u32 mtu; 340 341 if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) 342 return; 343 mtu = tcp_sk(sk)->mtu_info; 344 dst = inet_csk_update_pmtu(sk, mtu); 345 if (!dst) 346 return; 347 348 /* Something is about to be wrong... Remember soft error 349 * for the case, if this connection will not able to recover. 350 */ 351 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst)) 352 sk->sk_err_soft = EMSGSIZE; 353 354 mtu = dst_mtu(dst); 355 356 if (inet->pmtudisc != IP_PMTUDISC_DONT && 357 ip_sk_accept_pmtu(sk) && 358 inet_csk(sk)->icsk_pmtu_cookie > mtu) { 359 tcp_sync_mss(sk, mtu); 360 361 /* Resend the TCP packet because it's 362 * clear that the old packet has been 363 * dropped. This is the new "fast" path mtu 364 * discovery. 365 */ 366 tcp_simple_retransmit(sk); 367 } /* else let the usual retransmit timer handle it */ 368 } 369 EXPORT_SYMBOL(tcp_v4_mtu_reduced); 370 371 static void do_redirect(struct sk_buff *skb, struct sock *sk) 372 { 373 struct dst_entry *dst = __sk_dst_check(sk, 0); 374 375 if (dst) 376 dst->ops->redirect(dst, sk, skb); 377 } 378 379 380 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */ 381 void tcp_req_err(struct sock *sk, u32 seq, bool abort) 382 { 383 struct request_sock *req = inet_reqsk(sk); 384 struct net *net = sock_net(sk); 385 386 /* ICMPs are not backlogged, hence we cannot get 387 * an established socket here. 388 */ 389 if (seq != tcp_rsk(req)->snt_isn) { 390 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS); 391 } else if (abort) { 392 /* 393 * Still in SYN_RECV, just remove it silently. 394 * There is no good way to pass the error to the newly 395 * created socket, and POSIX does not want network 396 * errors returned from accept(). 397 */ 398 inet_csk_reqsk_queue_drop(req->rsk_listener, req); 399 tcp_listendrop(req->rsk_listener); 400 } 401 reqsk_put(req); 402 } 403 EXPORT_SYMBOL(tcp_req_err); 404 405 /* 406 * This routine is called by the ICMP module when it gets some 407 * sort of error condition. If err < 0 then the socket should 408 * be closed and the error returned to the user. If err > 0 409 * it's just the icmp type << 8 | icmp code. After adjustment 410 * header points to the first 8 bytes of the tcp header. We need 411 * to find the appropriate port. 412 * 413 * The locking strategy used here is very "optimistic". When 414 * someone else accesses the socket the ICMP is just dropped 415 * and for some paths there is no check at all. 416 * A more general error queue to queue errors for later handling 417 * is probably better. 418 * 419 */ 420 421 int tcp_v4_err(struct sk_buff *icmp_skb, u32 info) 422 { 423 const struct iphdr *iph = (const struct iphdr *)icmp_skb->data; 424 struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2)); 425 struct inet_connection_sock *icsk; 426 struct tcp_sock *tp; 427 struct inet_sock *inet; 428 const int type = icmp_hdr(icmp_skb)->type; 429 const int code = icmp_hdr(icmp_skb)->code; 430 struct sock *sk; 431 struct sk_buff *skb; 432 struct request_sock *fastopen; 433 u32 seq, snd_una; 434 s32 remaining; 435 u32 delta_us; 436 int err; 437 struct net *net = dev_net(icmp_skb->dev); 438 439 sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr, 440 th->dest, iph->saddr, ntohs(th->source), 441 inet_iif(icmp_skb), 0); 442 if (!sk) { 443 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); 444 return -ENOENT; 445 } 446 if (sk->sk_state == TCP_TIME_WAIT) { 447 inet_twsk_put(inet_twsk(sk)); 448 return 0; 449 } 450 seq = ntohl(th->seq); 451 if (sk->sk_state == TCP_NEW_SYN_RECV) { 452 tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB || 453 type == ICMP_TIME_EXCEEDED || 454 (type == ICMP_DEST_UNREACH && 455 (code == ICMP_NET_UNREACH || 456 code == ICMP_HOST_UNREACH))); 457 return 0; 458 } 459 460 bh_lock_sock(sk); 461 /* If too many ICMPs get dropped on busy 462 * servers this needs to be solved differently. 463 * We do take care of PMTU discovery (RFC1191) special case : 464 * we can receive locally generated ICMP messages while socket is held. 465 */ 466 if (sock_owned_by_user(sk)) { 467 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)) 468 __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS); 469 } 470 if (sk->sk_state == TCP_CLOSE) 471 goto out; 472 473 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) { 474 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); 475 goto out; 476 } 477 478 icsk = inet_csk(sk); 479 tp = tcp_sk(sk); 480 /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */ 481 fastopen = tp->fastopen_rsk; 482 snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una; 483 if (sk->sk_state != TCP_LISTEN && 484 !between(seq, snd_una, tp->snd_nxt)) { 485 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS); 486 goto out; 487 } 488 489 switch (type) { 490 case ICMP_REDIRECT: 491 if (!sock_owned_by_user(sk)) 492 do_redirect(icmp_skb, sk); 493 goto out; 494 case ICMP_SOURCE_QUENCH: 495 /* Just silently ignore these. */ 496 goto out; 497 case ICMP_PARAMETERPROB: 498 err = EPROTO; 499 break; 500 case ICMP_DEST_UNREACH: 501 if (code > NR_ICMP_UNREACH) 502 goto out; 503 504 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */ 505 /* We are not interested in TCP_LISTEN and open_requests 506 * (SYN-ACKs send out by Linux are always <576bytes so 507 * they should go through unfragmented). 508 */ 509 if (sk->sk_state == TCP_LISTEN) 510 goto out; 511 512 tp->mtu_info = info; 513 if (!sock_owned_by_user(sk)) { 514 tcp_v4_mtu_reduced(sk); 515 } else { 516 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags)) 517 sock_hold(sk); 518 } 519 goto out; 520 } 521 522 err = icmp_err_convert[code].errno; 523 /* check if icmp_skb allows revert of backoff 524 * (see draft-zimmermann-tcp-lcd) */ 525 if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH) 526 break; 527 if (seq != tp->snd_una || !icsk->icsk_retransmits || 528 !icsk->icsk_backoff || fastopen) 529 break; 530 531 if (sock_owned_by_user(sk)) 532 break; 533 534 skb = tcp_rtx_queue_head(sk); 535 if (WARN_ON_ONCE(!skb)) 536 break; 537 538 icsk->icsk_backoff--; 539 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : 540 TCP_TIMEOUT_INIT; 541 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX); 542 543 544 tcp_mstamp_refresh(tp); 545 delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb)); 546 remaining = icsk->icsk_rto - 547 usecs_to_jiffies(delta_us); 548 549 if (remaining > 0) { 550 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, 551 remaining, TCP_RTO_MAX); 552 } else { 553 /* RTO revert clocked out retransmission. 554 * Will retransmit now */ 555 tcp_retransmit_timer(sk); 556 } 557 558 break; 559 case ICMP_TIME_EXCEEDED: 560 err = EHOSTUNREACH; 561 break; 562 default: 563 goto out; 564 } 565 566 switch (sk->sk_state) { 567 case TCP_SYN_SENT: 568 case TCP_SYN_RECV: 569 /* Only in fast or simultaneous open. If a fast open socket is 570 * is already accepted it is treated as a connected one below. 571 */ 572 if (fastopen && !fastopen->sk) 573 break; 574 575 if (!sock_owned_by_user(sk)) { 576 sk->sk_err = err; 577 578 sk->sk_error_report(sk); 579 580 tcp_done(sk); 581 } else { 582 sk->sk_err_soft = err; 583 } 584 goto out; 585 } 586 587 /* If we've already connected we will keep trying 588 * until we time out, or the user gives up. 589 * 590 * rfc1122 4.2.3.9 allows to consider as hard errors 591 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too, 592 * but it is obsoleted by pmtu discovery). 593 * 594 * Note, that in modern internet, where routing is unreliable 595 * and in each dark corner broken firewalls sit, sending random 596 * errors ordered by their masters even this two messages finally lose 597 * their original sense (even Linux sends invalid PORT_UNREACHs) 598 * 599 * Now we are in compliance with RFCs. 600 * --ANK (980905) 601 */ 602 603 inet = inet_sk(sk); 604 if (!sock_owned_by_user(sk) && inet->recverr) { 605 sk->sk_err = err; 606 sk->sk_error_report(sk); 607 } else { /* Only an error on timeout */ 608 sk->sk_err_soft = err; 609 } 610 611 out: 612 bh_unlock_sock(sk); 613 sock_put(sk); 614 return 0; 615 } 616 617 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr) 618 { 619 struct tcphdr *th = tcp_hdr(skb); 620 621 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0); 622 skb->csum_start = skb_transport_header(skb) - skb->head; 623 skb->csum_offset = offsetof(struct tcphdr, check); 624 } 625 626 /* This routine computes an IPv4 TCP checksum. */ 627 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb) 628 { 629 const struct inet_sock *inet = inet_sk(sk); 630 631 __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr); 632 } 633 EXPORT_SYMBOL(tcp_v4_send_check); 634 635 /* 636 * This routine will send an RST to the other tcp. 637 * 638 * Someone asks: why I NEVER use socket parameters (TOS, TTL etc.) 639 * for reset. 640 * Answer: if a packet caused RST, it is not for a socket 641 * existing in our system, if it is matched to a socket, 642 * it is just duplicate segment or bug in other side's TCP. 643 * So that we build reply only basing on parameters 644 * arrived with segment. 645 * Exception: precedence violation. We do not implement it in any case. 646 */ 647 648 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) 649 { 650 const struct tcphdr *th = tcp_hdr(skb); 651 struct { 652 struct tcphdr th; 653 #ifdef CONFIG_TCP_MD5SIG 654 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)]; 655 #endif 656 } rep; 657 struct ip_reply_arg arg; 658 #ifdef CONFIG_TCP_MD5SIG 659 struct tcp_md5sig_key *key = NULL; 660 const __u8 *hash_location = NULL; 661 unsigned char newhash[16]; 662 int genhash; 663 struct sock *sk1 = NULL; 664 #endif 665 struct net *net; 666 struct sock *ctl_sk; 667 668 /* Never send a reset in response to a reset. */ 669 if (th->rst) 670 return; 671 672 /* If sk not NULL, it means we did a successful lookup and incoming 673 * route had to be correct. prequeue might have dropped our dst. 674 */ 675 if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL) 676 return; 677 678 /* Swap the send and the receive. */ 679 memset(&rep, 0, sizeof(rep)); 680 rep.th.dest = th->source; 681 rep.th.source = th->dest; 682 rep.th.doff = sizeof(struct tcphdr) / 4; 683 rep.th.rst = 1; 684 685 if (th->ack) { 686 rep.th.seq = th->ack_seq; 687 } else { 688 rep.th.ack = 1; 689 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin + 690 skb->len - (th->doff << 2)); 691 } 692 693 memset(&arg, 0, sizeof(arg)); 694 arg.iov[0].iov_base = (unsigned char *)&rep; 695 arg.iov[0].iov_len = sizeof(rep.th); 696 697 net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev); 698 #ifdef CONFIG_TCP_MD5SIG 699 rcu_read_lock(); 700 hash_location = tcp_parse_md5sig_option(th); 701 if (sk && sk_fullsock(sk)) { 702 key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *) 703 &ip_hdr(skb)->saddr, AF_INET); 704 } else if (hash_location) { 705 /* 706 * active side is lost. Try to find listening socket through 707 * source port, and then find md5 key through listening socket. 708 * we are not loose security here: 709 * Incoming packet is checked with md5 hash with finding key, 710 * no RST generated if md5 hash doesn't match. 711 */ 712 sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0, 713 ip_hdr(skb)->saddr, 714 th->source, ip_hdr(skb)->daddr, 715 ntohs(th->source), inet_iif(skb), 716 tcp_v4_sdif(skb)); 717 /* don't send rst if it can't find key */ 718 if (!sk1) 719 goto out; 720 721 key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *) 722 &ip_hdr(skb)->saddr, AF_INET); 723 if (!key) 724 goto out; 725 726 727 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb); 728 if (genhash || memcmp(hash_location, newhash, 16) != 0) 729 goto out; 730 731 } 732 733 if (key) { 734 rep.opt[0] = htonl((TCPOPT_NOP << 24) | 735 (TCPOPT_NOP << 16) | 736 (TCPOPT_MD5SIG << 8) | 737 TCPOLEN_MD5SIG); 738 /* Update length and the length the header thinks exists */ 739 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED; 740 rep.th.doff = arg.iov[0].iov_len / 4; 741 742 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1], 743 key, ip_hdr(skb)->saddr, 744 ip_hdr(skb)->daddr, &rep.th); 745 } 746 #endif 747 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr, 748 ip_hdr(skb)->saddr, /* XXX */ 749 arg.iov[0].iov_len, IPPROTO_TCP, 0); 750 arg.csumoffset = offsetof(struct tcphdr, check) / 2; 751 arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0; 752 753 /* When socket is gone, all binding information is lost. 754 * routing might fail in this case. No choice here, if we choose to force 755 * input interface, we will misroute in case of asymmetric route. 756 */ 757 if (sk) { 758 arg.bound_dev_if = sk->sk_bound_dev_if; 759 if (sk_fullsock(sk)) 760 trace_tcp_send_reset(sk, skb); 761 } 762 763 BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) != 764 offsetof(struct inet_timewait_sock, tw_bound_dev_if)); 765 766 arg.tos = ip_hdr(skb)->tos; 767 arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL); 768 local_bh_disable(); 769 ctl_sk = this_cpu_read(*net->ipv4.tcp_sk); 770 if (sk) { 771 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? 772 inet_twsk(sk)->tw_mark : sk->sk_mark; 773 tcp_set_tx_time(skb, sk); 774 } 775 ip_send_unicast_reply(ctl_sk, 776 skb, &TCP_SKB_CB(skb)->header.h4.opt, 777 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, 778 &arg, arg.iov[0].iov_len); 779 780 ctl_sk->sk_mark = 0; 781 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS); 782 __TCP_INC_STATS(net, TCP_MIB_OUTRSTS); 783 local_bh_enable(); 784 785 #ifdef CONFIG_TCP_MD5SIG 786 out: 787 rcu_read_unlock(); 788 #endif 789 } 790 791 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states 792 outside socket context is ugly, certainly. What can I do? 793 */ 794 795 static void tcp_v4_send_ack(const struct sock *sk, 796 struct sk_buff *skb, u32 seq, u32 ack, 797 u32 win, u32 tsval, u32 tsecr, int oif, 798 struct tcp_md5sig_key *key, 799 int reply_flags, u8 tos) 800 { 801 const struct tcphdr *th = tcp_hdr(skb); 802 struct { 803 struct tcphdr th; 804 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2) 805 #ifdef CONFIG_TCP_MD5SIG 806 + (TCPOLEN_MD5SIG_ALIGNED >> 2) 807 #endif 808 ]; 809 } rep; 810 struct net *net = sock_net(sk); 811 struct ip_reply_arg arg; 812 struct sock *ctl_sk; 813 814 memset(&rep.th, 0, sizeof(struct tcphdr)); 815 memset(&arg, 0, sizeof(arg)); 816 817 arg.iov[0].iov_base = (unsigned char *)&rep; 818 arg.iov[0].iov_len = sizeof(rep.th); 819 if (tsecr) { 820 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | 821 (TCPOPT_TIMESTAMP << 8) | 822 TCPOLEN_TIMESTAMP); 823 rep.opt[1] = htonl(tsval); 824 rep.opt[2] = htonl(tsecr); 825 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED; 826 } 827 828 /* Swap the send and the receive. */ 829 rep.th.dest = th->source; 830 rep.th.source = th->dest; 831 rep.th.doff = arg.iov[0].iov_len / 4; 832 rep.th.seq = htonl(seq); 833 rep.th.ack_seq = htonl(ack); 834 rep.th.ack = 1; 835 rep.th.window = htons(win); 836 837 #ifdef CONFIG_TCP_MD5SIG 838 if (key) { 839 int offset = (tsecr) ? 3 : 0; 840 841 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) | 842 (TCPOPT_NOP << 16) | 843 (TCPOPT_MD5SIG << 8) | 844 TCPOLEN_MD5SIG); 845 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED; 846 rep.th.doff = arg.iov[0].iov_len/4; 847 848 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset], 849 key, ip_hdr(skb)->saddr, 850 ip_hdr(skb)->daddr, &rep.th); 851 } 852 #endif 853 arg.flags = reply_flags; 854 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr, 855 ip_hdr(skb)->saddr, /* XXX */ 856 arg.iov[0].iov_len, IPPROTO_TCP, 0); 857 arg.csumoffset = offsetof(struct tcphdr, check) / 2; 858 if (oif) 859 arg.bound_dev_if = oif; 860 arg.tos = tos; 861 arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL); 862 local_bh_disable(); 863 ctl_sk = this_cpu_read(*net->ipv4.tcp_sk); 864 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? 865 inet_twsk(sk)->tw_mark : sk->sk_mark; 866 tcp_set_tx_time(skb, sk); 867 ip_send_unicast_reply(ctl_sk, 868 skb, &TCP_SKB_CB(skb)->header.h4.opt, 869 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, 870 &arg, arg.iov[0].iov_len); 871 872 ctl_sk->sk_mark = 0; 873 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS); 874 local_bh_enable(); 875 } 876 877 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) 878 { 879 struct inet_timewait_sock *tw = inet_twsk(sk); 880 struct tcp_timewait_sock *tcptw = tcp_twsk(sk); 881 882 tcp_v4_send_ack(sk, skb, 883 tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, 884 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, 885 tcp_time_stamp_raw() + tcptw->tw_ts_offset, 886 tcptw->tw_ts_recent, 887 tw->tw_bound_dev_if, 888 tcp_twsk_md5_key(tcptw), 889 tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0, 890 tw->tw_tos 891 ); 892 893 inet_twsk_put(tw); 894 } 895 896 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, 897 struct request_sock *req) 898 { 899 /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV 900 * sk->sk_state == TCP_SYN_RECV -> for Fast Open. 901 */ 902 u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 : 903 tcp_sk(sk)->snd_nxt; 904 905 /* RFC 7323 2.3 906 * The window field (SEG.WND) of every outgoing segment, with the 907 * exception of <SYN> segments, MUST be right-shifted by 908 * Rcv.Wind.Shift bits: 909 */ 910 tcp_v4_send_ack(sk, skb, seq, 911 tcp_rsk(req)->rcv_nxt, 912 req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale, 913 tcp_time_stamp_raw() + tcp_rsk(req)->ts_off, 914 req->ts_recent, 915 0, 916 tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->saddr, 917 AF_INET), 918 inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0, 919 ip_hdr(skb)->tos); 920 } 921 922 /* 923 * Send a SYN-ACK after having received a SYN. 924 * This still operates on a request_sock only, not on a big 925 * socket. 926 */ 927 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, 928 struct flowi *fl, 929 struct request_sock *req, 930 struct tcp_fastopen_cookie *foc, 931 enum tcp_synack_type synack_type) 932 { 933 const struct inet_request_sock *ireq = inet_rsk(req); 934 struct flowi4 fl4; 935 int err = -1; 936 struct sk_buff *skb; 937 938 /* First, grab a route. */ 939 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL) 940 return -1; 941 942 skb = tcp_make_synack(sk, dst, req, foc, synack_type); 943 944 if (skb) { 945 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr); 946 947 rcu_read_lock(); 948 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr, 949 ireq->ir_rmt_addr, 950 rcu_dereference(ireq->ireq_opt)); 951 rcu_read_unlock(); 952 err = net_xmit_eval(err); 953 } 954 955 return err; 956 } 957 958 /* 959 * IPv4 request_sock destructor. 960 */ 961 static void tcp_v4_reqsk_destructor(struct request_sock *req) 962 { 963 kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1)); 964 } 965 966 #ifdef CONFIG_TCP_MD5SIG 967 /* 968 * RFC2385 MD5 checksumming requires a mapping of 969 * IP address->MD5 Key. 970 * We need to maintain these in the sk structure. 971 */ 972 973 DEFINE_STATIC_KEY_FALSE(tcp_md5_needed); 974 EXPORT_SYMBOL(tcp_md5_needed); 975 976 /* Find the Key structure for an address. */ 977 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, 978 const union tcp_md5_addr *addr, 979 int family) 980 { 981 const struct tcp_sock *tp = tcp_sk(sk); 982 struct tcp_md5sig_key *key; 983 const struct tcp_md5sig_info *md5sig; 984 __be32 mask; 985 struct tcp_md5sig_key *best_match = NULL; 986 bool match; 987 988 /* caller either holds rcu_read_lock() or socket lock */ 989 md5sig = rcu_dereference_check(tp->md5sig_info, 990 lockdep_sock_is_held(sk)); 991 if (!md5sig) 992 return NULL; 993 994 hlist_for_each_entry_rcu(key, &md5sig->head, node) { 995 if (key->family != family) 996 continue; 997 998 if (family == AF_INET) { 999 mask = inet_make_mask(key->prefixlen); 1000 match = (key->addr.a4.s_addr & mask) == 1001 (addr->a4.s_addr & mask); 1002 #if IS_ENABLED(CONFIG_IPV6) 1003 } else if (family == AF_INET6) { 1004 match = ipv6_prefix_equal(&key->addr.a6, &addr->a6, 1005 key->prefixlen); 1006 #endif 1007 } else { 1008 match = false; 1009 } 1010 1011 if (match && (!best_match || 1012 key->prefixlen > best_match->prefixlen)) 1013 best_match = key; 1014 } 1015 return best_match; 1016 } 1017 EXPORT_SYMBOL(__tcp_md5_do_lookup); 1018 1019 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk, 1020 const union tcp_md5_addr *addr, 1021 int family, u8 prefixlen) 1022 { 1023 const struct tcp_sock *tp = tcp_sk(sk); 1024 struct tcp_md5sig_key *key; 1025 unsigned int size = sizeof(struct in_addr); 1026 const struct tcp_md5sig_info *md5sig; 1027 1028 /* caller either holds rcu_read_lock() or socket lock */ 1029 md5sig = rcu_dereference_check(tp->md5sig_info, 1030 lockdep_sock_is_held(sk)); 1031 if (!md5sig) 1032 return NULL; 1033 #if IS_ENABLED(CONFIG_IPV6) 1034 if (family == AF_INET6) 1035 size = sizeof(struct in6_addr); 1036 #endif 1037 hlist_for_each_entry_rcu(key, &md5sig->head, node) { 1038 if (key->family != family) 1039 continue; 1040 if (!memcmp(&key->addr, addr, size) && 1041 key->prefixlen == prefixlen) 1042 return key; 1043 } 1044 return NULL; 1045 } 1046 1047 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk, 1048 const struct sock *addr_sk) 1049 { 1050 const union tcp_md5_addr *addr; 1051 1052 addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr; 1053 return tcp_md5_do_lookup(sk, addr, AF_INET); 1054 } 1055 EXPORT_SYMBOL(tcp_v4_md5_lookup); 1056 1057 /* This can be called on a newly created socket, from other files */ 1058 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, 1059 int family, u8 prefixlen, const u8 *newkey, u8 newkeylen, 1060 gfp_t gfp) 1061 { 1062 /* Add Key to the list */ 1063 struct tcp_md5sig_key *key; 1064 struct tcp_sock *tp = tcp_sk(sk); 1065 struct tcp_md5sig_info *md5sig; 1066 1067 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen); 1068 if (key) { 1069 /* Pre-existing entry - just update that one. */ 1070 memcpy(key->key, newkey, newkeylen); 1071 key->keylen = newkeylen; 1072 return 0; 1073 } 1074 1075 md5sig = rcu_dereference_protected(tp->md5sig_info, 1076 lockdep_sock_is_held(sk)); 1077 if (!md5sig) { 1078 md5sig = kmalloc(sizeof(*md5sig), gfp); 1079 if (!md5sig) 1080 return -ENOMEM; 1081 1082 sk_nocaps_add(sk, NETIF_F_GSO_MASK); 1083 INIT_HLIST_HEAD(&md5sig->head); 1084 rcu_assign_pointer(tp->md5sig_info, md5sig); 1085 } 1086 1087 key = sock_kmalloc(sk, sizeof(*key), gfp); 1088 if (!key) 1089 return -ENOMEM; 1090 if (!tcp_alloc_md5sig_pool()) { 1091 sock_kfree_s(sk, key, sizeof(*key)); 1092 return -ENOMEM; 1093 } 1094 1095 memcpy(key->key, newkey, newkeylen); 1096 key->keylen = newkeylen; 1097 key->family = family; 1098 key->prefixlen = prefixlen; 1099 memcpy(&key->addr, addr, 1100 (family == AF_INET6) ? sizeof(struct in6_addr) : 1101 sizeof(struct in_addr)); 1102 hlist_add_head_rcu(&key->node, &md5sig->head); 1103 return 0; 1104 } 1105 EXPORT_SYMBOL(tcp_md5_do_add); 1106 1107 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family, 1108 u8 prefixlen) 1109 { 1110 struct tcp_md5sig_key *key; 1111 1112 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen); 1113 if (!key) 1114 return -ENOENT; 1115 hlist_del_rcu(&key->node); 1116 atomic_sub(sizeof(*key), &sk->sk_omem_alloc); 1117 kfree_rcu(key, rcu); 1118 return 0; 1119 } 1120 EXPORT_SYMBOL(tcp_md5_do_del); 1121 1122 static void tcp_clear_md5_list(struct sock *sk) 1123 { 1124 struct tcp_sock *tp = tcp_sk(sk); 1125 struct tcp_md5sig_key *key; 1126 struct hlist_node *n; 1127 struct tcp_md5sig_info *md5sig; 1128 1129 md5sig = rcu_dereference_protected(tp->md5sig_info, 1); 1130 1131 hlist_for_each_entry_safe(key, n, &md5sig->head, node) { 1132 hlist_del_rcu(&key->node); 1133 atomic_sub(sizeof(*key), &sk->sk_omem_alloc); 1134 kfree_rcu(key, rcu); 1135 } 1136 } 1137 1138 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname, 1139 char __user *optval, int optlen) 1140 { 1141 struct tcp_md5sig cmd; 1142 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr; 1143 u8 prefixlen = 32; 1144 1145 if (optlen < sizeof(cmd)) 1146 return -EINVAL; 1147 1148 if (copy_from_user(&cmd, optval, sizeof(cmd))) 1149 return -EFAULT; 1150 1151 if (sin->sin_family != AF_INET) 1152 return -EINVAL; 1153 1154 if (optname == TCP_MD5SIG_EXT && 1155 cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) { 1156 prefixlen = cmd.tcpm_prefixlen; 1157 if (prefixlen > 32) 1158 return -EINVAL; 1159 } 1160 1161 if (!cmd.tcpm_keylen) 1162 return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr, 1163 AF_INET, prefixlen); 1164 1165 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN) 1166 return -EINVAL; 1167 1168 return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr, 1169 AF_INET, prefixlen, cmd.tcpm_key, cmd.tcpm_keylen, 1170 GFP_KERNEL); 1171 } 1172 1173 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp, 1174 __be32 daddr, __be32 saddr, 1175 const struct tcphdr *th, int nbytes) 1176 { 1177 struct tcp4_pseudohdr *bp; 1178 struct scatterlist sg; 1179 struct tcphdr *_th; 1180 1181 bp = hp->scratch; 1182 bp->saddr = saddr; 1183 bp->daddr = daddr; 1184 bp->pad = 0; 1185 bp->protocol = IPPROTO_TCP; 1186 bp->len = cpu_to_be16(nbytes); 1187 1188 _th = (struct tcphdr *)(bp + 1); 1189 memcpy(_th, th, sizeof(*th)); 1190 _th->check = 0; 1191 1192 sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th)); 1193 ahash_request_set_crypt(hp->md5_req, &sg, NULL, 1194 sizeof(*bp) + sizeof(*th)); 1195 return crypto_ahash_update(hp->md5_req); 1196 } 1197 1198 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, 1199 __be32 daddr, __be32 saddr, const struct tcphdr *th) 1200 { 1201 struct tcp_md5sig_pool *hp; 1202 struct ahash_request *req; 1203 1204 hp = tcp_get_md5sig_pool(); 1205 if (!hp) 1206 goto clear_hash_noput; 1207 req = hp->md5_req; 1208 1209 if (crypto_ahash_init(req)) 1210 goto clear_hash; 1211 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2)) 1212 goto clear_hash; 1213 if (tcp_md5_hash_key(hp, key)) 1214 goto clear_hash; 1215 ahash_request_set_crypt(req, NULL, md5_hash, 0); 1216 if (crypto_ahash_final(req)) 1217 goto clear_hash; 1218 1219 tcp_put_md5sig_pool(); 1220 return 0; 1221 1222 clear_hash: 1223 tcp_put_md5sig_pool(); 1224 clear_hash_noput: 1225 memset(md5_hash, 0, 16); 1226 return 1; 1227 } 1228 1229 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key, 1230 const struct sock *sk, 1231 const struct sk_buff *skb) 1232 { 1233 struct tcp_md5sig_pool *hp; 1234 struct ahash_request *req; 1235 const struct tcphdr *th = tcp_hdr(skb); 1236 __be32 saddr, daddr; 1237 1238 if (sk) { /* valid for establish/request sockets */ 1239 saddr = sk->sk_rcv_saddr; 1240 daddr = sk->sk_daddr; 1241 } else { 1242 const struct iphdr *iph = ip_hdr(skb); 1243 saddr = iph->saddr; 1244 daddr = iph->daddr; 1245 } 1246 1247 hp = tcp_get_md5sig_pool(); 1248 if (!hp) 1249 goto clear_hash_noput; 1250 req = hp->md5_req; 1251 1252 if (crypto_ahash_init(req)) 1253 goto clear_hash; 1254 1255 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len)) 1256 goto clear_hash; 1257 if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2)) 1258 goto clear_hash; 1259 if (tcp_md5_hash_key(hp, key)) 1260 goto clear_hash; 1261 ahash_request_set_crypt(req, NULL, md5_hash, 0); 1262 if (crypto_ahash_final(req)) 1263 goto clear_hash; 1264 1265 tcp_put_md5sig_pool(); 1266 return 0; 1267 1268 clear_hash: 1269 tcp_put_md5sig_pool(); 1270 clear_hash_noput: 1271 memset(md5_hash, 0, 16); 1272 return 1; 1273 } 1274 EXPORT_SYMBOL(tcp_v4_md5_hash_skb); 1275 1276 #endif 1277 1278 /* Called with rcu_read_lock() */ 1279 static bool tcp_v4_inbound_md5_hash(const struct sock *sk, 1280 const struct sk_buff *skb) 1281 { 1282 #ifdef CONFIG_TCP_MD5SIG 1283 /* 1284 * This gets called for each TCP segment that arrives 1285 * so we want to be efficient. 1286 * We have 3 drop cases: 1287 * o No MD5 hash and one expected. 1288 * o MD5 hash and we're not expecting one. 1289 * o MD5 hash and its wrong. 1290 */ 1291 const __u8 *hash_location = NULL; 1292 struct tcp_md5sig_key *hash_expected; 1293 const struct iphdr *iph = ip_hdr(skb); 1294 const struct tcphdr *th = tcp_hdr(skb); 1295 int genhash; 1296 unsigned char newhash[16]; 1297 1298 hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr, 1299 AF_INET); 1300 hash_location = tcp_parse_md5sig_option(th); 1301 1302 /* We've parsed the options - do we have a hash? */ 1303 if (!hash_expected && !hash_location) 1304 return false; 1305 1306 if (hash_expected && !hash_location) { 1307 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND); 1308 return true; 1309 } 1310 1311 if (!hash_expected && hash_location) { 1312 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED); 1313 return true; 1314 } 1315 1316 /* Okay, so this is hash_expected and hash_location - 1317 * so we need to calculate the checksum. 1318 */ 1319 genhash = tcp_v4_md5_hash_skb(newhash, 1320 hash_expected, 1321 NULL, skb); 1322 1323 if (genhash || memcmp(hash_location, newhash, 16) != 0) { 1324 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE); 1325 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n", 1326 &iph->saddr, ntohs(th->source), 1327 &iph->daddr, ntohs(th->dest), 1328 genhash ? " tcp_v4_calc_md5_hash failed" 1329 : ""); 1330 return true; 1331 } 1332 return false; 1333 #endif 1334 return false; 1335 } 1336 1337 static void tcp_v4_init_req(struct request_sock *req, 1338 const struct sock *sk_listener, 1339 struct sk_buff *skb) 1340 { 1341 struct inet_request_sock *ireq = inet_rsk(req); 1342 struct net *net = sock_net(sk_listener); 1343 1344 sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr); 1345 sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr); 1346 RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb)); 1347 } 1348 1349 static struct dst_entry *tcp_v4_route_req(const struct sock *sk, 1350 struct flowi *fl, 1351 const struct request_sock *req) 1352 { 1353 return inet_csk_route_req(sk, &fl->u.ip4, req); 1354 } 1355 1356 struct request_sock_ops tcp_request_sock_ops __read_mostly = { 1357 .family = PF_INET, 1358 .obj_size = sizeof(struct tcp_request_sock), 1359 .rtx_syn_ack = tcp_rtx_synack, 1360 .send_ack = tcp_v4_reqsk_send_ack, 1361 .destructor = tcp_v4_reqsk_destructor, 1362 .send_reset = tcp_v4_send_reset, 1363 .syn_ack_timeout = tcp_syn_ack_timeout, 1364 }; 1365 1366 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { 1367 .mss_clamp = TCP_MSS_DEFAULT, 1368 #ifdef CONFIG_TCP_MD5SIG 1369 .req_md5_lookup = tcp_v4_md5_lookup, 1370 .calc_md5_hash = tcp_v4_md5_hash_skb, 1371 #endif 1372 .init_req = tcp_v4_init_req, 1373 #ifdef CONFIG_SYN_COOKIES 1374 .cookie_init_seq = cookie_v4_init_sequence, 1375 #endif 1376 .route_req = tcp_v4_route_req, 1377 .init_seq = tcp_v4_init_seq, 1378 .init_ts_off = tcp_v4_init_ts_off, 1379 .send_synack = tcp_v4_send_synack, 1380 }; 1381 1382 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) 1383 { 1384 /* Never answer to SYNs send to broadcast or multicast */ 1385 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) 1386 goto drop; 1387 1388 return tcp_conn_request(&tcp_request_sock_ops, 1389 &tcp_request_sock_ipv4_ops, sk, skb); 1390 1391 drop: 1392 tcp_listendrop(sk); 1393 return 0; 1394 } 1395 EXPORT_SYMBOL(tcp_v4_conn_request); 1396 1397 1398 /* 1399 * The three way handshake has completed - we got a valid synack - 1400 * now create the new socket. 1401 */ 1402 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, 1403 struct request_sock *req, 1404 struct dst_entry *dst, 1405 struct request_sock *req_unhash, 1406 bool *own_req) 1407 { 1408 struct inet_request_sock *ireq; 1409 struct inet_sock *newinet; 1410 struct tcp_sock *newtp; 1411 struct sock *newsk; 1412 #ifdef CONFIG_TCP_MD5SIG 1413 struct tcp_md5sig_key *key; 1414 #endif 1415 struct ip_options_rcu *inet_opt; 1416 1417 if (sk_acceptq_is_full(sk)) 1418 goto exit_overflow; 1419 1420 newsk = tcp_create_openreq_child(sk, req, skb); 1421 if (!newsk) 1422 goto exit_nonewsk; 1423 1424 newsk->sk_gso_type = SKB_GSO_TCPV4; 1425 inet_sk_rx_dst_set(newsk, skb); 1426 1427 newtp = tcp_sk(newsk); 1428 newinet = inet_sk(newsk); 1429 ireq = inet_rsk(req); 1430 sk_daddr_set(newsk, ireq->ir_rmt_addr); 1431 sk_rcv_saddr_set(newsk, ireq->ir_loc_addr); 1432 newsk->sk_bound_dev_if = ireq->ir_iif; 1433 newinet->inet_saddr = ireq->ir_loc_addr; 1434 inet_opt = rcu_dereference(ireq->ireq_opt); 1435 RCU_INIT_POINTER(newinet->inet_opt, inet_opt); 1436 newinet->mc_index = inet_iif(skb); 1437 newinet->mc_ttl = ip_hdr(skb)->ttl; 1438 newinet->rcv_tos = ip_hdr(skb)->tos; 1439 inet_csk(newsk)->icsk_ext_hdr_len = 0; 1440 if (inet_opt) 1441 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen; 1442 newinet->inet_id = newtp->write_seq ^ jiffies; 1443 1444 if (!dst) { 1445 dst = inet_csk_route_child_sock(sk, newsk, req); 1446 if (!dst) 1447 goto put_and_exit; 1448 } else { 1449 /* syncookie case : see end of cookie_v4_check() */ 1450 } 1451 sk_setup_caps(newsk, dst); 1452 1453 tcp_ca_openreq_child(newsk, dst); 1454 1455 tcp_sync_mss(newsk, dst_mtu(dst)); 1456 newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst)); 1457 1458 tcp_initialize_rcv_mss(newsk); 1459 1460 #ifdef CONFIG_TCP_MD5SIG 1461 /* Copy over the MD5 key from the original socket */ 1462 key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr, 1463 AF_INET); 1464 if (key) { 1465 /* 1466 * We're using one, so create a matching key 1467 * on the newsk structure. If we fail to get 1468 * memory, then we end up not copying the key 1469 * across. Shucks. 1470 */ 1471 tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr, 1472 AF_INET, 32, key->key, key->keylen, GFP_ATOMIC); 1473 sk_nocaps_add(newsk, NETIF_F_GSO_MASK); 1474 } 1475 #endif 1476 1477 if (__inet_inherit_port(sk, newsk) < 0) 1478 goto put_and_exit; 1479 *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash)); 1480 if (likely(*own_req)) { 1481 tcp_move_syn(newtp, req); 1482 ireq->ireq_opt = NULL; 1483 } else { 1484 newinet->inet_opt = NULL; 1485 } 1486 return newsk; 1487 1488 exit_overflow: 1489 NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); 1490 exit_nonewsk: 1491 dst_release(dst); 1492 exit: 1493 tcp_listendrop(sk); 1494 return NULL; 1495 put_and_exit: 1496 newinet->inet_opt = NULL; 1497 inet_csk_prepare_forced_close(newsk); 1498 tcp_done(newsk); 1499 goto exit; 1500 } 1501 EXPORT_SYMBOL(tcp_v4_syn_recv_sock); 1502 1503 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb) 1504 { 1505 #ifdef CONFIG_SYN_COOKIES 1506 const struct tcphdr *th = tcp_hdr(skb); 1507 1508 if (!th->syn) 1509 sk = cookie_v4_check(sk, skb); 1510 #endif 1511 return sk; 1512 } 1513 1514 /* The socket must have it's spinlock held when we get 1515 * here, unless it is a TCP_LISTEN socket. 1516 * 1517 * We have a potential double-lock case here, so even when 1518 * doing backlog processing we use the BH locking scheme. 1519 * This is because we cannot sleep with the original spinlock 1520 * held. 1521 */ 1522 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) 1523 { 1524 struct sock *rsk; 1525 1526 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ 1527 struct dst_entry *dst = sk->sk_rx_dst; 1528 1529 sock_rps_save_rxhash(sk, skb); 1530 sk_mark_napi_id(sk, skb); 1531 if (dst) { 1532 if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif || 1533 !dst->ops->check(dst, 0)) { 1534 dst_release(dst); 1535 sk->sk_rx_dst = NULL; 1536 } 1537 } 1538 tcp_rcv_established(sk, skb); 1539 return 0; 1540 } 1541 1542 if (tcp_checksum_complete(skb)) 1543 goto csum_err; 1544 1545 if (sk->sk_state == TCP_LISTEN) { 1546 struct sock *nsk = tcp_v4_cookie_check(sk, skb); 1547 1548 if (!nsk) 1549 goto discard; 1550 if (nsk != sk) { 1551 if (tcp_child_process(sk, nsk, skb)) { 1552 rsk = nsk; 1553 goto reset; 1554 } 1555 return 0; 1556 } 1557 } else 1558 sock_rps_save_rxhash(sk, skb); 1559 1560 if (tcp_rcv_state_process(sk, skb)) { 1561 rsk = sk; 1562 goto reset; 1563 } 1564 return 0; 1565 1566 reset: 1567 tcp_v4_send_reset(rsk, skb); 1568 discard: 1569 kfree_skb(skb); 1570 /* Be careful here. If this function gets more complicated and 1571 * gcc suffers from register pressure on the x86, sk (in %ebx) 1572 * might be destroyed here. This current version compiles correctly, 1573 * but you have been warned. 1574 */ 1575 return 0; 1576 1577 csum_err: 1578 TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); 1579 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); 1580 goto discard; 1581 } 1582 EXPORT_SYMBOL(tcp_v4_do_rcv); 1583 1584 int tcp_v4_early_demux(struct sk_buff *skb) 1585 { 1586 const struct iphdr *iph; 1587 const struct tcphdr *th; 1588 struct sock *sk; 1589 1590 if (skb->pkt_type != PACKET_HOST) 1591 return 0; 1592 1593 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr))) 1594 return 0; 1595 1596 iph = ip_hdr(skb); 1597 th = tcp_hdr(skb); 1598 1599 if (th->doff < sizeof(struct tcphdr) / 4) 1600 return 0; 1601 1602 sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo, 1603 iph->saddr, th->source, 1604 iph->daddr, ntohs(th->dest), 1605 skb->skb_iif, inet_sdif(skb)); 1606 if (sk) { 1607 skb->sk = sk; 1608 skb->destructor = sock_edemux; 1609 if (sk_fullsock(sk)) { 1610 struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst); 1611 1612 if (dst) 1613 dst = dst_check(dst, 0); 1614 if (dst && 1615 inet_sk(sk)->rx_dst_ifindex == skb->skb_iif) 1616 skb_dst_set_noref(skb, dst); 1617 } 1618 } 1619 return 0; 1620 } 1621 1622 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb) 1623 { 1624 u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf; 1625 struct skb_shared_info *shinfo; 1626 const struct tcphdr *th; 1627 struct tcphdr *thtail; 1628 struct sk_buff *tail; 1629 unsigned int hdrlen; 1630 bool fragstolen; 1631 u32 gso_segs; 1632 int delta; 1633 1634 /* In case all data was pulled from skb frags (in __pskb_pull_tail()), 1635 * we can fix skb->truesize to its real value to avoid future drops. 1636 * This is valid because skb is not yet charged to the socket. 1637 * It has been noticed pure SACK packets were sometimes dropped 1638 * (if cooked by drivers without copybreak feature). 1639 */ 1640 skb_condense(skb); 1641 1642 skb_dst_drop(skb); 1643 1644 if (unlikely(tcp_checksum_complete(skb))) { 1645 bh_unlock_sock(sk); 1646 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); 1647 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); 1648 return true; 1649 } 1650 1651 /* Attempt coalescing to last skb in backlog, even if we are 1652 * above the limits. 1653 * This is okay because skb capacity is limited to MAX_SKB_FRAGS. 1654 */ 1655 th = (const struct tcphdr *)skb->data; 1656 hdrlen = th->doff * 4; 1657 shinfo = skb_shinfo(skb); 1658 1659 if (!shinfo->gso_size) 1660 shinfo->gso_size = skb->len - hdrlen; 1661 1662 if (!shinfo->gso_segs) 1663 shinfo->gso_segs = 1; 1664 1665 tail = sk->sk_backlog.tail; 1666 if (!tail) 1667 goto no_coalesce; 1668 thtail = (struct tcphdr *)tail->data; 1669 1670 if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq || 1671 TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield || 1672 ((TCP_SKB_CB(tail)->tcp_flags | 1673 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) || 1674 !((TCP_SKB_CB(tail)->tcp_flags & 1675 TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) || 1676 ((TCP_SKB_CB(tail)->tcp_flags ^ 1677 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) || 1678 #ifdef CONFIG_TLS_DEVICE 1679 tail->decrypted != skb->decrypted || 1680 #endif 1681 thtail->doff != th->doff || 1682 memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th))) 1683 goto no_coalesce; 1684 1685 __skb_pull(skb, hdrlen); 1686 if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) { 1687 thtail->window = th->window; 1688 1689 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq; 1690 1691 if (after(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq)) 1692 TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq; 1693 1694 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and 1695 * thtail->fin, so that the fast path in tcp_rcv_established() 1696 * is not entered if we append a packet with a FIN. 1697 * SYN, RST, URG are not present. 1698 * ACK is set on both packets. 1699 * PSH : we do not really care in TCP stack, 1700 * at least for 'GRO' packets. 1701 */ 1702 thtail->fin |= th->fin; 1703 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags; 1704 1705 if (TCP_SKB_CB(skb)->has_rxtstamp) { 1706 TCP_SKB_CB(tail)->has_rxtstamp = true; 1707 tail->tstamp = skb->tstamp; 1708 skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp; 1709 } 1710 1711 /* Not as strict as GRO. We only need to carry mss max value */ 1712 skb_shinfo(tail)->gso_size = max(shinfo->gso_size, 1713 skb_shinfo(tail)->gso_size); 1714 1715 gso_segs = skb_shinfo(tail)->gso_segs + shinfo->gso_segs; 1716 skb_shinfo(tail)->gso_segs = min_t(u32, gso_segs, 0xFFFF); 1717 1718 sk->sk_backlog.len += delta; 1719 __NET_INC_STATS(sock_net(sk), 1720 LINUX_MIB_TCPBACKLOGCOALESCE); 1721 kfree_skb_partial(skb, fragstolen); 1722 return false; 1723 } 1724 __skb_push(skb, hdrlen); 1725 1726 no_coalesce: 1727 /* Only socket owner can try to collapse/prune rx queues 1728 * to reduce memory overhead, so add a little headroom here. 1729 * Few sockets backlog are possibly concurrently non empty. 1730 */ 1731 limit += 64*1024; 1732 1733 if (unlikely(sk_add_backlog(sk, skb, limit))) { 1734 bh_unlock_sock(sk); 1735 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP); 1736 return true; 1737 } 1738 return false; 1739 } 1740 EXPORT_SYMBOL(tcp_add_backlog); 1741 1742 int tcp_filter(struct sock *sk, struct sk_buff *skb) 1743 { 1744 struct tcphdr *th = (struct tcphdr *)skb->data; 1745 1746 return sk_filter_trim_cap(sk, skb, th->doff * 4); 1747 } 1748 EXPORT_SYMBOL(tcp_filter); 1749 1750 static void tcp_v4_restore_cb(struct sk_buff *skb) 1751 { 1752 memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4, 1753 sizeof(struct inet_skb_parm)); 1754 } 1755 1756 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph, 1757 const struct tcphdr *th) 1758 { 1759 /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB() 1760 * barrier() makes sure compiler wont play fool^Waliasing games. 1761 */ 1762 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb), 1763 sizeof(struct inet_skb_parm)); 1764 barrier(); 1765 1766 TCP_SKB_CB(skb)->seq = ntohl(th->seq); 1767 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + 1768 skb->len - th->doff * 4); 1769 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); 1770 TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th); 1771 TCP_SKB_CB(skb)->tcp_tw_isn = 0; 1772 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph); 1773 TCP_SKB_CB(skb)->sacked = 0; 1774 TCP_SKB_CB(skb)->has_rxtstamp = 1775 skb->tstamp || skb_hwtstamps(skb)->hwtstamp; 1776 } 1777 1778 /* 1779 * From tcp_input.c 1780 */ 1781 1782 int tcp_v4_rcv(struct sk_buff *skb) 1783 { 1784 struct net *net = dev_net(skb->dev); 1785 struct sk_buff *skb_to_free; 1786 int sdif = inet_sdif(skb); 1787 const struct iphdr *iph; 1788 const struct tcphdr *th; 1789 bool refcounted; 1790 struct sock *sk; 1791 int ret; 1792 1793 if (skb->pkt_type != PACKET_HOST) 1794 goto discard_it; 1795 1796 /* Count it even if it's bad */ 1797 __TCP_INC_STATS(net, TCP_MIB_INSEGS); 1798 1799 if (!pskb_may_pull(skb, sizeof(struct tcphdr))) 1800 goto discard_it; 1801 1802 th = (const struct tcphdr *)skb->data; 1803 1804 if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) 1805 goto bad_packet; 1806 if (!pskb_may_pull(skb, th->doff * 4)) 1807 goto discard_it; 1808 1809 /* An explanation is required here, I think. 1810 * Packet length and doff are validated by header prediction, 1811 * provided case of th->doff==0 is eliminated. 1812 * So, we defer the checks. */ 1813 1814 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo)) 1815 goto csum_error; 1816 1817 th = (const struct tcphdr *)skb->data; 1818 iph = ip_hdr(skb); 1819 lookup: 1820 sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source, 1821 th->dest, sdif, &refcounted); 1822 if (!sk) 1823 goto no_tcp_socket; 1824 1825 process: 1826 if (sk->sk_state == TCP_TIME_WAIT) 1827 goto do_time_wait; 1828 1829 if (sk->sk_state == TCP_NEW_SYN_RECV) { 1830 struct request_sock *req = inet_reqsk(sk); 1831 bool req_stolen = false; 1832 struct sock *nsk; 1833 1834 sk = req->rsk_listener; 1835 if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) { 1836 sk_drops_add(sk, skb); 1837 reqsk_put(req); 1838 goto discard_it; 1839 } 1840 if (tcp_checksum_complete(skb)) { 1841 reqsk_put(req); 1842 goto csum_error; 1843 } 1844 if (unlikely(sk->sk_state != TCP_LISTEN)) { 1845 inet_csk_reqsk_queue_drop_and_put(sk, req); 1846 goto lookup; 1847 } 1848 /* We own a reference on the listener, increase it again 1849 * as we might lose it too soon. 1850 */ 1851 sock_hold(sk); 1852 refcounted = true; 1853 nsk = NULL; 1854 if (!tcp_filter(sk, skb)) { 1855 th = (const struct tcphdr *)skb->data; 1856 iph = ip_hdr(skb); 1857 tcp_v4_fill_cb(skb, iph, th); 1858 nsk = tcp_check_req(sk, skb, req, false, &req_stolen); 1859 } 1860 if (!nsk) { 1861 reqsk_put(req); 1862 if (req_stolen) { 1863 /* Another cpu got exclusive access to req 1864 * and created a full blown socket. 1865 * Try to feed this packet to this socket 1866 * instead of discarding it. 1867 */ 1868 tcp_v4_restore_cb(skb); 1869 sock_put(sk); 1870 goto lookup; 1871 } 1872 goto discard_and_relse; 1873 } 1874 if (nsk == sk) { 1875 reqsk_put(req); 1876 tcp_v4_restore_cb(skb); 1877 } else if (tcp_child_process(sk, nsk, skb)) { 1878 tcp_v4_send_reset(nsk, skb); 1879 goto discard_and_relse; 1880 } else { 1881 sock_put(sk); 1882 return 0; 1883 } 1884 } 1885 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) { 1886 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); 1887 goto discard_and_relse; 1888 } 1889 1890 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) 1891 goto discard_and_relse; 1892 1893 if (tcp_v4_inbound_md5_hash(sk, skb)) 1894 goto discard_and_relse; 1895 1896 nf_reset(skb); 1897 1898 if (tcp_filter(sk, skb)) 1899 goto discard_and_relse; 1900 th = (const struct tcphdr *)skb->data; 1901 iph = ip_hdr(skb); 1902 tcp_v4_fill_cb(skb, iph, th); 1903 1904 skb->dev = NULL; 1905 1906 if (sk->sk_state == TCP_LISTEN) { 1907 ret = tcp_v4_do_rcv(sk, skb); 1908 goto put_and_return; 1909 } 1910 1911 sk_incoming_cpu_update(sk); 1912 1913 bh_lock_sock_nested(sk); 1914 tcp_segs_in(tcp_sk(sk), skb); 1915 ret = 0; 1916 if (!sock_owned_by_user(sk)) { 1917 skb_to_free = sk->sk_rx_skb_cache; 1918 sk->sk_rx_skb_cache = NULL; 1919 ret = tcp_v4_do_rcv(sk, skb); 1920 } else { 1921 if (tcp_add_backlog(sk, skb)) 1922 goto discard_and_relse; 1923 skb_to_free = NULL; 1924 } 1925 bh_unlock_sock(sk); 1926 if (skb_to_free) 1927 __kfree_skb(skb_to_free); 1928 1929 put_and_return: 1930 if (refcounted) 1931 sock_put(sk); 1932 1933 return ret; 1934 1935 no_tcp_socket: 1936 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) 1937 goto discard_it; 1938 1939 tcp_v4_fill_cb(skb, iph, th); 1940 1941 if (tcp_checksum_complete(skb)) { 1942 csum_error: 1943 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS); 1944 bad_packet: 1945 __TCP_INC_STATS(net, TCP_MIB_INERRS); 1946 } else { 1947 tcp_v4_send_reset(NULL, skb); 1948 } 1949 1950 discard_it: 1951 /* Discard frame. */ 1952 kfree_skb(skb); 1953 return 0; 1954 1955 discard_and_relse: 1956 sk_drops_add(sk, skb); 1957 if (refcounted) 1958 sock_put(sk); 1959 goto discard_it; 1960 1961 do_time_wait: 1962 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { 1963 inet_twsk_put(inet_twsk(sk)); 1964 goto discard_it; 1965 } 1966 1967 tcp_v4_fill_cb(skb, iph, th); 1968 1969 if (tcp_checksum_complete(skb)) { 1970 inet_twsk_put(inet_twsk(sk)); 1971 goto csum_error; 1972 } 1973 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) { 1974 case TCP_TW_SYN: { 1975 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev), 1976 &tcp_hashinfo, skb, 1977 __tcp_hdrlen(th), 1978 iph->saddr, th->source, 1979 iph->daddr, th->dest, 1980 inet_iif(skb), 1981 sdif); 1982 if (sk2) { 1983 inet_twsk_deschedule_put(inet_twsk(sk)); 1984 sk = sk2; 1985 tcp_v4_restore_cb(skb); 1986 refcounted = false; 1987 goto process; 1988 } 1989 } 1990 /* to ACK */ 1991 /* fall through */ 1992 case TCP_TW_ACK: 1993 tcp_v4_timewait_ack(sk, skb); 1994 break; 1995 case TCP_TW_RST: 1996 tcp_v4_send_reset(sk, skb); 1997 inet_twsk_deschedule_put(inet_twsk(sk)); 1998 goto discard_it; 1999 case TCP_TW_SUCCESS:; 2000 } 2001 goto discard_it; 2002 } 2003 2004 static struct timewait_sock_ops tcp_timewait_sock_ops = { 2005 .twsk_obj_size = sizeof(struct tcp_timewait_sock), 2006 .twsk_unique = tcp_twsk_unique, 2007 .twsk_destructor= tcp_twsk_destructor, 2008 }; 2009 2010 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) 2011 { 2012 struct dst_entry *dst = skb_dst(skb); 2013 2014 if (dst && dst_hold_safe(dst)) { 2015 sk->sk_rx_dst = dst; 2016 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif; 2017 } 2018 } 2019 EXPORT_SYMBOL(inet_sk_rx_dst_set); 2020 2021 const struct inet_connection_sock_af_ops ipv4_specific = { 2022 .queue_xmit = ip_queue_xmit, 2023 .send_check = tcp_v4_send_check, 2024 .rebuild_header = inet_sk_rebuild_header, 2025 .sk_rx_dst_set = inet_sk_rx_dst_set, 2026 .conn_request = tcp_v4_conn_request, 2027 .syn_recv_sock = tcp_v4_syn_recv_sock, 2028 .net_header_len = sizeof(struct iphdr), 2029 .setsockopt = ip_setsockopt, 2030 .getsockopt = ip_getsockopt, 2031 .addr2sockaddr = inet_csk_addr2sockaddr, 2032 .sockaddr_len = sizeof(struct sockaddr_in), 2033 #ifdef CONFIG_COMPAT 2034 .compat_setsockopt = compat_ip_setsockopt, 2035 .compat_getsockopt = compat_ip_getsockopt, 2036 #endif 2037 .mtu_reduced = tcp_v4_mtu_reduced, 2038 }; 2039 EXPORT_SYMBOL(ipv4_specific); 2040 2041 #ifdef CONFIG_TCP_MD5SIG 2042 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = { 2043 .md5_lookup = tcp_v4_md5_lookup, 2044 .calc_md5_hash = tcp_v4_md5_hash_skb, 2045 .md5_parse = tcp_v4_parse_md5_keys, 2046 }; 2047 #endif 2048 2049 /* NOTE: A lot of things set to zero explicitly by call to 2050 * sk_alloc() so need not be done here. 2051 */ 2052 static int tcp_v4_init_sock(struct sock *sk) 2053 { 2054 struct inet_connection_sock *icsk = inet_csk(sk); 2055 2056 tcp_init_sock(sk); 2057 2058 icsk->icsk_af_ops = &ipv4_specific; 2059 2060 #ifdef CONFIG_TCP_MD5SIG 2061 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific; 2062 #endif 2063 2064 return 0; 2065 } 2066 2067 void tcp_v4_destroy_sock(struct sock *sk) 2068 { 2069 struct tcp_sock *tp = tcp_sk(sk); 2070 2071 trace_tcp_destroy_sock(sk); 2072 2073 tcp_clear_xmit_timers(sk); 2074 2075 tcp_cleanup_congestion_control(sk); 2076 2077 tcp_cleanup_ulp(sk); 2078 2079 /* Cleanup up the write buffer. */ 2080 tcp_write_queue_purge(sk); 2081 2082 /* Check if we want to disable active TFO */ 2083 tcp_fastopen_active_disable_ofo_check(sk); 2084 2085 /* Cleans up our, hopefully empty, out_of_order_queue. */ 2086 skb_rbtree_purge(&tp->out_of_order_queue); 2087 2088 #ifdef CONFIG_TCP_MD5SIG 2089 /* Clean up the MD5 key list, if any */ 2090 if (tp->md5sig_info) { 2091 tcp_clear_md5_list(sk); 2092 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu); 2093 tp->md5sig_info = NULL; 2094 } 2095 #endif 2096 2097 /* Clean up a referenced TCP bind bucket. */ 2098 if (inet_csk(sk)->icsk_bind_hash) 2099 inet_put_port(sk); 2100 2101 BUG_ON(tp->fastopen_rsk); 2102 2103 /* If socket is aborted during connect operation */ 2104 tcp_free_fastopen_req(tp); 2105 tcp_fastopen_destroy_cipher(sk); 2106 tcp_saved_syn_free(tp); 2107 2108 sk_sockets_allocated_dec(sk); 2109 } 2110 EXPORT_SYMBOL(tcp_v4_destroy_sock); 2111 2112 #ifdef CONFIG_PROC_FS 2113 /* Proc filesystem TCP sock list dumping. */ 2114 2115 /* 2116 * Get next listener socket follow cur. If cur is NULL, get first socket 2117 * starting from bucket given in st->bucket; when st->bucket is zero the 2118 * very first socket in the hash table is returned. 2119 */ 2120 static void *listening_get_next(struct seq_file *seq, void *cur) 2121 { 2122 struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file)); 2123 struct tcp_iter_state *st = seq->private; 2124 struct net *net = seq_file_net(seq); 2125 struct inet_listen_hashbucket *ilb; 2126 struct sock *sk = cur; 2127 2128 if (!sk) { 2129 get_head: 2130 ilb = &tcp_hashinfo.listening_hash[st->bucket]; 2131 spin_lock(&ilb->lock); 2132 sk = sk_head(&ilb->head); 2133 st->offset = 0; 2134 goto get_sk; 2135 } 2136 ilb = &tcp_hashinfo.listening_hash[st->bucket]; 2137 ++st->num; 2138 ++st->offset; 2139 2140 sk = sk_next(sk); 2141 get_sk: 2142 sk_for_each_from(sk) { 2143 if (!net_eq(sock_net(sk), net)) 2144 continue; 2145 if (sk->sk_family == afinfo->family) 2146 return sk; 2147 } 2148 spin_unlock(&ilb->lock); 2149 st->offset = 0; 2150 if (++st->bucket < INET_LHTABLE_SIZE) 2151 goto get_head; 2152 return NULL; 2153 } 2154 2155 static void *listening_get_idx(struct seq_file *seq, loff_t *pos) 2156 { 2157 struct tcp_iter_state *st = seq->private; 2158 void *rc; 2159 2160 st->bucket = 0; 2161 st->offset = 0; 2162 rc = listening_get_next(seq, NULL); 2163 2164 while (rc && *pos) { 2165 rc = listening_get_next(seq, rc); 2166 --*pos; 2167 } 2168 return rc; 2169 } 2170 2171 static inline bool empty_bucket(const struct tcp_iter_state *st) 2172 { 2173 return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain); 2174 } 2175 2176 /* 2177 * Get first established socket starting from bucket given in st->bucket. 2178 * If st->bucket is zero, the very first socket in the hash is returned. 2179 */ 2180 static void *established_get_first(struct seq_file *seq) 2181 { 2182 struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file)); 2183 struct tcp_iter_state *st = seq->private; 2184 struct net *net = seq_file_net(seq); 2185 void *rc = NULL; 2186 2187 st->offset = 0; 2188 for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) { 2189 struct sock *sk; 2190 struct hlist_nulls_node *node; 2191 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket); 2192 2193 /* Lockless fast path for the common case of empty buckets */ 2194 if (empty_bucket(st)) 2195 continue; 2196 2197 spin_lock_bh(lock); 2198 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) { 2199 if (sk->sk_family != afinfo->family || 2200 !net_eq(sock_net(sk), net)) { 2201 continue; 2202 } 2203 rc = sk; 2204 goto out; 2205 } 2206 spin_unlock_bh(lock); 2207 } 2208 out: 2209 return rc; 2210 } 2211 2212 static void *established_get_next(struct seq_file *seq, void *cur) 2213 { 2214 struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file)); 2215 struct sock *sk = cur; 2216 struct hlist_nulls_node *node; 2217 struct tcp_iter_state *st = seq->private; 2218 struct net *net = seq_file_net(seq); 2219 2220 ++st->num; 2221 ++st->offset; 2222 2223 sk = sk_nulls_next(sk); 2224 2225 sk_nulls_for_each_from(sk, node) { 2226 if (sk->sk_family == afinfo->family && 2227 net_eq(sock_net(sk), net)) 2228 return sk; 2229 } 2230 2231 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); 2232 ++st->bucket; 2233 return established_get_first(seq); 2234 } 2235 2236 static void *established_get_idx(struct seq_file *seq, loff_t pos) 2237 { 2238 struct tcp_iter_state *st = seq->private; 2239 void *rc; 2240 2241 st->bucket = 0; 2242 rc = established_get_first(seq); 2243 2244 while (rc && pos) { 2245 rc = established_get_next(seq, rc); 2246 --pos; 2247 } 2248 return rc; 2249 } 2250 2251 static void *tcp_get_idx(struct seq_file *seq, loff_t pos) 2252 { 2253 void *rc; 2254 struct tcp_iter_state *st = seq->private; 2255 2256 st->state = TCP_SEQ_STATE_LISTENING; 2257 rc = listening_get_idx(seq, &pos); 2258 2259 if (!rc) { 2260 st->state = TCP_SEQ_STATE_ESTABLISHED; 2261 rc = established_get_idx(seq, pos); 2262 } 2263 2264 return rc; 2265 } 2266 2267 static void *tcp_seek_last_pos(struct seq_file *seq) 2268 { 2269 struct tcp_iter_state *st = seq->private; 2270 int offset = st->offset; 2271 int orig_num = st->num; 2272 void *rc = NULL; 2273 2274 switch (st->state) { 2275 case TCP_SEQ_STATE_LISTENING: 2276 if (st->bucket >= INET_LHTABLE_SIZE) 2277 break; 2278 st->state = TCP_SEQ_STATE_LISTENING; 2279 rc = listening_get_next(seq, NULL); 2280 while (offset-- && rc) 2281 rc = listening_get_next(seq, rc); 2282 if (rc) 2283 break; 2284 st->bucket = 0; 2285 st->state = TCP_SEQ_STATE_ESTABLISHED; 2286 /* Fallthrough */ 2287 case TCP_SEQ_STATE_ESTABLISHED: 2288 if (st->bucket > tcp_hashinfo.ehash_mask) 2289 break; 2290 rc = established_get_first(seq); 2291 while (offset-- && rc) 2292 rc = established_get_next(seq, rc); 2293 } 2294 2295 st->num = orig_num; 2296 2297 return rc; 2298 } 2299 2300 void *tcp_seq_start(struct seq_file *seq, loff_t *pos) 2301 { 2302 struct tcp_iter_state *st = seq->private; 2303 void *rc; 2304 2305 if (*pos && *pos == st->last_pos) { 2306 rc = tcp_seek_last_pos(seq); 2307 if (rc) 2308 goto out; 2309 } 2310 2311 st->state = TCP_SEQ_STATE_LISTENING; 2312 st->num = 0; 2313 st->bucket = 0; 2314 st->offset = 0; 2315 rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; 2316 2317 out: 2318 st->last_pos = *pos; 2319 return rc; 2320 } 2321 EXPORT_SYMBOL(tcp_seq_start); 2322 2323 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) 2324 { 2325 struct tcp_iter_state *st = seq->private; 2326 void *rc = NULL; 2327 2328 if (v == SEQ_START_TOKEN) { 2329 rc = tcp_get_idx(seq, 0); 2330 goto out; 2331 } 2332 2333 switch (st->state) { 2334 case TCP_SEQ_STATE_LISTENING: 2335 rc = listening_get_next(seq, v); 2336 if (!rc) { 2337 st->state = TCP_SEQ_STATE_ESTABLISHED; 2338 st->bucket = 0; 2339 st->offset = 0; 2340 rc = established_get_first(seq); 2341 } 2342 break; 2343 case TCP_SEQ_STATE_ESTABLISHED: 2344 rc = established_get_next(seq, v); 2345 break; 2346 } 2347 out: 2348 ++*pos; 2349 st->last_pos = *pos; 2350 return rc; 2351 } 2352 EXPORT_SYMBOL(tcp_seq_next); 2353 2354 void tcp_seq_stop(struct seq_file *seq, void *v) 2355 { 2356 struct tcp_iter_state *st = seq->private; 2357 2358 switch (st->state) { 2359 case TCP_SEQ_STATE_LISTENING: 2360 if (v != SEQ_START_TOKEN) 2361 spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock); 2362 break; 2363 case TCP_SEQ_STATE_ESTABLISHED: 2364 if (v) 2365 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); 2366 break; 2367 } 2368 } 2369 EXPORT_SYMBOL(tcp_seq_stop); 2370 2371 static void get_openreq4(const struct request_sock *req, 2372 struct seq_file *f, int i) 2373 { 2374 const struct inet_request_sock *ireq = inet_rsk(req); 2375 long delta = req->rsk_timer.expires - jiffies; 2376 2377 seq_printf(f, "%4d: %08X:%04X %08X:%04X" 2378 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK", 2379 i, 2380 ireq->ir_loc_addr, 2381 ireq->ir_num, 2382 ireq->ir_rmt_addr, 2383 ntohs(ireq->ir_rmt_port), 2384 TCP_SYN_RECV, 2385 0, 0, /* could print option size, but that is af dependent. */ 2386 1, /* timers active (only the expire timer) */ 2387 jiffies_delta_to_clock_t(delta), 2388 req->num_timeout, 2389 from_kuid_munged(seq_user_ns(f), 2390 sock_i_uid(req->rsk_listener)), 2391 0, /* non standard timer */ 2392 0, /* open_requests have no inode */ 2393 0, 2394 req); 2395 } 2396 2397 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) 2398 { 2399 int timer_active; 2400 unsigned long timer_expires; 2401 const struct tcp_sock *tp = tcp_sk(sk); 2402 const struct inet_connection_sock *icsk = inet_csk(sk); 2403 const struct inet_sock *inet = inet_sk(sk); 2404 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq; 2405 __be32 dest = inet->inet_daddr; 2406 __be32 src = inet->inet_rcv_saddr; 2407 __u16 destp = ntohs(inet->inet_dport); 2408 __u16 srcp = ntohs(inet->inet_sport); 2409 int rx_queue; 2410 int state; 2411 2412 if (icsk->icsk_pending == ICSK_TIME_RETRANS || 2413 icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT || 2414 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { 2415 timer_active = 1; 2416 timer_expires = icsk->icsk_timeout; 2417 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) { 2418 timer_active = 4; 2419 timer_expires = icsk->icsk_timeout; 2420 } else if (timer_pending(&sk->sk_timer)) { 2421 timer_active = 2; 2422 timer_expires = sk->sk_timer.expires; 2423 } else { 2424 timer_active = 0; 2425 timer_expires = jiffies; 2426 } 2427 2428 state = inet_sk_state_load(sk); 2429 if (state == TCP_LISTEN) 2430 rx_queue = sk->sk_ack_backlog; 2431 else 2432 /* Because we don't lock the socket, 2433 * we might find a transient negative value. 2434 */ 2435 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0); 2436 2437 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX " 2438 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d", 2439 i, src, srcp, dest, destp, state, 2440 tp->write_seq - tp->snd_una, 2441 rx_queue, 2442 timer_active, 2443 jiffies_delta_to_clock_t(timer_expires - jiffies), 2444 icsk->icsk_retransmits, 2445 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)), 2446 icsk->icsk_probes_out, 2447 sock_i_ino(sk), 2448 refcount_read(&sk->sk_refcnt), sk, 2449 jiffies_to_clock_t(icsk->icsk_rto), 2450 jiffies_to_clock_t(icsk->icsk_ack.ato), 2451 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk), 2452 tp->snd_cwnd, 2453 state == TCP_LISTEN ? 2454 fastopenq->max_qlen : 2455 (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh)); 2456 } 2457 2458 static void get_timewait4_sock(const struct inet_timewait_sock *tw, 2459 struct seq_file *f, int i) 2460 { 2461 long delta = tw->tw_timer.expires - jiffies; 2462 __be32 dest, src; 2463 __u16 destp, srcp; 2464 2465 dest = tw->tw_daddr; 2466 src = tw->tw_rcv_saddr; 2467 destp = ntohs(tw->tw_dport); 2468 srcp = ntohs(tw->tw_sport); 2469 2470 seq_printf(f, "%4d: %08X:%04X %08X:%04X" 2471 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK", 2472 i, src, srcp, dest, destp, tw->tw_substate, 0, 0, 2473 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0, 2474 refcount_read(&tw->tw_refcnt), tw); 2475 } 2476 2477 #define TMPSZ 150 2478 2479 static int tcp4_seq_show(struct seq_file *seq, void *v) 2480 { 2481 struct tcp_iter_state *st; 2482 struct sock *sk = v; 2483 2484 seq_setwidth(seq, TMPSZ - 1); 2485 if (v == SEQ_START_TOKEN) { 2486 seq_puts(seq, " sl local_address rem_address st tx_queue " 2487 "rx_queue tr tm->when retrnsmt uid timeout " 2488 "inode"); 2489 goto out; 2490 } 2491 st = seq->private; 2492 2493 if (sk->sk_state == TCP_TIME_WAIT) 2494 get_timewait4_sock(v, seq, st->num); 2495 else if (sk->sk_state == TCP_NEW_SYN_RECV) 2496 get_openreq4(v, seq, st->num); 2497 else 2498 get_tcp4_sock(v, seq, st->num); 2499 out: 2500 seq_pad(seq, '\n'); 2501 return 0; 2502 } 2503 2504 static const struct seq_operations tcp4_seq_ops = { 2505 .show = tcp4_seq_show, 2506 .start = tcp_seq_start, 2507 .next = tcp_seq_next, 2508 .stop = tcp_seq_stop, 2509 }; 2510 2511 static struct tcp_seq_afinfo tcp4_seq_afinfo = { 2512 .family = AF_INET, 2513 }; 2514 2515 static int __net_init tcp4_proc_init_net(struct net *net) 2516 { 2517 if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops, 2518 sizeof(struct tcp_iter_state), &tcp4_seq_afinfo)) 2519 return -ENOMEM; 2520 return 0; 2521 } 2522 2523 static void __net_exit tcp4_proc_exit_net(struct net *net) 2524 { 2525 remove_proc_entry("tcp", net->proc_net); 2526 } 2527 2528 static struct pernet_operations tcp4_net_ops = { 2529 .init = tcp4_proc_init_net, 2530 .exit = tcp4_proc_exit_net, 2531 }; 2532 2533 int __init tcp4_proc_init(void) 2534 { 2535 return register_pernet_subsys(&tcp4_net_ops); 2536 } 2537 2538 void tcp4_proc_exit(void) 2539 { 2540 unregister_pernet_subsys(&tcp4_net_ops); 2541 } 2542 #endif /* CONFIG_PROC_FS */ 2543 2544 struct proto tcp_prot = { 2545 .name = "TCP", 2546 .owner = THIS_MODULE, 2547 .close = tcp_close, 2548 .pre_connect = tcp_v4_pre_connect, 2549 .connect = tcp_v4_connect, 2550 .disconnect = tcp_disconnect, 2551 .accept = inet_csk_accept, 2552 .ioctl = tcp_ioctl, 2553 .init = tcp_v4_init_sock, 2554 .destroy = tcp_v4_destroy_sock, 2555 .shutdown = tcp_shutdown, 2556 .setsockopt = tcp_setsockopt, 2557 .getsockopt = tcp_getsockopt, 2558 .keepalive = tcp_set_keepalive, 2559 .recvmsg = tcp_recvmsg, 2560 .sendmsg = tcp_sendmsg, 2561 .sendpage = tcp_sendpage, 2562 .backlog_rcv = tcp_v4_do_rcv, 2563 .release_cb = tcp_release_cb, 2564 .hash = inet_hash, 2565 .unhash = inet_unhash, 2566 .get_port = inet_csk_get_port, 2567 .enter_memory_pressure = tcp_enter_memory_pressure, 2568 .leave_memory_pressure = tcp_leave_memory_pressure, 2569 .stream_memory_free = tcp_stream_memory_free, 2570 .sockets_allocated = &tcp_sockets_allocated, 2571 .orphan_count = &tcp_orphan_count, 2572 .memory_allocated = &tcp_memory_allocated, 2573 .memory_pressure = &tcp_memory_pressure, 2574 .sysctl_mem = sysctl_tcp_mem, 2575 .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem), 2576 .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), 2577 .max_header = MAX_TCP_HEADER, 2578 .obj_size = sizeof(struct tcp_sock), 2579 .slab_flags = SLAB_TYPESAFE_BY_RCU, 2580 .twsk_prot = &tcp_timewait_sock_ops, 2581 .rsk_prot = &tcp_request_sock_ops, 2582 .h.hashinfo = &tcp_hashinfo, 2583 .no_autobind = true, 2584 #ifdef CONFIG_COMPAT 2585 .compat_setsockopt = compat_tcp_setsockopt, 2586 .compat_getsockopt = compat_tcp_getsockopt, 2587 #endif 2588 .diag_destroy = tcp_abort, 2589 }; 2590 EXPORT_SYMBOL(tcp_prot); 2591 2592 static void __net_exit tcp_sk_exit(struct net *net) 2593 { 2594 int cpu; 2595 2596 if (net->ipv4.tcp_congestion_control) 2597 module_put(net->ipv4.tcp_congestion_control->owner); 2598 2599 for_each_possible_cpu(cpu) 2600 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu)); 2601 free_percpu(net->ipv4.tcp_sk); 2602 } 2603 2604 static int __net_init tcp_sk_init(struct net *net) 2605 { 2606 int res, cpu, cnt; 2607 2608 net->ipv4.tcp_sk = alloc_percpu(struct sock *); 2609 if (!net->ipv4.tcp_sk) 2610 return -ENOMEM; 2611 2612 for_each_possible_cpu(cpu) { 2613 struct sock *sk; 2614 2615 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW, 2616 IPPROTO_TCP, net); 2617 if (res) 2618 goto fail; 2619 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); 2620 2621 /* Please enforce IP_DF and IPID==0 for RST and 2622 * ACK sent in SYN-RECV and TIME-WAIT state. 2623 */ 2624 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO; 2625 2626 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk; 2627 } 2628 2629 net->ipv4.sysctl_tcp_ecn = 2; 2630 net->ipv4.sysctl_tcp_ecn_fallback = 1; 2631 2632 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS; 2633 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD; 2634 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL; 2635 2636 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME; 2637 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES; 2638 net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL; 2639 2640 net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES; 2641 net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES; 2642 net->ipv4.sysctl_tcp_syncookies = 1; 2643 net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH; 2644 net->ipv4.sysctl_tcp_retries1 = TCP_RETR1; 2645 net->ipv4.sysctl_tcp_retries2 = TCP_RETR2; 2646 net->ipv4.sysctl_tcp_orphan_retries = 0; 2647 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT; 2648 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX; 2649 net->ipv4.sysctl_tcp_tw_reuse = 2; 2650 2651 cnt = tcp_hashinfo.ehash_mask + 1; 2652 net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2; 2653 net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo; 2654 2655 net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 256); 2656 net->ipv4.sysctl_tcp_sack = 1; 2657 net->ipv4.sysctl_tcp_window_scaling = 1; 2658 net->ipv4.sysctl_tcp_timestamps = 1; 2659 net->ipv4.sysctl_tcp_early_retrans = 3; 2660 net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION; 2661 net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */ 2662 net->ipv4.sysctl_tcp_retrans_collapse = 1; 2663 net->ipv4.sysctl_tcp_max_reordering = 300; 2664 net->ipv4.sysctl_tcp_dsack = 1; 2665 net->ipv4.sysctl_tcp_app_win = 31; 2666 net->ipv4.sysctl_tcp_adv_win_scale = 1; 2667 net->ipv4.sysctl_tcp_frto = 2; 2668 net->ipv4.sysctl_tcp_moderate_rcvbuf = 1; 2669 /* This limits the percentage of the congestion window which we 2670 * will allow a single TSO frame to consume. Building TSO frames 2671 * which are too large can cause TCP streams to be bursty. 2672 */ 2673 net->ipv4.sysctl_tcp_tso_win_divisor = 3; 2674 /* Default TSQ limit of 16 TSO segments */ 2675 net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536; 2676 /* rfc5961 challenge ack rate limiting */ 2677 net->ipv4.sysctl_tcp_challenge_ack_limit = 1000; 2678 net->ipv4.sysctl_tcp_min_tso_segs = 2; 2679 net->ipv4.sysctl_tcp_min_rtt_wlen = 300; 2680 net->ipv4.sysctl_tcp_autocorking = 1; 2681 net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2; 2682 net->ipv4.sysctl_tcp_pacing_ss_ratio = 200; 2683 net->ipv4.sysctl_tcp_pacing_ca_ratio = 120; 2684 if (net != &init_net) { 2685 memcpy(net->ipv4.sysctl_tcp_rmem, 2686 init_net.ipv4.sysctl_tcp_rmem, 2687 sizeof(init_net.ipv4.sysctl_tcp_rmem)); 2688 memcpy(net->ipv4.sysctl_tcp_wmem, 2689 init_net.ipv4.sysctl_tcp_wmem, 2690 sizeof(init_net.ipv4.sysctl_tcp_wmem)); 2691 } 2692 net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC; 2693 net->ipv4.sysctl_tcp_comp_sack_nr = 44; 2694 net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; 2695 spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock); 2696 net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60; 2697 atomic_set(&net->ipv4.tfo_active_disable_times, 0); 2698 2699 /* Reno is always built in */ 2700 if (!net_eq(net, &init_net) && 2701 try_module_get(init_net.ipv4.tcp_congestion_control->owner)) 2702 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control; 2703 else 2704 net->ipv4.tcp_congestion_control = &tcp_reno; 2705 2706 return 0; 2707 fail: 2708 tcp_sk_exit(net); 2709 2710 return res; 2711 } 2712 2713 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list) 2714 { 2715 struct net *net; 2716 2717 inet_twsk_purge(&tcp_hashinfo, AF_INET); 2718 2719 list_for_each_entry(net, net_exit_list, exit_list) 2720 tcp_fastopen_ctx_destroy(net); 2721 } 2722 2723 static struct pernet_operations __net_initdata tcp_sk_ops = { 2724 .init = tcp_sk_init, 2725 .exit = tcp_sk_exit, 2726 .exit_batch = tcp_sk_exit_batch, 2727 }; 2728 2729 void __init tcp_v4_init(void) 2730 { 2731 if (register_pernet_subsys(&tcp_sk_ops)) 2732 panic("Failed to create the TCP control socket.\n"); 2733 } 2734