1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * INET An implementation of the TCP/IP protocol suite for the LINUX 4 * operating system. INET is implemented using the BSD Socket 5 * interface as the means of communication with the user level. 6 * 7 * Implementation of the Transmission Control Protocol(TCP). 8 * 9 * IPv4 specific functions 10 * 11 * code split from: 12 * linux/ipv4/tcp.c 13 * linux/ipv4/tcp_input.c 14 * linux/ipv4/tcp_output.c 15 * 16 * See tcp.c for author information 17 */ 18 19 /* 20 * Changes: 21 * David S. Miller : New socket lookup architecture. 22 * This code is dedicated to John Dyson. 23 * David S. Miller : Change semantics of established hash, 24 * half is devoted to TIME_WAIT sockets 25 * and the rest go in the other half. 26 * Andi Kleen : Add support for syncookies and fixed 27 * some bugs: ip options weren't passed to 28 * the TCP layer, missed a check for an 29 * ACK bit. 30 * Andi Kleen : Implemented fast path mtu discovery. 31 * Fixed many serious bugs in the 32 * request_sock handling and moved 33 * most of it into the af independent code. 34 * Added tail drop and some other bugfixes. 35 * Added new listen semantics. 36 * Mike McLagan : Routing by source 37 * Juan Jose Ciarlante: ip_dynaddr bits 38 * Andi Kleen: various fixes. 39 * Vitaly E. Lavrov : Transparent proxy revived after year 40 * coma. 41 * Andi Kleen : Fix new listen. 42 * Andi Kleen : Fix accept error reporting. 43 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which 44 * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind 45 * a single port at the same time. 46 */ 47 48 #define pr_fmt(fmt) "TCP: " fmt 49 50 #include <linux/bottom_half.h> 51 #include <linux/types.h> 52 #include <linux/fcntl.h> 53 #include <linux/module.h> 54 #include <linux/random.h> 55 #include <linux/cache.h> 56 #include <linux/jhash.h> 57 #include <linux/init.h> 58 #include <linux/times.h> 59 #include <linux/slab.h> 60 61 #include <net/net_namespace.h> 62 #include <net/icmp.h> 63 #include <net/inet_hashtables.h> 64 #include <net/tcp.h> 65 #include <net/transp_v6.h> 66 #include <net/ipv6.h> 67 #include <net/inet_common.h> 68 #include <net/timewait_sock.h> 69 #include <net/xfrm.h> 70 #include <net/secure_seq.h> 71 #include <net/busy_poll.h> 72 73 #include <linux/inet.h> 74 #include <linux/ipv6.h> 75 #include <linux/stddef.h> 76 #include <linux/proc_fs.h> 77 #include <linux/seq_file.h> 78 #include <linux/inetdevice.h> 79 80 #include <crypto/hash.h> 81 #include <linux/scatterlist.h> 82 83 #include <trace/events/tcp.h> 84 85 #ifdef CONFIG_TCP_MD5SIG 86 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, 87 __be32 daddr, __be32 saddr, const struct tcphdr *th); 88 #endif 89 90 struct inet_hashinfo tcp_hashinfo; 91 EXPORT_SYMBOL(tcp_hashinfo); 92 93 static u32 tcp_v4_init_seq(const struct sk_buff *skb) 94 { 95 return secure_tcp_seq(ip_hdr(skb)->daddr, 96 ip_hdr(skb)->saddr, 97 tcp_hdr(skb)->dest, 98 tcp_hdr(skb)->source); 99 } 100 101 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb) 102 { 103 return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr); 104 } 105 106 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) 107 { 108 const struct inet_timewait_sock *tw = inet_twsk(sktw); 109 const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw); 110 struct tcp_sock *tp = tcp_sk(sk); 111 int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse; 112 113 if (reuse == 2) { 114 /* Still does not detect *everything* that goes 
through 115 * lo, since we require a loopback src or dst address 116 * or direct binding to 'lo' interface. 117 */ 118 bool loopback = false; 119 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX) 120 loopback = true; 121 #if IS_ENABLED(CONFIG_IPV6) 122 if (tw->tw_family == AF_INET6) { 123 if (ipv6_addr_loopback(&tw->tw_v6_daddr) || 124 ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) || 125 ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) || 126 ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr)) 127 loopback = true; 128 } else 129 #endif 130 { 131 if (ipv4_is_loopback(tw->tw_daddr) || 132 ipv4_is_loopback(tw->tw_rcv_saddr)) 133 loopback = true; 134 } 135 if (!loopback) 136 reuse = 0; 137 } 138 139 /* With PAWS, it is safe from the viewpoint 140 of data integrity. Even without PAWS it is safe provided sequence 141 spaces do not overlap i.e. at data rates <= 80Mbit/sec. 142 143 Actually, the idea is close to VJ's one, only timestamp cache is 144 held not per host, but per port pair and TW bucket is used as state 145 holder. 146 147 If TW bucket has been already destroyed we fall back to VJ's scheme 148 and use initial timestamp retrieved from peer table. 149 */ 150 if (tcptw->tw_ts_recent_stamp && 151 (!twp || (reuse && time_after32(ktime_get_seconds(), 152 tcptw->tw_ts_recent_stamp)))) { 153 /* In case of repair and re-using TIME-WAIT sockets we still 154 * want to be sure that it is safe as above but honor the 155 * sequence numbers and time stamps set as part of the repair 156 * process. 157 * 158 * Without this check re-using a TIME-WAIT socket with TCP 159 * repair would accumulate a -1 on the repair assigned 160 * sequence number. The first time it is reused the sequence 161 * is -1, the second time -2, etc. This fixes that issue 162 * without appearing to create any others. 163 */ 164 if (likely(!tp->repair)) { 165 u32 seq = tcptw->tw_snd_nxt + 65535 + 2; 166 167 if (!seq) 168 seq = 1; 169 WRITE_ONCE(tp->write_seq, seq); 170 tp->rx_opt.ts_recent = tcptw->tw_ts_recent; 171 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp; 172 } 173 sock_hold(sktw); 174 return 1; 175 } 176 177 return 0; 178 } 179 EXPORT_SYMBOL_GPL(tcp_twsk_unique); 180 181 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr, 182 int addr_len) 183 { 184 /* This check is replicated from tcp_v4_connect() and intended to 185 * prevent BPF program called below from accessing bytes that are out 186 * of the bound specified by user in addr_len. 187 */ 188 if (addr_len < sizeof(struct sockaddr_in)) 189 return -EINVAL; 190 191 sock_owned_by_me(sk); 192 193 return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr); 194 } 195 196 /* This will initiate an outgoing connection. 
 */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	__be16 orig_sport, orig_dport;
	__be32 daddr, nexthop;
	struct flowi4 *fl4;
	struct rtable *rt;
	int err;
	struct ip_options_rcu *inet_opt;
	struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	inet_opt = rcu_dereference_protected(inet->inet_opt,
					     lockdep_sock_is_held(sk));
	if (inet_opt && inet_opt->opt.srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet_opt->opt.faddr;
	}

	orig_sport = inet->inet_sport;
	orig_dport = usin->sin_port;
	fl4 = &inet->cork.fl.u.ip4;
	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
			      IPPROTO_TCP,
			      orig_sport, orig_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		if (err == -ENETUNREACH)
			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
		return err;
	}

	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet_opt || !inet_opt->opt.srr)
		daddr = fl4->daddr;

	if (!inet->inet_saddr)
		inet->inet_saddr = fl4->saddr;
	sk_rcv_saddr_set(sk, inet->inet_saddr);

	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		if (likely(!tp->repair))
			WRITE_ONCE(tp->write_seq, 0);
	}

	inet->inet_dport = usin->sin_port;
	sk_daddr_set(sk, daddr);

	inet_csk(sk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;

	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and, without releasing the socket
	 * lock, select a source port, enter ourselves into the hash tables and
	 * complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(tcp_death_row, sk);
	if (err)
		goto failure;

	sk_set_txhash(sk);

	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
			       inet->inet_sport, inet->inet_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		rt = NULL;
		goto failure;
	}
	/* OK, now commit destination to socket. */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->dst);
	rt = NULL;

	if (likely(!tp->repair)) {
		if (!tp->write_seq)
			WRITE_ONCE(tp->write_seq,
				   secure_tcp_seq(inet->inet_saddr,
						  inet->inet_daddr,
						  inet->inet_sport,
						  usin->sin_port));
		tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
						 inet->inet_saddr,
						 inet->inet_daddr);
	}

	inet->inet_id = prandom_u32();

	if (tcp_fastopen_defer_connect(sk, &err))
		return err;
	if (err)
		goto failure;

	err = tcp_connect(sk);

	if (err)
		goto failure;

	return 0;

failure:
	/*
	 * This unhashes the socket and releases the local port,
	 * if necessary.
	 */
	tcp_set_state(sk, TCP_CLOSE);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->inet_dport = 0;
	return err;
}
EXPORT_SYMBOL(tcp_v4_connect);
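
/*
 * Editorial note: illustrative sketch only, not kernel code.  The function
 * above is the kernel side of connect(2) on an IPv4 TCP socket; a minimal
 * userspace counterpart (error handling trimmed, 192.0.2.1:80 is a
 * documentation address) would look roughly like:
 *
 *	int fd = socket(AF_INET, SOCK_STREAM, 0);
 *	struct sockaddr_in dst = {
 *		.sin_family = AF_INET,
 *		.sin_port   = htons(80),
 *	};
 *	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);
 *	if (connect(fd, (struct sockaddr *)&dst, sizeof(dst)) < 0)
 *		perror("connect");
 *
 * That connect(2) reaches tcp_v4_connect() through the inet_stream_connect()
 * path, and the SYN itself is emitted by the tcp_connect() call above.
 */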

/*
 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 * It can be called through tcp_release_cb() if socket was owned by user
 * at the time tcp_v4_err() was called to handle ICMP message.
 */
void tcp_v4_mtu_reduced(struct sock *sk)
{
	struct inet_sock *inet = inet_sk(sk);
	struct dst_entry *dst;
	u32 mtu;

	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
		return;
	mtu = tcp_sk(sk)->mtu_info;
	dst = inet_csk_update_pmtu(sk, mtu);
	if (!dst)
		return;

	/* Something is about to go wrong... Remember the soft error
	 * for the case this connection is not able to recover.
	 */
	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
		sk->sk_err_soft = EMSGSIZE;

	mtu = dst_mtu(dst);

	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    ip_sk_accept_pmtu(sk) &&
	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
		tcp_sync_mss(sk, mtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}
EXPORT_SYMBOL(tcp_v4_mtu_reduced);

static void do_redirect(struct sk_buff *skb, struct sock *sk)
{
	struct dst_entry *dst = __sk_dst_check(sk, 0);

	if (dst)
		dst->ops->redirect(dst, sk, skb);
}


/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
void tcp_req_err(struct sock *sk, u32 seq, bool abort)
{
	struct request_sock *req = inet_reqsk(sk);
	struct net *net = sock_net(sk);

	/* ICMPs are not backlogged, hence we cannot get
	 * an established socket here.
	 */
	if (seq != tcp_rsk(req)->snt_isn) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
	} else if (abort) {
		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
		tcp_listendrop(req->rsk_listener);
	}
	reqsk_put(req);
}
EXPORT_SYMBOL(tcp_req_err);

/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
419 * 420 */ 421 422 int tcp_v4_err(struct sk_buff *icmp_skb, u32 info) 423 { 424 const struct iphdr *iph = (const struct iphdr *)icmp_skb->data; 425 struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2)); 426 struct inet_connection_sock *icsk; 427 struct tcp_sock *tp; 428 struct inet_sock *inet; 429 const int type = icmp_hdr(icmp_skb)->type; 430 const int code = icmp_hdr(icmp_skb)->code; 431 struct sock *sk; 432 struct sk_buff *skb; 433 struct request_sock *fastopen; 434 u32 seq, snd_una; 435 s32 remaining; 436 u32 delta_us; 437 int err; 438 struct net *net = dev_net(icmp_skb->dev); 439 440 sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr, 441 th->dest, iph->saddr, ntohs(th->source), 442 inet_iif(icmp_skb), 0); 443 if (!sk) { 444 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); 445 return -ENOENT; 446 } 447 if (sk->sk_state == TCP_TIME_WAIT) { 448 inet_twsk_put(inet_twsk(sk)); 449 return 0; 450 } 451 seq = ntohl(th->seq); 452 if (sk->sk_state == TCP_NEW_SYN_RECV) { 453 tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB || 454 type == ICMP_TIME_EXCEEDED || 455 (type == ICMP_DEST_UNREACH && 456 (code == ICMP_NET_UNREACH || 457 code == ICMP_HOST_UNREACH))); 458 return 0; 459 } 460 461 bh_lock_sock(sk); 462 /* If too many ICMPs get dropped on busy 463 * servers this needs to be solved differently. 464 * We do take care of PMTU discovery (RFC1191) special case : 465 * we can receive locally generated ICMP messages while socket is held. 466 */ 467 if (sock_owned_by_user(sk)) { 468 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)) 469 __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS); 470 } 471 if (sk->sk_state == TCP_CLOSE) 472 goto out; 473 474 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) { 475 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); 476 goto out; 477 } 478 479 icsk = inet_csk(sk); 480 tp = tcp_sk(sk); 481 /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */ 482 fastopen = rcu_dereference(tp->fastopen_rsk); 483 snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una; 484 if (sk->sk_state != TCP_LISTEN && 485 !between(seq, snd_una, tp->snd_nxt)) { 486 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS); 487 goto out; 488 } 489 490 switch (type) { 491 case ICMP_REDIRECT: 492 if (!sock_owned_by_user(sk)) 493 do_redirect(icmp_skb, sk); 494 goto out; 495 case ICMP_SOURCE_QUENCH: 496 /* Just silently ignore these. */ 497 goto out; 498 case ICMP_PARAMETERPROB: 499 err = EPROTO; 500 break; 501 case ICMP_DEST_UNREACH: 502 if (code > NR_ICMP_UNREACH) 503 goto out; 504 505 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */ 506 /* We are not interested in TCP_LISTEN and open_requests 507 * (SYN-ACKs send out by Linux are always <576bytes so 508 * they should go through unfragmented). 
			 */
			if (sk->sk_state == TCP_LISTEN)
				goto out;

			tp->mtu_info = info;
			if (!sock_owned_by_user(sk)) {
				tcp_v4_mtu_reduced(sk);
			} else {
				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
					sock_hold(sk);
			}
			goto out;
		}

		err = icmp_err_convert[code].errno;
		/* check if this ICMP message allows reverting the backoff
		 * (see draft-zimmermann-tcp-lcd)
		 */
		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
			break;
		if (seq != tp->snd_una || !icsk->icsk_retransmits ||
		    !icsk->icsk_backoff || fastopen)
			break;

		if (sock_owned_by_user(sk))
			break;

		skb = tcp_rtx_queue_head(sk);
		if (WARN_ON_ONCE(!skb))
			break;

		icsk->icsk_backoff--;
		icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
					       TCP_TIMEOUT_INIT;
		icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);

		tcp_mstamp_refresh(tp);
		delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
		remaining = icsk->icsk_rto -
			    usecs_to_jiffies(delta_us);

		if (remaining > 0) {
			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
						  remaining, TCP_RTO_MAX);
		} else {
			/* RTO revert clocked out retransmission.
			 * Will retransmit now.
			 */
			tcp_retransmit_timer(sk);
		}

		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->sk_state) {
	case TCP_SYN_SENT:
	case TCP_SYN_RECV:
		/* Only in fast or simultaneous open. If a fast open socket
		 * is already accepted it is treated as a connected one below.
		 */
		if (fastopen && !fastopen->sk)
			break;

		if (!sock_owned_by_user(sk)) {
			sk->sk_err = err;

			sk->sk_error_report(sk);

			tcp_done(sk);
		} else {
			sk->sk_err_soft = err;
		}
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows us to consider as hard errors
	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note that in the modern internet, where routing is unreliable
	 * and broken firewalls sit in every dark corner sending random
	 * errors ordered by their masters, even these two messages have lost
	 * their original sense (even Linux sends invalid PORT_UNREACHs).
	 *
	 * Now we are in compliance with RFCs.
	 * --ANK (980905)
	 */

	inet = inet_sk(sk);
	if (!sock_owned_by_user(sk) && inet->recverr) {
		sk->sk_err = err;
		sk->sk_error_report(sk);
	} else { /* Only an error on timeout */
		sk->sk_err_soft = err;
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
	return 0;
}

void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
{
	struct tcphdr *th = tcp_hdr(skb);

	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
	skb->csum_start = skb_transport_header(skb) - skb->head;
	skb->csum_offset = offsetof(struct tcphdr, check);
}

/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
{
	const struct inet_sock *inet = inet_sk(sk);

	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
}
EXPORT_SYMBOL(tcp_v4_send_check);
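
/*
 * Editorial note: illustrative sketch only, not kernel code.  For checksum
 * offload (CHECKSUM_PARTIAL), __tcp_v4_send_check() above stores only the
 * folded, not yet inverted, pseudo-header sum in th->check (note the ~ in
 * front of tcp_v4_check()); the NIC or the software fallback then sums the
 * bytes from csum_start onwards, folds and inverts the result.  Plain
 * RFC 1071 arithmetic for the complete checksum, using a hypothetical
 * helper name, looks roughly like:
 *
 *	static u16 csum16(const void *data, size_t len, u32 sum)
 *	{
 *		const u8 *p = data;
 *
 *		while (len > 1) {
 *			sum += (p[0] << 8) | p[1];	// 16-bit big-endian words
 *			p += 2;
 *			len -= 2;
 *		}
 *		if (len)			// odd trailing byte, zero padded
 *			sum += p[0] << 8;
 *		while (sum >> 16)		// end-around carry (fold)
 *			sum = (sum & 0xffff) + (sum >> 16);
 *		return ~sum & 0xffff;		// one's complement
 *	}
 *
 * Feeding the pseudo-header words (saddr, daddr, IPPROTO_TCP, TCP length)
 * as the initial 'sum' and then passing the TCP header plus payload
 * reproduces the value the receiver verifies.
 */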

/*
 *	This routine will send an RST to the other tcp.
 *
 *	Someone asks:  why do I NEVER use socket parameters (TOS, TTL etc.)
 *		       for the reset?
 *	Answer: if a packet caused the RST, it is not for a socket
 *		existing in our system; if it is matched to a socket,
 *		it is just a duplicate segment or a bug in the other
 *		side's TCP.  So we build the reply based only on the
 *		parameters that arrived with the segment.
 *	Exception: precedence violation. We do not implement it in any case.
 */

static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
#ifdef CONFIG_TCP_MD5SIG
		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
#endif
	} rep;
	struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key = NULL;
	const __u8 *hash_location = NULL;
	unsigned char newhash[16];
	int genhash;
	struct sock *sk1 = NULL;
#endif
	u64 transmit_time = 0;
	struct sock *ctl_sk;
	struct net *net;

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	/* If sk not NULL, it means we did a successful lookup and incoming
	 * route had to be correct. prequeue might have dropped our dst.
	 */
	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rep, 0, sizeof(rep));
	rep.th.dest   = th->source;
	rep.th.source = th->dest;
	rep.th.doff   = sizeof(struct tcphdr) / 4;
	rep.th.rst    = 1;

	if (th->ack) {
		rep.th.seq = th->ack_seq;
	} else {
		rep.th.ack = 1;
		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				       skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof(arg));
	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);

	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
#ifdef CONFIG_TCP_MD5SIG
	rcu_read_lock();
	hash_location = tcp_parse_md5sig_option(th);
	if (sk && sk_fullsock(sk)) {
		const union tcp_md5_addr *addr;
		int l3index;

		/* sdif set, means packet ingressed via a device
		 * in an L3 domain and inet_iif is set to it.
		 */
		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
	} else if (hash_location) {
		const union tcp_md5_addr *addr;
		int sdif = tcp_v4_sdif(skb);
		int dif = inet_iif(skb);
		int l3index;

		/*
		 * The active side is lost. Try to find the listening socket
		 * through the source port, and then find the md5 key through
		 * the listening socket.  This does not loosen security:
		 * the incoming packet is checked against the md5 hash of the
		 * key we find, and no RST is generated if the hash doesn't
		 * match.
		 */
		sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
					     ip_hdr(skb)->saddr,
					     th->source, ip_hdr(skb)->daddr,
					     ntohs(th->source), dif, sdif);
		/* don't send rst if it can't find key */
		if (!sk1)
			goto out;

		/* sdif set, means packet ingressed via a device
		 * in an L3 domain and dif is set to it.
		 */
		l3index = sdif ?
dif : 0; 738 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; 739 key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET); 740 if (!key) 741 goto out; 742 743 744 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb); 745 if (genhash || memcmp(hash_location, newhash, 16) != 0) 746 goto out; 747 748 } 749 750 if (key) { 751 rep.opt[0] = htonl((TCPOPT_NOP << 24) | 752 (TCPOPT_NOP << 16) | 753 (TCPOPT_MD5SIG << 8) | 754 TCPOLEN_MD5SIG); 755 /* Update length and the length the header thinks exists */ 756 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED; 757 rep.th.doff = arg.iov[0].iov_len / 4; 758 759 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1], 760 key, ip_hdr(skb)->saddr, 761 ip_hdr(skb)->daddr, &rep.th); 762 } 763 #endif 764 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr, 765 ip_hdr(skb)->saddr, /* XXX */ 766 arg.iov[0].iov_len, IPPROTO_TCP, 0); 767 arg.csumoffset = offsetof(struct tcphdr, check) / 2; 768 arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0; 769 770 /* When socket is gone, all binding information is lost. 771 * routing might fail in this case. No choice here, if we choose to force 772 * input interface, we will misroute in case of asymmetric route. 773 */ 774 if (sk) { 775 arg.bound_dev_if = sk->sk_bound_dev_if; 776 if (sk_fullsock(sk)) 777 trace_tcp_send_reset(sk, skb); 778 } 779 780 BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) != 781 offsetof(struct inet_timewait_sock, tw_bound_dev_if)); 782 783 arg.tos = ip_hdr(skb)->tos; 784 arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL); 785 local_bh_disable(); 786 ctl_sk = this_cpu_read(*net->ipv4.tcp_sk); 787 if (sk) { 788 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? 789 inet_twsk(sk)->tw_mark : sk->sk_mark; 790 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ? 791 inet_twsk(sk)->tw_priority : sk->sk_priority; 792 transmit_time = tcp_transmit_time(sk); 793 } 794 ip_send_unicast_reply(ctl_sk, 795 skb, &TCP_SKB_CB(skb)->header.h4.opt, 796 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, 797 &arg, arg.iov[0].iov_len, 798 transmit_time); 799 800 ctl_sk->sk_mark = 0; 801 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS); 802 __TCP_INC_STATS(net, TCP_MIB_OUTRSTS); 803 local_bh_enable(); 804 805 #ifdef CONFIG_TCP_MD5SIG 806 out: 807 rcu_read_unlock(); 808 #endif 809 } 810 811 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states 812 outside socket context is ugly, certainly. What can I do? 813 */ 814 815 static void tcp_v4_send_ack(const struct sock *sk, 816 struct sk_buff *skb, u32 seq, u32 ack, 817 u32 win, u32 tsval, u32 tsecr, int oif, 818 struct tcp_md5sig_key *key, 819 int reply_flags, u8 tos) 820 { 821 const struct tcphdr *th = tcp_hdr(skb); 822 struct { 823 struct tcphdr th; 824 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2) 825 #ifdef CONFIG_TCP_MD5SIG 826 + (TCPOLEN_MD5SIG_ALIGNED >> 2) 827 #endif 828 ]; 829 } rep; 830 struct net *net = sock_net(sk); 831 struct ip_reply_arg arg; 832 struct sock *ctl_sk; 833 u64 transmit_time; 834 835 memset(&rep.th, 0, sizeof(struct tcphdr)); 836 memset(&arg, 0, sizeof(arg)); 837 838 arg.iov[0].iov_base = (unsigned char *)&rep; 839 arg.iov[0].iov_len = sizeof(rep.th); 840 if (tsecr) { 841 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | 842 (TCPOPT_TIMESTAMP << 8) | 843 TCPOLEN_TIMESTAMP); 844 rep.opt[1] = htonl(tsval); 845 rep.opt[2] = htonl(tsecr); 846 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED; 847 } 848 849 /* Swap the send and the receive. 
*/ 850 rep.th.dest = th->source; 851 rep.th.source = th->dest; 852 rep.th.doff = arg.iov[0].iov_len / 4; 853 rep.th.seq = htonl(seq); 854 rep.th.ack_seq = htonl(ack); 855 rep.th.ack = 1; 856 rep.th.window = htons(win); 857 858 #ifdef CONFIG_TCP_MD5SIG 859 if (key) { 860 int offset = (tsecr) ? 3 : 0; 861 862 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) | 863 (TCPOPT_NOP << 16) | 864 (TCPOPT_MD5SIG << 8) | 865 TCPOLEN_MD5SIG); 866 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED; 867 rep.th.doff = arg.iov[0].iov_len/4; 868 869 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset], 870 key, ip_hdr(skb)->saddr, 871 ip_hdr(skb)->daddr, &rep.th); 872 } 873 #endif 874 arg.flags = reply_flags; 875 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr, 876 ip_hdr(skb)->saddr, /* XXX */ 877 arg.iov[0].iov_len, IPPROTO_TCP, 0); 878 arg.csumoffset = offsetof(struct tcphdr, check) / 2; 879 if (oif) 880 arg.bound_dev_if = oif; 881 arg.tos = tos; 882 arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL); 883 local_bh_disable(); 884 ctl_sk = this_cpu_read(*net->ipv4.tcp_sk); 885 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? 886 inet_twsk(sk)->tw_mark : sk->sk_mark; 887 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ? 888 inet_twsk(sk)->tw_priority : sk->sk_priority; 889 transmit_time = tcp_transmit_time(sk); 890 ip_send_unicast_reply(ctl_sk, 891 skb, &TCP_SKB_CB(skb)->header.h4.opt, 892 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, 893 &arg, arg.iov[0].iov_len, 894 transmit_time); 895 896 ctl_sk->sk_mark = 0; 897 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS); 898 local_bh_enable(); 899 } 900 901 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) 902 { 903 struct inet_timewait_sock *tw = inet_twsk(sk); 904 struct tcp_timewait_sock *tcptw = tcp_twsk(sk); 905 906 tcp_v4_send_ack(sk, skb, 907 tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, 908 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, 909 tcp_time_stamp_raw() + tcptw->tw_ts_offset, 910 tcptw->tw_ts_recent, 911 tw->tw_bound_dev_if, 912 tcp_twsk_md5_key(tcptw), 913 tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0, 914 tw->tw_tos 915 ); 916 917 inet_twsk_put(tw); 918 } 919 920 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, 921 struct request_sock *req) 922 { 923 const union tcp_md5_addr *addr; 924 int l3index; 925 926 /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV 927 * sk->sk_state == TCP_SYN_RECV -> for Fast Open. 928 */ 929 u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 : 930 tcp_sk(sk)->snd_nxt; 931 932 /* RFC 7323 2.3 933 * The window field (SEG.WND) of every outgoing segment, with the 934 * exception of <SYN> segments, MUST be right-shifted by 935 * Rcv.Wind.Shift bits: 936 */ 937 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; 938 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0; 939 tcp_v4_send_ack(sk, skb, seq, 940 tcp_rsk(req)->rcv_nxt, 941 req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale, 942 tcp_time_stamp_raw() + tcp_rsk(req)->ts_off, 943 req->ts_recent, 944 0, 945 tcp_md5_do_lookup(sk, l3index, addr, AF_INET), 946 inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0, 947 ip_hdr(skb)->tos); 948 } 949 950 /* 951 * Send a SYN-ACK after having received a SYN. 952 * This still operates on a request_sock only, not on a big 953 * socket. 
954 */ 955 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, 956 struct flowi *fl, 957 struct request_sock *req, 958 struct tcp_fastopen_cookie *foc, 959 enum tcp_synack_type synack_type) 960 { 961 const struct inet_request_sock *ireq = inet_rsk(req); 962 struct flowi4 fl4; 963 int err = -1; 964 struct sk_buff *skb; 965 966 /* First, grab a route. */ 967 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL) 968 return -1; 969 970 skb = tcp_make_synack(sk, dst, req, foc, synack_type); 971 972 if (skb) { 973 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr); 974 975 rcu_read_lock(); 976 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr, 977 ireq->ir_rmt_addr, 978 rcu_dereference(ireq->ireq_opt)); 979 rcu_read_unlock(); 980 err = net_xmit_eval(err); 981 } 982 983 return err; 984 } 985 986 /* 987 * IPv4 request_sock destructor. 988 */ 989 static void tcp_v4_reqsk_destructor(struct request_sock *req) 990 { 991 kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1)); 992 } 993 994 #ifdef CONFIG_TCP_MD5SIG 995 /* 996 * RFC2385 MD5 checksumming requires a mapping of 997 * IP address->MD5 Key. 998 * We need to maintain these in the sk structure. 999 */ 1000 1001 DEFINE_STATIC_KEY_FALSE(tcp_md5_needed); 1002 EXPORT_SYMBOL(tcp_md5_needed); 1003 1004 /* Find the Key structure for an address. */ 1005 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index, 1006 const union tcp_md5_addr *addr, 1007 int family) 1008 { 1009 const struct tcp_sock *tp = tcp_sk(sk); 1010 struct tcp_md5sig_key *key; 1011 const struct tcp_md5sig_info *md5sig; 1012 __be32 mask; 1013 struct tcp_md5sig_key *best_match = NULL; 1014 bool match; 1015 1016 /* caller either holds rcu_read_lock() or socket lock */ 1017 md5sig = rcu_dereference_check(tp->md5sig_info, 1018 lockdep_sock_is_held(sk)); 1019 if (!md5sig) 1020 return NULL; 1021 1022 hlist_for_each_entry_rcu(key, &md5sig->head, node, 1023 lockdep_sock_is_held(sk)) { 1024 if (key->family != family) 1025 continue; 1026 if (key->l3index && key->l3index != l3index) 1027 continue; 1028 if (family == AF_INET) { 1029 mask = inet_make_mask(key->prefixlen); 1030 match = (key->addr.a4.s_addr & mask) == 1031 (addr->a4.s_addr & mask); 1032 #if IS_ENABLED(CONFIG_IPV6) 1033 } else if (family == AF_INET6) { 1034 match = ipv6_prefix_equal(&key->addr.a6, &addr->a6, 1035 key->prefixlen); 1036 #endif 1037 } else { 1038 match = false; 1039 } 1040 1041 if (match && (!best_match || 1042 key->prefixlen > best_match->prefixlen)) 1043 best_match = key; 1044 } 1045 return best_match; 1046 } 1047 EXPORT_SYMBOL(__tcp_md5_do_lookup); 1048 1049 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk, 1050 const union tcp_md5_addr *addr, 1051 int family, u8 prefixlen, 1052 int l3index) 1053 { 1054 const struct tcp_sock *tp = tcp_sk(sk); 1055 struct tcp_md5sig_key *key; 1056 unsigned int size = sizeof(struct in_addr); 1057 const struct tcp_md5sig_info *md5sig; 1058 1059 /* caller either holds rcu_read_lock() or socket lock */ 1060 md5sig = rcu_dereference_check(tp->md5sig_info, 1061 lockdep_sock_is_held(sk)); 1062 if (!md5sig) 1063 return NULL; 1064 #if IS_ENABLED(CONFIG_IPV6) 1065 if (family == AF_INET6) 1066 size = sizeof(struct in6_addr); 1067 #endif 1068 hlist_for_each_entry_rcu(key, &md5sig->head, node, 1069 lockdep_sock_is_held(sk)) { 1070 if (key->family != family) 1071 continue; 1072 if (key->l3index && key->l3index != l3index) 1073 continue; 1074 if (!memcmp(&key->addr, addr, size) && 1075 
		    key->prefixlen == prefixlen)
			return key;
	}
	return NULL;
}

struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
					 const struct sock *addr_sk)
{
	const union tcp_md5_addr *addr;
	int l3index;

	l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
						 addr_sk->sk_bound_dev_if);
	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
	return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
}
EXPORT_SYMBOL(tcp_v4_md5_lookup);

/* This can be called on a newly created socket, from other files */
int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
		   int family, u8 prefixlen, int l3index,
		   const u8 *newkey, u8 newkeylen, gfp_t gfp)
{
	/* Add Key to the list */
	struct tcp_md5sig_key *key;
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_info *md5sig;

	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
	if (key) {
		/* Pre-existing entry - just update that one. */
		memcpy(key->key, newkey, newkeylen);
		key->keylen = newkeylen;
		return 0;
	}

	md5sig = rcu_dereference_protected(tp->md5sig_info,
					   lockdep_sock_is_held(sk));
	if (!md5sig) {
		md5sig = kmalloc(sizeof(*md5sig), gfp);
		if (!md5sig)
			return -ENOMEM;

		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
		INIT_HLIST_HEAD(&md5sig->head);
		rcu_assign_pointer(tp->md5sig_info, md5sig);
	}

	key = sock_kmalloc(sk, sizeof(*key), gfp);
	if (!key)
		return -ENOMEM;
	if (!tcp_alloc_md5sig_pool()) {
		sock_kfree_s(sk, key, sizeof(*key));
		return -ENOMEM;
	}

	memcpy(key->key, newkey, newkeylen);
	key->keylen = newkeylen;
	key->family = family;
	key->prefixlen = prefixlen;
	key->l3index = l3index;
	memcpy(&key->addr, addr,
	       (family == AF_INET6) ? sizeof(struct in6_addr) :
				      sizeof(struct in_addr));
	hlist_add_head_rcu(&key->node, &md5sig->head);
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_add);

int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
		   u8 prefixlen, int l3index)
{
	struct tcp_md5sig_key *key;

	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
	if (!key)
		return -ENOENT;
	hlist_del_rcu(&key->node);
	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
	kfree_rcu(key, rcu);
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_del);

static void tcp_clear_md5_list(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	struct hlist_node *n;
	struct tcp_md5sig_info *md5sig;

	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);

	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
		hlist_del_rcu(&key->node);
		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
		kfree_rcu(key, rcu);
	}
}

static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
				 char __user *optval, int optlen)
{
	struct tcp_md5sig cmd;
	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
	const union tcp_md5_addr *addr;
	u8 prefixlen = 32;
	int l3index = 0;

	if (optlen < sizeof(cmd))
		return -EINVAL;

	if (copy_from_user(&cmd, optval, sizeof(cmd)))
		return -EFAULT;

	if (sin->sin_family != AF_INET)
		return -EINVAL;

	if (optname == TCP_MD5SIG_EXT &&
	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
		prefixlen = cmd.tcpm_prefixlen;
		if (prefixlen > 32)
			return -EINVAL;
	}

	if (optname == TCP_MD5SIG_EXT &&
	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
		struct net_device *dev;

		rcu_read_lock();
		dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
		if (dev && netif_is_l3_master(dev))
			l3index = dev->ifindex;

		rcu_read_unlock();

		/* ok to reference set/not set outside of rcu;
		 * right now device MUST be an L3 master
		 */
		if (!dev || !l3index)
			return -EINVAL;
	}

	addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;

	if (!cmd.tcpm_keylen)
		return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index);

	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
		return -EINVAL;

	return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index,
			      cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
}
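
/*
 * Editorial note: illustrative sketch only, not kernel code.  The parser
 * above is reached from userspace via setsockopt(TCP_MD5SIG), or
 * TCP_MD5SIG_EXT when a prefix length or ifindex is supplied.  A minimal
 * caller keying all segments to/from peer 192.0.2.1 (documentation address)
 * might look like:
 *
 *	struct tcp_md5sig md5;
 *	struct sockaddr_in *peer = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	memset(&md5, 0, sizeof(md5));
 *	peer->sin_family = AF_INET;
 *	inet_pton(AF_INET, "192.0.2.1", &peer->sin_addr);
 *	md5.tcpm_keylen = strlen("secret");
 *	memcpy(md5.tcpm_key, "secret", md5.tcpm_keylen);
 *	if (setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5)) < 0)
 *		perror("TCP_MD5SIG");
 *
 * Passing a zero tcpm_keylen with the same call deletes the key, which is
 * the tcp_md5_do_del() path above.
 */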

static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
				   __be32 daddr, __be32 saddr,
				   const struct tcphdr *th, int nbytes)
{
	struct tcp4_pseudohdr *bp;
	struct scatterlist sg;
	struct tcphdr *_th;

	bp = hp->scratch;
	bp->saddr = saddr;
	bp->daddr = daddr;
	bp->pad = 0;
	bp->protocol = IPPROTO_TCP;
	bp->len = cpu_to_be16(nbytes);

	_th = (struct tcphdr *)(bp + 1);
	memcpy(_th, th, sizeof(*th));
	_th->check = 0;

	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
	ahash_request_set_crypt(hp->md5_req, &sg, NULL,
				sizeof(*bp) + sizeof(*th));
	return crypto_ahash_update(hp->md5_req);
}

static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr,
			       const struct tcphdr *th)
{
	struct tcp_md5sig_pool *hp;
	struct ahash_request *req;

	hp = tcp_get_md5sig_pool();
	if
(!hp) 1264 goto clear_hash_noput; 1265 req = hp->md5_req; 1266 1267 if (crypto_ahash_init(req)) 1268 goto clear_hash; 1269 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2)) 1270 goto clear_hash; 1271 if (tcp_md5_hash_key(hp, key)) 1272 goto clear_hash; 1273 ahash_request_set_crypt(req, NULL, md5_hash, 0); 1274 if (crypto_ahash_final(req)) 1275 goto clear_hash; 1276 1277 tcp_put_md5sig_pool(); 1278 return 0; 1279 1280 clear_hash: 1281 tcp_put_md5sig_pool(); 1282 clear_hash_noput: 1283 memset(md5_hash, 0, 16); 1284 return 1; 1285 } 1286 1287 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key, 1288 const struct sock *sk, 1289 const struct sk_buff *skb) 1290 { 1291 struct tcp_md5sig_pool *hp; 1292 struct ahash_request *req; 1293 const struct tcphdr *th = tcp_hdr(skb); 1294 __be32 saddr, daddr; 1295 1296 if (sk) { /* valid for establish/request sockets */ 1297 saddr = sk->sk_rcv_saddr; 1298 daddr = sk->sk_daddr; 1299 } else { 1300 const struct iphdr *iph = ip_hdr(skb); 1301 saddr = iph->saddr; 1302 daddr = iph->daddr; 1303 } 1304 1305 hp = tcp_get_md5sig_pool(); 1306 if (!hp) 1307 goto clear_hash_noput; 1308 req = hp->md5_req; 1309 1310 if (crypto_ahash_init(req)) 1311 goto clear_hash; 1312 1313 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len)) 1314 goto clear_hash; 1315 if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2)) 1316 goto clear_hash; 1317 if (tcp_md5_hash_key(hp, key)) 1318 goto clear_hash; 1319 ahash_request_set_crypt(req, NULL, md5_hash, 0); 1320 if (crypto_ahash_final(req)) 1321 goto clear_hash; 1322 1323 tcp_put_md5sig_pool(); 1324 return 0; 1325 1326 clear_hash: 1327 tcp_put_md5sig_pool(); 1328 clear_hash_noput: 1329 memset(md5_hash, 0, 16); 1330 return 1; 1331 } 1332 EXPORT_SYMBOL(tcp_v4_md5_hash_skb); 1333 1334 #endif 1335 1336 /* Called with rcu_read_lock() */ 1337 static bool tcp_v4_inbound_md5_hash(const struct sock *sk, 1338 const struct sk_buff *skb, 1339 int dif, int sdif) 1340 { 1341 #ifdef CONFIG_TCP_MD5SIG 1342 /* 1343 * This gets called for each TCP segment that arrives 1344 * so we want to be efficient. 1345 * We have 3 drop cases: 1346 * o No MD5 hash and one expected. 1347 * o MD5 hash and we're not expecting one. 1348 * o MD5 hash and its wrong. 1349 */ 1350 const __u8 *hash_location = NULL; 1351 struct tcp_md5sig_key *hash_expected; 1352 const struct iphdr *iph = ip_hdr(skb); 1353 const struct tcphdr *th = tcp_hdr(skb); 1354 const union tcp_md5_addr *addr; 1355 unsigned char newhash[16]; 1356 int genhash, l3index; 1357 1358 /* sdif set, means packet ingressed via a device 1359 * in an L3 domain and dif is set to the l3mdev 1360 */ 1361 l3index = sdif ? dif : 0; 1362 1363 addr = (union tcp_md5_addr *)&iph->saddr; 1364 hash_expected = tcp_md5_do_lookup(sk, l3index, addr, AF_INET); 1365 hash_location = tcp_parse_md5sig_option(th); 1366 1367 /* We've parsed the options - do we have a hash? */ 1368 if (!hash_expected && !hash_location) 1369 return false; 1370 1371 if (hash_expected && !hash_location) { 1372 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND); 1373 return true; 1374 } 1375 1376 if (!hash_expected && hash_location) { 1377 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED); 1378 return true; 1379 } 1380 1381 /* Okay, so this is hash_expected and hash_location - 1382 * so we need to calculate the checksum. 
1383 */ 1384 genhash = tcp_v4_md5_hash_skb(newhash, 1385 hash_expected, 1386 NULL, skb); 1387 1388 if (genhash || memcmp(hash_location, newhash, 16) != 0) { 1389 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE); 1390 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s L3 index %d\n", 1391 &iph->saddr, ntohs(th->source), 1392 &iph->daddr, ntohs(th->dest), 1393 genhash ? " tcp_v4_calc_md5_hash failed" 1394 : "", l3index); 1395 return true; 1396 } 1397 return false; 1398 #endif 1399 return false; 1400 } 1401 1402 static void tcp_v4_init_req(struct request_sock *req, 1403 const struct sock *sk_listener, 1404 struct sk_buff *skb) 1405 { 1406 struct inet_request_sock *ireq = inet_rsk(req); 1407 struct net *net = sock_net(sk_listener); 1408 1409 sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr); 1410 sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr); 1411 RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb)); 1412 } 1413 1414 static struct dst_entry *tcp_v4_route_req(const struct sock *sk, 1415 struct flowi *fl, 1416 const struct request_sock *req) 1417 { 1418 return inet_csk_route_req(sk, &fl->u.ip4, req); 1419 } 1420 1421 struct request_sock_ops tcp_request_sock_ops __read_mostly = { 1422 .family = PF_INET, 1423 .obj_size = sizeof(struct tcp_request_sock), 1424 .rtx_syn_ack = tcp_rtx_synack, 1425 .send_ack = tcp_v4_reqsk_send_ack, 1426 .destructor = tcp_v4_reqsk_destructor, 1427 .send_reset = tcp_v4_send_reset, 1428 .syn_ack_timeout = tcp_syn_ack_timeout, 1429 }; 1430 1431 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { 1432 .mss_clamp = TCP_MSS_DEFAULT, 1433 #ifdef CONFIG_TCP_MD5SIG 1434 .req_md5_lookup = tcp_v4_md5_lookup, 1435 .calc_md5_hash = tcp_v4_md5_hash_skb, 1436 #endif 1437 .init_req = tcp_v4_init_req, 1438 #ifdef CONFIG_SYN_COOKIES 1439 .cookie_init_seq = cookie_v4_init_sequence, 1440 #endif 1441 .route_req = tcp_v4_route_req, 1442 .init_seq = tcp_v4_init_seq, 1443 .init_ts_off = tcp_v4_init_ts_off, 1444 .send_synack = tcp_v4_send_synack, 1445 }; 1446 1447 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) 1448 { 1449 /* Never answer to SYNs send to broadcast or multicast */ 1450 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) 1451 goto drop; 1452 1453 return tcp_conn_request(&tcp_request_sock_ops, 1454 &tcp_request_sock_ipv4_ops, sk, skb); 1455 1456 drop: 1457 tcp_listendrop(sk); 1458 return 0; 1459 } 1460 EXPORT_SYMBOL(tcp_v4_conn_request); 1461 1462 1463 /* 1464 * The three way handshake has completed - we got a valid synack - 1465 * now create the new socket. 
1466 */ 1467 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, 1468 struct request_sock *req, 1469 struct dst_entry *dst, 1470 struct request_sock *req_unhash, 1471 bool *own_req) 1472 { 1473 struct inet_request_sock *ireq; 1474 struct inet_sock *newinet; 1475 struct tcp_sock *newtp; 1476 struct sock *newsk; 1477 #ifdef CONFIG_TCP_MD5SIG 1478 const union tcp_md5_addr *addr; 1479 struct tcp_md5sig_key *key; 1480 int l3index; 1481 #endif 1482 struct ip_options_rcu *inet_opt; 1483 1484 if (sk_acceptq_is_full(sk)) 1485 goto exit_overflow; 1486 1487 newsk = tcp_create_openreq_child(sk, req, skb); 1488 if (!newsk) 1489 goto exit_nonewsk; 1490 1491 newsk->sk_gso_type = SKB_GSO_TCPV4; 1492 inet_sk_rx_dst_set(newsk, skb); 1493 1494 newtp = tcp_sk(newsk); 1495 newinet = inet_sk(newsk); 1496 ireq = inet_rsk(req); 1497 sk_daddr_set(newsk, ireq->ir_rmt_addr); 1498 sk_rcv_saddr_set(newsk, ireq->ir_loc_addr); 1499 newsk->sk_bound_dev_if = ireq->ir_iif; 1500 newinet->inet_saddr = ireq->ir_loc_addr; 1501 inet_opt = rcu_dereference(ireq->ireq_opt); 1502 RCU_INIT_POINTER(newinet->inet_opt, inet_opt); 1503 newinet->mc_index = inet_iif(skb); 1504 newinet->mc_ttl = ip_hdr(skb)->ttl; 1505 newinet->rcv_tos = ip_hdr(skb)->tos; 1506 inet_csk(newsk)->icsk_ext_hdr_len = 0; 1507 if (inet_opt) 1508 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen; 1509 newinet->inet_id = prandom_u32(); 1510 1511 if (!dst) { 1512 dst = inet_csk_route_child_sock(sk, newsk, req); 1513 if (!dst) 1514 goto put_and_exit; 1515 } else { 1516 /* syncookie case : see end of cookie_v4_check() */ 1517 } 1518 sk_setup_caps(newsk, dst); 1519 1520 tcp_ca_openreq_child(newsk, dst); 1521 1522 tcp_sync_mss(newsk, dst_mtu(dst)); 1523 newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst)); 1524 1525 tcp_initialize_rcv_mss(newsk); 1526 1527 #ifdef CONFIG_TCP_MD5SIG 1528 l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif); 1529 /* Copy over the MD5 key from the original socket */ 1530 addr = (union tcp_md5_addr *)&newinet->inet_daddr; 1531 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET); 1532 if (key) { 1533 /* 1534 * We're using one, so create a matching key 1535 * on the newsk structure. If we fail to get 1536 * memory, then we end up not copying the key 1537 * across. Shucks. 
1538 */ 1539 tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index, 1540 key->key, key->keylen, GFP_ATOMIC); 1541 sk_nocaps_add(newsk, NETIF_F_GSO_MASK); 1542 } 1543 #endif 1544 1545 if (__inet_inherit_port(sk, newsk) < 0) 1546 goto put_and_exit; 1547 *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash)); 1548 if (likely(*own_req)) { 1549 tcp_move_syn(newtp, req); 1550 ireq->ireq_opt = NULL; 1551 } else { 1552 newinet->inet_opt = NULL; 1553 } 1554 return newsk; 1555 1556 exit_overflow: 1557 NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); 1558 exit_nonewsk: 1559 dst_release(dst); 1560 exit: 1561 tcp_listendrop(sk); 1562 return NULL; 1563 put_and_exit: 1564 newinet->inet_opt = NULL; 1565 inet_csk_prepare_forced_close(newsk); 1566 tcp_done(newsk); 1567 goto exit; 1568 } 1569 EXPORT_SYMBOL(tcp_v4_syn_recv_sock); 1570 1571 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb) 1572 { 1573 #ifdef CONFIG_SYN_COOKIES 1574 const struct tcphdr *th = tcp_hdr(skb); 1575 1576 if (!th->syn) 1577 sk = cookie_v4_check(sk, skb); 1578 #endif 1579 return sk; 1580 } 1581 1582 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph, 1583 struct tcphdr *th, u32 *cookie) 1584 { 1585 u16 mss = 0; 1586 #ifdef CONFIG_SYN_COOKIES 1587 mss = tcp_get_syncookie_mss(&tcp_request_sock_ops, 1588 &tcp_request_sock_ipv4_ops, sk, th); 1589 if (mss) { 1590 *cookie = __cookie_v4_init_sequence(iph, th, &mss); 1591 tcp_synq_overflow(sk); 1592 } 1593 #endif 1594 return mss; 1595 } 1596 1597 /* The socket must have it's spinlock held when we get 1598 * here, unless it is a TCP_LISTEN socket. 1599 * 1600 * We have a potential double-lock case here, so even when 1601 * doing backlog processing we use the BH locking scheme. 1602 * This is because we cannot sleep with the original spinlock 1603 * held. 1604 */ 1605 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) 1606 { 1607 struct sock *rsk; 1608 1609 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ 1610 struct dst_entry *dst = sk->sk_rx_dst; 1611 1612 sock_rps_save_rxhash(sk, skb); 1613 sk_mark_napi_id(sk, skb); 1614 if (dst) { 1615 if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif || 1616 !dst->ops->check(dst, 0)) { 1617 dst_release(dst); 1618 sk->sk_rx_dst = NULL; 1619 } 1620 } 1621 tcp_rcv_established(sk, skb); 1622 return 0; 1623 } 1624 1625 if (tcp_checksum_complete(skb)) 1626 goto csum_err; 1627 1628 if (sk->sk_state == TCP_LISTEN) { 1629 struct sock *nsk = tcp_v4_cookie_check(sk, skb); 1630 1631 if (!nsk) 1632 goto discard; 1633 if (nsk != sk) { 1634 if (tcp_child_process(sk, nsk, skb)) { 1635 rsk = nsk; 1636 goto reset; 1637 } 1638 return 0; 1639 } 1640 } else 1641 sock_rps_save_rxhash(sk, skb); 1642 1643 if (tcp_rcv_state_process(sk, skb)) { 1644 rsk = sk; 1645 goto reset; 1646 } 1647 return 0; 1648 1649 reset: 1650 tcp_v4_send_reset(rsk, skb); 1651 discard: 1652 kfree_skb(skb); 1653 /* Be careful here. If this function gets more complicated and 1654 * gcc suffers from register pressure on the x86, sk (in %ebx) 1655 * might be destroyed here. This current version compiles correctly, 1656 * but you have been warned. 
1657 */ 1658 return 0; 1659 1660 csum_err: 1661 TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); 1662 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); 1663 goto discard; 1664 } 1665 EXPORT_SYMBOL(tcp_v4_do_rcv); 1666 1667 int tcp_v4_early_demux(struct sk_buff *skb) 1668 { 1669 const struct iphdr *iph; 1670 const struct tcphdr *th; 1671 struct sock *sk; 1672 1673 if (skb->pkt_type != PACKET_HOST) 1674 return 0; 1675 1676 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr))) 1677 return 0; 1678 1679 iph = ip_hdr(skb); 1680 th = tcp_hdr(skb); 1681 1682 if (th->doff < sizeof(struct tcphdr) / 4) 1683 return 0; 1684 1685 sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo, 1686 iph->saddr, th->source, 1687 iph->daddr, ntohs(th->dest), 1688 skb->skb_iif, inet_sdif(skb)); 1689 if (sk) { 1690 skb->sk = sk; 1691 skb->destructor = sock_edemux; 1692 if (sk_fullsock(sk)) { 1693 struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst); 1694 1695 if (dst) 1696 dst = dst_check(dst, 0); 1697 if (dst && 1698 inet_sk(sk)->rx_dst_ifindex == skb->skb_iif) 1699 skb_dst_set_noref(skb, dst); 1700 } 1701 } 1702 return 0; 1703 } 1704 1705 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb) 1706 { 1707 u32 limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf); 1708 struct skb_shared_info *shinfo; 1709 const struct tcphdr *th; 1710 struct tcphdr *thtail; 1711 struct sk_buff *tail; 1712 unsigned int hdrlen; 1713 bool fragstolen; 1714 u32 gso_segs; 1715 int delta; 1716 1717 /* In case all data was pulled from skb frags (in __pskb_pull_tail()), 1718 * we can fix skb->truesize to its real value to avoid future drops. 1719 * This is valid because skb is not yet charged to the socket. 1720 * It has been noticed pure SACK packets were sometimes dropped 1721 * (if cooked by drivers without copybreak feature). 1722 */ 1723 skb_condense(skb); 1724 1725 skb_dst_drop(skb); 1726 1727 if (unlikely(tcp_checksum_complete(skb))) { 1728 bh_unlock_sock(sk); 1729 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); 1730 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); 1731 return true; 1732 } 1733 1734 /* Attempt coalescing to last skb in backlog, even if we are 1735 * above the limits. 1736 * This is okay because skb capacity is limited to MAX_SKB_FRAGS. 
1737 */ 1738 th = (const struct tcphdr *)skb->data; 1739 hdrlen = th->doff * 4; 1740 shinfo = skb_shinfo(skb); 1741 1742 if (!shinfo->gso_size) 1743 shinfo->gso_size = skb->len - hdrlen; 1744 1745 if (!shinfo->gso_segs) 1746 shinfo->gso_segs = 1; 1747 1748 tail = sk->sk_backlog.tail; 1749 if (!tail) 1750 goto no_coalesce; 1751 thtail = (struct tcphdr *)tail->data; 1752 1753 if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq || 1754 TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield || 1755 ((TCP_SKB_CB(tail)->tcp_flags | 1756 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) || 1757 !((TCP_SKB_CB(tail)->tcp_flags & 1758 TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) || 1759 ((TCP_SKB_CB(tail)->tcp_flags ^ 1760 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) || 1761 #ifdef CONFIG_TLS_DEVICE 1762 tail->decrypted != skb->decrypted || 1763 #endif 1764 thtail->doff != th->doff || 1765 memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th))) 1766 goto no_coalesce; 1767 1768 __skb_pull(skb, hdrlen); 1769 if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) { 1770 thtail->window = th->window; 1771 1772 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq; 1773 1774 if (after(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq)) 1775 TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq; 1776 1777 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and 1778 * thtail->fin, so that the fast path in tcp_rcv_established() 1779 * is not entered if we append a packet with a FIN. 1780 * SYN, RST, URG are not present. 1781 * ACK is set on both packets. 1782 * PSH : we do not really care in TCP stack, 1783 * at least for 'GRO' packets. 1784 */ 1785 thtail->fin |= th->fin; 1786 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags; 1787 1788 if (TCP_SKB_CB(skb)->has_rxtstamp) { 1789 TCP_SKB_CB(tail)->has_rxtstamp = true; 1790 tail->tstamp = skb->tstamp; 1791 skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp; 1792 } 1793 1794 /* Not as strict as GRO. We only need to carry mss max value */ 1795 skb_shinfo(tail)->gso_size = max(shinfo->gso_size, 1796 skb_shinfo(tail)->gso_size); 1797 1798 gso_segs = skb_shinfo(tail)->gso_segs + shinfo->gso_segs; 1799 skb_shinfo(tail)->gso_segs = min_t(u32, gso_segs, 0xFFFF); 1800 1801 sk->sk_backlog.len += delta; 1802 __NET_INC_STATS(sock_net(sk), 1803 LINUX_MIB_TCPBACKLOGCOALESCE); 1804 kfree_skb_partial(skb, fragstolen); 1805 return false; 1806 } 1807 __skb_push(skb, hdrlen); 1808 1809 no_coalesce: 1810 /* Only socket owner can try to collapse/prune rx queues 1811 * to reduce memory overhead, so add a little headroom here. 1812 * Few sockets backlog are possibly concurrently non empty. 
1813 */ 1814 limit += 64*1024; 1815 1816 if (unlikely(sk_add_backlog(sk, skb, limit))) { 1817 bh_unlock_sock(sk); 1818 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP); 1819 return true; 1820 } 1821 return false; 1822 } 1823 EXPORT_SYMBOL(tcp_add_backlog); 1824 1825 int tcp_filter(struct sock *sk, struct sk_buff *skb) 1826 { 1827 struct tcphdr *th = (struct tcphdr *)skb->data; 1828 1829 return sk_filter_trim_cap(sk, skb, th->doff * 4); 1830 } 1831 EXPORT_SYMBOL(tcp_filter); 1832 1833 static void tcp_v4_restore_cb(struct sk_buff *skb) 1834 { 1835 memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4, 1836 sizeof(struct inet_skb_parm)); 1837 } 1838 1839 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph, 1840 const struct tcphdr *th) 1841 { 1842 /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB() 1843 * barrier() makes sure compiler wont play fool^Waliasing games. 1844 */ 1845 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb), 1846 sizeof(struct inet_skb_parm)); 1847 barrier(); 1848 1849 TCP_SKB_CB(skb)->seq = ntohl(th->seq); 1850 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + 1851 skb->len - th->doff * 4); 1852 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); 1853 TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th); 1854 TCP_SKB_CB(skb)->tcp_tw_isn = 0; 1855 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph); 1856 TCP_SKB_CB(skb)->sacked = 0; 1857 TCP_SKB_CB(skb)->has_rxtstamp = 1858 skb->tstamp || skb_hwtstamps(skb)->hwtstamp; 1859 } 1860 1861 /* 1862 * From tcp_input.c 1863 */ 1864 1865 int tcp_v4_rcv(struct sk_buff *skb) 1866 { 1867 struct net *net = dev_net(skb->dev); 1868 struct sk_buff *skb_to_free; 1869 int sdif = inet_sdif(skb); 1870 int dif = inet_iif(skb); 1871 const struct iphdr *iph; 1872 const struct tcphdr *th; 1873 bool refcounted; 1874 struct sock *sk; 1875 int ret; 1876 1877 if (skb->pkt_type != PACKET_HOST) 1878 goto discard_it; 1879 1880 /* Count it even if it's bad */ 1881 __TCP_INC_STATS(net, TCP_MIB_INSEGS); 1882 1883 if (!pskb_may_pull(skb, sizeof(struct tcphdr))) 1884 goto discard_it; 1885 1886 th = (const struct tcphdr *)skb->data; 1887 1888 if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) 1889 goto bad_packet; 1890 if (!pskb_may_pull(skb, th->doff * 4)) 1891 goto discard_it; 1892 1893 /* An explanation is required here, I think. 1894 * Packet length and doff are validated by header prediction, 1895 * provided case of th->doff==0 is eliminated. 1896 * So, we defer the checks. 
*/ 1897 1898 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo)) 1899 goto csum_error; 1900 1901 th = (const struct tcphdr *)skb->data; 1902 iph = ip_hdr(skb); 1903 lookup: 1904 sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source, 1905 th->dest, sdif, &refcounted); 1906 if (!sk) 1907 goto no_tcp_socket; 1908 1909 process: 1910 if (sk->sk_state == TCP_TIME_WAIT) 1911 goto do_time_wait; 1912 1913 if (sk->sk_state == TCP_NEW_SYN_RECV) { 1914 struct request_sock *req = inet_reqsk(sk); 1915 bool req_stolen = false; 1916 struct sock *nsk; 1917 1918 sk = req->rsk_listener; 1919 if (unlikely(tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))) { 1920 sk_drops_add(sk, skb); 1921 reqsk_put(req); 1922 goto discard_it; 1923 } 1924 if (tcp_checksum_complete(skb)) { 1925 reqsk_put(req); 1926 goto csum_error; 1927 } 1928 if (unlikely(sk->sk_state != TCP_LISTEN)) { 1929 inet_csk_reqsk_queue_drop_and_put(sk, req); 1930 goto lookup; 1931 } 1932 /* We own a reference on the listener, increase it again 1933 * as we might lose it too soon. 1934 */ 1935 sock_hold(sk); 1936 refcounted = true; 1937 nsk = NULL; 1938 if (!tcp_filter(sk, skb)) { 1939 th = (const struct tcphdr *)skb->data; 1940 iph = ip_hdr(skb); 1941 tcp_v4_fill_cb(skb, iph, th); 1942 nsk = tcp_check_req(sk, skb, req, false, &req_stolen); 1943 } 1944 if (!nsk) { 1945 reqsk_put(req); 1946 if (req_stolen) { 1947 /* Another cpu got exclusive access to req 1948 * and created a full blown socket. 1949 * Try to feed this packet to this socket 1950 * instead of discarding it. 1951 */ 1952 tcp_v4_restore_cb(skb); 1953 sock_put(sk); 1954 goto lookup; 1955 } 1956 goto discard_and_relse; 1957 } 1958 if (nsk == sk) { 1959 reqsk_put(req); 1960 tcp_v4_restore_cb(skb); 1961 } else if (tcp_child_process(sk, nsk, skb)) { 1962 tcp_v4_send_reset(nsk, skb); 1963 goto discard_and_relse; 1964 } else { 1965 sock_put(sk); 1966 return 0; 1967 } 1968 } 1969 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) { 1970 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); 1971 goto discard_and_relse; 1972 } 1973 1974 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) 1975 goto discard_and_relse; 1976 1977 if (tcp_v4_inbound_md5_hash(sk, skb, dif, sdif)) 1978 goto discard_and_relse; 1979 1980 nf_reset_ct(skb); 1981 1982 if (tcp_filter(sk, skb)) 1983 goto discard_and_relse; 1984 th = (const struct tcphdr *)skb->data; 1985 iph = ip_hdr(skb); 1986 tcp_v4_fill_cb(skb, iph, th); 1987 1988 skb->dev = NULL; 1989 1990 if (sk->sk_state == TCP_LISTEN) { 1991 ret = tcp_v4_do_rcv(sk, skb); 1992 goto put_and_return; 1993 } 1994 1995 sk_incoming_cpu_update(sk); 1996 1997 bh_lock_sock_nested(sk); 1998 tcp_segs_in(tcp_sk(sk), skb); 1999 ret = 0; 2000 if (!sock_owned_by_user(sk)) { 2001 skb_to_free = sk->sk_rx_skb_cache; 2002 sk->sk_rx_skb_cache = NULL; 2003 ret = tcp_v4_do_rcv(sk, skb); 2004 } else { 2005 if (tcp_add_backlog(sk, skb)) 2006 goto discard_and_relse; 2007 skb_to_free = NULL; 2008 } 2009 bh_unlock_sock(sk); 2010 if (skb_to_free) 2011 __kfree_skb(skb_to_free); 2012 2013 put_and_return: 2014 if (refcounted) 2015 sock_put(sk); 2016 2017 return ret; 2018 2019 no_tcp_socket: 2020 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) 2021 goto discard_it; 2022 2023 tcp_v4_fill_cb(skb, iph, th); 2024 2025 if (tcp_checksum_complete(skb)) { 2026 csum_error: 2027 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS); 2028 bad_packet: 2029 __TCP_INC_STATS(net, TCP_MIB_INERRS); 2030 } else { 2031 tcp_v4_send_reset(NULL, skb); 2032 } 2033 2034 discard_it: 2035 /* Discard frame. 
do_time_wait:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto discard_it;
	}

	tcp_v4_fill_cb(skb, iph, th);

	if (tcp_checksum_complete(skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto csum_error;
	}
	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
	case TCP_TW_SYN: {
		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
							&tcp_hashinfo, skb,
							__tcp_hdrlen(th),
							iph->saddr, th->source,
							iph->daddr, th->dest,
							inet_iif(skb),
							sdif);
		if (sk2) {
			inet_twsk_deschedule_put(inet_twsk(sk));
			sk = sk2;
			tcp_v4_restore_cb(skb);
			refcounted = false;
			goto process;
		}
	}
		/* to ACK */
		fallthrough;
	case TCP_TW_ACK:
		tcp_v4_timewait_ack(sk, skb);
		break;
	case TCP_TW_RST:
		tcp_v4_send_reset(sk, skb);
		inet_twsk_deschedule_put(inet_twsk(sk));
		goto discard_it;
	case TCP_TW_SUCCESS:;
	}
	goto discard_it;
}

static struct timewait_sock_ops tcp_timewait_sock_ops = {
	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
	.twsk_unique	= tcp_twsk_unique,
	.twsk_destructor = tcp_twsk_destructor,
};

void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);

	if (dst && dst_hold_safe(dst)) {
		sk->sk_rx_dst = dst;
		inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
	}
}
EXPORT_SYMBOL(inet_sk_rx_dst_set);

const struct inet_connection_sock_af_ops ipv4_specific = {
	.queue_xmit	   = ip_queue_xmit,
	.send_check	   = tcp_v4_send_check,
	.rebuild_header	   = inet_sk_rebuild_header,
	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
	.conn_request	   = tcp_v4_conn_request,
	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
	.net_header_len	   = sizeof(struct iphdr),
	.setsockopt	   = ip_setsockopt,
	.getsockopt	   = ip_getsockopt,
	.addr2sockaddr	   = inet_csk_addr2sockaddr,
	.sockaddr_len	   = sizeof(struct sockaddr_in),
#ifdef CONFIG_COMPAT
	.compat_setsockopt = compat_ip_setsockopt,
	.compat_getsockopt = compat_ip_getsockopt,
#endif
	.mtu_reduced	   = tcp_v4_mtu_reduced,
};
EXPORT_SYMBOL(ipv4_specific);

#ifdef CONFIG_TCP_MD5SIG
static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
	.md5_lookup	= tcp_v4_md5_lookup,
	.calc_md5_hash	= tcp_v4_md5_hash_skb,
	.md5_parse	= tcp_v4_parse_md5_keys,
};
#endif

/* NOTE: A lot of things set to zero explicitly by call to
 *	 sk_alloc() so need not be done here.
 */
static int tcp_v4_init_sock(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	tcp_init_sock(sk);

	icsk->icsk_af_ops = &ipv4_specific;

#ifdef CONFIG_TCP_MD5SIG
	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
#endif

	return 0;
}

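/* Per-socket teardown.  The IPv6 side builds its destroy path on top of
 * this helper (hence the EXPORT_SYMBOL below), so resources released here
 * are released for both address families.
 */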
void tcp_v4_destroy_sock(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	trace_tcp_destroy_sock(sk);

	tcp_clear_xmit_timers(sk);

	tcp_cleanup_congestion_control(sk);

	tcp_cleanup_ulp(sk);

	/* Clean up the write buffer. */
	tcp_write_queue_purge(sk);

	/* Check if we want to disable active TFO */
	tcp_fastopen_active_disable_ofo_check(sk);

	/* Cleans up our, hopefully empty, out_of_order_queue. */
	skb_rbtree_purge(&tp->out_of_order_queue);

#ifdef CONFIG_TCP_MD5SIG
	/* Clean up the MD5 key list, if any */
	if (tp->md5sig_info) {
		tcp_clear_md5_list(sk);
		kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
		tp->md5sig_info = NULL;
	}
#endif

	/* Clean up a referenced TCP bind bucket. */
	if (inet_csk(sk)->icsk_bind_hash)
		inet_put_port(sk);

	BUG_ON(rcu_access_pointer(tp->fastopen_rsk));

	/* If socket is aborted during connect operation */
	tcp_free_fastopen_req(tp);
	tcp_fastopen_destroy_cipher(sk);
	tcp_saved_syn_free(tp);

	sk_sockets_allocated_dec(sk);
}
EXPORT_SYMBOL(tcp_v4_destroy_sock);

#ifdef CONFIG_PROC_FS
/* Proc filesystem TCP sock list dumping. */

/*
 * Get next listener socket following cur.  If cur is NULL, get the first
 * socket starting from the bucket given in st->bucket; when st->bucket is
 * zero the very first socket in the hash table is returned.
 */
static void *listening_get_next(struct seq_file *seq, void *cur)
{
	struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);
	struct inet_listen_hashbucket *ilb;
	struct hlist_nulls_node *node;
	struct sock *sk = cur;

	if (!sk) {
get_head:
		ilb = &tcp_hashinfo.listening_hash[st->bucket];
		spin_lock(&ilb->lock);
		sk = sk_nulls_head(&ilb->nulls_head);
		st->offset = 0;
		goto get_sk;
	}
	ilb = &tcp_hashinfo.listening_hash[st->bucket];
	++st->num;
	++st->offset;

	sk = sk_nulls_next(sk);
get_sk:
	sk_nulls_for_each_from(sk, node) {
		if (!net_eq(sock_net(sk), net))
			continue;
		if (sk->sk_family == afinfo->family)
			return sk;
	}
	spin_unlock(&ilb->lock);
	st->offset = 0;
	if (++st->bucket < INET_LHTABLE_SIZE)
		goto get_head;
	return NULL;
}

static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	st->bucket = 0;
	st->offset = 0;
	rc = listening_get_next(seq, NULL);

	while (rc && *pos) {
		rc = listening_get_next(seq, rc);
		--*pos;
	}
	return rc;
}

static inline bool empty_bucket(const struct tcp_iter_state *st)
{
	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
}

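/* The established hash also holds TIME_WAIT and NEW_SYN_RECV (request)
 * sockets, so the walk below yields those too; tcp4_seq_show() looks at
 * sk_state to pick the right formatting helper for each flavour.
 */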
/*
 * Get first established socket starting from bucket given in st->bucket.
 * If st->bucket is zero, the very first socket in the hash is returned.
 */
static void *established_get_first(struct seq_file *seq)
{
	struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);
	void *rc = NULL;

	st->offset = 0;
	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
		struct sock *sk;
		struct hlist_nulls_node *node;
		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);

		/* Lockless fast path for the common case of empty buckets */
		if (empty_bucket(st))
			continue;

		spin_lock_bh(lock);
		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
			if (sk->sk_family != afinfo->family ||
			    !net_eq(sock_net(sk), net)) {
				continue;
			}
			rc = sk;
			goto out;
		}
		spin_unlock_bh(lock);
	}
out:
	return rc;
}

static void *established_get_next(struct seq_file *seq, void *cur)
{
	struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
	struct sock *sk = cur;
	struct hlist_nulls_node *node;
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);

	++st->num;
	++st->offset;

	sk = sk_nulls_next(sk);

	sk_nulls_for_each_from(sk, node) {
		if (sk->sk_family == afinfo->family &&
		    net_eq(sock_net(sk), net))
			return sk;
	}

	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
	++st->bucket;
	return established_get_first(seq);
}

static void *established_get_idx(struct seq_file *seq, loff_t pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	st->bucket = 0;
	rc = established_get_first(seq);

	while (rc && pos) {
		rc = established_get_next(seq, rc);
		--pos;
	}
	return rc;
}

static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
{
	void *rc;
	struct tcp_iter_state *st = seq->private;

	st->state = TCP_SEQ_STATE_LISTENING;
	rc = listening_get_idx(seq, &pos);

	if (!rc) {
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		rc = established_get_idx(seq, pos);
	}

	return rc;
}

static void *tcp_seek_last_pos(struct seq_file *seq)
{
	struct tcp_iter_state *st = seq->private;
	int offset = st->offset;
	int orig_num = st->num;
	void *rc = NULL;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		if (st->bucket >= INET_LHTABLE_SIZE)
			break;
		st->state = TCP_SEQ_STATE_LISTENING;
		rc = listening_get_next(seq, NULL);
		while (offset-- && rc)
			rc = listening_get_next(seq, rc);
		if (rc)
			break;
		st->bucket = 0;
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		fallthrough;
	case TCP_SEQ_STATE_ESTABLISHED:
		if (st->bucket > tcp_hashinfo.ehash_mask)
			break;
		rc = established_get_first(seq);
		while (offset-- && rc)
			rc = established_get_next(seq, rc);
	}

	st->num = orig_num;

	return rc;
}

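/* seq_file hooks.  st->last_pos remembers where the previous read stopped,
 * so a sequential reader of /proc/net/tcp can resume via tcp_seek_last_pos()
 * instead of rescanning every hash bucket from offset zero on each read().
 */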
void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	if (*pos && *pos == st->last_pos) {
		rc = tcp_seek_last_pos(seq);
		if (rc)
			goto out;
	}

	st->state = TCP_SEQ_STATE_LISTENING;
	st->num = 0;
	st->bucket = 0;
	st->offset = 0;
	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;

out:
	st->last_pos = *pos;
	return rc;
}
EXPORT_SYMBOL(tcp_seq_start);

void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc = NULL;

	if (v == SEQ_START_TOKEN) {
		rc = tcp_get_idx(seq, 0);
		goto out;
	}

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		rc = listening_get_next(seq, v);
		if (!rc) {
			st->state = TCP_SEQ_STATE_ESTABLISHED;
			st->bucket = 0;
			st->offset = 0;
			rc = established_get_first(seq);
		}
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		rc = established_get_next(seq, v);
		break;
	}
out:
	++*pos;
	st->last_pos = *pos;
	return rc;
}
EXPORT_SYMBOL(tcp_seq_next);

void tcp_seq_stop(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		if (v != SEQ_START_TOKEN)
			spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		if (v)
			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
		break;
	}
}
EXPORT_SYMBOL(tcp_seq_stop);

static void get_openreq4(const struct request_sock *req,
			 struct seq_file *f, int i)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	long delta = req->rsk_timer.expires - jiffies;

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
		i,
		ireq->ir_loc_addr,
		ireq->ir_num,
		ireq->ir_rmt_addr,
		ntohs(ireq->ir_rmt_port),
		TCP_SYN_RECV,
		0, 0, /* could print option size, but that is af dependent. */
		1,    /* timers active (only the expire timer) */
		jiffies_delta_to_clock_t(delta),
		req->num_timeout,
		from_kuid_munged(seq_user_ns(f),
				 sock_i_uid(req->rsk_listener)),
		0,  /* non standard timer */
		0, /* open_requests have no inode */
		0,
		req);
}

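/* Emit one /proc/net/tcp line for a full socket.  The leading columns match
 * the header printed from tcp4_seq_show() (sl, local_address, rem_address,
 * st, tx_queue:rx_queue, tr:tm->when, retrnsmt, uid, timeout, inode); the
 * trailing fields (refcount, socket pointer, rto, ato, cwnd, ...) are extras
 * that legacy parsers such as netstat typically tolerate but do not display.
 */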
static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
{
	int timer_active;
	unsigned long timer_expires;
	const struct tcp_sock *tp = tcp_sk(sk);
	const struct inet_connection_sock *icsk = inet_csk(sk);
	const struct inet_sock *inet = inet_sk(sk);
	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
	__be32 dest = inet->inet_daddr;
	__be32 src = inet->inet_rcv_saddr;
	__u16 destp = ntohs(inet->inet_dport);
	__u16 srcp = ntohs(inet->inet_sport);
	int rx_queue;
	int state;

	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
		timer_active = 1;
		timer_expires = icsk->icsk_timeout;
	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
		timer_active = 4;
		timer_expires = icsk->icsk_timeout;
	} else if (timer_pending(&sk->sk_timer)) {
		timer_active = 2;
		timer_expires = sk->sk_timer.expires;
	} else {
		timer_active = 0;
		timer_expires = jiffies;
	}

	state = inet_sk_state_load(sk);
	if (state == TCP_LISTEN)
		rx_queue = READ_ONCE(sk->sk_ack_backlog);
	else
		/* Because we don't lock the socket,
		 * we might find a transient negative value.
		 */
		rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
				      READ_ONCE(tp->copied_seq), 0);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
		i, src, srcp, dest, destp, state,
		READ_ONCE(tp->write_seq) - tp->snd_una,
		rx_queue,
		timer_active,
		jiffies_delta_to_clock_t(timer_expires - jiffies),
		icsk->icsk_retransmits,
		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
		icsk->icsk_probes_out,
		sock_i_ino(sk),
		refcount_read(&sk->sk_refcnt), sk,
		jiffies_to_clock_t(icsk->icsk_rto),
		jiffies_to_clock_t(icsk->icsk_ack.ato),
		(icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
		tp->snd_cwnd,
		state == TCP_LISTEN ?
		    fastopenq->max_qlen :
		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
}

static void get_timewait4_sock(const struct inet_timewait_sock *tw,
			       struct seq_file *f, int i)
{
	long delta = tw->tw_timer.expires - jiffies;
	__be32 dest, src;
	__u16 destp, srcp;

	dest = tw->tw_daddr;
	src = tw->tw_rcv_saddr;
	destp = ntohs(tw->tw_dport);
	srcp = ntohs(tw->tw_sport);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
		refcount_read(&tw->tw_refcnt), tw);
}

#define TMPSZ 150

static int tcp4_seq_show(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st;
	struct sock *sk = v;

	seq_setwidth(seq, TMPSZ - 1);
	if (v == SEQ_START_TOKEN) {
		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
			   "rx_queue tr tm->when retrnsmt   uid  timeout "
			   "inode");
		goto out;
	}
	st = seq->private;

	if (sk->sk_state == TCP_TIME_WAIT)
		get_timewait4_sock(v, seq, st->num);
	else if (sk->sk_state == TCP_NEW_SYN_RECV)
		get_openreq4(v, seq, st->num);
	else
		get_tcp4_sock(v, seq, st->num);
out:
	seq_pad(seq, '\n');
	return 0;
}

static const struct seq_operations tcp4_seq_ops = {
	.show		= tcp4_seq_show,
	.start		= tcp_seq_start,
	.next		= tcp_seq_next,
	.stop		= tcp_seq_stop,
};

static struct tcp_seq_afinfo tcp4_seq_afinfo = {
	.family		= AF_INET,
};

static int __net_init tcp4_proc_init_net(struct net *net)
{
	if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
			sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
		return -ENOMEM;
	return 0;
}

static void __net_exit tcp4_proc_exit_net(struct net *net)
{
	remove_proc_entry("tcp", net->proc_net);
}

static struct pernet_operations tcp4_net_ops = {
	.init = tcp4_proc_init_net,
	.exit = tcp4_proc_exit_net,
};

int __init tcp4_proc_init(void)
{
	return register_pernet_subsys(&tcp4_net_ops);
}

void tcp4_proc_exit(void)
{
	unregister_pernet_subsys(&tcp4_net_ops);
}
#endif /* CONFIG_PROC_FS */

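/* Socket-level operations for IPv4 TCP.  The AF_INET socket layer attaches
 * this proto to every SOCK_STREAM/IPPROTO_TCP socket, so connect(), the
 * sockopt handlers, sendmsg()/recvmsg() and close() all dispatch through
 * the entries below.
 */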
struct proto tcp_prot = {
	.name			= "TCP",
	.owner			= THIS_MODULE,
	.close			= tcp_close,
	.pre_connect		= tcp_v4_pre_connect,
	.connect		= tcp_v4_connect,
	.disconnect		= tcp_disconnect,
	.accept			= inet_csk_accept,
	.ioctl			= tcp_ioctl,
	.init			= tcp_v4_init_sock,
	.destroy		= tcp_v4_destroy_sock,
	.shutdown		= tcp_shutdown,
	.setsockopt		= tcp_setsockopt,
	.getsockopt		= tcp_getsockopt,
	.keepalive		= tcp_set_keepalive,
	.recvmsg		= tcp_recvmsg,
	.sendmsg		= tcp_sendmsg,
	.sendpage		= tcp_sendpage,
	.backlog_rcv		= tcp_v4_do_rcv,
	.release_cb		= tcp_release_cb,
	.hash			= inet_hash,
	.unhash			= inet_unhash,
	.get_port		= inet_csk_get_port,
	.enter_memory_pressure	= tcp_enter_memory_pressure,
	.leave_memory_pressure	= tcp_leave_memory_pressure,
	.stream_memory_free	= tcp_stream_memory_free,
	.sockets_allocated	= &tcp_sockets_allocated,
	.orphan_count		= &tcp_orphan_count,
	.memory_allocated	= &tcp_memory_allocated,
	.memory_pressure	= &tcp_memory_pressure,
	.sysctl_mem		= sysctl_tcp_mem,
	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
	.max_header		= MAX_TCP_HEADER,
	.obj_size		= sizeof(struct tcp_sock),
	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
	.twsk_prot		= &tcp_timewait_sock_ops,
	.rsk_prot		= &tcp_request_sock_ops,
	.h.hashinfo		= &tcp_hashinfo,
	.no_autobind		= true,
#ifdef CONFIG_COMPAT
	.compat_setsockopt	= compat_tcp_setsockopt,
	.compat_getsockopt	= compat_tcp_getsockopt,
#endif
	.diag_destroy		= tcp_abort,
};
EXPORT_SYMBOL(tcp_prot);

static void __net_exit tcp_sk_exit(struct net *net)
{
	int cpu;

	if (net->ipv4.tcp_congestion_control)
		bpf_module_put(net->ipv4.tcp_congestion_control,
			       net->ipv4.tcp_congestion_control->owner);

	for_each_possible_cpu(cpu)
		inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
	free_percpu(net->ipv4.tcp_sk);
}

static int __net_init tcp_sk_init(struct net *net)
{
	int res, cpu, cnt;

	net->ipv4.tcp_sk = alloc_percpu(struct sock *);
	if (!net->ipv4.tcp_sk)
		return -ENOMEM;

	for_each_possible_cpu(cpu) {
		struct sock *sk;

		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
					   IPPROTO_TCP, net);
		if (res)
			goto fail;
		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);

		/* Please enforce IP_DF and IPID==0 for RST and
		 * ACK sent in SYN-RECV and TIME-WAIT state.
		 */
		inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;

		*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
	}

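	/* Per-netns defaults for the TCP sysctls.  Most of the values below
	 * map to files under /proc/sys/net/ipv4/ and can be changed from the
	 * owning namespace at runtime, e.g. (illustrative)
	 * "sysctl -w net.ipv4.tcp_syncookies=1".
	 */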
	net->ipv4.sysctl_tcp_ecn = 2;
	net->ipv4.sysctl_tcp_ecn_fallback = 1;

	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
	net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
	net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;

	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;

	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
	net->ipv4.sysctl_tcp_syncookies = 1;
	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
	net->ipv4.sysctl_tcp_orphan_retries = 0;
	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
	net->ipv4.sysctl_tcp_tw_reuse = 2;
	net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;

	cnt = tcp_hashinfo.ehash_mask + 1;
	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
	net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;

	net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
	net->ipv4.sysctl_tcp_sack = 1;
	net->ipv4.sysctl_tcp_window_scaling = 1;
	net->ipv4.sysctl_tcp_timestamps = 1;
	net->ipv4.sysctl_tcp_early_retrans = 3;
	net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
	net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */
	net->ipv4.sysctl_tcp_retrans_collapse = 1;
	net->ipv4.sysctl_tcp_max_reordering = 300;
	net->ipv4.sysctl_tcp_dsack = 1;
	net->ipv4.sysctl_tcp_app_win = 31;
	net->ipv4.sysctl_tcp_adv_win_scale = 1;
	net->ipv4.sysctl_tcp_frto = 2;
	net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
	/* This limits the percentage of the congestion window which we
	 * will allow a single TSO frame to consume.  Building TSO frames
	 * which are too large can cause TCP streams to be bursty.
	 */
	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
	/* Default TSQ limit of 16 TSO segments */
	net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
	/* rfc5961 challenge ack rate limiting */
	net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
	net->ipv4.sysctl_tcp_min_tso_segs = 2;
	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
	net->ipv4.sysctl_tcp_autocorking = 1;
	net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
	net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
	net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
	if (net != &init_net) {
		memcpy(net->ipv4.sysctl_tcp_rmem,
		       init_net.ipv4.sysctl_tcp_rmem,
		       sizeof(init_net.ipv4.sysctl_tcp_rmem));
		memcpy(net->ipv4.sysctl_tcp_wmem,
		       init_net.ipv4.sysctl_tcp_wmem,
		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
	}
	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
	spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
	atomic_set(&net->ipv4.tfo_active_disable_times, 0);

	/* Reno is always built in */
	if (!net_eq(net, &init_net) &&
	    bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
			       init_net.ipv4.tcp_congestion_control->owner))
		net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
	else
		net->ipv4.tcp_congestion_control = &tcp_reno;

	return 0;
fail:
	tcp_sk_exit(net);

	return res;
}

static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
{
	struct net *net;

	inet_twsk_purge(&tcp_hashinfo, AF_INET);

	list_for_each_entry(net, net_exit_list, exit_list)
		tcp_fastopen_ctx_destroy(net);
}

static struct pernet_operations __net_initdata tcp_sk_ops = {
	.init	   = tcp_sk_init,
	.exit	   = tcp_sk_exit,
	.exit_batch = tcp_sk_exit_batch,
};

void __init tcp_v4_init(void)
{
	if (register_pernet_subsys(&tcp_sk_ops))
		panic("Failed to create the TCP control socket.\n");
}