// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system. INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 *		IPv4 specific functions
 *
 *		code split from:
 *		linux/ipv4/tcp.c
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *		See tcp.c for author information
 */

/*
 * Changes:
 *		David S. Miller	:	New socket lookup architecture.
 *					This code is dedicated to John Dyson.
 *		David S. Miller :	Change semantics of established hash,
 *					half is devoted to TIME_WAIT sockets
 *					and the rest go in the other half.
 *		Andi Kleen :		Add support for syncookies and fixed
 *					some bugs: ip options weren't passed to
 *					the TCP layer, missed a check for an
 *					ACK bit.
 *		Andi Kleen :		Implemented fast path mtu discovery.
 *					Fixed many serious bugs in the
 *					request_sock handling and moved
 *					most of it into the af independent code.
 *					Added tail drop and some other bugfixes.
 *					Added new listen semantics.
 *		Mike McLagan	:	Routing by source
 *	Juan Jose Ciarlante:		ip_dynaddr bits
 *		Andi Kleen:		various fixes.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year
 *					coma.
 *	Andi Kleen		:	Fix new listen.
 *	Andi Kleen		:	Fix accept error reporting.
 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
 *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
 *					a single port at the same time.
 */

#define pr_fmt(fmt) "TCP: " fmt

#include <linux/bottom_half.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
#include <linux/slab.h>

#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/secure_seq.h>
#include <net/busy_poll.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/inetdevice.h>
#include <linux/btf_ids.h>

#include <crypto/hash.h>
#include <linux/scatterlist.h>

#include <trace/events/tcp.h>

#ifdef CONFIG_TCP_MD5SIG
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
#endif

struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);

static DEFINE_PER_CPU(struct sock *, ipv4_tcp_sk);

static u32 tcp_v4_init_seq(const struct sk_buff *skb)
{
	return secure_tcp_seq(ip_hdr(skb)->daddr,
			      ip_hdr(skb)->saddr,
			      tcp_hdr(skb)->dest,
			      tcp_hdr(skb)->source);
}

static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
{
	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
}

int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
	int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
	const struct inet_timewait_sock *tw = inet_twsk(sktw);
	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
	struct tcp_sock *tp = tcp_sk(sk);

	if (reuse == 2) {
		/* Still does not detect *everything* that goes through
		 * lo, since we require a loopback src or dst address
		 * or direct binding to 'lo' interface.
		 */
		bool loopback = false;
		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
			loopback = true;
#if IS_ENABLED(CONFIG_IPV6)
		if (tw->tw_family == AF_INET6) {
			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
				loopback = true;
		} else
#endif
		{
			if (ipv4_is_loopback(tw->tw_daddr) ||
			    ipv4_is_loopback(tw->tw_rcv_saddr))
				loopback = true;
		}
		if (!loopback)
			reuse = 0;
	}

	/* With PAWS, it is safe from the viewpoint
	   of data integrity. Even without PAWS it is safe provided sequence
	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.

	   Actually, the idea is close to VJ's one, only timestamp cache is
	   held not per host, but per port pair and TW bucket is used as state
	   holder.

	   If TW bucket has been already destroyed we fall back to VJ's scheme
	   and use initial timestamp retrieved from peer table.
	 */
	if (tcptw->tw_ts_recent_stamp &&
	    (!twp || (reuse && time_after32(ktime_get_seconds(),
					    tcptw->tw_ts_recent_stamp)))) {
		/* In case of repair and re-using TIME-WAIT sockets we still
		 * want to be sure that it is safe as above but honor the
		 * sequence numbers and time stamps set as part of the repair
		 * process.
		 *
		 * Without this check re-using a TIME-WAIT socket with TCP
		 * repair would accumulate a -1 on the repair assigned
		 * sequence number. The first time it is reused the sequence
		 * is -1, the second time -2, etc. This fixes that issue
		 * without appearing to create any others.
		 */
		if (likely(!tp->repair)) {
			u32 seq = tcptw->tw_snd_nxt + 65535 + 2;

			if (!seq)
				seq = 1;
			WRITE_ONCE(tp->write_seq, seq);
			tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
			tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
		}
		sock_hold(sktw);
		return 1;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(tcp_twsk_unique);
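/* Illustrative example (a minimal sketch, assuming the semantics documented
 * for this sysctl in ip-sysctl.rst): the 'reuse' value read at the top of
 * tcp_twsk_unique() comes from the net.ipv4.tcp_tw_reuse knob:
 *
 *   # 0 - never reuse TIME-WAIT sockets for new outgoing connections
 *   # 1 - reuse when it is safe from the protocol point of view
 *   # 2 - reuse only for connections over loopback (the case handled above)
 *   sysctl -w net.ipv4.tcp_tw_reuse=2
 */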

static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
			      int addr_len)
{
	/* This check is replicated from tcp_v4_connect() and intended to
	 * prevent BPF program called below from accessing bytes that are out
	 * of the bound specified by user in addr_len.
	 */
	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	sock_owned_by_me(sk);

	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
}

/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	__be16 orig_sport, orig_dport;
	__be32 daddr, nexthop;
	struct flowi4 *fl4;
	struct rtable *rt;
	int err;
	struct ip_options_rcu *inet_opt;
	struct inet_timewait_death_row *tcp_death_row = sock_net(sk)->ipv4.tcp_death_row;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	inet_opt = rcu_dereference_protected(inet->inet_opt,
					     lockdep_sock_is_held(sk));
	if (inet_opt && inet_opt->opt.srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet_opt->opt.faddr;
	}

	orig_sport = inet->inet_sport;
	orig_dport = usin->sin_port;
	fl4 = &inet->cork.fl.u.ip4;
	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
			      sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
			      orig_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		if (err == -ENETUNREACH)
			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
		return err;
	}

	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet_opt || !inet_opt->opt.srr)
		daddr = fl4->daddr;

	if (!inet->inet_saddr)
		inet->inet_saddr = fl4->saddr;
	sk_rcv_saddr_set(sk, inet->inet_saddr);

	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		if (likely(!tp->repair))
			WRITE_ONCE(tp->write_seq, 0);
	}

	inet->inet_dport = usin->sin_port;
	sk_daddr_set(sk, daddr);

	inet_csk(sk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;

	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and, without releasing the socket
	 * lock, select a source port, enter ourselves into the hash tables and
	 * complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(tcp_death_row, sk);
	if (err)
		goto failure;

	sk_set_txhash(sk);

	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
			       inet->inet_sport, inet->inet_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		rt = NULL;
		goto failure;
	}
	/* OK, now commit destination to socket. */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->dst);
	rt = NULL;

	if (likely(!tp->repair)) {
		if (!tp->write_seq)
			WRITE_ONCE(tp->write_seq,
				   secure_tcp_seq(inet->inet_saddr,
						  inet->inet_daddr,
						  inet->inet_sport,
						  usin->sin_port));
		tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
						 inet->inet_saddr,
						 inet->inet_daddr);
	}

	inet->inet_id = prandom_u32();

	if (tcp_fastopen_defer_connect(sk, &err))
		return err;
	if (err)
		goto failure;

	err = tcp_connect(sk);

	if (err)
		goto failure;

	return 0;

failure:
	/*
	 * This unhashes the socket and releases the local port,
	 * if necessary.
	 */
	tcp_set_state(sk, TCP_CLOSE);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->inet_dport = 0;
	return err;
}
EXPORT_SYMBOL(tcp_v4_connect);
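/* Illustrative example (a minimal user-space sketch, assuming standard POSIX
 * socket headers): tcp_v4_connect() is reached from the connect(2) system
 * call on an AF_INET stream socket, after the tcp_v4_pre_connect() BPF hook:
 *
 *   int fd = socket(AF_INET, SOCK_STREAM, 0);
 *   struct sockaddr_in dst = {
 *           .sin_family = AF_INET,
 *           .sin_port   = htons(80),
 *   };
 *   inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);
 *   connect(fd, (struct sockaddr *)&dst, sizeof(dst));
 */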

/*
 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 * It can be called through tcp_release_cb() if socket was owned by user
 * at the time tcp_v4_err() was called to handle ICMP message.
 */
void tcp_v4_mtu_reduced(struct sock *sk)
{
	struct inet_sock *inet = inet_sk(sk);
	struct dst_entry *dst;
	u32 mtu;

	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
		return;
	mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
	dst = inet_csk_update_pmtu(sk, mtu);
	if (!dst)
		return;

	/* Something is about to be wrong... Remember soft error
	 * for the case, if this connection will not be able to recover.
	 */
	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
		sk->sk_err_soft = EMSGSIZE;

	mtu = dst_mtu(dst);

	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    ip_sk_accept_pmtu(sk) &&
	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
		tcp_sync_mss(sk, mtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}
EXPORT_SYMBOL(tcp_v4_mtu_reduced);

static void do_redirect(struct sk_buff *skb, struct sock *sk)
{
	struct dst_entry *dst = __sk_dst_check(sk, 0);

	if (dst)
		dst->ops->redirect(dst, sk, skb);
}


/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
void tcp_req_err(struct sock *sk, u32 seq, bool abort)
{
	struct request_sock *req = inet_reqsk(sk);
	struct net *net = sock_net(sk);

	/* ICMPs are not backlogged, hence we cannot get
	 * an established socket here.
	 */
	if (seq != tcp_rsk(req)->snt_isn) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
	} else if (abort) {
		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
		tcp_listendrop(req->rsk_listener);
	}
	reqsk_put(req);
}
EXPORT_SYMBOL(tcp_req_err);

/* TCP-LD (RFC 6069) logic */
void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;
	s32 remaining;
	u32 delta_us;

	if (sock_owned_by_user(sk))
		return;

	if (seq != tp->snd_una || !icsk->icsk_retransmits ||
	    !icsk->icsk_backoff)
		return;

	skb = tcp_rtx_queue_head(sk);
	if (WARN_ON_ONCE(!skb))
		return;

	icsk->icsk_backoff--;
	icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
	icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);

	tcp_mstamp_refresh(tp);
	delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
	remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);

	if (remaining > 0) {
		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
					  remaining, TCP_RTO_MAX);
	} else {
		/* RTO revert clocked out retransmission.
		 * Will retransmit now.
		 */
		tcp_retransmit_timer(sk);
	}
}
EXPORT_SYMBOL(tcp_ld_RTO_revert);

/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition. If err < 0 then the socket should
 * be closed and the error returned to the user. If err > 0
 * it's just the icmp type << 8 | icmp code. After adjustment
 * header points to the first 8 bytes of the tcp header. We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 *
 */

int tcp_v4_err(struct sk_buff *skb, u32 info)
{
	const struct iphdr *iph = (const struct iphdr *)skb->data;
	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
	struct tcp_sock *tp;
	struct inet_sock *inet;
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	struct sock *sk;
	struct request_sock *fastopen;
	u32 seq, snd_una;
	int err;
	struct net *net = dev_net(skb->dev);

	sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
				       th->dest, iph->saddr, ntohs(th->source),
				       inet_iif(skb), 0);
	if (!sk) {
		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
		return -ENOENT;
	}
	if (sk->sk_state == TCP_TIME_WAIT) {
		inet_twsk_put(inet_twsk(sk));
		return 0;
	}
	seq = ntohl(th->seq);
	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
				     type == ICMP_TIME_EXCEEDED ||
				     (type == ICMP_DEST_UNREACH &&
				      (code == ICMP_NET_UNREACH ||
				       code == ICMP_HOST_UNREACH)));
		return 0;
	}

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 * We do take care of PMTU discovery (RFC1191) special case :
	 * we can receive locally generated ICMP messages while socket is held.
	 */
	if (sock_owned_by_user(sk)) {
		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
	}
	if (sk->sk_state == TCP_CLOSE)
		goto out;

	if (static_branch_unlikely(&ip4_min_ttl)) {
		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
			goto out;
		}
	}

	tp = tcp_sk(sk);
	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
	fastopen = rcu_dereference(tp->fastopen_rsk);
	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, snd_una, tp->snd_nxt)) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	switch (type) {
	case ICMP_REDIRECT:
		if (!sock_owned_by_user(sk))
			do_redirect(skb, sk);
		goto out;
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			/* We are not interested in TCP_LISTEN and open_requests
			 * (SYN-ACKs sent out by Linux are always <576bytes so
			 * they should go through unfragmented).
			 */
			if (sk->sk_state == TCP_LISTEN)
				goto out;

			WRITE_ONCE(tp->mtu_info, info);
			if (!sock_owned_by_user(sk)) {
				tcp_v4_mtu_reduced(sk);
			} else {
				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
					sock_hold(sk);
			}
			goto out;
		}

		err = icmp_err_convert[code].errno;
		/* check if this ICMP message allows revert of backoff.
		 * (see RFC 6069)
		 */
		if (!fastopen &&
		    (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
			tcp_ld_RTO_revert(sk, seq);
		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->sk_state) {
	case TCP_SYN_SENT:
	case TCP_SYN_RECV:
		/* Only in fast or simultaneous open. If a fast open socket is
		 * already accepted it is treated as a connected one below.
		 */
		if (fastopen && !fastopen->sk)
			break;

		ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);

		if (!sock_owned_by_user(sk)) {
			sk->sk_err = err;

			sk_error_report(sk);

			tcp_done(sk);
		} else {
			sk->sk_err_soft = err;
		}
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows to consider as hard errors
	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note, that in modern internet, where routing is unreliable
	 * and in each dark corner broken firewalls sit, sending random
	 * errors ordered by their masters even these two messages finally lose
	 * their original sense (even Linux sends invalid PORT_UNREACHs)
	 *
	 * Now we are in compliance with RFCs.
	 * --ANK (980905)
	 */

	inet = inet_sk(sk);
	if (!sock_owned_by_user(sk) && inet->recverr) {
		sk->sk_err = err;
		sk_error_report(sk);
	} else	{ /* Only an error on timeout */
		sk->sk_err_soft = err;
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
	return 0;
}

void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
{
	struct tcphdr *th = tcp_hdr(skb);

	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
	skb->csum_start = skb_transport_header(skb) - skb->head;
	skb->csum_offset = offsetof(struct tcphdr, check);
}

/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
{
	const struct inet_sock *inet = inet_sk(sk);

	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
}
EXPORT_SYMBOL(tcp_v4_send_check);

/*
 *	This routine will send an RST to the other tcp.
 *
 *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 *		      for reset.
 *	Answer: if a packet caused RST, it is not for a socket
 *		existing in our system, if it is matched to a socket,
 *		it is just duplicate segment or bug in other side's TCP.
 *		So we build the reply based only on the parameters
 *		that arrived with the segment.
 *	Exception: precedence violation. We do not implement it in any case.
 */

#ifdef CONFIG_TCP_MD5SIG
#define OPTION_BYTES TCPOLEN_MD5SIG_ALIGNED
#else
#define OPTION_BYTES sizeof(__be32)
#endif

static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
		__be32 opt[OPTION_BYTES / sizeof(__be32)];
	} rep;
	struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key = NULL;
	const __u8 *hash_location = NULL;
	unsigned char newhash[16];
	int genhash;
	struct sock *sk1 = NULL;
#endif
	u64 transmit_time = 0;
	struct sock *ctl_sk;
	struct net *net;

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	/* If sk not NULL, it means we did a successful lookup and incoming
	 * route had to be correct. prequeue might have dropped our dst.
	 */
	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rep, 0, sizeof(rep));
	rep.th.dest = th->source;
	rep.th.source = th->dest;
	rep.th.doff = sizeof(struct tcphdr) / 4;
	rep.th.rst = 1;

	if (th->ack) {
		rep.th.seq = th->ack_seq;
	} else {
		rep.th.ack = 1;
		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				       skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof(arg));
	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len = sizeof(rep.th);

	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
#ifdef CONFIG_TCP_MD5SIG
	rcu_read_lock();
	hash_location = tcp_parse_md5sig_option(th);
	if (sk && sk_fullsock(sk)) {
		const union tcp_md5_addr *addr;
		int l3index;

		/* sdif set, means packet ingressed via a device
		 * in an L3 domain and inet_iif is set to it.
		 */
		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
	} else if (hash_location) {
		const union tcp_md5_addr *addr;
		int sdif = tcp_v4_sdif(skb);
		int dif = inet_iif(skb);
		int l3index;

		/*
		 * The active side is lost. Try to find the listening socket
		 * through the source port, and then find the md5 key through
		 * the listening socket. We are not loosening security here:
		 * the incoming packet is checked against the md5 hash of the
		 * key we find, and no RST is generated if the hash doesn't match.
		 */
		sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
					     ip_hdr(skb)->saddr,
					     th->source, ip_hdr(skb)->daddr,
					     ntohs(th->source), dif, sdif);
		/* don't send rst if it can't find key */
		if (!sk1)
			goto out;

		/* sdif set, means packet ingressed via a device
		 * in an L3 domain and dif is set to it.
		 */
		l3index = sdif ? dif : 0;
		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
		if (!key)
			goto out;


		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
		if (genhash || memcmp(hash_location, newhash, 16) != 0)
			goto out;

	}

	if (key) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
				   (TCPOPT_NOP << 16) |
				   (TCPOPT_MD5SIG << 8) |
				   TCPOLEN_MD5SIG);
		/* Update length and the length the header thinks exists */
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	/* Can't co-exist with TCPMD5, hence check rep.opt[0] */
	if (rep.opt[0] == 0) {
		__be32 mrst = mptcp_reset_option(skb);

		if (mrst) {
			rep.opt[0] = mrst;
			arg.iov[0].iov_len += sizeof(mrst);
			rep.th.doff = arg.iov[0].iov_len / 4;
		}
	}

	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;

	/* When socket is gone, all binding information is lost.
	 * routing might fail in this case. No choice here, if we choose to force
	 * input interface, we will misroute in case of asymmetric route.
	 */
	if (sk) {
		arg.bound_dev_if = sk->sk_bound_dev_if;
		if (sk_fullsock(sk))
			trace_tcp_send_reset(sk, skb);
	}

	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));

	arg.tos = ip_hdr(skb)->tos;
	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
	local_bh_disable();
	ctl_sk = this_cpu_read(ipv4_tcp_sk);
	sock_net_set(ctl_sk, net);
	if (sk) {
		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
				   inet_twsk(sk)->tw_mark : sk->sk_mark;
		ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
				   inet_twsk(sk)->tw_priority : sk->sk_priority;
		transmit_time = tcp_transmit_time(sk);
		xfrm_sk_clone_policy(ctl_sk, sk);
	}
	ip_send_unicast_reply(ctl_sk,
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len,
			      transmit_time);

	ctl_sk->sk_mark = 0;
	xfrm_sk_free_policy(ctl_sk);
	sock_net_set(ctl_sk, &init_net);
	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
	local_bh_enable();

#ifdef CONFIG_TCP_MD5SIG
out:
	rcu_read_unlock();
#endif
}
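/* Illustrative example (a worked instance of the arithmetic above, not an
 * additional code path): for an incoming segment with seq = 1000, no
 * SYN/FIN, a 20-byte TCP header (doff = 5) and 100 bytes of payload
 * (skb->len = 120):
 *
 *   - if the segment carried an ACK, the RST uses rep.th.seq = its ack_seq
 *     and does not set the ACK bit;
 *   - otherwise the RST acknowledges everything received so far:
 *         rep.th.ack_seq = 1000 + 0 + 0 + 120 - 20 = 1120
 *     with rep.th.seq left at zero and the ACK bit set.
 */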

/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
   outside socket context is ugly, certainly. What can I do?
 */

static void tcp_v4_send_ack(const struct sock *sk,
			    struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 tsval, u32 tsecr, int oif,
			    struct tcp_md5sig_key *key,
			    int reply_flags, u8 tos)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
#ifdef CONFIG_TCP_MD5SIG
			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
#endif
			];
	} rep;
	struct net *net = sock_net(sk);
	struct ip_reply_arg arg;
	struct sock *ctl_sk;
	u64 transmit_time;

	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof(arg));

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len = sizeof(rep.th);
	if (tsecr) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				   (TCPOPT_TIMESTAMP << 8) |
				   TCPOLEN_TIMESTAMP);
		rep.opt[1] = htonl(tsval);
		rep.opt[2] = htonl(tsecr);
		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
	}

	/* Swap the send and the receive. */
	rep.th.dest = th->source;
	rep.th.source = th->dest;
	rep.th.doff = arg.iov[0].iov_len / 4;
	rep.th.seq = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack = 1;
	rep.th.window = htons(win);

#ifdef CONFIG_TCP_MD5SIG
	if (key) {
		int offset = (tsecr) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
					  (TCPOPT_NOP << 16) |
					  (TCPOPT_MD5SIG << 8) |
					  TCPOLEN_MD5SIG);
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len/4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.flags = reply_flags;
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	if (oif)
		arg.bound_dev_if = oif;
	arg.tos = tos;
	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
	local_bh_disable();
	ctl_sk = this_cpu_read(ipv4_tcp_sk);
	sock_net_set(ctl_sk, net);
	ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
			   inet_twsk(sk)->tw_mark : sk->sk_mark;
	ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
			   inet_twsk(sk)->tw_priority : sk->sk_priority;
	transmit_time = tcp_transmit_time(sk);
	ip_send_unicast_reply(ctl_sk,
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len,
			      transmit_time);

	ctl_sk->sk_mark = 0;
	sock_net_set(ctl_sk, &init_net);
	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	local_bh_enable();
}

static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
	struct inet_timewait_sock *tw = inet_twsk(sk);
	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);

	tcp_v4_send_ack(sk, skb,
			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
			tcp_time_stamp_raw() + tcptw->tw_ts_offset,
			tcptw->tw_ts_recent,
			tw->tw_bound_dev_if,
			tcp_twsk_md5_key(tcptw),
			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
			tw->tw_tos
			);

	inet_twsk_put(tw);
}

static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req)
{
	const union tcp_md5_addr *addr;
	int l3index;

	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
	 */
	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
						 tcp_sk(sk)->snd_nxt;

	/* RFC 7323 2.3
	 * The window field (SEG.WND) of every outgoing segment, with the
	 * exception of <SYN> segments, MUST be right-shifted by
	 * Rcv.Wind.Shift bits:
	 */
	addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
	l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
	tcp_v4_send_ack(sk, skb, seq,
			tcp_rsk(req)->rcv_nxt,
			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
			tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
			req->ts_recent,
			0,
			tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
			ip_hdr(skb)->tos);
}

/*
 *	Send a SYN-ACK after having received a SYN.
 *	This still operates on a request_sock only, not on a big
 *	socket.
 */
static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
			      struct flowi *fl,
			      struct request_sock *req,
			      struct tcp_fastopen_cookie *foc,
			      enum tcp_synack_type synack_type,
			      struct sk_buff *syn_skb)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	struct flowi4 fl4;
	int err = -1;
	struct sk_buff *skb;
	u8 tos;

	/* First, grab a route. */
	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
		return -1;

	skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);

	if (skb) {
		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);

		tos = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) ?
				(tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
				(inet_sk(sk)->tos & INET_ECN_MASK) :
				inet_sk(sk)->tos;

		if (!INET_ECN_is_capable(tos) &&
		    tcp_bpf_ca_needs_ecn((struct sock *)req))
			tos |= INET_ECN_ECT_0;

		rcu_read_lock();
		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
					    ireq->ir_rmt_addr,
					    rcu_dereference(ireq->ireq_opt),
					    tos);
		rcu_read_unlock();
		err = net_xmit_eval(err);
	}

	return err;
}

/*
 *	IPv4 request_sock destructor.
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
}

#ifdef CONFIG_TCP_MD5SIG
/*
 * RFC2385 MD5 checksumming requires a mapping of
 * IP address->MD5 Key.
 * We need to maintain these in the sk structure.
 */

DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
EXPORT_SYMBOL(tcp_md5_needed);

static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
{
	if (!old)
		return true;

	/* l3index always overrides non-l3index */
	if (old->l3index && new->l3index == 0)
		return false;
	if (old->l3index == 0 && new->l3index)
		return true;

	return old->prefixlen < new->prefixlen;
}
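/* Illustrative example (derived from the comparison above, not an extra
 * rule): with two configured keys that both cover the peer address,
 * better_md5_match() prefers a key bound to an L3 domain (non-zero l3index)
 * over an unbound one, and only then prefers the longer prefix - e.g. a
 * VRF-bound 198.51.100.0/24 key wins over a global 198.51.100.0/28 key.
 */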
/* Find the Key structure for an address. */
struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
					   const union tcp_md5_addr *addr,
					   int family)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	const struct tcp_md5sig_info *md5sig;
	__be32 mask;
	struct tcp_md5sig_key *best_match = NULL;
	bool match;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       lockdep_sock_is_held(sk));
	if (!md5sig)
		return NULL;

	hlist_for_each_entry_rcu(key, &md5sig->head, node,
				 lockdep_sock_is_held(sk)) {
		if (key->family != family)
			continue;
		if (key->flags & TCP_MD5SIG_FLAG_IFINDEX && key->l3index != l3index)
			continue;
		if (family == AF_INET) {
			mask = inet_make_mask(key->prefixlen);
			match = (key->addr.a4.s_addr & mask) ==
				(addr->a4.s_addr & mask);
#if IS_ENABLED(CONFIG_IPV6)
		} else if (family == AF_INET6) {
			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
						  key->prefixlen);
#endif
		} else {
			match = false;
		}

		if (match && better_md5_match(best_match, key))
			best_match = key;
	}
	return best_match;
}
EXPORT_SYMBOL(__tcp_md5_do_lookup);

static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
						      const union tcp_md5_addr *addr,
						      int family, u8 prefixlen,
						      int l3index, u8 flags)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	unsigned int size = sizeof(struct in_addr);
	const struct tcp_md5sig_info *md5sig;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       lockdep_sock_is_held(sk));
	if (!md5sig)
		return NULL;
#if IS_ENABLED(CONFIG_IPV6)
	if (family == AF_INET6)
		size = sizeof(struct in6_addr);
#endif
	hlist_for_each_entry_rcu(key, &md5sig->head, node,
				 lockdep_sock_is_held(sk)) {
		if (key->family != family)
			continue;
		if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
			continue;
		if (key->l3index != l3index)
			continue;
		if (!memcmp(&key->addr, addr, size) &&
		    key->prefixlen == prefixlen)
			return key;
	}
	return NULL;
}

struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
					 const struct sock *addr_sk)
{
	const union tcp_md5_addr *addr;
	int l3index;

	l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
						 addr_sk->sk_bound_dev_if);
	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
	return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
}
EXPORT_SYMBOL(tcp_v4_md5_lookup);

/* This can be called on a newly created socket, from other files */
int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
		   int family, u8 prefixlen, int l3index, u8 flags,
		   const u8 *newkey, u8 newkeylen, gfp_t gfp)
{
	/* Add Key to the list */
	struct tcp_md5sig_key *key;
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_info *md5sig;

	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
	if (key) {
		/* Pre-existing entry - just update that one.
		 * Note that the key might be used concurrently.
		 * data_race() is telling KCSAN that we do not care about
		 * key mismatches, since changing MD5 key on live flows
		 * can lead to packet drops.
		 */
		data_race(memcpy(key->key, newkey, newkeylen));

		/* Pairs with READ_ONCE() in tcp_md5_hash_key().
		 * Also note that a reader could catch new key->keylen value
		 * but old key->key[], this is the reason we use __GFP_ZERO
		 * at sock_kmalloc() time below these lines.
		 */
		WRITE_ONCE(key->keylen, newkeylen);

		return 0;
	}

	md5sig = rcu_dereference_protected(tp->md5sig_info,
					   lockdep_sock_is_held(sk));
	if (!md5sig) {
		md5sig = kmalloc(sizeof(*md5sig), gfp);
		if (!md5sig)
			return -ENOMEM;

		sk_gso_disable(sk);
		INIT_HLIST_HEAD(&md5sig->head);
		rcu_assign_pointer(tp->md5sig_info, md5sig);
	}

	key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
	if (!key)
		return -ENOMEM;
	if (!tcp_alloc_md5sig_pool()) {
		sock_kfree_s(sk, key, sizeof(*key));
		return -ENOMEM;
	}

	memcpy(key->key, newkey, newkeylen);
	key->keylen = newkeylen;
	key->family = family;
	key->prefixlen = prefixlen;
	key->l3index = l3index;
	key->flags = flags;
	memcpy(&key->addr, addr,
	       (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) :
								 sizeof(struct in_addr));
	hlist_add_head_rcu(&key->node, &md5sig->head);
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_add);

int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
		   u8 prefixlen, int l3index, u8 flags)
{
	struct tcp_md5sig_key *key;

	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
	if (!key)
		return -ENOENT;
	hlist_del_rcu(&key->node);
	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
	kfree_rcu(key, rcu);
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_del);

static void tcp_clear_md5_list(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	struct hlist_node *n;
	struct tcp_md5sig_info *md5sig;

	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);

	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
		hlist_del_rcu(&key->node);
		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
		kfree_rcu(key, rcu);
	}
}

static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
				 sockptr_t optval, int optlen)
{
	struct tcp_md5sig cmd;
	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
	const union tcp_md5_addr *addr;
	u8 prefixlen = 32;
	int l3index = 0;
	u8 flags;

	if (optlen < sizeof(cmd))
		return -EINVAL;

	if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
		return -EFAULT;

	if (sin->sin_family != AF_INET)
		return -EINVAL;

	flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;

	if (optname == TCP_MD5SIG_EXT &&
	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
		prefixlen = cmd.tcpm_prefixlen;
		if (prefixlen > 32)
			return -EINVAL;
	}

	if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
		struct net_device *dev;

		rcu_read_lock();
		dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
		if (dev && netif_is_l3_master(dev))
			l3index = dev->ifindex;

		rcu_read_unlock();

		/* ok to reference set/not set outside of rcu;
		 * right now device MUST be an L3 master
		 */
		if (!dev || !l3index)
			return -EINVAL;
	}

	addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;

	if (!cmd.tcpm_keylen)
		return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);

	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
		return -EINVAL;

	return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
			      cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
}
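/* Illustrative example (a minimal user-space sketch, assuming the uapi
 * definitions of TCP_MD5SIG and struct tcp_md5sig): the handler above is
 * reached from user space via setsockopt():
 *
 *   struct tcp_md5sig md5 = { .tcpm_keylen = 6 };
 *   struct sockaddr_in *peer = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *   peer->sin_family = AF_INET;
 *   inet_pton(AF_INET, "192.0.2.1", &peer->sin_addr);
 *   memcpy(md5.tcpm_key, "secret", 6);
 *   setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 *
 * Passing tcpm_keylen == 0 instead deletes the key for that address.
 */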

static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
				   __be32 daddr, __be32 saddr,
				   const struct tcphdr *th, int nbytes)
{
	struct tcp4_pseudohdr *bp;
	struct scatterlist sg;
	struct tcphdr *_th;

	bp = hp->scratch;
	bp->saddr = saddr;
	bp->daddr = daddr;
	bp->pad = 0;
	bp->protocol = IPPROTO_TCP;
	bp->len = cpu_to_be16(nbytes);

	_th = (struct tcphdr *)(bp + 1);
	memcpy(_th, th, sizeof(*th));
	_th->check = 0;

	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
	ahash_request_set_crypt(hp->md5_req, &sg, NULL,
				sizeof(*bp) + sizeof(*th));
	return crypto_ahash_update(hp->md5_req);
}

static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
{
	struct tcp_md5sig_pool *hp;
	struct ahash_request *req;

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	req = hp->md5_req;

	if (crypto_ahash_init(req))
		goto clear_hash;
	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	ahash_request_set_crypt(req, NULL, md5_hash, 0);
	if (crypto_ahash_final(req))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}

int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
			const struct sock *sk,
			const struct sk_buff *skb)
{
	struct tcp_md5sig_pool *hp;
	struct ahash_request *req;
	const struct tcphdr *th = tcp_hdr(skb);
	__be32 saddr, daddr;

	if (sk) { /* valid for establish/request sockets */
		saddr = sk->sk_rcv_saddr;
		daddr = sk->sk_daddr;
	} else {
		const struct iphdr *iph = ip_hdr(skb);
		saddr = iph->saddr;
		daddr = iph->daddr;
	}

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	req = hp->md5_req;

	if (crypto_ahash_init(req))
		goto clear_hash;

	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
		goto clear_hash;
	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	ahash_request_set_crypt(req, NULL, md5_hash, 0);
	if (crypto_ahash_final(req))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}
EXPORT_SYMBOL(tcp_v4_md5_hash_skb);

#endif

static void tcp_v4_init_req(struct request_sock *req,
			    const struct sock *sk_listener,
			    struct sk_buff *skb)
{
	struct inet_request_sock *ireq = inet_rsk(req);
	struct net *net = sock_net(sk_listener);

	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
}

static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
					  struct sk_buff *skb,
					  struct flowi *fl,
					  struct request_sock *req)
{
	tcp_v4_init_req(req, sk, skb);

	if (security_inet_conn_request(sk, skb, req))
		return NULL;

	return inet_csk_route_req(sk, &fl->u.ip4, req);
}

struct request_sock_ops tcp_request_sock_ops __read_mostly = {
	.family = PF_INET,
	.obj_size = sizeof(struct tcp_request_sock),
	.rtx_syn_ack = tcp_rtx_synack,
	.send_ack = tcp_v4_reqsk_send_ack,
	.destructor = tcp_v4_reqsk_destructor,
	.send_reset = tcp_v4_send_reset,
	.syn_ack_timeout = tcp_syn_ack_timeout,
};

const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
	.mss_clamp = TCP_MSS_DEFAULT,
#ifdef CONFIG_TCP_MD5SIG
	.req_md5_lookup = tcp_v4_md5_lookup,
	.calc_md5_hash = tcp_v4_md5_hash_skb,
#endif
#ifdef CONFIG_SYN_COOKIES
	.cookie_init_seq = cookie_v4_init_sequence,
#endif
	.route_req = tcp_v4_route_req,
	.init_seq = tcp_v4_init_seq,
	.init_ts_off = tcp_v4_init_ts_off,
	.send_synack = tcp_v4_send_synack,
};

int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	/* Never answer to SYNs sent to broadcast or multicast */
	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	return tcp_conn_request(&tcp_request_sock_ops,
				&tcp_request_sock_ipv4_ops, sk, skb);

drop:
	tcp_listendrop(sk);
	return 0;
}
EXPORT_SYMBOL(tcp_v4_conn_request);


/*
 * The three way handshake has completed - we got a valid synack -
 * now create the new socket.
 */
struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct dst_entry *dst,
				  struct request_sock *req_unhash,
				  bool *own_req)
{
	struct inet_request_sock *ireq;
	bool found_dup_sk = false;
	struct inet_sock *newinet;
	struct tcp_sock *newtp;
	struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
	const union tcp_md5_addr *addr;
	struct tcp_md5sig_key *key;
	int l3index;
#endif
	struct ip_options_rcu *inet_opt;

	if (sk_acceptq_is_full(sk))
		goto exit_overflow;

	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit_nonewsk;

	newsk->sk_gso_type = SKB_GSO_TCPV4;
	inet_sk_rx_dst_set(newsk, skb);

	newtp = tcp_sk(newsk);
	newinet = inet_sk(newsk);
	ireq = inet_rsk(req);
	sk_daddr_set(newsk, ireq->ir_rmt_addr);
	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
	newsk->sk_bound_dev_if = ireq->ir_iif;
	newinet->inet_saddr = ireq->ir_loc_addr;
	inet_opt = rcu_dereference(ireq->ireq_opt);
	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
	newinet->mc_index = inet_iif(skb);
	newinet->mc_ttl = ip_hdr(skb)->ttl;
	newinet->rcv_tos = ip_hdr(skb)->tos;
	inet_csk(newsk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
	newinet->inet_id = prandom_u32();

	/* Set ToS of the new socket based upon the value of incoming SYN.
	 * ECT bits are set later in tcp_init_transfer().
	 */
	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
		newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;

	if (!dst) {
		dst = inet_csk_route_child_sock(sk, newsk, req);
		if (!dst)
			goto put_and_exit;
	} else {
		/* syncookie case : see end of cookie_v4_check() */
	}
	sk_setup_caps(newsk, dst);

	tcp_ca_openreq_child(newsk, dst);

	tcp_sync_mss(newsk, dst_mtu(dst));
	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));

	tcp_initialize_rcv_mss(newsk);

#ifdef CONFIG_TCP_MD5SIG
	l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
	/* Copy over the MD5 key from the original socket */
	addr = (union tcp_md5_addr *)&newinet->inet_daddr;
	key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
	if (key) {
		/*
		 * We're using one, so create a matching key
		 * on the newsk structure. If we fail to get
		 * memory, then we end up not copying the key
		 * across. Shucks.
		 */
		tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index, key->flags,
			       key->key, key->keylen, GFP_ATOMIC);
		sk_gso_disable(newsk);
	}
#endif

	if (__inet_inherit_port(sk, newsk) < 0)
		goto put_and_exit;
	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
				       &found_dup_sk);
	if (likely(*own_req)) {
		tcp_move_syn(newtp, req);
		ireq->ireq_opt = NULL;
	} else {
		newinet->inet_opt = NULL;

		if (!req_unhash && found_dup_sk) {
			/* This code path should only be executed in the
			 * syncookie case
			 */
			bh_unlock_sock(newsk);
			sock_put(newsk);
			newsk = NULL;
		}
	}
	return newsk;

exit_overflow:
	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
exit_nonewsk:
	dst_release(dst);
exit:
	tcp_listendrop(sk);
	return NULL;
put_and_exit:
	newinet->inet_opt = NULL;
	inet_csk_prepare_forced_close(newsk);
	tcp_done(newsk);
	goto exit;
}
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);

static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
{
#ifdef CONFIG_SYN_COOKIES
	const struct tcphdr *th = tcp_hdr(skb);

	if (!th->syn)
		sk = cookie_v4_check(sk, skb);
#endif
	return sk;
}

u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
			 struct tcphdr *th, u32 *cookie)
{
	u16 mss = 0;
#ifdef CONFIG_SYN_COOKIES
	mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
				    &tcp_request_sock_ipv4_ops, sk, th);
	if (mss) {
		*cookie = __cookie_v4_init_sequence(iph, th, &mss);
		tcp_synq_overflow(sk);
	}
#endif
	return mss;
}

INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
							   u32));
/* The socket must have its spinlock held when we get
 * here, unless it is a TCP_LISTEN socket.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	enum skb_drop_reason reason;
	struct sock *rsk;

	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
		struct dst_entry *dst;

		dst = rcu_dereference_protected(sk->sk_rx_dst,
						lockdep_sock_is_held(sk));

		sock_rps_save_rxhash(sk, skb);
		sk_mark_napi_id(sk, skb);
		if (dst) {
			if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
			    !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
					     dst, 0)) {
				RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
				dst_release(dst);
			}
		}
		tcp_rcv_established(sk, skb);
		return 0;
	}

	reason = SKB_DROP_REASON_NOT_SPECIFIED;
	if (tcp_checksum_complete(skb))
		goto csum_err;

	if (sk->sk_state == TCP_LISTEN) {
		struct sock *nsk = tcp_v4_cookie_check(sk, skb);

		if (!nsk)
			goto discard;
		if (nsk != sk) {
			if (tcp_child_process(sk, nsk, skb)) {
				rsk = nsk;
				goto reset;
			}
			return 0;
		}
	} else
		sock_rps_save_rxhash(sk, skb);

	if (tcp_rcv_state_process(sk, skb)) {
		rsk = sk;
		goto reset;
	}
	return 0;

reset:
	tcp_v4_send_reset(rsk, skb);
discard:
	kfree_skb_reason(skb, reason);
	/* Be careful here. If this function gets more complicated and
	 * gcc suffers from register pressure on the x86, sk (in %ebx)
	 * might be destroyed here. This current version compiles correctly,
	 * but you have been warned.
	 */
	return 0;

csum_err:
	reason = SKB_DROP_REASON_TCP_CSUM;
	trace_tcp_bad_csum(skb);
	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
	goto discard;
}
EXPORT_SYMBOL(tcp_v4_do_rcv);

int tcp_v4_early_demux(struct sk_buff *skb)
{
	const struct iphdr *iph;
	const struct tcphdr *th;
	struct sock *sk;

	if (skb->pkt_type != PACKET_HOST)
		return 0;

	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
		return 0;

	iph = ip_hdr(skb);
	th = tcp_hdr(skb);

	if (th->doff < sizeof(struct tcphdr) / 4)
		return 0;

	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
				       iph->saddr, th->source,
				       iph->daddr, ntohs(th->dest),
				       skb->skb_iif, inet_sdif(skb));
	if (sk) {
		skb->sk = sk;
		skb->destructor = sock_edemux;
		if (sk_fullsock(sk)) {
			struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);

			if (dst)
				dst = dst_check(dst, 0);
			if (dst &&
			    sk->sk_rx_dst_ifindex == skb->skb_iif)
				skb_dst_set_noref(skb, dst);
		}
	}
	return 0;
}

bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
		     enum skb_drop_reason *reason)
{
	u32 limit, tail_gso_size, tail_gso_segs;
	struct skb_shared_info *shinfo;
	const struct tcphdr *th;
	struct tcphdr *thtail;
	struct sk_buff *tail;
	unsigned int hdrlen;
	bool fragstolen;
	u32 gso_segs;
	u32 gso_size;
	int delta;

	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
	 * we can fix skb->truesize to its real value to avoid future drops.
	 * This is valid because skb is not yet charged to the socket.
	 * It has been noticed pure SACK packets were sometimes dropped
	 * (if cooked by drivers without copybreak feature).
	 */
	skb_condense(skb);

	skb_dst_drop(skb);

	if (unlikely(tcp_checksum_complete(skb))) {
		bh_unlock_sock(sk);
		trace_tcp_bad_csum(skb);
		*reason = SKB_DROP_REASON_TCP_CSUM;
		__TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
		__TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
		return true;
	}

	/* Attempt coalescing to last skb in backlog, even if we are
	 * above the limits.
	 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
	 */
	th = (const struct tcphdr *)skb->data;
	hdrlen = th->doff * 4;

	tail = sk->sk_backlog.tail;
	if (!tail)
		goto no_coalesce;
	thtail = (struct tcphdr *)tail->data;

	if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
	    TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
	    ((TCP_SKB_CB(tail)->tcp_flags |
	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
	    !((TCP_SKB_CB(tail)->tcp_flags &
	      TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
	    ((TCP_SKB_CB(tail)->tcp_flags ^
	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
#ifdef CONFIG_TLS_DEVICE
	    tail->decrypted != skb->decrypted ||
#endif
	    thtail->doff != th->doff ||
	    memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
		goto no_coalesce;

	__skb_pull(skb, hdrlen);

	shinfo = skb_shinfo(skb);
	gso_size = shinfo->gso_size ?: skb->len;
	gso_segs = shinfo->gso_segs ?: 1;

	shinfo = skb_shinfo(tail);
	tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
	tail_gso_segs = shinfo->gso_segs ?: 1;

	if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
		TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;

		if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
			TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
			thtail->window = th->window;
		}

		/* We have to update both TCP_SKB_CB(tail)->tcp_flags and
		 * thtail->fin, so that the fast path in tcp_rcv_established()
		 * is not entered if we append a packet with a FIN.
		 * SYN, RST, URG are not present.
		 * ACK is set on both packets.
		 * PSH : we do not really care in TCP stack,
		 *       at least for 'GRO' packets.
		 */
		thtail->fin |= th->fin;
		TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;

		if (TCP_SKB_CB(skb)->has_rxtstamp) {
			TCP_SKB_CB(tail)->has_rxtstamp = true;
			tail->tstamp = skb->tstamp;
			skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
		}

		/* Not as strict as GRO. We only need to carry mss max value */
		shinfo->gso_size = max(gso_size, tail_gso_size);
		shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);

		sk->sk_backlog.len += delta;
		__NET_INC_STATS(sock_net(sk),
				LINUX_MIB_TCPBACKLOGCOALESCE);
		kfree_skb_partial(skb, fragstolen);
		return false;
	}
	__skb_push(skb, hdrlen);

no_coalesce:
	/* Only socket owner can try to collapse/prune rx queues
	 * to reduce memory overhead, so add a little headroom here.
	 * Few sockets backlog are possibly concurrently non empty.
	 */
	limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf) + 64*1024;

	if (unlikely(sk_add_backlog(sk, skb, limit))) {
		bh_unlock_sock(sk);
		*reason = SKB_DROP_REASON_SOCKET_BACKLOG;
		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
		return true;
	}
	return false;
}
EXPORT_SYMBOL(tcp_add_backlog);

int tcp_filter(struct sock *sk, struct sk_buff *skb)
{
	struct tcphdr *th = (struct tcphdr *)skb->data;

	return sk_filter_trim_cap(sk, skb, th->doff * 4);
}
EXPORT_SYMBOL(tcp_filter);

static void tcp_v4_restore_cb(struct sk_buff *skb)
{
	memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
		sizeof(struct inet_skb_parm));
}

static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
			   const struct tcphdr *th)
{
	/* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
	 * barrier() makes sure compiler won't play fool^Waliasing games.
	 */
	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
		sizeof(struct inet_skb_parm));
	barrier();

	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
				    skb->len - th->doff * 4);
	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
	TCP_SKB_CB(skb)->sacked = 0;
	TCP_SKB_CB(skb)->has_rxtstamp =
			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
}
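/* Illustrative example (a worked instance of the end_seq arithmetic above,
 * not an additional rule): for a segment with seq = 5000, a 32-byte TCP
 * header (doff = 8), 1448 bytes of payload (skb->len = 1480) and no SYN/FIN:
 *
 *   end_seq = 5000 + 0 + 0 + 1480 - 8 * 4 = 6448
 *
 * so end_seq - seq equals the payload length; SYN and FIN each consume one
 * extra unit of sequence space when present.
 */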
*/ 1947 1948 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo)) 1949 goto csum_error; 1950 1951 th = (const struct tcphdr *)skb->data; 1952 iph = ip_hdr(skb); 1953 lookup: 1954 sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source, 1955 th->dest, sdif, &refcounted); 1956 if (!sk) 1957 goto no_tcp_socket; 1958 1959 process: 1960 if (sk->sk_state == TCP_TIME_WAIT) 1961 goto do_time_wait; 1962 1963 if (sk->sk_state == TCP_NEW_SYN_RECV) { 1964 struct request_sock *req = inet_reqsk(sk); 1965 bool req_stolen = false; 1966 struct sock *nsk; 1967 1968 sk = req->rsk_listener; 1969 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) 1970 drop_reason = SKB_DROP_REASON_XFRM_POLICY; 1971 else 1972 drop_reason = tcp_inbound_md5_hash(sk, skb, 1973 &iph->saddr, &iph->daddr, 1974 AF_INET, dif, sdif); 1975 if (unlikely(drop_reason)) { 1976 sk_drops_add(sk, skb); 1977 reqsk_put(req); 1978 goto discard_it; 1979 } 1980 if (tcp_checksum_complete(skb)) { 1981 reqsk_put(req); 1982 goto csum_error; 1983 } 1984 if (unlikely(sk->sk_state != TCP_LISTEN)) { 1985 nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb); 1986 if (!nsk) { 1987 inet_csk_reqsk_queue_drop_and_put(sk, req); 1988 goto lookup; 1989 } 1990 sk = nsk; 1991 /* reuseport_migrate_sock() has already held one sk_refcnt 1992 * before returning. 1993 */ 1994 } else { 1995 /* We own a reference on the listener, increase it again 1996 * as we might lose it too soon. 1997 */ 1998 sock_hold(sk); 1999 } 2000 refcounted = true; 2001 nsk = NULL; 2002 if (!tcp_filter(sk, skb)) { 2003 th = (const struct tcphdr *)skb->data; 2004 iph = ip_hdr(skb); 2005 tcp_v4_fill_cb(skb, iph, th); 2006 nsk = tcp_check_req(sk, skb, req, false, &req_stolen); 2007 } else { 2008 drop_reason = SKB_DROP_REASON_SOCKET_FILTER; 2009 } 2010 if (!nsk) { 2011 reqsk_put(req); 2012 if (req_stolen) { 2013 /* Another cpu got exclusive access to req 2014 * and created a full blown socket. 2015 * Try to feed this packet to this socket 2016 * instead of discarding it. 
2017 */ 2018 tcp_v4_restore_cb(skb); 2019 sock_put(sk); 2020 goto lookup; 2021 } 2022 goto discard_and_relse; 2023 } 2024 nf_reset_ct(skb); 2025 if (nsk == sk) { 2026 reqsk_put(req); 2027 tcp_v4_restore_cb(skb); 2028 } else if (tcp_child_process(sk, nsk, skb)) { 2029 tcp_v4_send_reset(nsk, skb); 2030 goto discard_and_relse; 2031 } else { 2032 sock_put(sk); 2033 return 0; 2034 } 2035 } 2036 2037 if (static_branch_unlikely(&ip4_min_ttl)) { 2038 /* min_ttl can be changed concurrently from do_ip_setsockopt() */ 2039 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) { 2040 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); 2041 goto discard_and_relse; 2042 } 2043 } 2044 2045 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) { 2046 drop_reason = SKB_DROP_REASON_XFRM_POLICY; 2047 goto discard_and_relse; 2048 } 2049 2050 drop_reason = tcp_inbound_md5_hash(sk, skb, &iph->saddr, 2051 &iph->daddr, AF_INET, dif, sdif); 2052 if (drop_reason) 2053 goto discard_and_relse; 2054 2055 nf_reset_ct(skb); 2056 2057 if (tcp_filter(sk, skb)) { 2058 drop_reason = SKB_DROP_REASON_SOCKET_FILTER; 2059 goto discard_and_relse; 2060 } 2061 th = (const struct tcphdr *)skb->data; 2062 iph = ip_hdr(skb); 2063 tcp_v4_fill_cb(skb, iph, th); 2064 2065 skb->dev = NULL; 2066 2067 if (sk->sk_state == TCP_LISTEN) { 2068 ret = tcp_v4_do_rcv(sk, skb); 2069 goto put_and_return; 2070 } 2071 2072 sk_incoming_cpu_update(sk); 2073 2074 bh_lock_sock_nested(sk); 2075 tcp_segs_in(tcp_sk(sk), skb); 2076 ret = 0; 2077 if (!sock_owned_by_user(sk)) { 2078 ret = tcp_v4_do_rcv(sk, skb); 2079 } else { 2080 if (tcp_add_backlog(sk, skb, &drop_reason)) 2081 goto discard_and_relse; 2082 } 2083 bh_unlock_sock(sk); 2084 2085 put_and_return: 2086 if (refcounted) 2087 sock_put(sk); 2088 2089 return ret; 2090 2091 no_tcp_socket: 2092 drop_reason = SKB_DROP_REASON_NO_SOCKET; 2093 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) 2094 goto discard_it; 2095 2096 tcp_v4_fill_cb(skb, iph, th); 2097 2098 if (tcp_checksum_complete(skb)) { 2099 csum_error: 2100 drop_reason = SKB_DROP_REASON_TCP_CSUM; 2101 trace_tcp_bad_csum(skb); 2102 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS); 2103 bad_packet: 2104 __TCP_INC_STATS(net, TCP_MIB_INERRS); 2105 } else { 2106 tcp_v4_send_reset(NULL, skb); 2107 } 2108 2109 discard_it: 2110 SKB_DR_OR(drop_reason, NOT_SPECIFIED); 2111 /* Discard frame. 
*/ 2112 kfree_skb_reason(skb, drop_reason); 2113 return 0; 2114 2115 discard_and_relse: 2116 sk_drops_add(sk, skb); 2117 if (refcounted) 2118 sock_put(sk); 2119 goto discard_it; 2120 2121 do_time_wait: 2122 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { 2123 drop_reason = SKB_DROP_REASON_XFRM_POLICY; 2124 inet_twsk_put(inet_twsk(sk)); 2125 goto discard_it; 2126 } 2127 2128 tcp_v4_fill_cb(skb, iph, th); 2129 2130 if (tcp_checksum_complete(skb)) { 2131 inet_twsk_put(inet_twsk(sk)); 2132 goto csum_error; 2133 } 2134 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) { 2135 case TCP_TW_SYN: { 2136 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev), 2137 &tcp_hashinfo, skb, 2138 __tcp_hdrlen(th), 2139 iph->saddr, th->source, 2140 iph->daddr, th->dest, 2141 inet_iif(skb), 2142 sdif); 2143 if (sk2) { 2144 inet_twsk_deschedule_put(inet_twsk(sk)); 2145 sk = sk2; 2146 tcp_v4_restore_cb(skb); 2147 refcounted = false; 2148 goto process; 2149 } 2150 } 2151 /* to ACK */ 2152 fallthrough; 2153 case TCP_TW_ACK: 2154 tcp_v4_timewait_ack(sk, skb); 2155 break; 2156 case TCP_TW_RST: 2157 tcp_v4_send_reset(sk, skb); 2158 inet_twsk_deschedule_put(inet_twsk(sk)); 2159 goto discard_it; 2160 case TCP_TW_SUCCESS:; 2161 } 2162 goto discard_it; 2163 } 2164 2165 static struct timewait_sock_ops tcp_timewait_sock_ops = { 2166 .twsk_obj_size = sizeof(struct tcp_timewait_sock), 2167 .twsk_unique = tcp_twsk_unique, 2168 .twsk_destructor= tcp_twsk_destructor, 2169 }; 2170 2171 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) 2172 { 2173 struct dst_entry *dst = skb_dst(skb); 2174 2175 if (dst && dst_hold_safe(dst)) { 2176 rcu_assign_pointer(sk->sk_rx_dst, dst); 2177 sk->sk_rx_dst_ifindex = skb->skb_iif; 2178 } 2179 } 2180 EXPORT_SYMBOL(inet_sk_rx_dst_set); 2181 2182 const struct inet_connection_sock_af_ops ipv4_specific = { 2183 .queue_xmit = ip_queue_xmit, 2184 .send_check = tcp_v4_send_check, 2185 .rebuild_header = inet_sk_rebuild_header, 2186 .sk_rx_dst_set = inet_sk_rx_dst_set, 2187 .conn_request = tcp_v4_conn_request, 2188 .syn_recv_sock = tcp_v4_syn_recv_sock, 2189 .net_header_len = sizeof(struct iphdr), 2190 .setsockopt = ip_setsockopt, 2191 .getsockopt = ip_getsockopt, 2192 .addr2sockaddr = inet_csk_addr2sockaddr, 2193 .sockaddr_len = sizeof(struct sockaddr_in), 2194 .mtu_reduced = tcp_v4_mtu_reduced, 2195 }; 2196 EXPORT_SYMBOL(ipv4_specific); 2197 2198 #ifdef CONFIG_TCP_MD5SIG 2199 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = { 2200 .md5_lookup = tcp_v4_md5_lookup, 2201 .calc_md5_hash = tcp_v4_md5_hash_skb, 2202 .md5_parse = tcp_v4_parse_md5_keys, 2203 }; 2204 #endif 2205 2206 /* NOTE: A lot of things set to zero explicitly by call to 2207 * sk_alloc() so need not be done here. 2208 */ 2209 static int tcp_v4_init_sock(struct sock *sk) 2210 { 2211 struct inet_connection_sock *icsk = inet_csk(sk); 2212 2213 tcp_init_sock(sk); 2214 2215 icsk->icsk_af_ops = &ipv4_specific; 2216 2217 #ifdef CONFIG_TCP_MD5SIG 2218 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific; 2219 #endif 2220 2221 return 0; 2222 } 2223 2224 void tcp_v4_destroy_sock(struct sock *sk) 2225 { 2226 struct tcp_sock *tp = tcp_sk(sk); 2227 2228 trace_tcp_destroy_sock(sk); 2229 2230 tcp_clear_xmit_timers(sk); 2231 2232 tcp_cleanup_congestion_control(sk); 2233 2234 tcp_cleanup_ulp(sk); 2235 2236 /* Cleanup up the write buffer. 
*/ 2237 tcp_write_queue_purge(sk); 2238 2239 /* Check if we want to disable active TFO */ 2240 tcp_fastopen_active_disable_ofo_check(sk); 2241 2242 /* Cleans up our, hopefully empty, out_of_order_queue. */ 2243 skb_rbtree_purge(&tp->out_of_order_queue); 2244 2245 #ifdef CONFIG_TCP_MD5SIG 2246 /* Clean up the MD5 key list, if any */ 2247 if (tp->md5sig_info) { 2248 tcp_clear_md5_list(sk); 2249 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu); 2250 tp->md5sig_info = NULL; 2251 } 2252 #endif 2253 2254 /* Clean up a referenced TCP bind bucket. */ 2255 if (inet_csk(sk)->icsk_bind_hash) 2256 inet_put_port(sk); 2257 2258 BUG_ON(rcu_access_pointer(tp->fastopen_rsk)); 2259 2260 /* If socket is aborted during connect operation */ 2261 tcp_free_fastopen_req(tp); 2262 tcp_fastopen_destroy_cipher(sk); 2263 tcp_saved_syn_free(tp); 2264 2265 sk_sockets_allocated_dec(sk); 2266 } 2267 EXPORT_SYMBOL(tcp_v4_destroy_sock); 2268 2269 #ifdef CONFIG_PROC_FS 2270 /* Proc filesystem TCP sock list dumping. */ 2271 2272 static unsigned short seq_file_family(const struct seq_file *seq); 2273 2274 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk) 2275 { 2276 unsigned short family = seq_file_family(seq); 2277 2278 /* AF_UNSPEC is used as a match all */ 2279 return ((family == AF_UNSPEC || family == sk->sk_family) && 2280 net_eq(sock_net(sk), seq_file_net(seq))); 2281 } 2282 2283 /* Find a non empty bucket (starting from st->bucket) 2284 * and return the first sk from it. 2285 */ 2286 static void *listening_get_first(struct seq_file *seq) 2287 { 2288 struct tcp_iter_state *st = seq->private; 2289 2290 st->offset = 0; 2291 for (; st->bucket <= tcp_hashinfo.lhash2_mask; st->bucket++) { 2292 struct inet_listen_hashbucket *ilb2; 2293 struct hlist_nulls_node *node; 2294 struct sock *sk; 2295 2296 ilb2 = &tcp_hashinfo.lhash2[st->bucket]; 2297 if (hlist_nulls_empty(&ilb2->nulls_head)) 2298 continue; 2299 2300 spin_lock(&ilb2->lock); 2301 sk_nulls_for_each(sk, node, &ilb2->nulls_head) { 2302 if (seq_sk_match(seq, sk)) 2303 return sk; 2304 } 2305 spin_unlock(&ilb2->lock); 2306 } 2307 2308 return NULL; 2309 } 2310 2311 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket). 2312 * If "cur" is the last one in the st->bucket, 2313 * call listening_get_first() to return the first sk of the next 2314 * non empty bucket. 2315 */ 2316 static void *listening_get_next(struct seq_file *seq, void *cur) 2317 { 2318 struct tcp_iter_state *st = seq->private; 2319 struct inet_listen_hashbucket *ilb2; 2320 struct hlist_nulls_node *node; 2321 struct sock *sk = cur; 2322 2323 ++st->num; 2324 ++st->offset; 2325 2326 sk = sk_nulls_next(sk); 2327 sk_nulls_for_each_from(sk, node) { 2328 if (seq_sk_match(seq, sk)) 2329 return sk; 2330 } 2331 2332 ilb2 = &tcp_hashinfo.lhash2[st->bucket]; 2333 spin_unlock(&ilb2->lock); 2334 ++st->bucket; 2335 return listening_get_first(seq); 2336 } 2337 2338 static void *listening_get_idx(struct seq_file *seq, loff_t *pos) 2339 { 2340 struct tcp_iter_state *st = seq->private; 2341 void *rc; 2342 2343 st->bucket = 0; 2344 st->offset = 0; 2345 rc = listening_get_first(seq); 2346 2347 while (rc && *pos) { 2348 rc = listening_get_next(seq, rc); 2349 --*pos; 2350 } 2351 return rc; 2352 } 2353 2354 static inline bool empty_bucket(const struct tcp_iter_state *st) 2355 { 2356 return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain); 2357 } 2358 2359 /* 2360 * Get first established socket starting from bucket given in st->bucket. 
2361 * If st->bucket is zero, the very first socket in the hash is returned. 2362 */ 2363 static void *established_get_first(struct seq_file *seq) 2364 { 2365 struct tcp_iter_state *st = seq->private; 2366 2367 st->offset = 0; 2368 for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) { 2369 struct sock *sk; 2370 struct hlist_nulls_node *node; 2371 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket); 2372 2373 /* Lockless fast path for the common case of empty buckets */ 2374 if (empty_bucket(st)) 2375 continue; 2376 2377 spin_lock_bh(lock); 2378 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) { 2379 if (seq_sk_match(seq, sk)) 2380 return sk; 2381 } 2382 spin_unlock_bh(lock); 2383 } 2384 2385 return NULL; 2386 } 2387 2388 static void *established_get_next(struct seq_file *seq, void *cur) 2389 { 2390 struct sock *sk = cur; 2391 struct hlist_nulls_node *node; 2392 struct tcp_iter_state *st = seq->private; 2393 2394 ++st->num; 2395 ++st->offset; 2396 2397 sk = sk_nulls_next(sk); 2398 2399 sk_nulls_for_each_from(sk, node) { 2400 if (seq_sk_match(seq, sk)) 2401 return sk; 2402 } 2403 2404 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); 2405 ++st->bucket; 2406 return established_get_first(seq); 2407 } 2408 2409 static void *established_get_idx(struct seq_file *seq, loff_t pos) 2410 { 2411 struct tcp_iter_state *st = seq->private; 2412 void *rc; 2413 2414 st->bucket = 0; 2415 rc = established_get_first(seq); 2416 2417 while (rc && pos) { 2418 rc = established_get_next(seq, rc); 2419 --pos; 2420 } 2421 return rc; 2422 } 2423 2424 static void *tcp_get_idx(struct seq_file *seq, loff_t pos) 2425 { 2426 void *rc; 2427 struct tcp_iter_state *st = seq->private; 2428 2429 st->state = TCP_SEQ_STATE_LISTENING; 2430 rc = listening_get_idx(seq, &pos); 2431 2432 if (!rc) { 2433 st->state = TCP_SEQ_STATE_ESTABLISHED; 2434 rc = established_get_idx(seq, pos); 2435 } 2436 2437 return rc; 2438 } 2439 2440 static void *tcp_seek_last_pos(struct seq_file *seq) 2441 { 2442 struct tcp_iter_state *st = seq->private; 2443 int bucket = st->bucket; 2444 int offset = st->offset; 2445 int orig_num = st->num; 2446 void *rc = NULL; 2447 2448 switch (st->state) { 2449 case TCP_SEQ_STATE_LISTENING: 2450 if (st->bucket > tcp_hashinfo.lhash2_mask) 2451 break; 2452 st->state = TCP_SEQ_STATE_LISTENING; 2453 rc = listening_get_first(seq); 2454 while (offset-- && rc && bucket == st->bucket) 2455 rc = listening_get_next(seq, rc); 2456 if (rc) 2457 break; 2458 st->bucket = 0; 2459 st->state = TCP_SEQ_STATE_ESTABLISHED; 2460 fallthrough; 2461 case TCP_SEQ_STATE_ESTABLISHED: 2462 if (st->bucket > tcp_hashinfo.ehash_mask) 2463 break; 2464 rc = established_get_first(seq); 2465 while (offset-- && rc && bucket == st->bucket) 2466 rc = established_get_next(seq, rc); 2467 } 2468 2469 st->num = orig_num; 2470 2471 return rc; 2472 } 2473 2474 void *tcp_seq_start(struct seq_file *seq, loff_t *pos) 2475 { 2476 struct tcp_iter_state *st = seq->private; 2477 void *rc; 2478 2479 if (*pos && *pos == st->last_pos) { 2480 rc = tcp_seek_last_pos(seq); 2481 if (rc) 2482 goto out; 2483 } 2484 2485 st->state = TCP_SEQ_STATE_LISTENING; 2486 st->num = 0; 2487 st->bucket = 0; 2488 st->offset = 0; 2489 rc = *pos ? 
tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; 2490 2491 out: 2492 st->last_pos = *pos; 2493 return rc; 2494 } 2495 EXPORT_SYMBOL(tcp_seq_start); 2496 2497 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) 2498 { 2499 struct tcp_iter_state *st = seq->private; 2500 void *rc = NULL; 2501 2502 if (v == SEQ_START_TOKEN) { 2503 rc = tcp_get_idx(seq, 0); 2504 goto out; 2505 } 2506 2507 switch (st->state) { 2508 case TCP_SEQ_STATE_LISTENING: 2509 rc = listening_get_next(seq, v); 2510 if (!rc) { 2511 st->state = TCP_SEQ_STATE_ESTABLISHED; 2512 st->bucket = 0; 2513 st->offset = 0; 2514 rc = established_get_first(seq); 2515 } 2516 break; 2517 case TCP_SEQ_STATE_ESTABLISHED: 2518 rc = established_get_next(seq, v); 2519 break; 2520 } 2521 out: 2522 ++*pos; 2523 st->last_pos = *pos; 2524 return rc; 2525 } 2526 EXPORT_SYMBOL(tcp_seq_next); 2527 2528 void tcp_seq_stop(struct seq_file *seq, void *v) 2529 { 2530 struct tcp_iter_state *st = seq->private; 2531 2532 switch (st->state) { 2533 case TCP_SEQ_STATE_LISTENING: 2534 if (v != SEQ_START_TOKEN) 2535 spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock); 2536 break; 2537 case TCP_SEQ_STATE_ESTABLISHED: 2538 if (v) 2539 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); 2540 break; 2541 } 2542 } 2543 EXPORT_SYMBOL(tcp_seq_stop); 2544 2545 static void get_openreq4(const struct request_sock *req, 2546 struct seq_file *f, int i) 2547 { 2548 const struct inet_request_sock *ireq = inet_rsk(req); 2549 long delta = req->rsk_timer.expires - jiffies; 2550 2551 seq_printf(f, "%4d: %08X:%04X %08X:%04X" 2552 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK", 2553 i, 2554 ireq->ir_loc_addr, 2555 ireq->ir_num, 2556 ireq->ir_rmt_addr, 2557 ntohs(ireq->ir_rmt_port), 2558 TCP_SYN_RECV, 2559 0, 0, /* could print option size, but that is af dependent. */ 2560 1, /* timers active (only the expire timer) */ 2561 jiffies_delta_to_clock_t(delta), 2562 req->num_timeout, 2563 from_kuid_munged(seq_user_ns(f), 2564 sock_i_uid(req->rsk_listener)), 2565 0, /* non standard timer */ 2566 0, /* open_requests have no inode */ 2567 0, 2568 req); 2569 } 2570 2571 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) 2572 { 2573 int timer_active; 2574 unsigned long timer_expires; 2575 const struct tcp_sock *tp = tcp_sk(sk); 2576 const struct inet_connection_sock *icsk = inet_csk(sk); 2577 const struct inet_sock *inet = inet_sk(sk); 2578 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq; 2579 __be32 dest = inet->inet_daddr; 2580 __be32 src = inet->inet_rcv_saddr; 2581 __u16 destp = ntohs(inet->inet_dport); 2582 __u16 srcp = ntohs(inet->inet_sport); 2583 int rx_queue; 2584 int state; 2585 2586 if (icsk->icsk_pending == ICSK_TIME_RETRANS || 2587 icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT || 2588 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { 2589 timer_active = 1; 2590 timer_expires = icsk->icsk_timeout; 2591 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) { 2592 timer_active = 4; 2593 timer_expires = icsk->icsk_timeout; 2594 } else if (timer_pending(&sk->sk_timer)) { 2595 timer_active = 2; 2596 timer_expires = sk->sk_timer.expires; 2597 } else { 2598 timer_active = 0; 2599 timer_expires = jiffies; 2600 } 2601 2602 state = inet_sk_state_load(sk); 2603 if (state == TCP_LISTEN) 2604 rx_queue = READ_ONCE(sk->sk_ack_backlog); 2605 else 2606 /* Because we don't lock the socket, 2607 * we might find a transient negative value. 
2608 */ 2609 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) - 2610 READ_ONCE(tp->copied_seq), 0); 2611 2612 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX " 2613 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d", 2614 i, src, srcp, dest, destp, state, 2615 READ_ONCE(tp->write_seq) - tp->snd_una, 2616 rx_queue, 2617 timer_active, 2618 jiffies_delta_to_clock_t(timer_expires - jiffies), 2619 icsk->icsk_retransmits, 2620 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)), 2621 icsk->icsk_probes_out, 2622 sock_i_ino(sk), 2623 refcount_read(&sk->sk_refcnt), sk, 2624 jiffies_to_clock_t(icsk->icsk_rto), 2625 jiffies_to_clock_t(icsk->icsk_ack.ato), 2626 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk), 2627 tcp_snd_cwnd(tp), 2628 state == TCP_LISTEN ? 2629 fastopenq->max_qlen : 2630 (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh)); 2631 } 2632 2633 static void get_timewait4_sock(const struct inet_timewait_sock *tw, 2634 struct seq_file *f, int i) 2635 { 2636 long delta = tw->tw_timer.expires - jiffies; 2637 __be32 dest, src; 2638 __u16 destp, srcp; 2639 2640 dest = tw->tw_daddr; 2641 src = tw->tw_rcv_saddr; 2642 destp = ntohs(tw->tw_dport); 2643 srcp = ntohs(tw->tw_sport); 2644 2645 seq_printf(f, "%4d: %08X:%04X %08X:%04X" 2646 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK", 2647 i, src, srcp, dest, destp, tw->tw_substate, 0, 0, 2648 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0, 2649 refcount_read(&tw->tw_refcnt), tw); 2650 } 2651 2652 #define TMPSZ 150 2653 2654 static int tcp4_seq_show(struct seq_file *seq, void *v) 2655 { 2656 struct tcp_iter_state *st; 2657 struct sock *sk = v; 2658 2659 seq_setwidth(seq, TMPSZ - 1); 2660 if (v == SEQ_START_TOKEN) { 2661 seq_puts(seq, " sl local_address rem_address st tx_queue " 2662 "rx_queue tr tm->when retrnsmt uid timeout " 2663 "inode"); 2664 goto out; 2665 } 2666 st = seq->private; 2667 2668 if (sk->sk_state == TCP_TIME_WAIT) 2669 get_timewait4_sock(v, seq, st->num); 2670 else if (sk->sk_state == TCP_NEW_SYN_RECV) 2671 get_openreq4(v, seq, st->num); 2672 else 2673 get_tcp4_sock(v, seq, st->num); 2674 out: 2675 seq_pad(seq, '\n'); 2676 return 0; 2677 } 2678 2679 #ifdef CONFIG_BPF_SYSCALL 2680 struct bpf_tcp_iter_state { 2681 struct tcp_iter_state state; 2682 unsigned int cur_sk; 2683 unsigned int end_sk; 2684 unsigned int max_sk; 2685 struct sock **batch; 2686 bool st_bucket_done; 2687 }; 2688 2689 struct bpf_iter__tcp { 2690 __bpf_md_ptr(struct bpf_iter_meta *, meta); 2691 __bpf_md_ptr(struct sock_common *, sk_common); 2692 uid_t uid __aligned(8); 2693 }; 2694 2695 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, 2696 struct sock_common *sk_common, uid_t uid) 2697 { 2698 struct bpf_iter__tcp ctx; 2699 2700 meta->seq_num--; /* skip SEQ_START_TOKEN */ 2701 ctx.meta = meta; 2702 ctx.sk_common = sk_common; 2703 ctx.uid = uid; 2704 return bpf_iter_run_prog(prog, &ctx); 2705 } 2706 2707 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter) 2708 { 2709 while (iter->cur_sk < iter->end_sk) 2710 sock_put(iter->batch[iter->cur_sk++]); 2711 } 2712 2713 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter, 2714 unsigned int new_batch_sz) 2715 { 2716 struct sock **new_batch; 2717 2718 new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz, 2719 GFP_USER | __GFP_NOWARN); 2720 if (!new_batch) 2721 return -ENOMEM; 2722 2723 bpf_iter_tcp_put_batch(iter); 2724 kvfree(iter->batch); 2725 iter->batch = new_batch; 2726 iter->max_sk = new_batch_sz; 2727 2728 return 0; 2729 } 2730 2731 
static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq, 2732 struct sock *start_sk) 2733 { 2734 struct bpf_tcp_iter_state *iter = seq->private; 2735 struct tcp_iter_state *st = &iter->state; 2736 struct hlist_nulls_node *node; 2737 unsigned int expected = 1; 2738 struct sock *sk; 2739 2740 sock_hold(start_sk); 2741 iter->batch[iter->end_sk++] = start_sk; 2742 2743 sk = sk_nulls_next(start_sk); 2744 sk_nulls_for_each_from(sk, node) { 2745 if (seq_sk_match(seq, sk)) { 2746 if (iter->end_sk < iter->max_sk) { 2747 sock_hold(sk); 2748 iter->batch[iter->end_sk++] = sk; 2749 } 2750 expected++; 2751 } 2752 } 2753 spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock); 2754 2755 return expected; 2756 } 2757 2758 static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq, 2759 struct sock *start_sk) 2760 { 2761 struct bpf_tcp_iter_state *iter = seq->private; 2762 struct tcp_iter_state *st = &iter->state; 2763 struct hlist_nulls_node *node; 2764 unsigned int expected = 1; 2765 struct sock *sk; 2766 2767 sock_hold(start_sk); 2768 iter->batch[iter->end_sk++] = start_sk; 2769 2770 sk = sk_nulls_next(start_sk); 2771 sk_nulls_for_each_from(sk, node) { 2772 if (seq_sk_match(seq, sk)) { 2773 if (iter->end_sk < iter->max_sk) { 2774 sock_hold(sk); 2775 iter->batch[iter->end_sk++] = sk; 2776 } 2777 expected++; 2778 } 2779 } 2780 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); 2781 2782 return expected; 2783 } 2784 2785 static struct sock *bpf_iter_tcp_batch(struct seq_file *seq) 2786 { 2787 struct bpf_tcp_iter_state *iter = seq->private; 2788 struct tcp_iter_state *st = &iter->state; 2789 unsigned int expected; 2790 bool resized = false; 2791 struct sock *sk; 2792 2793 /* The st->bucket is done. Directly advance to the next 2794 * bucket instead of having the tcp_seek_last_pos() to skip 2795 * one by one in the current bucket and eventually find out 2796 * it has to advance to the next bucket. 2797 */ 2798 if (iter->st_bucket_done) { 2799 st->offset = 0; 2800 st->bucket++; 2801 if (st->state == TCP_SEQ_STATE_LISTENING && 2802 st->bucket > tcp_hashinfo.lhash2_mask) { 2803 st->state = TCP_SEQ_STATE_ESTABLISHED; 2804 st->bucket = 0; 2805 } 2806 } 2807 2808 again: 2809 /* Get a new batch */ 2810 iter->cur_sk = 0; 2811 iter->end_sk = 0; 2812 iter->st_bucket_done = false; 2813 2814 sk = tcp_seek_last_pos(seq); 2815 if (!sk) 2816 return NULL; /* Done */ 2817 2818 if (st->state == TCP_SEQ_STATE_LISTENING) 2819 expected = bpf_iter_tcp_listening_batch(seq, sk); 2820 else 2821 expected = bpf_iter_tcp_established_batch(seq, sk); 2822 2823 if (iter->end_sk == expected) { 2824 iter->st_bucket_done = true; 2825 return sk; 2826 } 2827 2828 if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) { 2829 resized = true; 2830 goto again; 2831 } 2832 2833 return sk; 2834 } 2835 2836 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos) 2837 { 2838 /* bpf iter does not support lseek, so it always 2839 * continue from where it was stop()-ped. 2840 */ 2841 if (*pos) 2842 return bpf_iter_tcp_batch(seq); 2843 2844 return SEQ_START_TOKEN; 2845 } 2846 2847 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) 2848 { 2849 struct bpf_tcp_iter_state *iter = seq->private; 2850 struct tcp_iter_state *st = &iter->state; 2851 struct sock *sk; 2852 2853 /* Whenever seq_next() is called, the iter->cur_sk is 2854 * done with seq_show(), so advance to the next sk in 2855 * the batch. 
2856 */ 2857 if (iter->cur_sk < iter->end_sk) { 2858 /* Keeping st->num consistent in tcp_iter_state. 2859 * bpf_iter_tcp does not use st->num. 2860 * meta.seq_num is used instead. 2861 */ 2862 st->num++; 2863 /* Move st->offset to the next sk in the bucket such that 2864 * the future start() will resume at st->offset in 2865 * st->bucket. See tcp_seek_last_pos(). 2866 */ 2867 st->offset++; 2868 sock_put(iter->batch[iter->cur_sk++]); 2869 } 2870 2871 if (iter->cur_sk < iter->end_sk) 2872 sk = iter->batch[iter->cur_sk]; 2873 else 2874 sk = bpf_iter_tcp_batch(seq); 2875 2876 ++*pos; 2877 /* Keeping st->last_pos consistent in tcp_iter_state. 2878 * bpf iter does not do lseek, so st->last_pos always equals to *pos. 2879 */ 2880 st->last_pos = *pos; 2881 return sk; 2882 } 2883 2884 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v) 2885 { 2886 struct bpf_iter_meta meta; 2887 struct bpf_prog *prog; 2888 struct sock *sk = v; 2889 bool slow; 2890 uid_t uid; 2891 int ret; 2892 2893 if (v == SEQ_START_TOKEN) 2894 return 0; 2895 2896 if (sk_fullsock(sk)) 2897 slow = lock_sock_fast(sk); 2898 2899 if (unlikely(sk_unhashed(sk))) { 2900 ret = SEQ_SKIP; 2901 goto unlock; 2902 } 2903 2904 if (sk->sk_state == TCP_TIME_WAIT) { 2905 uid = 0; 2906 } else if (sk->sk_state == TCP_NEW_SYN_RECV) { 2907 const struct request_sock *req = v; 2908 2909 uid = from_kuid_munged(seq_user_ns(seq), 2910 sock_i_uid(req->rsk_listener)); 2911 } else { 2912 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)); 2913 } 2914 2915 meta.seq = seq; 2916 prog = bpf_iter_get_info(&meta, false); 2917 ret = tcp_prog_seq_show(prog, &meta, v, uid); 2918 2919 unlock: 2920 if (sk_fullsock(sk)) 2921 unlock_sock_fast(sk, slow); 2922 return ret; 2923 2924 } 2925 2926 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v) 2927 { 2928 struct bpf_tcp_iter_state *iter = seq->private; 2929 struct bpf_iter_meta meta; 2930 struct bpf_prog *prog; 2931 2932 if (!v) { 2933 meta.seq = seq; 2934 prog = bpf_iter_get_info(&meta, true); 2935 if (prog) 2936 (void)tcp_prog_seq_show(prog, &meta, v, 0); 2937 } 2938 2939 if (iter->cur_sk < iter->end_sk) { 2940 bpf_iter_tcp_put_batch(iter); 2941 iter->st_bucket_done = false; 2942 } 2943 } 2944 2945 static const struct seq_operations bpf_iter_tcp_seq_ops = { 2946 .show = bpf_iter_tcp_seq_show, 2947 .start = bpf_iter_tcp_seq_start, 2948 .next = bpf_iter_tcp_seq_next, 2949 .stop = bpf_iter_tcp_seq_stop, 2950 }; 2951 #endif 2952 static unsigned short seq_file_family(const struct seq_file *seq) 2953 { 2954 const struct tcp_seq_afinfo *afinfo; 2955 2956 #ifdef CONFIG_BPF_SYSCALL 2957 /* Iterated from bpf_iter. Let the bpf prog to filter instead. 
*/ 2958 if (seq->op == &bpf_iter_tcp_seq_ops) 2959 return AF_UNSPEC; 2960 #endif 2961 2962 /* Iterated from proc fs */ 2963 afinfo = pde_data(file_inode(seq->file)); 2964 return afinfo->family; 2965 } 2966 2967 static const struct seq_operations tcp4_seq_ops = { 2968 .show = tcp4_seq_show, 2969 .start = tcp_seq_start, 2970 .next = tcp_seq_next, 2971 .stop = tcp_seq_stop, 2972 }; 2973 2974 static struct tcp_seq_afinfo tcp4_seq_afinfo = { 2975 .family = AF_INET, 2976 }; 2977 2978 static int __net_init tcp4_proc_init_net(struct net *net) 2979 { 2980 if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops, 2981 sizeof(struct tcp_iter_state), &tcp4_seq_afinfo)) 2982 return -ENOMEM; 2983 return 0; 2984 } 2985 2986 static void __net_exit tcp4_proc_exit_net(struct net *net) 2987 { 2988 remove_proc_entry("tcp", net->proc_net); 2989 } 2990 2991 static struct pernet_operations tcp4_net_ops = { 2992 .init = tcp4_proc_init_net, 2993 .exit = tcp4_proc_exit_net, 2994 }; 2995 2996 int __init tcp4_proc_init(void) 2997 { 2998 return register_pernet_subsys(&tcp4_net_ops); 2999 } 3000 3001 void tcp4_proc_exit(void) 3002 { 3003 unregister_pernet_subsys(&tcp4_net_ops); 3004 } 3005 #endif /* CONFIG_PROC_FS */ 3006 3007 /* @wake is one when sk_stream_write_space() calls us. 3008 * This sends EPOLLOUT only if notsent_bytes is half the limit. 3009 * This mimics the strategy used in sock_def_write_space(). 3010 */ 3011 bool tcp_stream_memory_free(const struct sock *sk, int wake) 3012 { 3013 const struct tcp_sock *tp = tcp_sk(sk); 3014 u32 notsent_bytes = READ_ONCE(tp->write_seq) - 3015 READ_ONCE(tp->snd_nxt); 3016 3017 return (notsent_bytes << wake) < tcp_notsent_lowat(tp); 3018 } 3019 EXPORT_SYMBOL(tcp_stream_memory_free); 3020 3021 struct proto tcp_prot = { 3022 .name = "TCP", 3023 .owner = THIS_MODULE, 3024 .close = tcp_close, 3025 .pre_connect = tcp_v4_pre_connect, 3026 .connect = tcp_v4_connect, 3027 .disconnect = tcp_disconnect, 3028 .accept = inet_csk_accept, 3029 .ioctl = tcp_ioctl, 3030 .init = tcp_v4_init_sock, 3031 .destroy = tcp_v4_destroy_sock, 3032 .shutdown = tcp_shutdown, 3033 .setsockopt = tcp_setsockopt, 3034 .getsockopt = tcp_getsockopt, 3035 .bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt, 3036 .keepalive = tcp_set_keepalive, 3037 .recvmsg = tcp_recvmsg, 3038 .sendmsg = tcp_sendmsg, 3039 .sendpage = tcp_sendpage, 3040 .backlog_rcv = tcp_v4_do_rcv, 3041 .release_cb = tcp_release_cb, 3042 .hash = inet_hash, 3043 .unhash = inet_unhash, 3044 .get_port = inet_csk_get_port, 3045 .put_port = inet_put_port, 3046 #ifdef CONFIG_BPF_SYSCALL 3047 .psock_update_sk_prot = tcp_bpf_update_proto, 3048 #endif 3049 .enter_memory_pressure = tcp_enter_memory_pressure, 3050 .leave_memory_pressure = tcp_leave_memory_pressure, 3051 .stream_memory_free = tcp_stream_memory_free, 3052 .sockets_allocated = &tcp_sockets_allocated, 3053 .orphan_count = &tcp_orphan_count, 3054 3055 .memory_allocated = &tcp_memory_allocated, 3056 .per_cpu_fw_alloc = &tcp_memory_per_cpu_fw_alloc, 3057 3058 .memory_pressure = &tcp_memory_pressure, 3059 .sysctl_mem = sysctl_tcp_mem, 3060 .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem), 3061 .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), 3062 .max_header = MAX_TCP_HEADER, 3063 .obj_size = sizeof(struct tcp_sock), 3064 .slab_flags = SLAB_TYPESAFE_BY_RCU, 3065 .twsk_prot = &tcp_timewait_sock_ops, 3066 .rsk_prot = &tcp_request_sock_ops, 3067 .h.hashinfo = &tcp_hashinfo, 3068 .no_autobind = true, 3069 .diag_destroy = tcp_abort, 3070 }; 3071 
EXPORT_SYMBOL(tcp_prot); 3072 3073 static void __net_exit tcp_sk_exit(struct net *net) 3074 { 3075 struct inet_timewait_death_row *tcp_death_row = net->ipv4.tcp_death_row; 3076 3077 if (net->ipv4.tcp_congestion_control) 3078 bpf_module_put(net->ipv4.tcp_congestion_control, 3079 net->ipv4.tcp_congestion_control->owner); 3080 if (refcount_dec_and_test(&tcp_death_row->tw_refcount)) 3081 kfree(tcp_death_row); 3082 } 3083 3084 static int __net_init tcp_sk_init(struct net *net) 3085 { 3086 int cnt; 3087 3088 net->ipv4.sysctl_tcp_ecn = 2; 3089 net->ipv4.sysctl_tcp_ecn_fallback = 1; 3090 3091 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS; 3092 net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS; 3093 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD; 3094 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL; 3095 net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS; 3096 3097 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME; 3098 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES; 3099 net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL; 3100 3101 net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES; 3102 net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES; 3103 net->ipv4.sysctl_tcp_syncookies = 1; 3104 net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH; 3105 net->ipv4.sysctl_tcp_retries1 = TCP_RETR1; 3106 net->ipv4.sysctl_tcp_retries2 = TCP_RETR2; 3107 net->ipv4.sysctl_tcp_orphan_retries = 0; 3108 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT; 3109 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX; 3110 net->ipv4.sysctl_tcp_tw_reuse = 2; 3111 net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1; 3112 3113 net->ipv4.tcp_death_row = kzalloc(sizeof(struct inet_timewait_death_row), GFP_KERNEL); 3114 if (!net->ipv4.tcp_death_row) 3115 return -ENOMEM; 3116 refcount_set(&net->ipv4.tcp_death_row->tw_refcount, 1); 3117 cnt = tcp_hashinfo.ehash_mask + 1; 3118 net->ipv4.tcp_death_row->sysctl_max_tw_buckets = cnt / 2; 3119 net->ipv4.tcp_death_row->hashinfo = &tcp_hashinfo; 3120 3121 net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128); 3122 net->ipv4.sysctl_tcp_sack = 1; 3123 net->ipv4.sysctl_tcp_window_scaling = 1; 3124 net->ipv4.sysctl_tcp_timestamps = 1; 3125 net->ipv4.sysctl_tcp_early_retrans = 3; 3126 net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION; 3127 net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */ 3128 net->ipv4.sysctl_tcp_retrans_collapse = 1; 3129 net->ipv4.sysctl_tcp_max_reordering = 300; 3130 net->ipv4.sysctl_tcp_dsack = 1; 3131 net->ipv4.sysctl_tcp_app_win = 31; 3132 net->ipv4.sysctl_tcp_adv_win_scale = 1; 3133 net->ipv4.sysctl_tcp_frto = 2; 3134 net->ipv4.sysctl_tcp_moderate_rcvbuf = 1; 3135 /* This limits the percentage of the congestion window which we 3136 * will allow a single TSO frame to consume. Building TSO frames 3137 * which are too large can cause TCP streams to be bursty. 3138 */ 3139 net->ipv4.sysctl_tcp_tso_win_divisor = 3; 3140 /* Default TSQ limit of 16 TSO segments */ 3141 net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536; 3142 3143 /* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. 
*/ 3144 net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX; 3145 3146 net->ipv4.sysctl_tcp_min_tso_segs = 2; 3147 net->ipv4.sysctl_tcp_tso_rtt_log = 9; /* 2^9 = 512 usec */ 3148 net->ipv4.sysctl_tcp_min_rtt_wlen = 300; 3149 net->ipv4.sysctl_tcp_autocorking = 1; 3150 net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2; 3151 net->ipv4.sysctl_tcp_pacing_ss_ratio = 200; 3152 net->ipv4.sysctl_tcp_pacing_ca_ratio = 120; 3153 if (net != &init_net) { 3154 memcpy(net->ipv4.sysctl_tcp_rmem, 3155 init_net.ipv4.sysctl_tcp_rmem, 3156 sizeof(init_net.ipv4.sysctl_tcp_rmem)); 3157 memcpy(net->ipv4.sysctl_tcp_wmem, 3158 init_net.ipv4.sysctl_tcp_wmem, 3159 sizeof(init_net.ipv4.sysctl_tcp_wmem)); 3160 } 3161 net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC; 3162 net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC; 3163 net->ipv4.sysctl_tcp_comp_sack_nr = 44; 3164 net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; 3165 net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0; 3166 atomic_set(&net->ipv4.tfo_active_disable_times, 0); 3167 3168 /* Reno is always built in */ 3169 if (!net_eq(net, &init_net) && 3170 bpf_try_module_get(init_net.ipv4.tcp_congestion_control, 3171 init_net.ipv4.tcp_congestion_control->owner)) 3172 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control; 3173 else 3174 net->ipv4.tcp_congestion_control = &tcp_reno; 3175 3176 return 0; 3177 } 3178 3179 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list) 3180 { 3181 struct net *net; 3182 3183 inet_twsk_purge(&tcp_hashinfo, AF_INET); 3184 3185 list_for_each_entry(net, net_exit_list, exit_list) 3186 tcp_fastopen_ctx_destroy(net); 3187 } 3188 3189 static struct pernet_operations __net_initdata tcp_sk_ops = { 3190 .init = tcp_sk_init, 3191 .exit = tcp_sk_exit, 3192 .exit_batch = tcp_sk_exit_batch, 3193 }; 3194 3195 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 3196 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta, 3197 struct sock_common *sk_common, uid_t uid) 3198 3199 #define INIT_BATCH_SZ 16 3200 3201 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux) 3202 { 3203 struct bpf_tcp_iter_state *iter = priv_data; 3204 int err; 3205 3206 err = bpf_iter_init_seq_net(priv_data, aux); 3207 if (err) 3208 return err; 3209 3210 err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ); 3211 if (err) { 3212 bpf_iter_fini_seq_net(priv_data); 3213 return err; 3214 } 3215 3216 return 0; 3217 } 3218 3219 static void bpf_iter_fini_tcp(void *priv_data) 3220 { 3221 struct bpf_tcp_iter_state *iter = priv_data; 3222 3223 bpf_iter_fini_seq_net(priv_data); 3224 kvfree(iter->batch); 3225 } 3226 3227 static const struct bpf_iter_seq_info tcp_seq_info = { 3228 .seq_ops = &bpf_iter_tcp_seq_ops, 3229 .init_seq_private = bpf_iter_init_tcp, 3230 .fini_seq_private = bpf_iter_fini_tcp, 3231 .seq_priv_size = sizeof(struct bpf_tcp_iter_state), 3232 }; 3233 3234 static const struct bpf_func_proto * 3235 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id, 3236 const struct bpf_prog *prog) 3237 { 3238 switch (func_id) { 3239 case BPF_FUNC_setsockopt: 3240 return &bpf_sk_setsockopt_proto; 3241 case BPF_FUNC_getsockopt: 3242 return &bpf_sk_getsockopt_proto; 3243 default: 3244 return NULL; 3245 } 3246 } 3247 3248 static struct bpf_iter_reg tcp_reg_info = { 3249 .target = "tcp", 3250 .ctx_arg_info_size = 1, 3251 .ctx_arg_info = { 3252 { offsetof(struct bpf_iter__tcp, sk_common), 3253 PTR_TO_BTF_ID_OR_NULL }, 3254 }, 3255 .get_func_proto = bpf_iter_tcp_get_func_proto, 3256 .seq_info = &tcp_seq_info, 
3257 }; 3258 3259 static void __init bpf_iter_register(void) 3260 { 3261 tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON]; 3262 if (bpf_iter_reg_target(&tcp_reg_info)) 3263 pr_warn("Warning: could not register bpf iterator tcp\n"); 3264 } 3265 3266 #endif 3267 3268 void __init tcp_v4_init(void) 3269 { 3270 int cpu, res; 3271 3272 for_each_possible_cpu(cpu) { 3273 struct sock *sk; 3274 3275 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW, 3276 IPPROTO_TCP, &init_net); 3277 if (res) 3278 panic("Failed to create the TCP control socket.\n"); 3279 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); 3280 3281 /* Please enforce IP_DF and IPID==0 for RST and 3282 * ACK sent in SYN-RECV and TIME-WAIT state. 3283 */ 3284 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO; 3285 3286 per_cpu(ipv4_tcp_sk, cpu) = sk; 3287 } 3288 if (register_pernet_subsys(&tcp_sk_ops)) 3289 panic("Failed to create the TCP control socket.\n"); 3290 3291 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 3292 bpf_iter_register(); 3293 #endif 3294 } 3295