// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 *		IPv4 specific functions
 *
 *		code split from:
 *		linux/ipv4/tcp.c
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *		See tcp.c for author information
 */

/*
 * Changes:
 *		David S. Miller	:	New socket lookup architecture.
 *					This code is dedicated to John Dyson.
 *		David S. Miller :	Change semantics of established hash,
 *					half is devoted to TIME_WAIT sockets
 *					and the rest go in the other half.
 *		Andi Kleen :		Add support for syncookies and fixed
 *					some bugs: ip options weren't passed to
 *					the TCP layer, missed a check for an
 *					ACK bit.
 *		Andi Kleen :		Implemented fast path mtu discovery.
 *					Fixed many serious bugs in the
 *					request_sock handling and moved
 *					most of it into the af independent code.
 *					Added tail drop and some other bugfixes.
 *					Added new listen semantics.
 *		Mike McLagan	:	Routing by source
 *	Juan Jose Ciarlante:		ip_dynaddr bits
 *		Andi Kleen:		various fixes.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year
 *					coma.
 *	Andi Kleen		:	Fix new listen.
 *	Andi Kleen		:	Fix accept error reporting.
 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
 *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
 *					a single port at the same time.
 */

#define pr_fmt(fmt) "TCP: " fmt

#include <linux/bottom_half.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
#include <linux/slab.h>

#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/secure_seq.h>
#include <net/busy_poll.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/inetdevice.h>
#include <linux/btf_ids.h>

#include <crypto/hash.h>
#include <linux/scatterlist.h>

#include <trace/events/tcp.h>

#ifdef CONFIG_TCP_MD5SIG
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
#endif

struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);

static DEFINE_PER_CPU(struct sock *, ipv4_tcp_sk);

static u32 tcp_v4_init_seq(const struct sk_buff *skb)
{
	return secure_tcp_seq(ip_hdr(skb)->daddr,
			      ip_hdr(skb)->saddr,
			      tcp_hdr(skb)->dest,
			      tcp_hdr(skb)->source);
}

static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
{
	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
}

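/* Editor's note, summarizing the function below: tcp_twsk_unique() decides
 * whether a connect() may take over a TIME-WAIT bucket that still owns the
 * same four-tuple.  It returns 1 if reuse is allowed (and takes a reference
 * on the timewait sock for the caller), 0 otherwise.
 */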
int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
	int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
	const struct inet_timewait_sock *tw = inet_twsk(sktw);
	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
	struct tcp_sock *tp = tcp_sk(sk);

	if (reuse == 2) {
		/* Still does not detect *everything* that goes through
		 * lo, since we require a loopback src or dst address
		 * or direct binding to 'lo' interface.
		 */
		bool loopback = false;
		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
			loopback = true;
#if IS_ENABLED(CONFIG_IPV6)
		if (tw->tw_family == AF_INET6) {
			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
				loopback = true;
		} else
#endif
		{
			if (ipv4_is_loopback(tw->tw_daddr) ||
			    ipv4_is_loopback(tw->tw_rcv_saddr))
				loopback = true;
		}
		if (!loopback)
			reuse = 0;
	}

	/* With PAWS, it is safe from the viewpoint
	   of data integrity. Even without PAWS it is safe provided sequence
	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.

	   Actually, the idea is close to VJ's one, only timestamp cache is
	   held not per host, but per port pair and TW bucket is used as state
	   holder.

	   If TW bucket has been already destroyed we fall back to VJ's scheme
	   and use initial timestamp retrieved from peer table.
	 */
	if (tcptw->tw_ts_recent_stamp &&
	    (!twp || (reuse && time_after32(ktime_get_seconds(),
					    tcptw->tw_ts_recent_stamp)))) {
		/* In case of repair and re-using TIME-WAIT sockets we still
		 * want to be sure that it is safe as above but honor the
		 * sequence numbers and time stamps set as part of the repair
		 * process.
		 *
		 * Without this check re-using a TIME-WAIT socket with TCP
		 * repair would accumulate a -1 on the repair assigned
		 * sequence number. The first time it is reused the sequence
		 * is -1, the second time -2, etc. This fixes that issue
		 * without appearing to create any others.
		 */
		if (likely(!tp->repair)) {
			u32 seq = tcptw->tw_snd_nxt + 65535 + 2;

			if (!seq)
				seq = 1;
			WRITE_ONCE(tp->write_seq, seq);
			tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
			tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
		}
		sock_hold(sktw);
		return 1;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(tcp_twsk_unique);

static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
			      int addr_len)
{
	/* This check is replicated from tcp_v4_connect() and intended to
	 * prevent BPF program called below from accessing bytes that are out
	 * of the bound specified by user in addr_len.
	 */
	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	sock_owned_by_me(sk);

	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
}

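/* Outline of tcp_v4_connect(), summarized from the code below: validate the
 * sockaddr, resolve a route with ip_route_connect(), pick a source address
 * (updating the bind hash buckets if none was bound yet), record the
 * destination, move to SYN-SENT, let inet_hash_connect() choose a source
 * port, commit the route, initialize write_seq and the timestamp offset,
 * and finally transmit the SYN via tcp_connect() (unless TCP Fast Open
 * defers the actual connect until data is sent).
 */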
/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct inet_bind_hashbucket *prev_addr_hashbucket = NULL;
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct inet_timewait_death_row *tcp_death_row;
	__be32 daddr, nexthop, prev_sk_rcv_saddr;
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct ip_options_rcu *inet_opt;
	struct net *net = sock_net(sk);
	__be16 orig_sport, orig_dport;
	struct flowi4 *fl4;
	struct rtable *rt;
	int err;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	inet_opt = rcu_dereference_protected(inet->inet_opt,
					     lockdep_sock_is_held(sk));
	if (inet_opt && inet_opt->opt.srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet_opt->opt.faddr;
	}

	orig_sport = inet->inet_sport;
	orig_dport = usin->sin_port;
	fl4 = &inet->cork.fl.u.ip4;
	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
			      sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
			      orig_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		if (err == -ENETUNREACH)
			IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
		return err;
	}

	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet_opt || !inet_opt->opt.srr)
		daddr = fl4->daddr;

	tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;

	if (!inet->inet_saddr) {
		if (inet_csk(sk)->icsk_bind2_hash) {
			prev_addr_hashbucket = inet_bhashfn_portaddr(tcp_death_row->hashinfo,
								     sk, net, inet->inet_num);
			prev_sk_rcv_saddr = sk->sk_rcv_saddr;
		}
		inet->inet_saddr = fl4->saddr;
	}

	sk_rcv_saddr_set(sk, inet->inet_saddr);

	if (prev_addr_hashbucket) {
		err = inet_bhash2_update_saddr(prev_addr_hashbucket, sk);
		if (err) {
			inet->inet_saddr = 0;
			sk_rcv_saddr_set(sk, prev_sk_rcv_saddr);
			ip_rt_put(rt);
			return err;
		}
	}

	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent	   = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		if (likely(!tp->repair))
			WRITE_ONCE(tp->write_seq, 0);
	}

	inet->inet_dport = usin->sin_port;
	sk_daddr_set(sk, daddr);

	inet_csk(sk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;

	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and, without releasing the socket
	 * lock, select a source port, enter ourselves into the hash tables
	 * and complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(tcp_death_row, sk);
	if (err)
		goto failure;

	sk_set_txhash(sk);

	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
			       inet->inet_sport, inet->inet_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		rt = NULL;
		goto failure;
	}
	/* OK, now commit destination to socket. */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->dst);
	rt = NULL;

	if (likely(!tp->repair)) {
		if (!tp->write_seq)
			WRITE_ONCE(tp->write_seq,
				   secure_tcp_seq(inet->inet_saddr,
						  inet->inet_daddr,
						  inet->inet_sport,
						  usin->sin_port));
		tp->tsoffset = secure_tcp_ts_off(net, inet->inet_saddr,
						 inet->inet_daddr);
	}

	inet->inet_id = get_random_u16();

	if (tcp_fastopen_defer_connect(sk, &err))
		return err;
	if (err)
		goto failure;

	err = tcp_connect(sk);

	if (err)
		goto failure;

	return 0;

failure:
	/*
	 * This unhashes the socket and releases the local port,
	 * if necessary.
	 */
	tcp_set_state(sk, TCP_CLOSE);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->inet_dport = 0;
	return err;
}
EXPORT_SYMBOL(tcp_v4_connect);

/*
 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 * It can be called through tcp_release_cb() if socket was owned by user
 * at the time tcp_v4_err() was called to handle ICMP message.
 */
void tcp_v4_mtu_reduced(struct sock *sk)
{
	struct inet_sock *inet = inet_sk(sk);
	struct dst_entry *dst;
	u32 mtu;

	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
		return;
	mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
	dst = inet_csk_update_pmtu(sk, mtu);
	if (!dst)
		return;

	/* Something is about to be wrong... Remember soft error
	 * for the case, if this connection will not be able to recover.
	 */
	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
		sk->sk_err_soft = EMSGSIZE;

	mtu = dst_mtu(dst);

	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    ip_sk_accept_pmtu(sk) &&
	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
		tcp_sync_mss(sk, mtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}
EXPORT_SYMBOL(tcp_v4_mtu_reduced);

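/* An ICMP redirect arrived for this connection: if the socket has a cached
 * route, let its dst->ops->redirect() handler update the next hop; nothing
 * to do when no valid dst is cached.
 */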
static void do_redirect(struct sk_buff *skb, struct sock *sk)
{
	struct dst_entry *dst = __sk_dst_check(sk, 0);

	if (dst)
		dst->ops->redirect(dst, sk, skb);
}


/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
void tcp_req_err(struct sock *sk, u32 seq, bool abort)
{
	struct request_sock *req = inet_reqsk(sk);
	struct net *net = sock_net(sk);

	/* ICMPs are not backlogged, hence we cannot get
	 * an established socket here.
	 */
	if (seq != tcp_rsk(req)->snt_isn) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
	} else if (abort) {
		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
		tcp_listendrop(req->rsk_listener);
	}
	reqsk_put(req);
}
EXPORT_SYMBOL(tcp_req_err);

/* TCP-LD (RFC 6069) logic */
void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;
	s32 remaining;
	u32 delta_us;

	if (sock_owned_by_user(sk))
		return;

	if (seq != tp->snd_una || !icsk->icsk_retransmits ||
	    !icsk->icsk_backoff)
		return;

	skb = tcp_rtx_queue_head(sk);
	if (WARN_ON_ONCE(!skb))
		return;

	icsk->icsk_backoff--;
	icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
	icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);

	tcp_mstamp_refresh(tp);
	delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
	remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);

	if (remaining > 0) {
		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
					  remaining, TCP_RTO_MAX);
	} else {
		/* RTO revert clocked out retransmission.
		 * Will retransmit now.
		 */
		tcp_retransmit_timer(sk);
	}
}
EXPORT_SYMBOL(tcp_ld_RTO_revert);

/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 *
 */

int tcp_v4_err(struct sk_buff *skb, u32 info)
{
	const struct iphdr *iph = (const struct iphdr *)skb->data;
	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
	struct tcp_sock *tp;
	struct inet_sock *inet;
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	struct sock *sk;
	struct request_sock *fastopen;
	u32 seq, snd_una;
	int err;
	struct net *net = dev_net(skb->dev);

	sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
				       iph->daddr, th->dest, iph->saddr,
				       ntohs(th->source), inet_iif(skb), 0);
	if (!sk) {
		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
		return -ENOENT;
	}
	if (sk->sk_state == TCP_TIME_WAIT) {
		inet_twsk_put(inet_twsk(sk));
		return 0;
	}
	seq = ntohl(th->seq);
	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
				     type == ICMP_TIME_EXCEEDED ||
				     (type == ICMP_DEST_UNREACH &&
				      (code == ICMP_NET_UNREACH ||
				       code == ICMP_HOST_UNREACH)));
		return 0;
	}

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 * We do take care of PMTU discovery (RFC1191) special case :
	 * we can receive locally generated ICMP messages while socket is held.
	 */
	if (sock_owned_by_user(sk)) {
		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
	}
	if (sk->sk_state == TCP_CLOSE)
		goto out;

	if (static_branch_unlikely(&ip4_min_ttl)) {
		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
			goto out;
		}
	}

	tp = tcp_sk(sk);
	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
	fastopen = rcu_dereference(tp->fastopen_rsk);
	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, snd_una, tp->snd_nxt)) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	switch (type) {
	case ICMP_REDIRECT:
		if (!sock_owned_by_user(sk))
			do_redirect(skb, sk);
		goto out;
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			/* We are not interested in TCP_LISTEN and open_requests
			 * (SYN-ACKs sent out by Linux are always < 576 bytes, so
			 * they should go through unfragmented).
			 */
			if (sk->sk_state == TCP_LISTEN)
				goto out;

			WRITE_ONCE(tp->mtu_info, info);
			if (!sock_owned_by_user(sk)) {
				tcp_v4_mtu_reduced(sk);
			} else {
				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
					sock_hold(sk);
			}
			goto out;
		}

		err = icmp_err_convert[code].errno;
		/* check if this ICMP message allows revert of backoff.
		 * (see RFC 6069)
		 */
		if (!fastopen &&
		    (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
			tcp_ld_RTO_revert(sk, seq);
		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->sk_state) {
	case TCP_SYN_SENT:
	case TCP_SYN_RECV:
		/* Only in fast or simultaneous open. If a fast open socket is
		 * already accepted it is treated as a connected one below.
		 */
		if (fastopen && !fastopen->sk)
			break;

		ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);

		if (!sock_owned_by_user(sk)) {
			sk->sk_err = err;

			sk_error_report(sk);

			tcp_done(sk);
		} else {
			sk->sk_err_soft = err;
		}
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows to consider as hard errors
	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note, that in modern internet, where routing is unreliable
	 * and in each dark corner broken firewalls sit, sending random
	 * errors ordered by their masters even these two messages finally lose
	 * their original sense (even Linux sends invalid PORT_UNREACHs)
	 *
	 * Now we are in compliance with RFCs.
634 * --ANK (980905) 635 */ 636 637 inet = inet_sk(sk); 638 if (!sock_owned_by_user(sk) && inet->recverr) { 639 sk->sk_err = err; 640 sk_error_report(sk); 641 } else { /* Only an error on timeout */ 642 sk->sk_err_soft = err; 643 } 644 645 out: 646 bh_unlock_sock(sk); 647 sock_put(sk); 648 return 0; 649 } 650 651 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr) 652 { 653 struct tcphdr *th = tcp_hdr(skb); 654 655 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0); 656 skb->csum_start = skb_transport_header(skb) - skb->head; 657 skb->csum_offset = offsetof(struct tcphdr, check); 658 } 659 660 /* This routine computes an IPv4 TCP checksum. */ 661 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb) 662 { 663 const struct inet_sock *inet = inet_sk(sk); 664 665 __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr); 666 } 667 EXPORT_SYMBOL(tcp_v4_send_check); 668 669 /* 670 * This routine will send an RST to the other tcp. 671 * 672 * Someone asks: why I NEVER use socket parameters (TOS, TTL etc.) 673 * for reset. 674 * Answer: if a packet caused RST, it is not for a socket 675 * existing in our system, if it is matched to a socket, 676 * it is just duplicate segment or bug in other side's TCP. 677 * So that we build reply only basing on parameters 678 * arrived with segment. 679 * Exception: precedence violation. We do not implement it in any case. 680 */ 681 682 #ifdef CONFIG_TCP_MD5SIG 683 #define OPTION_BYTES TCPOLEN_MD5SIG_ALIGNED 684 #else 685 #define OPTION_BYTES sizeof(__be32) 686 #endif 687 688 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) 689 { 690 const struct tcphdr *th = tcp_hdr(skb); 691 struct { 692 struct tcphdr th; 693 __be32 opt[OPTION_BYTES / sizeof(__be32)]; 694 } rep; 695 struct ip_reply_arg arg; 696 #ifdef CONFIG_TCP_MD5SIG 697 struct tcp_md5sig_key *key = NULL; 698 const __u8 *hash_location = NULL; 699 unsigned char newhash[16]; 700 int genhash; 701 struct sock *sk1 = NULL; 702 #endif 703 u64 transmit_time = 0; 704 struct sock *ctl_sk; 705 struct net *net; 706 707 /* Never send a reset in response to a reset. */ 708 if (th->rst) 709 return; 710 711 /* If sk not NULL, it means we did a successful lookup and incoming 712 * route had to be correct. prequeue might have dropped our dst. 713 */ 714 if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL) 715 return; 716 717 /* Swap the send and the receive. */ 718 memset(&rep, 0, sizeof(rep)); 719 rep.th.dest = th->source; 720 rep.th.source = th->dest; 721 rep.th.doff = sizeof(struct tcphdr) / 4; 722 rep.th.rst = 1; 723 724 if (th->ack) { 725 rep.th.seq = th->ack_seq; 726 } else { 727 rep.th.ack = 1; 728 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin + 729 skb->len - (th->doff << 2)); 730 } 731 732 memset(&arg, 0, sizeof(arg)); 733 arg.iov[0].iov_base = (unsigned char *)&rep; 734 arg.iov[0].iov_len = sizeof(rep.th); 735 736 net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev); 737 #ifdef CONFIG_TCP_MD5SIG 738 rcu_read_lock(); 739 hash_location = tcp_parse_md5sig_option(th); 740 if (sk && sk_fullsock(sk)) { 741 const union tcp_md5_addr *addr; 742 int l3index; 743 744 /* sdif set, means packet ingressed via a device 745 * in an L3 domain and inet_iif is set to it. 746 */ 747 l3index = tcp_v4_sdif(skb) ? 
inet_iif(skb) : 0; 748 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; 749 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET); 750 } else if (hash_location) { 751 const union tcp_md5_addr *addr; 752 int sdif = tcp_v4_sdif(skb); 753 int dif = inet_iif(skb); 754 int l3index; 755 756 /* 757 * active side is lost. Try to find listening socket through 758 * source port, and then find md5 key through listening socket. 759 * we are not loose security here: 760 * Incoming packet is checked with md5 hash with finding key, 761 * no RST generated if md5 hash doesn't match. 762 */ 763 sk1 = __inet_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo, 764 NULL, 0, ip_hdr(skb)->saddr, 765 th->source, ip_hdr(skb)->daddr, 766 ntohs(th->source), dif, sdif); 767 /* don't send rst if it can't find key */ 768 if (!sk1) 769 goto out; 770 771 /* sdif set, means packet ingressed via a device 772 * in an L3 domain and dif is set to it. 773 */ 774 l3index = sdif ? dif : 0; 775 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; 776 key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET); 777 if (!key) 778 goto out; 779 780 781 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb); 782 if (genhash || memcmp(hash_location, newhash, 16) != 0) 783 goto out; 784 785 } 786 787 if (key) { 788 rep.opt[0] = htonl((TCPOPT_NOP << 24) | 789 (TCPOPT_NOP << 16) | 790 (TCPOPT_MD5SIG << 8) | 791 TCPOLEN_MD5SIG); 792 /* Update length and the length the header thinks exists */ 793 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED; 794 rep.th.doff = arg.iov[0].iov_len / 4; 795 796 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1], 797 key, ip_hdr(skb)->saddr, 798 ip_hdr(skb)->daddr, &rep.th); 799 } 800 #endif 801 /* Can't co-exist with TCPMD5, hence check rep.opt[0] */ 802 if (rep.opt[0] == 0) { 803 __be32 mrst = mptcp_reset_option(skb); 804 805 if (mrst) { 806 rep.opt[0] = mrst; 807 arg.iov[0].iov_len += sizeof(mrst); 808 rep.th.doff = arg.iov[0].iov_len / 4; 809 } 810 } 811 812 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr, 813 ip_hdr(skb)->saddr, /* XXX */ 814 arg.iov[0].iov_len, IPPROTO_TCP, 0); 815 arg.csumoffset = offsetof(struct tcphdr, check) / 2; 816 arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0; 817 818 /* When socket is gone, all binding information is lost. 819 * routing might fail in this case. No choice here, if we choose to force 820 * input interface, we will misroute in case of asymmetric route. 821 */ 822 if (sk) { 823 arg.bound_dev_if = sk->sk_bound_dev_if; 824 if (sk_fullsock(sk)) 825 trace_tcp_send_reset(sk, skb); 826 } 827 828 BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) != 829 offsetof(struct inet_timewait_sock, tw_bound_dev_if)); 830 831 arg.tos = ip_hdr(skb)->tos; 832 arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL); 833 local_bh_disable(); 834 ctl_sk = this_cpu_read(ipv4_tcp_sk); 835 sock_net_set(ctl_sk, net); 836 if (sk) { 837 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? 838 inet_twsk(sk)->tw_mark : sk->sk_mark; 839 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ? 
840 inet_twsk(sk)->tw_priority : sk->sk_priority; 841 transmit_time = tcp_transmit_time(sk); 842 xfrm_sk_clone_policy(ctl_sk, sk); 843 } 844 ip_send_unicast_reply(ctl_sk, 845 skb, &TCP_SKB_CB(skb)->header.h4.opt, 846 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, 847 &arg, arg.iov[0].iov_len, 848 transmit_time); 849 850 ctl_sk->sk_mark = 0; 851 xfrm_sk_free_policy(ctl_sk); 852 sock_net_set(ctl_sk, &init_net); 853 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS); 854 __TCP_INC_STATS(net, TCP_MIB_OUTRSTS); 855 local_bh_enable(); 856 857 #ifdef CONFIG_TCP_MD5SIG 858 out: 859 rcu_read_unlock(); 860 #endif 861 } 862 863 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states 864 outside socket context is ugly, certainly. What can I do? 865 */ 866 867 static void tcp_v4_send_ack(const struct sock *sk, 868 struct sk_buff *skb, u32 seq, u32 ack, 869 u32 win, u32 tsval, u32 tsecr, int oif, 870 struct tcp_md5sig_key *key, 871 int reply_flags, u8 tos) 872 { 873 const struct tcphdr *th = tcp_hdr(skb); 874 struct { 875 struct tcphdr th; 876 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2) 877 #ifdef CONFIG_TCP_MD5SIG 878 + (TCPOLEN_MD5SIG_ALIGNED >> 2) 879 #endif 880 ]; 881 } rep; 882 struct net *net = sock_net(sk); 883 struct ip_reply_arg arg; 884 struct sock *ctl_sk; 885 u64 transmit_time; 886 887 memset(&rep.th, 0, sizeof(struct tcphdr)); 888 memset(&arg, 0, sizeof(arg)); 889 890 arg.iov[0].iov_base = (unsigned char *)&rep; 891 arg.iov[0].iov_len = sizeof(rep.th); 892 if (tsecr) { 893 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | 894 (TCPOPT_TIMESTAMP << 8) | 895 TCPOLEN_TIMESTAMP); 896 rep.opt[1] = htonl(tsval); 897 rep.opt[2] = htonl(tsecr); 898 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED; 899 } 900 901 /* Swap the send and the receive. */ 902 rep.th.dest = th->source; 903 rep.th.source = th->dest; 904 rep.th.doff = arg.iov[0].iov_len / 4; 905 rep.th.seq = htonl(seq); 906 rep.th.ack_seq = htonl(ack); 907 rep.th.ack = 1; 908 rep.th.window = htons(win); 909 910 #ifdef CONFIG_TCP_MD5SIG 911 if (key) { 912 int offset = (tsecr) ? 3 : 0; 913 914 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) | 915 (TCPOPT_NOP << 16) | 916 (TCPOPT_MD5SIG << 8) | 917 TCPOLEN_MD5SIG); 918 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED; 919 rep.th.doff = arg.iov[0].iov_len/4; 920 921 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset], 922 key, ip_hdr(skb)->saddr, 923 ip_hdr(skb)->daddr, &rep.th); 924 } 925 #endif 926 arg.flags = reply_flags; 927 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr, 928 ip_hdr(skb)->saddr, /* XXX */ 929 arg.iov[0].iov_len, IPPROTO_TCP, 0); 930 arg.csumoffset = offsetof(struct tcphdr, check) / 2; 931 if (oif) 932 arg.bound_dev_if = oif; 933 arg.tos = tos; 934 arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL); 935 local_bh_disable(); 936 ctl_sk = this_cpu_read(ipv4_tcp_sk); 937 sock_net_set(ctl_sk, net); 938 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? 939 inet_twsk(sk)->tw_mark : sk->sk_mark; 940 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ? 
941 inet_twsk(sk)->tw_priority : sk->sk_priority; 942 transmit_time = tcp_transmit_time(sk); 943 ip_send_unicast_reply(ctl_sk, 944 skb, &TCP_SKB_CB(skb)->header.h4.opt, 945 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, 946 &arg, arg.iov[0].iov_len, 947 transmit_time); 948 949 ctl_sk->sk_mark = 0; 950 sock_net_set(ctl_sk, &init_net); 951 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS); 952 local_bh_enable(); 953 } 954 955 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) 956 { 957 struct inet_timewait_sock *tw = inet_twsk(sk); 958 struct tcp_timewait_sock *tcptw = tcp_twsk(sk); 959 960 tcp_v4_send_ack(sk, skb, 961 tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, 962 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, 963 tcp_time_stamp_raw() + tcptw->tw_ts_offset, 964 tcptw->tw_ts_recent, 965 tw->tw_bound_dev_if, 966 tcp_twsk_md5_key(tcptw), 967 tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0, 968 tw->tw_tos 969 ); 970 971 inet_twsk_put(tw); 972 } 973 974 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, 975 struct request_sock *req) 976 { 977 const union tcp_md5_addr *addr; 978 int l3index; 979 980 /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV 981 * sk->sk_state == TCP_SYN_RECV -> for Fast Open. 982 */ 983 u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 : 984 tcp_sk(sk)->snd_nxt; 985 986 /* RFC 7323 2.3 987 * The window field (SEG.WND) of every outgoing segment, with the 988 * exception of <SYN> segments, MUST be right-shifted by 989 * Rcv.Wind.Shift bits: 990 */ 991 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; 992 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0; 993 tcp_v4_send_ack(sk, skb, seq, 994 tcp_rsk(req)->rcv_nxt, 995 req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale, 996 tcp_time_stamp_raw() + tcp_rsk(req)->ts_off, 997 req->ts_recent, 998 0, 999 tcp_md5_do_lookup(sk, l3index, addr, AF_INET), 1000 inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0, 1001 ip_hdr(skb)->tos); 1002 } 1003 1004 /* 1005 * Send a SYN-ACK after having received a SYN. 1006 * This still operates on a request_sock only, not on a big 1007 * socket. 1008 */ 1009 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, 1010 struct flowi *fl, 1011 struct request_sock *req, 1012 struct tcp_fastopen_cookie *foc, 1013 enum tcp_synack_type synack_type, 1014 struct sk_buff *syn_skb) 1015 { 1016 const struct inet_request_sock *ireq = inet_rsk(req); 1017 struct flowi4 fl4; 1018 int err = -1; 1019 struct sk_buff *skb; 1020 u8 tos; 1021 1022 /* First, grab a route. */ 1023 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL) 1024 return -1; 1025 1026 skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb); 1027 1028 if (skb) { 1029 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr); 1030 1031 tos = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) ? 1032 (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) | 1033 (inet_sk(sk)->tos & INET_ECN_MASK) : 1034 inet_sk(sk)->tos; 1035 1036 if (!INET_ECN_is_capable(tos) && 1037 tcp_bpf_ca_needs_ecn((struct sock *)req)) 1038 tos |= INET_ECN_ECT_0; 1039 1040 rcu_read_lock(); 1041 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr, 1042 ireq->ir_rmt_addr, 1043 rcu_dereference(ireq->ireq_opt), 1044 tos); 1045 rcu_read_unlock(); 1046 err = net_xmit_eval(err); 1047 } 1048 1049 return err; 1050 } 1051 1052 /* 1053 * IPv4 request_sock destructor. 
1054 */ 1055 static void tcp_v4_reqsk_destructor(struct request_sock *req) 1056 { 1057 kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1)); 1058 } 1059 1060 #ifdef CONFIG_TCP_MD5SIG 1061 /* 1062 * RFC2385 MD5 checksumming requires a mapping of 1063 * IP address->MD5 Key. 1064 * We need to maintain these in the sk structure. 1065 */ 1066 1067 DEFINE_STATIC_KEY_FALSE(tcp_md5_needed); 1068 EXPORT_SYMBOL(tcp_md5_needed); 1069 1070 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new) 1071 { 1072 if (!old) 1073 return true; 1074 1075 /* l3index always overrides non-l3index */ 1076 if (old->l3index && new->l3index == 0) 1077 return false; 1078 if (old->l3index == 0 && new->l3index) 1079 return true; 1080 1081 return old->prefixlen < new->prefixlen; 1082 } 1083 1084 /* Find the Key structure for an address. */ 1085 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index, 1086 const union tcp_md5_addr *addr, 1087 int family) 1088 { 1089 const struct tcp_sock *tp = tcp_sk(sk); 1090 struct tcp_md5sig_key *key; 1091 const struct tcp_md5sig_info *md5sig; 1092 __be32 mask; 1093 struct tcp_md5sig_key *best_match = NULL; 1094 bool match; 1095 1096 /* caller either holds rcu_read_lock() or socket lock */ 1097 md5sig = rcu_dereference_check(tp->md5sig_info, 1098 lockdep_sock_is_held(sk)); 1099 if (!md5sig) 1100 return NULL; 1101 1102 hlist_for_each_entry_rcu(key, &md5sig->head, node, 1103 lockdep_sock_is_held(sk)) { 1104 if (key->family != family) 1105 continue; 1106 if (key->flags & TCP_MD5SIG_FLAG_IFINDEX && key->l3index != l3index) 1107 continue; 1108 if (family == AF_INET) { 1109 mask = inet_make_mask(key->prefixlen); 1110 match = (key->addr.a4.s_addr & mask) == 1111 (addr->a4.s_addr & mask); 1112 #if IS_ENABLED(CONFIG_IPV6) 1113 } else if (family == AF_INET6) { 1114 match = ipv6_prefix_equal(&key->addr.a6, &addr->a6, 1115 key->prefixlen); 1116 #endif 1117 } else { 1118 match = false; 1119 } 1120 1121 if (match && better_md5_match(best_match, key)) 1122 best_match = key; 1123 } 1124 return best_match; 1125 } 1126 EXPORT_SYMBOL(__tcp_md5_do_lookup); 1127 1128 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk, 1129 const union tcp_md5_addr *addr, 1130 int family, u8 prefixlen, 1131 int l3index, u8 flags) 1132 { 1133 const struct tcp_sock *tp = tcp_sk(sk); 1134 struct tcp_md5sig_key *key; 1135 unsigned int size = sizeof(struct in_addr); 1136 const struct tcp_md5sig_info *md5sig; 1137 1138 /* caller either holds rcu_read_lock() or socket lock */ 1139 md5sig = rcu_dereference_check(tp->md5sig_info, 1140 lockdep_sock_is_held(sk)); 1141 if (!md5sig) 1142 return NULL; 1143 #if IS_ENABLED(CONFIG_IPV6) 1144 if (family == AF_INET6) 1145 size = sizeof(struct in6_addr); 1146 #endif 1147 hlist_for_each_entry_rcu(key, &md5sig->head, node, 1148 lockdep_sock_is_held(sk)) { 1149 if (key->family != family) 1150 continue; 1151 if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX)) 1152 continue; 1153 if (key->l3index != l3index) 1154 continue; 1155 if (!memcmp(&key->addr, addr, size) && 1156 key->prefixlen == prefixlen) 1157 return key; 1158 } 1159 return NULL; 1160 } 1161 1162 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk, 1163 const struct sock *addr_sk) 1164 { 1165 const union tcp_md5_addr *addr; 1166 int l3index; 1167 1168 l3index = l3mdev_master_ifindex_by_index(sock_net(sk), 1169 addr_sk->sk_bound_dev_if); 1170 addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr; 1171 return 
tcp_md5_do_lookup(sk, l3index, addr, AF_INET); 1172 } 1173 EXPORT_SYMBOL(tcp_v4_md5_lookup); 1174 1175 /* This can be called on a newly created socket, from other files */ 1176 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, 1177 int family, u8 prefixlen, int l3index, u8 flags, 1178 const u8 *newkey, u8 newkeylen, gfp_t gfp) 1179 { 1180 /* Add Key to the list */ 1181 struct tcp_md5sig_key *key; 1182 struct tcp_sock *tp = tcp_sk(sk); 1183 struct tcp_md5sig_info *md5sig; 1184 1185 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags); 1186 if (key) { 1187 /* Pre-existing entry - just update that one. 1188 * Note that the key might be used concurrently. 1189 * data_race() is telling kcsan that we do not care of 1190 * key mismatches, since changing MD5 key on live flows 1191 * can lead to packet drops. 1192 */ 1193 data_race(memcpy(key->key, newkey, newkeylen)); 1194 1195 /* Pairs with READ_ONCE() in tcp_md5_hash_key(). 1196 * Also note that a reader could catch new key->keylen value 1197 * but old key->key[], this is the reason we use __GFP_ZERO 1198 * at sock_kmalloc() time below these lines. 1199 */ 1200 WRITE_ONCE(key->keylen, newkeylen); 1201 1202 return 0; 1203 } 1204 1205 md5sig = rcu_dereference_protected(tp->md5sig_info, 1206 lockdep_sock_is_held(sk)); 1207 if (!md5sig) { 1208 md5sig = kmalloc(sizeof(*md5sig), gfp); 1209 if (!md5sig) 1210 return -ENOMEM; 1211 1212 sk_gso_disable(sk); 1213 INIT_HLIST_HEAD(&md5sig->head); 1214 rcu_assign_pointer(tp->md5sig_info, md5sig); 1215 } 1216 1217 key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO); 1218 if (!key) 1219 return -ENOMEM; 1220 if (!tcp_alloc_md5sig_pool()) { 1221 sock_kfree_s(sk, key, sizeof(*key)); 1222 return -ENOMEM; 1223 } 1224 1225 memcpy(key->key, newkey, newkeylen); 1226 key->keylen = newkeylen; 1227 key->family = family; 1228 key->prefixlen = prefixlen; 1229 key->l3index = l3index; 1230 key->flags = flags; 1231 memcpy(&key->addr, addr, 1232 (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? 
sizeof(struct in6_addr) : 1233 sizeof(struct in_addr)); 1234 hlist_add_head_rcu(&key->node, &md5sig->head); 1235 return 0; 1236 } 1237 EXPORT_SYMBOL(tcp_md5_do_add); 1238 1239 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family, 1240 u8 prefixlen, int l3index, u8 flags) 1241 { 1242 struct tcp_md5sig_key *key; 1243 1244 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags); 1245 if (!key) 1246 return -ENOENT; 1247 hlist_del_rcu(&key->node); 1248 atomic_sub(sizeof(*key), &sk->sk_omem_alloc); 1249 kfree_rcu(key, rcu); 1250 return 0; 1251 } 1252 EXPORT_SYMBOL(tcp_md5_do_del); 1253 1254 static void tcp_clear_md5_list(struct sock *sk) 1255 { 1256 struct tcp_sock *tp = tcp_sk(sk); 1257 struct tcp_md5sig_key *key; 1258 struct hlist_node *n; 1259 struct tcp_md5sig_info *md5sig; 1260 1261 md5sig = rcu_dereference_protected(tp->md5sig_info, 1); 1262 1263 hlist_for_each_entry_safe(key, n, &md5sig->head, node) { 1264 hlist_del_rcu(&key->node); 1265 atomic_sub(sizeof(*key), &sk->sk_omem_alloc); 1266 kfree_rcu(key, rcu); 1267 } 1268 } 1269 1270 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname, 1271 sockptr_t optval, int optlen) 1272 { 1273 struct tcp_md5sig cmd; 1274 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr; 1275 const union tcp_md5_addr *addr; 1276 u8 prefixlen = 32; 1277 int l3index = 0; 1278 u8 flags; 1279 1280 if (optlen < sizeof(cmd)) 1281 return -EINVAL; 1282 1283 if (copy_from_sockptr(&cmd, optval, sizeof(cmd))) 1284 return -EFAULT; 1285 1286 if (sin->sin_family != AF_INET) 1287 return -EINVAL; 1288 1289 flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX; 1290 1291 if (optname == TCP_MD5SIG_EXT && 1292 cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) { 1293 prefixlen = cmd.tcpm_prefixlen; 1294 if (prefixlen > 32) 1295 return -EINVAL; 1296 } 1297 1298 if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex && 1299 cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) { 1300 struct net_device *dev; 1301 1302 rcu_read_lock(); 1303 dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex); 1304 if (dev && netif_is_l3_master(dev)) 1305 l3index = dev->ifindex; 1306 1307 rcu_read_unlock(); 1308 1309 /* ok to reference set/not set outside of rcu; 1310 * right now device MUST be an L3 master 1311 */ 1312 if (!dev || !l3index) 1313 return -EINVAL; 1314 } 1315 1316 addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr; 1317 1318 if (!cmd.tcpm_keylen) 1319 return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags); 1320 1321 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN) 1322 return -EINVAL; 1323 1324 return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags, 1325 cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL); 1326 } 1327 1328 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp, 1329 __be32 daddr, __be32 saddr, 1330 const struct tcphdr *th, int nbytes) 1331 { 1332 struct tcp4_pseudohdr *bp; 1333 struct scatterlist sg; 1334 struct tcphdr *_th; 1335 1336 bp = hp->scratch; 1337 bp->saddr = saddr; 1338 bp->daddr = daddr; 1339 bp->pad = 0; 1340 bp->protocol = IPPROTO_TCP; 1341 bp->len = cpu_to_be16(nbytes); 1342 1343 _th = (struct tcphdr *)(bp + 1); 1344 memcpy(_th, th, sizeof(*th)); 1345 _th->check = 0; 1346 1347 sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th)); 1348 ahash_request_set_crypt(hp->md5_req, &sg, NULL, 1349 sizeof(*bp) + sizeof(*th)); 1350 return crypto_ahash_update(hp->md5_req); 1351 } 1352 1353 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, 1354 __be32 daddr, __be32 saddr, const struct tcphdr 
*th) 1355 { 1356 struct tcp_md5sig_pool *hp; 1357 struct ahash_request *req; 1358 1359 hp = tcp_get_md5sig_pool(); 1360 if (!hp) 1361 goto clear_hash_noput; 1362 req = hp->md5_req; 1363 1364 if (crypto_ahash_init(req)) 1365 goto clear_hash; 1366 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2)) 1367 goto clear_hash; 1368 if (tcp_md5_hash_key(hp, key)) 1369 goto clear_hash; 1370 ahash_request_set_crypt(req, NULL, md5_hash, 0); 1371 if (crypto_ahash_final(req)) 1372 goto clear_hash; 1373 1374 tcp_put_md5sig_pool(); 1375 return 0; 1376 1377 clear_hash: 1378 tcp_put_md5sig_pool(); 1379 clear_hash_noput: 1380 memset(md5_hash, 0, 16); 1381 return 1; 1382 } 1383 1384 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key, 1385 const struct sock *sk, 1386 const struct sk_buff *skb) 1387 { 1388 struct tcp_md5sig_pool *hp; 1389 struct ahash_request *req; 1390 const struct tcphdr *th = tcp_hdr(skb); 1391 __be32 saddr, daddr; 1392 1393 if (sk) { /* valid for establish/request sockets */ 1394 saddr = sk->sk_rcv_saddr; 1395 daddr = sk->sk_daddr; 1396 } else { 1397 const struct iphdr *iph = ip_hdr(skb); 1398 saddr = iph->saddr; 1399 daddr = iph->daddr; 1400 } 1401 1402 hp = tcp_get_md5sig_pool(); 1403 if (!hp) 1404 goto clear_hash_noput; 1405 req = hp->md5_req; 1406 1407 if (crypto_ahash_init(req)) 1408 goto clear_hash; 1409 1410 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len)) 1411 goto clear_hash; 1412 if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2)) 1413 goto clear_hash; 1414 if (tcp_md5_hash_key(hp, key)) 1415 goto clear_hash; 1416 ahash_request_set_crypt(req, NULL, md5_hash, 0); 1417 if (crypto_ahash_final(req)) 1418 goto clear_hash; 1419 1420 tcp_put_md5sig_pool(); 1421 return 0; 1422 1423 clear_hash: 1424 tcp_put_md5sig_pool(); 1425 clear_hash_noput: 1426 memset(md5_hash, 0, 16); 1427 return 1; 1428 } 1429 EXPORT_SYMBOL(tcp_v4_md5_hash_skb); 1430 1431 #endif 1432 1433 static void tcp_v4_init_req(struct request_sock *req, 1434 const struct sock *sk_listener, 1435 struct sk_buff *skb) 1436 { 1437 struct inet_request_sock *ireq = inet_rsk(req); 1438 struct net *net = sock_net(sk_listener); 1439 1440 sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr); 1441 sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr); 1442 RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb)); 1443 } 1444 1445 static struct dst_entry *tcp_v4_route_req(const struct sock *sk, 1446 struct sk_buff *skb, 1447 struct flowi *fl, 1448 struct request_sock *req) 1449 { 1450 tcp_v4_init_req(req, sk, skb); 1451 1452 if (security_inet_conn_request(sk, skb, req)) 1453 return NULL; 1454 1455 return inet_csk_route_req(sk, &fl->u.ip4, req); 1456 } 1457 1458 struct request_sock_ops tcp_request_sock_ops __read_mostly = { 1459 .family = PF_INET, 1460 .obj_size = sizeof(struct tcp_request_sock), 1461 .rtx_syn_ack = tcp_rtx_synack, 1462 .send_ack = tcp_v4_reqsk_send_ack, 1463 .destructor = tcp_v4_reqsk_destructor, 1464 .send_reset = tcp_v4_send_reset, 1465 .syn_ack_timeout = tcp_syn_ack_timeout, 1466 }; 1467 1468 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { 1469 .mss_clamp = TCP_MSS_DEFAULT, 1470 #ifdef CONFIG_TCP_MD5SIG 1471 .req_md5_lookup = tcp_v4_md5_lookup, 1472 .calc_md5_hash = tcp_v4_md5_hash_skb, 1473 #endif 1474 #ifdef CONFIG_SYN_COOKIES 1475 .cookie_init_seq = cookie_v4_init_sequence, 1476 #endif 1477 .route_req = tcp_v4_route_req, 1478 .init_seq = tcp_v4_init_seq, 1479 .init_ts_off = tcp_v4_init_ts_off, 1480 .send_synack = tcp_v4_send_synack, 1481 }; 1482 1483 int 
tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) 1484 { 1485 /* Never answer to SYNs send to broadcast or multicast */ 1486 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) 1487 goto drop; 1488 1489 return tcp_conn_request(&tcp_request_sock_ops, 1490 &tcp_request_sock_ipv4_ops, sk, skb); 1491 1492 drop: 1493 tcp_listendrop(sk); 1494 return 0; 1495 } 1496 EXPORT_SYMBOL(tcp_v4_conn_request); 1497 1498 1499 /* 1500 * The three way handshake has completed - we got a valid synack - 1501 * now create the new socket. 1502 */ 1503 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, 1504 struct request_sock *req, 1505 struct dst_entry *dst, 1506 struct request_sock *req_unhash, 1507 bool *own_req) 1508 { 1509 struct inet_request_sock *ireq; 1510 bool found_dup_sk = false; 1511 struct inet_sock *newinet; 1512 struct tcp_sock *newtp; 1513 struct sock *newsk; 1514 #ifdef CONFIG_TCP_MD5SIG 1515 const union tcp_md5_addr *addr; 1516 struct tcp_md5sig_key *key; 1517 int l3index; 1518 #endif 1519 struct ip_options_rcu *inet_opt; 1520 1521 if (sk_acceptq_is_full(sk)) 1522 goto exit_overflow; 1523 1524 newsk = tcp_create_openreq_child(sk, req, skb); 1525 if (!newsk) 1526 goto exit_nonewsk; 1527 1528 newsk->sk_gso_type = SKB_GSO_TCPV4; 1529 inet_sk_rx_dst_set(newsk, skb); 1530 1531 newtp = tcp_sk(newsk); 1532 newinet = inet_sk(newsk); 1533 ireq = inet_rsk(req); 1534 sk_daddr_set(newsk, ireq->ir_rmt_addr); 1535 sk_rcv_saddr_set(newsk, ireq->ir_loc_addr); 1536 newsk->sk_bound_dev_if = ireq->ir_iif; 1537 newinet->inet_saddr = ireq->ir_loc_addr; 1538 inet_opt = rcu_dereference(ireq->ireq_opt); 1539 RCU_INIT_POINTER(newinet->inet_opt, inet_opt); 1540 newinet->mc_index = inet_iif(skb); 1541 newinet->mc_ttl = ip_hdr(skb)->ttl; 1542 newinet->rcv_tos = ip_hdr(skb)->tos; 1543 inet_csk(newsk)->icsk_ext_hdr_len = 0; 1544 if (inet_opt) 1545 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen; 1546 newinet->inet_id = get_random_u16(); 1547 1548 /* Set ToS of the new socket based upon the value of incoming SYN. 1549 * ECT bits are set later in tcp_init_transfer(). 1550 */ 1551 if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)) 1552 newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK; 1553 1554 if (!dst) { 1555 dst = inet_csk_route_child_sock(sk, newsk, req); 1556 if (!dst) 1557 goto put_and_exit; 1558 } else { 1559 /* syncookie case : see end of cookie_v4_check() */ 1560 } 1561 sk_setup_caps(newsk, dst); 1562 1563 tcp_ca_openreq_child(newsk, dst); 1564 1565 tcp_sync_mss(newsk, dst_mtu(dst)); 1566 newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst)); 1567 1568 tcp_initialize_rcv_mss(newsk); 1569 1570 #ifdef CONFIG_TCP_MD5SIG 1571 l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif); 1572 /* Copy over the MD5 key from the original socket */ 1573 addr = (union tcp_md5_addr *)&newinet->inet_daddr; 1574 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET); 1575 if (key) { 1576 /* 1577 * We're using one, so create a matching key 1578 * on the newsk structure. If we fail to get 1579 * memory, then we end up not copying the key 1580 * across. Shucks. 
1581 */ 1582 tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index, key->flags, 1583 key->key, key->keylen, GFP_ATOMIC); 1584 sk_gso_disable(newsk); 1585 } 1586 #endif 1587 1588 if (__inet_inherit_port(sk, newsk) < 0) 1589 goto put_and_exit; 1590 *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash), 1591 &found_dup_sk); 1592 if (likely(*own_req)) { 1593 tcp_move_syn(newtp, req); 1594 ireq->ireq_opt = NULL; 1595 } else { 1596 newinet->inet_opt = NULL; 1597 1598 if (!req_unhash && found_dup_sk) { 1599 /* This code path should only be executed in the 1600 * syncookie case only 1601 */ 1602 bh_unlock_sock(newsk); 1603 sock_put(newsk); 1604 newsk = NULL; 1605 } 1606 } 1607 return newsk; 1608 1609 exit_overflow: 1610 NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); 1611 exit_nonewsk: 1612 dst_release(dst); 1613 exit: 1614 tcp_listendrop(sk); 1615 return NULL; 1616 put_and_exit: 1617 newinet->inet_opt = NULL; 1618 inet_csk_prepare_forced_close(newsk); 1619 tcp_done(newsk); 1620 goto exit; 1621 } 1622 EXPORT_SYMBOL(tcp_v4_syn_recv_sock); 1623 1624 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb) 1625 { 1626 #ifdef CONFIG_SYN_COOKIES 1627 const struct tcphdr *th = tcp_hdr(skb); 1628 1629 if (!th->syn) 1630 sk = cookie_v4_check(sk, skb); 1631 #endif 1632 return sk; 1633 } 1634 1635 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph, 1636 struct tcphdr *th, u32 *cookie) 1637 { 1638 u16 mss = 0; 1639 #ifdef CONFIG_SYN_COOKIES 1640 mss = tcp_get_syncookie_mss(&tcp_request_sock_ops, 1641 &tcp_request_sock_ipv4_ops, sk, th); 1642 if (mss) { 1643 *cookie = __cookie_v4_init_sequence(iph, th, &mss); 1644 tcp_synq_overflow(sk); 1645 } 1646 #endif 1647 return mss; 1648 } 1649 1650 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *, 1651 u32)); 1652 /* The socket must have it's spinlock held when we get 1653 * here, unless it is a TCP_LISTEN socket. 1654 * 1655 * We have a potential double-lock case here, so even when 1656 * doing backlog processing we use the BH locking scheme. 1657 * This is because we cannot sleep with the original spinlock 1658 * held. 1659 */ 1660 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) 1661 { 1662 enum skb_drop_reason reason; 1663 struct sock *rsk; 1664 1665 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ 1666 struct dst_entry *dst; 1667 1668 dst = rcu_dereference_protected(sk->sk_rx_dst, 1669 lockdep_sock_is_held(sk)); 1670 1671 sock_rps_save_rxhash(sk, skb); 1672 sk_mark_napi_id(sk, skb); 1673 if (dst) { 1674 if (sk->sk_rx_dst_ifindex != skb->skb_iif || 1675 !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check, 1676 dst, 0)) { 1677 RCU_INIT_POINTER(sk->sk_rx_dst, NULL); 1678 dst_release(dst); 1679 } 1680 } 1681 tcp_rcv_established(sk, skb); 1682 return 0; 1683 } 1684 1685 reason = SKB_DROP_REASON_NOT_SPECIFIED; 1686 if (tcp_checksum_complete(skb)) 1687 goto csum_err; 1688 1689 if (sk->sk_state == TCP_LISTEN) { 1690 struct sock *nsk = tcp_v4_cookie_check(sk, skb); 1691 1692 if (!nsk) 1693 goto discard; 1694 if (nsk != sk) { 1695 if (tcp_child_process(sk, nsk, skb)) { 1696 rsk = nsk; 1697 goto reset; 1698 } 1699 return 0; 1700 } 1701 } else 1702 sock_rps_save_rxhash(sk, skb); 1703 1704 if (tcp_rcv_state_process(sk, skb)) { 1705 rsk = sk; 1706 goto reset; 1707 } 1708 return 0; 1709 1710 reset: 1711 tcp_v4_send_reset(rsk, skb); 1712 discard: 1713 kfree_skb_reason(skb, reason); 1714 /* Be careful here. 
If this function gets more complicated and 1715 * gcc suffers from register pressure on the x86, sk (in %ebx) 1716 * might be destroyed here. This current version compiles correctly, 1717 * but you have been warned. 1718 */ 1719 return 0; 1720 1721 csum_err: 1722 reason = SKB_DROP_REASON_TCP_CSUM; 1723 trace_tcp_bad_csum(skb); 1724 TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); 1725 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); 1726 goto discard; 1727 } 1728 EXPORT_SYMBOL(tcp_v4_do_rcv); 1729 1730 int tcp_v4_early_demux(struct sk_buff *skb) 1731 { 1732 struct net *net = dev_net(skb->dev); 1733 const struct iphdr *iph; 1734 const struct tcphdr *th; 1735 struct sock *sk; 1736 1737 if (skb->pkt_type != PACKET_HOST) 1738 return 0; 1739 1740 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr))) 1741 return 0; 1742 1743 iph = ip_hdr(skb); 1744 th = tcp_hdr(skb); 1745 1746 if (th->doff < sizeof(struct tcphdr) / 4) 1747 return 0; 1748 1749 sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, 1750 iph->saddr, th->source, 1751 iph->daddr, ntohs(th->dest), 1752 skb->skb_iif, inet_sdif(skb)); 1753 if (sk) { 1754 skb->sk = sk; 1755 skb->destructor = sock_edemux; 1756 if (sk_fullsock(sk)) { 1757 struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst); 1758 1759 if (dst) 1760 dst = dst_check(dst, 0); 1761 if (dst && 1762 sk->sk_rx_dst_ifindex == skb->skb_iif) 1763 skb_dst_set_noref(skb, dst); 1764 } 1765 } 1766 return 0; 1767 } 1768 1769 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb, 1770 enum skb_drop_reason *reason) 1771 { 1772 u32 limit, tail_gso_size, tail_gso_segs; 1773 struct skb_shared_info *shinfo; 1774 const struct tcphdr *th; 1775 struct tcphdr *thtail; 1776 struct sk_buff *tail; 1777 unsigned int hdrlen; 1778 bool fragstolen; 1779 u32 gso_segs; 1780 u32 gso_size; 1781 int delta; 1782 1783 /* In case all data was pulled from skb frags (in __pskb_pull_tail()), 1784 * we can fix skb->truesize to its real value to avoid future drops. 1785 * This is valid because skb is not yet charged to the socket. 1786 * It has been noticed pure SACK packets were sometimes dropped 1787 * (if cooked by drivers without copybreak feature). 1788 */ 1789 skb_condense(skb); 1790 1791 skb_dst_drop(skb); 1792 1793 if (unlikely(tcp_checksum_complete(skb))) { 1794 bh_unlock_sock(sk); 1795 trace_tcp_bad_csum(skb); 1796 *reason = SKB_DROP_REASON_TCP_CSUM; 1797 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); 1798 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); 1799 return true; 1800 } 1801 1802 /* Attempt coalescing to last skb in backlog, even if we are 1803 * above the limits. 1804 * This is okay because skb capacity is limited to MAX_SKB_FRAGS. 
1805 */ 1806 th = (const struct tcphdr *)skb->data; 1807 hdrlen = th->doff * 4; 1808 1809 tail = sk->sk_backlog.tail; 1810 if (!tail) 1811 goto no_coalesce; 1812 thtail = (struct tcphdr *)tail->data; 1813 1814 if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq || 1815 TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield || 1816 ((TCP_SKB_CB(tail)->tcp_flags | 1817 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) || 1818 !((TCP_SKB_CB(tail)->tcp_flags & 1819 TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) || 1820 ((TCP_SKB_CB(tail)->tcp_flags ^ 1821 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) || 1822 #ifdef CONFIG_TLS_DEVICE 1823 tail->decrypted != skb->decrypted || 1824 #endif 1825 thtail->doff != th->doff || 1826 memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th))) 1827 goto no_coalesce; 1828 1829 __skb_pull(skb, hdrlen); 1830 1831 shinfo = skb_shinfo(skb); 1832 gso_size = shinfo->gso_size ?: skb->len; 1833 gso_segs = shinfo->gso_segs ?: 1; 1834 1835 shinfo = skb_shinfo(tail); 1836 tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen); 1837 tail_gso_segs = shinfo->gso_segs ?: 1; 1838 1839 if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) { 1840 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq; 1841 1842 if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) { 1843 TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq; 1844 thtail->window = th->window; 1845 } 1846 1847 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and 1848 * thtail->fin, so that the fast path in tcp_rcv_established() 1849 * is not entered if we append a packet with a FIN. 1850 * SYN, RST, URG are not present. 1851 * ACK is set on both packets. 1852 * PSH : we do not really care in TCP stack, 1853 * at least for 'GRO' packets. 1854 */ 1855 thtail->fin |= th->fin; 1856 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags; 1857 1858 if (TCP_SKB_CB(skb)->has_rxtstamp) { 1859 TCP_SKB_CB(tail)->has_rxtstamp = true; 1860 tail->tstamp = skb->tstamp; 1861 skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp; 1862 } 1863 1864 /* Not as strict as GRO. We only need to carry mss max value */ 1865 shinfo->gso_size = max(gso_size, tail_gso_size); 1866 shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF); 1867 1868 sk->sk_backlog.len += delta; 1869 __NET_INC_STATS(sock_net(sk), 1870 LINUX_MIB_TCPBACKLOGCOALESCE); 1871 kfree_skb_partial(skb, fragstolen); 1872 return false; 1873 } 1874 __skb_push(skb, hdrlen); 1875 1876 no_coalesce: 1877 limit = (u32)READ_ONCE(sk->sk_rcvbuf) + (u32)(READ_ONCE(sk->sk_sndbuf) >> 1); 1878 1879 /* Only socket owner can try to collapse/prune rx queues 1880 * to reduce memory overhead, so add a little headroom here. 1881 * Few sockets backlog are possibly concurrently non empty. 
1882 */ 1883 limit += 64 * 1024; 1884 1885 if (unlikely(sk_add_backlog(sk, skb, limit))) { 1886 bh_unlock_sock(sk); 1887 *reason = SKB_DROP_REASON_SOCKET_BACKLOG; 1888 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP); 1889 return true; 1890 } 1891 return false; 1892 } 1893 EXPORT_SYMBOL(tcp_add_backlog); 1894 1895 int tcp_filter(struct sock *sk, struct sk_buff *skb) 1896 { 1897 struct tcphdr *th = (struct tcphdr *)skb->data; 1898 1899 return sk_filter_trim_cap(sk, skb, th->doff * 4); 1900 } 1901 EXPORT_SYMBOL(tcp_filter); 1902 1903 static void tcp_v4_restore_cb(struct sk_buff *skb) 1904 { 1905 memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4, 1906 sizeof(struct inet_skb_parm)); 1907 } 1908 1909 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph, 1910 const struct tcphdr *th) 1911 { 1912 /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB() 1913 * barrier() makes sure compiler wont play fool^Waliasing games. 1914 */ 1915 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb), 1916 sizeof(struct inet_skb_parm)); 1917 barrier(); 1918 1919 TCP_SKB_CB(skb)->seq = ntohl(th->seq); 1920 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + 1921 skb->len - th->doff * 4); 1922 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); 1923 TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th); 1924 TCP_SKB_CB(skb)->tcp_tw_isn = 0; 1925 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph); 1926 TCP_SKB_CB(skb)->sacked = 0; 1927 TCP_SKB_CB(skb)->has_rxtstamp = 1928 skb->tstamp || skb_hwtstamps(skb)->hwtstamp; 1929 } 1930 1931 /* 1932 * From tcp_input.c 1933 */ 1934 1935 int tcp_v4_rcv(struct sk_buff *skb) 1936 { 1937 struct net *net = dev_net(skb->dev); 1938 enum skb_drop_reason drop_reason; 1939 int sdif = inet_sdif(skb); 1940 int dif = inet_iif(skb); 1941 const struct iphdr *iph; 1942 const struct tcphdr *th; 1943 bool refcounted; 1944 struct sock *sk; 1945 int ret; 1946 1947 drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; 1948 if (skb->pkt_type != PACKET_HOST) 1949 goto discard_it; 1950 1951 /* Count it even if it's bad */ 1952 __TCP_INC_STATS(net, TCP_MIB_INSEGS); 1953 1954 if (!pskb_may_pull(skb, sizeof(struct tcphdr))) 1955 goto discard_it; 1956 1957 th = (const struct tcphdr *)skb->data; 1958 1959 if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) { 1960 drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL; 1961 goto bad_packet; 1962 } 1963 if (!pskb_may_pull(skb, th->doff * 4)) 1964 goto discard_it; 1965 1966 /* An explanation is required here, I think. 1967 * Packet length and doff are validated by header prediction, 1968 * provided case of th->doff==0 is eliminated. 1969 * So, we defer the checks. 
*/ 1970 1971 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo)) 1972 goto csum_error; 1973 1974 th = (const struct tcphdr *)skb->data; 1975 iph = ip_hdr(skb); 1976 lookup: 1977 sk = __inet_lookup_skb(net->ipv4.tcp_death_row.hashinfo, 1978 skb, __tcp_hdrlen(th), th->source, 1979 th->dest, sdif, &refcounted); 1980 if (!sk) 1981 goto no_tcp_socket; 1982 1983 process: 1984 if (sk->sk_state == TCP_TIME_WAIT) 1985 goto do_time_wait; 1986 1987 if (sk->sk_state == TCP_NEW_SYN_RECV) { 1988 struct request_sock *req = inet_reqsk(sk); 1989 bool req_stolen = false; 1990 struct sock *nsk; 1991 1992 sk = req->rsk_listener; 1993 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) 1994 drop_reason = SKB_DROP_REASON_XFRM_POLICY; 1995 else 1996 drop_reason = tcp_inbound_md5_hash(sk, skb, 1997 &iph->saddr, &iph->daddr, 1998 AF_INET, dif, sdif); 1999 if (unlikely(drop_reason)) { 2000 sk_drops_add(sk, skb); 2001 reqsk_put(req); 2002 goto discard_it; 2003 } 2004 if (tcp_checksum_complete(skb)) { 2005 reqsk_put(req); 2006 goto csum_error; 2007 } 2008 if (unlikely(sk->sk_state != TCP_LISTEN)) { 2009 nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb); 2010 if (!nsk) { 2011 inet_csk_reqsk_queue_drop_and_put(sk, req); 2012 goto lookup; 2013 } 2014 sk = nsk; 2015 /* reuseport_migrate_sock() has already held one sk_refcnt 2016 * before returning. 2017 */ 2018 } else { 2019 /* We own a reference on the listener, increase it again 2020 * as we might lose it too soon. 2021 */ 2022 sock_hold(sk); 2023 } 2024 refcounted = true; 2025 nsk = NULL; 2026 if (!tcp_filter(sk, skb)) { 2027 th = (const struct tcphdr *)skb->data; 2028 iph = ip_hdr(skb); 2029 tcp_v4_fill_cb(skb, iph, th); 2030 nsk = tcp_check_req(sk, skb, req, false, &req_stolen); 2031 } else { 2032 drop_reason = SKB_DROP_REASON_SOCKET_FILTER; 2033 } 2034 if (!nsk) { 2035 reqsk_put(req); 2036 if (req_stolen) { 2037 /* Another cpu got exclusive access to req 2038 * and created a full blown socket. 2039 * Try to feed this packet to this socket 2040 * instead of discarding it. 
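* (tcp_v4_restore_cb() below moves the saved inet_skb_parm back in front
* of TCP_SKB_CB(), so the skb looks as it did before this attempt, and the
* goto re-runs the socket lookup, which should now find the child socket
* created by the other cpu.)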
2041 */ 2042 tcp_v4_restore_cb(skb); 2043 sock_put(sk); 2044 goto lookup; 2045 } 2046 goto discard_and_relse; 2047 } 2048 nf_reset_ct(skb); 2049 if (nsk == sk) { 2050 reqsk_put(req); 2051 tcp_v4_restore_cb(skb); 2052 } else if (tcp_child_process(sk, nsk, skb)) { 2053 tcp_v4_send_reset(nsk, skb); 2054 goto discard_and_relse; 2055 } else { 2056 sock_put(sk); 2057 return 0; 2058 } 2059 } 2060 2061 if (static_branch_unlikely(&ip4_min_ttl)) { 2062 /* min_ttl can be changed concurrently from do_ip_setsockopt() */ 2063 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) { 2064 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); 2065 goto discard_and_relse; 2066 } 2067 } 2068 2069 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) { 2070 drop_reason = SKB_DROP_REASON_XFRM_POLICY; 2071 goto discard_and_relse; 2072 } 2073 2074 drop_reason = tcp_inbound_md5_hash(sk, skb, &iph->saddr, 2075 &iph->daddr, AF_INET, dif, sdif); 2076 if (drop_reason) 2077 goto discard_and_relse; 2078 2079 nf_reset_ct(skb); 2080 2081 if (tcp_filter(sk, skb)) { 2082 drop_reason = SKB_DROP_REASON_SOCKET_FILTER; 2083 goto discard_and_relse; 2084 } 2085 th = (const struct tcphdr *)skb->data; 2086 iph = ip_hdr(skb); 2087 tcp_v4_fill_cb(skb, iph, th); 2088 2089 skb->dev = NULL; 2090 2091 if (sk->sk_state == TCP_LISTEN) { 2092 ret = tcp_v4_do_rcv(sk, skb); 2093 goto put_and_return; 2094 } 2095 2096 sk_incoming_cpu_update(sk); 2097 2098 bh_lock_sock_nested(sk); 2099 tcp_segs_in(tcp_sk(sk), skb); 2100 ret = 0; 2101 if (!sock_owned_by_user(sk)) { 2102 ret = tcp_v4_do_rcv(sk, skb); 2103 } else { 2104 if (tcp_add_backlog(sk, skb, &drop_reason)) 2105 goto discard_and_relse; 2106 } 2107 bh_unlock_sock(sk); 2108 2109 put_and_return: 2110 if (refcounted) 2111 sock_put(sk); 2112 2113 return ret; 2114 2115 no_tcp_socket: 2116 drop_reason = SKB_DROP_REASON_NO_SOCKET; 2117 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) 2118 goto discard_it; 2119 2120 tcp_v4_fill_cb(skb, iph, th); 2121 2122 if (tcp_checksum_complete(skb)) { 2123 csum_error: 2124 drop_reason = SKB_DROP_REASON_TCP_CSUM; 2125 trace_tcp_bad_csum(skb); 2126 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS); 2127 bad_packet: 2128 __TCP_INC_STATS(net, TCP_MIB_INERRS); 2129 } else { 2130 tcp_v4_send_reset(NULL, skb); 2131 } 2132 2133 discard_it: 2134 SKB_DR_OR(drop_reason, NOT_SPECIFIED); 2135 /* Discard frame. 
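* (The precise reason is handed to kfree_skb_reason() below; on kernels
* with drop-reason support it can be observed from user space through the
* skb:kfree_skb tracepoint, which is handy when debugging silent drops.)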
*/ 2136 kfree_skb_reason(skb, drop_reason); 2137 return 0; 2138 2139 discard_and_relse: 2140 sk_drops_add(sk, skb); 2141 if (refcounted) 2142 sock_put(sk); 2143 goto discard_it; 2144 2145 do_time_wait: 2146 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { 2147 drop_reason = SKB_DROP_REASON_XFRM_POLICY; 2148 inet_twsk_put(inet_twsk(sk)); 2149 goto discard_it; 2150 } 2151 2152 tcp_v4_fill_cb(skb, iph, th); 2153 2154 if (tcp_checksum_complete(skb)) { 2155 inet_twsk_put(inet_twsk(sk)); 2156 goto csum_error; 2157 } 2158 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) { 2159 case TCP_TW_SYN: { 2160 struct sock *sk2 = inet_lookup_listener(net, 2161 net->ipv4.tcp_death_row.hashinfo, 2162 skb, __tcp_hdrlen(th), 2163 iph->saddr, th->source, 2164 iph->daddr, th->dest, 2165 inet_iif(skb), 2166 sdif); 2167 if (sk2) { 2168 inet_twsk_deschedule_put(inet_twsk(sk)); 2169 sk = sk2; 2170 tcp_v4_restore_cb(skb); 2171 refcounted = false; 2172 goto process; 2173 } 2174 } 2175 /* to ACK */ 2176 fallthrough; 2177 case TCP_TW_ACK: 2178 tcp_v4_timewait_ack(sk, skb); 2179 break; 2180 case TCP_TW_RST: 2181 tcp_v4_send_reset(sk, skb); 2182 inet_twsk_deschedule_put(inet_twsk(sk)); 2183 goto discard_it; 2184 case TCP_TW_SUCCESS:; 2185 } 2186 goto discard_it; 2187 } 2188 2189 static struct timewait_sock_ops tcp_timewait_sock_ops = { 2190 .twsk_obj_size = sizeof(struct tcp_timewait_sock), 2191 .twsk_unique = tcp_twsk_unique, 2192 .twsk_destructor= tcp_twsk_destructor, 2193 }; 2194 2195 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) 2196 { 2197 struct dst_entry *dst = skb_dst(skb); 2198 2199 if (dst && dst_hold_safe(dst)) { 2200 rcu_assign_pointer(sk->sk_rx_dst, dst); 2201 sk->sk_rx_dst_ifindex = skb->skb_iif; 2202 } 2203 } 2204 EXPORT_SYMBOL(inet_sk_rx_dst_set); 2205 2206 const struct inet_connection_sock_af_ops ipv4_specific = { 2207 .queue_xmit = ip_queue_xmit, 2208 .send_check = tcp_v4_send_check, 2209 .rebuild_header = inet_sk_rebuild_header, 2210 .sk_rx_dst_set = inet_sk_rx_dst_set, 2211 .conn_request = tcp_v4_conn_request, 2212 .syn_recv_sock = tcp_v4_syn_recv_sock, 2213 .net_header_len = sizeof(struct iphdr), 2214 .setsockopt = ip_setsockopt, 2215 .getsockopt = ip_getsockopt, 2216 .addr2sockaddr = inet_csk_addr2sockaddr, 2217 .sockaddr_len = sizeof(struct sockaddr_in), 2218 .mtu_reduced = tcp_v4_mtu_reduced, 2219 }; 2220 EXPORT_SYMBOL(ipv4_specific); 2221 2222 #ifdef CONFIG_TCP_MD5SIG 2223 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = { 2224 .md5_lookup = tcp_v4_md5_lookup, 2225 .calc_md5_hash = tcp_v4_md5_hash_skb, 2226 .md5_parse = tcp_v4_parse_md5_keys, 2227 }; 2228 #endif 2229 2230 /* NOTE: A lot of things set to zero explicitly by call to 2231 * sk_alloc() so need not be done here. 2232 */ 2233 static int tcp_v4_init_sock(struct sock *sk) 2234 { 2235 struct inet_connection_sock *icsk = inet_csk(sk); 2236 2237 tcp_init_sock(sk); 2238 2239 icsk->icsk_af_ops = &ipv4_specific; 2240 2241 #ifdef CONFIG_TCP_MD5SIG 2242 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific; 2243 #endif 2244 2245 return 0; 2246 } 2247 2248 void tcp_v4_destroy_sock(struct sock *sk) 2249 { 2250 struct tcp_sock *tp = tcp_sk(sk); 2251 2252 trace_tcp_destroy_sock(sk); 2253 2254 tcp_clear_xmit_timers(sk); 2255 2256 tcp_cleanup_congestion_control(sk); 2257 2258 tcp_cleanup_ulp(sk); 2259 2260 /* Cleanup up the write buffer. 
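* (tcp_write_queue_purge() below frees any skbs still sitting on the send
* and retransmit queues; the socket is being destroyed at this point, so
* nothing will be retransmitted.)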
*/ 2261 tcp_write_queue_purge(sk); 2262 2263 /* Check if we want to disable active TFO */ 2264 tcp_fastopen_active_disable_ofo_check(sk); 2265 2266 /* Cleans up our, hopefully empty, out_of_order_queue. */ 2267 skb_rbtree_purge(&tp->out_of_order_queue); 2268 2269 #ifdef CONFIG_TCP_MD5SIG 2270 /* Clean up the MD5 key list, if any */ 2271 if (tp->md5sig_info) { 2272 tcp_clear_md5_list(sk); 2273 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu); 2274 tp->md5sig_info = NULL; 2275 } 2276 #endif 2277 2278 /* Clean up a referenced TCP bind bucket. */ 2279 if (inet_csk(sk)->icsk_bind_hash) 2280 inet_put_port(sk); 2281 2282 BUG_ON(rcu_access_pointer(tp->fastopen_rsk)); 2283 2284 /* If socket is aborted during connect operation */ 2285 tcp_free_fastopen_req(tp); 2286 tcp_fastopen_destroy_cipher(sk); 2287 tcp_saved_syn_free(tp); 2288 2289 sk_sockets_allocated_dec(sk); 2290 } 2291 EXPORT_SYMBOL(tcp_v4_destroy_sock); 2292 2293 #ifdef CONFIG_PROC_FS 2294 /* Proc filesystem TCP sock list dumping. */ 2295 2296 static unsigned short seq_file_family(const struct seq_file *seq); 2297 2298 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk) 2299 { 2300 unsigned short family = seq_file_family(seq); 2301 2302 /* AF_UNSPEC is used as a match all */ 2303 return ((family == AF_UNSPEC || family == sk->sk_family) && 2304 net_eq(sock_net(sk), seq_file_net(seq))); 2305 } 2306 2307 /* Find a non empty bucket (starting from st->bucket) 2308 * and return the first sk from it. 2309 */ 2310 static void *listening_get_first(struct seq_file *seq) 2311 { 2312 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2313 struct tcp_iter_state *st = seq->private; 2314 2315 st->offset = 0; 2316 for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) { 2317 struct inet_listen_hashbucket *ilb2; 2318 struct hlist_nulls_node *node; 2319 struct sock *sk; 2320 2321 ilb2 = &hinfo->lhash2[st->bucket]; 2322 if (hlist_nulls_empty(&ilb2->nulls_head)) 2323 continue; 2324 2325 spin_lock(&ilb2->lock); 2326 sk_nulls_for_each(sk, node, &ilb2->nulls_head) { 2327 if (seq_sk_match(seq, sk)) 2328 return sk; 2329 } 2330 spin_unlock(&ilb2->lock); 2331 } 2332 2333 return NULL; 2334 } 2335 2336 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket). 2337 * If "cur" is the last one in the st->bucket, 2338 * call listening_get_first() to return the first sk of the next 2339 * non empty bucket. 
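* (The bucket lock taken in listening_get_first() is intentionally still
* held when we get here; it is released only just before advancing to the
* next bucket, and tcp_seq_stop() drops it if the walk stops mid-bucket.)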
2340 */ 2341 static void *listening_get_next(struct seq_file *seq, void *cur) 2342 { 2343 struct tcp_iter_state *st = seq->private; 2344 struct inet_listen_hashbucket *ilb2; 2345 struct hlist_nulls_node *node; 2346 struct inet_hashinfo *hinfo; 2347 struct sock *sk = cur; 2348 2349 ++st->num; 2350 ++st->offset; 2351 2352 sk = sk_nulls_next(sk); 2353 sk_nulls_for_each_from(sk, node) { 2354 if (seq_sk_match(seq, sk)) 2355 return sk; 2356 } 2357 2358 hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2359 ilb2 = &hinfo->lhash2[st->bucket]; 2360 spin_unlock(&ilb2->lock); 2361 ++st->bucket; 2362 return listening_get_first(seq); 2363 } 2364 2365 static void *listening_get_idx(struct seq_file *seq, loff_t *pos) 2366 { 2367 struct tcp_iter_state *st = seq->private; 2368 void *rc; 2369 2370 st->bucket = 0; 2371 st->offset = 0; 2372 rc = listening_get_first(seq); 2373 2374 while (rc && *pos) { 2375 rc = listening_get_next(seq, rc); 2376 --*pos; 2377 } 2378 return rc; 2379 } 2380 2381 static inline bool empty_bucket(struct inet_hashinfo *hinfo, 2382 const struct tcp_iter_state *st) 2383 { 2384 return hlist_nulls_empty(&hinfo->ehash[st->bucket].chain); 2385 } 2386 2387 /* 2388 * Get first established socket starting from bucket given in st->bucket. 2389 * If st->bucket is zero, the very first socket in the hash is returned. 2390 */ 2391 static void *established_get_first(struct seq_file *seq) 2392 { 2393 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2394 struct tcp_iter_state *st = seq->private; 2395 2396 st->offset = 0; 2397 for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) { 2398 struct sock *sk; 2399 struct hlist_nulls_node *node; 2400 spinlock_t *lock = inet_ehash_lockp(hinfo, st->bucket); 2401 2402 /* Lockless fast path for the common case of empty buckets */ 2403 if (empty_bucket(hinfo, st)) 2404 continue; 2405 2406 spin_lock_bh(lock); 2407 sk_nulls_for_each(sk, node, &hinfo->ehash[st->bucket].chain) { 2408 if (seq_sk_match(seq, sk)) 2409 return sk; 2410 } 2411 spin_unlock_bh(lock); 2412 } 2413 2414 return NULL; 2415 } 2416 2417 static void *established_get_next(struct seq_file *seq, void *cur) 2418 { 2419 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2420 struct tcp_iter_state *st = seq->private; 2421 struct hlist_nulls_node *node; 2422 struct sock *sk = cur; 2423 2424 ++st->num; 2425 ++st->offset; 2426 2427 sk = sk_nulls_next(sk); 2428 2429 sk_nulls_for_each_from(sk, node) { 2430 if (seq_sk_match(seq, sk)) 2431 return sk; 2432 } 2433 2434 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket)); 2435 ++st->bucket; 2436 return established_get_first(seq); 2437 } 2438 2439 static void *established_get_idx(struct seq_file *seq, loff_t pos) 2440 { 2441 struct tcp_iter_state *st = seq->private; 2442 void *rc; 2443 2444 st->bucket = 0; 2445 rc = established_get_first(seq); 2446 2447 while (rc && pos) { 2448 rc = established_get_next(seq, rc); 2449 --pos; 2450 } 2451 return rc; 2452 } 2453 2454 static void *tcp_get_idx(struct seq_file *seq, loff_t pos) 2455 { 2456 void *rc; 2457 struct tcp_iter_state *st = seq->private; 2458 2459 st->state = TCP_SEQ_STATE_LISTENING; 2460 rc = listening_get_idx(seq, &pos); 2461 2462 if (!rc) { 2463 st->state = TCP_SEQ_STATE_ESTABLISHED; 2464 rc = established_get_idx(seq, pos); 2465 } 2466 2467 return rc; 2468 } 2469 2470 static void *tcp_seek_last_pos(struct seq_file *seq) 2471 { 2472 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2473 struct tcp_iter_state *st = 
seq->private; 2474 int bucket = st->bucket; 2475 int offset = st->offset; 2476 int orig_num = st->num; 2477 void *rc = NULL; 2478 2479 switch (st->state) { 2480 case TCP_SEQ_STATE_LISTENING: 2481 if (st->bucket > hinfo->lhash2_mask) 2482 break; 2483 rc = listening_get_first(seq); 2484 while (offset-- && rc && bucket == st->bucket) 2485 rc = listening_get_next(seq, rc); 2486 if (rc) 2487 break; 2488 st->bucket = 0; 2489 st->state = TCP_SEQ_STATE_ESTABLISHED; 2490 fallthrough; 2491 case TCP_SEQ_STATE_ESTABLISHED: 2492 if (st->bucket > hinfo->ehash_mask) 2493 break; 2494 rc = established_get_first(seq); 2495 while (offset-- && rc && bucket == st->bucket) 2496 rc = established_get_next(seq, rc); 2497 } 2498 2499 st->num = orig_num; 2500 2501 return rc; 2502 } 2503 2504 void *tcp_seq_start(struct seq_file *seq, loff_t *pos) 2505 { 2506 struct tcp_iter_state *st = seq->private; 2507 void *rc; 2508 2509 if (*pos && *pos == st->last_pos) { 2510 rc = tcp_seek_last_pos(seq); 2511 if (rc) 2512 goto out; 2513 } 2514 2515 st->state = TCP_SEQ_STATE_LISTENING; 2516 st->num = 0; 2517 st->bucket = 0; 2518 st->offset = 0; 2519 rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; 2520 2521 out: 2522 st->last_pos = *pos; 2523 return rc; 2524 } 2525 EXPORT_SYMBOL(tcp_seq_start); 2526 2527 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) 2528 { 2529 struct tcp_iter_state *st = seq->private; 2530 void *rc = NULL; 2531 2532 if (v == SEQ_START_TOKEN) { 2533 rc = tcp_get_idx(seq, 0); 2534 goto out; 2535 } 2536 2537 switch (st->state) { 2538 case TCP_SEQ_STATE_LISTENING: 2539 rc = listening_get_next(seq, v); 2540 if (!rc) { 2541 st->state = TCP_SEQ_STATE_ESTABLISHED; 2542 st->bucket = 0; 2543 st->offset = 0; 2544 rc = established_get_first(seq); 2545 } 2546 break; 2547 case TCP_SEQ_STATE_ESTABLISHED: 2548 rc = established_get_next(seq, v); 2549 break; 2550 } 2551 out: 2552 ++*pos; 2553 st->last_pos = *pos; 2554 return rc; 2555 } 2556 EXPORT_SYMBOL(tcp_seq_next); 2557 2558 void tcp_seq_stop(struct seq_file *seq, void *v) 2559 { 2560 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2561 struct tcp_iter_state *st = seq->private; 2562 2563 switch (st->state) { 2564 case TCP_SEQ_STATE_LISTENING: 2565 if (v != SEQ_START_TOKEN) 2566 spin_unlock(&hinfo->lhash2[st->bucket].lock); 2567 break; 2568 case TCP_SEQ_STATE_ESTABLISHED: 2569 if (v) 2570 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket)); 2571 break; 2572 } 2573 } 2574 EXPORT_SYMBOL(tcp_seq_stop); 2575 2576 static void get_openreq4(const struct request_sock *req, 2577 struct seq_file *f, int i) 2578 { 2579 const struct inet_request_sock *ireq = inet_rsk(req); 2580 long delta = req->rsk_timer.expires - jiffies; 2581 2582 seq_printf(f, "%4d: %08X:%04X %08X:%04X" 2583 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK", 2584 i, 2585 ireq->ir_loc_addr, 2586 ireq->ir_num, 2587 ireq->ir_rmt_addr, 2588 ntohs(ireq->ir_rmt_port), 2589 TCP_SYN_RECV, 2590 0, 0, /* could print option size, but that is af dependent. 
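* (The two zeros above stand in for the tx_queue and rx_queue fields of
* the /proc/net/tcp line; neither is tracked for an open request.)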
*/ 2591 1, /* timers active (only the expire timer) */ 2592 jiffies_delta_to_clock_t(delta), 2593 req->num_timeout, 2594 from_kuid_munged(seq_user_ns(f), 2595 sock_i_uid(req->rsk_listener)), 2596 0, /* non standard timer */ 2597 0, /* open_requests have no inode */ 2598 0, 2599 req); 2600 } 2601 2602 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) 2603 { 2604 int timer_active; 2605 unsigned long timer_expires; 2606 const struct tcp_sock *tp = tcp_sk(sk); 2607 const struct inet_connection_sock *icsk = inet_csk(sk); 2608 const struct inet_sock *inet = inet_sk(sk); 2609 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq; 2610 __be32 dest = inet->inet_daddr; 2611 __be32 src = inet->inet_rcv_saddr; 2612 __u16 destp = ntohs(inet->inet_dport); 2613 __u16 srcp = ntohs(inet->inet_sport); 2614 int rx_queue; 2615 int state; 2616 2617 if (icsk->icsk_pending == ICSK_TIME_RETRANS || 2618 icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT || 2619 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { 2620 timer_active = 1; 2621 timer_expires = icsk->icsk_timeout; 2622 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) { 2623 timer_active = 4; 2624 timer_expires = icsk->icsk_timeout; 2625 } else if (timer_pending(&sk->sk_timer)) { 2626 timer_active = 2; 2627 timer_expires = sk->sk_timer.expires; 2628 } else { 2629 timer_active = 0; 2630 timer_expires = jiffies; 2631 } 2632 2633 state = inet_sk_state_load(sk); 2634 if (state == TCP_LISTEN) 2635 rx_queue = READ_ONCE(sk->sk_ack_backlog); 2636 else 2637 /* Because we don't lock the socket, 2638 * we might find a transient negative value. 2639 */ 2640 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) - 2641 READ_ONCE(tp->copied_seq), 0); 2642 2643 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX " 2644 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d", 2645 i, src, srcp, dest, destp, state, 2646 READ_ONCE(tp->write_seq) - tp->snd_una, 2647 rx_queue, 2648 timer_active, 2649 jiffies_delta_to_clock_t(timer_expires - jiffies), 2650 icsk->icsk_retransmits, 2651 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)), 2652 icsk->icsk_probes_out, 2653 sock_i_ino(sk), 2654 refcount_read(&sk->sk_refcnt), sk, 2655 jiffies_to_clock_t(icsk->icsk_rto), 2656 jiffies_to_clock_t(icsk->icsk_ack.ato), 2657 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk), 2658 tcp_snd_cwnd(tp), 2659 state == TCP_LISTEN ? 2660 fastopenq->max_qlen : 2661 (tcp_in_initial_slowstart(tp) ? 
-1 : tp->snd_ssthresh)); 2662 } 2663 2664 static void get_timewait4_sock(const struct inet_timewait_sock *tw, 2665 struct seq_file *f, int i) 2666 { 2667 long delta = tw->tw_timer.expires - jiffies; 2668 __be32 dest, src; 2669 __u16 destp, srcp; 2670 2671 dest = tw->tw_daddr; 2672 src = tw->tw_rcv_saddr; 2673 destp = ntohs(tw->tw_dport); 2674 srcp = ntohs(tw->tw_sport); 2675 2676 seq_printf(f, "%4d: %08X:%04X %08X:%04X" 2677 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK", 2678 i, src, srcp, dest, destp, tw->tw_substate, 0, 0, 2679 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0, 2680 refcount_read(&tw->tw_refcnt), tw); 2681 } 2682 2683 #define TMPSZ 150 2684 2685 static int tcp4_seq_show(struct seq_file *seq, void *v) 2686 { 2687 struct tcp_iter_state *st; 2688 struct sock *sk = v; 2689 2690 seq_setwidth(seq, TMPSZ - 1); 2691 if (v == SEQ_START_TOKEN) { 2692 seq_puts(seq, " sl local_address rem_address st tx_queue " 2693 "rx_queue tr tm->when retrnsmt uid timeout " 2694 "inode"); 2695 goto out; 2696 } 2697 st = seq->private; 2698 2699 if (sk->sk_state == TCP_TIME_WAIT) 2700 get_timewait4_sock(v, seq, st->num); 2701 else if (sk->sk_state == TCP_NEW_SYN_RECV) 2702 get_openreq4(v, seq, st->num); 2703 else 2704 get_tcp4_sock(v, seq, st->num); 2705 out: 2706 seq_pad(seq, '\n'); 2707 return 0; 2708 } 2709 2710 #ifdef CONFIG_BPF_SYSCALL 2711 struct bpf_tcp_iter_state { 2712 struct tcp_iter_state state; 2713 unsigned int cur_sk; 2714 unsigned int end_sk; 2715 unsigned int max_sk; 2716 struct sock **batch; 2717 bool st_bucket_done; 2718 }; 2719 2720 struct bpf_iter__tcp { 2721 __bpf_md_ptr(struct bpf_iter_meta *, meta); 2722 __bpf_md_ptr(struct sock_common *, sk_common); 2723 uid_t uid __aligned(8); 2724 }; 2725 2726 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, 2727 struct sock_common *sk_common, uid_t uid) 2728 { 2729 struct bpf_iter__tcp ctx; 2730 2731 meta->seq_num--; /* skip SEQ_START_TOKEN */ 2732 ctx.meta = meta; 2733 ctx.sk_common = sk_common; 2734 ctx.uid = uid; 2735 return bpf_iter_run_prog(prog, &ctx); 2736 } 2737 2738 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter) 2739 { 2740 while (iter->cur_sk < iter->end_sk) 2741 sock_put(iter->batch[iter->cur_sk++]); 2742 } 2743 2744 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter, 2745 unsigned int new_batch_sz) 2746 { 2747 struct sock **new_batch; 2748 2749 new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz, 2750 GFP_USER | __GFP_NOWARN); 2751 if (!new_batch) 2752 return -ENOMEM; 2753 2754 bpf_iter_tcp_put_batch(iter); 2755 kvfree(iter->batch); 2756 iter->batch = new_batch; 2757 iter->max_sk = new_batch_sz; 2758 2759 return 0; 2760 } 2761 2762 static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq, 2763 struct sock *start_sk) 2764 { 2765 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2766 struct bpf_tcp_iter_state *iter = seq->private; 2767 struct tcp_iter_state *st = &iter->state; 2768 struct hlist_nulls_node *node; 2769 unsigned int expected = 1; 2770 struct sock *sk; 2771 2772 sock_hold(start_sk); 2773 iter->batch[iter->end_sk++] = start_sk; 2774 2775 sk = sk_nulls_next(start_sk); 2776 sk_nulls_for_each_from(sk, node) { 2777 if (seq_sk_match(seq, sk)) { 2778 if (iter->end_sk < iter->max_sk) { 2779 sock_hold(sk); 2780 iter->batch[iter->end_sk++] = sk; 2781 } 2782 expected++; 2783 } 2784 } 2785 spin_unlock(&hinfo->lhash2[st->bucket].lock); 2786 2787 return expected; 2788 } 2789 2790 static unsigned int 
bpf_iter_tcp_established_batch(struct seq_file *seq, 2791 struct sock *start_sk) 2792 { 2793 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2794 struct bpf_tcp_iter_state *iter = seq->private; 2795 struct tcp_iter_state *st = &iter->state; 2796 struct hlist_nulls_node *node; 2797 unsigned int expected = 1; 2798 struct sock *sk; 2799 2800 sock_hold(start_sk); 2801 iter->batch[iter->end_sk++] = start_sk; 2802 2803 sk = sk_nulls_next(start_sk); 2804 sk_nulls_for_each_from(sk, node) { 2805 if (seq_sk_match(seq, sk)) { 2806 if (iter->end_sk < iter->max_sk) { 2807 sock_hold(sk); 2808 iter->batch[iter->end_sk++] = sk; 2809 } 2810 expected++; 2811 } 2812 } 2813 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket)); 2814 2815 return expected; 2816 } 2817 2818 static struct sock *bpf_iter_tcp_batch(struct seq_file *seq) 2819 { 2820 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2821 struct bpf_tcp_iter_state *iter = seq->private; 2822 struct tcp_iter_state *st = &iter->state; 2823 unsigned int expected; 2824 bool resized = false; 2825 struct sock *sk; 2826 2827 /* The st->bucket is done. Directly advance to the next 2828 * bucket instead of having the tcp_seek_last_pos() to skip 2829 * one by one in the current bucket and eventually find out 2830 * it has to advance to the next bucket. 2831 */ 2832 if (iter->st_bucket_done) { 2833 st->offset = 0; 2834 st->bucket++; 2835 if (st->state == TCP_SEQ_STATE_LISTENING && 2836 st->bucket > hinfo->lhash2_mask) { 2837 st->state = TCP_SEQ_STATE_ESTABLISHED; 2838 st->bucket = 0; 2839 } 2840 } 2841 2842 again: 2843 /* Get a new batch */ 2844 iter->cur_sk = 0; 2845 iter->end_sk = 0; 2846 iter->st_bucket_done = false; 2847 2848 sk = tcp_seek_last_pos(seq); 2849 if (!sk) 2850 return NULL; /* Done */ 2851 2852 if (st->state == TCP_SEQ_STATE_LISTENING) 2853 expected = bpf_iter_tcp_listening_batch(seq, sk); 2854 else 2855 expected = bpf_iter_tcp_established_batch(seq, sk); 2856 2857 if (iter->end_sk == expected) { 2858 iter->st_bucket_done = true; 2859 return sk; 2860 } 2861 2862 if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) { 2863 resized = true; 2864 goto again; 2865 } 2866 2867 return sk; 2868 } 2869 2870 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos) 2871 { 2872 /* bpf iter does not support lseek, so it always 2873 * continue from where it was stop()-ped. 2874 */ 2875 if (*pos) 2876 return bpf_iter_tcp_batch(seq); 2877 2878 return SEQ_START_TOKEN; 2879 } 2880 2881 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) 2882 { 2883 struct bpf_tcp_iter_state *iter = seq->private; 2884 struct tcp_iter_state *st = &iter->state; 2885 struct sock *sk; 2886 2887 /* Whenever seq_next() is called, the iter->cur_sk is 2888 * done with seq_show(), so advance to the next sk in 2889 * the batch. 2890 */ 2891 if (iter->cur_sk < iter->end_sk) { 2892 /* Keeping st->num consistent in tcp_iter_state. 2893 * bpf_iter_tcp does not use st->num. 2894 * meta.seq_num is used instead. 2895 */ 2896 st->num++; 2897 /* Move st->offset to the next sk in the bucket such that 2898 * the future start() will resume at st->offset in 2899 * st->bucket. See tcp_seek_last_pos(). 2900 */ 2901 st->offset++; 2902 sock_put(iter->batch[iter->cur_sk++]); 2903 } 2904 2905 if (iter->cur_sk < iter->end_sk) 2906 sk = iter->batch[iter->cur_sk]; 2907 else 2908 sk = bpf_iter_tcp_batch(seq); 2909 2910 ++*pos; 2911 /* Keeping st->last_pos consistent in tcp_iter_state. 
2912 * bpf iter does not do lseek, so st->last_pos always equals to *pos. 2913 */ 2914 st->last_pos = *pos; 2915 return sk; 2916 } 2917 2918 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v) 2919 { 2920 struct bpf_iter_meta meta; 2921 struct bpf_prog *prog; 2922 struct sock *sk = v; 2923 bool slow; 2924 uid_t uid; 2925 int ret; 2926 2927 if (v == SEQ_START_TOKEN) 2928 return 0; 2929 2930 if (sk_fullsock(sk)) 2931 slow = lock_sock_fast(sk); 2932 2933 if (unlikely(sk_unhashed(sk))) { 2934 ret = SEQ_SKIP; 2935 goto unlock; 2936 } 2937 2938 if (sk->sk_state == TCP_TIME_WAIT) { 2939 uid = 0; 2940 } else if (sk->sk_state == TCP_NEW_SYN_RECV) { 2941 const struct request_sock *req = v; 2942 2943 uid = from_kuid_munged(seq_user_ns(seq), 2944 sock_i_uid(req->rsk_listener)); 2945 } else { 2946 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)); 2947 } 2948 2949 meta.seq = seq; 2950 prog = bpf_iter_get_info(&meta, false); 2951 ret = tcp_prog_seq_show(prog, &meta, v, uid); 2952 2953 unlock: 2954 if (sk_fullsock(sk)) 2955 unlock_sock_fast(sk, slow); 2956 return ret; 2957 2958 } 2959 2960 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v) 2961 { 2962 struct bpf_tcp_iter_state *iter = seq->private; 2963 struct bpf_iter_meta meta; 2964 struct bpf_prog *prog; 2965 2966 if (!v) { 2967 meta.seq = seq; 2968 prog = bpf_iter_get_info(&meta, true); 2969 if (prog) 2970 (void)tcp_prog_seq_show(prog, &meta, v, 0); 2971 } 2972 2973 if (iter->cur_sk < iter->end_sk) { 2974 bpf_iter_tcp_put_batch(iter); 2975 iter->st_bucket_done = false; 2976 } 2977 } 2978 2979 static const struct seq_operations bpf_iter_tcp_seq_ops = { 2980 .show = bpf_iter_tcp_seq_show, 2981 .start = bpf_iter_tcp_seq_start, 2982 .next = bpf_iter_tcp_seq_next, 2983 .stop = bpf_iter_tcp_seq_stop, 2984 }; 2985 #endif 2986 static unsigned short seq_file_family(const struct seq_file *seq) 2987 { 2988 const struct tcp_seq_afinfo *afinfo; 2989 2990 #ifdef CONFIG_BPF_SYSCALL 2991 /* Iterated from bpf_iter. Let the bpf prog to filter instead. */ 2992 if (seq->op == &bpf_iter_tcp_seq_ops) 2993 return AF_UNSPEC; 2994 #endif 2995 2996 /* Iterated from proc fs */ 2997 afinfo = pde_data(file_inode(seq->file)); 2998 return afinfo->family; 2999 } 3000 3001 static const struct seq_operations tcp4_seq_ops = { 3002 .show = tcp4_seq_show, 3003 .start = tcp_seq_start, 3004 .next = tcp_seq_next, 3005 .stop = tcp_seq_stop, 3006 }; 3007 3008 static struct tcp_seq_afinfo tcp4_seq_afinfo = { 3009 .family = AF_INET, 3010 }; 3011 3012 static int __net_init tcp4_proc_init_net(struct net *net) 3013 { 3014 if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops, 3015 sizeof(struct tcp_iter_state), &tcp4_seq_afinfo)) 3016 return -ENOMEM; 3017 return 0; 3018 } 3019 3020 static void __net_exit tcp4_proc_exit_net(struct net *net) 3021 { 3022 remove_proc_entry("tcp", net->proc_net); 3023 } 3024 3025 static struct pernet_operations tcp4_net_ops = { 3026 .init = tcp4_proc_init_net, 3027 .exit = tcp4_proc_exit_net, 3028 }; 3029 3030 int __init tcp4_proc_init(void) 3031 { 3032 return register_pernet_subsys(&tcp4_net_ops); 3033 } 3034 3035 void tcp4_proc_exit(void) 3036 { 3037 unregister_pernet_subsys(&tcp4_net_ops); 3038 } 3039 #endif /* CONFIG_PROC_FS */ 3040 3041 /* @wake is one when sk_stream_write_space() calls us. 3042 * This sends EPOLLOUT only if notsent_bytes is half the limit. 3043 * This mimics the strategy used in sock_def_write_space(). 
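* As a worked example with purely illustrative numbers: if
* tcp_notsent_lowat(tp) == 131072 and @wake == 1, then
*
*	(notsent_bytes << 1) < 131072
*
* holds only while write_seq - snd_nxt stays below 65536, i.e. the wakeup
* path demands twice as much headroom as the plain writability check.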
3044 */ 3045 bool tcp_stream_memory_free(const struct sock *sk, int wake) 3046 { 3047 const struct tcp_sock *tp = tcp_sk(sk); 3048 u32 notsent_bytes = READ_ONCE(tp->write_seq) - 3049 READ_ONCE(tp->snd_nxt); 3050 3051 return (notsent_bytes << wake) < tcp_notsent_lowat(tp); 3052 } 3053 EXPORT_SYMBOL(tcp_stream_memory_free); 3054 3055 struct proto tcp_prot = { 3056 .name = "TCP", 3057 .owner = THIS_MODULE, 3058 .close = tcp_close, 3059 .pre_connect = tcp_v4_pre_connect, 3060 .connect = tcp_v4_connect, 3061 .disconnect = tcp_disconnect, 3062 .accept = inet_csk_accept, 3063 .ioctl = tcp_ioctl, 3064 .init = tcp_v4_init_sock, 3065 .destroy = tcp_v4_destroy_sock, 3066 .shutdown = tcp_shutdown, 3067 .setsockopt = tcp_setsockopt, 3068 .getsockopt = tcp_getsockopt, 3069 .bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt, 3070 .keepalive = tcp_set_keepalive, 3071 .recvmsg = tcp_recvmsg, 3072 .sendmsg = tcp_sendmsg, 3073 .sendpage = tcp_sendpage, 3074 .backlog_rcv = tcp_v4_do_rcv, 3075 .release_cb = tcp_release_cb, 3076 .hash = inet_hash, 3077 .unhash = inet_unhash, 3078 .get_port = inet_csk_get_port, 3079 .put_port = inet_put_port, 3080 #ifdef CONFIG_BPF_SYSCALL 3081 .psock_update_sk_prot = tcp_bpf_update_proto, 3082 #endif 3083 .enter_memory_pressure = tcp_enter_memory_pressure, 3084 .leave_memory_pressure = tcp_leave_memory_pressure, 3085 .stream_memory_free = tcp_stream_memory_free, 3086 .sockets_allocated = &tcp_sockets_allocated, 3087 .orphan_count = &tcp_orphan_count, 3088 3089 .memory_allocated = &tcp_memory_allocated, 3090 .per_cpu_fw_alloc = &tcp_memory_per_cpu_fw_alloc, 3091 3092 .memory_pressure = &tcp_memory_pressure, 3093 .sysctl_mem = sysctl_tcp_mem, 3094 .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem), 3095 .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), 3096 .max_header = MAX_TCP_HEADER, 3097 .obj_size = sizeof(struct tcp_sock), 3098 .slab_flags = SLAB_TYPESAFE_BY_RCU, 3099 .twsk_prot = &tcp_timewait_sock_ops, 3100 .rsk_prot = &tcp_request_sock_ops, 3101 .h.hashinfo = NULL, 3102 .no_autobind = true, 3103 .diag_destroy = tcp_abort, 3104 }; 3105 EXPORT_SYMBOL(tcp_prot); 3106 3107 static void __net_exit tcp_sk_exit(struct net *net) 3108 { 3109 if (net->ipv4.tcp_congestion_control) 3110 bpf_module_put(net->ipv4.tcp_congestion_control, 3111 net->ipv4.tcp_congestion_control->owner); 3112 } 3113 3114 static void __net_init tcp_set_hashinfo(struct net *net) 3115 { 3116 struct inet_hashinfo *hinfo; 3117 unsigned int ehash_entries; 3118 struct net *old_net; 3119 3120 if (net_eq(net, &init_net)) 3121 goto fallback; 3122 3123 old_net = current->nsproxy->net_ns; 3124 ehash_entries = READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries); 3125 if (!ehash_entries) 3126 goto fallback; 3127 3128 ehash_entries = roundup_pow_of_two(ehash_entries); 3129 hinfo = inet_pernet_hashinfo_alloc(&tcp_hashinfo, ehash_entries); 3130 if (!hinfo) { 3131 pr_warn("Failed to allocate TCP ehash (entries: %u) " 3132 "for a netns, fallback to the global one\n", 3133 ehash_entries); 3134 fallback: 3135 hinfo = &tcp_hashinfo; 3136 ehash_entries = tcp_hashinfo.ehash_mask + 1; 3137 } 3138 3139 net->ipv4.tcp_death_row.hashinfo = hinfo; 3140 net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2; 3141 net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128); 3142 } 3143 3144 static int __net_init tcp_sk_init(struct net *net) 3145 { 3146 net->ipv4.sysctl_tcp_ecn = 2; 3147 net->ipv4.sysctl_tcp_ecn_fallback = 1; 3148 3149 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS; 3150 
net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS; 3151 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD; 3152 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL; 3153 net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS; 3154 3155 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME; 3156 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES; 3157 net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL; 3158 3159 net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES; 3160 net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES; 3161 net->ipv4.sysctl_tcp_syncookies = 1; 3162 net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH; 3163 net->ipv4.sysctl_tcp_retries1 = TCP_RETR1; 3164 net->ipv4.sysctl_tcp_retries2 = TCP_RETR2; 3165 net->ipv4.sysctl_tcp_orphan_retries = 0; 3166 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT; 3167 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX; 3168 net->ipv4.sysctl_tcp_tw_reuse = 2; 3169 net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1; 3170 3171 refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1); 3172 tcp_set_hashinfo(net); 3173 3174 net->ipv4.sysctl_tcp_sack = 1; 3175 net->ipv4.sysctl_tcp_window_scaling = 1; 3176 net->ipv4.sysctl_tcp_timestamps = 1; 3177 net->ipv4.sysctl_tcp_early_retrans = 3; 3178 net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION; 3179 net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */ 3180 net->ipv4.sysctl_tcp_retrans_collapse = 1; 3181 net->ipv4.sysctl_tcp_max_reordering = 300; 3182 net->ipv4.sysctl_tcp_dsack = 1; 3183 net->ipv4.sysctl_tcp_app_win = 31; 3184 net->ipv4.sysctl_tcp_adv_win_scale = 1; 3185 net->ipv4.sysctl_tcp_frto = 2; 3186 net->ipv4.sysctl_tcp_moderate_rcvbuf = 1; 3187 /* This limits the percentage of the congestion window which we 3188 * will allow a single TSO frame to consume. Building TSO frames 3189 * which are too large can cause TCP streams to be bursty. 3190 */ 3191 net->ipv4.sysctl_tcp_tso_win_divisor = 3; 3192 /* Default TSQ limit of 16 TSO segments */ 3193 net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536; 3194 3195 /* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. 
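* (INT_MAX here effectively means "no limit"; an administrator who wants
* rate limiting back can still write a finite value to the
* net.ipv4.tcp_challenge_ack_limit sysctl, which this field backs.)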
*/ 3196 net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX; 3197 3198 net->ipv4.sysctl_tcp_min_tso_segs = 2; 3199 net->ipv4.sysctl_tcp_tso_rtt_log = 9; /* 2^9 = 512 usec */ 3200 net->ipv4.sysctl_tcp_min_rtt_wlen = 300; 3201 net->ipv4.sysctl_tcp_autocorking = 1; 3202 net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2; 3203 net->ipv4.sysctl_tcp_pacing_ss_ratio = 200; 3204 net->ipv4.sysctl_tcp_pacing_ca_ratio = 120; 3205 if (net != &init_net) { 3206 memcpy(net->ipv4.sysctl_tcp_rmem, 3207 init_net.ipv4.sysctl_tcp_rmem, 3208 sizeof(init_net.ipv4.sysctl_tcp_rmem)); 3209 memcpy(net->ipv4.sysctl_tcp_wmem, 3210 init_net.ipv4.sysctl_tcp_wmem, 3211 sizeof(init_net.ipv4.sysctl_tcp_wmem)); 3212 } 3213 net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC; 3214 net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC; 3215 net->ipv4.sysctl_tcp_comp_sack_nr = 44; 3216 net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; 3217 net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0; 3218 atomic_set(&net->ipv4.tfo_active_disable_times, 0); 3219 3220 /* Set default values for PLB */ 3221 net->ipv4.sysctl_tcp_plb_enabled = 0; /* Disabled by default */ 3222 net->ipv4.sysctl_tcp_plb_idle_rehash_rounds = 3; 3223 net->ipv4.sysctl_tcp_plb_rehash_rounds = 12; 3224 net->ipv4.sysctl_tcp_plb_suspend_rto_sec = 60; 3225 /* Default congestion threshold for PLB to mark a round is 50% */ 3226 net->ipv4.sysctl_tcp_plb_cong_thresh = (1 << TCP_PLB_SCALE) / 2; 3227 3228 /* Reno is always built in */ 3229 if (!net_eq(net, &init_net) && 3230 bpf_try_module_get(init_net.ipv4.tcp_congestion_control, 3231 init_net.ipv4.tcp_congestion_control->owner)) 3232 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control; 3233 else 3234 net->ipv4.tcp_congestion_control = &tcp_reno; 3235 3236 return 0; 3237 } 3238 3239 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list) 3240 { 3241 struct net *net; 3242 3243 tcp_twsk_purge(net_exit_list, AF_INET); 3244 3245 list_for_each_entry(net, net_exit_list, exit_list) { 3246 inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo); 3247 WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount)); 3248 tcp_fastopen_ctx_destroy(net); 3249 } 3250 } 3251 3252 static struct pernet_operations __net_initdata tcp_sk_ops = { 3253 .init = tcp_sk_init, 3254 .exit = tcp_sk_exit, 3255 .exit_batch = tcp_sk_exit_batch, 3256 }; 3257 3258 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 3259 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta, 3260 struct sock_common *sk_common, uid_t uid) 3261 3262 #define INIT_BATCH_SZ 16 3263 3264 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux) 3265 { 3266 struct bpf_tcp_iter_state *iter = priv_data; 3267 int err; 3268 3269 err = bpf_iter_init_seq_net(priv_data, aux); 3270 if (err) 3271 return err; 3272 3273 err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ); 3274 if (err) { 3275 bpf_iter_fini_seq_net(priv_data); 3276 return err; 3277 } 3278 3279 return 0; 3280 } 3281 3282 static void bpf_iter_fini_tcp(void *priv_data) 3283 { 3284 struct bpf_tcp_iter_state *iter = priv_data; 3285 3286 bpf_iter_fini_seq_net(priv_data); 3287 kvfree(iter->batch); 3288 } 3289 3290 static const struct bpf_iter_seq_info tcp_seq_info = { 3291 .seq_ops = &bpf_iter_tcp_seq_ops, 3292 .init_seq_private = bpf_iter_init_tcp, 3293 .fini_seq_private = bpf_iter_fini_tcp, 3294 .seq_priv_size = sizeof(struct bpf_tcp_iter_state), 3295 }; 3296 3297 static const struct bpf_func_proto * 3298 bpf_iter_tcp_get_func_proto(enum 
bpf_func_id func_id, 3299 const struct bpf_prog *prog) 3300 { 3301 switch (func_id) { 3302 case BPF_FUNC_setsockopt: 3303 return &bpf_sk_setsockopt_proto; 3304 case BPF_FUNC_getsockopt: 3305 return &bpf_sk_getsockopt_proto; 3306 default: 3307 return NULL; 3308 } 3309 } 3310 3311 static struct bpf_iter_reg tcp_reg_info = { 3312 .target = "tcp", 3313 .ctx_arg_info_size = 1, 3314 .ctx_arg_info = { 3315 { offsetof(struct bpf_iter__tcp, sk_common), 3316 PTR_TO_BTF_ID_OR_NULL }, 3317 }, 3318 .get_func_proto = bpf_iter_tcp_get_func_proto, 3319 .seq_info = &tcp_seq_info, 3320 }; 3321 3322 static void __init bpf_iter_register(void) 3323 { 3324 tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON]; 3325 if (bpf_iter_reg_target(&tcp_reg_info)) 3326 pr_warn("Warning: could not register bpf iterator tcp\n"); 3327 } 3328 3329 #endif 3330 3331 void __init tcp_v4_init(void) 3332 { 3333 int cpu, res; 3334 3335 for_each_possible_cpu(cpu) { 3336 struct sock *sk; 3337 3338 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW, 3339 IPPROTO_TCP, &init_net); 3340 if (res) 3341 panic("Failed to create the TCP control socket.\n"); 3342 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); 3343 3344 /* Please enforce IP_DF and IPID==0 for RST and 3345 * ACK sent in SYN-RECV and TIME-WAIT state. 3346 */ 3347 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO; 3348 3349 per_cpu(ipv4_tcp_sk, cpu) = sk; 3350 } 3351 if (register_pernet_subsys(&tcp_sk_ops)) 3352 panic("Failed to create the TCP control socket.\n"); 3353 3354 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 3355 bpf_iter_register(); 3356 #endif 3357 } 3358
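/* For reference, a minimal consumer of the "tcp" bpf_iter target registered
 * above might look like the sketch below.  This is an illustrative example
 * only (the program name and output format are hypothetical, and it assumes
 * the usual libbpf environment with vmlinux.h and the libbpf helper/tracing
 * headers), not code that belongs in this file:
 *
 *	SEC("iter/tcp")
 *	int dump_tcp(struct bpf_iter__tcp *ctx)
 *	{
 *		struct sock_common *skc = ctx->sk_common;
 *		struct seq_file *seq = ctx->meta->seq;
 *
 *		if (!skc)
 *			return 0;
 *		BPF_SEQ_PRINTF(seq, "family=%d uid=%u\n",
 *			       skc->skc_family, ctx->uid);
 *		return 0;
 *	}
 *
 * Once loaded, such a program is typically pinned (e.g. with
 * "bpftool iter pin") and then read like any other seq_file.
 */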