// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 *		IPv4 specific functions
 *
 *		code split from:
 *		linux/ipv4/tcp.c
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *		See tcp.c for author information
 */

/*
 * Changes:
 *	David S. Miller		:	New socket lookup architecture.
 *					This code is dedicated to John Dyson.
 *	David S. Miller		:	Change semantics of established hash,
 *					half is devoted to TIME_WAIT sockets
 *					and the rest go in the other half.
 *	Andi Kleen		:	Add support for syncookies and fixed
 *					some bugs: ip options weren't passed to
 *					the TCP layer, missed a check for an
 *					ACK bit.
 *	Andi Kleen		:	Implemented fast path mtu discovery.
 *					Fixed many serious bugs in the
 *					request_sock handling and moved
 *					most of it into the af independent code.
 *					Added tail drop and some other bugfixes.
 *					Added new listen semantics.
 *	Mike McLagan		:	Routing by source
 *	Juan Jose Ciarlante	:	ip_dynaddr bits
 *	Andi Kleen		:	various fixes.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year
 *					coma.
 *	Andi Kleen		:	Fix new listen.
 *	Andi Kleen		:	Fix accept error reporting.
 *	YOSHIFUJI Hideaki @USAGI and
 *	Alexey Kuznetsov	:	Support IPV6_V6ONLY socket option, which
 *					allow both IPv4 and IPv6 sockets to bind
 *					a single port at the same time.
 */

#define pr_fmt(fmt) "TCP: " fmt

#include <linux/bottom_half.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <linux/sched.h>

#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/secure_seq.h>
#include <net/busy_poll.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/inetdevice.h>
#include <linux/btf_ids.h>

#include <crypto/hash.h>
#include <linux/scatterlist.h>

#include <trace/events/tcp.h>

#ifdef CONFIG_TCP_MD5SIG
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
#endif

struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);

static DEFINE_PER_CPU(struct sock *, ipv4_tcp_sk);

static DEFINE_MUTEX(tcp_exit_batch_mutex);

static u32 tcp_v4_init_seq(const struct sk_buff *skb)
{
	return secure_tcp_seq(ip_hdr(skb)->daddr,
			      ip_hdr(skb)->saddr,
			      tcp_hdr(skb)->dest,
			      tcp_hdr(skb)->source);
}

static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
{
	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
}

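/* Decide whether an existing TIME-WAIT socket may be reused for a new
 * outgoing connection to the same peer.  Returns 1 when reuse is safe
 * (the new socket inherits the old sequence and timestamp state),
 * 0 otherwise.
 */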
int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
	int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
	const struct inet_timewait_sock *tw = inet_twsk(sktw);
	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
	struct tcp_sock *tp = tcp_sk(sk);

	if (tw->tw_substate == TCP_FIN_WAIT2)
		reuse = 0;

	if (reuse == 2) {
		/* Still does not detect *everything* that goes through
		 * lo, since we require a loopback src or dst address
		 * or direct binding to 'lo' interface.
		 */
		bool loopback = false;
		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
			loopback = true;
#if IS_ENABLED(CONFIG_IPV6)
		if (tw->tw_family == AF_INET6) {
			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
				loopback = true;
		} else
#endif
		{
			if (ipv4_is_loopback(tw->tw_daddr) ||
			    ipv4_is_loopback(tw->tw_rcv_saddr))
				loopback = true;
		}
		if (!loopback)
			reuse = 0;
	}

	/* With PAWS, it is safe from the viewpoint
	   of data integrity. Even without PAWS it is safe provided sequence
	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.

	   Actually, the idea is close to VJ's one, only timestamp cache is
	   held not per host, but per port pair and TW bucket is used as state
	   holder.

	   If TW bucket has been already destroyed we fall back to VJ's scheme
	   and use initial timestamp retrieved from peer table.
	 */
	if (tcptw->tw_ts_recent_stamp &&
	    (!twp || (reuse && time_after32(ktime_get_seconds(),
					    tcptw->tw_ts_recent_stamp)))) {
		/* inet_twsk_hashdance() sets sk_refcnt after putting twsk
		 * and releasing the bucket lock.
		 */
		if (unlikely(!refcount_inc_not_zero(&sktw->sk_refcnt)))
			return 0;

		/* In case of repair and re-using TIME-WAIT sockets we still
		 * want to be sure that it is safe as above but honor the
		 * sequence numbers and time stamps set as part of the repair
		 * process.
		 *
		 * Without this check re-using a TIME-WAIT socket with TCP
		 * repair would accumulate a -1 on the repair assigned
		 * sequence number. The first time it is reused the sequence
		 * is -1, the second time -2, etc. This fixes that issue
		 * without appearing to create any others.
		 */
		if (likely(!tp->repair)) {
			u32 seq = tcptw->tw_snd_nxt + 65535 + 2;

			if (!seq)
				seq = 1;
			WRITE_ONCE(tp->write_seq, seq);
			tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
			tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
		}

		return 1;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(tcp_twsk_unique);

static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
			      int addr_len)
{
	/* This check is replicated from tcp_v4_connect() and intended to
	 * prevent BPF program called below from accessing bytes that are out
	 * of the bounds specified by the user in addr_len.
	 */
	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	sock_owned_by_me(sk);

	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr, &addr_len);
}

/* This will initiate an outgoing connection. */

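/* For orientation only (not kernel code): tcp_v4_connect() below is what
 * ultimately runs when an application connects an AF_INET stream socket,
 * e.g.:
 *
 *	int fd = socket(AF_INET, SOCK_STREAM, 0);
 *	struct sockaddr_in dst = { .sin_family = AF_INET,
 *				   .sin_port = htons(80) };
 *
 *	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);
 *	connect(fd, (struct sockaddr *)&dst, sizeof(dst));
 *
 * The address and port above are illustrative values only.
 */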
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct inet_timewait_death_row *tcp_death_row;
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct ip_options_rcu *inet_opt;
	struct net *net = sock_net(sk);
	__be16 orig_sport, orig_dport;
	__be32 daddr, nexthop;
	struct flowi4 *fl4;
	struct rtable *rt;
	int err;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	inet_opt = rcu_dereference_protected(inet->inet_opt,
					     lockdep_sock_is_held(sk));
	if (inet_opt && inet_opt->opt.srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet_opt->opt.faddr;
	}

	orig_sport = inet->inet_sport;
	orig_dport = usin->sin_port;
	fl4 = &inet->cork.fl.u.ip4;
	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
			      sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
			      orig_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		if (err == -ENETUNREACH)
			IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
		return err;
	}

	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet_opt || !inet_opt->opt.srr)
		daddr = fl4->daddr;

	tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;

	if (!inet->inet_saddr) {
		err = inet_bhash2_update_saddr(sk, &fl4->saddr, AF_INET);
		if (err) {
			ip_rt_put(rt);
			return err;
		}
	} else {
		sk_rcv_saddr_set(sk, inet->inet_saddr);
	}

	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent	   = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		if (likely(!tp->repair))
			WRITE_ONCE(tp->write_seq, 0);
	}

	inet->inet_dport = usin->sin_port;
	sk_daddr_set(sk, daddr);

	inet_csk(sk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;

	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and, without releasing the socket
	 * lock, select a source port, enter ourselves into the hash tables and
	 * complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(tcp_death_row, sk);
	if (err)
		goto failure;

	sk_set_txhash(sk);

	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
			       inet->inet_sport, inet->inet_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		rt = NULL;
		goto failure;
	}
	/* OK, now commit destination to socket. */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->dst);
	rt = NULL;

	if (likely(!tp->repair)) {
		if (!tp->write_seq)
			WRITE_ONCE(tp->write_seq,
				   secure_tcp_seq(inet->inet_saddr,
						  inet->inet_daddr,
						  inet->inet_sport,
						  usin->sin_port));
		WRITE_ONCE(tp->tsoffset,
			   secure_tcp_ts_off(net, inet->inet_saddr,
					     inet->inet_daddr));
	}

	atomic_set(&inet->inet_id, get_random_u16());

	if (tcp_fastopen_defer_connect(sk, &err))
		return err;
	if (err)
		goto failure;

	err = tcp_connect(sk);

	if (err)
		goto failure;

	return 0;

failure:
	/*
	 * This unhashes the socket and releases the local port,
	 * if necessary.
	 */
	tcp_set_state(sk, TCP_CLOSE);
	inet_bhash2_reset_saddr(sk);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->inet_dport = 0;
	return err;
}
EXPORT_SYMBOL(tcp_v4_connect);

/*
 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 * It can be called through tcp_release_cb() if socket was owned by user
 * at the time tcp_v4_err() was called to handle ICMP message.
 */
void tcp_v4_mtu_reduced(struct sock *sk)
{
	struct inet_sock *inet = inet_sk(sk);
	struct dst_entry *dst;
	u32 mtu;

	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
		return;
	mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
	dst = inet_csk_update_pmtu(sk, mtu);
	if (!dst)
		return;

	/* Something is about to be wrong... Remember the soft error
	 * in case this connection is not able to recover.
	 */
	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
		WRITE_ONCE(sk->sk_err_soft, EMSGSIZE);

	mtu = dst_mtu(dst);

	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    ip_sk_accept_pmtu(sk) &&
	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
		tcp_sync_mss(sk, mtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}
EXPORT_SYMBOL(tcp_v4_mtu_reduced);

static void do_redirect(struct sk_buff *skb, struct sock *sk)
{
	struct dst_entry *dst = __sk_dst_check(sk, 0);

	if (dst)
		dst->ops->redirect(dst, sk, skb);
}


/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
void tcp_req_err(struct sock *sk, u32 seq, bool abort)
{
	struct request_sock *req = inet_reqsk(sk);
	struct net *net = sock_net(sk);

	/* ICMPs are not backlogged, hence we cannot get
	 * an established socket here.
	 */
	if (seq != tcp_rsk(req)->snt_isn) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
	} else if (abort) {
		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
		tcp_listendrop(req->rsk_listener);
	}
	reqsk_put(req);
}
EXPORT_SYMBOL(tcp_req_err);

/* TCP-LD (RFC 6069) logic */
void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;
	s32 remaining;
	u32 delta_us;

	if (sock_owned_by_user(sk))
		return;

	if (seq != tp->snd_una || !icsk->icsk_retransmits ||
	    !icsk->icsk_backoff)
		return;

	skb = tcp_rtx_queue_head(sk);
	if (WARN_ON_ONCE(!skb))
		return;

	icsk->icsk_backoff--;
	icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
	icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);

	tcp_mstamp_refresh(tp);
	delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
	remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);

	if (remaining > 0) {
		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
					  remaining, TCP_RTO_MAX);
	} else {
		/* RTO revert clocked out retransmission.
		 * Will retransmit now.
		 */
		tcp_retransmit_timer(sk);
	}
}
EXPORT_SYMBOL(tcp_ld_RTO_revert);

/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 *
 */

int tcp_v4_err(struct sk_buff *skb, u32 info)
{
	const struct iphdr *iph = (const struct iphdr *)skb->data;
	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
	struct tcp_sock *tp;
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	struct sock *sk;
	struct request_sock *fastopen;
	u32 seq, snd_una;
	int err;
	struct net *net = dev_net(skb->dev);

	sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
				       iph->daddr, th->dest, iph->saddr,
				       ntohs(th->source), inet_iif(skb), 0);
	if (!sk) {
		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
		return -ENOENT;
	}
	if (sk->sk_state == TCP_TIME_WAIT) {
		inet_twsk_put(inet_twsk(sk));
		return 0;
	}
	seq = ntohl(th->seq);
	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
				     type == ICMP_TIME_EXCEEDED ||
				     (type == ICMP_DEST_UNREACH &&
				      (code == ICMP_NET_UNREACH ||
				       code == ICMP_HOST_UNREACH)));
		return 0;
	}

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 * We do take care of PMTU discovery (RFC1191) special case :
	 * we can receive locally generated ICMP messages while socket is held.
	 */
	if (sock_owned_by_user(sk)) {
		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
	}
	if (sk->sk_state == TCP_CLOSE)
		goto out;

	if (static_branch_unlikely(&ip4_min_ttl)) {
		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
			goto out;
		}
	}

	tp = tcp_sk(sk);
	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
	fastopen = rcu_dereference(tp->fastopen_rsk);
	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, snd_una, tp->snd_nxt)) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	switch (type) {
	case ICMP_REDIRECT:
		if (!sock_owned_by_user(sk))
			do_redirect(skb, sk);
		goto out;
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			/* We are not interested in TCP_LISTEN and open_requests
			 * (SYN-ACKs sent out by Linux are always < 576 bytes so
			 * they should go through unfragmented).
			 */
			if (sk->sk_state == TCP_LISTEN)
				goto out;

			WRITE_ONCE(tp->mtu_info, info);
			if (!sock_owned_by_user(sk)) {
				tcp_v4_mtu_reduced(sk);
			} else {
				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
					sock_hold(sk);
			}
			goto out;
		}

		err = icmp_err_convert[code].errno;
		/* check if this ICMP message allows revert of backoff.
		 * (see RFC 6069)
		 */
		if (!fastopen &&
		    (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
			tcp_ld_RTO_revert(sk, seq);
		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->sk_state) {
	case TCP_SYN_SENT:
	case TCP_SYN_RECV:
		/* Only in fast or simultaneous open. If a fast open socket is
		 * already accepted it is treated as a connected one below.
		 */
		if (fastopen && !fastopen->sk)
			break;

		ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);

		if (!sock_owned_by_user(sk))
			tcp_done_with_error(sk, err);
		else
			WRITE_ONCE(sk->sk_err_soft, err);
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows to consider as hard errors
	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note that in the modern internet, where routing is unreliable
	 * and broken firewalls sit in every dark corner sending random
	 * errors ordered by their masters, even these two messages have
	 * finally lost their original sense (even Linux sends invalid
	 * PORT_UNREACHs).
	 *
	 * Now we are in compliance with RFCs.
	 * --ANK (980905)
	 */

	if (!sock_owned_by_user(sk) &&
	    inet_test_bit(RECVERR, sk)) {
		WRITE_ONCE(sk->sk_err, err);
		sk_error_report(sk);
	} else {	/* Only an error on timeout */
		WRITE_ONCE(sk->sk_err_soft, err);
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
	return 0;
}

void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
{
	struct tcphdr *th = tcp_hdr(skb);

	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
	skb->csum_start = skb_transport_header(skb) - skb->head;
	skb->csum_offset = offsetof(struct tcphdr, check);
}

/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
{
	const struct inet_sock *inet = inet_sk(sk);

	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
}
EXPORT_SYMBOL(tcp_v4_send_check);

/*
 *	This routine will send an RST to the other tcp.
 *
 *	Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
 *		      for the reset?
 *	Answer: if a packet caused RST, it is not for a socket
 *		existing in our system, if it is matched to a socket,
 *		it is just duplicate segment or bug in other side's TCP.
 *	So we build the reply based only on the parameters
 *	that arrived with the segment.
 *	Exception: precedence violation. We do not implement it in any case.
 */

#ifdef CONFIG_TCP_MD5SIG
#define OPTION_BYTES TCPOLEN_MD5SIG_ALIGNED
#else
#define OPTION_BYTES sizeof(__be32)
#endif

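/* The reset below is built purely from the incoming segment: if it carried
 * an ACK we echo that as our sequence number, otherwise we ACK everything
 * we saw (the RFC 793 reset generation rules).  An MD5 option is appended
 * only when a matching key exists for the peer address.
 */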
static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
		__be32 opt[OPTION_BYTES / sizeof(__be32)];
	} rep;
	struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key = NULL;
	const __u8 *hash_location = NULL;
	unsigned char newhash[16];
	int genhash;
	struct sock *sk1 = NULL;
#endif
	u64 transmit_time = 0;
	struct sock *ctl_sk;
	struct net *net;
	u32 txhash = 0;

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	/* If sk not NULL, it means we did a successful lookup and incoming
	 * route had to be correct. prequeue might have dropped our dst.
	 */
	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rep, 0, sizeof(rep));
	rep.th.dest   = th->source;
	rep.th.source = th->dest;
	rep.th.doff   = sizeof(struct tcphdr) / 4;
	rep.th.rst    = 1;

	if (th->ack) {
		rep.th.seq = th->ack_seq;
	} else {
		rep.th.ack = 1;
		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				       skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof(arg));
	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);

	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
#ifdef CONFIG_TCP_MD5SIG
	rcu_read_lock();
	hash_location = tcp_parse_md5sig_option(th);
	if (sk && sk_fullsock(sk)) {
		const union tcp_md5_addr *addr;
		int l3index;

		/* sdif set, means packet ingressed via a device
		 * in an L3 domain and inet_iif is set to it.
		 */
		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
	} else if (hash_location) {
		const union tcp_md5_addr *addr;
		int sdif = tcp_v4_sdif(skb);
		int dif = inet_iif(skb);
		int l3index;

		/*
		 * The active side is lost. Try to find the listening socket
		 * through the source port, and then find the md5 key through
		 * the listening socket.
		 * We do not lose security here: the incoming packet is checked
		 * against the md5 hash of the found key, and no RST is
		 * generated if the md5 hash doesn't match.
		 */
		sk1 = __inet_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo,
					     NULL, 0, ip_hdr(skb)->saddr,
					     th->source, ip_hdr(skb)->daddr,
					     ntohs(th->source), dif, sdif);
		/* don't send rst if it can't find key */
		if (!sk1)
			goto out;

		/* sdif set, means packet ingressed via a device
		 * in an L3 domain and dif is set to it.
		 */
		l3index = sdif ? dif : 0;
		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
		if (!key)
			goto out;


		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
		if (genhash || memcmp(hash_location, newhash, 16) != 0)
			goto out;

	}

	if (key) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
				   (TCPOPT_NOP << 16) |
				   (TCPOPT_MD5SIG << 8) |
				   TCPOLEN_MD5SIG);
		/* Update length and the length the header thinks exists */
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
				     key, ip_hdr(skb)->saddr,
				     ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	/* Can't co-exist with TCPMD5, hence check rep.opt[0] */
	if (rep.opt[0] == 0) {
		__be32 mrst = mptcp_reset_option(skb);

		if (mrst) {
			rep.opt[0] = mrst;
			arg.iov[0].iov_len += sizeof(mrst);
			rep.th.doff = arg.iov[0].iov_len / 4;
		}
	}

	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;

	/* When the socket is gone, all binding information is lost.
	 * Routing might fail in this case. No choice here: if we choose to
	 * force the input interface, we will misroute in case of an
	 * asymmetric route.
	 */
	if (sk) {
		arg.bound_dev_if = sk->sk_bound_dev_if;
		if (sk_fullsock(sk))
			trace_tcp_send_reset(sk, skb);
	}

	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));

	arg.tos = ip_hdr(skb)->tos;
	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
	local_bh_disable();
	ctl_sk = this_cpu_read(ipv4_tcp_sk);
	sock_net_set(ctl_sk, net);
	if (sk) {
		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
				   inet_twsk(sk)->tw_mark : sk->sk_mark;
		ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
				   inet_twsk(sk)->tw_priority : sk->sk_priority;
		transmit_time = tcp_transmit_time(sk);
		xfrm_sk_clone_policy(ctl_sk, sk);
		txhash = (sk->sk_state == TCP_TIME_WAIT) ?
			 inet_twsk(sk)->tw_txhash : sk->sk_txhash;
	} else {
		ctl_sk->sk_mark = 0;
		ctl_sk->sk_priority = 0;
	}
	ip_send_unicast_reply(ctl_sk,
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len,
			      transmit_time, txhash);

	xfrm_sk_free_policy(ctl_sk);
	sock_net_set(ctl_sk, &init_net);
	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
	local_bh_enable();

#ifdef CONFIG_TCP_MD5SIG
out:
	rcu_read_unlock();
#endif
}

/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
 * outside of socket context, is certainly ugly. What can I do?
 */

static void tcp_v4_send_ack(const struct sock *sk,
			    struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 tsval, u32 tsecr, int oif,
			    struct tcp_md5sig_key *key,
			    int reply_flags, u8 tos, u32 txhash)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
#ifdef CONFIG_TCP_MD5SIG
			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
#endif
			];
	} rep;
	struct net *net = sock_net(sk);
	struct ip_reply_arg arg;
	struct sock *ctl_sk;
	u64 transmit_time;

	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof(arg));

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);
	if (tsecr) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				   (TCPOPT_TIMESTAMP << 8) |
				   TCPOLEN_TIMESTAMP);
		rep.opt[1] = htonl(tsval);
		rep.opt[2] = htonl(tsecr);
		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
	}

	/* Swap the send and the receive. */
	rep.th.dest    = th->source;
	rep.th.source  = th->dest;
	rep.th.doff    = arg.iov[0].iov_len / 4;
	rep.th.seq     = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack     = 1;
	rep.th.window  = htons(win);

#ifdef CONFIG_TCP_MD5SIG
	if (key) {
		int offset = (tsecr) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
					  (TCPOPT_NOP << 16) |
					  (TCPOPT_MD5SIG << 8) |
					  TCPOLEN_MD5SIG);
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.flags = reply_flags;
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	if (oif)
		arg.bound_dev_if = oif;
	arg.tos = tos;
	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
	local_bh_disable();
	ctl_sk = this_cpu_read(ipv4_tcp_sk);
	sock_net_set(ctl_sk, net);
	ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
			   inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark);
	ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
			   inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
	transmit_time = tcp_transmit_time(sk);
	ip_send_unicast_reply(ctl_sk,
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len,
			      transmit_time, txhash);

	sock_net_set(ctl_sk, &init_net);
	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	local_bh_enable();
}

static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
	struct inet_timewait_sock *tw = inet_twsk(sk);
	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);

	tcp_v4_send_ack(sk, skb,
			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
			tcp_time_stamp_raw() + tcptw->tw_ts_offset,
			tcptw->tw_ts_recent,
			tw->tw_bound_dev_if,
			tcp_twsk_md5_key(tcptw),
			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
			tw->tw_tos,
			tw->tw_txhash
			);

	inet_twsk_put(tw);
}

static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req)
{
	const union tcp_md5_addr *addr;
	int l3index;

	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
	 */
	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
					     tcp_sk(sk)->snd_nxt;

	/* RFC 7323 2.3
	 * The window field (SEG.WND) of every outgoing segment, with the
	 * exception of <SYN> segments, MUST be right-shifted by
	 * Rcv.Wind.Shift bits:
	 */
	addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
	l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
	tcp_v4_send_ack(sk, skb, seq,
			tcp_rsk(req)->rcv_nxt,
			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
			tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
			READ_ONCE(req->ts_recent),
			0,
			tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
			ip_hdr(skb)->tos,
			READ_ONCE(tcp_rsk(req)->txhash));
}

/*
 * Send a SYN-ACK after having received a SYN.
 * This still operates on a request_sock only, not on a big
 * socket.
 */
static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
			      struct flowi *fl,
			      struct request_sock *req,
			      struct tcp_fastopen_cookie *foc,
			      enum tcp_synack_type synack_type,
			      struct sk_buff *syn_skb)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	struct flowi4 fl4;
	int err = -1;
	struct sk_buff *skb;
	u8 tos;

	/* First, grab a route. */
	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
		return -1;

	skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);

	if (skb) {
		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);

		tos = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) ?
				(tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
				(inet_sk(sk)->tos & INET_ECN_MASK) :
				inet_sk(sk)->tos;

		if (!INET_ECN_is_capable(tos) &&
		    tcp_bpf_ca_needs_ecn((struct sock *)req))
			tos |= INET_ECN_ECT_0;

		rcu_read_lock();
		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
					    ireq->ir_rmt_addr,
					    rcu_dereference(ireq->ireq_opt),
					    tos);
		rcu_read_unlock();
		err = net_xmit_eval(err);
	}

	return err;
}

/*
 * IPv4 request_sock destructor.
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
}

#ifdef CONFIG_TCP_MD5SIG
/*
 * RFC2385 MD5 checksumming requires a mapping of
 * IP address->MD5 Key.
 * We need to maintain these in the sk structure.
 */

DEFINE_STATIC_KEY_DEFERRED_FALSE(tcp_md5_needed, HZ);
EXPORT_SYMBOL(tcp_md5_needed);

static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
{
	if (!old)
		return true;

	/* l3index always overrides non-l3index */
	if (old->l3index && new->l3index == 0)
		return false;
	if (old->l3index == 0 && new->l3index)
		return true;

	return old->prefixlen < new->prefixlen;
}

/* Find the Key structure for an address. */

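/* Among all configured keys matching @addr, a key bound to an L3 master
 * device (l3index != 0) is preferred over an unbound one, and otherwise
 * the longest prefix wins; see better_md5_match() above.
 */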
struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
					   const union tcp_md5_addr *addr,
					   int family)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	const struct tcp_md5sig_info *md5sig;
	__be32 mask;
	struct tcp_md5sig_key *best_match = NULL;
	bool match;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       lockdep_sock_is_held(sk));
	if (!md5sig)
		return NULL;

	hlist_for_each_entry_rcu(key, &md5sig->head, node,
				 lockdep_sock_is_held(sk)) {
		if (key->family != family)
			continue;
		if (key->flags & TCP_MD5SIG_FLAG_IFINDEX && key->l3index != l3index)
			continue;
		if (family == AF_INET) {
			mask = inet_make_mask(key->prefixlen);
			match = (key->addr.a4.s_addr & mask) ==
				(addr->a4.s_addr & mask);
#if IS_ENABLED(CONFIG_IPV6)
		} else if (family == AF_INET6) {
			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
						  key->prefixlen);
#endif
		} else {
			match = false;
		}

		if (match && better_md5_match(best_match, key))
			best_match = key;
	}
	return best_match;
}
EXPORT_SYMBOL(__tcp_md5_do_lookup);

static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
						      const union tcp_md5_addr *addr,
						      int family, u8 prefixlen,
						      int l3index, u8 flags)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	unsigned int size = sizeof(struct in_addr);
	const struct tcp_md5sig_info *md5sig;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       lockdep_sock_is_held(sk));
	if (!md5sig)
		return NULL;
#if IS_ENABLED(CONFIG_IPV6)
	if (family == AF_INET6)
		size = sizeof(struct in6_addr);
#endif
	hlist_for_each_entry_rcu(key, &md5sig->head, node,
				 lockdep_sock_is_held(sk)) {
		if (key->family != family)
			continue;
		if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
			continue;
		if (key->l3index != l3index)
			continue;
		if (!memcmp(&key->addr, addr, size) &&
		    key->prefixlen == prefixlen)
			return key;
	}
	return NULL;
}

struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
					 const struct sock *addr_sk)
{
	const union tcp_md5_addr *addr;
	int l3index;

	l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
						 addr_sk->sk_bound_dev_if);
	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
	return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
}
EXPORT_SYMBOL(tcp_v4_md5_lookup);

static int tcp_md5sig_info_add(struct sock *sk, gfp_t gfp)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_info *md5sig;

	md5sig = kmalloc(sizeof(*md5sig), gfp);
	if (!md5sig)
		return -ENOMEM;

	sk_gso_disable(sk);
	INIT_HLIST_HEAD(&md5sig->head);
	rcu_assign_pointer(tp->md5sig_info, md5sig);
	return 0;
}

/* This can be called on a newly created socket, from other files */
static int __tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
			    int family, u8 prefixlen, int l3index, u8 flags,
			    const u8 *newkey, u8 newkeylen, gfp_t gfp)
{
	/* Add Key to the list */
	struct tcp_md5sig_key *key;
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_info *md5sig;

	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
	if (key) {
		/* Pre-existing entry - just update that one.
		 * Note that the key might be used concurrently.
		 * data_race() is telling kcsan that we do not care about
		 * key mismatches, since changing MD5 key on live flows
		 * can lead to packet drops.
		 */
		data_race(memcpy(key->key, newkey, newkeylen));

		/* Pairs with READ_ONCE() in tcp_md5_hash_key().
		 * Also note that a reader could catch new key->keylen value
		 * but old key->key[], this is the reason we use __GFP_ZERO
		 * at sock_kmalloc() time below these lines.
		 */
		WRITE_ONCE(key->keylen, newkeylen);

		return 0;
	}

	md5sig = rcu_dereference_protected(tp->md5sig_info,
					   lockdep_sock_is_held(sk));

	key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
	if (!key)
		return -ENOMEM;
	if (!tcp_alloc_md5sig_pool()) {
		sock_kfree_s(sk, key, sizeof(*key));
		return -ENOMEM;
	}

	memcpy(key->key, newkey, newkeylen);
	key->keylen = newkeylen;
	key->family = family;
	key->prefixlen = prefixlen;
	key->l3index = l3index;
	key->flags = flags;
	memcpy(&key->addr, addr,
	       (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) :
							       sizeof(struct in_addr));
	hlist_add_head_rcu(&key->node, &md5sig->head);
	return 0;
}

int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
		   int family, u8 prefixlen, int l3index, u8 flags,
		   const u8 *newkey, u8 newkeylen)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
		if (tcp_md5sig_info_add(sk, GFP_KERNEL))
			return -ENOMEM;

		if (!static_branch_inc(&tcp_md5_needed.key)) {
			struct tcp_md5sig_info *md5sig;

			md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
			rcu_assign_pointer(tp->md5sig_info, NULL);
			kfree_rcu(md5sig, rcu);
			return -EUSERS;
		}
	}

	return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, flags,
				newkey, newkeylen, GFP_KERNEL);
}
EXPORT_SYMBOL(tcp_md5_do_add);

int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr,
		     int family, u8 prefixlen, int l3index,
		     struct tcp_md5sig_key *key)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
		if (tcp_md5sig_info_add(sk, sk_gfp_mask(sk, GFP_ATOMIC)))
			return -ENOMEM;

		if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) {
			struct tcp_md5sig_info *md5sig;

			md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
			net_warn_ratelimited("Too many TCP-MD5 keys in the system\n");
			rcu_assign_pointer(tp->md5sig_info, NULL);
			kfree_rcu(md5sig, rcu);
			return -EUSERS;
		}
	}

	return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index,
				key->flags, key->key, key->keylen,
				sk_gfp_mask(sk, GFP_ATOMIC));
}
EXPORT_SYMBOL(tcp_md5_key_copy);

int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
		   u8 prefixlen, int l3index, u8 flags)
{
	struct tcp_md5sig_key *key;

	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
	if (!key)
		return -ENOENT;
	hlist_del_rcu(&key->node);
	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
	kfree_rcu(key, rcu);
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_del);

static void tcp_clear_md5_list(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	struct hlist_node *n;
	struct tcp_md5sig_info *md5sig;

	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);

	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
		hlist_del_rcu(&key->node);
		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
		kfree_rcu(key, rcu);
	}
}

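/* For orientation only (not kernel code): the setsockopt() handler below
 * is typically reached from userspace roughly like this, the peer address
 * and key being illustrative values:
 *
 *	struct tcp_md5sig md5 = { .tcpm_keylen = 6 };
 *	struct sockaddr_in *peer = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	peer->sin_family = AF_INET;
 *	inet_pton(AF_INET, "192.0.2.1", &peer->sin_addr);
 *	memcpy(md5.tcpm_key, "secret", 6);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 */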
static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
				 sockptr_t optval, int optlen)
{
	struct tcp_md5sig cmd;
	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
	const union tcp_md5_addr *addr;
	u8 prefixlen = 32;
	int l3index = 0;
	u8 flags;

	if (optlen < sizeof(cmd))
		return -EINVAL;

	if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
		return -EFAULT;

	if (sin->sin_family != AF_INET)
		return -EINVAL;

	flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;

	if (optname == TCP_MD5SIG_EXT &&
	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
		prefixlen = cmd.tcpm_prefixlen;
		if (prefixlen > 32)
			return -EINVAL;
	}

	if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
		struct net_device *dev;

		rcu_read_lock();
		dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
		if (dev && netif_is_l3_master(dev))
			l3index = dev->ifindex;

		rcu_read_unlock();

		/* ok to reference set/not set outside of rcu;
		 * right now device MUST be an L3 master
		 */
		if (!dev || !l3index)
			return -EINVAL;
	}

	addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;

	if (!cmd.tcpm_keylen)
		return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);

	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
		return -EINVAL;

	return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
			      cmd.tcpm_key, cmd.tcpm_keylen);
}

static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
				   __be32 daddr, __be32 saddr,
				   const struct tcphdr *th, int nbytes)
{
	struct tcp4_pseudohdr *bp;
	struct scatterlist sg;
	struct tcphdr *_th;

	bp = hp->scratch;
	bp->saddr = saddr;
	bp->daddr = daddr;
	bp->pad = 0;
	bp->protocol = IPPROTO_TCP;
	bp->len = cpu_to_be16(nbytes);

	_th = (struct tcphdr *)(bp + 1);
	memcpy(_th, th, sizeof(*th));
	_th->check = 0;

	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
	ahash_request_set_crypt(hp->md5_req, &sg, NULL,
				sizeof(*bp) + sizeof(*th));
	return crypto_ahash_update(hp->md5_req);
}

static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
{
	struct tcp_md5sig_pool *hp;
	struct ahash_request *req;

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	req = hp->md5_req;

	if (crypto_ahash_init(req))
		goto clear_hash;
	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	ahash_request_set_crypt(req, NULL, md5_hash, 0);
	if (crypto_ahash_final(req))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}

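/* Compute the RFC 2385 signature over the pseudo-header, the TCP header
 * with its checksum zeroed, the segment payload and finally the key itself.
 */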
int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
			const struct sock *sk,
			const struct sk_buff *skb)
{
	struct tcp_md5sig_pool *hp;
	struct ahash_request *req;
	const struct tcphdr *th = tcp_hdr(skb);
	__be32 saddr, daddr;

	if (sk) { /* valid for establish/request sockets */
		saddr = sk->sk_rcv_saddr;
		daddr = sk->sk_daddr;
	} else {
		const struct iphdr *iph = ip_hdr(skb);
		saddr = iph->saddr;
		daddr = iph->daddr;
	}

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	req = hp->md5_req;

	if (crypto_ahash_init(req))
		goto clear_hash;

	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
		goto clear_hash;
	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	ahash_request_set_crypt(req, NULL, md5_hash, 0);
	if (crypto_ahash_final(req))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}
EXPORT_SYMBOL(tcp_v4_md5_hash_skb);

#endif

static void tcp_v4_init_req(struct request_sock *req,
			    const struct sock *sk_listener,
			    struct sk_buff *skb)
{
	struct inet_request_sock *ireq = inet_rsk(req);
	struct net *net = sock_net(sk_listener);

	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
}

static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
					  struct sk_buff *skb,
					  struct flowi *fl,
					  struct request_sock *req)
{
	tcp_v4_init_req(req, sk, skb);

	if (security_inet_conn_request(sk, skb, req))
		return NULL;

	return inet_csk_route_req(sk, &fl->u.ip4, req);
}

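/* Per-family hooks used while a connection request sits in SYN_RECV:
 * how to retransmit the SYN-ACK, ACK or reset the peer, and how to tear
 * the request down again.
 */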
struct request_sock_ops tcp_request_sock_ops __read_mostly = {
	.family		=	PF_INET,
	.obj_size	=	sizeof(struct tcp_request_sock),
	.rtx_syn_ack	=	tcp_rtx_synack,
	.send_ack	=	tcp_v4_reqsk_send_ack,
	.destructor	=	tcp_v4_reqsk_destructor,
	.send_reset	=	tcp_v4_send_reset,
	.syn_ack_timeout =	tcp_syn_ack_timeout,
};

const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
	.mss_clamp	=	TCP_MSS_DEFAULT,
#ifdef CONFIG_TCP_MD5SIG
	.req_md5_lookup	=	tcp_v4_md5_lookup,
	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
#endif
#ifdef CONFIG_SYN_COOKIES
	.cookie_init_seq =	cookie_v4_init_sequence,
#endif
	.route_req	=	tcp_v4_route_req,
	.init_seq	=	tcp_v4_init_seq,
	.init_ts_off	=	tcp_v4_init_ts_off,
	.send_synack	=	tcp_v4_send_synack,
};

int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	/* Never answer to SYNs sent to broadcast or multicast */
	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	return tcp_conn_request(&tcp_request_sock_ops,
				&tcp_request_sock_ipv4_ops, sk, skb);

drop:
	tcp_listendrop(sk);
	return 0;
}
EXPORT_SYMBOL(tcp_v4_conn_request);


/*
 * The three way handshake has completed - we got a valid synack -
 * now create the new socket.
 */
struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct dst_entry *dst,
				  struct request_sock *req_unhash,
				  bool *own_req)
{
	struct inet_request_sock *ireq;
	bool found_dup_sk = false;
	struct inet_sock *newinet;
	struct tcp_sock *newtp;
	struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
	const union tcp_md5_addr *addr;
	struct tcp_md5sig_key *key;
	int l3index;
#endif
	struct ip_options_rcu *inet_opt;

	if (sk_acceptq_is_full(sk))
		goto exit_overflow;

	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit_nonewsk;

	newsk->sk_gso_type = SKB_GSO_TCPV4;
	inet_sk_rx_dst_set(newsk, skb);

	newtp		      = tcp_sk(newsk);
	newinet		      = inet_sk(newsk);
	ireq		      = inet_rsk(req);
	sk_daddr_set(newsk, ireq->ir_rmt_addr);
	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
	newsk->sk_bound_dev_if = ireq->ir_iif;
	newinet->inet_saddr   = ireq->ir_loc_addr;
	inet_opt	      = rcu_dereference(ireq->ireq_opt);
	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
	newinet->mc_index     = inet_iif(skb);
	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
	newinet->rcv_tos      = ip_hdr(skb)->tos;
	inet_csk(newsk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
	atomic_set(&newinet->inet_id, get_random_u16());

	/* Set ToS of the new socket based upon the value of incoming SYN.
	 * ECT bits are set later in tcp_init_transfer().
	 */
	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
		newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;

	if (!dst) {
		dst = inet_csk_route_child_sock(sk, newsk, req);
		if (!dst)
			goto put_and_exit;
	} else {
		/* syncookie case : see end of cookie_v4_check() */
	}
	sk_setup_caps(newsk, dst);

	tcp_ca_openreq_child(newsk, dst);

	tcp_sync_mss(newsk, dst_mtu(dst));
	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));

	tcp_initialize_rcv_mss(newsk);

#ifdef CONFIG_TCP_MD5SIG
	l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
	/* Copy over the MD5 key from the original socket */
	addr = (union tcp_md5_addr *)&newinet->inet_daddr;
	key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
	if (key) {
		if (tcp_md5_key_copy(newsk, addr, AF_INET, 32, l3index, key))
			goto put_and_exit;
		sk_gso_disable(newsk);
	}
#endif

	if (__inet_inherit_port(sk, newsk) < 0)
		goto put_and_exit;
	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
				       &found_dup_sk);
	if (likely(*own_req)) {
		tcp_move_syn(newtp, req);
		ireq->ireq_opt = NULL;
	} else {
		newinet->inet_opt = NULL;

		if (!req_unhash && found_dup_sk) {
			/* This code path should only be executed in the
			 * syncookie case
			 */
			bh_unlock_sock(newsk);
			sock_put(newsk);
			newsk = NULL;
		}
	}
	return newsk;

exit_overflow:
	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
exit_nonewsk:
	dst_release(dst);
exit:
	tcp_listendrop(sk);
	return NULL;
put_and_exit:
	newinet->inet_opt = NULL;
	inet_csk_prepare_forced_close(newsk);
	tcp_done(newsk);
	goto exit;
}
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);

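/* With syncookies enabled, a bare ACK arriving on a listener may be the
 * last packet of a cookie handshake; it is handed to cookie_v4_check()
 * for validation instead of normal request-socket processing.
 */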
EXPORT_SYMBOL(tcp_v4_syn_recv_sock); 1676 1677 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb) 1678 { 1679 #ifdef CONFIG_SYN_COOKIES 1680 const struct tcphdr *th = tcp_hdr(skb); 1681 1682 if (!th->syn) 1683 sk = cookie_v4_check(sk, skb); 1684 #endif 1685 return sk; 1686 } 1687 1688 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph, 1689 struct tcphdr *th, u32 *cookie) 1690 { 1691 u16 mss = 0; 1692 #ifdef CONFIG_SYN_COOKIES 1693 mss = tcp_get_syncookie_mss(&tcp_request_sock_ops, 1694 &tcp_request_sock_ipv4_ops, sk, th); 1695 if (mss) { 1696 *cookie = __cookie_v4_init_sequence(iph, th, &mss); 1697 tcp_synq_overflow(sk); 1698 } 1699 #endif 1700 return mss; 1701 } 1702 1703 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *, 1704 u32)); 1705 /* The socket must have it's spinlock held when we get 1706 * here, unless it is a TCP_LISTEN socket. 1707 * 1708 * We have a potential double-lock case here, so even when 1709 * doing backlog processing we use the BH locking scheme. 1710 * This is because we cannot sleep with the original spinlock 1711 * held. 1712 */ 1713 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) 1714 { 1715 enum skb_drop_reason reason; 1716 struct sock *rsk; 1717 1718 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ 1719 struct dst_entry *dst; 1720 1721 dst = rcu_dereference_protected(sk->sk_rx_dst, 1722 lockdep_sock_is_held(sk)); 1723 1724 sock_rps_save_rxhash(sk, skb); 1725 sk_mark_napi_id(sk, skb); 1726 if (dst) { 1727 if (sk->sk_rx_dst_ifindex != skb->skb_iif || 1728 !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check, 1729 dst, 0)) { 1730 RCU_INIT_POINTER(sk->sk_rx_dst, NULL); 1731 dst_release(dst); 1732 } 1733 } 1734 tcp_rcv_established(sk, skb); 1735 return 0; 1736 } 1737 1738 reason = SKB_DROP_REASON_NOT_SPECIFIED; 1739 if (tcp_checksum_complete(skb)) 1740 goto csum_err; 1741 1742 if (sk->sk_state == TCP_LISTEN) { 1743 struct sock *nsk = tcp_v4_cookie_check(sk, skb); 1744 1745 if (!nsk) 1746 goto discard; 1747 if (nsk != sk) { 1748 if (tcp_child_process(sk, nsk, skb)) { 1749 rsk = nsk; 1750 goto reset; 1751 } 1752 return 0; 1753 } 1754 } else 1755 sock_rps_save_rxhash(sk, skb); 1756 1757 if (tcp_rcv_state_process(sk, skb)) { 1758 rsk = sk; 1759 goto reset; 1760 } 1761 return 0; 1762 1763 reset: 1764 tcp_v4_send_reset(rsk, skb); 1765 discard: 1766 kfree_skb_reason(skb, reason); 1767 /* Be careful here. If this function gets more complicated and 1768 * gcc suffers from register pressure on the x86, sk (in %ebx) 1769 * might be destroyed here. This current version compiles correctly, 1770 * but you have been warned. 
1771 */ 1772 return 0; 1773 1774 csum_err: 1775 reason = SKB_DROP_REASON_TCP_CSUM; 1776 trace_tcp_bad_csum(skb); 1777 TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); 1778 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); 1779 goto discard; 1780 } 1781 EXPORT_SYMBOL(tcp_v4_do_rcv); 1782 1783 int tcp_v4_early_demux(struct sk_buff *skb) 1784 { 1785 struct net *net = dev_net(skb->dev); 1786 const struct iphdr *iph; 1787 const struct tcphdr *th; 1788 struct sock *sk; 1789 1790 if (skb->pkt_type != PACKET_HOST) 1791 return 0; 1792 1793 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr))) 1794 return 0; 1795 1796 iph = ip_hdr(skb); 1797 th = tcp_hdr(skb); 1798 1799 if (th->doff < sizeof(struct tcphdr) / 4) 1800 return 0; 1801 1802 sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, 1803 iph->saddr, th->source, 1804 iph->daddr, ntohs(th->dest), 1805 skb->skb_iif, inet_sdif(skb)); 1806 if (sk) { 1807 skb->sk = sk; 1808 skb->destructor = sock_edemux; 1809 if (sk_fullsock(sk)) { 1810 struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst); 1811 1812 if (dst) 1813 dst = dst_check(dst, 0); 1814 if (dst && 1815 sk->sk_rx_dst_ifindex == skb->skb_iif) 1816 skb_dst_set_noref(skb, dst); 1817 } 1818 } 1819 return 0; 1820 } 1821 1822 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb, 1823 enum skb_drop_reason *reason) 1824 { 1825 u32 tail_gso_size, tail_gso_segs; 1826 struct skb_shared_info *shinfo; 1827 const struct tcphdr *th; 1828 struct tcphdr *thtail; 1829 struct sk_buff *tail; 1830 unsigned int hdrlen; 1831 bool fragstolen; 1832 u32 gso_segs; 1833 u32 gso_size; 1834 u64 limit; 1835 int delta; 1836 1837 /* In case all data was pulled from skb frags (in __pskb_pull_tail()), 1838 * we can fix skb->truesize to its real value to avoid future drops. 1839 * This is valid because skb is not yet charged to the socket. 1840 * It has been noticed pure SACK packets were sometimes dropped 1841 * (if cooked by drivers without copybreak feature). 1842 */ 1843 skb_condense(skb); 1844 1845 skb_dst_drop(skb); 1846 1847 if (unlikely(tcp_checksum_complete(skb))) { 1848 bh_unlock_sock(sk); 1849 trace_tcp_bad_csum(skb); 1850 *reason = SKB_DROP_REASON_TCP_CSUM; 1851 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); 1852 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); 1853 return true; 1854 } 1855 1856 /* Attempt coalescing to last skb in backlog, even if we are 1857 * above the limits. 1858 * This is okay because skb capacity is limited to MAX_SKB_FRAGS. 
1859 */ 1860 th = (const struct tcphdr *)skb->data; 1861 hdrlen = th->doff * 4; 1862 1863 tail = sk->sk_backlog.tail; 1864 if (!tail) 1865 goto no_coalesce; 1866 thtail = (struct tcphdr *)tail->data; 1867 1868 if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq || 1869 TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield || 1870 ((TCP_SKB_CB(tail)->tcp_flags | 1871 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) || 1872 !((TCP_SKB_CB(tail)->tcp_flags & 1873 TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) || 1874 ((TCP_SKB_CB(tail)->tcp_flags ^ 1875 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) || 1876 #ifdef CONFIG_TLS_DEVICE 1877 tail->decrypted != skb->decrypted || 1878 #endif 1879 !mptcp_skb_can_collapse(tail, skb) || 1880 thtail->doff != th->doff || 1881 memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th))) 1882 goto no_coalesce; 1883 1884 __skb_pull(skb, hdrlen); 1885 1886 shinfo = skb_shinfo(skb); 1887 gso_size = shinfo->gso_size ?: skb->len; 1888 gso_segs = shinfo->gso_segs ?: 1; 1889 1890 shinfo = skb_shinfo(tail); 1891 tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen); 1892 tail_gso_segs = shinfo->gso_segs ?: 1; 1893 1894 if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) { 1895 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq; 1896 1897 if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) { 1898 TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq; 1899 thtail->window = th->window; 1900 } 1901 1902 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and 1903 * thtail->fin, so that the fast path in tcp_rcv_established() 1904 * is not entered if we append a packet with a FIN. 1905 * SYN, RST, URG are not present. 1906 * ACK is set on both packets. 1907 * PSH : we do not really care in TCP stack, 1908 * at least for 'GRO' packets. 1909 */ 1910 thtail->fin |= th->fin; 1911 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags; 1912 1913 if (TCP_SKB_CB(skb)->has_rxtstamp) { 1914 TCP_SKB_CB(tail)->has_rxtstamp = true; 1915 tail->tstamp = skb->tstamp; 1916 skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp; 1917 } 1918 1919 /* Not as strict as GRO. We only need to carry mss max value */ 1920 shinfo->gso_size = max(gso_size, tail_gso_size); 1921 shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF); 1922 1923 sk->sk_backlog.len += delta; 1924 __NET_INC_STATS(sock_net(sk), 1925 LINUX_MIB_TCPBACKLOGCOALESCE); 1926 kfree_skb_partial(skb, fragstolen); 1927 return false; 1928 } 1929 __skb_push(skb, hdrlen); 1930 1931 no_coalesce: 1932 /* sk->sk_backlog.len is reset only at the end of __release_sock(). 1933 * Both sk->sk_backlog.len and sk->sk_rmem_alloc could reach 1934 * sk_rcvbuf in normal conditions. 1935 */ 1936 limit = ((u64)READ_ONCE(sk->sk_rcvbuf)) << 1; 1937 1938 limit += ((u32)READ_ONCE(sk->sk_sndbuf)) >> 1; 1939 1940 /* Only socket owner can try to collapse/prune rx queues 1941 * to reduce memory overhead, so add a little headroom here. 1942 * Few sockets backlog are possibly concurrently non empty. 
1943 */ 1944 limit += 64 * 1024; 1945 1946 limit = min_t(u64, limit, UINT_MAX); 1947 1948 if (unlikely(sk_add_backlog(sk, skb, limit))) { 1949 bh_unlock_sock(sk); 1950 *reason = SKB_DROP_REASON_SOCKET_BACKLOG; 1951 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP); 1952 return true; 1953 } 1954 return false; 1955 } 1956 EXPORT_SYMBOL(tcp_add_backlog); 1957 1958 int tcp_filter(struct sock *sk, struct sk_buff *skb) 1959 { 1960 struct tcphdr *th = (struct tcphdr *)skb->data; 1961 1962 return sk_filter_trim_cap(sk, skb, th->doff * 4); 1963 } 1964 EXPORT_SYMBOL(tcp_filter); 1965 1966 static void tcp_v4_restore_cb(struct sk_buff *skb) 1967 { 1968 memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4, 1969 sizeof(struct inet_skb_parm)); 1970 } 1971 1972 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph, 1973 const struct tcphdr *th) 1974 { 1975 /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB() 1976 * barrier() makes sure compiler wont play fool^Waliasing games. 1977 */ 1978 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb), 1979 sizeof(struct inet_skb_parm)); 1980 barrier(); 1981 1982 TCP_SKB_CB(skb)->seq = ntohl(th->seq); 1983 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + 1984 skb->len - th->doff * 4); 1985 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); 1986 TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th); 1987 TCP_SKB_CB(skb)->tcp_tw_isn = 0; 1988 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph); 1989 TCP_SKB_CB(skb)->sacked = 0; 1990 TCP_SKB_CB(skb)->has_rxtstamp = 1991 skb->tstamp || skb_hwtstamps(skb)->hwtstamp; 1992 } 1993 1994 /* 1995 * From tcp_input.c 1996 */ 1997 1998 int tcp_v4_rcv(struct sk_buff *skb) 1999 { 2000 struct net *net = dev_net(skb->dev); 2001 enum skb_drop_reason drop_reason; 2002 int sdif = inet_sdif(skb); 2003 int dif = inet_iif(skb); 2004 const struct iphdr *iph; 2005 const struct tcphdr *th; 2006 bool refcounted; 2007 struct sock *sk; 2008 int ret; 2009 2010 drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; 2011 if (skb->pkt_type != PACKET_HOST) 2012 goto discard_it; 2013 2014 /* Count it even if it's bad */ 2015 __TCP_INC_STATS(net, TCP_MIB_INSEGS); 2016 2017 if (!pskb_may_pull(skb, sizeof(struct tcphdr))) 2018 goto discard_it; 2019 2020 th = (const struct tcphdr *)skb->data; 2021 2022 if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) { 2023 drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL; 2024 goto bad_packet; 2025 } 2026 if (!pskb_may_pull(skb, th->doff * 4)) 2027 goto discard_it; 2028 2029 /* An explanation is required here, I think. 2030 * Packet length and doff are validated by header prediction, 2031 * provided case of th->doff==0 is eliminated. 2032 * So, we defer the checks. 
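 * Only the minimal doff/length sanity checks above are performed before
 * the socket lookup; the remaining validation is left to the per-socket
 * receive path.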
*/ 2033 2034 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo)) 2035 goto csum_error; 2036 2037 th = (const struct tcphdr *)skb->data; 2038 iph = ip_hdr(skb); 2039 lookup: 2040 sk = __inet_lookup_skb(net->ipv4.tcp_death_row.hashinfo, 2041 skb, __tcp_hdrlen(th), th->source, 2042 th->dest, sdif, &refcounted); 2043 if (!sk) 2044 goto no_tcp_socket; 2045 2046 process: 2047 if (sk->sk_state == TCP_TIME_WAIT) 2048 goto do_time_wait; 2049 2050 if (sk->sk_state == TCP_NEW_SYN_RECV) { 2051 struct request_sock *req = inet_reqsk(sk); 2052 bool req_stolen = false; 2053 struct sock *nsk; 2054 2055 sk = req->rsk_listener; 2056 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) 2057 drop_reason = SKB_DROP_REASON_XFRM_POLICY; 2058 else 2059 drop_reason = tcp_inbound_md5_hash(sk, skb, 2060 &iph->saddr, &iph->daddr, 2061 AF_INET, dif, sdif); 2062 if (unlikely(drop_reason)) { 2063 sk_drops_add(sk, skb); 2064 reqsk_put(req); 2065 goto discard_it; 2066 } 2067 if (tcp_checksum_complete(skb)) { 2068 reqsk_put(req); 2069 goto csum_error; 2070 } 2071 if (unlikely(sk->sk_state != TCP_LISTEN)) { 2072 nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb); 2073 if (!nsk) { 2074 inet_csk_reqsk_queue_drop_and_put(sk, req); 2075 goto lookup; 2076 } 2077 sk = nsk; 2078 /* reuseport_migrate_sock() has already held one sk_refcnt 2079 * before returning. 2080 */ 2081 } else { 2082 /* We own a reference on the listener, increase it again 2083 * as we might lose it too soon. 2084 */ 2085 sock_hold(sk); 2086 } 2087 refcounted = true; 2088 nsk = NULL; 2089 if (!tcp_filter(sk, skb)) { 2090 th = (const struct tcphdr *)skb->data; 2091 iph = ip_hdr(skb); 2092 tcp_v4_fill_cb(skb, iph, th); 2093 nsk = tcp_check_req(sk, skb, req, false, &req_stolen); 2094 } else { 2095 drop_reason = SKB_DROP_REASON_SOCKET_FILTER; 2096 } 2097 if (!nsk) { 2098 reqsk_put(req); 2099 if (req_stolen) { 2100 /* Another cpu got exclusive access to req 2101 * and created a full blown socket. 2102 * Try to feed this packet to this socket 2103 * instead of discarding it. 
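 * tcp_v4_restore_cb() puts the IP control block back in place (it was
 * moved aside by tcp_v4_fill_cb() above) before the packet is looked up
 * again.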
2104 */ 2105 tcp_v4_restore_cb(skb); 2106 sock_put(sk); 2107 goto lookup; 2108 } 2109 goto discard_and_relse; 2110 } 2111 nf_reset_ct(skb); 2112 if (nsk == sk) { 2113 reqsk_put(req); 2114 tcp_v4_restore_cb(skb); 2115 } else if (tcp_child_process(sk, nsk, skb)) { 2116 tcp_v4_send_reset(nsk, skb); 2117 goto discard_and_relse; 2118 } else { 2119 sock_put(sk); 2120 return 0; 2121 } 2122 } 2123 2124 if (static_branch_unlikely(&ip4_min_ttl)) { 2125 /* min_ttl can be changed concurrently from do_ip_setsockopt() */ 2126 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) { 2127 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); 2128 drop_reason = SKB_DROP_REASON_TCP_MINTTL; 2129 goto discard_and_relse; 2130 } 2131 } 2132 2133 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) { 2134 drop_reason = SKB_DROP_REASON_XFRM_POLICY; 2135 goto discard_and_relse; 2136 } 2137 2138 drop_reason = tcp_inbound_md5_hash(sk, skb, &iph->saddr, 2139 &iph->daddr, AF_INET, dif, sdif); 2140 if (drop_reason) 2141 goto discard_and_relse; 2142 2143 nf_reset_ct(skb); 2144 2145 if (tcp_filter(sk, skb)) { 2146 drop_reason = SKB_DROP_REASON_SOCKET_FILTER; 2147 goto discard_and_relse; 2148 } 2149 th = (const struct tcphdr *)skb->data; 2150 iph = ip_hdr(skb); 2151 tcp_v4_fill_cb(skb, iph, th); 2152 2153 skb->dev = NULL; 2154 2155 if (sk->sk_state == TCP_LISTEN) { 2156 ret = tcp_v4_do_rcv(sk, skb); 2157 goto put_and_return; 2158 } 2159 2160 sk_incoming_cpu_update(sk); 2161 2162 bh_lock_sock_nested(sk); 2163 tcp_segs_in(tcp_sk(sk), skb); 2164 ret = 0; 2165 if (!sock_owned_by_user(sk)) { 2166 ret = tcp_v4_do_rcv(sk, skb); 2167 } else { 2168 if (tcp_add_backlog(sk, skb, &drop_reason)) 2169 goto discard_and_relse; 2170 } 2171 bh_unlock_sock(sk); 2172 2173 put_and_return: 2174 if (refcounted) 2175 sock_put(sk); 2176 2177 return ret; 2178 2179 no_tcp_socket: 2180 drop_reason = SKB_DROP_REASON_NO_SOCKET; 2181 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) 2182 goto discard_it; 2183 2184 tcp_v4_fill_cb(skb, iph, th); 2185 2186 if (tcp_checksum_complete(skb)) { 2187 csum_error: 2188 drop_reason = SKB_DROP_REASON_TCP_CSUM; 2189 trace_tcp_bad_csum(skb); 2190 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS); 2191 bad_packet: 2192 __TCP_INC_STATS(net, TCP_MIB_INERRS); 2193 } else { 2194 tcp_v4_send_reset(NULL, skb); 2195 } 2196 2197 discard_it: 2198 SKB_DR_OR(drop_reason, NOT_SPECIFIED); 2199 /* Discard frame. 
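 * kfree_skb_reason() hands drop_reason to the kfree_skb tracepoint so
 * drop-monitoring tools can attribute the loss.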
*/ 2200 kfree_skb_reason(skb, drop_reason); 2201 return 0; 2202 2203 discard_and_relse: 2204 sk_drops_add(sk, skb); 2205 if (refcounted) 2206 sock_put(sk); 2207 goto discard_it; 2208 2209 do_time_wait: 2210 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { 2211 drop_reason = SKB_DROP_REASON_XFRM_POLICY; 2212 inet_twsk_put(inet_twsk(sk)); 2213 goto discard_it; 2214 } 2215 2216 tcp_v4_fill_cb(skb, iph, th); 2217 2218 if (tcp_checksum_complete(skb)) { 2219 inet_twsk_put(inet_twsk(sk)); 2220 goto csum_error; 2221 } 2222 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) { 2223 case TCP_TW_SYN: { 2224 struct sock *sk2 = inet_lookup_listener(net, 2225 net->ipv4.tcp_death_row.hashinfo, 2226 skb, __tcp_hdrlen(th), 2227 iph->saddr, th->source, 2228 iph->daddr, th->dest, 2229 inet_iif(skb), 2230 sdif); 2231 if (sk2) { 2232 inet_twsk_deschedule_put(inet_twsk(sk)); 2233 sk = sk2; 2234 tcp_v4_restore_cb(skb); 2235 refcounted = false; 2236 goto process; 2237 } 2238 } 2239 /* to ACK */ 2240 fallthrough; 2241 case TCP_TW_ACK: 2242 tcp_v4_timewait_ack(sk, skb); 2243 break; 2244 case TCP_TW_RST: 2245 tcp_v4_send_reset(sk, skb); 2246 inet_twsk_deschedule_put(inet_twsk(sk)); 2247 goto discard_it; 2248 case TCP_TW_SUCCESS:; 2249 } 2250 goto discard_it; 2251 } 2252 2253 static struct timewait_sock_ops tcp_timewait_sock_ops = { 2254 .twsk_obj_size = sizeof(struct tcp_timewait_sock), 2255 .twsk_unique = tcp_twsk_unique, 2256 .twsk_destructor= tcp_twsk_destructor, 2257 }; 2258 2259 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) 2260 { 2261 struct dst_entry *dst = skb_dst(skb); 2262 2263 if (dst && dst_hold_safe(dst)) { 2264 rcu_assign_pointer(sk->sk_rx_dst, dst); 2265 sk->sk_rx_dst_ifindex = skb->skb_iif; 2266 } 2267 } 2268 EXPORT_SYMBOL(inet_sk_rx_dst_set); 2269 2270 const struct inet_connection_sock_af_ops ipv4_specific = { 2271 .queue_xmit = ip_queue_xmit, 2272 .send_check = tcp_v4_send_check, 2273 .rebuild_header = inet_sk_rebuild_header, 2274 .sk_rx_dst_set = inet_sk_rx_dst_set, 2275 .conn_request = tcp_v4_conn_request, 2276 .syn_recv_sock = tcp_v4_syn_recv_sock, 2277 .net_header_len = sizeof(struct iphdr), 2278 .setsockopt = ip_setsockopt, 2279 .getsockopt = ip_getsockopt, 2280 .addr2sockaddr = inet_csk_addr2sockaddr, 2281 .sockaddr_len = sizeof(struct sockaddr_in), 2282 .mtu_reduced = tcp_v4_mtu_reduced, 2283 }; 2284 EXPORT_SYMBOL(ipv4_specific); 2285 2286 #ifdef CONFIG_TCP_MD5SIG 2287 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = { 2288 .md5_lookup = tcp_v4_md5_lookup, 2289 .calc_md5_hash = tcp_v4_md5_hash_skb, 2290 .md5_parse = tcp_v4_parse_md5_keys, 2291 }; 2292 #endif 2293 2294 /* NOTE: A lot of things are set to zero explicitly by the call to 2295 * sk_alloc(), so they need not be done here. 2296 */ 2297 static int tcp_v4_init_sock(struct sock *sk) 2298 { 2299 struct inet_connection_sock *icsk = inet_csk(sk); 2300 2301 tcp_init_sock(sk); 2302 2303 icsk->icsk_af_ops = &ipv4_specific; 2304 2305 #ifdef CONFIG_TCP_MD5SIG 2306 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific; 2307 #endif 2308 2309 return 0; 2310 } 2311 2312 void tcp_v4_destroy_sock(struct sock *sk) 2313 { 2314 struct tcp_sock *tp = tcp_sk(sk); 2315 2316 trace_tcp_destroy_sock(sk); 2317 2318 tcp_clear_xmit_timers(sk); 2319 2320 tcp_cleanup_congestion_control(sk); 2321 2322 tcp_cleanup_ulp(sk); 2323 2324 /* Clean up the write buffer.
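 * tcp_write_queue_purge() below frees every skb still queued for
 * transmission and reclaims the memory charged to the socket.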
*/ 2325 tcp_write_queue_purge(sk); 2326 2327 /* Check if we want to disable active TFO */ 2328 tcp_fastopen_active_disable_ofo_check(sk); 2329 2330 /* Cleans up our, hopefully empty, out_of_order_queue. */ 2331 skb_rbtree_purge(&tp->out_of_order_queue); 2332 2333 #ifdef CONFIG_TCP_MD5SIG 2334 /* Clean up the MD5 key list, if any */ 2335 if (tp->md5sig_info) { 2336 tcp_clear_md5_list(sk); 2337 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu); 2338 tp->md5sig_info = NULL; 2339 static_branch_slow_dec_deferred(&tcp_md5_needed); 2340 } 2341 #endif 2342 2343 /* Clean up a referenced TCP bind bucket. */ 2344 if (inet_csk(sk)->icsk_bind_hash) 2345 inet_put_port(sk); 2346 2347 BUG_ON(rcu_access_pointer(tp->fastopen_rsk)); 2348 2349 /* If socket is aborted during connect operation */ 2350 tcp_free_fastopen_req(tp); 2351 tcp_fastopen_destroy_cipher(sk); 2352 tcp_saved_syn_free(tp); 2353 2354 sk_sockets_allocated_dec(sk); 2355 } 2356 EXPORT_SYMBOL(tcp_v4_destroy_sock); 2357 2358 #ifdef CONFIG_PROC_FS 2359 /* Proc filesystem TCP sock list dumping. */ 2360 2361 static unsigned short seq_file_family(const struct seq_file *seq); 2362 2363 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk) 2364 { 2365 unsigned short family = seq_file_family(seq); 2366 2367 /* AF_UNSPEC is used as a match all */ 2368 return ((family == AF_UNSPEC || family == sk->sk_family) && 2369 net_eq(sock_net(sk), seq_file_net(seq))); 2370 } 2371 2372 /* Find a non empty bucket (starting from st->bucket) 2373 * and return the first sk from it. 2374 */ 2375 static void *listening_get_first(struct seq_file *seq) 2376 { 2377 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2378 struct tcp_iter_state *st = seq->private; 2379 2380 st->offset = 0; 2381 for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) { 2382 struct inet_listen_hashbucket *ilb2; 2383 struct hlist_nulls_node *node; 2384 struct sock *sk; 2385 2386 ilb2 = &hinfo->lhash2[st->bucket]; 2387 if (hlist_nulls_empty(&ilb2->nulls_head)) 2388 continue; 2389 2390 spin_lock(&ilb2->lock); 2391 sk_nulls_for_each(sk, node, &ilb2->nulls_head) { 2392 if (seq_sk_match(seq, sk)) 2393 return sk; 2394 } 2395 spin_unlock(&ilb2->lock); 2396 } 2397 2398 return NULL; 2399 } 2400 2401 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket). 2402 * If "cur" is the last one in the st->bucket, 2403 * call listening_get_first() to return the first sk of the next 2404 * non empty bucket. 
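 * The lhash2 bucket lock taken by listening_get_first() is still held on
 * entry here and is released before moving on to the next bucket.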
2405 */ 2406 static void *listening_get_next(struct seq_file *seq, void *cur) 2407 { 2408 struct tcp_iter_state *st = seq->private; 2409 struct inet_listen_hashbucket *ilb2; 2410 struct hlist_nulls_node *node; 2411 struct inet_hashinfo *hinfo; 2412 struct sock *sk = cur; 2413 2414 ++st->num; 2415 ++st->offset; 2416 2417 sk = sk_nulls_next(sk); 2418 sk_nulls_for_each_from(sk, node) { 2419 if (seq_sk_match(seq, sk)) 2420 return sk; 2421 } 2422 2423 hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2424 ilb2 = &hinfo->lhash2[st->bucket]; 2425 spin_unlock(&ilb2->lock); 2426 ++st->bucket; 2427 return listening_get_first(seq); 2428 } 2429 2430 static void *listening_get_idx(struct seq_file *seq, loff_t *pos) 2431 { 2432 struct tcp_iter_state *st = seq->private; 2433 void *rc; 2434 2435 st->bucket = 0; 2436 st->offset = 0; 2437 rc = listening_get_first(seq); 2438 2439 while (rc && *pos) { 2440 rc = listening_get_next(seq, rc); 2441 --*pos; 2442 } 2443 return rc; 2444 } 2445 2446 static inline bool empty_bucket(struct inet_hashinfo *hinfo, 2447 const struct tcp_iter_state *st) 2448 { 2449 return hlist_nulls_empty(&hinfo->ehash[st->bucket].chain); 2450 } 2451 2452 /* 2453 * Get first established socket starting from bucket given in st->bucket. 2454 * If st->bucket is zero, the very first socket in the hash is returned. 2455 */ 2456 static void *established_get_first(struct seq_file *seq) 2457 { 2458 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2459 struct tcp_iter_state *st = seq->private; 2460 2461 st->offset = 0; 2462 for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) { 2463 struct sock *sk; 2464 struct hlist_nulls_node *node; 2465 spinlock_t *lock = inet_ehash_lockp(hinfo, st->bucket); 2466 2467 cond_resched(); 2468 2469 /* Lockless fast path for the common case of empty buckets */ 2470 if (empty_bucket(hinfo, st)) 2471 continue; 2472 2473 spin_lock_bh(lock); 2474 sk_nulls_for_each(sk, node, &hinfo->ehash[st->bucket].chain) { 2475 if (seq_sk_match(seq, sk)) 2476 return sk; 2477 } 2478 spin_unlock_bh(lock); 2479 } 2480 2481 return NULL; 2482 } 2483 2484 static void *established_get_next(struct seq_file *seq, void *cur) 2485 { 2486 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2487 struct tcp_iter_state *st = seq->private; 2488 struct hlist_nulls_node *node; 2489 struct sock *sk = cur; 2490 2491 ++st->num; 2492 ++st->offset; 2493 2494 sk = sk_nulls_next(sk); 2495 2496 sk_nulls_for_each_from(sk, node) { 2497 if (seq_sk_match(seq, sk)) 2498 return sk; 2499 } 2500 2501 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket)); 2502 ++st->bucket; 2503 return established_get_first(seq); 2504 } 2505 2506 static void *established_get_idx(struct seq_file *seq, loff_t pos) 2507 { 2508 struct tcp_iter_state *st = seq->private; 2509 void *rc; 2510 2511 st->bucket = 0; 2512 rc = established_get_first(seq); 2513 2514 while (rc && pos) { 2515 rc = established_get_next(seq, rc); 2516 --pos; 2517 } 2518 return rc; 2519 } 2520 2521 static void *tcp_get_idx(struct seq_file *seq, loff_t pos) 2522 { 2523 void *rc; 2524 struct tcp_iter_state *st = seq->private; 2525 2526 st->state = TCP_SEQ_STATE_LISTENING; 2527 rc = listening_get_idx(seq, &pos); 2528 2529 if (!rc) { 2530 st->state = TCP_SEQ_STATE_ESTABLISHED; 2531 rc = established_get_idx(seq, pos); 2532 } 2533 2534 return rc; 2535 } 2536 2537 static void *tcp_seek_last_pos(struct seq_file *seq) 2538 { 2539 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2540 struct 
tcp_iter_state *st = seq->private; 2541 int bucket = st->bucket; 2542 int offset = st->offset; 2543 int orig_num = st->num; 2544 void *rc = NULL; 2545 2546 switch (st->state) { 2547 case TCP_SEQ_STATE_LISTENING: 2548 if (st->bucket > hinfo->lhash2_mask) 2549 break; 2550 rc = listening_get_first(seq); 2551 while (offset-- && rc && bucket == st->bucket) 2552 rc = listening_get_next(seq, rc); 2553 if (rc) 2554 break; 2555 st->bucket = 0; 2556 st->state = TCP_SEQ_STATE_ESTABLISHED; 2557 fallthrough; 2558 case TCP_SEQ_STATE_ESTABLISHED: 2559 if (st->bucket > hinfo->ehash_mask) 2560 break; 2561 rc = established_get_first(seq); 2562 while (offset-- && rc && bucket == st->bucket) 2563 rc = established_get_next(seq, rc); 2564 } 2565 2566 st->num = orig_num; 2567 2568 return rc; 2569 } 2570 2571 void *tcp_seq_start(struct seq_file *seq, loff_t *pos) 2572 { 2573 struct tcp_iter_state *st = seq->private; 2574 void *rc; 2575 2576 if (*pos && *pos == st->last_pos) { 2577 rc = tcp_seek_last_pos(seq); 2578 if (rc) 2579 goto out; 2580 } 2581 2582 st->state = TCP_SEQ_STATE_LISTENING; 2583 st->num = 0; 2584 st->bucket = 0; 2585 st->offset = 0; 2586 rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; 2587 2588 out: 2589 st->last_pos = *pos; 2590 return rc; 2591 } 2592 EXPORT_SYMBOL(tcp_seq_start); 2593 2594 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) 2595 { 2596 struct tcp_iter_state *st = seq->private; 2597 void *rc = NULL; 2598 2599 if (v == SEQ_START_TOKEN) { 2600 rc = tcp_get_idx(seq, 0); 2601 goto out; 2602 } 2603 2604 switch (st->state) { 2605 case TCP_SEQ_STATE_LISTENING: 2606 rc = listening_get_next(seq, v); 2607 if (!rc) { 2608 st->state = TCP_SEQ_STATE_ESTABLISHED; 2609 st->bucket = 0; 2610 st->offset = 0; 2611 rc = established_get_first(seq); 2612 } 2613 break; 2614 case TCP_SEQ_STATE_ESTABLISHED: 2615 rc = established_get_next(seq, v); 2616 break; 2617 } 2618 out: 2619 ++*pos; 2620 st->last_pos = *pos; 2621 return rc; 2622 } 2623 EXPORT_SYMBOL(tcp_seq_next); 2624 2625 void tcp_seq_stop(struct seq_file *seq, void *v) 2626 { 2627 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2628 struct tcp_iter_state *st = seq->private; 2629 2630 switch (st->state) { 2631 case TCP_SEQ_STATE_LISTENING: 2632 if (v != SEQ_START_TOKEN) 2633 spin_unlock(&hinfo->lhash2[st->bucket].lock); 2634 break; 2635 case TCP_SEQ_STATE_ESTABLISHED: 2636 if (v) 2637 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket)); 2638 break; 2639 } 2640 } 2641 EXPORT_SYMBOL(tcp_seq_stop); 2642 2643 static void get_openreq4(const struct request_sock *req, 2644 struct seq_file *f, int i) 2645 { 2646 const struct inet_request_sock *ireq = inet_rsk(req); 2647 long delta = req->rsk_timer.expires - jiffies; 2648 2649 seq_printf(f, "%4d: %08X:%04X %08X:%04X" 2650 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK", 2651 i, 2652 ireq->ir_loc_addr, 2653 ireq->ir_num, 2654 ireq->ir_rmt_addr, 2655 ntohs(ireq->ir_rmt_port), 2656 TCP_SYN_RECV, 2657 0, 0, /* could print option size, but that is af dependent. 
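 * The two zeros stand in for the tx_queue and rx_queue
 * columns of /proc/net/tcp, which are not tracked for a
 * request socket.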
*/ 2658 1, /* timers active (only the expire timer) */ 2659 jiffies_delta_to_clock_t(delta), 2660 req->num_timeout, 2661 from_kuid_munged(seq_user_ns(f), 2662 sock_i_uid(req->rsk_listener)), 2663 0, /* non standard timer */ 2664 0, /* open_requests have no inode */ 2665 0, 2666 req); 2667 } 2668 2669 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) 2670 { 2671 int timer_active; 2672 unsigned long timer_expires; 2673 const struct tcp_sock *tp = tcp_sk(sk); 2674 const struct inet_connection_sock *icsk = inet_csk(sk); 2675 const struct inet_sock *inet = inet_sk(sk); 2676 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq; 2677 __be32 dest = inet->inet_daddr; 2678 __be32 src = inet->inet_rcv_saddr; 2679 __u16 destp = ntohs(inet->inet_dport); 2680 __u16 srcp = ntohs(inet->inet_sport); 2681 int rx_queue; 2682 int state; 2683 2684 if (icsk->icsk_pending == ICSK_TIME_RETRANS || 2685 icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT || 2686 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { 2687 timer_active = 1; 2688 timer_expires = icsk->icsk_timeout; 2689 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) { 2690 timer_active = 4; 2691 timer_expires = icsk->icsk_timeout; 2692 } else if (timer_pending(&sk->sk_timer)) { 2693 timer_active = 2; 2694 timer_expires = sk->sk_timer.expires; 2695 } else { 2696 timer_active = 0; 2697 timer_expires = jiffies; 2698 } 2699 2700 state = inet_sk_state_load(sk); 2701 if (state == TCP_LISTEN) 2702 rx_queue = READ_ONCE(sk->sk_ack_backlog); 2703 else 2704 /* Because we don't lock the socket, 2705 * we might find a transient negative value. 2706 */ 2707 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) - 2708 READ_ONCE(tp->copied_seq), 0); 2709 2710 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX " 2711 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d", 2712 i, src, srcp, dest, destp, state, 2713 READ_ONCE(tp->write_seq) - tp->snd_una, 2714 rx_queue, 2715 timer_active, 2716 jiffies_delta_to_clock_t(timer_expires - jiffies), 2717 icsk->icsk_retransmits, 2718 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)), 2719 icsk->icsk_probes_out, 2720 sock_i_ino(sk), 2721 refcount_read(&sk->sk_refcnt), sk, 2722 jiffies_to_clock_t(icsk->icsk_rto), 2723 jiffies_to_clock_t(icsk->icsk_ack.ato), 2724 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk), 2725 tcp_snd_cwnd(tp), 2726 state == TCP_LISTEN ? 2727 fastopenq->max_qlen : 2728 (tcp_in_initial_slowstart(tp) ? 
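/* ssthresh still has its "infinite" initial value here, so -1 is shown */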
-1 : tp->snd_ssthresh)); 2729 } 2730 2731 static void get_timewait4_sock(const struct inet_timewait_sock *tw, 2732 struct seq_file *f, int i) 2733 { 2734 long delta = tw->tw_timer.expires - jiffies; 2735 __be32 dest, src; 2736 __u16 destp, srcp; 2737 2738 dest = tw->tw_daddr; 2739 src = tw->tw_rcv_saddr; 2740 destp = ntohs(tw->tw_dport); 2741 srcp = ntohs(tw->tw_sport); 2742 2743 seq_printf(f, "%4d: %08X:%04X %08X:%04X" 2744 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK", 2745 i, src, srcp, dest, destp, tw->tw_substate, 0, 0, 2746 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0, 2747 refcount_read(&tw->tw_refcnt), tw); 2748 } 2749 2750 #define TMPSZ 150 2751 2752 static int tcp4_seq_show(struct seq_file *seq, void *v) 2753 { 2754 struct tcp_iter_state *st; 2755 struct sock *sk = v; 2756 2757 seq_setwidth(seq, TMPSZ - 1); 2758 if (v == SEQ_START_TOKEN) { 2759 seq_puts(seq, " sl local_address rem_address st tx_queue " 2760 "rx_queue tr tm->when retrnsmt uid timeout " 2761 "inode"); 2762 goto out; 2763 } 2764 st = seq->private; 2765 2766 if (sk->sk_state == TCP_TIME_WAIT) 2767 get_timewait4_sock(v, seq, st->num); 2768 else if (sk->sk_state == TCP_NEW_SYN_RECV) 2769 get_openreq4(v, seq, st->num); 2770 else 2771 get_tcp4_sock(v, seq, st->num); 2772 out: 2773 seq_pad(seq, '\n'); 2774 return 0; 2775 } 2776 2777 #ifdef CONFIG_BPF_SYSCALL 2778 struct bpf_tcp_iter_state { 2779 struct tcp_iter_state state; 2780 unsigned int cur_sk; 2781 unsigned int end_sk; 2782 unsigned int max_sk; 2783 struct sock **batch; 2784 bool st_bucket_done; 2785 }; 2786 2787 struct bpf_iter__tcp { 2788 __bpf_md_ptr(struct bpf_iter_meta *, meta); 2789 __bpf_md_ptr(struct sock_common *, sk_common); 2790 uid_t uid __aligned(8); 2791 }; 2792 2793 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, 2794 struct sock_common *sk_common, uid_t uid) 2795 { 2796 struct bpf_iter__tcp ctx; 2797 2798 meta->seq_num--; /* skip SEQ_START_TOKEN */ 2799 ctx.meta = meta; 2800 ctx.sk_common = sk_common; 2801 ctx.uid = uid; 2802 return bpf_iter_run_prog(prog, &ctx); 2803 } 2804 2805 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter) 2806 { 2807 while (iter->cur_sk < iter->end_sk) 2808 sock_gen_put(iter->batch[iter->cur_sk++]); 2809 } 2810 2811 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter, 2812 unsigned int new_batch_sz) 2813 { 2814 struct sock **new_batch; 2815 2816 new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz, 2817 GFP_USER | __GFP_NOWARN); 2818 if (!new_batch) 2819 return -ENOMEM; 2820 2821 bpf_iter_tcp_put_batch(iter); 2822 kvfree(iter->batch); 2823 iter->batch = new_batch; 2824 iter->max_sk = new_batch_sz; 2825 2826 return 0; 2827 } 2828 2829 static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq, 2830 struct sock *start_sk) 2831 { 2832 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2833 struct bpf_tcp_iter_state *iter = seq->private; 2834 struct tcp_iter_state *st = &iter->state; 2835 struct hlist_nulls_node *node; 2836 unsigned int expected = 1; 2837 struct sock *sk; 2838 2839 sock_hold(start_sk); 2840 iter->batch[iter->end_sk++] = start_sk; 2841 2842 sk = sk_nulls_next(start_sk); 2843 sk_nulls_for_each_from(sk, node) { 2844 if (seq_sk_match(seq, sk)) { 2845 if (iter->end_sk < iter->max_sk) { 2846 sock_hold(sk); 2847 iter->batch[iter->end_sk++] = sk; 2848 } 2849 expected++; 2850 } 2851 } 2852 spin_unlock(&hinfo->lhash2[st->bucket].lock); 2853 2854 return expected; 2855 } 2856 2857 static unsigned int 
bpf_iter_tcp_established_batch(struct seq_file *seq, 2858 struct sock *start_sk) 2859 { 2860 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2861 struct bpf_tcp_iter_state *iter = seq->private; 2862 struct tcp_iter_state *st = &iter->state; 2863 struct hlist_nulls_node *node; 2864 unsigned int expected = 1; 2865 struct sock *sk; 2866 2867 sock_hold(start_sk); 2868 iter->batch[iter->end_sk++] = start_sk; 2869 2870 sk = sk_nulls_next(start_sk); 2871 sk_nulls_for_each_from(sk, node) { 2872 if (seq_sk_match(seq, sk)) { 2873 if (iter->end_sk < iter->max_sk) { 2874 sock_hold(sk); 2875 iter->batch[iter->end_sk++] = sk; 2876 } 2877 expected++; 2878 } 2879 } 2880 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket)); 2881 2882 return expected; 2883 } 2884 2885 static struct sock *bpf_iter_tcp_batch(struct seq_file *seq) 2886 { 2887 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2888 struct bpf_tcp_iter_state *iter = seq->private; 2889 struct tcp_iter_state *st = &iter->state; 2890 unsigned int expected; 2891 bool resized = false; 2892 struct sock *sk; 2893 2894 /* The st->bucket is done. Directly advance to the next 2895 * bucket instead of having the tcp_seek_last_pos() to skip 2896 * one by one in the current bucket and eventually find out 2897 * it has to advance to the next bucket. 2898 */ 2899 if (iter->st_bucket_done) { 2900 st->offset = 0; 2901 st->bucket++; 2902 if (st->state == TCP_SEQ_STATE_LISTENING && 2903 st->bucket > hinfo->lhash2_mask) { 2904 st->state = TCP_SEQ_STATE_ESTABLISHED; 2905 st->bucket = 0; 2906 } 2907 } 2908 2909 again: 2910 /* Get a new batch */ 2911 iter->cur_sk = 0; 2912 iter->end_sk = 0; 2913 iter->st_bucket_done = false; 2914 2915 sk = tcp_seek_last_pos(seq); 2916 if (!sk) 2917 return NULL; /* Done */ 2918 2919 if (st->state == TCP_SEQ_STATE_LISTENING) 2920 expected = bpf_iter_tcp_listening_batch(seq, sk); 2921 else 2922 expected = bpf_iter_tcp_established_batch(seq, sk); 2923 2924 if (iter->end_sk == expected) { 2925 iter->st_bucket_done = true; 2926 return sk; 2927 } 2928 2929 if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) { 2930 resized = true; 2931 goto again; 2932 } 2933 2934 return sk; 2935 } 2936 2937 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos) 2938 { 2939 /* bpf iter does not support lseek, so it always 2940 * continue from where it was stop()-ped. 2941 */ 2942 if (*pos) 2943 return bpf_iter_tcp_batch(seq); 2944 2945 return SEQ_START_TOKEN; 2946 } 2947 2948 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) 2949 { 2950 struct bpf_tcp_iter_state *iter = seq->private; 2951 struct tcp_iter_state *st = &iter->state; 2952 struct sock *sk; 2953 2954 /* Whenever seq_next() is called, the iter->cur_sk is 2955 * done with seq_show(), so advance to the next sk in 2956 * the batch. 2957 */ 2958 if (iter->cur_sk < iter->end_sk) { 2959 /* Keeping st->num consistent in tcp_iter_state. 2960 * bpf_iter_tcp does not use st->num. 2961 * meta.seq_num is used instead. 2962 */ 2963 st->num++; 2964 /* Move st->offset to the next sk in the bucket such that 2965 * the future start() will resume at st->offset in 2966 * st->bucket. See tcp_seek_last_pos(). 2967 */ 2968 st->offset++; 2969 sock_gen_put(iter->batch[iter->cur_sk++]); 2970 } 2971 2972 if (iter->cur_sk < iter->end_sk) 2973 sk = iter->batch[iter->cur_sk]; 2974 else 2975 sk = bpf_iter_tcp_batch(seq); 2976 2977 ++*pos; 2978 /* Keeping st->last_pos consistent in tcp_iter_state. 
2979 * bpf iter does not do lseek, so st->last_pos always equals to *pos. 2980 */ 2981 st->last_pos = *pos; 2982 return sk; 2983 } 2984 2985 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v) 2986 { 2987 struct bpf_iter_meta meta; 2988 struct bpf_prog *prog; 2989 struct sock *sk = v; 2990 uid_t uid; 2991 int ret; 2992 2993 if (v == SEQ_START_TOKEN) 2994 return 0; 2995 2996 if (sk_fullsock(sk)) 2997 lock_sock(sk); 2998 2999 if (unlikely(sk_unhashed(sk))) { 3000 ret = SEQ_SKIP; 3001 goto unlock; 3002 } 3003 3004 if (sk->sk_state == TCP_TIME_WAIT) { 3005 uid = 0; 3006 } else if (sk->sk_state == TCP_NEW_SYN_RECV) { 3007 const struct request_sock *req = v; 3008 3009 uid = from_kuid_munged(seq_user_ns(seq), 3010 sock_i_uid(req->rsk_listener)); 3011 } else { 3012 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)); 3013 } 3014 3015 meta.seq = seq; 3016 prog = bpf_iter_get_info(&meta, false); 3017 ret = tcp_prog_seq_show(prog, &meta, v, uid); 3018 3019 unlock: 3020 if (sk_fullsock(sk)) 3021 release_sock(sk); 3022 return ret; 3023 3024 } 3025 3026 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v) 3027 { 3028 struct bpf_tcp_iter_state *iter = seq->private; 3029 struct bpf_iter_meta meta; 3030 struct bpf_prog *prog; 3031 3032 if (!v) { 3033 meta.seq = seq; 3034 prog = bpf_iter_get_info(&meta, true); 3035 if (prog) 3036 (void)tcp_prog_seq_show(prog, &meta, v, 0); 3037 } 3038 3039 if (iter->cur_sk < iter->end_sk) { 3040 bpf_iter_tcp_put_batch(iter); 3041 iter->st_bucket_done = false; 3042 } 3043 } 3044 3045 static const struct seq_operations bpf_iter_tcp_seq_ops = { 3046 .show = bpf_iter_tcp_seq_show, 3047 .start = bpf_iter_tcp_seq_start, 3048 .next = bpf_iter_tcp_seq_next, 3049 .stop = bpf_iter_tcp_seq_stop, 3050 }; 3051 #endif 3052 static unsigned short seq_file_family(const struct seq_file *seq) 3053 { 3054 const struct tcp_seq_afinfo *afinfo; 3055 3056 #ifdef CONFIG_BPF_SYSCALL 3057 /* Iterated from bpf_iter. Let the bpf prog to filter instead. */ 3058 if (seq->op == &bpf_iter_tcp_seq_ops) 3059 return AF_UNSPEC; 3060 #endif 3061 3062 /* Iterated from proc fs */ 3063 afinfo = pde_data(file_inode(seq->file)); 3064 return afinfo->family; 3065 } 3066 3067 static const struct seq_operations tcp4_seq_ops = { 3068 .show = tcp4_seq_show, 3069 .start = tcp_seq_start, 3070 .next = tcp_seq_next, 3071 .stop = tcp_seq_stop, 3072 }; 3073 3074 static struct tcp_seq_afinfo tcp4_seq_afinfo = { 3075 .family = AF_INET, 3076 }; 3077 3078 static int __net_init tcp4_proc_init_net(struct net *net) 3079 { 3080 if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops, 3081 sizeof(struct tcp_iter_state), &tcp4_seq_afinfo)) 3082 return -ENOMEM; 3083 return 0; 3084 } 3085 3086 static void __net_exit tcp4_proc_exit_net(struct net *net) 3087 { 3088 remove_proc_entry("tcp", net->proc_net); 3089 } 3090 3091 static struct pernet_operations tcp4_net_ops = { 3092 .init = tcp4_proc_init_net, 3093 .exit = tcp4_proc_exit_net, 3094 }; 3095 3096 int __init tcp4_proc_init(void) 3097 { 3098 return register_pernet_subsys(&tcp4_net_ops); 3099 } 3100 3101 void tcp4_proc_exit(void) 3102 { 3103 unregister_pernet_subsys(&tcp4_net_ops); 3104 } 3105 #endif /* CONFIG_PROC_FS */ 3106 3107 /* @wake is one when sk_stream_write_space() calls us. 3108 * This sends EPOLLOUT only if notsent_bytes is half the limit. 3109 * This mimics the strategy used in sock_def_write_space(). 
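 * With @wake set, the comparison below is effectively
 * notsent_bytes * 2 < tcp_notsent_lowat(tp), i.e. EPOLLOUT is reported
 * only once notsent_bytes has dropped below half of the limit.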
3110 */ 3111 bool tcp_stream_memory_free(const struct sock *sk, int wake) 3112 { 3113 const struct tcp_sock *tp = tcp_sk(sk); 3114 u32 notsent_bytes = READ_ONCE(tp->write_seq) - 3115 READ_ONCE(tp->snd_nxt); 3116 3117 return (notsent_bytes << wake) < tcp_notsent_lowat(tp); 3118 } 3119 EXPORT_SYMBOL(tcp_stream_memory_free); 3120 3121 struct proto tcp_prot = { 3122 .name = "TCP", 3123 .owner = THIS_MODULE, 3124 .close = tcp_close, 3125 .pre_connect = tcp_v4_pre_connect, 3126 .connect = tcp_v4_connect, 3127 .disconnect = tcp_disconnect, 3128 .accept = inet_csk_accept, 3129 .ioctl = tcp_ioctl, 3130 .init = tcp_v4_init_sock, 3131 .destroy = tcp_v4_destroy_sock, 3132 .shutdown = tcp_shutdown, 3133 .setsockopt = tcp_setsockopt, 3134 .getsockopt = tcp_getsockopt, 3135 .bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt, 3136 .keepalive = tcp_set_keepalive, 3137 .recvmsg = tcp_recvmsg, 3138 .sendmsg = tcp_sendmsg, 3139 .splice_eof = tcp_splice_eof, 3140 .backlog_rcv = tcp_v4_do_rcv, 3141 .release_cb = tcp_release_cb, 3142 .hash = inet_hash, 3143 .unhash = inet_unhash, 3144 .get_port = inet_csk_get_port, 3145 .put_port = inet_put_port, 3146 #ifdef CONFIG_BPF_SYSCALL 3147 .psock_update_sk_prot = tcp_bpf_update_proto, 3148 #endif 3149 .enter_memory_pressure = tcp_enter_memory_pressure, 3150 .leave_memory_pressure = tcp_leave_memory_pressure, 3151 .stream_memory_free = tcp_stream_memory_free, 3152 .sockets_allocated = &tcp_sockets_allocated, 3153 .orphan_count = &tcp_orphan_count, 3154 3155 .memory_allocated = &tcp_memory_allocated, 3156 .per_cpu_fw_alloc = &tcp_memory_per_cpu_fw_alloc, 3157 3158 .memory_pressure = &tcp_memory_pressure, 3159 .sysctl_mem = sysctl_tcp_mem, 3160 .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem), 3161 .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), 3162 .max_header = MAX_TCP_HEADER, 3163 .obj_size = sizeof(struct tcp_sock), 3164 .slab_flags = SLAB_TYPESAFE_BY_RCU, 3165 .twsk_prot = &tcp_timewait_sock_ops, 3166 .rsk_prot = &tcp_request_sock_ops, 3167 .h.hashinfo = NULL, 3168 .no_autobind = true, 3169 .diag_destroy = tcp_abort, 3170 }; 3171 EXPORT_SYMBOL(tcp_prot); 3172 3173 static void __net_exit tcp_sk_exit(struct net *net) 3174 { 3175 if (net->ipv4.tcp_congestion_control) 3176 bpf_module_put(net->ipv4.tcp_congestion_control, 3177 net->ipv4.tcp_congestion_control->owner); 3178 } 3179 3180 static void __net_init tcp_set_hashinfo(struct net *net) 3181 { 3182 struct inet_hashinfo *hinfo; 3183 unsigned int ehash_entries; 3184 struct net *old_net; 3185 3186 if (net_eq(net, &init_net)) 3187 goto fallback; 3188 3189 old_net = current->nsproxy->net_ns; 3190 ehash_entries = READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries); 3191 if (!ehash_entries) 3192 goto fallback; 3193 3194 ehash_entries = roundup_pow_of_two(ehash_entries); 3195 hinfo = inet_pernet_hashinfo_alloc(&tcp_hashinfo, ehash_entries); 3196 if (!hinfo) { 3197 pr_warn("Failed to allocate TCP ehash (entries: %u) " 3198 "for a netns, fallback to the global one\n", 3199 ehash_entries); 3200 fallback: 3201 hinfo = &tcp_hashinfo; 3202 ehash_entries = tcp_hashinfo.ehash_mask + 1; 3203 } 3204 3205 net->ipv4.tcp_death_row.hashinfo = hinfo; 3206 net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2; 3207 net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128); 3208 } 3209 3210 static int __net_init tcp_sk_init(struct net *net) 3211 { 3212 net->ipv4.sysctl_tcp_ecn = 2; 3213 net->ipv4.sysctl_tcp_ecn_fallback = 1; 3214 3215 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS; 3216 
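	/* The remaining MTU probing, keepalive and retry defaults below come
	 * straight from the corresponding TCP_* constants.
	 */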
net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS; 3217 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD; 3218 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL; 3219 net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS; 3220 3221 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME; 3222 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES; 3223 net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL; 3224 3225 net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES; 3226 net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES; 3227 net->ipv4.sysctl_tcp_syncookies = 1; 3228 net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH; 3229 net->ipv4.sysctl_tcp_retries1 = TCP_RETR1; 3230 net->ipv4.sysctl_tcp_retries2 = TCP_RETR2; 3231 net->ipv4.sysctl_tcp_orphan_retries = 0; 3232 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT; 3233 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX; 3234 net->ipv4.sysctl_tcp_tw_reuse = 2; 3235 net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1; 3236 3237 refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1); 3238 tcp_set_hashinfo(net); 3239 3240 net->ipv4.sysctl_tcp_sack = 1; 3241 net->ipv4.sysctl_tcp_window_scaling = 1; 3242 net->ipv4.sysctl_tcp_timestamps = 1; 3243 net->ipv4.sysctl_tcp_early_retrans = 3; 3244 net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION; 3245 net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */ 3246 net->ipv4.sysctl_tcp_retrans_collapse = 1; 3247 net->ipv4.sysctl_tcp_max_reordering = 300; 3248 net->ipv4.sysctl_tcp_dsack = 1; 3249 net->ipv4.sysctl_tcp_app_win = 31; 3250 net->ipv4.sysctl_tcp_adv_win_scale = 1; 3251 net->ipv4.sysctl_tcp_frto = 2; 3252 net->ipv4.sysctl_tcp_moderate_rcvbuf = 1; 3253 /* This limits the percentage of the congestion window which we 3254 * will allow a single TSO frame to consume. Building TSO frames 3255 * which are too large can cause TCP streams to be bursty. 3256 */ 3257 net->ipv4.sysctl_tcp_tso_win_divisor = 3; 3258 /* Default TSQ limit of 16 TSO segments */ 3259 net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536; 3260 3261 /* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. 
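	 * (INT_MAX effectively turns the limit off; lowering the sysctl at
	 * runtime re-enables rate limiting.)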
*/ 3262 net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX; 3263 3264 net->ipv4.sysctl_tcp_min_tso_segs = 2; 3265 net->ipv4.sysctl_tcp_tso_rtt_log = 9; /* 2^9 = 512 usec */ 3266 net->ipv4.sysctl_tcp_min_rtt_wlen = 300; 3267 net->ipv4.sysctl_tcp_autocorking = 1; 3268 net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2; 3269 net->ipv4.sysctl_tcp_pacing_ss_ratio = 200; 3270 net->ipv4.sysctl_tcp_pacing_ca_ratio = 120; 3271 if (net != &init_net) { 3272 memcpy(net->ipv4.sysctl_tcp_rmem, 3273 init_net.ipv4.sysctl_tcp_rmem, 3274 sizeof(init_net.ipv4.sysctl_tcp_rmem)); 3275 memcpy(net->ipv4.sysctl_tcp_wmem, 3276 init_net.ipv4.sysctl_tcp_wmem, 3277 sizeof(init_net.ipv4.sysctl_tcp_wmem)); 3278 } 3279 net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC; 3280 net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC; 3281 net->ipv4.sysctl_tcp_comp_sack_nr = 44; 3282 net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; 3283 net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0; 3284 atomic_set(&net->ipv4.tfo_active_disable_times, 0); 3285 3286 /* Set default values for PLB */ 3287 net->ipv4.sysctl_tcp_plb_enabled = 0; /* Disabled by default */ 3288 net->ipv4.sysctl_tcp_plb_idle_rehash_rounds = 3; 3289 net->ipv4.sysctl_tcp_plb_rehash_rounds = 12; 3290 net->ipv4.sysctl_tcp_plb_suspend_rto_sec = 60; 3291 /* Default congestion threshold for PLB to mark a round is 50% */ 3292 net->ipv4.sysctl_tcp_plb_cong_thresh = (1 << TCP_PLB_SCALE) / 2; 3293 3294 /* Reno is always built in */ 3295 if (!net_eq(net, &init_net) && 3296 bpf_try_module_get(init_net.ipv4.tcp_congestion_control, 3297 init_net.ipv4.tcp_congestion_control->owner)) 3298 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control; 3299 else 3300 net->ipv4.tcp_congestion_control = &tcp_reno; 3301 3302 net->ipv4.sysctl_tcp_syn_linear_timeouts = 4; 3303 net->ipv4.sysctl_tcp_shrink_window = 0; 3304 3305 return 0; 3306 } 3307 3308 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list) 3309 { 3310 struct net *net; 3311 3312 /* make sure concurrent calls to tcp_sk_exit_batch from net_cleanup_work 3313 * and failed setup_net error unwinding path are serialized. 3314 * 3315 * tcp_twsk_purge() handles twsk in any dead netns, not just those in 3316 * net_exit_list, the thread that dismantles a particular twsk must 3317 * do so without other thread progressing to refcount_dec_and_test() of 3318 * tcp_death_row.tw_refcount. 
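	 * Holding tcp_exit_batch_mutex across both the purge and the
	 * refcount checks below provides that serialization.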
3319 */ 3320 mutex_lock(&tcp_exit_batch_mutex); 3321 3322 tcp_twsk_purge(net_exit_list); 3323 3324 list_for_each_entry(net, net_exit_list, exit_list) { 3325 inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo); 3326 WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount)); 3327 tcp_fastopen_ctx_destroy(net); 3328 } 3329 3330 mutex_unlock(&tcp_exit_batch_mutex); 3331 } 3332 3333 static struct pernet_operations __net_initdata tcp_sk_ops = { 3334 .init = tcp_sk_init, 3335 .exit = tcp_sk_exit, 3336 .exit_batch = tcp_sk_exit_batch, 3337 }; 3338 3339 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 3340 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta, 3341 struct sock_common *sk_common, uid_t uid) 3342 3343 #define INIT_BATCH_SZ 16 3344 3345 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux) 3346 { 3347 struct bpf_tcp_iter_state *iter = priv_data; 3348 int err; 3349 3350 err = bpf_iter_init_seq_net(priv_data, aux); 3351 if (err) 3352 return err; 3353 3354 err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ); 3355 if (err) { 3356 bpf_iter_fini_seq_net(priv_data); 3357 return err; 3358 } 3359 3360 return 0; 3361 } 3362 3363 static void bpf_iter_fini_tcp(void *priv_data) 3364 { 3365 struct bpf_tcp_iter_state *iter = priv_data; 3366 3367 bpf_iter_fini_seq_net(priv_data); 3368 kvfree(iter->batch); 3369 } 3370 3371 static const struct bpf_iter_seq_info tcp_seq_info = { 3372 .seq_ops = &bpf_iter_tcp_seq_ops, 3373 .init_seq_private = bpf_iter_init_tcp, 3374 .fini_seq_private = bpf_iter_fini_tcp, 3375 .seq_priv_size = sizeof(struct bpf_tcp_iter_state), 3376 }; 3377 3378 static const struct bpf_func_proto * 3379 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id, 3380 const struct bpf_prog *prog) 3381 { 3382 switch (func_id) { 3383 case BPF_FUNC_setsockopt: 3384 return &bpf_sk_setsockopt_proto; 3385 case BPF_FUNC_getsockopt: 3386 return &bpf_sk_getsockopt_proto; 3387 default: 3388 return NULL; 3389 } 3390 } 3391 3392 static struct bpf_iter_reg tcp_reg_info = { 3393 .target = "tcp", 3394 .ctx_arg_info_size = 1, 3395 .ctx_arg_info = { 3396 { offsetof(struct bpf_iter__tcp, sk_common), 3397 PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED }, 3398 }, 3399 .get_func_proto = bpf_iter_tcp_get_func_proto, 3400 .seq_info = &tcp_seq_info, 3401 }; 3402 3403 static void __init bpf_iter_register(void) 3404 { 3405 tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON]; 3406 if (bpf_iter_reg_target(&tcp_reg_info)) 3407 pr_warn("Warning: could not register bpf iterator tcp\n"); 3408 } 3409 3410 #endif 3411 3412 void __init tcp_v4_init(void) 3413 { 3414 int cpu, res; 3415 3416 for_each_possible_cpu(cpu) { 3417 struct sock *sk; 3418 3419 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW, 3420 IPPROTO_TCP, &init_net); 3421 if (res) 3422 panic("Failed to create the TCP control socket.\n"); 3423 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); 3424 3425 /* Please enforce IP_DF and IPID==0 for RST and 3426 * ACK sent in SYN-RECV and TIME-WAIT state. 3427 */ 3428 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO; 3429 3430 per_cpu(ipv4_tcp_sk, cpu) = sk; 3431 } 3432 if (register_pernet_subsys(&tcp_sk_ops)) 3433 panic("Failed to create the TCP control socket.\n"); 3434 3435 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 3436 bpf_iter_register(); 3437 #endif 3438 } 3439