// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 *		IPv4 specific functions
 *
 *		code split from:
 *		linux/ipv4/tcp.c
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *		See tcp.c for author information
 */

/*
 * Changes:
 *	David S. Miller	:	New socket lookup architecture.
 *				This code is dedicated to John Dyson.
 *	David S. Miller :	Change semantics of established hash,
 *				half is devoted to TIME_WAIT sockets
 *				and the rest go in the other half.
 *	Andi Kleen :		Add support for syncookies and fixed
 *				some bugs: ip options weren't passed to
 *				the TCP layer, missed a check for an
 *				ACK bit.
 *	Andi Kleen :		Implemented fast path mtu discovery.
 *				Fixed many serious bugs in the
 *				request_sock handling and moved
 *				most of it into the af independent code.
 *				Added tail drop and some other bugfixes.
 *				Added new listen semantics.
 *	Mike McLagan	:	Routing by source
 *	Juan Jose Ciarlante:	ip_dynaddr bits
 *	Andi Kleen:		various fixes.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year
 *				coma.
 *	Andi Kleen	:	Fix new listen.
 *	Andi Kleen	:	Fix accept error reporting.
 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
 *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
 *					a single port at the same time.
 */

#define pr_fmt(fmt) "TCP: " fmt

#include <linux/bottom_half.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <linux/sched.h>

#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/secure_seq.h>
#include <net/busy_poll.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/inetdevice.h>
#include <linux/btf_ids.h>

#include <crypto/hash.h>
#include <linux/scatterlist.h>

#include <trace/events/tcp.h>

#ifdef CONFIG_TCP_MD5SIG
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
#endif

struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);

static DEFINE_PER_CPU(struct sock *, ipv4_tcp_sk);

static u32 tcp_v4_init_seq(const struct sk_buff *skb)
{
	return secure_tcp_seq(ip_hdr(skb)->daddr,
			      ip_hdr(skb)->saddr,
			      tcp_hdr(skb)->dest,
			      tcp_hdr(skb)->source);
}

static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
{
	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
}

int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
	int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
	const struct inet_timewait_sock *tw = inet_twsk(sktw);
	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
	struct tcp_sock *tp = tcp_sk(sk);

	if (reuse == 2) {
		/* Still does not detect *everything* that goes through
		 * lo, since we require a loopback src or dst address
		 * or direct binding to 'lo' interface.
		 */
		bool loopback = false;
		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
			loopback = true;
#if IS_ENABLED(CONFIG_IPV6)
		if (tw->tw_family == AF_INET6) {
			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
				loopback = true;
		} else
#endif
		{
			if (ipv4_is_loopback(tw->tw_daddr) ||
			    ipv4_is_loopback(tw->tw_rcv_saddr))
				loopback = true;
		}
		if (!loopback)
			reuse = 0;
	}

	/* With PAWS, it is safe from the viewpoint
	   of data integrity. Even without PAWS it is safe provided sequence
	   spaces do not overlap i.e. at data rates <= 80 Mbit/sec.

	   Actually, the idea is close to VJ's: only the timestamp cache is
	   held not per host but per port pair, and the TW bucket is used as
	   the state holder.

	   If the TW bucket has already been destroyed we fall back to VJ's
	   scheme and use the initial timestamp retrieved from the peer table.
	 */
	if (tcptw->tw_ts_recent_stamp &&
	    (!twp || (reuse && time_after32(ktime_get_seconds(),
					    tcptw->tw_ts_recent_stamp)))) {
		/* inet_twsk_hashdance() sets sk_refcnt after putting twsk
		 * and releasing the bucket lock.
		 */
		if (unlikely(!refcount_inc_not_zero(&sktw->sk_refcnt)))
			return 0;

		/* In case of repair and re-using TIME-WAIT sockets we still
		 * want to be sure that it is safe as above but honor the
		 * sequence numbers and time stamps set as part of the repair
		 * process.
		 *
		 * Without this check re-using a TIME-WAIT socket with TCP
		 * repair would accumulate a -1 on the repair assigned
		 * sequence number. The first time it is reused the sequence
		 * is -1, the second time -2, etc. This fixes that issue
		 * without appearing to create any others.
		 */
		if (likely(!tp->repair)) {
			u32 seq = tcptw->tw_snd_nxt + 65535 + 2;

			if (!seq)
				seq = 1;
			WRITE_ONCE(tp->write_seq, seq);
			tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
			tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
		}

		return 1;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(tcp_twsk_unique);

static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
			      int addr_len)
{
	/* This check is replicated from tcp_v4_connect() and intended to
	 * prevent the BPF program called below from accessing bytes that are
	 * outside the bound specified by the user in addr_len.
	 */
	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	sock_owned_by_me(sk);

	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr, &addr_len);
}

/* This will initiate an outgoing connection.
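 *
 * Descriptive note (summary of the code below, not part of the original
 * comment): route the destination (honouring a source route option if
 * present), bind a source address/port if needed, move the socket to
 * SYN-SENT and hash it, pick the initial sequence number and timestamp
 * offset, then hand off to tcp_connect() to emit the SYN.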
 */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct inet_timewait_death_row *tcp_death_row;
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct ip_options_rcu *inet_opt;
	struct net *net = sock_net(sk);
	__be16 orig_sport, orig_dport;
	__be32 daddr, nexthop;
	struct flowi4 *fl4;
	struct rtable *rt;
	int err;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	inet_opt = rcu_dereference_protected(inet->inet_opt,
					     lockdep_sock_is_held(sk));
	if (inet_opt && inet_opt->opt.srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet_opt->opt.faddr;
	}

	orig_sport = inet->inet_sport;
	orig_dport = usin->sin_port;
	fl4 = &inet->cork.fl.u.ip4;
	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
			      sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
			      orig_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		if (err == -ENETUNREACH)
			IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
		return err;
	}

	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet_opt || !inet_opt->opt.srr)
		daddr = fl4->daddr;

	tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;

	if (!inet->inet_saddr) {
		err = inet_bhash2_update_saddr(sk, &fl4->saddr, AF_INET);
		if (err) {
			ip_rt_put(rt);
			return err;
		}
	} else {
		sk_rcv_saddr_set(sk, inet->inet_saddr);
	}

	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent	   = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		if (likely(!tp->repair))
			WRITE_ONCE(tp->write_seq, 0);
	}

	inet->inet_dport = usin->sin_port;
	sk_daddr_set(sk, daddr);

	inet_csk(sk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;

	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

	/* Socket identity is still unknown (sport may be zero).
	 * However, we set the state to SYN-SENT and, without releasing the
	 * socket lock, select a source port, enter ourselves into the hash
	 * tables and complete the initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(tcp_death_row, sk);
	if (err)
		goto failure;

	sk_set_txhash(sk);

	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
			       inet->inet_sport, inet->inet_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		rt = NULL;
		goto failure;
	}
	/* OK, now commit destination to socket.  */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->dst);
	rt = NULL;

	if (likely(!tp->repair)) {
		if (!tp->write_seq)
			WRITE_ONCE(tp->write_seq,
				   secure_tcp_seq(inet->inet_saddr,
						  inet->inet_daddr,
						  inet->inet_sport,
						  usin->sin_port));
		WRITE_ONCE(tp->tsoffset,
			   secure_tcp_ts_off(net, inet->inet_saddr,
					     inet->inet_daddr));
	}

	atomic_set(&inet->inet_id, get_random_u16());

	if (tcp_fastopen_defer_connect(sk, &err))
		return err;
	if (err)
		goto failure;

	err = tcp_connect(sk);

	if (err)
		goto failure;

	return 0;

failure:
	/*
	 * This unhashes the socket and releases the local port,
	 * if necessary.
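	 *
	 * Descriptive note (not part of the original comment): the failure
	 * path below also drops the cached route, clears sk_route_caps and
	 * zeroes inet_dport, leaving the socket in a clean TCP_CLOSE state.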
	 */
	tcp_set_state(sk, TCP_CLOSE);
	inet_bhash2_reset_saddr(sk);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->inet_dport = 0;
	return err;
}
EXPORT_SYMBOL(tcp_v4_connect);

/*
 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC 1191.
 * It can be called through tcp_release_cb() if the socket was owned by the
 * user at the time tcp_v4_err() was called to handle the ICMP message.
 */
void tcp_v4_mtu_reduced(struct sock *sk)
{
	struct inet_sock *inet = inet_sk(sk);
	struct dst_entry *dst;
	u32 mtu;

	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
		return;
	mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
	dst = inet_csk_update_pmtu(sk, mtu);
	if (!dst)
		return;

	/* Something is about to go wrong... Remember the soft error
	 * in case this connection is not able to recover.
	 */
	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
		WRITE_ONCE(sk->sk_err_soft, EMSGSIZE);

	mtu = dst_mtu(dst);

	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    ip_sk_accept_pmtu(sk) &&
	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
		tcp_sync_mss(sk, mtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}
EXPORT_SYMBOL(tcp_v4_mtu_reduced);

static void do_redirect(struct sk_buff *skb, struct sock *sk)
{
	struct dst_entry *dst = __sk_dst_check(sk, 0);

	if (dst)
		dst->ops->redirect(dst, sk, skb);
}

/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
void tcp_req_err(struct sock *sk, u32 seq, bool abort)
{
	struct request_sock *req = inet_reqsk(sk);
	struct net *net = sock_net(sk);

	/* ICMPs are not backlogged, hence we cannot get
	 * an established socket here.
	 */
	if (seq != tcp_rsk(req)->snt_isn) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
	} else if (abort) {
		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
		tcp_listendrop(req->rsk_listener);
	}
	reqsk_put(req);
}
EXPORT_SYMBOL(tcp_req_err);

/* TCP-LD (RFC 6069) logic */
void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;
	s32 remaining;
	u32 delta_us;

	if (sock_owned_by_user(sk))
		return;

	if (seq != tp->snd_una || !icsk->icsk_retransmits ||
	    !icsk->icsk_backoff)
		return;

	skb = tcp_rtx_queue_head(sk);
	if (WARN_ON_ONCE(!skb))
		return;

	icsk->icsk_backoff--;
	icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
	icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);

	tcp_mstamp_refresh(tp);
	delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
	remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);

	if (remaining > 0) {
		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
					  remaining, TCP_RTO_MAX);
	} else {
		/* RTO revert clocked out retransmission.
		 * Will retransmit now.
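		 *
		 * Descriptive note (not part of the original comment): the
		 * un-backed-off RTO, measured from the head of the rtx queue,
		 * has already elapsed, so fire the retransmit path right away
		 * instead of re-arming the timer.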
		 */
		tcp_retransmit_timer(sk);
	}
}
EXPORT_SYMBOL(tcp_ld_RTO_revert);

/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment,
 * header points to the first 8 bytes of the TCP header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 *
 */

int tcp_v4_err(struct sk_buff *skb, u32 info)
{
	const struct iphdr *iph = (const struct iphdr *)skb->data;
	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
	struct tcp_sock *tp;
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	struct sock *sk;
	struct request_sock *fastopen;
	u32 seq, snd_una;
	int err;
	struct net *net = dev_net(skb->dev);

	sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
				       iph->daddr, th->dest, iph->saddr,
				       ntohs(th->source), inet_iif(skb), 0);
	if (!sk) {
		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
		return -ENOENT;
	}
	if (sk->sk_state == TCP_TIME_WAIT) {
		inet_twsk_put(inet_twsk(sk));
		return 0;
	}
	seq = ntohl(th->seq);
	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
				     type == ICMP_TIME_EXCEEDED ||
				     (type == ICMP_DEST_UNREACH &&
				      (code == ICMP_NET_UNREACH ||
				       code == ICMP_HOST_UNREACH)));
		return 0;
	}

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 * We do take care of the PMTU discovery (RFC 1191) special case:
	 * we can receive locally generated ICMP messages while socket is held.
	 */
	if (sock_owned_by_user(sk)) {
		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
	}
	if (sk->sk_state == TCP_CLOSE)
		goto out;

	if (static_branch_unlikely(&ip4_min_ttl)) {
		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
			goto out;
		}
	}

	tp = tcp_sk(sk);
	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
	fastopen = rcu_dereference(tp->fastopen_rsk);
	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, snd_una, tp->snd_nxt)) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	switch (type) {
	case ICMP_REDIRECT:
		if (!sock_owned_by_user(sk))
			do_redirect(skb, sk);
		goto out;
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC 1191) */
			/* We are not interested in TCP_LISTEN and open_requests
			 * (SYN-ACKs sent out by Linux are always < 576 bytes so
			 * they should go through unfragmented).
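			 *
			 * Descriptive note (not part of the original comment):
			 * for established sockets the new MTU is stashed in
			 * tp->mtu_info and either handled right away or
			 * deferred to tcp_release_cb() through the
			 * TCP_MTU_REDUCED_DEFERRED flag, as done just below.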
			 */
			if (sk->sk_state == TCP_LISTEN)
				goto out;

			WRITE_ONCE(tp->mtu_info, info);
			if (!sock_owned_by_user(sk)) {
				tcp_v4_mtu_reduced(sk);
			} else {
				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
					sock_hold(sk);
			}
			goto out;
		}

		err = icmp_err_convert[code].errno;
		/* check if this ICMP message allows revert of backoff.
		 * (see RFC 6069)
		 */
		if (!fastopen &&
		    (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
			tcp_ld_RTO_revert(sk, seq);
		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->sk_state) {
	case TCP_SYN_SENT:
	case TCP_SYN_RECV:
		/* Only in fast or simultaneous open. If a fast open socket is
		 * already accepted it is treated as a connected one below.
		 */
		if (fastopen && !fastopen->sk)
			break;

		ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);

		if (!sock_owned_by_user(sk)) {
			WRITE_ONCE(sk->sk_err, err);

			sk_error_report(sk);

			tcp_done(sk);
		} else {
			WRITE_ONCE(sk->sk_err_soft, err);
		}
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * RFC 1122 4.2.3.9 allows us to consider only PROTO_UNREACH and
	 * PORT_UNREACH as hard errors (well, FRAG_FAILED too,
	 * but it is obsoleted by PMTU discovery).
	 *
	 * Note that in the modern internet, where routing is unreliable
	 * and broken firewalls sit in every dark corner sending random
	 * errors ordered by their masters, even these two messages have
	 * finally lost their original sense (even Linux sends invalid
	 * PORT_UNREACHs).
	 *
	 * Now we are in compliance with RFCs.
	 * --ANK (980905)
	 */

	if (!sock_owned_by_user(sk) &&
	    inet_test_bit(RECVERR, sk)) {
		WRITE_ONCE(sk->sk_err, err);
		sk_error_report(sk);
	} else	{ /* Only an error on timeout */
		WRITE_ONCE(sk->sk_err_soft, err);
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
	return 0;
}

void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
{
	struct tcphdr *th = tcp_hdr(skb);

	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
	skb->csum_start = skb_transport_header(skb) - skb->head;
	skb->csum_offset = offsetof(struct tcphdr, check);
}

/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
{
	const struct inet_sock *inet = inet_sk(sk);

	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
}
EXPORT_SYMBOL(tcp_v4_send_check);

/*
 *	This routine will send an RST to the other tcp.
 *
 *	Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
 *	for the reset?
 *	Answer: if a packet caused the RST, it is not for a socket
 *		existing in our system; if it is matched to a socket,
 *		it is just a duplicate segment or a bug in the other
 *		side's TCP.
 *	So we build the reply based only on the parameters that arrived
 *	with the segment.
 *	Exception: precedence violation. We do not implement it in any case.
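 *
 *	As an illustration (a summary of the code below, not part of the
 *	original comment), the sequence numbers of the RST follow RFC 793:
 *	if the incoming segment carried an ACK, we send SEQ = SEG.ACK with
 *	no ACK flag; otherwise we send a SEQ of zero and
 *	ACK = SEG.SEQ + SEG.LEN (counting SYN and FIN), with the ACK flag set.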
 */

#ifdef CONFIG_TCP_MD5SIG
#define OPTION_BYTES TCPOLEN_MD5SIG_ALIGNED
#else
#define OPTION_BYTES sizeof(__be32)
#endif

static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
		__be32 opt[OPTION_BYTES / sizeof(__be32)];
	} rep;
	struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key = NULL;
	const __u8 *hash_location = NULL;
	unsigned char newhash[16];
	int genhash;
	struct sock *sk1 = NULL;
#endif
	u64 transmit_time = 0;
	struct sock *ctl_sk;
	struct net *net;
	u32 txhash = 0;

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	/* If sk is not NULL, it means we did a successful lookup and the
	 * incoming route had to be correct. prequeue might have dropped our dst.
	 */
	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rep, 0, sizeof(rep));
	rep.th.dest   = th->source;
	rep.th.source = th->dest;
	rep.th.doff   = sizeof(struct tcphdr) / 4;
	rep.th.rst    = 1;

	if (th->ack) {
		rep.th.seq = th->ack_seq;
	} else {
		rep.th.ack = 1;
		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				       skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof(arg));
	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);

	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
#ifdef CONFIG_TCP_MD5SIG
	rcu_read_lock();
	hash_location = tcp_parse_md5sig_option(th);
	if (sk && sk_fullsock(sk)) {
		const union tcp_md5_addr *addr;
		int l3index;

		/* sdif set, means packet ingressed via a device
		 * in an L3 domain and inet_iif is set to it.
		 */
		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
	} else if (hash_location) {
		const union tcp_md5_addr *addr;
		int sdif = tcp_v4_sdif(skb);
		int dif = inet_iif(skb);
		int l3index;

		/*
		 * The active side is lost. Try to find the listening socket
		 * through the source port, and then find the md5 key through
		 * the listening socket.
		 * We do not loosen security here:
		 * the incoming packet is checked with the md5 hash of the key
		 * we find; no RST is generated if the md5 hash doesn't match.
		 */
		sk1 = __inet_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo,
					     NULL, 0, ip_hdr(skb)->saddr,
					     th->source, ip_hdr(skb)->daddr,
					     ntohs(th->source), dif, sdif);
		/* don't send rst if it can't find key */
		if (!sk1)
			goto out;

		/* sdif set, means packet ingressed via a device
		 * in an L3 domain and dif is set to it.
		 */
		l3index = sdif ? dif : 0;
		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
		if (!key)
			goto out;

		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
		if (genhash || memcmp(hash_location, newhash, 16) != 0)
			goto out;

	}

	if (key) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
				   (TCPOPT_NOP << 16) |
				   (TCPOPT_MD5SIG << 8) |
				   TCPOLEN_MD5SIG);
		/* Update length and the length the header thinks exists */
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	/* Can't co-exist with TCPMD5, hence check rep.opt[0] */
	if (rep.opt[0] == 0) {
		__be32 mrst = mptcp_reset_option(skb);

		if (mrst) {
			rep.opt[0] = mrst;
			arg.iov[0].iov_len += sizeof(mrst);
			rep.th.doff = arg.iov[0].iov_len / 4;
		}
	}

	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;

	/* When the socket is gone, all binding information is lost.
	 * Routing might fail in this case. No choice here: if we choose to
	 * force the input interface, we will misroute in case of an
	 * asymmetric route.
	 */
	if (sk) {
		arg.bound_dev_if = sk->sk_bound_dev_if;
		if (sk_fullsock(sk))
			trace_tcp_send_reset(sk, skb);
	}

	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));

	arg.tos = ip_hdr(skb)->tos;
	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
	local_bh_disable();
	ctl_sk = this_cpu_read(ipv4_tcp_sk);
	sock_net_set(ctl_sk, net);
	if (sk) {
		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
				   inet_twsk(sk)->tw_mark : sk->sk_mark;
		ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
				   inet_twsk(sk)->tw_priority : sk->sk_priority;
		transmit_time = tcp_transmit_time(sk);
		xfrm_sk_clone_policy(ctl_sk, sk);
		txhash = (sk->sk_state == TCP_TIME_WAIT) ?
			 inet_twsk(sk)->tw_txhash : sk->sk_txhash;
	} else {
		ctl_sk->sk_mark = 0;
		ctl_sk->sk_priority = 0;
	}
	ip_send_unicast_reply(ctl_sk,
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len,
			      transmit_time, txhash);

	xfrm_sk_free_policy(ctl_sk);
	sock_net_set(ctl_sk, &init_net);
	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
	local_bh_enable();

#ifdef CONFIG_TCP_MD5SIG
out:
	rcu_read_unlock();
#endif
}

/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
   outside socket context, is certainly ugly. What can I do?
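   Descriptive note (not part of the original comment): both
   tcp_v4_timewait_ack() and tcp_v4_reqsk_send_ack() below funnel into
   tcp_v4_send_ack(), which builds the reply on the stack and transmits it
   through the per-cpu ipv4_tcp_sk control socket.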
 */

static void tcp_v4_send_ack(const struct sock *sk,
			    struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 tsval, u32 tsecr, int oif,
			    struct tcp_md5sig_key *key,
			    int reply_flags, u8 tos, u32 txhash)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
#ifdef CONFIG_TCP_MD5SIG
			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
#endif
			];
	} rep;
	struct net *net = sock_net(sk);
	struct ip_reply_arg arg;
	struct sock *ctl_sk;
	u64 transmit_time;

	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof(arg));

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);
	if (tsecr) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				   (TCPOPT_TIMESTAMP << 8) |
				   TCPOLEN_TIMESTAMP);
		rep.opt[1] = htonl(tsval);
		rep.opt[2] = htonl(tsecr);
		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
	}

	/* Swap the send and the receive. */
	rep.th.dest    = th->source;
	rep.th.source  = th->dest;
	rep.th.doff    = arg.iov[0].iov_len / 4;
	rep.th.seq     = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack     = 1;
	rep.th.window  = htons(win);

#ifdef CONFIG_TCP_MD5SIG
	if (key) {
		int offset = (tsecr) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
					  (TCPOPT_NOP << 16) |
					  (TCPOPT_MD5SIG << 8) |
					  TCPOLEN_MD5SIG);
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.flags = reply_flags;
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	if (oif)
		arg.bound_dev_if = oif;
	arg.tos = tos;
	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
	local_bh_disable();
	ctl_sk = this_cpu_read(ipv4_tcp_sk);
	sock_net_set(ctl_sk, net);
	ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
			   inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark);
	ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
			   inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
	transmit_time = tcp_transmit_time(sk);
	ip_send_unicast_reply(ctl_sk,
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len,
			      transmit_time, txhash);

	sock_net_set(ctl_sk, &init_net);
	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	local_bh_enable();
}

static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
	struct inet_timewait_sock *tw = inet_twsk(sk);
	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);

	tcp_v4_send_ack(sk, skb,
			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
			tcp_time_stamp_raw() + tcptw->tw_ts_offset,
			tcptw->tw_ts_recent,
			tw->tw_bound_dev_if,
			tcp_twsk_md5_key(tcptw),
			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
			tw->tw_tos,
			tw->tw_txhash
			);

	inet_twsk_put(tw);
}

static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req)
{
	const union tcp_md5_addr *addr;
	int l3index;

	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
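	 *
	 * Descriptive note (not part of the original comment): this is why
	 * the sequence number chosen below is snt_isn + 1 for a listener
	 * (plain SYN-RECV) and snd_nxt for an already-created Fast Open
	 * child.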
	 */
	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
					     tcp_sk(sk)->snd_nxt;

	/* RFC 7323 2.3
	 * The window field (SEG.WND) of every outgoing segment, with the
	 * exception of <SYN> segments, MUST be right-shifted by
	 * Rcv.Wind.Shift bits:
	 */
	addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
	l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
	tcp_v4_send_ack(sk, skb, seq,
			tcp_rsk(req)->rcv_nxt,
			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
			tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
			READ_ONCE(req->ts_recent),
			0,
			tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
			ip_hdr(skb)->tos,
			READ_ONCE(tcp_rsk(req)->txhash));
}

/*
 *	Send a SYN-ACK after having received a SYN.
 *	This still operates on a request_sock only, not on a big
 *	socket.
 */
static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
			      struct flowi *fl,
			      struct request_sock *req,
			      struct tcp_fastopen_cookie *foc,
			      enum tcp_synack_type synack_type,
			      struct sk_buff *syn_skb)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	struct flowi4 fl4;
	int err = -1;
	struct sk_buff *skb;
	u8 tos;

	/* First, grab a route. */
	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
		return -1;

	skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);

	if (skb) {
		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);

		tos = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) ?
				(tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
				(inet_sk(sk)->tos & INET_ECN_MASK) :
				inet_sk(sk)->tos;

		if (!INET_ECN_is_capable(tos) &&
		    tcp_bpf_ca_needs_ecn((struct sock *)req))
			tos |= INET_ECN_ECT_0;

		rcu_read_lock();
		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
					    ireq->ir_rmt_addr,
					    rcu_dereference(ireq->ireq_opt),
					    tos);
		rcu_read_unlock();
		err = net_xmit_eval(err);
	}

	return err;
}

/*
 *	IPv4 request_sock destructor.
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
}

#ifdef CONFIG_TCP_MD5SIG
/*
 * RFC2385 MD5 checksumming requires a mapping of
 * IP address->MD5 Key.
 * We need to maintain these in the sk structure.
 */

DEFINE_STATIC_KEY_DEFERRED_FALSE(tcp_md5_needed, HZ);
EXPORT_SYMBOL(tcp_md5_needed);

static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
{
	if (!old)
		return true;

	/* l3index always overrides non-l3index */
	if (old->l3index && new->l3index == 0)
		return false;
	if (old->l3index == 0 && new->l3index)
		return true;

	return old->prefixlen < new->prefixlen;
}

/* Find the Key structure for an address.
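 * Descriptive note (not part of the original comment): the lookup is a
 * most-specific match. Among keys whose prefix covers the address,
 * better_md5_match() prefers a key bound to an L3 device (l3index) and,
 * failing that, the longest prefix.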
 */
struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
					   const union tcp_md5_addr *addr,
					   int family)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	const struct tcp_md5sig_info *md5sig;
	__be32 mask;
	struct tcp_md5sig_key *best_match = NULL;
	bool match;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       lockdep_sock_is_held(sk));
	if (!md5sig)
		return NULL;

	hlist_for_each_entry_rcu(key, &md5sig->head, node,
				 lockdep_sock_is_held(sk)) {
		if (key->family != family)
			continue;
		if (key->flags & TCP_MD5SIG_FLAG_IFINDEX && key->l3index != l3index)
			continue;
		if (family == AF_INET) {
			mask = inet_make_mask(key->prefixlen);
			match = (key->addr.a4.s_addr & mask) ==
				(addr->a4.s_addr & mask);
#if IS_ENABLED(CONFIG_IPV6)
		} else if (family == AF_INET6) {
			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
						  key->prefixlen);
#endif
		} else {
			match = false;
		}

		if (match && better_md5_match(best_match, key))
			best_match = key;
	}
	return best_match;
}
EXPORT_SYMBOL(__tcp_md5_do_lookup);

static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
						      const union tcp_md5_addr *addr,
						      int family, u8 prefixlen,
						      int l3index, u8 flags)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	unsigned int size = sizeof(struct in_addr);
	const struct tcp_md5sig_info *md5sig;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       lockdep_sock_is_held(sk));
	if (!md5sig)
		return NULL;
#if IS_ENABLED(CONFIG_IPV6)
	if (family == AF_INET6)
		size = sizeof(struct in6_addr);
#endif
	hlist_for_each_entry_rcu(key, &md5sig->head, node,
				 lockdep_sock_is_held(sk)) {
		if (key->family != family)
			continue;
		if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
			continue;
		if (key->l3index != l3index)
			continue;
		if (!memcmp(&key->addr, addr, size) &&
		    key->prefixlen == prefixlen)
			return key;
	}
	return NULL;
}

struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
					 const struct sock *addr_sk)
{
	const union tcp_md5_addr *addr;
	int l3index;

	l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
						 addr_sk->sk_bound_dev_if);
	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
	return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
}
EXPORT_SYMBOL(tcp_v4_md5_lookup);

static int tcp_md5sig_info_add(struct sock *sk, gfp_t gfp)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_info *md5sig;

	md5sig = kmalloc(sizeof(*md5sig), gfp);
	if (!md5sig)
		return -ENOMEM;

	sk_gso_disable(sk);
	INIT_HLIST_HEAD(&md5sig->head);
	rcu_assign_pointer(tp->md5sig_info, md5sig);
	return 0;
}

/* This can be called on a newly created socket, from other files */
static int __tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
			    int family, u8 prefixlen, int l3index, u8 flags,
			    const u8 *newkey, u8 newkeylen, gfp_t gfp)
{
	/* Add Key to the list */
	struct tcp_md5sig_key *key;
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_info *md5sig;

	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
	if (key) {
		/* Pre-existing entry - just update that one.
		 * Note that the key might be used concurrently.
		 * data_race() tells KCSAN that we do not care about
		 * key mismatches, since changing the MD5 key on live flows
		 * can lead to packet drops.
		 */
		data_race(memcpy(key->key, newkey, newkeylen));

		/* Pairs with READ_ONCE() in tcp_md5_hash_key().
		 * Also note that a reader could catch the new key->keylen value
		 * but the old key->key[]; this is the reason we use __GFP_ZERO
		 * at sock_kmalloc() time below these lines.
		 */
		WRITE_ONCE(key->keylen, newkeylen);

		return 0;
	}

	md5sig = rcu_dereference_protected(tp->md5sig_info,
					   lockdep_sock_is_held(sk));

	key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
	if (!key)
		return -ENOMEM;
	if (!tcp_alloc_md5sig_pool()) {
		sock_kfree_s(sk, key, sizeof(*key));
		return -ENOMEM;
	}

	memcpy(key->key, newkey, newkeylen);
	key->keylen = newkeylen;
	key->family = family;
	key->prefixlen = prefixlen;
	key->l3index = l3index;
	key->flags = flags;
	memcpy(&key->addr, addr,
	       (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) :
								 sizeof(struct in_addr));
	hlist_add_head_rcu(&key->node, &md5sig->head);
	return 0;
}

int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
		   int family, u8 prefixlen, int l3index, u8 flags,
		   const u8 *newkey, u8 newkeylen)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
		if (tcp_md5sig_info_add(sk, GFP_KERNEL))
			return -ENOMEM;

		if (!static_branch_inc(&tcp_md5_needed.key)) {
			struct tcp_md5sig_info *md5sig;

			md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
			rcu_assign_pointer(tp->md5sig_info, NULL);
			kfree_rcu(md5sig, rcu);
			return -EUSERS;
		}
	}

	return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, flags,
				newkey, newkeylen, GFP_KERNEL);
}
EXPORT_SYMBOL(tcp_md5_do_add);

int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr,
		     int family, u8 prefixlen, int l3index,
		     struct tcp_md5sig_key *key)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
		if (tcp_md5sig_info_add(sk, sk_gfp_mask(sk, GFP_ATOMIC)))
			return -ENOMEM;

		if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) {
			struct tcp_md5sig_info *md5sig;

			md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
			net_warn_ratelimited("Too many TCP-MD5 keys in the system\n");
			rcu_assign_pointer(tp->md5sig_info, NULL);
			kfree_rcu(md5sig, rcu);
			return -EUSERS;
		}
	}

	return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index,
				key->flags, key->key, key->keylen,
				sk_gfp_mask(sk, GFP_ATOMIC));
}
EXPORT_SYMBOL(tcp_md5_key_copy);

int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
		   u8 prefixlen, int l3index, u8 flags)
{
	struct tcp_md5sig_key *key;

	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
	if (!key)
		return -ENOENT;
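	/* Descriptive note (added comment): unlink the key under the socket
	 * lock; concurrent RCU readers may still hold it, so the memory is
	 * reclaimed through kfree_rcu() below.
	 */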
	hlist_del_rcu(&key->node);
	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
	kfree_rcu(key, rcu);
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_del);

static void tcp_clear_md5_list(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	struct hlist_node *n;
	struct tcp_md5sig_info *md5sig;

	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);

	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
		hlist_del_rcu(&key->node);
		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
		kfree_rcu(key, rcu);
	}
}

static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
				 sockptr_t optval, int optlen)
{
	struct tcp_md5sig cmd;
	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
	const union tcp_md5_addr *addr;
	u8 prefixlen = 32;
	int l3index = 0;
	u8 flags;

	if (optlen < sizeof(cmd))
		return -EINVAL;

	if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
		return -EFAULT;

	if (sin->sin_family != AF_INET)
		return -EINVAL;

	flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;

	if (optname == TCP_MD5SIG_EXT &&
	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
		prefixlen = cmd.tcpm_prefixlen;
		if (prefixlen > 32)
			return -EINVAL;
	}

	if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
		struct net_device *dev;

		rcu_read_lock();
		dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
		if (dev && netif_is_l3_master(dev))
			l3index = dev->ifindex;

		rcu_read_unlock();

		/* ok to reference set/not set outside of rcu;
		 * right now device MUST be an L3 master
		 */
		if (!dev || !l3index)
			return -EINVAL;
	}

	addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;

	if (!cmd.tcpm_keylen)
		return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);

	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
		return -EINVAL;

	return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
			      cmd.tcpm_key, cmd.tcpm_keylen);
}

static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
				   __be32 daddr, __be32 saddr,
				   const struct tcphdr *th, int nbytes)
{
	struct tcp4_pseudohdr *bp;
	struct scatterlist sg;
	struct tcphdr *_th;

	bp = hp->scratch;
	bp->saddr = saddr;
	bp->daddr = daddr;
	bp->pad = 0;
	bp->protocol = IPPROTO_TCP;
	bp->len = cpu_to_be16(nbytes);

	_th = (struct tcphdr *)(bp + 1);
	memcpy(_th, th, sizeof(*th));
	_th->check = 0;

	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
	ahash_request_set_crypt(hp->md5_req, &sg, NULL,
				sizeof(*bp) + sizeof(*th));
	return crypto_ahash_update(hp->md5_req);
}

static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
{
	struct tcp_md5sig_pool *hp;
	struct ahash_request *req;

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	req = hp->md5_req;

	if (crypto_ahash_init(req))
		goto clear_hash;
	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	ahash_request_set_crypt(req, NULL, md5_hash, 0);
	if (crypto_ahash_final(req))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}

int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
			const struct sock *sk,
			const struct sk_buff *skb)
{
	struct tcp_md5sig_pool *hp;
	struct ahash_request *req;
	const struct tcphdr *th = tcp_hdr(skb);
	__be32 saddr, daddr;

	if (sk) { /* valid for establish/request sockets */
		saddr = sk->sk_rcv_saddr;
		daddr = sk->sk_daddr;
	} else {
		const struct iphdr *iph = ip_hdr(skb);
		saddr = iph->saddr;
		daddr = iph->daddr;
	}

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	req = hp->md5_req;

	if (crypto_ahash_init(req))
		goto clear_hash;

	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
		goto clear_hash;
	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	ahash_request_set_crypt(req, NULL, md5_hash, 0);
	if (crypto_ahash_final(req))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}
EXPORT_SYMBOL(tcp_v4_md5_hash_skb);

#endif

static void tcp_v4_init_req(struct request_sock *req,
			    const struct sock *sk_listener,
			    struct sk_buff *skb)
{
	struct inet_request_sock *ireq = inet_rsk(req);
	struct net *net = sock_net(sk_listener);

	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
}

static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
					  struct sk_buff *skb,
					  struct flowi *fl,
					  struct request_sock *req)
{
	tcp_v4_init_req(req, sk, skb);

	if (security_inet_conn_request(sk, skb, req))
		return NULL;

	return inet_csk_route_req(sk, &fl->u.ip4, req);
}

struct request_sock_ops tcp_request_sock_ops __read_mostly = {
	.family		=	PF_INET,
	.obj_size	=	sizeof(struct tcp_request_sock),
	.rtx_syn_ack	=	tcp_rtx_synack,
	.send_ack	=	tcp_v4_reqsk_send_ack,
	.destructor	=	tcp_v4_reqsk_destructor,
	.send_reset	=	tcp_v4_send_reset,
	.syn_ack_timeout =	tcp_syn_ack_timeout,
};

const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
	.mss_clamp	=	TCP_MSS_DEFAULT,
#ifdef CONFIG_TCP_MD5SIG
	.req_md5_lookup	=	tcp_v4_md5_lookup,
	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
#endif
#ifdef CONFIG_SYN_COOKIES
	.cookie_init_seq =	cookie_v4_init_sequence,
#endif
	.route_req	=	tcp_v4_route_req,
	.init_seq	=	tcp_v4_init_seq,
	.init_ts_off	=	tcp_v4_init_ts_off,
	.send_synack	=	tcp_v4_send_synack,
};

int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	/* Never answer SYNs sent to broadcast or multicast */
	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	return tcp_conn_request(&tcp_request_sock_ops,
				&tcp_request_sock_ipv4_ops, sk, skb);

drop:
	tcp_listendrop(sk);
	return 0;
}
EXPORT_SYMBOL(tcp_v4_conn_request);


/*
 * The three way handshake has completed - we got a valid synack -
 * now create the new socket.
 */
struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct dst_entry *dst,
				  struct request_sock *req_unhash,
				  bool *own_req)
{
	struct inet_request_sock *ireq;
	bool found_dup_sk = false;
	struct inet_sock *newinet;
	struct tcp_sock *newtp;
	struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
	const union tcp_md5_addr *addr;
	struct tcp_md5sig_key *key;
	int l3index;
#endif
	struct ip_options_rcu *inet_opt;

	if (sk_acceptq_is_full(sk))
		goto exit_overflow;

	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit_nonewsk;

	newsk->sk_gso_type = SKB_GSO_TCPV4;
	inet_sk_rx_dst_set(newsk, skb);

	newtp		      = tcp_sk(newsk);
	newinet		      = inet_sk(newsk);
	ireq		      = inet_rsk(req);
	sk_daddr_set(newsk, ireq->ir_rmt_addr);
	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
	newsk->sk_bound_dev_if = ireq->ir_iif;
	newinet->inet_saddr   = ireq->ir_loc_addr;
	inet_opt	      = rcu_dereference(ireq->ireq_opt);
	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
	newinet->mc_index     = inet_iif(skb);
	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
	newinet->rcv_tos      = ip_hdr(skb)->tos;
	inet_csk(newsk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
	atomic_set(&newinet->inet_id, get_random_u16());

	/* Set ToS of the new socket based upon the value of incoming SYN.
	 * ECT bits are set later in tcp_init_transfer().
	 */
	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
		newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;

	if (!dst) {
		dst = inet_csk_route_child_sock(sk, newsk, req);
		if (!dst)
			goto put_and_exit;
	} else {
		/* syncookie case : see end of cookie_v4_check() */
	}
	sk_setup_caps(newsk, dst);

	tcp_ca_openreq_child(newsk, dst);

	tcp_sync_mss(newsk, dst_mtu(dst));
	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));

	tcp_initialize_rcv_mss(newsk);

#ifdef CONFIG_TCP_MD5SIG
	l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
	/* Copy over the MD5 key from the original socket */
	addr = (union tcp_md5_addr *)&newinet->inet_daddr;
	key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
	if (key) {
		if (tcp_md5_key_copy(newsk, addr, AF_INET, 32, l3index, key))
			goto put_and_exit;
		sk_gso_disable(newsk);
	}
#endif

	if (__inet_inherit_port(sk, newsk) < 0)
		goto put_and_exit;
	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
				       &found_dup_sk);
	if (likely(*own_req)) {
		tcp_move_syn(newtp, req);
		ireq->ireq_opt = NULL;
	} else {
		newinet->inet_opt = NULL;

		if (!req_unhash && found_dup_sk) {
			/* This code path should only be executed in the
			 * syncookie case
			 */
			bh_unlock_sock(newsk);
			sock_put(newsk);
			newsk = NULL;
		}
	}
	return newsk;

exit_overflow:
	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
exit_nonewsk:
	dst_release(dst);
exit:
	tcp_listendrop(sk);
	return NULL;
put_and_exit:
	newinet->inet_opt = NULL;
	inet_csk_prepare_forced_close(newsk);
	tcp_done(newsk);
	goto exit;
}
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);

static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
{
#ifdef CONFIG_SYN_COOKIES
	const struct tcphdr *th = tcp_hdr(skb);

	if (!th->syn)
		sk = cookie_v4_check(sk, skb);
#endif
	return sk;
}

u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
			 struct tcphdr *th, u32 *cookie)
{
	u16 mss = 0;
#ifdef CONFIG_SYN_COOKIES
	mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
				    &tcp_request_sock_ipv4_ops, sk, th);
	if (mss) {
		*cookie = __cookie_v4_init_sequence(iph, th, &mss);
		tcp_synq_overflow(sk);
	}
#endif
	return mss;
}

INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
							   u32));
/* The socket must have its spinlock held when we get
 * here, unless it is a TCP_LISTEN socket.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	enum skb_drop_reason reason;
	struct sock *rsk;

	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
		struct dst_entry *dst;

		dst = rcu_dereference_protected(sk->sk_rx_dst,
						lockdep_sock_is_held(sk));

		sock_rps_save_rxhash(sk, skb);
		sk_mark_napi_id(sk, skb);
		if (dst) {
			if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
			    !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
					     dst, 0)) {
				RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
				dst_release(dst);
			}
		}
		tcp_rcv_established(sk, skb);
		return 0;
	}

	reason = SKB_DROP_REASON_NOT_SPECIFIED;
	if (tcp_checksum_complete(skb))
		goto csum_err;

	if (sk->sk_state == TCP_LISTEN) {
		struct sock *nsk = tcp_v4_cookie_check(sk, skb);

		if (!nsk)
			goto discard;
		if (nsk != sk) {
			if (tcp_child_process(sk, nsk, skb)) {
				rsk = nsk;
				goto reset;
			}
			return 0;
		}
	} else
		sock_rps_save_rxhash(sk, skb);

	if (tcp_rcv_state_process(sk, skb)) {
		rsk = sk;
		goto reset;
	}
	return 0;

reset:
	tcp_v4_send_reset(rsk, skb);
discard:
	kfree_skb_reason(skb, reason);
	/* Be careful here. If this function gets more complicated and
	 * gcc suffers from register pressure on the x86, sk (in %ebx)
	 * might be destroyed here. This current version compiles correctly,
	 * but you have been warned.
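	 *
	 * Descriptive note (not part of the original comment): the reset:
	 * label above sends a RST on behalf of rsk and then falls through to
	 * discard:, which frees the skb with the recorded drop reason.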
	 */
	return 0;

csum_err:
	reason = SKB_DROP_REASON_TCP_CSUM;
	trace_tcp_bad_csum(skb);
	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
	goto discard;
}
EXPORT_SYMBOL(tcp_v4_do_rcv);

int tcp_v4_early_demux(struct sk_buff *skb)
{
	struct net *net = dev_net(skb->dev);
	const struct iphdr *iph;
	const struct tcphdr *th;
	struct sock *sk;

	if (skb->pkt_type != PACKET_HOST)
		return 0;

	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
		return 0;

	iph = ip_hdr(skb);
	th = tcp_hdr(skb);

	if (th->doff < sizeof(struct tcphdr) / 4)
		return 0;

	sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
				       iph->saddr, th->source,
				       iph->daddr, ntohs(th->dest),
				       skb->skb_iif, inet_sdif(skb));
	if (sk) {
		skb->sk = sk;
		skb->destructor = sock_edemux;
		if (sk_fullsock(sk)) {
			struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);

			if (dst)
				dst = dst_check(dst, 0);
			if (dst &&
			    sk->sk_rx_dst_ifindex == skb->skb_iif)
				skb_dst_set_noref(skb, dst);
		}
	}
	return 0;
}

bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
		     enum skb_drop_reason *reason)
{
	u32 limit, tail_gso_size, tail_gso_segs;
	struct skb_shared_info *shinfo;
	const struct tcphdr *th;
	struct tcphdr *thtail;
	struct sk_buff *tail;
	unsigned int hdrlen;
	bool fragstolen;
	u32 gso_segs;
	u32 gso_size;
	int delta;

	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
	 * we can fix skb->truesize to its real value to avoid future drops.
	 * This is valid because skb is not yet charged to the socket.
	 * It has been noticed pure SACK packets were sometimes dropped
	 * (if cooked by drivers without copybreak feature).
	 */
	skb_condense(skb);

	skb_dst_drop(skb);

	if (unlikely(tcp_checksum_complete(skb))) {
		bh_unlock_sock(sk);
		trace_tcp_bad_csum(skb);
		*reason = SKB_DROP_REASON_TCP_CSUM;
		__TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
		__TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
		return true;
	}

	/* Attempt coalescing to last skb in backlog, even if we are
	 * above the limits.
	 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
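	 *
	 * Descriptive note (not part of the original comment): coalescing
	 * only happens when the new segment directly follows the tail in
	 * sequence space, carries the same IP DSCP, has compatible TCP flags
	 * (no SYN/RST/URG, ACK on both, matching ECE/CWR) and identical TCP
	 * options, as checked below.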
	 */
	th = (const struct tcphdr *)skb->data;
	hdrlen = th->doff * 4;

	tail = sk->sk_backlog.tail;
	if (!tail)
		goto no_coalesce;
	thtail = (struct tcphdr *)tail->data;

	if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
	    TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
	    ((TCP_SKB_CB(tail)->tcp_flags |
	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
	    !((TCP_SKB_CB(tail)->tcp_flags &
	      TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
	    ((TCP_SKB_CB(tail)->tcp_flags ^
	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
#ifdef CONFIG_TLS_DEVICE
	    tail->decrypted != skb->decrypted ||
#endif
	    !mptcp_skb_can_collapse(tail, skb) ||
	    thtail->doff != th->doff ||
	    memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
		goto no_coalesce;

	__skb_pull(skb, hdrlen);

	shinfo = skb_shinfo(skb);
	gso_size = shinfo->gso_size ?: skb->len;
	gso_segs = shinfo->gso_segs ?: 1;

	shinfo = skb_shinfo(tail);
	tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
	tail_gso_segs = shinfo->gso_segs ?: 1;

	if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
		TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;

		if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
			TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
			thtail->window = th->window;
		}

		/* We have to update both TCP_SKB_CB(tail)->tcp_flags and
		 * thtail->fin, so that the fast path in tcp_rcv_established()
		 * is not entered if we append a packet with a FIN.
		 * SYN, RST, URG are not present.
		 * ACK is set on both packets.
		 * PSH : we do not really care in TCP stack,
		 *       at least for 'GRO' packets.
		 */
		thtail->fin |= th->fin;
		TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;

		if (TCP_SKB_CB(skb)->has_rxtstamp) {
			TCP_SKB_CB(tail)->has_rxtstamp = true;
			tail->tstamp = skb->tstamp;
			skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
		}

		/* Not as strict as GRO. We only need to carry mss max value */
		shinfo->gso_size = max(gso_size, tail_gso_size);
		shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);

		sk->sk_backlog.len += delta;
		__NET_INC_STATS(sock_net(sk),
				LINUX_MIB_TCPBACKLOGCOALESCE);
		kfree_skb_partial(skb, fragstolen);
		return false;
	}
	__skb_push(skb, hdrlen);

no_coalesce:
	limit = (u32)READ_ONCE(sk->sk_rcvbuf) + (u32)(READ_ONCE(sk->sk_sndbuf) >> 1);

	/* Only socket owner can try to collapse/prune rx queues
	 * to reduce memory overhead, so add a little headroom here.
	 * Few socket backlogs are likely to be non-empty concurrently.
1936 */ 1937 limit += 64 * 1024; 1938 1939 if (unlikely(sk_add_backlog(sk, skb, limit))) { 1940 bh_unlock_sock(sk); 1941 *reason = SKB_DROP_REASON_SOCKET_BACKLOG; 1942 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP); 1943 return true; 1944 } 1945 return false; 1946 } 1947 EXPORT_SYMBOL(tcp_add_backlog); 1948 1949 int tcp_filter(struct sock *sk, struct sk_buff *skb) 1950 { 1951 struct tcphdr *th = (struct tcphdr *)skb->data; 1952 1953 return sk_filter_trim_cap(sk, skb, th->doff * 4); 1954 } 1955 EXPORT_SYMBOL(tcp_filter); 1956 1957 static void tcp_v4_restore_cb(struct sk_buff *skb) 1958 { 1959 memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4, 1960 sizeof(struct inet_skb_parm)); 1961 } 1962 1963 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph, 1964 const struct tcphdr *th) 1965 { 1966 /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB() 1967 * barrier() makes sure compiler wont play fool^Waliasing games. 1968 */ 1969 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb), 1970 sizeof(struct inet_skb_parm)); 1971 barrier(); 1972 1973 TCP_SKB_CB(skb)->seq = ntohl(th->seq); 1974 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + 1975 skb->len - th->doff * 4); 1976 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); 1977 TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th); 1978 TCP_SKB_CB(skb)->tcp_tw_isn = 0; 1979 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph); 1980 TCP_SKB_CB(skb)->sacked = 0; 1981 TCP_SKB_CB(skb)->has_rxtstamp = 1982 skb->tstamp || skb_hwtstamps(skb)->hwtstamp; 1983 } 1984 1985 /* 1986 * From tcp_input.c 1987 */ 1988 1989 int tcp_v4_rcv(struct sk_buff *skb) 1990 { 1991 struct net *net = dev_net(skb->dev); 1992 enum skb_drop_reason drop_reason; 1993 int sdif = inet_sdif(skb); 1994 int dif = inet_iif(skb); 1995 const struct iphdr *iph; 1996 const struct tcphdr *th; 1997 bool refcounted; 1998 struct sock *sk; 1999 int ret; 2000 2001 drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; 2002 if (skb->pkt_type != PACKET_HOST) 2003 goto discard_it; 2004 2005 /* Count it even if it's bad */ 2006 __TCP_INC_STATS(net, TCP_MIB_INSEGS); 2007 2008 if (!pskb_may_pull(skb, sizeof(struct tcphdr))) 2009 goto discard_it; 2010 2011 th = (const struct tcphdr *)skb->data; 2012 2013 if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) { 2014 drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL; 2015 goto bad_packet; 2016 } 2017 if (!pskb_may_pull(skb, th->doff * 4)) 2018 goto discard_it; 2019 2020 /* An explanation is required here, I think. 2021 * Packet length and doff are validated by header prediction, 2022 * provided case of th->doff==0 is eliminated. 2023 * So, we defer the checks. 
*/ 2024 2025 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo)) 2026 goto csum_error; 2027 2028 th = (const struct tcphdr *)skb->data; 2029 iph = ip_hdr(skb); 2030 lookup: 2031 sk = __inet_lookup_skb(net->ipv4.tcp_death_row.hashinfo, 2032 skb, __tcp_hdrlen(th), th->source, 2033 th->dest, sdif, &refcounted); 2034 if (!sk) 2035 goto no_tcp_socket; 2036 2037 process: 2038 if (sk->sk_state == TCP_TIME_WAIT) 2039 goto do_time_wait; 2040 2041 if (sk->sk_state == TCP_NEW_SYN_RECV) { 2042 struct request_sock *req = inet_reqsk(sk); 2043 bool req_stolen = false; 2044 struct sock *nsk; 2045 2046 sk = req->rsk_listener; 2047 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) 2048 drop_reason = SKB_DROP_REASON_XFRM_POLICY; 2049 else 2050 drop_reason = tcp_inbound_md5_hash(sk, skb, 2051 &iph->saddr, &iph->daddr, 2052 AF_INET, dif, sdif); 2053 if (unlikely(drop_reason)) { 2054 sk_drops_add(sk, skb); 2055 reqsk_put(req); 2056 goto discard_it; 2057 } 2058 if (tcp_checksum_complete(skb)) { 2059 reqsk_put(req); 2060 goto csum_error; 2061 } 2062 if (unlikely(sk->sk_state != TCP_LISTEN)) { 2063 nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb); 2064 if (!nsk) { 2065 inet_csk_reqsk_queue_drop_and_put(sk, req); 2066 goto lookup; 2067 } 2068 sk = nsk; 2069 /* reuseport_migrate_sock() has already held one sk_refcnt 2070 * before returning. 2071 */ 2072 } else { 2073 /* We own a reference on the listener, increase it again 2074 * as we might lose it too soon. 2075 */ 2076 sock_hold(sk); 2077 } 2078 refcounted = true; 2079 nsk = NULL; 2080 if (!tcp_filter(sk, skb)) { 2081 th = (const struct tcphdr *)skb->data; 2082 iph = ip_hdr(skb); 2083 tcp_v4_fill_cb(skb, iph, th); 2084 nsk = tcp_check_req(sk, skb, req, false, &req_stolen); 2085 } else { 2086 drop_reason = SKB_DROP_REASON_SOCKET_FILTER; 2087 } 2088 if (!nsk) { 2089 reqsk_put(req); 2090 if (req_stolen) { 2091 /* Another cpu got exclusive access to req 2092 * and created a full blown socket. 2093 * Try to feed this packet to this socket 2094 * instead of discarding it. 
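			 * (req_stolen is set by tcp_check_req() when it
			 * finds it did not get ownership of the request,
			 * i.e. own_req was false.)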
2095 */ 2096 tcp_v4_restore_cb(skb); 2097 sock_put(sk); 2098 goto lookup; 2099 } 2100 goto discard_and_relse; 2101 } 2102 nf_reset_ct(skb); 2103 if (nsk == sk) { 2104 reqsk_put(req); 2105 tcp_v4_restore_cb(skb); 2106 } else if (tcp_child_process(sk, nsk, skb)) { 2107 tcp_v4_send_reset(nsk, skb); 2108 goto discard_and_relse; 2109 } else { 2110 sock_put(sk); 2111 return 0; 2112 } 2113 } 2114 2115 if (static_branch_unlikely(&ip4_min_ttl)) { 2116 /* min_ttl can be changed concurrently from do_ip_setsockopt() */ 2117 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) { 2118 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); 2119 drop_reason = SKB_DROP_REASON_TCP_MINTTL; 2120 goto discard_and_relse; 2121 } 2122 } 2123 2124 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) { 2125 drop_reason = SKB_DROP_REASON_XFRM_POLICY; 2126 goto discard_and_relse; 2127 } 2128 2129 drop_reason = tcp_inbound_md5_hash(sk, skb, &iph->saddr, 2130 &iph->daddr, AF_INET, dif, sdif); 2131 if (drop_reason) 2132 goto discard_and_relse; 2133 2134 nf_reset_ct(skb); 2135 2136 if (tcp_filter(sk, skb)) { 2137 drop_reason = SKB_DROP_REASON_SOCKET_FILTER; 2138 goto discard_and_relse; 2139 } 2140 th = (const struct tcphdr *)skb->data; 2141 iph = ip_hdr(skb); 2142 tcp_v4_fill_cb(skb, iph, th); 2143 2144 skb->dev = NULL; 2145 2146 if (sk->sk_state == TCP_LISTEN) { 2147 ret = tcp_v4_do_rcv(sk, skb); 2148 goto put_and_return; 2149 } 2150 2151 sk_incoming_cpu_update(sk); 2152 2153 bh_lock_sock_nested(sk); 2154 tcp_segs_in(tcp_sk(sk), skb); 2155 ret = 0; 2156 if (!sock_owned_by_user(sk)) { 2157 ret = tcp_v4_do_rcv(sk, skb); 2158 } else { 2159 if (tcp_add_backlog(sk, skb, &drop_reason)) 2160 goto discard_and_relse; 2161 } 2162 bh_unlock_sock(sk); 2163 2164 put_and_return: 2165 if (refcounted) 2166 sock_put(sk); 2167 2168 return ret; 2169 2170 no_tcp_socket: 2171 drop_reason = SKB_DROP_REASON_NO_SOCKET; 2172 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) 2173 goto discard_it; 2174 2175 tcp_v4_fill_cb(skb, iph, th); 2176 2177 if (tcp_checksum_complete(skb)) { 2178 csum_error: 2179 drop_reason = SKB_DROP_REASON_TCP_CSUM; 2180 trace_tcp_bad_csum(skb); 2181 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS); 2182 bad_packet: 2183 __TCP_INC_STATS(net, TCP_MIB_INERRS); 2184 } else { 2185 tcp_v4_send_reset(NULL, skb); 2186 } 2187 2188 discard_it: 2189 SKB_DR_OR(drop_reason, NOT_SPECIFIED); 2190 /* Discard frame. 
*/ 2191 kfree_skb_reason(skb, drop_reason); 2192 return 0; 2193 2194 discard_and_relse: 2195 sk_drops_add(sk, skb); 2196 if (refcounted) 2197 sock_put(sk); 2198 goto discard_it; 2199 2200 do_time_wait: 2201 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { 2202 drop_reason = SKB_DROP_REASON_XFRM_POLICY; 2203 inet_twsk_put(inet_twsk(sk)); 2204 goto discard_it; 2205 } 2206 2207 tcp_v4_fill_cb(skb, iph, th); 2208 2209 if (tcp_checksum_complete(skb)) { 2210 inet_twsk_put(inet_twsk(sk)); 2211 goto csum_error; 2212 } 2213 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) { 2214 case TCP_TW_SYN: { 2215 struct sock *sk2 = inet_lookup_listener(net, 2216 net->ipv4.tcp_death_row.hashinfo, 2217 skb, __tcp_hdrlen(th), 2218 iph->saddr, th->source, 2219 iph->daddr, th->dest, 2220 inet_iif(skb), 2221 sdif); 2222 if (sk2) { 2223 inet_twsk_deschedule_put(inet_twsk(sk)); 2224 sk = sk2; 2225 tcp_v4_restore_cb(skb); 2226 refcounted = false; 2227 goto process; 2228 } 2229 } 2230 /* to ACK */ 2231 fallthrough; 2232 case TCP_TW_ACK: 2233 tcp_v4_timewait_ack(sk, skb); 2234 break; 2235 case TCP_TW_RST: 2236 tcp_v4_send_reset(sk, skb); 2237 inet_twsk_deschedule_put(inet_twsk(sk)); 2238 goto discard_it; 2239 case TCP_TW_SUCCESS:; 2240 } 2241 goto discard_it; 2242 } 2243 2244 static struct timewait_sock_ops tcp_timewait_sock_ops = { 2245 .twsk_obj_size = sizeof(struct tcp_timewait_sock), 2246 .twsk_unique = tcp_twsk_unique, 2247 .twsk_destructor= tcp_twsk_destructor, 2248 }; 2249 2250 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) 2251 { 2252 struct dst_entry *dst = skb_dst(skb); 2253 2254 if (dst && dst_hold_safe(dst)) { 2255 rcu_assign_pointer(sk->sk_rx_dst, dst); 2256 sk->sk_rx_dst_ifindex = skb->skb_iif; 2257 } 2258 } 2259 EXPORT_SYMBOL(inet_sk_rx_dst_set); 2260 2261 const struct inet_connection_sock_af_ops ipv4_specific = { 2262 .queue_xmit = ip_queue_xmit, 2263 .send_check = tcp_v4_send_check, 2264 .rebuild_header = inet_sk_rebuild_header, 2265 .sk_rx_dst_set = inet_sk_rx_dst_set, 2266 .conn_request = tcp_v4_conn_request, 2267 .syn_recv_sock = tcp_v4_syn_recv_sock, 2268 .net_header_len = sizeof(struct iphdr), 2269 .setsockopt = ip_setsockopt, 2270 .getsockopt = ip_getsockopt, 2271 .addr2sockaddr = inet_csk_addr2sockaddr, 2272 .sockaddr_len = sizeof(struct sockaddr_in), 2273 .mtu_reduced = tcp_v4_mtu_reduced, 2274 }; 2275 EXPORT_SYMBOL(ipv4_specific); 2276 2277 #ifdef CONFIG_TCP_MD5SIG 2278 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = { 2279 .md5_lookup = tcp_v4_md5_lookup, 2280 .calc_md5_hash = tcp_v4_md5_hash_skb, 2281 .md5_parse = tcp_v4_parse_md5_keys, 2282 }; 2283 #endif 2284 2285 /* NOTE: A lot of things set to zero explicitly by call to 2286 * sk_alloc() so need not be done here. 2287 */ 2288 static int tcp_v4_init_sock(struct sock *sk) 2289 { 2290 struct inet_connection_sock *icsk = inet_csk(sk); 2291 2292 tcp_init_sock(sk); 2293 2294 icsk->icsk_af_ops = &ipv4_specific; 2295 2296 #ifdef CONFIG_TCP_MD5SIG 2297 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific; 2298 #endif 2299 2300 return 0; 2301 } 2302 2303 void tcp_v4_destroy_sock(struct sock *sk) 2304 { 2305 struct tcp_sock *tp = tcp_sk(sk); 2306 2307 trace_tcp_destroy_sock(sk); 2308 2309 tcp_clear_xmit_timers(sk); 2310 2311 tcp_cleanup_congestion_control(sk); 2312 2313 tcp_cleanup_ulp(sk); 2314 2315 /* Cleanup up the write buffer. 
*/ 2316 tcp_write_queue_purge(sk); 2317 2318 /* Check if we want to disable active TFO */ 2319 tcp_fastopen_active_disable_ofo_check(sk); 2320 2321 /* Cleans up our, hopefully empty, out_of_order_queue. */ 2322 skb_rbtree_purge(&tp->out_of_order_queue); 2323 2324 #ifdef CONFIG_TCP_MD5SIG 2325 /* Clean up the MD5 key list, if any */ 2326 if (tp->md5sig_info) { 2327 tcp_clear_md5_list(sk); 2328 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu); 2329 tp->md5sig_info = NULL; 2330 static_branch_slow_dec_deferred(&tcp_md5_needed); 2331 } 2332 #endif 2333 2334 /* Clean up a referenced TCP bind bucket. */ 2335 if (inet_csk(sk)->icsk_bind_hash) 2336 inet_put_port(sk); 2337 2338 BUG_ON(rcu_access_pointer(tp->fastopen_rsk)); 2339 2340 /* If socket is aborted during connect operation */ 2341 tcp_free_fastopen_req(tp); 2342 tcp_fastopen_destroy_cipher(sk); 2343 tcp_saved_syn_free(tp); 2344 2345 sk_sockets_allocated_dec(sk); 2346 } 2347 EXPORT_SYMBOL(tcp_v4_destroy_sock); 2348 2349 #ifdef CONFIG_PROC_FS 2350 /* Proc filesystem TCP sock list dumping. */ 2351 2352 static unsigned short seq_file_family(const struct seq_file *seq); 2353 2354 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk) 2355 { 2356 unsigned short family = seq_file_family(seq); 2357 2358 /* AF_UNSPEC is used as a match all */ 2359 return ((family == AF_UNSPEC || family == sk->sk_family) && 2360 net_eq(sock_net(sk), seq_file_net(seq))); 2361 } 2362 2363 /* Find a non empty bucket (starting from st->bucket) 2364 * and return the first sk from it. 2365 */ 2366 static void *listening_get_first(struct seq_file *seq) 2367 { 2368 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2369 struct tcp_iter_state *st = seq->private; 2370 2371 st->offset = 0; 2372 for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) { 2373 struct inet_listen_hashbucket *ilb2; 2374 struct hlist_nulls_node *node; 2375 struct sock *sk; 2376 2377 ilb2 = &hinfo->lhash2[st->bucket]; 2378 if (hlist_nulls_empty(&ilb2->nulls_head)) 2379 continue; 2380 2381 spin_lock(&ilb2->lock); 2382 sk_nulls_for_each(sk, node, &ilb2->nulls_head) { 2383 if (seq_sk_match(seq, sk)) 2384 return sk; 2385 } 2386 spin_unlock(&ilb2->lock); 2387 } 2388 2389 return NULL; 2390 } 2391 2392 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket). 2393 * If "cur" is the last one in the st->bucket, 2394 * call listening_get_first() to return the first sk of the next 2395 * non empty bucket. 
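 * The ilb2 bucket lock taken by listening_get_first() is released here
 * before moving on to the next bucket.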
2396 */ 2397 static void *listening_get_next(struct seq_file *seq, void *cur) 2398 { 2399 struct tcp_iter_state *st = seq->private; 2400 struct inet_listen_hashbucket *ilb2; 2401 struct hlist_nulls_node *node; 2402 struct inet_hashinfo *hinfo; 2403 struct sock *sk = cur; 2404 2405 ++st->num; 2406 ++st->offset; 2407 2408 sk = sk_nulls_next(sk); 2409 sk_nulls_for_each_from(sk, node) { 2410 if (seq_sk_match(seq, sk)) 2411 return sk; 2412 } 2413 2414 hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2415 ilb2 = &hinfo->lhash2[st->bucket]; 2416 spin_unlock(&ilb2->lock); 2417 ++st->bucket; 2418 return listening_get_first(seq); 2419 } 2420 2421 static void *listening_get_idx(struct seq_file *seq, loff_t *pos) 2422 { 2423 struct tcp_iter_state *st = seq->private; 2424 void *rc; 2425 2426 st->bucket = 0; 2427 st->offset = 0; 2428 rc = listening_get_first(seq); 2429 2430 while (rc && *pos) { 2431 rc = listening_get_next(seq, rc); 2432 --*pos; 2433 } 2434 return rc; 2435 } 2436 2437 static inline bool empty_bucket(struct inet_hashinfo *hinfo, 2438 const struct tcp_iter_state *st) 2439 { 2440 return hlist_nulls_empty(&hinfo->ehash[st->bucket].chain); 2441 } 2442 2443 /* 2444 * Get first established socket starting from bucket given in st->bucket. 2445 * If st->bucket is zero, the very first socket in the hash is returned. 2446 */ 2447 static void *established_get_first(struct seq_file *seq) 2448 { 2449 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2450 struct tcp_iter_state *st = seq->private; 2451 2452 st->offset = 0; 2453 for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) { 2454 struct sock *sk; 2455 struct hlist_nulls_node *node; 2456 spinlock_t *lock = inet_ehash_lockp(hinfo, st->bucket); 2457 2458 cond_resched(); 2459 2460 /* Lockless fast path for the common case of empty buckets */ 2461 if (empty_bucket(hinfo, st)) 2462 continue; 2463 2464 spin_lock_bh(lock); 2465 sk_nulls_for_each(sk, node, &hinfo->ehash[st->bucket].chain) { 2466 if (seq_sk_match(seq, sk)) 2467 return sk; 2468 } 2469 spin_unlock_bh(lock); 2470 } 2471 2472 return NULL; 2473 } 2474 2475 static void *established_get_next(struct seq_file *seq, void *cur) 2476 { 2477 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2478 struct tcp_iter_state *st = seq->private; 2479 struct hlist_nulls_node *node; 2480 struct sock *sk = cur; 2481 2482 ++st->num; 2483 ++st->offset; 2484 2485 sk = sk_nulls_next(sk); 2486 2487 sk_nulls_for_each_from(sk, node) { 2488 if (seq_sk_match(seq, sk)) 2489 return sk; 2490 } 2491 2492 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket)); 2493 ++st->bucket; 2494 return established_get_first(seq); 2495 } 2496 2497 static void *established_get_idx(struct seq_file *seq, loff_t pos) 2498 { 2499 struct tcp_iter_state *st = seq->private; 2500 void *rc; 2501 2502 st->bucket = 0; 2503 rc = established_get_first(seq); 2504 2505 while (rc && pos) { 2506 rc = established_get_next(seq, rc); 2507 --pos; 2508 } 2509 return rc; 2510 } 2511 2512 static void *tcp_get_idx(struct seq_file *seq, loff_t pos) 2513 { 2514 void *rc; 2515 struct tcp_iter_state *st = seq->private; 2516 2517 st->state = TCP_SEQ_STATE_LISTENING; 2518 rc = listening_get_idx(seq, &pos); 2519 2520 if (!rc) { 2521 st->state = TCP_SEQ_STATE_ESTABLISHED; 2522 rc = established_get_idx(seq, pos); 2523 } 2524 2525 return rc; 2526 } 2527 2528 static void *tcp_seek_last_pos(struct seq_file *seq) 2529 { 2530 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2531 struct 
tcp_iter_state *st = seq->private; 2532 int bucket = st->bucket; 2533 int offset = st->offset; 2534 int orig_num = st->num; 2535 void *rc = NULL; 2536 2537 switch (st->state) { 2538 case TCP_SEQ_STATE_LISTENING: 2539 if (st->bucket > hinfo->lhash2_mask) 2540 break; 2541 rc = listening_get_first(seq); 2542 while (offset-- && rc && bucket == st->bucket) 2543 rc = listening_get_next(seq, rc); 2544 if (rc) 2545 break; 2546 st->bucket = 0; 2547 st->state = TCP_SEQ_STATE_ESTABLISHED; 2548 fallthrough; 2549 case TCP_SEQ_STATE_ESTABLISHED: 2550 if (st->bucket > hinfo->ehash_mask) 2551 break; 2552 rc = established_get_first(seq); 2553 while (offset-- && rc && bucket == st->bucket) 2554 rc = established_get_next(seq, rc); 2555 } 2556 2557 st->num = orig_num; 2558 2559 return rc; 2560 } 2561 2562 void *tcp_seq_start(struct seq_file *seq, loff_t *pos) 2563 { 2564 struct tcp_iter_state *st = seq->private; 2565 void *rc; 2566 2567 if (*pos && *pos == st->last_pos) { 2568 rc = tcp_seek_last_pos(seq); 2569 if (rc) 2570 goto out; 2571 } 2572 2573 st->state = TCP_SEQ_STATE_LISTENING; 2574 st->num = 0; 2575 st->bucket = 0; 2576 st->offset = 0; 2577 rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; 2578 2579 out: 2580 st->last_pos = *pos; 2581 return rc; 2582 } 2583 EXPORT_SYMBOL(tcp_seq_start); 2584 2585 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) 2586 { 2587 struct tcp_iter_state *st = seq->private; 2588 void *rc = NULL; 2589 2590 if (v == SEQ_START_TOKEN) { 2591 rc = tcp_get_idx(seq, 0); 2592 goto out; 2593 } 2594 2595 switch (st->state) { 2596 case TCP_SEQ_STATE_LISTENING: 2597 rc = listening_get_next(seq, v); 2598 if (!rc) { 2599 st->state = TCP_SEQ_STATE_ESTABLISHED; 2600 st->bucket = 0; 2601 st->offset = 0; 2602 rc = established_get_first(seq); 2603 } 2604 break; 2605 case TCP_SEQ_STATE_ESTABLISHED: 2606 rc = established_get_next(seq, v); 2607 break; 2608 } 2609 out: 2610 ++*pos; 2611 st->last_pos = *pos; 2612 return rc; 2613 } 2614 EXPORT_SYMBOL(tcp_seq_next); 2615 2616 void tcp_seq_stop(struct seq_file *seq, void *v) 2617 { 2618 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2619 struct tcp_iter_state *st = seq->private; 2620 2621 switch (st->state) { 2622 case TCP_SEQ_STATE_LISTENING: 2623 if (v != SEQ_START_TOKEN) 2624 spin_unlock(&hinfo->lhash2[st->bucket].lock); 2625 break; 2626 case TCP_SEQ_STATE_ESTABLISHED: 2627 if (v) 2628 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket)); 2629 break; 2630 } 2631 } 2632 EXPORT_SYMBOL(tcp_seq_stop); 2633 2634 static void get_openreq4(const struct request_sock *req, 2635 struct seq_file *f, int i) 2636 { 2637 const struct inet_request_sock *ireq = inet_rsk(req); 2638 long delta = req->rsk_timer.expires - jiffies; 2639 2640 seq_printf(f, "%4d: %08X:%04X %08X:%04X" 2641 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK", 2642 i, 2643 ireq->ir_loc_addr, 2644 ireq->ir_num, 2645 ireq->ir_rmt_addr, 2646 ntohs(ireq->ir_rmt_port), 2647 TCP_SYN_RECV, 2648 0, 0, /* could print option size, but that is af dependent. 
*/ 2649 1, /* timers active (only the expire timer) */ 2650 jiffies_delta_to_clock_t(delta), 2651 req->num_timeout, 2652 from_kuid_munged(seq_user_ns(f), 2653 sock_i_uid(req->rsk_listener)), 2654 0, /* non standard timer */ 2655 0, /* open_requests have no inode */ 2656 0, 2657 req); 2658 } 2659 2660 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) 2661 { 2662 int timer_active; 2663 unsigned long timer_expires; 2664 const struct tcp_sock *tp = tcp_sk(sk); 2665 const struct inet_connection_sock *icsk = inet_csk(sk); 2666 const struct inet_sock *inet = inet_sk(sk); 2667 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq; 2668 __be32 dest = inet->inet_daddr; 2669 __be32 src = inet->inet_rcv_saddr; 2670 __u16 destp = ntohs(inet->inet_dport); 2671 __u16 srcp = ntohs(inet->inet_sport); 2672 int rx_queue; 2673 int state; 2674 2675 if (icsk->icsk_pending == ICSK_TIME_RETRANS || 2676 icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT || 2677 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { 2678 timer_active = 1; 2679 timer_expires = icsk->icsk_timeout; 2680 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) { 2681 timer_active = 4; 2682 timer_expires = icsk->icsk_timeout; 2683 } else if (timer_pending(&sk->sk_timer)) { 2684 timer_active = 2; 2685 timer_expires = sk->sk_timer.expires; 2686 } else { 2687 timer_active = 0; 2688 timer_expires = jiffies; 2689 } 2690 2691 state = inet_sk_state_load(sk); 2692 if (state == TCP_LISTEN) 2693 rx_queue = READ_ONCE(sk->sk_ack_backlog); 2694 else 2695 /* Because we don't lock the socket, 2696 * we might find a transient negative value. 2697 */ 2698 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) - 2699 READ_ONCE(tp->copied_seq), 0); 2700 2701 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX " 2702 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d", 2703 i, src, srcp, dest, destp, state, 2704 READ_ONCE(tp->write_seq) - tp->snd_una, 2705 rx_queue, 2706 timer_active, 2707 jiffies_delta_to_clock_t(timer_expires - jiffies), 2708 icsk->icsk_retransmits, 2709 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)), 2710 icsk->icsk_probes_out, 2711 sock_i_ino(sk), 2712 refcount_read(&sk->sk_refcnt), sk, 2713 jiffies_to_clock_t(icsk->icsk_rto), 2714 jiffies_to_clock_t(icsk->icsk_ack.ato), 2715 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk), 2716 tcp_snd_cwnd(tp), 2717 state == TCP_LISTEN ? 2718 fastopenq->max_qlen : 2719 (tcp_in_initial_slowstart(tp) ? 
-1 : tp->snd_ssthresh)); 2720 } 2721 2722 static void get_timewait4_sock(const struct inet_timewait_sock *tw, 2723 struct seq_file *f, int i) 2724 { 2725 long delta = tw->tw_timer.expires - jiffies; 2726 __be32 dest, src; 2727 __u16 destp, srcp; 2728 2729 dest = tw->tw_daddr; 2730 src = tw->tw_rcv_saddr; 2731 destp = ntohs(tw->tw_dport); 2732 srcp = ntohs(tw->tw_sport); 2733 2734 seq_printf(f, "%4d: %08X:%04X %08X:%04X" 2735 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK", 2736 i, src, srcp, dest, destp, tw->tw_substate, 0, 0, 2737 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0, 2738 refcount_read(&tw->tw_refcnt), tw); 2739 } 2740 2741 #define TMPSZ 150 2742 2743 static int tcp4_seq_show(struct seq_file *seq, void *v) 2744 { 2745 struct tcp_iter_state *st; 2746 struct sock *sk = v; 2747 2748 seq_setwidth(seq, TMPSZ - 1); 2749 if (v == SEQ_START_TOKEN) { 2750 seq_puts(seq, " sl local_address rem_address st tx_queue " 2751 "rx_queue tr tm->when retrnsmt uid timeout " 2752 "inode"); 2753 goto out; 2754 } 2755 st = seq->private; 2756 2757 if (sk->sk_state == TCP_TIME_WAIT) 2758 get_timewait4_sock(v, seq, st->num); 2759 else if (sk->sk_state == TCP_NEW_SYN_RECV) 2760 get_openreq4(v, seq, st->num); 2761 else 2762 get_tcp4_sock(v, seq, st->num); 2763 out: 2764 seq_pad(seq, '\n'); 2765 return 0; 2766 } 2767 2768 #ifdef CONFIG_BPF_SYSCALL 2769 struct bpf_tcp_iter_state { 2770 struct tcp_iter_state state; 2771 unsigned int cur_sk; 2772 unsigned int end_sk; 2773 unsigned int max_sk; 2774 struct sock **batch; 2775 bool st_bucket_done; 2776 }; 2777 2778 struct bpf_iter__tcp { 2779 __bpf_md_ptr(struct bpf_iter_meta *, meta); 2780 __bpf_md_ptr(struct sock_common *, sk_common); 2781 uid_t uid __aligned(8); 2782 }; 2783 2784 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, 2785 struct sock_common *sk_common, uid_t uid) 2786 { 2787 struct bpf_iter__tcp ctx; 2788 2789 meta->seq_num--; /* skip SEQ_START_TOKEN */ 2790 ctx.meta = meta; 2791 ctx.sk_common = sk_common; 2792 ctx.uid = uid; 2793 return bpf_iter_run_prog(prog, &ctx); 2794 } 2795 2796 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter) 2797 { 2798 while (iter->cur_sk < iter->end_sk) 2799 sock_gen_put(iter->batch[iter->cur_sk++]); 2800 } 2801 2802 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter, 2803 unsigned int new_batch_sz) 2804 { 2805 struct sock **new_batch; 2806 2807 new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz, 2808 GFP_USER | __GFP_NOWARN); 2809 if (!new_batch) 2810 return -ENOMEM; 2811 2812 bpf_iter_tcp_put_batch(iter); 2813 kvfree(iter->batch); 2814 iter->batch = new_batch; 2815 iter->max_sk = new_batch_sz; 2816 2817 return 0; 2818 } 2819 2820 static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq, 2821 struct sock *start_sk) 2822 { 2823 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2824 struct bpf_tcp_iter_state *iter = seq->private; 2825 struct tcp_iter_state *st = &iter->state; 2826 struct hlist_nulls_node *node; 2827 unsigned int expected = 1; 2828 struct sock *sk; 2829 2830 sock_hold(start_sk); 2831 iter->batch[iter->end_sk++] = start_sk; 2832 2833 sk = sk_nulls_next(start_sk); 2834 sk_nulls_for_each_from(sk, node) { 2835 if (seq_sk_match(seq, sk)) { 2836 if (iter->end_sk < iter->max_sk) { 2837 sock_hold(sk); 2838 iter->batch[iter->end_sk++] = sk; 2839 } 2840 expected++; 2841 } 2842 } 2843 spin_unlock(&hinfo->lhash2[st->bucket].lock); 2844 2845 return expected; 2846 } 2847 2848 static unsigned int 
bpf_iter_tcp_established_batch(struct seq_file *seq, 2849 struct sock *start_sk) 2850 { 2851 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2852 struct bpf_tcp_iter_state *iter = seq->private; 2853 struct tcp_iter_state *st = &iter->state; 2854 struct hlist_nulls_node *node; 2855 unsigned int expected = 1; 2856 struct sock *sk; 2857 2858 sock_hold(start_sk); 2859 iter->batch[iter->end_sk++] = start_sk; 2860 2861 sk = sk_nulls_next(start_sk); 2862 sk_nulls_for_each_from(sk, node) { 2863 if (seq_sk_match(seq, sk)) { 2864 if (iter->end_sk < iter->max_sk) { 2865 sock_hold(sk); 2866 iter->batch[iter->end_sk++] = sk; 2867 } 2868 expected++; 2869 } 2870 } 2871 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket)); 2872 2873 return expected; 2874 } 2875 2876 static struct sock *bpf_iter_tcp_batch(struct seq_file *seq) 2877 { 2878 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2879 struct bpf_tcp_iter_state *iter = seq->private; 2880 struct tcp_iter_state *st = &iter->state; 2881 unsigned int expected; 2882 bool resized = false; 2883 struct sock *sk; 2884 2885 /* The st->bucket is done. Directly advance to the next 2886 * bucket instead of having the tcp_seek_last_pos() to skip 2887 * one by one in the current bucket and eventually find out 2888 * it has to advance to the next bucket. 2889 */ 2890 if (iter->st_bucket_done) { 2891 st->offset = 0; 2892 st->bucket++; 2893 if (st->state == TCP_SEQ_STATE_LISTENING && 2894 st->bucket > hinfo->lhash2_mask) { 2895 st->state = TCP_SEQ_STATE_ESTABLISHED; 2896 st->bucket = 0; 2897 } 2898 } 2899 2900 again: 2901 /* Get a new batch */ 2902 iter->cur_sk = 0; 2903 iter->end_sk = 0; 2904 iter->st_bucket_done = false; 2905 2906 sk = tcp_seek_last_pos(seq); 2907 if (!sk) 2908 return NULL; /* Done */ 2909 2910 if (st->state == TCP_SEQ_STATE_LISTENING) 2911 expected = bpf_iter_tcp_listening_batch(seq, sk); 2912 else 2913 expected = bpf_iter_tcp_established_batch(seq, sk); 2914 2915 if (iter->end_sk == expected) { 2916 iter->st_bucket_done = true; 2917 return sk; 2918 } 2919 2920 if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) { 2921 resized = true; 2922 goto again; 2923 } 2924 2925 return sk; 2926 } 2927 2928 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos) 2929 { 2930 /* bpf iter does not support lseek, so it always 2931 * continue from where it was stop()-ped. 2932 */ 2933 if (*pos) 2934 return bpf_iter_tcp_batch(seq); 2935 2936 return SEQ_START_TOKEN; 2937 } 2938 2939 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) 2940 { 2941 struct bpf_tcp_iter_state *iter = seq->private; 2942 struct tcp_iter_state *st = &iter->state; 2943 struct sock *sk; 2944 2945 /* Whenever seq_next() is called, the iter->cur_sk is 2946 * done with seq_show(), so advance to the next sk in 2947 * the batch. 2948 */ 2949 if (iter->cur_sk < iter->end_sk) { 2950 /* Keeping st->num consistent in tcp_iter_state. 2951 * bpf_iter_tcp does not use st->num. 2952 * meta.seq_num is used instead. 2953 */ 2954 st->num++; 2955 /* Move st->offset to the next sk in the bucket such that 2956 * the future start() will resume at st->offset in 2957 * st->bucket. See tcp_seek_last_pos(). 2958 */ 2959 st->offset++; 2960 sock_gen_put(iter->batch[iter->cur_sk++]); 2961 } 2962 2963 if (iter->cur_sk < iter->end_sk) 2964 sk = iter->batch[iter->cur_sk]; 2965 else 2966 sk = bpf_iter_tcp_batch(seq); 2967 2968 ++*pos; 2969 /* Keeping st->last_pos consistent in tcp_iter_state. 
2970 * bpf iter does not do lseek, so st->last_pos always equals to *pos. 2971 */ 2972 st->last_pos = *pos; 2973 return sk; 2974 } 2975 2976 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v) 2977 { 2978 struct bpf_iter_meta meta; 2979 struct bpf_prog *prog; 2980 struct sock *sk = v; 2981 uid_t uid; 2982 int ret; 2983 2984 if (v == SEQ_START_TOKEN) 2985 return 0; 2986 2987 if (sk_fullsock(sk)) 2988 lock_sock(sk); 2989 2990 if (unlikely(sk_unhashed(sk))) { 2991 ret = SEQ_SKIP; 2992 goto unlock; 2993 } 2994 2995 if (sk->sk_state == TCP_TIME_WAIT) { 2996 uid = 0; 2997 } else if (sk->sk_state == TCP_NEW_SYN_RECV) { 2998 const struct request_sock *req = v; 2999 3000 uid = from_kuid_munged(seq_user_ns(seq), 3001 sock_i_uid(req->rsk_listener)); 3002 } else { 3003 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)); 3004 } 3005 3006 meta.seq = seq; 3007 prog = bpf_iter_get_info(&meta, false); 3008 ret = tcp_prog_seq_show(prog, &meta, v, uid); 3009 3010 unlock: 3011 if (sk_fullsock(sk)) 3012 release_sock(sk); 3013 return ret; 3014 3015 } 3016 3017 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v) 3018 { 3019 struct bpf_tcp_iter_state *iter = seq->private; 3020 struct bpf_iter_meta meta; 3021 struct bpf_prog *prog; 3022 3023 if (!v) { 3024 meta.seq = seq; 3025 prog = bpf_iter_get_info(&meta, true); 3026 if (prog) 3027 (void)tcp_prog_seq_show(prog, &meta, v, 0); 3028 } 3029 3030 if (iter->cur_sk < iter->end_sk) { 3031 bpf_iter_tcp_put_batch(iter); 3032 iter->st_bucket_done = false; 3033 } 3034 } 3035 3036 static const struct seq_operations bpf_iter_tcp_seq_ops = { 3037 .show = bpf_iter_tcp_seq_show, 3038 .start = bpf_iter_tcp_seq_start, 3039 .next = bpf_iter_tcp_seq_next, 3040 .stop = bpf_iter_tcp_seq_stop, 3041 }; 3042 #endif 3043 static unsigned short seq_file_family(const struct seq_file *seq) 3044 { 3045 const struct tcp_seq_afinfo *afinfo; 3046 3047 #ifdef CONFIG_BPF_SYSCALL 3048 /* Iterated from bpf_iter. Let the bpf prog to filter instead. */ 3049 if (seq->op == &bpf_iter_tcp_seq_ops) 3050 return AF_UNSPEC; 3051 #endif 3052 3053 /* Iterated from proc fs */ 3054 afinfo = pde_data(file_inode(seq->file)); 3055 return afinfo->family; 3056 } 3057 3058 static const struct seq_operations tcp4_seq_ops = { 3059 .show = tcp4_seq_show, 3060 .start = tcp_seq_start, 3061 .next = tcp_seq_next, 3062 .stop = tcp_seq_stop, 3063 }; 3064 3065 static struct tcp_seq_afinfo tcp4_seq_afinfo = { 3066 .family = AF_INET, 3067 }; 3068 3069 static int __net_init tcp4_proc_init_net(struct net *net) 3070 { 3071 if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops, 3072 sizeof(struct tcp_iter_state), &tcp4_seq_afinfo)) 3073 return -ENOMEM; 3074 return 0; 3075 } 3076 3077 static void __net_exit tcp4_proc_exit_net(struct net *net) 3078 { 3079 remove_proc_entry("tcp", net->proc_net); 3080 } 3081 3082 static struct pernet_operations tcp4_net_ops = { 3083 .init = tcp4_proc_init_net, 3084 .exit = tcp4_proc_exit_net, 3085 }; 3086 3087 int __init tcp4_proc_init(void) 3088 { 3089 return register_pernet_subsys(&tcp4_net_ops); 3090 } 3091 3092 void tcp4_proc_exit(void) 3093 { 3094 unregister_pernet_subsys(&tcp4_net_ops); 3095 } 3096 #endif /* CONFIG_PROC_FS */ 3097 3098 /* @wake is one when sk_stream_write_space() calls us. 3099 * This sends EPOLLOUT only if notsent_bytes is half the limit. 3100 * This mimics the strategy used in sock_def_write_space(). 
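 * Illustration (hypothetical numbers): with tcp_notsent_lowat = 131072 and
 * wake = 1, EPOLLOUT is reported only once write_seq - snd_nxt drops
 * below 65536 bytes.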
3101 */ 3102 bool tcp_stream_memory_free(const struct sock *sk, int wake) 3103 { 3104 const struct tcp_sock *tp = tcp_sk(sk); 3105 u32 notsent_bytes = READ_ONCE(tp->write_seq) - 3106 READ_ONCE(tp->snd_nxt); 3107 3108 return (notsent_bytes << wake) < tcp_notsent_lowat(tp); 3109 } 3110 EXPORT_SYMBOL(tcp_stream_memory_free); 3111 3112 struct proto tcp_prot = { 3113 .name = "TCP", 3114 .owner = THIS_MODULE, 3115 .close = tcp_close, 3116 .pre_connect = tcp_v4_pre_connect, 3117 .connect = tcp_v4_connect, 3118 .disconnect = tcp_disconnect, 3119 .accept = inet_csk_accept, 3120 .ioctl = tcp_ioctl, 3121 .init = tcp_v4_init_sock, 3122 .destroy = tcp_v4_destroy_sock, 3123 .shutdown = tcp_shutdown, 3124 .setsockopt = tcp_setsockopt, 3125 .getsockopt = tcp_getsockopt, 3126 .bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt, 3127 .keepalive = tcp_set_keepalive, 3128 .recvmsg = tcp_recvmsg, 3129 .sendmsg = tcp_sendmsg, 3130 .splice_eof = tcp_splice_eof, 3131 .backlog_rcv = tcp_v4_do_rcv, 3132 .release_cb = tcp_release_cb, 3133 .hash = inet_hash, 3134 .unhash = inet_unhash, 3135 .get_port = inet_csk_get_port, 3136 .put_port = inet_put_port, 3137 #ifdef CONFIG_BPF_SYSCALL 3138 .psock_update_sk_prot = tcp_bpf_update_proto, 3139 #endif 3140 .enter_memory_pressure = tcp_enter_memory_pressure, 3141 .leave_memory_pressure = tcp_leave_memory_pressure, 3142 .stream_memory_free = tcp_stream_memory_free, 3143 .sockets_allocated = &tcp_sockets_allocated, 3144 .orphan_count = &tcp_orphan_count, 3145 3146 .memory_allocated = &tcp_memory_allocated, 3147 .per_cpu_fw_alloc = &tcp_memory_per_cpu_fw_alloc, 3148 3149 .memory_pressure = &tcp_memory_pressure, 3150 .sysctl_mem = sysctl_tcp_mem, 3151 .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem), 3152 .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), 3153 .max_header = MAX_TCP_HEADER, 3154 .obj_size = sizeof(struct tcp_sock), 3155 .slab_flags = SLAB_TYPESAFE_BY_RCU, 3156 .twsk_prot = &tcp_timewait_sock_ops, 3157 .rsk_prot = &tcp_request_sock_ops, 3158 .h.hashinfo = NULL, 3159 .no_autobind = true, 3160 .diag_destroy = tcp_abort, 3161 }; 3162 EXPORT_SYMBOL(tcp_prot); 3163 3164 static void __net_exit tcp_sk_exit(struct net *net) 3165 { 3166 if (net->ipv4.tcp_congestion_control) 3167 bpf_module_put(net->ipv4.tcp_congestion_control, 3168 net->ipv4.tcp_congestion_control->owner); 3169 } 3170 3171 static void __net_init tcp_set_hashinfo(struct net *net) 3172 { 3173 struct inet_hashinfo *hinfo; 3174 unsigned int ehash_entries; 3175 struct net *old_net; 3176 3177 if (net_eq(net, &init_net)) 3178 goto fallback; 3179 3180 old_net = current->nsproxy->net_ns; 3181 ehash_entries = READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries); 3182 if (!ehash_entries) 3183 goto fallback; 3184 3185 ehash_entries = roundup_pow_of_two(ehash_entries); 3186 hinfo = inet_pernet_hashinfo_alloc(&tcp_hashinfo, ehash_entries); 3187 if (!hinfo) { 3188 pr_warn("Failed to allocate TCP ehash (entries: %u) " 3189 "for a netns, fallback to the global one\n", 3190 ehash_entries); 3191 fallback: 3192 hinfo = &tcp_hashinfo; 3193 ehash_entries = tcp_hashinfo.ehash_mask + 1; 3194 } 3195 3196 net->ipv4.tcp_death_row.hashinfo = hinfo; 3197 net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2; 3198 net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128); 3199 } 3200 3201 static int __net_init tcp_sk_init(struct net *net) 3202 { 3203 net->ipv4.sysctl_tcp_ecn = 2; 3204 net->ipv4.sysctl_tcp_ecn_fallback = 1; 3205 3206 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS; 3207 
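	/* MSS and path-MTU probing defaults around here are consumed by
	 * __tcp_mtu_to_mss(), tcp_mtu_probe() and tcp_mtu_probing().
	 */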
net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS; 3208 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD; 3209 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL; 3210 net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS; 3211 3212 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME; 3213 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES; 3214 net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL; 3215 3216 net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES; 3217 net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES; 3218 net->ipv4.sysctl_tcp_syncookies = 1; 3219 net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH; 3220 net->ipv4.sysctl_tcp_retries1 = TCP_RETR1; 3221 net->ipv4.sysctl_tcp_retries2 = TCP_RETR2; 3222 net->ipv4.sysctl_tcp_orphan_retries = 0; 3223 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT; 3224 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX; 3225 net->ipv4.sysctl_tcp_tw_reuse = 2; 3226 net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1; 3227 3228 refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1); 3229 tcp_set_hashinfo(net); 3230 3231 net->ipv4.sysctl_tcp_sack = 1; 3232 net->ipv4.sysctl_tcp_window_scaling = 1; 3233 net->ipv4.sysctl_tcp_timestamps = 1; 3234 net->ipv4.sysctl_tcp_early_retrans = 3; 3235 net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION; 3236 net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */ 3237 net->ipv4.sysctl_tcp_retrans_collapse = 1; 3238 net->ipv4.sysctl_tcp_max_reordering = 300; 3239 net->ipv4.sysctl_tcp_dsack = 1; 3240 net->ipv4.sysctl_tcp_app_win = 31; 3241 net->ipv4.sysctl_tcp_adv_win_scale = 1; 3242 net->ipv4.sysctl_tcp_frto = 2; 3243 net->ipv4.sysctl_tcp_moderate_rcvbuf = 1; 3244 /* This limits the percentage of the congestion window which we 3245 * will allow a single TSO frame to consume. Building TSO frames 3246 * which are too large can cause TCP streams to be bursty. 3247 */ 3248 net->ipv4.sysctl_tcp_tso_win_divisor = 3; 3249 /* Default TSQ limit of 16 TSO segments */ 3250 net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536; 3251 3252 /* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. 
*/ 3253 net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX; 3254 3255 net->ipv4.sysctl_tcp_min_tso_segs = 2; 3256 net->ipv4.sysctl_tcp_tso_rtt_log = 9; /* 2^9 = 512 usec */ 3257 net->ipv4.sysctl_tcp_min_rtt_wlen = 300; 3258 net->ipv4.sysctl_tcp_autocorking = 1; 3259 net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2; 3260 net->ipv4.sysctl_tcp_pacing_ss_ratio = 200; 3261 net->ipv4.sysctl_tcp_pacing_ca_ratio = 120; 3262 if (net != &init_net) { 3263 memcpy(net->ipv4.sysctl_tcp_rmem, 3264 init_net.ipv4.sysctl_tcp_rmem, 3265 sizeof(init_net.ipv4.sysctl_tcp_rmem)); 3266 memcpy(net->ipv4.sysctl_tcp_wmem, 3267 init_net.ipv4.sysctl_tcp_wmem, 3268 sizeof(init_net.ipv4.sysctl_tcp_wmem)); 3269 } 3270 net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC; 3271 net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC; 3272 net->ipv4.sysctl_tcp_comp_sack_nr = 44; 3273 net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; 3274 net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0; 3275 atomic_set(&net->ipv4.tfo_active_disable_times, 0); 3276 3277 /* Set default values for PLB */ 3278 net->ipv4.sysctl_tcp_plb_enabled = 0; /* Disabled by default */ 3279 net->ipv4.sysctl_tcp_plb_idle_rehash_rounds = 3; 3280 net->ipv4.sysctl_tcp_plb_rehash_rounds = 12; 3281 net->ipv4.sysctl_tcp_plb_suspend_rto_sec = 60; 3282 /* Default congestion threshold for PLB to mark a round is 50% */ 3283 net->ipv4.sysctl_tcp_plb_cong_thresh = (1 << TCP_PLB_SCALE) / 2; 3284 3285 /* Reno is always built in */ 3286 if (!net_eq(net, &init_net) && 3287 bpf_try_module_get(init_net.ipv4.tcp_congestion_control, 3288 init_net.ipv4.tcp_congestion_control->owner)) 3289 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control; 3290 else 3291 net->ipv4.tcp_congestion_control = &tcp_reno; 3292 3293 net->ipv4.sysctl_tcp_syn_linear_timeouts = 4; 3294 net->ipv4.sysctl_tcp_shrink_window = 0; 3295 3296 return 0; 3297 } 3298 3299 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list) 3300 { 3301 struct net *net; 3302 3303 tcp_twsk_purge(net_exit_list, AF_INET); 3304 3305 list_for_each_entry(net, net_exit_list, exit_list) { 3306 inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo); 3307 WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount)); 3308 tcp_fastopen_ctx_destroy(net); 3309 } 3310 } 3311 3312 static struct pernet_operations __net_initdata tcp_sk_ops = { 3313 .init = tcp_sk_init, 3314 .exit = tcp_sk_exit, 3315 .exit_batch = tcp_sk_exit_batch, 3316 }; 3317 3318 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 3319 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta, 3320 struct sock_common *sk_common, uid_t uid) 3321 3322 #define INIT_BATCH_SZ 16 3323 3324 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux) 3325 { 3326 struct bpf_tcp_iter_state *iter = priv_data; 3327 int err; 3328 3329 err = bpf_iter_init_seq_net(priv_data, aux); 3330 if (err) 3331 return err; 3332 3333 err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ); 3334 if (err) { 3335 bpf_iter_fini_seq_net(priv_data); 3336 return err; 3337 } 3338 3339 return 0; 3340 } 3341 3342 static void bpf_iter_fini_tcp(void *priv_data) 3343 { 3344 struct bpf_tcp_iter_state *iter = priv_data; 3345 3346 bpf_iter_fini_seq_net(priv_data); 3347 kvfree(iter->batch); 3348 } 3349 3350 static const struct bpf_iter_seq_info tcp_seq_info = { 3351 .seq_ops = &bpf_iter_tcp_seq_ops, 3352 .init_seq_private = bpf_iter_init_tcp, 3353 .fini_seq_private = bpf_iter_fini_tcp, 3354 .seq_priv_size = sizeof(struct 
bpf_tcp_iter_state), 3355 }; 3356 3357 static const struct bpf_func_proto * 3358 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id, 3359 const struct bpf_prog *prog) 3360 { 3361 switch (func_id) { 3362 case BPF_FUNC_setsockopt: 3363 return &bpf_sk_setsockopt_proto; 3364 case BPF_FUNC_getsockopt: 3365 return &bpf_sk_getsockopt_proto; 3366 default: 3367 return NULL; 3368 } 3369 } 3370 3371 static struct bpf_iter_reg tcp_reg_info = { 3372 .target = "tcp", 3373 .ctx_arg_info_size = 1, 3374 .ctx_arg_info = { 3375 { offsetof(struct bpf_iter__tcp, sk_common), 3376 PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED }, 3377 }, 3378 .get_func_proto = bpf_iter_tcp_get_func_proto, 3379 .seq_info = &tcp_seq_info, 3380 }; 3381 3382 static void __init bpf_iter_register(void) 3383 { 3384 tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON]; 3385 if (bpf_iter_reg_target(&tcp_reg_info)) 3386 pr_warn("Warning: could not register bpf iterator tcp\n"); 3387 } 3388 3389 #endif 3390 3391 void __init tcp_v4_init(void) 3392 { 3393 int cpu, res; 3394 3395 for_each_possible_cpu(cpu) { 3396 struct sock *sk; 3397 3398 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW, 3399 IPPROTO_TCP, &init_net); 3400 if (res) 3401 panic("Failed to create the TCP control socket.\n"); 3402 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); 3403 3404 /* Please enforce IP_DF and IPID==0 for RST and 3405 * ACK sent in SYN-RECV and TIME-WAIT state. 3406 */ 3407 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO; 3408 3409 per_cpu(ipv4_tcp_sk, cpu) = sk; 3410 } 3411 if (register_pernet_subsys(&tcp_sk_ops)) 3412 panic("Failed to create the TCP control socket.\n"); 3413 3414 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 3415 bpf_iter_register(); 3416 #endif 3417 } 3418
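/*
 * Illustrative sketch only (not part of this file's build): a minimal BPF
 * iterator program attaching to the "tcp" target registered above.  It
 * assumes a typical libbpf setup (vmlinux.h, bpf_helpers.h providing
 * BPF_SEQ_PRINTF); the program name dump_tcp and the pin path are
 * arbitrary.
 *
 *	#include "vmlinux.h"
 *	#include <bpf/bpf_helpers.h>
 *
 *	char _license[] SEC("license") = "GPL";
 *
 *	SEC("iter/tcp")
 *	int dump_tcp(struct bpf_iter__tcp *ctx)
 *	{
 *		struct sock_common *skc = ctx->sk_common;
 *
 *		if (!skc)
 *			return 0;
 *		BPF_SEQ_PRINTF(ctx->meta->seq, "%pI4:%u state %u\n",
 *			       &skc->skc_rcv_saddr, skc->skc_num,
 *			       skc->skc_state);
 *		return 0;
 *	}
 *
 * Once loaded and pinned (e.g. "bpftool iter pin dump_tcp.bpf.o
 * /sys/fs/bpf/tcp_iter"), reading the pinned file drives
 * bpf_iter_tcp_seq_ops above: one program invocation per batched socket,
 * with uid resolved as in bpf_iter_tcp_seq_show().
 */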