// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 *		IPv4 specific functions
 *
 *		code split from:
 *		linux/ipv4/tcp.c
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *	See tcp.c for author information
 */

/*
 * Changes:
 *	David S. Miller		:	New socket lookup architecture.
 *					This code is dedicated to John Dyson.
 *	David S. Miller		:	Change semantics of established hash,
 *					half is devoted to TIME_WAIT sockets
 *					and the rest go in the other half.
 *	Andi Kleen		:	Add support for syncookies and fixed
 *					some bugs: ip options weren't passed to
 *					the TCP layer, missed a check for an
 *					ACK bit.
 *	Andi Kleen		:	Implemented fast path mtu discovery.
 *					Fixed many serious bugs in the
 *					request_sock handling and moved
 *					most of it into the af independent code.
 *					Added tail drop and some other bugfixes.
 *					Added new listen semantics.
 *	Mike McLagan		:	Routing by source
 *	Juan Jose Ciarlante	:	ip_dynaddr bits
 *	Andi Kleen		:	various fixes.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year
 *					coma.
 *	Andi Kleen		:	Fix new listen.
 *	Andi Kleen		:	Fix accept error reporting.
 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
 *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
 *					a single port at the same time.
 */

#define pr_fmt(fmt) "TCP: " fmt

#include <linux/bottom_half.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <linux/sched.h>

#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/secure_seq.h>
#include <net/busy_poll.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/inetdevice.h>
#include <linux/btf_ids.h>

#include <crypto/hash.h>
#include <linux/scatterlist.h>

#include <trace/events/tcp.h>

#ifdef CONFIG_TCP_MD5SIG
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
#endif

struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);

static DEFINE_PER_CPU(struct sock *, ipv4_tcp_sk);

static DEFINE_MUTEX(tcp_exit_batch_mutex);

static u32 tcp_v4_init_seq(const struct sk_buff *skb)
{
	return secure_tcp_seq(ip_hdr(skb)->daddr,
			      ip_hdr(skb)->saddr,
			      tcp_hdr(skb)->dest,
			      tcp_hdr(skb)->source);
}

static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
{
	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
}

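/* tcp_twsk_unique() decides whether a TIME-WAIT socket occupying the same
 * 4-tuple may be reused for a new outgoing connection.  Roughly: with
 * sysctl_tcp_tw_reuse enabled (mode 2 additionally requires a loopback
 * source/destination or binding to 'lo'), reuse is allowed once the
 * timestamp recorded in the TIME-WAIT bucket is old enough that PAWS can
 * tell the old and the new incarnation apart; see the checks below.
 */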
int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
	int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
	const struct inet_timewait_sock *tw = inet_twsk(sktw);
	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
	struct tcp_sock *tp = tcp_sk(sk);

	if (reuse == 2) {
		/* Still does not detect *everything* that goes through
		 * lo, since we require a loopback src or dst address
		 * or direct binding to 'lo' interface.
		 */
		bool loopback = false;
		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
			loopback = true;
#if IS_ENABLED(CONFIG_IPV6)
		if (tw->tw_family == AF_INET6) {
			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
				loopback = true;
		} else
#endif
		{
			if (ipv4_is_loopback(tw->tw_daddr) ||
			    ipv4_is_loopback(tw->tw_rcv_saddr))
				loopback = true;
		}
		if (!loopback)
			reuse = 0;
	}

	/* With PAWS, it is safe from the viewpoint
	   of data integrity. Even without PAWS it is safe provided sequence
	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.

	   Actually, the idea is close to VJ's one, only timestamp cache is
	   held not per host, but per port pair and TW bucket is used as state
	   holder.

	   If TW bucket has been already destroyed we fall back to VJ's scheme
	   and use initial timestamp retrieved from peer table.
	 */
	if (tcptw->tw_ts_recent_stamp &&
	    (!twp || (reuse && time_after32(ktime_get_seconds(),
					    tcptw->tw_ts_recent_stamp)))) {
		/* inet_twsk_hashdance() sets sk_refcnt after putting twsk
		 * and releasing the bucket lock.
		 */
		if (unlikely(!refcount_inc_not_zero(&sktw->sk_refcnt)))
			return 0;

		/* In case of repair and re-using TIME-WAIT sockets we still
		 * want to be sure that it is safe as above but honor the
		 * sequence numbers and time stamps set as part of the repair
		 * process.
		 *
		 * Without this check re-using a TIME-WAIT socket with TCP
		 * repair would accumulate a -1 on the repair assigned
		 * sequence number. The first time it is reused the sequence
		 * is -1, the second time -2, etc. This fixes that issue
		 * without appearing to create any others.
		 */
		if (likely(!tp->repair)) {
			u32 seq = tcptw->tw_snd_nxt + 65535 + 2;

			if (!seq)
				seq = 1;
			WRITE_ONCE(tp->write_seq, seq);
			tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
			tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
		}

		return 1;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(tcp_twsk_unique);

static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
			      int addr_len)
{
	/* This check is replicated from tcp_v4_connect() and intended to
	 * prevent BPF program called below from accessing bytes that are out
	 * of the bound specified by user in addr_len.
	 */
	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	sock_owned_by_me(sk);

	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr, &addr_len);
}

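/* Note: the cgroup BPF hook invoked above may rewrite the destination
 * address/port in @uaddr before tcp_v4_connect() ever sees it, which is why
 * the addr_len sanity check is performed both here and in tcp_v4_connect()
 * itself.
 */
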
/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct inet_timewait_death_row *tcp_death_row;
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct ip_options_rcu *inet_opt;
	struct net *net = sock_net(sk);
	__be16 orig_sport, orig_dport;
	__be32 daddr, nexthop;
	struct flowi4 *fl4;
	struct rtable *rt;
	int err;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	inet_opt = rcu_dereference_protected(inet->inet_opt,
					     lockdep_sock_is_held(sk));
	if (inet_opt && inet_opt->opt.srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet_opt->opt.faddr;
	}

	orig_sport = inet->inet_sport;
	orig_dport = usin->sin_port;
	fl4 = &inet->cork.fl.u.ip4;
	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
			      sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
			      orig_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		if (err == -ENETUNREACH)
			IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
		return err;
	}

	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet_opt || !inet_opt->opt.srr)
		daddr = fl4->daddr;

	tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;

	if (!inet->inet_saddr) {
		err = inet_bhash2_update_saddr(sk, &fl4->saddr, AF_INET);
		if (err) {
			ip_rt_put(rt);
			return err;
		}
	} else {
		sk_rcv_saddr_set(sk, inet->inet_saddr);
	}

	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent	   = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		if (likely(!tp->repair))
			WRITE_ONCE(tp->write_seq, 0);
	}

	inet->inet_dport = usin->sin_port;
	sk_daddr_set(sk, daddr);

	inet_csk(sk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;

	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and, without releasing the socket
	 * lock, select a source port, enter ourselves into the hash tables and
	 * complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(tcp_death_row, sk);
	if (err)
		goto failure;

	sk_set_txhash(sk);

	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
			       inet->inet_sport, inet->inet_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		rt = NULL;
		goto failure;
	}
	/* OK, now commit destination to socket.  */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->dst);
	rt = NULL;

	if (likely(!tp->repair)) {
		if (!tp->write_seq)
			WRITE_ONCE(tp->write_seq,
				   secure_tcp_seq(inet->inet_saddr,
						  inet->inet_daddr,
						  inet->inet_sport,
						  usin->sin_port));
		WRITE_ONCE(tp->tsoffset,
			   secure_tcp_ts_off(net, inet->inet_saddr,
					     inet->inet_daddr));
	}

	atomic_set(&inet->inet_id, get_random_u16());

	if (tcp_fastopen_defer_connect(sk, &err))
		return err;
	if (err)
		goto failure;

	err = tcp_connect(sk);

	if (err)
		goto failure;

	return 0;

failure:
	/*
	 * This unhashes the socket and releases the local port,
	 * if necessary.
	 */
342 */ 343 tcp_set_state(sk, TCP_CLOSE); 344 inet_bhash2_reset_saddr(sk); 345 ip_rt_put(rt); 346 sk->sk_route_caps = 0; 347 inet->inet_dport = 0; 348 return err; 349 } 350 EXPORT_SYMBOL(tcp_v4_connect); 351 352 /* 353 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191. 354 * It can be called through tcp_release_cb() if socket was owned by user 355 * at the time tcp_v4_err() was called to handle ICMP message. 356 */ 357 void tcp_v4_mtu_reduced(struct sock *sk) 358 { 359 struct inet_sock *inet = inet_sk(sk); 360 struct dst_entry *dst; 361 u32 mtu; 362 363 if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) 364 return; 365 mtu = READ_ONCE(tcp_sk(sk)->mtu_info); 366 dst = inet_csk_update_pmtu(sk, mtu); 367 if (!dst) 368 return; 369 370 /* Something is about to be wrong... Remember soft error 371 * for the case, if this connection will not able to recover. 372 */ 373 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst)) 374 WRITE_ONCE(sk->sk_err_soft, EMSGSIZE); 375 376 mtu = dst_mtu(dst); 377 378 if (inet->pmtudisc != IP_PMTUDISC_DONT && 379 ip_sk_accept_pmtu(sk) && 380 inet_csk(sk)->icsk_pmtu_cookie > mtu) { 381 tcp_sync_mss(sk, mtu); 382 383 /* Resend the TCP packet because it's 384 * clear that the old packet has been 385 * dropped. This is the new "fast" path mtu 386 * discovery. 387 */ 388 tcp_simple_retransmit(sk); 389 } /* else let the usual retransmit timer handle it */ 390 } 391 EXPORT_SYMBOL(tcp_v4_mtu_reduced); 392 393 static void do_redirect(struct sk_buff *skb, struct sock *sk) 394 { 395 struct dst_entry *dst = __sk_dst_check(sk, 0); 396 397 if (dst) 398 dst->ops->redirect(dst, sk, skb); 399 } 400 401 402 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */ 403 void tcp_req_err(struct sock *sk, u32 seq, bool abort) 404 { 405 struct request_sock *req = inet_reqsk(sk); 406 struct net *net = sock_net(sk); 407 408 /* ICMPs are not backlogged, hence we cannot get 409 * an established socket here. 410 */ 411 if (seq != tcp_rsk(req)->snt_isn) { 412 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS); 413 } else if (abort) { 414 /* 415 * Still in SYN_RECV, just remove it silently. 416 * There is no good way to pass the error to the newly 417 * created socket, and POSIX does not want network 418 * errors returned from accept(). 419 */ 420 inet_csk_reqsk_queue_drop(req->rsk_listener, req); 421 tcp_listendrop(req->rsk_listener); 422 } 423 reqsk_put(req); 424 } 425 EXPORT_SYMBOL(tcp_req_err); 426 427 /* TCP-LD (RFC 6069) logic */ 428 void tcp_ld_RTO_revert(struct sock *sk, u32 seq) 429 { 430 struct inet_connection_sock *icsk = inet_csk(sk); 431 struct tcp_sock *tp = tcp_sk(sk); 432 struct sk_buff *skb; 433 s32 remaining; 434 u32 delta_us; 435 436 if (sock_owned_by_user(sk)) 437 return; 438 439 if (seq != tp->snd_una || !icsk->icsk_retransmits || 440 !icsk->icsk_backoff) 441 return; 442 443 skb = tcp_rtx_queue_head(sk); 444 if (WARN_ON_ONCE(!skb)) 445 return; 446 447 icsk->icsk_backoff--; 448 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT; 449 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX); 450 451 tcp_mstamp_refresh(tp); 452 delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb)); 453 remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us); 454 455 if (remaining > 0) { 456 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, 457 remaining, TCP_RTO_MAX); 458 } else { 459 /* RTO revert clocked out retransmission. 460 * Will retransmit now. 
461 */ 462 tcp_retransmit_timer(sk); 463 } 464 } 465 EXPORT_SYMBOL(tcp_ld_RTO_revert); 466 467 /* 468 * This routine is called by the ICMP module when it gets some 469 * sort of error condition. If err < 0 then the socket should 470 * be closed and the error returned to the user. If err > 0 471 * it's just the icmp type << 8 | icmp code. After adjustment 472 * header points to the first 8 bytes of the tcp header. We need 473 * to find the appropriate port. 474 * 475 * The locking strategy used here is very "optimistic". When 476 * someone else accesses the socket the ICMP is just dropped 477 * and for some paths there is no check at all. 478 * A more general error queue to queue errors for later handling 479 * is probably better. 480 * 481 */ 482 483 int tcp_v4_err(struct sk_buff *skb, u32 info) 484 { 485 const struct iphdr *iph = (const struct iphdr *)skb->data; 486 struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2)); 487 struct tcp_sock *tp; 488 const int type = icmp_hdr(skb)->type; 489 const int code = icmp_hdr(skb)->code; 490 struct sock *sk; 491 struct request_sock *fastopen; 492 u32 seq, snd_una; 493 int err; 494 struct net *net = dev_net(skb->dev); 495 496 sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, 497 iph->daddr, th->dest, iph->saddr, 498 ntohs(th->source), inet_iif(skb), 0); 499 if (!sk) { 500 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); 501 return -ENOENT; 502 } 503 if (sk->sk_state == TCP_TIME_WAIT) { 504 inet_twsk_put(inet_twsk(sk)); 505 return 0; 506 } 507 seq = ntohl(th->seq); 508 if (sk->sk_state == TCP_NEW_SYN_RECV) { 509 tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB || 510 type == ICMP_TIME_EXCEEDED || 511 (type == ICMP_DEST_UNREACH && 512 (code == ICMP_NET_UNREACH || 513 code == ICMP_HOST_UNREACH))); 514 return 0; 515 } 516 517 bh_lock_sock(sk); 518 /* If too many ICMPs get dropped on busy 519 * servers this needs to be solved differently. 520 * We do take care of PMTU discovery (RFC1191) special case : 521 * we can receive locally generated ICMP messages while socket is held. 522 */ 523 if (sock_owned_by_user(sk)) { 524 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)) 525 __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS); 526 } 527 if (sk->sk_state == TCP_CLOSE) 528 goto out; 529 530 if (static_branch_unlikely(&ip4_min_ttl)) { 531 /* min_ttl can be changed concurrently from do_ip_setsockopt() */ 532 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) { 533 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); 534 goto out; 535 } 536 } 537 538 tp = tcp_sk(sk); 539 /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */ 540 fastopen = rcu_dereference(tp->fastopen_rsk); 541 snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una; 542 if (sk->sk_state != TCP_LISTEN && 543 !between(seq, snd_una, tp->snd_nxt)) { 544 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS); 545 goto out; 546 } 547 548 switch (type) { 549 case ICMP_REDIRECT: 550 if (!sock_owned_by_user(sk)) 551 do_redirect(skb, sk); 552 goto out; 553 case ICMP_SOURCE_QUENCH: 554 /* Just silently ignore these. */ 555 goto out; 556 case ICMP_PARAMETERPROB: 557 err = EPROTO; 558 break; 559 case ICMP_DEST_UNREACH: 560 if (code > NR_ICMP_UNREACH) 561 goto out; 562 563 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */ 564 /* We are not interested in TCP_LISTEN and open_requests 565 * (SYN-ACKs send out by Linux are always <576bytes so 566 * they should go through unfragmented). 
567 */ 568 if (sk->sk_state == TCP_LISTEN) 569 goto out; 570 571 WRITE_ONCE(tp->mtu_info, info); 572 if (!sock_owned_by_user(sk)) { 573 tcp_v4_mtu_reduced(sk); 574 } else { 575 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags)) 576 sock_hold(sk); 577 } 578 goto out; 579 } 580 581 err = icmp_err_convert[code].errno; 582 /* check if this ICMP message allows revert of backoff. 583 * (see RFC 6069) 584 */ 585 if (!fastopen && 586 (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH)) 587 tcp_ld_RTO_revert(sk, seq); 588 break; 589 case ICMP_TIME_EXCEEDED: 590 err = EHOSTUNREACH; 591 break; 592 default: 593 goto out; 594 } 595 596 switch (sk->sk_state) { 597 case TCP_SYN_SENT: 598 case TCP_SYN_RECV: 599 /* Only in fast or simultaneous open. If a fast open socket is 600 * already accepted it is treated as a connected one below. 601 */ 602 if (fastopen && !fastopen->sk) 603 break; 604 605 ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th); 606 607 if (!sock_owned_by_user(sk)) 608 tcp_done_with_error(sk, err); 609 else 610 WRITE_ONCE(sk->sk_err_soft, err); 611 goto out; 612 } 613 614 /* If we've already connected we will keep trying 615 * until we time out, or the user gives up. 616 * 617 * rfc1122 4.2.3.9 allows to consider as hard errors 618 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too, 619 * but it is obsoleted by pmtu discovery). 620 * 621 * Note, that in modern internet, where routing is unreliable 622 * and in each dark corner broken firewalls sit, sending random 623 * errors ordered by their masters even this two messages finally lose 624 * their original sense (even Linux sends invalid PORT_UNREACHs) 625 * 626 * Now we are in compliance with RFCs. 627 * --ANK (980905) 628 */ 629 630 if (!sock_owned_by_user(sk) && 631 inet_test_bit(RECVERR, sk)) { 632 WRITE_ONCE(sk->sk_err, err); 633 sk_error_report(sk); 634 } else { /* Only an error on timeout */ 635 WRITE_ONCE(sk->sk_err_soft, err); 636 } 637 638 out: 639 bh_unlock_sock(sk); 640 sock_put(sk); 641 return 0; 642 } 643 644 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr) 645 { 646 struct tcphdr *th = tcp_hdr(skb); 647 648 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0); 649 skb->csum_start = skb_transport_header(skb) - skb->head; 650 skb->csum_offset = offsetof(struct tcphdr, check); 651 } 652 653 /* This routine computes an IPv4 TCP checksum. */ 654 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb) 655 { 656 const struct inet_sock *inet = inet_sk(sk); 657 658 __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr); 659 } 660 EXPORT_SYMBOL(tcp_v4_send_check); 661 662 /* 663 * This routine will send an RST to the other tcp. 664 * 665 * Someone asks: why I NEVER use socket parameters (TOS, TTL etc.) 666 * for reset. 667 * Answer: if a packet caused RST, it is not for a socket 668 * existing in our system, if it is matched to a socket, 669 * it is just duplicate segment or bug in other side's TCP. 670 * So that we build reply only basing on parameters 671 * arrived with segment. 672 * Exception: precedence violation. We do not implement it in any case. 
673 */ 674 675 #ifdef CONFIG_TCP_MD5SIG 676 #define OPTION_BYTES TCPOLEN_MD5SIG_ALIGNED 677 #else 678 #define OPTION_BYTES sizeof(__be32) 679 #endif 680 681 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) 682 { 683 const struct tcphdr *th = tcp_hdr(skb); 684 struct { 685 struct tcphdr th; 686 __be32 opt[OPTION_BYTES / sizeof(__be32)]; 687 } rep; 688 struct ip_reply_arg arg; 689 #ifdef CONFIG_TCP_MD5SIG 690 struct tcp_md5sig_key *key = NULL; 691 const __u8 *hash_location = NULL; 692 unsigned char newhash[16]; 693 int genhash; 694 struct sock *sk1 = NULL; 695 #endif 696 u64 transmit_time = 0; 697 struct sock *ctl_sk; 698 struct net *net; 699 u32 txhash = 0; 700 701 /* Never send a reset in response to a reset. */ 702 if (th->rst) 703 return; 704 705 /* If sk not NULL, it means we did a successful lookup and incoming 706 * route had to be correct. prequeue might have dropped our dst. 707 */ 708 if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL) 709 return; 710 711 /* Swap the send and the receive. */ 712 memset(&rep, 0, sizeof(rep)); 713 rep.th.dest = th->source; 714 rep.th.source = th->dest; 715 rep.th.doff = sizeof(struct tcphdr) / 4; 716 rep.th.rst = 1; 717 718 if (th->ack) { 719 rep.th.seq = th->ack_seq; 720 } else { 721 rep.th.ack = 1; 722 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin + 723 skb->len - (th->doff << 2)); 724 } 725 726 memset(&arg, 0, sizeof(arg)); 727 arg.iov[0].iov_base = (unsigned char *)&rep; 728 arg.iov[0].iov_len = sizeof(rep.th); 729 730 net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev); 731 #ifdef CONFIG_TCP_MD5SIG 732 rcu_read_lock(); 733 hash_location = tcp_parse_md5sig_option(th); 734 if (sk && sk_fullsock(sk)) { 735 const union tcp_md5_addr *addr; 736 int l3index; 737 738 /* sdif set, means packet ingressed via a device 739 * in an L3 domain and inet_iif is set to it. 740 */ 741 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0; 742 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; 743 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET); 744 } else if (hash_location) { 745 const union tcp_md5_addr *addr; 746 int sdif = tcp_v4_sdif(skb); 747 int dif = inet_iif(skb); 748 int l3index; 749 750 /* 751 * active side is lost. Try to find listening socket through 752 * source port, and then find md5 key through listening socket. 753 * we are not loose security here: 754 * Incoming packet is checked with md5 hash with finding key, 755 * no RST generated if md5 hash doesn't match. 756 */ 757 sk1 = __inet_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo, 758 NULL, 0, ip_hdr(skb)->saddr, 759 th->source, ip_hdr(skb)->daddr, 760 ntohs(th->source), dif, sdif); 761 /* don't send rst if it can't find key */ 762 if (!sk1) 763 goto out; 764 765 /* sdif set, means packet ingressed via a device 766 * in an L3 domain and dif is set to it. 767 */ 768 l3index = sdif ? 
		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
		if (!key)
			goto out;


		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
		if (genhash || memcmp(hash_location, newhash, 16) != 0)
			goto out;

	}

	if (key) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
				   (TCPOPT_NOP << 16) |
				   (TCPOPT_MD5SIG << 8) |
				   TCPOLEN_MD5SIG);
		/* Update length and the length the header thinks exists */
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	/* Can't co-exist with TCPMD5, hence check rep.opt[0] */
	if (rep.opt[0] == 0) {
		__be32 mrst = mptcp_reset_option(skb);

		if (mrst) {
			rep.opt[0] = mrst;
			arg.iov[0].iov_len += sizeof(mrst);
			rep.th.doff = arg.iov[0].iov_len / 4;
		}
	}

	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;

	/* When socket is gone, all binding information is lost.
	 * routing might fail in this case. No choice here, if we choose to force
	 * input interface, we will misroute in case of asymmetric route.
	 */
	if (sk) {
		arg.bound_dev_if = sk->sk_bound_dev_if;
		if (sk_fullsock(sk))
			trace_tcp_send_reset(sk, skb);
	}

	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));

	arg.tos = ip_hdr(skb)->tos;
	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
	local_bh_disable();
	ctl_sk = this_cpu_read(ipv4_tcp_sk);
	sock_net_set(ctl_sk, net);
	if (sk) {
		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
				   inet_twsk(sk)->tw_mark : sk->sk_mark;
		ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
				   inet_twsk(sk)->tw_priority : sk->sk_priority;
		transmit_time = tcp_transmit_time(sk);
		xfrm_sk_clone_policy(ctl_sk, sk);
		txhash = (sk->sk_state == TCP_TIME_WAIT) ?
			 inet_twsk(sk)->tw_txhash : sk->sk_txhash;
	} else {
		ctl_sk->sk_mark = 0;
		ctl_sk->sk_priority = 0;
	}
	ip_send_unicast_reply(ctl_sk,
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len,
			      transmit_time, txhash);

	xfrm_sk_free_policy(ctl_sk);
	sock_net_set(ctl_sk, &init_net);
	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
	local_bh_enable();

#ifdef CONFIG_TCP_MD5SIG
out:
	rcu_read_unlock();
#endif
}

/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
   outside socket context is ugly, certainly. What can I do?
 */
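
/* These replies are built on a per-cpu control socket (ipv4_tcp_sk) rather
 * than the flow's own socket, so mark, priority and the transmit time have
 * to be copied over explicitly before ip_send_unicast_reply() is called.
 */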
863 */ 864 865 static void tcp_v4_send_ack(const struct sock *sk, 866 struct sk_buff *skb, u32 seq, u32 ack, 867 u32 win, u32 tsval, u32 tsecr, int oif, 868 struct tcp_md5sig_key *key, 869 int reply_flags, u8 tos, u32 txhash) 870 { 871 const struct tcphdr *th = tcp_hdr(skb); 872 struct { 873 struct tcphdr th; 874 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2) 875 #ifdef CONFIG_TCP_MD5SIG 876 + (TCPOLEN_MD5SIG_ALIGNED >> 2) 877 #endif 878 ]; 879 } rep; 880 struct net *net = sock_net(sk); 881 struct ip_reply_arg arg; 882 struct sock *ctl_sk; 883 u64 transmit_time; 884 885 memset(&rep.th, 0, sizeof(struct tcphdr)); 886 memset(&arg, 0, sizeof(arg)); 887 888 arg.iov[0].iov_base = (unsigned char *)&rep; 889 arg.iov[0].iov_len = sizeof(rep.th); 890 if (tsecr) { 891 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | 892 (TCPOPT_TIMESTAMP << 8) | 893 TCPOLEN_TIMESTAMP); 894 rep.opt[1] = htonl(tsval); 895 rep.opt[2] = htonl(tsecr); 896 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED; 897 } 898 899 /* Swap the send and the receive. */ 900 rep.th.dest = th->source; 901 rep.th.source = th->dest; 902 rep.th.doff = arg.iov[0].iov_len / 4; 903 rep.th.seq = htonl(seq); 904 rep.th.ack_seq = htonl(ack); 905 rep.th.ack = 1; 906 rep.th.window = htons(win); 907 908 #ifdef CONFIG_TCP_MD5SIG 909 if (key) { 910 int offset = (tsecr) ? 3 : 0; 911 912 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) | 913 (TCPOPT_NOP << 16) | 914 (TCPOPT_MD5SIG << 8) | 915 TCPOLEN_MD5SIG); 916 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED; 917 rep.th.doff = arg.iov[0].iov_len/4; 918 919 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset], 920 key, ip_hdr(skb)->saddr, 921 ip_hdr(skb)->daddr, &rep.th); 922 } 923 #endif 924 arg.flags = reply_flags; 925 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr, 926 ip_hdr(skb)->saddr, /* XXX */ 927 arg.iov[0].iov_len, IPPROTO_TCP, 0); 928 arg.csumoffset = offsetof(struct tcphdr, check) / 2; 929 if (oif) 930 arg.bound_dev_if = oif; 931 arg.tos = tos; 932 arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL); 933 local_bh_disable(); 934 ctl_sk = this_cpu_read(ipv4_tcp_sk); 935 sock_net_set(ctl_sk, net); 936 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? 937 inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark); 938 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ? 939 inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority); 940 transmit_time = tcp_transmit_time(sk); 941 ip_send_unicast_reply(ctl_sk, 942 skb, &TCP_SKB_CB(skb)->header.h4.opt, 943 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, 944 &arg, arg.iov[0].iov_len, 945 transmit_time, txhash); 946 947 sock_net_set(ctl_sk, &init_net); 948 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS); 949 local_bh_enable(); 950 } 951 952 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) 953 { 954 struct inet_timewait_sock *tw = inet_twsk(sk); 955 struct tcp_timewait_sock *tcptw = tcp_twsk(sk); 956 957 tcp_v4_send_ack(sk, skb, 958 tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, 959 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, 960 tcp_time_stamp_raw() + tcptw->tw_ts_offset, 961 tcptw->tw_ts_recent, 962 tw->tw_bound_dev_if, 963 tcp_twsk_md5_key(tcptw), 964 tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0, 965 tw->tw_tos, 966 tw->tw_txhash 967 ); 968 969 inet_twsk_put(tw); 970 } 971 972 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, 973 struct request_sock *req) 974 { 975 const union tcp_md5_addr *addr; 976 int l3index; 977 978 /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV 979 * sk->sk_state == TCP_SYN_RECV -> for Fast Open. 
980 */ 981 u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 : 982 tcp_sk(sk)->snd_nxt; 983 984 /* RFC 7323 2.3 985 * The window field (SEG.WND) of every outgoing segment, with the 986 * exception of <SYN> segments, MUST be right-shifted by 987 * Rcv.Wind.Shift bits: 988 */ 989 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; 990 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0; 991 tcp_v4_send_ack(sk, skb, seq, 992 tcp_rsk(req)->rcv_nxt, 993 req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale, 994 tcp_time_stamp_raw() + tcp_rsk(req)->ts_off, 995 READ_ONCE(req->ts_recent), 996 0, 997 tcp_md5_do_lookup(sk, l3index, addr, AF_INET), 998 inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0, 999 ip_hdr(skb)->tos, 1000 READ_ONCE(tcp_rsk(req)->txhash)); 1001 } 1002 1003 /* 1004 * Send a SYN-ACK after having received a SYN. 1005 * This still operates on a request_sock only, not on a big 1006 * socket. 1007 */ 1008 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, 1009 struct flowi *fl, 1010 struct request_sock *req, 1011 struct tcp_fastopen_cookie *foc, 1012 enum tcp_synack_type synack_type, 1013 struct sk_buff *syn_skb) 1014 { 1015 const struct inet_request_sock *ireq = inet_rsk(req); 1016 struct flowi4 fl4; 1017 int err = -1; 1018 struct sk_buff *skb; 1019 u8 tos; 1020 1021 /* First, grab a route. */ 1022 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL) 1023 return -1; 1024 1025 skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb); 1026 1027 if (skb) { 1028 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr); 1029 1030 tos = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) ? 1031 (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) | 1032 (inet_sk(sk)->tos & INET_ECN_MASK) : 1033 inet_sk(sk)->tos; 1034 1035 if (!INET_ECN_is_capable(tos) && 1036 tcp_bpf_ca_needs_ecn((struct sock *)req)) 1037 tos |= INET_ECN_ECT_0; 1038 1039 rcu_read_lock(); 1040 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr, 1041 ireq->ir_rmt_addr, 1042 rcu_dereference(ireq->ireq_opt), 1043 tos); 1044 rcu_read_unlock(); 1045 err = net_xmit_eval(err); 1046 } 1047 1048 return err; 1049 } 1050 1051 /* 1052 * IPv4 request_sock destructor. 1053 */ 1054 static void tcp_v4_reqsk_destructor(struct request_sock *req) 1055 { 1056 kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1)); 1057 } 1058 1059 #ifdef CONFIG_TCP_MD5SIG 1060 /* 1061 * RFC2385 MD5 checksumming requires a mapping of 1062 * IP address->MD5 Key. 1063 * We need to maintain these in the sk structure. 1064 */ 1065 1066 DEFINE_STATIC_KEY_DEFERRED_FALSE(tcp_md5_needed, HZ); 1067 EXPORT_SYMBOL(tcp_md5_needed); 1068 1069 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new) 1070 { 1071 if (!old) 1072 return true; 1073 1074 /* l3index always overrides non-l3index */ 1075 if (old->l3index && new->l3index == 0) 1076 return false; 1077 if (old->l3index == 0 && new->l3index) 1078 return true; 1079 1080 return old->prefixlen < new->prefixlen; 1081 } 1082 1083 /* Find the Key structure for an address. 
/* Find the Key structure for an address.  */
struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
					   const union tcp_md5_addr *addr,
					   int family)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	const struct tcp_md5sig_info *md5sig;
	__be32 mask;
	struct tcp_md5sig_key *best_match = NULL;
	bool match;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       lockdep_sock_is_held(sk));
	if (!md5sig)
		return NULL;

	hlist_for_each_entry_rcu(key, &md5sig->head, node,
				 lockdep_sock_is_held(sk)) {
		if (key->family != family)
			continue;
		if (key->flags & TCP_MD5SIG_FLAG_IFINDEX && key->l3index != l3index)
			continue;
		if (family == AF_INET) {
			mask = inet_make_mask(key->prefixlen);
			match = (key->addr.a4.s_addr & mask) ==
				(addr->a4.s_addr & mask);
#if IS_ENABLED(CONFIG_IPV6)
		} else if (family == AF_INET6) {
			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
						  key->prefixlen);
#endif
		} else {
			match = false;
		}

		if (match && better_md5_match(best_match, key))
			best_match = key;
	}
	return best_match;
}
EXPORT_SYMBOL(__tcp_md5_do_lookup);

static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
						      const union tcp_md5_addr *addr,
						      int family, u8 prefixlen,
						      int l3index, u8 flags)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	unsigned int size = sizeof(struct in_addr);
	const struct tcp_md5sig_info *md5sig;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       lockdep_sock_is_held(sk));
	if (!md5sig)
		return NULL;
#if IS_ENABLED(CONFIG_IPV6)
	if (family == AF_INET6)
		size = sizeof(struct in6_addr);
#endif
	hlist_for_each_entry_rcu(key, &md5sig->head, node,
				 lockdep_sock_is_held(sk)) {
		if (key->family != family)
			continue;
		if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
			continue;
		if (key->l3index != l3index)
			continue;
		if (!memcmp(&key->addr, addr, size) &&
		    key->prefixlen == prefixlen)
			return key;
	}
	return NULL;
}

struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
					 const struct sock *addr_sk)
{
	const union tcp_md5_addr *addr;
	int l3index;

	l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
						 addr_sk->sk_bound_dev_if);
	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
	return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
}
EXPORT_SYMBOL(tcp_v4_md5_lookup);

static int tcp_md5sig_info_add(struct sock *sk, gfp_t gfp)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_info *md5sig;

	md5sig = kmalloc(sizeof(*md5sig), gfp);
	if (!md5sig)
		return -ENOMEM;

	sk_gso_disable(sk);
	INIT_HLIST_HEAD(&md5sig->head);
	rcu_assign_pointer(tp->md5sig_info, md5sig);
	return 0;
}

/* This can be called on a newly created socket, from other files */
static int __tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
			    int family, u8 prefixlen, int l3index, u8 flags,
			    const u8 *newkey, u8 newkeylen, gfp_t gfp)
{
	/* Add Key to the list */
	struct tcp_md5sig_key *key;
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_info *md5sig;

	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
	if (key) {
		/* Pre-existing entry - just update that one.
		 * Note that the key might be used concurrently.
		 * data_race() is telling kcsan that we do not care of
		 * key mismatches, since changing MD5 key on live flows
		 * can lead to packet drops.
		 */
		data_race(memcpy(key->key, newkey, newkeylen));

		/* Pairs with READ_ONCE() in tcp_md5_hash_key().
		 * Also note that a reader could catch new key->keylen value
		 * but old key->key[], this is the reason we use __GFP_ZERO
		 * at sock_kmalloc() time below these lines.
		 */
		WRITE_ONCE(key->keylen, newkeylen);

		return 0;
	}

	md5sig = rcu_dereference_protected(tp->md5sig_info,
					   lockdep_sock_is_held(sk));

	key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
	if (!key)
		return -ENOMEM;
	if (!tcp_alloc_md5sig_pool()) {
		sock_kfree_s(sk, key, sizeof(*key));
		return -ENOMEM;
	}

	memcpy(key->key, newkey, newkeylen);
	key->keylen = newkeylen;
	key->family = family;
	key->prefixlen = prefixlen;
	key->l3index = l3index;
	key->flags = flags;
	memcpy(&key->addr, addr,
	       (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) :
								 sizeof(struct in_addr));
	hlist_add_head_rcu(&key->node, &md5sig->head);
	return 0;
}

int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
		   int family, u8 prefixlen, int l3index, u8 flags,
		   const u8 *newkey, u8 newkeylen)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
		if (tcp_md5sig_info_add(sk, GFP_KERNEL))
			return -ENOMEM;

		if (!static_branch_inc(&tcp_md5_needed.key)) {
			struct tcp_md5sig_info *md5sig;

			md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
			rcu_assign_pointer(tp->md5sig_info, NULL);
			kfree_rcu(md5sig, rcu);
			return -EUSERS;
		}
	}

	return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, flags,
				newkey, newkeylen, GFP_KERNEL);
}
EXPORT_SYMBOL(tcp_md5_do_add);

int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr,
		     int family, u8 prefixlen, int l3index,
		     struct tcp_md5sig_key *key)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
		if (tcp_md5sig_info_add(sk, sk_gfp_mask(sk, GFP_ATOMIC)))
			return -ENOMEM;

		if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) {
			struct tcp_md5sig_info *md5sig;

			md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
			net_warn_ratelimited("Too many TCP-MD5 keys in the system\n");
			rcu_assign_pointer(tp->md5sig_info, NULL);
			kfree_rcu(md5sig, rcu);
			return -EUSERS;
		}
	}

	return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index,
				key->flags, key->key, key->keylen,
				sk_gfp_mask(sk, GFP_ATOMIC));
}
EXPORT_SYMBOL(tcp_md5_key_copy);

int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
		   u8 prefixlen, int l3index, u8 flags)
{
	struct tcp_md5sig_key *key;

	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
	if (!key)
		return -ENOENT;
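	/* Unlink under the socket lock; the key itself is freed only after a
	 * grace period so concurrent readers in the receive path stay safe,
	 * while the socket's optmem accounting is released immediately.
	 */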
	hlist_del_rcu(&key->node);
	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
	kfree_rcu(key, rcu);
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_del);

static void tcp_clear_md5_list(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	struct hlist_node *n;
	struct tcp_md5sig_info *md5sig;

	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);

	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
		hlist_del_rcu(&key->node);
		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
		kfree_rcu(key, rcu);
	}
}

static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
				 sockptr_t optval, int optlen)
{
	struct tcp_md5sig cmd;
	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
	const union tcp_md5_addr *addr;
	u8 prefixlen = 32;
	int l3index = 0;
	u8 flags;

	if (optlen < sizeof(cmd))
		return -EINVAL;

	if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
		return -EFAULT;

	if (sin->sin_family != AF_INET)
		return -EINVAL;

	flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;

	if (optname == TCP_MD5SIG_EXT &&
	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
		prefixlen = cmd.tcpm_prefixlen;
		if (prefixlen > 32)
			return -EINVAL;
	}

	if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
		struct net_device *dev;

		rcu_read_lock();
		dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
		if (dev && netif_is_l3_master(dev))
			l3index = dev->ifindex;

		rcu_read_unlock();

		/* ok to reference set/not set outside of rcu;
		 * right now device MUST be an L3 master
		 */
		if (!dev || !l3index)
			return -EINVAL;
	}

	addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;

	if (!cmd.tcpm_keylen)
		return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);

	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
		return -EINVAL;

	return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
			      cmd.tcpm_key, cmd.tcpm_keylen);
}

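/* Illustrative userspace sketch (not kernel code, hypothetical peer_ip and
 * secret values): a peer-scoped key is typically installed with something
 * along the lines of
 *
 *	struct tcp_md5sig md5 = {};
 *	struct sockaddr_in *sin = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	sin->sin_family = AF_INET;
 *	sin->sin_addr.s_addr = peer_ip;
 *	memcpy(md5.tcpm_key, secret, secret_len);
 *	md5.tcpm_keylen = secret_len;
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 *
 * which lands in tcp_v4_parse_md5_keys() above; TCP_MD5SIG_EXT additionally
 * honours tcpm_prefixlen and tcpm_ifindex as parsed there.
 */
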
static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
				   __be32 daddr, __be32 saddr,
				   const struct tcphdr *th, int nbytes)
{
	struct tcp4_pseudohdr *bp;
	struct scatterlist sg;
	struct tcphdr *_th;

	bp = hp->scratch;
	bp->saddr = saddr;
	bp->daddr = daddr;
	bp->pad = 0;
	bp->protocol = IPPROTO_TCP;
	bp->len = cpu_to_be16(nbytes);

	_th = (struct tcphdr *)(bp + 1);
	memcpy(_th, th, sizeof(*th));
	_th->check = 0;

	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
	ahash_request_set_crypt(hp->md5_req, &sg, NULL,
				sizeof(*bp) + sizeof(*th));
	return crypto_ahash_update(hp->md5_req);
}

static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
{
	struct tcp_md5sig_pool *hp;
	struct ahash_request *req;

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	req = hp->md5_req;

	if (crypto_ahash_init(req))
		goto clear_hash;
	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	ahash_request_set_crypt(req, NULL, md5_hash, 0);
	if (crypto_ahash_final(req))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}

int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
			const struct sock *sk,
			const struct sk_buff *skb)
{
	struct tcp_md5sig_pool *hp;
	struct ahash_request *req;
	const struct tcphdr *th = tcp_hdr(skb);
	__be32 saddr, daddr;

	if (sk) { /* valid for establish/request sockets */
		saddr = sk->sk_rcv_saddr;
		daddr = sk->sk_daddr;
	} else {
		const struct iphdr *iph = ip_hdr(skb);
		saddr = iph->saddr;
		daddr = iph->daddr;
	}

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	req = hp->md5_req;

	if (crypto_ahash_init(req))
		goto clear_hash;

	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
		goto clear_hash;
	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	ahash_request_set_crypt(req, NULL, md5_hash, 0);
	if (crypto_ahash_final(req))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}
EXPORT_SYMBOL(tcp_v4_md5_hash_skb);

#endif

static void tcp_v4_init_req(struct request_sock *req,
			    const struct sock *sk_listener,
			    struct sk_buff *skb)
{
	struct inet_request_sock *ireq = inet_rsk(req);
	struct net *net = sock_net(sk_listener);

	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
}

static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
					  struct sk_buff *skb,
					  struct flowi *fl,
					  struct request_sock *req)
{
	tcp_v4_init_req(req, sk, skb);

	if (security_inet_conn_request(sk, skb, req))
		return NULL;

	return inet_csk_route_req(sk, &fl->u.ip4, req);
}

struct request_sock_ops tcp_request_sock_ops __read_mostly = {
	.family		=	PF_INET,
	.obj_size	=	sizeof(struct tcp_request_sock),
	.rtx_syn_ack	=	tcp_rtx_synack,
	.send_ack	=	tcp_v4_reqsk_send_ack,
	.destructor	=	tcp_v4_reqsk_destructor,
	.send_reset	=	tcp_v4_send_reset,
	.syn_ack_timeout =	tcp_syn_ack_timeout,
};

const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
	.mss_clamp	=	TCP_MSS_DEFAULT,
#ifdef CONFIG_TCP_MD5SIG
	.req_md5_lookup	=	tcp_v4_md5_lookup,
	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
#endif
#ifdef CONFIG_SYN_COOKIES
	.cookie_init_seq =	cookie_v4_init_sequence,
#endif
	.route_req	=	tcp_v4_route_req,
	.init_seq	=	tcp_v4_init_seq,
	.init_ts_off	=	tcp_v4_init_ts_off,
	.send_synack	=	tcp_v4_send_synack,
};

int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	/* Never answer to SYNs sent to broadcast or multicast */
	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	return tcp_conn_request(&tcp_request_sock_ops,
				&tcp_request_sock_ipv4_ops, sk, skb);

drop:
	tcp_listendrop(sk);
	return 0;
}
EXPORT_SYMBOL(tcp_v4_conn_request);


/*
 * The three way handshake has completed - we got a valid synack -
 * now create the new socket.
 */
struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct dst_entry *dst,
				  struct request_sock *req_unhash,
				  bool *own_req)
{
	struct inet_request_sock *ireq;
	bool found_dup_sk = false;
	struct inet_sock *newinet;
	struct tcp_sock *newtp;
	struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
	const union tcp_md5_addr *addr;
	struct tcp_md5sig_key *key;
	int l3index;
#endif
	struct ip_options_rcu *inet_opt;

	if (sk_acceptq_is_full(sk))
		goto exit_overflow;

	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit_nonewsk;

	newsk->sk_gso_type = SKB_GSO_TCPV4;
	inet_sk_rx_dst_set(newsk, skb);

	newtp		      = tcp_sk(newsk);
	newinet		      = inet_sk(newsk);
	ireq		      = inet_rsk(req);
	sk_daddr_set(newsk, ireq->ir_rmt_addr);
	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
	newsk->sk_bound_dev_if = ireq->ir_iif;
	newinet->inet_saddr   = ireq->ir_loc_addr;
	inet_opt	      = rcu_dereference(ireq->ireq_opt);
	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
	newinet->mc_index     = inet_iif(skb);
	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
	newinet->rcv_tos      = ip_hdr(skb)->tos;
	inet_csk(newsk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
	atomic_set(&newinet->inet_id, get_random_u16());

	/* Set ToS of the new socket based upon the value of incoming SYN.
	 * ECT bits are set later in tcp_init_transfer().
	 */
	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
		newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;

	if (!dst) {
		dst = inet_csk_route_child_sock(sk, newsk, req);
		if (!dst)
			goto put_and_exit;
	} else {
		/* syncookie case : see end of cookie_v4_check() */
	}
	sk_setup_caps(newsk, dst);

	tcp_ca_openreq_child(newsk, dst);

	tcp_sync_mss(newsk, dst_mtu(dst));
	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));

	tcp_initialize_rcv_mss(newsk);

#ifdef CONFIG_TCP_MD5SIG
	l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
	/* Copy over the MD5 key from the original socket */
	addr = (union tcp_md5_addr *)&newinet->inet_daddr;
	key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
	if (key) {
		if (tcp_md5_key_copy(newsk, addr, AF_INET, 32, l3index, key))
			goto put_and_exit;
		sk_gso_disable(newsk);
	}
#endif

	if (__inet_inherit_port(sk, newsk) < 0)
		goto put_and_exit;
	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
				       &found_dup_sk);
	if (likely(*own_req)) {
		tcp_move_syn(newtp, req);
		ireq->ireq_opt = NULL;
	} else {
		newinet->inet_opt = NULL;

		if (!req_unhash && found_dup_sk) {
			/* This code path should only be executed in the
			 * syncookie case only
			 */
			bh_unlock_sock(newsk);
			sock_put(newsk);
			newsk = NULL;
		}
	}
	return newsk;

exit_overflow:
	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
exit_nonewsk:
	dst_release(dst);
exit:
	tcp_listendrop(sk);
	return NULL;
put_and_exit:
	newinet->inet_opt = NULL;
	inet_csk_prepare_forced_close(newsk);
	tcp_done(newsk);
	goto exit;
}
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
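
/* With syncookies an overflowed listener answers SYNs statelessly, so
 * tcp_v4_cookie_check() only looks at bare ACKs (!th->syn) and lets
 * cookie_v4_check() validate the cookie and reconstruct the request.
 */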

static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
{
#ifdef CONFIG_SYN_COOKIES
	const struct tcphdr *th = tcp_hdr(skb);

	if (!th->syn)
		sk = cookie_v4_check(sk, skb);
#endif
	return sk;
}

u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
			 struct tcphdr *th, u32 *cookie)
{
	u16 mss = 0;
#ifdef CONFIG_SYN_COOKIES
	mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
				    &tcp_request_sock_ipv4_ops, sk, th);
	if (mss) {
		*cookie = __cookie_v4_init_sequence(iph, th, &mss);
		tcp_synq_overflow(sk);
	}
#endif
	return mss;
}

INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
							   u32));
/* The socket must have its spinlock held when we get
 * here, unless it is a TCP_LISTEN socket.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	enum skb_drop_reason reason;
	struct sock *rsk;

	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
		struct dst_entry *dst;

		dst = rcu_dereference_protected(sk->sk_rx_dst,
						lockdep_sock_is_held(sk));

		sock_rps_save_rxhash(sk, skb);
		sk_mark_napi_id(sk, skb);
		if (dst) {
			if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
			    !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
					     dst, 0)) {
				RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
				dst_release(dst);
			}
		}
		tcp_rcv_established(sk, skb);
		return 0;
	}

	reason = SKB_DROP_REASON_NOT_SPECIFIED;
	if (tcp_checksum_complete(skb))
		goto csum_err;

	if (sk->sk_state == TCP_LISTEN) {
		struct sock *nsk = tcp_v4_cookie_check(sk, skb);

		if (!nsk)
			goto discard;
		if (nsk != sk) {
			if (tcp_child_process(sk, nsk, skb)) {
				rsk = nsk;
				goto reset;
			}
			return 0;
		}
	} else
		sock_rps_save_rxhash(sk, skb);

	if (tcp_rcv_state_process(sk, skb)) {
		rsk = sk;
		goto reset;
	}
	return 0;

reset:
	tcp_v4_send_reset(rsk, skb);
discard:
	kfree_skb_reason(skb, reason);
	/* Be careful here. If this function gets more complicated and
	 * gcc suffers from register pressure on the x86, sk (in %ebx)
	 * might be destroyed here. This current version compiles correctly,
	 * but you have been warned.
	 */
	return 0;

csum_err:
	reason = SKB_DROP_REASON_TCP_CSUM;
	trace_tcp_bad_csum(skb);
	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
	goto discard;
}
EXPORT_SYMBOL(tcp_v4_do_rcv);
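
/* Early demux runs from the IP receive path, before routing: if an
 * established socket matches the 4-tuple we cache it (and, when still valid,
 * its input dst) on the skb so the full socket and route lookups can be
 * skipped later in tcp_v4_rcv().
 */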
1768 */ 1769 return 0; 1770 1771 csum_err: 1772 reason = SKB_DROP_REASON_TCP_CSUM; 1773 trace_tcp_bad_csum(skb); 1774 TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); 1775 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); 1776 goto discard; 1777 } 1778 EXPORT_SYMBOL(tcp_v4_do_rcv); 1779 1780 int tcp_v4_early_demux(struct sk_buff *skb) 1781 { 1782 struct net *net = dev_net(skb->dev); 1783 const struct iphdr *iph; 1784 const struct tcphdr *th; 1785 struct sock *sk; 1786 1787 if (skb->pkt_type != PACKET_HOST) 1788 return 0; 1789 1790 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr))) 1791 return 0; 1792 1793 iph = ip_hdr(skb); 1794 th = tcp_hdr(skb); 1795 1796 if (th->doff < sizeof(struct tcphdr) / 4) 1797 return 0; 1798 1799 sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, 1800 iph->saddr, th->source, 1801 iph->daddr, ntohs(th->dest), 1802 skb->skb_iif, inet_sdif(skb)); 1803 if (sk) { 1804 skb->sk = sk; 1805 skb->destructor = sock_edemux; 1806 if (sk_fullsock(sk)) { 1807 struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst); 1808 1809 if (dst) 1810 dst = dst_check(dst, 0); 1811 if (dst && 1812 sk->sk_rx_dst_ifindex == skb->skb_iif) 1813 skb_dst_set_noref(skb, dst); 1814 } 1815 } 1816 return 0; 1817 } 1818 1819 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb, 1820 enum skb_drop_reason *reason) 1821 { 1822 u32 tail_gso_size, tail_gso_segs; 1823 struct skb_shared_info *shinfo; 1824 const struct tcphdr *th; 1825 struct tcphdr *thtail; 1826 struct sk_buff *tail; 1827 unsigned int hdrlen; 1828 bool fragstolen; 1829 u32 gso_segs; 1830 u32 gso_size; 1831 u64 limit; 1832 int delta; 1833 1834 /* In case all data was pulled from skb frags (in __pskb_pull_tail()), 1835 * we can fix skb->truesize to its real value to avoid future drops. 1836 * This is valid because skb is not yet charged to the socket. 1837 * It has been noticed pure SACK packets were sometimes dropped 1838 * (if cooked by drivers without copybreak feature). 1839 */ 1840 skb_condense(skb); 1841 1842 skb_dst_drop(skb); 1843 1844 if (unlikely(tcp_checksum_complete(skb))) { 1845 bh_unlock_sock(sk); 1846 trace_tcp_bad_csum(skb); 1847 *reason = SKB_DROP_REASON_TCP_CSUM; 1848 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); 1849 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); 1850 return true; 1851 } 1852 1853 /* Attempt coalescing to last skb in backlog, even if we are 1854 * above the limits. 1855 * This is okay because skb capacity is limited to MAX_SKB_FRAGS. 
1856 */ 1857 th = (const struct tcphdr *)skb->data; 1858 hdrlen = th->doff * 4; 1859 1860 tail = sk->sk_backlog.tail; 1861 if (!tail) 1862 goto no_coalesce; 1863 thtail = (struct tcphdr *)tail->data; 1864 1865 if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq || 1866 TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield || 1867 ((TCP_SKB_CB(tail)->tcp_flags | 1868 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) || 1869 !((TCP_SKB_CB(tail)->tcp_flags & 1870 TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) || 1871 ((TCP_SKB_CB(tail)->tcp_flags ^ 1872 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) || 1873 #ifdef CONFIG_TLS_DEVICE 1874 tail->decrypted != skb->decrypted || 1875 #endif 1876 !mptcp_skb_can_collapse(tail, skb) || 1877 thtail->doff != th->doff || 1878 memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th))) 1879 goto no_coalesce; 1880 1881 __skb_pull(skb, hdrlen); 1882 1883 shinfo = skb_shinfo(skb); 1884 gso_size = shinfo->gso_size ?: skb->len; 1885 gso_segs = shinfo->gso_segs ?: 1; 1886 1887 shinfo = skb_shinfo(tail); 1888 tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen); 1889 tail_gso_segs = shinfo->gso_segs ?: 1; 1890 1891 if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) { 1892 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq; 1893 1894 if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) { 1895 TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq; 1896 thtail->window = th->window; 1897 } 1898 1899 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and 1900 * thtail->fin, so that the fast path in tcp_rcv_established() 1901 * is not entered if we append a packet with a FIN. 1902 * SYN, RST, URG are not present. 1903 * ACK is set on both packets. 1904 * PSH : we do not really care in TCP stack, 1905 * at least for 'GRO' packets. 1906 */ 1907 thtail->fin |= th->fin; 1908 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags; 1909 1910 if (TCP_SKB_CB(skb)->has_rxtstamp) { 1911 TCP_SKB_CB(tail)->has_rxtstamp = true; 1912 tail->tstamp = skb->tstamp; 1913 skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp; 1914 } 1915 1916 /* Not as strict as GRO. We only need to carry mss max value */ 1917 shinfo->gso_size = max(gso_size, tail_gso_size); 1918 shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF); 1919 1920 sk->sk_backlog.len += delta; 1921 __NET_INC_STATS(sock_net(sk), 1922 LINUX_MIB_TCPBACKLOGCOALESCE); 1923 kfree_skb_partial(skb, fragstolen); 1924 return false; 1925 } 1926 __skb_push(skb, hdrlen); 1927 1928 no_coalesce: 1929 /* sk->sk_backlog.len is reset only at the end of __release_sock(). 1930 * Both sk->sk_backlog.len and sk->sk_rmem_alloc could reach 1931 * sk_rcvbuf in normal conditions. 1932 */ 1933 limit = ((u64)READ_ONCE(sk->sk_rcvbuf)) << 1; 1934 1935 limit += ((u32)READ_ONCE(sk->sk_sndbuf)) >> 1; 1936 1937 /* Only socket owner can try to collapse/prune rx queues 1938 * to reduce memory overhead, so add a little headroom here. 1939 * Few sockets backlog are possibly concurrently non empty. 
1940 */ 1941 limit += 64 * 1024; 1942 1943 limit = min_t(u64, limit, UINT_MAX); 1944 1945 if (unlikely(sk_add_backlog(sk, skb, limit))) { 1946 bh_unlock_sock(sk); 1947 *reason = SKB_DROP_REASON_SOCKET_BACKLOG; 1948 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP); 1949 return true; 1950 } 1951 return false; 1952 } 1953 EXPORT_SYMBOL(tcp_add_backlog); 1954 1955 int tcp_filter(struct sock *sk, struct sk_buff *skb) 1956 { 1957 struct tcphdr *th = (struct tcphdr *)skb->data; 1958 1959 return sk_filter_trim_cap(sk, skb, th->doff * 4); 1960 } 1961 EXPORT_SYMBOL(tcp_filter); 1962 1963 static void tcp_v4_restore_cb(struct sk_buff *skb) 1964 { 1965 memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4, 1966 sizeof(struct inet_skb_parm)); 1967 } 1968 1969 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph, 1970 const struct tcphdr *th) 1971 { 1972 /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB() 1973 * barrier() makes sure compiler wont play fool^Waliasing games. 1974 */ 1975 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb), 1976 sizeof(struct inet_skb_parm)); 1977 barrier(); 1978 1979 TCP_SKB_CB(skb)->seq = ntohl(th->seq); 1980 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + 1981 skb->len - th->doff * 4); 1982 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); 1983 TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th); 1984 TCP_SKB_CB(skb)->tcp_tw_isn = 0; 1985 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph); 1986 TCP_SKB_CB(skb)->sacked = 0; 1987 TCP_SKB_CB(skb)->has_rxtstamp = 1988 skb->tstamp || skb_hwtstamps(skb)->hwtstamp; 1989 } 1990 1991 /* 1992 * From tcp_input.c 1993 */ 1994 1995 int tcp_v4_rcv(struct sk_buff *skb) 1996 { 1997 struct net *net = dev_net(skb->dev); 1998 enum skb_drop_reason drop_reason; 1999 int sdif = inet_sdif(skb); 2000 int dif = inet_iif(skb); 2001 const struct iphdr *iph; 2002 const struct tcphdr *th; 2003 bool refcounted; 2004 struct sock *sk; 2005 int ret; 2006 2007 drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; 2008 if (skb->pkt_type != PACKET_HOST) 2009 goto discard_it; 2010 2011 /* Count it even if it's bad */ 2012 __TCP_INC_STATS(net, TCP_MIB_INSEGS); 2013 2014 if (!pskb_may_pull(skb, sizeof(struct tcphdr))) 2015 goto discard_it; 2016 2017 th = (const struct tcphdr *)skb->data; 2018 2019 if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) { 2020 drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL; 2021 goto bad_packet; 2022 } 2023 if (!pskb_may_pull(skb, th->doff * 4)) 2024 goto discard_it; 2025 2026 /* An explanation is required here, I think. 2027 * Packet length and doff are validated by header prediction, 2028 * provided case of th->doff==0 is eliminated. 2029 * So, we defer the checks. 
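	 * (At this point only th->doff * 4 header bytes are guaranteed to be
	 *  present in the linear area; the remaining consistency checks are
	 *  deferred until a socket has been looked up.)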
*/ 2030 2031 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo)) 2032 goto csum_error; 2033 2034 th = (const struct tcphdr *)skb->data; 2035 iph = ip_hdr(skb); 2036 lookup: 2037 sk = __inet_lookup_skb(net->ipv4.tcp_death_row.hashinfo, 2038 skb, __tcp_hdrlen(th), th->source, 2039 th->dest, sdif, &refcounted); 2040 if (!sk) 2041 goto no_tcp_socket; 2042 2043 process: 2044 if (sk->sk_state == TCP_TIME_WAIT) 2045 goto do_time_wait; 2046 2047 if (sk->sk_state == TCP_NEW_SYN_RECV) { 2048 struct request_sock *req = inet_reqsk(sk); 2049 bool req_stolen = false; 2050 struct sock *nsk; 2051 2052 sk = req->rsk_listener; 2053 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) 2054 drop_reason = SKB_DROP_REASON_XFRM_POLICY; 2055 else 2056 drop_reason = tcp_inbound_md5_hash(sk, skb, 2057 &iph->saddr, &iph->daddr, 2058 AF_INET, dif, sdif); 2059 if (unlikely(drop_reason)) { 2060 sk_drops_add(sk, skb); 2061 reqsk_put(req); 2062 goto discard_it; 2063 } 2064 if (tcp_checksum_complete(skb)) { 2065 reqsk_put(req); 2066 goto csum_error; 2067 } 2068 if (unlikely(sk->sk_state != TCP_LISTEN)) { 2069 nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb); 2070 if (!nsk) { 2071 inet_csk_reqsk_queue_drop_and_put(sk, req); 2072 goto lookup; 2073 } 2074 sk = nsk; 2075 /* reuseport_migrate_sock() has already held one sk_refcnt 2076 * before returning. 2077 */ 2078 } else { 2079 /* We own a reference on the listener, increase it again 2080 * as we might lose it too soon. 2081 */ 2082 sock_hold(sk); 2083 } 2084 refcounted = true; 2085 nsk = NULL; 2086 if (!tcp_filter(sk, skb)) { 2087 th = (const struct tcphdr *)skb->data; 2088 iph = ip_hdr(skb); 2089 tcp_v4_fill_cb(skb, iph, th); 2090 nsk = tcp_check_req(sk, skb, req, false, &req_stolen); 2091 } else { 2092 drop_reason = SKB_DROP_REASON_SOCKET_FILTER; 2093 } 2094 if (!nsk) { 2095 reqsk_put(req); 2096 if (req_stolen) { 2097 /* Another cpu got exclusive access to req 2098 * and created a full blown socket. 2099 * Try to feed this packet to this socket 2100 * instead of discarding it. 
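	 * tcp_v4_restore_cb() undoes tcp_v4_fill_cb() so that the fresh
	 * lookup below starts from an unmodified IP control block.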
2101 */ 2102 tcp_v4_restore_cb(skb); 2103 sock_put(sk); 2104 goto lookup; 2105 } 2106 goto discard_and_relse; 2107 } 2108 nf_reset_ct(skb); 2109 if (nsk == sk) { 2110 reqsk_put(req); 2111 tcp_v4_restore_cb(skb); 2112 } else if (tcp_child_process(sk, nsk, skb)) { 2113 tcp_v4_send_reset(nsk, skb); 2114 goto discard_and_relse; 2115 } else { 2116 sock_put(sk); 2117 return 0; 2118 } 2119 } 2120 2121 if (static_branch_unlikely(&ip4_min_ttl)) { 2122 /* min_ttl can be changed concurrently from do_ip_setsockopt() */ 2123 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) { 2124 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); 2125 drop_reason = SKB_DROP_REASON_TCP_MINTTL; 2126 goto discard_and_relse; 2127 } 2128 } 2129 2130 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) { 2131 drop_reason = SKB_DROP_REASON_XFRM_POLICY; 2132 goto discard_and_relse; 2133 } 2134 2135 drop_reason = tcp_inbound_md5_hash(sk, skb, &iph->saddr, 2136 &iph->daddr, AF_INET, dif, sdif); 2137 if (drop_reason) 2138 goto discard_and_relse; 2139 2140 nf_reset_ct(skb); 2141 2142 if (tcp_filter(sk, skb)) { 2143 drop_reason = SKB_DROP_REASON_SOCKET_FILTER; 2144 goto discard_and_relse; 2145 } 2146 th = (const struct tcphdr *)skb->data; 2147 iph = ip_hdr(skb); 2148 tcp_v4_fill_cb(skb, iph, th); 2149 2150 skb->dev = NULL; 2151 2152 if (sk->sk_state == TCP_LISTEN) { 2153 ret = tcp_v4_do_rcv(sk, skb); 2154 goto put_and_return; 2155 } 2156 2157 sk_incoming_cpu_update(sk); 2158 2159 bh_lock_sock_nested(sk); 2160 tcp_segs_in(tcp_sk(sk), skb); 2161 ret = 0; 2162 if (!sock_owned_by_user(sk)) { 2163 ret = tcp_v4_do_rcv(sk, skb); 2164 } else { 2165 if (tcp_add_backlog(sk, skb, &drop_reason)) 2166 goto discard_and_relse; 2167 } 2168 bh_unlock_sock(sk); 2169 2170 put_and_return: 2171 if (refcounted) 2172 sock_put(sk); 2173 2174 return ret; 2175 2176 no_tcp_socket: 2177 drop_reason = SKB_DROP_REASON_NO_SOCKET; 2178 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) 2179 goto discard_it; 2180 2181 tcp_v4_fill_cb(skb, iph, th); 2182 2183 if (tcp_checksum_complete(skb)) { 2184 csum_error: 2185 drop_reason = SKB_DROP_REASON_TCP_CSUM; 2186 trace_tcp_bad_csum(skb); 2187 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS); 2188 bad_packet: 2189 __TCP_INC_STATS(net, TCP_MIB_INERRS); 2190 } else { 2191 tcp_v4_send_reset(NULL, skb); 2192 } 2193 2194 discard_it: 2195 SKB_DR_OR(drop_reason, NOT_SPECIFIED); 2196 /* Discard frame. 
*/ 2197 kfree_skb_reason(skb, drop_reason); 2198 return 0; 2199 2200 discard_and_relse: 2201 sk_drops_add(sk, skb); 2202 if (refcounted) 2203 sock_put(sk); 2204 goto discard_it; 2205 2206 do_time_wait: 2207 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { 2208 drop_reason = SKB_DROP_REASON_XFRM_POLICY; 2209 inet_twsk_put(inet_twsk(sk)); 2210 goto discard_it; 2211 } 2212 2213 tcp_v4_fill_cb(skb, iph, th); 2214 2215 if (tcp_checksum_complete(skb)) { 2216 inet_twsk_put(inet_twsk(sk)); 2217 goto csum_error; 2218 } 2219 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) { 2220 case TCP_TW_SYN: { 2221 struct sock *sk2 = inet_lookup_listener(net, 2222 net->ipv4.tcp_death_row.hashinfo, 2223 skb, __tcp_hdrlen(th), 2224 iph->saddr, th->source, 2225 iph->daddr, th->dest, 2226 inet_iif(skb), 2227 sdif); 2228 if (sk2) { 2229 inet_twsk_deschedule_put(inet_twsk(sk)); 2230 sk = sk2; 2231 tcp_v4_restore_cb(skb); 2232 refcounted = false; 2233 goto process; 2234 } 2235 } 2236 /* to ACK */ 2237 fallthrough; 2238 case TCP_TW_ACK: 2239 tcp_v4_timewait_ack(sk, skb); 2240 break; 2241 case TCP_TW_RST: 2242 tcp_v4_send_reset(sk, skb); 2243 inet_twsk_deschedule_put(inet_twsk(sk)); 2244 goto discard_it; 2245 case TCP_TW_SUCCESS:; 2246 } 2247 goto discard_it; 2248 } 2249 2250 static struct timewait_sock_ops tcp_timewait_sock_ops = { 2251 .twsk_obj_size = sizeof(struct tcp_timewait_sock), 2252 .twsk_unique = tcp_twsk_unique, 2253 .twsk_destructor= tcp_twsk_destructor, 2254 }; 2255 2256 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) 2257 { 2258 struct dst_entry *dst = skb_dst(skb); 2259 2260 if (dst && dst_hold_safe(dst)) { 2261 rcu_assign_pointer(sk->sk_rx_dst, dst); 2262 sk->sk_rx_dst_ifindex = skb->skb_iif; 2263 } 2264 } 2265 EXPORT_SYMBOL(inet_sk_rx_dst_set); 2266 2267 const struct inet_connection_sock_af_ops ipv4_specific = { 2268 .queue_xmit = ip_queue_xmit, 2269 .send_check = tcp_v4_send_check, 2270 .rebuild_header = inet_sk_rebuild_header, 2271 .sk_rx_dst_set = inet_sk_rx_dst_set, 2272 .conn_request = tcp_v4_conn_request, 2273 .syn_recv_sock = tcp_v4_syn_recv_sock, 2274 .net_header_len = sizeof(struct iphdr), 2275 .setsockopt = ip_setsockopt, 2276 .getsockopt = ip_getsockopt, 2277 .addr2sockaddr = inet_csk_addr2sockaddr, 2278 .sockaddr_len = sizeof(struct sockaddr_in), 2279 .mtu_reduced = tcp_v4_mtu_reduced, 2280 }; 2281 EXPORT_SYMBOL(ipv4_specific); 2282 2283 #ifdef CONFIG_TCP_MD5SIG 2284 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = { 2285 .md5_lookup = tcp_v4_md5_lookup, 2286 .calc_md5_hash = tcp_v4_md5_hash_skb, 2287 .md5_parse = tcp_v4_parse_md5_keys, 2288 }; 2289 #endif 2290 2291 /* NOTE: A lot of things set to zero explicitly by call to 2292 * sk_alloc() so need not be done here. 2293 */ 2294 static int tcp_v4_init_sock(struct sock *sk) 2295 { 2296 struct inet_connection_sock *icsk = inet_csk(sk); 2297 2298 tcp_init_sock(sk); 2299 2300 icsk->icsk_af_ops = &ipv4_specific; 2301 2302 #ifdef CONFIG_TCP_MD5SIG 2303 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific; 2304 #endif 2305 2306 return 0; 2307 } 2308 2309 void tcp_v4_destroy_sock(struct sock *sk) 2310 { 2311 struct tcp_sock *tp = tcp_sk(sk); 2312 2313 trace_tcp_destroy_sock(sk); 2314 2315 tcp_clear_xmit_timers(sk); 2316 2317 tcp_cleanup_congestion_control(sk); 2318 2319 tcp_cleanup_ulp(sk); 2320 2321 /* Cleanup up the write buffer. 
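	 * (Any skbs still sitting in the write and retransmit queues are
	 *  freed without being transmitted.)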
*/ 2322 tcp_write_queue_purge(sk); 2323 2324 /* Check if we want to disable active TFO */ 2325 tcp_fastopen_active_disable_ofo_check(sk); 2326 2327 /* Cleans up our, hopefully empty, out_of_order_queue. */ 2328 skb_rbtree_purge(&tp->out_of_order_queue); 2329 2330 #ifdef CONFIG_TCP_MD5SIG 2331 /* Clean up the MD5 key list, if any */ 2332 if (tp->md5sig_info) { 2333 tcp_clear_md5_list(sk); 2334 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu); 2335 tp->md5sig_info = NULL; 2336 static_branch_slow_dec_deferred(&tcp_md5_needed); 2337 } 2338 #endif 2339 2340 /* Clean up a referenced TCP bind bucket. */ 2341 if (inet_csk(sk)->icsk_bind_hash) 2342 inet_put_port(sk); 2343 2344 BUG_ON(rcu_access_pointer(tp->fastopen_rsk)); 2345 2346 /* If socket is aborted during connect operation */ 2347 tcp_free_fastopen_req(tp); 2348 tcp_fastopen_destroy_cipher(sk); 2349 tcp_saved_syn_free(tp); 2350 2351 sk_sockets_allocated_dec(sk); 2352 } 2353 EXPORT_SYMBOL(tcp_v4_destroy_sock); 2354 2355 #ifdef CONFIG_PROC_FS 2356 /* Proc filesystem TCP sock list dumping. */ 2357 2358 static unsigned short seq_file_family(const struct seq_file *seq); 2359 2360 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk) 2361 { 2362 unsigned short family = seq_file_family(seq); 2363 2364 /* AF_UNSPEC is used as a match all */ 2365 return ((family == AF_UNSPEC || family == sk->sk_family) && 2366 net_eq(sock_net(sk), seq_file_net(seq))); 2367 } 2368 2369 /* Find a non empty bucket (starting from st->bucket) 2370 * and return the first sk from it. 2371 */ 2372 static void *listening_get_first(struct seq_file *seq) 2373 { 2374 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2375 struct tcp_iter_state *st = seq->private; 2376 2377 st->offset = 0; 2378 for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) { 2379 struct inet_listen_hashbucket *ilb2; 2380 struct hlist_nulls_node *node; 2381 struct sock *sk; 2382 2383 ilb2 = &hinfo->lhash2[st->bucket]; 2384 if (hlist_nulls_empty(&ilb2->nulls_head)) 2385 continue; 2386 2387 spin_lock(&ilb2->lock); 2388 sk_nulls_for_each(sk, node, &ilb2->nulls_head) { 2389 if (seq_sk_match(seq, sk)) 2390 return sk; 2391 } 2392 spin_unlock(&ilb2->lock); 2393 } 2394 2395 return NULL; 2396 } 2397 2398 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket). 2399 * If "cur" is the last one in the st->bucket, 2400 * call listening_get_first() to return the first sk of the next 2401 * non empty bucket. 
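	 * The lock of the exhausted bucket is released here; when a socket
	 * is found in a later bucket, listening_get_first() returns with
	 * that bucket's lock held instead.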
2402 */ 2403 static void *listening_get_next(struct seq_file *seq, void *cur) 2404 { 2405 struct tcp_iter_state *st = seq->private; 2406 struct inet_listen_hashbucket *ilb2; 2407 struct hlist_nulls_node *node; 2408 struct inet_hashinfo *hinfo; 2409 struct sock *sk = cur; 2410 2411 ++st->num; 2412 ++st->offset; 2413 2414 sk = sk_nulls_next(sk); 2415 sk_nulls_for_each_from(sk, node) { 2416 if (seq_sk_match(seq, sk)) 2417 return sk; 2418 } 2419 2420 hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2421 ilb2 = &hinfo->lhash2[st->bucket]; 2422 spin_unlock(&ilb2->lock); 2423 ++st->bucket; 2424 return listening_get_first(seq); 2425 } 2426 2427 static void *listening_get_idx(struct seq_file *seq, loff_t *pos) 2428 { 2429 struct tcp_iter_state *st = seq->private; 2430 void *rc; 2431 2432 st->bucket = 0; 2433 st->offset = 0; 2434 rc = listening_get_first(seq); 2435 2436 while (rc && *pos) { 2437 rc = listening_get_next(seq, rc); 2438 --*pos; 2439 } 2440 return rc; 2441 } 2442 2443 static inline bool empty_bucket(struct inet_hashinfo *hinfo, 2444 const struct tcp_iter_state *st) 2445 { 2446 return hlist_nulls_empty(&hinfo->ehash[st->bucket].chain); 2447 } 2448 2449 /* 2450 * Get first established socket starting from bucket given in st->bucket. 2451 * If st->bucket is zero, the very first socket in the hash is returned. 2452 */ 2453 static void *established_get_first(struct seq_file *seq) 2454 { 2455 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2456 struct tcp_iter_state *st = seq->private; 2457 2458 st->offset = 0; 2459 for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) { 2460 struct sock *sk; 2461 struct hlist_nulls_node *node; 2462 spinlock_t *lock = inet_ehash_lockp(hinfo, st->bucket); 2463 2464 cond_resched(); 2465 2466 /* Lockless fast path for the common case of empty buckets */ 2467 if (empty_bucket(hinfo, st)) 2468 continue; 2469 2470 spin_lock_bh(lock); 2471 sk_nulls_for_each(sk, node, &hinfo->ehash[st->bucket].chain) { 2472 if (seq_sk_match(seq, sk)) 2473 return sk; 2474 } 2475 spin_unlock_bh(lock); 2476 } 2477 2478 return NULL; 2479 } 2480 2481 static void *established_get_next(struct seq_file *seq, void *cur) 2482 { 2483 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2484 struct tcp_iter_state *st = seq->private; 2485 struct hlist_nulls_node *node; 2486 struct sock *sk = cur; 2487 2488 ++st->num; 2489 ++st->offset; 2490 2491 sk = sk_nulls_next(sk); 2492 2493 sk_nulls_for_each_from(sk, node) { 2494 if (seq_sk_match(seq, sk)) 2495 return sk; 2496 } 2497 2498 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket)); 2499 ++st->bucket; 2500 return established_get_first(seq); 2501 } 2502 2503 static void *established_get_idx(struct seq_file *seq, loff_t pos) 2504 { 2505 struct tcp_iter_state *st = seq->private; 2506 void *rc; 2507 2508 st->bucket = 0; 2509 rc = established_get_first(seq); 2510 2511 while (rc && pos) { 2512 rc = established_get_next(seq, rc); 2513 --pos; 2514 } 2515 return rc; 2516 } 2517 2518 static void *tcp_get_idx(struct seq_file *seq, loff_t pos) 2519 { 2520 void *rc; 2521 struct tcp_iter_state *st = seq->private; 2522 2523 st->state = TCP_SEQ_STATE_LISTENING; 2524 rc = listening_get_idx(seq, &pos); 2525 2526 if (!rc) { 2527 st->state = TCP_SEQ_STATE_ESTABLISHED; 2528 rc = established_get_idx(seq, pos); 2529 } 2530 2531 return rc; 2532 } 2533 2534 static void *tcp_seek_last_pos(struct seq_file *seq) 2535 { 2536 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2537 struct 
tcp_iter_state *st = seq->private; 2538 int bucket = st->bucket; 2539 int offset = st->offset; 2540 int orig_num = st->num; 2541 void *rc = NULL; 2542 2543 switch (st->state) { 2544 case TCP_SEQ_STATE_LISTENING: 2545 if (st->bucket > hinfo->lhash2_mask) 2546 break; 2547 rc = listening_get_first(seq); 2548 while (offset-- && rc && bucket == st->bucket) 2549 rc = listening_get_next(seq, rc); 2550 if (rc) 2551 break; 2552 st->bucket = 0; 2553 st->state = TCP_SEQ_STATE_ESTABLISHED; 2554 fallthrough; 2555 case TCP_SEQ_STATE_ESTABLISHED: 2556 if (st->bucket > hinfo->ehash_mask) 2557 break; 2558 rc = established_get_first(seq); 2559 while (offset-- && rc && bucket == st->bucket) 2560 rc = established_get_next(seq, rc); 2561 } 2562 2563 st->num = orig_num; 2564 2565 return rc; 2566 } 2567 2568 void *tcp_seq_start(struct seq_file *seq, loff_t *pos) 2569 { 2570 struct tcp_iter_state *st = seq->private; 2571 void *rc; 2572 2573 if (*pos && *pos == st->last_pos) { 2574 rc = tcp_seek_last_pos(seq); 2575 if (rc) 2576 goto out; 2577 } 2578 2579 st->state = TCP_SEQ_STATE_LISTENING; 2580 st->num = 0; 2581 st->bucket = 0; 2582 st->offset = 0; 2583 rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; 2584 2585 out: 2586 st->last_pos = *pos; 2587 return rc; 2588 } 2589 EXPORT_SYMBOL(tcp_seq_start); 2590 2591 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) 2592 { 2593 struct tcp_iter_state *st = seq->private; 2594 void *rc = NULL; 2595 2596 if (v == SEQ_START_TOKEN) { 2597 rc = tcp_get_idx(seq, 0); 2598 goto out; 2599 } 2600 2601 switch (st->state) { 2602 case TCP_SEQ_STATE_LISTENING: 2603 rc = listening_get_next(seq, v); 2604 if (!rc) { 2605 st->state = TCP_SEQ_STATE_ESTABLISHED; 2606 st->bucket = 0; 2607 st->offset = 0; 2608 rc = established_get_first(seq); 2609 } 2610 break; 2611 case TCP_SEQ_STATE_ESTABLISHED: 2612 rc = established_get_next(seq, v); 2613 break; 2614 } 2615 out: 2616 ++*pos; 2617 st->last_pos = *pos; 2618 return rc; 2619 } 2620 EXPORT_SYMBOL(tcp_seq_next); 2621 2622 void tcp_seq_stop(struct seq_file *seq, void *v) 2623 { 2624 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2625 struct tcp_iter_state *st = seq->private; 2626 2627 switch (st->state) { 2628 case TCP_SEQ_STATE_LISTENING: 2629 if (v != SEQ_START_TOKEN) 2630 spin_unlock(&hinfo->lhash2[st->bucket].lock); 2631 break; 2632 case TCP_SEQ_STATE_ESTABLISHED: 2633 if (v) 2634 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket)); 2635 break; 2636 } 2637 } 2638 EXPORT_SYMBOL(tcp_seq_stop); 2639 2640 static void get_openreq4(const struct request_sock *req, 2641 struct seq_file *f, int i) 2642 { 2643 const struct inet_request_sock *ireq = inet_rsk(req); 2644 long delta = req->rsk_timer.expires - jiffies; 2645 2646 seq_printf(f, "%4d: %08X:%04X %08X:%04X" 2647 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK", 2648 i, 2649 ireq->ir_loc_addr, 2650 ireq->ir_num, 2651 ireq->ir_rmt_addr, 2652 ntohs(ireq->ir_rmt_port), 2653 TCP_SYN_RECV, 2654 0, 0, /* could print option size, but that is af dependent. 
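	 * These two zeroes fill the tx_queue:rx_queue columns of the
	 * /proc/net/tcp line for a request socket.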
*/ 2655 1, /* timers active (only the expire timer) */ 2656 jiffies_delta_to_clock_t(delta), 2657 req->num_timeout, 2658 from_kuid_munged(seq_user_ns(f), 2659 sock_i_uid(req->rsk_listener)), 2660 0, /* non standard timer */ 2661 0, /* open_requests have no inode */ 2662 0, 2663 req); 2664 } 2665 2666 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) 2667 { 2668 int timer_active; 2669 unsigned long timer_expires; 2670 const struct tcp_sock *tp = tcp_sk(sk); 2671 const struct inet_connection_sock *icsk = inet_csk(sk); 2672 const struct inet_sock *inet = inet_sk(sk); 2673 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq; 2674 __be32 dest = inet->inet_daddr; 2675 __be32 src = inet->inet_rcv_saddr; 2676 __u16 destp = ntohs(inet->inet_dport); 2677 __u16 srcp = ntohs(inet->inet_sport); 2678 int rx_queue; 2679 int state; 2680 2681 if (icsk->icsk_pending == ICSK_TIME_RETRANS || 2682 icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT || 2683 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { 2684 timer_active = 1; 2685 timer_expires = icsk->icsk_timeout; 2686 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) { 2687 timer_active = 4; 2688 timer_expires = icsk->icsk_timeout; 2689 } else if (timer_pending(&sk->sk_timer)) { 2690 timer_active = 2; 2691 timer_expires = sk->sk_timer.expires; 2692 } else { 2693 timer_active = 0; 2694 timer_expires = jiffies; 2695 } 2696 2697 state = inet_sk_state_load(sk); 2698 if (state == TCP_LISTEN) 2699 rx_queue = READ_ONCE(sk->sk_ack_backlog); 2700 else 2701 /* Because we don't lock the socket, 2702 * we might find a transient negative value. 2703 */ 2704 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) - 2705 READ_ONCE(tp->copied_seq), 0); 2706 2707 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX " 2708 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d", 2709 i, src, srcp, dest, destp, state, 2710 READ_ONCE(tp->write_seq) - tp->snd_una, 2711 rx_queue, 2712 timer_active, 2713 jiffies_delta_to_clock_t(timer_expires - jiffies), 2714 icsk->icsk_retransmits, 2715 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)), 2716 icsk->icsk_probes_out, 2717 sock_i_ino(sk), 2718 refcount_read(&sk->sk_refcnt), sk, 2719 jiffies_to_clock_t(icsk->icsk_rto), 2720 jiffies_to_clock_t(icsk->icsk_ack.ato), 2721 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk), 2722 tcp_snd_cwnd(tp), 2723 state == TCP_LISTEN ? 2724 fastopenq->max_qlen : 2725 (tcp_in_initial_slowstart(tp) ? 
-1 : tp->snd_ssthresh)); 2726 } 2727 2728 static void get_timewait4_sock(const struct inet_timewait_sock *tw, 2729 struct seq_file *f, int i) 2730 { 2731 long delta = tw->tw_timer.expires - jiffies; 2732 __be32 dest, src; 2733 __u16 destp, srcp; 2734 2735 dest = tw->tw_daddr; 2736 src = tw->tw_rcv_saddr; 2737 destp = ntohs(tw->tw_dport); 2738 srcp = ntohs(tw->tw_sport); 2739 2740 seq_printf(f, "%4d: %08X:%04X %08X:%04X" 2741 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK", 2742 i, src, srcp, dest, destp, tw->tw_substate, 0, 0, 2743 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0, 2744 refcount_read(&tw->tw_refcnt), tw); 2745 } 2746 2747 #define TMPSZ 150 2748 2749 static int tcp4_seq_show(struct seq_file *seq, void *v) 2750 { 2751 struct tcp_iter_state *st; 2752 struct sock *sk = v; 2753 2754 seq_setwidth(seq, TMPSZ - 1); 2755 if (v == SEQ_START_TOKEN) { 2756 seq_puts(seq, " sl local_address rem_address st tx_queue " 2757 "rx_queue tr tm->when retrnsmt uid timeout " 2758 "inode"); 2759 goto out; 2760 } 2761 st = seq->private; 2762 2763 if (sk->sk_state == TCP_TIME_WAIT) 2764 get_timewait4_sock(v, seq, st->num); 2765 else if (sk->sk_state == TCP_NEW_SYN_RECV) 2766 get_openreq4(v, seq, st->num); 2767 else 2768 get_tcp4_sock(v, seq, st->num); 2769 out: 2770 seq_pad(seq, '\n'); 2771 return 0; 2772 } 2773 2774 #ifdef CONFIG_BPF_SYSCALL 2775 struct bpf_tcp_iter_state { 2776 struct tcp_iter_state state; 2777 unsigned int cur_sk; 2778 unsigned int end_sk; 2779 unsigned int max_sk; 2780 struct sock **batch; 2781 bool st_bucket_done; 2782 }; 2783 2784 struct bpf_iter__tcp { 2785 __bpf_md_ptr(struct bpf_iter_meta *, meta); 2786 __bpf_md_ptr(struct sock_common *, sk_common); 2787 uid_t uid __aligned(8); 2788 }; 2789 2790 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, 2791 struct sock_common *sk_common, uid_t uid) 2792 { 2793 struct bpf_iter__tcp ctx; 2794 2795 meta->seq_num--; /* skip SEQ_START_TOKEN */ 2796 ctx.meta = meta; 2797 ctx.sk_common = sk_common; 2798 ctx.uid = uid; 2799 return bpf_iter_run_prog(prog, &ctx); 2800 } 2801 2802 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter) 2803 { 2804 while (iter->cur_sk < iter->end_sk) 2805 sock_gen_put(iter->batch[iter->cur_sk++]); 2806 } 2807 2808 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter, 2809 unsigned int new_batch_sz) 2810 { 2811 struct sock **new_batch; 2812 2813 new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz, 2814 GFP_USER | __GFP_NOWARN); 2815 if (!new_batch) 2816 return -ENOMEM; 2817 2818 bpf_iter_tcp_put_batch(iter); 2819 kvfree(iter->batch); 2820 iter->batch = new_batch; 2821 iter->max_sk = new_batch_sz; 2822 2823 return 0; 2824 } 2825 2826 static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq, 2827 struct sock *start_sk) 2828 { 2829 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2830 struct bpf_tcp_iter_state *iter = seq->private; 2831 struct tcp_iter_state *st = &iter->state; 2832 struct hlist_nulls_node *node; 2833 unsigned int expected = 1; 2834 struct sock *sk; 2835 2836 sock_hold(start_sk); 2837 iter->batch[iter->end_sk++] = start_sk; 2838 2839 sk = sk_nulls_next(start_sk); 2840 sk_nulls_for_each_from(sk, node) { 2841 if (seq_sk_match(seq, sk)) { 2842 if (iter->end_sk < iter->max_sk) { 2843 sock_hold(sk); 2844 iter->batch[iter->end_sk++] = sk; 2845 } 2846 expected++; 2847 } 2848 } 2849 spin_unlock(&hinfo->lhash2[st->bucket].lock); 2850 2851 return expected; 2852 } 2853 2854 static unsigned int 
bpf_iter_tcp_established_batch(struct seq_file *seq, 2855 struct sock *start_sk) 2856 { 2857 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2858 struct bpf_tcp_iter_state *iter = seq->private; 2859 struct tcp_iter_state *st = &iter->state; 2860 struct hlist_nulls_node *node; 2861 unsigned int expected = 1; 2862 struct sock *sk; 2863 2864 sock_hold(start_sk); 2865 iter->batch[iter->end_sk++] = start_sk; 2866 2867 sk = sk_nulls_next(start_sk); 2868 sk_nulls_for_each_from(sk, node) { 2869 if (seq_sk_match(seq, sk)) { 2870 if (iter->end_sk < iter->max_sk) { 2871 sock_hold(sk); 2872 iter->batch[iter->end_sk++] = sk; 2873 } 2874 expected++; 2875 } 2876 } 2877 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket)); 2878 2879 return expected; 2880 } 2881 2882 static struct sock *bpf_iter_tcp_batch(struct seq_file *seq) 2883 { 2884 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2885 struct bpf_tcp_iter_state *iter = seq->private; 2886 struct tcp_iter_state *st = &iter->state; 2887 unsigned int expected; 2888 bool resized = false; 2889 struct sock *sk; 2890 2891 /* The st->bucket is done. Directly advance to the next 2892 * bucket instead of having the tcp_seek_last_pos() to skip 2893 * one by one in the current bucket and eventually find out 2894 * it has to advance to the next bucket. 2895 */ 2896 if (iter->st_bucket_done) { 2897 st->offset = 0; 2898 st->bucket++; 2899 if (st->state == TCP_SEQ_STATE_LISTENING && 2900 st->bucket > hinfo->lhash2_mask) { 2901 st->state = TCP_SEQ_STATE_ESTABLISHED; 2902 st->bucket = 0; 2903 } 2904 } 2905 2906 again: 2907 /* Get a new batch */ 2908 iter->cur_sk = 0; 2909 iter->end_sk = 0; 2910 iter->st_bucket_done = false; 2911 2912 sk = tcp_seek_last_pos(seq); 2913 if (!sk) 2914 return NULL; /* Done */ 2915 2916 if (st->state == TCP_SEQ_STATE_LISTENING) 2917 expected = bpf_iter_tcp_listening_batch(seq, sk); 2918 else 2919 expected = bpf_iter_tcp_established_batch(seq, sk); 2920 2921 if (iter->end_sk == expected) { 2922 iter->st_bucket_done = true; 2923 return sk; 2924 } 2925 2926 if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) { 2927 resized = true; 2928 goto again; 2929 } 2930 2931 return sk; 2932 } 2933 2934 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos) 2935 { 2936 /* bpf iter does not support lseek, so it always 2937 * continue from where it was stop()-ped. 2938 */ 2939 if (*pos) 2940 return bpf_iter_tcp_batch(seq); 2941 2942 return SEQ_START_TOKEN; 2943 } 2944 2945 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) 2946 { 2947 struct bpf_tcp_iter_state *iter = seq->private; 2948 struct tcp_iter_state *st = &iter->state; 2949 struct sock *sk; 2950 2951 /* Whenever seq_next() is called, the iter->cur_sk is 2952 * done with seq_show(), so advance to the next sk in 2953 * the batch. 2954 */ 2955 if (iter->cur_sk < iter->end_sk) { 2956 /* Keeping st->num consistent in tcp_iter_state. 2957 * bpf_iter_tcp does not use st->num. 2958 * meta.seq_num is used instead. 2959 */ 2960 st->num++; 2961 /* Move st->offset to the next sk in the bucket such that 2962 * the future start() will resume at st->offset in 2963 * st->bucket. See tcp_seek_last_pos(). 2964 */ 2965 st->offset++; 2966 sock_gen_put(iter->batch[iter->cur_sk++]); 2967 } 2968 2969 if (iter->cur_sk < iter->end_sk) 2970 sk = iter->batch[iter->cur_sk]; 2971 else 2972 sk = bpf_iter_tcp_batch(seq); 2973 2974 ++*pos; 2975 /* Keeping st->last_pos consistent in tcp_iter_state. 
2976 * bpf iter does not do lseek, so st->last_pos always equals to *pos. 2977 */ 2978 st->last_pos = *pos; 2979 return sk; 2980 } 2981 2982 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v) 2983 { 2984 struct bpf_iter_meta meta; 2985 struct bpf_prog *prog; 2986 struct sock *sk = v; 2987 uid_t uid; 2988 int ret; 2989 2990 if (v == SEQ_START_TOKEN) 2991 return 0; 2992 2993 if (sk_fullsock(sk)) 2994 lock_sock(sk); 2995 2996 if (unlikely(sk_unhashed(sk))) { 2997 ret = SEQ_SKIP; 2998 goto unlock; 2999 } 3000 3001 if (sk->sk_state == TCP_TIME_WAIT) { 3002 uid = 0; 3003 } else if (sk->sk_state == TCP_NEW_SYN_RECV) { 3004 const struct request_sock *req = v; 3005 3006 uid = from_kuid_munged(seq_user_ns(seq), 3007 sock_i_uid(req->rsk_listener)); 3008 } else { 3009 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)); 3010 } 3011 3012 meta.seq = seq; 3013 prog = bpf_iter_get_info(&meta, false); 3014 ret = tcp_prog_seq_show(prog, &meta, v, uid); 3015 3016 unlock: 3017 if (sk_fullsock(sk)) 3018 release_sock(sk); 3019 return ret; 3020 3021 } 3022 3023 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v) 3024 { 3025 struct bpf_tcp_iter_state *iter = seq->private; 3026 struct bpf_iter_meta meta; 3027 struct bpf_prog *prog; 3028 3029 if (!v) { 3030 meta.seq = seq; 3031 prog = bpf_iter_get_info(&meta, true); 3032 if (prog) 3033 (void)tcp_prog_seq_show(prog, &meta, v, 0); 3034 } 3035 3036 if (iter->cur_sk < iter->end_sk) { 3037 bpf_iter_tcp_put_batch(iter); 3038 iter->st_bucket_done = false; 3039 } 3040 } 3041 3042 static const struct seq_operations bpf_iter_tcp_seq_ops = { 3043 .show = bpf_iter_tcp_seq_show, 3044 .start = bpf_iter_tcp_seq_start, 3045 .next = bpf_iter_tcp_seq_next, 3046 .stop = bpf_iter_tcp_seq_stop, 3047 }; 3048 #endif 3049 static unsigned short seq_file_family(const struct seq_file *seq) 3050 { 3051 const struct tcp_seq_afinfo *afinfo; 3052 3053 #ifdef CONFIG_BPF_SYSCALL 3054 /* Iterated from bpf_iter. Let the bpf prog to filter instead. */ 3055 if (seq->op == &bpf_iter_tcp_seq_ops) 3056 return AF_UNSPEC; 3057 #endif 3058 3059 /* Iterated from proc fs */ 3060 afinfo = pde_data(file_inode(seq->file)); 3061 return afinfo->family; 3062 } 3063 3064 static const struct seq_operations tcp4_seq_ops = { 3065 .show = tcp4_seq_show, 3066 .start = tcp_seq_start, 3067 .next = tcp_seq_next, 3068 .stop = tcp_seq_stop, 3069 }; 3070 3071 static struct tcp_seq_afinfo tcp4_seq_afinfo = { 3072 .family = AF_INET, 3073 }; 3074 3075 static int __net_init tcp4_proc_init_net(struct net *net) 3076 { 3077 if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops, 3078 sizeof(struct tcp_iter_state), &tcp4_seq_afinfo)) 3079 return -ENOMEM; 3080 return 0; 3081 } 3082 3083 static void __net_exit tcp4_proc_exit_net(struct net *net) 3084 { 3085 remove_proc_entry("tcp", net->proc_net); 3086 } 3087 3088 static struct pernet_operations tcp4_net_ops = { 3089 .init = tcp4_proc_init_net, 3090 .exit = tcp4_proc_exit_net, 3091 }; 3092 3093 int __init tcp4_proc_init(void) 3094 { 3095 return register_pernet_subsys(&tcp4_net_ops); 3096 } 3097 3098 void tcp4_proc_exit(void) 3099 { 3100 unregister_pernet_subsys(&tcp4_net_ops); 3101 } 3102 #endif /* CONFIG_PROC_FS */ 3103 3104 /* @wake is one when sk_stream_write_space() calls us. 3105 * This sends EPOLLOUT only if notsent_bytes is half the limit. 3106 * This mimics the strategy used in sock_def_write_space(). 
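	 * In other words, with @wake == 1 the test below becomes
	 * (notsent_bytes << 1) < tcp_notsent_lowat(tp), i.e. the socket is
	 * only reported writable once less than half of the not-sent budget
	 * is in use.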
3107 */ 3108 bool tcp_stream_memory_free(const struct sock *sk, int wake) 3109 { 3110 const struct tcp_sock *tp = tcp_sk(sk); 3111 u32 notsent_bytes = READ_ONCE(tp->write_seq) - 3112 READ_ONCE(tp->snd_nxt); 3113 3114 return (notsent_bytes << wake) < tcp_notsent_lowat(tp); 3115 } 3116 EXPORT_SYMBOL(tcp_stream_memory_free); 3117 3118 struct proto tcp_prot = { 3119 .name = "TCP", 3120 .owner = THIS_MODULE, 3121 .close = tcp_close, 3122 .pre_connect = tcp_v4_pre_connect, 3123 .connect = tcp_v4_connect, 3124 .disconnect = tcp_disconnect, 3125 .accept = inet_csk_accept, 3126 .ioctl = tcp_ioctl, 3127 .init = tcp_v4_init_sock, 3128 .destroy = tcp_v4_destroy_sock, 3129 .shutdown = tcp_shutdown, 3130 .setsockopt = tcp_setsockopt, 3131 .getsockopt = tcp_getsockopt, 3132 .bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt, 3133 .keepalive = tcp_set_keepalive, 3134 .recvmsg = tcp_recvmsg, 3135 .sendmsg = tcp_sendmsg, 3136 .splice_eof = tcp_splice_eof, 3137 .backlog_rcv = tcp_v4_do_rcv, 3138 .release_cb = tcp_release_cb, 3139 .hash = inet_hash, 3140 .unhash = inet_unhash, 3141 .get_port = inet_csk_get_port, 3142 .put_port = inet_put_port, 3143 #ifdef CONFIG_BPF_SYSCALL 3144 .psock_update_sk_prot = tcp_bpf_update_proto, 3145 #endif 3146 .enter_memory_pressure = tcp_enter_memory_pressure, 3147 .leave_memory_pressure = tcp_leave_memory_pressure, 3148 .stream_memory_free = tcp_stream_memory_free, 3149 .sockets_allocated = &tcp_sockets_allocated, 3150 .orphan_count = &tcp_orphan_count, 3151 3152 .memory_allocated = &tcp_memory_allocated, 3153 .per_cpu_fw_alloc = &tcp_memory_per_cpu_fw_alloc, 3154 3155 .memory_pressure = &tcp_memory_pressure, 3156 .sysctl_mem = sysctl_tcp_mem, 3157 .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem), 3158 .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), 3159 .max_header = MAX_TCP_HEADER, 3160 .obj_size = sizeof(struct tcp_sock), 3161 .slab_flags = SLAB_TYPESAFE_BY_RCU, 3162 .twsk_prot = &tcp_timewait_sock_ops, 3163 .rsk_prot = &tcp_request_sock_ops, 3164 .h.hashinfo = NULL, 3165 .no_autobind = true, 3166 .diag_destroy = tcp_abort, 3167 }; 3168 EXPORT_SYMBOL(tcp_prot); 3169 3170 static void __net_exit tcp_sk_exit(struct net *net) 3171 { 3172 if (net->ipv4.tcp_congestion_control) 3173 bpf_module_put(net->ipv4.tcp_congestion_control, 3174 net->ipv4.tcp_congestion_control->owner); 3175 } 3176 3177 static void __net_init tcp_set_hashinfo(struct net *net) 3178 { 3179 struct inet_hashinfo *hinfo; 3180 unsigned int ehash_entries; 3181 struct net *old_net; 3182 3183 if (net_eq(net, &init_net)) 3184 goto fallback; 3185 3186 old_net = current->nsproxy->net_ns; 3187 ehash_entries = READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries); 3188 if (!ehash_entries) 3189 goto fallback; 3190 3191 ehash_entries = roundup_pow_of_two(ehash_entries); 3192 hinfo = inet_pernet_hashinfo_alloc(&tcp_hashinfo, ehash_entries); 3193 if (!hinfo) { 3194 pr_warn("Failed to allocate TCP ehash (entries: %u) " 3195 "for a netns, fallback to the global one\n", 3196 ehash_entries); 3197 fallback: 3198 hinfo = &tcp_hashinfo; 3199 ehash_entries = tcp_hashinfo.ehash_mask + 1; 3200 } 3201 3202 net->ipv4.tcp_death_row.hashinfo = hinfo; 3203 net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2; 3204 net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128); 3205 } 3206 3207 static int __net_init tcp_sk_init(struct net *net) 3208 { 3209 net->ipv4.sysctl_tcp_ecn = 2; 3210 net->ipv4.sysctl_tcp_ecn_fallback = 1; 3211 3212 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS; 3213 
net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS; 3214 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD; 3215 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL; 3216 net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS; 3217 3218 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME; 3219 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES; 3220 net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL; 3221 3222 net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES; 3223 net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES; 3224 net->ipv4.sysctl_tcp_syncookies = 1; 3225 net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH; 3226 net->ipv4.sysctl_tcp_retries1 = TCP_RETR1; 3227 net->ipv4.sysctl_tcp_retries2 = TCP_RETR2; 3228 net->ipv4.sysctl_tcp_orphan_retries = 0; 3229 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT; 3230 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX; 3231 net->ipv4.sysctl_tcp_tw_reuse = 2; 3232 net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1; 3233 3234 refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1); 3235 tcp_set_hashinfo(net); 3236 3237 net->ipv4.sysctl_tcp_sack = 1; 3238 net->ipv4.sysctl_tcp_window_scaling = 1; 3239 net->ipv4.sysctl_tcp_timestamps = 1; 3240 net->ipv4.sysctl_tcp_early_retrans = 3; 3241 net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION; 3242 net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */ 3243 net->ipv4.sysctl_tcp_retrans_collapse = 1; 3244 net->ipv4.sysctl_tcp_max_reordering = 300; 3245 net->ipv4.sysctl_tcp_dsack = 1; 3246 net->ipv4.sysctl_tcp_app_win = 31; 3247 net->ipv4.sysctl_tcp_adv_win_scale = 1; 3248 net->ipv4.sysctl_tcp_frto = 2; 3249 net->ipv4.sysctl_tcp_moderate_rcvbuf = 1; 3250 /* This limits the percentage of the congestion window which we 3251 * will allow a single TSO frame to consume. Building TSO frames 3252 * which are too large can cause TCP streams to be bursty. 3253 */ 3254 net->ipv4.sysctl_tcp_tso_win_divisor = 3; 3255 /* Default TSQ limit of 16 TSO segments */ 3256 net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536; 3257 3258 /* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. 
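	 * (INT_MAX below effectively removes the global cap; challenge ACKs
	 *  are still rate limited per socket elsewhere in the stack.)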
*/ 3259 net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX; 3260 3261 net->ipv4.sysctl_tcp_min_tso_segs = 2; 3262 net->ipv4.sysctl_tcp_tso_rtt_log = 9; /* 2^9 = 512 usec */ 3263 net->ipv4.sysctl_tcp_min_rtt_wlen = 300; 3264 net->ipv4.sysctl_tcp_autocorking = 1; 3265 net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2; 3266 net->ipv4.sysctl_tcp_pacing_ss_ratio = 200; 3267 net->ipv4.sysctl_tcp_pacing_ca_ratio = 120; 3268 if (net != &init_net) { 3269 memcpy(net->ipv4.sysctl_tcp_rmem, 3270 init_net.ipv4.sysctl_tcp_rmem, 3271 sizeof(init_net.ipv4.sysctl_tcp_rmem)); 3272 memcpy(net->ipv4.sysctl_tcp_wmem, 3273 init_net.ipv4.sysctl_tcp_wmem, 3274 sizeof(init_net.ipv4.sysctl_tcp_wmem)); 3275 } 3276 net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC; 3277 net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC; 3278 net->ipv4.sysctl_tcp_comp_sack_nr = 44; 3279 net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; 3280 net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0; 3281 atomic_set(&net->ipv4.tfo_active_disable_times, 0); 3282 3283 /* Set default values for PLB */ 3284 net->ipv4.sysctl_tcp_plb_enabled = 0; /* Disabled by default */ 3285 net->ipv4.sysctl_tcp_plb_idle_rehash_rounds = 3; 3286 net->ipv4.sysctl_tcp_plb_rehash_rounds = 12; 3287 net->ipv4.sysctl_tcp_plb_suspend_rto_sec = 60; 3288 /* Default congestion threshold for PLB to mark a round is 50% */ 3289 net->ipv4.sysctl_tcp_plb_cong_thresh = (1 << TCP_PLB_SCALE) / 2; 3290 3291 /* Reno is always built in */ 3292 if (!net_eq(net, &init_net) && 3293 bpf_try_module_get(init_net.ipv4.tcp_congestion_control, 3294 init_net.ipv4.tcp_congestion_control->owner)) 3295 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control; 3296 else 3297 net->ipv4.tcp_congestion_control = &tcp_reno; 3298 3299 net->ipv4.sysctl_tcp_syn_linear_timeouts = 4; 3300 net->ipv4.sysctl_tcp_shrink_window = 0; 3301 3302 return 0; 3303 } 3304 3305 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list) 3306 { 3307 struct net *net; 3308 3309 /* make sure concurrent calls to tcp_sk_exit_batch from net_cleanup_work 3310 * and failed setup_net error unwinding path are serialized. 3311 * 3312 * tcp_twsk_purge() handles twsk in any dead netns, not just those in 3313 * net_exit_list, the thread that dismantles a particular twsk must 3314 * do so without other thread progressing to refcount_dec_and_test() of 3315 * tcp_death_row.tw_refcount. 
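	 * Without this serialization the WARN_ON_ONCE() on tw_refcount
	 * below could trigger spuriously.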
3316 */ 3317 mutex_lock(&tcp_exit_batch_mutex); 3318 3319 tcp_twsk_purge(net_exit_list); 3320 3321 list_for_each_entry(net, net_exit_list, exit_list) { 3322 inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo); 3323 WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount)); 3324 tcp_fastopen_ctx_destroy(net); 3325 } 3326 3327 mutex_unlock(&tcp_exit_batch_mutex); 3328 } 3329 3330 static struct pernet_operations __net_initdata tcp_sk_ops = { 3331 .init = tcp_sk_init, 3332 .exit = tcp_sk_exit, 3333 .exit_batch = tcp_sk_exit_batch, 3334 }; 3335 3336 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 3337 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta, 3338 struct sock_common *sk_common, uid_t uid) 3339 3340 #define INIT_BATCH_SZ 16 3341 3342 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux) 3343 { 3344 struct bpf_tcp_iter_state *iter = priv_data; 3345 int err; 3346 3347 err = bpf_iter_init_seq_net(priv_data, aux); 3348 if (err) 3349 return err; 3350 3351 err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ); 3352 if (err) { 3353 bpf_iter_fini_seq_net(priv_data); 3354 return err; 3355 } 3356 3357 return 0; 3358 } 3359 3360 static void bpf_iter_fini_tcp(void *priv_data) 3361 { 3362 struct bpf_tcp_iter_state *iter = priv_data; 3363 3364 bpf_iter_fini_seq_net(priv_data); 3365 kvfree(iter->batch); 3366 } 3367 3368 static const struct bpf_iter_seq_info tcp_seq_info = { 3369 .seq_ops = &bpf_iter_tcp_seq_ops, 3370 .init_seq_private = bpf_iter_init_tcp, 3371 .fini_seq_private = bpf_iter_fini_tcp, 3372 .seq_priv_size = sizeof(struct bpf_tcp_iter_state), 3373 }; 3374 3375 static const struct bpf_func_proto * 3376 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id, 3377 const struct bpf_prog *prog) 3378 { 3379 switch (func_id) { 3380 case BPF_FUNC_setsockopt: 3381 return &bpf_sk_setsockopt_proto; 3382 case BPF_FUNC_getsockopt: 3383 return &bpf_sk_getsockopt_proto; 3384 default: 3385 return NULL; 3386 } 3387 } 3388 3389 static struct bpf_iter_reg tcp_reg_info = { 3390 .target = "tcp", 3391 .ctx_arg_info_size = 1, 3392 .ctx_arg_info = { 3393 { offsetof(struct bpf_iter__tcp, sk_common), 3394 PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED }, 3395 }, 3396 .get_func_proto = bpf_iter_tcp_get_func_proto, 3397 .seq_info = &tcp_seq_info, 3398 }; 3399 3400 static void __init bpf_iter_register(void) 3401 { 3402 tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON]; 3403 if (bpf_iter_reg_target(&tcp_reg_info)) 3404 pr_warn("Warning: could not register bpf iterator tcp\n"); 3405 } 3406 3407 #endif 3408 3409 void __init tcp_v4_init(void) 3410 { 3411 int cpu, res; 3412 3413 for_each_possible_cpu(cpu) { 3414 struct sock *sk; 3415 3416 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW, 3417 IPPROTO_TCP, &init_net); 3418 if (res) 3419 panic("Failed to create the TCP control socket.\n"); 3420 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); 3421 3422 /* Please enforce IP_DF and IPID==0 for RST and 3423 * ACK sent in SYN-RECV and TIME-WAIT state. 3424 */ 3425 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO; 3426 3427 per_cpu(ipv4_tcp_sk, cpu) = sk; 3428 } 3429 if (register_pernet_subsys(&tcp_sk_ops)) 3430 panic("Failed to create the TCP control socket.\n"); 3431 3432 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 3433 bpf_iter_register(); 3434 #endif 3435 } 3436