// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 *		IPv4 specific functions
 *
 *		code split from:
 *		linux/ipv4/tcp.c
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *		See tcp.c for author information
 */

/*
 * Changes:
 *		David S. Miller	:	New socket lookup architecture.
 *					This code is dedicated to John Dyson.
 *		David S. Miller :	Change semantics of established hash,
 *					half is devoted to TIME_WAIT sockets
 *					and the rest go in the other half.
 *		Andi Kleen :		Add support for syncookies and fixed
 *					some bugs: ip options weren't passed to
 *					the TCP layer, missed a check for an
 *					ACK bit.
 *		Andi Kleen :		Implemented fast path mtu discovery.
 *	     				Fixed many serious bugs in the
 *					request_sock handling and moved
 *					most of it into the af independent code.
 *					Added tail drop and some other bugfixes.
 *					Added new listen semantics.
 *		Mike McLagan	:	Routing by source
 *	Juan Jose Ciarlante:		ip_dynaddr bits
 *		Andi Kleen:		various fixes.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year
 *					coma.
 *	Andi Kleen		:	Fix new listen.
 *	Andi Kleen		:	Fix accept error reporting.
 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
 *	Alexey Kuznetsov		allows both IPv4 and IPv6 sockets to bind
 *					a single port at the same time.
*/

#define pr_fmt(fmt) "TCP: " fmt

#include <linux/bottom_half.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
#include <linux/slab.h>

#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/secure_seq.h>
#include <net/busy_poll.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/inetdevice.h>
#include <linux/btf_ids.h>

#include <crypto/hash.h>
#include <linux/scatterlist.h>

#include <trace/events/tcp.h>

#ifdef CONFIG_TCP_MD5SIG
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
#endif

struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);

/* Compute the initial sequence number for a reply to @skb by handing the
 * packet's IPv4 addresses and TCP ports to secure_tcp_seq(). The packet's
 * destination address/port are passed first, its source address/port second.
 */
static u32 tcp_v4_init_seq(const struct sk_buff *skb)
{
	return secure_tcp_seq(ip_hdr(skb)->daddr,
			      ip_hdr(skb)->saddr,
			      tcp_hdr(skb)->dest,
			      tcp_hdr(skb)->source);
}

/* Return the timestamp offset produced by secure_tcp_ts_off() for @net and
 * the destination/source IPv4 addresses found in @skb.
 */
static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
{
	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
}

/* Decide whether the TIME-WAIT socket @sktw may be taken over by @sk.
 *
 * The tcp_tw_reuse sysctl of sk's netns is consulted; when it is 2, reuse
 * is only kept if the TIME-WAIT socket is bound to the loopback ifindex or
 * one of its addresses is a loopback (for IPv6 also a v4-mapped loopback)
 * address.
 *
 * Returns 1, with a reference taken on @sktw, when the socket may be
 * reused, and 0 otherwise. Unless @sk is in repair mode, a successful call
 * also seeds tp->write_seq and the ts_recent state from the TIME-WAIT
 * bucket.
 */
int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
	const struct inet_timewait_sock *tw = inet_twsk(sktw);
	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
	struct tcp_sock *tp = tcp_sk(sk);
	int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;

	if (reuse == 2) {
		/* Still does not detect *everything* that goes through
		 * lo, since we require a loopback src or dst address
		 * or direct binding to 'lo' interface.
		 */
		bool loopback = false;
		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
			loopback = true;
#if IS_ENABLED(CONFIG_IPV6)
		if (tw->tw_family == AF_INET6) {
			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
				loopback = true;
		} else
#endif
		{
			if (ipv4_is_loopback(tw->tw_daddr) ||
			    ipv4_is_loopback(tw->tw_rcv_saddr))
				loopback = true;
		}
		/* Mode 2 without any loopback evidence means no reuse. */
		if (!loopback)
			reuse = 0;
	}

	/* With PAWS, it is safe from the viewpoint
	   of data integrity. Even without PAWS it is safe provided sequence
	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.

	   Actually, the idea is close to VJ's one, only timestamp cache is
	   held not per host, but per port pair and TW bucket is used as state
	   holder.

	   If TW bucket has been already destroyed we fall back to VJ's scheme
	   and use initial timestamp retrieved from peer table.
	 */
	if (tcptw->tw_ts_recent_stamp &&
	    (!twp || (reuse && time_after32(ktime_get_seconds(),
					    tcptw->tw_ts_recent_stamp)))) {
		/* In case of repair and re-using TIME-WAIT sockets we still
		 * want to be sure that it is safe as above but honor the
		 * sequence numbers and time stamps set as part of the repair
		 * process.
		 *
		 * Without this check re-using a TIME-WAIT socket with TCP
		 * repair would accumulate a -1 on the repair assigned
		 * sequence number. The first time it is reused the sequence
		 * is -1, the second time -2, etc. This fixes that issue
		 * without appearing to create any others.
		 */
		if (likely(!tp->repair)) {
			u32 seq = tcptw->tw_snd_nxt + 65535 + 2;

			/* A write_seq of 0 is never handed out here; 1 is
			 * used instead.
			 */
			if (!seq)
				seq = 1;
			WRITE_ONCE(tp->write_seq, seq);
			tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
			tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
		}
		sock_hold(sktw);
		return 1;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(tcp_twsk_unique);

/* Hook run before tcp_v4_connect(): rejects a too-short address with
 * -EINVAL, then returns the result of the cgroup BPF INET4 connect program
 * for @sk and @uaddr.
 */
static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
			      int addr_len)
{
	/* This check is replicated from tcp_v4_connect() and intended to
	 * prevent BPF program called below from accessing bytes that are out
	 * of the bound specified by user in addr_len.
	 */
	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	sock_owned_by_me(sk);

	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
}

/* This will initiate an outgoing connection.
 *
 * @uaddr must hold at least a struct sockaddr_in whose sin_family is
 * AF_INET. Returns 0 on success. Fails with -EINVAL (short address, or a
 * zero destination together with a source-route option), -EAFNOSUPPORT
 * (wrong family), -ENETUNREACH (route is multicast or broadcast), or with
 * the error reported by routing, inet_hash_connect(), fast open or
 * tcp_connect(). Errors after the state change go through the "failure"
 * label, which puts the socket back to TCP_CLOSE and clears its route caps
 * and destination port.
 */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	__be16 orig_sport, orig_dport;
	__be32 daddr, nexthop;
	struct flowi4 *fl4;
	struct rtable *rt;
	int err;
	struct ip_options_rcu *inet_opt;
	struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	inet_opt = rcu_dereference_protected(inet->inet_opt,
					     lockdep_sock_is_held(sk));
	/* With a source-route option the route is looked up towards the
	 * option's first-hop address instead of the final destination.
	 */
	if (inet_opt && inet_opt->opt.srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet_opt->opt.faddr;
	}

	orig_sport = inet->inet_sport;
	orig_dport = usin->sin_port;
	fl4 = &inet->cork.fl.u.ip4;
	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
			      IPPROTO_TCP,
			      orig_sport, orig_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		if (err == -ENETUNREACH)
			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
		return err;
	}

	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	/* Without source routing, use the destination chosen by routing. */
	if (!inet_opt || !inet_opt->opt.srr)
		daddr = fl4->daddr;

	if (!inet->inet_saddr)
		inet->inet_saddr = fl4->saddr;
	sk_rcv_saddr_set(sk, inet->inet_saddr);

	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent	   = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		if (likely(!tp->repair))
			WRITE_ONCE(tp->write_seq, 0);
	}

	inet->inet_dport = usin->sin_port;
	sk_daddr_set(sk, daddr);

	inet_csk(sk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;

	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and not releasing socket
	 * lock select source port, enter ourselves into the hash tables and
	 * complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(tcp_death_row, sk);
	if (err)
		goto failure;

	sk_set_txhash(sk);

	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
			       inet->inet_sport, inet->inet_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		/* rt holds an error pointer; clear it so the failure path
		 * does not pass it to ip_rt_put().
		 */
		rt = NULL;
		goto failure;
	}
	/* OK, now commit destination to socket.  */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->dst);
	rt = NULL;

	if (likely(!tp->repair)) {
		/* Keep a write_seq that was already set (non-zero). */
		if (!tp->write_seq)
			WRITE_ONCE(tp->write_seq,
				   secure_tcp_seq(inet->inet_saddr,
						  inet->inet_daddr,
						  inet->inet_sport,
						  usin->sin_port));
		tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
						 inet->inet_saddr,
						 inet->inet_daddr);
	}

	inet->inet_id = prandom_u32();

	if (tcp_fastopen_defer_connect(sk, &err))
		return err;
	if (err)
		goto failure;

	err = tcp_connect(sk);

	if (err)
		goto failure;

	return 0;

failure:
	/*
	 * This unhashes the socket and releases the local port,
	 * if necessary.
	 */
	tcp_set_state(sk, TCP_CLOSE);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->inet_dport = 0;
	return err;
}
EXPORT_SYMBOL(tcp_v4_connect);

/*
 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 * It can be called through tcp_release_cb() if socket was owned by user
 * at the time tcp_v4_err() was called to handle ICMP message.
 *
 * The new MTU is read from tp->mtu_info, which tcp_v4_err() writes before
 * calling or deferring this function.
 */
void tcp_v4_mtu_reduced(struct sock *sk)
{
	struct inet_sock *inet = inet_sk(sk);
	struct dst_entry *dst;
	u32 mtu;

	/* Nothing to do for sockets in LISTEN or CLOSE state. */
	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
		return;
	mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
	dst = inet_csk_update_pmtu(sk, mtu);
	if (!dst)
		return;

	/* Something is about to be wrong... Remember soft error
	 * for the case, if this connection will not able to recover.
	 */
	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
		sk->sk_err_soft = EMSGSIZE;

	/* From here on work with the MTU of the updated route. */
	mtu = dst_mtu(dst);

	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    ip_sk_accept_pmtu(sk) &&
	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
		tcp_sync_mss(sk, mtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}
EXPORT_SYMBOL(tcp_v4_mtu_reduced);

/* Pass an ICMP redirect carried in @skb to the redirect handler of the
 * socket's route, provided __sk_dst_check() returns one.
 */
static void do_redirect(struct sk_buff *skb, struct sock *sk)
{
	struct dst_entry *dst = __sk_dst_check(sk, 0);

	if (dst)
		dst->ops->redirect(dst, sk, skb);
}


/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets
 *
 * @seq is compared with the request's snt_isn; a mismatch is only counted
 * as an out-of-window ICMP. On a match, @abort selects whether the request
 * is dropped from its listener. The reference on the request is released
 * in every case.
 */
void tcp_req_err(struct sock *sk, u32 seq, bool abort)
{
	struct request_sock *req = inet_reqsk(sk);
	struct net *net = sock_net(sk);

	/* ICMPs are not backlogged, hence we cannot get
	 * an established socket here.
	 */
	if (seq != tcp_rsk(req)->snt_isn) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
	} else if (abort) {
		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
		tcp_listendrop(req->rsk_listener);
	}
	reqsk_put(req);
}
EXPORT_SYMBOL(tcp_req_err);

/* TCP-LD (RFC 6069) logic
 *
 * Returns without action when the socket is owned by user context, when
 * @seq is not snd_una, or when there is no retransmit/backoff to revert.
 * Otherwise one backoff step is undone, icsk_rto is recomputed, and the
 * retransmit timer is either re-armed with the time left relative to the
 * timestamp of the head of the retransmit queue, or tcp_retransmit_timer()
 * is called right away when no time is left.
 */
void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;
	s32 remaining;
	u32 delta_us;

	if (sock_owned_by_user(sk))
		return;

	if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
	    !icsk->icsk_backoff)
		return;

	skb = tcp_rtx_queue_head(sk);
	if (WARN_ON_ONCE(!skb))
		return;

	icsk->icsk_backoff--;
	/* Base RTO first, then re-apply the (now smaller) backoff. */
	icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
	icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);

	tcp_mstamp_refresh(tp);
	delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
	remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);

	if (remaining > 0) {
		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
					  remaining, TCP_RTO_MAX);
	} else {
		/* RTO revert clocked out retransmission.
		 * Will retransmit now.
		 */
		tcp_retransmit_timer(sk);
	}
}
EXPORT_SYMBOL(tcp_ld_RTO_revert);

/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition. If err < 0 then the socket should
 * be closed and the error returned to the user. If err > 0
 * it's just the icmp type << 8 | icmp code. After adjustment
 * header points to the first 8 bytes of the tcp header. We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
*
 * Returns -ENOENT when no socket matches the quoted header and 0 in all
 * other cases.
 */

int tcp_v4_err(struct sk_buff *skb, u32 info)
{
	const struct iphdr *iph = (const struct iphdr *)skb->data;
	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
	struct tcp_sock *tp;
	struct inet_sock *inet;
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	struct sock *sk;
	struct request_sock *fastopen;
	u32 seq, snd_una;
	int err;
	struct net *net = dev_net(skb->dev);

	/* Find the socket that the quoted IP/TCP header refers to. */
	sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
				       th->dest, iph->saddr, ntohs(th->source),
				       inet_iif(skb), 0);
	if (!sk) {
		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
		return -ENOENT;
	}
	/* TIME-WAIT sockets only need their reference dropped. */
	if (sk->sk_state == TCP_TIME_WAIT) {
		inet_twsk_put(inet_twsk(sk));
		return 0;
	}
	seq = ntohl(th->seq);
	/* Request sockets are handled by tcp_req_err(), which also releases
	 * the reference; the last argument says whether the error should
	 * abort the request.
	 */
	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
				     type == ICMP_TIME_EXCEEDED ||
				     (type == ICMP_DEST_UNREACH &&
				      (code == ICMP_NET_UNREACH ||
				       code == ICMP_HOST_UNREACH)));
		return 0;
	}

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 * We do take care of PMTU discovery (RFC1191) special case :
	 * we can receive locally generated ICMP messages while socket is held.
	 */
	if (sock_owned_by_user(sk)) {
		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
	}
	if (sk->sk_state == TCP_CLOSE)
		goto out;

	if (static_branch_unlikely(&ip4_min_ttl)) {
		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
			goto out;
		}
	}

	tp = tcp_sk(sk);
	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
	fastopen = rcu_dereference(tp->fastopen_rsk);
	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
	/* Except for listeners, the quoted sequence number must lie between
	 * snd_una and snd_nxt.
	 */
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, snd_una, tp->snd_nxt)) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	/* Map the ICMP type/code to an errno in err, or finish early. */
	switch (type) {
	case ICMP_REDIRECT:
		if (!sock_owned_by_user(sk))
			do_redirect(skb, sk);
		goto out;
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			/* We are not interested in TCP_LISTEN and open_requests
			 * (SYN-ACKs send out by Linux are always <576bytes so
			 * they should go through unfragmented).
			 */
			if (sk->sk_state == TCP_LISTEN)
				goto out;

			WRITE_ONCE(tp->mtu_info, info);
			if (!sock_owned_by_user(sk)) {
				tcp_v4_mtu_reduced(sk);
			} else {
				/* Defer the MTU update; an extra reference
				 * is taken only when the deferred flag was
				 * not already set.
				 */
				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
					sock_hold(sk);
			}
			goto out;
		}

		err = icmp_err_convert[code].errno;
		/* check if this ICMP message allows revert of backoff.
		 * (see RFC 6069)
		 */
		if (!fastopen &&
		    (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
			tcp_ld_RTO_revert(sk, seq);
		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->sk_state) {
	case TCP_SYN_SENT:
	case TCP_SYN_RECV:
		/* Only in fast or simultaneous open. If a fast open socket is
		 * already accepted it is treated as a connected one below.
		 */
		if (fastopen && !fastopen->sk)
			break;

		ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);

		if (!sock_owned_by_user(sk)) {
			sk->sk_err = err;

			sk_error_report(sk);

			tcp_done(sk);
		} else {
			sk->sk_err_soft = err;
		}
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows to consider as hard errors
	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note, that in modern internet, where routing is unreliable
	 * and in each dark corner broken firewalls sit, sending random
	 * errors ordered by their masters even this two messages finally lose
	 * their original sense (even Linux sends invalid PORT_UNREACHs)
	 *
	 * Now we are in compliance with RFCs.
	 *							--ANK (980905)
	 */

	inet = inet_sk(sk);
	if (!sock_owned_by_user(sk) && inet->recverr) {
		sk->sk_err = err;
		sk_error_report(sk);
	} else	{ /* Only an error on timeout */
		sk->sk_err_soft = err;
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
	return 0;
}

/* Store the complement of tcp_v4_check() over skb->len and the two
 * addresses (with a zero base sum) in the TCP header's check field, and
 * record in csum_start/csum_offset where the TCP header starts and where
 * its check field lives.
 */
void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
{
	struct tcphdr *th = tcp_hdr(skb);

	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
	skb->csum_start = skb_transport_header(skb) - skb->head;
	skb->csum_offset = offsetof(struct tcphdr, check);
}

/* This routine computes an IPv4 TCP checksum.
 * It calls __tcp_v4_send_check() with the socket's own source and
 * destination addresses.
 */
void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
{
	const struct inet_sock *inet = inet_sk(sk);

	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
}
EXPORT_SYMBOL(tcp_v4_send_check);

/*
 *	This routine will send an RST to the other tcp.
 *
 *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 *		      for reset.
*	Answer: if a packet caused RST, it is not for a socket
 *		existing in our system, if it is matched to a socket,
 *		it is just duplicate segment or bug in other side's TCP.
 *		So that we build reply only basing on parameters
 *		arrived with segment.
 *	Exception: precedence violation. We do not implement it in any case.
 *
 *	@sk may be NULL, a full socket, or a non-full socket such as one
 *	in TCP_TIME_WAIT state; each case is tested for below.
 */

#ifdef CONFIG_TCP_MD5SIG
#define OPTION_BYTES TCPOLEN_MD5SIG_ALIGNED
#else
#define OPTION_BYTES sizeof(__be32)
#endif

static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
{
	const struct tcphdr *th = tcp_hdr(skb);
	/* Reply header plus room for one option block of OPTION_BYTES. */
	struct {
		struct tcphdr th;
		__be32 opt[OPTION_BYTES / sizeof(__be32)];
	} rep;
	struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key = NULL;
	const __u8 *hash_location = NULL;
	unsigned char newhash[16];
	int genhash;
	struct sock *sk1 = NULL;
#endif
	u64 transmit_time = 0;
	struct sock *ctl_sk;
	struct net *net;

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	/* If sk not NULL, it means we did a successful lookup and incoming
	 * route had to be correct. prequeue might have dropped our dst.
	 */
	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rep, 0, sizeof(rep));
	rep.th.dest   = th->source;
	rep.th.source = th->dest;
	rep.th.doff   = sizeof(struct tcphdr) / 4;
	rep.th.rst    = 1;

	/* If the offending segment carried an ACK, reuse its ack_seq as our
	 * sequence number; otherwise acknowledge everything it occupied in
	 * sequence space (SYN, FIN and payload bytes).
	 */
	if (th->ack) {
		rep.th.seq = th->ack_seq;
	} else {
		rep.th.ack = 1;
		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				       skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof(arg));
	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);

	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
#ifdef CONFIG_TCP_MD5SIG
	rcu_read_lock();
	hash_location = tcp_parse_md5sig_option(th);
	if (sk && sk_fullsock(sk)) {
		const union tcp_md5_addr *addr;
		int l3index;

		/* sdif set, means packet ingressed via a device
		 * in an L3 domain and inet_iif is set to it.
		 */
		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
	} else if (hash_location) {
		const union tcp_md5_addr *addr;
		int sdif = tcp_v4_sdif(skb);
		int dif = inet_iif(skb);
		int l3index;

		/*
		 * active side is lost. Try to find listening socket through
		 * source port, and then find md5 key through listening socket.
		 * we do not lose security here:
		 * Incoming packet is checked with md5 hash with finding key,
		 * no RST generated if md5 hash doesn't match.
		 */
		sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
					     ip_hdr(skb)->saddr,
					     th->source, ip_hdr(skb)->daddr,
					     ntohs(th->source), dif, sdif);
		/* don't send rst if it can't find key */
		if (!sk1)
			goto out;

		/* sdif set, means packet ingressed via a device
		 * in an L3 domain and dif is set to it.
		 */
		l3index = sdif ? dif : 0;
		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
		if (!key)
			goto out;


		/* Only answer if the incoming segment's MD5 signature
		 * verifies against the key that was found.
		 */
		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
		if (genhash || memcmp(hash_location, newhash, 16) != 0)
			goto out;

	}

	if (key) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
				   (TCPOPT_NOP << 16) |
				   (TCPOPT_MD5SIG << 8) |
				   TCPOLEN_MD5SIG);
		/* Update length and the length the header thinks exists */
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
				     key, ip_hdr(skb)->saddr,
				     ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	/* Can't co-exist with TCPMD5, hence check rep.opt[0] */
	if (rep.opt[0] == 0) {
		__be32 mrst = mptcp_reset_option(skb);

		if (mrst) {
			rep.opt[0] = mrst;
			arg.iov[0].iov_len += sizeof(mrst);
			rep.th.doff = arg.iov[0].iov_len / 4;
		}
	}

	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;

	/* When socket is gone, all binding information is lost.
	 * routing might fail in this case. No choice here, if we choose to force
	 * input interface, we will misroute in case of asymmetric route.
	 */
	if (sk) {
		arg.bound_dev_if = sk->sk_bound_dev_if;
		if (sk_fullsock(sk))
			trace_tcp_send_reset(sk, skb);
	}

	/* sk->sk_bound_dev_if is read above without checking the socket
	 * type, so both structures must keep the field at the same offset.
	 */
	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));

	arg.tos = ip_hdr(skb)->tos;
	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
	local_bh_disable();
	ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
	if (sk) {
		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
				   inet_twsk(sk)->tw_mark : sk->sk_mark;
		ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
				   inet_twsk(sk)->tw_priority : sk->sk_priority;
		transmit_time = tcp_transmit_time(sk);
	}
	ip_send_unicast_reply(ctl_sk,
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len,
			      transmit_time);

	/* Clear the mark that was set on the control socket above. */
	ctl_sk->sk_mark = 0;
	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
	local_bh_enable();

#ifdef CONFIG_TCP_MD5SIG
out:
	rcu_read_unlock();
#endif
}

/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
   outside socket context is ugly, certainly. What can I do?
 */

/* Build and send a bare ACK in reply to @skb.
 *
 * @seq, @ack and @win fill the corresponding header fields. A timestamp
 * option carrying @tsval/@tsecr is added when @tsecr is non-zero, and
 * (with CONFIG_TCP_MD5SIG) an MD5 signature option when @key is set.
 * @oif, when non-zero, is used as the bound device of the reply;
 * @reply_flags and @tos are copied into the reply arguments.
 */
static void tcp_v4_send_ack(const struct sock *sk,
			    struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 tsval, u32 tsecr, int oif,
			    struct tcp_md5sig_key *key,
			    int reply_flags, u8 tos)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
#ifdef CONFIG_TCP_MD5SIG
			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
#endif
			];
	} rep;
	struct net *net = sock_net(sk);
	struct ip_reply_arg arg;
	struct sock *ctl_sk;
	u64 transmit_time;

	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof(arg));

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);
	if (tsecr) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				   (TCPOPT_TIMESTAMP << 8) |
				   TCPOLEN_TIMESTAMP);
		rep.opt[1] = htonl(tsval);
		rep.opt[2] = htonl(tsecr);
		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
	}

	/* Swap the send and the receive. */
	rep.th.dest    = th->source;
	rep.th.source  = th->dest;
	rep.th.doff    = arg.iov[0].iov_len / 4;
	rep.th.seq     = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack     = 1;
	rep.th.window  = htons(win);

#ifdef CONFIG_TCP_MD5SIG
	if (key) {
		/* The MD5 option goes after the three timestamp words, if
		 * those are present.
		 */
		int offset = (tsecr) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
					  (TCPOPT_NOP << 16) |
					  (TCPOPT_MD5SIG << 8) |
					  TCPOLEN_MD5SIG);
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len/4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.flags = reply_flags;
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	if (oif)
		arg.bound_dev_if = oif;
	arg.tos = tos;
	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
	local_bh_disable();
	ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
	ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
			   inet_twsk(sk)->tw_mark : sk->sk_mark;
	ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
			   inet_twsk(sk)->tw_priority : sk->sk_priority;
	transmit_time = tcp_transmit_time(sk);
	ip_send_unicast_reply(ctl_sk,
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len,
			      transmit_time);

	/* Clear the mark that was set on the control socket above. */
	ctl_sk->sk_mark = 0;
	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	local_bh_enable();
}

/* Send an ACK on behalf of the TIME-WAIT socket @sk, taking sequence
 * numbers, window, timestamps, bound device, MD5 key, transparency and
 * TOS from the TIME-WAIT state, then drop a reference on it.
 */
static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
	struct inet_timewait_sock *tw = inet_twsk(sk);
	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);

	tcp_v4_send_ack(sk, skb,
			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
			tcp_time_stamp_raw() + tcptw->tw_ts_offset,
			tcptw->tw_ts_recent,
			tw->tw_bound_dev_if,
			tcp_twsk_md5_key(tcptw),
			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
			tw->tw_tos
			);

	inet_twsk_put(tw);
}

/* Send an ACK for request socket @req in reply to @skb. The MD5 key is
 * looked up on @sk by the packet's source address, and the TOS of the
 * incoming packet is reused.
 */
static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req)
{
	const union tcp_md5_addr *addr;
	int l3index;

	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
	 */
	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
					     tcp_sk(sk)->snd_nxt;

	/* RFC 7323 2.3
	 * The window field (SEG.WND) of every outgoing segment, with the
	 * exception of <SYN> segments, MUST be right-shifted by
	 * Rcv.Wind.Shift bits:
	 */
	addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
	l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
	tcp_v4_send_ack(sk, skb, seq,
			tcp_rsk(req)->rcv_nxt,
			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
			tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
			req->ts_recent,
			0,
			tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
			ip_hdr(skb)->tos);
}

/*
 *	Send a SYN-ACK after having received a SYN.
*	This still operates on a request_sock only, not on a big
 *	socket.
 *
 *	A route is looked up with inet_csk_route_req() when @dst is NULL.
 *	Returns -1 if no route is found or no SYN-ACK could be built,
 *	otherwise the net_xmit_eval() result of sending the packet.
 */
static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
			      struct flowi *fl,
			      struct request_sock *req,
			      struct tcp_fastopen_cookie *foc,
			      enum tcp_synack_type synack_type,
			      struct sk_buff *syn_skb)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	struct flowi4 fl4;
	int err = -1;
	struct sk_buff *skb;
	u8 tos;

	/* First, grab a route. */
	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
		return -1;

	skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);

	if (skb) {
		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);

		/* With tcp_reflect_tos set, take the non-ECN bits from the
		 * TOS recorded for the SYN and the ECN bits from the
		 * listener; otherwise use the listener's TOS unchanged.
		 */
		tos = sock_net(sk)->ipv4.sysctl_tcp_reflect_tos ?
				(tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
				(inet_sk(sk)->tos & INET_ECN_MASK) :
				inet_sk(sk)->tos;

		if (!INET_ECN_is_capable(tos) &&
		    tcp_bpf_ca_needs_ecn((struct sock *)req))
			tos |= INET_ECN_ECT_0;

		rcu_read_lock();
		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
					    ireq->ir_rmt_addr,
					    rcu_dereference(ireq->ireq_opt),
					    tos);
		rcu_read_unlock();
		err = net_xmit_eval(err);
	}

	return err;
}

/*
 *	IPv4 request_sock destructor.
 *	Frees the IP options attached to the request.
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
}

#ifdef CONFIG_TCP_MD5SIG
/*
 * RFC2385 MD5 checksumming requires a mapping of
 * IP address->MD5 Key.
 * We need to maintain these in the sk structure.
*/

DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
EXPORT_SYMBOL(tcp_md5_needed);

/* Return true when @new should replace @old as the best matching key:
 * always when there is no @old, otherwise a key with an l3index beats one
 * without, and among equals the longer prefix wins.
 */
static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
{
	if (!old)
		return true;

	/* l3index always overrides non-l3index */
	if (old->l3index && new->l3index == 0)
		return false;
	if (old->l3index == 0 && new->l3index)
		return true;

	return old->prefixlen < new->prefixlen;
}

/* Find the Key structure for an address.
 *
 * Walks the socket's MD5 key list and returns the best key (as judged by
 * better_md5_match()) whose family equals @family, whose prefix covers
 * @addr and, for keys flagged TCP_MD5SIG_FLAG_IFINDEX, whose l3index
 * equals @l3index. Returns NULL if the socket has no MD5 info or nothing
 * matches.
 */
struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
					   const union tcp_md5_addr *addr,
					   int family)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	const struct tcp_md5sig_info *md5sig;
	__be32 mask;
	struct tcp_md5sig_key *best_match = NULL;
	bool match;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       lockdep_sock_is_held(sk));
	if (!md5sig)
		return NULL;

	hlist_for_each_entry_rcu(key, &md5sig->head, node,
				 lockdep_sock_is_held(sk)) {
		if (key->family != family)
			continue;
		if (key->flags & TCP_MD5SIG_FLAG_IFINDEX && key->l3index != l3index)
			continue;
		if (family == AF_INET) {
			mask = inet_make_mask(key->prefixlen);
			match = (key->addr.a4.s_addr & mask) ==
				(addr->a4.s_addr & mask);
#if IS_ENABLED(CONFIG_IPV6)
		} else if (family == AF_INET6) {
			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
						  key->prefixlen);
#endif
		} else {
			match = false;
		}

		if (match && better_md5_match(best_match, key))
			best_match = key;
	}
	return best_match;
}
EXPORT_SYMBOL(__tcp_md5_do_lookup);

/* Find the key that was configured with exactly these parameters: same
 * family, same TCP_MD5SIG_FLAG_IFINDEX flag state, same l3index, same
 * address bytes and same prefix length. Returns NULL if there is none.
 */
static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
						      const union tcp_md5_addr *addr,
						      int family, u8 prefixlen,
						      int l3index, u8 flags)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	unsigned int size = sizeof(struct in_addr);
	const struct tcp_md5sig_info *md5sig;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       lockdep_sock_is_held(sk));
	if (!md5sig)
		return NULL;
#if IS_ENABLED(CONFIG_IPV6)
	if (family == AF_INET6)
		size = sizeof(struct in6_addr);
#endif
	hlist_for_each_entry_rcu(key, &md5sig->head, node,
				 lockdep_sock_is_held(sk)) {
		if (key->family != family)
			continue;
		if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
			continue;
		if (key->l3index != l3index)
			continue;
		if (!memcmp(&key->addr, addr, size) &&
		    key->prefixlen == prefixlen)
			return key;
	}
	return NULL;
}

/* Look up the MD5 key on @sk for the peer of @addr_sk: the address is
 * addr_sk's sk_daddr and the l3index is derived from addr_sk's bound
 * device.
 */
struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
					 const struct sock *addr_sk)
{
	const union tcp_md5_addr *addr;
	int l3index;

	l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
						 addr_sk->sk_bound_dev_if);
	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
	return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
}
EXPORT_SYMBOL(tcp_v4_md5_lookup);

/* This can be called on a newly created socket, from other files */
int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
		   int family, u8 prefixlen, int l3index, u8 flags,
		   const u8 *newkey, u8 newkeylen, gfp_t gfp)
{
	/* Add Key to the list */
	struct tcp_md5sig_key *key;
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_info *md5sig;

	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
	if (key) {
		/* Pre-existing entry - just update that one.
		 * Note that the key might be used concurrently.
		 * data_race() is telling kcsan that we do not care of
		 * key mismatches, since changing MD5 key on live flows
		 * can lead to packet drops.
		 */
		data_race(memcpy(key->key, newkey, newkeylen));

		/* Pairs with READ_ONCE() in tcp_md5_hash_key().
		 * Also note that a reader could catch new key->keylen value
		 * but old key->key[], this is the reason we use __GFP_ZERO
		 * at sock_kmalloc() time below these lines.
		 */
		WRITE_ONCE(key->keylen, newkeylen);

		return 0;
	}

	md5sig = rcu_dereference_protected(tp->md5sig_info,
					   lockdep_sock_is_held(sk));
	if (!md5sig) {
		md5sig = kmalloc(sizeof(*md5sig), gfp);
		if (!md5sig)
			return -ENOMEM;

		sk_gso_disable(sk);
		INIT_HLIST_HEAD(&md5sig->head);
		rcu_assign_pointer(tp->md5sig_info, md5sig);
	}

	key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
	if (!key)
		return -ENOMEM;
	if (!tcp_alloc_md5sig_pool()) {
		sock_kfree_s(sk, key, sizeof(*key));
		return -ENOMEM;
	}

	memcpy(key->key, newkey, newkeylen);
	key->keylen = newkeylen;
	key->family = family;
	key->prefixlen = prefixlen;
	key->l3index = l3index;
	key->flags = flags;
	memcpy(&key->addr, addr,
	       (family == AF_INET6) ?
sizeof(struct in6_addr) : 1206 sizeof(struct in_addr)); 1207 hlist_add_head_rcu(&key->node, &md5sig->head); 1208 return 0; 1209 } 1210 EXPORT_SYMBOL(tcp_md5_do_add); 1211 1212 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family, 1213 u8 prefixlen, int l3index, u8 flags) 1214 { 1215 struct tcp_md5sig_key *key; 1216 1217 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags); 1218 if (!key) 1219 return -ENOENT; 1220 hlist_del_rcu(&key->node); 1221 atomic_sub(sizeof(*key), &sk->sk_omem_alloc); 1222 kfree_rcu(key, rcu); 1223 return 0; 1224 } 1225 EXPORT_SYMBOL(tcp_md5_do_del); 1226 1227 static void tcp_clear_md5_list(struct sock *sk) 1228 { 1229 struct tcp_sock *tp = tcp_sk(sk); 1230 struct tcp_md5sig_key *key; 1231 struct hlist_node *n; 1232 struct tcp_md5sig_info *md5sig; 1233 1234 md5sig = rcu_dereference_protected(tp->md5sig_info, 1); 1235 1236 hlist_for_each_entry_safe(key, n, &md5sig->head, node) { 1237 hlist_del_rcu(&key->node); 1238 atomic_sub(sizeof(*key), &sk->sk_omem_alloc); 1239 kfree_rcu(key, rcu); 1240 } 1241 } 1242 1243 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname, 1244 sockptr_t optval, int optlen) 1245 { 1246 struct tcp_md5sig cmd; 1247 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr; 1248 const union tcp_md5_addr *addr; 1249 u8 prefixlen = 32; 1250 int l3index = 0; 1251 u8 flags; 1252 1253 if (optlen < sizeof(cmd)) 1254 return -EINVAL; 1255 1256 if (copy_from_sockptr(&cmd, optval, sizeof(cmd))) 1257 return -EFAULT; 1258 1259 if (sin->sin_family != AF_INET) 1260 return -EINVAL; 1261 1262 flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX; 1263 1264 if (optname == TCP_MD5SIG_EXT && 1265 cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) { 1266 prefixlen = cmd.tcpm_prefixlen; 1267 if (prefixlen > 32) 1268 return -EINVAL; 1269 } 1270 1271 if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex && 1272 cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) { 1273 struct net_device *dev; 1274 1275 
rcu_read_lock(); 1276 dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex); 1277 if (dev && netif_is_l3_master(dev)) 1278 l3index = dev->ifindex; 1279 1280 rcu_read_unlock(); 1281 1282 /* ok to reference set/not set outside of rcu; 1283 * right now device MUST be an L3 master 1284 */ 1285 if (!dev || !l3index) 1286 return -EINVAL; 1287 } 1288 1289 addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr; 1290 1291 if (!cmd.tcpm_keylen) 1292 return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags); 1293 1294 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN) 1295 return -EINVAL; 1296 1297 return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags, 1298 cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL); 1299 } 1300 1301 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp, 1302 __be32 daddr, __be32 saddr, 1303 const struct tcphdr *th, int nbytes) 1304 { 1305 struct tcp4_pseudohdr *bp; 1306 struct scatterlist sg; 1307 struct tcphdr *_th; 1308 1309 bp = hp->scratch; 1310 bp->saddr = saddr; 1311 bp->daddr = daddr; 1312 bp->pad = 0; 1313 bp->protocol = IPPROTO_TCP; 1314 bp->len = cpu_to_be16(nbytes); 1315 1316 _th = (struct tcphdr *)(bp + 1); 1317 memcpy(_th, th, sizeof(*th)); 1318 _th->check = 0; 1319 1320 sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th)); 1321 ahash_request_set_crypt(hp->md5_req, &sg, NULL, 1322 sizeof(*bp) + sizeof(*th)); 1323 return crypto_ahash_update(hp->md5_req); 1324 } 1325 1326 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, 1327 __be32 daddr, __be32 saddr, const struct tcphdr *th) 1328 { 1329 struct tcp_md5sig_pool *hp; 1330 struct ahash_request *req; 1331 1332 hp = tcp_get_md5sig_pool(); 1333 if (!hp) 1334 goto clear_hash_noput; 1335 req = hp->md5_req; 1336 1337 if (crypto_ahash_init(req)) 1338 goto clear_hash; 1339 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2)) 1340 goto clear_hash; 1341 if (tcp_md5_hash_key(hp, key)) 1342 goto clear_hash; 1343 ahash_request_set_crypt(req, NULL, 
md5_hash, 0); 1344 if (crypto_ahash_final(req)) 1345 goto clear_hash; 1346 1347 tcp_put_md5sig_pool(); 1348 return 0; 1349 1350 clear_hash: 1351 tcp_put_md5sig_pool(); 1352 clear_hash_noput: 1353 memset(md5_hash, 0, 16); 1354 return 1; 1355 } 1356 1357 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key, 1358 const struct sock *sk, 1359 const struct sk_buff *skb) 1360 { 1361 struct tcp_md5sig_pool *hp; 1362 struct ahash_request *req; 1363 const struct tcphdr *th = tcp_hdr(skb); 1364 __be32 saddr, daddr; 1365 1366 if (sk) { /* valid for establish/request sockets */ 1367 saddr = sk->sk_rcv_saddr; 1368 daddr = sk->sk_daddr; 1369 } else { 1370 const struct iphdr *iph = ip_hdr(skb); 1371 saddr = iph->saddr; 1372 daddr = iph->daddr; 1373 } 1374 1375 hp = tcp_get_md5sig_pool(); 1376 if (!hp) 1377 goto clear_hash_noput; 1378 req = hp->md5_req; 1379 1380 if (crypto_ahash_init(req)) 1381 goto clear_hash; 1382 1383 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len)) 1384 goto clear_hash; 1385 if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2)) 1386 goto clear_hash; 1387 if (tcp_md5_hash_key(hp, key)) 1388 goto clear_hash; 1389 ahash_request_set_crypt(req, NULL, md5_hash, 0); 1390 if (crypto_ahash_final(req)) 1391 goto clear_hash; 1392 1393 tcp_put_md5sig_pool(); 1394 return 0; 1395 1396 clear_hash: 1397 tcp_put_md5sig_pool(); 1398 clear_hash_noput: 1399 memset(md5_hash, 0, 16); 1400 return 1; 1401 } 1402 EXPORT_SYMBOL(tcp_v4_md5_hash_skb); 1403 1404 #endif 1405 1406 /* Called with rcu_read_lock() */ 1407 static bool tcp_v4_inbound_md5_hash(const struct sock *sk, 1408 const struct sk_buff *skb, 1409 int dif, int sdif) 1410 { 1411 #ifdef CONFIG_TCP_MD5SIG 1412 /* 1413 * This gets called for each TCP segment that arrives 1414 * so we want to be efficient. 1415 * We have 3 drop cases: 1416 * o No MD5 hash and one expected. 1417 * o MD5 hash and we're not expecting one. 1418 * o MD5 hash and its wrong. 
1419 */ 1420 const __u8 *hash_location = NULL; 1421 struct tcp_md5sig_key *hash_expected; 1422 const struct iphdr *iph = ip_hdr(skb); 1423 const struct tcphdr *th = tcp_hdr(skb); 1424 const union tcp_md5_addr *addr; 1425 unsigned char newhash[16]; 1426 int genhash, l3index; 1427 1428 /* sdif set, means packet ingressed via a device 1429 * in an L3 domain and dif is set to the l3mdev 1430 */ 1431 l3index = sdif ? dif : 0; 1432 1433 addr = (union tcp_md5_addr *)&iph->saddr; 1434 hash_expected = tcp_md5_do_lookup(sk, l3index, addr, AF_INET); 1435 hash_location = tcp_parse_md5sig_option(th); 1436 1437 /* We've parsed the options - do we have a hash? */ 1438 if (!hash_expected && !hash_location) 1439 return false; 1440 1441 if (hash_expected && !hash_location) { 1442 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND); 1443 return true; 1444 } 1445 1446 if (!hash_expected && hash_location) { 1447 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED); 1448 return true; 1449 } 1450 1451 /* Okay, so this is hash_expected and hash_location - 1452 * so we need to calculate the checksum. 1453 */ 1454 genhash = tcp_v4_md5_hash_skb(newhash, 1455 hash_expected, 1456 NULL, skb); 1457 1458 if (genhash || memcmp(hash_location, newhash, 16) != 0) { 1459 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE); 1460 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s L3 index %d\n", 1461 &iph->saddr, ntohs(th->source), 1462 &iph->daddr, ntohs(th->dest), 1463 genhash ? 
" tcp_v4_calc_md5_hash failed" 1464 : "", l3index); 1465 return true; 1466 } 1467 return false; 1468 #endif 1469 return false; 1470 } 1471 1472 static void tcp_v4_init_req(struct request_sock *req, 1473 const struct sock *sk_listener, 1474 struct sk_buff *skb) 1475 { 1476 struct inet_request_sock *ireq = inet_rsk(req); 1477 struct net *net = sock_net(sk_listener); 1478 1479 sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr); 1480 sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr); 1481 RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb)); 1482 } 1483 1484 static struct dst_entry *tcp_v4_route_req(const struct sock *sk, 1485 struct sk_buff *skb, 1486 struct flowi *fl, 1487 struct request_sock *req) 1488 { 1489 tcp_v4_init_req(req, sk, skb); 1490 1491 if (security_inet_conn_request(sk, skb, req)) 1492 return NULL; 1493 1494 return inet_csk_route_req(sk, &fl->u.ip4, req); 1495 } 1496 1497 struct request_sock_ops tcp_request_sock_ops __read_mostly = { 1498 .family = PF_INET, 1499 .obj_size = sizeof(struct tcp_request_sock), 1500 .rtx_syn_ack = tcp_rtx_synack, 1501 .send_ack = tcp_v4_reqsk_send_ack, 1502 .destructor = tcp_v4_reqsk_destructor, 1503 .send_reset = tcp_v4_send_reset, 1504 .syn_ack_timeout = tcp_syn_ack_timeout, 1505 }; 1506 1507 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { 1508 .mss_clamp = TCP_MSS_DEFAULT, 1509 #ifdef CONFIG_TCP_MD5SIG 1510 .req_md5_lookup = tcp_v4_md5_lookup, 1511 .calc_md5_hash = tcp_v4_md5_hash_skb, 1512 #endif 1513 #ifdef CONFIG_SYN_COOKIES 1514 .cookie_init_seq = cookie_v4_init_sequence, 1515 #endif 1516 .route_req = tcp_v4_route_req, 1517 .init_seq = tcp_v4_init_seq, 1518 .init_ts_off = tcp_v4_init_ts_off, 1519 .send_synack = tcp_v4_send_synack, 1520 }; 1521 1522 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) 1523 { 1524 /* Never answer to SYNs send to broadcast or multicast */ 1525 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) 1526 goto drop; 1527 1528 return 
tcp_conn_request(&tcp_request_sock_ops, 1529 &tcp_request_sock_ipv4_ops, sk, skb); 1530 1531 drop: 1532 tcp_listendrop(sk); 1533 return 0; 1534 } 1535 EXPORT_SYMBOL(tcp_v4_conn_request); 1536 1537 1538 /* 1539 * The three way handshake has completed - we got a valid synack - 1540 * now create the new socket. 1541 */ 1542 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, 1543 struct request_sock *req, 1544 struct dst_entry *dst, 1545 struct request_sock *req_unhash, 1546 bool *own_req) 1547 { 1548 struct inet_request_sock *ireq; 1549 bool found_dup_sk = false; 1550 struct inet_sock *newinet; 1551 struct tcp_sock *newtp; 1552 struct sock *newsk; 1553 #ifdef CONFIG_TCP_MD5SIG 1554 const union tcp_md5_addr *addr; 1555 struct tcp_md5sig_key *key; 1556 int l3index; 1557 #endif 1558 struct ip_options_rcu *inet_opt; 1559 1560 if (sk_acceptq_is_full(sk)) 1561 goto exit_overflow; 1562 1563 newsk = tcp_create_openreq_child(sk, req, skb); 1564 if (!newsk) 1565 goto exit_nonewsk; 1566 1567 newsk->sk_gso_type = SKB_GSO_TCPV4; 1568 inet_sk_rx_dst_set(newsk, skb); 1569 1570 newtp = tcp_sk(newsk); 1571 newinet = inet_sk(newsk); 1572 ireq = inet_rsk(req); 1573 sk_daddr_set(newsk, ireq->ir_rmt_addr); 1574 sk_rcv_saddr_set(newsk, ireq->ir_loc_addr); 1575 newsk->sk_bound_dev_if = ireq->ir_iif; 1576 newinet->inet_saddr = ireq->ir_loc_addr; 1577 inet_opt = rcu_dereference(ireq->ireq_opt); 1578 RCU_INIT_POINTER(newinet->inet_opt, inet_opt); 1579 newinet->mc_index = inet_iif(skb); 1580 newinet->mc_ttl = ip_hdr(skb)->ttl; 1581 newinet->rcv_tos = ip_hdr(skb)->tos; 1582 inet_csk(newsk)->icsk_ext_hdr_len = 0; 1583 if (inet_opt) 1584 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen; 1585 newinet->inet_id = prandom_u32(); 1586 1587 /* Set ToS of the new socket based upon the value of incoming SYN. 1588 * ECT bits are set later in tcp_init_transfer(). 
1589 */ 1590 if (sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) 1591 newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK; 1592 1593 if (!dst) { 1594 dst = inet_csk_route_child_sock(sk, newsk, req); 1595 if (!dst) 1596 goto put_and_exit; 1597 } else { 1598 /* syncookie case : see end of cookie_v4_check() */ 1599 } 1600 sk_setup_caps(newsk, dst); 1601 1602 tcp_ca_openreq_child(newsk, dst); 1603 1604 tcp_sync_mss(newsk, dst_mtu(dst)); 1605 newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst)); 1606 1607 tcp_initialize_rcv_mss(newsk); 1608 1609 #ifdef CONFIG_TCP_MD5SIG 1610 l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif); 1611 /* Copy over the MD5 key from the original socket */ 1612 addr = (union tcp_md5_addr *)&newinet->inet_daddr; 1613 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET); 1614 if (key) { 1615 /* 1616 * We're using one, so create a matching key 1617 * on the newsk structure. If we fail to get 1618 * memory, then we end up not copying the key 1619 * across. Shucks. 
1620 */ 1621 tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index, key->flags, 1622 key->key, key->keylen, GFP_ATOMIC); 1623 sk_gso_disable(newsk); 1624 } 1625 #endif 1626 1627 if (__inet_inherit_port(sk, newsk) < 0) 1628 goto put_and_exit; 1629 *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash), 1630 &found_dup_sk); 1631 if (likely(*own_req)) { 1632 tcp_move_syn(newtp, req); 1633 ireq->ireq_opt = NULL; 1634 } else { 1635 newinet->inet_opt = NULL; 1636 1637 if (!req_unhash && found_dup_sk) { 1638 /* This code path should only be executed in the 1639 * syncookie case only 1640 */ 1641 bh_unlock_sock(newsk); 1642 sock_put(newsk); 1643 newsk = NULL; 1644 } 1645 } 1646 return newsk; 1647 1648 exit_overflow: 1649 NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); 1650 exit_nonewsk: 1651 dst_release(dst); 1652 exit: 1653 tcp_listendrop(sk); 1654 return NULL; 1655 put_and_exit: 1656 newinet->inet_opt = NULL; 1657 inet_csk_prepare_forced_close(newsk); 1658 tcp_done(newsk); 1659 goto exit; 1660 } 1661 EXPORT_SYMBOL(tcp_v4_syn_recv_sock); 1662 1663 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb) 1664 { 1665 #ifdef CONFIG_SYN_COOKIES 1666 const struct tcphdr *th = tcp_hdr(skb); 1667 1668 if (!th->syn) 1669 sk = cookie_v4_check(sk, skb); 1670 #endif 1671 return sk; 1672 } 1673 1674 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph, 1675 struct tcphdr *th, u32 *cookie) 1676 { 1677 u16 mss = 0; 1678 #ifdef CONFIG_SYN_COOKIES 1679 mss = tcp_get_syncookie_mss(&tcp_request_sock_ops, 1680 &tcp_request_sock_ipv4_ops, sk, th); 1681 if (mss) { 1682 *cookie = __cookie_v4_init_sequence(iph, th, &mss); 1683 tcp_synq_overflow(sk); 1684 } 1685 #endif 1686 return mss; 1687 } 1688 1689 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *, 1690 u32)); 1691 /* The socket must have it's spinlock held when we get 1692 * here, unless it is a TCP_LISTEN socket. 
1693 * 1694 * We have a potential double-lock case here, so even when 1695 * doing backlog processing we use the BH locking scheme. 1696 * This is because we cannot sleep with the original spinlock 1697 * held. 1698 */ 1699 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) 1700 { 1701 struct sock *rsk; 1702 1703 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ 1704 struct dst_entry *dst = sk->sk_rx_dst; 1705 1706 sock_rps_save_rxhash(sk, skb); 1707 sk_mark_napi_id(sk, skb); 1708 if (dst) { 1709 if (sk->sk_rx_dst_ifindex != skb->skb_iif || 1710 !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check, 1711 dst, 0)) { 1712 dst_release(dst); 1713 sk->sk_rx_dst = NULL; 1714 } 1715 } 1716 tcp_rcv_established(sk, skb); 1717 return 0; 1718 } 1719 1720 if (tcp_checksum_complete(skb)) 1721 goto csum_err; 1722 1723 if (sk->sk_state == TCP_LISTEN) { 1724 struct sock *nsk = tcp_v4_cookie_check(sk, skb); 1725 1726 if (!nsk) 1727 goto discard; 1728 if (nsk != sk) { 1729 if (tcp_child_process(sk, nsk, skb)) { 1730 rsk = nsk; 1731 goto reset; 1732 } 1733 return 0; 1734 } 1735 } else 1736 sock_rps_save_rxhash(sk, skb); 1737 1738 if (tcp_rcv_state_process(sk, skb)) { 1739 rsk = sk; 1740 goto reset; 1741 } 1742 return 0; 1743 1744 reset: 1745 tcp_v4_send_reset(rsk, skb); 1746 discard: 1747 kfree_skb(skb); 1748 /* Be careful here. If this function gets more complicated and 1749 * gcc suffers from register pressure on the x86, sk (in %ebx) 1750 * might be destroyed here. This current version compiles correctly, 1751 * but you have been warned. 
1752 */ 1753 return 0; 1754 1755 csum_err: 1756 trace_tcp_bad_csum(skb); 1757 TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); 1758 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); 1759 goto discard; 1760 } 1761 EXPORT_SYMBOL(tcp_v4_do_rcv); 1762 1763 int tcp_v4_early_demux(struct sk_buff *skb) 1764 { 1765 const struct iphdr *iph; 1766 const struct tcphdr *th; 1767 struct sock *sk; 1768 1769 if (skb->pkt_type != PACKET_HOST) 1770 return 0; 1771 1772 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr))) 1773 return 0; 1774 1775 iph = ip_hdr(skb); 1776 th = tcp_hdr(skb); 1777 1778 if (th->doff < sizeof(struct tcphdr) / 4) 1779 return 0; 1780 1781 sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo, 1782 iph->saddr, th->source, 1783 iph->daddr, ntohs(th->dest), 1784 skb->skb_iif, inet_sdif(skb)); 1785 if (sk) { 1786 skb->sk = sk; 1787 skb->destructor = sock_edemux; 1788 if (sk_fullsock(sk)) { 1789 struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst); 1790 1791 if (dst) 1792 dst = dst_check(dst, 0); 1793 if (dst && 1794 sk->sk_rx_dst_ifindex == skb->skb_iif) 1795 skb_dst_set_noref(skb, dst); 1796 } 1797 } 1798 return 0; 1799 } 1800 1801 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb) 1802 { 1803 u32 limit, tail_gso_size, tail_gso_segs; 1804 struct skb_shared_info *shinfo; 1805 const struct tcphdr *th; 1806 struct tcphdr *thtail; 1807 struct sk_buff *tail; 1808 unsigned int hdrlen; 1809 bool fragstolen; 1810 u32 gso_segs; 1811 u32 gso_size; 1812 int delta; 1813 1814 /* In case all data was pulled from skb frags (in __pskb_pull_tail()), 1815 * we can fix skb->truesize to its real value to avoid future drops. 1816 * This is valid because skb is not yet charged to the socket. 1817 * It has been noticed pure SACK packets were sometimes dropped 1818 * (if cooked by drivers without copybreak feature). 
1819 */ 1820 skb_condense(skb); 1821 1822 skb_dst_drop(skb); 1823 1824 if (unlikely(tcp_checksum_complete(skb))) { 1825 bh_unlock_sock(sk); 1826 trace_tcp_bad_csum(skb); 1827 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); 1828 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); 1829 return true; 1830 } 1831 1832 /* Attempt coalescing to last skb in backlog, even if we are 1833 * above the limits. 1834 * This is okay because skb capacity is limited to MAX_SKB_FRAGS. 1835 */ 1836 th = (const struct tcphdr *)skb->data; 1837 hdrlen = th->doff * 4; 1838 1839 tail = sk->sk_backlog.tail; 1840 if (!tail) 1841 goto no_coalesce; 1842 thtail = (struct tcphdr *)tail->data; 1843 1844 if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq || 1845 TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield || 1846 ((TCP_SKB_CB(tail)->tcp_flags | 1847 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) || 1848 !((TCP_SKB_CB(tail)->tcp_flags & 1849 TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) || 1850 ((TCP_SKB_CB(tail)->tcp_flags ^ 1851 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) || 1852 #ifdef CONFIG_TLS_DEVICE 1853 tail->decrypted != skb->decrypted || 1854 #endif 1855 thtail->doff != th->doff || 1856 memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th))) 1857 goto no_coalesce; 1858 1859 __skb_pull(skb, hdrlen); 1860 1861 shinfo = skb_shinfo(skb); 1862 gso_size = shinfo->gso_size ?: skb->len; 1863 gso_segs = shinfo->gso_segs ?: 1; 1864 1865 shinfo = skb_shinfo(tail); 1866 tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen); 1867 tail_gso_segs = shinfo->gso_segs ?: 1; 1868 1869 if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) { 1870 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq; 1871 1872 if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) { 1873 TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq; 1874 thtail->window = th->window; 1875 } 1876 1877 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and 1878 * 
thtail->fin, so that the fast path in tcp_rcv_established() 1879 * is not entered if we append a packet with a FIN. 1880 * SYN, RST, URG are not present. 1881 * ACK is set on both packets. 1882 * PSH : we do not really care in TCP stack, 1883 * at least for 'GRO' packets. 1884 */ 1885 thtail->fin |= th->fin; 1886 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags; 1887 1888 if (TCP_SKB_CB(skb)->has_rxtstamp) { 1889 TCP_SKB_CB(tail)->has_rxtstamp = true; 1890 tail->tstamp = skb->tstamp; 1891 skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp; 1892 } 1893 1894 /* Not as strict as GRO. We only need to carry mss max value */ 1895 shinfo->gso_size = max(gso_size, tail_gso_size); 1896 shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF); 1897 1898 sk->sk_backlog.len += delta; 1899 __NET_INC_STATS(sock_net(sk), 1900 LINUX_MIB_TCPBACKLOGCOALESCE); 1901 kfree_skb_partial(skb, fragstolen); 1902 return false; 1903 } 1904 __skb_push(skb, hdrlen); 1905 1906 no_coalesce: 1907 /* Only socket owner can try to collapse/prune rx queues 1908 * to reduce memory overhead, so add a little headroom here. 1909 * Few sockets backlog are possibly concurrently non empty. 
1910 */ 1911 limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf) + 64*1024; 1912 1913 if (unlikely(sk_add_backlog(sk, skb, limit))) { 1914 bh_unlock_sock(sk); 1915 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP); 1916 return true; 1917 } 1918 return false; 1919 } 1920 EXPORT_SYMBOL(tcp_add_backlog); 1921 1922 int tcp_filter(struct sock *sk, struct sk_buff *skb) 1923 { 1924 struct tcphdr *th = (struct tcphdr *)skb->data; 1925 1926 return sk_filter_trim_cap(sk, skb, th->doff * 4); 1927 } 1928 EXPORT_SYMBOL(tcp_filter); 1929 1930 static void tcp_v4_restore_cb(struct sk_buff *skb) 1931 { 1932 memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4, 1933 sizeof(struct inet_skb_parm)); 1934 } 1935 1936 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph, 1937 const struct tcphdr *th) 1938 { 1939 /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB() 1940 * barrier() makes sure compiler wont play fool^Waliasing games. 1941 */ 1942 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb), 1943 sizeof(struct inet_skb_parm)); 1944 barrier(); 1945 1946 TCP_SKB_CB(skb)->seq = ntohl(th->seq); 1947 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + 1948 skb->len - th->doff * 4); 1949 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); 1950 TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th); 1951 TCP_SKB_CB(skb)->tcp_tw_isn = 0; 1952 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph); 1953 TCP_SKB_CB(skb)->sacked = 0; 1954 TCP_SKB_CB(skb)->has_rxtstamp = 1955 skb->tstamp || skb_hwtstamps(skb)->hwtstamp; 1956 } 1957 1958 /* 1959 * From tcp_input.c 1960 */ 1961 1962 int tcp_v4_rcv(struct sk_buff *skb) 1963 { 1964 struct net *net = dev_net(skb->dev); 1965 int sdif = inet_sdif(skb); 1966 int dif = inet_iif(skb); 1967 const struct iphdr *iph; 1968 const struct tcphdr *th; 1969 bool refcounted; 1970 struct sock *sk; 1971 int ret; 1972 1973 if (skb->pkt_type != PACKET_HOST) 1974 goto discard_it; 1975 1976 /* Count it even if it's bad */ 
1977 __TCP_INC_STATS(net, TCP_MIB_INSEGS); 1978 1979 if (!pskb_may_pull(skb, sizeof(struct tcphdr))) 1980 goto discard_it; 1981 1982 th = (const struct tcphdr *)skb->data; 1983 1984 if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) 1985 goto bad_packet; 1986 if (!pskb_may_pull(skb, th->doff * 4)) 1987 goto discard_it; 1988 1989 /* An explanation is required here, I think. 1990 * Packet length and doff are validated by header prediction, 1991 * provided case of th->doff==0 is eliminated. 1992 * So, we defer the checks. */ 1993 1994 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo)) 1995 goto csum_error; 1996 1997 th = (const struct tcphdr *)skb->data; 1998 iph = ip_hdr(skb); 1999 lookup: 2000 sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source, 2001 th->dest, sdif, &refcounted); 2002 if (!sk) 2003 goto no_tcp_socket; 2004 2005 process: 2006 if (sk->sk_state == TCP_TIME_WAIT) 2007 goto do_time_wait; 2008 2009 if (sk->sk_state == TCP_NEW_SYN_RECV) { 2010 struct request_sock *req = inet_reqsk(sk); 2011 bool req_stolen = false; 2012 struct sock *nsk; 2013 2014 sk = req->rsk_listener; 2015 if (unlikely(tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))) { 2016 sk_drops_add(sk, skb); 2017 reqsk_put(req); 2018 goto discard_it; 2019 } 2020 if (tcp_checksum_complete(skb)) { 2021 reqsk_put(req); 2022 goto csum_error; 2023 } 2024 if (unlikely(sk->sk_state != TCP_LISTEN)) { 2025 nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb); 2026 if (!nsk) { 2027 inet_csk_reqsk_queue_drop_and_put(sk, req); 2028 goto lookup; 2029 } 2030 sk = nsk; 2031 /* reuseport_migrate_sock() has already held one sk_refcnt 2032 * before returning. 2033 */ 2034 } else { 2035 /* We own a reference on the listener, increase it again 2036 * as we might lose it too soon. 
2037 */ 2038 sock_hold(sk); 2039 } 2040 refcounted = true; 2041 nsk = NULL; 2042 if (!tcp_filter(sk, skb)) { 2043 th = (const struct tcphdr *)skb->data; 2044 iph = ip_hdr(skb); 2045 tcp_v4_fill_cb(skb, iph, th); 2046 nsk = tcp_check_req(sk, skb, req, false, &req_stolen); 2047 } 2048 if (!nsk) { 2049 reqsk_put(req); 2050 if (req_stolen) { 2051 /* Another cpu got exclusive access to req 2052 * and created a full blown socket. 2053 * Try to feed this packet to this socket 2054 * instead of discarding it. 2055 */ 2056 tcp_v4_restore_cb(skb); 2057 sock_put(sk); 2058 goto lookup; 2059 } 2060 goto discard_and_relse; 2061 } 2062 if (nsk == sk) { 2063 reqsk_put(req); 2064 tcp_v4_restore_cb(skb); 2065 } else if (tcp_child_process(sk, nsk, skb)) { 2066 tcp_v4_send_reset(nsk, skb); 2067 goto discard_and_relse; 2068 } else { 2069 sock_put(sk); 2070 return 0; 2071 } 2072 } 2073 2074 if (static_branch_unlikely(&ip4_min_ttl)) { 2075 /* min_ttl can be changed concurrently from do_ip_setsockopt() */ 2076 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) { 2077 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); 2078 goto discard_and_relse; 2079 } 2080 } 2081 2082 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) 2083 goto discard_and_relse; 2084 2085 if (tcp_v4_inbound_md5_hash(sk, skb, dif, sdif)) 2086 goto discard_and_relse; 2087 2088 nf_reset_ct(skb); 2089 2090 if (tcp_filter(sk, skb)) 2091 goto discard_and_relse; 2092 th = (const struct tcphdr *)skb->data; 2093 iph = ip_hdr(skb); 2094 tcp_v4_fill_cb(skb, iph, th); 2095 2096 skb->dev = NULL; 2097 2098 if (sk->sk_state == TCP_LISTEN) { 2099 ret = tcp_v4_do_rcv(sk, skb); 2100 goto put_and_return; 2101 } 2102 2103 sk_incoming_cpu_update(sk); 2104 2105 sk_defer_free_flush(sk); 2106 bh_lock_sock_nested(sk); 2107 tcp_segs_in(tcp_sk(sk), skb); 2108 ret = 0; 2109 if (!sock_owned_by_user(sk)) { 2110 ret = tcp_v4_do_rcv(sk, skb); 2111 } else { 2112 if (tcp_add_backlog(sk, skb)) 2113 goto discard_and_relse; 2114 } 2115 
/* TIME_WAIT socket operations for TCP: the size of the per-socket timewait
 * object, plus the callbacks wired to tcp_twsk_unique() and
 * tcp_twsk_destructor().
 */
static struct timewait_sock_ops tcp_timewait_sock_ops = {
	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
	.twsk_unique	= tcp_twsk_unique,
	.twsk_destructor= tcp_twsk_destructor,
};
inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) 2200 { 2201 struct dst_entry *dst = skb_dst(skb); 2202 2203 if (dst && dst_hold_safe(dst)) { 2204 sk->sk_rx_dst = dst; 2205 sk->sk_rx_dst_ifindex = skb->skb_iif; 2206 } 2207 } 2208 EXPORT_SYMBOL(inet_sk_rx_dst_set); 2209 2210 const struct inet_connection_sock_af_ops ipv4_specific = { 2211 .queue_xmit = ip_queue_xmit, 2212 .send_check = tcp_v4_send_check, 2213 .rebuild_header = inet_sk_rebuild_header, 2214 .sk_rx_dst_set = inet_sk_rx_dst_set, 2215 .conn_request = tcp_v4_conn_request, 2216 .syn_recv_sock = tcp_v4_syn_recv_sock, 2217 .net_header_len = sizeof(struct iphdr), 2218 .setsockopt = ip_setsockopt, 2219 .getsockopt = ip_getsockopt, 2220 .addr2sockaddr = inet_csk_addr2sockaddr, 2221 .sockaddr_len = sizeof(struct sockaddr_in), 2222 .mtu_reduced = tcp_v4_mtu_reduced, 2223 }; 2224 EXPORT_SYMBOL(ipv4_specific); 2225 2226 #ifdef CONFIG_TCP_MD5SIG 2227 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = { 2228 .md5_lookup = tcp_v4_md5_lookup, 2229 .calc_md5_hash = tcp_v4_md5_hash_skb, 2230 .md5_parse = tcp_v4_parse_md5_keys, 2231 }; 2232 #endif 2233 2234 /* NOTE: A lot of things set to zero explicitly by call to 2235 * sk_alloc() so need not be done here. 2236 */ 2237 static int tcp_v4_init_sock(struct sock *sk) 2238 { 2239 struct inet_connection_sock *icsk = inet_csk(sk); 2240 2241 tcp_init_sock(sk); 2242 2243 icsk->icsk_af_ops = &ipv4_specific; 2244 2245 #ifdef CONFIG_TCP_MD5SIG 2246 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific; 2247 #endif 2248 2249 return 0; 2250 } 2251 2252 void tcp_v4_destroy_sock(struct sock *sk) 2253 { 2254 struct tcp_sock *tp = tcp_sk(sk); 2255 2256 trace_tcp_destroy_sock(sk); 2257 2258 tcp_clear_xmit_timers(sk); 2259 2260 tcp_cleanup_congestion_control(sk); 2261 2262 tcp_cleanup_ulp(sk); 2263 2264 /* Cleanup up the write buffer. 
*/ 2265 tcp_write_queue_purge(sk); 2266 2267 /* Check if we want to disable active TFO */ 2268 tcp_fastopen_active_disable_ofo_check(sk); 2269 2270 /* Cleans up our, hopefully empty, out_of_order_queue. */ 2271 skb_rbtree_purge(&tp->out_of_order_queue); 2272 2273 #ifdef CONFIG_TCP_MD5SIG 2274 /* Clean up the MD5 key list, if any */ 2275 if (tp->md5sig_info) { 2276 tcp_clear_md5_list(sk); 2277 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu); 2278 tp->md5sig_info = NULL; 2279 } 2280 #endif 2281 2282 /* Clean up a referenced TCP bind bucket. */ 2283 if (inet_csk(sk)->icsk_bind_hash) 2284 inet_put_port(sk); 2285 2286 BUG_ON(rcu_access_pointer(tp->fastopen_rsk)); 2287 2288 /* If socket is aborted during connect operation */ 2289 tcp_free_fastopen_req(tp); 2290 tcp_fastopen_destroy_cipher(sk); 2291 tcp_saved_syn_free(tp); 2292 2293 sk_sockets_allocated_dec(sk); 2294 } 2295 EXPORT_SYMBOL(tcp_v4_destroy_sock); 2296 2297 #ifdef CONFIG_PROC_FS 2298 /* Proc filesystem TCP sock list dumping. */ 2299 2300 static unsigned short seq_file_family(const struct seq_file *seq); 2301 2302 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk) 2303 { 2304 unsigned short family = seq_file_family(seq); 2305 2306 /* AF_UNSPEC is used as a match all */ 2307 return ((family == AF_UNSPEC || family == sk->sk_family) && 2308 net_eq(sock_net(sk), seq_file_net(seq))); 2309 } 2310 2311 /* Find a non empty bucket (starting from st->bucket) 2312 * and return the first sk from it. 
2313 */ 2314 static void *listening_get_first(struct seq_file *seq) 2315 { 2316 struct tcp_iter_state *st = seq->private; 2317 2318 st->offset = 0; 2319 for (; st->bucket <= tcp_hashinfo.lhash2_mask; st->bucket++) { 2320 struct inet_listen_hashbucket *ilb2; 2321 struct inet_connection_sock *icsk; 2322 struct sock *sk; 2323 2324 ilb2 = &tcp_hashinfo.lhash2[st->bucket]; 2325 if (hlist_empty(&ilb2->head)) 2326 continue; 2327 2328 spin_lock(&ilb2->lock); 2329 inet_lhash2_for_each_icsk(icsk, &ilb2->head) { 2330 sk = (struct sock *)icsk; 2331 if (seq_sk_match(seq, sk)) 2332 return sk; 2333 } 2334 spin_unlock(&ilb2->lock); 2335 } 2336 2337 return NULL; 2338 } 2339 2340 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket). 2341 * If "cur" is the last one in the st->bucket, 2342 * call listening_get_first() to return the first sk of the next 2343 * non empty bucket. 2344 */ 2345 static void *listening_get_next(struct seq_file *seq, void *cur) 2346 { 2347 struct tcp_iter_state *st = seq->private; 2348 struct inet_listen_hashbucket *ilb2; 2349 struct inet_connection_sock *icsk; 2350 struct sock *sk = cur; 2351 2352 ++st->num; 2353 ++st->offset; 2354 2355 icsk = inet_csk(sk); 2356 inet_lhash2_for_each_icsk_continue(icsk) { 2357 sk = (struct sock *)icsk; 2358 if (seq_sk_match(seq, sk)) 2359 return sk; 2360 } 2361 2362 ilb2 = &tcp_hashinfo.lhash2[st->bucket]; 2363 spin_unlock(&ilb2->lock); 2364 ++st->bucket; 2365 return listening_get_first(seq); 2366 } 2367 2368 static void *listening_get_idx(struct seq_file *seq, loff_t *pos) 2369 { 2370 struct tcp_iter_state *st = seq->private; 2371 void *rc; 2372 2373 st->bucket = 0; 2374 st->offset = 0; 2375 rc = listening_get_first(seq); 2376 2377 while (rc && *pos) { 2378 rc = listening_get_next(seq, rc); 2379 --*pos; 2380 } 2381 return rc; 2382 } 2383 2384 static inline bool empty_bucket(const struct tcp_iter_state *st) 2385 { 2386 return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain); 2387 } 2388 2389 /* 2390 * 
Get first established socket starting from bucket given in st->bucket. 2391 * If st->bucket is zero, the very first socket in the hash is returned. 2392 */ 2393 static void *established_get_first(struct seq_file *seq) 2394 { 2395 struct tcp_iter_state *st = seq->private; 2396 2397 st->offset = 0; 2398 for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) { 2399 struct sock *sk; 2400 struct hlist_nulls_node *node; 2401 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket); 2402 2403 /* Lockless fast path for the common case of empty buckets */ 2404 if (empty_bucket(st)) 2405 continue; 2406 2407 spin_lock_bh(lock); 2408 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) { 2409 if (seq_sk_match(seq, sk)) 2410 return sk; 2411 } 2412 spin_unlock_bh(lock); 2413 } 2414 2415 return NULL; 2416 } 2417 2418 static void *established_get_next(struct seq_file *seq, void *cur) 2419 { 2420 struct sock *sk = cur; 2421 struct hlist_nulls_node *node; 2422 struct tcp_iter_state *st = seq->private; 2423 2424 ++st->num; 2425 ++st->offset; 2426 2427 sk = sk_nulls_next(sk); 2428 2429 sk_nulls_for_each_from(sk, node) { 2430 if (seq_sk_match(seq, sk)) 2431 return sk; 2432 } 2433 2434 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); 2435 ++st->bucket; 2436 return established_get_first(seq); 2437 } 2438 2439 static void *established_get_idx(struct seq_file *seq, loff_t pos) 2440 { 2441 struct tcp_iter_state *st = seq->private; 2442 void *rc; 2443 2444 st->bucket = 0; 2445 rc = established_get_first(seq); 2446 2447 while (rc && pos) { 2448 rc = established_get_next(seq, rc); 2449 --pos; 2450 } 2451 return rc; 2452 } 2453 2454 static void *tcp_get_idx(struct seq_file *seq, loff_t pos) 2455 { 2456 void *rc; 2457 struct tcp_iter_state *st = seq->private; 2458 2459 st->state = TCP_SEQ_STATE_LISTENING; 2460 rc = listening_get_idx(seq, &pos); 2461 2462 if (!rc) { 2463 st->state = TCP_SEQ_STATE_ESTABLISHED; 2464 rc = established_get_idx(seq, pos); 2465 } 
2466 2467 return rc; 2468 } 2469 2470 static void *tcp_seek_last_pos(struct seq_file *seq) 2471 { 2472 struct tcp_iter_state *st = seq->private; 2473 int bucket = st->bucket; 2474 int offset = st->offset; 2475 int orig_num = st->num; 2476 void *rc = NULL; 2477 2478 switch (st->state) { 2479 case TCP_SEQ_STATE_LISTENING: 2480 if (st->bucket > tcp_hashinfo.lhash2_mask) 2481 break; 2482 st->state = TCP_SEQ_STATE_LISTENING; 2483 rc = listening_get_first(seq); 2484 while (offset-- && rc && bucket == st->bucket) 2485 rc = listening_get_next(seq, rc); 2486 if (rc) 2487 break; 2488 st->bucket = 0; 2489 st->state = TCP_SEQ_STATE_ESTABLISHED; 2490 fallthrough; 2491 case TCP_SEQ_STATE_ESTABLISHED: 2492 if (st->bucket > tcp_hashinfo.ehash_mask) 2493 break; 2494 rc = established_get_first(seq); 2495 while (offset-- && rc && bucket == st->bucket) 2496 rc = established_get_next(seq, rc); 2497 } 2498 2499 st->num = orig_num; 2500 2501 return rc; 2502 } 2503 2504 void *tcp_seq_start(struct seq_file *seq, loff_t *pos) 2505 { 2506 struct tcp_iter_state *st = seq->private; 2507 void *rc; 2508 2509 if (*pos && *pos == st->last_pos) { 2510 rc = tcp_seek_last_pos(seq); 2511 if (rc) 2512 goto out; 2513 } 2514 2515 st->state = TCP_SEQ_STATE_LISTENING; 2516 st->num = 0; 2517 st->bucket = 0; 2518 st->offset = 0; 2519 rc = *pos ? 
tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; 2520 2521 out: 2522 st->last_pos = *pos; 2523 return rc; 2524 } 2525 EXPORT_SYMBOL(tcp_seq_start); 2526 2527 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) 2528 { 2529 struct tcp_iter_state *st = seq->private; 2530 void *rc = NULL; 2531 2532 if (v == SEQ_START_TOKEN) { 2533 rc = tcp_get_idx(seq, 0); 2534 goto out; 2535 } 2536 2537 switch (st->state) { 2538 case TCP_SEQ_STATE_LISTENING: 2539 rc = listening_get_next(seq, v); 2540 if (!rc) { 2541 st->state = TCP_SEQ_STATE_ESTABLISHED; 2542 st->bucket = 0; 2543 st->offset = 0; 2544 rc = established_get_first(seq); 2545 } 2546 break; 2547 case TCP_SEQ_STATE_ESTABLISHED: 2548 rc = established_get_next(seq, v); 2549 break; 2550 } 2551 out: 2552 ++*pos; 2553 st->last_pos = *pos; 2554 return rc; 2555 } 2556 EXPORT_SYMBOL(tcp_seq_next); 2557 2558 void tcp_seq_stop(struct seq_file *seq, void *v) 2559 { 2560 struct tcp_iter_state *st = seq->private; 2561 2562 switch (st->state) { 2563 case TCP_SEQ_STATE_LISTENING: 2564 if (v != SEQ_START_TOKEN) 2565 spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock); 2566 break; 2567 case TCP_SEQ_STATE_ESTABLISHED: 2568 if (v) 2569 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); 2570 break; 2571 } 2572 } 2573 EXPORT_SYMBOL(tcp_seq_stop); 2574 2575 static void get_openreq4(const struct request_sock *req, 2576 struct seq_file *f, int i) 2577 { 2578 const struct inet_request_sock *ireq = inet_rsk(req); 2579 long delta = req->rsk_timer.expires - jiffies; 2580 2581 seq_printf(f, "%4d: %08X:%04X %08X:%04X" 2582 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK", 2583 i, 2584 ireq->ir_loc_addr, 2585 ireq->ir_num, 2586 ireq->ir_rmt_addr, 2587 ntohs(ireq->ir_rmt_port), 2588 TCP_SYN_RECV, 2589 0, 0, /* could print option size, but that is af dependent. 
*/ 2590 1, /* timers active (only the expire timer) */ 2591 jiffies_delta_to_clock_t(delta), 2592 req->num_timeout, 2593 from_kuid_munged(seq_user_ns(f), 2594 sock_i_uid(req->rsk_listener)), 2595 0, /* non standard timer */ 2596 0, /* open_requests have no inode */ 2597 0, 2598 req); 2599 } 2600 2601 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) 2602 { 2603 int timer_active; 2604 unsigned long timer_expires; 2605 const struct tcp_sock *tp = tcp_sk(sk); 2606 const struct inet_connection_sock *icsk = inet_csk(sk); 2607 const struct inet_sock *inet = inet_sk(sk); 2608 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq; 2609 __be32 dest = inet->inet_daddr; 2610 __be32 src = inet->inet_rcv_saddr; 2611 __u16 destp = ntohs(inet->inet_dport); 2612 __u16 srcp = ntohs(inet->inet_sport); 2613 int rx_queue; 2614 int state; 2615 2616 if (icsk->icsk_pending == ICSK_TIME_RETRANS || 2617 icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT || 2618 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { 2619 timer_active = 1; 2620 timer_expires = icsk->icsk_timeout; 2621 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) { 2622 timer_active = 4; 2623 timer_expires = icsk->icsk_timeout; 2624 } else if (timer_pending(&sk->sk_timer)) { 2625 timer_active = 2; 2626 timer_expires = sk->sk_timer.expires; 2627 } else { 2628 timer_active = 0; 2629 timer_expires = jiffies; 2630 } 2631 2632 state = inet_sk_state_load(sk); 2633 if (state == TCP_LISTEN) 2634 rx_queue = READ_ONCE(sk->sk_ack_backlog); 2635 else 2636 /* Because we don't lock the socket, 2637 * we might find a transient negative value. 
2638 */ 2639 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) - 2640 READ_ONCE(tp->copied_seq), 0); 2641 2642 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX " 2643 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d", 2644 i, src, srcp, dest, destp, state, 2645 READ_ONCE(tp->write_seq) - tp->snd_una, 2646 rx_queue, 2647 timer_active, 2648 jiffies_delta_to_clock_t(timer_expires - jiffies), 2649 icsk->icsk_retransmits, 2650 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)), 2651 icsk->icsk_probes_out, 2652 sock_i_ino(sk), 2653 refcount_read(&sk->sk_refcnt), sk, 2654 jiffies_to_clock_t(icsk->icsk_rto), 2655 jiffies_to_clock_t(icsk->icsk_ack.ato), 2656 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk), 2657 tp->snd_cwnd, 2658 state == TCP_LISTEN ? 2659 fastopenq->max_qlen : 2660 (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh)); 2661 } 2662 2663 static void get_timewait4_sock(const struct inet_timewait_sock *tw, 2664 struct seq_file *f, int i) 2665 { 2666 long delta = tw->tw_timer.expires - jiffies; 2667 __be32 dest, src; 2668 __u16 destp, srcp; 2669 2670 dest = tw->tw_daddr; 2671 src = tw->tw_rcv_saddr; 2672 destp = ntohs(tw->tw_dport); 2673 srcp = ntohs(tw->tw_sport); 2674 2675 seq_printf(f, "%4d: %08X:%04X %08X:%04X" 2676 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK", 2677 i, src, srcp, dest, destp, tw->tw_substate, 0, 0, 2678 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0, 2679 refcount_read(&tw->tw_refcnt), tw); 2680 } 2681 2682 #define TMPSZ 150 2683 2684 static int tcp4_seq_show(struct seq_file *seq, void *v) 2685 { 2686 struct tcp_iter_state *st; 2687 struct sock *sk = v; 2688 2689 seq_setwidth(seq, TMPSZ - 1); 2690 if (v == SEQ_START_TOKEN) { 2691 seq_puts(seq, " sl local_address rem_address st tx_queue " 2692 "rx_queue tr tm->when retrnsmt uid timeout " 2693 "inode"); 2694 goto out; 2695 } 2696 st = seq->private; 2697 2698 if (sk->sk_state == TCP_TIME_WAIT) 2699 get_timewait4_sock(v, seq, st->num); 2700 else if (sk->sk_state == 
TCP_NEW_SYN_RECV) 2701 get_openreq4(v, seq, st->num); 2702 else 2703 get_tcp4_sock(v, seq, st->num); 2704 out: 2705 seq_pad(seq, '\n'); 2706 return 0; 2707 } 2708 2709 #ifdef CONFIG_BPF_SYSCALL 2710 struct bpf_tcp_iter_state { 2711 struct tcp_iter_state state; 2712 unsigned int cur_sk; 2713 unsigned int end_sk; 2714 unsigned int max_sk; 2715 struct sock **batch; 2716 bool st_bucket_done; 2717 }; 2718 2719 struct bpf_iter__tcp { 2720 __bpf_md_ptr(struct bpf_iter_meta *, meta); 2721 __bpf_md_ptr(struct sock_common *, sk_common); 2722 uid_t uid __aligned(8); 2723 }; 2724 2725 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, 2726 struct sock_common *sk_common, uid_t uid) 2727 { 2728 struct bpf_iter__tcp ctx; 2729 2730 meta->seq_num--; /* skip SEQ_START_TOKEN */ 2731 ctx.meta = meta; 2732 ctx.sk_common = sk_common; 2733 ctx.uid = uid; 2734 return bpf_iter_run_prog(prog, &ctx); 2735 } 2736 2737 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter) 2738 { 2739 while (iter->cur_sk < iter->end_sk) 2740 sock_put(iter->batch[iter->cur_sk++]); 2741 } 2742 2743 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter, 2744 unsigned int new_batch_sz) 2745 { 2746 struct sock **new_batch; 2747 2748 new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz, 2749 GFP_USER | __GFP_NOWARN); 2750 if (!new_batch) 2751 return -ENOMEM; 2752 2753 bpf_iter_tcp_put_batch(iter); 2754 kvfree(iter->batch); 2755 iter->batch = new_batch; 2756 iter->max_sk = new_batch_sz; 2757 2758 return 0; 2759 } 2760 2761 static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq, 2762 struct sock *start_sk) 2763 { 2764 struct bpf_tcp_iter_state *iter = seq->private; 2765 struct tcp_iter_state *st = &iter->state; 2766 struct inet_connection_sock *icsk; 2767 unsigned int expected = 1; 2768 struct sock *sk; 2769 2770 sock_hold(start_sk); 2771 iter->batch[iter->end_sk++] = start_sk; 2772 2773 icsk = inet_csk(start_sk); 2774 
inet_lhash2_for_each_icsk_continue(icsk) { 2775 sk = (struct sock *)icsk; 2776 if (seq_sk_match(seq, sk)) { 2777 if (iter->end_sk < iter->max_sk) { 2778 sock_hold(sk); 2779 iter->batch[iter->end_sk++] = sk; 2780 } 2781 expected++; 2782 } 2783 } 2784 spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock); 2785 2786 return expected; 2787 } 2788 2789 static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq, 2790 struct sock *start_sk) 2791 { 2792 struct bpf_tcp_iter_state *iter = seq->private; 2793 struct tcp_iter_state *st = &iter->state; 2794 struct hlist_nulls_node *node; 2795 unsigned int expected = 1; 2796 struct sock *sk; 2797 2798 sock_hold(start_sk); 2799 iter->batch[iter->end_sk++] = start_sk; 2800 2801 sk = sk_nulls_next(start_sk); 2802 sk_nulls_for_each_from(sk, node) { 2803 if (seq_sk_match(seq, sk)) { 2804 if (iter->end_sk < iter->max_sk) { 2805 sock_hold(sk); 2806 iter->batch[iter->end_sk++] = sk; 2807 } 2808 expected++; 2809 } 2810 } 2811 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); 2812 2813 return expected; 2814 } 2815 2816 static struct sock *bpf_iter_tcp_batch(struct seq_file *seq) 2817 { 2818 struct bpf_tcp_iter_state *iter = seq->private; 2819 struct tcp_iter_state *st = &iter->state; 2820 unsigned int expected; 2821 bool resized = false; 2822 struct sock *sk; 2823 2824 /* The st->bucket is done. Directly advance to the next 2825 * bucket instead of having the tcp_seek_last_pos() to skip 2826 * one by one in the current bucket and eventually find out 2827 * it has to advance to the next bucket. 
2828 */ 2829 if (iter->st_bucket_done) { 2830 st->offset = 0; 2831 st->bucket++; 2832 if (st->state == TCP_SEQ_STATE_LISTENING && 2833 st->bucket > tcp_hashinfo.lhash2_mask) { 2834 st->state = TCP_SEQ_STATE_ESTABLISHED; 2835 st->bucket = 0; 2836 } 2837 } 2838 2839 again: 2840 /* Get a new batch */ 2841 iter->cur_sk = 0; 2842 iter->end_sk = 0; 2843 iter->st_bucket_done = false; 2844 2845 sk = tcp_seek_last_pos(seq); 2846 if (!sk) 2847 return NULL; /* Done */ 2848 2849 if (st->state == TCP_SEQ_STATE_LISTENING) 2850 expected = bpf_iter_tcp_listening_batch(seq, sk); 2851 else 2852 expected = bpf_iter_tcp_established_batch(seq, sk); 2853 2854 if (iter->end_sk == expected) { 2855 iter->st_bucket_done = true; 2856 return sk; 2857 } 2858 2859 if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) { 2860 resized = true; 2861 goto again; 2862 } 2863 2864 return sk; 2865 } 2866 2867 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos) 2868 { 2869 /* bpf iter does not support lseek, so it always 2870 * continue from where it was stop()-ped. 2871 */ 2872 if (*pos) 2873 return bpf_iter_tcp_batch(seq); 2874 2875 return SEQ_START_TOKEN; 2876 } 2877 2878 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) 2879 { 2880 struct bpf_tcp_iter_state *iter = seq->private; 2881 struct tcp_iter_state *st = &iter->state; 2882 struct sock *sk; 2883 2884 /* Whenever seq_next() is called, the iter->cur_sk is 2885 * done with seq_show(), so advance to the next sk in 2886 * the batch. 2887 */ 2888 if (iter->cur_sk < iter->end_sk) { 2889 /* Keeping st->num consistent in tcp_iter_state. 2890 * bpf_iter_tcp does not use st->num. 2891 * meta.seq_num is used instead. 2892 */ 2893 st->num++; 2894 /* Move st->offset to the next sk in the bucket such that 2895 * the future start() will resume at st->offset in 2896 * st->bucket. See tcp_seek_last_pos(). 
2897 */ 2898 st->offset++; 2899 sock_put(iter->batch[iter->cur_sk++]); 2900 } 2901 2902 if (iter->cur_sk < iter->end_sk) 2903 sk = iter->batch[iter->cur_sk]; 2904 else 2905 sk = bpf_iter_tcp_batch(seq); 2906 2907 ++*pos; 2908 /* Keeping st->last_pos consistent in tcp_iter_state. 2909 * bpf iter does not do lseek, so st->last_pos always equals to *pos. 2910 */ 2911 st->last_pos = *pos; 2912 return sk; 2913 } 2914 2915 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v) 2916 { 2917 struct bpf_iter_meta meta; 2918 struct bpf_prog *prog; 2919 struct sock *sk = v; 2920 bool slow; 2921 uid_t uid; 2922 int ret; 2923 2924 if (v == SEQ_START_TOKEN) 2925 return 0; 2926 2927 if (sk_fullsock(sk)) 2928 slow = lock_sock_fast(sk); 2929 2930 if (unlikely(sk_unhashed(sk))) { 2931 ret = SEQ_SKIP; 2932 goto unlock; 2933 } 2934 2935 if (sk->sk_state == TCP_TIME_WAIT) { 2936 uid = 0; 2937 } else if (sk->sk_state == TCP_NEW_SYN_RECV) { 2938 const struct request_sock *req = v; 2939 2940 uid = from_kuid_munged(seq_user_ns(seq), 2941 sock_i_uid(req->rsk_listener)); 2942 } else { 2943 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)); 2944 } 2945 2946 meta.seq = seq; 2947 prog = bpf_iter_get_info(&meta, false); 2948 ret = tcp_prog_seq_show(prog, &meta, v, uid); 2949 2950 unlock: 2951 if (sk_fullsock(sk)) 2952 unlock_sock_fast(sk, slow); 2953 return ret; 2954 2955 } 2956 2957 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v) 2958 { 2959 struct bpf_tcp_iter_state *iter = seq->private; 2960 struct bpf_iter_meta meta; 2961 struct bpf_prog *prog; 2962 2963 if (!v) { 2964 meta.seq = seq; 2965 prog = bpf_iter_get_info(&meta, true); 2966 if (prog) 2967 (void)tcp_prog_seq_show(prog, &meta, v, 0); 2968 } 2969 2970 if (iter->cur_sk < iter->end_sk) { 2971 bpf_iter_tcp_put_batch(iter); 2972 iter->st_bucket_done = false; 2973 } 2974 } 2975 2976 static const struct seq_operations bpf_iter_tcp_seq_ops = { 2977 .show = bpf_iter_tcp_seq_show, 2978 .start = 
bpf_iter_tcp_seq_start, 2979 .next = bpf_iter_tcp_seq_next, 2980 .stop = bpf_iter_tcp_seq_stop, 2981 }; 2982 #endif 2983 static unsigned short seq_file_family(const struct seq_file *seq) 2984 { 2985 const struct tcp_seq_afinfo *afinfo; 2986 2987 #ifdef CONFIG_BPF_SYSCALL 2988 /* Iterated from bpf_iter. Let the bpf prog to filter instead. */ 2989 if (seq->op == &bpf_iter_tcp_seq_ops) 2990 return AF_UNSPEC; 2991 #endif 2992 2993 /* Iterated from proc fs */ 2994 afinfo = PDE_DATA(file_inode(seq->file)); 2995 return afinfo->family; 2996 } 2997 2998 static const struct seq_operations tcp4_seq_ops = { 2999 .show = tcp4_seq_show, 3000 .start = tcp_seq_start, 3001 .next = tcp_seq_next, 3002 .stop = tcp_seq_stop, 3003 }; 3004 3005 static struct tcp_seq_afinfo tcp4_seq_afinfo = { 3006 .family = AF_INET, 3007 }; 3008 3009 static int __net_init tcp4_proc_init_net(struct net *net) 3010 { 3011 if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops, 3012 sizeof(struct tcp_iter_state), &tcp4_seq_afinfo)) 3013 return -ENOMEM; 3014 return 0; 3015 } 3016 3017 static void __net_exit tcp4_proc_exit_net(struct net *net) 3018 { 3019 remove_proc_entry("tcp", net->proc_net); 3020 } 3021 3022 static struct pernet_operations tcp4_net_ops = { 3023 .init = tcp4_proc_init_net, 3024 .exit = tcp4_proc_exit_net, 3025 }; 3026 3027 int __init tcp4_proc_init(void) 3028 { 3029 return register_pernet_subsys(&tcp4_net_ops); 3030 } 3031 3032 void tcp4_proc_exit(void) 3033 { 3034 unregister_pernet_subsys(&tcp4_net_ops); 3035 } 3036 #endif /* CONFIG_PROC_FS */ 3037 3038 /* @wake is one when sk_stream_write_space() calls us. 3039 * This sends EPOLLOUT only if notsent_bytes is half the limit. 3040 * This mimics the strategy used in sock_def_write_space(). 
3041 */ 3042 bool tcp_stream_memory_free(const struct sock *sk, int wake) 3043 { 3044 const struct tcp_sock *tp = tcp_sk(sk); 3045 u32 notsent_bytes = READ_ONCE(tp->write_seq) - 3046 READ_ONCE(tp->snd_nxt); 3047 3048 return (notsent_bytes << wake) < tcp_notsent_lowat(tp); 3049 } 3050 EXPORT_SYMBOL(tcp_stream_memory_free); 3051 3052 struct proto tcp_prot = { 3053 .name = "TCP", 3054 .owner = THIS_MODULE, 3055 .close = tcp_close, 3056 .pre_connect = tcp_v4_pre_connect, 3057 .connect = tcp_v4_connect, 3058 .disconnect = tcp_disconnect, 3059 .accept = inet_csk_accept, 3060 .ioctl = tcp_ioctl, 3061 .init = tcp_v4_init_sock, 3062 .destroy = tcp_v4_destroy_sock, 3063 .shutdown = tcp_shutdown, 3064 .setsockopt = tcp_setsockopt, 3065 .getsockopt = tcp_getsockopt, 3066 .bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt, 3067 .keepalive = tcp_set_keepalive, 3068 .recvmsg = tcp_recvmsg, 3069 .sendmsg = tcp_sendmsg, 3070 .sendpage = tcp_sendpage, 3071 .backlog_rcv = tcp_v4_do_rcv, 3072 .release_cb = tcp_release_cb, 3073 .hash = inet_hash, 3074 .unhash = inet_unhash, 3075 .get_port = inet_csk_get_port, 3076 #ifdef CONFIG_BPF_SYSCALL 3077 .psock_update_sk_prot = tcp_bpf_update_proto, 3078 #endif 3079 .enter_memory_pressure = tcp_enter_memory_pressure, 3080 .leave_memory_pressure = tcp_leave_memory_pressure, 3081 .stream_memory_free = tcp_stream_memory_free, 3082 .sockets_allocated = &tcp_sockets_allocated, 3083 .orphan_count = &tcp_orphan_count, 3084 .memory_allocated = &tcp_memory_allocated, 3085 .memory_pressure = &tcp_memory_pressure, 3086 .sysctl_mem = sysctl_tcp_mem, 3087 .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem), 3088 .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), 3089 .max_header = MAX_TCP_HEADER, 3090 .obj_size = sizeof(struct tcp_sock), 3091 .slab_flags = SLAB_TYPESAFE_BY_RCU, 3092 .twsk_prot = &tcp_timewait_sock_ops, 3093 .rsk_prot = &tcp_request_sock_ops, 3094 .h.hashinfo = &tcp_hashinfo, 3095 .no_autobind = true, 3096 
.diag_destroy = tcp_abort, 3097 }; 3098 EXPORT_SYMBOL(tcp_prot); 3099 3100 static void __net_exit tcp_sk_exit(struct net *net) 3101 { 3102 int cpu; 3103 3104 if (net->ipv4.tcp_congestion_control) 3105 bpf_module_put(net->ipv4.tcp_congestion_control, 3106 net->ipv4.tcp_congestion_control->owner); 3107 3108 for_each_possible_cpu(cpu) 3109 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu)); 3110 free_percpu(net->ipv4.tcp_sk); 3111 } 3112 3113 static int __net_init tcp_sk_init(struct net *net) 3114 { 3115 int res, cpu, cnt; 3116 3117 net->ipv4.tcp_sk = alloc_percpu(struct sock *); 3118 if (!net->ipv4.tcp_sk) 3119 return -ENOMEM; 3120 3121 for_each_possible_cpu(cpu) { 3122 struct sock *sk; 3123 3124 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW, 3125 IPPROTO_TCP, net); 3126 if (res) 3127 goto fail; 3128 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); 3129 3130 /* Please enforce IP_DF and IPID==0 for RST and 3131 * ACK sent in SYN-RECV and TIME-WAIT state. 3132 */ 3133 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO; 3134 3135 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk; 3136 } 3137 3138 net->ipv4.sysctl_tcp_ecn = 2; 3139 net->ipv4.sysctl_tcp_ecn_fallback = 1; 3140 3141 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS; 3142 net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS; 3143 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD; 3144 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL; 3145 net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS; 3146 3147 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME; 3148 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES; 3149 net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL; 3150 3151 net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES; 3152 net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES; 3153 net->ipv4.sysctl_tcp_syncookies = 1; 3154 net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH; 3155 net->ipv4.sysctl_tcp_retries1 = TCP_RETR1; 3156 net->ipv4.sysctl_tcp_retries2 = TCP_RETR2; 3157 
net->ipv4.sysctl_tcp_orphan_retries = 0; 3158 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT; 3159 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX; 3160 net->ipv4.sysctl_tcp_tw_reuse = 2; 3161 net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1; 3162 3163 cnt = tcp_hashinfo.ehash_mask + 1; 3164 net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2; 3165 net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo; 3166 3167 net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128); 3168 net->ipv4.sysctl_tcp_sack = 1; 3169 net->ipv4.sysctl_tcp_window_scaling = 1; 3170 net->ipv4.sysctl_tcp_timestamps = 1; 3171 net->ipv4.sysctl_tcp_early_retrans = 3; 3172 net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION; 3173 net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */ 3174 net->ipv4.sysctl_tcp_retrans_collapse = 1; 3175 net->ipv4.sysctl_tcp_max_reordering = 300; 3176 net->ipv4.sysctl_tcp_dsack = 1; 3177 net->ipv4.sysctl_tcp_app_win = 31; 3178 net->ipv4.sysctl_tcp_adv_win_scale = 1; 3179 net->ipv4.sysctl_tcp_frto = 2; 3180 net->ipv4.sysctl_tcp_moderate_rcvbuf = 1; 3181 /* This limits the percentage of the congestion window which we 3182 * will allow a single TSO frame to consume. Building TSO frames 3183 * which are too large can cause TCP streams to be bursty. 
3184 */ 3185 net->ipv4.sysctl_tcp_tso_win_divisor = 3; 3186 /* Default TSQ limit of 16 TSO segments */ 3187 net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536; 3188 /* rfc5961 challenge ack rate limiting */ 3189 net->ipv4.sysctl_tcp_challenge_ack_limit = 1000; 3190 net->ipv4.sysctl_tcp_min_tso_segs = 2; 3191 net->ipv4.sysctl_tcp_min_rtt_wlen = 300; 3192 net->ipv4.sysctl_tcp_autocorking = 1; 3193 net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2; 3194 net->ipv4.sysctl_tcp_pacing_ss_ratio = 200; 3195 net->ipv4.sysctl_tcp_pacing_ca_ratio = 120; 3196 if (net != &init_net) { 3197 memcpy(net->ipv4.sysctl_tcp_rmem, 3198 init_net.ipv4.sysctl_tcp_rmem, 3199 sizeof(init_net.ipv4.sysctl_tcp_rmem)); 3200 memcpy(net->ipv4.sysctl_tcp_wmem, 3201 init_net.ipv4.sysctl_tcp_wmem, 3202 sizeof(init_net.ipv4.sysctl_tcp_wmem)); 3203 } 3204 net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC; 3205 net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC; 3206 net->ipv4.sysctl_tcp_comp_sack_nr = 44; 3207 net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; 3208 net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0; 3209 atomic_set(&net->ipv4.tfo_active_disable_times, 0); 3210 3211 /* Reno is always built in */ 3212 if (!net_eq(net, &init_net) && 3213 bpf_try_module_get(init_net.ipv4.tcp_congestion_control, 3214 init_net.ipv4.tcp_congestion_control->owner)) 3215 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control; 3216 else 3217 net->ipv4.tcp_congestion_control = &tcp_reno; 3218 3219 return 0; 3220 fail: 3221 tcp_sk_exit(net); 3222 3223 return res; 3224 } 3225 3226 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list) 3227 { 3228 struct net *net; 3229 3230 inet_twsk_purge(&tcp_hashinfo, AF_INET); 3231 3232 list_for_each_entry(net, net_exit_list, exit_list) 3233 tcp_fastopen_ctx_destroy(net); 3234 } 3235 3236 static struct pernet_operations __net_initdata tcp_sk_ops = { 3237 .init = tcp_sk_init, 3238 .exit = tcp_sk_exit, 3239 .exit_batch = 
tcp_sk_exit_batch, 3240 }; 3241 3242 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 3243 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta, 3244 struct sock_common *sk_common, uid_t uid) 3245 3246 #define INIT_BATCH_SZ 16 3247 3248 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux) 3249 { 3250 struct bpf_tcp_iter_state *iter = priv_data; 3251 int err; 3252 3253 err = bpf_iter_init_seq_net(priv_data, aux); 3254 if (err) 3255 return err; 3256 3257 err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ); 3258 if (err) { 3259 bpf_iter_fini_seq_net(priv_data); 3260 return err; 3261 } 3262 3263 return 0; 3264 } 3265 3266 static void bpf_iter_fini_tcp(void *priv_data) 3267 { 3268 struct bpf_tcp_iter_state *iter = priv_data; 3269 3270 bpf_iter_fini_seq_net(priv_data); 3271 kvfree(iter->batch); 3272 } 3273 3274 static const struct bpf_iter_seq_info tcp_seq_info = { 3275 .seq_ops = &bpf_iter_tcp_seq_ops, 3276 .init_seq_private = bpf_iter_init_tcp, 3277 .fini_seq_private = bpf_iter_fini_tcp, 3278 .seq_priv_size = sizeof(struct bpf_tcp_iter_state), 3279 }; 3280 3281 static const struct bpf_func_proto * 3282 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id, 3283 const struct bpf_prog *prog) 3284 { 3285 switch (func_id) { 3286 case BPF_FUNC_setsockopt: 3287 return &bpf_sk_setsockopt_proto; 3288 case BPF_FUNC_getsockopt: 3289 return &bpf_sk_getsockopt_proto; 3290 default: 3291 return NULL; 3292 } 3293 } 3294 3295 static struct bpf_iter_reg tcp_reg_info = { 3296 .target = "tcp", 3297 .ctx_arg_info_size = 1, 3298 .ctx_arg_info = { 3299 { offsetof(struct bpf_iter__tcp, sk_common), 3300 PTR_TO_BTF_ID_OR_NULL }, 3301 }, 3302 .get_func_proto = bpf_iter_tcp_get_func_proto, 3303 .seq_info = &tcp_seq_info, 3304 }; 3305 3306 static void __init bpf_iter_register(void) 3307 { 3308 tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON]; 3309 if (bpf_iter_reg_target(&tcp_reg_info)) 3310 pr_warn("Warning: could not register bpf 
iterator tcp\n"); 3311 } 3312 3313 #endif 3314 3315 void __init tcp_v4_init(void) 3316 { 3317 if (register_pernet_subsys(&tcp_sk_ops)) 3318 panic("Failed to create the TCP control socket.\n"); 3319 3320 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 3321 bpf_iter_register(); 3322 #endif 3323 } 3324