// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 *		IPv4 specific functions
 *
 *		code split from:
 *		  linux/ipv4/tcp.c
 *		  linux/ipv4/tcp_input.c
 *		  linux/ipv4/tcp_output.c
 *
 *		See tcp.c for author information
 */

/*
 * Changes:
 *		David S. Miller	:	New socket lookup architecture.
 *					This code is dedicated to John Dyson.
 *		David S. Miller :	Change semantics of established hash,
 *					half is devoted to TIME_WAIT sockets
 *					and the rest go in the other half.
 *		Andi Kleen :		Add support for syncookies and fixed
 *					some bugs: ip options weren't passed to
 *					the TCP layer, missed a check for an
 *					ACK bit.
 *		Andi Kleen :		Implemented fast path mtu discovery.
 *					Fixed many serious bugs in the
 *					request_sock handling and moved
 *					most of it into the af independent code.
 *					Added tail drop and some other bugfixes.
 *					Added new listen semantics.
 *		Mike McLagan	:	Routing by source
 *	Juan Jose Ciarlante:		ip_dynaddr bits
 *		Andi Kleen:		various fixes.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year
 *					coma.
 *	Andi Kleen		:	Fix new listen.
 *	Andi Kleen		:	Fix accept error reporting.
 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
 *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
 *					a single port at the same time.
 */

#define pr_fmt(fmt) "TCP: " fmt

#include <linux/bottom_half.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
#include <linux/slab.h>

#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/secure_seq.h>
#include <net/busy_poll.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/inetdevice.h>
#include <linux/btf_ids.h>

#include <crypto/hash.h>
#include <linux/scatterlist.h>

#include <trace/events/tcp.h>

#ifdef CONFIG_TCP_MD5SIG
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
#endif

struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);

static u32 tcp_v4_init_seq(const struct sk_buff *skb)
{
	return secure_tcp_seq(ip_hdr(skb)->daddr,
			      ip_hdr(skb)->saddr,
			      tcp_hdr(skb)->dest,
			      tcp_hdr(skb)->source);
}

static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
{
	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
}
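
/* Illustrative sketch (not the exact implementation): both helpers above
 * boil down to a keyed hash over the flow's addresses (and, for the ISN,
 * ports), in the spirit of RFC 6528, roughly:
 *
 *	isn    = keyed_hash(daddr, saddr, dport, sport, secret) + coarse_clock();
 *	ts_off = keyed_hash(daddr, saddr, secret);
 *
 * so initial sequence numbers and timestamp offsets are unpredictable to
 * off-path hosts while remaining stable for a given four-tuple.
 */
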
int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
	const struct inet_timewait_sock *tw = inet_twsk(sktw);
	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
	struct tcp_sock *tp = tcp_sk(sk);
	int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;

	if (reuse == 2) {
		/* Still does not detect *everything* that goes through
		 * lo, since we require a loopback src or dst address
		 * or direct binding to 'lo' interface.
		 */
		bool loopback = false;

		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
			loopback = true;
#if IS_ENABLED(CONFIG_IPV6)
		if (tw->tw_family == AF_INET6) {
			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
				loopback = true;
		} else
#endif
		{
			if (ipv4_is_loopback(tw->tw_daddr) ||
			    ipv4_is_loopback(tw->tw_rcv_saddr))
				loopback = true;
		}
		if (!loopback)
			reuse = 0;
	}

	/* With PAWS, it is safe from the viewpoint
	   of data integrity. Even without PAWS it is safe provided sequence
	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.

	   Actually, the idea is close to VJ's one, only timestamp cache is
	   held not per host, but per port pair and TW bucket is used as state
	   holder.

	   If TW bucket has been already destroyed we fall back to VJ's scheme
	   and use initial timestamp retrieved from peer table.
	 */
	if (tcptw->tw_ts_recent_stamp &&
	    (!twp || (reuse && time_after32(ktime_get_seconds(),
					    tcptw->tw_ts_recent_stamp)))) {
		/* In case of repair and re-using TIME-WAIT sockets we still
		 * want to be sure that it is safe as above but honor the
		 * sequence numbers and time stamps set as part of the repair
		 * process.
		 *
		 * Without this check re-using a TIME-WAIT socket with TCP
		 * repair would accumulate a -1 on the repair assigned
		 * sequence number. The first time it is reused the sequence
		 * is -1, the second time -2, etc. This fixes that issue
		 * without appearing to create any others.
		 */
		if (likely(!tp->repair)) {
			u32 seq = tcptw->tw_snd_nxt + 65535 + 2;

			if (!seq)
				seq = 1;
			WRITE_ONCE(tp->write_seq, seq);
			tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
			tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
		}
		sock_hold(sktw);
		return 1;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(tcp_twsk_unique);
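
/* Note on the sysctl consumed above: net.ipv4.tcp_tw_reuse is documented to
 * take three values -- 0 (never reuse a TIME-WAIT port for a new outgoing
 * connection), 1 (reuse when the timestamp/PAWS-style check above proves it
 * safe) and 2 (reuse only for loopback traffic, which is what the reuse == 2
 * branch enforces). The knob lives in /proc/sys/net/ipv4/tcp_tw_reuse.
 */
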
static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
			      int addr_len)
{
	/* This check is replicated from tcp_v4_connect() and intended to
	 * prevent BPF program called below from accessing bytes that are out
	 * of the bound specified by user in addr_len.
	 */
	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	sock_owned_by_me(sk);

	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
}

/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	__be16 orig_sport, orig_dport;
	__be32 daddr, nexthop;
	struct flowi4 *fl4;
	struct rtable *rt;
	int err;
	struct ip_options_rcu *inet_opt;
	struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	inet_opt = rcu_dereference_protected(inet->inet_opt,
					     lockdep_sock_is_held(sk));
	if (inet_opt && inet_opt->opt.srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet_opt->opt.faddr;
	}

	orig_sport = inet->inet_sport;
	orig_dport = usin->sin_port;
	fl4 = &inet->cork.fl.u.ip4;
	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
			      IPPROTO_TCP,
			      orig_sport, orig_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		if (err == -ENETUNREACH)
			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
		return err;
	}

	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet_opt || !inet_opt->opt.srr)
		daddr = fl4->daddr;

	if (!inet->inet_saddr)
		inet->inet_saddr = fl4->saddr;
	sk_rcv_saddr_set(sk, inet->inet_saddr);

	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent	   = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		if (likely(!tp->repair))
			WRITE_ONCE(tp->write_seq, 0);
	}

	inet->inet_dport = usin->sin_port;
	sk_daddr_set(sk, daddr);

	inet_csk(sk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;

	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and, without releasing the socket
	 * lock, select a source port, enter ourselves into the hash tables
	 * and complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(tcp_death_row, sk);
	if (err)
		goto failure;

	sk_set_txhash(sk);

	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
			       inet->inet_sport, inet->inet_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		rt = NULL;
		goto failure;
	}
	/* OK, now commit destination to socket.  */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->dst);
	rt = NULL;

	if (likely(!tp->repair)) {
		if (!tp->write_seq)
			WRITE_ONCE(tp->write_seq,
				   secure_tcp_seq(inet->inet_saddr,
						  inet->inet_daddr,
						  inet->inet_sport,
						  usin->sin_port));
		tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
						 inet->inet_saddr,
						 inet->inet_daddr);
	}

	inet->inet_id = prandom_u32();

	if (tcp_fastopen_defer_connect(sk, &err))
		return err;
	if (err)
		goto failure;

	err = tcp_connect(sk);

	if (err)
		goto failure;

	return 0;

failure:
	/*
	 * This unhashes the socket and releases the local port,
	 * if necessary.
	 */
	tcp_set_state(sk, TCP_CLOSE);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->inet_dport = 0;
	return err;
}
EXPORT_SYMBOL(tcp_v4_connect);
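
/* Illustrative user-space sketch (assumptions: blocking IPv4 socket); a plain
 * connect() on a TCP socket is what ultimately reaches tcp_v4_connect() above:
 *
 *	int fd = socket(AF_INET, SOCK_STREAM, 0);
 *	struct sockaddr_in dst = {
 *		.sin_family = AF_INET,
 *		.sin_port   = htons(80),
 *	};
 *	dst.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
 *	if (connect(fd, (struct sockaddr *)&dst, sizeof(dst)) < 0)
 *		perror("connect");
 */
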
/*
 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 * It can be called through tcp_release_cb() if socket was owned by user
 * at the time tcp_v4_err() was called to handle ICMP message.
 */
void tcp_v4_mtu_reduced(struct sock *sk)
{
	struct inet_sock *inet = inet_sk(sk);
	struct dst_entry *dst;
	u32 mtu;

	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
		return;
	mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
	dst = inet_csk_update_pmtu(sk, mtu);
	if (!dst)
		return;

	/* Something is about to be wrong... Remember the soft error
	 * in case this connection will not be able to recover.
	 */
	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
		sk->sk_err_soft = EMSGSIZE;

	mtu = dst_mtu(dst);

	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    ip_sk_accept_pmtu(sk) &&
	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
		tcp_sync_mss(sk, mtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}
EXPORT_SYMBOL(tcp_v4_mtu_reduced);

static void do_redirect(struct sk_buff *skb, struct sock *sk)
{
	struct dst_entry *dst = __sk_dst_check(sk, 0);

	if (dst)
		dst->ops->redirect(dst, sk, skb);
}


/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
void tcp_req_err(struct sock *sk, u32 seq, bool abort)
{
	struct request_sock *req = inet_reqsk(sk);
	struct net *net = sock_net(sk);

	/* ICMPs are not backlogged, hence we cannot get
	 * an established socket here.
	 */
	if (seq != tcp_rsk(req)->snt_isn) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
	} else if (abort) {
		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
		tcp_listendrop(req->rsk_listener);
	}
	reqsk_put(req);
}
EXPORT_SYMBOL(tcp_req_err);

/* TCP-LD (RFC 6069) logic */
void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;
	s32 remaining;
	u32 delta_us;

	if (sock_owned_by_user(sk))
		return;

	if (seq != tp->snd_una || !icsk->icsk_retransmits ||
	    !icsk->icsk_backoff)
		return;

	skb = tcp_rtx_queue_head(sk);
	if (WARN_ON_ONCE(!skb))
		return;

	icsk->icsk_backoff--;
	icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
	icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);

	tcp_mstamp_refresh(tp);
	delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
	remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);

	if (remaining > 0) {
		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
					  remaining, TCP_RTO_MAX);
	} else {
		/* RTO revert clocked out retransmission.
		 * Will retransmit now.
		 */
		tcp_retransmit_timer(sk);
	}
}
EXPORT_SYMBOL(tcp_ld_RTO_revert);
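
/* Worked example with illustrative numbers: if srtt yields a base RTO of
 * 200 ms and icsk_backoff had reached 3 (pending timer around 1.6 s), one
 * qualifying ICMP unreachable reverts the backoff to 2, so the timer above
 * is re-armed at min(200 ms << 2, TCP_RTO_MAX) = 800 ms minus whatever time
 * has already elapsed since the head of the retransmit queue was last sent.
 */
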
/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 *
 */

int tcp_v4_err(struct sk_buff *skb, u32 info)
{
	const struct iphdr *iph = (const struct iphdr *)skb->data;
	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
	struct tcp_sock *tp;
	struct inet_sock *inet;
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	struct sock *sk;
	struct request_sock *fastopen;
	u32 seq, snd_una;
	int err;
	struct net *net = dev_net(skb->dev);

	sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
				       th->dest, iph->saddr, ntohs(th->source),
				       inet_iif(skb), 0);
	if (!sk) {
		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
		return -ENOENT;
	}
	if (sk->sk_state == TCP_TIME_WAIT) {
		inet_twsk_put(inet_twsk(sk));
		return 0;
	}
	seq = ntohl(th->seq);
	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
				     type == ICMP_TIME_EXCEEDED ||
				     (type == ICMP_DEST_UNREACH &&
				      (code == ICMP_NET_UNREACH ||
				       code == ICMP_HOST_UNREACH)));
		return 0;
	}

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 * We do take care of PMTU discovery (RFC1191) special case :
	 * we can receive locally generated ICMP messages while socket is held.
	 */
	if (sock_owned_by_user(sk)) {
		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
	}
	if (sk->sk_state == TCP_CLOSE)
		goto out;

	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
		goto out;
	}

	tp = tcp_sk(sk);
	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
	fastopen = rcu_dereference(tp->fastopen_rsk);
	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, snd_una, tp->snd_nxt)) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	switch (type) {
	case ICMP_REDIRECT:
		if (!sock_owned_by_user(sk))
			do_redirect(skb, sk);
		goto out;
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			/* We are not interested in TCP_LISTEN and open_requests
			 * (SYN-ACKs sent out by Linux are always <576 bytes so
			 * they should go through unfragmented).
			 */
			if (sk->sk_state == TCP_LISTEN)
				goto out;

			WRITE_ONCE(tp->mtu_info, info);
			if (!sock_owned_by_user(sk)) {
				tcp_v4_mtu_reduced(sk);
			} else {
				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
					sock_hold(sk);
			}
			goto out;
		}

		err = icmp_err_convert[code].errno;
		/* check if this ICMP message allows revert of backoff.
		 * (see RFC 6069)
		 */
		if (!fastopen &&
		    (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
			tcp_ld_RTO_revert(sk, seq);
		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->sk_state) {
	case TCP_SYN_SENT:
	case TCP_SYN_RECV:
		/* Only in fast or simultaneous open. If a fast open socket is
		 * already accepted it is treated as a connected one below.
		 */
		if (fastopen && !fastopen->sk)
			break;

		ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);

		if (!sock_owned_by_user(sk)) {
			sk->sk_err = err;

			sk_error_report(sk);

			tcp_done(sk);
		} else {
			sk->sk_err_soft = err;
		}
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows to consider as hard errors
	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note, that in modern internet, where routing is unreliable
	 * and in each dark corner broken firewalls sit, sending random
	 * errors ordered by their masters even these two messages finally lose
	 * their original sense (even Linux sends invalid PORT_UNREACHs)
	 *
	 * Now we are in compliance with RFCs.
	 * --ANK (980905)
	 */

	inet = inet_sk(sk);
	if (!sock_owned_by_user(sk) && inet->recverr) {
		sk->sk_err = err;
		sk_error_report(sk);
	} else {	/* Only an error on timeout */
		sk->sk_err_soft = err;
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
	return 0;
}
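
/* Illustrative user-space note: whether the errors classified above surface
 * immediately on an established connection depends on IP_RECVERR; a socket
 * that wants asynchronous ICMP errors reported right away (instead of only a
 * soft error that shows up when the connection eventually times out) can opt
 * in with:
 *
 *	int on = 1;
 *	setsockopt(fd, IPPROTO_IP, IP_RECVERR, &on, sizeof(on));
 */
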
void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
{
	struct tcphdr *th = tcp_hdr(skb);

	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
	skb->csum_start = skb_transport_header(skb) - skb->head;
	skb->csum_offset = offsetof(struct tcphdr, check);
}

/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
{
	const struct inet_sock *inet = inet_sk(sk);

	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
}
EXPORT_SYMBOL(tcp_v4_send_check);

/*
 *	This routine will send an RST to the other tcp.
 *
 *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 *		      for reset.
 *	Answer: if a packet caused RST, it is not for a socket
 *		existing in our system, if it is matched to a socket,
 *		it is just duplicate segment or bug in other side's TCP.
 *		So that we build reply only basing on parameters
 *		arrived with segment.
 *	Exception: precedence violation. We do not implement it in any case.
656 */ 657 658 #ifdef CONFIG_TCP_MD5SIG 659 #define OPTION_BYTES TCPOLEN_MD5SIG_ALIGNED 660 #else 661 #define OPTION_BYTES sizeof(__be32) 662 #endif 663 664 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) 665 { 666 const struct tcphdr *th = tcp_hdr(skb); 667 struct { 668 struct tcphdr th; 669 __be32 opt[OPTION_BYTES / sizeof(__be32)]; 670 } rep; 671 struct ip_reply_arg arg; 672 #ifdef CONFIG_TCP_MD5SIG 673 struct tcp_md5sig_key *key = NULL; 674 const __u8 *hash_location = NULL; 675 unsigned char newhash[16]; 676 int genhash; 677 struct sock *sk1 = NULL; 678 #endif 679 u64 transmit_time = 0; 680 struct sock *ctl_sk; 681 struct net *net; 682 683 /* Never send a reset in response to a reset. */ 684 if (th->rst) 685 return; 686 687 /* If sk not NULL, it means we did a successful lookup and incoming 688 * route had to be correct. prequeue might have dropped our dst. 689 */ 690 if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL) 691 return; 692 693 /* Swap the send and the receive. */ 694 memset(&rep, 0, sizeof(rep)); 695 rep.th.dest = th->source; 696 rep.th.source = th->dest; 697 rep.th.doff = sizeof(struct tcphdr) / 4; 698 rep.th.rst = 1; 699 700 if (th->ack) { 701 rep.th.seq = th->ack_seq; 702 } else { 703 rep.th.ack = 1; 704 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin + 705 skb->len - (th->doff << 2)); 706 } 707 708 memset(&arg, 0, sizeof(arg)); 709 arg.iov[0].iov_base = (unsigned char *)&rep; 710 arg.iov[0].iov_len = sizeof(rep.th); 711 712 net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev); 713 #ifdef CONFIG_TCP_MD5SIG 714 rcu_read_lock(); 715 hash_location = tcp_parse_md5sig_option(th); 716 if (sk && sk_fullsock(sk)) { 717 const union tcp_md5_addr *addr; 718 int l3index; 719 720 /* sdif set, means packet ingressed via a device 721 * in an L3 domain and inet_iif is set to it. 722 */ 723 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0; 724 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; 725 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET); 726 } else if (hash_location) { 727 const union tcp_md5_addr *addr; 728 int sdif = tcp_v4_sdif(skb); 729 int dif = inet_iif(skb); 730 int l3index; 731 732 /* 733 * active side is lost. Try to find listening socket through 734 * source port, and then find md5 key through listening socket. 735 * we are not loose security here: 736 * Incoming packet is checked with md5 hash with finding key, 737 * no RST generated if md5 hash doesn't match. 738 */ 739 sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0, 740 ip_hdr(skb)->saddr, 741 th->source, ip_hdr(skb)->daddr, 742 ntohs(th->source), dif, sdif); 743 /* don't send rst if it can't find key */ 744 if (!sk1) 745 goto out; 746 747 /* sdif set, means packet ingressed via a device 748 * in an L3 domain and dif is set to it. 749 */ 750 l3index = sdif ? 
dif : 0; 751 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; 752 key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET); 753 if (!key) 754 goto out; 755 756 757 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb); 758 if (genhash || memcmp(hash_location, newhash, 16) != 0) 759 goto out; 760 761 } 762 763 if (key) { 764 rep.opt[0] = htonl((TCPOPT_NOP << 24) | 765 (TCPOPT_NOP << 16) | 766 (TCPOPT_MD5SIG << 8) | 767 TCPOLEN_MD5SIG); 768 /* Update length and the length the header thinks exists */ 769 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED; 770 rep.th.doff = arg.iov[0].iov_len / 4; 771 772 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1], 773 key, ip_hdr(skb)->saddr, 774 ip_hdr(skb)->daddr, &rep.th); 775 } 776 #endif 777 /* Can't co-exist with TCPMD5, hence check rep.opt[0] */ 778 if (rep.opt[0] == 0) { 779 __be32 mrst = mptcp_reset_option(skb); 780 781 if (mrst) { 782 rep.opt[0] = mrst; 783 arg.iov[0].iov_len += sizeof(mrst); 784 rep.th.doff = arg.iov[0].iov_len / 4; 785 } 786 } 787 788 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr, 789 ip_hdr(skb)->saddr, /* XXX */ 790 arg.iov[0].iov_len, IPPROTO_TCP, 0); 791 arg.csumoffset = offsetof(struct tcphdr, check) / 2; 792 arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0; 793 794 /* When socket is gone, all binding information is lost. 795 * routing might fail in this case. No choice here, if we choose to force 796 * input interface, we will misroute in case of asymmetric route. 797 */ 798 if (sk) { 799 arg.bound_dev_if = sk->sk_bound_dev_if; 800 if (sk_fullsock(sk)) 801 trace_tcp_send_reset(sk, skb); 802 } 803 804 BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) != 805 offsetof(struct inet_timewait_sock, tw_bound_dev_if)); 806 807 arg.tos = ip_hdr(skb)->tos; 808 arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL); 809 local_bh_disable(); 810 ctl_sk = this_cpu_read(*net->ipv4.tcp_sk); 811 if (sk) { 812 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? 813 inet_twsk(sk)->tw_mark : sk->sk_mark; 814 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ? 815 inet_twsk(sk)->tw_priority : sk->sk_priority; 816 transmit_time = tcp_transmit_time(sk); 817 } 818 ip_send_unicast_reply(ctl_sk, 819 skb, &TCP_SKB_CB(skb)->header.h4.opt, 820 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, 821 &arg, arg.iov[0].iov_len, 822 transmit_time); 823 824 ctl_sk->sk_mark = 0; 825 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS); 826 __TCP_INC_STATS(net, TCP_MIB_OUTRSTS); 827 local_bh_enable(); 828 829 #ifdef CONFIG_TCP_MD5SIG 830 out: 831 rcu_read_unlock(); 832 #endif 833 } 834 835 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states 836 outside socket context is ugly, certainly. What can I do? 
837 */ 838 839 static void tcp_v4_send_ack(const struct sock *sk, 840 struct sk_buff *skb, u32 seq, u32 ack, 841 u32 win, u32 tsval, u32 tsecr, int oif, 842 struct tcp_md5sig_key *key, 843 int reply_flags, u8 tos) 844 { 845 const struct tcphdr *th = tcp_hdr(skb); 846 struct { 847 struct tcphdr th; 848 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2) 849 #ifdef CONFIG_TCP_MD5SIG 850 + (TCPOLEN_MD5SIG_ALIGNED >> 2) 851 #endif 852 ]; 853 } rep; 854 struct net *net = sock_net(sk); 855 struct ip_reply_arg arg; 856 struct sock *ctl_sk; 857 u64 transmit_time; 858 859 memset(&rep.th, 0, sizeof(struct tcphdr)); 860 memset(&arg, 0, sizeof(arg)); 861 862 arg.iov[0].iov_base = (unsigned char *)&rep; 863 arg.iov[0].iov_len = sizeof(rep.th); 864 if (tsecr) { 865 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | 866 (TCPOPT_TIMESTAMP << 8) | 867 TCPOLEN_TIMESTAMP); 868 rep.opt[1] = htonl(tsval); 869 rep.opt[2] = htonl(tsecr); 870 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED; 871 } 872 873 /* Swap the send and the receive. */ 874 rep.th.dest = th->source; 875 rep.th.source = th->dest; 876 rep.th.doff = arg.iov[0].iov_len / 4; 877 rep.th.seq = htonl(seq); 878 rep.th.ack_seq = htonl(ack); 879 rep.th.ack = 1; 880 rep.th.window = htons(win); 881 882 #ifdef CONFIG_TCP_MD5SIG 883 if (key) { 884 int offset = (tsecr) ? 3 : 0; 885 886 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) | 887 (TCPOPT_NOP << 16) | 888 (TCPOPT_MD5SIG << 8) | 889 TCPOLEN_MD5SIG); 890 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED; 891 rep.th.doff = arg.iov[0].iov_len/4; 892 893 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset], 894 key, ip_hdr(skb)->saddr, 895 ip_hdr(skb)->daddr, &rep.th); 896 } 897 #endif 898 arg.flags = reply_flags; 899 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr, 900 ip_hdr(skb)->saddr, /* XXX */ 901 arg.iov[0].iov_len, IPPROTO_TCP, 0); 902 arg.csumoffset = offsetof(struct tcphdr, check) / 2; 903 if (oif) 904 arg.bound_dev_if = oif; 905 arg.tos = tos; 906 arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL); 907 local_bh_disable(); 908 ctl_sk = this_cpu_read(*net->ipv4.tcp_sk); 909 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? 910 inet_twsk(sk)->tw_mark : sk->sk_mark; 911 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ? 912 inet_twsk(sk)->tw_priority : sk->sk_priority; 913 transmit_time = tcp_transmit_time(sk); 914 ip_send_unicast_reply(ctl_sk, 915 skb, &TCP_SKB_CB(skb)->header.h4.opt, 916 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, 917 &arg, arg.iov[0].iov_len, 918 transmit_time); 919 920 ctl_sk->sk_mark = 0; 921 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS); 922 local_bh_enable(); 923 } 924 925 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) 926 { 927 struct inet_timewait_sock *tw = inet_twsk(sk); 928 struct tcp_timewait_sock *tcptw = tcp_twsk(sk); 929 930 tcp_v4_send_ack(sk, skb, 931 tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, 932 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, 933 tcp_time_stamp_raw() + tcptw->tw_ts_offset, 934 tcptw->tw_ts_recent, 935 tw->tw_bound_dev_if, 936 tcp_twsk_md5_key(tcptw), 937 tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0, 938 tw->tw_tos 939 ); 940 941 inet_twsk_put(tw); 942 } 943 944 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, 945 struct request_sock *req) 946 { 947 const union tcp_md5_addr *addr; 948 int l3index; 949 950 /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV 951 * sk->sk_state == TCP_SYN_RECV -> for Fast Open. 952 */ 953 u32 seq = (sk->sk_state == TCP_LISTEN) ? 
tcp_rsk(req)->snt_isn + 1 : 954 tcp_sk(sk)->snd_nxt; 955 956 /* RFC 7323 2.3 957 * The window field (SEG.WND) of every outgoing segment, with the 958 * exception of <SYN> segments, MUST be right-shifted by 959 * Rcv.Wind.Shift bits: 960 */ 961 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; 962 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0; 963 tcp_v4_send_ack(sk, skb, seq, 964 tcp_rsk(req)->rcv_nxt, 965 req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale, 966 tcp_time_stamp_raw() + tcp_rsk(req)->ts_off, 967 req->ts_recent, 968 0, 969 tcp_md5_do_lookup(sk, l3index, addr, AF_INET), 970 inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0, 971 ip_hdr(skb)->tos); 972 } 973 974 /* 975 * Send a SYN-ACK after having received a SYN. 976 * This still operates on a request_sock only, not on a big 977 * socket. 978 */ 979 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, 980 struct flowi *fl, 981 struct request_sock *req, 982 struct tcp_fastopen_cookie *foc, 983 enum tcp_synack_type synack_type, 984 struct sk_buff *syn_skb) 985 { 986 const struct inet_request_sock *ireq = inet_rsk(req); 987 struct flowi4 fl4; 988 int err = -1; 989 struct sk_buff *skb; 990 u8 tos; 991 992 /* First, grab a route. */ 993 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL) 994 return -1; 995 996 skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb); 997 998 if (skb) { 999 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr); 1000 1001 tos = sock_net(sk)->ipv4.sysctl_tcp_reflect_tos ? 1002 (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) | 1003 (inet_sk(sk)->tos & INET_ECN_MASK) : 1004 inet_sk(sk)->tos; 1005 1006 if (!INET_ECN_is_capable(tos) && 1007 tcp_bpf_ca_needs_ecn((struct sock *)req)) 1008 tos |= INET_ECN_ECT_0; 1009 1010 rcu_read_lock(); 1011 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr, 1012 ireq->ir_rmt_addr, 1013 rcu_dereference(ireq->ireq_opt), 1014 tos); 1015 rcu_read_unlock(); 1016 err = net_xmit_eval(err); 1017 } 1018 1019 return err; 1020 } 1021 1022 /* 1023 * IPv4 request_sock destructor. 1024 */ 1025 static void tcp_v4_reqsk_destructor(struct request_sock *req) 1026 { 1027 kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1)); 1028 } 1029 1030 #ifdef CONFIG_TCP_MD5SIG 1031 /* 1032 * RFC2385 MD5 checksumming requires a mapping of 1033 * IP address->MD5 Key. 1034 * We need to maintain these in the sk structure. 1035 */ 1036 1037 DEFINE_STATIC_KEY_FALSE(tcp_md5_needed); 1038 EXPORT_SYMBOL(tcp_md5_needed); 1039 1040 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new) 1041 { 1042 if (!old) 1043 return true; 1044 1045 /* l3index always overrides non-l3index */ 1046 if (old->l3index && new->l3index == 0) 1047 return false; 1048 if (old->l3index == 0 && new->l3index) 1049 return true; 1050 1051 return old->prefixlen < new->prefixlen; 1052 } 1053 1054 /* Find the Key structure for an address. 
*/ 1055 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index, 1056 const union tcp_md5_addr *addr, 1057 int family) 1058 { 1059 const struct tcp_sock *tp = tcp_sk(sk); 1060 struct tcp_md5sig_key *key; 1061 const struct tcp_md5sig_info *md5sig; 1062 __be32 mask; 1063 struct tcp_md5sig_key *best_match = NULL; 1064 bool match; 1065 1066 /* caller either holds rcu_read_lock() or socket lock */ 1067 md5sig = rcu_dereference_check(tp->md5sig_info, 1068 lockdep_sock_is_held(sk)); 1069 if (!md5sig) 1070 return NULL; 1071 1072 hlist_for_each_entry_rcu(key, &md5sig->head, node, 1073 lockdep_sock_is_held(sk)) { 1074 if (key->family != family) 1075 continue; 1076 if (key->flags & TCP_MD5SIG_FLAG_IFINDEX && key->l3index != l3index) 1077 continue; 1078 if (family == AF_INET) { 1079 mask = inet_make_mask(key->prefixlen); 1080 match = (key->addr.a4.s_addr & mask) == 1081 (addr->a4.s_addr & mask); 1082 #if IS_ENABLED(CONFIG_IPV6) 1083 } else if (family == AF_INET6) { 1084 match = ipv6_prefix_equal(&key->addr.a6, &addr->a6, 1085 key->prefixlen); 1086 #endif 1087 } else { 1088 match = false; 1089 } 1090 1091 if (match && better_md5_match(best_match, key)) 1092 best_match = key; 1093 } 1094 return best_match; 1095 } 1096 EXPORT_SYMBOL(__tcp_md5_do_lookup); 1097 1098 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk, 1099 const union tcp_md5_addr *addr, 1100 int family, u8 prefixlen, 1101 int l3index, u8 flags) 1102 { 1103 const struct tcp_sock *tp = tcp_sk(sk); 1104 struct tcp_md5sig_key *key; 1105 unsigned int size = sizeof(struct in_addr); 1106 const struct tcp_md5sig_info *md5sig; 1107 1108 /* caller either holds rcu_read_lock() or socket lock */ 1109 md5sig = rcu_dereference_check(tp->md5sig_info, 1110 lockdep_sock_is_held(sk)); 1111 if (!md5sig) 1112 return NULL; 1113 #if IS_ENABLED(CONFIG_IPV6) 1114 if (family == AF_INET6) 1115 size = sizeof(struct in6_addr); 1116 #endif 1117 hlist_for_each_entry_rcu(key, &md5sig->head, node, 1118 lockdep_sock_is_held(sk)) { 1119 if (key->family != family) 1120 continue; 1121 if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX)) 1122 continue; 1123 if (key->l3index != l3index) 1124 continue; 1125 if (!memcmp(&key->addr, addr, size) && 1126 key->prefixlen == prefixlen) 1127 return key; 1128 } 1129 return NULL; 1130 } 1131 1132 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk, 1133 const struct sock *addr_sk) 1134 { 1135 const union tcp_md5_addr *addr; 1136 int l3index; 1137 1138 l3index = l3mdev_master_ifindex_by_index(sock_net(sk), 1139 addr_sk->sk_bound_dev_if); 1140 addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr; 1141 return tcp_md5_do_lookup(sk, l3index, addr, AF_INET); 1142 } 1143 EXPORT_SYMBOL(tcp_v4_md5_lookup); 1144 1145 /* This can be called on a newly created socket, from other files */ 1146 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, 1147 int family, u8 prefixlen, int l3index, u8 flags, 1148 const u8 *newkey, u8 newkeylen, gfp_t gfp) 1149 { 1150 /* Add Key to the list */ 1151 struct tcp_md5sig_key *key; 1152 struct tcp_sock *tp = tcp_sk(sk); 1153 struct tcp_md5sig_info *md5sig; 1154 1155 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags); 1156 if (key) { 1157 /* Pre-existing entry - just update that one. 1158 * Note that the key might be used concurrently. 1159 * data_race() is telling kcsan that we do not care of 1160 * key mismatches, since changing MD5 key on live flows 1161 * can lead to packet drops. 
1162 */ 1163 data_race(memcpy(key->key, newkey, newkeylen)); 1164 1165 /* Pairs with READ_ONCE() in tcp_md5_hash_key(). 1166 * Also note that a reader could catch new key->keylen value 1167 * but old key->key[], this is the reason we use __GFP_ZERO 1168 * at sock_kmalloc() time below these lines. 1169 */ 1170 WRITE_ONCE(key->keylen, newkeylen); 1171 1172 return 0; 1173 } 1174 1175 md5sig = rcu_dereference_protected(tp->md5sig_info, 1176 lockdep_sock_is_held(sk)); 1177 if (!md5sig) { 1178 md5sig = kmalloc(sizeof(*md5sig), gfp); 1179 if (!md5sig) 1180 return -ENOMEM; 1181 1182 sk_nocaps_add(sk, NETIF_F_GSO_MASK); 1183 INIT_HLIST_HEAD(&md5sig->head); 1184 rcu_assign_pointer(tp->md5sig_info, md5sig); 1185 } 1186 1187 key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO); 1188 if (!key) 1189 return -ENOMEM; 1190 if (!tcp_alloc_md5sig_pool()) { 1191 sock_kfree_s(sk, key, sizeof(*key)); 1192 return -ENOMEM; 1193 } 1194 1195 memcpy(key->key, newkey, newkeylen); 1196 key->keylen = newkeylen; 1197 key->family = family; 1198 key->prefixlen = prefixlen; 1199 key->l3index = l3index; 1200 key->flags = flags; 1201 memcpy(&key->addr, addr, 1202 (family == AF_INET6) ? sizeof(struct in6_addr) : 1203 sizeof(struct in_addr)); 1204 hlist_add_head_rcu(&key->node, &md5sig->head); 1205 return 0; 1206 } 1207 EXPORT_SYMBOL(tcp_md5_do_add); 1208 1209 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family, 1210 u8 prefixlen, int l3index, u8 flags) 1211 { 1212 struct tcp_md5sig_key *key; 1213 1214 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags); 1215 if (!key) 1216 return -ENOENT; 1217 hlist_del_rcu(&key->node); 1218 atomic_sub(sizeof(*key), &sk->sk_omem_alloc); 1219 kfree_rcu(key, rcu); 1220 return 0; 1221 } 1222 EXPORT_SYMBOL(tcp_md5_do_del); 1223 1224 static void tcp_clear_md5_list(struct sock *sk) 1225 { 1226 struct tcp_sock *tp = tcp_sk(sk); 1227 struct tcp_md5sig_key *key; 1228 struct hlist_node *n; 1229 struct tcp_md5sig_info *md5sig; 1230 1231 md5sig = rcu_dereference_protected(tp->md5sig_info, 1); 1232 1233 hlist_for_each_entry_safe(key, n, &md5sig->head, node) { 1234 hlist_del_rcu(&key->node); 1235 atomic_sub(sizeof(*key), &sk->sk_omem_alloc); 1236 kfree_rcu(key, rcu); 1237 } 1238 } 1239 1240 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname, 1241 sockptr_t optval, int optlen) 1242 { 1243 struct tcp_md5sig cmd; 1244 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr; 1245 const union tcp_md5_addr *addr; 1246 u8 prefixlen = 32; 1247 int l3index = 0; 1248 u8 flags; 1249 1250 if (optlen < sizeof(cmd)) 1251 return -EINVAL; 1252 1253 if (copy_from_sockptr(&cmd, optval, sizeof(cmd))) 1254 return -EFAULT; 1255 1256 if (sin->sin_family != AF_INET) 1257 return -EINVAL; 1258 1259 flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX; 1260 1261 if (optname == TCP_MD5SIG_EXT && 1262 cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) { 1263 prefixlen = cmd.tcpm_prefixlen; 1264 if (prefixlen > 32) 1265 return -EINVAL; 1266 } 1267 1268 if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex && 1269 cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) { 1270 struct net_device *dev; 1271 1272 rcu_read_lock(); 1273 dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex); 1274 if (dev && netif_is_l3_master(dev)) 1275 l3index = dev->ifindex; 1276 1277 rcu_read_unlock(); 1278 1279 /* ok to reference set/not set outside of rcu; 1280 * right now device MUST be an L3 master 1281 */ 1282 if (!dev || !l3index) 1283 return -EINVAL; 1284 } 1285 1286 addr = (union tcp_md5_addr 
*)&sin->sin_addr.s_addr; 1287 1288 if (!cmd.tcpm_keylen) 1289 return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags); 1290 1291 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN) 1292 return -EINVAL; 1293 1294 return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags, 1295 cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL); 1296 } 1297 1298 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp, 1299 __be32 daddr, __be32 saddr, 1300 const struct tcphdr *th, int nbytes) 1301 { 1302 struct tcp4_pseudohdr *bp; 1303 struct scatterlist sg; 1304 struct tcphdr *_th; 1305 1306 bp = hp->scratch; 1307 bp->saddr = saddr; 1308 bp->daddr = daddr; 1309 bp->pad = 0; 1310 bp->protocol = IPPROTO_TCP; 1311 bp->len = cpu_to_be16(nbytes); 1312 1313 _th = (struct tcphdr *)(bp + 1); 1314 memcpy(_th, th, sizeof(*th)); 1315 _th->check = 0; 1316 1317 sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th)); 1318 ahash_request_set_crypt(hp->md5_req, &sg, NULL, 1319 sizeof(*bp) + sizeof(*th)); 1320 return crypto_ahash_update(hp->md5_req); 1321 } 1322 1323 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, 1324 __be32 daddr, __be32 saddr, const struct tcphdr *th) 1325 { 1326 struct tcp_md5sig_pool *hp; 1327 struct ahash_request *req; 1328 1329 hp = tcp_get_md5sig_pool(); 1330 if (!hp) 1331 goto clear_hash_noput; 1332 req = hp->md5_req; 1333 1334 if (crypto_ahash_init(req)) 1335 goto clear_hash; 1336 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2)) 1337 goto clear_hash; 1338 if (tcp_md5_hash_key(hp, key)) 1339 goto clear_hash; 1340 ahash_request_set_crypt(req, NULL, md5_hash, 0); 1341 if (crypto_ahash_final(req)) 1342 goto clear_hash; 1343 1344 tcp_put_md5sig_pool(); 1345 return 0; 1346 1347 clear_hash: 1348 tcp_put_md5sig_pool(); 1349 clear_hash_noput: 1350 memset(md5_hash, 0, 16); 1351 return 1; 1352 } 1353 1354 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key, 1355 const struct sock *sk, 1356 const struct sk_buff *skb) 1357 { 1358 struct tcp_md5sig_pool *hp; 1359 struct ahash_request *req; 1360 const struct tcphdr *th = tcp_hdr(skb); 1361 __be32 saddr, daddr; 1362 1363 if (sk) { /* valid for establish/request sockets */ 1364 saddr = sk->sk_rcv_saddr; 1365 daddr = sk->sk_daddr; 1366 } else { 1367 const struct iphdr *iph = ip_hdr(skb); 1368 saddr = iph->saddr; 1369 daddr = iph->daddr; 1370 } 1371 1372 hp = tcp_get_md5sig_pool(); 1373 if (!hp) 1374 goto clear_hash_noput; 1375 req = hp->md5_req; 1376 1377 if (crypto_ahash_init(req)) 1378 goto clear_hash; 1379 1380 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len)) 1381 goto clear_hash; 1382 if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2)) 1383 goto clear_hash; 1384 if (tcp_md5_hash_key(hp, key)) 1385 goto clear_hash; 1386 ahash_request_set_crypt(req, NULL, md5_hash, 0); 1387 if (crypto_ahash_final(req)) 1388 goto clear_hash; 1389 1390 tcp_put_md5sig_pool(); 1391 return 0; 1392 1393 clear_hash: 1394 tcp_put_md5sig_pool(); 1395 clear_hash_noput: 1396 memset(md5_hash, 0, 16); 1397 return 1; 1398 } 1399 EXPORT_SYMBOL(tcp_v4_md5_hash_skb); 1400 1401 #endif 1402 1403 /* Called with rcu_read_lock() */ 1404 static bool tcp_v4_inbound_md5_hash(const struct sock *sk, 1405 const struct sk_buff *skb, 1406 int dif, int sdif) 1407 { 1408 #ifdef CONFIG_TCP_MD5SIG 1409 /* 1410 * This gets called for each TCP segment that arrives 1411 * so we want to be efficient. 1412 * We have 3 drop cases: 1413 * o No MD5 hash and one expected. 1414 * o MD5 hash and we're not expecting one. 
1415 * o MD5 hash and its wrong. 1416 */ 1417 const __u8 *hash_location = NULL; 1418 struct tcp_md5sig_key *hash_expected; 1419 const struct iphdr *iph = ip_hdr(skb); 1420 const struct tcphdr *th = tcp_hdr(skb); 1421 const union tcp_md5_addr *addr; 1422 unsigned char newhash[16]; 1423 int genhash, l3index; 1424 1425 /* sdif set, means packet ingressed via a device 1426 * in an L3 domain and dif is set to the l3mdev 1427 */ 1428 l3index = sdif ? dif : 0; 1429 1430 addr = (union tcp_md5_addr *)&iph->saddr; 1431 hash_expected = tcp_md5_do_lookup(sk, l3index, addr, AF_INET); 1432 hash_location = tcp_parse_md5sig_option(th); 1433 1434 /* We've parsed the options - do we have a hash? */ 1435 if (!hash_expected && !hash_location) 1436 return false; 1437 1438 if (hash_expected && !hash_location) { 1439 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND); 1440 return true; 1441 } 1442 1443 if (!hash_expected && hash_location) { 1444 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED); 1445 return true; 1446 } 1447 1448 /* Okay, so this is hash_expected and hash_location - 1449 * so we need to calculate the checksum. 1450 */ 1451 genhash = tcp_v4_md5_hash_skb(newhash, 1452 hash_expected, 1453 NULL, skb); 1454 1455 if (genhash || memcmp(hash_location, newhash, 16) != 0) { 1456 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE); 1457 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s L3 index %d\n", 1458 &iph->saddr, ntohs(th->source), 1459 &iph->daddr, ntohs(th->dest), 1460 genhash ? " tcp_v4_calc_md5_hash failed" 1461 : "", l3index); 1462 return true; 1463 } 1464 return false; 1465 #endif 1466 return false; 1467 } 1468 1469 static void tcp_v4_init_req(struct request_sock *req, 1470 const struct sock *sk_listener, 1471 struct sk_buff *skb) 1472 { 1473 struct inet_request_sock *ireq = inet_rsk(req); 1474 struct net *net = sock_net(sk_listener); 1475 1476 sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr); 1477 sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr); 1478 RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb)); 1479 } 1480 1481 static struct dst_entry *tcp_v4_route_req(const struct sock *sk, 1482 struct sk_buff *skb, 1483 struct flowi *fl, 1484 struct request_sock *req) 1485 { 1486 tcp_v4_init_req(req, sk, skb); 1487 1488 if (security_inet_conn_request(sk, skb, req)) 1489 return NULL; 1490 1491 return inet_csk_route_req(sk, &fl->u.ip4, req); 1492 } 1493 1494 struct request_sock_ops tcp_request_sock_ops __read_mostly = { 1495 .family = PF_INET, 1496 .obj_size = sizeof(struct tcp_request_sock), 1497 .rtx_syn_ack = tcp_rtx_synack, 1498 .send_ack = tcp_v4_reqsk_send_ack, 1499 .destructor = tcp_v4_reqsk_destructor, 1500 .send_reset = tcp_v4_send_reset, 1501 .syn_ack_timeout = tcp_syn_ack_timeout, 1502 }; 1503 1504 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { 1505 .mss_clamp = TCP_MSS_DEFAULT, 1506 #ifdef CONFIG_TCP_MD5SIG 1507 .req_md5_lookup = tcp_v4_md5_lookup, 1508 .calc_md5_hash = tcp_v4_md5_hash_skb, 1509 #endif 1510 #ifdef CONFIG_SYN_COOKIES 1511 .cookie_init_seq = cookie_v4_init_sequence, 1512 #endif 1513 .route_req = tcp_v4_route_req, 1514 .init_seq = tcp_v4_init_seq, 1515 .init_ts_off = tcp_v4_init_ts_off, 1516 .send_synack = tcp_v4_send_synack, 1517 }; 1518 1519 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) 1520 { 1521 /* Never answer to SYNs send to broadcast or multicast */ 1522 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) 1523 goto drop; 1524 1525 return tcp_conn_request(&tcp_request_sock_ops, 
1526 &tcp_request_sock_ipv4_ops, sk, skb); 1527 1528 drop: 1529 tcp_listendrop(sk); 1530 return 0; 1531 } 1532 EXPORT_SYMBOL(tcp_v4_conn_request); 1533 1534 1535 /* 1536 * The three way handshake has completed - we got a valid synack - 1537 * now create the new socket. 1538 */ 1539 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, 1540 struct request_sock *req, 1541 struct dst_entry *dst, 1542 struct request_sock *req_unhash, 1543 bool *own_req) 1544 { 1545 struct inet_request_sock *ireq; 1546 bool found_dup_sk = false; 1547 struct inet_sock *newinet; 1548 struct tcp_sock *newtp; 1549 struct sock *newsk; 1550 #ifdef CONFIG_TCP_MD5SIG 1551 const union tcp_md5_addr *addr; 1552 struct tcp_md5sig_key *key; 1553 int l3index; 1554 #endif 1555 struct ip_options_rcu *inet_opt; 1556 1557 if (sk_acceptq_is_full(sk)) 1558 goto exit_overflow; 1559 1560 newsk = tcp_create_openreq_child(sk, req, skb); 1561 if (!newsk) 1562 goto exit_nonewsk; 1563 1564 newsk->sk_gso_type = SKB_GSO_TCPV4; 1565 inet_sk_rx_dst_set(newsk, skb); 1566 1567 newtp = tcp_sk(newsk); 1568 newinet = inet_sk(newsk); 1569 ireq = inet_rsk(req); 1570 sk_daddr_set(newsk, ireq->ir_rmt_addr); 1571 sk_rcv_saddr_set(newsk, ireq->ir_loc_addr); 1572 newsk->sk_bound_dev_if = ireq->ir_iif; 1573 newinet->inet_saddr = ireq->ir_loc_addr; 1574 inet_opt = rcu_dereference(ireq->ireq_opt); 1575 RCU_INIT_POINTER(newinet->inet_opt, inet_opt); 1576 newinet->mc_index = inet_iif(skb); 1577 newinet->mc_ttl = ip_hdr(skb)->ttl; 1578 newinet->rcv_tos = ip_hdr(skb)->tos; 1579 inet_csk(newsk)->icsk_ext_hdr_len = 0; 1580 if (inet_opt) 1581 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen; 1582 newinet->inet_id = prandom_u32(); 1583 1584 /* Set ToS of the new socket based upon the value of incoming SYN. 1585 * ECT bits are set later in tcp_init_transfer(). 1586 */ 1587 if (sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) 1588 newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK; 1589 1590 if (!dst) { 1591 dst = inet_csk_route_child_sock(sk, newsk, req); 1592 if (!dst) 1593 goto put_and_exit; 1594 } else { 1595 /* syncookie case : see end of cookie_v4_check() */ 1596 } 1597 sk_setup_caps(newsk, dst); 1598 1599 tcp_ca_openreq_child(newsk, dst); 1600 1601 tcp_sync_mss(newsk, dst_mtu(dst)); 1602 newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst)); 1603 1604 tcp_initialize_rcv_mss(newsk); 1605 1606 #ifdef CONFIG_TCP_MD5SIG 1607 l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif); 1608 /* Copy over the MD5 key from the original socket */ 1609 addr = (union tcp_md5_addr *)&newinet->inet_daddr; 1610 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET); 1611 if (key) { 1612 /* 1613 * We're using one, so create a matching key 1614 * on the newsk structure. If we fail to get 1615 * memory, then we end up not copying the key 1616 * across. Shucks. 
1617 */ 1618 tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index, key->flags, 1619 key->key, key->keylen, GFP_ATOMIC); 1620 sk_nocaps_add(newsk, NETIF_F_GSO_MASK); 1621 } 1622 #endif 1623 1624 if (__inet_inherit_port(sk, newsk) < 0) 1625 goto put_and_exit; 1626 *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash), 1627 &found_dup_sk); 1628 if (likely(*own_req)) { 1629 tcp_move_syn(newtp, req); 1630 ireq->ireq_opt = NULL; 1631 } else { 1632 newinet->inet_opt = NULL; 1633 1634 if (!req_unhash && found_dup_sk) { 1635 /* This code path should only be executed in the 1636 * syncookie case only 1637 */ 1638 bh_unlock_sock(newsk); 1639 sock_put(newsk); 1640 newsk = NULL; 1641 } 1642 } 1643 return newsk; 1644 1645 exit_overflow: 1646 NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); 1647 exit_nonewsk: 1648 dst_release(dst); 1649 exit: 1650 tcp_listendrop(sk); 1651 return NULL; 1652 put_and_exit: 1653 newinet->inet_opt = NULL; 1654 inet_csk_prepare_forced_close(newsk); 1655 tcp_done(newsk); 1656 goto exit; 1657 } 1658 EXPORT_SYMBOL(tcp_v4_syn_recv_sock); 1659 1660 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb) 1661 { 1662 #ifdef CONFIG_SYN_COOKIES 1663 const struct tcphdr *th = tcp_hdr(skb); 1664 1665 if (!th->syn) 1666 sk = cookie_v4_check(sk, skb); 1667 #endif 1668 return sk; 1669 } 1670 1671 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph, 1672 struct tcphdr *th, u32 *cookie) 1673 { 1674 u16 mss = 0; 1675 #ifdef CONFIG_SYN_COOKIES 1676 mss = tcp_get_syncookie_mss(&tcp_request_sock_ops, 1677 &tcp_request_sock_ipv4_ops, sk, th); 1678 if (mss) { 1679 *cookie = __cookie_v4_init_sequence(iph, th, &mss); 1680 tcp_synq_overflow(sk); 1681 } 1682 #endif 1683 return mss; 1684 } 1685 1686 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *, 1687 u32)); 1688 /* The socket must have it's spinlock held when we get 1689 * here, unless it is a TCP_LISTEN socket. 1690 * 1691 * We have a potential double-lock case here, so even when 1692 * doing backlog processing we use the BH locking scheme. 1693 * This is because we cannot sleep with the original spinlock 1694 * held. 1695 */ 1696 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) 1697 { 1698 struct sock *rsk; 1699 1700 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ 1701 struct dst_entry *dst = sk->sk_rx_dst; 1702 1703 sock_rps_save_rxhash(sk, skb); 1704 sk_mark_napi_id(sk, skb); 1705 if (dst) { 1706 if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif || 1707 !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check, 1708 dst, 0)) { 1709 dst_release(dst); 1710 sk->sk_rx_dst = NULL; 1711 } 1712 } 1713 tcp_rcv_established(sk, skb); 1714 return 0; 1715 } 1716 1717 if (tcp_checksum_complete(skb)) 1718 goto csum_err; 1719 1720 if (sk->sk_state == TCP_LISTEN) { 1721 struct sock *nsk = tcp_v4_cookie_check(sk, skb); 1722 1723 if (!nsk) 1724 goto discard; 1725 if (nsk != sk) { 1726 if (tcp_child_process(sk, nsk, skb)) { 1727 rsk = nsk; 1728 goto reset; 1729 } 1730 return 0; 1731 } 1732 } else 1733 sock_rps_save_rxhash(sk, skb); 1734 1735 if (tcp_rcv_state_process(sk, skb)) { 1736 rsk = sk; 1737 goto reset; 1738 } 1739 return 0; 1740 1741 reset: 1742 tcp_v4_send_reset(rsk, skb); 1743 discard: 1744 kfree_skb(skb); 1745 /* Be careful here. If this function gets more complicated and 1746 * gcc suffers from register pressure on the x86, sk (in %ebx) 1747 * might be destroyed here. This current version compiles correctly, 1748 * but you have been warned. 
1749 */ 1750 return 0; 1751 1752 csum_err: 1753 trace_tcp_bad_csum(skb); 1754 TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); 1755 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); 1756 goto discard; 1757 } 1758 EXPORT_SYMBOL(tcp_v4_do_rcv); 1759 1760 int tcp_v4_early_demux(struct sk_buff *skb) 1761 { 1762 const struct iphdr *iph; 1763 const struct tcphdr *th; 1764 struct sock *sk; 1765 1766 if (skb->pkt_type != PACKET_HOST) 1767 return 0; 1768 1769 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr))) 1770 return 0; 1771 1772 iph = ip_hdr(skb); 1773 th = tcp_hdr(skb); 1774 1775 if (th->doff < sizeof(struct tcphdr) / 4) 1776 return 0; 1777 1778 sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo, 1779 iph->saddr, th->source, 1780 iph->daddr, ntohs(th->dest), 1781 skb->skb_iif, inet_sdif(skb)); 1782 if (sk) { 1783 skb->sk = sk; 1784 skb->destructor = sock_edemux; 1785 if (sk_fullsock(sk)) { 1786 struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst); 1787 1788 if (dst) 1789 dst = dst_check(dst, 0); 1790 if (dst && 1791 inet_sk(sk)->rx_dst_ifindex == skb->skb_iif) 1792 skb_dst_set_noref(skb, dst); 1793 } 1794 } 1795 return 0; 1796 } 1797 1798 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb) 1799 { 1800 u32 limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf); 1801 u32 tail_gso_size, tail_gso_segs; 1802 struct skb_shared_info *shinfo; 1803 const struct tcphdr *th; 1804 struct tcphdr *thtail; 1805 struct sk_buff *tail; 1806 unsigned int hdrlen; 1807 bool fragstolen; 1808 u32 gso_segs; 1809 u32 gso_size; 1810 int delta; 1811 1812 /* In case all data was pulled from skb frags (in __pskb_pull_tail()), 1813 * we can fix skb->truesize to its real value to avoid future drops. 1814 * This is valid because skb is not yet charged to the socket. 1815 * It has been noticed pure SACK packets were sometimes dropped 1816 * (if cooked by drivers without copybreak feature). 1817 */ 1818 skb_condense(skb); 1819 1820 skb_dst_drop(skb); 1821 1822 if (unlikely(tcp_checksum_complete(skb))) { 1823 bh_unlock_sock(sk); 1824 trace_tcp_bad_csum(skb); 1825 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); 1826 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); 1827 return true; 1828 } 1829 1830 /* Attempt coalescing to last skb in backlog, even if we are 1831 * above the limits. 1832 * This is okay because skb capacity is limited to MAX_SKB_FRAGS. 
1833 */ 1834 th = (const struct tcphdr *)skb->data; 1835 hdrlen = th->doff * 4; 1836 1837 tail = sk->sk_backlog.tail; 1838 if (!tail) 1839 goto no_coalesce; 1840 thtail = (struct tcphdr *)tail->data; 1841 1842 if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq || 1843 TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield || 1844 ((TCP_SKB_CB(tail)->tcp_flags | 1845 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) || 1846 !((TCP_SKB_CB(tail)->tcp_flags & 1847 TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) || 1848 ((TCP_SKB_CB(tail)->tcp_flags ^ 1849 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) || 1850 #ifdef CONFIG_TLS_DEVICE 1851 tail->decrypted != skb->decrypted || 1852 #endif 1853 thtail->doff != th->doff || 1854 memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th))) 1855 goto no_coalesce; 1856 1857 __skb_pull(skb, hdrlen); 1858 1859 shinfo = skb_shinfo(skb); 1860 gso_size = shinfo->gso_size ?: skb->len; 1861 gso_segs = shinfo->gso_segs ?: 1; 1862 1863 shinfo = skb_shinfo(tail); 1864 tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen); 1865 tail_gso_segs = shinfo->gso_segs ?: 1; 1866 1867 if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) { 1868 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq; 1869 1870 if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) { 1871 TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq; 1872 thtail->window = th->window; 1873 } 1874 1875 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and 1876 * thtail->fin, so that the fast path in tcp_rcv_established() 1877 * is not entered if we append a packet with a FIN. 1878 * SYN, RST, URG are not present. 1879 * ACK is set on both packets. 1880 * PSH : we do not really care in TCP stack, 1881 * at least for 'GRO' packets. 1882 */ 1883 thtail->fin |= th->fin; 1884 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags; 1885 1886 if (TCP_SKB_CB(skb)->has_rxtstamp) { 1887 TCP_SKB_CB(tail)->has_rxtstamp = true; 1888 tail->tstamp = skb->tstamp; 1889 skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp; 1890 } 1891 1892 /* Not as strict as GRO. We only need to carry mss max value */ 1893 shinfo->gso_size = max(gso_size, tail_gso_size); 1894 shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF); 1895 1896 sk->sk_backlog.len += delta; 1897 __NET_INC_STATS(sock_net(sk), 1898 LINUX_MIB_TCPBACKLOGCOALESCE); 1899 kfree_skb_partial(skb, fragstolen); 1900 return false; 1901 } 1902 __skb_push(skb, hdrlen); 1903 1904 no_coalesce: 1905 /* Only socket owner can try to collapse/prune rx queues 1906 * to reduce memory overhead, so add a little headroom here. 1907 * Few sockets backlog are possibly concurrently non empty. 
1908 */ 1909 limit += 64*1024; 1910 1911 if (unlikely(sk_add_backlog(sk, skb, limit))) { 1912 bh_unlock_sock(sk); 1913 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP); 1914 return true; 1915 } 1916 return false; 1917 } 1918 EXPORT_SYMBOL(tcp_add_backlog); 1919 1920 int tcp_filter(struct sock *sk, struct sk_buff *skb) 1921 { 1922 struct tcphdr *th = (struct tcphdr *)skb->data; 1923 1924 return sk_filter_trim_cap(sk, skb, th->doff * 4); 1925 } 1926 EXPORT_SYMBOL(tcp_filter); 1927 1928 static void tcp_v4_restore_cb(struct sk_buff *skb) 1929 { 1930 memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4, 1931 sizeof(struct inet_skb_parm)); 1932 } 1933 1934 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph, 1935 const struct tcphdr *th) 1936 { 1937 /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB() 1938 * barrier() makes sure compiler wont play fool^Waliasing games. 1939 */ 1940 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb), 1941 sizeof(struct inet_skb_parm)); 1942 barrier(); 1943 1944 TCP_SKB_CB(skb)->seq = ntohl(th->seq); 1945 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + 1946 skb->len - th->doff * 4); 1947 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); 1948 TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th); 1949 TCP_SKB_CB(skb)->tcp_tw_isn = 0; 1950 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph); 1951 TCP_SKB_CB(skb)->sacked = 0; 1952 TCP_SKB_CB(skb)->has_rxtstamp = 1953 skb->tstamp || skb_hwtstamps(skb)->hwtstamp; 1954 } 1955 1956 /* 1957 * From tcp_input.c 1958 */ 1959 1960 int tcp_v4_rcv(struct sk_buff *skb) 1961 { 1962 struct net *net = dev_net(skb->dev); 1963 int sdif = inet_sdif(skb); 1964 int dif = inet_iif(skb); 1965 const struct iphdr *iph; 1966 const struct tcphdr *th; 1967 bool refcounted; 1968 struct sock *sk; 1969 int ret; 1970 1971 if (skb->pkt_type != PACKET_HOST) 1972 goto discard_it; 1973 1974 /* Count it even if it's bad */ 1975 __TCP_INC_STATS(net, TCP_MIB_INSEGS); 1976 1977 if (!pskb_may_pull(skb, sizeof(struct tcphdr))) 1978 goto discard_it; 1979 1980 th = (const struct tcphdr *)skb->data; 1981 1982 if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) 1983 goto bad_packet; 1984 if (!pskb_may_pull(skb, th->doff * 4)) 1985 goto discard_it; 1986 1987 /* An explanation is required here, I think. 1988 * Packet length and doff are validated by header prediction, 1989 * provided case of th->doff==0 is eliminated. 1990 * So, we defer the checks. 
*/ 1991 1992 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo)) 1993 goto csum_error; 1994 1995 th = (const struct tcphdr *)skb->data; 1996 iph = ip_hdr(skb); 1997 lookup: 1998 sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source, 1999 th->dest, sdif, &refcounted); 2000 if (!sk) 2001 goto no_tcp_socket; 2002 2003 process: 2004 if (sk->sk_state == TCP_TIME_WAIT) 2005 goto do_time_wait; 2006 2007 if (sk->sk_state == TCP_NEW_SYN_RECV) { 2008 struct request_sock *req = inet_reqsk(sk); 2009 bool req_stolen = false; 2010 struct sock *nsk; 2011 2012 sk = req->rsk_listener; 2013 if (unlikely(tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))) { 2014 sk_drops_add(sk, skb); 2015 reqsk_put(req); 2016 goto discard_it; 2017 } 2018 if (tcp_checksum_complete(skb)) { 2019 reqsk_put(req); 2020 goto csum_error; 2021 } 2022 if (unlikely(sk->sk_state != TCP_LISTEN)) { 2023 nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb); 2024 if (!nsk) { 2025 inet_csk_reqsk_queue_drop_and_put(sk, req); 2026 goto lookup; 2027 } 2028 sk = nsk; 2029 /* reuseport_migrate_sock() has already held one sk_refcnt 2030 * before returning. 2031 */ 2032 } else { 2033 /* We own a reference on the listener, increase it again 2034 * as we might lose it too soon. 2035 */ 2036 sock_hold(sk); 2037 } 2038 refcounted = true; 2039 nsk = NULL; 2040 if (!tcp_filter(sk, skb)) { 2041 th = (const struct tcphdr *)skb->data; 2042 iph = ip_hdr(skb); 2043 tcp_v4_fill_cb(skb, iph, th); 2044 nsk = tcp_check_req(sk, skb, req, false, &req_stolen); 2045 } 2046 if (!nsk) { 2047 reqsk_put(req); 2048 if (req_stolen) { 2049 /* Another cpu got exclusive access to req 2050 * and created a full blown socket. 2051 * Try to feed this packet to this socket 2052 * instead of discarding it. 2053 */ 2054 tcp_v4_restore_cb(skb); 2055 sock_put(sk); 2056 goto lookup; 2057 } 2058 goto discard_and_relse; 2059 } 2060 if (nsk == sk) { 2061 reqsk_put(req); 2062 tcp_v4_restore_cb(skb); 2063 } else if (tcp_child_process(sk, nsk, skb)) { 2064 tcp_v4_send_reset(nsk, skb); 2065 goto discard_and_relse; 2066 } else { 2067 sock_put(sk); 2068 return 0; 2069 } 2070 } 2071 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) { 2072 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); 2073 goto discard_and_relse; 2074 } 2075 2076 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) 2077 goto discard_and_relse; 2078 2079 if (tcp_v4_inbound_md5_hash(sk, skb, dif, sdif)) 2080 goto discard_and_relse; 2081 2082 nf_reset_ct(skb); 2083 2084 if (tcp_filter(sk, skb)) 2085 goto discard_and_relse; 2086 th = (const struct tcphdr *)skb->data; 2087 iph = ip_hdr(skb); 2088 tcp_v4_fill_cb(skb, iph, th); 2089 2090 skb->dev = NULL; 2091 2092 if (sk->sk_state == TCP_LISTEN) { 2093 ret = tcp_v4_do_rcv(sk, skb); 2094 goto put_and_return; 2095 } 2096 2097 sk_incoming_cpu_update(sk); 2098 2099 bh_lock_sock_nested(sk); 2100 tcp_segs_in(tcp_sk(sk), skb); 2101 ret = 0; 2102 if (!sock_owned_by_user(sk)) { 2103 ret = tcp_v4_do_rcv(sk, skb); 2104 } else { 2105 if (tcp_add_backlog(sk, skb)) 2106 goto discard_and_relse; 2107 } 2108 bh_unlock_sock(sk); 2109 2110 put_and_return: 2111 if (refcounted) 2112 sock_put(sk); 2113 2114 return ret; 2115 2116 no_tcp_socket: 2117 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) 2118 goto discard_it; 2119 2120 tcp_v4_fill_cb(skb, iph, th); 2121 2122 if (tcp_checksum_complete(skb)) { 2123 csum_error: 2124 trace_tcp_bad_csum(skb); 2125 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS); 2126 bad_packet: 2127 __TCP_INC_STATS(net, TCP_MIB_INERRS); 2128 } else { 2129 
tcp_v4_send_reset(NULL, skb); 2130 } 2131 2132 discard_it: 2133 /* Discard frame. */ 2134 kfree_skb(skb); 2135 return 0; 2136 2137 discard_and_relse: 2138 sk_drops_add(sk, skb); 2139 if (refcounted) 2140 sock_put(sk); 2141 goto discard_it; 2142 2143 do_time_wait: 2144 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { 2145 inet_twsk_put(inet_twsk(sk)); 2146 goto discard_it; 2147 } 2148 2149 tcp_v4_fill_cb(skb, iph, th); 2150 2151 if (tcp_checksum_complete(skb)) { 2152 inet_twsk_put(inet_twsk(sk)); 2153 goto csum_error; 2154 } 2155 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) { 2156 case TCP_TW_SYN: { 2157 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev), 2158 &tcp_hashinfo, skb, 2159 __tcp_hdrlen(th), 2160 iph->saddr, th->source, 2161 iph->daddr, th->dest, 2162 inet_iif(skb), 2163 sdif); 2164 if (sk2) { 2165 inet_twsk_deschedule_put(inet_twsk(sk)); 2166 sk = sk2; 2167 tcp_v4_restore_cb(skb); 2168 refcounted = false; 2169 goto process; 2170 } 2171 } 2172 /* to ACK */ 2173 fallthrough; 2174 case TCP_TW_ACK: 2175 tcp_v4_timewait_ack(sk, skb); 2176 break; 2177 case TCP_TW_RST: 2178 tcp_v4_send_reset(sk, skb); 2179 inet_twsk_deschedule_put(inet_twsk(sk)); 2180 goto discard_it; 2181 case TCP_TW_SUCCESS:; 2182 } 2183 goto discard_it; 2184 } 2185 2186 static struct timewait_sock_ops tcp_timewait_sock_ops = { 2187 .twsk_obj_size = sizeof(struct tcp_timewait_sock), 2188 .twsk_unique = tcp_twsk_unique, 2189 .twsk_destructor= tcp_twsk_destructor, 2190 }; 2191 2192 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) 2193 { 2194 struct dst_entry *dst = skb_dst(skb); 2195 2196 if (dst && dst_hold_safe(dst)) { 2197 sk->sk_rx_dst = dst; 2198 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif; 2199 } 2200 } 2201 EXPORT_SYMBOL(inet_sk_rx_dst_set); 2202 2203 const struct inet_connection_sock_af_ops ipv4_specific = { 2204 .queue_xmit = ip_queue_xmit, 2205 .send_check = tcp_v4_send_check, 2206 .rebuild_header = inet_sk_rebuild_header, 2207 .sk_rx_dst_set = inet_sk_rx_dst_set, 2208 .conn_request = tcp_v4_conn_request, 2209 .syn_recv_sock = tcp_v4_syn_recv_sock, 2210 .net_header_len = sizeof(struct iphdr), 2211 .setsockopt = ip_setsockopt, 2212 .getsockopt = ip_getsockopt, 2213 .addr2sockaddr = inet_csk_addr2sockaddr, 2214 .sockaddr_len = sizeof(struct sockaddr_in), 2215 .mtu_reduced = tcp_v4_mtu_reduced, 2216 }; 2217 EXPORT_SYMBOL(ipv4_specific); 2218 2219 #ifdef CONFIG_TCP_MD5SIG 2220 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = { 2221 .md5_lookup = tcp_v4_md5_lookup, 2222 .calc_md5_hash = tcp_v4_md5_hash_skb, 2223 .md5_parse = tcp_v4_parse_md5_keys, 2224 }; 2225 #endif 2226 2227 /* NOTE: A lot of things set to zero explicitly by call to 2228 * sk_alloc() so need not be done here. 2229 */ 2230 static int tcp_v4_init_sock(struct sock *sk) 2231 { 2232 struct inet_connection_sock *icsk = inet_csk(sk); 2233 2234 tcp_init_sock(sk); 2235 2236 icsk->icsk_af_ops = &ipv4_specific; 2237 2238 #ifdef CONFIG_TCP_MD5SIG 2239 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific; 2240 #endif 2241 2242 return 0; 2243 } 2244 2245 void tcp_v4_destroy_sock(struct sock *sk) 2246 { 2247 struct tcp_sock *tp = tcp_sk(sk); 2248 2249 trace_tcp_destroy_sock(sk); 2250 2251 tcp_clear_xmit_timers(sk); 2252 2253 tcp_cleanup_congestion_control(sk); 2254 2255 tcp_cleanup_ulp(sk); 2256 2257 /* Cleanup up the write buffer. 
*/ 2258 tcp_write_queue_purge(sk); 2259 2260 /* Check if we want to disable active TFO */ 2261 tcp_fastopen_active_disable_ofo_check(sk); 2262 2263 /* Cleans up our, hopefully empty, out_of_order_queue. */ 2264 skb_rbtree_purge(&tp->out_of_order_queue); 2265 2266 #ifdef CONFIG_TCP_MD5SIG 2267 /* Clean up the MD5 key list, if any */ 2268 if (tp->md5sig_info) { 2269 tcp_clear_md5_list(sk); 2270 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu); 2271 tp->md5sig_info = NULL; 2272 } 2273 #endif 2274 2275 /* Clean up a referenced TCP bind bucket. */ 2276 if (inet_csk(sk)->icsk_bind_hash) 2277 inet_put_port(sk); 2278 2279 BUG_ON(rcu_access_pointer(tp->fastopen_rsk)); 2280 2281 /* If socket is aborted during connect operation */ 2282 tcp_free_fastopen_req(tp); 2283 tcp_fastopen_destroy_cipher(sk); 2284 tcp_saved_syn_free(tp); 2285 2286 sk_sockets_allocated_dec(sk); 2287 } 2288 EXPORT_SYMBOL(tcp_v4_destroy_sock); 2289 2290 #ifdef CONFIG_PROC_FS 2291 /* Proc filesystem TCP sock list dumping. */ 2292 2293 static unsigned short seq_file_family(const struct seq_file *seq); 2294 2295 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk) 2296 { 2297 unsigned short family = seq_file_family(seq); 2298 2299 /* AF_UNSPEC is used as a match all */ 2300 return ((family == AF_UNSPEC || family == sk->sk_family) && 2301 net_eq(sock_net(sk), seq_file_net(seq))); 2302 } 2303 2304 /* Find a non empty bucket (starting from st->bucket) 2305 * and return the first sk from it. 2306 */ 2307 static void *listening_get_first(struct seq_file *seq) 2308 { 2309 struct tcp_iter_state *st = seq->private; 2310 2311 st->offset = 0; 2312 for (; st->bucket <= tcp_hashinfo.lhash2_mask; st->bucket++) { 2313 struct inet_listen_hashbucket *ilb2; 2314 struct inet_connection_sock *icsk; 2315 struct sock *sk; 2316 2317 ilb2 = &tcp_hashinfo.lhash2[st->bucket]; 2318 if (hlist_empty(&ilb2->head)) 2319 continue; 2320 2321 spin_lock(&ilb2->lock); 2322 inet_lhash2_for_each_icsk(icsk, &ilb2->head) { 2323 sk = (struct sock *)icsk; 2324 if (seq_sk_match(seq, sk)) 2325 return sk; 2326 } 2327 spin_unlock(&ilb2->lock); 2328 } 2329 2330 return NULL; 2331 } 2332 2333 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket). 2334 * If "cur" is the last one in the st->bucket, 2335 * call listening_get_first() to return the first sk of the next 2336 * non empty bucket. 
2337 */ 2338 static void *listening_get_next(struct seq_file *seq, void *cur) 2339 { 2340 struct tcp_iter_state *st = seq->private; 2341 struct inet_listen_hashbucket *ilb2; 2342 struct inet_connection_sock *icsk; 2343 struct sock *sk = cur; 2344 2345 ++st->num; 2346 ++st->offset; 2347 2348 icsk = inet_csk(sk); 2349 inet_lhash2_for_each_icsk_continue(icsk) { 2350 sk = (struct sock *)icsk; 2351 if (seq_sk_match(seq, sk)) 2352 return sk; 2353 } 2354 2355 ilb2 = &tcp_hashinfo.lhash2[st->bucket]; 2356 spin_unlock(&ilb2->lock); 2357 ++st->bucket; 2358 return listening_get_first(seq); 2359 } 2360 2361 static void *listening_get_idx(struct seq_file *seq, loff_t *pos) 2362 { 2363 struct tcp_iter_state *st = seq->private; 2364 void *rc; 2365 2366 st->bucket = 0; 2367 st->offset = 0; 2368 rc = listening_get_first(seq); 2369 2370 while (rc && *pos) { 2371 rc = listening_get_next(seq, rc); 2372 --*pos; 2373 } 2374 return rc; 2375 } 2376 2377 static inline bool empty_bucket(const struct tcp_iter_state *st) 2378 { 2379 return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain); 2380 } 2381 2382 /* 2383 * Get first established socket starting from bucket given in st->bucket. 2384 * If st->bucket is zero, the very first socket in the hash is returned. 2385 */ 2386 static void *established_get_first(struct seq_file *seq) 2387 { 2388 struct tcp_iter_state *st = seq->private; 2389 2390 st->offset = 0; 2391 for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) { 2392 struct sock *sk; 2393 struct hlist_nulls_node *node; 2394 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket); 2395 2396 /* Lockless fast path for the common case of empty buckets */ 2397 if (empty_bucket(st)) 2398 continue; 2399 2400 spin_lock_bh(lock); 2401 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) { 2402 if (seq_sk_match(seq, sk)) 2403 return sk; 2404 } 2405 spin_unlock_bh(lock); 2406 } 2407 2408 return NULL; 2409 } 2410 2411 static void *established_get_next(struct seq_file *seq, void *cur) 2412 { 2413 struct sock *sk = cur; 2414 struct hlist_nulls_node *node; 2415 struct tcp_iter_state *st = seq->private; 2416 2417 ++st->num; 2418 ++st->offset; 2419 2420 sk = sk_nulls_next(sk); 2421 2422 sk_nulls_for_each_from(sk, node) { 2423 if (seq_sk_match(seq, sk)) 2424 return sk; 2425 } 2426 2427 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); 2428 ++st->bucket; 2429 return established_get_first(seq); 2430 } 2431 2432 static void *established_get_idx(struct seq_file *seq, loff_t pos) 2433 { 2434 struct tcp_iter_state *st = seq->private; 2435 void *rc; 2436 2437 st->bucket = 0; 2438 rc = established_get_first(seq); 2439 2440 while (rc && pos) { 2441 rc = established_get_next(seq, rc); 2442 --pos; 2443 } 2444 return rc; 2445 } 2446 2447 static void *tcp_get_idx(struct seq_file *seq, loff_t pos) 2448 { 2449 void *rc; 2450 struct tcp_iter_state *st = seq->private; 2451 2452 st->state = TCP_SEQ_STATE_LISTENING; 2453 rc = listening_get_idx(seq, &pos); 2454 2455 if (!rc) { 2456 st->state = TCP_SEQ_STATE_ESTABLISHED; 2457 rc = established_get_idx(seq, pos); 2458 } 2459 2460 return rc; 2461 } 2462 2463 static void *tcp_seek_last_pos(struct seq_file *seq) 2464 { 2465 struct tcp_iter_state *st = seq->private; 2466 int bucket = st->bucket; 2467 int offset = st->offset; 2468 int orig_num = st->num; 2469 void *rc = NULL; 2470 2471 switch (st->state) { 2472 case TCP_SEQ_STATE_LISTENING: 2473 if (st->bucket > tcp_hashinfo.lhash2_mask) 2474 break; 2475 st->state = TCP_SEQ_STATE_LISTENING; 2476 rc = 
listening_get_first(seq); 2477 while (offset-- && rc && bucket == st->bucket) 2478 rc = listening_get_next(seq, rc); 2479 if (rc) 2480 break; 2481 st->bucket = 0; 2482 st->state = TCP_SEQ_STATE_ESTABLISHED; 2483 fallthrough; 2484 case TCP_SEQ_STATE_ESTABLISHED: 2485 if (st->bucket > tcp_hashinfo.ehash_mask) 2486 break; 2487 rc = established_get_first(seq); 2488 while (offset-- && rc && bucket == st->bucket) 2489 rc = established_get_next(seq, rc); 2490 } 2491 2492 st->num = orig_num; 2493 2494 return rc; 2495 } 2496 2497 void *tcp_seq_start(struct seq_file *seq, loff_t *pos) 2498 { 2499 struct tcp_iter_state *st = seq->private; 2500 void *rc; 2501 2502 if (*pos && *pos == st->last_pos) { 2503 rc = tcp_seek_last_pos(seq); 2504 if (rc) 2505 goto out; 2506 } 2507 2508 st->state = TCP_SEQ_STATE_LISTENING; 2509 st->num = 0; 2510 st->bucket = 0; 2511 st->offset = 0; 2512 rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; 2513 2514 out: 2515 st->last_pos = *pos; 2516 return rc; 2517 } 2518 EXPORT_SYMBOL(tcp_seq_start); 2519 2520 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) 2521 { 2522 struct tcp_iter_state *st = seq->private; 2523 void *rc = NULL; 2524 2525 if (v == SEQ_START_TOKEN) { 2526 rc = tcp_get_idx(seq, 0); 2527 goto out; 2528 } 2529 2530 switch (st->state) { 2531 case TCP_SEQ_STATE_LISTENING: 2532 rc = listening_get_next(seq, v); 2533 if (!rc) { 2534 st->state = TCP_SEQ_STATE_ESTABLISHED; 2535 st->bucket = 0; 2536 st->offset = 0; 2537 rc = established_get_first(seq); 2538 } 2539 break; 2540 case TCP_SEQ_STATE_ESTABLISHED: 2541 rc = established_get_next(seq, v); 2542 break; 2543 } 2544 out: 2545 ++*pos; 2546 st->last_pos = *pos; 2547 return rc; 2548 } 2549 EXPORT_SYMBOL(tcp_seq_next); 2550 2551 void tcp_seq_stop(struct seq_file *seq, void *v) 2552 { 2553 struct tcp_iter_state *st = seq->private; 2554 2555 switch (st->state) { 2556 case TCP_SEQ_STATE_LISTENING: 2557 if (v != SEQ_START_TOKEN) 2558 spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock); 2559 break; 2560 case TCP_SEQ_STATE_ESTABLISHED: 2561 if (v) 2562 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); 2563 break; 2564 } 2565 } 2566 EXPORT_SYMBOL(tcp_seq_stop); 2567 2568 static void get_openreq4(const struct request_sock *req, 2569 struct seq_file *f, int i) 2570 { 2571 const struct inet_request_sock *ireq = inet_rsk(req); 2572 long delta = req->rsk_timer.expires - jiffies; 2573 2574 seq_printf(f, "%4d: %08X:%04X %08X:%04X" 2575 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK", 2576 i, 2577 ireq->ir_loc_addr, 2578 ireq->ir_num, 2579 ireq->ir_rmt_addr, 2580 ntohs(ireq->ir_rmt_port), 2581 TCP_SYN_RECV, 2582 0, 0, /* could print option size, but that is af dependent. 
*/ 2583 1, /* timers active (only the expire timer) */ 2584 jiffies_delta_to_clock_t(delta), 2585 req->num_timeout, 2586 from_kuid_munged(seq_user_ns(f), 2587 sock_i_uid(req->rsk_listener)), 2588 0, /* non standard timer */ 2589 0, /* open_requests have no inode */ 2590 0, 2591 req); 2592 } 2593 2594 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) 2595 { 2596 int timer_active; 2597 unsigned long timer_expires; 2598 const struct tcp_sock *tp = tcp_sk(sk); 2599 const struct inet_connection_sock *icsk = inet_csk(sk); 2600 const struct inet_sock *inet = inet_sk(sk); 2601 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq; 2602 __be32 dest = inet->inet_daddr; 2603 __be32 src = inet->inet_rcv_saddr; 2604 __u16 destp = ntohs(inet->inet_dport); 2605 __u16 srcp = ntohs(inet->inet_sport); 2606 int rx_queue; 2607 int state; 2608 2609 if (icsk->icsk_pending == ICSK_TIME_RETRANS || 2610 icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT || 2611 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { 2612 timer_active = 1; 2613 timer_expires = icsk->icsk_timeout; 2614 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) { 2615 timer_active = 4; 2616 timer_expires = icsk->icsk_timeout; 2617 } else if (timer_pending(&sk->sk_timer)) { 2618 timer_active = 2; 2619 timer_expires = sk->sk_timer.expires; 2620 } else { 2621 timer_active = 0; 2622 timer_expires = jiffies; 2623 } 2624 2625 state = inet_sk_state_load(sk); 2626 if (state == TCP_LISTEN) 2627 rx_queue = READ_ONCE(sk->sk_ack_backlog); 2628 else 2629 /* Because we don't lock the socket, 2630 * we might find a transient negative value. 2631 */ 2632 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) - 2633 READ_ONCE(tp->copied_seq), 0); 2634 2635 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX " 2636 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d", 2637 i, src, srcp, dest, destp, state, 2638 READ_ONCE(tp->write_seq) - tp->snd_una, 2639 rx_queue, 2640 timer_active, 2641 jiffies_delta_to_clock_t(timer_expires - jiffies), 2642 icsk->icsk_retransmits, 2643 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)), 2644 icsk->icsk_probes_out, 2645 sock_i_ino(sk), 2646 refcount_read(&sk->sk_refcnt), sk, 2647 jiffies_to_clock_t(icsk->icsk_rto), 2648 jiffies_to_clock_t(icsk->icsk_ack.ato), 2649 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk), 2650 tp->snd_cwnd, 2651 state == TCP_LISTEN ? 2652 fastopenq->max_qlen : 2653 (tcp_in_initial_slowstart(tp) ? 
-1 : tp->snd_ssthresh)); 2654 } 2655 2656 static void get_timewait4_sock(const struct inet_timewait_sock *tw, 2657 struct seq_file *f, int i) 2658 { 2659 long delta = tw->tw_timer.expires - jiffies; 2660 __be32 dest, src; 2661 __u16 destp, srcp; 2662 2663 dest = tw->tw_daddr; 2664 src = tw->tw_rcv_saddr; 2665 destp = ntohs(tw->tw_dport); 2666 srcp = ntohs(tw->tw_sport); 2667 2668 seq_printf(f, "%4d: %08X:%04X %08X:%04X" 2669 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK", 2670 i, src, srcp, dest, destp, tw->tw_substate, 0, 0, 2671 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0, 2672 refcount_read(&tw->tw_refcnt), tw); 2673 } 2674 2675 #define TMPSZ 150 2676 2677 static int tcp4_seq_show(struct seq_file *seq, void *v) 2678 { 2679 struct tcp_iter_state *st; 2680 struct sock *sk = v; 2681 2682 seq_setwidth(seq, TMPSZ - 1); 2683 if (v == SEQ_START_TOKEN) { 2684 seq_puts(seq, " sl local_address rem_address st tx_queue " 2685 "rx_queue tr tm->when retrnsmt uid timeout " 2686 "inode"); 2687 goto out; 2688 } 2689 st = seq->private; 2690 2691 if (sk->sk_state == TCP_TIME_WAIT) 2692 get_timewait4_sock(v, seq, st->num); 2693 else if (sk->sk_state == TCP_NEW_SYN_RECV) 2694 get_openreq4(v, seq, st->num); 2695 else 2696 get_tcp4_sock(v, seq, st->num); 2697 out: 2698 seq_pad(seq, '\n'); 2699 return 0; 2700 } 2701 2702 #ifdef CONFIG_BPF_SYSCALL 2703 struct bpf_tcp_iter_state { 2704 struct tcp_iter_state state; 2705 unsigned int cur_sk; 2706 unsigned int end_sk; 2707 unsigned int max_sk; 2708 struct sock **batch; 2709 bool st_bucket_done; 2710 }; 2711 2712 struct bpf_iter__tcp { 2713 __bpf_md_ptr(struct bpf_iter_meta *, meta); 2714 __bpf_md_ptr(struct sock_common *, sk_common); 2715 uid_t uid __aligned(8); 2716 }; 2717 2718 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, 2719 struct sock_common *sk_common, uid_t uid) 2720 { 2721 struct bpf_iter__tcp ctx; 2722 2723 meta->seq_num--; /* skip SEQ_START_TOKEN */ 2724 ctx.meta = meta; 2725 ctx.sk_common = sk_common; 2726 ctx.uid = uid; 2727 return bpf_iter_run_prog(prog, &ctx); 2728 } 2729 2730 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter) 2731 { 2732 while (iter->cur_sk < iter->end_sk) 2733 sock_put(iter->batch[iter->cur_sk++]); 2734 } 2735 2736 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter, 2737 unsigned int new_batch_sz) 2738 { 2739 struct sock **new_batch; 2740 2741 new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz, 2742 GFP_USER | __GFP_NOWARN); 2743 if (!new_batch) 2744 return -ENOMEM; 2745 2746 bpf_iter_tcp_put_batch(iter); 2747 kvfree(iter->batch); 2748 iter->batch = new_batch; 2749 iter->max_sk = new_batch_sz; 2750 2751 return 0; 2752 } 2753 2754 static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq, 2755 struct sock *start_sk) 2756 { 2757 struct bpf_tcp_iter_state *iter = seq->private; 2758 struct tcp_iter_state *st = &iter->state; 2759 struct inet_connection_sock *icsk; 2760 unsigned int expected = 1; 2761 struct sock *sk; 2762 2763 sock_hold(start_sk); 2764 iter->batch[iter->end_sk++] = start_sk; 2765 2766 icsk = inet_csk(start_sk); 2767 inet_lhash2_for_each_icsk_continue(icsk) { 2768 sk = (struct sock *)icsk; 2769 if (seq_sk_match(seq, sk)) { 2770 if (iter->end_sk < iter->max_sk) { 2771 sock_hold(sk); 2772 iter->batch[iter->end_sk++] = sk; 2773 } 2774 expected++; 2775 } 2776 } 2777 spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock); 2778 2779 return expected; 2780 } 2781 2782 static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq, 
2783 struct sock *start_sk)
2784 {
2785 struct bpf_tcp_iter_state *iter = seq->private;
2786 struct tcp_iter_state *st = &iter->state;
2787 struct hlist_nulls_node *node;
2788 unsigned int expected = 1;
2789 struct sock *sk;
2790
2791 sock_hold(start_sk);
2792 iter->batch[iter->end_sk++] = start_sk;
2793
2794 sk = sk_nulls_next(start_sk);
2795 sk_nulls_for_each_from(sk, node) {
2796 if (seq_sk_match(seq, sk)) {
2797 if (iter->end_sk < iter->max_sk) {
2798 sock_hold(sk);
2799 iter->batch[iter->end_sk++] = sk;
2800 }
2801 expected++;
2802 }
2803 }
2804 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2805
2806 return expected;
2807 }
2808
2809 static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
2810 {
2811 struct bpf_tcp_iter_state *iter = seq->private;
2812 struct tcp_iter_state *st = &iter->state;
2813 unsigned int expected;
2814 bool resized = false;
2815 struct sock *sk;
2816
2817 /* The st->bucket is done. Advance directly to the next
2818 * bucket instead of letting tcp_seek_last_pos() skip through
2819 * the current bucket one entry at a time only to find out
2820 * it has to advance to the next bucket.
2821 */
2822 if (iter->st_bucket_done) {
2823 st->offset = 0;
2824 st->bucket++;
2825 if (st->state == TCP_SEQ_STATE_LISTENING &&
2826 st->bucket > tcp_hashinfo.lhash2_mask) {
2827 st->state = TCP_SEQ_STATE_ESTABLISHED;
2828 st->bucket = 0;
2829 }
2830 }
2831
2832 again:
2833 /* Get a new batch */
2834 iter->cur_sk = 0;
2835 iter->end_sk = 0;
2836 iter->st_bucket_done = false;
2837
2838 sk = tcp_seek_last_pos(seq);
2839 if (!sk)
2840 return NULL; /* Done */
2841
2842 if (st->state == TCP_SEQ_STATE_LISTENING)
2843 expected = bpf_iter_tcp_listening_batch(seq, sk);
2844 else
2845 expected = bpf_iter_tcp_established_batch(seq, sk);
2846
2847 if (iter->end_sk == expected) {
2848 iter->st_bucket_done = true;
2849 return sk;
2850 }
2851
2852 if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) {
2853 resized = true;
2854 goto again;
2855 }
2856
2857 return sk;
2858 }
2859
2860 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
2861 {
2862 /* The bpf iterator does not support lseek, so it always
2863 * continues from where it was stop()-ped.
2864 */
2865 if (*pos)
2866 return bpf_iter_tcp_batch(seq);
2867
2868 return SEQ_START_TOKEN;
2869 }
2870
2871 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2872 {
2873 struct bpf_tcp_iter_state *iter = seq->private;
2874 struct tcp_iter_state *st = &iter->state;
2875 struct sock *sk;
2876
2877 /* Whenever seq_next() is called, the sk at iter->cur_sk is
2878 * done with seq_show(), so advance to the next sk in
2879 * the batch.
2880 */
2881 if (iter->cur_sk < iter->end_sk) {
2882 /* Keep st->num consistent in tcp_iter_state.
2883 * bpf_iter_tcp does not use st->num;
2884 * meta.seq_num is used instead.
2885 */
2886 st->num++;
2887 /* Move st->offset to the next sk in the bucket such that
2888 * the future start() will resume at st->offset in
2889 * st->bucket. See tcp_seek_last_pos().
2890 */
2891 st->offset++;
2892 sock_put(iter->batch[iter->cur_sk++]);
2893 }
2894
2895 if (iter->cur_sk < iter->end_sk)
2896 sk = iter->batch[iter->cur_sk];
2897 else
2898 sk = bpf_iter_tcp_batch(seq);
2899
2900 ++*pos;
2901 /* Keep st->last_pos consistent in tcp_iter_state;
2902 * the bpf iterator does not do lseek, so st->last_pos always equals *pos.
2903 */ 2904 st->last_pos = *pos; 2905 return sk; 2906 } 2907 2908 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v) 2909 { 2910 struct bpf_iter_meta meta; 2911 struct bpf_prog *prog; 2912 struct sock *sk = v; 2913 bool slow; 2914 uid_t uid; 2915 int ret; 2916 2917 if (v == SEQ_START_TOKEN) 2918 return 0; 2919 2920 if (sk_fullsock(sk)) 2921 slow = lock_sock_fast(sk); 2922 2923 if (unlikely(sk_unhashed(sk))) { 2924 ret = SEQ_SKIP; 2925 goto unlock; 2926 } 2927 2928 if (sk->sk_state == TCP_TIME_WAIT) { 2929 uid = 0; 2930 } else if (sk->sk_state == TCP_NEW_SYN_RECV) { 2931 const struct request_sock *req = v; 2932 2933 uid = from_kuid_munged(seq_user_ns(seq), 2934 sock_i_uid(req->rsk_listener)); 2935 } else { 2936 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)); 2937 } 2938 2939 meta.seq = seq; 2940 prog = bpf_iter_get_info(&meta, false); 2941 ret = tcp_prog_seq_show(prog, &meta, v, uid); 2942 2943 unlock: 2944 if (sk_fullsock(sk)) 2945 unlock_sock_fast(sk, slow); 2946 return ret; 2947 2948 } 2949 2950 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v) 2951 { 2952 struct bpf_tcp_iter_state *iter = seq->private; 2953 struct bpf_iter_meta meta; 2954 struct bpf_prog *prog; 2955 2956 if (!v) { 2957 meta.seq = seq; 2958 prog = bpf_iter_get_info(&meta, true); 2959 if (prog) 2960 (void)tcp_prog_seq_show(prog, &meta, v, 0); 2961 } 2962 2963 if (iter->cur_sk < iter->end_sk) { 2964 bpf_iter_tcp_put_batch(iter); 2965 iter->st_bucket_done = false; 2966 } 2967 } 2968 2969 static const struct seq_operations bpf_iter_tcp_seq_ops = { 2970 .show = bpf_iter_tcp_seq_show, 2971 .start = bpf_iter_tcp_seq_start, 2972 .next = bpf_iter_tcp_seq_next, 2973 .stop = bpf_iter_tcp_seq_stop, 2974 }; 2975 #endif 2976 static unsigned short seq_file_family(const struct seq_file *seq) 2977 { 2978 const struct tcp_seq_afinfo *afinfo; 2979 2980 #ifdef CONFIG_BPF_SYSCALL 2981 /* Iterated from bpf_iter. Let the bpf prog to filter instead. */ 2982 if (seq->op == &bpf_iter_tcp_seq_ops) 2983 return AF_UNSPEC; 2984 #endif 2985 2986 /* Iterated from proc fs */ 2987 afinfo = PDE_DATA(file_inode(seq->file)); 2988 return afinfo->family; 2989 } 2990 2991 static const struct seq_operations tcp4_seq_ops = { 2992 .show = tcp4_seq_show, 2993 .start = tcp_seq_start, 2994 .next = tcp_seq_next, 2995 .stop = tcp_seq_stop, 2996 }; 2997 2998 static struct tcp_seq_afinfo tcp4_seq_afinfo = { 2999 .family = AF_INET, 3000 }; 3001 3002 static int __net_init tcp4_proc_init_net(struct net *net) 3003 { 3004 if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops, 3005 sizeof(struct tcp_iter_state), &tcp4_seq_afinfo)) 3006 return -ENOMEM; 3007 return 0; 3008 } 3009 3010 static void __net_exit tcp4_proc_exit_net(struct net *net) 3011 { 3012 remove_proc_entry("tcp", net->proc_net); 3013 } 3014 3015 static struct pernet_operations tcp4_net_ops = { 3016 .init = tcp4_proc_init_net, 3017 .exit = tcp4_proc_exit_net, 3018 }; 3019 3020 int __init tcp4_proc_init(void) 3021 { 3022 return register_pernet_subsys(&tcp4_net_ops); 3023 } 3024 3025 void tcp4_proc_exit(void) 3026 { 3027 unregister_pernet_subsys(&tcp4_net_ops); 3028 } 3029 #endif /* CONFIG_PROC_FS */ 3030 3031 /* @wake is one when sk_stream_write_space() calls us. 3032 * This sends EPOLLOUT only if notsent_bytes is half the limit. 3033 * This mimics the strategy used in sock_def_write_space(). 
3034 */ 3035 bool tcp_stream_memory_free(const struct sock *sk, int wake) 3036 { 3037 const struct tcp_sock *tp = tcp_sk(sk); 3038 u32 notsent_bytes = READ_ONCE(tp->write_seq) - 3039 READ_ONCE(tp->snd_nxt); 3040 3041 return (notsent_bytes << wake) < tcp_notsent_lowat(tp); 3042 } 3043 EXPORT_SYMBOL(tcp_stream_memory_free); 3044 3045 struct proto tcp_prot = { 3046 .name = "TCP", 3047 .owner = THIS_MODULE, 3048 .close = tcp_close, 3049 .pre_connect = tcp_v4_pre_connect, 3050 .connect = tcp_v4_connect, 3051 .disconnect = tcp_disconnect, 3052 .accept = inet_csk_accept, 3053 .ioctl = tcp_ioctl, 3054 .init = tcp_v4_init_sock, 3055 .destroy = tcp_v4_destroy_sock, 3056 .shutdown = tcp_shutdown, 3057 .setsockopt = tcp_setsockopt, 3058 .getsockopt = tcp_getsockopt, 3059 .bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt, 3060 .keepalive = tcp_set_keepalive, 3061 .recvmsg = tcp_recvmsg, 3062 .sendmsg = tcp_sendmsg, 3063 .sendpage = tcp_sendpage, 3064 .backlog_rcv = tcp_v4_do_rcv, 3065 .release_cb = tcp_release_cb, 3066 .hash = inet_hash, 3067 .unhash = inet_unhash, 3068 .get_port = inet_csk_get_port, 3069 #ifdef CONFIG_BPF_SYSCALL 3070 .psock_update_sk_prot = tcp_bpf_update_proto, 3071 #endif 3072 .enter_memory_pressure = tcp_enter_memory_pressure, 3073 .leave_memory_pressure = tcp_leave_memory_pressure, 3074 .stream_memory_free = tcp_stream_memory_free, 3075 .sockets_allocated = &tcp_sockets_allocated, 3076 .orphan_count = &tcp_orphan_count, 3077 .memory_allocated = &tcp_memory_allocated, 3078 .memory_pressure = &tcp_memory_pressure, 3079 .sysctl_mem = sysctl_tcp_mem, 3080 .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem), 3081 .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), 3082 .max_header = MAX_TCP_HEADER, 3083 .obj_size = sizeof(struct tcp_sock), 3084 .slab_flags = SLAB_TYPESAFE_BY_RCU, 3085 .twsk_prot = &tcp_timewait_sock_ops, 3086 .rsk_prot = &tcp_request_sock_ops, 3087 .h.hashinfo = &tcp_hashinfo, 3088 .no_autobind = true, 3089 .diag_destroy = tcp_abort, 3090 }; 3091 EXPORT_SYMBOL(tcp_prot); 3092 3093 static void __net_exit tcp_sk_exit(struct net *net) 3094 { 3095 int cpu; 3096 3097 if (net->ipv4.tcp_congestion_control) 3098 bpf_module_put(net->ipv4.tcp_congestion_control, 3099 net->ipv4.tcp_congestion_control->owner); 3100 3101 for_each_possible_cpu(cpu) 3102 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu)); 3103 free_percpu(net->ipv4.tcp_sk); 3104 } 3105 3106 static int __net_init tcp_sk_init(struct net *net) 3107 { 3108 int res, cpu, cnt; 3109 3110 net->ipv4.tcp_sk = alloc_percpu(struct sock *); 3111 if (!net->ipv4.tcp_sk) 3112 return -ENOMEM; 3113 3114 for_each_possible_cpu(cpu) { 3115 struct sock *sk; 3116 3117 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW, 3118 IPPROTO_TCP, net); 3119 if (res) 3120 goto fail; 3121 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); 3122 3123 /* Please enforce IP_DF and IPID==0 for RST and 3124 * ACK sent in SYN-RECV and TIME-WAIT state. 
3125 */ 3126 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO; 3127 3128 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk; 3129 } 3130 3131 net->ipv4.sysctl_tcp_ecn = 2; 3132 net->ipv4.sysctl_tcp_ecn_fallback = 1; 3133 3134 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS; 3135 net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS; 3136 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD; 3137 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL; 3138 net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS; 3139 3140 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME; 3141 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES; 3142 net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL; 3143 3144 net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES; 3145 net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES; 3146 net->ipv4.sysctl_tcp_syncookies = 1; 3147 net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH; 3148 net->ipv4.sysctl_tcp_retries1 = TCP_RETR1; 3149 net->ipv4.sysctl_tcp_retries2 = TCP_RETR2; 3150 net->ipv4.sysctl_tcp_orphan_retries = 0; 3151 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT; 3152 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX; 3153 net->ipv4.sysctl_tcp_tw_reuse = 2; 3154 net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1; 3155 3156 cnt = tcp_hashinfo.ehash_mask + 1; 3157 net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2; 3158 net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo; 3159 3160 net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128); 3161 net->ipv4.sysctl_tcp_sack = 1; 3162 net->ipv4.sysctl_tcp_window_scaling = 1; 3163 net->ipv4.sysctl_tcp_timestamps = 1; 3164 net->ipv4.sysctl_tcp_early_retrans = 3; 3165 net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION; 3166 net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */ 3167 net->ipv4.sysctl_tcp_retrans_collapse = 1; 3168 net->ipv4.sysctl_tcp_max_reordering = 300; 3169 net->ipv4.sysctl_tcp_dsack = 1; 3170 net->ipv4.sysctl_tcp_app_win = 31; 3171 net->ipv4.sysctl_tcp_adv_win_scale = 1; 3172 net->ipv4.sysctl_tcp_frto = 2; 3173 net->ipv4.sysctl_tcp_moderate_rcvbuf = 1; 3174 /* This limits the percentage of the congestion window which we 3175 * will allow a single TSO frame to consume. Building TSO frames 3176 * which are too large can cause TCP streams to be bursty. 
3177 */ 3178 net->ipv4.sysctl_tcp_tso_win_divisor = 3; 3179 /* Default TSQ limit of 16 TSO segments */ 3180 net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536; 3181 /* rfc5961 challenge ack rate limiting */ 3182 net->ipv4.sysctl_tcp_challenge_ack_limit = 1000; 3183 net->ipv4.sysctl_tcp_min_tso_segs = 2; 3184 net->ipv4.sysctl_tcp_min_rtt_wlen = 300; 3185 net->ipv4.sysctl_tcp_autocorking = 1; 3186 net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2; 3187 net->ipv4.sysctl_tcp_pacing_ss_ratio = 200; 3188 net->ipv4.sysctl_tcp_pacing_ca_ratio = 120; 3189 if (net != &init_net) { 3190 memcpy(net->ipv4.sysctl_tcp_rmem, 3191 init_net.ipv4.sysctl_tcp_rmem, 3192 sizeof(init_net.ipv4.sysctl_tcp_rmem)); 3193 memcpy(net->ipv4.sysctl_tcp_wmem, 3194 init_net.ipv4.sysctl_tcp_wmem, 3195 sizeof(init_net.ipv4.sysctl_tcp_wmem)); 3196 } 3197 net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC; 3198 net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC; 3199 net->ipv4.sysctl_tcp_comp_sack_nr = 44; 3200 net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; 3201 net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0; 3202 atomic_set(&net->ipv4.tfo_active_disable_times, 0); 3203 3204 /* Reno is always built in */ 3205 if (!net_eq(net, &init_net) && 3206 bpf_try_module_get(init_net.ipv4.tcp_congestion_control, 3207 init_net.ipv4.tcp_congestion_control->owner)) 3208 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control; 3209 else 3210 net->ipv4.tcp_congestion_control = &tcp_reno; 3211 3212 return 0; 3213 fail: 3214 tcp_sk_exit(net); 3215 3216 return res; 3217 } 3218 3219 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list) 3220 { 3221 struct net *net; 3222 3223 inet_twsk_purge(&tcp_hashinfo, AF_INET); 3224 3225 list_for_each_entry(net, net_exit_list, exit_list) 3226 tcp_fastopen_ctx_destroy(net); 3227 } 3228 3229 static struct pernet_operations __net_initdata tcp_sk_ops = { 3230 .init = tcp_sk_init, 3231 .exit = tcp_sk_exit, 3232 .exit_batch = tcp_sk_exit_batch, 3233 }; 3234 3235 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 3236 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta, 3237 struct sock_common *sk_common, uid_t uid) 3238 3239 #define INIT_BATCH_SZ 16 3240 3241 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux) 3242 { 3243 struct bpf_tcp_iter_state *iter = priv_data; 3244 int err; 3245 3246 err = bpf_iter_init_seq_net(priv_data, aux); 3247 if (err) 3248 return err; 3249 3250 err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ); 3251 if (err) { 3252 bpf_iter_fini_seq_net(priv_data); 3253 return err; 3254 } 3255 3256 return 0; 3257 } 3258 3259 static void bpf_iter_fini_tcp(void *priv_data) 3260 { 3261 struct bpf_tcp_iter_state *iter = priv_data; 3262 3263 bpf_iter_fini_seq_net(priv_data); 3264 kvfree(iter->batch); 3265 } 3266 3267 static const struct bpf_iter_seq_info tcp_seq_info = { 3268 .seq_ops = &bpf_iter_tcp_seq_ops, 3269 .init_seq_private = bpf_iter_init_tcp, 3270 .fini_seq_private = bpf_iter_fini_tcp, 3271 .seq_priv_size = sizeof(struct bpf_tcp_iter_state), 3272 }; 3273 3274 static const struct bpf_func_proto * 3275 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id, 3276 const struct bpf_prog *prog) 3277 { 3278 switch (func_id) { 3279 case BPF_FUNC_setsockopt: 3280 return &bpf_sk_setsockopt_proto; 3281 case BPF_FUNC_getsockopt: 3282 return &bpf_sk_getsockopt_proto; 3283 default: 3284 return NULL; 3285 } 3286 } 3287 3288 static struct bpf_iter_reg tcp_reg_info = { 3289 .target = "tcp", 3290 .ctx_arg_info_size = 1, 3291 
.ctx_arg_info = { 3292 { offsetof(struct bpf_iter__tcp, sk_common), 3293 PTR_TO_BTF_ID_OR_NULL }, 3294 }, 3295 .get_func_proto = bpf_iter_tcp_get_func_proto, 3296 .seq_info = &tcp_seq_info, 3297 }; 3298 3299 static void __init bpf_iter_register(void) 3300 { 3301 tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON]; 3302 if (bpf_iter_reg_target(&tcp_reg_info)) 3303 pr_warn("Warning: could not register bpf iterator tcp\n"); 3304 } 3305 3306 #endif 3307 3308 void __init tcp_v4_init(void) 3309 { 3310 if (register_pernet_subsys(&tcp_sk_ops)) 3311 panic("Failed to create the TCP control socket.\n"); 3312 3313 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 3314 bpf_iter_register(); 3315 #endif 3316 } 3317
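
/* A minimal userspace-style sketch (kept out of the build with #if 0) of the
 * eligibility test tcp_add_backlog() applies before coalescing a freshly
 * received segment into the tail of the socket backlog: the two segments must
 * be sequence-contiguous, carry the same DSCP/ECN byte, carry no SYN/RST/URG,
 * both carry ACK, agree on ECE/CWR, and have byte-identical TCP options.
 * The struct and flag names below are simplified assumptions, not the
 * kernel's struct tcp_skb_cb.
 */
#if 0
#include <stdbool.h>
#include <stdint.h>
#include <string.h>

struct seg {
	uint32_t seq;
	uint32_t end_seq;
	uint8_t  dsfield;	/* TOS/ECN byte, cf. TCP_SKB_CB(skb)->ip_dsfield */
	uint8_t  flags;		/* TCP header flag bits */
	uint8_t  doff;		/* header length in 32-bit words */
	uint8_t  opts[40];	/* raw TCP options */
};

#define F_SYN 0x02
#define F_RST 0x04
#define F_ACK 0x10
#define F_URG 0x20
#define F_ECE 0x40
#define F_CWR 0x80

/* Mirrors the "goto no_coalesce" tests in tcp_add_backlog(). */
static bool can_coalesce(const struct seg *tail, const struct seg *skb)
{
	if (tail->end_seq != skb->seq)		/* must be contiguous */
		return false;
	if (tail->dsfield != skb->dsfield)	/* same TOS/ECN byte */
		return false;
	if ((tail->flags | skb->flags) & (F_SYN | F_RST | F_URG))
		return false;			/* neither may carry SYN/RST/URG */
	if (!((tail->flags & skb->flags) & F_ACK))
		return false;			/* ACK must be set on both */
	if ((tail->flags ^ skb->flags) & (F_ECE | F_CWR))
		return false;			/* ECN flags must match */
	if (tail->doff != skb->doff ||
	    memcmp(tail->opts, skb->opts, (size_t)(tail->doff - 5) * 4))
		return false;			/* identical header options */
	return true;
}
#endif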
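
/* A small sketch of the sequence-space arithmetic tcp_v4_fill_cb() performs:
 * end_seq = seq + SYN + FIN + payload length, because SYN and FIN each
 * consume one unit of sequence space. The helper name is illustrative only.
 */
#if 0
#include <stdint.h>

static uint32_t tcp_end_seq(uint32_t seq, unsigned int syn, unsigned int fin,
			    uint32_t skb_len, uint32_t doff_words)
{
	/* e.g. seq 1000, 20-byte header in a 120-byte skb, FIN set
	 * -> end_seq = 1000 + 0 + 1 + 100 = 1101.
	 */
	return seq + syn + fin + (skb_len - doff_words * 4);
}
#endif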
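
/* A userspace-style sketch of consuming the format that tcp4_seq_show() and
 * get_tcp4_sock() emit through /proc/net/tcp: addresses and ports are printed
 * as hexadecimal (%08X:%04X) and the state as a hex byte. Assumes a Linux
 * system with /proc mounted; error handling is intentionally minimal.
 */
#if 0
#include <stdio.h>

int main(void)
{
	unsigned int sl, laddr, lport, raddr, rport, state;
	char line[512];
	FILE *f = fopen("/proc/net/tcp", "r");

	if (!f)
		return 1;
	if (!fgets(line, sizeof(line), f)) {	/* skip the header row */
		fclose(f);
		return 1;
	}
	while (fgets(line, sizeof(line), f)) {
		if (sscanf(line, " %u: %x:%x %x:%x %x",
			   &sl, &laddr, &lport, &raddr, &rport, &state) == 6)
			printf("sl=%u local=%08X:%u remote=%08X:%u st=%02X\n",
			       sl, laddr, lport, raddr, rport, state);
	}
	fclose(f);
	return 0;
}
#endif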
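
/* A worked userspace-style restatement of the test in tcp_stream_memory_free():
 * the unsent byte count is shifted left by @wake before the comparison, so a
 * wakeup from sk_stream_write_space() (wake == 1) signals EPOLLOUT only once
 * the unsent data has dropped below half of the notsent_lowat budget.
 * E.g. with lowat = 131072: 70000 unsent bytes -> no wakeup, 60000 -> wakeup.
 * Parameter names are illustrative, not the kernel's.
 */
#if 0
#include <stdbool.h>
#include <stdint.h>

static bool stream_memory_free(uint32_t write_seq, uint32_t snd_nxt,
			       uint32_t notsent_lowat, int wake)
{
	uint32_t notsent_bytes = write_seq - snd_nxt;

	return (notsent_bytes << wake) < notsent_lowat;
}
#endif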
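
/* A simplified, self-contained sketch of the batching pattern used by
 * bpf_iter_tcp_batch()/bpf_iter_tcp_realloc_batch(): fill the batch from one
 * bucket, and if the array was too small to hold the whole bucket, grow it to
 * 3/2 of the required size and retry exactly once. All names below are
 * illustrative stand-ins, not kernel symbols.
 */
#if 0
#include <stdbool.h>
#include <stdlib.h>

struct batch {
	const void **items;
	unsigned int max;
	unsigned int end;
};

/* Stand-in for walking a hash bucket: stores up to b->max entries and
 * reports how many entries the bucket really holds.
 */
static unsigned int fill_from_bucket(struct batch *b, const void **bucket,
				     unsigned int bucket_sz)
{
	unsigned int i;

	for (i = 0; i < bucket_sz; i++)
		if (b->end < b->max)
			b->items[b->end++] = bucket[i];
	return bucket_sz;
}

static bool batch_bucket(struct batch *b, const void **bucket,
			 unsigned int bucket_sz)
{
	bool resized = false;
	unsigned int expected;

again:
	b->end = 0;
	expected = fill_from_bucket(b, bucket, bucket_sz);
	if (b->end == expected)
		return true;			/* whole bucket captured */

	if (!resized) {
		unsigned int want = expected * 3 / 2;
		const void **bigger = realloc(b->items, sizeof(*bigger) * want);

		if (bigger) {
			b->items = bigger;
			b->max = want;
			resized = true;
			goto again;
		}
	}
	return false;				/* carry on with a partial batch */
}
#endif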