// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 *		IPv4 specific functions
 *
 *		code split from:
 *		linux/ipv4/tcp.c
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *		See tcp.c for author information
 */

/*
 * Changes:
 *		David S. Miller	:	New socket lookup architecture.
 *					This code is dedicated to John Dyson.
 *		David S. Miller :	Change semantics of established hash,
 *					half is devoted to TIME_WAIT sockets
 *					and the rest go in the other half.
 *		Andi Kleen :		Add support for syncookies and fixed
 *					some bugs: ip options weren't passed to
 *					the TCP layer, missed a check for an
 *					ACK bit.
 *		Andi Kleen :		Implemented fast path mtu discovery.
 *	     				Fixed many serious bugs in the
 *					request_sock handling and moved
 *					most of it into the af independent code.
 *					Added tail drop and some other bugfixes.
 *					Added new listen semantics.
 *		Mike McLagan	:	Routing by source
 *	Juan Jose Ciarlante:		ip_dynaddr bits
 *		Andi Kleen:		various fixes.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year
 *					coma.
 *	Andi Kleen		:	Fix new listen.
 *	Andi Kleen		:	Fix accept error reporting.
 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
 *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
 *					a single port at the same time.
 */

#define pr_fmt(fmt) "TCP: " fmt

#include <linux/bottom_half.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
#include <linux/slab.h>

#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/secure_seq.h>
#include <net/busy_poll.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/inetdevice.h>
#include <linux/btf_ids.h>

#include <crypto/hash.h>
#include <linux/scatterlist.h>

#include <trace/events/tcp.h>

#ifdef CONFIG_TCP_MD5SIG
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
#endif

/* Hash tables handed to the socket lookups performed in this file. */
struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);

/* Per-CPU socket used as the sender of the stateless RST/ACK replies
 * built by tcp_v4_send_reset() and tcp_v4_send_ack().
 */
static DEFINE_PER_CPU(struct sock *, ipv4_tcp_sk);

/* Derive the initial sequence number for a reply to @skb.
 * The addresses and ports are read from the packet's own headers and are
 * passed destination first, then source.
 */
static u32 tcp_v4_init_seq(const struct sk_buff *skb)
{
	return secure_tcp_seq(ip_hdr(skb)->daddr,
			      ip_hdr(skb)->saddr,
			      tcp_hdr(skb)->dest,
			      tcp_hdr(skb)->source);
}

/* Derive the timestamp offset for a reply to @skb from @net and the
 * packet's destination and source addresses.
 */
static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
{
	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
}

/* Decide whether the TIME-WAIT socket @sktw may be taken over by the
 * connection being set up on @sk.
 *
 * Returns 1 when reuse is allowed; in that case a reference on @sktw has
 * been taken with sock_hold(). Returns 0 otherwise.
 *
 * Reuse needs a recorded timestamp in the TIME-WAIT bucket
 * (tw_ts_recent_stamp != 0). When @twp is non-NULL it additionally needs
 * the tcp_tw_reuse sysctl to allow it and the current time (in seconds) to
 * be later than that stamp. A sysctl value of 2 allows reuse for loopback
 * traffic only.
 */
int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
	const struct inet_timewait_sock *tw = inet_twsk(sktw);
	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
	struct tcp_sock *tp = tcp_sk(sk);
	int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;

	if (reuse == 2) {
		/* Still does not detect *everything* that goes through
		 * lo, since we require a loopback src or dst address
		 * or direct binding to 'lo' interface.
		 */
		bool loopback = false;
		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
			loopback = true;
#if IS_ENABLED(CONFIG_IPV6)
		if (tw->tw_family == AF_INET6) {
			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
				loopback = true;
		} else
#endif
		{
			if (ipv4_is_loopback(tw->tw_daddr) ||
			    ipv4_is_loopback(tw->tw_rcv_saddr))
				loopback = true;
		}
		/* Mode 2 on a non-loopback flow behaves like "reuse disabled". */
		if (!loopback)
			reuse = 0;
	}

	/* With PAWS, it is safe from the viewpoint
	   of data integrity. Even without PAWS it is safe provided sequence
	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.

	   Actually, the idea is close to VJ's one, only timestamp cache is
	   held not per host, but per port pair and TW bucket is used as state
	   holder.

	   If TW bucket has been already destroyed we fall back to VJ's scheme
	   and use initial timestamp retrieved from peer table.
	 */
	if (tcptw->tw_ts_recent_stamp &&
	    (!twp || (reuse && time_after32(ktime_get_seconds(),
					    tcptw->tw_ts_recent_stamp)))) {
		/* In case of repair and re-using TIME-WAIT sockets we still
		 * want to be sure that it is safe as above but honor the
		 * sequence numbers and time stamps set as part of the repair
		 * process.
		 *
		 * Without this check re-using a TIME-WAIT socket with TCP
		 * repair would accumulate a -1 on the repair assigned
		 * sequence number. The first time it is reused the sequence
		 * is -1, the second time -2, etc. This fixes that issue
		 * without appearing to create any others.
		 */
		if (likely(!tp->repair)) {
			u32 seq = tcptw->tw_snd_nxt + 65535 + 2;

			/* 0 is avoided because tcp_v4_connect() treats a zero
			 * write_seq as "not chosen yet".
			 */
			if (!seq)
				seq = 1;
			WRITE_ONCE(tp->write_seq, seq);
			tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
			tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
		}
		sock_hold(sktw);
		return 1;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(tcp_twsk_unique);

/* Run the cgroup BPF "INET4 connect" hook on @uaddr.
 * Returns -EINVAL when @addr_len is shorter than a struct sockaddr_in,
 * otherwise whatever the hook returns.
 */
static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
			      int addr_len)
{
	/* This check is replicated from tcp_v4_connect() and intended to
	 * prevent BPF program called below from accessing bytes that are out
	 * of the bound specified by user in addr_len.
	 */
	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	sock_owned_by_me(sk);

	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
}

/* This will initiate an outgoing connection.
 *
 * @sk:       socket to connect
 * @uaddr:    destination, interpreted as a struct sockaddr_in
 * @addr_len: number of valid bytes behind @uaddr
 *
 * Returns 0 on success. When tcp_fastopen_defer_connect() defers the
 * connect, the value it stored in err is returned instead.
 * Errors visible here:
 *   -EINVAL        @addr_len too short, or source routing is configured
 *                  and the destination address is zero
 *   -EAFNOSUPPORT  sin_family is not AF_INET
 *   -ENETUNREACH   the route is multicast or broadcast
 *   other          error from ip_route_connect(), inet_hash_connect(),
 *                  ip_route_newports() or tcp_connect()
 *
 * Once the socket has been moved to SYN-SENT, every error goes through the
 * "failure" label, which moves it back to CLOSE.
 */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	__be16 orig_sport, orig_dport;
	__be32 daddr, nexthop;
	struct flowi4 *fl4;
	struct rtable *rt;
	int err;
	struct ip_options_rcu *inet_opt;
	struct inet_timewait_death_row *tcp_death_row = sock_net(sk)->ipv4.tcp_death_row;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	inet_opt = rcu_dereference_protected(inet->inet_opt,
					     lockdep_sock_is_held(sk));
	/* With a source-route option the route lookup targets the option's
	 * first-hop address instead of the final destination.
	 */
	if (inet_opt && inet_opt->opt.srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet_opt->opt.faddr;
	}

	orig_sport = inet->inet_sport;
	orig_dport = usin->sin_port;
	fl4 = &inet->cork.fl.u.ip4;
	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
			      IPPROTO_TCP,
			      orig_sport, orig_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		if (err == -ENETUNREACH)
			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
		return err;
	}

	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	/* Without source routing, use the destination the route lookup
	 * left in the flow.
	 */
	if (!inet_opt || !inet_opt->opt.srr)
		daddr = fl4->daddr;

	if (!inet->inet_saddr)
		inet->inet_saddr = fl4->saddr;
	sk_rcv_saddr_set(sk, inet->inet_saddr);

	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent	   = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		if (likely(!tp->repair))
			WRITE_ONCE(tp->write_seq, 0);
	}

	inet->inet_dport = usin->sin_port;
	sk_daddr_set(sk, daddr);

	inet_csk(sk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;

	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and not releasing socket
	 * lock select source port, enter ourselves into the hash tables and
	 * complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(tcp_death_row, sk);
	if (err)
		goto failure;

	sk_set_txhash(sk);

	/* Give the routing code the ports now stored in the socket. */
	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
			       inet->inet_sport, inet->inet_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		rt = NULL;
		goto failure;
	}
	/* OK, now commit destination to socket.  */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->dst);
	/* The failure path below must not see this route any more. */
	rt = NULL;

	if (likely(!tp->repair)) {
		/* A non-zero write_seq (e.g. set by tcp_twsk_unique()) is kept. */
		if (!tp->write_seq)
			WRITE_ONCE(tp->write_seq,
				   secure_tcp_seq(inet->inet_saddr,
						  inet->inet_daddr,
						  inet->inet_sport,
						  usin->sin_port));
		tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
						 inet->inet_saddr,
						 inet->inet_daddr);
	}

	inet->inet_id = prandom_u32();

	if (tcp_fastopen_defer_connect(sk, &err))
		return err;
	if (err)
		goto failure;

	err = tcp_connect(sk);

	if (err)
		goto failure;

	return 0;

failure:
	/*
	 * This unhashes the socket and releases the local port,
	 * if necessary.
	 */
	tcp_set_state(sk, TCP_CLOSE);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->inet_dport = 0;
	return err;
}
EXPORT_SYMBOL(tcp_v4_connect);

/*
 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 * It can be called through tcp_release_cb() if socket was owned by user
 * at the time tcp_v4_err() was called to handle ICMP message.
 *
 * The MTU to apply is read from tcp_sk(sk)->mtu_info. Nothing is done for
 * sockets in LISTEN or CLOSE state, or when inet_csk_update_pmtu() returns
 * no route.
 */
void tcp_v4_mtu_reduced(struct sock *sk)
{
	struct inet_sock *inet = inet_sk(sk);
	struct dst_entry *dst;
	u32 mtu;

	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
		return;
	mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
	dst = inet_csk_update_pmtu(sk, mtu);
	if (!dst)
		return;

	/* Something is about to be wrong... Remember soft error
	 * for the case, if this connection will not able to recover.
	 */
	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
		sk->sk_err_soft = EMSGSIZE;

	/* From here on work with the MTU the route reports. */
	mtu = dst_mtu(dst);

	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    ip_sk_accept_pmtu(sk) &&
	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
		tcp_sync_mss(sk, mtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}
EXPORT_SYMBOL(tcp_v4_mtu_reduced);

/* Hand an ICMP redirect (@skb) to the redirect handler of the socket's
 * route, if __sk_dst_check() returns one.
 */
static void do_redirect(struct sk_buff *skb, struct sock *sk)
{
	struct dst_entry *dst = __sk_dst_check(sk, 0);

	if (dst)
		dst->ops->redirect(dst, sk, skb);
}


/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets
 *
 * @sk:    the request socket (converted with inet_reqsk())
 * @seq:   sequence number quoted in the ICMP payload; it must equal the
 *         request's snt_isn, otherwise only the out-of-window counter is
 *         bumped
 * @abort: when true and @seq matches, the request is removed from its
 *         listener's queue
 *
 * The reference on the request is always released with reqsk_put().
 */
void tcp_req_err(struct sock *sk, u32 seq, bool abort)
{
	struct request_sock *req = inet_reqsk(sk);
	struct net *net = sock_net(sk);

	/* ICMPs are not backlogged, hence we cannot get
	 * an established socket here.
	 */
	if (seq != tcp_rsk(req)->snt_isn) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
	} else if (abort) {
		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
		tcp_listendrop(req->rsk_listener);
	}
	reqsk_put(req);
}
EXPORT_SYMBOL(tcp_req_err);

/* TCP-LD (RFC 6069) logic
 *
 * Take back one step of retransmission-timer backoff on @sk.
 * Nothing happens when the socket is owned by the user, when @seq is not
 * snd_una, or when there are no retransmits or no backoff recorded.
 * Otherwise icsk_backoff is decremented, the RTO is recomputed, and the
 * retransmit timer is re-armed for whatever is left of the new RTO measured
 * from the timestamp of the head of the retransmit queue. If nothing is
 * left, the retransmit timer handler is run immediately.
 */
void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;
	s32 remaining;
	u32 delta_us;

	if (sock_owned_by_user(sk))
		return;

	if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
	    !icsk->icsk_backoff)
		return;

	skb = tcp_rtx_queue_head(sk);
	if (WARN_ON_ONCE(!skb))
		return;

	icsk->icsk_backoff--;
	/* Base RTO first, then re-apply the (now smaller) backoff. */
	icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
	icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);

	tcp_mstamp_refresh(tp);
	delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
	remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);

	if (remaining > 0) {
		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
					  remaining, TCP_RTO_MAX);
	} else {
		/* RTO revert clocked out retransmission.
		 * Will retransmit now.
		 */
		tcp_retransmit_timer(sk);
	}
}
EXPORT_SYMBOL(tcp_ld_RTO_revert);

/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 *
 */

/* ICMP error handler for IPv4 TCP.
 *
 * @skb:  ICMP packet; skb->data points at the IPv4 header quoted in the
 *        ICMP payload, followed by the quoted TCP header
 * @info: value stored as mtu_info for ICMP_FRAG_NEEDED and passed on to
 *        ip_icmp_error()
 *
 * Returns -ENOENT when no socket matches the quoted addresses and ports,
 * 0 in every other case.
 */
int tcp_v4_err(struct sk_buff *skb, u32 info)
{
	const struct iphdr *iph = (const struct iphdr *)skb->data;
	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
	struct tcp_sock *tp;
	struct inet_sock *inet;
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	struct sock *sk;
	struct request_sock *fastopen;
	u32 seq, snd_una;
	int err;
	struct net *net = dev_net(skb->dev);

	/* The quoted header is one of our own outgoing segments, so its
	 * destination is the remote end and its source is the local end.
	 */
	sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
				       th->dest, iph->saddr, ntohs(th->source),
				       inet_iif(skb), 0);
	if (!sk) {
		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
		return -ENOENT;
	}
	if (sk->sk_state == TCP_TIME_WAIT) {
		inet_twsk_put(inet_twsk(sk));
		return 0;
	}
	seq = ntohl(th->seq);
	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		/* tcp_req_err() releases the request's reference itself. */
		tcp_req_err(sk, seq,
			    type == ICMP_PARAMETERPROB ||
			    type == ICMP_TIME_EXCEEDED ||
			    (type == ICMP_DEST_UNREACH &&
			     (code == ICMP_NET_UNREACH ||
			      code == ICMP_HOST_UNREACH)));
		return 0;
	}

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 * We do take care of PMTU discovery (RFC1191) special case :
	 * we can receive locally generated ICMP messages while socket is held.
	 */
	if (sock_owned_by_user(sk)) {
		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
	}
	if (sk->sk_state == TCP_CLOSE)
		goto out;

	if (static_branch_unlikely(&ip4_min_ttl)) {
		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
			goto out;
		}
	}

	tp = tcp_sk(sk);
	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
	fastopen = rcu_dereference(tp->fastopen_rsk);
	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
	/* Except for listeners, the quoted sequence number must lie between
	 * snd_una and snd_nxt.
	 */
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, snd_una, tp->snd_nxt)) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	/* Map the ICMP type/code to an errno; several types are fully
	 * handled here and leave through "out".
	 */
	switch (type) {
	case ICMP_REDIRECT:
		if (!sock_owned_by_user(sk))
			do_redirect(skb, sk);
		goto out;
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			/* We are not interested in TCP_LISTEN and open_requests
			 * (SYN-ACKs send out by Linux are always <576bytes so
			 * they should go through unfragmented).
			 */
			if (sk->sk_state == TCP_LISTEN)
				goto out;

			WRITE_ONCE(tp->mtu_info, info);
			if (!sock_owned_by_user(sk)) {
				tcp_v4_mtu_reduced(sk);
			} else {
				/* Defer the MTU update; the extra reference is
				 * taken only when the deferred flag was not
				 * already set.
				 */
				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
					sock_hold(sk);
			}
			goto out;
		}

		err = icmp_err_convert[code].errno;
		/* check if this ICMP message allows revert of backoff.
		 * (see RFC 6069)
		 */
		if (!fastopen &&
		    (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
			tcp_ld_RTO_revert(sk, seq);
		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->sk_state) {
	case TCP_SYN_SENT:
	case TCP_SYN_RECV:
		/* Only in fast or simultaneous open. If a fast open socket is
		 * already accepted it is treated as a connected one below.
		 */
		if (fastopen && !fastopen->sk)
			break;

		ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);

		if (!sock_owned_by_user(sk)) {
			sk->sk_err = err;

			sk_error_report(sk);

			tcp_done(sk);
		} else {
			sk->sk_err_soft = err;
		}
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows to consider as hard errors
	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note, that in modern internet, where routing is unreliable
	 * and in each dark corner broken firewalls sit, sending random
	 * errors ordered by their masters even these two messages finally lose
	 * their original sense (even Linux sends invalid PORT_UNREACHs)
	 *
	 * Now we are in compliance with RFCs.
	 *							--ANK (980905)
	 */

	inet = inet_sk(sk);
	if (!sock_owned_by_user(sk) && inet->recverr) {
		sk->sk_err = err;
		sk_error_report(sk);
	} else	{ /* Only an error on timeout */
		sk->sk_err_soft = err;
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
	return 0;
}

/* Set up the TCP checksum fields of @skb for the address pair
 * @saddr/@daddr: th->check receives the complement of tcp_v4_check()
 * computed over skb->len with a zero base sum, and csum_start/csum_offset
 * record where the checksum field sits.
 */
void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
{
	struct tcphdr *th = tcp_hdr(skb);

	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
	skb->csum_start = skb_transport_header(skb) - skb->head;
	skb->csum_offset = offsetof(struct tcphdr, check);
}

/* This routine computes an IPv4 TCP checksum.
 * It calls __tcp_v4_send_check() with the socket's source and destination
 * addresses.
 */
void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
{
	const struct inet_sock *inet = inet_sk(sk);

	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
}
EXPORT_SYMBOL(tcp_v4_send_check);

/*
 *	This routine will send an RST to the other tcp.
 *
 *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 *		      for reset.
 *	Answer: if a packet caused RST, it is not for a socket
 *		existing in our system, if it is matched to a socket,
 *		it is just duplicate segment or bug in other side's TCP.
 *		So that we build reply only basing on parameters
 *		arrived with segment.
 *	Exception: precedence violation. We do not implement it in any case.
 */

/* Size in bytes of the option area reserved in the RST reply built below. */
#ifdef CONFIG_TCP_MD5SIG
#define OPTION_BYTES TCPOLEN_MD5SIG_ALIGNED
#else
#define OPTION_BYTES sizeof(__be32)
#endif

/* Build and send a RST in reply to @skb.
 *
 * @sk:  socket the segment matched, or NULL; it may be a full socket or a
 *       TIME-WAIT one (the code checks sk_fullsock() and sk_state)
 * @skb: the offending segment; ports and addresses of the reply are the
 *       swapped ones of this packet
 *
 * No reply is sent when @skb itself carries RST, or when there is no
 * socket and the packet's route is not of type RTN_LOCAL. With
 * CONFIG_TCP_MD5SIG, and no full socket but an MD5 option in the segment,
 * the reply is also suppressed unless a listener with a matching key is
 * found and the segment's signature verifies.
 */
static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
		__be32 opt[OPTION_BYTES / sizeof(__be32)];
	} rep;
	struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key = NULL;
	const __u8 *hash_location = NULL;
	unsigned char newhash[16];
	int genhash;
	struct sock *sk1 = NULL;
#endif
	u64 transmit_time = 0;
	struct sock *ctl_sk;
	struct net *net;

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	/* If sk not NULL, it means we did a successful lookup and incoming
	 * route had to be correct. prequeue might have dropped our dst.
	 */
	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rep, 0, sizeof(rep));
	rep.th.dest   = th->source;
	rep.th.source = th->dest;
	rep.th.doff   = sizeof(struct tcphdr) / 4;
	rep.th.rst    = 1;

	/* If the segment carried an ACK, the RST takes its sequence number
	 * from that ACK; otherwise the RST acknowledges everything the
	 * segment occupied (payload plus SYN/FIN).
	 */
	if (th->ack) {
		rep.th.seq = th->ack_seq;
	} else {
		rep.th.ack = 1;
		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				       skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof(arg));
	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);

	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
#ifdef CONFIG_TCP_MD5SIG
	rcu_read_lock();
	hash_location = tcp_parse_md5sig_option(th);
	if (sk && sk_fullsock(sk)) {
		const union tcp_md5_addr *addr;
		int l3index;

		/* sdif set, means packet ingressed via a device
		 * in an L3 domain and inet_iif is set to it.
		 */
		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
	} else if (hash_location) {
		const union tcp_md5_addr *addr;
		int sdif = tcp_v4_sdif(skb);
		int dif = inet_iif(skb);
		int l3index;

		/*
		 * active side is lost. Try to find listening socket through
		 * source port, and then find md5 key through listening socket.
		 * we do not lose security here:
		 * Incoming packet is checked with md5 hash with finding key,
		 * no RST generated if md5 hash doesn't match.
		 */
		sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
					     ip_hdr(skb)->saddr,
					     th->source, ip_hdr(skb)->daddr,
					     ntohs(th->source), dif, sdif);
		/* don't send rst if it can't find key */
		if (!sk1)
			goto out;

		/* sdif set, means packet ingressed via a device
		 * in an L3 domain and dif is set to it.
		 */
		l3index = sdif ? dif : 0;
		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
		if (!key)
			goto out;


		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
		if (genhash || memcmp(hash_location, newhash, 16) != 0)
			goto out;

	}

	if (key) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
				   (TCPOPT_NOP << 16) |
				   (TCPOPT_MD5SIG << 8) |
				   TCPOLEN_MD5SIG);
		/* Update length and the length the header thinks exists */
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
				     key, ip_hdr(skb)->saddr,
				     ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	/* Can't co-exist with TCPMD5, hence check rep.opt[0] */
	if (rep.opt[0] == 0) {
		__be32 mrst = mptcp_reset_option(skb);

		if (mrst) {
			rep.opt[0] = mrst;
			arg.iov[0].iov_len += sizeof(mrst);
			rep.th.doff = arg.iov[0].iov_len / 4;
		}
	}

	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;

	/* When socket is gone, all binding information is lost.
	 * routing might fail in this case. No choice here, if we choose to force
	 * input interface, we will misroute in case of asymmetric route.
	 */
	if (sk) {
		arg.bound_dev_if = sk->sk_bound_dev_if;
		if (sk_fullsock(sk))
			trace_tcp_send_reset(sk, skb);
	}

	/* sk may be a TIME-WAIT socket; reading sk_bound_dev_if above relies
	 * on both structures keeping that field at the same offset.
	 */
	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));

	arg.tos = ip_hdr(skb)->tos;
	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
	local_bh_disable();
	/* Borrow this CPU's control socket for the duration of the send. */
	ctl_sk = this_cpu_read(ipv4_tcp_sk);
	sock_net_set(ctl_sk, net);
	if (sk) {
		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
				   inet_twsk(sk)->tw_mark : sk->sk_mark;
		ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
				   inet_twsk(sk)->tw_priority : sk->sk_priority;
		transmit_time = tcp_transmit_time(sk);
	}
	ip_send_unicast_reply(ctl_sk,
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len,
			      transmit_time);

	/* Put the control socket back into its neutral state. */
	ctl_sk->sk_mark = 0;
	sock_net_set(ctl_sk, &init_net);
	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
	local_bh_enable();

#ifdef CONFIG_TCP_MD5SIG
out:
	rcu_read_unlock();
#endif
}

/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
   outside socket context is ugly, certainly. What can I do?
 */

/* Build and send a bare ACK in reply to @skb.
 *
 * @sk:          socket supplying netns, mark, priority and transmit time;
 *               may be a TIME-WAIT socket (sk_state is checked)
 * @seq, @ack:   sequence and acknowledgment numbers (host byte order)
 * @win:         value for the window field (host byte order)
 * @tsval, @tsecr: timestamp option values; the option is added only when
 *               @tsecr is non-zero
 * @oif:         when non-zero, used as arg.bound_dev_if
 * @key:         MD5 key, or NULL for no signature (only used with
 *               CONFIG_TCP_MD5SIG)
 * @reply_flags: stored in arg.flags
 * @tos:         TOS for the reply
 */
static void tcp_v4_send_ack(const struct sock *sk,
			    struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 tsval, u32 tsecr, int oif,
			    struct tcp_md5sig_key *key,
			    int reply_flags, u8 tos)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
#ifdef CONFIG_TCP_MD5SIG
			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
#endif
			];
	} rep;
	struct net *net = sock_net(sk);
	struct ip_reply_arg arg;
	struct sock *ctl_sk;
	u64 transmit_time;

	/* Only the header is cleared; option words are written below before
	 * iov_len is grown to cover them.
	 */
	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof(arg));

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);
	if (tsecr) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				   (TCPOPT_TIMESTAMP << 8) |
				   TCPOLEN_TIMESTAMP);
		rep.opt[1] = htonl(tsval);
		rep.opt[2] = htonl(tsecr);
		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
	}

	/* Swap the send and the receive. */
	rep.th.dest    = th->source;
	rep.th.source  = th->dest;
	rep.th.doff    = arg.iov[0].iov_len / 4;
	rep.th.seq     = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack     = 1;
	rep.th.window  = htons(win);

#ifdef CONFIG_TCP_MD5SIG
	if (key) {
		/* The MD5 option goes after the three timestamp words, if any. */
		int offset = (tsecr) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
					  (TCPOPT_NOP << 16) |
					  (TCPOPT_MD5SIG << 8) |
					  TCPOLEN_MD5SIG);
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len/4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.flags = reply_flags;
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	if (oif)
		arg.bound_dev_if = oif;
	arg.tos = tos;
	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
	local_bh_disable();
	/* Borrow this CPU's control socket for the duration of the send. */
	ctl_sk = this_cpu_read(ipv4_tcp_sk);
	sock_net_set(ctl_sk, net);
	ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
			   inet_twsk(sk)->tw_mark : sk->sk_mark;
	ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
			   inet_twsk(sk)->tw_priority : sk->sk_priority;
	transmit_time = tcp_transmit_time(sk);
	ip_send_unicast_reply(ctl_sk,
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len,
			      transmit_time);

	/* Put the control socket back into its neutral state. */
	ctl_sk->sk_mark = 0;
	sock_net_set(ctl_sk, &init_net);
	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	local_bh_enable();
}

/* Answer @skb with an ACK built from the state kept in the TIME-WAIT
 * socket @sk, then drop a reference on that socket with inet_twsk_put().
 */
static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
	struct inet_timewait_sock *tw = inet_twsk(sk);
	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);

	tcp_v4_send_ack(sk, skb,
			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
			tcp_time_stamp_raw() + tcptw->tw_ts_offset,
			tcptw->tw_ts_recent,
			tw->tw_bound_dev_if,
			tcp_twsk_md5_key(tcptw),
			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
			tw->tw_tos
			);

	inet_twsk_put(tw);
}

/* Answer @skb with an ACK on behalf of the connection request @req.
 * @sk is the listener for a regular request, or the Fast Open child in
 * SYN_RECV state (see the comment on seq below).
 */
static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req)
{
	const union tcp_md5_addr *addr;
	int l3index;

	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
	 */
	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
					     tcp_sk(sk)->snd_nxt;

	/* RFC 7323 2.3
	 * The window field (SEG.WND) of every outgoing segment, with the
	 * exception of <SYN> segments, MUST be right-shifted by
	 * Rcv.Wind.Shift bits:
	 */
	addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
	l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
	tcp_v4_send_ack(sk, skb, seq,
			tcp_rsk(req)->rcv_nxt,
			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
			tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
			req->ts_recent,
			0,
			tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
			ip_hdr(skb)->tos);
}

/*
 *	Send a SYN-ACK after having received a SYN.
 *	This still operates on a request_sock only, not on a big
 *	socket.
 *
 *	@dst may be NULL, in which case a route is looked up with
 *	inet_csk_route_req(). Returns -1 when no route is found or when
 *	tcp_make_synack() returns no skb; otherwise the result of
 *	ip_build_and_send_pkt() filtered through net_xmit_eval().
 */
static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
			      struct flowi *fl,
			      struct request_sock *req,
			      struct tcp_fastopen_cookie *foc,
			      enum tcp_synack_type synack_type,
			      struct sk_buff *syn_skb)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	struct flowi4 fl4;
	int err = -1;
	struct sk_buff *skb;
	u8 tos;

	/* First, grab a route. */
	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
		return -1;

	skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);

	if (skb) {
		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);

		/* With tcp_reflect_tos set, combine the non-ECN bits of the
		 * SYN's TOS with the ECN bits of the listener's TOS;
		 * otherwise use the listener's TOS as is.
		 */
		tos = sock_net(sk)->ipv4.sysctl_tcp_reflect_tos ?
				(tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
				(inet_sk(sk)->tos & INET_ECN_MASK) :
				inet_sk(sk)->tos;

		if (!INET_ECN_is_capable(tos) &&
		    tcp_bpf_ca_needs_ecn((struct sock *)req))
			tos |= INET_ECN_ECT_0;

		rcu_read_lock();
		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
					    ireq->ir_rmt_addr,
					    rcu_dereference(ireq->ireq_opt),
					    tos);
		rcu_read_unlock();
		err = net_xmit_eval(err);
	}

	return err;
}

/*
 *	IPv4 request_sock destructor.
 *	Frees the IP options attached to the request (ireq_opt).
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
}

#ifdef CONFIG_TCP_MD5SIG
/*
 * RFC2385 MD5 checksumming requires a mapping of
 * IP address->MD5 Key.
 * We need to maintain these in the sk structure.
 */

DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
EXPORT_SYMBOL(tcp_md5_needed);

/* Tell whether @new should replace @old as the best matching key.
 * Any key beats "no key yet"; a key with a non-zero l3index beats one with
 * l3index 0; otherwise the key with the longer prefix wins.
 */
static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
{
	if (!old)
		return true;

	/* l3index always overrides non-l3index */
	if (old->l3index && new->l3index == 0)
		return false;
	if (old->l3index == 0 && new->l3index)
		return true;

	return old->prefixlen < new->prefixlen;
}

/* Find the Key structure for an address.
 *
 * Walks the socket's MD5 key list and returns the best key (as ranked by
 * better_md5_match()) whose family equals @family and whose prefix covers
 * @addr. Keys flagged TCP_MD5SIG_FLAG_IFINDEX only match when their l3index
 * equals @l3index. Returns NULL when the socket has no MD5 info or nothing
 * matches.
 */
struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
					   const union tcp_md5_addr *addr,
					   int family)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	const struct tcp_md5sig_info *md5sig;
	__be32 mask;
	struct tcp_md5sig_key *best_match = NULL;
	bool match;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       lockdep_sock_is_held(sk));
	if (!md5sig)
		return NULL;

	hlist_for_each_entry_rcu(key, &md5sig->head, node,
				 lockdep_sock_is_held(sk)) {
		if (key->family != family)
			continue;
		if (key->flags & TCP_MD5SIG_FLAG_IFINDEX && key->l3index != l3index)
			continue;
		if (family == AF_INET) {
			/* Compare only the bits covered by the key's prefix. */
			mask = inet_make_mask(key->prefixlen);
			match = (key->addr.a4.s_addr & mask) ==
				(addr->a4.s_addr & mask);
#if IS_ENABLED(CONFIG_IPV6)
		} else if (family == AF_INET6) {
			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
						  key->prefixlen);
#endif
		} else {
			match = false;
		}

		if (match && better_md5_match(best_match, key))
			best_match = key;
	}
	return best_match;
}
EXPORT_SYMBOL(__tcp_md5_do_lookup);

/* Find the key that was configured with exactly these parameters: same
 * family, same TCP_MD5SIG_FLAG_IFINDEX flag bit, same l3index, same address
 * bytes and same prefix length. Returns NULL when there is none.
 */
static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
						      const union tcp_md5_addr *addr,
						      int family, u8 prefixlen,
						      int l3index, u8 flags)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	unsigned int size = sizeof(struct in_addr);
	const struct tcp_md5sig_info *md5sig;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       lockdep_sock_is_held(sk));
	if (!md5sig)
		return NULL;
#if IS_ENABLED(CONFIG_IPV6)
	if (family == AF_INET6)
		size = sizeof(struct in6_addr);
#endif
	hlist_for_each_entry_rcu(key, &md5sig->head, node,
				 lockdep_sock_is_held(sk)) {
		if (key->family != family)
			continue;
		if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
			continue;
		if (key->l3index != l3index)
			continue;
		if (!memcmp(&key->addr, addr, size) &&
		    key->prefixlen == prefixlen)
			return key;
	}
	return NULL;
}

/* Look up the MD5 key @sk holds for the peer of @addr_sk: the address is
 * @addr_sk's sk_daddr and the L3 domain is derived from the device
 * @addr_sk is bound to.
 */
struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
					 const struct sock *addr_sk)
{
	const union tcp_md5_addr *addr;
	int l3index;

	l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
						 addr_sk->sk_bound_dev_if);
	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
	return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
}
EXPORT_SYMBOL(tcp_v4_md5_lookup);

/* This can be called on a newly created socket, from other files */
int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
		   int family, u8 prefixlen, int l3index, u8 flags,
		   const u8 *newkey, u8 newkeylen, gfp_t gfp)
{
	/* Add Key to the list */
	struct tcp_md5sig_key *key;
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_info *md5sig;

	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
	if (key) {
		/* Pre-existing entry - just update that one.
		 * Note that the key might be used concurrently.
		 * data_race() is telling kcsan that we do not care of
		 * key mismatches, since changing MD5 key on live flows
		 * can lead to packet drops.
		 */
		data_race(memcpy(key->key, newkey, newkeylen));

		/* Pairs with READ_ONCE() in tcp_md5_hash_key().
		 * Also note that a reader could catch new key->keylen value
		 * but old key->key[], this is the reason we use __GFP_ZERO
		 * at sock_kmalloc() time below these lines.
		 */
		WRITE_ONCE(key->keylen, newkeylen);

		return 0;
	}

	md5sig = rcu_dereference_protected(tp->md5sig_info,
					   lockdep_sock_is_held(sk));
	if (!md5sig) {
		md5sig = kmalloc(sizeof(*md5sig), gfp);
		if (!md5sig)
			return -ENOMEM;

		sk_gso_disable(sk);
		INIT_HLIST_HEAD(&md5sig->head);
		rcu_assign_pointer(tp->md5sig_info, md5sig);
	}

	key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
	if (!key)
		return -ENOMEM;
	if (!tcp_alloc_md5sig_pool()) {
		sock_kfree_s(sk, key, sizeof(*key));
		return -ENOMEM;
	}

	memcpy(key->key, newkey, newkeylen);
	key->keylen = newkeylen;
	key->family = family;
	key->prefixlen = prefixlen;
	key->l3index = l3index;
	key->flags = flags;
	memcpy(&key->addr, addr,
	       (family == AF_INET6) ?
sizeof(struct in6_addr) : 1212 sizeof(struct in_addr)); 1213 hlist_add_head_rcu(&key->node, &md5sig->head); 1214 return 0; 1215 } 1216 EXPORT_SYMBOL(tcp_md5_do_add); 1217 1218 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family, 1219 u8 prefixlen, int l3index, u8 flags) 1220 { 1221 struct tcp_md5sig_key *key; 1222 1223 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags); 1224 if (!key) 1225 return -ENOENT; 1226 hlist_del_rcu(&key->node); 1227 atomic_sub(sizeof(*key), &sk->sk_omem_alloc); 1228 kfree_rcu(key, rcu); 1229 return 0; 1230 } 1231 EXPORT_SYMBOL(tcp_md5_do_del); 1232 1233 static void tcp_clear_md5_list(struct sock *sk) 1234 { 1235 struct tcp_sock *tp = tcp_sk(sk); 1236 struct tcp_md5sig_key *key; 1237 struct hlist_node *n; 1238 struct tcp_md5sig_info *md5sig; 1239 1240 md5sig = rcu_dereference_protected(tp->md5sig_info, 1); 1241 1242 hlist_for_each_entry_safe(key, n, &md5sig->head, node) { 1243 hlist_del_rcu(&key->node); 1244 atomic_sub(sizeof(*key), &sk->sk_omem_alloc); 1245 kfree_rcu(key, rcu); 1246 } 1247 } 1248 1249 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname, 1250 sockptr_t optval, int optlen) 1251 { 1252 struct tcp_md5sig cmd; 1253 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr; 1254 const union tcp_md5_addr *addr; 1255 u8 prefixlen = 32; 1256 int l3index = 0; 1257 u8 flags; 1258 1259 if (optlen < sizeof(cmd)) 1260 return -EINVAL; 1261 1262 if (copy_from_sockptr(&cmd, optval, sizeof(cmd))) 1263 return -EFAULT; 1264 1265 if (sin->sin_family != AF_INET) 1266 return -EINVAL; 1267 1268 flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX; 1269 1270 if (optname == TCP_MD5SIG_EXT && 1271 cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) { 1272 prefixlen = cmd.tcpm_prefixlen; 1273 if (prefixlen > 32) 1274 return -EINVAL; 1275 } 1276 1277 if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex && 1278 cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) { 1279 struct net_device *dev; 1280 1281 
rcu_read_lock(); 1282 dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex); 1283 if (dev && netif_is_l3_master(dev)) 1284 l3index = dev->ifindex; 1285 1286 rcu_read_unlock(); 1287 1288 /* ok to reference set/not set outside of rcu; 1289 * right now device MUST be an L3 master 1290 */ 1291 if (!dev || !l3index) 1292 return -EINVAL; 1293 } 1294 1295 addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr; 1296 1297 if (!cmd.tcpm_keylen) 1298 return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags); 1299 1300 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN) 1301 return -EINVAL; 1302 1303 return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags, 1304 cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL); 1305 } 1306 1307 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp, 1308 __be32 daddr, __be32 saddr, 1309 const struct tcphdr *th, int nbytes) 1310 { 1311 struct tcp4_pseudohdr *bp; 1312 struct scatterlist sg; 1313 struct tcphdr *_th; 1314 1315 bp = hp->scratch; 1316 bp->saddr = saddr; 1317 bp->daddr = daddr; 1318 bp->pad = 0; 1319 bp->protocol = IPPROTO_TCP; 1320 bp->len = cpu_to_be16(nbytes); 1321 1322 _th = (struct tcphdr *)(bp + 1); 1323 memcpy(_th, th, sizeof(*th)); 1324 _th->check = 0; 1325 1326 sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th)); 1327 ahash_request_set_crypt(hp->md5_req, &sg, NULL, 1328 sizeof(*bp) + sizeof(*th)); 1329 return crypto_ahash_update(hp->md5_req); 1330 } 1331 1332 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, 1333 __be32 daddr, __be32 saddr, const struct tcphdr *th) 1334 { 1335 struct tcp_md5sig_pool *hp; 1336 struct ahash_request *req; 1337 1338 hp = tcp_get_md5sig_pool(); 1339 if (!hp) 1340 goto clear_hash_noput; 1341 req = hp->md5_req; 1342 1343 if (crypto_ahash_init(req)) 1344 goto clear_hash; 1345 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2)) 1346 goto clear_hash; 1347 if (tcp_md5_hash_key(hp, key)) 1348 goto clear_hash; 1349 ahash_request_set_crypt(req, NULL, 
md5_hash, 0); 1350 if (crypto_ahash_final(req)) 1351 goto clear_hash; 1352 1353 tcp_put_md5sig_pool(); 1354 return 0; 1355 1356 clear_hash: 1357 tcp_put_md5sig_pool(); 1358 clear_hash_noput: 1359 memset(md5_hash, 0, 16); 1360 return 1; 1361 } 1362 1363 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key, 1364 const struct sock *sk, 1365 const struct sk_buff *skb) 1366 { 1367 struct tcp_md5sig_pool *hp; 1368 struct ahash_request *req; 1369 const struct tcphdr *th = tcp_hdr(skb); 1370 __be32 saddr, daddr; 1371 1372 if (sk) { /* valid for establish/request sockets */ 1373 saddr = sk->sk_rcv_saddr; 1374 daddr = sk->sk_daddr; 1375 } else { 1376 const struct iphdr *iph = ip_hdr(skb); 1377 saddr = iph->saddr; 1378 daddr = iph->daddr; 1379 } 1380 1381 hp = tcp_get_md5sig_pool(); 1382 if (!hp) 1383 goto clear_hash_noput; 1384 req = hp->md5_req; 1385 1386 if (crypto_ahash_init(req)) 1387 goto clear_hash; 1388 1389 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len)) 1390 goto clear_hash; 1391 if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2)) 1392 goto clear_hash; 1393 if (tcp_md5_hash_key(hp, key)) 1394 goto clear_hash; 1395 ahash_request_set_crypt(req, NULL, md5_hash, 0); 1396 if (crypto_ahash_final(req)) 1397 goto clear_hash; 1398 1399 tcp_put_md5sig_pool(); 1400 return 0; 1401 1402 clear_hash: 1403 tcp_put_md5sig_pool(); 1404 clear_hash_noput: 1405 memset(md5_hash, 0, 16); 1406 return 1; 1407 } 1408 EXPORT_SYMBOL(tcp_v4_md5_hash_skb); 1409 1410 #endif 1411 1412 static void tcp_v4_init_req(struct request_sock *req, 1413 const struct sock *sk_listener, 1414 struct sk_buff *skb) 1415 { 1416 struct inet_request_sock *ireq = inet_rsk(req); 1417 struct net *net = sock_net(sk_listener); 1418 1419 sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr); 1420 sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr); 1421 RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb)); 1422 } 1423 1424 static struct dst_entry *tcp_v4_route_req(const struct sock 
*sk, 1425 struct sk_buff *skb, 1426 struct flowi *fl, 1427 struct request_sock *req) 1428 { 1429 tcp_v4_init_req(req, sk, skb); 1430 1431 if (security_inet_conn_request(sk, skb, req)) 1432 return NULL; 1433 1434 return inet_csk_route_req(sk, &fl->u.ip4, req); 1435 } 1436 1437 struct request_sock_ops tcp_request_sock_ops __read_mostly = { 1438 .family = PF_INET, 1439 .obj_size = sizeof(struct tcp_request_sock), 1440 .rtx_syn_ack = tcp_rtx_synack, 1441 .send_ack = tcp_v4_reqsk_send_ack, 1442 .destructor = tcp_v4_reqsk_destructor, 1443 .send_reset = tcp_v4_send_reset, 1444 .syn_ack_timeout = tcp_syn_ack_timeout, 1445 }; 1446 1447 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { 1448 .mss_clamp = TCP_MSS_DEFAULT, 1449 #ifdef CONFIG_TCP_MD5SIG 1450 .req_md5_lookup = tcp_v4_md5_lookup, 1451 .calc_md5_hash = tcp_v4_md5_hash_skb, 1452 #endif 1453 #ifdef CONFIG_SYN_COOKIES 1454 .cookie_init_seq = cookie_v4_init_sequence, 1455 #endif 1456 .route_req = tcp_v4_route_req, 1457 .init_seq = tcp_v4_init_seq, 1458 .init_ts_off = tcp_v4_init_ts_off, 1459 .send_synack = tcp_v4_send_synack, 1460 }; 1461 1462 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) 1463 { 1464 /* Never answer to SYNs send to broadcast or multicast */ 1465 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) 1466 goto drop; 1467 1468 return tcp_conn_request(&tcp_request_sock_ops, 1469 &tcp_request_sock_ipv4_ops, sk, skb); 1470 1471 drop: 1472 tcp_listendrop(sk); 1473 return 0; 1474 } 1475 EXPORT_SYMBOL(tcp_v4_conn_request); 1476 1477 1478 /* 1479 * The three way handshake has completed - we got a valid synack - 1480 * now create the new socket. 
1481 */ 1482 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, 1483 struct request_sock *req, 1484 struct dst_entry *dst, 1485 struct request_sock *req_unhash, 1486 bool *own_req) 1487 { 1488 struct inet_request_sock *ireq; 1489 bool found_dup_sk = false; 1490 struct inet_sock *newinet; 1491 struct tcp_sock *newtp; 1492 struct sock *newsk; 1493 #ifdef CONFIG_TCP_MD5SIG 1494 const union tcp_md5_addr *addr; 1495 struct tcp_md5sig_key *key; 1496 int l3index; 1497 #endif 1498 struct ip_options_rcu *inet_opt; 1499 1500 if (sk_acceptq_is_full(sk)) 1501 goto exit_overflow; 1502 1503 newsk = tcp_create_openreq_child(sk, req, skb); 1504 if (!newsk) 1505 goto exit_nonewsk; 1506 1507 newsk->sk_gso_type = SKB_GSO_TCPV4; 1508 inet_sk_rx_dst_set(newsk, skb); 1509 1510 newtp = tcp_sk(newsk); 1511 newinet = inet_sk(newsk); 1512 ireq = inet_rsk(req); 1513 sk_daddr_set(newsk, ireq->ir_rmt_addr); 1514 sk_rcv_saddr_set(newsk, ireq->ir_loc_addr); 1515 newsk->sk_bound_dev_if = ireq->ir_iif; 1516 newinet->inet_saddr = ireq->ir_loc_addr; 1517 inet_opt = rcu_dereference(ireq->ireq_opt); 1518 RCU_INIT_POINTER(newinet->inet_opt, inet_opt); 1519 newinet->mc_index = inet_iif(skb); 1520 newinet->mc_ttl = ip_hdr(skb)->ttl; 1521 newinet->rcv_tos = ip_hdr(skb)->tos; 1522 inet_csk(newsk)->icsk_ext_hdr_len = 0; 1523 if (inet_opt) 1524 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen; 1525 newinet->inet_id = prandom_u32(); 1526 1527 /* Set ToS of the new socket based upon the value of incoming SYN. 1528 * ECT bits are set later in tcp_init_transfer(). 
1529 */ 1530 if (sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) 1531 newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK; 1532 1533 if (!dst) { 1534 dst = inet_csk_route_child_sock(sk, newsk, req); 1535 if (!dst) 1536 goto put_and_exit; 1537 } else { 1538 /* syncookie case : see end of cookie_v4_check() */ 1539 } 1540 sk_setup_caps(newsk, dst); 1541 1542 tcp_ca_openreq_child(newsk, dst); 1543 1544 tcp_sync_mss(newsk, dst_mtu(dst)); 1545 newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst)); 1546 1547 tcp_initialize_rcv_mss(newsk); 1548 1549 #ifdef CONFIG_TCP_MD5SIG 1550 l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif); 1551 /* Copy over the MD5 key from the original socket */ 1552 addr = (union tcp_md5_addr *)&newinet->inet_daddr; 1553 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET); 1554 if (key) { 1555 /* 1556 * We're using one, so create a matching key 1557 * on the newsk structure. If we fail to get 1558 * memory, then we end up not copying the key 1559 * across. Shucks. 
1560 */ 1561 tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index, key->flags, 1562 key->key, key->keylen, GFP_ATOMIC); 1563 sk_gso_disable(newsk); 1564 } 1565 #endif 1566 1567 if (__inet_inherit_port(sk, newsk) < 0) 1568 goto put_and_exit; 1569 *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash), 1570 &found_dup_sk); 1571 if (likely(*own_req)) { 1572 tcp_move_syn(newtp, req); 1573 ireq->ireq_opt = NULL; 1574 } else { 1575 newinet->inet_opt = NULL; 1576 1577 if (!req_unhash && found_dup_sk) { 1578 /* This code path should only be executed in the 1579 * syncookie case only 1580 */ 1581 bh_unlock_sock(newsk); 1582 sock_put(newsk); 1583 newsk = NULL; 1584 } 1585 } 1586 return newsk; 1587 1588 exit_overflow: 1589 NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); 1590 exit_nonewsk: 1591 dst_release(dst); 1592 exit: 1593 tcp_listendrop(sk); 1594 return NULL; 1595 put_and_exit: 1596 newinet->inet_opt = NULL; 1597 inet_csk_prepare_forced_close(newsk); 1598 tcp_done(newsk); 1599 goto exit; 1600 } 1601 EXPORT_SYMBOL(tcp_v4_syn_recv_sock); 1602 1603 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb) 1604 { 1605 #ifdef CONFIG_SYN_COOKIES 1606 const struct tcphdr *th = tcp_hdr(skb); 1607 1608 if (!th->syn) 1609 sk = cookie_v4_check(sk, skb); 1610 #endif 1611 return sk; 1612 } 1613 1614 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph, 1615 struct tcphdr *th, u32 *cookie) 1616 { 1617 u16 mss = 0; 1618 #ifdef CONFIG_SYN_COOKIES 1619 mss = tcp_get_syncookie_mss(&tcp_request_sock_ops, 1620 &tcp_request_sock_ipv4_ops, sk, th); 1621 if (mss) { 1622 *cookie = __cookie_v4_init_sequence(iph, th, &mss); 1623 tcp_synq_overflow(sk); 1624 } 1625 #endif 1626 return mss; 1627 } 1628 1629 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *, 1630 u32)); 1631 /* The socket must have it's spinlock held when we get 1632 * here, unless it is a TCP_LISTEN socket. 
1633 * 1634 * We have a potential double-lock case here, so even when 1635 * doing backlog processing we use the BH locking scheme. 1636 * This is because we cannot sleep with the original spinlock 1637 * held. 1638 */ 1639 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) 1640 { 1641 enum skb_drop_reason reason; 1642 struct sock *rsk; 1643 1644 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ 1645 struct dst_entry *dst; 1646 1647 dst = rcu_dereference_protected(sk->sk_rx_dst, 1648 lockdep_sock_is_held(sk)); 1649 1650 sock_rps_save_rxhash(sk, skb); 1651 sk_mark_napi_id(sk, skb); 1652 if (dst) { 1653 if (sk->sk_rx_dst_ifindex != skb->skb_iif || 1654 !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check, 1655 dst, 0)) { 1656 RCU_INIT_POINTER(sk->sk_rx_dst, NULL); 1657 dst_release(dst); 1658 } 1659 } 1660 tcp_rcv_established(sk, skb); 1661 return 0; 1662 } 1663 1664 reason = SKB_DROP_REASON_NOT_SPECIFIED; 1665 if (tcp_checksum_complete(skb)) 1666 goto csum_err; 1667 1668 if (sk->sk_state == TCP_LISTEN) { 1669 struct sock *nsk = tcp_v4_cookie_check(sk, skb); 1670 1671 if (!nsk) 1672 goto discard; 1673 if (nsk != sk) { 1674 if (tcp_child_process(sk, nsk, skb)) { 1675 rsk = nsk; 1676 goto reset; 1677 } 1678 return 0; 1679 } 1680 } else 1681 sock_rps_save_rxhash(sk, skb); 1682 1683 if (tcp_rcv_state_process(sk, skb)) { 1684 rsk = sk; 1685 goto reset; 1686 } 1687 return 0; 1688 1689 reset: 1690 tcp_v4_send_reset(rsk, skb); 1691 discard: 1692 kfree_skb_reason(skb, reason); 1693 /* Be careful here. If this function gets more complicated and 1694 * gcc suffers from register pressure on the x86, sk (in %ebx) 1695 * might be destroyed here. This current version compiles correctly, 1696 * but you have been warned. 
1697 */ 1698 return 0; 1699 1700 csum_err: 1701 reason = SKB_DROP_REASON_TCP_CSUM; 1702 trace_tcp_bad_csum(skb); 1703 TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); 1704 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); 1705 goto discard; 1706 } 1707 EXPORT_SYMBOL(tcp_v4_do_rcv); 1708 1709 int tcp_v4_early_demux(struct sk_buff *skb) 1710 { 1711 const struct iphdr *iph; 1712 const struct tcphdr *th; 1713 struct sock *sk; 1714 1715 if (skb->pkt_type != PACKET_HOST) 1716 return 0; 1717 1718 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr))) 1719 return 0; 1720 1721 iph = ip_hdr(skb); 1722 th = tcp_hdr(skb); 1723 1724 if (th->doff < sizeof(struct tcphdr) / 4) 1725 return 0; 1726 1727 sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo, 1728 iph->saddr, th->source, 1729 iph->daddr, ntohs(th->dest), 1730 skb->skb_iif, inet_sdif(skb)); 1731 if (sk) { 1732 skb->sk = sk; 1733 skb->destructor = sock_edemux; 1734 if (sk_fullsock(sk)) { 1735 struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst); 1736 1737 if (dst) 1738 dst = dst_check(dst, 0); 1739 if (dst && 1740 sk->sk_rx_dst_ifindex == skb->skb_iif) 1741 skb_dst_set_noref(skb, dst); 1742 } 1743 } 1744 return 0; 1745 } 1746 1747 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb, 1748 enum skb_drop_reason *reason) 1749 { 1750 u32 limit, tail_gso_size, tail_gso_segs; 1751 struct skb_shared_info *shinfo; 1752 const struct tcphdr *th; 1753 struct tcphdr *thtail; 1754 struct sk_buff *tail; 1755 unsigned int hdrlen; 1756 bool fragstolen; 1757 u32 gso_segs; 1758 u32 gso_size; 1759 int delta; 1760 1761 /* In case all data was pulled from skb frags (in __pskb_pull_tail()), 1762 * we can fix skb->truesize to its real value to avoid future drops. 1763 * This is valid because skb is not yet charged to the socket. 1764 * It has been noticed pure SACK packets were sometimes dropped 1765 * (if cooked by drivers without copybreak feature). 
1766 */ 1767 skb_condense(skb); 1768 1769 skb_dst_drop(skb); 1770 1771 if (unlikely(tcp_checksum_complete(skb))) { 1772 bh_unlock_sock(sk); 1773 trace_tcp_bad_csum(skb); 1774 *reason = SKB_DROP_REASON_TCP_CSUM; 1775 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); 1776 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); 1777 return true; 1778 } 1779 1780 /* Attempt coalescing to last skb in backlog, even if we are 1781 * above the limits. 1782 * This is okay because skb capacity is limited to MAX_SKB_FRAGS. 1783 */ 1784 th = (const struct tcphdr *)skb->data; 1785 hdrlen = th->doff * 4; 1786 1787 tail = sk->sk_backlog.tail; 1788 if (!tail) 1789 goto no_coalesce; 1790 thtail = (struct tcphdr *)tail->data; 1791 1792 if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq || 1793 TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield || 1794 ((TCP_SKB_CB(tail)->tcp_flags | 1795 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) || 1796 !((TCP_SKB_CB(tail)->tcp_flags & 1797 TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) || 1798 ((TCP_SKB_CB(tail)->tcp_flags ^ 1799 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) || 1800 #ifdef CONFIG_TLS_DEVICE 1801 tail->decrypted != skb->decrypted || 1802 #endif 1803 thtail->doff != th->doff || 1804 memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th))) 1805 goto no_coalesce; 1806 1807 __skb_pull(skb, hdrlen); 1808 1809 shinfo = skb_shinfo(skb); 1810 gso_size = shinfo->gso_size ?: skb->len; 1811 gso_segs = shinfo->gso_segs ?: 1; 1812 1813 shinfo = skb_shinfo(tail); 1814 tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen); 1815 tail_gso_segs = shinfo->gso_segs ?: 1; 1816 1817 if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) { 1818 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq; 1819 1820 if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) { 1821 TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq; 1822 thtail->window = th->window; 1823 } 1824 1825 /* We have to update both 
TCP_SKB_CB(tail)->tcp_flags and 1826 * thtail->fin, so that the fast path in tcp_rcv_established() 1827 * is not entered if we append a packet with a FIN. 1828 * SYN, RST, URG are not present. 1829 * ACK is set on both packets. 1830 * PSH : we do not really care in TCP stack, 1831 * at least for 'GRO' packets. 1832 */ 1833 thtail->fin |= th->fin; 1834 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags; 1835 1836 if (TCP_SKB_CB(skb)->has_rxtstamp) { 1837 TCP_SKB_CB(tail)->has_rxtstamp = true; 1838 tail->tstamp = skb->tstamp; 1839 skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp; 1840 } 1841 1842 /* Not as strict as GRO. We only need to carry mss max value */ 1843 shinfo->gso_size = max(gso_size, tail_gso_size); 1844 shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF); 1845 1846 sk->sk_backlog.len += delta; 1847 __NET_INC_STATS(sock_net(sk), 1848 LINUX_MIB_TCPBACKLOGCOALESCE); 1849 kfree_skb_partial(skb, fragstolen); 1850 return false; 1851 } 1852 __skb_push(skb, hdrlen); 1853 1854 no_coalesce: 1855 /* Only socket owner can try to collapse/prune rx queues 1856 * to reduce memory overhead, so add a little headroom here. 1857 * Few sockets backlog are possibly concurrently non empty. 
1858 */ 1859 limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf) + 64*1024; 1860 1861 if (unlikely(sk_add_backlog(sk, skb, limit))) { 1862 bh_unlock_sock(sk); 1863 *reason = SKB_DROP_REASON_SOCKET_BACKLOG; 1864 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP); 1865 return true; 1866 } 1867 return false; 1868 } 1869 EXPORT_SYMBOL(tcp_add_backlog); 1870 1871 int tcp_filter(struct sock *sk, struct sk_buff *skb) 1872 { 1873 struct tcphdr *th = (struct tcphdr *)skb->data; 1874 1875 return sk_filter_trim_cap(sk, skb, th->doff * 4); 1876 } 1877 EXPORT_SYMBOL(tcp_filter); 1878 1879 static void tcp_v4_restore_cb(struct sk_buff *skb) 1880 { 1881 memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4, 1882 sizeof(struct inet_skb_parm)); 1883 } 1884 1885 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph, 1886 const struct tcphdr *th) 1887 { 1888 /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB() 1889 * barrier() makes sure compiler wont play fool^Waliasing games. 
1890 */ 1891 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb), 1892 sizeof(struct inet_skb_parm)); 1893 barrier(); 1894 1895 TCP_SKB_CB(skb)->seq = ntohl(th->seq); 1896 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + 1897 skb->len - th->doff * 4); 1898 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); 1899 TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th); 1900 TCP_SKB_CB(skb)->tcp_tw_isn = 0; 1901 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph); 1902 TCP_SKB_CB(skb)->sacked = 0; 1903 TCP_SKB_CB(skb)->has_rxtstamp = 1904 skb->tstamp || skb_hwtstamps(skb)->hwtstamp; 1905 } 1906 1907 /* 1908 * From tcp_input.c 1909 */ 1910 1911 int tcp_v4_rcv(struct sk_buff *skb) 1912 { 1913 struct net *net = dev_net(skb->dev); 1914 enum skb_drop_reason drop_reason; 1915 int sdif = inet_sdif(skb); 1916 int dif = inet_iif(skb); 1917 const struct iphdr *iph; 1918 const struct tcphdr *th; 1919 bool refcounted; 1920 struct sock *sk; 1921 int ret; 1922 1923 drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; 1924 if (skb->pkt_type != PACKET_HOST) 1925 goto discard_it; 1926 1927 /* Count it even if it's bad */ 1928 __TCP_INC_STATS(net, TCP_MIB_INSEGS); 1929 1930 if (!pskb_may_pull(skb, sizeof(struct tcphdr))) 1931 goto discard_it; 1932 1933 th = (const struct tcphdr *)skb->data; 1934 1935 if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) { 1936 drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL; 1937 goto bad_packet; 1938 } 1939 if (!pskb_may_pull(skb, th->doff * 4)) 1940 goto discard_it; 1941 1942 /* An explanation is required here, I think. 1943 * Packet length and doff are validated by header prediction, 1944 * provided case of th->doff==0 is eliminated. 1945 * So, we defer the checks. 
*/ 1946 1947 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo)) 1948 goto csum_error; 1949 1950 th = (const struct tcphdr *)skb->data; 1951 iph = ip_hdr(skb); 1952 lookup: 1953 sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source, 1954 th->dest, sdif, &refcounted); 1955 if (!sk) 1956 goto no_tcp_socket; 1957 1958 process: 1959 if (sk->sk_state == TCP_TIME_WAIT) 1960 goto do_time_wait; 1961 1962 if (sk->sk_state == TCP_NEW_SYN_RECV) { 1963 struct request_sock *req = inet_reqsk(sk); 1964 bool req_stolen = false; 1965 struct sock *nsk; 1966 1967 sk = req->rsk_listener; 1968 drop_reason = tcp_inbound_md5_hash(sk, skb, 1969 &iph->saddr, &iph->daddr, 1970 AF_INET, dif, sdif); 1971 if (unlikely(drop_reason)) { 1972 sk_drops_add(sk, skb); 1973 reqsk_put(req); 1974 goto discard_it; 1975 } 1976 if (tcp_checksum_complete(skb)) { 1977 reqsk_put(req); 1978 goto csum_error; 1979 } 1980 if (unlikely(sk->sk_state != TCP_LISTEN)) { 1981 nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb); 1982 if (!nsk) { 1983 inet_csk_reqsk_queue_drop_and_put(sk, req); 1984 goto lookup; 1985 } 1986 sk = nsk; 1987 /* reuseport_migrate_sock() has already held one sk_refcnt 1988 * before returning. 1989 */ 1990 } else { 1991 /* We own a reference on the listener, increase it again 1992 * as we might lose it too soon. 1993 */ 1994 sock_hold(sk); 1995 } 1996 refcounted = true; 1997 nsk = NULL; 1998 if (!tcp_filter(sk, skb)) { 1999 th = (const struct tcphdr *)skb->data; 2000 iph = ip_hdr(skb); 2001 tcp_v4_fill_cb(skb, iph, th); 2002 nsk = tcp_check_req(sk, skb, req, false, &req_stolen); 2003 } else { 2004 drop_reason = SKB_DROP_REASON_SOCKET_FILTER; 2005 } 2006 if (!nsk) { 2007 reqsk_put(req); 2008 if (req_stolen) { 2009 /* Another cpu got exclusive access to req 2010 * and created a full blown socket. 2011 * Try to feed this packet to this socket 2012 * instead of discarding it. 
2013 */ 2014 tcp_v4_restore_cb(skb); 2015 sock_put(sk); 2016 goto lookup; 2017 } 2018 goto discard_and_relse; 2019 } 2020 if (nsk == sk) { 2021 reqsk_put(req); 2022 tcp_v4_restore_cb(skb); 2023 } else if (tcp_child_process(sk, nsk, skb)) { 2024 tcp_v4_send_reset(nsk, skb); 2025 goto discard_and_relse; 2026 } else { 2027 sock_put(sk); 2028 return 0; 2029 } 2030 } 2031 2032 if (static_branch_unlikely(&ip4_min_ttl)) { 2033 /* min_ttl can be changed concurrently from do_ip_setsockopt() */ 2034 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) { 2035 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); 2036 goto discard_and_relse; 2037 } 2038 } 2039 2040 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) { 2041 drop_reason = SKB_DROP_REASON_XFRM_POLICY; 2042 goto discard_and_relse; 2043 } 2044 2045 drop_reason = tcp_inbound_md5_hash(sk, skb, &iph->saddr, 2046 &iph->daddr, AF_INET, dif, sdif); 2047 if (drop_reason) 2048 goto discard_and_relse; 2049 2050 nf_reset_ct(skb); 2051 2052 if (tcp_filter(sk, skb)) { 2053 drop_reason = SKB_DROP_REASON_SOCKET_FILTER; 2054 goto discard_and_relse; 2055 } 2056 th = (const struct tcphdr *)skb->data; 2057 iph = ip_hdr(skb); 2058 tcp_v4_fill_cb(skb, iph, th); 2059 2060 skb->dev = NULL; 2061 2062 if (sk->sk_state == TCP_LISTEN) { 2063 ret = tcp_v4_do_rcv(sk, skb); 2064 goto put_and_return; 2065 } 2066 2067 sk_incoming_cpu_update(sk); 2068 2069 sk_defer_free_flush(sk); 2070 bh_lock_sock_nested(sk); 2071 tcp_segs_in(tcp_sk(sk), skb); 2072 ret = 0; 2073 if (!sock_owned_by_user(sk)) { 2074 ret = tcp_v4_do_rcv(sk, skb); 2075 } else { 2076 if (tcp_add_backlog(sk, skb, &drop_reason)) 2077 goto discard_and_relse; 2078 } 2079 bh_unlock_sock(sk); 2080 2081 put_and_return: 2082 if (refcounted) 2083 sock_put(sk); 2084 2085 return ret; 2086 2087 no_tcp_socket: 2088 drop_reason = SKB_DROP_REASON_NO_SOCKET; 2089 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) 2090 goto discard_it; 2091 2092 tcp_v4_fill_cb(skb, iph, th); 2093 2094 if 
(tcp_checksum_complete(skb)) { 2095 csum_error: 2096 drop_reason = SKB_DROP_REASON_TCP_CSUM; 2097 trace_tcp_bad_csum(skb); 2098 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS); 2099 bad_packet: 2100 __TCP_INC_STATS(net, TCP_MIB_INERRS); 2101 } else { 2102 tcp_v4_send_reset(NULL, skb); 2103 } 2104 2105 discard_it: 2106 /* Discard frame. */ 2107 kfree_skb_reason(skb, drop_reason); 2108 return 0; 2109 2110 discard_and_relse: 2111 sk_drops_add(sk, skb); 2112 if (refcounted) 2113 sock_put(sk); 2114 goto discard_it; 2115 2116 do_time_wait: 2117 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { 2118 drop_reason = SKB_DROP_REASON_XFRM_POLICY; 2119 inet_twsk_put(inet_twsk(sk)); 2120 goto discard_it; 2121 } 2122 2123 tcp_v4_fill_cb(skb, iph, th); 2124 2125 if (tcp_checksum_complete(skb)) { 2126 inet_twsk_put(inet_twsk(sk)); 2127 goto csum_error; 2128 } 2129 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) { 2130 case TCP_TW_SYN: { 2131 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev), 2132 &tcp_hashinfo, skb, 2133 __tcp_hdrlen(th), 2134 iph->saddr, th->source, 2135 iph->daddr, th->dest, 2136 inet_iif(skb), 2137 sdif); 2138 if (sk2) { 2139 inet_twsk_deschedule_put(inet_twsk(sk)); 2140 sk = sk2; 2141 tcp_v4_restore_cb(skb); 2142 refcounted = false; 2143 goto process; 2144 } 2145 } 2146 /* to ACK */ 2147 fallthrough; 2148 case TCP_TW_ACK: 2149 tcp_v4_timewait_ack(sk, skb); 2150 break; 2151 case TCP_TW_RST: 2152 tcp_v4_send_reset(sk, skb); 2153 inet_twsk_deschedule_put(inet_twsk(sk)); 2154 goto discard_it; 2155 case TCP_TW_SUCCESS:; 2156 } 2157 goto discard_it; 2158 } 2159 2160 static struct timewait_sock_ops tcp_timewait_sock_ops = { 2161 .twsk_obj_size = sizeof(struct tcp_timewait_sock), 2162 .twsk_unique = tcp_twsk_unique, 2163 .twsk_destructor= tcp_twsk_destructor, 2164 }; 2165 2166 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) 2167 { 2168 struct dst_entry *dst = skb_dst(skb); 2169 2170 if (dst && dst_hold_safe(dst)) { 2171 
rcu_assign_pointer(sk->sk_rx_dst, dst); 2172 sk->sk_rx_dst_ifindex = skb->skb_iif; 2173 } 2174 } 2175 EXPORT_SYMBOL(inet_sk_rx_dst_set); 2176 2177 const struct inet_connection_sock_af_ops ipv4_specific = { 2178 .queue_xmit = ip_queue_xmit, 2179 .send_check = tcp_v4_send_check, 2180 .rebuild_header = inet_sk_rebuild_header, 2181 .sk_rx_dst_set = inet_sk_rx_dst_set, 2182 .conn_request = tcp_v4_conn_request, 2183 .syn_recv_sock = tcp_v4_syn_recv_sock, 2184 .net_header_len = sizeof(struct iphdr), 2185 .setsockopt = ip_setsockopt, 2186 .getsockopt = ip_getsockopt, 2187 .addr2sockaddr = inet_csk_addr2sockaddr, 2188 .sockaddr_len = sizeof(struct sockaddr_in), 2189 .mtu_reduced = tcp_v4_mtu_reduced, 2190 }; 2191 EXPORT_SYMBOL(ipv4_specific); 2192 2193 #ifdef CONFIG_TCP_MD5SIG 2194 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = { 2195 .md5_lookup = tcp_v4_md5_lookup, 2196 .calc_md5_hash = tcp_v4_md5_hash_skb, 2197 .md5_parse = tcp_v4_parse_md5_keys, 2198 }; 2199 #endif 2200 2201 /* NOTE: A lot of things set to zero explicitly by call to 2202 * sk_alloc() so need not be done here. 2203 */ 2204 static int tcp_v4_init_sock(struct sock *sk) 2205 { 2206 struct inet_connection_sock *icsk = inet_csk(sk); 2207 2208 tcp_init_sock(sk); 2209 2210 icsk->icsk_af_ops = &ipv4_specific; 2211 2212 #ifdef CONFIG_TCP_MD5SIG 2213 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific; 2214 #endif 2215 2216 return 0; 2217 } 2218 2219 void tcp_v4_destroy_sock(struct sock *sk) 2220 { 2221 struct tcp_sock *tp = tcp_sk(sk); 2222 2223 trace_tcp_destroy_sock(sk); 2224 2225 tcp_clear_xmit_timers(sk); 2226 2227 tcp_cleanup_congestion_control(sk); 2228 2229 tcp_cleanup_ulp(sk); 2230 2231 /* Cleanup up the write buffer. */ 2232 tcp_write_queue_purge(sk); 2233 2234 /* Check if we want to disable active TFO */ 2235 tcp_fastopen_active_disable_ofo_check(sk); 2236 2237 /* Cleans up our, hopefully empty, out_of_order_queue. 
*/ 2238 skb_rbtree_purge(&tp->out_of_order_queue); 2239 2240 #ifdef CONFIG_TCP_MD5SIG 2241 /* Clean up the MD5 key list, if any */ 2242 if (tp->md5sig_info) { 2243 tcp_clear_md5_list(sk); 2244 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu); 2245 tp->md5sig_info = NULL; 2246 } 2247 #endif 2248 2249 /* Clean up a referenced TCP bind bucket. */ 2250 if (inet_csk(sk)->icsk_bind_hash) 2251 inet_put_port(sk); 2252 2253 BUG_ON(rcu_access_pointer(tp->fastopen_rsk)); 2254 2255 /* If socket is aborted during connect operation */ 2256 tcp_free_fastopen_req(tp); 2257 tcp_fastopen_destroy_cipher(sk); 2258 tcp_saved_syn_free(tp); 2259 2260 sk_sockets_allocated_dec(sk); 2261 } 2262 EXPORT_SYMBOL(tcp_v4_destroy_sock); 2263 2264 #ifdef CONFIG_PROC_FS 2265 /* Proc filesystem TCP sock list dumping. */ 2266 2267 static unsigned short seq_file_family(const struct seq_file *seq); 2268 2269 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk) 2270 { 2271 unsigned short family = seq_file_family(seq); 2272 2273 /* AF_UNSPEC is used as a match all */ 2274 return ((family == AF_UNSPEC || family == sk->sk_family) && 2275 net_eq(sock_net(sk), seq_file_net(seq))); 2276 } 2277 2278 /* Find a non empty bucket (starting from st->bucket) 2279 * and return the first sk from it. 
2280 */ 2281 static void *listening_get_first(struct seq_file *seq) 2282 { 2283 struct tcp_iter_state *st = seq->private; 2284 2285 st->offset = 0; 2286 for (; st->bucket <= tcp_hashinfo.lhash2_mask; st->bucket++) { 2287 struct inet_listen_hashbucket *ilb2; 2288 struct inet_connection_sock *icsk; 2289 struct sock *sk; 2290 2291 ilb2 = &tcp_hashinfo.lhash2[st->bucket]; 2292 if (hlist_empty(&ilb2->head)) 2293 continue; 2294 2295 spin_lock(&ilb2->lock); 2296 inet_lhash2_for_each_icsk(icsk, &ilb2->head) { 2297 sk = (struct sock *)icsk; 2298 if (seq_sk_match(seq, sk)) 2299 return sk; 2300 } 2301 spin_unlock(&ilb2->lock); 2302 } 2303 2304 return NULL; 2305 } 2306 2307 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket). 2308 * If "cur" is the last one in the st->bucket, 2309 * call listening_get_first() to return the first sk of the next 2310 * non empty bucket. 2311 */ 2312 static void *listening_get_next(struct seq_file *seq, void *cur) 2313 { 2314 struct tcp_iter_state *st = seq->private; 2315 struct inet_listen_hashbucket *ilb2; 2316 struct inet_connection_sock *icsk; 2317 struct sock *sk = cur; 2318 2319 ++st->num; 2320 ++st->offset; 2321 2322 icsk = inet_csk(sk); 2323 inet_lhash2_for_each_icsk_continue(icsk) { 2324 sk = (struct sock *)icsk; 2325 if (seq_sk_match(seq, sk)) 2326 return sk; 2327 } 2328 2329 ilb2 = &tcp_hashinfo.lhash2[st->bucket]; 2330 spin_unlock(&ilb2->lock); 2331 ++st->bucket; 2332 return listening_get_first(seq); 2333 } 2334 2335 static void *listening_get_idx(struct seq_file *seq, loff_t *pos) 2336 { 2337 struct tcp_iter_state *st = seq->private; 2338 void *rc; 2339 2340 st->bucket = 0; 2341 st->offset = 0; 2342 rc = listening_get_first(seq); 2343 2344 while (rc && *pos) { 2345 rc = listening_get_next(seq, rc); 2346 --*pos; 2347 } 2348 return rc; 2349 } 2350 2351 static inline bool empty_bucket(const struct tcp_iter_state *st) 2352 { 2353 return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain); 2354 } 2355 2356 /* 2357 * 
Get first established socket starting from bucket given in st->bucket. 2358 * If st->bucket is zero, the very first socket in the hash is returned. 2359 */ 2360 static void *established_get_first(struct seq_file *seq) 2361 { 2362 struct tcp_iter_state *st = seq->private; 2363 2364 st->offset = 0; 2365 for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) { 2366 struct sock *sk; 2367 struct hlist_nulls_node *node; 2368 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket); 2369 2370 /* Lockless fast path for the common case of empty buckets */ 2371 if (empty_bucket(st)) 2372 continue; 2373 2374 spin_lock_bh(lock); 2375 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) { 2376 if (seq_sk_match(seq, sk)) 2377 return sk; 2378 } 2379 spin_unlock_bh(lock); 2380 } 2381 2382 return NULL; 2383 } 2384 2385 static void *established_get_next(struct seq_file *seq, void *cur) 2386 { 2387 struct sock *sk = cur; 2388 struct hlist_nulls_node *node; 2389 struct tcp_iter_state *st = seq->private; 2390 2391 ++st->num; 2392 ++st->offset; 2393 2394 sk = sk_nulls_next(sk); 2395 2396 sk_nulls_for_each_from(sk, node) { 2397 if (seq_sk_match(seq, sk)) 2398 return sk; 2399 } 2400 2401 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); 2402 ++st->bucket; 2403 return established_get_first(seq); 2404 } 2405 2406 static void *established_get_idx(struct seq_file *seq, loff_t pos) 2407 { 2408 struct tcp_iter_state *st = seq->private; 2409 void *rc; 2410 2411 st->bucket = 0; 2412 rc = established_get_first(seq); 2413 2414 while (rc && pos) { 2415 rc = established_get_next(seq, rc); 2416 --pos; 2417 } 2418 return rc; 2419 } 2420 2421 static void *tcp_get_idx(struct seq_file *seq, loff_t pos) 2422 { 2423 void *rc; 2424 struct tcp_iter_state *st = seq->private; 2425 2426 st->state = TCP_SEQ_STATE_LISTENING; 2427 rc = listening_get_idx(seq, &pos); 2428 2429 if (!rc) { 2430 st->state = TCP_SEQ_STATE_ESTABLISHED; 2431 rc = established_get_idx(seq, pos); 2432 } 
2433 2434 return rc; 2435 } 2436 2437 static void *tcp_seek_last_pos(struct seq_file *seq) 2438 { 2439 struct tcp_iter_state *st = seq->private; 2440 int bucket = st->bucket; 2441 int offset = st->offset; 2442 int orig_num = st->num; 2443 void *rc = NULL; 2444 2445 switch (st->state) { 2446 case TCP_SEQ_STATE_LISTENING: 2447 if (st->bucket > tcp_hashinfo.lhash2_mask) 2448 break; 2449 st->state = TCP_SEQ_STATE_LISTENING; 2450 rc = listening_get_first(seq); 2451 while (offset-- && rc && bucket == st->bucket) 2452 rc = listening_get_next(seq, rc); 2453 if (rc) 2454 break; 2455 st->bucket = 0; 2456 st->state = TCP_SEQ_STATE_ESTABLISHED; 2457 fallthrough; 2458 case TCP_SEQ_STATE_ESTABLISHED: 2459 if (st->bucket > tcp_hashinfo.ehash_mask) 2460 break; 2461 rc = established_get_first(seq); 2462 while (offset-- && rc && bucket == st->bucket) 2463 rc = established_get_next(seq, rc); 2464 } 2465 2466 st->num = orig_num; 2467 2468 return rc; 2469 } 2470 2471 void *tcp_seq_start(struct seq_file *seq, loff_t *pos) 2472 { 2473 struct tcp_iter_state *st = seq->private; 2474 void *rc; 2475 2476 if (*pos && *pos == st->last_pos) { 2477 rc = tcp_seek_last_pos(seq); 2478 if (rc) 2479 goto out; 2480 } 2481 2482 st->state = TCP_SEQ_STATE_LISTENING; 2483 st->num = 0; 2484 st->bucket = 0; 2485 st->offset = 0; 2486 rc = *pos ? 
tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; 2487 2488 out: 2489 st->last_pos = *pos; 2490 return rc; 2491 } 2492 EXPORT_SYMBOL(tcp_seq_start); 2493 2494 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) 2495 { 2496 struct tcp_iter_state *st = seq->private; 2497 void *rc = NULL; 2498 2499 if (v == SEQ_START_TOKEN) { 2500 rc = tcp_get_idx(seq, 0); 2501 goto out; 2502 } 2503 2504 switch (st->state) { 2505 case TCP_SEQ_STATE_LISTENING: 2506 rc = listening_get_next(seq, v); 2507 if (!rc) { 2508 st->state = TCP_SEQ_STATE_ESTABLISHED; 2509 st->bucket = 0; 2510 st->offset = 0; 2511 rc = established_get_first(seq); 2512 } 2513 break; 2514 case TCP_SEQ_STATE_ESTABLISHED: 2515 rc = established_get_next(seq, v); 2516 break; 2517 } 2518 out: 2519 ++*pos; 2520 st->last_pos = *pos; 2521 return rc; 2522 } 2523 EXPORT_SYMBOL(tcp_seq_next); 2524 2525 void tcp_seq_stop(struct seq_file *seq, void *v) 2526 { 2527 struct tcp_iter_state *st = seq->private; 2528 2529 switch (st->state) { 2530 case TCP_SEQ_STATE_LISTENING: 2531 if (v != SEQ_START_TOKEN) 2532 spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock); 2533 break; 2534 case TCP_SEQ_STATE_ESTABLISHED: 2535 if (v) 2536 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); 2537 break; 2538 } 2539 } 2540 EXPORT_SYMBOL(tcp_seq_stop); 2541 2542 static void get_openreq4(const struct request_sock *req, 2543 struct seq_file *f, int i) 2544 { 2545 const struct inet_request_sock *ireq = inet_rsk(req); 2546 long delta = req->rsk_timer.expires - jiffies; 2547 2548 seq_printf(f, "%4d: %08X:%04X %08X:%04X" 2549 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK", 2550 i, 2551 ireq->ir_loc_addr, 2552 ireq->ir_num, 2553 ireq->ir_rmt_addr, 2554 ntohs(ireq->ir_rmt_port), 2555 TCP_SYN_RECV, 2556 0, 0, /* could print option size, but that is af dependent. 
*/ 2557 1, /* timers active (only the expire timer) */ 2558 jiffies_delta_to_clock_t(delta), 2559 req->num_timeout, 2560 from_kuid_munged(seq_user_ns(f), 2561 sock_i_uid(req->rsk_listener)), 2562 0, /* non standard timer */ 2563 0, /* open_requests have no inode */ 2564 0, 2565 req); 2566 } 2567 2568 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) 2569 { 2570 int timer_active; 2571 unsigned long timer_expires; 2572 const struct tcp_sock *tp = tcp_sk(sk); 2573 const struct inet_connection_sock *icsk = inet_csk(sk); 2574 const struct inet_sock *inet = inet_sk(sk); 2575 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq; 2576 __be32 dest = inet->inet_daddr; 2577 __be32 src = inet->inet_rcv_saddr; 2578 __u16 destp = ntohs(inet->inet_dport); 2579 __u16 srcp = ntohs(inet->inet_sport); 2580 int rx_queue; 2581 int state; 2582 2583 if (icsk->icsk_pending == ICSK_TIME_RETRANS || 2584 icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT || 2585 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { 2586 timer_active = 1; 2587 timer_expires = icsk->icsk_timeout; 2588 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) { 2589 timer_active = 4; 2590 timer_expires = icsk->icsk_timeout; 2591 } else if (timer_pending(&sk->sk_timer)) { 2592 timer_active = 2; 2593 timer_expires = sk->sk_timer.expires; 2594 } else { 2595 timer_active = 0; 2596 timer_expires = jiffies; 2597 } 2598 2599 state = inet_sk_state_load(sk); 2600 if (state == TCP_LISTEN) 2601 rx_queue = READ_ONCE(sk->sk_ack_backlog); 2602 else 2603 /* Because we don't lock the socket, 2604 * we might find a transient negative value. 
2605 */ 2606 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) - 2607 READ_ONCE(tp->copied_seq), 0); 2608 2609 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX " 2610 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d", 2611 i, src, srcp, dest, destp, state, 2612 READ_ONCE(tp->write_seq) - tp->snd_una, 2613 rx_queue, 2614 timer_active, 2615 jiffies_delta_to_clock_t(timer_expires - jiffies), 2616 icsk->icsk_retransmits, 2617 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)), 2618 icsk->icsk_probes_out, 2619 sock_i_ino(sk), 2620 refcount_read(&sk->sk_refcnt), sk, 2621 jiffies_to_clock_t(icsk->icsk_rto), 2622 jiffies_to_clock_t(icsk->icsk_ack.ato), 2623 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk), 2624 tp->snd_cwnd, 2625 state == TCP_LISTEN ? 2626 fastopenq->max_qlen : 2627 (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh)); 2628 } 2629 2630 static void get_timewait4_sock(const struct inet_timewait_sock *tw, 2631 struct seq_file *f, int i) 2632 { 2633 long delta = tw->tw_timer.expires - jiffies; 2634 __be32 dest, src; 2635 __u16 destp, srcp; 2636 2637 dest = tw->tw_daddr; 2638 src = tw->tw_rcv_saddr; 2639 destp = ntohs(tw->tw_dport); 2640 srcp = ntohs(tw->tw_sport); 2641 2642 seq_printf(f, "%4d: %08X:%04X %08X:%04X" 2643 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK", 2644 i, src, srcp, dest, destp, tw->tw_substate, 0, 0, 2645 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0, 2646 refcount_read(&tw->tw_refcnt), tw); 2647 } 2648 2649 #define TMPSZ 150 2650 2651 static int tcp4_seq_show(struct seq_file *seq, void *v) 2652 { 2653 struct tcp_iter_state *st; 2654 struct sock *sk = v; 2655 2656 seq_setwidth(seq, TMPSZ - 1); 2657 if (v == SEQ_START_TOKEN) { 2658 seq_puts(seq, " sl local_address rem_address st tx_queue " 2659 "rx_queue tr tm->when retrnsmt uid timeout " 2660 "inode"); 2661 goto out; 2662 } 2663 st = seq->private; 2664 2665 if (sk->sk_state == TCP_TIME_WAIT) 2666 get_timewait4_sock(v, seq, st->num); 2667 else if (sk->sk_state == 
TCP_NEW_SYN_RECV) 2668 get_openreq4(v, seq, st->num); 2669 else 2670 get_tcp4_sock(v, seq, st->num); 2671 out: 2672 seq_pad(seq, '\n'); 2673 return 0; 2674 } 2675 2676 #ifdef CONFIG_BPF_SYSCALL 2677 struct bpf_tcp_iter_state { 2678 struct tcp_iter_state state; 2679 unsigned int cur_sk; 2680 unsigned int end_sk; 2681 unsigned int max_sk; 2682 struct sock **batch; 2683 bool st_bucket_done; 2684 }; 2685 2686 struct bpf_iter__tcp { 2687 __bpf_md_ptr(struct bpf_iter_meta *, meta); 2688 __bpf_md_ptr(struct sock_common *, sk_common); 2689 uid_t uid __aligned(8); 2690 }; 2691 2692 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, 2693 struct sock_common *sk_common, uid_t uid) 2694 { 2695 struct bpf_iter__tcp ctx; 2696 2697 meta->seq_num--; /* skip SEQ_START_TOKEN */ 2698 ctx.meta = meta; 2699 ctx.sk_common = sk_common; 2700 ctx.uid = uid; 2701 return bpf_iter_run_prog(prog, &ctx); 2702 } 2703 2704 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter) 2705 { 2706 while (iter->cur_sk < iter->end_sk) 2707 sock_put(iter->batch[iter->cur_sk++]); 2708 } 2709 2710 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter, 2711 unsigned int new_batch_sz) 2712 { 2713 struct sock **new_batch; 2714 2715 new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz, 2716 GFP_USER | __GFP_NOWARN); 2717 if (!new_batch) 2718 return -ENOMEM; 2719 2720 bpf_iter_tcp_put_batch(iter); 2721 kvfree(iter->batch); 2722 iter->batch = new_batch; 2723 iter->max_sk = new_batch_sz; 2724 2725 return 0; 2726 } 2727 2728 static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq, 2729 struct sock *start_sk) 2730 { 2731 struct bpf_tcp_iter_state *iter = seq->private; 2732 struct tcp_iter_state *st = &iter->state; 2733 struct inet_connection_sock *icsk; 2734 unsigned int expected = 1; 2735 struct sock *sk; 2736 2737 sock_hold(start_sk); 2738 iter->batch[iter->end_sk++] = start_sk; 2739 2740 icsk = inet_csk(start_sk); 2741 
inet_lhash2_for_each_icsk_continue(icsk) { 2742 sk = (struct sock *)icsk; 2743 if (seq_sk_match(seq, sk)) { 2744 if (iter->end_sk < iter->max_sk) { 2745 sock_hold(sk); 2746 iter->batch[iter->end_sk++] = sk; 2747 } 2748 expected++; 2749 } 2750 } 2751 spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock); 2752 2753 return expected; 2754 } 2755 2756 static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq, 2757 struct sock *start_sk) 2758 { 2759 struct bpf_tcp_iter_state *iter = seq->private; 2760 struct tcp_iter_state *st = &iter->state; 2761 struct hlist_nulls_node *node; 2762 unsigned int expected = 1; 2763 struct sock *sk; 2764 2765 sock_hold(start_sk); 2766 iter->batch[iter->end_sk++] = start_sk; 2767 2768 sk = sk_nulls_next(start_sk); 2769 sk_nulls_for_each_from(sk, node) { 2770 if (seq_sk_match(seq, sk)) { 2771 if (iter->end_sk < iter->max_sk) { 2772 sock_hold(sk); 2773 iter->batch[iter->end_sk++] = sk; 2774 } 2775 expected++; 2776 } 2777 } 2778 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); 2779 2780 return expected; 2781 } 2782 2783 static struct sock *bpf_iter_tcp_batch(struct seq_file *seq) 2784 { 2785 struct bpf_tcp_iter_state *iter = seq->private; 2786 struct tcp_iter_state *st = &iter->state; 2787 unsigned int expected; 2788 bool resized = false; 2789 struct sock *sk; 2790 2791 /* The st->bucket is done. Directly advance to the next 2792 * bucket instead of having the tcp_seek_last_pos() to skip 2793 * one by one in the current bucket and eventually find out 2794 * it has to advance to the next bucket. 
2795 */ 2796 if (iter->st_bucket_done) { 2797 st->offset = 0; 2798 st->bucket++; 2799 if (st->state == TCP_SEQ_STATE_LISTENING && 2800 st->bucket > tcp_hashinfo.lhash2_mask) { 2801 st->state = TCP_SEQ_STATE_ESTABLISHED; 2802 st->bucket = 0; 2803 } 2804 } 2805 2806 again: 2807 /* Get a new batch */ 2808 iter->cur_sk = 0; 2809 iter->end_sk = 0; 2810 iter->st_bucket_done = false; 2811 2812 sk = tcp_seek_last_pos(seq); 2813 if (!sk) 2814 return NULL; /* Done */ 2815 2816 if (st->state == TCP_SEQ_STATE_LISTENING) 2817 expected = bpf_iter_tcp_listening_batch(seq, sk); 2818 else 2819 expected = bpf_iter_tcp_established_batch(seq, sk); 2820 2821 if (iter->end_sk == expected) { 2822 iter->st_bucket_done = true; 2823 return sk; 2824 } 2825 2826 if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) { 2827 resized = true; 2828 goto again; 2829 } 2830 2831 return sk; 2832 } 2833 2834 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos) 2835 { 2836 /* bpf iter does not support lseek, so it always 2837 * continue from where it was stop()-ped. 2838 */ 2839 if (*pos) 2840 return bpf_iter_tcp_batch(seq); 2841 2842 return SEQ_START_TOKEN; 2843 } 2844 2845 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) 2846 { 2847 struct bpf_tcp_iter_state *iter = seq->private; 2848 struct tcp_iter_state *st = &iter->state; 2849 struct sock *sk; 2850 2851 /* Whenever seq_next() is called, the iter->cur_sk is 2852 * done with seq_show(), so advance to the next sk in 2853 * the batch. 2854 */ 2855 if (iter->cur_sk < iter->end_sk) { 2856 /* Keeping st->num consistent in tcp_iter_state. 2857 * bpf_iter_tcp does not use st->num. 2858 * meta.seq_num is used instead. 2859 */ 2860 st->num++; 2861 /* Move st->offset to the next sk in the bucket such that 2862 * the future start() will resume at st->offset in 2863 * st->bucket. See tcp_seek_last_pos(). 
2864 */ 2865 st->offset++; 2866 sock_put(iter->batch[iter->cur_sk++]); 2867 } 2868 2869 if (iter->cur_sk < iter->end_sk) 2870 sk = iter->batch[iter->cur_sk]; 2871 else 2872 sk = bpf_iter_tcp_batch(seq); 2873 2874 ++*pos; 2875 /* Keeping st->last_pos consistent in tcp_iter_state. 2876 * bpf iter does not do lseek, so st->last_pos always equals to *pos. 2877 */ 2878 st->last_pos = *pos; 2879 return sk; 2880 } 2881 2882 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v) 2883 { 2884 struct bpf_iter_meta meta; 2885 struct bpf_prog *prog; 2886 struct sock *sk = v; 2887 bool slow; 2888 uid_t uid; 2889 int ret; 2890 2891 if (v == SEQ_START_TOKEN) 2892 return 0; 2893 2894 if (sk_fullsock(sk)) 2895 slow = lock_sock_fast(sk); 2896 2897 if (unlikely(sk_unhashed(sk))) { 2898 ret = SEQ_SKIP; 2899 goto unlock; 2900 } 2901 2902 if (sk->sk_state == TCP_TIME_WAIT) { 2903 uid = 0; 2904 } else if (sk->sk_state == TCP_NEW_SYN_RECV) { 2905 const struct request_sock *req = v; 2906 2907 uid = from_kuid_munged(seq_user_ns(seq), 2908 sock_i_uid(req->rsk_listener)); 2909 } else { 2910 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)); 2911 } 2912 2913 meta.seq = seq; 2914 prog = bpf_iter_get_info(&meta, false); 2915 ret = tcp_prog_seq_show(prog, &meta, v, uid); 2916 2917 unlock: 2918 if (sk_fullsock(sk)) 2919 unlock_sock_fast(sk, slow); 2920 return ret; 2921 2922 } 2923 2924 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v) 2925 { 2926 struct bpf_tcp_iter_state *iter = seq->private; 2927 struct bpf_iter_meta meta; 2928 struct bpf_prog *prog; 2929 2930 if (!v) { 2931 meta.seq = seq; 2932 prog = bpf_iter_get_info(&meta, true); 2933 if (prog) 2934 (void)tcp_prog_seq_show(prog, &meta, v, 0); 2935 } 2936 2937 if (iter->cur_sk < iter->end_sk) { 2938 bpf_iter_tcp_put_batch(iter); 2939 iter->st_bucket_done = false; 2940 } 2941 } 2942 2943 static const struct seq_operations bpf_iter_tcp_seq_ops = { 2944 .show = bpf_iter_tcp_seq_show, 2945 .start = 
bpf_iter_tcp_seq_start, 2946 .next = bpf_iter_tcp_seq_next, 2947 .stop = bpf_iter_tcp_seq_stop, 2948 }; 2949 #endif 2950 static unsigned short seq_file_family(const struct seq_file *seq) 2951 { 2952 const struct tcp_seq_afinfo *afinfo; 2953 2954 #ifdef CONFIG_BPF_SYSCALL 2955 /* Iterated from bpf_iter. Let the bpf prog to filter instead. */ 2956 if (seq->op == &bpf_iter_tcp_seq_ops) 2957 return AF_UNSPEC; 2958 #endif 2959 2960 /* Iterated from proc fs */ 2961 afinfo = pde_data(file_inode(seq->file)); 2962 return afinfo->family; 2963 } 2964 2965 static const struct seq_operations tcp4_seq_ops = { 2966 .show = tcp4_seq_show, 2967 .start = tcp_seq_start, 2968 .next = tcp_seq_next, 2969 .stop = tcp_seq_stop, 2970 }; 2971 2972 static struct tcp_seq_afinfo tcp4_seq_afinfo = { 2973 .family = AF_INET, 2974 }; 2975 2976 static int __net_init tcp4_proc_init_net(struct net *net) 2977 { 2978 if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops, 2979 sizeof(struct tcp_iter_state), &tcp4_seq_afinfo)) 2980 return -ENOMEM; 2981 return 0; 2982 } 2983 2984 static void __net_exit tcp4_proc_exit_net(struct net *net) 2985 { 2986 remove_proc_entry("tcp", net->proc_net); 2987 } 2988 2989 static struct pernet_operations tcp4_net_ops = { 2990 .init = tcp4_proc_init_net, 2991 .exit = tcp4_proc_exit_net, 2992 }; 2993 2994 int __init tcp4_proc_init(void) 2995 { 2996 return register_pernet_subsys(&tcp4_net_ops); 2997 } 2998 2999 void tcp4_proc_exit(void) 3000 { 3001 unregister_pernet_subsys(&tcp4_net_ops); 3002 } 3003 #endif /* CONFIG_PROC_FS */ 3004 3005 /* @wake is one when sk_stream_write_space() calls us. 3006 * This sends EPOLLOUT only if notsent_bytes is half the limit. 3007 * This mimics the strategy used in sock_def_write_space(). 
3008 */ 3009 bool tcp_stream_memory_free(const struct sock *sk, int wake) 3010 { 3011 const struct tcp_sock *tp = tcp_sk(sk); 3012 u32 notsent_bytes = READ_ONCE(tp->write_seq) - 3013 READ_ONCE(tp->snd_nxt); 3014 3015 return (notsent_bytes << wake) < tcp_notsent_lowat(tp); 3016 } 3017 EXPORT_SYMBOL(tcp_stream_memory_free); 3018 3019 struct proto tcp_prot = { 3020 .name = "TCP", 3021 .owner = THIS_MODULE, 3022 .close = tcp_close, 3023 .pre_connect = tcp_v4_pre_connect, 3024 .connect = tcp_v4_connect, 3025 .disconnect = tcp_disconnect, 3026 .accept = inet_csk_accept, 3027 .ioctl = tcp_ioctl, 3028 .init = tcp_v4_init_sock, 3029 .destroy = tcp_v4_destroy_sock, 3030 .shutdown = tcp_shutdown, 3031 .setsockopt = tcp_setsockopt, 3032 .getsockopt = tcp_getsockopt, 3033 .bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt, 3034 .keepalive = tcp_set_keepalive, 3035 .recvmsg = tcp_recvmsg, 3036 .sendmsg = tcp_sendmsg, 3037 .sendpage = tcp_sendpage, 3038 .backlog_rcv = tcp_v4_do_rcv, 3039 .release_cb = tcp_release_cb, 3040 .hash = inet_hash, 3041 .unhash = inet_unhash, 3042 .get_port = inet_csk_get_port, 3043 .put_port = inet_put_port, 3044 #ifdef CONFIG_BPF_SYSCALL 3045 .psock_update_sk_prot = tcp_bpf_update_proto, 3046 #endif 3047 .enter_memory_pressure = tcp_enter_memory_pressure, 3048 .leave_memory_pressure = tcp_leave_memory_pressure, 3049 .stream_memory_free = tcp_stream_memory_free, 3050 .sockets_allocated = &tcp_sockets_allocated, 3051 .orphan_count = &tcp_orphan_count, 3052 .memory_allocated = &tcp_memory_allocated, 3053 .memory_pressure = &tcp_memory_pressure, 3054 .sysctl_mem = sysctl_tcp_mem, 3055 .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem), 3056 .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), 3057 .max_header = MAX_TCP_HEADER, 3058 .obj_size = sizeof(struct tcp_sock), 3059 .slab_flags = SLAB_TYPESAFE_BY_RCU, 3060 .twsk_prot = &tcp_timewait_sock_ops, 3061 .rsk_prot = &tcp_request_sock_ops, 3062 .h.hashinfo = &tcp_hashinfo, 3063 
.no_autobind = true, 3064 .diag_destroy = tcp_abort, 3065 }; 3066 EXPORT_SYMBOL(tcp_prot); 3067 3068 static void __net_exit tcp_sk_exit(struct net *net) 3069 { 3070 struct inet_timewait_death_row *tcp_death_row = net->ipv4.tcp_death_row; 3071 3072 if (net->ipv4.tcp_congestion_control) 3073 bpf_module_put(net->ipv4.tcp_congestion_control, 3074 net->ipv4.tcp_congestion_control->owner); 3075 if (refcount_dec_and_test(&tcp_death_row->tw_refcount)) 3076 kfree(tcp_death_row); 3077 } 3078 3079 static int __net_init tcp_sk_init(struct net *net) 3080 { 3081 int cnt; 3082 3083 net->ipv4.sysctl_tcp_ecn = 2; 3084 net->ipv4.sysctl_tcp_ecn_fallback = 1; 3085 3086 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS; 3087 net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS; 3088 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD; 3089 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL; 3090 net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS; 3091 3092 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME; 3093 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES; 3094 net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL; 3095 3096 net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES; 3097 net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES; 3098 net->ipv4.sysctl_tcp_syncookies = 1; 3099 net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH; 3100 net->ipv4.sysctl_tcp_retries1 = TCP_RETR1; 3101 net->ipv4.sysctl_tcp_retries2 = TCP_RETR2; 3102 net->ipv4.sysctl_tcp_orphan_retries = 0; 3103 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT; 3104 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX; 3105 net->ipv4.sysctl_tcp_tw_reuse = 2; 3106 net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1; 3107 3108 net->ipv4.tcp_death_row = kzalloc(sizeof(struct inet_timewait_death_row), GFP_KERNEL); 3109 if (!net->ipv4.tcp_death_row) 3110 return -ENOMEM; 3111 refcount_set(&net->ipv4.tcp_death_row->tw_refcount, 1); 3112 cnt = tcp_hashinfo.ehash_mask + 1; 3113 
net->ipv4.tcp_death_row->sysctl_max_tw_buckets = cnt / 2; 3114 net->ipv4.tcp_death_row->hashinfo = &tcp_hashinfo; 3115 3116 net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128); 3117 net->ipv4.sysctl_tcp_sack = 1; 3118 net->ipv4.sysctl_tcp_window_scaling = 1; 3119 net->ipv4.sysctl_tcp_timestamps = 1; 3120 net->ipv4.sysctl_tcp_early_retrans = 3; 3121 net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION; 3122 net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */ 3123 net->ipv4.sysctl_tcp_retrans_collapse = 1; 3124 net->ipv4.sysctl_tcp_max_reordering = 300; 3125 net->ipv4.sysctl_tcp_dsack = 1; 3126 net->ipv4.sysctl_tcp_app_win = 31; 3127 net->ipv4.sysctl_tcp_adv_win_scale = 1; 3128 net->ipv4.sysctl_tcp_frto = 2; 3129 net->ipv4.sysctl_tcp_moderate_rcvbuf = 1; 3130 /* This limits the percentage of the congestion window which we 3131 * will allow a single TSO frame to consume. Building TSO frames 3132 * which are too large can cause TCP streams to be bursty. 
3133 */ 3134 net->ipv4.sysctl_tcp_tso_win_divisor = 3; 3135 /* Default TSQ limit of 16 TSO segments */ 3136 net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536; 3137 /* rfc5961 challenge ack rate limiting */ 3138 net->ipv4.sysctl_tcp_challenge_ack_limit = 1000; 3139 net->ipv4.sysctl_tcp_min_tso_segs = 2; 3140 net->ipv4.sysctl_tcp_tso_rtt_log = 9; /* 2^9 = 512 usec */ 3141 net->ipv4.sysctl_tcp_min_rtt_wlen = 300; 3142 net->ipv4.sysctl_tcp_autocorking = 1; 3143 net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2; 3144 net->ipv4.sysctl_tcp_pacing_ss_ratio = 200; 3145 net->ipv4.sysctl_tcp_pacing_ca_ratio = 120; 3146 if (net != &init_net) { 3147 memcpy(net->ipv4.sysctl_tcp_rmem, 3148 init_net.ipv4.sysctl_tcp_rmem, 3149 sizeof(init_net.ipv4.sysctl_tcp_rmem)); 3150 memcpy(net->ipv4.sysctl_tcp_wmem, 3151 init_net.ipv4.sysctl_tcp_wmem, 3152 sizeof(init_net.ipv4.sysctl_tcp_wmem)); 3153 } 3154 net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC; 3155 net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC; 3156 net->ipv4.sysctl_tcp_comp_sack_nr = 44; 3157 net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; 3158 net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0; 3159 atomic_set(&net->ipv4.tfo_active_disable_times, 0); 3160 3161 /* Reno is always built in */ 3162 if (!net_eq(net, &init_net) && 3163 bpf_try_module_get(init_net.ipv4.tcp_congestion_control, 3164 init_net.ipv4.tcp_congestion_control->owner)) 3165 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control; 3166 else 3167 net->ipv4.tcp_congestion_control = &tcp_reno; 3168 3169 return 0; 3170 } 3171 3172 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list) 3173 { 3174 struct net *net; 3175 3176 list_for_each_entry(net, net_exit_list, exit_list) 3177 tcp_fastopen_ctx_destroy(net); 3178 } 3179 3180 static struct pernet_operations __net_initdata tcp_sk_ops = { 3181 .init = tcp_sk_init, 3182 .exit = tcp_sk_exit, 3183 .exit_batch = tcp_sk_exit_batch, 3184 }; 3185 3186 #if 
defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 3187 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta, 3188 struct sock_common *sk_common, uid_t uid) 3189 3190 #define INIT_BATCH_SZ 16 3191 3192 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux) 3193 { 3194 struct bpf_tcp_iter_state *iter = priv_data; 3195 int err; 3196 3197 err = bpf_iter_init_seq_net(priv_data, aux); 3198 if (err) 3199 return err; 3200 3201 err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ); 3202 if (err) { 3203 bpf_iter_fini_seq_net(priv_data); 3204 return err; 3205 } 3206 3207 return 0; 3208 } 3209 3210 static void bpf_iter_fini_tcp(void *priv_data) 3211 { 3212 struct bpf_tcp_iter_state *iter = priv_data; 3213 3214 bpf_iter_fini_seq_net(priv_data); 3215 kvfree(iter->batch); 3216 } 3217 3218 static const struct bpf_iter_seq_info tcp_seq_info = { 3219 .seq_ops = &bpf_iter_tcp_seq_ops, 3220 .init_seq_private = bpf_iter_init_tcp, 3221 .fini_seq_private = bpf_iter_fini_tcp, 3222 .seq_priv_size = sizeof(struct bpf_tcp_iter_state), 3223 }; 3224 3225 static const struct bpf_func_proto * 3226 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id, 3227 const struct bpf_prog *prog) 3228 { 3229 switch (func_id) { 3230 case BPF_FUNC_setsockopt: 3231 return &bpf_sk_setsockopt_proto; 3232 case BPF_FUNC_getsockopt: 3233 return &bpf_sk_getsockopt_proto; 3234 default: 3235 return NULL; 3236 } 3237 } 3238 3239 static struct bpf_iter_reg tcp_reg_info = { 3240 .target = "tcp", 3241 .ctx_arg_info_size = 1, 3242 .ctx_arg_info = { 3243 { offsetof(struct bpf_iter__tcp, sk_common), 3244 PTR_TO_BTF_ID_OR_NULL }, 3245 }, 3246 .get_func_proto = bpf_iter_tcp_get_func_proto, 3247 .seq_info = &tcp_seq_info, 3248 }; 3249 3250 static void __init bpf_iter_register(void) 3251 { 3252 tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON]; 3253 if (bpf_iter_reg_target(&tcp_reg_info)) 3254 pr_warn("Warning: could not register bpf iterator tcp\n"); 3255 } 3256 3257 #endif 
3258 3259 void __init tcp_v4_init(void) 3260 { 3261 int cpu, res; 3262 3263 for_each_possible_cpu(cpu) { 3264 struct sock *sk; 3265 3266 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW, 3267 IPPROTO_TCP, &init_net); 3268 if (res) 3269 panic("Failed to create the TCP control socket.\n"); 3270 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); 3271 3272 /* Please enforce IP_DF and IPID==0 for RST and 3273 * ACK sent in SYN-RECV and TIME-WAIT state. 3274 */ 3275 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO; 3276 3277 per_cpu(ipv4_tcp_sk, cpu) = sk; 3278 } 3279 if (register_pernet_subsys(&tcp_sk_ops)) 3280 panic("Failed to create the TCP control socket.\n"); 3281 3282 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 3283 bpf_iter_register(); 3284 #endif 3285 } 3286