// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system. INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 *		IPv4 specific functions
 *
 *		code split from:
 *		linux/ipv4/tcp.c
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *		See tcp.c for author information
 */

/*
 * Changes:
 *		David S. Miller	:	New socket lookup architecture.
 *					This code is dedicated to John Dyson.
 *		David S. Miller :	Change semantics of established hash,
 *					half is devoted to TIME_WAIT sockets
 *					and the rest go in the other half.
 *		Andi Kleen :		Add support for syncookies and fixed
 *					some bugs: ip options weren't passed to
 *					the TCP layer, missed a check for an
 *					ACK bit.
 *		Andi Kleen :		Implemented fast path mtu discovery.
 *					Fixed many serious bugs in the
 *					request_sock handling and moved
 *					most of it into the af independent code.
 *					Added tail drop and some other bugfixes.
 *					Added new listen semantics.
 *		Mike McLagan	:	Routing by source
 *	Juan Jose Ciarlante:		ip_dynaddr bits
 *		Andi Kleen:		various fixes.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year
 *					coma.
 *	Andi Kleen		:	Fix new listen.
 *	Andi Kleen		:	Fix accept error reporting.
 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
 *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
 *					a single port at the same time.
 */

#define pr_fmt(fmt) "TCP: " fmt

#include <linux/bottom_half.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
#include <linux/slab.h>

#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/secure_seq.h>
#include <net/busy_poll.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/inetdevice.h>
#include <linux/btf_ids.h>

#include <crypto/hash.h>
#include <linux/scatterlist.h>

#include <trace/events/tcp.h>

#ifdef CONFIG_TCP_MD5SIG
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
#endif

struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);

static DEFINE_PER_CPU(struct sock *, ipv4_tcp_sk);

static u32 tcp_v4_init_seq(const struct sk_buff *skb)
{
	return secure_tcp_seq(ip_hdr(skb)->daddr,
			      ip_hdr(skb)->saddr,
			      tcp_hdr(skb)->dest,
			      tcp_hdr(skb)->source);
}

static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
{
	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
}

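/* Editorial note: tcp_twsk_unique() below decides whether a TIME-WAIT socket
 * occupying the desired 4-tuple may be reused for a new outgoing connection.
 * The net.ipv4.tcp_tw_reuse value it reads selects the policy:
 *   0 - never reuse TIME-WAIT sockets,
 *   1 - reuse when the timestamp/PAWS checks below make it safe,
 *   2 - as 1, but only for connections that involve loopback.
 * Illustrative shell usage (not part of this file), assuming a standard
 * sysctl setup:
 *
 *	sysctl -w net.ipv4.tcp_tw_reuse=1
 */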
int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
	const struct inet_timewait_sock *tw = inet_twsk(sktw);
	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
	struct tcp_sock *tp = tcp_sk(sk);
	int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;

	if (reuse == 2) {
		/* Still does not detect *everything* that goes through
		 * lo, since we require a loopback src or dst address
		 * or direct binding to 'lo' interface.
		 */
		bool loopback = false;
		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
			loopback = true;
#if IS_ENABLED(CONFIG_IPV6)
		if (tw->tw_family == AF_INET6) {
			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
				loopback = true;
		} else
#endif
		{
			if (ipv4_is_loopback(tw->tw_daddr) ||
			    ipv4_is_loopback(tw->tw_rcv_saddr))
				loopback = true;
		}
		if (!loopback)
			reuse = 0;
	}

	/* With PAWS, it is safe from the viewpoint
	   of data integrity. Even without PAWS it is safe provided sequence
	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.

	   Actually, the idea is close to VJ's one, only timestamp cache is
	   held not per host, but per port pair and TW bucket is used as state
	   holder.

	   If TW bucket has been already destroyed we fall back to VJ's scheme
	   and use initial timestamp retrieved from peer table.
	 */
	if (tcptw->tw_ts_recent_stamp &&
	    (!twp || (reuse && time_after32(ktime_get_seconds(),
					    tcptw->tw_ts_recent_stamp)))) {
		/* In case of repair and re-using TIME-WAIT sockets we still
		 * want to be sure that it is safe as above but honor the
		 * sequence numbers and time stamps set as part of the repair
		 * process.
		 *
		 * Without this check re-using a TIME-WAIT socket with TCP
		 * repair would accumulate a -1 on the repair assigned
		 * sequence number. The first time it is reused the sequence
		 * is -1, the second time -2, etc. This fixes that issue
		 * without appearing to create any others.
		 */
		if (likely(!tp->repair)) {
			u32 seq = tcptw->tw_snd_nxt + 65535 + 2;

			if (!seq)
				seq = 1;
			WRITE_ONCE(tp->write_seq, seq);
			tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
			tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
		}
		sock_hold(sktw);
		return 1;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(tcp_twsk_unique);

static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
			      int addr_len)
{
	/* This check is replicated from tcp_v4_connect() and intended to
	 * prevent BPF program called below from accessing bytes that are out
	 * of the bound specified by user in addr_len.
	 */
	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	sock_owned_by_me(sk);

	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
}

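/* For orientation: tcp_v4_connect() below is reached from the connect()
 * system call on an IPv4 TCP socket, after tcp_v4_pre_connect() has run
 * any attached cgroup BPF program.  A minimal userspace sketch of that
 * path (illustrative only, not part of this file; needs <sys/socket.h>
 * and <netinet/in.h>):
 *
 *	int fd = socket(AF_INET, SOCK_STREAM, 0);
 *	struct sockaddr_in dst = {
 *		.sin_family = AF_INET,
 *		.sin_port   = htons(80),
 *		.sin_addr   = { htonl(INADDR_LOOPBACK) },
 *	};
 *	connect(fd, (struct sockaddr *)&dst, sizeof(dst));
 */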
/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	__be16 orig_sport, orig_dport;
	__be32 daddr, nexthop;
	struct flowi4 *fl4;
	struct rtable *rt;
	int err;
	struct ip_options_rcu *inet_opt;
	struct inet_timewait_death_row *tcp_death_row = sock_net(sk)->ipv4.tcp_death_row;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	inet_opt = rcu_dereference_protected(inet->inet_opt,
					     lockdep_sock_is_held(sk));
	if (inet_opt && inet_opt->opt.srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet_opt->opt.faddr;
	}

	orig_sport = inet->inet_sport;
	orig_dport = usin->sin_port;
	fl4 = &inet->cork.fl.u.ip4;
	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
			      IPPROTO_TCP,
			      orig_sport, orig_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		if (err == -ENETUNREACH)
			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
		return err;
	}

	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet_opt || !inet_opt->opt.srr)
		daddr = fl4->daddr;

	if (!inet->inet_saddr)
		inet->inet_saddr = fl4->saddr;
	sk_rcv_saddr_set(sk, inet->inet_saddr);

	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent	   = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		if (likely(!tp->repair))
			WRITE_ONCE(tp->write_seq, 0);
	}

	inet->inet_dport = usin->sin_port;
	sk_daddr_set(sk, daddr);

	inet_csk(sk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;

	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and, without releasing the socket
	 * lock, select a source port, enter ourselves into the hash tables and
	 * complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(tcp_death_row, sk);
	if (err)
		goto failure;

	sk_set_txhash(sk);

	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
			       inet->inet_sport, inet->inet_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		rt = NULL;
		goto failure;
	}
	/* OK, now commit destination to socket. */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->dst);
	rt = NULL;

	if (likely(!tp->repair)) {
		if (!tp->write_seq)
			WRITE_ONCE(tp->write_seq,
				   secure_tcp_seq(inet->inet_saddr,
						  inet->inet_daddr,
						  inet->inet_sport,
						  usin->sin_port));
		tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
						 inet->inet_saddr,
						 inet->inet_daddr);
	}

	inet->inet_id = prandom_u32();

	if (tcp_fastopen_defer_connect(sk, &err))
		return err;
	if (err)
		goto failure;

	err = tcp_connect(sk);

	if (err)
		goto failure;

	return 0;

failure:
	/*
	 * This unhashes the socket and releases the local port,
	 * if necessary.
	 */
	tcp_set_state(sk, TCP_CLOSE);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->inet_dport = 0;
	return err;
}
EXPORT_SYMBOL(tcp_v4_connect);

/*
 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 * It can be called through tcp_release_cb() if socket was owned by user
 * at the time tcp_v4_err() was called to handle ICMP message.
 */
void tcp_v4_mtu_reduced(struct sock *sk)
{
	struct inet_sock *inet = inet_sk(sk);
	struct dst_entry *dst;
	u32 mtu;

	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
		return;
	mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
	dst = inet_csk_update_pmtu(sk, mtu);
	if (!dst)
		return;

	/* Something is about to be wrong... Remember soft error
	 * for the case, if this connection will not be able to recover.
	 */
	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
		sk->sk_err_soft = EMSGSIZE;

	mtu = dst_mtu(dst);

	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    ip_sk_accept_pmtu(sk) &&
	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
		tcp_sync_mss(sk, mtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}
EXPORT_SYMBOL(tcp_v4_mtu_reduced);

static void do_redirect(struct sk_buff *skb, struct sock *sk)
{
	struct dst_entry *dst = __sk_dst_check(sk, 0);

	if (dst)
		dst->ops->redirect(dst, sk, skb);
}


/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
void tcp_req_err(struct sock *sk, u32 seq, bool abort)
{
	struct request_sock *req = inet_reqsk(sk);
	struct net *net = sock_net(sk);

	/* ICMPs are not backlogged, hence we cannot get
	 * an established socket here.
	 */
	if (seq != tcp_rsk(req)->snt_isn) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
	} else if (abort) {
		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
		tcp_listendrop(req->rsk_listener);
	}
	reqsk_put(req);
}
EXPORT_SYMBOL(tcp_req_err);

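/* Background for the helper below: RFC 6069 ("TCP-LD") observes that an
 * ICMP destination-unreachable arriving for the oldest unacknowledged
 * segment while the retransmission timer is backing off usually means
 * the loss was caused by a transient routing problem rather than by
 * congestion.  In that case one backoff step is undone and the
 * retransmission timer is re-armed (or fired immediately if it would
 * already have expired).  This is an editorial summary of the code
 * below; see RFC 6069 for the authoritative rationale.
 */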
/* TCP-LD (RFC 6069) logic */
void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;
	s32 remaining;
	u32 delta_us;

	if (sock_owned_by_user(sk))
		return;

	if (seq != tp->snd_una || !icsk->icsk_retransmits ||
	    !icsk->icsk_backoff)
		return;

	skb = tcp_rtx_queue_head(sk);
	if (WARN_ON_ONCE(!skb))
		return;

	icsk->icsk_backoff--;
	icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
	icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);

	tcp_mstamp_refresh(tp);
	delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
	remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);

	if (remaining > 0) {
		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
					  remaining, TCP_RTO_MAX);
	} else {
		/* RTO revert clocked out retransmission.
		 * Will retransmit now.
		 */
		tcp_retransmit_timer(sk);
	}
}
EXPORT_SYMBOL(tcp_ld_RTO_revert);

/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 *
 */

int tcp_v4_err(struct sk_buff *skb, u32 info)
{
	const struct iphdr *iph = (const struct iphdr *)skb->data;
	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
	struct tcp_sock *tp;
	struct inet_sock *inet;
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	struct sock *sk;
	struct request_sock *fastopen;
	u32 seq, snd_una;
	int err;
	struct net *net = dev_net(skb->dev);

	sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
				       th->dest, iph->saddr, ntohs(th->source),
				       inet_iif(skb), 0);
	if (!sk) {
		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
		return -ENOENT;
	}
	if (sk->sk_state == TCP_TIME_WAIT) {
		inet_twsk_put(inet_twsk(sk));
		return 0;
	}
	seq = ntohl(th->seq);
	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
				     type == ICMP_TIME_EXCEEDED ||
				     (type == ICMP_DEST_UNREACH &&
				      (code == ICMP_NET_UNREACH ||
				       code == ICMP_HOST_UNREACH)));
		return 0;
	}

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 * We do take care of PMTU discovery (RFC1191) special case :
	 * we can receive locally generated ICMP messages while socket is held.
	 */
	if (sock_owned_by_user(sk)) {
		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
	}
	if (sk->sk_state == TCP_CLOSE)
		goto out;

	if (static_branch_unlikely(&ip4_min_ttl)) {
		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
			goto out;
		}
	}

	tp = tcp_sk(sk);
	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
	fastopen = rcu_dereference(tp->fastopen_rsk);
	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, snd_una, tp->snd_nxt)) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	switch (type) {
	case ICMP_REDIRECT:
		if (!sock_owned_by_user(sk))
			do_redirect(skb, sk);
		goto out;
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			/* We are not interested in TCP_LISTEN and open_requests
			 * (SYN-ACKs sent out by Linux are always <576bytes so
			 * they should go through unfragmented).
			 */
			if (sk->sk_state == TCP_LISTEN)
				goto out;

			WRITE_ONCE(tp->mtu_info, info);
			if (!sock_owned_by_user(sk)) {
				tcp_v4_mtu_reduced(sk);
			} else {
				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
					sock_hold(sk);
			}
			goto out;
		}

		err = icmp_err_convert[code].errno;
		/* check if this ICMP message allows revert of backoff.
		 * (see RFC 6069)
		 */
		if (!fastopen &&
		    (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
			tcp_ld_RTO_revert(sk, seq);
		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->sk_state) {
	case TCP_SYN_SENT:
	case TCP_SYN_RECV:
		/* Only in fast or simultaneous open. If a fast open socket is
		 * already accepted it is treated as a connected one below.
		 */
		if (fastopen && !fastopen->sk)
			break;

		ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);

		if (!sock_owned_by_user(sk)) {
			sk->sk_err = err;

			sk_error_report(sk);

			tcp_done(sk);
		} else {
			sk->sk_err_soft = err;
		}
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows to consider as hard errors
	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note, that in modern internet, where routing is unreliable
	 * and in each dark corner broken firewalls sit, sending random
	 * errors ordered by their masters even these two messages finally lose
	 * their original sense (even Linux sends invalid PORT_UNREACHs)
	 *
	 * Now we are in compliance with RFCs.
	 * --ANK (980905)
	 */

	inet = inet_sk(sk);
	if (!sock_owned_by_user(sk) && inet->recverr) {
		sk->sk_err = err;
		sk_error_report(sk);
	} else	{ /* Only an error on timeout */
		sk->sk_err_soft = err;
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
	return 0;
}

void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
{
	struct tcphdr *th = tcp_hdr(skb);

	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
	skb->csum_start = skb_transport_header(skb) - skb->head;
	skb->csum_offset = offsetof(struct tcphdr, check);
}

/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
{
	const struct inet_sock *inet = inet_sk(sk);

	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
}
EXPORT_SYMBOL(tcp_v4_send_check);

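/* A note on the helpers above: __tcp_v4_send_check() only stores the
 * complement of the pseudo-header sum in th->check and records
 * csum_start/csum_offset, i.e. it prepares CHECKSUM_PARTIAL.  The 1's
 * complement sum over the TCP header and payload is then filled in by
 * the NIC, or by skb_checksum_help() when the device lacks the feature.
 * (Editorial summary of existing behaviour, not new code.)
 */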
/*
 *	This routine will send an RST to the other tcp.
 *
 *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 *		      for reset.
 *	Answer: if a packet caused RST, it is not for a socket
 *		existing in our system, if it is matched to a socket,
 *		it is just duplicate segment or bug in other side's TCP.
 *	So that we build reply only basing on parameters
 *	arrived with segment.
 *	Exception: precedence violation. We do not implement it in any case.
 */

#ifdef CONFIG_TCP_MD5SIG
#define OPTION_BYTES TCPOLEN_MD5SIG_ALIGNED
#else
#define OPTION_BYTES sizeof(__be32)
#endif

static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
		__be32 opt[OPTION_BYTES / sizeof(__be32)];
	} rep;
	struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key = NULL;
	const __u8 *hash_location = NULL;
	unsigned char newhash[16];
	int genhash;
	struct sock *sk1 = NULL;
#endif
	u64 transmit_time = 0;
	struct sock *ctl_sk;
	struct net *net;

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	/* If sk not NULL, it means we did a successful lookup and incoming
	 * route had to be correct. prequeue might have dropped our dst.
	 */
	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rep, 0, sizeof(rep));
	rep.th.dest   = th->source;
	rep.th.source = th->dest;
	rep.th.doff   = sizeof(struct tcphdr) / 4;
	rep.th.rst    = 1;

	if (th->ack) {
		rep.th.seq = th->ack_seq;
	} else {
		rep.th.ack = 1;
		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				       skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof(arg));
	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);

	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
#ifdef CONFIG_TCP_MD5SIG
	rcu_read_lock();
	hash_location = tcp_parse_md5sig_option(th);
	if (sk && sk_fullsock(sk)) {
		const union tcp_md5_addr *addr;
		int l3index;

		/* sdif set, means packet ingressed via a device
		 * in an L3 domain and inet_iif is set to it.
		 */
		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
	} else if (hash_location) {
		const union tcp_md5_addr *addr;
		int sdif = tcp_v4_sdif(skb);
		int dif = inet_iif(skb);
		int l3index;

		/*
		 * active side is lost. Try to find listening socket through
		 * source port, and then find md5 key through listening socket.
		 * We are not losing security here:
		 * Incoming packet is checked with md5 hash with finding key,
		 * no RST generated if md5 hash doesn't match.
		 */
		sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
					     ip_hdr(skb)->saddr,
					     th->source, ip_hdr(skb)->daddr,
					     ntohs(th->source), dif, sdif);
		/* don't send rst if it can't find key */
		if (!sk1)
			goto out;

		/* sdif set, means packet ingressed via a device
		 * in an L3 domain and dif is set to it.
		 */
		l3index = sdif ? dif : 0;
		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
		if (!key)
			goto out;


		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
		if (genhash || memcmp(hash_location, newhash, 16) != 0)
			goto out;

	}

	if (key) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
				   (TCPOPT_NOP << 16) |
				   (TCPOPT_MD5SIG << 8) |
				   TCPOLEN_MD5SIG);
		/* Update length and the length the header thinks exists */
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	/* Can't co-exist with TCPMD5, hence check rep.opt[0] */
	if (rep.opt[0] == 0) {
		__be32 mrst = mptcp_reset_option(skb);

		if (mrst) {
			rep.opt[0] = mrst;
			arg.iov[0].iov_len += sizeof(mrst);
			rep.th.doff = arg.iov[0].iov_len / 4;
		}
	}

	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;

	/* When socket is gone, all binding information is lost.
	 * routing might fail in this case. No choice here, if we choose to force
	 * input interface, we will misroute in case of asymmetric route.
	 */
	if (sk) {
		arg.bound_dev_if = sk->sk_bound_dev_if;
		if (sk_fullsock(sk))
			trace_tcp_send_reset(sk, skb);
	}

	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));

	arg.tos = ip_hdr(skb)->tos;
	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
	local_bh_disable();
	ctl_sk = this_cpu_read(ipv4_tcp_sk);
	sock_net_set(ctl_sk, net);
	if (sk) {
		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
				   inet_twsk(sk)->tw_mark : sk->sk_mark;
		ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
				   inet_twsk(sk)->tw_priority : sk->sk_priority;
		transmit_time = tcp_transmit_time(sk);
	}
	ip_send_unicast_reply(ctl_sk,
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len,
			      transmit_time);

	ctl_sk->sk_mark = 0;
	sock_net_set(ctl_sk, &init_net);
	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
	local_bh_enable();

#ifdef CONFIG_TCP_MD5SIG
out:
	rcu_read_unlock();
#endif
}

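/* Editorial note on the transmit path used above: the reset (like the
 * ACKs built further below) is sent through the per-cpu ipv4_tcp_sk
 * kernel socket via ip_send_unicast_reply(), so no socket lock is
 * needed and the reply is routed purely from the addresses of the
 * segment that triggered it.  When a socket was matched, its mark,
 * priority and pacing timestamp are copied onto the control socket for
 * this one transmission (the mark is cleared again afterwards).
 */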
/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
   outside socket context is ugly, certainly. What can I do?
 */

static void tcp_v4_send_ack(const struct sock *sk,
			    struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 tsval, u32 tsecr, int oif,
			    struct tcp_md5sig_key *key,
			    int reply_flags, u8 tos)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
#ifdef CONFIG_TCP_MD5SIG
			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
#endif
			];
	} rep;
	struct net *net = sock_net(sk);
	struct ip_reply_arg arg;
	struct sock *ctl_sk;
	u64 transmit_time;

	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof(arg));

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);
	if (tsecr) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				   (TCPOPT_TIMESTAMP << 8) |
				   TCPOLEN_TIMESTAMP);
		rep.opt[1] = htonl(tsval);
		rep.opt[2] = htonl(tsecr);
		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
	}

	/* Swap the send and the receive. */
	rep.th.dest    = th->source;
	rep.th.source  = th->dest;
	rep.th.doff    = arg.iov[0].iov_len / 4;
	rep.th.seq     = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack     = 1;
	rep.th.window  = htons(win);

#ifdef CONFIG_TCP_MD5SIG
	if (key) {
		int offset = (tsecr) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
					  (TCPOPT_NOP << 16) |
					  (TCPOPT_MD5SIG << 8) |
					  TCPOLEN_MD5SIG);
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len/4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.flags = reply_flags;
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	if (oif)
		arg.bound_dev_if = oif;
	arg.tos = tos;
	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
	local_bh_disable();
	ctl_sk = this_cpu_read(ipv4_tcp_sk);
	sock_net_set(ctl_sk, net);
	ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
			   inet_twsk(sk)->tw_mark : sk->sk_mark;
	ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
			   inet_twsk(sk)->tw_priority : sk->sk_priority;
	transmit_time = tcp_transmit_time(sk);
	ip_send_unicast_reply(ctl_sk,
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len,
			      transmit_time);

	ctl_sk->sk_mark = 0;
	sock_net_set(ctl_sk, &init_net);
	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	local_bh_enable();
}

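/* Layout note for tcp_v4_send_ack() above: when both options are emitted,
 * the timestamp option occupies rep.opt[0..2] and the MD5 option starts
 * at rep.opt[3]; the data offset (doff) and iov_len are bumped together
 * each time an option block is appended.  The two callers below reuse
 * this helper for TIME-WAIT ACKs and for ACKs on request sockets
 * (SYN-RECV), which is why it cannot rely on a full socket.
 */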
static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
	struct inet_timewait_sock *tw = inet_twsk(sk);
	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);

	tcp_v4_send_ack(sk, skb,
			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
			tcp_time_stamp_raw() + tcptw->tw_ts_offset,
			tcptw->tw_ts_recent,
			tw->tw_bound_dev_if,
			tcp_twsk_md5_key(tcptw),
			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
			tw->tw_tos
			);

	inet_twsk_put(tw);
}

static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req)
{
	const union tcp_md5_addr *addr;
	int l3index;

	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
	 */
	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
					     tcp_sk(sk)->snd_nxt;

	/* RFC 7323 2.3
	 * The window field (SEG.WND) of every outgoing segment, with the
	 * exception of <SYN> segments, MUST be right-shifted by
	 * Rcv.Wind.Shift bits:
	 */
	addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
	l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
	tcp_v4_send_ack(sk, skb, seq,
			tcp_rsk(req)->rcv_nxt,
			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
			tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
			req->ts_recent,
			0,
			tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
			ip_hdr(skb)->tos);
}

/*
 *	Send a SYN-ACK after having received a SYN.
 *	This still operates on a request_sock only, not on a big
 *	socket.
 */
static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
			      struct flowi *fl,
			      struct request_sock *req,
			      struct tcp_fastopen_cookie *foc,
			      enum tcp_synack_type synack_type,
			      struct sk_buff *syn_skb)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	struct flowi4 fl4;
	int err = -1;
	struct sk_buff *skb;
	u8 tos;

	/* First, grab a route. */
	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
		return -1;

	skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);

	if (skb) {
		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);

		tos = sock_net(sk)->ipv4.sysctl_tcp_reflect_tos ?
				(tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
				(inet_sk(sk)->tos & INET_ECN_MASK) :
				inet_sk(sk)->tos;

		if (!INET_ECN_is_capable(tos) &&
		    tcp_bpf_ca_needs_ecn((struct sock *)req))
			tos |= INET_ECN_ECT_0;

		rcu_read_lock();
		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
					    ireq->ir_rmt_addr,
					    rcu_dereference(ireq->ireq_opt),
					    tos);
		rcu_read_unlock();
		err = net_xmit_eval(err);
	}

	return err;
}

/*
 *	IPv4 request_sock destructor.
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
}

#ifdef CONFIG_TCP_MD5SIG
/*
 * RFC2385 MD5 checksumming requires a mapping of
 * IP address->MD5 Key.
 * We need to maintain these in the sk structure.
 */

DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
EXPORT_SYMBOL(tcp_md5_needed);

static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
{
	if (!old)
		return true;

	/* l3index always overrides non-l3index */
	if (old->l3index && new->l3index == 0)
		return false;
	if (old->l3index == 0 && new->l3index)
		return true;

	return old->prefixlen < new->prefixlen;
}

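/* Editorial summary of the lookup below: when more than one configured
 * key matches the peer address, better_md5_match() prefers a key bound
 * to an L3 master device (non-zero l3index) over an unbound one, and
 * among otherwise equal candidates the key with the longer prefix wins.
 */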
/* Find the Key structure for an address.  */
struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
					   const union tcp_md5_addr *addr,
					   int family)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	const struct tcp_md5sig_info *md5sig;
	__be32 mask;
	struct tcp_md5sig_key *best_match = NULL;
	bool match;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       lockdep_sock_is_held(sk));
	if (!md5sig)
		return NULL;

	hlist_for_each_entry_rcu(key, &md5sig->head, node,
				 lockdep_sock_is_held(sk)) {
		if (key->family != family)
			continue;
		if (key->flags & TCP_MD5SIG_FLAG_IFINDEX && key->l3index != l3index)
			continue;
		if (family == AF_INET) {
			mask = inet_make_mask(key->prefixlen);
			match = (key->addr.a4.s_addr & mask) ==
				(addr->a4.s_addr & mask);
#if IS_ENABLED(CONFIG_IPV6)
		} else if (family == AF_INET6) {
			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
						  key->prefixlen);
#endif
		} else {
			match = false;
		}

		if (match && better_md5_match(best_match, key))
			best_match = key;
	}
	return best_match;
}
EXPORT_SYMBOL(__tcp_md5_do_lookup);

static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
						      const union tcp_md5_addr *addr,
						      int family, u8 prefixlen,
						      int l3index, u8 flags)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	unsigned int size = sizeof(struct in_addr);
	const struct tcp_md5sig_info *md5sig;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       lockdep_sock_is_held(sk));
	if (!md5sig)
		return NULL;
#if IS_ENABLED(CONFIG_IPV6)
	if (family == AF_INET6)
		size = sizeof(struct in6_addr);
#endif
	hlist_for_each_entry_rcu(key, &md5sig->head, node,
				 lockdep_sock_is_held(sk)) {
		if (key->family != family)
			continue;
		if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
			continue;
		if (key->l3index != l3index)
			continue;
		if (!memcmp(&key->addr, addr, size) &&
		    key->prefixlen == prefixlen)
			return key;
	}
	return NULL;
}

struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
					 const struct sock *addr_sk)
{
	const union tcp_md5_addr *addr;
	int l3index;

	l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
						 addr_sk->sk_bound_dev_if);
	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
	return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
}
EXPORT_SYMBOL(tcp_v4_md5_lookup);

/* This can be called on a newly created socket, from other files */
int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
		   int family, u8 prefixlen, int l3index, u8 flags,
		   const u8 *newkey, u8 newkeylen, gfp_t gfp)
{
	/* Add Key to the list */
	struct tcp_md5sig_key *key;
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_info *md5sig;

	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
	if (key) {
		/* Pre-existing entry - just update that one.
		 * Note that the key might be used concurrently.
		 * data_race() is telling kcsan that we do not care of
		 * key mismatches, since changing MD5 key on live flows
		 * can lead to packet drops.
		 */
		data_race(memcpy(key->key, newkey, newkeylen));

		/* Pairs with READ_ONCE() in tcp_md5_hash_key().
		 * Also note that a reader could catch new key->keylen value
		 * but old key->key[], this is the reason we use __GFP_ZERO
		 * at sock_kmalloc() time below these lines.
		 */
		WRITE_ONCE(key->keylen, newkeylen);

		return 0;
	}

	md5sig = rcu_dereference_protected(tp->md5sig_info,
					   lockdep_sock_is_held(sk));
	if (!md5sig) {
		md5sig = kmalloc(sizeof(*md5sig), gfp);
		if (!md5sig)
			return -ENOMEM;

		sk_gso_disable(sk);
		INIT_HLIST_HEAD(&md5sig->head);
		rcu_assign_pointer(tp->md5sig_info, md5sig);
	}

	key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
	if (!key)
		return -ENOMEM;
	if (!tcp_alloc_md5sig_pool()) {
		sock_kfree_s(sk, key, sizeof(*key));
		return -ENOMEM;
	}

	memcpy(key->key, newkey, newkeylen);
	key->keylen = newkeylen;
	key->family = family;
	key->prefixlen = prefixlen;
	key->l3index = l3index;
	key->flags = flags;
	memcpy(&key->addr, addr,
	       (family == AF_INET6) ? sizeof(struct in6_addr) :
				      sizeof(struct in_addr));
	hlist_add_head_rcu(&key->node, &md5sig->head);
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_add);

int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
		   u8 prefixlen, int l3index, u8 flags)
{
	struct tcp_md5sig_key *key;

	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
	if (!key)
		return -ENOENT;
	hlist_del_rcu(&key->node);
	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
	kfree_rcu(key, rcu);
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_del);

static void tcp_clear_md5_list(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	struct hlist_node *n;
	struct tcp_md5sig_info *md5sig;

	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);

	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
		hlist_del_rcu(&key->node);
		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
		kfree_rcu(key, rcu);
	}
}

static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
				 sockptr_t optval, int optlen)
{
	struct tcp_md5sig cmd;
	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
	const union tcp_md5_addr *addr;
	u8 prefixlen = 32;
	int l3index = 0;
	u8 flags;

	if (optlen < sizeof(cmd))
		return -EINVAL;

	if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
		return -EFAULT;

	if (sin->sin_family != AF_INET)
		return -EINVAL;

	flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;

	if (optname == TCP_MD5SIG_EXT &&
	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
		prefixlen = cmd.tcpm_prefixlen;
		if (prefixlen > 32)
			return -EINVAL;
	}

	if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
		struct net_device *dev;

		rcu_read_lock();
		dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
		if (dev && netif_is_l3_master(dev))
			l3index = dev->ifindex;

		rcu_read_unlock();

		/* ok to reference set/not set outside of rcu;
		 * right now device MUST be an L3 master
		 */
		if (!dev || !l3index)
			return -EINVAL;
	}

	addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;

	if (!cmd.tcpm_keylen)
		return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);

	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
		return -EINVAL;

	return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
			      cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
}

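/* For reference, a hedged sketch of the userspace side that reaches
 * tcp_v4_parse_md5_keys() above, using struct tcp_md5sig from the uapi
 * <linux/tcp.h> (plus <sys/socket.h>, <arpa/inet.h>, <string.h>).
 * Error handling omitted; illustrative only:
 *
 *	struct tcp_md5sig md5 = { 0 };
 *	struct sockaddr_in *peer = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	peer->sin_family = AF_INET;
 *	inet_pton(AF_INET, "192.0.2.1", &peer->sin_addr);
 *	md5.tcpm_keylen = strlen("secret");
 *	memcpy(md5.tcpm_key, "secret", md5.tcpm_keylen);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 *
 * TCP_MD5SIG_EXT additionally honours tcpm_flags (TCP_MD5SIG_FLAG_PREFIX,
 * TCP_MD5SIG_FLAG_IFINDEX), tcpm_prefixlen and tcpm_ifindex, as parsed
 * above.
 */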
static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
				   __be32 daddr, __be32 saddr,
				   const struct tcphdr *th, int nbytes)
{
	struct tcp4_pseudohdr *bp;
	struct scatterlist sg;
	struct tcphdr *_th;

	bp = hp->scratch;
	bp->saddr = saddr;
	bp->daddr = daddr;
	bp->pad = 0;
	bp->protocol = IPPROTO_TCP;
	bp->len = cpu_to_be16(nbytes);

	_th = (struct tcphdr *)(bp + 1);
	memcpy(_th, th, sizeof(*th));
	_th->check = 0;

	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
	ahash_request_set_crypt(hp->md5_req, &sg, NULL,
				sizeof(*bp) + sizeof(*th));
	return crypto_ahash_update(hp->md5_req);
}

static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
{
	struct tcp_md5sig_pool *hp;
	struct ahash_request *req;

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	req = hp->md5_req;

	if (crypto_ahash_init(req))
		goto clear_hash;
	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	ahash_request_set_crypt(req, NULL, md5_hash, 0);
	if (crypto_ahash_final(req))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}

int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
			const struct sock *sk,
			const struct sk_buff *skb)
{
	struct tcp_md5sig_pool *hp;
	struct ahash_request *req;
	const struct tcphdr *th = tcp_hdr(skb);
	__be32 saddr, daddr;

	if (sk) { /* valid for establish/request sockets */
		saddr = sk->sk_rcv_saddr;
		daddr = sk->sk_daddr;
	} else {
		const struct iphdr *iph = ip_hdr(skb);
		saddr = iph->saddr;
		daddr = iph->daddr;
	}

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	req = hp->md5_req;

	if (crypto_ahash_init(req))
		goto clear_hash;

	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
		goto clear_hash;
	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	ahash_request_set_crypt(req, NULL, md5_hash, 0);
	if (crypto_ahash_final(req))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}
EXPORT_SYMBOL(tcp_v4_md5_hash_skb);

#endif

/* Called with rcu_read_lock() */
static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
				    const struct sk_buff *skb,
				    int dif, int sdif)
{
#ifdef CONFIG_TCP_MD5SIG
	/*
	 * This gets called for each TCP segment that arrives
	 * so we want to be efficient.
	 * We have 3 drop cases:
	 * o No MD5 hash and one expected.
	 * o MD5 hash and we're not expecting one.
	 * o MD5 hash and it's wrong.
	 */
	const __u8 *hash_location = NULL;
	struct tcp_md5sig_key *hash_expected;
	const struct iphdr *iph = ip_hdr(skb);
	const struct tcphdr *th = tcp_hdr(skb);
	const union tcp_md5_addr *addr;
	unsigned char newhash[16];
	int genhash, l3index;

	/* sdif set, means packet ingressed via a device
	 * in an L3 domain and dif is set to the l3mdev
	 */
	l3index = sdif ? dif : 0;

	addr = (union tcp_md5_addr *)&iph->saddr;
	hash_expected = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
	hash_location = tcp_parse_md5sig_option(th);

	/* We've parsed the options - do we have a hash? */
	if (!hash_expected && !hash_location)
		return false;

	if (hash_expected && !hash_location) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
		return true;
	}

	if (!hash_expected && hash_location) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
		return true;
	}

	/* Okay, so this is hash_expected and hash_location -
	 * so we need to calculate the checksum.
	 */
	genhash = tcp_v4_md5_hash_skb(newhash,
				      hash_expected,
				      NULL, skb);

	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s L3 index %d\n",
				     &iph->saddr, ntohs(th->source),
				     &iph->daddr, ntohs(th->dest),
				     genhash ? " tcp_v4_calc_md5_hash failed"
				     : "", l3index);
		return true;
	}
	return false;
#endif
	return false;
}

static void tcp_v4_init_req(struct request_sock *req,
			    const struct sock *sk_listener,
			    struct sk_buff *skb)
{
	struct inet_request_sock *ireq = inet_rsk(req);
	struct net *net = sock_net(sk_listener);

	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
}

static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
					  struct sk_buff *skb,
					  struct flowi *fl,
					  struct request_sock *req)
{
	tcp_v4_init_req(req, sk, skb);

	if (security_inet_conn_request(sk, skb, req))
		return NULL;

	return inet_csk_route_req(sk, &fl->u.ip4, req);
}

struct request_sock_ops tcp_request_sock_ops __read_mostly = {
	.family		=	PF_INET,
	.obj_size	=	sizeof(struct tcp_request_sock),
	.rtx_syn_ack	=	tcp_rtx_synack,
	.send_ack	=	tcp_v4_reqsk_send_ack,
	.destructor	=	tcp_v4_reqsk_destructor,
	.send_reset	=	tcp_v4_send_reset,
	.syn_ack_timeout =	tcp_syn_ack_timeout,
};

const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
	.mss_clamp	=	TCP_MSS_DEFAULT,
#ifdef CONFIG_TCP_MD5SIG
	.req_md5_lookup	=	tcp_v4_md5_lookup,
	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
#endif
#ifdef CONFIG_SYN_COOKIES
	.cookie_init_seq =	cookie_v4_init_sequence,
#endif
	.route_req	=	tcp_v4_route_req,
	.init_seq	=	tcp_v4_init_seq,
	.init_ts_off	=	tcp_v4_init_ts_off,
	.send_synack	=	tcp_v4_send_synack,
};

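/* Editorial note: the two tables above split responsibilities.
 * tcp_request_sock_ops provides the address-family independent
 * request_sock handling (SYN-ACK retransmit, ACK/RST emission,
 * destructor), while tcp_request_sock_ipv4_ops supplies the IPv4
 * specific hooks that tcp_conn_request() calls: routing, initial
 * sequence number and timestamp offset generation, syncookies and the
 * MD5 helpers.
 */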
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	/* Never answer to SYNs sent to broadcast or multicast */
	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	return tcp_conn_request(&tcp_request_sock_ops,
				&tcp_request_sock_ipv4_ops, sk, skb);

drop:
	tcp_listendrop(sk);
	return 0;
}
EXPORT_SYMBOL(tcp_v4_conn_request);


/*
 * The three way handshake has completed - we got a valid synack -
 * now create the new socket.
 */
struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct dst_entry *dst,
				  struct request_sock *req_unhash,
				  bool *own_req)
{
	struct inet_request_sock *ireq;
	bool found_dup_sk = false;
	struct inet_sock *newinet;
	struct tcp_sock *newtp;
	struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
	const union tcp_md5_addr *addr;
	struct tcp_md5sig_key *key;
	int l3index;
#endif
	struct ip_options_rcu *inet_opt;

	if (sk_acceptq_is_full(sk))
		goto exit_overflow;

	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit_nonewsk;

	newsk->sk_gso_type = SKB_GSO_TCPV4;
	inet_sk_rx_dst_set(newsk, skb);

	newtp		      = tcp_sk(newsk);
	newinet		      = inet_sk(newsk);
	ireq		      = inet_rsk(req);
	sk_daddr_set(newsk, ireq->ir_rmt_addr);
	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
	newsk->sk_bound_dev_if = ireq->ir_iif;
	newinet->inet_saddr   = ireq->ir_loc_addr;
	inet_opt	      = rcu_dereference(ireq->ireq_opt);
	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
	newinet->mc_index     = inet_iif(skb);
	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
	newinet->rcv_tos      = ip_hdr(skb)->tos;
	inet_csk(newsk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
	newinet->inet_id = prandom_u32();

	/* Set ToS of the new socket based upon the value of incoming SYN.
	 * ECT bits are set later in tcp_init_transfer().
	 */
	if (sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)
		newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;

	if (!dst) {
		dst = inet_csk_route_child_sock(sk, newsk, req);
		if (!dst)
			goto put_and_exit;
	} else {
		/* syncookie case : see end of cookie_v4_check() */
	}
	sk_setup_caps(newsk, dst);

	tcp_ca_openreq_child(newsk, dst);

	tcp_sync_mss(newsk, dst_mtu(dst));
	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));

	tcp_initialize_rcv_mss(newsk);

#ifdef CONFIG_TCP_MD5SIG
	l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
	/* Copy over the MD5 key from the original socket */
	addr = (union tcp_md5_addr *)&newinet->inet_daddr;
	key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
	if (key) {
		/*
		 * We're using one, so create a matching key
		 * on the newsk structure. If we fail to get
		 * memory, then we end up not copying the key
		 * across. Shucks.
		 */
		tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index, key->flags,
			       key->key, key->keylen, GFP_ATOMIC);
		sk_gso_disable(newsk);
	}
#endif

	if (__inet_inherit_port(sk, newsk) < 0)
		goto put_and_exit;
	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
				       &found_dup_sk);
	if (likely(*own_req)) {
		tcp_move_syn(newtp, req);
		ireq->ireq_opt = NULL;
	} else {
		newinet->inet_opt = NULL;

		if (!req_unhash && found_dup_sk) {
			/* This code path should only be executed in the
			 * syncookie case
			 */
			bh_unlock_sock(newsk);
			sock_put(newsk);
			newsk = NULL;
		}
	}
	return newsk;

exit_overflow:
	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
exit_nonewsk:
	dst_release(dst);
exit:
	tcp_listendrop(sk);
	return NULL;
put_and_exit:
	newinet->inet_opt = NULL;
	inet_csk_prepare_forced_close(newsk);
	tcp_done(newsk);
	goto exit;
}
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);

static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
{
#ifdef CONFIG_SYN_COOKIES
	const struct tcphdr *th = tcp_hdr(skb);

	if (!th->syn)
		sk = cookie_v4_check(sk, skb);
#endif
	return sk;
}

u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
			 struct tcphdr *th, u32 *cookie)
{
	u16 mss = 0;
#ifdef CONFIG_SYN_COOKIES
	mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
				    &tcp_request_sock_ipv4_ops, sk, th);
	if (mss) {
		*cookie = __cookie_v4_init_sequence(iph, th, &mss);
		tcp_synq_overflow(sk);
	}
#endif
	return mss;
}

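/* Editorial note on the two helpers above: tcp_v4_cookie_check() is
 * invoked on segments that reach a LISTEN socket without matching a
 * request sock; when the segment is not a SYN it may be the ACK that
 * completes a syncookie handshake, and cookie_v4_check() then rebuilds
 * the connection state from the encoded cookie.  tcp_v4_get_syncookie()
 * computes a cookie and MSS for callers outside this regular input
 * path (to my knowledge the BPF syncookie helper); treat that last
 * attribution as an assumption rather than something stated here.
 */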
INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
							   u32));
/* The socket must have its spinlock held when we get
 * here, unless it is a TCP_LISTEN socket.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	struct sock *rsk;

	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
		struct dst_entry *dst;

		dst = rcu_dereference_protected(sk->sk_rx_dst,
						lockdep_sock_is_held(sk));

		sock_rps_save_rxhash(sk, skb);
		sk_mark_napi_id(sk, skb);
		if (dst) {
			if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
			    !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
					     dst, 0)) {
				RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
				dst_release(dst);
			}
		}
		tcp_rcv_established(sk, skb);
		return 0;
	}

	if (tcp_checksum_complete(skb))
		goto csum_err;

	if (sk->sk_state == TCP_LISTEN) {
		struct sock *nsk = tcp_v4_cookie_check(sk, skb);

		if (!nsk)
			goto discard;
		if (nsk != sk) {
			if (tcp_child_process(sk, nsk, skb)) {
				rsk = nsk;
				goto reset;
			}
			return 0;
		}
	} else
		sock_rps_save_rxhash(sk, skb);

	if (tcp_rcv_state_process(sk, skb)) {
		rsk = sk;
		goto reset;
	}
	return 0;

reset:
	tcp_v4_send_reset(rsk, skb);
discard:
	kfree_skb(skb);
	/* Be careful here. If this function gets more complicated and
	 * gcc suffers from register pressure on the x86, sk (in %ebx)
	 * might be destroyed here. This current version compiles correctly,
	 * but you have been warned.
	 */
	return 0;

csum_err:
	trace_tcp_bad_csum(skb);
	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
	goto discard;
}
EXPORT_SYMBOL(tcp_v4_do_rcv);

int tcp_v4_early_demux(struct sk_buff *skb)
{
	const struct iphdr *iph;
	const struct tcphdr *th;
	struct sock *sk;

	if (skb->pkt_type != PACKET_HOST)
		return 0;

	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
		return 0;

	iph = ip_hdr(skb);
	th = tcp_hdr(skb);

	if (th->doff < sizeof(struct tcphdr) / 4)
		return 0;

	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
				       iph->saddr, th->source,
				       iph->daddr, ntohs(th->dest),
				       skb->skb_iif, inet_sdif(skb));
	if (sk) {
		skb->sk = sk;
		skb->destructor = sock_edemux;
		if (sk_fullsock(sk)) {
			struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);

			if (dst)
				dst = dst_check(dst, 0);
			if (dst &&
			    sk->sk_rx_dst_ifindex == skb->skb_iif)
				skb_dst_set_noref(skb, dst);
		}
	}
	return 0;
}

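/* Editorial note: tcp_v4_early_demux() runs from the IP receive path
 * before routing, via the protocol's early_demux hook.  By finding an
 * established socket straight from the 4-tuple it can attach the socket
 * and its cached input route (sk_rx_dst) to the skb, saving a full
 * route lookup for the common case of long-lived connections.
 */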
bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
{
	u32 limit, tail_gso_size, tail_gso_segs;
	struct skb_shared_info *shinfo;
	const struct tcphdr *th;
	struct tcphdr *thtail;
	struct sk_buff *tail;
	unsigned int hdrlen;
	bool fragstolen;
	u32 gso_segs;
	u32 gso_size;
	int delta;

	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
	 * we can fix skb->truesize to its real value to avoid future drops.
	 * This is valid because skb is not yet charged to the socket.
	 * It has been noticed pure SACK packets were sometimes dropped
	 * (if cooked by drivers without copybreak feature).
	 */
	skb_condense(skb);

	skb_dst_drop(skb);

	if (unlikely(tcp_checksum_complete(skb))) {
		bh_unlock_sock(sk);
		trace_tcp_bad_csum(skb);
		__TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
		__TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
		return true;
	}

	/* Attempt coalescing to last skb in backlog, even if we are
	 * above the limits.
	 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
	 */
	th = (const struct tcphdr *)skb->data;
	hdrlen = th->doff * 4;

	tail = sk->sk_backlog.tail;
	if (!tail)
		goto no_coalesce;
	thtail = (struct tcphdr *)tail->data;

	if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
	    TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
	    ((TCP_SKB_CB(tail)->tcp_flags |
	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
	    !((TCP_SKB_CB(tail)->tcp_flags &
	      TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
	    ((TCP_SKB_CB(tail)->tcp_flags ^
	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
#ifdef CONFIG_TLS_DEVICE
	    tail->decrypted != skb->decrypted ||
#endif
	    thtail->doff != th->doff ||
	    memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
		goto no_coalesce;

	__skb_pull(skb, hdrlen);

	shinfo = skb_shinfo(skb);
	gso_size = shinfo->gso_size ?: skb->len;
	gso_segs = shinfo->gso_segs ?: 1;

	shinfo = skb_shinfo(tail);
	tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
	tail_gso_segs = shinfo->gso_segs ?: 1;

	if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
		TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;

		if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
			TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
			thtail->window = th->window;
		}

		/* We have to update both TCP_SKB_CB(tail)->tcp_flags and
		 * thtail->fin, so that the fast path in tcp_rcv_established()
		 * is not entered if we append a packet with a FIN.
		 * SYN, RST, URG are not present.
		 * ACK is set on both packets.
		 * PSH : we do not really care in TCP stack,
		 *	 at least for 'GRO' packets.
		 */
		thtail->fin |= th->fin;
		TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;

		if (TCP_SKB_CB(skb)->has_rxtstamp) {
			TCP_SKB_CB(tail)->has_rxtstamp = true;
			tail->tstamp = skb->tstamp;
			skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
		}

		/* Not as strict as GRO. We only need to carry mss max value */
		shinfo->gso_size = max(gso_size, tail_gso_size);
		shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);

		sk->sk_backlog.len += delta;
		__NET_INC_STATS(sock_net(sk),
				LINUX_MIB_TCPBACKLOGCOALESCE);
		kfree_skb_partial(skb, fragstolen);
		return false;
	}
	__skb_push(skb, hdrlen);

no_coalesce:
	/* Only the socket owner can try to collapse/prune the receive queues
	 * to reduce memory overhead, so add a little headroom here.
	 * Only a few socket backlogs are likely to be non-empty at the same time.
1919 */ 1920 limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf) + 64*1024; 1921 1922 if (unlikely(sk_add_backlog(sk, skb, limit))) { 1923 bh_unlock_sock(sk); 1924 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP); 1925 return true; 1926 } 1927 return false; 1928 } 1929 EXPORT_SYMBOL(tcp_add_backlog); 1930 1931 int tcp_filter(struct sock *sk, struct sk_buff *skb) 1932 { 1933 struct tcphdr *th = (struct tcphdr *)skb->data; 1934 1935 return sk_filter_trim_cap(sk, skb, th->doff * 4); 1936 } 1937 EXPORT_SYMBOL(tcp_filter); 1938 1939 static void tcp_v4_restore_cb(struct sk_buff *skb) 1940 { 1941 memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4, 1942 sizeof(struct inet_skb_parm)); 1943 } 1944 1945 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph, 1946 const struct tcphdr *th) 1947 { 1948 /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB() 1949 * barrier() makes sure compiler wont play fool^Waliasing games. 1950 */ 1951 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb), 1952 sizeof(struct inet_skb_parm)); 1953 barrier(); 1954 1955 TCP_SKB_CB(skb)->seq = ntohl(th->seq); 1956 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + 1957 skb->len - th->doff * 4); 1958 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); 1959 TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th); 1960 TCP_SKB_CB(skb)->tcp_tw_isn = 0; 1961 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph); 1962 TCP_SKB_CB(skb)->sacked = 0; 1963 TCP_SKB_CB(skb)->has_rxtstamp = 1964 skb->tstamp || skb_hwtstamps(skb)->hwtstamp; 1965 } 1966 1967 /* 1968 * From tcp_input.c 1969 */ 1970 1971 int tcp_v4_rcv(struct sk_buff *skb) 1972 { 1973 struct net *net = dev_net(skb->dev); 1974 int sdif = inet_sdif(skb); 1975 int dif = inet_iif(skb); 1976 const struct iphdr *iph; 1977 const struct tcphdr *th; 1978 bool refcounted; 1979 struct sock *sk; 1980 int drop_reason; 1981 int ret; 1982 1983 drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; 1984 if (skb->pkt_type != PACKET_HOST) 1985 goto discard_it; 1986 1987 /* Count it even if it's bad */ 1988 __TCP_INC_STATS(net, TCP_MIB_INSEGS); 1989 1990 if (!pskb_may_pull(skb, sizeof(struct tcphdr))) 1991 goto discard_it; 1992 1993 th = (const struct tcphdr *)skb->data; 1994 1995 if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) { 1996 drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL; 1997 goto bad_packet; 1998 } 1999 if (!pskb_may_pull(skb, th->doff * 4)) 2000 goto discard_it; 2001 2002 /* An explanation is required here, I think. 2003 * Packet length and doff are validated by header prediction, 2004 * provided case of th->doff==0 is eliminated. 2005 * So, we defer the checks. 
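 * skb_checksum_init() below either validates the checksum right away
 * (when the NIC already summed the packet) or seeds skb->csum with the
 * IPv4 pseudo-header so tcp_checksum_complete() can finish the check
 * lazily after socket lookup.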
*/ 2006 2007 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo)) 2008 goto csum_error; 2009 2010 th = (const struct tcphdr *)skb->data; 2011 iph = ip_hdr(skb); 2012 lookup: 2013 sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source, 2014 th->dest, sdif, &refcounted); 2015 if (!sk) 2016 goto no_tcp_socket; 2017 2018 process: 2019 if (sk->sk_state == TCP_TIME_WAIT) 2020 goto do_time_wait; 2021 2022 if (sk->sk_state == TCP_NEW_SYN_RECV) { 2023 struct request_sock *req = inet_reqsk(sk); 2024 bool req_stolen = false; 2025 struct sock *nsk; 2026 2027 sk = req->rsk_listener; 2028 if (unlikely(tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))) { 2029 sk_drops_add(sk, skb); 2030 reqsk_put(req); 2031 goto discard_it; 2032 } 2033 if (tcp_checksum_complete(skb)) { 2034 reqsk_put(req); 2035 goto csum_error; 2036 } 2037 if (unlikely(sk->sk_state != TCP_LISTEN)) { 2038 nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb); 2039 if (!nsk) { 2040 inet_csk_reqsk_queue_drop_and_put(sk, req); 2041 goto lookup; 2042 } 2043 sk = nsk; 2044 /* reuseport_migrate_sock() has already held one sk_refcnt 2045 * before returning. 2046 */ 2047 } else { 2048 /* We own a reference on the listener, increase it again 2049 * as we might lose it too soon. 2050 */ 2051 sock_hold(sk); 2052 } 2053 refcounted = true; 2054 nsk = NULL; 2055 if (!tcp_filter(sk, skb)) { 2056 th = (const struct tcphdr *)skb->data; 2057 iph = ip_hdr(skb); 2058 tcp_v4_fill_cb(skb, iph, th); 2059 nsk = tcp_check_req(sk, skb, req, false, &req_stolen); 2060 } 2061 if (!nsk) { 2062 reqsk_put(req); 2063 if (req_stolen) { 2064 /* Another cpu got exclusive access to req 2065 * and created a full blown socket. 2066 * Try to feed this packet to this socket 2067 * instead of discarding it. 2068 */ 2069 tcp_v4_restore_cb(skb); 2070 sock_put(sk); 2071 goto lookup; 2072 } 2073 goto discard_and_relse; 2074 } 2075 if (nsk == sk) { 2076 reqsk_put(req); 2077 tcp_v4_restore_cb(skb); 2078 } else if (tcp_child_process(sk, nsk, skb)) { 2079 tcp_v4_send_reset(nsk, skb); 2080 goto discard_and_relse; 2081 } else { 2082 sock_put(sk); 2083 return 0; 2084 } 2085 } 2086 2087 if (static_branch_unlikely(&ip4_min_ttl)) { 2088 /* min_ttl can be changed concurrently from do_ip_setsockopt() */ 2089 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) { 2090 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); 2091 goto discard_and_relse; 2092 } 2093 } 2094 2095 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) 2096 goto discard_and_relse; 2097 2098 if (tcp_v4_inbound_md5_hash(sk, skb, dif, sdif)) 2099 goto discard_and_relse; 2100 2101 nf_reset_ct(skb); 2102 2103 if (tcp_filter(sk, skb)) { 2104 drop_reason = SKB_DROP_REASON_SOCKET_FILTER; 2105 goto discard_and_relse; 2106 } 2107 th = (const struct tcphdr *)skb->data; 2108 iph = ip_hdr(skb); 2109 tcp_v4_fill_cb(skb, iph, th); 2110 2111 skb->dev = NULL; 2112 2113 if (sk->sk_state == TCP_LISTEN) { 2114 ret = tcp_v4_do_rcv(sk, skb); 2115 goto put_and_return; 2116 } 2117 2118 sk_incoming_cpu_update(sk); 2119 2120 sk_defer_free_flush(sk); 2121 bh_lock_sock_nested(sk); 2122 tcp_segs_in(tcp_sk(sk), skb); 2123 ret = 0; 2124 if (!sock_owned_by_user(sk)) { 2125 ret = tcp_v4_do_rcv(sk, skb); 2126 } else { 2127 if (tcp_add_backlog(sk, skb)) 2128 goto discard_and_relse; 2129 } 2130 bh_unlock_sock(sk); 2131 2132 put_and_return: 2133 if (refcounted) 2134 sock_put(sk); 2135 2136 return ret; 2137 2138 no_tcp_socket: 2139 drop_reason = SKB_DROP_REASON_NO_SOCKET; 2140 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) 2141 goto 
discard_it; 2142 2143 tcp_v4_fill_cb(skb, iph, th); 2144 2145 if (tcp_checksum_complete(skb)) { 2146 csum_error: 2147 drop_reason = SKB_DROP_REASON_TCP_CSUM; 2148 trace_tcp_bad_csum(skb); 2149 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS); 2150 bad_packet: 2151 __TCP_INC_STATS(net, TCP_MIB_INERRS); 2152 } else { 2153 tcp_v4_send_reset(NULL, skb); 2154 } 2155 2156 discard_it: 2157 /* Discard frame. */ 2158 kfree_skb_reason(skb, drop_reason); 2159 return 0; 2160 2161 discard_and_relse: 2162 sk_drops_add(sk, skb); 2163 if (refcounted) 2164 sock_put(sk); 2165 goto discard_it; 2166 2167 do_time_wait: 2168 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { 2169 inet_twsk_put(inet_twsk(sk)); 2170 goto discard_it; 2171 } 2172 2173 tcp_v4_fill_cb(skb, iph, th); 2174 2175 if (tcp_checksum_complete(skb)) { 2176 inet_twsk_put(inet_twsk(sk)); 2177 goto csum_error; 2178 } 2179 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) { 2180 case TCP_TW_SYN: { 2181 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev), 2182 &tcp_hashinfo, skb, 2183 __tcp_hdrlen(th), 2184 iph->saddr, th->source, 2185 iph->daddr, th->dest, 2186 inet_iif(skb), 2187 sdif); 2188 if (sk2) { 2189 inet_twsk_deschedule_put(inet_twsk(sk)); 2190 sk = sk2; 2191 tcp_v4_restore_cb(skb); 2192 refcounted = false; 2193 goto process; 2194 } 2195 } 2196 /* to ACK */ 2197 fallthrough; 2198 case TCP_TW_ACK: 2199 tcp_v4_timewait_ack(sk, skb); 2200 break; 2201 case TCP_TW_RST: 2202 tcp_v4_send_reset(sk, skb); 2203 inet_twsk_deschedule_put(inet_twsk(sk)); 2204 goto discard_it; 2205 case TCP_TW_SUCCESS:; 2206 } 2207 goto discard_it; 2208 } 2209 2210 static struct timewait_sock_ops tcp_timewait_sock_ops = { 2211 .twsk_obj_size = sizeof(struct tcp_timewait_sock), 2212 .twsk_unique = tcp_twsk_unique, 2213 .twsk_destructor= tcp_twsk_destructor, 2214 }; 2215 2216 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) 2217 { 2218 struct dst_entry *dst = skb_dst(skb); 2219 2220 if (dst && dst_hold_safe(dst)) { 2221 rcu_assign_pointer(sk->sk_rx_dst, dst); 2222 sk->sk_rx_dst_ifindex = skb->skb_iif; 2223 } 2224 } 2225 EXPORT_SYMBOL(inet_sk_rx_dst_set); 2226 2227 const struct inet_connection_sock_af_ops ipv4_specific = { 2228 .queue_xmit = ip_queue_xmit, 2229 .send_check = tcp_v4_send_check, 2230 .rebuild_header = inet_sk_rebuild_header, 2231 .sk_rx_dst_set = inet_sk_rx_dst_set, 2232 .conn_request = tcp_v4_conn_request, 2233 .syn_recv_sock = tcp_v4_syn_recv_sock, 2234 .net_header_len = sizeof(struct iphdr), 2235 .setsockopt = ip_setsockopt, 2236 .getsockopt = ip_getsockopt, 2237 .addr2sockaddr = inet_csk_addr2sockaddr, 2238 .sockaddr_len = sizeof(struct sockaddr_in), 2239 .mtu_reduced = tcp_v4_mtu_reduced, 2240 }; 2241 EXPORT_SYMBOL(ipv4_specific); 2242 2243 #ifdef CONFIG_TCP_MD5SIG 2244 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = { 2245 .md5_lookup = tcp_v4_md5_lookup, 2246 .calc_md5_hash = tcp_v4_md5_hash_skb, 2247 .md5_parse = tcp_v4_parse_md5_keys, 2248 }; 2249 #endif 2250 2251 /* NOTE: A lot of things set to zero explicitly by call to 2252 * sk_alloc() so need not be done here. 
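 * tcp_v4_init_sock() below only hooks up the IPv4 af_ops (and the IPv4
 * MD5 ops when CONFIG_TCP_MD5SIG is enabled) on top of the address
 * family independent setup done by tcp_init_sock().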
2253 */ 2254 static int tcp_v4_init_sock(struct sock *sk) 2255 { 2256 struct inet_connection_sock *icsk = inet_csk(sk); 2257 2258 tcp_init_sock(sk); 2259 2260 icsk->icsk_af_ops = &ipv4_specific; 2261 2262 #ifdef CONFIG_TCP_MD5SIG 2263 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific; 2264 #endif 2265 2266 return 0; 2267 } 2268 2269 void tcp_v4_destroy_sock(struct sock *sk) 2270 { 2271 struct tcp_sock *tp = tcp_sk(sk); 2272 2273 trace_tcp_destroy_sock(sk); 2274 2275 tcp_clear_xmit_timers(sk); 2276 2277 tcp_cleanup_congestion_control(sk); 2278 2279 tcp_cleanup_ulp(sk); 2280 2281 /* Cleanup up the write buffer. */ 2282 tcp_write_queue_purge(sk); 2283 2284 /* Check if we want to disable active TFO */ 2285 tcp_fastopen_active_disable_ofo_check(sk); 2286 2287 /* Cleans up our, hopefully empty, out_of_order_queue. */ 2288 skb_rbtree_purge(&tp->out_of_order_queue); 2289 2290 #ifdef CONFIG_TCP_MD5SIG 2291 /* Clean up the MD5 key list, if any */ 2292 if (tp->md5sig_info) { 2293 tcp_clear_md5_list(sk); 2294 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu); 2295 tp->md5sig_info = NULL; 2296 } 2297 #endif 2298 2299 /* Clean up a referenced TCP bind bucket. */ 2300 if (inet_csk(sk)->icsk_bind_hash) 2301 inet_put_port(sk); 2302 2303 BUG_ON(rcu_access_pointer(tp->fastopen_rsk)); 2304 2305 /* If socket is aborted during connect operation */ 2306 tcp_free_fastopen_req(tp); 2307 tcp_fastopen_destroy_cipher(sk); 2308 tcp_saved_syn_free(tp); 2309 2310 sk_sockets_allocated_dec(sk); 2311 } 2312 EXPORT_SYMBOL(tcp_v4_destroy_sock); 2313 2314 #ifdef CONFIG_PROC_FS 2315 /* Proc filesystem TCP sock list dumping. */ 2316 2317 static unsigned short seq_file_family(const struct seq_file *seq); 2318 2319 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk) 2320 { 2321 unsigned short family = seq_file_family(seq); 2322 2323 /* AF_UNSPEC is used as a match all */ 2324 return ((family == AF_UNSPEC || family == sk->sk_family) && 2325 net_eq(sock_net(sk), seq_file_net(seq))); 2326 } 2327 2328 /* Find a non empty bucket (starting from st->bucket) 2329 * and return the first sk from it. 2330 */ 2331 static void *listening_get_first(struct seq_file *seq) 2332 { 2333 struct tcp_iter_state *st = seq->private; 2334 2335 st->offset = 0; 2336 for (; st->bucket <= tcp_hashinfo.lhash2_mask; st->bucket++) { 2337 struct inet_listen_hashbucket *ilb2; 2338 struct inet_connection_sock *icsk; 2339 struct sock *sk; 2340 2341 ilb2 = &tcp_hashinfo.lhash2[st->bucket]; 2342 if (hlist_empty(&ilb2->head)) 2343 continue; 2344 2345 spin_lock(&ilb2->lock); 2346 inet_lhash2_for_each_icsk(icsk, &ilb2->head) { 2347 sk = (struct sock *)icsk; 2348 if (seq_sk_match(seq, sk)) 2349 return sk; 2350 } 2351 spin_unlock(&ilb2->lock); 2352 } 2353 2354 return NULL; 2355 } 2356 2357 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket). 2358 * If "cur" is the last one in the st->bucket, 2359 * call listening_get_first() to return the first sk of the next 2360 * non empty bucket. 
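 * Note the lock hand-off: once the current bucket is exhausted its
 * lock is dropped here and listening_get_first() acquires the lock of
 * the next non-empty bucket before returning a socket from it.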
2361 */ 2362 static void *listening_get_next(struct seq_file *seq, void *cur) 2363 { 2364 struct tcp_iter_state *st = seq->private; 2365 struct inet_listen_hashbucket *ilb2; 2366 struct inet_connection_sock *icsk; 2367 struct sock *sk = cur; 2368 2369 ++st->num; 2370 ++st->offset; 2371 2372 icsk = inet_csk(sk); 2373 inet_lhash2_for_each_icsk_continue(icsk) { 2374 sk = (struct sock *)icsk; 2375 if (seq_sk_match(seq, sk)) 2376 return sk; 2377 } 2378 2379 ilb2 = &tcp_hashinfo.lhash2[st->bucket]; 2380 spin_unlock(&ilb2->lock); 2381 ++st->bucket; 2382 return listening_get_first(seq); 2383 } 2384 2385 static void *listening_get_idx(struct seq_file *seq, loff_t *pos) 2386 { 2387 struct tcp_iter_state *st = seq->private; 2388 void *rc; 2389 2390 st->bucket = 0; 2391 st->offset = 0; 2392 rc = listening_get_first(seq); 2393 2394 while (rc && *pos) { 2395 rc = listening_get_next(seq, rc); 2396 --*pos; 2397 } 2398 return rc; 2399 } 2400 2401 static inline bool empty_bucket(const struct tcp_iter_state *st) 2402 { 2403 return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain); 2404 } 2405 2406 /* 2407 * Get first established socket starting from bucket given in st->bucket. 2408 * If st->bucket is zero, the very first socket in the hash is returned. 2409 */ 2410 static void *established_get_first(struct seq_file *seq) 2411 { 2412 struct tcp_iter_state *st = seq->private; 2413 2414 st->offset = 0; 2415 for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) { 2416 struct sock *sk; 2417 struct hlist_nulls_node *node; 2418 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket); 2419 2420 /* Lockless fast path for the common case of empty buckets */ 2421 if (empty_bucket(st)) 2422 continue; 2423 2424 spin_lock_bh(lock); 2425 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) { 2426 if (seq_sk_match(seq, sk)) 2427 return sk; 2428 } 2429 spin_unlock_bh(lock); 2430 } 2431 2432 return NULL; 2433 } 2434 2435 static void *established_get_next(struct seq_file *seq, void *cur) 2436 { 2437 struct sock *sk = cur; 2438 struct hlist_nulls_node *node; 2439 struct tcp_iter_state *st = seq->private; 2440 2441 ++st->num; 2442 ++st->offset; 2443 2444 sk = sk_nulls_next(sk); 2445 2446 sk_nulls_for_each_from(sk, node) { 2447 if (seq_sk_match(seq, sk)) 2448 return sk; 2449 } 2450 2451 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); 2452 ++st->bucket; 2453 return established_get_first(seq); 2454 } 2455 2456 static void *established_get_idx(struct seq_file *seq, loff_t pos) 2457 { 2458 struct tcp_iter_state *st = seq->private; 2459 void *rc; 2460 2461 st->bucket = 0; 2462 rc = established_get_first(seq); 2463 2464 while (rc && pos) { 2465 rc = established_get_next(seq, rc); 2466 --pos; 2467 } 2468 return rc; 2469 } 2470 2471 static void *tcp_get_idx(struct seq_file *seq, loff_t pos) 2472 { 2473 void *rc; 2474 struct tcp_iter_state *st = seq->private; 2475 2476 st->state = TCP_SEQ_STATE_LISTENING; 2477 rc = listening_get_idx(seq, &pos); 2478 2479 if (!rc) { 2480 st->state = TCP_SEQ_STATE_ESTABLISHED; 2481 rc = established_get_idx(seq, pos); 2482 } 2483 2484 return rc; 2485 } 2486 2487 static void *tcp_seek_last_pos(struct seq_file *seq) 2488 { 2489 struct tcp_iter_state *st = seq->private; 2490 int bucket = st->bucket; 2491 int offset = st->offset; 2492 int orig_num = st->num; 2493 void *rc = NULL; 2494 2495 switch (st->state) { 2496 case TCP_SEQ_STATE_LISTENING: 2497 if (st->bucket > tcp_hashinfo.lhash2_mask) 2498 break; 2499 st->state = TCP_SEQ_STATE_LISTENING; 2500 rc = 
listening_get_first(seq); 2501 while (offset-- && rc && bucket == st->bucket) 2502 rc = listening_get_next(seq, rc); 2503 if (rc) 2504 break; 2505 st->bucket = 0; 2506 st->state = TCP_SEQ_STATE_ESTABLISHED; 2507 fallthrough; 2508 case TCP_SEQ_STATE_ESTABLISHED: 2509 if (st->bucket > tcp_hashinfo.ehash_mask) 2510 break; 2511 rc = established_get_first(seq); 2512 while (offset-- && rc && bucket == st->bucket) 2513 rc = established_get_next(seq, rc); 2514 } 2515 2516 st->num = orig_num; 2517 2518 return rc; 2519 } 2520 2521 void *tcp_seq_start(struct seq_file *seq, loff_t *pos) 2522 { 2523 struct tcp_iter_state *st = seq->private; 2524 void *rc; 2525 2526 if (*pos && *pos == st->last_pos) { 2527 rc = tcp_seek_last_pos(seq); 2528 if (rc) 2529 goto out; 2530 } 2531 2532 st->state = TCP_SEQ_STATE_LISTENING; 2533 st->num = 0; 2534 st->bucket = 0; 2535 st->offset = 0; 2536 rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; 2537 2538 out: 2539 st->last_pos = *pos; 2540 return rc; 2541 } 2542 EXPORT_SYMBOL(tcp_seq_start); 2543 2544 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) 2545 { 2546 struct tcp_iter_state *st = seq->private; 2547 void *rc = NULL; 2548 2549 if (v == SEQ_START_TOKEN) { 2550 rc = tcp_get_idx(seq, 0); 2551 goto out; 2552 } 2553 2554 switch (st->state) { 2555 case TCP_SEQ_STATE_LISTENING: 2556 rc = listening_get_next(seq, v); 2557 if (!rc) { 2558 st->state = TCP_SEQ_STATE_ESTABLISHED; 2559 st->bucket = 0; 2560 st->offset = 0; 2561 rc = established_get_first(seq); 2562 } 2563 break; 2564 case TCP_SEQ_STATE_ESTABLISHED: 2565 rc = established_get_next(seq, v); 2566 break; 2567 } 2568 out: 2569 ++*pos; 2570 st->last_pos = *pos; 2571 return rc; 2572 } 2573 EXPORT_SYMBOL(tcp_seq_next); 2574 2575 void tcp_seq_stop(struct seq_file *seq, void *v) 2576 { 2577 struct tcp_iter_state *st = seq->private; 2578 2579 switch (st->state) { 2580 case TCP_SEQ_STATE_LISTENING: 2581 if (v != SEQ_START_TOKEN) 2582 spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock); 2583 break; 2584 case TCP_SEQ_STATE_ESTABLISHED: 2585 if (v) 2586 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); 2587 break; 2588 } 2589 } 2590 EXPORT_SYMBOL(tcp_seq_stop); 2591 2592 static void get_openreq4(const struct request_sock *req, 2593 struct seq_file *f, int i) 2594 { 2595 const struct inet_request_sock *ireq = inet_rsk(req); 2596 long delta = req->rsk_timer.expires - jiffies; 2597 2598 seq_printf(f, "%4d: %08X:%04X %08X:%04X" 2599 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK", 2600 i, 2601 ireq->ir_loc_addr, 2602 ireq->ir_num, 2603 ireq->ir_rmt_addr, 2604 ntohs(ireq->ir_rmt_port), 2605 TCP_SYN_RECV, 2606 0, 0, /* could print option size, but that is af dependent. 
*/ 2607 1, /* timers active (only the expire timer) */ 2608 jiffies_delta_to_clock_t(delta), 2609 req->num_timeout, 2610 from_kuid_munged(seq_user_ns(f), 2611 sock_i_uid(req->rsk_listener)), 2612 0, /* non standard timer */ 2613 0, /* open_requests have no inode */ 2614 0, 2615 req); 2616 } 2617 2618 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) 2619 { 2620 int timer_active; 2621 unsigned long timer_expires; 2622 const struct tcp_sock *tp = tcp_sk(sk); 2623 const struct inet_connection_sock *icsk = inet_csk(sk); 2624 const struct inet_sock *inet = inet_sk(sk); 2625 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq; 2626 __be32 dest = inet->inet_daddr; 2627 __be32 src = inet->inet_rcv_saddr; 2628 __u16 destp = ntohs(inet->inet_dport); 2629 __u16 srcp = ntohs(inet->inet_sport); 2630 int rx_queue; 2631 int state; 2632 2633 if (icsk->icsk_pending == ICSK_TIME_RETRANS || 2634 icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT || 2635 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { 2636 timer_active = 1; 2637 timer_expires = icsk->icsk_timeout; 2638 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) { 2639 timer_active = 4; 2640 timer_expires = icsk->icsk_timeout; 2641 } else if (timer_pending(&sk->sk_timer)) { 2642 timer_active = 2; 2643 timer_expires = sk->sk_timer.expires; 2644 } else { 2645 timer_active = 0; 2646 timer_expires = jiffies; 2647 } 2648 2649 state = inet_sk_state_load(sk); 2650 if (state == TCP_LISTEN) 2651 rx_queue = READ_ONCE(sk->sk_ack_backlog); 2652 else 2653 /* Because we don't lock the socket, 2654 * we might find a transient negative value. 2655 */ 2656 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) - 2657 READ_ONCE(tp->copied_seq), 0); 2658 2659 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX " 2660 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d", 2661 i, src, srcp, dest, destp, state, 2662 READ_ONCE(tp->write_seq) - tp->snd_una, 2663 rx_queue, 2664 timer_active, 2665 jiffies_delta_to_clock_t(timer_expires - jiffies), 2666 icsk->icsk_retransmits, 2667 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)), 2668 icsk->icsk_probes_out, 2669 sock_i_ino(sk), 2670 refcount_read(&sk->sk_refcnt), sk, 2671 jiffies_to_clock_t(icsk->icsk_rto), 2672 jiffies_to_clock_t(icsk->icsk_ack.ato), 2673 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk), 2674 tp->snd_cwnd, 2675 state == TCP_LISTEN ? 2676 fastopenq->max_qlen : 2677 (tcp_in_initial_slowstart(tp) ? 
-1 : tp->snd_ssthresh)); 2678 } 2679 2680 static void get_timewait4_sock(const struct inet_timewait_sock *tw, 2681 struct seq_file *f, int i) 2682 { 2683 long delta = tw->tw_timer.expires - jiffies; 2684 __be32 dest, src; 2685 __u16 destp, srcp; 2686 2687 dest = tw->tw_daddr; 2688 src = tw->tw_rcv_saddr; 2689 destp = ntohs(tw->tw_dport); 2690 srcp = ntohs(tw->tw_sport); 2691 2692 seq_printf(f, "%4d: %08X:%04X %08X:%04X" 2693 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK", 2694 i, src, srcp, dest, destp, tw->tw_substate, 0, 0, 2695 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0, 2696 refcount_read(&tw->tw_refcnt), tw); 2697 } 2698 2699 #define TMPSZ 150 2700 2701 static int tcp4_seq_show(struct seq_file *seq, void *v) 2702 { 2703 struct tcp_iter_state *st; 2704 struct sock *sk = v; 2705 2706 seq_setwidth(seq, TMPSZ - 1); 2707 if (v == SEQ_START_TOKEN) { 2708 seq_puts(seq, " sl local_address rem_address st tx_queue " 2709 "rx_queue tr tm->when retrnsmt uid timeout " 2710 "inode"); 2711 goto out; 2712 } 2713 st = seq->private; 2714 2715 if (sk->sk_state == TCP_TIME_WAIT) 2716 get_timewait4_sock(v, seq, st->num); 2717 else if (sk->sk_state == TCP_NEW_SYN_RECV) 2718 get_openreq4(v, seq, st->num); 2719 else 2720 get_tcp4_sock(v, seq, st->num); 2721 out: 2722 seq_pad(seq, '\n'); 2723 return 0; 2724 } 2725 2726 #ifdef CONFIG_BPF_SYSCALL 2727 struct bpf_tcp_iter_state { 2728 struct tcp_iter_state state; 2729 unsigned int cur_sk; 2730 unsigned int end_sk; 2731 unsigned int max_sk; 2732 struct sock **batch; 2733 bool st_bucket_done; 2734 }; 2735 2736 struct bpf_iter__tcp { 2737 __bpf_md_ptr(struct bpf_iter_meta *, meta); 2738 __bpf_md_ptr(struct sock_common *, sk_common); 2739 uid_t uid __aligned(8); 2740 }; 2741 2742 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, 2743 struct sock_common *sk_common, uid_t uid) 2744 { 2745 struct bpf_iter__tcp ctx; 2746 2747 meta->seq_num--; /* skip SEQ_START_TOKEN */ 2748 ctx.meta = meta; 2749 ctx.sk_common = sk_common; 2750 ctx.uid = uid; 2751 return bpf_iter_run_prog(prog, &ctx); 2752 } 2753 2754 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter) 2755 { 2756 while (iter->cur_sk < iter->end_sk) 2757 sock_put(iter->batch[iter->cur_sk++]); 2758 } 2759 2760 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter, 2761 unsigned int new_batch_sz) 2762 { 2763 struct sock **new_batch; 2764 2765 new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz, 2766 GFP_USER | __GFP_NOWARN); 2767 if (!new_batch) 2768 return -ENOMEM; 2769 2770 bpf_iter_tcp_put_batch(iter); 2771 kvfree(iter->batch); 2772 iter->batch = new_batch; 2773 iter->max_sk = new_batch_sz; 2774 2775 return 0; 2776 } 2777 2778 static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq, 2779 struct sock *start_sk) 2780 { 2781 struct bpf_tcp_iter_state *iter = seq->private; 2782 struct tcp_iter_state *st = &iter->state; 2783 struct inet_connection_sock *icsk; 2784 unsigned int expected = 1; 2785 struct sock *sk; 2786 2787 sock_hold(start_sk); 2788 iter->batch[iter->end_sk++] = start_sk; 2789 2790 icsk = inet_csk(start_sk); 2791 inet_lhash2_for_each_icsk_continue(icsk) { 2792 sk = (struct sock *)icsk; 2793 if (seq_sk_match(seq, sk)) { 2794 if (iter->end_sk < iter->max_sk) { 2795 sock_hold(sk); 2796 iter->batch[iter->end_sk++] = sk; 2797 } 2798 expected++; 2799 } 2800 } 2801 spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock); 2802 2803 return expected; 2804 } 2805 2806 static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq, 
2807 struct sock *start_sk) 2808 { 2809 struct bpf_tcp_iter_state *iter = seq->private; 2810 struct tcp_iter_state *st = &iter->state; 2811 struct hlist_nulls_node *node; 2812 unsigned int expected = 1; 2813 struct sock *sk; 2814 2815 sock_hold(start_sk); 2816 iter->batch[iter->end_sk++] = start_sk; 2817 2818 sk = sk_nulls_next(start_sk); 2819 sk_nulls_for_each_from(sk, node) { 2820 if (seq_sk_match(seq, sk)) { 2821 if (iter->end_sk < iter->max_sk) { 2822 sock_hold(sk); 2823 iter->batch[iter->end_sk++] = sk; 2824 } 2825 expected++; 2826 } 2827 } 2828 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); 2829 2830 return expected; 2831 } 2832 2833 static struct sock *bpf_iter_tcp_batch(struct seq_file *seq) 2834 { 2835 struct bpf_tcp_iter_state *iter = seq->private; 2836 struct tcp_iter_state *st = &iter->state; 2837 unsigned int expected; 2838 bool resized = false; 2839 struct sock *sk; 2840 2841 /* The st->bucket is done. Directly advance to the next 2842 * bucket instead of having the tcp_seek_last_pos() to skip 2843 * one by one in the current bucket and eventually find out 2844 * it has to advance to the next bucket. 2845 */ 2846 if (iter->st_bucket_done) { 2847 st->offset = 0; 2848 st->bucket++; 2849 if (st->state == TCP_SEQ_STATE_LISTENING && 2850 st->bucket > tcp_hashinfo.lhash2_mask) { 2851 st->state = TCP_SEQ_STATE_ESTABLISHED; 2852 st->bucket = 0; 2853 } 2854 } 2855 2856 again: 2857 /* Get a new batch */ 2858 iter->cur_sk = 0; 2859 iter->end_sk = 0; 2860 iter->st_bucket_done = false; 2861 2862 sk = tcp_seek_last_pos(seq); 2863 if (!sk) 2864 return NULL; /* Done */ 2865 2866 if (st->state == TCP_SEQ_STATE_LISTENING) 2867 expected = bpf_iter_tcp_listening_batch(seq, sk); 2868 else 2869 expected = bpf_iter_tcp_established_batch(seq, sk); 2870 2871 if (iter->end_sk == expected) { 2872 iter->st_bucket_done = true; 2873 return sk; 2874 } 2875 2876 if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) { 2877 resized = true; 2878 goto again; 2879 } 2880 2881 return sk; 2882 } 2883 2884 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos) 2885 { 2886 /* bpf iter does not support lseek, so it always 2887 * continue from where it was stop()-ped. 2888 */ 2889 if (*pos) 2890 return bpf_iter_tcp_batch(seq); 2891 2892 return SEQ_START_TOKEN; 2893 } 2894 2895 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) 2896 { 2897 struct bpf_tcp_iter_state *iter = seq->private; 2898 struct tcp_iter_state *st = &iter->state; 2899 struct sock *sk; 2900 2901 /* Whenever seq_next() is called, the iter->cur_sk is 2902 * done with seq_show(), so advance to the next sk in 2903 * the batch. 2904 */ 2905 if (iter->cur_sk < iter->end_sk) { 2906 /* Keeping st->num consistent in tcp_iter_state. 2907 * bpf_iter_tcp does not use st->num. 2908 * meta.seq_num is used instead. 2909 */ 2910 st->num++; 2911 /* Move st->offset to the next sk in the bucket such that 2912 * the future start() will resume at st->offset in 2913 * st->bucket. See tcp_seek_last_pos(). 2914 */ 2915 st->offset++; 2916 sock_put(iter->batch[iter->cur_sk++]); 2917 } 2918 2919 if (iter->cur_sk < iter->end_sk) 2920 sk = iter->batch[iter->cur_sk]; 2921 else 2922 sk = bpf_iter_tcp_batch(seq); 2923 2924 ++*pos; 2925 /* Keeping st->last_pos consistent in tcp_iter_state. 2926 * bpf iter does not do lseek, so st->last_pos always equals to *pos. 
2927 */ 2928 st->last_pos = *pos; 2929 return sk; 2930 } 2931 2932 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v) 2933 { 2934 struct bpf_iter_meta meta; 2935 struct bpf_prog *prog; 2936 struct sock *sk = v; 2937 bool slow; 2938 uid_t uid; 2939 int ret; 2940 2941 if (v == SEQ_START_TOKEN) 2942 return 0; 2943 2944 if (sk_fullsock(sk)) 2945 slow = lock_sock_fast(sk); 2946 2947 if (unlikely(sk_unhashed(sk))) { 2948 ret = SEQ_SKIP; 2949 goto unlock; 2950 } 2951 2952 if (sk->sk_state == TCP_TIME_WAIT) { 2953 uid = 0; 2954 } else if (sk->sk_state == TCP_NEW_SYN_RECV) { 2955 const struct request_sock *req = v; 2956 2957 uid = from_kuid_munged(seq_user_ns(seq), 2958 sock_i_uid(req->rsk_listener)); 2959 } else { 2960 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)); 2961 } 2962 2963 meta.seq = seq; 2964 prog = bpf_iter_get_info(&meta, false); 2965 ret = tcp_prog_seq_show(prog, &meta, v, uid); 2966 2967 unlock: 2968 if (sk_fullsock(sk)) 2969 unlock_sock_fast(sk, slow); 2970 return ret; 2971 2972 } 2973 2974 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v) 2975 { 2976 struct bpf_tcp_iter_state *iter = seq->private; 2977 struct bpf_iter_meta meta; 2978 struct bpf_prog *prog; 2979 2980 if (!v) { 2981 meta.seq = seq; 2982 prog = bpf_iter_get_info(&meta, true); 2983 if (prog) 2984 (void)tcp_prog_seq_show(prog, &meta, v, 0); 2985 } 2986 2987 if (iter->cur_sk < iter->end_sk) { 2988 bpf_iter_tcp_put_batch(iter); 2989 iter->st_bucket_done = false; 2990 } 2991 } 2992 2993 static const struct seq_operations bpf_iter_tcp_seq_ops = { 2994 .show = bpf_iter_tcp_seq_show, 2995 .start = bpf_iter_tcp_seq_start, 2996 .next = bpf_iter_tcp_seq_next, 2997 .stop = bpf_iter_tcp_seq_stop, 2998 }; 2999 #endif 3000 static unsigned short seq_file_family(const struct seq_file *seq) 3001 { 3002 const struct tcp_seq_afinfo *afinfo; 3003 3004 #ifdef CONFIG_BPF_SYSCALL 3005 /* Iterated from bpf_iter. Let the bpf prog to filter instead. */ 3006 if (seq->op == &bpf_iter_tcp_seq_ops) 3007 return AF_UNSPEC; 3008 #endif 3009 3010 /* Iterated from proc fs */ 3011 afinfo = pde_data(file_inode(seq->file)); 3012 return afinfo->family; 3013 } 3014 3015 static const struct seq_operations tcp4_seq_ops = { 3016 .show = tcp4_seq_show, 3017 .start = tcp_seq_start, 3018 .next = tcp_seq_next, 3019 .stop = tcp_seq_stop, 3020 }; 3021 3022 static struct tcp_seq_afinfo tcp4_seq_afinfo = { 3023 .family = AF_INET, 3024 }; 3025 3026 static int __net_init tcp4_proc_init_net(struct net *net) 3027 { 3028 if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops, 3029 sizeof(struct tcp_iter_state), &tcp4_seq_afinfo)) 3030 return -ENOMEM; 3031 return 0; 3032 } 3033 3034 static void __net_exit tcp4_proc_exit_net(struct net *net) 3035 { 3036 remove_proc_entry("tcp", net->proc_net); 3037 } 3038 3039 static struct pernet_operations tcp4_net_ops = { 3040 .init = tcp4_proc_init_net, 3041 .exit = tcp4_proc_exit_net, 3042 }; 3043 3044 int __init tcp4_proc_init(void) 3045 { 3046 return register_pernet_subsys(&tcp4_net_ops); 3047 } 3048 3049 void tcp4_proc_exit(void) 3050 { 3051 unregister_pernet_subsys(&tcp4_net_ops); 3052 } 3053 #endif /* CONFIG_PROC_FS */ 3054 3055 /* @wake is one when sk_stream_write_space() calls us. 3056 * This sends EPOLLOUT only if notsent_bytes is half the limit. 3057 * This mimics the strategy used in sock_def_write_space(). 
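 * Concretely, (notsent_bytes << wake) is compared against
 * tcp_notsent_lowat(tp): e.g. with a 128KB lowat and wake == 1,
 * EPOLLOUT is signalled only once less than 64KB remains unsent.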
3058 */ 3059 bool tcp_stream_memory_free(const struct sock *sk, int wake) 3060 { 3061 const struct tcp_sock *tp = tcp_sk(sk); 3062 u32 notsent_bytes = READ_ONCE(tp->write_seq) - 3063 READ_ONCE(tp->snd_nxt); 3064 3065 return (notsent_bytes << wake) < tcp_notsent_lowat(tp); 3066 } 3067 EXPORT_SYMBOL(tcp_stream_memory_free); 3068 3069 struct proto tcp_prot = { 3070 .name = "TCP", 3071 .owner = THIS_MODULE, 3072 .close = tcp_close, 3073 .pre_connect = tcp_v4_pre_connect, 3074 .connect = tcp_v4_connect, 3075 .disconnect = tcp_disconnect, 3076 .accept = inet_csk_accept, 3077 .ioctl = tcp_ioctl, 3078 .init = tcp_v4_init_sock, 3079 .destroy = tcp_v4_destroy_sock, 3080 .shutdown = tcp_shutdown, 3081 .setsockopt = tcp_setsockopt, 3082 .getsockopt = tcp_getsockopt, 3083 .bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt, 3084 .keepalive = tcp_set_keepalive, 3085 .recvmsg = tcp_recvmsg, 3086 .sendmsg = tcp_sendmsg, 3087 .sendpage = tcp_sendpage, 3088 .backlog_rcv = tcp_v4_do_rcv, 3089 .release_cb = tcp_release_cb, 3090 .hash = inet_hash, 3091 .unhash = inet_unhash, 3092 .get_port = inet_csk_get_port, 3093 .put_port = inet_put_port, 3094 #ifdef CONFIG_BPF_SYSCALL 3095 .psock_update_sk_prot = tcp_bpf_update_proto, 3096 #endif 3097 .enter_memory_pressure = tcp_enter_memory_pressure, 3098 .leave_memory_pressure = tcp_leave_memory_pressure, 3099 .stream_memory_free = tcp_stream_memory_free, 3100 .sockets_allocated = &tcp_sockets_allocated, 3101 .orphan_count = &tcp_orphan_count, 3102 .memory_allocated = &tcp_memory_allocated, 3103 .memory_pressure = &tcp_memory_pressure, 3104 .sysctl_mem = sysctl_tcp_mem, 3105 .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem), 3106 .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), 3107 .max_header = MAX_TCP_HEADER, 3108 .obj_size = sizeof(struct tcp_sock), 3109 .slab_flags = SLAB_TYPESAFE_BY_RCU, 3110 .twsk_prot = &tcp_timewait_sock_ops, 3111 .rsk_prot = &tcp_request_sock_ops, 3112 .h.hashinfo = &tcp_hashinfo, 3113 .no_autobind = true, 3114 .diag_destroy = tcp_abort, 3115 }; 3116 EXPORT_SYMBOL(tcp_prot); 3117 3118 static void __net_exit tcp_sk_exit(struct net *net) 3119 { 3120 struct inet_timewait_death_row *tcp_death_row = net->ipv4.tcp_death_row; 3121 3122 if (net->ipv4.tcp_congestion_control) 3123 bpf_module_put(net->ipv4.tcp_congestion_control, 3124 net->ipv4.tcp_congestion_control->owner); 3125 if (refcount_dec_and_test(&tcp_death_row->tw_refcount)) 3126 kfree(tcp_death_row); 3127 } 3128 3129 static int __net_init tcp_sk_init(struct net *net) 3130 { 3131 int cnt; 3132 3133 net->ipv4.sysctl_tcp_ecn = 2; 3134 net->ipv4.sysctl_tcp_ecn_fallback = 1; 3135 3136 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS; 3137 net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS; 3138 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD; 3139 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL; 3140 net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS; 3141 3142 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME; 3143 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES; 3144 net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL; 3145 3146 net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES; 3147 net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES; 3148 net->ipv4.sysctl_tcp_syncookies = 1; 3149 net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH; 3150 net->ipv4.sysctl_tcp_retries1 = TCP_RETR1; 3151 net->ipv4.sysctl_tcp_retries2 = TCP_RETR2; 3152 net->ipv4.sysctl_tcp_orphan_retries = 0; 3153 
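	/* Like the defaults above, the per-netns values below can be changed
	 * at run time through their /proc/sys/net/ipv4/ counterparts.
	 */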
net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT; 3154 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX; 3155 net->ipv4.sysctl_tcp_tw_reuse = 2; 3156 net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1; 3157 3158 net->ipv4.tcp_death_row = kzalloc(sizeof(struct inet_timewait_death_row), GFP_KERNEL); 3159 if (!net->ipv4.tcp_death_row) 3160 return -ENOMEM; 3161 refcount_set(&net->ipv4.tcp_death_row->tw_refcount, 1); 3162 cnt = tcp_hashinfo.ehash_mask + 1; 3163 net->ipv4.tcp_death_row->sysctl_max_tw_buckets = cnt / 2; 3164 net->ipv4.tcp_death_row->hashinfo = &tcp_hashinfo; 3165 3166 net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128); 3167 net->ipv4.sysctl_tcp_sack = 1; 3168 net->ipv4.sysctl_tcp_window_scaling = 1; 3169 net->ipv4.sysctl_tcp_timestamps = 1; 3170 net->ipv4.sysctl_tcp_early_retrans = 3; 3171 net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION; 3172 net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */ 3173 net->ipv4.sysctl_tcp_retrans_collapse = 1; 3174 net->ipv4.sysctl_tcp_max_reordering = 300; 3175 net->ipv4.sysctl_tcp_dsack = 1; 3176 net->ipv4.sysctl_tcp_app_win = 31; 3177 net->ipv4.sysctl_tcp_adv_win_scale = 1; 3178 net->ipv4.sysctl_tcp_frto = 2; 3179 net->ipv4.sysctl_tcp_moderate_rcvbuf = 1; 3180 /* This limits the percentage of the congestion window which we 3181 * will allow a single TSO frame to consume. Building TSO frames 3182 * which are too large can cause TCP streams to be bursty. 3183 */ 3184 net->ipv4.sysctl_tcp_tso_win_divisor = 3; 3185 /* Default TSQ limit of 16 TSO segments */ 3186 net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536; 3187 /* rfc5961 challenge ack rate limiting */ 3188 net->ipv4.sysctl_tcp_challenge_ack_limit = 1000; 3189 net->ipv4.sysctl_tcp_min_tso_segs = 2; 3190 net->ipv4.sysctl_tcp_min_rtt_wlen = 300; 3191 net->ipv4.sysctl_tcp_autocorking = 1; 3192 net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2; 3193 net->ipv4.sysctl_tcp_pacing_ss_ratio = 200; 3194 net->ipv4.sysctl_tcp_pacing_ca_ratio = 120; 3195 if (net != &init_net) { 3196 memcpy(net->ipv4.sysctl_tcp_rmem, 3197 init_net.ipv4.sysctl_tcp_rmem, 3198 sizeof(init_net.ipv4.sysctl_tcp_rmem)); 3199 memcpy(net->ipv4.sysctl_tcp_wmem, 3200 init_net.ipv4.sysctl_tcp_wmem, 3201 sizeof(init_net.ipv4.sysctl_tcp_wmem)); 3202 } 3203 net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC; 3204 net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC; 3205 net->ipv4.sysctl_tcp_comp_sack_nr = 44; 3206 net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; 3207 net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0; 3208 atomic_set(&net->ipv4.tfo_active_disable_times, 0); 3209 3210 /* Reno is always built in */ 3211 if (!net_eq(net, &init_net) && 3212 bpf_try_module_get(init_net.ipv4.tcp_congestion_control, 3213 init_net.ipv4.tcp_congestion_control->owner)) 3214 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control; 3215 else 3216 net->ipv4.tcp_congestion_control = &tcp_reno; 3217 3218 return 0; 3219 } 3220 3221 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list) 3222 { 3223 struct net *net; 3224 3225 list_for_each_entry(net, net_exit_list, exit_list) 3226 tcp_fastopen_ctx_destroy(net); 3227 } 3228 3229 static struct pernet_operations __net_initdata tcp_sk_ops = { 3230 .init = tcp_sk_init, 3231 .exit = tcp_sk_exit, 3232 .exit_batch = tcp_sk_exit_batch, 3233 }; 3234 3235 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 3236 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta, 3237 struct sock_common *sk_common, uid_t uid) 3238 
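/* The bpf iterator collects sockets one bucket at a time: the batch
 * starts at INIT_BATCH_SZ entries and bpf_iter_tcp_realloc_batch()
 * regrows it to 3/2 of the bucket's expected socket count whenever a
 * bucket does not fit (see bpf_iter_tcp_batch()).
 */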
3239 #define INIT_BATCH_SZ 16 3240 3241 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux) 3242 { 3243 struct bpf_tcp_iter_state *iter = priv_data; 3244 int err; 3245 3246 err = bpf_iter_init_seq_net(priv_data, aux); 3247 if (err) 3248 return err; 3249 3250 err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ); 3251 if (err) { 3252 bpf_iter_fini_seq_net(priv_data); 3253 return err; 3254 } 3255 3256 return 0; 3257 } 3258 3259 static void bpf_iter_fini_tcp(void *priv_data) 3260 { 3261 struct bpf_tcp_iter_state *iter = priv_data; 3262 3263 bpf_iter_fini_seq_net(priv_data); 3264 kvfree(iter->batch); 3265 } 3266 3267 static const struct bpf_iter_seq_info tcp_seq_info = { 3268 .seq_ops = &bpf_iter_tcp_seq_ops, 3269 .init_seq_private = bpf_iter_init_tcp, 3270 .fini_seq_private = bpf_iter_fini_tcp, 3271 .seq_priv_size = sizeof(struct bpf_tcp_iter_state), 3272 }; 3273 3274 static const struct bpf_func_proto * 3275 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id, 3276 const struct bpf_prog *prog) 3277 { 3278 switch (func_id) { 3279 case BPF_FUNC_setsockopt: 3280 return &bpf_sk_setsockopt_proto; 3281 case BPF_FUNC_getsockopt: 3282 return &bpf_sk_getsockopt_proto; 3283 default: 3284 return NULL; 3285 } 3286 } 3287 3288 static struct bpf_iter_reg tcp_reg_info = { 3289 .target = "tcp", 3290 .ctx_arg_info_size = 1, 3291 .ctx_arg_info = { 3292 { offsetof(struct bpf_iter__tcp, sk_common), 3293 PTR_TO_BTF_ID_OR_NULL }, 3294 }, 3295 .get_func_proto = bpf_iter_tcp_get_func_proto, 3296 .seq_info = &tcp_seq_info, 3297 }; 3298 3299 static void __init bpf_iter_register(void) 3300 { 3301 tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON]; 3302 if (bpf_iter_reg_target(&tcp_reg_info)) 3303 pr_warn("Warning: could not register bpf iterator tcp\n"); 3304 } 3305 3306 #endif 3307 3308 void __init tcp_v4_init(void) 3309 { 3310 int cpu, res; 3311 3312 for_each_possible_cpu(cpu) { 3313 struct sock *sk; 3314 3315 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW, 3316 IPPROTO_TCP, &init_net); 3317 if (res) 3318 panic("Failed to create the TCP control socket.\n"); 3319 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); 3320 3321 /* Please enforce IP_DF and IPID==0 for RST and 3322 * ACK sent in SYN-RECV and TIME-WAIT state. 3323 */ 3324 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO; 3325 3326 per_cpu(ipv4_tcp_sk, cpu) = sk; 3327 } 3328 if (register_pernet_subsys(&tcp_sk_ops)) 3329 panic("Failed to create the TCP control socket.\n"); 3330 3331 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 3332 bpf_iter_register(); 3333 #endif 3334 } 3335