1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * INET An implementation of the TCP/IP protocol suite for the LINUX 4 * operating system. INET is implemented using the BSD Socket 5 * interface as the means of communication with the user level. 6 * 7 * Implementation of the Transmission Control Protocol(TCP). 8 * 9 * IPv4 specific functions 10 * 11 * code split from: 12 * linux/ipv4/tcp.c 13 * linux/ipv4/tcp_input.c 14 * linux/ipv4/tcp_output.c 15 * 16 * See tcp.c for author information 17 */ 18 19 /* 20 * Changes: 21 * David S. Miller : New socket lookup architecture. 22 * This code is dedicated to John Dyson. 23 * David S. Miller : Change semantics of established hash, 24 * half is devoted to TIME_WAIT sockets 25 * and the rest go in the other half. 26 * Andi Kleen : Add support for syncookies and fixed 27 * some bugs: ip options weren't passed to 28 * the TCP layer, missed a check for an 29 * ACK bit. 30 * Andi Kleen : Implemented fast path mtu discovery. 31 * Fixed many serious bugs in the 32 * request_sock handling and moved 33 * most of it into the af independent code. 34 * Added tail drop and some other bugfixes. 35 * Added new listen semantics. 36 * Mike McLagan : Routing by source 37 * Juan Jose Ciarlante: ip_dynaddr bits 38 * Andi Kleen: various fixes. 39 * Vitaly E. Lavrov : Transparent proxy revived after year 40 * coma. 41 * Andi Kleen : Fix new listen. 42 * Andi Kleen : Fix accept error reporting. 43 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which 44 * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind 45 * a single port at the same time. 46 */ 47 48 #define pr_fmt(fmt) "TCP: " fmt 49 50 #include <linux/bottom_half.h> 51 #include <linux/types.h> 52 #include <linux/fcntl.h> 53 #include <linux/module.h> 54 #include <linux/random.h> 55 #include <linux/cache.h> 56 #include <linux/jhash.h> 57 #include <linux/init.h> 58 #include <linux/times.h> 59 #include <linux/slab.h> 60 61 #include <net/net_namespace.h> 62 #include <net/icmp.h> 63 #include <net/inet_hashtables.h> 64 #include <net/tcp.h> 65 #include <net/transp_v6.h> 66 #include <net/ipv6.h> 67 #include <net/inet_common.h> 68 #include <net/timewait_sock.h> 69 #include <net/xfrm.h> 70 #include <net/secure_seq.h> 71 #include <net/busy_poll.h> 72 73 #include <linux/inet.h> 74 #include <linux/ipv6.h> 75 #include <linux/stddef.h> 76 #include <linux/proc_fs.h> 77 #include <linux/seq_file.h> 78 #include <linux/inetdevice.h> 79 #include <linux/btf_ids.h> 80 81 #include <crypto/hash.h> 82 #include <linux/scatterlist.h> 83 84 #include <trace/events/tcp.h> 85 86 #ifdef CONFIG_TCP_MD5SIG 87 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, 88 __be32 daddr, __be32 saddr, const struct tcphdr *th); 89 #endif 90 91 struct inet_hashinfo tcp_hashinfo; 92 EXPORT_SYMBOL(tcp_hashinfo); 93 94 static u32 tcp_v4_init_seq(const struct sk_buff *skb) 95 { 96 return secure_tcp_seq(ip_hdr(skb)->daddr, 97 ip_hdr(skb)->saddr, 98 tcp_hdr(skb)->dest, 99 tcp_hdr(skb)->source); 100 } 101 102 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb) 103 { 104 return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr); 105 } 106 107 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) 108 { 109 const struct inet_timewait_sock *tw = inet_twsk(sktw); 110 const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw); 111 struct tcp_sock *tp = tcp_sk(sk); 112 int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse; 113 114 if (reuse == 2) { 115 /* Still does not 
detect *everything* that goes through 116 * lo, since we require a loopback src or dst address 117 * or direct binding to 'lo' interface. 118 */ 119 bool loopback = false; 120 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX) 121 loopback = true; 122 #if IS_ENABLED(CONFIG_IPV6) 123 if (tw->tw_family == AF_INET6) { 124 if (ipv6_addr_loopback(&tw->tw_v6_daddr) || 125 ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) || 126 ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) || 127 ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr)) 128 loopback = true; 129 } else 130 #endif 131 { 132 if (ipv4_is_loopback(tw->tw_daddr) || 133 ipv4_is_loopback(tw->tw_rcv_saddr)) 134 loopback = true; 135 } 136 if (!loopback) 137 reuse = 0; 138 } 139 140 /* With PAWS, it is safe from the viewpoint 141 of data integrity. Even without PAWS it is safe provided sequence 142 spaces do not overlap i.e. at data rates <= 80Mbit/sec. 143 144 Actually, the idea is close to VJ's one, only timestamp cache is 145 held not per host, but per port pair and TW bucket is used as state 146 holder. 147 148 If TW bucket has been already destroyed we fall back to VJ's scheme 149 and use initial timestamp retrieved from peer table. 150 */ 151 if (tcptw->tw_ts_recent_stamp && 152 (!twp || (reuse && time_after32(ktime_get_seconds(), 153 tcptw->tw_ts_recent_stamp)))) { 154 /* In case of repair and re-using TIME-WAIT sockets we still 155 * want to be sure that it is safe as above but honor the 156 * sequence numbers and time stamps set as part of the repair 157 * process. 158 * 159 * Without this check re-using a TIME-WAIT socket with TCP 160 * repair would accumulate a -1 on the repair assigned 161 * sequence number. The first time it is reused the sequence 162 * is -1, the second time -2, etc. This fixes that issue 163 * without appearing to create any others. 164 */ 165 if (likely(!tp->repair)) { 166 u32 seq = tcptw->tw_snd_nxt + 65535 + 2; 167 168 if (!seq) 169 seq = 1; 170 WRITE_ONCE(tp->write_seq, seq); 171 tp->rx_opt.ts_recent = tcptw->tw_ts_recent; 172 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp; 173 } 174 sock_hold(sktw); 175 return 1; 176 } 177 178 return 0; 179 } 180 EXPORT_SYMBOL_GPL(tcp_twsk_unique); 181 182 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr, 183 int addr_len) 184 { 185 /* This check is replicated from tcp_v4_connect() and intended to 186 * prevent BPF program called below from accessing bytes that are out 187 * of the bound specified by user in addr_len. 188 */ 189 if (addr_len < sizeof(struct sockaddr_in)) 190 return -EINVAL; 191 192 sock_owned_by_me(sk); 193 194 return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr); 195 } 196 197 /* This will initiate an outgoing connection. 
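 *
 * Illustrative caller sketch (hypothetical, not taken from this file): the
 * caller supplies at least sizeof(struct sockaddr_in) bytes with
 * sin_family set to AF_INET, while holding the socket lock:
 *
 *	struct sockaddr_in sin = { .sin_family = AF_INET };
 *
 *	sin.sin_port = htons(80);
 *	sin.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
 *	err = tcp_v4_connect(sk, (struct sockaddr *)&sin, sizeof(sin));
 *
 * A shorter addr_len is rejected with -EINVAL and a different address
 * family with -EAFNOSUPPORT, as the checks below show.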
*/ 198 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) 199 { 200 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr; 201 struct inet_sock *inet = inet_sk(sk); 202 struct tcp_sock *tp = tcp_sk(sk); 203 __be16 orig_sport, orig_dport; 204 __be32 daddr, nexthop; 205 struct flowi4 *fl4; 206 struct rtable *rt; 207 int err; 208 struct ip_options_rcu *inet_opt; 209 struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row; 210 211 if (addr_len < sizeof(struct sockaddr_in)) 212 return -EINVAL; 213 214 if (usin->sin_family != AF_INET) 215 return -EAFNOSUPPORT; 216 217 nexthop = daddr = usin->sin_addr.s_addr; 218 inet_opt = rcu_dereference_protected(inet->inet_opt, 219 lockdep_sock_is_held(sk)); 220 if (inet_opt && inet_opt->opt.srr) { 221 if (!daddr) 222 return -EINVAL; 223 nexthop = inet_opt->opt.faddr; 224 } 225 226 orig_sport = inet->inet_sport; 227 orig_dport = usin->sin_port; 228 fl4 = &inet->cork.fl.u.ip4; 229 rt = ip_route_connect(fl4, nexthop, inet->inet_saddr, 230 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if, 231 IPPROTO_TCP, 232 orig_sport, orig_dport, sk); 233 if (IS_ERR(rt)) { 234 err = PTR_ERR(rt); 235 if (err == -ENETUNREACH) 236 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES); 237 return err; 238 } 239 240 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) { 241 ip_rt_put(rt); 242 return -ENETUNREACH; 243 } 244 245 if (!inet_opt || !inet_opt->opt.srr) 246 daddr = fl4->daddr; 247 248 if (!inet->inet_saddr) 249 inet->inet_saddr = fl4->saddr; 250 sk_rcv_saddr_set(sk, inet->inet_saddr); 251 252 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) { 253 /* Reset inherited state */ 254 tp->rx_opt.ts_recent = 0; 255 tp->rx_opt.ts_recent_stamp = 0; 256 if (likely(!tp->repair)) 257 WRITE_ONCE(tp->write_seq, 0); 258 } 259 260 inet->inet_dport = usin->sin_port; 261 sk_daddr_set(sk, daddr); 262 263 inet_csk(sk)->icsk_ext_hdr_len = 0; 264 if (inet_opt) 265 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen; 266 267 tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT; 268 269 /* Socket identity is still unknown (sport may be zero). 270 * However we set state to SYN-SENT and not releasing socket 271 * lock select source port, enter ourselves into the hash tables and 272 * complete initialization after this. 273 */ 274 tcp_set_state(sk, TCP_SYN_SENT); 275 err = inet_hash_connect(tcp_death_row, sk); 276 if (err) 277 goto failure; 278 279 sk_set_txhash(sk); 280 281 rt = ip_route_newports(fl4, rt, orig_sport, orig_dport, 282 inet->inet_sport, inet->inet_dport, sk); 283 if (IS_ERR(rt)) { 284 err = PTR_ERR(rt); 285 rt = NULL; 286 goto failure; 287 } 288 /* OK, now commit destination to socket. */ 289 sk->sk_gso_type = SKB_GSO_TCPV4; 290 sk_setup_caps(sk, &rt->dst); 291 rt = NULL; 292 293 if (likely(!tp->repair)) { 294 if (!tp->write_seq) 295 WRITE_ONCE(tp->write_seq, 296 secure_tcp_seq(inet->inet_saddr, 297 inet->inet_daddr, 298 inet->inet_sport, 299 usin->sin_port)); 300 tp->tsoffset = secure_tcp_ts_off(sock_net(sk), 301 inet->inet_saddr, 302 inet->inet_daddr); 303 } 304 305 inet->inet_id = prandom_u32(); 306 307 if (tcp_fastopen_defer_connect(sk, &err)) 308 return err; 309 if (err) 310 goto failure; 311 312 err = tcp_connect(sk); 313 314 if (err) 315 goto failure; 316 317 return 0; 318 319 failure: 320 /* 321 * This unhashes the socket and releases the local port, 322 * if necessary. 
323 */ 324 tcp_set_state(sk, TCP_CLOSE); 325 ip_rt_put(rt); 326 sk->sk_route_caps = 0; 327 inet->inet_dport = 0; 328 return err; 329 } 330 EXPORT_SYMBOL(tcp_v4_connect); 331 332 /* 333 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191. 334 * It can be called through tcp_release_cb() if socket was owned by user 335 * at the time tcp_v4_err() was called to handle ICMP message. 336 */ 337 void tcp_v4_mtu_reduced(struct sock *sk) 338 { 339 struct inet_sock *inet = inet_sk(sk); 340 struct dst_entry *dst; 341 u32 mtu; 342 343 if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) 344 return; 345 mtu = tcp_sk(sk)->mtu_info; 346 dst = inet_csk_update_pmtu(sk, mtu); 347 if (!dst) 348 return; 349 350 /* Something is about to be wrong... Remember soft error 351 * for the case, if this connection will not able to recover. 352 */ 353 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst)) 354 sk->sk_err_soft = EMSGSIZE; 355 356 mtu = dst_mtu(dst); 357 358 if (inet->pmtudisc != IP_PMTUDISC_DONT && 359 ip_sk_accept_pmtu(sk) && 360 inet_csk(sk)->icsk_pmtu_cookie > mtu) { 361 tcp_sync_mss(sk, mtu); 362 363 /* Resend the TCP packet because it's 364 * clear that the old packet has been 365 * dropped. This is the new "fast" path mtu 366 * discovery. 367 */ 368 tcp_simple_retransmit(sk); 369 } /* else let the usual retransmit timer handle it */ 370 } 371 EXPORT_SYMBOL(tcp_v4_mtu_reduced); 372 373 static void do_redirect(struct sk_buff *skb, struct sock *sk) 374 { 375 struct dst_entry *dst = __sk_dst_check(sk, 0); 376 377 if (dst) 378 dst->ops->redirect(dst, sk, skb); 379 } 380 381 382 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */ 383 void tcp_req_err(struct sock *sk, u32 seq, bool abort) 384 { 385 struct request_sock *req = inet_reqsk(sk); 386 struct net *net = sock_net(sk); 387 388 /* ICMPs are not backlogged, hence we cannot get 389 * an established socket here. 390 */ 391 if (seq != tcp_rsk(req)->snt_isn) { 392 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS); 393 } else if (abort) { 394 /* 395 * Still in SYN_RECV, just remove it silently. 396 * There is no good way to pass the error to the newly 397 * created socket, and POSIX does not want network 398 * errors returned from accept(). 399 */ 400 inet_csk_reqsk_queue_drop(req->rsk_listener, req); 401 tcp_listendrop(req->rsk_listener); 402 } 403 reqsk_put(req); 404 } 405 EXPORT_SYMBOL(tcp_req_err); 406 407 /* TCP-LD (RFC 6069) logic */ 408 void tcp_ld_RTO_revert(struct sock *sk, u32 seq) 409 { 410 struct inet_connection_sock *icsk = inet_csk(sk); 411 struct tcp_sock *tp = tcp_sk(sk); 412 struct sk_buff *skb; 413 s32 remaining; 414 u32 delta_us; 415 416 if (sock_owned_by_user(sk)) 417 return; 418 419 if (seq != tp->snd_una || !icsk->icsk_retransmits || 420 !icsk->icsk_backoff) 421 return; 422 423 skb = tcp_rtx_queue_head(sk); 424 if (WARN_ON_ONCE(!skb)) 425 return; 426 427 icsk->icsk_backoff--; 428 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT; 429 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX); 430 431 tcp_mstamp_refresh(tp); 432 delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb)); 433 remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us); 434 435 if (remaining > 0) { 436 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, 437 remaining, TCP_RTO_MAX); 438 } else { 439 /* RTO revert clocked out retransmission. 440 * Will retransmit now. 
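 *
 * Worked example (numbers purely illustrative): if the reverted RTO is
 * 200 ms but 250 ms have already elapsed since the head of the
 * retransmit queue was last sent, then
 *
 *	remaining = icsk_rto - usecs_to_jiffies(delta_us) = 200 - 250 < 0
 *
 * and we fall through to an immediate retransmit; had only 50 ms
 * elapsed, remaining would be 150 ms and the timer would simply be
 * re-armed for that interval.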
441 */ 442 tcp_retransmit_timer(sk); 443 } 444 } 445 EXPORT_SYMBOL(tcp_ld_RTO_revert); 446 447 /* 448 * This routine is called by the ICMP module when it gets some 449 * sort of error condition. If err < 0 then the socket should 450 * be closed and the error returned to the user. If err > 0 451 * it's just the icmp type << 8 | icmp code. After adjustment 452 * header points to the first 8 bytes of the tcp header. We need 453 * to find the appropriate port. 454 * 455 * The locking strategy used here is very "optimistic". When 456 * someone else accesses the socket the ICMP is just dropped 457 * and for some paths there is no check at all. 458 * A more general error queue to queue errors for later handling 459 * is probably better. 460 * 461 */ 462 463 int tcp_v4_err(struct sk_buff *skb, u32 info) 464 { 465 const struct iphdr *iph = (const struct iphdr *)skb->data; 466 struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2)); 467 struct tcp_sock *tp; 468 struct inet_sock *inet; 469 const int type = icmp_hdr(skb)->type; 470 const int code = icmp_hdr(skb)->code; 471 struct sock *sk; 472 struct request_sock *fastopen; 473 u32 seq, snd_una; 474 int err; 475 struct net *net = dev_net(skb->dev); 476 477 sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr, 478 th->dest, iph->saddr, ntohs(th->source), 479 inet_iif(skb), 0); 480 if (!sk) { 481 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); 482 return -ENOENT; 483 } 484 if (sk->sk_state == TCP_TIME_WAIT) { 485 inet_twsk_put(inet_twsk(sk)); 486 return 0; 487 } 488 seq = ntohl(th->seq); 489 if (sk->sk_state == TCP_NEW_SYN_RECV) { 490 tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB || 491 type == ICMP_TIME_EXCEEDED || 492 (type == ICMP_DEST_UNREACH && 493 (code == ICMP_NET_UNREACH || 494 code == ICMP_HOST_UNREACH))); 495 return 0; 496 } 497 498 bh_lock_sock(sk); 499 /* If too many ICMPs get dropped on busy 500 * servers this needs to be solved differently. 501 * We do take care of PMTU discovery (RFC1191) special case : 502 * we can receive locally generated ICMP messages while socket is held. 503 */ 504 if (sock_owned_by_user(sk)) { 505 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)) 506 __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS); 507 } 508 if (sk->sk_state == TCP_CLOSE) 509 goto out; 510 511 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) { 512 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); 513 goto out; 514 } 515 516 tp = tcp_sk(sk); 517 /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */ 518 fastopen = rcu_dereference(tp->fastopen_rsk); 519 snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una; 520 if (sk->sk_state != TCP_LISTEN && 521 !between(seq, snd_una, tp->snd_nxt)) { 522 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS); 523 goto out; 524 } 525 526 switch (type) { 527 case ICMP_REDIRECT: 528 if (!sock_owned_by_user(sk)) 529 do_redirect(skb, sk); 530 goto out; 531 case ICMP_SOURCE_QUENCH: 532 /* Just silently ignore these. */ 533 goto out; 534 case ICMP_PARAMETERPROB: 535 err = EPROTO; 536 break; 537 case ICMP_DEST_UNREACH: 538 if (code > NR_ICMP_UNREACH) 539 goto out; 540 541 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */ 542 /* We are not interested in TCP_LISTEN and open_requests 543 * (SYN-ACKs send out by Linux are always <576bytes so 544 * they should go through unfragmented). 
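 *
 * Worked example (values illustrative only): if the path was assumed to
 * have a 1500 byte MTU (icsk_pmtu_cookie == 1500) and the ICMP message
 * advertises 1400, tcp_v4_mtu_reduced() notices 1400 < icsk_pmtu_cookie,
 * calls tcp_sync_mss() so the MSS drops to roughly 1400 - 40 = 1360
 * bytes (no IP/TCP options), and tcp_simple_retransmit() resends the
 * oversized packets without waiting for the RTO timer.  When the socket
 * is owned by the user at that moment, the update is only flagged here
 * and performed later from tcp_release_cb().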
545 */ 546 if (sk->sk_state == TCP_LISTEN) 547 goto out; 548 549 tp->mtu_info = info; 550 if (!sock_owned_by_user(sk)) { 551 tcp_v4_mtu_reduced(sk); 552 } else { 553 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags)) 554 sock_hold(sk); 555 } 556 goto out; 557 } 558 559 err = icmp_err_convert[code].errno; 560 /* check if this ICMP message allows revert of backoff. 561 * (see RFC 6069) 562 */ 563 if (!fastopen && 564 (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH)) 565 tcp_ld_RTO_revert(sk, seq); 566 break; 567 case ICMP_TIME_EXCEEDED: 568 err = EHOSTUNREACH; 569 break; 570 default: 571 goto out; 572 } 573 574 switch (sk->sk_state) { 575 case TCP_SYN_SENT: 576 case TCP_SYN_RECV: 577 /* Only in fast or simultaneous open. If a fast open socket is 578 * already accepted it is treated as a connected one below. 579 */ 580 if (fastopen && !fastopen->sk) 581 break; 582 583 ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th); 584 585 if (!sock_owned_by_user(sk)) { 586 sk->sk_err = err; 587 588 sk->sk_error_report(sk); 589 590 tcp_done(sk); 591 } else { 592 sk->sk_err_soft = err; 593 } 594 goto out; 595 } 596 597 /* If we've already connected we will keep trying 598 * until we time out, or the user gives up. 599 * 600 * rfc1122 4.2.3.9 allows to consider as hard errors 601 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too, 602 * but it is obsoleted by pmtu discovery). 603 * 604 * Note, that in modern internet, where routing is unreliable 605 * and in each dark corner broken firewalls sit, sending random 606 * errors ordered by their masters even this two messages finally lose 607 * their original sense (even Linux sends invalid PORT_UNREACHs) 608 * 609 * Now we are in compliance with RFCs. 610 * --ANK (980905) 611 */ 612 613 inet = inet_sk(sk); 614 if (!sock_owned_by_user(sk) && inet->recverr) { 615 sk->sk_err = err; 616 sk->sk_error_report(sk); 617 } else { /* Only an error on timeout */ 618 sk->sk_err_soft = err; 619 } 620 621 out: 622 bh_unlock_sock(sk); 623 sock_put(sk); 624 return 0; 625 } 626 627 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr) 628 { 629 struct tcphdr *th = tcp_hdr(skb); 630 631 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0); 632 skb->csum_start = skb_transport_header(skb) - skb->head; 633 skb->csum_offset = offsetof(struct tcphdr, check); 634 } 635 636 /* This routine computes an IPv4 TCP checksum. */ 637 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb) 638 { 639 const struct inet_sock *inet = inet_sk(sk); 640 641 __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr); 642 } 643 EXPORT_SYMBOL(tcp_v4_send_check); 644 645 /* 646 * This routine will send an RST to the other tcp. 647 * 648 * Someone asks: why I NEVER use socket parameters (TOS, TTL etc.) 649 * for reset. 650 * Answer: if a packet caused RST, it is not for a socket 651 * existing in our system, if it is matched to a socket, 652 * it is just duplicate segment or bug in other side's TCP. 653 * So that we build reply only basing on parameters 654 * arrived with segment. 655 * Exception: precedence violation. We do not implement it in any case. 
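 *
 * Worked example of how the RST is sequenced (numbers illustrative
 * only): if the offending segment carries an ACK, the RST simply reuses
 * that acknowledgment number as its own sequence number.  If it carries
 * no ACK, e.g. a bare SYN with seq = 1000 and no payload, the RST is
 * sent with
 *
 *	ack = 1, ack_seq = seq + SYN + FIN + payload = 1000 + 1 = 1001
 *
 * which is exactly what the rep.th setup below computes.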
656 */ 657 658 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) 659 { 660 const struct tcphdr *th = tcp_hdr(skb); 661 struct { 662 struct tcphdr th; 663 #ifdef CONFIG_TCP_MD5SIG 664 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)]; 665 #endif 666 } rep; 667 struct ip_reply_arg arg; 668 #ifdef CONFIG_TCP_MD5SIG 669 struct tcp_md5sig_key *key = NULL; 670 const __u8 *hash_location = NULL; 671 unsigned char newhash[16]; 672 int genhash; 673 struct sock *sk1 = NULL; 674 #endif 675 u64 transmit_time = 0; 676 struct sock *ctl_sk; 677 struct net *net; 678 679 /* Never send a reset in response to a reset. */ 680 if (th->rst) 681 return; 682 683 /* If sk not NULL, it means we did a successful lookup and incoming 684 * route had to be correct. prequeue might have dropped our dst. 685 */ 686 if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL) 687 return; 688 689 /* Swap the send and the receive. */ 690 memset(&rep, 0, sizeof(rep)); 691 rep.th.dest = th->source; 692 rep.th.source = th->dest; 693 rep.th.doff = sizeof(struct tcphdr) / 4; 694 rep.th.rst = 1; 695 696 if (th->ack) { 697 rep.th.seq = th->ack_seq; 698 } else { 699 rep.th.ack = 1; 700 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin + 701 skb->len - (th->doff << 2)); 702 } 703 704 memset(&arg, 0, sizeof(arg)); 705 arg.iov[0].iov_base = (unsigned char *)&rep; 706 arg.iov[0].iov_len = sizeof(rep.th); 707 708 net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev); 709 #ifdef CONFIG_TCP_MD5SIG 710 rcu_read_lock(); 711 hash_location = tcp_parse_md5sig_option(th); 712 if (sk && sk_fullsock(sk)) { 713 const union tcp_md5_addr *addr; 714 int l3index; 715 716 /* sdif set, means packet ingressed via a device 717 * in an L3 domain and inet_iif is set to it. 718 */ 719 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0; 720 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; 721 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET); 722 } else if (hash_location) { 723 const union tcp_md5_addr *addr; 724 int sdif = tcp_v4_sdif(skb); 725 int dif = inet_iif(skb); 726 int l3index; 727 728 /* 729 * active side is lost. Try to find listening socket through 730 * source port, and then find md5 key through listening socket. 731 * we are not loose security here: 732 * Incoming packet is checked with md5 hash with finding key, 733 * no RST generated if md5 hash doesn't match. 734 */ 735 sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0, 736 ip_hdr(skb)->saddr, 737 th->source, ip_hdr(skb)->daddr, 738 ntohs(th->source), dif, sdif); 739 /* don't send rst if it can't find key */ 740 if (!sk1) 741 goto out; 742 743 /* sdif set, means packet ingressed via a device 744 * in an L3 domain and dif is set to it. 745 */ 746 l3index = sdif ? 
dif : 0; 747 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; 748 key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET); 749 if (!key) 750 goto out; 751 752 753 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb); 754 if (genhash || memcmp(hash_location, newhash, 16) != 0) 755 goto out; 756 757 } 758 759 if (key) { 760 rep.opt[0] = htonl((TCPOPT_NOP << 24) | 761 (TCPOPT_NOP << 16) | 762 (TCPOPT_MD5SIG << 8) | 763 TCPOLEN_MD5SIG); 764 /* Update length and the length the header thinks exists */ 765 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED; 766 rep.th.doff = arg.iov[0].iov_len / 4; 767 768 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1], 769 key, ip_hdr(skb)->saddr, 770 ip_hdr(skb)->daddr, &rep.th); 771 } 772 #endif 773 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr, 774 ip_hdr(skb)->saddr, /* XXX */ 775 arg.iov[0].iov_len, IPPROTO_TCP, 0); 776 arg.csumoffset = offsetof(struct tcphdr, check) / 2; 777 arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0; 778 779 /* When socket is gone, all binding information is lost. 780 * routing might fail in this case. No choice here, if we choose to force 781 * input interface, we will misroute in case of asymmetric route. 782 */ 783 if (sk) { 784 arg.bound_dev_if = sk->sk_bound_dev_if; 785 if (sk_fullsock(sk)) 786 trace_tcp_send_reset(sk, skb); 787 } 788 789 BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) != 790 offsetof(struct inet_timewait_sock, tw_bound_dev_if)); 791 792 arg.tos = ip_hdr(skb)->tos; 793 arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL); 794 local_bh_disable(); 795 ctl_sk = this_cpu_read(*net->ipv4.tcp_sk); 796 if (sk) { 797 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? 798 inet_twsk(sk)->tw_mark : sk->sk_mark; 799 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ? 800 inet_twsk(sk)->tw_priority : sk->sk_priority; 801 transmit_time = tcp_transmit_time(sk); 802 } 803 ip_send_unicast_reply(ctl_sk, 804 skb, &TCP_SKB_CB(skb)->header.h4.opt, 805 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, 806 &arg, arg.iov[0].iov_len, 807 transmit_time); 808 809 ctl_sk->sk_mark = 0; 810 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS); 811 __TCP_INC_STATS(net, TCP_MIB_OUTRSTS); 812 local_bh_enable(); 813 814 #ifdef CONFIG_TCP_MD5SIG 815 out: 816 rcu_read_unlock(); 817 #endif 818 } 819 820 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states 821 outside socket context is ugly, certainly. What can I do? 822 */ 823 824 static void tcp_v4_send_ack(const struct sock *sk, 825 struct sk_buff *skb, u32 seq, u32 ack, 826 u32 win, u32 tsval, u32 tsecr, int oif, 827 struct tcp_md5sig_key *key, 828 int reply_flags, u8 tos) 829 { 830 const struct tcphdr *th = tcp_hdr(skb); 831 struct { 832 struct tcphdr th; 833 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2) 834 #ifdef CONFIG_TCP_MD5SIG 835 + (TCPOLEN_MD5SIG_ALIGNED >> 2) 836 #endif 837 ]; 838 } rep; 839 struct net *net = sock_net(sk); 840 struct ip_reply_arg arg; 841 struct sock *ctl_sk; 842 u64 transmit_time; 843 844 memset(&rep.th, 0, sizeof(struct tcphdr)); 845 memset(&arg, 0, sizeof(arg)); 846 847 arg.iov[0].iov_base = (unsigned char *)&rep; 848 arg.iov[0].iov_len = sizeof(rep.th); 849 if (tsecr) { 850 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | 851 (TCPOPT_TIMESTAMP << 8) | 852 TCPOLEN_TIMESTAMP); 853 rep.opt[1] = htonl(tsval); 854 rep.opt[2] = htonl(tsecr); 855 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED; 856 } 857 858 /* Swap the send and the receive. 
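 *
 * Option layout sketch (illustrative): when a timestamp echo is present,
 * rep.opt[0..2] above hold
 *
 *	NOP, NOP, TCPOPT_TIMESTAMP, TCPOLEN_TIMESTAMP (10), tsval, tsecr
 *
 * i.e. 12 aligned bytes, and a configured MD5 signature option starts at
 * word 3; without a timestamp it starts at word 0.  rep.th.doff is
 * bumped in step so the peer sees a consistent header length.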
*/ 859 rep.th.dest = th->source; 860 rep.th.source = th->dest; 861 rep.th.doff = arg.iov[0].iov_len / 4; 862 rep.th.seq = htonl(seq); 863 rep.th.ack_seq = htonl(ack); 864 rep.th.ack = 1; 865 rep.th.window = htons(win); 866 867 #ifdef CONFIG_TCP_MD5SIG 868 if (key) { 869 int offset = (tsecr) ? 3 : 0; 870 871 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) | 872 (TCPOPT_NOP << 16) | 873 (TCPOPT_MD5SIG << 8) | 874 TCPOLEN_MD5SIG); 875 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED; 876 rep.th.doff = arg.iov[0].iov_len/4; 877 878 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset], 879 key, ip_hdr(skb)->saddr, 880 ip_hdr(skb)->daddr, &rep.th); 881 } 882 #endif 883 arg.flags = reply_flags; 884 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr, 885 ip_hdr(skb)->saddr, /* XXX */ 886 arg.iov[0].iov_len, IPPROTO_TCP, 0); 887 arg.csumoffset = offsetof(struct tcphdr, check) / 2; 888 if (oif) 889 arg.bound_dev_if = oif; 890 arg.tos = tos; 891 arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL); 892 local_bh_disable(); 893 ctl_sk = this_cpu_read(*net->ipv4.tcp_sk); 894 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? 895 inet_twsk(sk)->tw_mark : sk->sk_mark; 896 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ? 897 inet_twsk(sk)->tw_priority : sk->sk_priority; 898 transmit_time = tcp_transmit_time(sk); 899 ip_send_unicast_reply(ctl_sk, 900 skb, &TCP_SKB_CB(skb)->header.h4.opt, 901 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, 902 &arg, arg.iov[0].iov_len, 903 transmit_time); 904 905 ctl_sk->sk_mark = 0; 906 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS); 907 local_bh_enable(); 908 } 909 910 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) 911 { 912 struct inet_timewait_sock *tw = inet_twsk(sk); 913 struct tcp_timewait_sock *tcptw = tcp_twsk(sk); 914 915 tcp_v4_send_ack(sk, skb, 916 tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, 917 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, 918 tcp_time_stamp_raw() + tcptw->tw_ts_offset, 919 tcptw->tw_ts_recent, 920 tw->tw_bound_dev_if, 921 tcp_twsk_md5_key(tcptw), 922 tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0, 923 tw->tw_tos 924 ); 925 926 inet_twsk_put(tw); 927 } 928 929 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, 930 struct request_sock *req) 931 { 932 const union tcp_md5_addr *addr; 933 int l3index; 934 935 /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV 936 * sk->sk_state == TCP_SYN_RECV -> for Fast Open. 937 */ 938 u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 : 939 tcp_sk(sk)->snd_nxt; 940 941 /* RFC 7323 2.3 942 * The window field (SEG.WND) of every outgoing segment, with the 943 * exception of <SYN> segments, MUST be right-shifted by 944 * Rcv.Wind.Shift bits: 945 */ 946 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; 947 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0; 948 tcp_v4_send_ack(sk, skb, seq, 949 tcp_rsk(req)->rcv_nxt, 950 req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale, 951 tcp_time_stamp_raw() + tcp_rsk(req)->ts_off, 952 req->ts_recent, 953 0, 954 tcp_md5_do_lookup(sk, l3index, addr, AF_INET), 955 inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0, 956 ip_hdr(skb)->tos); 957 } 958 959 /* 960 * Send a SYN-ACK after having received a SYN. 961 * This still operates on a request_sock only, not on a big 962 * socket. 
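 *
 * Sketch of the TOS selection done below (illustrative values): with
 * sysctl_tcp_reflect_tos enabled and an incoming SYN whose TOS byte is
 * 0x2e (ECN field ECT(0)), the SYN-ACK uses
 *
 *	tos = syn_tos & ~INET_ECN_MASK = 0x2c
 *
 * i.e. the DSCP is mirrored while the ECN bits are cleared; ECT(0) may
 * then be re-applied if a BPF congestion control requests ECN.  With the
 * sysctl off, the listener's own inet_sk(sk)->tos is used instead.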
963 */ 964 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, 965 struct flowi *fl, 966 struct request_sock *req, 967 struct tcp_fastopen_cookie *foc, 968 enum tcp_synack_type synack_type, 969 struct sk_buff *syn_skb) 970 { 971 const struct inet_request_sock *ireq = inet_rsk(req); 972 struct flowi4 fl4; 973 int err = -1; 974 struct sk_buff *skb; 975 u8 tos; 976 977 /* First, grab a route. */ 978 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL) 979 return -1; 980 981 skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb); 982 983 if (skb) { 984 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr); 985 986 tos = sock_net(sk)->ipv4.sysctl_tcp_reflect_tos ? 987 tcp_rsk(req)->syn_tos & ~INET_ECN_MASK : 988 inet_sk(sk)->tos; 989 990 if (!INET_ECN_is_capable(tos) && 991 tcp_bpf_ca_needs_ecn((struct sock *)req)) 992 tos |= INET_ECN_ECT_0; 993 994 rcu_read_lock(); 995 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr, 996 ireq->ir_rmt_addr, 997 rcu_dereference(ireq->ireq_opt), 998 tos); 999 rcu_read_unlock(); 1000 err = net_xmit_eval(err); 1001 } 1002 1003 return err; 1004 } 1005 1006 /* 1007 * IPv4 request_sock destructor. 1008 */ 1009 static void tcp_v4_reqsk_destructor(struct request_sock *req) 1010 { 1011 kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1)); 1012 } 1013 1014 #ifdef CONFIG_TCP_MD5SIG 1015 /* 1016 * RFC2385 MD5 checksumming requires a mapping of 1017 * IP address->MD5 Key. 1018 * We need to maintain these in the sk structure. 1019 */ 1020 1021 DEFINE_STATIC_KEY_FALSE(tcp_md5_needed); 1022 EXPORT_SYMBOL(tcp_md5_needed); 1023 1024 /* Find the Key structure for an address. */ 1025 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index, 1026 const union tcp_md5_addr *addr, 1027 int family) 1028 { 1029 const struct tcp_sock *tp = tcp_sk(sk); 1030 struct tcp_md5sig_key *key; 1031 const struct tcp_md5sig_info *md5sig; 1032 __be32 mask; 1033 struct tcp_md5sig_key *best_match = NULL; 1034 bool match; 1035 1036 /* caller either holds rcu_read_lock() or socket lock */ 1037 md5sig = rcu_dereference_check(tp->md5sig_info, 1038 lockdep_sock_is_held(sk)); 1039 if (!md5sig) 1040 return NULL; 1041 1042 hlist_for_each_entry_rcu(key, &md5sig->head, node, 1043 lockdep_sock_is_held(sk)) { 1044 if (key->family != family) 1045 continue; 1046 if (key->l3index && key->l3index != l3index) 1047 continue; 1048 if (family == AF_INET) { 1049 mask = inet_make_mask(key->prefixlen); 1050 match = (key->addr.a4.s_addr & mask) == 1051 (addr->a4.s_addr & mask); 1052 #if IS_ENABLED(CONFIG_IPV6) 1053 } else if (family == AF_INET6) { 1054 match = ipv6_prefix_equal(&key->addr.a6, &addr->a6, 1055 key->prefixlen); 1056 #endif 1057 } else { 1058 match = false; 1059 } 1060 1061 if (match && (!best_match || 1062 key->prefixlen > best_match->prefixlen)) 1063 best_match = key; 1064 } 1065 return best_match; 1066 } 1067 EXPORT_SYMBOL(__tcp_md5_do_lookup); 1068 1069 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk, 1070 const union tcp_md5_addr *addr, 1071 int family, u8 prefixlen, 1072 int l3index) 1073 { 1074 const struct tcp_sock *tp = tcp_sk(sk); 1075 struct tcp_md5sig_key *key; 1076 unsigned int size = sizeof(struct in_addr); 1077 const struct tcp_md5sig_info *md5sig; 1078 1079 /* caller either holds rcu_read_lock() or socket lock */ 1080 md5sig = rcu_dereference_check(tp->md5sig_info, 1081 lockdep_sock_is_held(sk)); 1082 if (!md5sig) 1083 return NULL; 1084 #if IS_ENABLED(CONFIG_IPV6) 1085 if (family == 
AF_INET6) 1086 size = sizeof(struct in6_addr); 1087 #endif 1088 hlist_for_each_entry_rcu(key, &md5sig->head, node, 1089 lockdep_sock_is_held(sk)) { 1090 if (key->family != family) 1091 continue; 1092 if (key->l3index && key->l3index != l3index) 1093 continue; 1094 if (!memcmp(&key->addr, addr, size) && 1095 key->prefixlen == prefixlen) 1096 return key; 1097 } 1098 return NULL; 1099 } 1100 1101 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk, 1102 const struct sock *addr_sk) 1103 { 1104 const union tcp_md5_addr *addr; 1105 int l3index; 1106 1107 l3index = l3mdev_master_ifindex_by_index(sock_net(sk), 1108 addr_sk->sk_bound_dev_if); 1109 addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr; 1110 return tcp_md5_do_lookup(sk, l3index, addr, AF_INET); 1111 } 1112 EXPORT_SYMBOL(tcp_v4_md5_lookup); 1113 1114 /* This can be called on a newly created socket, from other files */ 1115 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, 1116 int family, u8 prefixlen, int l3index, 1117 const u8 *newkey, u8 newkeylen, gfp_t gfp) 1118 { 1119 /* Add Key to the list */ 1120 struct tcp_md5sig_key *key; 1121 struct tcp_sock *tp = tcp_sk(sk); 1122 struct tcp_md5sig_info *md5sig; 1123 1124 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index); 1125 if (key) { 1126 /* Pre-existing entry - just update that one. 1127 * Note that the key might be used concurrently. 1128 * data_race() is telling kcsan that we do not care of 1129 * key mismatches, since changing MD5 key on live flows 1130 * can lead to packet drops. 1131 */ 1132 data_race(memcpy(key->key, newkey, newkeylen)); 1133 1134 /* Pairs with READ_ONCE() in tcp_md5_hash_key(). 1135 * Also note that a reader could catch new key->keylen value 1136 * but old key->key[], this is the reason we use __GFP_ZERO 1137 * at sock_kmalloc() time below these lines. 1138 */ 1139 WRITE_ONCE(key->keylen, newkeylen); 1140 1141 return 0; 1142 } 1143 1144 md5sig = rcu_dereference_protected(tp->md5sig_info, 1145 lockdep_sock_is_held(sk)); 1146 if (!md5sig) { 1147 md5sig = kmalloc(sizeof(*md5sig), gfp); 1148 if (!md5sig) 1149 return -ENOMEM; 1150 1151 sk_nocaps_add(sk, NETIF_F_GSO_MASK); 1152 INIT_HLIST_HEAD(&md5sig->head); 1153 rcu_assign_pointer(tp->md5sig_info, md5sig); 1154 } 1155 1156 key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO); 1157 if (!key) 1158 return -ENOMEM; 1159 if (!tcp_alloc_md5sig_pool()) { 1160 sock_kfree_s(sk, key, sizeof(*key)); 1161 return -ENOMEM; 1162 } 1163 1164 memcpy(key->key, newkey, newkeylen); 1165 key->keylen = newkeylen; 1166 key->family = family; 1167 key->prefixlen = prefixlen; 1168 key->l3index = l3index; 1169 memcpy(&key->addr, addr, 1170 (family == AF_INET6) ? 
sizeof(struct in6_addr) : 1171 sizeof(struct in_addr)); 1172 hlist_add_head_rcu(&key->node, &md5sig->head); 1173 return 0; 1174 } 1175 EXPORT_SYMBOL(tcp_md5_do_add); 1176 1177 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family, 1178 u8 prefixlen, int l3index) 1179 { 1180 struct tcp_md5sig_key *key; 1181 1182 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index); 1183 if (!key) 1184 return -ENOENT; 1185 hlist_del_rcu(&key->node); 1186 atomic_sub(sizeof(*key), &sk->sk_omem_alloc); 1187 kfree_rcu(key, rcu); 1188 return 0; 1189 } 1190 EXPORT_SYMBOL(tcp_md5_do_del); 1191 1192 static void tcp_clear_md5_list(struct sock *sk) 1193 { 1194 struct tcp_sock *tp = tcp_sk(sk); 1195 struct tcp_md5sig_key *key; 1196 struct hlist_node *n; 1197 struct tcp_md5sig_info *md5sig; 1198 1199 md5sig = rcu_dereference_protected(tp->md5sig_info, 1); 1200 1201 hlist_for_each_entry_safe(key, n, &md5sig->head, node) { 1202 hlist_del_rcu(&key->node); 1203 atomic_sub(sizeof(*key), &sk->sk_omem_alloc); 1204 kfree_rcu(key, rcu); 1205 } 1206 } 1207 1208 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname, 1209 sockptr_t optval, int optlen) 1210 { 1211 struct tcp_md5sig cmd; 1212 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr; 1213 const union tcp_md5_addr *addr; 1214 u8 prefixlen = 32; 1215 int l3index = 0; 1216 1217 if (optlen < sizeof(cmd)) 1218 return -EINVAL; 1219 1220 if (copy_from_sockptr(&cmd, optval, sizeof(cmd))) 1221 return -EFAULT; 1222 1223 if (sin->sin_family != AF_INET) 1224 return -EINVAL; 1225 1226 if (optname == TCP_MD5SIG_EXT && 1227 cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) { 1228 prefixlen = cmd.tcpm_prefixlen; 1229 if (prefixlen > 32) 1230 return -EINVAL; 1231 } 1232 1233 if (optname == TCP_MD5SIG_EXT && 1234 cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) { 1235 struct net_device *dev; 1236 1237 rcu_read_lock(); 1238 dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex); 1239 if (dev && netif_is_l3_master(dev)) 1240 l3index = dev->ifindex; 1241 1242 rcu_read_unlock(); 1243 1244 /* ok to reference set/not set outside of rcu; 1245 * right now device MUST be an L3 master 1246 */ 1247 if (!dev || !l3index) 1248 return -EINVAL; 1249 } 1250 1251 addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr; 1252 1253 if (!cmd.tcpm_keylen) 1254 return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index); 1255 1256 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN) 1257 return -EINVAL; 1258 1259 return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, 1260 cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL); 1261 } 1262 1263 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp, 1264 __be32 daddr, __be32 saddr, 1265 const struct tcphdr *th, int nbytes) 1266 { 1267 struct tcp4_pseudohdr *bp; 1268 struct scatterlist sg; 1269 struct tcphdr *_th; 1270 1271 bp = hp->scratch; 1272 bp->saddr = saddr; 1273 bp->daddr = daddr; 1274 bp->pad = 0; 1275 bp->protocol = IPPROTO_TCP; 1276 bp->len = cpu_to_be16(nbytes); 1277 1278 _th = (struct tcphdr *)(bp + 1); 1279 memcpy(_th, th, sizeof(*th)); 1280 _th->check = 0; 1281 1282 sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th)); 1283 ahash_request_set_crypt(hp->md5_req, &sg, NULL, 1284 sizeof(*bp) + sizeof(*th)); 1285 return crypto_ahash_update(hp->md5_req); 1286 } 1287 1288 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, 1289 __be32 daddr, __be32 saddr, const struct tcphdr *th) 1290 { 1291 struct tcp_md5sig_pool *hp; 1292 struct ahash_request *req; 1293 1294 hp = tcp_get_md5sig_pool(); 1295 if 
(!hp) 1296 goto clear_hash_noput; 1297 req = hp->md5_req; 1298 1299 if (crypto_ahash_init(req)) 1300 goto clear_hash; 1301 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2)) 1302 goto clear_hash; 1303 if (tcp_md5_hash_key(hp, key)) 1304 goto clear_hash; 1305 ahash_request_set_crypt(req, NULL, md5_hash, 0); 1306 if (crypto_ahash_final(req)) 1307 goto clear_hash; 1308 1309 tcp_put_md5sig_pool(); 1310 return 0; 1311 1312 clear_hash: 1313 tcp_put_md5sig_pool(); 1314 clear_hash_noput: 1315 memset(md5_hash, 0, 16); 1316 return 1; 1317 } 1318 1319 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key, 1320 const struct sock *sk, 1321 const struct sk_buff *skb) 1322 { 1323 struct tcp_md5sig_pool *hp; 1324 struct ahash_request *req; 1325 const struct tcphdr *th = tcp_hdr(skb); 1326 __be32 saddr, daddr; 1327 1328 if (sk) { /* valid for establish/request sockets */ 1329 saddr = sk->sk_rcv_saddr; 1330 daddr = sk->sk_daddr; 1331 } else { 1332 const struct iphdr *iph = ip_hdr(skb); 1333 saddr = iph->saddr; 1334 daddr = iph->daddr; 1335 } 1336 1337 hp = tcp_get_md5sig_pool(); 1338 if (!hp) 1339 goto clear_hash_noput; 1340 req = hp->md5_req; 1341 1342 if (crypto_ahash_init(req)) 1343 goto clear_hash; 1344 1345 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len)) 1346 goto clear_hash; 1347 if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2)) 1348 goto clear_hash; 1349 if (tcp_md5_hash_key(hp, key)) 1350 goto clear_hash; 1351 ahash_request_set_crypt(req, NULL, md5_hash, 0); 1352 if (crypto_ahash_final(req)) 1353 goto clear_hash; 1354 1355 tcp_put_md5sig_pool(); 1356 return 0; 1357 1358 clear_hash: 1359 tcp_put_md5sig_pool(); 1360 clear_hash_noput: 1361 memset(md5_hash, 0, 16); 1362 return 1; 1363 } 1364 EXPORT_SYMBOL(tcp_v4_md5_hash_skb); 1365 1366 #endif 1367 1368 /* Called with rcu_read_lock() */ 1369 static bool tcp_v4_inbound_md5_hash(const struct sock *sk, 1370 const struct sk_buff *skb, 1371 int dif, int sdif) 1372 { 1373 #ifdef CONFIG_TCP_MD5SIG 1374 /* 1375 * This gets called for each TCP segment that arrives 1376 * so we want to be efficient. 1377 * We have 3 drop cases: 1378 * o No MD5 hash and one expected. 1379 * o MD5 hash and we're not expecting one. 1380 * o MD5 hash and its wrong. 1381 */ 1382 const __u8 *hash_location = NULL; 1383 struct tcp_md5sig_key *hash_expected; 1384 const struct iphdr *iph = ip_hdr(skb); 1385 const struct tcphdr *th = tcp_hdr(skb); 1386 const union tcp_md5_addr *addr; 1387 unsigned char newhash[16]; 1388 int genhash, l3index; 1389 1390 /* sdif set, means packet ingressed via a device 1391 * in an L3 domain and dif is set to the l3mdev 1392 */ 1393 l3index = sdif ? dif : 0; 1394 1395 addr = (union tcp_md5_addr *)&iph->saddr; 1396 hash_expected = tcp_md5_do_lookup(sk, l3index, addr, AF_INET); 1397 hash_location = tcp_parse_md5sig_option(th); 1398 1399 /* We've parsed the options - do we have a hash? */ 1400 if (!hash_expected && !hash_location) 1401 return false; 1402 1403 if (hash_expected && !hash_location) { 1404 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND); 1405 return true; 1406 } 1407 1408 if (!hash_expected && hash_location) { 1409 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED); 1410 return true; 1411 } 1412 1413 /* Okay, so this is hash_expected and hash_location - 1414 * so we need to calculate the checksum. 
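 *
 * Decision table for this helper (mirrors the checks above and below):
 *
 *	expected key	MD5 option	result
 *	------------	----------	-------------------------------
 *	no		no		accept, nothing to verify
 *	yes		no		drop, LINUX_MIB_TCPMD5NOTFOUND
 *	no		yes		drop, LINUX_MIB_TCPMD5UNEXPECTED
 *	yes		yes		recompute digest over pseudo-
 *					header, TCP header and payload;
 *					drop on mismatch with
 *					LINUX_MIB_TCPMD5FAILURE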
1415 */ 1416 genhash = tcp_v4_md5_hash_skb(newhash, 1417 hash_expected, 1418 NULL, skb); 1419 1420 if (genhash || memcmp(hash_location, newhash, 16) != 0) { 1421 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE); 1422 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s L3 index %d\n", 1423 &iph->saddr, ntohs(th->source), 1424 &iph->daddr, ntohs(th->dest), 1425 genhash ? " tcp_v4_calc_md5_hash failed" 1426 : "", l3index); 1427 return true; 1428 } 1429 return false; 1430 #endif 1431 return false; 1432 } 1433 1434 static void tcp_v4_init_req(struct request_sock *req, 1435 const struct sock *sk_listener, 1436 struct sk_buff *skb) 1437 { 1438 struct inet_request_sock *ireq = inet_rsk(req); 1439 struct net *net = sock_net(sk_listener); 1440 1441 sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr); 1442 sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr); 1443 RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb)); 1444 } 1445 1446 static struct dst_entry *tcp_v4_route_req(const struct sock *sk, 1447 struct flowi *fl, 1448 const struct request_sock *req) 1449 { 1450 return inet_csk_route_req(sk, &fl->u.ip4, req); 1451 } 1452 1453 struct request_sock_ops tcp_request_sock_ops __read_mostly = { 1454 .family = PF_INET, 1455 .obj_size = sizeof(struct tcp_request_sock), 1456 .rtx_syn_ack = tcp_rtx_synack, 1457 .send_ack = tcp_v4_reqsk_send_ack, 1458 .destructor = tcp_v4_reqsk_destructor, 1459 .send_reset = tcp_v4_send_reset, 1460 .syn_ack_timeout = tcp_syn_ack_timeout, 1461 }; 1462 1463 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { 1464 .mss_clamp = TCP_MSS_DEFAULT, 1465 #ifdef CONFIG_TCP_MD5SIG 1466 .req_md5_lookup = tcp_v4_md5_lookup, 1467 .calc_md5_hash = tcp_v4_md5_hash_skb, 1468 #endif 1469 .init_req = tcp_v4_init_req, 1470 #ifdef CONFIG_SYN_COOKIES 1471 .cookie_init_seq = cookie_v4_init_sequence, 1472 #endif 1473 .route_req = tcp_v4_route_req, 1474 .init_seq = tcp_v4_init_seq, 1475 .init_ts_off = tcp_v4_init_ts_off, 1476 .send_synack = tcp_v4_send_synack, 1477 }; 1478 1479 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) 1480 { 1481 /* Never answer to SYNs send to broadcast or multicast */ 1482 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) 1483 goto drop; 1484 1485 return tcp_conn_request(&tcp_request_sock_ops, 1486 &tcp_request_sock_ipv4_ops, sk, skb); 1487 1488 drop: 1489 tcp_listendrop(sk); 1490 return 0; 1491 } 1492 EXPORT_SYMBOL(tcp_v4_conn_request); 1493 1494 1495 /* 1496 * The three way handshake has completed - we got a valid synack - 1497 * now create the new socket. 
 */
struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct dst_entry *dst,
				  struct request_sock *req_unhash,
				  bool *own_req)
{
	struct inet_request_sock *ireq;
	bool found_dup_sk = false;
	struct inet_sock *newinet;
	struct tcp_sock *newtp;
	struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
	const union tcp_md5_addr *addr;
	struct tcp_md5sig_key *key;
	int l3index;
#endif
	struct ip_options_rcu *inet_opt;

	if (sk_acceptq_is_full(sk))
		goto exit_overflow;

	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit_nonewsk;

	newsk->sk_gso_type = SKB_GSO_TCPV4;
	inet_sk_rx_dst_set(newsk, skb);

	newtp = tcp_sk(newsk);
	newinet = inet_sk(newsk);
	ireq = inet_rsk(req);
	sk_daddr_set(newsk, ireq->ir_rmt_addr);
	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
	newsk->sk_bound_dev_if = ireq->ir_iif;
	newinet->inet_saddr = ireq->ir_loc_addr;
	inet_opt = rcu_dereference(ireq->ireq_opt);
	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
	newinet->mc_index = inet_iif(skb);
	newinet->mc_ttl = ip_hdr(skb)->ttl;
	newinet->rcv_tos = ip_hdr(skb)->tos;
	inet_csk(newsk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
	newinet->inet_id = prandom_u32();

	/* Set ToS of the new socket based upon the value of incoming SYN. */
	if (sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)
		newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;

	if (!dst) {
		dst = inet_csk_route_child_sock(sk, newsk, req);
		if (!dst)
			goto put_and_exit;
	} else {
		/* syncookie case : see end of cookie_v4_check() */
	}
	sk_setup_caps(newsk, dst);

	tcp_ca_openreq_child(newsk, dst);

	tcp_sync_mss(newsk, dst_mtu(dst));
	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));

	tcp_initialize_rcv_mss(newsk);

#ifdef CONFIG_TCP_MD5SIG
	l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
	/* Copy over the MD5 key from the original socket */
	addr = (union tcp_md5_addr *)&newinet->inet_daddr;
	key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
	if (key) {
		/*
		 * We're using one, so create a matching key
		 * on the newsk structure. If we fail to get
		 * memory, then we end up not copying the key
		 * across. Shucks.
1575 */ 1576 tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index, 1577 key->key, key->keylen, GFP_ATOMIC); 1578 sk_nocaps_add(newsk, NETIF_F_GSO_MASK); 1579 } 1580 #endif 1581 1582 if (__inet_inherit_port(sk, newsk) < 0) 1583 goto put_and_exit; 1584 *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash), 1585 &found_dup_sk); 1586 if (likely(*own_req)) { 1587 tcp_move_syn(newtp, req); 1588 ireq->ireq_opt = NULL; 1589 } else { 1590 if (!req_unhash && found_dup_sk) { 1591 /* This code path should only be executed in the 1592 * syncookie case only 1593 */ 1594 bh_unlock_sock(newsk); 1595 sock_put(newsk); 1596 newsk = NULL; 1597 } else { 1598 newinet->inet_opt = NULL; 1599 } 1600 } 1601 return newsk; 1602 1603 exit_overflow: 1604 NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); 1605 exit_nonewsk: 1606 dst_release(dst); 1607 exit: 1608 tcp_listendrop(sk); 1609 return NULL; 1610 put_and_exit: 1611 newinet->inet_opt = NULL; 1612 inet_csk_prepare_forced_close(newsk); 1613 tcp_done(newsk); 1614 goto exit; 1615 } 1616 EXPORT_SYMBOL(tcp_v4_syn_recv_sock); 1617 1618 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb) 1619 { 1620 #ifdef CONFIG_SYN_COOKIES 1621 const struct tcphdr *th = tcp_hdr(skb); 1622 1623 if (!th->syn) 1624 sk = cookie_v4_check(sk, skb); 1625 #endif 1626 return sk; 1627 } 1628 1629 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph, 1630 struct tcphdr *th, u32 *cookie) 1631 { 1632 u16 mss = 0; 1633 #ifdef CONFIG_SYN_COOKIES 1634 mss = tcp_get_syncookie_mss(&tcp_request_sock_ops, 1635 &tcp_request_sock_ipv4_ops, sk, th); 1636 if (mss) { 1637 *cookie = __cookie_v4_init_sequence(iph, th, &mss); 1638 tcp_synq_overflow(sk); 1639 } 1640 #endif 1641 return mss; 1642 } 1643 1644 /* The socket must have it's spinlock held when we get 1645 * here, unless it is a TCP_LISTEN socket. 1646 * 1647 * We have a potential double-lock case here, so even when 1648 * doing backlog processing we use the BH locking scheme. 1649 * This is because we cannot sleep with the original spinlock 1650 * held. 1651 */ 1652 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) 1653 { 1654 struct sock *rsk; 1655 1656 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ 1657 struct dst_entry *dst = sk->sk_rx_dst; 1658 1659 sock_rps_save_rxhash(sk, skb); 1660 sk_mark_napi_id(sk, skb); 1661 if (dst) { 1662 if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif || 1663 !dst->ops->check(dst, 0)) { 1664 dst_release(dst); 1665 sk->sk_rx_dst = NULL; 1666 } 1667 } 1668 tcp_rcv_established(sk, skb); 1669 return 0; 1670 } 1671 1672 if (tcp_checksum_complete(skb)) 1673 goto csum_err; 1674 1675 if (sk->sk_state == TCP_LISTEN) { 1676 struct sock *nsk = tcp_v4_cookie_check(sk, skb); 1677 1678 if (!nsk) 1679 goto discard; 1680 if (nsk != sk) { 1681 if (tcp_child_process(sk, nsk, skb)) { 1682 rsk = nsk; 1683 goto reset; 1684 } 1685 return 0; 1686 } 1687 } else 1688 sock_rps_save_rxhash(sk, skb); 1689 1690 if (tcp_rcv_state_process(sk, skb)) { 1691 rsk = sk; 1692 goto reset; 1693 } 1694 return 0; 1695 1696 reset: 1697 tcp_v4_send_reset(rsk, skb); 1698 discard: 1699 kfree_skb(skb); 1700 /* Be careful here. If this function gets more complicated and 1701 * gcc suffers from register pressure on the x86, sk (in %ebx) 1702 * might be destroyed here. This current version compiles correctly, 1703 * but you have been warned. 
1704 */ 1705 return 0; 1706 1707 csum_err: 1708 TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); 1709 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); 1710 goto discard; 1711 } 1712 EXPORT_SYMBOL(tcp_v4_do_rcv); 1713 1714 int tcp_v4_early_demux(struct sk_buff *skb) 1715 { 1716 const struct iphdr *iph; 1717 const struct tcphdr *th; 1718 struct sock *sk; 1719 1720 if (skb->pkt_type != PACKET_HOST) 1721 return 0; 1722 1723 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr))) 1724 return 0; 1725 1726 iph = ip_hdr(skb); 1727 th = tcp_hdr(skb); 1728 1729 if (th->doff < sizeof(struct tcphdr) / 4) 1730 return 0; 1731 1732 sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo, 1733 iph->saddr, th->source, 1734 iph->daddr, ntohs(th->dest), 1735 skb->skb_iif, inet_sdif(skb)); 1736 if (sk) { 1737 skb->sk = sk; 1738 skb->destructor = sock_edemux; 1739 if (sk_fullsock(sk)) { 1740 struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst); 1741 1742 if (dst) 1743 dst = dst_check(dst, 0); 1744 if (dst && 1745 inet_sk(sk)->rx_dst_ifindex == skb->skb_iif) 1746 skb_dst_set_noref(skb, dst); 1747 } 1748 } 1749 return 0; 1750 } 1751 1752 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb) 1753 { 1754 u32 limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf); 1755 struct skb_shared_info *shinfo; 1756 const struct tcphdr *th; 1757 struct tcphdr *thtail; 1758 struct sk_buff *tail; 1759 unsigned int hdrlen; 1760 bool fragstolen; 1761 u32 gso_segs; 1762 int delta; 1763 1764 /* In case all data was pulled from skb frags (in __pskb_pull_tail()), 1765 * we can fix skb->truesize to its real value to avoid future drops. 1766 * This is valid because skb is not yet charged to the socket. 1767 * It has been noticed pure SACK packets were sometimes dropped 1768 * (if cooked by drivers without copybreak feature). 1769 */ 1770 skb_condense(skb); 1771 1772 skb_dst_drop(skb); 1773 1774 if (unlikely(tcp_checksum_complete(skb))) { 1775 bh_unlock_sock(sk); 1776 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); 1777 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); 1778 return true; 1779 } 1780 1781 /* Attempt coalescing to last skb in backlog, even if we are 1782 * above the limits. 1783 * This is okay because skb capacity is limited to MAX_SKB_FRAGS. 
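 *
 * Worked example (illustrative numbers): if the backlog tail covers
 * sequence 1000..2448 and the new segment covers 2448..3896 with the
 * same TCP options, the same DSCP/ECN byte and compatible flags, the two
 * are merged into a single skb covering 1000..3896; the tail then
 * advertises the newer ack_seq and window, keeps the larger gso_size and
 * the summed gso_segs, so the socket owner later processes one aggregate
 * instead of two segments.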
1784 */ 1785 th = (const struct tcphdr *)skb->data; 1786 hdrlen = th->doff * 4; 1787 shinfo = skb_shinfo(skb); 1788 1789 if (!shinfo->gso_size) 1790 shinfo->gso_size = skb->len - hdrlen; 1791 1792 if (!shinfo->gso_segs) 1793 shinfo->gso_segs = 1; 1794 1795 tail = sk->sk_backlog.tail; 1796 if (!tail) 1797 goto no_coalesce; 1798 thtail = (struct tcphdr *)tail->data; 1799 1800 if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq || 1801 TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield || 1802 ((TCP_SKB_CB(tail)->tcp_flags | 1803 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) || 1804 !((TCP_SKB_CB(tail)->tcp_flags & 1805 TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) || 1806 ((TCP_SKB_CB(tail)->tcp_flags ^ 1807 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) || 1808 #ifdef CONFIG_TLS_DEVICE 1809 tail->decrypted != skb->decrypted || 1810 #endif 1811 thtail->doff != th->doff || 1812 memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th))) 1813 goto no_coalesce; 1814 1815 __skb_pull(skb, hdrlen); 1816 if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) { 1817 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq; 1818 1819 if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) { 1820 TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq; 1821 thtail->window = th->window; 1822 } 1823 1824 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and 1825 * thtail->fin, so that the fast path in tcp_rcv_established() 1826 * is not entered if we append a packet with a FIN. 1827 * SYN, RST, URG are not present. 1828 * ACK is set on both packets. 1829 * PSH : we do not really care in TCP stack, 1830 * at least for 'GRO' packets. 1831 */ 1832 thtail->fin |= th->fin; 1833 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags; 1834 1835 if (TCP_SKB_CB(skb)->has_rxtstamp) { 1836 TCP_SKB_CB(tail)->has_rxtstamp = true; 1837 tail->tstamp = skb->tstamp; 1838 skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp; 1839 } 1840 1841 /* Not as strict as GRO. We only need to carry mss max value */ 1842 skb_shinfo(tail)->gso_size = max(shinfo->gso_size, 1843 skb_shinfo(tail)->gso_size); 1844 1845 gso_segs = skb_shinfo(tail)->gso_segs + shinfo->gso_segs; 1846 skb_shinfo(tail)->gso_segs = min_t(u32, gso_segs, 0xFFFF); 1847 1848 sk->sk_backlog.len += delta; 1849 __NET_INC_STATS(sock_net(sk), 1850 LINUX_MIB_TCPBACKLOGCOALESCE); 1851 kfree_skb_partial(skb, fragstolen); 1852 return false; 1853 } 1854 __skb_push(skb, hdrlen); 1855 1856 no_coalesce: 1857 /* Only socket owner can try to collapse/prune rx queues 1858 * to reduce memory overhead, so add a little headroom here. 1859 * Few sockets backlog are possibly concurrently non empty. 
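 *
 * Worked example (defaults vary between setups, numbers illustrative
 * only): with sk_rcvbuf around 128 KB and sk_sndbuf around 16 KB the
 * backlog may grow to roughly 128K + 16K + 64K of truesize before
 * sk_add_backlog() starts refusing skbs and LINUX_MIB_TCPBACKLOGDROP is
 * incremented below.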
1860 */ 1861 limit += 64*1024; 1862 1863 if (unlikely(sk_add_backlog(sk, skb, limit))) { 1864 bh_unlock_sock(sk); 1865 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP); 1866 return true; 1867 } 1868 return false; 1869 } 1870 EXPORT_SYMBOL(tcp_add_backlog); 1871 1872 int tcp_filter(struct sock *sk, struct sk_buff *skb) 1873 { 1874 struct tcphdr *th = (struct tcphdr *)skb->data; 1875 1876 return sk_filter_trim_cap(sk, skb, th->doff * 4); 1877 } 1878 EXPORT_SYMBOL(tcp_filter); 1879 1880 static void tcp_v4_restore_cb(struct sk_buff *skb) 1881 { 1882 memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4, 1883 sizeof(struct inet_skb_parm)); 1884 } 1885 1886 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph, 1887 const struct tcphdr *th) 1888 { 1889 /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB() 1890 * barrier() makes sure compiler wont play fool^Waliasing games. 1891 */ 1892 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb), 1893 sizeof(struct inet_skb_parm)); 1894 barrier(); 1895 1896 TCP_SKB_CB(skb)->seq = ntohl(th->seq); 1897 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + 1898 skb->len - th->doff * 4); 1899 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); 1900 TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th); 1901 TCP_SKB_CB(skb)->tcp_tw_isn = 0; 1902 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph); 1903 TCP_SKB_CB(skb)->sacked = 0; 1904 TCP_SKB_CB(skb)->has_rxtstamp = 1905 skb->tstamp || skb_hwtstamps(skb)->hwtstamp; 1906 } 1907 1908 /* 1909 * From tcp_input.c 1910 */ 1911 1912 int tcp_v4_rcv(struct sk_buff *skb) 1913 { 1914 struct net *net = dev_net(skb->dev); 1915 struct sk_buff *skb_to_free; 1916 int sdif = inet_sdif(skb); 1917 int dif = inet_iif(skb); 1918 const struct iphdr *iph; 1919 const struct tcphdr *th; 1920 bool refcounted; 1921 struct sock *sk; 1922 int ret; 1923 1924 if (skb->pkt_type != PACKET_HOST) 1925 goto discard_it; 1926 1927 /* Count it even if it's bad */ 1928 __TCP_INC_STATS(net, TCP_MIB_INSEGS); 1929 1930 if (!pskb_may_pull(skb, sizeof(struct tcphdr))) 1931 goto discard_it; 1932 1933 th = (const struct tcphdr *)skb->data; 1934 1935 if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) 1936 goto bad_packet; 1937 if (!pskb_may_pull(skb, th->doff * 4)) 1938 goto discard_it; 1939 1940 /* An explanation is required here, I think. 1941 * Packet length and doff are validated by header prediction, 1942 * provided case of th->doff==0 is eliminated. 1943 * So, we defer the checks. 
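 *
 * Quick illustration of the doff check above: th->doff counts 32-bit
 * words, so the smallest legal value is 5 (a bare 20 byte header) and
 * the largest is 15 (20 bytes plus 40 bytes of options).  A doff of 4 or
 * less has already been counted as TCP_MIB_INERRS and dropped before the
 * checksum is looked at below.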
*/ 1944 1945 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo)) 1946 goto csum_error; 1947 1948 th = (const struct tcphdr *)skb->data; 1949 iph = ip_hdr(skb); 1950 lookup: 1951 sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source, 1952 th->dest, sdif, &refcounted); 1953 if (!sk) 1954 goto no_tcp_socket; 1955 1956 process: 1957 if (sk->sk_state == TCP_TIME_WAIT) 1958 goto do_time_wait; 1959 1960 if (sk->sk_state == TCP_NEW_SYN_RECV) { 1961 struct request_sock *req = inet_reqsk(sk); 1962 bool req_stolen = false; 1963 struct sock *nsk; 1964 1965 sk = req->rsk_listener; 1966 if (unlikely(tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))) { 1967 sk_drops_add(sk, skb); 1968 reqsk_put(req); 1969 goto discard_it; 1970 } 1971 if (tcp_checksum_complete(skb)) { 1972 reqsk_put(req); 1973 goto csum_error; 1974 } 1975 if (unlikely(sk->sk_state != TCP_LISTEN)) { 1976 inet_csk_reqsk_queue_drop_and_put(sk, req); 1977 goto lookup; 1978 } 1979 /* We own a reference on the listener, increase it again 1980 * as we might lose it too soon. 1981 */ 1982 sock_hold(sk); 1983 refcounted = true; 1984 nsk = NULL; 1985 if (!tcp_filter(sk, skb)) { 1986 th = (const struct tcphdr *)skb->data; 1987 iph = ip_hdr(skb); 1988 tcp_v4_fill_cb(skb, iph, th); 1989 nsk = tcp_check_req(sk, skb, req, false, &req_stolen); 1990 } 1991 if (!nsk) { 1992 reqsk_put(req); 1993 if (req_stolen) { 1994 /* Another cpu got exclusive access to req 1995 * and created a full blown socket. 1996 * Try to feed this packet to this socket 1997 * instead of discarding it. 1998 */ 1999 tcp_v4_restore_cb(skb); 2000 sock_put(sk); 2001 goto lookup; 2002 } 2003 goto discard_and_relse; 2004 } 2005 if (nsk == sk) { 2006 reqsk_put(req); 2007 tcp_v4_restore_cb(skb); 2008 } else if (tcp_child_process(sk, nsk, skb)) { 2009 tcp_v4_send_reset(nsk, skb); 2010 goto discard_and_relse; 2011 } else { 2012 sock_put(sk); 2013 return 0; 2014 } 2015 } 2016 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) { 2017 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); 2018 goto discard_and_relse; 2019 } 2020 2021 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) 2022 goto discard_and_relse; 2023 2024 if (tcp_v4_inbound_md5_hash(sk, skb, dif, sdif)) 2025 goto discard_and_relse; 2026 2027 nf_reset_ct(skb); 2028 2029 if (tcp_filter(sk, skb)) 2030 goto discard_and_relse; 2031 th = (const struct tcphdr *)skb->data; 2032 iph = ip_hdr(skb); 2033 tcp_v4_fill_cb(skb, iph, th); 2034 2035 skb->dev = NULL; 2036 2037 if (sk->sk_state == TCP_LISTEN) { 2038 ret = tcp_v4_do_rcv(sk, skb); 2039 goto put_and_return; 2040 } 2041 2042 sk_incoming_cpu_update(sk); 2043 2044 bh_lock_sock_nested(sk); 2045 tcp_segs_in(tcp_sk(sk), skb); 2046 ret = 0; 2047 if (!sock_owned_by_user(sk)) { 2048 skb_to_free = sk->sk_rx_skb_cache; 2049 sk->sk_rx_skb_cache = NULL; 2050 ret = tcp_v4_do_rcv(sk, skb); 2051 } else { 2052 if (tcp_add_backlog(sk, skb)) 2053 goto discard_and_relse; 2054 skb_to_free = NULL; 2055 } 2056 bh_unlock_sock(sk); 2057 if (skb_to_free) 2058 __kfree_skb(skb_to_free); 2059 2060 put_and_return: 2061 if (refcounted) 2062 sock_put(sk); 2063 2064 return ret; 2065 2066 no_tcp_socket: 2067 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) 2068 goto discard_it; 2069 2070 tcp_v4_fill_cb(skb, iph, th); 2071 2072 if (tcp_checksum_complete(skb)) { 2073 csum_error: 2074 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS); 2075 bad_packet: 2076 __TCP_INC_STATS(net, TCP_MIB_INERRS); 2077 } else { 2078 tcp_v4_send_reset(NULL, skb); 2079 } 2080 2081 discard_it: 2082 /* Discard frame. 
*/ 2083 kfree_skb(skb); 2084 return 0; 2085 2086 discard_and_relse: 2087 sk_drops_add(sk, skb); 2088 if (refcounted) 2089 sock_put(sk); 2090 goto discard_it; 2091 2092 do_time_wait: 2093 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { 2094 inet_twsk_put(inet_twsk(sk)); 2095 goto discard_it; 2096 } 2097 2098 tcp_v4_fill_cb(skb, iph, th); 2099 2100 if (tcp_checksum_complete(skb)) { 2101 inet_twsk_put(inet_twsk(sk)); 2102 goto csum_error; 2103 } 2104 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) { 2105 case TCP_TW_SYN: { 2106 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev), 2107 &tcp_hashinfo, skb, 2108 __tcp_hdrlen(th), 2109 iph->saddr, th->source, 2110 iph->daddr, th->dest, 2111 inet_iif(skb), 2112 sdif); 2113 if (sk2) { 2114 inet_twsk_deschedule_put(inet_twsk(sk)); 2115 sk = sk2; 2116 tcp_v4_restore_cb(skb); 2117 refcounted = false; 2118 goto process; 2119 } 2120 } 2121 /* to ACK */ 2122 fallthrough; 2123 case TCP_TW_ACK: 2124 tcp_v4_timewait_ack(sk, skb); 2125 break; 2126 case TCP_TW_RST: 2127 tcp_v4_send_reset(sk, skb); 2128 inet_twsk_deschedule_put(inet_twsk(sk)); 2129 goto discard_it; 2130 case TCP_TW_SUCCESS:; 2131 } 2132 goto discard_it; 2133 } 2134 2135 static struct timewait_sock_ops tcp_timewait_sock_ops = { 2136 .twsk_obj_size = sizeof(struct tcp_timewait_sock), 2137 .twsk_unique = tcp_twsk_unique, 2138 .twsk_destructor= tcp_twsk_destructor, 2139 }; 2140 2141 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) 2142 { 2143 struct dst_entry *dst = skb_dst(skb); 2144 2145 if (dst && dst_hold_safe(dst)) { 2146 sk->sk_rx_dst = dst; 2147 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif; 2148 } 2149 } 2150 EXPORT_SYMBOL(inet_sk_rx_dst_set); 2151 2152 const struct inet_connection_sock_af_ops ipv4_specific = { 2153 .queue_xmit = ip_queue_xmit, 2154 .send_check = tcp_v4_send_check, 2155 .rebuild_header = inet_sk_rebuild_header, 2156 .sk_rx_dst_set = inet_sk_rx_dst_set, 2157 .conn_request = tcp_v4_conn_request, 2158 .syn_recv_sock = tcp_v4_syn_recv_sock, 2159 .net_header_len = sizeof(struct iphdr), 2160 .setsockopt = ip_setsockopt, 2161 .getsockopt = ip_getsockopt, 2162 .addr2sockaddr = inet_csk_addr2sockaddr, 2163 .sockaddr_len = sizeof(struct sockaddr_in), 2164 .mtu_reduced = tcp_v4_mtu_reduced, 2165 }; 2166 EXPORT_SYMBOL(ipv4_specific); 2167 2168 #ifdef CONFIG_TCP_MD5SIG 2169 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = { 2170 .md5_lookup = tcp_v4_md5_lookup, 2171 .calc_md5_hash = tcp_v4_md5_hash_skb, 2172 .md5_parse = tcp_v4_parse_md5_keys, 2173 }; 2174 #endif 2175 2176 /* NOTE: A lot of things set to zero explicitly by call to 2177 * sk_alloc() so need not be done here. 2178 */ 2179 static int tcp_v4_init_sock(struct sock *sk) 2180 { 2181 struct inet_connection_sock *icsk = inet_csk(sk); 2182 2183 tcp_init_sock(sk); 2184 2185 icsk->icsk_af_ops = &ipv4_specific; 2186 2187 #ifdef CONFIG_TCP_MD5SIG 2188 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific; 2189 #endif 2190 2191 return 0; 2192 } 2193 2194 void tcp_v4_destroy_sock(struct sock *sk) 2195 { 2196 struct tcp_sock *tp = tcp_sk(sk); 2197 2198 trace_tcp_destroy_sock(sk); 2199 2200 tcp_clear_xmit_timers(sk); 2201 2202 tcp_cleanup_congestion_control(sk); 2203 2204 tcp_cleanup_ulp(sk); 2205 2206 /* Cleanup up the write buffer. */ 2207 tcp_write_queue_purge(sk); 2208 2209 /* Check if we want to disable active TFO */ 2210 tcp_fastopen_active_disable_ofo_check(sk); 2211 2212 /* Cleans up our, hopefully empty, out_of_order_queue. 
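	 *
	 * (Editor's note: tp->out_of_order_queue is kept as an rbtree of
	 * skbs; skb_rbtree_purge() below simply walks that tree and frees
	 * every queued skb, so nothing is left charged to the socket.)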
*/ 2213 skb_rbtree_purge(&tp->out_of_order_queue); 2214 2215 #ifdef CONFIG_TCP_MD5SIG 2216 /* Clean up the MD5 key list, if any */ 2217 if (tp->md5sig_info) { 2218 tcp_clear_md5_list(sk); 2219 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu); 2220 tp->md5sig_info = NULL; 2221 } 2222 #endif 2223 2224 /* Clean up a referenced TCP bind bucket. */ 2225 if (inet_csk(sk)->icsk_bind_hash) 2226 inet_put_port(sk); 2227 2228 BUG_ON(rcu_access_pointer(tp->fastopen_rsk)); 2229 2230 /* If socket is aborted during connect operation */ 2231 tcp_free_fastopen_req(tp); 2232 tcp_fastopen_destroy_cipher(sk); 2233 tcp_saved_syn_free(tp); 2234 2235 sk_sockets_allocated_dec(sk); 2236 } 2237 EXPORT_SYMBOL(tcp_v4_destroy_sock); 2238 2239 #ifdef CONFIG_PROC_FS 2240 /* Proc filesystem TCP sock list dumping. */ 2241 2242 /* 2243 * Get next listener socket follow cur. If cur is NULL, get first socket 2244 * starting from bucket given in st->bucket; when st->bucket is zero the 2245 * very first socket in the hash table is returned. 2246 */ 2247 static void *listening_get_next(struct seq_file *seq, void *cur) 2248 { 2249 struct tcp_seq_afinfo *afinfo; 2250 struct tcp_iter_state *st = seq->private; 2251 struct net *net = seq_file_net(seq); 2252 struct inet_listen_hashbucket *ilb; 2253 struct hlist_nulls_node *node; 2254 struct sock *sk = cur; 2255 2256 if (st->bpf_seq_afinfo) 2257 afinfo = st->bpf_seq_afinfo; 2258 else 2259 afinfo = PDE_DATA(file_inode(seq->file)); 2260 2261 if (!sk) { 2262 get_head: 2263 ilb = &tcp_hashinfo.listening_hash[st->bucket]; 2264 spin_lock(&ilb->lock); 2265 sk = sk_nulls_head(&ilb->nulls_head); 2266 st->offset = 0; 2267 goto get_sk; 2268 } 2269 ilb = &tcp_hashinfo.listening_hash[st->bucket]; 2270 ++st->num; 2271 ++st->offset; 2272 2273 sk = sk_nulls_next(sk); 2274 get_sk: 2275 sk_nulls_for_each_from(sk, node) { 2276 if (!net_eq(sock_net(sk), net)) 2277 continue; 2278 if (afinfo->family == AF_UNSPEC || 2279 sk->sk_family == afinfo->family) 2280 return sk; 2281 } 2282 spin_unlock(&ilb->lock); 2283 st->offset = 0; 2284 if (++st->bucket < INET_LHTABLE_SIZE) 2285 goto get_head; 2286 return NULL; 2287 } 2288 2289 static void *listening_get_idx(struct seq_file *seq, loff_t *pos) 2290 { 2291 struct tcp_iter_state *st = seq->private; 2292 void *rc; 2293 2294 st->bucket = 0; 2295 st->offset = 0; 2296 rc = listening_get_next(seq, NULL); 2297 2298 while (rc && *pos) { 2299 rc = listening_get_next(seq, rc); 2300 --*pos; 2301 } 2302 return rc; 2303 } 2304 2305 static inline bool empty_bucket(const struct tcp_iter_state *st) 2306 { 2307 return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain); 2308 } 2309 2310 /* 2311 * Get first established socket starting from bucket given in st->bucket. 2312 * If st->bucket is zero, the very first socket in the hash is returned. 
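 *
 * (Editor's note on the locking pattern used below: each non-empty ehash
 * bucket is scanned under its inet_ehash_lockp() spinlock, and the first
 * matching socket is returned with that lock still held; the lock is
 * released later, either by established_get_next() once the bucket is
 * exhausted or by tcp_seq_stop().)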
2313 */ 2314 static void *established_get_first(struct seq_file *seq) 2315 { 2316 struct tcp_seq_afinfo *afinfo; 2317 struct tcp_iter_state *st = seq->private; 2318 struct net *net = seq_file_net(seq); 2319 void *rc = NULL; 2320 2321 if (st->bpf_seq_afinfo) 2322 afinfo = st->bpf_seq_afinfo; 2323 else 2324 afinfo = PDE_DATA(file_inode(seq->file)); 2325 2326 st->offset = 0; 2327 for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) { 2328 struct sock *sk; 2329 struct hlist_nulls_node *node; 2330 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket); 2331 2332 /* Lockless fast path for the common case of empty buckets */ 2333 if (empty_bucket(st)) 2334 continue; 2335 2336 spin_lock_bh(lock); 2337 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) { 2338 if ((afinfo->family != AF_UNSPEC && 2339 sk->sk_family != afinfo->family) || 2340 !net_eq(sock_net(sk), net)) { 2341 continue; 2342 } 2343 rc = sk; 2344 goto out; 2345 } 2346 spin_unlock_bh(lock); 2347 } 2348 out: 2349 return rc; 2350 } 2351 2352 static void *established_get_next(struct seq_file *seq, void *cur) 2353 { 2354 struct tcp_seq_afinfo *afinfo; 2355 struct sock *sk = cur; 2356 struct hlist_nulls_node *node; 2357 struct tcp_iter_state *st = seq->private; 2358 struct net *net = seq_file_net(seq); 2359 2360 if (st->bpf_seq_afinfo) 2361 afinfo = st->bpf_seq_afinfo; 2362 else 2363 afinfo = PDE_DATA(file_inode(seq->file)); 2364 2365 ++st->num; 2366 ++st->offset; 2367 2368 sk = sk_nulls_next(sk); 2369 2370 sk_nulls_for_each_from(sk, node) { 2371 if ((afinfo->family == AF_UNSPEC || 2372 sk->sk_family == afinfo->family) && 2373 net_eq(sock_net(sk), net)) 2374 return sk; 2375 } 2376 2377 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); 2378 ++st->bucket; 2379 return established_get_first(seq); 2380 } 2381 2382 static void *established_get_idx(struct seq_file *seq, loff_t pos) 2383 { 2384 struct tcp_iter_state *st = seq->private; 2385 void *rc; 2386 2387 st->bucket = 0; 2388 rc = established_get_first(seq); 2389 2390 while (rc && pos) { 2391 rc = established_get_next(seq, rc); 2392 --pos; 2393 } 2394 return rc; 2395 } 2396 2397 static void *tcp_get_idx(struct seq_file *seq, loff_t pos) 2398 { 2399 void *rc; 2400 struct tcp_iter_state *st = seq->private; 2401 2402 st->state = TCP_SEQ_STATE_LISTENING; 2403 rc = listening_get_idx(seq, &pos); 2404 2405 if (!rc) { 2406 st->state = TCP_SEQ_STATE_ESTABLISHED; 2407 rc = established_get_idx(seq, pos); 2408 } 2409 2410 return rc; 2411 } 2412 2413 static void *tcp_seek_last_pos(struct seq_file *seq) 2414 { 2415 struct tcp_iter_state *st = seq->private; 2416 int offset = st->offset; 2417 int orig_num = st->num; 2418 void *rc = NULL; 2419 2420 switch (st->state) { 2421 case TCP_SEQ_STATE_LISTENING: 2422 if (st->bucket >= INET_LHTABLE_SIZE) 2423 break; 2424 st->state = TCP_SEQ_STATE_LISTENING; 2425 rc = listening_get_next(seq, NULL); 2426 while (offset-- && rc) 2427 rc = listening_get_next(seq, rc); 2428 if (rc) 2429 break; 2430 st->bucket = 0; 2431 st->state = TCP_SEQ_STATE_ESTABLISHED; 2432 fallthrough; 2433 case TCP_SEQ_STATE_ESTABLISHED: 2434 if (st->bucket > tcp_hashinfo.ehash_mask) 2435 break; 2436 rc = established_get_first(seq); 2437 while (offset-- && rc) 2438 rc = established_get_next(seq, rc); 2439 } 2440 2441 st->num = orig_num; 2442 2443 return rc; 2444 } 2445 2446 void *tcp_seq_start(struct seq_file *seq, loff_t *pos) 2447 { 2448 struct tcp_iter_state *st = seq->private; 2449 void *rc; 2450 2451 if (*pos && *pos == st->last_pos) { 2452 rc = 
tcp_seek_last_pos(seq); 2453 if (rc) 2454 goto out; 2455 } 2456 2457 st->state = TCP_SEQ_STATE_LISTENING; 2458 st->num = 0; 2459 st->bucket = 0; 2460 st->offset = 0; 2461 rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; 2462 2463 out: 2464 st->last_pos = *pos; 2465 return rc; 2466 } 2467 EXPORT_SYMBOL(tcp_seq_start); 2468 2469 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) 2470 { 2471 struct tcp_iter_state *st = seq->private; 2472 void *rc = NULL; 2473 2474 if (v == SEQ_START_TOKEN) { 2475 rc = tcp_get_idx(seq, 0); 2476 goto out; 2477 } 2478 2479 switch (st->state) { 2480 case TCP_SEQ_STATE_LISTENING: 2481 rc = listening_get_next(seq, v); 2482 if (!rc) { 2483 st->state = TCP_SEQ_STATE_ESTABLISHED; 2484 st->bucket = 0; 2485 st->offset = 0; 2486 rc = established_get_first(seq); 2487 } 2488 break; 2489 case TCP_SEQ_STATE_ESTABLISHED: 2490 rc = established_get_next(seq, v); 2491 break; 2492 } 2493 out: 2494 ++*pos; 2495 st->last_pos = *pos; 2496 return rc; 2497 } 2498 EXPORT_SYMBOL(tcp_seq_next); 2499 2500 void tcp_seq_stop(struct seq_file *seq, void *v) 2501 { 2502 struct tcp_iter_state *st = seq->private; 2503 2504 switch (st->state) { 2505 case TCP_SEQ_STATE_LISTENING: 2506 if (v != SEQ_START_TOKEN) 2507 spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock); 2508 break; 2509 case TCP_SEQ_STATE_ESTABLISHED: 2510 if (v) 2511 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); 2512 break; 2513 } 2514 } 2515 EXPORT_SYMBOL(tcp_seq_stop); 2516 2517 static void get_openreq4(const struct request_sock *req, 2518 struct seq_file *f, int i) 2519 { 2520 const struct inet_request_sock *ireq = inet_rsk(req); 2521 long delta = req->rsk_timer.expires - jiffies; 2522 2523 seq_printf(f, "%4d: %08X:%04X %08X:%04X" 2524 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK", 2525 i, 2526 ireq->ir_loc_addr, 2527 ireq->ir_num, 2528 ireq->ir_rmt_addr, 2529 ntohs(ireq->ir_rmt_port), 2530 TCP_SYN_RECV, 2531 0, 0, /* could print option size, but that is af dependent. 
*/ 2532 1, /* timers active (only the expire timer) */ 2533 jiffies_delta_to_clock_t(delta), 2534 req->num_timeout, 2535 from_kuid_munged(seq_user_ns(f), 2536 sock_i_uid(req->rsk_listener)), 2537 0, /* non standard timer */ 2538 0, /* open_requests have no inode */ 2539 0, 2540 req); 2541 } 2542 2543 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) 2544 { 2545 int timer_active; 2546 unsigned long timer_expires; 2547 const struct tcp_sock *tp = tcp_sk(sk); 2548 const struct inet_connection_sock *icsk = inet_csk(sk); 2549 const struct inet_sock *inet = inet_sk(sk); 2550 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq; 2551 __be32 dest = inet->inet_daddr; 2552 __be32 src = inet->inet_rcv_saddr; 2553 __u16 destp = ntohs(inet->inet_dport); 2554 __u16 srcp = ntohs(inet->inet_sport); 2555 int rx_queue; 2556 int state; 2557 2558 if (icsk->icsk_pending == ICSK_TIME_RETRANS || 2559 icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT || 2560 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { 2561 timer_active = 1; 2562 timer_expires = icsk->icsk_timeout; 2563 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) { 2564 timer_active = 4; 2565 timer_expires = icsk->icsk_timeout; 2566 } else if (timer_pending(&sk->sk_timer)) { 2567 timer_active = 2; 2568 timer_expires = sk->sk_timer.expires; 2569 } else { 2570 timer_active = 0; 2571 timer_expires = jiffies; 2572 } 2573 2574 state = inet_sk_state_load(sk); 2575 if (state == TCP_LISTEN) 2576 rx_queue = READ_ONCE(sk->sk_ack_backlog); 2577 else 2578 /* Because we don't lock the socket, 2579 * we might find a transient negative value. 2580 */ 2581 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) - 2582 READ_ONCE(tp->copied_seq), 0); 2583 2584 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX " 2585 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d", 2586 i, src, srcp, dest, destp, state, 2587 READ_ONCE(tp->write_seq) - tp->snd_una, 2588 rx_queue, 2589 timer_active, 2590 jiffies_delta_to_clock_t(timer_expires - jiffies), 2591 icsk->icsk_retransmits, 2592 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)), 2593 icsk->icsk_probes_out, 2594 sock_i_ino(sk), 2595 refcount_read(&sk->sk_refcnt), sk, 2596 jiffies_to_clock_t(icsk->icsk_rto), 2597 jiffies_to_clock_t(icsk->icsk_ack.ato), 2598 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk), 2599 tp->snd_cwnd, 2600 state == TCP_LISTEN ? 2601 fastopenq->max_qlen : 2602 (tcp_in_initial_slowstart(tp) ? 
-1 : tp->snd_ssthresh)); 2603 } 2604 2605 static void get_timewait4_sock(const struct inet_timewait_sock *tw, 2606 struct seq_file *f, int i) 2607 { 2608 long delta = tw->tw_timer.expires - jiffies; 2609 __be32 dest, src; 2610 __u16 destp, srcp; 2611 2612 dest = tw->tw_daddr; 2613 src = tw->tw_rcv_saddr; 2614 destp = ntohs(tw->tw_dport); 2615 srcp = ntohs(tw->tw_sport); 2616 2617 seq_printf(f, "%4d: %08X:%04X %08X:%04X" 2618 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK", 2619 i, src, srcp, dest, destp, tw->tw_substate, 0, 0, 2620 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0, 2621 refcount_read(&tw->tw_refcnt), tw); 2622 } 2623 2624 #define TMPSZ 150 2625 2626 static int tcp4_seq_show(struct seq_file *seq, void *v) 2627 { 2628 struct tcp_iter_state *st; 2629 struct sock *sk = v; 2630 2631 seq_setwidth(seq, TMPSZ - 1); 2632 if (v == SEQ_START_TOKEN) { 2633 seq_puts(seq, " sl local_address rem_address st tx_queue " 2634 "rx_queue tr tm->when retrnsmt uid timeout " 2635 "inode"); 2636 goto out; 2637 } 2638 st = seq->private; 2639 2640 if (sk->sk_state == TCP_TIME_WAIT) 2641 get_timewait4_sock(v, seq, st->num); 2642 else if (sk->sk_state == TCP_NEW_SYN_RECV) 2643 get_openreq4(v, seq, st->num); 2644 else 2645 get_tcp4_sock(v, seq, st->num); 2646 out: 2647 seq_pad(seq, '\n'); 2648 return 0; 2649 } 2650 2651 #ifdef CONFIG_BPF_SYSCALL 2652 struct bpf_iter__tcp { 2653 __bpf_md_ptr(struct bpf_iter_meta *, meta); 2654 __bpf_md_ptr(struct sock_common *, sk_common); 2655 uid_t uid __aligned(8); 2656 }; 2657 2658 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, 2659 struct sock_common *sk_common, uid_t uid) 2660 { 2661 struct bpf_iter__tcp ctx; 2662 2663 meta->seq_num--; /* skip SEQ_START_TOKEN */ 2664 ctx.meta = meta; 2665 ctx.sk_common = sk_common; 2666 ctx.uid = uid; 2667 return bpf_iter_run_prog(prog, &ctx); 2668 } 2669 2670 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v) 2671 { 2672 struct bpf_iter_meta meta; 2673 struct bpf_prog *prog; 2674 struct sock *sk = v; 2675 uid_t uid; 2676 2677 if (v == SEQ_START_TOKEN) 2678 return 0; 2679 2680 if (sk->sk_state == TCP_TIME_WAIT) { 2681 uid = 0; 2682 } else if (sk->sk_state == TCP_NEW_SYN_RECV) { 2683 const struct request_sock *req = v; 2684 2685 uid = from_kuid_munged(seq_user_ns(seq), 2686 sock_i_uid(req->rsk_listener)); 2687 } else { 2688 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)); 2689 } 2690 2691 meta.seq = seq; 2692 prog = bpf_iter_get_info(&meta, false); 2693 return tcp_prog_seq_show(prog, &meta, v, uid); 2694 } 2695 2696 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v) 2697 { 2698 struct bpf_iter_meta meta; 2699 struct bpf_prog *prog; 2700 2701 if (!v) { 2702 meta.seq = seq; 2703 prog = bpf_iter_get_info(&meta, true); 2704 if (prog) 2705 (void)tcp_prog_seq_show(prog, &meta, v, 0); 2706 } 2707 2708 tcp_seq_stop(seq, v); 2709 } 2710 2711 static const struct seq_operations bpf_iter_tcp_seq_ops = { 2712 .show = bpf_iter_tcp_seq_show, 2713 .start = tcp_seq_start, 2714 .next = tcp_seq_next, 2715 .stop = bpf_iter_tcp_seq_stop, 2716 }; 2717 #endif 2718 2719 static const struct seq_operations tcp4_seq_ops = { 2720 .show = tcp4_seq_show, 2721 .start = tcp_seq_start, 2722 .next = tcp_seq_next, 2723 .stop = tcp_seq_stop, 2724 }; 2725 2726 static struct tcp_seq_afinfo tcp4_seq_afinfo = { 2727 .family = AF_INET, 2728 }; 2729 2730 static int __net_init tcp4_proc_init_net(struct net *net) 2731 { 2732 if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops, 2733 
sizeof(struct tcp_iter_state), &tcp4_seq_afinfo)) 2734 return -ENOMEM; 2735 return 0; 2736 } 2737 2738 static void __net_exit tcp4_proc_exit_net(struct net *net) 2739 { 2740 remove_proc_entry("tcp", net->proc_net); 2741 } 2742 2743 static struct pernet_operations tcp4_net_ops = { 2744 .init = tcp4_proc_init_net, 2745 .exit = tcp4_proc_exit_net, 2746 }; 2747 2748 int __init tcp4_proc_init(void) 2749 { 2750 return register_pernet_subsys(&tcp4_net_ops); 2751 } 2752 2753 void tcp4_proc_exit(void) 2754 { 2755 unregister_pernet_subsys(&tcp4_net_ops); 2756 } 2757 #endif /* CONFIG_PROC_FS */ 2758 2759 struct proto tcp_prot = { 2760 .name = "TCP", 2761 .owner = THIS_MODULE, 2762 .close = tcp_close, 2763 .pre_connect = tcp_v4_pre_connect, 2764 .connect = tcp_v4_connect, 2765 .disconnect = tcp_disconnect, 2766 .accept = inet_csk_accept, 2767 .ioctl = tcp_ioctl, 2768 .init = tcp_v4_init_sock, 2769 .destroy = tcp_v4_destroy_sock, 2770 .shutdown = tcp_shutdown, 2771 .setsockopt = tcp_setsockopt, 2772 .getsockopt = tcp_getsockopt, 2773 .keepalive = tcp_set_keepalive, 2774 .recvmsg = tcp_recvmsg, 2775 .sendmsg = tcp_sendmsg, 2776 .sendpage = tcp_sendpage, 2777 .backlog_rcv = tcp_v4_do_rcv, 2778 .release_cb = tcp_release_cb, 2779 .hash = inet_hash, 2780 .unhash = inet_unhash, 2781 .get_port = inet_csk_get_port, 2782 .enter_memory_pressure = tcp_enter_memory_pressure, 2783 .leave_memory_pressure = tcp_leave_memory_pressure, 2784 .stream_memory_free = tcp_stream_memory_free, 2785 .sockets_allocated = &tcp_sockets_allocated, 2786 .orphan_count = &tcp_orphan_count, 2787 .memory_allocated = &tcp_memory_allocated, 2788 .memory_pressure = &tcp_memory_pressure, 2789 .sysctl_mem = sysctl_tcp_mem, 2790 .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem), 2791 .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), 2792 .max_header = MAX_TCP_HEADER, 2793 .obj_size = sizeof(struct tcp_sock), 2794 .slab_flags = SLAB_TYPESAFE_BY_RCU, 2795 .twsk_prot = &tcp_timewait_sock_ops, 2796 .rsk_prot = &tcp_request_sock_ops, 2797 .h.hashinfo = &tcp_hashinfo, 2798 .no_autobind = true, 2799 .diag_destroy = tcp_abort, 2800 }; 2801 EXPORT_SYMBOL(tcp_prot); 2802 2803 static void __net_exit tcp_sk_exit(struct net *net) 2804 { 2805 int cpu; 2806 2807 if (net->ipv4.tcp_congestion_control) 2808 bpf_module_put(net->ipv4.tcp_congestion_control, 2809 net->ipv4.tcp_congestion_control->owner); 2810 2811 for_each_possible_cpu(cpu) 2812 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu)); 2813 free_percpu(net->ipv4.tcp_sk); 2814 } 2815 2816 static int __net_init tcp_sk_init(struct net *net) 2817 { 2818 int res, cpu, cnt; 2819 2820 net->ipv4.tcp_sk = alloc_percpu(struct sock *); 2821 if (!net->ipv4.tcp_sk) 2822 return -ENOMEM; 2823 2824 for_each_possible_cpu(cpu) { 2825 struct sock *sk; 2826 2827 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW, 2828 IPPROTO_TCP, net); 2829 if (res) 2830 goto fail; 2831 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); 2832 2833 /* Please enforce IP_DF and IPID==0 for RST and 2834 * ACK sent in SYN-RECV and TIME-WAIT state. 
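	 *
	 * (Editor's note, hedged: setting pmtudisc to IP_PMTUDISC_DO on
	 * these per-cpu control sockets is what makes the IP output path
	 * treat them as "don't fragment" senders, so RST/ACK replies go
	 * out with DF set and a zero IP ID; the ip_dont_fragment() detail
	 * is an assumption about code outside this file.)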
2835 */ 2836 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO; 2837 2838 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk; 2839 } 2840 2841 net->ipv4.sysctl_tcp_ecn = 2; 2842 net->ipv4.sysctl_tcp_ecn_fallback = 1; 2843 2844 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS; 2845 net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS; 2846 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD; 2847 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL; 2848 net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS; 2849 2850 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME; 2851 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES; 2852 net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL; 2853 2854 net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES; 2855 net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES; 2856 net->ipv4.sysctl_tcp_syncookies = 1; 2857 net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH; 2858 net->ipv4.sysctl_tcp_retries1 = TCP_RETR1; 2859 net->ipv4.sysctl_tcp_retries2 = TCP_RETR2; 2860 net->ipv4.sysctl_tcp_orphan_retries = 0; 2861 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT; 2862 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX; 2863 net->ipv4.sysctl_tcp_tw_reuse = 2; 2864 net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1; 2865 2866 cnt = tcp_hashinfo.ehash_mask + 1; 2867 net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2; 2868 net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo; 2869 2870 net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128); 2871 net->ipv4.sysctl_tcp_sack = 1; 2872 net->ipv4.sysctl_tcp_window_scaling = 1; 2873 net->ipv4.sysctl_tcp_timestamps = 1; 2874 net->ipv4.sysctl_tcp_early_retrans = 3; 2875 net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION; 2876 net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */ 2877 net->ipv4.sysctl_tcp_retrans_collapse = 1; 2878 net->ipv4.sysctl_tcp_max_reordering = 300; 2879 net->ipv4.sysctl_tcp_dsack = 1; 2880 net->ipv4.sysctl_tcp_app_win = 31; 2881 net->ipv4.sysctl_tcp_adv_win_scale = 1; 2882 net->ipv4.sysctl_tcp_frto = 2; 2883 net->ipv4.sysctl_tcp_moderate_rcvbuf = 1; 2884 /* This limits the percentage of the congestion window which we 2885 * will allow a single TSO frame to consume. Building TSO frames 2886 * which are too large can cause TCP streams to be bursty. 
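	 *
	 * (Editor's worked example, per the description above: with the
	 * default divisor of 3 set just below, a single TSO frame is capped
	 * near cwnd/3, so at a cwnd of 45 packets no single burst should
	 * exceed roughly 15 packets.)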
2887 */ 2888 net->ipv4.sysctl_tcp_tso_win_divisor = 3; 2889 /* Default TSQ limit of 16 TSO segments */ 2890 net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536; 2891 /* rfc5961 challenge ack rate limiting */ 2892 net->ipv4.sysctl_tcp_challenge_ack_limit = 1000; 2893 net->ipv4.sysctl_tcp_min_tso_segs = 2; 2894 net->ipv4.sysctl_tcp_min_rtt_wlen = 300; 2895 net->ipv4.sysctl_tcp_autocorking = 1; 2896 net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2; 2897 net->ipv4.sysctl_tcp_pacing_ss_ratio = 200; 2898 net->ipv4.sysctl_tcp_pacing_ca_ratio = 120; 2899 if (net != &init_net) { 2900 memcpy(net->ipv4.sysctl_tcp_rmem, 2901 init_net.ipv4.sysctl_tcp_rmem, 2902 sizeof(init_net.ipv4.sysctl_tcp_rmem)); 2903 memcpy(net->ipv4.sysctl_tcp_wmem, 2904 init_net.ipv4.sysctl_tcp_wmem, 2905 sizeof(init_net.ipv4.sysctl_tcp_wmem)); 2906 } 2907 net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC; 2908 net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC; 2909 net->ipv4.sysctl_tcp_comp_sack_nr = 44; 2910 net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; 2911 spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock); 2912 net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60; 2913 atomic_set(&net->ipv4.tfo_active_disable_times, 0); 2914 2915 /* Reno is always built in */ 2916 if (!net_eq(net, &init_net) && 2917 bpf_try_module_get(init_net.ipv4.tcp_congestion_control, 2918 init_net.ipv4.tcp_congestion_control->owner)) 2919 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control; 2920 else 2921 net->ipv4.tcp_congestion_control = &tcp_reno; 2922 2923 return 0; 2924 fail: 2925 tcp_sk_exit(net); 2926 2927 return res; 2928 } 2929 2930 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list) 2931 { 2932 struct net *net; 2933 2934 inet_twsk_purge(&tcp_hashinfo, AF_INET); 2935 2936 list_for_each_entry(net, net_exit_list, exit_list) 2937 tcp_fastopen_ctx_destroy(net); 2938 } 2939 2940 static struct pernet_operations __net_initdata tcp_sk_ops = { 2941 .init = tcp_sk_init, 2942 .exit = tcp_sk_exit, 2943 .exit_batch = tcp_sk_exit_batch, 2944 }; 2945 2946 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 2947 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta, 2948 struct sock_common *sk_common, uid_t uid) 2949 2950 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux) 2951 { 2952 struct tcp_iter_state *st = priv_data; 2953 struct tcp_seq_afinfo *afinfo; 2954 int ret; 2955 2956 afinfo = kmalloc(sizeof(*afinfo), GFP_USER | __GFP_NOWARN); 2957 if (!afinfo) 2958 return -ENOMEM; 2959 2960 afinfo->family = AF_UNSPEC; 2961 st->bpf_seq_afinfo = afinfo; 2962 ret = bpf_iter_init_seq_net(priv_data, aux); 2963 if (ret) 2964 kfree(afinfo); 2965 return ret; 2966 } 2967 2968 static void bpf_iter_fini_tcp(void *priv_data) 2969 { 2970 struct tcp_iter_state *st = priv_data; 2971 2972 kfree(st->bpf_seq_afinfo); 2973 bpf_iter_fini_seq_net(priv_data); 2974 } 2975 2976 static const struct bpf_iter_seq_info tcp_seq_info = { 2977 .seq_ops = &bpf_iter_tcp_seq_ops, 2978 .init_seq_private = bpf_iter_init_tcp, 2979 .fini_seq_private = bpf_iter_fini_tcp, 2980 .seq_priv_size = sizeof(struct tcp_iter_state), 2981 }; 2982 2983 static struct bpf_iter_reg tcp_reg_info = { 2984 .target = "tcp", 2985 .ctx_arg_info_size = 1, 2986 .ctx_arg_info = { 2987 { offsetof(struct bpf_iter__tcp, sk_common), 2988 PTR_TO_BTF_ID_OR_NULL }, 2989 }, 2990 .seq_info = &tcp_seq_info, 2991 }; 2992 2993 static void __init bpf_iter_register(void) 2994 { 2995 tcp_reg_info.ctx_arg_info[0].btf_id = 
btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON]; 2996 if (bpf_iter_reg_target(&tcp_reg_info)) 2997 pr_warn("Warning: could not register bpf iterator tcp\n"); 2998 } 2999 3000 #endif 3001 3002 void __init tcp_v4_init(void) 3003 { 3004 if (register_pernet_subsys(&tcp_sk_ops)) 3005 panic("Failed to create the TCP control socket.\n"); 3006 3007 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 3008 bpf_iter_register(); 3009 #endif 3010 } 3011
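/* Editor's addendum, illustrative only and kept under #if 0 so it is not
 * part of the kernel build: a minimal userspace sketch of how the
 * fixed-width rows emitted above by get_tcp4_sock() via tcp4_seq_show()
 * can be consumed.  The field layout follows the header line printed by
 * tcp4_seq_show(); buffer sizes and parsing details below are assumptions
 * made for the example, not a kernel interface guarantee.
 */
#if 0
#include <stdio.h>

int main(void)
{
	char line[512];
	FILE *f = fopen("/proc/net/tcp", "r");

	if (!f)
		return 1;
	/* Skip the " sl  local_address rem_address   st ..." header row. */
	if (!fgets(line, sizeof(line), f)) {
		fclose(f);
		return 1;
	}
	while (fgets(line, sizeof(line), f)) {
		unsigned int sl, laddr, lport, raddr, rport, state;

		/* Addresses and ports are read back as the raw hex words
		 * printed by the kernel's seq_printf() format above.
		 */
		if (sscanf(line, " %u: %8X:%4X %8X:%4X %2X",
			   &sl, &laddr, &lport, &raddr, &rport, &state) != 6)
			continue;
		printf("sl=%u local=%08X:%u remote=%08X:%u state=%02X\n",
		       sl, laddr, lport, raddr, rport, state);
	}
	fclose(f);
	return 0;
}
#endif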