1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * INET An implementation of the TCP/IP protocol suite for the LINUX 4 * operating system. INET is implemented using the BSD Socket 5 * interface as the means of communication with the user level. 6 * 7 * Implementation of the Transmission Control Protocol(TCP). 8 * 9 * IPv4 specific functions 10 * 11 * code split from: 12 * linux/ipv4/tcp.c 13 * linux/ipv4/tcp_input.c 14 * linux/ipv4/tcp_output.c 15 * 16 * See tcp.c for author information 17 */ 18 19 /* 20 * Changes: 21 * David S. Miller : New socket lookup architecture. 22 * This code is dedicated to John Dyson. 23 * David S. Miller : Change semantics of established hash, 24 * half is devoted to TIME_WAIT sockets 25 * and the rest go in the other half. 26 * Andi Kleen : Add support for syncookies and fixed 27 * some bugs: ip options weren't passed to 28 * the TCP layer, missed a check for an 29 * ACK bit. 30 * Andi Kleen : Implemented fast path mtu discovery. 31 * Fixed many serious bugs in the 32 * request_sock handling and moved 33 * most of it into the af independent code. 34 * Added tail drop and some other bugfixes. 35 * Added new listen semantics. 36 * Mike McLagan : Routing by source 37 * Juan Jose Ciarlante: ip_dynaddr bits 38 * Andi Kleen: various fixes. 39 * Vitaly E. Lavrov : Transparent proxy revived after year 40 * coma. 41 * Andi Kleen : Fix new listen. 42 * Andi Kleen : Fix accept error reporting. 43 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which 44 * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind 45 * a single port at the same time. 46 */ 47 48 #define pr_fmt(fmt) "TCP: " fmt 49 50 #include <linux/bottom_half.h> 51 #include <linux/types.h> 52 #include <linux/fcntl.h> 53 #include <linux/module.h> 54 #include <linux/random.h> 55 #include <linux/cache.h> 56 #include <linux/jhash.h> 57 #include <linux/init.h> 58 #include <linux/times.h> 59 #include <linux/slab.h> 60 61 #include <net/net_namespace.h> 62 #include <net/icmp.h> 63 #include <net/inet_hashtables.h> 64 #include <net/tcp.h> 65 #include <net/transp_v6.h> 66 #include <net/ipv6.h> 67 #include <net/inet_common.h> 68 #include <net/timewait_sock.h> 69 #include <net/xfrm.h> 70 #include <net/secure_seq.h> 71 #include <net/busy_poll.h> 72 73 #include <linux/inet.h> 74 #include <linux/ipv6.h> 75 #include <linux/stddef.h> 76 #include <linux/proc_fs.h> 77 #include <linux/seq_file.h> 78 #include <linux/inetdevice.h> 79 #include <linux/btf_ids.h> 80 81 #include <crypto/hash.h> 82 #include <linux/scatterlist.h> 83 84 #include <trace/events/tcp.h> 85 86 #ifdef CONFIG_TCP_MD5SIG 87 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, 88 __be32 daddr, __be32 saddr, const struct tcphdr *th); 89 #endif 90 91 struct inet_hashinfo tcp_hashinfo; 92 EXPORT_SYMBOL(tcp_hashinfo); 93 94 static u32 tcp_v4_init_seq(const struct sk_buff *skb) 95 { 96 return secure_tcp_seq(ip_hdr(skb)->daddr, 97 ip_hdr(skb)->saddr, 98 tcp_hdr(skb)->dest, 99 tcp_hdr(skb)->source); 100 } 101 102 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb) 103 { 104 return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr); 105 } 106 107 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) 108 { 109 const struct inet_timewait_sock *tw = inet_twsk(sktw); 110 const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw); 111 struct tcp_sock *tp = tcp_sk(sk); 112 int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse; 113 114 if (reuse == 2) { 115 /* Still does not detect *everything* that goes through 116 * lo, since we require a loopback src or dst address 117 * or direct binding to 'lo' interface. 118 */ 119 bool loopback = false; 120 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX) 121 loopback = true; 122 #if IS_ENABLED(CONFIG_IPV6) 123 if (tw->tw_family == AF_INET6) { 124 if (ipv6_addr_loopback(&tw->tw_v6_daddr) || 125 ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) || 126 ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) || 127 ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr)) 128 loopback = true; 129 } else 130 #endif 131 { 132 if (ipv4_is_loopback(tw->tw_daddr) || 133 ipv4_is_loopback(tw->tw_rcv_saddr)) 134 loopback = true; 135 } 136 if (!loopback) 137 reuse = 0; 138 } 139 140 /* With PAWS, it is safe from the viewpoint 141 of data integrity. Even without PAWS it is safe provided sequence 142 spaces do not overlap i.e. at data rates <= 80Mbit/sec. 143 144 Actually, the idea is close to VJ's one, only timestamp cache is 145 held not per host, but per port pair and TW bucket is used as state 146 holder. 147 148 If TW bucket has been already destroyed we fall back to VJ's scheme 149 and use initial timestamp retrieved from peer table. 150 */ 151 if (tcptw->tw_ts_recent_stamp && 152 (!twp || (reuse && time_after32(ktime_get_seconds(), 153 tcptw->tw_ts_recent_stamp)))) { 154 /* In case of repair and re-using TIME-WAIT sockets we still 155 * want to be sure that it is safe as above but honor the 156 * sequence numbers and time stamps set as part of the repair 157 * process. 158 * 159 * Without this check re-using a TIME-WAIT socket with TCP 160 * repair would accumulate a -1 on the repair assigned 161 * sequence number. The first time it is reused the sequence 162 * is -1, the second time -2, etc. This fixes that issue 163 * without appearing to create any others. 164 */ 165 if (likely(!tp->repair)) { 166 u32 seq = tcptw->tw_snd_nxt + 65535 + 2; 167 168 if (!seq) 169 seq = 1; 170 WRITE_ONCE(tp->write_seq, seq); 171 tp->rx_opt.ts_recent = tcptw->tw_ts_recent; 172 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp; 173 } 174 sock_hold(sktw); 175 return 1; 176 } 177 178 return 0; 179 } 180 EXPORT_SYMBOL_GPL(tcp_twsk_unique); 181 182 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr, 183 int addr_len) 184 { 185 /* This check is replicated from tcp_v4_connect() and intended to 186 * prevent BPF program called below from accessing bytes that are out 187 * of the bound specified by user in addr_len. 188 */ 189 if (addr_len < sizeof(struct sockaddr_in)) 190 return -EINVAL; 191 192 sock_owned_by_me(sk); 193 194 return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr); 195 } 196 197 /* This will initiate an outgoing connection. */ 198 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) 199 { 200 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr; 201 struct inet_sock *inet = inet_sk(sk); 202 struct tcp_sock *tp = tcp_sk(sk); 203 __be16 orig_sport, orig_dport; 204 __be32 daddr, nexthop; 205 struct flowi4 *fl4; 206 struct rtable *rt; 207 int err; 208 struct ip_options_rcu *inet_opt; 209 struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row; 210 211 if (addr_len < sizeof(struct sockaddr_in)) 212 return -EINVAL; 213 214 if (usin->sin_family != AF_INET) 215 return -EAFNOSUPPORT; 216 217 nexthop = daddr = usin->sin_addr.s_addr; 218 inet_opt = rcu_dereference_protected(inet->inet_opt, 219 lockdep_sock_is_held(sk)); 220 if (inet_opt && inet_opt->opt.srr) { 221 if (!daddr) 222 return -EINVAL; 223 nexthop = inet_opt->opt.faddr; 224 } 225 226 orig_sport = inet->inet_sport; 227 orig_dport = usin->sin_port; 228 fl4 = &inet->cork.fl.u.ip4; 229 rt = ip_route_connect(fl4, nexthop, inet->inet_saddr, 230 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if, 231 IPPROTO_TCP, 232 orig_sport, orig_dport, sk); 233 if (IS_ERR(rt)) { 234 err = PTR_ERR(rt); 235 if (err == -ENETUNREACH) 236 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES); 237 return err; 238 } 239 240 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) { 241 ip_rt_put(rt); 242 return -ENETUNREACH; 243 } 244 245 if (!inet_opt || !inet_opt->opt.srr) 246 daddr = fl4->daddr; 247 248 if (!inet->inet_saddr) 249 inet->inet_saddr = fl4->saddr; 250 sk_rcv_saddr_set(sk, inet->inet_saddr); 251 252 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) { 253 /* Reset inherited state */ 254 tp->rx_opt.ts_recent = 0; 255 tp->rx_opt.ts_recent_stamp = 0; 256 if (likely(!tp->repair)) 257 WRITE_ONCE(tp->write_seq, 0); 258 } 259 260 inet->inet_dport = usin->sin_port; 261 sk_daddr_set(sk, daddr); 262 263 inet_csk(sk)->icsk_ext_hdr_len = 0; 264 if (inet_opt) 265 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen; 266 267 tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT; 268 269 /* Socket identity is still unknown (sport may be zero). 270 * However we set state to SYN-SENT and not releasing socket 271 * lock select source port, enter ourselves into the hash tables and 272 * complete initialization after this. 273 */ 274 tcp_set_state(sk, TCP_SYN_SENT); 275 err = inet_hash_connect(tcp_death_row, sk); 276 if (err) 277 goto failure; 278 279 sk_set_txhash(sk); 280 281 rt = ip_route_newports(fl4, rt, orig_sport, orig_dport, 282 inet->inet_sport, inet->inet_dport, sk); 283 if (IS_ERR(rt)) { 284 err = PTR_ERR(rt); 285 rt = NULL; 286 goto failure; 287 } 288 /* OK, now commit destination to socket. */ 289 sk->sk_gso_type = SKB_GSO_TCPV4; 290 sk_setup_caps(sk, &rt->dst); 291 rt = NULL; 292 293 if (likely(!tp->repair)) { 294 if (!tp->write_seq) 295 WRITE_ONCE(tp->write_seq, 296 secure_tcp_seq(inet->inet_saddr, 297 inet->inet_daddr, 298 inet->inet_sport, 299 usin->sin_port)); 300 tp->tsoffset = secure_tcp_ts_off(sock_net(sk), 301 inet->inet_saddr, 302 inet->inet_daddr); 303 } 304 305 inet->inet_id = prandom_u32(); 306 307 if (tcp_fastopen_defer_connect(sk, &err)) 308 return err; 309 if (err) 310 goto failure; 311 312 err = tcp_connect(sk); 313 314 if (err) 315 goto failure; 316 317 return 0; 318 319 failure: 320 /* 321 * This unhashes the socket and releases the local port, 322 * if necessary. 323 */ 324 tcp_set_state(sk, TCP_CLOSE); 325 ip_rt_put(rt); 326 sk->sk_route_caps = 0; 327 inet->inet_dport = 0; 328 return err; 329 } 330 EXPORT_SYMBOL(tcp_v4_connect); 331 332 /* 333 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191. 334 * It can be called through tcp_release_cb() if socket was owned by user 335 * at the time tcp_v4_err() was called to handle ICMP message. 336 */ 337 void tcp_v4_mtu_reduced(struct sock *sk) 338 { 339 struct inet_sock *inet = inet_sk(sk); 340 struct dst_entry *dst; 341 u32 mtu; 342 343 if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) 344 return; 345 mtu = tcp_sk(sk)->mtu_info; 346 dst = inet_csk_update_pmtu(sk, mtu); 347 if (!dst) 348 return; 349 350 /* Something is about to be wrong... Remember soft error 351 * for the case, if this connection will not able to recover. 352 */ 353 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst)) 354 sk->sk_err_soft = EMSGSIZE; 355 356 mtu = dst_mtu(dst); 357 358 if (inet->pmtudisc != IP_PMTUDISC_DONT && 359 ip_sk_accept_pmtu(sk) && 360 inet_csk(sk)->icsk_pmtu_cookie > mtu) { 361 tcp_sync_mss(sk, mtu); 362 363 /* Resend the TCP packet because it's 364 * clear that the old packet has been 365 * dropped. This is the new "fast" path mtu 366 * discovery. 367 */ 368 tcp_simple_retransmit(sk); 369 } /* else let the usual retransmit timer handle it */ 370 } 371 EXPORT_SYMBOL(tcp_v4_mtu_reduced); 372 373 static void do_redirect(struct sk_buff *skb, struct sock *sk) 374 { 375 struct dst_entry *dst = __sk_dst_check(sk, 0); 376 377 if (dst) 378 dst->ops->redirect(dst, sk, skb); 379 } 380 381 382 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */ 383 void tcp_req_err(struct sock *sk, u32 seq, bool abort) 384 { 385 struct request_sock *req = inet_reqsk(sk); 386 struct net *net = sock_net(sk); 387 388 /* ICMPs are not backlogged, hence we cannot get 389 * an established socket here. 390 */ 391 if (seq != tcp_rsk(req)->snt_isn) { 392 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS); 393 } else if (abort) { 394 /* 395 * Still in SYN_RECV, just remove it silently. 396 * There is no good way to pass the error to the newly 397 * created socket, and POSIX does not want network 398 * errors returned from accept(). 399 */ 400 inet_csk_reqsk_queue_drop(req->rsk_listener, req); 401 tcp_listendrop(req->rsk_listener); 402 } 403 reqsk_put(req); 404 } 405 EXPORT_SYMBOL(tcp_req_err); 406 407 /* TCP-LD (RFC 6069) logic */ 408 void tcp_ld_RTO_revert(struct sock *sk, u32 seq) 409 { 410 struct inet_connection_sock *icsk = inet_csk(sk); 411 struct tcp_sock *tp = tcp_sk(sk); 412 struct sk_buff *skb; 413 s32 remaining; 414 u32 delta_us; 415 416 if (sock_owned_by_user(sk)) 417 return; 418 419 if (seq != tp->snd_una || !icsk->icsk_retransmits || 420 !icsk->icsk_backoff) 421 return; 422 423 skb = tcp_rtx_queue_head(sk); 424 if (WARN_ON_ONCE(!skb)) 425 return; 426 427 icsk->icsk_backoff--; 428 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT; 429 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX); 430 431 tcp_mstamp_refresh(tp); 432 delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb)); 433 remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us); 434 435 if (remaining > 0) { 436 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, 437 remaining, TCP_RTO_MAX); 438 } else { 439 /* RTO revert clocked out retransmission. 440 * Will retransmit now. 441 */ 442 tcp_retransmit_timer(sk); 443 } 444 } 445 EXPORT_SYMBOL(tcp_ld_RTO_revert); 446 447 /* 448 * This routine is called by the ICMP module when it gets some 449 * sort of error condition. If err < 0 then the socket should 450 * be closed and the error returned to the user. If err > 0 451 * it's just the icmp type << 8 | icmp code. After adjustment 452 * header points to the first 8 bytes of the tcp header. We need 453 * to find the appropriate port. 454 * 455 * The locking strategy used here is very "optimistic". When 456 * someone else accesses the socket the ICMP is just dropped 457 * and for some paths there is no check at all. 458 * A more general error queue to queue errors for later handling 459 * is probably better. 460 * 461 */ 462 463 int tcp_v4_err(struct sk_buff *skb, u32 info) 464 { 465 const struct iphdr *iph = (const struct iphdr *)skb->data; 466 struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2)); 467 struct tcp_sock *tp; 468 struct inet_sock *inet; 469 const int type = icmp_hdr(skb)->type; 470 const int code = icmp_hdr(skb)->code; 471 struct sock *sk; 472 struct request_sock *fastopen; 473 u32 seq, snd_una; 474 int err; 475 struct net *net = dev_net(skb->dev); 476 477 sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr, 478 th->dest, iph->saddr, ntohs(th->source), 479 inet_iif(skb), 0); 480 if (!sk) { 481 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); 482 return -ENOENT; 483 } 484 if (sk->sk_state == TCP_TIME_WAIT) { 485 inet_twsk_put(inet_twsk(sk)); 486 return 0; 487 } 488 seq = ntohl(th->seq); 489 if (sk->sk_state == TCP_NEW_SYN_RECV) { 490 tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB || 491 type == ICMP_TIME_EXCEEDED || 492 (type == ICMP_DEST_UNREACH && 493 (code == ICMP_NET_UNREACH || 494 code == ICMP_HOST_UNREACH))); 495 return 0; 496 } 497 498 bh_lock_sock(sk); 499 /* If too many ICMPs get dropped on busy 500 * servers this needs to be solved differently. 501 * We do take care of PMTU discovery (RFC1191) special case : 502 * we can receive locally generated ICMP messages while socket is held. 503 */ 504 if (sock_owned_by_user(sk)) { 505 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)) 506 __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS); 507 } 508 if (sk->sk_state == TCP_CLOSE) 509 goto out; 510 511 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) { 512 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); 513 goto out; 514 } 515 516 tp = tcp_sk(sk); 517 /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */ 518 fastopen = rcu_dereference(tp->fastopen_rsk); 519 snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una; 520 if (sk->sk_state != TCP_LISTEN && 521 !between(seq, snd_una, tp->snd_nxt)) { 522 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS); 523 goto out; 524 } 525 526 switch (type) { 527 case ICMP_REDIRECT: 528 if (!sock_owned_by_user(sk)) 529 do_redirect(skb, sk); 530 goto out; 531 case ICMP_SOURCE_QUENCH: 532 /* Just silently ignore these. */ 533 goto out; 534 case ICMP_PARAMETERPROB: 535 err = EPROTO; 536 break; 537 case ICMP_DEST_UNREACH: 538 if (code > NR_ICMP_UNREACH) 539 goto out; 540 541 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */ 542 /* We are not interested in TCP_LISTEN and open_requests 543 * (SYN-ACKs send out by Linux are always <576bytes so 544 * they should go through unfragmented). 545 */ 546 if (sk->sk_state == TCP_LISTEN) 547 goto out; 548 549 tp->mtu_info = info; 550 if (!sock_owned_by_user(sk)) { 551 tcp_v4_mtu_reduced(sk); 552 } else { 553 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags)) 554 sock_hold(sk); 555 } 556 goto out; 557 } 558 559 err = icmp_err_convert[code].errno; 560 /* check if this ICMP message allows revert of backoff. 561 * (see RFC 6069) 562 */ 563 if (!fastopen && 564 (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH)) 565 tcp_ld_RTO_revert(sk, seq); 566 break; 567 case ICMP_TIME_EXCEEDED: 568 err = EHOSTUNREACH; 569 break; 570 default: 571 goto out; 572 } 573 574 switch (sk->sk_state) { 575 case TCP_SYN_SENT: 576 case TCP_SYN_RECV: 577 /* Only in fast or simultaneous open. If a fast open socket is 578 * already accepted it is treated as a connected one below. 579 */ 580 if (fastopen && !fastopen->sk) 581 break; 582 583 ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th); 584 585 if (!sock_owned_by_user(sk)) { 586 sk->sk_err = err; 587 588 sk->sk_error_report(sk); 589 590 tcp_done(sk); 591 } else { 592 sk->sk_err_soft = err; 593 } 594 goto out; 595 } 596 597 /* If we've already connected we will keep trying 598 * until we time out, or the user gives up. 599 * 600 * rfc1122 4.2.3.9 allows to consider as hard errors 601 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too, 602 * but it is obsoleted by pmtu discovery). 603 * 604 * Note, that in modern internet, where routing is unreliable 605 * and in each dark corner broken firewalls sit, sending random 606 * errors ordered by their masters even this two messages finally lose 607 * their original sense (even Linux sends invalid PORT_UNREACHs) 608 * 609 * Now we are in compliance with RFCs. 610 * --ANK (980905) 611 */ 612 613 inet = inet_sk(sk); 614 if (!sock_owned_by_user(sk) && inet->recverr) { 615 sk->sk_err = err; 616 sk->sk_error_report(sk); 617 } else { /* Only an error on timeout */ 618 sk->sk_err_soft = err; 619 } 620 621 out: 622 bh_unlock_sock(sk); 623 sock_put(sk); 624 return 0; 625 } 626 627 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr) 628 { 629 struct tcphdr *th = tcp_hdr(skb); 630 631 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0); 632 skb->csum_start = skb_transport_header(skb) - skb->head; 633 skb->csum_offset = offsetof(struct tcphdr, check); 634 } 635 636 /* This routine computes an IPv4 TCP checksum. */ 637 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb) 638 { 639 const struct inet_sock *inet = inet_sk(sk); 640 641 __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr); 642 } 643 EXPORT_SYMBOL(tcp_v4_send_check); 644 645 /* 646 * This routine will send an RST to the other tcp. 647 * 648 * Someone asks: why I NEVER use socket parameters (TOS, TTL etc.) 649 * for reset. 650 * Answer: if a packet caused RST, it is not for a socket 651 * existing in our system, if it is matched to a socket, 652 * it is just duplicate segment or bug in other side's TCP. 653 * So that we build reply only basing on parameters 654 * arrived with segment. 655 * Exception: precedence violation. We do not implement it in any case. 656 */ 657 658 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) 659 { 660 const struct tcphdr *th = tcp_hdr(skb); 661 struct { 662 struct tcphdr th; 663 #ifdef CONFIG_TCP_MD5SIG 664 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)]; 665 #endif 666 } rep; 667 struct ip_reply_arg arg; 668 #ifdef CONFIG_TCP_MD5SIG 669 struct tcp_md5sig_key *key = NULL; 670 const __u8 *hash_location = NULL; 671 unsigned char newhash[16]; 672 int genhash; 673 struct sock *sk1 = NULL; 674 #endif 675 u64 transmit_time = 0; 676 struct sock *ctl_sk; 677 struct net *net; 678 679 /* Never send a reset in response to a reset. */ 680 if (th->rst) 681 return; 682 683 /* If sk not NULL, it means we did a successful lookup and incoming 684 * route had to be correct. prequeue might have dropped our dst. 685 */ 686 if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL) 687 return; 688 689 /* Swap the send and the receive. */ 690 memset(&rep, 0, sizeof(rep)); 691 rep.th.dest = th->source; 692 rep.th.source = th->dest; 693 rep.th.doff = sizeof(struct tcphdr) / 4; 694 rep.th.rst = 1; 695 696 if (th->ack) { 697 rep.th.seq = th->ack_seq; 698 } else { 699 rep.th.ack = 1; 700 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin + 701 skb->len - (th->doff << 2)); 702 } 703 704 memset(&arg, 0, sizeof(arg)); 705 arg.iov[0].iov_base = (unsigned char *)&rep; 706 arg.iov[0].iov_len = sizeof(rep.th); 707 708 net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev); 709 #ifdef CONFIG_TCP_MD5SIG 710 rcu_read_lock(); 711 hash_location = tcp_parse_md5sig_option(th); 712 if (sk && sk_fullsock(sk)) { 713 const union tcp_md5_addr *addr; 714 int l3index; 715 716 /* sdif set, means packet ingressed via a device 717 * in an L3 domain and inet_iif is set to it. 718 */ 719 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0; 720 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; 721 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET); 722 } else if (hash_location) { 723 const union tcp_md5_addr *addr; 724 int sdif = tcp_v4_sdif(skb); 725 int dif = inet_iif(skb); 726 int l3index; 727 728 /* 729 * active side is lost. Try to find listening socket through 730 * source port, and then find md5 key through listening socket. 731 * we are not loose security here: 732 * Incoming packet is checked with md5 hash with finding key, 733 * no RST generated if md5 hash doesn't match. 734 */ 735 sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0, 736 ip_hdr(skb)->saddr, 737 th->source, ip_hdr(skb)->daddr, 738 ntohs(th->source), dif, sdif); 739 /* don't send rst if it can't find key */ 740 if (!sk1) 741 goto out; 742 743 /* sdif set, means packet ingressed via a device 744 * in an L3 domain and dif is set to it. 745 */ 746 l3index = sdif ? dif : 0; 747 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; 748 key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET); 749 if (!key) 750 goto out; 751 752 753 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb); 754 if (genhash || memcmp(hash_location, newhash, 16) != 0) 755 goto out; 756 757 } 758 759 if (key) { 760 rep.opt[0] = htonl((TCPOPT_NOP << 24) | 761 (TCPOPT_NOP << 16) | 762 (TCPOPT_MD5SIG << 8) | 763 TCPOLEN_MD5SIG); 764 /* Update length and the length the header thinks exists */ 765 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED; 766 rep.th.doff = arg.iov[0].iov_len / 4; 767 768 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1], 769 key, ip_hdr(skb)->saddr, 770 ip_hdr(skb)->daddr, &rep.th); 771 } 772 #endif 773 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr, 774 ip_hdr(skb)->saddr, /* XXX */ 775 arg.iov[0].iov_len, IPPROTO_TCP, 0); 776 arg.csumoffset = offsetof(struct tcphdr, check) / 2; 777 arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0; 778 779 /* When socket is gone, all binding information is lost. 780 * routing might fail in this case. No choice here, if we choose to force 781 * input interface, we will misroute in case of asymmetric route. 782 */ 783 if (sk) { 784 arg.bound_dev_if = sk->sk_bound_dev_if; 785 if (sk_fullsock(sk)) 786 trace_tcp_send_reset(sk, skb); 787 } 788 789 BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) != 790 offsetof(struct inet_timewait_sock, tw_bound_dev_if)); 791 792 arg.tos = ip_hdr(skb)->tos; 793 arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL); 794 local_bh_disable(); 795 ctl_sk = this_cpu_read(*net->ipv4.tcp_sk); 796 if (sk) { 797 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? 798 inet_twsk(sk)->tw_mark : sk->sk_mark; 799 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ? 800 inet_twsk(sk)->tw_priority : sk->sk_priority; 801 transmit_time = tcp_transmit_time(sk); 802 } 803 ip_send_unicast_reply(ctl_sk, 804 skb, &TCP_SKB_CB(skb)->header.h4.opt, 805 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, 806 &arg, arg.iov[0].iov_len, 807 transmit_time); 808 809 ctl_sk->sk_mark = 0; 810 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS); 811 __TCP_INC_STATS(net, TCP_MIB_OUTRSTS); 812 local_bh_enable(); 813 814 #ifdef CONFIG_TCP_MD5SIG 815 out: 816 rcu_read_unlock(); 817 #endif 818 } 819 820 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states 821 outside socket context is ugly, certainly. What can I do? 822 */ 823 824 static void tcp_v4_send_ack(const struct sock *sk, 825 struct sk_buff *skb, u32 seq, u32 ack, 826 u32 win, u32 tsval, u32 tsecr, int oif, 827 struct tcp_md5sig_key *key, 828 int reply_flags, u8 tos) 829 { 830 const struct tcphdr *th = tcp_hdr(skb); 831 struct { 832 struct tcphdr th; 833 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2) 834 #ifdef CONFIG_TCP_MD5SIG 835 + (TCPOLEN_MD5SIG_ALIGNED >> 2) 836 #endif 837 ]; 838 } rep; 839 struct net *net = sock_net(sk); 840 struct ip_reply_arg arg; 841 struct sock *ctl_sk; 842 u64 transmit_time; 843 844 memset(&rep.th, 0, sizeof(struct tcphdr)); 845 memset(&arg, 0, sizeof(arg)); 846 847 arg.iov[0].iov_base = (unsigned char *)&rep; 848 arg.iov[0].iov_len = sizeof(rep.th); 849 if (tsecr) { 850 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | 851 (TCPOPT_TIMESTAMP << 8) | 852 TCPOLEN_TIMESTAMP); 853 rep.opt[1] = htonl(tsval); 854 rep.opt[2] = htonl(tsecr); 855 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED; 856 } 857 858 /* Swap the send and the receive. */ 859 rep.th.dest = th->source; 860 rep.th.source = th->dest; 861 rep.th.doff = arg.iov[0].iov_len / 4; 862 rep.th.seq = htonl(seq); 863 rep.th.ack_seq = htonl(ack); 864 rep.th.ack = 1; 865 rep.th.window = htons(win); 866 867 #ifdef CONFIG_TCP_MD5SIG 868 if (key) { 869 int offset = (tsecr) ? 3 : 0; 870 871 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) | 872 (TCPOPT_NOP << 16) | 873 (TCPOPT_MD5SIG << 8) | 874 TCPOLEN_MD5SIG); 875 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED; 876 rep.th.doff = arg.iov[0].iov_len/4; 877 878 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset], 879 key, ip_hdr(skb)->saddr, 880 ip_hdr(skb)->daddr, &rep.th); 881 } 882 #endif 883 arg.flags = reply_flags; 884 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr, 885 ip_hdr(skb)->saddr, /* XXX */ 886 arg.iov[0].iov_len, IPPROTO_TCP, 0); 887 arg.csumoffset = offsetof(struct tcphdr, check) / 2; 888 if (oif) 889 arg.bound_dev_if = oif; 890 arg.tos = tos; 891 arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL); 892 local_bh_disable(); 893 ctl_sk = this_cpu_read(*net->ipv4.tcp_sk); 894 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? 895 inet_twsk(sk)->tw_mark : sk->sk_mark; 896 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ? 897 inet_twsk(sk)->tw_priority : sk->sk_priority; 898 transmit_time = tcp_transmit_time(sk); 899 ip_send_unicast_reply(ctl_sk, 900 skb, &TCP_SKB_CB(skb)->header.h4.opt, 901 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, 902 &arg, arg.iov[0].iov_len, 903 transmit_time); 904 905 ctl_sk->sk_mark = 0; 906 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS); 907 local_bh_enable(); 908 } 909 910 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) 911 { 912 struct inet_timewait_sock *tw = inet_twsk(sk); 913 struct tcp_timewait_sock *tcptw = tcp_twsk(sk); 914 915 tcp_v4_send_ack(sk, skb, 916 tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, 917 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, 918 tcp_time_stamp_raw() + tcptw->tw_ts_offset, 919 tcptw->tw_ts_recent, 920 tw->tw_bound_dev_if, 921 tcp_twsk_md5_key(tcptw), 922 tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0, 923 tw->tw_tos 924 ); 925 926 inet_twsk_put(tw); 927 } 928 929 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, 930 struct request_sock *req) 931 { 932 const union tcp_md5_addr *addr; 933 int l3index; 934 935 /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV 936 * sk->sk_state == TCP_SYN_RECV -> for Fast Open. 937 */ 938 u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 : 939 tcp_sk(sk)->snd_nxt; 940 941 /* RFC 7323 2.3 942 * The window field (SEG.WND) of every outgoing segment, with the 943 * exception of <SYN> segments, MUST be right-shifted by 944 * Rcv.Wind.Shift bits: 945 */ 946 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; 947 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0; 948 tcp_v4_send_ack(sk, skb, seq, 949 tcp_rsk(req)->rcv_nxt, 950 req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale, 951 tcp_time_stamp_raw() + tcp_rsk(req)->ts_off, 952 req->ts_recent, 953 0, 954 tcp_md5_do_lookup(sk, l3index, addr, AF_INET), 955 inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0, 956 ip_hdr(skb)->tos); 957 } 958 959 /* 960 * Send a SYN-ACK after having received a SYN. 961 * This still operates on a request_sock only, not on a big 962 * socket. 963 */ 964 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, 965 struct flowi *fl, 966 struct request_sock *req, 967 struct tcp_fastopen_cookie *foc, 968 enum tcp_synack_type synack_type, 969 struct sk_buff *syn_skb) 970 { 971 const struct inet_request_sock *ireq = inet_rsk(req); 972 struct flowi4 fl4; 973 int err = -1; 974 struct sk_buff *skb; 975 u8 tos; 976 977 /* First, grab a route. */ 978 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL) 979 return -1; 980 981 skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb); 982 983 tos = sock_net(sk)->ipv4.sysctl_tcp_reflect_tos ? 984 tcp_rsk(req)->syn_tos : inet_sk(sk)->tos; 985 986 if (skb) { 987 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr); 988 989 rcu_read_lock(); 990 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr, 991 ireq->ir_rmt_addr, 992 rcu_dereference(ireq->ireq_opt), 993 tos & ~INET_ECN_MASK); 994 rcu_read_unlock(); 995 err = net_xmit_eval(err); 996 } 997 998 return err; 999 } 1000 1001 /* 1002 * IPv4 request_sock destructor. 1003 */ 1004 static void tcp_v4_reqsk_destructor(struct request_sock *req) 1005 { 1006 kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1)); 1007 } 1008 1009 #ifdef CONFIG_TCP_MD5SIG 1010 /* 1011 * RFC2385 MD5 checksumming requires a mapping of 1012 * IP address->MD5 Key. 1013 * We need to maintain these in the sk structure. 1014 */ 1015 1016 DEFINE_STATIC_KEY_FALSE(tcp_md5_needed); 1017 EXPORT_SYMBOL(tcp_md5_needed); 1018 1019 /* Find the Key structure for an address. */ 1020 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index, 1021 const union tcp_md5_addr *addr, 1022 int family) 1023 { 1024 const struct tcp_sock *tp = tcp_sk(sk); 1025 struct tcp_md5sig_key *key; 1026 const struct tcp_md5sig_info *md5sig; 1027 __be32 mask; 1028 struct tcp_md5sig_key *best_match = NULL; 1029 bool match; 1030 1031 /* caller either holds rcu_read_lock() or socket lock */ 1032 md5sig = rcu_dereference_check(tp->md5sig_info, 1033 lockdep_sock_is_held(sk)); 1034 if (!md5sig) 1035 return NULL; 1036 1037 hlist_for_each_entry_rcu(key, &md5sig->head, node, 1038 lockdep_sock_is_held(sk)) { 1039 if (key->family != family) 1040 continue; 1041 if (key->l3index && key->l3index != l3index) 1042 continue; 1043 if (family == AF_INET) { 1044 mask = inet_make_mask(key->prefixlen); 1045 match = (key->addr.a4.s_addr & mask) == 1046 (addr->a4.s_addr & mask); 1047 #if IS_ENABLED(CONFIG_IPV6) 1048 } else if (family == AF_INET6) { 1049 match = ipv6_prefix_equal(&key->addr.a6, &addr->a6, 1050 key->prefixlen); 1051 #endif 1052 } else { 1053 match = false; 1054 } 1055 1056 if (match && (!best_match || 1057 key->prefixlen > best_match->prefixlen)) 1058 best_match = key; 1059 } 1060 return best_match; 1061 } 1062 EXPORT_SYMBOL(__tcp_md5_do_lookup); 1063 1064 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk, 1065 const union tcp_md5_addr *addr, 1066 int family, u8 prefixlen, 1067 int l3index) 1068 { 1069 const struct tcp_sock *tp = tcp_sk(sk); 1070 struct tcp_md5sig_key *key; 1071 unsigned int size = sizeof(struct in_addr); 1072 const struct tcp_md5sig_info *md5sig; 1073 1074 /* caller either holds rcu_read_lock() or socket lock */ 1075 md5sig = rcu_dereference_check(tp->md5sig_info, 1076 lockdep_sock_is_held(sk)); 1077 if (!md5sig) 1078 return NULL; 1079 #if IS_ENABLED(CONFIG_IPV6) 1080 if (family == AF_INET6) 1081 size = sizeof(struct in6_addr); 1082 #endif 1083 hlist_for_each_entry_rcu(key, &md5sig->head, node, 1084 lockdep_sock_is_held(sk)) { 1085 if (key->family != family) 1086 continue; 1087 if (key->l3index && key->l3index != l3index) 1088 continue; 1089 if (!memcmp(&key->addr, addr, size) && 1090 key->prefixlen == prefixlen) 1091 return key; 1092 } 1093 return NULL; 1094 } 1095 1096 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk, 1097 const struct sock *addr_sk) 1098 { 1099 const union tcp_md5_addr *addr; 1100 int l3index; 1101 1102 l3index = l3mdev_master_ifindex_by_index(sock_net(sk), 1103 addr_sk->sk_bound_dev_if); 1104 addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr; 1105 return tcp_md5_do_lookup(sk, l3index, addr, AF_INET); 1106 } 1107 EXPORT_SYMBOL(tcp_v4_md5_lookup); 1108 1109 /* This can be called on a newly created socket, from other files */ 1110 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, 1111 int family, u8 prefixlen, int l3index, 1112 const u8 *newkey, u8 newkeylen, gfp_t gfp) 1113 { 1114 /* Add Key to the list */ 1115 struct tcp_md5sig_key *key; 1116 struct tcp_sock *tp = tcp_sk(sk); 1117 struct tcp_md5sig_info *md5sig; 1118 1119 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index); 1120 if (key) { 1121 /* Pre-existing entry - just update that one. 1122 * Note that the key might be used concurrently. 1123 * data_race() is telling kcsan that we do not care of 1124 * key mismatches, since changing MD5 key on live flows 1125 * can lead to packet drops. 1126 */ 1127 data_race(memcpy(key->key, newkey, newkeylen)); 1128 1129 /* Pairs with READ_ONCE() in tcp_md5_hash_key(). 1130 * Also note that a reader could catch new key->keylen value 1131 * but old key->key[], this is the reason we use __GFP_ZERO 1132 * at sock_kmalloc() time below these lines. 1133 */ 1134 WRITE_ONCE(key->keylen, newkeylen); 1135 1136 return 0; 1137 } 1138 1139 md5sig = rcu_dereference_protected(tp->md5sig_info, 1140 lockdep_sock_is_held(sk)); 1141 if (!md5sig) { 1142 md5sig = kmalloc(sizeof(*md5sig), gfp); 1143 if (!md5sig) 1144 return -ENOMEM; 1145 1146 sk_nocaps_add(sk, NETIF_F_GSO_MASK); 1147 INIT_HLIST_HEAD(&md5sig->head); 1148 rcu_assign_pointer(tp->md5sig_info, md5sig); 1149 } 1150 1151 key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO); 1152 if (!key) 1153 return -ENOMEM; 1154 if (!tcp_alloc_md5sig_pool()) { 1155 sock_kfree_s(sk, key, sizeof(*key)); 1156 return -ENOMEM; 1157 } 1158 1159 memcpy(key->key, newkey, newkeylen); 1160 key->keylen = newkeylen; 1161 key->family = family; 1162 key->prefixlen = prefixlen; 1163 key->l3index = l3index; 1164 memcpy(&key->addr, addr, 1165 (family == AF_INET6) ? sizeof(struct in6_addr) : 1166 sizeof(struct in_addr)); 1167 hlist_add_head_rcu(&key->node, &md5sig->head); 1168 return 0; 1169 } 1170 EXPORT_SYMBOL(tcp_md5_do_add); 1171 1172 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family, 1173 u8 prefixlen, int l3index) 1174 { 1175 struct tcp_md5sig_key *key; 1176 1177 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index); 1178 if (!key) 1179 return -ENOENT; 1180 hlist_del_rcu(&key->node); 1181 atomic_sub(sizeof(*key), &sk->sk_omem_alloc); 1182 kfree_rcu(key, rcu); 1183 return 0; 1184 } 1185 EXPORT_SYMBOL(tcp_md5_do_del); 1186 1187 static void tcp_clear_md5_list(struct sock *sk) 1188 { 1189 struct tcp_sock *tp = tcp_sk(sk); 1190 struct tcp_md5sig_key *key; 1191 struct hlist_node *n; 1192 struct tcp_md5sig_info *md5sig; 1193 1194 md5sig = rcu_dereference_protected(tp->md5sig_info, 1); 1195 1196 hlist_for_each_entry_safe(key, n, &md5sig->head, node) { 1197 hlist_del_rcu(&key->node); 1198 atomic_sub(sizeof(*key), &sk->sk_omem_alloc); 1199 kfree_rcu(key, rcu); 1200 } 1201 } 1202 1203 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname, 1204 sockptr_t optval, int optlen) 1205 { 1206 struct tcp_md5sig cmd; 1207 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr; 1208 const union tcp_md5_addr *addr; 1209 u8 prefixlen = 32; 1210 int l3index = 0; 1211 1212 if (optlen < sizeof(cmd)) 1213 return -EINVAL; 1214 1215 if (copy_from_sockptr(&cmd, optval, sizeof(cmd))) 1216 return -EFAULT; 1217 1218 if (sin->sin_family != AF_INET) 1219 return -EINVAL; 1220 1221 if (optname == TCP_MD5SIG_EXT && 1222 cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) { 1223 prefixlen = cmd.tcpm_prefixlen; 1224 if (prefixlen > 32) 1225 return -EINVAL; 1226 } 1227 1228 if (optname == TCP_MD5SIG_EXT && 1229 cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) { 1230 struct net_device *dev; 1231 1232 rcu_read_lock(); 1233 dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex); 1234 if (dev && netif_is_l3_master(dev)) 1235 l3index = dev->ifindex; 1236 1237 rcu_read_unlock(); 1238 1239 /* ok to reference set/not set outside of rcu; 1240 * right now device MUST be an L3 master 1241 */ 1242 if (!dev || !l3index) 1243 return -EINVAL; 1244 } 1245 1246 addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr; 1247 1248 if (!cmd.tcpm_keylen) 1249 return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index); 1250 1251 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN) 1252 return -EINVAL; 1253 1254 return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, 1255 cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL); 1256 } 1257 1258 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp, 1259 __be32 daddr, __be32 saddr, 1260 const struct tcphdr *th, int nbytes) 1261 { 1262 struct tcp4_pseudohdr *bp; 1263 struct scatterlist sg; 1264 struct tcphdr *_th; 1265 1266 bp = hp->scratch; 1267 bp->saddr = saddr; 1268 bp->daddr = daddr; 1269 bp->pad = 0; 1270 bp->protocol = IPPROTO_TCP; 1271 bp->len = cpu_to_be16(nbytes); 1272 1273 _th = (struct tcphdr *)(bp + 1); 1274 memcpy(_th, th, sizeof(*th)); 1275 _th->check = 0; 1276 1277 sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th)); 1278 ahash_request_set_crypt(hp->md5_req, &sg, NULL, 1279 sizeof(*bp) + sizeof(*th)); 1280 return crypto_ahash_update(hp->md5_req); 1281 } 1282 1283 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, 1284 __be32 daddr, __be32 saddr, const struct tcphdr *th) 1285 { 1286 struct tcp_md5sig_pool *hp; 1287 struct ahash_request *req; 1288 1289 hp = tcp_get_md5sig_pool(); 1290 if (!hp) 1291 goto clear_hash_noput; 1292 req = hp->md5_req; 1293 1294 if (crypto_ahash_init(req)) 1295 goto clear_hash; 1296 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2)) 1297 goto clear_hash; 1298 if (tcp_md5_hash_key(hp, key)) 1299 goto clear_hash; 1300 ahash_request_set_crypt(req, NULL, md5_hash, 0); 1301 if (crypto_ahash_final(req)) 1302 goto clear_hash; 1303 1304 tcp_put_md5sig_pool(); 1305 return 0; 1306 1307 clear_hash: 1308 tcp_put_md5sig_pool(); 1309 clear_hash_noput: 1310 memset(md5_hash, 0, 16); 1311 return 1; 1312 } 1313 1314 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key, 1315 const struct sock *sk, 1316 const struct sk_buff *skb) 1317 { 1318 struct tcp_md5sig_pool *hp; 1319 struct ahash_request *req; 1320 const struct tcphdr *th = tcp_hdr(skb); 1321 __be32 saddr, daddr; 1322 1323 if (sk) { /* valid for establish/request sockets */ 1324 saddr = sk->sk_rcv_saddr; 1325 daddr = sk->sk_daddr; 1326 } else { 1327 const struct iphdr *iph = ip_hdr(skb); 1328 saddr = iph->saddr; 1329 daddr = iph->daddr; 1330 } 1331 1332 hp = tcp_get_md5sig_pool(); 1333 if (!hp) 1334 goto clear_hash_noput; 1335 req = hp->md5_req; 1336 1337 if (crypto_ahash_init(req)) 1338 goto clear_hash; 1339 1340 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len)) 1341 goto clear_hash; 1342 if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2)) 1343 goto clear_hash; 1344 if (tcp_md5_hash_key(hp, key)) 1345 goto clear_hash; 1346 ahash_request_set_crypt(req, NULL, md5_hash, 0); 1347 if (crypto_ahash_final(req)) 1348 goto clear_hash; 1349 1350 tcp_put_md5sig_pool(); 1351 return 0; 1352 1353 clear_hash: 1354 tcp_put_md5sig_pool(); 1355 clear_hash_noput: 1356 memset(md5_hash, 0, 16); 1357 return 1; 1358 } 1359 EXPORT_SYMBOL(tcp_v4_md5_hash_skb); 1360 1361 #endif 1362 1363 /* Called with rcu_read_lock() */ 1364 static bool tcp_v4_inbound_md5_hash(const struct sock *sk, 1365 const struct sk_buff *skb, 1366 int dif, int sdif) 1367 { 1368 #ifdef CONFIG_TCP_MD5SIG 1369 /* 1370 * This gets called for each TCP segment that arrives 1371 * so we want to be efficient. 1372 * We have 3 drop cases: 1373 * o No MD5 hash and one expected. 1374 * o MD5 hash and we're not expecting one. 1375 * o MD5 hash and its wrong. 1376 */ 1377 const __u8 *hash_location = NULL; 1378 struct tcp_md5sig_key *hash_expected; 1379 const struct iphdr *iph = ip_hdr(skb); 1380 const struct tcphdr *th = tcp_hdr(skb); 1381 const union tcp_md5_addr *addr; 1382 unsigned char newhash[16]; 1383 int genhash, l3index; 1384 1385 /* sdif set, means packet ingressed via a device 1386 * in an L3 domain and dif is set to the l3mdev 1387 */ 1388 l3index = sdif ? dif : 0; 1389 1390 addr = (union tcp_md5_addr *)&iph->saddr; 1391 hash_expected = tcp_md5_do_lookup(sk, l3index, addr, AF_INET); 1392 hash_location = tcp_parse_md5sig_option(th); 1393 1394 /* We've parsed the options - do we have a hash? */ 1395 if (!hash_expected && !hash_location) 1396 return false; 1397 1398 if (hash_expected && !hash_location) { 1399 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND); 1400 return true; 1401 } 1402 1403 if (!hash_expected && hash_location) { 1404 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED); 1405 return true; 1406 } 1407 1408 /* Okay, so this is hash_expected and hash_location - 1409 * so we need to calculate the checksum. 1410 */ 1411 genhash = tcp_v4_md5_hash_skb(newhash, 1412 hash_expected, 1413 NULL, skb); 1414 1415 if (genhash || memcmp(hash_location, newhash, 16) != 0) { 1416 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE); 1417 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s L3 index %d\n", 1418 &iph->saddr, ntohs(th->source), 1419 &iph->daddr, ntohs(th->dest), 1420 genhash ? " tcp_v4_calc_md5_hash failed" 1421 : "", l3index); 1422 return true; 1423 } 1424 return false; 1425 #endif 1426 return false; 1427 } 1428 1429 static void tcp_v4_init_req(struct request_sock *req, 1430 const struct sock *sk_listener, 1431 struct sk_buff *skb) 1432 { 1433 struct inet_request_sock *ireq = inet_rsk(req); 1434 struct net *net = sock_net(sk_listener); 1435 1436 sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr); 1437 sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr); 1438 RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb)); 1439 } 1440 1441 static struct dst_entry *tcp_v4_route_req(const struct sock *sk, 1442 struct flowi *fl, 1443 const struct request_sock *req) 1444 { 1445 return inet_csk_route_req(sk, &fl->u.ip4, req); 1446 } 1447 1448 struct request_sock_ops tcp_request_sock_ops __read_mostly = { 1449 .family = PF_INET, 1450 .obj_size = sizeof(struct tcp_request_sock), 1451 .rtx_syn_ack = tcp_rtx_synack, 1452 .send_ack = tcp_v4_reqsk_send_ack, 1453 .destructor = tcp_v4_reqsk_destructor, 1454 .send_reset = tcp_v4_send_reset, 1455 .syn_ack_timeout = tcp_syn_ack_timeout, 1456 }; 1457 1458 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { 1459 .mss_clamp = TCP_MSS_DEFAULT, 1460 #ifdef CONFIG_TCP_MD5SIG 1461 .req_md5_lookup = tcp_v4_md5_lookup, 1462 .calc_md5_hash = tcp_v4_md5_hash_skb, 1463 #endif 1464 .init_req = tcp_v4_init_req, 1465 #ifdef CONFIG_SYN_COOKIES 1466 .cookie_init_seq = cookie_v4_init_sequence, 1467 #endif 1468 .route_req = tcp_v4_route_req, 1469 .init_seq = tcp_v4_init_seq, 1470 .init_ts_off = tcp_v4_init_ts_off, 1471 .send_synack = tcp_v4_send_synack, 1472 }; 1473 1474 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) 1475 { 1476 /* Never answer to SYNs send to broadcast or multicast */ 1477 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) 1478 goto drop; 1479 1480 return tcp_conn_request(&tcp_request_sock_ops, 1481 &tcp_request_sock_ipv4_ops, sk, skb); 1482 1483 drop: 1484 tcp_listendrop(sk); 1485 return 0; 1486 } 1487 EXPORT_SYMBOL(tcp_v4_conn_request); 1488 1489 1490 /* 1491 * The three way handshake has completed - we got a valid synack - 1492 * now create the new socket. 1493 */ 1494 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, 1495 struct request_sock *req, 1496 struct dst_entry *dst, 1497 struct request_sock *req_unhash, 1498 bool *own_req) 1499 { 1500 struct inet_request_sock *ireq; 1501 struct inet_sock *newinet; 1502 struct tcp_sock *newtp; 1503 struct sock *newsk; 1504 #ifdef CONFIG_TCP_MD5SIG 1505 const union tcp_md5_addr *addr; 1506 struct tcp_md5sig_key *key; 1507 int l3index; 1508 #endif 1509 struct ip_options_rcu *inet_opt; 1510 1511 if (sk_acceptq_is_full(sk)) 1512 goto exit_overflow; 1513 1514 newsk = tcp_create_openreq_child(sk, req, skb); 1515 if (!newsk) 1516 goto exit_nonewsk; 1517 1518 newsk->sk_gso_type = SKB_GSO_TCPV4; 1519 inet_sk_rx_dst_set(newsk, skb); 1520 1521 newtp = tcp_sk(newsk); 1522 newinet = inet_sk(newsk); 1523 ireq = inet_rsk(req); 1524 sk_daddr_set(newsk, ireq->ir_rmt_addr); 1525 sk_rcv_saddr_set(newsk, ireq->ir_loc_addr); 1526 newsk->sk_bound_dev_if = ireq->ir_iif; 1527 newinet->inet_saddr = ireq->ir_loc_addr; 1528 inet_opt = rcu_dereference(ireq->ireq_opt); 1529 RCU_INIT_POINTER(newinet->inet_opt, inet_opt); 1530 newinet->mc_index = inet_iif(skb); 1531 newinet->mc_ttl = ip_hdr(skb)->ttl; 1532 newinet->rcv_tos = ip_hdr(skb)->tos; 1533 inet_csk(newsk)->icsk_ext_hdr_len = 0; 1534 if (inet_opt) 1535 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen; 1536 newinet->inet_id = prandom_u32(); 1537 1538 /* Set ToS of the new socket based upon the value of incoming SYN. */ 1539 if (sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) 1540 newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK; 1541 1542 if (!dst) { 1543 dst = inet_csk_route_child_sock(sk, newsk, req); 1544 if (!dst) 1545 goto put_and_exit; 1546 } else { 1547 /* syncookie case : see end of cookie_v4_check() */ 1548 } 1549 sk_setup_caps(newsk, dst); 1550 1551 tcp_ca_openreq_child(newsk, dst); 1552 1553 tcp_sync_mss(newsk, dst_mtu(dst)); 1554 newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst)); 1555 1556 tcp_initialize_rcv_mss(newsk); 1557 1558 #ifdef CONFIG_TCP_MD5SIG 1559 l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif); 1560 /* Copy over the MD5 key from the original socket */ 1561 addr = (union tcp_md5_addr *)&newinet->inet_daddr; 1562 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET); 1563 if (key) { 1564 /* 1565 * We're using one, so create a matching key 1566 * on the newsk structure. If we fail to get 1567 * memory, then we end up not copying the key 1568 * across. Shucks. 1569 */ 1570 tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index, 1571 key->key, key->keylen, GFP_ATOMIC); 1572 sk_nocaps_add(newsk, NETIF_F_GSO_MASK); 1573 } 1574 #endif 1575 1576 if (__inet_inherit_port(sk, newsk) < 0) 1577 goto put_and_exit; 1578 *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash)); 1579 if (likely(*own_req)) { 1580 tcp_move_syn(newtp, req); 1581 ireq->ireq_opt = NULL; 1582 } else { 1583 newinet->inet_opt = NULL; 1584 } 1585 return newsk; 1586 1587 exit_overflow: 1588 NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); 1589 exit_nonewsk: 1590 dst_release(dst); 1591 exit: 1592 tcp_listendrop(sk); 1593 return NULL; 1594 put_and_exit: 1595 newinet->inet_opt = NULL; 1596 inet_csk_prepare_forced_close(newsk); 1597 tcp_done(newsk); 1598 goto exit; 1599 } 1600 EXPORT_SYMBOL(tcp_v4_syn_recv_sock); 1601 1602 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb) 1603 { 1604 #ifdef CONFIG_SYN_COOKIES 1605 const struct tcphdr *th = tcp_hdr(skb); 1606 1607 if (!th->syn) 1608 sk = cookie_v4_check(sk, skb); 1609 #endif 1610 return sk; 1611 } 1612 1613 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph, 1614 struct tcphdr *th, u32 *cookie) 1615 { 1616 u16 mss = 0; 1617 #ifdef CONFIG_SYN_COOKIES 1618 mss = tcp_get_syncookie_mss(&tcp_request_sock_ops, 1619 &tcp_request_sock_ipv4_ops, sk, th); 1620 if (mss) { 1621 *cookie = __cookie_v4_init_sequence(iph, th, &mss); 1622 tcp_synq_overflow(sk); 1623 } 1624 #endif 1625 return mss; 1626 } 1627 1628 /* The socket must have it's spinlock held when we get 1629 * here, unless it is a TCP_LISTEN socket. 1630 * 1631 * We have a potential double-lock case here, so even when 1632 * doing backlog processing we use the BH locking scheme. 1633 * This is because we cannot sleep with the original spinlock 1634 * held. 1635 */ 1636 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) 1637 { 1638 struct sock *rsk; 1639 1640 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ 1641 struct dst_entry *dst = sk->sk_rx_dst; 1642 1643 sock_rps_save_rxhash(sk, skb); 1644 sk_mark_napi_id(sk, skb); 1645 if (dst) { 1646 if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif || 1647 !dst->ops->check(dst, 0)) { 1648 dst_release(dst); 1649 sk->sk_rx_dst = NULL; 1650 } 1651 } 1652 tcp_rcv_established(sk, skb); 1653 return 0; 1654 } 1655 1656 if (tcp_checksum_complete(skb)) 1657 goto csum_err; 1658 1659 if (sk->sk_state == TCP_LISTEN) { 1660 struct sock *nsk = tcp_v4_cookie_check(sk, skb); 1661 1662 if (!nsk) 1663 goto discard; 1664 if (nsk != sk) { 1665 if (tcp_child_process(sk, nsk, skb)) { 1666 rsk = nsk; 1667 goto reset; 1668 } 1669 return 0; 1670 } 1671 } else 1672 sock_rps_save_rxhash(sk, skb); 1673 1674 if (tcp_rcv_state_process(sk, skb)) { 1675 rsk = sk; 1676 goto reset; 1677 } 1678 return 0; 1679 1680 reset: 1681 tcp_v4_send_reset(rsk, skb); 1682 discard: 1683 kfree_skb(skb); 1684 /* Be careful here. If this function gets more complicated and 1685 * gcc suffers from register pressure on the x86, sk (in %ebx) 1686 * might be destroyed here. This current version compiles correctly, 1687 * but you have been warned. 1688 */ 1689 return 0; 1690 1691 csum_err: 1692 TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); 1693 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); 1694 goto discard; 1695 } 1696 EXPORT_SYMBOL(tcp_v4_do_rcv); 1697 1698 int tcp_v4_early_demux(struct sk_buff *skb) 1699 { 1700 const struct iphdr *iph; 1701 const struct tcphdr *th; 1702 struct sock *sk; 1703 1704 if (skb->pkt_type != PACKET_HOST) 1705 return 0; 1706 1707 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr))) 1708 return 0; 1709 1710 iph = ip_hdr(skb); 1711 th = tcp_hdr(skb); 1712 1713 if (th->doff < sizeof(struct tcphdr) / 4) 1714 return 0; 1715 1716 sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo, 1717 iph->saddr, th->source, 1718 iph->daddr, ntohs(th->dest), 1719 skb->skb_iif, inet_sdif(skb)); 1720 if (sk) { 1721 skb->sk = sk; 1722 skb->destructor = sock_edemux; 1723 if (sk_fullsock(sk)) { 1724 struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst); 1725 1726 if (dst) 1727 dst = dst_check(dst, 0); 1728 if (dst && 1729 inet_sk(sk)->rx_dst_ifindex == skb->skb_iif) 1730 skb_dst_set_noref(skb, dst); 1731 } 1732 } 1733 return 0; 1734 } 1735 1736 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb) 1737 { 1738 u32 limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf); 1739 struct skb_shared_info *shinfo; 1740 const struct tcphdr *th; 1741 struct tcphdr *thtail; 1742 struct sk_buff *tail; 1743 unsigned int hdrlen; 1744 bool fragstolen; 1745 u32 gso_segs; 1746 int delta; 1747 1748 /* In case all data was pulled from skb frags (in __pskb_pull_tail()), 1749 * we can fix skb->truesize to its real value to avoid future drops. 1750 * This is valid because skb is not yet charged to the socket. 1751 * It has been noticed pure SACK packets were sometimes dropped 1752 * (if cooked by drivers without copybreak feature). 1753 */ 1754 skb_condense(skb); 1755 1756 skb_dst_drop(skb); 1757 1758 if (unlikely(tcp_checksum_complete(skb))) { 1759 bh_unlock_sock(sk); 1760 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); 1761 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); 1762 return true; 1763 } 1764 1765 /* Attempt coalescing to last skb in backlog, even if we are 1766 * above the limits. 1767 * This is okay because skb capacity is limited to MAX_SKB_FRAGS. 1768 */ 1769 th = (const struct tcphdr *)skb->data; 1770 hdrlen = th->doff * 4; 1771 shinfo = skb_shinfo(skb); 1772 1773 if (!shinfo->gso_size) 1774 shinfo->gso_size = skb->len - hdrlen; 1775 1776 if (!shinfo->gso_segs) 1777 shinfo->gso_segs = 1; 1778 1779 tail = sk->sk_backlog.tail; 1780 if (!tail) 1781 goto no_coalesce; 1782 thtail = (struct tcphdr *)tail->data; 1783 1784 if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq || 1785 TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield || 1786 ((TCP_SKB_CB(tail)->tcp_flags | 1787 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) || 1788 !((TCP_SKB_CB(tail)->tcp_flags & 1789 TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) || 1790 ((TCP_SKB_CB(tail)->tcp_flags ^ 1791 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) || 1792 #ifdef CONFIG_TLS_DEVICE 1793 tail->decrypted != skb->decrypted || 1794 #endif 1795 thtail->doff != th->doff || 1796 memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th))) 1797 goto no_coalesce; 1798 1799 __skb_pull(skb, hdrlen); 1800 if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) { 1801 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq; 1802 1803 if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) { 1804 TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq; 1805 thtail->window = th->window; 1806 } 1807 1808 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and 1809 * thtail->fin, so that the fast path in tcp_rcv_established() 1810 * is not entered if we append a packet with a FIN. 1811 * SYN, RST, URG are not present. 1812 * ACK is set on both packets. 1813 * PSH : we do not really care in TCP stack, 1814 * at least for 'GRO' packets. 1815 */ 1816 thtail->fin |= th->fin; 1817 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags; 1818 1819 if (TCP_SKB_CB(skb)->has_rxtstamp) { 1820 TCP_SKB_CB(tail)->has_rxtstamp = true; 1821 tail->tstamp = skb->tstamp; 1822 skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp; 1823 } 1824 1825 /* Not as strict as GRO. We only need to carry mss max value */ 1826 skb_shinfo(tail)->gso_size = max(shinfo->gso_size, 1827 skb_shinfo(tail)->gso_size); 1828 1829 gso_segs = skb_shinfo(tail)->gso_segs + shinfo->gso_segs; 1830 skb_shinfo(tail)->gso_segs = min_t(u32, gso_segs, 0xFFFF); 1831 1832 sk->sk_backlog.len += delta; 1833 __NET_INC_STATS(sock_net(sk), 1834 LINUX_MIB_TCPBACKLOGCOALESCE); 1835 kfree_skb_partial(skb, fragstolen); 1836 return false; 1837 } 1838 __skb_push(skb, hdrlen); 1839 1840 no_coalesce: 1841 /* Only socket owner can try to collapse/prune rx queues 1842 * to reduce memory overhead, so add a little headroom here. 1843 * Few sockets backlog are possibly concurrently non empty. 1844 */ 1845 limit += 64*1024; 1846 1847 if (unlikely(sk_add_backlog(sk, skb, limit))) { 1848 bh_unlock_sock(sk); 1849 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP); 1850 return true; 1851 } 1852 return false; 1853 } 1854 EXPORT_SYMBOL(tcp_add_backlog); 1855 1856 int tcp_filter(struct sock *sk, struct sk_buff *skb) 1857 { 1858 struct tcphdr *th = (struct tcphdr *)skb->data; 1859 1860 return sk_filter_trim_cap(sk, skb, th->doff * 4); 1861 } 1862 EXPORT_SYMBOL(tcp_filter); 1863 1864 static void tcp_v4_restore_cb(struct sk_buff *skb) 1865 { 1866 memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4, 1867 sizeof(struct inet_skb_parm)); 1868 } 1869 1870 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph, 1871 const struct tcphdr *th) 1872 { 1873 /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB() 1874 * barrier() makes sure compiler wont play fool^Waliasing games. 1875 */ 1876 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb), 1877 sizeof(struct inet_skb_parm)); 1878 barrier(); 1879 1880 TCP_SKB_CB(skb)->seq = ntohl(th->seq); 1881 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + 1882 skb->len - th->doff * 4); 1883 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); 1884 TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th); 1885 TCP_SKB_CB(skb)->tcp_tw_isn = 0; 1886 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph); 1887 TCP_SKB_CB(skb)->sacked = 0; 1888 TCP_SKB_CB(skb)->has_rxtstamp = 1889 skb->tstamp || skb_hwtstamps(skb)->hwtstamp; 1890 } 1891 1892 /* 1893 * From tcp_input.c 1894 */ 1895 1896 int tcp_v4_rcv(struct sk_buff *skb) 1897 { 1898 struct net *net = dev_net(skb->dev); 1899 struct sk_buff *skb_to_free; 1900 int sdif = inet_sdif(skb); 1901 int dif = inet_iif(skb); 1902 const struct iphdr *iph; 1903 const struct tcphdr *th; 1904 bool refcounted; 1905 struct sock *sk; 1906 int ret; 1907 1908 if (skb->pkt_type != PACKET_HOST) 1909 goto discard_it; 1910 1911 /* Count it even if it's bad */ 1912 __TCP_INC_STATS(net, TCP_MIB_INSEGS); 1913 1914 if (!pskb_may_pull(skb, sizeof(struct tcphdr))) 1915 goto discard_it; 1916 1917 th = (const struct tcphdr *)skb->data; 1918 1919 if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) 1920 goto bad_packet; 1921 if (!pskb_may_pull(skb, th->doff * 4)) 1922 goto discard_it; 1923 1924 /* An explanation is required here, I think. 1925 * Packet length and doff are validated by header prediction, 1926 * provided case of th->doff==0 is eliminated. 1927 * So, we defer the checks. */ 1928 1929 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo)) 1930 goto csum_error; 1931 1932 th = (const struct tcphdr *)skb->data; 1933 iph = ip_hdr(skb); 1934 lookup: 1935 sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source, 1936 th->dest, sdif, &refcounted); 1937 if (!sk) 1938 goto no_tcp_socket; 1939 1940 process: 1941 if (sk->sk_state == TCP_TIME_WAIT) 1942 goto do_time_wait; 1943 1944 if (sk->sk_state == TCP_NEW_SYN_RECV) { 1945 struct request_sock *req = inet_reqsk(sk); 1946 bool req_stolen = false; 1947 struct sock *nsk; 1948 1949 sk = req->rsk_listener; 1950 if (unlikely(tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))) { 1951 sk_drops_add(sk, skb); 1952 reqsk_put(req); 1953 goto discard_it; 1954 } 1955 if (tcp_checksum_complete(skb)) { 1956 reqsk_put(req); 1957 goto csum_error; 1958 } 1959 if (unlikely(sk->sk_state != TCP_LISTEN)) { 1960 inet_csk_reqsk_queue_drop_and_put(sk, req); 1961 goto lookup; 1962 } 1963 /* We own a reference on the listener, increase it again 1964 * as we might lose it too soon. 1965 */ 1966 sock_hold(sk); 1967 refcounted = true; 1968 nsk = NULL; 1969 if (!tcp_filter(sk, skb)) { 1970 th = (const struct tcphdr *)skb->data; 1971 iph = ip_hdr(skb); 1972 tcp_v4_fill_cb(skb, iph, th); 1973 nsk = tcp_check_req(sk, skb, req, false, &req_stolen); 1974 } 1975 if (!nsk) { 1976 reqsk_put(req); 1977 if (req_stolen) { 1978 /* Another cpu got exclusive access to req 1979 * and created a full blown socket. 1980 * Try to feed this packet to this socket 1981 * instead of discarding it. 1982 */ 1983 tcp_v4_restore_cb(skb); 1984 sock_put(sk); 1985 goto lookup; 1986 } 1987 goto discard_and_relse; 1988 } 1989 if (nsk == sk) { 1990 reqsk_put(req); 1991 tcp_v4_restore_cb(skb); 1992 } else if (tcp_child_process(sk, nsk, skb)) { 1993 tcp_v4_send_reset(nsk, skb); 1994 goto discard_and_relse; 1995 } else { 1996 sock_put(sk); 1997 return 0; 1998 } 1999 } 2000 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) { 2001 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); 2002 goto discard_and_relse; 2003 } 2004 2005 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) 2006 goto discard_and_relse; 2007 2008 if (tcp_v4_inbound_md5_hash(sk, skb, dif, sdif)) 2009 goto discard_and_relse; 2010 2011 nf_reset_ct(skb); 2012 2013 if (tcp_filter(sk, skb)) 2014 goto discard_and_relse; 2015 th = (const struct tcphdr *)skb->data; 2016 iph = ip_hdr(skb); 2017 tcp_v4_fill_cb(skb, iph, th); 2018 2019 skb->dev = NULL; 2020 2021 if (sk->sk_state == TCP_LISTEN) { 2022 ret = tcp_v4_do_rcv(sk, skb); 2023 goto put_and_return; 2024 } 2025 2026 sk_incoming_cpu_update(sk); 2027 2028 bh_lock_sock_nested(sk); 2029 tcp_segs_in(tcp_sk(sk), skb); 2030 ret = 0; 2031 if (!sock_owned_by_user(sk)) { 2032 skb_to_free = sk->sk_rx_skb_cache; 2033 sk->sk_rx_skb_cache = NULL; 2034 ret = tcp_v4_do_rcv(sk, skb); 2035 } else { 2036 if (tcp_add_backlog(sk, skb)) 2037 goto discard_and_relse; 2038 skb_to_free = NULL; 2039 } 2040 bh_unlock_sock(sk); 2041 if (skb_to_free) 2042 __kfree_skb(skb_to_free); 2043 2044 put_and_return: 2045 if (refcounted) 2046 sock_put(sk); 2047 2048 return ret; 2049 2050 no_tcp_socket: 2051 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) 2052 goto discard_it; 2053 2054 tcp_v4_fill_cb(skb, iph, th); 2055 2056 if (tcp_checksum_complete(skb)) { 2057 csum_error: 2058 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS); 2059 bad_packet: 2060 __TCP_INC_STATS(net, TCP_MIB_INERRS); 2061 } else { 2062 tcp_v4_send_reset(NULL, skb); 2063 } 2064 2065 discard_it: 2066 /* Discard frame. */ 2067 kfree_skb(skb); 2068 return 0; 2069 2070 discard_and_relse: 2071 sk_drops_add(sk, skb); 2072 if (refcounted) 2073 sock_put(sk); 2074 goto discard_it; 2075 2076 do_time_wait: 2077 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { 2078 inet_twsk_put(inet_twsk(sk)); 2079 goto discard_it; 2080 } 2081 2082 tcp_v4_fill_cb(skb, iph, th); 2083 2084 if (tcp_checksum_complete(skb)) { 2085 inet_twsk_put(inet_twsk(sk)); 2086 goto csum_error; 2087 } 2088 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) { 2089 case TCP_TW_SYN: { 2090 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev), 2091 &tcp_hashinfo, skb, 2092 __tcp_hdrlen(th), 2093 iph->saddr, th->source, 2094 iph->daddr, th->dest, 2095 inet_iif(skb), 2096 sdif); 2097 if (sk2) { 2098 inet_twsk_deschedule_put(inet_twsk(sk)); 2099 sk = sk2; 2100 tcp_v4_restore_cb(skb); 2101 refcounted = false; 2102 goto process; 2103 } 2104 } 2105 /* to ACK */ 2106 fallthrough; 2107 case TCP_TW_ACK: 2108 tcp_v4_timewait_ack(sk, skb); 2109 break; 2110 case TCP_TW_RST: 2111 tcp_v4_send_reset(sk, skb); 2112 inet_twsk_deschedule_put(inet_twsk(sk)); 2113 goto discard_it; 2114 case TCP_TW_SUCCESS:; 2115 } 2116 goto discard_it; 2117 } 2118 2119 static struct timewait_sock_ops tcp_timewait_sock_ops = { 2120 .twsk_obj_size = sizeof(struct tcp_timewait_sock), 2121 .twsk_unique = tcp_twsk_unique, 2122 .twsk_destructor= tcp_twsk_destructor, 2123 }; 2124 2125 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) 2126 { 2127 struct dst_entry *dst = skb_dst(skb); 2128 2129 if (dst && dst_hold_safe(dst)) { 2130 sk->sk_rx_dst = dst; 2131 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif; 2132 } 2133 } 2134 EXPORT_SYMBOL(inet_sk_rx_dst_set); 2135 2136 const struct inet_connection_sock_af_ops ipv4_specific = { 2137 .queue_xmit = ip_queue_xmit, 2138 .send_check = tcp_v4_send_check, 2139 .rebuild_header = inet_sk_rebuild_header, 2140 .sk_rx_dst_set = inet_sk_rx_dst_set, 2141 .conn_request = tcp_v4_conn_request, 2142 .syn_recv_sock = tcp_v4_syn_recv_sock, 2143 .net_header_len = sizeof(struct iphdr), 2144 .setsockopt = ip_setsockopt, 2145 .getsockopt = ip_getsockopt, 2146 .addr2sockaddr = inet_csk_addr2sockaddr, 2147 .sockaddr_len = sizeof(struct sockaddr_in), 2148 .mtu_reduced = tcp_v4_mtu_reduced, 2149 }; 2150 EXPORT_SYMBOL(ipv4_specific); 2151 2152 #ifdef CONFIG_TCP_MD5SIG 2153 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = { 2154 .md5_lookup = tcp_v4_md5_lookup, 2155 .calc_md5_hash = tcp_v4_md5_hash_skb, 2156 .md5_parse = tcp_v4_parse_md5_keys, 2157 }; 2158 #endif 2159 2160 /* NOTE: A lot of things set to zero explicitly by call to 2161 * sk_alloc() so need not be done here. 2162 */ 2163 static int tcp_v4_init_sock(struct sock *sk) 2164 { 2165 struct inet_connection_sock *icsk = inet_csk(sk); 2166 2167 tcp_init_sock(sk); 2168 2169 icsk->icsk_af_ops = &ipv4_specific; 2170 2171 #ifdef CONFIG_TCP_MD5SIG 2172 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific; 2173 #endif 2174 2175 return 0; 2176 } 2177 2178 void tcp_v4_destroy_sock(struct sock *sk) 2179 { 2180 struct tcp_sock *tp = tcp_sk(sk); 2181 2182 trace_tcp_destroy_sock(sk); 2183 2184 tcp_clear_xmit_timers(sk); 2185 2186 tcp_cleanup_congestion_control(sk); 2187 2188 tcp_cleanup_ulp(sk); 2189 2190 /* Cleanup up the write buffer. */ 2191 tcp_write_queue_purge(sk); 2192 2193 /* Check if we want to disable active TFO */ 2194 tcp_fastopen_active_disable_ofo_check(sk); 2195 2196 /* Cleans up our, hopefully empty, out_of_order_queue. */ 2197 skb_rbtree_purge(&tp->out_of_order_queue); 2198 2199 #ifdef CONFIG_TCP_MD5SIG 2200 /* Clean up the MD5 key list, if any */ 2201 if (tp->md5sig_info) { 2202 tcp_clear_md5_list(sk); 2203 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu); 2204 tp->md5sig_info = NULL; 2205 } 2206 #endif 2207 2208 /* Clean up a referenced TCP bind bucket. */ 2209 if (inet_csk(sk)->icsk_bind_hash) 2210 inet_put_port(sk); 2211 2212 BUG_ON(rcu_access_pointer(tp->fastopen_rsk)); 2213 2214 /* If socket is aborted during connect operation */ 2215 tcp_free_fastopen_req(tp); 2216 tcp_fastopen_destroy_cipher(sk); 2217 tcp_saved_syn_free(tp); 2218 2219 sk_sockets_allocated_dec(sk); 2220 } 2221 EXPORT_SYMBOL(tcp_v4_destroy_sock); 2222 2223 #ifdef CONFIG_PROC_FS 2224 /* Proc filesystem TCP sock list dumping. */ 2225 2226 /* 2227 * Get next listener socket follow cur. If cur is NULL, get first socket 2228 * starting from bucket given in st->bucket; when st->bucket is zero the 2229 * very first socket in the hash table is returned. 2230 */ 2231 static void *listening_get_next(struct seq_file *seq, void *cur) 2232 { 2233 struct tcp_seq_afinfo *afinfo; 2234 struct tcp_iter_state *st = seq->private; 2235 struct net *net = seq_file_net(seq); 2236 struct inet_listen_hashbucket *ilb; 2237 struct hlist_nulls_node *node; 2238 struct sock *sk = cur; 2239 2240 if (st->bpf_seq_afinfo) 2241 afinfo = st->bpf_seq_afinfo; 2242 else 2243 afinfo = PDE_DATA(file_inode(seq->file)); 2244 2245 if (!sk) { 2246 get_head: 2247 ilb = &tcp_hashinfo.listening_hash[st->bucket]; 2248 spin_lock(&ilb->lock); 2249 sk = sk_nulls_head(&ilb->nulls_head); 2250 st->offset = 0; 2251 goto get_sk; 2252 } 2253 ilb = &tcp_hashinfo.listening_hash[st->bucket]; 2254 ++st->num; 2255 ++st->offset; 2256 2257 sk = sk_nulls_next(sk); 2258 get_sk: 2259 sk_nulls_for_each_from(sk, node) { 2260 if (!net_eq(sock_net(sk), net)) 2261 continue; 2262 if (afinfo->family == AF_UNSPEC || 2263 sk->sk_family == afinfo->family) 2264 return sk; 2265 } 2266 spin_unlock(&ilb->lock); 2267 st->offset = 0; 2268 if (++st->bucket < INET_LHTABLE_SIZE) 2269 goto get_head; 2270 return NULL; 2271 } 2272 2273 static void *listening_get_idx(struct seq_file *seq, loff_t *pos) 2274 { 2275 struct tcp_iter_state *st = seq->private; 2276 void *rc; 2277 2278 st->bucket = 0; 2279 st->offset = 0; 2280 rc = listening_get_next(seq, NULL); 2281 2282 while (rc && *pos) { 2283 rc = listening_get_next(seq, rc); 2284 --*pos; 2285 } 2286 return rc; 2287 } 2288 2289 static inline bool empty_bucket(const struct tcp_iter_state *st) 2290 { 2291 return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain); 2292 } 2293 2294 /* 2295 * Get first established socket starting from bucket given in st->bucket. 2296 * If st->bucket is zero, the very first socket in the hash is returned. 2297 */ 2298 static void *established_get_first(struct seq_file *seq) 2299 { 2300 struct tcp_seq_afinfo *afinfo; 2301 struct tcp_iter_state *st = seq->private; 2302 struct net *net = seq_file_net(seq); 2303 void *rc = NULL; 2304 2305 if (st->bpf_seq_afinfo) 2306 afinfo = st->bpf_seq_afinfo; 2307 else 2308 afinfo = PDE_DATA(file_inode(seq->file)); 2309 2310 st->offset = 0; 2311 for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) { 2312 struct sock *sk; 2313 struct hlist_nulls_node *node; 2314 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket); 2315 2316 /* Lockless fast path for the common case of empty buckets */ 2317 if (empty_bucket(st)) 2318 continue; 2319 2320 spin_lock_bh(lock); 2321 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) { 2322 if ((afinfo->family != AF_UNSPEC && 2323 sk->sk_family != afinfo->family) || 2324 !net_eq(sock_net(sk), net)) { 2325 continue; 2326 } 2327 rc = sk; 2328 goto out; 2329 } 2330 spin_unlock_bh(lock); 2331 } 2332 out: 2333 return rc; 2334 } 2335 2336 static void *established_get_next(struct seq_file *seq, void *cur) 2337 { 2338 struct tcp_seq_afinfo *afinfo; 2339 struct sock *sk = cur; 2340 struct hlist_nulls_node *node; 2341 struct tcp_iter_state *st = seq->private; 2342 struct net *net = seq_file_net(seq); 2343 2344 if (st->bpf_seq_afinfo) 2345 afinfo = st->bpf_seq_afinfo; 2346 else 2347 afinfo = PDE_DATA(file_inode(seq->file)); 2348 2349 ++st->num; 2350 ++st->offset; 2351 2352 sk = sk_nulls_next(sk); 2353 2354 sk_nulls_for_each_from(sk, node) { 2355 if ((afinfo->family == AF_UNSPEC || 2356 sk->sk_family == afinfo->family) && 2357 net_eq(sock_net(sk), net)) 2358 return sk; 2359 } 2360 2361 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); 2362 ++st->bucket; 2363 return established_get_first(seq); 2364 } 2365 2366 static void *established_get_idx(struct seq_file *seq, loff_t pos) 2367 { 2368 struct tcp_iter_state *st = seq->private; 2369 void *rc; 2370 2371 st->bucket = 0; 2372 rc = established_get_first(seq); 2373 2374 while (rc && pos) { 2375 rc = established_get_next(seq, rc); 2376 --pos; 2377 } 2378 return rc; 2379 } 2380 2381 static void *tcp_get_idx(struct seq_file *seq, loff_t pos) 2382 { 2383 void *rc; 2384 struct tcp_iter_state *st = seq->private; 2385 2386 st->state = TCP_SEQ_STATE_LISTENING; 2387 rc = listening_get_idx(seq, &pos); 2388 2389 if (!rc) { 2390 st->state = TCP_SEQ_STATE_ESTABLISHED; 2391 rc = established_get_idx(seq, pos); 2392 } 2393 2394 return rc; 2395 } 2396 2397 static void *tcp_seek_last_pos(struct seq_file *seq) 2398 { 2399 struct tcp_iter_state *st = seq->private; 2400 int offset = st->offset; 2401 int orig_num = st->num; 2402 void *rc = NULL; 2403 2404 switch (st->state) { 2405 case TCP_SEQ_STATE_LISTENING: 2406 if (st->bucket >= INET_LHTABLE_SIZE) 2407 break; 2408 st->state = TCP_SEQ_STATE_LISTENING; 2409 rc = listening_get_next(seq, NULL); 2410 while (offset-- && rc) 2411 rc = listening_get_next(seq, rc); 2412 if (rc) 2413 break; 2414 st->bucket = 0; 2415 st->state = TCP_SEQ_STATE_ESTABLISHED; 2416 fallthrough; 2417 case TCP_SEQ_STATE_ESTABLISHED: 2418 if (st->bucket > tcp_hashinfo.ehash_mask) 2419 break; 2420 rc = established_get_first(seq); 2421 while (offset-- && rc) 2422 rc = established_get_next(seq, rc); 2423 } 2424 2425 st->num = orig_num; 2426 2427 return rc; 2428 } 2429 2430 void *tcp_seq_start(struct seq_file *seq, loff_t *pos) 2431 { 2432 struct tcp_iter_state *st = seq->private; 2433 void *rc; 2434 2435 if (*pos && *pos == st->last_pos) { 2436 rc = tcp_seek_last_pos(seq); 2437 if (rc) 2438 goto out; 2439 } 2440 2441 st->state = TCP_SEQ_STATE_LISTENING; 2442 st->num = 0; 2443 st->bucket = 0; 2444 st->offset = 0; 2445 rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; 2446 2447 out: 2448 st->last_pos = *pos; 2449 return rc; 2450 } 2451 EXPORT_SYMBOL(tcp_seq_start); 2452 2453 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) 2454 { 2455 struct tcp_iter_state *st = seq->private; 2456 void *rc = NULL; 2457 2458 if (v == SEQ_START_TOKEN) { 2459 rc = tcp_get_idx(seq, 0); 2460 goto out; 2461 } 2462 2463 switch (st->state) { 2464 case TCP_SEQ_STATE_LISTENING: 2465 rc = listening_get_next(seq, v); 2466 if (!rc) { 2467 st->state = TCP_SEQ_STATE_ESTABLISHED; 2468 st->bucket = 0; 2469 st->offset = 0; 2470 rc = established_get_first(seq); 2471 } 2472 break; 2473 case TCP_SEQ_STATE_ESTABLISHED: 2474 rc = established_get_next(seq, v); 2475 break; 2476 } 2477 out: 2478 ++*pos; 2479 st->last_pos = *pos; 2480 return rc; 2481 } 2482 EXPORT_SYMBOL(tcp_seq_next); 2483 2484 void tcp_seq_stop(struct seq_file *seq, void *v) 2485 { 2486 struct tcp_iter_state *st = seq->private; 2487 2488 switch (st->state) { 2489 case TCP_SEQ_STATE_LISTENING: 2490 if (v != SEQ_START_TOKEN) 2491 spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock); 2492 break; 2493 case TCP_SEQ_STATE_ESTABLISHED: 2494 if (v) 2495 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); 2496 break; 2497 } 2498 } 2499 EXPORT_SYMBOL(tcp_seq_stop); 2500 2501 static void get_openreq4(const struct request_sock *req, 2502 struct seq_file *f, int i) 2503 { 2504 const struct inet_request_sock *ireq = inet_rsk(req); 2505 long delta = req->rsk_timer.expires - jiffies; 2506 2507 seq_printf(f, "%4d: %08X:%04X %08X:%04X" 2508 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK", 2509 i, 2510 ireq->ir_loc_addr, 2511 ireq->ir_num, 2512 ireq->ir_rmt_addr, 2513 ntohs(ireq->ir_rmt_port), 2514 TCP_SYN_RECV, 2515 0, 0, /* could print option size, but that is af dependent. */ 2516 1, /* timers active (only the expire timer) */ 2517 jiffies_delta_to_clock_t(delta), 2518 req->num_timeout, 2519 from_kuid_munged(seq_user_ns(f), 2520 sock_i_uid(req->rsk_listener)), 2521 0, /* non standard timer */ 2522 0, /* open_requests have no inode */ 2523 0, 2524 req); 2525 } 2526 2527 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) 2528 { 2529 int timer_active; 2530 unsigned long timer_expires; 2531 const struct tcp_sock *tp = tcp_sk(sk); 2532 const struct inet_connection_sock *icsk = inet_csk(sk); 2533 const struct inet_sock *inet = inet_sk(sk); 2534 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq; 2535 __be32 dest = inet->inet_daddr; 2536 __be32 src = inet->inet_rcv_saddr; 2537 __u16 destp = ntohs(inet->inet_dport); 2538 __u16 srcp = ntohs(inet->inet_sport); 2539 int rx_queue; 2540 int state; 2541 2542 if (icsk->icsk_pending == ICSK_TIME_RETRANS || 2543 icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT || 2544 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { 2545 timer_active = 1; 2546 timer_expires = icsk->icsk_timeout; 2547 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) { 2548 timer_active = 4; 2549 timer_expires = icsk->icsk_timeout; 2550 } else if (timer_pending(&sk->sk_timer)) { 2551 timer_active = 2; 2552 timer_expires = sk->sk_timer.expires; 2553 } else { 2554 timer_active = 0; 2555 timer_expires = jiffies; 2556 } 2557 2558 state = inet_sk_state_load(sk); 2559 if (state == TCP_LISTEN) 2560 rx_queue = READ_ONCE(sk->sk_ack_backlog); 2561 else 2562 /* Because we don't lock the socket, 2563 * we might find a transient negative value. 2564 */ 2565 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) - 2566 READ_ONCE(tp->copied_seq), 0); 2567 2568 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX " 2569 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d", 2570 i, src, srcp, dest, destp, state, 2571 READ_ONCE(tp->write_seq) - tp->snd_una, 2572 rx_queue, 2573 timer_active, 2574 jiffies_delta_to_clock_t(timer_expires - jiffies), 2575 icsk->icsk_retransmits, 2576 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)), 2577 icsk->icsk_probes_out, 2578 sock_i_ino(sk), 2579 refcount_read(&sk->sk_refcnt), sk, 2580 jiffies_to_clock_t(icsk->icsk_rto), 2581 jiffies_to_clock_t(icsk->icsk_ack.ato), 2582 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk), 2583 tp->snd_cwnd, 2584 state == TCP_LISTEN ? 2585 fastopenq->max_qlen : 2586 (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh)); 2587 } 2588 2589 static void get_timewait4_sock(const struct inet_timewait_sock *tw, 2590 struct seq_file *f, int i) 2591 { 2592 long delta = tw->tw_timer.expires - jiffies; 2593 __be32 dest, src; 2594 __u16 destp, srcp; 2595 2596 dest = tw->tw_daddr; 2597 src = tw->tw_rcv_saddr; 2598 destp = ntohs(tw->tw_dport); 2599 srcp = ntohs(tw->tw_sport); 2600 2601 seq_printf(f, "%4d: %08X:%04X %08X:%04X" 2602 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK", 2603 i, src, srcp, dest, destp, tw->tw_substate, 0, 0, 2604 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0, 2605 refcount_read(&tw->tw_refcnt), tw); 2606 } 2607 2608 #define TMPSZ 150 2609 2610 static int tcp4_seq_show(struct seq_file *seq, void *v) 2611 { 2612 struct tcp_iter_state *st; 2613 struct sock *sk = v; 2614 2615 seq_setwidth(seq, TMPSZ - 1); 2616 if (v == SEQ_START_TOKEN) { 2617 seq_puts(seq, " sl local_address rem_address st tx_queue " 2618 "rx_queue tr tm->when retrnsmt uid timeout " 2619 "inode"); 2620 goto out; 2621 } 2622 st = seq->private; 2623 2624 if (sk->sk_state == TCP_TIME_WAIT) 2625 get_timewait4_sock(v, seq, st->num); 2626 else if (sk->sk_state == TCP_NEW_SYN_RECV) 2627 get_openreq4(v, seq, st->num); 2628 else 2629 get_tcp4_sock(v, seq, st->num); 2630 out: 2631 seq_pad(seq, '\n'); 2632 return 0; 2633 } 2634 2635 #ifdef CONFIG_BPF_SYSCALL 2636 struct bpf_iter__tcp { 2637 __bpf_md_ptr(struct bpf_iter_meta *, meta); 2638 __bpf_md_ptr(struct sock_common *, sk_common); 2639 uid_t uid __aligned(8); 2640 }; 2641 2642 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, 2643 struct sock_common *sk_common, uid_t uid) 2644 { 2645 struct bpf_iter__tcp ctx; 2646 2647 meta->seq_num--; /* skip SEQ_START_TOKEN */ 2648 ctx.meta = meta; 2649 ctx.sk_common = sk_common; 2650 ctx.uid = uid; 2651 return bpf_iter_run_prog(prog, &ctx); 2652 } 2653 2654 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v) 2655 { 2656 struct bpf_iter_meta meta; 2657 struct bpf_prog *prog; 2658 struct sock *sk = v; 2659 uid_t uid; 2660 2661 if (v == SEQ_START_TOKEN) 2662 return 0; 2663 2664 if (sk->sk_state == TCP_TIME_WAIT) { 2665 uid = 0; 2666 } else if (sk->sk_state == TCP_NEW_SYN_RECV) { 2667 const struct request_sock *req = v; 2668 2669 uid = from_kuid_munged(seq_user_ns(seq), 2670 sock_i_uid(req->rsk_listener)); 2671 } else { 2672 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)); 2673 } 2674 2675 meta.seq = seq; 2676 prog = bpf_iter_get_info(&meta, false); 2677 return tcp_prog_seq_show(prog, &meta, v, uid); 2678 } 2679 2680 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v) 2681 { 2682 struct bpf_iter_meta meta; 2683 struct bpf_prog *prog; 2684 2685 if (!v) { 2686 meta.seq = seq; 2687 prog = bpf_iter_get_info(&meta, true); 2688 if (prog) 2689 (void)tcp_prog_seq_show(prog, &meta, v, 0); 2690 } 2691 2692 tcp_seq_stop(seq, v); 2693 } 2694 2695 static const struct seq_operations bpf_iter_tcp_seq_ops = { 2696 .show = bpf_iter_tcp_seq_show, 2697 .start = tcp_seq_start, 2698 .next = tcp_seq_next, 2699 .stop = bpf_iter_tcp_seq_stop, 2700 }; 2701 #endif 2702 2703 static const struct seq_operations tcp4_seq_ops = { 2704 .show = tcp4_seq_show, 2705 .start = tcp_seq_start, 2706 .next = tcp_seq_next, 2707 .stop = tcp_seq_stop, 2708 }; 2709 2710 static struct tcp_seq_afinfo tcp4_seq_afinfo = { 2711 .family = AF_INET, 2712 }; 2713 2714 static int __net_init tcp4_proc_init_net(struct net *net) 2715 { 2716 if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops, 2717 sizeof(struct tcp_iter_state), &tcp4_seq_afinfo)) 2718 return -ENOMEM; 2719 return 0; 2720 } 2721 2722 static void __net_exit tcp4_proc_exit_net(struct net *net) 2723 { 2724 remove_proc_entry("tcp", net->proc_net); 2725 } 2726 2727 static struct pernet_operations tcp4_net_ops = { 2728 .init = tcp4_proc_init_net, 2729 .exit = tcp4_proc_exit_net, 2730 }; 2731 2732 int __init tcp4_proc_init(void) 2733 { 2734 return register_pernet_subsys(&tcp4_net_ops); 2735 } 2736 2737 void tcp4_proc_exit(void) 2738 { 2739 unregister_pernet_subsys(&tcp4_net_ops); 2740 } 2741 #endif /* CONFIG_PROC_FS */ 2742 2743 struct proto tcp_prot = { 2744 .name = "TCP", 2745 .owner = THIS_MODULE, 2746 .close = tcp_close, 2747 .pre_connect = tcp_v4_pre_connect, 2748 .connect = tcp_v4_connect, 2749 .disconnect = tcp_disconnect, 2750 .accept = inet_csk_accept, 2751 .ioctl = tcp_ioctl, 2752 .init = tcp_v4_init_sock, 2753 .destroy = tcp_v4_destroy_sock, 2754 .shutdown = tcp_shutdown, 2755 .setsockopt = tcp_setsockopt, 2756 .getsockopt = tcp_getsockopt, 2757 .keepalive = tcp_set_keepalive, 2758 .recvmsg = tcp_recvmsg, 2759 .sendmsg = tcp_sendmsg, 2760 .sendpage = tcp_sendpage, 2761 .backlog_rcv = tcp_v4_do_rcv, 2762 .release_cb = tcp_release_cb, 2763 .hash = inet_hash, 2764 .unhash = inet_unhash, 2765 .get_port = inet_csk_get_port, 2766 .enter_memory_pressure = tcp_enter_memory_pressure, 2767 .leave_memory_pressure = tcp_leave_memory_pressure, 2768 .stream_memory_free = tcp_stream_memory_free, 2769 .sockets_allocated = &tcp_sockets_allocated, 2770 .orphan_count = &tcp_orphan_count, 2771 .memory_allocated = &tcp_memory_allocated, 2772 .memory_pressure = &tcp_memory_pressure, 2773 .sysctl_mem = sysctl_tcp_mem, 2774 .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem), 2775 .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), 2776 .max_header = MAX_TCP_HEADER, 2777 .obj_size = sizeof(struct tcp_sock), 2778 .slab_flags = SLAB_TYPESAFE_BY_RCU, 2779 .twsk_prot = &tcp_timewait_sock_ops, 2780 .rsk_prot = &tcp_request_sock_ops, 2781 .h.hashinfo = &tcp_hashinfo, 2782 .no_autobind = true, 2783 .diag_destroy = tcp_abort, 2784 }; 2785 EXPORT_SYMBOL(tcp_prot); 2786 2787 static void __net_exit tcp_sk_exit(struct net *net) 2788 { 2789 int cpu; 2790 2791 if (net->ipv4.tcp_congestion_control) 2792 bpf_module_put(net->ipv4.tcp_congestion_control, 2793 net->ipv4.tcp_congestion_control->owner); 2794 2795 for_each_possible_cpu(cpu) 2796 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu)); 2797 free_percpu(net->ipv4.tcp_sk); 2798 } 2799 2800 static int __net_init tcp_sk_init(struct net *net) 2801 { 2802 int res, cpu, cnt; 2803 2804 net->ipv4.tcp_sk = alloc_percpu(struct sock *); 2805 if (!net->ipv4.tcp_sk) 2806 return -ENOMEM; 2807 2808 for_each_possible_cpu(cpu) { 2809 struct sock *sk; 2810 2811 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW, 2812 IPPROTO_TCP, net); 2813 if (res) 2814 goto fail; 2815 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); 2816 2817 /* Please enforce IP_DF and IPID==0 for RST and 2818 * ACK sent in SYN-RECV and TIME-WAIT state. 2819 */ 2820 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO; 2821 2822 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk; 2823 } 2824 2825 net->ipv4.sysctl_tcp_ecn = 2; 2826 net->ipv4.sysctl_tcp_ecn_fallback = 1; 2827 2828 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS; 2829 net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS; 2830 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD; 2831 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL; 2832 net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS; 2833 2834 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME; 2835 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES; 2836 net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL; 2837 2838 net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES; 2839 net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES; 2840 net->ipv4.sysctl_tcp_syncookies = 1; 2841 net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH; 2842 net->ipv4.sysctl_tcp_retries1 = TCP_RETR1; 2843 net->ipv4.sysctl_tcp_retries2 = TCP_RETR2; 2844 net->ipv4.sysctl_tcp_orphan_retries = 0; 2845 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT; 2846 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX; 2847 net->ipv4.sysctl_tcp_tw_reuse = 2; 2848 net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1; 2849 2850 cnt = tcp_hashinfo.ehash_mask + 1; 2851 net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2; 2852 net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo; 2853 2854 net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128); 2855 net->ipv4.sysctl_tcp_sack = 1; 2856 net->ipv4.sysctl_tcp_window_scaling = 1; 2857 net->ipv4.sysctl_tcp_timestamps = 1; 2858 net->ipv4.sysctl_tcp_early_retrans = 3; 2859 net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION; 2860 net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */ 2861 net->ipv4.sysctl_tcp_retrans_collapse = 1; 2862 net->ipv4.sysctl_tcp_max_reordering = 300; 2863 net->ipv4.sysctl_tcp_dsack = 1; 2864 net->ipv4.sysctl_tcp_app_win = 31; 2865 net->ipv4.sysctl_tcp_adv_win_scale = 1; 2866 net->ipv4.sysctl_tcp_frto = 2; 2867 net->ipv4.sysctl_tcp_moderate_rcvbuf = 1; 2868 /* This limits the percentage of the congestion window which we 2869 * will allow a single TSO frame to consume. Building TSO frames 2870 * which are too large can cause TCP streams to be bursty. 2871 */ 2872 net->ipv4.sysctl_tcp_tso_win_divisor = 3; 2873 /* Default TSQ limit of 16 TSO segments */ 2874 net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536; 2875 /* rfc5961 challenge ack rate limiting */ 2876 net->ipv4.sysctl_tcp_challenge_ack_limit = 1000; 2877 net->ipv4.sysctl_tcp_min_tso_segs = 2; 2878 net->ipv4.sysctl_tcp_min_rtt_wlen = 300; 2879 net->ipv4.sysctl_tcp_autocorking = 1; 2880 net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2; 2881 net->ipv4.sysctl_tcp_pacing_ss_ratio = 200; 2882 net->ipv4.sysctl_tcp_pacing_ca_ratio = 120; 2883 if (net != &init_net) { 2884 memcpy(net->ipv4.sysctl_tcp_rmem, 2885 init_net.ipv4.sysctl_tcp_rmem, 2886 sizeof(init_net.ipv4.sysctl_tcp_rmem)); 2887 memcpy(net->ipv4.sysctl_tcp_wmem, 2888 init_net.ipv4.sysctl_tcp_wmem, 2889 sizeof(init_net.ipv4.sysctl_tcp_wmem)); 2890 } 2891 net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC; 2892 net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC; 2893 net->ipv4.sysctl_tcp_comp_sack_nr = 44; 2894 net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; 2895 spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock); 2896 net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60; 2897 atomic_set(&net->ipv4.tfo_active_disable_times, 0); 2898 2899 /* Reno is always built in */ 2900 if (!net_eq(net, &init_net) && 2901 bpf_try_module_get(init_net.ipv4.tcp_congestion_control, 2902 init_net.ipv4.tcp_congestion_control->owner)) 2903 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control; 2904 else 2905 net->ipv4.tcp_congestion_control = &tcp_reno; 2906 2907 return 0; 2908 fail: 2909 tcp_sk_exit(net); 2910 2911 return res; 2912 } 2913 2914 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list) 2915 { 2916 struct net *net; 2917 2918 inet_twsk_purge(&tcp_hashinfo, AF_INET); 2919 2920 list_for_each_entry(net, net_exit_list, exit_list) 2921 tcp_fastopen_ctx_destroy(net); 2922 } 2923 2924 static struct pernet_operations __net_initdata tcp_sk_ops = { 2925 .init = tcp_sk_init, 2926 .exit = tcp_sk_exit, 2927 .exit_batch = tcp_sk_exit_batch, 2928 }; 2929 2930 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 2931 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta, 2932 struct sock_common *sk_common, uid_t uid) 2933 2934 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux) 2935 { 2936 struct tcp_iter_state *st = priv_data; 2937 struct tcp_seq_afinfo *afinfo; 2938 int ret; 2939 2940 afinfo = kmalloc(sizeof(*afinfo), GFP_USER | __GFP_NOWARN); 2941 if (!afinfo) 2942 return -ENOMEM; 2943 2944 afinfo->family = AF_UNSPEC; 2945 st->bpf_seq_afinfo = afinfo; 2946 ret = bpf_iter_init_seq_net(priv_data, aux); 2947 if (ret) 2948 kfree(afinfo); 2949 return ret; 2950 } 2951 2952 static void bpf_iter_fini_tcp(void *priv_data) 2953 { 2954 struct tcp_iter_state *st = priv_data; 2955 2956 kfree(st->bpf_seq_afinfo); 2957 bpf_iter_fini_seq_net(priv_data); 2958 } 2959 2960 static const struct bpf_iter_seq_info tcp_seq_info = { 2961 .seq_ops = &bpf_iter_tcp_seq_ops, 2962 .init_seq_private = bpf_iter_init_tcp, 2963 .fini_seq_private = bpf_iter_fini_tcp, 2964 .seq_priv_size = sizeof(struct tcp_iter_state), 2965 }; 2966 2967 static struct bpf_iter_reg tcp_reg_info = { 2968 .target = "tcp", 2969 .ctx_arg_info_size = 1, 2970 .ctx_arg_info = { 2971 { offsetof(struct bpf_iter__tcp, sk_common), 2972 PTR_TO_BTF_ID_OR_NULL }, 2973 }, 2974 .seq_info = &tcp_seq_info, 2975 }; 2976 2977 static void __init bpf_iter_register(void) 2978 { 2979 tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON]; 2980 if (bpf_iter_reg_target(&tcp_reg_info)) 2981 pr_warn("Warning: could not register bpf iterator tcp\n"); 2982 } 2983 2984 #endif 2985 2986 void __init tcp_v4_init(void) 2987 { 2988 if (register_pernet_subsys(&tcp_sk_ops)) 2989 panic("Failed to create the TCP control socket.\n"); 2990 2991 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 2992 bpf_iter_register(); 2993 #endif 2994 } 2995