1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * INET An implementation of the TCP/IP protocol suite for the LINUX 4 * operating system. INET is implemented using the BSD Socket 5 * interface as the means of communication with the user level. 6 * 7 * Implementation of the Transmission Control Protocol(TCP). 8 * 9 * IPv4 specific functions 10 * 11 * code split from: 12 * linux/ipv4/tcp.c 13 * linux/ipv4/tcp_input.c 14 * linux/ipv4/tcp_output.c 15 * 16 * See tcp.c for author information 17 */ 18 19 /* 20 * Changes: 21 * David S. Miller : New socket lookup architecture. 22 * This code is dedicated to John Dyson. 23 * David S. Miller : Change semantics of established hash, 24 * half is devoted to TIME_WAIT sockets 25 * and the rest go in the other half. 26 * Andi Kleen : Add support for syncookies and fixed 27 * some bugs: ip options weren't passed to 28 * the TCP layer, missed a check for an 29 * ACK bit. 30 * Andi Kleen : Implemented fast path mtu discovery. 31 * Fixed many serious bugs in the 32 * request_sock handling and moved 33 * most of it into the af independent code. 34 * Added tail drop and some other bugfixes. 35 * Added new listen semantics. 36 * Mike McLagan : Routing by source 37 * Juan Jose Ciarlante: ip_dynaddr bits 38 * Andi Kleen: various fixes. 39 * Vitaly E. Lavrov : Transparent proxy revived after year 40 * coma. 41 * Andi Kleen : Fix new listen. 42 * Andi Kleen : Fix accept error reporting. 43 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which 44 * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind 45 * a single port at the same time. 46 */ 47 48 #define pr_fmt(fmt) "TCP: " fmt 49 50 #include <linux/bottom_half.h> 51 #include <linux/types.h> 52 #include <linux/fcntl.h> 53 #include <linux/module.h> 54 #include <linux/random.h> 55 #include <linux/cache.h> 56 #include <linux/jhash.h> 57 #include <linux/init.h> 58 #include <linux/times.h> 59 #include <linux/slab.h> 60 61 #include <net/net_namespace.h> 62 #include <net/icmp.h> 63 #include <net/inet_hashtables.h> 64 #include <net/tcp.h> 65 #include <net/transp_v6.h> 66 #include <net/ipv6.h> 67 #include <net/inet_common.h> 68 #include <net/timewait_sock.h> 69 #include <net/xfrm.h> 70 #include <net/secure_seq.h> 71 #include <net/busy_poll.h> 72 73 #include <linux/inet.h> 74 #include <linux/ipv6.h> 75 #include <linux/stddef.h> 76 #include <linux/proc_fs.h> 77 #include <linux/seq_file.h> 78 #include <linux/inetdevice.h> 79 80 #include <crypto/hash.h> 81 #include <linux/scatterlist.h> 82 83 #include <trace/events/tcp.h> 84 85 #ifdef CONFIG_TCP_MD5SIG 86 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, 87 __be32 daddr, __be32 saddr, const struct tcphdr *th); 88 #endif 89 90 struct inet_hashinfo tcp_hashinfo; 91 EXPORT_SYMBOL(tcp_hashinfo); 92 93 static u32 tcp_v4_init_seq(const struct sk_buff *skb) 94 { 95 return secure_tcp_seq(ip_hdr(skb)->daddr, 96 ip_hdr(skb)->saddr, 97 tcp_hdr(skb)->dest, 98 tcp_hdr(skb)->source); 99 } 100 101 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb) 102 { 103 return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr); 104 } 105 106 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) 107 { 108 const struct inet_timewait_sock *tw = inet_twsk(sktw); 109 const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw); 110 struct tcp_sock *tp = tcp_sk(sk); 111 int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse; 112 113 if (reuse == 2) { 114 /* Still does not detect *everything* that goes 
through 115 * lo, since we require a loopback src or dst address 116 * or direct binding to 'lo' interface. 117 */ 118 bool loopback = false; 119 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX) 120 loopback = true; 121 #if IS_ENABLED(CONFIG_IPV6) 122 if (tw->tw_family == AF_INET6) { 123 if (ipv6_addr_loopback(&tw->tw_v6_daddr) || 124 ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) || 125 ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) || 126 ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr)) 127 loopback = true; 128 } else 129 #endif 130 { 131 if (ipv4_is_loopback(tw->tw_daddr) || 132 ipv4_is_loopback(tw->tw_rcv_saddr)) 133 loopback = true; 134 } 135 if (!loopback) 136 reuse = 0; 137 } 138 139 /* With PAWS, it is safe from the viewpoint 140 of data integrity. Even without PAWS it is safe provided sequence 141 spaces do not overlap i.e. at data rates <= 80Mbit/sec. 142 143 Actually, the idea is close to VJ's one, only timestamp cache is 144 held not per host, but per port pair and TW bucket is used as state 145 holder. 146 147 If TW bucket has been already destroyed we fall back to VJ's scheme 148 and use initial timestamp retrieved from peer table. 149 */ 150 if (tcptw->tw_ts_recent_stamp && 151 (!twp || (reuse && time_after32(ktime_get_seconds(), 152 tcptw->tw_ts_recent_stamp)))) { 153 /* In case of repair and re-using TIME-WAIT sockets we still 154 * want to be sure that it is safe as above but honor the 155 * sequence numbers and time stamps set as part of the repair 156 * process. 157 * 158 * Without this check re-using a TIME-WAIT socket with TCP 159 * repair would accumulate a -1 on the repair assigned 160 * sequence number. The first time it is reused the sequence 161 * is -1, the second time -2, etc. This fixes that issue 162 * without appearing to create any others. 163 */ 164 if (likely(!tp->repair)) { 165 u32 seq = tcptw->tw_snd_nxt + 65535 + 2; 166 167 if (!seq) 168 seq = 1; 169 WRITE_ONCE(tp->write_seq, seq); 170 tp->rx_opt.ts_recent = tcptw->tw_ts_recent; 171 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp; 172 } 173 sock_hold(sktw); 174 return 1; 175 } 176 177 return 0; 178 } 179 EXPORT_SYMBOL_GPL(tcp_twsk_unique); 180 181 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr, 182 int addr_len) 183 { 184 /* This check is replicated from tcp_v4_connect() and intended to 185 * prevent BPF program called below from accessing bytes that are out 186 * of the bound specified by user in addr_len. 187 */ 188 if (addr_len < sizeof(struct sockaddr_in)) 189 return -EINVAL; 190 191 sock_owned_by_me(sk); 192 193 return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr); 194 } 195 196 /* This will initiate an outgoing connection. 
*/ 197 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) 198 { 199 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr; 200 struct inet_sock *inet = inet_sk(sk); 201 struct tcp_sock *tp = tcp_sk(sk); 202 __be16 orig_sport, orig_dport; 203 __be32 daddr, nexthop; 204 struct flowi4 *fl4; 205 struct rtable *rt; 206 int err; 207 struct ip_options_rcu *inet_opt; 208 struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row; 209 210 if (addr_len < sizeof(struct sockaddr_in)) 211 return -EINVAL; 212 213 if (usin->sin_family != AF_INET) 214 return -EAFNOSUPPORT; 215 216 nexthop = daddr = usin->sin_addr.s_addr; 217 inet_opt = rcu_dereference_protected(inet->inet_opt, 218 lockdep_sock_is_held(sk)); 219 if (inet_opt && inet_opt->opt.srr) { 220 if (!daddr) 221 return -EINVAL; 222 nexthop = inet_opt->opt.faddr; 223 } 224 225 orig_sport = inet->inet_sport; 226 orig_dport = usin->sin_port; 227 fl4 = &inet->cork.fl.u.ip4; 228 rt = ip_route_connect(fl4, nexthop, inet->inet_saddr, 229 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if, 230 IPPROTO_TCP, 231 orig_sport, orig_dport, sk); 232 if (IS_ERR(rt)) { 233 err = PTR_ERR(rt); 234 if (err == -ENETUNREACH) 235 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES); 236 return err; 237 } 238 239 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) { 240 ip_rt_put(rt); 241 return -ENETUNREACH; 242 } 243 244 if (!inet_opt || !inet_opt->opt.srr) 245 daddr = fl4->daddr; 246 247 if (!inet->inet_saddr) 248 inet->inet_saddr = fl4->saddr; 249 sk_rcv_saddr_set(sk, inet->inet_saddr); 250 251 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) { 252 /* Reset inherited state */ 253 tp->rx_opt.ts_recent = 0; 254 tp->rx_opt.ts_recent_stamp = 0; 255 if (likely(!tp->repair)) 256 WRITE_ONCE(tp->write_seq, 0); 257 } 258 259 inet->inet_dport = usin->sin_port; 260 sk_daddr_set(sk, daddr); 261 262 inet_csk(sk)->icsk_ext_hdr_len = 0; 263 if (inet_opt) 264 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen; 265 266 tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT; 267 268 /* Socket identity is still unknown (sport may be zero). 269 * However we set state to SYN-SENT and not releasing socket 270 * lock select source port, enter ourselves into the hash tables and 271 * complete initialization after this. 272 */ 273 tcp_set_state(sk, TCP_SYN_SENT); 274 err = inet_hash_connect(tcp_death_row, sk); 275 if (err) 276 goto failure; 277 278 sk_set_txhash(sk); 279 280 rt = ip_route_newports(fl4, rt, orig_sport, orig_dport, 281 inet->inet_sport, inet->inet_dport, sk); 282 if (IS_ERR(rt)) { 283 err = PTR_ERR(rt); 284 rt = NULL; 285 goto failure; 286 } 287 /* OK, now commit destination to socket. */ 288 sk->sk_gso_type = SKB_GSO_TCPV4; 289 sk_setup_caps(sk, &rt->dst); 290 rt = NULL; 291 292 if (likely(!tp->repair)) { 293 if (!tp->write_seq) 294 WRITE_ONCE(tp->write_seq, 295 secure_tcp_seq(inet->inet_saddr, 296 inet->inet_daddr, 297 inet->inet_sport, 298 usin->sin_port)); 299 tp->tsoffset = secure_tcp_ts_off(sock_net(sk), 300 inet->inet_saddr, 301 inet->inet_daddr); 302 } 303 304 inet->inet_id = prandom_u32(); 305 306 if (tcp_fastopen_defer_connect(sk, &err)) 307 return err; 308 if (err) 309 goto failure; 310 311 err = tcp_connect(sk); 312 313 if (err) 314 goto failure; 315 316 return 0; 317 318 failure: 319 /* 320 * This unhashes the socket and releases the local port, 321 * if necessary. 
322 */ 323 tcp_set_state(sk, TCP_CLOSE); 324 ip_rt_put(rt); 325 sk->sk_route_caps = 0; 326 inet->inet_dport = 0; 327 return err; 328 } 329 EXPORT_SYMBOL(tcp_v4_connect); 330 331 /* 332 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191. 333 * It can be called through tcp_release_cb() if socket was owned by user 334 * at the time tcp_v4_err() was called to handle ICMP message. 335 */ 336 void tcp_v4_mtu_reduced(struct sock *sk) 337 { 338 struct inet_sock *inet = inet_sk(sk); 339 struct dst_entry *dst; 340 u32 mtu; 341 342 if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) 343 return; 344 mtu = tcp_sk(sk)->mtu_info; 345 dst = inet_csk_update_pmtu(sk, mtu); 346 if (!dst) 347 return; 348 349 /* Something is about to be wrong... Remember soft error 350 * for the case, if this connection will not able to recover. 351 */ 352 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst)) 353 sk->sk_err_soft = EMSGSIZE; 354 355 mtu = dst_mtu(dst); 356 357 if (inet->pmtudisc != IP_PMTUDISC_DONT && 358 ip_sk_accept_pmtu(sk) && 359 inet_csk(sk)->icsk_pmtu_cookie > mtu) { 360 tcp_sync_mss(sk, mtu); 361 362 /* Resend the TCP packet because it's 363 * clear that the old packet has been 364 * dropped. This is the new "fast" path mtu 365 * discovery. 366 */ 367 tcp_simple_retransmit(sk); 368 } /* else let the usual retransmit timer handle it */ 369 } 370 EXPORT_SYMBOL(tcp_v4_mtu_reduced); 371 372 static void do_redirect(struct sk_buff *skb, struct sock *sk) 373 { 374 struct dst_entry *dst = __sk_dst_check(sk, 0); 375 376 if (dst) 377 dst->ops->redirect(dst, sk, skb); 378 } 379 380 381 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */ 382 void tcp_req_err(struct sock *sk, u32 seq, bool abort) 383 { 384 struct request_sock *req = inet_reqsk(sk); 385 struct net *net = sock_net(sk); 386 387 /* ICMPs are not backlogged, hence we cannot get 388 * an established socket here. 389 */ 390 if (seq != tcp_rsk(req)->snt_isn) { 391 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS); 392 } else if (abort) { 393 /* 394 * Still in SYN_RECV, just remove it silently. 395 * There is no good way to pass the error to the newly 396 * created socket, and POSIX does not want network 397 * errors returned from accept(). 398 */ 399 inet_csk_reqsk_queue_drop(req->rsk_listener, req); 400 tcp_listendrop(req->rsk_listener); 401 } 402 reqsk_put(req); 403 } 404 EXPORT_SYMBOL(tcp_req_err); 405 406 /* TCP-LD (RFC 6069) logic */ 407 void tcp_ld_RTO_revert(struct sock *sk, u32 seq) 408 { 409 struct inet_connection_sock *icsk = inet_csk(sk); 410 struct tcp_sock *tp = tcp_sk(sk); 411 struct sk_buff *skb; 412 s32 remaining; 413 u32 delta_us; 414 415 if (sock_owned_by_user(sk)) 416 return; 417 418 if (seq != tp->snd_una || !icsk->icsk_retransmits || 419 !icsk->icsk_backoff) 420 return; 421 422 skb = tcp_rtx_queue_head(sk); 423 if (WARN_ON_ONCE(!skb)) 424 return; 425 426 icsk->icsk_backoff--; 427 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT; 428 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX); 429 430 tcp_mstamp_refresh(tp); 431 delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb)); 432 remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us); 433 434 if (remaining > 0) { 435 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, 436 remaining, TCP_RTO_MAX); 437 } else { 438 /* RTO revert clocked out retransmission. 439 * Will retransmit now. 
440 */ 441 tcp_retransmit_timer(sk); 442 } 443 } 444 EXPORT_SYMBOL(tcp_ld_RTO_revert); 445 446 /* 447 * This routine is called by the ICMP module when it gets some 448 * sort of error condition. If err < 0 then the socket should 449 * be closed and the error returned to the user. If err > 0 450 * it's just the icmp type << 8 | icmp code. After adjustment 451 * header points to the first 8 bytes of the tcp header. We need 452 * to find the appropriate port. 453 * 454 * The locking strategy used here is very "optimistic". When 455 * someone else accesses the socket the ICMP is just dropped 456 * and for some paths there is no check at all. 457 * A more general error queue to queue errors for later handling 458 * is probably better. 459 * 460 */ 461 462 int tcp_v4_err(struct sk_buff *skb, u32 info) 463 { 464 const struct iphdr *iph = (const struct iphdr *)skb->data; 465 struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2)); 466 struct tcp_sock *tp; 467 struct inet_sock *inet; 468 const int type = icmp_hdr(skb)->type; 469 const int code = icmp_hdr(skb)->code; 470 struct sock *sk; 471 struct request_sock *fastopen; 472 u32 seq, snd_una; 473 int err; 474 struct net *net = dev_net(skb->dev); 475 476 sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr, 477 th->dest, iph->saddr, ntohs(th->source), 478 inet_iif(skb), 0); 479 if (!sk) { 480 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); 481 return -ENOENT; 482 } 483 if (sk->sk_state == TCP_TIME_WAIT) { 484 inet_twsk_put(inet_twsk(sk)); 485 return 0; 486 } 487 seq = ntohl(th->seq); 488 if (sk->sk_state == TCP_NEW_SYN_RECV) { 489 tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB || 490 type == ICMP_TIME_EXCEEDED || 491 (type == ICMP_DEST_UNREACH && 492 (code == ICMP_NET_UNREACH || 493 code == ICMP_HOST_UNREACH))); 494 return 0; 495 } 496 497 bh_lock_sock(sk); 498 /* If too many ICMPs get dropped on busy 499 * servers this needs to be solved differently. 500 * We do take care of PMTU discovery (RFC1191) special case : 501 * we can receive locally generated ICMP messages while socket is held. 502 */ 503 if (sock_owned_by_user(sk)) { 504 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)) 505 __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS); 506 } 507 if (sk->sk_state == TCP_CLOSE) 508 goto out; 509 510 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) { 511 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); 512 goto out; 513 } 514 515 tp = tcp_sk(sk); 516 /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */ 517 fastopen = rcu_dereference(tp->fastopen_rsk); 518 snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una; 519 if (sk->sk_state != TCP_LISTEN && 520 !between(seq, snd_una, tp->snd_nxt)) { 521 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS); 522 goto out; 523 } 524 525 switch (type) { 526 case ICMP_REDIRECT: 527 if (!sock_owned_by_user(sk)) 528 do_redirect(skb, sk); 529 goto out; 530 case ICMP_SOURCE_QUENCH: 531 /* Just silently ignore these. */ 532 goto out; 533 case ICMP_PARAMETERPROB: 534 err = EPROTO; 535 break; 536 case ICMP_DEST_UNREACH: 537 if (code > NR_ICMP_UNREACH) 538 goto out; 539 540 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */ 541 /* We are not interested in TCP_LISTEN and open_requests 542 * (SYN-ACKs send out by Linux are always <576bytes so 543 * they should go through unfragmented). 
544 */ 545 if (sk->sk_state == TCP_LISTEN) 546 goto out; 547 548 tp->mtu_info = info; 549 if (!sock_owned_by_user(sk)) { 550 tcp_v4_mtu_reduced(sk); 551 } else { 552 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags)) 553 sock_hold(sk); 554 } 555 goto out; 556 } 557 558 err = icmp_err_convert[code].errno; 559 /* check if this ICMP message allows revert of backoff. 560 * (see RFC 6069) 561 */ 562 if (!fastopen && 563 (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH)) 564 tcp_ld_RTO_revert(sk, seq); 565 break; 566 case ICMP_TIME_EXCEEDED: 567 err = EHOSTUNREACH; 568 break; 569 default: 570 goto out; 571 } 572 573 switch (sk->sk_state) { 574 case TCP_SYN_SENT: 575 case TCP_SYN_RECV: 576 /* Only in fast or simultaneous open. If a fast open socket is 577 * is already accepted it is treated as a connected one below. 578 */ 579 if (fastopen && !fastopen->sk) 580 break; 581 582 ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th); 583 584 if (!sock_owned_by_user(sk)) { 585 sk->sk_err = err; 586 587 sk->sk_error_report(sk); 588 589 tcp_done(sk); 590 } else { 591 sk->sk_err_soft = err; 592 } 593 goto out; 594 } 595 596 /* If we've already connected we will keep trying 597 * until we time out, or the user gives up. 598 * 599 * rfc1122 4.2.3.9 allows to consider as hard errors 600 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too, 601 * but it is obsoleted by pmtu discovery). 602 * 603 * Note, that in modern internet, where routing is unreliable 604 * and in each dark corner broken firewalls sit, sending random 605 * errors ordered by their masters even this two messages finally lose 606 * their original sense (even Linux sends invalid PORT_UNREACHs) 607 * 608 * Now we are in compliance with RFCs. 609 * --ANK (980905) 610 */ 611 612 inet = inet_sk(sk); 613 if (!sock_owned_by_user(sk) && inet->recverr) { 614 sk->sk_err = err; 615 sk->sk_error_report(sk); 616 } else { /* Only an error on timeout */ 617 sk->sk_err_soft = err; 618 } 619 620 out: 621 bh_unlock_sock(sk); 622 sock_put(sk); 623 return 0; 624 } 625 626 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr) 627 { 628 struct tcphdr *th = tcp_hdr(skb); 629 630 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0); 631 skb->csum_start = skb_transport_header(skb) - skb->head; 632 skb->csum_offset = offsetof(struct tcphdr, check); 633 } 634 635 /* This routine computes an IPv4 TCP checksum. */ 636 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb) 637 { 638 const struct inet_sock *inet = inet_sk(sk); 639 640 __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr); 641 } 642 EXPORT_SYMBOL(tcp_v4_send_check); 643 644 /* 645 * This routine will send an RST to the other tcp. 646 * 647 * Someone asks: why I NEVER use socket parameters (TOS, TTL etc.) 648 * for reset. 649 * Answer: if a packet caused RST, it is not for a socket 650 * existing in our system, if it is matched to a socket, 651 * it is just duplicate segment or bug in other side's TCP. 652 * So that we build reply only basing on parameters 653 * arrived with segment. 654 * Exception: precedence violation. We do not implement it in any case. 
655 */ 656 657 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) 658 { 659 const struct tcphdr *th = tcp_hdr(skb); 660 struct { 661 struct tcphdr th; 662 #ifdef CONFIG_TCP_MD5SIG 663 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)]; 664 #endif 665 } rep; 666 struct ip_reply_arg arg; 667 #ifdef CONFIG_TCP_MD5SIG 668 struct tcp_md5sig_key *key = NULL; 669 const __u8 *hash_location = NULL; 670 unsigned char newhash[16]; 671 int genhash; 672 struct sock *sk1 = NULL; 673 #endif 674 u64 transmit_time = 0; 675 struct sock *ctl_sk; 676 struct net *net; 677 678 /* Never send a reset in response to a reset. */ 679 if (th->rst) 680 return; 681 682 /* If sk not NULL, it means we did a successful lookup and incoming 683 * route had to be correct. prequeue might have dropped our dst. 684 */ 685 if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL) 686 return; 687 688 /* Swap the send and the receive. */ 689 memset(&rep, 0, sizeof(rep)); 690 rep.th.dest = th->source; 691 rep.th.source = th->dest; 692 rep.th.doff = sizeof(struct tcphdr) / 4; 693 rep.th.rst = 1; 694 695 if (th->ack) { 696 rep.th.seq = th->ack_seq; 697 } else { 698 rep.th.ack = 1; 699 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin + 700 skb->len - (th->doff << 2)); 701 } 702 703 memset(&arg, 0, sizeof(arg)); 704 arg.iov[0].iov_base = (unsigned char *)&rep; 705 arg.iov[0].iov_len = sizeof(rep.th); 706 707 net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev); 708 #ifdef CONFIG_TCP_MD5SIG 709 rcu_read_lock(); 710 hash_location = tcp_parse_md5sig_option(th); 711 if (sk && sk_fullsock(sk)) { 712 const union tcp_md5_addr *addr; 713 int l3index; 714 715 /* sdif set, means packet ingressed via a device 716 * in an L3 domain and inet_iif is set to it. 717 */ 718 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0; 719 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; 720 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET); 721 } else if (hash_location) { 722 const union tcp_md5_addr *addr; 723 int sdif = tcp_v4_sdif(skb); 724 int dif = inet_iif(skb); 725 int l3index; 726 727 /* 728 * active side is lost. Try to find listening socket through 729 * source port, and then find md5 key through listening socket. 730 * we are not loose security here: 731 * Incoming packet is checked with md5 hash with finding key, 732 * no RST generated if md5 hash doesn't match. 733 */ 734 sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0, 735 ip_hdr(skb)->saddr, 736 th->source, ip_hdr(skb)->daddr, 737 ntohs(th->source), dif, sdif); 738 /* don't send rst if it can't find key */ 739 if (!sk1) 740 goto out; 741 742 /* sdif set, means packet ingressed via a device 743 * in an L3 domain and dif is set to it. 744 */ 745 l3index = sdif ? 
dif : 0; 746 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; 747 key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET); 748 if (!key) 749 goto out; 750 751 752 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb); 753 if (genhash || memcmp(hash_location, newhash, 16) != 0) 754 goto out; 755 756 } 757 758 if (key) { 759 rep.opt[0] = htonl((TCPOPT_NOP << 24) | 760 (TCPOPT_NOP << 16) | 761 (TCPOPT_MD5SIG << 8) | 762 TCPOLEN_MD5SIG); 763 /* Update length and the length the header thinks exists */ 764 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED; 765 rep.th.doff = arg.iov[0].iov_len / 4; 766 767 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1], 768 key, ip_hdr(skb)->saddr, 769 ip_hdr(skb)->daddr, &rep.th); 770 } 771 #endif 772 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr, 773 ip_hdr(skb)->saddr, /* XXX */ 774 arg.iov[0].iov_len, IPPROTO_TCP, 0); 775 arg.csumoffset = offsetof(struct tcphdr, check) / 2; 776 arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0; 777 778 /* When socket is gone, all binding information is lost. 779 * routing might fail in this case. No choice here, if we choose to force 780 * input interface, we will misroute in case of asymmetric route. 781 */ 782 if (sk) { 783 arg.bound_dev_if = sk->sk_bound_dev_if; 784 if (sk_fullsock(sk)) 785 trace_tcp_send_reset(sk, skb); 786 } 787 788 BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) != 789 offsetof(struct inet_timewait_sock, tw_bound_dev_if)); 790 791 arg.tos = ip_hdr(skb)->tos; 792 arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL); 793 local_bh_disable(); 794 ctl_sk = this_cpu_read(*net->ipv4.tcp_sk); 795 if (sk) { 796 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? 797 inet_twsk(sk)->tw_mark : sk->sk_mark; 798 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ? 799 inet_twsk(sk)->tw_priority : sk->sk_priority; 800 transmit_time = tcp_transmit_time(sk); 801 } 802 ip_send_unicast_reply(ctl_sk, 803 skb, &TCP_SKB_CB(skb)->header.h4.opt, 804 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, 805 &arg, arg.iov[0].iov_len, 806 transmit_time); 807 808 ctl_sk->sk_mark = 0; 809 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS); 810 __TCP_INC_STATS(net, TCP_MIB_OUTRSTS); 811 local_bh_enable(); 812 813 #ifdef CONFIG_TCP_MD5SIG 814 out: 815 rcu_read_unlock(); 816 #endif 817 } 818 819 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states 820 outside socket context is ugly, certainly. What can I do? 821 */ 822 823 static void tcp_v4_send_ack(const struct sock *sk, 824 struct sk_buff *skb, u32 seq, u32 ack, 825 u32 win, u32 tsval, u32 tsecr, int oif, 826 struct tcp_md5sig_key *key, 827 int reply_flags, u8 tos) 828 { 829 const struct tcphdr *th = tcp_hdr(skb); 830 struct { 831 struct tcphdr th; 832 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2) 833 #ifdef CONFIG_TCP_MD5SIG 834 + (TCPOLEN_MD5SIG_ALIGNED >> 2) 835 #endif 836 ]; 837 } rep; 838 struct net *net = sock_net(sk); 839 struct ip_reply_arg arg; 840 struct sock *ctl_sk; 841 u64 transmit_time; 842 843 memset(&rep.th, 0, sizeof(struct tcphdr)); 844 memset(&arg, 0, sizeof(arg)); 845 846 arg.iov[0].iov_base = (unsigned char *)&rep; 847 arg.iov[0].iov_len = sizeof(rep.th); 848 if (tsecr) { 849 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | 850 (TCPOPT_TIMESTAMP << 8) | 851 TCPOLEN_TIMESTAMP); 852 rep.opt[1] = htonl(tsval); 853 rep.opt[2] = htonl(tsecr); 854 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED; 855 } 856 857 /* Swap the send and the receive. 
*/ 858 rep.th.dest = th->source; 859 rep.th.source = th->dest; 860 rep.th.doff = arg.iov[0].iov_len / 4; 861 rep.th.seq = htonl(seq); 862 rep.th.ack_seq = htonl(ack); 863 rep.th.ack = 1; 864 rep.th.window = htons(win); 865 866 #ifdef CONFIG_TCP_MD5SIG 867 if (key) { 868 int offset = (tsecr) ? 3 : 0; 869 870 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) | 871 (TCPOPT_NOP << 16) | 872 (TCPOPT_MD5SIG << 8) | 873 TCPOLEN_MD5SIG); 874 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED; 875 rep.th.doff = arg.iov[0].iov_len/4; 876 877 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset], 878 key, ip_hdr(skb)->saddr, 879 ip_hdr(skb)->daddr, &rep.th); 880 } 881 #endif 882 arg.flags = reply_flags; 883 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr, 884 ip_hdr(skb)->saddr, /* XXX */ 885 arg.iov[0].iov_len, IPPROTO_TCP, 0); 886 arg.csumoffset = offsetof(struct tcphdr, check) / 2; 887 if (oif) 888 arg.bound_dev_if = oif; 889 arg.tos = tos; 890 arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL); 891 local_bh_disable(); 892 ctl_sk = this_cpu_read(*net->ipv4.tcp_sk); 893 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? 894 inet_twsk(sk)->tw_mark : sk->sk_mark; 895 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ? 896 inet_twsk(sk)->tw_priority : sk->sk_priority; 897 transmit_time = tcp_transmit_time(sk); 898 ip_send_unicast_reply(ctl_sk, 899 skb, &TCP_SKB_CB(skb)->header.h4.opt, 900 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, 901 &arg, arg.iov[0].iov_len, 902 transmit_time); 903 904 ctl_sk->sk_mark = 0; 905 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS); 906 local_bh_enable(); 907 } 908 909 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) 910 { 911 struct inet_timewait_sock *tw = inet_twsk(sk); 912 struct tcp_timewait_sock *tcptw = tcp_twsk(sk); 913 914 tcp_v4_send_ack(sk, skb, 915 tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, 916 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, 917 tcp_time_stamp_raw() + tcptw->tw_ts_offset, 918 tcptw->tw_ts_recent, 919 tw->tw_bound_dev_if, 920 tcp_twsk_md5_key(tcptw), 921 tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0, 922 tw->tw_tos 923 ); 924 925 inet_twsk_put(tw); 926 } 927 928 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, 929 struct request_sock *req) 930 { 931 const union tcp_md5_addr *addr; 932 int l3index; 933 934 /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV 935 * sk->sk_state == TCP_SYN_RECV -> for Fast Open. 936 */ 937 u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 : 938 tcp_sk(sk)->snd_nxt; 939 940 /* RFC 7323 2.3 941 * The window field (SEG.WND) of every outgoing segment, with the 942 * exception of <SYN> segments, MUST be right-shifted by 943 * Rcv.Wind.Shift bits: 944 */ 945 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; 946 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0; 947 tcp_v4_send_ack(sk, skb, seq, 948 tcp_rsk(req)->rcv_nxt, 949 req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale, 950 tcp_time_stamp_raw() + tcp_rsk(req)->ts_off, 951 req->ts_recent, 952 0, 953 tcp_md5_do_lookup(sk, l3index, addr, AF_INET), 954 inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0, 955 ip_hdr(skb)->tos); 956 } 957 958 /* 959 * Send a SYN-ACK after having received a SYN. 960 * This still operates on a request_sock only, not on a big 961 * socket. 
962 */ 963 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, 964 struct flowi *fl, 965 struct request_sock *req, 966 struct tcp_fastopen_cookie *foc, 967 enum tcp_synack_type synack_type) 968 { 969 const struct inet_request_sock *ireq = inet_rsk(req); 970 struct flowi4 fl4; 971 int err = -1; 972 struct sk_buff *skb; 973 974 /* First, grab a route. */ 975 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL) 976 return -1; 977 978 skb = tcp_make_synack(sk, dst, req, foc, synack_type); 979 980 if (skb) { 981 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr); 982 983 rcu_read_lock(); 984 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr, 985 ireq->ir_rmt_addr, 986 rcu_dereference(ireq->ireq_opt)); 987 rcu_read_unlock(); 988 err = net_xmit_eval(err); 989 } 990 991 return err; 992 } 993 994 /* 995 * IPv4 request_sock destructor. 996 */ 997 static void tcp_v4_reqsk_destructor(struct request_sock *req) 998 { 999 kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1)); 1000 } 1001 1002 #ifdef CONFIG_TCP_MD5SIG 1003 /* 1004 * RFC2385 MD5 checksumming requires a mapping of 1005 * IP address->MD5 Key. 1006 * We need to maintain these in the sk structure. 1007 */ 1008 1009 DEFINE_STATIC_KEY_FALSE(tcp_md5_needed); 1010 EXPORT_SYMBOL(tcp_md5_needed); 1011 1012 /* Find the Key structure for an address. */ 1013 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index, 1014 const union tcp_md5_addr *addr, 1015 int family) 1016 { 1017 const struct tcp_sock *tp = tcp_sk(sk); 1018 struct tcp_md5sig_key *key; 1019 const struct tcp_md5sig_info *md5sig; 1020 __be32 mask; 1021 struct tcp_md5sig_key *best_match = NULL; 1022 bool match; 1023 1024 /* caller either holds rcu_read_lock() or socket lock */ 1025 md5sig = rcu_dereference_check(tp->md5sig_info, 1026 lockdep_sock_is_held(sk)); 1027 if (!md5sig) 1028 return NULL; 1029 1030 hlist_for_each_entry_rcu(key, &md5sig->head, node, 1031 lockdep_sock_is_held(sk)) { 1032 if (key->family != family) 1033 continue; 1034 if (key->l3index && key->l3index != l3index) 1035 continue; 1036 if (family == AF_INET) { 1037 mask = inet_make_mask(key->prefixlen); 1038 match = (key->addr.a4.s_addr & mask) == 1039 (addr->a4.s_addr & mask); 1040 #if IS_ENABLED(CONFIG_IPV6) 1041 } else if (family == AF_INET6) { 1042 match = ipv6_prefix_equal(&key->addr.a6, &addr->a6, 1043 key->prefixlen); 1044 #endif 1045 } else { 1046 match = false; 1047 } 1048 1049 if (match && (!best_match || 1050 key->prefixlen > best_match->prefixlen)) 1051 best_match = key; 1052 } 1053 return best_match; 1054 } 1055 EXPORT_SYMBOL(__tcp_md5_do_lookup); 1056 1057 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk, 1058 const union tcp_md5_addr *addr, 1059 int family, u8 prefixlen, 1060 int l3index) 1061 { 1062 const struct tcp_sock *tp = tcp_sk(sk); 1063 struct tcp_md5sig_key *key; 1064 unsigned int size = sizeof(struct in_addr); 1065 const struct tcp_md5sig_info *md5sig; 1066 1067 /* caller either holds rcu_read_lock() or socket lock */ 1068 md5sig = rcu_dereference_check(tp->md5sig_info, 1069 lockdep_sock_is_held(sk)); 1070 if (!md5sig) 1071 return NULL; 1072 #if IS_ENABLED(CONFIG_IPV6) 1073 if (family == AF_INET6) 1074 size = sizeof(struct in6_addr); 1075 #endif 1076 hlist_for_each_entry_rcu(key, &md5sig->head, node, 1077 lockdep_sock_is_held(sk)) { 1078 if (key->family != family) 1079 continue; 1080 if (key->l3index && key->l3index != l3index) 1081 continue; 1082 if (!memcmp(&key->addr, addr, size) && 1083 
key->prefixlen == prefixlen) 1084 return key; 1085 } 1086 return NULL; 1087 } 1088 1089 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk, 1090 const struct sock *addr_sk) 1091 { 1092 const union tcp_md5_addr *addr; 1093 int l3index; 1094 1095 l3index = l3mdev_master_ifindex_by_index(sock_net(sk), 1096 addr_sk->sk_bound_dev_if); 1097 addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr; 1098 return tcp_md5_do_lookup(sk, l3index, addr, AF_INET); 1099 } 1100 EXPORT_SYMBOL(tcp_v4_md5_lookup); 1101 1102 /* This can be called on a newly created socket, from other files */ 1103 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, 1104 int family, u8 prefixlen, int l3index, 1105 const u8 *newkey, u8 newkeylen, gfp_t gfp) 1106 { 1107 /* Add Key to the list */ 1108 struct tcp_md5sig_key *key; 1109 struct tcp_sock *tp = tcp_sk(sk); 1110 struct tcp_md5sig_info *md5sig; 1111 1112 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index); 1113 if (key) { 1114 /* Pre-existing entry - just update that one. 1115 * Note that the key might be used concurrently. 1116 * data_race() is telling kcsan that we do not care of 1117 * key mismatches, since changing MD5 key on live flows 1118 * can lead to packet drops. 1119 */ 1120 data_race(memcpy(key->key, newkey, newkeylen)); 1121 1122 /* Pairs with READ_ONCE() in tcp_md5_hash_key(). 1123 * Also note that a reader could catch new key->keylen value 1124 * but old key->key[], this is the reason we use __GFP_ZERO 1125 * at sock_kmalloc() time below these lines. 1126 */ 1127 WRITE_ONCE(key->keylen, newkeylen); 1128 1129 return 0; 1130 } 1131 1132 md5sig = rcu_dereference_protected(tp->md5sig_info, 1133 lockdep_sock_is_held(sk)); 1134 if (!md5sig) { 1135 md5sig = kmalloc(sizeof(*md5sig), gfp); 1136 if (!md5sig) 1137 return -ENOMEM; 1138 1139 sk_nocaps_add(sk, NETIF_F_GSO_MASK); 1140 INIT_HLIST_HEAD(&md5sig->head); 1141 rcu_assign_pointer(tp->md5sig_info, md5sig); 1142 } 1143 1144 key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO); 1145 if (!key) 1146 return -ENOMEM; 1147 if (!tcp_alloc_md5sig_pool()) { 1148 sock_kfree_s(sk, key, sizeof(*key)); 1149 return -ENOMEM; 1150 } 1151 1152 memcpy(key->key, newkey, newkeylen); 1153 key->keylen = newkeylen; 1154 key->family = family; 1155 key->prefixlen = prefixlen; 1156 key->l3index = l3index; 1157 memcpy(&key->addr, addr, 1158 (family == AF_INET6) ? 
sizeof(struct in6_addr) : 1159 sizeof(struct in_addr)); 1160 hlist_add_head_rcu(&key->node, &md5sig->head); 1161 return 0; 1162 } 1163 EXPORT_SYMBOL(tcp_md5_do_add); 1164 1165 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family, 1166 u8 prefixlen, int l3index) 1167 { 1168 struct tcp_md5sig_key *key; 1169 1170 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index); 1171 if (!key) 1172 return -ENOENT; 1173 hlist_del_rcu(&key->node); 1174 atomic_sub(sizeof(*key), &sk->sk_omem_alloc); 1175 kfree_rcu(key, rcu); 1176 return 0; 1177 } 1178 EXPORT_SYMBOL(tcp_md5_do_del); 1179 1180 static void tcp_clear_md5_list(struct sock *sk) 1181 { 1182 struct tcp_sock *tp = tcp_sk(sk); 1183 struct tcp_md5sig_key *key; 1184 struct hlist_node *n; 1185 struct tcp_md5sig_info *md5sig; 1186 1187 md5sig = rcu_dereference_protected(tp->md5sig_info, 1); 1188 1189 hlist_for_each_entry_safe(key, n, &md5sig->head, node) { 1190 hlist_del_rcu(&key->node); 1191 atomic_sub(sizeof(*key), &sk->sk_omem_alloc); 1192 kfree_rcu(key, rcu); 1193 } 1194 } 1195 1196 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname, 1197 char __user *optval, int optlen) 1198 { 1199 struct tcp_md5sig cmd; 1200 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr; 1201 const union tcp_md5_addr *addr; 1202 u8 prefixlen = 32; 1203 int l3index = 0; 1204 1205 if (optlen < sizeof(cmd)) 1206 return -EINVAL; 1207 1208 if (copy_from_user(&cmd, optval, sizeof(cmd))) 1209 return -EFAULT; 1210 1211 if (sin->sin_family != AF_INET) 1212 return -EINVAL; 1213 1214 if (optname == TCP_MD5SIG_EXT && 1215 cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) { 1216 prefixlen = cmd.tcpm_prefixlen; 1217 if (prefixlen > 32) 1218 return -EINVAL; 1219 } 1220 1221 if (optname == TCP_MD5SIG_EXT && 1222 cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) { 1223 struct net_device *dev; 1224 1225 rcu_read_lock(); 1226 dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex); 1227 if (dev && netif_is_l3_master(dev)) 1228 l3index = dev->ifindex; 1229 1230 rcu_read_unlock(); 1231 1232 /* ok to reference set/not set outside of rcu; 1233 * right now device MUST be an L3 master 1234 */ 1235 if (!dev || !l3index) 1236 return -EINVAL; 1237 } 1238 1239 addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr; 1240 1241 if (!cmd.tcpm_keylen) 1242 return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index); 1243 1244 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN) 1245 return -EINVAL; 1246 1247 return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, 1248 cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL); 1249 } 1250 1251 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp, 1252 __be32 daddr, __be32 saddr, 1253 const struct tcphdr *th, int nbytes) 1254 { 1255 struct tcp4_pseudohdr *bp; 1256 struct scatterlist sg; 1257 struct tcphdr *_th; 1258 1259 bp = hp->scratch; 1260 bp->saddr = saddr; 1261 bp->daddr = daddr; 1262 bp->pad = 0; 1263 bp->protocol = IPPROTO_TCP; 1264 bp->len = cpu_to_be16(nbytes); 1265 1266 _th = (struct tcphdr *)(bp + 1); 1267 memcpy(_th, th, sizeof(*th)); 1268 _th->check = 0; 1269 1270 sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th)); 1271 ahash_request_set_crypt(hp->md5_req, &sg, NULL, 1272 sizeof(*bp) + sizeof(*th)); 1273 return crypto_ahash_update(hp->md5_req); 1274 } 1275 1276 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, 1277 __be32 daddr, __be32 saddr, const struct tcphdr *th) 1278 { 1279 struct tcp_md5sig_pool *hp; 1280 struct ahash_request *req; 1281 1282 hp = tcp_get_md5sig_pool(); 1283 if 
(!hp) 1284 goto clear_hash_noput; 1285 req = hp->md5_req; 1286 1287 if (crypto_ahash_init(req)) 1288 goto clear_hash; 1289 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2)) 1290 goto clear_hash; 1291 if (tcp_md5_hash_key(hp, key)) 1292 goto clear_hash; 1293 ahash_request_set_crypt(req, NULL, md5_hash, 0); 1294 if (crypto_ahash_final(req)) 1295 goto clear_hash; 1296 1297 tcp_put_md5sig_pool(); 1298 return 0; 1299 1300 clear_hash: 1301 tcp_put_md5sig_pool(); 1302 clear_hash_noput: 1303 memset(md5_hash, 0, 16); 1304 return 1; 1305 } 1306 1307 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key, 1308 const struct sock *sk, 1309 const struct sk_buff *skb) 1310 { 1311 struct tcp_md5sig_pool *hp; 1312 struct ahash_request *req; 1313 const struct tcphdr *th = tcp_hdr(skb); 1314 __be32 saddr, daddr; 1315 1316 if (sk) { /* valid for establish/request sockets */ 1317 saddr = sk->sk_rcv_saddr; 1318 daddr = sk->sk_daddr; 1319 } else { 1320 const struct iphdr *iph = ip_hdr(skb); 1321 saddr = iph->saddr; 1322 daddr = iph->daddr; 1323 } 1324 1325 hp = tcp_get_md5sig_pool(); 1326 if (!hp) 1327 goto clear_hash_noput; 1328 req = hp->md5_req; 1329 1330 if (crypto_ahash_init(req)) 1331 goto clear_hash; 1332 1333 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len)) 1334 goto clear_hash; 1335 if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2)) 1336 goto clear_hash; 1337 if (tcp_md5_hash_key(hp, key)) 1338 goto clear_hash; 1339 ahash_request_set_crypt(req, NULL, md5_hash, 0); 1340 if (crypto_ahash_final(req)) 1341 goto clear_hash; 1342 1343 tcp_put_md5sig_pool(); 1344 return 0; 1345 1346 clear_hash: 1347 tcp_put_md5sig_pool(); 1348 clear_hash_noput: 1349 memset(md5_hash, 0, 16); 1350 return 1; 1351 } 1352 EXPORT_SYMBOL(tcp_v4_md5_hash_skb); 1353 1354 #endif 1355 1356 /* Called with rcu_read_lock() */ 1357 static bool tcp_v4_inbound_md5_hash(const struct sock *sk, 1358 const struct sk_buff *skb, 1359 int dif, int sdif) 1360 { 1361 #ifdef CONFIG_TCP_MD5SIG 1362 /* 1363 * This gets called for each TCP segment that arrives 1364 * so we want to be efficient. 1365 * We have 3 drop cases: 1366 * o No MD5 hash and one expected. 1367 * o MD5 hash and we're not expecting one. 1368 * o MD5 hash and its wrong. 1369 */ 1370 const __u8 *hash_location = NULL; 1371 struct tcp_md5sig_key *hash_expected; 1372 const struct iphdr *iph = ip_hdr(skb); 1373 const struct tcphdr *th = tcp_hdr(skb); 1374 const union tcp_md5_addr *addr; 1375 unsigned char newhash[16]; 1376 int genhash, l3index; 1377 1378 /* sdif set, means packet ingressed via a device 1379 * in an L3 domain and dif is set to the l3mdev 1380 */ 1381 l3index = sdif ? dif : 0; 1382 1383 addr = (union tcp_md5_addr *)&iph->saddr; 1384 hash_expected = tcp_md5_do_lookup(sk, l3index, addr, AF_INET); 1385 hash_location = tcp_parse_md5sig_option(th); 1386 1387 /* We've parsed the options - do we have a hash? */ 1388 if (!hash_expected && !hash_location) 1389 return false; 1390 1391 if (hash_expected && !hash_location) { 1392 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND); 1393 return true; 1394 } 1395 1396 if (!hash_expected && hash_location) { 1397 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED); 1398 return true; 1399 } 1400 1401 /* Okay, so this is hash_expected and hash_location - 1402 * so we need to calculate the checksum. 
1403 */ 1404 genhash = tcp_v4_md5_hash_skb(newhash, 1405 hash_expected, 1406 NULL, skb); 1407 1408 if (genhash || memcmp(hash_location, newhash, 16) != 0) { 1409 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE); 1410 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s L3 index %d\n", 1411 &iph->saddr, ntohs(th->source), 1412 &iph->daddr, ntohs(th->dest), 1413 genhash ? " tcp_v4_calc_md5_hash failed" 1414 : "", l3index); 1415 return true; 1416 } 1417 return false; 1418 #endif 1419 return false; 1420 } 1421 1422 static void tcp_v4_init_req(struct request_sock *req, 1423 const struct sock *sk_listener, 1424 struct sk_buff *skb) 1425 { 1426 struct inet_request_sock *ireq = inet_rsk(req); 1427 struct net *net = sock_net(sk_listener); 1428 1429 sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr); 1430 sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr); 1431 RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb)); 1432 } 1433 1434 static struct dst_entry *tcp_v4_route_req(const struct sock *sk, 1435 struct flowi *fl, 1436 const struct request_sock *req) 1437 { 1438 return inet_csk_route_req(sk, &fl->u.ip4, req); 1439 } 1440 1441 struct request_sock_ops tcp_request_sock_ops __read_mostly = { 1442 .family = PF_INET, 1443 .obj_size = sizeof(struct tcp_request_sock), 1444 .rtx_syn_ack = tcp_rtx_synack, 1445 .send_ack = tcp_v4_reqsk_send_ack, 1446 .destructor = tcp_v4_reqsk_destructor, 1447 .send_reset = tcp_v4_send_reset, 1448 .syn_ack_timeout = tcp_syn_ack_timeout, 1449 }; 1450 1451 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { 1452 .mss_clamp = TCP_MSS_DEFAULT, 1453 #ifdef CONFIG_TCP_MD5SIG 1454 .req_md5_lookup = tcp_v4_md5_lookup, 1455 .calc_md5_hash = tcp_v4_md5_hash_skb, 1456 #endif 1457 .init_req = tcp_v4_init_req, 1458 #ifdef CONFIG_SYN_COOKIES 1459 .cookie_init_seq = cookie_v4_init_sequence, 1460 #endif 1461 .route_req = tcp_v4_route_req, 1462 .init_seq = tcp_v4_init_seq, 1463 .init_ts_off = tcp_v4_init_ts_off, 1464 .send_synack = tcp_v4_send_synack, 1465 }; 1466 1467 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) 1468 { 1469 /* Never answer to SYNs send to broadcast or multicast */ 1470 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) 1471 goto drop; 1472 1473 return tcp_conn_request(&tcp_request_sock_ops, 1474 &tcp_request_sock_ipv4_ops, sk, skb); 1475 1476 drop: 1477 tcp_listendrop(sk); 1478 return 0; 1479 } 1480 EXPORT_SYMBOL(tcp_v4_conn_request); 1481 1482 1483 /* 1484 * The three way handshake has completed - we got a valid synack - 1485 * now create the new socket. 
1486 */ 1487 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, 1488 struct request_sock *req, 1489 struct dst_entry *dst, 1490 struct request_sock *req_unhash, 1491 bool *own_req) 1492 { 1493 struct inet_request_sock *ireq; 1494 struct inet_sock *newinet; 1495 struct tcp_sock *newtp; 1496 struct sock *newsk; 1497 #ifdef CONFIG_TCP_MD5SIG 1498 const union tcp_md5_addr *addr; 1499 struct tcp_md5sig_key *key; 1500 int l3index; 1501 #endif 1502 struct ip_options_rcu *inet_opt; 1503 1504 if (sk_acceptq_is_full(sk)) 1505 goto exit_overflow; 1506 1507 newsk = tcp_create_openreq_child(sk, req, skb); 1508 if (!newsk) 1509 goto exit_nonewsk; 1510 1511 newsk->sk_gso_type = SKB_GSO_TCPV4; 1512 inet_sk_rx_dst_set(newsk, skb); 1513 1514 newtp = tcp_sk(newsk); 1515 newinet = inet_sk(newsk); 1516 ireq = inet_rsk(req); 1517 sk_daddr_set(newsk, ireq->ir_rmt_addr); 1518 sk_rcv_saddr_set(newsk, ireq->ir_loc_addr); 1519 newsk->sk_bound_dev_if = ireq->ir_iif; 1520 newinet->inet_saddr = ireq->ir_loc_addr; 1521 inet_opt = rcu_dereference(ireq->ireq_opt); 1522 RCU_INIT_POINTER(newinet->inet_opt, inet_opt); 1523 newinet->mc_index = inet_iif(skb); 1524 newinet->mc_ttl = ip_hdr(skb)->ttl; 1525 newinet->rcv_tos = ip_hdr(skb)->tos; 1526 inet_csk(newsk)->icsk_ext_hdr_len = 0; 1527 if (inet_opt) 1528 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen; 1529 newinet->inet_id = prandom_u32(); 1530 1531 if (!dst) { 1532 dst = inet_csk_route_child_sock(sk, newsk, req); 1533 if (!dst) 1534 goto put_and_exit; 1535 } else { 1536 /* syncookie case : see end of cookie_v4_check() */ 1537 } 1538 sk_setup_caps(newsk, dst); 1539 1540 tcp_ca_openreq_child(newsk, dst); 1541 1542 tcp_sync_mss(newsk, dst_mtu(dst)); 1543 newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst)); 1544 1545 tcp_initialize_rcv_mss(newsk); 1546 1547 #ifdef CONFIG_TCP_MD5SIG 1548 l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif); 1549 /* Copy over the MD5 key from the original socket */ 1550 addr = (union tcp_md5_addr *)&newinet->inet_daddr; 1551 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET); 1552 if (key) { 1553 /* 1554 * We're using one, so create a matching key 1555 * on the newsk structure. If we fail to get 1556 * memory, then we end up not copying the key 1557 * across. Shucks. 
1558 */ 1559 tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index, 1560 key->key, key->keylen, GFP_ATOMIC); 1561 sk_nocaps_add(newsk, NETIF_F_GSO_MASK); 1562 } 1563 #endif 1564 1565 if (__inet_inherit_port(sk, newsk) < 0) 1566 goto put_and_exit; 1567 *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash)); 1568 if (likely(*own_req)) { 1569 tcp_move_syn(newtp, req); 1570 ireq->ireq_opt = NULL; 1571 } else { 1572 newinet->inet_opt = NULL; 1573 } 1574 return newsk; 1575 1576 exit_overflow: 1577 NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); 1578 exit_nonewsk: 1579 dst_release(dst); 1580 exit: 1581 tcp_listendrop(sk); 1582 return NULL; 1583 put_and_exit: 1584 newinet->inet_opt = NULL; 1585 inet_csk_prepare_forced_close(newsk); 1586 tcp_done(newsk); 1587 goto exit; 1588 } 1589 EXPORT_SYMBOL(tcp_v4_syn_recv_sock); 1590 1591 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb) 1592 { 1593 #ifdef CONFIG_SYN_COOKIES 1594 const struct tcphdr *th = tcp_hdr(skb); 1595 1596 if (!th->syn) 1597 sk = cookie_v4_check(sk, skb); 1598 #endif 1599 return sk; 1600 } 1601 1602 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph, 1603 struct tcphdr *th, u32 *cookie) 1604 { 1605 u16 mss = 0; 1606 #ifdef CONFIG_SYN_COOKIES 1607 mss = tcp_get_syncookie_mss(&tcp_request_sock_ops, 1608 &tcp_request_sock_ipv4_ops, sk, th); 1609 if (mss) { 1610 *cookie = __cookie_v4_init_sequence(iph, th, &mss); 1611 tcp_synq_overflow(sk); 1612 } 1613 #endif 1614 return mss; 1615 } 1616 1617 /* The socket must have it's spinlock held when we get 1618 * here, unless it is a TCP_LISTEN socket. 1619 * 1620 * We have a potential double-lock case here, so even when 1621 * doing backlog processing we use the BH locking scheme. 1622 * This is because we cannot sleep with the original spinlock 1623 * held. 1624 */ 1625 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) 1626 { 1627 struct sock *rsk; 1628 1629 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ 1630 struct dst_entry *dst = sk->sk_rx_dst; 1631 1632 sock_rps_save_rxhash(sk, skb); 1633 sk_mark_napi_id(sk, skb); 1634 if (dst) { 1635 if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif || 1636 !dst->ops->check(dst, 0)) { 1637 dst_release(dst); 1638 sk->sk_rx_dst = NULL; 1639 } 1640 } 1641 tcp_rcv_established(sk, skb); 1642 return 0; 1643 } 1644 1645 if (tcp_checksum_complete(skb)) 1646 goto csum_err; 1647 1648 if (sk->sk_state == TCP_LISTEN) { 1649 struct sock *nsk = tcp_v4_cookie_check(sk, skb); 1650 1651 if (!nsk) 1652 goto discard; 1653 if (nsk != sk) { 1654 if (tcp_child_process(sk, nsk, skb)) { 1655 rsk = nsk; 1656 goto reset; 1657 } 1658 return 0; 1659 } 1660 } else 1661 sock_rps_save_rxhash(sk, skb); 1662 1663 if (tcp_rcv_state_process(sk, skb)) { 1664 rsk = sk; 1665 goto reset; 1666 } 1667 return 0; 1668 1669 reset: 1670 tcp_v4_send_reset(rsk, skb); 1671 discard: 1672 kfree_skb(skb); 1673 /* Be careful here. If this function gets more complicated and 1674 * gcc suffers from register pressure on the x86, sk (in %ebx) 1675 * might be destroyed here. This current version compiles correctly, 1676 * but you have been warned. 
1677 */ 1678 return 0; 1679 1680 csum_err: 1681 TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); 1682 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); 1683 goto discard; 1684 } 1685 EXPORT_SYMBOL(tcp_v4_do_rcv); 1686 1687 int tcp_v4_early_demux(struct sk_buff *skb) 1688 { 1689 const struct iphdr *iph; 1690 const struct tcphdr *th; 1691 struct sock *sk; 1692 1693 if (skb->pkt_type != PACKET_HOST) 1694 return 0; 1695 1696 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr))) 1697 return 0; 1698 1699 iph = ip_hdr(skb); 1700 th = tcp_hdr(skb); 1701 1702 if (th->doff < sizeof(struct tcphdr) / 4) 1703 return 0; 1704 1705 sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo, 1706 iph->saddr, th->source, 1707 iph->daddr, ntohs(th->dest), 1708 skb->skb_iif, inet_sdif(skb)); 1709 if (sk) { 1710 skb->sk = sk; 1711 skb->destructor = sock_edemux; 1712 if (sk_fullsock(sk)) { 1713 struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst); 1714 1715 if (dst) 1716 dst = dst_check(dst, 0); 1717 if (dst && 1718 inet_sk(sk)->rx_dst_ifindex == skb->skb_iif) 1719 skb_dst_set_noref(skb, dst); 1720 } 1721 } 1722 return 0; 1723 } 1724 1725 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb) 1726 { 1727 u32 limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf); 1728 struct skb_shared_info *shinfo; 1729 const struct tcphdr *th; 1730 struct tcphdr *thtail; 1731 struct sk_buff *tail; 1732 unsigned int hdrlen; 1733 bool fragstolen; 1734 u32 gso_segs; 1735 int delta; 1736 1737 /* In case all data was pulled from skb frags (in __pskb_pull_tail()), 1738 * we can fix skb->truesize to its real value to avoid future drops. 1739 * This is valid because skb is not yet charged to the socket. 1740 * It has been noticed pure SACK packets were sometimes dropped 1741 * (if cooked by drivers without copybreak feature). 1742 */ 1743 skb_condense(skb); 1744 1745 skb_dst_drop(skb); 1746 1747 if (unlikely(tcp_checksum_complete(skb))) { 1748 bh_unlock_sock(sk); 1749 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); 1750 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); 1751 return true; 1752 } 1753 1754 /* Attempt coalescing to last skb in backlog, even if we are 1755 * above the limits. 1756 * This is okay because skb capacity is limited to MAX_SKB_FRAGS. 
1757 */ 1758 th = (const struct tcphdr *)skb->data; 1759 hdrlen = th->doff * 4; 1760 shinfo = skb_shinfo(skb); 1761 1762 if (!shinfo->gso_size) 1763 shinfo->gso_size = skb->len - hdrlen; 1764 1765 if (!shinfo->gso_segs) 1766 shinfo->gso_segs = 1; 1767 1768 tail = sk->sk_backlog.tail; 1769 if (!tail) 1770 goto no_coalesce; 1771 thtail = (struct tcphdr *)tail->data; 1772 1773 if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq || 1774 TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield || 1775 ((TCP_SKB_CB(tail)->tcp_flags | 1776 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) || 1777 !((TCP_SKB_CB(tail)->tcp_flags & 1778 TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) || 1779 ((TCP_SKB_CB(tail)->tcp_flags ^ 1780 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) || 1781 #ifdef CONFIG_TLS_DEVICE 1782 tail->decrypted != skb->decrypted || 1783 #endif 1784 thtail->doff != th->doff || 1785 memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th))) 1786 goto no_coalesce; 1787 1788 __skb_pull(skb, hdrlen); 1789 if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) { 1790 thtail->window = th->window; 1791 1792 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq; 1793 1794 if (after(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq)) 1795 TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq; 1796 1797 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and 1798 * thtail->fin, so that the fast path in tcp_rcv_established() 1799 * is not entered if we append a packet with a FIN. 1800 * SYN, RST, URG are not present. 1801 * ACK is set on both packets. 1802 * PSH : we do not really care in TCP stack, 1803 * at least for 'GRO' packets. 1804 */ 1805 thtail->fin |= th->fin; 1806 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags; 1807 1808 if (TCP_SKB_CB(skb)->has_rxtstamp) { 1809 TCP_SKB_CB(tail)->has_rxtstamp = true; 1810 tail->tstamp = skb->tstamp; 1811 skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp; 1812 } 1813 1814 /* Not as strict as GRO. We only need to carry mss max value */ 1815 skb_shinfo(tail)->gso_size = max(shinfo->gso_size, 1816 skb_shinfo(tail)->gso_size); 1817 1818 gso_segs = skb_shinfo(tail)->gso_segs + shinfo->gso_segs; 1819 skb_shinfo(tail)->gso_segs = min_t(u32, gso_segs, 0xFFFF); 1820 1821 sk->sk_backlog.len += delta; 1822 __NET_INC_STATS(sock_net(sk), 1823 LINUX_MIB_TCPBACKLOGCOALESCE); 1824 kfree_skb_partial(skb, fragstolen); 1825 return false; 1826 } 1827 __skb_push(skb, hdrlen); 1828 1829 no_coalesce: 1830 /* Only socket owner can try to collapse/prune rx queues 1831 * to reduce memory overhead, so add a little headroom here. 1832 * Few sockets backlog are possibly concurrently non empty. 
1833 */ 1834 limit += 64*1024; 1835 1836 if (unlikely(sk_add_backlog(sk, skb, limit))) { 1837 bh_unlock_sock(sk); 1838 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP); 1839 return true; 1840 } 1841 return false; 1842 } 1843 EXPORT_SYMBOL(tcp_add_backlog); 1844 1845 int tcp_filter(struct sock *sk, struct sk_buff *skb) 1846 { 1847 struct tcphdr *th = (struct tcphdr *)skb->data; 1848 1849 return sk_filter_trim_cap(sk, skb, th->doff * 4); 1850 } 1851 EXPORT_SYMBOL(tcp_filter); 1852 1853 static void tcp_v4_restore_cb(struct sk_buff *skb) 1854 { 1855 memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4, 1856 sizeof(struct inet_skb_parm)); 1857 } 1858 1859 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph, 1860 const struct tcphdr *th) 1861 { 1862 /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB() 1863 * barrier() makes sure compiler wont play fool^Waliasing games. 1864 */ 1865 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb), 1866 sizeof(struct inet_skb_parm)); 1867 barrier(); 1868 1869 TCP_SKB_CB(skb)->seq = ntohl(th->seq); 1870 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + 1871 skb->len - th->doff * 4); 1872 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); 1873 TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th); 1874 TCP_SKB_CB(skb)->tcp_tw_isn = 0; 1875 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph); 1876 TCP_SKB_CB(skb)->sacked = 0; 1877 TCP_SKB_CB(skb)->has_rxtstamp = 1878 skb->tstamp || skb_hwtstamps(skb)->hwtstamp; 1879 } 1880 1881 /* 1882 * From tcp_input.c 1883 */ 1884 1885 int tcp_v4_rcv(struct sk_buff *skb) 1886 { 1887 struct net *net = dev_net(skb->dev); 1888 struct sk_buff *skb_to_free; 1889 int sdif = inet_sdif(skb); 1890 int dif = inet_iif(skb); 1891 const struct iphdr *iph; 1892 const struct tcphdr *th; 1893 bool refcounted; 1894 struct sock *sk; 1895 int ret; 1896 1897 if (skb->pkt_type != PACKET_HOST) 1898 goto discard_it; 1899 1900 /* Count it even if it's bad */ 1901 __TCP_INC_STATS(net, TCP_MIB_INSEGS); 1902 1903 if (!pskb_may_pull(skb, sizeof(struct tcphdr))) 1904 goto discard_it; 1905 1906 th = (const struct tcphdr *)skb->data; 1907 1908 if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) 1909 goto bad_packet; 1910 if (!pskb_may_pull(skb, th->doff * 4)) 1911 goto discard_it; 1912 1913 /* An explanation is required here, I think. 1914 * Packet length and doff are validated by header prediction, 1915 * provided case of th->doff==0 is eliminated. 1916 * So, we defer the checks. 
*/ 1917 1918 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo)) 1919 goto csum_error; 1920 1921 th = (const struct tcphdr *)skb->data; 1922 iph = ip_hdr(skb); 1923 lookup: 1924 sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source, 1925 th->dest, sdif, &refcounted); 1926 if (!sk) 1927 goto no_tcp_socket; 1928 1929 process: 1930 if (sk->sk_state == TCP_TIME_WAIT) 1931 goto do_time_wait; 1932 1933 if (sk->sk_state == TCP_NEW_SYN_RECV) { 1934 struct request_sock *req = inet_reqsk(sk); 1935 bool req_stolen = false; 1936 struct sock *nsk; 1937 1938 sk = req->rsk_listener; 1939 if (unlikely(tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))) { 1940 sk_drops_add(sk, skb); 1941 reqsk_put(req); 1942 goto discard_it; 1943 } 1944 if (tcp_checksum_complete(skb)) { 1945 reqsk_put(req); 1946 goto csum_error; 1947 } 1948 if (unlikely(sk->sk_state != TCP_LISTEN)) { 1949 inet_csk_reqsk_queue_drop_and_put(sk, req); 1950 goto lookup; 1951 } 1952 /* We own a reference on the listener, increase it again 1953 * as we might lose it too soon. 1954 */ 1955 sock_hold(sk); 1956 refcounted = true; 1957 nsk = NULL; 1958 if (!tcp_filter(sk, skb)) { 1959 th = (const struct tcphdr *)skb->data; 1960 iph = ip_hdr(skb); 1961 tcp_v4_fill_cb(skb, iph, th); 1962 nsk = tcp_check_req(sk, skb, req, false, &req_stolen); 1963 } 1964 if (!nsk) { 1965 reqsk_put(req); 1966 if (req_stolen) { 1967 /* Another cpu got exclusive access to req 1968 * and created a full blown socket. 1969 * Try to feed this packet to this socket 1970 * instead of discarding it. 1971 */ 1972 tcp_v4_restore_cb(skb); 1973 sock_put(sk); 1974 goto lookup; 1975 } 1976 goto discard_and_relse; 1977 } 1978 if (nsk == sk) { 1979 reqsk_put(req); 1980 tcp_v4_restore_cb(skb); 1981 } else if (tcp_child_process(sk, nsk, skb)) { 1982 tcp_v4_send_reset(nsk, skb); 1983 goto discard_and_relse; 1984 } else { 1985 sock_put(sk); 1986 return 0; 1987 } 1988 } 1989 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) { 1990 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); 1991 goto discard_and_relse; 1992 } 1993 1994 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) 1995 goto discard_and_relse; 1996 1997 if (tcp_v4_inbound_md5_hash(sk, skb, dif, sdif)) 1998 goto discard_and_relse; 1999 2000 nf_reset_ct(skb); 2001 2002 if (tcp_filter(sk, skb)) 2003 goto discard_and_relse; 2004 th = (const struct tcphdr *)skb->data; 2005 iph = ip_hdr(skb); 2006 tcp_v4_fill_cb(skb, iph, th); 2007 2008 skb->dev = NULL; 2009 2010 if (sk->sk_state == TCP_LISTEN) { 2011 ret = tcp_v4_do_rcv(sk, skb); 2012 goto put_and_return; 2013 } 2014 2015 sk_incoming_cpu_update(sk); 2016 2017 bh_lock_sock_nested(sk); 2018 tcp_segs_in(tcp_sk(sk), skb); 2019 ret = 0; 2020 if (!sock_owned_by_user(sk)) { 2021 skb_to_free = sk->sk_rx_skb_cache; 2022 sk->sk_rx_skb_cache = NULL; 2023 ret = tcp_v4_do_rcv(sk, skb); 2024 } else { 2025 if (tcp_add_backlog(sk, skb)) 2026 goto discard_and_relse; 2027 skb_to_free = NULL; 2028 } 2029 bh_unlock_sock(sk); 2030 if (skb_to_free) 2031 __kfree_skb(skb_to_free); 2032 2033 put_and_return: 2034 if (refcounted) 2035 sock_put(sk); 2036 2037 return ret; 2038 2039 no_tcp_socket: 2040 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) 2041 goto discard_it; 2042 2043 tcp_v4_fill_cb(skb, iph, th); 2044 2045 if (tcp_checksum_complete(skb)) { 2046 csum_error: 2047 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS); 2048 bad_packet: 2049 __TCP_INC_STATS(net, TCP_MIB_INERRS); 2050 } else { 2051 tcp_v4_send_reset(NULL, skb); 2052 } 2053 2054 discard_it: 2055 /* Discard frame. 
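 * Packets reaching this label are simply freed; any MIB counter of
 * interest was already bumped before the jump.  The discard_and_relse
 * label below additionally charges the drop to the socket via
 * sk_drops_add() and releases the reference taken at lookup time
 * before falling through to the same kfree_skb().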
*/ 2056 kfree_skb(skb); 2057 return 0; 2058 2059 discard_and_relse: 2060 sk_drops_add(sk, skb); 2061 if (refcounted) 2062 sock_put(sk); 2063 goto discard_it; 2064 2065 do_time_wait: 2066 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { 2067 inet_twsk_put(inet_twsk(sk)); 2068 goto discard_it; 2069 } 2070 2071 tcp_v4_fill_cb(skb, iph, th); 2072 2073 if (tcp_checksum_complete(skb)) { 2074 inet_twsk_put(inet_twsk(sk)); 2075 goto csum_error; 2076 } 2077 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) { 2078 case TCP_TW_SYN: { 2079 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev), 2080 &tcp_hashinfo, skb, 2081 __tcp_hdrlen(th), 2082 iph->saddr, th->source, 2083 iph->daddr, th->dest, 2084 inet_iif(skb), 2085 sdif); 2086 if (sk2) { 2087 inet_twsk_deschedule_put(inet_twsk(sk)); 2088 sk = sk2; 2089 tcp_v4_restore_cb(skb); 2090 refcounted = false; 2091 goto process; 2092 } 2093 } 2094 /* to ACK */ 2095 fallthrough; 2096 case TCP_TW_ACK: 2097 tcp_v4_timewait_ack(sk, skb); 2098 break; 2099 case TCP_TW_RST: 2100 tcp_v4_send_reset(sk, skb); 2101 inet_twsk_deschedule_put(inet_twsk(sk)); 2102 goto discard_it; 2103 case TCP_TW_SUCCESS:; 2104 } 2105 goto discard_it; 2106 } 2107 2108 static struct timewait_sock_ops tcp_timewait_sock_ops = { 2109 .twsk_obj_size = sizeof(struct tcp_timewait_sock), 2110 .twsk_unique = tcp_twsk_unique, 2111 .twsk_destructor= tcp_twsk_destructor, 2112 }; 2113 2114 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) 2115 { 2116 struct dst_entry *dst = skb_dst(skb); 2117 2118 if (dst && dst_hold_safe(dst)) { 2119 sk->sk_rx_dst = dst; 2120 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif; 2121 } 2122 } 2123 EXPORT_SYMBOL(inet_sk_rx_dst_set); 2124 2125 const struct inet_connection_sock_af_ops ipv4_specific = { 2126 .queue_xmit = ip_queue_xmit, 2127 .send_check = tcp_v4_send_check, 2128 .rebuild_header = inet_sk_rebuild_header, 2129 .sk_rx_dst_set = inet_sk_rx_dst_set, 2130 .conn_request = tcp_v4_conn_request, 2131 .syn_recv_sock = tcp_v4_syn_recv_sock, 2132 .net_header_len = sizeof(struct iphdr), 2133 .setsockopt = ip_setsockopt, 2134 .getsockopt = ip_getsockopt, 2135 .addr2sockaddr = inet_csk_addr2sockaddr, 2136 .sockaddr_len = sizeof(struct sockaddr_in), 2137 #ifdef CONFIG_COMPAT 2138 .compat_setsockopt = compat_ip_setsockopt, 2139 .compat_getsockopt = compat_ip_getsockopt, 2140 #endif 2141 .mtu_reduced = tcp_v4_mtu_reduced, 2142 }; 2143 EXPORT_SYMBOL(ipv4_specific); 2144 2145 #ifdef CONFIG_TCP_MD5SIG 2146 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = { 2147 .md5_lookup = tcp_v4_md5_lookup, 2148 .calc_md5_hash = tcp_v4_md5_hash_skb, 2149 .md5_parse = tcp_v4_parse_md5_keys, 2150 }; 2151 #endif 2152 2153 /* NOTE: A lot of things set to zero explicitly by call to 2154 * sk_alloc() so need not be done here. 2155 */ 2156 static int tcp_v4_init_sock(struct sock *sk) 2157 { 2158 struct inet_connection_sock *icsk = inet_csk(sk); 2159 2160 tcp_init_sock(sk); 2161 2162 icsk->icsk_af_ops = &ipv4_specific; 2163 2164 #ifdef CONFIG_TCP_MD5SIG 2165 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific; 2166 #endif 2167 2168 return 0; 2169 } 2170 2171 void tcp_v4_destroy_sock(struct sock *sk) 2172 { 2173 struct tcp_sock *tp = tcp_sk(sk); 2174 2175 trace_tcp_destroy_sock(sk); 2176 2177 tcp_clear_xmit_timers(sk); 2178 2179 tcp_cleanup_congestion_control(sk); 2180 2181 tcp_cleanup_ulp(sk); 2182 2183 /* Cleanup up the write buffer. 
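 * (tcp_write_queue_purge() frees any skbs still sitting in the send
 * and retransmit queues and reclaims their memory accounting, which
 * is safe here because nothing can be transmitted on a socket that is
 * being destroyed.)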
*/ 2184 tcp_write_queue_purge(sk); 2185 2186 /* Check if we want to disable active TFO */ 2187 tcp_fastopen_active_disable_ofo_check(sk); 2188 2189 /* Cleans up our, hopefully empty, out_of_order_queue. */ 2190 skb_rbtree_purge(&tp->out_of_order_queue); 2191 2192 #ifdef CONFIG_TCP_MD5SIG 2193 /* Clean up the MD5 key list, if any */ 2194 if (tp->md5sig_info) { 2195 tcp_clear_md5_list(sk); 2196 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu); 2197 tp->md5sig_info = NULL; 2198 } 2199 #endif 2200 2201 /* Clean up a referenced TCP bind bucket. */ 2202 if (inet_csk(sk)->icsk_bind_hash) 2203 inet_put_port(sk); 2204 2205 BUG_ON(rcu_access_pointer(tp->fastopen_rsk)); 2206 2207 /* If socket is aborted during connect operation */ 2208 tcp_free_fastopen_req(tp); 2209 tcp_fastopen_destroy_cipher(sk); 2210 tcp_saved_syn_free(tp); 2211 2212 sk_sockets_allocated_dec(sk); 2213 } 2214 EXPORT_SYMBOL(tcp_v4_destroy_sock); 2215 2216 #ifdef CONFIG_PROC_FS 2217 /* Proc filesystem TCP sock list dumping. */ 2218 2219 /* 2220 * Get next listener socket follow cur. If cur is NULL, get first socket 2221 * starting from bucket given in st->bucket; when st->bucket is zero the 2222 * very first socket in the hash table is returned. 2223 */ 2224 static void *listening_get_next(struct seq_file *seq, void *cur) 2225 { 2226 struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file)); 2227 struct tcp_iter_state *st = seq->private; 2228 struct net *net = seq_file_net(seq); 2229 struct inet_listen_hashbucket *ilb; 2230 struct hlist_nulls_node *node; 2231 struct sock *sk = cur; 2232 2233 if (!sk) { 2234 get_head: 2235 ilb = &tcp_hashinfo.listening_hash[st->bucket]; 2236 spin_lock(&ilb->lock); 2237 sk = sk_nulls_head(&ilb->nulls_head); 2238 st->offset = 0; 2239 goto get_sk; 2240 } 2241 ilb = &tcp_hashinfo.listening_hash[st->bucket]; 2242 ++st->num; 2243 ++st->offset; 2244 2245 sk = sk_nulls_next(sk); 2246 get_sk: 2247 sk_nulls_for_each_from(sk, node) { 2248 if (!net_eq(sock_net(sk), net)) 2249 continue; 2250 if (sk->sk_family == afinfo->family) 2251 return sk; 2252 } 2253 spin_unlock(&ilb->lock); 2254 st->offset = 0; 2255 if (++st->bucket < INET_LHTABLE_SIZE) 2256 goto get_head; 2257 return NULL; 2258 } 2259 2260 static void *listening_get_idx(struct seq_file *seq, loff_t *pos) 2261 { 2262 struct tcp_iter_state *st = seq->private; 2263 void *rc; 2264 2265 st->bucket = 0; 2266 st->offset = 0; 2267 rc = listening_get_next(seq, NULL); 2268 2269 while (rc && *pos) { 2270 rc = listening_get_next(seq, rc); 2271 --*pos; 2272 } 2273 return rc; 2274 } 2275 2276 static inline bool empty_bucket(const struct tcp_iter_state *st) 2277 { 2278 return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain); 2279 } 2280 2281 /* 2282 * Get first established socket starting from bucket given in st->bucket. 2283 * If st->bucket is zero, the very first socket in the hash is returned. 
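 *
 * Locking convention for the walk below: empty buckets are skipped
 * without taking their lock, a non-empty bucket's spinlock is taken
 * before its chain is scanned, and the lock is left held when a
 * matching socket is returned so the caller can keep iterating.  It is
 * dropped either in established_get_next() when moving on to the next
 * bucket, or in tcp_seq_stop() when the dump ends.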
2284 */ 2285 static void *established_get_first(struct seq_file *seq) 2286 { 2287 struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file)); 2288 struct tcp_iter_state *st = seq->private; 2289 struct net *net = seq_file_net(seq); 2290 void *rc = NULL; 2291 2292 st->offset = 0; 2293 for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) { 2294 struct sock *sk; 2295 struct hlist_nulls_node *node; 2296 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket); 2297 2298 /* Lockless fast path for the common case of empty buckets */ 2299 if (empty_bucket(st)) 2300 continue; 2301 2302 spin_lock_bh(lock); 2303 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) { 2304 if (sk->sk_family != afinfo->family || 2305 !net_eq(sock_net(sk), net)) { 2306 continue; 2307 } 2308 rc = sk; 2309 goto out; 2310 } 2311 spin_unlock_bh(lock); 2312 } 2313 out: 2314 return rc; 2315 } 2316 2317 static void *established_get_next(struct seq_file *seq, void *cur) 2318 { 2319 struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file)); 2320 struct sock *sk = cur; 2321 struct hlist_nulls_node *node; 2322 struct tcp_iter_state *st = seq->private; 2323 struct net *net = seq_file_net(seq); 2324 2325 ++st->num; 2326 ++st->offset; 2327 2328 sk = sk_nulls_next(sk); 2329 2330 sk_nulls_for_each_from(sk, node) { 2331 if (sk->sk_family == afinfo->family && 2332 net_eq(sock_net(sk), net)) 2333 return sk; 2334 } 2335 2336 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); 2337 ++st->bucket; 2338 return established_get_first(seq); 2339 } 2340 2341 static void *established_get_idx(struct seq_file *seq, loff_t pos) 2342 { 2343 struct tcp_iter_state *st = seq->private; 2344 void *rc; 2345 2346 st->bucket = 0; 2347 rc = established_get_first(seq); 2348 2349 while (rc && pos) { 2350 rc = established_get_next(seq, rc); 2351 --pos; 2352 } 2353 return rc; 2354 } 2355 2356 static void *tcp_get_idx(struct seq_file *seq, loff_t pos) 2357 { 2358 void *rc; 2359 struct tcp_iter_state *st = seq->private; 2360 2361 st->state = TCP_SEQ_STATE_LISTENING; 2362 rc = listening_get_idx(seq, &pos); 2363 2364 if (!rc) { 2365 st->state = TCP_SEQ_STATE_ESTABLISHED; 2366 rc = established_get_idx(seq, pos); 2367 } 2368 2369 return rc; 2370 } 2371 2372 static void *tcp_seek_last_pos(struct seq_file *seq) 2373 { 2374 struct tcp_iter_state *st = seq->private; 2375 int offset = st->offset; 2376 int orig_num = st->num; 2377 void *rc = NULL; 2378 2379 switch (st->state) { 2380 case TCP_SEQ_STATE_LISTENING: 2381 if (st->bucket >= INET_LHTABLE_SIZE) 2382 break; 2383 st->state = TCP_SEQ_STATE_LISTENING; 2384 rc = listening_get_next(seq, NULL); 2385 while (offset-- && rc) 2386 rc = listening_get_next(seq, rc); 2387 if (rc) 2388 break; 2389 st->bucket = 0; 2390 st->state = TCP_SEQ_STATE_ESTABLISHED; 2391 fallthrough; 2392 case TCP_SEQ_STATE_ESTABLISHED: 2393 if (st->bucket > tcp_hashinfo.ehash_mask) 2394 break; 2395 rc = established_get_first(seq); 2396 while (offset-- && rc) 2397 rc = established_get_next(seq, rc); 2398 } 2399 2400 st->num = orig_num; 2401 2402 return rc; 2403 } 2404 2405 void *tcp_seq_start(struct seq_file *seq, loff_t *pos) 2406 { 2407 struct tcp_iter_state *st = seq->private; 2408 void *rc; 2409 2410 if (*pos && *pos == st->last_pos) { 2411 rc = tcp_seek_last_pos(seq); 2412 if (rc) 2413 goto out; 2414 } 2415 2416 st->state = TCP_SEQ_STATE_LISTENING; 2417 st->num = 0; 2418 st->bucket = 0; 2419 st->offset = 0; 2420 rc = *pos ? 
tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; 2421 2422 out: 2423 st->last_pos = *pos; 2424 return rc; 2425 } 2426 EXPORT_SYMBOL(tcp_seq_start); 2427 2428 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) 2429 { 2430 struct tcp_iter_state *st = seq->private; 2431 void *rc = NULL; 2432 2433 if (v == SEQ_START_TOKEN) { 2434 rc = tcp_get_idx(seq, 0); 2435 goto out; 2436 } 2437 2438 switch (st->state) { 2439 case TCP_SEQ_STATE_LISTENING: 2440 rc = listening_get_next(seq, v); 2441 if (!rc) { 2442 st->state = TCP_SEQ_STATE_ESTABLISHED; 2443 st->bucket = 0; 2444 st->offset = 0; 2445 rc = established_get_first(seq); 2446 } 2447 break; 2448 case TCP_SEQ_STATE_ESTABLISHED: 2449 rc = established_get_next(seq, v); 2450 break; 2451 } 2452 out: 2453 ++*pos; 2454 st->last_pos = *pos; 2455 return rc; 2456 } 2457 EXPORT_SYMBOL(tcp_seq_next); 2458 2459 void tcp_seq_stop(struct seq_file *seq, void *v) 2460 { 2461 struct tcp_iter_state *st = seq->private; 2462 2463 switch (st->state) { 2464 case TCP_SEQ_STATE_LISTENING: 2465 if (v != SEQ_START_TOKEN) 2466 spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock); 2467 break; 2468 case TCP_SEQ_STATE_ESTABLISHED: 2469 if (v) 2470 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); 2471 break; 2472 } 2473 } 2474 EXPORT_SYMBOL(tcp_seq_stop); 2475 2476 static void get_openreq4(const struct request_sock *req, 2477 struct seq_file *f, int i) 2478 { 2479 const struct inet_request_sock *ireq = inet_rsk(req); 2480 long delta = req->rsk_timer.expires - jiffies; 2481 2482 seq_printf(f, "%4d: %08X:%04X %08X:%04X" 2483 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK", 2484 i, 2485 ireq->ir_loc_addr, 2486 ireq->ir_num, 2487 ireq->ir_rmt_addr, 2488 ntohs(ireq->ir_rmt_port), 2489 TCP_SYN_RECV, 2490 0, 0, /* could print option size, but that is af dependent. */ 2491 1, /* timers active (only the expire timer) */ 2492 jiffies_delta_to_clock_t(delta), 2493 req->num_timeout, 2494 from_kuid_munged(seq_user_ns(f), 2495 sock_i_uid(req->rsk_listener)), 2496 0, /* non standard timer */ 2497 0, /* open_requests have no inode */ 2498 0, 2499 req); 2500 } 2501 2502 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) 2503 { 2504 int timer_active; 2505 unsigned long timer_expires; 2506 const struct tcp_sock *tp = tcp_sk(sk); 2507 const struct inet_connection_sock *icsk = inet_csk(sk); 2508 const struct inet_sock *inet = inet_sk(sk); 2509 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq; 2510 __be32 dest = inet->inet_daddr; 2511 __be32 src = inet->inet_rcv_saddr; 2512 __u16 destp = ntohs(inet->inet_dport); 2513 __u16 srcp = ntohs(inet->inet_sport); 2514 int rx_queue; 2515 int state; 2516 2517 if (icsk->icsk_pending == ICSK_TIME_RETRANS || 2518 icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT || 2519 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { 2520 timer_active = 1; 2521 timer_expires = icsk->icsk_timeout; 2522 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) { 2523 timer_active = 4; 2524 timer_expires = icsk->icsk_timeout; 2525 } else if (timer_pending(&sk->sk_timer)) { 2526 timer_active = 2; 2527 timer_expires = sk->sk_timer.expires; 2528 } else { 2529 timer_active = 0; 2530 timer_expires = jiffies; 2531 } 2532 2533 state = inet_sk_state_load(sk); 2534 if (state == TCP_LISTEN) 2535 rx_queue = READ_ONCE(sk->sk_ack_backlog); 2536 else 2537 /* Because we don't lock the socket, 2538 * we might find a transient negative value. 
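 *
 * Worked example: tp->rcv_nxt is the next sequence number expected
 * from the peer and tp->copied_seq records how far the application
 * has read.  With rcv_nxt at 10000 and copied_seq at 9200, 800 bytes
 * are reported as pending in rx_queue.  A reader racing with
 * recvmsg() may momentarily observe copied_seq ahead of rcv_nxt,
 * which is why the result is clamped at zero below.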
2539 */ 2540 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) - 2541 READ_ONCE(tp->copied_seq), 0); 2542 2543 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX " 2544 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d", 2545 i, src, srcp, dest, destp, state, 2546 READ_ONCE(tp->write_seq) - tp->snd_una, 2547 rx_queue, 2548 timer_active, 2549 jiffies_delta_to_clock_t(timer_expires - jiffies), 2550 icsk->icsk_retransmits, 2551 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)), 2552 icsk->icsk_probes_out, 2553 sock_i_ino(sk), 2554 refcount_read(&sk->sk_refcnt), sk, 2555 jiffies_to_clock_t(icsk->icsk_rto), 2556 jiffies_to_clock_t(icsk->icsk_ack.ato), 2557 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk), 2558 tp->snd_cwnd, 2559 state == TCP_LISTEN ? 2560 fastopenq->max_qlen : 2561 (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh)); 2562 } 2563 2564 static void get_timewait4_sock(const struct inet_timewait_sock *tw, 2565 struct seq_file *f, int i) 2566 { 2567 long delta = tw->tw_timer.expires - jiffies; 2568 __be32 dest, src; 2569 __u16 destp, srcp; 2570 2571 dest = tw->tw_daddr; 2572 src = tw->tw_rcv_saddr; 2573 destp = ntohs(tw->tw_dport); 2574 srcp = ntohs(tw->tw_sport); 2575 2576 seq_printf(f, "%4d: %08X:%04X %08X:%04X" 2577 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK", 2578 i, src, srcp, dest, destp, tw->tw_substate, 0, 0, 2579 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0, 2580 refcount_read(&tw->tw_refcnt), tw); 2581 } 2582 2583 #define TMPSZ 150 2584 2585 static int tcp4_seq_show(struct seq_file *seq, void *v) 2586 { 2587 struct tcp_iter_state *st; 2588 struct sock *sk = v; 2589 2590 seq_setwidth(seq, TMPSZ - 1); 2591 if (v == SEQ_START_TOKEN) { 2592 seq_puts(seq, " sl local_address rem_address st tx_queue " 2593 "rx_queue tr tm->when retrnsmt uid timeout " 2594 "inode"); 2595 goto out; 2596 } 2597 st = seq->private; 2598 2599 if (sk->sk_state == TCP_TIME_WAIT) 2600 get_timewait4_sock(v, seq, st->num); 2601 else if (sk->sk_state == TCP_NEW_SYN_RECV) 2602 get_openreq4(v, seq, st->num); 2603 else 2604 get_tcp4_sock(v, seq, st->num); 2605 out: 2606 seq_pad(seq, '\n'); 2607 return 0; 2608 } 2609 2610 static const struct seq_operations tcp4_seq_ops = { 2611 .show = tcp4_seq_show, 2612 .start = tcp_seq_start, 2613 .next = tcp_seq_next, 2614 .stop = tcp_seq_stop, 2615 }; 2616 2617 static struct tcp_seq_afinfo tcp4_seq_afinfo = { 2618 .family = AF_INET, 2619 }; 2620 2621 static int __net_init tcp4_proc_init_net(struct net *net) 2622 { 2623 if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops, 2624 sizeof(struct tcp_iter_state), &tcp4_seq_afinfo)) 2625 return -ENOMEM; 2626 return 0; 2627 } 2628 2629 static void __net_exit tcp4_proc_exit_net(struct net *net) 2630 { 2631 remove_proc_entry("tcp", net->proc_net); 2632 } 2633 2634 static struct pernet_operations tcp4_net_ops = { 2635 .init = tcp4_proc_init_net, 2636 .exit = tcp4_proc_exit_net, 2637 }; 2638 2639 int __init tcp4_proc_init(void) 2640 { 2641 return register_pernet_subsys(&tcp4_net_ops); 2642 } 2643 2644 void tcp4_proc_exit(void) 2645 { 2646 unregister_pernet_subsys(&tcp4_net_ops); 2647 } 2648 #endif /* CONFIG_PROC_FS */ 2649 2650 struct proto tcp_prot = { 2651 .name = "TCP", 2652 .owner = THIS_MODULE, 2653 .close = tcp_close, 2654 .pre_connect = tcp_v4_pre_connect, 2655 .connect = tcp_v4_connect, 2656 .disconnect = tcp_disconnect, 2657 .accept = inet_csk_accept, 2658 .ioctl = tcp_ioctl, 2659 .init = tcp_v4_init_sock, 2660 .destroy = tcp_v4_destroy_sock, 2661 .shutdown = tcp_shutdown, 2662 
.setsockopt = tcp_setsockopt, 2663 .getsockopt = tcp_getsockopt, 2664 .keepalive = tcp_set_keepalive, 2665 .recvmsg = tcp_recvmsg, 2666 .sendmsg = tcp_sendmsg, 2667 .sendpage = tcp_sendpage, 2668 .backlog_rcv = tcp_v4_do_rcv, 2669 .release_cb = tcp_release_cb, 2670 .hash = inet_hash, 2671 .unhash = inet_unhash, 2672 .get_port = inet_csk_get_port, 2673 .enter_memory_pressure = tcp_enter_memory_pressure, 2674 .leave_memory_pressure = tcp_leave_memory_pressure, 2675 .stream_memory_free = tcp_stream_memory_free, 2676 .sockets_allocated = &tcp_sockets_allocated, 2677 .orphan_count = &tcp_orphan_count, 2678 .memory_allocated = &tcp_memory_allocated, 2679 .memory_pressure = &tcp_memory_pressure, 2680 .sysctl_mem = sysctl_tcp_mem, 2681 .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem), 2682 .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), 2683 .max_header = MAX_TCP_HEADER, 2684 .obj_size = sizeof(struct tcp_sock), 2685 .slab_flags = SLAB_TYPESAFE_BY_RCU, 2686 .twsk_prot = &tcp_timewait_sock_ops, 2687 .rsk_prot = &tcp_request_sock_ops, 2688 .h.hashinfo = &tcp_hashinfo, 2689 .no_autobind = true, 2690 #ifdef CONFIG_COMPAT 2691 .compat_setsockopt = compat_tcp_setsockopt, 2692 .compat_getsockopt = compat_tcp_getsockopt, 2693 #endif 2694 .diag_destroy = tcp_abort, 2695 }; 2696 EXPORT_SYMBOL(tcp_prot); 2697 2698 static void __net_exit tcp_sk_exit(struct net *net) 2699 { 2700 int cpu; 2701 2702 if (net->ipv4.tcp_congestion_control) 2703 bpf_module_put(net->ipv4.tcp_congestion_control, 2704 net->ipv4.tcp_congestion_control->owner); 2705 2706 for_each_possible_cpu(cpu) 2707 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu)); 2708 free_percpu(net->ipv4.tcp_sk); 2709 } 2710 2711 static int __net_init tcp_sk_init(struct net *net) 2712 { 2713 int res, cpu, cnt; 2714 2715 net->ipv4.tcp_sk = alloc_percpu(struct sock *); 2716 if (!net->ipv4.tcp_sk) 2717 return -ENOMEM; 2718 2719 for_each_possible_cpu(cpu) { 2720 struct sock *sk; 2721 2722 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW, 2723 IPPROTO_TCP, net); 2724 if (res) 2725 goto fail; 2726 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); 2727 2728 /* Please enforce IP_DF and IPID==0 for RST and 2729 * ACK sent in SYN-RECV and TIME-WAIT state. 
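 *
 * Setting IP_PMTUDISC_DO below makes the IP layer set the DF bit on
 * everything sent through these per-cpu control sockets, which only
 * ever emit stateless RSTs and ACKs.  With DF set, the IPv4 output
 * path can keep the IP ID at zero for these packets, so they do not
 * consume IP ID space for segments that will never be fragmented.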
2730 */ 2731 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO; 2732 2733 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk; 2734 } 2735 2736 net->ipv4.sysctl_tcp_ecn = 2; 2737 net->ipv4.sysctl_tcp_ecn_fallback = 1; 2738 2739 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS; 2740 net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS; 2741 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD; 2742 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL; 2743 net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS; 2744 2745 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME; 2746 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES; 2747 net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL; 2748 2749 net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES; 2750 net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES; 2751 net->ipv4.sysctl_tcp_syncookies = 1; 2752 net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH; 2753 net->ipv4.sysctl_tcp_retries1 = TCP_RETR1; 2754 net->ipv4.sysctl_tcp_retries2 = TCP_RETR2; 2755 net->ipv4.sysctl_tcp_orphan_retries = 0; 2756 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT; 2757 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX; 2758 net->ipv4.sysctl_tcp_tw_reuse = 2; 2759 net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1; 2760 2761 cnt = tcp_hashinfo.ehash_mask + 1; 2762 net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2; 2763 net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo; 2764 2765 net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128); 2766 net->ipv4.sysctl_tcp_sack = 1; 2767 net->ipv4.sysctl_tcp_window_scaling = 1; 2768 net->ipv4.sysctl_tcp_timestamps = 1; 2769 net->ipv4.sysctl_tcp_early_retrans = 3; 2770 net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION; 2771 net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */ 2772 net->ipv4.sysctl_tcp_retrans_collapse = 1; 2773 net->ipv4.sysctl_tcp_max_reordering = 300; 2774 net->ipv4.sysctl_tcp_dsack = 1; 2775 net->ipv4.sysctl_tcp_app_win = 31; 2776 net->ipv4.sysctl_tcp_adv_win_scale = 1; 2777 net->ipv4.sysctl_tcp_frto = 2; 2778 net->ipv4.sysctl_tcp_moderate_rcvbuf = 1; 2779 /* This limits the percentage of the congestion window which we 2780 * will allow a single TSO frame to consume. Building TSO frames 2781 * which are too large can cause TCP streams to be bursty. 
2782 */ 2783 net->ipv4.sysctl_tcp_tso_win_divisor = 3; 2784 /* Default TSQ limit of 16 TSO segments */ 2785 net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536; 2786 /* rfc5961 challenge ack rate limiting */ 2787 net->ipv4.sysctl_tcp_challenge_ack_limit = 1000; 2788 net->ipv4.sysctl_tcp_min_tso_segs = 2; 2789 net->ipv4.sysctl_tcp_min_rtt_wlen = 300; 2790 net->ipv4.sysctl_tcp_autocorking = 1; 2791 net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2; 2792 net->ipv4.sysctl_tcp_pacing_ss_ratio = 200; 2793 net->ipv4.sysctl_tcp_pacing_ca_ratio = 120; 2794 if (net != &init_net) { 2795 memcpy(net->ipv4.sysctl_tcp_rmem, 2796 init_net.ipv4.sysctl_tcp_rmem, 2797 sizeof(init_net.ipv4.sysctl_tcp_rmem)); 2798 memcpy(net->ipv4.sysctl_tcp_wmem, 2799 init_net.ipv4.sysctl_tcp_wmem, 2800 sizeof(init_net.ipv4.sysctl_tcp_wmem)); 2801 } 2802 net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC; 2803 net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC; 2804 net->ipv4.sysctl_tcp_comp_sack_nr = 44; 2805 net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; 2806 spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock); 2807 net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60; 2808 atomic_set(&net->ipv4.tfo_active_disable_times, 0); 2809 2810 /* Reno is always built in */ 2811 if (!net_eq(net, &init_net) && 2812 bpf_try_module_get(init_net.ipv4.tcp_congestion_control, 2813 init_net.ipv4.tcp_congestion_control->owner)) 2814 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control; 2815 else 2816 net->ipv4.tcp_congestion_control = &tcp_reno; 2817 2818 return 0; 2819 fail: 2820 tcp_sk_exit(net); 2821 2822 return res; 2823 } 2824 2825 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list) 2826 { 2827 struct net *net; 2828 2829 inet_twsk_purge(&tcp_hashinfo, AF_INET); 2830 2831 list_for_each_entry(net, net_exit_list, exit_list) 2832 tcp_fastopen_ctx_destroy(net); 2833 } 2834 2835 static struct pernet_operations __net_initdata tcp_sk_ops = { 2836 .init = tcp_sk_init, 2837 .exit = tcp_sk_exit, 2838 .exit_batch = tcp_sk_exit_batch, 2839 }; 2840 2841 void __init tcp_v4_init(void) 2842 { 2843 if (register_pernet_subsys(&tcp_sk_ops)) 2844 panic("Failed to create the TCP control socket.\n"); 2845 } 2846