1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * INET An implementation of the TCP/IP protocol suite for the LINUX 4 * operating system. INET is implemented using the BSD Socket 5 * interface as the means of communication with the user level. 6 * 7 * Implementation of the Transmission Control Protocol(TCP). 8 * 9 * IPv4 specific functions 10 * 11 * code split from: 12 * linux/ipv4/tcp.c 13 * linux/ipv4/tcp_input.c 14 * linux/ipv4/tcp_output.c 15 * 16 * See tcp.c for author information 17 */ 18 19 /* 20 * Changes: 21 * David S. Miller : New socket lookup architecture. 22 * This code is dedicated to John Dyson. 23 * David S. Miller : Change semantics of established hash, 24 * half is devoted to TIME_WAIT sockets 25 * and the rest go in the other half. 26 * Andi Kleen : Add support for syncookies and fixed 27 * some bugs: ip options weren't passed to 28 * the TCP layer, missed a check for an 29 * ACK bit. 30 * Andi Kleen : Implemented fast path mtu discovery. 31 * Fixed many serious bugs in the 32 * request_sock handling and moved 33 * most of it into the af independent code. 34 * Added tail drop and some other bugfixes. 35 * Added new listen semantics. 36 * Mike McLagan : Routing by source 37 * Juan Jose Ciarlante: ip_dynaddr bits 38 * Andi Kleen: various fixes. 39 * Vitaly E. Lavrov : Transparent proxy revived after year 40 * coma. 41 * Andi Kleen : Fix new listen. 42 * Andi Kleen : Fix accept error reporting. 43 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which 44 * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind 45 * a single port at the same time. 46 */ 47 48 #define pr_fmt(fmt) "TCP: " fmt 49 50 #include <linux/bottom_half.h> 51 #include <linux/types.h> 52 #include <linux/fcntl.h> 53 #include <linux/module.h> 54 #include <linux/random.h> 55 #include <linux/cache.h> 56 #include <linux/jhash.h> 57 #include <linux/init.h> 58 #include <linux/times.h> 59 #include <linux/slab.h> 60 61 #include <net/net_namespace.h> 62 #include <net/icmp.h> 63 #include <net/inet_hashtables.h> 64 #include <net/tcp.h> 65 #include <net/transp_v6.h> 66 #include <net/ipv6.h> 67 #include <net/inet_common.h> 68 #include <net/timewait_sock.h> 69 #include <net/xfrm.h> 70 #include <net/secure_seq.h> 71 #include <net/busy_poll.h> 72 73 #include <linux/inet.h> 74 #include <linux/ipv6.h> 75 #include <linux/stddef.h> 76 #include <linux/proc_fs.h> 77 #include <linux/seq_file.h> 78 #include <linux/inetdevice.h> 79 80 #include <crypto/hash.h> 81 #include <linux/scatterlist.h> 82 83 #include <trace/events/tcp.h> 84 85 #ifdef CONFIG_TCP_MD5SIG 86 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, 87 __be32 daddr, __be32 saddr, const struct tcphdr *th); 88 #endif 89 90 struct inet_hashinfo tcp_hashinfo; 91 EXPORT_SYMBOL(tcp_hashinfo); 92 93 static u32 tcp_v4_init_seq(const struct sk_buff *skb) 94 { 95 return secure_tcp_seq(ip_hdr(skb)->daddr, 96 ip_hdr(skb)->saddr, 97 tcp_hdr(skb)->dest, 98 tcp_hdr(skb)->source); 99 } 100 101 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb) 102 { 103 return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr); 104 } 105 106 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) 107 { 108 const struct inet_timewait_sock *tw = inet_twsk(sktw); 109 const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw); 110 struct tcp_sock *tp = tcp_sk(sk); 111 int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse; 112 113 if (reuse == 2) { 114 /* Still does not detect *everything* that goes through 115 * lo, since we require a loopback src or dst address 116 * or direct binding to 'lo' interface. 117 */ 118 bool loopback = false; 119 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX) 120 loopback = true; 121 #if IS_ENABLED(CONFIG_IPV6) 122 if (tw->tw_family == AF_INET6) { 123 if (ipv6_addr_loopback(&tw->tw_v6_daddr) || 124 ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) || 125 ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) || 126 ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr)) 127 loopback = true; 128 } else 129 #endif 130 { 131 if (ipv4_is_loopback(tw->tw_daddr) || 132 ipv4_is_loopback(tw->tw_rcv_saddr)) 133 loopback = true; 134 } 135 if (!loopback) 136 reuse = 0; 137 } 138 139 /* With PAWS, it is safe from the viewpoint 140 of data integrity. Even without PAWS it is safe provided sequence 141 spaces do not overlap i.e. at data rates <= 80Mbit/sec. 142 143 Actually, the idea is close to VJ's one, only timestamp cache is 144 held not per host, but per port pair and TW bucket is used as state 145 holder. 146 147 If TW bucket has been already destroyed we fall back to VJ's scheme 148 and use initial timestamp retrieved from peer table. 149 */ 150 if (tcptw->tw_ts_recent_stamp && 151 (!twp || (reuse && time_after32(ktime_get_seconds(), 152 tcptw->tw_ts_recent_stamp)))) { 153 /* In case of repair and re-using TIME-WAIT sockets we still 154 * want to be sure that it is safe as above but honor the 155 * sequence numbers and time stamps set as part of the repair 156 * process. 157 * 158 * Without this check re-using a TIME-WAIT socket with TCP 159 * repair would accumulate a -1 on the repair assigned 160 * sequence number. The first time it is reused the sequence 161 * is -1, the second time -2, etc. This fixes that issue 162 * without appearing to create any others. 163 */ 164 if (likely(!tp->repair)) { 165 u32 seq = tcptw->tw_snd_nxt + 65535 + 2; 166 167 if (!seq) 168 seq = 1; 169 WRITE_ONCE(tp->write_seq, seq); 170 tp->rx_opt.ts_recent = tcptw->tw_ts_recent; 171 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp; 172 } 173 sock_hold(sktw); 174 return 1; 175 } 176 177 return 0; 178 } 179 EXPORT_SYMBOL_GPL(tcp_twsk_unique); 180 181 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr, 182 int addr_len) 183 { 184 /* This check is replicated from tcp_v4_connect() and intended to 185 * prevent BPF program called below from accessing bytes that are out 186 * of the bound specified by user in addr_len. 187 */ 188 if (addr_len < sizeof(struct sockaddr_in)) 189 return -EINVAL; 190 191 sock_owned_by_me(sk); 192 193 return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr); 194 } 195 196 /* This will initiate an outgoing connection. */ 197 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) 198 { 199 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr; 200 struct inet_sock *inet = inet_sk(sk); 201 struct tcp_sock *tp = tcp_sk(sk); 202 __be16 orig_sport, orig_dport; 203 __be32 daddr, nexthop; 204 struct flowi4 *fl4; 205 struct rtable *rt; 206 int err; 207 struct ip_options_rcu *inet_opt; 208 struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row; 209 210 if (addr_len < sizeof(struct sockaddr_in)) 211 return -EINVAL; 212 213 if (usin->sin_family != AF_INET) 214 return -EAFNOSUPPORT; 215 216 nexthop = daddr = usin->sin_addr.s_addr; 217 inet_opt = rcu_dereference_protected(inet->inet_opt, 218 lockdep_sock_is_held(sk)); 219 if (inet_opt && inet_opt->opt.srr) { 220 if (!daddr) 221 return -EINVAL; 222 nexthop = inet_opt->opt.faddr; 223 } 224 225 orig_sport = inet->inet_sport; 226 orig_dport = usin->sin_port; 227 fl4 = &inet->cork.fl.u.ip4; 228 rt = ip_route_connect(fl4, nexthop, inet->inet_saddr, 229 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if, 230 IPPROTO_TCP, 231 orig_sport, orig_dport, sk); 232 if (IS_ERR(rt)) { 233 err = PTR_ERR(rt); 234 if (err == -ENETUNREACH) 235 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES); 236 return err; 237 } 238 239 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) { 240 ip_rt_put(rt); 241 return -ENETUNREACH; 242 } 243 244 if (!inet_opt || !inet_opt->opt.srr) 245 daddr = fl4->daddr; 246 247 if (!inet->inet_saddr) 248 inet->inet_saddr = fl4->saddr; 249 sk_rcv_saddr_set(sk, inet->inet_saddr); 250 251 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) { 252 /* Reset inherited state */ 253 tp->rx_opt.ts_recent = 0; 254 tp->rx_opt.ts_recent_stamp = 0; 255 if (likely(!tp->repair)) 256 WRITE_ONCE(tp->write_seq, 0); 257 } 258 259 inet->inet_dport = usin->sin_port; 260 sk_daddr_set(sk, daddr); 261 262 inet_csk(sk)->icsk_ext_hdr_len = 0; 263 if (inet_opt) 264 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen; 265 266 tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT; 267 268 /* Socket identity is still unknown (sport may be zero). 269 * However we set state to SYN-SENT and not releasing socket 270 * lock select source port, enter ourselves into the hash tables and 271 * complete initialization after this. 272 */ 273 tcp_set_state(sk, TCP_SYN_SENT); 274 err = inet_hash_connect(tcp_death_row, sk); 275 if (err) 276 goto failure; 277 278 sk_set_txhash(sk); 279 280 rt = ip_route_newports(fl4, rt, orig_sport, orig_dport, 281 inet->inet_sport, inet->inet_dport, sk); 282 if (IS_ERR(rt)) { 283 err = PTR_ERR(rt); 284 rt = NULL; 285 goto failure; 286 } 287 /* OK, now commit destination to socket. */ 288 sk->sk_gso_type = SKB_GSO_TCPV4; 289 sk_setup_caps(sk, &rt->dst); 290 rt = NULL; 291 292 if (likely(!tp->repair)) { 293 if (!tp->write_seq) 294 WRITE_ONCE(tp->write_seq, 295 secure_tcp_seq(inet->inet_saddr, 296 inet->inet_daddr, 297 inet->inet_sport, 298 usin->sin_port)); 299 tp->tsoffset = secure_tcp_ts_off(sock_net(sk), 300 inet->inet_saddr, 301 inet->inet_daddr); 302 } 303 304 inet->inet_id = prandom_u32(); 305 306 if (tcp_fastopen_defer_connect(sk, &err)) 307 return err; 308 if (err) 309 goto failure; 310 311 err = tcp_connect(sk); 312 313 if (err) 314 goto failure; 315 316 return 0; 317 318 failure: 319 /* 320 * This unhashes the socket and releases the local port, 321 * if necessary. 322 */ 323 tcp_set_state(sk, TCP_CLOSE); 324 ip_rt_put(rt); 325 sk->sk_route_caps = 0; 326 inet->inet_dport = 0; 327 return err; 328 } 329 EXPORT_SYMBOL(tcp_v4_connect); 330 331 /* 332 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191. 333 * It can be called through tcp_release_cb() if socket was owned by user 334 * at the time tcp_v4_err() was called to handle ICMP message. 335 */ 336 void tcp_v4_mtu_reduced(struct sock *sk) 337 { 338 struct inet_sock *inet = inet_sk(sk); 339 struct dst_entry *dst; 340 u32 mtu; 341 342 if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) 343 return; 344 mtu = tcp_sk(sk)->mtu_info; 345 dst = inet_csk_update_pmtu(sk, mtu); 346 if (!dst) 347 return; 348 349 /* Something is about to be wrong... Remember soft error 350 * for the case, if this connection will not able to recover. 351 */ 352 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst)) 353 sk->sk_err_soft = EMSGSIZE; 354 355 mtu = dst_mtu(dst); 356 357 if (inet->pmtudisc != IP_PMTUDISC_DONT && 358 ip_sk_accept_pmtu(sk) && 359 inet_csk(sk)->icsk_pmtu_cookie > mtu) { 360 tcp_sync_mss(sk, mtu); 361 362 /* Resend the TCP packet because it's 363 * clear that the old packet has been 364 * dropped. This is the new "fast" path mtu 365 * discovery. 366 */ 367 tcp_simple_retransmit(sk); 368 } /* else let the usual retransmit timer handle it */ 369 } 370 EXPORT_SYMBOL(tcp_v4_mtu_reduced); 371 372 static void do_redirect(struct sk_buff *skb, struct sock *sk) 373 { 374 struct dst_entry *dst = __sk_dst_check(sk, 0); 375 376 if (dst) 377 dst->ops->redirect(dst, sk, skb); 378 } 379 380 381 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */ 382 void tcp_req_err(struct sock *sk, u32 seq, bool abort) 383 { 384 struct request_sock *req = inet_reqsk(sk); 385 struct net *net = sock_net(sk); 386 387 /* ICMPs are not backlogged, hence we cannot get 388 * an established socket here. 389 */ 390 if (seq != tcp_rsk(req)->snt_isn) { 391 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS); 392 } else if (abort) { 393 /* 394 * Still in SYN_RECV, just remove it silently. 395 * There is no good way to pass the error to the newly 396 * created socket, and POSIX does not want network 397 * errors returned from accept(). 398 */ 399 inet_csk_reqsk_queue_drop(req->rsk_listener, req); 400 tcp_listendrop(req->rsk_listener); 401 } 402 reqsk_put(req); 403 } 404 EXPORT_SYMBOL(tcp_req_err); 405 406 /* TCP-LD (RFC 6069) logic */ 407 void tcp_ld_RTO_revert(struct sock *sk, u32 seq) 408 { 409 struct inet_connection_sock *icsk = inet_csk(sk); 410 struct tcp_sock *tp = tcp_sk(sk); 411 struct sk_buff *skb; 412 s32 remaining; 413 u32 delta_us; 414 415 if (sock_owned_by_user(sk)) 416 return; 417 418 if (seq != tp->snd_una || !icsk->icsk_retransmits || 419 !icsk->icsk_backoff) 420 return; 421 422 skb = tcp_rtx_queue_head(sk); 423 if (WARN_ON_ONCE(!skb)) 424 return; 425 426 icsk->icsk_backoff--; 427 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT; 428 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX); 429 430 tcp_mstamp_refresh(tp); 431 delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb)); 432 remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us); 433 434 if (remaining > 0) { 435 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, 436 remaining, TCP_RTO_MAX); 437 } else { 438 /* RTO revert clocked out retransmission. 439 * Will retransmit now. 440 */ 441 tcp_retransmit_timer(sk); 442 } 443 } 444 EXPORT_SYMBOL(tcp_ld_RTO_revert); 445 446 /* 447 * This routine is called by the ICMP module when it gets some 448 * sort of error condition. If err < 0 then the socket should 449 * be closed and the error returned to the user. If err > 0 450 * it's just the icmp type << 8 | icmp code. After adjustment 451 * header points to the first 8 bytes of the tcp header. We need 452 * to find the appropriate port. 453 * 454 * The locking strategy used here is very "optimistic". When 455 * someone else accesses the socket the ICMP is just dropped 456 * and for some paths there is no check at all. 457 * A more general error queue to queue errors for later handling 458 * is probably better. 459 * 460 */ 461 462 int tcp_v4_err(struct sk_buff *skb, u32 info) 463 { 464 const struct iphdr *iph = (const struct iphdr *)skb->data; 465 struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2)); 466 struct tcp_sock *tp; 467 struct inet_sock *inet; 468 const int type = icmp_hdr(skb)->type; 469 const int code = icmp_hdr(skb)->code; 470 struct sock *sk; 471 struct request_sock *fastopen; 472 u32 seq, snd_una; 473 int err; 474 struct net *net = dev_net(skb->dev); 475 476 sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr, 477 th->dest, iph->saddr, ntohs(th->source), 478 inet_iif(skb), 0); 479 if (!sk) { 480 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); 481 return -ENOENT; 482 } 483 if (sk->sk_state == TCP_TIME_WAIT) { 484 inet_twsk_put(inet_twsk(sk)); 485 return 0; 486 } 487 seq = ntohl(th->seq); 488 if (sk->sk_state == TCP_NEW_SYN_RECV) { 489 tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB || 490 type == ICMP_TIME_EXCEEDED || 491 (type == ICMP_DEST_UNREACH && 492 (code == ICMP_NET_UNREACH || 493 code == ICMP_HOST_UNREACH))); 494 return 0; 495 } 496 497 bh_lock_sock(sk); 498 /* If too many ICMPs get dropped on busy 499 * servers this needs to be solved differently. 500 * We do take care of PMTU discovery (RFC1191) special case : 501 * we can receive locally generated ICMP messages while socket is held. 502 */ 503 if (sock_owned_by_user(sk)) { 504 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)) 505 __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS); 506 } 507 if (sk->sk_state == TCP_CLOSE) 508 goto out; 509 510 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) { 511 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); 512 goto out; 513 } 514 515 tp = tcp_sk(sk); 516 /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */ 517 fastopen = rcu_dereference(tp->fastopen_rsk); 518 snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una; 519 if (sk->sk_state != TCP_LISTEN && 520 !between(seq, snd_una, tp->snd_nxt)) { 521 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS); 522 goto out; 523 } 524 525 switch (type) { 526 case ICMP_REDIRECT: 527 if (!sock_owned_by_user(sk)) 528 do_redirect(skb, sk); 529 goto out; 530 case ICMP_SOURCE_QUENCH: 531 /* Just silently ignore these. */ 532 goto out; 533 case ICMP_PARAMETERPROB: 534 err = EPROTO; 535 break; 536 case ICMP_DEST_UNREACH: 537 if (code > NR_ICMP_UNREACH) 538 goto out; 539 540 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */ 541 /* We are not interested in TCP_LISTEN and open_requests 542 * (SYN-ACKs send out by Linux are always <576bytes so 543 * they should go through unfragmented). 544 */ 545 if (sk->sk_state == TCP_LISTEN) 546 goto out; 547 548 tp->mtu_info = info; 549 if (!sock_owned_by_user(sk)) { 550 tcp_v4_mtu_reduced(sk); 551 } else { 552 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags)) 553 sock_hold(sk); 554 } 555 goto out; 556 } 557 558 err = icmp_err_convert[code].errno; 559 /* check if this ICMP message allows revert of backoff. 560 * (see RFC 6069) 561 */ 562 if (!fastopen && 563 (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH)) 564 tcp_ld_RTO_revert(sk, seq); 565 break; 566 case ICMP_TIME_EXCEEDED: 567 err = EHOSTUNREACH; 568 break; 569 default: 570 goto out; 571 } 572 573 switch (sk->sk_state) { 574 case TCP_SYN_SENT: 575 case TCP_SYN_RECV: 576 /* Only in fast or simultaneous open. If a fast open socket is 577 * is already accepted it is treated as a connected one below. 578 */ 579 if (fastopen && !fastopen->sk) 580 break; 581 582 ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th); 583 584 if (!sock_owned_by_user(sk)) { 585 sk->sk_err = err; 586 587 sk->sk_error_report(sk); 588 589 tcp_done(sk); 590 } else { 591 sk->sk_err_soft = err; 592 } 593 goto out; 594 } 595 596 /* If we've already connected we will keep trying 597 * until we time out, or the user gives up. 598 * 599 * rfc1122 4.2.3.9 allows to consider as hard errors 600 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too, 601 * but it is obsoleted by pmtu discovery). 602 * 603 * Note, that in modern internet, where routing is unreliable 604 * and in each dark corner broken firewalls sit, sending random 605 * errors ordered by their masters even this two messages finally lose 606 * their original sense (even Linux sends invalid PORT_UNREACHs) 607 * 608 * Now we are in compliance with RFCs. 609 * --ANK (980905) 610 */ 611 612 inet = inet_sk(sk); 613 if (!sock_owned_by_user(sk) && inet->recverr) { 614 sk->sk_err = err; 615 sk->sk_error_report(sk); 616 } else { /* Only an error on timeout */ 617 sk->sk_err_soft = err; 618 } 619 620 out: 621 bh_unlock_sock(sk); 622 sock_put(sk); 623 return 0; 624 } 625 626 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr) 627 { 628 struct tcphdr *th = tcp_hdr(skb); 629 630 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0); 631 skb->csum_start = skb_transport_header(skb) - skb->head; 632 skb->csum_offset = offsetof(struct tcphdr, check); 633 } 634 635 /* This routine computes an IPv4 TCP checksum. */ 636 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb) 637 { 638 const struct inet_sock *inet = inet_sk(sk); 639 640 __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr); 641 } 642 EXPORT_SYMBOL(tcp_v4_send_check); 643 644 /* 645 * This routine will send an RST to the other tcp. 646 * 647 * Someone asks: why I NEVER use socket parameters (TOS, TTL etc.) 648 * for reset. 649 * Answer: if a packet caused RST, it is not for a socket 650 * existing in our system, if it is matched to a socket, 651 * it is just duplicate segment or bug in other side's TCP. 652 * So that we build reply only basing on parameters 653 * arrived with segment. 654 * Exception: precedence violation. We do not implement it in any case. 655 */ 656 657 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) 658 { 659 const struct tcphdr *th = tcp_hdr(skb); 660 struct { 661 struct tcphdr th; 662 #ifdef CONFIG_TCP_MD5SIG 663 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)]; 664 #endif 665 } rep; 666 struct ip_reply_arg arg; 667 #ifdef CONFIG_TCP_MD5SIG 668 struct tcp_md5sig_key *key = NULL; 669 const __u8 *hash_location = NULL; 670 unsigned char newhash[16]; 671 int genhash; 672 struct sock *sk1 = NULL; 673 #endif 674 u64 transmit_time = 0; 675 struct sock *ctl_sk; 676 struct net *net; 677 678 /* Never send a reset in response to a reset. */ 679 if (th->rst) 680 return; 681 682 /* If sk not NULL, it means we did a successful lookup and incoming 683 * route had to be correct. prequeue might have dropped our dst. 684 */ 685 if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL) 686 return; 687 688 /* Swap the send and the receive. */ 689 memset(&rep, 0, sizeof(rep)); 690 rep.th.dest = th->source; 691 rep.th.source = th->dest; 692 rep.th.doff = sizeof(struct tcphdr) / 4; 693 rep.th.rst = 1; 694 695 if (th->ack) { 696 rep.th.seq = th->ack_seq; 697 } else { 698 rep.th.ack = 1; 699 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin + 700 skb->len - (th->doff << 2)); 701 } 702 703 memset(&arg, 0, sizeof(arg)); 704 arg.iov[0].iov_base = (unsigned char *)&rep; 705 arg.iov[0].iov_len = sizeof(rep.th); 706 707 net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev); 708 #ifdef CONFIG_TCP_MD5SIG 709 rcu_read_lock(); 710 hash_location = tcp_parse_md5sig_option(th); 711 if (sk && sk_fullsock(sk)) { 712 const union tcp_md5_addr *addr; 713 int l3index; 714 715 /* sdif set, means packet ingressed via a device 716 * in an L3 domain and inet_iif is set to it. 717 */ 718 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0; 719 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; 720 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET); 721 } else if (hash_location) { 722 const union tcp_md5_addr *addr; 723 int sdif = tcp_v4_sdif(skb); 724 int dif = inet_iif(skb); 725 int l3index; 726 727 /* 728 * active side is lost. Try to find listening socket through 729 * source port, and then find md5 key through listening socket. 730 * we are not loose security here: 731 * Incoming packet is checked with md5 hash with finding key, 732 * no RST generated if md5 hash doesn't match. 733 */ 734 sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0, 735 ip_hdr(skb)->saddr, 736 th->source, ip_hdr(skb)->daddr, 737 ntohs(th->source), dif, sdif); 738 /* don't send rst if it can't find key */ 739 if (!sk1) 740 goto out; 741 742 /* sdif set, means packet ingressed via a device 743 * in an L3 domain and dif is set to it. 744 */ 745 l3index = sdif ? dif : 0; 746 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; 747 key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET); 748 if (!key) 749 goto out; 750 751 752 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb); 753 if (genhash || memcmp(hash_location, newhash, 16) != 0) 754 goto out; 755 756 } 757 758 if (key) { 759 rep.opt[0] = htonl((TCPOPT_NOP << 24) | 760 (TCPOPT_NOP << 16) | 761 (TCPOPT_MD5SIG << 8) | 762 TCPOLEN_MD5SIG); 763 /* Update length and the length the header thinks exists */ 764 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED; 765 rep.th.doff = arg.iov[0].iov_len / 4; 766 767 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1], 768 key, ip_hdr(skb)->saddr, 769 ip_hdr(skb)->daddr, &rep.th); 770 } 771 #endif 772 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr, 773 ip_hdr(skb)->saddr, /* XXX */ 774 arg.iov[0].iov_len, IPPROTO_TCP, 0); 775 arg.csumoffset = offsetof(struct tcphdr, check) / 2; 776 arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0; 777 778 /* When socket is gone, all binding information is lost. 779 * routing might fail in this case. No choice here, if we choose to force 780 * input interface, we will misroute in case of asymmetric route. 781 */ 782 if (sk) { 783 arg.bound_dev_if = sk->sk_bound_dev_if; 784 if (sk_fullsock(sk)) 785 trace_tcp_send_reset(sk, skb); 786 } 787 788 BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) != 789 offsetof(struct inet_timewait_sock, tw_bound_dev_if)); 790 791 arg.tos = ip_hdr(skb)->tos; 792 arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL); 793 local_bh_disable(); 794 ctl_sk = this_cpu_read(*net->ipv4.tcp_sk); 795 if (sk) { 796 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? 797 inet_twsk(sk)->tw_mark : sk->sk_mark; 798 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ? 799 inet_twsk(sk)->tw_priority : sk->sk_priority; 800 transmit_time = tcp_transmit_time(sk); 801 } 802 ip_send_unicast_reply(ctl_sk, 803 skb, &TCP_SKB_CB(skb)->header.h4.opt, 804 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, 805 &arg, arg.iov[0].iov_len, 806 transmit_time); 807 808 ctl_sk->sk_mark = 0; 809 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS); 810 __TCP_INC_STATS(net, TCP_MIB_OUTRSTS); 811 local_bh_enable(); 812 813 #ifdef CONFIG_TCP_MD5SIG 814 out: 815 rcu_read_unlock(); 816 #endif 817 } 818 819 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states 820 outside socket context is ugly, certainly. What can I do? 821 */ 822 823 static void tcp_v4_send_ack(const struct sock *sk, 824 struct sk_buff *skb, u32 seq, u32 ack, 825 u32 win, u32 tsval, u32 tsecr, int oif, 826 struct tcp_md5sig_key *key, 827 int reply_flags, u8 tos) 828 { 829 const struct tcphdr *th = tcp_hdr(skb); 830 struct { 831 struct tcphdr th; 832 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2) 833 #ifdef CONFIG_TCP_MD5SIG 834 + (TCPOLEN_MD5SIG_ALIGNED >> 2) 835 #endif 836 ]; 837 } rep; 838 struct net *net = sock_net(sk); 839 struct ip_reply_arg arg; 840 struct sock *ctl_sk; 841 u64 transmit_time; 842 843 memset(&rep.th, 0, sizeof(struct tcphdr)); 844 memset(&arg, 0, sizeof(arg)); 845 846 arg.iov[0].iov_base = (unsigned char *)&rep; 847 arg.iov[0].iov_len = sizeof(rep.th); 848 if (tsecr) { 849 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | 850 (TCPOPT_TIMESTAMP << 8) | 851 TCPOLEN_TIMESTAMP); 852 rep.opt[1] = htonl(tsval); 853 rep.opt[2] = htonl(tsecr); 854 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED; 855 } 856 857 /* Swap the send and the receive. */ 858 rep.th.dest = th->source; 859 rep.th.source = th->dest; 860 rep.th.doff = arg.iov[0].iov_len / 4; 861 rep.th.seq = htonl(seq); 862 rep.th.ack_seq = htonl(ack); 863 rep.th.ack = 1; 864 rep.th.window = htons(win); 865 866 #ifdef CONFIG_TCP_MD5SIG 867 if (key) { 868 int offset = (tsecr) ? 3 : 0; 869 870 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) | 871 (TCPOPT_NOP << 16) | 872 (TCPOPT_MD5SIG << 8) | 873 TCPOLEN_MD5SIG); 874 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED; 875 rep.th.doff = arg.iov[0].iov_len/4; 876 877 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset], 878 key, ip_hdr(skb)->saddr, 879 ip_hdr(skb)->daddr, &rep.th); 880 } 881 #endif 882 arg.flags = reply_flags; 883 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr, 884 ip_hdr(skb)->saddr, /* XXX */ 885 arg.iov[0].iov_len, IPPROTO_TCP, 0); 886 arg.csumoffset = offsetof(struct tcphdr, check) / 2; 887 if (oif) 888 arg.bound_dev_if = oif; 889 arg.tos = tos; 890 arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL); 891 local_bh_disable(); 892 ctl_sk = this_cpu_read(*net->ipv4.tcp_sk); 893 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? 894 inet_twsk(sk)->tw_mark : sk->sk_mark; 895 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ? 896 inet_twsk(sk)->tw_priority : sk->sk_priority; 897 transmit_time = tcp_transmit_time(sk); 898 ip_send_unicast_reply(ctl_sk, 899 skb, &TCP_SKB_CB(skb)->header.h4.opt, 900 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, 901 &arg, arg.iov[0].iov_len, 902 transmit_time); 903 904 ctl_sk->sk_mark = 0; 905 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS); 906 local_bh_enable(); 907 } 908 909 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) 910 { 911 struct inet_timewait_sock *tw = inet_twsk(sk); 912 struct tcp_timewait_sock *tcptw = tcp_twsk(sk); 913 914 tcp_v4_send_ack(sk, skb, 915 tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, 916 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, 917 tcp_time_stamp_raw() + tcptw->tw_ts_offset, 918 tcptw->tw_ts_recent, 919 tw->tw_bound_dev_if, 920 tcp_twsk_md5_key(tcptw), 921 tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0, 922 tw->tw_tos 923 ); 924 925 inet_twsk_put(tw); 926 } 927 928 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, 929 struct request_sock *req) 930 { 931 const union tcp_md5_addr *addr; 932 int l3index; 933 934 /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV 935 * sk->sk_state == TCP_SYN_RECV -> for Fast Open. 936 */ 937 u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 : 938 tcp_sk(sk)->snd_nxt; 939 940 /* RFC 7323 2.3 941 * The window field (SEG.WND) of every outgoing segment, with the 942 * exception of <SYN> segments, MUST be right-shifted by 943 * Rcv.Wind.Shift bits: 944 */ 945 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; 946 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0; 947 tcp_v4_send_ack(sk, skb, seq, 948 tcp_rsk(req)->rcv_nxt, 949 req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale, 950 tcp_time_stamp_raw() + tcp_rsk(req)->ts_off, 951 req->ts_recent, 952 0, 953 tcp_md5_do_lookup(sk, l3index, addr, AF_INET), 954 inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0, 955 ip_hdr(skb)->tos); 956 } 957 958 /* 959 * Send a SYN-ACK after having received a SYN. 960 * This still operates on a request_sock only, not on a big 961 * socket. 962 */ 963 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, 964 struct flowi *fl, 965 struct request_sock *req, 966 struct tcp_fastopen_cookie *foc, 967 enum tcp_synack_type synack_type) 968 { 969 const struct inet_request_sock *ireq = inet_rsk(req); 970 struct flowi4 fl4; 971 int err = -1; 972 struct sk_buff *skb; 973 974 /* First, grab a route. */ 975 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL) 976 return -1; 977 978 skb = tcp_make_synack(sk, dst, req, foc, synack_type); 979 980 if (skb) { 981 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr); 982 983 rcu_read_lock(); 984 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr, 985 ireq->ir_rmt_addr, 986 rcu_dereference(ireq->ireq_opt)); 987 rcu_read_unlock(); 988 err = net_xmit_eval(err); 989 } 990 991 return err; 992 } 993 994 /* 995 * IPv4 request_sock destructor. 996 */ 997 static void tcp_v4_reqsk_destructor(struct request_sock *req) 998 { 999 kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1)); 1000 } 1001 1002 #ifdef CONFIG_TCP_MD5SIG 1003 /* 1004 * RFC2385 MD5 checksumming requires a mapping of 1005 * IP address->MD5 Key. 1006 * We need to maintain these in the sk structure. 1007 */ 1008 1009 DEFINE_STATIC_KEY_FALSE(tcp_md5_needed); 1010 EXPORT_SYMBOL(tcp_md5_needed); 1011 1012 /* Find the Key structure for an address. */ 1013 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index, 1014 const union tcp_md5_addr *addr, 1015 int family) 1016 { 1017 const struct tcp_sock *tp = tcp_sk(sk); 1018 struct tcp_md5sig_key *key; 1019 const struct tcp_md5sig_info *md5sig; 1020 __be32 mask; 1021 struct tcp_md5sig_key *best_match = NULL; 1022 bool match; 1023 1024 /* caller either holds rcu_read_lock() or socket lock */ 1025 md5sig = rcu_dereference_check(tp->md5sig_info, 1026 lockdep_sock_is_held(sk)); 1027 if (!md5sig) 1028 return NULL; 1029 1030 hlist_for_each_entry_rcu(key, &md5sig->head, node, 1031 lockdep_sock_is_held(sk)) { 1032 if (key->family != family) 1033 continue; 1034 if (key->l3index && key->l3index != l3index) 1035 continue; 1036 if (family == AF_INET) { 1037 mask = inet_make_mask(key->prefixlen); 1038 match = (key->addr.a4.s_addr & mask) == 1039 (addr->a4.s_addr & mask); 1040 #if IS_ENABLED(CONFIG_IPV6) 1041 } else if (family == AF_INET6) { 1042 match = ipv6_prefix_equal(&key->addr.a6, &addr->a6, 1043 key->prefixlen); 1044 #endif 1045 } else { 1046 match = false; 1047 } 1048 1049 if (match && (!best_match || 1050 key->prefixlen > best_match->prefixlen)) 1051 best_match = key; 1052 } 1053 return best_match; 1054 } 1055 EXPORT_SYMBOL(__tcp_md5_do_lookup); 1056 1057 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk, 1058 const union tcp_md5_addr *addr, 1059 int family, u8 prefixlen, 1060 int l3index) 1061 { 1062 const struct tcp_sock *tp = tcp_sk(sk); 1063 struct tcp_md5sig_key *key; 1064 unsigned int size = sizeof(struct in_addr); 1065 const struct tcp_md5sig_info *md5sig; 1066 1067 /* caller either holds rcu_read_lock() or socket lock */ 1068 md5sig = rcu_dereference_check(tp->md5sig_info, 1069 lockdep_sock_is_held(sk)); 1070 if (!md5sig) 1071 return NULL; 1072 #if IS_ENABLED(CONFIG_IPV6) 1073 if (family == AF_INET6) 1074 size = sizeof(struct in6_addr); 1075 #endif 1076 hlist_for_each_entry_rcu(key, &md5sig->head, node, 1077 lockdep_sock_is_held(sk)) { 1078 if (key->family != family) 1079 continue; 1080 if (key->l3index && key->l3index != l3index) 1081 continue; 1082 if (!memcmp(&key->addr, addr, size) && 1083 key->prefixlen == prefixlen) 1084 return key; 1085 } 1086 return NULL; 1087 } 1088 1089 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk, 1090 const struct sock *addr_sk) 1091 { 1092 const union tcp_md5_addr *addr; 1093 int l3index; 1094 1095 l3index = l3mdev_master_ifindex_by_index(sock_net(sk), 1096 addr_sk->sk_bound_dev_if); 1097 addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr; 1098 return tcp_md5_do_lookup(sk, l3index, addr, AF_INET); 1099 } 1100 EXPORT_SYMBOL(tcp_v4_md5_lookup); 1101 1102 /* This can be called on a newly created socket, from other files */ 1103 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, 1104 int family, u8 prefixlen, int l3index, 1105 const u8 *newkey, u8 newkeylen, gfp_t gfp) 1106 { 1107 /* Add Key to the list */ 1108 struct tcp_md5sig_key *key; 1109 struct tcp_sock *tp = tcp_sk(sk); 1110 struct tcp_md5sig_info *md5sig; 1111 1112 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index); 1113 if (key) { 1114 /* Pre-existing entry - just update that one. */ 1115 memcpy(key->key, newkey, newkeylen); 1116 key->keylen = newkeylen; 1117 return 0; 1118 } 1119 1120 md5sig = rcu_dereference_protected(tp->md5sig_info, 1121 lockdep_sock_is_held(sk)); 1122 if (!md5sig) { 1123 md5sig = kmalloc(sizeof(*md5sig), gfp); 1124 if (!md5sig) 1125 return -ENOMEM; 1126 1127 sk_nocaps_add(sk, NETIF_F_GSO_MASK); 1128 INIT_HLIST_HEAD(&md5sig->head); 1129 rcu_assign_pointer(tp->md5sig_info, md5sig); 1130 } 1131 1132 key = sock_kmalloc(sk, sizeof(*key), gfp); 1133 if (!key) 1134 return -ENOMEM; 1135 if (!tcp_alloc_md5sig_pool()) { 1136 sock_kfree_s(sk, key, sizeof(*key)); 1137 return -ENOMEM; 1138 } 1139 1140 memcpy(key->key, newkey, newkeylen); 1141 key->keylen = newkeylen; 1142 key->family = family; 1143 key->prefixlen = prefixlen; 1144 key->l3index = l3index; 1145 memcpy(&key->addr, addr, 1146 (family == AF_INET6) ? sizeof(struct in6_addr) : 1147 sizeof(struct in_addr)); 1148 hlist_add_head_rcu(&key->node, &md5sig->head); 1149 return 0; 1150 } 1151 EXPORT_SYMBOL(tcp_md5_do_add); 1152 1153 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family, 1154 u8 prefixlen, int l3index) 1155 { 1156 struct tcp_md5sig_key *key; 1157 1158 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index); 1159 if (!key) 1160 return -ENOENT; 1161 hlist_del_rcu(&key->node); 1162 atomic_sub(sizeof(*key), &sk->sk_omem_alloc); 1163 kfree_rcu(key, rcu); 1164 return 0; 1165 } 1166 EXPORT_SYMBOL(tcp_md5_do_del); 1167 1168 static void tcp_clear_md5_list(struct sock *sk) 1169 { 1170 struct tcp_sock *tp = tcp_sk(sk); 1171 struct tcp_md5sig_key *key; 1172 struct hlist_node *n; 1173 struct tcp_md5sig_info *md5sig; 1174 1175 md5sig = rcu_dereference_protected(tp->md5sig_info, 1); 1176 1177 hlist_for_each_entry_safe(key, n, &md5sig->head, node) { 1178 hlist_del_rcu(&key->node); 1179 atomic_sub(sizeof(*key), &sk->sk_omem_alloc); 1180 kfree_rcu(key, rcu); 1181 } 1182 } 1183 1184 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname, 1185 char __user *optval, int optlen) 1186 { 1187 struct tcp_md5sig cmd; 1188 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr; 1189 const union tcp_md5_addr *addr; 1190 u8 prefixlen = 32; 1191 int l3index = 0; 1192 1193 if (optlen < sizeof(cmd)) 1194 return -EINVAL; 1195 1196 if (copy_from_user(&cmd, optval, sizeof(cmd))) 1197 return -EFAULT; 1198 1199 if (sin->sin_family != AF_INET) 1200 return -EINVAL; 1201 1202 if (optname == TCP_MD5SIG_EXT && 1203 cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) { 1204 prefixlen = cmd.tcpm_prefixlen; 1205 if (prefixlen > 32) 1206 return -EINVAL; 1207 } 1208 1209 if (optname == TCP_MD5SIG_EXT && 1210 cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) { 1211 struct net_device *dev; 1212 1213 rcu_read_lock(); 1214 dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex); 1215 if (dev && netif_is_l3_master(dev)) 1216 l3index = dev->ifindex; 1217 1218 rcu_read_unlock(); 1219 1220 /* ok to reference set/not set outside of rcu; 1221 * right now device MUST be an L3 master 1222 */ 1223 if (!dev || !l3index) 1224 return -EINVAL; 1225 } 1226 1227 addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr; 1228 1229 if (!cmd.tcpm_keylen) 1230 return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index); 1231 1232 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN) 1233 return -EINVAL; 1234 1235 return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, 1236 cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL); 1237 } 1238 1239 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp, 1240 __be32 daddr, __be32 saddr, 1241 const struct tcphdr *th, int nbytes) 1242 { 1243 struct tcp4_pseudohdr *bp; 1244 struct scatterlist sg; 1245 struct tcphdr *_th; 1246 1247 bp = hp->scratch; 1248 bp->saddr = saddr; 1249 bp->daddr = daddr; 1250 bp->pad = 0; 1251 bp->protocol = IPPROTO_TCP; 1252 bp->len = cpu_to_be16(nbytes); 1253 1254 _th = (struct tcphdr *)(bp + 1); 1255 memcpy(_th, th, sizeof(*th)); 1256 _th->check = 0; 1257 1258 sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th)); 1259 ahash_request_set_crypt(hp->md5_req, &sg, NULL, 1260 sizeof(*bp) + sizeof(*th)); 1261 return crypto_ahash_update(hp->md5_req); 1262 } 1263 1264 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, 1265 __be32 daddr, __be32 saddr, const struct tcphdr *th) 1266 { 1267 struct tcp_md5sig_pool *hp; 1268 struct ahash_request *req; 1269 1270 hp = tcp_get_md5sig_pool(); 1271 if (!hp) 1272 goto clear_hash_noput; 1273 req = hp->md5_req; 1274 1275 if (crypto_ahash_init(req)) 1276 goto clear_hash; 1277 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2)) 1278 goto clear_hash; 1279 if (tcp_md5_hash_key(hp, key)) 1280 goto clear_hash; 1281 ahash_request_set_crypt(req, NULL, md5_hash, 0); 1282 if (crypto_ahash_final(req)) 1283 goto clear_hash; 1284 1285 tcp_put_md5sig_pool(); 1286 return 0; 1287 1288 clear_hash: 1289 tcp_put_md5sig_pool(); 1290 clear_hash_noput: 1291 memset(md5_hash, 0, 16); 1292 return 1; 1293 } 1294 1295 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key, 1296 const struct sock *sk, 1297 const struct sk_buff *skb) 1298 { 1299 struct tcp_md5sig_pool *hp; 1300 struct ahash_request *req; 1301 const struct tcphdr *th = tcp_hdr(skb); 1302 __be32 saddr, daddr; 1303 1304 if (sk) { /* valid for establish/request sockets */ 1305 saddr = sk->sk_rcv_saddr; 1306 daddr = sk->sk_daddr; 1307 } else { 1308 const struct iphdr *iph = ip_hdr(skb); 1309 saddr = iph->saddr; 1310 daddr = iph->daddr; 1311 } 1312 1313 hp = tcp_get_md5sig_pool(); 1314 if (!hp) 1315 goto clear_hash_noput; 1316 req = hp->md5_req; 1317 1318 if (crypto_ahash_init(req)) 1319 goto clear_hash; 1320 1321 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len)) 1322 goto clear_hash; 1323 if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2)) 1324 goto clear_hash; 1325 if (tcp_md5_hash_key(hp, key)) 1326 goto clear_hash; 1327 ahash_request_set_crypt(req, NULL, md5_hash, 0); 1328 if (crypto_ahash_final(req)) 1329 goto clear_hash; 1330 1331 tcp_put_md5sig_pool(); 1332 return 0; 1333 1334 clear_hash: 1335 tcp_put_md5sig_pool(); 1336 clear_hash_noput: 1337 memset(md5_hash, 0, 16); 1338 return 1; 1339 } 1340 EXPORT_SYMBOL(tcp_v4_md5_hash_skb); 1341 1342 #endif 1343 1344 /* Called with rcu_read_lock() */ 1345 static bool tcp_v4_inbound_md5_hash(const struct sock *sk, 1346 const struct sk_buff *skb, 1347 int dif, int sdif) 1348 { 1349 #ifdef CONFIG_TCP_MD5SIG 1350 /* 1351 * This gets called for each TCP segment that arrives 1352 * so we want to be efficient. 1353 * We have 3 drop cases: 1354 * o No MD5 hash and one expected. 1355 * o MD5 hash and we're not expecting one. 1356 * o MD5 hash and its wrong. 1357 */ 1358 const __u8 *hash_location = NULL; 1359 struct tcp_md5sig_key *hash_expected; 1360 const struct iphdr *iph = ip_hdr(skb); 1361 const struct tcphdr *th = tcp_hdr(skb); 1362 const union tcp_md5_addr *addr; 1363 unsigned char newhash[16]; 1364 int genhash, l3index; 1365 1366 /* sdif set, means packet ingressed via a device 1367 * in an L3 domain and dif is set to the l3mdev 1368 */ 1369 l3index = sdif ? dif : 0; 1370 1371 addr = (union tcp_md5_addr *)&iph->saddr; 1372 hash_expected = tcp_md5_do_lookup(sk, l3index, addr, AF_INET); 1373 hash_location = tcp_parse_md5sig_option(th); 1374 1375 /* We've parsed the options - do we have a hash? */ 1376 if (!hash_expected && !hash_location) 1377 return false; 1378 1379 if (hash_expected && !hash_location) { 1380 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND); 1381 return true; 1382 } 1383 1384 if (!hash_expected && hash_location) { 1385 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED); 1386 return true; 1387 } 1388 1389 /* Okay, so this is hash_expected and hash_location - 1390 * so we need to calculate the checksum. 1391 */ 1392 genhash = tcp_v4_md5_hash_skb(newhash, 1393 hash_expected, 1394 NULL, skb); 1395 1396 if (genhash || memcmp(hash_location, newhash, 16) != 0) { 1397 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE); 1398 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s L3 index %d\n", 1399 &iph->saddr, ntohs(th->source), 1400 &iph->daddr, ntohs(th->dest), 1401 genhash ? " tcp_v4_calc_md5_hash failed" 1402 : "", l3index); 1403 return true; 1404 } 1405 return false; 1406 #endif 1407 return false; 1408 } 1409 1410 static void tcp_v4_init_req(struct request_sock *req, 1411 const struct sock *sk_listener, 1412 struct sk_buff *skb) 1413 { 1414 struct inet_request_sock *ireq = inet_rsk(req); 1415 struct net *net = sock_net(sk_listener); 1416 1417 sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr); 1418 sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr); 1419 RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb)); 1420 } 1421 1422 static struct dst_entry *tcp_v4_route_req(const struct sock *sk, 1423 struct flowi *fl, 1424 const struct request_sock *req) 1425 { 1426 return inet_csk_route_req(sk, &fl->u.ip4, req); 1427 } 1428 1429 struct request_sock_ops tcp_request_sock_ops __read_mostly = { 1430 .family = PF_INET, 1431 .obj_size = sizeof(struct tcp_request_sock), 1432 .rtx_syn_ack = tcp_rtx_synack, 1433 .send_ack = tcp_v4_reqsk_send_ack, 1434 .destructor = tcp_v4_reqsk_destructor, 1435 .send_reset = tcp_v4_send_reset, 1436 .syn_ack_timeout = tcp_syn_ack_timeout, 1437 }; 1438 1439 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { 1440 .mss_clamp = TCP_MSS_DEFAULT, 1441 #ifdef CONFIG_TCP_MD5SIG 1442 .req_md5_lookup = tcp_v4_md5_lookup, 1443 .calc_md5_hash = tcp_v4_md5_hash_skb, 1444 #endif 1445 .init_req = tcp_v4_init_req, 1446 #ifdef CONFIG_SYN_COOKIES 1447 .cookie_init_seq = cookie_v4_init_sequence, 1448 #endif 1449 .route_req = tcp_v4_route_req, 1450 .init_seq = tcp_v4_init_seq, 1451 .init_ts_off = tcp_v4_init_ts_off, 1452 .send_synack = tcp_v4_send_synack, 1453 }; 1454 1455 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) 1456 { 1457 /* Never answer to SYNs send to broadcast or multicast */ 1458 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) 1459 goto drop; 1460 1461 return tcp_conn_request(&tcp_request_sock_ops, 1462 &tcp_request_sock_ipv4_ops, sk, skb); 1463 1464 drop: 1465 tcp_listendrop(sk); 1466 return 0; 1467 } 1468 EXPORT_SYMBOL(tcp_v4_conn_request); 1469 1470 1471 /* 1472 * The three way handshake has completed - we got a valid synack - 1473 * now create the new socket. 1474 */ 1475 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, 1476 struct request_sock *req, 1477 struct dst_entry *dst, 1478 struct request_sock *req_unhash, 1479 bool *own_req) 1480 { 1481 struct inet_request_sock *ireq; 1482 struct inet_sock *newinet; 1483 struct tcp_sock *newtp; 1484 struct sock *newsk; 1485 #ifdef CONFIG_TCP_MD5SIG 1486 const union tcp_md5_addr *addr; 1487 struct tcp_md5sig_key *key; 1488 int l3index; 1489 #endif 1490 struct ip_options_rcu *inet_opt; 1491 1492 if (sk_acceptq_is_full(sk)) 1493 goto exit_overflow; 1494 1495 newsk = tcp_create_openreq_child(sk, req, skb); 1496 if (!newsk) 1497 goto exit_nonewsk; 1498 1499 newsk->sk_gso_type = SKB_GSO_TCPV4; 1500 inet_sk_rx_dst_set(newsk, skb); 1501 1502 newtp = tcp_sk(newsk); 1503 newinet = inet_sk(newsk); 1504 ireq = inet_rsk(req); 1505 sk_daddr_set(newsk, ireq->ir_rmt_addr); 1506 sk_rcv_saddr_set(newsk, ireq->ir_loc_addr); 1507 newsk->sk_bound_dev_if = ireq->ir_iif; 1508 newinet->inet_saddr = ireq->ir_loc_addr; 1509 inet_opt = rcu_dereference(ireq->ireq_opt); 1510 RCU_INIT_POINTER(newinet->inet_opt, inet_opt); 1511 newinet->mc_index = inet_iif(skb); 1512 newinet->mc_ttl = ip_hdr(skb)->ttl; 1513 newinet->rcv_tos = ip_hdr(skb)->tos; 1514 inet_csk(newsk)->icsk_ext_hdr_len = 0; 1515 if (inet_opt) 1516 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen; 1517 newinet->inet_id = prandom_u32(); 1518 1519 if (!dst) { 1520 dst = inet_csk_route_child_sock(sk, newsk, req); 1521 if (!dst) 1522 goto put_and_exit; 1523 } else { 1524 /* syncookie case : see end of cookie_v4_check() */ 1525 } 1526 sk_setup_caps(newsk, dst); 1527 1528 tcp_ca_openreq_child(newsk, dst); 1529 1530 tcp_sync_mss(newsk, dst_mtu(dst)); 1531 newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst)); 1532 1533 tcp_initialize_rcv_mss(newsk); 1534 1535 #ifdef CONFIG_TCP_MD5SIG 1536 l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif); 1537 /* Copy over the MD5 key from the original socket */ 1538 addr = (union tcp_md5_addr *)&newinet->inet_daddr; 1539 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET); 1540 if (key) { 1541 /* 1542 * We're using one, so create a matching key 1543 * on the newsk structure. If we fail to get 1544 * memory, then we end up not copying the key 1545 * across. Shucks. 1546 */ 1547 tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index, 1548 key->key, key->keylen, GFP_ATOMIC); 1549 sk_nocaps_add(newsk, NETIF_F_GSO_MASK); 1550 } 1551 #endif 1552 1553 if (__inet_inherit_port(sk, newsk) < 0) 1554 goto put_and_exit; 1555 *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash)); 1556 if (likely(*own_req)) { 1557 tcp_move_syn(newtp, req); 1558 ireq->ireq_opt = NULL; 1559 } else { 1560 newinet->inet_opt = NULL; 1561 } 1562 return newsk; 1563 1564 exit_overflow: 1565 NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); 1566 exit_nonewsk: 1567 dst_release(dst); 1568 exit: 1569 tcp_listendrop(sk); 1570 return NULL; 1571 put_and_exit: 1572 newinet->inet_opt = NULL; 1573 inet_csk_prepare_forced_close(newsk); 1574 tcp_done(newsk); 1575 goto exit; 1576 } 1577 EXPORT_SYMBOL(tcp_v4_syn_recv_sock); 1578 1579 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb) 1580 { 1581 #ifdef CONFIG_SYN_COOKIES 1582 const struct tcphdr *th = tcp_hdr(skb); 1583 1584 if (!th->syn) 1585 sk = cookie_v4_check(sk, skb); 1586 #endif 1587 return sk; 1588 } 1589 1590 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph, 1591 struct tcphdr *th, u32 *cookie) 1592 { 1593 u16 mss = 0; 1594 #ifdef CONFIG_SYN_COOKIES 1595 mss = tcp_get_syncookie_mss(&tcp_request_sock_ops, 1596 &tcp_request_sock_ipv4_ops, sk, th); 1597 if (mss) { 1598 *cookie = __cookie_v4_init_sequence(iph, th, &mss); 1599 tcp_synq_overflow(sk); 1600 } 1601 #endif 1602 return mss; 1603 } 1604 1605 /* The socket must have it's spinlock held when we get 1606 * here, unless it is a TCP_LISTEN socket. 1607 * 1608 * We have a potential double-lock case here, so even when 1609 * doing backlog processing we use the BH locking scheme. 1610 * This is because we cannot sleep with the original spinlock 1611 * held. 1612 */ 1613 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) 1614 { 1615 struct sock *rsk; 1616 1617 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ 1618 struct dst_entry *dst = sk->sk_rx_dst; 1619 1620 sock_rps_save_rxhash(sk, skb); 1621 sk_mark_napi_id(sk, skb); 1622 if (dst) { 1623 if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif || 1624 !dst->ops->check(dst, 0)) { 1625 dst_release(dst); 1626 sk->sk_rx_dst = NULL; 1627 } 1628 } 1629 tcp_rcv_established(sk, skb); 1630 return 0; 1631 } 1632 1633 if (tcp_checksum_complete(skb)) 1634 goto csum_err; 1635 1636 if (sk->sk_state == TCP_LISTEN) { 1637 struct sock *nsk = tcp_v4_cookie_check(sk, skb); 1638 1639 if (!nsk) 1640 goto discard; 1641 if (nsk != sk) { 1642 if (tcp_child_process(sk, nsk, skb)) { 1643 rsk = nsk; 1644 goto reset; 1645 } 1646 return 0; 1647 } 1648 } else 1649 sock_rps_save_rxhash(sk, skb); 1650 1651 if (tcp_rcv_state_process(sk, skb)) { 1652 rsk = sk; 1653 goto reset; 1654 } 1655 return 0; 1656 1657 reset: 1658 tcp_v4_send_reset(rsk, skb); 1659 discard: 1660 kfree_skb(skb); 1661 /* Be careful here. If this function gets more complicated and 1662 * gcc suffers from register pressure on the x86, sk (in %ebx) 1663 * might be destroyed here. This current version compiles correctly, 1664 * but you have been warned. 1665 */ 1666 return 0; 1667 1668 csum_err: 1669 TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); 1670 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); 1671 goto discard; 1672 } 1673 EXPORT_SYMBOL(tcp_v4_do_rcv); 1674 1675 int tcp_v4_early_demux(struct sk_buff *skb) 1676 { 1677 const struct iphdr *iph; 1678 const struct tcphdr *th; 1679 struct sock *sk; 1680 1681 if (skb->pkt_type != PACKET_HOST) 1682 return 0; 1683 1684 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr))) 1685 return 0; 1686 1687 iph = ip_hdr(skb); 1688 th = tcp_hdr(skb); 1689 1690 if (th->doff < sizeof(struct tcphdr) / 4) 1691 return 0; 1692 1693 sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo, 1694 iph->saddr, th->source, 1695 iph->daddr, ntohs(th->dest), 1696 skb->skb_iif, inet_sdif(skb)); 1697 if (sk) { 1698 skb->sk = sk; 1699 skb->destructor = sock_edemux; 1700 if (sk_fullsock(sk)) { 1701 struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst); 1702 1703 if (dst) 1704 dst = dst_check(dst, 0); 1705 if (dst && 1706 inet_sk(sk)->rx_dst_ifindex == skb->skb_iif) 1707 skb_dst_set_noref(skb, dst); 1708 } 1709 } 1710 return 0; 1711 } 1712 1713 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb) 1714 { 1715 u32 limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf); 1716 struct skb_shared_info *shinfo; 1717 const struct tcphdr *th; 1718 struct tcphdr *thtail; 1719 struct sk_buff *tail; 1720 unsigned int hdrlen; 1721 bool fragstolen; 1722 u32 gso_segs; 1723 int delta; 1724 1725 /* In case all data was pulled from skb frags (in __pskb_pull_tail()), 1726 * we can fix skb->truesize to its real value to avoid future drops. 1727 * This is valid because skb is not yet charged to the socket. 1728 * It has been noticed pure SACK packets were sometimes dropped 1729 * (if cooked by drivers without copybreak feature). 1730 */ 1731 skb_condense(skb); 1732 1733 skb_dst_drop(skb); 1734 1735 if (unlikely(tcp_checksum_complete(skb))) { 1736 bh_unlock_sock(sk); 1737 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); 1738 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); 1739 return true; 1740 } 1741 1742 /* Attempt coalescing to last skb in backlog, even if we are 1743 * above the limits. 1744 * This is okay because skb capacity is limited to MAX_SKB_FRAGS. 1745 */ 1746 th = (const struct tcphdr *)skb->data; 1747 hdrlen = th->doff * 4; 1748 shinfo = skb_shinfo(skb); 1749 1750 if (!shinfo->gso_size) 1751 shinfo->gso_size = skb->len - hdrlen; 1752 1753 if (!shinfo->gso_segs) 1754 shinfo->gso_segs = 1; 1755 1756 tail = sk->sk_backlog.tail; 1757 if (!tail) 1758 goto no_coalesce; 1759 thtail = (struct tcphdr *)tail->data; 1760 1761 if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq || 1762 TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield || 1763 ((TCP_SKB_CB(tail)->tcp_flags | 1764 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) || 1765 !((TCP_SKB_CB(tail)->tcp_flags & 1766 TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) || 1767 ((TCP_SKB_CB(tail)->tcp_flags ^ 1768 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) || 1769 #ifdef CONFIG_TLS_DEVICE 1770 tail->decrypted != skb->decrypted || 1771 #endif 1772 thtail->doff != th->doff || 1773 memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th))) 1774 goto no_coalesce; 1775 1776 __skb_pull(skb, hdrlen); 1777 if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) { 1778 thtail->window = th->window; 1779 1780 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq; 1781 1782 if (after(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq)) 1783 TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq; 1784 1785 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and 1786 * thtail->fin, so that the fast path in tcp_rcv_established() 1787 * is not entered if we append a packet with a FIN. 1788 * SYN, RST, URG are not present. 1789 * ACK is set on both packets. 1790 * PSH : we do not really care in TCP stack, 1791 * at least for 'GRO' packets. 1792 */ 1793 thtail->fin |= th->fin; 1794 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags; 1795 1796 if (TCP_SKB_CB(skb)->has_rxtstamp) { 1797 TCP_SKB_CB(tail)->has_rxtstamp = true; 1798 tail->tstamp = skb->tstamp; 1799 skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp; 1800 } 1801 1802 /* Not as strict as GRO. We only need to carry mss max value */ 1803 skb_shinfo(tail)->gso_size = max(shinfo->gso_size, 1804 skb_shinfo(tail)->gso_size); 1805 1806 gso_segs = skb_shinfo(tail)->gso_segs + shinfo->gso_segs; 1807 skb_shinfo(tail)->gso_segs = min_t(u32, gso_segs, 0xFFFF); 1808 1809 sk->sk_backlog.len += delta; 1810 __NET_INC_STATS(sock_net(sk), 1811 LINUX_MIB_TCPBACKLOGCOALESCE); 1812 kfree_skb_partial(skb, fragstolen); 1813 return false; 1814 } 1815 __skb_push(skb, hdrlen); 1816 1817 no_coalesce: 1818 /* Only socket owner can try to collapse/prune rx queues 1819 * to reduce memory overhead, so add a little headroom here. 1820 * Few sockets backlog are possibly concurrently non empty. 1821 */ 1822 limit += 64*1024; 1823 1824 if (unlikely(sk_add_backlog(sk, skb, limit))) { 1825 bh_unlock_sock(sk); 1826 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP); 1827 return true; 1828 } 1829 return false; 1830 } 1831 EXPORT_SYMBOL(tcp_add_backlog); 1832 1833 int tcp_filter(struct sock *sk, struct sk_buff *skb) 1834 { 1835 struct tcphdr *th = (struct tcphdr *)skb->data; 1836 1837 return sk_filter_trim_cap(sk, skb, th->doff * 4); 1838 } 1839 EXPORT_SYMBOL(tcp_filter); 1840 1841 static void tcp_v4_restore_cb(struct sk_buff *skb) 1842 { 1843 memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4, 1844 sizeof(struct inet_skb_parm)); 1845 } 1846 1847 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph, 1848 const struct tcphdr *th) 1849 { 1850 /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB() 1851 * barrier() makes sure compiler wont play fool^Waliasing games. 1852 */ 1853 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb), 1854 sizeof(struct inet_skb_parm)); 1855 barrier(); 1856 1857 TCP_SKB_CB(skb)->seq = ntohl(th->seq); 1858 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + 1859 skb->len - th->doff * 4); 1860 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); 1861 TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th); 1862 TCP_SKB_CB(skb)->tcp_tw_isn = 0; 1863 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph); 1864 TCP_SKB_CB(skb)->sacked = 0; 1865 TCP_SKB_CB(skb)->has_rxtstamp = 1866 skb->tstamp || skb_hwtstamps(skb)->hwtstamp; 1867 } 1868 1869 /* 1870 * From tcp_input.c 1871 */ 1872 1873 int tcp_v4_rcv(struct sk_buff *skb) 1874 { 1875 struct net *net = dev_net(skb->dev); 1876 struct sk_buff *skb_to_free; 1877 int sdif = inet_sdif(skb); 1878 int dif = inet_iif(skb); 1879 const struct iphdr *iph; 1880 const struct tcphdr *th; 1881 bool refcounted; 1882 struct sock *sk; 1883 int ret; 1884 1885 if (skb->pkt_type != PACKET_HOST) 1886 goto discard_it; 1887 1888 /* Count it even if it's bad */ 1889 __TCP_INC_STATS(net, TCP_MIB_INSEGS); 1890 1891 if (!pskb_may_pull(skb, sizeof(struct tcphdr))) 1892 goto discard_it; 1893 1894 th = (const struct tcphdr *)skb->data; 1895 1896 if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) 1897 goto bad_packet; 1898 if (!pskb_may_pull(skb, th->doff * 4)) 1899 goto discard_it; 1900 1901 /* An explanation is required here, I think. 1902 * Packet length and doff are validated by header prediction, 1903 * provided case of th->doff==0 is eliminated. 1904 * So, we defer the checks. */ 1905 1906 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo)) 1907 goto csum_error; 1908 1909 th = (const struct tcphdr *)skb->data; 1910 iph = ip_hdr(skb); 1911 lookup: 1912 sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source, 1913 th->dest, sdif, &refcounted); 1914 if (!sk) 1915 goto no_tcp_socket; 1916 1917 process: 1918 if (sk->sk_state == TCP_TIME_WAIT) 1919 goto do_time_wait; 1920 1921 if (sk->sk_state == TCP_NEW_SYN_RECV) { 1922 struct request_sock *req = inet_reqsk(sk); 1923 bool req_stolen = false; 1924 struct sock *nsk; 1925 1926 sk = req->rsk_listener; 1927 if (unlikely(tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))) { 1928 sk_drops_add(sk, skb); 1929 reqsk_put(req); 1930 goto discard_it; 1931 } 1932 if (tcp_checksum_complete(skb)) { 1933 reqsk_put(req); 1934 goto csum_error; 1935 } 1936 if (unlikely(sk->sk_state != TCP_LISTEN)) { 1937 inet_csk_reqsk_queue_drop_and_put(sk, req); 1938 goto lookup; 1939 } 1940 /* We own a reference on the listener, increase it again 1941 * as we might lose it too soon. 1942 */ 1943 sock_hold(sk); 1944 refcounted = true; 1945 nsk = NULL; 1946 if (!tcp_filter(sk, skb)) { 1947 th = (const struct tcphdr *)skb->data; 1948 iph = ip_hdr(skb); 1949 tcp_v4_fill_cb(skb, iph, th); 1950 nsk = tcp_check_req(sk, skb, req, false, &req_stolen); 1951 } 1952 if (!nsk) { 1953 reqsk_put(req); 1954 if (req_stolen) { 1955 /* Another cpu got exclusive access to req 1956 * and created a full blown socket. 1957 * Try to feed this packet to this socket 1958 * instead of discarding it. 1959 */ 1960 tcp_v4_restore_cb(skb); 1961 sock_put(sk); 1962 goto lookup; 1963 } 1964 goto discard_and_relse; 1965 } 1966 if (nsk == sk) { 1967 reqsk_put(req); 1968 tcp_v4_restore_cb(skb); 1969 } else if (tcp_child_process(sk, nsk, skb)) { 1970 tcp_v4_send_reset(nsk, skb); 1971 goto discard_and_relse; 1972 } else { 1973 sock_put(sk); 1974 return 0; 1975 } 1976 } 1977 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) { 1978 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); 1979 goto discard_and_relse; 1980 } 1981 1982 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) 1983 goto discard_and_relse; 1984 1985 if (tcp_v4_inbound_md5_hash(sk, skb, dif, sdif)) 1986 goto discard_and_relse; 1987 1988 nf_reset_ct(skb); 1989 1990 if (tcp_filter(sk, skb)) 1991 goto discard_and_relse; 1992 th = (const struct tcphdr *)skb->data; 1993 iph = ip_hdr(skb); 1994 tcp_v4_fill_cb(skb, iph, th); 1995 1996 skb->dev = NULL; 1997 1998 if (sk->sk_state == TCP_LISTEN) { 1999 ret = tcp_v4_do_rcv(sk, skb); 2000 goto put_and_return; 2001 } 2002 2003 sk_incoming_cpu_update(sk); 2004 2005 bh_lock_sock_nested(sk); 2006 tcp_segs_in(tcp_sk(sk), skb); 2007 ret = 0; 2008 if (!sock_owned_by_user(sk)) { 2009 skb_to_free = sk->sk_rx_skb_cache; 2010 sk->sk_rx_skb_cache = NULL; 2011 ret = tcp_v4_do_rcv(sk, skb); 2012 } else { 2013 if (tcp_add_backlog(sk, skb)) 2014 goto discard_and_relse; 2015 skb_to_free = NULL; 2016 } 2017 bh_unlock_sock(sk); 2018 if (skb_to_free) 2019 __kfree_skb(skb_to_free); 2020 2021 put_and_return: 2022 if (refcounted) 2023 sock_put(sk); 2024 2025 return ret; 2026 2027 no_tcp_socket: 2028 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) 2029 goto discard_it; 2030 2031 tcp_v4_fill_cb(skb, iph, th); 2032 2033 if (tcp_checksum_complete(skb)) { 2034 csum_error: 2035 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS); 2036 bad_packet: 2037 __TCP_INC_STATS(net, TCP_MIB_INERRS); 2038 } else { 2039 tcp_v4_send_reset(NULL, skb); 2040 } 2041 2042 discard_it: 2043 /* Discard frame. */ 2044 kfree_skb(skb); 2045 return 0; 2046 2047 discard_and_relse: 2048 sk_drops_add(sk, skb); 2049 if (refcounted) 2050 sock_put(sk); 2051 goto discard_it; 2052 2053 do_time_wait: 2054 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { 2055 inet_twsk_put(inet_twsk(sk)); 2056 goto discard_it; 2057 } 2058 2059 tcp_v4_fill_cb(skb, iph, th); 2060 2061 if (tcp_checksum_complete(skb)) { 2062 inet_twsk_put(inet_twsk(sk)); 2063 goto csum_error; 2064 } 2065 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) { 2066 case TCP_TW_SYN: { 2067 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev), 2068 &tcp_hashinfo, skb, 2069 __tcp_hdrlen(th), 2070 iph->saddr, th->source, 2071 iph->daddr, th->dest, 2072 inet_iif(skb), 2073 sdif); 2074 if (sk2) { 2075 inet_twsk_deschedule_put(inet_twsk(sk)); 2076 sk = sk2; 2077 tcp_v4_restore_cb(skb); 2078 refcounted = false; 2079 goto process; 2080 } 2081 } 2082 /* to ACK */ 2083 fallthrough; 2084 case TCP_TW_ACK: 2085 tcp_v4_timewait_ack(sk, skb); 2086 break; 2087 case TCP_TW_RST: 2088 tcp_v4_send_reset(sk, skb); 2089 inet_twsk_deschedule_put(inet_twsk(sk)); 2090 goto discard_it; 2091 case TCP_TW_SUCCESS:; 2092 } 2093 goto discard_it; 2094 } 2095 2096 static struct timewait_sock_ops tcp_timewait_sock_ops = { 2097 .twsk_obj_size = sizeof(struct tcp_timewait_sock), 2098 .twsk_unique = tcp_twsk_unique, 2099 .twsk_destructor= tcp_twsk_destructor, 2100 }; 2101 2102 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) 2103 { 2104 struct dst_entry *dst = skb_dst(skb); 2105 2106 if (dst && dst_hold_safe(dst)) { 2107 sk->sk_rx_dst = dst; 2108 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif; 2109 } 2110 } 2111 EXPORT_SYMBOL(inet_sk_rx_dst_set); 2112 2113 const struct inet_connection_sock_af_ops ipv4_specific = { 2114 .queue_xmit = ip_queue_xmit, 2115 .send_check = tcp_v4_send_check, 2116 .rebuild_header = inet_sk_rebuild_header, 2117 .sk_rx_dst_set = inet_sk_rx_dst_set, 2118 .conn_request = tcp_v4_conn_request, 2119 .syn_recv_sock = tcp_v4_syn_recv_sock, 2120 .net_header_len = sizeof(struct iphdr), 2121 .setsockopt = ip_setsockopt, 2122 .getsockopt = ip_getsockopt, 2123 .addr2sockaddr = inet_csk_addr2sockaddr, 2124 .sockaddr_len = sizeof(struct sockaddr_in), 2125 #ifdef CONFIG_COMPAT 2126 .compat_setsockopt = compat_ip_setsockopt, 2127 .compat_getsockopt = compat_ip_getsockopt, 2128 #endif 2129 .mtu_reduced = tcp_v4_mtu_reduced, 2130 }; 2131 EXPORT_SYMBOL(ipv4_specific); 2132 2133 #ifdef CONFIG_TCP_MD5SIG 2134 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = { 2135 .md5_lookup = tcp_v4_md5_lookup, 2136 .calc_md5_hash = tcp_v4_md5_hash_skb, 2137 .md5_parse = tcp_v4_parse_md5_keys, 2138 }; 2139 #endif 2140 2141 /* NOTE: A lot of things set to zero explicitly by call to 2142 * sk_alloc() so need not be done here. 2143 */ 2144 static int tcp_v4_init_sock(struct sock *sk) 2145 { 2146 struct inet_connection_sock *icsk = inet_csk(sk); 2147 2148 tcp_init_sock(sk); 2149 2150 icsk->icsk_af_ops = &ipv4_specific; 2151 2152 #ifdef CONFIG_TCP_MD5SIG 2153 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific; 2154 #endif 2155 2156 return 0; 2157 } 2158 2159 void tcp_v4_destroy_sock(struct sock *sk) 2160 { 2161 struct tcp_sock *tp = tcp_sk(sk); 2162 2163 trace_tcp_destroy_sock(sk); 2164 2165 tcp_clear_xmit_timers(sk); 2166 2167 tcp_cleanup_congestion_control(sk); 2168 2169 tcp_cleanup_ulp(sk); 2170 2171 /* Cleanup up the write buffer. */ 2172 tcp_write_queue_purge(sk); 2173 2174 /* Check if we want to disable active TFO */ 2175 tcp_fastopen_active_disable_ofo_check(sk); 2176 2177 /* Cleans up our, hopefully empty, out_of_order_queue. */ 2178 skb_rbtree_purge(&tp->out_of_order_queue); 2179 2180 #ifdef CONFIG_TCP_MD5SIG 2181 /* Clean up the MD5 key list, if any */ 2182 if (tp->md5sig_info) { 2183 tcp_clear_md5_list(sk); 2184 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu); 2185 tp->md5sig_info = NULL; 2186 } 2187 #endif 2188 2189 /* Clean up a referenced TCP bind bucket. */ 2190 if (inet_csk(sk)->icsk_bind_hash) 2191 inet_put_port(sk); 2192 2193 BUG_ON(rcu_access_pointer(tp->fastopen_rsk)); 2194 2195 /* If socket is aborted during connect operation */ 2196 tcp_free_fastopen_req(tp); 2197 tcp_fastopen_destroy_cipher(sk); 2198 tcp_saved_syn_free(tp); 2199 2200 sk_sockets_allocated_dec(sk); 2201 } 2202 EXPORT_SYMBOL(tcp_v4_destroy_sock); 2203 2204 #ifdef CONFIG_PROC_FS 2205 /* Proc filesystem TCP sock list dumping. */ 2206 2207 /* 2208 * Get next listener socket follow cur. If cur is NULL, get first socket 2209 * starting from bucket given in st->bucket; when st->bucket is zero the 2210 * very first socket in the hash table is returned. 2211 */ 2212 static void *listening_get_next(struct seq_file *seq, void *cur) 2213 { 2214 struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file)); 2215 struct tcp_iter_state *st = seq->private; 2216 struct net *net = seq_file_net(seq); 2217 struct inet_listen_hashbucket *ilb; 2218 struct hlist_nulls_node *node; 2219 struct sock *sk = cur; 2220 2221 if (!sk) { 2222 get_head: 2223 ilb = &tcp_hashinfo.listening_hash[st->bucket]; 2224 spin_lock(&ilb->lock); 2225 sk = sk_nulls_head(&ilb->nulls_head); 2226 st->offset = 0; 2227 goto get_sk; 2228 } 2229 ilb = &tcp_hashinfo.listening_hash[st->bucket]; 2230 ++st->num; 2231 ++st->offset; 2232 2233 sk = sk_nulls_next(sk); 2234 get_sk: 2235 sk_nulls_for_each_from(sk, node) { 2236 if (!net_eq(sock_net(sk), net)) 2237 continue; 2238 if (sk->sk_family == afinfo->family) 2239 return sk; 2240 } 2241 spin_unlock(&ilb->lock); 2242 st->offset = 0; 2243 if (++st->bucket < INET_LHTABLE_SIZE) 2244 goto get_head; 2245 return NULL; 2246 } 2247 2248 static void *listening_get_idx(struct seq_file *seq, loff_t *pos) 2249 { 2250 struct tcp_iter_state *st = seq->private; 2251 void *rc; 2252 2253 st->bucket = 0; 2254 st->offset = 0; 2255 rc = listening_get_next(seq, NULL); 2256 2257 while (rc && *pos) { 2258 rc = listening_get_next(seq, rc); 2259 --*pos; 2260 } 2261 return rc; 2262 } 2263 2264 static inline bool empty_bucket(const struct tcp_iter_state *st) 2265 { 2266 return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain); 2267 } 2268 2269 /* 2270 * Get first established socket starting from bucket given in st->bucket. 2271 * If st->bucket is zero, the very first socket in the hash is returned. 2272 */ 2273 static void *established_get_first(struct seq_file *seq) 2274 { 2275 struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file)); 2276 struct tcp_iter_state *st = seq->private; 2277 struct net *net = seq_file_net(seq); 2278 void *rc = NULL; 2279 2280 st->offset = 0; 2281 for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) { 2282 struct sock *sk; 2283 struct hlist_nulls_node *node; 2284 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket); 2285 2286 /* Lockless fast path for the common case of empty buckets */ 2287 if (empty_bucket(st)) 2288 continue; 2289 2290 spin_lock_bh(lock); 2291 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) { 2292 if (sk->sk_family != afinfo->family || 2293 !net_eq(sock_net(sk), net)) { 2294 continue; 2295 } 2296 rc = sk; 2297 goto out; 2298 } 2299 spin_unlock_bh(lock); 2300 } 2301 out: 2302 return rc; 2303 } 2304 2305 static void *established_get_next(struct seq_file *seq, void *cur) 2306 { 2307 struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file)); 2308 struct sock *sk = cur; 2309 struct hlist_nulls_node *node; 2310 struct tcp_iter_state *st = seq->private; 2311 struct net *net = seq_file_net(seq); 2312 2313 ++st->num; 2314 ++st->offset; 2315 2316 sk = sk_nulls_next(sk); 2317 2318 sk_nulls_for_each_from(sk, node) { 2319 if (sk->sk_family == afinfo->family && 2320 net_eq(sock_net(sk), net)) 2321 return sk; 2322 } 2323 2324 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); 2325 ++st->bucket; 2326 return established_get_first(seq); 2327 } 2328 2329 static void *established_get_idx(struct seq_file *seq, loff_t pos) 2330 { 2331 struct tcp_iter_state *st = seq->private; 2332 void *rc; 2333 2334 st->bucket = 0; 2335 rc = established_get_first(seq); 2336 2337 while (rc && pos) { 2338 rc = established_get_next(seq, rc); 2339 --pos; 2340 } 2341 return rc; 2342 } 2343 2344 static void *tcp_get_idx(struct seq_file *seq, loff_t pos) 2345 { 2346 void *rc; 2347 struct tcp_iter_state *st = seq->private; 2348 2349 st->state = TCP_SEQ_STATE_LISTENING; 2350 rc = listening_get_idx(seq, &pos); 2351 2352 if (!rc) { 2353 st->state = TCP_SEQ_STATE_ESTABLISHED; 2354 rc = established_get_idx(seq, pos); 2355 } 2356 2357 return rc; 2358 } 2359 2360 static void *tcp_seek_last_pos(struct seq_file *seq) 2361 { 2362 struct tcp_iter_state *st = seq->private; 2363 int offset = st->offset; 2364 int orig_num = st->num; 2365 void *rc = NULL; 2366 2367 switch (st->state) { 2368 case TCP_SEQ_STATE_LISTENING: 2369 if (st->bucket >= INET_LHTABLE_SIZE) 2370 break; 2371 st->state = TCP_SEQ_STATE_LISTENING; 2372 rc = listening_get_next(seq, NULL); 2373 while (offset-- && rc) 2374 rc = listening_get_next(seq, rc); 2375 if (rc) 2376 break; 2377 st->bucket = 0; 2378 st->state = TCP_SEQ_STATE_ESTABLISHED; 2379 fallthrough; 2380 case TCP_SEQ_STATE_ESTABLISHED: 2381 if (st->bucket > tcp_hashinfo.ehash_mask) 2382 break; 2383 rc = established_get_first(seq); 2384 while (offset-- && rc) 2385 rc = established_get_next(seq, rc); 2386 } 2387 2388 st->num = orig_num; 2389 2390 return rc; 2391 } 2392 2393 void *tcp_seq_start(struct seq_file *seq, loff_t *pos) 2394 { 2395 struct tcp_iter_state *st = seq->private; 2396 void *rc; 2397 2398 if (*pos && *pos == st->last_pos) { 2399 rc = tcp_seek_last_pos(seq); 2400 if (rc) 2401 goto out; 2402 } 2403 2404 st->state = TCP_SEQ_STATE_LISTENING; 2405 st->num = 0; 2406 st->bucket = 0; 2407 st->offset = 0; 2408 rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; 2409 2410 out: 2411 st->last_pos = *pos; 2412 return rc; 2413 } 2414 EXPORT_SYMBOL(tcp_seq_start); 2415 2416 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) 2417 { 2418 struct tcp_iter_state *st = seq->private; 2419 void *rc = NULL; 2420 2421 if (v == SEQ_START_TOKEN) { 2422 rc = tcp_get_idx(seq, 0); 2423 goto out; 2424 } 2425 2426 switch (st->state) { 2427 case TCP_SEQ_STATE_LISTENING: 2428 rc = listening_get_next(seq, v); 2429 if (!rc) { 2430 st->state = TCP_SEQ_STATE_ESTABLISHED; 2431 st->bucket = 0; 2432 st->offset = 0; 2433 rc = established_get_first(seq); 2434 } 2435 break; 2436 case TCP_SEQ_STATE_ESTABLISHED: 2437 rc = established_get_next(seq, v); 2438 break; 2439 } 2440 out: 2441 ++*pos; 2442 st->last_pos = *pos; 2443 return rc; 2444 } 2445 EXPORT_SYMBOL(tcp_seq_next); 2446 2447 void tcp_seq_stop(struct seq_file *seq, void *v) 2448 { 2449 struct tcp_iter_state *st = seq->private; 2450 2451 switch (st->state) { 2452 case TCP_SEQ_STATE_LISTENING: 2453 if (v != SEQ_START_TOKEN) 2454 spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock); 2455 break; 2456 case TCP_SEQ_STATE_ESTABLISHED: 2457 if (v) 2458 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); 2459 break; 2460 } 2461 } 2462 EXPORT_SYMBOL(tcp_seq_stop); 2463 2464 static void get_openreq4(const struct request_sock *req, 2465 struct seq_file *f, int i) 2466 { 2467 const struct inet_request_sock *ireq = inet_rsk(req); 2468 long delta = req->rsk_timer.expires - jiffies; 2469 2470 seq_printf(f, "%4d: %08X:%04X %08X:%04X" 2471 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK", 2472 i, 2473 ireq->ir_loc_addr, 2474 ireq->ir_num, 2475 ireq->ir_rmt_addr, 2476 ntohs(ireq->ir_rmt_port), 2477 TCP_SYN_RECV, 2478 0, 0, /* could print option size, but that is af dependent. */ 2479 1, /* timers active (only the expire timer) */ 2480 jiffies_delta_to_clock_t(delta), 2481 req->num_timeout, 2482 from_kuid_munged(seq_user_ns(f), 2483 sock_i_uid(req->rsk_listener)), 2484 0, /* non standard timer */ 2485 0, /* open_requests have no inode */ 2486 0, 2487 req); 2488 } 2489 2490 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) 2491 { 2492 int timer_active; 2493 unsigned long timer_expires; 2494 const struct tcp_sock *tp = tcp_sk(sk); 2495 const struct inet_connection_sock *icsk = inet_csk(sk); 2496 const struct inet_sock *inet = inet_sk(sk); 2497 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq; 2498 __be32 dest = inet->inet_daddr; 2499 __be32 src = inet->inet_rcv_saddr; 2500 __u16 destp = ntohs(inet->inet_dport); 2501 __u16 srcp = ntohs(inet->inet_sport); 2502 int rx_queue; 2503 int state; 2504 2505 if (icsk->icsk_pending == ICSK_TIME_RETRANS || 2506 icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT || 2507 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { 2508 timer_active = 1; 2509 timer_expires = icsk->icsk_timeout; 2510 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) { 2511 timer_active = 4; 2512 timer_expires = icsk->icsk_timeout; 2513 } else if (timer_pending(&sk->sk_timer)) { 2514 timer_active = 2; 2515 timer_expires = sk->sk_timer.expires; 2516 } else { 2517 timer_active = 0; 2518 timer_expires = jiffies; 2519 } 2520 2521 state = inet_sk_state_load(sk); 2522 if (state == TCP_LISTEN) 2523 rx_queue = READ_ONCE(sk->sk_ack_backlog); 2524 else 2525 /* Because we don't lock the socket, 2526 * we might find a transient negative value. 2527 */ 2528 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) - 2529 READ_ONCE(tp->copied_seq), 0); 2530 2531 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX " 2532 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d", 2533 i, src, srcp, dest, destp, state, 2534 READ_ONCE(tp->write_seq) - tp->snd_una, 2535 rx_queue, 2536 timer_active, 2537 jiffies_delta_to_clock_t(timer_expires - jiffies), 2538 icsk->icsk_retransmits, 2539 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)), 2540 icsk->icsk_probes_out, 2541 sock_i_ino(sk), 2542 refcount_read(&sk->sk_refcnt), sk, 2543 jiffies_to_clock_t(icsk->icsk_rto), 2544 jiffies_to_clock_t(icsk->icsk_ack.ato), 2545 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk), 2546 tp->snd_cwnd, 2547 state == TCP_LISTEN ? 2548 fastopenq->max_qlen : 2549 (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh)); 2550 } 2551 2552 static void get_timewait4_sock(const struct inet_timewait_sock *tw, 2553 struct seq_file *f, int i) 2554 { 2555 long delta = tw->tw_timer.expires - jiffies; 2556 __be32 dest, src; 2557 __u16 destp, srcp; 2558 2559 dest = tw->tw_daddr; 2560 src = tw->tw_rcv_saddr; 2561 destp = ntohs(tw->tw_dport); 2562 srcp = ntohs(tw->tw_sport); 2563 2564 seq_printf(f, "%4d: %08X:%04X %08X:%04X" 2565 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK", 2566 i, src, srcp, dest, destp, tw->tw_substate, 0, 0, 2567 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0, 2568 refcount_read(&tw->tw_refcnt), tw); 2569 } 2570 2571 #define TMPSZ 150 2572 2573 static int tcp4_seq_show(struct seq_file *seq, void *v) 2574 { 2575 struct tcp_iter_state *st; 2576 struct sock *sk = v; 2577 2578 seq_setwidth(seq, TMPSZ - 1); 2579 if (v == SEQ_START_TOKEN) { 2580 seq_puts(seq, " sl local_address rem_address st tx_queue " 2581 "rx_queue tr tm->when retrnsmt uid timeout " 2582 "inode"); 2583 goto out; 2584 } 2585 st = seq->private; 2586 2587 if (sk->sk_state == TCP_TIME_WAIT) 2588 get_timewait4_sock(v, seq, st->num); 2589 else if (sk->sk_state == TCP_NEW_SYN_RECV) 2590 get_openreq4(v, seq, st->num); 2591 else 2592 get_tcp4_sock(v, seq, st->num); 2593 out: 2594 seq_pad(seq, '\n'); 2595 return 0; 2596 } 2597 2598 static const struct seq_operations tcp4_seq_ops = { 2599 .show = tcp4_seq_show, 2600 .start = tcp_seq_start, 2601 .next = tcp_seq_next, 2602 .stop = tcp_seq_stop, 2603 }; 2604 2605 static struct tcp_seq_afinfo tcp4_seq_afinfo = { 2606 .family = AF_INET, 2607 }; 2608 2609 static int __net_init tcp4_proc_init_net(struct net *net) 2610 { 2611 if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops, 2612 sizeof(struct tcp_iter_state), &tcp4_seq_afinfo)) 2613 return -ENOMEM; 2614 return 0; 2615 } 2616 2617 static void __net_exit tcp4_proc_exit_net(struct net *net) 2618 { 2619 remove_proc_entry("tcp", net->proc_net); 2620 } 2621 2622 static struct pernet_operations tcp4_net_ops = { 2623 .init = tcp4_proc_init_net, 2624 .exit = tcp4_proc_exit_net, 2625 }; 2626 2627 int __init tcp4_proc_init(void) 2628 { 2629 return register_pernet_subsys(&tcp4_net_ops); 2630 } 2631 2632 void tcp4_proc_exit(void) 2633 { 2634 unregister_pernet_subsys(&tcp4_net_ops); 2635 } 2636 #endif /* CONFIG_PROC_FS */ 2637 2638 struct proto tcp_prot = { 2639 .name = "TCP", 2640 .owner = THIS_MODULE, 2641 .close = tcp_close, 2642 .pre_connect = tcp_v4_pre_connect, 2643 .connect = tcp_v4_connect, 2644 .disconnect = tcp_disconnect, 2645 .accept = inet_csk_accept, 2646 .ioctl = tcp_ioctl, 2647 .init = tcp_v4_init_sock, 2648 .destroy = tcp_v4_destroy_sock, 2649 .shutdown = tcp_shutdown, 2650 .setsockopt = tcp_setsockopt, 2651 .getsockopt = tcp_getsockopt, 2652 .keepalive = tcp_set_keepalive, 2653 .recvmsg = tcp_recvmsg, 2654 .sendmsg = tcp_sendmsg, 2655 .sendpage = tcp_sendpage, 2656 .backlog_rcv = tcp_v4_do_rcv, 2657 .release_cb = tcp_release_cb, 2658 .hash = inet_hash, 2659 .unhash = inet_unhash, 2660 .get_port = inet_csk_get_port, 2661 .enter_memory_pressure = tcp_enter_memory_pressure, 2662 .leave_memory_pressure = tcp_leave_memory_pressure, 2663 .stream_memory_free = tcp_stream_memory_free, 2664 .sockets_allocated = &tcp_sockets_allocated, 2665 .orphan_count = &tcp_orphan_count, 2666 .memory_allocated = &tcp_memory_allocated, 2667 .memory_pressure = &tcp_memory_pressure, 2668 .sysctl_mem = sysctl_tcp_mem, 2669 .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem), 2670 .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), 2671 .max_header = MAX_TCP_HEADER, 2672 .obj_size = sizeof(struct tcp_sock), 2673 .slab_flags = SLAB_TYPESAFE_BY_RCU, 2674 .twsk_prot = &tcp_timewait_sock_ops, 2675 .rsk_prot = &tcp_request_sock_ops, 2676 .h.hashinfo = &tcp_hashinfo, 2677 .no_autobind = true, 2678 #ifdef CONFIG_COMPAT 2679 .compat_setsockopt = compat_tcp_setsockopt, 2680 .compat_getsockopt = compat_tcp_getsockopt, 2681 #endif 2682 .diag_destroy = tcp_abort, 2683 }; 2684 EXPORT_SYMBOL(tcp_prot); 2685 2686 static void __net_exit tcp_sk_exit(struct net *net) 2687 { 2688 int cpu; 2689 2690 if (net->ipv4.tcp_congestion_control) 2691 bpf_module_put(net->ipv4.tcp_congestion_control, 2692 net->ipv4.tcp_congestion_control->owner); 2693 2694 for_each_possible_cpu(cpu) 2695 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu)); 2696 free_percpu(net->ipv4.tcp_sk); 2697 } 2698 2699 static int __net_init tcp_sk_init(struct net *net) 2700 { 2701 int res, cpu, cnt; 2702 2703 net->ipv4.tcp_sk = alloc_percpu(struct sock *); 2704 if (!net->ipv4.tcp_sk) 2705 return -ENOMEM; 2706 2707 for_each_possible_cpu(cpu) { 2708 struct sock *sk; 2709 2710 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW, 2711 IPPROTO_TCP, net); 2712 if (res) 2713 goto fail; 2714 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); 2715 2716 /* Please enforce IP_DF and IPID==0 for RST and 2717 * ACK sent in SYN-RECV and TIME-WAIT state. 2718 */ 2719 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO; 2720 2721 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk; 2722 } 2723 2724 net->ipv4.sysctl_tcp_ecn = 2; 2725 net->ipv4.sysctl_tcp_ecn_fallback = 1; 2726 2727 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS; 2728 net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS; 2729 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD; 2730 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL; 2731 net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS; 2732 2733 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME; 2734 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES; 2735 net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL; 2736 2737 net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES; 2738 net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES; 2739 net->ipv4.sysctl_tcp_syncookies = 1; 2740 net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH; 2741 net->ipv4.sysctl_tcp_retries1 = TCP_RETR1; 2742 net->ipv4.sysctl_tcp_retries2 = TCP_RETR2; 2743 net->ipv4.sysctl_tcp_orphan_retries = 0; 2744 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT; 2745 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX; 2746 net->ipv4.sysctl_tcp_tw_reuse = 2; 2747 net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1; 2748 2749 cnt = tcp_hashinfo.ehash_mask + 1; 2750 net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2; 2751 net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo; 2752 2753 net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128); 2754 net->ipv4.sysctl_tcp_sack = 1; 2755 net->ipv4.sysctl_tcp_window_scaling = 1; 2756 net->ipv4.sysctl_tcp_timestamps = 1; 2757 net->ipv4.sysctl_tcp_early_retrans = 3; 2758 net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION; 2759 net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */ 2760 net->ipv4.sysctl_tcp_retrans_collapse = 1; 2761 net->ipv4.sysctl_tcp_max_reordering = 300; 2762 net->ipv4.sysctl_tcp_dsack = 1; 2763 net->ipv4.sysctl_tcp_app_win = 31; 2764 net->ipv4.sysctl_tcp_adv_win_scale = 1; 2765 net->ipv4.sysctl_tcp_frto = 2; 2766 net->ipv4.sysctl_tcp_moderate_rcvbuf = 1; 2767 /* This limits the percentage of the congestion window which we 2768 * will allow a single TSO frame to consume. Building TSO frames 2769 * which are too large can cause TCP streams to be bursty. 2770 */ 2771 net->ipv4.sysctl_tcp_tso_win_divisor = 3; 2772 /* Default TSQ limit of 16 TSO segments */ 2773 net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536; 2774 /* rfc5961 challenge ack rate limiting */ 2775 net->ipv4.sysctl_tcp_challenge_ack_limit = 1000; 2776 net->ipv4.sysctl_tcp_min_tso_segs = 2; 2777 net->ipv4.sysctl_tcp_min_rtt_wlen = 300; 2778 net->ipv4.sysctl_tcp_autocorking = 1; 2779 net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2; 2780 net->ipv4.sysctl_tcp_pacing_ss_ratio = 200; 2781 net->ipv4.sysctl_tcp_pacing_ca_ratio = 120; 2782 if (net != &init_net) { 2783 memcpy(net->ipv4.sysctl_tcp_rmem, 2784 init_net.ipv4.sysctl_tcp_rmem, 2785 sizeof(init_net.ipv4.sysctl_tcp_rmem)); 2786 memcpy(net->ipv4.sysctl_tcp_wmem, 2787 init_net.ipv4.sysctl_tcp_wmem, 2788 sizeof(init_net.ipv4.sysctl_tcp_wmem)); 2789 } 2790 net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC; 2791 net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC; 2792 net->ipv4.sysctl_tcp_comp_sack_nr = 44; 2793 net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; 2794 spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock); 2795 net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60; 2796 atomic_set(&net->ipv4.tfo_active_disable_times, 0); 2797 2798 /* Reno is always built in */ 2799 if (!net_eq(net, &init_net) && 2800 bpf_try_module_get(init_net.ipv4.tcp_congestion_control, 2801 init_net.ipv4.tcp_congestion_control->owner)) 2802 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control; 2803 else 2804 net->ipv4.tcp_congestion_control = &tcp_reno; 2805 2806 return 0; 2807 fail: 2808 tcp_sk_exit(net); 2809 2810 return res; 2811 } 2812 2813 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list) 2814 { 2815 struct net *net; 2816 2817 inet_twsk_purge(&tcp_hashinfo, AF_INET); 2818 2819 list_for_each_entry(net, net_exit_list, exit_list) 2820 tcp_fastopen_ctx_destroy(net); 2821 } 2822 2823 static struct pernet_operations __net_initdata tcp_sk_ops = { 2824 .init = tcp_sk_init, 2825 .exit = tcp_sk_exit, 2826 .exit_batch = tcp_sk_exit_batch, 2827 }; 2828 2829 void __init tcp_v4_init(void) 2830 { 2831 if (register_pernet_subsys(&tcp_sk_ops)) 2832 panic("Failed to create the TCP control socket.\n"); 2833 } 2834