// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system. INET is implemented using the BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              Implementation of the Transmission Control Protocol(TCP).
 *
 *              IPv4 specific functions
 *
 *              code split from:
 *              linux/ipv4/tcp.c
 *              linux/ipv4/tcp_input.c
 *              linux/ipv4/tcp_output.c
 *
 *              See tcp.c for author information
 */

/*
 * Changes:
 *              David S. Miller :       New socket lookup architecture.
 *                                      This code is dedicated to John Dyson.
 *              David S. Miller :       Change semantics of established hash,
 *                                      half is devoted to TIME_WAIT sockets
 *                                      and the rest go in the other half.
 *              Andi Kleen :            Add support for syncookies and fixed
 *                                      some bugs: ip options weren't passed to
 *                                      the TCP layer, missed a check for an
 *                                      ACK bit.
 *              Andi Kleen :            Implemented fast path mtu discovery.
 *                                      Fixed many serious bugs in the
 *                                      request_sock handling and moved
 *                                      most of it into the af independent code.
 *                                      Added tail drop and some other bugfixes.
 *                                      Added new listen semantics.
 *              Mike McLagan :          Routing by source
 *              Juan Jose Ciarlante:    ip_dynaddr bits
 *              Andi Kleen:             various fixes.
 *              Vitaly E. Lavrov :      Transparent proxy revived after year
 *                                      coma.
 *              Andi Kleen :            Fix new listen.
 *              Andi Kleen :            Fix accept error reporting.
 *              YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
 *              Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
 *                                      a single port at the same time.
 */

#define pr_fmt(fmt) "TCP: " fmt

#include <linux/bottom_half.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
#include <linux/slab.h>

#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/secure_seq.h>
#include <net/busy_poll.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/inetdevice.h>

#include <crypto/hash.h>
#include <linux/scatterlist.h>

#include <trace/events/tcp.h>

#ifdef CONFIG_TCP_MD5SIG
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
                               __be32 daddr, __be32 saddr, const struct tcphdr *th);
#endif

struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);

static u32 tcp_v4_init_seq(const struct sk_buff *skb)
{
        return secure_tcp_seq(ip_hdr(skb)->daddr,
                              ip_hdr(skb)->saddr,
                              tcp_hdr(skb)->dest,
                              tcp_hdr(skb)->source);
}

static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
{
        return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
}

int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
        const struct inet_timewait_sock *tw = inet_twsk(sktw);
        const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
        struct tcp_sock *tp = tcp_sk(sk);
        int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;

        if (reuse == 2) {
                /* Still does not detect *everything* that goes
through 115 * lo, since we require a loopback src or dst address 116 * or direct binding to 'lo' interface. 117 */ 118 bool loopback = false; 119 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX) 120 loopback = true; 121 #if IS_ENABLED(CONFIG_IPV6) 122 if (tw->tw_family == AF_INET6) { 123 if (ipv6_addr_loopback(&tw->tw_v6_daddr) || 124 ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) || 125 ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) || 126 ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr)) 127 loopback = true; 128 } else 129 #endif 130 { 131 if (ipv4_is_loopback(tw->tw_daddr) || 132 ipv4_is_loopback(tw->tw_rcv_saddr)) 133 loopback = true; 134 } 135 if (!loopback) 136 reuse = 0; 137 } 138 139 /* With PAWS, it is safe from the viewpoint 140 of data integrity. Even without PAWS it is safe provided sequence 141 spaces do not overlap i.e. at data rates <= 80Mbit/sec. 142 143 Actually, the idea is close to VJ's one, only timestamp cache is 144 held not per host, but per port pair and TW bucket is used as state 145 holder. 146 147 If TW bucket has been already destroyed we fall back to VJ's scheme 148 and use initial timestamp retrieved from peer table. 149 */ 150 if (tcptw->tw_ts_recent_stamp && 151 (!twp || (reuse && time_after32(ktime_get_seconds(), 152 tcptw->tw_ts_recent_stamp)))) { 153 /* In case of repair and re-using TIME-WAIT sockets we still 154 * want to be sure that it is safe as above but honor the 155 * sequence numbers and time stamps set as part of the repair 156 * process. 157 * 158 * Without this check re-using a TIME-WAIT socket with TCP 159 * repair would accumulate a -1 on the repair assigned 160 * sequence number. The first time it is reused the sequence 161 * is -1, the second time -2, etc. This fixes that issue 162 * without appearing to create any others. 163 */ 164 if (likely(!tp->repair)) { 165 u32 seq = tcptw->tw_snd_nxt + 65535 + 2; 166 167 if (!seq) 168 seq = 1; 169 WRITE_ONCE(tp->write_seq, seq); 170 tp->rx_opt.ts_recent = tcptw->tw_ts_recent; 171 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp; 172 } 173 sock_hold(sktw); 174 return 1; 175 } 176 177 return 0; 178 } 179 EXPORT_SYMBOL_GPL(tcp_twsk_unique); 180 181 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr, 182 int addr_len) 183 { 184 /* This check is replicated from tcp_v4_connect() and intended to 185 * prevent BPF program called below from accessing bytes that are out 186 * of the bound specified by user in addr_len. 187 */ 188 if (addr_len < sizeof(struct sockaddr_in)) 189 return -EINVAL; 190 191 sock_owned_by_me(sk); 192 193 return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr); 194 } 195 196 /* This will initiate an outgoing connection. 
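 * The route is resolved with ip_route_connect(), the socket moves to
 * TCP_SYN_SENT and is hashed via inet_hash_connect() (which also picks
 * a source port if none is bound) before tcp_connect() sends the SYN.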
*/ 197 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) 198 { 199 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr; 200 struct inet_sock *inet = inet_sk(sk); 201 struct tcp_sock *tp = tcp_sk(sk); 202 __be16 orig_sport, orig_dport; 203 __be32 daddr, nexthop; 204 struct flowi4 *fl4; 205 struct rtable *rt; 206 int err; 207 struct ip_options_rcu *inet_opt; 208 struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row; 209 210 if (addr_len < sizeof(struct sockaddr_in)) 211 return -EINVAL; 212 213 if (usin->sin_family != AF_INET) 214 return -EAFNOSUPPORT; 215 216 nexthop = daddr = usin->sin_addr.s_addr; 217 inet_opt = rcu_dereference_protected(inet->inet_opt, 218 lockdep_sock_is_held(sk)); 219 if (inet_opt && inet_opt->opt.srr) { 220 if (!daddr) 221 return -EINVAL; 222 nexthop = inet_opt->opt.faddr; 223 } 224 225 orig_sport = inet->inet_sport; 226 orig_dport = usin->sin_port; 227 fl4 = &inet->cork.fl.u.ip4; 228 rt = ip_route_connect(fl4, nexthop, inet->inet_saddr, 229 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if, 230 IPPROTO_TCP, 231 orig_sport, orig_dport, sk); 232 if (IS_ERR(rt)) { 233 err = PTR_ERR(rt); 234 if (err == -ENETUNREACH) 235 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES); 236 return err; 237 } 238 239 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) { 240 ip_rt_put(rt); 241 return -ENETUNREACH; 242 } 243 244 if (!inet_opt || !inet_opt->opt.srr) 245 daddr = fl4->daddr; 246 247 if (!inet->inet_saddr) 248 inet->inet_saddr = fl4->saddr; 249 sk_rcv_saddr_set(sk, inet->inet_saddr); 250 251 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) { 252 /* Reset inherited state */ 253 tp->rx_opt.ts_recent = 0; 254 tp->rx_opt.ts_recent_stamp = 0; 255 if (likely(!tp->repair)) 256 WRITE_ONCE(tp->write_seq, 0); 257 } 258 259 inet->inet_dport = usin->sin_port; 260 sk_daddr_set(sk, daddr); 261 262 inet_csk(sk)->icsk_ext_hdr_len = 0; 263 if (inet_opt) 264 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen; 265 266 tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT; 267 268 /* Socket identity is still unknown (sport may be zero). 269 * However we set state to SYN-SENT and not releasing socket 270 * lock select source port, enter ourselves into the hash tables and 271 * complete initialization after this. 272 */ 273 tcp_set_state(sk, TCP_SYN_SENT); 274 err = inet_hash_connect(tcp_death_row, sk); 275 if (err) 276 goto failure; 277 278 sk_set_txhash(sk); 279 280 rt = ip_route_newports(fl4, rt, orig_sport, orig_dport, 281 inet->inet_sport, inet->inet_dport, sk); 282 if (IS_ERR(rt)) { 283 err = PTR_ERR(rt); 284 rt = NULL; 285 goto failure; 286 } 287 /* OK, now commit destination to socket. */ 288 sk->sk_gso_type = SKB_GSO_TCPV4; 289 sk_setup_caps(sk, &rt->dst); 290 rt = NULL; 291 292 if (likely(!tp->repair)) { 293 if (!tp->write_seq) 294 WRITE_ONCE(tp->write_seq, 295 secure_tcp_seq(inet->inet_saddr, 296 inet->inet_daddr, 297 inet->inet_sport, 298 usin->sin_port)); 299 tp->tsoffset = secure_tcp_ts_off(sock_net(sk), 300 inet->inet_saddr, 301 inet->inet_daddr); 302 } 303 304 inet->inet_id = prandom_u32(); 305 306 if (tcp_fastopen_defer_connect(sk, &err)) 307 return err; 308 if (err) 309 goto failure; 310 311 err = tcp_connect(sk); 312 313 if (err) 314 goto failure; 315 316 return 0; 317 318 failure: 319 /* 320 * This unhashes the socket and releases the local port, 321 * if necessary. 
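 * tcp_set_state(TCP_CLOSE) below takes care of the unhashing; the route
 * reference, sk_route_caps and the destination port are cleared as well.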
322 */ 323 tcp_set_state(sk, TCP_CLOSE); 324 ip_rt_put(rt); 325 sk->sk_route_caps = 0; 326 inet->inet_dport = 0; 327 return err; 328 } 329 EXPORT_SYMBOL(tcp_v4_connect); 330 331 /* 332 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191. 333 * It can be called through tcp_release_cb() if socket was owned by user 334 * at the time tcp_v4_err() was called to handle ICMP message. 335 */ 336 void tcp_v4_mtu_reduced(struct sock *sk) 337 { 338 struct inet_sock *inet = inet_sk(sk); 339 struct dst_entry *dst; 340 u32 mtu; 341 342 if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) 343 return; 344 mtu = tcp_sk(sk)->mtu_info; 345 dst = inet_csk_update_pmtu(sk, mtu); 346 if (!dst) 347 return; 348 349 /* Something is about to be wrong... Remember soft error 350 * for the case, if this connection will not able to recover. 351 */ 352 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst)) 353 sk->sk_err_soft = EMSGSIZE; 354 355 mtu = dst_mtu(dst); 356 357 if (inet->pmtudisc != IP_PMTUDISC_DONT && 358 ip_sk_accept_pmtu(sk) && 359 inet_csk(sk)->icsk_pmtu_cookie > mtu) { 360 tcp_sync_mss(sk, mtu); 361 362 /* Resend the TCP packet because it's 363 * clear that the old packet has been 364 * dropped. This is the new "fast" path mtu 365 * discovery. 366 */ 367 tcp_simple_retransmit(sk); 368 } /* else let the usual retransmit timer handle it */ 369 } 370 EXPORT_SYMBOL(tcp_v4_mtu_reduced); 371 372 static void do_redirect(struct sk_buff *skb, struct sock *sk) 373 { 374 struct dst_entry *dst = __sk_dst_check(sk, 0); 375 376 if (dst) 377 dst->ops->redirect(dst, sk, skb); 378 } 379 380 381 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */ 382 void tcp_req_err(struct sock *sk, u32 seq, bool abort) 383 { 384 struct request_sock *req = inet_reqsk(sk); 385 struct net *net = sock_net(sk); 386 387 /* ICMPs are not backlogged, hence we cannot get 388 * an established socket here. 389 */ 390 if (seq != tcp_rsk(req)->snt_isn) { 391 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS); 392 } else if (abort) { 393 /* 394 * Still in SYN_RECV, just remove it silently. 395 * There is no good way to pass the error to the newly 396 * created socket, and POSIX does not want network 397 * errors returned from accept(). 398 */ 399 inet_csk_reqsk_queue_drop(req->rsk_listener, req); 400 tcp_listendrop(req->rsk_listener); 401 } 402 reqsk_put(req); 403 } 404 EXPORT_SYMBOL(tcp_req_err); 405 406 /* 407 * This routine is called by the ICMP module when it gets some 408 * sort of error condition. If err < 0 then the socket should 409 * be closed and the error returned to the user. If err > 0 410 * it's just the icmp type << 8 | icmp code. After adjustment 411 * header points to the first 8 bytes of the tcp header. We need 412 * to find the appropriate port. 413 * 414 * The locking strategy used here is very "optimistic". When 415 * someone else accesses the socket the ICMP is just dropped 416 * and for some paths there is no check at all. 417 * A more general error queue to queue errors for later handling 418 * is probably better. 
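 * The handler below only acts on ICMP types it understands (redirect,
 * source quench, parameter problem, destination unreachable and time
 * exceeded); everything else is ignored after the socket lookup.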
419 * 420 */ 421 422 int tcp_v4_err(struct sk_buff *icmp_skb, u32 info) 423 { 424 const struct iphdr *iph = (const struct iphdr *)icmp_skb->data; 425 struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2)); 426 struct inet_connection_sock *icsk; 427 struct tcp_sock *tp; 428 struct inet_sock *inet; 429 const int type = icmp_hdr(icmp_skb)->type; 430 const int code = icmp_hdr(icmp_skb)->code; 431 struct sock *sk; 432 struct sk_buff *skb; 433 struct request_sock *fastopen; 434 u32 seq, snd_una; 435 s32 remaining; 436 u32 delta_us; 437 int err; 438 struct net *net = dev_net(icmp_skb->dev); 439 440 sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr, 441 th->dest, iph->saddr, ntohs(th->source), 442 inet_iif(icmp_skb), 0); 443 if (!sk) { 444 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); 445 return -ENOENT; 446 } 447 if (sk->sk_state == TCP_TIME_WAIT) { 448 inet_twsk_put(inet_twsk(sk)); 449 return 0; 450 } 451 seq = ntohl(th->seq); 452 if (sk->sk_state == TCP_NEW_SYN_RECV) { 453 tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB || 454 type == ICMP_TIME_EXCEEDED || 455 (type == ICMP_DEST_UNREACH && 456 (code == ICMP_NET_UNREACH || 457 code == ICMP_HOST_UNREACH))); 458 return 0; 459 } 460 461 bh_lock_sock(sk); 462 /* If too many ICMPs get dropped on busy 463 * servers this needs to be solved differently. 464 * We do take care of PMTU discovery (RFC1191) special case : 465 * we can receive locally generated ICMP messages while socket is held. 466 */ 467 if (sock_owned_by_user(sk)) { 468 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)) 469 __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS); 470 } 471 if (sk->sk_state == TCP_CLOSE) 472 goto out; 473 474 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) { 475 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); 476 goto out; 477 } 478 479 icsk = inet_csk(sk); 480 tp = tcp_sk(sk); 481 /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */ 482 fastopen = rcu_dereference(tp->fastopen_rsk); 483 snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una; 484 if (sk->sk_state != TCP_LISTEN && 485 !between(seq, snd_una, tp->snd_nxt)) { 486 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS); 487 goto out; 488 } 489 490 switch (type) { 491 case ICMP_REDIRECT: 492 if (!sock_owned_by_user(sk)) 493 do_redirect(icmp_skb, sk); 494 goto out; 495 case ICMP_SOURCE_QUENCH: 496 /* Just silently ignore these. */ 497 goto out; 498 case ICMP_PARAMETERPROB: 499 err = EPROTO; 500 break; 501 case ICMP_DEST_UNREACH: 502 if (code > NR_ICMP_UNREACH) 503 goto out; 504 505 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */ 506 /* We are not interested in TCP_LISTEN and open_requests 507 * (SYN-ACKs send out by Linux are always <576bytes so 508 * they should go through unfragmented). 
509 */ 510 if (sk->sk_state == TCP_LISTEN) 511 goto out; 512 513 tp->mtu_info = info; 514 if (!sock_owned_by_user(sk)) { 515 tcp_v4_mtu_reduced(sk); 516 } else { 517 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags)) 518 sock_hold(sk); 519 } 520 goto out; 521 } 522 523 err = icmp_err_convert[code].errno; 524 /* check if icmp_skb allows revert of backoff 525 * (see draft-zimmermann-tcp-lcd) */ 526 if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH) 527 break; 528 if (seq != tp->snd_una || !icsk->icsk_retransmits || 529 !icsk->icsk_backoff || fastopen) 530 break; 531 532 if (sock_owned_by_user(sk)) 533 break; 534 535 skb = tcp_rtx_queue_head(sk); 536 if (WARN_ON_ONCE(!skb)) 537 break; 538 539 icsk->icsk_backoff--; 540 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : 541 TCP_TIMEOUT_INIT; 542 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX); 543 544 545 tcp_mstamp_refresh(tp); 546 delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb)); 547 remaining = icsk->icsk_rto - 548 usecs_to_jiffies(delta_us); 549 550 if (remaining > 0) { 551 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, 552 remaining, TCP_RTO_MAX); 553 } else { 554 /* RTO revert clocked out retransmission. 555 * Will retransmit now */ 556 tcp_retransmit_timer(sk); 557 } 558 559 break; 560 case ICMP_TIME_EXCEEDED: 561 err = EHOSTUNREACH; 562 break; 563 default: 564 goto out; 565 } 566 567 switch (sk->sk_state) { 568 case TCP_SYN_SENT: 569 case TCP_SYN_RECV: 570 /* Only in fast or simultaneous open. If a fast open socket is 571 * is already accepted it is treated as a connected one below. 572 */ 573 if (fastopen && !fastopen->sk) 574 break; 575 576 if (!sock_owned_by_user(sk)) { 577 sk->sk_err = err; 578 579 sk->sk_error_report(sk); 580 581 tcp_done(sk); 582 } else { 583 sk->sk_err_soft = err; 584 } 585 goto out; 586 } 587 588 /* If we've already connected we will keep trying 589 * until we time out, or the user gives up. 590 * 591 * rfc1122 4.2.3.9 allows to consider as hard errors 592 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too, 593 * but it is obsoleted by pmtu discovery). 594 * 595 * Note, that in modern internet, where routing is unreliable 596 * and in each dark corner broken firewalls sit, sending random 597 * errors ordered by their masters even this two messages finally lose 598 * their original sense (even Linux sends invalid PORT_UNREACHs) 599 * 600 * Now we are in compliance with RFCs. 601 * --ANK (980905) 602 */ 603 604 inet = inet_sk(sk); 605 if (!sock_owned_by_user(sk) && inet->recverr) { 606 sk->sk_err = err; 607 sk->sk_error_report(sk); 608 } else { /* Only an error on timeout */ 609 sk->sk_err_soft = err; 610 } 611 612 out: 613 bh_unlock_sock(sk); 614 sock_put(sk); 615 return 0; 616 } 617 618 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr) 619 { 620 struct tcphdr *th = tcp_hdr(skb); 621 622 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0); 623 skb->csum_start = skb_transport_header(skb) - skb->head; 624 skb->csum_offset = offsetof(struct tcphdr, check); 625 } 626 627 /* This routine computes an IPv4 TCP checksum. */ 628 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb) 629 { 630 const struct inet_sock *inet = inet_sk(sk); 631 632 __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr); 633 } 634 EXPORT_SYMBOL(tcp_v4_send_check); 635 636 /* 637 * This routine will send an RST to the other tcp. 638 * 639 * Someone asks: why I NEVER use socket parameters (TOS, TTL etc.) 640 * for reset. 
641 * Answer: if a packet caused RST, it is not for a socket 642 * existing in our system, if it is matched to a socket, 643 * it is just duplicate segment or bug in other side's TCP. 644 * So that we build reply only basing on parameters 645 * arrived with segment. 646 * Exception: precedence violation. We do not implement it in any case. 647 */ 648 649 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) 650 { 651 const struct tcphdr *th = tcp_hdr(skb); 652 struct { 653 struct tcphdr th; 654 #ifdef CONFIG_TCP_MD5SIG 655 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)]; 656 #endif 657 } rep; 658 struct ip_reply_arg arg; 659 #ifdef CONFIG_TCP_MD5SIG 660 struct tcp_md5sig_key *key = NULL; 661 const __u8 *hash_location = NULL; 662 unsigned char newhash[16]; 663 int genhash; 664 struct sock *sk1 = NULL; 665 #endif 666 u64 transmit_time = 0; 667 struct sock *ctl_sk; 668 struct net *net; 669 670 /* Never send a reset in response to a reset. */ 671 if (th->rst) 672 return; 673 674 /* If sk not NULL, it means we did a successful lookup and incoming 675 * route had to be correct. prequeue might have dropped our dst. 676 */ 677 if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL) 678 return; 679 680 /* Swap the send and the receive. */ 681 memset(&rep, 0, sizeof(rep)); 682 rep.th.dest = th->source; 683 rep.th.source = th->dest; 684 rep.th.doff = sizeof(struct tcphdr) / 4; 685 rep.th.rst = 1; 686 687 if (th->ack) { 688 rep.th.seq = th->ack_seq; 689 } else { 690 rep.th.ack = 1; 691 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin + 692 skb->len - (th->doff << 2)); 693 } 694 695 memset(&arg, 0, sizeof(arg)); 696 arg.iov[0].iov_base = (unsigned char *)&rep; 697 arg.iov[0].iov_len = sizeof(rep.th); 698 699 net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev); 700 #ifdef CONFIG_TCP_MD5SIG 701 rcu_read_lock(); 702 hash_location = tcp_parse_md5sig_option(th); 703 if (sk && sk_fullsock(sk)) { 704 const union tcp_md5_addr *addr; 705 int l3index; 706 707 /* sdif set, means packet ingressed via a device 708 * in an L3 domain and inet_iif is set to it. 709 */ 710 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0; 711 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; 712 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET); 713 } else if (hash_location) { 714 const union tcp_md5_addr *addr; 715 int sdif = tcp_v4_sdif(skb); 716 int dif = inet_iif(skb); 717 int l3index; 718 719 /* 720 * active side is lost. Try to find listening socket through 721 * source port, and then find md5 key through listening socket. 722 * we are not loose security here: 723 * Incoming packet is checked with md5 hash with finding key, 724 * no RST generated if md5 hash doesn't match. 725 */ 726 sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0, 727 ip_hdr(skb)->saddr, 728 th->source, ip_hdr(skb)->daddr, 729 ntohs(th->source), dif, sdif); 730 /* don't send rst if it can't find key */ 731 if (!sk1) 732 goto out; 733 734 /* sdif set, means packet ingressed via a device 735 * in an L3 domain and dif is set to it. 736 */ 737 l3index = sdif ? 
dif : 0; 738 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; 739 key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET); 740 if (!key) 741 goto out; 742 743 744 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb); 745 if (genhash || memcmp(hash_location, newhash, 16) != 0) 746 goto out; 747 748 } 749 750 if (key) { 751 rep.opt[0] = htonl((TCPOPT_NOP << 24) | 752 (TCPOPT_NOP << 16) | 753 (TCPOPT_MD5SIG << 8) | 754 TCPOLEN_MD5SIG); 755 /* Update length and the length the header thinks exists */ 756 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED; 757 rep.th.doff = arg.iov[0].iov_len / 4; 758 759 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1], 760 key, ip_hdr(skb)->saddr, 761 ip_hdr(skb)->daddr, &rep.th); 762 } 763 #endif 764 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr, 765 ip_hdr(skb)->saddr, /* XXX */ 766 arg.iov[0].iov_len, IPPROTO_TCP, 0); 767 arg.csumoffset = offsetof(struct tcphdr, check) / 2; 768 arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0; 769 770 /* When socket is gone, all binding information is lost. 771 * routing might fail in this case. No choice here, if we choose to force 772 * input interface, we will misroute in case of asymmetric route. 773 */ 774 if (sk) { 775 arg.bound_dev_if = sk->sk_bound_dev_if; 776 if (sk_fullsock(sk)) 777 trace_tcp_send_reset(sk, skb); 778 } 779 780 BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) != 781 offsetof(struct inet_timewait_sock, tw_bound_dev_if)); 782 783 arg.tos = ip_hdr(skb)->tos; 784 arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL); 785 local_bh_disable(); 786 ctl_sk = this_cpu_read(*net->ipv4.tcp_sk); 787 if (sk) { 788 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? 789 inet_twsk(sk)->tw_mark : sk->sk_mark; 790 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ? 791 inet_twsk(sk)->tw_priority : sk->sk_priority; 792 transmit_time = tcp_transmit_time(sk); 793 } 794 ip_send_unicast_reply(ctl_sk, 795 skb, &TCP_SKB_CB(skb)->header.h4.opt, 796 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, 797 &arg, arg.iov[0].iov_len, 798 transmit_time); 799 800 ctl_sk->sk_mark = 0; 801 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS); 802 __TCP_INC_STATS(net, TCP_MIB_OUTRSTS); 803 local_bh_enable(); 804 805 #ifdef CONFIG_TCP_MD5SIG 806 out: 807 rcu_read_unlock(); 808 #endif 809 } 810 811 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states 812 outside socket context is ugly, certainly. What can I do? 813 */ 814 815 static void tcp_v4_send_ack(const struct sock *sk, 816 struct sk_buff *skb, u32 seq, u32 ack, 817 u32 win, u32 tsval, u32 tsecr, int oif, 818 struct tcp_md5sig_key *key, 819 int reply_flags, u8 tos) 820 { 821 const struct tcphdr *th = tcp_hdr(skb); 822 struct { 823 struct tcphdr th; 824 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2) 825 #ifdef CONFIG_TCP_MD5SIG 826 + (TCPOLEN_MD5SIG_ALIGNED >> 2) 827 #endif 828 ]; 829 } rep; 830 struct net *net = sock_net(sk); 831 struct ip_reply_arg arg; 832 struct sock *ctl_sk; 833 u64 transmit_time; 834 835 memset(&rep.th, 0, sizeof(struct tcphdr)); 836 memset(&arg, 0, sizeof(arg)); 837 838 arg.iov[0].iov_base = (unsigned char *)&rep; 839 arg.iov[0].iov_len = sizeof(rep.th); 840 if (tsecr) { 841 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | 842 (TCPOPT_TIMESTAMP << 8) | 843 TCPOLEN_TIMESTAMP); 844 rep.opt[1] = htonl(tsval); 845 rep.opt[2] = htonl(tsecr); 846 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED; 847 } 848 849 /* Swap the send and the receive. 
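 * The reply mirrors the incoming segment's ports and carries the
 * caller-supplied seq/ack/window; the timestamp option (if any) was
 * filled in above and an MD5 option may still be appended below.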
*/ 850 rep.th.dest = th->source; 851 rep.th.source = th->dest; 852 rep.th.doff = arg.iov[0].iov_len / 4; 853 rep.th.seq = htonl(seq); 854 rep.th.ack_seq = htonl(ack); 855 rep.th.ack = 1; 856 rep.th.window = htons(win); 857 858 #ifdef CONFIG_TCP_MD5SIG 859 if (key) { 860 int offset = (tsecr) ? 3 : 0; 861 862 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) | 863 (TCPOPT_NOP << 16) | 864 (TCPOPT_MD5SIG << 8) | 865 TCPOLEN_MD5SIG); 866 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED; 867 rep.th.doff = arg.iov[0].iov_len/4; 868 869 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset], 870 key, ip_hdr(skb)->saddr, 871 ip_hdr(skb)->daddr, &rep.th); 872 } 873 #endif 874 arg.flags = reply_flags; 875 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr, 876 ip_hdr(skb)->saddr, /* XXX */ 877 arg.iov[0].iov_len, IPPROTO_TCP, 0); 878 arg.csumoffset = offsetof(struct tcphdr, check) / 2; 879 if (oif) 880 arg.bound_dev_if = oif; 881 arg.tos = tos; 882 arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL); 883 local_bh_disable(); 884 ctl_sk = this_cpu_read(*net->ipv4.tcp_sk); 885 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? 886 inet_twsk(sk)->tw_mark : sk->sk_mark; 887 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ? 888 inet_twsk(sk)->tw_priority : sk->sk_priority; 889 transmit_time = tcp_transmit_time(sk); 890 ip_send_unicast_reply(ctl_sk, 891 skb, &TCP_SKB_CB(skb)->header.h4.opt, 892 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, 893 &arg, arg.iov[0].iov_len, 894 transmit_time); 895 896 ctl_sk->sk_mark = 0; 897 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS); 898 local_bh_enable(); 899 } 900 901 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) 902 { 903 struct inet_timewait_sock *tw = inet_twsk(sk); 904 struct tcp_timewait_sock *tcptw = tcp_twsk(sk); 905 906 tcp_v4_send_ack(sk, skb, 907 tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, 908 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, 909 tcp_time_stamp_raw() + tcptw->tw_ts_offset, 910 tcptw->tw_ts_recent, 911 tw->tw_bound_dev_if, 912 tcp_twsk_md5_key(tcptw), 913 tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0, 914 tw->tw_tos 915 ); 916 917 inet_twsk_put(tw); 918 } 919 920 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, 921 struct request_sock *req) 922 { 923 const union tcp_md5_addr *addr; 924 int l3index; 925 926 /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV 927 * sk->sk_state == TCP_SYN_RECV -> for Fast Open. 928 */ 929 u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 : 930 tcp_sk(sk)->snd_nxt; 931 932 /* RFC 7323 2.3 933 * The window field (SEG.WND) of every outgoing segment, with the 934 * exception of <SYN> segments, MUST be right-shifted by 935 * Rcv.Wind.Shift bits: 936 */ 937 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; 938 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0; 939 tcp_v4_send_ack(sk, skb, seq, 940 tcp_rsk(req)->rcv_nxt, 941 req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale, 942 tcp_time_stamp_raw() + tcp_rsk(req)->ts_off, 943 req->ts_recent, 944 0, 945 tcp_md5_do_lookup(sk, l3index, addr, AF_INET), 946 inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0, 947 ip_hdr(skb)->tos); 948 } 949 950 /* 951 * Send a SYN-ACK after having received a SYN. 952 * This still operates on a request_sock only, not on a big 953 * socket. 
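 * A route is looked up if the caller did not pass one in; the SYN-ACK
 * itself is built by tcp_make_synack() and transmitted with
 * ip_build_and_send_pkt().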
954 */ 955 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, 956 struct flowi *fl, 957 struct request_sock *req, 958 struct tcp_fastopen_cookie *foc, 959 enum tcp_synack_type synack_type) 960 { 961 const struct inet_request_sock *ireq = inet_rsk(req); 962 struct flowi4 fl4; 963 int err = -1; 964 struct sk_buff *skb; 965 966 /* First, grab a route. */ 967 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL) 968 return -1; 969 970 skb = tcp_make_synack(sk, dst, req, foc, synack_type); 971 972 if (skb) { 973 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr); 974 975 rcu_read_lock(); 976 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr, 977 ireq->ir_rmt_addr, 978 rcu_dereference(ireq->ireq_opt)); 979 rcu_read_unlock(); 980 err = net_xmit_eval(err); 981 } 982 983 return err; 984 } 985 986 /* 987 * IPv4 request_sock destructor. 988 */ 989 static void tcp_v4_reqsk_destructor(struct request_sock *req) 990 { 991 kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1)); 992 } 993 994 #ifdef CONFIG_TCP_MD5SIG 995 /* 996 * RFC2385 MD5 checksumming requires a mapping of 997 * IP address->MD5 Key. 998 * We need to maintain these in the sk structure. 999 */ 1000 1001 DEFINE_STATIC_KEY_FALSE(tcp_md5_needed); 1002 EXPORT_SYMBOL(tcp_md5_needed); 1003 1004 /* Find the Key structure for an address. */ 1005 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index, 1006 const union tcp_md5_addr *addr, 1007 int family) 1008 { 1009 const struct tcp_sock *tp = tcp_sk(sk); 1010 struct tcp_md5sig_key *key; 1011 const struct tcp_md5sig_info *md5sig; 1012 __be32 mask; 1013 struct tcp_md5sig_key *best_match = NULL; 1014 bool match; 1015 1016 /* caller either holds rcu_read_lock() or socket lock */ 1017 md5sig = rcu_dereference_check(tp->md5sig_info, 1018 lockdep_sock_is_held(sk)); 1019 if (!md5sig) 1020 return NULL; 1021 1022 hlist_for_each_entry_rcu(key, &md5sig->head, node) { 1023 if (key->family != family) 1024 continue; 1025 if (key->l3index && key->l3index != l3index) 1026 continue; 1027 if (family == AF_INET) { 1028 mask = inet_make_mask(key->prefixlen); 1029 match = (key->addr.a4.s_addr & mask) == 1030 (addr->a4.s_addr & mask); 1031 #if IS_ENABLED(CONFIG_IPV6) 1032 } else if (family == AF_INET6) { 1033 match = ipv6_prefix_equal(&key->addr.a6, &addr->a6, 1034 key->prefixlen); 1035 #endif 1036 } else { 1037 match = false; 1038 } 1039 1040 if (match && (!best_match || 1041 key->prefixlen > best_match->prefixlen)) 1042 best_match = key; 1043 } 1044 return best_match; 1045 } 1046 EXPORT_SYMBOL(__tcp_md5_do_lookup); 1047 1048 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk, 1049 const union tcp_md5_addr *addr, 1050 int family, u8 prefixlen, 1051 int l3index) 1052 { 1053 const struct tcp_sock *tp = tcp_sk(sk); 1054 struct tcp_md5sig_key *key; 1055 unsigned int size = sizeof(struct in_addr); 1056 const struct tcp_md5sig_info *md5sig; 1057 1058 /* caller either holds rcu_read_lock() or socket lock */ 1059 md5sig = rcu_dereference_check(tp->md5sig_info, 1060 lockdep_sock_is_held(sk)); 1061 if (!md5sig) 1062 return NULL; 1063 #if IS_ENABLED(CONFIG_IPV6) 1064 if (family == AF_INET6) 1065 size = sizeof(struct in6_addr); 1066 #endif 1067 hlist_for_each_entry_rcu(key, &md5sig->head, node) { 1068 if (key->family != family) 1069 continue; 1070 if (key->l3index && key->l3index != l3index) 1071 continue; 1072 if (!memcmp(&key->addr, addr, size) && 1073 key->prefixlen == prefixlen) 1074 return key; 1075 } 1076 return NULL; 
1077 } 1078 1079 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk, 1080 const struct sock *addr_sk) 1081 { 1082 const union tcp_md5_addr *addr; 1083 int l3index; 1084 1085 l3index = l3mdev_master_ifindex_by_index(sock_net(sk), 1086 addr_sk->sk_bound_dev_if); 1087 addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr; 1088 return tcp_md5_do_lookup(sk, l3index, addr, AF_INET); 1089 } 1090 EXPORT_SYMBOL(tcp_v4_md5_lookup); 1091 1092 /* This can be called on a newly created socket, from other files */ 1093 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, 1094 int family, u8 prefixlen, int l3index, 1095 const u8 *newkey, u8 newkeylen, gfp_t gfp) 1096 { 1097 /* Add Key to the list */ 1098 struct tcp_md5sig_key *key; 1099 struct tcp_sock *tp = tcp_sk(sk); 1100 struct tcp_md5sig_info *md5sig; 1101 1102 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index); 1103 if (key) { 1104 /* Pre-existing entry - just update that one. */ 1105 memcpy(key->key, newkey, newkeylen); 1106 key->keylen = newkeylen; 1107 return 0; 1108 } 1109 1110 md5sig = rcu_dereference_protected(tp->md5sig_info, 1111 lockdep_sock_is_held(sk)); 1112 if (!md5sig) { 1113 md5sig = kmalloc(sizeof(*md5sig), gfp); 1114 if (!md5sig) 1115 return -ENOMEM; 1116 1117 sk_nocaps_add(sk, NETIF_F_GSO_MASK); 1118 INIT_HLIST_HEAD(&md5sig->head); 1119 rcu_assign_pointer(tp->md5sig_info, md5sig); 1120 } 1121 1122 key = sock_kmalloc(sk, sizeof(*key), gfp); 1123 if (!key) 1124 return -ENOMEM; 1125 if (!tcp_alloc_md5sig_pool()) { 1126 sock_kfree_s(sk, key, sizeof(*key)); 1127 return -ENOMEM; 1128 } 1129 1130 memcpy(key->key, newkey, newkeylen); 1131 key->keylen = newkeylen; 1132 key->family = family; 1133 key->prefixlen = prefixlen; 1134 key->l3index = l3index; 1135 memcpy(&key->addr, addr, 1136 (family == AF_INET6) ? 
sizeof(struct in6_addr) : 1137 sizeof(struct in_addr)); 1138 hlist_add_head_rcu(&key->node, &md5sig->head); 1139 return 0; 1140 } 1141 EXPORT_SYMBOL(tcp_md5_do_add); 1142 1143 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family, 1144 u8 prefixlen, int l3index) 1145 { 1146 struct tcp_md5sig_key *key; 1147 1148 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index); 1149 if (!key) 1150 return -ENOENT; 1151 hlist_del_rcu(&key->node); 1152 atomic_sub(sizeof(*key), &sk->sk_omem_alloc); 1153 kfree_rcu(key, rcu); 1154 return 0; 1155 } 1156 EXPORT_SYMBOL(tcp_md5_do_del); 1157 1158 static void tcp_clear_md5_list(struct sock *sk) 1159 { 1160 struct tcp_sock *tp = tcp_sk(sk); 1161 struct tcp_md5sig_key *key; 1162 struct hlist_node *n; 1163 struct tcp_md5sig_info *md5sig; 1164 1165 md5sig = rcu_dereference_protected(tp->md5sig_info, 1); 1166 1167 hlist_for_each_entry_safe(key, n, &md5sig->head, node) { 1168 hlist_del_rcu(&key->node); 1169 atomic_sub(sizeof(*key), &sk->sk_omem_alloc); 1170 kfree_rcu(key, rcu); 1171 } 1172 } 1173 1174 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname, 1175 char __user *optval, int optlen) 1176 { 1177 struct tcp_md5sig cmd; 1178 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr; 1179 const union tcp_md5_addr *addr; 1180 u8 prefixlen = 32; 1181 int l3index = 0; 1182 1183 if (optlen < sizeof(cmd)) 1184 return -EINVAL; 1185 1186 if (copy_from_user(&cmd, optval, sizeof(cmd))) 1187 return -EFAULT; 1188 1189 if (sin->sin_family != AF_INET) 1190 return -EINVAL; 1191 1192 if (optname == TCP_MD5SIG_EXT && 1193 cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) { 1194 prefixlen = cmd.tcpm_prefixlen; 1195 if (prefixlen > 32) 1196 return -EINVAL; 1197 } 1198 1199 if (optname == TCP_MD5SIG_EXT && 1200 cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) { 1201 struct net_device *dev; 1202 1203 rcu_read_lock(); 1204 dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex); 1205 if (dev && netif_is_l3_master(dev)) 1206 l3index = dev->ifindex; 1207 1208 rcu_read_unlock(); 1209 1210 /* ok to reference set/not set outside of rcu; 1211 * right now device MUST be an L3 master 1212 */ 1213 if (!dev || !l3index) 1214 return -EINVAL; 1215 } 1216 1217 addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr; 1218 1219 if (!cmd.tcpm_keylen) 1220 return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index); 1221 1222 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN) 1223 return -EINVAL; 1224 1225 return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, 1226 cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL); 1227 } 1228 1229 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp, 1230 __be32 daddr, __be32 saddr, 1231 const struct tcphdr *th, int nbytes) 1232 { 1233 struct tcp4_pseudohdr *bp; 1234 struct scatterlist sg; 1235 struct tcphdr *_th; 1236 1237 bp = hp->scratch; 1238 bp->saddr = saddr; 1239 bp->daddr = daddr; 1240 bp->pad = 0; 1241 bp->protocol = IPPROTO_TCP; 1242 bp->len = cpu_to_be16(nbytes); 1243 1244 _th = (struct tcphdr *)(bp + 1); 1245 memcpy(_th, th, sizeof(*th)); 1246 _th->check = 0; 1247 1248 sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th)); 1249 ahash_request_set_crypt(hp->md5_req, &sg, NULL, 1250 sizeof(*bp) + sizeof(*th)); 1251 return crypto_ahash_update(hp->md5_req); 1252 } 1253 1254 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, 1255 __be32 daddr, __be32 saddr, const struct tcphdr *th) 1256 { 1257 struct tcp_md5sig_pool *hp; 1258 struct ahash_request *req; 1259 1260 hp = tcp_get_md5sig_pool(); 1261 if 
(!hp) 1262 goto clear_hash_noput; 1263 req = hp->md5_req; 1264 1265 if (crypto_ahash_init(req)) 1266 goto clear_hash; 1267 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2)) 1268 goto clear_hash; 1269 if (tcp_md5_hash_key(hp, key)) 1270 goto clear_hash; 1271 ahash_request_set_crypt(req, NULL, md5_hash, 0); 1272 if (crypto_ahash_final(req)) 1273 goto clear_hash; 1274 1275 tcp_put_md5sig_pool(); 1276 return 0; 1277 1278 clear_hash: 1279 tcp_put_md5sig_pool(); 1280 clear_hash_noput: 1281 memset(md5_hash, 0, 16); 1282 return 1; 1283 } 1284 1285 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key, 1286 const struct sock *sk, 1287 const struct sk_buff *skb) 1288 { 1289 struct tcp_md5sig_pool *hp; 1290 struct ahash_request *req; 1291 const struct tcphdr *th = tcp_hdr(skb); 1292 __be32 saddr, daddr; 1293 1294 if (sk) { /* valid for establish/request sockets */ 1295 saddr = sk->sk_rcv_saddr; 1296 daddr = sk->sk_daddr; 1297 } else { 1298 const struct iphdr *iph = ip_hdr(skb); 1299 saddr = iph->saddr; 1300 daddr = iph->daddr; 1301 } 1302 1303 hp = tcp_get_md5sig_pool(); 1304 if (!hp) 1305 goto clear_hash_noput; 1306 req = hp->md5_req; 1307 1308 if (crypto_ahash_init(req)) 1309 goto clear_hash; 1310 1311 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len)) 1312 goto clear_hash; 1313 if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2)) 1314 goto clear_hash; 1315 if (tcp_md5_hash_key(hp, key)) 1316 goto clear_hash; 1317 ahash_request_set_crypt(req, NULL, md5_hash, 0); 1318 if (crypto_ahash_final(req)) 1319 goto clear_hash; 1320 1321 tcp_put_md5sig_pool(); 1322 return 0; 1323 1324 clear_hash: 1325 tcp_put_md5sig_pool(); 1326 clear_hash_noput: 1327 memset(md5_hash, 0, 16); 1328 return 1; 1329 } 1330 EXPORT_SYMBOL(tcp_v4_md5_hash_skb); 1331 1332 #endif 1333 1334 /* Called with rcu_read_lock() */ 1335 static bool tcp_v4_inbound_md5_hash(const struct sock *sk, 1336 const struct sk_buff *skb, 1337 int dif, int sdif) 1338 { 1339 #ifdef CONFIG_TCP_MD5SIG 1340 /* 1341 * This gets called for each TCP segment that arrives 1342 * so we want to be efficient. 1343 * We have 3 drop cases: 1344 * o No MD5 hash and one expected. 1345 * o MD5 hash and we're not expecting one. 1346 * o MD5 hash and its wrong. 1347 */ 1348 const __u8 *hash_location = NULL; 1349 struct tcp_md5sig_key *hash_expected; 1350 const struct iphdr *iph = ip_hdr(skb); 1351 const struct tcphdr *th = tcp_hdr(skb); 1352 const union tcp_md5_addr *addr; 1353 unsigned char newhash[16]; 1354 int genhash, l3index; 1355 1356 /* sdif set, means packet ingressed via a device 1357 * in an L3 domain and dif is set to the l3mdev 1358 */ 1359 l3index = sdif ? dif : 0; 1360 1361 addr = (union tcp_md5_addr *)&iph->saddr; 1362 hash_expected = tcp_md5_do_lookup(sk, l3index, addr, AF_INET); 1363 hash_location = tcp_parse_md5sig_option(th); 1364 1365 /* We've parsed the options - do we have a hash? */ 1366 if (!hash_expected && !hash_location) 1367 return false; 1368 1369 if (hash_expected && !hash_location) { 1370 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND); 1371 return true; 1372 } 1373 1374 if (!hash_expected && hash_location) { 1375 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED); 1376 return true; 1377 } 1378 1379 /* Okay, so this is hash_expected and hash_location - 1380 * so we need to calculate the checksum. 
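 * The segment is hashed again with the expected key and compared with
 * the MD5 option carried in the packet; a mismatch bumps
 * LINUX_MIB_TCPMD5FAILURE and tells the caller to drop the segment.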
1381 */ 1382 genhash = tcp_v4_md5_hash_skb(newhash, 1383 hash_expected, 1384 NULL, skb); 1385 1386 if (genhash || memcmp(hash_location, newhash, 16) != 0) { 1387 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE); 1388 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s L3 index %d\n", 1389 &iph->saddr, ntohs(th->source), 1390 &iph->daddr, ntohs(th->dest), 1391 genhash ? " tcp_v4_calc_md5_hash failed" 1392 : "", l3index); 1393 return true; 1394 } 1395 return false; 1396 #endif 1397 return false; 1398 } 1399 1400 static void tcp_v4_init_req(struct request_sock *req, 1401 const struct sock *sk_listener, 1402 struct sk_buff *skb) 1403 { 1404 struct inet_request_sock *ireq = inet_rsk(req); 1405 struct net *net = sock_net(sk_listener); 1406 1407 sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr); 1408 sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr); 1409 RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb)); 1410 } 1411 1412 static struct dst_entry *tcp_v4_route_req(const struct sock *sk, 1413 struct flowi *fl, 1414 const struct request_sock *req) 1415 { 1416 return inet_csk_route_req(sk, &fl->u.ip4, req); 1417 } 1418 1419 struct request_sock_ops tcp_request_sock_ops __read_mostly = { 1420 .family = PF_INET, 1421 .obj_size = sizeof(struct tcp_request_sock), 1422 .rtx_syn_ack = tcp_rtx_synack, 1423 .send_ack = tcp_v4_reqsk_send_ack, 1424 .destructor = tcp_v4_reqsk_destructor, 1425 .send_reset = tcp_v4_send_reset, 1426 .syn_ack_timeout = tcp_syn_ack_timeout, 1427 }; 1428 1429 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { 1430 .mss_clamp = TCP_MSS_DEFAULT, 1431 #ifdef CONFIG_TCP_MD5SIG 1432 .req_md5_lookup = tcp_v4_md5_lookup, 1433 .calc_md5_hash = tcp_v4_md5_hash_skb, 1434 #endif 1435 .init_req = tcp_v4_init_req, 1436 #ifdef CONFIG_SYN_COOKIES 1437 .cookie_init_seq = cookie_v4_init_sequence, 1438 #endif 1439 .route_req = tcp_v4_route_req, 1440 .init_seq = tcp_v4_init_seq, 1441 .init_ts_off = tcp_v4_init_ts_off, 1442 .send_synack = tcp_v4_send_synack, 1443 }; 1444 1445 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) 1446 { 1447 /* Never answer to SYNs send to broadcast or multicast */ 1448 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) 1449 goto drop; 1450 1451 return tcp_conn_request(&tcp_request_sock_ops, 1452 &tcp_request_sock_ipv4_ops, sk, skb); 1453 1454 drop: 1455 tcp_listendrop(sk); 1456 return 0; 1457 } 1458 EXPORT_SYMBOL(tcp_v4_conn_request); 1459 1460 1461 /* 1462 * The three way handshake has completed - we got a valid synack - 1463 * now create the new socket. 
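 * The child socket inherits addresses, IP options and the MD5 key (if
 * any) from the request and the listener; all failure paths end up in
 * tcp_listendrop() on the listener.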
1464 */ 1465 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, 1466 struct request_sock *req, 1467 struct dst_entry *dst, 1468 struct request_sock *req_unhash, 1469 bool *own_req) 1470 { 1471 struct inet_request_sock *ireq; 1472 struct inet_sock *newinet; 1473 struct tcp_sock *newtp; 1474 struct sock *newsk; 1475 #ifdef CONFIG_TCP_MD5SIG 1476 const union tcp_md5_addr *addr; 1477 struct tcp_md5sig_key *key; 1478 int l3index; 1479 #endif 1480 struct ip_options_rcu *inet_opt; 1481 1482 if (sk_acceptq_is_full(sk)) 1483 goto exit_overflow; 1484 1485 newsk = tcp_create_openreq_child(sk, req, skb); 1486 if (!newsk) 1487 goto exit_nonewsk; 1488 1489 newsk->sk_gso_type = SKB_GSO_TCPV4; 1490 inet_sk_rx_dst_set(newsk, skb); 1491 1492 newtp = tcp_sk(newsk); 1493 newinet = inet_sk(newsk); 1494 ireq = inet_rsk(req); 1495 sk_daddr_set(newsk, ireq->ir_rmt_addr); 1496 sk_rcv_saddr_set(newsk, ireq->ir_loc_addr); 1497 newsk->sk_bound_dev_if = ireq->ir_iif; 1498 newinet->inet_saddr = ireq->ir_loc_addr; 1499 inet_opt = rcu_dereference(ireq->ireq_opt); 1500 RCU_INIT_POINTER(newinet->inet_opt, inet_opt); 1501 newinet->mc_index = inet_iif(skb); 1502 newinet->mc_ttl = ip_hdr(skb)->ttl; 1503 newinet->rcv_tos = ip_hdr(skb)->tos; 1504 inet_csk(newsk)->icsk_ext_hdr_len = 0; 1505 if (inet_opt) 1506 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen; 1507 newinet->inet_id = prandom_u32(); 1508 1509 if (!dst) { 1510 dst = inet_csk_route_child_sock(sk, newsk, req); 1511 if (!dst) 1512 goto put_and_exit; 1513 } else { 1514 /* syncookie case : see end of cookie_v4_check() */ 1515 } 1516 sk_setup_caps(newsk, dst); 1517 1518 tcp_ca_openreq_child(newsk, dst); 1519 1520 tcp_sync_mss(newsk, dst_mtu(dst)); 1521 newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst)); 1522 1523 tcp_initialize_rcv_mss(newsk); 1524 1525 #ifdef CONFIG_TCP_MD5SIG 1526 l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif); 1527 /* Copy over the MD5 key from the original socket */ 1528 addr = (union tcp_md5_addr *)&newinet->inet_daddr; 1529 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET); 1530 if (key) { 1531 /* 1532 * We're using one, so create a matching key 1533 * on the newsk structure. If we fail to get 1534 * memory, then we end up not copying the key 1535 * across. Shucks. 
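 * The copy uses GFP_ATOMIC because this runs from the packet receive
 * path; GSO is also disabled on the child via sk_nocaps_add(), as it
 * is for any socket that carries an MD5 key.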
1536 */ 1537 tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index, 1538 key->key, key->keylen, GFP_ATOMIC); 1539 sk_nocaps_add(newsk, NETIF_F_GSO_MASK); 1540 } 1541 #endif 1542 1543 if (__inet_inherit_port(sk, newsk) < 0) 1544 goto put_and_exit; 1545 *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash)); 1546 if (likely(*own_req)) { 1547 tcp_move_syn(newtp, req); 1548 ireq->ireq_opt = NULL; 1549 } else { 1550 newinet->inet_opt = NULL; 1551 } 1552 return newsk; 1553 1554 exit_overflow: 1555 NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); 1556 exit_nonewsk: 1557 dst_release(dst); 1558 exit: 1559 tcp_listendrop(sk); 1560 return NULL; 1561 put_and_exit: 1562 newinet->inet_opt = NULL; 1563 inet_csk_prepare_forced_close(newsk); 1564 tcp_done(newsk); 1565 goto exit; 1566 } 1567 EXPORT_SYMBOL(tcp_v4_syn_recv_sock); 1568 1569 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb) 1570 { 1571 #ifdef CONFIG_SYN_COOKIES 1572 const struct tcphdr *th = tcp_hdr(skb); 1573 1574 if (!th->syn) 1575 sk = cookie_v4_check(sk, skb); 1576 #endif 1577 return sk; 1578 } 1579 1580 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph, 1581 struct tcphdr *th, u32 *cookie) 1582 { 1583 u16 mss = 0; 1584 #ifdef CONFIG_SYN_COOKIES 1585 mss = tcp_get_syncookie_mss(&tcp_request_sock_ops, 1586 &tcp_request_sock_ipv4_ops, sk, th); 1587 if (mss) { 1588 *cookie = __cookie_v4_init_sequence(iph, th, &mss); 1589 tcp_synq_overflow(sk); 1590 } 1591 #endif 1592 return mss; 1593 } 1594 1595 /* The socket must have it's spinlock held when we get 1596 * here, unless it is a TCP_LISTEN socket. 1597 * 1598 * We have a potential double-lock case here, so even when 1599 * doing backlog processing we use the BH locking scheme. 1600 * This is because we cannot sleep with the original spinlock 1601 * held. 1602 */ 1603 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) 1604 { 1605 struct sock *rsk; 1606 1607 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ 1608 struct dst_entry *dst = sk->sk_rx_dst; 1609 1610 sock_rps_save_rxhash(sk, skb); 1611 sk_mark_napi_id(sk, skb); 1612 if (dst) { 1613 if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif || 1614 !dst->ops->check(dst, 0)) { 1615 dst_release(dst); 1616 sk->sk_rx_dst = NULL; 1617 } 1618 } 1619 tcp_rcv_established(sk, skb); 1620 return 0; 1621 } 1622 1623 if (tcp_checksum_complete(skb)) 1624 goto csum_err; 1625 1626 if (sk->sk_state == TCP_LISTEN) { 1627 struct sock *nsk = tcp_v4_cookie_check(sk, skb); 1628 1629 if (!nsk) 1630 goto discard; 1631 if (nsk != sk) { 1632 if (tcp_child_process(sk, nsk, skb)) { 1633 rsk = nsk; 1634 goto reset; 1635 } 1636 return 0; 1637 } 1638 } else 1639 sock_rps_save_rxhash(sk, skb); 1640 1641 if (tcp_rcv_state_process(sk, skb)) { 1642 rsk = sk; 1643 goto reset; 1644 } 1645 return 0; 1646 1647 reset: 1648 tcp_v4_send_reset(rsk, skb); 1649 discard: 1650 kfree_skb(skb); 1651 /* Be careful here. If this function gets more complicated and 1652 * gcc suffers from register pressure on the x86, sk (in %ebx) 1653 * might be destroyed here. This current version compiles correctly, 1654 * but you have been warned. 
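 * The csum_err label below only adds to TCP_MIB_CSUMERRORS and
 * TCP_MIB_INERRS before discarding the segment the same way.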
1655 */ 1656 return 0; 1657 1658 csum_err: 1659 TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); 1660 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); 1661 goto discard; 1662 } 1663 EXPORT_SYMBOL(tcp_v4_do_rcv); 1664 1665 int tcp_v4_early_demux(struct sk_buff *skb) 1666 { 1667 const struct iphdr *iph; 1668 const struct tcphdr *th; 1669 struct sock *sk; 1670 1671 if (skb->pkt_type != PACKET_HOST) 1672 return 0; 1673 1674 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr))) 1675 return 0; 1676 1677 iph = ip_hdr(skb); 1678 th = tcp_hdr(skb); 1679 1680 if (th->doff < sizeof(struct tcphdr) / 4) 1681 return 0; 1682 1683 sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo, 1684 iph->saddr, th->source, 1685 iph->daddr, ntohs(th->dest), 1686 skb->skb_iif, inet_sdif(skb)); 1687 if (sk) { 1688 skb->sk = sk; 1689 skb->destructor = sock_edemux; 1690 if (sk_fullsock(sk)) { 1691 struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst); 1692 1693 if (dst) 1694 dst = dst_check(dst, 0); 1695 if (dst && 1696 inet_sk(sk)->rx_dst_ifindex == skb->skb_iif) 1697 skb_dst_set_noref(skb, dst); 1698 } 1699 } 1700 return 0; 1701 } 1702 1703 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb) 1704 { 1705 u32 limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf); 1706 struct skb_shared_info *shinfo; 1707 const struct tcphdr *th; 1708 struct tcphdr *thtail; 1709 struct sk_buff *tail; 1710 unsigned int hdrlen; 1711 bool fragstolen; 1712 u32 gso_segs; 1713 int delta; 1714 1715 /* In case all data was pulled from skb frags (in __pskb_pull_tail()), 1716 * we can fix skb->truesize to its real value to avoid future drops. 1717 * This is valid because skb is not yet charged to the socket. 1718 * It has been noticed pure SACK packets were sometimes dropped 1719 * (if cooked by drivers without copybreak feature). 1720 */ 1721 skb_condense(skb); 1722 1723 skb_dst_drop(skb); 1724 1725 if (unlikely(tcp_checksum_complete(skb))) { 1726 bh_unlock_sock(sk); 1727 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); 1728 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); 1729 return true; 1730 } 1731 1732 /* Attempt coalescing to last skb in backlog, even if we are 1733 * above the limits. 1734 * This is okay because skb capacity is limited to MAX_SKB_FRAGS. 
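 * Coalescing is only attempted when the sequence numbers are contiguous
 * and the headers agree (TOS, flags, doff and TCP options); otherwise
 * the skb is queued on the backlog as-is.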
1735 */ 1736 th = (const struct tcphdr *)skb->data; 1737 hdrlen = th->doff * 4; 1738 shinfo = skb_shinfo(skb); 1739 1740 if (!shinfo->gso_size) 1741 shinfo->gso_size = skb->len - hdrlen; 1742 1743 if (!shinfo->gso_segs) 1744 shinfo->gso_segs = 1; 1745 1746 tail = sk->sk_backlog.tail; 1747 if (!tail) 1748 goto no_coalesce; 1749 thtail = (struct tcphdr *)tail->data; 1750 1751 if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq || 1752 TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield || 1753 ((TCP_SKB_CB(tail)->tcp_flags | 1754 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) || 1755 !((TCP_SKB_CB(tail)->tcp_flags & 1756 TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) || 1757 ((TCP_SKB_CB(tail)->tcp_flags ^ 1758 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) || 1759 #ifdef CONFIG_TLS_DEVICE 1760 tail->decrypted != skb->decrypted || 1761 #endif 1762 thtail->doff != th->doff || 1763 memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th))) 1764 goto no_coalesce; 1765 1766 __skb_pull(skb, hdrlen); 1767 if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) { 1768 thtail->window = th->window; 1769 1770 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq; 1771 1772 if (after(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq)) 1773 TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq; 1774 1775 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and 1776 * thtail->fin, so that the fast path in tcp_rcv_established() 1777 * is not entered if we append a packet with a FIN. 1778 * SYN, RST, URG are not present. 1779 * ACK is set on both packets. 1780 * PSH : we do not really care in TCP stack, 1781 * at least for 'GRO' packets. 1782 */ 1783 thtail->fin |= th->fin; 1784 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags; 1785 1786 if (TCP_SKB_CB(skb)->has_rxtstamp) { 1787 TCP_SKB_CB(tail)->has_rxtstamp = true; 1788 tail->tstamp = skb->tstamp; 1789 skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp; 1790 } 1791 1792 /* Not as strict as GRO. We only need to carry mss max value */ 1793 skb_shinfo(tail)->gso_size = max(shinfo->gso_size, 1794 skb_shinfo(tail)->gso_size); 1795 1796 gso_segs = skb_shinfo(tail)->gso_segs + shinfo->gso_segs; 1797 skb_shinfo(tail)->gso_segs = min_t(u32, gso_segs, 0xFFFF); 1798 1799 sk->sk_backlog.len += delta; 1800 __NET_INC_STATS(sock_net(sk), 1801 LINUX_MIB_TCPBACKLOGCOALESCE); 1802 kfree_skb_partial(skb, fragstolen); 1803 return false; 1804 } 1805 __skb_push(skb, hdrlen); 1806 1807 no_coalesce: 1808 /* Only socket owner can try to collapse/prune rx queues 1809 * to reduce memory overhead, so add a little headroom here. 1810 * Few sockets backlog are possibly concurrently non empty. 
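 * The limit is sk_rcvbuf + sk_sndbuf plus the 64KB of slack added
 * below; an overflow is accounted as LINUX_MIB_TCPBACKLOGDROP.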
1811 */ 1812 limit += 64*1024; 1813 1814 if (unlikely(sk_add_backlog(sk, skb, limit))) { 1815 bh_unlock_sock(sk); 1816 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP); 1817 return true; 1818 } 1819 return false; 1820 } 1821 EXPORT_SYMBOL(tcp_add_backlog); 1822 1823 int tcp_filter(struct sock *sk, struct sk_buff *skb) 1824 { 1825 struct tcphdr *th = (struct tcphdr *)skb->data; 1826 1827 return sk_filter_trim_cap(sk, skb, th->doff * 4); 1828 } 1829 EXPORT_SYMBOL(tcp_filter); 1830 1831 static void tcp_v4_restore_cb(struct sk_buff *skb) 1832 { 1833 memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4, 1834 sizeof(struct inet_skb_parm)); 1835 } 1836 1837 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph, 1838 const struct tcphdr *th) 1839 { 1840 /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB() 1841 * barrier() makes sure compiler wont play fool^Waliasing games. 1842 */ 1843 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb), 1844 sizeof(struct inet_skb_parm)); 1845 barrier(); 1846 1847 TCP_SKB_CB(skb)->seq = ntohl(th->seq); 1848 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + 1849 skb->len - th->doff * 4); 1850 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); 1851 TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th); 1852 TCP_SKB_CB(skb)->tcp_tw_isn = 0; 1853 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph); 1854 TCP_SKB_CB(skb)->sacked = 0; 1855 TCP_SKB_CB(skb)->has_rxtstamp = 1856 skb->tstamp || skb_hwtstamps(skb)->hwtstamp; 1857 } 1858 1859 /* 1860 * From tcp_input.c 1861 */ 1862 1863 int tcp_v4_rcv(struct sk_buff *skb) 1864 { 1865 struct net *net = dev_net(skb->dev); 1866 struct sk_buff *skb_to_free; 1867 int sdif = inet_sdif(skb); 1868 int dif = inet_iif(skb); 1869 const struct iphdr *iph; 1870 const struct tcphdr *th; 1871 bool refcounted; 1872 struct sock *sk; 1873 int ret; 1874 1875 if (skb->pkt_type != PACKET_HOST) 1876 goto discard_it; 1877 1878 /* Count it even if it's bad */ 1879 __TCP_INC_STATS(net, TCP_MIB_INSEGS); 1880 1881 if (!pskb_may_pull(skb, sizeof(struct tcphdr))) 1882 goto discard_it; 1883 1884 th = (const struct tcphdr *)skb->data; 1885 1886 if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) 1887 goto bad_packet; 1888 if (!pskb_may_pull(skb, th->doff * 4)) 1889 goto discard_it; 1890 1891 /* An explanation is required here, I think. 1892 * Packet length and doff are validated by header prediction, 1893 * provided case of th->doff==0 is eliminated. 1894 * So, we defer the checks. 
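 * skb_checksum_init() below accepts what the hardware already verified
 * and otherwise only folds in the pseudo-header; the full checksum is
 * verified later via tcp_checksum_complete() where needed.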
*/ 1895 1896 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo)) 1897 goto csum_error; 1898 1899 th = (const struct tcphdr *)skb->data; 1900 iph = ip_hdr(skb); 1901 lookup: 1902 sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source, 1903 th->dest, sdif, &refcounted); 1904 if (!sk) 1905 goto no_tcp_socket; 1906 1907 process: 1908 if (sk->sk_state == TCP_TIME_WAIT) 1909 goto do_time_wait; 1910 1911 if (sk->sk_state == TCP_NEW_SYN_RECV) { 1912 struct request_sock *req = inet_reqsk(sk); 1913 bool req_stolen = false; 1914 struct sock *nsk; 1915 1916 sk = req->rsk_listener; 1917 if (unlikely(tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))) { 1918 sk_drops_add(sk, skb); 1919 reqsk_put(req); 1920 goto discard_it; 1921 } 1922 if (tcp_checksum_complete(skb)) { 1923 reqsk_put(req); 1924 goto csum_error; 1925 } 1926 if (unlikely(sk->sk_state != TCP_LISTEN)) { 1927 inet_csk_reqsk_queue_drop_and_put(sk, req); 1928 goto lookup; 1929 } 1930 /* We own a reference on the listener, increase it again 1931 * as we might lose it too soon. 1932 */ 1933 sock_hold(sk); 1934 refcounted = true; 1935 nsk = NULL; 1936 if (!tcp_filter(sk, skb)) { 1937 th = (const struct tcphdr *)skb->data; 1938 iph = ip_hdr(skb); 1939 tcp_v4_fill_cb(skb, iph, th); 1940 nsk = tcp_check_req(sk, skb, req, false, &req_stolen); 1941 } 1942 if (!nsk) { 1943 reqsk_put(req); 1944 if (req_stolen) { 1945 /* Another cpu got exclusive access to req 1946 * and created a full blown socket. 1947 * Try to feed this packet to this socket 1948 * instead of discarding it. 1949 */ 1950 tcp_v4_restore_cb(skb); 1951 sock_put(sk); 1952 goto lookup; 1953 } 1954 goto discard_and_relse; 1955 } 1956 if (nsk == sk) { 1957 reqsk_put(req); 1958 tcp_v4_restore_cb(skb); 1959 } else if (tcp_child_process(sk, nsk, skb)) { 1960 tcp_v4_send_reset(nsk, skb); 1961 goto discard_and_relse; 1962 } else { 1963 sock_put(sk); 1964 return 0; 1965 } 1966 } 1967 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) { 1968 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); 1969 goto discard_and_relse; 1970 } 1971 1972 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) 1973 goto discard_and_relse; 1974 1975 if (tcp_v4_inbound_md5_hash(sk, skb, dif, sdif)) 1976 goto discard_and_relse; 1977 1978 nf_reset_ct(skb); 1979 1980 if (tcp_filter(sk, skb)) 1981 goto discard_and_relse; 1982 th = (const struct tcphdr *)skb->data; 1983 iph = ip_hdr(skb); 1984 tcp_v4_fill_cb(skb, iph, th); 1985 1986 skb->dev = NULL; 1987 1988 if (sk->sk_state == TCP_LISTEN) { 1989 ret = tcp_v4_do_rcv(sk, skb); 1990 goto put_and_return; 1991 } 1992 1993 sk_incoming_cpu_update(sk); 1994 1995 bh_lock_sock_nested(sk); 1996 tcp_segs_in(tcp_sk(sk), skb); 1997 ret = 0; 1998 if (!sock_owned_by_user(sk)) { 1999 skb_to_free = sk->sk_rx_skb_cache; 2000 sk->sk_rx_skb_cache = NULL; 2001 ret = tcp_v4_do_rcv(sk, skb); 2002 } else { 2003 if (tcp_add_backlog(sk, skb)) 2004 goto discard_and_relse; 2005 skb_to_free = NULL; 2006 } 2007 bh_unlock_sock(sk); 2008 if (skb_to_free) 2009 __kfree_skb(skb_to_free); 2010 2011 put_and_return: 2012 if (refcounted) 2013 sock_put(sk); 2014 2015 return ret; 2016 2017 no_tcp_socket: 2018 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) 2019 goto discard_it; 2020 2021 tcp_v4_fill_cb(skb, iph, th); 2022 2023 if (tcp_checksum_complete(skb)) { 2024 csum_error: 2025 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS); 2026 bad_packet: 2027 __TCP_INC_STATS(net, TCP_MIB_INERRS); 2028 } else { 2029 tcp_v4_send_reset(NULL, skb); 2030 } 2031 2032 discard_it: 2033 /* Discard frame. 
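 * The error paths above all end up here: the skb is freed and 0 is
 * returned to the IP layer.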
*/ 2034 kfree_skb(skb); 2035 return 0; 2036 2037 discard_and_relse: 2038 sk_drops_add(sk, skb); 2039 if (refcounted) 2040 sock_put(sk); 2041 goto discard_it; 2042 2043 do_time_wait: 2044 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { 2045 inet_twsk_put(inet_twsk(sk)); 2046 goto discard_it; 2047 } 2048 2049 tcp_v4_fill_cb(skb, iph, th); 2050 2051 if (tcp_checksum_complete(skb)) { 2052 inet_twsk_put(inet_twsk(sk)); 2053 goto csum_error; 2054 } 2055 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) { 2056 case TCP_TW_SYN: { 2057 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev), 2058 &tcp_hashinfo, skb, 2059 __tcp_hdrlen(th), 2060 iph->saddr, th->source, 2061 iph->daddr, th->dest, 2062 inet_iif(skb), 2063 sdif); 2064 if (sk2) { 2065 inet_twsk_deschedule_put(inet_twsk(sk)); 2066 sk = sk2; 2067 tcp_v4_restore_cb(skb); 2068 refcounted = false; 2069 goto process; 2070 } 2071 } 2072 /* to ACK */ 2073 /* fall through */ 2074 case TCP_TW_ACK: 2075 tcp_v4_timewait_ack(sk, skb); 2076 break; 2077 case TCP_TW_RST: 2078 tcp_v4_send_reset(sk, skb); 2079 inet_twsk_deschedule_put(inet_twsk(sk)); 2080 goto discard_it; 2081 case TCP_TW_SUCCESS:; 2082 } 2083 goto discard_it; 2084 } 2085 2086 static struct timewait_sock_ops tcp_timewait_sock_ops = { 2087 .twsk_obj_size = sizeof(struct tcp_timewait_sock), 2088 .twsk_unique = tcp_twsk_unique, 2089 .twsk_destructor= tcp_twsk_destructor, 2090 }; 2091 2092 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) 2093 { 2094 struct dst_entry *dst = skb_dst(skb); 2095 2096 if (dst && dst_hold_safe(dst)) { 2097 sk->sk_rx_dst = dst; 2098 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif; 2099 } 2100 } 2101 EXPORT_SYMBOL(inet_sk_rx_dst_set); 2102 2103 const struct inet_connection_sock_af_ops ipv4_specific = { 2104 .queue_xmit = ip_queue_xmit, 2105 .send_check = tcp_v4_send_check, 2106 .rebuild_header = inet_sk_rebuild_header, 2107 .sk_rx_dst_set = inet_sk_rx_dst_set, 2108 .conn_request = tcp_v4_conn_request, 2109 .syn_recv_sock = tcp_v4_syn_recv_sock, 2110 .net_header_len = sizeof(struct iphdr), 2111 .setsockopt = ip_setsockopt, 2112 .getsockopt = ip_getsockopt, 2113 .addr2sockaddr = inet_csk_addr2sockaddr, 2114 .sockaddr_len = sizeof(struct sockaddr_in), 2115 #ifdef CONFIG_COMPAT 2116 .compat_setsockopt = compat_ip_setsockopt, 2117 .compat_getsockopt = compat_ip_getsockopt, 2118 #endif 2119 .mtu_reduced = tcp_v4_mtu_reduced, 2120 }; 2121 EXPORT_SYMBOL(ipv4_specific); 2122 2123 #ifdef CONFIG_TCP_MD5SIG 2124 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = { 2125 .md5_lookup = tcp_v4_md5_lookup, 2126 .calc_md5_hash = tcp_v4_md5_hash_skb, 2127 .md5_parse = tcp_v4_parse_md5_keys, 2128 }; 2129 #endif 2130 2131 /* NOTE: A lot of things set to zero explicitly by call to 2132 * sk_alloc() so need not be done here. 2133 */ 2134 static int tcp_v4_init_sock(struct sock *sk) 2135 { 2136 struct inet_connection_sock *icsk = inet_csk(sk); 2137 2138 tcp_init_sock(sk); 2139 2140 icsk->icsk_af_ops = &ipv4_specific; 2141 2142 #ifdef CONFIG_TCP_MD5SIG 2143 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific; 2144 #endif 2145 2146 return 0; 2147 } 2148 2149 void tcp_v4_destroy_sock(struct sock *sk) 2150 { 2151 struct tcp_sock *tp = tcp_sk(sk); 2152 2153 trace_tcp_destroy_sock(sk); 2154 2155 tcp_clear_xmit_timers(sk); 2156 2157 tcp_cleanup_congestion_control(sk); 2158 2159 tcp_cleanup_ulp(sk); 2160 2161 /* Cleanup up the write buffer. 
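* This frees any skbs still sitting in the send and retransmit queues.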
*/ 2162 tcp_write_queue_purge(sk); 2163 2164 /* Check if we want to disable active TFO */ 2165 tcp_fastopen_active_disable_ofo_check(sk); 2166 2167 /* Cleans up our, hopefully empty, out_of_order_queue. */ 2168 skb_rbtree_purge(&tp->out_of_order_queue); 2169 2170 #ifdef CONFIG_TCP_MD5SIG 2171 /* Clean up the MD5 key list, if any */ 2172 if (tp->md5sig_info) { 2173 tcp_clear_md5_list(sk); 2174 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu); 2175 tp->md5sig_info = NULL; 2176 } 2177 #endif 2178 2179 /* Clean up a referenced TCP bind bucket. */ 2180 if (inet_csk(sk)->icsk_bind_hash) 2181 inet_put_port(sk); 2182 2183 BUG_ON(rcu_access_pointer(tp->fastopen_rsk)); 2184 2185 /* If socket is aborted during connect operation */ 2186 tcp_free_fastopen_req(tp); 2187 tcp_fastopen_destroy_cipher(sk); 2188 tcp_saved_syn_free(tp); 2189 2190 sk_sockets_allocated_dec(sk); 2191 } 2192 EXPORT_SYMBOL(tcp_v4_destroy_sock); 2193 2194 #ifdef CONFIG_PROC_FS 2195 /* Proc filesystem TCP sock list dumping. */ 2196 2197 /* 2198 * Get next listener socket follow cur. If cur is NULL, get first socket 2199 * starting from bucket given in st->bucket; when st->bucket is zero the 2200 * very first socket in the hash table is returned. 2201 */ 2202 static void *listening_get_next(struct seq_file *seq, void *cur) 2203 { 2204 struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file)); 2205 struct tcp_iter_state *st = seq->private; 2206 struct net *net = seq_file_net(seq); 2207 struct inet_listen_hashbucket *ilb; 2208 struct hlist_nulls_node *node; 2209 struct sock *sk = cur; 2210 2211 if (!sk) { 2212 get_head: 2213 ilb = &tcp_hashinfo.listening_hash[st->bucket]; 2214 spin_lock(&ilb->lock); 2215 sk = sk_nulls_head(&ilb->nulls_head); 2216 st->offset = 0; 2217 goto get_sk; 2218 } 2219 ilb = &tcp_hashinfo.listening_hash[st->bucket]; 2220 ++st->num; 2221 ++st->offset; 2222 2223 sk = sk_nulls_next(sk); 2224 get_sk: 2225 sk_nulls_for_each_from(sk, node) { 2226 if (!net_eq(sock_net(sk), net)) 2227 continue; 2228 if (sk->sk_family == afinfo->family) 2229 return sk; 2230 } 2231 spin_unlock(&ilb->lock); 2232 st->offset = 0; 2233 if (++st->bucket < INET_LHTABLE_SIZE) 2234 goto get_head; 2235 return NULL; 2236 } 2237 2238 static void *listening_get_idx(struct seq_file *seq, loff_t *pos) 2239 { 2240 struct tcp_iter_state *st = seq->private; 2241 void *rc; 2242 2243 st->bucket = 0; 2244 st->offset = 0; 2245 rc = listening_get_next(seq, NULL); 2246 2247 while (rc && *pos) { 2248 rc = listening_get_next(seq, rc); 2249 --*pos; 2250 } 2251 return rc; 2252 } 2253 2254 static inline bool empty_bucket(const struct tcp_iter_state *st) 2255 { 2256 return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain); 2257 } 2258 2259 /* 2260 * Get first established socket starting from bucket given in st->bucket. 2261 * If st->bucket is zero, the very first socket in the hash is returned. 
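* The bucket lock is left held when a socket is returned; it is released by
* established_get_next() when it advances to another bucket, or by tcp_seq_stop().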
2262 */ 2263 static void *established_get_first(struct seq_file *seq) 2264 { 2265 struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file)); 2266 struct tcp_iter_state *st = seq->private; 2267 struct net *net = seq_file_net(seq); 2268 void *rc = NULL; 2269 2270 st->offset = 0; 2271 for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) { 2272 struct sock *sk; 2273 struct hlist_nulls_node *node; 2274 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket); 2275 2276 /* Lockless fast path for the common case of empty buckets */ 2277 if (empty_bucket(st)) 2278 continue; 2279 2280 spin_lock_bh(lock); 2281 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) { 2282 if (sk->sk_family != afinfo->family || 2283 !net_eq(sock_net(sk), net)) { 2284 continue; 2285 } 2286 rc = sk; 2287 goto out; 2288 } 2289 spin_unlock_bh(lock); 2290 } 2291 out: 2292 return rc; 2293 } 2294 2295 static void *established_get_next(struct seq_file *seq, void *cur) 2296 { 2297 struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file)); 2298 struct sock *sk = cur; 2299 struct hlist_nulls_node *node; 2300 struct tcp_iter_state *st = seq->private; 2301 struct net *net = seq_file_net(seq); 2302 2303 ++st->num; 2304 ++st->offset; 2305 2306 sk = sk_nulls_next(sk); 2307 2308 sk_nulls_for_each_from(sk, node) { 2309 if (sk->sk_family == afinfo->family && 2310 net_eq(sock_net(sk), net)) 2311 return sk; 2312 } 2313 2314 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); 2315 ++st->bucket; 2316 return established_get_first(seq); 2317 } 2318 2319 static void *established_get_idx(struct seq_file *seq, loff_t pos) 2320 { 2321 struct tcp_iter_state *st = seq->private; 2322 void *rc; 2323 2324 st->bucket = 0; 2325 rc = established_get_first(seq); 2326 2327 while (rc && pos) { 2328 rc = established_get_next(seq, rc); 2329 --pos; 2330 } 2331 return rc; 2332 } 2333 2334 static void *tcp_get_idx(struct seq_file *seq, loff_t pos) 2335 { 2336 void *rc; 2337 struct tcp_iter_state *st = seq->private; 2338 2339 st->state = TCP_SEQ_STATE_LISTENING; 2340 rc = listening_get_idx(seq, &pos); 2341 2342 if (!rc) { 2343 st->state = TCP_SEQ_STATE_ESTABLISHED; 2344 rc = established_get_idx(seq, pos); 2345 } 2346 2347 return rc; 2348 } 2349 2350 static void *tcp_seek_last_pos(struct seq_file *seq) 2351 { 2352 struct tcp_iter_state *st = seq->private; 2353 int offset = st->offset; 2354 int orig_num = st->num; 2355 void *rc = NULL; 2356 2357 switch (st->state) { 2358 case TCP_SEQ_STATE_LISTENING: 2359 if (st->bucket >= INET_LHTABLE_SIZE) 2360 break; 2361 st->state = TCP_SEQ_STATE_LISTENING; 2362 rc = listening_get_next(seq, NULL); 2363 while (offset-- && rc) 2364 rc = listening_get_next(seq, rc); 2365 if (rc) 2366 break; 2367 st->bucket = 0; 2368 st->state = TCP_SEQ_STATE_ESTABLISHED; 2369 /* Fallthrough */ 2370 case TCP_SEQ_STATE_ESTABLISHED: 2371 if (st->bucket > tcp_hashinfo.ehash_mask) 2372 break; 2373 rc = established_get_first(seq); 2374 while (offset-- && rc) 2375 rc = established_get_next(seq, rc); 2376 } 2377 2378 st->num = orig_num; 2379 2380 return rc; 2381 } 2382 2383 void *tcp_seq_start(struct seq_file *seq, loff_t *pos) 2384 { 2385 struct tcp_iter_state *st = seq->private; 2386 void *rc; 2387 2388 if (*pos && *pos == st->last_pos) { 2389 rc = tcp_seek_last_pos(seq); 2390 if (rc) 2391 goto out; 2392 } 2393 2394 st->state = TCP_SEQ_STATE_LISTENING; 2395 st->num = 0; 2396 st->bucket = 0; 2397 st->offset = 0; 2398 rc = *pos ? 
tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; 2399 2400 out: 2401 st->last_pos = *pos; 2402 return rc; 2403 } 2404 EXPORT_SYMBOL(tcp_seq_start); 2405 2406 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) 2407 { 2408 struct tcp_iter_state *st = seq->private; 2409 void *rc = NULL; 2410 2411 if (v == SEQ_START_TOKEN) { 2412 rc = tcp_get_idx(seq, 0); 2413 goto out; 2414 } 2415 2416 switch (st->state) { 2417 case TCP_SEQ_STATE_LISTENING: 2418 rc = listening_get_next(seq, v); 2419 if (!rc) { 2420 st->state = TCP_SEQ_STATE_ESTABLISHED; 2421 st->bucket = 0; 2422 st->offset = 0; 2423 rc = established_get_first(seq); 2424 } 2425 break; 2426 case TCP_SEQ_STATE_ESTABLISHED: 2427 rc = established_get_next(seq, v); 2428 break; 2429 } 2430 out: 2431 ++*pos; 2432 st->last_pos = *pos; 2433 return rc; 2434 } 2435 EXPORT_SYMBOL(tcp_seq_next); 2436 2437 void tcp_seq_stop(struct seq_file *seq, void *v) 2438 { 2439 struct tcp_iter_state *st = seq->private; 2440 2441 switch (st->state) { 2442 case TCP_SEQ_STATE_LISTENING: 2443 if (v != SEQ_START_TOKEN) 2444 spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock); 2445 break; 2446 case TCP_SEQ_STATE_ESTABLISHED: 2447 if (v) 2448 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); 2449 break; 2450 } 2451 } 2452 EXPORT_SYMBOL(tcp_seq_stop); 2453 2454 static void get_openreq4(const struct request_sock *req, 2455 struct seq_file *f, int i) 2456 { 2457 const struct inet_request_sock *ireq = inet_rsk(req); 2458 long delta = req->rsk_timer.expires - jiffies; 2459 2460 seq_printf(f, "%4d: %08X:%04X %08X:%04X" 2461 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK", 2462 i, 2463 ireq->ir_loc_addr, 2464 ireq->ir_num, 2465 ireq->ir_rmt_addr, 2466 ntohs(ireq->ir_rmt_port), 2467 TCP_SYN_RECV, 2468 0, 0, /* could print option size, but that is af dependent. */ 2469 1, /* timers active (only the expire timer) */ 2470 jiffies_delta_to_clock_t(delta), 2471 req->num_timeout, 2472 from_kuid_munged(seq_user_ns(f), 2473 sock_i_uid(req->rsk_listener)), 2474 0, /* non standard timer */ 2475 0, /* open_requests have no inode */ 2476 0, 2477 req); 2478 } 2479 2480 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) 2481 { 2482 int timer_active; 2483 unsigned long timer_expires; 2484 const struct tcp_sock *tp = tcp_sk(sk); 2485 const struct inet_connection_sock *icsk = inet_csk(sk); 2486 const struct inet_sock *inet = inet_sk(sk); 2487 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq; 2488 __be32 dest = inet->inet_daddr; 2489 __be32 src = inet->inet_rcv_saddr; 2490 __u16 destp = ntohs(inet->inet_dport); 2491 __u16 srcp = ntohs(inet->inet_sport); 2492 int rx_queue; 2493 int state; 2494 2495 if (icsk->icsk_pending == ICSK_TIME_RETRANS || 2496 icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT || 2497 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { 2498 timer_active = 1; 2499 timer_expires = icsk->icsk_timeout; 2500 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) { 2501 timer_active = 4; 2502 timer_expires = icsk->icsk_timeout; 2503 } else if (timer_pending(&sk->sk_timer)) { 2504 timer_active = 2; 2505 timer_expires = sk->sk_timer.expires; 2506 } else { 2507 timer_active = 0; 2508 timer_expires = jiffies; 2509 } 2510 2511 state = inet_sk_state_load(sk); 2512 if (state == TCP_LISTEN) 2513 rx_queue = READ_ONCE(sk->sk_ack_backlog); 2514 else 2515 /* Because we don't lock the socket, 2516 * we might find a transient negative value. 
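* so the computed receive-queue size is clamped at zero below.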
2517 */ 2518 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) - 2519 READ_ONCE(tp->copied_seq), 0); 2520 2521 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX " 2522 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d", 2523 i, src, srcp, dest, destp, state, 2524 READ_ONCE(tp->write_seq) - tp->snd_una, 2525 rx_queue, 2526 timer_active, 2527 jiffies_delta_to_clock_t(timer_expires - jiffies), 2528 icsk->icsk_retransmits, 2529 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)), 2530 icsk->icsk_probes_out, 2531 sock_i_ino(sk), 2532 refcount_read(&sk->sk_refcnt), sk, 2533 jiffies_to_clock_t(icsk->icsk_rto), 2534 jiffies_to_clock_t(icsk->icsk_ack.ato), 2535 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk), 2536 tp->snd_cwnd, 2537 state == TCP_LISTEN ? 2538 fastopenq->max_qlen : 2539 (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh)); 2540 } 2541 2542 static void get_timewait4_sock(const struct inet_timewait_sock *tw, 2543 struct seq_file *f, int i) 2544 { 2545 long delta = tw->tw_timer.expires - jiffies; 2546 __be32 dest, src; 2547 __u16 destp, srcp; 2548 2549 dest = tw->tw_daddr; 2550 src = tw->tw_rcv_saddr; 2551 destp = ntohs(tw->tw_dport); 2552 srcp = ntohs(tw->tw_sport); 2553 2554 seq_printf(f, "%4d: %08X:%04X %08X:%04X" 2555 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK", 2556 i, src, srcp, dest, destp, tw->tw_substate, 0, 0, 2557 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0, 2558 refcount_read(&tw->tw_refcnt), tw); 2559 } 2560 2561 #define TMPSZ 150 2562 2563 static int tcp4_seq_show(struct seq_file *seq, void *v) 2564 { 2565 struct tcp_iter_state *st; 2566 struct sock *sk = v; 2567 2568 seq_setwidth(seq, TMPSZ - 1); 2569 if (v == SEQ_START_TOKEN) { 2570 seq_puts(seq, " sl local_address rem_address st tx_queue " 2571 "rx_queue tr tm->when retrnsmt uid timeout " 2572 "inode"); 2573 goto out; 2574 } 2575 st = seq->private; 2576 2577 if (sk->sk_state == TCP_TIME_WAIT) 2578 get_timewait4_sock(v, seq, st->num); 2579 else if (sk->sk_state == TCP_NEW_SYN_RECV) 2580 get_openreq4(v, seq, st->num); 2581 else 2582 get_tcp4_sock(v, seq, st->num); 2583 out: 2584 seq_pad(seq, '\n'); 2585 return 0; 2586 } 2587 2588 static const struct seq_operations tcp4_seq_ops = { 2589 .show = tcp4_seq_show, 2590 .start = tcp_seq_start, 2591 .next = tcp_seq_next, 2592 .stop = tcp_seq_stop, 2593 }; 2594 2595 static struct tcp_seq_afinfo tcp4_seq_afinfo = { 2596 .family = AF_INET, 2597 }; 2598 2599 static int __net_init tcp4_proc_init_net(struct net *net) 2600 { 2601 if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops, 2602 sizeof(struct tcp_iter_state), &tcp4_seq_afinfo)) 2603 return -ENOMEM; 2604 return 0; 2605 } 2606 2607 static void __net_exit tcp4_proc_exit_net(struct net *net) 2608 { 2609 remove_proc_entry("tcp", net->proc_net); 2610 } 2611 2612 static struct pernet_operations tcp4_net_ops = { 2613 .init = tcp4_proc_init_net, 2614 .exit = tcp4_proc_exit_net, 2615 }; 2616 2617 int __init tcp4_proc_init(void) 2618 { 2619 return register_pernet_subsys(&tcp4_net_ops); 2620 } 2621 2622 void tcp4_proc_exit(void) 2623 { 2624 unregister_pernet_subsys(&tcp4_net_ops); 2625 } 2626 #endif /* CONFIG_PROC_FS */ 2627 2628 struct proto tcp_prot = { 2629 .name = "TCP", 2630 .owner = THIS_MODULE, 2631 .close = tcp_close, 2632 .pre_connect = tcp_v4_pre_connect, 2633 .connect = tcp_v4_connect, 2634 .disconnect = tcp_disconnect, 2635 .accept = inet_csk_accept, 2636 .ioctl = tcp_ioctl, 2637 .init = tcp_v4_init_sock, 2638 .destroy = tcp_v4_destroy_sock, 2639 .shutdown = tcp_shutdown, 2640 
.setsockopt = tcp_setsockopt, 2641 .getsockopt = tcp_getsockopt, 2642 .keepalive = tcp_set_keepalive, 2643 .recvmsg = tcp_recvmsg, 2644 .sendmsg = tcp_sendmsg, 2645 .sendpage = tcp_sendpage, 2646 .backlog_rcv = tcp_v4_do_rcv, 2647 .release_cb = tcp_release_cb, 2648 .hash = inet_hash, 2649 .unhash = inet_unhash, 2650 .get_port = inet_csk_get_port, 2651 .enter_memory_pressure = tcp_enter_memory_pressure, 2652 .leave_memory_pressure = tcp_leave_memory_pressure, 2653 .stream_memory_free = tcp_stream_memory_free, 2654 .sockets_allocated = &tcp_sockets_allocated, 2655 .orphan_count = &tcp_orphan_count, 2656 .memory_allocated = &tcp_memory_allocated, 2657 .memory_pressure = &tcp_memory_pressure, 2658 .sysctl_mem = sysctl_tcp_mem, 2659 .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem), 2660 .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), 2661 .max_header = MAX_TCP_HEADER, 2662 .obj_size = sizeof(struct tcp_sock), 2663 .slab_flags = SLAB_TYPESAFE_BY_RCU, 2664 .twsk_prot = &tcp_timewait_sock_ops, 2665 .rsk_prot = &tcp_request_sock_ops, 2666 .h.hashinfo = &tcp_hashinfo, 2667 .no_autobind = true, 2668 #ifdef CONFIG_COMPAT 2669 .compat_setsockopt = compat_tcp_setsockopt, 2670 .compat_getsockopt = compat_tcp_getsockopt, 2671 #endif 2672 .diag_destroy = tcp_abort, 2673 }; 2674 EXPORT_SYMBOL(tcp_prot); 2675 2676 static void __net_exit tcp_sk_exit(struct net *net) 2677 { 2678 int cpu; 2679 2680 if (net->ipv4.tcp_congestion_control) 2681 bpf_module_put(net->ipv4.tcp_congestion_control, 2682 net->ipv4.tcp_congestion_control->owner); 2683 2684 for_each_possible_cpu(cpu) 2685 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu)); 2686 free_percpu(net->ipv4.tcp_sk); 2687 } 2688 2689 static int __net_init tcp_sk_init(struct net *net) 2690 { 2691 int res, cpu, cnt; 2692 2693 net->ipv4.tcp_sk = alloc_percpu(struct sock *); 2694 if (!net->ipv4.tcp_sk) 2695 return -ENOMEM; 2696 2697 for_each_possible_cpu(cpu) { 2698 struct sock *sk; 2699 2700 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW, 2701 IPPROTO_TCP, net); 2702 if (res) 2703 goto fail; 2704 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); 2705 2706 /* Please enforce IP_DF and IPID==0 for RST and 2707 * ACK sent in SYN-RECV and TIME-WAIT state. 
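* Setting IP_PMTUDISC_DO below sets DF on these packets, which also keeps
* their IP ID at zero.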
2708 */ 2709 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO; 2710 2711 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk; 2712 } 2713 2714 net->ipv4.sysctl_tcp_ecn = 2; 2715 net->ipv4.sysctl_tcp_ecn_fallback = 1; 2716 2717 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS; 2718 net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS; 2719 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD; 2720 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL; 2721 net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS; 2722 2723 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME; 2724 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES; 2725 net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL; 2726 2727 net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES; 2728 net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES; 2729 net->ipv4.sysctl_tcp_syncookies = 1; 2730 net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH; 2731 net->ipv4.sysctl_tcp_retries1 = TCP_RETR1; 2732 net->ipv4.sysctl_tcp_retries2 = TCP_RETR2; 2733 net->ipv4.sysctl_tcp_orphan_retries = 0; 2734 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT; 2735 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX; 2736 net->ipv4.sysctl_tcp_tw_reuse = 2; 2737 net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1; 2738 2739 cnt = tcp_hashinfo.ehash_mask + 1; 2740 net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2; 2741 net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo; 2742 2743 net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128); 2744 net->ipv4.sysctl_tcp_sack = 1; 2745 net->ipv4.sysctl_tcp_window_scaling = 1; 2746 net->ipv4.sysctl_tcp_timestamps = 1; 2747 net->ipv4.sysctl_tcp_early_retrans = 3; 2748 net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION; 2749 net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */ 2750 net->ipv4.sysctl_tcp_retrans_collapse = 1; 2751 net->ipv4.sysctl_tcp_max_reordering = 300; 2752 net->ipv4.sysctl_tcp_dsack = 1; 2753 net->ipv4.sysctl_tcp_app_win = 31; 2754 net->ipv4.sysctl_tcp_adv_win_scale = 1; 2755 net->ipv4.sysctl_tcp_frto = 2; 2756 net->ipv4.sysctl_tcp_moderate_rcvbuf = 1; 2757 /* This limits the percentage of the congestion window which we 2758 * will allow a single TSO frame to consume. Building TSO frames 2759 * which are too large can cause TCP streams to be bursty. 
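* With the default divisor of 3, a single TSO frame is limited to roughly
* one third of the congestion window.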
2760 */ 2761 net->ipv4.sysctl_tcp_tso_win_divisor = 3; 2762 /* Default TSQ limit of 16 TSO segments */ 2763 net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536; 2764 /* rfc5961 challenge ack rate limiting */ 2765 net->ipv4.sysctl_tcp_challenge_ack_limit = 1000; 2766 net->ipv4.sysctl_tcp_min_tso_segs = 2; 2767 net->ipv4.sysctl_tcp_min_rtt_wlen = 300; 2768 net->ipv4.sysctl_tcp_autocorking = 1; 2769 net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2; 2770 net->ipv4.sysctl_tcp_pacing_ss_ratio = 200; 2771 net->ipv4.sysctl_tcp_pacing_ca_ratio = 120; 2772 if (net != &init_net) { 2773 memcpy(net->ipv4.sysctl_tcp_rmem, 2774 init_net.ipv4.sysctl_tcp_rmem, 2775 sizeof(init_net.ipv4.sysctl_tcp_rmem)); 2776 memcpy(net->ipv4.sysctl_tcp_wmem, 2777 init_net.ipv4.sysctl_tcp_wmem, 2778 sizeof(init_net.ipv4.sysctl_tcp_wmem)); 2779 } 2780 net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC; 2781 net->ipv4.sysctl_tcp_comp_sack_nr = 44; 2782 net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; 2783 spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock); 2784 net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60; 2785 atomic_set(&net->ipv4.tfo_active_disable_times, 0); 2786 2787 /* Reno is always built in */ 2788 if (!net_eq(net, &init_net) && 2789 bpf_try_module_get(init_net.ipv4.tcp_congestion_control, 2790 init_net.ipv4.tcp_congestion_control->owner)) 2791 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control; 2792 else 2793 net->ipv4.tcp_congestion_control = &tcp_reno; 2794 2795 return 0; 2796 fail: 2797 tcp_sk_exit(net); 2798 2799 return res; 2800 } 2801 2802 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list) 2803 { 2804 struct net *net; 2805 2806 inet_twsk_purge(&tcp_hashinfo, AF_INET); 2807 2808 list_for_each_entry(net, net_exit_list, exit_list) 2809 tcp_fastopen_ctx_destroy(net); 2810 } 2811 2812 static struct pernet_operations __net_initdata tcp_sk_ops = { 2813 .init = tcp_sk_init, 2814 .exit = tcp_sk_exit, 2815 .exit_batch = tcp_sk_exit_batch, 2816 }; 2817 2818 void __init tcp_v4_init(void) 2819 { 2820 if (register_pernet_subsys(&tcp_sk_ops)) 2821 panic("Failed to create the TCP control socket.\n"); 2822 } 2823