/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 *		IPv4 specific functions
 *
 *
 *		code split from:
 *		linux/ipv4/tcp.c
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *		See tcp.c for author information
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

/*
 * Changes:
 *		David S. Miller	:	New socket lookup architecture.
 *					This code is dedicated to John Dyson.
 *		David S. Miller :	Change semantics of established hash,
 *					half is devoted to TIME_WAIT sockets
 *					and the rest go in the other half.
 *		Andi Kleen :		Add support for syncookies and fixed
 *					some bugs: ip options weren't passed to
 *					the TCP layer, missed a check for an
 *					ACK bit.
 *		Andi Kleen :		Implemented fast path mtu discovery.
 *					Fixed many serious bugs in the
 *					request_sock handling and moved
 *					most of it into the af independent code.
 *					Added tail drop and some other bugfixes.
 *					Added new listen semantics.
 *		Mike McLagan	:	Routing by source
 *	Juan Jose Ciarlante:		ip_dynaddr bits
 *		Andi Kleen:		various fixes.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year
 *					coma.
 *	Andi Kleen		:	Fix new listen.
 *	Andi Kleen		:	Fix accept error reporting.
 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
 *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
 *					a single port at the same time.
51 */ 52 53 #define pr_fmt(fmt) "TCP: " fmt 54 55 #include <linux/bottom_half.h> 56 #include <linux/types.h> 57 #include <linux/fcntl.h> 58 #include <linux/module.h> 59 #include <linux/random.h> 60 #include <linux/cache.h> 61 #include <linux/jhash.h> 62 #include <linux/init.h> 63 #include <linux/times.h> 64 #include <linux/slab.h> 65 66 #include <net/net_namespace.h> 67 #include <net/icmp.h> 68 #include <net/inet_hashtables.h> 69 #include <net/tcp.h> 70 #include <net/transp_v6.h> 71 #include <net/ipv6.h> 72 #include <net/inet_common.h> 73 #include <net/timewait_sock.h> 74 #include <net/xfrm.h> 75 #include <net/secure_seq.h> 76 #include <net/busy_poll.h> 77 78 #include <linux/inet.h> 79 #include <linux/ipv6.h> 80 #include <linux/stddef.h> 81 #include <linux/proc_fs.h> 82 #include <linux/seq_file.h> 83 84 #include <linux/crypto.h> 85 #include <linux/scatterlist.h> 86 87 int sysctl_tcp_tw_reuse __read_mostly; 88 int sysctl_tcp_low_latency __read_mostly; 89 EXPORT_SYMBOL(sysctl_tcp_low_latency); 90 91 #ifdef CONFIG_TCP_MD5SIG 92 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, 93 __be32 daddr, __be32 saddr, const struct tcphdr *th); 94 #endif 95 96 struct inet_hashinfo tcp_hashinfo; 97 EXPORT_SYMBOL(tcp_hashinfo); 98 99 static __u32 tcp_v4_init_sequence(const struct sk_buff *skb) 100 { 101 return secure_tcp_sequence_number(ip_hdr(skb)->daddr, 102 ip_hdr(skb)->saddr, 103 tcp_hdr(skb)->dest, 104 tcp_hdr(skb)->source); 105 } 106 107 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) 108 { 109 const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw); 110 struct tcp_sock *tp = tcp_sk(sk); 111 112 /* With PAWS, it is safe from the viewpoint 113 of data integrity. Even without PAWS it is safe provided sequence 114 spaces do not overlap i.e. at data rates <= 80Mbit/sec. 115 116 Actually, the idea is close to VJ's one, only timestamp cache is 117 held not per host, but per port pair and TW bucket is used as state 118 holder. 119 120 If TW bucket has been already destroyed we fall back to VJ's scheme 121 and use initial timestamp retrieved from peer table. 122 */ 123 if (tcptw->tw_ts_recent_stamp && 124 (!twp || (sysctl_tcp_tw_reuse && 125 get_seconds() - tcptw->tw_ts_recent_stamp > 1))) { 126 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2; 127 if (tp->write_seq == 0) 128 tp->write_seq = 1; 129 tp->rx_opt.ts_recent = tcptw->tw_ts_recent; 130 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp; 131 sock_hold(sktw); 132 return 1; 133 } 134 135 return 0; 136 } 137 EXPORT_SYMBOL_GPL(tcp_twsk_unique); 138 139 /* This will initiate an outgoing connection. 
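 *
 * For orientation only, a minimal userspace sketch (plain BSD socket API,
 * not built as part of this file) of the blocking connect() that reaches
 * tcp_v4_connect() via inet_stream_connect(); the address and port below
 * are made up for illustration:
 *
 *	#include <arpa/inet.h>
 *	#include <netinet/in.h>
 *	#include <string.h>
 *	#include <sys/socket.h>
 *
 *	int connect_v4_example(void)
 *	{
 *		struct sockaddr_in sin;
 *		int fd = socket(AF_INET, SOCK_STREAM, 0);
 *
 *		if (fd < 0)
 *			return -1;
 *		memset(&sin, 0, sizeof(sin));
 *		sin.sin_family = AF_INET;	// anything else gets -EAFNOSUPPORT below
 *		sin.sin_port = htons(80);
 *		inet_pton(AF_INET, "192.0.2.1", &sin.sin_addr);
 *		return connect(fd, (struct sockaddr *)&sin, sizeof(sin));
 *	}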
*/ 140 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) 141 { 142 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr; 143 struct inet_sock *inet = inet_sk(sk); 144 struct tcp_sock *tp = tcp_sk(sk); 145 __be16 orig_sport, orig_dport; 146 __be32 daddr, nexthop; 147 struct flowi4 *fl4; 148 struct rtable *rt; 149 int err; 150 struct ip_options_rcu *inet_opt; 151 152 if (addr_len < sizeof(struct sockaddr_in)) 153 return -EINVAL; 154 155 if (usin->sin_family != AF_INET) 156 return -EAFNOSUPPORT; 157 158 nexthop = daddr = usin->sin_addr.s_addr; 159 inet_opt = rcu_dereference_protected(inet->inet_opt, 160 sock_owned_by_user(sk)); 161 if (inet_opt && inet_opt->opt.srr) { 162 if (!daddr) 163 return -EINVAL; 164 nexthop = inet_opt->opt.faddr; 165 } 166 167 orig_sport = inet->inet_sport; 168 orig_dport = usin->sin_port; 169 fl4 = &inet->cork.fl.u.ip4; 170 rt = ip_route_connect(fl4, nexthop, inet->inet_saddr, 171 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if, 172 IPPROTO_TCP, 173 orig_sport, orig_dport, sk); 174 if (IS_ERR(rt)) { 175 err = PTR_ERR(rt); 176 if (err == -ENETUNREACH) 177 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES); 178 return err; 179 } 180 181 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) { 182 ip_rt_put(rt); 183 return -ENETUNREACH; 184 } 185 186 if (!inet_opt || !inet_opt->opt.srr) 187 daddr = fl4->daddr; 188 189 if (!inet->inet_saddr) 190 inet->inet_saddr = fl4->saddr; 191 sk_rcv_saddr_set(sk, inet->inet_saddr); 192 193 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) { 194 /* Reset inherited state */ 195 tp->rx_opt.ts_recent = 0; 196 tp->rx_opt.ts_recent_stamp = 0; 197 if (likely(!tp->repair)) 198 tp->write_seq = 0; 199 } 200 201 if (tcp_death_row.sysctl_tw_recycle && 202 !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr) 203 tcp_fetch_timewait_stamp(sk, &rt->dst); 204 205 inet->inet_dport = usin->sin_port; 206 sk_daddr_set(sk, daddr); 207 208 inet_csk(sk)->icsk_ext_hdr_len = 0; 209 if (inet_opt) 210 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen; 211 212 tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT; 213 214 /* Socket identity is still unknown (sport may be zero). 215 * However we set state to SYN-SENT and not releasing socket 216 * lock select source port, enter ourselves into the hash tables and 217 * complete initialization after this. 218 */ 219 tcp_set_state(sk, TCP_SYN_SENT); 220 err = inet_hash_connect(&tcp_death_row, sk); 221 if (err) 222 goto failure; 223 224 sk_set_txhash(sk); 225 226 rt = ip_route_newports(fl4, rt, orig_sport, orig_dport, 227 inet->inet_sport, inet->inet_dport, sk); 228 if (IS_ERR(rt)) { 229 err = PTR_ERR(rt); 230 rt = NULL; 231 goto failure; 232 } 233 /* OK, now commit destination to socket. */ 234 sk->sk_gso_type = SKB_GSO_TCPV4; 235 sk_setup_caps(sk, &rt->dst); 236 237 if (!tp->write_seq && likely(!tp->repair)) 238 tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr, 239 inet->inet_daddr, 240 inet->inet_sport, 241 usin->sin_port); 242 243 inet->inet_id = tp->write_seq ^ jiffies; 244 245 err = tcp_connect(sk); 246 247 rt = NULL; 248 if (err) 249 goto failure; 250 251 return 0; 252 253 failure: 254 /* 255 * This unhashes the socket and releases the local port, 256 * if necessary. 257 */ 258 tcp_set_state(sk, TCP_CLOSE); 259 ip_rt_put(rt); 260 sk->sk_route_caps = 0; 261 inet->inet_dport = 0; 262 return err; 263 } 264 EXPORT_SYMBOL(tcp_v4_connect); 265 266 /* 267 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191. 
 * It can be called through tcp_release_cb() if the socket was owned by the
 * user at the time tcp_v4_err() was called to handle the ICMP message.
 */
void tcp_v4_mtu_reduced(struct sock *sk)
{
	struct dst_entry *dst;
	struct inet_sock *inet = inet_sk(sk);
	u32 mtu = tcp_sk(sk)->mtu_info;

	dst = inet_csk_update_pmtu(sk, mtu);
	if (!dst)
		return;

	/* Something is about to go wrong... Remember the soft error
	 * in case this connection is not able to recover.
	 */
	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
		sk->sk_err_soft = EMSGSIZE;

	mtu = dst_mtu(dst);

	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    ip_sk_accept_pmtu(sk) &&
	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
		tcp_sync_mss(sk, mtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}
EXPORT_SYMBOL(tcp_v4_mtu_reduced);

static void do_redirect(struct sk_buff *skb, struct sock *sk)
{
	struct dst_entry *dst = __sk_dst_check(sk, 0);

	if (dst)
		dst->ops->redirect(dst, sk, skb);
}


/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
void tcp_req_err(struct sock *sk, u32 seq)
{
	struct request_sock *req = inet_reqsk(sk);
	struct net *net = sock_net(sk);

	/* ICMPs are not backlogged, hence we cannot get
	 * an established socket here.
	 */
	WARN_ON(req->sk);

	if (seq != tcp_rsk(req)->snt_isn) {
		NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
	} else {
		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
		NET_INC_STATS_BH(net, LINUX_MIB_LISTENDROPS);
	}
	reqsk_put(req);
}
EXPORT_SYMBOL(tcp_req_err);

/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
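 *
 * From userspace, the errors recorded below surface as sk->sk_err, which an
 * application typically reads after a non-blocking connect() via SO_ERROR.
 * A minimal sketch (plain poll()/getsockopt(), not part of this file); the
 * timeout handling is illustrative only:
 *
 *	#include <errno.h>
 *	#include <poll.h>
 *	#include <sys/socket.h>
 *
 *	// Returns 0 on success, or the pending socket error (for example
 *	// EHOSTUNREACH or ECONNREFUSED after tcp_v4_err() processed a
 *	// matching ICMP error for this connection).
 *	int wait_for_connect(int fd, int timeout_ms)
 *	{
 *		struct pollfd pfd = { .fd = fd, .events = POLLOUT };
 *		int err = 0;
 *		socklen_t len = sizeof(err);
 *
 *		if (poll(&pfd, 1, timeout_ms) <= 0)
 *			return ETIMEDOUT;
 *		if (getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &len) < 0)
 *			return errno;
 *		return err;
 *	}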
353 * 354 */ 355 356 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) 357 { 358 const struct iphdr *iph = (const struct iphdr *)icmp_skb->data; 359 struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2)); 360 struct inet_connection_sock *icsk; 361 struct tcp_sock *tp; 362 struct inet_sock *inet; 363 const int type = icmp_hdr(icmp_skb)->type; 364 const int code = icmp_hdr(icmp_skb)->code; 365 struct sock *sk; 366 struct sk_buff *skb; 367 struct request_sock *fastopen; 368 __u32 seq, snd_una; 369 __u32 remaining; 370 int err; 371 struct net *net = dev_net(icmp_skb->dev); 372 373 sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr, 374 th->dest, iph->saddr, ntohs(th->source), 375 inet_iif(icmp_skb)); 376 if (!sk) { 377 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS); 378 return; 379 } 380 if (sk->sk_state == TCP_TIME_WAIT) { 381 inet_twsk_put(inet_twsk(sk)); 382 return; 383 } 384 seq = ntohl(th->seq); 385 if (sk->sk_state == TCP_NEW_SYN_RECV) 386 return tcp_req_err(sk, seq); 387 388 bh_lock_sock(sk); 389 /* If too many ICMPs get dropped on busy 390 * servers this needs to be solved differently. 391 * We do take care of PMTU discovery (RFC1191) special case : 392 * we can receive locally generated ICMP messages while socket is held. 393 */ 394 if (sock_owned_by_user(sk)) { 395 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)) 396 NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS); 397 } 398 if (sk->sk_state == TCP_CLOSE) 399 goto out; 400 401 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) { 402 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP); 403 goto out; 404 } 405 406 icsk = inet_csk(sk); 407 tp = tcp_sk(sk); 408 /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */ 409 fastopen = tp->fastopen_rsk; 410 snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una; 411 if (sk->sk_state != TCP_LISTEN && 412 !between(seq, snd_una, tp->snd_nxt)) { 413 NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS); 414 goto out; 415 } 416 417 switch (type) { 418 case ICMP_REDIRECT: 419 do_redirect(icmp_skb, sk); 420 goto out; 421 case ICMP_SOURCE_QUENCH: 422 /* Just silently ignore these. */ 423 goto out; 424 case ICMP_PARAMETERPROB: 425 err = EPROTO; 426 break; 427 case ICMP_DEST_UNREACH: 428 if (code > NR_ICMP_UNREACH) 429 goto out; 430 431 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */ 432 /* We are not interested in TCP_LISTEN and open_requests 433 * (SYN-ACKs send out by Linux are always <576bytes so 434 * they should go through unfragmented). 435 */ 436 if (sk->sk_state == TCP_LISTEN) 437 goto out; 438 439 tp->mtu_info = info; 440 if (!sock_owned_by_user(sk)) { 441 tcp_v4_mtu_reduced(sk); 442 } else { 443 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags)) 444 sock_hold(sk); 445 } 446 goto out; 447 } 448 449 err = icmp_err_convert[code].errno; 450 /* check if icmp_skb allows revert of backoff 451 * (see draft-zimmermann-tcp-lcd) */ 452 if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH) 453 break; 454 if (seq != tp->snd_una || !icsk->icsk_retransmits || 455 !icsk->icsk_backoff || fastopen) 456 break; 457 458 if (sock_owned_by_user(sk)) 459 break; 460 461 icsk->icsk_backoff--; 462 icsk->icsk_rto = tp->srtt_us ? 
					__tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
		icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);

		skb = tcp_write_queue_head(sk);
		BUG_ON(!skb);

		remaining = icsk->icsk_rto -
			    min(icsk->icsk_rto,
				tcp_time_stamp - tcp_skb_timestamp(skb));

		if (remaining) {
			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
						  remaining, TCP_RTO_MAX);
		} else {
			/* RTO revert clocked out retransmission.
			 * Will retransmit now */
			tcp_retransmit_timer(sk);
		}

		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->sk_state) {
	case TCP_SYN_SENT:
	case TCP_SYN_RECV:
		/* Only in fast or simultaneous open. If a fast open socket
		 * is already accepted it is treated as a connected one below.
		 */
		if (fastopen && !fastopen->sk)
			break;

		if (!sock_owned_by_user(sk)) {
			sk->sk_err = err;

			sk->sk_error_report(sk);

			tcp_done(sk);
		} else {
			sk->sk_err_soft = err;
		}
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * RFC 1122 4.2.3.9 allows treating only PROTO_UNREACH and
	 * PORT_UNREACH as hard errors (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note that in the modern internet, where routing is unreliable
	 * and broken firewalls sit in every dark corner sending whatever
	 * errors their masters ordered, even these two messages have lost
	 * their original meaning (even Linux sends invalid PORT_UNREACHs).
	 *
	 * Now we are in compliance with RFCs.
	 * --ANK (980905)
	 */

	inet = inet_sk(sk);
	if (!sock_owned_by_user(sk) && inet->recverr) {
		sk->sk_err = err;
		sk->sk_error_report(sk);
	} else	{ /* Only an error on timeout */
		sk->sk_err_soft = err;
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
}

void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
{
	struct tcphdr *th = tcp_hdr(skb);

	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
		skb->csum_start = skb_transport_header(skb) - skb->head;
		skb->csum_offset = offsetof(struct tcphdr, check);
	} else {
		th->check = tcp_v4_check(skb->len, saddr, daddr,
					 csum_partial(th,
						      th->doff << 2,
						      skb->csum));
	}
}

/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
{
	const struct inet_sock *inet = inet_sk(sk);

	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
}
EXPORT_SYMBOL(tcp_v4_send_check);

/*
 *	This routine will send an RST to the other tcp.
 *
 *	Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
 *		      for the reset?
 *	Answer: if a packet caused a RST, it was not destined for a socket
 *		existing in our system; if it did match a socket, it is just
 *		a duplicate segment or a bug in the other side's TCP.
 *		So we build the reply based only on the parameters that
 *		arrived with the segment.
 *	Exception: precedence violation. We do not implement it in any case.
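 *
 *	The sequence-number rule used below matches the "Reset Generation"
 *	rules of RFC 793 and can be restated as a small standalone sketch
 *	(host byte order, illustrative names only, not kernel API):
 *
 *		#include <stdint.h>
 *
 *		static void rst_numbers(int had_ack, uint32_t seq,
 *					uint32_t ack_seq, int syn, int fin,
 *					uint32_t payload_len,
 *					uint32_t *rst_seq, uint32_t *rst_ack)
 *		{
 *			if (had_ack) {
 *				*rst_seq = ack_seq;	// reuse the peer's ACK number
 *				*rst_ack = 0;		// and do not ACK anything
 *			} else {
 *				*rst_seq = 0;
 *				*rst_ack = seq + syn + fin + payload_len;
 *			}
 *		}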
576 */ 577 578 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) 579 { 580 const struct tcphdr *th = tcp_hdr(skb); 581 struct { 582 struct tcphdr th; 583 #ifdef CONFIG_TCP_MD5SIG 584 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)]; 585 #endif 586 } rep; 587 struct ip_reply_arg arg; 588 #ifdef CONFIG_TCP_MD5SIG 589 struct tcp_md5sig_key *key = NULL; 590 const __u8 *hash_location = NULL; 591 unsigned char newhash[16]; 592 int genhash; 593 struct sock *sk1 = NULL; 594 #endif 595 struct net *net; 596 597 /* Never send a reset in response to a reset. */ 598 if (th->rst) 599 return; 600 601 /* If sk not NULL, it means we did a successful lookup and incoming 602 * route had to be correct. prequeue might have dropped our dst. 603 */ 604 if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL) 605 return; 606 607 /* Swap the send and the receive. */ 608 memset(&rep, 0, sizeof(rep)); 609 rep.th.dest = th->source; 610 rep.th.source = th->dest; 611 rep.th.doff = sizeof(struct tcphdr) / 4; 612 rep.th.rst = 1; 613 614 if (th->ack) { 615 rep.th.seq = th->ack_seq; 616 } else { 617 rep.th.ack = 1; 618 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin + 619 skb->len - (th->doff << 2)); 620 } 621 622 memset(&arg, 0, sizeof(arg)); 623 arg.iov[0].iov_base = (unsigned char *)&rep; 624 arg.iov[0].iov_len = sizeof(rep.th); 625 626 net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev); 627 #ifdef CONFIG_TCP_MD5SIG 628 hash_location = tcp_parse_md5sig_option(th); 629 if (sk && sk_fullsock(sk)) { 630 key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *) 631 &ip_hdr(skb)->saddr, AF_INET); 632 } else if (hash_location) { 633 /* 634 * active side is lost. Try to find listening socket through 635 * source port, and then find md5 key through listening socket. 636 * we are not loose security here: 637 * Incoming packet is checked with md5 hash with finding key, 638 * no RST generated if md5 hash doesn't match. 639 */ 640 sk1 = __inet_lookup_listener(net, 641 &tcp_hashinfo, ip_hdr(skb)->saddr, 642 th->source, ip_hdr(skb)->daddr, 643 ntohs(th->source), inet_iif(skb)); 644 /* don't send rst if it can't find key */ 645 if (!sk1) 646 return; 647 rcu_read_lock(); 648 key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *) 649 &ip_hdr(skb)->saddr, AF_INET); 650 if (!key) 651 goto release_sk1; 652 653 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb); 654 if (genhash || memcmp(hash_location, newhash, 16) != 0) 655 goto release_sk1; 656 } 657 658 if (key) { 659 rep.opt[0] = htonl((TCPOPT_NOP << 24) | 660 (TCPOPT_NOP << 16) | 661 (TCPOPT_MD5SIG << 8) | 662 TCPOLEN_MD5SIG); 663 /* Update length and the length the header thinks exists */ 664 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED; 665 rep.th.doff = arg.iov[0].iov_len / 4; 666 667 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1], 668 key, ip_hdr(skb)->saddr, 669 ip_hdr(skb)->daddr, &rep.th); 670 } 671 #endif 672 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr, 673 ip_hdr(skb)->saddr, /* XXX */ 674 arg.iov[0].iov_len, IPPROTO_TCP, 0); 675 arg.csumoffset = offsetof(struct tcphdr, check) / 2; 676 arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0; 677 678 /* When socket is gone, all binding information is lost. 679 * routing might fail in this case. No choice here, if we choose to force 680 * input interface, we will misroute in case of asymmetric route. 
681 */ 682 if (sk) 683 arg.bound_dev_if = sk->sk_bound_dev_if; 684 685 BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) != 686 offsetof(struct inet_timewait_sock, tw_bound_dev_if)); 687 688 arg.tos = ip_hdr(skb)->tos; 689 ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk), 690 skb, &TCP_SKB_CB(skb)->header.h4.opt, 691 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, 692 &arg, arg.iov[0].iov_len); 693 694 TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS); 695 TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS); 696 697 #ifdef CONFIG_TCP_MD5SIG 698 release_sk1: 699 if (sk1) { 700 rcu_read_unlock(); 701 sock_put(sk1); 702 } 703 #endif 704 } 705 706 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states 707 outside socket context is ugly, certainly. What can I do? 708 */ 709 710 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, 711 u32 win, u32 tsval, u32 tsecr, int oif, 712 struct tcp_md5sig_key *key, 713 int reply_flags, u8 tos) 714 { 715 const struct tcphdr *th = tcp_hdr(skb); 716 struct { 717 struct tcphdr th; 718 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2) 719 #ifdef CONFIG_TCP_MD5SIG 720 + (TCPOLEN_MD5SIG_ALIGNED >> 2) 721 #endif 722 ]; 723 } rep; 724 struct ip_reply_arg arg; 725 struct net *net = dev_net(skb_dst(skb)->dev); 726 727 memset(&rep.th, 0, sizeof(struct tcphdr)); 728 memset(&arg, 0, sizeof(arg)); 729 730 arg.iov[0].iov_base = (unsigned char *)&rep; 731 arg.iov[0].iov_len = sizeof(rep.th); 732 if (tsecr) { 733 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | 734 (TCPOPT_TIMESTAMP << 8) | 735 TCPOLEN_TIMESTAMP); 736 rep.opt[1] = htonl(tsval); 737 rep.opt[2] = htonl(tsecr); 738 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED; 739 } 740 741 /* Swap the send and the receive. */ 742 rep.th.dest = th->source; 743 rep.th.source = th->dest; 744 rep.th.doff = arg.iov[0].iov_len / 4; 745 rep.th.seq = htonl(seq); 746 rep.th.ack_seq = htonl(ack); 747 rep.th.ack = 1; 748 rep.th.window = htons(win); 749 750 #ifdef CONFIG_TCP_MD5SIG 751 if (key) { 752 int offset = (tsecr) ? 3 : 0; 753 754 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) | 755 (TCPOPT_NOP << 16) | 756 (TCPOPT_MD5SIG << 8) | 757 TCPOLEN_MD5SIG); 758 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED; 759 rep.th.doff = arg.iov[0].iov_len/4; 760 761 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset], 762 key, ip_hdr(skb)->saddr, 763 ip_hdr(skb)->daddr, &rep.th); 764 } 765 #endif 766 arg.flags = reply_flags; 767 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr, 768 ip_hdr(skb)->saddr, /* XXX */ 769 arg.iov[0].iov_len, IPPROTO_TCP, 0); 770 arg.csumoffset = offsetof(struct tcphdr, check) / 2; 771 if (oif) 772 arg.bound_dev_if = oif; 773 arg.tos = tos; 774 ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk), 775 skb, &TCP_SKB_CB(skb)->header.h4.opt, 776 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, 777 &arg, arg.iov[0].iov_len); 778 779 TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS); 780 } 781 782 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) 783 { 784 struct inet_timewait_sock *tw = inet_twsk(sk); 785 struct tcp_timewait_sock *tcptw = tcp_twsk(sk); 786 787 tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, 788 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, 789 tcp_time_stamp + tcptw->tw_ts_offset, 790 tcptw->tw_ts_recent, 791 tw->tw_bound_dev_if, 792 tcp_twsk_md5_key(tcptw), 793 tw->tw_transparent ? 
IP_REPLY_ARG_NOSRCCHECK : 0, 794 tw->tw_tos 795 ); 796 797 inet_twsk_put(tw); 798 } 799 800 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, 801 struct request_sock *req) 802 { 803 /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV 804 * sk->sk_state == TCP_SYN_RECV -> for Fast Open. 805 */ 806 tcp_v4_send_ack(skb, (sk->sk_state == TCP_LISTEN) ? 807 tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt, 808 tcp_rsk(req)->rcv_nxt, req->rsk_rcv_wnd, 809 tcp_time_stamp, 810 req->ts_recent, 811 0, 812 tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr, 813 AF_INET), 814 inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0, 815 ip_hdr(skb)->tos); 816 } 817 818 /* 819 * Send a SYN-ACK after having received a SYN. 820 * This still operates on a request_sock only, not on a big 821 * socket. 822 */ 823 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, 824 struct flowi *fl, 825 struct request_sock *req, 826 struct tcp_fastopen_cookie *foc, 827 bool attach_req) 828 { 829 const struct inet_request_sock *ireq = inet_rsk(req); 830 struct flowi4 fl4; 831 int err = -1; 832 struct sk_buff *skb; 833 834 /* First, grab a route. */ 835 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL) 836 return -1; 837 838 skb = tcp_make_synack(sk, dst, req, foc, attach_req); 839 840 if (skb) { 841 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr); 842 843 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr, 844 ireq->ir_rmt_addr, 845 ireq->opt); 846 err = net_xmit_eval(err); 847 } 848 849 return err; 850 } 851 852 /* 853 * IPv4 request_sock destructor. 854 */ 855 static void tcp_v4_reqsk_destructor(struct request_sock *req) 856 { 857 kfree(inet_rsk(req)->opt); 858 } 859 860 861 #ifdef CONFIG_TCP_MD5SIG 862 /* 863 * RFC2385 MD5 checksumming requires a mapping of 864 * IP address->MD5 Key. 865 * We need to maintain these in the sk structure. 866 */ 867 868 /* Find the Key structure for an address. 
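 *
 * Keys land in this list via the TCP_MD5SIG socket option (handled by
 * tcp_v4_parse_md5_keys() further down). A minimal userspace sketch,
 * assuming <linux/tcp.h> on this system exposes struct tcp_md5sig; the
 * peer address and key are supplied by the caller:
 *
 *	#include <linux/tcp.h>
 *	#include <netinet/in.h>
 *	#include <string.h>
 *	#include <sys/socket.h>
 *
 *	int set_md5_key(int fd, const struct sockaddr_in *peer,
 *			const void *key, int keylen)
 *	{
 *		struct tcp_md5sig md5;
 *
 *		if (keylen > TCP_MD5SIG_MAXKEYLEN)
 *			return -1;
 *		memset(&md5, 0, sizeof(md5));
 *		memcpy(&md5.tcpm_addr, peer, sizeof(*peer));
 *		md5.tcpm_keylen = keylen;
 *		memcpy(md5.tcpm_key, key, keylen);
 *		// keylen == 0 deletes the key for that peer, see tcp_md5_do_del()
 *		return setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 *	}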
*/ 869 struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk, 870 const union tcp_md5_addr *addr, 871 int family) 872 { 873 const struct tcp_sock *tp = tcp_sk(sk); 874 struct tcp_md5sig_key *key; 875 unsigned int size = sizeof(struct in_addr); 876 const struct tcp_md5sig_info *md5sig; 877 878 /* caller either holds rcu_read_lock() or socket lock */ 879 md5sig = rcu_dereference_check(tp->md5sig_info, 880 sock_owned_by_user(sk) || 881 lockdep_is_held((spinlock_t *)&sk->sk_lock.slock)); 882 if (!md5sig) 883 return NULL; 884 #if IS_ENABLED(CONFIG_IPV6) 885 if (family == AF_INET6) 886 size = sizeof(struct in6_addr); 887 #endif 888 hlist_for_each_entry_rcu(key, &md5sig->head, node) { 889 if (key->family != family) 890 continue; 891 if (!memcmp(&key->addr, addr, size)) 892 return key; 893 } 894 return NULL; 895 } 896 EXPORT_SYMBOL(tcp_md5_do_lookup); 897 898 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk, 899 const struct sock *addr_sk) 900 { 901 const union tcp_md5_addr *addr; 902 903 addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr; 904 return tcp_md5_do_lookup(sk, addr, AF_INET); 905 } 906 EXPORT_SYMBOL(tcp_v4_md5_lookup); 907 908 /* This can be called on a newly created socket, from other files */ 909 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, 910 int family, const u8 *newkey, u8 newkeylen, gfp_t gfp) 911 { 912 /* Add Key to the list */ 913 struct tcp_md5sig_key *key; 914 struct tcp_sock *tp = tcp_sk(sk); 915 struct tcp_md5sig_info *md5sig; 916 917 key = tcp_md5_do_lookup(sk, addr, family); 918 if (key) { 919 /* Pre-existing entry - just update that one. */ 920 memcpy(key->key, newkey, newkeylen); 921 key->keylen = newkeylen; 922 return 0; 923 } 924 925 md5sig = rcu_dereference_protected(tp->md5sig_info, 926 sock_owned_by_user(sk) || 927 lockdep_is_held(&sk->sk_lock.slock)); 928 if (!md5sig) { 929 md5sig = kmalloc(sizeof(*md5sig), gfp); 930 if (!md5sig) 931 return -ENOMEM; 932 933 sk_nocaps_add(sk, NETIF_F_GSO_MASK); 934 INIT_HLIST_HEAD(&md5sig->head); 935 rcu_assign_pointer(tp->md5sig_info, md5sig); 936 } 937 938 key = sock_kmalloc(sk, sizeof(*key), gfp); 939 if (!key) 940 return -ENOMEM; 941 if (!tcp_alloc_md5sig_pool()) { 942 sock_kfree_s(sk, key, sizeof(*key)); 943 return -ENOMEM; 944 } 945 946 memcpy(key->key, newkey, newkeylen); 947 key->keylen = newkeylen; 948 key->family = family; 949 memcpy(&key->addr, addr, 950 (family == AF_INET6) ? 
sizeof(struct in6_addr) : 951 sizeof(struct in_addr)); 952 hlist_add_head_rcu(&key->node, &md5sig->head); 953 return 0; 954 } 955 EXPORT_SYMBOL(tcp_md5_do_add); 956 957 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family) 958 { 959 struct tcp_md5sig_key *key; 960 961 key = tcp_md5_do_lookup(sk, addr, family); 962 if (!key) 963 return -ENOENT; 964 hlist_del_rcu(&key->node); 965 atomic_sub(sizeof(*key), &sk->sk_omem_alloc); 966 kfree_rcu(key, rcu); 967 return 0; 968 } 969 EXPORT_SYMBOL(tcp_md5_do_del); 970 971 static void tcp_clear_md5_list(struct sock *sk) 972 { 973 struct tcp_sock *tp = tcp_sk(sk); 974 struct tcp_md5sig_key *key; 975 struct hlist_node *n; 976 struct tcp_md5sig_info *md5sig; 977 978 md5sig = rcu_dereference_protected(tp->md5sig_info, 1); 979 980 hlist_for_each_entry_safe(key, n, &md5sig->head, node) { 981 hlist_del_rcu(&key->node); 982 atomic_sub(sizeof(*key), &sk->sk_omem_alloc); 983 kfree_rcu(key, rcu); 984 } 985 } 986 987 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval, 988 int optlen) 989 { 990 struct tcp_md5sig cmd; 991 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr; 992 993 if (optlen < sizeof(cmd)) 994 return -EINVAL; 995 996 if (copy_from_user(&cmd, optval, sizeof(cmd))) 997 return -EFAULT; 998 999 if (sin->sin_family != AF_INET) 1000 return -EINVAL; 1001 1002 if (!cmd.tcpm_keylen) 1003 return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr, 1004 AF_INET); 1005 1006 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN) 1007 return -EINVAL; 1008 1009 return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr, 1010 AF_INET, cmd.tcpm_key, cmd.tcpm_keylen, 1011 GFP_KERNEL); 1012 } 1013 1014 static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp, 1015 __be32 daddr, __be32 saddr, int nbytes) 1016 { 1017 struct tcp4_pseudohdr *bp; 1018 struct scatterlist sg; 1019 1020 bp = &hp->md5_blk.ip4; 1021 1022 /* 1023 * 1. 
the TCP pseudo-header (in the order: source IP address, 1024 * destination IP address, zero-padded protocol number, and 1025 * segment length) 1026 */ 1027 bp->saddr = saddr; 1028 bp->daddr = daddr; 1029 bp->pad = 0; 1030 bp->protocol = IPPROTO_TCP; 1031 bp->len = cpu_to_be16(nbytes); 1032 1033 sg_init_one(&sg, bp, sizeof(*bp)); 1034 return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp)); 1035 } 1036 1037 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, 1038 __be32 daddr, __be32 saddr, const struct tcphdr *th) 1039 { 1040 struct tcp_md5sig_pool *hp; 1041 struct hash_desc *desc; 1042 1043 hp = tcp_get_md5sig_pool(); 1044 if (!hp) 1045 goto clear_hash_noput; 1046 desc = &hp->md5_desc; 1047 1048 if (crypto_hash_init(desc)) 1049 goto clear_hash; 1050 if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2)) 1051 goto clear_hash; 1052 if (tcp_md5_hash_header(hp, th)) 1053 goto clear_hash; 1054 if (tcp_md5_hash_key(hp, key)) 1055 goto clear_hash; 1056 if (crypto_hash_final(desc, md5_hash)) 1057 goto clear_hash; 1058 1059 tcp_put_md5sig_pool(); 1060 return 0; 1061 1062 clear_hash: 1063 tcp_put_md5sig_pool(); 1064 clear_hash_noput: 1065 memset(md5_hash, 0, 16); 1066 return 1; 1067 } 1068 1069 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key, 1070 const struct sock *sk, 1071 const struct sk_buff *skb) 1072 { 1073 struct tcp_md5sig_pool *hp; 1074 struct hash_desc *desc; 1075 const struct tcphdr *th = tcp_hdr(skb); 1076 __be32 saddr, daddr; 1077 1078 if (sk) { /* valid for establish/request sockets */ 1079 saddr = sk->sk_rcv_saddr; 1080 daddr = sk->sk_daddr; 1081 } else { 1082 const struct iphdr *iph = ip_hdr(skb); 1083 saddr = iph->saddr; 1084 daddr = iph->daddr; 1085 } 1086 1087 hp = tcp_get_md5sig_pool(); 1088 if (!hp) 1089 goto clear_hash_noput; 1090 desc = &hp->md5_desc; 1091 1092 if (crypto_hash_init(desc)) 1093 goto clear_hash; 1094 1095 if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len)) 1096 goto clear_hash; 1097 if (tcp_md5_hash_header(hp, th)) 1098 goto clear_hash; 1099 if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2)) 1100 goto clear_hash; 1101 if (tcp_md5_hash_key(hp, key)) 1102 goto clear_hash; 1103 if (crypto_hash_final(desc, md5_hash)) 1104 goto clear_hash; 1105 1106 tcp_put_md5sig_pool(); 1107 return 0; 1108 1109 clear_hash: 1110 tcp_put_md5sig_pool(); 1111 clear_hash_noput: 1112 memset(md5_hash, 0, 16); 1113 return 1; 1114 } 1115 EXPORT_SYMBOL(tcp_v4_md5_hash_skb); 1116 1117 #endif 1118 1119 /* Called with rcu_read_lock() */ 1120 static bool tcp_v4_inbound_md5_hash(const struct sock *sk, 1121 const struct sk_buff *skb) 1122 { 1123 #ifdef CONFIG_TCP_MD5SIG 1124 /* 1125 * This gets called for each TCP segment that arrives 1126 * so we want to be efficient. 1127 * We have 3 drop cases: 1128 * o No MD5 hash and one expected. 1129 * o MD5 hash and we're not expecting one. 1130 * o MD5 hash and its wrong. 1131 */ 1132 const __u8 *hash_location = NULL; 1133 struct tcp_md5sig_key *hash_expected; 1134 const struct iphdr *iph = ip_hdr(skb); 1135 const struct tcphdr *th = tcp_hdr(skb); 1136 int genhash; 1137 unsigned char newhash[16]; 1138 1139 hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr, 1140 AF_INET); 1141 hash_location = tcp_parse_md5sig_option(th); 1142 1143 /* We've parsed the options - do we have a hash? 
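 *
 * The checks below reduce to this table (false = let the segment through,
 * true = drop it):
 *
 *	expected key | MD5 option present | result
 *	-------------+--------------------+----------------------------------
 *	no           | no                 | false
 *	yes          | no                 | true  (TCPMD5NOTFOUND)
 *	no           | yes                | true  (TCPMD5UNEXPECTED)
 *	yes          | yes                | recompute hash, true on mismatch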
*/ 1144 if (!hash_expected && !hash_location) 1145 return false; 1146 1147 if (hash_expected && !hash_location) { 1148 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND); 1149 return true; 1150 } 1151 1152 if (!hash_expected && hash_location) { 1153 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED); 1154 return true; 1155 } 1156 1157 /* Okay, so this is hash_expected and hash_location - 1158 * so we need to calculate the checksum. 1159 */ 1160 genhash = tcp_v4_md5_hash_skb(newhash, 1161 hash_expected, 1162 NULL, skb); 1163 1164 if (genhash || memcmp(hash_location, newhash, 16) != 0) { 1165 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n", 1166 &iph->saddr, ntohs(th->source), 1167 &iph->daddr, ntohs(th->dest), 1168 genhash ? " tcp_v4_calc_md5_hash failed" 1169 : ""); 1170 return true; 1171 } 1172 return false; 1173 #endif 1174 return false; 1175 } 1176 1177 static void tcp_v4_init_req(struct request_sock *req, 1178 const struct sock *sk_listener, 1179 struct sk_buff *skb) 1180 { 1181 struct inet_request_sock *ireq = inet_rsk(req); 1182 1183 sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr); 1184 sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr); 1185 ireq->no_srccheck = inet_sk(sk_listener)->transparent; 1186 ireq->opt = tcp_v4_save_options(skb); 1187 } 1188 1189 static struct dst_entry *tcp_v4_route_req(const struct sock *sk, 1190 struct flowi *fl, 1191 const struct request_sock *req, 1192 bool *strict) 1193 { 1194 struct dst_entry *dst = inet_csk_route_req(sk, &fl->u.ip4, req); 1195 1196 if (strict) { 1197 if (fl->u.ip4.daddr == inet_rsk(req)->ir_rmt_addr) 1198 *strict = true; 1199 else 1200 *strict = false; 1201 } 1202 1203 return dst; 1204 } 1205 1206 struct request_sock_ops tcp_request_sock_ops __read_mostly = { 1207 .family = PF_INET, 1208 .obj_size = sizeof(struct tcp_request_sock), 1209 .rtx_syn_ack = tcp_rtx_synack, 1210 .send_ack = tcp_v4_reqsk_send_ack, 1211 .destructor = tcp_v4_reqsk_destructor, 1212 .send_reset = tcp_v4_send_reset, 1213 .syn_ack_timeout = tcp_syn_ack_timeout, 1214 }; 1215 1216 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { 1217 .mss_clamp = TCP_MSS_DEFAULT, 1218 #ifdef CONFIG_TCP_MD5SIG 1219 .req_md5_lookup = tcp_v4_md5_lookup, 1220 .calc_md5_hash = tcp_v4_md5_hash_skb, 1221 #endif 1222 .init_req = tcp_v4_init_req, 1223 #ifdef CONFIG_SYN_COOKIES 1224 .cookie_init_seq = cookie_v4_init_sequence, 1225 #endif 1226 .route_req = tcp_v4_route_req, 1227 .init_seq = tcp_v4_init_sequence, 1228 .send_synack = tcp_v4_send_synack, 1229 }; 1230 1231 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) 1232 { 1233 /* Never answer to SYNs send to broadcast or multicast */ 1234 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) 1235 goto drop; 1236 1237 return tcp_conn_request(&tcp_request_sock_ops, 1238 &tcp_request_sock_ipv4_ops, sk, skb); 1239 1240 drop: 1241 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); 1242 return 0; 1243 } 1244 EXPORT_SYMBOL(tcp_v4_conn_request); 1245 1246 1247 /* 1248 * The three way handshake has completed - we got a valid synack - 1249 * now create the new socket. 
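 *
 * The child created by the function below is what a server eventually gets
 * back from accept(); a minimal userspace sketch (plain BSD socket API, not
 * part of this file), with the port number made up for illustration:
 *
 *	#include <arpa/inet.h>
 *	#include <netinet/in.h>
 *	#include <string.h>
 *	#include <sys/socket.h>
 *
 *	int accept_one(void)
 *	{
 *		struct sockaddr_in sin;
 *		int lfd = socket(AF_INET, SOCK_STREAM, 0);
 *
 *		if (lfd < 0)
 *			return -1;
 *		memset(&sin, 0, sizeof(sin));
 *		sin.sin_family = AF_INET;
 *		sin.sin_addr.s_addr = htonl(INADDR_ANY);
 *		sin.sin_port = htons(8080);
 *		if (bind(lfd, (struct sockaddr *)&sin, sizeof(sin)) ||
 *		    listen(lfd, 128))
 *			return -1;
 *		// Blocks until a handshake completes; the kernel-side child
 *		// socket is the one created by tcp_v4_syn_recv_sock().
 *		return accept(lfd, NULL, NULL);
 *	}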
1250 */ 1251 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, 1252 struct request_sock *req, 1253 struct dst_entry *dst, 1254 struct request_sock *req_unhash, 1255 bool *own_req) 1256 { 1257 struct inet_request_sock *ireq; 1258 struct inet_sock *newinet; 1259 struct tcp_sock *newtp; 1260 struct sock *newsk; 1261 #ifdef CONFIG_TCP_MD5SIG 1262 struct tcp_md5sig_key *key; 1263 #endif 1264 struct ip_options_rcu *inet_opt; 1265 1266 if (sk_acceptq_is_full(sk)) 1267 goto exit_overflow; 1268 1269 newsk = tcp_create_openreq_child(sk, req, skb); 1270 if (!newsk) 1271 goto exit_nonewsk; 1272 1273 newsk->sk_gso_type = SKB_GSO_TCPV4; 1274 inet_sk_rx_dst_set(newsk, skb); 1275 1276 newtp = tcp_sk(newsk); 1277 newinet = inet_sk(newsk); 1278 ireq = inet_rsk(req); 1279 sk_daddr_set(newsk, ireq->ir_rmt_addr); 1280 sk_rcv_saddr_set(newsk, ireq->ir_loc_addr); 1281 newsk->sk_bound_dev_if = ireq->ir_iif; 1282 newinet->inet_saddr = ireq->ir_loc_addr; 1283 inet_opt = ireq->opt; 1284 rcu_assign_pointer(newinet->inet_opt, inet_opt); 1285 ireq->opt = NULL; 1286 newinet->mc_index = inet_iif(skb); 1287 newinet->mc_ttl = ip_hdr(skb)->ttl; 1288 newinet->rcv_tos = ip_hdr(skb)->tos; 1289 inet_csk(newsk)->icsk_ext_hdr_len = 0; 1290 if (inet_opt) 1291 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen; 1292 newinet->inet_id = newtp->write_seq ^ jiffies; 1293 1294 if (!dst) { 1295 dst = inet_csk_route_child_sock(sk, newsk, req); 1296 if (!dst) 1297 goto put_and_exit; 1298 } else { 1299 /* syncookie case : see end of cookie_v4_check() */ 1300 } 1301 sk_setup_caps(newsk, dst); 1302 1303 tcp_ca_openreq_child(newsk, dst); 1304 1305 tcp_sync_mss(newsk, dst_mtu(dst)); 1306 newtp->advmss = dst_metric_advmss(dst); 1307 if (tcp_sk(sk)->rx_opt.user_mss && 1308 tcp_sk(sk)->rx_opt.user_mss < newtp->advmss) 1309 newtp->advmss = tcp_sk(sk)->rx_opt.user_mss; 1310 1311 tcp_initialize_rcv_mss(newsk); 1312 1313 #ifdef CONFIG_TCP_MD5SIG 1314 /* Copy over the MD5 key from the original socket */ 1315 key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr, 1316 AF_INET); 1317 if (key) { 1318 /* 1319 * We're using one, so create a matching key 1320 * on the newsk structure. If we fail to get 1321 * memory, then we end up not copying the key 1322 * across. Shucks. 1323 */ 1324 tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr, 1325 AF_INET, key->key, key->keylen, GFP_ATOMIC); 1326 sk_nocaps_add(newsk, NETIF_F_GSO_MASK); 1327 } 1328 #endif 1329 1330 if (__inet_inherit_port(sk, newsk) < 0) 1331 goto put_and_exit; 1332 *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash)); 1333 if (*own_req) 1334 tcp_move_syn(newtp, req); 1335 1336 return newsk; 1337 1338 exit_overflow: 1339 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); 1340 exit_nonewsk: 1341 dst_release(dst); 1342 exit: 1343 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); 1344 return NULL; 1345 put_and_exit: 1346 inet_csk_prepare_forced_close(newsk); 1347 tcp_done(newsk); 1348 goto exit; 1349 } 1350 EXPORT_SYMBOL(tcp_v4_syn_recv_sock); 1351 1352 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb) 1353 { 1354 #ifdef CONFIG_SYN_COOKIES 1355 const struct tcphdr *th = tcp_hdr(skb); 1356 1357 if (!th->syn) 1358 sk = cookie_v4_check(sk, skb); 1359 #endif 1360 return sk; 1361 } 1362 1363 /* The socket must have it's spinlock held when we get 1364 * here, unless it is a TCP_LISTEN socket. 
1365 * 1366 * We have a potential double-lock case here, so even when 1367 * doing backlog processing we use the BH locking scheme. 1368 * This is because we cannot sleep with the original spinlock 1369 * held. 1370 */ 1371 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) 1372 { 1373 struct sock *rsk; 1374 1375 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ 1376 struct dst_entry *dst = sk->sk_rx_dst; 1377 1378 sock_rps_save_rxhash(sk, skb); 1379 sk_mark_napi_id(sk, skb); 1380 if (dst) { 1381 if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif || 1382 !dst->ops->check(dst, 0)) { 1383 dst_release(dst); 1384 sk->sk_rx_dst = NULL; 1385 } 1386 } 1387 tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len); 1388 return 0; 1389 } 1390 1391 if (tcp_checksum_complete(skb)) 1392 goto csum_err; 1393 1394 if (sk->sk_state == TCP_LISTEN) { 1395 struct sock *nsk = tcp_v4_cookie_check(sk, skb); 1396 1397 if (!nsk) 1398 goto discard; 1399 if (nsk != sk) { 1400 sock_rps_save_rxhash(nsk, skb); 1401 sk_mark_napi_id(nsk, skb); 1402 if (tcp_child_process(sk, nsk, skb)) { 1403 rsk = nsk; 1404 goto reset; 1405 } 1406 return 0; 1407 } 1408 } else 1409 sock_rps_save_rxhash(sk, skb); 1410 1411 if (tcp_rcv_state_process(sk, skb)) { 1412 rsk = sk; 1413 goto reset; 1414 } 1415 return 0; 1416 1417 reset: 1418 tcp_v4_send_reset(rsk, skb); 1419 discard: 1420 kfree_skb(skb); 1421 /* Be careful here. If this function gets more complicated and 1422 * gcc suffers from register pressure on the x86, sk (in %ebx) 1423 * might be destroyed here. This current version compiles correctly, 1424 * but you have been warned. 1425 */ 1426 return 0; 1427 1428 csum_err: 1429 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS); 1430 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS); 1431 goto discard; 1432 } 1433 EXPORT_SYMBOL(tcp_v4_do_rcv); 1434 1435 void tcp_v4_early_demux(struct sk_buff *skb) 1436 { 1437 const struct iphdr *iph; 1438 const struct tcphdr *th; 1439 struct sock *sk; 1440 1441 if (skb->pkt_type != PACKET_HOST) 1442 return; 1443 1444 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr))) 1445 return; 1446 1447 iph = ip_hdr(skb); 1448 th = tcp_hdr(skb); 1449 1450 if (th->doff < sizeof(struct tcphdr) / 4) 1451 return; 1452 1453 sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo, 1454 iph->saddr, th->source, 1455 iph->daddr, ntohs(th->dest), 1456 skb->skb_iif); 1457 if (sk) { 1458 skb->sk = sk; 1459 skb->destructor = sock_edemux; 1460 if (sk_fullsock(sk)) { 1461 struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst); 1462 1463 if (dst) 1464 dst = dst_check(dst, 0); 1465 if (dst && 1466 inet_sk(sk)->rx_dst_ifindex == skb->skb_iif) 1467 skb_dst_set_noref(skb, dst); 1468 } 1469 } 1470 } 1471 1472 /* Packet is added to VJ-style prequeue for processing in process 1473 * context, if a reader task is waiting. Apparently, this exciting 1474 * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93) 1475 * failed somewhere. Latency? Burstiness? Well, at least now we will 1476 * see, why it failed. 8)8) --ANK 1477 * 1478 */ 1479 bool tcp_prequeue(struct sock *sk, struct sk_buff *skb) 1480 { 1481 struct tcp_sock *tp = tcp_sk(sk); 1482 1483 if (sysctl_tcp_low_latency || !tp->ucopy.task) 1484 return false; 1485 1486 if (skb->len <= tcp_hdrlen(skb) && 1487 skb_queue_len(&tp->ucopy.prequeue) == 0) 1488 return false; 1489 1490 /* Before escaping RCU protected region, we need to take care of skb 1491 * dst. Prequeue is only enabled for established sockets. 
1492 * For such sockets, we might need the skb dst only to set sk->sk_rx_dst 1493 * Instead of doing full sk_rx_dst validity here, let's perform 1494 * an optimistic check. 1495 */ 1496 if (likely(sk->sk_rx_dst)) 1497 skb_dst_drop(skb); 1498 else 1499 skb_dst_force_safe(skb); 1500 1501 __skb_queue_tail(&tp->ucopy.prequeue, skb); 1502 tp->ucopy.memory += skb->truesize; 1503 if (tp->ucopy.memory > sk->sk_rcvbuf) { 1504 struct sk_buff *skb1; 1505 1506 BUG_ON(sock_owned_by_user(sk)); 1507 1508 while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) { 1509 sk_backlog_rcv(sk, skb1); 1510 NET_INC_STATS_BH(sock_net(sk), 1511 LINUX_MIB_TCPPREQUEUEDROPPED); 1512 } 1513 1514 tp->ucopy.memory = 0; 1515 } else if (skb_queue_len(&tp->ucopy.prequeue) == 1) { 1516 wake_up_interruptible_sync_poll(sk_sleep(sk), 1517 POLLIN | POLLRDNORM | POLLRDBAND); 1518 if (!inet_csk_ack_scheduled(sk)) 1519 inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, 1520 (3 * tcp_rto_min(sk)) / 4, 1521 TCP_RTO_MAX); 1522 } 1523 return true; 1524 } 1525 EXPORT_SYMBOL(tcp_prequeue); 1526 1527 /* 1528 * From tcp_input.c 1529 */ 1530 1531 int tcp_v4_rcv(struct sk_buff *skb) 1532 { 1533 const struct iphdr *iph; 1534 const struct tcphdr *th; 1535 struct sock *sk; 1536 int ret; 1537 struct net *net = dev_net(skb->dev); 1538 1539 if (skb->pkt_type != PACKET_HOST) 1540 goto discard_it; 1541 1542 /* Count it even if it's bad */ 1543 TCP_INC_STATS_BH(net, TCP_MIB_INSEGS); 1544 1545 if (!pskb_may_pull(skb, sizeof(struct tcphdr))) 1546 goto discard_it; 1547 1548 th = tcp_hdr(skb); 1549 1550 if (th->doff < sizeof(struct tcphdr) / 4) 1551 goto bad_packet; 1552 if (!pskb_may_pull(skb, th->doff * 4)) 1553 goto discard_it; 1554 1555 /* An explanation is required here, I think. 1556 * Packet length and doff are validated by header prediction, 1557 * provided case of th->doff==0 is eliminated. 1558 * So, we defer the checks. */ 1559 1560 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo)) 1561 goto csum_error; 1562 1563 th = tcp_hdr(skb); 1564 iph = ip_hdr(skb); 1565 /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB() 1566 * barrier() makes sure compiler wont play fool^Waliasing games. 
1567 */ 1568 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb), 1569 sizeof(struct inet_skb_parm)); 1570 barrier(); 1571 1572 TCP_SKB_CB(skb)->seq = ntohl(th->seq); 1573 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + 1574 skb->len - th->doff * 4); 1575 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); 1576 TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th); 1577 TCP_SKB_CB(skb)->tcp_tw_isn = 0; 1578 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph); 1579 TCP_SKB_CB(skb)->sacked = 0; 1580 1581 lookup: 1582 sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest); 1583 if (!sk) 1584 goto no_tcp_socket; 1585 1586 process: 1587 if (sk->sk_state == TCP_TIME_WAIT) 1588 goto do_time_wait; 1589 1590 if (sk->sk_state == TCP_NEW_SYN_RECV) { 1591 struct request_sock *req = inet_reqsk(sk); 1592 struct sock *nsk = NULL; 1593 1594 sk = req->rsk_listener; 1595 if (tcp_v4_inbound_md5_hash(sk, skb)) 1596 goto discard_and_relse; 1597 if (likely(sk->sk_state == TCP_LISTEN)) { 1598 nsk = tcp_check_req(sk, skb, req, false); 1599 } else { 1600 inet_csk_reqsk_queue_drop_and_put(sk, req); 1601 goto lookup; 1602 } 1603 if (!nsk) { 1604 reqsk_put(req); 1605 goto discard_it; 1606 } 1607 if (nsk == sk) { 1608 sock_hold(sk); 1609 reqsk_put(req); 1610 } else if (tcp_child_process(sk, nsk, skb)) { 1611 tcp_v4_send_reset(nsk, skb); 1612 goto discard_it; 1613 } else { 1614 return 0; 1615 } 1616 } 1617 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) { 1618 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP); 1619 goto discard_and_relse; 1620 } 1621 1622 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) 1623 goto discard_and_relse; 1624 1625 if (tcp_v4_inbound_md5_hash(sk, skb)) 1626 goto discard_and_relse; 1627 1628 nf_reset(skb); 1629 1630 if (sk_filter(sk, skb)) 1631 goto discard_and_relse; 1632 1633 skb->dev = NULL; 1634 1635 if (sk->sk_state == TCP_LISTEN) { 1636 ret = tcp_v4_do_rcv(sk, skb); 1637 goto put_and_return; 1638 } 1639 1640 sk_incoming_cpu_update(sk); 1641 1642 bh_lock_sock_nested(sk); 1643 tcp_sk(sk)->segs_in += max_t(u16, 1, skb_shinfo(skb)->gso_segs); 1644 ret = 0; 1645 if (!sock_owned_by_user(sk)) { 1646 if (!tcp_prequeue(sk, skb)) 1647 ret = tcp_v4_do_rcv(sk, skb); 1648 } else if (unlikely(sk_add_backlog(sk, skb, 1649 sk->sk_rcvbuf + sk->sk_sndbuf))) { 1650 bh_unlock_sock(sk); 1651 NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP); 1652 goto discard_and_relse; 1653 } 1654 bh_unlock_sock(sk); 1655 1656 put_and_return: 1657 sock_put(sk); 1658 1659 return ret; 1660 1661 no_tcp_socket: 1662 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) 1663 goto discard_it; 1664 1665 if (tcp_checksum_complete(skb)) { 1666 csum_error: 1667 TCP_INC_STATS_BH(net, TCP_MIB_CSUMERRORS); 1668 bad_packet: 1669 TCP_INC_STATS_BH(net, TCP_MIB_INERRS); 1670 } else { 1671 tcp_v4_send_reset(NULL, skb); 1672 } 1673 1674 discard_it: 1675 /* Discard frame. 
*/ 1676 kfree_skb(skb); 1677 return 0; 1678 1679 discard_and_relse: 1680 sock_put(sk); 1681 goto discard_it; 1682 1683 do_time_wait: 1684 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { 1685 inet_twsk_put(inet_twsk(sk)); 1686 goto discard_it; 1687 } 1688 1689 if (tcp_checksum_complete(skb)) { 1690 inet_twsk_put(inet_twsk(sk)); 1691 goto csum_error; 1692 } 1693 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) { 1694 case TCP_TW_SYN: { 1695 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev), 1696 &tcp_hashinfo, 1697 iph->saddr, th->source, 1698 iph->daddr, th->dest, 1699 inet_iif(skb)); 1700 if (sk2) { 1701 inet_twsk_deschedule_put(inet_twsk(sk)); 1702 sk = sk2; 1703 goto process; 1704 } 1705 /* Fall through to ACK */ 1706 } 1707 case TCP_TW_ACK: 1708 tcp_v4_timewait_ack(sk, skb); 1709 break; 1710 case TCP_TW_RST: 1711 tcp_v4_send_reset(sk, skb); 1712 inet_twsk_deschedule_put(inet_twsk(sk)); 1713 goto discard_it; 1714 case TCP_TW_SUCCESS:; 1715 } 1716 goto discard_it; 1717 } 1718 1719 static struct timewait_sock_ops tcp_timewait_sock_ops = { 1720 .twsk_obj_size = sizeof(struct tcp_timewait_sock), 1721 .twsk_unique = tcp_twsk_unique, 1722 .twsk_destructor= tcp_twsk_destructor, 1723 }; 1724 1725 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) 1726 { 1727 struct dst_entry *dst = skb_dst(skb); 1728 1729 if (dst && dst_hold_safe(dst)) { 1730 sk->sk_rx_dst = dst; 1731 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif; 1732 } 1733 } 1734 EXPORT_SYMBOL(inet_sk_rx_dst_set); 1735 1736 const struct inet_connection_sock_af_ops ipv4_specific = { 1737 .queue_xmit = ip_queue_xmit, 1738 .send_check = tcp_v4_send_check, 1739 .rebuild_header = inet_sk_rebuild_header, 1740 .sk_rx_dst_set = inet_sk_rx_dst_set, 1741 .conn_request = tcp_v4_conn_request, 1742 .syn_recv_sock = tcp_v4_syn_recv_sock, 1743 .net_header_len = sizeof(struct iphdr), 1744 .setsockopt = ip_setsockopt, 1745 .getsockopt = ip_getsockopt, 1746 .addr2sockaddr = inet_csk_addr2sockaddr, 1747 .sockaddr_len = sizeof(struct sockaddr_in), 1748 .bind_conflict = inet_csk_bind_conflict, 1749 #ifdef CONFIG_COMPAT 1750 .compat_setsockopt = compat_ip_setsockopt, 1751 .compat_getsockopt = compat_ip_getsockopt, 1752 #endif 1753 .mtu_reduced = tcp_v4_mtu_reduced, 1754 }; 1755 EXPORT_SYMBOL(ipv4_specific); 1756 1757 #ifdef CONFIG_TCP_MD5SIG 1758 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = { 1759 .md5_lookup = tcp_v4_md5_lookup, 1760 .calc_md5_hash = tcp_v4_md5_hash_skb, 1761 .md5_parse = tcp_v4_parse_md5_keys, 1762 }; 1763 #endif 1764 1765 /* NOTE: A lot of things set to zero explicitly by call to 1766 * sk_alloc() so need not be done here. 1767 */ 1768 static int tcp_v4_init_sock(struct sock *sk) 1769 { 1770 struct inet_connection_sock *icsk = inet_csk(sk); 1771 1772 tcp_init_sock(sk); 1773 1774 icsk->icsk_af_ops = &ipv4_specific; 1775 1776 #ifdef CONFIG_TCP_MD5SIG 1777 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific; 1778 #endif 1779 1780 return 0; 1781 } 1782 1783 void tcp_v4_destroy_sock(struct sock *sk) 1784 { 1785 struct tcp_sock *tp = tcp_sk(sk); 1786 1787 tcp_clear_xmit_timers(sk); 1788 1789 tcp_cleanup_congestion_control(sk); 1790 1791 /* Cleanup up the write buffer. */ 1792 tcp_write_queue_purge(sk); 1793 1794 /* Cleans up our, hopefully empty, out_of_order_queue. 
*/ 1795 __skb_queue_purge(&tp->out_of_order_queue); 1796 1797 #ifdef CONFIG_TCP_MD5SIG 1798 /* Clean up the MD5 key list, if any */ 1799 if (tp->md5sig_info) { 1800 tcp_clear_md5_list(sk); 1801 kfree_rcu(tp->md5sig_info, rcu); 1802 tp->md5sig_info = NULL; 1803 } 1804 #endif 1805 1806 /* Clean prequeue, it must be empty really */ 1807 __skb_queue_purge(&tp->ucopy.prequeue); 1808 1809 /* Clean up a referenced TCP bind bucket. */ 1810 if (inet_csk(sk)->icsk_bind_hash) 1811 inet_put_port(sk); 1812 1813 BUG_ON(tp->fastopen_rsk); 1814 1815 /* If socket is aborted during connect operation */ 1816 tcp_free_fastopen_req(tp); 1817 tcp_saved_syn_free(tp); 1818 1819 sk_sockets_allocated_dec(sk); 1820 1821 if (mem_cgroup_sockets_enabled && sk->sk_memcg) 1822 sock_release_memcg(sk); 1823 } 1824 EXPORT_SYMBOL(tcp_v4_destroy_sock); 1825 1826 #ifdef CONFIG_PROC_FS 1827 /* Proc filesystem TCP sock list dumping. */ 1828 1829 /* 1830 * Get next listener socket follow cur. If cur is NULL, get first socket 1831 * starting from bucket given in st->bucket; when st->bucket is zero the 1832 * very first socket in the hash table is returned. 1833 */ 1834 static void *listening_get_next(struct seq_file *seq, void *cur) 1835 { 1836 struct inet_connection_sock *icsk; 1837 struct hlist_nulls_node *node; 1838 struct sock *sk = cur; 1839 struct inet_listen_hashbucket *ilb; 1840 struct tcp_iter_state *st = seq->private; 1841 struct net *net = seq_file_net(seq); 1842 1843 if (!sk) { 1844 ilb = &tcp_hashinfo.listening_hash[st->bucket]; 1845 spin_lock_bh(&ilb->lock); 1846 sk = sk_nulls_head(&ilb->head); 1847 st->offset = 0; 1848 goto get_sk; 1849 } 1850 ilb = &tcp_hashinfo.listening_hash[st->bucket]; 1851 ++st->num; 1852 ++st->offset; 1853 1854 sk = sk_nulls_next(sk); 1855 get_sk: 1856 sk_nulls_for_each_from(sk, node) { 1857 if (!net_eq(sock_net(sk), net)) 1858 continue; 1859 if (sk->sk_family == st->family) { 1860 cur = sk; 1861 goto out; 1862 } 1863 icsk = inet_csk(sk); 1864 } 1865 spin_unlock_bh(&ilb->lock); 1866 st->offset = 0; 1867 if (++st->bucket < INET_LHTABLE_SIZE) { 1868 ilb = &tcp_hashinfo.listening_hash[st->bucket]; 1869 spin_lock_bh(&ilb->lock); 1870 sk = sk_nulls_head(&ilb->head); 1871 goto get_sk; 1872 } 1873 cur = NULL; 1874 out: 1875 return cur; 1876 } 1877 1878 static void *listening_get_idx(struct seq_file *seq, loff_t *pos) 1879 { 1880 struct tcp_iter_state *st = seq->private; 1881 void *rc; 1882 1883 st->bucket = 0; 1884 st->offset = 0; 1885 rc = listening_get_next(seq, NULL); 1886 1887 while (rc && *pos) { 1888 rc = listening_get_next(seq, rc); 1889 --*pos; 1890 } 1891 return rc; 1892 } 1893 1894 static inline bool empty_bucket(const struct tcp_iter_state *st) 1895 { 1896 return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain); 1897 } 1898 1899 /* 1900 * Get first established socket starting from bucket given in st->bucket. 1901 * If st->bucket is zero, the very first socket in the hash is returned. 
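 *
 * The walk below backs /proc/net/tcp, whose rows are emitted by
 * get_openreq4()/get_tcp4_sock() further down. A minimal userspace sketch
 * that dumps just the local address:port column; the hex decode assumes a
 * little-endian host and is illustrative only:
 *
 *	#include <stdio.h>
 *
 *	int dump_local_endpoints(void)
 *	{
 *		char line[512];
 *		unsigned int a, p;
 *		FILE *f = fopen("/proc/net/tcp", "r");
 *
 *		if (!f)
 *			return -1;
 *		fgets(line, sizeof(line), f);	// skip the header line
 *		while (fgets(line, sizeof(line), f))
 *			if (sscanf(line, " %*d: %8X:%4X", &a, &p) == 2)
 *				printf("%u.%u.%u.%u:%u\n", a & 0xff,
 *				       (a >> 8) & 0xff, (a >> 16) & 0xff,
 *				       a >> 24, p);
 *		fclose(f);
 *		return 0;
 *	}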
1902 */ 1903 static void *established_get_first(struct seq_file *seq) 1904 { 1905 struct tcp_iter_state *st = seq->private; 1906 struct net *net = seq_file_net(seq); 1907 void *rc = NULL; 1908 1909 st->offset = 0; 1910 for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) { 1911 struct sock *sk; 1912 struct hlist_nulls_node *node; 1913 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket); 1914 1915 /* Lockless fast path for the common case of empty buckets */ 1916 if (empty_bucket(st)) 1917 continue; 1918 1919 spin_lock_bh(lock); 1920 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) { 1921 if (sk->sk_family != st->family || 1922 !net_eq(sock_net(sk), net)) { 1923 continue; 1924 } 1925 rc = sk; 1926 goto out; 1927 } 1928 spin_unlock_bh(lock); 1929 } 1930 out: 1931 return rc; 1932 } 1933 1934 static void *established_get_next(struct seq_file *seq, void *cur) 1935 { 1936 struct sock *sk = cur; 1937 struct hlist_nulls_node *node; 1938 struct tcp_iter_state *st = seq->private; 1939 struct net *net = seq_file_net(seq); 1940 1941 ++st->num; 1942 ++st->offset; 1943 1944 sk = sk_nulls_next(sk); 1945 1946 sk_nulls_for_each_from(sk, node) { 1947 if (sk->sk_family == st->family && net_eq(sock_net(sk), net)) 1948 return sk; 1949 } 1950 1951 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); 1952 ++st->bucket; 1953 return established_get_first(seq); 1954 } 1955 1956 static void *established_get_idx(struct seq_file *seq, loff_t pos) 1957 { 1958 struct tcp_iter_state *st = seq->private; 1959 void *rc; 1960 1961 st->bucket = 0; 1962 rc = established_get_first(seq); 1963 1964 while (rc && pos) { 1965 rc = established_get_next(seq, rc); 1966 --pos; 1967 } 1968 return rc; 1969 } 1970 1971 static void *tcp_get_idx(struct seq_file *seq, loff_t pos) 1972 { 1973 void *rc; 1974 struct tcp_iter_state *st = seq->private; 1975 1976 st->state = TCP_SEQ_STATE_LISTENING; 1977 rc = listening_get_idx(seq, &pos); 1978 1979 if (!rc) { 1980 st->state = TCP_SEQ_STATE_ESTABLISHED; 1981 rc = established_get_idx(seq, pos); 1982 } 1983 1984 return rc; 1985 } 1986 1987 static void *tcp_seek_last_pos(struct seq_file *seq) 1988 { 1989 struct tcp_iter_state *st = seq->private; 1990 int offset = st->offset; 1991 int orig_num = st->num; 1992 void *rc = NULL; 1993 1994 switch (st->state) { 1995 case TCP_SEQ_STATE_LISTENING: 1996 if (st->bucket >= INET_LHTABLE_SIZE) 1997 break; 1998 st->state = TCP_SEQ_STATE_LISTENING; 1999 rc = listening_get_next(seq, NULL); 2000 while (offset-- && rc) 2001 rc = listening_get_next(seq, rc); 2002 if (rc) 2003 break; 2004 st->bucket = 0; 2005 st->state = TCP_SEQ_STATE_ESTABLISHED; 2006 /* Fallthrough */ 2007 case TCP_SEQ_STATE_ESTABLISHED: 2008 if (st->bucket > tcp_hashinfo.ehash_mask) 2009 break; 2010 rc = established_get_first(seq); 2011 while (offset-- && rc) 2012 rc = established_get_next(seq, rc); 2013 } 2014 2015 st->num = orig_num; 2016 2017 return rc; 2018 } 2019 2020 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos) 2021 { 2022 struct tcp_iter_state *st = seq->private; 2023 void *rc; 2024 2025 if (*pos && *pos == st->last_pos) { 2026 rc = tcp_seek_last_pos(seq); 2027 if (rc) 2028 goto out; 2029 } 2030 2031 st->state = TCP_SEQ_STATE_LISTENING; 2032 st->num = 0; 2033 st->bucket = 0; 2034 st->offset = 0; 2035 rc = *pos ? 
static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
{
	void *rc;
	struct tcp_iter_state *st = seq->private;

	st->state = TCP_SEQ_STATE_LISTENING;
	rc	  = listening_get_idx(seq, &pos);

	if (!rc) {
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		rc	  = established_get_idx(seq, pos);
	}

	return rc;
}

static void *tcp_seek_last_pos(struct seq_file *seq)
{
	struct tcp_iter_state *st = seq->private;
	int offset = st->offset;
	int orig_num = st->num;
	void *rc = NULL;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		if (st->bucket >= INET_LHTABLE_SIZE)
			break;
		st->state = TCP_SEQ_STATE_LISTENING;
		rc = listening_get_next(seq, NULL);
		while (offset-- && rc)
			rc = listening_get_next(seq, rc);
		if (rc)
			break;
		st->bucket = 0;
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		/* Fallthrough */
	case TCP_SEQ_STATE_ESTABLISHED:
		if (st->bucket > tcp_hashinfo.ehash_mask)
			break;
		rc = established_get_first(seq);
		while (offset-- && rc)
			rc = established_get_next(seq, rc);
	}

	st->num = orig_num;

	return rc;
}

static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	if (*pos && *pos == st->last_pos) {
		rc = tcp_seek_last_pos(seq);
		if (rc)
			goto out;
	}

	st->state = TCP_SEQ_STATE_LISTENING;
	st->num = 0;
	st->bucket = 0;
	st->offset = 0;
	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;

out:
	st->last_pos = *pos;
	return rc;
}

static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc = NULL;

	if (v == SEQ_START_TOKEN) {
		rc = tcp_get_idx(seq, 0);
		goto out;
	}

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		rc = listening_get_next(seq, v);
		if (!rc) {
			st->state = TCP_SEQ_STATE_ESTABLISHED;
			st->bucket = 0;
			st->offset = 0;
			rc	  = established_get_first(seq);
		}
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		rc = established_get_next(seq, v);
		break;
	}
out:
	++*pos;
	st->last_pos = *pos;
	return rc;
}

static void tcp_seq_stop(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		if (v != SEQ_START_TOKEN)
			spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		if (v)
			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
		break;
	}
}

int tcp_seq_open(struct inode *inode, struct file *file)
{
	struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
	struct tcp_iter_state *s;
	int err;

	err = seq_open_net(inode, file, &afinfo->seq_ops,
			   sizeof(struct tcp_iter_state));
	if (err < 0)
		return err;

	s = ((struct seq_file *)file->private_data)->private;
	s->family		= afinfo->family;
	s->last_pos		= 0;
	return 0;
}
EXPORT_SYMBOL(tcp_seq_open);

int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
{
	int rc = 0;
	struct proc_dir_entry *p;

	afinfo->seq_ops.start		= tcp_seq_start;
	afinfo->seq_ops.next		= tcp_seq_next;
	afinfo->seq_ops.stop		= tcp_seq_stop;

	p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
			     afinfo->seq_fops, afinfo);
	if (!p)
		rc = -ENOMEM;
	return rc;
}
EXPORT_SYMBOL(tcp_proc_register);

void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
{
	remove_proc_entry(afinfo->name, net->proc_net);
}
EXPORT_SYMBOL(tcp_proc_unregister);

static void get_openreq4(const struct request_sock *req,
			 struct seq_file *f, int i)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	long delta = req->rsk_timer.expires - jiffies;

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
		i,
		ireq->ir_loc_addr,
		ireq->ir_num,
		ireq->ir_rmt_addr,
		ntohs(ireq->ir_rmt_port),
		TCP_SYN_RECV,
		0, 0, /* could print option size, but that is af dependent. */
		1,    /* timers active (only the expire timer) */
		jiffies_delta_to_clock_t(delta),
		req->num_timeout,
		from_kuid_munged(seq_user_ns(f),
				 sock_i_uid(req->rsk_listener)),
		0,  /* non standard timer */
		0, /* open_requests have no inode */
		0,
		req);
}
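/*
 * The "%08X:%04X" endpoint fields written above are the raw __be32 address
 * and the (already ntohs()ed) port, both printed in the kernel's native
 * word order.  A minimal user-space decoder, assuming it runs on a host of
 * the same endianness as the kernel that wrote the line (illustrative
 * sketch only, not part of this file):
 *
 *	#include <stdio.h>
 *	#include <arpa/inet.h>
 *
 *	static void print_endpoint(const char *field)	// e.g. "0100007F:0016"
 *	{
 *		unsigned int addr, port;
 *
 *		if (sscanf(field, "%x:%x", &addr, &port) == 2) {
 *			struct in_addr in = { .s_addr = addr };
 *			printf("%s:%u\n", inet_ntoa(in), port);	// 127.0.0.1:22
 *		}
 *	}
 */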
static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
{
	int timer_active;
	unsigned long timer_expires;
	const struct tcp_sock *tp = tcp_sk(sk);
	const struct inet_connection_sock *icsk = inet_csk(sk);
	const struct inet_sock *inet = inet_sk(sk);
	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
	__be32 dest = inet->inet_daddr;
	__be32 src = inet->inet_rcv_saddr;
	__u16 destp = ntohs(inet->inet_dport);
	__u16 srcp = ntohs(inet->inet_sport);
	int rx_queue;
	int state;

	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
	    icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
		timer_active	= 1;
		timer_expires	= icsk->icsk_timeout;
	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
		timer_active	= 4;
		timer_expires	= icsk->icsk_timeout;
	} else if (timer_pending(&sk->sk_timer)) {
		timer_active	= 2;
		timer_expires	= sk->sk_timer.expires;
	} else {
		timer_active	= 0;
		timer_expires	= jiffies;
	}

	state = sk_state_load(sk);
	if (state == TCP_LISTEN)
		rx_queue = sk->sk_ack_backlog;
	else
		/* Because we don't lock the socket,
		 * we might find a transient negative value.
		 */
		rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
		i, src, srcp, dest, destp, state,
		tp->write_seq - tp->snd_una,
		rx_queue,
		timer_active,
		jiffies_delta_to_clock_t(timer_expires - jiffies),
		icsk->icsk_retransmits,
		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
		icsk->icsk_probes_out,
		sock_i_ino(sk),
		atomic_read(&sk->sk_refcnt), sk,
		jiffies_to_clock_t(icsk->icsk_rto),
		jiffies_to_clock_t(icsk->icsk_ack.ato),
		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
		tp->snd_cwnd,
		state == TCP_LISTEN ?
		    fastopenq->max_qlen :
		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
}
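/*
 * For reference, the "tr" column written by get_tcp4_sock() encodes which
 * timer is currently pending, and "tm->when" its remaining time in clock
 * ticks (via jiffies_delta_to_clock_t()):
 *   0 - no timer pending
 *   1 - retransmit / early-retransmit / tail-loss-probe timer
 *   2 - keepalive timer (sk_timer)
 *   3 - TIME_WAIT timer (see get_timewait4_sock() below)
 *   4 - zero-window probe timer
 */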
static void get_timewait4_sock(const struct inet_timewait_sock *tw,
			       struct seq_file *f, int i)
{
	long delta = tw->tw_timer.expires - jiffies;
	__be32 dest, src;
	__u16 destp, srcp;

	dest  = tw->tw_daddr;
	src   = tw->tw_rcv_saddr;
	destp = ntohs(tw->tw_dport);
	srcp  = ntohs(tw->tw_sport);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
		atomic_read(&tw->tw_refcnt), tw);
}

#define TMPSZ 150

static int tcp4_seq_show(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st;
	struct sock *sk = v;

	seq_setwidth(seq, TMPSZ - 1);
	if (v == SEQ_START_TOKEN) {
		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
			   "rx_queue tr tm->when retrnsmt   uid  timeout "
			   "inode");
		goto out;
	}
	st = seq->private;

	if (sk->sk_state == TCP_TIME_WAIT)
		get_timewait4_sock(v, seq, st->num);
	else if (sk->sk_state == TCP_NEW_SYN_RECV)
		get_openreq4(v, seq, st->num);
	else
		get_tcp4_sock(v, seq, st->num);
out:
	seq_pad(seq, '\n');
	return 0;
}

static const struct file_operations tcp_afinfo_seq_fops = {
	.owner   = THIS_MODULE,
	.open    = tcp_seq_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release_net
};

static struct tcp_seq_afinfo tcp4_seq_afinfo = {
	.name		= "tcp",
	.family		= AF_INET,
	.seq_fops	= &tcp_afinfo_seq_fops,
	.seq_ops	= {
		.show		= tcp4_seq_show,
	},
};

static int __net_init tcp4_proc_init_net(struct net *net)
{
	return tcp_proc_register(net, &tcp4_seq_afinfo);
}

static void __net_exit tcp4_proc_exit_net(struct net *net)
{
	tcp_proc_unregister(net, &tcp4_seq_afinfo);
}

static struct pernet_operations tcp4_net_ops = {
	.init = tcp4_proc_init_net,
	.exit = tcp4_proc_exit_net,
};

int __init tcp4_proc_init(void)
{
	return register_pernet_subsys(&tcp4_net_ops);
}

void tcp4_proc_exit(void)
{
	unregister_pernet_subsys(&tcp4_net_ops);
}
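/*
 * Illustrative sketch (hypothetical names, kept under #if 0 so it is not
 * built): another address family plugs into the same machinery by supplying
 * its own show routine and afinfo and calling tcp_proc_register() from its
 * pernet init, much like tcp4_proc_init_net() above.
 */
#if 0
static int tcpX_seq_show(struct seq_file *seq, void *v)
{
	/* format one row for this family, as tcp4_seq_show() does */
	return 0;
}

static struct tcp_seq_afinfo tcpX_seq_afinfo = {
	.name		= "tcpX",
	.family		= AF_INET6,	/* whichever family is exported */
	.seq_fops	= &tcp_afinfo_seq_fops,
	.seq_ops	= {
		.show		= tcpX_seq_show,
	},
};

static int __net_init tcpX_proc_init_net(struct net *net)
{
	return tcp_proc_register(net, &tcpX_seq_afinfo);
}
#endif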
#endif /* CONFIG_PROC_FS */

struct proto tcp_prot = {
	.name			= "TCP",
	.owner			= THIS_MODULE,
	.close			= tcp_close,
	.connect		= tcp_v4_connect,
	.disconnect		= tcp_disconnect,
	.accept			= inet_csk_accept,
	.ioctl			= tcp_ioctl,
	.init			= tcp_v4_init_sock,
	.destroy		= tcp_v4_destroy_sock,
	.shutdown		= tcp_shutdown,
	.setsockopt		= tcp_setsockopt,
	.getsockopt		= tcp_getsockopt,
	.recvmsg		= tcp_recvmsg,
	.sendmsg		= tcp_sendmsg,
	.sendpage		= tcp_sendpage,
	.backlog_rcv		= tcp_v4_do_rcv,
	.release_cb		= tcp_release_cb,
	.hash			= inet_hash,
	.unhash			= inet_unhash,
	.get_port		= inet_csk_get_port,
	.enter_memory_pressure	= tcp_enter_memory_pressure,
	.stream_memory_free	= tcp_stream_memory_free,
	.sockets_allocated	= &tcp_sockets_allocated,
	.orphan_count		= &tcp_orphan_count,
	.memory_allocated	= &tcp_memory_allocated,
	.memory_pressure	= &tcp_memory_pressure,
	.sysctl_mem		= sysctl_tcp_mem,
	.sysctl_wmem		= sysctl_tcp_wmem,
	.sysctl_rmem		= sysctl_tcp_rmem,
	.max_header		= MAX_TCP_HEADER,
	.obj_size		= sizeof(struct tcp_sock),
	.slab_flags		= SLAB_DESTROY_BY_RCU,
	.twsk_prot		= &tcp_timewait_sock_ops,
	.rsk_prot		= &tcp_request_sock_ops,
	.h.hashinfo		= &tcp_hashinfo,
	.no_autobind		= true,
#ifdef CONFIG_COMPAT
	.compat_setsockopt	= compat_tcp_setsockopt,
	.compat_getsockopt	= compat_tcp_getsockopt,
#endif
	.diag_destroy		= tcp_abort,
};
EXPORT_SYMBOL(tcp_prot);

static void __net_exit tcp_sk_exit(struct net *net)
{
	int cpu;

	for_each_possible_cpu(cpu)
		inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
	free_percpu(net->ipv4.tcp_sk);
}

static int __net_init tcp_sk_init(struct net *net)
{
	int res, cpu;

	net->ipv4.tcp_sk = alloc_percpu(struct sock *);
	if (!net->ipv4.tcp_sk)
		return -ENOMEM;

	for_each_possible_cpu(cpu) {
		struct sock *sk;

		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
					   IPPROTO_TCP, net);
		if (res)
			goto fail;
		*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
	}

	net->ipv4.sysctl_tcp_ecn = 2;
	net->ipv4.sysctl_tcp_ecn_fallback = 1;

	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;

	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;

	return 0;
fail:
	tcp_sk_exit(net);

	return res;
}

static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
{
	inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
}

static struct pernet_operations __net_initdata tcp_sk_ops = {
	.init	    = tcp_sk_init,
	.exit	    = tcp_sk_exit,
	.exit_batch = tcp_sk_exit_batch,
};

void __init tcp_v4_init(void)
{
	inet_hashinfo_init(&tcp_hashinfo);
	if (register_pernet_subsys(&tcp_sk_ops))
		panic("Failed to create the TCP control socket.\n");
}
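/*
 * Illustrative sketch (hypothetical names, kept under #if 0 so it is not
 * built): the per-netns pattern used by tcp_sk_ops above, reduced to its
 * skeleton.  A subsystem sets up per-namespace state in .init, tears it
 * down in .exit, and registers once at boot with register_pernet_subsys(),
 * exactly as tcp_v4_init() does for TCP.
 */
#if 0
static int __net_init example_net_init(struct net *net)
{
	/* allocate per-netns state here, e.g. a control socket created with
	 * inet_ctl_sock_create() and hung off 'net', the way tcp_sk_init()
	 * fills net->ipv4.tcp_sk
	 */
	return 0;
}

static void __net_exit example_net_exit(struct net *net)
{
	/* release whatever .init stored for this netns */
}

static struct pernet_operations example_net_ops = {
	.init = example_net_init,
	.exit = example_net_exit,
};

/* from an __init function: register_pernet_subsys(&example_net_ops); */
#endif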