/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system. INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 *		IPv4 specific functions
 *
 *
 *		code split from:
 *		linux/ipv4/tcp.c
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *	See tcp.c for author information
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

/*
 * Changes:
 *	David S. Miller		:	New socket lookup architecture.
 *					This code is dedicated to John Dyson.
 *	David S. Miller		:	Change semantics of established hash,
 *					half is devoted to TIME_WAIT sockets
 *					and the rest go in the other half.
 *	Andi Kleen		:	Add support for syncookies and fixed
 *					some bugs: ip options weren't passed to
 *					the TCP layer, missed a check for an
 *					ACK bit.
 *	Andi Kleen		:	Implemented fast path mtu discovery.
 *					Fixed many serious bugs in the
 *					request_sock handling and moved
 *					most of it into the af independent code.
 *					Added tail drop and some other bugfixes.
 *					Added new listen semantics.
 *	Mike McLagan		:	Routing by source
 *	Juan Jose Ciarlante	:	ip_dynaddr bits
 *	Andi Kleen		:	various fixes.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year
 *					coma.
 *	Andi Kleen		:	Fix new listen.
 *	Andi Kleen		:	Fix accept error reporting.
 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
 *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
 *					a single port at the same time.
 */

#define pr_fmt(fmt) "TCP: " fmt

#include <linux/bottom_half.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
#include <linux/slab.h>

#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/secure_seq.h>
#include <net/busy_poll.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

#include <crypto/hash.h>
#include <linux/scatterlist.h>

int sysctl_tcp_low_latency __read_mostly;

#ifdef CONFIG_TCP_MD5SIG
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
#endif

struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);

static u32 tcp_v4_init_seq(const struct sk_buff *skb)
{
	return secure_tcp_seq(ip_hdr(skb)->daddr,
			      ip_hdr(skb)->saddr,
			      tcp_hdr(skb)->dest,
			      tcp_hdr(skb)->source);
}

static u32 tcp_v4_init_ts_off(const struct sk_buff *skb)
{
	return secure_tcp_ts_off(ip_hdr(skb)->daddr,
				 ip_hdr(skb)->saddr);
}

int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
	struct tcp_sock *tp = tcp_sk(sk);

	/* With PAWS, it is safe from the viewpoint
	   of data integrity. Even without PAWS it is safe provided sequence
	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.

	   Actually, the idea is close to VJ's one, only timestamp cache is
	   held not per host, but per port pair and TW bucket is used as state
	   holder.

	   If TW bucket has been already destroyed we fall back to VJ's scheme
	   and use initial timestamp retrieved from peer table.
	 */
	if (tcptw->tw_ts_recent_stamp &&
	    (!twp || (sock_net(sk)->ipv4.sysctl_tcp_tw_reuse &&
		      get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
		if (tp->write_seq == 0)
			tp->write_seq = 1;
		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
		sock_hold(sktw);
		return 1;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(tcp_twsk_unique);

/* This will initiate an outgoing connection.
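 * Flow: resolve a route to the destination, pick a local port via
 * inet_hash_connect() while already in SYN-SENT, choose the initial
 * sequence number, then hand off to tcp_connect() to send the SYN.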
 */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	__be16 orig_sport, orig_dport;
	__be32 daddr, nexthop;
	struct flowi4 *fl4;
	struct rtable *rt;
	int err;
	struct ip_options_rcu *inet_opt;
	struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	inet_opt = rcu_dereference_protected(inet->inet_opt,
					     lockdep_sock_is_held(sk));
	if (inet_opt && inet_opt->opt.srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet_opt->opt.faddr;
	}

	orig_sport = inet->inet_sport;
	orig_dport = usin->sin_port;
	fl4 = &inet->cork.fl.u.ip4;
	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
			      IPPROTO_TCP,
			      orig_sport, orig_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		if (err == -ENETUNREACH)
			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
		return err;
	}

	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet_opt || !inet_opt->opt.srr)
		daddr = fl4->daddr;

	if (!inet->inet_saddr)
		inet->inet_saddr = fl4->saddr;
	sk_rcv_saddr_set(sk, inet->inet_saddr);

	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent	   = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		if (likely(!tp->repair))
			tp->write_seq	   = 0;
	}

	inet->inet_dport = usin->sin_port;
	sk_daddr_set(sk, daddr);

	inet_csk(sk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;

	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and, without releasing the socket
	 * lock, select a source port, enter ourselves into the hash tables and
	 * complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(tcp_death_row, sk);
	if (err)
		goto failure;

	sk_set_txhash(sk);

	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
			       inet->inet_sport, inet->inet_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		rt = NULL;
		goto failure;
	}
	/* OK, now commit destination to socket. */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->dst);
	rt = NULL;

	if (likely(!tp->repair)) {
		if (!tp->write_seq)
			tp->write_seq = secure_tcp_seq(inet->inet_saddr,
						       inet->inet_daddr,
						       inet->inet_sport,
						       usin->sin_port);
		tp->tsoffset = secure_tcp_ts_off(inet->inet_saddr,
						 inet->inet_daddr);
	}

	inet->inet_id = tp->write_seq ^ jiffies;

	if (tcp_fastopen_defer_connect(sk, &err))
		return err;
	if (err)
		goto failure;

	err = tcp_connect(sk);

	if (err)
		goto failure;

	return 0;

failure:
	/*
	 * This unhashes the socket and releases the local port,
	 * if necessary.
	 */
	tcp_set_state(sk, TCP_CLOSE);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->inet_dport = 0;
	return err;
}
EXPORT_SYMBOL(tcp_v4_connect);

/*
 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 * It can be called through tcp_release_cb() if socket was owned by user
 * at the time tcp_v4_err() was called to handle ICMP message.
 */
void tcp_v4_mtu_reduced(struct sock *sk)
{
	struct inet_sock *inet = inet_sk(sk);
	struct dst_entry *dst;
	u32 mtu;

	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
		return;
	mtu = tcp_sk(sk)->mtu_info;
	dst = inet_csk_update_pmtu(sk, mtu);
	if (!dst)
		return;

	/* Something is about to go wrong... Remember the soft error
	 * in case this connection is not able to recover.
	 */
	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
		sk->sk_err_soft = EMSGSIZE;

	mtu = dst_mtu(dst);

	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    ip_sk_accept_pmtu(sk) &&
	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
		tcp_sync_mss(sk, mtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}
EXPORT_SYMBOL(tcp_v4_mtu_reduced);

static void do_redirect(struct sk_buff *skb, struct sock *sk)
{
	struct dst_entry *dst = __sk_dst_check(sk, 0);

	if (dst)
		dst->ops->redirect(dst, sk, skb);
}


/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
void tcp_req_err(struct sock *sk, u32 seq, bool abort)
{
	struct request_sock *req = inet_reqsk(sk);
	struct net *net = sock_net(sk);

	/* ICMPs are not backlogged, hence we cannot get
	 * an established socket here.
	 */
	if (seq != tcp_rsk(req)->snt_isn) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
	} else if (abort) {
		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
		tcp_listendrop(req->rsk_listener);
	}
	reqsk_put(req);
}
EXPORT_SYMBOL(tcp_req_err);

/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 *
 */

void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
{
	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
	struct inet_connection_sock *icsk;
	struct tcp_sock *tp;
	struct inet_sock *inet;
	const int type = icmp_hdr(icmp_skb)->type;
	const int code = icmp_hdr(icmp_skb)->code;
	struct sock *sk;
	struct sk_buff *skb;
	struct request_sock *fastopen;
	u32 seq, snd_una;
	s32 remaining;
	u32 delta_us;
	int err;
	struct net *net = dev_net(icmp_skb->dev);

	sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
				       th->dest, iph->saddr, ntohs(th->source),
				       inet_iif(icmp_skb));
	if (!sk) {
		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
		return;
	}
	if (sk->sk_state == TCP_TIME_WAIT) {
		inet_twsk_put(inet_twsk(sk));
		return;
	}
	seq = ntohl(th->seq);
	if (sk->sk_state == TCP_NEW_SYN_RECV)
		return tcp_req_err(sk, seq,
				   type == ICMP_PARAMETERPROB ||
				   type == ICMP_TIME_EXCEEDED ||
				   (type == ICMP_DEST_UNREACH &&
				    (code == ICMP_NET_UNREACH ||
				     code == ICMP_HOST_UNREACH)));

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 * We do take care of PMTU discovery (RFC1191) special case :
	 * we can receive locally generated ICMP messages while socket is held.
	 */
	if (sock_owned_by_user(sk)) {
		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
	}
	if (sk->sk_state == TCP_CLOSE)
		goto out;

	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
		goto out;
	}

	icsk = inet_csk(sk);
	tp = tcp_sk(sk);
	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
	fastopen = tp->fastopen_rsk;
	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, snd_una, tp->snd_nxt)) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	switch (type) {
	case ICMP_REDIRECT:
		if (!sock_owned_by_user(sk))
			do_redirect(icmp_skb, sk);
		goto out;
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			/* We are not interested in TCP_LISTEN and open_requests
			 * (SYN-ACKs sent out by Linux are always <576 bytes so
			 * they should go through unfragmented).
			 */
			if (sk->sk_state == TCP_LISTEN)
				goto out;

			tp->mtu_info = info;
			if (!sock_owned_by_user(sk)) {
				tcp_v4_mtu_reduced(sk);
			} else {
				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
					sock_hold(sk);
			}
			goto out;
		}

		err = icmp_err_convert[code].errno;
		/* check if icmp_skb allows revert of backoff
		 * (see draft-zimmermann-tcp-lcd) */
		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
			break;
		if (seq != tp->snd_una || !icsk->icsk_retransmits ||
		    !icsk->icsk_backoff || fastopen)
			break;

		if (sock_owned_by_user(sk))
			break;

		icsk->icsk_backoff--;
		icsk->icsk_rto = tp->srtt_us ?
				 __tcp_set_rto(tp) :
				 TCP_TIMEOUT_INIT;
		icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);

		skb = tcp_write_queue_head(sk);
		BUG_ON(!skb);

		tcp_mstamp_refresh(tp);
		delta_us = (u32)(tp->tcp_mstamp - skb->skb_mstamp);
		remaining = icsk->icsk_rto -
			    usecs_to_jiffies(delta_us);

		if (remaining > 0) {
			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
						  remaining, TCP_RTO_MAX);
		} else {
			/* RTO revert clocked out retransmission.
			 * Will retransmit now */
			tcp_retransmit_timer(sk);
		}

		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->sk_state) {
	case TCP_SYN_SENT:
	case TCP_SYN_RECV:
		/* Only in fast or simultaneous open. If a fast open socket
		 * is already accepted it is treated as a connected one below.
		 */
		if (fastopen && !fastopen->sk)
			break;

		if (!sock_owned_by_user(sk)) {
			sk->sk_err = err;

			sk->sk_error_report(sk);

			tcp_done(sk);
		} else {
			sk->sk_err_soft = err;
		}
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows us to consider as hard errors
	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note that in the modern internet, where routing is unreliable
	 * and broken firewalls sit in every dark corner sending random
	 * errors ordered by their masters, even these two messages finally lose
	 * their original sense (even Linux sends invalid PORT_UNREACHs)
	 *
	 * Now we are in compliance with RFCs.
	 *						--ANK (980905)
	 */

	inet = inet_sk(sk);
	if (!sock_owned_by_user(sk) && inet->recverr) {
		sk->sk_err = err;
		sk->sk_error_report(sk);
	} else	{ /* Only an error on timeout */
		sk->sk_err_soft = err;
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
}

void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
{
	struct tcphdr *th = tcp_hdr(skb);

	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
		skb->csum_start = skb_transport_header(skb) - skb->head;
		skb->csum_offset = offsetof(struct tcphdr, check);
	} else {
		th->check = tcp_v4_check(skb->len, saddr, daddr,
					 csum_partial(th,
						      th->doff << 2,
						      skb->csum));
	}
}

/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
{
	const struct inet_sock *inet = inet_sk(sk);

	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
}
EXPORT_SYMBOL(tcp_v4_send_check);

/*
 *	This routine will send an RST to the other tcp.
 *
 *	Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
 *		      for the reset.
 *	Answer: if a packet caused RST, it is not for a socket
 *		existing in our system, if it is matched to a socket,
 *		it is just duplicate segment or bug in other side's TCP.
 *		So we build the reply based only on parameters that
 *		arrived with the segment.
 *	Exception: precedence violation. We do not implement it in any case.
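 *	The reply below is therefore built purely from the incoming segment:
 *	the sequence/ack numbers are derived from it, and an MD5 option is
 *	only added when a matching key can be found for the peer address.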
 */

static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
#ifdef CONFIG_TCP_MD5SIG
		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
#endif
	} rep;
	struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key = NULL;
	const __u8 *hash_location = NULL;
	unsigned char newhash[16];
	int genhash;
	struct sock *sk1 = NULL;
#endif
	struct net *net;

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	/* If sk not NULL, it means we did a successful lookup and incoming
	 * route had to be correct. prequeue might have dropped our dst.
	 */
	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rep, 0, sizeof(rep));
	rep.th.dest   = th->source;
	rep.th.source = th->dest;
	rep.th.doff   = sizeof(struct tcphdr) / 4;
	rep.th.rst    = 1;

	if (th->ack) {
		rep.th.seq = th->ack_seq;
	} else {
		rep.th.ack = 1;
		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				       skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof(arg));
	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);

	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
#ifdef CONFIG_TCP_MD5SIG
	rcu_read_lock();
	hash_location = tcp_parse_md5sig_option(th);
	if (sk && sk_fullsock(sk)) {
		key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
					&ip_hdr(skb)->saddr, AF_INET);
	} else if (hash_location) {
		/*
		 * active side is lost. Try to find listening socket through
		 * source port, and then find md5 key through listening socket.
		 * we do not lose security here:
		 * Incoming packet is checked with md5 hash with finding key,
		 * no RST generated if md5 hash doesn't match.
		 */
		sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
					     ip_hdr(skb)->saddr,
					     th->source, ip_hdr(skb)->daddr,
					     ntohs(th->source), inet_iif(skb));
		/* don't send rst if it can't find key */
		if (!sk1)
			goto out;

		key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
					&ip_hdr(skb)->saddr, AF_INET);
		if (!key)
			goto out;


		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
		if (genhash || memcmp(hash_location, newhash, 16) != 0)
			goto out;

	}

	if (key) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
				   (TCPOPT_NOP << 16) |
				   (TCPOPT_MD5SIG << 8) |
				   TCPOLEN_MD5SIG);
		/* Update length and the length the header thinks exists */
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;

	/* When socket is gone, all binding information is lost.
	 * routing might fail in this case. No choice here, if we choose to force
	 * input interface, we will misroute in case of asymmetric route.
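	 * Hence sk->sk_bound_dev_if is only honoured when a socket is available.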
	 */
	if (sk)
		arg.bound_dev_if = sk->sk_bound_dev_if;

	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));

	arg.tos = ip_hdr(skb)->tos;
	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
	local_bh_disable();
	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len);

	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
	local_bh_enable();

#ifdef CONFIG_TCP_MD5SIG
out:
	rcu_read_unlock();
#endif
}

/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
 * outside socket context, is certainly ugly. What can I do?
 */

static void tcp_v4_send_ack(const struct sock *sk,
			    struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 tsval, u32 tsecr, int oif,
			    struct tcp_md5sig_key *key,
			    int reply_flags, u8 tos)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
#ifdef CONFIG_TCP_MD5SIG
			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
#endif
			];
	} rep;
	struct net *net = sock_net(sk);
	struct ip_reply_arg arg;

	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof(arg));

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);
	if (tsecr) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				   (TCPOPT_TIMESTAMP << 8) |
				   TCPOLEN_TIMESTAMP);
		rep.opt[1] = htonl(tsval);
		rep.opt[2] = htonl(tsecr);
		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
	}

	/* Swap the send and the receive. */
	rep.th.dest    = th->source;
	rep.th.source  = th->dest;
	rep.th.doff    = arg.iov[0].iov_len / 4;
	rep.th.seq     = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack     = 1;
	rep.th.window  = htons(win);

#ifdef CONFIG_TCP_MD5SIG
	if (key) {
		int offset = (tsecr) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
					  (TCPOPT_NOP << 16) |
					  (TCPOPT_MD5SIG << 8) |
					  TCPOLEN_MD5SIG);
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.flags = reply_flags;
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	if (oif)
		arg.bound_dev_if = oif;
	arg.tos = tos;
	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
	local_bh_disable();
	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len);

	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	local_bh_enable();
}

static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
	struct inet_timewait_sock *tw = inet_twsk(sk);
	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);

	tcp_v4_send_ack(sk, skb,
			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
			tcp_time_stamp_raw() + tcptw->tw_ts_offset,
			tcptw->tw_ts_recent,
			tw->tw_bound_dev_if,
			tcp_twsk_md5_key(tcptw),
			tw->tw_transparent ?
				IP_REPLY_ARG_NOSRCCHECK : 0,
			tw->tw_tos
			);

	inet_twsk_put(tw);
}

static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req)
{
	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
	 */
	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
					     tcp_sk(sk)->snd_nxt;

	/* RFC 7323 2.3
	 * The window field (SEG.WND) of every outgoing segment, with the
	 * exception of <SYN> segments, MUST be right-shifted by
	 * Rcv.Wind.Shift bits:
	 */
	tcp_v4_send_ack(sk, skb, seq,
			tcp_rsk(req)->rcv_nxt,
			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
			tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
			req->ts_recent,
			0,
			tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
					  AF_INET),
			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
			ip_hdr(skb)->tos);
}

/*
 *	Send a SYN-ACK after having received a SYN.
 *	This still operates on a request_sock only, not on a big
 *	socket.
 */
static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
			      struct flowi *fl,
			      struct request_sock *req,
			      struct tcp_fastopen_cookie *foc,
			      enum tcp_synack_type synack_type)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	struct flowi4 fl4;
	int err = -1;
	struct sk_buff *skb;

	/* First, grab a route. */
	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
		return -1;

	skb = tcp_make_synack(sk, dst, req, foc, synack_type);

	if (skb) {
		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);

		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
					    ireq->ir_rmt_addr,
					    ireq->opt);
		err = net_xmit_eval(err);
	}

	return err;
}

/*
 *	IPv4 request_sock destructor.
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
	kfree(inet_rsk(req)->opt);
}

#ifdef CONFIG_TCP_MD5SIG
/*
 * RFC2385 MD5 checksumming requires a mapping of
 * IP address->MD5 Key.
 * We need to maintain these in the sk structure.
 */

/* Find the Key structure for an address.
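 * The per-socket key list is walked under RCU (or the socket lock) and
 * matched on address family and address.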
 */
struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
					 const union tcp_md5_addr *addr,
					 int family)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	unsigned int size = sizeof(struct in_addr);
	const struct tcp_md5sig_info *md5sig;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       lockdep_sock_is_held(sk));
	if (!md5sig)
		return NULL;
#if IS_ENABLED(CONFIG_IPV6)
	if (family == AF_INET6)
		size = sizeof(struct in6_addr);
#endif
	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
		if (key->family != family)
			continue;
		if (!memcmp(&key->addr, addr, size))
			return key;
	}
	return NULL;
}
EXPORT_SYMBOL(tcp_md5_do_lookup);

struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
					 const struct sock *addr_sk)
{
	const union tcp_md5_addr *addr;

	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
	return tcp_md5_do_lookup(sk, addr, AF_INET);
}
EXPORT_SYMBOL(tcp_v4_md5_lookup);

/* This can be called on a newly created socket, from other files */
int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
		   int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
{
	/* Add Key to the list */
	struct tcp_md5sig_key *key;
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_info *md5sig;

	key = tcp_md5_do_lookup(sk, addr, family);
	if (key) {
		/* Pre-existing entry - just update that one. */
		memcpy(key->key, newkey, newkeylen);
		key->keylen = newkeylen;
		return 0;
	}

	md5sig = rcu_dereference_protected(tp->md5sig_info,
					   lockdep_sock_is_held(sk));
	if (!md5sig) {
		md5sig = kmalloc(sizeof(*md5sig), gfp);
		if (!md5sig)
			return -ENOMEM;

		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
		INIT_HLIST_HEAD(&md5sig->head);
		rcu_assign_pointer(tp->md5sig_info, md5sig);
	}

	key = sock_kmalloc(sk, sizeof(*key), gfp);
	if (!key)
		return -ENOMEM;
	if (!tcp_alloc_md5sig_pool()) {
		sock_kfree_s(sk, key, sizeof(*key));
		return -ENOMEM;
	}

	memcpy(key->key, newkey, newkeylen);
	key->keylen = newkeylen;
	key->family = family;
	memcpy(&key->addr, addr,
	       (family == AF_INET6) ?
				      sizeof(struct in6_addr) :
				      sizeof(struct in_addr));
	hlist_add_head_rcu(&key->node, &md5sig->head);
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_add);

int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
{
	struct tcp_md5sig_key *key;

	key = tcp_md5_do_lookup(sk, addr, family);
	if (!key)
		return -ENOENT;
	hlist_del_rcu(&key->node);
	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
	kfree_rcu(key, rcu);
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_del);

static void tcp_clear_md5_list(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	struct hlist_node *n;
	struct tcp_md5sig_info *md5sig;

	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);

	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
		hlist_del_rcu(&key->node);
		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
		kfree_rcu(key, rcu);
	}
}

static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
				 int optlen)
{
	struct tcp_md5sig cmd;
	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;

	if (optlen < sizeof(cmd))
		return -EINVAL;

	if (copy_from_user(&cmd, optval, sizeof(cmd)))
		return -EFAULT;

	if (sin->sin_family != AF_INET)
		return -EINVAL;

	if (!cmd.tcpm_keylen)
		return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
				      AF_INET);

	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
		return -EINVAL;

	return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
			      AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
			      GFP_KERNEL);
}

static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
				   __be32 daddr, __be32 saddr,
				   const struct tcphdr *th, int nbytes)
{
	struct tcp4_pseudohdr *bp;
	struct scatterlist sg;
	struct tcphdr *_th;

	bp = hp->scratch;
	bp->saddr = saddr;
	bp->daddr = daddr;
	bp->pad = 0;
	bp->protocol = IPPROTO_TCP;
	bp->len = cpu_to_be16(nbytes);

	_th = (struct tcphdr *)(bp + 1);
	memcpy(_th, th, sizeof(*th));
	_th->check = 0;

	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
	ahash_request_set_crypt(hp->md5_req, &sg, NULL,
				sizeof(*bp) + sizeof(*th));
	return crypto_ahash_update(hp->md5_req);
}

static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
{
	struct tcp_md5sig_pool *hp;
	struct ahash_request *req;

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	req = hp->md5_req;

	if (crypto_ahash_init(req))
		goto clear_hash;
	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	ahash_request_set_crypt(req, NULL, md5_hash, 0);
	if (crypto_ahash_final(req))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}

int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
			const struct sock *sk,
			const struct sk_buff *skb)
{
	struct tcp_md5sig_pool *hp;
	struct ahash_request *req;
	const struct tcphdr *th = tcp_hdr(skb);
	__be32 saddr, daddr;

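	/* Take the pseudo-header addresses from the socket when one is
	 * attached, otherwise from the IP header of the skb itself.
	 */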
	if (sk) { /* valid for establish/request sockets */
		saddr = sk->sk_rcv_saddr;
		daddr = sk->sk_daddr;
	} else {
		const struct iphdr *iph = ip_hdr(skb);
		saddr = iph->saddr;
		daddr = iph->daddr;
	}

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	req = hp->md5_req;

	if (crypto_ahash_init(req))
		goto clear_hash;

	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
		goto clear_hash;
	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	ahash_request_set_crypt(req, NULL, md5_hash, 0);
	if (crypto_ahash_final(req))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}
EXPORT_SYMBOL(tcp_v4_md5_hash_skb);

#endif

/* Called with rcu_read_lock() */
static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
				    const struct sk_buff *skb)
{
#ifdef CONFIG_TCP_MD5SIG
	/*
	 * This gets called for each TCP segment that arrives
	 * so we want to be efficient.
	 * We have 3 drop cases:
	 * o No MD5 hash and one expected.
	 * o MD5 hash and we're not expecting one.
	 * o MD5 hash and it's wrong.
	 */
	const __u8 *hash_location = NULL;
	struct tcp_md5sig_key *hash_expected;
	const struct iphdr *iph = ip_hdr(skb);
	const struct tcphdr *th = tcp_hdr(skb);
	int genhash;
	unsigned char newhash[16];

	hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
					  AF_INET);
	hash_location = tcp_parse_md5sig_option(th);

	/* We've parsed the options - do we have a hash? */
	if (!hash_expected && !hash_location)
		return false;

	if (hash_expected && !hash_location) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
		return true;
	}

	if (!hash_expected && hash_location) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
		return true;
	}

	/* Okay, so this is hash_expected and hash_location -
	 * so we need to calculate the checksum.
	 */
	genhash = tcp_v4_md5_hash_skb(newhash,
				      hash_expected,
				      NULL, skb);

	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
				     &iph->saddr, ntohs(th->source),
				     &iph->daddr, ntohs(th->dest),
				     genhash ?
" tcp_v4_calc_md5_hash failed" 1202 : ""); 1203 return true; 1204 } 1205 return false; 1206 #endif 1207 return false; 1208 } 1209 1210 static void tcp_v4_init_req(struct request_sock *req, 1211 const struct sock *sk_listener, 1212 struct sk_buff *skb) 1213 { 1214 struct inet_request_sock *ireq = inet_rsk(req); 1215 1216 sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr); 1217 sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr); 1218 ireq->opt = tcp_v4_save_options(skb); 1219 } 1220 1221 static struct dst_entry *tcp_v4_route_req(const struct sock *sk, 1222 struct flowi *fl, 1223 const struct request_sock *req) 1224 { 1225 return inet_csk_route_req(sk, &fl->u.ip4, req); 1226 } 1227 1228 struct request_sock_ops tcp_request_sock_ops __read_mostly = { 1229 .family = PF_INET, 1230 .obj_size = sizeof(struct tcp_request_sock), 1231 .rtx_syn_ack = tcp_rtx_synack, 1232 .send_ack = tcp_v4_reqsk_send_ack, 1233 .destructor = tcp_v4_reqsk_destructor, 1234 .send_reset = tcp_v4_send_reset, 1235 .syn_ack_timeout = tcp_syn_ack_timeout, 1236 }; 1237 1238 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { 1239 .mss_clamp = TCP_MSS_DEFAULT, 1240 #ifdef CONFIG_TCP_MD5SIG 1241 .req_md5_lookup = tcp_v4_md5_lookup, 1242 .calc_md5_hash = tcp_v4_md5_hash_skb, 1243 #endif 1244 .init_req = tcp_v4_init_req, 1245 #ifdef CONFIG_SYN_COOKIES 1246 .cookie_init_seq = cookie_v4_init_sequence, 1247 #endif 1248 .route_req = tcp_v4_route_req, 1249 .init_seq = tcp_v4_init_seq, 1250 .init_ts_off = tcp_v4_init_ts_off, 1251 .send_synack = tcp_v4_send_synack, 1252 }; 1253 1254 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) 1255 { 1256 /* Never answer to SYNs send to broadcast or multicast */ 1257 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) 1258 goto drop; 1259 1260 return tcp_conn_request(&tcp_request_sock_ops, 1261 &tcp_request_sock_ipv4_ops, sk, skb); 1262 1263 drop: 1264 tcp_listendrop(sk); 1265 return 0; 1266 } 1267 EXPORT_SYMBOL(tcp_v4_conn_request); 1268 1269 1270 /* 1271 * The three way handshake has completed - we got a valid synack - 1272 * now create the new socket. 
 */
struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct dst_entry *dst,
				  struct request_sock *req_unhash,
				  bool *own_req)
{
	struct inet_request_sock *ireq;
	struct inet_sock *newinet;
	struct tcp_sock *newtp;
	struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key;
#endif
	struct ip_options_rcu *inet_opt;

	if (sk_acceptq_is_full(sk))
		goto exit_overflow;

	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit_nonewsk;

	newsk->sk_gso_type = SKB_GSO_TCPV4;
	inet_sk_rx_dst_set(newsk, skb);

	newtp		      = tcp_sk(newsk);
	newinet		      = inet_sk(newsk);
	ireq		      = inet_rsk(req);
	sk_daddr_set(newsk, ireq->ir_rmt_addr);
	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
	newsk->sk_bound_dev_if = ireq->ir_iif;
	newinet->inet_saddr   = ireq->ir_loc_addr;
	inet_opt	      = ireq->opt;
	rcu_assign_pointer(newinet->inet_opt, inet_opt);
	ireq->opt	      = NULL;
	newinet->mc_index     = inet_iif(skb);
	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
	newinet->rcv_tos      = ip_hdr(skb)->tos;
	inet_csk(newsk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
	newinet->inet_id = newtp->write_seq ^ jiffies;

	if (!dst) {
		dst = inet_csk_route_child_sock(sk, newsk, req);
		if (!dst)
			goto put_and_exit;
	} else {
		/* syncookie case : see end of cookie_v4_check() */
	}
	sk_setup_caps(newsk, dst);

	tcp_ca_openreq_child(newsk, dst);

	tcp_sync_mss(newsk, dst_mtu(dst));
	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));

	tcp_initialize_rcv_mss(newsk);

#ifdef CONFIG_TCP_MD5SIG
	/* Copy over the MD5 key from the original socket */
	key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
				AF_INET);
	if (key) {
		/*
		 * We're using one, so create a matching key
		 * on the newsk structure. If we fail to get
		 * memory, then we end up not copying the key
		 * across. Shucks.
		 */
		tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
			       AF_INET, key->key, key->keylen, GFP_ATOMIC);
		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
	}
#endif

	if (__inet_inherit_port(sk, newsk) < 0)
		goto put_and_exit;
	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
	if (*own_req)
		tcp_move_syn(newtp, req);

	return newsk;

exit_overflow:
	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
exit_nonewsk:
	dst_release(dst);
exit:
	tcp_listendrop(sk);
	return NULL;
put_and_exit:
	inet_csk_prepare_forced_close(newsk);
	tcp_done(newsk);
	goto exit;
}
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);

static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
{
#ifdef CONFIG_SYN_COOKIES
	const struct tcphdr *th = tcp_hdr(skb);

	if (!th->syn)
		sk = cookie_v4_check(sk, skb);
#endif
	return sk;
}

/* The socket must have its spinlock held when we get
 * here, unless it is a TCP_LISTEN socket.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
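 *
 * Established sockets take the fast path straight into
 * tcp_rcv_established(); LISTEN sockets go through the syncookie
 * check before falling into tcp_rcv_state_process().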
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	struct sock *rsk;

	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
		struct dst_entry *dst = sk->sk_rx_dst;

		sock_rps_save_rxhash(sk, skb);
		sk_mark_napi_id(sk, skb);
		if (dst) {
			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
			    !dst->ops->check(dst, 0)) {
				dst_release(dst);
				sk->sk_rx_dst = NULL;
			}
		}
		tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len);
		return 0;
	}

	if (tcp_checksum_complete(skb))
		goto csum_err;

	if (sk->sk_state == TCP_LISTEN) {
		struct sock *nsk = tcp_v4_cookie_check(sk, skb);

		if (!nsk)
			goto discard;
		if (nsk != sk) {
			if (tcp_child_process(sk, nsk, skb)) {
				rsk = nsk;
				goto reset;
			}
			return 0;
		}
	} else
		sock_rps_save_rxhash(sk, skb);

	if (tcp_rcv_state_process(sk, skb)) {
		rsk = sk;
		goto reset;
	}
	return 0;

reset:
	tcp_v4_send_reset(rsk, skb);
discard:
	kfree_skb(skb);
	/* Be careful here. If this function gets more complicated and
	 * gcc suffers from register pressure on the x86, sk (in %ebx)
	 * might be destroyed here. This current version compiles correctly,
	 * but you have been warned.
	 */
	return 0;

csum_err:
	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
	goto discard;
}
EXPORT_SYMBOL(tcp_v4_do_rcv);

void tcp_v4_early_demux(struct sk_buff *skb)
{
	const struct iphdr *iph;
	const struct tcphdr *th;
	struct sock *sk;

	if (skb->pkt_type != PACKET_HOST)
		return;

	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
		return;

	iph = ip_hdr(skb);
	th = tcp_hdr(skb);

	if (th->doff < sizeof(struct tcphdr) / 4)
		return;

	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
				       iph->saddr, th->source,
				       iph->daddr, ntohs(th->dest),
				       skb->skb_iif);
	if (sk) {
		skb->sk = sk;
		skb->destructor = sock_edemux;
		if (sk_fullsock(sk)) {
			struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);

			if (dst)
				dst = dst_check(dst, 0);
			if (dst &&
			    inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
				skb_dst_set_noref(skb, dst);
		}
	}
}

/* Packet is added to VJ-style prequeue for processing in process
 * context, if a reader task is waiting. Apparently, this exciting
 * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93)
 * failed somewhere. Latency? Burstiness? Well, at least now we will
 * see, why it failed. 8)8)				  --ANK
 *
 */
bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (sysctl_tcp_low_latency || !tp->ucopy.task)
		return false;

	if (skb->len <= tcp_hdrlen(skb) &&
	    skb_queue_len(&tp->ucopy.prequeue) == 0)
		return false;

	/* Before escaping RCU protected region, we need to take care of skb
	 * dst. Prequeue is only enabled for established sockets.
	 * For such sockets, we might need the skb dst only to set sk->sk_rx_dst
	 * Instead of doing full sk_rx_dst validation here, let's perform
	 * an optimistic check.
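	 * If sk_rx_dst is already set we can drop the skb dst right away,
	 * otherwise keep a reference so it survives until process context.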
	 */
	if (likely(sk->sk_rx_dst))
		skb_dst_drop(skb);
	else
		skb_dst_force_safe(skb);

	__skb_queue_tail(&tp->ucopy.prequeue, skb);
	tp->ucopy.memory += skb->truesize;
	if (skb_queue_len(&tp->ucopy.prequeue) >= 32 ||
	    tp->ucopy.memory + atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) {
		struct sk_buff *skb1;

		BUG_ON(sock_owned_by_user(sk));
		__NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPPREQUEUEDROPPED,
				skb_queue_len(&tp->ucopy.prequeue));

		while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
			sk_backlog_rcv(sk, skb1);

		tp->ucopy.memory = 0;
	} else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
		wake_up_interruptible_sync_poll(sk_sleep(sk),
						POLLIN | POLLRDNORM | POLLRDBAND);
		if (!inet_csk_ack_scheduled(sk))
			inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
						  (3 * tcp_rto_min(sk)) / 4,
						  TCP_RTO_MAX);
	}
	return true;
}
EXPORT_SYMBOL(tcp_prequeue);

bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
{
	u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;

	/* Only socket owner can try to collapse/prune rx queues
	 * to reduce memory overhead, so add a little headroom here.
	 * Only a few socket backlogs are likely to be non-empty concurrently.
	 */
	limit += 64*1024;

	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
	 * we can fix skb->truesize to its real value to avoid future drops.
	 * This is valid because skb is not yet charged to the socket.
	 * It has been noticed pure SACK packets were sometimes dropped
	 * (if cooked by drivers without copybreak feature).
	 */
	skb_condense(skb);

	if (unlikely(sk_add_backlog(sk, skb, limit))) {
		bh_unlock_sock(sk);
		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
		return true;
	}
	return false;
}
EXPORT_SYMBOL(tcp_add_backlog);

int tcp_filter(struct sock *sk, struct sk_buff *skb)
{
	struct tcphdr *th = (struct tcphdr *)skb->data;
	unsigned int eaten = skb->len;
	int err;

	err = sk_filter_trim_cap(sk, skb, th->doff * 4);
	if (!err) {
		eaten -= skb->len;
		TCP_SKB_CB(skb)->end_seq -= eaten;
	}
	return err;
}
EXPORT_SYMBOL(tcp_filter);

/*
 *	From tcp_input.c
 */

int tcp_v4_rcv(struct sk_buff *skb)
{
	struct net *net = dev_net(skb->dev);
	const struct iphdr *iph;
	const struct tcphdr *th;
	bool refcounted;
	struct sock *sk;
	int ret;

	if (skb->pkt_type != PACKET_HOST)
		goto discard_it;

	/* Count it even if it's bad */
	__TCP_INC_STATS(net, TCP_MIB_INSEGS);

	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
		goto discard_it;

	th = (const struct tcphdr *)skb->data;

	if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
		goto bad_packet;
	if (!pskb_may_pull(skb, th->doff * 4))
		goto discard_it;

	/* An explanation is required here, I think.
	 * Packet length and doff are validated by header prediction,
	 * provided case of th->doff==0 is eliminated.
	 * So, we defer the checks. */

	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
		goto csum_error;

	th = (const struct tcphdr *)skb->data;
	iph = ip_hdr(skb);
	/* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
	 * barrier() makes sure compiler won't play fool^Waliasing games.
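	 * From this point on only TCP_SKB_CB(skb) may be used; the IPCB
	 * contents now live in TCP_SKB_CB(skb)->header.h4.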
	 */
	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
		sizeof(struct inet_skb_parm));
	barrier();

	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
				    skb->len - th->doff * 4);
	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
	TCP_SKB_CB(skb)->sacked	 = 0;

lookup:
	sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
			       th->dest, &refcounted);
	if (!sk)
		goto no_tcp_socket;

process:
	if (sk->sk_state == TCP_TIME_WAIT)
		goto do_time_wait;

	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		struct request_sock *req = inet_reqsk(sk);
		struct sock *nsk;

		sk = req->rsk_listener;
		if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
			sk_drops_add(sk, skb);
			reqsk_put(req);
			goto discard_it;
		}
		if (unlikely(sk->sk_state != TCP_LISTEN)) {
			inet_csk_reqsk_queue_drop_and_put(sk, req);
			goto lookup;
		}
		/* We own a reference on the listener, increase it again
		 * as we might lose it too soon.
		 */
		sock_hold(sk);
		refcounted = true;
		nsk = tcp_check_req(sk, skb, req, false);
		if (!nsk) {
			reqsk_put(req);
			goto discard_and_relse;
		}
		if (nsk == sk) {
			reqsk_put(req);
		} else if (tcp_child_process(sk, nsk, skb)) {
			tcp_v4_send_reset(nsk, skb);
			goto discard_and_relse;
		} else {
			sock_put(sk);
			return 0;
		}
	}
	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
		goto discard_and_relse;
	}

	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
		goto discard_and_relse;

	if (tcp_v4_inbound_md5_hash(sk, skb))
		goto discard_and_relse;

	nf_reset(skb);

	if (tcp_filter(sk, skb))
		goto discard_and_relse;
	th = (const struct tcphdr *)skb->data;
	iph = ip_hdr(skb);

	skb->dev = NULL;

	if (sk->sk_state == TCP_LISTEN) {
		ret = tcp_v4_do_rcv(sk, skb);
		goto put_and_return;
	}

	sk_incoming_cpu_update(sk);

	bh_lock_sock_nested(sk);
	tcp_segs_in(tcp_sk(sk), skb);
	ret = 0;
	if (!sock_owned_by_user(sk)) {
		if (!tcp_prequeue(sk, skb))
			ret = tcp_v4_do_rcv(sk, skb);
	} else if (tcp_add_backlog(sk, skb)) {
		goto discard_and_relse;
	}
	bh_unlock_sock(sk);

put_and_return:
	if (refcounted)
		sock_put(sk);

	return ret;

no_tcp_socket:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
		goto discard_it;

	if (tcp_checksum_complete(skb)) {
csum_error:
		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
bad_packet:
		__TCP_INC_STATS(net, TCP_MIB_INERRS);
	} else {
		tcp_v4_send_reset(NULL, skb);
	}

discard_it:
	/* Discard frame.
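	 * No socket reference is held on this path, so a plain kfree_skb()
	 * is all that is needed.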
	 */
	kfree_skb(skb);
	return 0;

discard_and_relse:
	sk_drops_add(sk, skb);
	if (refcounted)
		sock_put(sk);
	goto discard_it;

do_time_wait:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto discard_it;
	}

	if (tcp_checksum_complete(skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto csum_error;
	}
	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
	case TCP_TW_SYN: {
		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
							&tcp_hashinfo, skb,
							__tcp_hdrlen(th),
							iph->saddr, th->source,
							iph->daddr, th->dest,
							inet_iif(skb));
		if (sk2) {
			inet_twsk_deschedule_put(inet_twsk(sk));
			sk = sk2;
			refcounted = false;
			goto process;
		}
		/* Fall through to ACK */
	}
	case TCP_TW_ACK:
		tcp_v4_timewait_ack(sk, skb);
		break;
	case TCP_TW_RST:
		tcp_v4_send_reset(sk, skb);
		inet_twsk_deschedule_put(inet_twsk(sk));
		goto discard_it;
	case TCP_TW_SUCCESS:;
	}
	goto discard_it;
}

static struct timewait_sock_ops tcp_timewait_sock_ops = {
	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
	.twsk_unique	= tcp_twsk_unique,
	.twsk_destructor= tcp_twsk_destructor,
};

void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);

	if (dst && dst_hold_safe(dst)) {
		sk->sk_rx_dst = dst;
		inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
	}
}
EXPORT_SYMBOL(inet_sk_rx_dst_set);

const struct inet_connection_sock_af_ops ipv4_specific = {
	.queue_xmit	   = ip_queue_xmit,
	.send_check	   = tcp_v4_send_check,
	.rebuild_header	   = inet_sk_rebuild_header,
	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
	.conn_request	   = tcp_v4_conn_request,
	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
	.net_header_len	   = sizeof(struct iphdr),
	.setsockopt	   = ip_setsockopt,
	.getsockopt	   = ip_getsockopt,
	.addr2sockaddr	   = inet_csk_addr2sockaddr,
	.sockaddr_len	   = sizeof(struct sockaddr_in),
#ifdef CONFIG_COMPAT
	.compat_setsockopt = compat_ip_setsockopt,
	.compat_getsockopt = compat_ip_getsockopt,
#endif
	.mtu_reduced	   = tcp_v4_mtu_reduced,
};
EXPORT_SYMBOL(ipv4_specific);

#ifdef CONFIG_TCP_MD5SIG
static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
	.md5_lookup	= tcp_v4_md5_lookup,
	.calc_md5_hash	= tcp_v4_md5_hash_skb,
	.md5_parse	= tcp_v4_parse_md5_keys,
};
#endif

/* NOTE: A lot of things set to zero explicitly by call to
 *       sk_alloc() so need not be done here.
 */
static int tcp_v4_init_sock(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	tcp_init_sock(sk);

	icsk->icsk_af_ops = &ipv4_specific;

#ifdef CONFIG_TCP_MD5SIG
	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
#endif

	return 0;
}

void tcp_v4_destroy_sock(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	tcp_clear_xmit_timers(sk);

	tcp_cleanup_congestion_control(sk);

	/* Clean up the write buffer. */
	tcp_write_queue_purge(sk);

	/* Check if we want to disable active TFO */
	tcp_fastopen_active_disable_ofo_check(sk);

	/* Cleans up our, hopefully empty, out_of_order_queue.
	 */
	skb_rbtree_purge(&tp->out_of_order_queue);

#ifdef CONFIG_TCP_MD5SIG
	/* Clean up the MD5 key list, if any */
	if (tp->md5sig_info) {
		tcp_clear_md5_list(sk);
		kfree_rcu(tp->md5sig_info, rcu);
		tp->md5sig_info = NULL;
	}
#endif

	/* Clean prequeue, it must be empty really */
	__skb_queue_purge(&tp->ucopy.prequeue);

	/* Clean up a referenced TCP bind bucket. */
	if (inet_csk(sk)->icsk_bind_hash)
		inet_put_port(sk);

	BUG_ON(tp->fastopen_rsk);

	/* If socket is aborted during connect operation */
	tcp_free_fastopen_req(tp);
	tcp_saved_syn_free(tp);

	sk_sockets_allocated_dec(sk);
}
EXPORT_SYMBOL(tcp_v4_destroy_sock);

#ifdef CONFIG_PROC_FS
/* Proc filesystem TCP sock list dumping. */

/*
 * Get the next listener socket following cur.  If cur is NULL, get first
 * socket starting from bucket given in st->bucket; when st->bucket is zero
 * the very first socket in the hash table is returned.
 */
static void *listening_get_next(struct seq_file *seq, void *cur)
{
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);
	struct inet_listen_hashbucket *ilb;
	struct sock *sk = cur;

	if (!sk) {
get_head:
		ilb = &tcp_hashinfo.listening_hash[st->bucket];
		spin_lock(&ilb->lock);
		sk = sk_head(&ilb->head);
		st->offset = 0;
		goto get_sk;
	}
	ilb = &tcp_hashinfo.listening_hash[st->bucket];
	++st->num;
	++st->offset;

	sk = sk_next(sk);
get_sk:
	sk_for_each_from(sk) {
		if (!net_eq(sock_net(sk), net))
			continue;
		if (sk->sk_family == st->family)
			return sk;
	}
	spin_unlock(&ilb->lock);
	st->offset = 0;
	if (++st->bucket < INET_LHTABLE_SIZE)
		goto get_head;
	return NULL;
}

static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	st->bucket = 0;
	st->offset = 0;
	rc = listening_get_next(seq, NULL);

	while (rc && *pos) {
		rc = listening_get_next(seq, rc);
		--*pos;
	}
	return rc;
}

static inline bool empty_bucket(const struct tcp_iter_state *st)
{
	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
}

/*
 * Get first established socket starting from bucket given in st->bucket.
 * If st->bucket is zero, the very first socket in the hash is returned.
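 * On success the ehash bucket lock of the returned socket is left held;
 * it is released by established_get_next() or tcp_seq_stop().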
/*
 * Get first established socket starting from bucket given in st->bucket.
 * If st->bucket is zero, the very first socket in the hash is returned.
 */
static void *established_get_first(struct seq_file *seq)
{
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);
	void *rc = NULL;

	st->offset = 0;
	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
		struct sock *sk;
		struct hlist_nulls_node *node;
		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);

		/* Lockless fast path for the common case of empty buckets */
		if (empty_bucket(st))
			continue;

		spin_lock_bh(lock);
		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
			if (sk->sk_family != st->family ||
			    !net_eq(sock_net(sk), net)) {
				continue;
			}
			rc = sk;
			goto out;
		}
		spin_unlock_bh(lock);
	}
out:
	return rc;
}

static void *established_get_next(struct seq_file *seq, void *cur)
{
	struct sock *sk = cur;
	struct hlist_nulls_node *node;
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);

	++st->num;
	++st->offset;

	sk = sk_nulls_next(sk);

	sk_nulls_for_each_from(sk, node) {
		if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
			return sk;
	}

	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
	++st->bucket;
	return established_get_first(seq);
}

static void *established_get_idx(struct seq_file *seq, loff_t pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	st->bucket = 0;
	rc = established_get_first(seq);

	while (rc && pos) {
		rc = established_get_next(seq, rc);
		--pos;
	}
	return rc;
}

static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
{
	void *rc;
	struct tcp_iter_state *st = seq->private;

	st->state = TCP_SEQ_STATE_LISTENING;
	rc = listening_get_idx(seq, &pos);

	if (!rc) {
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		rc = established_get_idx(seq, pos);
	}

	return rc;
}

static void *tcp_seek_last_pos(struct seq_file *seq)
{
	struct tcp_iter_state *st = seq->private;
	int offset = st->offset;
	int orig_num = st->num;
	void *rc = NULL;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		if (st->bucket >= INET_LHTABLE_SIZE)
			break;
		st->state = TCP_SEQ_STATE_LISTENING;
		rc = listening_get_next(seq, NULL);
		while (offset-- && rc)
			rc = listening_get_next(seq, rc);
		if (rc)
			break;
		st->bucket = 0;
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		/* Fallthrough */
	case TCP_SEQ_STATE_ESTABLISHED:
		if (st->bucket > tcp_hashinfo.ehash_mask)
			break;
		rc = established_get_first(seq);
		while (offset-- && rc)
			rc = established_get_next(seq, rc);
	}

	st->num = orig_num;

	return rc;
}

static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	if (*pos && *pos == st->last_pos) {
		rc = tcp_seek_last_pos(seq);
		if (rc)
			goto out;
	}

	st->state = TCP_SEQ_STATE_LISTENING;
	st->num = 0;
	st->bucket = 0;
	st->offset = 0;
	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;

out:
	st->last_pos = *pos;
	return rc;
}
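/*
 * Note on the fast path in tcp_seq_start(): a sequential reader normally
 * re-enters ->start() with the same *pos it stopped at, so st->last_pos
 * matches and tcp_seek_last_pos() can jump straight back to the remembered
 * bucket/offset instead of replaying the whole walk from bucket 0.  Only
 * when the reader seeks (or on the very first read) do we fall back to the
 * full tcp_get_idx() scan, which is O(number of sockets already listed).
 */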
static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc = NULL;

	if (v == SEQ_START_TOKEN) {
		rc = tcp_get_idx(seq, 0);
		goto out;
	}

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		rc = listening_get_next(seq, v);
		if (!rc) {
			st->state = TCP_SEQ_STATE_ESTABLISHED;
			st->bucket = 0;
			st->offset = 0;
			rc = established_get_first(seq);
		}
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		rc = established_get_next(seq, v);
		break;
	}
out:
	++*pos;
	st->last_pos = *pos;
	return rc;
}

static void tcp_seq_stop(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		if (v != SEQ_START_TOKEN)
			spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		if (v)
			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
		break;
	}
}

int tcp_seq_open(struct inode *inode, struct file *file)
{
	struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
	struct tcp_iter_state *s;
	int err;

	err = seq_open_net(inode, file, &afinfo->seq_ops,
			   sizeof(struct tcp_iter_state));
	if (err < 0)
		return err;

	s = ((struct seq_file *)file->private_data)->private;
	s->family		= afinfo->family;
	s->last_pos		= 0;
	return 0;
}
EXPORT_SYMBOL(tcp_seq_open);

int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
{
	int rc = 0;
	struct proc_dir_entry *p;

	afinfo->seq_ops.start		= tcp_seq_start;
	afinfo->seq_ops.next		= tcp_seq_next;
	afinfo->seq_ops.stop		= tcp_seq_stop;

	p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
			     afinfo->seq_fops, afinfo);
	if (!p)
		rc = -ENOMEM;
	return rc;
}
EXPORT_SYMBOL(tcp_proc_register);

void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
{
	remove_proc_entry(afinfo->name, net->proc_net);
}
EXPORT_SYMBOL(tcp_proc_unregister);

static void get_openreq4(const struct request_sock *req,
			 struct seq_file *f, int i)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	long delta = req->rsk_timer.expires - jiffies;

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
		i,
		ireq->ir_loc_addr,
		ireq->ir_num,
		ireq->ir_rmt_addr,
		ntohs(ireq->ir_rmt_port),
		TCP_SYN_RECV,
		0, 0, /* could print option size, but that is af dependent. */
		1,    /* timers active (only the expire timer) */
		jiffies_delta_to_clock_t(delta),
		req->num_timeout,
		from_kuid_munged(seq_user_ns(f),
				 sock_i_uid(req->rsk_listener)),
		0,  /* non standard timer */
		0, /* open_requests have no inode */
		0,
		req);
}
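/*
 * get_openreq4() above and get_tcp4_sock()/get_timewait4_sock() below each
 * emit one /proc/net/tcp record matching the header written by
 * tcp4_seq_show():
 *
 *	sl  local_address rem_address  st  tx_queue:rx_queue  tr:tm->when
 *	retrnsmt  uid  timeout  inode
 *
 * followed by the socket refcount, the socket pointer (%pK) and, for full
 * sockets, a few extra timer/congestion fields (RTO, delayed-ACK ato,
 * quick/pingpong flags, snd_cwnd, ssthresh) that correspond to the
 * trailing arguments of get_tcp4_sock().  Addresses and ports are printed
 * in hexadecimal.
 */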
static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
{
	int timer_active;
	unsigned long timer_expires;
	const struct tcp_sock *tp = tcp_sk(sk);
	const struct inet_connection_sock *icsk = inet_csk(sk);
	const struct inet_sock *inet = inet_sk(sk);
	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
	__be32 dest = inet->inet_daddr;
	__be32 src = inet->inet_rcv_saddr;
	__u16 destp = ntohs(inet->inet_dport);
	__u16 srcp = ntohs(inet->inet_sport);
	int rx_queue;
	int state;

	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
		timer_active	= 1;
		timer_expires	= icsk->icsk_timeout;
	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
		timer_active	= 4;
		timer_expires	= icsk->icsk_timeout;
	} else if (timer_pending(&sk->sk_timer)) {
		timer_active	= 2;
		timer_expires	= sk->sk_timer.expires;
	} else {
		timer_active	= 0;
		timer_expires = jiffies;
	}

	state = sk_state_load(sk);
	if (state == TCP_LISTEN)
		rx_queue = sk->sk_ack_backlog;
	else
		/* Because we don't lock the socket,
		 * we might find a transient negative value.
		 */
		rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
		i, src, srcp, dest, destp, state,
		tp->write_seq - tp->snd_una,
		rx_queue,
		timer_active,
		jiffies_delta_to_clock_t(timer_expires - jiffies),
		icsk->icsk_retransmits,
		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
		icsk->icsk_probes_out,
		sock_i_ino(sk),
		atomic_read(&sk->sk_refcnt), sk,
		jiffies_to_clock_t(icsk->icsk_rto),
		jiffies_to_clock_t(icsk->icsk_ack.ato),
		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
		tp->snd_cwnd,
		state == TCP_LISTEN ?
		    fastopenq->max_qlen :
		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
}
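/*
 * The "tr" (timer) field printed above uses a small encoding taken
 * directly from the icsk_pending / sk_timer state:
 *
 *	0  no timer pending
 *	1  retransmit, RACK reorder, or tail-loss-probe timer
 *	2  keepalive-style timer on sk->sk_timer
 *	3  TIME_WAIT timer (hard-coded in get_timewait4_sock() below)
 *	4  zero-window probe timer
 *
 * The expiry that follows it is converted to clock ticks with
 * jiffies_delta_to_clock_t(), which is what userspace tools that parse
 * /proc/net/tcp expect.
 */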
static void get_timewait4_sock(const struct inet_timewait_sock *tw,
			       struct seq_file *f, int i)
{
	long delta = tw->tw_timer.expires - jiffies;
	__be32 dest, src;
	__u16 destp, srcp;

	dest  = tw->tw_daddr;
	src   = tw->tw_rcv_saddr;
	destp = ntohs(tw->tw_dport);
	srcp  = ntohs(tw->tw_sport);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
		atomic_read(&tw->tw_refcnt), tw);
}

#define TMPSZ 150

static int tcp4_seq_show(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st;
	struct sock *sk = v;

	seq_setwidth(seq, TMPSZ - 1);
	if (v == SEQ_START_TOKEN) {
		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
			   "rx_queue tr tm->when retrnsmt   uid  timeout "
			   "inode");
		goto out;
	}
	st = seq->private;

	if (sk->sk_state == TCP_TIME_WAIT)
		get_timewait4_sock(v, seq, st->num);
	else if (sk->sk_state == TCP_NEW_SYN_RECV)
		get_openreq4(v, seq, st->num);
	else
		get_tcp4_sock(v, seq, st->num);
out:
	seq_pad(seq, '\n');
	return 0;
}

static const struct file_operations tcp_afinfo_seq_fops = {
	.owner   = THIS_MODULE,
	.open    = tcp_seq_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release_net
};

static struct tcp_seq_afinfo tcp4_seq_afinfo = {
	.name		= "tcp",
	.family		= AF_INET,
	.seq_fops	= &tcp_afinfo_seq_fops,
	.seq_ops	= {
		.show		= tcp4_seq_show,
	},
};

static int __net_init tcp4_proc_init_net(struct net *net)
{
	return tcp_proc_register(net, &tcp4_seq_afinfo);
}

static void __net_exit tcp4_proc_exit_net(struct net *net)
{
	tcp_proc_unregister(net, &tcp4_seq_afinfo);
}

static struct pernet_operations tcp4_net_ops = {
	.init = tcp4_proc_init_net,
	.exit = tcp4_proc_exit_net,
};

int __init tcp4_proc_init(void)
{
	return register_pernet_subsys(&tcp4_net_ops);
}

void tcp4_proc_exit(void)
{
	unregister_pernet_subsys(&tcp4_net_ops);
}
#endif /* CONFIG_PROC_FS */

struct proto tcp_prot = {
	.name			= "TCP",
	.owner			= THIS_MODULE,
	.close			= tcp_close,
	.connect		= tcp_v4_connect,
	.disconnect		= tcp_disconnect,
	.accept			= inet_csk_accept,
	.ioctl			= tcp_ioctl,
	.init			= tcp_v4_init_sock,
	.destroy		= tcp_v4_destroy_sock,
	.shutdown		= tcp_shutdown,
	.setsockopt		= tcp_setsockopt,
	.getsockopt		= tcp_getsockopt,
	.keepalive		= tcp_set_keepalive,
	.recvmsg		= tcp_recvmsg,
	.sendmsg		= tcp_sendmsg,
	.sendpage		= tcp_sendpage,
	.backlog_rcv		= tcp_v4_do_rcv,
	.release_cb		= tcp_release_cb,
	.hash			= inet_hash,
	.unhash			= inet_unhash,
	.get_port		= inet_csk_get_port,
	.enter_memory_pressure	= tcp_enter_memory_pressure,
	.stream_memory_free	= tcp_stream_memory_free,
	.sockets_allocated	= &tcp_sockets_allocated,
	.orphan_count		= &tcp_orphan_count,
	.memory_allocated	= &tcp_memory_allocated,
	.memory_pressure	= &tcp_memory_pressure,
	.sysctl_mem		= sysctl_tcp_mem,
	.sysctl_wmem		= sysctl_tcp_wmem,
	.sysctl_rmem		= sysctl_tcp_rmem,
	.max_header		= MAX_TCP_HEADER,
	.obj_size		= sizeof(struct tcp_sock),
	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
	.twsk_prot		= &tcp_timewait_sock_ops,
	.rsk_prot		= &tcp_request_sock_ops,
	.h.hashinfo		= &tcp_hashinfo,
	.no_autobind		= true,
#ifdef CONFIG_COMPAT
	.compat_setsockopt	= compat_tcp_setsockopt,
	.compat_getsockopt	= compat_tcp_getsockopt,
#endif
	.diag_destroy		= tcp_abort,
};
EXPORT_SYMBOL(tcp_prot);
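/*
 * A note on .slab_flags above: SLAB_TYPESAFE_BY_RCU means a freed tcp_sock
 * may be reused for a new socket before an RCU grace period has elapsed,
 * so lockless lookups (e.g. in the ehash) must take a reference and then
 * re-check the lookup keys before trusting the socket.  tcp_prot itself is
 * hooked into the socket layer elsewhere; roughly, following the usual
 * af_inet pattern (illustrative sketch only):
 *
 *	proto_register(&tcp_prot, 1);	// create the slab, add to proto list
 *	// ...and an inet_protosw entry maps SOCK_STREAM/IPPROTO_TCP to it.
 */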
static void __net_exit tcp_sk_exit(struct net *net)
{
	int cpu;

	for_each_possible_cpu(cpu)
		inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
	free_percpu(net->ipv4.tcp_sk);
}

static int __net_init tcp_sk_init(struct net *net)
{
	int res, cpu, cnt;

	net->ipv4.tcp_sk = alloc_percpu(struct sock *);
	if (!net->ipv4.tcp_sk)
		return -ENOMEM;

	for_each_possible_cpu(cpu) {
		struct sock *sk;

		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
					   IPPROTO_TCP, net);
		if (res)
			goto fail;
		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
		*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
	}

	net->ipv4.sysctl_tcp_ecn = 2;
	net->ipv4.sysctl_tcp_ecn_fallback = 1;

	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;

	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;

	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
	net->ipv4.sysctl_tcp_syncookies = 1;
	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
	net->ipv4.sysctl_tcp_orphan_retries = 0;
	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
	net->ipv4.sysctl_tcp_tw_reuse = 0;

	cnt = tcp_hashinfo.ehash_mask + 1;
	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = (cnt + 1) / 2;
	net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;

	net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 256);
	net->ipv4.sysctl_tcp_sack = 1;

	return 0;
fail:
	tcp_sk_exit(net);

	return res;
}

static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
{
	inet_twsk_purge(&tcp_hashinfo, AF_INET);
}

static struct pernet_operations __net_initdata tcp_sk_ops = {
	.init	   = tcp_sk_init,
	.exit	   = tcp_sk_exit,
	.exit_batch = tcp_sk_exit_batch,
};

void __init tcp_v4_init(void)
{
	if (register_pernet_subsys(&tcp_sk_ops))
		panic("Failed to create the TCP control socket.\n");
}
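/*
 * Summary of the per-namespace setup above: tcp_sk_init() runs for every
 * network namespace (including init_net at boot, via tcp_v4_init()), and
 * the per-cpu SOCK_RAW "control" sockets it creates are what the reply
 * paths (e.g. tcp_v4_send_reset()/tcp_v4_send_ack()) use to transmit RSTs
 * and ACKs that have no full socket of their own.  A very rough sketch of
 * that usage pattern (illustrative only, not a literal quote of the reply
 * code):
 *
 *	struct sock *ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk);
 *	// ...build the reply and hand it to ip_send_unicast_reply(ctl_sk, ...)
 *
 * tcp_v4_init() panics on failure because TCP cannot operate without
 * these control sockets.
 */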