xref: /openbmc/linux/net/ipv4/tcp_ipv4.c (revision 1c2dd16a)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		Implementation of the Transmission Control Protocol(TCP).
7  *
8  *		IPv4 specific functions
9  *
10  *
11  *		code split from:
12  *		linux/ipv4/tcp.c
13  *		linux/ipv4/tcp_input.c
14  *		linux/ipv4/tcp_output.c
15  *
16  *		See tcp.c for author information
17  *
18  *	This program is free software; you can redistribute it and/or
19  *      modify it under the terms of the GNU General Public License
20  *      as published by the Free Software Foundation; either version
21  *      2 of the License, or (at your option) any later version.
22  */
23 
24 /*
25  * Changes:
26  *		David S. Miller	:	New socket lookup architecture.
27  *					This code is dedicated to John Dyson.
28  *		David S. Miller :	Change semantics of established hash,
29  *					half is devoted to TIME_WAIT sockets
30  *					and the rest go in the other half.
31  *		Andi Kleen :		Add support for syncookies and fixed
32  *					some bugs: ip options weren't passed to
33  *					the TCP layer, missed a check for an
34  *					ACK bit.
35  *		Andi Kleen :		Implemented fast path mtu discovery.
36  *	     				Fixed many serious bugs in the
37  *					request_sock handling and moved
38  *					most of it into the af independent code.
39  *					Added tail drop and some other bugfixes.
40  *					Added new listen semantics.
41  *		Mike McLagan	:	Routing by source
42  *	Juan Jose Ciarlante:		ip_dynaddr bits
43  *		Andi Kleen:		various fixes.
44  *	Vitaly E. Lavrov	:	Transparent proxy revived after a
45  *					year-long coma.
46  *	Andi Kleen		:	Fix new listen.
47  *	Andi Kleen		:	Fix accept error reporting.
48  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
49  *	Alexey Kuznetsov		allows both IPv4 and IPv6 sockets to bind
50  *					a single port at the same time.
51  */
52 
53 #define pr_fmt(fmt) "TCP: " fmt
54 
55 #include <linux/bottom_half.h>
56 #include <linux/types.h>
57 #include <linux/fcntl.h>
58 #include <linux/module.h>
59 #include <linux/random.h>
60 #include <linux/cache.h>
61 #include <linux/jhash.h>
62 #include <linux/init.h>
63 #include <linux/times.h>
64 #include <linux/slab.h>
65 
66 #include <net/net_namespace.h>
67 #include <net/icmp.h>
68 #include <net/inet_hashtables.h>
69 #include <net/tcp.h>
70 #include <net/transp_v6.h>
71 #include <net/ipv6.h>
72 #include <net/inet_common.h>
73 #include <net/timewait_sock.h>
74 #include <net/xfrm.h>
75 #include <net/secure_seq.h>
76 #include <net/busy_poll.h>
77 
78 #include <linux/inet.h>
79 #include <linux/ipv6.h>
80 #include <linux/stddef.h>
81 #include <linux/proc_fs.h>
82 #include <linux/seq_file.h>
83 
84 #include <crypto/hash.h>
85 #include <linux/scatterlist.h>
86 
87 int sysctl_tcp_low_latency __read_mostly;
88 
89 #ifdef CONFIG_TCP_MD5SIG
90 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
91 			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
92 #endif
93 
94 struct inet_hashinfo tcp_hashinfo;
95 EXPORT_SYMBOL(tcp_hashinfo);
96 
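/* Compute the initial sequence number for a connection from the addresses
 * and ports of the received segment; the matching timestamp offset is
 * returned through *tsoff.
 */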
97 static u32 tcp_v4_init_seq_and_tsoff(const struct sk_buff *skb, u32 *tsoff)
98 {
99 	return secure_tcp_seq_and_tsoff(ip_hdr(skb)->daddr,
100 					ip_hdr(skb)->saddr,
101 					tcp_hdr(skb)->dest,
102 					tcp_hdr(skb)->source, tsoff);
103 }
104 
105 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
106 {
107 	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
108 	struct tcp_sock *tp = tcp_sk(sk);
109 
110 	/* With PAWS, it is safe from the viewpoint
111 	   of data integrity. Even without PAWS it is safe provided sequence
112 	   spaces do not overlap, i.e. at data rates <= 80Mbit/sec.
113 
114 	   Actually, the idea is close to VJ's, only the timestamp cache is
115 	   held not per host but per port pair, and the TW bucket is used as
116 	   the state holder.
117 
118 	   If the TW bucket has already been destroyed we fall back to VJ's
119 	   scheme and use the initial timestamp retrieved from the peer table.
120 	 */
121 	if (tcptw->tw_ts_recent_stamp &&
122 	    (!twp || (sock_net(sk)->ipv4.sysctl_tcp_tw_reuse &&
123 			     get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
124 		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
125 		if (tp->write_seq == 0)
126 			tp->write_seq = 1;
127 		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
128 		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
129 		sock_hold(sktw);
130 		return 1;
131 	}
132 
133 	return 0;
134 }
135 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
136 
137 /* This will initiate an outgoing connection. */
138 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
139 {
140 	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
141 	struct inet_sock *inet = inet_sk(sk);
142 	struct tcp_sock *tp = tcp_sk(sk);
143 	__be16 orig_sport, orig_dport;
144 	__be32 daddr, nexthop;
145 	struct flowi4 *fl4;
146 	struct rtable *rt;
147 	int err;
148 	u32 seq;
149 	struct ip_options_rcu *inet_opt;
150 	struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
151 
152 	if (addr_len < sizeof(struct sockaddr_in))
153 		return -EINVAL;
154 
155 	if (usin->sin_family != AF_INET)
156 		return -EAFNOSUPPORT;
157 
158 	nexthop = daddr = usin->sin_addr.s_addr;
159 	inet_opt = rcu_dereference_protected(inet->inet_opt,
160 					     lockdep_sock_is_held(sk));
161 	if (inet_opt && inet_opt->opt.srr) {
162 		if (!daddr)
163 			return -EINVAL;
164 		nexthop = inet_opt->opt.faddr;
165 	}
166 
167 	orig_sport = inet->inet_sport;
168 	orig_dport = usin->sin_port;
169 	fl4 = &inet->cork.fl.u.ip4;
170 	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
171 			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
172 			      IPPROTO_TCP,
173 			      orig_sport, orig_dport, sk);
174 	if (IS_ERR(rt)) {
175 		err = PTR_ERR(rt);
176 		if (err == -ENETUNREACH)
177 			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
178 		return err;
179 	}
180 
181 	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
182 		ip_rt_put(rt);
183 		return -ENETUNREACH;
184 	}
185 
186 	if (!inet_opt || !inet_opt->opt.srr)
187 		daddr = fl4->daddr;
188 
189 	if (!inet->inet_saddr)
190 		inet->inet_saddr = fl4->saddr;
191 	sk_rcv_saddr_set(sk, inet->inet_saddr);
192 
193 	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
194 		/* Reset inherited state */
195 		tp->rx_opt.ts_recent	   = 0;
196 		tp->rx_opt.ts_recent_stamp = 0;
197 		if (likely(!tp->repair))
198 			tp->write_seq	   = 0;
199 	}
200 
201 	inet->inet_dport = usin->sin_port;
202 	sk_daddr_set(sk, daddr);
203 
204 	inet_csk(sk)->icsk_ext_hdr_len = 0;
205 	if (inet_opt)
206 		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
207 
208 	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
209 
210 	/* Socket identity is still unknown (sport may be zero).
211 	 * However we set the state to SYN-SENT and, without releasing the
212 	 * socket lock, select a source port, enter ourselves into the hash
213 	 * tables and complete initialization after this.
214 	 */
215 	tcp_set_state(sk, TCP_SYN_SENT);
216 	err = inet_hash_connect(tcp_death_row, sk);
217 	if (err)
218 		goto failure;
219 
220 	sk_set_txhash(sk);
221 
222 	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
223 			       inet->inet_sport, inet->inet_dport, sk);
224 	if (IS_ERR(rt)) {
225 		err = PTR_ERR(rt);
226 		rt = NULL;
227 		goto failure;
228 	}
229 	/* OK, now commit destination to socket.  */
230 	sk->sk_gso_type = SKB_GSO_TCPV4;
231 	sk_setup_caps(sk, &rt->dst);
232 	rt = NULL;
233 
234 	if (likely(!tp->repair)) {
235 		seq = secure_tcp_seq_and_tsoff(inet->inet_saddr,
236 					       inet->inet_daddr,
237 					       inet->inet_sport,
238 					       usin->sin_port,
239 					       &tp->tsoffset);
240 		if (!tp->write_seq)
241 			tp->write_seq = seq;
242 	}
243 
244 	inet->inet_id = tp->write_seq ^ jiffies;
245 
246 	if (tcp_fastopen_defer_connect(sk, &err))
247 		return err;
248 	if (err)
249 		goto failure;
250 
251 	err = tcp_connect(sk);
252 
253 	if (err)
254 		goto failure;
255 
256 	return 0;
257 
258 failure:
259 	/*
260 	 * This unhashes the socket and releases the local port,
261 	 * if necessary.
262 	 */
263 	tcp_set_state(sk, TCP_CLOSE);
264 	ip_rt_put(rt);
265 	sk->sk_route_caps = 0;
266 	inet->inet_dport = 0;
267 	return err;
268 }
269 EXPORT_SYMBOL(tcp_v4_connect);
270 
271 /*
272  * This routine reacts to ICMP_FRAG_NEEDED MTU indications as defined in RFC 1191.
273  * It can be called through tcp_release_cb() if the socket was owned by the user
274  * at the time tcp_v4_err() was called to handle the ICMP message.
275  */
276 void tcp_v4_mtu_reduced(struct sock *sk)
277 {
278 	struct inet_sock *inet = inet_sk(sk);
279 	struct dst_entry *dst;
280 	u32 mtu;
281 
282 	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
283 		return;
284 	mtu = tcp_sk(sk)->mtu_info;
285 	dst = inet_csk_update_pmtu(sk, mtu);
286 	if (!dst)
287 		return;
288 
289 	/* Something is about to go wrong... Remember the soft error
290 	 * in case this connection is not able to recover.
291 	 */
292 	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
293 		sk->sk_err_soft = EMSGSIZE;
294 
295 	mtu = dst_mtu(dst);
296 
297 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
298 	    ip_sk_accept_pmtu(sk) &&
299 	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
300 		tcp_sync_mss(sk, mtu);
301 
302 		/* Resend the TCP packet because it's
303 		 * clear that the old packet has been
304 		 * dropped. This is the new "fast" path mtu
305 		 * discovery.
306 		 */
307 		tcp_simple_retransmit(sk);
308 	} /* else let the usual retransmit timer handle it */
309 }
310 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
311 
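/* Hand an ICMP redirect to the dst attached to the socket, if that dst is
 * still valid.
 */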
312 static void do_redirect(struct sk_buff *skb, struct sock *sk)
313 {
314 	struct dst_entry *dst = __sk_dst_check(sk, 0);
315 
316 	if (dst)
317 		dst->ops->redirect(dst, sk, skb);
318 }
319 
320 
321 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
322 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
323 {
324 	struct request_sock *req = inet_reqsk(sk);
325 	struct net *net = sock_net(sk);
326 
327 	/* ICMPs are not backlogged, hence we cannot get
328 	 * an established socket here.
329 	 */
330 	if (seq != tcp_rsk(req)->snt_isn) {
331 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
332 	} else if (abort) {
333 		/*
334 		 * Still in SYN_RECV, just remove it silently.
335 		 * There is no good way to pass the error to the newly
336 		 * created socket, and POSIX does not want network
337 		 * errors returned from accept().
338 		 */
339 		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
340 		tcp_listendrop(req->rsk_listener);
341 	}
342 	reqsk_put(req);
343 }
344 EXPORT_SYMBOL(tcp_req_err);
345 
346 /*
347  * This routine is called by the ICMP module when it gets some
348  * sort of error condition.  If err < 0 then the socket should
349  * be closed and the error returned to the user.  If err > 0
350  * it's just the icmp type << 8 | icmp code.  After adjustment,
351  * the header points to the first 8 bytes of the TCP header.  We need
352  * to find the appropriate port.
353  *
354  * The locking strategy used here is very "optimistic". When
355  * someone else accesses the socket the ICMP is just dropped
356  * and for some paths there is no check at all.
357  * A more general error queue to queue errors for later handling
358  * is probably better.
359  *
360  */
361 
362 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
363 {
364 	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
365 	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
366 	struct inet_connection_sock *icsk;
367 	struct tcp_sock *tp;
368 	struct inet_sock *inet;
369 	const int type = icmp_hdr(icmp_skb)->type;
370 	const int code = icmp_hdr(icmp_skb)->code;
371 	struct sock *sk;
372 	struct sk_buff *skb;
373 	struct request_sock *fastopen;
374 	__u32 seq, snd_una;
375 	__u32 remaining;
376 	int err;
377 	struct net *net = dev_net(icmp_skb->dev);
378 
379 	sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
380 				       th->dest, iph->saddr, ntohs(th->source),
381 				       inet_iif(icmp_skb));
382 	if (!sk) {
383 		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
384 		return;
385 	}
386 	if (sk->sk_state == TCP_TIME_WAIT) {
387 		inet_twsk_put(inet_twsk(sk));
388 		return;
389 	}
390 	seq = ntohl(th->seq);
391 	if (sk->sk_state == TCP_NEW_SYN_RECV)
392 		return tcp_req_err(sk, seq,
393 				  type == ICMP_PARAMETERPROB ||
394 				  type == ICMP_TIME_EXCEEDED ||
395 				  (type == ICMP_DEST_UNREACH &&
396 				   (code == ICMP_NET_UNREACH ||
397 				    code == ICMP_HOST_UNREACH)));
398 
399 	bh_lock_sock(sk);
400 	/* If too many ICMPs get dropped on busy
401 	 * servers this needs to be solved differently.
402 	 * We do take care of the PMTU discovery (RFC 1191) special case:
403 	 * we can receive locally generated ICMP messages while the socket is held.
404 	 */
405 	if (sock_owned_by_user(sk)) {
406 		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
407 			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
408 	}
409 	if (sk->sk_state == TCP_CLOSE)
410 		goto out;
411 
412 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
413 		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
414 		goto out;
415 	}
416 
417 	icsk = inet_csk(sk);
418 	tp = tcp_sk(sk);
419 	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
420 	fastopen = tp->fastopen_rsk;
421 	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
422 	if (sk->sk_state != TCP_LISTEN &&
423 	    !between(seq, snd_una, tp->snd_nxt)) {
424 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
425 		goto out;
426 	}
427 
428 	switch (type) {
429 	case ICMP_REDIRECT:
430 		if (!sock_owned_by_user(sk))
431 			do_redirect(icmp_skb, sk);
432 		goto out;
433 	case ICMP_SOURCE_QUENCH:
434 		/* Just silently ignore these. */
435 		goto out;
436 	case ICMP_PARAMETERPROB:
437 		err = EPROTO;
438 		break;
439 	case ICMP_DEST_UNREACH:
440 		if (code > NR_ICMP_UNREACH)
441 			goto out;
442 
443 		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
444 			/* We are not interested in TCP_LISTEN and open_requests
445 			 * (SYN-ACKs sent out by Linux are always < 576 bytes, so
446 			 * they should go through unfragmented).
447 			 */
448 			if (sk->sk_state == TCP_LISTEN)
449 				goto out;
450 
451 			tp->mtu_info = info;
452 			if (!sock_owned_by_user(sk)) {
453 				tcp_v4_mtu_reduced(sk);
454 			} else {
455 				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
456 					sock_hold(sk);
457 			}
458 			goto out;
459 		}
460 
461 		err = icmp_err_convert[code].errno;
462 		/* check if icmp_skb allows revert of backoff
463 		 * (see draft-zimmermann-tcp-lcd) */
464 		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
465 			break;
466 		if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
467 		    !icsk->icsk_backoff || fastopen)
468 			break;
469 
470 		if (sock_owned_by_user(sk))
471 			break;
472 
473 		icsk->icsk_backoff--;
474 		icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
475 					       TCP_TIMEOUT_INIT;
476 		icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
477 
478 		skb = tcp_write_queue_head(sk);
479 		BUG_ON(!skb);
480 
481 		remaining = icsk->icsk_rto -
482 			    min(icsk->icsk_rto,
483 				tcp_time_stamp - tcp_skb_timestamp(skb));
484 
485 		if (remaining) {
486 			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
487 						  remaining, TCP_RTO_MAX);
488 		} else {
489 			/* RTO revert clocked out retransmission.
490 			 * Will retransmit now */
491 			tcp_retransmit_timer(sk);
492 		}
493 
494 		break;
495 	case ICMP_TIME_EXCEEDED:
496 		err = EHOSTUNREACH;
497 		break;
498 	default:
499 		goto out;
500 	}
501 
502 	switch (sk->sk_state) {
503 	case TCP_SYN_SENT:
504 	case TCP_SYN_RECV:
505 		/* Only in fast or simultaneous open. If a fast open socket
506 		 * is already accepted it is treated as a connected one below.
507 		 */
508 		if (fastopen && !fastopen->sk)
509 			break;
510 
511 		if (!sock_owned_by_user(sk)) {
512 			sk->sk_err = err;
513 
514 			sk->sk_error_report(sk);
515 
516 			tcp_done(sk);
517 		} else {
518 			sk->sk_err_soft = err;
519 		}
520 		goto out;
521 	}
522 
523 	/* If we've already connected we will keep trying
524 	 * until we time out, or the user gives up.
525 	 *
526 	 * RFC 1122 4.2.3.9 allows us to consider as hard errors
527 	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
528 	 * but it is obsoleted by PMTU discovery).
529 	 *
530 	 * Note that in the modern internet, where routing is unreliable
531 	 * and broken firewalls sit in every dark corner sending random
532 	 * errors ordered by their masters, even these two messages have
533 	 * finally lost their original sense (even Linux sends invalid PORT_UNREACHs).
534 	 *
535 	 * Now we are in compliance with RFCs.
536 	 *							--ANK (980905)
537 	 */
538 
539 	inet = inet_sk(sk);
540 	if (!sock_owned_by_user(sk) && inet->recverr) {
541 		sk->sk_err = err;
542 		sk->sk_error_report(sk);
543 	} else	{ /* Only an error on timeout */
544 		sk->sk_err_soft = err;
545 	}
546 
547 out:
548 	bh_unlock_sock(sk);
549 	sock_put(sk);
550 }
551 
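/* Fill in the TCP checksum for an outgoing skb: for CHECKSUM_PARTIAL only
 * the pseudo-header sum is stored and the rest is left for hardware/GSO to
 * complete; otherwise the full checksum is computed in software.
 */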
552 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
553 {
554 	struct tcphdr *th = tcp_hdr(skb);
555 
556 	if (skb->ip_summed == CHECKSUM_PARTIAL) {
557 		th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
558 		skb->csum_start = skb_transport_header(skb) - skb->head;
559 		skb->csum_offset = offsetof(struct tcphdr, check);
560 	} else {
561 		th->check = tcp_v4_check(skb->len, saddr, daddr,
562 					 csum_partial(th,
563 						      th->doff << 2,
564 						      skb->csum));
565 	}
566 }
567 
568 /* This routine computes an IPv4 TCP checksum. */
569 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
570 {
571 	const struct inet_sock *inet = inet_sk(sk);
572 
573 	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
574 }
575 EXPORT_SYMBOL(tcp_v4_send_check);
576 
577 /*
578  *	This routine will send an RST to the other TCP.
579  *
580  *	Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
581  *		      for the reset?
582  *	Answer: if a packet caused the RST, it is not for a socket
583  *		existing in our system; if it is matched to a socket,
584  *		it is just a duplicate segment or a bug in the other side's TCP.
585  *		So we build the reply based only on the parameters that
586  *		arrived with the segment.
587  *	Exception: precedence violation. We do not implement it in any case.
588  */
589 
590 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
591 {
592 	const struct tcphdr *th = tcp_hdr(skb);
593 	struct {
594 		struct tcphdr th;
595 #ifdef CONFIG_TCP_MD5SIG
596 		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
597 #endif
598 	} rep;
599 	struct ip_reply_arg arg;
600 #ifdef CONFIG_TCP_MD5SIG
601 	struct tcp_md5sig_key *key = NULL;
602 	const __u8 *hash_location = NULL;
603 	unsigned char newhash[16];
604 	int genhash;
605 	struct sock *sk1 = NULL;
606 #endif
607 	struct net *net;
608 
609 	/* Never send a reset in response to a reset. */
610 	if (th->rst)
611 		return;
612 
613 	/* If sk is not NULL, it means we did a successful lookup and the
614 	 * incoming route had to be correct. prequeue might have dropped our dst.
615 	 */
616 	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
617 		return;
618 
619 	/* Swap the send and the receive. */
620 	memset(&rep, 0, sizeof(rep));
621 	rep.th.dest   = th->source;
622 	rep.th.source = th->dest;
623 	rep.th.doff   = sizeof(struct tcphdr) / 4;
624 	rep.th.rst    = 1;
625 
626 	if (th->ack) {
627 		rep.th.seq = th->ack_seq;
628 	} else {
629 		rep.th.ack = 1;
630 		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
631 				       skb->len - (th->doff << 2));
632 	}
633 
634 	memset(&arg, 0, sizeof(arg));
635 	arg.iov[0].iov_base = (unsigned char *)&rep;
636 	arg.iov[0].iov_len  = sizeof(rep.th);
637 
638 	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
639 #ifdef CONFIG_TCP_MD5SIG
640 	rcu_read_lock();
641 	hash_location = tcp_parse_md5sig_option(th);
642 	if (sk && sk_fullsock(sk)) {
643 		key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
644 					&ip_hdr(skb)->saddr, AF_INET);
645 	} else if (hash_location) {
646 		/*
647 		 * The active side is lost. Try to find the listening socket through
648 		 * the source port, and then find the md5 key through the listening
649 		 * socket. We do not lose security here:
650 		 * the incoming packet is checked against the md5 hash of the found
651 		 * key, and no RST is generated if the md5 hash doesn't match.
652 		 */
653 		sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
654 					     ip_hdr(skb)->saddr,
655 					     th->source, ip_hdr(skb)->daddr,
656 					     ntohs(th->source), inet_iif(skb));
657 		/* don't send an rst if we can't find a key */
658 		if (!sk1)
659 			goto out;
660 
661 		key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
662 					&ip_hdr(skb)->saddr, AF_INET);
663 		if (!key)
664 			goto out;
665 
666 
667 		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
668 		if (genhash || memcmp(hash_location, newhash, 16) != 0)
669 			goto out;
670 
671 	}
672 
673 	if (key) {
674 		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
675 				   (TCPOPT_NOP << 16) |
676 				   (TCPOPT_MD5SIG << 8) |
677 				   TCPOLEN_MD5SIG);
678 		/* Update length and the length the header thinks exists */
679 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
680 		rep.th.doff = arg.iov[0].iov_len / 4;
681 
682 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
683 				     key, ip_hdr(skb)->saddr,
684 				     ip_hdr(skb)->daddr, &rep.th);
685 	}
686 #endif
687 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
688 				      ip_hdr(skb)->saddr, /* XXX */
689 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
690 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
691 	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
692 
693 	/* When the socket is gone, all binding information is lost and
694 	 * routing might fail in this case. No choice here: if we choose to force
695 	 * the input interface, we will misroute in the case of an asymmetric route.
696 	 */
697 	if (sk)
698 		arg.bound_dev_if = sk->sk_bound_dev_if;
699 
700 	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
701 		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));
702 
703 	arg.tos = ip_hdr(skb)->tos;
704 	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
705 	local_bh_disable();
706 	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
707 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
708 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
709 			      &arg, arg.iov[0].iov_len);
710 
711 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
712 	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
713 	local_bh_enable();
714 
715 #ifdef CONFIG_TCP_MD5SIG
716 out:
717 	rcu_read_unlock();
718 #endif
719 }
720 
721 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
722    outside socket context, is certainly ugly. What can I do?
723  */
724 
725 static void tcp_v4_send_ack(const struct sock *sk,
726 			    struct sk_buff *skb, u32 seq, u32 ack,
727 			    u32 win, u32 tsval, u32 tsecr, int oif,
728 			    struct tcp_md5sig_key *key,
729 			    int reply_flags, u8 tos)
730 {
731 	const struct tcphdr *th = tcp_hdr(skb);
732 	struct {
733 		struct tcphdr th;
734 		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
735 #ifdef CONFIG_TCP_MD5SIG
736 			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
737 #endif
738 			];
739 	} rep;
740 	struct net *net = sock_net(sk);
741 	struct ip_reply_arg arg;
742 
743 	memset(&rep.th, 0, sizeof(struct tcphdr));
744 	memset(&arg, 0, sizeof(arg));
745 
746 	arg.iov[0].iov_base = (unsigned char *)&rep;
747 	arg.iov[0].iov_len  = sizeof(rep.th);
748 	if (tsecr) {
749 		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
750 				   (TCPOPT_TIMESTAMP << 8) |
751 				   TCPOLEN_TIMESTAMP);
752 		rep.opt[1] = htonl(tsval);
753 		rep.opt[2] = htonl(tsecr);
754 		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
755 	}
756 
757 	/* Swap the send and the receive. */
758 	rep.th.dest    = th->source;
759 	rep.th.source  = th->dest;
760 	rep.th.doff    = arg.iov[0].iov_len / 4;
761 	rep.th.seq     = htonl(seq);
762 	rep.th.ack_seq = htonl(ack);
763 	rep.th.ack     = 1;
764 	rep.th.window  = htons(win);
765 
766 #ifdef CONFIG_TCP_MD5SIG
767 	if (key) {
768 		int offset = (tsecr) ? 3 : 0;
769 
770 		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
771 					  (TCPOPT_NOP << 16) |
772 					  (TCPOPT_MD5SIG << 8) |
773 					  TCPOLEN_MD5SIG);
774 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
775 		rep.th.doff = arg.iov[0].iov_len/4;
776 
777 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
778 				    key, ip_hdr(skb)->saddr,
779 				    ip_hdr(skb)->daddr, &rep.th);
780 	}
781 #endif
782 	arg.flags = reply_flags;
783 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
784 				      ip_hdr(skb)->saddr, /* XXX */
785 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
786 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
787 	if (oif)
788 		arg.bound_dev_if = oif;
789 	arg.tos = tos;
790 	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
791 	local_bh_disable();
792 	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
793 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
794 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
795 			      &arg, arg.iov[0].iov_len);
796 
797 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
798 	local_bh_enable();
799 }
800 
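/* Answer a segment that hit a TIME-WAIT socket with an ACK built from the
 * state saved in the timewait bucket (send/receive sequence numbers, scaled
 * window and timestamps), then release the bucket reference.
 */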
801 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
802 {
803 	struct inet_timewait_sock *tw = inet_twsk(sk);
804 	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
805 
806 	tcp_v4_send_ack(sk, skb,
807 			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
808 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
809 			tcp_time_stamp + tcptw->tw_ts_offset,
810 			tcptw->tw_ts_recent,
811 			tw->tw_bound_dev_if,
812 			tcp_twsk_md5_key(tcptw),
813 			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
814 			tw->tw_tos
815 			);
816 
817 	inet_twsk_put(tw);
818 }
819 
820 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
821 				  struct request_sock *req)
822 {
823 	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
824 	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
825 	 */
826 	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
827 					     tcp_sk(sk)->snd_nxt;
828 
829 	/* RFC 7323 2.3
830 	 * The window field (SEG.WND) of every outgoing segment, with the
831 	 * exception of <SYN> segments, MUST be right-shifted by
832 	 * Rcv.Wind.Shift bits:
833 	 */
834 	tcp_v4_send_ack(sk, skb, seq,
835 			tcp_rsk(req)->rcv_nxt,
836 			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
837 			tcp_time_stamp + tcp_rsk(req)->ts_off,
838 			req->ts_recent,
839 			0,
840 			tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
841 					  AF_INET),
842 			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
843 			ip_hdr(skb)->tos);
844 }
845 
846 /*
847  *	Send a SYN-ACK after having received a SYN.
848  *	This still operates on a request_sock only, not on a big
849  *	socket.
850  */
851 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
852 			      struct flowi *fl,
853 			      struct request_sock *req,
854 			      struct tcp_fastopen_cookie *foc,
855 			      enum tcp_synack_type synack_type)
856 {
857 	const struct inet_request_sock *ireq = inet_rsk(req);
858 	struct flowi4 fl4;
859 	int err = -1;
860 	struct sk_buff *skb;
861 
862 	/* First, grab a route. */
863 	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
864 		return -1;
865 
866 	skb = tcp_make_synack(sk, dst, req, foc, synack_type);
867 
868 	if (skb) {
869 		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
870 
871 		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
872 					    ireq->ir_rmt_addr,
873 					    ireq->opt);
874 		err = net_xmit_eval(err);
875 	}
876 
877 	return err;
878 }
879 
880 /*
881  *	IPv4 request_sock destructor.
882  */
883 static void tcp_v4_reqsk_destructor(struct request_sock *req)
884 {
885 	kfree(inet_rsk(req)->opt);
886 }
887 
888 #ifdef CONFIG_TCP_MD5SIG
889 /*
890  * RFC2385 MD5 checksumming requires a mapping of
891  * IP address->MD5 Key.
892  * We need to maintain these in the sk structure.
893  */
894 
895 /* Find the Key structure for an address.  */
896 struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
897 					 const union tcp_md5_addr *addr,
898 					 int family)
899 {
900 	const struct tcp_sock *tp = tcp_sk(sk);
901 	struct tcp_md5sig_key *key;
902 	unsigned int size = sizeof(struct in_addr);
903 	const struct tcp_md5sig_info *md5sig;
904 
905 	/* caller either holds rcu_read_lock() or socket lock */
906 	md5sig = rcu_dereference_check(tp->md5sig_info,
907 				       lockdep_sock_is_held(sk));
908 	if (!md5sig)
909 		return NULL;
910 #if IS_ENABLED(CONFIG_IPV6)
911 	if (family == AF_INET6)
912 		size = sizeof(struct in6_addr);
913 #endif
914 	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
915 		if (key->family != family)
916 			continue;
917 		if (!memcmp(&key->addr, addr, size))
918 			return key;
919 	}
920 	return NULL;
921 }
922 EXPORT_SYMBOL(tcp_md5_do_lookup);
923 
924 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
925 					 const struct sock *addr_sk)
926 {
927 	const union tcp_md5_addr *addr;
928 
929 	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
930 	return tcp_md5_do_lookup(sk, addr, AF_INET);
931 }
932 EXPORT_SYMBOL(tcp_v4_md5_lookup);
933 
934 /* This can be called on a newly created socket, from other files */
935 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
936 		   int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
937 {
938 	/* Add Key to the list */
939 	struct tcp_md5sig_key *key;
940 	struct tcp_sock *tp = tcp_sk(sk);
941 	struct tcp_md5sig_info *md5sig;
942 
943 	key = tcp_md5_do_lookup(sk, addr, family);
944 	if (key) {
945 		/* Pre-existing entry - just update that one. */
946 		memcpy(key->key, newkey, newkeylen);
947 		key->keylen = newkeylen;
948 		return 0;
949 	}
950 
951 	md5sig = rcu_dereference_protected(tp->md5sig_info,
952 					   lockdep_sock_is_held(sk));
953 	if (!md5sig) {
954 		md5sig = kmalloc(sizeof(*md5sig), gfp);
955 		if (!md5sig)
956 			return -ENOMEM;
957 
958 		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
959 		INIT_HLIST_HEAD(&md5sig->head);
960 		rcu_assign_pointer(tp->md5sig_info, md5sig);
961 	}
962 
963 	key = sock_kmalloc(sk, sizeof(*key), gfp);
964 	if (!key)
965 		return -ENOMEM;
966 	if (!tcp_alloc_md5sig_pool()) {
967 		sock_kfree_s(sk, key, sizeof(*key));
968 		return -ENOMEM;
969 	}
970 
971 	memcpy(key->key, newkey, newkeylen);
972 	key->keylen = newkeylen;
973 	key->family = family;
974 	memcpy(&key->addr, addr,
975 	       (family == AF_INET6) ? sizeof(struct in6_addr) :
976 				      sizeof(struct in_addr));
977 	hlist_add_head_rcu(&key->node, &md5sig->head);
978 	return 0;
979 }
980 EXPORT_SYMBOL(tcp_md5_do_add);
981 
982 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
983 {
984 	struct tcp_md5sig_key *key;
985 
986 	key = tcp_md5_do_lookup(sk, addr, family);
987 	if (!key)
988 		return -ENOENT;
989 	hlist_del_rcu(&key->node);
990 	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
991 	kfree_rcu(key, rcu);
992 	return 0;
993 }
994 EXPORT_SYMBOL(tcp_md5_do_del);
995 
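/* Remove every MD5 key configured on the socket, returning the memory to
 * the socket's option-memory accounting and freeing the entries after an
 * RCU grace period.
 */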
996 static void tcp_clear_md5_list(struct sock *sk)
997 {
998 	struct tcp_sock *tp = tcp_sk(sk);
999 	struct tcp_md5sig_key *key;
1000 	struct hlist_node *n;
1001 	struct tcp_md5sig_info *md5sig;
1002 
1003 	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1004 
1005 	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1006 		hlist_del_rcu(&key->node);
1007 		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1008 		kfree_rcu(key, rcu);
1009 	}
1010 }
1011 
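/* Handle the TCP_MD5SIG setsockopt: copy a struct tcp_md5sig from user
 * space and, depending on tcpm_keylen, add a key for the given IPv4 peer
 * or delete the existing one (a zero key length means delete).
 *
 * Illustrative userspace sketch (field usage assumed to mirror the cmd
 * handling below, not taken from this file):
 *
 *	struct tcp_md5sig cmd = { .tcpm_keylen = strlen(secret) };
 *	memcpy(&cmd.tcpm_addr, &peer_sin, sizeof(peer_sin));
 *	memcpy(cmd.tcpm_key, secret, cmd.tcpm_keylen);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &cmd, sizeof(cmd));
 */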
1012 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
1013 				 int optlen)
1014 {
1015 	struct tcp_md5sig cmd;
1016 	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1017 
1018 	if (optlen < sizeof(cmd))
1019 		return -EINVAL;
1020 
1021 	if (copy_from_user(&cmd, optval, sizeof(cmd)))
1022 		return -EFAULT;
1023 
1024 	if (sin->sin_family != AF_INET)
1025 		return -EINVAL;
1026 
1027 	if (!cmd.tcpm_keylen)
1028 		return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1029 				      AF_INET);
1030 
1031 	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1032 		return -EINVAL;
1033 
1034 	return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1035 			      AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
1036 			      GFP_KERNEL);
1037 }
1038 
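/* Feed the IPv4 pseudo-header plus a copy of the TCP header (with the
 * checksum field zeroed) into the MD5 hash request.
 */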
1039 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1040 				   __be32 daddr, __be32 saddr,
1041 				   const struct tcphdr *th, int nbytes)
1042 {
1043 	struct tcp4_pseudohdr *bp;
1044 	struct scatterlist sg;
1045 	struct tcphdr *_th;
1046 
1047 	bp = hp->scratch;
1048 	bp->saddr = saddr;
1049 	bp->daddr = daddr;
1050 	bp->pad = 0;
1051 	bp->protocol = IPPROTO_TCP;
1052 	bp->len = cpu_to_be16(nbytes);
1053 
1054 	_th = (struct tcphdr *)(bp + 1);
1055 	memcpy(_th, th, sizeof(*th));
1056 	_th->check = 0;
1057 
1058 	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1059 	ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1060 				sizeof(*bp) + sizeof(*th));
1061 	return crypto_ahash_update(hp->md5_req);
1062 }
1063 
1064 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1065 			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
1066 {
1067 	struct tcp_md5sig_pool *hp;
1068 	struct ahash_request *req;
1069 
1070 	hp = tcp_get_md5sig_pool();
1071 	if (!hp)
1072 		goto clear_hash_noput;
1073 	req = hp->md5_req;
1074 
1075 	if (crypto_ahash_init(req))
1076 		goto clear_hash;
1077 	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1078 		goto clear_hash;
1079 	if (tcp_md5_hash_key(hp, key))
1080 		goto clear_hash;
1081 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1082 	if (crypto_ahash_final(req))
1083 		goto clear_hash;
1084 
1085 	tcp_put_md5sig_pool();
1086 	return 0;
1087 
1088 clear_hash:
1089 	tcp_put_md5sig_pool();
1090 clear_hash_noput:
1091 	memset(md5_hash, 0, 16);
1092 	return 1;
1093 }
1094 
1095 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1096 			const struct sock *sk,
1097 			const struct sk_buff *skb)
1098 {
1099 	struct tcp_md5sig_pool *hp;
1100 	struct ahash_request *req;
1101 	const struct tcphdr *th = tcp_hdr(skb);
1102 	__be32 saddr, daddr;
1103 
1104 	if (sk) { /* valid for establish/request sockets */
1105 		saddr = sk->sk_rcv_saddr;
1106 		daddr = sk->sk_daddr;
1107 	} else {
1108 		const struct iphdr *iph = ip_hdr(skb);
1109 		saddr = iph->saddr;
1110 		daddr = iph->daddr;
1111 	}
1112 
1113 	hp = tcp_get_md5sig_pool();
1114 	if (!hp)
1115 		goto clear_hash_noput;
1116 	req = hp->md5_req;
1117 
1118 	if (crypto_ahash_init(req))
1119 		goto clear_hash;
1120 
1121 	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1122 		goto clear_hash;
1123 	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1124 		goto clear_hash;
1125 	if (tcp_md5_hash_key(hp, key))
1126 		goto clear_hash;
1127 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1128 	if (crypto_ahash_final(req))
1129 		goto clear_hash;
1130 
1131 	tcp_put_md5sig_pool();
1132 	return 0;
1133 
1134 clear_hash:
1135 	tcp_put_md5sig_pool();
1136 clear_hash_noput:
1137 	memset(md5_hash, 0, 16);
1138 	return 1;
1139 }
1140 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1141 
1142 #endif
1143 
1144 /* Called with rcu_read_lock() */
1145 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1146 				    const struct sk_buff *skb)
1147 {
1148 #ifdef CONFIG_TCP_MD5SIG
1149 	/*
1150 	 * This gets called for each TCP segment that arrives
1151 	 * so we want to be efficient.
1152 	 * We have 3 drop cases:
1153 	 * o No MD5 hash and one expected.
1154 	 * o MD5 hash and we're not expecting one.
1155 	 * o MD5 hash and it's wrong.
1156 	 */
1157 	const __u8 *hash_location = NULL;
1158 	struct tcp_md5sig_key *hash_expected;
1159 	const struct iphdr *iph = ip_hdr(skb);
1160 	const struct tcphdr *th = tcp_hdr(skb);
1161 	int genhash;
1162 	unsigned char newhash[16];
1163 
1164 	hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1165 					  AF_INET);
1166 	hash_location = tcp_parse_md5sig_option(th);
1167 
1168 	/* We've parsed the options - do we have a hash? */
1169 	if (!hash_expected && !hash_location)
1170 		return false;
1171 
1172 	if (hash_expected && !hash_location) {
1173 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1174 		return true;
1175 	}
1176 
1177 	if (!hash_expected && hash_location) {
1178 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1179 		return true;
1180 	}
1181 
1182 	/* Okay, so this is hash_expected and hash_location -
1183 	 * so we need to calculate the checksum.
1184 	 */
1185 	genhash = tcp_v4_md5_hash_skb(newhash,
1186 				      hash_expected,
1187 				      NULL, skb);
1188 
1189 	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1190 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1191 		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1192 				     &iph->saddr, ntohs(th->source),
1193 				     &iph->daddr, ntohs(th->dest),
1194 				     genhash ? " tcp_v4_calc_md5_hash failed"
1195 				     : "");
1196 		return true;
1197 	}
1198 	return false;
1199 #endif
1200 	return false;
1201 }
1202 
1203 static void tcp_v4_init_req(struct request_sock *req,
1204 			    const struct sock *sk_listener,
1205 			    struct sk_buff *skb)
1206 {
1207 	struct inet_request_sock *ireq = inet_rsk(req);
1208 
1209 	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1210 	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1211 	ireq->opt = tcp_v4_save_options(skb);
1212 }
1213 
1214 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1215 					  struct flowi *fl,
1216 					  const struct request_sock *req)
1217 {
1218 	return inet_csk_route_req(sk, &fl->u.ip4, req);
1219 }
1220 
1221 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1222 	.family		=	PF_INET,
1223 	.obj_size	=	sizeof(struct tcp_request_sock),
1224 	.rtx_syn_ack	=	tcp_rtx_synack,
1225 	.send_ack	=	tcp_v4_reqsk_send_ack,
1226 	.destructor	=	tcp_v4_reqsk_destructor,
1227 	.send_reset	=	tcp_v4_send_reset,
1228 	.syn_ack_timeout =	tcp_syn_ack_timeout,
1229 };
1230 
1231 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1232 	.mss_clamp	=	TCP_MSS_DEFAULT,
1233 #ifdef CONFIG_TCP_MD5SIG
1234 	.req_md5_lookup	=	tcp_v4_md5_lookup,
1235 	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1236 #endif
1237 	.init_req	=	tcp_v4_init_req,
1238 #ifdef CONFIG_SYN_COOKIES
1239 	.cookie_init_seq =	cookie_v4_init_sequence,
1240 #endif
1241 	.route_req	=	tcp_v4_route_req,
1242 	.init_seq_tsoff	=	tcp_v4_init_seq_and_tsoff,
1243 	.send_synack	=	tcp_v4_send_synack,
1244 };
1245 
1246 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1247 {
1248 	/* Never answer SYNs sent to broadcast or multicast */
1249 	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1250 		goto drop;
1251 
1252 	return tcp_conn_request(&tcp_request_sock_ops,
1253 				&tcp_request_sock_ipv4_ops, sk, skb);
1254 
1255 drop:
1256 	tcp_listendrop(sk);
1257 	return 0;
1258 }
1259 EXPORT_SYMBOL(tcp_v4_conn_request);
1260 
1261 
1262 /*
1263  * The three way handshake has completed - we got a valid synack -
1264  * now create the new socket.
1265  */
1266 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1267 				  struct request_sock *req,
1268 				  struct dst_entry *dst,
1269 				  struct request_sock *req_unhash,
1270 				  bool *own_req)
1271 {
1272 	struct inet_request_sock *ireq;
1273 	struct inet_sock *newinet;
1274 	struct tcp_sock *newtp;
1275 	struct sock *newsk;
1276 #ifdef CONFIG_TCP_MD5SIG
1277 	struct tcp_md5sig_key *key;
1278 #endif
1279 	struct ip_options_rcu *inet_opt;
1280 
1281 	if (sk_acceptq_is_full(sk))
1282 		goto exit_overflow;
1283 
1284 	newsk = tcp_create_openreq_child(sk, req, skb);
1285 	if (!newsk)
1286 		goto exit_nonewsk;
1287 
1288 	newsk->sk_gso_type = SKB_GSO_TCPV4;
1289 	inet_sk_rx_dst_set(newsk, skb);
1290 
1291 	newtp		      = tcp_sk(newsk);
1292 	newinet		      = inet_sk(newsk);
1293 	ireq		      = inet_rsk(req);
1294 	sk_daddr_set(newsk, ireq->ir_rmt_addr);
1295 	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1296 	newsk->sk_bound_dev_if = ireq->ir_iif;
1297 	newinet->inet_saddr	      = ireq->ir_loc_addr;
1298 	inet_opt	      = ireq->opt;
1299 	rcu_assign_pointer(newinet->inet_opt, inet_opt);
1300 	ireq->opt	      = NULL;
1301 	newinet->mc_index     = inet_iif(skb);
1302 	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1303 	newinet->rcv_tos      = ip_hdr(skb)->tos;
1304 	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1305 	if (inet_opt)
1306 		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1307 	newinet->inet_id = newtp->write_seq ^ jiffies;
1308 
1309 	if (!dst) {
1310 		dst = inet_csk_route_child_sock(sk, newsk, req);
1311 		if (!dst)
1312 			goto put_and_exit;
1313 	} else {
1314 		/* syncookie case : see end of cookie_v4_check() */
1315 	}
1316 	sk_setup_caps(newsk, dst);
1317 
1318 	tcp_ca_openreq_child(newsk, dst);
1319 
1320 	tcp_sync_mss(newsk, dst_mtu(dst));
1321 	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1322 
1323 	tcp_initialize_rcv_mss(newsk);
1324 
1325 #ifdef CONFIG_TCP_MD5SIG
1326 	/* Copy over the MD5 key from the original socket */
1327 	key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1328 				AF_INET);
1329 	if (key) {
1330 		/*
1331 		 * We're using one, so create a matching key
1332 		 * on the newsk structure. If we fail to get
1333 		 * memory, then we end up not copying the key
1334 		 * across. Shucks.
1335 		 */
1336 		tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1337 			       AF_INET, key->key, key->keylen, GFP_ATOMIC);
1338 		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1339 	}
1340 #endif
1341 
1342 	if (__inet_inherit_port(sk, newsk) < 0)
1343 		goto put_and_exit;
1344 	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
1345 	if (*own_req)
1346 		tcp_move_syn(newtp, req);
1347 
1348 	return newsk;
1349 
1350 exit_overflow:
1351 	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1352 exit_nonewsk:
1353 	dst_release(dst);
1354 exit:
1355 	tcp_listendrop(sk);
1356 	return NULL;
1357 put_and_exit:
1358 	inet_csk_prepare_forced_close(newsk);
1359 	tcp_done(newsk);
1360 	goto exit;
1361 }
1362 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1363 
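/* When SYN cookies are compiled in, a non-SYN segment arriving on a
 * listener is handed to cookie_v4_check(), which may return a newly
 * created child socket; tcp_v4_do_rcv() drops the segment on a NULL
 * return and processes the child when one is returned.
 */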
1364 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1365 {
1366 #ifdef CONFIG_SYN_COOKIES
1367 	const struct tcphdr *th = tcp_hdr(skb);
1368 
1369 	if (!th->syn)
1370 		sk = cookie_v4_check(sk, skb);
1371 #endif
1372 	return sk;
1373 }
1374 
1375 /* The socket must have its spinlock held when we get
1376  * here, unless it is a TCP_LISTEN socket.
1377  *
1378  * We have a potential double-lock case here, so even when
1379  * doing backlog processing we use the BH locking scheme.
1380  * This is because we cannot sleep with the original spinlock
1381  * held.
1382  */
1383 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1384 {
1385 	struct sock *rsk;
1386 
1387 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1388 		struct dst_entry *dst = sk->sk_rx_dst;
1389 
1390 		sock_rps_save_rxhash(sk, skb);
1391 		sk_mark_napi_id(sk, skb);
1392 		if (dst) {
1393 			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1394 			    !dst->ops->check(dst, 0)) {
1395 				dst_release(dst);
1396 				sk->sk_rx_dst = NULL;
1397 			}
1398 		}
1399 		tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len);
1400 		return 0;
1401 	}
1402 
1403 	if (tcp_checksum_complete(skb))
1404 		goto csum_err;
1405 
1406 	if (sk->sk_state == TCP_LISTEN) {
1407 		struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1408 
1409 		if (!nsk)
1410 			goto discard;
1411 		if (nsk != sk) {
1412 			if (tcp_child_process(sk, nsk, skb)) {
1413 				rsk = nsk;
1414 				goto reset;
1415 			}
1416 			return 0;
1417 		}
1418 	} else
1419 		sock_rps_save_rxhash(sk, skb);
1420 
1421 	if (tcp_rcv_state_process(sk, skb)) {
1422 		rsk = sk;
1423 		goto reset;
1424 	}
1425 	return 0;
1426 
1427 reset:
1428 	tcp_v4_send_reset(rsk, skb);
1429 discard:
1430 	kfree_skb(skb);
1431 	/* Be careful here. If this function gets more complicated and
1432 	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1433 	 * might be destroyed here. This current version compiles correctly,
1434 	 * but you have been warned.
1435 	 */
1436 	return 0;
1437 
1438 csum_err:
1439 	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1440 	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1441 	goto discard;
1442 }
1443 EXPORT_SYMBOL(tcp_v4_do_rcv);
1444 
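/* Early demux: before the full IP input path runs, try to match the segment
 * to an established socket and, when the inbound interface matches the
 * cached one, attach the socket's cached rx dst to the skb so the routing
 * lookup can be skipped.
 */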
1445 void tcp_v4_early_demux(struct sk_buff *skb)
1446 {
1447 	const struct iphdr *iph;
1448 	const struct tcphdr *th;
1449 	struct sock *sk;
1450 
1451 	if (skb->pkt_type != PACKET_HOST)
1452 		return;
1453 
1454 	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1455 		return;
1456 
1457 	iph = ip_hdr(skb);
1458 	th = tcp_hdr(skb);
1459 
1460 	if (th->doff < sizeof(struct tcphdr) / 4)
1461 		return;
1462 
1463 	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1464 				       iph->saddr, th->source,
1465 				       iph->daddr, ntohs(th->dest),
1466 				       skb->skb_iif);
1467 	if (sk) {
1468 		skb->sk = sk;
1469 		skb->destructor = sock_edemux;
1470 		if (sk_fullsock(sk)) {
1471 			struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1472 
1473 			if (dst)
1474 				dst = dst_check(dst, 0);
1475 			if (dst &&
1476 			    inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1477 				skb_dst_set_noref(skb, dst);
1478 		}
1479 	}
1480 }
1481 
1482 /* The packet is added to the VJ-style prequeue for processing in process
1483  * context, if a reader task is waiting. Apparently, this exciting
1484  * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93)
1485  * failed somewhere. Latency? Burstiness? Well, at least now we will
1486  * see why it failed. 8)8)				  --ANK
1487  *
1488  */
1489 bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
1490 {
1491 	struct tcp_sock *tp = tcp_sk(sk);
1492 
1493 	if (sysctl_tcp_low_latency || !tp->ucopy.task)
1494 		return false;
1495 
1496 	if (skb->len <= tcp_hdrlen(skb) &&
1497 	    skb_queue_len(&tp->ucopy.prequeue) == 0)
1498 		return false;
1499 
1500 	/* Before escaping RCU protected region, we need to take care of skb
1501 	 * dst. Prequeue is only enabled for established sockets.
1502 	 * For such sockets, we might need the skb dst only to set sk->sk_rx_dst.
1503 	 * Instead of doing a full sk_rx_dst validity check here, let's perform
1504 	 * an optimistic check.
1505 	 */
1506 	if (likely(sk->sk_rx_dst))
1507 		skb_dst_drop(skb);
1508 	else
1509 		skb_dst_force_safe(skb);
1510 
1511 	__skb_queue_tail(&tp->ucopy.prequeue, skb);
1512 	tp->ucopy.memory += skb->truesize;
1513 	if (skb_queue_len(&tp->ucopy.prequeue) >= 32 ||
1514 	    tp->ucopy.memory + atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) {
1515 		struct sk_buff *skb1;
1516 
1517 		BUG_ON(sock_owned_by_user(sk));
1518 		__NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPPREQUEUEDROPPED,
1519 				skb_queue_len(&tp->ucopy.prequeue));
1520 
1521 		while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
1522 			sk_backlog_rcv(sk, skb1);
1523 
1524 		tp->ucopy.memory = 0;
1525 	} else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
1526 		wake_up_interruptible_sync_poll(sk_sleep(sk),
1527 					   POLLIN | POLLRDNORM | POLLRDBAND);
1528 		if (!inet_csk_ack_scheduled(sk))
1529 			inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
1530 						  (3 * tcp_rto_min(sk)) / 4,
1531 						  TCP_RTO_MAX);
1532 	}
1533 	return true;
1534 }
1535 EXPORT_SYMBOL(tcp_prequeue);
1536 
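/* Try to queue a segment on the backlog of a socket currently owned by the
 * user; returns true (and counts a TCPBacklogDrop) when the backlog limit
 * is exceeded and the caller must drop the segment.
 */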
1537 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1538 {
1539 	u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;
1540 
1541 	/* Only the socket owner can try to collapse/prune rx queues
1542 	 * to reduce memory overhead, so add a little headroom here.
1543 	 * Few socket backlogs are likely to be non-empty concurrently.
1544 	 */
1545 	limit += 64*1024;
1546 
1547 	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1548 	 * we can fix skb->truesize to its real value to avoid future drops.
1549 	 * This is valid because the skb is not yet charged to the socket.
1550 	 * It has been noticed that pure SACK packets were sometimes dropped
1551 	 * (if cooked by drivers without the copybreak feature).
1552 	 */
1553 	skb_condense(skb);
1554 
1555 	if (unlikely(sk_add_backlog(sk, skb, limit))) {
1556 		bh_unlock_sock(sk);
1557 		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1558 		return true;
1559 	}
1560 	return false;
1561 }
1562 EXPORT_SYMBOL(tcp_add_backlog);
1563 
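/* Run the socket filter over the segment while never trimming below the
 * TCP header; if the filter shortens the skb, reduce end_seq by the number
 * of payload bytes removed so the sequence accounting still matches.
 */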
1564 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1565 {
1566 	struct tcphdr *th = (struct tcphdr *)skb->data;
1567 	unsigned int eaten = skb->len;
1568 	int err;
1569 
1570 	err = sk_filter_trim_cap(sk, skb, th->doff * 4);
1571 	if (!err) {
1572 		eaten -= skb->len;
1573 		TCP_SKB_CB(skb)->end_seq -= eaten;
1574 	}
1575 	return err;
1576 }
1577 EXPORT_SYMBOL(tcp_filter);
1578 
1579 /*
1580  *	From tcp_input.c
1581  */
1582 
1583 int tcp_v4_rcv(struct sk_buff *skb)
1584 {
1585 	struct net *net = dev_net(skb->dev);
1586 	const struct iphdr *iph;
1587 	const struct tcphdr *th;
1588 	bool refcounted;
1589 	struct sock *sk;
1590 	int ret;
1591 
1592 	if (skb->pkt_type != PACKET_HOST)
1593 		goto discard_it;
1594 
1595 	/* Count it even if it's bad */
1596 	__TCP_INC_STATS(net, TCP_MIB_INSEGS);
1597 
1598 	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1599 		goto discard_it;
1600 
1601 	th = (const struct tcphdr *)skb->data;
1602 
1603 	if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1604 		goto bad_packet;
1605 	if (!pskb_may_pull(skb, th->doff * 4))
1606 		goto discard_it;
1607 
1608 	/* An explanation is required here, I think.
1609 	 * Packet length and doff are validated by header prediction,
1610 	 * provided the case of th->doff==0 is eliminated.
1611 	 * So, we defer the checks. */
1612 
1613 	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1614 		goto csum_error;
1615 
1616 	th = (const struct tcphdr *)skb->data;
1617 	iph = ip_hdr(skb);
1618 	/* This is tricky: we move the IPCB to its correct location inside TCP_SKB_CB().
1619 	 * barrier() makes sure the compiler won't play fool^Waliasing games.
1620 	 */
1621 	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1622 		sizeof(struct inet_skb_parm));
1623 	barrier();
1624 
1625 	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1626 	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1627 				    skb->len - th->doff * 4);
1628 	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1629 	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1630 	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1631 	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1632 	TCP_SKB_CB(skb)->sacked	 = 0;
1633 
1634 lookup:
1635 	sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1636 			       th->dest, &refcounted);
1637 	if (!sk)
1638 		goto no_tcp_socket;
1639 
1640 process:
1641 	if (sk->sk_state == TCP_TIME_WAIT)
1642 		goto do_time_wait;
1643 
1644 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
1645 		struct request_sock *req = inet_reqsk(sk);
1646 		struct sock *nsk;
1647 
1648 		sk = req->rsk_listener;
1649 		if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
1650 			sk_drops_add(sk, skb);
1651 			reqsk_put(req);
1652 			goto discard_it;
1653 		}
1654 		if (unlikely(sk->sk_state != TCP_LISTEN)) {
1655 			inet_csk_reqsk_queue_drop_and_put(sk, req);
1656 			goto lookup;
1657 		}
1658 		/* We own a reference on the listener, increase it again
1659 		 * as we might lose it too soon.
1660 		 */
1661 		sock_hold(sk);
1662 		refcounted = true;
1663 		nsk = tcp_check_req(sk, skb, req, false);
1664 		if (!nsk) {
1665 			reqsk_put(req);
1666 			goto discard_and_relse;
1667 		}
1668 		if (nsk == sk) {
1669 			reqsk_put(req);
1670 		} else if (tcp_child_process(sk, nsk, skb)) {
1671 			tcp_v4_send_reset(nsk, skb);
1672 			goto discard_and_relse;
1673 		} else {
1674 			sock_put(sk);
1675 			return 0;
1676 		}
1677 	}
1678 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1679 		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
1680 		goto discard_and_relse;
1681 	}
1682 
1683 	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1684 		goto discard_and_relse;
1685 
1686 	if (tcp_v4_inbound_md5_hash(sk, skb))
1687 		goto discard_and_relse;
1688 
1689 	nf_reset(skb);
1690 
1691 	if (tcp_filter(sk, skb))
1692 		goto discard_and_relse;
1693 	th = (const struct tcphdr *)skb->data;
1694 	iph = ip_hdr(skb);
1695 
1696 	skb->dev = NULL;
1697 
1698 	if (sk->sk_state == TCP_LISTEN) {
1699 		ret = tcp_v4_do_rcv(sk, skb);
1700 		goto put_and_return;
1701 	}
1702 
1703 	sk_incoming_cpu_update(sk);
1704 
1705 	bh_lock_sock_nested(sk);
1706 	tcp_segs_in(tcp_sk(sk), skb);
1707 	ret = 0;
1708 	if (!sock_owned_by_user(sk)) {
1709 		if (!tcp_prequeue(sk, skb))
1710 			ret = tcp_v4_do_rcv(sk, skb);
1711 	} else if (tcp_add_backlog(sk, skb)) {
1712 		goto discard_and_relse;
1713 	}
1714 	bh_unlock_sock(sk);
1715 
1716 put_and_return:
1717 	if (refcounted)
1718 		sock_put(sk);
1719 
1720 	return ret;
1721 
1722 no_tcp_socket:
1723 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1724 		goto discard_it;
1725 
1726 	if (tcp_checksum_complete(skb)) {
1727 csum_error:
1728 		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
1729 bad_packet:
1730 		__TCP_INC_STATS(net, TCP_MIB_INERRS);
1731 	} else {
1732 		tcp_v4_send_reset(NULL, skb);
1733 	}
1734 
1735 discard_it:
1736 	/* Discard frame. */
1737 	kfree_skb(skb);
1738 	return 0;
1739 
1740 discard_and_relse:
1741 	sk_drops_add(sk, skb);
1742 	if (refcounted)
1743 		sock_put(sk);
1744 	goto discard_it;
1745 
1746 do_time_wait:
1747 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1748 		inet_twsk_put(inet_twsk(sk));
1749 		goto discard_it;
1750 	}
1751 
1752 	if (tcp_checksum_complete(skb)) {
1753 		inet_twsk_put(inet_twsk(sk));
1754 		goto csum_error;
1755 	}
1756 	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1757 	case TCP_TW_SYN: {
1758 		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1759 							&tcp_hashinfo, skb,
1760 							__tcp_hdrlen(th),
1761 							iph->saddr, th->source,
1762 							iph->daddr, th->dest,
1763 							inet_iif(skb));
1764 		if (sk2) {
1765 			inet_twsk_deschedule_put(inet_twsk(sk));
1766 			sk = sk2;
1767 			refcounted = false;
1768 			goto process;
1769 		}
1770 		/* Fall through to ACK */
1771 	}
1772 	case TCP_TW_ACK:
1773 		tcp_v4_timewait_ack(sk, skb);
1774 		break;
1775 	case TCP_TW_RST:
1776 		tcp_v4_send_reset(sk, skb);
1777 		inet_twsk_deschedule_put(inet_twsk(sk));
1778 		goto discard_it;
1779 	case TCP_TW_SUCCESS:;
1780 	}
1781 	goto discard_it;
1782 }
1783 
1784 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1785 	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
1786 	.twsk_unique	= tcp_twsk_unique,
1787 	.twsk_destructor= tcp_twsk_destructor,
1788 };
1789 
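/* Cache the skb's dst and inbound interface on the socket for the
 * established-state fast path, provided a reference on the dst can still
 * be taken safely.
 */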
1790 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
1791 {
1792 	struct dst_entry *dst = skb_dst(skb);
1793 
1794 	if (dst && dst_hold_safe(dst)) {
1795 		sk->sk_rx_dst = dst;
1796 		inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
1797 	}
1798 }
1799 EXPORT_SYMBOL(inet_sk_rx_dst_set);
1800 
1801 const struct inet_connection_sock_af_ops ipv4_specific = {
1802 	.queue_xmit	   = ip_queue_xmit,
1803 	.send_check	   = tcp_v4_send_check,
1804 	.rebuild_header	   = inet_sk_rebuild_header,
1805 	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
1806 	.conn_request	   = tcp_v4_conn_request,
1807 	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
1808 	.net_header_len	   = sizeof(struct iphdr),
1809 	.setsockopt	   = ip_setsockopt,
1810 	.getsockopt	   = ip_getsockopt,
1811 	.addr2sockaddr	   = inet_csk_addr2sockaddr,
1812 	.sockaddr_len	   = sizeof(struct sockaddr_in),
1813 #ifdef CONFIG_COMPAT
1814 	.compat_setsockopt = compat_ip_setsockopt,
1815 	.compat_getsockopt = compat_ip_getsockopt,
1816 #endif
1817 	.mtu_reduced	   = tcp_v4_mtu_reduced,
1818 };
1819 EXPORT_SYMBOL(ipv4_specific);
1820 
1821 #ifdef CONFIG_TCP_MD5SIG
1822 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1823 	.md5_lookup		= tcp_v4_md5_lookup,
1824 	.calc_md5_hash		= tcp_v4_md5_hash_skb,
1825 	.md5_parse		= tcp_v4_parse_md5_keys,
1826 };
1827 #endif
1828 
1829 /* NOTE: A lot of things are set to zero explicitly by the call to
1830  *       sk_alloc(), so they need not be done here.
1831  */
1832 static int tcp_v4_init_sock(struct sock *sk)
1833 {
1834 	struct inet_connection_sock *icsk = inet_csk(sk);
1835 
1836 	tcp_init_sock(sk);
1837 
1838 	icsk->icsk_af_ops = &ipv4_specific;
1839 
1840 #ifdef CONFIG_TCP_MD5SIG
1841 	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
1842 #endif
1843 
1844 	return 0;
1845 }
1846 
1847 void tcp_v4_destroy_sock(struct sock *sk)
1848 {
1849 	struct tcp_sock *tp = tcp_sk(sk);
1850 
1851 	tcp_clear_xmit_timers(sk);
1852 
1853 	tcp_cleanup_congestion_control(sk);
1854 
1855 	/* Clean up the write buffer. */
1856 	tcp_write_queue_purge(sk);
1857 
1858 	/* Check if we want to disable active TFO */
1859 	tcp_fastopen_active_disable_ofo_check(sk);
1860 
1861 	/* Cleans up our, hopefully empty, out_of_order_queue. */
1862 	skb_rbtree_purge(&tp->out_of_order_queue);
1863 
1864 #ifdef CONFIG_TCP_MD5SIG
1865 	/* Clean up the MD5 key list, if any */
1866 	if (tp->md5sig_info) {
1867 		tcp_clear_md5_list(sk);
1868 		kfree_rcu(tp->md5sig_info, rcu);
1869 		tp->md5sig_info = NULL;
1870 	}
1871 #endif
1872 
1873 	/* Clean up the prequeue; it really must be empty by now. */
1874 	__skb_queue_purge(&tp->ucopy.prequeue);
1875 
1876 	/* Clean up a referenced TCP bind bucket. */
1877 	if (inet_csk(sk)->icsk_bind_hash)
1878 		inet_put_port(sk);
1879 
1880 	BUG_ON(tp->fastopen_rsk);
1881 
1882 	/* If the socket was aborted during a connect operation */
1883 	tcp_free_fastopen_req(tp);
1884 	tcp_saved_syn_free(tp);
1885 
1886 	sk_sockets_allocated_dec(sk);
1887 }
1888 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1889 
1890 #ifdef CONFIG_PROC_FS
1891 /* Proc filesystem TCP sock list dumping. */
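/*
 * The iterator walks the listening hash first and then the established hash
 * (which also holds TIME_WAIT and NEW_SYN_RECV entries).  st->bucket,
 * st->offset and st->last_pos record the current position so that a read
 * spanning several buffers can resume where the previous one stopped.
 */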
1892 
1893 /*
1894  * Get the next listener socket following cur.  If cur is NULL, get the
1895  * first socket, starting from the bucket given in st->bucket; when
1896  * st->bucket is zero the very first socket in the hash table is returned.
1897  */
1898 static void *listening_get_next(struct seq_file *seq, void *cur)
1899 {
1900 	struct tcp_iter_state *st = seq->private;
1901 	struct net *net = seq_file_net(seq);
1902 	struct inet_listen_hashbucket *ilb;
1903 	struct sock *sk = cur;
1904 
1905 	if (!sk) {
1906 get_head:
1907 		ilb = &tcp_hashinfo.listening_hash[st->bucket];
1908 		spin_lock(&ilb->lock);
1909 		sk = sk_head(&ilb->head);
1910 		st->offset = 0;
1911 		goto get_sk;
1912 	}
1913 	ilb = &tcp_hashinfo.listening_hash[st->bucket];
1914 	++st->num;
1915 	++st->offset;
1916 
1917 	sk = sk_next(sk);
1918 get_sk:
1919 	sk_for_each_from(sk) {
1920 		if (!net_eq(sock_net(sk), net))
1921 			continue;
1922 		if (sk->sk_family == st->family)
1923 			return sk;
1924 	}
1925 	spin_unlock(&ilb->lock);
1926 	st->offset = 0;
1927 	if (++st->bucket < INET_LHTABLE_SIZE)
1928 		goto get_head;
1929 	return NULL;
1930 }
1931 
1932 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1933 {
1934 	struct tcp_iter_state *st = seq->private;
1935 	void *rc;
1936 
1937 	st->bucket = 0;
1938 	st->offset = 0;
1939 	rc = listening_get_next(seq, NULL);
1940 
1941 	while (rc && *pos) {
1942 		rc = listening_get_next(seq, rc);
1943 		--*pos;
1944 	}
1945 	return rc;
1946 }
1947 
1948 static inline bool empty_bucket(const struct tcp_iter_state *st)
1949 {
1950 	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
1951 }
1952 
1953 /*
1954  * Get the first established socket, starting from the bucket given in st->bucket.
1955  * If st->bucket is zero, the very first socket in the hash is returned.
1956  */
1957 static void *established_get_first(struct seq_file *seq)
1958 {
1959 	struct tcp_iter_state *st = seq->private;
1960 	struct net *net = seq_file_net(seq);
1961 	void *rc = NULL;
1962 
1963 	st->offset = 0;
1964 	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
1965 		struct sock *sk;
1966 		struct hlist_nulls_node *node;
1967 		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
1968 
1969 		/* Lockless fast path for the common case of empty buckets */
1970 		if (empty_bucket(st))
1971 			continue;
1972 
1973 		spin_lock_bh(lock);
1974 		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
1975 			if (sk->sk_family != st->family ||
1976 			    !net_eq(sock_net(sk), net)) {
1977 				continue;
1978 			}
1979 			rc = sk;
1980 			goto out;
1981 		}
1982 		spin_unlock_bh(lock);
1983 	}
1984 out:
1985 	return rc;
1986 }
1987 
1988 static void *established_get_next(struct seq_file *seq, void *cur)
1989 {
1990 	struct sock *sk = cur;
1991 	struct hlist_nulls_node *node;
1992 	struct tcp_iter_state *st = seq->private;
1993 	struct net *net = seq_file_net(seq);
1994 
1995 	++st->num;
1996 	++st->offset;
1997 
1998 	sk = sk_nulls_next(sk);
1999 
2000 	sk_nulls_for_each_from(sk, node) {
2001 		if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2002 			return sk;
2003 	}
2004 
2005 	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2006 	++st->bucket;
2007 	return established_get_first(seq);
2008 }
2009 
2010 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2011 {
2012 	struct tcp_iter_state *st = seq->private;
2013 	void *rc;
2014 
2015 	st->bucket = 0;
2016 	rc = established_get_first(seq);
2017 
2018 	while (rc && pos) {
2019 		rc = established_get_next(seq, rc);
2020 		--pos;
2021 	}
2022 	return rc;
2023 }
2024 
2025 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2026 {
2027 	void *rc;
2028 	struct tcp_iter_state *st = seq->private;
2029 
2030 	st->state = TCP_SEQ_STATE_LISTENING;
2031 	rc	  = listening_get_idx(seq, &pos);
2032 
2033 	if (!rc) {
2034 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2035 		rc	  = established_get_idx(seq, pos);
2036 	}
2037 
2038 	return rc;
2039 }
2040 
2041 static void *tcp_seek_last_pos(struct seq_file *seq)
2042 {
2043 	struct tcp_iter_state *st = seq->private;
2044 	int offset = st->offset;
2045 	int orig_num = st->num;
2046 	void *rc = NULL;
2047 
2048 	switch (st->state) {
2049 	case TCP_SEQ_STATE_LISTENING:
2050 		if (st->bucket >= INET_LHTABLE_SIZE)
2051 			break;
2052 		st->state = TCP_SEQ_STATE_LISTENING;
2053 		rc = listening_get_next(seq, NULL);
2054 		while (offset-- && rc)
2055 			rc = listening_get_next(seq, rc);
2056 		if (rc)
2057 			break;
2058 		st->bucket = 0;
2059 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2060 		/* Fallthrough */
2061 	case TCP_SEQ_STATE_ESTABLISHED:
2062 		if (st->bucket > tcp_hashinfo.ehash_mask)
2063 			break;
2064 		rc = established_get_first(seq);
2065 		while (offset-- && rc)
2066 			rc = established_get_next(seq, rc);
2067 	}
2068 
2069 	st->num = orig_num;
2070 
2071 	return rc;
2072 }
2073 
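/*
 * seq_file ->start(): when the requested position matches st->last_pos from
 * the previous call, tcp_seek_last_pos() resumes from the saved bucket and
 * offset instead of rescanning the tables from the beginning.
 */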
2074 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2075 {
2076 	struct tcp_iter_state *st = seq->private;
2077 	void *rc;
2078 
2079 	if (*pos && *pos == st->last_pos) {
2080 		rc = tcp_seek_last_pos(seq);
2081 		if (rc)
2082 			goto out;
2083 	}
2084 
2085 	st->state = TCP_SEQ_STATE_LISTENING;
2086 	st->num = 0;
2087 	st->bucket = 0;
2088 	st->offset = 0;
2089 	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2090 
2091 out:
2092 	st->last_pos = *pos;
2093 	return rc;
2094 }
2095 
2096 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2097 {
2098 	struct tcp_iter_state *st = seq->private;
2099 	void *rc = NULL;
2100 
2101 	if (v == SEQ_START_TOKEN) {
2102 		rc = tcp_get_idx(seq, 0);
2103 		goto out;
2104 	}
2105 
2106 	switch (st->state) {
2107 	case TCP_SEQ_STATE_LISTENING:
2108 		rc = listening_get_next(seq, v);
2109 		if (!rc) {
2110 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2111 			st->bucket = 0;
2112 			st->offset = 0;
2113 			rc	  = established_get_first(seq);
2114 		}
2115 		break;
2116 	case TCP_SEQ_STATE_ESTABLISHED:
2117 		rc = established_get_next(seq, v);
2118 		break;
2119 	}
2120 out:
2121 	++*pos;
2122 	st->last_pos = *pos;
2123 	return rc;
2124 }
2125 
2126 static void tcp_seq_stop(struct seq_file *seq, void *v)
2127 {
2128 	struct tcp_iter_state *st = seq->private;
2129 
2130 	switch (st->state) {
2131 	case TCP_SEQ_STATE_LISTENING:
2132 		if (v != SEQ_START_TOKEN)
2133 			spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
2134 		break;
2135 	case TCP_SEQ_STATE_ESTABLISHED:
2136 		if (v)
2137 			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2138 		break;
2139 	}
2140 }
2141 
2142 int tcp_seq_open(struct inode *inode, struct file *file)
2143 {
2144 	struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
2145 	struct tcp_iter_state *s;
2146 	int err;
2147 
2148 	err = seq_open_net(inode, file, &afinfo->seq_ops,
2149 			  sizeof(struct tcp_iter_state));
2150 	if (err < 0)
2151 		return err;
2152 
2153 	s = ((struct seq_file *)file->private_data)->private;
2154 	s->family		= afinfo->family;
2155 	s->last_pos		= 0;
2156 	return 0;
2157 }
2158 EXPORT_SYMBOL(tcp_seq_open);
2159 
2160 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2161 {
2162 	int rc = 0;
2163 	struct proc_dir_entry *p;
2164 
2165 	afinfo->seq_ops.start		= tcp_seq_start;
2166 	afinfo->seq_ops.next		= tcp_seq_next;
2167 	afinfo->seq_ops.stop		= tcp_seq_stop;
2168 
2169 	p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2170 			     afinfo->seq_fops, afinfo);
2171 	if (!p)
2172 		rc = -ENOMEM;
2173 	return rc;
2174 }
2175 EXPORT_SYMBOL(tcp_proc_register);
2176 
2177 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2178 {
2179 	remove_proc_entry(afinfo->name, net->proc_net);
2180 }
2181 EXPORT_SYMBOL(tcp_proc_unregister);
2182 
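/*
 * The three helpers below each format one row of /proc/net/tcp.  The "tr"
 * (timer) column uses the following codes: 1 retransmit/loss-probe timer,
 * 2 keepalive timer, 3 TIME_WAIT timer, 4 zero-window probe timer,
 * 0 no timer pending.
 */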
2183 static void get_openreq4(const struct request_sock *req,
2184 			 struct seq_file *f, int i)
2185 {
2186 	const struct inet_request_sock *ireq = inet_rsk(req);
2187 	long delta = req->rsk_timer.expires - jiffies;
2188 
2189 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2190 		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2191 		i,
2192 		ireq->ir_loc_addr,
2193 		ireq->ir_num,
2194 		ireq->ir_rmt_addr,
2195 		ntohs(ireq->ir_rmt_port),
2196 		TCP_SYN_RECV,
2197 		0, 0, /* could print option size, but that is af dependent. */
2198 		1,    /* timers active (only the expire timer) */
2199 		jiffies_delta_to_clock_t(delta),
2200 		req->num_timeout,
2201 		from_kuid_munged(seq_user_ns(f),
2202 				 sock_i_uid(req->rsk_listener)),
2203 		0,  /* non-standard timer */
2204 		0, /* open_requests have no inode */
2205 		0,
2206 		req);
2207 }
2208 
2209 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2210 {
2211 	int timer_active;
2212 	unsigned long timer_expires;
2213 	const struct tcp_sock *tp = tcp_sk(sk);
2214 	const struct inet_connection_sock *icsk = inet_csk(sk);
2215 	const struct inet_sock *inet = inet_sk(sk);
2216 	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2217 	__be32 dest = inet->inet_daddr;
2218 	__be32 src = inet->inet_rcv_saddr;
2219 	__u16 destp = ntohs(inet->inet_dport);
2220 	__u16 srcp = ntohs(inet->inet_sport);
2221 	int rx_queue;
2222 	int state;
2223 
2224 	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2225 	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2226 	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2227 		timer_active	= 1;
2228 		timer_expires	= icsk->icsk_timeout;
2229 	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2230 		timer_active	= 4;
2231 		timer_expires	= icsk->icsk_timeout;
2232 	} else if (timer_pending(&sk->sk_timer)) {
2233 		timer_active	= 2;
2234 		timer_expires	= sk->sk_timer.expires;
2235 	} else {
2236 		timer_active	= 0;
2237 		timer_expires = jiffies;
2238 	}
2239 
2240 	state = sk_state_load(sk);
2241 	if (state == TCP_LISTEN)
2242 		rx_queue = sk->sk_ack_backlog;
2243 	else
2244 		/* Because we don't lock the socket,
2245 		 * we might find a transient negative value.
2246 		 */
2247 		rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2248 
2249 	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2250 			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2251 		i, src, srcp, dest, destp, state,
2252 		tp->write_seq - tp->snd_una,
2253 		rx_queue,
2254 		timer_active,
2255 		jiffies_delta_to_clock_t(timer_expires - jiffies),
2256 		icsk->icsk_retransmits,
2257 		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2258 		icsk->icsk_probes_out,
2259 		sock_i_ino(sk),
2260 		atomic_read(&sk->sk_refcnt), sk,
2261 		jiffies_to_clock_t(icsk->icsk_rto),
2262 		jiffies_to_clock_t(icsk->icsk_ack.ato),
2263 		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2264 		tp->snd_cwnd,
2265 		state == TCP_LISTEN ?
2266 		    fastopenq->max_qlen :
2267 		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2268 }
2269 
2270 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2271 			       struct seq_file *f, int i)
2272 {
2273 	long delta = tw->tw_timer.expires - jiffies;
2274 	__be32 dest, src;
2275 	__u16 destp, srcp;
2276 
2277 	dest  = tw->tw_daddr;
2278 	src   = tw->tw_rcv_saddr;
2279 	destp = ntohs(tw->tw_dport);
2280 	srcp  = ntohs(tw->tw_sport);
2281 
2282 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2283 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2284 		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2285 		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2286 		atomic_read(&tw->tw_refcnt), tw);
2287 }
2288 
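/*
 * tcp4_seq_show() pads every line to TMPSZ - 1 characters plus the trailing
 * newline, so each record in /proc/net/tcp has a fixed length.
 */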
2289 #define TMPSZ 150
2290 
2291 static int tcp4_seq_show(struct seq_file *seq, void *v)
2292 {
2293 	struct tcp_iter_state *st;
2294 	struct sock *sk = v;
2295 
2296 	seq_setwidth(seq, TMPSZ - 1);
2297 	if (v == SEQ_START_TOKEN) {
2298 		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2299 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2300 			   "inode");
2301 		goto out;
2302 	}
2303 	st = seq->private;
2304 
2305 	if (sk->sk_state == TCP_TIME_WAIT)
2306 		get_timewait4_sock(v, seq, st->num);
2307 	else if (sk->sk_state == TCP_NEW_SYN_RECV)
2308 		get_openreq4(v, seq, st->num);
2309 	else
2310 		get_tcp4_sock(v, seq, st->num);
2311 out:
2312 	seq_pad(seq, '\n');
2313 	return 0;
2314 }
2315 
2316 static const struct file_operations tcp_afinfo_seq_fops = {
2317 	.owner   = THIS_MODULE,
2318 	.open    = tcp_seq_open,
2319 	.read    = seq_read,
2320 	.llseek  = seq_lseek,
2321 	.release = seq_release_net
2322 };
2323 
2324 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2325 	.name		= "tcp",
2326 	.family		= AF_INET,
2327 	.seq_fops	= &tcp_afinfo_seq_fops,
2328 	.seq_ops	= {
2329 		.show		= tcp4_seq_show,
2330 	},
2331 };
2332 
2333 static int __net_init tcp4_proc_init_net(struct net *net)
2334 {
2335 	return tcp_proc_register(net, &tcp4_seq_afinfo);
2336 }
2337 
2338 static void __net_exit tcp4_proc_exit_net(struct net *net)
2339 {
2340 	tcp_proc_unregister(net, &tcp4_seq_afinfo);
2341 }
2342 
2343 static struct pernet_operations tcp4_net_ops = {
2344 	.init = tcp4_proc_init_net,
2345 	.exit = tcp4_proc_exit_net,
2346 };
2347 
2348 int __init tcp4_proc_init(void)
2349 {
2350 	return register_pernet_subsys(&tcp4_net_ops);
2351 }
2352 
2353 void tcp4_proc_exit(void)
2354 {
2355 	unregister_pernet_subsys(&tcp4_net_ops);
2356 }
2357 #endif /* CONFIG_PROC_FS */
2358 
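/*
 * tcp_prot ties TCP into the generic inet socket layer; it is registered
 * from af_inet.c at boot via proto_register() and inet_register_protosw().
 */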
2359 struct proto tcp_prot = {
2360 	.name			= "TCP",
2361 	.owner			= THIS_MODULE,
2362 	.close			= tcp_close,
2363 	.connect		= tcp_v4_connect,
2364 	.disconnect		= tcp_disconnect,
2365 	.accept			= inet_csk_accept,
2366 	.ioctl			= tcp_ioctl,
2367 	.init			= tcp_v4_init_sock,
2368 	.destroy		= tcp_v4_destroy_sock,
2369 	.shutdown		= tcp_shutdown,
2370 	.setsockopt		= tcp_setsockopt,
2371 	.getsockopt		= tcp_getsockopt,
2372 	.keepalive		= tcp_set_keepalive,
2373 	.recvmsg		= tcp_recvmsg,
2374 	.sendmsg		= tcp_sendmsg,
2375 	.sendpage		= tcp_sendpage,
2376 	.backlog_rcv		= tcp_v4_do_rcv,
2377 	.release_cb		= tcp_release_cb,
2378 	.hash			= inet_hash,
2379 	.unhash			= inet_unhash,
2380 	.get_port		= inet_csk_get_port,
2381 	.enter_memory_pressure	= tcp_enter_memory_pressure,
2382 	.stream_memory_free	= tcp_stream_memory_free,
2383 	.sockets_allocated	= &tcp_sockets_allocated,
2384 	.orphan_count		= &tcp_orphan_count,
2385 	.memory_allocated	= &tcp_memory_allocated,
2386 	.memory_pressure	= &tcp_memory_pressure,
2387 	.sysctl_mem		= sysctl_tcp_mem,
2388 	.sysctl_wmem		= sysctl_tcp_wmem,
2389 	.sysctl_rmem		= sysctl_tcp_rmem,
2390 	.max_header		= MAX_TCP_HEADER,
2391 	.obj_size		= sizeof(struct tcp_sock),
2392 	.slab_flags		= SLAB_DESTROY_BY_RCU,
2393 	.twsk_prot		= &tcp_timewait_sock_ops,
2394 	.rsk_prot		= &tcp_request_sock_ops,
2395 	.h.hashinfo		= &tcp_hashinfo,
2396 	.no_autobind		= true,
2397 #ifdef CONFIG_COMPAT
2398 	.compat_setsockopt	= compat_tcp_setsockopt,
2399 	.compat_getsockopt	= compat_tcp_getsockopt,
2400 #endif
2401 	.diag_destroy		= tcp_abort,
2402 };
2403 EXPORT_SYMBOL(tcp_prot);
2404 
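/*
 * Per-network-namespace setup: tcp_sk_init() creates one kernel control
 * socket per possible CPU (used to send RSTs and ACKs for packets that match
 * no local socket) and initializes the per-netns sysctl defaults;
 * tcp_sk_exit() destroys those control sockets again.
 */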
2405 static void __net_exit tcp_sk_exit(struct net *net)
2406 {
2407 	int cpu;
2408 
2409 	for_each_possible_cpu(cpu)
2410 		inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2411 	free_percpu(net->ipv4.tcp_sk);
2412 }
2413 
2414 static int __net_init tcp_sk_init(struct net *net)
2415 {
2416 	int res, cpu, cnt;
2417 
2418 	net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2419 	if (!net->ipv4.tcp_sk)
2420 		return -ENOMEM;
2421 
2422 	for_each_possible_cpu(cpu) {
2423 		struct sock *sk;
2424 
2425 		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2426 					   IPPROTO_TCP, net);
2427 		if (res)
2428 			goto fail;
2429 		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2430 		*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2431 	}
2432 
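	/*
	 * Per-namespace sysctl defaults.  tcp_ecn == 2 enables ECN when it is
	 * requested by incoming connections but never requests it on outgoing
	 * ones; SYN cookies default to on.
	 */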
2433 	net->ipv4.sysctl_tcp_ecn = 2;
2434 	net->ipv4.sysctl_tcp_ecn_fallback = 1;
2435 
2436 	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2437 	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2438 	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2439 
2440 	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2441 	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2442 	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2443 
2444 	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2445 	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2446 	net->ipv4.sysctl_tcp_syncookies = 1;
2447 	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2448 	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2449 	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2450 	net->ipv4.sysctl_tcp_orphan_retries = 0;
2451 	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2452 	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
2453 	net->ipv4.sysctl_tcp_tw_reuse = 0;
2454 
2455 	cnt = tcp_hashinfo.ehash_mask + 1;
2456 	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = (cnt + 1) / 2;
2457 	net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
2458 
2459 	net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 256);
2460 
2461 	return 0;
2462 fail:
2463 	tcp_sk_exit(net);
2464 
2465 	return res;
2466 }
2467 
2468 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2469 {
2470 	inet_twsk_purge(&tcp_hashinfo, AF_INET);
2471 }
2472 
2473 static struct pernet_operations __net_initdata tcp_sk_ops = {
2474        .init	   = tcp_sk_init,
2475        .exit	   = tcp_sk_exit,
2476        .exit_batch = tcp_sk_exit_batch,
2477 };
2478 
2479 void __init tcp_v4_init(void)
2480 {
2481 	if (register_pernet_subsys(&tcp_sk_ops))
2482 		panic("Failed to create the TCP control socket.\n");
2483 }
2484