xref: /openbmc/linux/net/ipv4/tcp_ipv4.c (revision 110e6f26)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		Implementation of the Transmission Control Protocol(TCP).
7  *
8  *		IPv4 specific functions
9  *
10  *
11  *		code split from:
12  *		linux/ipv4/tcp.c
13  *		linux/ipv4/tcp_input.c
14  *		linux/ipv4/tcp_output.c
15  *
16  *		See tcp.c for author information
17  *
18  *	This program is free software; you can redistribute it and/or
19  *      modify it under the terms of the GNU General Public License
20  *      as published by the Free Software Foundation; either version
21  *      2 of the License, or (at your option) any later version.
22  */
23 
24 /*
25  * Changes:
26  *		David S. Miller	:	New socket lookup architecture.
27  *					This code is dedicated to John Dyson.
28  *		David S. Miller :	Change semantics of established hash,
29  *					half is devoted to TIME_WAIT sockets
30  *					and the rest go in the other half.
31  *		Andi Kleen :		Add support for syncookies and fixed
32  *					some bugs: ip options weren't passed to
33  *					the TCP layer, missed a check for an
34  *					ACK bit.
35  *		Andi Kleen :		Implemented fast path mtu discovery.
36  *	     				Fixed many serious bugs in the
37  *					request_sock handling and moved
38  *					most of it into the af independent code.
39  *					Added tail drop and some other bugfixes.
40  *					Added new listen semantics.
41  *		Mike McLagan	:	Routing by source
42  *	Juan Jose Ciarlante:		ip_dynaddr bits
43  *		Andi Kleen:		various fixes.
44  *	Vitaly E. Lavrov	:	Transparent proxy revived after year
45  *					coma.
46  *	Andi Kleen		:	Fix new listen.
47  *	Andi Kleen		:	Fix accept error reporting.
48  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
49  *	Alexey Kuznetsov		allows both IPv4 and IPv6 sockets to bind
50  *					a single port at the same time.
51  */
52 
53 #define pr_fmt(fmt) "TCP: " fmt
54 
55 #include <linux/bottom_half.h>
56 #include <linux/types.h>
57 #include <linux/fcntl.h>
58 #include <linux/module.h>
59 #include <linux/random.h>
60 #include <linux/cache.h>
61 #include <linux/jhash.h>
62 #include <linux/init.h>
63 #include <linux/times.h>
64 #include <linux/slab.h>
65 
66 #include <net/net_namespace.h>
67 #include <net/icmp.h>
68 #include <net/inet_hashtables.h>
69 #include <net/tcp.h>
70 #include <net/transp_v6.h>
71 #include <net/ipv6.h>
72 #include <net/inet_common.h>
73 #include <net/timewait_sock.h>
74 #include <net/xfrm.h>
75 #include <net/secure_seq.h>
76 #include <net/busy_poll.h>
77 
78 #include <linux/inet.h>
79 #include <linux/ipv6.h>
80 #include <linux/stddef.h>
81 #include <linux/proc_fs.h>
82 #include <linux/seq_file.h>
83 
84 #include <crypto/hash.h>
85 #include <linux/scatterlist.h>
86 
87 int sysctl_tcp_tw_reuse __read_mostly;
88 int sysctl_tcp_low_latency __read_mostly;
89 EXPORT_SYMBOL(sysctl_tcp_low_latency);
90 
91 #ifdef CONFIG_TCP_MD5SIG
92 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
93 			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
94 #endif
95 
96 struct inet_hashinfo tcp_hashinfo;
97 EXPORT_SYMBOL(tcp_hashinfo);
98 
99 static  __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
100 {
101 	return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
102 					  ip_hdr(skb)->saddr,
103 					  tcp_hdr(skb)->dest,
104 					  tcp_hdr(skb)->source);
105 }
106 
107 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
108 {
109 	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
110 	struct tcp_sock *tp = tcp_sk(sk);
111 
112 	/* With PAWS, it is safe from the viewpoint
113 	   of data integrity. Even without PAWS it is safe provided sequence
114 	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.
115 
116 	   Actually, the idea is close to VJ's, only the timestamp cache is
117 	   held not per host but per port pair, and the TW bucket is used as
118 	   the state holder.
119 
120 	   If the TW bucket has already been destroyed we fall back to VJ's scheme
121 	   and use the initial timestamp retrieved from the peer table.
122 	 */
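	/* When reuse is permitted, the new connection's write_seq starts
	 * 65535 + 2 bytes beyond the old tw_snd_nxt so the sequence spaces
	 * of the two incarnations cannot overlap, and the old timestamp
	 * state is inherited so PAWS keeps working across the reuse.
	 */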
123 	if (tcptw->tw_ts_recent_stamp &&
124 	    (!twp || (sysctl_tcp_tw_reuse &&
125 			     get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
126 		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
127 		if (tp->write_seq == 0)
128 			tp->write_seq = 1;
129 		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
130 		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
131 		sock_hold(sktw);
132 		return 1;
133 	}
134 
135 	return 0;
136 }
137 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
138 
139 /* This will initiate an outgoing connection. */
140 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
141 {
142 	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
143 	struct inet_sock *inet = inet_sk(sk);
144 	struct tcp_sock *tp = tcp_sk(sk);
145 	__be16 orig_sport, orig_dport;
146 	__be32 daddr, nexthop;
147 	struct flowi4 *fl4;
148 	struct rtable *rt;
149 	int err;
150 	struct ip_options_rcu *inet_opt;
151 
152 	if (addr_len < sizeof(struct sockaddr_in))
153 		return -EINVAL;
154 
155 	if (usin->sin_family != AF_INET)
156 		return -EAFNOSUPPORT;
157 
158 	nexthop = daddr = usin->sin_addr.s_addr;
159 	inet_opt = rcu_dereference_protected(inet->inet_opt,
160 					     sock_owned_by_user(sk));
161 	if (inet_opt && inet_opt->opt.srr) {
162 		if (!daddr)
163 			return -EINVAL;
164 		nexthop = inet_opt->opt.faddr;
165 	}
166 
167 	orig_sport = inet->inet_sport;
168 	orig_dport = usin->sin_port;
169 	fl4 = &inet->cork.fl.u.ip4;
170 	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
171 			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
172 			      IPPROTO_TCP,
173 			      orig_sport, orig_dport, sk);
174 	if (IS_ERR(rt)) {
175 		err = PTR_ERR(rt);
176 		if (err == -ENETUNREACH)
177 			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
178 		return err;
179 	}
180 
181 	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
182 		ip_rt_put(rt);
183 		return -ENETUNREACH;
184 	}
185 
186 	if (!inet_opt || !inet_opt->opt.srr)
187 		daddr = fl4->daddr;
188 
189 	if (!inet->inet_saddr)
190 		inet->inet_saddr = fl4->saddr;
191 	sk_rcv_saddr_set(sk, inet->inet_saddr);
192 
193 	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
194 		/* Reset inherited state */
195 		tp->rx_opt.ts_recent	   = 0;
196 		tp->rx_opt.ts_recent_stamp = 0;
197 		if (likely(!tp->repair))
198 			tp->write_seq	   = 0;
199 	}
200 
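	/* With tw_recycle enabled and no timestamp state inherited from a
	 * previous connection to this destination, fetch the cached
	 * TIME_WAIT timestamp for this route before the SYN is sent.
	 */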
201 	if (tcp_death_row.sysctl_tw_recycle &&
202 	    !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
203 		tcp_fetch_timewait_stamp(sk, &rt->dst);
204 
205 	inet->inet_dport = usin->sin_port;
206 	sk_daddr_set(sk, daddr);
207 
208 	inet_csk(sk)->icsk_ext_hdr_len = 0;
209 	if (inet_opt)
210 		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
211 
212 	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
213 
214 	/* Socket identity is still unknown (sport may be zero).
215 	 * However we set the state to SYN-SENT and, without releasing the
216 	 * socket lock, select a source port, enter ourselves into the hash
217 	 * tables and complete initialization after this.
218 	 */
219 	tcp_set_state(sk, TCP_SYN_SENT);
220 	err = inet_hash_connect(&tcp_death_row, sk);
221 	if (err)
222 		goto failure;
223 
224 	sk_set_txhash(sk);
225 
226 	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
227 			       inet->inet_sport, inet->inet_dport, sk);
228 	if (IS_ERR(rt)) {
229 		err = PTR_ERR(rt);
230 		rt = NULL;
231 		goto failure;
232 	}
233 	/* OK, now commit destination to socket.  */
234 	sk->sk_gso_type = SKB_GSO_TCPV4;
235 	sk_setup_caps(sk, &rt->dst);
236 
237 	if (!tp->write_seq && likely(!tp->repair))
238 		tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
239 							   inet->inet_daddr,
240 							   inet->inet_sport,
241 							   usin->sin_port);
242 
243 	inet->inet_id = tp->write_seq ^ jiffies;
244 
245 	err = tcp_connect(sk);
246 
247 	rt = NULL;
248 	if (err)
249 		goto failure;
250 
251 	return 0;
252 
253 failure:
254 	/*
255 	 * This unhashes the socket and releases the local port,
256 	 * if necessary.
257 	 */
258 	tcp_set_state(sk, TCP_CLOSE);
259 	ip_rt_put(rt);
260 	sk->sk_route_caps = 0;
261 	inet->inet_dport = 0;
262 	return err;
263 }
264 EXPORT_SYMBOL(tcp_v4_connect);
265 
266 /*
267  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
268  * It can be called through tcp_release_cb() if socket was owned by user
269  * at the time tcp_v4_err() was called to handle ICMP message.
270  */
271 void tcp_v4_mtu_reduced(struct sock *sk)
272 {
273 	struct dst_entry *dst;
274 	struct inet_sock *inet = inet_sk(sk);
275 	u32 mtu = tcp_sk(sk)->mtu_info;
276 
277 	dst = inet_csk_update_pmtu(sk, mtu);
278 	if (!dst)
279 		return;
280 
281 	/* Something is about to go wrong... Remember the soft error
282 	 * in case this connection is not able to recover.
283 	 */
284 	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
285 		sk->sk_err_soft = EMSGSIZE;
286 
287 	mtu = dst_mtu(dst);
288 
289 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
290 	    ip_sk_accept_pmtu(sk) &&
291 	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
292 		tcp_sync_mss(sk, mtu);
293 
294 		/* Resend the TCP packet because it's
295 		 * clear that the old packet has been
296 		 * dropped. This is the new "fast" path mtu
297 		 * discovery.
298 		 */
299 		tcp_simple_retransmit(sk);
300 	} /* else let the usual retransmit timer handle it */
301 }
302 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
303 
304 static void do_redirect(struct sk_buff *skb, struct sock *sk)
305 {
306 	struct dst_entry *dst = __sk_dst_check(sk, 0);
307 
308 	if (dst)
309 		dst->ops->redirect(dst, sk, skb);
310 }
311 
312 
313 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
314 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
315 {
316 	struct request_sock *req = inet_reqsk(sk);
317 	struct net *net = sock_net(sk);
318 
319 	/* ICMPs are not backlogged, hence we cannot get
320 	 * an established socket here.
321 	 */
322 	if (seq != tcp_rsk(req)->snt_isn) {
323 		NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
324 	} else if (abort) {
325 		/*
326 		 * Still in SYN_RECV, just remove it silently.
327 		 * There is no good way to pass the error to the newly
328 		 * created socket, and POSIX does not want network
329 		 * errors returned from accept().
330 		 */
331 		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
332 		NET_INC_STATS_BH(net, LINUX_MIB_LISTENDROPS);
333 	}
334 	reqsk_put(req);
335 }
336 EXPORT_SYMBOL(tcp_req_err);
337 
338 /*
339  * This routine is called by the ICMP module when it gets some
340  * sort of error condition.  If err < 0 then the socket should
341  * be closed and the error returned to the user.  If err > 0
342  * it's just the icmp type << 8 | icmp code.  After adjustment
343  * header points to the first 8 bytes of the tcp header.  We need
344  * to find the appropriate port.
345  *
346  * The locking strategy used here is very "optimistic". When
347  * someone else accesses the socket the ICMP is just dropped
348  * and for some paths there is no check at all.
349  * A more general error queue to queue errors for later handling
350  * is probably better.
351  *
352  */
353 
354 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
355 {
356 	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
357 	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
358 	struct inet_connection_sock *icsk;
359 	struct tcp_sock *tp;
360 	struct inet_sock *inet;
361 	const int type = icmp_hdr(icmp_skb)->type;
362 	const int code = icmp_hdr(icmp_skb)->code;
363 	struct sock *sk;
364 	struct sk_buff *skb;
365 	struct request_sock *fastopen;
366 	__u32 seq, snd_una;
367 	__u32 remaining;
368 	int err;
369 	struct net *net = dev_net(icmp_skb->dev);
370 
371 	sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
372 				       th->dest, iph->saddr, ntohs(th->source),
373 				       inet_iif(icmp_skb));
374 	if (!sk) {
375 		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
376 		return;
377 	}
378 	if (sk->sk_state == TCP_TIME_WAIT) {
379 		inet_twsk_put(inet_twsk(sk));
380 		return;
381 	}
382 	seq = ntohl(th->seq);
383 	if (sk->sk_state == TCP_NEW_SYN_RECV)
384 		return tcp_req_err(sk, seq,
385 				  type == ICMP_PARAMETERPROB ||
386 				  type == ICMP_TIME_EXCEEDED ||
387 				  (type == ICMP_DEST_UNREACH &&
388 				   (code == ICMP_NET_UNREACH ||
389 				    code == ICMP_HOST_UNREACH)));
390 
391 	bh_lock_sock(sk);
392 	/* If too many ICMPs get dropped on busy
393 	 * servers this needs to be solved differently.
394 	 * We do take care of the PMTU discovery (RFC1191) special case:
395 	 * we can receive locally generated ICMP messages while socket is held.
396 	 */
397 	if (sock_owned_by_user(sk)) {
398 		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
399 			NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
400 	}
401 	if (sk->sk_state == TCP_CLOSE)
402 		goto out;
403 
404 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
405 		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
406 		goto out;
407 	}
408 
409 	icsk = inet_csk(sk);
410 	tp = tcp_sk(sk);
411 	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
412 	fastopen = tp->fastopen_rsk;
413 	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
414 	if (sk->sk_state != TCP_LISTEN &&
415 	    !between(seq, snd_una, tp->snd_nxt)) {
416 		NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
417 		goto out;
418 	}
419 
420 	switch (type) {
421 	case ICMP_REDIRECT:
422 		do_redirect(icmp_skb, sk);
423 		goto out;
424 	case ICMP_SOURCE_QUENCH:
425 		/* Just silently ignore these. */
426 		goto out;
427 	case ICMP_PARAMETERPROB:
428 		err = EPROTO;
429 		break;
430 	case ICMP_DEST_UNREACH:
431 		if (code > NR_ICMP_UNREACH)
432 			goto out;
433 
434 		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
435 			/* We are not interested in TCP_LISTEN and open_requests
436 			 * (SYN-ACKs sent out by Linux are always < 576 bytes so
437 			 * they should go through unfragmented).
438 			 */
439 			if (sk->sk_state == TCP_LISTEN)
440 				goto out;
441 
442 			tp->mtu_info = info;
443 			if (!sock_owned_by_user(sk)) {
444 				tcp_v4_mtu_reduced(sk);
445 			} else {
446 				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
447 					sock_hold(sk);
448 			}
449 			goto out;
450 		}
451 
452 		err = icmp_err_convert[code].errno;
453 		/* check if icmp_skb allows revert of backoff
454 		 * (see draft-zimmermann-tcp-lcd) */
455 		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
456 			break;
457 		if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
458 		    !icsk->icsk_backoff || fastopen)
459 			break;
460 
461 		if (sock_owned_by_user(sk))
462 			break;
463 
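		/* Revert one step of exponential backoff and recompute the
		 * RTO from the current srtt; if part of the reverted RTO is
		 * still outstanding, re-arm the retransmit timer for the
		 * remainder, otherwise retransmit immediately.
		 */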
464 		icsk->icsk_backoff--;
465 		icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
466 					       TCP_TIMEOUT_INIT;
467 		icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
468 
469 		skb = tcp_write_queue_head(sk);
470 		BUG_ON(!skb);
471 
472 		remaining = icsk->icsk_rto -
473 			    min(icsk->icsk_rto,
474 				tcp_time_stamp - tcp_skb_timestamp(skb));
475 
476 		if (remaining) {
477 			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
478 						  remaining, TCP_RTO_MAX);
479 		} else {
480 			/* RTO revert clocked out retransmission.
481 			 * Will retransmit now */
482 			tcp_retransmit_timer(sk);
483 		}
484 
485 		break;
486 	case ICMP_TIME_EXCEEDED:
487 		err = EHOSTUNREACH;
488 		break;
489 	default:
490 		goto out;
491 	}
492 
493 	switch (sk->sk_state) {
494 	case TCP_SYN_SENT:
495 	case TCP_SYN_RECV:
496 		/* Only in fast or simultaneous open. If a fast open socket
497 		 * is already accepted it is treated as a connected one below.
498 		 */
499 		if (fastopen && !fastopen->sk)
500 			break;
501 
502 		if (!sock_owned_by_user(sk)) {
503 			sk->sk_err = err;
504 
505 			sk->sk_error_report(sk);
506 
507 			tcp_done(sk);
508 		} else {
509 			sk->sk_err_soft = err;
510 		}
511 		goto out;
512 	}
513 
514 	/* If we've already connected we will keep trying
515 	 * until we time out, or the user gives up.
516 	 *
517 	 * RFC 1122 4.2.3.9 allows us to treat as hard errors
518 	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
519 	 * but it is obsoleted by pmtu discovery).
520 	 *
521 	 * Note that in the modern internet, where routing is unreliable
522 	 * and broken firewalls sit in every dark corner, sending random
523 	 * errors ordered by their masters, even these two messages finally lose
524 	 * their original sense (even Linux sends invalid PORT_UNREACHs)
525 	 *
526 	 * Now we are in compliance with RFCs.
527 	 *							--ANK (980905)
528 	 */
529 
530 	inet = inet_sk(sk);
531 	if (!sock_owned_by_user(sk) && inet->recverr) {
532 		sk->sk_err = err;
533 		sk->sk_error_report(sk);
534 	} else	{ /* Only an error on timeout */
535 		sk->sk_err_soft = err;
536 	}
537 
538 out:
539 	bh_unlock_sock(sk);
540 	sock_put(sk);
541 }
542 
543 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
544 {
545 	struct tcphdr *th = tcp_hdr(skb);
546 
547 	if (skb->ip_summed == CHECKSUM_PARTIAL) {
548 		th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
549 		skb->csum_start = skb_transport_header(skb) - skb->head;
550 		skb->csum_offset = offsetof(struct tcphdr, check);
551 	} else {
552 		th->check = tcp_v4_check(skb->len, saddr, daddr,
553 					 csum_partial(th,
554 						      th->doff << 2,
555 						      skb->csum));
556 	}
557 }
558 
559 /* This routine computes an IPv4 TCP checksum. */
560 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
561 {
562 	const struct inet_sock *inet = inet_sk(sk);
563 
564 	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
565 }
566 EXPORT_SYMBOL(tcp_v4_send_check);
567 
568 /*
569  *	This routine will send an RST to the other tcp.
570  *
571  *	Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
572  *		      for the reset?
573  *	Answer: if a packet caused a RST, it was not meant for a socket
574  *		existing in our system; if it does match a socket,
575  *		it is just a duplicate segment or a bug in the other side's TCP.
576  *		So we build the reply based only on the parameters that
577  *		arrived with the segment.
578  *	Exception: precedence violation. We do not implement it in any case.
579  */
580 
581 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
582 {
583 	const struct tcphdr *th = tcp_hdr(skb);
584 	struct {
585 		struct tcphdr th;
586 #ifdef CONFIG_TCP_MD5SIG
587 		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
588 #endif
589 	} rep;
590 	struct ip_reply_arg arg;
591 #ifdef CONFIG_TCP_MD5SIG
592 	struct tcp_md5sig_key *key = NULL;
593 	const __u8 *hash_location = NULL;
594 	unsigned char newhash[16];
595 	int genhash;
596 	struct sock *sk1 = NULL;
597 #endif
598 	struct net *net;
599 
600 	/* Never send a reset in response to a reset. */
601 	if (th->rst)
602 		return;
603 
604 	/* If sk is not NULL, it means we did a successful lookup and the incoming
605 	 * route had to be correct. prequeue might have dropped our dst.
606 	 */
607 	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
608 		return;
609 
610 	/* Swap the send and the receive. */
611 	memset(&rep, 0, sizeof(rep));
612 	rep.th.dest   = th->source;
613 	rep.th.source = th->dest;
614 	rep.th.doff   = sizeof(struct tcphdr) / 4;
615 	rep.th.rst    = 1;
616 
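	/* If the offending segment carried an ACK, the RST takes its
	 * sequence number from that ACK.  Otherwise we send RST|ACK and
	 * acknowledge everything the segment occupied: its payload length
	 * plus one for the SYN flag and one for the FIN flag if set.
	 */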
617 	if (th->ack) {
618 		rep.th.seq = th->ack_seq;
619 	} else {
620 		rep.th.ack = 1;
621 		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
622 				       skb->len - (th->doff << 2));
623 	}
624 
625 	memset(&arg, 0, sizeof(arg));
626 	arg.iov[0].iov_base = (unsigned char *)&rep;
627 	arg.iov[0].iov_len  = sizeof(rep.th);
628 
629 	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
630 #ifdef CONFIG_TCP_MD5SIG
631 	hash_location = tcp_parse_md5sig_option(th);
632 	if (sk && sk_fullsock(sk)) {
633 		key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
634 					&ip_hdr(skb)->saddr, AF_INET);
635 	} else if (hash_location) {
636 		/*
637 		 * The active side is lost. Try to find the listening socket through
638 		 * the source port, and then find the md5 key through that socket.
639 		 * We do not loosen security here:
640 		 * the incoming packet is checked against the md5 hash of the key we find;
641 		 * no RST is generated if the md5 hash doesn't match.
642 		 */
643 		sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
644 					     ip_hdr(skb)->saddr,
645 					     th->source, ip_hdr(skb)->daddr,
646 					     ntohs(th->source), inet_iif(skb));
647 		/* don't send a RST if we can't find a key */
648 		if (!sk1)
649 			return;
650 		rcu_read_lock();
651 		key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
652 					&ip_hdr(skb)->saddr, AF_INET);
653 		if (!key)
654 			goto release_sk1;
655 
656 		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
657 		if (genhash || memcmp(hash_location, newhash, 16) != 0)
658 			goto release_sk1;
659 	}
660 
661 	if (key) {
662 		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
663 				   (TCPOPT_NOP << 16) |
664 				   (TCPOPT_MD5SIG << 8) |
665 				   TCPOLEN_MD5SIG);
666 		/* Update length and the length the header thinks exists */
667 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
668 		rep.th.doff = arg.iov[0].iov_len / 4;
669 
670 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
671 				     key, ip_hdr(skb)->saddr,
672 				     ip_hdr(skb)->daddr, &rep.th);
673 	}
674 #endif
675 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
676 				      ip_hdr(skb)->saddr, /* XXX */
677 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
678 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
679 	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
680 
681 	/* When socket is gone, all binding information is lost.
682 	 * routing might fail in this case. No choice here, if we choose to force
683 	 * input interface, we will misroute in case of asymmetric route.
684 	 */
685 	if (sk)
686 		arg.bound_dev_if = sk->sk_bound_dev_if;
687 
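	/* sk may be a timewait socket here; the BUILD_BUG_ON below ensures
	 * that reading sk_bound_dev_if through struct sock is still valid,
	 * because the field shares its offset with tw_bound_dev_if in
	 * struct inet_timewait_sock.
	 */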
688 	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
689 		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));
690 
691 	arg.tos = ip_hdr(skb)->tos;
692 	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
693 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
694 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
695 			      &arg, arg.iov[0].iov_len);
696 
697 	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
698 	TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
699 
700 #ifdef CONFIG_TCP_MD5SIG
701 release_sk1:
702 	if (sk1) {
703 		rcu_read_unlock();
704 		sock_put(sk1);
705 	}
706 #endif
707 }
708 
709 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
710    outside socket context, is certainly ugly. What can I do?
711  */
712 
713 static void tcp_v4_send_ack(struct net *net,
714 			    struct sk_buff *skb, u32 seq, u32 ack,
715 			    u32 win, u32 tsval, u32 tsecr, int oif,
716 			    struct tcp_md5sig_key *key,
717 			    int reply_flags, u8 tos)
718 {
719 	const struct tcphdr *th = tcp_hdr(skb);
720 	struct {
721 		struct tcphdr th;
722 		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
723 #ifdef CONFIG_TCP_MD5SIG
724 			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
725 #endif
726 			];
727 	} rep;
728 	struct ip_reply_arg arg;
729 
730 	memset(&rep.th, 0, sizeof(struct tcphdr));
731 	memset(&arg, 0, sizeof(arg));
732 
733 	arg.iov[0].iov_base = (unsigned char *)&rep;
734 	arg.iov[0].iov_len  = sizeof(rep.th);
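	/* When a timestamp is being echoed, the reply carries a 12-byte
	 * option block: two NOPs for alignment, the TIMESTAMP kind/length
	 * pair, then the 32-bit tsval and tsecr words filled in below.
	 */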
735 	if (tsecr) {
736 		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
737 				   (TCPOPT_TIMESTAMP << 8) |
738 				   TCPOLEN_TIMESTAMP);
739 		rep.opt[1] = htonl(tsval);
740 		rep.opt[2] = htonl(tsecr);
741 		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
742 	}
743 
744 	/* Swap the send and the receive. */
745 	rep.th.dest    = th->source;
746 	rep.th.source  = th->dest;
747 	rep.th.doff    = arg.iov[0].iov_len / 4;
748 	rep.th.seq     = htonl(seq);
749 	rep.th.ack_seq = htonl(ack);
750 	rep.th.ack     = 1;
751 	rep.th.window  = htons(win);
752 
753 #ifdef CONFIG_TCP_MD5SIG
754 	if (key) {
755 		int offset = (tsecr) ? 3 : 0;
756 
757 		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
758 					  (TCPOPT_NOP << 16) |
759 					  (TCPOPT_MD5SIG << 8) |
760 					  TCPOLEN_MD5SIG);
761 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
762 		rep.th.doff = arg.iov[0].iov_len / 4;
763 
764 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
765 				    key, ip_hdr(skb)->saddr,
766 				    ip_hdr(skb)->daddr, &rep.th);
767 	}
768 #endif
769 	arg.flags = reply_flags;
770 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
771 				      ip_hdr(skb)->saddr, /* XXX */
772 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
773 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
774 	if (oif)
775 		arg.bound_dev_if = oif;
776 	arg.tos = tos;
777 	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
778 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
779 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
780 			      &arg, arg.iov[0].iov_len);
781 
782 	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
783 }
784 
785 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
786 {
787 	struct inet_timewait_sock *tw = inet_twsk(sk);
788 	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
789 
790 	tcp_v4_send_ack(sock_net(sk), skb,
791 			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
792 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
793 			tcp_time_stamp + tcptw->tw_ts_offset,
794 			tcptw->tw_ts_recent,
795 			tw->tw_bound_dev_if,
796 			tcp_twsk_md5_key(tcptw),
797 			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
798 			tw->tw_tos
799 			);
800 
801 	inet_twsk_put(tw);
802 }
803 
804 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
805 				  struct request_sock *req)
806 {
807 	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
808 	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
809 	 */
810 	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
811 					     tcp_sk(sk)->snd_nxt;
812 
813 	tcp_v4_send_ack(sock_net(sk), skb, seq,
814 			tcp_rsk(req)->rcv_nxt, req->rsk_rcv_wnd,
815 			tcp_time_stamp,
816 			req->ts_recent,
817 			0,
818 			tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
819 					  AF_INET),
820 			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
821 			ip_hdr(skb)->tos);
822 }
823 
824 /*
825  *	Send a SYN-ACK after having received a SYN.
826  *	This still operates on a request_sock only, not on a big
827  *	socket.
828  */
829 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
830 			      struct flowi *fl,
831 			      struct request_sock *req,
832 			      struct tcp_fastopen_cookie *foc,
833 				  bool attach_req)
834 {
835 	const struct inet_request_sock *ireq = inet_rsk(req);
836 	struct flowi4 fl4;
837 	int err = -1;
838 	struct sk_buff *skb;
839 
840 	/* First, grab a route. */
841 	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
842 		return -1;
843 
844 	skb = tcp_make_synack(sk, dst, req, foc, attach_req);
845 
846 	if (skb) {
847 		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
848 
849 		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
850 					    ireq->ir_rmt_addr,
851 					    ireq->opt);
852 		err = net_xmit_eval(err);
853 	}
854 
855 	return err;
856 }
857 
858 /*
859  *	IPv4 request_sock destructor.
860  */
861 static void tcp_v4_reqsk_destructor(struct request_sock *req)
862 {
863 	kfree(inet_rsk(req)->opt);
864 }
865 
866 #ifdef CONFIG_TCP_MD5SIG
867 /*
868  * RFC2385 MD5 checksumming requires a mapping of
869  * IP address->MD5 Key.
870  * We need to maintain these in the sk structure.
871  */
872 
873 /* Find the Key structure for an address.  */
874 struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
875 					 const union tcp_md5_addr *addr,
876 					 int family)
877 {
878 	const struct tcp_sock *tp = tcp_sk(sk);
879 	struct tcp_md5sig_key *key;
880 	unsigned int size = sizeof(struct in_addr);
881 	const struct tcp_md5sig_info *md5sig;
882 
883 	/* caller either holds rcu_read_lock() or socket lock */
884 	md5sig = rcu_dereference_check(tp->md5sig_info,
885 				       sock_owned_by_user(sk) ||
886 				       lockdep_is_held((spinlock_t *)&sk->sk_lock.slock));
887 	if (!md5sig)
888 		return NULL;
889 #if IS_ENABLED(CONFIG_IPV6)
890 	if (family == AF_INET6)
891 		size = sizeof(struct in6_addr);
892 #endif
893 	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
894 		if (key->family != family)
895 			continue;
896 		if (!memcmp(&key->addr, addr, size))
897 			return key;
898 	}
899 	return NULL;
900 }
901 EXPORT_SYMBOL(tcp_md5_do_lookup);
902 
903 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
904 					 const struct sock *addr_sk)
905 {
906 	const union tcp_md5_addr *addr;
907 
908 	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
909 	return tcp_md5_do_lookup(sk, addr, AF_INET);
910 }
911 EXPORT_SYMBOL(tcp_v4_md5_lookup);
912 
913 /* This can be called on a newly created socket, from other files */
914 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
915 		   int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
916 {
917 	/* Add Key to the list */
918 	struct tcp_md5sig_key *key;
919 	struct tcp_sock *tp = tcp_sk(sk);
920 	struct tcp_md5sig_info *md5sig;
921 
922 	key = tcp_md5_do_lookup(sk, addr, family);
923 	if (key) {
924 		/* Pre-existing entry - just update that one. */
925 		memcpy(key->key, newkey, newkeylen);
926 		key->keylen = newkeylen;
927 		return 0;
928 	}
929 
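	/* No existing key matched, so this is a genuinely new entry.  The
	 * md5sig_info container is allocated on first use and published with
	 * rcu_assign_pointer() so that lockless readers always see a fully
	 * initialised list head.
	 */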
930 	md5sig = rcu_dereference_protected(tp->md5sig_info,
931 					   sock_owned_by_user(sk) ||
932 					   lockdep_is_held(&sk->sk_lock.slock));
933 	if (!md5sig) {
934 		md5sig = kmalloc(sizeof(*md5sig), gfp);
935 		if (!md5sig)
936 			return -ENOMEM;
937 
938 		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
939 		INIT_HLIST_HEAD(&md5sig->head);
940 		rcu_assign_pointer(tp->md5sig_info, md5sig);
941 	}
942 
943 	key = sock_kmalloc(sk, sizeof(*key), gfp);
944 	if (!key)
945 		return -ENOMEM;
946 	if (!tcp_alloc_md5sig_pool()) {
947 		sock_kfree_s(sk, key, sizeof(*key));
948 		return -ENOMEM;
949 	}
950 
951 	memcpy(key->key, newkey, newkeylen);
952 	key->keylen = newkeylen;
953 	key->family = family;
954 	memcpy(&key->addr, addr,
955 	       (family == AF_INET6) ? sizeof(struct in6_addr) :
956 				      sizeof(struct in_addr));
957 	hlist_add_head_rcu(&key->node, &md5sig->head);
958 	return 0;
959 }
960 EXPORT_SYMBOL(tcp_md5_do_add);
961 
962 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
963 {
964 	struct tcp_md5sig_key *key;
965 
966 	key = tcp_md5_do_lookup(sk, addr, family);
967 	if (!key)
968 		return -ENOENT;
969 	hlist_del_rcu(&key->node);
970 	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
971 	kfree_rcu(key, rcu);
972 	return 0;
973 }
974 EXPORT_SYMBOL(tcp_md5_do_del);
975 
976 static void tcp_clear_md5_list(struct sock *sk)
977 {
978 	struct tcp_sock *tp = tcp_sk(sk);
979 	struct tcp_md5sig_key *key;
980 	struct hlist_node *n;
981 	struct tcp_md5sig_info *md5sig;
982 
983 	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
984 
985 	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
986 		hlist_del_rcu(&key->node);
987 		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
988 		kfree_rcu(key, rcu);
989 	}
990 }
991 
992 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
993 				 int optlen)
994 {
995 	struct tcp_md5sig cmd;
996 	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
997 
998 	if (optlen < sizeof(cmd))
999 		return -EINVAL;
1000 
1001 	if (copy_from_user(&cmd, optval, sizeof(cmd)))
1002 		return -EFAULT;
1003 
1004 	if (sin->sin_family != AF_INET)
1005 		return -EINVAL;
1006 
1007 	if (!cmd.tcpm_keylen)
1008 		return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1009 				      AF_INET);
1010 
1011 	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1012 		return -EINVAL;
1013 
1014 	return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1015 			      AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
1016 			      GFP_KERNEL);
1017 }
1018 
1019 static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
1020 					__be32 daddr, __be32 saddr, int nbytes)
1021 {
1022 	struct tcp4_pseudohdr *bp;
1023 	struct scatterlist sg;
1024 
1025 	bp = &hp->md5_blk.ip4;
1026 
1027 	/*
1028 	 * 1. the TCP pseudo-header (in the order: source IP address,
1029 	 * destination IP address, zero-padded protocol number, and
1030 	 * segment length)
1031 	 */
1032 	bp->saddr = saddr;
1033 	bp->daddr = daddr;
1034 	bp->pad = 0;
1035 	bp->protocol = IPPROTO_TCP;
1036 	bp->len = cpu_to_be16(nbytes);
1037 
1038 	sg_init_one(&sg, bp, sizeof(*bp));
1039 	ahash_request_set_crypt(hp->md5_req, &sg, NULL, sizeof(*bp));
1040 	return crypto_ahash_update(hp->md5_req);
1041 }
1042 
1043 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1044 			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
1045 {
1046 	struct tcp_md5sig_pool *hp;
1047 	struct ahash_request *req;
1048 
1049 	hp = tcp_get_md5sig_pool();
1050 	if (!hp)
1051 		goto clear_hash_noput;
1052 	req = hp->md5_req;
1053 
1054 	if (crypto_ahash_init(req))
1055 		goto clear_hash;
1056 	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
1057 		goto clear_hash;
1058 	if (tcp_md5_hash_header(hp, th))
1059 		goto clear_hash;
1060 	if (tcp_md5_hash_key(hp, key))
1061 		goto clear_hash;
1062 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1063 	if (crypto_ahash_final(req))
1064 		goto clear_hash;
1065 
1066 	tcp_put_md5sig_pool();
1067 	return 0;
1068 
1069 clear_hash:
1070 	tcp_put_md5sig_pool();
1071 clear_hash_noput:
1072 	memset(md5_hash, 0, 16);
1073 	return 1;
1074 }
1075 
1076 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1077 			const struct sock *sk,
1078 			const struct sk_buff *skb)
1079 {
1080 	struct tcp_md5sig_pool *hp;
1081 	struct ahash_request *req;
1082 	const struct tcphdr *th = tcp_hdr(skb);
1083 	__be32 saddr, daddr;
1084 
1085 	if (sk) { /* valid for establish/request sockets */
1086 		saddr = sk->sk_rcv_saddr;
1087 		daddr = sk->sk_daddr;
1088 	} else {
1089 		const struct iphdr *iph = ip_hdr(skb);
1090 		saddr = iph->saddr;
1091 		daddr = iph->daddr;
1092 	}
1093 
1094 	hp = tcp_get_md5sig_pool();
1095 	if (!hp)
1096 		goto clear_hash_noput;
1097 	req = hp->md5_req;
1098 
1099 	if (crypto_ahash_init(req))
1100 		goto clear_hash;
1101 
1102 	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
1103 		goto clear_hash;
1104 	if (tcp_md5_hash_header(hp, th))
1105 		goto clear_hash;
1106 	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1107 		goto clear_hash;
1108 	if (tcp_md5_hash_key(hp, key))
1109 		goto clear_hash;
1110 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1111 	if (crypto_ahash_final(req))
1112 		goto clear_hash;
1113 
1114 	tcp_put_md5sig_pool();
1115 	return 0;
1116 
1117 clear_hash:
1118 	tcp_put_md5sig_pool();
1119 clear_hash_noput:
1120 	memset(md5_hash, 0, 16);
1121 	return 1;
1122 }
1123 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1124 
1125 #endif
1126 
1127 /* Called with rcu_read_lock() */
1128 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1129 				    const struct sk_buff *skb)
1130 {
1131 #ifdef CONFIG_TCP_MD5SIG
1132 	/*
1133 	 * This gets called for each TCP segment that arrives
1134 	 * so we want to be efficient.
1135 	 * We have 3 drop cases:
1136 	 * o No MD5 hash and one expected.
1137 	 * o MD5 hash and we're not expecting one.
1138 	 * o MD5 hash and it's wrong.
1139 	 */
1140 	const __u8 *hash_location = NULL;
1141 	struct tcp_md5sig_key *hash_expected;
1142 	const struct iphdr *iph = ip_hdr(skb);
1143 	const struct tcphdr *th = tcp_hdr(skb);
1144 	int genhash;
1145 	unsigned char newhash[16];
1146 
1147 	hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1148 					  AF_INET);
1149 	hash_location = tcp_parse_md5sig_option(th);
1150 
1151 	/* We've parsed the options - do we have a hash? */
1152 	if (!hash_expected && !hash_location)
1153 		return false;
1154 
1155 	if (hash_expected && !hash_location) {
1156 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1157 		return true;
1158 	}
1159 
1160 	if (!hash_expected && hash_location) {
1161 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1162 		return true;
1163 	}
1164 
1165 	/* Okay, so we have both hash_expected and hash_location -
1166 	 * so we need to calculate the checksum.
1167 	 */
1168 	genhash = tcp_v4_md5_hash_skb(newhash,
1169 				      hash_expected,
1170 				      NULL, skb);
1171 
1172 	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1173 		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1174 				     &iph->saddr, ntohs(th->source),
1175 				     &iph->daddr, ntohs(th->dest),
1176 				     genhash ? " tcp_v4_calc_md5_hash failed"
1177 				     : "");
1178 		return true;
1179 	}
1180 	return false;
1181 #endif
1182 	return false;
1183 }
1184 
1185 static void tcp_v4_init_req(struct request_sock *req,
1186 			    const struct sock *sk_listener,
1187 			    struct sk_buff *skb)
1188 {
1189 	struct inet_request_sock *ireq = inet_rsk(req);
1190 
1191 	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1192 	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1193 	ireq->no_srccheck = inet_sk(sk_listener)->transparent;
1194 	ireq->opt = tcp_v4_save_options(skb);
1195 }
1196 
1197 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1198 					  struct flowi *fl,
1199 					  const struct request_sock *req,
1200 					  bool *strict)
1201 {
1202 	struct dst_entry *dst = inet_csk_route_req(sk, &fl->u.ip4, req);
1203 
1204 	if (strict) {
1205 		if (fl->u.ip4.daddr == inet_rsk(req)->ir_rmt_addr)
1206 			*strict = true;
1207 		else
1208 			*strict = false;
1209 	}
1210 
1211 	return dst;
1212 }
1213 
1214 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1215 	.family		=	PF_INET,
1216 	.obj_size	=	sizeof(struct tcp_request_sock),
1217 	.rtx_syn_ack	=	tcp_rtx_synack,
1218 	.send_ack	=	tcp_v4_reqsk_send_ack,
1219 	.destructor	=	tcp_v4_reqsk_destructor,
1220 	.send_reset	=	tcp_v4_send_reset,
1221 	.syn_ack_timeout =	tcp_syn_ack_timeout,
1222 };
1223 
1224 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1225 	.mss_clamp	=	TCP_MSS_DEFAULT,
1226 #ifdef CONFIG_TCP_MD5SIG
1227 	.req_md5_lookup	=	tcp_v4_md5_lookup,
1228 	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1229 #endif
1230 	.init_req	=	tcp_v4_init_req,
1231 #ifdef CONFIG_SYN_COOKIES
1232 	.cookie_init_seq =	cookie_v4_init_sequence,
1233 #endif
1234 	.route_req	=	tcp_v4_route_req,
1235 	.init_seq	=	tcp_v4_init_sequence,
1236 	.send_synack	=	tcp_v4_send_synack,
1237 };
1238 
1239 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1240 {
1241 	/* Never answer SYNs sent to broadcast or multicast */
1242 	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1243 		goto drop;
1244 
1245 	return tcp_conn_request(&tcp_request_sock_ops,
1246 				&tcp_request_sock_ipv4_ops, sk, skb);
1247 
1248 drop:
1249 	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1250 	return 0;
1251 }
1252 EXPORT_SYMBOL(tcp_v4_conn_request);
1253 
1254 
1255 /*
1256  * The three way handshake has completed - we got a valid synack -
1257  * now create the new socket.
1258  */
1259 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1260 				  struct request_sock *req,
1261 				  struct dst_entry *dst,
1262 				  struct request_sock *req_unhash,
1263 				  bool *own_req)
1264 {
1265 	struct inet_request_sock *ireq;
1266 	struct inet_sock *newinet;
1267 	struct tcp_sock *newtp;
1268 	struct sock *newsk;
1269 #ifdef CONFIG_TCP_MD5SIG
1270 	struct tcp_md5sig_key *key;
1271 #endif
1272 	struct ip_options_rcu *inet_opt;
1273 
1274 	if (sk_acceptq_is_full(sk))
1275 		goto exit_overflow;
1276 
1277 	newsk = tcp_create_openreq_child(sk, req, skb);
1278 	if (!newsk)
1279 		goto exit_nonewsk;
1280 
1281 	newsk->sk_gso_type = SKB_GSO_TCPV4;
1282 	inet_sk_rx_dst_set(newsk, skb);
1283 
1284 	newtp		      = tcp_sk(newsk);
1285 	newinet		      = inet_sk(newsk);
1286 	ireq		      = inet_rsk(req);
1287 	sk_daddr_set(newsk, ireq->ir_rmt_addr);
1288 	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1289 	newsk->sk_bound_dev_if = ireq->ir_iif;
1290 	newinet->inet_saddr	      = ireq->ir_loc_addr;
1291 	inet_opt	      = ireq->opt;
1292 	rcu_assign_pointer(newinet->inet_opt, inet_opt);
1293 	ireq->opt	      = NULL;
1294 	newinet->mc_index     = inet_iif(skb);
1295 	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1296 	newinet->rcv_tos      = ip_hdr(skb)->tos;
1297 	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1298 	if (inet_opt)
1299 		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1300 	newinet->inet_id = newtp->write_seq ^ jiffies;
1301 
1302 	if (!dst) {
1303 		dst = inet_csk_route_child_sock(sk, newsk, req);
1304 		if (!dst)
1305 			goto put_and_exit;
1306 	} else {
1307 		/* syncookie case : see end of cookie_v4_check() */
1308 	}
1309 	sk_setup_caps(newsk, dst);
1310 
1311 	tcp_ca_openreq_child(newsk, dst);
1312 
1313 	tcp_sync_mss(newsk, dst_mtu(dst));
1314 	newtp->advmss = dst_metric_advmss(dst);
1315 	if (tcp_sk(sk)->rx_opt.user_mss &&
1316 	    tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1317 		newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1318 
1319 	tcp_initialize_rcv_mss(newsk);
1320 
1321 #ifdef CONFIG_TCP_MD5SIG
1322 	/* Copy over the MD5 key from the original socket */
1323 	key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1324 				AF_INET);
1325 	if (key) {
1326 		/*
1327 		 * We're using one, so create a matching key
1328 		 * on the newsk structure. If we fail to get
1329 		 * memory, then we end up not copying the key
1330 		 * across. Shucks.
1331 		 */
1332 		tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1333 			       AF_INET, key->key, key->keylen, GFP_ATOMIC);
1334 		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1335 	}
1336 #endif
1337 
1338 	if (__inet_inherit_port(sk, newsk) < 0)
1339 		goto put_and_exit;
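	/* Hash the child into the established table.  own_req tells the
	 * caller whether this request actually produced the new socket (it
	 * can lose a race with another CPU); only the winner inherits the
	 * saved SYN via tcp_move_syn().
	 */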
1340 	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
1341 	if (*own_req)
1342 		tcp_move_syn(newtp, req);
1343 
1344 	return newsk;
1345 
1346 exit_overflow:
1347 	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1348 exit_nonewsk:
1349 	dst_release(dst);
1350 exit:
1351 	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1352 	return NULL;
1353 put_and_exit:
1354 	inet_csk_prepare_forced_close(newsk);
1355 	tcp_done(newsk);
1356 	goto exit;
1357 }
1358 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1359 
1360 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1361 {
1362 #ifdef CONFIG_SYN_COOKIES
1363 	const struct tcphdr *th = tcp_hdr(skb);
1364 
1365 	if (!th->syn)
1366 		sk = cookie_v4_check(sk, skb);
1367 #endif
1368 	return sk;
1369 }
1370 
1371 /* The socket must have its spinlock held when we get
1372  * here, unless it is a TCP_LISTEN socket.
1373  *
1374  * We have a potential double-lock case here, so even when
1375  * doing backlog processing we use the BH locking scheme.
1376  * This is because we cannot sleep with the original spinlock
1377  * held.
1378  */
1379 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1380 {
1381 	struct sock *rsk;
1382 
1383 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1384 		struct dst_entry *dst = sk->sk_rx_dst;
1385 
1386 		sock_rps_save_rxhash(sk, skb);
1387 		sk_mark_napi_id(sk, skb);
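		/* Validate the cached input route: if the segment arrived on
		 * a different interface or the dst is stale, drop the cached
		 * sk_rx_dst and let it be rebuilt later.
		 */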
1388 		if (dst) {
1389 			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1390 			    !dst->ops->check(dst, 0)) {
1391 				dst_release(dst);
1392 				sk->sk_rx_dst = NULL;
1393 			}
1394 		}
1395 		tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len);
1396 		return 0;
1397 	}
1398 
1399 	if (tcp_checksum_complete(skb))
1400 		goto csum_err;
1401 
1402 	if (sk->sk_state == TCP_LISTEN) {
1403 		struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1404 
1405 		if (!nsk)
1406 			goto discard;
1407 		if (nsk != sk) {
1408 			sock_rps_save_rxhash(nsk, skb);
1409 			sk_mark_napi_id(nsk, skb);
1410 			if (tcp_child_process(sk, nsk, skb)) {
1411 				rsk = nsk;
1412 				goto reset;
1413 			}
1414 			return 0;
1415 		}
1416 	} else
1417 		sock_rps_save_rxhash(sk, skb);
1418 
1419 	if (tcp_rcv_state_process(sk, skb)) {
1420 		rsk = sk;
1421 		goto reset;
1422 	}
1423 	return 0;
1424 
1425 reset:
1426 	tcp_v4_send_reset(rsk, skb);
1427 discard:
1428 	kfree_skb(skb);
1429 	/* Be careful here. If this function gets more complicated and
1430 	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1431 	 * might be destroyed here. This current version compiles correctly,
1432 	 * but you have been warned.
1433 	 */
1434 	return 0;
1435 
1436 csum_err:
1437 	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS);
1438 	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1439 	goto discard;
1440 }
1441 EXPORT_SYMBOL(tcp_v4_do_rcv);
1442 
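/* Early demux: look up the established socket for an incoming segment
 * directly from the IP receive path, so the socket's cached input dst can
 * be attached to the skb and a full route lookup avoided.
 */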
1443 void tcp_v4_early_demux(struct sk_buff *skb)
1444 {
1445 	const struct iphdr *iph;
1446 	const struct tcphdr *th;
1447 	struct sock *sk;
1448 
1449 	if (skb->pkt_type != PACKET_HOST)
1450 		return;
1451 
1452 	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1453 		return;
1454 
1455 	iph = ip_hdr(skb);
1456 	th = tcp_hdr(skb);
1457 
1458 	if (th->doff < sizeof(struct tcphdr) / 4)
1459 		return;
1460 
1461 	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1462 				       iph->saddr, th->source,
1463 				       iph->daddr, ntohs(th->dest),
1464 				       skb->skb_iif);
1465 	if (sk) {
1466 		skb->sk = sk;
1467 		skb->destructor = sock_edemux;
1468 		if (sk_fullsock(sk)) {
1469 			struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1470 
1471 			if (dst)
1472 				dst = dst_check(dst, 0);
1473 			if (dst &&
1474 			    inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1475 				skb_dst_set_noref(skb, dst);
1476 		}
1477 	}
1478 }
1479 
1480 /* Packet is added to VJ-style prequeue for processing in process
1481  * context, if a reader task is waiting. Apparently, this exciting
1482  * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93)
1483  * failed somewhere. Latency? Burstiness? Well, at least now we will
1484  * see, why it failed. 8)8)				  --ANK
1485  *
1486  */
1487 bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
1488 {
1489 	struct tcp_sock *tp = tcp_sk(sk);
1490 
1491 	if (sysctl_tcp_low_latency || !tp->ucopy.task)
1492 		return false;
1493 
1494 	if (skb->len <= tcp_hdrlen(skb) &&
1495 	    skb_queue_len(&tp->ucopy.prequeue) == 0)
1496 		return false;
1497 
1498 	/* Before escaping RCU protected region, we need to take care of skb
1499 	 * dst. Prequeue is only enabled for established sockets.
1500 	 * For such sockets, we might need the skb dst only to set sk->sk_rx_dst
1501 	 * Instead of doing full sk_rx_dst validity here, let's perform
1502 	 * an optimistic check.
1503 	 */
1504 	if (likely(sk->sk_rx_dst))
1505 		skb_dst_drop(skb);
1506 	else
1507 		skb_dst_force_safe(skb);
1508 
1509 	__skb_queue_tail(&tp->ucopy.prequeue, skb);
1510 	tp->ucopy.memory += skb->truesize;
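	/* If the prequeue has outgrown the receive buffer, drain it by
	 * processing the queued segments right here in softirq context;
	 * otherwise, on the first queued segment, wake the reader and arm a
	 * delayed-ACK style timer as a safety net.
	 */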
1511 	if (tp->ucopy.memory > sk->sk_rcvbuf) {
1512 		struct sk_buff *skb1;
1513 
1514 		BUG_ON(sock_owned_by_user(sk));
1515 
1516 		while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) {
1517 			sk_backlog_rcv(sk, skb1);
1518 			NET_INC_STATS_BH(sock_net(sk),
1519 					 LINUX_MIB_TCPPREQUEUEDROPPED);
1520 		}
1521 
1522 		tp->ucopy.memory = 0;
1523 	} else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
1524 		wake_up_interruptible_sync_poll(sk_sleep(sk),
1525 					   POLLIN | POLLRDNORM | POLLRDBAND);
1526 		if (!inet_csk_ack_scheduled(sk))
1527 			inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
1528 						  (3 * tcp_rto_min(sk)) / 4,
1529 						  TCP_RTO_MAX);
1530 	}
1531 	return true;
1532 }
1533 EXPORT_SYMBOL(tcp_prequeue);
1534 
1535 /*
1536  *	From tcp_input.c
1537  */
1538 
1539 int tcp_v4_rcv(struct sk_buff *skb)
1540 {
1541 	const struct iphdr *iph;
1542 	const struct tcphdr *th;
1543 	struct sock *sk;
1544 	int ret;
1545 	struct net *net = dev_net(skb->dev);
1546 
1547 	if (skb->pkt_type != PACKET_HOST)
1548 		goto discard_it;
1549 
1550 	/* Count it even if it's bad */
1551 	TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1552 
1553 	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1554 		goto discard_it;
1555 
1556 	th = tcp_hdr(skb);
1557 
1558 	if (th->doff < sizeof(struct tcphdr) / 4)
1559 		goto bad_packet;
1560 	if (!pskb_may_pull(skb, th->doff * 4))
1561 		goto discard_it;
1562 
1563 	/* An explanation is required here, I think.
1564 	 * Packet length and doff are validated by header prediction,
1565 	 * provided the case of th->doff==0 is eliminated.
1566 	 * So, we defer the checks. */
1567 
1568 	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1569 		goto csum_error;
1570 
1571 	th = tcp_hdr(skb);
1572 	iph = ip_hdr(skb);
1573 	/* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
1574 	 * barrier() makes sure the compiler won't play fool^Waliasing games.
1575 	 */
1576 	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1577 		sizeof(struct inet_skb_parm));
1578 	barrier();
1579 
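	/* Stash the fields the rest of receive processing needs in the TCP
	 * control block: the sequence numbers (end_seq counts the SYN and
	 * FIN flags as one each), the ACK number, the raw flag byte and the
	 * IP DS field.
	 */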
1580 	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1581 	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1582 				    skb->len - th->doff * 4);
1583 	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1584 	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1585 	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1586 	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1587 	TCP_SKB_CB(skb)->sacked	 = 0;
1588 
1589 lookup:
1590 	sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1591 			       th->dest);
1592 	if (!sk)
1593 		goto no_tcp_socket;
1594 
1595 process:
1596 	if (sk->sk_state == TCP_TIME_WAIT)
1597 		goto do_time_wait;
1598 
1599 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
1600 		struct request_sock *req = inet_reqsk(sk);
1601 		struct sock *nsk;
1602 
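		/* The lookup returned a request socket for a pending
		 * connection.  Process it against its listener: check the
		 * MD5 signature, make sure the listener is still listening,
		 * and let tcp_check_req() either create the child socket or
		 * drop the request.
		 */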
1603 		sk = req->rsk_listener;
1604 		if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
1605 			reqsk_put(req);
1606 			goto discard_it;
1607 		}
1608 		if (unlikely(sk->sk_state != TCP_LISTEN)) {
1609 			inet_csk_reqsk_queue_drop_and_put(sk, req);
1610 			goto lookup;
1611 		}
1612 		sock_hold(sk);
1613 		nsk = tcp_check_req(sk, skb, req, false);
1614 		if (!nsk) {
1615 			reqsk_put(req);
1616 			goto discard_and_relse;
1617 		}
1618 		if (nsk == sk) {
1619 			reqsk_put(req);
1620 		} else if (tcp_child_process(sk, nsk, skb)) {
1621 			tcp_v4_send_reset(nsk, skb);
1622 			goto discard_and_relse;
1623 		} else {
1624 			sock_put(sk);
1625 			return 0;
1626 		}
1627 	}
1628 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1629 		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
1630 		goto discard_and_relse;
1631 	}
1632 
1633 	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1634 		goto discard_and_relse;
1635 
1636 	if (tcp_v4_inbound_md5_hash(sk, skb))
1637 		goto discard_and_relse;
1638 
1639 	nf_reset(skb);
1640 
1641 	if (sk_filter(sk, skb))
1642 		goto discard_and_relse;
1643 
1644 	skb->dev = NULL;
1645 
1646 	if (sk->sk_state == TCP_LISTEN) {
1647 		ret = tcp_v4_do_rcv(sk, skb);
1648 		goto put_and_return;
1649 	}
1650 
1651 	sk_incoming_cpu_update(sk);
1652 
1653 	bh_lock_sock_nested(sk);
1654 	tcp_segs_in(tcp_sk(sk), skb);
1655 	ret = 0;
1656 	if (!sock_owned_by_user(sk)) {
1657 		if (!tcp_prequeue(sk, skb))
1658 			ret = tcp_v4_do_rcv(sk, skb);
1659 	} else if (unlikely(sk_add_backlog(sk, skb,
1660 					   sk->sk_rcvbuf + sk->sk_sndbuf))) {
1661 		bh_unlock_sock(sk);
1662 		NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
1663 		goto discard_and_relse;
1664 	}
1665 	bh_unlock_sock(sk);
1666 
1667 put_and_return:
1668 	sock_put(sk);
1669 
1670 	return ret;
1671 
1672 no_tcp_socket:
1673 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1674 		goto discard_it;
1675 
1676 	if (tcp_checksum_complete(skb)) {
1677 csum_error:
1678 		TCP_INC_STATS_BH(net, TCP_MIB_CSUMERRORS);
1679 bad_packet:
1680 		TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1681 	} else {
1682 		tcp_v4_send_reset(NULL, skb);
1683 	}
1684 
1685 discard_it:
1686 	/* Discard frame. */
1687 	kfree_skb(skb);
1688 	return 0;
1689 
1690 discard_and_relse:
1691 	sock_put(sk);
1692 	goto discard_it;
1693 
1694 do_time_wait:
1695 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1696 		inet_twsk_put(inet_twsk(sk));
1697 		goto discard_it;
1698 	}
1699 
1700 	if (tcp_checksum_complete(skb)) {
1701 		inet_twsk_put(inet_twsk(sk));
1702 		goto csum_error;
1703 	}
1704 	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1705 	case TCP_TW_SYN: {
1706 		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1707 							&tcp_hashinfo, skb,
1708 							__tcp_hdrlen(th),
1709 							iph->saddr, th->source,
1710 							iph->daddr, th->dest,
1711 							inet_iif(skb));
1712 		if (sk2) {
1713 			inet_twsk_deschedule_put(inet_twsk(sk));
1714 			sk = sk2;
1715 			goto process;
1716 		}
1717 		/* Fall through to ACK */
1718 	}
1719 	case TCP_TW_ACK:
1720 		tcp_v4_timewait_ack(sk, skb);
1721 		break;
1722 	case TCP_TW_RST:
1723 		tcp_v4_send_reset(sk, skb);
1724 		inet_twsk_deschedule_put(inet_twsk(sk));
1725 		goto discard_it;
1726 	case TCP_TW_SUCCESS:;
1727 	}
1728 	goto discard_it;
1729 }
1730 
1731 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1732 	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
1733 	.twsk_unique	= tcp_twsk_unique,
1734 	.twsk_destructor= tcp_twsk_destructor,
1735 };
1736 
1737 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
1738 {
1739 	struct dst_entry *dst = skb_dst(skb);
1740 
1741 	if (dst && dst_hold_safe(dst)) {
1742 		sk->sk_rx_dst = dst;
1743 		inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
1744 	}
1745 }
1746 EXPORT_SYMBOL(inet_sk_rx_dst_set);
1747 
1748 const struct inet_connection_sock_af_ops ipv4_specific = {
1749 	.queue_xmit	   = ip_queue_xmit,
1750 	.send_check	   = tcp_v4_send_check,
1751 	.rebuild_header	   = inet_sk_rebuild_header,
1752 	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
1753 	.conn_request	   = tcp_v4_conn_request,
1754 	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
1755 	.net_header_len	   = sizeof(struct iphdr),
1756 	.setsockopt	   = ip_setsockopt,
1757 	.getsockopt	   = ip_getsockopt,
1758 	.addr2sockaddr	   = inet_csk_addr2sockaddr,
1759 	.sockaddr_len	   = sizeof(struct sockaddr_in),
1760 	.bind_conflict	   = inet_csk_bind_conflict,
1761 #ifdef CONFIG_COMPAT
1762 	.compat_setsockopt = compat_ip_setsockopt,
1763 	.compat_getsockopt = compat_ip_getsockopt,
1764 #endif
1765 	.mtu_reduced	   = tcp_v4_mtu_reduced,
1766 };
1767 EXPORT_SYMBOL(ipv4_specific);
1768 
1769 #ifdef CONFIG_TCP_MD5SIG
1770 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1771 	.md5_lookup		= tcp_v4_md5_lookup,
1772 	.calc_md5_hash		= tcp_v4_md5_hash_skb,
1773 	.md5_parse		= tcp_v4_parse_md5_keys,
1774 };
1775 #endif
1776 
1777 /* NOTE: A lot of things set to zero explicitly by call to
1778  *       sk_alloc() so need not be done here.
1779  */
1780 static int tcp_v4_init_sock(struct sock *sk)
1781 {
1782 	struct inet_connection_sock *icsk = inet_csk(sk);
1783 
1784 	tcp_init_sock(sk);
1785 
1786 	icsk->icsk_af_ops = &ipv4_specific;
1787 
1788 #ifdef CONFIG_TCP_MD5SIG
1789 	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
1790 #endif
1791 
1792 	return 0;
1793 }
1794 
1795 void tcp_v4_destroy_sock(struct sock *sk)
1796 {
1797 	struct tcp_sock *tp = tcp_sk(sk);
1798 
1799 	tcp_clear_xmit_timers(sk);
1800 
1801 	tcp_cleanup_congestion_control(sk);
1802 
1803 	/* Clean up the write buffer. */
1804 	tcp_write_queue_purge(sk);
1805 
1806 	/* Cleans up our, hopefully empty, out_of_order_queue. */
1807 	__skb_queue_purge(&tp->out_of_order_queue);
1808 
1809 #ifdef CONFIG_TCP_MD5SIG
1810 	/* Clean up the MD5 key list, if any */
1811 	if (tp->md5sig_info) {
1812 		tcp_clear_md5_list(sk);
1813 		kfree_rcu(tp->md5sig_info, rcu);
1814 		tp->md5sig_info = NULL;
1815 	}
1816 #endif
1817 
1818 	/* Clean prequeue, it must be empty really */
1819 	__skb_queue_purge(&tp->ucopy.prequeue);
1820 
1821 	/* Clean up a referenced TCP bind bucket. */
1822 	if (inet_csk(sk)->icsk_bind_hash)
1823 		inet_put_port(sk);
1824 
1825 	BUG_ON(tp->fastopen_rsk);
1826 
1827 	/* If socket is aborted during connect operation */
1828 	tcp_free_fastopen_req(tp);
1829 	tcp_saved_syn_free(tp);
1830 
1831 	sk_sockets_allocated_dec(sk);
1832 
1833 	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
1834 		sock_release_memcg(sk);
1835 }
1836 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1837 
1838 #ifdef CONFIG_PROC_FS
1839 /* Proc filesystem TCP sock list dumping. */
1840 
1841 /*
1842  * Get next listener socket following cur.  If cur is NULL, get first socket
1843  * starting from bucket given in st->bucket; when st->bucket is zero the
1844  * very first socket in the hash table is returned.
1845  */
1846 static void *listening_get_next(struct seq_file *seq, void *cur)
1847 {
1848 	struct inet_connection_sock *icsk;
1849 	struct hlist_nulls_node *node;
1850 	struct sock *sk = cur;
1851 	struct inet_listen_hashbucket *ilb;
1852 	struct tcp_iter_state *st = seq->private;
1853 	struct net *net = seq_file_net(seq);
1854 
1855 	if (!sk) {
1856 		ilb = &tcp_hashinfo.listening_hash[st->bucket];
1857 		spin_lock_bh(&ilb->lock);
1858 		sk = sk_nulls_head(&ilb->head);
1859 		st->offset = 0;
1860 		goto get_sk;
1861 	}
1862 	ilb = &tcp_hashinfo.listening_hash[st->bucket];
1863 	++st->num;
1864 	++st->offset;
1865 
1866 	sk = sk_nulls_next(sk);
1867 get_sk:
1868 	sk_nulls_for_each_from(sk, node) {
1869 		if (!net_eq(sock_net(sk), net))
1870 			continue;
1871 		if (sk->sk_family == st->family) {
1872 			cur = sk;
1873 			goto out;
1874 		}
1875 		icsk = inet_csk(sk);
1876 	}
1877 	spin_unlock_bh(&ilb->lock);
1878 	st->offset = 0;
1879 	if (++st->bucket < INET_LHTABLE_SIZE) {
1880 		ilb = &tcp_hashinfo.listening_hash[st->bucket];
1881 		spin_lock_bh(&ilb->lock);
1882 		sk = sk_nulls_head(&ilb->head);
1883 		goto get_sk;
1884 	}
1885 	cur = NULL;
1886 out:
1887 	return cur;
1888 }
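
/*
 * When listening_get_next() returns a socket, the listening-hash bucket
 * lock it took is still held; it is dropped only when the walk advances
 * to the next bucket or in tcp_seq_stop().  The established-hash walkers
 * below follow the same convention with the per-bucket ehash locks.
 */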
1889 
1890 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1891 {
1892 	struct tcp_iter_state *st = seq->private;
1893 	void *rc;
1894 
1895 	st->bucket = 0;
1896 	st->offset = 0;
1897 	rc = listening_get_next(seq, NULL);
1898 
1899 	while (rc && *pos) {
1900 		rc = listening_get_next(seq, rc);
1901 		--*pos;
1902 	}
1903 	return rc;
1904 }
1905 
1906 static inline bool empty_bucket(const struct tcp_iter_state *st)
1907 {
1908 	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
1909 }
1910 
1911 /*
1912  * Get first established socket starting from bucket given in st->bucket.
1913  * If st->bucket is zero, the very first socket in the hash is returned.
1914  */
1915 static void *established_get_first(struct seq_file *seq)
1916 {
1917 	struct tcp_iter_state *st = seq->private;
1918 	struct net *net = seq_file_net(seq);
1919 	void *rc = NULL;
1920 
1921 	st->offset = 0;
1922 	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
1923 		struct sock *sk;
1924 		struct hlist_nulls_node *node;
1925 		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
1926 
1927 		/* Lockless fast path for the common case of empty buckets */
1928 		if (empty_bucket(st))
1929 			continue;
1930 
1931 		spin_lock_bh(lock);
1932 		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
1933 			if (sk->sk_family != st->family ||
1934 			    !net_eq(sock_net(sk), net)) {
1935 				continue;
1936 			}
1937 			rc = sk;
1938 			goto out;
1939 		}
1940 		spin_unlock_bh(lock);
1941 	}
1942 out:
1943 	return rc;
1944 }
1945 
1946 static void *established_get_next(struct seq_file *seq, void *cur)
1947 {
1948 	struct sock *sk = cur;
1949 	struct hlist_nulls_node *node;
1950 	struct tcp_iter_state *st = seq->private;
1951 	struct net *net = seq_file_net(seq);
1952 
1953 	++st->num;
1954 	++st->offset;
1955 
1956 	sk = sk_nulls_next(sk);
1957 
1958 	sk_nulls_for_each_from(sk, node) {
1959 		if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
1960 			return sk;
1961 	}
1962 
1963 	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
1964 	++st->bucket;
1965 	return established_get_first(seq);
1966 }
1967 
1968 static void *established_get_idx(struct seq_file *seq, loff_t pos)
1969 {
1970 	struct tcp_iter_state *st = seq->private;
1971 	void *rc;
1972 
1973 	st->bucket = 0;
1974 	rc = established_get_first(seq);
1975 
1976 	while (rc && pos) {
1977 		rc = established_get_next(seq, rc);
1978 		--pos;
1979 	}
1980 	return rc;
1981 }
1982 
1983 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
1984 {
1985 	void *rc;
1986 	struct tcp_iter_state *st = seq->private;
1987 
1988 	st->state = TCP_SEQ_STATE_LISTENING;
1989 	rc	  = listening_get_idx(seq, &pos);
1990 
1991 	if (!rc) {
1992 		st->state = TCP_SEQ_STATE_ESTABLISHED;
1993 		rc	  = established_get_idx(seq, pos);
1994 	}
1995 
1996 	return rc;
1997 }
1998 
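/*
 * /proc reads arrive in buffer-sized chunks, and the seq_file core
 * restarts the iterator for every chunk.  st->last_pos together with
 * st->bucket and st->offset lets tcp_seq_start() resume close to where
 * the previous read stopped instead of rescanning the hash tables from
 * the beginning; tcp_seek_last_pos() implements that resume.
 */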
1999 static void *tcp_seek_last_pos(struct seq_file *seq)
2000 {
2001 	struct tcp_iter_state *st = seq->private;
2002 	int offset = st->offset;
2003 	int orig_num = st->num;
2004 	void *rc = NULL;
2005 
2006 	switch (st->state) {
2007 	case TCP_SEQ_STATE_LISTENING:
2008 		if (st->bucket >= INET_LHTABLE_SIZE)
2009 			break;
2010 		st->state = TCP_SEQ_STATE_LISTENING;
2011 		rc = listening_get_next(seq, NULL);
2012 		while (offset-- && rc)
2013 			rc = listening_get_next(seq, rc);
2014 		if (rc)
2015 			break;
2016 		st->bucket = 0;
2017 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2018 		/* Fallthrough */
2019 	case TCP_SEQ_STATE_ESTABLISHED:
2020 		if (st->bucket > tcp_hashinfo.ehash_mask)
2021 			break;
2022 		rc = established_get_first(seq);
2023 		while (offset-- && rc)
2024 			rc = established_get_next(seq, rc);
2025 	}
2026 
2027 	st->num = orig_num;
2028 
2029 	return rc;
2030 }
2031 
2032 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2033 {
2034 	struct tcp_iter_state *st = seq->private;
2035 	void *rc;
2036 
2037 	if (*pos && *pos == st->last_pos) {
2038 		rc = tcp_seek_last_pos(seq);
2039 		if (rc)
2040 			goto out;
2041 	}
2042 
2043 	st->state = TCP_SEQ_STATE_LISTENING;
2044 	st->num = 0;
2045 	st->bucket = 0;
2046 	st->offset = 0;
2047 	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2048 
2049 out:
2050 	st->last_pos = *pos;
2051 	return rc;
2052 }
2053 
2054 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2055 {
2056 	struct tcp_iter_state *st = seq->private;
2057 	void *rc = NULL;
2058 
2059 	if (v == SEQ_START_TOKEN) {
2060 		rc = tcp_get_idx(seq, 0);
2061 		goto out;
2062 	}
2063 
2064 	switch (st->state) {
2065 	case TCP_SEQ_STATE_LISTENING:
2066 		rc = listening_get_next(seq, v);
2067 		if (!rc) {
2068 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2069 			st->bucket = 0;
2070 			st->offset = 0;
2071 			rc	  = established_get_first(seq);
2072 		}
2073 		break;
2074 	case TCP_SEQ_STATE_ESTABLISHED:
2075 		rc = established_get_next(seq, v);
2076 		break;
2077 	}
2078 out:
2079 	++*pos;
2080 	st->last_pos = *pos;
2081 	return rc;
2082 }
2083 
2084 static void tcp_seq_stop(struct seq_file *seq, void *v)
2085 {
2086 	struct tcp_iter_state *st = seq->private;
2087 
2088 	switch (st->state) {
2089 	case TCP_SEQ_STATE_LISTENING:
2090 		if (v != SEQ_START_TOKEN)
2091 			spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2092 		break;
2093 	case TCP_SEQ_STATE_ESTABLISHED:
2094 		if (v)
2095 			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2096 		break;
2097 	}
2098 }
2099 
2100 int tcp_seq_open(struct inode *inode, struct file *file)
2101 {
2102 	struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
2103 	struct tcp_iter_state *s;
2104 	int err;
2105 
2106 	err = seq_open_net(inode, file, &afinfo->seq_ops,
2107 			  sizeof(struct tcp_iter_state));
2108 	if (err < 0)
2109 		return err;
2110 
2111 	s = ((struct seq_file *)file->private_data)->private;
2112 	s->family		= afinfo->family;
2113 	s->last_pos		= 0;
2114 	return 0;
2115 }
2116 EXPORT_SYMBOL(tcp_seq_open);
2117 
2118 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2119 {
2120 	int rc = 0;
2121 	struct proc_dir_entry *p;
2122 
2123 	afinfo->seq_ops.start		= tcp_seq_start;
2124 	afinfo->seq_ops.next		= tcp_seq_next;
2125 	afinfo->seq_ops.stop		= tcp_seq_stop;
2126 
2127 	p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2128 			     afinfo->seq_fops, afinfo);
2129 	if (!p)
2130 		rc = -ENOMEM;
2131 	return rc;
2132 }
2133 EXPORT_SYMBOL(tcp_proc_register);
2134 
2135 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2136 {
2137 	remove_proc_entry(afinfo->name, net->proc_net);
2138 }
2139 EXPORT_SYMBOL(tcp_proc_unregister);
2140 
2141 static void get_openreq4(const struct request_sock *req,
2142 			 struct seq_file *f, int i)
2143 {
2144 	const struct inet_request_sock *ireq = inet_rsk(req);
2145 	long delta = req->rsk_timer.expires - jiffies;
2146 
2147 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2148 		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2149 		i,
2150 		ireq->ir_loc_addr,
2151 		ireq->ir_num,
2152 		ireq->ir_rmt_addr,
2153 		ntohs(ireq->ir_rmt_port),
2154 		TCP_SYN_RECV,
2155 		0, 0, /* could print option size, but that is af dependent. */
2156 		1,    /* timers active (only the expire timer) */
2157 		jiffies_delta_to_clock_t(delta),
2158 		req->num_timeout,
2159 		from_kuid_munged(seq_user_ns(f),
2160 				 sock_i_uid(req->rsk_listener)),
2161 		0,  /* non-standard timer */
2162 		0, /* open_requests have no inode */
2163 		0,
2164 		req);
2165 }
2166 
2167 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2168 {
2169 	int timer_active;
2170 	unsigned long timer_expires;
2171 	const struct tcp_sock *tp = tcp_sk(sk);
2172 	const struct inet_connection_sock *icsk = inet_csk(sk);
2173 	const struct inet_sock *inet = inet_sk(sk);
2174 	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2175 	__be32 dest = inet->inet_daddr;
2176 	__be32 src = inet->inet_rcv_saddr;
2177 	__u16 destp = ntohs(inet->inet_dport);
2178 	__u16 srcp = ntohs(inet->inet_sport);
2179 	int rx_queue;
2180 	int state;
2181 
2182 	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2183 	    icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
2184 	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2185 		timer_active	= 1;
2186 		timer_expires	= icsk->icsk_timeout;
2187 	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2188 		timer_active	= 4;
2189 		timer_expires	= icsk->icsk_timeout;
2190 	} else if (timer_pending(&sk->sk_timer)) {
2191 		timer_active	= 2;
2192 		timer_expires	= sk->sk_timer.expires;
2193 	} else {
2194 		timer_active	= 0;
2195 		timer_expires	= jiffies;
2196 	}
2197 
2198 	state = sk_state_load(sk);
2199 	if (state == TCP_LISTEN)
2200 		rx_queue = sk->sk_ack_backlog;
2201 	else
2202 		/* Because we don't lock the socket,
2203 		 * we might find a transient negative value.
2204 		 */
2205 		rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2206 
2207 	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2208 			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2209 		i, src, srcp, dest, destp, state,
2210 		tp->write_seq - tp->snd_una,
2211 		rx_queue,
2212 		timer_active,
2213 		jiffies_delta_to_clock_t(timer_expires - jiffies),
2214 		icsk->icsk_retransmits,
2215 		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2216 		icsk->icsk_probes_out,
2217 		sock_i_ino(sk),
2218 		atomic_read(&sk->sk_refcnt), sk,
2219 		jiffies_to_clock_t(icsk->icsk_rto),
2220 		jiffies_to_clock_t(icsk->icsk_ack.ato),
2221 		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2222 		tp->snd_cwnd,
2223 		state == TCP_LISTEN ?
2224 		    fastopenq->max_qlen :
2225 		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2226 }
2227 
2228 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2229 			       struct seq_file *f, int i)
2230 {
2231 	long delta = tw->tw_timer.expires - jiffies;
2232 	__be32 dest, src;
2233 	__u16 destp, srcp;
2234 
2235 	dest  = tw->tw_daddr;
2236 	src   = tw->tw_rcv_saddr;
2237 	destp = ntohs(tw->tw_dport);
2238 	srcp  = ntohs(tw->tw_sport);
2239 
2240 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2241 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2242 		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2243 		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2244 		atomic_read(&tw->tw_refcnt), tw);
2245 }
2246 
2247 #define TMPSZ 150
2248 
2249 static int tcp4_seq_show(struct seq_file *seq, void *v)
2250 {
2251 	struct tcp_iter_state *st;
2252 	struct sock *sk = v;
2253 
2254 	seq_setwidth(seq, TMPSZ - 1);
2255 	if (v == SEQ_START_TOKEN) {
2256 		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2257 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2258 			   "inode");
2259 		goto out;
2260 	}
2261 	st = seq->private;
2262 
2263 	if (sk->sk_state == TCP_TIME_WAIT)
2264 		get_timewait4_sock(v, seq, st->num);
2265 	else if (sk->sk_state == TCP_NEW_SYN_RECV)
2266 		get_openreq4(v, seq, st->num);
2267 	else
2268 		get_tcp4_sock(v, seq, st->num);
2269 out:
2270 	seq_pad(seq, '\n');
2271 	return 0;
2272 }
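
/*
 * For illustration (example values, little-endian host): a row for an
 * established 127.0.0.1:22 <-> 127.0.0.1:43210 connection begins
 *
 *    0: 0100007F:0016 0100007F:A8CA 01 ...
 *
 * The address columns are the raw __be32 values printed with %08X, so
 * their byte order follows the host; the port columns go through ntohs()
 * and are plain host-order hex.  The "01" is the socket state
 * (TCP_ESTABLISHED).
 */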
2273 
2274 static const struct file_operations tcp_afinfo_seq_fops = {
2275 	.owner   = THIS_MODULE,
2276 	.open    = tcp_seq_open,
2277 	.read    = seq_read,
2278 	.llseek  = seq_lseek,
2279 	.release = seq_release_net
2280 };
2281 
2282 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2283 	.name		= "tcp",
2284 	.family		= AF_INET,
2285 	.seq_fops	= &tcp_afinfo_seq_fops,
2286 	.seq_ops	= {
2287 		.show		= tcp4_seq_show,
2288 	},
2289 };
2290 
2291 static int __net_init tcp4_proc_init_net(struct net *net)
2292 {
2293 	return tcp_proc_register(net, &tcp4_seq_afinfo);
2294 }
2295 
2296 static void __net_exit tcp4_proc_exit_net(struct net *net)
2297 {
2298 	tcp_proc_unregister(net, &tcp4_seq_afinfo);
2299 }
2300 
2301 static struct pernet_operations tcp4_net_ops = {
2302 	.init = tcp4_proc_init_net,
2303 	.exit = tcp4_proc_exit_net,
2304 };
2305 
2306 int __init tcp4_proc_init(void)
2307 {
2308 	return register_pernet_subsys(&tcp4_net_ops);
2309 }
2310 
2311 void tcp4_proc_exit(void)
2312 {
2313 	unregister_pernet_subsys(&tcp4_net_ops);
2314 }
2315 #endif /* CONFIG_PROC_FS */
2316 
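/*
 * The proto table below is what a socket(AF_INET, SOCK_STREAM,
 * IPPROTO_TCP) socket ends up bound to: the inet socket layer dispatches
 * through it, so connect(2) reaches tcp_v4_connect() via .connect and
 * close(2) reaches tcp_close() via .close.  Registration itself happens
 * from af_inet.c, not here.
 */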
2317 struct proto tcp_prot = {
2318 	.name			= "TCP",
2319 	.owner			= THIS_MODULE,
2320 	.close			= tcp_close,
2321 	.connect		= tcp_v4_connect,
2322 	.disconnect		= tcp_disconnect,
2323 	.accept			= inet_csk_accept,
2324 	.ioctl			= tcp_ioctl,
2325 	.init			= tcp_v4_init_sock,
2326 	.destroy		= tcp_v4_destroy_sock,
2327 	.shutdown		= tcp_shutdown,
2328 	.setsockopt		= tcp_setsockopt,
2329 	.getsockopt		= tcp_getsockopt,
2330 	.recvmsg		= tcp_recvmsg,
2331 	.sendmsg		= tcp_sendmsg,
2332 	.sendpage		= tcp_sendpage,
2333 	.backlog_rcv		= tcp_v4_do_rcv,
2334 	.release_cb		= tcp_release_cb,
2335 	.hash			= inet_hash,
2336 	.unhash			= inet_unhash,
2337 	.get_port		= inet_csk_get_port,
2338 	.enter_memory_pressure	= tcp_enter_memory_pressure,
2339 	.stream_memory_free	= tcp_stream_memory_free,
2340 	.sockets_allocated	= &tcp_sockets_allocated,
2341 	.orphan_count		= &tcp_orphan_count,
2342 	.memory_allocated	= &tcp_memory_allocated,
2343 	.memory_pressure	= &tcp_memory_pressure,
2344 	.sysctl_mem		= sysctl_tcp_mem,
2345 	.sysctl_wmem		= sysctl_tcp_wmem,
2346 	.sysctl_rmem		= sysctl_tcp_rmem,
2347 	.max_header		= MAX_TCP_HEADER,
2348 	.obj_size		= sizeof(struct tcp_sock),
2349 	.slab_flags		= SLAB_DESTROY_BY_RCU,
2350 	.twsk_prot		= &tcp_timewait_sock_ops,
2351 	.rsk_prot		= &tcp_request_sock_ops,
2352 	.h.hashinfo		= &tcp_hashinfo,
2353 	.no_autobind		= true,
2354 #ifdef CONFIG_COMPAT
2355 	.compat_setsockopt	= compat_tcp_setsockopt,
2356 	.compat_getsockopt	= compat_tcp_getsockopt,
2357 #endif
2358 	.diag_destroy		= tcp_abort,
2359 };
2360 EXPORT_SYMBOL(tcp_prot);
2361 
2362 static void __net_exit tcp_sk_exit(struct net *net)
2363 {
2364 	int cpu;
2365 
2366 	for_each_possible_cpu(cpu)
2367 		inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2368 	free_percpu(net->ipv4.tcp_sk);
2369 }
2370 
2371 static int __net_init tcp_sk_init(struct net *net)
2372 {
2373 	int res, cpu;
2374 
2375 	net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2376 	if (!net->ipv4.tcp_sk)
2377 		return -ENOMEM;
2378 
2379 	for_each_possible_cpu(cpu) {
2380 		struct sock *sk;
2381 
2382 		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2383 					   IPPROTO_TCP, net);
2384 		if (res)
2385 			goto fail;
2386 		*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2387 	}
2388 
2389 	net->ipv4.sysctl_tcp_ecn = 2;
2390 	net->ipv4.sysctl_tcp_ecn_fallback = 1;
2391 
2392 	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2393 	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2394 	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2395 
2396 	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2397 	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2398 	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2399 
2400 	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2401 	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2402 	net->ipv4.sysctl_tcp_syncookies = 1;
2403 	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2404 	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2405 	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2406 	net->ipv4.sysctl_tcp_orphan_retries = 0;
2407 	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2408 	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
2409 
2410 	return 0;
2411 fail:
2412 	tcp_sk_exit(net);
2413 
2414 	return res;
2415 }
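
/*
 * The per-namespace defaults chosen above surface as the familiar
 * sysctls: e.g. net->ipv4.sysctl_tcp_syn_retries is what a task in that
 * namespace reads and writes as /proc/sys/net/ipv4/tcp_syn_retries.  The
 * sysctl tables themselves are registered from net/ipv4/sysctl_net_ipv4.c.
 */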
2416 
2417 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2418 {
2419 	inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2420 }
2421 
2422 static struct pernet_operations __net_initdata tcp_sk_ops = {
2423 	.init	   = tcp_sk_init,
2424 	.exit	   = tcp_sk_exit,
2425 	.exit_batch = tcp_sk_exit_batch,
2426 };
2427 
2428 void __init tcp_v4_init(void)
2429 {
2430 	inet_hashinfo_init(&tcp_hashinfo);
2431 	if (register_pernet_subsys(&tcp_sk_ops))
2432 		panic("Failed to create the TCP control socket.\n");
2433 }
2434