xref: /openbmc/linux/net/ipv4/tcp_ipv4.c (revision 2596e07a)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		Implementation of the Transmission Control Protocol(TCP).
7  *
8  *		IPv4 specific functions
9  *
10  *
11  *		code split from:
12  *		linux/ipv4/tcp.c
13  *		linux/ipv4/tcp_input.c
14  *		linux/ipv4/tcp_output.c
15  *
16  *		See tcp.c for author information
17  *
18  *	This program is free software; you can redistribute it and/or
19  *      modify it under the terms of the GNU General Public License
20  *      as published by the Free Software Foundation; either version
21  *      2 of the License, or (at your option) any later version.
22  */
23 
24 /*
25  * Changes:
26  *		David S. Miller	:	New socket lookup architecture.
27  *					This code is dedicated to John Dyson.
28  *		David S. Miller :	Change semantics of established hash,
29  *					half is devoted to TIME_WAIT sockets
30  *					and the rest go in the other half.
31  *		Andi Kleen :		Add support for syncookies and fixed
32  *					some bugs: ip options weren't passed to
33  *					the TCP layer, missed a check for an
34  *					ACK bit.
35  *		Andi Kleen :		Implemented fast path mtu discovery.
36  *	     				Fixed many serious bugs in the
37  *					request_sock handling and moved
38  *					most of it into the af independent code.
39  *					Added tail drop and some other bugfixes.
40  *					Added new listen semantics.
41  *		Mike McLagan	:	Routing by source
42  *	Juan Jose Ciarlante:		ip_dynaddr bits
43  *		Andi Kleen:		various fixes.
44  *	Vitaly E. Lavrov	:	Transparent proxy revived after year
45  *					coma.
46  *	Andi Kleen		:	Fix new listen.
47  *	Andi Kleen		:	Fix accept error reporting.
48  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
49  *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
50  *					a single port at the same time.
51  */
52 
53 #define pr_fmt(fmt) "TCP: " fmt
54 
55 #include <linux/bottom_half.h>
56 #include <linux/types.h>
57 #include <linux/fcntl.h>
58 #include <linux/module.h>
59 #include <linux/random.h>
60 #include <linux/cache.h>
61 #include <linux/jhash.h>
62 #include <linux/init.h>
63 #include <linux/times.h>
64 #include <linux/slab.h>
65 
66 #include <net/net_namespace.h>
67 #include <net/icmp.h>
68 #include <net/inet_hashtables.h>
69 #include <net/tcp.h>
70 #include <net/transp_v6.h>
71 #include <net/ipv6.h>
72 #include <net/inet_common.h>
73 #include <net/timewait_sock.h>
74 #include <net/xfrm.h>
75 #include <net/secure_seq.h>
76 #include <net/busy_poll.h>
77 
78 #include <linux/inet.h>
79 #include <linux/ipv6.h>
80 #include <linux/stddef.h>
81 #include <linux/proc_fs.h>
82 #include <linux/seq_file.h>
83 
84 #include <linux/crypto.h>
85 #include <linux/scatterlist.h>
86 
87 int sysctl_tcp_tw_reuse __read_mostly;
88 int sysctl_tcp_low_latency __read_mostly;
89 EXPORT_SYMBOL(sysctl_tcp_low_latency);
90 
91 #ifdef CONFIG_TCP_MD5SIG
92 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
93 			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
94 #endif
95 
96 struct inet_hashinfo tcp_hashinfo;
97 EXPORT_SYMBOL(tcp_hashinfo);
98 
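/* Pick the initial sequence number for an outgoing connection from a keyed
 * hash of the connection 4-tuple, so that ISNs are hard to predict off-path.
 */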
99 static  __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
100 {
101 	return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
102 					  ip_hdr(skb)->saddr,
103 					  tcp_hdr(skb)->dest,
104 					  tcp_hdr(skb)->source);
105 }
106 
107 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
108 {
109 	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
110 	struct tcp_sock *tp = tcp_sk(sk);
111 
112 	/* With PAWS, it is safe from the viewpoint
113 	   of data integrity. Even without PAWS it is safe provided sequence
114 	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.
115 
116 	   Actually, the idea is close to VJ's: the timestamp cache is
117 	   held not per host but per port pair, and the TW bucket is used as
118 	   the state holder.
119 
120 	   If TW bucket has been already destroyed we fall back to VJ's scheme
121 	   and use initial timestamp retrieved from peer table.
122 	 */
123 	if (tcptw->tw_ts_recent_stamp &&
124 	    (!twp || (sysctl_tcp_tw_reuse &&
125 			     get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
126 		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
127 		if (tp->write_seq == 0)
128 			tp->write_seq = 1;
129 		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
130 		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
131 		sock_hold(sktw);
132 		return 1;
133 	}
134 
135 	return 0;
136 }
137 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
138 
139 /* This will initiate an outgoing connection. */
140 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
141 {
142 	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
143 	struct inet_sock *inet = inet_sk(sk);
144 	struct tcp_sock *tp = tcp_sk(sk);
145 	__be16 orig_sport, orig_dport;
146 	__be32 daddr, nexthop;
147 	struct flowi4 *fl4;
148 	struct rtable *rt;
149 	int err;
150 	struct ip_options_rcu *inet_opt;
151 
152 	if (addr_len < sizeof(struct sockaddr_in))
153 		return -EINVAL;
154 
155 	if (usin->sin_family != AF_INET)
156 		return -EAFNOSUPPORT;
157 
158 	nexthop = daddr = usin->sin_addr.s_addr;
159 	inet_opt = rcu_dereference_protected(inet->inet_opt,
160 					     sock_owned_by_user(sk));
161 	if (inet_opt && inet_opt->opt.srr) {
162 		if (!daddr)
163 			return -EINVAL;
164 		nexthop = inet_opt->opt.faddr;
165 	}
166 
167 	orig_sport = inet->inet_sport;
168 	orig_dport = usin->sin_port;
169 	fl4 = &inet->cork.fl.u.ip4;
170 	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
171 			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
172 			      IPPROTO_TCP,
173 			      orig_sport, orig_dport, sk);
174 	if (IS_ERR(rt)) {
175 		err = PTR_ERR(rt);
176 		if (err == -ENETUNREACH)
177 			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
178 		return err;
179 	}
180 
181 	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
182 		ip_rt_put(rt);
183 		return -ENETUNREACH;
184 	}
185 
186 	if (!inet_opt || !inet_opt->opt.srr)
187 		daddr = fl4->daddr;
188 
189 	if (!inet->inet_saddr)
190 		inet->inet_saddr = fl4->saddr;
191 	sk_rcv_saddr_set(sk, inet->inet_saddr);
192 
193 	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
194 		/* Reset inherited state */
195 		tp->rx_opt.ts_recent	   = 0;
196 		tp->rx_opt.ts_recent_stamp = 0;
197 		if (likely(!tp->repair))
198 			tp->write_seq	   = 0;
199 	}
200 
201 	if (tcp_death_row.sysctl_tw_recycle &&
202 	    !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
203 		tcp_fetch_timewait_stamp(sk, &rt->dst);
204 
205 	inet->inet_dport = usin->sin_port;
206 	sk_daddr_set(sk, daddr);
207 
208 	inet_csk(sk)->icsk_ext_hdr_len = 0;
209 	if (inet_opt)
210 		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
211 
212 	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
213 
214 	/* Socket identity is still unknown (sport may be zero).
215 	 * However we set state to SYN-SENT and, without releasing the socket
216 	 * lock, select a source port, enter ourselves into the hash tables and
217 	 * complete initialization after this.
218 	 */
219 	tcp_set_state(sk, TCP_SYN_SENT);
220 	err = inet_hash_connect(&tcp_death_row, sk);
221 	if (err)
222 		goto failure;
223 
224 	sk_set_txhash(sk);
225 
226 	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
227 			       inet->inet_sport, inet->inet_dport, sk);
228 	if (IS_ERR(rt)) {
229 		err = PTR_ERR(rt);
230 		rt = NULL;
231 		goto failure;
232 	}
233 	/* OK, now commit destination to socket.  */
234 	sk->sk_gso_type = SKB_GSO_TCPV4;
235 	sk_setup_caps(sk, &rt->dst);
236 
237 	if (!tp->write_seq && likely(!tp->repair))
238 		tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
239 							   inet->inet_daddr,
240 							   inet->inet_sport,
241 							   usin->sin_port);
242 
243 	inet->inet_id = tp->write_seq ^ jiffies;
244 
245 	err = tcp_connect(sk);
246 
247 	rt = NULL;
248 	if (err)
249 		goto failure;
250 
251 	return 0;
252 
253 failure:
254 	/*
255 	 * This unhashes the socket and releases the local port,
256 	 * if necessary.
257 	 */
258 	tcp_set_state(sk, TCP_CLOSE);
259 	ip_rt_put(rt);
260 	sk->sk_route_caps = 0;
261 	inet->inet_dport = 0;
262 	return err;
263 }
264 EXPORT_SYMBOL(tcp_v4_connect);
265 
266 /*
267  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
268  * It can be called through tcp_release_cb() if the socket was owned by the user
269  * at the time tcp_v4_err() was called to handle the ICMP message.
270  */
271 void tcp_v4_mtu_reduced(struct sock *sk)
272 {
273 	struct dst_entry *dst;
274 	struct inet_sock *inet = inet_sk(sk);
275 	u32 mtu = tcp_sk(sk)->mtu_info;
276 
277 	dst = inet_csk_update_pmtu(sk, mtu);
278 	if (!dst)
279 		return;
280 
281 	/* Something is about to go wrong... Remember the soft error
282 	 * in case this connection is not able to recover.
283 	 */
284 	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
285 		sk->sk_err_soft = EMSGSIZE;
286 
287 	mtu = dst_mtu(dst);
288 
289 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
290 	    ip_sk_accept_pmtu(sk) &&
291 	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
292 		tcp_sync_mss(sk, mtu);
293 
294 		/* Resend the TCP packet because it's
295 		 * clear that the old packet has been
296 		 * dropped. This is the new "fast" path mtu
297 		 * discovery.
298 		 */
299 		tcp_simple_retransmit(sk);
300 	} /* else let the usual retransmit timer handle it */
301 }
302 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
303 
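/* An ICMP redirect arrived for this socket: if the cached route is still
 * valid, let its redirect handler update it.
 */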
304 static void do_redirect(struct sk_buff *skb, struct sock *sk)
305 {
306 	struct dst_entry *dst = __sk_dst_check(sk, 0);
307 
308 	if (dst)
309 		dst->ops->redirect(dst, sk, skb);
310 }
311 
312 
313 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
314 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
315 {
316 	struct request_sock *req = inet_reqsk(sk);
317 	struct net *net = sock_net(sk);
318 
319 	/* ICMPs are not backlogged, hence we cannot get
320 	 * an established socket here.
321 	 */
322 	WARN_ON(req->sk);
323 
324 	if (seq != tcp_rsk(req)->snt_isn) {
325 		NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
326 	} else if (abort) {
327 		/*
328 		 * Still in SYN_RECV, just remove it silently.
329 		 * There is no good way to pass the error to the newly
330 		 * created socket, and POSIX does not want network
331 		 * errors returned from accept().
332 		 */
333 		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
334 		NET_INC_STATS_BH(net, LINUX_MIB_LISTENDROPS);
335 	}
336 	reqsk_put(req);
337 }
338 EXPORT_SYMBOL(tcp_req_err);
339 
340 /*
341  * This routine is called by the ICMP module when it gets some
342  * sort of error condition.  If err < 0 then the socket should
343  * be closed and the error returned to the user.  If err > 0
344  * it's just the icmp type << 8 | icmp code.  After adjustment
345  * header points to the first 8 bytes of the tcp header.  We need
346  * to find the appropriate port.
347  *
348  * The locking strategy used here is very "optimistic". When
349  * someone else accesses the socket the ICMP is just dropped
350  * and for some paths there is no check at all.
351  * A more general error queue to queue errors for later handling
352  * is probably better.
353  *
354  */
355 
356 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
357 {
358 	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
359 	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
360 	struct inet_connection_sock *icsk;
361 	struct tcp_sock *tp;
362 	struct inet_sock *inet;
363 	const int type = icmp_hdr(icmp_skb)->type;
364 	const int code = icmp_hdr(icmp_skb)->code;
365 	struct sock *sk;
366 	struct sk_buff *skb;
367 	struct request_sock *fastopen;
368 	__u32 seq, snd_una;
369 	__u32 remaining;
370 	int err;
371 	struct net *net = dev_net(icmp_skb->dev);
372 
373 	sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
374 				       th->dest, iph->saddr, ntohs(th->source),
375 				       inet_iif(icmp_skb));
376 	if (!sk) {
377 		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
378 		return;
379 	}
380 	if (sk->sk_state == TCP_TIME_WAIT) {
381 		inet_twsk_put(inet_twsk(sk));
382 		return;
383 	}
384 	seq = ntohl(th->seq);
385 	if (sk->sk_state == TCP_NEW_SYN_RECV)
386 		return tcp_req_err(sk, seq,
387 				  type == ICMP_PARAMETERPROB ||
388 				  type == ICMP_TIME_EXCEEDED ||
389 				  (type == ICMP_DEST_UNREACH &&
390 				   (code == ICMP_NET_UNREACH ||
391 				    code == ICMP_HOST_UNREACH)));
392 
393 	bh_lock_sock(sk);
394 	/* If too many ICMPs get dropped on busy
395 	 * servers this needs to be solved differently.
396 	 * We do take care of the PMTU discovery (RFC1191) special case:
397 	 * we can receive locally generated ICMP messages while socket is held.
398 	 */
399 	if (sock_owned_by_user(sk)) {
400 		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
401 			NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
402 	}
403 	if (sk->sk_state == TCP_CLOSE)
404 		goto out;
405 
406 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
407 		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
408 		goto out;
409 	}
410 
411 	icsk = inet_csk(sk);
412 	tp = tcp_sk(sk);
413 	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
414 	fastopen = tp->fastopen_rsk;
415 	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
416 	if (sk->sk_state != TCP_LISTEN &&
417 	    !between(seq, snd_una, tp->snd_nxt)) {
418 		NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
419 		goto out;
420 	}
421 
422 	switch (type) {
423 	case ICMP_REDIRECT:
424 		do_redirect(icmp_skb, sk);
425 		goto out;
426 	case ICMP_SOURCE_QUENCH:
427 		/* Just silently ignore these. */
428 		goto out;
429 	case ICMP_PARAMETERPROB:
430 		err = EPROTO;
431 		break;
432 	case ICMP_DEST_UNREACH:
433 		if (code > NR_ICMP_UNREACH)
434 			goto out;
435 
436 		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
437 			/* We are not interested in TCP_LISTEN and open_requests
438 			 * (SYN-ACKs sent out by Linux are always < 576 bytes, so
439 			 * they should go through unfragmented).
440 			 */
441 			if (sk->sk_state == TCP_LISTEN)
442 				goto out;
443 
444 			tp->mtu_info = info;
445 			if (!sock_owned_by_user(sk)) {
446 				tcp_v4_mtu_reduced(sk);
447 			} else {
448 				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
449 					sock_hold(sk);
450 			}
451 			goto out;
452 		}
453 
454 		err = icmp_err_convert[code].errno;
455 		/* check if icmp_skb allows revert of backoff
456 		 * (see draft-zimmermann-tcp-lcd) */
457 		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
458 			break;
459 		if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
460 		    !icsk->icsk_backoff || fastopen)
461 			break;
462 
463 		if (sock_owned_by_user(sk))
464 			break;
465 
466 		icsk->icsk_backoff--;
467 		icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
468 					       TCP_TIMEOUT_INIT;
469 		icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
470 
471 		skb = tcp_write_queue_head(sk);
472 		BUG_ON(!skb);
473 
474 		remaining = icsk->icsk_rto -
475 			    min(icsk->icsk_rto,
476 				tcp_time_stamp - tcp_skb_timestamp(skb));
477 
478 		if (remaining) {
479 			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
480 						  remaining, TCP_RTO_MAX);
481 		} else {
482 			/* RTO revert clocked out retransmission.
483 			 * Will retransmit now */
484 			tcp_retransmit_timer(sk);
485 		}
486 
487 		break;
488 	case ICMP_TIME_EXCEEDED:
489 		err = EHOSTUNREACH;
490 		break;
491 	default:
492 		goto out;
493 	}
494 
495 	switch (sk->sk_state) {
496 	case TCP_SYN_SENT:
497 	case TCP_SYN_RECV:
498 		/* Only in fast or simultaneous open. If a fast open socket
499 		 * is already accepted it is treated as a connected one below.
500 		 */
501 		if (fastopen && !fastopen->sk)
502 			break;
503 
504 		if (!sock_owned_by_user(sk)) {
505 			sk->sk_err = err;
506 
507 			sk->sk_error_report(sk);
508 
509 			tcp_done(sk);
510 		} else {
511 			sk->sk_err_soft = err;
512 		}
513 		goto out;
514 	}
515 
516 	/* If we've already connected we will keep trying
517 	 * until we time out, or the user gives up.
518 	 *
519 	 * RFC 1122 4.2.3.9 allows us to consider only PROTO_UNREACH and
520 	 * PORT_UNREACH as hard errors (well, FRAG_FAILED too,
521 	 * but it is obsoleted by pmtu discovery).
522 	 *
523 	 * Note that in the modern internet, where routing is unreliable
524 	 * and broken firewalls sit in every dark corner sending random
525 	 * errors ordered by their masters, even these two messages finally lose
526 	 * their original sense (even Linux sends invalid PORT_UNREACHs).
527 	 *
528 	 * Now we are in compliance with RFCs.
529 	 *							--ANK (980905)
530 	 */
531 
532 	inet = inet_sk(sk);
533 	if (!sock_owned_by_user(sk) && inet->recverr) {
534 		sk->sk_err = err;
535 		sk->sk_error_report(sk);
536 	} else	{ /* Only an error on timeout */
537 		sk->sk_err_soft = err;
538 	}
539 
540 out:
541 	bh_unlock_sock(sk);
542 	sock_put(sk);
543 }
544 
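/* Fill in the TCP checksum field: either set up the pseudo-header sum for
 * checksum offload (CHECKSUM_PARTIAL) or compute the full checksum now.
 */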
545 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
546 {
547 	struct tcphdr *th = tcp_hdr(skb);
548 
549 	if (skb->ip_summed == CHECKSUM_PARTIAL) {
550 		th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
551 		skb->csum_start = skb_transport_header(skb) - skb->head;
552 		skb->csum_offset = offsetof(struct tcphdr, check);
553 	} else {
554 		th->check = tcp_v4_check(skb->len, saddr, daddr,
555 					 csum_partial(th,
556 						      th->doff << 2,
557 						      skb->csum));
558 	}
559 }
560 
561 /* This routine computes an IPv4 TCP checksum. */
562 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
563 {
564 	const struct inet_sock *inet = inet_sk(sk);
565 
566 	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
567 }
568 EXPORT_SYMBOL(tcp_v4_send_check);
569 
570 /*
571  *	This routine will send an RST to the other tcp.
572  *
573  *	Someone asks: why do we NEVER use socket parameters (TOS, TTL etc.)
574  *		      for the reset?
575  *	Answer: if a packet caused an RST, it is not for a socket
576  *		existing in our system; if it does match a socket,
577  *		it is just a duplicate segment or a bug in the other side's TCP.
578  *		So we build the reply based only on the parameters
579  *		that arrived with the segment.
580  *	Exception: precedence violation. We do not implement it in any case.
581  */
582 
583 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
584 {
585 	const struct tcphdr *th = tcp_hdr(skb);
586 	struct {
587 		struct tcphdr th;
588 #ifdef CONFIG_TCP_MD5SIG
589 		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
590 #endif
591 	} rep;
592 	struct ip_reply_arg arg;
593 #ifdef CONFIG_TCP_MD5SIG
594 	struct tcp_md5sig_key *key = NULL;
595 	const __u8 *hash_location = NULL;
596 	unsigned char newhash[16];
597 	int genhash;
598 	struct sock *sk1 = NULL;
599 #endif
600 	struct net *net;
601 
602 	/* Never send a reset in response to a reset. */
603 	if (th->rst)
604 		return;
605 
606 	/* If sk is not NULL, it means we did a successful lookup and the incoming
607 	 * route had to be correct. prequeue might have dropped our dst.
608 	 */
609 	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
610 		return;
611 
612 	/* Swap the send and the receive. */
613 	memset(&rep, 0, sizeof(rep));
614 	rep.th.dest   = th->source;
615 	rep.th.source = th->dest;
616 	rep.th.doff   = sizeof(struct tcphdr) / 4;
617 	rep.th.rst    = 1;
618 
619 	if (th->ack) {
620 		rep.th.seq = th->ack_seq;
621 	} else {
622 		rep.th.ack = 1;
623 		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
624 				       skb->len - (th->doff << 2));
625 	}
626 
627 	memset(&arg, 0, sizeof(arg));
628 	arg.iov[0].iov_base = (unsigned char *)&rep;
629 	arg.iov[0].iov_len  = sizeof(rep.th);
630 
631 	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
632 #ifdef CONFIG_TCP_MD5SIG
633 	hash_location = tcp_parse_md5sig_option(th);
634 	if (sk && sk_fullsock(sk)) {
635 		key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
636 					&ip_hdr(skb)->saddr, AF_INET);
637 	} else if (hash_location) {
638 		/*
639 		 * active side is lost. Try to find listening socket through
640 		 * source port, and then find md5 key through listening socket.
641 		 * We are not losing security here:
642 		 * the incoming packet is checked against the md5 hash of the key we find;
643 		 * no RST is generated if the md5 hash doesn't match.
644 		 */
645 		sk1 = __inet_lookup_listener(net,
646 					     &tcp_hashinfo, ip_hdr(skb)->saddr,
647 					     th->source, ip_hdr(skb)->daddr,
648 					     ntohs(th->source), inet_iif(skb));
649 		/* don't send rst if it can't find key */
650 		if (!sk1)
651 			return;
652 		rcu_read_lock();
653 		key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
654 					&ip_hdr(skb)->saddr, AF_INET);
655 		if (!key)
656 			goto release_sk1;
657 
658 		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
659 		if (genhash || memcmp(hash_location, newhash, 16) != 0)
660 			goto release_sk1;
661 	}
662 
663 	if (key) {
664 		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
665 				   (TCPOPT_NOP << 16) |
666 				   (TCPOPT_MD5SIG << 8) |
667 				   TCPOLEN_MD5SIG);
668 		/* Update length and the length the header thinks exists */
669 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
670 		rep.th.doff = arg.iov[0].iov_len / 4;
671 
672 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
673 				     key, ip_hdr(skb)->saddr,
674 				     ip_hdr(skb)->daddr, &rep.th);
675 	}
676 #endif
677 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
678 				      ip_hdr(skb)->saddr, /* XXX */
679 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
680 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
681 	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
682 
683 	/* When the socket is gone, all binding information is lost.
684 	 * Routing might fail in this case. No choice here: if we choose to force
685 	 * the input interface, we will misroute in case of an asymmetric route.
686 	 */
687 	if (sk)
688 		arg.bound_dev_if = sk->sk_bound_dev_if;
689 
690 	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
691 		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));
692 
693 	arg.tos = ip_hdr(skb)->tos;
694 	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
695 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
696 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
697 			      &arg, arg.iov[0].iov_len);
698 
699 	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
700 	TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
701 
702 #ifdef CONFIG_TCP_MD5SIG
703 release_sk1:
704 	if (sk1) {
705 		rcu_read_unlock();
706 		sock_put(sk1);
707 	}
708 #endif
709 }
710 
711 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
712    outside socket context, is certainly ugly. What can I do?
713  */
714 
715 static void tcp_v4_send_ack(struct net *net,
716 			    struct sk_buff *skb, u32 seq, u32 ack,
717 			    u32 win, u32 tsval, u32 tsecr, int oif,
718 			    struct tcp_md5sig_key *key,
719 			    int reply_flags, u8 tos)
720 {
721 	const struct tcphdr *th = tcp_hdr(skb);
722 	struct {
723 		struct tcphdr th;
724 		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
725 #ifdef CONFIG_TCP_MD5SIG
726 			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
727 #endif
728 			];
729 	} rep;
730 	struct ip_reply_arg arg;
731 
732 	memset(&rep.th, 0, sizeof(struct tcphdr));
733 	memset(&arg, 0, sizeof(arg));
734 
735 	arg.iov[0].iov_base = (unsigned char *)&rep;
736 	arg.iov[0].iov_len  = sizeof(rep.th);
737 	if (tsecr) {
738 		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
739 				   (TCPOPT_TIMESTAMP << 8) |
740 				   TCPOLEN_TIMESTAMP);
741 		rep.opt[1] = htonl(tsval);
742 		rep.opt[2] = htonl(tsecr);
743 		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
744 	}
745 
746 	/* Swap the send and the receive. */
747 	rep.th.dest    = th->source;
748 	rep.th.source  = th->dest;
749 	rep.th.doff    = arg.iov[0].iov_len / 4;
750 	rep.th.seq     = htonl(seq);
751 	rep.th.ack_seq = htonl(ack);
752 	rep.th.ack     = 1;
753 	rep.th.window  = htons(win);
754 
755 #ifdef CONFIG_TCP_MD5SIG
756 	if (key) {
757 		int offset = (tsecr) ? 3 : 0;
758 
759 		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
760 					  (TCPOPT_NOP << 16) |
761 					  (TCPOPT_MD5SIG << 8) |
762 					  TCPOLEN_MD5SIG);
763 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
764 		rep.th.doff = arg.iov[0].iov_len/4;
765 
766 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
767 				    key, ip_hdr(skb)->saddr,
768 				    ip_hdr(skb)->daddr, &rep.th);
769 	}
770 #endif
771 	arg.flags = reply_flags;
772 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
773 				      ip_hdr(skb)->saddr, /* XXX */
774 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
775 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
776 	if (oif)
777 		arg.bound_dev_if = oif;
778 	arg.tos = tos;
779 	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
780 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
781 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
782 			      &arg, arg.iov[0].iov_len);
783 
784 	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
785 }
786 
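/* A segment hit a TIME-WAIT socket: answer with an ACK built from the
 * state remembered in the timewait bucket (sequence numbers, window,
 * timestamps and, if configured, the MD5 key).
 */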
787 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
788 {
789 	struct inet_timewait_sock *tw = inet_twsk(sk);
790 	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
791 
792 	tcp_v4_send_ack(sock_net(sk), skb,
793 			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
794 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
795 			tcp_time_stamp + tcptw->tw_ts_offset,
796 			tcptw->tw_ts_recent,
797 			tw->tw_bound_dev_if,
798 			tcp_twsk_md5_key(tcptw),
799 			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
800 			tw->tw_tos
801 			);
802 
803 	inet_twsk_put(tw);
804 }
805 
806 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
807 				  struct request_sock *req)
808 {
809 	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
810 	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
811 	 */
812 	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
813 					     tcp_sk(sk)->snd_nxt;
814 
815 	tcp_v4_send_ack(sock_net(sk), skb, seq,
816 			tcp_rsk(req)->rcv_nxt, req->rsk_rcv_wnd,
817 			tcp_time_stamp,
818 			req->ts_recent,
819 			0,
820 			tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
821 					  AF_INET),
822 			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
823 			ip_hdr(skb)->tos);
824 }
825 
826 /*
827  *	Send a SYN-ACK after having received a SYN.
828  *	This still operates on a request_sock only, not on a big
829  *	socket.
830  */
831 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
832 			      struct flowi *fl,
833 			      struct request_sock *req,
834 			      struct tcp_fastopen_cookie *foc,
835 				  bool attach_req)
836 {
837 	const struct inet_request_sock *ireq = inet_rsk(req);
838 	struct flowi4 fl4;
839 	int err = -1;
840 	struct sk_buff *skb;
841 
842 	/* First, grab a route. */
843 	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
844 		return -1;
845 
846 	skb = tcp_make_synack(sk, dst, req, foc, attach_req);
847 
848 	if (skb) {
849 		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
850 
851 		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
852 					    ireq->ir_rmt_addr,
853 					    ireq->opt);
854 		err = net_xmit_eval(err);
855 	}
856 
857 	return err;
858 }
859 
860 /*
861  *	IPv4 request_sock destructor.
862  */
863 static void tcp_v4_reqsk_destructor(struct request_sock *req)
864 {
865 	kfree(inet_rsk(req)->opt);
866 }
867 
868 
869 #ifdef CONFIG_TCP_MD5SIG
870 /*
871  * RFC2385 MD5 checksumming requires a mapping of
872  * IP address->MD5 Key.
873  * We need to maintain these in the sk structure.
874  */
875 
876 /* Find the Key structure for an address.  */
877 struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
878 					 const union tcp_md5_addr *addr,
879 					 int family)
880 {
881 	const struct tcp_sock *tp = tcp_sk(sk);
882 	struct tcp_md5sig_key *key;
883 	unsigned int size = sizeof(struct in_addr);
884 	const struct tcp_md5sig_info *md5sig;
885 
886 	/* caller either holds rcu_read_lock() or socket lock */
887 	md5sig = rcu_dereference_check(tp->md5sig_info,
888 				       sock_owned_by_user(sk) ||
889 				       lockdep_is_held((spinlock_t *)&sk->sk_lock.slock));
890 	if (!md5sig)
891 		return NULL;
892 #if IS_ENABLED(CONFIG_IPV6)
893 	if (family == AF_INET6)
894 		size = sizeof(struct in6_addr);
895 #endif
896 	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
897 		if (key->family != family)
898 			continue;
899 		if (!memcmp(&key->addr, addr, size))
900 			return key;
901 	}
902 	return NULL;
903 }
904 EXPORT_SYMBOL(tcp_md5_do_lookup);
905 
906 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
907 					 const struct sock *addr_sk)
908 {
909 	const union tcp_md5_addr *addr;
910 
911 	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
912 	return tcp_md5_do_lookup(sk, addr, AF_INET);
913 }
914 EXPORT_SYMBOL(tcp_v4_md5_lookup);
915 
916 /* This can be called on a newly created socket, from other files */
917 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
918 		   int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
919 {
920 	/* Add Key to the list */
921 	struct tcp_md5sig_key *key;
922 	struct tcp_sock *tp = tcp_sk(sk);
923 	struct tcp_md5sig_info *md5sig;
924 
925 	key = tcp_md5_do_lookup(sk, addr, family);
926 	if (key) {
927 		/* Pre-existing entry - just update that one. */
928 		memcpy(key->key, newkey, newkeylen);
929 		key->keylen = newkeylen;
930 		return 0;
931 	}
932 
933 	md5sig = rcu_dereference_protected(tp->md5sig_info,
934 					   sock_owned_by_user(sk) ||
935 					   lockdep_is_held(&sk->sk_lock.slock));
936 	if (!md5sig) {
937 		md5sig = kmalloc(sizeof(*md5sig), gfp);
938 		if (!md5sig)
939 			return -ENOMEM;
940 
941 		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
942 		INIT_HLIST_HEAD(&md5sig->head);
943 		rcu_assign_pointer(tp->md5sig_info, md5sig);
944 	}
945 
946 	key = sock_kmalloc(sk, sizeof(*key), gfp);
947 	if (!key)
948 		return -ENOMEM;
949 	if (!tcp_alloc_md5sig_pool()) {
950 		sock_kfree_s(sk, key, sizeof(*key));
951 		return -ENOMEM;
952 	}
953 
954 	memcpy(key->key, newkey, newkeylen);
955 	key->keylen = newkeylen;
956 	key->family = family;
957 	memcpy(&key->addr, addr,
958 	       (family == AF_INET6) ? sizeof(struct in6_addr) :
959 				      sizeof(struct in_addr));
960 	hlist_add_head_rcu(&key->node, &md5sig->head);
961 	return 0;
962 }
963 EXPORT_SYMBOL(tcp_md5_do_add);
964 
965 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
966 {
967 	struct tcp_md5sig_key *key;
968 
969 	key = tcp_md5_do_lookup(sk, addr, family);
970 	if (!key)
971 		return -ENOENT;
972 	hlist_del_rcu(&key->node);
973 	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
974 	kfree_rcu(key, rcu);
975 	return 0;
976 }
977 EXPORT_SYMBOL(tcp_md5_do_del);
978 
979 static void tcp_clear_md5_list(struct sock *sk)
980 {
981 	struct tcp_sock *tp = tcp_sk(sk);
982 	struct tcp_md5sig_key *key;
983 	struct hlist_node *n;
984 	struct tcp_md5sig_info *md5sig;
985 
986 	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
987 
988 	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
989 		hlist_del_rcu(&key->node);
990 		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
991 		kfree_rcu(key, rcu);
992 	}
993 }
994 
995 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
996 				 int optlen)
997 {
998 	struct tcp_md5sig cmd;
999 	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1000 
1001 	if (optlen < sizeof(cmd))
1002 		return -EINVAL;
1003 
1004 	if (copy_from_user(&cmd, optval, sizeof(cmd)))
1005 		return -EFAULT;
1006 
1007 	if (sin->sin_family != AF_INET)
1008 		return -EINVAL;
1009 
1010 	if (!cmd.tcpm_keylen)
1011 		return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1012 				      AF_INET);
1013 
1014 	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1015 		return -EINVAL;
1016 
1017 	return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1018 			      AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
1019 			      GFP_KERNEL);
1020 }
1021 
1022 static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
1023 					__be32 daddr, __be32 saddr, int nbytes)
1024 {
1025 	struct tcp4_pseudohdr *bp;
1026 	struct scatterlist sg;
1027 
1028 	bp = &hp->md5_blk.ip4;
1029 
1030 	/*
1031 	 * 1. the TCP pseudo-header (in the order: source IP address,
1032 	 * destination IP address, zero-padded protocol number, and
1033 	 * segment length)
1034 	 */
1035 	bp->saddr = saddr;
1036 	bp->daddr = daddr;
1037 	bp->pad = 0;
1038 	bp->protocol = IPPROTO_TCP;
1039 	bp->len = cpu_to_be16(nbytes);
1040 
1041 	sg_init_one(&sg, bp, sizeof(*bp));
1042 	return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
1043 }
1044 
1045 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1046 			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
1047 {
1048 	struct tcp_md5sig_pool *hp;
1049 	struct hash_desc *desc;
1050 
1051 	hp = tcp_get_md5sig_pool();
1052 	if (!hp)
1053 		goto clear_hash_noput;
1054 	desc = &hp->md5_desc;
1055 
1056 	if (crypto_hash_init(desc))
1057 		goto clear_hash;
1058 	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
1059 		goto clear_hash;
1060 	if (tcp_md5_hash_header(hp, th))
1061 		goto clear_hash;
1062 	if (tcp_md5_hash_key(hp, key))
1063 		goto clear_hash;
1064 	if (crypto_hash_final(desc, md5_hash))
1065 		goto clear_hash;
1066 
1067 	tcp_put_md5sig_pool();
1068 	return 0;
1069 
1070 clear_hash:
1071 	tcp_put_md5sig_pool();
1072 clear_hash_noput:
1073 	memset(md5_hash, 0, 16);
1074 	return 1;
1075 }
1076 
1077 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1078 			const struct sock *sk,
1079 			const struct sk_buff *skb)
1080 {
1081 	struct tcp_md5sig_pool *hp;
1082 	struct hash_desc *desc;
1083 	const struct tcphdr *th = tcp_hdr(skb);
1084 	__be32 saddr, daddr;
1085 
1086 	if (sk) { /* valid for establish/request sockets */
1087 		saddr = sk->sk_rcv_saddr;
1088 		daddr = sk->sk_daddr;
1089 	} else {
1090 		const struct iphdr *iph = ip_hdr(skb);
1091 		saddr = iph->saddr;
1092 		daddr = iph->daddr;
1093 	}
1094 
1095 	hp = tcp_get_md5sig_pool();
1096 	if (!hp)
1097 		goto clear_hash_noput;
1098 	desc = &hp->md5_desc;
1099 
1100 	if (crypto_hash_init(desc))
1101 		goto clear_hash;
1102 
1103 	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
1104 		goto clear_hash;
1105 	if (tcp_md5_hash_header(hp, th))
1106 		goto clear_hash;
1107 	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1108 		goto clear_hash;
1109 	if (tcp_md5_hash_key(hp, key))
1110 		goto clear_hash;
1111 	if (crypto_hash_final(desc, md5_hash))
1112 		goto clear_hash;
1113 
1114 	tcp_put_md5sig_pool();
1115 	return 0;
1116 
1117 clear_hash:
1118 	tcp_put_md5sig_pool();
1119 clear_hash_noput:
1120 	memset(md5_hash, 0, 16);
1121 	return 1;
1122 }
1123 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1124 
1125 #endif
1126 
1127 /* Called with rcu_read_lock() */
1128 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1129 				    const struct sk_buff *skb)
1130 {
1131 #ifdef CONFIG_TCP_MD5SIG
1132 	/*
1133 	 * This gets called for each TCP segment that arrives
1134 	 * so we want to be efficient.
1135 	 * We have 3 drop cases:
1136 	 * o No MD5 hash and one expected.
1137 	 * o MD5 hash and we're not expecting one.
1138 	 * o MD5 hash and it's wrong.
1139 	 */
1140 	const __u8 *hash_location = NULL;
1141 	struct tcp_md5sig_key *hash_expected;
1142 	const struct iphdr *iph = ip_hdr(skb);
1143 	const struct tcphdr *th = tcp_hdr(skb);
1144 	int genhash;
1145 	unsigned char newhash[16];
1146 
1147 	hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1148 					  AF_INET);
1149 	hash_location = tcp_parse_md5sig_option(th);
1150 
1151 	/* We've parsed the options - do we have a hash? */
1152 	if (!hash_expected && !hash_location)
1153 		return false;
1154 
1155 	if (hash_expected && !hash_location) {
1156 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1157 		return true;
1158 	}
1159 
1160 	if (!hash_expected && hash_location) {
1161 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1162 		return true;
1163 	}
1164 
1165 	/* Okay, so this is hash_expected and hash_location -
1166 	 * so we need to calculate the checksum.
1167 	 */
1168 	genhash = tcp_v4_md5_hash_skb(newhash,
1169 				      hash_expected,
1170 				      NULL, skb);
1171 
1172 	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1173 		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1174 				     &iph->saddr, ntohs(th->source),
1175 				     &iph->daddr, ntohs(th->dest),
1176 				     genhash ? " tcp_v4_calc_md5_hash failed"
1177 				     : "");
1178 		return true;
1179 	}
1180 	return false;
1181 #endif
1182 	return false;
1183 }
1184 
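/* Fill in the IPv4 part of a freshly allocated request sock from the
 * incoming SYN: addresses, transparent-proxy flag and saved IP options.
 */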
1185 static void tcp_v4_init_req(struct request_sock *req,
1186 			    const struct sock *sk_listener,
1187 			    struct sk_buff *skb)
1188 {
1189 	struct inet_request_sock *ireq = inet_rsk(req);
1190 
1191 	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1192 	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1193 	ireq->no_srccheck = inet_sk(sk_listener)->transparent;
1194 	ireq->opt = tcp_v4_save_options(skb);
1195 }
1196 
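/* Route the SYN-ACK for a request sock; *strict reports whether the chosen
 * route's destination matches the peer address exactly.
 */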
1197 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1198 					  struct flowi *fl,
1199 					  const struct request_sock *req,
1200 					  bool *strict)
1201 {
1202 	struct dst_entry *dst = inet_csk_route_req(sk, &fl->u.ip4, req);
1203 
1204 	if (strict) {
1205 		if (fl->u.ip4.daddr == inet_rsk(req)->ir_rmt_addr)
1206 			*strict = true;
1207 		else
1208 			*strict = false;
1209 	}
1210 
1211 	return dst;
1212 }
1213 
1214 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1215 	.family		=	PF_INET,
1216 	.obj_size	=	sizeof(struct tcp_request_sock),
1217 	.rtx_syn_ack	=	tcp_rtx_synack,
1218 	.send_ack	=	tcp_v4_reqsk_send_ack,
1219 	.destructor	=	tcp_v4_reqsk_destructor,
1220 	.send_reset	=	tcp_v4_send_reset,
1221 	.syn_ack_timeout =	tcp_syn_ack_timeout,
1222 };
1223 
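/* IPv4-specific hooks used by the generic tcp_conn_request() path:
 * request initialization, routing, ISN generation, syncookies (if enabled)
 * and SYN-ACK transmission.
 */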
1224 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1225 	.mss_clamp	=	TCP_MSS_DEFAULT,
1226 #ifdef CONFIG_TCP_MD5SIG
1227 	.req_md5_lookup	=	tcp_v4_md5_lookup,
1228 	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1229 #endif
1230 	.init_req	=	tcp_v4_init_req,
1231 #ifdef CONFIG_SYN_COOKIES
1232 	.cookie_init_seq =	cookie_v4_init_sequence,
1233 #endif
1234 	.route_req	=	tcp_v4_route_req,
1235 	.init_seq	=	tcp_v4_init_sequence,
1236 	.send_synack	=	tcp_v4_send_synack,
1237 };
1238 
1239 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1240 {
1241 	/* Never answer SYNs sent to broadcast or multicast */
1242 	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1243 		goto drop;
1244 
1245 	return tcp_conn_request(&tcp_request_sock_ops,
1246 				&tcp_request_sock_ipv4_ops, sk, skb);
1247 
1248 drop:
1249 	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1250 	return 0;
1251 }
1252 EXPORT_SYMBOL(tcp_v4_conn_request);
1253 
1254 
1255 /*
1256  * The three way handshake has completed - we got a valid synack -
1257  * now create the new socket.
1258  */
1259 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1260 				  struct request_sock *req,
1261 				  struct dst_entry *dst,
1262 				  struct request_sock *req_unhash,
1263 				  bool *own_req)
1264 {
1265 	struct inet_request_sock *ireq;
1266 	struct inet_sock *newinet;
1267 	struct tcp_sock *newtp;
1268 	struct sock *newsk;
1269 #ifdef CONFIG_TCP_MD5SIG
1270 	struct tcp_md5sig_key *key;
1271 #endif
1272 	struct ip_options_rcu *inet_opt;
1273 
1274 	if (sk_acceptq_is_full(sk))
1275 		goto exit_overflow;
1276 
1277 	newsk = tcp_create_openreq_child(sk, req, skb);
1278 	if (!newsk)
1279 		goto exit_nonewsk;
1280 
1281 	newsk->sk_gso_type = SKB_GSO_TCPV4;
1282 	inet_sk_rx_dst_set(newsk, skb);
1283 
1284 	newtp		      = tcp_sk(newsk);
1285 	newinet		      = inet_sk(newsk);
1286 	ireq		      = inet_rsk(req);
1287 	sk_daddr_set(newsk, ireq->ir_rmt_addr);
1288 	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1289 	newsk->sk_bound_dev_if = ireq->ir_iif;
1290 	newinet->inet_saddr	      = ireq->ir_loc_addr;
1291 	inet_opt	      = ireq->opt;
1292 	rcu_assign_pointer(newinet->inet_opt, inet_opt);
1293 	ireq->opt	      = NULL;
1294 	newinet->mc_index     = inet_iif(skb);
1295 	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1296 	newinet->rcv_tos      = ip_hdr(skb)->tos;
1297 	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1298 	if (inet_opt)
1299 		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1300 	newinet->inet_id = newtp->write_seq ^ jiffies;
1301 
1302 	if (!dst) {
1303 		dst = inet_csk_route_child_sock(sk, newsk, req);
1304 		if (!dst)
1305 			goto put_and_exit;
1306 	} else {
1307 		/* syncookie case : see end of cookie_v4_check() */
1308 	}
1309 	sk_setup_caps(newsk, dst);
1310 
1311 	tcp_ca_openreq_child(newsk, dst);
1312 
1313 	tcp_sync_mss(newsk, dst_mtu(dst));
1314 	newtp->advmss = dst_metric_advmss(dst);
1315 	if (tcp_sk(sk)->rx_opt.user_mss &&
1316 	    tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1317 		newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1318 
1319 	tcp_initialize_rcv_mss(newsk);
1320 
1321 #ifdef CONFIG_TCP_MD5SIG
1322 	/* Copy over the MD5 key from the original socket */
1323 	key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1324 				AF_INET);
1325 	if (key) {
1326 		/*
1327 		 * We're using one, so create a matching key
1328 		 * on the newsk structure. If we fail to get
1329 		 * memory, then we end up not copying the key
1330 		 * across. Shucks.
1331 		 */
1332 		tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1333 			       AF_INET, key->key, key->keylen, GFP_ATOMIC);
1334 		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1335 	}
1336 #endif
1337 
1338 	if (__inet_inherit_port(sk, newsk) < 0)
1339 		goto put_and_exit;
1340 	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
1341 	if (*own_req)
1342 		tcp_move_syn(newtp, req);
1343 
1344 	return newsk;
1345 
1346 exit_overflow:
1347 	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1348 exit_nonewsk:
1349 	dst_release(dst);
1350 exit:
1351 	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1352 	return NULL;
1353 put_and_exit:
1354 	inet_csk_prepare_forced_close(newsk);
1355 	tcp_done(newsk);
1356 	goto exit;
1357 }
1358 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1359 
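/* On a listener, a segment without a SYN may be the final ACK of a
 * syncookie handshake; let cookie_v4_check() validate it and possibly
 * create the child socket.
 */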
1360 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1361 {
1362 #ifdef CONFIG_SYN_COOKIES
1363 	const struct tcphdr *th = tcp_hdr(skb);
1364 
1365 	if (!th->syn)
1366 		sk = cookie_v4_check(sk, skb);
1367 #endif
1368 	return sk;
1369 }
1370 
1371 /* The socket must have its spinlock held when we get
1372  * here, unless it is a TCP_LISTEN socket.
1373  *
1374  * We have a potential double-lock case here, so even when
1375  * doing backlog processing we use the BH locking scheme.
1376  * This is because we cannot sleep with the original spinlock
1377  * held.
1378  */
1379 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1380 {
1381 	struct sock *rsk;
1382 
1383 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1384 		struct dst_entry *dst = sk->sk_rx_dst;
1385 
1386 		sock_rps_save_rxhash(sk, skb);
1387 		sk_mark_napi_id(sk, skb);
1388 		if (dst) {
1389 			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1390 			    !dst->ops->check(dst, 0)) {
1391 				dst_release(dst);
1392 				sk->sk_rx_dst = NULL;
1393 			}
1394 		}
1395 		tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len);
1396 		return 0;
1397 	}
1398 
1399 	if (tcp_checksum_complete(skb))
1400 		goto csum_err;
1401 
1402 	if (sk->sk_state == TCP_LISTEN) {
1403 		struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1404 
1405 		if (!nsk)
1406 			goto discard;
1407 		if (nsk != sk) {
1408 			sock_rps_save_rxhash(nsk, skb);
1409 			sk_mark_napi_id(nsk, skb);
1410 			if (tcp_child_process(sk, nsk, skb)) {
1411 				rsk = nsk;
1412 				goto reset;
1413 			}
1414 			return 0;
1415 		}
1416 	} else
1417 		sock_rps_save_rxhash(sk, skb);
1418 
1419 	if (tcp_rcv_state_process(sk, skb)) {
1420 		rsk = sk;
1421 		goto reset;
1422 	}
1423 	return 0;
1424 
1425 reset:
1426 	tcp_v4_send_reset(rsk, skb);
1427 discard:
1428 	kfree_skb(skb);
1429 	/* Be careful here. If this function gets more complicated and
1430 	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1431 	 * might be destroyed here. This current version compiles correctly,
1432 	 * but you have been warned.
1433 	 */
1434 	return 0;
1435 
1436 csum_err:
1437 	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS);
1438 	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1439 	goto discard;
1440 }
1441 EXPORT_SYMBOL(tcp_v4_do_rcv);
1442 
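/* Early demux: look up the established socket for this packet before the
 * routing decision, attach it to the skb and reuse the socket's cached
 * input route when it matches the incoming interface.
 */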
1443 void tcp_v4_early_demux(struct sk_buff *skb)
1444 {
1445 	const struct iphdr *iph;
1446 	const struct tcphdr *th;
1447 	struct sock *sk;
1448 
1449 	if (skb->pkt_type != PACKET_HOST)
1450 		return;
1451 
1452 	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1453 		return;
1454 
1455 	iph = ip_hdr(skb);
1456 	th = tcp_hdr(skb);
1457 
1458 	if (th->doff < sizeof(struct tcphdr) / 4)
1459 		return;
1460 
1461 	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1462 				       iph->saddr, th->source,
1463 				       iph->daddr, ntohs(th->dest),
1464 				       skb->skb_iif);
1465 	if (sk) {
1466 		skb->sk = sk;
1467 		skb->destructor = sock_edemux;
1468 		if (sk_fullsock(sk)) {
1469 			struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1470 
1471 			if (dst)
1472 				dst = dst_check(dst, 0);
1473 			if (dst &&
1474 			    inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1475 				skb_dst_set_noref(skb, dst);
1476 		}
1477 	}
1478 }
1479 
1480 /* Packet is added to VJ-style prequeue for processing in process
1481  * context, if a reader task is waiting. Apparently, this exciting
1482  * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93)
1483  * failed somewhere. Latency? Burstiness? Well, at least now we will
1484  * see, why it failed. 8)8)				  --ANK
1485  *
1486  */
1487 bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
1488 {
1489 	struct tcp_sock *tp = tcp_sk(sk);
1490 
1491 	if (sysctl_tcp_low_latency || !tp->ucopy.task)
1492 		return false;
1493 
1494 	if (skb->len <= tcp_hdrlen(skb) &&
1495 	    skb_queue_len(&tp->ucopy.prequeue) == 0)
1496 		return false;
1497 
1498 	/* Before escaping RCU protected region, we need to take care of skb
1499 	 * dst. Prequeue is only enabled for established sockets.
1500 	 * For such sockets, we might need the skb dst only to set sk->sk_rx_dst.
1501 	 * Instead of doing full sk_rx_dst validity here, let's perform
1502 	 * an optimistic check.
1503 	 */
1504 	if (likely(sk->sk_rx_dst))
1505 		skb_dst_drop(skb);
1506 	else
1507 		skb_dst_force_safe(skb);
1508 
1509 	__skb_queue_tail(&tp->ucopy.prequeue, skb);
1510 	tp->ucopy.memory += skb->truesize;
1511 	if (tp->ucopy.memory > sk->sk_rcvbuf) {
1512 		struct sk_buff *skb1;
1513 
1514 		BUG_ON(sock_owned_by_user(sk));
1515 
1516 		while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) {
1517 			sk_backlog_rcv(sk, skb1);
1518 			NET_INC_STATS_BH(sock_net(sk),
1519 					 LINUX_MIB_TCPPREQUEUEDROPPED);
1520 		}
1521 
1522 		tp->ucopy.memory = 0;
1523 	} else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
1524 		wake_up_interruptible_sync_poll(sk_sleep(sk),
1525 					   POLLIN | POLLRDNORM | POLLRDBAND);
1526 		if (!inet_csk_ack_scheduled(sk))
1527 			inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
1528 						  (3 * tcp_rto_min(sk)) / 4,
1529 						  TCP_RTO_MAX);
1530 	}
1531 	return true;
1532 }
1533 EXPORT_SYMBOL(tcp_prequeue);
1534 
1535 /*
1536  *	From tcp_input.c
1537  */
1538 
1539 int tcp_v4_rcv(struct sk_buff *skb)
1540 {
1541 	const struct iphdr *iph;
1542 	const struct tcphdr *th;
1543 	struct sock *sk;
1544 	int ret;
1545 	struct net *net = dev_net(skb->dev);
1546 
1547 	if (skb->pkt_type != PACKET_HOST)
1548 		goto discard_it;
1549 
1550 	/* Count it even if it's bad */
1551 	TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1552 
1553 	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1554 		goto discard_it;
1555 
1556 	th = tcp_hdr(skb);
1557 
1558 	if (th->doff < sizeof(struct tcphdr) / 4)
1559 		goto bad_packet;
1560 	if (!pskb_may_pull(skb, th->doff * 4))
1561 		goto discard_it;
1562 
1563 	/* An explanation is required here, I think.
1564 	 * Packet length and doff are validated by header prediction,
1565 	 * provided the case of th->doff == 0 is eliminated.
1566 	 * So, we defer the checks. */
1567 
1568 	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1569 		goto csum_error;
1570 
1571 	th = tcp_hdr(skb);
1572 	iph = ip_hdr(skb);
1573 	/* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
1574 	 * barrier() makes sure the compiler won't play fool^Waliasing games.
1575 	 */
1576 	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1577 		sizeof(struct inet_skb_parm));
1578 	barrier();
1579 
1580 	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1581 	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1582 				    skb->len - th->doff * 4);
1583 	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1584 	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1585 	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1586 	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1587 	TCP_SKB_CB(skb)->sacked	 = 0;
1588 
1589 lookup:
1590 	sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
1591 	if (!sk)
1592 		goto no_tcp_socket;
1593 
1594 process:
1595 	if (sk->sk_state == TCP_TIME_WAIT)
1596 		goto do_time_wait;
1597 
1598 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
1599 		struct request_sock *req = inet_reqsk(sk);
1600 		struct sock *nsk;
1601 
1602 		sk = req->rsk_listener;
1603 		if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
1604 			reqsk_put(req);
1605 			goto discard_it;
1606 		}
1607 		if (unlikely(sk->sk_state != TCP_LISTEN)) {
1608 			inet_csk_reqsk_queue_drop_and_put(sk, req);
1609 			goto lookup;
1610 		}
1611 		sock_hold(sk);
1612 		nsk = tcp_check_req(sk, skb, req, false);
1613 		if (!nsk) {
1614 			reqsk_put(req);
1615 			goto discard_and_relse;
1616 		}
1617 		if (nsk == sk) {
1618 			reqsk_put(req);
1619 		} else if (tcp_child_process(sk, nsk, skb)) {
1620 			tcp_v4_send_reset(nsk, skb);
1621 			goto discard_and_relse;
1622 		} else {
1623 			sock_put(sk);
1624 			return 0;
1625 		}
1626 	}
1627 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1628 		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
1629 		goto discard_and_relse;
1630 	}
1631 
1632 	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1633 		goto discard_and_relse;
1634 
1635 	if (tcp_v4_inbound_md5_hash(sk, skb))
1636 		goto discard_and_relse;
1637 
1638 	nf_reset(skb);
1639 
1640 	if (sk_filter(sk, skb))
1641 		goto discard_and_relse;
1642 
1643 	skb->dev = NULL;
1644 
1645 	if (sk->sk_state == TCP_LISTEN) {
1646 		ret = tcp_v4_do_rcv(sk, skb);
1647 		goto put_and_return;
1648 	}
1649 
1650 	sk_incoming_cpu_update(sk);
1651 
1652 	bh_lock_sock_nested(sk);
1653 	tcp_sk(sk)->segs_in += max_t(u16, 1, skb_shinfo(skb)->gso_segs);
1654 	ret = 0;
1655 	if (!sock_owned_by_user(sk)) {
1656 		if (!tcp_prequeue(sk, skb))
1657 			ret = tcp_v4_do_rcv(sk, skb);
1658 	} else if (unlikely(sk_add_backlog(sk, skb,
1659 					   sk->sk_rcvbuf + sk->sk_sndbuf))) {
1660 		bh_unlock_sock(sk);
1661 		NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
1662 		goto discard_and_relse;
1663 	}
1664 	bh_unlock_sock(sk);
1665 
1666 put_and_return:
1667 	sock_put(sk);
1668 
1669 	return ret;
1670 
1671 no_tcp_socket:
1672 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1673 		goto discard_it;
1674 
1675 	if (tcp_checksum_complete(skb)) {
1676 csum_error:
1677 		TCP_INC_STATS_BH(net, TCP_MIB_CSUMERRORS);
1678 bad_packet:
1679 		TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1680 	} else {
1681 		tcp_v4_send_reset(NULL, skb);
1682 	}
1683 
1684 discard_it:
1685 	/* Discard frame. */
1686 	kfree_skb(skb);
1687 	return 0;
1688 
1689 discard_and_relse:
1690 	sock_put(sk);
1691 	goto discard_it;
1692 
1693 do_time_wait:
1694 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1695 		inet_twsk_put(inet_twsk(sk));
1696 		goto discard_it;
1697 	}
1698 
1699 	if (tcp_checksum_complete(skb)) {
1700 		inet_twsk_put(inet_twsk(sk));
1701 		goto csum_error;
1702 	}
1703 	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1704 	case TCP_TW_SYN: {
1705 		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1706 							&tcp_hashinfo,
1707 							iph->saddr, th->source,
1708 							iph->daddr, th->dest,
1709 							inet_iif(skb));
1710 		if (sk2) {
1711 			inet_twsk_deschedule_put(inet_twsk(sk));
1712 			sk = sk2;
1713 			goto process;
1714 		}
1715 		/* Fall through to ACK */
1716 	}
1717 	case TCP_TW_ACK:
1718 		tcp_v4_timewait_ack(sk, skb);
1719 		break;
1720 	case TCP_TW_RST:
1721 		tcp_v4_send_reset(sk, skb);
1722 		inet_twsk_deschedule_put(inet_twsk(sk));
1723 		goto discard_it;
1724 	case TCP_TW_SUCCESS:;
1725 	}
1726 	goto discard_it;
1727 }
1728 
1729 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1730 	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
1731 	.twsk_unique	= tcp_twsk_unique,
1732 	.twsk_destructor= tcp_twsk_destructor,
1733 };
1734 
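/* Cache the skb's input route on the socket, together with the interface
 * it arrived on, for use by the receive fast path.
 */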
1735 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
1736 {
1737 	struct dst_entry *dst = skb_dst(skb);
1738 
1739 	if (dst && dst_hold_safe(dst)) {
1740 		sk->sk_rx_dst = dst;
1741 		inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
1742 	}
1743 }
1744 EXPORT_SYMBOL(inet_sk_rx_dst_set);
1745 
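/* AF_INET operations for TCP connection sockets: transmit, checksumming,
 * header sizes, socket options and address conversion.
 */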
1746 const struct inet_connection_sock_af_ops ipv4_specific = {
1747 	.queue_xmit	   = ip_queue_xmit,
1748 	.send_check	   = tcp_v4_send_check,
1749 	.rebuild_header	   = inet_sk_rebuild_header,
1750 	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
1751 	.conn_request	   = tcp_v4_conn_request,
1752 	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
1753 	.net_header_len	   = sizeof(struct iphdr),
1754 	.setsockopt	   = ip_setsockopt,
1755 	.getsockopt	   = ip_getsockopt,
1756 	.addr2sockaddr	   = inet_csk_addr2sockaddr,
1757 	.sockaddr_len	   = sizeof(struct sockaddr_in),
1758 	.bind_conflict	   = inet_csk_bind_conflict,
1759 #ifdef CONFIG_COMPAT
1760 	.compat_setsockopt = compat_ip_setsockopt,
1761 	.compat_getsockopt = compat_ip_getsockopt,
1762 #endif
1763 	.mtu_reduced	   = tcp_v4_mtu_reduced,
1764 };
1765 EXPORT_SYMBOL(ipv4_specific);
1766 
1767 #ifdef CONFIG_TCP_MD5SIG
1768 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1769 	.md5_lookup		= tcp_v4_md5_lookup,
1770 	.calc_md5_hash		= tcp_v4_md5_hash_skb,
1771 	.md5_parse		= tcp_v4_parse_md5_keys,
1772 };
1773 #endif
1774 
1775 /* NOTE: A lot of things set to zero explicitly by call to
1776  *       sk_alloc() so need not be done here.
1777  */
1778 static int tcp_v4_init_sock(struct sock *sk)
1779 {
1780 	struct inet_connection_sock *icsk = inet_csk(sk);
1781 
1782 	tcp_init_sock(sk);
1783 
1784 	icsk->icsk_af_ops = &ipv4_specific;
1785 
1786 #ifdef CONFIG_TCP_MD5SIG
1787 	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
1788 #endif
1789 
1790 	return 0;
1791 }
1792 
1793 void tcp_v4_destroy_sock(struct sock *sk)
1794 {
1795 	struct tcp_sock *tp = tcp_sk(sk);
1796 
1797 	tcp_clear_xmit_timers(sk);
1798 
1799 	tcp_cleanup_congestion_control(sk);
1800 
1801 	/* Clean up the write buffer. */
1802 	tcp_write_queue_purge(sk);
1803 
1804 	/* Cleans up our, hopefully empty, out_of_order_queue. */
1805 	__skb_queue_purge(&tp->out_of_order_queue);
1806 
1807 #ifdef CONFIG_TCP_MD5SIG
1808 	/* Clean up the MD5 key list, if any */
1809 	if (tp->md5sig_info) {
1810 		tcp_clear_md5_list(sk);
1811 		kfree_rcu(tp->md5sig_info, rcu);
1812 		tp->md5sig_info = NULL;
1813 	}
1814 #endif
1815 
1816 	/* Clean prequeue, it must be empty really */
1817 	__skb_queue_purge(&tp->ucopy.prequeue);
1818 
1819 	/* Clean up a referenced TCP bind bucket. */
1820 	if (inet_csk(sk)->icsk_bind_hash)
1821 		inet_put_port(sk);
1822 
1823 	BUG_ON(tp->fastopen_rsk);
1824 
1825 	/* If socket is aborted during connect operation */
1826 	tcp_free_fastopen_req(tp);
1827 	tcp_saved_syn_free(tp);
1828 
1829 	sk_sockets_allocated_dec(sk);
1830 
1831 	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
1832 		sock_release_memcg(sk);
1833 }
1834 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1835 
1836 #ifdef CONFIG_PROC_FS
1837 /* Proc filesystem TCP sock list dumping. */
1838 
1839 /*
1840  * Get the next listener socket following cur.  If cur is NULL, get the first
1841  * socket starting from the bucket given in st->bucket; when st->bucket is zero the
1842  * very first socket in the hash table is returned.
1843  */
1844 static void *listening_get_next(struct seq_file *seq, void *cur)
1845 {
1846 	struct inet_connection_sock *icsk;
1847 	struct hlist_nulls_node *node;
1848 	struct sock *sk = cur;
1849 	struct inet_listen_hashbucket *ilb;
1850 	struct tcp_iter_state *st = seq->private;
1851 	struct net *net = seq_file_net(seq);
1852 
1853 	if (!sk) {
1854 		ilb = &tcp_hashinfo.listening_hash[st->bucket];
1855 		spin_lock_bh(&ilb->lock);
1856 		sk = sk_nulls_head(&ilb->head);
1857 		st->offset = 0;
1858 		goto get_sk;
1859 	}
1860 	ilb = &tcp_hashinfo.listening_hash[st->bucket];
1861 	++st->num;
1862 	++st->offset;
1863 
1864 	sk = sk_nulls_next(sk);
1865 get_sk:
1866 	sk_nulls_for_each_from(sk, node) {
1867 		if (!net_eq(sock_net(sk), net))
1868 			continue;
1869 		if (sk->sk_family == st->family) {
1870 			cur = sk;
1871 			goto out;
1872 		}
1873 		icsk = inet_csk(sk);
1874 	}
1875 	spin_unlock_bh(&ilb->lock);
1876 	st->offset = 0;
1877 	if (++st->bucket < INET_LHTABLE_SIZE) {
1878 		ilb = &tcp_hashinfo.listening_hash[st->bucket];
1879 		spin_lock_bh(&ilb->lock);
1880 		sk = sk_nulls_head(&ilb->head);
1881 		goto get_sk;
1882 	}
1883 	cur = NULL;
1884 out:
1885 	return cur;
1886 }
1887 
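/*
 * Position the listening-hash iterator at logical offset *pos by starting
 * from the first bucket and repeatedly calling listening_get_next(); used
 * by the seq_file start path.
 */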
1888 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1889 {
1890 	struct tcp_iter_state *st = seq->private;
1891 	void *rc;
1892 
1893 	st->bucket = 0;
1894 	st->offset = 0;
1895 	rc = listening_get_next(seq, NULL);
1896 
1897 	while (rc && *pos) {
1898 		rc = listening_get_next(seq, rc);
1899 		--*pos;
1900 	}
1901 	return rc;
1902 }
1903 
1904 static inline bool empty_bucket(const struct tcp_iter_state *st)
1905 {
1906 	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
1907 }
1908 
1909 /*
1910  * Get the first established socket starting from the bucket given in st->bucket.
1911  * If st->bucket is zero, the very first socket in the hash is returned.
1912  */
1913 static void *established_get_first(struct seq_file *seq)
1914 {
1915 	struct tcp_iter_state *st = seq->private;
1916 	struct net *net = seq_file_net(seq);
1917 	void *rc = NULL;
1918 
1919 	st->offset = 0;
1920 	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
1921 		struct sock *sk;
1922 		struct hlist_nulls_node *node;
1923 		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
1924 
1925 		/* Lockless fast path for the common case of empty buckets */
1926 		if (empty_bucket(st))
1927 			continue;
1928 
1929 		spin_lock_bh(lock);
1930 		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
1931 			if (sk->sk_family != st->family ||
1932 			    !net_eq(sock_net(sk), net)) {
1933 				continue;
1934 			}
1935 			rc = sk;
1936 			goto out;
1937 		}
1938 		spin_unlock_bh(lock);
1939 	}
1940 out:
1941 	return rc;
1942 }
1943 
1944 static void *established_get_next(struct seq_file *seq, void *cur)
1945 {
1946 	struct sock *sk = cur;
1947 	struct hlist_nulls_node *node;
1948 	struct tcp_iter_state *st = seq->private;
1949 	struct net *net = seq_file_net(seq);
1950 
1951 	++st->num;
1952 	++st->offset;
1953 
1954 	sk = sk_nulls_next(sk);
1955 
1956 	sk_nulls_for_each_from(sk, node) {
1957 		if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
1958 			return sk;
1959 	}
1960 
1961 	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
1962 	++st->bucket;
1963 	return established_get_first(seq);
1964 }
1965 
1966 static void *established_get_idx(struct seq_file *seq, loff_t pos)
1967 {
1968 	struct tcp_iter_state *st = seq->private;
1969 	void *rc;
1970 
1971 	st->bucket = 0;
1972 	rc = established_get_first(seq);
1973 
1974 	while (rc && pos) {
1975 		rc = established_get_next(seq, rc);
1976 		--pos;
1977 	}
1978 	return rc;
1979 }
1980 
1981 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
1982 {
1983 	void *rc;
1984 	struct tcp_iter_state *st = seq->private;
1985 
1986 	st->state = TCP_SEQ_STATE_LISTENING;
1987 	rc	  = listening_get_idx(seq, &pos);
1988 
1989 	if (!rc) {
1990 		st->state = TCP_SEQ_STATE_ESTABLISHED;
1991 		rc	  = established_get_idx(seq, pos);
1992 	}
1993 
1994 	return rc;
1995 }
1996 
1997 static void *tcp_seek_last_pos(struct seq_file *seq)
1998 {
1999 	struct tcp_iter_state *st = seq->private;
2000 	int offset = st->offset;
2001 	int orig_num = st->num;
2002 	void *rc = NULL;
2003 
2004 	switch (st->state) {
2005 	case TCP_SEQ_STATE_LISTENING:
2006 		if (st->bucket >= INET_LHTABLE_SIZE)
2007 			break;
2008 		st->state = TCP_SEQ_STATE_LISTENING;
2009 		rc = listening_get_next(seq, NULL);
2010 		while (offset-- && rc)
2011 			rc = listening_get_next(seq, rc);
2012 		if (rc)
2013 			break;
2014 		st->bucket = 0;
2015 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2016 		/* Fallthrough */
2017 	case TCP_SEQ_STATE_ESTABLISHED:
2018 		if (st->bucket > tcp_hashinfo.ehash_mask)
2019 			break;
2020 		rc = established_get_first(seq);
2021 		while (offset-- && rc)
2022 			rc = established_get_next(seq, rc);
2023 	}
2024 
2025 	st->num = orig_num;
2026 
2027 	return rc;
2028 }
2029 
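/*
 * seq_file glue.  A minimal sketch of how the seq_file core drives this
 * iterator (the real loop in fs/seq_file.c additionally manages the
 * output buffer and partial reads):
 *
 *	p = start(seq, &pos);
 *	while (p) {
 *		show(seq, p);
 *		p = next(seq, p, &pos);
 *	}
 *	stop(seq, p);
 *
 * Because /proc/net/tcp is usually read in several chunks, st->last_pos
 * lets tcp_seq_start() resume from the previously reached bucket/offset
 * via tcp_seek_last_pos() instead of rescanning both hash tables from
 * the beginning on every read().
 */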
2030 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2031 {
2032 	struct tcp_iter_state *st = seq->private;
2033 	void *rc;
2034 
2035 	if (*pos && *pos == st->last_pos) {
2036 		rc = tcp_seek_last_pos(seq);
2037 		if (rc)
2038 			goto out;
2039 	}
2040 
2041 	st->state = TCP_SEQ_STATE_LISTENING;
2042 	st->num = 0;
2043 	st->bucket = 0;
2044 	st->offset = 0;
2045 	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2046 
2047 out:
2048 	st->last_pos = *pos;
2049 	return rc;
2050 }
2051 
2052 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2053 {
2054 	struct tcp_iter_state *st = seq->private;
2055 	void *rc = NULL;
2056 
2057 	if (v == SEQ_START_TOKEN) {
2058 		rc = tcp_get_idx(seq, 0);
2059 		goto out;
2060 	}
2061 
2062 	switch (st->state) {
2063 	case TCP_SEQ_STATE_LISTENING:
2064 		rc = listening_get_next(seq, v);
2065 		if (!rc) {
2066 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2067 			st->bucket = 0;
2068 			st->offset = 0;
2069 			rc	  = established_get_first(seq);
2070 		}
2071 		break;
2072 	case TCP_SEQ_STATE_ESTABLISHED:
2073 		rc = established_get_next(seq, v);
2074 		break;
2075 	}
2076 out:
2077 	++*pos;
2078 	st->last_pos = *pos;
2079 	return rc;
2080 }
2081 
2082 static void tcp_seq_stop(struct seq_file *seq, void *v)
2083 {
2084 	struct tcp_iter_state *st = seq->private;
2085 
2086 	switch (st->state) {
2087 	case TCP_SEQ_STATE_LISTENING:
2088 		if (v != SEQ_START_TOKEN)
2089 			spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2090 		break;
2091 	case TCP_SEQ_STATE_ESTABLISHED:
2092 		if (v)
2093 			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2094 		break;
2095 	}
2096 }
2097 
2098 int tcp_seq_open(struct inode *inode, struct file *file)
2099 {
2100 	struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
2101 	struct tcp_iter_state *s;
2102 	int err;
2103 
2104 	err = seq_open_net(inode, file, &afinfo->seq_ops,
2105 			  sizeof(struct tcp_iter_state));
2106 	if (err < 0)
2107 		return err;
2108 
2109 	s = ((struct seq_file *)file->private_data)->private;
2110 	s->family		= afinfo->family;
2111 	s->last_pos		= 0;
2112 	return 0;
2113 }
2114 EXPORT_SYMBOL(tcp_seq_open);
2115 
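/*
 * Create the per-namespace /proc/net/<afinfo->name> entry ("tcp" for IPv4,
 * see tcp4_seq_afinfo below).  The start/next/stop callbacks are common to
 * all address families and are filled in here; each afinfo only has to
 * supply ->show and the file_operations used to open the entry.
 */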
2116 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2117 {
2118 	int rc = 0;
2119 	struct proc_dir_entry *p;
2120 
2121 	afinfo->seq_ops.start		= tcp_seq_start;
2122 	afinfo->seq_ops.next		= tcp_seq_next;
2123 	afinfo->seq_ops.stop		= tcp_seq_stop;
2124 
2125 	p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2126 			     afinfo->seq_fops, afinfo);
2127 	if (!p)
2128 		rc = -ENOMEM;
2129 	return rc;
2130 }
2131 EXPORT_SYMBOL(tcp_proc_register);
2132 
2133 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2134 {
2135 	remove_proc_entry(afinfo->name, net->proc_net);
2136 }
2137 EXPORT_SYMBOL(tcp_proc_unregister);
2138 
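/*
 * Dump a pending connection request.  tcp4_seq_show() calls this for
 * sockets in TCP_NEW_SYN_RECV state; the entry is reported with the
 * legacy TCP_SYN_RECV state, only the SYN-ACK retransmit timer is
 * meaningful, and the queue sizes and inode are shown as zero.
 */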
2139 static void get_openreq4(const struct request_sock *req,
2140 			 struct seq_file *f, int i)
2141 {
2142 	const struct inet_request_sock *ireq = inet_rsk(req);
2143 	long delta = req->rsk_timer.expires - jiffies;
2144 
2145 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2146 		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2147 		i,
2148 		ireq->ir_loc_addr,
2149 		ireq->ir_num,
2150 		ireq->ir_rmt_addr,
2151 		ntohs(ireq->ir_rmt_port),
2152 		TCP_SYN_RECV,
2153 		0, 0, /* could print option size, but that is af dependent. */
2154 		1,    /* timers active (only the expire timer) */
2155 		jiffies_delta_to_clock_t(delta),
2156 		req->num_timeout,
2157 		from_kuid_munged(seq_user_ns(f),
2158 				 sock_i_uid(req->rsk_listener)),
2159 		0,  /* non standard timer */
2160 		0, /* open_requests have no inode */
2161 		0,
2162 		req);
2163 }
2164 
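/*
 * Dump a full socket.  The "tr" (timer) field follows the historic
 * /proc/net/tcp encoding visible below: 1 retransmit/loss probe,
 * 2 keepalive, 4 zero-window probe, 0 none (3 is used for TIME_WAIT
 * entries, see get_timewait4_sock()).
 */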
2165 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2166 {
2167 	int timer_active;
2168 	unsigned long timer_expires;
2169 	const struct tcp_sock *tp = tcp_sk(sk);
2170 	const struct inet_connection_sock *icsk = inet_csk(sk);
2171 	const struct inet_sock *inet = inet_sk(sk);
2172 	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2173 	__be32 dest = inet->inet_daddr;
2174 	__be32 src = inet->inet_rcv_saddr;
2175 	__u16 destp = ntohs(inet->inet_dport);
2176 	__u16 srcp = ntohs(inet->inet_sport);
2177 	int rx_queue;
2178 	int state;
2179 
2180 	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2181 	    icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
2182 	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2183 		timer_active	= 1;
2184 		timer_expires	= icsk->icsk_timeout;
2185 	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2186 		timer_active	= 4;
2187 		timer_expires	= icsk->icsk_timeout;
2188 	} else if (timer_pending(&sk->sk_timer)) {
2189 		timer_active	= 2;
2190 		timer_expires	= sk->sk_timer.expires;
2191 	} else {
2192 		timer_active	= 0;
2193 		timer_expires = jiffies;
2194 	}
2195 
2196 	state = sk_state_load(sk);
2197 	if (state == TCP_LISTEN)
2198 		rx_queue = sk->sk_ack_backlog;
2199 	else
2200 		/* Because we don't lock the socket,
2201 		 * we might find a transient negative value.
2202 		 */
2203 		rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2204 
2205 	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2206 			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2207 		i, src, srcp, dest, destp, state,
2208 		tp->write_seq - tp->snd_una,
2209 		rx_queue,
2210 		timer_active,
2211 		jiffies_delta_to_clock_t(timer_expires - jiffies),
2212 		icsk->icsk_retransmits,
2213 		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2214 		icsk->icsk_probes_out,
2215 		sock_i_ino(sk),
2216 		atomic_read(&sk->sk_refcnt), sk,
2217 		jiffies_to_clock_t(icsk->icsk_rto),
2218 		jiffies_to_clock_t(icsk->icsk_ack.ato),
2219 		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2220 		tp->snd_cwnd,
2221 		state == TCP_LISTEN ?
2222 		    fastopenq->max_qlen :
2223 		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2224 }
2225 
2226 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2227 			       struct seq_file *f, int i)
2228 {
2229 	long delta = tw->tw_timer.expires - jiffies;
2230 	__be32 dest, src;
2231 	__u16 destp, srcp;
2232 
2233 	dest  = tw->tw_daddr;
2234 	src   = tw->tw_rcv_saddr;
2235 	destp = ntohs(tw->tw_dport);
2236 	srcp  = ntohs(tw->tw_sport);
2237 
2238 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2239 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2240 		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2241 		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2242 		atomic_read(&tw->tw_refcnt), tw);
2243 }
2244 
2245 #define TMPSZ 150
2246 
2247 static int tcp4_seq_show(struct seq_file *seq, void *v)
2248 {
2249 	struct tcp_iter_state *st;
2250 	struct sock *sk = v;
2251 
2252 	seq_setwidth(seq, TMPSZ - 1);
2253 	if (v == SEQ_START_TOKEN) {
2254 		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2255 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2256 			   "inode");
2257 		goto out;
2258 	}
2259 	st = seq->private;
2260 
2261 	if (sk->sk_state == TCP_TIME_WAIT)
2262 		get_timewait4_sock(v, seq, st->num);
2263 	else if (sk->sk_state == TCP_NEW_SYN_RECV)
2264 		get_openreq4(v, seq, st->num);
2265 	else
2266 		get_tcp4_sock(v, seq, st->num);
2267 out:
2268 	seq_pad(seq, '\n');
2269 	return 0;
2270 }
2271 
2272 static const struct file_operations tcp_afinfo_seq_fops = {
2273 	.owner   = THIS_MODULE,
2274 	.open    = tcp_seq_open,
2275 	.read    = seq_read,
2276 	.llseek  = seq_lseek,
2277 	.release = seq_release_net
2278 };
2279 
2280 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2281 	.name		= "tcp",
2282 	.family		= AF_INET,
2283 	.seq_fops	= &tcp_afinfo_seq_fops,
2284 	.seq_ops	= {
2285 		.show		= tcp4_seq_show,
2286 	},
2287 };
2288 
2289 static int __net_init tcp4_proc_init_net(struct net *net)
2290 {
2291 	return tcp_proc_register(net, &tcp4_seq_afinfo);
2292 }
2293 
2294 static void __net_exit tcp4_proc_exit_net(struct net *net)
2295 {
2296 	tcp_proc_unregister(net, &tcp4_seq_afinfo);
2297 }
2298 
2299 static struct pernet_operations tcp4_net_ops = {
2300 	.init = tcp4_proc_init_net,
2301 	.exit = tcp4_proc_exit_net,
2302 };
2303 
2304 int __init tcp4_proc_init(void)
2305 {
2306 	return register_pernet_subsys(&tcp4_net_ops);
2307 }
2308 
2309 void tcp4_proc_exit(void)
2310 {
2311 	unregister_pernet_subsys(&tcp4_net_ops);
2312 }
2313 #endif /* CONFIG_PROC_FS */
2314 
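/*
 * The IPv4 TCP protocol descriptor hooked up to the socket layer
 * (af_inet.c registers it for SOCK_STREAM/IPPROTO_TCP); it routes
 * socket-level operations to the TCP implementations above.
 */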
2315 struct proto tcp_prot = {
2316 	.name			= "TCP",
2317 	.owner			= THIS_MODULE,
2318 	.close			= tcp_close,
2319 	.connect		= tcp_v4_connect,
2320 	.disconnect		= tcp_disconnect,
2321 	.accept			= inet_csk_accept,
2322 	.ioctl			= tcp_ioctl,
2323 	.init			= tcp_v4_init_sock,
2324 	.destroy		= tcp_v4_destroy_sock,
2325 	.shutdown		= tcp_shutdown,
2326 	.setsockopt		= tcp_setsockopt,
2327 	.getsockopt		= tcp_getsockopt,
2328 	.recvmsg		= tcp_recvmsg,
2329 	.sendmsg		= tcp_sendmsg,
2330 	.sendpage		= tcp_sendpage,
2331 	.backlog_rcv		= tcp_v4_do_rcv,
2332 	.release_cb		= tcp_release_cb,
2333 	.hash			= inet_hash,
2334 	.unhash			= inet_unhash,
2335 	.get_port		= inet_csk_get_port,
2336 	.enter_memory_pressure	= tcp_enter_memory_pressure,
2337 	.stream_memory_free	= tcp_stream_memory_free,
2338 	.sockets_allocated	= &tcp_sockets_allocated,
2339 	.orphan_count		= &tcp_orphan_count,
2340 	.memory_allocated	= &tcp_memory_allocated,
2341 	.memory_pressure	= &tcp_memory_pressure,
2342 	.sysctl_mem		= sysctl_tcp_mem,
2343 	.sysctl_wmem		= sysctl_tcp_wmem,
2344 	.sysctl_rmem		= sysctl_tcp_rmem,
2345 	.max_header		= MAX_TCP_HEADER,
2346 	.obj_size		= sizeof(struct tcp_sock),
2347 	.slab_flags		= SLAB_DESTROY_BY_RCU,
2348 	.twsk_prot		= &tcp_timewait_sock_ops,
2349 	.rsk_prot		= &tcp_request_sock_ops,
2350 	.h.hashinfo		= &tcp_hashinfo,
2351 	.no_autobind		= true,
2352 #ifdef CONFIG_COMPAT
2353 	.compat_setsockopt	= compat_tcp_setsockopt,
2354 	.compat_getsockopt	= compat_tcp_getsockopt,
2355 #endif
2356 	.diag_destroy		= tcp_abort,
2357 };
2358 EXPORT_SYMBOL(tcp_prot);
2359 
2360 static void __net_exit tcp_sk_exit(struct net *net)
2361 {
2362 	int cpu;
2363 
2364 	for_each_possible_cpu(cpu)
2365 		inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2366 	free_percpu(net->ipv4.tcp_sk);
2367 }
2368 
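/*
 * Per-namespace setup: allocate one kernel control socket per possible
 * CPU (used for sending RSTs and ACKs that are not tied to any user
 * socket, see tcp_v4_send_reset()/tcp_v4_send_ack()) and initialise the
 * namespace's TCP sysctl defaults.
 */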
2369 static int __net_init tcp_sk_init(struct net *net)
2370 {
2371 	int res, cpu;
2372 
2373 	net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2374 	if (!net->ipv4.tcp_sk)
2375 		return -ENOMEM;
2376 
2377 	for_each_possible_cpu(cpu) {
2378 		struct sock *sk;
2379 
2380 		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2381 					   IPPROTO_TCP, net);
2382 		if (res)
2383 			goto fail;
2384 		*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2385 	}
2386 
2387 	net->ipv4.sysctl_tcp_ecn = 2;
2388 	net->ipv4.sysctl_tcp_ecn_fallback = 1;
2389 
2390 	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2391 	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2392 	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2393 
2394 	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2395 	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2396 	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2397 
2398 	return 0;
2399 fail:
2400 	tcp_sk_exit(net);
2401 
2402 	return res;
2403 }
2404 
2405 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2406 {
2407 	inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2408 }
2409 
2410 static struct pernet_operations __net_initdata tcp_sk_ops = {
2411 	.init	   = tcp_sk_init,
2412 	.exit	   = tcp_sk_exit,
2413 	.exit_batch = tcp_sk_exit_batch,
2414 };
2415 
2416 void __init tcp_v4_init(void)
2417 {
2418 	inet_hashinfo_init(&tcp_hashinfo);
2419 	if (register_pernet_subsys(&tcp_sk_ops))
2420 		panic("Failed to create the TCP control socket.\n");
2421 }
2422