xref: /openbmc/linux/net/ipv4/tcp_ipv4.c (revision 90bbcc608369a1b46089b0f5aa22b8ea31ffa12e)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		Implementation of the Transmission Control Protocol(TCP).
7  *
8  *		IPv4 specific functions
9  *
10  *
11  *		code split from:
12  *		linux/ipv4/tcp.c
13  *		linux/ipv4/tcp_input.c
14  *		linux/ipv4/tcp_output.c
15  *
16  *		See tcp.c for author information
17  *
18  *	This program is free software; you can redistribute it and/or
19  *      modify it under the terms of the GNU General Public License
20  *      as published by the Free Software Foundation; either version
21  *      2 of the License, or (at your option) any later version.
22  */
23 
24 /*
25  * Changes:
26  *		David S. Miller	:	New socket lookup architecture.
27  *					This code is dedicated to John Dyson.
28  *		David S. Miller :	Change semantics of established hash,
29  *					half is devoted to TIME_WAIT sockets
30  *					and the rest go in the other half.
31  *		Andi Kleen :		Add support for syncookies and fixed
32  *					some bugs: ip options weren't passed to
33  *					the TCP layer, missed a check for an
34  *					ACK bit.
35  *		Andi Kleen :		Implemented fast path mtu discovery.
36  *	     				Fixed many serious bugs in the
37  *					request_sock handling and moved
38  *					most of it into the af independent code.
39  *					Added tail drop and some other bugfixes.
40  *					Added new listen semantics.
41  *		Mike McLagan	:	Routing by source
42  *	Juan Jose Ciarlante:		ip_dynaddr bits
43  *		Andi Kleen:		various fixes.
44  *	Vitaly E. Lavrov	:	Transparent proxy revived after year
45  *					coma.
46  *	Andi Kleen		:	Fix new listen.
47  *	Andi Kleen		:	Fix accept error reporting.
48  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
49  *	Alexey Kuznetsov		allows both IPv4 and IPv6 sockets to bind
50  *					a single port at the same time.
51  */
52 
53 #define pr_fmt(fmt) "TCP: " fmt
54 
55 #include <linux/bottom_half.h>
56 #include <linux/types.h>
57 #include <linux/fcntl.h>
58 #include <linux/module.h>
59 #include <linux/random.h>
60 #include <linux/cache.h>
61 #include <linux/jhash.h>
62 #include <linux/init.h>
63 #include <linux/times.h>
64 #include <linux/slab.h>
65 
66 #include <net/net_namespace.h>
67 #include <net/icmp.h>
68 #include <net/inet_hashtables.h>
69 #include <net/tcp.h>
70 #include <net/transp_v6.h>
71 #include <net/ipv6.h>
72 #include <net/inet_common.h>
73 #include <net/timewait_sock.h>
74 #include <net/xfrm.h>
75 #include <net/secure_seq.h>
76 #include <net/busy_poll.h>
77 
78 #include <linux/inet.h>
79 #include <linux/ipv6.h>
80 #include <linux/stddef.h>
81 #include <linux/proc_fs.h>
82 #include <linux/seq_file.h>
83 
84 #include <crypto/hash.h>
85 #include <linux/scatterlist.h>
86 
87 int sysctl_tcp_tw_reuse __read_mostly;
88 int sysctl_tcp_low_latency __read_mostly;
89 EXPORT_SYMBOL(sysctl_tcp_low_latency);
90 
91 #ifdef CONFIG_TCP_MD5SIG
92 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
93 			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
94 #endif
95 
96 struct inet_hashinfo tcp_hashinfo;
97 EXPORT_SYMBOL(tcp_hashinfo);
98 
99 static  __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
100 {
101 	return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
102 					  ip_hdr(skb)->saddr,
103 					  tcp_hdr(skb)->dest,
104 					  tcp_hdr(skb)->source);
105 }
106 
107 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
108 {
109 	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
110 	struct tcp_sock *tp = tcp_sk(sk);
111 
112 	/* With PAWS, this is safe from the viewpoint
113 	   of data integrity. Even without PAWS it is safe provided the sequence
114 	   spaces do not overlap, i.e. at data rates <= 80 Mbit/sec.
115 
116 	   The idea is close to VJ's, except that the timestamp cache is
117 	   held not per host but per port pair, and the TW bucket is used as
118 	   the state holder.
119 
120 	   If the TW bucket has already been destroyed we fall back to VJ's
121 	   scheme and use the initial timestamp retrieved from the peer table.
122 	 */
123 	if (tcptw->tw_ts_recent_stamp &&
124 	    (!twp || (sysctl_tcp_tw_reuse &&
125 			     get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
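		/* Pick a write_seq well past the old connection's final
		 * sequence number so segments of the new connection cannot
		 * be confused with stragglers from the old one.
		 */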
126 		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
127 		if (tp->write_seq == 0)
128 			tp->write_seq = 1;
129 		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
130 		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
131 		sock_hold(sktw);
132 		return 1;
133 	}
134 
135 	return 0;
136 }
137 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
138 
139 /* This will initiate an outgoing connection. */
140 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
141 {
142 	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
143 	struct inet_sock *inet = inet_sk(sk);
144 	struct tcp_sock *tp = tcp_sk(sk);
145 	__be16 orig_sport, orig_dport;
146 	__be32 daddr, nexthop;
147 	struct flowi4 *fl4;
148 	struct rtable *rt;
149 	int err;
150 	struct ip_options_rcu *inet_opt;
151 
152 	if (addr_len < sizeof(struct sockaddr_in))
153 		return -EINVAL;
154 
155 	if (usin->sin_family != AF_INET)
156 		return -EAFNOSUPPORT;
157 
158 	nexthop = daddr = usin->sin_addr.s_addr;
159 	inet_opt = rcu_dereference_protected(inet->inet_opt,
160 					     lockdep_sock_is_held(sk));
161 	if (inet_opt && inet_opt->opt.srr) {
162 		if (!daddr)
163 			return -EINVAL;
164 		nexthop = inet_opt->opt.faddr;
165 	}
166 
167 	orig_sport = inet->inet_sport;
168 	orig_dport = usin->sin_port;
169 	fl4 = &inet->cork.fl.u.ip4;
170 	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
171 			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
172 			      IPPROTO_TCP,
173 			      orig_sport, orig_dport, sk);
174 	if (IS_ERR(rt)) {
175 		err = PTR_ERR(rt);
176 		if (err == -ENETUNREACH)
177 			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
178 		return err;
179 	}
180 
181 	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
182 		ip_rt_put(rt);
183 		return -ENETUNREACH;
184 	}
185 
186 	if (!inet_opt || !inet_opt->opt.srr)
187 		daddr = fl4->daddr;
188 
189 	if (!inet->inet_saddr)
190 		inet->inet_saddr = fl4->saddr;
191 	sk_rcv_saddr_set(sk, inet->inet_saddr);
192 
193 	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
194 		/* Reset inherited state */
195 		tp->rx_opt.ts_recent	   = 0;
196 		tp->rx_opt.ts_recent_stamp = 0;
197 		if (likely(!tp->repair))
198 			tp->write_seq	   = 0;
199 	}
200 
201 	if (tcp_death_row.sysctl_tw_recycle &&
202 	    !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
203 		tcp_fetch_timewait_stamp(sk, &rt->dst);
204 
205 	inet->inet_dport = usin->sin_port;
206 	sk_daddr_set(sk, daddr);
207 
208 	inet_csk(sk)->icsk_ext_hdr_len = 0;
209 	if (inet_opt)
210 		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
211 
212 	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
213 
214 	/* Socket identity is still unknown (sport may be zero).
215 	 * However, we set the state to SYN-SENT and, without releasing the
216 	 * socket lock, select a source port, enter ourselves into the hash
217 	 * tables and complete initialization after this.
218 	 */
219 	tcp_set_state(sk, TCP_SYN_SENT);
220 	err = inet_hash_connect(&tcp_death_row, sk);
221 	if (err)
222 		goto failure;
223 
224 	sk_set_txhash(sk);
225 
226 	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
227 			       inet->inet_sport, inet->inet_dport, sk);
228 	if (IS_ERR(rt)) {
229 		err = PTR_ERR(rt);
230 		rt = NULL;
231 		goto failure;
232 	}
233 	/* OK, now commit destination to socket.  */
234 	sk->sk_gso_type = SKB_GSO_TCPV4;
235 	sk_setup_caps(sk, &rt->dst);
236 
237 	if (!tp->write_seq && likely(!tp->repair))
238 		tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
239 							   inet->inet_daddr,
240 							   inet->inet_sport,
241 							   usin->sin_port);
242 
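	/* Seed the IP ID counter for this flow; mixing the initial
	 * sequence number with jiffies keeps it varied between connections.
	 */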
243 	inet->inet_id = tp->write_seq ^ jiffies;
244 
245 	err = tcp_connect(sk);
246 
247 	rt = NULL;
248 	if (err)
249 		goto failure;
250 
251 	return 0;
252 
253 failure:
254 	/*
255 	 * This unhashes the socket and releases the local port,
256 	 * if necessary.
257 	 */
258 	tcp_set_state(sk, TCP_CLOSE);
259 	ip_rt_put(rt);
260 	sk->sk_route_caps = 0;
261 	inet->inet_dport = 0;
262 	return err;
263 }
264 EXPORT_SYMBOL(tcp_v4_connect);
265 
266 /*
267  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
268  * It can be called through tcp_release_cb() if socket was owned by user
269  * at the time tcp_v4_err() was called to handle ICMP message.
270  */
271 void tcp_v4_mtu_reduced(struct sock *sk)
272 {
273 	struct dst_entry *dst;
274 	struct inet_sock *inet = inet_sk(sk);
275 	u32 mtu = tcp_sk(sk)->mtu_info;
276 
277 	dst = inet_csk_update_pmtu(sk, mtu);
278 	if (!dst)
279 		return;
280 
281 	/* Something is about to go wrong... Remember the soft error
282 	 * in case this connection is not able to recover.
283 	 */
284 	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
285 		sk->sk_err_soft = EMSGSIZE;
286 
287 	mtu = dst_mtu(dst);
288 
289 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
290 	    ip_sk_accept_pmtu(sk) &&
291 	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
292 		tcp_sync_mss(sk, mtu);
293 
294 		/* Resend the TCP packet because it's
295 		 * clear that the old packet has been
296 		 * dropped. This is the new "fast" path mtu
297 		 * discovery.
298 		 */
299 		tcp_simple_retransmit(sk);
300 	} /* else let the usual retransmit timer handle it */
301 }
302 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
303 
304 static void do_redirect(struct sk_buff *skb, struct sock *sk)
305 {
306 	struct dst_entry *dst = __sk_dst_check(sk, 0);
307 
308 	if (dst)
309 		dst->ops->redirect(dst, sk, skb);
310 }
311 
312 
313 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
314 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
315 {
316 	struct request_sock *req = inet_reqsk(sk);
317 	struct net *net = sock_net(sk);
318 
319 	/* ICMPs are not backlogged, hence we cannot get
320 	 * an established socket here.
321 	 */
322 	if (seq != tcp_rsk(req)->snt_isn) {
323 		NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
324 	} else if (abort) {
325 		/*
326 		 * Still in SYN_RECV, just remove it silently.
327 		 * There is no good way to pass the error to the newly
328 		 * created socket, and POSIX does not want network
329 		 * errors returned from accept().
330 		 */
331 		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
332 		tcp_listendrop(req->rsk_listener);
333 	}
334 	reqsk_put(req);
335 }
336 EXPORT_SYMBOL(tcp_req_err);
337 
338 /*
339  * This routine is called by the ICMP module when it gets some
340  * sort of error condition.  If err < 0 then the socket should
341  * be closed and the error returned to the user.  If err > 0
342  * it's just the icmp type << 8 | icmp code.  After adjustment
343  * header points to the first 8 bytes of the tcp header.  We need
344  * to find the appropriate port.
345  *
346  * The locking strategy used here is very "optimistic". When
347  * someone else accesses the socket the ICMP is just dropped
348  * and for some paths there is no check at all.
349  * A more general error queue to queue errors for later handling
350  * is probably better.
351  *
352  */
353 
354 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
355 {
356 	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
357 	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
358 	struct inet_connection_sock *icsk;
359 	struct tcp_sock *tp;
360 	struct inet_sock *inet;
361 	const int type = icmp_hdr(icmp_skb)->type;
362 	const int code = icmp_hdr(icmp_skb)->code;
363 	struct sock *sk;
364 	struct sk_buff *skb;
365 	struct request_sock *fastopen;
366 	__u32 seq, snd_una;
367 	__u32 remaining;
368 	int err;
369 	struct net *net = dev_net(icmp_skb->dev);
370 
371 	sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
372 				       th->dest, iph->saddr, ntohs(th->source),
373 				       inet_iif(icmp_skb));
374 	if (!sk) {
375 		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
376 		return;
377 	}
378 	if (sk->sk_state == TCP_TIME_WAIT) {
379 		inet_twsk_put(inet_twsk(sk));
380 		return;
381 	}
382 	seq = ntohl(th->seq);
383 	if (sk->sk_state == TCP_NEW_SYN_RECV)
384 		return tcp_req_err(sk, seq,
385 				  type == ICMP_PARAMETERPROB ||
386 				  type == ICMP_TIME_EXCEEDED ||
387 				  (type == ICMP_DEST_UNREACH &&
388 				   (code == ICMP_NET_UNREACH ||
389 				    code == ICMP_HOST_UNREACH)));
390 
391 	bh_lock_sock(sk);
392 	/* If too many ICMPs get dropped on busy
393 	 * servers this needs to be solved differently.
394 	 * We do take care of the PMTU discovery (RFC1191) special case:
395 	 * we can receive locally generated ICMP messages while the socket is held.
396 	 */
397 	if (sock_owned_by_user(sk)) {
398 		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
399 			NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
400 	}
401 	if (sk->sk_state == TCP_CLOSE)
402 		goto out;
403 
404 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
405 		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
406 		goto out;
407 	}
408 
409 	icsk = inet_csk(sk);
410 	tp = tcp_sk(sk);
411 	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
412 	fastopen = tp->fastopen_rsk;
413 	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
414 	if (sk->sk_state != TCP_LISTEN &&
415 	    !between(seq, snd_una, tp->snd_nxt)) {
416 		NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
417 		goto out;
418 	}
419 
420 	switch (type) {
421 	case ICMP_REDIRECT:
422 		do_redirect(icmp_skb, sk);
423 		goto out;
424 	case ICMP_SOURCE_QUENCH:
425 		/* Just silently ignore these. */
426 		goto out;
427 	case ICMP_PARAMETERPROB:
428 		err = EPROTO;
429 		break;
430 	case ICMP_DEST_UNREACH:
431 		if (code > NR_ICMP_UNREACH)
432 			goto out;
433 
434 		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
435 			/* We are not interested in TCP_LISTEN and open_requests
436 			 * (SYN-ACKs sent out by Linux are always < 576 bytes so
437 			 * they should go through unfragmented).
438 			 */
439 			if (sk->sk_state == TCP_LISTEN)
440 				goto out;
441 
442 			tp->mtu_info = info;
443 			if (!sock_owned_by_user(sk)) {
444 				tcp_v4_mtu_reduced(sk);
445 			} else {
446 				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
447 					sock_hold(sk);
448 			}
449 			goto out;
450 		}
451 
452 		err = icmp_err_convert[code].errno;
453 		/* check if icmp_skb allows revert of backoff
454 		 * (see draft-zimmermann-tcp-lcd) */
455 		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
456 			break;
457 		if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
458 		    !icsk->icsk_backoff || fastopen)
459 			break;
460 
461 		if (sock_owned_by_user(sk))
462 			break;
463 
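		/* Undo one step of exponential backoff and recompute the RTO
		 * with the reduced backoff applied.
		 */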
464 		icsk->icsk_backoff--;
465 		icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
466 					       TCP_TIMEOUT_INIT;
467 		icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
468 
469 		skb = tcp_write_queue_head(sk);
470 		BUG_ON(!skb);
471 
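		/* Work out how much of the reduced RTO is still outstanding,
		 * given the time already elapsed since the head-of-queue
		 * segment was last transmitted.
		 */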
472 		remaining = icsk->icsk_rto -
473 			    min(icsk->icsk_rto,
474 				tcp_time_stamp - tcp_skb_timestamp(skb));
475 
476 		if (remaining) {
477 			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
478 						  remaining, TCP_RTO_MAX);
479 		} else {
480 			/* RTO revert clocked out retransmission.
481 			 * Will retransmit now */
482 			tcp_retransmit_timer(sk);
483 		}
484 
485 		break;
486 	case ICMP_TIME_EXCEEDED:
487 		err = EHOSTUNREACH;
488 		break;
489 	default:
490 		goto out;
491 	}
492 
493 	switch (sk->sk_state) {
494 	case TCP_SYN_SENT:
495 	case TCP_SYN_RECV:
496 		/* Only in fast or simultaneous open. If a fast open socket
497 		 * is already accepted it is treated as a connected one below.
498 		 */
499 		if (fastopen && !fastopen->sk)
500 			break;
501 
502 		if (!sock_owned_by_user(sk)) {
503 			sk->sk_err = err;
504 
505 			sk->sk_error_report(sk);
506 
507 			tcp_done(sk);
508 		} else {
509 			sk->sk_err_soft = err;
510 		}
511 		goto out;
512 	}
513 
514 	/* If we've already connected we will keep trying
515 	 * until we time out, or the user gives up.
516 	 *
517 	 * rfc1122 4.2.3.9 allows us to consider as hard errors
518 	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
519 	 * but it is obsoleted by pmtu discovery).
520 	 *
521 	 * Note that in the modern internet, where routing is unreliable
522 	 * and broken firewalls sit in every dark corner sending random
523 	 * errors ordered by their masters, even these two messages have
524 	 * finally lost their original sense (even Linux sends invalid PORT_UNREACHs).
525 	 *
526 	 * Now we are in compliance with RFCs.
527 	 *							--ANK (980905)
528 	 */
529 
530 	inet = inet_sk(sk);
531 	if (!sock_owned_by_user(sk) && inet->recverr) {
532 		sk->sk_err = err;
533 		sk->sk_error_report(sk);
534 	} else	{ /* Only an error on timeout */
535 		sk->sk_err_soft = err;
536 	}
537 
538 out:
539 	bh_unlock_sock(sk);
540 	sock_put(sk);
541 }
542 
543 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
544 {
545 	struct tcphdr *th = tcp_hdr(skb);
546 
547 	if (skb->ip_summed == CHECKSUM_PARTIAL) {
548 		th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
549 		skb->csum_start = skb_transport_header(skb) - skb->head;
550 		skb->csum_offset = offsetof(struct tcphdr, check);
551 	} else {
552 		th->check = tcp_v4_check(skb->len, saddr, daddr,
553 					 csum_partial(th,
554 						      th->doff << 2,
555 						      skb->csum));
556 	}
557 }
558 
559 /* This routine computes an IPv4 TCP checksum. */
560 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
561 {
562 	const struct inet_sock *inet = inet_sk(sk);
563 
564 	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
565 }
566 EXPORT_SYMBOL(tcp_v4_send_check);
567 
568 /*
569  *	This routine will send an RST to the other tcp.
570  *
571  *	Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
572  *		      for the reset?
573  *	Answer: a packet that causes an RST is not destined for a socket
574  *		existing in our system; if it does match a socket,
575  *		it is just a duplicate segment or a bug in the other side's TCP.
576  *		So we build the reply based only on the parameters
577  *		that arrived with the segment.
578  *	Exception: precedence violation. We do not implement it in any case.
579  */
580 
581 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
582 {
583 	const struct tcphdr *th = tcp_hdr(skb);
584 	struct {
585 		struct tcphdr th;
586 #ifdef CONFIG_TCP_MD5SIG
587 		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
588 #endif
589 	} rep;
590 	struct ip_reply_arg arg;
591 #ifdef CONFIG_TCP_MD5SIG
592 	struct tcp_md5sig_key *key = NULL;
593 	const __u8 *hash_location = NULL;
594 	unsigned char newhash[16];
595 	int genhash;
596 	struct sock *sk1 = NULL;
597 #endif
598 	struct net *net;
599 
600 	/* Never send a reset in response to a reset. */
601 	if (th->rst)
602 		return;
603 
604 	/* If sk is not NULL, it means we did a successful lookup and the
605 	 * incoming route had to be correct. prequeue might have dropped our dst.
606 	 */
607 	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
608 		return;
609 
610 	/* Swap the send and the receive. */
611 	memset(&rep, 0, sizeof(rep));
612 	rep.th.dest   = th->source;
613 	rep.th.source = th->dest;
614 	rep.th.doff   = sizeof(struct tcphdr) / 4;
615 	rep.th.rst    = 1;
616 
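	/* Per RFC 793 reset generation: if the incoming segment carried an
	 * ACK, the RST takes its sequence number from that ACK; otherwise
	 * the RST instead ACKs the end of the incoming segment.
	 */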
617 	if (th->ack) {
618 		rep.th.seq = th->ack_seq;
619 	} else {
620 		rep.th.ack = 1;
621 		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
622 				       skb->len - (th->doff << 2));
623 	}
624 
625 	memset(&arg, 0, sizeof(arg));
626 	arg.iov[0].iov_base = (unsigned char *)&rep;
627 	arg.iov[0].iov_len  = sizeof(rep.th);
628 
629 	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
630 #ifdef CONFIG_TCP_MD5SIG
631 	rcu_read_lock();
632 	hash_location = tcp_parse_md5sig_option(th);
633 	if (sk && sk_fullsock(sk)) {
634 		key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
635 					&ip_hdr(skb)->saddr, AF_INET);
636 	} else if (hash_location) {
637 		/*
638 		 * The active side is lost. Try to find the listening socket
639 		 * through the source port, and then find the md5 key through
640 		 * the listening socket. We do not weaken security here:
641 		 * the incoming packet is checked against the md5 hash of the
642 		 * key we find; no RST is generated if the md5 hash doesn't match.
643 		 */
644 		sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
645 					     ip_hdr(skb)->saddr,
646 					     th->source, ip_hdr(skb)->daddr,
647 					     ntohs(th->source), inet_iif(skb));
648 		/* don't send rst if it can't find key */
649 		if (!sk1)
650 			goto out;
651 
652 		key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
653 					&ip_hdr(skb)->saddr, AF_INET);
654 		if (!key)
655 			goto out;
656 
657 
658 		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
659 		if (genhash || memcmp(hash_location, newhash, 16) != 0)
660 			goto out;
661 
662 	}
663 
664 	if (key) {
665 		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
666 				   (TCPOPT_NOP << 16) |
667 				   (TCPOPT_MD5SIG << 8) |
668 				   TCPOLEN_MD5SIG);
669 		/* Update length and the length the header thinks exists */
670 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
671 		rep.th.doff = arg.iov[0].iov_len / 4;
672 
673 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
674 				     key, ip_hdr(skb)->saddr,
675 				     ip_hdr(skb)->daddr, &rep.th);
676 	}
677 #endif
678 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
679 				      ip_hdr(skb)->saddr, /* XXX */
680 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
681 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
682 	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
683 
684 	/* When the socket is gone, all binding information is lost and
685 	 * routing might fail in this case. No choice here: if we choose to force
686 	 * the input interface, we will misroute in the case of an asymmetric route.
687 	 */
688 	if (sk)
689 		arg.bound_dev_if = sk->sk_bound_dev_if;
690 
691 	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
692 		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));
693 
694 	arg.tos = ip_hdr(skb)->tos;
695 	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
696 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
697 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
698 			      &arg, arg.iov[0].iov_len);
699 
700 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
701 	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
702 
703 #ifdef CONFIG_TCP_MD5SIG
704 out:
705 	rcu_read_unlock();
706 #endif
707 }
708 
709 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
710    outside of socket context, is certainly ugly. What can I do?
711  */
712 
713 static void tcp_v4_send_ack(struct net *net,
714 			    struct sk_buff *skb, u32 seq, u32 ack,
715 			    u32 win, u32 tsval, u32 tsecr, int oif,
716 			    struct tcp_md5sig_key *key,
717 			    int reply_flags, u8 tos)
718 {
719 	const struct tcphdr *th = tcp_hdr(skb);
720 	struct {
721 		struct tcphdr th;
722 		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
723 #ifdef CONFIG_TCP_MD5SIG
724 			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
725 #endif
726 			];
727 	} rep;
728 	struct ip_reply_arg arg;
729 
730 	memset(&rep.th, 0, sizeof(struct tcphdr));
731 	memset(&arg, 0, sizeof(arg));
732 
733 	arg.iov[0].iov_base = (unsigned char *)&rep;
734 	arg.iov[0].iov_len  = sizeof(rep.th);
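	/* If the peer used timestamps, echo them back in a TCP timestamp
	 * option: two NOPs for alignment, the option kind/length, TSval,
	 * then TSecr.
	 */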
735 	if (tsecr) {
736 		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
737 				   (TCPOPT_TIMESTAMP << 8) |
738 				   TCPOLEN_TIMESTAMP);
739 		rep.opt[1] = htonl(tsval);
740 		rep.opt[2] = htonl(tsecr);
741 		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
742 	}
743 
744 	/* Swap the send and the receive. */
745 	rep.th.dest    = th->source;
746 	rep.th.source  = th->dest;
747 	rep.th.doff    = arg.iov[0].iov_len / 4;
748 	rep.th.seq     = htonl(seq);
749 	rep.th.ack_seq = htonl(ack);
750 	rep.th.ack     = 1;
751 	rep.th.window  = htons(win);
752 
753 #ifdef CONFIG_TCP_MD5SIG
754 	if (key) {
755 		int offset = (tsecr) ? 3 : 0;
756 
757 		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
758 					  (TCPOPT_NOP << 16) |
759 					  (TCPOPT_MD5SIG << 8) |
760 					  TCPOLEN_MD5SIG);
761 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
762 		rep.th.doff = arg.iov[0].iov_len/4;
763 
764 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
765 				    key, ip_hdr(skb)->saddr,
766 				    ip_hdr(skb)->daddr, &rep.th);
767 	}
768 #endif
769 	arg.flags = reply_flags;
770 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
771 				      ip_hdr(skb)->saddr, /* XXX */
772 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
773 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
774 	if (oif)
775 		arg.bound_dev_if = oif;
776 	arg.tos = tos;
777 	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
778 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
779 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
780 			      &arg, arg.iov[0].iov_len);
781 
782 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
783 }
784 
785 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
786 {
787 	struct inet_timewait_sock *tw = inet_twsk(sk);
788 	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
789 
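	/* ACK on behalf of a TIME-WAIT socket, built from the window and
	 * timestamp state recorded in the timewait bucket.
	 */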
790 	tcp_v4_send_ack(sock_net(sk), skb,
791 			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
792 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
793 			tcp_time_stamp + tcptw->tw_ts_offset,
794 			tcptw->tw_ts_recent,
795 			tw->tw_bound_dev_if,
796 			tcp_twsk_md5_key(tcptw),
797 			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
798 			tw->tw_tos
799 			);
800 
801 	inet_twsk_put(tw);
802 }
803 
804 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
805 				  struct request_sock *req)
806 {
807 	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
808 	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
809 	 */
810 	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
811 					     tcp_sk(sk)->snd_nxt;
812 
813 	tcp_v4_send_ack(sock_net(sk), skb, seq,
814 			tcp_rsk(req)->rcv_nxt, req->rsk_rcv_wnd,
815 			tcp_time_stamp,
816 			req->ts_recent,
817 			0,
818 			tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
819 					  AF_INET),
820 			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
821 			ip_hdr(skb)->tos);
822 }
823 
824 /*
825  *	Send a SYN-ACK after having received a SYN.
826  *	This still operates on a request_sock only, not on a big
827  *	socket.
828  */
829 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
830 			      struct flowi *fl,
831 			      struct request_sock *req,
832 			      struct tcp_fastopen_cookie *foc,
833 			      enum tcp_synack_type synack_type)
834 {
835 	const struct inet_request_sock *ireq = inet_rsk(req);
836 	struct flowi4 fl4;
837 	int err = -1;
838 	struct sk_buff *skb;
839 
840 	/* First, grab a route. */
841 	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
842 		return -1;
843 
844 	skb = tcp_make_synack(sk, dst, req, foc, synack_type);
845 
846 	if (skb) {
847 		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
848 
849 		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
850 					    ireq->ir_rmt_addr,
851 					    ireq->opt);
852 		err = net_xmit_eval(err);
853 	}
854 
855 	return err;
856 }
857 
858 /*
859  *	IPv4 request_sock destructor.
860  */
861 static void tcp_v4_reqsk_destructor(struct request_sock *req)
862 {
863 	kfree(inet_rsk(req)->opt);
864 }
865 
866 #ifdef CONFIG_TCP_MD5SIG
867 /*
868  * RFC2385 MD5 checksumming requires a mapping of
869  * IP address->MD5 Key.
870  * We need to maintain these in the sk structure.
871  */
872 
873 /* Find the Key structure for an address.  */
874 struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
875 					 const union tcp_md5_addr *addr,
876 					 int family)
877 {
878 	const struct tcp_sock *tp = tcp_sk(sk);
879 	struct tcp_md5sig_key *key;
880 	unsigned int size = sizeof(struct in_addr);
881 	const struct tcp_md5sig_info *md5sig;
882 
883 	/* caller either holds rcu_read_lock() or socket lock */
884 	md5sig = rcu_dereference_check(tp->md5sig_info,
885 				       lockdep_sock_is_held(sk));
886 	if (!md5sig)
887 		return NULL;
888 #if IS_ENABLED(CONFIG_IPV6)
889 	if (family == AF_INET6)
890 		size = sizeof(struct in6_addr);
891 #endif
892 	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
893 		if (key->family != family)
894 			continue;
895 		if (!memcmp(&key->addr, addr, size))
896 			return key;
897 	}
898 	return NULL;
899 }
900 EXPORT_SYMBOL(tcp_md5_do_lookup);
901 
902 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
903 					 const struct sock *addr_sk)
904 {
905 	const union tcp_md5_addr *addr;
906 
907 	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
908 	return tcp_md5_do_lookup(sk, addr, AF_INET);
909 }
910 EXPORT_SYMBOL(tcp_v4_md5_lookup);
911 
912 /* This can be called on a newly created socket, from other files */
913 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
914 		   int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
915 {
916 	/* Add Key to the list */
917 	struct tcp_md5sig_key *key;
918 	struct tcp_sock *tp = tcp_sk(sk);
919 	struct tcp_md5sig_info *md5sig;
920 
921 	key = tcp_md5_do_lookup(sk, addr, family);
922 	if (key) {
923 		/* Pre-existing entry - just update that one. */
924 		memcpy(key->key, newkey, newkeylen);
925 		key->keylen = newkeylen;
926 		return 0;
927 	}
928 
929 	md5sig = rcu_dereference_protected(tp->md5sig_info,
930 					   lockdep_sock_is_held(sk));
931 	if (!md5sig) {
932 		md5sig = kmalloc(sizeof(*md5sig), gfp);
933 		if (!md5sig)
934 			return -ENOMEM;
935 
936 		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
937 		INIT_HLIST_HEAD(&md5sig->head);
938 		rcu_assign_pointer(tp->md5sig_info, md5sig);
939 	}
940 
941 	key = sock_kmalloc(sk, sizeof(*key), gfp);
942 	if (!key)
943 		return -ENOMEM;
944 	if (!tcp_alloc_md5sig_pool()) {
945 		sock_kfree_s(sk, key, sizeof(*key));
946 		return -ENOMEM;
947 	}
948 
949 	memcpy(key->key, newkey, newkeylen);
950 	key->keylen = newkeylen;
951 	key->family = family;
952 	memcpy(&key->addr, addr,
953 	       (family == AF_INET6) ? sizeof(struct in6_addr) :
954 				      sizeof(struct in_addr));
955 	hlist_add_head_rcu(&key->node, &md5sig->head);
956 	return 0;
957 }
958 EXPORT_SYMBOL(tcp_md5_do_add);
959 
960 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
961 {
962 	struct tcp_md5sig_key *key;
963 
964 	key = tcp_md5_do_lookup(sk, addr, family);
965 	if (!key)
966 		return -ENOENT;
967 	hlist_del_rcu(&key->node);
968 	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
969 	kfree_rcu(key, rcu);
970 	return 0;
971 }
972 EXPORT_SYMBOL(tcp_md5_do_del);
973 
974 static void tcp_clear_md5_list(struct sock *sk)
975 {
976 	struct tcp_sock *tp = tcp_sk(sk);
977 	struct tcp_md5sig_key *key;
978 	struct hlist_node *n;
979 	struct tcp_md5sig_info *md5sig;
980 
981 	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
982 
983 	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
984 		hlist_del_rcu(&key->node);
985 		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
986 		kfree_rcu(key, rcu);
987 	}
988 }
989 
990 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
991 				 int optlen)
992 {
993 	struct tcp_md5sig cmd;
994 	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
995 
996 	if (optlen < sizeof(cmd))
997 		return -EINVAL;
998 
999 	if (copy_from_user(&cmd, optval, sizeof(cmd)))
1000 		return -EFAULT;
1001 
1002 	if (sin->sin_family != AF_INET)
1003 		return -EINVAL;
1004 
1005 	if (!cmd.tcpm_keylen)
1006 		return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1007 				      AF_INET);
1008 
1009 	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1010 		return -EINVAL;
1011 
1012 	return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1013 			      AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
1014 			      GFP_KERNEL);
1015 }
1016 
1017 static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
1018 					__be32 daddr, __be32 saddr, int nbytes)
1019 {
1020 	struct tcp4_pseudohdr *bp;
1021 	struct scatterlist sg;
1022 
1023 	bp = &hp->md5_blk.ip4;
1024 
1025 	/*
1026 	 * 1. the TCP pseudo-header (in the order: source IP address,
1027 	 * destination IP address, zero-padded protocol number, and
1028 	 * segment length)
1029 	 */
1030 	bp->saddr = saddr;
1031 	bp->daddr = daddr;
1032 	bp->pad = 0;
1033 	bp->protocol = IPPROTO_TCP;
1034 	bp->len = cpu_to_be16(nbytes);
1035 
1036 	sg_init_one(&sg, bp, sizeof(*bp));
1037 	ahash_request_set_crypt(hp->md5_req, &sg, NULL, sizeof(*bp));
1038 	return crypto_ahash_update(hp->md5_req);
1039 }
1040 
1041 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1042 			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
1043 {
1044 	struct tcp_md5sig_pool *hp;
1045 	struct ahash_request *req;
1046 
1047 	hp = tcp_get_md5sig_pool();
1048 	if (!hp)
1049 		goto clear_hash_noput;
1050 	req = hp->md5_req;
1051 
1052 	if (crypto_ahash_init(req))
1053 		goto clear_hash;
1054 	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
1055 		goto clear_hash;
1056 	if (tcp_md5_hash_header(hp, th))
1057 		goto clear_hash;
1058 	if (tcp_md5_hash_key(hp, key))
1059 		goto clear_hash;
1060 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1061 	if (crypto_ahash_final(req))
1062 		goto clear_hash;
1063 
1064 	tcp_put_md5sig_pool();
1065 	return 0;
1066 
1067 clear_hash:
1068 	tcp_put_md5sig_pool();
1069 clear_hash_noput:
1070 	memset(md5_hash, 0, 16);
1071 	return 1;
1072 }
1073 
1074 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1075 			const struct sock *sk,
1076 			const struct sk_buff *skb)
1077 {
1078 	struct tcp_md5sig_pool *hp;
1079 	struct ahash_request *req;
1080 	const struct tcphdr *th = tcp_hdr(skb);
1081 	__be32 saddr, daddr;
1082 
1083 	if (sk) { /* valid for establish/request sockets */
1084 		saddr = sk->sk_rcv_saddr;
1085 		daddr = sk->sk_daddr;
1086 	} else {
1087 		const struct iphdr *iph = ip_hdr(skb);
1088 		saddr = iph->saddr;
1089 		daddr = iph->daddr;
1090 	}
1091 
1092 	hp = tcp_get_md5sig_pool();
1093 	if (!hp)
1094 		goto clear_hash_noput;
1095 	req = hp->md5_req;
1096 
1097 	if (crypto_ahash_init(req))
1098 		goto clear_hash;
1099 
1100 	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
1101 		goto clear_hash;
1102 	if (tcp_md5_hash_header(hp, th))
1103 		goto clear_hash;
1104 	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1105 		goto clear_hash;
1106 	if (tcp_md5_hash_key(hp, key))
1107 		goto clear_hash;
1108 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1109 	if (crypto_ahash_final(req))
1110 		goto clear_hash;
1111 
1112 	tcp_put_md5sig_pool();
1113 	return 0;
1114 
1115 clear_hash:
1116 	tcp_put_md5sig_pool();
1117 clear_hash_noput:
1118 	memset(md5_hash, 0, 16);
1119 	return 1;
1120 }
1121 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1122 
1123 #endif
1124 
1125 /* Called with rcu_read_lock() */
1126 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1127 				    const struct sk_buff *skb)
1128 {
1129 #ifdef CONFIG_TCP_MD5SIG
1130 	/*
1131 	 * This gets called for each TCP segment that arrives
1132 	 * so we want to be efficient.
1133 	 * We have 3 drop cases:
1134 	 * o No MD5 hash and one expected.
1135 	 * o MD5 hash and we're not expecting one.
1136 	 * o MD5 hash and it's wrong.
1137 	 */
1138 	const __u8 *hash_location = NULL;
1139 	struct tcp_md5sig_key *hash_expected;
1140 	const struct iphdr *iph = ip_hdr(skb);
1141 	const struct tcphdr *th = tcp_hdr(skb);
1142 	int genhash;
1143 	unsigned char newhash[16];
1144 
1145 	hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1146 					  AF_INET);
1147 	hash_location = tcp_parse_md5sig_option(th);
1148 
1149 	/* We've parsed the options - do we have a hash? */
1150 	if (!hash_expected && !hash_location)
1151 		return false;
1152 
1153 	if (hash_expected && !hash_location) {
1154 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1155 		return true;
1156 	}
1157 
1158 	if (!hash_expected && hash_location) {
1159 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1160 		return true;
1161 	}
1162 
1163 	/* Okay, so we have both hash_expected and hash_location -
1164 	 * so we need to calculate the MD5 hash and compare them.
1165 	 */
1166 	genhash = tcp_v4_md5_hash_skb(newhash,
1167 				      hash_expected,
1168 				      NULL, skb);
1169 
1170 	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1171 		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1172 				     &iph->saddr, ntohs(th->source),
1173 				     &iph->daddr, ntohs(th->dest),
1174 				     genhash ? " tcp_v4_calc_md5_hash failed"
1175 				     : "");
1176 		return true;
1177 	}
1178 	return false;
1179 #endif
1180 	return false;
1181 }
1182 
1183 static void tcp_v4_init_req(struct request_sock *req,
1184 			    const struct sock *sk_listener,
1185 			    struct sk_buff *skb)
1186 {
1187 	struct inet_request_sock *ireq = inet_rsk(req);
1188 
1189 	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1190 	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1191 	ireq->no_srccheck = inet_sk(sk_listener)->transparent;
1192 	ireq->opt = tcp_v4_save_options(skb);
1193 }
1194 
1195 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1196 					  struct flowi *fl,
1197 					  const struct request_sock *req,
1198 					  bool *strict)
1199 {
1200 	struct dst_entry *dst = inet_csk_route_req(sk, &fl->u.ip4, req);
1201 
1202 	if (strict) {
1203 		if (fl->u.ip4.daddr == inet_rsk(req)->ir_rmt_addr)
1204 			*strict = true;
1205 		else
1206 			*strict = false;
1207 	}
1208 
1209 	return dst;
1210 }
1211 
1212 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1213 	.family		=	PF_INET,
1214 	.obj_size	=	sizeof(struct tcp_request_sock),
1215 	.rtx_syn_ack	=	tcp_rtx_synack,
1216 	.send_ack	=	tcp_v4_reqsk_send_ack,
1217 	.destructor	=	tcp_v4_reqsk_destructor,
1218 	.send_reset	=	tcp_v4_send_reset,
1219 	.syn_ack_timeout =	tcp_syn_ack_timeout,
1220 };
1221 
1222 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1223 	.mss_clamp	=	TCP_MSS_DEFAULT,
1224 #ifdef CONFIG_TCP_MD5SIG
1225 	.req_md5_lookup	=	tcp_v4_md5_lookup,
1226 	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1227 #endif
1228 	.init_req	=	tcp_v4_init_req,
1229 #ifdef CONFIG_SYN_COOKIES
1230 	.cookie_init_seq =	cookie_v4_init_sequence,
1231 #endif
1232 	.route_req	=	tcp_v4_route_req,
1233 	.init_seq	=	tcp_v4_init_sequence,
1234 	.send_synack	=	tcp_v4_send_synack,
1235 };
1236 
1237 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1238 {
1239 	/* Never answer SYNs sent to broadcast or multicast */
1240 	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1241 		goto drop;
1242 
1243 	return tcp_conn_request(&tcp_request_sock_ops,
1244 				&tcp_request_sock_ipv4_ops, sk, skb);
1245 
1246 drop:
1247 	tcp_listendrop(sk);
1248 	return 0;
1249 }
1250 EXPORT_SYMBOL(tcp_v4_conn_request);
1251 
1252 
1253 /*
1254  * The three way handshake has completed - we got a valid synack -
1255  * now create the new socket.
1256  */
1257 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1258 				  struct request_sock *req,
1259 				  struct dst_entry *dst,
1260 				  struct request_sock *req_unhash,
1261 				  bool *own_req)
1262 {
1263 	struct inet_request_sock *ireq;
1264 	struct inet_sock *newinet;
1265 	struct tcp_sock *newtp;
1266 	struct sock *newsk;
1267 #ifdef CONFIG_TCP_MD5SIG
1268 	struct tcp_md5sig_key *key;
1269 #endif
1270 	struct ip_options_rcu *inet_opt;
1271 
1272 	if (sk_acceptq_is_full(sk))
1273 		goto exit_overflow;
1274 
1275 	newsk = tcp_create_openreq_child(sk, req, skb);
1276 	if (!newsk)
1277 		goto exit_nonewsk;
1278 
1279 	newsk->sk_gso_type = SKB_GSO_TCPV4;
1280 	inet_sk_rx_dst_set(newsk, skb);
1281 
1282 	newtp		      = tcp_sk(newsk);
1283 	newinet		      = inet_sk(newsk);
1284 	ireq		      = inet_rsk(req);
1285 	sk_daddr_set(newsk, ireq->ir_rmt_addr);
1286 	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1287 	newsk->sk_bound_dev_if = ireq->ir_iif;
1288 	newinet->inet_saddr	      = ireq->ir_loc_addr;
1289 	inet_opt	      = ireq->opt;
1290 	rcu_assign_pointer(newinet->inet_opt, inet_opt);
1291 	ireq->opt	      = NULL;
1292 	newinet->mc_index     = inet_iif(skb);
1293 	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1294 	newinet->rcv_tos      = ip_hdr(skb)->tos;
1295 	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1296 	if (inet_opt)
1297 		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1298 	newinet->inet_id = newtp->write_seq ^ jiffies;
1299 
1300 	if (!dst) {
1301 		dst = inet_csk_route_child_sock(sk, newsk, req);
1302 		if (!dst)
1303 			goto put_and_exit;
1304 	} else {
1305 		/* syncookie case : see end of cookie_v4_check() */
1306 	}
1307 	sk_setup_caps(newsk, dst);
1308 
1309 	tcp_ca_openreq_child(newsk, dst);
1310 
1311 	tcp_sync_mss(newsk, dst_mtu(dst));
1312 	newtp->advmss = dst_metric_advmss(dst);
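	/* Clamp the advertised MSS to any user-supplied limit (TCP_MAXSEG). */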
1313 	if (tcp_sk(sk)->rx_opt.user_mss &&
1314 	    tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1315 		newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1316 
1317 	tcp_initialize_rcv_mss(newsk);
1318 
1319 #ifdef CONFIG_TCP_MD5SIG
1320 	/* Copy over the MD5 key from the original socket */
1321 	key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1322 				AF_INET);
1323 	if (key) {
1324 		/*
1325 		 * We're using one, so create a matching key
1326 		 * on the newsk structure. If we fail to get
1327 		 * memory, then we end up not copying the key
1328 		 * across. Shucks.
1329 		 */
1330 		tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1331 			       AF_INET, key->key, key->keylen, GFP_ATOMIC);
1332 		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1333 	}
1334 #endif
1335 
1336 	if (__inet_inherit_port(sk, newsk) < 0)
1337 		goto put_and_exit;
1338 	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
1339 	if (*own_req)
1340 		tcp_move_syn(newtp, req);
1341 
1342 	return newsk;
1343 
1344 exit_overflow:
1345 	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1346 exit_nonewsk:
1347 	dst_release(dst);
1348 exit:
1349 	tcp_listendrop(sk);
1350 	return NULL;
1351 put_and_exit:
1352 	inet_csk_prepare_forced_close(newsk);
1353 	tcp_done(newsk);
1354 	goto exit;
1355 }
1356 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1357 
1358 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1359 {
1360 #ifdef CONFIG_SYN_COOKIES
1361 	const struct tcphdr *th = tcp_hdr(skb);
1362 
1363 	if (!th->syn)
1364 		sk = cookie_v4_check(sk, skb);
1365 #endif
1366 	return sk;
1367 }
1368 
1369 /* The socket must have its spinlock held when we get
1370  * here, unless it is a TCP_LISTEN socket.
1371  *
1372  * We have a potential double-lock case here, so even when
1373  * doing backlog processing we use the BH locking scheme.
1374  * This is because we cannot sleep with the original spinlock
1375  * held.
1376  */
1377 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1378 {
1379 	struct sock *rsk;
1380 
1381 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1382 		struct dst_entry *dst = sk->sk_rx_dst;
1383 
1384 		sock_rps_save_rxhash(sk, skb);
1385 		sk_mark_napi_id(sk, skb);
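		/* Validate the cached input route: drop it if it was learnt
		 * on a different interface or is no longer valid.
		 */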
1386 		if (dst) {
1387 			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1388 			    !dst->ops->check(dst, 0)) {
1389 				dst_release(dst);
1390 				sk->sk_rx_dst = NULL;
1391 			}
1392 		}
1393 		tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len);
1394 		return 0;
1395 	}
1396 
1397 	if (tcp_checksum_complete(skb))
1398 		goto csum_err;
1399 
1400 	if (sk->sk_state == TCP_LISTEN) {
1401 		struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1402 
1403 		if (!nsk)
1404 			goto discard;
1405 		if (nsk != sk) {
1406 			sock_rps_save_rxhash(nsk, skb);
1407 			sk_mark_napi_id(nsk, skb);
1408 			if (tcp_child_process(sk, nsk, skb)) {
1409 				rsk = nsk;
1410 				goto reset;
1411 			}
1412 			return 0;
1413 		}
1414 	} else
1415 		sock_rps_save_rxhash(sk, skb);
1416 
1417 	if (tcp_rcv_state_process(sk, skb)) {
1418 		rsk = sk;
1419 		goto reset;
1420 	}
1421 	return 0;
1422 
1423 reset:
1424 	tcp_v4_send_reset(rsk, skb);
1425 discard:
1426 	kfree_skb(skb);
1427 	/* Be careful here. If this function gets more complicated and
1428 	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1429 	 * might be destroyed here. This current version compiles correctly,
1430 	 * but you have been warned.
1431 	 */
1432 	return 0;
1433 
1434 csum_err:
1435 	__TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1436 	__TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1437 	goto discard;
1438 }
1439 EXPORT_SYMBOL(tcp_v4_do_rcv);
1440 
1441 void tcp_v4_early_demux(struct sk_buff *skb)
1442 {
1443 	const struct iphdr *iph;
1444 	const struct tcphdr *th;
1445 	struct sock *sk;
1446 
1447 	if (skb->pkt_type != PACKET_HOST)
1448 		return;
1449 
1450 	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1451 		return;
1452 
1453 	iph = ip_hdr(skb);
1454 	th = tcp_hdr(skb);
1455 
1456 	if (th->doff < sizeof(struct tcphdr) / 4)
1457 		return;
1458 
1459 	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1460 				       iph->saddr, th->source,
1461 				       iph->daddr, ntohs(th->dest),
1462 				       skb->skb_iif);
1463 	if (sk) {
1464 		skb->sk = sk;
1465 		skb->destructor = sock_edemux;
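		/* For full sockets, also reuse the cached input route if it
		 * is still valid and matches the incoming interface.
		 */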
1466 		if (sk_fullsock(sk)) {
1467 			struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1468 
1469 			if (dst)
1470 				dst = dst_check(dst, 0);
1471 			if (dst &&
1472 			    inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1473 				skb_dst_set_noref(skb, dst);
1474 		}
1475 	}
1476 }
1477 
1478 /* Packet is added to VJ-style prequeue for processing in process
1479  * context, if a reader task is waiting. Apparently, this exciting
1480  * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93)
1481  * failed somewhere. Latency? Burstiness? Well, at least now we will
1482  * see why it failed. 8)8)				  --ANK
1483  *
1484  */
1485 bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
1486 {
1487 	struct tcp_sock *tp = tcp_sk(sk);
1488 
1489 	if (sysctl_tcp_low_latency || !tp->ucopy.task)
1490 		return false;
1491 
1492 	if (skb->len <= tcp_hdrlen(skb) &&
1493 	    skb_queue_len(&tp->ucopy.prequeue) == 0)
1494 		return false;
1495 
1496 	/* Before escaping RCU protected region, we need to take care of skb
1497 	 * dst. Prequeue is only enabled for established sockets.
1498 	 * For such sockets, we might need the skb dst only to set sk->sk_rx_dst.
1499 	 * Instead of doing full sk_rx_dst validity here, let's perform
1500 	 * an optimistic check.
1501 	 */
1502 	if (likely(sk->sk_rx_dst))
1503 		skb_dst_drop(skb);
1504 	else
1505 		skb_dst_force_safe(skb);
1506 
1507 	__skb_queue_tail(&tp->ucopy.prequeue, skb);
1508 	tp->ucopy.memory += skb->truesize;
1509 	if (tp->ucopy.memory > sk->sk_rcvbuf) {
1510 		struct sk_buff *skb1;
1511 
1512 		BUG_ON(sock_owned_by_user(sk));
1513 
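		/* The prequeue has overflowed the receive buffer: flush it by
		 * processing the queued segments right here in softirq context.
		 */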
1514 		while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) {
1515 			sk_backlog_rcv(sk, skb1);
1516 			NET_INC_STATS_BH(sock_net(sk),
1517 					 LINUX_MIB_TCPPREQUEUEDROPPED);
1518 		}
1519 
1520 		tp->ucopy.memory = 0;
1521 	} else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
1522 		wake_up_interruptible_sync_poll(sk_sleep(sk),
1523 					   POLLIN | POLLRDNORM | POLLRDBAND);
1524 		if (!inet_csk_ack_scheduled(sk))
1525 			inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
1526 						  (3 * tcp_rto_min(sk)) / 4,
1527 						  TCP_RTO_MAX);
1528 	}
1529 	return true;
1530 }
1531 EXPORT_SYMBOL(tcp_prequeue);
1532 
1533 /*
1534  *	From tcp_input.c
1535  */
1536 
1537 int tcp_v4_rcv(struct sk_buff *skb)
1538 {
1539 	struct net *net = dev_net(skb->dev);
1540 	const struct iphdr *iph;
1541 	const struct tcphdr *th;
1542 	bool refcounted;
1543 	struct sock *sk;
1544 	int ret;
1545 
1546 	if (skb->pkt_type != PACKET_HOST)
1547 		goto discard_it;
1548 
1549 	/* Count it even if it's bad */
1550 	__TCP_INC_STATS(net, TCP_MIB_INSEGS);
1551 
1552 	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1553 		goto discard_it;
1554 
1555 	th = tcp_hdr(skb);
1556 
1557 	if (th->doff < sizeof(struct tcphdr) / 4)
1558 		goto bad_packet;
1559 	if (!pskb_may_pull(skb, th->doff * 4))
1560 		goto discard_it;
1561 
1562 	/* An explanation is required here, I think.
1563 	 * Packet length and doff are validated by header prediction,
1564 	 * provided the case of th->doff == 0 is eliminated.
1565 	 * So, we defer the checks. */
1566 
1567 	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1568 		goto csum_error;
1569 
1570 	th = tcp_hdr(skb);
1571 	iph = ip_hdr(skb);
1572 	/* This is tricky: we move the IPCB to its correct location inside TCP_SKB_CB();
1573 	 * barrier() makes sure the compiler won't play fool^Waliasing games.
1574 	 */
1575 	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1576 		sizeof(struct inet_skb_parm));
1577 	barrier();
1578 
1579 	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1580 	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1581 				    skb->len - th->doff * 4);
1582 	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1583 	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1584 	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1585 	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1586 	TCP_SKB_CB(skb)->sacked	 = 0;
1587 
1588 lookup:
1589 	sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1590 			       th->dest, &refcounted);
1591 	if (!sk)
1592 		goto no_tcp_socket;
1593 
1594 process:
1595 	if (sk->sk_state == TCP_TIME_WAIT)
1596 		goto do_time_wait;
1597 
1598 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
1599 		struct request_sock *req = inet_reqsk(sk);
1600 		struct sock *nsk;
1601 
1602 		sk = req->rsk_listener;
1603 		if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
1604 			reqsk_put(req);
1605 			goto discard_it;
1606 		}
1607 		if (unlikely(sk->sk_state != TCP_LISTEN)) {
1608 			inet_csk_reqsk_queue_drop_and_put(sk, req);
1609 			goto lookup;
1610 		}
1611 		/* We own a reference on the listener, increase it again
1612 		 * as we might lose it too soon.
1613 		 */
1614 		sock_hold(sk);
1615 		refcounted = true;
1616 		nsk = tcp_check_req(sk, skb, req, false);
1617 		if (!nsk) {
1618 			reqsk_put(req);
1619 			goto discard_and_relse;
1620 		}
1621 		if (nsk == sk) {
1622 			reqsk_put(req);
1623 		} else if (tcp_child_process(sk, nsk, skb)) {
1624 			tcp_v4_send_reset(nsk, skb);
1625 			goto discard_and_relse;
1626 		} else {
1627 			sock_put(sk);
1628 			return 0;
1629 		}
1630 	}
1631 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1632 		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
1633 		goto discard_and_relse;
1634 	}
1635 
1636 	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1637 		goto discard_and_relse;
1638 
1639 	if (tcp_v4_inbound_md5_hash(sk, skb))
1640 		goto discard_and_relse;
1641 
1642 	nf_reset(skb);
1643 
1644 	if (sk_filter(sk, skb))
1645 		goto discard_and_relse;
1646 
1647 	skb->dev = NULL;
1648 
1649 	if (sk->sk_state == TCP_LISTEN) {
1650 		ret = tcp_v4_do_rcv(sk, skb);
1651 		goto put_and_return;
1652 	}
1653 
1654 	sk_incoming_cpu_update(sk);
1655 
1656 	bh_lock_sock_nested(sk);
1657 	tcp_segs_in(tcp_sk(sk), skb);
1658 	ret = 0;
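	/* Process the segment now (possibly via the prequeue) unless the
	 * socket is owned by a user context, in which case it is queued on
	 * the backlog, bounded by rcvbuf + sndbuf.
	 */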
1659 	if (!sock_owned_by_user(sk)) {
1660 		if (!tcp_prequeue(sk, skb))
1661 			ret = tcp_v4_do_rcv(sk, skb);
1662 	} else if (unlikely(sk_add_backlog(sk, skb,
1663 					   sk->sk_rcvbuf + sk->sk_sndbuf))) {
1664 		bh_unlock_sock(sk);
1665 		NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
1666 		goto discard_and_relse;
1667 	}
1668 	bh_unlock_sock(sk);
1669 
1670 put_and_return:
1671 	if (refcounted)
1672 		sock_put(sk);
1673 
1674 	return ret;
1675 
1676 no_tcp_socket:
1677 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1678 		goto discard_it;
1679 
1680 	if (tcp_checksum_complete(skb)) {
1681 csum_error:
1682 		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
1683 bad_packet:
1684 		__TCP_INC_STATS(net, TCP_MIB_INERRS);
1685 	} else {
1686 		tcp_v4_send_reset(NULL, skb);
1687 	}
1688 
1689 discard_it:
1690 	/* Discard frame. */
1691 	kfree_skb(skb);
1692 	return 0;
1693 
1694 discard_and_relse:
1695 	sk_drops_add(sk, skb);
1696 	if (refcounted)
1697 		sock_put(sk);
1698 	goto discard_it;
1699 
1700 do_time_wait:
1701 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1702 		inet_twsk_put(inet_twsk(sk));
1703 		goto discard_it;
1704 	}
1705 
1706 	if (tcp_checksum_complete(skb)) {
1707 		inet_twsk_put(inet_twsk(sk));
1708 		goto csum_error;
1709 	}
1710 	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
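	/* A new SYN arrived on the TIME-WAIT socket: look for a current
	 * listener so the connection can be reopened, otherwise fall
	 * through and just ACK.
	 */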
1711 	case TCP_TW_SYN: {
1712 		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1713 							&tcp_hashinfo, skb,
1714 							__tcp_hdrlen(th),
1715 							iph->saddr, th->source,
1716 							iph->daddr, th->dest,
1717 							inet_iif(skb));
1718 		if (sk2) {
1719 			inet_twsk_deschedule_put(inet_twsk(sk));
1720 			sk = sk2;
1721 			refcounted = false;
1722 			goto process;
1723 		}
1724 		/* Fall through to ACK */
1725 	}
1726 	case TCP_TW_ACK:
1727 		tcp_v4_timewait_ack(sk, skb);
1728 		break;
1729 	case TCP_TW_RST:
1730 		tcp_v4_send_reset(sk, skb);
1731 		inet_twsk_deschedule_put(inet_twsk(sk));
1732 		goto discard_it;
1733 	case TCP_TW_SUCCESS:;
1734 	}
1735 	goto discard_it;
1736 }
1737 
1738 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1739 	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
1740 	.twsk_unique	= tcp_twsk_unique,
1741 	.twsk_destructor= tcp_twsk_destructor,
1742 };
1743 
1744 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
1745 {
1746 	struct dst_entry *dst = skb_dst(skb);
1747 
1748 	if (dst && dst_hold_safe(dst)) {
1749 		sk->sk_rx_dst = dst;
1750 		inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
1751 	}
1752 }
1753 EXPORT_SYMBOL(inet_sk_rx_dst_set);
1754 
1755 const struct inet_connection_sock_af_ops ipv4_specific = {
1756 	.queue_xmit	   = ip_queue_xmit,
1757 	.send_check	   = tcp_v4_send_check,
1758 	.rebuild_header	   = inet_sk_rebuild_header,
1759 	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
1760 	.conn_request	   = tcp_v4_conn_request,
1761 	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
1762 	.net_header_len	   = sizeof(struct iphdr),
1763 	.setsockopt	   = ip_setsockopt,
1764 	.getsockopt	   = ip_getsockopt,
1765 	.addr2sockaddr	   = inet_csk_addr2sockaddr,
1766 	.sockaddr_len	   = sizeof(struct sockaddr_in),
1767 	.bind_conflict	   = inet_csk_bind_conflict,
1768 #ifdef CONFIG_COMPAT
1769 	.compat_setsockopt = compat_ip_setsockopt,
1770 	.compat_getsockopt = compat_ip_getsockopt,
1771 #endif
1772 	.mtu_reduced	   = tcp_v4_mtu_reduced,
1773 };
1774 EXPORT_SYMBOL(ipv4_specific);
1775 
1776 #ifdef CONFIG_TCP_MD5SIG
1777 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1778 	.md5_lookup		= tcp_v4_md5_lookup,
1779 	.calc_md5_hash		= tcp_v4_md5_hash_skb,
1780 	.md5_parse		= tcp_v4_parse_md5_keys,
1781 };
1782 #endif
1783 
1784 /* NOTE: A lot of things are set to zero explicitly by the call to
1785  *       sk_alloc(), so they need not be done here.
1786  */
1787 static int tcp_v4_init_sock(struct sock *sk)
1788 {
1789 	struct inet_connection_sock *icsk = inet_csk(sk);
1790 
1791 	tcp_init_sock(sk);
1792 
1793 	icsk->icsk_af_ops = &ipv4_specific;
1794 
1795 #ifdef CONFIG_TCP_MD5SIG
1796 	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
1797 #endif
1798 
1799 	return 0;
1800 }
1801 
1802 void tcp_v4_destroy_sock(struct sock *sk)
1803 {
1804 	struct tcp_sock *tp = tcp_sk(sk);
1805 
1806 	tcp_clear_xmit_timers(sk);
1807 
1808 	tcp_cleanup_congestion_control(sk);
1809 
1810 	/* Clean up the write buffer. */
1811 	tcp_write_queue_purge(sk);
1812 
1813 	/* Clean up our, hopefully empty, out_of_order_queue. */
1814 	__skb_queue_purge(&tp->out_of_order_queue);
1815 
1816 #ifdef CONFIG_TCP_MD5SIG
1817 	/* Clean up the MD5 key list, if any */
1818 	if (tp->md5sig_info) {
1819 		tcp_clear_md5_list(sk);
1820 		kfree_rcu(tp->md5sig_info, rcu);
1821 		tp->md5sig_info = NULL;
1822 	}
1823 #endif
1824 
1825 	/* Clean up the prequeue; it really should be empty. */
1826 	__skb_queue_purge(&tp->ucopy.prequeue);
1827 
1828 	/* Clean up a referenced TCP bind bucket. */
1829 	if (inet_csk(sk)->icsk_bind_hash)
1830 		inet_put_port(sk);
1831 
1832 	BUG_ON(tp->fastopen_rsk);
1833 
1834 	/* If socket is aborted during connect operation */
1835 	tcp_free_fastopen_req(tp);
1836 	tcp_saved_syn_free(tp);
1837 
1838 	sk_sockets_allocated_dec(sk);
1839 
1840 	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
1841 		sock_release_memcg(sk);
1842 }
1843 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1844 
1845 #ifdef CONFIG_PROC_FS
1846 /* Proc filesystem TCP sock list dumping. */
1847 
1848 /*
1849  * Get the next listener socket following cur.  If cur is NULL, get the
1850  * first socket starting from the bucket given in st->bucket; when
1851  * st->bucket is zero, the very first socket in the hash table is returned.
1852  */
1853 static void *listening_get_next(struct seq_file *seq, void *cur)
1854 {
1855 	struct tcp_iter_state *st = seq->private;
1856 	struct net *net = seq_file_net(seq);
1857 	struct inet_listen_hashbucket *ilb;
1858 	struct inet_connection_sock *icsk;
1859 	struct sock *sk = cur;
1860 
1861 	if (!sk) {
1862 get_head:
1863 		ilb = &tcp_hashinfo.listening_hash[st->bucket];
1864 		spin_lock_bh(&ilb->lock);
1865 		sk = sk_head(&ilb->head);
1866 		st->offset = 0;
1867 		goto get_sk;
1868 	}
1869 	ilb = &tcp_hashinfo.listening_hash[st->bucket];
1870 	++st->num;
1871 	++st->offset;
1872 
1873 	sk = sk_next(sk);
1874 get_sk:
1875 	sk_for_each_from(sk) {
1876 		if (!net_eq(sock_net(sk), net))
1877 			continue;
1878 		if (sk->sk_family == st->family)
1879 			return sk;
1880 		icsk = inet_csk(sk);
1881 	}
1882 	spin_unlock_bh(&ilb->lock);
1883 	st->offset = 0;
1884 	if (++st->bucket < INET_LHTABLE_SIZE)
1885 		goto get_head;
1886 	return NULL;
1887 }
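/*
 * NOTE (editorial): when listening_get_next() returns a socket, the
 * corresponding listening_hash bucket lock is still held; it is released
 * either above, once the walk moves past the bucket, or in tcp_seq_stop().
 * The established-hash walk below follows the same discipline.
 */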
1888 
1889 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1890 {
1891 	struct tcp_iter_state *st = seq->private;
1892 	void *rc;
1893 
1894 	st->bucket = 0;
1895 	st->offset = 0;
1896 	rc = listening_get_next(seq, NULL);
1897 
1898 	while (rc && *pos) {
1899 		rc = listening_get_next(seq, rc);
1900 		--*pos;
1901 	}
1902 	return rc;
1903 }
1904 
1905 static inline bool empty_bucket(const struct tcp_iter_state *st)
1906 {
1907 	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
1908 }
1909 
1910 /*
1911  * Get the first established socket, starting from the bucket given in
1912  * st->bucket.  If st->bucket is zero, the very first socket is returned.
1913  */
1914 static void *established_get_first(struct seq_file *seq)
1915 {
1916 	struct tcp_iter_state *st = seq->private;
1917 	struct net *net = seq_file_net(seq);
1918 	void *rc = NULL;
1919 
1920 	st->offset = 0;
1921 	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
1922 		struct sock *sk;
1923 		struct hlist_nulls_node *node;
1924 		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
1925 
1926 		/* Lockless fast path for the common case of empty buckets */
1927 		if (empty_bucket(st))
1928 			continue;
1929 
1930 		spin_lock_bh(lock);
1931 		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
1932 			if (sk->sk_family != st->family ||
1933 			    !net_eq(sock_net(sk), net)) {
1934 				continue;
1935 			}
1936 			rc = sk;
1937 			goto out;
1938 		}
1939 		spin_unlock_bh(lock);
1940 	}
1941 out:
1942 	return rc;
1943 }
1944 
1945 static void *established_get_next(struct seq_file *seq, void *cur)
1946 {
1947 	struct sock *sk = cur;
1948 	struct hlist_nulls_node *node;
1949 	struct tcp_iter_state *st = seq->private;
1950 	struct net *net = seq_file_net(seq);
1951 
1952 	++st->num;
1953 	++st->offset;
1954 
1955 	sk = sk_nulls_next(sk);
1956 
1957 	sk_nulls_for_each_from(sk, node) {
1958 		if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
1959 			return sk;
1960 	}
1961 
1962 	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
1963 	++st->bucket;
1964 	return established_get_first(seq);
1965 }
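/*
 * NOTE (editorial): as with the listening walk, the ehash bucket lock taken
 * in established_get_first() stays held while a socket is being shown, and
 * is dropped here (or in tcp_seq_stop()) before moving on to the next bucket.
 */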
1966 
1967 static void *established_get_idx(struct seq_file *seq, loff_t pos)
1968 {
1969 	struct tcp_iter_state *st = seq->private;
1970 	void *rc;
1971 
1972 	st->bucket = 0;
1973 	rc = established_get_first(seq);
1974 
1975 	while (rc && pos) {
1976 		rc = established_get_next(seq, rc);
1977 		--pos;
1978 	}
1979 	return rc;
1980 }
1981 
1982 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
1983 {
1984 	void *rc;
1985 	struct tcp_iter_state *st = seq->private;
1986 
1987 	st->state = TCP_SEQ_STATE_LISTENING;
1988 	rc	  = listening_get_idx(seq, &pos);
1989 
1990 	if (!rc) {
1991 		st->state = TCP_SEQ_STATE_ESTABLISHED;
1992 		rc	  = established_get_idx(seq, pos);
1993 	}
1994 
1995 	return rc;
1996 }
1997 
1998 static void *tcp_seek_last_pos(struct seq_file *seq)
1999 {
2000 	struct tcp_iter_state *st = seq->private;
2001 	int offset = st->offset;
2002 	int orig_num = st->num;
2003 	void *rc = NULL;
2004 
2005 	switch (st->state) {
2006 	case TCP_SEQ_STATE_LISTENING:
2007 		if (st->bucket >= INET_LHTABLE_SIZE)
2008 			break;
2009 		st->state = TCP_SEQ_STATE_LISTENING;
2010 		rc = listening_get_next(seq, NULL);
2011 		while (offset-- && rc)
2012 			rc = listening_get_next(seq, rc);
2013 		if (rc)
2014 			break;
2015 		st->bucket = 0;
2016 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2017 		/* Fallthrough */
2018 	case TCP_SEQ_STATE_ESTABLISHED:
2019 		if (st->bucket > tcp_hashinfo.ehash_mask)
2020 			break;
2021 		rc = established_get_first(seq);
2022 		while (offset-- && rc)
2023 			rc = established_get_next(seq, rc);
2024 	}
2025 
2026 	st->num = orig_num;
2027 
2028 	return rc;
2029 }
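/*
 * NOTE (editorial): tcp_seek_last_pos() resumes the walk at the bucket and
 * in-bucket offset remembered from the previous read() of the seq file.
 * Sockets created or destroyed in between may be skipped or shown twice, so
 * the dump is best-effort rather than a consistent snapshot.
 */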
2030 
2031 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2032 {
2033 	struct tcp_iter_state *st = seq->private;
2034 	void *rc;
2035 
2036 	if (*pos && *pos == st->last_pos) {
2037 		rc = tcp_seek_last_pos(seq);
2038 		if (rc)
2039 			goto out;
2040 	}
2041 
2042 	st->state = TCP_SEQ_STATE_LISTENING;
2043 	st->num = 0;
2044 	st->bucket = 0;
2045 	st->offset = 0;
2046 	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2047 
2048 out:
2049 	st->last_pos = *pos;
2050 	return rc;
2051 }
2052 
2053 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2054 {
2055 	struct tcp_iter_state *st = seq->private;
2056 	void *rc = NULL;
2057 
2058 	if (v == SEQ_START_TOKEN) {
2059 		rc = tcp_get_idx(seq, 0);
2060 		goto out;
2061 	}
2062 
2063 	switch (st->state) {
2064 	case TCP_SEQ_STATE_LISTENING:
2065 		rc = listening_get_next(seq, v);
2066 		if (!rc) {
2067 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2068 			st->bucket = 0;
2069 			st->offset = 0;
2070 			rc	  = established_get_first(seq);
2071 		}
2072 		break;
2073 	case TCP_SEQ_STATE_ESTABLISHED:
2074 		rc = established_get_next(seq, v);
2075 		break;
2076 	}
2077 out:
2078 	++*pos;
2079 	st->last_pos = *pos;
2080 	return rc;
2081 }
2082 
2083 static void tcp_seq_stop(struct seq_file *seq, void *v)
2084 {
2085 	struct tcp_iter_state *st = seq->private;
2086 
2087 	switch (st->state) {
2088 	case TCP_SEQ_STATE_LISTENING:
2089 		if (v != SEQ_START_TOKEN)
2090 			spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2091 		break;
2092 	case TCP_SEQ_STATE_ESTABLISHED:
2093 		if (v)
2094 			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2095 		break;
2096 	}
2097 }
2098 
2099 int tcp_seq_open(struct inode *inode, struct file *file)
2100 {
2101 	struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
2102 	struct tcp_iter_state *s;
2103 	int err;
2104 
2105 	err = seq_open_net(inode, file, &afinfo->seq_ops,
2106 			  sizeof(struct tcp_iter_state));
2107 	if (err < 0)
2108 		return err;
2109 
2110 	s = ((struct seq_file *)file->private_data)->private;
2111 	s->family		= afinfo->family;
2112 	s->last_pos		= 0;
2113 	return 0;
2114 }
2115 EXPORT_SYMBOL(tcp_seq_open);
2116 
2117 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2118 {
2119 	int rc = 0;
2120 	struct proc_dir_entry *p;
2121 
2122 	afinfo->seq_ops.start		= tcp_seq_start;
2123 	afinfo->seq_ops.next		= tcp_seq_next;
2124 	afinfo->seq_ops.stop		= tcp_seq_stop;
2125 
2126 	p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2127 			     afinfo->seq_fops, afinfo);
2128 	if (!p)
2129 		rc = -ENOMEM;
2130 	return rc;
2131 }
2132 EXPORT_SYMBOL(tcp_proc_register);
2133 
2134 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2135 {
2136 	remove_proc_entry(afinfo->name, net->proc_net);
2137 }
2138 EXPORT_SYMBOL(tcp_proc_unregister);
2139 
2140 static void get_openreq4(const struct request_sock *req,
2141 			 struct seq_file *f, int i)
2142 {
2143 	const struct inet_request_sock *ireq = inet_rsk(req);
2144 	long delta = req->rsk_timer.expires - jiffies;
2145 
2146 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2147 		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2148 		i,
2149 		ireq->ir_loc_addr,
2150 		ireq->ir_num,
2151 		ireq->ir_rmt_addr,
2152 		ntohs(ireq->ir_rmt_port),
2153 		TCP_SYN_RECV,
2154 		0, 0, /* could print option size, but that is af dependent. */
2155 		1,    /* timers active (only the expire timer) */
2156 		jiffies_delta_to_clock_t(delta),
2157 		req->num_timeout,
2158 		from_kuid_munged(seq_user_ns(f),
2159 				 sock_i_uid(req->rsk_listener)),
2160 		0,  /* non standard timer */
2161 		0, /* open_requests have no inode */
2162 		0,
2163 		req);
2164 }
2165 
2166 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2167 {
2168 	int timer_active;
2169 	unsigned long timer_expires;
2170 	const struct tcp_sock *tp = tcp_sk(sk);
2171 	const struct inet_connection_sock *icsk = inet_csk(sk);
2172 	const struct inet_sock *inet = inet_sk(sk);
2173 	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2174 	__be32 dest = inet->inet_daddr;
2175 	__be32 src = inet->inet_rcv_saddr;
2176 	__u16 destp = ntohs(inet->inet_dport);
2177 	__u16 srcp = ntohs(inet->inet_sport);
2178 	int rx_queue;
2179 	int state;
2180 
2181 	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2182 	    icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
2183 	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2184 		timer_active	= 1;
2185 		timer_expires	= icsk->icsk_timeout;
2186 	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2187 		timer_active	= 4;
2188 		timer_expires	= icsk->icsk_timeout;
2189 	} else if (timer_pending(&sk->sk_timer)) {
2190 		timer_active	= 2;
2191 		timer_expires	= sk->sk_timer.expires;
2192 	} else {
2193 		timer_active	= 0;
2194 		timer_expires = jiffies;
2195 	}
2196 
2197 	state = sk_state_load(sk);
2198 	if (state == TCP_LISTEN)
2199 		rx_queue = sk->sk_ack_backlog;
2200 	else
2201 		/* Because we don't lock the socket,
2202 		 * we might find a transient negative value.
2203 		 */
2204 		rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2205 
2206 	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2207 			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2208 		i, src, srcp, dest, destp, state,
2209 		tp->write_seq - tp->snd_una,
2210 		rx_queue,
2211 		timer_active,
2212 		jiffies_delta_to_clock_t(timer_expires - jiffies),
2213 		icsk->icsk_retransmits,
2214 		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2215 		icsk->icsk_probes_out,
2216 		sock_i_ino(sk),
2217 		atomic_read(&sk->sk_refcnt), sk,
2218 		jiffies_to_clock_t(icsk->icsk_rto),
2219 		jiffies_to_clock_t(icsk->icsk_ack.ato),
2220 		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2221 		tp->snd_cwnd,
2222 		state == TCP_LISTEN ?
2223 		    fastopenq->max_qlen :
2224 		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2225 }
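/*
 * NOTE (editorial, illustrative only): a /proc/net/tcp line produced by
 * get_tcp4_sock() looks roughly like the following; all values below are
 * made up, spacing is approximate, and the socket pointer printed with %pK
 * may be censored depending on kptr_restrict:
 *
 *    0: 0100007F:0016 0100007F:9C40 01 00000000:00000000 00:00000000 00000000  1000        0 23456 1 ffff880012345678 20 4 30 10 -1
 *
 * i.e. slot, local addr:port, remote addr:port, state, tx_queue:rx_queue,
 * timer:expiry, retransmits, uid, probes, inode, refcnt, sock ptr, rto, ato,
 * (quick << 1) | pingpong, snd_cwnd, and ssthresh (or max_qlen for listeners).
 */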
2226 
2227 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2228 			       struct seq_file *f, int i)
2229 {
2230 	long delta = tw->tw_timer.expires - jiffies;
2231 	__be32 dest, src;
2232 	__u16 destp, srcp;
2233 
2234 	dest  = tw->tw_daddr;
2235 	src   = tw->tw_rcv_saddr;
2236 	destp = ntohs(tw->tw_dport);
2237 	srcp  = ntohs(tw->tw_sport);
2238 
2239 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2240 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2241 		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2242 		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2243 		atomic_read(&tw->tw_refcnt), tw);
2244 }
2245 
2246 #define TMPSZ 150
2247 
2248 static int tcp4_seq_show(struct seq_file *seq, void *v)
2249 {
2250 	struct tcp_iter_state *st;
2251 	struct sock *sk = v;
2252 
2253 	seq_setwidth(seq, TMPSZ - 1);
2254 	if (v == SEQ_START_TOKEN) {
2255 		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2256 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2257 			   "inode");
2258 		goto out;
2259 	}
2260 	st = seq->private;
2261 
2262 	if (sk->sk_state == TCP_TIME_WAIT)
2263 		get_timewait4_sock(v, seq, st->num);
2264 	else if (sk->sk_state == TCP_NEW_SYN_RECV)
2265 		get_openreq4(v, seq, st->num);
2266 	else
2267 		get_tcp4_sock(v, seq, st->num);
2268 out:
2269 	seq_pad(seq, '\n');
2270 	return 0;
2271 }
2272 
2273 static const struct file_operations tcp_afinfo_seq_fops = {
2274 	.owner   = THIS_MODULE,
2275 	.open    = tcp_seq_open,
2276 	.read    = seq_read,
2277 	.llseek  = seq_lseek,
2278 	.release = seq_release_net
2279 };
2280 
2281 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2282 	.name		= "tcp",
2283 	.family		= AF_INET,
2284 	.seq_fops	= &tcp_afinfo_seq_fops,
2285 	.seq_ops	= {
2286 		.show		= tcp4_seq_show,
2287 	},
2288 };
2289 
2290 static int __net_init tcp4_proc_init_net(struct net *net)
2291 {
2292 	return tcp_proc_register(net, &tcp4_seq_afinfo);
2293 }
2294 
2295 static void __net_exit tcp4_proc_exit_net(struct net *net)
2296 {
2297 	tcp_proc_unregister(net, &tcp4_seq_afinfo);
2298 }
2299 
2300 static struct pernet_operations tcp4_net_ops = {
2301 	.init = tcp4_proc_init_net,
2302 	.exit = tcp4_proc_exit_net,
2303 };
2304 
2305 int __init tcp4_proc_init(void)
2306 {
2307 	return register_pernet_subsys(&tcp4_net_ops);
2308 }
2309 
2310 void tcp4_proc_exit(void)
2311 {
2312 	unregister_pernet_subsys(&tcp4_net_ops);
2313 }
2314 #endif /* CONFIG_PROC_FS */
2315 
2316 struct proto tcp_prot = {
2317 	.name			= "TCP",
2318 	.owner			= THIS_MODULE,
2319 	.close			= tcp_close,
2320 	.connect		= tcp_v4_connect,
2321 	.disconnect		= tcp_disconnect,
2322 	.accept			= inet_csk_accept,
2323 	.ioctl			= tcp_ioctl,
2324 	.init			= tcp_v4_init_sock,
2325 	.destroy		= tcp_v4_destroy_sock,
2326 	.shutdown		= tcp_shutdown,
2327 	.setsockopt		= tcp_setsockopt,
2328 	.getsockopt		= tcp_getsockopt,
2329 	.recvmsg		= tcp_recvmsg,
2330 	.sendmsg		= tcp_sendmsg,
2331 	.sendpage		= tcp_sendpage,
2332 	.backlog_rcv		= tcp_v4_do_rcv,
2333 	.release_cb		= tcp_release_cb,
2334 	.hash			= inet_hash,
2335 	.unhash			= inet_unhash,
2336 	.get_port		= inet_csk_get_port,
2337 	.enter_memory_pressure	= tcp_enter_memory_pressure,
2338 	.stream_memory_free	= tcp_stream_memory_free,
2339 	.sockets_allocated	= &tcp_sockets_allocated,
2340 	.orphan_count		= &tcp_orphan_count,
2341 	.memory_allocated	= &tcp_memory_allocated,
2342 	.memory_pressure	= &tcp_memory_pressure,
2343 	.sysctl_mem		= sysctl_tcp_mem,
2344 	.sysctl_wmem		= sysctl_tcp_wmem,
2345 	.sysctl_rmem		= sysctl_tcp_rmem,
2346 	.max_header		= MAX_TCP_HEADER,
2347 	.obj_size		= sizeof(struct tcp_sock),
2348 	.slab_flags		= SLAB_DESTROY_BY_RCU,
2349 	.twsk_prot		= &tcp_timewait_sock_ops,
2350 	.rsk_prot		= &tcp_request_sock_ops,
2351 	.h.hashinfo		= &tcp_hashinfo,
2352 	.no_autobind		= true,
2353 #ifdef CONFIG_COMPAT
2354 	.compat_setsockopt	= compat_tcp_setsockopt,
2355 	.compat_getsockopt	= compat_tcp_getsockopt,
2356 #endif
2357 	.diag_destroy		= tcp_abort,
2358 };
2359 EXPORT_SYMBOL(tcp_prot);
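/*
 * NOTE (editorial): tcp_prot itself is registered with the socket layer from
 * inet_init() in net/ipv4/af_inet.c (via proto_register() and the inetsw
 * table); this file only defines the IPv4-specific callbacks it points at.
 */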
2360 
2361 static void __net_exit tcp_sk_exit(struct net *net)
2362 {
2363 	int cpu;
2364 
2365 	for_each_possible_cpu(cpu)
2366 		inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2367 	free_percpu(net->ipv4.tcp_sk);
2368 }
2369 
2370 static int __net_init tcp_sk_init(struct net *net)
2371 {
2372 	int res, cpu;
2373 
2374 	net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2375 	if (!net->ipv4.tcp_sk)
2376 		return -ENOMEM;
2377 
2378 	for_each_possible_cpu(cpu) {
2379 		struct sock *sk;
2380 
2381 		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2382 					   IPPROTO_TCP, net);
2383 		if (res)
2384 			goto fail;
2385 		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2386 		*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2387 	}
2388 
2389 	net->ipv4.sysctl_tcp_ecn = 2;
2390 	net->ipv4.sysctl_tcp_ecn_fallback = 1;
2391 
2392 	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2393 	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2394 	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2395 
2396 	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2397 	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2398 	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2399 
2400 	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2401 	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2402 	net->ipv4.sysctl_tcp_syncookies = 1;
2403 	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2404 	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2405 	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2406 	net->ipv4.sysctl_tcp_orphan_retries = 0;
2407 	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2408 	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
2409 
2410 	return 0;
2411 fail:
2412 	tcp_sk_exit(net);
2413 
2414 	return res;
2415 }
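/*
 * NOTE (editorial): the per-cpu kernel sockets created above act as the
 * per-netns "control" sockets that tcp_v4_send_reset() and tcp_v4_send_ack()
 * transmit from when no full socket exists for an incoming segment.
 */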
2416 
2417 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2418 {
2419 	inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2420 }
2421 
2422 static struct pernet_operations __net_initdata tcp_sk_ops = {
2423        .init	   = tcp_sk_init,
2424        .exit	   = tcp_sk_exit,
2425        .exit_batch = tcp_sk_exit_batch,
2426 };
2427 
2428 void __init tcp_v4_init(void)
2429 {
2430 	inet_hashinfo_init(&tcp_hashinfo);
2431 	if (register_pernet_subsys(&tcp_sk_ops))
2432 		panic("Failed to create the TCP control socket.\n");
2433 }
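/*
 * NOTE (editorial): tcp_v4_init() is invoked from inet_init() during boot;
 * failure to register the pernet operations is treated as fatal because the
 * per-netns control sockets set up in tcp_sk_init() are needed to send RSTs
 * and ACKs.
 */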
2434