xref: /openbmc/linux/net/ipv4/tcp_ipv4.c (revision 77a87824)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		Implementation of the Transmission Control Protocol(TCP).
7  *
8  *		IPv4 specific functions
9  *
10  *
11  *		code split from:
12  *		linux/ipv4/tcp.c
13  *		linux/ipv4/tcp_input.c
14  *		linux/ipv4/tcp_output.c
15  *
16  *		See tcp.c for author information
17  *
18  *	This program is free software; you can redistribute it and/or
19  *      modify it under the terms of the GNU General Public License
20  *      as published by the Free Software Foundation; either version
21  *      2 of the License, or (at your option) any later version.
22  */
23 
24 /*
25  * Changes:
26  *		David S. Miller	:	New socket lookup architecture.
27  *					This code is dedicated to John Dyson.
28  *		David S. Miller :	Change semantics of established hash,
29  *					half is devoted to TIME_WAIT sockets
30  *					and the rest go in the other half.
31  *		Andi Kleen :		Add support for syncookies and fixed
32  *					some bugs: ip options weren't passed to
33  *					the TCP layer, missed a check for an
34  *					ACK bit.
35  *		Andi Kleen :		Implemented fast path mtu discovery.
36  *	     				Fixed many serious bugs in the
37  *					request_sock handling and moved
38  *					most of it into the af independent code.
39  *					Added tail drop and some other bugfixes.
40  *					Added new listen semantics.
41  *		Mike McLagan	:	Routing by source
42  *	Juan Jose Ciarlante:		ip_dynaddr bits
43  *		Andi Kleen:		various fixes.
44  *	Vitaly E. Lavrov	:	Transparent proxy revived after year
45  *					coma.
46  *	Andi Kleen		:	Fix new listen.
47  *	Andi Kleen		:	Fix accept error reporting.
48  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
49  *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
50  *					a single port at the same time.
51  */
52 
53 #define pr_fmt(fmt) "TCP: " fmt
54 
55 #include <linux/bottom_half.h>
56 #include <linux/types.h>
57 #include <linux/fcntl.h>
58 #include <linux/module.h>
59 #include <linux/random.h>
60 #include <linux/cache.h>
61 #include <linux/jhash.h>
62 #include <linux/init.h>
63 #include <linux/times.h>
64 #include <linux/slab.h>
65 
66 #include <net/net_namespace.h>
67 #include <net/icmp.h>
68 #include <net/inet_hashtables.h>
69 #include <net/tcp.h>
70 #include <net/transp_v6.h>
71 #include <net/ipv6.h>
72 #include <net/inet_common.h>
73 #include <net/timewait_sock.h>
74 #include <net/xfrm.h>
75 #include <net/secure_seq.h>
76 #include <net/busy_poll.h>
77 
78 #include <linux/inet.h>
79 #include <linux/ipv6.h>
80 #include <linux/stddef.h>
81 #include <linux/proc_fs.h>
82 #include <linux/seq_file.h>
83 
84 #include <crypto/hash.h>
85 #include <linux/scatterlist.h>
86 
87 int sysctl_tcp_tw_reuse __read_mostly;
88 int sysctl_tcp_low_latency __read_mostly;
89 EXPORT_SYMBOL(sysctl_tcp_low_latency);
90 
91 #ifdef CONFIG_TCP_MD5SIG
92 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
93 			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
94 #endif
95 
96 struct inet_hashinfo tcp_hashinfo;
97 EXPORT_SYMBOL(tcp_hashinfo);
98 
99 static  __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
100 {
101 	return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
102 					  ip_hdr(skb)->saddr,
103 					  tcp_hdr(skb)->dest,
104 					  tcp_hdr(skb)->source);
105 }
106 
107 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
108 {
109 	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
110 	struct tcp_sock *tp = tcp_sk(sk);
111 
112 	/* With PAWS, it is safe from the viewpoint
113 	   of data integrity. Even without PAWS it is safe provided sequence
114 	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.
115 
116 	   Actually, the idea is close to VJ's one, only timestamp cache is
117 	   held not per host, but per port pair and TW bucket is used as state
118 	   holder.
119 
120 	   If TW bucket has been already destroyed we fall back to VJ's scheme
121 	   and use initial timestamp retrieved from peer table.
122 	 */
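	/* Editor's note on the check below: reuse is allowed either for an
	 * internal caller (twp == NULL) or, with sysctl_tcp_tw_reuse, once
	 * the last timestamp seen from the peer is more than a second old.
	 * The new write_seq is pushed past tw_snd_nxt (by 65535 + 2) so that
	 * duplicates from the old incarnation are unlikely to land inside
	 * the new sequence space.
	 */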
123 	if (tcptw->tw_ts_recent_stamp &&
124 	    (!twp || (sysctl_tcp_tw_reuse &&
125 			     get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
126 		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
127 		if (tp->write_seq == 0)
128 			tp->write_seq = 1;
129 		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
130 		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
131 		sock_hold(sktw);
132 		return 1;
133 	}
134 
135 	return 0;
136 }
137 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
138 
139 /* This will initiate an outgoing connection. */
140 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
141 {
142 	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
143 	struct inet_sock *inet = inet_sk(sk);
144 	struct tcp_sock *tp = tcp_sk(sk);
145 	__be16 orig_sport, orig_dport;
146 	__be32 daddr, nexthop;
147 	struct flowi4 *fl4;
148 	struct rtable *rt;
149 	int err;
150 	struct ip_options_rcu *inet_opt;
151 
152 	if (addr_len < sizeof(struct sockaddr_in))
153 		return -EINVAL;
154 
155 	if (usin->sin_family != AF_INET)
156 		return -EAFNOSUPPORT;
157 
158 	nexthop = daddr = usin->sin_addr.s_addr;
159 	inet_opt = rcu_dereference_protected(inet->inet_opt,
160 					     lockdep_sock_is_held(sk));
161 	if (inet_opt && inet_opt->opt.srr) {
162 		if (!daddr)
163 			return -EINVAL;
164 		nexthop = inet_opt->opt.faddr;
165 	}
166 
167 	orig_sport = inet->inet_sport;
168 	orig_dport = usin->sin_port;
169 	fl4 = &inet->cork.fl.u.ip4;
170 	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
171 			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
172 			      IPPROTO_TCP,
173 			      orig_sport, orig_dport, sk);
174 	if (IS_ERR(rt)) {
175 		err = PTR_ERR(rt);
176 		if (err == -ENETUNREACH)
177 			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
178 		return err;
179 	}
180 
181 	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
182 		ip_rt_put(rt);
183 		return -ENETUNREACH;
184 	}
185 
186 	if (!inet_opt || !inet_opt->opt.srr)
187 		daddr = fl4->daddr;
188 
189 	if (!inet->inet_saddr)
190 		inet->inet_saddr = fl4->saddr;
191 	sk_rcv_saddr_set(sk, inet->inet_saddr);
192 
193 	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
194 		/* Reset inherited state */
195 		tp->rx_opt.ts_recent	   = 0;
196 		tp->rx_opt.ts_recent_stamp = 0;
197 		if (likely(!tp->repair))
198 			tp->write_seq	   = 0;
199 	}
200 
201 	if (tcp_death_row.sysctl_tw_recycle &&
202 	    !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
203 		tcp_fetch_timewait_stamp(sk, &rt->dst);
204 
205 	inet->inet_dport = usin->sin_port;
206 	sk_daddr_set(sk, daddr);
207 
208 	inet_csk(sk)->icsk_ext_hdr_len = 0;
209 	if (inet_opt)
210 		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
211 
212 	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
213 
214 	/* Socket identity is still unknown (sport may be zero).
215 	 * However, we set the state to SYN-SENT and, without releasing the
216 	 * socket lock, select a source port, enter ourselves into the hash
217 	 * tables and complete initialization after this.
218 	 */
219 	tcp_set_state(sk, TCP_SYN_SENT);
220 	err = inet_hash_connect(&tcp_death_row, sk);
221 	if (err)
222 		goto failure;
223 
224 	sk_set_txhash(sk);
225 
226 	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
227 			       inet->inet_sport, inet->inet_dport, sk);
228 	if (IS_ERR(rt)) {
229 		err = PTR_ERR(rt);
230 		rt = NULL;
231 		goto failure;
232 	}
233 	/* OK, now commit destination to socket.  */
234 	sk->sk_gso_type = SKB_GSO_TCPV4;
235 	sk_setup_caps(sk, &rt->dst);
236 
237 	if (!tp->write_seq && likely(!tp->repair))
238 		tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
239 							   inet->inet_daddr,
240 							   inet->inet_sport,
241 							   usin->sin_port);
242 
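	/* Seed the IP ID counter from the initial sequence number and
	 * jiffies so that back-to-back connections do not start with the
	 * same datagram IDs.
	 */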
243 	inet->inet_id = tp->write_seq ^ jiffies;
244 
245 	err = tcp_connect(sk);
246 
247 	rt = NULL;
248 	if (err)
249 		goto failure;
250 
251 	return 0;
252 
253 failure:
254 	/*
255 	 * This unhashes the socket and releases the local port,
256 	 * if necessary.
257 	 */
258 	tcp_set_state(sk, TCP_CLOSE);
259 	ip_rt_put(rt);
260 	sk->sk_route_caps = 0;
261 	inet->inet_dport = 0;
262 	return err;
263 }
264 EXPORT_SYMBOL(tcp_v4_connect);
265 
266 /*
267  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
268  * It can be called through tcp_release_cb() if socket was owned by user
269  * at the time tcp_v4_err() was called to handle ICMP message.
270  */
271 void tcp_v4_mtu_reduced(struct sock *sk)
272 {
273 	struct dst_entry *dst;
274 	struct inet_sock *inet = inet_sk(sk);
275 	u32 mtu = tcp_sk(sk)->mtu_info;
276 
277 	dst = inet_csk_update_pmtu(sk, mtu);
278 	if (!dst)
279 		return;
280 
281 	/* Something is about to go wrong... Remember the soft error
282 	 * in case this connection is not able to recover.
283 	 */
284 	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
285 		sk->sk_err_soft = EMSGSIZE;
286 
287 	mtu = dst_mtu(dst);
288 
289 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
290 	    ip_sk_accept_pmtu(sk) &&
291 	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
292 		tcp_sync_mss(sk, mtu);
293 
294 		/* Resend the TCP packet because it's
295 		 * clear that the old packet has been
296 		 * dropped. This is the new "fast" path mtu
297 		 * discovery.
298 		 */
299 		tcp_simple_retransmit(sk);
300 	} /* else let the usual retransmit timer handle it */
301 }
302 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
303 
304 static void do_redirect(struct sk_buff *skb, struct sock *sk)
305 {
306 	struct dst_entry *dst = __sk_dst_check(sk, 0);
307 
308 	if (dst)
309 		dst->ops->redirect(dst, sk, skb);
310 }
311 
312 
313 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
314 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
315 {
316 	struct request_sock *req = inet_reqsk(sk);
317 	struct net *net = sock_net(sk);
318 
319 	/* ICMPs are not backlogged, hence we cannot get
320 	 * an established socket here.
321 	 */
322 	if (seq != tcp_rsk(req)->snt_isn) {
323 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
324 	} else if (abort) {
325 		/*
326 		 * Still in SYN_RECV, just remove it silently.
327 		 * There is no good way to pass the error to the newly
328 		 * created socket, and POSIX does not want network
329 		 * errors returned from accept().
330 		 */
331 		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
332 		tcp_listendrop(req->rsk_listener);
333 	}
334 	reqsk_put(req);
335 }
336 EXPORT_SYMBOL(tcp_req_err);
337 
338 /*
339  * This routine is called by the ICMP module when it gets some
340  * sort of error condition.  If err < 0 then the socket should
341  * be closed and the error returned to the user.  If err > 0
342  * it's just the icmp type << 8 | icmp code.  After adjustment
343  * header points to the first 8 bytes of the tcp header.  We need
344  * to find the appropriate port.
345  *
346  * The locking strategy used here is very "optimistic". When
347  * someone else accesses the socket the ICMP is just dropped
348  * and for some paths there is no check at all.
349  * A more general error queue to queue errors for later handling
350  * is probably better.
351  *
352  */
353 
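/* Rough flow of tcp_v4_err(): look up the socket the quoted TCP header
 * belongs to, bail out for TIME-WAIT, hand request sockets to
 * tcp_req_err(), then lock the socket and dispatch on the ICMP type
 * (redirect, PMTU, unreachable, time exceeded) as coded below.
 */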
354 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
355 {
356 	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
357 	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
358 	struct inet_connection_sock *icsk;
359 	struct tcp_sock *tp;
360 	struct inet_sock *inet;
361 	const int type = icmp_hdr(icmp_skb)->type;
362 	const int code = icmp_hdr(icmp_skb)->code;
363 	struct sock *sk;
364 	struct sk_buff *skb;
365 	struct request_sock *fastopen;
366 	__u32 seq, snd_una;
367 	__u32 remaining;
368 	int err;
369 	struct net *net = dev_net(icmp_skb->dev);
370 
371 	sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
372 				       th->dest, iph->saddr, ntohs(th->source),
373 				       inet_iif(icmp_skb));
374 	if (!sk) {
375 		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
376 		return;
377 	}
378 	if (sk->sk_state == TCP_TIME_WAIT) {
379 		inet_twsk_put(inet_twsk(sk));
380 		return;
381 	}
382 	seq = ntohl(th->seq);
383 	if (sk->sk_state == TCP_NEW_SYN_RECV)
384 		return tcp_req_err(sk, seq,
385 				  type == ICMP_PARAMETERPROB ||
386 				  type == ICMP_TIME_EXCEEDED ||
387 				  (type == ICMP_DEST_UNREACH &&
388 				   (code == ICMP_NET_UNREACH ||
389 				    code == ICMP_HOST_UNREACH)));
390 
391 	bh_lock_sock(sk);
392 	/* If too many ICMPs get dropped on busy
393 	 * servers this needs to be solved differently.
394 	 * We do take care of the PMTU discovery (RFC 1191) special case:
395 	 * we can receive locally generated ICMP messages while socket is held.
396 	 */
397 	if (sock_owned_by_user(sk)) {
398 		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
399 			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
400 	}
401 	if (sk->sk_state == TCP_CLOSE)
402 		goto out;
403 
404 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
405 		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
406 		goto out;
407 	}
408 
409 	icsk = inet_csk(sk);
410 	tp = tcp_sk(sk);
411 	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
412 	fastopen = tp->fastopen_rsk;
413 	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
414 	if (sk->sk_state != TCP_LISTEN &&
415 	    !between(seq, snd_una, tp->snd_nxt)) {
416 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
417 		goto out;
418 	}
419 
420 	switch (type) {
421 	case ICMP_REDIRECT:
422 		do_redirect(icmp_skb, sk);
423 		goto out;
424 	case ICMP_SOURCE_QUENCH:
425 		/* Just silently ignore these. */
426 		goto out;
427 	case ICMP_PARAMETERPROB:
428 		err = EPROTO;
429 		break;
430 	case ICMP_DEST_UNREACH:
431 		if (code > NR_ICMP_UNREACH)
432 			goto out;
433 
434 		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
435 			/* We are not interested in TCP_LISTEN and open_requests
436 			 * (SYN-ACKs sent out by Linux are always < 576 bytes, so
437 			 * they should go through unfragmented).
438 			 */
439 			if (sk->sk_state == TCP_LISTEN)
440 				goto out;
441 
442 			tp->mtu_info = info;
443 			if (!sock_owned_by_user(sk)) {
444 				tcp_v4_mtu_reduced(sk);
445 			} else {
446 				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
447 					sock_hold(sk);
448 			}
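			/* When the socket is owned by the user, the MTU update is
			 * deferred via TCP_MTU_REDUCED_DEFERRED and performed later
			 * from tcp_release_cb(), which is why tcp_v4_mtu_reduced()
			 * notes that it may be called from there.
			 */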
449 			goto out;
450 		}
451 
452 		err = icmp_err_convert[code].errno;
453 		/* check if icmp_skb allows revert of backoff
454 		 * (see draft-zimmermann-tcp-lcd) */
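		/* Illustrative example (numbers are hypothetical): with
		 * icsk_backoff at 3 and a base RTO of 200 ms, the armed timer
		 * was 1600 ms; after the decrement below it is re-armed for
		 * 800 ms minus the time already elapsed since the head of the
		 * write queue was (re)transmitted.
		 */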
455 		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
456 			break;
457 		if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
458 		    !icsk->icsk_backoff || fastopen)
459 			break;
460 
461 		if (sock_owned_by_user(sk))
462 			break;
463 
464 		icsk->icsk_backoff--;
465 		icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
466 					       TCP_TIMEOUT_INIT;
467 		icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
468 
469 		skb = tcp_write_queue_head(sk);
470 		BUG_ON(!skb);
471 
472 		remaining = icsk->icsk_rto -
473 			    min(icsk->icsk_rto,
474 				tcp_time_stamp - tcp_skb_timestamp(skb));
475 
476 		if (remaining) {
477 			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
478 						  remaining, TCP_RTO_MAX);
479 		} else {
480 			/* RTO revert clocked out retransmission.
481 			 * Will retransmit now */
482 			tcp_retransmit_timer(sk);
483 		}
484 
485 		break;
486 	case ICMP_TIME_EXCEEDED:
487 		err = EHOSTUNREACH;
488 		break;
489 	default:
490 		goto out;
491 	}
492 
493 	switch (sk->sk_state) {
494 	case TCP_SYN_SENT:
495 	case TCP_SYN_RECV:
496 		/* Only in fast or simultaneous open. If a fast open socket
497 		 * is already accepted it is treated as a connected one below.
498 		 */
499 		if (fastopen && !fastopen->sk)
500 			break;
501 
502 		if (!sock_owned_by_user(sk)) {
503 			sk->sk_err = err;
504 
505 			sk->sk_error_report(sk);
506 
507 			tcp_done(sk);
508 		} else {
509 			sk->sk_err_soft = err;
510 		}
511 		goto out;
512 	}
513 
514 	/* If we've already connected we will keep trying
515 	 * until we time out, or the user gives up.
516 	 *
517 	 * RFC 1122 4.2.3.9 allows us to treat as hard errors
518 	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
519 	 * but it is obsoleted by PMTU discovery).
520 	 *
521 	 * Note that in the modern internet, where routing is unreliable
522 	 * and broken firewalls sit in every dark corner sending random
523 	 * errors ordered by their masters, even these two messages have
524 	 * finally lost their original sense (even Linux sends invalid PORT_UNREACHs).
525 	 *
526 	 * Now we are in compliance with RFCs.
527 	 *							--ANK (980905)
528 	 */
529 
530 	inet = inet_sk(sk);
531 	if (!sock_owned_by_user(sk) && inet->recverr) {
532 		sk->sk_err = err;
533 		sk->sk_error_report(sk);
534 	} else	{ /* Only an error on timeout */
535 		sk->sk_err_soft = err;
536 	}
537 
538 out:
539 	bh_unlock_sock(sk);
540 	sock_put(sk);
541 }
542 
543 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
544 {
545 	struct tcphdr *th = tcp_hdr(skb);
546 
547 	if (skb->ip_summed == CHECKSUM_PARTIAL) {
548 		th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
549 		skb->csum_start = skb_transport_header(skb) - skb->head;
550 		skb->csum_offset = offsetof(struct tcphdr, check);
551 	} else {
552 		th->check = tcp_v4_check(skb->len, saddr, daddr,
553 					 csum_partial(th,
554 						      th->doff << 2,
555 						      skb->csum));
556 	}
557 }
558 
559 /* This routine computes an IPv4 TCP checksum. */
560 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
561 {
562 	const struct inet_sock *inet = inet_sk(sk);
563 
564 	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
565 }
566 EXPORT_SYMBOL(tcp_v4_send_check);
567 
568 /*
569  *	This routine will send an RST to the other tcp.
570  *
571  *	Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
572  *		      for the reset?
573  *	Answer: if a packet caused a RST, it is not for a socket
574  *		existing in our system; if it is matched to a socket,
575  *		it is just a duplicate segment or a bug in the other side's TCP.
576  *		So we build the reply based only on parameters that
577  *		arrived with the segment.
578  *	Exception: precedence violation. We do not implement it in any case.
579  */
580 
581 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
582 {
583 	const struct tcphdr *th = tcp_hdr(skb);
584 	struct {
585 		struct tcphdr th;
586 #ifdef CONFIG_TCP_MD5SIG
587 		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
588 #endif
589 	} rep;
590 	struct ip_reply_arg arg;
591 #ifdef CONFIG_TCP_MD5SIG
592 	struct tcp_md5sig_key *key = NULL;
593 	const __u8 *hash_location = NULL;
594 	unsigned char newhash[16];
595 	int genhash;
596 	struct sock *sk1 = NULL;
597 #endif
598 	struct net *net;
599 
600 	/* Never send a reset in response to a reset. */
601 	if (th->rst)
602 		return;
603 
604 	/* If sk is not NULL, it means we did a successful lookup and the
605 	 * incoming route had to be correct. prequeue might have dropped our dst.
606 	 */
607 	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
608 		return;
609 
610 	/* Swap the send and the receive. */
611 	memset(&rep, 0, sizeof(rep));
612 	rep.th.dest   = th->source;
613 	rep.th.source = th->dest;
614 	rep.th.doff   = sizeof(struct tcphdr) / 4;
615 	rep.th.rst    = 1;
616 
617 	if (th->ack) {
618 		rep.th.seq = th->ack_seq;
619 	} else {
620 		rep.th.ack = 1;
621 		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
622 				       skb->len - (th->doff << 2));
623 	}
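	/* Per RFC 793: if the offending segment carried an ACK, the reset
	 * takes its sequence number from that ACK field; otherwise we send
	 * seq 0 with an ACK that covers everything the segment occupied
	 * (payload length plus one for SYN and one for FIN).
	 */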
624 
625 	memset(&arg, 0, sizeof(arg));
626 	arg.iov[0].iov_base = (unsigned char *)&rep;
627 	arg.iov[0].iov_len  = sizeof(rep.th);
628 
629 	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
630 #ifdef CONFIG_TCP_MD5SIG
631 	rcu_read_lock();
632 	hash_location = tcp_parse_md5sig_option(th);
633 	if (sk && sk_fullsock(sk)) {
634 		key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
635 					&ip_hdr(skb)->saddr, AF_INET);
636 	} else if (hash_location) {
637 		/*
638 		 * The active side is gone. Try to find the listening socket via
639 		 * the source port, and then find the md5 key via that socket.
640 		 * We do not lose security here:
641 		 * the incoming packet is checked against the md5 hash of the key
642 		 * we find, and no RST is generated if the hash doesn't match.
643 		 */
644 		sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
645 					     ip_hdr(skb)->saddr,
646 					     th->source, ip_hdr(skb)->daddr,
647 					     ntohs(th->source), inet_iif(skb));
648 		/* don't send a RST if we can't find a key */
649 		if (!sk1)
650 			goto out;
651 
652 		key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
653 					&ip_hdr(skb)->saddr, AF_INET);
654 		if (!key)
655 			goto out;
656 
657 
658 		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
659 		if (genhash || memcmp(hash_location, newhash, 16) != 0)
660 			goto out;
661 
662 	}
663 
664 	if (key) {
665 		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
666 				   (TCPOPT_NOP << 16) |
667 				   (TCPOPT_MD5SIG << 8) |
668 				   TCPOLEN_MD5SIG);
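		/* Option layout: NOP, NOP, kind TCPOPT_MD5SIG (19), length
		 * TCPOLEN_MD5SIG (18), followed by the 16-byte digest filled
		 * in below; the aligned size (20 bytes) keeps the TCP header
		 * a multiple of 32 bits.
		 */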
669 		/* Update length and the length the header thinks exists */
670 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
671 		rep.th.doff = arg.iov[0].iov_len / 4;
672 
673 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
674 				     key, ip_hdr(skb)->saddr,
675 				     ip_hdr(skb)->daddr, &rep.th);
676 	}
677 #endif
678 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
679 				      ip_hdr(skb)->saddr, /* XXX */
680 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
681 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
682 	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
683 
684 	/* When the socket is gone, all binding information is lost and
685 	 * routing might fail. No choice here: if we choose to force the
686 	 * input interface, we will misroute in the case of an asymmetric route.
687 	 */
688 	if (sk)
689 		arg.bound_dev_if = sk->sk_bound_dev_if;
690 
691 	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
692 		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));
693 
694 	arg.tos = ip_hdr(skb)->tos;
695 	local_bh_disable();
696 	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
697 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
698 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
699 			      &arg, arg.iov[0].iov_len);
700 
701 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
702 	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
703 	local_bh_enable();
704 
705 #ifdef CONFIG_TCP_MD5SIG
706 out:
707 	rcu_read_unlock();
708 #endif
709 }
710 
711 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
712    outside socket context, is certainly ugly. What can I do?
713  */
714 
715 static void tcp_v4_send_ack(struct net *net,
716 			    struct sk_buff *skb, u32 seq, u32 ack,
717 			    u32 win, u32 tsval, u32 tsecr, int oif,
718 			    struct tcp_md5sig_key *key,
719 			    int reply_flags, u8 tos)
720 {
721 	const struct tcphdr *th = tcp_hdr(skb);
722 	struct {
723 		struct tcphdr th;
724 		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
725 #ifdef CONFIG_TCP_MD5SIG
726 			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
727 #endif
728 			];
729 	} rep;
730 	struct ip_reply_arg arg;
731 
732 	memset(&rep.th, 0, sizeof(struct tcphdr));
733 	memset(&arg, 0, sizeof(arg));
734 
735 	arg.iov[0].iov_base = (unsigned char *)&rep;
736 	arg.iov[0].iov_len  = sizeof(rep.th);
737 	if (tsecr) {
738 		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
739 				   (TCPOPT_TIMESTAMP << 8) |
740 				   TCPOLEN_TIMESTAMP);
741 		rep.opt[1] = htonl(tsval);
742 		rep.opt[2] = htonl(tsecr);
743 		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
744 	}
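	/* The timestamp block above is NOP, NOP, kind TCPOPT_TIMESTAMP (8),
	 * length TCPOLEN_TIMESTAMP (10), then TSval and TSecr, padded to
	 * 12 bytes (TCPOLEN_TSTAMP_ALIGNED).
	 */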
745 
746 	/* Swap the send and the receive. */
747 	rep.th.dest    = th->source;
748 	rep.th.source  = th->dest;
749 	rep.th.doff    = arg.iov[0].iov_len / 4;
750 	rep.th.seq     = htonl(seq);
751 	rep.th.ack_seq = htonl(ack);
752 	rep.th.ack     = 1;
753 	rep.th.window  = htons(win);
754 
755 #ifdef CONFIG_TCP_MD5SIG
756 	if (key) {
757 		int offset = (tsecr) ? 3 : 0;
758 
759 		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
760 					  (TCPOPT_NOP << 16) |
761 					  (TCPOPT_MD5SIG << 8) |
762 					  TCPOLEN_MD5SIG);
763 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
764 		rep.th.doff = arg.iov[0].iov_len/4;
765 
766 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
767 				    key, ip_hdr(skb)->saddr,
768 				    ip_hdr(skb)->daddr, &rep.th);
769 	}
770 #endif
771 	arg.flags = reply_flags;
772 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
773 				      ip_hdr(skb)->saddr, /* XXX */
774 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
775 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
776 	if (oif)
777 		arg.bound_dev_if = oif;
778 	arg.tos = tos;
779 	local_bh_disable();
780 	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
781 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
782 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
783 			      &arg, arg.iov[0].iov_len);
784 
785 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
786 	local_bh_enable();
787 }
788 
789 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
790 {
791 	struct inet_timewait_sock *tw = inet_twsk(sk);
792 	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
793 
794 	tcp_v4_send_ack(sock_net(sk), skb,
795 			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
796 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
797 			tcp_time_stamp + tcptw->tw_ts_offset,
798 			tcptw->tw_ts_recent,
799 			tw->tw_bound_dev_if,
800 			tcp_twsk_md5_key(tcptw),
801 			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
802 			tw->tw_tos
803 			);
804 
805 	inet_twsk_put(tw);
806 }
807 
808 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
809 				  struct request_sock *req)
810 {
811 	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
812 	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
813 	 */
814 	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
815 					     tcp_sk(sk)->snd_nxt;
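	/* For a request still owned by a listener our SYN-ACK consumed one
	 * sequence number, hence snt_isn + 1; for Fast Open the child socket
	 * may already have sent data, so use its snd_nxt instead.
	 */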
816 
817 	tcp_v4_send_ack(sock_net(sk), skb, seq,
818 			tcp_rsk(req)->rcv_nxt, req->rsk_rcv_wnd,
819 			tcp_time_stamp,
820 			req->ts_recent,
821 			0,
822 			tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
823 					  AF_INET),
824 			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
825 			ip_hdr(skb)->tos);
826 }
827 
828 /*
829  *	Send a SYN-ACK after having received a SYN.
830  *	This still operates on a request_sock only, not on a big
831  *	socket.
832  */
833 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
834 			      struct flowi *fl,
835 			      struct request_sock *req,
836 			      struct tcp_fastopen_cookie *foc,
837 			      enum tcp_synack_type synack_type)
838 {
839 	const struct inet_request_sock *ireq = inet_rsk(req);
840 	struct flowi4 fl4;
841 	int err = -1;
842 	struct sk_buff *skb;
843 
844 	/* First, grab a route. */
845 	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
846 		return -1;
847 
848 	skb = tcp_make_synack(sk, dst, req, foc, synack_type);
849 
850 	if (skb) {
851 		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
852 
853 		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
854 					    ireq->ir_rmt_addr,
855 					    ireq->opt);
856 		err = net_xmit_eval(err);
857 	}
858 
859 	return err;
860 }
861 
862 /*
863  *	IPv4 request_sock destructor.
864  */
865 static void tcp_v4_reqsk_destructor(struct request_sock *req)
866 {
867 	kfree(inet_rsk(req)->opt);
868 }
869 
870 #ifdef CONFIG_TCP_MD5SIG
871 /*
872  * RFC2385 MD5 checksumming requires a mapping of
873  * IP address->MD5 Key.
874  * We need to maintain these in the sk structure.
875  */
876 
877 /* Find the Key structure for an address.  */
878 struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
879 					 const union tcp_md5_addr *addr,
880 					 int family)
881 {
882 	const struct tcp_sock *tp = tcp_sk(sk);
883 	struct tcp_md5sig_key *key;
884 	unsigned int size = sizeof(struct in_addr);
885 	const struct tcp_md5sig_info *md5sig;
886 
887 	/* caller either holds rcu_read_lock() or socket lock */
888 	md5sig = rcu_dereference_check(tp->md5sig_info,
889 				       lockdep_sock_is_held(sk));
890 	if (!md5sig)
891 		return NULL;
892 #if IS_ENABLED(CONFIG_IPV6)
893 	if (family == AF_INET6)
894 		size = sizeof(struct in6_addr);
895 #endif
896 	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
897 		if (key->family != family)
898 			continue;
899 		if (!memcmp(&key->addr, addr, size))
900 			return key;
901 	}
902 	return NULL;
903 }
904 EXPORT_SYMBOL(tcp_md5_do_lookup);
905 
906 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
907 					 const struct sock *addr_sk)
908 {
909 	const union tcp_md5_addr *addr;
910 
911 	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
912 	return tcp_md5_do_lookup(sk, addr, AF_INET);
913 }
914 EXPORT_SYMBOL(tcp_v4_md5_lookup);
915 
916 /* This can be called on a newly created socket, from other files */
917 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
918 		   int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
919 {
920 	/* Add Key to the list */
921 	struct tcp_md5sig_key *key;
922 	struct tcp_sock *tp = tcp_sk(sk);
923 	struct tcp_md5sig_info *md5sig;
924 
925 	key = tcp_md5_do_lookup(sk, addr, family);
926 	if (key) {
927 		/* Pre-existing entry - just update that one. */
928 		memcpy(key->key, newkey, newkeylen);
929 		key->keylen = newkeylen;
930 		return 0;
931 	}
932 
933 	md5sig = rcu_dereference_protected(tp->md5sig_info,
934 					   lockdep_sock_is_held(sk));
935 	if (!md5sig) {
936 		md5sig = kmalloc(sizeof(*md5sig), gfp);
937 		if (!md5sig)
938 			return -ENOMEM;
939 
940 		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
941 		INIT_HLIST_HEAD(&md5sig->head);
942 		rcu_assign_pointer(tp->md5sig_info, md5sig);
943 	}
944 
945 	key = sock_kmalloc(sk, sizeof(*key), gfp);
946 	if (!key)
947 		return -ENOMEM;
948 	if (!tcp_alloc_md5sig_pool()) {
949 		sock_kfree_s(sk, key, sizeof(*key));
950 		return -ENOMEM;
951 	}
952 
953 	memcpy(key->key, newkey, newkeylen);
954 	key->keylen = newkeylen;
955 	key->family = family;
956 	memcpy(&key->addr, addr,
957 	       (family == AF_INET6) ? sizeof(struct in6_addr) :
958 				      sizeof(struct in_addr));
959 	hlist_add_head_rcu(&key->node, &md5sig->head);
960 	return 0;
961 }
962 EXPORT_SYMBOL(tcp_md5_do_add);
963 
964 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
965 {
966 	struct tcp_md5sig_key *key;
967 
968 	key = tcp_md5_do_lookup(sk, addr, family);
969 	if (!key)
970 		return -ENOENT;
971 	hlist_del_rcu(&key->node);
972 	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
973 	kfree_rcu(key, rcu);
974 	return 0;
975 }
976 EXPORT_SYMBOL(tcp_md5_do_del);
977 
978 static void tcp_clear_md5_list(struct sock *sk)
979 {
980 	struct tcp_sock *tp = tcp_sk(sk);
981 	struct tcp_md5sig_key *key;
982 	struct hlist_node *n;
983 	struct tcp_md5sig_info *md5sig;
984 
985 	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
986 
987 	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
988 		hlist_del_rcu(&key->node);
989 		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
990 		kfree_rcu(key, rcu);
991 	}
992 }
993 
994 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
995 				 int optlen)
996 {
997 	struct tcp_md5sig cmd;
998 	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
999 
1000 	if (optlen < sizeof(cmd))
1001 		return -EINVAL;
1002 
1003 	if (copy_from_user(&cmd, optval, sizeof(cmd)))
1004 		return -EFAULT;
1005 
1006 	if (sin->sin_family != AF_INET)
1007 		return -EINVAL;
1008 
1009 	if (!cmd.tcpm_keylen)
1010 		return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1011 				      AF_INET);
1012 
1013 	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1014 		return -EINVAL;
1015 
1016 	return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1017 			      AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
1018 			      GFP_KERNEL);
1019 }
1020 
1021 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1022 				   __be32 daddr, __be32 saddr,
1023 				   const struct tcphdr *th, int nbytes)
1024 {
1025 	struct tcp4_pseudohdr *bp;
1026 	struct scatterlist sg;
1027 	struct tcphdr *_th;
1028 
1029 	bp = hp->scratch;
1030 	bp->saddr = saddr;
1031 	bp->daddr = daddr;
1032 	bp->pad = 0;
1033 	bp->protocol = IPPROTO_TCP;
1034 	bp->len = cpu_to_be16(nbytes);
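	/* bp now holds the standard IPv4 pseudo-header used for the RFC 2385
	 * digest: source address, destination address, a zero pad byte,
	 * IPPROTO_TCP and the TCP segment length, in network byte order.
	 */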
1035 
1036 	_th = (struct tcphdr *)(bp + 1);
1037 	memcpy(_th, th, sizeof(*th));
1038 	_th->check = 0;
1039 
1040 	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1041 	ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1042 				sizeof(*bp) + sizeof(*th));
1043 	return crypto_ahash_update(hp->md5_req);
1044 }
1045 
1046 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1047 			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
1048 {
1049 	struct tcp_md5sig_pool *hp;
1050 	struct ahash_request *req;
1051 
1052 	hp = tcp_get_md5sig_pool();
1053 	if (!hp)
1054 		goto clear_hash_noput;
1055 	req = hp->md5_req;
1056 
1057 	if (crypto_ahash_init(req))
1058 		goto clear_hash;
1059 	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1060 		goto clear_hash;
1061 	if (tcp_md5_hash_key(hp, key))
1062 		goto clear_hash;
1063 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1064 	if (crypto_ahash_final(req))
1065 		goto clear_hash;
1066 
1067 	tcp_put_md5sig_pool();
1068 	return 0;
1069 
1070 clear_hash:
1071 	tcp_put_md5sig_pool();
1072 clear_hash_noput:
1073 	memset(md5_hash, 0, 16);
1074 	return 1;
1075 }
1076 
1077 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1078 			const struct sock *sk,
1079 			const struct sk_buff *skb)
1080 {
1081 	struct tcp_md5sig_pool *hp;
1082 	struct ahash_request *req;
1083 	const struct tcphdr *th = tcp_hdr(skb);
1084 	__be32 saddr, daddr;
1085 
1086 	if (sk) { /* valid for establish/request sockets */
1087 		saddr = sk->sk_rcv_saddr;
1088 		daddr = sk->sk_daddr;
1089 	} else {
1090 		const struct iphdr *iph = ip_hdr(skb);
1091 		saddr = iph->saddr;
1092 		daddr = iph->daddr;
1093 	}
1094 
1095 	hp = tcp_get_md5sig_pool();
1096 	if (!hp)
1097 		goto clear_hash_noput;
1098 	req = hp->md5_req;
1099 
1100 	if (crypto_ahash_init(req))
1101 		goto clear_hash;
1102 
1103 	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1104 		goto clear_hash;
1105 	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1106 		goto clear_hash;
1107 	if (tcp_md5_hash_key(hp, key))
1108 		goto clear_hash;
1109 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1110 	if (crypto_ahash_final(req))
1111 		goto clear_hash;
1112 
1113 	tcp_put_md5sig_pool();
1114 	return 0;
1115 
1116 clear_hash:
1117 	tcp_put_md5sig_pool();
1118 clear_hash_noput:
1119 	memset(md5_hash, 0, 16);
1120 	return 1;
1121 }
1122 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1123 
1124 #endif
1125 
1126 /* Called with rcu_read_lock() */
1127 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1128 				    const struct sk_buff *skb)
1129 {
1130 #ifdef CONFIG_TCP_MD5SIG
1131 	/*
1132 	 * This gets called for each TCP segment that arrives
1133 	 * so we want to be efficient.
1134 	 * We have 3 drop cases:
1135 	 * o No MD5 hash and one expected.
1136 	 * o MD5 hash and we're not expecting one.
1137 	 * o MD5 hash and it's wrong.
1138 	 */
1139 	const __u8 *hash_location = NULL;
1140 	struct tcp_md5sig_key *hash_expected;
1141 	const struct iphdr *iph = ip_hdr(skb);
1142 	const struct tcphdr *th = tcp_hdr(skb);
1143 	int genhash;
1144 	unsigned char newhash[16];
1145 
1146 	hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1147 					  AF_INET);
1148 	hash_location = tcp_parse_md5sig_option(th);
1149 
1150 	/* We've parsed the options - do we have a hash? */
1151 	if (!hash_expected && !hash_location)
1152 		return false;
1153 
1154 	if (hash_expected && !hash_location) {
1155 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1156 		return true;
1157 	}
1158 
1159 	if (!hash_expected && hash_location) {
1160 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1161 		return true;
1162 	}
1163 
1164 	/* Okay, so this is hash_expected and hash_location -
1165 	 * so we need to calculate the checksum.
1166 	 */
1167 	genhash = tcp_v4_md5_hash_skb(newhash,
1168 				      hash_expected,
1169 				      NULL, skb);
1170 
1171 	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1172 		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1173 				     &iph->saddr, ntohs(th->source),
1174 				     &iph->daddr, ntohs(th->dest),
1175 				     genhash ? " tcp_v4_calc_md5_hash failed"
1176 				     : "");
1177 		return true;
1178 	}
1179 	return false;
1180 #endif
1181 	return false;
1182 }
1183 
1184 static void tcp_v4_init_req(struct request_sock *req,
1185 			    const struct sock *sk_listener,
1186 			    struct sk_buff *skb)
1187 {
1188 	struct inet_request_sock *ireq = inet_rsk(req);
1189 
1190 	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1191 	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1192 	ireq->no_srccheck = inet_sk(sk_listener)->transparent;
1193 	ireq->opt = tcp_v4_save_options(skb);
1194 }
1195 
1196 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1197 					  struct flowi *fl,
1198 					  const struct request_sock *req,
1199 					  bool *strict)
1200 {
1201 	struct dst_entry *dst = inet_csk_route_req(sk, &fl->u.ip4, req);
1202 
1203 	if (strict) {
1204 		if (fl->u.ip4.daddr == inet_rsk(req)->ir_rmt_addr)
1205 			*strict = true;
1206 		else
1207 			*strict = false;
1208 	}
1209 
1210 	return dst;
1211 }
1212 
1213 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1214 	.family		=	PF_INET,
1215 	.obj_size	=	sizeof(struct tcp_request_sock),
1216 	.rtx_syn_ack	=	tcp_rtx_synack,
1217 	.send_ack	=	tcp_v4_reqsk_send_ack,
1218 	.destructor	=	tcp_v4_reqsk_destructor,
1219 	.send_reset	=	tcp_v4_send_reset,
1220 	.syn_ack_timeout =	tcp_syn_ack_timeout,
1221 };
1222 
1223 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1224 	.mss_clamp	=	TCP_MSS_DEFAULT,
1225 #ifdef CONFIG_TCP_MD5SIG
1226 	.req_md5_lookup	=	tcp_v4_md5_lookup,
1227 	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1228 #endif
1229 	.init_req	=	tcp_v4_init_req,
1230 #ifdef CONFIG_SYN_COOKIES
1231 	.cookie_init_seq =	cookie_v4_init_sequence,
1232 #endif
1233 	.route_req	=	tcp_v4_route_req,
1234 	.init_seq	=	tcp_v4_init_sequence,
1235 	.send_synack	=	tcp_v4_send_synack,
1236 };
1237 
1238 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1239 {
1240 	/* Never answer SYNs sent to broadcast or multicast */
1241 	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1242 		goto drop;
1243 
1244 	return tcp_conn_request(&tcp_request_sock_ops,
1245 				&tcp_request_sock_ipv4_ops, sk, skb);
1246 
1247 drop:
1248 	tcp_listendrop(sk);
1249 	return 0;
1250 }
1251 EXPORT_SYMBOL(tcp_v4_conn_request);
1252 
1253 
1254 /*
1255  * The three way handshake has completed - we got a valid synack -
1256  * now create the new socket.
1257  */
1258 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1259 				  struct request_sock *req,
1260 				  struct dst_entry *dst,
1261 				  struct request_sock *req_unhash,
1262 				  bool *own_req)
1263 {
1264 	struct inet_request_sock *ireq;
1265 	struct inet_sock *newinet;
1266 	struct tcp_sock *newtp;
1267 	struct sock *newsk;
1268 #ifdef CONFIG_TCP_MD5SIG
1269 	struct tcp_md5sig_key *key;
1270 #endif
1271 	struct ip_options_rcu *inet_opt;
1272 
1273 	if (sk_acceptq_is_full(sk))
1274 		goto exit_overflow;
1275 
1276 	newsk = tcp_create_openreq_child(sk, req, skb);
1277 	if (!newsk)
1278 		goto exit_nonewsk;
1279 
1280 	newsk->sk_gso_type = SKB_GSO_TCPV4;
1281 	inet_sk_rx_dst_set(newsk, skb);
1282 
1283 	newtp		      = tcp_sk(newsk);
1284 	newinet		      = inet_sk(newsk);
1285 	ireq		      = inet_rsk(req);
1286 	sk_daddr_set(newsk, ireq->ir_rmt_addr);
1287 	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1288 	newsk->sk_bound_dev_if = ireq->ir_iif;
1289 	newinet->inet_saddr	      = ireq->ir_loc_addr;
1290 	inet_opt	      = ireq->opt;
1291 	rcu_assign_pointer(newinet->inet_opt, inet_opt);
1292 	ireq->opt	      = NULL;
1293 	newinet->mc_index     = inet_iif(skb);
1294 	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1295 	newinet->rcv_tos      = ip_hdr(skb)->tos;
1296 	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1297 	if (inet_opt)
1298 		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1299 	newinet->inet_id = newtp->write_seq ^ jiffies;
1300 
1301 	if (!dst) {
1302 		dst = inet_csk_route_child_sock(sk, newsk, req);
1303 		if (!dst)
1304 			goto put_and_exit;
1305 	} else {
1306 		/* syncookie case : see end of cookie_v4_check() */
1307 	}
1308 	sk_setup_caps(newsk, dst);
1309 
1310 	tcp_ca_openreq_child(newsk, dst);
1311 
1312 	tcp_sync_mss(newsk, dst_mtu(dst));
1313 	newtp->advmss = dst_metric_advmss(dst);
1314 	if (tcp_sk(sk)->rx_opt.user_mss &&
1315 	    tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1316 		newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1317 
1318 	tcp_initialize_rcv_mss(newsk);
1319 
1320 #ifdef CONFIG_TCP_MD5SIG
1321 	/* Copy over the MD5 key from the original socket */
1322 	key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1323 				AF_INET);
1324 	if (key) {
1325 		/*
1326 		 * We're using one, so create a matching key
1327 		 * on the newsk structure. If we fail to get
1328 		 * memory, then we end up not copying the key
1329 		 * across. Shucks.
1330 		 */
1331 		tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1332 			       AF_INET, key->key, key->keylen, GFP_ATOMIC);
1333 		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1334 	}
1335 #endif
1336 
1337 	if (__inet_inherit_port(sk, newsk) < 0)
1338 		goto put_and_exit;
1339 	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
1340 	if (*own_req)
1341 		tcp_move_syn(newtp, req);
1342 
1343 	return newsk;
1344 
1345 exit_overflow:
1346 	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1347 exit_nonewsk:
1348 	dst_release(dst);
1349 exit:
1350 	tcp_listendrop(sk);
1351 	return NULL;
1352 put_and_exit:
1353 	inet_csk_prepare_forced_close(newsk);
1354 	tcp_done(newsk);
1355 	goto exit;
1356 }
1357 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1358 
1359 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1360 {
1361 #ifdef CONFIG_SYN_COOKIES
1362 	const struct tcphdr *th = tcp_hdr(skb);
1363 
1364 	if (!th->syn)
1365 		sk = cookie_v4_check(sk, skb);
1366 #endif
1367 	return sk;
1368 }
1369 
1370 /* The socket must have its spinlock held when we get
1371  * here, unless it is a TCP_LISTEN socket.
1372  *
1373  * We have a potential double-lock case here, so even when
1374  * doing backlog processing we use the BH locking scheme.
1375  * This is because we cannot sleep with the original spinlock
1376  * held.
1377  */
1378 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1379 {
1380 	struct sock *rsk;
1381 
1382 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1383 		struct dst_entry *dst = sk->sk_rx_dst;
1384 
1385 		sock_rps_save_rxhash(sk, skb);
1386 		sk_mark_napi_id(sk, skb);
1387 		if (dst) {
1388 			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1389 			    !dst->ops->check(dst, 0)) {
1390 				dst_release(dst);
1391 				sk->sk_rx_dst = NULL;
1392 			}
1393 		}
1394 		tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len);
1395 		return 0;
1396 	}
1397 
1398 	if (tcp_checksum_complete(skb))
1399 		goto csum_err;
1400 
1401 	if (sk->sk_state == TCP_LISTEN) {
1402 		struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1403 
1404 		if (!nsk)
1405 			goto discard;
1406 		if (nsk != sk) {
1407 			sock_rps_save_rxhash(nsk, skb);
1408 			sk_mark_napi_id(nsk, skb);
1409 			if (tcp_child_process(sk, nsk, skb)) {
1410 				rsk = nsk;
1411 				goto reset;
1412 			}
1413 			return 0;
1414 		}
1415 	} else
1416 		sock_rps_save_rxhash(sk, skb);
1417 
1418 	if (tcp_rcv_state_process(sk, skb)) {
1419 		rsk = sk;
1420 		goto reset;
1421 	}
1422 	return 0;
1423 
1424 reset:
1425 	tcp_v4_send_reset(rsk, skb);
1426 discard:
1427 	kfree_skb(skb);
1428 	/* Be careful here. If this function gets more complicated and
1429 	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1430 	 * might be destroyed here. This current version compiles correctly,
1431 	 * but you have been warned.
1432 	 */
1433 	return 0;
1434 
1435 csum_err:
1436 	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1437 	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1438 	goto discard;
1439 }
1440 EXPORT_SYMBOL(tcp_v4_do_rcv);
1441 
1442 void tcp_v4_early_demux(struct sk_buff *skb)
1443 {
1444 	const struct iphdr *iph;
1445 	const struct tcphdr *th;
1446 	struct sock *sk;
1447 
1448 	if (skb->pkt_type != PACKET_HOST)
1449 		return;
1450 
1451 	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1452 		return;
1453 
1454 	iph = ip_hdr(skb);
1455 	th = tcp_hdr(skb);
1456 
1457 	if (th->doff < sizeof(struct tcphdr) / 4)
1458 		return;
1459 
1460 	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1461 				       iph->saddr, th->source,
1462 				       iph->daddr, ntohs(th->dest),
1463 				       skb->skb_iif);
1464 	if (sk) {
1465 		skb->sk = sk;
1466 		skb->destructor = sock_edemux;
1467 		if (sk_fullsock(sk)) {
1468 			struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1469 
1470 			if (dst)
1471 				dst = dst_check(dst, 0);
1472 			if (dst &&
1473 			    inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1474 				skb_dst_set_noref(skb, dst);
1475 		}
1476 	}
1477 }
1478 
1479 /* Packet is added to VJ-style prequeue for processing in process
1480  * context, if a reader task is waiting. Apparently, this exciting
1481  * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93)
1482  * failed somewhere. Latency? Burstiness? Well, at least now we will
1483  * see why it failed. 8)8)				  --ANK
1484  *
1485  */
1486 bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
1487 {
1488 	struct tcp_sock *tp = tcp_sk(sk);
1489 
1490 	if (sysctl_tcp_low_latency || !tp->ucopy.task)
1491 		return false;
1492 
1493 	if (skb->len <= tcp_hdrlen(skb) &&
1494 	    skb_queue_len(&tp->ucopy.prequeue) == 0)
1495 		return false;
1496 
1497 	/* Before escaping RCU protected region, we need to take care of skb
1498 	 * dst. Prequeue is only enabled for established sockets.
1499 	 * For such sockets, we might need the skb dst only to set sk->sk_rx_dst.
1500 	 * Instead of doing a full sk_rx_dst validity check here, let's perform
1501 	 * an optimistic check.
1502 	 */
1503 	if (likely(sk->sk_rx_dst))
1504 		skb_dst_drop(skb);
1505 	else
1506 		skb_dst_force_safe(skb);
1507 
1508 	__skb_queue_tail(&tp->ucopy.prequeue, skb);
1509 	tp->ucopy.memory += skb->truesize;
1510 	if (skb_queue_len(&tp->ucopy.prequeue) >= 32 ||
1511 	    tp->ucopy.memory + atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) {
1512 		struct sk_buff *skb1;
1513 
1514 		BUG_ON(sock_owned_by_user(sk));
1515 		__NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPPREQUEUEDROPPED,
1516 				skb_queue_len(&tp->ucopy.prequeue));
1517 
1518 		while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
1519 			sk_backlog_rcv(sk, skb1);
1520 
1521 		tp->ucopy.memory = 0;
1522 	} else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
1523 		wake_up_interruptible_sync_poll(sk_sleep(sk),
1524 					   POLLIN | POLLRDNORM | POLLRDBAND);
1525 		if (!inet_csk_ack_scheduled(sk))
1526 			inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
1527 						  (3 * tcp_rto_min(sk)) / 4,
1528 						  TCP_RTO_MAX);
1529 	}
1530 	return true;
1531 }
1532 EXPORT_SYMBOL(tcp_prequeue);
1533 
1534 /*
1535  *	From tcp_input.c
1536  */
1537 
1538 int tcp_v4_rcv(struct sk_buff *skb)
1539 {
1540 	struct net *net = dev_net(skb->dev);
1541 	const struct iphdr *iph;
1542 	const struct tcphdr *th;
1543 	bool refcounted;
1544 	struct sock *sk;
1545 	int ret;
1546 
1547 	if (skb->pkt_type != PACKET_HOST)
1548 		goto discard_it;
1549 
1550 	/* Count it even if it's bad */
1551 	__TCP_INC_STATS(net, TCP_MIB_INSEGS);
1552 
1553 	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1554 		goto discard_it;
1555 
1556 	th = (const struct tcphdr *)skb->data;
1557 
1558 	if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1559 		goto bad_packet;
1560 	if (!pskb_may_pull(skb, th->doff * 4))
1561 		goto discard_it;
1562 
1563 	/* An explanation is required here, I think.
1564 	 * Packet length and doff are validated by header prediction,
1565 	 * provided the case of th->doff == 0 is eliminated.
1566 	 * So, we defer the checks. */
1567 
1568 	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1569 		goto csum_error;
1570 
1571 	th = (const struct tcphdr *)skb->data;
1572 	iph = ip_hdr(skb);
1573 	/* This is tricky: we move the IPCB to its correct location inside TCP_SKB_CB().
1574 	 * barrier() makes sure the compiler won't play fool^Waliasing games.
1575 	 */
1576 	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1577 		sizeof(struct inet_skb_parm));
1578 	barrier();
1579 
1580 	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1581 	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1582 				    skb->len - th->doff * 4);
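	/* end_seq counts one sequence number for SYN and one for FIN, plus
	 * the payload length (total length minus the TCP header).
	 */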
1583 	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1584 	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1585 	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1586 	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1587 	TCP_SKB_CB(skb)->sacked	 = 0;
1588 
1589 lookup:
1590 	sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1591 			       th->dest, &refcounted);
1592 	if (!sk)
1593 		goto no_tcp_socket;
1594 
1595 process:
1596 	if (sk->sk_state == TCP_TIME_WAIT)
1597 		goto do_time_wait;
1598 
1599 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
1600 		struct request_sock *req = inet_reqsk(sk);
1601 		struct sock *nsk;
1602 
1603 		sk = req->rsk_listener;
1604 		if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
1605 			reqsk_put(req);
1606 			goto discard_it;
1607 		}
1608 		if (unlikely(sk->sk_state != TCP_LISTEN)) {
1609 			inet_csk_reqsk_queue_drop_and_put(sk, req);
1610 			goto lookup;
1611 		}
1612 		/* We own a reference on the listener, increase it again
1613 		 * as we might lose it too soon.
1614 		 */
1615 		sock_hold(sk);
1616 		refcounted = true;
1617 		nsk = tcp_check_req(sk, skb, req, false);
1618 		if (!nsk) {
1619 			reqsk_put(req);
1620 			goto discard_and_relse;
1621 		}
1622 		if (nsk == sk) {
1623 			reqsk_put(req);
1624 		} else if (tcp_child_process(sk, nsk, skb)) {
1625 			tcp_v4_send_reset(nsk, skb);
1626 			goto discard_and_relse;
1627 		} else {
1628 			sock_put(sk);
1629 			return 0;
1630 		}
1631 	}
1632 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1633 		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
1634 		goto discard_and_relse;
1635 	}
1636 
1637 	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1638 		goto discard_and_relse;
1639 
1640 	if (tcp_v4_inbound_md5_hash(sk, skb))
1641 		goto discard_and_relse;
1642 
1643 	nf_reset(skb);
1644 
1645 	if (sk_filter(sk, skb))
1646 		goto discard_and_relse;
1647 
1648 	skb->dev = NULL;
1649 
1650 	if (sk->sk_state == TCP_LISTEN) {
1651 		ret = tcp_v4_do_rcv(sk, skb);
1652 		goto put_and_return;
1653 	}
1654 
1655 	sk_incoming_cpu_update(sk);
1656 
1657 	bh_lock_sock_nested(sk);
1658 	tcp_segs_in(tcp_sk(sk), skb);
1659 	ret = 0;
1660 	if (!sock_owned_by_user(sk)) {
1661 		if (!tcp_prequeue(sk, skb))
1662 			ret = tcp_v4_do_rcv(sk, skb);
1663 	} else if (unlikely(sk_add_backlog(sk, skb,
1664 					   sk->sk_rcvbuf + sk->sk_sndbuf))) {
1665 		bh_unlock_sock(sk);
1666 		__NET_INC_STATS(net, LINUX_MIB_TCPBACKLOGDROP);
1667 		goto discard_and_relse;
1668 	}
1669 	bh_unlock_sock(sk);
1670 
1671 put_and_return:
1672 	if (refcounted)
1673 		sock_put(sk);
1674 
1675 	return ret;
1676 
1677 no_tcp_socket:
1678 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1679 		goto discard_it;
1680 
1681 	if (tcp_checksum_complete(skb)) {
1682 csum_error:
1683 		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
1684 bad_packet:
1685 		__TCP_INC_STATS(net, TCP_MIB_INERRS);
1686 	} else {
1687 		tcp_v4_send_reset(NULL, skb);
1688 	}
1689 
1690 discard_it:
1691 	/* Discard frame. */
1692 	kfree_skb(skb);
1693 	return 0;
1694 
1695 discard_and_relse:
1696 	sk_drops_add(sk, skb);
1697 	if (refcounted)
1698 		sock_put(sk);
1699 	goto discard_it;
1700 
1701 do_time_wait:
1702 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1703 		inet_twsk_put(inet_twsk(sk));
1704 		goto discard_it;
1705 	}
1706 
1707 	if (tcp_checksum_complete(skb)) {
1708 		inet_twsk_put(inet_twsk(sk));
1709 		goto csum_error;
1710 	}
1711 	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1712 	case TCP_TW_SYN: {
1713 		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1714 							&tcp_hashinfo, skb,
1715 							__tcp_hdrlen(th),
1716 							iph->saddr, th->source,
1717 							iph->daddr, th->dest,
1718 							inet_iif(skb));
1719 		if (sk2) {
1720 			inet_twsk_deschedule_put(inet_twsk(sk));
1721 			sk = sk2;
1722 			refcounted = false;
1723 			goto process;
1724 		}
1725 		/* Fall through to ACK */
1726 	}
1727 	case TCP_TW_ACK:
1728 		tcp_v4_timewait_ack(sk, skb);
1729 		break;
1730 	case TCP_TW_RST:
1731 		tcp_v4_send_reset(sk, skb);
1732 		inet_twsk_deschedule_put(inet_twsk(sk));
1733 		goto discard_it;
1734 	case TCP_TW_SUCCESS:;
1735 	}
1736 	goto discard_it;
1737 }
1738 
1739 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1740 	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
1741 	.twsk_unique	= tcp_twsk_unique,
1742 	.twsk_destructor= tcp_twsk_destructor,
1743 };
1744 
1745 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
1746 {
1747 	struct dst_entry *dst = skb_dst(skb);
1748 
1749 	if (dst && dst_hold_safe(dst)) {
1750 		sk->sk_rx_dst = dst;
1751 		inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
1752 	}
1753 }
1754 EXPORT_SYMBOL(inet_sk_rx_dst_set);
1755 
1756 const struct inet_connection_sock_af_ops ipv4_specific = {
1757 	.queue_xmit	   = ip_queue_xmit,
1758 	.send_check	   = tcp_v4_send_check,
1759 	.rebuild_header	   = inet_sk_rebuild_header,
1760 	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
1761 	.conn_request	   = tcp_v4_conn_request,
1762 	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
1763 	.net_header_len	   = sizeof(struct iphdr),
1764 	.setsockopt	   = ip_setsockopt,
1765 	.getsockopt	   = ip_getsockopt,
1766 	.addr2sockaddr	   = inet_csk_addr2sockaddr,
1767 	.sockaddr_len	   = sizeof(struct sockaddr_in),
1768 	.bind_conflict	   = inet_csk_bind_conflict,
1769 #ifdef CONFIG_COMPAT
1770 	.compat_setsockopt = compat_ip_setsockopt,
1771 	.compat_getsockopt = compat_ip_getsockopt,
1772 #endif
1773 	.mtu_reduced	   = tcp_v4_mtu_reduced,
1774 };
1775 EXPORT_SYMBOL(ipv4_specific);
1776 
1777 #ifdef CONFIG_TCP_MD5SIG
1778 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1779 	.md5_lookup		= tcp_v4_md5_lookup,
1780 	.calc_md5_hash		= tcp_v4_md5_hash_skb,
1781 	.md5_parse		= tcp_v4_parse_md5_keys,
1782 };
1783 #endif
1784 
1785 /* NOTE: A lot of things set to zero explicitly by call to
1786  *       sk_alloc() so need not be done here.
1787  */
1788 static int tcp_v4_init_sock(struct sock *sk)
1789 {
1790 	struct inet_connection_sock *icsk = inet_csk(sk);
1791 
1792 	tcp_init_sock(sk);
1793 
1794 	icsk->icsk_af_ops = &ipv4_specific;
1795 
1796 #ifdef CONFIG_TCP_MD5SIG
1797 	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
1798 #endif
1799 
1800 	return 0;
1801 }
1802 
1803 void tcp_v4_destroy_sock(struct sock *sk)
1804 {
1805 	struct tcp_sock *tp = tcp_sk(sk);
1806 
1807 	tcp_clear_xmit_timers(sk);
1808 
1809 	tcp_cleanup_congestion_control(sk);
1810 
1811 	/* Clean up the write buffer. */
1812 	tcp_write_queue_purge(sk);
1813 
1814 	/* Cleans up our, hopefully empty, out_of_order_queue. */
1815 	__skb_queue_purge(&tp->out_of_order_queue);
1816 
1817 #ifdef CONFIG_TCP_MD5SIG
1818 	/* Clean up the MD5 key list, if any */
1819 	if (tp->md5sig_info) {
1820 		tcp_clear_md5_list(sk);
1821 		kfree_rcu(tp->md5sig_info, rcu);
1822 		tp->md5sig_info = NULL;
1823 	}
1824 #endif
1825 
1826 	/* Clean prequeue, it must be empty really */
1827 	__skb_queue_purge(&tp->ucopy.prequeue);
1828 
1829 	/* Clean up a referenced TCP bind bucket. */
1830 	if (inet_csk(sk)->icsk_bind_hash)
1831 		inet_put_port(sk);
1832 
1833 	BUG_ON(tp->fastopen_rsk);
1834 
1835 	/* If socket is aborted during connect operation */
1836 	tcp_free_fastopen_req(tp);
1837 	tcp_saved_syn_free(tp);
1838 
1839 	local_bh_disable();
1840 	sk_sockets_allocated_dec(sk);
1841 	local_bh_enable();
1842 
1843 	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
1844 		sock_release_memcg(sk);
1845 }
1846 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1847 
1848 #ifdef CONFIG_PROC_FS
1849 /* Proc filesystem TCP sock list dumping. */
1850 
1851 /*
1852  * Get the next listener socket following cur.  If cur is NULL, get the
1853  * first socket, starting from the bucket given in st->bucket; when
1854  * st->bucket is zero, the very first socket in the hash table is returned.
1855  */
1856 static void *listening_get_next(struct seq_file *seq, void *cur)
1857 {
1858 	struct tcp_iter_state *st = seq->private;
1859 	struct net *net = seq_file_net(seq);
1860 	struct inet_listen_hashbucket *ilb;
1862 	struct sock *sk = cur;
1863 
1864 	if (!sk) {
1865 get_head:
1866 		ilb = &tcp_hashinfo.listening_hash[st->bucket];
1867 		spin_lock_bh(&ilb->lock);
1868 		sk = sk_head(&ilb->head);
1869 		st->offset = 0;
1870 		goto get_sk;
1871 	}
1872 	ilb = &tcp_hashinfo.listening_hash[st->bucket];
1873 	++st->num;
1874 	++st->offset;
1875 
1876 	sk = sk_next(sk);
1877 get_sk:
1878 	sk_for_each_from(sk) {
1879 		if (!net_eq(sock_net(sk), net))
1880 			continue;
1881 		if (sk->sk_family == st->family)
1882 			return sk;
1884 	}
1885 	spin_unlock_bh(&ilb->lock);
1886 	st->offset = 0;
1887 	if (++st->bucket < INET_LHTABLE_SIZE)
1888 		goto get_head;
1889 	return NULL;
1890 }
1891 
1892 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1893 {
1894 	struct tcp_iter_state *st = seq->private;
1895 	void *rc;
1896 
1897 	st->bucket = 0;
1898 	st->offset = 0;
1899 	rc = listening_get_next(seq, NULL);
1900 
1901 	while (rc && *pos) {
1902 		rc = listening_get_next(seq, rc);
1903 		--*pos;
1904 	}
1905 	return rc;
1906 }
1907 
1908 static inline bool empty_bucket(const struct tcp_iter_state *st)
1909 {
1910 	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
1911 }
1912 
1913 /*
1914  * Get the first established socket, starting from the bucket given in
1915  * st->bucket.  If st->bucket is zero, the very first socket in the hash is returned.
1916  */
1917 static void *established_get_first(struct seq_file *seq)
1918 {
1919 	struct tcp_iter_state *st = seq->private;
1920 	struct net *net = seq_file_net(seq);
1921 	void *rc = NULL;
1922 
1923 	st->offset = 0;
1924 	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
1925 		struct sock *sk;
1926 		struct hlist_nulls_node *node;
1927 		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
1928 
1929 		/* Lockless fast path for the common case of empty buckets */
1930 		if (empty_bucket(st))
1931 			continue;
1932 
1933 		spin_lock_bh(lock);
1934 		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
1935 			if (sk->sk_family != st->family ||
1936 			    !net_eq(sock_net(sk), net)) {
1937 				continue;
1938 			}
1939 			rc = sk;
1940 			goto out;
1941 		}
1942 		spin_unlock_bh(lock);
1943 	}
1944 out:
1945 	return rc;
1946 }
1947 
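/* Advance to the next socket in the established hash after cur, moving
 * on to the next bucket (via established_get_first()) once the current
 * chain is exhausted.
 */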
1948 static void *established_get_next(struct seq_file *seq, void *cur)
1949 {
1950 	struct sock *sk = cur;
1951 	struct hlist_nulls_node *node;
1952 	struct tcp_iter_state *st = seq->private;
1953 	struct net *net = seq_file_net(seq);
1954 
1955 	++st->num;
1956 	++st->offset;
1957 
1958 	sk = sk_nulls_next(sk);
1959 
1960 	sk_nulls_for_each_from(sk, node) {
1961 		if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
1962 			return sk;
1963 	}
1964 
1965 	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
1966 	++st->bucket;
1967 	return established_get_first(seq);
1968 }
1969 
1970 static void *established_get_idx(struct seq_file *seq, loff_t pos)
1971 {
1972 	struct tcp_iter_state *st = seq->private;
1973 	void *rc;
1974 
1975 	st->bucket = 0;
1976 	rc = established_get_first(seq);
1977 
1978 	while (rc && pos) {
1979 		rc = established_get_next(seq, rc);
1980 		--pos;
1981 	}
1982 	return rc;
1983 }
1984 
1985 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
1986 {
1987 	void *rc;
1988 	struct tcp_iter_state *st = seq->private;
1989 
1990 	st->state = TCP_SEQ_STATE_LISTENING;
1991 	rc	  = listening_get_idx(seq, &pos);
1992 
1993 	if (!rc) {
1994 		st->state = TCP_SEQ_STATE_ESTABLISHED;
1995 		rc	  = established_get_idx(seq, pos);
1996 	}
1997 
1998 	return rc;
1999 }
2000 
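/* Try to resume the iteration at the bucket/offset remembered from the
 * previous read, so that sequential reads of /proc/net/tcp do not rescan
 * the whole hash table from the start.
 */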
2001 static void *tcp_seek_last_pos(struct seq_file *seq)
2002 {
2003 	struct tcp_iter_state *st = seq->private;
2004 	int offset = st->offset;
2005 	int orig_num = st->num;
2006 	void *rc = NULL;
2007 
2008 	switch (st->state) {
2009 	case TCP_SEQ_STATE_LISTENING:
2010 		if (st->bucket >= INET_LHTABLE_SIZE)
2011 			break;
2012 		st->state = TCP_SEQ_STATE_LISTENING;
2013 		rc = listening_get_next(seq, NULL);
2014 		while (offset-- && rc)
2015 			rc = listening_get_next(seq, rc);
2016 		if (rc)
2017 			break;
2018 		st->bucket = 0;
2019 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2020 		/* Fallthrough */
2021 	case TCP_SEQ_STATE_ESTABLISHED:
2022 		if (st->bucket > tcp_hashinfo.ehash_mask)
2023 			break;
2024 		rc = established_get_first(seq);
2025 		while (offset-- && rc)
2026 			rc = established_get_next(seq, rc);
2027 	}
2028 
2029 	st->num = orig_num;
2030 
2031 	return rc;
2032 }
2033 
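/* seq_file ->start(): hand back SEQ_START_TOKEN for the header line,
 * otherwise the socket at position *pos, preferring the cheap resume
 * path in tcp_seek_last_pos() when the position matches the last read.
 */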
2034 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2035 {
2036 	struct tcp_iter_state *st = seq->private;
2037 	void *rc;
2038 
2039 	if (*pos && *pos == st->last_pos) {
2040 		rc = tcp_seek_last_pos(seq);
2041 		if (rc)
2042 			goto out;
2043 	}
2044 
2045 	st->state = TCP_SEQ_STATE_LISTENING;
2046 	st->num = 0;
2047 	st->bucket = 0;
2048 	st->offset = 0;
2049 	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2050 
2051 out:
2052 	st->last_pos = *pos;
2053 	return rc;
2054 }
2055 
2056 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2057 {
2058 	struct tcp_iter_state *st = seq->private;
2059 	void *rc = NULL;
2060 
2061 	if (v == SEQ_START_TOKEN) {
2062 		rc = tcp_get_idx(seq, 0);
2063 		goto out;
2064 	}
2065 
2066 	switch (st->state) {
2067 	case TCP_SEQ_STATE_LISTENING:
2068 		rc = listening_get_next(seq, v);
2069 		if (!rc) {
2070 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2071 			st->bucket = 0;
2072 			st->offset = 0;
2073 			rc	  = established_get_first(seq);
2074 		}
2075 		break;
2076 	case TCP_SEQ_STATE_ESTABLISHED:
2077 		rc = established_get_next(seq, v);
2078 		break;
2079 	}
2080 out:
2081 	++*pos;
2082 	st->last_pos = *pos;
2083 	return rc;
2084 }
2085 
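/* seq_file ->stop(): drop whichever bucket lock the iteration helpers
 * left held for the element currently being shown.
 */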
2086 static void tcp_seq_stop(struct seq_file *seq, void *v)
2087 {
2088 	struct tcp_iter_state *st = seq->private;
2089 
2090 	switch (st->state) {
2091 	case TCP_SEQ_STATE_LISTENING:
2092 		if (v != SEQ_START_TOKEN)
2093 			spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2094 		break;
2095 	case TCP_SEQ_STATE_ESTABLISHED:
2096 		if (v)
2097 			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2098 		break;
2099 	}
2100 }
2101 
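/* ->open() handler for the per-family /proc/net/tcp* files: set up a
 * net-aware seq_file and remember which address family this file dumps.
 */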
2102 int tcp_seq_open(struct inode *inode, struct file *file)
2103 {
2104 	struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
2105 	struct tcp_iter_state *s;
2106 	int err;
2107 
2108 	err = seq_open_net(inode, file, &afinfo->seq_ops,
2109 			  sizeof(struct tcp_iter_state));
2110 	if (err < 0)
2111 		return err;
2112 
2113 	s = ((struct seq_file *)file->private_data)->private;
2114 	s->family		= afinfo->family;
2115 	s->last_pos		= 0;
2116 	return 0;
2117 }
2118 EXPORT_SYMBOL(tcp_seq_open);
2119 
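/* Create the per-namespace /proc/net entry described by afinfo, wiring
 * in the iterator callbacks defined above.
 */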
2120 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2121 {
2122 	int rc = 0;
2123 	struct proc_dir_entry *p;
2124 
2125 	afinfo->seq_ops.start		= tcp_seq_start;
2126 	afinfo->seq_ops.next		= tcp_seq_next;
2127 	afinfo->seq_ops.stop		= tcp_seq_stop;
2128 
2129 	p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2130 			     afinfo->seq_fops, afinfo);
2131 	if (!p)
2132 		rc = -ENOMEM;
2133 	return rc;
2134 }
2135 EXPORT_SYMBOL(tcp_proc_register);
2136 
2137 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2138 {
2139 	remove_proc_entry(afinfo->name, net->proc_net);
2140 }
2141 EXPORT_SYMBOL(tcp_proc_unregister);
2142 
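/* Format one request socket (an embryonic connection in SYN_RECV) in
 * the /proc/net/tcp line layout.
 */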
2143 static void get_openreq4(const struct request_sock *req,
2144 			 struct seq_file *f, int i)
2145 {
2146 	const struct inet_request_sock *ireq = inet_rsk(req);
2147 	long delta = req->rsk_timer.expires - jiffies;
2148 
2149 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2150 		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2151 		i,
2152 		ireq->ir_loc_addr,
2153 		ireq->ir_num,
2154 		ireq->ir_rmt_addr,
2155 		ntohs(ireq->ir_rmt_port),
2156 		TCP_SYN_RECV,
2157 		0, 0, /* could print option size, but that is af dependent. */
2158 		1,    /* timers active (only the expire timer) */
2159 		jiffies_delta_to_clock_t(delta),
2160 		req->num_timeout,
2161 		from_kuid_munged(seq_user_ns(f),
2162 				 sock_i_uid(req->rsk_listener)),
2163 		0,  /* non standard timer */
2164 		0, /* open_requests have no inode */
2165 		0,
2166 		req);
2167 }
2168 
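/* Format one full TCP socket for /proc/net/tcp, including which timer
 * is pending, queue sizes and congestion state.  Read locklessly, so
 * some of the reported values may be slightly stale.
 */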
2169 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2170 {
2171 	int timer_active;
2172 	unsigned long timer_expires;
2173 	const struct tcp_sock *tp = tcp_sk(sk);
2174 	const struct inet_connection_sock *icsk = inet_csk(sk);
2175 	const struct inet_sock *inet = inet_sk(sk);
2176 	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2177 	__be32 dest = inet->inet_daddr;
2178 	__be32 src = inet->inet_rcv_saddr;
2179 	__u16 destp = ntohs(inet->inet_dport);
2180 	__u16 srcp = ntohs(inet->inet_sport);
2181 	int rx_queue;
2182 	int state;
2183 
2184 	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2185 	    icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
2186 	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2187 		timer_active	= 1;
2188 		timer_expires	= icsk->icsk_timeout;
2189 	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2190 		timer_active	= 4;
2191 		timer_expires	= icsk->icsk_timeout;
2192 	} else if (timer_pending(&sk->sk_timer)) {
2193 		timer_active	= 2;
2194 		timer_expires	= sk->sk_timer.expires;
2195 	} else {
2196 		timer_active	= 0;
2197 		timer_expires = jiffies;
2198 	}
2199 
2200 	state = sk_state_load(sk);
2201 	if (state == TCP_LISTEN)
2202 		rx_queue = sk->sk_ack_backlog;
2203 	else
2204 		/* Because we don't lock the socket,
2205 		 * we might find a transient negative value.
2206 		 */
2207 		rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2208 
2209 	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2210 			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2211 		i, src, srcp, dest, destp, state,
2212 		tp->write_seq - tp->snd_una,
2213 		rx_queue,
2214 		timer_active,
2215 		jiffies_delta_to_clock_t(timer_expires - jiffies),
2216 		icsk->icsk_retransmits,
2217 		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2218 		icsk->icsk_probes_out,
2219 		sock_i_ino(sk),
2220 		atomic_read(&sk->sk_refcnt), sk,
2221 		jiffies_to_clock_t(icsk->icsk_rto),
2222 		jiffies_to_clock_t(icsk->icsk_ack.ato),
2223 		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2224 		tp->snd_cwnd,
2225 		state == TCP_LISTEN ?
2226 		    fastopenq->max_qlen :
2227 		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2228 }
2229 
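/* Format one TIME_WAIT minisocket for /proc/net/tcp. */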
2230 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2231 			       struct seq_file *f, int i)
2232 {
2233 	long delta = tw->tw_timer.expires - jiffies;
2234 	__be32 dest, src;
2235 	__u16 destp, srcp;
2236 
2237 	dest  = tw->tw_daddr;
2238 	src   = tw->tw_rcv_saddr;
2239 	destp = ntohs(tw->tw_dport);
2240 	srcp  = ntohs(tw->tw_sport);
2241 
2242 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2243 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2244 		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2245 		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2246 		atomic_read(&tw->tw_refcnt), tw);
2247 }
2248 
2249 #define TMPSZ 150
2250 
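/* seq_file ->show(): print the header for SEQ_START_TOKEN, otherwise
 * dispatch on the socket state to the proper formatter above; every
 * line is padded to TMPSZ - 1 characters.
 */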
2251 static int tcp4_seq_show(struct seq_file *seq, void *v)
2252 {
2253 	struct tcp_iter_state *st;
2254 	struct sock *sk = v;
2255 
2256 	seq_setwidth(seq, TMPSZ - 1);
2257 	if (v == SEQ_START_TOKEN) {
2258 		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2259 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2260 			   "inode");
2261 		goto out;
2262 	}
2263 	st = seq->private;
2264 
2265 	if (sk->sk_state == TCP_TIME_WAIT)
2266 		get_timewait4_sock(v, seq, st->num);
2267 	else if (sk->sk_state == TCP_NEW_SYN_RECV)
2268 		get_openreq4(v, seq, st->num);
2269 	else
2270 		get_tcp4_sock(v, seq, st->num);
2271 out:
2272 	seq_pad(seq, '\n');
2273 	return 0;
2274 }
2275 
2276 static const struct file_operations tcp_afinfo_seq_fops = {
2277 	.owner   = THIS_MODULE,
2278 	.open    = tcp_seq_open,
2279 	.read    = seq_read,
2280 	.llseek  = seq_lseek,
2281 	.release = seq_release_net
2282 };
2283 
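/* Descriptor for the IPv4 /proc/net/tcp file; its name, family and
 * seq_ops are what tcp_proc_register() wires into procfs.  The IPv6
 * code registers a matching "tcp6" descriptor through the same helpers.
 */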
2284 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2285 	.name		= "tcp",
2286 	.family		= AF_INET,
2287 	.seq_fops	= &tcp_afinfo_seq_fops,
2288 	.seq_ops	= {
2289 		.show		= tcp4_seq_show,
2290 	},
2291 };
2292 
2293 static int __net_init tcp4_proc_init_net(struct net *net)
2294 {
2295 	return tcp_proc_register(net, &tcp4_seq_afinfo);
2296 }
2297 
2298 static void __net_exit tcp4_proc_exit_net(struct net *net)
2299 {
2300 	tcp_proc_unregister(net, &tcp4_seq_afinfo);
2301 }
2302 
2303 static struct pernet_operations tcp4_net_ops = {
2304 	.init = tcp4_proc_init_net,
2305 	.exit = tcp4_proc_exit_net,
2306 };
2307 
2308 int __init tcp4_proc_init(void)
2309 {
2310 	return register_pernet_subsys(&tcp4_net_ops);
2311 }
2312 
2313 void tcp4_proc_exit(void)
2314 {
2315 	unregister_pernet_subsys(&tcp4_net_ops);
2316 }
2317 #endif /* CONFIG_PROC_FS */
2318 
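/* Operations backing every AF_INET/SOCK_STREAM socket; the socket layer
 * dispatches through this table for connect(), sendmsg(), setsockopt()
 * and friends.
 */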
2319 struct proto tcp_prot = {
2320 	.name			= "TCP",
2321 	.owner			= THIS_MODULE,
2322 	.close			= tcp_close,
2323 	.connect		= tcp_v4_connect,
2324 	.disconnect		= tcp_disconnect,
2325 	.accept			= inet_csk_accept,
2326 	.ioctl			= tcp_ioctl,
2327 	.init			= tcp_v4_init_sock,
2328 	.destroy		= tcp_v4_destroy_sock,
2329 	.shutdown		= tcp_shutdown,
2330 	.setsockopt		= tcp_setsockopt,
2331 	.getsockopt		= tcp_getsockopt,
2332 	.recvmsg		= tcp_recvmsg,
2333 	.sendmsg		= tcp_sendmsg,
2334 	.sendpage		= tcp_sendpage,
2335 	.backlog_rcv		= tcp_v4_do_rcv,
2336 	.release_cb		= tcp_release_cb,
2337 	.hash			= inet_hash,
2338 	.unhash			= inet_unhash,
2339 	.get_port		= inet_csk_get_port,
2340 	.enter_memory_pressure	= tcp_enter_memory_pressure,
2341 	.stream_memory_free	= tcp_stream_memory_free,
2342 	.sockets_allocated	= &tcp_sockets_allocated,
2343 	.orphan_count		= &tcp_orphan_count,
2344 	.memory_allocated	= &tcp_memory_allocated,
2345 	.memory_pressure	= &tcp_memory_pressure,
2346 	.sysctl_mem		= sysctl_tcp_mem,
2347 	.sysctl_wmem		= sysctl_tcp_wmem,
2348 	.sysctl_rmem		= sysctl_tcp_rmem,
2349 	.max_header		= MAX_TCP_HEADER,
2350 	.obj_size		= sizeof(struct tcp_sock),
2351 	.slab_flags		= SLAB_DESTROY_BY_RCU,
2352 	.twsk_prot		= &tcp_timewait_sock_ops,
2353 	.rsk_prot		= &tcp_request_sock_ops,
2354 	.h.hashinfo		= &tcp_hashinfo,
2355 	.no_autobind		= true,
2356 #ifdef CONFIG_COMPAT
2357 	.compat_setsockopt	= compat_tcp_setsockopt,
2358 	.compat_getsockopt	= compat_tcp_getsockopt,
2359 #endif
2360 	.diag_destroy		= tcp_abort,
2361 };
2362 EXPORT_SYMBOL(tcp_prot);
2363 
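/* Per-namespace cleanup: destroy the per-CPU control sockets created in
 * tcp_sk_init().
 */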
2364 static void __net_exit tcp_sk_exit(struct net *net)
2365 {
2366 	int cpu;
2367 
2368 	for_each_possible_cpu(cpu)
2369 		inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2370 	free_percpu(net->ipv4.tcp_sk);
2371 }
2372 
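/* Per-namespace setup: create one kernel control socket per possible
 * CPU (used to send resets and ACKs that are not tied to a full socket)
 * and initialise the namespace's TCP sysctl defaults.
 */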
2373 static int __net_init tcp_sk_init(struct net *net)
2374 {
2375 	int res, cpu;
2376 
2377 	net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2378 	if (!net->ipv4.tcp_sk)
2379 		return -ENOMEM;
2380 
2381 	for_each_possible_cpu(cpu) {
2382 		struct sock *sk;
2383 
2384 		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2385 					   IPPROTO_TCP, net);
2386 		if (res)
2387 			goto fail;
2388 		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2389 		*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2390 	}
2391 
2392 	net->ipv4.sysctl_tcp_ecn = 2;
2393 	net->ipv4.sysctl_tcp_ecn_fallback = 1;
2394 
2395 	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2396 	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2397 	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2398 
2399 	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2400 	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2401 	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2402 
2403 	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2404 	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2405 	net->ipv4.sysctl_tcp_syncookies = 1;
2406 	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2407 	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2408 	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2409 	net->ipv4.sysctl_tcp_orphan_retries = 0;
2410 	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2411 	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
2412 
2413 	return 0;
2414 fail:
2415 	tcp_sk_exit(net);
2416 
2417 	return res;
2418 }
2419 
2420 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2421 {
2422 	inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2423 }
2424 
2425 static struct pernet_operations __net_initdata tcp_sk_ops = {
2426        .init	   = tcp_sk_init,
2427        .exit	   = tcp_sk_exit,
2428        .exit_batch = tcp_sk_exit_batch,
2429 };
2430 
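/* Called once at boot from inet_init(): initialise the global TCP hash
 * table bookkeeping and register the per-namespace operations above.
 */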
2431 void __init tcp_v4_init(void)
2432 {
2433 	inet_hashinfo_init(&tcp_hashinfo);
2434 	if (register_pernet_subsys(&tcp_sk_ops))
2435 		panic("Failed to create the TCP control socket.\n");
2436 }
2437