xref: /openbmc/linux/net/ipv4/tcp_ipv4.c (revision f930103421f6579719b8252285c94c1195f6e032)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		Implementation of the Transmission Control Protocol(TCP).
7  *
8  *		IPv4 specific functions
9  *
10  *
11  *		code split from:
12  *		linux/ipv4/tcp.c
13  *		linux/ipv4/tcp_input.c
14  *		linux/ipv4/tcp_output.c
15  *
16  *		See tcp.c for author information
17  *
18  *	This program is free software; you can redistribute it and/or
19  *      modify it under the terms of the GNU General Public License
20  *      as published by the Free Software Foundation; either version
21  *      2 of the License, or (at your option) any later version.
22  */
23 
24 /*
25  * Changes:
26  *		David S. Miller	:	New socket lookup architecture.
27  *					This code is dedicated to John Dyson.
28  *		David S. Miller :	Change semantics of established hash,
29  *					half is devoted to TIME_WAIT sockets
30  *					and the rest go in the other half.
31  *		Andi Kleen :		Add support for syncookies and fixed
32  *					some bugs: ip options weren't passed to
33  *					the TCP layer, missed a check for an
34  *					ACK bit.
35  *		Andi Kleen :		Implemented fast path mtu discovery.
36  *	     				Fixed many serious bugs in the
37  *					request_sock handling and moved
38  *					most of it into the af independent code.
39  *					Added tail drop and some other bugfixes.
40  *					Added new listen semantics.
41  *		Mike McLagan	:	Routing by source
42  *	Juan Jose Ciarlante:		ip_dynaddr bits
43  *		Andi Kleen:		various fixes.
44  *	Vitaly E. Lavrov	:	Transparent proxy revived after year
45  *					coma.
46  *	Andi Kleen		:	Fix new listen.
47  *	Andi Kleen		:	Fix accept error reporting.
48  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
49  *	Alexey Kuznetsov		allows both IPv4 and IPv6 sockets to bind
50  *					a single port at the same time.
51  */
52 
53 #define pr_fmt(fmt) "TCP: " fmt
54 
55 #include <linux/bottom_half.h>
56 #include <linux/types.h>
57 #include <linux/fcntl.h>
58 #include <linux/module.h>
59 #include <linux/random.h>
60 #include <linux/cache.h>
61 #include <linux/jhash.h>
62 #include <linux/init.h>
63 #include <linux/times.h>
64 #include <linux/slab.h>
65 
66 #include <net/net_namespace.h>
67 #include <net/icmp.h>
68 #include <net/inet_hashtables.h>
69 #include <net/tcp.h>
70 #include <net/transp_v6.h>
71 #include <net/ipv6.h>
72 #include <net/inet_common.h>
73 #include <net/timewait_sock.h>
74 #include <net/xfrm.h>
75 #include <net/secure_seq.h>
76 #include <net/busy_poll.h>
77 
78 #include <linux/inet.h>
79 #include <linux/ipv6.h>
80 #include <linux/stddef.h>
81 #include <linux/proc_fs.h>
82 #include <linux/seq_file.h>
83 
84 #include <crypto/hash.h>
85 #include <linux/scatterlist.h>
86 
87 int sysctl_tcp_low_latency __read_mostly;
88 
89 #ifdef CONFIG_TCP_MD5SIG
90 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
91 			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
92 #endif
93 
94 struct inet_hashinfo tcp_hashinfo;
95 EXPORT_SYMBOL(tcp_hashinfo);
96 
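/* Derive per-connection initial values from the addresses and ports of the
 * incoming SYN: secure_tcp_seq() computes an initial sequence number from a
 * keyed hash of the 4-tuple (see <net/secure_seq.h>), so ISNs are hard to
 * predict off-path, and secure_tcp_ts_off() derives a per host-pair
 * timestamp offset in the same way.
 */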
97 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
98 {
99 	return secure_tcp_seq(ip_hdr(skb)->daddr,
100 			      ip_hdr(skb)->saddr,
101 			      tcp_hdr(skb)->dest,
102 			      tcp_hdr(skb)->source);
103 }
104 
105 static u32 tcp_v4_init_ts_off(const struct sk_buff *skb)
106 {
107 	return secure_tcp_ts_off(ip_hdr(skb)->daddr,
108 				 ip_hdr(skb)->saddr);
109 }
110 
111 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
112 {
113 	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
114 	struct tcp_sock *tp = tcp_sk(sk);
115 
116 	/* With PAWS, it is safe from the viewpoint
117 	   of data integrity. Even without PAWS it is safe provided sequence
118 	   spaces do not overlap, i.e. at data rates <= 80Mbit/sec.
119 
120 	   Actually, the idea is close to VJ's: only the timestamp cache is
121 	   held not per host but per port pair, and the TW bucket is used as
122 	   the state holder.
123 
124 	   If the TW bucket has already been destroyed we fall back to VJ's
125 	   scheme and use the initial timestamp retrieved from the peer table.
126 	 */
127 	if (tcptw->tw_ts_recent_stamp &&
128 	    (!twp || (sock_net(sk)->ipv4.sysctl_tcp_tw_reuse &&
129 			     get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
130 		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
131 		if (tp->write_seq == 0)
132 			tp->write_seq = 1;
133 		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
134 		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
135 		sock_hold(sktw);
136 		return 1;
137 	}
138 
139 	return 0;
140 }
141 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
142 
143 /* This will initiate an outgoing connection. */
144 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
145 {
146 	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
147 	struct inet_sock *inet = inet_sk(sk);
148 	struct tcp_sock *tp = tcp_sk(sk);
149 	__be16 orig_sport, orig_dport;
150 	__be32 daddr, nexthop;
151 	struct flowi4 *fl4;
152 	struct rtable *rt;
153 	int err;
154 	struct ip_options_rcu *inet_opt;
155 	struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
156 
157 	if (addr_len < sizeof(struct sockaddr_in))
158 		return -EINVAL;
159 
160 	if (usin->sin_family != AF_INET)
161 		return -EAFNOSUPPORT;
162 
163 	nexthop = daddr = usin->sin_addr.s_addr;
164 	inet_opt = rcu_dereference_protected(inet->inet_opt,
165 					     lockdep_sock_is_held(sk));
166 	if (inet_opt && inet_opt->opt.srr) {
167 		if (!daddr)
168 			return -EINVAL;
169 		nexthop = inet_opt->opt.faddr;
170 	}
171 
172 	orig_sport = inet->inet_sport;
173 	orig_dport = usin->sin_port;
174 	fl4 = &inet->cork.fl.u.ip4;
175 	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
176 			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
177 			      IPPROTO_TCP,
178 			      orig_sport, orig_dport, sk);
179 	if (IS_ERR(rt)) {
180 		err = PTR_ERR(rt);
181 		if (err == -ENETUNREACH)
182 			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
183 		return err;
184 	}
185 
186 	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
187 		ip_rt_put(rt);
188 		return -ENETUNREACH;
189 	}
190 
191 	if (!inet_opt || !inet_opt->opt.srr)
192 		daddr = fl4->daddr;
193 
194 	if (!inet->inet_saddr)
195 		inet->inet_saddr = fl4->saddr;
196 	sk_rcv_saddr_set(sk, inet->inet_saddr);
197 
198 	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
199 		/* Reset inherited state */
200 		tp->rx_opt.ts_recent	   = 0;
201 		tp->rx_opt.ts_recent_stamp = 0;
202 		if (likely(!tp->repair))
203 			tp->write_seq	   = 0;
204 	}
205 
206 	inet->inet_dport = usin->sin_port;
207 	sk_daddr_set(sk, daddr);
208 
209 	inet_csk(sk)->icsk_ext_hdr_len = 0;
210 	if (inet_opt)
211 		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
212 
213 	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
214 
215 	/* Socket identity is still unknown (sport may be zero).
216 	 * However we set state to SYN-SENT and, without releasing the socket
217 	 * lock, select a source port, enter ourselves into the hash tables and
218 	 * complete initialization after this.
219 	 */
220 	tcp_set_state(sk, TCP_SYN_SENT);
221 	err = inet_hash_connect(tcp_death_row, sk);
222 	if (err)
223 		goto failure;
224 
225 	sk_set_txhash(sk);
226 
227 	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
228 			       inet->inet_sport, inet->inet_dport, sk);
229 	if (IS_ERR(rt)) {
230 		err = PTR_ERR(rt);
231 		rt = NULL;
232 		goto failure;
233 	}
234 	/* OK, now commit destination to socket.  */
235 	sk->sk_gso_type = SKB_GSO_TCPV4;
236 	sk_setup_caps(sk, &rt->dst);
237 	rt = NULL;
238 
239 	if (likely(!tp->repair)) {
240 		if (!tp->write_seq)
241 			tp->write_seq = secure_tcp_seq(inet->inet_saddr,
242 						       inet->inet_daddr,
243 						       inet->inet_sport,
244 						       usin->sin_port);
245 		tp->tsoffset = secure_tcp_ts_off(inet->inet_saddr,
246 						 inet->inet_daddr);
247 	}
248 
249 	inet->inet_id = tp->write_seq ^ jiffies;
250 
251 	if (tcp_fastopen_defer_connect(sk, &err))
252 		return err;
253 	if (err)
254 		goto failure;
255 
256 	err = tcp_connect(sk);
257 
258 	if (err)
259 		goto failure;
260 
261 	return 0;
262 
263 failure:
264 	/*
265 	 * This unhashes the socket and releases the local port,
266 	 * if necessary.
267 	 */
268 	tcp_set_state(sk, TCP_CLOSE);
269 	ip_rt_put(rt);
270 	sk->sk_route_caps = 0;
271 	inet->inet_dport = 0;
272 	return err;
273 }
274 EXPORT_SYMBOL(tcp_v4_connect);
275 
276 /*
277  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
278  * It can be called through tcp_release_cb() if the socket was owned by the user
279  * at the time tcp_v4_err() was called to handle the ICMP message.
280  */
281 void tcp_v4_mtu_reduced(struct sock *sk)
282 {
283 	struct inet_sock *inet = inet_sk(sk);
284 	struct dst_entry *dst;
285 	u32 mtu;
286 
287 	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
288 		return;
289 	mtu = tcp_sk(sk)->mtu_info;
290 	dst = inet_csk_update_pmtu(sk, mtu);
291 	if (!dst)
292 		return;
293 
294 	/* Something is about to go wrong... Remember the soft error
295 	 * in case this connection is not able to recover.
296 	 */
297 	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
298 		sk->sk_err_soft = EMSGSIZE;
299 
300 	mtu = dst_mtu(dst);
301 
302 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
303 	    ip_sk_accept_pmtu(sk) &&
304 	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
305 		tcp_sync_mss(sk, mtu);
306 
307 		/* Resend the TCP packet because it's
308 		 * clear that the old packet has been
309 		 * dropped. This is the new "fast" path mtu
310 		 * discovery.
311 		 */
312 		tcp_simple_retransmit(sk);
313 	} /* else let the usual retransmit timer handle it */
314 }
315 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
316 
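/* Hand an ICMP redirect to the route cached on the socket (if one is still
 * attached) so that its ->redirect() handler can update the next hop.
 */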
317 static void do_redirect(struct sk_buff *skb, struct sock *sk)
318 {
319 	struct dst_entry *dst = __sk_dst_check(sk, 0);
320 
321 	if (dst)
322 		dst->ops->redirect(dst, sk, skb);
323 }
324 
325 
326 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
327 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
328 {
329 	struct request_sock *req = inet_reqsk(sk);
330 	struct net *net = sock_net(sk);
331 
332 	/* ICMPs are not backlogged, hence we cannot get
333 	 * an established socket here.
334 	 */
335 	if (seq != tcp_rsk(req)->snt_isn) {
336 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
337 	} else if (abort) {
338 		/*
339 		 * Still in SYN_RECV, just remove it silently.
340 		 * There is no good way to pass the error to the newly
341 		 * created socket, and POSIX does not want network
342 		 * errors returned from accept().
343 		 */
344 		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
345 		tcp_listendrop(req->rsk_listener);
346 	}
347 	reqsk_put(req);
348 }
349 EXPORT_SYMBOL(tcp_req_err);
350 
351 /*
352  * This routine is called by the ICMP module when it gets some
353  * sort of error condition.  If err < 0 then the socket should
354  * be closed and the error returned to the user.  If err > 0
355  * it's just the icmp type << 8 | icmp code.  After adjustment
356  * header points to the first 8 bytes of the tcp header.  We need
357  * to find the appropriate port.
358  *
359  * The locking strategy used here is very "optimistic". When
360  * someone else accesses the socket the ICMP is just dropped
361  * and for some paths there is no check at all.
362  * A more general error queue to queue errors for later handling
363  * is probably better.
364  *
365  */
366 
367 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
368 {
369 	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
370 	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
371 	struct inet_connection_sock *icsk;
372 	struct tcp_sock *tp;
373 	struct inet_sock *inet;
374 	const int type = icmp_hdr(icmp_skb)->type;
375 	const int code = icmp_hdr(icmp_skb)->code;
376 	struct sock *sk;
377 	struct sk_buff *skb;
378 	struct request_sock *fastopen;
379 	u32 seq, snd_una;
380 	s32 remaining;
381 	u32 delta_us;
382 	int err;
383 	struct net *net = dev_net(icmp_skb->dev);
384 
385 	sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
386 				       th->dest, iph->saddr, ntohs(th->source),
387 				       inet_iif(icmp_skb));
388 	if (!sk) {
389 		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
390 		return;
391 	}
392 	if (sk->sk_state == TCP_TIME_WAIT) {
393 		inet_twsk_put(inet_twsk(sk));
394 		return;
395 	}
396 	seq = ntohl(th->seq);
397 	if (sk->sk_state == TCP_NEW_SYN_RECV)
398 		return tcp_req_err(sk, seq,
399 				  type == ICMP_PARAMETERPROB ||
400 				  type == ICMP_TIME_EXCEEDED ||
401 				  (type == ICMP_DEST_UNREACH &&
402 				   (code == ICMP_NET_UNREACH ||
403 				    code == ICMP_HOST_UNREACH)));
404 
405 	bh_lock_sock(sk);
406 	/* If too many ICMPs get dropped on busy
407 	 * servers this needs to be solved differently.
408 	 * We do take care of the PMTU discovery (RFC1191) special case:
409 	 * we can receive locally generated ICMP messages while the socket is held.
410 	 */
411 	if (sock_owned_by_user(sk)) {
412 		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
413 			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
414 	}
415 	if (sk->sk_state == TCP_CLOSE)
416 		goto out;
417 
418 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
419 		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
420 		goto out;
421 	}
422 
423 	icsk = inet_csk(sk);
424 	tp = tcp_sk(sk);
425 	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
426 	fastopen = tp->fastopen_rsk;
427 	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
428 	if (sk->sk_state != TCP_LISTEN &&
429 	    !between(seq, snd_una, tp->snd_nxt)) {
430 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
431 		goto out;
432 	}
433 
434 	switch (type) {
435 	case ICMP_REDIRECT:
436 		if (!sock_owned_by_user(sk))
437 			do_redirect(icmp_skb, sk);
438 		goto out;
439 	case ICMP_SOURCE_QUENCH:
440 		/* Just silently ignore these. */
441 		goto out;
442 	case ICMP_PARAMETERPROB:
443 		err = EPROTO;
444 		break;
445 	case ICMP_DEST_UNREACH:
446 		if (code > NR_ICMP_UNREACH)
447 			goto out;
448 
449 		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
450 			/* We are not interested in TCP_LISTEN and open_requests
451 			 * (SYN-ACKs sent out by Linux are always < 576 bytes so
452 			 * they should go through unfragmented).
453 			 */
454 			if (sk->sk_state == TCP_LISTEN)
455 				goto out;
456 
457 			tp->mtu_info = info;
458 			if (!sock_owned_by_user(sk)) {
459 				tcp_v4_mtu_reduced(sk);
460 			} else {
461 				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
462 					sock_hold(sk);
463 			}
464 			goto out;
465 		}
466 
467 		err = icmp_err_convert[code].errno;
468 		/* check if icmp_skb allows revert of backoff
469 		 * (see draft-zimmermann-tcp-lcd) */
470 		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
471 			break;
472 		if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
473 		    !icsk->icsk_backoff || fastopen)
474 			break;
475 
476 		if (sock_owned_by_user(sk))
477 			break;
478 
479 		icsk->icsk_backoff--;
480 		icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
481 					       TCP_TIMEOUT_INIT;
482 		icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
483 
484 		skb = tcp_write_queue_head(sk);
485 		BUG_ON(!skb);
486 
487 		tcp_mstamp_refresh(tp);
488 		delta_us = (u32)(tp->tcp_mstamp - skb->skb_mstamp);
489 		remaining = icsk->icsk_rto -
490 			    usecs_to_jiffies(delta_us);
491 
492 		if (remaining > 0) {
493 			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
494 						  remaining, TCP_RTO_MAX);
495 		} else {
496 			/* RTO revert clocked out retransmission.
497 			 * Will retransmit now */
498 			tcp_retransmit_timer(sk);
499 		}
500 
501 		break;
502 	case ICMP_TIME_EXCEEDED:
503 		err = EHOSTUNREACH;
504 		break;
505 	default:
506 		goto out;
507 	}
508 
509 	switch (sk->sk_state) {
510 	case TCP_SYN_SENT:
511 	case TCP_SYN_RECV:
512 		/* Only in fast or simultaneous open. If a fast open socket is
513 		 * already accepted it is treated as a connected one below.
514 		 */
515 		if (fastopen && !fastopen->sk)
516 			break;
517 
518 		if (!sock_owned_by_user(sk)) {
519 			sk->sk_err = err;
520 
521 			sk->sk_error_report(sk);
522 
523 			tcp_done(sk);
524 		} else {
525 			sk->sk_err_soft = err;
526 		}
527 		goto out;
528 	}
529 
530 	/* If we've already connected we will keep trying
531 	 * until we time out, or the user gives up.
532 	 *
533 	 * rfc1122 4.2.3.9 allows us to consider as hard errors
534 	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
535 	 * but it is obsoleted by pmtu discovery).
536 	 *
537 	 * Note that in the modern internet, where routing is unreliable
538 	 * and broken firewalls sit in every dark corner sending random
539 	 * errors ordered by their masters, even these two messages finally
540 	 * lose their original sense (even Linux sends invalid PORT_UNREACHs).
541 	 *
542 	 * Now we are in compliance with RFCs.
543 	 *							--ANK (980905)
544 	 */
545 
546 	inet = inet_sk(sk);
547 	if (!sock_owned_by_user(sk) && inet->recverr) {
548 		sk->sk_err = err;
549 		sk->sk_error_report(sk);
550 	} else	{ /* Only an error on timeout */
551 		sk->sk_err_soft = err;
552 	}
553 
554 out:
555 	bh_unlock_sock(sk);
556 	sock_put(sk);
557 }
558 
559 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
560 {
561 	struct tcphdr *th = tcp_hdr(skb);
562 
563 	if (skb->ip_summed == CHECKSUM_PARTIAL) {
564 		th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
565 		skb->csum_start = skb_transport_header(skb) - skb->head;
566 		skb->csum_offset = offsetof(struct tcphdr, check);
567 	} else {
568 		th->check = tcp_v4_check(skb->len, saddr, daddr,
569 					 csum_partial(th,
570 						      th->doff << 2,
571 						      skb->csum));
572 	}
573 }
574 
575 /* This routine computes an IPv4 TCP checksum. */
576 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
577 {
578 	const struct inet_sock *inet = inet_sk(sk);
579 
580 	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
581 }
582 EXPORT_SYMBOL(tcp_v4_send_check);
583 
584 /*
585  *	This routine will send an RST to the other tcp.
586  *
587  *	Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
588  *		      for the reset?
589  *	Answer: if a packet caused an RST, it is not for a socket
590  *		existing in our system; if it is matched to a socket,
591  *		it is just a duplicate segment or a bug in the other side's TCP.
592  *		So we build the reply based only on parameters
593  *		that arrived with the segment.
594  *	Exception: precedence violation. We do not implement it in any case.
595  */
596 
597 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
598 {
599 	const struct tcphdr *th = tcp_hdr(skb);
600 	struct {
601 		struct tcphdr th;
602 #ifdef CONFIG_TCP_MD5SIG
603 		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
604 #endif
605 	} rep;
606 	struct ip_reply_arg arg;
607 #ifdef CONFIG_TCP_MD5SIG
608 	struct tcp_md5sig_key *key = NULL;
609 	const __u8 *hash_location = NULL;
610 	unsigned char newhash[16];
611 	int genhash;
612 	struct sock *sk1 = NULL;
613 #endif
614 	struct net *net;
615 
616 	/* Never send a reset in response to a reset. */
617 	if (th->rst)
618 		return;
619 
620 	/* If sk is not NULL, it means we did a successful lookup and the incoming
621 	 * route had to be correct. prequeue might have dropped our dst.
622 	 */
623 	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
624 		return;
625 
626 	/* Swap the send and the receive. */
627 	memset(&rep, 0, sizeof(rep));
628 	rep.th.dest   = th->source;
629 	rep.th.source = th->dest;
630 	rep.th.doff   = sizeof(struct tcphdr) / 4;
631 	rep.th.rst    = 1;
632 
633 	if (th->ack) {
634 		rep.th.seq = th->ack_seq;
635 	} else {
636 		rep.th.ack = 1;
637 		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
638 				       skb->len - (th->doff << 2));
639 	}
640 
641 	memset(&arg, 0, sizeof(arg));
642 	arg.iov[0].iov_base = (unsigned char *)&rep;
643 	arg.iov[0].iov_len  = sizeof(rep.th);
644 
645 	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
646 #ifdef CONFIG_TCP_MD5SIG
647 	rcu_read_lock();
648 	hash_location = tcp_parse_md5sig_option(th);
649 	if (sk && sk_fullsock(sk)) {
650 		key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
651 					&ip_hdr(skb)->saddr, AF_INET);
652 	} else if (hash_location) {
653 		/*
654 		 * The active side is lost. Try to find the listening socket
655 		 * through the source port, and then find the md5 key through
656 		 * the listening socket. We are not losing security here:
657 		 * the incoming packet is checked against the md5 hash of the
658 		 * key we find, and no RST is generated if the hash doesn't match.
659 		 */
660 		sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
661 					     ip_hdr(skb)->saddr,
662 					     th->source, ip_hdr(skb)->daddr,
663 					     ntohs(th->source), inet_iif(skb));
664 		/* don't send rst if it can't find key */
665 		if (!sk1)
666 			goto out;
667 
668 		key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
669 					&ip_hdr(skb)->saddr, AF_INET);
670 		if (!key)
671 			goto out;
672 
673 
674 		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
675 		if (genhash || memcmp(hash_location, newhash, 16) != 0)
676 			goto out;
677 
678 	}
679 
680 	if (key) {
681 		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
682 				   (TCPOPT_NOP << 16) |
683 				   (TCPOPT_MD5SIG << 8) |
684 				   TCPOLEN_MD5SIG);
685 		/* Update length and the length the header thinks exists */
686 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
687 		rep.th.doff = arg.iov[0].iov_len / 4;
688 
689 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
690 				     key, ip_hdr(skb)->saddr,
691 				     ip_hdr(skb)->daddr, &rep.th);
692 	}
693 #endif
694 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
695 				      ip_hdr(skb)->saddr, /* XXX */
696 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
697 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
698 	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
699 
700 	/* When socket is gone, all binding information is lost.
701 	 * Routing might fail in this case. No choice here: if we choose to force
702 	 * the input interface, we will misroute in case of an asymmetric route.
703 	 */
704 	if (sk)
705 		arg.bound_dev_if = sk->sk_bound_dev_if;
706 
707 	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
708 		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));
709 
710 	arg.tos = ip_hdr(skb)->tos;
711 	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
712 	local_bh_disable();
713 	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
714 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
715 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
716 			      &arg, arg.iov[0].iov_len);
717 
718 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
719 	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
720 	local_bh_enable();
721 
722 #ifdef CONFIG_TCP_MD5SIG
723 out:
724 	rcu_read_unlock();
725 #endif
726 }
727 
728 /* The code below, sending ACKs in SYN-RECV and TIME-WAIT states
729    outside of socket context, is certainly ugly. What can I do?
730  */
731 
732 static void tcp_v4_send_ack(const struct sock *sk,
733 			    struct sk_buff *skb, u32 seq, u32 ack,
734 			    u32 win, u32 tsval, u32 tsecr, int oif,
735 			    struct tcp_md5sig_key *key,
736 			    int reply_flags, u8 tos)
737 {
738 	const struct tcphdr *th = tcp_hdr(skb);
739 	struct {
740 		struct tcphdr th;
741 		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
742 #ifdef CONFIG_TCP_MD5SIG
743 			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
744 #endif
745 			];
746 	} rep;
747 	struct net *net = sock_net(sk);
748 	struct ip_reply_arg arg;
749 
750 	memset(&rep.th, 0, sizeof(struct tcphdr));
751 	memset(&arg, 0, sizeof(arg));
752 
753 	arg.iov[0].iov_base = (unsigned char *)&rep;
754 	arg.iov[0].iov_len  = sizeof(rep.th);
755 	if (tsecr) {
756 		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
757 				   (TCPOPT_TIMESTAMP << 8) |
758 				   TCPOLEN_TIMESTAMP);
759 		rep.opt[1] = htonl(tsval);
760 		rep.opt[2] = htonl(tsecr);
761 		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
762 	}
763 
764 	/* Swap the send and the receive. */
765 	rep.th.dest    = th->source;
766 	rep.th.source  = th->dest;
767 	rep.th.doff    = arg.iov[0].iov_len / 4;
768 	rep.th.seq     = htonl(seq);
769 	rep.th.ack_seq = htonl(ack);
770 	rep.th.ack     = 1;
771 	rep.th.window  = htons(win);
772 
773 #ifdef CONFIG_TCP_MD5SIG
774 	if (key) {
775 		int offset = (tsecr) ? 3 : 0;
776 
777 		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
778 					  (TCPOPT_NOP << 16) |
779 					  (TCPOPT_MD5SIG << 8) |
780 					  TCPOLEN_MD5SIG);
781 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
782 		rep.th.doff = arg.iov[0].iov_len/4;
783 
784 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
785 				    key, ip_hdr(skb)->saddr,
786 				    ip_hdr(skb)->daddr, &rep.th);
787 	}
788 #endif
789 	arg.flags = reply_flags;
790 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
791 				      ip_hdr(skb)->saddr, /* XXX */
792 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
793 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
794 	if (oif)
795 		arg.bound_dev_if = oif;
796 	arg.tos = tos;
797 	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
798 	local_bh_disable();
799 	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
800 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
801 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
802 			      &arg, arg.iov[0].iov_len);
803 
804 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
805 	local_bh_enable();
806 }
807 
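/* ACK a segment that hit a TIME-WAIT socket, using the sequence numbers,
 * window scale and timestamps cached in the timewait sock.
 */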
808 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
809 {
810 	struct inet_timewait_sock *tw = inet_twsk(sk);
811 	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
812 
813 	tcp_v4_send_ack(sk, skb,
814 			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
815 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
816 			tcp_time_stamp_raw() + tcptw->tw_ts_offset,
817 			tcptw->tw_ts_recent,
818 			tw->tw_bound_dev_if,
819 			tcp_twsk_md5_key(tcptw),
820 			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
821 			tw->tw_tos
822 			);
823 
824 	inet_twsk_put(tw);
825 }
826 
827 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
828 				  struct request_sock *req)
829 {
830 	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
831 	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
832 	 */
833 	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
834 					     tcp_sk(sk)->snd_nxt;
835 
836 	/* RFC 7323 2.3
837 	 * The window field (SEG.WND) of every outgoing segment, with the
838 	 * exception of <SYN> segments, MUST be right-shifted by
839 	 * Rcv.Wind.Shift bits:
840 	 */
841 	tcp_v4_send_ack(sk, skb, seq,
842 			tcp_rsk(req)->rcv_nxt,
843 			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
844 			tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
845 			req->ts_recent,
846 			0,
847 			tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
848 					  AF_INET),
849 			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
850 			ip_hdr(skb)->tos);
851 }
852 
853 /*
854  *	Send a SYN-ACK after having received a SYN.
855  *	This still operates on a request_sock only, not on a big
856  *	socket.
857  */
858 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
859 			      struct flowi *fl,
860 			      struct request_sock *req,
861 			      struct tcp_fastopen_cookie *foc,
862 			      enum tcp_synack_type synack_type)
863 {
864 	const struct inet_request_sock *ireq = inet_rsk(req);
865 	struct flowi4 fl4;
866 	int err = -1;
867 	struct sk_buff *skb;
868 
869 	/* First, grab a route. */
870 	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
871 		return -1;
872 
873 	skb = tcp_make_synack(sk, dst, req, foc, synack_type);
874 
875 	if (skb) {
876 		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
877 
878 		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
879 					    ireq->ir_rmt_addr,
880 					    ireq->opt);
881 		err = net_xmit_eval(err);
882 	}
883 
884 	return err;
885 }
886 
887 /*
888  *	IPv4 request_sock destructor.
889  */
890 static void tcp_v4_reqsk_destructor(struct request_sock *req)
891 {
892 	kfree(inet_rsk(req)->opt);
893 }
894 
895 #ifdef CONFIG_TCP_MD5SIG
896 /*
897  * RFC2385 MD5 checksumming requires a mapping of
898  * IP address->MD5 Key.
899  * We need to maintain these in the sk structure.
900  */
901 
902 /* Find the Key structure for an address.  */
903 struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
904 					 const union tcp_md5_addr *addr,
905 					 int family)
906 {
907 	const struct tcp_sock *tp = tcp_sk(sk);
908 	struct tcp_md5sig_key *key;
909 	unsigned int size = sizeof(struct in_addr);
910 	const struct tcp_md5sig_info *md5sig;
911 
912 	/* caller either holds rcu_read_lock() or socket lock */
913 	md5sig = rcu_dereference_check(tp->md5sig_info,
914 				       lockdep_sock_is_held(sk));
915 	if (!md5sig)
916 		return NULL;
917 #if IS_ENABLED(CONFIG_IPV6)
918 	if (family == AF_INET6)
919 		size = sizeof(struct in6_addr);
920 #endif
921 	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
922 		if (key->family != family)
923 			continue;
924 		if (!memcmp(&key->addr, addr, size))
925 			return key;
926 	}
927 	return NULL;
928 }
929 EXPORT_SYMBOL(tcp_md5_do_lookup);
930 
931 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
932 					 const struct sock *addr_sk)
933 {
934 	const union tcp_md5_addr *addr;
935 
936 	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
937 	return tcp_md5_do_lookup(sk, addr, AF_INET);
938 }
939 EXPORT_SYMBOL(tcp_v4_md5_lookup);
940 
941 /* This can be called on a newly created socket, from other files */
942 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
943 		   int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
944 {
945 	/* Add Key to the list */
946 	struct tcp_md5sig_key *key;
947 	struct tcp_sock *tp = tcp_sk(sk);
948 	struct tcp_md5sig_info *md5sig;
949 
950 	key = tcp_md5_do_lookup(sk, addr, family);
951 	if (key) {
952 		/* Pre-existing entry - just update that one. */
953 		memcpy(key->key, newkey, newkeylen);
954 		key->keylen = newkeylen;
955 		return 0;
956 	}
957 
958 	md5sig = rcu_dereference_protected(tp->md5sig_info,
959 					   lockdep_sock_is_held(sk));
960 	if (!md5sig) {
961 		md5sig = kmalloc(sizeof(*md5sig), gfp);
962 		if (!md5sig)
963 			return -ENOMEM;
964 
965 		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
966 		INIT_HLIST_HEAD(&md5sig->head);
967 		rcu_assign_pointer(tp->md5sig_info, md5sig);
968 	}
969 
970 	key = sock_kmalloc(sk, sizeof(*key), gfp);
971 	if (!key)
972 		return -ENOMEM;
973 	if (!tcp_alloc_md5sig_pool()) {
974 		sock_kfree_s(sk, key, sizeof(*key));
975 		return -ENOMEM;
976 	}
977 
978 	memcpy(key->key, newkey, newkeylen);
979 	key->keylen = newkeylen;
980 	key->family = family;
981 	memcpy(&key->addr, addr,
982 	       (family == AF_INET6) ? sizeof(struct in6_addr) :
983 				      sizeof(struct in_addr));
984 	hlist_add_head_rcu(&key->node, &md5sig->head);
985 	return 0;
986 }
987 EXPORT_SYMBOL(tcp_md5_do_add);
988 
989 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
990 {
991 	struct tcp_md5sig_key *key;
992 
993 	key = tcp_md5_do_lookup(sk, addr, family);
994 	if (!key)
995 		return -ENOENT;
996 	hlist_del_rcu(&key->node);
997 	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
998 	kfree_rcu(key, rcu);
999 	return 0;
1000 }
1001 EXPORT_SYMBOL(tcp_md5_do_del);
1002 
1003 static void tcp_clear_md5_list(struct sock *sk)
1004 {
1005 	struct tcp_sock *tp = tcp_sk(sk);
1006 	struct tcp_md5sig_key *key;
1007 	struct hlist_node *n;
1008 	struct tcp_md5sig_info *md5sig;
1009 
1010 	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1011 
1012 	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1013 		hlist_del_rcu(&key->node);
1014 		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1015 		kfree_rcu(key, rcu);
1016 	}
1017 }
1018 
1019 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
1020 				 int optlen)
1021 {
1022 	struct tcp_md5sig cmd;
1023 	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1024 
1025 	if (optlen < sizeof(cmd))
1026 		return -EINVAL;
1027 
1028 	if (copy_from_user(&cmd, optval, sizeof(cmd)))
1029 		return -EFAULT;
1030 
1031 	if (sin->sin_family != AF_INET)
1032 		return -EINVAL;
1033 
1034 	if (!cmd.tcpm_keylen)
1035 		return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1036 				      AF_INET);
1037 
1038 	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1039 		return -EINVAL;
1040 
1041 	return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1042 			      AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
1043 			      GFP_KERNEL);
1044 }
1045 
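/* Feed the TCP pseudo-header plus a copy of the TCP header (with its
 * checksum field zeroed) into the MD5 hash, as RFC 2385 requires.
 */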
1046 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1047 				   __be32 daddr, __be32 saddr,
1048 				   const struct tcphdr *th, int nbytes)
1049 {
1050 	struct tcp4_pseudohdr *bp;
1051 	struct scatterlist sg;
1052 	struct tcphdr *_th;
1053 
1054 	bp = hp->scratch;
1055 	bp->saddr = saddr;
1056 	bp->daddr = daddr;
1057 	bp->pad = 0;
1058 	bp->protocol = IPPROTO_TCP;
1059 	bp->len = cpu_to_be16(nbytes);
1060 
1061 	_th = (struct tcphdr *)(bp + 1);
1062 	memcpy(_th, th, sizeof(*th));
1063 	_th->check = 0;
1064 
1065 	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1066 	ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1067 				sizeof(*bp) + sizeof(*th));
1068 	return crypto_ahash_update(hp->md5_req);
1069 }
1070 
1071 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1072 			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
1073 {
1074 	struct tcp_md5sig_pool *hp;
1075 	struct ahash_request *req;
1076 
1077 	hp = tcp_get_md5sig_pool();
1078 	if (!hp)
1079 		goto clear_hash_noput;
1080 	req = hp->md5_req;
1081 
1082 	if (crypto_ahash_init(req))
1083 		goto clear_hash;
1084 	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1085 		goto clear_hash;
1086 	if (tcp_md5_hash_key(hp, key))
1087 		goto clear_hash;
1088 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1089 	if (crypto_ahash_final(req))
1090 		goto clear_hash;
1091 
1092 	tcp_put_md5sig_pool();
1093 	return 0;
1094 
1095 clear_hash:
1096 	tcp_put_md5sig_pool();
1097 clear_hash_noput:
1098 	memset(md5_hash, 0, 16);
1099 	return 1;
1100 }
1101 
1102 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1103 			const struct sock *sk,
1104 			const struct sk_buff *skb)
1105 {
1106 	struct tcp_md5sig_pool *hp;
1107 	struct ahash_request *req;
1108 	const struct tcphdr *th = tcp_hdr(skb);
1109 	__be32 saddr, daddr;
1110 
1111 	if (sk) { /* valid for establish/request sockets */
1112 		saddr = sk->sk_rcv_saddr;
1113 		daddr = sk->sk_daddr;
1114 	} else {
1115 		const struct iphdr *iph = ip_hdr(skb);
1116 		saddr = iph->saddr;
1117 		daddr = iph->daddr;
1118 	}
1119 
1120 	hp = tcp_get_md5sig_pool();
1121 	if (!hp)
1122 		goto clear_hash_noput;
1123 	req = hp->md5_req;
1124 
1125 	if (crypto_ahash_init(req))
1126 		goto clear_hash;
1127 
1128 	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1129 		goto clear_hash;
1130 	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1131 		goto clear_hash;
1132 	if (tcp_md5_hash_key(hp, key))
1133 		goto clear_hash;
1134 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1135 	if (crypto_ahash_final(req))
1136 		goto clear_hash;
1137 
1138 	tcp_put_md5sig_pool();
1139 	return 0;
1140 
1141 clear_hash:
1142 	tcp_put_md5sig_pool();
1143 clear_hash_noput:
1144 	memset(md5_hash, 0, 16);
1145 	return 1;
1146 }
1147 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1148 
1149 #endif
1150 
1151 /* Called with rcu_read_lock() */
1152 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1153 				    const struct sk_buff *skb)
1154 {
1155 #ifdef CONFIG_TCP_MD5SIG
1156 	/*
1157 	 * This gets called for each TCP segment that arrives
1158 	 * so we want to be efficient.
1159 	 * We have 3 drop cases:
1160 	 * o No MD5 hash and one expected.
1161 	 * o MD5 hash and we're not expecting one.
1162 	 * o MD5 hash and it's wrong.
1163 	 */
1164 	const __u8 *hash_location = NULL;
1165 	struct tcp_md5sig_key *hash_expected;
1166 	const struct iphdr *iph = ip_hdr(skb);
1167 	const struct tcphdr *th = tcp_hdr(skb);
1168 	int genhash;
1169 	unsigned char newhash[16];
1170 
1171 	hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1172 					  AF_INET);
1173 	hash_location = tcp_parse_md5sig_option(th);
1174 
1175 	/* We've parsed the options - do we have a hash? */
1176 	if (!hash_expected && !hash_location)
1177 		return false;
1178 
1179 	if (hash_expected && !hash_location) {
1180 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1181 		return true;
1182 	}
1183 
1184 	if (!hash_expected && hash_location) {
1185 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1186 		return true;
1187 	}
1188 
1189 	/* Okay, so this is hash_expected and hash_location -
1190 	 * so we need to calculate the checksum.
1191 	 */
1192 	genhash = tcp_v4_md5_hash_skb(newhash,
1193 				      hash_expected,
1194 				      NULL, skb);
1195 
1196 	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1197 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1198 		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1199 				     &iph->saddr, ntohs(th->source),
1200 				     &iph->daddr, ntohs(th->dest),
1201 				     genhash ? " tcp_v4_calc_md5_hash failed"
1202 				     : "");
1203 		return true;
1204 	}
1205 	return false;
1206 #endif
1207 	return false;
1208 }
1209 
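/* Record the peer/local addresses and any IP options carried by the SYN in
 * the request sock; the saved options are used again when sending the
 * SYN-ACK and are inherited by the child socket.
 */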
1210 static void tcp_v4_init_req(struct request_sock *req,
1211 			    const struct sock *sk_listener,
1212 			    struct sk_buff *skb)
1213 {
1214 	struct inet_request_sock *ireq = inet_rsk(req);
1215 
1216 	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1217 	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1218 	ireq->opt = tcp_v4_save_options(skb);
1219 }
1220 
1221 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1222 					  struct flowi *fl,
1223 					  const struct request_sock *req)
1224 {
1225 	return inet_csk_route_req(sk, &fl->u.ip4, req);
1226 }
1227 
1228 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1229 	.family		=	PF_INET,
1230 	.obj_size	=	sizeof(struct tcp_request_sock),
1231 	.rtx_syn_ack	=	tcp_rtx_synack,
1232 	.send_ack	=	tcp_v4_reqsk_send_ack,
1233 	.destructor	=	tcp_v4_reqsk_destructor,
1234 	.send_reset	=	tcp_v4_send_reset,
1235 	.syn_ack_timeout =	tcp_syn_ack_timeout,
1236 };
1237 
1238 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1239 	.mss_clamp	=	TCP_MSS_DEFAULT,
1240 #ifdef CONFIG_TCP_MD5SIG
1241 	.req_md5_lookup	=	tcp_v4_md5_lookup,
1242 	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1243 #endif
1244 	.init_req	=	tcp_v4_init_req,
1245 #ifdef CONFIG_SYN_COOKIES
1246 	.cookie_init_seq =	cookie_v4_init_sequence,
1247 #endif
1248 	.route_req	=	tcp_v4_route_req,
1249 	.init_seq	=	tcp_v4_init_seq,
1250 	.init_ts_off	=	tcp_v4_init_ts_off,
1251 	.send_synack	=	tcp_v4_send_synack,
1252 };
1253 
1254 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1255 {
1256 	/* Never answer SYNs sent to broadcast or multicast */
1257 	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1258 		goto drop;
1259 
1260 	return tcp_conn_request(&tcp_request_sock_ops,
1261 				&tcp_request_sock_ipv4_ops, sk, skb);
1262 
1263 drop:
1264 	tcp_listendrop(sk);
1265 	return 0;
1266 }
1267 EXPORT_SYMBOL(tcp_v4_conn_request);
1268 
1269 
1270 /*
1271  * The three-way handshake has completed - we got a valid ACK -
1272  * now create the new socket.
1273  */
1274 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1275 				  struct request_sock *req,
1276 				  struct dst_entry *dst,
1277 				  struct request_sock *req_unhash,
1278 				  bool *own_req)
1279 {
1280 	struct inet_request_sock *ireq;
1281 	struct inet_sock *newinet;
1282 	struct tcp_sock *newtp;
1283 	struct sock *newsk;
1284 #ifdef CONFIG_TCP_MD5SIG
1285 	struct tcp_md5sig_key *key;
1286 #endif
1287 	struct ip_options_rcu *inet_opt;
1288 
1289 	if (sk_acceptq_is_full(sk))
1290 		goto exit_overflow;
1291 
1292 	newsk = tcp_create_openreq_child(sk, req, skb);
1293 	if (!newsk)
1294 		goto exit_nonewsk;
1295 
1296 	newsk->sk_gso_type = SKB_GSO_TCPV4;
1297 	inet_sk_rx_dst_set(newsk, skb);
1298 
1299 	newtp		      = tcp_sk(newsk);
1300 	newinet		      = inet_sk(newsk);
1301 	ireq		      = inet_rsk(req);
1302 	sk_daddr_set(newsk, ireq->ir_rmt_addr);
1303 	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1304 	newsk->sk_bound_dev_if = ireq->ir_iif;
1305 	newinet->inet_saddr	      = ireq->ir_loc_addr;
1306 	inet_opt	      = ireq->opt;
1307 	rcu_assign_pointer(newinet->inet_opt, inet_opt);
1308 	ireq->opt	      = NULL;
1309 	newinet->mc_index     = inet_iif(skb);
1310 	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1311 	newinet->rcv_tos      = ip_hdr(skb)->tos;
1312 	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1313 	if (inet_opt)
1314 		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1315 	newinet->inet_id = newtp->write_seq ^ jiffies;
1316 
1317 	if (!dst) {
1318 		dst = inet_csk_route_child_sock(sk, newsk, req);
1319 		if (!dst)
1320 			goto put_and_exit;
1321 	} else {
1322 		/* syncookie case : see end of cookie_v4_check() */
1323 	}
1324 	sk_setup_caps(newsk, dst);
1325 
1326 	tcp_ca_openreq_child(newsk, dst);
1327 
1328 	tcp_sync_mss(newsk, dst_mtu(dst));
1329 	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1330 
1331 	tcp_initialize_rcv_mss(newsk);
1332 
1333 #ifdef CONFIG_TCP_MD5SIG
1334 	/* Copy over the MD5 key from the original socket */
1335 	key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1336 				AF_INET);
1337 	if (key) {
1338 		/*
1339 		 * We're using one, so create a matching key
1340 		 * on the newsk structure. If we fail to get
1341 		 * memory, then we end up not copying the key
1342 		 * across. Shucks.
1343 		 */
1344 		tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1345 			       AF_INET, key->key, key->keylen, GFP_ATOMIC);
1346 		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1347 	}
1348 #endif
1349 
1350 	if (__inet_inherit_port(sk, newsk) < 0)
1351 		goto put_and_exit;
1352 	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
1353 	if (*own_req)
1354 		tcp_move_syn(newtp, req);
1355 
1356 	return newsk;
1357 
1358 exit_overflow:
1359 	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1360 exit_nonewsk:
1361 	dst_release(dst);
1362 exit:
1363 	tcp_listendrop(sk);
1364 	return NULL;
1365 put_and_exit:
1366 	inet_csk_prepare_forced_close(newsk);
1367 	tcp_done(newsk);
1368 	goto exit;
1369 }
1370 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1371 
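/* With SYN cookies enabled, a non-SYN segment arriving on a listener may be
 * the ACK that completes a cookie handshake; cookie_v4_check() then returns
 * either a newly created child socket, the unchanged listener, or NULL.
 */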
1372 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1373 {
1374 #ifdef CONFIG_SYN_COOKIES
1375 	const struct tcphdr *th = tcp_hdr(skb);
1376 
1377 	if (!th->syn)
1378 		sk = cookie_v4_check(sk, skb);
1379 #endif
1380 	return sk;
1381 }
1382 
1383 /* The socket must have its spinlock held when we get
1384  * here, unless it is a TCP_LISTEN socket.
1385  *
1386  * We have a potential double-lock case here, so even when
1387  * doing backlog processing we use the BH locking scheme.
1388  * This is because we cannot sleep with the original spinlock
1389  * held.
1390  */
1391 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1392 {
1393 	struct sock *rsk;
1394 
1395 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1396 		struct dst_entry *dst = sk->sk_rx_dst;
1397 
1398 		sock_rps_save_rxhash(sk, skb);
1399 		sk_mark_napi_id(sk, skb);
1400 		if (dst) {
1401 			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1402 			    !dst->ops->check(dst, 0)) {
1403 				dst_release(dst);
1404 				sk->sk_rx_dst = NULL;
1405 			}
1406 		}
1407 		tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len);
1408 		return 0;
1409 	}
1410 
1411 	if (tcp_checksum_complete(skb))
1412 		goto csum_err;
1413 
1414 	if (sk->sk_state == TCP_LISTEN) {
1415 		struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1416 
1417 		if (!nsk)
1418 			goto discard;
1419 		if (nsk != sk) {
1420 			if (tcp_child_process(sk, nsk, skb)) {
1421 				rsk = nsk;
1422 				goto reset;
1423 			}
1424 			return 0;
1425 		}
1426 	} else
1427 		sock_rps_save_rxhash(sk, skb);
1428 
1429 	if (tcp_rcv_state_process(sk, skb)) {
1430 		rsk = sk;
1431 		goto reset;
1432 	}
1433 	return 0;
1434 
1435 reset:
1436 	tcp_v4_send_reset(rsk, skb);
1437 discard:
1438 	kfree_skb(skb);
1439 	/* Be careful here. If this function gets more complicated and
1440 	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1441 	 * might be destroyed here. This current version compiles correctly,
1442 	 * but you have been warned.
1443 	 */
1444 	return 0;
1445 
1446 csum_err:
1447 	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1448 	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1449 	goto discard;
1450 }
1451 EXPORT_SYMBOL(tcp_v4_do_rcv);
1452 
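/* Early demux: look up an established socket directly from the IP receive
 * path, before routing. If the socket has a cached rx dst for the incoming
 * interface, reuse it and skip the per-packet route lookup.
 */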
1453 void tcp_v4_early_demux(struct sk_buff *skb)
1454 {
1455 	const struct iphdr *iph;
1456 	const struct tcphdr *th;
1457 	struct sock *sk;
1458 
1459 	if (skb->pkt_type != PACKET_HOST)
1460 		return;
1461 
1462 	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1463 		return;
1464 
1465 	iph = ip_hdr(skb);
1466 	th = tcp_hdr(skb);
1467 
1468 	if (th->doff < sizeof(struct tcphdr) / 4)
1469 		return;
1470 
1471 	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1472 				       iph->saddr, th->source,
1473 				       iph->daddr, ntohs(th->dest),
1474 				       skb->skb_iif);
1475 	if (sk) {
1476 		skb->sk = sk;
1477 		skb->destructor = sock_edemux;
1478 		if (sk_fullsock(sk)) {
1479 			struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1480 
1481 			if (dst)
1482 				dst = dst_check(dst, 0);
1483 			if (dst &&
1484 			    inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1485 				skb_dst_set_noref(skb, dst);
1486 		}
1487 	}
1488 }
1489 
1490 /* Packet is added to VJ-style prequeue for processing in process
1491  * context, if a reader task is waiting. Apparently, this exciting
1492  * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93)
1493  * failed somewhere. Latency? Burstiness? Well, at least now we will
1494  * see why it failed. 8)8)				  --ANK
1495  *
1496  */
1497 bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
1498 {
1499 	struct tcp_sock *tp = tcp_sk(sk);
1500 
1501 	if (sysctl_tcp_low_latency || !tp->ucopy.task)
1502 		return false;
1503 
1504 	if (skb->len <= tcp_hdrlen(skb) &&
1505 	    skb_queue_len(&tp->ucopy.prequeue) == 0)
1506 		return false;
1507 
1508 	/* Before escaping RCU protected region, we need to take care of skb
1509 	 * dst. Prequeue is only enabled for established sockets.
1510 	 * For such sockets, we might need the skb dst only to set sk->sk_rx_dst.
1511 	 * Instead of doing a full sk_rx_dst validity check here, let's perform
1512 	 * an optimistic check.
1513 	 */
1514 	if (likely(sk->sk_rx_dst))
1515 		skb_dst_drop(skb);
1516 	else
1517 		skb_dst_force_safe(skb);
1518 
1519 	__skb_queue_tail(&tp->ucopy.prequeue, skb);
1520 	tp->ucopy.memory += skb->truesize;
1521 	if (skb_queue_len(&tp->ucopy.prequeue) >= 32 ||
1522 	    tp->ucopy.memory + atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) {
1523 		struct sk_buff *skb1;
1524 
1525 		BUG_ON(sock_owned_by_user(sk));
1526 		__NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPPREQUEUEDROPPED,
1527 				skb_queue_len(&tp->ucopy.prequeue));
1528 
1529 		while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
1530 			sk_backlog_rcv(sk, skb1);
1531 
1532 		tp->ucopy.memory = 0;
1533 	} else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
1534 		wake_up_interruptible_sync_poll(sk_sleep(sk),
1535 					   POLLIN | POLLRDNORM | POLLRDBAND);
1536 		if (!inet_csk_ack_scheduled(sk))
1537 			inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
1538 						  (3 * tcp_rto_min(sk)) / 4,
1539 						  TCP_RTO_MAX);
1540 	}
1541 	return true;
1542 }
1543 EXPORT_SYMBOL(tcp_prequeue);
1544 
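/* Queue a segment on the socket backlog while the socket is owned by user
 * context; it will be processed when the owner releases the socket. Drop it
 * (counting TCPBACKLOGDROP) if the backlog would exceed the limit below.
 */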
1545 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1546 {
1547 	u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;
1548 
1549 	/* Only socket owner can try to collapse/prune rx queues
1550 	 * to reduce memory overhead, so add a little headroom here.
1551 	 * Only a few socket backlogs are likely to be non-empty concurrently.
1552 	 */
1553 	limit += 64*1024;
1554 
1555 	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1556 	 * we can fix skb->truesize to its real value to avoid future drops.
1557 	 * This is valid because skb is not yet charged to the socket.
1558 	 * It has been noticed that pure SACK packets were sometimes dropped
1559 	 * (if cooked by drivers without copybreak feature).
1560 	 */
1561 	skb_condense(skb);
1562 
1563 	if (unlikely(sk_add_backlog(sk, skb, limit))) {
1564 		bh_unlock_sock(sk);
1565 		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1566 		return true;
1567 	}
1568 	return false;
1569 }
1570 EXPORT_SYMBOL(tcp_add_backlog);
1571 
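/* Run the socket filter on a TCP segment without letting it truncate the
 * headers; if the filter trims payload, shrink end_seq by the same amount so
 * sequence accounting stays consistent.
 */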
1572 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1573 {
1574 	struct tcphdr *th = (struct tcphdr *)skb->data;
1575 	unsigned int eaten = skb->len;
1576 	int err;
1577 
1578 	err = sk_filter_trim_cap(sk, skb, th->doff * 4);
1579 	if (!err) {
1580 		eaten -= skb->len;
1581 		TCP_SKB_CB(skb)->end_seq -= eaten;
1582 	}
1583 	return err;
1584 }
1585 EXPORT_SYMBOL(tcp_filter);
1586 
1587 /*
1588  *	From tcp_input.c
1589  */
1590 
1591 int tcp_v4_rcv(struct sk_buff *skb)
1592 {
1593 	struct net *net = dev_net(skb->dev);
1594 	const struct iphdr *iph;
1595 	const struct tcphdr *th;
1596 	bool refcounted;
1597 	struct sock *sk;
1598 	int ret;
1599 
1600 	if (skb->pkt_type != PACKET_HOST)
1601 		goto discard_it;
1602 
1603 	/* Count it even if it's bad */
1604 	__TCP_INC_STATS(net, TCP_MIB_INSEGS);
1605 
1606 	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1607 		goto discard_it;
1608 
1609 	th = (const struct tcphdr *)skb->data;
1610 
1611 	if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1612 		goto bad_packet;
1613 	if (!pskb_may_pull(skb, th->doff * 4))
1614 		goto discard_it;
1615 
1616 	/* An explanation is required here, I think.
1617 	 * Packet length and doff are validated by header prediction,
1618 	 * provided the case of th->doff==0 is eliminated.
1619 	 * So, we defer the checks. */
1620 
1621 	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1622 		goto csum_error;
1623 
1624 	th = (const struct tcphdr *)skb->data;
1625 	iph = ip_hdr(skb);
1626 	/* This is tricky: we move the IPCB to its correct location inside TCP_SKB_CB().
1627 	 * barrier() makes sure the compiler won't play fool^Waliasing games.
1628 	 */
1629 	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1630 		sizeof(struct inet_skb_parm));
1631 	barrier();
1632 
1633 	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1634 	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1635 				    skb->len - th->doff * 4);
1636 	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1637 	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1638 	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1639 	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1640 	TCP_SKB_CB(skb)->sacked	 = 0;
1641 
1642 lookup:
1643 	sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1644 			       th->dest, &refcounted);
1645 	if (!sk)
1646 		goto no_tcp_socket;
1647 
1648 process:
1649 	if (sk->sk_state == TCP_TIME_WAIT)
1650 		goto do_time_wait;
1651 
1652 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
1653 		struct request_sock *req = inet_reqsk(sk);
1654 		struct sock *nsk;
1655 
1656 		sk = req->rsk_listener;
1657 		if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
1658 			sk_drops_add(sk, skb);
1659 			reqsk_put(req);
1660 			goto discard_it;
1661 		}
1662 		if (unlikely(sk->sk_state != TCP_LISTEN)) {
1663 			inet_csk_reqsk_queue_drop_and_put(sk, req);
1664 			goto lookup;
1665 		}
1666 		/* We own a reference on the listener, increase it again
1667 		 * as we might lose it too soon.
1668 		 */
1669 		sock_hold(sk);
1670 		refcounted = true;
1671 		nsk = tcp_check_req(sk, skb, req, false);
1672 		if (!nsk) {
1673 			reqsk_put(req);
1674 			goto discard_and_relse;
1675 		}
1676 		if (nsk == sk) {
1677 			reqsk_put(req);
1678 		} else if (tcp_child_process(sk, nsk, skb)) {
1679 			tcp_v4_send_reset(nsk, skb);
1680 			goto discard_and_relse;
1681 		} else {
1682 			sock_put(sk);
1683 			return 0;
1684 		}
1685 	}
1686 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1687 		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
1688 		goto discard_and_relse;
1689 	}
1690 
1691 	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1692 		goto discard_and_relse;
1693 
1694 	if (tcp_v4_inbound_md5_hash(sk, skb))
1695 		goto discard_and_relse;
1696 
1697 	nf_reset(skb);
1698 
1699 	if (tcp_filter(sk, skb))
1700 		goto discard_and_relse;
1701 	th = (const struct tcphdr *)skb->data;
1702 	iph = ip_hdr(skb);
1703 
1704 	skb->dev = NULL;
1705 
1706 	if (sk->sk_state == TCP_LISTEN) {
1707 		ret = tcp_v4_do_rcv(sk, skb);
1708 		goto put_and_return;
1709 	}
1710 
1711 	sk_incoming_cpu_update(sk);
1712 
1713 	bh_lock_sock_nested(sk);
1714 	tcp_segs_in(tcp_sk(sk), skb);
1715 	ret = 0;
1716 	if (!sock_owned_by_user(sk)) {
1717 		if (!tcp_prequeue(sk, skb))
1718 			ret = tcp_v4_do_rcv(sk, skb);
1719 	} else if (tcp_add_backlog(sk, skb)) {
1720 		goto discard_and_relse;
1721 	}
1722 	bh_unlock_sock(sk);
1723 
1724 put_and_return:
1725 	if (refcounted)
1726 		sock_put(sk);
1727 
1728 	return ret;
1729 
1730 no_tcp_socket:
1731 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1732 		goto discard_it;
1733 
1734 	if (tcp_checksum_complete(skb)) {
1735 csum_error:
1736 		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
1737 bad_packet:
1738 		__TCP_INC_STATS(net, TCP_MIB_INERRS);
1739 	} else {
1740 		tcp_v4_send_reset(NULL, skb);
1741 	}
1742 
1743 discard_it:
1744 	/* Discard frame. */
1745 	kfree_skb(skb);
1746 	return 0;
1747 
1748 discard_and_relse:
1749 	sk_drops_add(sk, skb);
1750 	if (refcounted)
1751 		sock_put(sk);
1752 	goto discard_it;
1753 
1754 do_time_wait:
1755 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1756 		inet_twsk_put(inet_twsk(sk));
1757 		goto discard_it;
1758 	}
1759 
1760 	if (tcp_checksum_complete(skb)) {
1761 		inet_twsk_put(inet_twsk(sk));
1762 		goto csum_error;
1763 	}
1764 	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1765 	case TCP_TW_SYN: {
1766 		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1767 							&tcp_hashinfo, skb,
1768 							__tcp_hdrlen(th),
1769 							iph->saddr, th->source,
1770 							iph->daddr, th->dest,
1771 							inet_iif(skb));
1772 		if (sk2) {
1773 			inet_twsk_deschedule_put(inet_twsk(sk));
1774 			sk = sk2;
1775 			refcounted = false;
1776 			goto process;
1777 		}
1778 		/* Fall through to ACK */
1779 	}
1780 	case TCP_TW_ACK:
1781 		tcp_v4_timewait_ack(sk, skb);
1782 		break;
1783 	case TCP_TW_RST:
1784 		tcp_v4_send_reset(sk, skb);
1785 		inet_twsk_deschedule_put(inet_twsk(sk));
1786 		goto discard_it;
1787 	case TCP_TW_SUCCESS:;
1788 	}
1789 	goto discard_it;
1790 }
1791 
1792 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1793 	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
1794 	.twsk_unique	= tcp_twsk_unique,
1795 	.twsk_destructor= tcp_twsk_destructor,
1796 };
1797 
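/* Cache the input route of a received skb on the socket, together with the
 * incoming interface index, so later packets on the established fast path
 * can reuse it instead of doing a route lookup.
 */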
1798 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
1799 {
1800 	struct dst_entry *dst = skb_dst(skb);
1801 
1802 	if (dst && dst_hold_safe(dst)) {
1803 		sk->sk_rx_dst = dst;
1804 		inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
1805 	}
1806 }
1807 EXPORT_SYMBOL(inet_sk_rx_dst_set);
1808 
1809 const struct inet_connection_sock_af_ops ipv4_specific = {
1810 	.queue_xmit	   = ip_queue_xmit,
1811 	.send_check	   = tcp_v4_send_check,
1812 	.rebuild_header	   = inet_sk_rebuild_header,
1813 	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
1814 	.conn_request	   = tcp_v4_conn_request,
1815 	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
1816 	.net_header_len	   = sizeof(struct iphdr),
1817 	.setsockopt	   = ip_setsockopt,
1818 	.getsockopt	   = ip_getsockopt,
1819 	.addr2sockaddr	   = inet_csk_addr2sockaddr,
1820 	.sockaddr_len	   = sizeof(struct sockaddr_in),
1821 #ifdef CONFIG_COMPAT
1822 	.compat_setsockopt = compat_ip_setsockopt,
1823 	.compat_getsockopt = compat_ip_getsockopt,
1824 #endif
1825 	.mtu_reduced	   = tcp_v4_mtu_reduced,
1826 };
1827 EXPORT_SYMBOL(ipv4_specific);
1828 
1829 #ifdef CONFIG_TCP_MD5SIG
1830 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1831 	.md5_lookup		= tcp_v4_md5_lookup,
1832 	.calc_md5_hash		= tcp_v4_md5_hash_skb,
1833 	.md5_parse		= tcp_v4_parse_md5_keys,
1834 };
1835 #endif
1836 
1837 /* NOTE: A lot of fields are already set to zero by sk_alloc(),
1838  *       so they need not be initialized here.
1839  */
1840 static int tcp_v4_init_sock(struct sock *sk)
1841 {
1842 	struct inet_connection_sock *icsk = inet_csk(sk);
1843 
1844 	tcp_init_sock(sk);
1845 
1846 	icsk->icsk_af_ops = &ipv4_specific;
1847 
1848 #ifdef CONFIG_TCP_MD5SIG
1849 	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
1850 #endif
1851 
1852 	return 0;
1853 }
1854 
1855 void tcp_v4_destroy_sock(struct sock *sk)
1856 {
1857 	struct tcp_sock *tp = tcp_sk(sk);
1858 
1859 	tcp_clear_xmit_timers(sk);
1860 
1861 	tcp_cleanup_congestion_control(sk);
1862 
1863 	/* Clean up the write buffer. */
1864 	tcp_write_queue_purge(sk);
1865 
1866 	/* Check if we want to disable active TFO */
1867 	tcp_fastopen_active_disable_ofo_check(sk);
1868 
1869 	/* Clean up our (hopefully empty) out_of_order_queue. */
1870 	skb_rbtree_purge(&tp->out_of_order_queue);
1871 
1872 #ifdef CONFIG_TCP_MD5SIG
1873 	/* Clean up the MD5 key list, if any */
1874 	if (tp->md5sig_info) {
1875 		tcp_clear_md5_list(sk);
1876 		kfree_rcu(tp->md5sig_info, rcu);
1877 		tp->md5sig_info = NULL;
1878 	}
1879 #endif
1880 
1881 	/* Clean up the prequeue; it really should be empty already. */
1882 	__skb_queue_purge(&tp->ucopy.prequeue);
1883 
1884 	/* Clean up a referenced TCP bind bucket. */
1885 	if (inet_csk(sk)->icsk_bind_hash)
1886 		inet_put_port(sk);
1887 
1888 	BUG_ON(tp->fastopen_rsk);
1889 
1890 	/* In case the socket was aborted during a connect() operation. */
1891 	tcp_free_fastopen_req(tp);
1892 	tcp_saved_syn_free(tp);
1893 
1894 	sk_sockets_allocated_dec(sk);
1895 }
1896 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1897 
1898 #ifdef CONFIG_PROC_FS
1899 /* Proc filesystem TCP sock list dumping. */
1900 
1901 /*
1902  * Get the next listening socket after cur.  If cur is NULL, get the first
1903  * socket, starting from the bucket given in st->bucket; when st->bucket is
1904  * zero, the very first socket in the hash table is returned.
1905  */
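/*
 * The listening bucket lock (ilb->lock) is taken here and left held when a
 * socket is returned; it is released either when advancing to the next
 * bucket or later in tcp_seq_stop().
 */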
1906 static void *listening_get_next(struct seq_file *seq, void *cur)
1907 {
1908 	struct tcp_iter_state *st = seq->private;
1909 	struct net *net = seq_file_net(seq);
1910 	struct inet_listen_hashbucket *ilb;
1911 	struct sock *sk = cur;
1912 
1913 	if (!sk) {
1914 get_head:
1915 		ilb = &tcp_hashinfo.listening_hash[st->bucket];
1916 		spin_lock(&ilb->lock);
1917 		sk = sk_head(&ilb->head);
1918 		st->offset = 0;
1919 		goto get_sk;
1920 	}
1921 	ilb = &tcp_hashinfo.listening_hash[st->bucket];
1922 	++st->num;
1923 	++st->offset;
1924 
1925 	sk = sk_next(sk);
1926 get_sk:
1927 	sk_for_each_from(sk) {
1928 		if (!net_eq(sock_net(sk), net))
1929 			continue;
1930 		if (sk->sk_family == st->family)
1931 			return sk;
1932 	}
1933 	spin_unlock(&ilb->lock);
1934 	st->offset = 0;
1935 	if (++st->bucket < INET_LHTABLE_SIZE)
1936 		goto get_head;
1937 	return NULL;
1938 }
1939 
1940 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1941 {
1942 	struct tcp_iter_state *st = seq->private;
1943 	void *rc;
1944 
1945 	st->bucket = 0;
1946 	st->offset = 0;
1947 	rc = listening_get_next(seq, NULL);
1948 
1949 	while (rc && *pos) {
1950 		rc = listening_get_next(seq, rc);
1951 		--*pos;
1952 	}
1953 	return rc;
1954 }
1955 
1956 static inline bool empty_bucket(const struct tcp_iter_state *st)
1957 {
1958 	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
1959 }
1960 
1961 /*
1962  * Get first established socket starting from bucket given in st->bucket.
1963  * If st->bucket is zero, the very first socket in the hash is returned.
1964  */
1965 static void *established_get_first(struct seq_file *seq)
1966 {
1967 	struct tcp_iter_state *st = seq->private;
1968 	struct net *net = seq_file_net(seq);
1969 	void *rc = NULL;
1970 
1971 	st->offset = 0;
1972 	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
1973 		struct sock *sk;
1974 		struct hlist_nulls_node *node;
1975 		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
1976 
1977 		/* Lockless fast path for the common case of empty buckets */
1978 		if (empty_bucket(st))
1979 			continue;
1980 
1981 		spin_lock_bh(lock);
1982 		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
1983 			if (sk->sk_family != st->family ||
1984 			    !net_eq(sock_net(sk), net)) {
1985 				continue;
1986 			}
1987 			rc = sk;
1988 			goto out;
1989 		}
1990 		spin_unlock_bh(lock);
1991 	}
1992 out:
1993 	return rc;
1994 }
1995 
1996 static void *established_get_next(struct seq_file *seq, void *cur)
1997 {
1998 	struct sock *sk = cur;
1999 	struct hlist_nulls_node *node;
2000 	struct tcp_iter_state *st = seq->private;
2001 	struct net *net = seq_file_net(seq);
2002 
2003 	++st->num;
2004 	++st->offset;
2005 
2006 	sk = sk_nulls_next(sk);
2007 
2008 	sk_nulls_for_each_from(sk, node) {
2009 		if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2010 			return sk;
2011 	}
2012 
2013 	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2014 	++st->bucket;
2015 	return established_get_first(seq);
2016 }
2017 
2018 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2019 {
2020 	struct tcp_iter_state *st = seq->private;
2021 	void *rc;
2022 
2023 	st->bucket = 0;
2024 	rc = established_get_first(seq);
2025 
2026 	while (rc && pos) {
2027 		rc = established_get_next(seq, rc);
2028 		--pos;
2029 	}
2030 	return rc;
2031 }
2032 
2033 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2034 {
2035 	void *rc;
2036 	struct tcp_iter_state *st = seq->private;
2037 
2038 	st->state = TCP_SEQ_STATE_LISTENING;
2039 	rc	  = listening_get_idx(seq, &pos);
2040 
2041 	if (!rc) {
2042 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2043 		rc	  = established_get_idx(seq, pos);
2044 	}
2045 
2046 	return rc;
2047 }
2048 
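/*
 * Resume iteration at the position saved by the previous read: rewalk the
 * saved bucket and skip st->offset entries, so a partial read of the seq
 * file does not restart the scan from the first bucket.
 */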
2049 static void *tcp_seek_last_pos(struct seq_file *seq)
2050 {
2051 	struct tcp_iter_state *st = seq->private;
2052 	int offset = st->offset;
2053 	int orig_num = st->num;
2054 	void *rc = NULL;
2055 
2056 	switch (st->state) {
2057 	case TCP_SEQ_STATE_LISTENING:
2058 		if (st->bucket >= INET_LHTABLE_SIZE)
2059 			break;
2060 		st->state = TCP_SEQ_STATE_LISTENING;
2061 		rc = listening_get_next(seq, NULL);
2062 		while (offset-- && rc)
2063 			rc = listening_get_next(seq, rc);
2064 		if (rc)
2065 			break;
2066 		st->bucket = 0;
2067 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2068 		/* Fallthrough */
2069 	case TCP_SEQ_STATE_ESTABLISHED:
2070 		if (st->bucket > tcp_hashinfo.ehash_mask)
2071 			break;
2072 		rc = established_get_first(seq);
2073 		while (offset-- && rc)
2074 			rc = established_get_next(seq, rc);
2075 	}
2076 
2077 	st->num = orig_num;
2078 
2079 	return rc;
2080 }
2081 
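/*
 * seq_file glue: for each read() the core calls start(), then show()/next()
 * repeatedly, then stop().  st->last_pos lets the next read() pick up where
 * the previous one stopped instead of reseeking from scratch.
 */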
2082 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2083 {
2084 	struct tcp_iter_state *st = seq->private;
2085 	void *rc;
2086 
2087 	if (*pos && *pos == st->last_pos) {
2088 		rc = tcp_seek_last_pos(seq);
2089 		if (rc)
2090 			goto out;
2091 	}
2092 
2093 	st->state = TCP_SEQ_STATE_LISTENING;
2094 	st->num = 0;
2095 	st->bucket = 0;
2096 	st->offset = 0;
2097 	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2098 
2099 out:
2100 	st->last_pos = *pos;
2101 	return rc;
2102 }
2103 
2104 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2105 {
2106 	struct tcp_iter_state *st = seq->private;
2107 	void *rc = NULL;
2108 
2109 	if (v == SEQ_START_TOKEN) {
2110 		rc = tcp_get_idx(seq, 0);
2111 		goto out;
2112 	}
2113 
2114 	switch (st->state) {
2115 	case TCP_SEQ_STATE_LISTENING:
2116 		rc = listening_get_next(seq, v);
2117 		if (!rc) {
2118 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2119 			st->bucket = 0;
2120 			st->offset = 0;
2121 			rc	  = established_get_first(seq);
2122 		}
2123 		break;
2124 	case TCP_SEQ_STATE_ESTABLISHED:
2125 		rc = established_get_next(seq, v);
2126 		break;
2127 	}
2128 out:
2129 	++*pos;
2130 	st->last_pos = *pos;
2131 	return rc;
2132 }
2133 
2134 static void tcp_seq_stop(struct seq_file *seq, void *v)
2135 {
2136 	struct tcp_iter_state *st = seq->private;
2137 
2138 	switch (st->state) {
2139 	case TCP_SEQ_STATE_LISTENING:
2140 		if (v != SEQ_START_TOKEN)
2141 			spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
2142 		break;
2143 	case TCP_SEQ_STATE_ESTABLISHED:
2144 		if (v)
2145 			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2146 		break;
2147 	}
2148 }
2149 
2150 int tcp_seq_open(struct inode *inode, struct file *file)
2151 {
2152 	struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
2153 	struct tcp_iter_state *s;
2154 	int err;
2155 
2156 	err = seq_open_net(inode, file, &afinfo->seq_ops,
2157 			  sizeof(struct tcp_iter_state));
2158 	if (err < 0)
2159 		return err;
2160 
2161 	s = ((struct seq_file *)file->private_data)->private;
2162 	s->family		= afinfo->family;
2163 	s->last_pos		= 0;
2164 	return 0;
2165 }
2166 EXPORT_SYMBOL(tcp_seq_open);
2167 
2168 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2169 {
2170 	int rc = 0;
2171 	struct proc_dir_entry *p;
2172 
2173 	afinfo->seq_ops.start		= tcp_seq_start;
2174 	afinfo->seq_ops.next		= tcp_seq_next;
2175 	afinfo->seq_ops.stop		= tcp_seq_stop;
2176 
2177 	p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2178 			     afinfo->seq_fops, afinfo);
2179 	if (!p)
2180 		rc = -ENOMEM;
2181 	return rc;
2182 }
2183 EXPORT_SYMBOL(tcp_proc_register);
2184 
2185 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2186 {
2187 	remove_proc_entry(afinfo->name, net->proc_net);
2188 }
2189 EXPORT_SYMBOL(tcp_proc_unregister);
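
/*
 * tcp_proc_register()/tcp_proc_unregister() are the per-family registration
 * interface: tcp4_seq_afinfo below creates /proc/net/tcp, and tcp_ipv6.c
 * registers a "tcp6" instance the same way.  A minimal additional user might
 * look roughly like this (the tcpX names are illustrative only, not from
 * this file):
 *
 *	static struct tcp_seq_afinfo tcpX_seq_afinfo = {
 *		.name		= "tcpX",
 *		.family		= AF_INET6,
 *		.seq_fops	= &tcpX_afinfo_seq_fops,
 *		.seq_ops	= { .show = tcpX_seq_show },
 *	};
 *
 *	static int __net_init tcpX_proc_init_net(struct net *net)
 *	{
 *		return tcp_proc_register(net, &tcpX_seq_afinfo);
 *	}
 */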
2190 
2191 static void get_openreq4(const struct request_sock *req,
2192 			 struct seq_file *f, int i)
2193 {
2194 	const struct inet_request_sock *ireq = inet_rsk(req);
2195 	long delta = req->rsk_timer.expires - jiffies;
2196 
2197 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2198 		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2199 		i,
2200 		ireq->ir_loc_addr,
2201 		ireq->ir_num,
2202 		ireq->ir_rmt_addr,
2203 		ntohs(ireq->ir_rmt_port),
2204 		TCP_SYN_RECV,
2205 		0, 0, /* could print option size, but that is af dependent. */
2206 		1,    /* timers active (only the expire timer) */
2207 		jiffies_delta_to_clock_t(delta),
2208 		req->num_timeout,
2209 		from_kuid_munged(seq_user_ns(f),
2210 				 sock_i_uid(req->rsk_listener)),
2211 		0,  /* non standard timer */
2212 		0, /* open_requests have no inode */
2213 		0,
2214 		req);
2215 }
2216 
2217 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2218 {
2219 	int timer_active;
2220 	unsigned long timer_expires;
2221 	const struct tcp_sock *tp = tcp_sk(sk);
2222 	const struct inet_connection_sock *icsk = inet_csk(sk);
2223 	const struct inet_sock *inet = inet_sk(sk);
2224 	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2225 	__be32 dest = inet->inet_daddr;
2226 	__be32 src = inet->inet_rcv_saddr;
2227 	__u16 destp = ntohs(inet->inet_dport);
2228 	__u16 srcp = ntohs(inet->inet_sport);
2229 	int rx_queue;
2230 	int state;
2231 
2232 	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2233 	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2234 	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2235 		timer_active	= 1;
2236 		timer_expires	= icsk->icsk_timeout;
2237 	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2238 		timer_active	= 4;
2239 		timer_expires	= icsk->icsk_timeout;
2240 	} else if (timer_pending(&sk->sk_timer)) {
2241 		timer_active	= 2;
2242 		timer_expires	= sk->sk_timer.expires;
2243 	} else {
2244 		timer_active	= 0;
2245 		timer_expires = jiffies;
2246 	}
2247 
2248 	state = sk_state_load(sk);
2249 	if (state == TCP_LISTEN)
2250 		rx_queue = sk->sk_ack_backlog;
2251 	else
2252 		/* Because we don't lock the socket,
2253 		 * we might find a transient negative value.
2254 		 */
2255 		rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2256 
2257 	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2258 			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2259 		i, src, srcp, dest, destp, state,
2260 		tp->write_seq - tp->snd_una,
2261 		rx_queue,
2262 		timer_active,
2263 		jiffies_delta_to_clock_t(timer_expires - jiffies),
2264 		icsk->icsk_retransmits,
2265 		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2266 		icsk->icsk_probes_out,
2267 		sock_i_ino(sk),
2268 		atomic_read(&sk->sk_refcnt), sk,
2269 		jiffies_to_clock_t(icsk->icsk_rto),
2270 		jiffies_to_clock_t(icsk->icsk_ack.ato),
2271 		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2272 		tp->snd_cwnd,
2273 		state == TCP_LISTEN ?
2274 		    fastopenq->max_qlen :
2275 		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2276 }
2277 
2278 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2279 			       struct seq_file *f, int i)
2280 {
2281 	long delta = tw->tw_timer.expires - jiffies;
2282 	__be32 dest, src;
2283 	__u16 destp, srcp;
2284 
2285 	dest  = tw->tw_daddr;
2286 	src   = tw->tw_rcv_saddr;
2287 	destp = ntohs(tw->tw_dport);
2288 	srcp  = ntohs(tw->tw_sport);
2289 
2290 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2291 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2292 		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2293 		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2294 		atomic_read(&tw->tw_refcnt), tw);
2295 }
2296 
2297 #define TMPSZ 150
2298 
2299 static int tcp4_seq_show(struct seq_file *seq, void *v)
2300 {
2301 	struct tcp_iter_state *st;
2302 	struct sock *sk = v;
2303 
2304 	seq_setwidth(seq, TMPSZ - 1);
2305 	if (v == SEQ_START_TOKEN) {
2306 		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2307 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2308 			   "inode");
2309 		goto out;
2310 	}
2311 	st = seq->private;
2312 
2313 	if (sk->sk_state == TCP_TIME_WAIT)
2314 		get_timewait4_sock(v, seq, st->num);
2315 	else if (sk->sk_state == TCP_NEW_SYN_RECV)
2316 		get_openreq4(v, seq, st->num);
2317 	else
2318 		get_tcp4_sock(v, seq, st->num);
2319 out:
2320 	seq_pad(seq, '\n');
2321 	return 0;
2322 }
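
/*
 * Resulting line format (illustrative values): on a little-endian host,
 * "   0: 0100007F:0016 00000000:0000 0A ..." is slot 0, local address
 * 127.0.0.1 port 0x16 (22), a wildcard remote address, and state 0x0A
 * (TCP_LISTEN).  Addresses are printed as the raw 32-bit value, so they
 * appear byte-swapped relative to dotted-quad notation on little-endian
 * machines, while ports are converted with ntohs() before printing.
 */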
2323 
2324 static const struct file_operations tcp_afinfo_seq_fops = {
2325 	.owner   = THIS_MODULE,
2326 	.open    = tcp_seq_open,
2327 	.read    = seq_read,
2328 	.llseek  = seq_lseek,
2329 	.release = seq_release_net
2330 };
2331 
2332 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2333 	.name		= "tcp",
2334 	.family		= AF_INET,
2335 	.seq_fops	= &tcp_afinfo_seq_fops,
2336 	.seq_ops	= {
2337 		.show		= tcp4_seq_show,
2338 	},
2339 };
2340 
2341 static int __net_init tcp4_proc_init_net(struct net *net)
2342 {
2343 	return tcp_proc_register(net, &tcp4_seq_afinfo);
2344 }
2345 
2346 static void __net_exit tcp4_proc_exit_net(struct net *net)
2347 {
2348 	tcp_proc_unregister(net, &tcp4_seq_afinfo);
2349 }
2350 
2351 static struct pernet_operations tcp4_net_ops = {
2352 	.init = tcp4_proc_init_net,
2353 	.exit = tcp4_proc_exit_net,
2354 };
2355 
2356 int __init tcp4_proc_init(void)
2357 {
2358 	return register_pernet_subsys(&tcp4_net_ops);
2359 }
2360 
2361 void tcp4_proc_exit(void)
2362 {
2363 	unregister_pernet_subsys(&tcp4_net_ops);
2364 }
2365 #endif /* CONFIG_PROC_FS */
2366 
2367 struct proto tcp_prot = {
2368 	.name			= "TCP",
2369 	.owner			= THIS_MODULE,
2370 	.close			= tcp_close,
2371 	.connect		= tcp_v4_connect,
2372 	.disconnect		= tcp_disconnect,
2373 	.accept			= inet_csk_accept,
2374 	.ioctl			= tcp_ioctl,
2375 	.init			= tcp_v4_init_sock,
2376 	.destroy		= tcp_v4_destroy_sock,
2377 	.shutdown		= tcp_shutdown,
2378 	.setsockopt		= tcp_setsockopt,
2379 	.getsockopt		= tcp_getsockopt,
2380 	.keepalive		= tcp_set_keepalive,
2381 	.recvmsg		= tcp_recvmsg,
2382 	.sendmsg		= tcp_sendmsg,
2383 	.sendpage		= tcp_sendpage,
2384 	.backlog_rcv		= tcp_v4_do_rcv,
2385 	.release_cb		= tcp_release_cb,
2386 	.hash			= inet_hash,
2387 	.unhash			= inet_unhash,
2388 	.get_port		= inet_csk_get_port,
2389 	.enter_memory_pressure	= tcp_enter_memory_pressure,
2390 	.stream_memory_free	= tcp_stream_memory_free,
2391 	.sockets_allocated	= &tcp_sockets_allocated,
2392 	.orphan_count		= &tcp_orphan_count,
2393 	.memory_allocated	= &tcp_memory_allocated,
2394 	.memory_pressure	= &tcp_memory_pressure,
2395 	.sysctl_mem		= sysctl_tcp_mem,
2396 	.sysctl_wmem		= sysctl_tcp_wmem,
2397 	.sysctl_rmem		= sysctl_tcp_rmem,
2398 	.max_header		= MAX_TCP_HEADER,
2399 	.obj_size		= sizeof(struct tcp_sock),
2400 	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
2401 	.twsk_prot		= &tcp_timewait_sock_ops,
2402 	.rsk_prot		= &tcp_request_sock_ops,
2403 	.h.hashinfo		= &tcp_hashinfo,
2404 	.no_autobind		= true,
2405 #ifdef CONFIG_COMPAT
2406 	.compat_setsockopt	= compat_tcp_setsockopt,
2407 	.compat_getsockopt	= compat_tcp_getsockopt,
2408 #endif
2409 	.diag_destroy		= tcp_abort,
2410 };
2411 EXPORT_SYMBOL(tcp_prot);
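
/*
 * tcp_prot is attached to AF_INET/SOCK_STREAM sockets by af_inet.c, which
 * pairs it with inet_stream_ops in its inetsw_array, roughly:
 *
 *	{
 *		.type     = SOCK_STREAM,
 *		.protocol = IPPROTO_TCP,
 *		.prot     = &tcp_prot,
 *		.ops      = &inet_stream_ops,
 *		.flags    = INET_PROTOSW_PERMANENT | INET_PROTOSW_ICSK,
 *	},
 */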
2412 
2413 static void __net_exit tcp_sk_exit(struct net *net)
2414 {
2415 	int cpu;
2416 
2417 	for_each_possible_cpu(cpu)
2418 		inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2419 	free_percpu(net->ipv4.tcp_sk);
2420 }
2421 
2422 static int __net_init tcp_sk_init(struct net *net)
2423 {
2424 	int res, cpu, cnt;
2425 
2426 	net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2427 	if (!net->ipv4.tcp_sk)
2428 		return -ENOMEM;
2429 
2430 	for_each_possible_cpu(cpu) {
2431 		struct sock *sk;
2432 
2433 		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2434 					   IPPROTO_TCP, net);
2435 		if (res)
2436 			goto fail;
2437 		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2438 		*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2439 	}
2440 
2441 	net->ipv4.sysctl_tcp_ecn = 2;
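	/* Per-namespace defaults for the net.ipv4.tcp_* sysctls; the knobs
	 * themselves are registered in sysctl_net_ipv4.c and can be tuned
	 * independently in each network namespace (e.g.
	 * net.ipv4.tcp_syncookies, net.ipv4.tcp_fin_timeout).
	 */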
2442 	net->ipv4.sysctl_tcp_ecn_fallback = 1;
2443 
2444 	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2445 	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2446 	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2447 
2448 	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2449 	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2450 	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2451 
2452 	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2453 	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2454 	net->ipv4.sysctl_tcp_syncookies = 1;
2455 	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2456 	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2457 	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2458 	net->ipv4.sysctl_tcp_orphan_retries = 0;
2459 	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2460 	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
2461 	net->ipv4.sysctl_tcp_tw_reuse = 0;
2462 
2463 	cnt = tcp_hashinfo.ehash_mask + 1;
2464 	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = (cnt + 1) / 2;
2465 	net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
2466 
2467 	net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 256);
2468 	net->ipv4.sysctl_tcp_sack = 1;
2469 
2470 	return 0;
2471 fail:
2472 	tcp_sk_exit(net);
2473 
2474 	return res;
2475 }
2476 
2477 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2478 {
2479 	inet_twsk_purge(&tcp_hashinfo, AF_INET);
2480 }
2481 
2482 static struct pernet_operations __net_initdata tcp_sk_ops = {
2483 	.init	    = tcp_sk_init,
2484 	.exit	    = tcp_sk_exit,
2485 	.exit_batch = tcp_sk_exit_batch,
2486 };
2487 
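/* Called from inet_init() while the IPv4 stack is brought up; a failure
 * here is fatal, hence the panic().
 */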
2488 void __init tcp_v4_init(void)
2489 {
2490 	if (register_pernet_subsys(&tcp_sk_ops))
2491 		panic("Failed to create the TCP control socket.\n");
2492 }
2493