xref: /openbmc/linux/net/ipv4/tcp_ipv4.c (revision e639c869)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		Implementation of the Transmission Control Protocol(TCP).
7  *
8  *		IPv4 specific functions
9  *
10  *
11  *		code split from:
12  *		linux/ipv4/tcp.c
13  *		linux/ipv4/tcp_input.c
14  *		linux/ipv4/tcp_output.c
15  *
16  *		See tcp.c for author information
17  *
18  *	This program is free software; you can redistribute it and/or
19  *      modify it under the terms of the GNU General Public License
20  *      as published by the Free Software Foundation; either version
21  *      2 of the License, or (at your option) any later version.
22  */
23 
24 /*
25  * Changes:
26  *		David S. Miller	:	New socket lookup architecture.
27  *					This code is dedicated to John Dyson.
28  *		David S. Miller :	Change semantics of established hash,
29  *					half is devoted to TIME_WAIT sockets
30  *					and the rest go in the other half.
31  *		Andi Kleen :		Add support for syncookies and fixed
32  *					some bugs: ip options weren't passed to
33  *					the TCP layer, missed a check for an
34  *					ACK bit.
35  *		Andi Kleen :		Implemented fast path mtu discovery.
36  *	     				Fixed many serious bugs in the
37  *					request_sock handling and moved
38  *					most of it into the af independent code.
39  *					Added tail drop and some other bugfixes.
40  *					Added new listen semantics.
41  *		Mike McLagan	:	Routing by source
42  *	Juan Jose Ciarlante:		ip_dynaddr bits
43  *		Andi Kleen:		various fixes.
44  *	Vitaly E. Lavrov	:	Transparent proxy revived after year
45  *					coma.
46  *	Andi Kleen		:	Fix new listen.
47  *	Andi Kleen		:	Fix accept error reporting.
48  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
49  *	Alexey Kuznetsov		allows both IPv4 and IPv6 sockets to bind
50  *					a single port at the same time.
51  */
52 
53 #define pr_fmt(fmt) "TCP: " fmt
54 
55 #include <linux/bottom_half.h>
56 #include <linux/types.h>
57 #include <linux/fcntl.h>
58 #include <linux/module.h>
59 #include <linux/random.h>
60 #include <linux/cache.h>
61 #include <linux/jhash.h>
62 #include <linux/init.h>
63 #include <linux/times.h>
64 #include <linux/slab.h>
65 
66 #include <net/net_namespace.h>
67 #include <net/icmp.h>
68 #include <net/inet_hashtables.h>
69 #include <net/tcp.h>
70 #include <net/transp_v6.h>
71 #include <net/ipv6.h>
72 #include <net/inet_common.h>
73 #include <net/timewait_sock.h>
74 #include <net/xfrm.h>
75 #include <net/secure_seq.h>
76 #include <net/busy_poll.h>
77 
78 #include <linux/inet.h>
79 #include <linux/ipv6.h>
80 #include <linux/stddef.h>
81 #include <linux/proc_fs.h>
82 #include <linux/seq_file.h>
83 #include <linux/inetdevice.h>
84 
85 #include <crypto/hash.h>
86 #include <linux/scatterlist.h>
87 
88 #include <trace/events/tcp.h>
89 
90 #ifdef CONFIG_TCP_MD5SIG
91 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
92 			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
93 #endif
94 
95 struct inet_hashinfo tcp_hashinfo;
96 EXPORT_SYMBOL(tcp_hashinfo);
97 
98 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
99 {
100 	return secure_tcp_seq(ip_hdr(skb)->daddr,
101 			      ip_hdr(skb)->saddr,
102 			      tcp_hdr(skb)->dest,
103 			      tcp_hdr(skb)->source);
104 }
105 
106 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
107 {
108 	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
109 }
110 
111 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
112 {
113 	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
114 	struct tcp_sock *tp = tcp_sk(sk);
115 
116 	/* With PAWS, it is safe from the viewpoint
117 	   of data integrity. Even without PAWS it is safe provided sequence
118 	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.
119 
120 	   Actually, the idea is close to VJ's, only the timestamp cache is
121 	   held not per host but per port pair, and the TW bucket is used as the
122 	   state holder.
123 
124 	   If the TW bucket has already been destroyed we fall back to VJ's scheme
125 	   and use the initial timestamp retrieved from the peer table.
126 	 */
127 	if (tcptw->tw_ts_recent_stamp &&
128 	    (!twp || (sock_net(sk)->ipv4.sysctl_tcp_tw_reuse &&
129 			     get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
130 		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
131 		if (tp->write_seq == 0)
132 			tp->write_seq = 1;
133 		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
134 		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
135 		sock_hold(sktw);
136 		return 1;
137 	}
138 
139 	return 0;
140 }
141 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
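
/* Usage sketch (illustrative, not part of this file): the TIME-WAIT reuse
 * implemented above is gated by the net.ipv4.tcp_tw_reuse sysctl, e.g.
 *
 *	# sysctl -w net.ipv4.tcp_tw_reuse=1
 *
 * With it enabled, an outgoing connect() may take over a four-tuple that is
 * still in TIME-WAIT once the bucket's last timestamp is more than one
 * second old; write_seq is then placed 65535 + 2 past tw_snd_nxt so the old
 * and new sequence spaces cannot overlap.
 */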
142 
143 /* This will initiate an outgoing connection. */
144 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
145 {
146 	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
147 	struct inet_sock *inet = inet_sk(sk);
148 	struct tcp_sock *tp = tcp_sk(sk);
149 	__be16 orig_sport, orig_dport;
150 	__be32 daddr, nexthop;
151 	struct flowi4 *fl4;
152 	struct rtable *rt;
153 	int err;
154 	struct ip_options_rcu *inet_opt;
155 	struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
156 
157 	if (addr_len < sizeof(struct sockaddr_in))
158 		return -EINVAL;
159 
160 	if (usin->sin_family != AF_INET)
161 		return -EAFNOSUPPORT;
162 
163 	nexthop = daddr = usin->sin_addr.s_addr;
164 	inet_opt = rcu_dereference_protected(inet->inet_opt,
165 					     lockdep_sock_is_held(sk));
166 	if (inet_opt && inet_opt->opt.srr) {
167 		if (!daddr)
168 			return -EINVAL;
169 		nexthop = inet_opt->opt.faddr;
170 	}
171 
172 	orig_sport = inet->inet_sport;
173 	orig_dport = usin->sin_port;
174 	fl4 = &inet->cork.fl.u.ip4;
175 	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
176 			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
177 			      IPPROTO_TCP,
178 			      orig_sport, orig_dport, sk);
179 	if (IS_ERR(rt)) {
180 		err = PTR_ERR(rt);
181 		if (err == -ENETUNREACH)
182 			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
183 		return err;
184 	}
185 
186 	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
187 		ip_rt_put(rt);
188 		return -ENETUNREACH;
189 	}
190 
191 	if (!inet_opt || !inet_opt->opt.srr)
192 		daddr = fl4->daddr;
193 
194 	if (!inet->inet_saddr)
195 		inet->inet_saddr = fl4->saddr;
196 	sk_rcv_saddr_set(sk, inet->inet_saddr);
197 
198 	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
199 		/* Reset inherited state */
200 		tp->rx_opt.ts_recent	   = 0;
201 		tp->rx_opt.ts_recent_stamp = 0;
202 		if (likely(!tp->repair))
203 			tp->write_seq	   = 0;
204 	}
205 
206 	inet->inet_dport = usin->sin_port;
207 	sk_daddr_set(sk, daddr);
208 
209 	inet_csk(sk)->icsk_ext_hdr_len = 0;
210 	if (inet_opt)
211 		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
212 
213 	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
214 
215 	/* Socket identity is still unknown (sport may be zero).
216 	 * However we set state to SYN-SENT and, without releasing the socket
217 	 * lock, select a source port, enter ourselves into the hash tables and
218 	 * complete initialization after this.
219 	 */
220 	tcp_set_state(sk, TCP_SYN_SENT);
221 	err = inet_hash_connect(tcp_death_row, sk);
222 	if (err)
223 		goto failure;
224 
225 	sk_set_txhash(sk);
226 
227 	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
228 			       inet->inet_sport, inet->inet_dport, sk);
229 	if (IS_ERR(rt)) {
230 		err = PTR_ERR(rt);
231 		rt = NULL;
232 		goto failure;
233 	}
234 	/* OK, now commit destination to socket.  */
235 	sk->sk_gso_type = SKB_GSO_TCPV4;
236 	sk_setup_caps(sk, &rt->dst);
237 	rt = NULL;
238 
239 	if (likely(!tp->repair)) {
240 		if (!tp->write_seq)
241 			tp->write_seq = secure_tcp_seq(inet->inet_saddr,
242 						       inet->inet_daddr,
243 						       inet->inet_sport,
244 						       usin->sin_port);
245 		tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
246 						 inet->inet_saddr,
247 						 inet->inet_daddr);
248 	}
249 
250 	inet->inet_id = tp->write_seq ^ jiffies;
251 
252 	if (tcp_fastopen_defer_connect(sk, &err))
253 		return err;
254 	if (err)
255 		goto failure;
256 
257 	err = tcp_connect(sk);
258 
259 	if (err)
260 		goto failure;
261 
262 	return 0;
263 
264 failure:
265 	/*
266 	 * This unhashes the socket and releases the local port,
267 	 * if necessary.
268 	 */
269 	tcp_set_state(sk, TCP_CLOSE);
270 	ip_rt_put(rt);
271 	sk->sk_route_caps = 0;
272 	inet->inet_dport = 0;
273 	return err;
274 }
275 EXPORT_SYMBOL(tcp_v4_connect);
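
/* Usage sketch (illustrative userspace code, not part of this file):
 * tcp_v4_connect() runs, under lock_sock(), when an application connects an
 * unconnected AF_INET stream socket, e.g.
 *
 *	int fd = socket(AF_INET, SOCK_STREAM, 0);
 *	struct sockaddr_in dst = {
 *		.sin_family = AF_INET,
 *		.sin_port   = htons(80),
 *	};
 *
 *	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);
 *	connect(fd, (struct sockaddr *)&dst, sizeof(dst));
 *
 * The SYN itself is emitted by tcp_connect() at the end of the function.
 */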
276 
277 /*
278  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
279  * It can be called through tcp_release_cb() if socket was owned by user
280  * at the time tcp_v4_err() was called to handle ICMP message.
281  */
282 void tcp_v4_mtu_reduced(struct sock *sk)
283 {
284 	struct inet_sock *inet = inet_sk(sk);
285 	struct dst_entry *dst;
286 	u32 mtu;
287 
288 	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
289 		return;
290 	mtu = tcp_sk(sk)->mtu_info;
291 	dst = inet_csk_update_pmtu(sk, mtu);
292 	if (!dst)
293 		return;
294 
295 	/* Something is about to go wrong... Remember the soft error
296 	 * in case this connection is not able to recover.
297 	 */
298 	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
299 		sk->sk_err_soft = EMSGSIZE;
300 
301 	mtu = dst_mtu(dst);
302 
303 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
304 	    ip_sk_accept_pmtu(sk) &&
305 	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
306 		tcp_sync_mss(sk, mtu);
307 
308 		/* Resend the TCP packet because it's
309 		 * clear that the old packet has been
310 		 * dropped. This is the new "fast" path mtu
311 		 * discovery.
312 		 */
313 		tcp_simple_retransmit(sk);
314 	} /* else let the usual retransmit timer handle it */
315 }
316 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
317 
318 static void do_redirect(struct sk_buff *skb, struct sock *sk)
319 {
320 	struct dst_entry *dst = __sk_dst_check(sk, 0);
321 
322 	if (dst)
323 		dst->ops->redirect(dst, sk, skb);
324 }
325 
326 
327 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
328 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
329 {
330 	struct request_sock *req = inet_reqsk(sk);
331 	struct net *net = sock_net(sk);
332 
333 	/* ICMPs are not backlogged, hence we cannot get
334 	 * an established socket here.
335 	 */
336 	if (seq != tcp_rsk(req)->snt_isn) {
337 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
338 	} else if (abort) {
339 		/*
340 		 * Still in SYN_RECV, just remove it silently.
341 		 * There is no good way to pass the error to the newly
342 		 * created socket, and POSIX does not want network
343 		 * errors returned from accept().
344 		 */
345 		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
346 		tcp_listendrop(req->rsk_listener);
347 	}
348 	reqsk_put(req);
349 }
350 EXPORT_SYMBOL(tcp_req_err);
351 
352 /*
353  * This routine is called by the ICMP module when it gets some
354  * sort of error condition.  If err < 0 then the socket should
355  * be closed and the error returned to the user.  If err > 0
356  * it's just the icmp type << 8 | icmp code.  After adjustment
357  * header points to the first 8 bytes of the tcp header.  We need
358  * to find the appropriate port.
359  *
360  * The locking strategy used here is very "optimistic". When
361  * someone else accesses the socket the ICMP is just dropped
362  * and for some paths there is no check at all.
363  * A more general error queue to queue errors for later handling
364  * is probably better.
365  *
366  */
367 
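/* Illustrative mapping (not part of the original file): an ICMP destination
 * unreachable with code ICMP_PORT_UNREACH arriving for a SYN_SENT socket is
 * converted below via icmp_err_convert[code].errno into ECONNREFUSED, which
 * is what the application's connect() ultimately reports.
 */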
368 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
369 {
370 	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
371 	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
372 	struct inet_connection_sock *icsk;
373 	struct tcp_sock *tp;
374 	struct inet_sock *inet;
375 	const int type = icmp_hdr(icmp_skb)->type;
376 	const int code = icmp_hdr(icmp_skb)->code;
377 	struct sock *sk;
378 	struct sk_buff *skb;
379 	struct request_sock *fastopen;
380 	u32 seq, snd_una;
381 	s32 remaining;
382 	u32 delta_us;
383 	int err;
384 	struct net *net = dev_net(icmp_skb->dev);
385 
386 	sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
387 				       th->dest, iph->saddr, ntohs(th->source),
388 				       inet_iif(icmp_skb), 0);
389 	if (!sk) {
390 		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
391 		return;
392 	}
393 	if (sk->sk_state == TCP_TIME_WAIT) {
394 		inet_twsk_put(inet_twsk(sk));
395 		return;
396 	}
397 	seq = ntohl(th->seq);
398 	if (sk->sk_state == TCP_NEW_SYN_RECV)
399 		return tcp_req_err(sk, seq,
400 				  type == ICMP_PARAMETERPROB ||
401 				  type == ICMP_TIME_EXCEEDED ||
402 				  (type == ICMP_DEST_UNREACH &&
403 				   (code == ICMP_NET_UNREACH ||
404 				    code == ICMP_HOST_UNREACH)));
405 
406 	bh_lock_sock(sk);
407 	/* If too many ICMPs get dropped on busy
408 	 * servers this needs to be solved differently.
409 	 * We do take care of the PMTU discovery (RFC1191) special case:
410 	 * we can receive locally generated ICMP messages while socket is held.
411 	 */
412 	if (sock_owned_by_user(sk)) {
413 		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
414 			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
415 	}
416 	if (sk->sk_state == TCP_CLOSE)
417 		goto out;
418 
419 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
420 		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
421 		goto out;
422 	}
423 
424 	icsk = inet_csk(sk);
425 	tp = tcp_sk(sk);
426 	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
427 	fastopen = tp->fastopen_rsk;
428 	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
429 	if (sk->sk_state != TCP_LISTEN &&
430 	    !between(seq, snd_una, tp->snd_nxt)) {
431 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
432 		goto out;
433 	}
434 
435 	switch (type) {
436 	case ICMP_REDIRECT:
437 		if (!sock_owned_by_user(sk))
438 			do_redirect(icmp_skb, sk);
439 		goto out;
440 	case ICMP_SOURCE_QUENCH:
441 		/* Just silently ignore these. */
442 		goto out;
443 	case ICMP_PARAMETERPROB:
444 		err = EPROTO;
445 		break;
446 	case ICMP_DEST_UNREACH:
447 		if (code > NR_ICMP_UNREACH)
448 			goto out;
449 
450 		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
451 			/* We are not interested in TCP_LISTEN and open_requests
452 			 * (SYN-ACKs sent out by Linux are always < 576 bytes so
453 			 * they should go through unfragmented).
454 			 */
455 			if (sk->sk_state == TCP_LISTEN)
456 				goto out;
457 
458 			tp->mtu_info = info;
459 			if (!sock_owned_by_user(sk)) {
460 				tcp_v4_mtu_reduced(sk);
461 			} else {
462 				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
463 					sock_hold(sk);
464 			}
465 			goto out;
466 		}
467 
468 		err = icmp_err_convert[code].errno;
469 		/* check if icmp_skb allows revert of backoff
470 		 * (see draft-zimmermann-tcp-lcd) */
471 		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
472 			break;
473 		if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
474 		    !icsk->icsk_backoff || fastopen)
475 			break;
476 
477 		if (sock_owned_by_user(sk))
478 			break;
479 
480 		icsk->icsk_backoff--;
481 		icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
482 					       TCP_TIMEOUT_INIT;
483 		icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
484 
485 		skb = tcp_rtx_queue_head(sk);
486 		BUG_ON(!skb);
487 
488 		tcp_mstamp_refresh(tp);
489 		delta_us = (u32)(tp->tcp_mstamp - skb->skb_mstamp);
490 		remaining = icsk->icsk_rto -
491 			    usecs_to_jiffies(delta_us);
492 
493 		if (remaining > 0) {
494 			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
495 						  remaining, TCP_RTO_MAX);
496 		} else {
497 			/* RTO revert clocked out retransmission.
498 			 * Will retransmit now */
499 			tcp_retransmit_timer(sk);
500 		}
501 
502 		break;
503 	case ICMP_TIME_EXCEEDED:
504 		err = EHOSTUNREACH;
505 		break;
506 	default:
507 		goto out;
508 	}
509 
510 	switch (sk->sk_state) {
511 	case TCP_SYN_SENT:
512 	case TCP_SYN_RECV:
513 		/* Only in fast or simultaneous open. If a fast open socket
514 		 * is already accepted it is treated as a connected one below.
515 		 */
516 		if (fastopen && !fastopen->sk)
517 			break;
518 
519 		if (!sock_owned_by_user(sk)) {
520 			sk->sk_err = err;
521 
522 			sk->sk_error_report(sk);
523 
524 			tcp_done(sk);
525 		} else {
526 			sk->sk_err_soft = err;
527 		}
528 		goto out;
529 	}
530 
531 	/* If we've already connected we will keep trying
532 	 * until we time out, or the user gives up.
533 	 *
534 	 * rfc1122 4.2.3.9 allows considering as hard errors
535 	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
536 	 * but it is obsoleted by pmtu discovery).
537 	 *
538 	 * Note, that in modern internet, where routing is unreliable
539 	 * and in each dark corner broken firewalls sit, sending random
540 	 * errors ordered by their masters, even these two messages finally lose
541 	 * their original sense (even Linux sends invalid PORT_UNREACHs).
542 	 *
543 	 * Now we are in compliance with RFCs.
544 	 *							--ANK (980905)
545 	 */
546 
547 	inet = inet_sk(sk);
548 	if (!sock_owned_by_user(sk) && inet->recverr) {
549 		sk->sk_err = err;
550 		sk->sk_error_report(sk);
551 	} else	{ /* Only an error on timeout */
552 		sk->sk_err_soft = err;
553 	}
554 
555 out:
556 	bh_unlock_sock(sk);
557 	sock_put(sk);
558 }
559 
560 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
561 {
562 	struct tcphdr *th = tcp_hdr(skb);
563 
564 	if (skb->ip_summed == CHECKSUM_PARTIAL) {
565 		th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
566 		skb->csum_start = skb_transport_header(skb) - skb->head;
567 		skb->csum_offset = offsetof(struct tcphdr, check);
568 	} else {
569 		th->check = tcp_v4_check(skb->len, saddr, daddr,
570 					 csum_partial(th,
571 						      th->doff << 2,
572 						      skb->csum));
573 	}
574 }
575 
576 /* This routine computes an IPv4 TCP checksum. */
577 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
578 {
579 	const struct inet_sock *inet = inet_sk(sk);
580 
581 	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
582 }
583 EXPORT_SYMBOL(tcp_v4_send_check);
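
/* A note on the CHECKSUM_PARTIAL branch above (summary, not from this file):
 * only the pseudo-header sum is stored in th->check, while csum_start and
 * csum_offset tell the device (or skb_checksum_help()) where to fold in the
 * ones' complement sum over the TCP header and payload.  In the software
 * path the full checksum is computed immediately with csum_partial().
 */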
584 
585 /*
586  *	This routine will send an RST to the other tcp.
587  *
588  *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
589  *	Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
590  *		      for the reset.
591  *	Answer: if a packet caused the RST, it is not for a socket
592  *		existing in our system; if it is matched to a socket,
593  *		it is just a duplicate segment or a bug in the other side's TCP.
594  *		So we build the reply based only on the parameters that
595  *		arrived with the segment.
596  */
597 
598 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
599 {
600 	const struct tcphdr *th = tcp_hdr(skb);
601 	struct {
602 		struct tcphdr th;
603 #ifdef CONFIG_TCP_MD5SIG
604 		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
605 #endif
606 	} rep;
607 	struct ip_reply_arg arg;
608 #ifdef CONFIG_TCP_MD5SIG
609 	struct tcp_md5sig_key *key = NULL;
610 	const __u8 *hash_location = NULL;
611 	unsigned char newhash[16];
612 	int genhash;
613 	struct sock *sk1 = NULL;
614 #endif
615 	struct net *net;
616 
617 	/* Never send a reset in response to a reset. */
618 	if (th->rst)
619 		return;
620 
621 	/* If sk is not NULL, it means we did a successful lookup and the incoming
622 	 * route had to be correct. prequeue might have dropped our dst.
623 	 */
624 	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
625 		return;
626 
627 	/* Swap the send and the receive. */
628 	memset(&rep, 0, sizeof(rep));
629 	rep.th.dest   = th->source;
630 	rep.th.source = th->dest;
631 	rep.th.doff   = sizeof(struct tcphdr) / 4;
632 	rep.th.rst    = 1;
633 
634 	if (th->ack) {
635 		rep.th.seq = th->ack_seq;
636 	} else {
637 		rep.th.ack = 1;
638 		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
639 				       skb->len - (th->doff << 2));
640 	}
641 
642 	memset(&arg, 0, sizeof(arg));
643 	arg.iov[0].iov_base = (unsigned char *)&rep;
644 	arg.iov[0].iov_len  = sizeof(rep.th);
645 
646 	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
647 #ifdef CONFIG_TCP_MD5SIG
648 	rcu_read_lock();
649 	hash_location = tcp_parse_md5sig_option(th);
650 	if (sk && sk_fullsock(sk)) {
651 		key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
652 					&ip_hdr(skb)->saddr, AF_INET);
653 	} else if (hash_location) {
654 		/*
655 		 * The active side is lost. Try to find the listening socket through
656 		 * the source port, and then find the md5 key through the listening socket.
657 		 * We do not lose security here:
658 		 * the incoming packet is checked against the md5 hash of the found key,
659 		 * and no RST is generated if the md5 hash doesn't match.
660 		 */
661 		sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
662 					     ip_hdr(skb)->saddr,
663 					     th->source, ip_hdr(skb)->daddr,
664 					     ntohs(th->source), inet_iif(skb),
665 					     tcp_v4_sdif(skb));
666 		/* don't send rst if it can't find key */
667 		if (!sk1)
668 			goto out;
669 
670 		key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
671 					&ip_hdr(skb)->saddr, AF_INET);
672 		if (!key)
673 			goto out;
674 
675 
676 		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
677 		if (genhash || memcmp(hash_location, newhash, 16) != 0)
678 			goto out;
679 
680 	}
681 
682 	if (key) {
683 		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
684 				   (TCPOPT_NOP << 16) |
685 				   (TCPOPT_MD5SIG << 8) |
686 				   TCPOLEN_MD5SIG);
687 		/* Update length and the length the header thinks exists */
688 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
689 		rep.th.doff = arg.iov[0].iov_len / 4;
690 
691 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
692 				     key, ip_hdr(skb)->saddr,
693 				     ip_hdr(skb)->daddr, &rep.th);
694 	}
695 #endif
696 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
697 				      ip_hdr(skb)->saddr, /* XXX */
698 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
699 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
700 	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
701 
702 	/* When the socket is gone, all binding information is lost, and
703 	 * routing might fail in this case. No choice here: if we force the
704 	 * input interface, we will misroute in case of an asymmetric route.
705 	 */
706 	if (sk) {
707 		arg.bound_dev_if = sk->sk_bound_dev_if;
708 		trace_tcp_send_reset(sk, skb);
709 	}
710 
711 	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
712 		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));
713 
714 	arg.tos = ip_hdr(skb)->tos;
715 	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
716 	local_bh_disable();
717 	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
718 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
719 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
720 			      &arg, arg.iov[0].iov_len);
721 
722 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
723 	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
724 	local_bh_enable();
725 
726 #ifdef CONFIG_TCP_MD5SIG
727 out:
728 	rcu_read_unlock();
729 #endif
730 }
731 
732 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
733    outside socket context, is certainly ugly. What can I do?
734  */
735 
736 static void tcp_v4_send_ack(const struct sock *sk,
737 			    struct sk_buff *skb, u32 seq, u32 ack,
738 			    u32 win, u32 tsval, u32 tsecr, int oif,
739 			    struct tcp_md5sig_key *key,
740 			    int reply_flags, u8 tos)
741 {
742 	const struct tcphdr *th = tcp_hdr(skb);
743 	struct {
744 		struct tcphdr th;
745 		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
746 #ifdef CONFIG_TCP_MD5SIG
747 			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
748 #endif
749 			];
750 	} rep;
751 	struct net *net = sock_net(sk);
752 	struct ip_reply_arg arg;
753 
754 	memset(&rep.th, 0, sizeof(struct tcphdr));
755 	memset(&arg, 0, sizeof(arg));
756 
757 	arg.iov[0].iov_base = (unsigned char *)&rep;
758 	arg.iov[0].iov_len  = sizeof(rep.th);
759 	if (tsecr) {
760 		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
761 				   (TCPOPT_TIMESTAMP << 8) |
762 				   TCPOLEN_TIMESTAMP);
763 		rep.opt[1] = htonl(tsval);
764 		rep.opt[2] = htonl(tsecr);
765 		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
766 	}
767 
768 	/* Swap the send and the receive. */
769 	rep.th.dest    = th->source;
770 	rep.th.source  = th->dest;
771 	rep.th.doff    = arg.iov[0].iov_len / 4;
772 	rep.th.seq     = htonl(seq);
773 	rep.th.ack_seq = htonl(ack);
774 	rep.th.ack     = 1;
775 	rep.th.window  = htons(win);
776 
777 #ifdef CONFIG_TCP_MD5SIG
778 	if (key) {
779 		int offset = (tsecr) ? 3 : 0;
780 
781 		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
782 					  (TCPOPT_NOP << 16) |
783 					  (TCPOPT_MD5SIG << 8) |
784 					  TCPOLEN_MD5SIG);
785 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
786 		rep.th.doff = arg.iov[0].iov_len/4;
787 
788 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
789 				    key, ip_hdr(skb)->saddr,
790 				    ip_hdr(skb)->daddr, &rep.th);
791 	}
792 #endif
793 	arg.flags = reply_flags;
794 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
795 				      ip_hdr(skb)->saddr, /* XXX */
796 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
797 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
798 	if (oif)
799 		arg.bound_dev_if = oif;
800 	arg.tos = tos;
801 	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
802 	local_bh_disable();
803 	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
804 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
805 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
806 			      &arg, arg.iov[0].iov_len);
807 
808 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
809 	local_bh_enable();
810 }
811 
812 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
813 {
814 	struct inet_timewait_sock *tw = inet_twsk(sk);
815 	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
816 
817 	tcp_v4_send_ack(sk, skb,
818 			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
819 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
820 			tcp_time_stamp_raw() + tcptw->tw_ts_offset,
821 			tcptw->tw_ts_recent,
822 			tw->tw_bound_dev_if,
823 			tcp_twsk_md5_key(tcptw),
824 			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
825 			tw->tw_tos
826 			);
827 
828 	inet_twsk_put(tw);
829 }
830 
831 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
832 				  struct request_sock *req)
833 {
834 	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
835 	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
836 	 */
837 	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
838 					     tcp_sk(sk)->snd_nxt;
839 
840 	/* RFC 7323 2.3
841 	 * The window field (SEG.WND) of every outgoing segment, with the
842 	 * exception of <SYN> segments, MUST be right-shifted by
843 	 * Rcv.Wind.Shift bits:
844 	 */
845 	tcp_v4_send_ack(sk, skb, seq,
846 			tcp_rsk(req)->rcv_nxt,
847 			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
848 			tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
849 			req->ts_recent,
850 			0,
851 			tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
852 					  AF_INET),
853 			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
854 			ip_hdr(skb)->tos);
855 }
856 
857 /*
858  *	Send a SYN-ACK after having received a SYN.
859  *	This still operates on a request_sock only, not on a big
860  *	socket.
861  */
862 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
863 			      struct flowi *fl,
864 			      struct request_sock *req,
865 			      struct tcp_fastopen_cookie *foc,
866 			      enum tcp_synack_type synack_type)
867 {
868 	const struct inet_request_sock *ireq = inet_rsk(req);
869 	struct flowi4 fl4;
870 	int err = -1;
871 	struct sk_buff *skb;
872 
873 	/* First, grab a route. */
874 	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
875 		return -1;
876 
877 	skb = tcp_make_synack(sk, dst, req, foc, synack_type);
878 
879 	if (skb) {
880 		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
881 
882 		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
883 					    ireq->ir_rmt_addr,
884 					    ireq_opt_deref(ireq));
885 		err = net_xmit_eval(err);
886 	}
887 
888 	return err;
889 }
890 
891 /*
892  *	IPv4 request_sock destructor.
893  */
894 static void tcp_v4_reqsk_destructor(struct request_sock *req)
895 {
896 	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
897 }
898 
899 #ifdef CONFIG_TCP_MD5SIG
900 /*
901  * RFC2385 MD5 checksumming requires a mapping of
902  * IP address->MD5 Key.
903  * We need to maintain these in the sk structure.
904  */
905 
906 /* Find the Key structure for an address.  */
907 struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
908 					 const union tcp_md5_addr *addr,
909 					 int family)
910 {
911 	const struct tcp_sock *tp = tcp_sk(sk);
912 	struct tcp_md5sig_key *key;
913 	const struct tcp_md5sig_info *md5sig;
914 	__be32 mask;
915 	struct tcp_md5sig_key *best_match = NULL;
916 	bool match;
917 
918 	/* caller either holds rcu_read_lock() or socket lock */
919 	md5sig = rcu_dereference_check(tp->md5sig_info,
920 				       lockdep_sock_is_held(sk));
921 	if (!md5sig)
922 		return NULL;
923 
924 	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
925 		if (key->family != family)
926 			continue;
927 
928 		if (family == AF_INET) {
929 			mask = inet_make_mask(key->prefixlen);
930 			match = (key->addr.a4.s_addr & mask) ==
931 				(addr->a4.s_addr & mask);
932 #if IS_ENABLED(CONFIG_IPV6)
933 		} else if (family == AF_INET6) {
934 			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
935 						  key->prefixlen);
936 #endif
937 		} else {
938 			match = false;
939 		}
940 
941 		if (match && (!best_match ||
942 			      key->prefixlen > best_match->prefixlen))
943 			best_match = key;
944 	}
945 	return best_match;
946 }
947 EXPORT_SYMBOL(tcp_md5_do_lookup);
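
/* Worked example for the prefix matching above (illustrative): with keys
 * installed for 192.0.2.0/24 and 192.0.2.5/32, a lookup for 192.0.2.5
 * matches both entries, and the /32 entry wins because the loop keeps the
 * matching key with the longest prefix as best_match.
 */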
948 
949 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
950 						      const union tcp_md5_addr *addr,
951 						      int family, u8 prefixlen)
952 {
953 	const struct tcp_sock *tp = tcp_sk(sk);
954 	struct tcp_md5sig_key *key;
955 	unsigned int size = sizeof(struct in_addr);
956 	const struct tcp_md5sig_info *md5sig;
957 
958 	/* caller either holds rcu_read_lock() or socket lock */
959 	md5sig = rcu_dereference_check(tp->md5sig_info,
960 				       lockdep_sock_is_held(sk));
961 	if (!md5sig)
962 		return NULL;
963 #if IS_ENABLED(CONFIG_IPV6)
964 	if (family == AF_INET6)
965 		size = sizeof(struct in6_addr);
966 #endif
967 	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
968 		if (key->family != family)
969 			continue;
970 		if (!memcmp(&key->addr, addr, size) &&
971 		    key->prefixlen == prefixlen)
972 			return key;
973 	}
974 	return NULL;
975 }
976 
977 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
978 					 const struct sock *addr_sk)
979 {
980 	const union tcp_md5_addr *addr;
981 
982 	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
983 	return tcp_md5_do_lookup(sk, addr, AF_INET);
984 }
985 EXPORT_SYMBOL(tcp_v4_md5_lookup);
986 
987 /* This can be called on a newly created socket, from other files */
988 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
989 		   int family, u8 prefixlen, const u8 *newkey, u8 newkeylen,
990 		   gfp_t gfp)
991 {
992 	/* Add Key to the list */
993 	struct tcp_md5sig_key *key;
994 	struct tcp_sock *tp = tcp_sk(sk);
995 	struct tcp_md5sig_info *md5sig;
996 
997 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
998 	if (key) {
999 		/* Pre-existing entry - just update that one. */
1000 		memcpy(key->key, newkey, newkeylen);
1001 		key->keylen = newkeylen;
1002 		return 0;
1003 	}
1004 
1005 	md5sig = rcu_dereference_protected(tp->md5sig_info,
1006 					   lockdep_sock_is_held(sk));
1007 	if (!md5sig) {
1008 		md5sig = kmalloc(sizeof(*md5sig), gfp);
1009 		if (!md5sig)
1010 			return -ENOMEM;
1011 
1012 		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1013 		INIT_HLIST_HEAD(&md5sig->head);
1014 		rcu_assign_pointer(tp->md5sig_info, md5sig);
1015 	}
1016 
1017 	key = sock_kmalloc(sk, sizeof(*key), gfp);
1018 	if (!key)
1019 		return -ENOMEM;
1020 	if (!tcp_alloc_md5sig_pool()) {
1021 		sock_kfree_s(sk, key, sizeof(*key));
1022 		return -ENOMEM;
1023 	}
1024 
1025 	memcpy(key->key, newkey, newkeylen);
1026 	key->keylen = newkeylen;
1027 	key->family = family;
1028 	key->prefixlen = prefixlen;
1029 	memcpy(&key->addr, addr,
1030 	       (family == AF_INET6) ? sizeof(struct in6_addr) :
1031 				      sizeof(struct in_addr));
1032 	hlist_add_head_rcu(&key->node, &md5sig->head);
1033 	return 0;
1034 }
1035 EXPORT_SYMBOL(tcp_md5_do_add);
1036 
1037 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1038 		   u8 prefixlen)
1039 {
1040 	struct tcp_md5sig_key *key;
1041 
1042 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
1043 	if (!key)
1044 		return -ENOENT;
1045 	hlist_del_rcu(&key->node);
1046 	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1047 	kfree_rcu(key, rcu);
1048 	return 0;
1049 }
1050 EXPORT_SYMBOL(tcp_md5_do_del);
1051 
1052 static void tcp_clear_md5_list(struct sock *sk)
1053 {
1054 	struct tcp_sock *tp = tcp_sk(sk);
1055 	struct tcp_md5sig_key *key;
1056 	struct hlist_node *n;
1057 	struct tcp_md5sig_info *md5sig;
1058 
1059 	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1060 
1061 	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1062 		hlist_del_rcu(&key->node);
1063 		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1064 		kfree_rcu(key, rcu);
1065 	}
1066 }
1067 
1068 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1069 				 char __user *optval, int optlen)
1070 {
1071 	struct tcp_md5sig cmd;
1072 	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1073 	u8 prefixlen = 32;
1074 
1075 	if (optlen < sizeof(cmd))
1076 		return -EINVAL;
1077 
1078 	if (copy_from_user(&cmd, optval, sizeof(cmd)))
1079 		return -EFAULT;
1080 
1081 	if (sin->sin_family != AF_INET)
1082 		return -EINVAL;
1083 
1084 	if (optname == TCP_MD5SIG_EXT &&
1085 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1086 		prefixlen = cmd.tcpm_prefixlen;
1087 		if (prefixlen > 32)
1088 			return -EINVAL;
1089 	}
1090 
1091 	if (!cmd.tcpm_keylen)
1092 		return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1093 				      AF_INET, prefixlen);
1094 
1095 	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1096 		return -EINVAL;
1097 
1098 	return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1099 			      AF_INET, prefixlen, cmd.tcpm_key, cmd.tcpm_keylen,
1100 			      GFP_KERNEL);
1101 }
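
/* Usage sketch (illustrative userspace code, not part of this file): the
 * parser above is reached through setsockopt() with TCP_MD5SIG, or
 * TCP_MD5SIG_EXT when a prefix length is supplied, e.g.
 *
 *	struct tcp_md5sig md5 = { .tcpm_keylen = 16 };
 *	struct sockaddr_in *sin = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	sin->sin_family = AF_INET;
 *	inet_pton(AF_INET, "192.0.2.1", &sin->sin_addr);
 *	memcpy(md5.tcpm_key, "example-shared-k", 16);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 *
 * A zero tcpm_keylen deletes the key for that peer, matching the
 * tcp_md5_do_del() path above.
 */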
1102 
1103 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1104 				   __be32 daddr, __be32 saddr,
1105 				   const struct tcphdr *th, int nbytes)
1106 {
1107 	struct tcp4_pseudohdr *bp;
1108 	struct scatterlist sg;
1109 	struct tcphdr *_th;
1110 
1111 	bp = hp->scratch;
1112 	bp->saddr = saddr;
1113 	bp->daddr = daddr;
1114 	bp->pad = 0;
1115 	bp->protocol = IPPROTO_TCP;
1116 	bp->len = cpu_to_be16(nbytes);
1117 
1118 	_th = (struct tcphdr *)(bp + 1);
1119 	memcpy(_th, th, sizeof(*th));
1120 	_th->check = 0;
1121 
1122 	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1123 	ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1124 				sizeof(*bp) + sizeof(*th));
1125 	return crypto_ahash_update(hp->md5_req);
1126 }
1127 
1128 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1129 			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
1130 {
1131 	struct tcp_md5sig_pool *hp;
1132 	struct ahash_request *req;
1133 
1134 	hp = tcp_get_md5sig_pool();
1135 	if (!hp)
1136 		goto clear_hash_noput;
1137 	req = hp->md5_req;
1138 
1139 	if (crypto_ahash_init(req))
1140 		goto clear_hash;
1141 	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1142 		goto clear_hash;
1143 	if (tcp_md5_hash_key(hp, key))
1144 		goto clear_hash;
1145 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1146 	if (crypto_ahash_final(req))
1147 		goto clear_hash;
1148 
1149 	tcp_put_md5sig_pool();
1150 	return 0;
1151 
1152 clear_hash:
1153 	tcp_put_md5sig_pool();
1154 clear_hash_noput:
1155 	memset(md5_hash, 0, 16);
1156 	return 1;
1157 }
1158 
1159 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1160 			const struct sock *sk,
1161 			const struct sk_buff *skb)
1162 {
1163 	struct tcp_md5sig_pool *hp;
1164 	struct ahash_request *req;
1165 	const struct tcphdr *th = tcp_hdr(skb);
1166 	__be32 saddr, daddr;
1167 
1168 	if (sk) { /* valid for establish/request sockets */
1169 		saddr = sk->sk_rcv_saddr;
1170 		daddr = sk->sk_daddr;
1171 	} else {
1172 		const struct iphdr *iph = ip_hdr(skb);
1173 		saddr = iph->saddr;
1174 		daddr = iph->daddr;
1175 	}
1176 
1177 	hp = tcp_get_md5sig_pool();
1178 	if (!hp)
1179 		goto clear_hash_noput;
1180 	req = hp->md5_req;
1181 
1182 	if (crypto_ahash_init(req))
1183 		goto clear_hash;
1184 
1185 	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1186 		goto clear_hash;
1187 	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1188 		goto clear_hash;
1189 	if (tcp_md5_hash_key(hp, key))
1190 		goto clear_hash;
1191 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1192 	if (crypto_ahash_final(req))
1193 		goto clear_hash;
1194 
1195 	tcp_put_md5sig_pool();
1196 	return 0;
1197 
1198 clear_hash:
1199 	tcp_put_md5sig_pool();
1200 clear_hash_noput:
1201 	memset(md5_hash, 0, 16);
1202 	return 1;
1203 }
1204 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1205 
1206 #endif
1207 
1208 /* Called with rcu_read_lock() */
1209 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1210 				    const struct sk_buff *skb)
1211 {
1212 #ifdef CONFIG_TCP_MD5SIG
1213 	/*
1214 	 * This gets called for each TCP segment that arrives
1215 	 * so we want to be efficient.
1216 	 * We have 3 drop cases:
1217 	 * o No MD5 hash and one expected.
1218 	 * o MD5 hash and we're not expecting one.
1219 	 * o MD5 hash and it's wrong.
1220 	 */
1221 	const __u8 *hash_location = NULL;
1222 	struct tcp_md5sig_key *hash_expected;
1223 	const struct iphdr *iph = ip_hdr(skb);
1224 	const struct tcphdr *th = tcp_hdr(skb);
1225 	int genhash;
1226 	unsigned char newhash[16];
1227 
1228 	hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1229 					  AF_INET);
1230 	hash_location = tcp_parse_md5sig_option(th);
1231 
1232 	/* We've parsed the options - do we have a hash? */
1233 	if (!hash_expected && !hash_location)
1234 		return false;
1235 
1236 	if (hash_expected && !hash_location) {
1237 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1238 		return true;
1239 	}
1240 
1241 	if (!hash_expected && hash_location) {
1242 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1243 		return true;
1244 	}
1245 
1246 	/* Okay, so this is hash_expected and hash_location -
1247 	 * so we need to calculate the checksum.
1248 	 */
1249 	genhash = tcp_v4_md5_hash_skb(newhash,
1250 				      hash_expected,
1251 				      NULL, skb);
1252 
1253 	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1254 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1255 		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1256 				     &iph->saddr, ntohs(th->source),
1257 				     &iph->daddr, ntohs(th->dest),
1258 				     genhash ? " tcp_v4_calc_md5_hash failed"
1259 				     : "");
1260 		return true;
1261 	}
1262 	return false;
1263 #endif
1264 	return false;
1265 }
1266 
1267 static void tcp_v4_init_req(struct request_sock *req,
1268 			    const struct sock *sk_listener,
1269 			    struct sk_buff *skb)
1270 {
1271 	struct inet_request_sock *ireq = inet_rsk(req);
1272 	struct net *net = sock_net(sk_listener);
1273 
1274 	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1275 	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1276 	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1277 }
1278 
1279 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1280 					  struct flowi *fl,
1281 					  const struct request_sock *req)
1282 {
1283 	return inet_csk_route_req(sk, &fl->u.ip4, req);
1284 }
1285 
1286 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1287 	.family		=	PF_INET,
1288 	.obj_size	=	sizeof(struct tcp_request_sock),
1289 	.rtx_syn_ack	=	tcp_rtx_synack,
1290 	.send_ack	=	tcp_v4_reqsk_send_ack,
1291 	.destructor	=	tcp_v4_reqsk_destructor,
1292 	.send_reset	=	tcp_v4_send_reset,
1293 	.syn_ack_timeout =	tcp_syn_ack_timeout,
1294 };
1295 
1296 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1297 	.mss_clamp	=	TCP_MSS_DEFAULT,
1298 #ifdef CONFIG_TCP_MD5SIG
1299 	.req_md5_lookup	=	tcp_v4_md5_lookup,
1300 	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1301 #endif
1302 	.init_req	=	tcp_v4_init_req,
1303 #ifdef CONFIG_SYN_COOKIES
1304 	.cookie_init_seq =	cookie_v4_init_sequence,
1305 #endif
1306 	.route_req	=	tcp_v4_route_req,
1307 	.init_seq	=	tcp_v4_init_seq,
1308 	.init_ts_off	=	tcp_v4_init_ts_off,
1309 	.send_synack	=	tcp_v4_send_synack,
1310 };
1311 
1312 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1313 {
1314 	/* Never answer SYNs sent to broadcast or multicast */
1315 	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1316 		goto drop;
1317 
1318 	return tcp_conn_request(&tcp_request_sock_ops,
1319 				&tcp_request_sock_ipv4_ops, sk, skb);
1320 
1321 drop:
1322 	tcp_listendrop(sk);
1323 	return 0;
1324 }
1325 EXPORT_SYMBOL(tcp_v4_conn_request);
1326 
1327 
1328 /*
1329  * The three way handshake has completed - we got a valid ACK -
1330  * now create the new socket.
1331  */
1332 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1333 				  struct request_sock *req,
1334 				  struct dst_entry *dst,
1335 				  struct request_sock *req_unhash,
1336 				  bool *own_req)
1337 {
1338 	struct inet_request_sock *ireq;
1339 	struct inet_sock *newinet;
1340 	struct tcp_sock *newtp;
1341 	struct sock *newsk;
1342 #ifdef CONFIG_TCP_MD5SIG
1343 	struct tcp_md5sig_key *key;
1344 #endif
1345 	struct ip_options_rcu *inet_opt;
1346 
1347 	if (sk_acceptq_is_full(sk))
1348 		goto exit_overflow;
1349 
1350 	newsk = tcp_create_openreq_child(sk, req, skb);
1351 	if (!newsk)
1352 		goto exit_nonewsk;
1353 
1354 	newsk->sk_gso_type = SKB_GSO_TCPV4;
1355 	inet_sk_rx_dst_set(newsk, skb);
1356 
1357 	newtp		      = tcp_sk(newsk);
1358 	newinet		      = inet_sk(newsk);
1359 	ireq		      = inet_rsk(req);
1360 	sk_daddr_set(newsk, ireq->ir_rmt_addr);
1361 	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1362 	newsk->sk_bound_dev_if = ireq->ir_iif;
1363 	newinet->inet_saddr   = ireq->ir_loc_addr;
1364 	inet_opt	      = rcu_dereference(ireq->ireq_opt);
1365 	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1366 	newinet->mc_index     = inet_iif(skb);
1367 	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1368 	newinet->rcv_tos      = ip_hdr(skb)->tos;
1369 	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1370 	if (inet_opt)
1371 		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1372 	newinet->inet_id = newtp->write_seq ^ jiffies;
1373 
1374 	if (!dst) {
1375 		dst = inet_csk_route_child_sock(sk, newsk, req);
1376 		if (!dst)
1377 			goto put_and_exit;
1378 	} else {
1379 		/* syncookie case : see end of cookie_v4_check() */
1380 	}
1381 	sk_setup_caps(newsk, dst);
1382 
1383 	tcp_ca_openreq_child(newsk, dst);
1384 
1385 	tcp_sync_mss(newsk, dst_mtu(dst));
1386 	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1387 
1388 	tcp_initialize_rcv_mss(newsk);
1389 
1390 #ifdef CONFIG_TCP_MD5SIG
1391 	/* Copy over the MD5 key from the original socket */
1392 	key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1393 				AF_INET);
1394 	if (key) {
1395 		/*
1396 		 * We're using one, so create a matching key
1397 		 * on the newsk structure. If we fail to get
1398 		 * memory, then we end up not copying the key
1399 		 * across. Shucks.
1400 		 */
1401 		tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1402 			       AF_INET, 32, key->key, key->keylen, GFP_ATOMIC);
1403 		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1404 	}
1405 #endif
1406 
1407 	if (__inet_inherit_port(sk, newsk) < 0)
1408 		goto put_and_exit;
1409 	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
1410 	if (likely(*own_req)) {
1411 		tcp_move_syn(newtp, req);
1412 		ireq->ireq_opt = NULL;
1413 	} else {
1414 		newinet->inet_opt = NULL;
1415 	}
1416 	return newsk;
1417 
1418 exit_overflow:
1419 	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1420 exit_nonewsk:
1421 	dst_release(dst);
1422 exit:
1423 	tcp_listendrop(sk);
1424 	return NULL;
1425 put_and_exit:
1426 	newinet->inet_opt = NULL;
1427 	inet_csk_prepare_forced_close(newsk);
1428 	tcp_done(newsk);
1429 	goto exit;
1430 }
1431 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1432 
1433 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1434 {
1435 #ifdef CONFIG_SYN_COOKIES
1436 	const struct tcphdr *th = tcp_hdr(skb);
1437 
1438 	if (!th->syn)
1439 		sk = cookie_v4_check(sk, skb);
1440 #endif
1441 	return sk;
1442 }
1443 
1444 /* The socket must have its spinlock held when we get
1445  * here, unless it is a TCP_LISTEN socket.
1446  *
1447  * We have a potential double-lock case here, so even when
1448  * doing backlog processing we use the BH locking scheme.
1449  * This is because we cannot sleep with the original spinlock
1450  * held.
1451  */
1452 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1453 {
1454 	struct sock *rsk;
1455 
1456 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1457 		struct dst_entry *dst = sk->sk_rx_dst;
1458 
1459 		sock_rps_save_rxhash(sk, skb);
1460 		sk_mark_napi_id(sk, skb);
1461 		if (dst) {
1462 			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1463 			    !dst->ops->check(dst, 0)) {
1464 				dst_release(dst);
1465 				sk->sk_rx_dst = NULL;
1466 			}
1467 		}
1468 		tcp_rcv_established(sk, skb, tcp_hdr(skb));
1469 		return 0;
1470 	}
1471 
1472 	if (tcp_checksum_complete(skb))
1473 		goto csum_err;
1474 
1475 	if (sk->sk_state == TCP_LISTEN) {
1476 		struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1477 
1478 		if (!nsk)
1479 			goto discard;
1480 		if (nsk != sk) {
1481 			if (tcp_child_process(sk, nsk, skb)) {
1482 				rsk = nsk;
1483 				goto reset;
1484 			}
1485 			return 0;
1486 		}
1487 	} else
1488 		sock_rps_save_rxhash(sk, skb);
1489 
1490 	if (tcp_rcv_state_process(sk, skb)) {
1491 		rsk = sk;
1492 		goto reset;
1493 	}
1494 	return 0;
1495 
1496 reset:
1497 	tcp_v4_send_reset(rsk, skb);
1498 discard:
1499 	kfree_skb(skb);
1500 	/* Be careful here. If this function gets more complicated and
1501 	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1502 	 * might be destroyed here. This current version compiles correctly,
1503 	 * but you have been warned.
1504 	 */
1505 	return 0;
1506 
1507 csum_err:
1508 	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1509 	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1510 	goto discard;
1511 }
1512 EXPORT_SYMBOL(tcp_v4_do_rcv);
1513 
1514 int tcp_v4_early_demux(struct sk_buff *skb)
1515 {
1516 	const struct iphdr *iph;
1517 	const struct tcphdr *th;
1518 	struct sock *sk;
1519 
1520 	if (skb->pkt_type != PACKET_HOST)
1521 		return 0;
1522 
1523 	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1524 		return 0;
1525 
1526 	iph = ip_hdr(skb);
1527 	th = tcp_hdr(skb);
1528 
1529 	if (th->doff < sizeof(struct tcphdr) / 4)
1530 		return 0;
1531 
1532 	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1533 				       iph->saddr, th->source,
1534 				       iph->daddr, ntohs(th->dest),
1535 				       skb->skb_iif, inet_sdif(skb));
1536 	if (sk) {
1537 		skb->sk = sk;
1538 		skb->destructor = sock_edemux;
1539 		if (sk_fullsock(sk)) {
1540 			struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1541 
1542 			if (dst)
1543 				dst = dst_check(dst, 0);
1544 			if (dst &&
1545 			    inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1546 				skb_dst_set_noref(skb, dst);
1547 		}
1548 	}
1549 	return 0;
1550 }
1551 
1552 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1553 {
1554 	u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;
1555 
1556 	/* Only the socket owner can try to collapse/prune rx queues
1557 	 * to reduce memory overhead, so add a little headroom here.
1558 	 * Only a few socket backlogs are likely to be non-empty concurrently.
1559 	 */
1560 	limit += 64*1024;
1561 
1562 	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1563 	 * we can fix skb->truesize to its real value to avoid future drops.
1564 	 * This is valid because skb is not yet charged to the socket.
1565 	 * It has been noticed that pure SACK packets were sometimes dropped
1566 	 * (if cooked by drivers without copybreak feature).
1567 	 */
1568 	skb_condense(skb);
1569 
1570 	if (unlikely(sk_add_backlog(sk, skb, limit))) {
1571 		bh_unlock_sock(sk);
1572 		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1573 		return true;
1574 	}
1575 	return false;
1576 }
1577 EXPORT_SYMBOL(tcp_add_backlog);
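
/* Worked example for the limit above (illustrative numbers): with
 * sk_rcvbuf = 131072 and sk_sndbuf = 16384 the backlog may charge up to
 * 131072 + 16384 + 65536 = 212992 bytes of skb truesize before
 * sk_add_backlog() starts refusing packets and TCPBacklogDrop is bumped.
 */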
1578 
1579 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1580 {
1581 	struct tcphdr *th = (struct tcphdr *)skb->data;
1582 	unsigned int eaten = skb->len;
1583 	int err;
1584 
1585 	err = sk_filter_trim_cap(sk, skb, th->doff * 4);
1586 	if (!err) {
1587 		eaten -= skb->len;
1588 		TCP_SKB_CB(skb)->end_seq -= eaten;
1589 	}
1590 	return err;
1591 }
1592 EXPORT_SYMBOL(tcp_filter);
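
/* Note (summary, not from this file): sk_filter_trim_cap() runs any attached
 * socket filter and may trim the skb, but never below th->doff * 4, so the
 * TCP header always survives; end_seq is then reduced by exactly the number
 * of payload bytes the filter removed.
 */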
1593 
1594 /*
1595  *	From tcp_input.c
1596  */
1597 
1598 int tcp_v4_rcv(struct sk_buff *skb)
1599 {
1600 	struct net *net = dev_net(skb->dev);
1601 	int sdif = inet_sdif(skb);
1602 	const struct iphdr *iph;
1603 	const struct tcphdr *th;
1604 	bool refcounted;
1605 	struct sock *sk;
1606 	int ret;
1607 
1608 	if (skb->pkt_type != PACKET_HOST)
1609 		goto discard_it;
1610 
1611 	/* Count it even if it's bad */
1612 	__TCP_INC_STATS(net, TCP_MIB_INSEGS);
1613 
1614 	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1615 		goto discard_it;
1616 
1617 	th = (const struct tcphdr *)skb->data;
1618 
1619 	if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1620 		goto bad_packet;
1621 	if (!pskb_may_pull(skb, th->doff * 4))
1622 		goto discard_it;
1623 
1624 	/* An explanation is required here, I think.
1625 	 * Packet length and doff are validated by header prediction,
1626 	 * provided the case of th->doff==0 is eliminated.
1627 	 * So, we defer the checks. */
1628 
1629 	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1630 		goto csum_error;
1631 
1632 	th = (const struct tcphdr *)skb->data;
1633 	iph = ip_hdr(skb);
1634 	/* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
1635 	 * barrier() makes sure the compiler won't play fool^Waliasing games.
1636 	 */
1637 	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1638 		sizeof(struct inet_skb_parm));
1639 	barrier();
1640 
1641 	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1642 	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1643 				    skb->len - th->doff * 4);
1644 	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1645 	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1646 	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1647 	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1648 	TCP_SKB_CB(skb)->sacked	 = 0;
1649 	TCP_SKB_CB(skb)->has_rxtstamp =
1650 			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1651 
1652 lookup:
1653 	sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1654 			       th->dest, sdif, &refcounted);
1655 	if (!sk)
1656 		goto no_tcp_socket;
1657 
1658 process:
1659 	if (sk->sk_state == TCP_TIME_WAIT)
1660 		goto do_time_wait;
1661 
1662 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
1663 		struct request_sock *req = inet_reqsk(sk);
1664 		struct sock *nsk;
1665 
1666 		sk = req->rsk_listener;
1667 		if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
1668 			sk_drops_add(sk, skb);
1669 			reqsk_put(req);
1670 			goto discard_it;
1671 		}
1672 		if (unlikely(sk->sk_state != TCP_LISTEN)) {
1673 			inet_csk_reqsk_queue_drop_and_put(sk, req);
1674 			goto lookup;
1675 		}
1676 		/* We own a reference on the listener, increase it again
1677 		 * as we might lose it too soon.
1678 		 */
1679 		sock_hold(sk);
1680 		refcounted = true;
1681 		nsk = NULL;
1682 		if (!tcp_filter(sk, skb))
1683 			nsk = tcp_check_req(sk, skb, req, false);
1684 		if (!nsk) {
1685 			reqsk_put(req);
1686 			goto discard_and_relse;
1687 		}
1688 		if (nsk == sk) {
1689 			reqsk_put(req);
1690 		} else if (tcp_child_process(sk, nsk, skb)) {
1691 			tcp_v4_send_reset(nsk, skb);
1692 			goto discard_and_relse;
1693 		} else {
1694 			sock_put(sk);
1695 			return 0;
1696 		}
1697 	}
1698 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1699 		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
1700 		goto discard_and_relse;
1701 	}
1702 
1703 	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1704 		goto discard_and_relse;
1705 
1706 	if (tcp_v4_inbound_md5_hash(sk, skb))
1707 		goto discard_and_relse;
1708 
1709 	nf_reset(skb);
1710 
1711 	if (tcp_filter(sk, skb))
1712 		goto discard_and_relse;
1713 	th = (const struct tcphdr *)skb->data;
1714 	iph = ip_hdr(skb);
1715 
1716 	skb->dev = NULL;
1717 
1718 	if (sk->sk_state == TCP_LISTEN) {
1719 		ret = tcp_v4_do_rcv(sk, skb);
1720 		goto put_and_return;
1721 	}
1722 
1723 	sk_incoming_cpu_update(sk);
1724 
1725 	bh_lock_sock_nested(sk);
1726 	tcp_segs_in(tcp_sk(sk), skb);
1727 	ret = 0;
1728 	if (!sock_owned_by_user(sk)) {
1729 		ret = tcp_v4_do_rcv(sk, skb);
1730 	} else if (tcp_add_backlog(sk, skb)) {
1731 		goto discard_and_relse;
1732 	}
1733 	bh_unlock_sock(sk);
1734 
1735 put_and_return:
1736 	if (refcounted)
1737 		sock_put(sk);
1738 
1739 	return ret;
1740 
1741 no_tcp_socket:
1742 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1743 		goto discard_it;
1744 
1745 	if (tcp_checksum_complete(skb)) {
1746 csum_error:
1747 		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
1748 bad_packet:
1749 		__TCP_INC_STATS(net, TCP_MIB_INERRS);
1750 	} else {
1751 		tcp_v4_send_reset(NULL, skb);
1752 	}
1753 
1754 discard_it:
1755 	/* Discard frame. */
1756 	kfree_skb(skb);
1757 	return 0;
1758 
1759 discard_and_relse:
1760 	sk_drops_add(sk, skb);
1761 	if (refcounted)
1762 		sock_put(sk);
1763 	goto discard_it;
1764 
1765 do_time_wait:
1766 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1767 		inet_twsk_put(inet_twsk(sk));
1768 		goto discard_it;
1769 	}
1770 
1771 	if (tcp_checksum_complete(skb)) {
1772 		inet_twsk_put(inet_twsk(sk));
1773 		goto csum_error;
1774 	}
1775 	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1776 	case TCP_TW_SYN: {
1777 		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1778 							&tcp_hashinfo, skb,
1779 							__tcp_hdrlen(th),
1780 							iph->saddr, th->source,
1781 							iph->daddr, th->dest,
1782 							inet_iif(skb),
1783 							sdif);
1784 		if (sk2) {
1785 			inet_twsk_deschedule_put(inet_twsk(sk));
1786 			sk = sk2;
1787 			refcounted = false;
1788 			goto process;
1789 		}
1790 	}
1791 		/* to ACK */
1792 		/* fall through */
1793 	case TCP_TW_ACK:
1794 		tcp_v4_timewait_ack(sk, skb);
1795 		break;
1796 	case TCP_TW_RST:
1797 		tcp_v4_send_reset(sk, skb);
1798 		inet_twsk_deschedule_put(inet_twsk(sk));
1799 		goto discard_it;
1800 	case TCP_TW_SUCCESS:;
1801 	}
1802 	goto discard_it;
1803 }
1804 
1805 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1806 	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
1807 	.twsk_unique	= tcp_twsk_unique,
1808 	.twsk_destructor= tcp_twsk_destructor,
1809 };
1810 
1811 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
1812 {
1813 	struct dst_entry *dst = skb_dst(skb);
1814 
1815 	if (dst && dst_hold_safe(dst)) {
1816 		sk->sk_rx_dst = dst;
1817 		inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
1818 	}
1819 }
1820 EXPORT_SYMBOL(inet_sk_rx_dst_set);
1821 
1822 const struct inet_connection_sock_af_ops ipv4_specific = {
1823 	.queue_xmit	   = ip_queue_xmit,
1824 	.send_check	   = tcp_v4_send_check,
1825 	.rebuild_header	   = inet_sk_rebuild_header,
1826 	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
1827 	.conn_request	   = tcp_v4_conn_request,
1828 	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
1829 	.net_header_len	   = sizeof(struct iphdr),
1830 	.setsockopt	   = ip_setsockopt,
1831 	.getsockopt	   = ip_getsockopt,
1832 	.addr2sockaddr	   = inet_csk_addr2sockaddr,
1833 	.sockaddr_len	   = sizeof(struct sockaddr_in),
1834 #ifdef CONFIG_COMPAT
1835 	.compat_setsockopt = compat_ip_setsockopt,
1836 	.compat_getsockopt = compat_ip_getsockopt,
1837 #endif
1838 	.mtu_reduced	   = tcp_v4_mtu_reduced,
1839 };
1840 EXPORT_SYMBOL(ipv4_specific);
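
/*
 * Illustrative dispatch sketch (not verbatim kernel code): ipv4_specific is
 * the per-address-family vtable that tcp_v4_init_sock() below installs as
 * inet_csk(sk)->icsk_af_ops.  The AF-independent parts of TCP dispatch
 * through it instead of calling IPv4 helpers directly, roughly:
 *
 *	const struct inet_connection_sock_af_ops *ops =
 *		inet_csk(sk)->icsk_af_ops;
 *
 *	ops->send_check(sk, skb);	// tcp_v4_send_check() here
 *	ops->queue_xmit(sk, skb, fl);	// ip_queue_xmit() here
 *
 * The IPv6 code installs an equivalent ipv6_specific table, which is what
 * keeps tcp_input.c and tcp_output.c address-family agnostic.
 */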
1841 
1842 #ifdef CONFIG_TCP_MD5SIG
1843 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1844 	.md5_lookup		= tcp_v4_md5_lookup,
1845 	.calc_md5_hash		= tcp_v4_md5_hash_skb,
1846 	.md5_parse		= tcp_v4_parse_md5_keys,
1847 };
1848 #endif
1849 
1850 /* NOTE: A lot of things are set to zero explicitly by the call to
1851  *       sk_alloc(), so they need not be done here.
1852  */
1853 static int tcp_v4_init_sock(struct sock *sk)
1854 {
1855 	struct inet_connection_sock *icsk = inet_csk(sk);
1856 
1857 	tcp_init_sock(sk);
1858 
1859 	icsk->icsk_af_ops = &ipv4_specific;
1860 
1861 #ifdef CONFIG_TCP_MD5SIG
1862 	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
1863 #endif
1864 
1865 	return 0;
1866 }
1867 
1868 void tcp_v4_destroy_sock(struct sock *sk)
1869 {
1870 	struct tcp_sock *tp = tcp_sk(sk);
1871 
1872 	trace_tcp_destroy_sock(sk);
1873 
1874 	tcp_clear_xmit_timers(sk);
1875 
1876 	tcp_cleanup_congestion_control(sk);
1877 
1878 	tcp_cleanup_ulp(sk);
1879 
1880 	/* Clean up the write buffer. */
1881 	tcp_write_queue_purge(sk);
1882 
1883 	/* Check if we want to disable active TFO */
1884 	tcp_fastopen_active_disable_ofo_check(sk);
1885 
1886 	/* Cleans up our, hopefully empty, out_of_order_queue. */
1887 	skb_rbtree_purge(&tp->out_of_order_queue);
1888 
1889 #ifdef CONFIG_TCP_MD5SIG
1890 	/* Clean up the MD5 key list, if any */
1891 	if (tp->md5sig_info) {
1892 		tcp_clear_md5_list(sk);
1893 		kfree_rcu(tp->md5sig_info, rcu);
1894 		tp->md5sig_info = NULL;
1895 	}
1896 #endif
1897 
1898 	/* Clean up a referenced TCP bind bucket. */
1899 	if (inet_csk(sk)->icsk_bind_hash)
1900 		inet_put_port(sk);
1901 
1902 	BUG_ON(tp->fastopen_rsk);
1903 
1904 	/* If socket is aborted during connect operation */
1905 	tcp_free_fastopen_req(tp);
1906 	tcp_fastopen_destroy_cipher(sk);
1907 	tcp_saved_syn_free(tp);
1908 
1909 	sk_sockets_allocated_dec(sk);
1910 }
1911 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1912 
1913 #ifdef CONFIG_PROC_FS
1914 /* Proc filesystem TCP sock list dumping. */
1915 
1916 /*
1917  * Get the next listener socket following cur.  If cur is NULL, get the
1918  * first socket, starting from the bucket given in st->bucket; when
1919  * st->bucket is zero, the very first socket in the hash table is returned.
1920  */
1921 static void *listening_get_next(struct seq_file *seq, void *cur)
1922 {
1923 	struct tcp_iter_state *st = seq->private;
1924 	struct net *net = seq_file_net(seq);
1925 	struct inet_listen_hashbucket *ilb;
1926 	struct sock *sk = cur;
1927 
1928 	if (!sk) {
1929 get_head:
1930 		ilb = &tcp_hashinfo.listening_hash[st->bucket];
1931 		spin_lock(&ilb->lock);
1932 		sk = sk_head(&ilb->head);
1933 		st->offset = 0;
1934 		goto get_sk;
1935 	}
1936 	ilb = &tcp_hashinfo.listening_hash[st->bucket];
1937 	++st->num;
1938 	++st->offset;
1939 
1940 	sk = sk_next(sk);
1941 get_sk:
1942 	sk_for_each_from(sk) {
1943 		if (!net_eq(sock_net(sk), net))
1944 			continue;
1945 		if (sk->sk_family == st->family)
1946 			return sk;
1947 	}
1948 	spin_unlock(&ilb->lock);
1949 	st->offset = 0;
1950 	if (++st->bucket < INET_LHTABLE_SIZE)
1951 		goto get_head;
1952 	return NULL;
1953 }
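
/*
 * The same walk written out in one piece (illustrative sketch only; the
 * real iterator above must be able to stop and resume between reads):
 *
 *	for (bucket = 0; bucket < INET_LHTABLE_SIZE; bucket++) {
 *		ilb = &tcp_hashinfo.listening_hash[bucket];
 *		spin_lock(&ilb->lock);
 *		sk_for_each(sk, &ilb->head)
 *			if (sk->sk_family == st->family &&
 *			    net_eq(sock_net(sk), net))
 *				emit(sk);
 *		spin_unlock(&ilb->lock);
 *	}
 *
 * Note that listening_get_next() returns with ilb->lock held whenever it
 * yields a socket; tcp_seq_stop() below is what eventually drops it.
 */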
1954 
1955 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1956 {
1957 	struct tcp_iter_state *st = seq->private;
1958 	void *rc;
1959 
1960 	st->bucket = 0;
1961 	st->offset = 0;
1962 	rc = listening_get_next(seq, NULL);
1963 
1964 	while (rc && *pos) {
1965 		rc = listening_get_next(seq, rc);
1966 		--*pos;
1967 	}
1968 	return rc;
1969 }
1970 
1971 static inline bool empty_bucket(const struct tcp_iter_state *st)
1972 {
1973 	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
1974 }
1975 
1976 /*
1977  * Get the first established socket, starting from the bucket given in
1978  * st->bucket; if it is zero, the very first socket in the hash is returned.
1979  */
1980 static void *established_get_first(struct seq_file *seq)
1981 {
1982 	struct tcp_iter_state *st = seq->private;
1983 	struct net *net = seq_file_net(seq);
1984 	void *rc = NULL;
1985 
1986 	st->offset = 0;
1987 	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
1988 		struct sock *sk;
1989 		struct hlist_nulls_node *node;
1990 		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
1991 
1992 		/* Lockless fast path for the common case of empty buckets */
1993 		if (empty_bucket(st))
1994 			continue;
1995 
1996 		spin_lock_bh(lock);
1997 		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
1998 			if (sk->sk_family != st->family ||
1999 			    !net_eq(sock_net(sk), net)) {
2000 				continue;
2001 			}
2002 			rc = sk;
2003 			goto out;
2004 		}
2005 		spin_unlock_bh(lock);
2006 	}
2007 out:
2008 	return rc;
2009 }
2010 
2011 static void *established_get_next(struct seq_file *seq, void *cur)
2012 {
2013 	struct sock *sk = cur;
2014 	struct hlist_nulls_node *node;
2015 	struct tcp_iter_state *st = seq->private;
2016 	struct net *net = seq_file_net(seq);
2017 
2018 	++st->num;
2019 	++st->offset;
2020 
2021 	sk = sk_nulls_next(sk);
2022 
2023 	sk_nulls_for_each_from(sk, node) {
2024 		if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2025 			return sk;
2026 	}
2027 
2028 	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2029 	++st->bucket;
2030 	return established_get_first(seq);
2031 }
2032 
2033 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2034 {
2035 	struct tcp_iter_state *st = seq->private;
2036 	void *rc;
2037 
2038 	st->bucket = 0;
2039 	rc = established_get_first(seq);
2040 
2041 	while (rc && pos) {
2042 		rc = established_get_next(seq, rc);
2043 		--pos;
2044 	}
2045 	return rc;
2046 }
2047 
2048 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2049 {
2050 	void *rc;
2051 	struct tcp_iter_state *st = seq->private;
2052 
2053 	st->state = TCP_SEQ_STATE_LISTENING;
2054 	rc	  = listening_get_idx(seq, &pos);
2055 
2056 	if (!rc) {
2057 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2058 		rc	  = established_get_idx(seq, pos);
2059 	}
2060 
2061 	return rc;
2062 }
2063 
2064 static void *tcp_seek_last_pos(struct seq_file *seq)
2065 {
2066 	struct tcp_iter_state *st = seq->private;
2067 	int offset = st->offset;
2068 	int orig_num = st->num;
2069 	void *rc = NULL;
2070 
2071 	switch (st->state) {
2072 	case TCP_SEQ_STATE_LISTENING:
2073 		if (st->bucket >= INET_LHTABLE_SIZE)
2074 			break;
2075 		st->state = TCP_SEQ_STATE_LISTENING;
2076 		rc = listening_get_next(seq, NULL);
2077 		while (offset-- && rc)
2078 			rc = listening_get_next(seq, rc);
2079 		if (rc)
2080 			break;
2081 		st->bucket = 0;
2082 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2083 		/* Fallthrough */
2084 	case TCP_SEQ_STATE_ESTABLISHED:
2085 		if (st->bucket > tcp_hashinfo.ehash_mask)
2086 			break;
2087 		rc = established_get_first(seq);
2088 		while (offset-- && rc)
2089 			rc = established_get_next(seq, rc);
2090 	}
2091 
2092 	st->num = orig_num;
2093 
2094 	return rc;
2095 }
2096 
2097 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2098 {
2099 	struct tcp_iter_state *st = seq->private;
2100 	void *rc;
2101 
2102 	if (*pos && *pos == st->last_pos) {
2103 		rc = tcp_seek_last_pos(seq);
2104 		if (rc)
2105 			goto out;
2106 	}
2107 
2108 	st->state = TCP_SEQ_STATE_LISTENING;
2109 	st->num = 0;
2110 	st->bucket = 0;
2111 	st->offset = 0;
2112 	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2113 
2114 out:
2115 	st->last_pos = *pos;
2116 	return rc;
2117 }
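
/*
 * Why the last_pos fast path above matters (a sketch of the usual seq_file
 * read loop): start() is invoked once per read() chunk with the position
 * reached so far.  Without the saved st->bucket/st->offset, a dump of N
 * sockets read in chunks of k lines would rescan the hash tables from the
 * beginning on every chunk -- roughly N/k full passes.  With it, only a
 * position that does not match st->last_pos falls back to the O(pos)
 * rescan in tcp_get_idx().
 */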
2118 
2119 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2120 {
2121 	struct tcp_iter_state *st = seq->private;
2122 	void *rc = NULL;
2123 
2124 	if (v == SEQ_START_TOKEN) {
2125 		rc = tcp_get_idx(seq, 0);
2126 		goto out;
2127 	}
2128 
2129 	switch (st->state) {
2130 	case TCP_SEQ_STATE_LISTENING:
2131 		rc = listening_get_next(seq, v);
2132 		if (!rc) {
2133 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2134 			st->bucket = 0;
2135 			st->offset = 0;
2136 			rc	  = established_get_first(seq);
2137 		}
2138 		break;
2139 	case TCP_SEQ_STATE_ESTABLISHED:
2140 		rc = established_get_next(seq, v);
2141 		break;
2142 	}
2143 out:
2144 	++*pos;
2145 	st->last_pos = *pos;
2146 	return rc;
2147 }
2148 
2149 static void tcp_seq_stop(struct seq_file *seq, void *v)
2150 {
2151 	struct tcp_iter_state *st = seq->private;
2152 
2153 	switch (st->state) {
2154 	case TCP_SEQ_STATE_LISTENING:
2155 		if (v != SEQ_START_TOKEN)
2156 			spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
2157 		break;
2158 	case TCP_SEQ_STATE_ESTABLISHED:
2159 		if (v)
2160 			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2161 		break;
2162 	}
2163 }
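
/*
 * Lock hand-off across the seq_file callbacks (standard seq_file semantics
 * assumed): the get_first/get_next helpers return with the current bucket
 * lock still held so that ->show() can safely dereference the socket, and
 * tcp_seq_stop() is the point where that lock is finally released:
 *
 *	start()  -> lock a bucket, return sk
 *	show(sk) -> format one line (bucket lock still held)
 *	next()   -> advance; unlock/relock only when crossing buckets
 *	stop()   -> drop whichever bucket lock is still held
 */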
2164 
2165 int tcp_seq_open(struct inode *inode, struct file *file)
2166 {
2167 	struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
2168 	struct tcp_iter_state *s;
2169 	int err;
2170 
2171 	err = seq_open_net(inode, file, &afinfo->seq_ops,
2172 			  sizeof(struct tcp_iter_state));
2173 	if (err < 0)
2174 		return err;
2175 
2176 	s = ((struct seq_file *)file->private_data)->private;
2177 	s->family		= afinfo->family;
2178 	s->last_pos		= 0;
2179 	return 0;
2180 }
2181 EXPORT_SYMBOL(tcp_seq_open);
2182 
2183 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2184 {
2185 	int rc = 0;
2186 	struct proc_dir_entry *p;
2187 
2188 	afinfo->seq_ops.start		= tcp_seq_start;
2189 	afinfo->seq_ops.next		= tcp_seq_next;
2190 	afinfo->seq_ops.stop		= tcp_seq_stop;
2191 
2192 	p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2193 			     afinfo->seq_fops, afinfo);
2194 	if (!p)
2195 		rc = -ENOMEM;
2196 	return rc;
2197 }
2198 EXPORT_SYMBOL(tcp_proc_register);
2199 
2200 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2201 {
2202 	remove_proc_entry(afinfo->name, net->proc_net);
2203 }
2204 EXPORT_SYMBOL(tcp_proc_unregister);
2205 
2206 static void get_openreq4(const struct request_sock *req,
2207 			 struct seq_file *f, int i)
2208 {
2209 	const struct inet_request_sock *ireq = inet_rsk(req);
2210 	long delta = req->rsk_timer.expires - jiffies;
2211 
2212 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2213 		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2214 		i,
2215 		ireq->ir_loc_addr,
2216 		ireq->ir_num,
2217 		ireq->ir_rmt_addr,
2218 		ntohs(ireq->ir_rmt_port),
2219 		TCP_SYN_RECV,
2220 		0, 0, /* could print option size, but that is af dependent. */
2221 		1,    /* timers active (only the expire timer) */
2222 		jiffies_delta_to_clock_t(delta),
2223 		req->num_timeout,
2224 		from_kuid_munged(seq_user_ns(f),
2225 				 sock_i_uid(req->rsk_listener)),
2226 		0,  /* non standard timer */
2227 		0, /* open_requests have no inode */
2228 		0,
2229 		req);
2230 }
2231 
2232 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2233 {
2234 	int timer_active;
2235 	unsigned long timer_expires;
2236 	const struct tcp_sock *tp = tcp_sk(sk);
2237 	const struct inet_connection_sock *icsk = inet_csk(sk);
2238 	const struct inet_sock *inet = inet_sk(sk);
2239 	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2240 	__be32 dest = inet->inet_daddr;
2241 	__be32 src = inet->inet_rcv_saddr;
2242 	__u16 destp = ntohs(inet->inet_dport);
2243 	__u16 srcp = ntohs(inet->inet_sport);
2244 	int rx_queue;
2245 	int state;
2246 
2247 	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2248 	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2249 	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2250 		timer_active	= 1;
2251 		timer_expires	= icsk->icsk_timeout;
2252 	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2253 		timer_active	= 4;
2254 		timer_expires	= icsk->icsk_timeout;
2255 	} else if (timer_pending(&sk->sk_timer)) {
2256 		timer_active	= 2;
2257 		timer_expires	= sk->sk_timer.expires;
2258 	} else {
2259 		timer_active	= 0;
2260 		timer_expires = jiffies;
2261 	}
2262 
2263 	state = sk_state_load(sk);
2264 	if (state == TCP_LISTEN)
2265 		rx_queue = sk->sk_ack_backlog;
2266 	else
2267 		/* Because we don't lock the socket,
2268 		 * we might find a transient negative value.
2269 		 */
2270 		rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2271 
2272 	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2273 			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2274 		i, src, srcp, dest, destp, state,
2275 		tp->write_seq - tp->snd_una,
2276 		rx_queue,
2277 		timer_active,
2278 		jiffies_delta_to_clock_t(timer_expires - jiffies),
2279 		icsk->icsk_retransmits,
2280 		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2281 		icsk->icsk_probes_out,
2282 		sock_i_ino(sk),
2283 		refcount_read(&sk->sk_refcnt), sk,
2284 		jiffies_to_clock_t(icsk->icsk_rto),
2285 		jiffies_to_clock_t(icsk->icsk_ack.ato),
2286 		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2287 		tp->snd_cwnd,
2288 		state == TCP_LISTEN ?
2289 		    fastopenq->max_qlen :
2290 		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2291 }
2292 
2293 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2294 			       struct seq_file *f, int i)
2295 {
2296 	long delta = tw->tw_timer.expires - jiffies;
2297 	__be32 dest, src;
2298 	__u16 destp, srcp;
2299 
2300 	dest  = tw->tw_daddr;
2301 	src   = tw->tw_rcv_saddr;
2302 	destp = ntohs(tw->tw_dport);
2303 	srcp  = ntohs(tw->tw_sport);
2304 
2305 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2306 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2307 		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2308 		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2309 		refcount_read(&tw->tw_refcnt), tw);
2310 }
2311 
2312 #define TMPSZ 150
2313 
2314 static int tcp4_seq_show(struct seq_file *seq, void *v)
2315 {
2316 	struct tcp_iter_state *st;
2317 	struct sock *sk = v;
2318 
2319 	seq_setwidth(seq, TMPSZ - 1);
2320 	if (v == SEQ_START_TOKEN) {
2321 		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2322 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2323 			   "inode");
2324 		goto out;
2325 	}
2326 	st = seq->private;
2327 
2328 	if (sk->sk_state == TCP_TIME_WAIT)
2329 		get_timewait4_sock(v, seq, st->num);
2330 	else if (sk->sk_state == TCP_NEW_SYN_RECV)
2331 		get_openreq4(v, seq, st->num);
2332 	else
2333 		get_tcp4_sock(v, seq, st->num);
2334 out:
2335 	seq_pad(seq, '\n');
2336 	return 0;
2337 }
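
/*
 * Example of the resulting /proc/net/tcp line for a socket listening on
 * 127.0.0.1:22, as the format strings above would render it on a
 * little-endian machine (addresses and ports are raw hex: the __be32 for
 * 127.0.0.1 prints as 0100007F, port 22 as 0016; state 0A is TCP_LISTEN;
 * the inode and pointer values here are made up for illustration):
 *
 *    0: 0100007F:0016 00000000:0000 0A 00000000:00000000 00:00000000 00000000     0        0 12345 1 0000000000000000 100 0 0 10 0
 */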
2338 
2339 static const struct file_operations tcp_afinfo_seq_fops = {
2340 	.owner   = THIS_MODULE,
2341 	.open    = tcp_seq_open,
2342 	.read    = seq_read,
2343 	.llseek  = seq_lseek,
2344 	.release = seq_release_net
2345 };
2346 
2347 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2348 	.name		= "tcp",
2349 	.family		= AF_INET,
2350 	.seq_fops	= &tcp_afinfo_seq_fops,
2351 	.seq_ops	= {
2352 		.show		= tcp4_seq_show,
2353 	},
2354 };
2355 
2356 static int __net_init tcp4_proc_init_net(struct net *net)
2357 {
2358 	return tcp_proc_register(net, &tcp4_seq_afinfo);
2359 }
2360 
2361 static void __net_exit tcp4_proc_exit_net(struct net *net)
2362 {
2363 	tcp_proc_unregister(net, &tcp4_seq_afinfo);
2364 }
2365 
2366 static struct pernet_operations tcp4_net_ops = {
2367 	.init = tcp4_proc_init_net,
2368 	.exit = tcp4_proc_exit_net,
2369 };
2370 
2371 int __init tcp4_proc_init(void)
2372 {
2373 	return register_pernet_subsys(&tcp4_net_ops);
2374 }
2375 
2376 void tcp4_proc_exit(void)
2377 {
2378 	unregister_pernet_subsys(&tcp4_net_ops);
2379 }
2380 #endif /* CONFIG_PROC_FS */
2381 
2382 struct proto tcp_prot = {
2383 	.name			= "TCP",
2384 	.owner			= THIS_MODULE,
2385 	.close			= tcp_close,
2386 	.connect		= tcp_v4_connect,
2387 	.disconnect		= tcp_disconnect,
2388 	.accept			= inet_csk_accept,
2389 	.ioctl			= tcp_ioctl,
2390 	.init			= tcp_v4_init_sock,
2391 	.destroy		= tcp_v4_destroy_sock,
2392 	.shutdown		= tcp_shutdown,
2393 	.setsockopt		= tcp_setsockopt,
2394 	.getsockopt		= tcp_getsockopt,
2395 	.keepalive		= tcp_set_keepalive,
2396 	.recvmsg		= tcp_recvmsg,
2397 	.sendmsg		= tcp_sendmsg,
2398 	.sendpage		= tcp_sendpage,
2399 	.backlog_rcv		= tcp_v4_do_rcv,
2400 	.release_cb		= tcp_release_cb,
2401 	.hash			= inet_hash,
2402 	.unhash			= inet_unhash,
2403 	.get_port		= inet_csk_get_port,
2404 	.enter_memory_pressure	= tcp_enter_memory_pressure,
2405 	.leave_memory_pressure	= tcp_leave_memory_pressure,
2406 	.stream_memory_free	= tcp_stream_memory_free,
2407 	.sockets_allocated	= &tcp_sockets_allocated,
2408 	.orphan_count		= &tcp_orphan_count,
2409 	.memory_allocated	= &tcp_memory_allocated,
2410 	.memory_pressure	= &tcp_memory_pressure,
2411 	.sysctl_mem		= sysctl_tcp_mem,
2412 	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
2413 	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
2414 	.max_header		= MAX_TCP_HEADER,
2415 	.obj_size		= sizeof(struct tcp_sock),
2416 	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
2417 	.twsk_prot		= &tcp_timewait_sock_ops,
2418 	.rsk_prot		= &tcp_request_sock_ops,
2419 	.h.hashinfo		= &tcp_hashinfo,
2420 	.no_autobind		= true,
2421 #ifdef CONFIG_COMPAT
2422 	.compat_setsockopt	= compat_tcp_setsockopt,
2423 	.compat_getsockopt	= compat_tcp_getsockopt,
2424 #endif
2425 	.diag_destroy		= tcp_abort,
2426 };
2427 EXPORT_SYMBOL(tcp_prot);
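
/*
 * How this proto gets wired up (see net/ipv4/af_inet.c; entry abbreviated):
 * inet_init() registers the slab caches with proto_register(&tcp_prot, 1)
 * and then publishes the SOCK_STREAM/IPPROTO_TCP inet_protosw entry, so a
 * socket(AF_INET, SOCK_STREAM, 0) call ends up on this operations table:
 *
 *	static struct inet_protosw inetsw_array[] = {
 *		{
 *			.type     = SOCK_STREAM,
 *			.protocol = IPPROTO_TCP,
 *			.prot     = &tcp_prot,
 *			.ops      = &inet_stream_ops,
 *			...
 *		},
 *		...
 *	};
 */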
2428 
2429 static void __net_exit tcp_sk_exit(struct net *net)
2430 {
2431 	int cpu;
2432 
2433 	module_put(net->ipv4.tcp_congestion_control->owner);
2434 
2435 	for_each_possible_cpu(cpu)
2436 		inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2437 	free_percpu(net->ipv4.tcp_sk);
2438 }
2439 
2440 static int __net_init tcp_sk_init(struct net *net)
2441 {
2442 	int res, cpu, cnt;
2443 
2444 	net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2445 	if (!net->ipv4.tcp_sk)
2446 		return -ENOMEM;
2447 
2448 	for_each_possible_cpu(cpu) {
2449 		struct sock *sk;
2450 
2451 		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2452 					   IPPROTO_TCP, net);
2453 		if (res)
2454 			goto fail;
2455 		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2456 		*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2457 	}
2458 
2459 	net->ipv4.sysctl_tcp_ecn = 2;
2460 	net->ipv4.sysctl_tcp_ecn_fallback = 1;
2461 
2462 	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2463 	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2464 	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2465 
2466 	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2467 	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2468 	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2469 
2470 	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2471 	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2472 	net->ipv4.sysctl_tcp_syncookies = 1;
2473 	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2474 	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2475 	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2476 	net->ipv4.sysctl_tcp_orphan_retries = 0;
2477 	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2478 	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
2479 	net->ipv4.sysctl_tcp_tw_reuse = 0;
2480 
2481 	cnt = tcp_hashinfo.ehash_mask + 1;
2482 	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = (cnt + 1) / 2;
2483 	net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
2484 
2485 	net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 256);
2486 	net->ipv4.sysctl_tcp_sack = 1;
2487 	net->ipv4.sysctl_tcp_window_scaling = 1;
2488 	net->ipv4.sysctl_tcp_timestamps = 1;
2489 	net->ipv4.sysctl_tcp_early_retrans = 3;
2490 	net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
2491 	net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
2492 	net->ipv4.sysctl_tcp_retrans_collapse = 1;
2493 	net->ipv4.sysctl_tcp_max_reordering = 300;
2494 	net->ipv4.sysctl_tcp_dsack = 1;
2495 	net->ipv4.sysctl_tcp_app_win = 31;
2496 	net->ipv4.sysctl_tcp_adv_win_scale = 1;
2497 	net->ipv4.sysctl_tcp_frto = 2;
2498 	net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
2499 	/* This limits the percentage of the congestion window which we
2500 	 * will allow a single TSO frame to consume.  Building TSO frames
2501 	 * which are too large can cause TCP streams to be bursty.
2502 	 */
2503 	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
2504 	/* Default TSQ limit of four TSO segments */
2505 	net->ipv4.sysctl_tcp_limit_output_bytes = 262144;
2506 	/* rfc5961 challenge ack rate limiting */
2507 	/* RFC 5961 challenge ACK rate limiting */
2508 	net->ipv4.sysctl_tcp_min_tso_segs = 2;
2509 	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
2510 	net->ipv4.sysctl_tcp_autocorking = 1;
2511 	net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
2512 	net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
2513 	net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
2514 	if (net != &init_net) {
2515 		memcpy(net->ipv4.sysctl_tcp_rmem,
2516 		       init_net.ipv4.sysctl_tcp_rmem,
2517 		       sizeof(init_net.ipv4.sysctl_tcp_rmem));
2518 		memcpy(net->ipv4.sysctl_tcp_wmem,
2519 		       init_net.ipv4.sysctl_tcp_wmem,
2520 		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
2521 	}
2522 	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
2523 	spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
2524 	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
2525 	atomic_set(&net->ipv4.tfo_active_disable_times, 0);
2526 
2527 	/* Reno is always built in */
2528 	if (!net_eq(net, &init_net) &&
2529 	    try_module_get(init_net.ipv4.tcp_congestion_control->owner))
2530 		net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
2531 	else
2532 		net->ipv4.tcp_congestion_control = &tcp_reno;
2533 
2534 	return 0;
2535 fail:
2536 	tcp_sk_exit(net);
2537 
2538 	return res;
2539 }
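
/*
 * The defaults set in tcp_sk_init() back the per-namespace knobs exposed
 * under /proc/sys/net/ipv4/ (the table lives in net/ipv4/sysctl_net_ipv4.c).
 * A minimal userspace sketch for flipping one of them, assuming the usual
 * procfs path:
 *
 *	int fd = open("/proc/sys/net/ipv4/tcp_syncookies", O_WRONLY);
 *
 *	if (fd >= 0) {
 *		write(fd, "1", 1);	// same value as the default above
 *		close(fd);
 *	}
 */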
2540 
2541 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2542 {
2543 	struct net *net;
2544 
2545 	inet_twsk_purge(&tcp_hashinfo, AF_INET);
2546 
2547 	list_for_each_entry(net, net_exit_list, exit_list)
2548 		tcp_fastopen_ctx_destroy(net);
2549 }
2550 
2551 static struct pernet_operations __net_initdata tcp_sk_ops = {
2552        .init	   = tcp_sk_init,
2553        .exit	   = tcp_sk_exit,
2554        .exit_batch = tcp_sk_exit_batch,
2555 };
2556 
2557 void __init tcp_v4_init(void)
2558 {
2559 	if (register_pernet_subsys(&tcp_sk_ops))
2560 		panic("Failed to create the TCP control socket.\n");
2561 }
2562