xref: /openbmc/linux/net/ipv4/tcp_ipv4.c (revision e0f6d1a5)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		Implementation of the Transmission Control Protocol(TCP).
7  *
8  *		IPv4 specific functions
9  *
10  *
11  *		code split from:
12  *		linux/ipv4/tcp.c
13  *		linux/ipv4/tcp_input.c
14  *		linux/ipv4/tcp_output.c
15  *
16  *		See tcp.c for author information
17  *
18  *	This program is free software; you can redistribute it and/or
19  *      modify it under the terms of the GNU General Public License
20  *      as published by the Free Software Foundation; either version
21  *      2 of the License, or (at your option) any later version.
22  */
23 
24 /*
25  * Changes:
26  *		David S. Miller	:	New socket lookup architecture.
27  *					This code is dedicated to John Dyson.
28  *		David S. Miller :	Change semantics of established hash,
29  *					half is devoted to TIME_WAIT sockets
30  *					and the rest go in the other half.
31  *		Andi Kleen :		Add support for syncookies and fixed
32  *					some bugs: ip options weren't passed to
33  *					the TCP layer, missed a check for an
34  *					ACK bit.
35  *		Andi Kleen :		Implemented fast path mtu discovery.
36  *	     				Fixed many serious bugs in the
37  *					request_sock handling and moved
38  *					most of it into the af independent code.
39  *					Added tail drop and some other bugfixes.
40  *					Added new listen semantics.
41  *		Mike McLagan	:	Routing by source
42  *	Juan Jose Ciarlante:		ip_dynaddr bits
43  *		Andi Kleen:		various fixes.
44  *	Vitaly E. Lavrov	:	Transparent proxy revived after year
45  *					coma.
46  *	Andi Kleen		:	Fix new listen.
47  *	Andi Kleen		:	Fix accept error reporting.
48  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
49  *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
50  *					a single port at the same time.
51  */
52 
53 #define pr_fmt(fmt) "TCP: " fmt
54 
55 #include <linux/bottom_half.h>
56 #include <linux/types.h>
57 #include <linux/fcntl.h>
58 #include <linux/module.h>
59 #include <linux/random.h>
60 #include <linux/cache.h>
61 #include <linux/jhash.h>
62 #include <linux/init.h>
63 #include <linux/times.h>
64 #include <linux/slab.h>
65 
66 #include <net/net_namespace.h>
67 #include <net/icmp.h>
68 #include <net/inet_hashtables.h>
69 #include <net/tcp.h>
70 #include <net/transp_v6.h>
71 #include <net/ipv6.h>
72 #include <net/inet_common.h>
73 #include <net/timewait_sock.h>
74 #include <net/xfrm.h>
75 #include <net/secure_seq.h>
76 #include <net/busy_poll.h>
77 
78 #include <linux/inet.h>
79 #include <linux/ipv6.h>
80 #include <linux/stddef.h>
81 #include <linux/proc_fs.h>
82 #include <linux/seq_file.h>
83 #include <linux/inetdevice.h>
84 
85 #include <crypto/hash.h>
86 #include <linux/scatterlist.h>
87 
88 #include <trace/events/tcp.h>
89 
90 #ifdef CONFIG_TCP_MD5SIG
91 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
92 			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
93 #endif
94 
95 struct inet_hashinfo tcp_hashinfo;
96 EXPORT_SYMBOL(tcp_hashinfo);
97 
98 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
99 {
100 	return secure_tcp_seq(ip_hdr(skb)->daddr,
101 			      ip_hdr(skb)->saddr,
102 			      tcp_hdr(skb)->dest,
103 			      tcp_hdr(skb)->source);
104 }
105 
106 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
107 {
108 	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
109 }
110 
111 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
112 {
113 	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
114 	struct tcp_sock *tp = tcp_sk(sk);
115 
116 	/* With PAWS, it is safe from the viewpoint
117 	   of data integrity. Even without PAWS it is safe provided sequence
118 	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.
119 
120 	   Actually, the idea is close to VJ's: only the timestamp cache is
121 	   held not per host but per port pair, and the TW bucket is used as the
122 	   state holder.
123 
124 	   If the TW bucket has already been destroyed we fall back to VJ's scheme
125 	   and use the initial timestamp retrieved from the peer table.
126 	 */
127 	if (tcptw->tw_ts_recent_stamp &&
128 	    (!twp || (sock_net(sk)->ipv4.sysctl_tcp_tw_reuse &&
129 			     get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
130 		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
131 		if (tp->write_seq == 0)
132 			tp->write_seq = 1;
133 		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
134 		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
135 		sock_hold(sktw);
136 		return 1;
137 	}
138 
139 	return 0;
140 }
141 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
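
/*
 * Illustrative note (not part of the original file): the "+ 65535 + 2"
 * above pushes the new connection's first sequence number well past the
 * old tw_snd_nxt, e.g. tw_snd_nxt == 0x00001000 yields
 * write_seq == 0x00011001 (mod 2^32).  A result of 0 is bumped to 1
 * because write_seq == 0 is treated as "not set" and would make
 * tcp_v4_connect() pick a fresh random ISN instead.
 */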
142 
143 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
144 			      int addr_len)
145 {
146 	/* This check is replicated from tcp_v4_connect() and intended to
147 	 * prevent the BPF program called below from accessing bytes that are
148 	 * outside the bound specified by the user in addr_len.
149 	 */
150 	if (addr_len < sizeof(struct sockaddr_in))
151 		return -EINVAL;
152 
153 	sock_owned_by_me(sk);
154 
155 	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
156 }
157 
158 /* This will initiate an outgoing connection. */
159 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
160 {
161 	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
162 	struct inet_sock *inet = inet_sk(sk);
163 	struct tcp_sock *tp = tcp_sk(sk);
164 	__be16 orig_sport, orig_dport;
165 	__be32 daddr, nexthop;
166 	struct flowi4 *fl4;
167 	struct rtable *rt;
168 	int err;
169 	struct ip_options_rcu *inet_opt;
170 	struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
171 
172 	if (addr_len < sizeof(struct sockaddr_in))
173 		return -EINVAL;
174 
175 	if (usin->sin_family != AF_INET)
176 		return -EAFNOSUPPORT;
177 
178 	nexthop = daddr = usin->sin_addr.s_addr;
179 	inet_opt = rcu_dereference_protected(inet->inet_opt,
180 					     lockdep_sock_is_held(sk));
181 	if (inet_opt && inet_opt->opt.srr) {
182 		if (!daddr)
183 			return -EINVAL;
184 		nexthop = inet_opt->opt.faddr;
185 	}
186 
187 	orig_sport = inet->inet_sport;
188 	orig_dport = usin->sin_port;
189 	fl4 = &inet->cork.fl.u.ip4;
190 	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
191 			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
192 			      IPPROTO_TCP,
193 			      orig_sport, orig_dport, sk);
194 	if (IS_ERR(rt)) {
195 		err = PTR_ERR(rt);
196 		if (err == -ENETUNREACH)
197 			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
198 		return err;
199 	}
200 
201 	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
202 		ip_rt_put(rt);
203 		return -ENETUNREACH;
204 	}
205 
206 	if (!inet_opt || !inet_opt->opt.srr)
207 		daddr = fl4->daddr;
208 
209 	if (!inet->inet_saddr)
210 		inet->inet_saddr = fl4->saddr;
211 	sk_rcv_saddr_set(sk, inet->inet_saddr);
212 
213 	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
214 		/* Reset inherited state */
215 		tp->rx_opt.ts_recent	   = 0;
216 		tp->rx_opt.ts_recent_stamp = 0;
217 		if (likely(!tp->repair))
218 			tp->write_seq	   = 0;
219 	}
220 
221 	inet->inet_dport = usin->sin_port;
222 	sk_daddr_set(sk, daddr);
223 
224 	inet_csk(sk)->icsk_ext_hdr_len = 0;
225 	if (inet_opt)
226 		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
227 
228 	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
229 
230 	/* Socket identity is still unknown (sport may be zero).
231 	 * However we set state to SYN-SENT and, without releasing the socket
232 	 * lock, select a source port, enter ourselves into the hash tables and
233 	 * complete initialization after this.
234 	 */
235 	tcp_set_state(sk, TCP_SYN_SENT);
236 	err = inet_hash_connect(tcp_death_row, sk);
237 	if (err)
238 		goto failure;
239 
240 	sk_set_txhash(sk);
241 
242 	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
243 			       inet->inet_sport, inet->inet_dport, sk);
244 	if (IS_ERR(rt)) {
245 		err = PTR_ERR(rt);
246 		rt = NULL;
247 		goto failure;
248 	}
249 	/* OK, now commit destination to socket.  */
250 	sk->sk_gso_type = SKB_GSO_TCPV4;
251 	sk_setup_caps(sk, &rt->dst);
252 	rt = NULL;
253 
254 	if (likely(!tp->repair)) {
255 		if (!tp->write_seq)
256 			tp->write_seq = secure_tcp_seq(inet->inet_saddr,
257 						       inet->inet_daddr,
258 						       inet->inet_sport,
259 						       usin->sin_port);
260 		tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
261 						 inet->inet_saddr,
262 						 inet->inet_daddr);
263 	}
264 
265 	inet->inet_id = tp->write_seq ^ jiffies;
266 
267 	if (tcp_fastopen_defer_connect(sk, &err))
268 		return err;
269 	if (err)
270 		goto failure;
271 
272 	err = tcp_connect(sk);
273 
274 	if (err)
275 		goto failure;
276 
277 	return 0;
278 
279 failure:
280 	/*
281 	 * This unhashes the socket and releases the local port,
282 	 * if necessary.
283 	 */
284 	tcp_set_state(sk, TCP_CLOSE);
285 	ip_rt_put(rt);
286 	sk->sk_route_caps = 0;
287 	inet->inet_dport = 0;
288 	return err;
289 }
290 EXPORT_SYMBOL(tcp_v4_connect);
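
/*
 * Illustrative userspace sketch (not part of the original file): a plain
 * connect() like the one below is what eventually reaches tcp_v4_connect()
 * via inet_stream_connect().  Address and port are placeholders; assumes
 * <sys/socket.h>, <netinet/in.h> and <arpa/inet.h>.
 *
 *	int fd = socket(AF_INET, SOCK_STREAM, 0);
 *	struct sockaddr_in dst = {
 *		.sin_family = AF_INET,
 *		.sin_port   = htons(80),
 *	};
 *
 *	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);
 *	if (connect(fd, (struct sockaddr *)&dst, sizeof(dst)) < 0)
 *		perror("connect");
 */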
291 
292 /*
293  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
294  * It can be called through tcp_release_cb() if socket was owned by user
295  * at the time tcp_v4_err() was called to handle ICMP message.
296  */
297 void tcp_v4_mtu_reduced(struct sock *sk)
298 {
299 	struct inet_sock *inet = inet_sk(sk);
300 	struct dst_entry *dst;
301 	u32 mtu;
302 
303 	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
304 		return;
305 	mtu = tcp_sk(sk)->mtu_info;
306 	dst = inet_csk_update_pmtu(sk, mtu);
307 	if (!dst)
308 		return;
309 
310 	/* Something is about to go wrong... Remember the soft error
311 	 * in case this connection is not able to recover.
312 	 */
313 	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
314 		sk->sk_err_soft = EMSGSIZE;
315 
316 	mtu = dst_mtu(dst);
317 
318 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
319 	    ip_sk_accept_pmtu(sk) &&
320 	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
321 		tcp_sync_mss(sk, mtu);
322 
323 		/* Resend the TCP packet because it's
324 		 * clear that the old packet has been
325 		 * dropped. This is the new "fast" path mtu
326 		 * discovery.
327 		 */
328 		tcp_simple_retransmit(sk);
329 	} /* else let the usual retransmit timer handle it */
330 }
331 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
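
/*
 * Illustrative userspace sketch (not part of the original file): the
 * inet->pmtudisc test above corresponds to the IP_MTU_DISCOVER socket
 * option; a socket configured as below (IP_PMTUDISC_DONT) opts out of
 * path MTU discovery and will not take the tcp_sync_mss() path here.
 * Assumes an existing TCP socket fd and <netinet/in.h>.
 *
 *	int val = IP_PMTUDISC_DONT;
 *	setsockopt(fd, IPPROTO_IP, IP_MTU_DISCOVER, &val, sizeof(val));
 */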
332 
333 static void do_redirect(struct sk_buff *skb, struct sock *sk)
334 {
335 	struct dst_entry *dst = __sk_dst_check(sk, 0);
336 
337 	if (dst)
338 		dst->ops->redirect(dst, sk, skb);
339 }
340 
341 
342 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
343 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
344 {
345 	struct request_sock *req = inet_reqsk(sk);
346 	struct net *net = sock_net(sk);
347 
348 	/* ICMPs are not backlogged, hence we cannot get
349 	 * an established socket here.
350 	 */
351 	if (seq != tcp_rsk(req)->snt_isn) {
352 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
353 	} else if (abort) {
354 		/*
355 		 * Still in SYN_RECV, just remove it silently.
356 		 * There is no good way to pass the error to the newly
357 		 * created socket, and POSIX does not want network
358 		 * errors returned from accept().
359 		 */
360 		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
361 		tcp_listendrop(req->rsk_listener);
362 	}
363 	reqsk_put(req);
364 }
365 EXPORT_SYMBOL(tcp_req_err);
366 
367 /*
368  * This routine is called by the ICMP module when it gets some
369  * sort of error condition.  If err < 0 then the socket should
370  * be closed and the error returned to the user.  If err > 0
371  * it's just the icmp type << 8 | icmp code.  After adjustment
372  * header points to the first 8 bytes of the tcp header.  We need
373  * to find the appropriate port.
374  *
375  * The locking strategy used here is very "optimistic". When
376  * someone else accesses the socket the ICMP is just dropped
377  * and for some paths there is no check at all.
378  * A more general error queue to queue errors for later handling
379  * is probably better.
380  *
381  */
382 
383 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
384 {
385 	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
386 	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
387 	struct inet_connection_sock *icsk;
388 	struct tcp_sock *tp;
389 	struct inet_sock *inet;
390 	const int type = icmp_hdr(icmp_skb)->type;
391 	const int code = icmp_hdr(icmp_skb)->code;
392 	struct sock *sk;
393 	struct sk_buff *skb;
394 	struct request_sock *fastopen;
395 	u32 seq, snd_una;
396 	s32 remaining;
397 	u32 delta_us;
398 	int err;
399 	struct net *net = dev_net(icmp_skb->dev);
400 
401 	sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
402 				       th->dest, iph->saddr, ntohs(th->source),
403 				       inet_iif(icmp_skb), 0);
404 	if (!sk) {
405 		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
406 		return;
407 	}
408 	if (sk->sk_state == TCP_TIME_WAIT) {
409 		inet_twsk_put(inet_twsk(sk));
410 		return;
411 	}
412 	seq = ntohl(th->seq);
413 	if (sk->sk_state == TCP_NEW_SYN_RECV)
414 		return tcp_req_err(sk, seq,
415 				  type == ICMP_PARAMETERPROB ||
416 				  type == ICMP_TIME_EXCEEDED ||
417 				  (type == ICMP_DEST_UNREACH &&
418 				   (code == ICMP_NET_UNREACH ||
419 				    code == ICMP_HOST_UNREACH)));
420 
421 	bh_lock_sock(sk);
422 	/* If too many ICMPs get dropped on busy
423 	 * servers this needs to be solved differently.
424 	 * We do take care of the PMTU discovery (RFC1191) special case:
425 	 * we can receive locally generated ICMP messages while the socket is held.
426 	 */
427 	if (sock_owned_by_user(sk)) {
428 		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
429 			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
430 	}
431 	if (sk->sk_state == TCP_CLOSE)
432 		goto out;
433 
434 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
435 		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
436 		goto out;
437 	}
438 
439 	icsk = inet_csk(sk);
440 	tp = tcp_sk(sk);
441 	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
442 	fastopen = tp->fastopen_rsk;
443 	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
444 	if (sk->sk_state != TCP_LISTEN &&
445 	    !between(seq, snd_una, tp->snd_nxt)) {
446 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
447 		goto out;
448 	}
449 
450 	switch (type) {
451 	case ICMP_REDIRECT:
452 		if (!sock_owned_by_user(sk))
453 			do_redirect(icmp_skb, sk);
454 		goto out;
455 	case ICMP_SOURCE_QUENCH:
456 		/* Just silently ignore these. */
457 		goto out;
458 	case ICMP_PARAMETERPROB:
459 		err = EPROTO;
460 		break;
461 	case ICMP_DEST_UNREACH:
462 		if (code > NR_ICMP_UNREACH)
463 			goto out;
464 
465 		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
466 			/* We are not interested in TCP_LISTEN and open_requests
467 			 * (SYN-ACKs sent out by Linux are always < 576 bytes, so
468 			 * they should go through unfragmented).
469 			 */
470 			if (sk->sk_state == TCP_LISTEN)
471 				goto out;
472 
473 			tp->mtu_info = info;
474 			if (!sock_owned_by_user(sk)) {
475 				tcp_v4_mtu_reduced(sk);
476 			} else {
477 				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
478 					sock_hold(sk);
479 			}
480 			goto out;
481 		}
482 
483 		err = icmp_err_convert[code].errno;
484 		/* check if icmp_skb allows revert of backoff
485 		 * (see draft-zimmermann-tcp-lcd) */
486 		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
487 			break;
488 		if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
489 		    !icsk->icsk_backoff || fastopen)
490 			break;
491 
492 		if (sock_owned_by_user(sk))
493 			break;
494 
495 		icsk->icsk_backoff--;
496 		icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
497 					       TCP_TIMEOUT_INIT;
498 		icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
499 
500 		skb = tcp_rtx_queue_head(sk);
501 		BUG_ON(!skb);
502 
503 		tcp_mstamp_refresh(tp);
504 		delta_us = (u32)(tp->tcp_mstamp - skb->skb_mstamp);
505 		remaining = icsk->icsk_rto -
506 			    usecs_to_jiffies(delta_us);
507 
508 		if (remaining > 0) {
509 			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
510 						  remaining, TCP_RTO_MAX);
511 		} else {
512 			/* RTO revert clocked out retransmission.
513 			 * Will retransmit now */
514 			tcp_retransmit_timer(sk);
515 		}
516 
517 		break;
518 	case ICMP_TIME_EXCEEDED:
519 		err = EHOSTUNREACH;
520 		break;
521 	default:
522 		goto out;
523 	}
524 
525 	switch (sk->sk_state) {
526 	case TCP_SYN_SENT:
527 	case TCP_SYN_RECV:
528 		/* Only in fast or simultaneous open. If a fast open socket is
529 		 * already accepted it is treated as a connected one below.
530 		 */
531 		if (fastopen && !fastopen->sk)
532 			break;
533 
534 		if (!sock_owned_by_user(sk)) {
535 			sk->sk_err = err;
536 
537 			sk->sk_error_report(sk);
538 
539 			tcp_done(sk);
540 		} else {
541 			sk->sk_err_soft = err;
542 		}
543 		goto out;
544 	}
545 
546 	/* If we've already connected we will keep trying
547 	 * until we time out, or the user gives up.
548 	 *
549 	 * RFC 1122 4.2.3.9 allows us to consider as hard errors
550 	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
551 	 * but it is obsoleted by PMTU discovery).
552 	 *
553 	 * Note that in the modern internet, where routing is unreliable
554 	 * and broken firewalls sit in every dark corner sending random
555 	 * errors ordered by their masters, even these two messages finally lose
556 	 * their original sense (even Linux sends invalid PORT_UNREACHs).
557 	 *
558 	 * Now we are in compliance with RFCs.
559 	 *							--ANK (980905)
560 	 */
561 
562 	inet = inet_sk(sk);
563 	if (!sock_owned_by_user(sk) && inet->recverr) {
564 		sk->sk_err = err;
565 		sk->sk_error_report(sk);
566 	} else	{ /* Only an error on timeout */
567 		sk->sk_err_soft = err;
568 	}
569 
570 out:
571 	bh_unlock_sock(sk);
572 	sock_put(sk);
573 }
574 
575 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
576 {
577 	struct tcphdr *th = tcp_hdr(skb);
578 
579 	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
580 	skb->csum_start = skb_transport_header(skb) - skb->head;
581 	skb->csum_offset = offsetof(struct tcphdr, check);
582 }
583 
584 /* This routine computes an IPv4 TCP checksum. */
585 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
586 {
587 	const struct inet_sock *inet = inet_sk(sk);
588 
589 	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
590 }
591 EXPORT_SYMBOL(tcp_v4_send_check);
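
/*
 * Illustrative sketch (not part of the original file): with
 * CHECKSUM_PARTIAL the folded pseudo-header sum is left in th->check and
 * the device (or skb_checksum_help()) finishes the job from csum_start,
 * storing the result at csum_offset.  The final 16-bit one's-complement
 * fold is roughly the following, written as plain portable C with a
 * hypothetical helper name:
 *
 *	static uint16_t csum_fold32(uint32_t sum)
 *	{
 *		sum = (sum & 0xffff) + (sum >> 16);	// fold carries once
 *		sum = (sum & 0xffff) + (sum >> 16);	// and any final carry
 *		return (uint16_t)~sum;			// one's complement
 *	}
 */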
592 
593 /*
594  *	This routine will send an RST to the other tcp.
595  *
596  *	Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
597  *		      for the reset?
598  *	Answer: if a packet caused an RST, it is not for a socket
599  *		existing in our system; if it is matched to a socket,
600  *		it is just a duplicate segment or a bug in the other side's TCP.
601  *		So we build the reply based only on parameters that
602  *		arrived with the segment.
603  *	Exception: precedence violation. We do not implement it in any case.
604  */
605 
606 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
607 {
608 	const struct tcphdr *th = tcp_hdr(skb);
609 	struct {
610 		struct tcphdr th;
611 #ifdef CONFIG_TCP_MD5SIG
612 		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
613 #endif
614 	} rep;
615 	struct ip_reply_arg arg;
616 #ifdef CONFIG_TCP_MD5SIG
617 	struct tcp_md5sig_key *key = NULL;
618 	const __u8 *hash_location = NULL;
619 	unsigned char newhash[16];
620 	int genhash;
621 	struct sock *sk1 = NULL;
622 #endif
623 	struct net *net;
624 
625 	/* Never send a reset in response to a reset. */
626 	if (th->rst)
627 		return;
628 
629 	/* If sk not NULL, it means we did a successful lookup and incoming
630 	 * route had to be correct. prequeue might have dropped our dst.
631 	 */
632 	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
633 		return;
634 
635 	/* Swap the send and the receive. */
636 	memset(&rep, 0, sizeof(rep));
637 	rep.th.dest   = th->source;
638 	rep.th.source = th->dest;
639 	rep.th.doff   = sizeof(struct tcphdr) / 4;
640 	rep.th.rst    = 1;
641 
642 	if (th->ack) {
643 		rep.th.seq = th->ack_seq;
644 	} else {
645 		rep.th.ack = 1;
646 		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
647 				       skb->len - (th->doff << 2));
648 	}
649 
650 	memset(&arg, 0, sizeof(arg));
651 	arg.iov[0].iov_base = (unsigned char *)&rep;
652 	arg.iov[0].iov_len  = sizeof(rep.th);
653 
654 	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
655 #ifdef CONFIG_TCP_MD5SIG
656 	rcu_read_lock();
657 	hash_location = tcp_parse_md5sig_option(th);
658 	if (sk && sk_fullsock(sk)) {
659 		key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
660 					&ip_hdr(skb)->saddr, AF_INET);
661 	} else if (hash_location) {
662 		/*
663 		 * The active side is lost. Try to find the listening socket through
664 		 * the source port, and then find the md5 key through the listening socket.
665 		 * We do not lose security here:
666 		 * the incoming packet is checked with the md5 hash of the found key;
667 		 * no RST is generated if the md5 hash doesn't match.
668 		 */
669 		sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
670 					     ip_hdr(skb)->saddr,
671 					     th->source, ip_hdr(skb)->daddr,
672 					     ntohs(th->source), inet_iif(skb),
673 					     tcp_v4_sdif(skb));
674 		/* don't send an RST if we can't find a key */
675 		if (!sk1)
676 			goto out;
677 
678 		key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
679 					&ip_hdr(skb)->saddr, AF_INET);
680 		if (!key)
681 			goto out;
682 
683 
684 		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
685 		if (genhash || memcmp(hash_location, newhash, 16) != 0)
686 			goto out;
687 
688 	}
689 
690 	if (key) {
691 		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
692 				   (TCPOPT_NOP << 16) |
693 				   (TCPOPT_MD5SIG << 8) |
694 				   TCPOLEN_MD5SIG);
695 		/* Update length and the length the header thinks exists */
696 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
697 		rep.th.doff = arg.iov[0].iov_len / 4;
698 
699 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
700 				     key, ip_hdr(skb)->saddr,
701 				     ip_hdr(skb)->daddr, &rep.th);
702 	}
703 #endif
704 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
705 				      ip_hdr(skb)->saddr, /* XXX */
706 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
707 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
708 	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
709 
710 	/* When the socket is gone, all binding information is lost and
711 	 * routing might fail in this case. No choice here: if we choose to force
712 	 * the input interface, we will misroute in the case of an asymmetric route.
713 	 */
714 	if (sk) {
715 		arg.bound_dev_if = sk->sk_bound_dev_if;
716 		if (sk_fullsock(sk))
717 			trace_tcp_send_reset(sk, skb);
718 	}
719 
720 	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
721 		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));
722 
723 	arg.tos = ip_hdr(skb)->tos;
724 	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
725 	local_bh_disable();
726 	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
727 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
728 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
729 			      &arg, arg.iov[0].iov_len);
730 
731 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
732 	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
733 	local_bh_enable();
734 
735 #ifdef CONFIG_TCP_MD5SIG
736 out:
737 	rcu_read_unlock();
738 #endif
739 }
740 
741 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
742    outside socket context, is certainly ugly. What can I do?
743  */
744 
745 static void tcp_v4_send_ack(const struct sock *sk,
746 			    struct sk_buff *skb, u32 seq, u32 ack,
747 			    u32 win, u32 tsval, u32 tsecr, int oif,
748 			    struct tcp_md5sig_key *key,
749 			    int reply_flags, u8 tos)
750 {
751 	const struct tcphdr *th = tcp_hdr(skb);
752 	struct {
753 		struct tcphdr th;
754 		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
755 #ifdef CONFIG_TCP_MD5SIG
756 			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
757 #endif
758 			];
759 	} rep;
760 	struct net *net = sock_net(sk);
761 	struct ip_reply_arg arg;
762 
763 	memset(&rep.th, 0, sizeof(struct tcphdr));
764 	memset(&arg, 0, sizeof(arg));
765 
766 	arg.iov[0].iov_base = (unsigned char *)&rep;
767 	arg.iov[0].iov_len  = sizeof(rep.th);
768 	if (tsecr) {
769 		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
770 				   (TCPOPT_TIMESTAMP << 8) |
771 				   TCPOLEN_TIMESTAMP);
772 		rep.opt[1] = htonl(tsval);
773 		rep.opt[2] = htonl(tsecr);
774 		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
775 	}
776 
777 	/* Swap the send and the receive. */
778 	rep.th.dest    = th->source;
779 	rep.th.source  = th->dest;
780 	rep.th.doff    = arg.iov[0].iov_len / 4;
781 	rep.th.seq     = htonl(seq);
782 	rep.th.ack_seq = htonl(ack);
783 	rep.th.ack     = 1;
784 	rep.th.window  = htons(win);
785 
786 #ifdef CONFIG_TCP_MD5SIG
787 	if (key) {
788 		int offset = (tsecr) ? 3 : 0;
789 
790 		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
791 					  (TCPOPT_NOP << 16) |
792 					  (TCPOPT_MD5SIG << 8) |
793 					  TCPOLEN_MD5SIG);
794 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
795 		rep.th.doff = arg.iov[0].iov_len/4;
796 
797 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
798 				    key, ip_hdr(skb)->saddr,
799 				    ip_hdr(skb)->daddr, &rep.th);
800 	}
801 #endif
802 	arg.flags = reply_flags;
803 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
804 				      ip_hdr(skb)->saddr, /* XXX */
805 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
806 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
807 	if (oif)
808 		arg.bound_dev_if = oif;
809 	arg.tos = tos;
810 	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
811 	local_bh_disable();
812 	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
813 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
814 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
815 			      &arg, arg.iov[0].iov_len);
816 
817 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
818 	local_bh_enable();
819 }
820 
821 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
822 {
823 	struct inet_timewait_sock *tw = inet_twsk(sk);
824 	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
825 
826 	tcp_v4_send_ack(sk, skb,
827 			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
828 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
829 			tcp_time_stamp_raw() + tcptw->tw_ts_offset,
830 			tcptw->tw_ts_recent,
831 			tw->tw_bound_dev_if,
832 			tcp_twsk_md5_key(tcptw),
833 			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
834 			tw->tw_tos
835 			);
836 
837 	inet_twsk_put(tw);
838 }
839 
840 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
841 				  struct request_sock *req)
842 {
843 	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
844 	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
845 	 */
846 	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
847 					     tcp_sk(sk)->snd_nxt;
848 
849 	/* RFC 7323 2.3
850 	 * The window field (SEG.WND) of every outgoing segment, with the
851 	 * exception of <SYN> segments, MUST be right-shifted by
852 	 * Rcv.Wind.Shift bits:
853 	 */
854 	tcp_v4_send_ack(sk, skb, seq,
855 			tcp_rsk(req)->rcv_nxt,
856 			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
857 			tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
858 			req->ts_recent,
859 			0,
860 			tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->saddr,
861 					  AF_INET),
862 			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
863 			ip_hdr(skb)->tos);
864 }
865 
866 /*
867  *	Send a SYN-ACK after having received a SYN.
868  *	This still operates on a request_sock only, not on a big
869  *	socket.
870  */
871 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
872 			      struct flowi *fl,
873 			      struct request_sock *req,
874 			      struct tcp_fastopen_cookie *foc,
875 			      enum tcp_synack_type synack_type)
876 {
877 	const struct inet_request_sock *ireq = inet_rsk(req);
878 	struct flowi4 fl4;
879 	int err = -1;
880 	struct sk_buff *skb;
881 
882 	/* First, grab a route. */
883 	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
884 		return -1;
885 
886 	skb = tcp_make_synack(sk, dst, req, foc, synack_type);
887 
888 	if (skb) {
889 		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
890 
891 		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
892 					    ireq->ir_rmt_addr,
893 					    ireq_opt_deref(ireq));
894 		err = net_xmit_eval(err);
895 	}
896 
897 	return err;
898 }
899 
900 /*
901  *	IPv4 request_sock destructor.
902  */
903 static void tcp_v4_reqsk_destructor(struct request_sock *req)
904 {
905 	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
906 }
907 
908 #ifdef CONFIG_TCP_MD5SIG
909 /*
910  * RFC2385 MD5 checksumming requires a mapping of
911  * IP address->MD5 Key.
912  * We need to maintain these in the sk structure.
913  */
914 
915 /* Find the Key structure for an address.  */
916 struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
917 					 const union tcp_md5_addr *addr,
918 					 int family)
919 {
920 	const struct tcp_sock *tp = tcp_sk(sk);
921 	struct tcp_md5sig_key *key;
922 	const struct tcp_md5sig_info *md5sig;
923 	__be32 mask;
924 	struct tcp_md5sig_key *best_match = NULL;
925 	bool match;
926 
927 	/* caller either holds rcu_read_lock() or socket lock */
928 	md5sig = rcu_dereference_check(tp->md5sig_info,
929 				       lockdep_sock_is_held(sk));
930 	if (!md5sig)
931 		return NULL;
932 
933 	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
934 		if (key->family != family)
935 			continue;
936 
937 		if (family == AF_INET) {
938 			mask = inet_make_mask(key->prefixlen);
939 			match = (key->addr.a4.s_addr & mask) ==
940 				(addr->a4.s_addr & mask);
941 #if IS_ENABLED(CONFIG_IPV6)
942 		} else if (family == AF_INET6) {
943 			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
944 						  key->prefixlen);
945 #endif
946 		} else {
947 			match = false;
948 		}
949 
950 		if (match && (!best_match ||
951 			      key->prefixlen > best_match->prefixlen))
952 			best_match = key;
953 	}
954 	return best_match;
955 }
956 EXPORT_SYMBOL(tcp_md5_do_lookup);
957 
958 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
959 						      const union tcp_md5_addr *addr,
960 						      int family, u8 prefixlen)
961 {
962 	const struct tcp_sock *tp = tcp_sk(sk);
963 	struct tcp_md5sig_key *key;
964 	unsigned int size = sizeof(struct in_addr);
965 	const struct tcp_md5sig_info *md5sig;
966 
967 	/* caller either holds rcu_read_lock() or socket lock */
968 	md5sig = rcu_dereference_check(tp->md5sig_info,
969 				       lockdep_sock_is_held(sk));
970 	if (!md5sig)
971 		return NULL;
972 #if IS_ENABLED(CONFIG_IPV6)
973 	if (family == AF_INET6)
974 		size = sizeof(struct in6_addr);
975 #endif
976 	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
977 		if (key->family != family)
978 			continue;
979 		if (!memcmp(&key->addr, addr, size) &&
980 		    key->prefixlen == prefixlen)
981 			return key;
982 	}
983 	return NULL;
984 }
985 
986 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
987 					 const struct sock *addr_sk)
988 {
989 	const union tcp_md5_addr *addr;
990 
991 	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
992 	return tcp_md5_do_lookup(sk, addr, AF_INET);
993 }
994 EXPORT_SYMBOL(tcp_v4_md5_lookup);
995 
996 /* This can be called on a newly created socket, from other files */
997 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
998 		   int family, u8 prefixlen, const u8 *newkey, u8 newkeylen,
999 		   gfp_t gfp)
1000 {
1001 	/* Add Key to the list */
1002 	struct tcp_md5sig_key *key;
1003 	struct tcp_sock *tp = tcp_sk(sk);
1004 	struct tcp_md5sig_info *md5sig;
1005 
1006 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
1007 	if (key) {
1008 		/* Pre-existing entry - just update that one. */
1009 		memcpy(key->key, newkey, newkeylen);
1010 		key->keylen = newkeylen;
1011 		return 0;
1012 	}
1013 
1014 	md5sig = rcu_dereference_protected(tp->md5sig_info,
1015 					   lockdep_sock_is_held(sk));
1016 	if (!md5sig) {
1017 		md5sig = kmalloc(sizeof(*md5sig), gfp);
1018 		if (!md5sig)
1019 			return -ENOMEM;
1020 
1021 		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1022 		INIT_HLIST_HEAD(&md5sig->head);
1023 		rcu_assign_pointer(tp->md5sig_info, md5sig);
1024 	}
1025 
1026 	key = sock_kmalloc(sk, sizeof(*key), gfp);
1027 	if (!key)
1028 		return -ENOMEM;
1029 	if (!tcp_alloc_md5sig_pool()) {
1030 		sock_kfree_s(sk, key, sizeof(*key));
1031 		return -ENOMEM;
1032 	}
1033 
1034 	memcpy(key->key, newkey, newkeylen);
1035 	key->keylen = newkeylen;
1036 	key->family = family;
1037 	key->prefixlen = prefixlen;
1038 	memcpy(&key->addr, addr,
1039 	       (family == AF_INET6) ? sizeof(struct in6_addr) :
1040 				      sizeof(struct in_addr));
1041 	hlist_add_head_rcu(&key->node, &md5sig->head);
1042 	return 0;
1043 }
1044 EXPORT_SYMBOL(tcp_md5_do_add);
1045 
1046 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1047 		   u8 prefixlen)
1048 {
1049 	struct tcp_md5sig_key *key;
1050 
1051 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
1052 	if (!key)
1053 		return -ENOENT;
1054 	hlist_del_rcu(&key->node);
1055 	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1056 	kfree_rcu(key, rcu);
1057 	return 0;
1058 }
1059 EXPORT_SYMBOL(tcp_md5_do_del);
1060 
1061 static void tcp_clear_md5_list(struct sock *sk)
1062 {
1063 	struct tcp_sock *tp = tcp_sk(sk);
1064 	struct tcp_md5sig_key *key;
1065 	struct hlist_node *n;
1066 	struct tcp_md5sig_info *md5sig;
1067 
1068 	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1069 
1070 	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1071 		hlist_del_rcu(&key->node);
1072 		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1073 		kfree_rcu(key, rcu);
1074 	}
1075 }
1076 
1077 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1078 				 char __user *optval, int optlen)
1079 {
1080 	struct tcp_md5sig cmd;
1081 	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1082 	u8 prefixlen = 32;
1083 
1084 	if (optlen < sizeof(cmd))
1085 		return -EINVAL;
1086 
1087 	if (copy_from_user(&cmd, optval, sizeof(cmd)))
1088 		return -EFAULT;
1089 
1090 	if (sin->sin_family != AF_INET)
1091 		return -EINVAL;
1092 
1093 	if (optname == TCP_MD5SIG_EXT &&
1094 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1095 		prefixlen = cmd.tcpm_prefixlen;
1096 		if (prefixlen > 32)
1097 			return -EINVAL;
1098 	}
1099 
1100 	if (!cmd.tcpm_keylen)
1101 		return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1102 				      AF_INET, prefixlen);
1103 
1104 	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1105 		return -EINVAL;
1106 
1107 	return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1108 			      AF_INET, prefixlen, cmd.tcpm_key, cmd.tcpm_keylen,
1109 			      GFP_KERNEL);
1110 }
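
/*
 * Illustrative userspace sketch (not part of the original file): the
 * struct tcp_md5sig parsed above is filled in by a setsockopt() call such
 * as the one below.  "peer" (an IPv4 address in network byte order) and
 * the key are placeholders; assumes <netinet/tcp.h> and <string.h>.
 *
 *	struct tcp_md5sig md5 = { };
 *	struct sockaddr_in *sin = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	sin->sin_family      = AF_INET;
 *	sin->sin_addr.s_addr = peer;
 *	md5.tcpm_keylen      = 6;
 *	memcpy(md5.tcpm_key, "secret", 6);
 *	if (setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5)) < 0)
 *		perror("setsockopt(TCP_MD5SIG)");
 *
 * With optname TCP_MD5SIG_EXT, tcpm_flags |= TCP_MD5SIG_FLAG_PREFIX and
 * e.g. tcpm_prefixlen = 24, the key is installed for a whole /24, which
 * is what the prefixlen handling above supports.
 */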
1111 
1112 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1113 				   __be32 daddr, __be32 saddr,
1114 				   const struct tcphdr *th, int nbytes)
1115 {
1116 	struct tcp4_pseudohdr *bp;
1117 	struct scatterlist sg;
1118 	struct tcphdr *_th;
1119 
1120 	bp = hp->scratch;
1121 	bp->saddr = saddr;
1122 	bp->daddr = daddr;
1123 	bp->pad = 0;
1124 	bp->protocol = IPPROTO_TCP;
1125 	bp->len = cpu_to_be16(nbytes);
1126 
1127 	_th = (struct tcphdr *)(bp + 1);
1128 	memcpy(_th, th, sizeof(*th));
1129 	_th->check = 0;
1130 
1131 	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1132 	ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1133 				sizeof(*bp) + sizeof(*th));
1134 	return crypto_ahash_update(hp->md5_req);
1135 }
1136 
1137 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1138 			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
1139 {
1140 	struct tcp_md5sig_pool *hp;
1141 	struct ahash_request *req;
1142 
1143 	hp = tcp_get_md5sig_pool();
1144 	if (!hp)
1145 		goto clear_hash_noput;
1146 	req = hp->md5_req;
1147 
1148 	if (crypto_ahash_init(req))
1149 		goto clear_hash;
1150 	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1151 		goto clear_hash;
1152 	if (tcp_md5_hash_key(hp, key))
1153 		goto clear_hash;
1154 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1155 	if (crypto_ahash_final(req))
1156 		goto clear_hash;
1157 
1158 	tcp_put_md5sig_pool();
1159 	return 0;
1160 
1161 clear_hash:
1162 	tcp_put_md5sig_pool();
1163 clear_hash_noput:
1164 	memset(md5_hash, 0, 16);
1165 	return 1;
1166 }
1167 
1168 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1169 			const struct sock *sk,
1170 			const struct sk_buff *skb)
1171 {
1172 	struct tcp_md5sig_pool *hp;
1173 	struct ahash_request *req;
1174 	const struct tcphdr *th = tcp_hdr(skb);
1175 	__be32 saddr, daddr;
1176 
1177 	if (sk) { /* valid for establish/request sockets */
1178 		saddr = sk->sk_rcv_saddr;
1179 		daddr = sk->sk_daddr;
1180 	} else {
1181 		const struct iphdr *iph = ip_hdr(skb);
1182 		saddr = iph->saddr;
1183 		daddr = iph->daddr;
1184 	}
1185 
1186 	hp = tcp_get_md5sig_pool();
1187 	if (!hp)
1188 		goto clear_hash_noput;
1189 	req = hp->md5_req;
1190 
1191 	if (crypto_ahash_init(req))
1192 		goto clear_hash;
1193 
1194 	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1195 		goto clear_hash;
1196 	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1197 		goto clear_hash;
1198 	if (tcp_md5_hash_key(hp, key))
1199 		goto clear_hash;
1200 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1201 	if (crypto_ahash_final(req))
1202 		goto clear_hash;
1203 
1204 	tcp_put_md5sig_pool();
1205 	return 0;
1206 
1207 clear_hash:
1208 	tcp_put_md5sig_pool();
1209 clear_hash_noput:
1210 	memset(md5_hash, 0, 16);
1211 	return 1;
1212 }
1213 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1214 
1215 #endif
1216 
1217 /* Called with rcu_read_lock() */
1218 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1219 				    const struct sk_buff *skb)
1220 {
1221 #ifdef CONFIG_TCP_MD5SIG
1222 	/*
1223 	 * This gets called for each TCP segment that arrives
1224 	 * so we want to be efficient.
1225 	 * We have 3 drop cases:
1226 	 * o No MD5 hash and one expected.
1227 	 * o MD5 hash and we're not expecting one.
1228 	 * o MD5 hash and it's wrong.
1229 	 */
1230 	const __u8 *hash_location = NULL;
1231 	struct tcp_md5sig_key *hash_expected;
1232 	const struct iphdr *iph = ip_hdr(skb);
1233 	const struct tcphdr *th = tcp_hdr(skb);
1234 	int genhash;
1235 	unsigned char newhash[16];
1236 
1237 	hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1238 					  AF_INET);
1239 	hash_location = tcp_parse_md5sig_option(th);
1240 
1241 	/* We've parsed the options - do we have a hash? */
1242 	if (!hash_expected && !hash_location)
1243 		return false;
1244 
1245 	if (hash_expected && !hash_location) {
1246 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1247 		return true;
1248 	}
1249 
1250 	if (!hash_expected && hash_location) {
1251 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1252 		return true;
1253 	}
1254 
1255 	/* Okay, so this is hash_expected and hash_location -
1256 	 * so we need to calculate the checksum.
1257 	 */
1258 	genhash = tcp_v4_md5_hash_skb(newhash,
1259 				      hash_expected,
1260 				      NULL, skb);
1261 
1262 	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1263 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1264 		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1265 				     &iph->saddr, ntohs(th->source),
1266 				     &iph->daddr, ntohs(th->dest),
1267 				     genhash ? " tcp_v4_calc_md5_hash failed"
1268 				     : "");
1269 		return true;
1270 	}
1271 	return false;
1272 #endif
1273 	return false;
1274 }
1275 
1276 static void tcp_v4_init_req(struct request_sock *req,
1277 			    const struct sock *sk_listener,
1278 			    struct sk_buff *skb)
1279 {
1280 	struct inet_request_sock *ireq = inet_rsk(req);
1281 	struct net *net = sock_net(sk_listener);
1282 
1283 	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1284 	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1285 	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1286 }
1287 
1288 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1289 					  struct flowi *fl,
1290 					  const struct request_sock *req)
1291 {
1292 	return inet_csk_route_req(sk, &fl->u.ip4, req);
1293 }
1294 
1295 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1296 	.family		=	PF_INET,
1297 	.obj_size	=	sizeof(struct tcp_request_sock),
1298 	.rtx_syn_ack	=	tcp_rtx_synack,
1299 	.send_ack	=	tcp_v4_reqsk_send_ack,
1300 	.destructor	=	tcp_v4_reqsk_destructor,
1301 	.send_reset	=	tcp_v4_send_reset,
1302 	.syn_ack_timeout =	tcp_syn_ack_timeout,
1303 };
1304 
1305 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1306 	.mss_clamp	=	TCP_MSS_DEFAULT,
1307 #ifdef CONFIG_TCP_MD5SIG
1308 	.req_md5_lookup	=	tcp_v4_md5_lookup,
1309 	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1310 #endif
1311 	.init_req	=	tcp_v4_init_req,
1312 #ifdef CONFIG_SYN_COOKIES
1313 	.cookie_init_seq =	cookie_v4_init_sequence,
1314 #endif
1315 	.route_req	=	tcp_v4_route_req,
1316 	.init_seq	=	tcp_v4_init_seq,
1317 	.init_ts_off	=	tcp_v4_init_ts_off,
1318 	.send_synack	=	tcp_v4_send_synack,
1319 };
1320 
1321 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1322 {
1323 	/* Never answer SYNs sent to broadcast or multicast */
1324 	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1325 		goto drop;
1326 
1327 	return tcp_conn_request(&tcp_request_sock_ops,
1328 				&tcp_request_sock_ipv4_ops, sk, skb);
1329 
1330 drop:
1331 	tcp_listendrop(sk);
1332 	return 0;
1333 }
1334 EXPORT_SYMBOL(tcp_v4_conn_request);
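
/*
 * Illustrative userspace sketch (not part of the original file): a
 * listener set up as below is what tcp_v4_conn_request() serves when a
 * SYN arrives; accept() later returns the child socket created by
 * tcp_v4_syn_recv_sock().  Port number is a placeholder; assumes
 * <sys/socket.h> and <netinet/in.h>.
 *
 *	int lfd = socket(AF_INET, SOCK_STREAM, 0);
 *	struct sockaddr_in addr = {
 *		.sin_family      = AF_INET,
 *		.sin_port        = htons(8080),
 *		.sin_addr.s_addr = htonl(INADDR_ANY),
 *	};
 *
 *	bind(lfd, (struct sockaddr *)&addr, sizeof(addr));
 *	listen(lfd, 128);
 *	int cfd = accept(lfd, NULL, NULL);	// child from the accept queue
 */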
1335 
1336 
1337 /*
1338  * The three way handshake has completed - we got a valid synack -
1339  * now create the new socket.
1340  */
1341 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1342 				  struct request_sock *req,
1343 				  struct dst_entry *dst,
1344 				  struct request_sock *req_unhash,
1345 				  bool *own_req)
1346 {
1347 	struct inet_request_sock *ireq;
1348 	struct inet_sock *newinet;
1349 	struct tcp_sock *newtp;
1350 	struct sock *newsk;
1351 #ifdef CONFIG_TCP_MD5SIG
1352 	struct tcp_md5sig_key *key;
1353 #endif
1354 	struct ip_options_rcu *inet_opt;
1355 
1356 	if (sk_acceptq_is_full(sk))
1357 		goto exit_overflow;
1358 
1359 	newsk = tcp_create_openreq_child(sk, req, skb);
1360 	if (!newsk)
1361 		goto exit_nonewsk;
1362 
1363 	newsk->sk_gso_type = SKB_GSO_TCPV4;
1364 	inet_sk_rx_dst_set(newsk, skb);
1365 
1366 	newtp		      = tcp_sk(newsk);
1367 	newinet		      = inet_sk(newsk);
1368 	ireq		      = inet_rsk(req);
1369 	sk_daddr_set(newsk, ireq->ir_rmt_addr);
1370 	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1371 	newsk->sk_bound_dev_if = ireq->ir_iif;
1372 	newinet->inet_saddr   = ireq->ir_loc_addr;
1373 	inet_opt	      = rcu_dereference(ireq->ireq_opt);
1374 	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1375 	newinet->mc_index     = inet_iif(skb);
1376 	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1377 	newinet->rcv_tos      = ip_hdr(skb)->tos;
1378 	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1379 	if (inet_opt)
1380 		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1381 	newinet->inet_id = newtp->write_seq ^ jiffies;
1382 
1383 	if (!dst) {
1384 		dst = inet_csk_route_child_sock(sk, newsk, req);
1385 		if (!dst)
1386 			goto put_and_exit;
1387 	} else {
1388 		/* syncookie case : see end of cookie_v4_check() */
1389 	}
1390 	sk_setup_caps(newsk, dst);
1391 
1392 	tcp_ca_openreq_child(newsk, dst);
1393 
1394 	tcp_sync_mss(newsk, dst_mtu(dst));
1395 	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1396 
1397 	tcp_initialize_rcv_mss(newsk);
1398 
1399 #ifdef CONFIG_TCP_MD5SIG
1400 	/* Copy over the MD5 key from the original socket */
1401 	key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1402 				AF_INET);
1403 	if (key) {
1404 		/*
1405 		 * We're using one, so create a matching key
1406 		 * on the newsk structure. If we fail to get
1407 		 * memory, then we end up not copying the key
1408 		 * across. Shucks.
1409 		 */
1410 		tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1411 			       AF_INET, 32, key->key, key->keylen, GFP_ATOMIC);
1412 		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1413 	}
1414 #endif
1415 
1416 	if (__inet_inherit_port(sk, newsk) < 0)
1417 		goto put_and_exit;
1418 	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
1419 	if (likely(*own_req)) {
1420 		tcp_move_syn(newtp, req);
1421 		ireq->ireq_opt = NULL;
1422 	} else {
1423 		newinet->inet_opt = NULL;
1424 	}
1425 	return newsk;
1426 
1427 exit_overflow:
1428 	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1429 exit_nonewsk:
1430 	dst_release(dst);
1431 exit:
1432 	tcp_listendrop(sk);
1433 	return NULL;
1434 put_and_exit:
1435 	newinet->inet_opt = NULL;
1436 	inet_csk_prepare_forced_close(newsk);
1437 	tcp_done(newsk);
1438 	goto exit;
1439 }
1440 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1441 
1442 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1443 {
1444 #ifdef CONFIG_SYN_COOKIES
1445 	const struct tcphdr *th = tcp_hdr(skb);
1446 
1447 	if (!th->syn)
1448 		sk = cookie_v4_check(sk, skb);
1449 #endif
1450 	return sk;
1451 }
1452 
1453 /* The socket must have its spinlock held when we get
1454  * here, unless it is a TCP_LISTEN socket.
1455  *
1456  * We have a potential double-lock case here, so even when
1457  * doing backlog processing we use the BH locking scheme.
1458  * This is because we cannot sleep with the original spinlock
1459  * held.
1460  */
1461 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1462 {
1463 	struct sock *rsk;
1464 
1465 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1466 		struct dst_entry *dst = sk->sk_rx_dst;
1467 
1468 		sock_rps_save_rxhash(sk, skb);
1469 		sk_mark_napi_id(sk, skb);
1470 		if (dst) {
1471 			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1472 			    !dst->ops->check(dst, 0)) {
1473 				dst_release(dst);
1474 				sk->sk_rx_dst = NULL;
1475 			}
1476 		}
1477 		tcp_rcv_established(sk, skb, tcp_hdr(skb));
1478 		return 0;
1479 	}
1480 
1481 	if (tcp_checksum_complete(skb))
1482 		goto csum_err;
1483 
1484 	if (sk->sk_state == TCP_LISTEN) {
1485 		struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1486 
1487 		if (!nsk)
1488 			goto discard;
1489 		if (nsk != sk) {
1490 			if (tcp_child_process(sk, nsk, skb)) {
1491 				rsk = nsk;
1492 				goto reset;
1493 			}
1494 			return 0;
1495 		}
1496 	} else
1497 		sock_rps_save_rxhash(sk, skb);
1498 
1499 	if (tcp_rcv_state_process(sk, skb)) {
1500 		rsk = sk;
1501 		goto reset;
1502 	}
1503 	return 0;
1504 
1505 reset:
1506 	tcp_v4_send_reset(rsk, skb);
1507 discard:
1508 	kfree_skb(skb);
1509 	/* Be careful here. If this function gets more complicated and
1510 	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1511 	 * might be destroyed here. This current version compiles correctly,
1512 	 * but you have been warned.
1513 	 */
1514 	return 0;
1515 
1516 csum_err:
1517 	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1518 	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1519 	goto discard;
1520 }
1521 EXPORT_SYMBOL(tcp_v4_do_rcv);
1522 
1523 int tcp_v4_early_demux(struct sk_buff *skb)
1524 {
1525 	const struct iphdr *iph;
1526 	const struct tcphdr *th;
1527 	struct sock *sk;
1528 
1529 	if (skb->pkt_type != PACKET_HOST)
1530 		return 0;
1531 
1532 	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1533 		return 0;
1534 
1535 	iph = ip_hdr(skb);
1536 	th = tcp_hdr(skb);
1537 
1538 	if (th->doff < sizeof(struct tcphdr) / 4)
1539 		return 0;
1540 
1541 	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1542 				       iph->saddr, th->source,
1543 				       iph->daddr, ntohs(th->dest),
1544 				       skb->skb_iif, inet_sdif(skb));
1545 	if (sk) {
1546 		skb->sk = sk;
1547 		skb->destructor = sock_edemux;
1548 		if (sk_fullsock(sk)) {
1549 			struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1550 
1551 			if (dst)
1552 				dst = dst_check(dst, 0);
1553 			if (dst &&
1554 			    inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1555 				skb_dst_set_noref(skb, dst);
1556 		}
1557 	}
1558 	return 0;
1559 }
1560 
1561 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1562 {
1563 	u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;
1564 
1565 	/* Only the socket owner can try to collapse/prune rx queues
1566 	 * to reduce memory overhead, so add a little headroom here.
1567 	 * Only a few socket backlogs are likely to be non-empty at the same time.
1568 	 */
1569 	limit += 64*1024;
1570 
1571 	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1572 	 * we can fix skb->truesize to its real value to avoid future drops.
1573 	 * This is valid because skb is not yet charged to the socket.
1574 	 * It has been noticed pure SACK packets were sometimes dropped
1575 	 * (if cooked by drivers without copybreak feature).
1576 	 */
1577 	skb_condense(skb);
1578 
1579 	if (unlikely(sk_add_backlog(sk, skb, limit))) {
1580 		bh_unlock_sock(sk);
1581 		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1582 		return true;
1583 	}
1584 	return false;
1585 }
1586 EXPORT_SYMBOL(tcp_add_backlog);
1587 
1588 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1589 {
1590 	struct tcphdr *th = (struct tcphdr *)skb->data;
1591 	unsigned int eaten = skb->len;
1592 	int err;
1593 
1594 	err = sk_filter_trim_cap(sk, skb, th->doff * 4);
1595 	if (!err) {
1596 		eaten -= skb->len;
1597 		TCP_SKB_CB(skb)->end_seq -= eaten;
1598 	}
1599 	return err;
1600 }
1601 EXPORT_SYMBOL(tcp_filter);
1602 
1603 static void tcp_v4_restore_cb(struct sk_buff *skb)
1604 {
1605 	memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1606 		sizeof(struct inet_skb_parm));
1607 }
1608 
1609 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1610 			   const struct tcphdr *th)
1611 {
1612 	/* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB();
1613 	 * barrier() makes sure the compiler won't play aliasing games.
1614 	 */
1615 	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1616 		sizeof(struct inet_skb_parm));
1617 	barrier();
1618 
1619 	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1620 	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1621 				    skb->len - th->doff * 4);
1622 	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1623 	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1624 	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1625 	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1626 	TCP_SKB_CB(skb)->sacked	 = 0;
1627 	TCP_SKB_CB(skb)->has_rxtstamp =
1628 			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1629 }
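
/*
 * Illustrative note (not part of the original file): end_seq above counts
 * the payload plus one sequence slot each for SYN and FIN.  For example, a
 * segment with seq = 1000, 100 bytes of payload, FIN set and a 20-byte
 * header (doff = 5, skb->len = 120) gives
 * end_seq = 1000 + 0 + 1 + 120 - 20 = 1101.
 */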
1630 
1631 /*
1632  *	From tcp_input.c
1633  */
1634 
1635 int tcp_v4_rcv(struct sk_buff *skb)
1636 {
1637 	struct net *net = dev_net(skb->dev);
1638 	int sdif = inet_sdif(skb);
1639 	const struct iphdr *iph;
1640 	const struct tcphdr *th;
1641 	bool refcounted;
1642 	struct sock *sk;
1643 	int ret;
1644 
1645 	if (skb->pkt_type != PACKET_HOST)
1646 		goto discard_it;
1647 
1648 	/* Count it even if it's bad */
1649 	__TCP_INC_STATS(net, TCP_MIB_INSEGS);
1650 
1651 	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1652 		goto discard_it;
1653 
1654 	th = (const struct tcphdr *)skb->data;
1655 
1656 	if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1657 		goto bad_packet;
1658 	if (!pskb_may_pull(skb, th->doff * 4))
1659 		goto discard_it;
1660 
1661 	/* An explanation is required here, I think.
1662 	 * Packet length and doff are validated by header prediction,
1663 	 * provided the case of th->doff == 0 is eliminated.
1664 	 * So, we defer the checks. */
1665 
1666 	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1667 		goto csum_error;
1668 
1669 	th = (const struct tcphdr *)skb->data;
1670 	iph = ip_hdr(skb);
1671 lookup:
1672 	sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1673 			       th->dest, sdif, &refcounted);
1674 	if (!sk)
1675 		goto no_tcp_socket;
1676 
1677 process:
1678 	if (sk->sk_state == TCP_TIME_WAIT)
1679 		goto do_time_wait;
1680 
1681 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
1682 		struct request_sock *req = inet_reqsk(sk);
1683 		bool req_stolen = false;
1684 		struct sock *nsk;
1685 
1686 		sk = req->rsk_listener;
1687 		if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
1688 			sk_drops_add(sk, skb);
1689 			reqsk_put(req);
1690 			goto discard_it;
1691 		}
1692 		if (unlikely(sk->sk_state != TCP_LISTEN)) {
1693 			inet_csk_reqsk_queue_drop_and_put(sk, req);
1694 			goto lookup;
1695 		}
1696 		/* We own a reference on the listener, increase it again
1697 		 * as we might lose it too soon.
1698 		 */
1699 		sock_hold(sk);
1700 		refcounted = true;
1701 		nsk = NULL;
1702 		if (!tcp_filter(sk, skb)) {
1703 			th = (const struct tcphdr *)skb->data;
1704 			iph = ip_hdr(skb);
1705 			tcp_v4_fill_cb(skb, iph, th);
1706 			nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
1707 		}
1708 		if (!nsk) {
1709 			reqsk_put(req);
1710 			if (req_stolen) {
1711 				/* Another cpu got exclusive access to req
1712 				 * and created a full blown socket.
1713 				 * Try to feed this packet to this socket
1714 				 * instead of discarding it.
1715 				 */
1716 				tcp_v4_restore_cb(skb);
1717 				sock_put(sk);
1718 				goto lookup;
1719 			}
1720 			goto discard_and_relse;
1721 		}
1722 		if (nsk == sk) {
1723 			reqsk_put(req);
1724 			tcp_v4_restore_cb(skb);
1725 		} else if (tcp_child_process(sk, nsk, skb)) {
1726 			tcp_v4_send_reset(nsk, skb);
1727 			goto discard_and_relse;
1728 		} else {
1729 			sock_put(sk);
1730 			return 0;
1731 		}
1732 	}
1733 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1734 		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
1735 		goto discard_and_relse;
1736 	}
1737 
1738 	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1739 		goto discard_and_relse;
1740 
1741 	if (tcp_v4_inbound_md5_hash(sk, skb))
1742 		goto discard_and_relse;
1743 
1744 	nf_reset(skb);
1745 
1746 	if (tcp_filter(sk, skb))
1747 		goto discard_and_relse;
1748 	th = (const struct tcphdr *)skb->data;
1749 	iph = ip_hdr(skb);
1750 	tcp_v4_fill_cb(skb, iph, th);
1751 
1752 	skb->dev = NULL;
1753 
1754 	if (sk->sk_state == TCP_LISTEN) {
1755 		ret = tcp_v4_do_rcv(sk, skb);
1756 		goto put_and_return;
1757 	}
1758 
1759 	sk_incoming_cpu_update(sk);
1760 
1761 	bh_lock_sock_nested(sk);
1762 	tcp_segs_in(tcp_sk(sk), skb);
1763 	ret = 0;
1764 	if (!sock_owned_by_user(sk)) {
1765 		ret = tcp_v4_do_rcv(sk, skb);
1766 	} else if (tcp_add_backlog(sk, skb)) {
1767 		goto discard_and_relse;
1768 	}
1769 	bh_unlock_sock(sk);
1770 
1771 put_and_return:
1772 	if (refcounted)
1773 		sock_put(sk);
1774 
1775 	return ret;
1776 
1777 no_tcp_socket:
1778 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1779 		goto discard_it;
1780 
1781 	tcp_v4_fill_cb(skb, iph, th);
1782 
1783 	if (tcp_checksum_complete(skb)) {
1784 csum_error:
1785 		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
1786 bad_packet:
1787 		__TCP_INC_STATS(net, TCP_MIB_INERRS);
1788 	} else {
1789 		tcp_v4_send_reset(NULL, skb);
1790 	}
1791 
1792 discard_it:
1793 	/* Discard frame. */
1794 	kfree_skb(skb);
1795 	return 0;
1796 
1797 discard_and_relse:
1798 	sk_drops_add(sk, skb);
1799 	if (refcounted)
1800 		sock_put(sk);
1801 	goto discard_it;
1802 
1803 do_time_wait:
1804 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1805 		inet_twsk_put(inet_twsk(sk));
1806 		goto discard_it;
1807 	}
1808 
1809 	tcp_v4_fill_cb(skb, iph, th);
1810 
1811 	if (tcp_checksum_complete(skb)) {
1812 		inet_twsk_put(inet_twsk(sk));
1813 		goto csum_error;
1814 	}
1815 	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1816 	case TCP_TW_SYN: {
1817 		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1818 							&tcp_hashinfo, skb,
1819 							__tcp_hdrlen(th),
1820 							iph->saddr, th->source,
1821 							iph->daddr, th->dest,
1822 							inet_iif(skb),
1823 							sdif);
1824 		if (sk2) {
1825 			inet_twsk_deschedule_put(inet_twsk(sk));
1826 			sk = sk2;
1827 			tcp_v4_restore_cb(skb);
1828 			refcounted = false;
1829 			goto process;
1830 		}
1831 	}
1832 		/* to ACK */
1833 		/* fall through */
1834 	case TCP_TW_ACK:
1835 		tcp_v4_timewait_ack(sk, skb);
1836 		break;
1837 	case TCP_TW_RST:
1838 		tcp_v4_send_reset(sk, skb);
1839 		inet_twsk_deschedule_put(inet_twsk(sk));
1840 		goto discard_it;
1841 	case TCP_TW_SUCCESS:;
1842 	}
1843 	goto discard_it;
1844 }
1845 
1846 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1847 	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
1848 	.twsk_unique	= tcp_twsk_unique,
1849 	.twsk_destructor = tcp_twsk_destructor,
1850 };
1851 
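/* Cache the input route on the socket so that early demux can validate and
 * reuse it for later packets arriving on the same interface.
 */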
1852 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
1853 {
1854 	struct dst_entry *dst = skb_dst(skb);
1855 
1856 	if (dst && dst_hold_safe(dst)) {
1857 		sk->sk_rx_dst = dst;
1858 		inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
1859 	}
1860 }
1861 EXPORT_SYMBOL(inet_sk_rx_dst_set);
1862 
1863 const struct inet_connection_sock_af_ops ipv4_specific = {
1864 	.queue_xmit	   = ip_queue_xmit,
1865 	.send_check	   = tcp_v4_send_check,
1866 	.rebuild_header	   = inet_sk_rebuild_header,
1867 	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
1868 	.conn_request	   = tcp_v4_conn_request,
1869 	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
1870 	.net_header_len	   = sizeof(struct iphdr),
1871 	.setsockopt	   = ip_setsockopt,
1872 	.getsockopt	   = ip_getsockopt,
1873 	.addr2sockaddr	   = inet_csk_addr2sockaddr,
1874 	.sockaddr_len	   = sizeof(struct sockaddr_in),
1875 #ifdef CONFIG_COMPAT
1876 	.compat_setsockopt = compat_ip_setsockopt,
1877 	.compat_getsockopt = compat_ip_getsockopt,
1878 #endif
1879 	.mtu_reduced	   = tcp_v4_mtu_reduced,
1880 };
1881 EXPORT_SYMBOL(ipv4_specific);
1882 
1883 #ifdef CONFIG_TCP_MD5SIG
1884 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1885 	.md5_lookup		= tcp_v4_md5_lookup,
1886 	.calc_md5_hash		= tcp_v4_md5_hash_skb,
1887 	.md5_parse		= tcp_v4_parse_md5_keys,
1888 };
1889 #endif
1890 
1891 /* NOTE: A lot of fields are set to zero explicitly by the call to
1892  *	 sk_alloc(), so they need not be initialized again here.
1893  */
1894 static int tcp_v4_init_sock(struct sock *sk)
1895 {
1896 	struct inet_connection_sock *icsk = inet_csk(sk);
1897 
1898 	tcp_init_sock(sk);
1899 
1900 	icsk->icsk_af_ops = &ipv4_specific;
1901 
1902 #ifdef CONFIG_TCP_MD5SIG
1903 	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
1904 #endif
1905 
1906 	return 0;
1907 }
1908 
1909 void tcp_v4_destroy_sock(struct sock *sk)
1910 {
1911 	struct tcp_sock *tp = tcp_sk(sk);
1912 
1913 	trace_tcp_destroy_sock(sk);
1914 
1915 	tcp_clear_xmit_timers(sk);
1916 
1917 	tcp_cleanup_congestion_control(sk);
1918 
1919 	tcp_cleanup_ulp(sk);
1920 
1921 	/* Clean up the write buffer. */
1922 	tcp_write_queue_purge(sk);
1923 
1924 	/* Check if we want to disable active TFO */
1925 	tcp_fastopen_active_disable_ofo_check(sk);
1926 
1927 	/* Cleans up our, hopefully empty, out_of_order_queue. */
1928 	skb_rbtree_purge(&tp->out_of_order_queue);
1929 
1930 #ifdef CONFIG_TCP_MD5SIG
1931 	/* Clean up the MD5 key list, if any */
1932 	if (tp->md5sig_info) {
1933 		tcp_clear_md5_list(sk);
1934 		kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
1935 		tp->md5sig_info = NULL;
1936 	}
1937 #endif
1938 
1939 	/* Clean up a referenced TCP bind bucket. */
1940 	if (inet_csk(sk)->icsk_bind_hash)
1941 		inet_put_port(sk);
1942 
1943 	BUG_ON(tp->fastopen_rsk);
1944 
1945 	/* If the socket was aborted during a connect operation */
1946 	tcp_free_fastopen_req(tp);
1947 	tcp_fastopen_destroy_cipher(sk);
1948 	tcp_saved_syn_free(tp);
1949 
1950 	sk_sockets_allocated_dec(sk);
1951 }
1952 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1953 
1954 #ifdef CONFIG_PROC_FS
1955 /* Proc filesystem TCP sock list dumping. */
1956 
1957 /*
1958  * Get the next listener socket following cur.  If cur is NULL, get the
1959  * first socket starting from the bucket given in st->bucket; when
1960  * st->bucket is zero, the very first socket in the hash table is returned.
1961  */
1962 static void *listening_get_next(struct seq_file *seq, void *cur)
1963 {
1964 	struct tcp_iter_state *st = seq->private;
1965 	struct net *net = seq_file_net(seq);
1966 	struct inet_listen_hashbucket *ilb;
1967 	struct sock *sk = cur;
1968 
1969 	if (!sk) {
1970 get_head:
1971 		ilb = &tcp_hashinfo.listening_hash[st->bucket];
1972 		spin_lock(&ilb->lock);
1973 		sk = sk_head(&ilb->head);
1974 		st->offset = 0;
1975 		goto get_sk;
1976 	}
1977 	ilb = &tcp_hashinfo.listening_hash[st->bucket];
1978 	++st->num;
1979 	++st->offset;
1980 
1981 	sk = sk_next(sk);
1982 get_sk:
1983 	sk_for_each_from(sk) {
1984 		if (!net_eq(sock_net(sk), net))
1985 			continue;
1986 		if (sk->sk_family == st->family)
1987 			return sk;
1988 	}
1989 	spin_unlock(&ilb->lock);
1990 	st->offset = 0;
1991 	if (++st->bucket < INET_LHTABLE_SIZE)
1992 		goto get_head;
1993 	return NULL;
1994 }
1995 
1996 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1997 {
1998 	struct tcp_iter_state *st = seq->private;
1999 	void *rc;
2000 
2001 	st->bucket = 0;
2002 	st->offset = 0;
2003 	rc = listening_get_next(seq, NULL);
2004 
2005 	while (rc && *pos) {
2006 		rc = listening_get_next(seq, rc);
2007 		--*pos;
2008 	}
2009 	return rc;
2010 }
2011 
2012 static inline bool empty_bucket(const struct tcp_iter_state *st)
2013 {
2014 	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2015 }
2016 
2017 /*
2018  * Get the first established socket starting from the bucket given in st->bucket.
2019  * If st->bucket is zero, the very first socket in the hash is returned.
2020  */
2021 static void *established_get_first(struct seq_file *seq)
2022 {
2023 	struct tcp_iter_state *st = seq->private;
2024 	struct net *net = seq_file_net(seq);
2025 	void *rc = NULL;
2026 
2027 	st->offset = 0;
2028 	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2029 		struct sock *sk;
2030 		struct hlist_nulls_node *node;
2031 		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2032 
2033 		/* Lockless fast path for the common case of empty buckets */
2034 		if (empty_bucket(st))
2035 			continue;
2036 
2037 		spin_lock_bh(lock);
2038 		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2039 			if (sk->sk_family != st->family ||
2040 			    !net_eq(sock_net(sk), net)) {
2041 				continue;
2042 			}
2043 			rc = sk;
2044 			goto out;
2045 		}
2046 		spin_unlock_bh(lock);
2047 	}
2048 out:
2049 	return rc;
2050 }
2051 
2052 static void *established_get_next(struct seq_file *seq, void *cur)
2053 {
2054 	struct sock *sk = cur;
2055 	struct hlist_nulls_node *node;
2056 	struct tcp_iter_state *st = seq->private;
2057 	struct net *net = seq_file_net(seq);
2058 
2059 	++st->num;
2060 	++st->offset;
2061 
2062 	sk = sk_nulls_next(sk);
2063 
2064 	sk_nulls_for_each_from(sk, node) {
2065 		if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2066 			return sk;
2067 	}
2068 
2069 	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2070 	++st->bucket;
2071 	return established_get_first(seq);
2072 }
2073 
2074 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2075 {
2076 	struct tcp_iter_state *st = seq->private;
2077 	void *rc;
2078 
2079 	st->bucket = 0;
2080 	rc = established_get_first(seq);
2081 
2082 	while (rc && pos) {
2083 		rc = established_get_next(seq, rc);
2084 		--pos;
2085 	}
2086 	return rc;
2087 }
2088 
2089 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2090 {
2091 	void *rc;
2092 	struct tcp_iter_state *st = seq->private;
2093 
2094 	st->state = TCP_SEQ_STATE_LISTENING;
2095 	rc	  = listening_get_idx(seq, &pos);
2096 
2097 	if (!rc) {
2098 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2099 		rc	  = established_get_idx(seq, pos);
2100 	}
2101 
2102 	return rc;
2103 }
2104 
2105 static void *tcp_seek_last_pos(struct seq_file *seq)
2106 {
2107 	struct tcp_iter_state *st = seq->private;
2108 	int offset = st->offset;
2109 	int orig_num = st->num;
2110 	void *rc = NULL;
2111 
2112 	switch (st->state) {
2113 	case TCP_SEQ_STATE_LISTENING:
2114 		if (st->bucket >= INET_LHTABLE_SIZE)
2115 			break;
2116 		st->state = TCP_SEQ_STATE_LISTENING;
2117 		rc = listening_get_next(seq, NULL);
2118 		while (offset-- && rc)
2119 			rc = listening_get_next(seq, rc);
2120 		if (rc)
2121 			break;
2122 		st->bucket = 0;
2123 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2124 		/* Fallthrough */
2125 	case TCP_SEQ_STATE_ESTABLISHED:
2126 		if (st->bucket > tcp_hashinfo.ehash_mask)
2127 			break;
2128 		rc = established_get_first(seq);
2129 		while (offset-- && rc)
2130 			rc = established_get_next(seq, rc);
2131 	}
2132 
2133 	st->num = orig_num;
2134 
2135 	return rc;
2136 }
2137 
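/* seq_file iteration: walk the listening hash first, then the established
 * hash.  st->bucket, st->offset and st->last_pos let tcp_seq_start() resume
 * near where the previous read() stopped instead of rescanning the tables
 * from the beginning.
 */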
2138 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2139 {
2140 	struct tcp_iter_state *st = seq->private;
2141 	void *rc;
2142 
2143 	if (*pos && *pos == st->last_pos) {
2144 		rc = tcp_seek_last_pos(seq);
2145 		if (rc)
2146 			goto out;
2147 	}
2148 
2149 	st->state = TCP_SEQ_STATE_LISTENING;
2150 	st->num = 0;
2151 	st->bucket = 0;
2152 	st->offset = 0;
2153 	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2154 
2155 out:
2156 	st->last_pos = *pos;
2157 	return rc;
2158 }
2159 
2160 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2161 {
2162 	struct tcp_iter_state *st = seq->private;
2163 	void *rc = NULL;
2164 
2165 	if (v == SEQ_START_TOKEN) {
2166 		rc = tcp_get_idx(seq, 0);
2167 		goto out;
2168 	}
2169 
2170 	switch (st->state) {
2171 	case TCP_SEQ_STATE_LISTENING:
2172 		rc = listening_get_next(seq, v);
2173 		if (!rc) {
2174 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2175 			st->bucket = 0;
2176 			st->offset = 0;
2177 			rc	  = established_get_first(seq);
2178 		}
2179 		break;
2180 	case TCP_SEQ_STATE_ESTABLISHED:
2181 		rc = established_get_next(seq, v);
2182 		break;
2183 	}
2184 out:
2185 	++*pos;
2186 	st->last_pos = *pos;
2187 	return rc;
2188 }
2189 
2190 static void tcp_seq_stop(struct seq_file *seq, void *v)
2191 {
2192 	struct tcp_iter_state *st = seq->private;
2193 
2194 	switch (st->state) {
2195 	case TCP_SEQ_STATE_LISTENING:
2196 		if (v != SEQ_START_TOKEN)
2197 			spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
2198 		break;
2199 	case TCP_SEQ_STATE_ESTABLISHED:
2200 		if (v)
2201 			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2202 		break;
2203 	}
2204 }
2205 
2206 int tcp_seq_open(struct inode *inode, struct file *file)
2207 {
2208 	struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
2209 	struct tcp_iter_state *s;
2210 	int err;
2211 
2212 	err = seq_open_net(inode, file, &afinfo->seq_ops,
2213 			  sizeof(struct tcp_iter_state));
2214 	if (err < 0)
2215 		return err;
2216 
2217 	s = ((struct seq_file *)file->private_data)->private;
2218 	s->family		= afinfo->family;
2219 	s->last_pos		= 0;
2220 	return 0;
2221 }
2222 EXPORT_SYMBOL(tcp_seq_open);
2223 
2224 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2225 {
2226 	int rc = 0;
2227 	struct proc_dir_entry *p;
2228 
2229 	afinfo->seq_ops.start		= tcp_seq_start;
2230 	afinfo->seq_ops.next		= tcp_seq_next;
2231 	afinfo->seq_ops.stop		= tcp_seq_stop;
2232 
2233 	p = proc_create_data(afinfo->name, 0444, net->proc_net,
2234 			     afinfo->seq_fops, afinfo);
2235 	if (!p)
2236 		rc = -ENOMEM;
2237 	return rc;
2238 }
2239 EXPORT_SYMBOL(tcp_proc_register);
2240 
2241 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2242 {
2243 	remove_proc_entry(afinfo->name, net->proc_net);
2244 }
2245 EXPORT_SYMBOL(tcp_proc_unregister);
2246 
2247 static void get_openreq4(const struct request_sock *req,
2248 			 struct seq_file *f, int i)
2249 {
2250 	const struct inet_request_sock *ireq = inet_rsk(req);
2251 	long delta = req->rsk_timer.expires - jiffies;
2252 
2253 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2254 		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2255 		i,
2256 		ireq->ir_loc_addr,
2257 		ireq->ir_num,
2258 		ireq->ir_rmt_addr,
2259 		ntohs(ireq->ir_rmt_port),
2260 		TCP_SYN_RECV,
2261 		0, 0, /* could print option size, but that is af dependent. */
2262 		1,    /* timers active (only the expire timer) */
2263 		jiffies_delta_to_clock_t(delta),
2264 		req->num_timeout,
2265 		from_kuid_munged(seq_user_ns(f),
2266 				 sock_i_uid(req->rsk_listener)),
2267 		0,  /* non standard timer */
2268 		0, /* open_requests have no inode */
2269 		0,
2270 		req);
2271 }
2272 
2273 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2274 {
2275 	int timer_active;
2276 	unsigned long timer_expires;
2277 	const struct tcp_sock *tp = tcp_sk(sk);
2278 	const struct inet_connection_sock *icsk = inet_csk(sk);
2279 	const struct inet_sock *inet = inet_sk(sk);
2280 	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2281 	__be32 dest = inet->inet_daddr;
2282 	__be32 src = inet->inet_rcv_saddr;
2283 	__u16 destp = ntohs(inet->inet_dport);
2284 	__u16 srcp = ntohs(inet->inet_sport);
2285 	int rx_queue;
2286 	int state;
2287 
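
	/* timer_active encodes which timer is pending for the "tr" column
	 * of /proc/net/tcp: 1 retransmit/loss-probe, 2 keepalive (sk_timer),
	 * 4 zero window probe, 0 none.
	 */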
2288 	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2289 	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2290 	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2291 		timer_active	= 1;
2292 		timer_expires	= icsk->icsk_timeout;
2293 	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2294 		timer_active	= 4;
2295 		timer_expires	= icsk->icsk_timeout;
2296 	} else if (timer_pending(&sk->sk_timer)) {
2297 		timer_active	= 2;
2298 		timer_expires	= sk->sk_timer.expires;
2299 	} else {
2300 		timer_active	= 0;
2301 		timer_expires = jiffies;
2302 	}
2303 
2304 	state = inet_sk_state_load(sk);
2305 	if (state == TCP_LISTEN)
2306 		rx_queue = sk->sk_ack_backlog;
2307 	else
2308 		/* Because we don't lock the socket,
2309 		 * we might find a transient negative value.
2310 		 */
2311 		rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2312 
2313 	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2314 			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2315 		i, src, srcp, dest, destp, state,
2316 		tp->write_seq - tp->snd_una,
2317 		rx_queue,
2318 		timer_active,
2319 		jiffies_delta_to_clock_t(timer_expires - jiffies),
2320 		icsk->icsk_retransmits,
2321 		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2322 		icsk->icsk_probes_out,
2323 		sock_i_ino(sk),
2324 		refcount_read(&sk->sk_refcnt), sk,
2325 		jiffies_to_clock_t(icsk->icsk_rto),
2326 		jiffies_to_clock_t(icsk->icsk_ack.ato),
2327 		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2328 		tp->snd_cwnd,
2329 		state == TCP_LISTEN ?
2330 		    fastopenq->max_qlen :
2331 		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2332 }
2333 
2334 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2335 			       struct seq_file *f, int i)
2336 {
2337 	long delta = tw->tw_timer.expires - jiffies;
2338 	__be32 dest, src;
2339 	__u16 destp, srcp;
2340 
2341 	dest  = tw->tw_daddr;
2342 	src   = tw->tw_rcv_saddr;
2343 	destp = ntohs(tw->tw_dport);
2344 	srcp  = ntohs(tw->tw_sport);
2345 
2346 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2347 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2348 		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2349 		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2350 		refcount_read(&tw->tw_refcnt), tw);
2351 }
2352 
2353 #define TMPSZ 150
2354 
2355 static int tcp4_seq_show(struct seq_file *seq, void *v)
2356 {
2357 	struct tcp_iter_state *st;
2358 	struct sock *sk = v;
2359 
2360 	seq_setwidth(seq, TMPSZ - 1);
2361 	if (v == SEQ_START_TOKEN) {
2362 		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2363 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2364 			   "inode");
2365 		goto out;
2366 	}
2367 	st = seq->private;
2368 
2369 	if (sk->sk_state == TCP_TIME_WAIT)
2370 		get_timewait4_sock(v, seq, st->num);
2371 	else if (sk->sk_state == TCP_NEW_SYN_RECV)
2372 		get_openreq4(v, seq, st->num);
2373 	else
2374 		get_tcp4_sock(v, seq, st->num);
2375 out:
2376 	seq_pad(seq, '\n');
2377 	return 0;
2378 }
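
/* An illustrative /proc/net/tcp entry as formatted by get_tcp4_sock()
 * (example values only, not captured from a live system): a socket
 * listening on 0.0.0.0:22 would look roughly like
 *
 *    0: 00000000:0016 00000000:0000 0A 00000000:00000000 00:00000000 00000000     0        0 12345 1 0000000000000000 100 0 0 10 0
 */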
2379 
2380 static const struct file_operations tcp_afinfo_seq_fops = {
2381 	.open    = tcp_seq_open,
2382 	.read    = seq_read,
2383 	.llseek  = seq_lseek,
2384 	.release = seq_release_net
2385 };
2386 
2387 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2388 	.name		= "tcp",
2389 	.family		= AF_INET,
2390 	.seq_fops	= &tcp_afinfo_seq_fops,
2391 	.seq_ops	= {
2392 		.show		= tcp4_seq_show,
2393 	},
2394 };
2395 
2396 static int __net_init tcp4_proc_init_net(struct net *net)
2397 {
2398 	return tcp_proc_register(net, &tcp4_seq_afinfo);
2399 }
2400 
2401 static void __net_exit tcp4_proc_exit_net(struct net *net)
2402 {
2403 	tcp_proc_unregister(net, &tcp4_seq_afinfo);
2404 }
2405 
2406 static struct pernet_operations tcp4_net_ops = {
2407 	.init = tcp4_proc_init_net,
2408 	.exit = tcp4_proc_exit_net,
2409 };
2410 
2411 int __init tcp4_proc_init(void)
2412 {
2413 	return register_pernet_subsys(&tcp4_net_ops);
2414 }
2415 
2416 void tcp4_proc_exit(void)
2417 {
2418 	unregister_pernet_subsys(&tcp4_net_ops);
2419 }
2420 #endif /* CONFIG_PROC_FS */
2421 
2422 struct proto tcp_prot = {
2423 	.name			= "TCP",
2424 	.owner			= THIS_MODULE,
2425 	.close			= tcp_close,
2426 	.pre_connect		= tcp_v4_pre_connect,
2427 	.connect		= tcp_v4_connect,
2428 	.disconnect		= tcp_disconnect,
2429 	.accept			= inet_csk_accept,
2430 	.ioctl			= tcp_ioctl,
2431 	.init			= tcp_v4_init_sock,
2432 	.destroy		= tcp_v4_destroy_sock,
2433 	.shutdown		= tcp_shutdown,
2434 	.setsockopt		= tcp_setsockopt,
2435 	.getsockopt		= tcp_getsockopt,
2436 	.keepalive		= tcp_set_keepalive,
2437 	.recvmsg		= tcp_recvmsg,
2438 	.sendmsg		= tcp_sendmsg,
2439 	.sendpage		= tcp_sendpage,
2440 	.backlog_rcv		= tcp_v4_do_rcv,
2441 	.release_cb		= tcp_release_cb,
2442 	.hash			= inet_hash,
2443 	.unhash			= inet_unhash,
2444 	.get_port		= inet_csk_get_port,
2445 	.enter_memory_pressure	= tcp_enter_memory_pressure,
2446 	.leave_memory_pressure	= tcp_leave_memory_pressure,
2447 	.stream_memory_free	= tcp_stream_memory_free,
2448 	.sockets_allocated	= &tcp_sockets_allocated,
2449 	.orphan_count		= &tcp_orphan_count,
2450 	.memory_allocated	= &tcp_memory_allocated,
2451 	.memory_pressure	= &tcp_memory_pressure,
2452 	.sysctl_mem		= sysctl_tcp_mem,
2453 	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
2454 	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
2455 	.max_header		= MAX_TCP_HEADER,
2456 	.obj_size		= sizeof(struct tcp_sock),
2457 	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
2458 	.twsk_prot		= &tcp_timewait_sock_ops,
2459 	.rsk_prot		= &tcp_request_sock_ops,
2460 	.h.hashinfo		= &tcp_hashinfo,
2461 	.no_autobind		= true,
2462 #ifdef CONFIG_COMPAT
2463 	.compat_setsockopt	= compat_tcp_setsockopt,
2464 	.compat_getsockopt	= compat_tcp_getsockopt,
2465 #endif
2466 	.diag_destroy		= tcp_abort,
2467 };
2468 EXPORT_SYMBOL(tcp_prot);
2469 
2470 static void __net_exit tcp_sk_exit(struct net *net)
2471 {
2472 	int cpu;
2473 
2474 	module_put(net->ipv4.tcp_congestion_control->owner);
2475 
2476 	for_each_possible_cpu(cpu)
2477 		inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2478 	free_percpu(net->ipv4.tcp_sk);
2479 }
2480 
2481 static int __net_init tcp_sk_init(struct net *net)
2482 {
2483 	int res, cpu, cnt;
2484 
2485 	net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2486 	if (!net->ipv4.tcp_sk)
2487 		return -ENOMEM;
2488 
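	/* These per-cpu kernel control sockets carry packets that are not
	 * tied to any full socket, such as the RSTs and ACKs built by
	 * tcp_v4_send_reset() and tcp_v4_send_ack().
	 */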
2489 	for_each_possible_cpu(cpu) {
2490 		struct sock *sk;
2491 
2492 		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2493 					   IPPROTO_TCP, net);
2494 		if (res)
2495 			goto fail;
2496 		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2497 		*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2498 	}
2499 
2500 	net->ipv4.sysctl_tcp_ecn = 2;
2501 	net->ipv4.sysctl_tcp_ecn_fallback = 1;
2502 
2503 	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2504 	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2505 	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2506 
2507 	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2508 	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2509 	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2510 
2511 	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2512 	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2513 	net->ipv4.sysctl_tcp_syncookies = 1;
2514 	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2515 	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2516 	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2517 	net->ipv4.sysctl_tcp_orphan_retries = 0;
2518 	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2519 	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
2520 	net->ipv4.sysctl_tcp_tw_reuse = 0;
2521 
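	/* Scale the TIME-WAIT bucket and SYN backlog limits with the size
	 * of the established hash table.
	 */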
2522 	cnt = tcp_hashinfo.ehash_mask + 1;
2523 	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = (cnt + 1) / 2;
2524 	net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
2525 
2526 	net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 256);
2527 	net->ipv4.sysctl_tcp_sack = 1;
2528 	net->ipv4.sysctl_tcp_window_scaling = 1;
2529 	net->ipv4.sysctl_tcp_timestamps = 1;
2530 	net->ipv4.sysctl_tcp_early_retrans = 3;
2531 	net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
2532 	net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
2533 	net->ipv4.sysctl_tcp_retrans_collapse = 1;
2534 	net->ipv4.sysctl_tcp_max_reordering = 300;
2535 	net->ipv4.sysctl_tcp_dsack = 1;
2536 	net->ipv4.sysctl_tcp_app_win = 31;
2537 	net->ipv4.sysctl_tcp_adv_win_scale = 1;
2538 	net->ipv4.sysctl_tcp_frto = 2;
2539 	net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
2540 	/* This limits the percentage of the congestion window which we
2541 	 * will allow a single TSO frame to consume.  Building TSO frames
2542 	 * which are too large can cause TCP streams to be bursty.
2543 	 */
2544 	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
2545 	/* Default TSQ limit of four TSO segments */
2546 	net->ipv4.sysctl_tcp_limit_output_bytes = 262144;
2547 	/* rfc5961 challenge ack rate limiting */
2548 	net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
2549 	net->ipv4.sysctl_tcp_min_tso_segs = 2;
2550 	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
2551 	net->ipv4.sysctl_tcp_autocorking = 1;
2552 	net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
2553 	net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
2554 	net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
2555 	if (net != &init_net) {
2556 		memcpy(net->ipv4.sysctl_tcp_rmem,
2557 		       init_net.ipv4.sysctl_tcp_rmem,
2558 		       sizeof(init_net.ipv4.sysctl_tcp_rmem));
2559 		memcpy(net->ipv4.sysctl_tcp_wmem,
2560 		       init_net.ipv4.sysctl_tcp_wmem,
2561 		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
2562 	}
2563 	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
2564 	spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
2565 	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
2566 	atomic_set(&net->ipv4.tfo_active_disable_times, 0);
2567 
2568 	/* Reno is always built in */
2569 	if (!net_eq(net, &init_net) &&
2570 	    try_module_get(init_net.ipv4.tcp_congestion_control->owner))
2571 		net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
2572 	else
2573 		net->ipv4.tcp_congestion_control = &tcp_reno;
2574 
2575 	return 0;
2576 fail:
2577 	tcp_sk_exit(net);
2578 
2579 	return res;
2580 }
2581 
2582 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2583 {
2584 	struct net *net;
2585 
2586 	inet_twsk_purge(&tcp_hashinfo, AF_INET);
2587 
2588 	list_for_each_entry(net, net_exit_list, exit_list)
2589 		tcp_fastopen_ctx_destroy(net);
2590 }
2591 
2592 static struct pernet_operations __net_initdata tcp_sk_ops = {
2593        .init	   = tcp_sk_init,
2594        .exit	   = tcp_sk_exit,
2595        .exit_batch = tcp_sk_exit_batch,
2596 };
2597 
2598 void __init tcp_v4_init(void)
2599 {
2600 	if (register_pernet_subsys(&tcp_sk_ops))
2601 		panic("Failed to create the TCP control socket.\n");
2602 }
2603