xref: /openbmc/linux/net/ipv4/tcp_ipv4.c (revision 133f9794)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		Implementation of the Transmission Control Protocol(TCP).
7  *
8  *		IPv4 specific functions
9  *
10  *
11  *		code split from:
12  *		linux/ipv4/tcp.c
13  *		linux/ipv4/tcp_input.c
14  *		linux/ipv4/tcp_output.c
15  *
16  *		See tcp.c for author information
17  *
18  *	This program is free software; you can redistribute it and/or
19  *      modify it under the terms of the GNU General Public License
20  *      as published by the Free Software Foundation; either version
21  *      2 of the License, or (at your option) any later version.
22  */
23 
24 /*
25  * Changes:
26  *		David S. Miller	:	New socket lookup architecture.
27  *					This code is dedicated to John Dyson.
28  *		David S. Miller :	Change semantics of established hash,
29  *					half is devoted to TIME_WAIT sockets
30  *					and the rest go in the other half.
31  *		Andi Kleen :		Add support for syncookies and fixed
32  *					some bugs: ip options weren't passed to
33  *					the TCP layer, missed a check for an
34  *					ACK bit.
35  *		Andi Kleen :		Implemented fast path mtu discovery.
36  *	     				Fixed many serious bugs in the
37  *					request_sock handling and moved
38  *					most of it into the af independent code.
39  *					Added tail drop and some other bugfixes.
40  *					Added new listen semantics.
41  *		Mike McLagan	:	Routing by source
42  *	Juan Jose Ciarlante:		ip_dynaddr bits
43  *		Andi Kleen:		various fixes.
44  *	Vitaly E. Lavrov	:	Transparent proxy revived after year
45  *					coma.
46  *	Andi Kleen		:	Fix new listen.
47  *	Andi Kleen		:	Fix accept error reporting.
48  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
49  *	Alexey Kuznetsov		allows both IPv4 and IPv6 sockets to bind
50  *					a single port at the same time.
51  */
52 
53 #define pr_fmt(fmt) "TCP: " fmt
54 
55 #include <linux/bottom_half.h>
56 #include <linux/types.h>
57 #include <linux/fcntl.h>
58 #include <linux/module.h>
59 #include <linux/random.h>
60 #include <linux/cache.h>
61 #include <linux/jhash.h>
62 #include <linux/init.h>
63 #include <linux/times.h>
64 #include <linux/slab.h>
65 
66 #include <net/net_namespace.h>
67 #include <net/icmp.h>
68 #include <net/inet_hashtables.h>
69 #include <net/tcp.h>
70 #include <net/transp_v6.h>
71 #include <net/ipv6.h>
72 #include <net/inet_common.h>
73 #include <net/timewait_sock.h>
74 #include <net/xfrm.h>
75 #include <net/secure_seq.h>
76 #include <net/busy_poll.h>
77 
78 #include <linux/inet.h>
79 #include <linux/ipv6.h>
80 #include <linux/stddef.h>
81 #include <linux/proc_fs.h>
82 #include <linux/seq_file.h>
83 #include <linux/inetdevice.h>
84 
85 #include <crypto/hash.h>
86 #include <linux/scatterlist.h>
87 
88 #include <trace/events/tcp.h>
89 
90 #ifdef CONFIG_TCP_MD5SIG
91 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
92 			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
93 #endif
94 
95 struct inet_hashinfo tcp_hashinfo;
96 EXPORT_SYMBOL(tcp_hashinfo);
97 
98 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
99 {
100 	return secure_tcp_seq(ip_hdr(skb)->daddr,
101 			      ip_hdr(skb)->saddr,
102 			      tcp_hdr(skb)->dest,
103 			      tcp_hdr(skb)->source);
104 }
105 
106 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
107 {
108 	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
109 }
110 
111 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
112 {
113 	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
114 	struct tcp_sock *tp = tcp_sk(sk);
115 
116 	/* With PAWS, this is safe from the viewpoint
117 	   of data integrity. Even without PAWS it is safe provided sequence
118 	   spaces do not overlap, i.e. at data rates <= 80 Mbit/sec.
119 
120 	   Actually, the idea is close to VJ's: only the timestamp cache is
121 	   held not per host but per port pair, and the TW bucket is used as
122 	   the state holder.
123 
124 	   If the TW bucket has already been destroyed, we fall back to VJ's
125 	   scheme and use the initial timestamp retrieved from the peer table.
126 	 */
127 	if (tcptw->tw_ts_recent_stamp &&
128 	    (!twp || (sock_net(sk)->ipv4.sysctl_tcp_tw_reuse &&
129 			     get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
130 		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
131 		if (tp->write_seq == 0)
132 			tp->write_seq = 1;
133 		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
134 		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
135 		sock_hold(sktw);
136 		return 1;
137 	}
138 
139 	return 0;
140 }
141 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
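/* Editor's note (illustrative addition, not part of the original file): the
 * reuse above only triggers when the tcp_tw_reuse sysctl is enabled and the
 * TIME-WAIT socket recorded its last timestamp more than one second ago,
 * e.g. after:
 *
 *	sysctl -w net.ipv4.tcp_tw_reuse=1
 *
 * The new connection then continues the old sequence space at
 * tw_snd_nxt + 65535 + 2, so old duplicates cannot be mistaken for new data.
 */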
142 
143 /* This will initiate an outgoing connection. */
144 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
145 {
146 	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
147 	struct inet_sock *inet = inet_sk(sk);
148 	struct tcp_sock *tp = tcp_sk(sk);
149 	__be16 orig_sport, orig_dport;
150 	__be32 daddr, nexthop;
151 	struct flowi4 *fl4;
152 	struct rtable *rt;
153 	int err;
154 	struct ip_options_rcu *inet_opt;
155 	struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
156 
157 	if (addr_len < sizeof(struct sockaddr_in))
158 		return -EINVAL;
159 
160 	if (usin->sin_family != AF_INET)
161 		return -EAFNOSUPPORT;
162 
163 	nexthop = daddr = usin->sin_addr.s_addr;
164 	inet_opt = rcu_dereference_protected(inet->inet_opt,
165 					     lockdep_sock_is_held(sk));
166 	if (inet_opt && inet_opt->opt.srr) {
167 		if (!daddr)
168 			return -EINVAL;
169 		nexthop = inet_opt->opt.faddr;
170 	}
171 
172 	orig_sport = inet->inet_sport;
173 	orig_dport = usin->sin_port;
174 	fl4 = &inet->cork.fl.u.ip4;
175 	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
176 			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
177 			      IPPROTO_TCP,
178 			      orig_sport, orig_dport, sk);
179 	if (IS_ERR(rt)) {
180 		err = PTR_ERR(rt);
181 		if (err == -ENETUNREACH)
182 			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
183 		return err;
184 	}
185 
186 	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
187 		ip_rt_put(rt);
188 		return -ENETUNREACH;
189 	}
190 
191 	if (!inet_opt || !inet_opt->opt.srr)
192 		daddr = fl4->daddr;
193 
194 	if (!inet->inet_saddr)
195 		inet->inet_saddr = fl4->saddr;
196 	sk_rcv_saddr_set(sk, inet->inet_saddr);
197 
198 	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
199 		/* Reset inherited state */
200 		tp->rx_opt.ts_recent	   = 0;
201 		tp->rx_opt.ts_recent_stamp = 0;
202 		if (likely(!tp->repair))
203 			tp->write_seq	   = 0;
204 	}
205 
206 	inet->inet_dport = usin->sin_port;
207 	sk_daddr_set(sk, daddr);
208 
209 	inet_csk(sk)->icsk_ext_hdr_len = 0;
210 	if (inet_opt)
211 		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
212 
213 	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
214 
215 	/* Socket identity is still unknown (sport may be zero).
216 	 * However we set the state to SYN-SENT and, without releasing the
217 	 * socket lock, select a source port, enter ourselves into the hash
218 	 * tables and complete initialization after this.
219 	 */
220 	tcp_set_state(sk, TCP_SYN_SENT);
221 	err = inet_hash_connect(tcp_death_row, sk);
222 	if (err)
223 		goto failure;
224 
225 	sk_set_txhash(sk);
226 
227 	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
228 			       inet->inet_sport, inet->inet_dport, sk);
229 	if (IS_ERR(rt)) {
230 		err = PTR_ERR(rt);
231 		rt = NULL;
232 		goto failure;
233 	}
234 	/* OK, now commit destination to socket.  */
235 	sk->sk_gso_type = SKB_GSO_TCPV4;
236 	sk_setup_caps(sk, &rt->dst);
237 	rt = NULL;
238 
239 	if (likely(!tp->repair)) {
240 		if (!tp->write_seq)
241 			tp->write_seq = secure_tcp_seq(inet->inet_saddr,
242 						       inet->inet_daddr,
243 						       inet->inet_sport,
244 						       usin->sin_port);
245 		tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
246 						 inet->inet_saddr,
247 						 inet->inet_daddr);
248 	}
249 
250 	inet->inet_id = tp->write_seq ^ jiffies;
251 
252 	if (tcp_fastopen_defer_connect(sk, &err))
253 		return err;
254 	if (err)
255 		goto failure;
256 
257 	err = tcp_connect(sk);
258 
259 	if (err)
260 		goto failure;
261 
262 	return 0;
263 
264 failure:
265 	/*
266 	 * This unhashes the socket and releases the local port,
267 	 * if necessary.
268 	 */
269 	tcp_set_state(sk, TCP_CLOSE);
270 	ip_rt_put(rt);
271 	sk->sk_route_caps = 0;
272 	inet->inet_dport = 0;
273 	return err;
274 }
275 EXPORT_SYMBOL(tcp_v4_connect);
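/* Editor's note: a minimal userspace sketch (not part of this file) of the
 * connect() call that ends up in tcp_v4_connect() above; the address and
 * port are illustrative only.
 *
 *	#include <arpa/inet.h>
 *	#include <sys/socket.h>
 *
 *	static int connect_example(void)
 *	{
 *		struct sockaddr_in dst = { .sin_family = AF_INET };
 *		int fd = socket(AF_INET, SOCK_STREAM, 0);
 *
 *		dst.sin_port = htons(80);
 *		inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);
 *		return connect(fd, (struct sockaddr *)&dst, sizeof(dst));
 *	}
 */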
276 
277 /*
278  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
279  * It can be called through tcp_release_cb() if socket was owned by user
280  * at the time tcp_v4_err() was called to handle ICMP message.
281  */
282 void tcp_v4_mtu_reduced(struct sock *sk)
283 {
284 	struct inet_sock *inet = inet_sk(sk);
285 	struct dst_entry *dst;
286 	u32 mtu;
287 
288 	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
289 		return;
290 	mtu = tcp_sk(sk)->mtu_info;
291 	dst = inet_csk_update_pmtu(sk, mtu);
292 	if (!dst)
293 		return;
294 
295 	/* Something is about to go wrong... Remember the soft error
296 	 * in case this connection is not able to recover.
297 	 */
298 	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
299 		sk->sk_err_soft = EMSGSIZE;
300 
301 	mtu = dst_mtu(dst);
302 
303 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
304 	    ip_sk_accept_pmtu(sk) &&
305 	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
306 		tcp_sync_mss(sk, mtu);
307 
308 		/* Resend the TCP packet because it's
309 		 * clear that the old packet has been
310 		 * dropped. This is the new "fast" path mtu
311 		 * discovery.
312 		 */
313 		tcp_simple_retransmit(sk);
314 	} /* else let the usual retransmit timer handle it */
315 }
316 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
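/* Editor's note: a worked example for the path above. If an ICMP
 * FRAG_NEEDED report lowers the path MTU to 1400 bytes, tcp_sync_mss()
 * ends up with an MSS of at most 1400 - 20 (IPv4 header) - 20 (TCP header)
 * = 1360 bytes, minus any TCP options in use, and the dropped segment is
 * retransmitted at the new, smaller size.
 */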
317 
318 static void do_redirect(struct sk_buff *skb, struct sock *sk)
319 {
320 	struct dst_entry *dst = __sk_dst_check(sk, 0);
321 
322 	if (dst)
323 		dst->ops->redirect(dst, sk, skb);
324 }
325 
326 
327 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
328 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
329 {
330 	struct request_sock *req = inet_reqsk(sk);
331 	struct net *net = sock_net(sk);
332 
333 	/* ICMPs are not backlogged, hence we cannot get
334 	 * an established socket here.
335 	 */
336 	if (seq != tcp_rsk(req)->snt_isn) {
337 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
338 	} else if (abort) {
339 		/*
340 		 * Still in SYN_RECV, just remove it silently.
341 		 * There is no good way to pass the error to the newly
342 		 * created socket, and POSIX does not want network
343 		 * errors returned from accept().
344 		 */
345 		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
346 		tcp_listendrop(req->rsk_listener);
347 	}
348 	reqsk_put(req);
349 }
350 EXPORT_SYMBOL(tcp_req_err);
351 
352 /*
353  * This routine is called by the ICMP module when it gets some
354  * sort of error condition.  If err < 0 then the socket should
355  * be closed and the error returned to the user.  If err > 0
356  * it's just the icmp type << 8 | icmp code.  After adjustment
357  * header points to the first 8 bytes of the tcp header.  We need
358  * to find the appropriate port.
359  *
360  * The locking strategy used here is very "optimistic". When
361  * someone else accesses the socket the ICMP is just dropped
362  * and for some paths there is no check at all.
363  * A more general error queue to queue errors for later handling
364  * is probably better.
365  *
366  */
367 
368 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
369 {
370 	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
371 	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
372 	struct inet_connection_sock *icsk;
373 	struct tcp_sock *tp;
374 	struct inet_sock *inet;
375 	const int type = icmp_hdr(icmp_skb)->type;
376 	const int code = icmp_hdr(icmp_skb)->code;
377 	struct sock *sk;
378 	struct sk_buff *skb;
379 	struct request_sock *fastopen;
380 	u32 seq, snd_una;
381 	s32 remaining;
382 	u32 delta_us;
383 	int err;
384 	struct net *net = dev_net(icmp_skb->dev);
385 
386 	sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
387 				       th->dest, iph->saddr, ntohs(th->source),
388 				       inet_iif(icmp_skb), 0);
389 	if (!sk) {
390 		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
391 		return;
392 	}
393 	if (sk->sk_state == TCP_TIME_WAIT) {
394 		inet_twsk_put(inet_twsk(sk));
395 		return;
396 	}
397 	seq = ntohl(th->seq);
398 	if (sk->sk_state == TCP_NEW_SYN_RECV)
399 		return tcp_req_err(sk, seq,
400 				  type == ICMP_PARAMETERPROB ||
401 				  type == ICMP_TIME_EXCEEDED ||
402 				  (type == ICMP_DEST_UNREACH &&
403 				   (code == ICMP_NET_UNREACH ||
404 				    code == ICMP_HOST_UNREACH)));
405 
406 	bh_lock_sock(sk);
407 	/* If too many ICMPs get dropped on busy
408 	 * servers this needs to be solved differently.
409 	 * We do take care of the PMTU discovery (RFC1191) special case:
410 	 * we can receive locally generated ICMP messages while the socket is held.
411 	 */
412 	if (sock_owned_by_user(sk)) {
413 		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
414 			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
415 	}
416 	if (sk->sk_state == TCP_CLOSE)
417 		goto out;
418 
419 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
420 		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
421 		goto out;
422 	}
423 
424 	icsk = inet_csk(sk);
425 	tp = tcp_sk(sk);
426 	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
427 	fastopen = tp->fastopen_rsk;
428 	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
429 	if (sk->sk_state != TCP_LISTEN &&
430 	    !between(seq, snd_una, tp->snd_nxt)) {
431 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
432 		goto out;
433 	}
434 
435 	switch (type) {
436 	case ICMP_REDIRECT:
437 		if (!sock_owned_by_user(sk))
438 			do_redirect(icmp_skb, sk);
439 		goto out;
440 	case ICMP_SOURCE_QUENCH:
441 		/* Just silently ignore these. */
442 		goto out;
443 	case ICMP_PARAMETERPROB:
444 		err = EPROTO;
445 		break;
446 	case ICMP_DEST_UNREACH:
447 		if (code > NR_ICMP_UNREACH)
448 			goto out;
449 
450 		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
451 			/* We are not interested in TCP_LISTEN and open_requests
452 			 * (SYN-ACKs sent out by Linux are always < 576 bytes, so
453 			 * they should go through unfragmented).
454 			 */
455 			if (sk->sk_state == TCP_LISTEN)
456 				goto out;
457 
458 			tp->mtu_info = info;
459 			if (!sock_owned_by_user(sk)) {
460 				tcp_v4_mtu_reduced(sk);
461 			} else {
462 				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
463 					sock_hold(sk);
464 			}
465 			goto out;
466 		}
467 
468 		err = icmp_err_convert[code].errno;
469 		/* check if icmp_skb allows revert of backoff
470 		 * (see draft-zimmermann-tcp-lcd) */
471 		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
472 			break;
473 		if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
474 		    !icsk->icsk_backoff || fastopen)
475 			break;
476 
477 		if (sock_owned_by_user(sk))
478 			break;
479 
480 		icsk->icsk_backoff--;
481 		icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
482 					       TCP_TIMEOUT_INIT;
483 		icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
484 
485 		skb = tcp_rtx_queue_head(sk);
486 		BUG_ON(!skb);
487 
488 		tcp_mstamp_refresh(tp);
489 		delta_us = (u32)(tp->tcp_mstamp - skb->skb_mstamp);
490 		remaining = icsk->icsk_rto -
491 			    usecs_to_jiffies(delta_us);
492 
493 		if (remaining > 0) {
494 			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
495 						  remaining, TCP_RTO_MAX);
496 		} else {
497 			/* The reverted RTO has already expired;
498 			 * retransmit now. */
499 			tcp_retransmit_timer(sk);
500 		}
501 
502 		break;
503 	case ICMP_TIME_EXCEEDED:
504 		err = EHOSTUNREACH;
505 		break;
506 	default:
507 		goto out;
508 	}
509 
510 	switch (sk->sk_state) {
511 	case TCP_SYN_SENT:
512 	case TCP_SYN_RECV:
513 		/* Only in fast or simultaneous open. If a fast open socket
514 		 * is already accepted, it is treated as a connected one below.
515 		 */
516 		if (fastopen && !fastopen->sk)
517 			break;
518 
519 		if (!sock_owned_by_user(sk)) {
520 			sk->sk_err = err;
521 
522 			sk->sk_error_report(sk);
523 
524 			tcp_done(sk);
525 		} else {
526 			sk->sk_err_soft = err;
527 		}
528 		goto out;
529 	}
530 
531 	/* If we've already connected we will keep trying
532 	 * until we time out, or the user gives up.
533 	 *
534 	 * RFC 1122 4.2.3.9 allows us to treat only PROTO_UNREACH and
535 	 * PORT_UNREACH as hard errors (well, FRAG_FAILED too, but it is
536 	 * obsoleted by PMTU discovery).
537 	 *
538 	 * Note that in the modern internet, where routing is unreliable
539 	 * and broken firewalls sit in every dark corner sending random
540 	 * errors ordered by their masters, even these two messages have
541 	 * lost their original sense (even Linux sends invalid PORT_UNREACHs).
542 	 *
543 	 * Now we are in compliance with RFCs.
544 	 *							--ANK (980905)
545 	 */
546 
547 	inet = inet_sk(sk);
548 	if (!sock_owned_by_user(sk) && inet->recverr) {
549 		sk->sk_err = err;
550 		sk->sk_error_report(sk);
551 	} else	{ /* Only an error on timeout */
552 		sk->sk_err_soft = err;
553 	}
554 
555 out:
556 	bh_unlock_sock(sk);
557 	sock_put(sk);
558 }
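/* Editor's note: when tcp_v4_err() above records a hard error, a pending
 * connect()/recv() fails with that errno, and the value can also be read
 * with SO_ERROR. A minimal userspace sketch (illustrative only):
 *
 *	static int pending_error_example(int fd)
 *	{
 *		int err = 0;
 *		socklen_t len = sizeof(err);
 *
 *		getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &len);
 *		return err;
 *	}
 */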
559 
560 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
561 {
562 	struct tcphdr *th = tcp_hdr(skb);
563 
564 	if (skb->ip_summed == CHECKSUM_PARTIAL) {
565 		th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
566 		skb->csum_start = skb_transport_header(skb) - skb->head;
567 		skb->csum_offset = offsetof(struct tcphdr, check);
568 	} else {
569 		th->check = tcp_v4_check(skb->len, saddr, daddr,
570 					 csum_partial(th,
571 						      th->doff << 2,
572 						      skb->csum));
573 	}
574 }
575 
576 /* This routine computes an IPv4 TCP checksum. */
577 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
578 {
579 	const struct inet_sock *inet = inet_sk(sk);
580 
581 	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
582 }
583 EXPORT_SYMBOL(tcp_v4_send_check);
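/* Editor's note: __tcp_v4_send_check() above computes the standard ones'
 * complement internet checksum over the pseudo-header (saddr, daddr, zero,
 * protocol, TCP length) plus the TCP segment. A minimal userspace-style
 * sketch of that fold (the kernel uses the optimized csum_* helpers instead):
 *
 *	static unsigned short csum_fold_example(const unsigned char *p,
 *						int len, unsigned long sum)
 *	{
 *		while (len > 1) {
 *			sum += (unsigned long)p[0] << 8 | p[1];
 *			p += 2;
 *			len -= 2;
 *		}
 *		if (len)
 *			sum += (unsigned long)p[0] << 8;
 *		while (sum >> 16)
 *			sum = (sum & 0xffff) + (sum >> 16);
 *		return (unsigned short)~sum;
 *	}
 */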
584 
585 /*
586  *	This routine will send an RST to the other tcp.
587  *
588  *	Someone asks: why do I NEVER use socket parameters (TOS, TTL, etc.)
589  *		      for the reset?
590  *	Answer: if a packet caused an RST, it is not for a socket
591  *		existing in our system; if it is matched to a socket,
592  *		it is just a duplicate segment or a bug in the other side's
593  *		TCP. So we build the reply based only on the parameters
594  *		that arrived with the segment.
595  *	Exception: precedence violation. We do not implement it in any case.
596  */
597 
598 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
599 {
600 	const struct tcphdr *th = tcp_hdr(skb);
601 	struct {
602 		struct tcphdr th;
603 #ifdef CONFIG_TCP_MD5SIG
604 		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
605 #endif
606 	} rep;
607 	struct ip_reply_arg arg;
608 #ifdef CONFIG_TCP_MD5SIG
609 	struct tcp_md5sig_key *key = NULL;
610 	const __u8 *hash_location = NULL;
611 	unsigned char newhash[16];
612 	int genhash;
613 	struct sock *sk1 = NULL;
614 #endif
615 	struct net *net;
616 
617 	/* Never send a reset in response to a reset. */
618 	if (th->rst)
619 		return;
620 
621 	/* If sk is not NULL, it means we did a successful lookup and the incoming
622 	 * route had to be correct. prequeue might have dropped our dst.
623 	 */
624 	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
625 		return;
626 
627 	/* Swap the send and the receive. */
628 	memset(&rep, 0, sizeof(rep));
629 	rep.th.dest   = th->source;
630 	rep.th.source = th->dest;
631 	rep.th.doff   = sizeof(struct tcphdr) / 4;
632 	rep.th.rst    = 1;
633 
634 	if (th->ack) {
635 		rep.th.seq = th->ack_seq;
636 	} else {
637 		rep.th.ack = 1;
638 		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
639 				       skb->len - (th->doff << 2));
640 	}
641 
642 	memset(&arg, 0, sizeof(arg));
643 	arg.iov[0].iov_base = (unsigned char *)&rep;
644 	arg.iov[0].iov_len  = sizeof(rep.th);
645 
646 	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
647 #ifdef CONFIG_TCP_MD5SIG
648 	rcu_read_lock();
649 	hash_location = tcp_parse_md5sig_option(th);
650 	if (sk && sk_fullsock(sk)) {
651 		key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
652 					&ip_hdr(skb)->saddr, AF_INET);
653 	} else if (hash_location) {
654 		/*
655 		 * The active side is lost. Try to find the listening socket
656 		 * through the source port, and then find the md5 key through
657 		 * the listening socket. We do not lose any security here:
658 		 * the incoming packet is checked against the md5 hash of the
659 		 * key we find, and no RST is generated if the hash doesn't match.
660 		 */
661 		sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
662 					     ip_hdr(skb)->saddr,
663 					     th->source, ip_hdr(skb)->daddr,
664 					     ntohs(th->source), inet_iif(skb),
665 					     tcp_v4_sdif(skb));
666 		/* don't send an RST if we can't find a key */
667 		if (!sk1)
668 			goto out;
669 
670 		key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
671 					&ip_hdr(skb)->saddr, AF_INET);
672 		if (!key)
673 			goto out;
674 
675 
676 		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
677 		if (genhash || memcmp(hash_location, newhash, 16) != 0)
678 			goto out;
679 
680 	}
681 
682 	if (key) {
683 		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
684 				   (TCPOPT_NOP << 16) |
685 				   (TCPOPT_MD5SIG << 8) |
686 				   TCPOLEN_MD5SIG);
687 		/* Update length and the length the header thinks exists */
688 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
689 		rep.th.doff = arg.iov[0].iov_len / 4;
690 
691 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
692 				     key, ip_hdr(skb)->saddr,
693 				     ip_hdr(skb)->daddr, &rep.th);
694 	}
695 #endif
696 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
697 				      ip_hdr(skb)->saddr, /* XXX */
698 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
699 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
700 	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
701 
702 	/* When the socket is gone, all binding information is lost.
703 	 * Routing might fail in this case. No choice here: if we choose to force
704 	 * the input interface, we will misroute in the case of an asymmetric route.
705 	 */
706 	if (sk) {
707 		arg.bound_dev_if = sk->sk_bound_dev_if;
708 		if (sk_fullsock(sk))
709 			trace_tcp_send_reset(sk, skb);
710 	}
711 
712 	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
713 		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));
714 
715 	arg.tos = ip_hdr(skb)->tos;
716 	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
717 	local_bh_disable();
718 	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
719 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
720 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
721 			      &arg, arg.iov[0].iov_len);
722 
723 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
724 	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
725 	local_bh_enable();
726 
727 #ifdef CONFIG_TCP_MD5SIG
728 out:
729 	rcu_read_unlock();
730 #endif
731 }
732 
733 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
734    outside of socket context, is certainly ugly. What can I do?
735  */
736 
737 static void tcp_v4_send_ack(const struct sock *sk,
738 			    struct sk_buff *skb, u32 seq, u32 ack,
739 			    u32 win, u32 tsval, u32 tsecr, int oif,
740 			    struct tcp_md5sig_key *key,
741 			    int reply_flags, u8 tos)
742 {
743 	const struct tcphdr *th = tcp_hdr(skb);
744 	struct {
745 		struct tcphdr th;
746 		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
747 #ifdef CONFIG_TCP_MD5SIG
748 			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
749 #endif
750 			];
751 	} rep;
752 	struct net *net = sock_net(sk);
753 	struct ip_reply_arg arg;
754 
755 	memset(&rep.th, 0, sizeof(struct tcphdr));
756 	memset(&arg, 0, sizeof(arg));
757 
758 	arg.iov[0].iov_base = (unsigned char *)&rep;
759 	arg.iov[0].iov_len  = sizeof(rep.th);
760 	if (tsecr) {
761 		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
762 				   (TCPOPT_TIMESTAMP << 8) |
763 				   TCPOLEN_TIMESTAMP);
764 		rep.opt[1] = htonl(tsval);
765 		rep.opt[2] = htonl(tsecr);
766 		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
767 	}
768 
769 	/* Swap the send and the receive. */
770 	rep.th.dest    = th->source;
771 	rep.th.source  = th->dest;
772 	rep.th.doff    = arg.iov[0].iov_len / 4;
773 	rep.th.seq     = htonl(seq);
774 	rep.th.ack_seq = htonl(ack);
775 	rep.th.ack     = 1;
776 	rep.th.window  = htons(win);
777 
778 #ifdef CONFIG_TCP_MD5SIG
779 	if (key) {
780 		int offset = (tsecr) ? 3 : 0;
781 
782 		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
783 					  (TCPOPT_NOP << 16) |
784 					  (TCPOPT_MD5SIG << 8) |
785 					  TCPOLEN_MD5SIG);
786 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
787 		rep.th.doff = arg.iov[0].iov_len/4;
788 
789 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
790 				    key, ip_hdr(skb)->saddr,
791 				    ip_hdr(skb)->daddr, &rep.th);
792 	}
793 #endif
794 	arg.flags = reply_flags;
795 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
796 				      ip_hdr(skb)->saddr, /* XXX */
797 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
798 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
799 	if (oif)
800 		arg.bound_dev_if = oif;
801 	arg.tos = tos;
802 	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
803 	local_bh_disable();
804 	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
805 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
806 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
807 			      &arg, arg.iov[0].iov_len);
808 
809 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
810 	local_bh_enable();
811 }
812 
813 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
814 {
815 	struct inet_timewait_sock *tw = inet_twsk(sk);
816 	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
817 
818 	tcp_v4_send_ack(sk, skb,
819 			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
820 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
821 			tcp_time_stamp_raw() + tcptw->tw_ts_offset,
822 			tcptw->tw_ts_recent,
823 			tw->tw_bound_dev_if,
824 			tcp_twsk_md5_key(tcptw),
825 			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
826 			tw->tw_tos
827 			);
828 
829 	inet_twsk_put(tw);
830 }
831 
832 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
833 				  struct request_sock *req)
834 {
835 	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
836 	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
837 	 */
838 	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
839 					     tcp_sk(sk)->snd_nxt;
840 
841 	/* RFC 7323 2.3
842 	 * The window field (SEG.WND) of every outgoing segment, with the
843 	 * exception of <SYN> segments, MUST be right-shifted by
844 	 * Rcv.Wind.Shift bits:
845 	 */
846 	tcp_v4_send_ack(sk, skb, seq,
847 			tcp_rsk(req)->rcv_nxt,
848 			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
849 			tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
850 			req->ts_recent,
851 			0,
852 			tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->saddr,
853 					  AF_INET),
854 			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
855 			ip_hdr(skb)->tos);
856 }
857 
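/* Editor's note: a worked example of the RFC 7323 shift used above. With an
 * illustrative receive window of 262144 bytes and Rcv.Wind.Shift of 7, the
 * 16-bit window field carries 262144 >> 7 = 2048, and the peer scales it
 * back up by the same shift when processing the segment.
 */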
858 /*
859  *	Send a SYN-ACK after having received a SYN.
860  *	This still operates on a request_sock only, not on a big
861  *	socket.
862  */
863 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
864 			      struct flowi *fl,
865 			      struct request_sock *req,
866 			      struct tcp_fastopen_cookie *foc,
867 			      enum tcp_synack_type synack_type)
868 {
869 	const struct inet_request_sock *ireq = inet_rsk(req);
870 	struct flowi4 fl4;
871 	int err = -1;
872 	struct sk_buff *skb;
873 
874 	/* First, grab a route. */
875 	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
876 		return -1;
877 
878 	skb = tcp_make_synack(sk, dst, req, foc, synack_type);
879 
880 	if (skb) {
881 		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
882 
883 		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
884 					    ireq->ir_rmt_addr,
885 					    ireq_opt_deref(ireq));
886 		err = net_xmit_eval(err);
887 	}
888 
889 	return err;
890 }
891 
892 /*
893  *	IPv4 request_sock destructor.
894  */
895 static void tcp_v4_reqsk_destructor(struct request_sock *req)
896 {
897 	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
898 }
899 
900 #ifdef CONFIG_TCP_MD5SIG
901 /*
902  * RFC2385 MD5 checksumming requires a mapping of
903  * IP address->MD5 Key.
904  * We need to maintain these in the sk structure.
905  */
906 
907 /* Find the Key structure for an address.  */
908 struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
909 					 const union tcp_md5_addr *addr,
910 					 int family)
911 {
912 	const struct tcp_sock *tp = tcp_sk(sk);
913 	struct tcp_md5sig_key *key;
914 	const struct tcp_md5sig_info *md5sig;
915 	__be32 mask;
916 	struct tcp_md5sig_key *best_match = NULL;
917 	bool match;
918 
919 	/* caller either holds rcu_read_lock() or socket lock */
920 	md5sig = rcu_dereference_check(tp->md5sig_info,
921 				       lockdep_sock_is_held(sk));
922 	if (!md5sig)
923 		return NULL;
924 
925 	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
926 		if (key->family != family)
927 			continue;
928 
929 		if (family == AF_INET) {
930 			mask = inet_make_mask(key->prefixlen);
931 			match = (key->addr.a4.s_addr & mask) ==
932 				(addr->a4.s_addr & mask);
933 #if IS_ENABLED(CONFIG_IPV6)
934 		} else if (family == AF_INET6) {
935 			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
936 						  key->prefixlen);
937 #endif
938 		} else {
939 			match = false;
940 		}
941 
942 		if (match && (!best_match ||
943 			      key->prefixlen > best_match->prefixlen))
944 			best_match = key;
945 	}
946 	return best_match;
947 }
948 EXPORT_SYMBOL(tcp_md5_do_lookup);
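/* Editor's note: the lookup above is a longest-prefix match. As an
 * illustrative example, with keys configured for 10.0.0.0/8 and 10.1.0.0/16,
 * a peer address of 10.1.2.3 matches both entries
 * (10.1.2.3 & 255.0.0.0 == 10.0.0.0 and 10.1.2.3 & 255.255.0.0 == 10.1.0.0),
 * and best_match ends up pointing at the /16 key because its prefixlen is
 * larger.
 */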
949 
950 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
951 						      const union tcp_md5_addr *addr,
952 						      int family, u8 prefixlen)
953 {
954 	const struct tcp_sock *tp = tcp_sk(sk);
955 	struct tcp_md5sig_key *key;
956 	unsigned int size = sizeof(struct in_addr);
957 	const struct tcp_md5sig_info *md5sig;
958 
959 	/* caller either holds rcu_read_lock() or socket lock */
960 	md5sig = rcu_dereference_check(tp->md5sig_info,
961 				       lockdep_sock_is_held(sk));
962 	if (!md5sig)
963 		return NULL;
964 #if IS_ENABLED(CONFIG_IPV6)
965 	if (family == AF_INET6)
966 		size = sizeof(struct in6_addr);
967 #endif
968 	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
969 		if (key->family != family)
970 			continue;
971 		if (!memcmp(&key->addr, addr, size) &&
972 		    key->prefixlen == prefixlen)
973 			return key;
974 	}
975 	return NULL;
976 }
977 
978 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
979 					 const struct sock *addr_sk)
980 {
981 	const union tcp_md5_addr *addr;
982 
983 	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
984 	return tcp_md5_do_lookup(sk, addr, AF_INET);
985 }
986 EXPORT_SYMBOL(tcp_v4_md5_lookup);
987 
988 /* This can be called on a newly created socket, from other files */
989 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
990 		   int family, u8 prefixlen, const u8 *newkey, u8 newkeylen,
991 		   gfp_t gfp)
992 {
993 	/* Add Key to the list */
994 	struct tcp_md5sig_key *key;
995 	struct tcp_sock *tp = tcp_sk(sk);
996 	struct tcp_md5sig_info *md5sig;
997 
998 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
999 	if (key) {
1000 		/* Pre-existing entry - just update that one. */
1001 		memcpy(key->key, newkey, newkeylen);
1002 		key->keylen = newkeylen;
1003 		return 0;
1004 	}
1005 
1006 	md5sig = rcu_dereference_protected(tp->md5sig_info,
1007 					   lockdep_sock_is_held(sk));
1008 	if (!md5sig) {
1009 		md5sig = kmalloc(sizeof(*md5sig), gfp);
1010 		if (!md5sig)
1011 			return -ENOMEM;
1012 
1013 		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1014 		INIT_HLIST_HEAD(&md5sig->head);
1015 		rcu_assign_pointer(tp->md5sig_info, md5sig);
1016 	}
1017 
1018 	key = sock_kmalloc(sk, sizeof(*key), gfp);
1019 	if (!key)
1020 		return -ENOMEM;
1021 	if (!tcp_alloc_md5sig_pool()) {
1022 		sock_kfree_s(sk, key, sizeof(*key));
1023 		return -ENOMEM;
1024 	}
1025 
1026 	memcpy(key->key, newkey, newkeylen);
1027 	key->keylen = newkeylen;
1028 	key->family = family;
1029 	key->prefixlen = prefixlen;
1030 	memcpy(&key->addr, addr,
1031 	       (family == AF_INET6) ? sizeof(struct in6_addr) :
1032 				      sizeof(struct in_addr));
1033 	hlist_add_head_rcu(&key->node, &md5sig->head);
1034 	return 0;
1035 }
1036 EXPORT_SYMBOL(tcp_md5_do_add);
1037 
1038 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1039 		   u8 prefixlen)
1040 {
1041 	struct tcp_md5sig_key *key;
1042 
1043 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
1044 	if (!key)
1045 		return -ENOENT;
1046 	hlist_del_rcu(&key->node);
1047 	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1048 	kfree_rcu(key, rcu);
1049 	return 0;
1050 }
1051 EXPORT_SYMBOL(tcp_md5_do_del);
1052 
1053 static void tcp_clear_md5_list(struct sock *sk)
1054 {
1055 	struct tcp_sock *tp = tcp_sk(sk);
1056 	struct tcp_md5sig_key *key;
1057 	struct hlist_node *n;
1058 	struct tcp_md5sig_info *md5sig;
1059 
1060 	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1061 
1062 	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1063 		hlist_del_rcu(&key->node);
1064 		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1065 		kfree_rcu(key, rcu);
1066 	}
1067 }
1068 
1069 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1070 				 char __user *optval, int optlen)
1071 {
1072 	struct tcp_md5sig cmd;
1073 	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1074 	u8 prefixlen = 32;
1075 
1076 	if (optlen < sizeof(cmd))
1077 		return -EINVAL;
1078 
1079 	if (copy_from_user(&cmd, optval, sizeof(cmd)))
1080 		return -EFAULT;
1081 
1082 	if (sin->sin_family != AF_INET)
1083 		return -EINVAL;
1084 
1085 	if (optname == TCP_MD5SIG_EXT &&
1086 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1087 		prefixlen = cmd.tcpm_prefixlen;
1088 		if (prefixlen > 32)
1089 			return -EINVAL;
1090 	}
1091 
1092 	if (!cmd.tcpm_keylen)
1093 		return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1094 				      AF_INET, prefixlen);
1095 
1096 	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1097 		return -EINVAL;
1098 
1099 	return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1100 			      AF_INET, prefixlen, cmd.tcpm_key, cmd.tcpm_keylen,
1101 			      GFP_KERNEL);
1102 }
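/* Editor's note: a minimal userspace sketch (not part of this file) of the
 * TCP_MD5SIG setsockopt() parsed above. The peer address and key are
 * illustrative; struct tcp_md5sig comes from the UAPI <linux/tcp.h> if the
 * libc headers do not expose it.
 *
 *	static int set_md5_key_example(int fd, const struct sockaddr_in *peer)
 *	{
 *		struct tcp_md5sig md5 = {};
 *
 *		memcpy(&md5.tcpm_addr, peer, sizeof(*peer));
 *		md5.tcpm_keylen = 6;
 *		memcpy(md5.tcpm_key, "secret", 6);
 *		return setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG,
 *				  &md5, sizeof(md5));
 *	}
 */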
1103 
1104 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1105 				   __be32 daddr, __be32 saddr,
1106 				   const struct tcphdr *th, int nbytes)
1107 {
1108 	struct tcp4_pseudohdr *bp;
1109 	struct scatterlist sg;
1110 	struct tcphdr *_th;
1111 
1112 	bp = hp->scratch;
1113 	bp->saddr = saddr;
1114 	bp->daddr = daddr;
1115 	bp->pad = 0;
1116 	bp->protocol = IPPROTO_TCP;
1117 	bp->len = cpu_to_be16(nbytes);
1118 
1119 	_th = (struct tcphdr *)(bp + 1);
1120 	memcpy(_th, th, sizeof(*th));
1121 	_th->check = 0;
1122 
1123 	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1124 	ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1125 				sizeof(*bp) + sizeof(*th));
1126 	return crypto_ahash_update(hp->md5_req);
1127 }
1128 
1129 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1130 			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
1131 {
1132 	struct tcp_md5sig_pool *hp;
1133 	struct ahash_request *req;
1134 
1135 	hp = tcp_get_md5sig_pool();
1136 	if (!hp)
1137 		goto clear_hash_noput;
1138 	req = hp->md5_req;
1139 
1140 	if (crypto_ahash_init(req))
1141 		goto clear_hash;
1142 	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1143 		goto clear_hash;
1144 	if (tcp_md5_hash_key(hp, key))
1145 		goto clear_hash;
1146 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1147 	if (crypto_ahash_final(req))
1148 		goto clear_hash;
1149 
1150 	tcp_put_md5sig_pool();
1151 	return 0;
1152 
1153 clear_hash:
1154 	tcp_put_md5sig_pool();
1155 clear_hash_noput:
1156 	memset(md5_hash, 0, 16);
1157 	return 1;
1158 }
1159 
1160 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1161 			const struct sock *sk,
1162 			const struct sk_buff *skb)
1163 {
1164 	struct tcp_md5sig_pool *hp;
1165 	struct ahash_request *req;
1166 	const struct tcphdr *th = tcp_hdr(skb);
1167 	__be32 saddr, daddr;
1168 
1169 	if (sk) { /* valid for establish/request sockets */
1170 		saddr = sk->sk_rcv_saddr;
1171 		daddr = sk->sk_daddr;
1172 	} else {
1173 		const struct iphdr *iph = ip_hdr(skb);
1174 		saddr = iph->saddr;
1175 		daddr = iph->daddr;
1176 	}
1177 
1178 	hp = tcp_get_md5sig_pool();
1179 	if (!hp)
1180 		goto clear_hash_noput;
1181 	req = hp->md5_req;
1182 
1183 	if (crypto_ahash_init(req))
1184 		goto clear_hash;
1185 
1186 	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1187 		goto clear_hash;
1188 	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1189 		goto clear_hash;
1190 	if (tcp_md5_hash_key(hp, key))
1191 		goto clear_hash;
1192 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1193 	if (crypto_ahash_final(req))
1194 		goto clear_hash;
1195 
1196 	tcp_put_md5sig_pool();
1197 	return 0;
1198 
1199 clear_hash:
1200 	tcp_put_md5sig_pool();
1201 clear_hash_noput:
1202 	memset(md5_hash, 0, 16);
1203 	return 1;
1204 }
1205 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
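/* Editor's note: per RFC 2385, the digest computed above covers, in order,
 * the IPv4 pseudo-header (saddr, daddr, zero pad, protocol, segment length),
 * the fixed 20-byte TCP header with its checksum field zeroed (options are
 * not included), the TCP payload, and finally the key itself. A receiver
 * that feeds those four pieces to MD5 in the same order obtains the same
 * 16-byte hash.
 */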
1206 
1207 #endif
1208 
1209 /* Called with rcu_read_lock() */
1210 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1211 				    const struct sk_buff *skb)
1212 {
1213 #ifdef CONFIG_TCP_MD5SIG
1214 	/*
1215 	 * This gets called for each TCP segment that arrives,
1216 	 * so we want to be efficient.
1217 	 * We have 3 drop cases:
1218 	 * o No MD5 hash and one expected.
1219 	 * o MD5 hash and we're not expecting one.
1220 	 * o MD5 hash and it's wrong.
1221 	 */
1222 	const __u8 *hash_location = NULL;
1223 	struct tcp_md5sig_key *hash_expected;
1224 	const struct iphdr *iph = ip_hdr(skb);
1225 	const struct tcphdr *th = tcp_hdr(skb);
1226 	int genhash;
1227 	unsigned char newhash[16];
1228 
1229 	hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1230 					  AF_INET);
1231 	hash_location = tcp_parse_md5sig_option(th);
1232 
1233 	/* We've parsed the options - do we have a hash? */
1234 	if (!hash_expected && !hash_location)
1235 		return false;
1236 
1237 	if (hash_expected && !hash_location) {
1238 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1239 		return true;
1240 	}
1241 
1242 	if (!hash_expected && hash_location) {
1243 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1244 		return true;
1245 	}
1246 
1247 	/* Okay, so this is hash_expected and hash_location -
1248 	 * so we need to calculate the checksum.
1249 	 */
1250 	genhash = tcp_v4_md5_hash_skb(newhash,
1251 				      hash_expected,
1252 				      NULL, skb);
1253 
1254 	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1255 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1256 		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1257 				     &iph->saddr, ntohs(th->source),
1258 				     &iph->daddr, ntohs(th->dest),
1259 				     genhash ? " tcp_v4_calc_md5_hash failed"
1260 				     : "");
1261 		return true;
1262 	}
1263 	return false;
1264 #endif
1265 	return false;
1266 }
1267 
1268 static void tcp_v4_init_req(struct request_sock *req,
1269 			    const struct sock *sk_listener,
1270 			    struct sk_buff *skb)
1271 {
1272 	struct inet_request_sock *ireq = inet_rsk(req);
1273 	struct net *net = sock_net(sk_listener);
1274 
1275 	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1276 	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1277 	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1278 }
1279 
1280 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1281 					  struct flowi *fl,
1282 					  const struct request_sock *req)
1283 {
1284 	return inet_csk_route_req(sk, &fl->u.ip4, req);
1285 }
1286 
1287 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1288 	.family		=	PF_INET,
1289 	.obj_size	=	sizeof(struct tcp_request_sock),
1290 	.rtx_syn_ack	=	tcp_rtx_synack,
1291 	.send_ack	=	tcp_v4_reqsk_send_ack,
1292 	.destructor	=	tcp_v4_reqsk_destructor,
1293 	.send_reset	=	tcp_v4_send_reset,
1294 	.syn_ack_timeout =	tcp_syn_ack_timeout,
1295 };
1296 
1297 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1298 	.mss_clamp	=	TCP_MSS_DEFAULT,
1299 #ifdef CONFIG_TCP_MD5SIG
1300 	.req_md5_lookup	=	tcp_v4_md5_lookup,
1301 	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1302 #endif
1303 	.init_req	=	tcp_v4_init_req,
1304 #ifdef CONFIG_SYN_COOKIES
1305 	.cookie_init_seq =	cookie_v4_init_sequence,
1306 #endif
1307 	.route_req	=	tcp_v4_route_req,
1308 	.init_seq	=	tcp_v4_init_seq,
1309 	.init_ts_off	=	tcp_v4_init_ts_off,
1310 	.send_synack	=	tcp_v4_send_synack,
1311 };
1312 
1313 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1314 {
1315 	/* Never answer SYNs sent to broadcast or multicast. */
1316 	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1317 		goto drop;
1318 
1319 	return tcp_conn_request(&tcp_request_sock_ops,
1320 				&tcp_request_sock_ipv4_ops, sk, skb);
1321 
1322 drop:
1323 	tcp_listendrop(sk);
1324 	return 0;
1325 }
1326 EXPORT_SYMBOL(tcp_v4_conn_request);
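/* Editor's note: a minimal userspace sketch (not part of this file) of a
 * listener whose incoming SYNs are handled by tcp_v4_conn_request() above;
 * the port number is illustrative only.
 *
 *	static int listen_example(void)
 *	{
 *		struct sockaddr_in addr = {
 *			.sin_family = AF_INET,
 *			.sin_port = htons(8080),
 *			.sin_addr.s_addr = htonl(INADDR_ANY),
 *		};
 *		int fd = socket(AF_INET, SOCK_STREAM, 0);
 *
 *		bind(fd, (struct sockaddr *)&addr, sizeof(addr));
 *		listen(fd, 128);
 *		return accept(fd, NULL, NULL);
 *	}
 */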
1327 
1328 
1329 /*
1330  * The three way handshake has completed - we got a valid ACK -
1331  * now create the new socket.
1332  */
1333 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1334 				  struct request_sock *req,
1335 				  struct dst_entry *dst,
1336 				  struct request_sock *req_unhash,
1337 				  bool *own_req)
1338 {
1339 	struct inet_request_sock *ireq;
1340 	struct inet_sock *newinet;
1341 	struct tcp_sock *newtp;
1342 	struct sock *newsk;
1343 #ifdef CONFIG_TCP_MD5SIG
1344 	struct tcp_md5sig_key *key;
1345 #endif
1346 	struct ip_options_rcu *inet_opt;
1347 
1348 	if (sk_acceptq_is_full(sk))
1349 		goto exit_overflow;
1350 
1351 	newsk = tcp_create_openreq_child(sk, req, skb);
1352 	if (!newsk)
1353 		goto exit_nonewsk;
1354 
1355 	newsk->sk_gso_type = SKB_GSO_TCPV4;
1356 	inet_sk_rx_dst_set(newsk, skb);
1357 
1358 	newtp		      = tcp_sk(newsk);
1359 	newinet		      = inet_sk(newsk);
1360 	ireq		      = inet_rsk(req);
1361 	sk_daddr_set(newsk, ireq->ir_rmt_addr);
1362 	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1363 	newsk->sk_bound_dev_if = ireq->ir_iif;
1364 	newinet->inet_saddr   = ireq->ir_loc_addr;
1365 	inet_opt	      = rcu_dereference(ireq->ireq_opt);
1366 	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1367 	newinet->mc_index     = inet_iif(skb);
1368 	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1369 	newinet->rcv_tos      = ip_hdr(skb)->tos;
1370 	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1371 	if (inet_opt)
1372 		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1373 	newinet->inet_id = newtp->write_seq ^ jiffies;
1374 
1375 	if (!dst) {
1376 		dst = inet_csk_route_child_sock(sk, newsk, req);
1377 		if (!dst)
1378 			goto put_and_exit;
1379 	} else {
1380 		/* syncookie case : see end of cookie_v4_check() */
1381 	}
1382 	sk_setup_caps(newsk, dst);
1383 
1384 	tcp_ca_openreq_child(newsk, dst);
1385 
1386 	tcp_sync_mss(newsk, dst_mtu(dst));
1387 	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1388 
1389 	tcp_initialize_rcv_mss(newsk);
1390 
1391 #ifdef CONFIG_TCP_MD5SIG
1392 	/* Copy over the MD5 key from the original socket */
1393 	key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1394 				AF_INET);
1395 	if (key) {
1396 		/*
1397 		 * We're using one, so create a matching key
1398 		 * on the newsk structure. If we fail to get
1399 		 * memory, then we end up not copying the key
1400 		 * across. Shucks.
1401 		 */
1402 		tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1403 			       AF_INET, 32, key->key, key->keylen, GFP_ATOMIC);
1404 		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1405 	}
1406 #endif
1407 
1408 	if (__inet_inherit_port(sk, newsk) < 0)
1409 		goto put_and_exit;
1410 	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
1411 	if (likely(*own_req)) {
1412 		tcp_move_syn(newtp, req);
1413 		ireq->ireq_opt = NULL;
1414 	} else {
1415 		newinet->inet_opt = NULL;
1416 	}
1417 	return newsk;
1418 
1419 exit_overflow:
1420 	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1421 exit_nonewsk:
1422 	dst_release(dst);
1423 exit:
1424 	tcp_listendrop(sk);
1425 	return NULL;
1426 put_and_exit:
1427 	newinet->inet_opt = NULL;
1428 	inet_csk_prepare_forced_close(newsk);
1429 	tcp_done(newsk);
1430 	goto exit;
1431 }
1432 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1433 
1434 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1435 {
1436 #ifdef CONFIG_SYN_COOKIES
1437 	const struct tcphdr *th = tcp_hdr(skb);
1438 
1439 	if (!th->syn)
1440 		sk = cookie_v4_check(sk, skb);
1441 #endif
1442 	return sk;
1443 }
1444 
1445 /* The socket must have its spinlock held when we get
1446  * here, unless it is a TCP_LISTEN socket.
1447  *
1448  * We have a potential double-lock case here, so even when
1449  * doing backlog processing we use the BH locking scheme.
1450  * This is because we cannot sleep with the original spinlock
1451  * held.
1452  */
1453 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1454 {
1455 	struct sock *rsk;
1456 
1457 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1458 		struct dst_entry *dst = sk->sk_rx_dst;
1459 
1460 		sock_rps_save_rxhash(sk, skb);
1461 		sk_mark_napi_id(sk, skb);
1462 		if (dst) {
1463 			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1464 			    !dst->ops->check(dst, 0)) {
1465 				dst_release(dst);
1466 				sk->sk_rx_dst = NULL;
1467 			}
1468 		}
1469 		tcp_rcv_established(sk, skb, tcp_hdr(skb));
1470 		return 0;
1471 	}
1472 
1473 	if (tcp_checksum_complete(skb))
1474 		goto csum_err;
1475 
1476 	if (sk->sk_state == TCP_LISTEN) {
1477 		struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1478 
1479 		if (!nsk)
1480 			goto discard;
1481 		if (nsk != sk) {
1482 			if (tcp_child_process(sk, nsk, skb)) {
1483 				rsk = nsk;
1484 				goto reset;
1485 			}
1486 			return 0;
1487 		}
1488 	} else
1489 		sock_rps_save_rxhash(sk, skb);
1490 
1491 	if (tcp_rcv_state_process(sk, skb)) {
1492 		rsk = sk;
1493 		goto reset;
1494 	}
1495 	return 0;
1496 
1497 reset:
1498 	tcp_v4_send_reset(rsk, skb);
1499 discard:
1500 	kfree_skb(skb);
1501 	/* Be careful here. If this function gets more complicated and
1502 	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1503 	 * might be destroyed here. This current version compiles correctly,
1504 	 * but you have been warned.
1505 	 */
1506 	return 0;
1507 
1508 csum_err:
1509 	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1510 	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1511 	goto discard;
1512 }
1513 EXPORT_SYMBOL(tcp_v4_do_rcv);
1514 
1515 int tcp_v4_early_demux(struct sk_buff *skb)
1516 {
1517 	const struct iphdr *iph;
1518 	const struct tcphdr *th;
1519 	struct sock *sk;
1520 
1521 	if (skb->pkt_type != PACKET_HOST)
1522 		return 0;
1523 
1524 	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1525 		return 0;
1526 
1527 	iph = ip_hdr(skb);
1528 	th = tcp_hdr(skb);
1529 
1530 	if (th->doff < sizeof(struct tcphdr) / 4)
1531 		return 0;
1532 
1533 	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1534 				       iph->saddr, th->source,
1535 				       iph->daddr, ntohs(th->dest),
1536 				       skb->skb_iif, inet_sdif(skb));
1537 	if (sk) {
1538 		skb->sk = sk;
1539 		skb->destructor = sock_edemux;
1540 		if (sk_fullsock(sk)) {
1541 			struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1542 
1543 			if (dst)
1544 				dst = dst_check(dst, 0);
1545 			if (dst &&
1546 			    inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1547 				skb_dst_set_noref(skb, dst);
1548 		}
1549 	}
1550 	return 0;
1551 }
1552 
1553 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1554 {
1555 	u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;
1556 
1557 	/* Only the socket owner can try to collapse/prune rx queues
1558 	 * to reduce memory overhead, so add a little headroom here.
1559 	 * Only a few socket backlogs are likely to be non-empty at the same time.
1560 	 */
1561 	limit += 64*1024;
1562 
1563 	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1564 	 * we can fix skb->truesize to its real value to avoid future drops.
1565 	 * This is valid because skb is not yet charged to the socket.
1566 	 * It has been noticed that pure SACK packets were sometimes dropped
1567 	 * (if cooked by drivers without the copybreak feature).
1568 	 */
1569 	skb_condense(skb);
1570 
1571 	if (unlikely(sk_add_backlog(sk, skb, limit))) {
1572 		bh_unlock_sock(sk);
1573 		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1574 		return true;
1575 	}
1576 	return false;
1577 }
1578 EXPORT_SYMBOL(tcp_add_backlog);
1579 
1580 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1581 {
1582 	struct tcphdr *th = (struct tcphdr *)skb->data;
1583 	unsigned int eaten = skb->len;
1584 	int err;
1585 
1586 	err = sk_filter_trim_cap(sk, skb, th->doff * 4);
1587 	if (!err) {
1588 		eaten -= skb->len;
1589 		TCP_SKB_CB(skb)->end_seq -= eaten;
1590 	}
1591 	return err;
1592 }
1593 EXPORT_SYMBOL(tcp_filter);
1594 
1595 static void tcp_v4_restore_cb(struct sk_buff *skb)
1596 {
1597 	memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1598 		sizeof(struct inet_skb_parm));
1599 }
1600 
1601 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1602 			   const struct tcphdr *th)
1603 {
1604 	/* This is tricky: we move the IPCB to its correct location inside TCP_SKB_CB().
1605 	 * barrier() makes sure the compiler won't play aliasing games.
1606 	 */
1607 	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1608 		sizeof(struct inet_skb_parm));
1609 	barrier();
1610 
1611 	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1612 	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1613 				    skb->len - th->doff * 4);
1614 	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1615 	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1616 	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1617 	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1618 	TCP_SKB_CB(skb)->sacked	 = 0;
1619 	TCP_SKB_CB(skb)->has_rxtstamp =
1620 			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1621 }
1622 
1623 /*
1624  *	From tcp_input.c
1625  */
1626 
1627 int tcp_v4_rcv(struct sk_buff *skb)
1628 {
1629 	struct net *net = dev_net(skb->dev);
1630 	int sdif = inet_sdif(skb);
1631 	const struct iphdr *iph;
1632 	const struct tcphdr *th;
1633 	bool refcounted;
1634 	struct sock *sk;
1635 	int ret;
1636 
1637 	if (skb->pkt_type != PACKET_HOST)
1638 		goto discard_it;
1639 
1640 	/* Count it even if it's bad */
1641 	__TCP_INC_STATS(net, TCP_MIB_INSEGS);
1642 
1643 	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1644 		goto discard_it;
1645 
1646 	th = (const struct tcphdr *)skb->data;
1647 
1648 	if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1649 		goto bad_packet;
1650 	if (!pskb_may_pull(skb, th->doff * 4))
1651 		goto discard_it;
1652 
1653 	/* An explanation is required here, I think.
1654 	 * Packet length and doff are validated by header prediction,
1655 	 * provided the case of th->doff == 0 is eliminated.
1656 	 * So, we defer the checks. */
1657 
1658 	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1659 		goto csum_error;
1660 
1661 	th = (const struct tcphdr *)skb->data;
1662 	iph = ip_hdr(skb);
1663 lookup:
1664 	sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1665 			       th->dest, sdif, &refcounted);
1666 	if (!sk)
1667 		goto no_tcp_socket;
1668 
1669 process:
1670 	if (sk->sk_state == TCP_TIME_WAIT)
1671 		goto do_time_wait;
1672 
1673 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
1674 		struct request_sock *req = inet_reqsk(sk);
1675 		struct sock *nsk;
1676 
1677 		sk = req->rsk_listener;
1678 		if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
1679 			sk_drops_add(sk, skb);
1680 			reqsk_put(req);
1681 			goto discard_it;
1682 		}
1683 		if (unlikely(sk->sk_state != TCP_LISTEN)) {
1684 			inet_csk_reqsk_queue_drop_and_put(sk, req);
1685 			goto lookup;
1686 		}
1687 		/* We own a reference on the listener, increase it again
1688 		 * as we might lose it too soon.
1689 		 */
1690 		sock_hold(sk);
1691 		refcounted = true;
1692 		nsk = NULL;
1693 		if (!tcp_filter(sk, skb)) {
1694 			th = (const struct tcphdr *)skb->data;
1695 			iph = ip_hdr(skb);
1696 			tcp_v4_fill_cb(skb, iph, th);
1697 			nsk = tcp_check_req(sk, skb, req, false);
1698 		}
1699 		if (!nsk) {
1700 			reqsk_put(req);
1701 			goto discard_and_relse;
1702 		}
1703 		if (nsk == sk) {
1704 			reqsk_put(req);
1705 			tcp_v4_restore_cb(skb);
1706 		} else if (tcp_child_process(sk, nsk, skb)) {
1707 			tcp_v4_send_reset(nsk, skb);
1708 			goto discard_and_relse;
1709 		} else {
1710 			sock_put(sk);
1711 			return 0;
1712 		}
1713 	}
1714 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1715 		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
1716 		goto discard_and_relse;
1717 	}
1718 
1719 	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1720 		goto discard_and_relse;
1721 
1722 	if (tcp_v4_inbound_md5_hash(sk, skb))
1723 		goto discard_and_relse;
1724 
1725 	nf_reset(skb);
1726 
1727 	if (tcp_filter(sk, skb))
1728 		goto discard_and_relse;
1729 	th = (const struct tcphdr *)skb->data;
1730 	iph = ip_hdr(skb);
1731 	tcp_v4_fill_cb(skb, iph, th);
1732 
1733 	skb->dev = NULL;
1734 
1735 	if (sk->sk_state == TCP_LISTEN) {
1736 		ret = tcp_v4_do_rcv(sk, skb);
1737 		goto put_and_return;
1738 	}
1739 
1740 	sk_incoming_cpu_update(sk);
1741 
1742 	bh_lock_sock_nested(sk);
1743 	tcp_segs_in(tcp_sk(sk), skb);
1744 	ret = 0;
1745 	if (!sock_owned_by_user(sk)) {
1746 		ret = tcp_v4_do_rcv(sk, skb);
1747 	} else if (tcp_add_backlog(sk, skb)) {
1748 		goto discard_and_relse;
1749 	}
1750 	bh_unlock_sock(sk);
1751 
1752 put_and_return:
1753 	if (refcounted)
1754 		sock_put(sk);
1755 
1756 	return ret;
1757 
1758 no_tcp_socket:
1759 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1760 		goto discard_it;
1761 
1762 	tcp_v4_fill_cb(skb, iph, th);
1763 
1764 	if (tcp_checksum_complete(skb)) {
1765 csum_error:
1766 		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
1767 bad_packet:
1768 		__TCP_INC_STATS(net, TCP_MIB_INERRS);
1769 	} else {
1770 		tcp_v4_send_reset(NULL, skb);
1771 	}
1772 
1773 discard_it:
1774 	/* Discard frame. */
1775 	kfree_skb(skb);
1776 	return 0;
1777 
1778 discard_and_relse:
1779 	sk_drops_add(sk, skb);
1780 	if (refcounted)
1781 		sock_put(sk);
1782 	goto discard_it;
1783 
1784 do_time_wait:
1785 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1786 		inet_twsk_put(inet_twsk(sk));
1787 		goto discard_it;
1788 	}
1789 
1790 	tcp_v4_fill_cb(skb, iph, th);
1791 
1792 	if (tcp_checksum_complete(skb)) {
1793 		inet_twsk_put(inet_twsk(sk));
1794 		goto csum_error;
1795 	}
1796 	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1797 	case TCP_TW_SYN: {
1798 		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1799 							&tcp_hashinfo, skb,
1800 							__tcp_hdrlen(th),
1801 							iph->saddr, th->source,
1802 							iph->daddr, th->dest,
1803 							inet_iif(skb),
1804 							sdif);
1805 		if (sk2) {
1806 			inet_twsk_deschedule_put(inet_twsk(sk));
1807 			sk = sk2;
1808 			tcp_v4_restore_cb(skb);
1809 			refcounted = false;
1810 			goto process;
1811 		}
1812 	}
1813 		/* to ACK */
1814 		/* fall through */
1815 	case TCP_TW_ACK:
1816 		tcp_v4_timewait_ack(sk, skb);
1817 		break;
1818 	case TCP_TW_RST:
1819 		tcp_v4_send_reset(sk, skb);
1820 		inet_twsk_deschedule_put(inet_twsk(sk));
1821 		goto discard_it;
1822 	case TCP_TW_SUCCESS:;
1823 	}
1824 	goto discard_it;
1825 }
1826 
1827 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1828 	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
1829 	.twsk_unique	= tcp_twsk_unique,
1830 	.twsk_destructor= tcp_twsk_destructor,
1831 };
1832 
1833 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
1834 {
1835 	struct dst_entry *dst = skb_dst(skb);
1836 
1837 	if (dst && dst_hold_safe(dst)) {
1838 		sk->sk_rx_dst = dst;
1839 		inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
1840 	}
1841 }
1842 EXPORT_SYMBOL(inet_sk_rx_dst_set);
1843 
1844 const struct inet_connection_sock_af_ops ipv4_specific = {
1845 	.queue_xmit	   = ip_queue_xmit,
1846 	.send_check	   = tcp_v4_send_check,
1847 	.rebuild_header	   = inet_sk_rebuild_header,
1848 	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
1849 	.conn_request	   = tcp_v4_conn_request,
1850 	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
1851 	.net_header_len	   = sizeof(struct iphdr),
1852 	.setsockopt	   = ip_setsockopt,
1853 	.getsockopt	   = ip_getsockopt,
1854 	.addr2sockaddr	   = inet_csk_addr2sockaddr,
1855 	.sockaddr_len	   = sizeof(struct sockaddr_in),
1856 #ifdef CONFIG_COMPAT
1857 	.compat_setsockopt = compat_ip_setsockopt,
1858 	.compat_getsockopt = compat_ip_getsockopt,
1859 #endif
1860 	.mtu_reduced	   = tcp_v4_mtu_reduced,
1861 };
1862 EXPORT_SYMBOL(ipv4_specific);
1863 
1864 #ifdef CONFIG_TCP_MD5SIG
1865 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1866 	.md5_lookup		= tcp_v4_md5_lookup,
1867 	.calc_md5_hash		= tcp_v4_md5_hash_skb,
1868 	.md5_parse		= tcp_v4_parse_md5_keys,
1869 };
1870 #endif
1871 
1872 /* NOTE: A lot of things are set to zero explicitly by the call to
1873  *       sk_alloc(), so they need not be done here.
1874  */
1875 static int tcp_v4_init_sock(struct sock *sk)
1876 {
1877 	struct inet_connection_sock *icsk = inet_csk(sk);
1878 
1879 	tcp_init_sock(sk);
1880 
1881 	icsk->icsk_af_ops = &ipv4_specific;
1882 
1883 #ifdef CONFIG_TCP_MD5SIG
1884 	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
1885 #endif
1886 
1887 	return 0;
1888 }
1889 
1890 void tcp_v4_destroy_sock(struct sock *sk)
1891 {
1892 	struct tcp_sock *tp = tcp_sk(sk);
1893 
1894 	trace_tcp_destroy_sock(sk);
1895 
1896 	tcp_clear_xmit_timers(sk);
1897 
1898 	tcp_cleanup_congestion_control(sk);
1899 
1900 	tcp_cleanup_ulp(sk);
1901 
1902 	/* Clean up the write buffer. */
1903 	tcp_write_queue_purge(sk);
1904 
1905 	/* Check if we want to disable active TFO */
1906 	tcp_fastopen_active_disable_ofo_check(sk);
1907 
1908 	/* Cleans up our, hopefully empty, out_of_order_queue. */
1909 	skb_rbtree_purge(&tp->out_of_order_queue);
1910 
1911 #ifdef CONFIG_TCP_MD5SIG
1912 	/* Clean up the MD5 key list, if any */
1913 	if (tp->md5sig_info) {
1914 		tcp_clear_md5_list(sk);
1915 		kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
1916 		tp->md5sig_info = NULL;
1917 	}
1918 #endif
1919 
1920 	/* Clean up a referenced TCP bind bucket. */
1921 	if (inet_csk(sk)->icsk_bind_hash)
1922 		inet_put_port(sk);
1923 
1924 	BUG_ON(tp->fastopen_rsk);
1925 
1926 	/* If socket is aborted during connect operation */
1927 	tcp_free_fastopen_req(tp);
1928 	tcp_fastopen_destroy_cipher(sk);
1929 	tcp_saved_syn_free(tp);
1930 
1931 	sk_sockets_allocated_dec(sk);
1932 }
1933 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1934 
1935 #ifdef CONFIG_PROC_FS
1936 /* Proc filesystem TCP sock list dumping. */
1937 
1938 /*
1939  * Get the next listener socket following cur.  If cur is NULL, get the first
1940  * socket starting from the bucket given in st->bucket; when st->bucket is
1941  * zero, the very first socket in the hash table is returned.
1942  */
1943 static void *listening_get_next(struct seq_file *seq, void *cur)
1944 {
1945 	struct tcp_iter_state *st = seq->private;
1946 	struct net *net = seq_file_net(seq);
1947 	struct inet_listen_hashbucket *ilb;
1948 	struct sock *sk = cur;
1949 
1950 	if (!sk) {
1951 get_head:
1952 		ilb = &tcp_hashinfo.listening_hash[st->bucket];
1953 		spin_lock(&ilb->lock);
1954 		sk = sk_head(&ilb->head);
1955 		st->offset = 0;
1956 		goto get_sk;
1957 	}
1958 	ilb = &tcp_hashinfo.listening_hash[st->bucket];
1959 	++st->num;
1960 	++st->offset;
1961 
1962 	sk = sk_next(sk);
1963 get_sk:
1964 	sk_for_each_from(sk) {
1965 		if (!net_eq(sock_net(sk), net))
1966 			continue;
1967 		if (sk->sk_family == st->family)
1968 			return sk;
1969 	}
1970 	spin_unlock(&ilb->lock);
1971 	st->offset = 0;
1972 	if (++st->bucket < INET_LHTABLE_SIZE)
1973 		goto get_head;
1974 	return NULL;
1975 }
1976 
1977 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1978 {
1979 	struct tcp_iter_state *st = seq->private;
1980 	void *rc;
1981 
1982 	st->bucket = 0;
1983 	st->offset = 0;
1984 	rc = listening_get_next(seq, NULL);
1985 
1986 	while (rc && *pos) {
1987 		rc = listening_get_next(seq, rc);
1988 		--*pos;
1989 	}
1990 	return rc;
1991 }
1992 
1993 static inline bool empty_bucket(const struct tcp_iter_state *st)
1994 {
1995 	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
1996 }
1997 
1998 /*
1999  * Get first established socket starting from bucket given in st->bucket.
2000  * If st->bucket is zero, the very first socket in the hash is returned.
2001  */
2002 static void *established_get_first(struct seq_file *seq)
2003 {
2004 	struct tcp_iter_state *st = seq->private;
2005 	struct net *net = seq_file_net(seq);
2006 	void *rc = NULL;
2007 
2008 	st->offset = 0;
2009 	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2010 		struct sock *sk;
2011 		struct hlist_nulls_node *node;
2012 		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2013 
2014 		/* Lockless fast path for the common case of empty buckets */
2015 		if (empty_bucket(st))
2016 			continue;
2017 
2018 		spin_lock_bh(lock);
2019 		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2020 			if (sk->sk_family != st->family ||
2021 			    !net_eq(sock_net(sk), net)) {
2022 				continue;
2023 			}
2024 			rc = sk;
2025 			goto out;
2026 		}
2027 		spin_unlock_bh(lock);
2028 	}
2029 out:
2030 	return rc;
2031 }
2032 
2033 static void *established_get_next(struct seq_file *seq, void *cur)
2034 {
2035 	struct sock *sk = cur;
2036 	struct hlist_nulls_node *node;
2037 	struct tcp_iter_state *st = seq->private;
2038 	struct net *net = seq_file_net(seq);
2039 
2040 	++st->num;
2041 	++st->offset;
2042 
2043 	sk = sk_nulls_next(sk);
2044 
2045 	sk_nulls_for_each_from(sk, node) {
2046 		if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2047 			return sk;
2048 	}
2049 
2050 	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2051 	++st->bucket;
2052 	return established_get_first(seq);
2053 }
2054 
2055 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2056 {
2057 	struct tcp_iter_state *st = seq->private;
2058 	void *rc;
2059 
2060 	st->bucket = 0;
2061 	rc = established_get_first(seq);
2062 
2063 	while (rc && pos) {
2064 		rc = established_get_next(seq, rc);
2065 		--pos;
2066 	}
2067 	return rc;
2068 }
2069 
2070 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2071 {
2072 	void *rc;
2073 	struct tcp_iter_state *st = seq->private;
2074 
2075 	st->state = TCP_SEQ_STATE_LISTENING;
2076 	rc	  = listening_get_idx(seq, &pos);
2077 
2078 	if (!rc) {
2079 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2080 		rc	  = established_get_idx(seq, pos);
2081 	}
2082 
2083 	return rc;
2084 }
2085 
2086 static void *tcp_seek_last_pos(struct seq_file *seq)
2087 {
2088 	struct tcp_iter_state *st = seq->private;
2089 	int offset = st->offset;
2090 	int orig_num = st->num;
2091 	void *rc = NULL;
2092 
2093 	switch (st->state) {
2094 	case TCP_SEQ_STATE_LISTENING:
2095 		if (st->bucket >= INET_LHTABLE_SIZE)
2096 			break;
2097 		st->state = TCP_SEQ_STATE_LISTENING;
2098 		rc = listening_get_next(seq, NULL);
2099 		while (offset-- && rc)
2100 			rc = listening_get_next(seq, rc);
2101 		if (rc)
2102 			break;
2103 		st->bucket = 0;
2104 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2105 		/* Fallthrough */
2106 	case TCP_SEQ_STATE_ESTABLISHED:
2107 		if (st->bucket > tcp_hashinfo.ehash_mask)
2108 			break;
2109 		rc = established_get_first(seq);
2110 		while (offset-- && rc)
2111 			rc = established_get_next(seq, rc);
2112 	}
2113 
2114 	st->num = orig_num;
2115 
2116 	return rc;
2117 }
2118 
2119 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2120 {
2121 	struct tcp_iter_state *st = seq->private;
2122 	void *rc;
2123 
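	/* Resume from the bucket/offset saved by the previous pass instead
	 * of rescanning the hash tables from the beginning.
	 */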
2124 	if (*pos && *pos == st->last_pos) {
2125 		rc = tcp_seek_last_pos(seq);
2126 		if (rc)
2127 			goto out;
2128 	}
2129 
2130 	st->state = TCP_SEQ_STATE_LISTENING;
2131 	st->num = 0;
2132 	st->bucket = 0;
2133 	st->offset = 0;
2134 	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2135 
2136 out:
2137 	st->last_pos = *pos;
2138 	return rc;
2139 }
2140 
2141 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2142 {
2143 	struct tcp_iter_state *st = seq->private;
2144 	void *rc = NULL;
2145 
2146 	if (v == SEQ_START_TOKEN) {
2147 		rc = tcp_get_idx(seq, 0);
2148 		goto out;
2149 	}
2150 
2151 	switch (st->state) {
2152 	case TCP_SEQ_STATE_LISTENING:
2153 		rc = listening_get_next(seq, v);
2154 		if (!rc) {
2155 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2156 			st->bucket = 0;
2157 			st->offset = 0;
2158 			rc	  = established_get_first(seq);
2159 		}
2160 		break;
2161 	case TCP_SEQ_STATE_ESTABLISHED:
2162 		rc = established_get_next(seq, v);
2163 		break;
2164 	}
2165 out:
2166 	++*pos;
2167 	st->last_pos = *pos;
2168 	return rc;
2169 }
2170 
2171 static void tcp_seq_stop(struct seq_file *seq, void *v)
2172 {
2173 	struct tcp_iter_state *st = seq->private;
2174 
2175 	switch (st->state) {
2176 	case TCP_SEQ_STATE_LISTENING:
2177 		if (v != SEQ_START_TOKEN)
2178 			spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
2179 		break;
2180 	case TCP_SEQ_STATE_ESTABLISHED:
2181 		if (v)
2182 			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2183 		break;
2184 	}
2185 }
2186 
2187 int tcp_seq_open(struct inode *inode, struct file *file)
2188 {
2189 	struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
2190 	struct tcp_iter_state *s;
2191 	int err;
2192 
2193 	err = seq_open_net(inode, file, &afinfo->seq_ops,
2194 			  sizeof(struct tcp_iter_state));
2195 	if (err < 0)
2196 		return err;
2197 
2198 	s = ((struct seq_file *)file->private_data)->private;
2199 	s->family		= afinfo->family;
2200 	s->last_pos		= 0;
2201 	return 0;
2202 }
2203 EXPORT_SYMBOL(tcp_seq_open);
2204 
2205 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2206 {
2207 	int rc = 0;
2208 	struct proc_dir_entry *p;
2209 
2210 	afinfo->seq_ops.start		= tcp_seq_start;
2211 	afinfo->seq_ops.next		= tcp_seq_next;
2212 	afinfo->seq_ops.stop		= tcp_seq_stop;
2213 
2214 	p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2215 			     afinfo->seq_fops, afinfo);
2216 	if (!p)
2217 		rc = -ENOMEM;
2218 	return rc;
2219 }
2220 EXPORT_SYMBOL(tcp_proc_register);
2221 
2222 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2223 {
2224 	remove_proc_entry(afinfo->name, net->proc_net);
2225 }
2226 EXPORT_SYMBOL(tcp_proc_unregister);
2227 
2228 static void get_openreq4(const struct request_sock *req,
2229 			 struct seq_file *f, int i)
2230 {
2231 	const struct inet_request_sock *ireq = inet_rsk(req);
2232 	long delta = req->rsk_timer.expires - jiffies;
2233 
2234 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2235 		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2236 		i,
2237 		ireq->ir_loc_addr,
2238 		ireq->ir_num,
2239 		ireq->ir_rmt_addr,
2240 		ntohs(ireq->ir_rmt_port),
2241 		TCP_SYN_RECV,
2242 		0, 0, /* could print option size, but that is af dependent. */
2243 		1,    /* timers active (only the expire timer) */
2244 		jiffies_delta_to_clock_t(delta),
2245 		req->num_timeout,
2246 		from_kuid_munged(seq_user_ns(f),
2247 				 sock_i_uid(req->rsk_listener)),
2248 		0,  /* non-standard timer */
2249 		0, /* open_requests have no inode */
2250 		0,
2251 		req);
2252 }
2253 
2254 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2255 {
2256 	int timer_active;
2257 	unsigned long timer_expires;
2258 	const struct tcp_sock *tp = tcp_sk(sk);
2259 	const struct inet_connection_sock *icsk = inet_csk(sk);
2260 	const struct inet_sock *inet = inet_sk(sk);
2261 	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2262 	__be32 dest = inet->inet_daddr;
2263 	__be32 src = inet->inet_rcv_saddr;
2264 	__u16 destp = ntohs(inet->inet_dport);
2265 	__u16 srcp = ntohs(inet->inet_sport);
2266 	int rx_queue;
2267 	int state;
2268 
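	/* timer_active encodes the /proc "tr" field: 1 retransmit/probe
	 * timer, 4 zero window probe, 2 keepalive timer, 0 none pending.
	 */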
2269 	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2270 	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2271 	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2272 		timer_active	= 1;
2273 		timer_expires	= icsk->icsk_timeout;
2274 	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2275 		timer_active	= 4;
2276 		timer_expires	= icsk->icsk_timeout;
2277 	} else if (timer_pending(&sk->sk_timer)) {
2278 		timer_active	= 2;
2279 		timer_expires	= sk->sk_timer.expires;
2280 	} else {
2281 		timer_active	= 0;
2282 		timer_expires = jiffies;
2283 	}
2284 
2285 	state = inet_sk_state_load(sk);
2286 	if (state == TCP_LISTEN)
2287 		rx_queue = sk->sk_ack_backlog;
2288 	else
2289 		/* Because we don't lock the socket,
2290 		 * we might find a transient negative value.
2291 		 */
2292 		rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2293 
2294 	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2295 			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2296 		i, src, srcp, dest, destp, state,
2297 		tp->write_seq - tp->snd_una,
2298 		rx_queue,
2299 		timer_active,
2300 		jiffies_delta_to_clock_t(timer_expires - jiffies),
2301 		icsk->icsk_retransmits,
2302 		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2303 		icsk->icsk_probes_out,
2304 		sock_i_ino(sk),
2305 		refcount_read(&sk->sk_refcnt), sk,
2306 		jiffies_to_clock_t(icsk->icsk_rto),
2307 		jiffies_to_clock_t(icsk->icsk_ack.ato),
2308 		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2309 		tp->snd_cwnd,
2310 		state == TCP_LISTEN ?
2311 		    fastopenq->max_qlen :
2312 		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2313 }
2314 
2315 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2316 			       struct seq_file *f, int i)
2317 {
2318 	long delta = tw->tw_timer.expires - jiffies;
2319 	__be32 dest, src;
2320 	__u16 destp, srcp;
2321 
2322 	dest  = tw->tw_daddr;
2323 	src   = tw->tw_rcv_saddr;
2324 	destp = ntohs(tw->tw_dport);
2325 	srcp  = ntohs(tw->tw_sport);
2326 
2327 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2328 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2329 		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2330 		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2331 		refcount_read(&tw->tw_refcnt), tw);
2332 }
2333 
2334 #define TMPSZ 150
2335 
2336 static int tcp4_seq_show(struct seq_file *seq, void *v)
2337 {
2338 	struct tcp_iter_state *st;
2339 	struct sock *sk = v;
2340 
2341 	seq_setwidth(seq, TMPSZ - 1);
2342 	if (v == SEQ_START_TOKEN) {
2343 		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2344 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2345 			   "inode");
2346 		goto out;
2347 	}
2348 	st = seq->private;
2349 
2350 	if (sk->sk_state == TCP_TIME_WAIT)
2351 		get_timewait4_sock(v, seq, st->num);
2352 	else if (sk->sk_state == TCP_NEW_SYN_RECV)
2353 		get_openreq4(v, seq, st->num);
2354 	else
2355 		get_tcp4_sock(v, seq, st->num);
2356 out:
2357 	seq_pad(seq, '\n');
2358 	return 0;
2359 }
2360 
2361 static const struct file_operations tcp_afinfo_seq_fops = {
2362 	.open    = tcp_seq_open,
2363 	.read    = seq_read,
2364 	.llseek  = seq_lseek,
2365 	.release = seq_release_net
2366 };
2367 
2368 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2369 	.name		= "tcp",
2370 	.family		= AF_INET,
2371 	.seq_fops	= &tcp_afinfo_seq_fops,
2372 	.seq_ops	= {
2373 		.show		= tcp4_seq_show,
2374 	},
2375 };
2376 
2377 static int __net_init tcp4_proc_init_net(struct net *net)
2378 {
2379 	return tcp_proc_register(net, &tcp4_seq_afinfo);
2380 }
2381 
2382 static void __net_exit tcp4_proc_exit_net(struct net *net)
2383 {
2384 	tcp_proc_unregister(net, &tcp4_seq_afinfo);
2385 }
2386 
2387 static struct pernet_operations tcp4_net_ops = {
2388 	.init = tcp4_proc_init_net,
2389 	.exit = tcp4_proc_exit_net,
2390 };
2391 
2392 int __init tcp4_proc_init(void)
2393 {
2394 	return register_pernet_subsys(&tcp4_net_ops);
2395 }
2396 
2397 void tcp4_proc_exit(void)
2398 {
2399 	unregister_pernet_subsys(&tcp4_net_ops);
2400 }
2401 #endif /* CONFIG_PROC_FS */
2402 
2403 struct proto tcp_prot = {
2404 	.name			= "TCP",
2405 	.owner			= THIS_MODULE,
2406 	.close			= tcp_close,
2407 	.connect		= tcp_v4_connect,
2408 	.disconnect		= tcp_disconnect,
2409 	.accept			= inet_csk_accept,
2410 	.ioctl			= tcp_ioctl,
2411 	.init			= tcp_v4_init_sock,
2412 	.destroy		= tcp_v4_destroy_sock,
2413 	.shutdown		= tcp_shutdown,
2414 	.setsockopt		= tcp_setsockopt,
2415 	.getsockopt		= tcp_getsockopt,
2416 	.keepalive		= tcp_set_keepalive,
2417 	.recvmsg		= tcp_recvmsg,
2418 	.sendmsg		= tcp_sendmsg,
2419 	.sendpage		= tcp_sendpage,
2420 	.backlog_rcv		= tcp_v4_do_rcv,
2421 	.release_cb		= tcp_release_cb,
2422 	.hash			= inet_hash,
2423 	.unhash			= inet_unhash,
2424 	.get_port		= inet_csk_get_port,
2425 	.enter_memory_pressure	= tcp_enter_memory_pressure,
2426 	.leave_memory_pressure	= tcp_leave_memory_pressure,
2427 	.stream_memory_free	= tcp_stream_memory_free,
2428 	.sockets_allocated	= &tcp_sockets_allocated,
2429 	.orphan_count		= &tcp_orphan_count,
2430 	.memory_allocated	= &tcp_memory_allocated,
2431 	.memory_pressure	= &tcp_memory_pressure,
2432 	.sysctl_mem		= sysctl_tcp_mem,
2433 	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
2434 	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
2435 	.max_header		= MAX_TCP_HEADER,
2436 	.obj_size		= sizeof(struct tcp_sock),
2437 	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
2438 	.twsk_prot		= &tcp_timewait_sock_ops,
2439 	.rsk_prot		= &tcp_request_sock_ops,
2440 	.h.hashinfo		= &tcp_hashinfo,
2441 	.no_autobind		= true,
2442 #ifdef CONFIG_COMPAT
2443 	.compat_setsockopt	= compat_tcp_setsockopt,
2444 	.compat_getsockopt	= compat_tcp_getsockopt,
2445 #endif
2446 	.diag_destroy		= tcp_abort,
2447 };
2448 EXPORT_SYMBOL(tcp_prot);
2449 
2450 static void __net_exit tcp_sk_exit(struct net *net)
2451 {
2452 	int cpu;
2453 
2454 	module_put(net->ipv4.tcp_congestion_control->owner);
2455 
2456 	for_each_possible_cpu(cpu)
2457 		inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2458 	free_percpu(net->ipv4.tcp_sk);
2459 }
2460 
2461 static int __net_init tcp_sk_init(struct net *net)
2462 {
2463 	int res, cpu, cnt;
2464 
2465 	net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2466 	if (!net->ipv4.tcp_sk)
2467 		return -ENOMEM;
2468 
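	/* One control socket per possible CPU; they are used to send
	 * stack-generated segments such as RSTs and timewait ACKs.
	 */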
2469 	for_each_possible_cpu(cpu) {
2470 		struct sock *sk;
2471 
2472 		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2473 					   IPPROTO_TCP, net);
2474 		if (res)
2475 			goto fail;
2476 		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2477 		*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2478 	}
2479 
2480 	net->ipv4.sysctl_tcp_ecn = 2;
2481 	net->ipv4.sysctl_tcp_ecn_fallback = 1;
2482 
2483 	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2484 	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2485 	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2486 
2487 	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2488 	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2489 	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2490 
2491 	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2492 	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2493 	net->ipv4.sysctl_tcp_syncookies = 1;
2494 	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2495 	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2496 	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2497 	net->ipv4.sysctl_tcp_orphan_retries = 0;
2498 	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2499 	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
2500 	net->ipv4.sysctl_tcp_tw_reuse = 0;
2501 
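	/* Scale the TIME-WAIT bucket and SYN backlog defaults with the
	 * size of the established hash table.
	 */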
2502 	cnt = tcp_hashinfo.ehash_mask + 1;
2503 	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = (cnt + 1) / 2;
2504 	net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
2505 
2506 	net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 256);
2507 	net->ipv4.sysctl_tcp_sack = 1;
2508 	net->ipv4.sysctl_tcp_window_scaling = 1;
2509 	net->ipv4.sysctl_tcp_timestamps = 1;
2510 	net->ipv4.sysctl_tcp_early_retrans = 3;
2511 	net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
2512 	net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
2513 	net->ipv4.sysctl_tcp_retrans_collapse = 1;
2514 	net->ipv4.sysctl_tcp_max_reordering = 300;
2515 	net->ipv4.sysctl_tcp_dsack = 1;
2516 	net->ipv4.sysctl_tcp_app_win = 31;
2517 	net->ipv4.sysctl_tcp_adv_win_scale = 1;
2518 	net->ipv4.sysctl_tcp_frto = 2;
2519 	net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
2520 	/* This limits the percentage of the congestion window which we
2521 	 * will allow a single TSO frame to consume.  Building TSO frames
2522 	 * which are too large can cause TCP streams to be bursty.
2523 	 */
2524 	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
2525 	/* Default TSQ limit of four TSO segments */
2526 	net->ipv4.sysctl_tcp_limit_output_bytes = 262144;
2527 	/* rfc5961 challenge ack rate limiting */
2528 	net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
2529 	net->ipv4.sysctl_tcp_min_tso_segs = 2;
2530 	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
2531 	net->ipv4.sysctl_tcp_autocorking = 1;
2532 	net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
2533 	net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
2534 	net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
2535 	if (net != &init_net) {
2536 		memcpy(net->ipv4.sysctl_tcp_rmem,
2537 		       init_net.ipv4.sysctl_tcp_rmem,
2538 		       sizeof(init_net.ipv4.sysctl_tcp_rmem));
2539 		memcpy(net->ipv4.sysctl_tcp_wmem,
2540 		       init_net.ipv4.sysctl_tcp_wmem,
2541 		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
2542 	}
2543 	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
2544 	spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
2545 	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
2546 	atomic_set(&net->ipv4.tfo_active_disable_times, 0);
2547 
2548 	/* Reno is always built in */
2549 	if (!net_eq(net, &init_net) &&
2550 	    try_module_get(init_net.ipv4.tcp_congestion_control->owner))
2551 		net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
2552 	else
2553 		net->ipv4.tcp_congestion_control = &tcp_reno;
2554 
2555 	return 0;
2556 fail:
2557 	tcp_sk_exit(net);
2558 
2559 	return res;
2560 }
2561 
2562 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2563 {
2564 	struct net *net;
2565 
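	/* Purge TIME-WAIT sockets belonging to the exiting namespaces so
	 * their timers cannot fire once those namespaces are gone.
	 */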
2566 	inet_twsk_purge(&tcp_hashinfo, AF_INET);
2567 
2568 	list_for_each_entry(net, net_exit_list, exit_list)
2569 		tcp_fastopen_ctx_destroy(net);
2570 }
2571 
2572 static struct pernet_operations __net_initdata tcp_sk_ops = {
2573        .init	   = tcp_sk_init,
2574        .exit	   = tcp_sk_exit,
2575        .exit_batch = tcp_sk_exit_batch,
2576 };
2577 
2578 void __init tcp_v4_init(void)
2579 {
2580 	if (register_pernet_subsys(&tcp_sk_ops))
2581 		panic("Failed to create the TCP control socket.\n");
2582 }
2583