xref: /openbmc/linux/net/ipv4/tcp_ipv4.c (revision bd336e63)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		Implementation of the Transmission Control Protocol(TCP).
7  *
8  *		IPv4 specific functions
9  *
10  *
11  *		code split from:
12  *		linux/ipv4/tcp.c
13  *		linux/ipv4/tcp_input.c
14  *		linux/ipv4/tcp_output.c
15  *
16  *		See tcp.c for author information
17  *
18  *	This program is free software; you can redistribute it and/or
19  *      modify it under the terms of the GNU General Public License
20  *      as published by the Free Software Foundation; either version
21  *      2 of the License, or (at your option) any later version.
22  */
23 
24 /*
25  * Changes:
26  *		David S. Miller	:	New socket lookup architecture.
27  *					This code is dedicated to John Dyson.
28  *		David S. Miller :	Change semantics of established hash,
29  *					half is devoted to TIME_WAIT sockets
30  *					and the rest go in the other half.
31  *		Andi Kleen :		Add support for syncookies and fixed
32  *					some bugs: ip options weren't passed to
33  *					the TCP layer, missed a check for an
34  *					ACK bit.
35  *		Andi Kleen :		Implemented fast path mtu discovery.
36  *	     				Fixed many serious bugs in the
37  *					request_sock handling and moved
38  *					most of it into the af independent code.
39  *					Added tail drop and some other bugfixes.
40  *					Added new listen semantics.
41  *		Mike McLagan	:	Routing by source
42  *	Juan Jose Ciarlante:		ip_dynaddr bits
43  *		Andi Kleen:		various fixes.
44  *	Vitaly E. Lavrov	:	Transparent proxy revived after year
45  *					coma.
46  *	Andi Kleen		:	Fix new listen.
47  *	Andi Kleen		:	Fix accept error reporting.
48  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
49  *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
50  *					a single port at the same time.
51  */
52 
53 #define pr_fmt(fmt) "TCP: " fmt
54 
55 #include <linux/bottom_half.h>
56 #include <linux/types.h>
57 #include <linux/fcntl.h>
58 #include <linux/module.h>
59 #include <linux/random.h>
60 #include <linux/cache.h>
61 #include <linux/jhash.h>
62 #include <linux/init.h>
63 #include <linux/times.h>
64 #include <linux/slab.h>
65 
66 #include <net/net_namespace.h>
67 #include <net/icmp.h>
68 #include <net/inet_hashtables.h>
69 #include <net/tcp.h>
70 #include <net/transp_v6.h>
71 #include <net/ipv6.h>
72 #include <net/inet_common.h>
73 #include <net/timewait_sock.h>
74 #include <net/xfrm.h>
75 #include <net/secure_seq.h>
76 #include <net/busy_poll.h>
77 
78 #include <linux/inet.h>
79 #include <linux/ipv6.h>
80 #include <linux/stddef.h>
81 #include <linux/proc_fs.h>
82 #include <linux/seq_file.h>
83 
84 #include <crypto/hash.h>
85 #include <linux/scatterlist.h>
86 
87 int sysctl_tcp_tw_reuse __read_mostly;
88 int sysctl_tcp_low_latency __read_mostly;
89 
90 #ifdef CONFIG_TCP_MD5SIG
91 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
92 			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
93 #endif
94 
95 struct inet_hashinfo tcp_hashinfo;
96 EXPORT_SYMBOL(tcp_hashinfo);
97 
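/* Pick the initial sequence number for a connection from a keyed hash of the
 * address/port four-tuple of the segment (secure_tcp_sequence_number()).
 */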
98 static  __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
99 {
100 	return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
101 					  ip_hdr(skb)->saddr,
102 					  tcp_hdr(skb)->dest,
103 					  tcp_hdr(skb)->source);
104 }
105 
106 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
107 {
108 	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
109 	struct tcp_sock *tp = tcp_sk(sk);
110 
111 	/* With PAWS, it is safe from the viewpoint
112 	   of data integrity. Even without PAWS it is safe provided the sequence
113 	   spaces do not overlap, i.e. at data rates <= 80Mbit/sec.
114 
115 	   Actually, the idea is close to VJ's: only the timestamp cache is
116 	   held per port pair instead of per host, and the TW bucket is used
117 	   as the state holder.
118 
119 	   If the TW bucket has already been destroyed we fall back to VJ's
120 	   scheme and use the initial timestamp retrieved from the peer table.
121 	 */
122 	if (tcptw->tw_ts_recent_stamp &&
123 	    (!twp || (sysctl_tcp_tw_reuse &&
124 			     get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
125 		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
126 		if (tp->write_seq == 0)
127 			tp->write_seq = 1;
128 		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
129 		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
130 		sock_hold(sktw);
131 		return 1;
132 	}
133 
134 	return 0;
135 }
136 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
137 
138 /* This will initiate an outgoing connection. */
139 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
140 {
141 	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
142 	struct inet_sock *inet = inet_sk(sk);
143 	struct tcp_sock *tp = tcp_sk(sk);
144 	__be16 orig_sport, orig_dport;
145 	__be32 daddr, nexthop;
146 	struct flowi4 *fl4;
147 	struct rtable *rt;
148 	int err;
149 	struct ip_options_rcu *inet_opt;
150 
151 	if (addr_len < sizeof(struct sockaddr_in))
152 		return -EINVAL;
153 
154 	if (usin->sin_family != AF_INET)
155 		return -EAFNOSUPPORT;
156 
157 	nexthop = daddr = usin->sin_addr.s_addr;
158 	inet_opt = rcu_dereference_protected(inet->inet_opt,
159 					     lockdep_sock_is_held(sk));
160 	if (inet_opt && inet_opt->opt.srr) {
161 		if (!daddr)
162 			return -EINVAL;
163 		nexthop = inet_opt->opt.faddr;
164 	}
165 
166 	orig_sport = inet->inet_sport;
167 	orig_dport = usin->sin_port;
168 	fl4 = &inet->cork.fl.u.ip4;
169 	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
170 			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
171 			      IPPROTO_TCP,
172 			      orig_sport, orig_dport, sk);
173 	if (IS_ERR(rt)) {
174 		err = PTR_ERR(rt);
175 		if (err == -ENETUNREACH)
176 			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
177 		return err;
178 	}
179 
180 	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
181 		ip_rt_put(rt);
182 		return -ENETUNREACH;
183 	}
184 
185 	if (!inet_opt || !inet_opt->opt.srr)
186 		daddr = fl4->daddr;
187 
188 	if (!inet->inet_saddr)
189 		inet->inet_saddr = fl4->saddr;
190 	sk_rcv_saddr_set(sk, inet->inet_saddr);
191 
192 	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
193 		/* Reset inherited state */
194 		tp->rx_opt.ts_recent	   = 0;
195 		tp->rx_opt.ts_recent_stamp = 0;
196 		if (likely(!tp->repair))
197 			tp->write_seq	   = 0;
198 	}
199 
200 	if (tcp_death_row.sysctl_tw_recycle &&
201 	    !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
202 		tcp_fetch_timewait_stamp(sk, &rt->dst);
203 
204 	inet->inet_dport = usin->sin_port;
205 	sk_daddr_set(sk, daddr);
206 
207 	inet_csk(sk)->icsk_ext_hdr_len = 0;
208 	if (inet_opt)
209 		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
210 
211 	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
212 
213 	/* Socket identity is still unknown (sport may be zero).
214 	 * However we set the state to SYN-SENT and, without releasing the
215 	 * socket lock, select a source port, enter ourselves into the hash
216 	 * tables and complete the initialization after this.
217 	 */
218 	tcp_set_state(sk, TCP_SYN_SENT);
219 	err = inet_hash_connect(&tcp_death_row, sk);
220 	if (err)
221 		goto failure;
222 
223 	sk_set_txhash(sk);
224 
225 	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
226 			       inet->inet_sport, inet->inet_dport, sk);
227 	if (IS_ERR(rt)) {
228 		err = PTR_ERR(rt);
229 		rt = NULL;
230 		goto failure;
231 	}
232 	/* OK, now commit destination to socket.  */
233 	sk->sk_gso_type = SKB_GSO_TCPV4;
234 	sk_setup_caps(sk, &rt->dst);
235 
236 	if (!tp->write_seq && likely(!tp->repair))
237 		tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
238 							   inet->inet_daddr,
239 							   inet->inet_sport,
240 							   usin->sin_port);
241 
242 	inet->inet_id = tp->write_seq ^ jiffies;
243 
244 	err = tcp_connect(sk);
245 
246 	rt = NULL;
247 	if (err)
248 		goto failure;
249 
250 	return 0;
251 
252 failure:
253 	/*
254 	 * This unhashes the socket and releases the local port,
255 	 * if necessary.
256 	 */
257 	tcp_set_state(sk, TCP_CLOSE);
258 	ip_rt_put(rt);
259 	sk->sk_route_caps = 0;
260 	inet->inet_dport = 0;
261 	return err;
262 }
263 EXPORT_SYMBOL(tcp_v4_connect);
264 
265 /*
266  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
267  * It can be called through tcp_release_cb() if socket was owned by user
268  * at the time tcp_v4_err() was called to handle ICMP message.
269  */
270 void tcp_v4_mtu_reduced(struct sock *sk)
271 {
272 	struct dst_entry *dst;
273 	struct inet_sock *inet = inet_sk(sk);
274 	u32 mtu = tcp_sk(sk)->mtu_info;
275 
276 	dst = inet_csk_update_pmtu(sk, mtu);
277 	if (!dst)
278 		return;
279 
280 	/* Something is about to go wrong... Remember the soft error
281 	 * in case this connection is not able to recover.
282 	 */
283 	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
284 		sk->sk_err_soft = EMSGSIZE;
285 
286 	mtu = dst_mtu(dst);
287 
288 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
289 	    ip_sk_accept_pmtu(sk) &&
290 	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
291 		tcp_sync_mss(sk, mtu);
292 
293 		/* Resend the TCP packet because it's
294 		 * clear that the old packet has been
295 		 * dropped. This is the new "fast" path mtu
296 		 * discovery.
297 		 */
298 		tcp_simple_retransmit(sk);
299 	} /* else let the usual retransmit timer handle it */
300 }
301 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
302 
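/* Handle an ICMP redirect: if the socket still holds a valid cached route,
 * let that dst update its next hop.
 */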
303 static void do_redirect(struct sk_buff *skb, struct sock *sk)
304 {
305 	struct dst_entry *dst = __sk_dst_check(sk, 0);
306 
307 	if (dst)
308 		dst->ops->redirect(dst, sk, skb);
309 }
310 
311 
312 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
313 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
314 {
315 	struct request_sock *req = inet_reqsk(sk);
316 	struct net *net = sock_net(sk);
317 
318 	/* ICMPs are not backlogged, hence we cannot get
319 	 * an established socket here.
320 	 */
321 	if (seq != tcp_rsk(req)->snt_isn) {
322 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
323 	} else if (abort) {
324 		/*
325 		 * Still in SYN_RECV, just remove it silently.
326 		 * There is no good way to pass the error to the newly
327 		 * created socket, and POSIX does not want network
328 		 * errors returned from accept().
329 		 */
330 		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
331 		tcp_listendrop(req->rsk_listener);
332 	}
333 	reqsk_put(req);
334 }
335 EXPORT_SYMBOL(tcp_req_err);
336 
337 /*
338  * This routine is called by the ICMP module when it gets some
339  * sort of error condition.  If err < 0 then the socket should
340  * be closed and the error returned to the user.  If err > 0
341  * it's just the icmp type << 8 | icmp code.  After adjustment,
342  * the header points to the first 8 bytes of the tcp header.  We
343  * need to find the appropriate port.
344  *
345  * The locking strategy used here is very "optimistic". When
346  * someone else accesses the socket the ICMP is just dropped,
347  * and for some paths there is no check at all.
348  * A more general error queue, queueing errors for later handling,
349  * is probably better.
350  *
351  */
352 
353 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
354 {
355 	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
356 	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
357 	struct inet_connection_sock *icsk;
358 	struct tcp_sock *tp;
359 	struct inet_sock *inet;
360 	const int type = icmp_hdr(icmp_skb)->type;
361 	const int code = icmp_hdr(icmp_skb)->code;
362 	struct sock *sk;
363 	struct sk_buff *skb;
364 	struct request_sock *fastopen;
365 	__u32 seq, snd_una;
366 	__u32 remaining;
367 	int err;
368 	struct net *net = dev_net(icmp_skb->dev);
369 
370 	sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
371 				       th->dest, iph->saddr, ntohs(th->source),
372 				       inet_iif(icmp_skb));
373 	if (!sk) {
374 		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
375 		return;
376 	}
377 	if (sk->sk_state == TCP_TIME_WAIT) {
378 		inet_twsk_put(inet_twsk(sk));
379 		return;
380 	}
381 	seq = ntohl(th->seq);
382 	if (sk->sk_state == TCP_NEW_SYN_RECV)
383 		return tcp_req_err(sk, seq,
384 				  type == ICMP_PARAMETERPROB ||
385 				  type == ICMP_TIME_EXCEEDED ||
386 				  (type == ICMP_DEST_UNREACH &&
387 				   (code == ICMP_NET_UNREACH ||
388 				    code == ICMP_HOST_UNREACH)));
389 
390 	bh_lock_sock(sk);
391 	/* If too many ICMPs get dropped on busy
392 	 * servers this needs to be solved differently.
393 	 * We do take care of the PMTU discovery (RFC1191) special case:
394 	 * we can receive locally generated ICMP messages while socket is held.
395 	 */
396 	if (sock_owned_by_user(sk)) {
397 		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
398 			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
399 	}
400 	if (sk->sk_state == TCP_CLOSE)
401 		goto out;
402 
403 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
404 		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
405 		goto out;
406 	}
407 
408 	icsk = inet_csk(sk);
409 	tp = tcp_sk(sk);
410 	/* XXX (TFO) - tp->snd_una should be the ISN (tcp_create_openreq_child()) */
411 	fastopen = tp->fastopen_rsk;
412 	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
413 	if (sk->sk_state != TCP_LISTEN &&
414 	    !between(seq, snd_una, tp->snd_nxt)) {
415 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
416 		goto out;
417 	}
418 
419 	switch (type) {
420 	case ICMP_REDIRECT:
421 		do_redirect(icmp_skb, sk);
422 		goto out;
423 	case ICMP_SOURCE_QUENCH:
424 		/* Just silently ignore these. */
425 		goto out;
426 	case ICMP_PARAMETERPROB:
427 		err = EPROTO;
428 		break;
429 	case ICMP_DEST_UNREACH:
430 		if (code > NR_ICMP_UNREACH)
431 			goto out;
432 
433 		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
434 			/* We are not interested in TCP_LISTEN and open_requests
435 			 * (SYN-ACKs sent out by Linux are always < 576 bytes, so
436 			 * they should go through unfragmented).
437 			 */
438 			if (sk->sk_state == TCP_LISTEN)
439 				goto out;
440 
441 			tp->mtu_info = info;
442 			if (!sock_owned_by_user(sk)) {
443 				tcp_v4_mtu_reduced(sk);
444 			} else {
445 				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
446 					sock_hold(sk);
447 			}
448 			goto out;
449 		}
450 
451 		err = icmp_err_convert[code].errno;
452 		/* check whether this ICMP allows reverting the RTO backoff
453 		 * (see draft-zimmermann-tcp-lcd) */
454 		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
455 			break;
456 		if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
457 		    !icsk->icsk_backoff || fastopen)
458 			break;
459 
460 		if (sock_owned_by_user(sk))
461 			break;
462 
463 		icsk->icsk_backoff--;
464 		icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
465 					       TCP_TIMEOUT_INIT;
466 		icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
467 
468 		skb = tcp_write_queue_head(sk);
469 		BUG_ON(!skb);
470 
471 		remaining = icsk->icsk_rto -
472 			    min(icsk->icsk_rto,
473 				tcp_time_stamp - tcp_skb_timestamp(skb));
474 
475 		if (remaining) {
476 			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
477 						  remaining, TCP_RTO_MAX);
478 		} else {
479 			/* The RTO revert clocked out the retransmission.
480 			 * Will retransmit now. */
481 			tcp_retransmit_timer(sk);
482 		}
483 
484 		break;
485 	case ICMP_TIME_EXCEEDED:
486 		err = EHOSTUNREACH;
487 		break;
488 	default:
489 		goto out;
490 	}
491 
492 	switch (sk->sk_state) {
493 	case TCP_SYN_SENT:
494 	case TCP_SYN_RECV:
495 		/* Only in fast or simultaneous open. If a fast open socket is
496 		 * already accepted it is treated as a connected one below.
497 		 */
498 		if (fastopen && !fastopen->sk)
499 			break;
500 
501 		if (!sock_owned_by_user(sk)) {
502 			sk->sk_err = err;
503 
504 			sk->sk_error_report(sk);
505 
506 			tcp_done(sk);
507 		} else {
508 			sk->sk_err_soft = err;
509 		}
510 		goto out;
511 	}
512 
513 	/* If we've already connected we will keep trying
514 	 * until we time out, or the user gives up.
515 	 *
516 	 * rfc1122 4.2.3.9 allows only PROTO_UNREACH and PORT_UNREACH
517 	 * to be considered hard errors (well, FRAG_FAILED too,
518 	 * but it is obsoleted by pmtu discovery).
519 	 *
520 	 * Note that in the modern internet, where routing is unreliable
521 	 * and broken firewalls sit in every dark corner sending random
522 	 * errors ordered by their masters, even these two messages have lost
523 	 * their original sense (even Linux sends invalid PORT_UNREACHs).
524 	 *
525 	 * Now we are in compliance with RFCs.
526 	 *							--ANK (980905)
527 	 */
528 
529 	inet = inet_sk(sk);
530 	if (!sock_owned_by_user(sk) && inet->recverr) {
531 		sk->sk_err = err;
532 		sk->sk_error_report(sk);
533 	} else	{ /* Only an error on timeout */
534 		sk->sk_err_soft = err;
535 	}
536 
537 out:
538 	bh_unlock_sock(sk);
539 	sock_put(sk);
540 }
541 
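/* Fill in the TCP checksum of an outgoing segment. With CHECKSUM_PARTIAL only
 * the pseudo-header sum is stored and csum_start/csum_offset tell the device
 * (or a software fallback) where to complete it; otherwise the full checksum
 * is computed here using the data checksum accumulated in skb->csum.
 */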
542 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
543 {
544 	struct tcphdr *th = tcp_hdr(skb);
545 
546 	if (skb->ip_summed == CHECKSUM_PARTIAL) {
547 		th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
548 		skb->csum_start = skb_transport_header(skb) - skb->head;
549 		skb->csum_offset = offsetof(struct tcphdr, check);
550 	} else {
551 		th->check = tcp_v4_check(skb->len, saddr, daddr,
552 					 csum_partial(th,
553 						      th->doff << 2,
554 						      skb->csum));
555 	}
556 }
557 
558 /* This routine computes an IPv4 TCP checksum. */
559 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
560 {
561 	const struct inet_sock *inet = inet_sk(sk);
562 
563 	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
564 }
565 EXPORT_SYMBOL(tcp_v4_send_check);
566 
567 /*
568  *	This routine will send an RST to the other tcp.
569  *
570  *	Someone asks: why do we NEVER use socket parameters (TOS, TTL etc.)
571  *		      for the reset?
572  *	Answer: if a packet caused an RST, it is not for a socket
573  *		existing in our system; if it is matched to a socket,
574  *		it is just a duplicate segment or a bug in the other side's TCP.
575  *		So we build the reply based only on the parameters
576  *		that arrived with the segment.
577  *	Exception: precedence violation. We do not implement it in any case.
578  */
579 
580 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
581 {
582 	const struct tcphdr *th = tcp_hdr(skb);
583 	struct {
584 		struct tcphdr th;
585 #ifdef CONFIG_TCP_MD5SIG
586 		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
587 #endif
588 	} rep;
589 	struct ip_reply_arg arg;
590 #ifdef CONFIG_TCP_MD5SIG
591 	struct tcp_md5sig_key *key = NULL;
592 	const __u8 *hash_location = NULL;
593 	unsigned char newhash[16];
594 	int genhash;
595 	struct sock *sk1 = NULL;
596 #endif
597 	struct net *net;
598 
599 	/* Never send a reset in response to a reset. */
600 	if (th->rst)
601 		return;
602 
603 	/* If sk is not NULL, it means we did a successful lookup and the
604 	 * incoming route had to be correct. prequeue might have dropped our dst.
605 	 */
606 	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
607 		return;
608 
609 	/* Swap the send and the receive. */
610 	memset(&rep, 0, sizeof(rep));
611 	rep.th.dest   = th->source;
612 	rep.th.source = th->dest;
613 	rep.th.doff   = sizeof(struct tcphdr) / 4;
614 	rep.th.rst    = 1;
615 
616 	if (th->ack) {
617 		rep.th.seq = th->ack_seq;
618 	} else {
619 		rep.th.ack = 1;
620 		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
621 				       skb->len - (th->doff << 2));
622 	}
623 
624 	memset(&arg, 0, sizeof(arg));
625 	arg.iov[0].iov_base = (unsigned char *)&rep;
626 	arg.iov[0].iov_len  = sizeof(rep.th);
627 
628 	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
629 #ifdef CONFIG_TCP_MD5SIG
630 	rcu_read_lock();
631 	hash_location = tcp_parse_md5sig_option(th);
632 	if (sk && sk_fullsock(sk)) {
633 		key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
634 					&ip_hdr(skb)->saddr, AF_INET);
635 	} else if (hash_location) {
636 		/*
637 		 * The active side is lost. Try to find the listening socket
638 		 * through the source port, then find the md5 key through the
639 		 * listening socket. We do not lose security here:
640 		 * the incoming packet is checked against the md5 hash of the
641 		 * key we find, and no RST is generated if the hash doesn't match.
642 		 */
643 		sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
644 					     ip_hdr(skb)->saddr,
645 					     th->source, ip_hdr(skb)->daddr,
646 					     ntohs(th->source), inet_iif(skb));
647 		/* don't send rst if it can't find key */
648 		if (!sk1)
649 			goto out;
650 
651 		key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
652 					&ip_hdr(skb)->saddr, AF_INET);
653 		if (!key)
654 			goto out;
655 
656 
657 		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
658 		if (genhash || memcmp(hash_location, newhash, 16) != 0)
659 			goto out;
660 
661 	}
662 
663 	if (key) {
664 		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
665 				   (TCPOPT_NOP << 16) |
666 				   (TCPOPT_MD5SIG << 8) |
667 				   TCPOLEN_MD5SIG);
668 		/* Update length and the length the header thinks exists */
669 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
670 		rep.th.doff = arg.iov[0].iov_len / 4;
671 
672 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
673 				     key, ip_hdr(skb)->saddr,
674 				     ip_hdr(skb)->daddr, &rep.th);
675 	}
676 #endif
677 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
678 				      ip_hdr(skb)->saddr, /* XXX */
679 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
680 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
681 	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
682 
683 	/* When the socket is gone, all binding information is lost and
684 	 * routing might fail in this case. No choice here: if we chose to force
685 	 * the input interface, we would misroute in case of an asymmetric route.
686 	 */
687 	if (sk)
688 		arg.bound_dev_if = sk->sk_bound_dev_if;
689 
690 	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
691 		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));
692 
693 	arg.tos = ip_hdr(skb)->tos;
694 	local_bh_disable();
695 	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
696 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
697 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
698 			      &arg, arg.iov[0].iov_len);
699 
700 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
701 	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
702 	local_bh_enable();
703 
704 #ifdef CONFIG_TCP_MD5SIG
705 out:
706 	rcu_read_unlock();
707 #endif
708 }
709 
710 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
711    outside of socket context, is certainly ugly. What can I do?
712  */
713 
714 static void tcp_v4_send_ack(struct net *net,
715 			    struct sk_buff *skb, u32 seq, u32 ack,
716 			    u32 win, u32 tsval, u32 tsecr, int oif,
717 			    struct tcp_md5sig_key *key,
718 			    int reply_flags, u8 tos)
719 {
720 	const struct tcphdr *th = tcp_hdr(skb);
721 	struct {
722 		struct tcphdr th;
723 		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
724 #ifdef CONFIG_TCP_MD5SIG
725 			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
726 #endif
727 			];
728 	} rep;
729 	struct ip_reply_arg arg;
730 
731 	memset(&rep.th, 0, sizeof(struct tcphdr));
732 	memset(&arg, 0, sizeof(arg));
733 
734 	arg.iov[0].iov_base = (unsigned char *)&rep;
735 	arg.iov[0].iov_len  = sizeof(rep.th);
736 	if (tsecr) {
737 		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
738 				   (TCPOPT_TIMESTAMP << 8) |
739 				   TCPOLEN_TIMESTAMP);
740 		rep.opt[1] = htonl(tsval);
741 		rep.opt[2] = htonl(tsecr);
742 		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
743 	}
744 
745 	/* Swap the send and the receive. */
746 	rep.th.dest    = th->source;
747 	rep.th.source  = th->dest;
748 	rep.th.doff    = arg.iov[0].iov_len / 4;
749 	rep.th.seq     = htonl(seq);
750 	rep.th.ack_seq = htonl(ack);
751 	rep.th.ack     = 1;
752 	rep.th.window  = htons(win);
753 
754 #ifdef CONFIG_TCP_MD5SIG
755 	if (key) {
756 		int offset = (tsecr) ? 3 : 0;
757 
758 		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
759 					  (TCPOPT_NOP << 16) |
760 					  (TCPOPT_MD5SIG << 8) |
761 					  TCPOLEN_MD5SIG);
762 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
763 		rep.th.doff = arg.iov[0].iov_len/4;
764 
765 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
766 				    key, ip_hdr(skb)->saddr,
767 				    ip_hdr(skb)->daddr, &rep.th);
768 	}
769 #endif
770 	arg.flags = reply_flags;
771 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
772 				      ip_hdr(skb)->saddr, /* XXX */
773 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
774 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
775 	if (oif)
776 		arg.bound_dev_if = oif;
777 	arg.tos = tos;
778 	local_bh_disable();
779 	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
780 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
781 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
782 			      &arg, arg.iov[0].iov_len);
783 
784 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
785 	local_bh_enable();
786 }
787 
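/* Answer a segment that hit a TIME-WAIT socket with an ACK built from the
 * send/receive state (and MD5 key, if any) kept in the timewait bucket.
 */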
788 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
789 {
790 	struct inet_timewait_sock *tw = inet_twsk(sk);
791 	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
792 
793 	tcp_v4_send_ack(sock_net(sk), skb,
794 			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
795 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
796 			tcp_time_stamp + tcptw->tw_ts_offset,
797 			tcptw->tw_ts_recent,
798 			tw->tw_bound_dev_if,
799 			tcp_twsk_md5_key(tcptw),
800 			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
801 			tw->tw_tos
802 			);
803 
804 	inet_twsk_put(tw);
805 }
806 
807 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
808 				  struct request_sock *req)
809 {
810 	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
811 	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
812 	 */
813 	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
814 					     tcp_sk(sk)->snd_nxt;
815 
816 	/* RFC 7323 2.3
817 	 * The window field (SEG.WND) of every outgoing segment, with the
818 	 * exception of <SYN> segments, MUST be right-shifted by
819 	 * Rcv.Wind.Shift bits:
820 	 */
821 	tcp_v4_send_ack(sock_net(sk), skb, seq,
822 			tcp_rsk(req)->rcv_nxt,
823 			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
824 			tcp_time_stamp,
825 			req->ts_recent,
826 			0,
827 			tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
828 					  AF_INET),
829 			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
830 			ip_hdr(skb)->tos);
831 }
832 
833 /*
834  *	Send a SYN-ACK after having received a SYN.
835  *	This still operates on a request_sock only, not on a big
836  *	socket.
837  */
838 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
839 			      struct flowi *fl,
840 			      struct request_sock *req,
841 			      struct tcp_fastopen_cookie *foc,
842 			      enum tcp_synack_type synack_type)
843 {
844 	const struct inet_request_sock *ireq = inet_rsk(req);
845 	struct flowi4 fl4;
846 	int err = -1;
847 	struct sk_buff *skb;
848 
849 	/* First, grab a route. */
850 	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
851 		return -1;
852 
853 	skb = tcp_make_synack(sk, dst, req, foc, synack_type);
854 
855 	if (skb) {
856 		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
857 
858 		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
859 					    ireq->ir_rmt_addr,
860 					    ireq->opt);
861 		err = net_xmit_eval(err);
862 	}
863 
864 	return err;
865 }
866 
867 /*
868  *	IPv4 request_sock destructor.
869  */
870 static void tcp_v4_reqsk_destructor(struct request_sock *req)
871 {
872 	kfree(inet_rsk(req)->opt);
873 }
874 
875 #ifdef CONFIG_TCP_MD5SIG
876 /*
877  * RFC2385 MD5 checksumming requires a mapping of
878  * IP address->MD5 Key.
879  * We need to maintain these in the sk structure.
880  */
881 
882 /* Find the Key structure for an address.  */
883 struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
884 					 const union tcp_md5_addr *addr,
885 					 int family)
886 {
887 	const struct tcp_sock *tp = tcp_sk(sk);
888 	struct tcp_md5sig_key *key;
889 	unsigned int size = sizeof(struct in_addr);
890 	const struct tcp_md5sig_info *md5sig;
891 
892 	/* caller either holds rcu_read_lock() or socket lock */
893 	md5sig = rcu_dereference_check(tp->md5sig_info,
894 				       lockdep_sock_is_held(sk));
895 	if (!md5sig)
896 		return NULL;
897 #if IS_ENABLED(CONFIG_IPV6)
898 	if (family == AF_INET6)
899 		size = sizeof(struct in6_addr);
900 #endif
901 	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
902 		if (key->family != family)
903 			continue;
904 		if (!memcmp(&key->addr, addr, size))
905 			return key;
906 	}
907 	return NULL;
908 }
909 EXPORT_SYMBOL(tcp_md5_do_lookup);
910 
911 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
912 					 const struct sock *addr_sk)
913 {
914 	const union tcp_md5_addr *addr;
915 
916 	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
917 	return tcp_md5_do_lookup(sk, addr, AF_INET);
918 }
919 EXPORT_SYMBOL(tcp_v4_md5_lookup);
920 
921 /* This can be called on a newly created socket, from other files */
922 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
923 		   int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
924 {
925 	/* Add Key to the list */
926 	struct tcp_md5sig_key *key;
927 	struct tcp_sock *tp = tcp_sk(sk);
928 	struct tcp_md5sig_info *md5sig;
929 
930 	key = tcp_md5_do_lookup(sk, addr, family);
931 	if (key) {
932 		/* Pre-existing entry - just update that one. */
933 		memcpy(key->key, newkey, newkeylen);
934 		key->keylen = newkeylen;
935 		return 0;
936 	}
937 
938 	md5sig = rcu_dereference_protected(tp->md5sig_info,
939 					   lockdep_sock_is_held(sk));
940 	if (!md5sig) {
941 		md5sig = kmalloc(sizeof(*md5sig), gfp);
942 		if (!md5sig)
943 			return -ENOMEM;
944 
945 		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
946 		INIT_HLIST_HEAD(&md5sig->head);
947 		rcu_assign_pointer(tp->md5sig_info, md5sig);
948 	}
949 
950 	key = sock_kmalloc(sk, sizeof(*key), gfp);
951 	if (!key)
952 		return -ENOMEM;
953 	if (!tcp_alloc_md5sig_pool()) {
954 		sock_kfree_s(sk, key, sizeof(*key));
955 		return -ENOMEM;
956 	}
957 
958 	memcpy(key->key, newkey, newkeylen);
959 	key->keylen = newkeylen;
960 	key->family = family;
961 	memcpy(&key->addr, addr,
962 	       (family == AF_INET6) ? sizeof(struct in6_addr) :
963 				      sizeof(struct in_addr));
964 	hlist_add_head_rcu(&key->node, &md5sig->head);
965 	return 0;
966 }
967 EXPORT_SYMBOL(tcp_md5_do_add);
968 
969 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
970 {
971 	struct tcp_md5sig_key *key;
972 
973 	key = tcp_md5_do_lookup(sk, addr, family);
974 	if (!key)
975 		return -ENOENT;
976 	hlist_del_rcu(&key->node);
977 	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
978 	kfree_rcu(key, rcu);
979 	return 0;
980 }
981 EXPORT_SYMBOL(tcp_md5_do_del);
982 
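/* Release every MD5 key attached to the socket; called on socket destruction,
 * see tcp_v4_destroy_sock().
 */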
983 static void tcp_clear_md5_list(struct sock *sk)
984 {
985 	struct tcp_sock *tp = tcp_sk(sk);
986 	struct tcp_md5sig_key *key;
987 	struct hlist_node *n;
988 	struct tcp_md5sig_info *md5sig;
989 
990 	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
991 
992 	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
993 		hlist_del_rcu(&key->node);
994 		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
995 		kfree_rcu(key, rcu);
996 	}
997 }
998 
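/* TCP_MD5SIG setsockopt() handler: a zero key length deletes the key for the
 * given peer address, otherwise the key is added or replaced.
 */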
999 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
1000 				 int optlen)
1001 {
1002 	struct tcp_md5sig cmd;
1003 	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1004 
1005 	if (optlen < sizeof(cmd))
1006 		return -EINVAL;
1007 
1008 	if (copy_from_user(&cmd, optval, sizeof(cmd)))
1009 		return -EFAULT;
1010 
1011 	if (sin->sin_family != AF_INET)
1012 		return -EINVAL;
1013 
1014 	if (!cmd.tcpm_keylen)
1015 		return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1016 				      AF_INET);
1017 
1018 	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1019 		return -EINVAL;
1020 
1021 	return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1022 			      AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
1023 			      GFP_KERNEL);
1024 }
1025 
1026 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1027 				   __be32 daddr, __be32 saddr,
1028 				   const struct tcphdr *th, int nbytes)
1029 {
1030 	struct tcp4_pseudohdr *bp;
1031 	struct scatterlist sg;
1032 	struct tcphdr *_th;
1033 
1034 	bp = hp->scratch;
1035 	bp->saddr = saddr;
1036 	bp->daddr = daddr;
1037 	bp->pad = 0;
1038 	bp->protocol = IPPROTO_TCP;
1039 	bp->len = cpu_to_be16(nbytes);
1040 
1041 	_th = (struct tcphdr *)(bp + 1);
1042 	memcpy(_th, th, sizeof(*th));
1043 	_th->check = 0;
1044 
1045 	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1046 	ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1047 				sizeof(*bp) + sizeof(*th));
1048 	return crypto_ahash_update(hp->md5_req);
1049 }
1050 
1051 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1052 			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
1053 {
1054 	struct tcp_md5sig_pool *hp;
1055 	struct ahash_request *req;
1056 
1057 	hp = tcp_get_md5sig_pool();
1058 	if (!hp)
1059 		goto clear_hash_noput;
1060 	req = hp->md5_req;
1061 
1062 	if (crypto_ahash_init(req))
1063 		goto clear_hash;
1064 	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1065 		goto clear_hash;
1066 	if (tcp_md5_hash_key(hp, key))
1067 		goto clear_hash;
1068 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1069 	if (crypto_ahash_final(req))
1070 		goto clear_hash;
1071 
1072 	tcp_put_md5sig_pool();
1073 	return 0;
1074 
1075 clear_hash:
1076 	tcp_put_md5sig_pool();
1077 clear_hash_noput:
1078 	memset(md5_hash, 0, 16);
1079 	return 1;
1080 }
1081 
1082 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1083 			const struct sock *sk,
1084 			const struct sk_buff *skb)
1085 {
1086 	struct tcp_md5sig_pool *hp;
1087 	struct ahash_request *req;
1088 	const struct tcphdr *th = tcp_hdr(skb);
1089 	__be32 saddr, daddr;
1090 
1091 	if (sk) { /* valid for establish/request sockets */
1092 		saddr = sk->sk_rcv_saddr;
1093 		daddr = sk->sk_daddr;
1094 	} else {
1095 		const struct iphdr *iph = ip_hdr(skb);
1096 		saddr = iph->saddr;
1097 		daddr = iph->daddr;
1098 	}
1099 
1100 	hp = tcp_get_md5sig_pool();
1101 	if (!hp)
1102 		goto clear_hash_noput;
1103 	req = hp->md5_req;
1104 
1105 	if (crypto_ahash_init(req))
1106 		goto clear_hash;
1107 
1108 	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1109 		goto clear_hash;
1110 	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1111 		goto clear_hash;
1112 	if (tcp_md5_hash_key(hp, key))
1113 		goto clear_hash;
1114 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1115 	if (crypto_ahash_final(req))
1116 		goto clear_hash;
1117 
1118 	tcp_put_md5sig_pool();
1119 	return 0;
1120 
1121 clear_hash:
1122 	tcp_put_md5sig_pool();
1123 clear_hash_noput:
1124 	memset(md5_hash, 0, 16);
1125 	return 1;
1126 }
1127 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1128 
1129 #endif
1130 
1131 /* Called with rcu_read_lock() */
1132 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1133 				    const struct sk_buff *skb)
1134 {
1135 #ifdef CONFIG_TCP_MD5SIG
1136 	/*
1137 	 * This gets called for each TCP segment that arrives
1138 	 * so we want to be efficient.
1139 	 * We have 3 drop cases:
1140 	 * o No MD5 hash and one expected.
1141 	 * o MD5 hash and we're not expecting one.
1142 	 * o MD5 hash and it's wrong.
1143 	 */
1144 	const __u8 *hash_location = NULL;
1145 	struct tcp_md5sig_key *hash_expected;
1146 	const struct iphdr *iph = ip_hdr(skb);
1147 	const struct tcphdr *th = tcp_hdr(skb);
1148 	int genhash;
1149 	unsigned char newhash[16];
1150 
1151 	hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1152 					  AF_INET);
1153 	hash_location = tcp_parse_md5sig_option(th);
1154 
1155 	/* We've parsed the options - do we have a hash? */
1156 	if (!hash_expected && !hash_location)
1157 		return false;
1158 
1159 	if (hash_expected && !hash_location) {
1160 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1161 		return true;
1162 	}
1163 
1164 	if (!hash_expected && hash_location) {
1165 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1166 		return true;
1167 	}
1168 
1169 	/* Okay, so we have both hash_expected and hash_location,
1170 	 * so we need to calculate the hash.
1171 	 */
1172 	genhash = tcp_v4_md5_hash_skb(newhash,
1173 				      hash_expected,
1174 				      NULL, skb);
1175 
1176 	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1177 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1178 		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1179 				     &iph->saddr, ntohs(th->source),
1180 				     &iph->daddr, ntohs(th->dest),
1181 				     genhash ? " tcp_v4_calc_md5_hash failed"
1182 				     : "");
1183 		return true;
1184 	}
1185 	return false;
1186 #endif
1187 	return false;
1188 }
1189 
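/* IPv4-specific initialization of a request sock: record the addresses from
 * the incoming SYN (mirrored, as the reply goes back to the sender) and save
 * any IP options for use in the SYN-ACK.
 */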
1190 static void tcp_v4_init_req(struct request_sock *req,
1191 			    const struct sock *sk_listener,
1192 			    struct sk_buff *skb)
1193 {
1194 	struct inet_request_sock *ireq = inet_rsk(req);
1195 
1196 	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1197 	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1198 	ireq->opt = tcp_v4_save_options(skb);
1199 }
1200 
1201 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
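/* Route the reply for this request; when asked, report via *strict whether
 * the routed destination still equals the remote address recorded in the
 * request.
 */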
1202 					  struct flowi *fl,
1203 					  const struct request_sock *req,
1204 					  bool *strict)
1205 {
1206 	struct dst_entry *dst = inet_csk_route_req(sk, &fl->u.ip4, req);
1207 
1208 	if (strict) {
1209 		if (fl->u.ip4.daddr == inet_rsk(req)->ir_rmt_addr)
1210 			*strict = true;
1211 		else
1212 			*strict = false;
1213 	}
1214 
1215 	return dst;
1216 }
1217 
1218 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1219 	.family		=	PF_INET,
1220 	.obj_size	=	sizeof(struct tcp_request_sock),
1221 	.rtx_syn_ack	=	tcp_rtx_synack,
1222 	.send_ack	=	tcp_v4_reqsk_send_ack,
1223 	.destructor	=	tcp_v4_reqsk_destructor,
1224 	.send_reset	=	tcp_v4_send_reset,
1225 	.syn_ack_timeout =	tcp_syn_ack_timeout,
1226 };
1227 
1228 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1229 	.mss_clamp	=	TCP_MSS_DEFAULT,
1230 #ifdef CONFIG_TCP_MD5SIG
1231 	.req_md5_lookup	=	tcp_v4_md5_lookup,
1232 	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1233 #endif
1234 	.init_req	=	tcp_v4_init_req,
1235 #ifdef CONFIG_SYN_COOKIES
1236 	.cookie_init_seq =	cookie_v4_init_sequence,
1237 #endif
1238 	.route_req	=	tcp_v4_route_req,
1239 	.init_seq	=	tcp_v4_init_sequence,
1240 	.send_synack	=	tcp_v4_send_synack,
1241 };
1242 
1243 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1244 {
1245 	/* Never answer SYNs sent to broadcast or multicast */
1246 	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1247 		goto drop;
1248 
1249 	return tcp_conn_request(&tcp_request_sock_ops,
1250 				&tcp_request_sock_ipv4_ops, sk, skb);
1251 
1252 drop:
1253 	tcp_listendrop(sk);
1254 	return 0;
1255 }
1256 EXPORT_SYMBOL(tcp_v4_conn_request);
1257 
1258 
1259 /*
1260  * The three way handshake has completed - we got a valid ACK or DATA -
1261  * now create the new socket.
1262  */
1263 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1264 				  struct request_sock *req,
1265 				  struct dst_entry *dst,
1266 				  struct request_sock *req_unhash,
1267 				  bool *own_req)
1268 {
1269 	struct inet_request_sock *ireq;
1270 	struct inet_sock *newinet;
1271 	struct tcp_sock *newtp;
1272 	struct sock *newsk;
1273 #ifdef CONFIG_TCP_MD5SIG
1274 	struct tcp_md5sig_key *key;
1275 #endif
1276 	struct ip_options_rcu *inet_opt;
1277 
1278 	if (sk_acceptq_is_full(sk))
1279 		goto exit_overflow;
1280 
1281 	newsk = tcp_create_openreq_child(sk, req, skb);
1282 	if (!newsk)
1283 		goto exit_nonewsk;
1284 
1285 	newsk->sk_gso_type = SKB_GSO_TCPV4;
1286 	inet_sk_rx_dst_set(newsk, skb);
1287 
1288 	newtp		      = tcp_sk(newsk);
1289 	newinet		      = inet_sk(newsk);
1290 	ireq		      = inet_rsk(req);
1291 	sk_daddr_set(newsk, ireq->ir_rmt_addr);
1292 	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1293 	newsk->sk_bound_dev_if = ireq->ir_iif;
1294 	newinet->inet_saddr	      = ireq->ir_loc_addr;
1295 	inet_opt	      = ireq->opt;
1296 	rcu_assign_pointer(newinet->inet_opt, inet_opt);
1297 	ireq->opt	      = NULL;
1298 	newinet->mc_index     = inet_iif(skb);
1299 	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1300 	newinet->rcv_tos      = ip_hdr(skb)->tos;
1301 	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1302 	if (inet_opt)
1303 		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1304 	newinet->inet_id = newtp->write_seq ^ jiffies;
1305 
1306 	if (!dst) {
1307 		dst = inet_csk_route_child_sock(sk, newsk, req);
1308 		if (!dst)
1309 			goto put_and_exit;
1310 	} else {
1311 		/* syncookie case : see end of cookie_v4_check() */
1312 	}
1313 	sk_setup_caps(newsk, dst);
1314 
1315 	tcp_ca_openreq_child(newsk, dst);
1316 
1317 	tcp_sync_mss(newsk, dst_mtu(dst));
1318 	newtp->advmss = dst_metric_advmss(dst);
1319 	if (tcp_sk(sk)->rx_opt.user_mss &&
1320 	    tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1321 		newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1322 
1323 	tcp_initialize_rcv_mss(newsk);
1324 
1325 #ifdef CONFIG_TCP_MD5SIG
1326 	/* Copy over the MD5 key from the original socket */
1327 	key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1328 				AF_INET);
1329 	if (key) {
1330 		/*
1331 		 * We're using one, so create a matching key
1332 		 * on the newsk structure. If we fail to get
1333 		 * memory, then we end up not copying the key
1334 		 * across. Shucks.
1335 		 */
1336 		tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1337 			       AF_INET, key->key, key->keylen, GFP_ATOMIC);
1338 		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1339 	}
1340 #endif
1341 
1342 	if (__inet_inherit_port(sk, newsk) < 0)
1343 		goto put_and_exit;
1344 	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
1345 	if (*own_req)
1346 		tcp_move_syn(newtp, req);
1347 
1348 	return newsk;
1349 
1350 exit_overflow:
1351 	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1352 exit_nonewsk:
1353 	dst_release(dst);
1354 exit:
1355 	tcp_listendrop(sk);
1356 	return NULL;
1357 put_and_exit:
1358 	inet_csk_prepare_forced_close(newsk);
1359 	tcp_done(newsk);
1360 	goto exit;
1361 }
1362 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1363 
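/* A non-SYN segment reaching a listener may be the final ACK of a syncookie
 * handshake; try to rebuild the connection from the cookie. Without
 * CONFIG_SYN_COOKIES this is a no-op.
 */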
1364 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1365 {
1366 #ifdef CONFIG_SYN_COOKIES
1367 	const struct tcphdr *th = tcp_hdr(skb);
1368 
1369 	if (!th->syn)
1370 		sk = cookie_v4_check(sk, skb);
1371 #endif
1372 	return sk;
1373 }
1374 
1375 /* The socket must have its spinlock held when we get
1376  * here, unless it is a TCP_LISTEN socket.
1377  *
1378  * We have a potential double-lock case here, so even when
1379  * doing backlog processing we use the BH locking scheme.
1380  * This is because we cannot sleep with the original spinlock
1381  * held.
1382  */
1383 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1384 {
1385 	struct sock *rsk;
1386 
1387 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1388 		struct dst_entry *dst = sk->sk_rx_dst;
1389 
1390 		sock_rps_save_rxhash(sk, skb);
1391 		sk_mark_napi_id(sk, skb);
1392 		if (dst) {
1393 			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1394 			    !dst->ops->check(dst, 0)) {
1395 				dst_release(dst);
1396 				sk->sk_rx_dst = NULL;
1397 			}
1398 		}
1399 		tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len);
1400 		return 0;
1401 	}
1402 
1403 	if (tcp_checksum_complete(skb))
1404 		goto csum_err;
1405 
1406 	if (sk->sk_state == TCP_LISTEN) {
1407 		struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1408 
1409 		if (!nsk)
1410 			goto discard;
1411 		if (nsk != sk) {
1412 			sock_rps_save_rxhash(nsk, skb);
1413 			sk_mark_napi_id(nsk, skb);
1414 			if (tcp_child_process(sk, nsk, skb)) {
1415 				rsk = nsk;
1416 				goto reset;
1417 			}
1418 			return 0;
1419 		}
1420 	} else
1421 		sock_rps_save_rxhash(sk, skb);
1422 
1423 	if (tcp_rcv_state_process(sk, skb)) {
1424 		rsk = sk;
1425 		goto reset;
1426 	}
1427 	return 0;
1428 
1429 reset:
1430 	tcp_v4_send_reset(rsk, skb);
1431 discard:
1432 	kfree_skb(skb);
1433 	/* Be careful here. If this function gets more complicated and
1434 	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1435 	 * might be destroyed here. This current version compiles correctly,
1436 	 * but you have been warned.
1437 	 */
1438 	return 0;
1439 
1440 csum_err:
1441 	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1442 	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1443 	goto discard;
1444 }
1445 EXPORT_SYMBOL(tcp_v4_do_rcv);
1446 
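/* Early demux, called from the IP input path: look up an established socket
 * by its four-tuple and attach it to the skb, together with its cached input
 * route when that route still matches the incoming interface.
 */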
1447 void tcp_v4_early_demux(struct sk_buff *skb)
1448 {
1449 	const struct iphdr *iph;
1450 	const struct tcphdr *th;
1451 	struct sock *sk;
1452 
1453 	if (skb->pkt_type != PACKET_HOST)
1454 		return;
1455 
1456 	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1457 		return;
1458 
1459 	iph = ip_hdr(skb);
1460 	th = tcp_hdr(skb);
1461 
1462 	if (th->doff < sizeof(struct tcphdr) / 4)
1463 		return;
1464 
1465 	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1466 				       iph->saddr, th->source,
1467 				       iph->daddr, ntohs(th->dest),
1468 				       skb->skb_iif);
1469 	if (sk) {
1470 		skb->sk = sk;
1471 		skb->destructor = sock_edemux;
1472 		if (sk_fullsock(sk)) {
1473 			struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1474 
1475 			if (dst)
1476 				dst = dst_check(dst, 0);
1477 			if (dst &&
1478 			    inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1479 				skb_dst_set_noref(skb, dst);
1480 		}
1481 	}
1482 }
1483 
1484 /* Packet is added to VJ-style prequeue for processing in process
1485  * context, if a reader task is waiting. Apparently, this exciting
1486  * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93)
1487  * failed somewhere. Latency? Burstiness? Well, at least now we will
1488  * see, why it failed. 8)8)				  --ANK
1489  * see why it failed. 8)8)				  --ANK
1490  */
1491 bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
1492 {
1493 	struct tcp_sock *tp = tcp_sk(sk);
1494 
1495 	if (sysctl_tcp_low_latency || !tp->ucopy.task)
1496 		return false;
1497 
1498 	if (skb->len <= tcp_hdrlen(skb) &&
1499 	    skb_queue_len(&tp->ucopy.prequeue) == 0)
1500 		return false;
1501 
1502 	/* Before escaping the RCU protected region, we need to take care of the
1503 	 * skb dst. Prequeue is only enabled for established sockets.
1504 	 * For such sockets, we might need the skb dst only to set sk->sk_rx_dst.
1505 	 * Instead of doing a full sk_rx_dst validity check here, let's perform
1506 	 * an optimistic check.
1507 	 */
1508 	if (likely(sk->sk_rx_dst))
1509 		skb_dst_drop(skb);
1510 	else
1511 		skb_dst_force_safe(skb);
1512 
1513 	__skb_queue_tail(&tp->ucopy.prequeue, skb);
1514 	tp->ucopy.memory += skb->truesize;
1515 	if (skb_queue_len(&tp->ucopy.prequeue) >= 32 ||
1516 	    tp->ucopy.memory + atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) {
1517 		struct sk_buff *skb1;
1518 
1519 		BUG_ON(sock_owned_by_user(sk));
1520 		__NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPPREQUEUEDROPPED,
1521 				skb_queue_len(&tp->ucopy.prequeue));
1522 
1523 		while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
1524 			sk_backlog_rcv(sk, skb1);
1525 
1526 		tp->ucopy.memory = 0;
1527 	} else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
1528 		wake_up_interruptible_sync_poll(sk_sleep(sk),
1529 					   POLLIN | POLLRDNORM | POLLRDBAND);
1530 		if (!inet_csk_ack_scheduled(sk))
1531 			inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
1532 						  (3 * tcp_rto_min(sk)) / 4,
1533 						  TCP_RTO_MAX);
1534 	}
1535 	return true;
1536 }
1537 EXPORT_SYMBOL(tcp_prequeue);
1538 
1539 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1540 {
1541 	u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;
1542 
1543 	/* Only the socket owner can try to collapse/prune rx queues
1544 	 * to reduce memory overhead, so add a little headroom here.
1545 	 * Only a few socket backlogs are likely to be non-empty concurrently.
1546 	 */
1547 	limit += 64*1024;
1548 
1549 	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1550 	 * we can fix skb->truesize to its real value to avoid future drops.
1551 	 * This is valid because skb is not yet charged to the socket.
1552 	 * It has been noticed pure SACK packets were sometimes dropped
1553 	 * (if cooked by drivers without copybreak feature).
1554 	 */
1555 	if (!skb->data_len)
1556 		skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
1557 
1558 	if (unlikely(sk_add_backlog(sk, skb, limit))) {
1559 		bh_unlock_sock(sk);
1560 		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1561 		return true;
1562 	}
1563 	return false;
1564 }
1565 EXPORT_SYMBOL(tcp_add_backlog);
1566 
1567 /*
1568  *	From tcp_input.c
1569  */
1570 
1571 int tcp_v4_rcv(struct sk_buff *skb)
1572 {
1573 	struct net *net = dev_net(skb->dev);
1574 	const struct iphdr *iph;
1575 	const struct tcphdr *th;
1576 	bool refcounted;
1577 	struct sock *sk;
1578 	int ret;
1579 
1580 	if (skb->pkt_type != PACKET_HOST)
1581 		goto discard_it;
1582 
1583 	/* Count it even if it's bad */
1584 	__TCP_INC_STATS(net, TCP_MIB_INSEGS);
1585 
1586 	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1587 		goto discard_it;
1588 
1589 	th = (const struct tcphdr *)skb->data;
1590 
1591 	if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1592 		goto bad_packet;
1593 	if (!pskb_may_pull(skb, th->doff * 4))
1594 		goto discard_it;
1595 
1596 	/* An explanation is required here, I think.
1597 	 * Packet length and doff are validated by header prediction,
1598 	 * provided the case of th->doff == 0 is eliminated.
1599 	 * So, we defer the checks. */
1600 
1601 	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1602 		goto csum_error;
1603 
1604 	th = (const struct tcphdr *)skb->data;
1605 	iph = ip_hdr(skb);
1606 	/* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
1607 	 * barrier() makes sure the compiler won't play fool^Waliasing games.
1608 	 */
1609 	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1610 		sizeof(struct inet_skb_parm));
1611 	barrier();
1612 
1613 	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1614 	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1615 				    skb->len - th->doff * 4);
1616 	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1617 	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1618 	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1619 	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1620 	TCP_SKB_CB(skb)->sacked	 = 0;
1621 
1622 lookup:
1623 	sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1624 			       th->dest, &refcounted);
1625 	if (!sk)
1626 		goto no_tcp_socket;
1627 
1628 process:
1629 	if (sk->sk_state == TCP_TIME_WAIT)
1630 		goto do_time_wait;
1631 
1632 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
1633 		struct request_sock *req = inet_reqsk(sk);
1634 		struct sock *nsk;
1635 
1636 		sk = req->rsk_listener;
1637 		if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
1638 			sk_drops_add(sk, skb);
1639 			reqsk_put(req);
1640 			goto discard_it;
1641 		}
1642 		if (unlikely(sk->sk_state != TCP_LISTEN)) {
1643 			inet_csk_reqsk_queue_drop_and_put(sk, req);
1644 			goto lookup;
1645 		}
1646 		/* We own a reference on the listener, increase it again
1647 		 * as we might lose it too soon.
1648 		 */
1649 		sock_hold(sk);
1650 		refcounted = true;
1651 		nsk = tcp_check_req(sk, skb, req, false);
1652 		if (!nsk) {
1653 			reqsk_put(req);
1654 			goto discard_and_relse;
1655 		}
1656 		if (nsk == sk) {
1657 			reqsk_put(req);
1658 		} else if (tcp_child_process(sk, nsk, skb)) {
1659 			tcp_v4_send_reset(nsk, skb);
1660 			goto discard_and_relse;
1661 		} else {
1662 			sock_put(sk);
1663 			return 0;
1664 		}
1665 	}
1666 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1667 		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
1668 		goto discard_and_relse;
1669 	}
1670 
1671 	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1672 		goto discard_and_relse;
1673 
1674 	if (tcp_v4_inbound_md5_hash(sk, skb))
1675 		goto discard_and_relse;
1676 
1677 	nf_reset(skb);
1678 
1679 	if (sk_filter(sk, skb))
1680 		goto discard_and_relse;
1681 
1682 	skb->dev = NULL;
1683 
1684 	if (sk->sk_state == TCP_LISTEN) {
1685 		ret = tcp_v4_do_rcv(sk, skb);
1686 		goto put_and_return;
1687 	}
1688 
1689 	sk_incoming_cpu_update(sk);
1690 
1691 	bh_lock_sock_nested(sk);
1692 	tcp_segs_in(tcp_sk(sk), skb);
1693 	ret = 0;
1694 	if (!sock_owned_by_user(sk)) {
1695 		if (!tcp_prequeue(sk, skb))
1696 			ret = tcp_v4_do_rcv(sk, skb);
1697 	} else if (tcp_add_backlog(sk, skb)) {
1698 		goto discard_and_relse;
1699 	}
1700 	bh_unlock_sock(sk);
1701 
1702 put_and_return:
1703 	if (refcounted)
1704 		sock_put(sk);
1705 
1706 	return ret;
1707 
1708 no_tcp_socket:
1709 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1710 		goto discard_it;
1711 
1712 	if (tcp_checksum_complete(skb)) {
1713 csum_error:
1714 		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
1715 bad_packet:
1716 		__TCP_INC_STATS(net, TCP_MIB_INERRS);
1717 	} else {
1718 		tcp_v4_send_reset(NULL, skb);
1719 	}
1720 
1721 discard_it:
1722 	/* Discard frame. */
1723 	kfree_skb(skb);
1724 	return 0;
1725 
1726 discard_and_relse:
1727 	sk_drops_add(sk, skb);
1728 	if (refcounted)
1729 		sock_put(sk);
1730 	goto discard_it;
1731 
1732 do_time_wait:
1733 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1734 		inet_twsk_put(inet_twsk(sk));
1735 		goto discard_it;
1736 	}
1737 
1738 	if (tcp_checksum_complete(skb)) {
1739 		inet_twsk_put(inet_twsk(sk));
1740 		goto csum_error;
1741 	}
1742 	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1743 	case TCP_TW_SYN: {
1744 		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1745 							&tcp_hashinfo, skb,
1746 							__tcp_hdrlen(th),
1747 							iph->saddr, th->source,
1748 							iph->daddr, th->dest,
1749 							inet_iif(skb));
1750 		if (sk2) {
1751 			inet_twsk_deschedule_put(inet_twsk(sk));
1752 			sk = sk2;
1753 			refcounted = false;
1754 			goto process;
1755 		}
1756 		/* Fall through to ACK */
1757 	}
1758 	case TCP_TW_ACK:
1759 		tcp_v4_timewait_ack(sk, skb);
1760 		break;
1761 	case TCP_TW_RST:
1762 		tcp_v4_send_reset(sk, skb);
1763 		inet_twsk_deschedule_put(inet_twsk(sk));
1764 		goto discard_it;
1765 	case TCP_TW_SUCCESS:;
1766 	}
1767 	goto discard_it;
1768 }
1769 
1770 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1771 	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
1772 	.twsk_unique	= tcp_twsk_unique,
1773 	.twsk_destructor= tcp_twsk_destructor,
1774 };
1775 
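/* Cache the skb's input route and ingress ifindex on the socket; the
 * established fast path in tcp_v4_do_rcv() revalidates this cached dst.
 */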
1776 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
1777 {
1778 	struct dst_entry *dst = skb_dst(skb);
1779 
1780 	if (dst && dst_hold_safe(dst)) {
1781 		sk->sk_rx_dst = dst;
1782 		inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
1783 	}
1784 }
1785 EXPORT_SYMBOL(inet_sk_rx_dst_set);
1786 
1787 const struct inet_connection_sock_af_ops ipv4_specific = {
1788 	.queue_xmit	   = ip_queue_xmit,
1789 	.send_check	   = tcp_v4_send_check,
1790 	.rebuild_header	   = inet_sk_rebuild_header,
1791 	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
1792 	.conn_request	   = tcp_v4_conn_request,
1793 	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
1794 	.net_header_len	   = sizeof(struct iphdr),
1795 	.setsockopt	   = ip_setsockopt,
1796 	.getsockopt	   = ip_getsockopt,
1797 	.addr2sockaddr	   = inet_csk_addr2sockaddr,
1798 	.sockaddr_len	   = sizeof(struct sockaddr_in),
1799 	.bind_conflict	   = inet_csk_bind_conflict,
1800 #ifdef CONFIG_COMPAT
1801 	.compat_setsockopt = compat_ip_setsockopt,
1802 	.compat_getsockopt = compat_ip_getsockopt,
1803 #endif
1804 	.mtu_reduced	   = tcp_v4_mtu_reduced,
1805 };
1806 EXPORT_SYMBOL(ipv4_specific);
1807 
1808 #ifdef CONFIG_TCP_MD5SIG
1809 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1810 	.md5_lookup		= tcp_v4_md5_lookup,
1811 	.calc_md5_hash		= tcp_v4_md5_hash_skb,
1812 	.md5_parse		= tcp_v4_parse_md5_keys,
1813 };
1814 #endif
1815 
1816 /* NOTE: A lot of things are set to zero explicitly by the call to
1817  *       sk_alloc(), so they need not be done here.
1818  */
1819 static int tcp_v4_init_sock(struct sock *sk)
1820 {
1821 	struct inet_connection_sock *icsk = inet_csk(sk);
1822 
1823 	tcp_init_sock(sk);
1824 
1825 	icsk->icsk_af_ops = &ipv4_specific;
1826 
1827 #ifdef CONFIG_TCP_MD5SIG
1828 	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
1829 #endif
1830 
1831 	return 0;
1832 }
1833 
1834 void tcp_v4_destroy_sock(struct sock *sk)
1835 {
1836 	struct tcp_sock *tp = tcp_sk(sk);
1837 
1838 	tcp_clear_xmit_timers(sk);
1839 
1840 	tcp_cleanup_congestion_control(sk);
1841 
1842 	/* Clean up the write buffer. */
1843 	tcp_write_queue_purge(sk);
1844 
1845 	/* Cleans up our, hopefully empty, out_of_order_queue. */
1846 	skb_rbtree_purge(&tp->out_of_order_queue);
1847 
1848 #ifdef CONFIG_TCP_MD5SIG
1849 	/* Clean up the MD5 key list, if any */
1850 	if (tp->md5sig_info) {
1851 		tcp_clear_md5_list(sk);
1852 		kfree_rcu(tp->md5sig_info, rcu);
1853 		tp->md5sig_info = NULL;
1854 	}
1855 #endif
1856 
1857 	/* Clean up the prequeue; it really should be empty at this point. */
1858 	__skb_queue_purge(&tp->ucopy.prequeue);
1859 
1860 	/* Clean up a referenced TCP bind bucket. */
1861 	if (inet_csk(sk)->icsk_bind_hash)
1862 		inet_put_port(sk);
1863 
1864 	BUG_ON(tp->fastopen_rsk);
1865 
1866 	/* If socket is aborted during connect operation */
1867 	tcp_free_fastopen_req(tp);
1868 	tcp_saved_syn_free(tp);
1869 
1870 	local_bh_disable();
1871 	sk_sockets_allocated_dec(sk);
1872 	local_bh_enable();
1873 }
1874 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1875 
1876 #ifdef CONFIG_PROC_FS
1877 /* Proc filesystem TCP sock list dumping. */
1878 
1879 /*
1880  * Get the next listener socket following cur.  If cur is NULL, get the
1881  * first socket, starting from the bucket given in st->bucket; when
1882  * st->bucket is zero the very first socket in the hash table is returned.
1883  */
1884 static void *listening_get_next(struct seq_file *seq, void *cur)
1885 {
1886 	struct tcp_iter_state *st = seq->private;
1887 	struct net *net = seq_file_net(seq);
1888 	struct inet_listen_hashbucket *ilb;
1889 	struct sock *sk = cur;
1890 
1891 	if (!sk) {
1892 get_head:
1893 		ilb = &tcp_hashinfo.listening_hash[st->bucket];
1894 		spin_lock_bh(&ilb->lock);
1895 		sk = sk_head(&ilb->head);
1896 		st->offset = 0;
1897 		goto get_sk;
1898 	}
1899 	ilb = &tcp_hashinfo.listening_hash[st->bucket];
1900 	++st->num;
1901 	++st->offset;
1902 
1903 	sk = sk_next(sk);
1904 get_sk:
1905 	sk_for_each_from(sk) {
1906 		if (!net_eq(sock_net(sk), net))
1907 			continue;
1908 		if (sk->sk_family == st->family)
1909 			return sk;
1910 	}
1911 	spin_unlock_bh(&ilb->lock);
1912 	st->offset = 0;
1913 	if (++st->bucket < INET_LHTABLE_SIZE)
1914 		goto get_head;
1915 	return NULL;
1916 }
1917 
1918 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1919 {
1920 	struct tcp_iter_state *st = seq->private;
1921 	void *rc;
1922 
1923 	st->bucket = 0;
1924 	st->offset = 0;
1925 	rc = listening_get_next(seq, NULL);
1926 
1927 	while (rc && *pos) {
1928 		rc = listening_get_next(seq, rc);
1929 		--*pos;
1930 	}
1931 	return rc;
1932 }
1933 
1934 static inline bool empty_bucket(const struct tcp_iter_state *st)
1935 {
1936 	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
1937 }
1938 
1939 /*
1940  * Get the first established socket, starting from the bucket given in st->bucket.
1941  * If st->bucket is zero, the very first socket in the hash is returned.
1942  */
1943 static void *established_get_first(struct seq_file *seq)
1944 {
1945 	struct tcp_iter_state *st = seq->private;
1946 	struct net *net = seq_file_net(seq);
1947 	void *rc = NULL;
1948 
1949 	st->offset = 0;
1950 	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
1951 		struct sock *sk;
1952 		struct hlist_nulls_node *node;
1953 		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
1954 
1955 		/* Lockless fast path for the common case of empty buckets */
1956 		if (empty_bucket(st))
1957 			continue;
1958 
1959 		spin_lock_bh(lock);
1960 		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
1961 			if (sk->sk_family != st->family ||
1962 			    !net_eq(sock_net(sk), net)) {
1963 				continue;
1964 			}
1965 			rc = sk;
1966 			goto out;
1967 		}
1968 		spin_unlock_bh(lock);
1969 	}
1970 out:
1971 	return rc;
1972 }
1973 
1974 static void *established_get_next(struct seq_file *seq, void *cur)
1975 {
1976 	struct sock *sk = cur;
1977 	struct hlist_nulls_node *node;
1978 	struct tcp_iter_state *st = seq->private;
1979 	struct net *net = seq_file_net(seq);
1980 
1981 	++st->num;
1982 	++st->offset;
1983 
1984 	sk = sk_nulls_next(sk);
1985 
1986 	sk_nulls_for_each_from(sk, node) {
1987 		if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
1988 			return sk;
1989 	}
1990 
1991 	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
1992 	++st->bucket;
1993 	return established_get_first(seq);
1994 }
1995 
1996 static void *established_get_idx(struct seq_file *seq, loff_t pos)
1997 {
1998 	struct tcp_iter_state *st = seq->private;
1999 	void *rc;
2000 
2001 	st->bucket = 0;
2002 	rc = established_get_first(seq);
2003 
2004 	while (rc && pos) {
2005 		rc = established_get_next(seq, rc);
2006 		--pos;
2007 	}
2008 	return rc;
2009 }
2010 
2011 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2012 {
2013 	void *rc;
2014 	struct tcp_iter_state *st = seq->private;
2015 
2016 	st->state = TCP_SEQ_STATE_LISTENING;
2017 	rc	  = listening_get_idx(seq, &pos);
2018 
2019 	if (!rc) {
2020 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2021 		rc	  = established_get_idx(seq, pos);
2022 	}
2023 
2024 	return rc;
2025 }
2026 
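/* Resume the walk at the bucket/offset remembered in the iterator state
 * rather than rescanning from the start; tcp_seq_start() only calls this
 * when *pos still matches st->last_pos.
 */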
2027 static void *tcp_seek_last_pos(struct seq_file *seq)
2028 {
2029 	struct tcp_iter_state *st = seq->private;
2030 	int offset = st->offset;
2031 	int orig_num = st->num;
2032 	void *rc = NULL;
2033 
2034 	switch (st->state) {
2035 	case TCP_SEQ_STATE_LISTENING:
2036 		if (st->bucket >= INET_LHTABLE_SIZE)
2037 			break;
2038 		st->state = TCP_SEQ_STATE_LISTENING;
2039 		rc = listening_get_next(seq, NULL);
2040 		while (offset-- && rc)
2041 			rc = listening_get_next(seq, rc);
2042 		if (rc)
2043 			break;
2044 		st->bucket = 0;
2045 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2046 		/* Fallthrough */
2047 	case TCP_SEQ_STATE_ESTABLISHED:
2048 		if (st->bucket > tcp_hashinfo.ehash_mask)
2049 			break;
2050 		rc = established_get_first(seq);
2051 		while (offset-- && rc)
2052 			rc = established_get_next(seq, rc);
2053 	}
2054 
2055 	st->num = orig_num;
2056 
2057 	return rc;
2058 }
2059 
2060 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2061 {
2062 	struct tcp_iter_state *st = seq->private;
2063 	void *rc;
2064 
2065 	if (*pos && *pos == st->last_pos) {
2066 		rc = tcp_seek_last_pos(seq);
2067 		if (rc)
2068 			goto out;
2069 	}
2070 
2071 	st->state = TCP_SEQ_STATE_LISTENING;
2072 	st->num = 0;
2073 	st->bucket = 0;
2074 	st->offset = 0;
2075 	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2076 
2077 out:
2078 	st->last_pos = *pos;
2079 	return rc;
2080 }
2081 
2082 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2083 {
2084 	struct tcp_iter_state *st = seq->private;
2085 	void *rc = NULL;
2086 
2087 	if (v == SEQ_START_TOKEN) {
2088 		rc = tcp_get_idx(seq, 0);
2089 		goto out;
2090 	}
2091 
2092 	switch (st->state) {
2093 	case TCP_SEQ_STATE_LISTENING:
2094 		rc = listening_get_next(seq, v);
2095 		if (!rc) {
2096 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2097 			st->bucket = 0;
2098 			st->offset = 0;
2099 			rc	  = established_get_first(seq);
2100 		}
2101 		break;
2102 	case TCP_SEQ_STATE_ESTABLISHED:
2103 		rc = established_get_next(seq, v);
2104 		break;
2105 	}
2106 out:
2107 	++*pos;
2108 	st->last_pos = *pos;
2109 	return rc;
2110 }
2111 
2112 static void tcp_seq_stop(struct seq_file *seq, void *v)
2113 {
2114 	struct tcp_iter_state *st = seq->private;
2115 
2116 	switch (st->state) {
2117 	case TCP_SEQ_STATE_LISTENING:
2118 		if (v != SEQ_START_TOKEN)
2119 			spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2120 		break;
2121 	case TCP_SEQ_STATE_ESTABLISHED:
2122 		if (v)
2123 			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2124 		break;
2125 	}
2126 }
2127 
2128 int tcp_seq_open(struct inode *inode, struct file *file)
2129 {
2130 	struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
2131 	struct tcp_iter_state *s;
2132 	int err;
2133 
2134 	err = seq_open_net(inode, file, &afinfo->seq_ops,
2135 			  sizeof(struct tcp_iter_state));
2136 	if (err < 0)
2137 		return err;
2138 
2139 	s = ((struct seq_file *)file->private_data)->private;
2140 	s->family		= afinfo->family;
2141 	s->last_pos		= 0;
2142 	return 0;
2143 }
2144 EXPORT_SYMBOL(tcp_seq_open);
2145 
2146 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2147 {
2148 	int rc = 0;
2149 	struct proc_dir_entry *p;
2150 
2151 	afinfo->seq_ops.start		= tcp_seq_start;
2152 	afinfo->seq_ops.next		= tcp_seq_next;
2153 	afinfo->seq_ops.stop		= tcp_seq_stop;
2154 
2155 	p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2156 			     afinfo->seq_fops, afinfo);
2157 	if (!p)
2158 		rc = -ENOMEM;
2159 	return rc;
2160 }
2161 EXPORT_SYMBOL(tcp_proc_register);
2162 
2163 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2164 {
2165 	remove_proc_entry(afinfo->name, net->proc_net);
2166 }
2167 EXPORT_SYMBOL(tcp_proc_unregister);
2168 
2169 static void get_openreq4(const struct request_sock *req,
2170 			 struct seq_file *f, int i)
2171 {
2172 	const struct inet_request_sock *ireq = inet_rsk(req);
2173 	long delta = req->rsk_timer.expires - jiffies;
2174 
2175 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2176 		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2177 		i,
2178 		ireq->ir_loc_addr,
2179 		ireq->ir_num,
2180 		ireq->ir_rmt_addr,
2181 		ntohs(ireq->ir_rmt_port),
2182 		TCP_SYN_RECV,
2183 		0, 0, /* could print option size, but that is af dependent. */
2184 		1,    /* timers active (only the expire timer) */
2185 		jiffies_delta_to_clock_t(delta),
2186 		req->num_timeout,
2187 		from_kuid_munged(seq_user_ns(f),
2188 				 sock_i_uid(req->rsk_listener)),
2189 		0,  /* non-standard timer */
2190 		0, /* open_requests have no inode */
2191 		0,
2192 		req);
2193 }
2194 
2195 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2196 {
2197 	int timer_active;
2198 	unsigned long timer_expires;
2199 	const struct tcp_sock *tp = tcp_sk(sk);
2200 	const struct inet_connection_sock *icsk = inet_csk(sk);
2201 	const struct inet_sock *inet = inet_sk(sk);
2202 	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2203 	__be32 dest = inet->inet_daddr;
2204 	__be32 src = inet->inet_rcv_saddr;
2205 	__u16 destp = ntohs(inet->inet_dport);
2206 	__u16 srcp = ntohs(inet->inet_sport);
2207 	int rx_queue;
2208 	int state;
2209 
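	/* Encode which timer is pending for the "tr" column of /proc/net/tcp:
	 * 1 = retransmit/loss probe, 4 = zero-window probe,
	 * 2 = keepalive (sk_timer), 0 = no timer armed.
	 */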
2210 	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2211 	    icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
2212 	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2213 		timer_active	= 1;
2214 		timer_expires	= icsk->icsk_timeout;
2215 	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2216 		timer_active	= 4;
2217 		timer_expires	= icsk->icsk_timeout;
2218 	} else if (timer_pending(&sk->sk_timer)) {
2219 		timer_active	= 2;
2220 		timer_expires	= sk->sk_timer.expires;
2221 	} else {
2222 		timer_active	= 0;
2223 		timer_expires = jiffies;
2224 	}
2225 
2226 	state = sk_state_load(sk);
2227 	if (state == TCP_LISTEN)
2228 		rx_queue = sk->sk_ack_backlog;
2229 	else
2230 		/* Because we don't lock the socket,
2231 		 * we might find a transient negative value.
2232 		 */
2233 		rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2234 
2235 	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2236 			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2237 		i, src, srcp, dest, destp, state,
2238 		tp->write_seq - tp->snd_una,
2239 		rx_queue,
2240 		timer_active,
2241 		jiffies_delta_to_clock_t(timer_expires - jiffies),
2242 		icsk->icsk_retransmits,
2243 		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2244 		icsk->icsk_probes_out,
2245 		sock_i_ino(sk),
2246 		atomic_read(&sk->sk_refcnt), sk,
2247 		jiffies_to_clock_t(icsk->icsk_rto),
2248 		jiffies_to_clock_t(icsk->icsk_ack.ato),
2249 		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2250 		tp->snd_cwnd,
2251 		state == TCP_LISTEN ?
2252 		    fastopenq->max_qlen :
2253 		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2254 }
2255 
2256 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2257 			       struct seq_file *f, int i)
2258 {
2259 	long delta = tw->tw_timer.expires - jiffies;
2260 	__be32 dest, src;
2261 	__u16 destp, srcp;
2262 
2263 	dest  = tw->tw_daddr;
2264 	src   = tw->tw_rcv_saddr;
2265 	destp = ntohs(tw->tw_dport);
2266 	srcp  = ntohs(tw->tw_sport);
2267 
2268 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2269 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2270 		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2271 		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2272 		atomic_read(&tw->tw_refcnt), tw);
2273 }
2274 
2275 #define TMPSZ 150
2276 
2277 static int tcp4_seq_show(struct seq_file *seq, void *v)
2278 {
2279 	struct tcp_iter_state *st;
2280 	struct sock *sk = v;
2281 
2282 	seq_setwidth(seq, TMPSZ - 1);
2283 	if (v == SEQ_START_TOKEN) {
2284 		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2285 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2286 			   "inode");
2287 		goto out;
2288 	}
2289 	st = seq->private;
2290 
2291 	if (sk->sk_state == TCP_TIME_WAIT)
2292 		get_timewait4_sock(v, seq, st->num);
2293 	else if (sk->sk_state == TCP_NEW_SYN_RECV)
2294 		get_openreq4(v, seq, st->num);
2295 	else
2296 		get_tcp4_sock(v, seq, st->num);
2297 out:
2298 	seq_pad(seq, '\n');
2299 	return 0;
2300 }
2301 
2302 static const struct file_operations tcp_afinfo_seq_fops = {
2303 	.owner   = THIS_MODULE,
2304 	.open    = tcp_seq_open,
2305 	.read    = seq_read,
2306 	.llseek  = seq_lseek,
2307 	.release = seq_release_net
2308 };
2309 
2310 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2311 	.name		= "tcp",
2312 	.family		= AF_INET,
2313 	.seq_fops	= &tcp_afinfo_seq_fops,
2314 	.seq_ops	= {
2315 		.show		= tcp4_seq_show,
2316 	},
2317 };
2318 
2319 static int __net_init tcp4_proc_init_net(struct net *net)
2320 {
2321 	return tcp_proc_register(net, &tcp4_seq_afinfo);
2322 }
2323 
2324 static void __net_exit tcp4_proc_exit_net(struct net *net)
2325 {
2326 	tcp_proc_unregister(net, &tcp4_seq_afinfo);
2327 }
2328 
2329 static struct pernet_operations tcp4_net_ops = {
2330 	.init = tcp4_proc_init_net,
2331 	.exit = tcp4_proc_exit_net,
2332 };
2333 
2334 int __init tcp4_proc_init(void)
2335 {
2336 	return register_pernet_subsys(&tcp4_net_ops);
2337 }
2338 
2339 void tcp4_proc_exit(void)
2340 {
2341 	unregister_pernet_subsys(&tcp4_net_ops);
2342 }
2343 #endif /* CONFIG_PROC_FS */
2344 
2345 struct proto tcp_prot = {
2346 	.name			= "TCP",
2347 	.owner			= THIS_MODULE,
2348 	.close			= tcp_close,
2349 	.connect		= tcp_v4_connect,
2350 	.disconnect		= tcp_disconnect,
2351 	.accept			= inet_csk_accept,
2352 	.ioctl			= tcp_ioctl,
2353 	.init			= tcp_v4_init_sock,
2354 	.destroy		= tcp_v4_destroy_sock,
2355 	.shutdown		= tcp_shutdown,
2356 	.setsockopt		= tcp_setsockopt,
2357 	.getsockopt		= tcp_getsockopt,
2358 	.recvmsg		= tcp_recvmsg,
2359 	.sendmsg		= tcp_sendmsg,
2360 	.sendpage		= tcp_sendpage,
2361 	.backlog_rcv		= tcp_v4_do_rcv,
2362 	.release_cb		= tcp_release_cb,
2363 	.hash			= inet_hash,
2364 	.unhash			= inet_unhash,
2365 	.get_port		= inet_csk_get_port,
2366 	.enter_memory_pressure	= tcp_enter_memory_pressure,
2367 	.stream_memory_free	= tcp_stream_memory_free,
2368 	.sockets_allocated	= &tcp_sockets_allocated,
2369 	.orphan_count		= &tcp_orphan_count,
2370 	.memory_allocated	= &tcp_memory_allocated,
2371 	.memory_pressure	= &tcp_memory_pressure,
2372 	.sysctl_mem		= sysctl_tcp_mem,
2373 	.sysctl_wmem		= sysctl_tcp_wmem,
2374 	.sysctl_rmem		= sysctl_tcp_rmem,
2375 	.max_header		= MAX_TCP_HEADER,
2376 	.obj_size		= sizeof(struct tcp_sock),
2377 	.slab_flags		= SLAB_DESTROY_BY_RCU,
2378 	.twsk_prot		= &tcp_timewait_sock_ops,
2379 	.rsk_prot		= &tcp_request_sock_ops,
2380 	.h.hashinfo		= &tcp_hashinfo,
2381 	.no_autobind		= true,
2382 #ifdef CONFIG_COMPAT
2383 	.compat_setsockopt	= compat_tcp_setsockopt,
2384 	.compat_getsockopt	= compat_tcp_getsockopt,
2385 #endif
2386 	.diag_destroy		= tcp_abort,
2387 };
2388 EXPORT_SYMBOL(tcp_prot);
2389 
2390 static void __net_exit tcp_sk_exit(struct net *net)
2391 {
2392 	int cpu;
2393 
2394 	for_each_possible_cpu(cpu)
2395 		inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2396 	free_percpu(net->ipv4.tcp_sk);
2397 }
2398 
2399 static int __net_init tcp_sk_init(struct net *net)
2400 {
2401 	int res, cpu;
2402 
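	/* One kernel control socket per possible CPU; tcp_v4_send_reset() and
	 * tcp_v4_send_ack() use these to emit control segments.
	 */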
2403 	net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2404 	if (!net->ipv4.tcp_sk)
2405 		return -ENOMEM;
2406 
2407 	for_each_possible_cpu(cpu) {
2408 		struct sock *sk;
2409 
2410 		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2411 					   IPPROTO_TCP, net);
2412 		if (res)
2413 			goto fail;
2414 		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2415 		*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2416 	}
2417 
2418 	net->ipv4.sysctl_tcp_ecn = 2;
2419 	net->ipv4.sysctl_tcp_ecn_fallback = 1;
2420 
2421 	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2422 	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2423 	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2424 
2425 	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2426 	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2427 	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2428 
2429 	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2430 	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2431 	net->ipv4.sysctl_tcp_syncookies = 1;
2432 	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2433 	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2434 	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2435 	net->ipv4.sysctl_tcp_orphan_retries = 0;
2436 	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2437 	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
2438 
2439 	return 0;
2440 fail:
2441 	tcp_sk_exit(net);
2442 
2443 	return res;
2444 }
2445 
2446 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2447 {
2448 	inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2449 }
2450 
2451 static struct pernet_operations __net_initdata tcp_sk_ops = {
2452 	.init	    = tcp_sk_init,
2453 	.exit	    = tcp_sk_exit,
2454 	.exit_batch = tcp_sk_exit_batch,
2455 };
2456 
2457 void __init tcp_v4_init(void)
2458 {
2459 	inet_hashinfo_init(&tcp_hashinfo);
2460 	if (register_pernet_subsys(&tcp_sk_ops))
2461 		panic("Failed to create the TCP control socket.\n");
2462 }
2463