xref: /openbmc/linux/net/ipv4/tcp_ipv4.c (revision 8b036556)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		Implementation of the Transmission Control Protocol(TCP).
7  *
8  *		IPv4 specific functions
9  *
10  *
11  *		code split from:
12  *		linux/ipv4/tcp.c
13  *		linux/ipv4/tcp_input.c
14  *		linux/ipv4/tcp_output.c
15  *
16  *		See tcp.c for author information
17  *
18  *	This program is free software; you can redistribute it and/or
19  *      modify it under the terms of the GNU General Public License
20  *      as published by the Free Software Foundation; either version
21  *      2 of the License, or (at your option) any later version.
22  */
23 
24 /*
25  * Changes:
26  *		David S. Miller	:	New socket lookup architecture.
27  *					This code is dedicated to John Dyson.
28  *		David S. Miller :	Change semantics of established hash,
29  *					half is devoted to TIME_WAIT sockets
30  *					and the rest go in the other half.
31  *		Andi Kleen :		Add support for syncookies and fixed
32  *					some bugs: ip options weren't passed to
33  *					the TCP layer, missed a check for an
34  *					ACK bit.
35  *		Andi Kleen :		Implemented fast path mtu discovery.
36  *	     				Fixed many serious bugs in the
37  *					request_sock handling and moved
38  *					most of it into the af independent code.
39  *					Added tail drop and some other bugfixes.
40  *					Added new listen semantics.
41  *		Mike McLagan	:	Routing by source
42  *	Juan Jose Ciarlante:		ip_dynaddr bits
43  *		Andi Kleen:		various fixes.
44  *	Vitaly E. Lavrov	:	Transparent proxy revived after year
45  *					coma.
46  *	Andi Kleen		:	Fix new listen.
47  *	Andi Kleen		:	Fix accept error reporting.
48  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
49  *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
50  *					a single port at the same time.
51  */
52 
53 #define pr_fmt(fmt) "TCP: " fmt
54 
55 #include <linux/bottom_half.h>
56 #include <linux/types.h>
57 #include <linux/fcntl.h>
58 #include <linux/module.h>
59 #include <linux/random.h>
60 #include <linux/cache.h>
61 #include <linux/jhash.h>
62 #include <linux/init.h>
63 #include <linux/times.h>
64 #include <linux/slab.h>
65 
66 #include <net/net_namespace.h>
67 #include <net/icmp.h>
68 #include <net/inet_hashtables.h>
69 #include <net/tcp.h>
70 #include <net/transp_v6.h>
71 #include <net/ipv6.h>
72 #include <net/inet_common.h>
73 #include <net/timewait_sock.h>
74 #include <net/xfrm.h>
75 #include <net/secure_seq.h>
76 #include <net/tcp_memcontrol.h>
77 #include <net/busy_poll.h>
78 
79 #include <linux/inet.h>
80 #include <linux/ipv6.h>
81 #include <linux/stddef.h>
82 #include <linux/proc_fs.h>
83 #include <linux/seq_file.h>
84 
85 #include <linux/crypto.h>
86 #include <linux/scatterlist.h>
87 
/* Consulted by tcp_twsk_unique() when deciding whether a TIME-WAIT
 * bucket may be taken over by a new connection.
 */
int sysctl_tcp_tw_reuse __read_mostly;
/* Exported for use outside this file. */
int sysctl_tcp_low_latency __read_mostly;
EXPORT_SYMBOL(sysctl_tcp_low_latency);

#ifdef CONFIG_TCP_MD5SIG
/* Forward declaration: the RST/ACK reply builders below call this
 * before its definition appears.
 */
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
#endif

/* Hash tables passed to the TCP socket lookups done in this file. */
struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);
99 
100 static  __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
101 {
102 	return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
103 					  ip_hdr(skb)->saddr,
104 					  tcp_hdr(skb)->dest,
105 					  tcp_hdr(skb)->source);
106 }
107 
108 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
109 {
110 	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
111 	struct tcp_sock *tp = tcp_sk(sk);
112 
113 	/* With PAWS, it is safe from the viewpoint
114 	   of data integrity. Even without PAWS it is safe provided sequence
115 	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.
116 
117 	   Actually, the idea is close to VJ's one, only timestamp cache is
118 	   held not per host, but per port pair and TW bucket is used as state
119 	   holder.
120 
121 	   If TW bucket has been already destroyed we fall back to VJ's scheme
122 	   and use initial timestamp retrieved from peer table.
123 	 */
124 	if (tcptw->tw_ts_recent_stamp &&
125 	    (twp == NULL || (sysctl_tcp_tw_reuse &&
126 			     get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
127 		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
128 		if (tp->write_seq == 0)
129 			tp->write_seq = 1;
130 		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
131 		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
132 		sock_hold(sktw);
133 		return 1;
134 	}
135 
136 	return 0;
137 }
138 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
139 
/*
 * tcp_v4_connect - initiate an outgoing IPv4 TCP connection.
 * @sk:       socket to connect
 * @uaddr:    destination; interpreted as a struct sockaddr_in
 * @addr_len: length of @uaddr in bytes
 *
 * Resolves a route to the destination, fills in the socket's source and
 * destination address/port, switches the socket to SYN-SENT, hashes it
 * (which also selects the source port), re-routes with the final ports
 * and finally calls tcp_connect().
 *
 * Returns 0 on success, otherwise a negative errno:
 *   -EINVAL        @addr_len too short, or a zero destination combined
 *                  with a source-route option;
 *   -EAFNOSUPPORT  @uaddr is not AF_INET;
 *   -ENETUNREACH   the route is multicast or broadcast;
 *   or whatever ip_route_connect(), inet_hash_connect(),
 *   ip_route_newports() or tcp_connect() reported.
 *
 * NOTE(review): the rcu_dereference_protected(..., sock_owned_by_user())
 * below suggests the caller is expected to own the socket lock - confirm.
 */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	__be16 orig_sport, orig_dport;
	__be32 daddr, nexthop;
	struct flowi4 *fl4;
	struct rtable *rt;
	int err;
	struct ip_options_rcu *inet_opt;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	inet_opt = rcu_dereference_protected(inet->inet_opt,
					     sock_owned_by_user(sk));
	/* With a source-route option the route lookup targets the option's
	 * faddr rather than the user-supplied destination.
	 */
	if (inet_opt && inet_opt->opt.srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet_opt->opt.faddr;
	}

	/* Remember the ports used for the first lookup so that
	 * ip_route_newports() can tell whether they changed.
	 */
	orig_sport = inet->inet_sport;
	orig_dport = usin->sin_port;
	fl4 = &inet->cork.fl.u.ip4;
	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
			      IPPROTO_TCP,
			      orig_sport, orig_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		if (err == -ENETUNREACH)
			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
		return err;
	}

	/* TCP is unicast only. */
	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	/* Without source routing, adopt the destination chosen by routing. */
	if (!inet_opt || !inet_opt->opt.srr)
		daddr = fl4->daddr;

	/* Unbound socket: take the source address the route picked. */
	if (!inet->inet_saddr)
		inet->inet_saddr = fl4->saddr;
	inet->inet_rcv_saddr = inet->inet_saddr;

	/* Timestamp state left over from a previous peer must not be
	 * carried over to a different destination.
	 */
	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent	   = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		if (likely(!tp->repair))
			tp->write_seq	   = 0;
	}

	if (tcp_death_row.sysctl_tw_recycle &&
	    !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
		tcp_fetch_timewait_stamp(sk, &rt->dst);

	inet->inet_dport = usin->sin_port;
	inet->inet_daddr = daddr;

	/* Account for IP options in the extension header length. */
	inet_csk(sk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;

	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and not releasing socket
	 * lock select source port, enter ourselves into the hash tables and
	 * complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(&tcp_death_row, sk);
	if (err)
		goto failure;

	inet_set_txhash(sk);

	/* Redo the route lookup now that the final ports are known. */
	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
			       inet->inet_sport, inet->inet_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		/* The failure path calls ip_rt_put(rt); do not hand it an
		 * error pointer.
		 */
		rt = NULL;
		goto failure;
	}
	/* OK, now commit destination to socket.  */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->dst);

	/* Pick an initial sequence number unless one was inherited above
	 * or the socket is in repair mode.
	 */
	if (!tp->write_seq && likely(!tp->repair))
		tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
							   inet->inet_daddr,
							   inet->inet_sport,
							   usin->sin_port);

	inet->inet_id = tp->write_seq ^ jiffies;

	err = tcp_connect(sk);

	/* The route was handed to the socket by sk_setup_caps() above;
	 * presumably cleared here so the failure path does not drop it again.
	 */
	rt = NULL;
	if (err)
		goto failure;

	return 0;

failure:
	/*
	 * This unhashes the socket and releases the local port,
	 * if necessary.
	 */
	tcp_set_state(sk, TCP_CLOSE);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->inet_dport = 0;
	return err;
}
EXPORT_SYMBOL(tcp_v4_connect);
266 
267 /*
268  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
269  * It can be called through tcp_release_cb() if socket was owned by user
270  * at the time tcp_v4_err() was called to handle ICMP message.
271  */
272 void tcp_v4_mtu_reduced(struct sock *sk)
273 {
274 	struct dst_entry *dst;
275 	struct inet_sock *inet = inet_sk(sk);
276 	u32 mtu = tcp_sk(sk)->mtu_info;
277 
278 	dst = inet_csk_update_pmtu(sk, mtu);
279 	if (!dst)
280 		return;
281 
282 	/* Something is about to be wrong... Remember soft error
283 	 * for the case, if this connection will not able to recover.
284 	 */
285 	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
286 		sk->sk_err_soft = EMSGSIZE;
287 
288 	mtu = dst_mtu(dst);
289 
290 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
291 	    ip_sk_accept_pmtu(sk) &&
292 	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
293 		tcp_sync_mss(sk, mtu);
294 
295 		/* Resend the TCP packet because it's
296 		 * clear that the old packet has been
297 		 * dropped. This is the new "fast" path mtu
298 		 * discovery.
299 		 */
300 		tcp_simple_retransmit(sk);
301 	} /* else let the usual retransmit timer handle it */
302 }
303 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
304 
305 static void do_redirect(struct sk_buff *skb, struct sock *sk)
306 {
307 	struct dst_entry *dst = __sk_dst_check(sk, 0);
308 
309 	if (dst)
310 		dst->ops->redirect(dst, sk, skb);
311 }
312 
313 /*
314  * This routine is called by the ICMP module when it gets some
315  * sort of error condition.  If err < 0 then the socket should
316  * be closed and the error returned to the user.  If err > 0
317  * it's just the icmp type << 8 | icmp code.  After adjustment
318  * header points to the first 8 bytes of the tcp header.  We need
319  * to find the appropriate port.
320  *
321  * The locking strategy used here is very "optimistic". When
322  * someone else accesses the socket the ICMP is just dropped
323  * and for some paths there is no check at all.
324  * A more general error queue to queue errors for later handling
325  * is probably better.
326  *
327  */
328 
329 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
330 {
331 	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
332 	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
333 	struct inet_connection_sock *icsk;
334 	struct tcp_sock *tp;
335 	struct inet_sock *inet;
336 	const int type = icmp_hdr(icmp_skb)->type;
337 	const int code = icmp_hdr(icmp_skb)->code;
338 	struct sock *sk;
339 	struct sk_buff *skb;
340 	struct request_sock *fastopen;
341 	__u32 seq, snd_una;
342 	__u32 remaining;
343 	int err;
344 	struct net *net = dev_net(icmp_skb->dev);
345 
346 	sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
347 			iph->saddr, th->source, inet_iif(icmp_skb));
348 	if (!sk) {
349 		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
350 		return;
351 	}
352 	if (sk->sk_state == TCP_TIME_WAIT) {
353 		inet_twsk_put(inet_twsk(sk));
354 		return;
355 	}
356 
357 	bh_lock_sock(sk);
358 	/* If too many ICMPs get dropped on busy
359 	 * servers this needs to be solved differently.
360 	 * We do take care of PMTU discovery (RFC1191) special case :
361 	 * we can receive locally generated ICMP messages while socket is held.
362 	 */
363 	if (sock_owned_by_user(sk)) {
364 		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
365 			NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
366 	}
367 	if (sk->sk_state == TCP_CLOSE)
368 		goto out;
369 
370 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
371 		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
372 		goto out;
373 	}
374 
375 	icsk = inet_csk(sk);
376 	tp = tcp_sk(sk);
377 	seq = ntohl(th->seq);
378 	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
379 	fastopen = tp->fastopen_rsk;
380 	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
381 	if (sk->sk_state != TCP_LISTEN &&
382 	    !between(seq, snd_una, tp->snd_nxt)) {
383 		NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
384 		goto out;
385 	}
386 
387 	switch (type) {
388 	case ICMP_REDIRECT:
389 		do_redirect(icmp_skb, sk);
390 		goto out;
391 	case ICMP_SOURCE_QUENCH:
392 		/* Just silently ignore these. */
393 		goto out;
394 	case ICMP_PARAMETERPROB:
395 		err = EPROTO;
396 		break;
397 	case ICMP_DEST_UNREACH:
398 		if (code > NR_ICMP_UNREACH)
399 			goto out;
400 
401 		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
402 			/* We are not interested in TCP_LISTEN and open_requests
403 			 * (SYN-ACKs send out by Linux are always <576bytes so
404 			 * they should go through unfragmented).
405 			 */
406 			if (sk->sk_state == TCP_LISTEN)
407 				goto out;
408 
409 			tp->mtu_info = info;
410 			if (!sock_owned_by_user(sk)) {
411 				tcp_v4_mtu_reduced(sk);
412 			} else {
413 				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
414 					sock_hold(sk);
415 			}
416 			goto out;
417 		}
418 
419 		err = icmp_err_convert[code].errno;
420 		/* check if icmp_skb allows revert of backoff
421 		 * (see draft-zimmermann-tcp-lcd) */
422 		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
423 			break;
424 		if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
425 		    !icsk->icsk_backoff || fastopen)
426 			break;
427 
428 		if (sock_owned_by_user(sk))
429 			break;
430 
431 		icsk->icsk_backoff--;
432 		icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
433 					       TCP_TIMEOUT_INIT;
434 		icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
435 
436 		skb = tcp_write_queue_head(sk);
437 		BUG_ON(!skb);
438 
439 		remaining = icsk->icsk_rto -
440 			    min(icsk->icsk_rto,
441 				tcp_time_stamp - tcp_skb_timestamp(skb));
442 
443 		if (remaining) {
444 			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
445 						  remaining, TCP_RTO_MAX);
446 		} else {
447 			/* RTO revert clocked out retransmission.
448 			 * Will retransmit now */
449 			tcp_retransmit_timer(sk);
450 		}
451 
452 		break;
453 	case ICMP_TIME_EXCEEDED:
454 		err = EHOSTUNREACH;
455 		break;
456 	default:
457 		goto out;
458 	}
459 
460 	switch (sk->sk_state) {
461 		struct request_sock *req, **prev;
462 	case TCP_LISTEN:
463 		if (sock_owned_by_user(sk))
464 			goto out;
465 
466 		req = inet_csk_search_req(sk, &prev, th->dest,
467 					  iph->daddr, iph->saddr);
468 		if (!req)
469 			goto out;
470 
471 		/* ICMPs are not backlogged, hence we cannot get
472 		   an established socket here.
473 		 */
474 		WARN_ON(req->sk);
475 
476 		if (seq != tcp_rsk(req)->snt_isn) {
477 			NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
478 			goto out;
479 		}
480 
481 		/*
482 		 * Still in SYN_RECV, just remove it silently.
483 		 * There is no good way to pass the error to the newly
484 		 * created socket, and POSIX does not want network
485 		 * errors returned from accept().
486 		 */
487 		inet_csk_reqsk_queue_drop(sk, req, prev);
488 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
489 		goto out;
490 
491 	case TCP_SYN_SENT:
492 	case TCP_SYN_RECV:
493 		/* Only in fast or simultaneous open. If a fast open socket is
494 		 * is already accepted it is treated as a connected one below.
495 		 */
496 		if (fastopen && fastopen->sk == NULL)
497 			break;
498 
499 		if (!sock_owned_by_user(sk)) {
500 			sk->sk_err = err;
501 
502 			sk->sk_error_report(sk);
503 
504 			tcp_done(sk);
505 		} else {
506 			sk->sk_err_soft = err;
507 		}
508 		goto out;
509 	}
510 
511 	/* If we've already connected we will keep trying
512 	 * until we time out, or the user gives up.
513 	 *
514 	 * rfc1122 4.2.3.9 allows to consider as hard errors
515 	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
516 	 * but it is obsoleted by pmtu discovery).
517 	 *
518 	 * Note, that in modern internet, where routing is unreliable
519 	 * and in each dark corner broken firewalls sit, sending random
520 	 * errors ordered by their masters even this two messages finally lose
521 	 * their original sense (even Linux sends invalid PORT_UNREACHs)
522 	 *
523 	 * Now we are in compliance with RFCs.
524 	 *							--ANK (980905)
525 	 */
526 
527 	inet = inet_sk(sk);
528 	if (!sock_owned_by_user(sk) && inet->recverr) {
529 		sk->sk_err = err;
530 		sk->sk_error_report(sk);
531 	} else	{ /* Only an error on timeout */
532 		sk->sk_err_soft = err;
533 	}
534 
535 out:
536 	bh_unlock_sock(sk);
537 	sock_put(sk);
538 }
539 
540 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
541 {
542 	struct tcphdr *th = tcp_hdr(skb);
543 
544 	if (skb->ip_summed == CHECKSUM_PARTIAL) {
545 		th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
546 		skb->csum_start = skb_transport_header(skb) - skb->head;
547 		skb->csum_offset = offsetof(struct tcphdr, check);
548 	} else {
549 		th->check = tcp_v4_check(skb->len, saddr, daddr,
550 					 csum_partial(th,
551 						      th->doff << 2,
552 						      skb->csum));
553 	}
554 }
555 
556 /* This routine computes an IPv4 TCP checksum. */
557 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
558 {
559 	const struct inet_sock *inet = inet_sk(sk);
560 
561 	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
562 }
563 EXPORT_SYMBOL(tcp_v4_send_check);
564 
/*
 *	This routine will send an RST to the other tcp.
 *
 *	@sk:  socket the segment was matched to, or NULL if none
 *	@skb: the received segment being answered
 *
 *	The reply is built purely from the received segment (ports and
 *	addresses swapped). No RST is sent in response to an RST, nor for a
 *	socket-less segment whose input route is not local. With
 *	CONFIG_TCP_MD5SIG, a signed segment without a socket is only
 *	answered if a listener key is found and the signature verifies.
 *
 *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 *		      for reset.
 *	Answer: if a packet caused RST, it is not for a socket
 *		existing in our system, if it is matched to a socket,
 *		it is just duplicate segment or bug in other side's TCP.
 *		So that we build reply only basing on parameters
 *		arrived with segment.
 *	Exception: precedence violation. We do not implement it in any case.
 */

static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
{
	const struct tcphdr *th = tcp_hdr(skb);
	/* Reply buffer: TCP header plus, optionally, room for an MD5 option. */
	struct {
		struct tcphdr th;
#ifdef CONFIG_TCP_MD5SIG
		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
#endif
	} rep;
	struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key;
	const __u8 *hash_location = NULL;
	unsigned char newhash[16];
	int genhash;
	struct sock *sk1 = NULL;
#endif
	struct net *net;

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	/* If sk not NULL, it means we did a successful lookup and incoming
	 * route had to be correct. prequeue might have dropped our dst.
	 */
	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rep, 0, sizeof(rep));
	rep.th.dest   = th->source;
	rep.th.source = th->dest;
	rep.th.doff   = sizeof(struct tcphdr) / 4;
	rep.th.rst    = 1;

	/* If the segment carried an ACK, the RST takes its sequence number
	 * from that ACK. Otherwise send RST+ACK acknowledging everything
	 * the segment occupied in sequence space (payload plus SYN/FIN).
	 */
	if (th->ack) {
		rep.th.seq = th->ack_seq;
	} else {
		rep.th.ack = 1;
		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				       skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof(arg));
	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);

	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
#ifdef CONFIG_TCP_MD5SIG
	hash_location = tcp_parse_md5sig_option(th);
	if (!sk && hash_location) {
		/*
		 * active side is lost. Try to find listening socket through
		 * source port, and then find md5 key through listening socket.
		 * we are not loose security here:
		 * Incoming packet is checked with md5 hash with finding key,
		 * no RST generated if md5 hash doesn't match.
		 */
		/* NOTE(review): th->source is passed both as the remote port
		 * and (host order) as the local port number - confirm this
		 * is the intended listener lookup key.
		 */
		sk1 = __inet_lookup_listener(net,
					     &tcp_hashinfo, ip_hdr(skb)->saddr,
					     th->source, ip_hdr(skb)->daddr,
					     ntohs(th->source), inet_iif(skb));
		/* don't send rst if it can't find key */
		if (!sk1)
			return;
		/* Paired with the rcu_read_unlock() under release_sk1, which
		 * only runs when sk1 is set.
		 */
		rcu_read_lock();
		key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
					&ip_hdr(skb)->saddr, AF_INET);
		if (!key)
			goto release_sk1;

		/* Verify the incoming signature before answering. */
		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, NULL, skb);
		if (genhash || memcmp(hash_location, newhash, 16) != 0)
			goto release_sk1;
	} else {
		key = sk ? tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
					     &ip_hdr(skb)->saddr,
					     AF_INET) : NULL;
	}

	if (key) {
		/* NOP, NOP, MD5SIG kind, length - followed by the digest. */
		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
				   (TCPOPT_NOP << 16) |
				   (TCPOPT_MD5SIG << 8) |
				   TCPOLEN_MD5SIG);
		/* Update length and the length the header thinks exists */
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
				     key, ip_hdr(skb)->saddr,
				     ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	/* Pseudo-header checksum seed; addresses are swapped relative to
	 * the received packet because this is the reply direction.
	 */
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
	/* When socket is gone, all binding information is lost.
	 * routing might fail in this case. No choice here, if we choose to force
	 * input interface, we will misroute in case of asymmetric route.
	 */
	if (sk)
		arg.bound_dev_if = sk->sk_bound_dev_if;

	arg.tos = ip_hdr(skb)->tos;
	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len);

	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
	TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);

#ifdef CONFIG_TCP_MD5SIG
release_sk1:
	/* Only the listener-lookup path took the RCU read lock and a
	 * reference on sk1.
	 */
	if (sk1) {
		rcu_read_unlock();
		sock_put(sk1);
	}
#endif
}
702 
703 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
704    outside socket context is ugly, certainly. What can I do?
705  */
706 
707 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
708 			    u32 win, u32 tsval, u32 tsecr, int oif,
709 			    struct tcp_md5sig_key *key,
710 			    int reply_flags, u8 tos)
711 {
712 	const struct tcphdr *th = tcp_hdr(skb);
713 	struct {
714 		struct tcphdr th;
715 		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
716 #ifdef CONFIG_TCP_MD5SIG
717 			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
718 #endif
719 			];
720 	} rep;
721 	struct ip_reply_arg arg;
722 	struct net *net = dev_net(skb_dst(skb)->dev);
723 
724 	memset(&rep.th, 0, sizeof(struct tcphdr));
725 	memset(&arg, 0, sizeof(arg));
726 
727 	arg.iov[0].iov_base = (unsigned char *)&rep;
728 	arg.iov[0].iov_len  = sizeof(rep.th);
729 	if (tsecr) {
730 		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
731 				   (TCPOPT_TIMESTAMP << 8) |
732 				   TCPOLEN_TIMESTAMP);
733 		rep.opt[1] = htonl(tsval);
734 		rep.opt[2] = htonl(tsecr);
735 		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
736 	}
737 
738 	/* Swap the send and the receive. */
739 	rep.th.dest    = th->source;
740 	rep.th.source  = th->dest;
741 	rep.th.doff    = arg.iov[0].iov_len / 4;
742 	rep.th.seq     = htonl(seq);
743 	rep.th.ack_seq = htonl(ack);
744 	rep.th.ack     = 1;
745 	rep.th.window  = htons(win);
746 
747 #ifdef CONFIG_TCP_MD5SIG
748 	if (key) {
749 		int offset = (tsecr) ? 3 : 0;
750 
751 		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
752 					  (TCPOPT_NOP << 16) |
753 					  (TCPOPT_MD5SIG << 8) |
754 					  TCPOLEN_MD5SIG);
755 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
756 		rep.th.doff = arg.iov[0].iov_len/4;
757 
758 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
759 				    key, ip_hdr(skb)->saddr,
760 				    ip_hdr(skb)->daddr, &rep.th);
761 	}
762 #endif
763 	arg.flags = reply_flags;
764 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
765 				      ip_hdr(skb)->saddr, /* XXX */
766 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
767 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
768 	if (oif)
769 		arg.bound_dev_if = oif;
770 	arg.tos = tos;
771 	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
772 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
773 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
774 			      &arg, arg.iov[0].iov_len);
775 
776 	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
777 }
778 
779 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
780 {
781 	struct inet_timewait_sock *tw = inet_twsk(sk);
782 	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
783 
784 	tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
785 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
786 			tcp_time_stamp + tcptw->tw_ts_offset,
787 			tcptw->tw_ts_recent,
788 			tw->tw_bound_dev_if,
789 			tcp_twsk_md5_key(tcptw),
790 			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
791 			tw->tw_tos
792 			);
793 
794 	inet_twsk_put(tw);
795 }
796 
797 static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
798 				  struct request_sock *req)
799 {
800 	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
801 	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
802 	 */
803 	tcp_v4_send_ack(skb, (sk->sk_state == TCP_LISTEN) ?
804 			tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
805 			tcp_rsk(req)->rcv_nxt, req->rcv_wnd,
806 			tcp_time_stamp,
807 			req->ts_recent,
808 			0,
809 			tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
810 					  AF_INET),
811 			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
812 			ip_hdr(skb)->tos);
813 }
814 
815 /*
816  *	Send a SYN-ACK after having received a SYN.
817  *	This still operates on a request_sock only, not on a big
818  *	socket.
819  */
820 static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
821 			      struct flowi *fl,
822 			      struct request_sock *req,
823 			      u16 queue_mapping,
824 			      struct tcp_fastopen_cookie *foc)
825 {
826 	const struct inet_request_sock *ireq = inet_rsk(req);
827 	struct flowi4 fl4;
828 	int err = -1;
829 	struct sk_buff *skb;
830 
831 	/* First, grab a route. */
832 	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
833 		return -1;
834 
835 	skb = tcp_make_synack(sk, dst, req, foc);
836 
837 	if (skb) {
838 		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
839 
840 		skb_set_queue_mapping(skb, queue_mapping);
841 		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
842 					    ireq->ir_rmt_addr,
843 					    ireq->opt);
844 		err = net_xmit_eval(err);
845 	}
846 
847 	return err;
848 }
849 
850 /*
851  *	IPv4 request_sock destructor.
852  */
853 static void tcp_v4_reqsk_destructor(struct request_sock *req)
854 {
855 	kfree(inet_rsk(req)->opt);
856 }
857 
858 /*
859  * Return true if a syncookie should be sent
860  */
861 bool tcp_syn_flood_action(struct sock *sk,
862 			 const struct sk_buff *skb,
863 			 const char *proto)
864 {
865 	const char *msg = "Dropping request";
866 	bool want_cookie = false;
867 	struct listen_sock *lopt;
868 
869 #ifdef CONFIG_SYN_COOKIES
870 	if (sysctl_tcp_syncookies) {
871 		msg = "Sending cookies";
872 		want_cookie = true;
873 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES);
874 	} else
875 #endif
876 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);
877 
878 	lopt = inet_csk(sk)->icsk_accept_queue.listen_opt;
879 	if (!lopt->synflood_warned && sysctl_tcp_syncookies != 2) {
880 		lopt->synflood_warned = 1;
881 		pr_info("%s: Possible SYN flooding on port %d. %s.  Check SNMP counters.\n",
882 			proto, ntohs(tcp_hdr(skb)->dest), msg);
883 	}
884 	return want_cookie;
885 }
886 EXPORT_SYMBOL(tcp_syn_flood_action);
887 
888 #ifdef CONFIG_TCP_MD5SIG
889 /*
890  * RFC2385 MD5 checksumming requires a mapping of
891  * IP address->MD5 Key.
892  * We need to maintain these in the sk structure.
893  */
894 
895 /* Find the Key structure for an address.  */
896 struct tcp_md5sig_key *tcp_md5_do_lookup(struct sock *sk,
897 					 const union tcp_md5_addr *addr,
898 					 int family)
899 {
900 	struct tcp_sock *tp = tcp_sk(sk);
901 	struct tcp_md5sig_key *key;
902 	unsigned int size = sizeof(struct in_addr);
903 	struct tcp_md5sig_info *md5sig;
904 
905 	/* caller either holds rcu_read_lock() or socket lock */
906 	md5sig = rcu_dereference_check(tp->md5sig_info,
907 				       sock_owned_by_user(sk) ||
908 				       lockdep_is_held(&sk->sk_lock.slock));
909 	if (!md5sig)
910 		return NULL;
911 #if IS_ENABLED(CONFIG_IPV6)
912 	if (family == AF_INET6)
913 		size = sizeof(struct in6_addr);
914 #endif
915 	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
916 		if (key->family != family)
917 			continue;
918 		if (!memcmp(&key->addr, addr, size))
919 			return key;
920 	}
921 	return NULL;
922 }
923 EXPORT_SYMBOL(tcp_md5_do_lookup);
924 
925 struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
926 					 struct sock *addr_sk)
927 {
928 	union tcp_md5_addr *addr;
929 
930 	addr = (union tcp_md5_addr *)&inet_sk(addr_sk)->inet_daddr;
931 	return tcp_md5_do_lookup(sk, addr, AF_INET);
932 }
933 EXPORT_SYMBOL(tcp_v4_md5_lookup);
934 
935 static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
936 						      struct request_sock *req)
937 {
938 	union tcp_md5_addr *addr;
939 
940 	addr = (union tcp_md5_addr *)&inet_rsk(req)->ir_rmt_addr;
941 	return tcp_md5_do_lookup(sk, addr, AF_INET);
942 }
943 
944 /* This can be called on a newly created socket, from other files */
945 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
946 		   int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
947 {
948 	/* Add Key to the list */
949 	struct tcp_md5sig_key *key;
950 	struct tcp_sock *tp = tcp_sk(sk);
951 	struct tcp_md5sig_info *md5sig;
952 
953 	key = tcp_md5_do_lookup(sk, addr, family);
954 	if (key) {
955 		/* Pre-existing entry - just update that one. */
956 		memcpy(key->key, newkey, newkeylen);
957 		key->keylen = newkeylen;
958 		return 0;
959 	}
960 
961 	md5sig = rcu_dereference_protected(tp->md5sig_info,
962 					   sock_owned_by_user(sk));
963 	if (!md5sig) {
964 		md5sig = kmalloc(sizeof(*md5sig), gfp);
965 		if (!md5sig)
966 			return -ENOMEM;
967 
968 		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
969 		INIT_HLIST_HEAD(&md5sig->head);
970 		rcu_assign_pointer(tp->md5sig_info, md5sig);
971 	}
972 
973 	key = sock_kmalloc(sk, sizeof(*key), gfp);
974 	if (!key)
975 		return -ENOMEM;
976 	if (!tcp_alloc_md5sig_pool()) {
977 		sock_kfree_s(sk, key, sizeof(*key));
978 		return -ENOMEM;
979 	}
980 
981 	memcpy(key->key, newkey, newkeylen);
982 	key->keylen = newkeylen;
983 	key->family = family;
984 	memcpy(&key->addr, addr,
985 	       (family == AF_INET6) ? sizeof(struct in6_addr) :
986 				      sizeof(struct in_addr));
987 	hlist_add_head_rcu(&key->node, &md5sig->head);
988 	return 0;
989 }
990 EXPORT_SYMBOL(tcp_md5_do_add);
991 
992 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
993 {
994 	struct tcp_md5sig_key *key;
995 
996 	key = tcp_md5_do_lookup(sk, addr, family);
997 	if (!key)
998 		return -ENOENT;
999 	hlist_del_rcu(&key->node);
1000 	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1001 	kfree_rcu(key, rcu);
1002 	return 0;
1003 }
1004 EXPORT_SYMBOL(tcp_md5_do_del);
1005 
1006 static void tcp_clear_md5_list(struct sock *sk)
1007 {
1008 	struct tcp_sock *tp = tcp_sk(sk);
1009 	struct tcp_md5sig_key *key;
1010 	struct hlist_node *n;
1011 	struct tcp_md5sig_info *md5sig;
1012 
1013 	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1014 
1015 	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1016 		hlist_del_rcu(&key->node);
1017 		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1018 		kfree_rcu(key, rcu);
1019 	}
1020 }
1021 
1022 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
1023 				 int optlen)
1024 {
1025 	struct tcp_md5sig cmd;
1026 	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1027 
1028 	if (optlen < sizeof(cmd))
1029 		return -EINVAL;
1030 
1031 	if (copy_from_user(&cmd, optval, sizeof(cmd)))
1032 		return -EFAULT;
1033 
1034 	if (sin->sin_family != AF_INET)
1035 		return -EINVAL;
1036 
1037 	if (!cmd.tcpm_keylen)
1038 		return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1039 				      AF_INET);
1040 
1041 	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1042 		return -EINVAL;
1043 
1044 	return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1045 			      AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
1046 			      GFP_KERNEL);
1047 }
1048 
1049 static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
1050 					__be32 daddr, __be32 saddr, int nbytes)
1051 {
1052 	struct tcp4_pseudohdr *bp;
1053 	struct scatterlist sg;
1054 
1055 	bp = &hp->md5_blk.ip4;
1056 
1057 	/*
1058 	 * 1. the TCP pseudo-header (in the order: source IP address,
1059 	 * destination IP address, zero-padded protocol number, and
1060 	 * segment length)
1061 	 */
1062 	bp->saddr = saddr;
1063 	bp->daddr = daddr;
1064 	bp->pad = 0;
1065 	bp->protocol = IPPROTO_TCP;
1066 	bp->len = cpu_to_be16(nbytes);
1067 
1068 	sg_init_one(&sg, bp, sizeof(*bp));
1069 	return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
1070 }
1071 
1072 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1073 			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
1074 {
1075 	struct tcp_md5sig_pool *hp;
1076 	struct hash_desc *desc;
1077 
1078 	hp = tcp_get_md5sig_pool();
1079 	if (!hp)
1080 		goto clear_hash_noput;
1081 	desc = &hp->md5_desc;
1082 
1083 	if (crypto_hash_init(desc))
1084 		goto clear_hash;
1085 	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
1086 		goto clear_hash;
1087 	if (tcp_md5_hash_header(hp, th))
1088 		goto clear_hash;
1089 	if (tcp_md5_hash_key(hp, key))
1090 		goto clear_hash;
1091 	if (crypto_hash_final(desc, md5_hash))
1092 		goto clear_hash;
1093 
1094 	tcp_put_md5sig_pool();
1095 	return 0;
1096 
1097 clear_hash:
1098 	tcp_put_md5sig_pool();
1099 clear_hash_noput:
1100 	memset(md5_hash, 0, 16);
1101 	return 1;
1102 }
1103 
1104 int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
1105 			const struct sock *sk, const struct request_sock *req,
1106 			const struct sk_buff *skb)
1107 {
1108 	struct tcp_md5sig_pool *hp;
1109 	struct hash_desc *desc;
1110 	const struct tcphdr *th = tcp_hdr(skb);
1111 	__be32 saddr, daddr;
1112 
1113 	if (sk) {
1114 		saddr = inet_sk(sk)->inet_saddr;
1115 		daddr = inet_sk(sk)->inet_daddr;
1116 	} else if (req) {
1117 		saddr = inet_rsk(req)->ir_loc_addr;
1118 		daddr = inet_rsk(req)->ir_rmt_addr;
1119 	} else {
1120 		const struct iphdr *iph = ip_hdr(skb);
1121 		saddr = iph->saddr;
1122 		daddr = iph->daddr;
1123 	}
1124 
1125 	hp = tcp_get_md5sig_pool();
1126 	if (!hp)
1127 		goto clear_hash_noput;
1128 	desc = &hp->md5_desc;
1129 
1130 	if (crypto_hash_init(desc))
1131 		goto clear_hash;
1132 
1133 	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
1134 		goto clear_hash;
1135 	if (tcp_md5_hash_header(hp, th))
1136 		goto clear_hash;
1137 	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1138 		goto clear_hash;
1139 	if (tcp_md5_hash_key(hp, key))
1140 		goto clear_hash;
1141 	if (crypto_hash_final(desc, md5_hash))
1142 		goto clear_hash;
1143 
1144 	tcp_put_md5sig_pool();
1145 	return 0;
1146 
1147 clear_hash:
1148 	tcp_put_md5sig_pool();
1149 clear_hash_noput:
1150 	memset(md5_hash, 0, 16);
1151 	return 1;
1152 }
1153 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1154 
1155 static bool __tcp_v4_inbound_md5_hash(struct sock *sk,
1156 				      const struct sk_buff *skb)
1157 {
1158 	/*
1159 	 * This gets called for each TCP segment that arrives
1160 	 * so we want to be efficient.
1161 	 * We have 3 drop cases:
1162 	 * o No MD5 hash and one expected.
1163 	 * o MD5 hash and we're not expecting one.
1164 	 * o MD5 hash and its wrong.
1165 	 */
1166 	const __u8 *hash_location = NULL;
1167 	struct tcp_md5sig_key *hash_expected;
1168 	const struct iphdr *iph = ip_hdr(skb);
1169 	const struct tcphdr *th = tcp_hdr(skb);
1170 	int genhash;
1171 	unsigned char newhash[16];
1172 
1173 	hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1174 					  AF_INET);
1175 	hash_location = tcp_parse_md5sig_option(th);
1176 
1177 	/* We've parsed the options - do we have a hash? */
1178 	if (!hash_expected && !hash_location)
1179 		return false;
1180 
1181 	if (hash_expected && !hash_location) {
1182 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1183 		return true;
1184 	}
1185 
1186 	if (!hash_expected && hash_location) {
1187 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1188 		return true;
1189 	}
1190 
1191 	/* Okay, so this is hash_expected and hash_location -
1192 	 * so we need to calculate the checksum.
1193 	 */
1194 	genhash = tcp_v4_md5_hash_skb(newhash,
1195 				      hash_expected,
1196 				      NULL, NULL, skb);
1197 
1198 	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1199 		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1200 				     &iph->saddr, ntohs(th->source),
1201 				     &iph->daddr, ntohs(th->dest),
1202 				     genhash ? " tcp_v4_calc_md5_hash failed"
1203 				     : "");
1204 		return true;
1205 	}
1206 	return false;
1207 }
1208 
1209 static bool tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb)
1210 {
1211 	bool ret;
1212 
1213 	rcu_read_lock();
1214 	ret = __tcp_v4_inbound_md5_hash(sk, skb);
1215 	rcu_read_unlock();
1216 
1217 	return ret;
1218 }
1219 
1220 #endif
1221 
1222 static void tcp_v4_init_req(struct request_sock *req, struct sock *sk,
1223 			    struct sk_buff *skb)
1224 {
1225 	struct inet_request_sock *ireq = inet_rsk(req);
1226 
1227 	ireq->ir_loc_addr = ip_hdr(skb)->daddr;
1228 	ireq->ir_rmt_addr = ip_hdr(skb)->saddr;
1229 	ireq->no_srccheck = inet_sk(sk)->transparent;
1230 	ireq->opt = tcp_v4_save_options(skb);
1231 }
1232 
1233 static struct dst_entry *tcp_v4_route_req(struct sock *sk, struct flowi *fl,
1234 					  const struct request_sock *req,
1235 					  bool *strict)
1236 {
1237 	struct dst_entry *dst = inet_csk_route_req(sk, &fl->u.ip4, req);
1238 
1239 	if (strict) {
1240 		if (fl->u.ip4.daddr == inet_rsk(req)->ir_rmt_addr)
1241 			*strict = true;
1242 		else
1243 			*strict = false;
1244 	}
1245 
1246 	return dst;
1247 }
1248 
/* Request-sock operations for IPv4 TCP; handed to tcp_conn_request()
 * by tcp_v4_conn_request().
 */
struct request_sock_ops tcp_request_sock_ops __read_mostly = {
	.family		=	PF_INET,
	.obj_size	=	sizeof(struct tcp_request_sock),
	.rtx_syn_ack	=	tcp_rtx_synack,
	.send_ack	=	tcp_v4_reqsk_send_ack,
	.destructor	=	tcp_v4_reqsk_destructor,
	.send_reset	=	tcp_v4_send_reset,
	.syn_ack_timeout =	tcp_syn_ack_timeout,
};
1258 
/* IPv4-specific callbacks for connection requests; handed to
 * tcp_conn_request() by tcp_v4_conn_request().  The MD5 and syncookie
 * hooks are only present when the matching config option is enabled.
 */
static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
	.mss_clamp	=	TCP_MSS_DEFAULT,
#ifdef CONFIG_TCP_MD5SIG
	.md5_lookup	=	tcp_v4_reqsk_md5_lookup,
	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
#endif
	.init_req	=	tcp_v4_init_req,
#ifdef CONFIG_SYN_COOKIES
	.cookie_init_seq =	cookie_v4_init_sequence,
#endif
	.route_req	=	tcp_v4_route_req,
	.init_seq	=	tcp_v4_init_sequence,
	.send_synack	=	tcp_v4_send_synack,
	.queue_hash_add =	inet_csk_reqsk_queue_hash_add,
};
1274 
1275 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1276 {
1277 	/* Never answer to SYNs send to broadcast or multicast */
1278 	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1279 		goto drop;
1280 
1281 	return tcp_conn_request(&tcp_request_sock_ops,
1282 				&tcp_request_sock_ipv4_ops, sk, skb);
1283 
1284 drop:
1285 	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1286 	return 0;
1287 }
1288 EXPORT_SYMBOL(tcp_v4_conn_request);
1289 
1290 
/*
 * The three way handshake has completed - we got a valid synack -
 * now create the new socket.
 *
 * @sk is the listening socket, @req the matching connection request and
 * @dst an optional pre-computed route (non-NULL in the syncookie case).
 *
 * Returns the new child socket, or NULL on failure.  Every failure path
 * bumps LINUX_MIB_LISTENDROPS; an accept-queue overflow additionally
 * bumps LINUX_MIB_LISTENOVERFLOWS.
 */
struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct dst_entry *dst)
{
	struct inet_request_sock *ireq;
	struct inet_sock *newinet;
	struct tcp_sock *newtp;
	struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key;
#endif
	struct ip_options_rcu *inet_opt;

	if (sk_acceptq_is_full(sk))
		goto exit_overflow;

	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit_nonewsk;

	newsk->sk_gso_type = SKB_GSO_TCPV4;
	inet_sk_rx_dst_set(newsk, skb);

	newtp		      = tcp_sk(newsk);
	newinet		      = inet_sk(newsk);
	ireq		      = inet_rsk(req);
	newinet->inet_daddr   = ireq->ir_rmt_addr;
	newinet->inet_rcv_saddr = ireq->ir_loc_addr;
	newinet->inet_saddr	      = ireq->ir_loc_addr;
	/* Hand the saved IP options over from the request to the child;
	 * the request's pointer is cleared so they have a single owner.
	 */
	inet_opt	      = ireq->opt;
	rcu_assign_pointer(newinet->inet_opt, inet_opt);
	ireq->opt	      = NULL;
	newinet->mc_index     = inet_iif(skb);
	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
	newinet->rcv_tos      = ip_hdr(skb)->tos;
	inet_csk(newsk)->icsk_ext_hdr_len = 0;
	inet_set_txhash(newsk);
	if (inet_opt)
		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
	newinet->inet_id = newtp->write_seq ^ jiffies;

	if (!dst) {
		dst = inet_csk_route_child_sock(sk, newsk, req);
		if (!dst)
			goto put_and_exit;
	} else {
		/* syncookie case : see end of cookie_v4_check() */
	}
	sk_setup_caps(newsk, dst);

	tcp_ca_openreq_child(newsk, dst);

	tcp_sync_mss(newsk, dst_mtu(dst));
	newtp->advmss = dst_metric_advmss(dst);
	/* A user-configured MSS on the listener caps the advertised MSS. */
	if (tcp_sk(sk)->rx_opt.user_mss &&
	    tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
		newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;

	tcp_initialize_rcv_mss(newsk);

#ifdef CONFIG_TCP_MD5SIG
	/* Copy over the MD5 key from the original socket */
	key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
				AF_INET);
	if (key != NULL) {
		/*
		 * We're using one, so create a matching key
		 * on the newsk structure. If we fail to get
		 * memory, then we end up not copying the key
		 * across. Shucks.
		 */
		tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
			       AF_INET, key->key, key->keylen, GFP_ATOMIC);
		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
	}
#endif

	if (__inet_inherit_port(sk, newsk) < 0)
		goto put_and_exit;
	__inet_hash_nolisten(newsk, NULL);

	return newsk;

exit_overflow:
	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
exit_nonewsk:
	/* No child was created, so the caller-supplied route is released here. */
	dst_release(dst);
exit:
	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
	return NULL;
put_and_exit:
	/* A child exists but setup failed: tear it down, then count the drop. */
	inet_csk_prepare_forced_close(newsk);
	tcp_done(newsk);
	goto exit;
}
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1391 
1392 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1393 {
1394 	struct tcphdr *th = tcp_hdr(skb);
1395 	const struct iphdr *iph = ip_hdr(skb);
1396 	struct sock *nsk;
1397 	struct request_sock **prev;
1398 	/* Find possible connection requests. */
1399 	struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1400 						       iph->saddr, iph->daddr);
1401 	if (req)
1402 		return tcp_check_req(sk, skb, req, prev, false);
1403 
1404 	nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
1405 			th->source, iph->daddr, th->dest, inet_iif(skb));
1406 
1407 	if (nsk) {
1408 		if (nsk->sk_state != TCP_TIME_WAIT) {
1409 			bh_lock_sock(nsk);
1410 			return nsk;
1411 		}
1412 		inet_twsk_put(inet_twsk(nsk));
1413 		return NULL;
1414 	}
1415 
1416 #ifdef CONFIG_SYN_COOKIES
1417 	if (!th->syn)
1418 		sk = cookie_v4_check(sk, skb);
1419 #endif
1420 	return sk;
1421 }
1422 
1423 /* The socket must have it's spinlock held when we get
1424  * here.
1425  *
1426  * We have a potential double-lock case here, so even when
1427  * doing backlog processing we use the BH locking scheme.
1428  * This is because we cannot sleep with the original spinlock
1429  * held.
1430  */
1431 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1432 {
1433 	struct sock *rsk;
1434 
1435 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1436 		struct dst_entry *dst = sk->sk_rx_dst;
1437 
1438 		sock_rps_save_rxhash(sk, skb);
1439 		sk_mark_napi_id(sk, skb);
1440 		if (dst) {
1441 			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1442 			    dst->ops->check(dst, 0) == NULL) {
1443 				dst_release(dst);
1444 				sk->sk_rx_dst = NULL;
1445 			}
1446 		}
1447 		tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len);
1448 		return 0;
1449 	}
1450 
1451 	if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1452 		goto csum_err;
1453 
1454 	if (sk->sk_state == TCP_LISTEN) {
1455 		struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1456 		if (!nsk)
1457 			goto discard;
1458 
1459 		if (nsk != sk) {
1460 			sock_rps_save_rxhash(nsk, skb);
1461 			sk_mark_napi_id(sk, skb);
1462 			if (tcp_child_process(sk, nsk, skb)) {
1463 				rsk = nsk;
1464 				goto reset;
1465 			}
1466 			return 0;
1467 		}
1468 	} else
1469 		sock_rps_save_rxhash(sk, skb);
1470 
1471 	if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1472 		rsk = sk;
1473 		goto reset;
1474 	}
1475 	return 0;
1476 
1477 reset:
1478 	tcp_v4_send_reset(rsk, skb);
1479 discard:
1480 	kfree_skb(skb);
1481 	/* Be careful here. If this function gets more complicated and
1482 	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1483 	 * might be destroyed here. This current version compiles correctly,
1484 	 * but you have been warned.
1485 	 */
1486 	return 0;
1487 
1488 csum_err:
1489 	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS);
1490 	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1491 	goto discard;
1492 }
1493 EXPORT_SYMBOL(tcp_v4_do_rcv);
1494 
1495 void tcp_v4_early_demux(struct sk_buff *skb)
1496 {
1497 	const struct iphdr *iph;
1498 	const struct tcphdr *th;
1499 	struct sock *sk;
1500 
1501 	if (skb->pkt_type != PACKET_HOST)
1502 		return;
1503 
1504 	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1505 		return;
1506 
1507 	iph = ip_hdr(skb);
1508 	th = tcp_hdr(skb);
1509 
1510 	if (th->doff < sizeof(struct tcphdr) / 4)
1511 		return;
1512 
1513 	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1514 				       iph->saddr, th->source,
1515 				       iph->daddr, ntohs(th->dest),
1516 				       skb->skb_iif);
1517 	if (sk) {
1518 		skb->sk = sk;
1519 		skb->destructor = sock_edemux;
1520 		if (sk->sk_state != TCP_TIME_WAIT) {
1521 			struct dst_entry *dst = sk->sk_rx_dst;
1522 
1523 			if (dst)
1524 				dst = dst_check(dst, 0);
1525 			if (dst &&
1526 			    inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1527 				skb_dst_set_noref(skb, dst);
1528 		}
1529 	}
1530 }
1531 
1532 /* Packet is added to VJ-style prequeue for processing in process
1533  * context, if a reader task is waiting. Apparently, this exciting
1534  * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93)
1535  * failed somewhere. Latency? Burstiness? Well, at least now we will
1536  * see, why it failed. 8)8)				  --ANK
1537  *
1538  */
1539 bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
1540 {
1541 	struct tcp_sock *tp = tcp_sk(sk);
1542 
1543 	if (sysctl_tcp_low_latency || !tp->ucopy.task)
1544 		return false;
1545 
1546 	if (skb->len <= tcp_hdrlen(skb) &&
1547 	    skb_queue_len(&tp->ucopy.prequeue) == 0)
1548 		return false;
1549 
1550 	/* Before escaping RCU protected region, we need to take care of skb
1551 	 * dst. Prequeue is only enabled for established sockets.
1552 	 * For such sockets, we might need the skb dst only to set sk->sk_rx_dst
1553 	 * Instead of doing full sk_rx_dst validity here, let's perform
1554 	 * an optimistic check.
1555 	 */
1556 	if (likely(sk->sk_rx_dst))
1557 		skb_dst_drop(skb);
1558 	else
1559 		skb_dst_force(skb);
1560 
1561 	__skb_queue_tail(&tp->ucopy.prequeue, skb);
1562 	tp->ucopy.memory += skb->truesize;
1563 	if (tp->ucopy.memory > sk->sk_rcvbuf) {
1564 		struct sk_buff *skb1;
1565 
1566 		BUG_ON(sock_owned_by_user(sk));
1567 
1568 		while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) {
1569 			sk_backlog_rcv(sk, skb1);
1570 			NET_INC_STATS_BH(sock_net(sk),
1571 					 LINUX_MIB_TCPPREQUEUEDROPPED);
1572 		}
1573 
1574 		tp->ucopy.memory = 0;
1575 	} else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
1576 		wake_up_interruptible_sync_poll(sk_sleep(sk),
1577 					   POLLIN | POLLRDNORM | POLLRDBAND);
1578 		if (!inet_csk_ack_scheduled(sk))
1579 			inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
1580 						  (3 * tcp_rto_min(sk)) / 4,
1581 						  TCP_RTO_MAX);
1582 	}
1583 	return true;
1584 }
1585 EXPORT_SYMBOL(tcp_prequeue);
1586 
1587 /*
1588  *	From tcp_input.c
1589  */
1590 
/* Receive one IPv4 TCP segment.
 *
 * Validates the TCP header and checksum, fills in TCP_SKB_CB(), looks up
 * the owning socket and then either processes the segment directly
 * (tcp_v4_do_rcv), prequeues it, or adds it to the socket backlog when
 * the socket is owned by user context.  Segments for TIME_WAIT sockets
 * are handled under do_time_wait; segments with no matching socket get a
 * reset unless they are malformed.
 *
 * Returns the result of tcp_v4_do_rcv() when it was called, 0 otherwise.
 */
int tcp_v4_rcv(struct sk_buff *skb)
{
	const struct iphdr *iph;
	const struct tcphdr *th;
	struct sock *sk;
	int ret;
	struct net *net = dev_net(skb->dev);

	if (skb->pkt_type != PACKET_HOST)
		goto discard_it;

	/* Count it even if it's bad */
	TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);

	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
		goto discard_it;

	th = tcp_hdr(skb);

	if (th->doff < sizeof(struct tcphdr) / 4)
		goto bad_packet;
	if (!pskb_may_pull(skb, th->doff * 4))
		goto discard_it;

	/* An explanation is required here, I think.
	 * Packet length and doff are validated by header prediction,
	 * provided case of th->doff==0 is eliminated.
	 * So, we defer the checks. */

	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
		goto csum_error;

	/* Header pointers are fetched again after the pskb_may_pull() calls above. */
	th = tcp_hdr(skb);
	iph = ip_hdr(skb);
	/* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
	 * barrier() makes sure compiler wont play fool^Waliasing games.
	 */
	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
		sizeof(struct inet_skb_parm));
	barrier();

	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
	/* SYN and FIN each occupy one unit of sequence space. */
	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
				    skb->len - th->doff * 4);
	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
	TCP_SKB_CB(skb)->sacked	 = 0;

	sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
	if (!sk)
		goto no_tcp_socket;

process:
	if (sk->sk_state == TCP_TIME_WAIT)
		goto do_time_wait;

	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
		goto discard_and_relse;
	}

	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
		goto discard_and_relse;

#ifdef CONFIG_TCP_MD5SIG
	/*
	 * We really want to reject the packet as early as possible
	 * if:
	 *  o We're expecting an MD5'd packet and this is no MD5 tcp option
	 *  o There is an MD5 option and we're not expecting one
	 */
	if (tcp_v4_inbound_md5_hash(sk, skb))
		goto discard_and_relse;
#endif

	nf_reset(skb);

	if (sk_filter(sk, skb))
		goto discard_and_relse;

	sk_incoming_cpu_update(sk);
	skb->dev = NULL;

	bh_lock_sock_nested(sk);
	ret = 0;
	if (!sock_owned_by_user(sk)) {
		/* Socket is free: prequeue if possible, else process now. */
		if (!tcp_prequeue(sk, skb))
			ret = tcp_v4_do_rcv(sk, skb);
	} else if (unlikely(sk_add_backlog(sk, skb,
					   sk->sk_rcvbuf + sk->sk_sndbuf))) {
		/* Socket is owned by user context and the backlog is full. */
		bh_unlock_sock(sk);
		NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
		goto discard_and_relse;
	}
	bh_unlock_sock(sk);

	sock_put(sk);

	return ret;

no_tcp_socket:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
		goto discard_it;

	/* csum_error and bad_packet are jumped to from elsewhere in this
	 * function; csum_error falls through into bad_packet so both
	 * counters are bumped for a checksum failure.
	 */
	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
csum_error:
		TCP_INC_STATS_BH(net, TCP_MIB_CSUMERRORS);
bad_packet:
		TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
	} else {
		tcp_v4_send_reset(NULL, skb);
	}

discard_it:
	/* Discard frame. */
	kfree_skb(skb);
	return 0;

discard_and_relse:
	sock_put(sk);
	goto discard_it;

do_time_wait:
	/* From here on sk is a timewait socket; each exit path below
	 * that abandons it calls inet_twsk_put() first.
	 */
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto discard_it;
	}

	if (skb->len < (th->doff << 2)) {
		inet_twsk_put(inet_twsk(sk));
		goto bad_packet;
	}
	if (tcp_checksum_complete(skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto csum_error;
	}
	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
	case TCP_TW_SYN: {
		/* If a listener matches, retire the timewait socket and
		 * restart processing with the listener.
		 */
		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
							&tcp_hashinfo,
							iph->saddr, th->source,
							iph->daddr, th->dest,
							inet_iif(skb));
		if (sk2) {
			inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
			inet_twsk_put(inet_twsk(sk));
			sk = sk2;
			goto process;
		}
		/* Fall through to ACK */
	}
	case TCP_TW_ACK:
		tcp_v4_timewait_ack(sk, skb);
		break;
	case TCP_TW_RST:
		goto no_tcp_socket;
	case TCP_TW_SUCCESS:;
	}
	goto discard_it;
}
1753 
/* Timewait-socket operations for TCP: object size plus the uniqueness
 * check and destructor callbacks.
 */
static struct timewait_sock_ops tcp_timewait_sock_ops = {
	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
	.twsk_unique	= tcp_twsk_unique,
	.twsk_destructor= tcp_twsk_destructor,
};
1759 
1760 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
1761 {
1762 	struct dst_entry *dst = skb_dst(skb);
1763 
1764 	if (dst) {
1765 		dst_hold(dst);
1766 		sk->sk_rx_dst = dst;
1767 		inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
1768 	}
1769 }
1770 EXPORT_SYMBOL(inet_sk_rx_dst_set);
1771 
/* IPv4 address-family operations for connection sockets; installed as
 * icsk->icsk_af_ops by tcp_v4_init_sock().
 */
const struct inet_connection_sock_af_ops ipv4_specific = {
	.queue_xmit	   = ip_queue_xmit,
	.send_check	   = tcp_v4_send_check,
	.rebuild_header	   = inet_sk_rebuild_header,
	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
	.conn_request	   = tcp_v4_conn_request,
	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
	.net_header_len	   = sizeof(struct iphdr),
	.setsockopt	   = ip_setsockopt,
	.getsockopt	   = ip_getsockopt,
	.addr2sockaddr	   = inet_csk_addr2sockaddr,
	.sockaddr_len	   = sizeof(struct sockaddr_in),
	.bind_conflict	   = inet_csk_bind_conflict,
#ifdef CONFIG_COMPAT
	.compat_setsockopt = compat_ip_setsockopt,
	.compat_getsockopt = compat_ip_getsockopt,
#endif
	.mtu_reduced	   = tcp_v4_mtu_reduced,
};
EXPORT_SYMBOL(ipv4_specific);
1792 
1793 #ifdef CONFIG_TCP_MD5SIG
/* IPv4 MD5-signature hooks; installed as tcp_sk(sk)->af_specific by
 * tcp_v4_init_sock().
 */
static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
	.md5_lookup		= tcp_v4_md5_lookup,
	.calc_md5_hash		= tcp_v4_md5_hash_skb,
	.md5_parse		= tcp_v4_parse_md5_keys,
};
1799 #endif
1800 
1801 /* NOTE: A lot of things set to zero explicitly by call to
1802  *       sk_alloc() so need not be done here.
1803  */
1804 static int tcp_v4_init_sock(struct sock *sk)
1805 {
1806 	struct inet_connection_sock *icsk = inet_csk(sk);
1807 
1808 	tcp_init_sock(sk);
1809 
1810 	icsk->icsk_af_ops = &ipv4_specific;
1811 
1812 #ifdef CONFIG_TCP_MD5SIG
1813 	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
1814 #endif
1815 
1816 	return 0;
1817 }
1818 
1819 void tcp_v4_destroy_sock(struct sock *sk)
1820 {
1821 	struct tcp_sock *tp = tcp_sk(sk);
1822 
1823 	tcp_clear_xmit_timers(sk);
1824 
1825 	tcp_cleanup_congestion_control(sk);
1826 
1827 	/* Cleanup up the write buffer. */
1828 	tcp_write_queue_purge(sk);
1829 
1830 	/* Cleans up our, hopefully empty, out_of_order_queue. */
1831 	__skb_queue_purge(&tp->out_of_order_queue);
1832 
1833 #ifdef CONFIG_TCP_MD5SIG
1834 	/* Clean up the MD5 key list, if any */
1835 	if (tp->md5sig_info) {
1836 		tcp_clear_md5_list(sk);
1837 		kfree_rcu(tp->md5sig_info, rcu);
1838 		tp->md5sig_info = NULL;
1839 	}
1840 #endif
1841 
1842 	/* Clean prequeue, it must be empty really */
1843 	__skb_queue_purge(&tp->ucopy.prequeue);
1844 
1845 	/* Clean up a referenced TCP bind bucket. */
1846 	if (inet_csk(sk)->icsk_bind_hash)
1847 		inet_put_port(sk);
1848 
1849 	BUG_ON(tp->fastopen_rsk != NULL);
1850 
1851 	/* If socket is aborted during connect operation */
1852 	tcp_free_fastopen_req(tp);
1853 
1854 	sk_sockets_allocated_dec(sk);
1855 	sock_release_memcg(sk);
1856 }
1857 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1858 
1859 #ifdef CONFIG_PROC_FS
1860 /* Proc filesystem TCP sock list dumping. */
1861 
/*
 * Get next listener socket follow cur.  If cur is NULL, get first socket
 * starting from bucket given in st->bucket; when st->bucket is zero the
 * very first socket in the hash table is returned.
 *
 * The iterator walks listening sockets and, for each listener with a
 * non-empty request queue, the pending connection requests in its SYN
 * table (st->state == TCP_SEQ_STATE_OPENREQ while doing so).
 *
 * Locking: a non-NULL return leaves the current bucket's ilb->lock held;
 * when the returned entry is a request sock, the owning listener's
 * syn_wait_lock is additionally read-held.  tcp_seq_stop() releases both.
 * A NULL return means all buckets were exhausted and no lock is held.
 */
static void *listening_get_next(struct seq_file *seq, void *cur)
{
	struct inet_connection_sock *icsk;
	struct hlist_nulls_node *node;
	struct sock *sk = cur;
	struct inet_listen_hashbucket *ilb;
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);

	if (!sk) {
		/* Fresh start: lock the bucket and scan from its head. */
		ilb = &tcp_hashinfo.listening_hash[st->bucket];
		spin_lock_bh(&ilb->lock);
		sk = sk_nulls_head(&ilb->head);
		st->offset = 0;
		goto get_sk;
	}
	ilb = &tcp_hashinfo.listening_hash[st->bucket];
	++st->num;
	++st->offset;

	if (st->state == TCP_SEQ_STATE_OPENREQ) {
		/* cur is a request sock; continue in the listener's SYN table. */
		struct request_sock *req = cur;

		icsk = inet_csk(st->syn_wait_sk);
		req = req->dl_next;
		while (1) {
			while (req) {
				if (req->rsk_ops->family == st->family) {
					cur = req;
					goto out;
				}
				req = req->dl_next;
			}
			if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
				break;
get_req:
			/* Also entered from start_req below with sbucket == 0. */
			req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
		}
		/* SYN table exhausted: go back to walking listeners. */
		sk	  = sk_nulls_next(st->syn_wait_sk);
		st->state = TCP_SEQ_STATE_LISTENING;
		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
	} else {
		/* cur is a listener already reported; visit its requests
		 * (if any) before moving on to the next listener.
		 */
		icsk = inet_csk(sk);
		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		if (reqsk_queue_len(&icsk->icsk_accept_queue))
			goto start_req;
		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		sk = sk_nulls_next(sk);
	}
get_sk:
	sk_nulls_for_each_from(sk, node) {
		if (!net_eq(sock_net(sk), net))
			continue;
		if (sk->sk_family == st->family) {
			cur = sk;
			goto out;
		}
		/* Listener of another family: it is not reported itself,
		 * but its request queue is still scanned.
		 */
		icsk = inet_csk(sk);
		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
start_req:
			st->uid		= sock_i_uid(sk);
			st->syn_wait_sk = sk;
			st->state	= TCP_SEQ_STATE_OPENREQ;
			st->sbucket	= 0;
			goto get_req;
		}
		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
	}
	/* Bucket exhausted: unlock it and advance to the next one, if any. */
	spin_unlock_bh(&ilb->lock);
	st->offset = 0;
	if (++st->bucket < INET_LHTABLE_SIZE) {
		ilb = &tcp_hashinfo.listening_hash[st->bucket];
		spin_lock_bh(&ilb->lock);
		sk = sk_nulls_head(&ilb->head);
		goto get_sk;
	}
	cur = NULL;
out:
	return cur;
}
1948 
1949 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1950 {
1951 	struct tcp_iter_state *st = seq->private;
1952 	void *rc;
1953 
1954 	st->bucket = 0;
1955 	st->offset = 0;
1956 	rc = listening_get_next(seq, NULL);
1957 
1958 	while (rc && *pos) {
1959 		rc = listening_get_next(seq, rc);
1960 		--*pos;
1961 	}
1962 	return rc;
1963 }
1964 
1965 static inline bool empty_bucket(const struct tcp_iter_state *st)
1966 {
1967 	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
1968 }
1969 
1970 /*
1971  * Get first established socket starting from bucket given in st->bucket.
1972  * If st->bucket is zero, the very first socket in the hash is returned.
1973  */
1974 static void *established_get_first(struct seq_file *seq)
1975 {
1976 	struct tcp_iter_state *st = seq->private;
1977 	struct net *net = seq_file_net(seq);
1978 	void *rc = NULL;
1979 
1980 	st->offset = 0;
1981 	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
1982 		struct sock *sk;
1983 		struct hlist_nulls_node *node;
1984 		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
1985 
1986 		/* Lockless fast path for the common case of empty buckets */
1987 		if (empty_bucket(st))
1988 			continue;
1989 
1990 		spin_lock_bh(lock);
1991 		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
1992 			if (sk->sk_family != st->family ||
1993 			    !net_eq(sock_net(sk), net)) {
1994 				continue;
1995 			}
1996 			rc = sk;
1997 			goto out;
1998 		}
1999 		spin_unlock_bh(lock);
2000 	}
2001 out:
2002 	return rc;
2003 }
2004 
2005 static void *established_get_next(struct seq_file *seq, void *cur)
2006 {
2007 	struct sock *sk = cur;
2008 	struct hlist_nulls_node *node;
2009 	struct tcp_iter_state *st = seq->private;
2010 	struct net *net = seq_file_net(seq);
2011 
2012 	++st->num;
2013 	++st->offset;
2014 
2015 	sk = sk_nulls_next(sk);
2016 
2017 	sk_nulls_for_each_from(sk, node) {
2018 		if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2019 			return sk;
2020 	}
2021 
2022 	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2023 	++st->bucket;
2024 	return established_get_first(seq);
2025 }
2026 
2027 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2028 {
2029 	struct tcp_iter_state *st = seq->private;
2030 	void *rc;
2031 
2032 	st->bucket = 0;
2033 	rc = established_get_first(seq);
2034 
2035 	while (rc && pos) {
2036 		rc = established_get_next(seq, rc);
2037 		--pos;
2038 	}
2039 	return rc;
2040 }
2041 
2042 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2043 {
2044 	void *rc;
2045 	struct tcp_iter_state *st = seq->private;
2046 
2047 	st->state = TCP_SEQ_STATE_LISTENING;
2048 	rc	  = listening_get_idx(seq, &pos);
2049 
2050 	if (!rc) {
2051 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2052 		rc	  = established_get_idx(seq, pos);
2053 	}
2054 
2055 	return rc;
2056 }
2057 
/* Try to resume iteration at the position recorded in the iterator state
 * (st->state, st->bucket, st->offset) instead of rescanning from the
 * start.  st->num is restored afterwards because the *_get_next()
 * helpers used for skipping increment it.
 *
 * Returns the entry found, or NULL if the saved position no longer
 * yields one.
 */
static void *tcp_seek_last_pos(struct seq_file *seq)
{
	struct tcp_iter_state *st = seq->private;
	int offset = st->offset;
	int orig_num = st->num;
	void *rc = NULL;

	switch (st->state) {
	case TCP_SEQ_STATE_OPENREQ:
	case TCP_SEQ_STATE_LISTENING:
		if (st->bucket >= INET_LHTABLE_SIZE)
			break;
		st->state = TCP_SEQ_STATE_LISTENING;
		/* Re-enter the saved bucket and skip 'offset' entries. */
		rc = listening_get_next(seq, NULL);
		while (offset-- && rc)
			rc = listening_get_next(seq, rc);
		if (rc)
			break;
		/* Listening table exhausted: continue with the established
		 * table from its first bucket.
		 */
		st->bucket = 0;
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		/* Fallthrough */
	case TCP_SEQ_STATE_ESTABLISHED:
		if (st->bucket > tcp_hashinfo.ehash_mask)
			break;
		rc = established_get_first(seq);
		while (offset-- && rc)
			rc = established_get_next(seq, rc);
	}

	st->num = orig_num;

	return rc;
}
2091 
2092 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2093 {
2094 	struct tcp_iter_state *st = seq->private;
2095 	void *rc;
2096 
2097 	if (*pos && *pos == st->last_pos) {
2098 		rc = tcp_seek_last_pos(seq);
2099 		if (rc)
2100 			goto out;
2101 	}
2102 
2103 	st->state = TCP_SEQ_STATE_LISTENING;
2104 	st->num = 0;
2105 	st->bucket = 0;
2106 	st->offset = 0;
2107 	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2108 
2109 out:
2110 	st->last_pos = *pos;
2111 	return rc;
2112 }
2113 
2114 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2115 {
2116 	struct tcp_iter_state *st = seq->private;
2117 	void *rc = NULL;
2118 
2119 	if (v == SEQ_START_TOKEN) {
2120 		rc = tcp_get_idx(seq, 0);
2121 		goto out;
2122 	}
2123 
2124 	switch (st->state) {
2125 	case TCP_SEQ_STATE_OPENREQ:
2126 	case TCP_SEQ_STATE_LISTENING:
2127 		rc = listening_get_next(seq, v);
2128 		if (!rc) {
2129 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2130 			st->bucket = 0;
2131 			st->offset = 0;
2132 			rc	  = established_get_first(seq);
2133 		}
2134 		break;
2135 	case TCP_SEQ_STATE_ESTABLISHED:
2136 		rc = established_get_next(seq, v);
2137 		break;
2138 	}
2139 out:
2140 	++*pos;
2141 	st->last_pos = *pos;
2142 	return rc;
2143 }
2144 
2145 static void tcp_seq_stop(struct seq_file *seq, void *v)
2146 {
2147 	struct tcp_iter_state *st = seq->private;
2148 
2149 	switch (st->state) {
2150 	case TCP_SEQ_STATE_OPENREQ:
2151 		if (v) {
2152 			struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2153 			read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2154 		}
2155 	case TCP_SEQ_STATE_LISTENING:
2156 		if (v != SEQ_START_TOKEN)
2157 			spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2158 		break;
2159 	case TCP_SEQ_STATE_ESTABLISHED:
2160 		if (v)
2161 			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2162 		break;
2163 	}
2164 }
2165 
2166 int tcp_seq_open(struct inode *inode, struct file *file)
2167 {
2168 	struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
2169 	struct tcp_iter_state *s;
2170 	int err;
2171 
2172 	err = seq_open_net(inode, file, &afinfo->seq_ops,
2173 			  sizeof(struct tcp_iter_state));
2174 	if (err < 0)
2175 		return err;
2176 
2177 	s = ((struct seq_file *)file->private_data)->private;
2178 	s->family		= afinfo->family;
2179 	s->last_pos		= 0;
2180 	return 0;
2181 }
2182 EXPORT_SYMBOL(tcp_seq_open);
2183 
2184 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2185 {
2186 	int rc = 0;
2187 	struct proc_dir_entry *p;
2188 
2189 	afinfo->seq_ops.start		= tcp_seq_start;
2190 	afinfo->seq_ops.next		= tcp_seq_next;
2191 	afinfo->seq_ops.stop		= tcp_seq_stop;
2192 
2193 	p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2194 			     afinfo->seq_fops, afinfo);
2195 	if (!p)
2196 		rc = -ENOMEM;
2197 	return rc;
2198 }
2199 EXPORT_SYMBOL(tcp_proc_register);
2200 
2201 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2202 {
2203 	remove_proc_entry(afinfo->name, net->proc_net);
2204 }
2205 EXPORT_SYMBOL(tcp_proc_unregister);
2206 
2207 static void get_openreq4(const struct sock *sk, const struct request_sock *req,
2208 			 struct seq_file *f, int i, kuid_t uid)
2209 {
2210 	const struct inet_request_sock *ireq = inet_rsk(req);
2211 	long delta = req->expires - jiffies;
2212 
2213 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2214 		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2215 		i,
2216 		ireq->ir_loc_addr,
2217 		ntohs(inet_sk(sk)->inet_sport),
2218 		ireq->ir_rmt_addr,
2219 		ntohs(ireq->ir_rmt_port),
2220 		TCP_SYN_RECV,
2221 		0, 0, /* could print option size, but that is af dependent. */
2222 		1,    /* timers active (only the expire timer) */
2223 		jiffies_delta_to_clock_t(delta),
2224 		req->num_timeout,
2225 		from_kuid_munged(seq_user_ns(f), uid),
2226 		0,  /* non standard timer */
2227 		0, /* open_requests have no inode */
2228 		atomic_read(&sk->sk_refcnt),
2229 		req);
2230 }
2231 
2232 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2233 {
2234 	int timer_active;
2235 	unsigned long timer_expires;
2236 	const struct tcp_sock *tp = tcp_sk(sk);
2237 	const struct inet_connection_sock *icsk = inet_csk(sk);
2238 	const struct inet_sock *inet = inet_sk(sk);
2239 	struct fastopen_queue *fastopenq = icsk->icsk_accept_queue.fastopenq;
2240 	__be32 dest = inet->inet_daddr;
2241 	__be32 src = inet->inet_rcv_saddr;
2242 	__u16 destp = ntohs(inet->inet_dport);
2243 	__u16 srcp = ntohs(inet->inet_sport);
2244 	int rx_queue;
2245 
2246 	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2247 	    icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
2248 	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2249 		timer_active	= 1;
2250 		timer_expires	= icsk->icsk_timeout;
2251 	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2252 		timer_active	= 4;
2253 		timer_expires	= icsk->icsk_timeout;
2254 	} else if (timer_pending(&sk->sk_timer)) {
2255 		timer_active	= 2;
2256 		timer_expires	= sk->sk_timer.expires;
2257 	} else {
2258 		timer_active	= 0;
2259 		timer_expires = jiffies;
2260 	}
2261 
2262 	if (sk->sk_state == TCP_LISTEN)
2263 		rx_queue = sk->sk_ack_backlog;
2264 	else
2265 		/*
2266 		 * because we dont lock socket, we might find a transient negative value
2267 		 */
2268 		rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2269 
2270 	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2271 			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2272 		i, src, srcp, dest, destp, sk->sk_state,
2273 		tp->write_seq - tp->snd_una,
2274 		rx_queue,
2275 		timer_active,
2276 		jiffies_delta_to_clock_t(timer_expires - jiffies),
2277 		icsk->icsk_retransmits,
2278 		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2279 		icsk->icsk_probes_out,
2280 		sock_i_ino(sk),
2281 		atomic_read(&sk->sk_refcnt), sk,
2282 		jiffies_to_clock_t(icsk->icsk_rto),
2283 		jiffies_to_clock_t(icsk->icsk_ack.ato),
2284 		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2285 		tp->snd_cwnd,
2286 		sk->sk_state == TCP_LISTEN ?
2287 		    (fastopenq ? fastopenq->max_qlen : 0) :
2288 		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2289 }
2290 
2291 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2292 			       struct seq_file *f, int i)
2293 {
2294 	__be32 dest, src;
2295 	__u16 destp, srcp;
2296 	s32 delta = tw->tw_ttd - inet_tw_time_stamp();
2297 
2298 	dest  = tw->tw_daddr;
2299 	src   = tw->tw_rcv_saddr;
2300 	destp = ntohs(tw->tw_dport);
2301 	srcp  = ntohs(tw->tw_sport);
2302 
2303 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2304 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2305 		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2306 		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2307 		atomic_read(&tw->tw_refcnt), tw);
2308 }
2309 
2310 #define TMPSZ 150
2311 
2312 static int tcp4_seq_show(struct seq_file *seq, void *v)
2313 {
2314 	struct tcp_iter_state *st;
2315 	struct sock *sk = v;
2316 
2317 	seq_setwidth(seq, TMPSZ - 1);
2318 	if (v == SEQ_START_TOKEN) {
2319 		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2320 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2321 			   "inode");
2322 		goto out;
2323 	}
2324 	st = seq->private;
2325 
2326 	switch (st->state) {
2327 	case TCP_SEQ_STATE_LISTENING:
2328 	case TCP_SEQ_STATE_ESTABLISHED:
2329 		if (sk->sk_state == TCP_TIME_WAIT)
2330 			get_timewait4_sock(v, seq, st->num);
2331 		else
2332 			get_tcp4_sock(v, seq, st->num);
2333 		break;
2334 	case TCP_SEQ_STATE_OPENREQ:
2335 		get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid);
2336 		break;
2337 	}
2338 out:
2339 	seq_pad(seq, '\n');
2340 	return 0;
2341 }
2342 
2343 static const struct file_operations tcp_afinfo_seq_fops = {
2344 	.owner   = THIS_MODULE,
2345 	.open    = tcp_seq_open,
2346 	.read    = seq_read,
2347 	.llseek  = seq_lseek,
2348 	.release = seq_release_net
2349 };
2350 
2351 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2352 	.name		= "tcp",
2353 	.family		= AF_INET,
2354 	.seq_fops	= &tcp_afinfo_seq_fops,
2355 	.seq_ops	= {
2356 		.show		= tcp4_seq_show,
2357 	},
2358 };
2359 
2360 static int __net_init tcp4_proc_init_net(struct net *net)
2361 {
2362 	return tcp_proc_register(net, &tcp4_seq_afinfo);
2363 }
2364 
2365 static void __net_exit tcp4_proc_exit_net(struct net *net)
2366 {
2367 	tcp_proc_unregister(net, &tcp4_seq_afinfo);
2368 }
2369 
2370 static struct pernet_operations tcp4_net_ops = {
2371 	.init = tcp4_proc_init_net,
2372 	.exit = tcp4_proc_exit_net,
2373 };
2374 
2375 int __init tcp4_proc_init(void)
2376 {
2377 	return register_pernet_subsys(&tcp4_net_ops);
2378 }
2379 
2380 void tcp4_proc_exit(void)
2381 {
2382 	unregister_pernet_subsys(&tcp4_net_ops);
2383 }
2384 #endif /* CONFIG_PROC_FS */
2385 
2386 struct proto tcp_prot = {
2387 	.name			= "TCP",
2388 	.owner			= THIS_MODULE,
2389 	.close			= tcp_close,
2390 	.connect		= tcp_v4_connect,
2391 	.disconnect		= tcp_disconnect,
2392 	.accept			= inet_csk_accept,
2393 	.ioctl			= tcp_ioctl,
2394 	.init			= tcp_v4_init_sock,
2395 	.destroy		= tcp_v4_destroy_sock,
2396 	.shutdown		= tcp_shutdown,
2397 	.setsockopt		= tcp_setsockopt,
2398 	.getsockopt		= tcp_getsockopt,
2399 	.recvmsg		= tcp_recvmsg,
2400 	.sendmsg		= tcp_sendmsg,
2401 	.sendpage		= tcp_sendpage,
2402 	.backlog_rcv		= tcp_v4_do_rcv,
2403 	.release_cb		= tcp_release_cb,
2404 	.hash			= inet_hash,
2405 	.unhash			= inet_unhash,
2406 	.get_port		= inet_csk_get_port,
2407 	.enter_memory_pressure	= tcp_enter_memory_pressure,
2408 	.stream_memory_free	= tcp_stream_memory_free,
2409 	.sockets_allocated	= &tcp_sockets_allocated,
2410 	.orphan_count		= &tcp_orphan_count,
2411 	.memory_allocated	= &tcp_memory_allocated,
2412 	.memory_pressure	= &tcp_memory_pressure,
2413 	.sysctl_mem		= sysctl_tcp_mem,
2414 	.sysctl_wmem		= sysctl_tcp_wmem,
2415 	.sysctl_rmem		= sysctl_tcp_rmem,
2416 	.max_header		= MAX_TCP_HEADER,
2417 	.obj_size		= sizeof(struct tcp_sock),
2418 	.slab_flags		= SLAB_DESTROY_BY_RCU,
2419 	.twsk_prot		= &tcp_timewait_sock_ops,
2420 	.rsk_prot		= &tcp_request_sock_ops,
2421 	.h.hashinfo		= &tcp_hashinfo,
2422 	.no_autobind		= true,
2423 #ifdef CONFIG_COMPAT
2424 	.compat_setsockopt	= compat_tcp_setsockopt,
2425 	.compat_getsockopt	= compat_tcp_getsockopt,
2426 #endif
2427 #ifdef CONFIG_MEMCG_KMEM
2428 	.init_cgroup		= tcp_init_cgroup,
2429 	.destroy_cgroup		= tcp_destroy_cgroup,
2430 	.proto_cgroup		= tcp_proto_cgroup,
2431 #endif
2432 };
2433 EXPORT_SYMBOL(tcp_prot);
2434 
2435 static void __net_exit tcp_sk_exit(struct net *net)
2436 {
2437 	int cpu;
2438 
2439 	for_each_possible_cpu(cpu)
2440 		inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2441 	free_percpu(net->ipv4.tcp_sk);
2442 }
2443 
2444 static int __net_init tcp_sk_init(struct net *net)
2445 {
2446 	int res, cpu;
2447 
2448 	net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2449 	if (!net->ipv4.tcp_sk)
2450 		return -ENOMEM;
2451 
2452 	for_each_possible_cpu(cpu) {
2453 		struct sock *sk;
2454 
2455 		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2456 					   IPPROTO_TCP, net);
2457 		if (res)
2458 			goto fail;
2459 		*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2460 	}
2461 	net->ipv4.sysctl_tcp_ecn = 2;
2462 	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2463 	return 0;
2464 
2465 fail:
2466 	tcp_sk_exit(net);
2467 
2468 	return res;
2469 }
2470 
2471 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2472 {
2473 	inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2474 }
2475 
2476 static struct pernet_operations __net_initdata tcp_sk_ops = {
2477        .init	   = tcp_sk_init,
2478        .exit	   = tcp_sk_exit,
2479        .exit_batch = tcp_sk_exit_batch,
2480 };
2481 
2482 void __init tcp_v4_init(void)
2483 {
2484 	inet_hashinfo_init(&tcp_hashinfo);
2485 	if (register_pernet_subsys(&tcp_sk_ops))
2486 		panic("Failed to create the TCP control socket.\n");
2487 }
2488