xref: /openbmc/linux/net/ipv4/tcp_ipv4.c (revision a977d045)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		Implementation of the Transmission Control Protocol(TCP).
7  *
8  *		IPv4 specific functions
9  *
10  *
11  *		code split from:
12  *		linux/ipv4/tcp.c
13  *		linux/ipv4/tcp_input.c
14  *		linux/ipv4/tcp_output.c
15  *
16  *		See tcp.c for author information
17  *
18  *	This program is free software; you can redistribute it and/or
19  *      modify it under the terms of the GNU General Public License
20  *      as published by the Free Software Foundation; either version
21  *      2 of the License, or (at your option) any later version.
22  */
23 
24 /*
25  * Changes:
26  *		David S. Miller	:	New socket lookup architecture.
27  *					This code is dedicated to John Dyson.
28  *		David S. Miller :	Change semantics of established hash,
29  *					half is devoted to TIME_WAIT sockets
30  *					and the rest go in the other half.
31  *		Andi Kleen :		Add support for syncookies and fixed
32  *					some bugs: ip options weren't passed to
33  *					the TCP layer, missed a check for an
34  *					ACK bit.
35  *		Andi Kleen :		Implemented fast path mtu discovery.
36  *	     				Fixed many serious bugs in the
37  *					request_sock handling and moved
38  *					most of it into the af independent code.
39  *					Added tail drop and some other bugfixes.
40  *					Added new listen semantics.
41  *		Mike McLagan	:	Routing by source
42  *	Juan Jose Ciarlante:		ip_dynaddr bits
43  *		Andi Kleen:		various fixes.
44  *	Vitaly E. Lavrov	:	Transparent proxy revived after a
45  *					year-long coma.
46  *	Andi Kleen		:	Fix new listen.
47  *	Andi Kleen		:	Fix accept error reporting.
48  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
49  *	Alexey Kuznetsov		allows both IPv4 and IPv6 sockets to bind
50  *					a single port at the same time.
51  */
52 
53 #define pr_fmt(fmt) "TCP: " fmt
54 
55 #include <linux/bottom_half.h>
56 #include <linux/types.h>
57 #include <linux/fcntl.h>
58 #include <linux/module.h>
59 #include <linux/random.h>
60 #include <linux/cache.h>
61 #include <linux/jhash.h>
62 #include <linux/init.h>
63 #include <linux/times.h>
64 #include <linux/slab.h>
65 
66 #include <net/net_namespace.h>
67 #include <net/icmp.h>
68 #include <net/inet_hashtables.h>
69 #include <net/tcp.h>
70 #include <net/transp_v6.h>
71 #include <net/ipv6.h>
72 #include <net/inet_common.h>
73 #include <net/timewait_sock.h>
74 #include <net/xfrm.h>
75 #include <net/secure_seq.h>
76 #include <net/busy_poll.h>
77 
78 #include <linux/inet.h>
79 #include <linux/ipv6.h>
80 #include <linux/stddef.h>
81 #include <linux/proc_fs.h>
82 #include <linux/seq_file.h>
83 #include <linux/inetdevice.h>
84 
85 #include <crypto/hash.h>
86 #include <linux/scatterlist.h>
87 
88 int sysctl_tcp_low_latency __read_mostly;
89 
90 #ifdef CONFIG_TCP_MD5SIG
91 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
92 			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
93 #endif
94 
95 struct inet_hashinfo tcp_hashinfo;
96 EXPORT_SYMBOL(tcp_hashinfo);
97 
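/* The initial sequence number for a passively opened connection is
 * derived from the incoming packet's 4-tuple via a keyed hash, and the
 * timestamp offset from the address pair, so both are hard for off-path
 * attackers to predict.
 */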
98 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
99 {
100 	return secure_tcp_seq(ip_hdr(skb)->daddr,
101 			      ip_hdr(skb)->saddr,
102 			      tcp_hdr(skb)->dest,
103 			      tcp_hdr(skb)->source);
104 }
105 
106 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
107 {
108 	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
109 }
110 
111 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
112 {
113 	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
114 	struct tcp_sock *tp = tcp_sk(sk);
115 
116 	/* With PAWS, it is safe from the viewpoint
117 	   of data integrity. Even without PAWS it is safe provided sequence
118 	   spaces do not overlap, i.e. at data rates <= 80 Mbit/sec.
119 
120 	   Actually, the idea is close to VJ's: only the timestamp cache is
121 	   held per port pair rather than per host, and the TW bucket is used
122 	   as the state holder.
123 
124 	   If the TW bucket has already been destroyed we fall back to VJ's
125 	   scheme and use the initial timestamp retrieved from the peer table.
126 	 */
127 	if (tcptw->tw_ts_recent_stamp &&
128 	    (!twp || (sock_net(sk)->ipv4.sysctl_tcp_tw_reuse &&
129 			     get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
130 		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
131 		if (tp->write_seq == 0)
132 			tp->write_seq = 1;
133 		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
134 		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
135 		sock_hold(sktw);
136 		return 1;
137 	}
138 
139 	return 0;
140 }
141 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
142 
143 /* This will initiate an outgoing connection. */
144 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
145 {
146 	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
147 	struct inet_sock *inet = inet_sk(sk);
148 	struct tcp_sock *tp = tcp_sk(sk);
149 	__be16 orig_sport, orig_dport;
150 	__be32 daddr, nexthop;
151 	struct flowi4 *fl4;
152 	struct rtable *rt;
153 	int err;
154 	struct ip_options_rcu *inet_opt;
155 	struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
156 
157 	if (addr_len < sizeof(struct sockaddr_in))
158 		return -EINVAL;
159 
160 	if (usin->sin_family != AF_INET)
161 		return -EAFNOSUPPORT;
162 
163 	nexthop = daddr = usin->sin_addr.s_addr;
164 	inet_opt = rcu_dereference_protected(inet->inet_opt,
165 					     lockdep_sock_is_held(sk));
166 	if (inet_opt && inet_opt->opt.srr) {
167 		if (!daddr)
168 			return -EINVAL;
169 		nexthop = inet_opt->opt.faddr;
170 	}
171 
172 	orig_sport = inet->inet_sport;
173 	orig_dport = usin->sin_port;
174 	fl4 = &inet->cork.fl.u.ip4;
175 	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
176 			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
177 			      IPPROTO_TCP,
178 			      orig_sport, orig_dport, sk);
179 	if (IS_ERR(rt)) {
180 		err = PTR_ERR(rt);
181 		if (err == -ENETUNREACH)
182 			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
183 		return err;
184 	}
185 
186 	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
187 		ip_rt_put(rt);
188 		return -ENETUNREACH;
189 	}
190 
191 	if (!inet_opt || !inet_opt->opt.srr)
192 		daddr = fl4->daddr;
193 
194 	if (!inet->inet_saddr)
195 		inet->inet_saddr = fl4->saddr;
196 	sk_rcv_saddr_set(sk, inet->inet_saddr);
197 
198 	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
199 		/* Reset inherited state */
200 		tp->rx_opt.ts_recent	   = 0;
201 		tp->rx_opt.ts_recent_stamp = 0;
202 		if (likely(!tp->repair))
203 			tp->write_seq	   = 0;
204 	}
205 
206 	inet->inet_dport = usin->sin_port;
207 	sk_daddr_set(sk, daddr);
208 
209 	inet_csk(sk)->icsk_ext_hdr_len = 0;
210 	if (inet_opt)
211 		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
212 
213 	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
214 
215 	/* Socket identity is still unknown (sport may be zero).
216 	 * However, we set the state to SYN-SENT and, without releasing the
217 	 * socket lock, select a source port, enter ourselves into the hash
218 	 * tables and complete initialization after this.
219 	 */
220 	tcp_set_state(sk, TCP_SYN_SENT);
221 	err = inet_hash_connect(tcp_death_row, sk);
222 	if (err)
223 		goto failure;
224 
225 	sk_set_txhash(sk);
226 
227 	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
228 			       inet->inet_sport, inet->inet_dport, sk);
229 	if (IS_ERR(rt)) {
230 		err = PTR_ERR(rt);
231 		rt = NULL;
232 		goto failure;
233 	}
234 	/* OK, now commit destination to socket.  */
235 	sk->sk_gso_type = SKB_GSO_TCPV4;
236 	sk_setup_caps(sk, &rt->dst);
237 	rt = NULL;
238 
239 	if (likely(!tp->repair)) {
240 		if (!tp->write_seq)
241 			tp->write_seq = secure_tcp_seq(inet->inet_saddr,
242 						       inet->inet_daddr,
243 						       inet->inet_sport,
244 						       usin->sin_port);
245 		tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
246 						 inet->inet_saddr,
247 						 inet->inet_daddr);
248 	}
249 
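	/* Seed the per-socket IP identification counter; mixing the initial
	 * sequence number with jiffies makes it unlikely that successive
	 * connections start at the same ID.
	 */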
250 	inet->inet_id = tp->write_seq ^ jiffies;
251 
252 	if (tcp_fastopen_defer_connect(sk, &err))
253 		return err;
254 	if (err)
255 		goto failure;
256 
257 	err = tcp_connect(sk);
258 
259 	if (err)
260 		goto failure;
261 
262 	return 0;
263 
264 failure:
265 	/*
266 	 * This unhashes the socket and releases the local port,
267 	 * if necessary.
268 	 */
269 	tcp_set_state(sk, TCP_CLOSE);
270 	ip_rt_put(rt);
271 	sk->sk_route_caps = 0;
272 	inet->inet_dport = 0;
273 	return err;
274 }
275 EXPORT_SYMBOL(tcp_v4_connect);
276 
277 /*
278  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
279  * It can be called through tcp_release_cb() if socket was owned by user
280  * at the time tcp_v4_err() was called to handle ICMP message.
281  */
282 void tcp_v4_mtu_reduced(struct sock *sk)
283 {
284 	struct inet_sock *inet = inet_sk(sk);
285 	struct dst_entry *dst;
286 	u32 mtu;
287 
288 	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
289 		return;
290 	mtu = tcp_sk(sk)->mtu_info;
291 	dst = inet_csk_update_pmtu(sk, mtu);
292 	if (!dst)
293 		return;
294 
295 	/* Something is about to go wrong... Remember the soft error
296 	 * in case this connection is not able to recover.
297 	 */
298 	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
299 		sk->sk_err_soft = EMSGSIZE;
300 
301 	mtu = dst_mtu(dst);
302 
303 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
304 	    ip_sk_accept_pmtu(sk) &&
305 	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
306 		tcp_sync_mss(sk, mtu);
307 
308 		/* Resend the TCP packet because it's
309 		 * clear that the old packet has been
310 		 * dropped. This is the new "fast" path mtu
311 		 * discovery.
312 		 */
313 		tcp_simple_retransmit(sk);
314 	} /* else let the usual retransmit timer handle it */
315 }
316 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
317 
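/* Hand an ICMP redirect to the route attached to this socket, if any,
 * so the dst can update its next hop.
 */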
318 static void do_redirect(struct sk_buff *skb, struct sock *sk)
319 {
320 	struct dst_entry *dst = __sk_dst_check(sk, 0);
321 
322 	if (dst)
323 		dst->ops->redirect(dst, sk, skb);
324 }
325 
326 
327 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
328 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
329 {
330 	struct request_sock *req = inet_reqsk(sk);
331 	struct net *net = sock_net(sk);
332 
333 	/* ICMPs are not backlogged, hence we cannot get
334 	 * an established socket here.
335 	 */
336 	if (seq != tcp_rsk(req)->snt_isn) {
337 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
338 	} else if (abort) {
339 		/*
340 		 * Still in SYN_RECV, just remove it silently.
341 		 * There is no good way to pass the error to the newly
342 		 * created socket, and POSIX does not want network
343 		 * errors returned from accept().
344 		 */
345 		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
346 		tcp_listendrop(req->rsk_listener);
347 	}
348 	reqsk_put(req);
349 }
350 EXPORT_SYMBOL(tcp_req_err);
351 
352 /*
353  * This routine is called by the ICMP module when it gets some
354  * sort of error condition.  If err < 0 then the socket should
355  * be closed and the error returned to the user.  If err > 0
356  * it's just the icmp type << 8 | icmp code.  After adjustment
357  * header points to the first 8 bytes of the tcp header.  We need
358  * to find the appropriate port.
359  *
360  * The locking strategy used here is very "optimistic". When
361  * someone else accesses the socket the ICMP is just dropped
362  * and for some paths there is no check at all.
363  * A more general error queue to queue errors for later handling
364  * is probably better.
365  *
366  */
367 
368 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
369 {
370 	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
371 	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
372 	struct inet_connection_sock *icsk;
373 	struct tcp_sock *tp;
374 	struct inet_sock *inet;
375 	const int type = icmp_hdr(icmp_skb)->type;
376 	const int code = icmp_hdr(icmp_skb)->code;
377 	struct sock *sk;
378 	struct sk_buff *skb;
379 	struct request_sock *fastopen;
380 	u32 seq, snd_una;
381 	s32 remaining;
382 	u32 delta_us;
383 	int err;
384 	struct net *net = dev_net(icmp_skb->dev);
385 
386 	sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
387 				       th->dest, iph->saddr, ntohs(th->source),
388 				       inet_iif(icmp_skb));
389 	if (!sk) {
390 		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
391 		return;
392 	}
393 	if (sk->sk_state == TCP_TIME_WAIT) {
394 		inet_twsk_put(inet_twsk(sk));
395 		return;
396 	}
397 	seq = ntohl(th->seq);
398 	if (sk->sk_state == TCP_NEW_SYN_RECV)
399 		return tcp_req_err(sk, seq,
400 				  type == ICMP_PARAMETERPROB ||
401 				  type == ICMP_TIME_EXCEEDED ||
402 				  (type == ICMP_DEST_UNREACH &&
403 				   (code == ICMP_NET_UNREACH ||
404 				    code == ICMP_HOST_UNREACH)));
405 
406 	bh_lock_sock(sk);
407 	/* If too many ICMPs get dropped on busy
408 	 * servers this needs to be solved differently.
409 	 * We do take care of the PMTU discovery (RFC1191) special case:
410 	 * we can receive locally generated ICMP messages while socket is held.
411 	 */
412 	if (sock_owned_by_user(sk)) {
413 		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
414 			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
415 	}
416 	if (sk->sk_state == TCP_CLOSE)
417 		goto out;
418 
419 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
420 		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
421 		goto out;
422 	}
423 
424 	icsk = inet_csk(sk);
425 	tp = tcp_sk(sk);
426 	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
427 	fastopen = tp->fastopen_rsk;
428 	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
429 	if (sk->sk_state != TCP_LISTEN &&
430 	    !between(seq, snd_una, tp->snd_nxt)) {
431 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
432 		goto out;
433 	}
434 
435 	switch (type) {
436 	case ICMP_REDIRECT:
437 		if (!sock_owned_by_user(sk))
438 			do_redirect(icmp_skb, sk);
439 		goto out;
440 	case ICMP_SOURCE_QUENCH:
441 		/* Just silently ignore these. */
442 		goto out;
443 	case ICMP_PARAMETERPROB:
444 		err = EPROTO;
445 		break;
446 	case ICMP_DEST_UNREACH:
447 		if (code > NR_ICMP_UNREACH)
448 			goto out;
449 
450 		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
451 			/* We are not interested in TCP_LISTEN and open_requests
452 			 * (SYN-ACKs send out by Linux are always <576bytes so
453 			 * they should go through unfragmented).
454 			 */
455 			if (sk->sk_state == TCP_LISTEN)
456 				goto out;
457 
458 			tp->mtu_info = info;
459 			if (!sock_owned_by_user(sk)) {
460 				tcp_v4_mtu_reduced(sk);
461 			} else {
462 				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
463 					sock_hold(sk);
464 			}
465 			goto out;
466 		}
467 
468 		err = icmp_err_convert[code].errno;
469 		/* check if icmp_skb allows revert of backoff
470 		 * (see draft-zimmermann-tcp-lcd) */
471 		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
472 			break;
473 		if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
474 		    !icsk->icsk_backoff || fastopen)
475 			break;
476 
477 		if (sock_owned_by_user(sk))
478 			break;
479 
480 		icsk->icsk_backoff--;
481 		icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
482 					       TCP_TIMEOUT_INIT;
483 		icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
484 
485 		skb = tcp_write_queue_head(sk);
486 		BUG_ON(!skb);
487 
488 		tcp_mstamp_refresh(tp);
489 		delta_us = (u32)(tp->tcp_mstamp - skb->skb_mstamp);
490 		remaining = icsk->icsk_rto -
491 			    usecs_to_jiffies(delta_us);
492 
493 		if (remaining > 0) {
494 			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
495 						  remaining, TCP_RTO_MAX);
496 		} else {
497 			/* With the reverted RTO the retransmission has already
498 			 * clocked out; retransmit now. */
499 			tcp_retransmit_timer(sk);
500 		}
501 
502 		break;
503 	case ICMP_TIME_EXCEEDED:
504 		err = EHOSTUNREACH;
505 		break;
506 	default:
507 		goto out;
508 	}
509 
510 	switch (sk->sk_state) {
511 	case TCP_SYN_SENT:
512 	case TCP_SYN_RECV:
513 		/* Only in fast or simultaneous open. If a fast open socket is
514 		 * already accepted, it is treated as a connected one below.
515 		 */
516 		if (fastopen && !fastopen->sk)
517 			break;
518 
519 		if (!sock_owned_by_user(sk)) {
520 			sk->sk_err = err;
521 
522 			sk->sk_error_report(sk);
523 
524 			tcp_done(sk);
525 		} else {
526 			sk->sk_err_soft = err;
527 		}
528 		goto out;
529 	}
530 
531 	/* If we've already connected we will keep trying
532 	 * until we time out, or the user gives up.
533 	 *
534 	 * RFC 1122 4.2.3.9 allows us to treat as hard errors
535 	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
536 	 * but it is obsoleted by PMTU discovery).
537 	 *
538 	 * Note that in the modern internet, where routing is unreliable
539 	 * and broken firewalls sit in every dark corner sending random
540 	 * errors ordered by their masters, even these two messages have lost
541 	 * their original sense (even Linux sends invalid PORT_UNREACHs).
542 	 *
543 	 * Now we are in compliance with RFCs.
544 	 *							--ANK (980905)
545 	 */
546 
547 	inet = inet_sk(sk);
548 	if (!sock_owned_by_user(sk) && inet->recverr) {
549 		sk->sk_err = err;
550 		sk->sk_error_report(sk);
551 	} else	{ /* Only an error on timeout */
552 		sk->sk_err_soft = err;
553 	}
554 
555 out:
556 	bh_unlock_sock(sk);
557 	sock_put(sk);
558 }
559 
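/* Fill in the TCP checksum for an outgoing segment.  With
 * CHECKSUM_PARTIAL only the pseudo-header sum is stored and the device
 * (or GSO) completes it; otherwise we compute the full checksum in
 * software.
 */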
560 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
561 {
562 	struct tcphdr *th = tcp_hdr(skb);
563 
564 	if (skb->ip_summed == CHECKSUM_PARTIAL) {
565 		th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
566 		skb->csum_start = skb_transport_header(skb) - skb->head;
567 		skb->csum_offset = offsetof(struct tcphdr, check);
568 	} else {
569 		th->check = tcp_v4_check(skb->len, saddr, daddr,
570 					 csum_partial(th,
571 						      th->doff << 2,
572 						      skb->csum));
573 	}
574 }
575 
576 /* This routine computes an IPv4 TCP checksum. */
577 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
578 {
579 	const struct inet_sock *inet = inet_sk(sk);
580 
581 	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
582 }
583 EXPORT_SYMBOL(tcp_v4_send_check);
584 
585 /*
586  *	This routine will send an RST to the other tcp.
587  *
588  *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
589  *		      for reset.
590  *	Answer: if a packet caused an RST, it is not for a socket
591  *		existing in our system; if it does match a socket,
592  *		it is just a duplicate segment or a bug in the other side's TCP.
593  *		So we build the reply based only on the parameters that
594  *		arrived with the segment.
595  *	Exception: precedence violation. We do not implement it in any case.
596  */
597 
598 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
599 {
600 	const struct tcphdr *th = tcp_hdr(skb);
601 	struct {
602 		struct tcphdr th;
603 #ifdef CONFIG_TCP_MD5SIG
604 		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
605 #endif
606 	} rep;
607 	struct ip_reply_arg arg;
608 #ifdef CONFIG_TCP_MD5SIG
609 	struct tcp_md5sig_key *key = NULL;
610 	const __u8 *hash_location = NULL;
611 	unsigned char newhash[16];
612 	int genhash;
613 	struct sock *sk1 = NULL;
614 #endif
615 	struct net *net;
616 
617 	/* Never send a reset in response to a reset. */
618 	if (th->rst)
619 		return;
620 
621 	/* If sk is not NULL, it means we did a successful lookup and the
622 	 * incoming route had to be correct. prequeue might have dropped our dst.
623 	 */
624 	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
625 		return;
626 
627 	/* Swap the send and the receive. */
628 	memset(&rep, 0, sizeof(rep));
629 	rep.th.dest   = th->source;
630 	rep.th.source = th->dest;
631 	rep.th.doff   = sizeof(struct tcphdr) / 4;
632 	rep.th.rst    = 1;
633 
634 	if (th->ack) {
635 		rep.th.seq = th->ack_seq;
636 	} else {
637 		rep.th.ack = 1;
638 		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
639 				       skb->len - (th->doff << 2));
640 	}
641 
642 	memset(&arg, 0, sizeof(arg));
643 	arg.iov[0].iov_base = (unsigned char *)&rep;
644 	arg.iov[0].iov_len  = sizeof(rep.th);
645 
646 	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
647 #ifdef CONFIG_TCP_MD5SIG
648 	rcu_read_lock();
649 	hash_location = tcp_parse_md5sig_option(th);
650 	if (sk && sk_fullsock(sk)) {
651 		key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
652 					&ip_hdr(skb)->saddr, AF_INET);
653 	} else if (hash_location) {
654 		/*
655 		 * Active side is lost. Try to find the listening socket through
656 		 * the source port, and then find the md5 key through that socket.
657 		 * We do not lose security here:
658 		 * the incoming packet is checked against the md5 hash of the key
659 		 * we found, and no RST is generated if the hash doesn't match.
660 		 */
661 		sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
662 					     ip_hdr(skb)->saddr,
663 					     th->source, ip_hdr(skb)->daddr,
664 					     ntohs(th->source), inet_iif(skb));
665 		/* don't send an rst if we can't find the key */
666 		if (!sk1)
667 			goto out;
668 
669 		key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
670 					&ip_hdr(skb)->saddr, AF_INET);
671 		if (!key)
672 			goto out;
673 
674 
675 		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
676 		if (genhash || memcmp(hash_location, newhash, 16) != 0)
677 			goto out;
678 
679 	}
680 
681 	if (key) {
682 		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
683 				   (TCPOPT_NOP << 16) |
684 				   (TCPOPT_MD5SIG << 8) |
685 				   TCPOLEN_MD5SIG);
686 		/* Update length and the length the header thinks exists */
687 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
688 		rep.th.doff = arg.iov[0].iov_len / 4;
689 
690 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
691 				     key, ip_hdr(skb)->saddr,
692 				     ip_hdr(skb)->daddr, &rep.th);
693 	}
694 #endif
695 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
696 				      ip_hdr(skb)->saddr, /* XXX */
697 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
698 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
699 	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
700 
701 	/* When socket is gone, all binding information is lost.
702 	 * Routing might fail in this case. There is no choice here: if we force
703 	 * the input interface, we will misroute in case of an asymmetric route.
704 	 */
705 	if (sk)
706 		arg.bound_dev_if = sk->sk_bound_dev_if;
707 
708 	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
709 		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));
710 
711 	arg.tos = ip_hdr(skb)->tos;
712 	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
713 	local_bh_disable();
714 	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
715 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
716 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
717 			      &arg, arg.iov[0].iov_len);
718 
719 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
720 	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
721 	local_bh_enable();
722 
723 #ifdef CONFIG_TCP_MD5SIG
724 out:
725 	rcu_read_unlock();
726 #endif
727 }
728 
729 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
730    outside of socket context, is certainly ugly. What can I do?
731  */
732 
733 static void tcp_v4_send_ack(const struct sock *sk,
734 			    struct sk_buff *skb, u32 seq, u32 ack,
735 			    u32 win, u32 tsval, u32 tsecr, int oif,
736 			    struct tcp_md5sig_key *key,
737 			    int reply_flags, u8 tos)
738 {
739 	const struct tcphdr *th = tcp_hdr(skb);
740 	struct {
741 		struct tcphdr th;
742 		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
743 #ifdef CONFIG_TCP_MD5SIG
744 			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
745 #endif
746 			];
747 	} rep;
748 	struct net *net = sock_net(sk);
749 	struct ip_reply_arg arg;
750 
751 	memset(&rep.th, 0, sizeof(struct tcphdr));
752 	memset(&arg, 0, sizeof(arg));
753 
754 	arg.iov[0].iov_base = (unsigned char *)&rep;
755 	arg.iov[0].iov_len  = sizeof(rep.th);
756 	if (tsecr) {
757 		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
758 				   (TCPOPT_TIMESTAMP << 8) |
759 				   TCPOLEN_TIMESTAMP);
760 		rep.opt[1] = htonl(tsval);
761 		rep.opt[2] = htonl(tsecr);
762 		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
763 	}
764 
765 	/* Swap the send and the receive. */
766 	rep.th.dest    = th->source;
767 	rep.th.source  = th->dest;
768 	rep.th.doff    = arg.iov[0].iov_len / 4;
769 	rep.th.seq     = htonl(seq);
770 	rep.th.ack_seq = htonl(ack);
771 	rep.th.ack     = 1;
772 	rep.th.window  = htons(win);
773 
774 #ifdef CONFIG_TCP_MD5SIG
775 	if (key) {
776 		int offset = (tsecr) ? 3 : 0;
777 
778 		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
779 					  (TCPOPT_NOP << 16) |
780 					  (TCPOPT_MD5SIG << 8) |
781 					  TCPOLEN_MD5SIG);
782 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
783 		rep.th.doff = arg.iov[0].iov_len/4;
784 
785 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
786 				    key, ip_hdr(skb)->saddr,
787 				    ip_hdr(skb)->daddr, &rep.th);
788 	}
789 #endif
790 	arg.flags = reply_flags;
791 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
792 				      ip_hdr(skb)->saddr, /* XXX */
793 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
794 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
795 	if (oif)
796 		arg.bound_dev_if = oif;
797 	arg.tos = tos;
798 	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
799 	local_bh_disable();
800 	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
801 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
802 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
803 			      &arg, arg.iov[0].iov_len);
804 
805 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
806 	local_bh_enable();
807 }
808 
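/* ACK a segment that hit a TIME-WAIT socket, echoing the timewait
 * bucket's remembered sequence, window and timestamp state.
 */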
809 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
810 {
811 	struct inet_timewait_sock *tw = inet_twsk(sk);
812 	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
813 
814 	tcp_v4_send_ack(sk, skb,
815 			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
816 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
817 			tcp_time_stamp_raw() + tcptw->tw_ts_offset,
818 			tcptw->tw_ts_recent,
819 			tw->tw_bound_dev_if,
820 			tcp_twsk_md5_key(tcptw),
821 			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
822 			tw->tw_tos
823 			);
824 
825 	inet_twsk_put(tw);
826 }
827 
828 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
829 				  struct request_sock *req)
830 {
831 	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
832 	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
833 	 */
834 	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
835 					     tcp_sk(sk)->snd_nxt;
836 
837 	/* RFC 7323 2.3
838 	 * The window field (SEG.WND) of every outgoing segment, with the
839 	 * exception of <SYN> segments, MUST be right-shifted by
840 	 * Rcv.Wind.Shift bits:
841 	 */
842 	tcp_v4_send_ack(sk, skb, seq,
843 			tcp_rsk(req)->rcv_nxt,
844 			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
845 			tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
846 			req->ts_recent,
847 			0,
848 			tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
849 					  AF_INET),
850 			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
851 			ip_hdr(skb)->tos);
852 }
853 
854 /*
855  *	Send a SYN-ACK after having received a SYN.
856  *	This still operates on a request_sock only, not on a big
857  *	socket.
858  */
859 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
860 			      struct flowi *fl,
861 			      struct request_sock *req,
862 			      struct tcp_fastopen_cookie *foc,
863 			      enum tcp_synack_type synack_type)
864 {
865 	const struct inet_request_sock *ireq = inet_rsk(req);
866 	struct flowi4 fl4;
867 	int err = -1;
868 	struct sk_buff *skb;
869 
870 	/* First, grab a route. */
871 	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
872 		return -1;
873 
874 	skb = tcp_make_synack(sk, dst, req, foc, synack_type);
875 
876 	if (skb) {
877 		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
878 
879 		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
880 					    ireq->ir_rmt_addr,
881 					    ireq->opt);
882 		err = net_xmit_eval(err);
883 	}
884 
885 	return err;
886 }
887 
888 /*
889  *	IPv4 request_sock destructor.
890  */
891 static void tcp_v4_reqsk_destructor(struct request_sock *req)
892 {
893 	kfree(inet_rsk(req)->opt);
894 }
895 
896 #ifdef CONFIG_TCP_MD5SIG
897 /*
898  * RFC2385 MD5 checksumming requires a mapping of
899  * IP address->MD5 Key.
900  * We need to maintain these in the sk structure.
901  */
902 
903 /* Find the Key structure for an address.  */
904 struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
905 					 const union tcp_md5_addr *addr,
906 					 int family)
907 {
908 	const struct tcp_sock *tp = tcp_sk(sk);
909 	struct tcp_md5sig_key *key;
910 	const struct tcp_md5sig_info *md5sig;
911 	__be32 mask;
912 	struct tcp_md5sig_key *best_match = NULL;
913 	bool match;
914 
915 	/* caller either holds rcu_read_lock() or socket lock */
916 	md5sig = rcu_dereference_check(tp->md5sig_info,
917 				       lockdep_sock_is_held(sk));
918 	if (!md5sig)
919 		return NULL;
920 
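	/* Walk all configured keys and keep the longest-prefix match
	 * for this address family.
	 */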
921 	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
922 		if (key->family != family)
923 			continue;
924 
925 		if (family == AF_INET) {
926 			mask = inet_make_mask(key->prefixlen);
927 			match = (key->addr.a4.s_addr & mask) ==
928 				(addr->a4.s_addr & mask);
929 #if IS_ENABLED(CONFIG_IPV6)
930 		} else if (family == AF_INET6) {
931 			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
932 						  key->prefixlen);
933 #endif
934 		} else {
935 			match = false;
936 		}
937 
938 		if (match && (!best_match ||
939 			      key->prefixlen > best_match->prefixlen))
940 			best_match = key;
941 	}
942 	return best_match;
943 }
944 EXPORT_SYMBOL(tcp_md5_do_lookup);
945 
946 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
947 						      const union tcp_md5_addr *addr,
948 						      int family, u8 prefixlen)
949 {
950 	const struct tcp_sock *tp = tcp_sk(sk);
951 	struct tcp_md5sig_key *key;
952 	unsigned int size = sizeof(struct in_addr);
953 	const struct tcp_md5sig_info *md5sig;
954 
955 	/* caller either holds rcu_read_lock() or socket lock */
956 	md5sig = rcu_dereference_check(tp->md5sig_info,
957 				       lockdep_sock_is_held(sk));
958 	if (!md5sig)
959 		return NULL;
960 #if IS_ENABLED(CONFIG_IPV6)
961 	if (family == AF_INET6)
962 		size = sizeof(struct in6_addr);
963 #endif
964 	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
965 		if (key->family != family)
966 			continue;
967 		if (!memcmp(&key->addr, addr, size) &&
968 		    key->prefixlen == prefixlen)
969 			return key;
970 	}
971 	return NULL;
972 }
973 
974 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
975 					 const struct sock *addr_sk)
976 {
977 	const union tcp_md5_addr *addr;
978 
979 	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
980 	return tcp_md5_do_lookup(sk, addr, AF_INET);
981 }
982 EXPORT_SYMBOL(tcp_v4_md5_lookup);
983 
984 /* This can be called on a newly created socket, from other files */
985 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
986 		   int family, u8 prefixlen, const u8 *newkey, u8 newkeylen,
987 		   gfp_t gfp)
988 {
989 	/* Add Key to the list */
990 	struct tcp_md5sig_key *key;
991 	struct tcp_sock *tp = tcp_sk(sk);
992 	struct tcp_md5sig_info *md5sig;
993 
994 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
995 	if (key) {
996 		/* Pre-existing entry - just update that one. */
997 		memcpy(key->key, newkey, newkeylen);
998 		key->keylen = newkeylen;
999 		return 0;
1000 	}
1001 
1002 	md5sig = rcu_dereference_protected(tp->md5sig_info,
1003 					   lockdep_sock_is_held(sk));
1004 	if (!md5sig) {
1005 		md5sig = kmalloc(sizeof(*md5sig), gfp);
1006 		if (!md5sig)
1007 			return -ENOMEM;
1008 
1009 		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1010 		INIT_HLIST_HEAD(&md5sig->head);
1011 		rcu_assign_pointer(tp->md5sig_info, md5sig);
1012 	}
1013 
1014 	key = sock_kmalloc(sk, sizeof(*key), gfp);
1015 	if (!key)
1016 		return -ENOMEM;
1017 	if (!tcp_alloc_md5sig_pool()) {
1018 		sock_kfree_s(sk, key, sizeof(*key));
1019 		return -ENOMEM;
1020 	}
1021 
1022 	memcpy(key->key, newkey, newkeylen);
1023 	key->keylen = newkeylen;
1024 	key->family = family;
1025 	key->prefixlen = prefixlen;
1026 	memcpy(&key->addr, addr,
1027 	       (family == AF_INET6) ? sizeof(struct in6_addr) :
1028 				      sizeof(struct in_addr));
1029 	hlist_add_head_rcu(&key->node, &md5sig->head);
1030 	return 0;
1031 }
1032 EXPORT_SYMBOL(tcp_md5_do_add);
1033 
1034 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1035 		   u8 prefixlen)
1036 {
1037 	struct tcp_md5sig_key *key;
1038 
1039 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
1040 	if (!key)
1041 		return -ENOENT;
1042 	hlist_del_rcu(&key->node);
1043 	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1044 	kfree_rcu(key, rcu);
1045 	return 0;
1046 }
1047 EXPORT_SYMBOL(tcp_md5_do_del);
1048 
1049 static void tcp_clear_md5_list(struct sock *sk)
1050 {
1051 	struct tcp_sock *tp = tcp_sk(sk);
1052 	struct tcp_md5sig_key *key;
1053 	struct hlist_node *n;
1054 	struct tcp_md5sig_info *md5sig;
1055 
1056 	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1057 
1058 	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1059 		hlist_del_rcu(&key->node);
1060 		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1061 		kfree_rcu(key, rcu);
1062 	}
1063 }
1064 
1065 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1066 				 char __user *optval, int optlen)
1067 {
1068 	struct tcp_md5sig cmd;
1069 	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1070 	u8 prefixlen = 32;
1071 
1072 	if (optlen < sizeof(cmd))
1073 		return -EINVAL;
1074 
1075 	if (copy_from_user(&cmd, optval, sizeof(cmd)))
1076 		return -EFAULT;
1077 
1078 	if (sin->sin_family != AF_INET)
1079 		return -EINVAL;
1080 
1081 	if (optname == TCP_MD5SIG_EXT &&
1082 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1083 		prefixlen = cmd.tcpm_prefixlen;
1084 		if (prefixlen > 32)
1085 			return -EINVAL;
1086 	}
1087 
1088 	if (!cmd.tcpm_keylen)
1089 		return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1090 				      AF_INET, prefixlen);
1091 
1092 	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1093 		return -EINVAL;
1094 
1095 	return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1096 			      AF_INET, prefixlen, cmd.tcpm_key, cmd.tcpm_keylen,
1097 			      GFP_KERNEL);
1098 }
1099 
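/* Feed the pseudo-header (addresses, protocol, length) plus a copy of
 * the base TCP header with its checksum zeroed into the MD5 transform,
 * as RFC 2385 requires.
 */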
1100 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1101 				   __be32 daddr, __be32 saddr,
1102 				   const struct tcphdr *th, int nbytes)
1103 {
1104 	struct tcp4_pseudohdr *bp;
1105 	struct scatterlist sg;
1106 	struct tcphdr *_th;
1107 
1108 	bp = hp->scratch;
1109 	bp->saddr = saddr;
1110 	bp->daddr = daddr;
1111 	bp->pad = 0;
1112 	bp->protocol = IPPROTO_TCP;
1113 	bp->len = cpu_to_be16(nbytes);
1114 
1115 	_th = (struct tcphdr *)(bp + 1);
1116 	memcpy(_th, th, sizeof(*th));
1117 	_th->check = 0;
1118 
1119 	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1120 	ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1121 				sizeof(*bp) + sizeof(*th));
1122 	return crypto_ahash_update(hp->md5_req);
1123 }
1124 
1125 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1126 			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
1127 {
1128 	struct tcp_md5sig_pool *hp;
1129 	struct ahash_request *req;
1130 
1131 	hp = tcp_get_md5sig_pool();
1132 	if (!hp)
1133 		goto clear_hash_noput;
1134 	req = hp->md5_req;
1135 
1136 	if (crypto_ahash_init(req))
1137 		goto clear_hash;
1138 	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1139 		goto clear_hash;
1140 	if (tcp_md5_hash_key(hp, key))
1141 		goto clear_hash;
1142 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1143 	if (crypto_ahash_final(req))
1144 		goto clear_hash;
1145 
1146 	tcp_put_md5sig_pool();
1147 	return 0;
1148 
1149 clear_hash:
1150 	tcp_put_md5sig_pool();
1151 clear_hash_noput:
1152 	memset(md5_hash, 0, 16);
1153 	return 1;
1154 }
1155 
1156 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1157 			const struct sock *sk,
1158 			const struct sk_buff *skb)
1159 {
1160 	struct tcp_md5sig_pool *hp;
1161 	struct ahash_request *req;
1162 	const struct tcphdr *th = tcp_hdr(skb);
1163 	__be32 saddr, daddr;
1164 
1165 	if (sk) { /* valid for establish/request sockets */
1166 		saddr = sk->sk_rcv_saddr;
1167 		daddr = sk->sk_daddr;
1168 	} else {
1169 		const struct iphdr *iph = ip_hdr(skb);
1170 		saddr = iph->saddr;
1171 		daddr = iph->daddr;
1172 	}
1173 
1174 	hp = tcp_get_md5sig_pool();
1175 	if (!hp)
1176 		goto clear_hash_noput;
1177 	req = hp->md5_req;
1178 
1179 	if (crypto_ahash_init(req))
1180 		goto clear_hash;
1181 
1182 	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1183 		goto clear_hash;
1184 	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1185 		goto clear_hash;
1186 	if (tcp_md5_hash_key(hp, key))
1187 		goto clear_hash;
1188 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1189 	if (crypto_ahash_final(req))
1190 		goto clear_hash;
1191 
1192 	tcp_put_md5sig_pool();
1193 	return 0;
1194 
1195 clear_hash:
1196 	tcp_put_md5sig_pool();
1197 clear_hash_noput:
1198 	memset(md5_hash, 0, 16);
1199 	return 1;
1200 }
1201 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1202 
1203 #endif
1204 
1205 /* Called with rcu_read_lock() */
1206 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1207 				    const struct sk_buff *skb)
1208 {
1209 #ifdef CONFIG_TCP_MD5SIG
1210 	/*
1211 	 * This gets called for each TCP segment that arrives
1212 	 * so we want to be efficient.
1213 	 * We have 3 drop cases:
1214 	 * o No MD5 hash and one expected.
1215 	 * o MD5 hash and we're not expecting one.
1216 	 * o MD5 hash and it's wrong.
1217 	 */
1218 	const __u8 *hash_location = NULL;
1219 	struct tcp_md5sig_key *hash_expected;
1220 	const struct iphdr *iph = ip_hdr(skb);
1221 	const struct tcphdr *th = tcp_hdr(skb);
1222 	int genhash;
1223 	unsigned char newhash[16];
1224 
1225 	hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1226 					  AF_INET);
1227 	hash_location = tcp_parse_md5sig_option(th);
1228 
1229 	/* We've parsed the options - do we have a hash? */
1230 	if (!hash_expected && !hash_location)
1231 		return false;
1232 
1233 	if (hash_expected && !hash_location) {
1234 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1235 		return true;
1236 	}
1237 
1238 	if (!hash_expected && hash_location) {
1239 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1240 		return true;
1241 	}
1242 
1243 	/* Okay, so we have both hash_expected and hash_location -
1244 	 * we need to calculate the checksum.
1245 	 */
1246 	genhash = tcp_v4_md5_hash_skb(newhash,
1247 				      hash_expected,
1248 				      NULL, skb);
1249 
1250 	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1251 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1252 		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1253 				     &iph->saddr, ntohs(th->source),
1254 				     &iph->daddr, ntohs(th->dest),
1255 				     genhash ? " tcp_v4_calc_md5_hash failed"
1256 				     : "");
1257 		return true;
1258 	}
1259 	return false;
1260 #endif
1261 	return false;
1262 }
1263 
1264 static void tcp_v4_init_req(struct request_sock *req,
1265 			    const struct sock *sk_listener,
1266 			    struct sk_buff *skb)
1267 {
1268 	struct inet_request_sock *ireq = inet_rsk(req);
1269 
1270 	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1271 	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1272 	ireq->opt = tcp_v4_save_options(skb);
1273 }
1274 
1275 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1276 					  struct flowi *fl,
1277 					  const struct request_sock *req)
1278 {
1279 	return inet_csk_route_req(sk, &fl->u.ip4, req);
1280 }
1281 
1282 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1283 	.family		=	PF_INET,
1284 	.obj_size	=	sizeof(struct tcp_request_sock),
1285 	.rtx_syn_ack	=	tcp_rtx_synack,
1286 	.send_ack	=	tcp_v4_reqsk_send_ack,
1287 	.destructor	=	tcp_v4_reqsk_destructor,
1288 	.send_reset	=	tcp_v4_send_reset,
1289 	.syn_ack_timeout =	tcp_syn_ack_timeout,
1290 };
1291 
1292 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1293 	.mss_clamp	=	TCP_MSS_DEFAULT,
1294 #ifdef CONFIG_TCP_MD5SIG
1295 	.req_md5_lookup	=	tcp_v4_md5_lookup,
1296 	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1297 #endif
1298 	.init_req	=	tcp_v4_init_req,
1299 #ifdef CONFIG_SYN_COOKIES
1300 	.cookie_init_seq =	cookie_v4_init_sequence,
1301 #endif
1302 	.route_req	=	tcp_v4_route_req,
1303 	.init_seq	=	tcp_v4_init_seq,
1304 	.init_ts_off	=	tcp_v4_init_ts_off,
1305 	.send_synack	=	tcp_v4_send_synack,
1306 };
1307 
1308 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1309 {
1310 	/* Never answer SYNs sent to broadcast or multicast */
1311 	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1312 		goto drop;
1313 
1314 	return tcp_conn_request(&tcp_request_sock_ops,
1315 				&tcp_request_sock_ipv4_ops, sk, skb);
1316 
1317 drop:
1318 	tcp_listendrop(sk);
1319 	return 0;
1320 }
1321 EXPORT_SYMBOL(tcp_v4_conn_request);
1322 
1323 
1324 /*
1325  * The three way handshake has completed - we got a valid synack -
1326  * now create the new socket.
1327  */
1328 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1329 				  struct request_sock *req,
1330 				  struct dst_entry *dst,
1331 				  struct request_sock *req_unhash,
1332 				  bool *own_req)
1333 {
1334 	struct inet_request_sock *ireq;
1335 	struct inet_sock *newinet;
1336 	struct tcp_sock *newtp;
1337 	struct sock *newsk;
1338 #ifdef CONFIG_TCP_MD5SIG
1339 	struct tcp_md5sig_key *key;
1340 #endif
1341 	struct ip_options_rcu *inet_opt;
1342 
1343 	if (sk_acceptq_is_full(sk))
1344 		goto exit_overflow;
1345 
1346 	newsk = tcp_create_openreq_child(sk, req, skb);
1347 	if (!newsk)
1348 		goto exit_nonewsk;
1349 
1350 	newsk->sk_gso_type = SKB_GSO_TCPV4;
1351 	inet_sk_rx_dst_set(newsk, skb);
1352 
1353 	newtp		      = tcp_sk(newsk);
1354 	newinet		      = inet_sk(newsk);
1355 	ireq		      = inet_rsk(req);
1356 	sk_daddr_set(newsk, ireq->ir_rmt_addr);
1357 	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1358 	newsk->sk_bound_dev_if = ireq->ir_iif;
1359 	newinet->inet_saddr	      = ireq->ir_loc_addr;
1360 	inet_opt	      = ireq->opt;
1361 	rcu_assign_pointer(newinet->inet_opt, inet_opt);
1362 	ireq->opt	      = NULL;
1363 	newinet->mc_index     = inet_iif(skb);
1364 	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1365 	newinet->rcv_tos      = ip_hdr(skb)->tos;
1366 	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1367 	if (inet_opt)
1368 		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1369 	newinet->inet_id = newtp->write_seq ^ jiffies;
1370 
1371 	if (!dst) {
1372 		dst = inet_csk_route_child_sock(sk, newsk, req);
1373 		if (!dst)
1374 			goto put_and_exit;
1375 	} else {
1376 		/* syncookie case : see end of cookie_v4_check() */
1377 	}
1378 	sk_setup_caps(newsk, dst);
1379 
1380 	tcp_ca_openreq_child(newsk, dst);
1381 
1382 	tcp_sync_mss(newsk, dst_mtu(dst));
1383 	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1384 
1385 	tcp_initialize_rcv_mss(newsk);
1386 
1387 #ifdef CONFIG_TCP_MD5SIG
1388 	/* Copy over the MD5 key from the original socket */
1389 	key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1390 				AF_INET);
1391 	if (key) {
1392 		/*
1393 		 * We're using one, so create a matching key
1394 		 * on the newsk structure. If we fail to get
1395 		 * memory, then we end up not copying the key
1396 		 * across. Shucks.
1397 		 */
1398 		tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1399 			       AF_INET, 32, key->key, key->keylen, GFP_ATOMIC);
1400 		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1401 	}
1402 #endif
1403 
1404 	if (__inet_inherit_port(sk, newsk) < 0)
1405 		goto put_and_exit;
1406 	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
1407 	if (*own_req)
1408 		tcp_move_syn(newtp, req);
1409 
1410 	return newsk;
1411 
1412 exit_overflow:
1413 	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1414 exit_nonewsk:
1415 	dst_release(dst);
1416 exit:
1417 	tcp_listendrop(sk);
1418 	return NULL;
1419 put_and_exit:
1420 	inet_csk_prepare_forced_close(newsk);
1421 	tcp_done(newsk);
1422 	goto exit;
1423 }
1424 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1425 
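/* With syncookies, a non-SYN segment hitting a listener may be the ACK
 * that completes a cookie handshake; cookie_v4_check() validates it and,
 * if valid, creates the child socket.
 */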
1426 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1427 {
1428 #ifdef CONFIG_SYN_COOKIES
1429 	const struct tcphdr *th = tcp_hdr(skb);
1430 
1431 	if (!th->syn)
1432 		sk = cookie_v4_check(sk, skb);
1433 #endif
1434 	return sk;
1435 }
1436 
1437 /* The socket must have its spinlock held when we get
1438  * here, unless it is a TCP_LISTEN socket.
1439  *
1440  * We have a potential double-lock case here, so even when
1441  * doing backlog processing we use the BH locking scheme.
1442  * This is because we cannot sleep with the original spinlock
1443  * held.
1444  */
1445 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1446 {
1447 	struct sock *rsk;
1448 
1449 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1450 		struct dst_entry *dst = sk->sk_rx_dst;
1451 
1452 		sock_rps_save_rxhash(sk, skb);
1453 		sk_mark_napi_id(sk, skb);
1454 		if (dst) {
1455 			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1456 			    !dst->ops->check(dst, 0)) {
1457 				dst_release(dst);
1458 				sk->sk_rx_dst = NULL;
1459 			}
1460 		}
1461 		tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len);
1462 		return 0;
1463 	}
1464 
1465 	if (tcp_checksum_complete(skb))
1466 		goto csum_err;
1467 
1468 	if (sk->sk_state == TCP_LISTEN) {
1469 		struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1470 
1471 		if (!nsk)
1472 			goto discard;
1473 		if (nsk != sk) {
1474 			if (tcp_child_process(sk, nsk, skb)) {
1475 				rsk = nsk;
1476 				goto reset;
1477 			}
1478 			return 0;
1479 		}
1480 	} else
1481 		sock_rps_save_rxhash(sk, skb);
1482 
1483 	if (tcp_rcv_state_process(sk, skb)) {
1484 		rsk = sk;
1485 		goto reset;
1486 	}
1487 	return 0;
1488 
1489 reset:
1490 	tcp_v4_send_reset(rsk, skb);
1491 discard:
1492 	kfree_skb(skb);
1493 	/* Be careful here. If this function gets more complicated and
1494 	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1495 	 * might be destroyed here. This current version compiles correctly,
1496 	 * but you have been warned.
1497 	 */
1498 	return 0;
1499 
1500 csum_err:
1501 	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1502 	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1503 	goto discard;
1504 }
1505 EXPORT_SYMBOL(tcp_v4_do_rcv);
1506 
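/* Early demux: look up an established socket for this packet before
 * routing, so its cached input route (sk_rx_dst) can be reused and a
 * full route lookup avoided.
 */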
1507 void tcp_v4_early_demux(struct sk_buff *skb)
1508 {
1509 	const struct iphdr *iph;
1510 	const struct tcphdr *th;
1511 	struct sock *sk;
1512 
1513 	if (skb->pkt_type != PACKET_HOST)
1514 		return;
1515 
1516 	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1517 		return;
1518 
1519 	iph = ip_hdr(skb);
1520 	th = tcp_hdr(skb);
1521 
1522 	if (th->doff < sizeof(struct tcphdr) / 4)
1523 		return;
1524 
1525 	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1526 				       iph->saddr, th->source,
1527 				       iph->daddr, ntohs(th->dest),
1528 				       skb->skb_iif);
1529 	if (sk) {
1530 		skb->sk = sk;
1531 		skb->destructor = sock_edemux;
1532 		if (sk_fullsock(sk)) {
1533 			struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1534 
1535 			if (dst)
1536 				dst = dst_check(dst, 0);
1537 			if (dst &&
1538 			    inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1539 				skb_dst_set_noref(skb, dst);
1540 		}
1541 	}
1542 }
1543 
1544 /* Packet is added to VJ-style prequeue for processing in process
1545  * context, if a reader task is waiting. Apparently, this exciting
1546  * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93)
1547  * failed somewhere. Latency? Burstiness? Well, at least now we will
1548  * see why it failed. 8)8)				  --ANK
1549  *
1550  */
1551 bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
1552 {
1553 	struct tcp_sock *tp = tcp_sk(sk);
1554 
1555 	if (sysctl_tcp_low_latency || !tp->ucopy.task)
1556 		return false;
1557 
1558 	if (skb->len <= tcp_hdrlen(skb) &&
1559 	    skb_queue_len(&tp->ucopy.prequeue) == 0)
1560 		return false;
1561 
1562 	/* Before escaping RCU protected region, we need to take care of skb
1563 	 * dst. Prequeue is only enabled for established sockets.
1564 	 * For such sockets, we might need the skb dst only to set sk->sk_rx_dst.
1565 	 * Instead of doing a full sk_rx_dst validity check here, let's perform
1566 	 * an optimistic check.
1567 	 */
1568 	if (likely(sk->sk_rx_dst))
1569 		skb_dst_drop(skb);
1570 	else
1571 		skb_dst_force_safe(skb);
1572 
1573 	__skb_queue_tail(&tp->ucopy.prequeue, skb);
1574 	tp->ucopy.memory += skb->truesize;
1575 	if (skb_queue_len(&tp->ucopy.prequeue) >= 32 ||
1576 	    tp->ucopy.memory + atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) {
1577 		struct sk_buff *skb1;
1578 
1579 		BUG_ON(sock_owned_by_user(sk));
1580 		__NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPPREQUEUEDROPPED,
1581 				skb_queue_len(&tp->ucopy.prequeue));
1582 
1583 		while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
1584 			sk_backlog_rcv(sk, skb1);
1585 
1586 		tp->ucopy.memory = 0;
1587 	} else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
1588 		wake_up_interruptible_sync_poll(sk_sleep(sk),
1589 					   POLLIN | POLLRDNORM | POLLRDBAND);
1590 		if (!inet_csk_ack_scheduled(sk))
1591 			inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
1592 						  (3 * tcp_rto_min(sk)) / 4,
1593 						  TCP_RTO_MAX);
1594 	}
1595 	return true;
1596 }
1597 EXPORT_SYMBOL(tcp_prequeue);
1598 
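/* Queue a segment on the backlog of a user-locked socket; returns true
 * (and counts a TCPBacklogDrop) if the backlog limit would be exceeded.
 */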
1599 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1600 {
1601 	u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;
1602 
1603 	/* Only socket owner can try to collapse/prune rx queues
1604 	 * to reduce memory overhead, so add a little headroom here.
1605 	 * Only a few socket backlogs are likely to be non-empty concurrently.
1606 	 */
1607 	limit += 64*1024;
1608 
1609 	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1610 	 * we can fix skb->truesize to its real value to avoid future drops.
1611 	 * This is valid because skb is not yet charged to the socket.
1612 	 * It has been noticed that pure SACK packets were sometimes dropped
1613 	 * (if cooked by drivers without copybreak feature).
1614 	 */
1615 	skb_condense(skb);
1616 
1617 	if (unlikely(sk_add_backlog(sk, skb, limit))) {
1618 		bh_unlock_sock(sk);
1619 		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1620 		return true;
1621 	}
1622 	return false;
1623 }
1624 EXPORT_SYMBOL(tcp_add_backlog);
1625 
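/* Run the socket filter; it may trim the payload (but never below the
 * TCP header), in which case end_seq is adjusted to match the bytes
 * that remain.
 */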
1626 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1627 {
1628 	struct tcphdr *th = (struct tcphdr *)skb->data;
1629 	unsigned int eaten = skb->len;
1630 	int err;
1631 
1632 	err = sk_filter_trim_cap(sk, skb, th->doff * 4);
1633 	if (!err) {
1634 		eaten -= skb->len;
1635 		TCP_SKB_CB(skb)->end_seq -= eaten;
1636 	}
1637 	return err;
1638 }
1639 EXPORT_SYMBOL(tcp_filter);
1640 
1641 /*
1642  *	From tcp_input.c
1643  */
1644 
1645 int tcp_v4_rcv(struct sk_buff *skb)
1646 {
1647 	struct net *net = dev_net(skb->dev);
1648 	const struct iphdr *iph;
1649 	const struct tcphdr *th;
1650 	bool refcounted;
1651 	struct sock *sk;
1652 	int ret;
1653 
1654 	if (skb->pkt_type != PACKET_HOST)
1655 		goto discard_it;
1656 
1657 	/* Count it even if it's bad */
1658 	__TCP_INC_STATS(net, TCP_MIB_INSEGS);
1659 
1660 	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1661 		goto discard_it;
1662 
1663 	th = (const struct tcphdr *)skb->data;
1664 
1665 	if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1666 		goto bad_packet;
1667 	if (!pskb_may_pull(skb, th->doff * 4))
1668 		goto discard_it;
1669 
1670 	/* An explanation is required here, I think.
1671 	 * Packet length and doff are validated by header prediction,
1672 	 * provided the case of th->doff == 0 is eliminated.
1673 	 * So, we defer the checks. */
1674 
1675 	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1676 		goto csum_error;
1677 
1678 	th = (const struct tcphdr *)skb->data;
1679 	iph = ip_hdr(skb);
1680 	/* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
1681 	 * barrier() makes sure the compiler won't play fool^Waliasing games.
1682 	 */
1683 	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1684 		sizeof(struct inet_skb_parm));
1685 	barrier();
1686 
1687 	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1688 	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1689 				    skb->len - th->doff * 4);
1690 	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1691 	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1692 	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1693 	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1694 	TCP_SKB_CB(skb)->sacked	 = 0;
1695 
1696 lookup:
1697 	sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1698 			       th->dest, &refcounted);
1699 	if (!sk)
1700 		goto no_tcp_socket;
1701 
1702 process:
1703 	if (sk->sk_state == TCP_TIME_WAIT)
1704 		goto do_time_wait;
1705 
1706 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
1707 		struct request_sock *req = inet_reqsk(sk);
1708 		struct sock *nsk;
1709 
1710 		sk = req->rsk_listener;
1711 		if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
1712 			sk_drops_add(sk, skb);
1713 			reqsk_put(req);
1714 			goto discard_it;
1715 		}
1716 		if (unlikely(sk->sk_state != TCP_LISTEN)) {
1717 			inet_csk_reqsk_queue_drop_and_put(sk, req);
1718 			goto lookup;
1719 		}
1720 		/* We own a reference on the listener, increase it again
1721 		 * as we might lose it too soon.
1722 		 */
1723 		sock_hold(sk);
1724 		refcounted = true;
1725 		nsk = tcp_check_req(sk, skb, req, false);
1726 		if (!nsk) {
1727 			reqsk_put(req);
1728 			goto discard_and_relse;
1729 		}
1730 		if (nsk == sk) {
1731 			reqsk_put(req);
1732 		} else if (tcp_filter(sk, skb)) {
1733 			goto discard_and_relse;
1734 		} else if (tcp_child_process(sk, nsk, skb)) {
1735 			tcp_v4_send_reset(nsk, skb);
1736 			goto discard_and_relse;
1737 		} else {
1738 			sock_put(sk);
1739 			return 0;
1740 		}
1741 	}
1742 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1743 		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
1744 		goto discard_and_relse;
1745 	}
1746 
1747 	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1748 		goto discard_and_relse;
1749 
1750 	if (tcp_v4_inbound_md5_hash(sk, skb))
1751 		goto discard_and_relse;
1752 
1753 	nf_reset(skb);
1754 
1755 	if (tcp_filter(sk, skb))
1756 		goto discard_and_relse;
1757 	th = (const struct tcphdr *)skb->data;
1758 	iph = ip_hdr(skb);
1759 
1760 	skb->dev = NULL;
1761 
1762 	if (sk->sk_state == TCP_LISTEN) {
1763 		ret = tcp_v4_do_rcv(sk, skb);
1764 		goto put_and_return;
1765 	}
1766 
1767 	sk_incoming_cpu_update(sk);
1768 
1769 	bh_lock_sock_nested(sk);
1770 	tcp_segs_in(tcp_sk(sk), skb);
1771 	ret = 0;
1772 	if (!sock_owned_by_user(sk)) {
1773 		if (!tcp_prequeue(sk, skb))
1774 			ret = tcp_v4_do_rcv(sk, skb);
1775 	} else if (tcp_add_backlog(sk, skb)) {
1776 		goto discard_and_relse;
1777 	}
1778 	bh_unlock_sock(sk);
1779 
1780 put_and_return:
1781 	if (refcounted)
1782 		sock_put(sk);
1783 
1784 	return ret;
1785 
1786 no_tcp_socket:
1787 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1788 		goto discard_it;
1789 
1790 	if (tcp_checksum_complete(skb)) {
1791 csum_error:
1792 		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
1793 bad_packet:
1794 		__TCP_INC_STATS(net, TCP_MIB_INERRS);
1795 	} else {
1796 		tcp_v4_send_reset(NULL, skb);
1797 	}
1798 
1799 discard_it:
1800 	/* Discard frame. */
1801 	kfree_skb(skb);
1802 	return 0;
1803 
1804 discard_and_relse:
1805 	sk_drops_add(sk, skb);
1806 	if (refcounted)
1807 		sock_put(sk);
1808 	goto discard_it;
1809 
1810 do_time_wait:
1811 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1812 		inet_twsk_put(inet_twsk(sk));
1813 		goto discard_it;
1814 	}
1815 
1816 	if (tcp_checksum_complete(skb)) {
1817 		inet_twsk_put(inet_twsk(sk));
1818 		goto csum_error;
1819 	}
1820 	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1821 	case TCP_TW_SYN: {
1822 		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1823 							&tcp_hashinfo, skb,
1824 							__tcp_hdrlen(th),
1825 							iph->saddr, th->source,
1826 							iph->daddr, th->dest,
1827 							inet_iif(skb));
1828 		if (sk2) {
1829 			inet_twsk_deschedule_put(inet_twsk(sk));
1830 			sk = sk2;
1831 			refcounted = false;
1832 			goto process;
1833 		}
1834 		/* Fall through to ACK */
1835 	}
1836 	case TCP_TW_ACK:
1837 		tcp_v4_timewait_ack(sk, skb);
1838 		break;
1839 	case TCP_TW_RST:
1840 		tcp_v4_send_reset(sk, skb);
1841 		inet_twsk_deschedule_put(inet_twsk(sk));
1842 		goto discard_it;
1843 	case TCP_TW_SUCCESS:;
1844 	}
1845 	goto discard_it;
1846 }
1847 
1848 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1849 	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
1850 	.twsk_unique	= tcp_twsk_unique,
1851 	.twsk_destructor = tcp_twsk_destructor,
1852 };
1853 
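/* Cache the input route on the socket for the early-demux receive fast path.
 * dst_hold_safe() takes a reference only if the dst is still live, so
 * sk->sk_rx_dst cannot end up pointing at a freed entry; the incoming
 * interface index is stored alongside it so the cached dst can be validated.
 */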
1854 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
1855 {
1856 	struct dst_entry *dst = skb_dst(skb);
1857 
1858 	if (dst && dst_hold_safe(dst)) {
1859 		sk->sk_rx_dst = dst;
1860 		inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
1861 	}
1862 }
1863 EXPORT_SYMBOL(inet_sk_rx_dst_set);
1864 
1865 const struct inet_connection_sock_af_ops ipv4_specific = {
1866 	.queue_xmit	   = ip_queue_xmit,
1867 	.send_check	   = tcp_v4_send_check,
1868 	.rebuild_header	   = inet_sk_rebuild_header,
1869 	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
1870 	.conn_request	   = tcp_v4_conn_request,
1871 	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
1872 	.net_header_len	   = sizeof(struct iphdr),
1873 	.setsockopt	   = ip_setsockopt,
1874 	.getsockopt	   = ip_getsockopt,
1875 	.addr2sockaddr	   = inet_csk_addr2sockaddr,
1876 	.sockaddr_len	   = sizeof(struct sockaddr_in),
1877 #ifdef CONFIG_COMPAT
1878 	.compat_setsockopt = compat_ip_setsockopt,
1879 	.compat_getsockopt = compat_ip_getsockopt,
1880 #endif
1881 	.mtu_reduced	   = tcp_v4_mtu_reduced,
1882 };
1883 EXPORT_SYMBOL(ipv4_specific);
1884 
1885 #ifdef CONFIG_TCP_MD5SIG
1886 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1887 	.md5_lookup		= tcp_v4_md5_lookup,
1888 	.calc_md5_hash		= tcp_v4_md5_hash_skb,
1889 	.md5_parse		= tcp_v4_parse_md5_keys,
1890 };
1891 #endif
1892 
1893 /* NOTE: A lot of fields are set to zero explicitly by the call to
1894  *       sk_alloc(), so they need not be initialized here.
1895  */
1896 static int tcp_v4_init_sock(struct sock *sk)
1897 {
1898 	struct inet_connection_sock *icsk = inet_csk(sk);
1899 
1900 	tcp_init_sock(sk);
1901 
1902 	icsk->icsk_af_ops = &ipv4_specific;
1903 
1904 #ifdef CONFIG_TCP_MD5SIG
1905 	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
1906 #endif
1907 
1908 	return 0;
1909 }
1910 
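/* Per-socket teardown, reached through tcp_prot.destroy: stop the pending
 * TCP timers, release congestion-control and ULP state, purge the write,
 * out-of-order and prequeue queues, free any MD5 keys and drop the
 * bind-bucket reference.
 */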
1911 void tcp_v4_destroy_sock(struct sock *sk)
1912 {
1913 	struct tcp_sock *tp = tcp_sk(sk);
1914 
1915 	tcp_clear_xmit_timers(sk);
1916 
1917 	tcp_cleanup_congestion_control(sk);
1918 
1919 	tcp_cleanup_ulp(sk);
1920 
1921 	/* Clean up the write buffer. */
1922 	tcp_write_queue_purge(sk);
1923 
1924 	/* Check if we want to disable active TFO */
1925 	tcp_fastopen_active_disable_ofo_check(sk);
1926 
1927 	/* Cleans up our, hopefully empty, out_of_order_queue. */
1928 	skb_rbtree_purge(&tp->out_of_order_queue);
1929 
1930 #ifdef CONFIG_TCP_MD5SIG
1931 	/* Clean up the MD5 key list, if any */
1932 	if (tp->md5sig_info) {
1933 		tcp_clear_md5_list(sk);
1934 		kfree_rcu(tp->md5sig_info, rcu);
1935 		tp->md5sig_info = NULL;
1936 	}
1937 #endif
1938 
1939 	/* Clean up the prequeue; it really should be empty by now */
1940 	__skb_queue_purge(&tp->ucopy.prequeue);
1941 
1942 	/* Clean up a referenced TCP bind bucket. */
1943 	if (inet_csk(sk)->icsk_bind_hash)
1944 		inet_put_port(sk);
1945 
1946 	BUG_ON(tp->fastopen_rsk);
1947 
1948 	/* If the socket was aborted during connect(), free any pending Fast Open request */
1949 	tcp_free_fastopen_req(tp);
1950 	tcp_saved_syn_free(tp);
1951 
1952 	sk_sockets_allocated_dec(sk);
1953 }
1954 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1955 
1956 #ifdef CONFIG_PROC_FS
1957 /* Proc filesystem TCP sock list dumping. */
1958 
1959 /*
1960  * Get the next listening socket following cur.  If cur is NULL, get the
1961  * first socket starting from the bucket given in st->bucket; when
1962  * st->bucket is zero, the very first socket in the hash table is returned.
1963  */
1964 static void *listening_get_next(struct seq_file *seq, void *cur)
1965 {
1966 	struct tcp_iter_state *st = seq->private;
1967 	struct net *net = seq_file_net(seq);
1968 	struct inet_listen_hashbucket *ilb;
1969 	struct sock *sk = cur;
1970 
1971 	if (!sk) {
1972 get_head:
1973 		ilb = &tcp_hashinfo.listening_hash[st->bucket];
1974 		spin_lock(&ilb->lock);
1975 		sk = sk_head(&ilb->head);
1976 		st->offset = 0;
1977 		goto get_sk;
1978 	}
1979 	ilb = &tcp_hashinfo.listening_hash[st->bucket];
1980 	++st->num;
1981 	++st->offset;
1982 
1983 	sk = sk_next(sk);
1984 get_sk:
1985 	sk_for_each_from(sk) {
1986 		if (!net_eq(sock_net(sk), net))
1987 			continue;
1988 		if (sk->sk_family == st->family)
1989 			return sk;
1990 	}
1991 	spin_unlock(&ilb->lock);
1992 	st->offset = 0;
1993 	if (++st->bucket < INET_LHTABLE_SIZE)
1994 		goto get_head;
1995 	return NULL;
1996 }
1997 
1998 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1999 {
2000 	struct tcp_iter_state *st = seq->private;
2001 	void *rc;
2002 
2003 	st->bucket = 0;
2004 	st->offset = 0;
2005 	rc = listening_get_next(seq, NULL);
2006 
2007 	while (rc && *pos) {
2008 		rc = listening_get_next(seq, rc);
2009 		--*pos;
2010 	}
2011 	return rc;
2012 }
2013 
2014 static inline bool empty_bucket(const struct tcp_iter_state *st)
2015 {
2016 	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2017 }
2018 
2019 /*
2020  * Get the first established socket, starting from the bucket given in st->bucket.
2021  * If st->bucket is zero, the very first socket in the hash is returned.
2022  */
2023 static void *established_get_first(struct seq_file *seq)
2024 {
2025 	struct tcp_iter_state *st = seq->private;
2026 	struct net *net = seq_file_net(seq);
2027 	void *rc = NULL;
2028 
2029 	st->offset = 0;
2030 	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2031 		struct sock *sk;
2032 		struct hlist_nulls_node *node;
2033 		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2034 
2035 		/* Lockless fast path for the common case of empty buckets */
2036 		if (empty_bucket(st))
2037 			continue;
2038 
2039 		spin_lock_bh(lock);
2040 		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2041 			if (sk->sk_family != st->family ||
2042 			    !net_eq(sock_net(sk), net)) {
2043 				continue;
2044 			}
2045 			rc = sk;
2046 			goto out;
2047 		}
2048 		spin_unlock_bh(lock);
2049 	}
2050 out:
2051 	return rc;
2052 }
2053 
2054 static void *established_get_next(struct seq_file *seq, void *cur)
2055 {
2056 	struct sock *sk = cur;
2057 	struct hlist_nulls_node *node;
2058 	struct tcp_iter_state *st = seq->private;
2059 	struct net *net = seq_file_net(seq);
2060 
2061 	++st->num;
2062 	++st->offset;
2063 
2064 	sk = sk_nulls_next(sk);
2065 
2066 	sk_nulls_for_each_from(sk, node) {
2067 		if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2068 			return sk;
2069 	}
2070 
2071 	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2072 	++st->bucket;
2073 	return established_get_first(seq);
2074 }
2075 
2076 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2077 {
2078 	struct tcp_iter_state *st = seq->private;
2079 	void *rc;
2080 
2081 	st->bucket = 0;
2082 	rc = established_get_first(seq);
2083 
2084 	while (rc && pos) {
2085 		rc = established_get_next(seq, rc);
2086 		--pos;
2087 	}
2088 	return rc;
2089 }
2090 
2091 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2092 {
2093 	void *rc;
2094 	struct tcp_iter_state *st = seq->private;
2095 
2096 	st->state = TCP_SEQ_STATE_LISTENING;
2097 	rc	  = listening_get_idx(seq, &pos);
2098 
2099 	if (!rc) {
2100 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2101 		rc	  = established_get_idx(seq, pos);
2102 	}
2103 
2104 	return rc;
2105 }
2106 
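/* Resume iteration at the bucket and in-bucket offset remembered from the
 * previous read of the seq_file, so consecutive reads of /proc/net/tcp do
 * not rescan the hash tables from the start.  st->num is saved and restored
 * so that the seek itself does not advance the entry counter.
 */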
2107 static void *tcp_seek_last_pos(struct seq_file *seq)
2108 {
2109 	struct tcp_iter_state *st = seq->private;
2110 	int offset = st->offset;
2111 	int orig_num = st->num;
2112 	void *rc = NULL;
2113 
2114 	switch (st->state) {
2115 	case TCP_SEQ_STATE_LISTENING:
2116 		if (st->bucket >= INET_LHTABLE_SIZE)
2117 			break;
2118 		st->state = TCP_SEQ_STATE_LISTENING;
2119 		rc = listening_get_next(seq, NULL);
2120 		while (offset-- && rc)
2121 			rc = listening_get_next(seq, rc);
2122 		if (rc)
2123 			break;
2124 		st->bucket = 0;
2125 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2126 		/* Fallthrough */
2127 	case TCP_SEQ_STATE_ESTABLISHED:
2128 		if (st->bucket > tcp_hashinfo.ehash_mask)
2129 			break;
2130 		rc = established_get_first(seq);
2131 		while (offset-- && rc)
2132 			rc = established_get_next(seq, rc);
2133 	}
2134 
2135 	st->num = orig_num;
2136 
2137 	return rc;
2138 }
2139 
2140 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2141 {
2142 	struct tcp_iter_state *st = seq->private;
2143 	void *rc;
2144 
2145 	if (*pos && *pos == st->last_pos) {
2146 		rc = tcp_seek_last_pos(seq);
2147 		if (rc)
2148 			goto out;
2149 	}
2150 
2151 	st->state = TCP_SEQ_STATE_LISTENING;
2152 	st->num = 0;
2153 	st->bucket = 0;
2154 	st->offset = 0;
2155 	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2156 
2157 out:
2158 	st->last_pos = *pos;
2159 	return rc;
2160 }
2161 
2162 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2163 {
2164 	struct tcp_iter_state *st = seq->private;
2165 	void *rc = NULL;
2166 
2167 	if (v == SEQ_START_TOKEN) {
2168 		rc = tcp_get_idx(seq, 0);
2169 		goto out;
2170 	}
2171 
2172 	switch (st->state) {
2173 	case TCP_SEQ_STATE_LISTENING:
2174 		rc = listening_get_next(seq, v);
2175 		if (!rc) {
2176 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2177 			st->bucket = 0;
2178 			st->offset = 0;
2179 			rc	  = established_get_first(seq);
2180 		}
2181 		break;
2182 	case TCP_SEQ_STATE_ESTABLISHED:
2183 		rc = established_get_next(seq, v);
2184 		break;
2185 	}
2186 out:
2187 	++*pos;
2188 	st->last_pos = *pos;
2189 	return rc;
2190 }
2191 
2192 static void tcp_seq_stop(struct seq_file *seq, void *v)
2193 {
2194 	struct tcp_iter_state *st = seq->private;
2195 
2196 	switch (st->state) {
2197 	case TCP_SEQ_STATE_LISTENING:
2198 		if (v != SEQ_START_TOKEN)
2199 			spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
2200 		break;
2201 	case TCP_SEQ_STATE_ESTABLISHED:
2202 		if (v)
2203 			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2204 		break;
2205 	}
2206 }
2207 
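/* seq_file ->open hook shared by the per-family proc entries: allocate the
 * namespace-aware iterator state and record which address family this
 * file should report.
 */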
2208 int tcp_seq_open(struct inode *inode, struct file *file)
2209 {
2210 	struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
2211 	struct tcp_iter_state *s;
2212 	int err;
2213 
2214 	err = seq_open_net(inode, file, &afinfo->seq_ops,
2215 			  sizeof(struct tcp_iter_state));
2216 	if (err < 0)
2217 		return err;
2218 
2219 	s = ((struct seq_file *)file->private_data)->private;
2220 	s->family		= afinfo->family;
2221 	s->last_pos		= 0;
2222 	return 0;
2223 }
2224 EXPORT_SYMBOL(tcp_seq_open);
2225 
2226 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2227 {
2228 	int rc = 0;
2229 	struct proc_dir_entry *p;
2230 
2231 	afinfo->seq_ops.start		= tcp_seq_start;
2232 	afinfo->seq_ops.next		= tcp_seq_next;
2233 	afinfo->seq_ops.stop		= tcp_seq_stop;
2234 
2235 	p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2236 			     afinfo->seq_fops, afinfo);
2237 	if (!p)
2238 		rc = -ENOMEM;
2239 	return rc;
2240 }
2241 EXPORT_SYMBOL(tcp_proc_register);
2242 
2243 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2244 {
2245 	remove_proc_entry(afinfo->name, net->proc_net);
2246 }
2247 EXPORT_SYMBOL(tcp_proc_unregister);
2248 
2249 static void get_openreq4(const struct request_sock *req,
2250 			 struct seq_file *f, int i)
2251 {
2252 	const struct inet_request_sock *ireq = inet_rsk(req);
2253 	long delta = req->rsk_timer.expires - jiffies;
2254 
2255 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2256 		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2257 		i,
2258 		ireq->ir_loc_addr,
2259 		ireq->ir_num,
2260 		ireq->ir_rmt_addr,
2261 		ntohs(ireq->ir_rmt_port),
2262 		TCP_SYN_RECV,
2263 		0, 0, /* could print option size, but that is af dependent. */
2264 		1,    /* timers active (only the expire timer) */
2265 		jiffies_delta_to_clock_t(delta),
2266 		req->num_timeout,
2267 		from_kuid_munged(seq_user_ns(f),
2268 				 sock_i_uid(req->rsk_listener)),
2269 		0,  /* non-standard timer */
2270 		0, /* open_requests have no inode */
2271 		0,
2272 		req);
2273 }
2274 
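/* Format one /proc/net/tcp row for a full socket.  After the columns named
 * in the header (sl, addresses, st, tx_queue, rx_queue, tr, tm->when,
 * retrnsmt, uid, timeout, inode) the line carries unnamed fields: socket
 * refcount, the socket pointer, RTO and delayed-ACK ato in clock ticks,
 * the quick-ack/pingpong flags, snd_cwnd, and ssthresh (or the fastopen
 * max_qlen for listeners).
 */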
2275 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2276 {
2277 	int timer_active;
2278 	unsigned long timer_expires;
2279 	const struct tcp_sock *tp = tcp_sk(sk);
2280 	const struct inet_connection_sock *icsk = inet_csk(sk);
2281 	const struct inet_sock *inet = inet_sk(sk);
2282 	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2283 	__be32 dest = inet->inet_daddr;
2284 	__be32 src = inet->inet_rcv_saddr;
2285 	__u16 destp = ntohs(inet->inet_dport);
2286 	__u16 srcp = ntohs(inet->inet_sport);
2287 	int rx_queue;
2288 	int state;
2289 
2290 	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2291 	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2292 	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2293 		timer_active	= 1;
2294 		timer_expires	= icsk->icsk_timeout;
2295 	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2296 		timer_active	= 4;
2297 		timer_expires	= icsk->icsk_timeout;
2298 	} else if (timer_pending(&sk->sk_timer)) {
2299 		timer_active	= 2;
2300 		timer_expires	= sk->sk_timer.expires;
2301 	} else {
2302 		timer_active	= 0;
2303 		timer_expires = jiffies;
2304 	}
2305 
2306 	state = sk_state_load(sk);
2307 	if (state == TCP_LISTEN)
2308 		rx_queue = sk->sk_ack_backlog;
2309 	else
2310 		/* Because we don't lock the socket,
2311 		 * we might find a transient negative value.
2312 		 */
2313 		rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2314 
2315 	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2316 			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2317 		i, src, srcp, dest, destp, state,
2318 		tp->write_seq - tp->snd_una,
2319 		rx_queue,
2320 		timer_active,
2321 		jiffies_delta_to_clock_t(timer_expires - jiffies),
2322 		icsk->icsk_retransmits,
2323 		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2324 		icsk->icsk_probes_out,
2325 		sock_i_ino(sk),
2326 		refcount_read(&sk->sk_refcnt), sk,
2327 		jiffies_to_clock_t(icsk->icsk_rto),
2328 		jiffies_to_clock_t(icsk->icsk_ack.ato),
2329 		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2330 		tp->snd_cwnd,
2331 		state == TCP_LISTEN ?
2332 		    fastopenq->max_qlen :
2333 		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2334 }
2335 
2336 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2337 			       struct seq_file *f, int i)
2338 {
2339 	long delta = tw->tw_timer.expires - jiffies;
2340 	__be32 dest, src;
2341 	__u16 destp, srcp;
2342 
2343 	dest  = tw->tw_daddr;
2344 	src   = tw->tw_rcv_saddr;
2345 	destp = ntohs(tw->tw_dport);
2346 	srcp  = ntohs(tw->tw_sport);
2347 
2348 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2349 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2350 		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2351 		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2352 		refcount_read(&tw->tw_refcnt), tw);
2353 }
2354 
2355 #define TMPSZ 150
2356 
2357 static int tcp4_seq_show(struct seq_file *seq, void *v)
2358 {
2359 	struct tcp_iter_state *st;
2360 	struct sock *sk = v;
2361 
2362 	seq_setwidth(seq, TMPSZ - 1);
2363 	if (v == SEQ_START_TOKEN) {
2364 		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2365 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2366 			   "inode");
2367 		goto out;
2368 	}
2369 	st = seq->private;
2370 
2371 	if (sk->sk_state == TCP_TIME_WAIT)
2372 		get_timewait4_sock(v, seq, st->num);
2373 	else if (sk->sk_state == TCP_NEW_SYN_RECV)
2374 		get_openreq4(v, seq, st->num);
2375 	else
2376 		get_tcp4_sock(v, seq, st->num);
2377 out:
2378 	seq_pad(seq, '\n');
2379 	return 0;
2380 }
2381 
2382 static const struct file_operations tcp_afinfo_seq_fops = {
2383 	.owner   = THIS_MODULE,
2384 	.open    = tcp_seq_open,
2385 	.read    = seq_read,
2386 	.llseek  = seq_lseek,
2387 	.release = seq_release_net
2388 };
2389 
2390 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2391 	.name		= "tcp",
2392 	.family		= AF_INET,
2393 	.seq_fops	= &tcp_afinfo_seq_fops,
2394 	.seq_ops	= {
2395 		.show		= tcp4_seq_show,
2396 	},
2397 };
2398 
2399 static int __net_init tcp4_proc_init_net(struct net *net)
2400 {
2401 	return tcp_proc_register(net, &tcp4_seq_afinfo);
2402 }
2403 
2404 static void __net_exit tcp4_proc_exit_net(struct net *net)
2405 {
2406 	tcp_proc_unregister(net, &tcp4_seq_afinfo);
2407 }
2408 
2409 static struct pernet_operations tcp4_net_ops = {
2410 	.init = tcp4_proc_init_net,
2411 	.exit = tcp4_proc_exit_net,
2412 };
2413 
2414 int __init tcp4_proc_init(void)
2415 {
2416 	return register_pernet_subsys(&tcp4_net_ops);
2417 }
2418 
2419 void tcp4_proc_exit(void)
2420 {
2421 	unregister_pernet_subsys(&tcp4_net_ops);
2422 }
2423 #endif /* CONFIG_PROC_FS */
2424 
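/* Protocol descriptor for IPv4 TCP sockets, registered with the core
 * socket layer by the af_inet initialization code.
 */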
2425 struct proto tcp_prot = {
2426 	.name			= "TCP",
2427 	.owner			= THIS_MODULE,
2428 	.close			= tcp_close,
2429 	.connect		= tcp_v4_connect,
2430 	.disconnect		= tcp_disconnect,
2431 	.accept			= inet_csk_accept,
2432 	.ioctl			= tcp_ioctl,
2433 	.init			= tcp_v4_init_sock,
2434 	.destroy		= tcp_v4_destroy_sock,
2435 	.shutdown		= tcp_shutdown,
2436 	.setsockopt		= tcp_setsockopt,
2437 	.getsockopt		= tcp_getsockopt,
2438 	.keepalive		= tcp_set_keepalive,
2439 	.recvmsg		= tcp_recvmsg,
2440 	.sendmsg		= tcp_sendmsg,
2441 	.sendpage		= tcp_sendpage,
2442 	.backlog_rcv		= tcp_v4_do_rcv,
2443 	.release_cb		= tcp_release_cb,
2444 	.hash			= inet_hash,
2445 	.unhash			= inet_unhash,
2446 	.get_port		= inet_csk_get_port,
2447 	.enter_memory_pressure	= tcp_enter_memory_pressure,
2448 	.leave_memory_pressure	= tcp_leave_memory_pressure,
2449 	.stream_memory_free	= tcp_stream_memory_free,
2450 	.sockets_allocated	= &tcp_sockets_allocated,
2451 	.orphan_count		= &tcp_orphan_count,
2452 	.memory_allocated	= &tcp_memory_allocated,
2453 	.memory_pressure	= &tcp_memory_pressure,
2454 	.sysctl_mem		= sysctl_tcp_mem,
2455 	.sysctl_wmem		= sysctl_tcp_wmem,
2456 	.sysctl_rmem		= sysctl_tcp_rmem,
2457 	.max_header		= MAX_TCP_HEADER,
2458 	.obj_size		= sizeof(struct tcp_sock),
2459 	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
2460 	.twsk_prot		= &tcp_timewait_sock_ops,
2461 	.rsk_prot		= &tcp_request_sock_ops,
2462 	.h.hashinfo		= &tcp_hashinfo,
2463 	.no_autobind		= true,
2464 #ifdef CONFIG_COMPAT
2465 	.compat_setsockopt	= compat_tcp_setsockopt,
2466 	.compat_getsockopt	= compat_tcp_getsockopt,
2467 #endif
2468 	.diag_destroy		= tcp_abort,
2469 };
2470 EXPORT_SYMBOL(tcp_prot);
2471 
2472 static void __net_exit tcp_sk_exit(struct net *net)
2473 {
2474 	int cpu;
2475 
2476 	for_each_possible_cpu(cpu)
2477 		inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2478 	free_percpu(net->ipv4.tcp_sk);
2479 }
2480 
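/* Per network namespace setup: create one kernel control socket per
 * possible CPU (used to send RSTs and ACKs on behalf of packets that
 * match no local socket) and initialize the namespace's TCP sysctl
 * defaults.
 */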
2481 static int __net_init tcp_sk_init(struct net *net)
2482 {
2483 	int res, cpu, cnt;
2484 
2485 	net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2486 	if (!net->ipv4.tcp_sk)
2487 		return -ENOMEM;
2488 
2489 	for_each_possible_cpu(cpu) {
2490 		struct sock *sk;
2491 
2492 		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2493 					   IPPROTO_TCP, net);
2494 		if (res)
2495 			goto fail;
2496 		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2497 		*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2498 	}
2499 
2500 	net->ipv4.sysctl_tcp_ecn = 2;
2501 	net->ipv4.sysctl_tcp_ecn_fallback = 1;
2502 
2503 	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2504 	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2505 	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2506 
2507 	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2508 	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2509 	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2510 
2511 	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2512 	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2513 	net->ipv4.sysctl_tcp_syncookies = 1;
2514 	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2515 	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2516 	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2517 	net->ipv4.sysctl_tcp_orphan_retries = 0;
2518 	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2519 	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
2520 	net->ipv4.sysctl_tcp_tw_reuse = 0;
2521 
2522 	cnt = tcp_hashinfo.ehash_mask + 1;
2523 	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = (cnt + 1) / 2;
2524 	net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
2525 
2526 	net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 256);
2527 	net->ipv4.sysctl_tcp_sack = 1;
2528 	net->ipv4.sysctl_tcp_window_scaling = 1;
2529 	net->ipv4.sysctl_tcp_timestamps = 1;
2530 
2531 	return 0;
2532 fail:
2533 	tcp_sk_exit(net);
2534 
2535 	return res;
2536 }
2537 
2538 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2539 {
2540 	inet_twsk_purge(&tcp_hashinfo, AF_INET);
2541 }
2542 
2543 static struct pernet_operations __net_initdata tcp_sk_ops = {
2544        .init	   = tcp_sk_init,
2545        .exit	   = tcp_sk_exit,
2546        .exit_batch = tcp_sk_exit_batch,
2547 };
2548 
2549 void __init tcp_v4_init(void)
2550 {
2551 	if (register_pernet_subsys(&tcp_sk_ops))
2552 		panic("Failed to create the TCP control socket.\n");
2553 }
2554