xref: /openbmc/linux/net/ipv4/tcp_ipv4.c (revision a8a28aff)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		Implementation of the Transmission Control Protocol(TCP).
7  *
8  *		IPv4 specific functions
9  *
10  *
11  *		code split from:
12  *		linux/ipv4/tcp.c
13  *		linux/ipv4/tcp_input.c
14  *		linux/ipv4/tcp_output.c
15  *
16  *		See tcp.c for author information
17  *
18  *	This program is free software; you can redistribute it and/or
19  *      modify it under the terms of the GNU General Public License
20  *      as published by the Free Software Foundation; either version
21  *      2 of the License, or (at your option) any later version.
22  */
23 
24 /*
25  * Changes:
26  *		David S. Miller	:	New socket lookup architecture.
27  *					This code is dedicated to John Dyson.
28  *		David S. Miller :	Change semantics of established hash,
29  *					half is devoted to TIME_WAIT sockets
30  *					and the rest go in the other half.
31  *		Andi Kleen :		Add support for syncookies and fixed
32  *					some bugs: ip options weren't passed to
33  *					the TCP layer, missed a check for an
34  *					ACK bit.
35  *		Andi Kleen :		Implemented fast path mtu discovery.
36  *	     				Fixed many serious bugs in the
37  *					request_sock handling and moved
38  *					most of it into the af independent code.
39  *					Added tail drop and some other bugfixes.
40  *					Added new listen semantics.
41  *		Mike McLagan	:	Routing by source
42  *	Juan Jose Ciarlante:		ip_dynaddr bits
43  *		Andi Kleen:		various fixes.
44  *	Vitaly E. Lavrov	:	Transparent proxy revived after a
45  *					year-long coma.
46  *	Andi Kleen		:	Fix new listen.
47  *	Andi Kleen		:	Fix accept error reporting.
48  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
49  *	Alexey Kuznetsov		allows both IPv4 and IPv6 sockets to bind
50  *					to a single port at the same time.
51  */
52 
53 #define pr_fmt(fmt) "TCP: " fmt
54 
55 #include <linux/bottom_half.h>
56 #include <linux/types.h>
57 #include <linux/fcntl.h>
58 #include <linux/module.h>
59 #include <linux/random.h>
60 #include <linux/cache.h>
61 #include <linux/jhash.h>
62 #include <linux/init.h>
63 #include <linux/times.h>
64 #include <linux/slab.h>
65 
66 #include <net/net_namespace.h>
67 #include <net/icmp.h>
68 #include <net/inet_hashtables.h>
69 #include <net/tcp.h>
70 #include <net/transp_v6.h>
71 #include <net/ipv6.h>
72 #include <net/inet_common.h>
73 #include <net/timewait_sock.h>
74 #include <net/xfrm.h>
75 #include <net/netdma.h>
76 #include <net/secure_seq.h>
77 #include <net/tcp_memcontrol.h>
78 #include <net/busy_poll.h>
79 
80 #include <linux/inet.h>
81 #include <linux/ipv6.h>
82 #include <linux/stddef.h>
83 #include <linux/proc_fs.h>
84 #include <linux/seq_file.h>
85 
86 #include <linux/crypto.h>
87 #include <linux/scatterlist.h>
88 
89 int sysctl_tcp_tw_reuse __read_mostly;
90 int sysctl_tcp_low_latency __read_mostly;
91 EXPORT_SYMBOL(sysctl_tcp_low_latency);
92 
93 
94 #ifdef CONFIG_TCP_MD5SIG
95 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
96 			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
97 #endif
98 
99 struct inet_hashinfo tcp_hashinfo;
100 EXPORT_SYMBOL(tcp_hashinfo);
101 
102 static inline __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
103 {
104 	return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
105 					  ip_hdr(skb)->saddr,
106 					  tcp_hdr(skb)->dest,
107 					  tcp_hdr(skb)->source);
108 }
109 
110 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
111 {
112 	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
113 	struct tcp_sock *tp = tcp_sk(sk);
114 
115 	/* With PAWS, it is safe from the viewpoint
116 	   of data integrity. Even without PAWS it is safe provided the sequence
117 	   spaces do not overlap, i.e. at data rates <= 80Mbit/sec.
118 
119 	   Actually, the idea is close to VJ's, except that the timestamp cache
120 	   is held not per host but per port pair, and the TW bucket is used as
121 	   the state holder.
122 
123 	   If the TW bucket has already been destroyed we fall back to VJ's
124 	   scheme and use the initial timestamp retrieved from the peer table.
125 	 */
126 	if (tcptw->tw_ts_recent_stamp &&
127 	    (twp == NULL || (sysctl_tcp_tw_reuse &&
128 			     get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
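		/* Start the new connection's sequence space beyond the old
		 * one's highest sequence plus a maximal window, so the two
		 * sequence spaces cannot overlap.
		 */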
129 		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
130 		if (tp->write_seq == 0)
131 			tp->write_seq = 1;
132 		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
133 		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
134 		sock_hold(sktw);
135 		return 1;
136 	}
137 
138 	return 0;
139 }
140 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
141 
142 /* This will initiate an outgoing connection. */
143 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
144 {
145 	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
146 	struct inet_sock *inet = inet_sk(sk);
147 	struct tcp_sock *tp = tcp_sk(sk);
148 	__be16 orig_sport, orig_dport;
149 	__be32 daddr, nexthop;
150 	struct flowi4 *fl4;
151 	struct rtable *rt;
152 	int err;
153 	struct ip_options_rcu *inet_opt;
154 
155 	if (addr_len < sizeof(struct sockaddr_in))
156 		return -EINVAL;
157 
158 	if (usin->sin_family != AF_INET)
159 		return -EAFNOSUPPORT;
160 
161 	nexthop = daddr = usin->sin_addr.s_addr;
162 	inet_opt = rcu_dereference_protected(inet->inet_opt,
163 					     sock_owned_by_user(sk));
164 	if (inet_opt && inet_opt->opt.srr) {
165 		if (!daddr)
166 			return -EINVAL;
167 		nexthop = inet_opt->opt.faddr;
168 	}
169 
170 	orig_sport = inet->inet_sport;
171 	orig_dport = usin->sin_port;
172 	fl4 = &inet->cork.fl.u.ip4;
173 	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
174 			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
175 			      IPPROTO_TCP,
176 			      orig_sport, orig_dport, sk);
177 	if (IS_ERR(rt)) {
178 		err = PTR_ERR(rt);
179 		if (err == -ENETUNREACH)
180 			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
181 		return err;
182 	}
183 
184 	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
185 		ip_rt_put(rt);
186 		return -ENETUNREACH;
187 	}
188 
189 	if (!inet_opt || !inet_opt->opt.srr)
190 		daddr = fl4->daddr;
191 
192 	if (!inet->inet_saddr)
193 		inet->inet_saddr = fl4->saddr;
194 	inet->inet_rcv_saddr = inet->inet_saddr;
195 
196 	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
197 		/* Reset inherited state */
198 		tp->rx_opt.ts_recent	   = 0;
199 		tp->rx_opt.ts_recent_stamp = 0;
200 		if (likely(!tp->repair))
201 			tp->write_seq	   = 0;
202 	}
203 
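	/* With tw_recycle, recover the last timestamp seen from this
	 * destination (saved when a previous connection entered TIME-WAIT)
	 * so PAWS checks can be applied from the first segment.
	 */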
204 	if (tcp_death_row.sysctl_tw_recycle &&
205 	    !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
206 		tcp_fetch_timewait_stamp(sk, &rt->dst);
207 
208 	inet->inet_dport = usin->sin_port;
209 	inet->inet_daddr = daddr;
210 
211 	inet_csk(sk)->icsk_ext_hdr_len = 0;
212 	if (inet_opt)
213 		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
214 
215 	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
216 
217 	/* Socket identity is still unknown (sport may be zero).
218 	 * However we set the state to SYN-SENT and, without releasing the
219 	 * socket lock, select a source port, enter ourselves into the hash
220 	 * tables and complete initialization after this.
221 	 */
222 	tcp_set_state(sk, TCP_SYN_SENT);
223 	err = inet_hash_connect(&tcp_death_row, sk);
224 	if (err)
225 		goto failure;
226 
227 	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
228 			       inet->inet_sport, inet->inet_dport, sk);
229 	if (IS_ERR(rt)) {
230 		err = PTR_ERR(rt);
231 		rt = NULL;
232 		goto failure;
233 	}
234 	/* OK, now commit destination to socket.  */
235 	sk->sk_gso_type = SKB_GSO_TCPV4;
236 	sk_setup_caps(sk, &rt->dst);
237 
238 	if (!tp->write_seq && likely(!tp->repair))
239 		tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
240 							   inet->inet_daddr,
241 							   inet->inet_sport,
242 							   usin->sin_port);
243 
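	/* Seed the IP ID counter with a connection-specific starting value. */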
244 	inet->inet_id = tp->write_seq ^ jiffies;
245 
246 	err = tcp_connect(sk);
247 
248 	rt = NULL;
249 	if (err)
250 		goto failure;
251 
252 	return 0;
253 
254 failure:
255 	/*
256 	 * This unhashes the socket and releases the local port,
257 	 * if necessary.
258 	 */
259 	tcp_set_state(sk, TCP_CLOSE);
260 	ip_rt_put(rt);
261 	sk->sk_route_caps = 0;
262 	inet->inet_dport = 0;
263 	return err;
264 }
265 EXPORT_SYMBOL(tcp_v4_connect);
266 
267 /*
268  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
269  * It can be called through tcp_release_cb() if the socket was owned by the user
270  * at the time tcp_v4_err() was called to handle the ICMP message.
271  */
272 static void tcp_v4_mtu_reduced(struct sock *sk)
273 {
274 	struct dst_entry *dst;
275 	struct inet_sock *inet = inet_sk(sk);
276 	u32 mtu = tcp_sk(sk)->mtu_info;
277 
278 	dst = inet_csk_update_pmtu(sk, mtu);
279 	if (!dst)
280 		return;
281 
282 	/* Something is about to go wrong... Remember the soft error
283 	 * in case this connection is not able to recover.
284 	 */
285 	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
286 		sk->sk_err_soft = EMSGSIZE;
287 
288 	mtu = dst_mtu(dst);
289 
290 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
291 	    ip_sk_accept_pmtu(sk) &&
292 	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
293 		tcp_sync_mss(sk, mtu);
294 
295 		/* Resend the TCP packet because it's
296 		 * clear that the old packet has been
297 		 * dropped. This is the new "fast" path mtu
298 		 * discovery.
299 		 */
300 		tcp_simple_retransmit(sk);
301 	} /* else let the usual retransmit timer handle it */
302 }
303 
304 static void do_redirect(struct sk_buff *skb, struct sock *sk)
305 {
306 	struct dst_entry *dst = __sk_dst_check(sk, 0);
307 
308 	if (dst)
309 		dst->ops->redirect(dst, sk, skb);
310 }
311 
312 /*
313  * This routine is called by the ICMP module when it gets some
314  * sort of error condition.  If err < 0 then the socket should
315  * be closed and the error returned to the user.  If err > 0
316  * it's just the icmp type << 8 | icmp code.  After adjustment the
317  * header points to the first 8 bytes of the TCP header.  We need
318  * to find the appropriate port.
319  *
320  * The locking strategy used here is very "optimistic". When
321  * someone else accesses the socket the ICMP is just dropped
322  * and for some paths there is no check at all.
323  * A more general error queue to queue errors for later handling
324  * is probably better.
325  *
326  */
327 
328 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
329 {
330 	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
331 	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
332 	struct inet_connection_sock *icsk;
333 	struct tcp_sock *tp;
334 	struct inet_sock *inet;
335 	const int type = icmp_hdr(icmp_skb)->type;
336 	const int code = icmp_hdr(icmp_skb)->code;
337 	struct sock *sk;
338 	struct sk_buff *skb;
339 	struct request_sock *fastopen;
340 	__u32 seq, snd_una;
341 	__u32 remaining;
342 	int err;
343 	struct net *net = dev_net(icmp_skb->dev);
344 
345 	if (icmp_skb->len < (iph->ihl << 2) + 8) {
346 		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
347 		return;
348 	}
349 
350 	sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
351 			iph->saddr, th->source, inet_iif(icmp_skb));
352 	if (!sk) {
353 		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
354 		return;
355 	}
356 	if (sk->sk_state == TCP_TIME_WAIT) {
357 		inet_twsk_put(inet_twsk(sk));
358 		return;
359 	}
360 
361 	bh_lock_sock(sk);
362 	/* If too many ICMPs get dropped on busy
363 	 * servers this needs to be solved differently.
364 	 * We do take care of the PMTU discovery (RFC1191) special case:
365 	 * we can receive locally generated ICMP messages while the socket is held.
366 	 */
367 	if (sock_owned_by_user(sk)) {
368 		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
369 			NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
370 	}
371 	if (sk->sk_state == TCP_CLOSE)
372 		goto out;
373 
374 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
375 		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
376 		goto out;
377 	}
378 
379 	icsk = inet_csk(sk);
380 	tp = tcp_sk(sk);
381 	seq = ntohl(th->seq);
382 	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
383 	fastopen = tp->fastopen_rsk;
384 	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
385 	if (sk->sk_state != TCP_LISTEN &&
386 	    !between(seq, snd_una, tp->snd_nxt)) {
387 		NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
388 		goto out;
389 	}
390 
391 	switch (type) {
392 	case ICMP_REDIRECT:
393 		do_redirect(icmp_skb, sk);
394 		goto out;
395 	case ICMP_SOURCE_QUENCH:
396 		/* Just silently ignore these. */
397 		goto out;
398 	case ICMP_PARAMETERPROB:
399 		err = EPROTO;
400 		break;
401 	case ICMP_DEST_UNREACH:
402 		if (code > NR_ICMP_UNREACH)
403 			goto out;
404 
405 		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
406 			/* We are not interested in TCP_LISTEN and open_requests
407 			 * (SYN-ACKs sent out by Linux are always < 576 bytes so
408 			 * they should go through unfragmented).
409 			 */
410 			if (sk->sk_state == TCP_LISTEN)
411 				goto out;
412 
413 			tp->mtu_info = info;
414 			if (!sock_owned_by_user(sk)) {
415 				tcp_v4_mtu_reduced(sk);
416 			} else {
417 				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
418 					sock_hold(sk);
419 			}
420 			goto out;
421 		}
422 
423 		err = icmp_err_convert[code].errno;
424 		/* check if icmp_skb allows revert of backoff
425 		 * (see draft-zimmermann-tcp-lcd) */
426 		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
427 			break;
428 		if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
429 		    !icsk->icsk_backoff || fastopen)
430 			break;
431 
432 		if (sock_owned_by_user(sk))
433 			break;
434 
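		/* Revert one level of exponential backoff and recompute the
		 * RTO from the current smoothed RTT.
		 */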
435 		icsk->icsk_backoff--;
436 		inet_csk(sk)->icsk_rto = (tp->srtt_us ? __tcp_set_rto(tp) :
437 			TCP_TIMEOUT_INIT) << icsk->icsk_backoff;
438 		tcp_bound_rto(sk);
439 
440 		skb = tcp_write_queue_head(sk);
441 		BUG_ON(!skb);
442 
443 		remaining = icsk->icsk_rto - min(icsk->icsk_rto,
444 				tcp_time_stamp - TCP_SKB_CB(skb)->when);
445 
446 		if (remaining) {
447 			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
448 						  remaining, TCP_RTO_MAX);
449 		} else {
450 			/* RTO revert clocked out retransmission.
451 			 * Will retransmit now */
452 			tcp_retransmit_timer(sk);
453 		}
454 
455 		break;
456 	case ICMP_TIME_EXCEEDED:
457 		err = EHOSTUNREACH;
458 		break;
459 	default:
460 		goto out;
461 	}
462 
463 	switch (sk->sk_state) {
464 		struct request_sock *req, **prev;
465 	case TCP_LISTEN:
466 		if (sock_owned_by_user(sk))
467 			goto out;
468 
469 		req = inet_csk_search_req(sk, &prev, th->dest,
470 					  iph->daddr, iph->saddr);
471 		if (!req)
472 			goto out;
473 
474 		/* ICMPs are not backlogged, hence we cannot get
475 		   an established socket here.
476 		 */
477 		WARN_ON(req->sk);
478 
479 		if (seq != tcp_rsk(req)->snt_isn) {
480 			NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
481 			goto out;
482 		}
483 
484 		/*
485 		 * Still in SYN_RECV, just remove it silently.
486 		 * There is no good way to pass the error to the newly
487 		 * created socket, and POSIX does not want network
488 		 * errors returned from accept().
489 		 */
490 		inet_csk_reqsk_queue_drop(sk, req, prev);
491 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
492 		goto out;
493 
494 	case TCP_SYN_SENT:
495 	case TCP_SYN_RECV:
496 		/* Only in fast or simultaneous open. If a fast open socket is
497 		 * already accepted it is treated as a connected one below.
498 		 */
499 		if (fastopen && fastopen->sk == NULL)
500 			break;
501 
502 		if (!sock_owned_by_user(sk)) {
503 			sk->sk_err = err;
504 
505 			sk->sk_error_report(sk);
506 
507 			tcp_done(sk);
508 		} else {
509 			sk->sk_err_soft = err;
510 		}
511 		goto out;
512 	}
513 
514 	/* If we've already connected we will keep trying
515 	 * until we time out, or the user gives up.
516 	 *
517 	 * RFC 1122 4.2.3.9 allows only PROTO_UNREACH and PORT_UNREACH
518 	 * to be considered hard errors (well, FRAG_FAILED too,
519 	 * but it is obsoleted by PMTU discovery).
520 	 *
521 	 * Note that in the modern internet, where routing is unreliable
522 	 * and broken firewalls sit in every dark corner sending random
523 	 * errors ordered by their masters, even these two messages finally
524 	 * lose their original sense (even Linux sends invalid PORT_UNREACHs).
525 	 *
526 	 * Now we are in compliance with RFCs.
527 	 *							--ANK (980905)
528 	 */
529 
530 	inet = inet_sk(sk);
531 	if (!sock_owned_by_user(sk) && inet->recverr) {
532 		sk->sk_err = err;
533 		sk->sk_error_report(sk);
534 	} else	{ /* Only an error on timeout */
535 		sk->sk_err_soft = err;
536 	}
537 
538 out:
539 	bh_unlock_sock(sk);
540 	sock_put(sk);
541 }
542 
543 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
544 {
545 	struct tcphdr *th = tcp_hdr(skb);
546 
547 	if (skb->ip_summed == CHECKSUM_PARTIAL) {
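		/* Checksum offload: seed th->check with the pseudo-header sum
		 * and tell the device where to place the final checksum.
		 */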
548 		th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
549 		skb->csum_start = skb_transport_header(skb) - skb->head;
550 		skb->csum_offset = offsetof(struct tcphdr, check);
551 	} else {
552 		th->check = tcp_v4_check(skb->len, saddr, daddr,
553 					 csum_partial(th,
554 						      th->doff << 2,
555 						      skb->csum));
556 	}
557 }
558 
559 /* This routine computes an IPv4 TCP checksum. */
560 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
561 {
562 	const struct inet_sock *inet = inet_sk(sk);
563 
564 	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
565 }
566 EXPORT_SYMBOL(tcp_v4_send_check);
567 
568 /*
569  *	This routine will send an RST to the other tcp.
570  *
571  *	Someone asks: why do I NEVER use socket parameters (TOS, TTL, etc.)
572  *		      for the reset?
573  *	Answer: if a packet caused a RST, it is not for a socket
574  *		existing in our system; if it is matched to a socket,
575  *		it is just a duplicate segment or a bug in the other side's TCP.
576  *		So we build the reply based only on the parameters
577  *		that arrived with the segment.
578  *	Exception: precedence violation. We do not implement it in any case.
579  */
580 
581 static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
582 {
583 	const struct tcphdr *th = tcp_hdr(skb);
584 	struct {
585 		struct tcphdr th;
586 #ifdef CONFIG_TCP_MD5SIG
587 		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
588 #endif
589 	} rep;
590 	struct ip_reply_arg arg;
591 #ifdef CONFIG_TCP_MD5SIG
592 	struct tcp_md5sig_key *key;
593 	const __u8 *hash_location = NULL;
594 	unsigned char newhash[16];
595 	int genhash;
596 	struct sock *sk1 = NULL;
597 #endif
598 	struct net *net;
599 
600 	/* Never send a reset in response to a reset. */
601 	if (th->rst)
602 		return;
603 
604 	if (skb_rtable(skb)->rt_type != RTN_LOCAL)
605 		return;
606 
607 	/* Swap the send and the receive. */
608 	memset(&rep, 0, sizeof(rep));
609 	rep.th.dest   = th->source;
610 	rep.th.source = th->dest;
611 	rep.th.doff   = sizeof(struct tcphdr) / 4;
612 	rep.th.rst    = 1;
613 
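	/* Per RFC 793: if the offending segment carried an ACK, the RST takes
	 * its sequence number from that ACK; otherwise the RST acknowledges
	 * everything the segment occupied.
	 */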
614 	if (th->ack) {
615 		rep.th.seq = th->ack_seq;
616 	} else {
617 		rep.th.ack = 1;
618 		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
619 				       skb->len - (th->doff << 2));
620 	}
621 
622 	memset(&arg, 0, sizeof(arg));
623 	arg.iov[0].iov_base = (unsigned char *)&rep;
624 	arg.iov[0].iov_len  = sizeof(rep.th);
625 
626 #ifdef CONFIG_TCP_MD5SIG
627 	hash_location = tcp_parse_md5sig_option(th);
628 	if (!sk && hash_location) {
629 		/*
630 		 * The active side is lost. Try to find the listening socket through
631 		 * the source port, and then find the md5 key through that socket.
632 		 * We do not lose any security here:
633 		 * the incoming packet is checked against the md5 hash computed with
634 		 * the key we find; no RST is generated if the hash doesn't match.
635 		 */
636 		sk1 = __inet_lookup_listener(dev_net(skb_dst(skb)->dev),
637 					     &tcp_hashinfo, ip_hdr(skb)->saddr,
638 					     th->source, ip_hdr(skb)->daddr,
639 					     ntohs(th->source), inet_iif(skb));
640 		/* don't send a RST if we can't find a key */
641 		if (!sk1)
642 			return;
643 		rcu_read_lock();
644 		key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
645 					&ip_hdr(skb)->saddr, AF_INET);
646 		if (!key)
647 			goto release_sk1;
648 
649 		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, NULL, skb);
650 		if (genhash || memcmp(hash_location, newhash, 16) != 0)
651 			goto release_sk1;
652 	} else {
653 		key = sk ? tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
654 					     &ip_hdr(skb)->saddr,
655 					     AF_INET) : NULL;
656 	}
657 
658 	if (key) {
659 		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
660 				   (TCPOPT_NOP << 16) |
661 				   (TCPOPT_MD5SIG << 8) |
662 				   TCPOLEN_MD5SIG);
663 		/* Update length and the length the header thinks exists */
664 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
665 		rep.th.doff = arg.iov[0].iov_len / 4;
666 
667 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
668 				     key, ip_hdr(skb)->saddr,
669 				     ip_hdr(skb)->daddr, &rep.th);
670 	}
671 #endif
672 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
673 				      ip_hdr(skb)->saddr, /* XXX */
674 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
675 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
676 	arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
677 	/* When the socket is gone, all binding information is lost and
678 	 * routing might fail. No choice here: if we chose to force the
679 	 * input interface, we would misroute in the case of an asymmetric route.
680 	 */
681 	if (sk)
682 		arg.bound_dev_if = sk->sk_bound_dev_if;
683 
684 	net = dev_net(skb_dst(skb)->dev);
685 	arg.tos = ip_hdr(skb)->tos;
686 	ip_send_unicast_reply(net, skb, ip_hdr(skb)->saddr,
687 			      ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len);
688 
689 	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
690 	TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
691 
692 #ifdef CONFIG_TCP_MD5SIG
693 release_sk1:
694 	if (sk1) {
695 		rcu_read_unlock();
696 		sock_put(sk1);
697 	}
698 #endif
699 }
700 
701 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
702    outside of socket context, is certainly ugly. What can I do?
703  */
704 
705 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
706 			    u32 win, u32 tsval, u32 tsecr, int oif,
707 			    struct tcp_md5sig_key *key,
708 			    int reply_flags, u8 tos)
709 {
710 	const struct tcphdr *th = tcp_hdr(skb);
711 	struct {
712 		struct tcphdr th;
713 		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
714 #ifdef CONFIG_TCP_MD5SIG
715 			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
716 #endif
717 			];
718 	} rep;
719 	struct ip_reply_arg arg;
720 	struct net *net = dev_net(skb_dst(skb)->dev);
721 
722 	memset(&rep.th, 0, sizeof(struct tcphdr));
723 	memset(&arg, 0, sizeof(arg));
724 
725 	arg.iov[0].iov_base = (unsigned char *)&rep;
726 	arg.iov[0].iov_len  = sizeof(rep.th);
727 	if (tsecr) {
728 		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
729 				   (TCPOPT_TIMESTAMP << 8) |
730 				   TCPOLEN_TIMESTAMP);
731 		rep.opt[1] = htonl(tsval);
732 		rep.opt[2] = htonl(tsecr);
733 		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
734 	}
735 
736 	/* Swap the send and the receive. */
737 	rep.th.dest    = th->source;
738 	rep.th.source  = th->dest;
739 	rep.th.doff    = arg.iov[0].iov_len / 4;
740 	rep.th.seq     = htonl(seq);
741 	rep.th.ack_seq = htonl(ack);
742 	rep.th.ack     = 1;
743 	rep.th.window  = htons(win);
744 
745 #ifdef CONFIG_TCP_MD5SIG
746 	if (key) {
747 		int offset = (tsecr) ? 3 : 0;
748 
749 		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
750 					  (TCPOPT_NOP << 16) |
751 					  (TCPOPT_MD5SIG << 8) |
752 					  TCPOLEN_MD5SIG);
753 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
754 		rep.th.doff = arg.iov[0].iov_len/4;
755 
756 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
757 				    key, ip_hdr(skb)->saddr,
758 				    ip_hdr(skb)->daddr, &rep.th);
759 	}
760 #endif
761 	arg.flags = reply_flags;
762 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
763 				      ip_hdr(skb)->saddr, /* XXX */
764 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
765 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
766 	if (oif)
767 		arg.bound_dev_if = oif;
768 	arg.tos = tos;
769 	ip_send_unicast_reply(net, skb, ip_hdr(skb)->saddr,
770 			      ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len);
771 
772 	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
773 }
774 
775 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
776 {
777 	struct inet_timewait_sock *tw = inet_twsk(sk);
778 	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
779 
780 	tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
781 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
782 			tcp_time_stamp + tcptw->tw_ts_offset,
783 			tcptw->tw_ts_recent,
784 			tw->tw_bound_dev_if,
785 			tcp_twsk_md5_key(tcptw),
786 			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
787 			tw->tw_tos
788 			);
789 
790 	inet_twsk_put(tw);
791 }
792 
793 static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
794 				  struct request_sock *req)
795 {
796 	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
797 	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
798 	 */
799 	tcp_v4_send_ack(skb, (sk->sk_state == TCP_LISTEN) ?
800 			tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
801 			tcp_rsk(req)->rcv_nxt, req->rcv_wnd,
802 			tcp_time_stamp,
803 			req->ts_recent,
804 			0,
805 			tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
806 					  AF_INET),
807 			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
808 			ip_hdr(skb)->tos);
809 }
810 
811 /*
812  *	Send a SYN-ACK after having received a SYN.
813  *	This still operates on a request_sock only, not on a big
814  *	socket.
815  */
816 static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
817 			      struct request_sock *req,
818 			      u16 queue_mapping,
819 			      struct tcp_fastopen_cookie *foc)
820 {
821 	const struct inet_request_sock *ireq = inet_rsk(req);
822 	struct flowi4 fl4;
823 	int err = -1;
824 	struct sk_buff *skb;
825 
826 	/* First, grab a route. */
827 	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
828 		return -1;
829 
830 	skb = tcp_make_synack(sk, dst, req, foc);
831 
832 	if (skb) {
833 		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
834 
835 		skb_set_queue_mapping(skb, queue_mapping);
836 		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
837 					    ireq->ir_rmt_addr,
838 					    ireq->opt);
839 		err = net_xmit_eval(err);
840 		if (!tcp_rsk(req)->snt_synack && !err)
841 			tcp_rsk(req)->snt_synack = tcp_time_stamp;
842 	}
843 
844 	return err;
845 }
846 
847 static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req)
848 {
849 	int res = tcp_v4_send_synack(sk, NULL, req, 0, NULL);
850 
851 	if (!res) {
852 		TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
853 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
854 	}
855 	return res;
856 }
857 
858 /*
859  *	IPv4 request_sock destructor.
860  */
861 static void tcp_v4_reqsk_destructor(struct request_sock *req)
862 {
863 	kfree(inet_rsk(req)->opt);
864 }
865 
866 /*
867  * Return true if a syncookie should be sent
868  */
869 bool tcp_syn_flood_action(struct sock *sk,
870 			 const struct sk_buff *skb,
871 			 const char *proto)
872 {
873 	const char *msg = "Dropping request";
874 	bool want_cookie = false;
875 	struct listen_sock *lopt;
876 
877 #ifdef CONFIG_SYN_COOKIES
878 	if (sysctl_tcp_syncookies) {
879 		msg = "Sending cookies";
880 		want_cookie = true;
881 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES);
882 	} else
883 #endif
884 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);
885 
886 	lopt = inet_csk(sk)->icsk_accept_queue.listen_opt;
887 	if (!lopt->synflood_warned && sysctl_tcp_syncookies != 2) {
888 		lopt->synflood_warned = 1;
889 		pr_info("%s: Possible SYN flooding on port %d. %s.  Check SNMP counters.\n",
890 			proto, ntohs(tcp_hdr(skb)->dest), msg);
891 	}
892 	return want_cookie;
893 }
894 EXPORT_SYMBOL(tcp_syn_flood_action);
895 
896 /*
897  * Save and compile IPv4 options into the request_sock if needed.
898  */
899 static struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb)
900 {
901 	const struct ip_options *opt = &(IPCB(skb)->opt);
902 	struct ip_options_rcu *dopt = NULL;
903 
904 	if (opt && opt->optlen) {
905 		int opt_size = sizeof(*dopt) + opt->optlen;
906 
907 		dopt = kmalloc(opt_size, GFP_ATOMIC);
908 		if (dopt) {
909 			if (ip_options_echo(&dopt->opt, skb)) {
910 				kfree(dopt);
911 				dopt = NULL;
912 			}
913 		}
914 	}
915 	return dopt;
916 }
917 
918 #ifdef CONFIG_TCP_MD5SIG
919 /*
920  * RFC2385 MD5 checksumming requires a mapping of
921  * IP address->MD5 Key.
922  * We need to maintain these in the sk structure.
923  */
924 
925 /* Find the Key structure for an address.  */
926 struct tcp_md5sig_key *tcp_md5_do_lookup(struct sock *sk,
927 					 const union tcp_md5_addr *addr,
928 					 int family)
929 {
930 	struct tcp_sock *tp = tcp_sk(sk);
931 	struct tcp_md5sig_key *key;
932 	unsigned int size = sizeof(struct in_addr);
933 	struct tcp_md5sig_info *md5sig;
934 
935 	/* caller either holds rcu_read_lock() or socket lock */
936 	md5sig = rcu_dereference_check(tp->md5sig_info,
937 				       sock_owned_by_user(sk) ||
938 				       lockdep_is_held(&sk->sk_lock.slock));
939 	if (!md5sig)
940 		return NULL;
941 #if IS_ENABLED(CONFIG_IPV6)
942 	if (family == AF_INET6)
943 		size = sizeof(struct in6_addr);
944 #endif
945 	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
946 		if (key->family != family)
947 			continue;
948 		if (!memcmp(&key->addr, addr, size))
949 			return key;
950 	}
951 	return NULL;
952 }
953 EXPORT_SYMBOL(tcp_md5_do_lookup);
954 
955 struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
956 					 struct sock *addr_sk)
957 {
958 	union tcp_md5_addr *addr;
959 
960 	addr = (union tcp_md5_addr *)&inet_sk(addr_sk)->inet_daddr;
961 	return tcp_md5_do_lookup(sk, addr, AF_INET);
962 }
963 EXPORT_SYMBOL(tcp_v4_md5_lookup);
964 
965 static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
966 						      struct request_sock *req)
967 {
968 	union tcp_md5_addr *addr;
969 
970 	addr = (union tcp_md5_addr *)&inet_rsk(req)->ir_rmt_addr;
971 	return tcp_md5_do_lookup(sk, addr, AF_INET);
972 }
973 
974 /* This can be called on a newly created socket, from other files */
975 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
976 		   int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
977 {
978 	/* Add Key to the list */
979 	struct tcp_md5sig_key *key;
980 	struct tcp_sock *tp = tcp_sk(sk);
981 	struct tcp_md5sig_info *md5sig;
982 
983 	key = tcp_md5_do_lookup(sk, addr, family);
984 	if (key) {
985 		/* Pre-existing entry - just update that one. */
986 		memcpy(key->key, newkey, newkeylen);
987 		key->keylen = newkeylen;
988 		return 0;
989 	}
990 
991 	md5sig = rcu_dereference_protected(tp->md5sig_info,
992 					   sock_owned_by_user(sk));
993 	if (!md5sig) {
994 		md5sig = kmalloc(sizeof(*md5sig), gfp);
995 		if (!md5sig)
996 			return -ENOMEM;
997 
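		/* MD5 signatures must cover each segment as transmitted, so
		 * segmentation offload cannot be used on this socket.
		 */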
998 		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
999 		INIT_HLIST_HEAD(&md5sig->head);
1000 		rcu_assign_pointer(tp->md5sig_info, md5sig);
1001 	}
1002 
1003 	key = sock_kmalloc(sk, sizeof(*key), gfp);
1004 	if (!key)
1005 		return -ENOMEM;
1006 	if (!tcp_alloc_md5sig_pool()) {
1007 		sock_kfree_s(sk, key, sizeof(*key));
1008 		return -ENOMEM;
1009 	}
1010 
1011 	memcpy(key->key, newkey, newkeylen);
1012 	key->keylen = newkeylen;
1013 	key->family = family;
1014 	memcpy(&key->addr, addr,
1015 	       (family == AF_INET6) ? sizeof(struct in6_addr) :
1016 				      sizeof(struct in_addr));
1017 	hlist_add_head_rcu(&key->node, &md5sig->head);
1018 	return 0;
1019 }
1020 EXPORT_SYMBOL(tcp_md5_do_add);
1021 
1022 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
1023 {
1024 	struct tcp_md5sig_key *key;
1025 
1026 	key = tcp_md5_do_lookup(sk, addr, family);
1027 	if (!key)
1028 		return -ENOENT;
1029 	hlist_del_rcu(&key->node);
1030 	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1031 	kfree_rcu(key, rcu);
1032 	return 0;
1033 }
1034 EXPORT_SYMBOL(tcp_md5_do_del);
1035 
1036 static void tcp_clear_md5_list(struct sock *sk)
1037 {
1038 	struct tcp_sock *tp = tcp_sk(sk);
1039 	struct tcp_md5sig_key *key;
1040 	struct hlist_node *n;
1041 	struct tcp_md5sig_info *md5sig;
1042 
1043 	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1044 
1045 	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1046 		hlist_del_rcu(&key->node);
1047 		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1048 		kfree_rcu(key, rcu);
1049 	}
1050 }
1051 
1052 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
1053 				 int optlen)
1054 {
1055 	struct tcp_md5sig cmd;
1056 	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1057 
1058 	if (optlen < sizeof(cmd))
1059 		return -EINVAL;
1060 
1061 	if (copy_from_user(&cmd, optval, sizeof(cmd)))
1062 		return -EFAULT;
1063 
1064 	if (sin->sin_family != AF_INET)
1065 		return -EINVAL;
1066 
1067 	if (!cmd.tcpm_key || !cmd.tcpm_keylen)
1068 		return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1069 				      AF_INET);
1070 
1071 	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1072 		return -EINVAL;
1073 
1074 	return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1075 			      AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
1076 			      GFP_KERNEL);
1077 }
1078 
1079 static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
1080 					__be32 daddr, __be32 saddr, int nbytes)
1081 {
1082 	struct tcp4_pseudohdr *bp;
1083 	struct scatterlist sg;
1084 
1085 	bp = &hp->md5_blk.ip4;
1086 
1087 	/*
1088 	 * 1. the TCP pseudo-header (in the order: source IP address,
1089 	 * destination IP address, zero-padded protocol number, and
1090 	 * segment length)
1091 	 */
1092 	bp->saddr = saddr;
1093 	bp->daddr = daddr;
1094 	bp->pad = 0;
1095 	bp->protocol = IPPROTO_TCP;
1096 	bp->len = cpu_to_be16(nbytes);
1097 
1098 	sg_init_one(&sg, bp, sizeof(*bp));
1099 	return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
1100 }
1101 
1102 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1103 			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
1104 {
1105 	struct tcp_md5sig_pool *hp;
1106 	struct hash_desc *desc;
1107 
1108 	hp = tcp_get_md5sig_pool();
1109 	if (!hp)
1110 		goto clear_hash_noput;
1111 	desc = &hp->md5_desc;
1112 
1113 	if (crypto_hash_init(desc))
1114 		goto clear_hash;
1115 	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
1116 		goto clear_hash;
1117 	if (tcp_md5_hash_header(hp, th))
1118 		goto clear_hash;
1119 	if (tcp_md5_hash_key(hp, key))
1120 		goto clear_hash;
1121 	if (crypto_hash_final(desc, md5_hash))
1122 		goto clear_hash;
1123 
1124 	tcp_put_md5sig_pool();
1125 	return 0;
1126 
1127 clear_hash:
1128 	tcp_put_md5sig_pool();
1129 clear_hash_noput:
1130 	memset(md5_hash, 0, 16);
1131 	return 1;
1132 }
1133 
1134 int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
1135 			const struct sock *sk, const struct request_sock *req,
1136 			const struct sk_buff *skb)
1137 {
1138 	struct tcp_md5sig_pool *hp;
1139 	struct hash_desc *desc;
1140 	const struct tcphdr *th = tcp_hdr(skb);
1141 	__be32 saddr, daddr;
1142 
1143 	if (sk) {
1144 		saddr = inet_sk(sk)->inet_saddr;
1145 		daddr = inet_sk(sk)->inet_daddr;
1146 	} else if (req) {
1147 		saddr = inet_rsk(req)->ir_loc_addr;
1148 		daddr = inet_rsk(req)->ir_rmt_addr;
1149 	} else {
1150 		const struct iphdr *iph = ip_hdr(skb);
1151 		saddr = iph->saddr;
1152 		daddr = iph->daddr;
1153 	}
1154 
1155 	hp = tcp_get_md5sig_pool();
1156 	if (!hp)
1157 		goto clear_hash_noput;
1158 	desc = &hp->md5_desc;
1159 
1160 	if (crypto_hash_init(desc))
1161 		goto clear_hash;
1162 
1163 	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
1164 		goto clear_hash;
1165 	if (tcp_md5_hash_header(hp, th))
1166 		goto clear_hash;
1167 	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1168 		goto clear_hash;
1169 	if (tcp_md5_hash_key(hp, key))
1170 		goto clear_hash;
1171 	if (crypto_hash_final(desc, md5_hash))
1172 		goto clear_hash;
1173 
1174 	tcp_put_md5sig_pool();
1175 	return 0;
1176 
1177 clear_hash:
1178 	tcp_put_md5sig_pool();
1179 clear_hash_noput:
1180 	memset(md5_hash, 0, 16);
1181 	return 1;
1182 }
1183 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1184 
1185 static bool tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb)
1186 {
1187 	/*
1188 	 * This gets called for each TCP segment that arrives
1189 	 * so we want to be efficient.
1190 	 * We have 3 drop cases:
1191 	 * o No MD5 hash and one expected.
1192 	 * o MD5 hash and we're not expecting one.
1193 	 * o MD5 hash and it's wrong.
1194 	 */
1195 	const __u8 *hash_location = NULL;
1196 	struct tcp_md5sig_key *hash_expected;
1197 	const struct iphdr *iph = ip_hdr(skb);
1198 	const struct tcphdr *th = tcp_hdr(skb);
1199 	int genhash;
1200 	unsigned char newhash[16];
1201 
1202 	hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1203 					  AF_INET);
1204 	hash_location = tcp_parse_md5sig_option(th);
1205 
1206 	/* We've parsed the options - do we have a hash? */
1207 	if (!hash_expected && !hash_location)
1208 		return false;
1209 
1210 	if (hash_expected && !hash_location) {
1211 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1212 		return true;
1213 	}
1214 
1215 	if (!hash_expected && hash_location) {
1216 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1217 		return true;
1218 	}
1219 
1220 	/* Okay, so this is hash_expected and hash_location -
1221 	 * so we need to calculate the checksum.
1222 	 */
1223 	genhash = tcp_v4_md5_hash_skb(newhash,
1224 				      hash_expected,
1225 				      NULL, NULL, skb);
1226 
1227 	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1228 		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1229 				     &iph->saddr, ntohs(th->source),
1230 				     &iph->daddr, ntohs(th->dest),
1231 				     genhash ? " tcp_v4_calc_md5_hash failed"
1232 				     : "");
1233 		return true;
1234 	}
1235 	return false;
1236 }
1237 
1238 #endif
1239 
1240 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1241 	.family		=	PF_INET,
1242 	.obj_size	=	sizeof(struct tcp_request_sock),
1243 	.rtx_syn_ack	=	tcp_v4_rtx_synack,
1244 	.send_ack	=	tcp_v4_reqsk_send_ack,
1245 	.destructor	=	tcp_v4_reqsk_destructor,
1246 	.send_reset	=	tcp_v4_send_reset,
1247 	.syn_ack_timeout = 	tcp_syn_ack_timeout,
1248 };
1249 
1250 #ifdef CONFIG_TCP_MD5SIG
1251 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1252 	.md5_lookup	=	tcp_v4_reqsk_md5_lookup,
1253 	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1254 };
1255 #endif
1256 
1257 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1258 {
1259 	struct tcp_options_received tmp_opt;
1260 	struct request_sock *req;
1261 	struct inet_request_sock *ireq;
1262 	struct tcp_sock *tp = tcp_sk(sk);
1263 	struct dst_entry *dst = NULL;
1264 	__be32 saddr = ip_hdr(skb)->saddr;
1265 	__be32 daddr = ip_hdr(skb)->daddr;
1266 	__u32 isn = TCP_SKB_CB(skb)->when;
1267 	bool want_cookie = false, fastopen;
1268 	struct flowi4 fl4;
1269 	struct tcp_fastopen_cookie foc = { .len = -1 };
1270 	int err;
1271 
1272 	/* Never answer SYNs sent to a broadcast or multicast address */
1273 	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1274 		goto drop;
1275 
1276 	/* TW buckets are converted to open requests without
1277 	 * limitation, since they conserve resources and the peer is
1278 	 * evidently a real one.
1279 	 */
1280 	if ((sysctl_tcp_syncookies == 2 ||
1281 	     inet_csk_reqsk_queue_is_full(sk)) && !isn) {
1282 		want_cookie = tcp_syn_flood_action(sk, skb, "TCP");
1283 		if (!want_cookie)
1284 			goto drop;
1285 	}
1286 
1287 	/* Accept backlog is full. If we have already queued enough
1288 	 * warm entries in the syn queue, drop the request. It is better than
1289 	 * clogging the syn queue with openreqs with an exponentially increasing
1290 	 * timeout.
1291 	 */
1292 	if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) {
1293 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1294 		goto drop;
1295 	}
1296 
1297 	req = inet_reqsk_alloc(&tcp_request_sock_ops);
1298 	if (!req)
1299 		goto drop;
1300 
1301 #ifdef CONFIG_TCP_MD5SIG
1302 	tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
1303 #endif
1304 
1305 	tcp_clear_options(&tmp_opt);
1306 	tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
1307 	tmp_opt.user_mss  = tp->rx_opt.user_mss;
1308 	tcp_parse_options(skb, &tmp_opt, 0, want_cookie ? NULL : &foc);
1309 
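	/* Syncookies can only encode extra options via the timestamp value;
	 * without timestamps, drop the options rather than keep state we
	 * could not reconstruct from the cookie.
	 */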
1310 	if (want_cookie && !tmp_opt.saw_tstamp)
1311 		tcp_clear_options(&tmp_opt);
1312 
1313 	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1314 	tcp_openreq_init(req, &tmp_opt, skb);
1315 
1316 	ireq = inet_rsk(req);
1317 	ireq->ir_loc_addr = daddr;
1318 	ireq->ir_rmt_addr = saddr;
1319 	ireq->no_srccheck = inet_sk(sk)->transparent;
1320 	ireq->opt = tcp_v4_save_options(skb);
1321 	ireq->ir_mark = inet_request_mark(sk, skb);
1322 
1323 	if (security_inet_conn_request(sk, skb, req))
1324 		goto drop_and_free;
1325 
1326 	if (!want_cookie || tmp_opt.tstamp_ok)
1327 		TCP_ECN_create_request(req, skb, sock_net(sk));
1328 
1329 	if (want_cookie) {
1330 		isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1331 		req->cookie_ts = tmp_opt.tstamp_ok;
1332 	} else if (!isn) {
1333 		/* VJ's idea. We save the last timestamp seen
1334 		 * from the destination in the peer table when entering
1335 		 * TIME-WAIT state, and check against it before
1336 		 * accepting a new connection request.
1337 		 *
1338 		 * If "isn" is not zero, this request hit a live
1339 		 * timewait bucket, so all the necessary checks
1340 		 * were made in the function processing the timewait state.
1341 		 */
1342 		if (tmp_opt.saw_tstamp &&
1343 		    tcp_death_row.sysctl_tw_recycle &&
1344 		    (dst = inet_csk_route_req(sk, &fl4, req)) != NULL &&
1345 		    fl4.daddr == saddr) {
1346 			if (!tcp_peer_is_proven(req, dst, true)) {
1347 				NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
1348 				goto drop_and_release;
1349 			}
1350 		}
1351 		/* Kill the following clause if you dislike this approach. */
1352 		else if (!sysctl_tcp_syncookies &&
1353 			 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
1354 			  (sysctl_max_syn_backlog >> 2)) &&
1355 			 !tcp_peer_is_proven(req, dst, false)) {
1356 			/* Without syncookies the last quarter of the
1357 			 * backlog is reserved for destinations
1358 			 * proven to be alive.
1359 			 * It means that we continue to communicate only
1360 			 * with destinations already remembered
1361 			 * at the moment the synflood began.
1362 			 */
1363 			LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI4/%u\n"),
1364 				       &saddr, ntohs(tcp_hdr(skb)->source));
1365 			goto drop_and_release;
1366 		}
1367 
1368 		isn = tcp_v4_init_sequence(skb);
1369 	}
1370 	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
1371 		goto drop_and_free;
1372 
1373 	tcp_rsk(req)->snt_isn = isn;
1374 	tcp_rsk(req)->snt_synack = tcp_time_stamp;
1375 	tcp_openreq_init_rwin(req, sk, dst);
1376 	fastopen = !want_cookie &&
1377 		   tcp_try_fastopen(sk, skb, req, &foc, dst);
1378 	err = tcp_v4_send_synack(sk, dst, req,
1379 				 skb_get_queue_mapping(skb), &foc);
1380 	if (!fastopen) {
1381 		if (err || want_cookie)
1382 			goto drop_and_free;
1383 
1384 		tcp_rsk(req)->snt_synack = tcp_time_stamp;
1385 		tcp_rsk(req)->listener = NULL;
1386 		inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1387 	}
1388 
1389 	return 0;
1390 
1391 drop_and_release:
1392 	dst_release(dst);
1393 drop_and_free:
1394 	reqsk_free(req);
1395 drop:
1396 	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1397 	return 0;
1398 }
1399 EXPORT_SYMBOL(tcp_v4_conn_request);
1400 
1401 
1402 /*
1403  * The three way handshake has completed - we got a valid synack -
1404  * now create the new socket.
1405  */
1406 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1407 				  struct request_sock *req,
1408 				  struct dst_entry *dst)
1409 {
1410 	struct inet_request_sock *ireq;
1411 	struct inet_sock *newinet;
1412 	struct tcp_sock *newtp;
1413 	struct sock *newsk;
1414 #ifdef CONFIG_TCP_MD5SIG
1415 	struct tcp_md5sig_key *key;
1416 #endif
1417 	struct ip_options_rcu *inet_opt;
1418 
1419 	if (sk_acceptq_is_full(sk))
1420 		goto exit_overflow;
1421 
1422 	newsk = tcp_create_openreq_child(sk, req, skb);
1423 	if (!newsk)
1424 		goto exit_nonewsk;
1425 
1426 	newsk->sk_gso_type = SKB_GSO_TCPV4;
1427 	inet_sk_rx_dst_set(newsk, skb);
1428 
1429 	newtp		      = tcp_sk(newsk);
1430 	newinet		      = inet_sk(newsk);
1431 	ireq		      = inet_rsk(req);
1432 	newinet->inet_daddr   = ireq->ir_rmt_addr;
1433 	newinet->inet_rcv_saddr = ireq->ir_loc_addr;
1434 	newinet->inet_saddr	      = ireq->ir_loc_addr;
1435 	inet_opt	      = ireq->opt;
1436 	rcu_assign_pointer(newinet->inet_opt, inet_opt);
1437 	ireq->opt	      = NULL;
1438 	newinet->mc_index     = inet_iif(skb);
1439 	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1440 	newinet->rcv_tos      = ip_hdr(skb)->tos;
1441 	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1442 	if (inet_opt)
1443 		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1444 	newinet->inet_id = newtp->write_seq ^ jiffies;
1445 
1446 	if (!dst) {
1447 		dst = inet_csk_route_child_sock(sk, newsk, req);
1448 		if (!dst)
1449 			goto put_and_exit;
1450 	} else {
1451 		/* syncookie case: see end of cookie_v4_check() */
1452 	}
1453 	sk_setup_caps(newsk, dst);
1454 
1455 	tcp_sync_mss(newsk, dst_mtu(dst));
1456 	newtp->advmss = dst_metric_advmss(dst);
1457 	if (tcp_sk(sk)->rx_opt.user_mss &&
1458 	    tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1459 		newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1460 
1461 	tcp_initialize_rcv_mss(newsk);
1462 
1463 #ifdef CONFIG_TCP_MD5SIG
1464 	/* Copy over the MD5 key from the original socket */
1465 	key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1466 				AF_INET);
1467 	if (key != NULL) {
1468 		/*
1469 		 * We're using one, so create a matching key
1470 		 * on the newsk structure. If we fail to get
1471 		 * memory, then we end up not copying the key
1472 		 * across. Shucks.
1473 		 */
1474 		tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1475 			       AF_INET, key->key, key->keylen, GFP_ATOMIC);
1476 		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1477 	}
1478 #endif
1479 
1480 	if (__inet_inherit_port(sk, newsk) < 0)
1481 		goto put_and_exit;
1482 	__inet_hash_nolisten(newsk, NULL);
1483 
1484 	return newsk;
1485 
1486 exit_overflow:
1487 	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1488 exit_nonewsk:
1489 	dst_release(dst);
1490 exit:
1491 	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1492 	return NULL;
1493 put_and_exit:
1494 	inet_csk_prepare_forced_close(newsk);
1495 	tcp_done(newsk);
1496 	goto exit;
1497 }
1498 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1499 
1500 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1501 {
1502 	struct tcphdr *th = tcp_hdr(skb);
1503 	const struct iphdr *iph = ip_hdr(skb);
1504 	struct sock *nsk;
1505 	struct request_sock **prev;
1506 	/* Find possible connection requests. */
1507 	struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1508 						       iph->saddr, iph->daddr);
1509 	if (req)
1510 		return tcp_check_req(sk, skb, req, prev, false);
1511 
1512 	nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
1513 			th->source, iph->daddr, th->dest, inet_iif(skb));
1514 
1515 	if (nsk) {
1516 		if (nsk->sk_state != TCP_TIME_WAIT) {
1517 			bh_lock_sock(nsk);
1518 			return nsk;
1519 		}
1520 		inet_twsk_put(inet_twsk(nsk));
1521 		return NULL;
1522 	}
1523 
1524 #ifdef CONFIG_SYN_COOKIES
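	/* A bare ACK that matched no pending request may be the final ACK of a
	 * syncookie handshake: try to validate the cookie and rebuild the
	 * request from it.
	 */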
1525 	if (!th->syn)
1526 		sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1527 #endif
1528 	return sk;
1529 }
1530 
1531 /* The socket must have its spinlock held when we get
1532  * here.
1533  *
1534  * We have a potential double-lock case here, so even when
1535  * doing backlog processing we use the BH locking scheme.
1536  * This is because we cannot sleep with the original spinlock
1537  * held.
1538  */
1539 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1540 {
1541 	struct sock *rsk;
1542 #ifdef CONFIG_TCP_MD5SIG
1543 	/*
1544 	 * We really want to reject the packet as early as possible
1545 	 * if:
1546 	 *  o We're expecting an MD5'd packet and there is no MD5 tcp option
1547 	 *  o There is an MD5 option and we're not expecting one
1548 	 */
1549 	if (tcp_v4_inbound_md5_hash(sk, skb))
1550 		goto discard;
1551 #endif
1552 
1553 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1554 		struct dst_entry *dst = sk->sk_rx_dst;
1555 
1556 		sock_rps_save_rxhash(sk, skb);
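		/* Revalidate the cached input route; drop it if the packet
		 * arrived on a different interface or the dst went stale.
		 */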
1557 		if (dst) {
1558 			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1559 			    dst->ops->check(dst, 0) == NULL) {
1560 				dst_release(dst);
1561 				sk->sk_rx_dst = NULL;
1562 			}
1563 		}
1564 		tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len);
1565 		return 0;
1566 	}
1567 
1568 	if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1569 		goto csum_err;
1570 
1571 	if (sk->sk_state == TCP_LISTEN) {
1572 		struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1573 		if (!nsk)
1574 			goto discard;
1575 
1576 		if (nsk != sk) {
1577 			sock_rps_save_rxhash(nsk, skb);
1578 			if (tcp_child_process(sk, nsk, skb)) {
1579 				rsk = nsk;
1580 				goto reset;
1581 			}
1582 			return 0;
1583 		}
1584 	} else
1585 		sock_rps_save_rxhash(sk, skb);
1586 
1587 	if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1588 		rsk = sk;
1589 		goto reset;
1590 	}
1591 	return 0;
1592 
1593 reset:
1594 	tcp_v4_send_reset(rsk, skb);
1595 discard:
1596 	kfree_skb(skb);
1597 	/* Be careful here. If this function gets more complicated and
1598 	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1599 	 * might be destroyed here. This current version compiles correctly,
1600 	 * but you have been warned.
1601 	 */
1602 	return 0;
1603 
1604 csum_err:
1605 	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS);
1606 	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1607 	goto discard;
1608 }
1609 EXPORT_SYMBOL(tcp_v4_do_rcv);
1610 
1611 void tcp_v4_early_demux(struct sk_buff *skb)
1612 {
1613 	const struct iphdr *iph;
1614 	const struct tcphdr *th;
1615 	struct sock *sk;
1616 
1617 	if (skb->pkt_type != PACKET_HOST)
1618 		return;
1619 
1620 	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1621 		return;
1622 
1623 	iph = ip_hdr(skb);
1624 	th = tcp_hdr(skb);
1625 
1626 	if (th->doff < sizeof(struct tcphdr) / 4)
1627 		return;
1628 
1629 	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1630 				       iph->saddr, th->source,
1631 				       iph->daddr, ntohs(th->dest),
1632 				       skb->skb_iif);
1633 	if (sk) {
1634 		skb->sk = sk;
1635 		skb->destructor = sock_edemux;
1636 		if (sk->sk_state != TCP_TIME_WAIT) {
1637 			struct dst_entry *dst = sk->sk_rx_dst;
1638 
1639 			if (dst)
1640 				dst = dst_check(dst, 0);
1641 			if (dst &&
1642 			    inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1643 				skb_dst_set_noref(skb, dst);
1644 		}
1645 	}
1646 }
1647 
1648 /* Packet is added to VJ-style prequeue for processing in process
1649  * context, if a reader task is waiting. Apparently, this exciting
1650  * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93)
1651  * failed somewhere. Latency? Burstiness? Well, at least now we will
1652  * see why it failed. 8)8)				  --ANK
1653  *
1654  */
1655 bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
1656 {
1657 	struct tcp_sock *tp = tcp_sk(sk);
1658 
1659 	if (sysctl_tcp_low_latency || !tp->ucopy.task)
1660 		return false;
1661 
1662 	if (skb->len <= tcp_hdrlen(skb) &&
1663 	    skb_queue_len(&tp->ucopy.prequeue) == 0)
1664 		return false;
1665 
1666 	skb_dst_force(skb);
1667 	__skb_queue_tail(&tp->ucopy.prequeue, skb);
1668 	tp->ucopy.memory += skb->truesize;
1669 	if (tp->ucopy.memory > sk->sk_rcvbuf) {
1670 		struct sk_buff *skb1;
1671 
1672 		BUG_ON(sock_owned_by_user(sk));
1673 
1674 		while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) {
1675 			sk_backlog_rcv(sk, skb1);
1676 			NET_INC_STATS_BH(sock_net(sk),
1677 					 LINUX_MIB_TCPPREQUEUEDROPPED);
1678 		}
1679 
1680 		tp->ucopy.memory = 0;
1681 	} else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
1682 		wake_up_interruptible_sync_poll(sk_sleep(sk),
1683 					   POLLIN | POLLRDNORM | POLLRDBAND);
1684 		if (!inet_csk_ack_scheduled(sk))
1685 			inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
1686 						  (3 * tcp_rto_min(sk)) / 4,
1687 						  TCP_RTO_MAX);
1688 	}
1689 	return true;
1690 }
1691 EXPORT_SYMBOL(tcp_prequeue);
1692 
1693 /*
1694  *	From tcp_input.c
1695  */
1696 
1697 int tcp_v4_rcv(struct sk_buff *skb)
1698 {
1699 	const struct iphdr *iph;
1700 	const struct tcphdr *th;
1701 	struct sock *sk;
1702 	int ret;
1703 	struct net *net = dev_net(skb->dev);
1704 
1705 	if (skb->pkt_type != PACKET_HOST)
1706 		goto discard_it;
1707 
1708 	/* Count it even if it's bad */
1709 	TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1710 
1711 	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1712 		goto discard_it;
1713 
1714 	th = tcp_hdr(skb);
1715 
1716 	if (th->doff < sizeof(struct tcphdr) / 4)
1717 		goto bad_packet;
1718 	if (!pskb_may_pull(skb, th->doff * 4))
1719 		goto discard_it;
1720 
1721 	/* An explanation is required here, I think.
1722 	 * Packet length and doff are validated by header prediction,
1723 	 * provided the case of th->doff == 0 is eliminated.
1724 	 * So, we defer the checks. */
1725 
1726 	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1727 		goto csum_error;
1728 
1729 	th = tcp_hdr(skb);
1730 	iph = ip_hdr(skb);
1731 	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1732 	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1733 				    skb->len - th->doff * 4);
1734 	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1735 	TCP_SKB_CB(skb)->when	 = 0;
1736 	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1737 	TCP_SKB_CB(skb)->sacked	 = 0;
1738 
1739 	sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
1740 	if (!sk)
1741 		goto no_tcp_socket;
1742 
1743 process:
1744 	if (sk->sk_state == TCP_TIME_WAIT)
1745 		goto do_time_wait;
1746 
1747 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1748 		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
1749 		goto discard_and_relse;
1750 	}
1751 
1752 	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1753 		goto discard_and_relse;
1754 	nf_reset(skb);
1755 
1756 	if (sk_filter(sk, skb))
1757 		goto discard_and_relse;
1758 
1759 	sk_mark_napi_id(sk, skb);
1760 	skb->dev = NULL;
1761 
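	/* If the socket is not owned by a user task the segment is handled
	 * here (possibly via the prequeue); otherwise it is queued to the
	 * socket backlog below.
	 */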
1762 	bh_lock_sock_nested(sk);
1763 	ret = 0;
1764 	if (!sock_owned_by_user(sk)) {
1765 #ifdef CONFIG_NET_DMA
1766 		struct tcp_sock *tp = tcp_sk(sk);
1767 		if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1768 			tp->ucopy.dma_chan = net_dma_find_channel();
1769 		if (tp->ucopy.dma_chan)
1770 			ret = tcp_v4_do_rcv(sk, skb);
1771 		else
1772 #endif
1773 		{
1774 			if (!tcp_prequeue(sk, skb))
1775 				ret = tcp_v4_do_rcv(sk, skb);
1776 		}
1777 	} else if (unlikely(sk_add_backlog(sk, skb,
1778 					   sk->sk_rcvbuf + sk->sk_sndbuf))) {
1779 		bh_unlock_sock(sk);
1780 		NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
1781 		goto discard_and_relse;
1782 	}
1783 	bh_unlock_sock(sk);
1784 
1785 	sock_put(sk);
1786 
1787 	return ret;
1788 
1789 no_tcp_socket:
1790 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1791 		goto discard_it;
1792 
1793 	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1794 csum_error:
1795 		TCP_INC_STATS_BH(net, TCP_MIB_CSUMERRORS);
1796 bad_packet:
1797 		TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1798 	} else {
1799 		tcp_v4_send_reset(NULL, skb);
1800 	}
1801 
1802 discard_it:
1803 	/* Discard frame. */
1804 	kfree_skb(skb);
1805 	return 0;
1806 
1807 discard_and_relse:
1808 	sock_put(sk);
1809 	goto discard_it;
1810 
1811 do_time_wait:
1812 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1813 		inet_twsk_put(inet_twsk(sk));
1814 		goto discard_it;
1815 	}
1816 
1817 	if (skb->len < (th->doff << 2)) {
1818 		inet_twsk_put(inet_twsk(sk));
1819 		goto bad_packet;
1820 	}
1821 	if (tcp_checksum_complete(skb)) {
1822 		inet_twsk_put(inet_twsk(sk));
1823 		goto csum_error;
1824 	}
1825 	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1826 	case TCP_TW_SYN: {
1827 		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1828 							&tcp_hashinfo,
1829 							iph->saddr, th->source,
1830 							iph->daddr, th->dest,
1831 							inet_iif(skb));
1832 		if (sk2) {
1833 			inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
1834 			inet_twsk_put(inet_twsk(sk));
1835 			sk = sk2;
1836 			goto process;
1837 		}
1838 		/* Fall through to ACK */
1839 	}
1840 	case TCP_TW_ACK:
1841 		tcp_v4_timewait_ack(sk, skb);
1842 		break;
1843 	case TCP_TW_RST:
1844 		goto no_tcp_socket;
1845 	case TCP_TW_SUCCESS:;
1846 	}
1847 	goto discard_it;
1848 }
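
The min_ttl drop in the receive path above is the kernel side of RFC 5082 (GTSM): the threshold it compares iph->ttl against is configured from user space with the IP_MINTTL socket option. A minimal, purely illustrative user-space sketch (not part of this file):

#include <stdio.h>
#include <sys/socket.h>
#include <netinet/in.h>

#ifndef IP_MINTTL
#define IP_MINTTL 21		/* value from <linux/in.h>; older libcs may not define it */
#endif

int main(void)
{
	int fd = socket(AF_INET, SOCK_STREAM, 0);
	int min_ttl = 255;	/* accept only peers that are one hop away */

	if (fd < 0 || setsockopt(fd, IPPROTO_IP, IP_MINTTL,
				 &min_ttl, sizeof(min_ttl)) < 0) {
		perror("IP_MINTTL");
		return 1;
	}
	/* Segments arriving with a smaller TTL are now dropped and counted
	 * as LINUX_MIB_TCPMINTTLDROP, as in the check above. */
	return 0;
}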
1849 
1850 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1851 	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
1852 	.twsk_unique	= tcp_twsk_unique,
1853 	.twsk_destructor= tcp_twsk_destructor,
1854 };
1855 
1856 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
1857 {
1858 	struct dst_entry *dst = skb_dst(skb);
1859 
1860 	dst_hold(dst);
1861 	sk->sk_rx_dst = dst;
1862 	inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
1863 }
1864 EXPORT_SYMBOL(inet_sk_rx_dst_set);
1865 
1866 const struct inet_connection_sock_af_ops ipv4_specific = {
1867 	.queue_xmit	   = ip_queue_xmit,
1868 	.send_check	   = tcp_v4_send_check,
1869 	.rebuild_header	   = inet_sk_rebuild_header,
1870 	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
1871 	.conn_request	   = tcp_v4_conn_request,
1872 	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
1873 	.net_header_len	   = sizeof(struct iphdr),
1874 	.setsockopt	   = ip_setsockopt,
1875 	.getsockopt	   = ip_getsockopt,
1876 	.addr2sockaddr	   = inet_csk_addr2sockaddr,
1877 	.sockaddr_len	   = sizeof(struct sockaddr_in),
1878 	.bind_conflict	   = inet_csk_bind_conflict,
1879 #ifdef CONFIG_COMPAT
1880 	.compat_setsockopt = compat_ip_setsockopt,
1881 	.compat_getsockopt = compat_ip_getsockopt,
1882 #endif
1883 };
1884 EXPORT_SYMBOL(ipv4_specific);
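
ipv4_specific is the IPv4 instance of the per-address-family ops table that connection-oriented sockets dispatch through via icsk->icsk_af_ops; tcp_ipv6.c supplies a parallel table, so the shared TCP code never branches on the address family. A standalone, invented sketch of that dispatch pattern (the demo_* names are illustrative, not kernel APIs):

#include <stdio.h>

struct demo_af_ops {
	int net_header_len;
	void (*send_check)(void);
};

static void v4_send_check(void) { printf("IPv4 pseudo-header checksum\n"); }
static void v6_send_check(void) { printf("IPv6 pseudo-header checksum\n"); }

static const struct demo_af_ops demo_v4 = { 20, v4_send_check };
static const struct demo_af_ops demo_v6 = { 40, v6_send_check };

/* Family-independent code: everything goes through the ops pointer that was
 * installed once when the socket was created. */
static void xmit_one(const struct demo_af_ops *ops)
{
	printf("reserving %d bytes of network header\n", ops->net_header_len);
	ops->send_check();
}

int main(void)
{
	xmit_one(&demo_v4);
	xmit_one(&demo_v6);
	return 0;
}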
1885 
1886 #ifdef CONFIG_TCP_MD5SIG
1887 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1888 	.md5_lookup		= tcp_v4_md5_lookup,
1889 	.calc_md5_hash		= tcp_v4_md5_hash_skb,
1890 	.md5_parse		= tcp_v4_parse_md5_keys,
1891 };
1892 #endif
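
The ops above back the TCP MD5 signature option (RFC 2385, used chiefly to protect BGP sessions); user space installs a per-peer key with the TCP_MD5SIG socket option. A hedged user-space sketch; the address and key are made up, and the header mix may vary by libc:

#include <stdio.h>
#include <string.h>
#include <arpa/inet.h>
#include <sys/socket.h>
#include <linux/tcp.h>		/* struct tcp_md5sig, TCP_MD5SIG */

int main(void)
{
	int fd = socket(AF_INET, SOCK_STREAM, 0);
	struct tcp_md5sig md5;
	struct sockaddr_in *peer = (struct sockaddr_in *)&md5.tcpm_addr;
	static const char key[] = "example-secret";

	memset(&md5, 0, sizeof(md5));
	peer->sin_family = AF_INET;
	inet_pton(AF_INET, "192.0.2.1", &peer->sin_addr);
	md5.tcpm_keylen = sizeof(key) - 1;
	memcpy(md5.tcpm_key, key, md5.tcpm_keylen);

	if (fd < 0 || setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG,
				 &md5, sizeof(md5)) < 0)
		perror("TCP_MD5SIG");
	return 0;
}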
1893 
1894 /* NOTE: A lot of fields are already zeroed by the call to
1895  *       sk_alloc(), so they need not be initialized here.
1896  */
1897 static int tcp_v4_init_sock(struct sock *sk)
1898 {
1899 	struct inet_connection_sock *icsk = inet_csk(sk);
1900 
1901 	tcp_init_sock(sk);
1902 
1903 	icsk->icsk_af_ops = &ipv4_specific;
1904 
1905 #ifdef CONFIG_TCP_MD5SIG
1906 	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
1907 #endif
1908 
1909 	return 0;
1910 }
1911 
1912 void tcp_v4_destroy_sock(struct sock *sk)
1913 {
1914 	struct tcp_sock *tp = tcp_sk(sk);
1915 
1916 	tcp_clear_xmit_timers(sk);
1917 
1918 	tcp_cleanup_congestion_control(sk);
1919 
1920 	/* Clean up the write buffer. */
1921 	tcp_write_queue_purge(sk);
1922 
1923 	/* Clean up our (hopefully empty) out_of_order_queue. */
1924 	__skb_queue_purge(&tp->out_of_order_queue);
1925 
1926 #ifdef CONFIG_TCP_MD5SIG
1927 	/* Clean up the MD5 key list, if any */
1928 	if (tp->md5sig_info) {
1929 		tcp_clear_md5_list(sk);
1930 		kfree_rcu(tp->md5sig_info, rcu);
1931 		tp->md5sig_info = NULL;
1932 	}
1933 #endif
1934 
1935 #ifdef CONFIG_NET_DMA
1936 	/* Cleans up our sk_async_wait_queue */
1937 	__skb_queue_purge(&sk->sk_async_wait_queue);
1938 #endif
1939 
1940 	/* Clean up the prequeue; it should already be empty. */
1941 	__skb_queue_purge(&tp->ucopy.prequeue);
1942 
1943 	/* Clean up a referenced TCP bind bucket. */
1944 	if (inet_csk(sk)->icsk_bind_hash)
1945 		inet_put_port(sk);
1946 
1947 	BUG_ON(tp->fastopen_rsk != NULL);
1948 
1949 	/* If the socket was aborted during a connect operation */
1950 	tcp_free_fastopen_req(tp);
1951 
1952 	sk_sockets_allocated_dec(sk);
1953 	sock_release_memcg(sk);
1954 }
1955 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1956 
1957 #ifdef CONFIG_PROC_FS
1958 /* Proc filesystem TCP sock list dumping. */
1959 
1960 /*
1961  * Get the next listener socket following cur.  If cur is NULL, get the
1962  * first socket starting from the bucket given in st->bucket; when
1963  * st->bucket is zero, the very first socket in the hash table is returned.
1964  */
1965 static void *listening_get_next(struct seq_file *seq, void *cur)
1966 {
1967 	struct inet_connection_sock *icsk;
1968 	struct hlist_nulls_node *node;
1969 	struct sock *sk = cur;
1970 	struct inet_listen_hashbucket *ilb;
1971 	struct tcp_iter_state *st = seq->private;
1972 	struct net *net = seq_file_net(seq);
1973 
1974 	if (!sk) {
1975 		ilb = &tcp_hashinfo.listening_hash[st->bucket];
1976 		spin_lock_bh(&ilb->lock);
1977 		sk = sk_nulls_head(&ilb->head);
1978 		st->offset = 0;
1979 		goto get_sk;
1980 	}
1981 	ilb = &tcp_hashinfo.listening_hash[st->bucket];
1982 	++st->num;
1983 	++st->offset;
1984 
1985 	if (st->state == TCP_SEQ_STATE_OPENREQ) {
1986 		struct request_sock *req = cur;
1987 
1988 		icsk = inet_csk(st->syn_wait_sk);
1989 		req = req->dl_next;
1990 		while (1) {
1991 			while (req) {
1992 				if (req->rsk_ops->family == st->family) {
1993 					cur = req;
1994 					goto out;
1995 				}
1996 				req = req->dl_next;
1997 			}
1998 			if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
1999 				break;
2000 get_req:
2001 			req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
2002 		}
2003 		sk	  = sk_nulls_next(st->syn_wait_sk);
2004 		st->state = TCP_SEQ_STATE_LISTENING;
2005 		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2006 	} else {
2007 		icsk = inet_csk(sk);
2008 		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2009 		if (reqsk_queue_len(&icsk->icsk_accept_queue))
2010 			goto start_req;
2011 		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2012 		sk = sk_nulls_next(sk);
2013 	}
2014 get_sk:
2015 	sk_nulls_for_each_from(sk, node) {
2016 		if (!net_eq(sock_net(sk), net))
2017 			continue;
2018 		if (sk->sk_family == st->family) {
2019 			cur = sk;
2020 			goto out;
2021 		}
2022 		icsk = inet_csk(sk);
2023 		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2024 		if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
2025 start_req:
2026 			st->uid		= sock_i_uid(sk);
2027 			st->syn_wait_sk = sk;
2028 			st->state	= TCP_SEQ_STATE_OPENREQ;
2029 			st->sbucket	= 0;
2030 			goto get_req;
2031 		}
2032 		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2033 	}
2034 	spin_unlock_bh(&ilb->lock);
2035 	st->offset = 0;
2036 	if (++st->bucket < INET_LHTABLE_SIZE) {
2037 		ilb = &tcp_hashinfo.listening_hash[st->bucket];
2038 		spin_lock_bh(&ilb->lock);
2039 		sk = sk_nulls_head(&ilb->head);
2040 		goto get_sk;
2041 	}
2042 	cur = NULL;
2043 out:
2044 	return cur;
2045 }
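
Both walkers in this file (the listening walk above and the established walk below) resume from a saved (st->bucket, st->offset) position instead of holding bucket locks across seq_file reads; st->num additionally counts records globally. A tiny self-contained sketch of that resumable-walk idea, with invented data and names:

#include <stdio.h>

#define NBUCKETS 4

struct iter_state {
	int bucket;
	int offset;
};

/* A toy hash table: each bucket is a NULL-terminated list of entries. */
static const char *table[NBUCKETS][4] = {
	{ "a" }, { NULL }, { "b", "c" }, { "d" },
};

/* Return the entry after the saved position, or NULL when the walk is done. */
static const char *walk_next(struct iter_state *st)
{
	for (; st->bucket < NBUCKETS; st->bucket++, st->offset = 0) {
		const char *e = table[st->bucket][st->offset];

		if (e) {
			st->offset++;	/* remember where to resume */
			return e;
		}
	}
	return NULL;
}

int main(void)
{
	struct iter_state st = { 0, 0 };
	const char *e;

	while ((e = walk_next(&st)))
		printf("bucket %d: %s\n", st.bucket, e);
	return 0;
}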
2046 
2047 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2048 {
2049 	struct tcp_iter_state *st = seq->private;
2050 	void *rc;
2051 
2052 	st->bucket = 0;
2053 	st->offset = 0;
2054 	rc = listening_get_next(seq, NULL);
2055 
2056 	while (rc && *pos) {
2057 		rc = listening_get_next(seq, rc);
2058 		--*pos;
2059 	}
2060 	return rc;
2061 }
2062 
2063 static inline bool empty_bucket(const struct tcp_iter_state *st)
2064 {
2065 	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2066 }
2067 
2068 /*
2069  * Get first established socket starting from bucket given in st->bucket.
2070  * If st->bucket is zero, the very first socket in the hash is returned.
2071  */
2072 static void *established_get_first(struct seq_file *seq)
2073 {
2074 	struct tcp_iter_state *st = seq->private;
2075 	struct net *net = seq_file_net(seq);
2076 	void *rc = NULL;
2077 
2078 	st->offset = 0;
2079 	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2080 		struct sock *sk;
2081 		struct hlist_nulls_node *node;
2082 		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2083 
2084 		/* Lockless fast path for the common case of empty buckets */
2085 		if (empty_bucket(st))
2086 			continue;
2087 
2088 		spin_lock_bh(lock);
2089 		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2090 			if (sk->sk_family != st->family ||
2091 			    !net_eq(sock_net(sk), net)) {
2092 				continue;
2093 			}
2094 			rc = sk;
2095 			goto out;
2096 		}
2097 		spin_unlock_bh(lock);
2098 	}
2099 out:
2100 	return rc;
2101 }
2102 
2103 static void *established_get_next(struct seq_file *seq, void *cur)
2104 {
2105 	struct sock *sk = cur;
2106 	struct hlist_nulls_node *node;
2107 	struct tcp_iter_state *st = seq->private;
2108 	struct net *net = seq_file_net(seq);
2109 
2110 	++st->num;
2111 	++st->offset;
2112 
2113 	sk = sk_nulls_next(sk);
2114 
2115 	sk_nulls_for_each_from(sk, node) {
2116 		if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2117 			return sk;
2118 	}
2119 
2120 	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2121 	++st->bucket;
2122 	return established_get_first(seq);
2123 }
2124 
2125 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2126 {
2127 	struct tcp_iter_state *st = seq->private;
2128 	void *rc;
2129 
2130 	st->bucket = 0;
2131 	rc = established_get_first(seq);
2132 
2133 	while (rc && pos) {
2134 		rc = established_get_next(seq, rc);
2135 		--pos;
2136 	}
2137 	return rc;
2138 }
2139 
2140 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2141 {
2142 	void *rc;
2143 	struct tcp_iter_state *st = seq->private;
2144 
2145 	st->state = TCP_SEQ_STATE_LISTENING;
2146 	rc	  = listening_get_idx(seq, &pos);
2147 
2148 	if (!rc) {
2149 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2150 		rc	  = established_get_idx(seq, pos);
2151 	}
2152 
2153 	return rc;
2154 }
2155 
2156 static void *tcp_seek_last_pos(struct seq_file *seq)
2157 {
2158 	struct tcp_iter_state *st = seq->private;
2159 	int offset = st->offset;
2160 	int orig_num = st->num;
2161 	void *rc = NULL;
2162 
2163 	switch (st->state) {
2164 	case TCP_SEQ_STATE_OPENREQ:
2165 	case TCP_SEQ_STATE_LISTENING:
2166 		if (st->bucket >= INET_LHTABLE_SIZE)
2167 			break;
2168 		st->state = TCP_SEQ_STATE_LISTENING;
2169 		rc = listening_get_next(seq, NULL);
2170 		while (offset-- && rc)
2171 			rc = listening_get_next(seq, rc);
2172 		if (rc)
2173 			break;
2174 		st->bucket = 0;
2175 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2176 		/* Fallthrough */
2177 	case TCP_SEQ_STATE_ESTABLISHED:
2178 		if (st->bucket > tcp_hashinfo.ehash_mask)
2179 			break;
2180 		rc = established_get_first(seq);
2181 		while (offset-- && rc)
2182 			rc = established_get_next(seq, rc);
2183 	}
2184 
2185 	st->num = orig_num;
2186 
2187 	return rc;
2188 }
2189 
2190 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2191 {
2192 	struct tcp_iter_state *st = seq->private;
2193 	void *rc;
2194 
2195 	if (*pos && *pos == st->last_pos) {
2196 		rc = tcp_seek_last_pos(seq);
2197 		if (rc)
2198 			goto out;
2199 	}
2200 
2201 	st->state = TCP_SEQ_STATE_LISTENING;
2202 	st->num = 0;
2203 	st->bucket = 0;
2204 	st->offset = 0;
2205 	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2206 
2207 out:
2208 	st->last_pos = *pos;
2209 	return rc;
2210 }
2211 
2212 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2213 {
2214 	struct tcp_iter_state *st = seq->private;
2215 	void *rc = NULL;
2216 
2217 	if (v == SEQ_START_TOKEN) {
2218 		rc = tcp_get_idx(seq, 0);
2219 		goto out;
2220 	}
2221 
2222 	switch (st->state) {
2223 	case TCP_SEQ_STATE_OPENREQ:
2224 	case TCP_SEQ_STATE_LISTENING:
2225 		rc = listening_get_next(seq, v);
2226 		if (!rc) {
2227 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2228 			st->bucket = 0;
2229 			st->offset = 0;
2230 			rc	  = established_get_first(seq);
2231 		}
2232 		break;
2233 	case TCP_SEQ_STATE_ESTABLISHED:
2234 		rc = established_get_next(seq, v);
2235 		break;
2236 	}
2237 out:
2238 	++*pos;
2239 	st->last_pos = *pos;
2240 	return rc;
2241 }
2242 
2243 static void tcp_seq_stop(struct seq_file *seq, void *v)
2244 {
2245 	struct tcp_iter_state *st = seq->private;
2246 
2247 	switch (st->state) {
2248 	case TCP_SEQ_STATE_OPENREQ:
2249 		if (v) {
2250 			struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2251 			read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2252 		}
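		/* fall through: also drop the listening bucket lock */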
2253 	case TCP_SEQ_STATE_LISTENING:
2254 		if (v != SEQ_START_TOKEN)
2255 			spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2256 		break;
2257 	case TCP_SEQ_STATE_ESTABLISHED:
2258 		if (v)
2259 			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2260 		break;
2261 	}
2262 }
2263 
2264 int tcp_seq_open(struct inode *inode, struct file *file)
2265 {
2266 	struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
2267 	struct tcp_iter_state *s;
2268 	int err;
2269 
2270 	err = seq_open_net(inode, file, &afinfo->seq_ops,
2271 			  sizeof(struct tcp_iter_state));
2272 	if (err < 0)
2273 		return err;
2274 
2275 	s = ((struct seq_file *)file->private_data)->private;
2276 	s->family		= afinfo->family;
2277 	s->last_pos 		= 0;
2278 	return 0;
2279 }
2280 EXPORT_SYMBOL(tcp_seq_open);
2281 
2282 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2283 {
2284 	int rc = 0;
2285 	struct proc_dir_entry *p;
2286 
2287 	afinfo->seq_ops.start		= tcp_seq_start;
2288 	afinfo->seq_ops.next		= tcp_seq_next;
2289 	afinfo->seq_ops.stop		= tcp_seq_stop;
2290 
2291 	p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2292 			     afinfo->seq_fops, afinfo);
2293 	if (!p)
2294 		rc = -ENOMEM;
2295 	return rc;
2296 }
2297 EXPORT_SYMBOL(tcp_proc_register);
2298 
2299 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2300 {
2301 	remove_proc_entry(afinfo->name, net->proc_net);
2302 }
2303 EXPORT_SYMBOL(tcp_proc_unregister);
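
tcp_proc_register() plugs the iterator above into procfs through proc_create_data(). For comparison, here is a minimal, self-contained sketch of the same proc + seq_file plumbing using the simpler single_open() helper; the module and file names are hypothetical and not part of this code:

#include <linux/module.h>
#include <linux/fs.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

static int demo_show(struct seq_file *m, void *v)
{
	seq_puts(m, "hello from a tiny seq_file\n");
	return 0;
}

static int demo_open(struct inode *inode, struct file *file)
{
	return single_open(file, demo_show, NULL);
}

static const struct file_operations demo_fops = {
	.owner   = THIS_MODULE,
	.open    = demo_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = single_release,
};

static int __init demo_init(void)
{
	return proc_create("seqfile_demo", S_IRUGO, NULL, &demo_fops) ? 0 : -ENOMEM;
}

static void __exit demo_exit(void)
{
	remove_proc_entry("seqfile_demo", NULL);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");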
2304 
2305 static void get_openreq4(const struct sock *sk, const struct request_sock *req,
2306 			 struct seq_file *f, int i, kuid_t uid)
2307 {
2308 	const struct inet_request_sock *ireq = inet_rsk(req);
2309 	long delta = req->expires - jiffies;
2310 
2311 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2312 		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2313 		i,
2314 		ireq->ir_loc_addr,
2315 		ntohs(inet_sk(sk)->inet_sport),
2316 		ireq->ir_rmt_addr,
2317 		ntohs(ireq->ir_rmt_port),
2318 		TCP_SYN_RECV,
2319 		0, 0, /* could print option size, but that is af dependent. */
2320 		1,    /* timers active (only the expire timer) */
2321 		jiffies_delta_to_clock_t(delta),
2322 		req->num_timeout,
2323 		from_kuid_munged(seq_user_ns(f), uid),
2324 		0,  /* non standard timer */
2325 		0, /* open_requests have no inode */
2326 		atomic_read(&sk->sk_refcnt),
2327 		req);
2328 }
2329 
2330 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2331 {
2332 	int timer_active;
2333 	unsigned long timer_expires;
2334 	const struct tcp_sock *tp = tcp_sk(sk);
2335 	const struct inet_connection_sock *icsk = inet_csk(sk);
2336 	const struct inet_sock *inet = inet_sk(sk);
2337 	struct fastopen_queue *fastopenq = icsk->icsk_accept_queue.fastopenq;
2338 	__be32 dest = inet->inet_daddr;
2339 	__be32 src = inet->inet_rcv_saddr;
2340 	__u16 destp = ntohs(inet->inet_dport);
2341 	__u16 srcp = ntohs(inet->inet_sport);
2342 	int rx_queue;
2343 
2344 	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2345 	    icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
2346 	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2347 		timer_active	= 1;
2348 		timer_expires	= icsk->icsk_timeout;
2349 	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2350 		timer_active	= 4;
2351 		timer_expires	= icsk->icsk_timeout;
2352 	} else if (timer_pending(&sk->sk_timer)) {
2353 		timer_active	= 2;
2354 		timer_expires	= sk->sk_timer.expires;
2355 	} else {
2356 		timer_active	= 0;
2357 		timer_expires = jiffies;
2358 	}
2359 
2360 	if (sk->sk_state == TCP_LISTEN)
2361 		rx_queue = sk->sk_ack_backlog;
2362 	else
2363 		/*
2364 		 * Because we don't lock the socket, we might find a transient negative value.
2365 		 */
2366 		rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2367 
2368 	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2369 			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2370 		i, src, srcp, dest, destp, sk->sk_state,
2371 		tp->write_seq - tp->snd_una,
2372 		rx_queue,
2373 		timer_active,
2374 		jiffies_delta_to_clock_t(timer_expires - jiffies),
2375 		icsk->icsk_retransmits,
2376 		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2377 		icsk->icsk_probes_out,
2378 		sock_i_ino(sk),
2379 		atomic_read(&sk->sk_refcnt), sk,
2380 		jiffies_to_clock_t(icsk->icsk_rto),
2381 		jiffies_to_clock_t(icsk->icsk_ack.ato),
2382 		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2383 		tp->snd_cwnd,
2384 		sk->sk_state == TCP_LISTEN ?
2385 		    (fastopenq ? fastopenq->max_qlen : 0) :
2386 		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2387 }
2388 
2389 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2390 			       struct seq_file *f, int i)
2391 {
2392 	__be32 dest, src;
2393 	__u16 destp, srcp;
2394 	s32 delta = tw->tw_ttd - inet_tw_time_stamp();
2395 
2396 	dest  = tw->tw_daddr;
2397 	src   = tw->tw_rcv_saddr;
2398 	destp = ntohs(tw->tw_dport);
2399 	srcp  = ntohs(tw->tw_sport);
2400 
2401 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2402 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2403 		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2404 		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2405 		atomic_read(&tw->tw_refcnt), tw);
2406 }
2407 
2408 #define TMPSZ 150
2409 
2410 static int tcp4_seq_show(struct seq_file *seq, void *v)
2411 {
2412 	struct tcp_iter_state *st;
2413 	struct sock *sk = v;
2414 
2415 	seq_setwidth(seq, TMPSZ - 1);
2416 	if (v == SEQ_START_TOKEN) {
2417 		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2418 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2419 			   "inode");
2420 		goto out;
2421 	}
2422 	st = seq->private;
2423 
2424 	switch (st->state) {
2425 	case TCP_SEQ_STATE_LISTENING:
2426 	case TCP_SEQ_STATE_ESTABLISHED:
2427 		if (sk->sk_state == TCP_TIME_WAIT)
2428 			get_timewait4_sock(v, seq, st->num);
2429 		else
2430 			get_tcp4_sock(v, seq, st->num);
2431 		break;
2432 	case TCP_SEQ_STATE_OPENREQ:
2433 		get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid);
2434 		break;
2435 	}
2436 out:
2437 	seq_pad(seq, '\n');
2438 	return 0;
2439 }
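
The records produced by get_tcp4_sock(), get_openreq4() and get_timewait4_sock() above are what user space reads back from /proc/net/tcp. A small illustrative reader; note that the kernel prints the raw __be32 address with %08X, so the hex string reflects the byte order of the machine that wrote it and is decoded here on that same machine:

#include <stdio.h>
#include <arpa/inet.h>

int main(void)
{
	FILE *fp = fopen("/proc/net/tcp", "r");
	char line[256];
	unsigned int laddr, lport, raddr, rport, state;
	struct in_addr a;

	if (!fp)
		return 1;
	if (!fgets(line, sizeof(line), fp)) {	/* skip the header line */
		fclose(fp);
		return 1;
	}
	while (fgets(line, sizeof(line), fp)) {
		if (sscanf(line, " %*d: %8X:%4X %8X:%4X %2X",
			   &laddr, &lport, &raddr, &rport, &state) != 5)
			continue;
		a.s_addr = laddr;	/* puts the bytes back in wire order */
		printf("%s:%u  state %02X\n", inet_ntoa(a), lport, state);
	}
	fclose(fp);
	return 0;
}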
2440 
2441 static const struct file_operations tcp_afinfo_seq_fops = {
2442 	.owner   = THIS_MODULE,
2443 	.open    = tcp_seq_open,
2444 	.read    = seq_read,
2445 	.llseek  = seq_lseek,
2446 	.release = seq_release_net
2447 };
2448 
2449 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2450 	.name		= "tcp",
2451 	.family		= AF_INET,
2452 	.seq_fops	= &tcp_afinfo_seq_fops,
2453 	.seq_ops	= {
2454 		.show		= tcp4_seq_show,
2455 	},
2456 };
2457 
2458 static int __net_init tcp4_proc_init_net(struct net *net)
2459 {
2460 	return tcp_proc_register(net, &tcp4_seq_afinfo);
2461 }
2462 
2463 static void __net_exit tcp4_proc_exit_net(struct net *net)
2464 {
2465 	tcp_proc_unregister(net, &tcp4_seq_afinfo);
2466 }
2467 
2468 static struct pernet_operations tcp4_net_ops = {
2469 	.init = tcp4_proc_init_net,
2470 	.exit = tcp4_proc_exit_net,
2471 };
2472 
2473 int __init tcp4_proc_init(void)
2474 {
2475 	return register_pernet_subsys(&tcp4_net_ops);
2476 }
2477 
2478 void tcp4_proc_exit(void)
2479 {
2480 	unregister_pernet_subsys(&tcp4_net_ops);
2481 }
2482 #endif /* CONFIG_PROC_FS */
2483 
2484 struct proto tcp_prot = {
2485 	.name			= "TCP",
2486 	.owner			= THIS_MODULE,
2487 	.close			= tcp_close,
2488 	.connect		= tcp_v4_connect,
2489 	.disconnect		= tcp_disconnect,
2490 	.accept			= inet_csk_accept,
2491 	.ioctl			= tcp_ioctl,
2492 	.init			= tcp_v4_init_sock,
2493 	.destroy		= tcp_v4_destroy_sock,
2494 	.shutdown		= tcp_shutdown,
2495 	.setsockopt		= tcp_setsockopt,
2496 	.getsockopt		= tcp_getsockopt,
2497 	.recvmsg		= tcp_recvmsg,
2498 	.sendmsg		= tcp_sendmsg,
2499 	.sendpage		= tcp_sendpage,
2500 	.backlog_rcv		= tcp_v4_do_rcv,
2501 	.release_cb		= tcp_release_cb,
2502 	.mtu_reduced		= tcp_v4_mtu_reduced,
2503 	.hash			= inet_hash,
2504 	.unhash			= inet_unhash,
2505 	.get_port		= inet_csk_get_port,
2506 	.enter_memory_pressure	= tcp_enter_memory_pressure,
2507 	.stream_memory_free	= tcp_stream_memory_free,
2508 	.sockets_allocated	= &tcp_sockets_allocated,
2509 	.orphan_count		= &tcp_orphan_count,
2510 	.memory_allocated	= &tcp_memory_allocated,
2511 	.memory_pressure	= &tcp_memory_pressure,
2512 	.sysctl_mem		= sysctl_tcp_mem,
2513 	.sysctl_wmem		= sysctl_tcp_wmem,
2514 	.sysctl_rmem		= sysctl_tcp_rmem,
2515 	.max_header		= MAX_TCP_HEADER,
2516 	.obj_size		= sizeof(struct tcp_sock),
2517 	.slab_flags		= SLAB_DESTROY_BY_RCU,
2518 	.twsk_prot		= &tcp_timewait_sock_ops,
2519 	.rsk_prot		= &tcp_request_sock_ops,
2520 	.h.hashinfo		= &tcp_hashinfo,
2521 	.no_autobind		= true,
2522 #ifdef CONFIG_COMPAT
2523 	.compat_setsockopt	= compat_tcp_setsockopt,
2524 	.compat_getsockopt	= compat_tcp_getsockopt,
2525 #endif
2526 #ifdef CONFIG_MEMCG_KMEM
2527 	.init_cgroup		= tcp_init_cgroup,
2528 	.destroy_cgroup		= tcp_destroy_cgroup,
2529 	.proto_cgroup		= tcp_proto_cgroup,
2530 #endif
2531 };
2532 EXPORT_SYMBOL(tcp_prot);
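
tcp_prot is only a table of callbacks; it becomes reachable from socket(2) when af_inet.c registers it with proto_register() and ties it to SOCK_STREAM/IPPROTO_TCP through an inet_protosw entry. The compressed sketch below is illustrative only: the real wiring lives in inet_init(), and the demo_* names are invented.

#include <linux/init.h>
#include <net/protocol.h>
#include <net/inet_common.h>
#include <net/tcp.h>

static struct inet_protosw demo_tcp_protosw = {
	.type     = SOCK_STREAM,
	.protocol = IPPROTO_TCP,
	.prot     = &tcp_prot,
	.ops      = &inet_stream_ops,
	.flags    = INET_PROTOSW_PERMANENT | INET_PROTOSW_ICSK,
};

/* In the real kernel this runs once, from inet_init(). */
static int __init demo_register(void)
{
	int rc;

	rc = proto_register(&tcp_prot, 1);	/* 1: also allocate the slab caches */
	if (rc)
		return rc;
	inet_register_protosw(&demo_tcp_protosw);
	return 0;
}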
2533 
2534 static int __net_init tcp_sk_init(struct net *net)
2535 {
2536 	net->ipv4.sysctl_tcp_ecn = 2;
2537 	return 0;
2538 }
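
The only per-namespace state initialized here is the ECN mode, exposed as /proc/sys/net/ipv4/tcp_ecn, so every new network namespace starts at 2 (use ECN when the incoming connection requests it, but do not request it on outgoing connections). A trivial user-space check, for illustration only:

#include <stdio.h>

int main(void)
{
	FILE *fp = fopen("/proc/sys/net/ipv4/tcp_ecn", "r");
	int ecn = -1;

	if (fp) {
		if (fscanf(fp, "%d", &ecn) != 1)
			ecn = -1;
		fclose(fp);
	}
	printf("tcp_ecn = %d\n", ecn);	/* 2 in a freshly created netns */
	return 0;
}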
2539 
2540 static void __net_exit tcp_sk_exit(struct net *net)
2541 {
2542 }
2543 
2544 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2545 {
2546 	inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2547 }
2548 
2549 static struct pernet_operations __net_initdata tcp_sk_ops = {
2550        .init	   = tcp_sk_init,
2551        .exit	   = tcp_sk_exit,
2552        .exit_batch = tcp_sk_exit_batch,
2553 };
2554 
2555 void __init tcp_v4_init(void)
2556 {
2557 	inet_hashinfo_init(&tcp_hashinfo);
2558 	if (register_pernet_subsys(&tcp_sk_ops))
2559 		panic("Failed to create the TCP control socket.\n");
2560 }
2561