xref: /openbmc/linux/net/ipv4/tcp_ipv4.c (revision e23feb16)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		Implementation of the Transmission Control Protocol(TCP).
7  *
8  *		IPv4 specific functions
9  *
10  *
11  *		code split from:
12  *		linux/ipv4/tcp.c
13  *		linux/ipv4/tcp_input.c
14  *		linux/ipv4/tcp_output.c
15  *
16  *		See tcp.c for author information
17  *
18  *	This program is free software; you can redistribute it and/or
19  *      modify it under the terms of the GNU General Public License
20  *      as published by the Free Software Foundation; either version
21  *      2 of the License, or (at your option) any later version.
22  */
23 
24 /*
25  * Changes:
26  *		David S. Miller	:	New socket lookup architecture.
27  *					This code is dedicated to John Dyson.
28  *		David S. Miller :	Change semantics of established hash,
29  *					half is devoted to TIME_WAIT sockets
30  *					and the rest go in the other half.
31  *		Andi Kleen :		Add support for syncookies and fixed
32  *					some bugs: ip options weren't passed to
33  *					the TCP layer, missed a check for an
34  *					ACK bit.
35  *		Andi Kleen :		Implemented fast path mtu discovery.
36  *	     				Fixed many serious bugs in the
37  *					request_sock handling and moved
38  *					most of it into the af independent code.
39  *					Added tail drop and some other bugfixes.
40  *					Added new listen semantics.
41  *		Mike McLagan	:	Routing by source
42  *	Juan Jose Ciarlante:		ip_dynaddr bits
43  *		Andi Kleen:		various fixes.
44  *	Vitaly E. Lavrov	:	Transparent proxy revived after year
45  *					coma.
46  *	Andi Kleen		:	Fix new listen.
47  *	Andi Kleen		:	Fix accept error reporting.
48  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
49  *	Alexey Kuznetsov		allows both IPv4 and IPv6 sockets to bind
50  *					a single port at the same time.
51  */
52 
53 #define pr_fmt(fmt) "TCP: " fmt
54 
55 #include <linux/bottom_half.h>
56 #include <linux/types.h>
57 #include <linux/fcntl.h>
58 #include <linux/module.h>
59 #include <linux/random.h>
60 #include <linux/cache.h>
61 #include <linux/jhash.h>
62 #include <linux/init.h>
63 #include <linux/times.h>
64 #include <linux/slab.h>
65 
66 #include <net/net_namespace.h>
67 #include <net/icmp.h>
68 #include <net/inet_hashtables.h>
69 #include <net/tcp.h>
70 #include <net/transp_v6.h>
71 #include <net/ipv6.h>
72 #include <net/inet_common.h>
73 #include <net/timewait_sock.h>
74 #include <net/xfrm.h>
75 #include <net/netdma.h>
76 #include <net/secure_seq.h>
77 #include <net/tcp_memcontrol.h>
78 #include <net/busy_poll.h>
79 
80 #include <linux/inet.h>
81 #include <linux/ipv6.h>
82 #include <linux/stddef.h>
83 #include <linux/proc_fs.h>
84 #include <linux/seq_file.h>
85 
86 #include <linux/crypto.h>
87 #include <linux/scatterlist.h>
88 
89 int sysctl_tcp_tw_reuse __read_mostly;
90 int sysctl_tcp_low_latency __read_mostly;
91 EXPORT_SYMBOL(sysctl_tcp_low_latency);
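
/*
 * Both knobs above are run-time tunables exposed through sysctl as
 * net.ipv4.tcp_tw_reuse and net.ipv4.tcp_low_latency (the table entries
 * live in net/ipv4/sysctl_net_ipv4.c).
 */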
92 
93 
94 #ifdef CONFIG_TCP_MD5SIG
95 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
96 			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
97 #endif
98 
99 struct inet_hashinfo tcp_hashinfo;
100 EXPORT_SYMBOL(tcp_hashinfo);
101 
102 static inline __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
103 {
104 	return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
105 					  ip_hdr(skb)->saddr,
106 					  tcp_hdr(skb)->dest,
107 					  tcp_hdr(skb)->source);
108 }
109 
110 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
111 {
112 	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
113 	struct tcp_sock *tp = tcp_sk(sk);
114 
115 	/* With PAWS, it is safe from the viewpoint
116 	   of data integrity. Even without PAWS it is safe provided sequence
117 	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.
118 
119 	   Actually, the idea is close to VJ's, only the timestamp cache is
120 	   held not per host but per port pair, and the TW bucket is used as the
121 	   state holder.
122 
123 	   If the TW bucket has already been destroyed we fall back to VJ's scheme
124 	   and use the initial timestamp retrieved from the peer table.
125 	 */
126 	if (tcptw->tw_ts_recent_stamp &&
127 	    (twp == NULL || (sysctl_tcp_tw_reuse &&
128 			     get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
129 		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
130 		if (tp->write_seq == 0)
131 			tp->write_seq = 1;
132 		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
133 		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
134 		sock_hold(sktw);
135 		return 1;
136 	}
137 
138 	return 0;
139 }
140 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
141 
142 /* This will initiate an outgoing connection. */
143 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
144 {
145 	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
146 	struct inet_sock *inet = inet_sk(sk);
147 	struct tcp_sock *tp = tcp_sk(sk);
148 	__be16 orig_sport, orig_dport;
149 	__be32 daddr, nexthop;
150 	struct flowi4 *fl4;
151 	struct rtable *rt;
152 	int err;
153 	struct ip_options_rcu *inet_opt;
154 
155 	if (addr_len < sizeof(struct sockaddr_in))
156 		return -EINVAL;
157 
158 	if (usin->sin_family != AF_INET)
159 		return -EAFNOSUPPORT;
160 
161 	nexthop = daddr = usin->sin_addr.s_addr;
162 	inet_opt = rcu_dereference_protected(inet->inet_opt,
163 					     sock_owned_by_user(sk));
164 	if (inet_opt && inet_opt->opt.srr) {
165 		if (!daddr)
166 			return -EINVAL;
167 		nexthop = inet_opt->opt.faddr;
168 	}
169 
170 	orig_sport = inet->inet_sport;
171 	orig_dport = usin->sin_port;
172 	fl4 = &inet->cork.fl.u.ip4;
173 	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
174 			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
175 			      IPPROTO_TCP,
176 			      orig_sport, orig_dport, sk, true);
177 	if (IS_ERR(rt)) {
178 		err = PTR_ERR(rt);
179 		if (err == -ENETUNREACH)
180 			IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
181 		return err;
182 	}
183 
184 	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
185 		ip_rt_put(rt);
186 		return -ENETUNREACH;
187 	}
188 
189 	if (!inet_opt || !inet_opt->opt.srr)
190 		daddr = fl4->daddr;
191 
192 	if (!inet->inet_saddr)
193 		inet->inet_saddr = fl4->saddr;
194 	inet->inet_rcv_saddr = inet->inet_saddr;
195 
196 	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
197 		/* Reset inherited state */
198 		tp->rx_opt.ts_recent	   = 0;
199 		tp->rx_opt.ts_recent_stamp = 0;
200 		if (likely(!tp->repair))
201 			tp->write_seq	   = 0;
202 	}
203 
204 	if (tcp_death_row.sysctl_tw_recycle &&
205 	    !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
206 		tcp_fetch_timewait_stamp(sk, &rt->dst);
207 
208 	inet->inet_dport = usin->sin_port;
209 	inet->inet_daddr = daddr;
210 
211 	inet_csk(sk)->icsk_ext_hdr_len = 0;
212 	if (inet_opt)
213 		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
214 
215 	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
216 
217 	/* Socket identity is still unknown (sport may be zero).
218 	 * However, we set the state to SYN-SENT and, without releasing the
219 	 * socket lock, select a source port, enter ourselves into the hash
220 	 * tables and complete initialization after this.
221 	 */
222 	tcp_set_state(sk, TCP_SYN_SENT);
223 	err = inet_hash_connect(&tcp_death_row, sk);
224 	if (err)
225 		goto failure;
226 
227 	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
228 			       inet->inet_sport, inet->inet_dport, sk);
229 	if (IS_ERR(rt)) {
230 		err = PTR_ERR(rt);
231 		rt = NULL;
232 		goto failure;
233 	}
234 	/* OK, now commit destination to socket.  */
235 	sk->sk_gso_type = SKB_GSO_TCPV4;
236 	sk_setup_caps(sk, &rt->dst);
237 
238 	if (!tp->write_seq && likely(!tp->repair))
239 		tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
240 							   inet->inet_daddr,
241 							   inet->inet_sport,
242 							   usin->sin_port);
243 
244 	inet->inet_id = tp->write_seq ^ jiffies;
245 
246 	err = tcp_connect(sk);
247 
248 	rt = NULL;
249 	if (err)
250 		goto failure;
251 
252 	return 0;
253 
254 failure:
255 	/*
256 	 * This unhashes the socket and releases the local port,
257 	 * if necessary.
258 	 */
259 	tcp_set_state(sk, TCP_CLOSE);
260 	ip_rt_put(rt);
261 	sk->sk_route_caps = 0;
262 	inet->inet_dport = 0;
263 	return err;
264 }
265 EXPORT_SYMBOL(tcp_v4_connect);
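
/*
 * For reference, a minimal userspace sketch of the call that ends up in
 * tcp_v4_connect() (the address and port below are arbitrary illustrative
 * values; headers: <sys/socket.h>, <netinet/in.h>, <arpa/inet.h>):
 *
 *	int fd = socket(AF_INET, SOCK_STREAM, 0);
 *	struct sockaddr_in dst = { .sin_family = AF_INET,
 *				   .sin_port   = htons(80) };
 *
 *	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);
 *	connect(fd, (struct sockaddr *)&dst, sizeof(dst));
 *
 * connect() on a TCP socket reaches this function via
 * inet_stream_connect() and the protocol's ->connect() hook.
 */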
266 
267 /*
268  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
269  * It can be called through tcp_release_cb() if socket was owned by user
270  * at the time tcp_v4_err() was called to handle ICMP message.
271  */
272 static void tcp_v4_mtu_reduced(struct sock *sk)
273 {
274 	struct dst_entry *dst;
275 	struct inet_sock *inet = inet_sk(sk);
276 	u32 mtu = tcp_sk(sk)->mtu_info;
277 
278 	dst = inet_csk_update_pmtu(sk, mtu);
279 	if (!dst)
280 		return;
281 
282 	/* Something is about to go wrong... Remember the soft error
283 	 * in case this connection is not able to recover.
284 	 */
285 	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
286 		sk->sk_err_soft = EMSGSIZE;
287 
288 	mtu = dst_mtu(dst);
289 
290 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
291 	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
292 		tcp_sync_mss(sk, mtu);
293 
294 		/* Resend the TCP packet because it's
295 		 * clear that the old packet has been
296 		 * dropped. This is the new "fast" path mtu
297 		 * discovery.
298 		 */
299 		tcp_simple_retransmit(sk);
300 	} /* else let the usual retransmit timer handle it */
301 }
302 
303 static void do_redirect(struct sk_buff *skb, struct sock *sk)
304 {
305 	struct dst_entry *dst = __sk_dst_check(sk, 0);
306 
307 	if (dst)
308 		dst->ops->redirect(dst, sk, skb);
309 }
310 
311 /*
312  * This routine is called by the ICMP module when it gets some
313  * sort of error condition.  If err < 0 then the socket should
314  * be closed and the error returned to the user.  If err > 0
315  * it's just the icmp type << 8 | icmp code.  After adjustment
316  * header points to the first 8 bytes of the tcp header.  We need
317  * to find the appropriate port.
318  *
319  * The locking strategy used here is very "optimistic". When
320  * someone else accesses the socket the ICMP is just dropped
321  * and for some paths there is no check at all.
322  * A more general error queue to queue errors for later handling
323  * is probably better.
324  *
325  */
326 
327 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
328 {
329 	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
330 	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
331 	struct inet_connection_sock *icsk;
332 	struct tcp_sock *tp;
333 	struct inet_sock *inet;
334 	const int type = icmp_hdr(icmp_skb)->type;
335 	const int code = icmp_hdr(icmp_skb)->code;
336 	struct sock *sk;
337 	struct sk_buff *skb;
338 	struct request_sock *req;
339 	__u32 seq;
340 	__u32 remaining;
341 	int err;
342 	struct net *net = dev_net(icmp_skb->dev);
343 
344 	if (icmp_skb->len < (iph->ihl << 2) + 8) {
345 		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
346 		return;
347 	}
348 
349 	sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
350 			iph->saddr, th->source, inet_iif(icmp_skb));
351 	if (!sk) {
352 		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
353 		return;
354 	}
355 	if (sk->sk_state == TCP_TIME_WAIT) {
356 		inet_twsk_put(inet_twsk(sk));
357 		return;
358 	}
359 
360 	bh_lock_sock(sk);
361 	/* If too many ICMPs get dropped on busy
362 	 * servers this needs to be solved differently.
363 	 * We do take care of PMTU discovery (RFC1191) special case :
364 	 * we can receive locally generated ICMP messages while socket is held.
365 	 */
366 	if (sock_owned_by_user(sk)) {
367 		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
368 			NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
369 	}
370 	if (sk->sk_state == TCP_CLOSE)
371 		goto out;
372 
373 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
374 		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
375 		goto out;
376 	}
377 
378 	icsk = inet_csk(sk);
379 	tp = tcp_sk(sk);
380 	req = tp->fastopen_rsk;
381 	seq = ntohl(th->seq);
382 	if (sk->sk_state != TCP_LISTEN &&
383 	    !between(seq, tp->snd_una, tp->snd_nxt) &&
384 	    (req == NULL || seq != tcp_rsk(req)->snt_isn)) {
385 		/* For a Fast Open socket, allow seq to be snt_isn. */
386 		NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
387 		goto out;
388 	}
389 
390 	switch (type) {
391 	case ICMP_REDIRECT:
392 		do_redirect(icmp_skb, sk);
393 		goto out;
394 	case ICMP_SOURCE_QUENCH:
395 		/* Just silently ignore these. */
396 		goto out;
397 	case ICMP_PARAMETERPROB:
398 		err = EPROTO;
399 		break;
400 	case ICMP_DEST_UNREACH:
401 		if (code > NR_ICMP_UNREACH)
402 			goto out;
403 
404 		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
405 			/* We are not interested in TCP_LISTEN and open_requests
406 			 * (SYN-ACKs sent out by Linux are always < 576 bytes, so
407 			 * they should go through unfragmented).
408 			 */
409 			if (sk->sk_state == TCP_LISTEN)
410 				goto out;
411 
412 			tp->mtu_info = info;
413 			if (!sock_owned_by_user(sk)) {
414 				tcp_v4_mtu_reduced(sk);
415 			} else {
416 				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
417 					sock_hold(sk);
418 			}
419 			goto out;
420 		}
421 
422 		err = icmp_err_convert[code].errno;
423 		/* check if icmp_skb allows revert of backoff
424 		 * (see draft-zimmermann-tcp-lcd) */
425 		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
426 			break;
427 		if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
428 		    !icsk->icsk_backoff)
429 			break;
430 
431 		/* XXX (TFO) - revisit the following logic for TFO */
432 
433 		if (sock_owned_by_user(sk))
434 			break;
435 
436 		icsk->icsk_backoff--;
437 		inet_csk(sk)->icsk_rto = (tp->srtt ? __tcp_set_rto(tp) :
438 			TCP_TIMEOUT_INIT) << icsk->icsk_backoff;
439 		tcp_bound_rto(sk);
440 
441 		skb = tcp_write_queue_head(sk);
442 		BUG_ON(!skb);
443 
444 		remaining = icsk->icsk_rto - min(icsk->icsk_rto,
445 				tcp_time_stamp - TCP_SKB_CB(skb)->when);
446 
447 		if (remaining) {
448 			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
449 						  remaining, TCP_RTO_MAX);
450 		} else {
451 			/* RTO revert clocked out retransmission.
452 			 * Will retransmit now */
453 			tcp_retransmit_timer(sk);
454 		}
455 
456 		break;
457 	case ICMP_TIME_EXCEEDED:
458 		err = EHOSTUNREACH;
459 		break;
460 	default:
461 		goto out;
462 	}
463 
464 	/* XXX (TFO) - if it's a TFO socket and has been accepted, rather
465 	 * than following the TCP_SYN_RECV case and closing the socket,
466 	 * we ignore the ICMP error and keep trying like a fully established
467 	 * socket. Is this the right thing to do?
468 	 */
469 	if (req && req->sk == NULL)
470 		goto out;
471 
472 	switch (sk->sk_state) {
473 		struct request_sock *req, **prev;
474 	case TCP_LISTEN:
475 		if (sock_owned_by_user(sk))
476 			goto out;
477 
478 		req = inet_csk_search_req(sk, &prev, th->dest,
479 					  iph->daddr, iph->saddr);
480 		if (!req)
481 			goto out;
482 
483 		/* ICMPs are not backlogged, hence we cannot get
484 		   an established socket here.
485 		 */
486 		WARN_ON(req->sk);
487 
488 		if (seq != tcp_rsk(req)->snt_isn) {
489 			NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
490 			goto out;
491 		}
492 
493 		/*
494 		 * Still in SYN_RECV, just remove it silently.
495 		 * There is no good way to pass the error to the newly
496 		 * created socket, and POSIX does not want network
497 		 * errors returned from accept().
498 		 */
499 		inet_csk_reqsk_queue_drop(sk, req, prev);
500 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
501 		goto out;
502 
503 	case TCP_SYN_SENT:
504 	case TCP_SYN_RECV:  /* Normally cannot happen.
505 			       It can, e.g., if SYNs crossed,
506 			       or with Fast Open.
507 			     */
508 		if (!sock_owned_by_user(sk)) {
509 			sk->sk_err = err;
510 
511 			sk->sk_error_report(sk);
512 
513 			tcp_done(sk);
514 		} else {
515 			sk->sk_err_soft = err;
516 		}
517 		goto out;
518 	}
519 
520 	/* If we've already connected we will keep trying
521 	 * until we time out, or the user gives up.
522 	 *
523 	 * RFC 1122 4.2.3.9 allows us to consider as hard errors
524 	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
525 	 * but it is obsoleted by PMTU discovery).
526 	 *
527 	 * Note that in the modern internet, where routing is unreliable
528 	 * and broken firewalls sit in every dark corner sending random
529 	 * errors ordered by their masters, even these two messages finally lose
530 	 * their original sense (even Linux sends invalid PORT_UNREACHs).
531 	 *
532 	 * Now we are in compliance with RFCs.
533 	 *							--ANK (980905)
534 	 */
535 
536 	inet = inet_sk(sk);
537 	if (!sock_owned_by_user(sk) && inet->recverr) {
538 		sk->sk_err = err;
539 		sk->sk_error_report(sk);
540 	} else	{ /* Only an error on timeout */
541 		sk->sk_err_soft = err;
542 	}
543 
544 out:
545 	bh_unlock_sock(sk);
546 	sock_put(sk);
547 }
548 
549 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
550 {
551 	struct tcphdr *th = tcp_hdr(skb);
552 
553 	if (skb->ip_summed == CHECKSUM_PARTIAL) {
554 		th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
555 		skb->csum_start = skb_transport_header(skb) - skb->head;
556 		skb->csum_offset = offsetof(struct tcphdr, check);
557 	} else {
558 		th->check = tcp_v4_check(skb->len, saddr, daddr,
559 					 csum_partial(th,
560 						      th->doff << 2,
561 						      skb->csum));
562 	}
563 }
564 
565 /* This routine computes an IPv4 TCP checksum. */
566 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
567 {
568 	const struct inet_sock *inet = inet_sk(sk);
569 
570 	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
571 }
572 EXPORT_SYMBOL(tcp_v4_send_check);
573 
574 /*
575  *	This routine will send an RST to the other tcp.
576  *
577  *	Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
578  *		      for the reset?
579  *	Answer: if a packet caused the RST, it is not for a socket
580  *		existing in our system; if it is matched to a socket,
581  *		it is just a duplicate segment or a bug in the other side's TCP.
582  *		So we build the reply based only on parameters
583  *		that arrived with the segment.
584  *	Exception: precedence violation. We do not implement it in any case.
585  */
586 
587 static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
588 {
589 	const struct tcphdr *th = tcp_hdr(skb);
590 	struct {
591 		struct tcphdr th;
592 #ifdef CONFIG_TCP_MD5SIG
593 		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
594 #endif
595 	} rep;
596 	struct ip_reply_arg arg;
597 #ifdef CONFIG_TCP_MD5SIG
598 	struct tcp_md5sig_key *key;
599 	const __u8 *hash_location = NULL;
600 	unsigned char newhash[16];
601 	int genhash;
602 	struct sock *sk1 = NULL;
603 #endif
604 	struct net *net;
605 
606 	/* Never send a reset in response to a reset. */
607 	if (th->rst)
608 		return;
609 
610 	if (skb_rtable(skb)->rt_type != RTN_LOCAL)
611 		return;
612 
613 	/* Swap the send and the receive. */
614 	memset(&rep, 0, sizeof(rep));
615 	rep.th.dest   = th->source;
616 	rep.th.source = th->dest;
617 	rep.th.doff   = sizeof(struct tcphdr) / 4;
618 	rep.th.rst    = 1;
619 
620 	if (th->ack) {
621 		rep.th.seq = th->ack_seq;
622 	} else {
623 		rep.th.ack = 1;
624 		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
625 				       skb->len - (th->doff << 2));
626 	}
627 
628 	memset(&arg, 0, sizeof(arg));
629 	arg.iov[0].iov_base = (unsigned char *)&rep;
630 	arg.iov[0].iov_len  = sizeof(rep.th);
631 
632 #ifdef CONFIG_TCP_MD5SIG
633 	hash_location = tcp_parse_md5sig_option(th);
634 	if (!sk && hash_location) {
635 		/*
636 		 * The active side is lost. Try to find the listening socket through
637 		 * the source port, and then find the md5 key through the listening socket.
638 		 * We do not lose security here:
639 		 * the incoming packet is checked against the md5 hash of the found key;
640 		 * no RST is generated if the md5 hash doesn't match.
641 		 */
642 		sk1 = __inet_lookup_listener(dev_net(skb_dst(skb)->dev),
643 					     &tcp_hashinfo, ip_hdr(skb)->saddr,
644 					     th->source, ip_hdr(skb)->daddr,
645 					     ntohs(th->source), inet_iif(skb));
646 		/* don't send an RST if we can't find the key */
647 		if (!sk1)
648 			return;
649 		rcu_read_lock();
650 		key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
651 					&ip_hdr(skb)->saddr, AF_INET);
652 		if (!key)
653 			goto release_sk1;
654 
655 		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, NULL, skb);
656 		if (genhash || memcmp(hash_location, newhash, 16) != 0)
657 			goto release_sk1;
658 	} else {
659 		key = sk ? tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
660 					     &ip_hdr(skb)->saddr,
661 					     AF_INET) : NULL;
662 	}
663 
664 	if (key) {
665 		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
666 				   (TCPOPT_NOP << 16) |
667 				   (TCPOPT_MD5SIG << 8) |
668 				   TCPOLEN_MD5SIG);
669 		/* Update length and the length the header thinks exists */
670 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
671 		rep.th.doff = arg.iov[0].iov_len / 4;
672 
673 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
674 				     key, ip_hdr(skb)->saddr,
675 				     ip_hdr(skb)->daddr, &rep.th);
676 	}
677 #endif
678 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
679 				      ip_hdr(skb)->saddr, /* XXX */
680 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
681 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
682 	arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
683 	/* When the socket is gone, all binding information is lost and
684 	 * routing might fail in this case. No choice here: if we choose to force
685 	 * the input interface, we will misroute in case of an asymmetric route.
686 	 */
687 	if (sk)
688 		arg.bound_dev_if = sk->sk_bound_dev_if;
689 
690 	net = dev_net(skb_dst(skb)->dev);
691 	arg.tos = ip_hdr(skb)->tos;
692 	ip_send_unicast_reply(net, skb, ip_hdr(skb)->saddr,
693 			      ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len);
694 
695 	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
696 	TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
697 
698 #ifdef CONFIG_TCP_MD5SIG
699 release_sk1:
700 	if (sk1) {
701 		rcu_read_unlock();
702 		sock_put(sk1);
703 	}
704 #endif
705 }
706 
707 /* The code below, which sends ACKs in the SYN-RECV and TIME-WAIT states
708    outside socket context, is certainly ugly. What can I do?
709  */
710 
711 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
712 			    u32 win, u32 tsval, u32 tsecr, int oif,
713 			    struct tcp_md5sig_key *key,
714 			    int reply_flags, u8 tos)
715 {
716 	const struct tcphdr *th = tcp_hdr(skb);
717 	struct {
718 		struct tcphdr th;
719 		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
720 #ifdef CONFIG_TCP_MD5SIG
721 			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
722 #endif
723 			];
724 	} rep;
725 	struct ip_reply_arg arg;
726 	struct net *net = dev_net(skb_dst(skb)->dev);
727 
728 	memset(&rep.th, 0, sizeof(struct tcphdr));
729 	memset(&arg, 0, sizeof(arg));
730 
731 	arg.iov[0].iov_base = (unsigned char *)&rep;
732 	arg.iov[0].iov_len  = sizeof(rep.th);
733 	if (tsecr) {
734 		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
735 				   (TCPOPT_TIMESTAMP << 8) |
736 				   TCPOLEN_TIMESTAMP);
737 		rep.opt[1] = htonl(tsval);
738 		rep.opt[2] = htonl(tsecr);
739 		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
740 	}
741 
742 	/* Swap the send and the receive. */
743 	rep.th.dest    = th->source;
744 	rep.th.source  = th->dest;
745 	rep.th.doff    = arg.iov[0].iov_len / 4;
746 	rep.th.seq     = htonl(seq);
747 	rep.th.ack_seq = htonl(ack);
748 	rep.th.ack     = 1;
749 	rep.th.window  = htons(win);
750 
751 #ifdef CONFIG_TCP_MD5SIG
752 	if (key) {
753 		int offset = (tsecr) ? 3 : 0;
754 
755 		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
756 					  (TCPOPT_NOP << 16) |
757 					  (TCPOPT_MD5SIG << 8) |
758 					  TCPOLEN_MD5SIG);
759 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
760 		rep.th.doff = arg.iov[0].iov_len/4;
761 
762 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
763 				    key, ip_hdr(skb)->saddr,
764 				    ip_hdr(skb)->daddr, &rep.th);
765 	}
766 #endif
767 	arg.flags = reply_flags;
768 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
769 				      ip_hdr(skb)->saddr, /* XXX */
770 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
771 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
772 	if (oif)
773 		arg.bound_dev_if = oif;
774 	arg.tos = tos;
775 	ip_send_unicast_reply(net, skb, ip_hdr(skb)->saddr,
776 			      ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len);
777 
778 	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
779 }
780 
781 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
782 {
783 	struct inet_timewait_sock *tw = inet_twsk(sk);
784 	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
785 
786 	tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
787 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
788 			tcp_time_stamp + tcptw->tw_ts_offset,
789 			tcptw->tw_ts_recent,
790 			tw->tw_bound_dev_if,
791 			tcp_twsk_md5_key(tcptw),
792 			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
793 			tw->tw_tos
794 			);
795 
796 	inet_twsk_put(tw);
797 }
798 
799 static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
800 				  struct request_sock *req)
801 {
802 	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
803 	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
804 	 */
805 	tcp_v4_send_ack(skb, (sk->sk_state == TCP_LISTEN) ?
806 			tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
807 			tcp_rsk(req)->rcv_nxt, req->rcv_wnd,
808 			tcp_time_stamp,
809 			req->ts_recent,
810 			0,
811 			tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
812 					  AF_INET),
813 			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
814 			ip_hdr(skb)->tos);
815 }
816 
817 /*
818  *	Send a SYN-ACK after having received a SYN.
819  *	This still operates on a request_sock only, not on a big
820  *	socket.
821  */
822 static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
823 			      struct request_sock *req,
824 			      u16 queue_mapping)
825 {
826 	const struct inet_request_sock *ireq = inet_rsk(req);
827 	struct flowi4 fl4;
828 	int err = -1;
829 	struct sk_buff * skb;
830 
831 	/* First, grab a route. */
832 	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
833 		return -1;
834 
835 	skb = tcp_make_synack(sk, dst, req, NULL);
836 
837 	if (skb) {
838 		__tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr);
839 
840 		skb_set_queue_mapping(skb, queue_mapping);
841 		err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
842 					    ireq->rmt_addr,
843 					    ireq->opt);
844 		err = net_xmit_eval(err);
845 		if (!tcp_rsk(req)->snt_synack && !err)
846 			tcp_rsk(req)->snt_synack = tcp_time_stamp;
847 	}
848 
849 	return err;
850 }
851 
852 static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req)
853 {
854 	int res = tcp_v4_send_synack(sk, NULL, req, 0);
855 
856 	if (!res)
857 		TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
858 	return res;
859 }
860 
861 /*
862  *	IPv4 request_sock destructor.
863  */
864 static void tcp_v4_reqsk_destructor(struct request_sock *req)
865 {
866 	kfree(inet_rsk(req)->opt);
867 }
868 
869 /*
870  * Return true if a syncookie should be sent
871  */
872 bool tcp_syn_flood_action(struct sock *sk,
873 			 const struct sk_buff *skb,
874 			 const char *proto)
875 {
876 	const char *msg = "Dropping request";
877 	bool want_cookie = false;
878 	struct listen_sock *lopt;
879 
880 
881 
882 #ifdef CONFIG_SYN_COOKIES
883 	if (sysctl_tcp_syncookies) {
884 		msg = "Sending cookies";
885 		want_cookie = true;
886 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES);
887 	} else
888 #endif
889 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);
890 
891 	lopt = inet_csk(sk)->icsk_accept_queue.listen_opt;
892 	if (!lopt->synflood_warned && sysctl_tcp_syncookies != 2) {
893 		lopt->synflood_warned = 1;
894 		pr_info("%s: Possible SYN flooding on port %d. %s.  Check SNMP counters.\n",
895 			proto, ntohs(tcp_hdr(skb)->dest), msg);
896 	}
897 	return want_cookie;
898 }
899 EXPORT_SYMBOL(tcp_syn_flood_action);
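
/*
 * Note: this path is governed by the net.ipv4.tcp_syncookies sysctl:
 * 0 disables syncookies, 1 sends them only when the request queue
 * overflows, and 2 makes tcp_v4_conn_request() treat every SYN as if
 * the queue were full (the warning above is suppressed in that mode).
 */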
900 
901 /*
902  * Save and compile IPv4 options into the request_sock if needed.
903  */
904 static struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb)
905 {
906 	const struct ip_options *opt = &(IPCB(skb)->opt);
907 	struct ip_options_rcu *dopt = NULL;
908 
909 	if (opt && opt->optlen) {
910 		int opt_size = sizeof(*dopt) + opt->optlen;
911 
912 		dopt = kmalloc(opt_size, GFP_ATOMIC);
913 		if (dopt) {
914 			if (ip_options_echo(&dopt->opt, skb)) {
915 				kfree(dopt);
916 				dopt = NULL;
917 			}
918 		}
919 	}
920 	return dopt;
921 }
922 
923 #ifdef CONFIG_TCP_MD5SIG
924 /*
925  * RFC2385 MD5 checksumming requires a mapping of
926  * IP address->MD5 Key.
927  * We need to maintain these in the sk structure.
928  */
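
/*
 * Userspace installs keys with the TCP_MD5SIG socket option, handled by
 * tcp_v4_parse_md5_keys() below.  An illustrative userspace sketch (the
 * peer address and key are arbitrary examples; headers: <sys/socket.h>,
 * <netinet/in.h>, <netinet/tcp.h>, <arpa/inet.h>, <string.h>):
 *
 *	struct tcp_md5sig md5 = { .tcpm_keylen = 6 };
 *	struct sockaddr_in *sin = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	sin->sin_family = AF_INET;
 *	inet_pton(AF_INET, "192.0.2.2", &sin->sin_addr);
 *	memcpy(md5.tcpm_key, "secret", 6);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 */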
929 
930 /* Find the Key structure for an address.  */
931 struct tcp_md5sig_key *tcp_md5_do_lookup(struct sock *sk,
932 					 const union tcp_md5_addr *addr,
933 					 int family)
934 {
935 	struct tcp_sock *tp = tcp_sk(sk);
936 	struct tcp_md5sig_key *key;
937 	unsigned int size = sizeof(struct in_addr);
938 	struct tcp_md5sig_info *md5sig;
939 
940 	/* caller either holds rcu_read_lock() or socket lock */
941 	md5sig = rcu_dereference_check(tp->md5sig_info,
942 				       sock_owned_by_user(sk) ||
943 				       lockdep_is_held(&sk->sk_lock.slock));
944 	if (!md5sig)
945 		return NULL;
946 #if IS_ENABLED(CONFIG_IPV6)
947 	if (family == AF_INET6)
948 		size = sizeof(struct in6_addr);
949 #endif
950 	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
951 		if (key->family != family)
952 			continue;
953 		if (!memcmp(&key->addr, addr, size))
954 			return key;
955 	}
956 	return NULL;
957 }
958 EXPORT_SYMBOL(tcp_md5_do_lookup);
959 
960 struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
961 					 struct sock *addr_sk)
962 {
963 	union tcp_md5_addr *addr;
964 
965 	addr = (union tcp_md5_addr *)&inet_sk(addr_sk)->inet_daddr;
966 	return tcp_md5_do_lookup(sk, addr, AF_INET);
967 }
968 EXPORT_SYMBOL(tcp_v4_md5_lookup);
969 
970 static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
971 						      struct request_sock *req)
972 {
973 	union tcp_md5_addr *addr;
974 
975 	addr = (union tcp_md5_addr *)&inet_rsk(req)->rmt_addr;
976 	return tcp_md5_do_lookup(sk, addr, AF_INET);
977 }
978 
979 /* This can be called on a newly created socket, from other files */
980 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
981 		   int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
982 {
983 	/* Add Key to the list */
984 	struct tcp_md5sig_key *key;
985 	struct tcp_sock *tp = tcp_sk(sk);
986 	struct tcp_md5sig_info *md5sig;
987 
988 	key = tcp_md5_do_lookup(sk, addr, family);
989 	if (key) {
990 		/* Pre-existing entry - just update that one. */
991 		memcpy(key->key, newkey, newkeylen);
992 		key->keylen = newkeylen;
993 		return 0;
994 	}
995 
996 	md5sig = rcu_dereference_protected(tp->md5sig_info,
997 					   sock_owned_by_user(sk));
998 	if (!md5sig) {
999 		md5sig = kmalloc(sizeof(*md5sig), gfp);
1000 		if (!md5sig)
1001 			return -ENOMEM;
1002 
1003 		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1004 		INIT_HLIST_HEAD(&md5sig->head);
1005 		rcu_assign_pointer(tp->md5sig_info, md5sig);
1006 	}
1007 
1008 	key = sock_kmalloc(sk, sizeof(*key), gfp);
1009 	if (!key)
1010 		return -ENOMEM;
1011 	if (!tcp_alloc_md5sig_pool()) {
1012 		sock_kfree_s(sk, key, sizeof(*key));
1013 		return -ENOMEM;
1014 	}
1015 
1016 	memcpy(key->key, newkey, newkeylen);
1017 	key->keylen = newkeylen;
1018 	key->family = family;
1019 	memcpy(&key->addr, addr,
1020 	       (family == AF_INET6) ? sizeof(struct in6_addr) :
1021 				      sizeof(struct in_addr));
1022 	hlist_add_head_rcu(&key->node, &md5sig->head);
1023 	return 0;
1024 }
1025 EXPORT_SYMBOL(tcp_md5_do_add);
1026 
1027 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
1028 {
1029 	struct tcp_md5sig_key *key;
1030 
1031 	key = tcp_md5_do_lookup(sk, addr, family);
1032 	if (!key)
1033 		return -ENOENT;
1034 	hlist_del_rcu(&key->node);
1035 	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1036 	kfree_rcu(key, rcu);
1037 	return 0;
1038 }
1039 EXPORT_SYMBOL(tcp_md5_do_del);
1040 
1041 static void tcp_clear_md5_list(struct sock *sk)
1042 {
1043 	struct tcp_sock *tp = tcp_sk(sk);
1044 	struct tcp_md5sig_key *key;
1045 	struct hlist_node *n;
1046 	struct tcp_md5sig_info *md5sig;
1047 
1048 	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1049 
1050 	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1051 		hlist_del_rcu(&key->node);
1052 		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1053 		kfree_rcu(key, rcu);
1054 	}
1055 }
1056 
1057 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
1058 				 int optlen)
1059 {
1060 	struct tcp_md5sig cmd;
1061 	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1062 
1063 	if (optlen < sizeof(cmd))
1064 		return -EINVAL;
1065 
1066 	if (copy_from_user(&cmd, optval, sizeof(cmd)))
1067 		return -EFAULT;
1068 
1069 	if (sin->sin_family != AF_INET)
1070 		return -EINVAL;
1071 
1072 	if (!cmd.tcpm_key || !cmd.tcpm_keylen)
1073 		return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1074 				      AF_INET);
1075 
1076 	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1077 		return -EINVAL;
1078 
1079 	return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1080 			      AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
1081 			      GFP_KERNEL);
1082 }
1083 
1084 static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
1085 					__be32 daddr, __be32 saddr, int nbytes)
1086 {
1087 	struct tcp4_pseudohdr *bp;
1088 	struct scatterlist sg;
1089 
1090 	bp = &hp->md5_blk.ip4;
1091 
1092 	/*
1093 	 * 1. the TCP pseudo-header (in the order: source IP address,
1094 	 * destination IP address, zero-padded protocol number, and
1095 	 * segment length)
1096 	 */
1097 	bp->saddr = saddr;
1098 	bp->daddr = daddr;
1099 	bp->pad = 0;
1100 	bp->protocol = IPPROTO_TCP;
1101 	bp->len = cpu_to_be16(nbytes);
1102 
1103 	sg_init_one(&sg, bp, sizeof(*bp));
1104 	return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
1105 }
1106 
1107 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1108 			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
1109 {
1110 	struct tcp_md5sig_pool *hp;
1111 	struct hash_desc *desc;
1112 
1113 	hp = tcp_get_md5sig_pool();
1114 	if (!hp)
1115 		goto clear_hash_noput;
1116 	desc = &hp->md5_desc;
1117 
1118 	if (crypto_hash_init(desc))
1119 		goto clear_hash;
1120 	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
1121 		goto clear_hash;
1122 	if (tcp_md5_hash_header(hp, th))
1123 		goto clear_hash;
1124 	if (tcp_md5_hash_key(hp, key))
1125 		goto clear_hash;
1126 	if (crypto_hash_final(desc, md5_hash))
1127 		goto clear_hash;
1128 
1129 	tcp_put_md5sig_pool();
1130 	return 0;
1131 
1132 clear_hash:
1133 	tcp_put_md5sig_pool();
1134 clear_hash_noput:
1135 	memset(md5_hash, 0, 16);
1136 	return 1;
1137 }
1138 
1139 int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
1140 			const struct sock *sk, const struct request_sock *req,
1141 			const struct sk_buff *skb)
1142 {
1143 	struct tcp_md5sig_pool *hp;
1144 	struct hash_desc *desc;
1145 	const struct tcphdr *th = tcp_hdr(skb);
1146 	__be32 saddr, daddr;
1147 
1148 	if (sk) {
1149 		saddr = inet_sk(sk)->inet_saddr;
1150 		daddr = inet_sk(sk)->inet_daddr;
1151 	} else if (req) {
1152 		saddr = inet_rsk(req)->loc_addr;
1153 		daddr = inet_rsk(req)->rmt_addr;
1154 	} else {
1155 		const struct iphdr *iph = ip_hdr(skb);
1156 		saddr = iph->saddr;
1157 		daddr = iph->daddr;
1158 	}
1159 
1160 	hp = tcp_get_md5sig_pool();
1161 	if (!hp)
1162 		goto clear_hash_noput;
1163 	desc = &hp->md5_desc;
1164 
1165 	if (crypto_hash_init(desc))
1166 		goto clear_hash;
1167 
1168 	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
1169 		goto clear_hash;
1170 	if (tcp_md5_hash_header(hp, th))
1171 		goto clear_hash;
1172 	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1173 		goto clear_hash;
1174 	if (tcp_md5_hash_key(hp, key))
1175 		goto clear_hash;
1176 	if (crypto_hash_final(desc, md5_hash))
1177 		goto clear_hash;
1178 
1179 	tcp_put_md5sig_pool();
1180 	return 0;
1181 
1182 clear_hash:
1183 	tcp_put_md5sig_pool();
1184 clear_hash_noput:
1185 	memset(md5_hash, 0, 16);
1186 	return 1;
1187 }
1188 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1189 
1190 static bool tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb)
1191 {
1192 	/*
1193 	 * This gets called for each TCP segment that arrives
1194 	 * so we want to be efficient.
1195 	 * We have 3 drop cases:
1196 	 * o No MD5 hash and one expected.
1197 	 * o MD5 hash and we're not expecting one.
1198 	 * o MD5 hash and it's wrong.
1199 	 */
1200 	const __u8 *hash_location = NULL;
1201 	struct tcp_md5sig_key *hash_expected;
1202 	const struct iphdr *iph = ip_hdr(skb);
1203 	const struct tcphdr *th = tcp_hdr(skb);
1204 	int genhash;
1205 	unsigned char newhash[16];
1206 
1207 	hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1208 					  AF_INET);
1209 	hash_location = tcp_parse_md5sig_option(th);
1210 
1211 	/* We've parsed the options - do we have a hash? */
1212 	if (!hash_expected && !hash_location)
1213 		return false;
1214 
1215 	if (hash_expected && !hash_location) {
1216 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1217 		return true;
1218 	}
1219 
1220 	if (!hash_expected && hash_location) {
1221 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1222 		return true;
1223 	}
1224 
1225 	/* Okay, so this is hash_expected and hash_location -
1226 	 * so we need to calculate the checksum.
1227 	 */
1228 	genhash = tcp_v4_md5_hash_skb(newhash,
1229 				      hash_expected,
1230 				      NULL, NULL, skb);
1231 
1232 	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1233 		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1234 				     &iph->saddr, ntohs(th->source),
1235 				     &iph->daddr, ntohs(th->dest),
1236 				     genhash ? " tcp_v4_calc_md5_hash failed"
1237 				     : "");
1238 		return true;
1239 	}
1240 	return false;
1241 }
1242 
1243 #endif
1244 
1245 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1246 	.family		=	PF_INET,
1247 	.obj_size	=	sizeof(struct tcp_request_sock),
1248 	.rtx_syn_ack	=	tcp_v4_rtx_synack,
1249 	.send_ack	=	tcp_v4_reqsk_send_ack,
1250 	.destructor	=	tcp_v4_reqsk_destructor,
1251 	.send_reset	=	tcp_v4_send_reset,
1252 	.syn_ack_timeout = 	tcp_syn_ack_timeout,
1253 };
1254 
1255 #ifdef CONFIG_TCP_MD5SIG
1256 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1257 	.md5_lookup	=	tcp_v4_reqsk_md5_lookup,
1258 	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1259 };
1260 #endif
1261 
1262 static bool tcp_fastopen_check(struct sock *sk, struct sk_buff *skb,
1263 			       struct request_sock *req,
1264 			       struct tcp_fastopen_cookie *foc,
1265 			       struct tcp_fastopen_cookie *valid_foc)
1266 {
1267 	bool skip_cookie = false;
1268 	struct fastopen_queue *fastopenq;
1269 
1270 	if (likely(!fastopen_cookie_present(foc))) {
1271 		/* See include/net/tcp.h for the meaning of these knobs */
1272 		if ((sysctl_tcp_fastopen & TFO_SERVER_ALWAYS) ||
1273 		    ((sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_REQD) &&
1274 		    (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1)))
1275 			skip_cookie = true; /* no cookie to validate */
1276 		else
1277 			return false;
1278 	}
1279 	fastopenq = inet_csk(sk)->icsk_accept_queue.fastopenq;
1280 	/* A FO option is present; bump the counter. */
1281 	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVE);
1282 
1283 	/* Make sure the listener has enabled fastopen, and we don't
1284 	 * exceed the max # of pending TFO requests allowed before trying
1285 	 * to validate the cookie, in order to avoid burning CPU cycles
1286 	 * unnecessarily.
1287 	 *
1288 	 * XXX (TFO) - The implication of checking the max_qlen before
1289 	 * processing a cookie request is that clients can't differentiate
1290 	 * between qlen overflow causing Fast Open to be disabled
1291 	 * temporarily vs a server not supporting Fast Open at all.
1292 	 */
1293 	if ((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) == 0 ||
1294 	    fastopenq == NULL || fastopenq->max_qlen == 0)
1295 		return false;
1296 
1297 	if (fastopenq->qlen >= fastopenq->max_qlen) {
1298 		struct request_sock *req1;
1299 		spin_lock(&fastopenq->lock);
1300 		req1 = fastopenq->rskq_rst_head;
1301 		if ((req1 == NULL) || time_after(req1->expires, jiffies)) {
1302 			spin_unlock(&fastopenq->lock);
1303 			NET_INC_STATS_BH(sock_net(sk),
1304 			    LINUX_MIB_TCPFASTOPENLISTENOVERFLOW);
1305 			/* Avoid bumping LINUX_MIB_TCPFASTOPENPASSIVEFAIL*/
1306 			foc->len = -1;
1307 			return false;
1308 		}
1309 		fastopenq->rskq_rst_head = req1->dl_next;
1310 		fastopenq->qlen--;
1311 		spin_unlock(&fastopenq->lock);
1312 		reqsk_free(req1);
1313 	}
1314 	if (skip_cookie) {
1315 		tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1316 		return true;
1317 	}
1318 
1319 	if (foc->len == TCP_FASTOPEN_COOKIE_SIZE) {
1320 		if ((sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_CHKED) == 0) {
1321 			tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr,
1322 						ip_hdr(skb)->daddr, valid_foc);
1323 			if ((valid_foc->len != TCP_FASTOPEN_COOKIE_SIZE) ||
1324 			    memcmp(&foc->val[0], &valid_foc->val[0],
1325 			    TCP_FASTOPEN_COOKIE_SIZE) != 0)
1326 				return false;
1327 			valid_foc->len = -1;
1328 		}
1329 		/* Acknowledge the data received from the peer. */
1330 		tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1331 		return true;
1332 	} else if (foc->len == 0) { /* Client requesting a cookie */
1333 		tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr,
1334 					ip_hdr(skb)->daddr, valid_foc);
1335 		NET_INC_STATS_BH(sock_net(sk),
1336 		    LINUX_MIB_TCPFASTOPENCOOKIEREQD);
1337 	} else {
1338 		/* Client sent a cookie with wrong size. Treat it
1339 		 * the same as invalid and return a valid one.
1340 		 */
1341 		tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr,
1342 					ip_hdr(skb)->daddr, valid_foc);
1343 	}
1344 	return false;
1345 }
1346 
1347 static int tcp_v4_conn_req_fastopen(struct sock *sk,
1348 				    struct sk_buff *skb,
1349 				    struct sk_buff *skb_synack,
1350 				    struct request_sock *req)
1351 {
1352 	struct tcp_sock *tp = tcp_sk(sk);
1353 	struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
1354 	const struct inet_request_sock *ireq = inet_rsk(req);
1355 	struct sock *child;
1356 	int err;
1357 
1358 	req->num_retrans = 0;
1359 	req->num_timeout = 0;
1360 	req->sk = NULL;
1361 
1362 	child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL);
1363 	if (child == NULL) {
1364 		NET_INC_STATS_BH(sock_net(sk),
1365 				 LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
1366 		kfree_skb(skb_synack);
1367 		return -1;
1368 	}
1369 	err = ip_build_and_send_pkt(skb_synack, sk, ireq->loc_addr,
1370 				    ireq->rmt_addr, ireq->opt);
1371 	err = net_xmit_eval(err);
1372 	if (!err)
1373 		tcp_rsk(req)->snt_synack = tcp_time_stamp;
1374 	/* XXX (TFO) - is it ok to ignore error and continue? */
1375 
1376 	spin_lock(&queue->fastopenq->lock);
1377 	queue->fastopenq->qlen++;
1378 	spin_unlock(&queue->fastopenq->lock);
1379 
1380 	/* Initialize the child socket. Have to fix some values to take
1381 	 * into account the child is a Fast Open socket and is created
1382 	 * only out of the bits carried in the SYN packet.
1383 	 */
1384 	tp = tcp_sk(child);
1385 
1386 	tp->fastopen_rsk = req;
1387 	/* Do a hold on the listener sk so that if the listener is being
1388 	 * closed, the child that has been accepted can live on and still
1389 	 * access listen_lock.
1390 	 */
1391 	sock_hold(sk);
1392 	tcp_rsk(req)->listener = sk;
1393 
1394 	/* RFC1323: The window in SYN & SYN/ACK segments is never
1395 	 * scaled. So correct it appropriately.
1396 	 */
1397 	tp->snd_wnd = ntohs(tcp_hdr(skb)->window);
1398 
1399 	/* Activate the retrans timer so that SYNACK can be retransmitted.
1400 	 * The request socket is not added to the SYN table of the parent
1401 	 * because it's been added to the accept queue directly.
1402 	 */
1403 	inet_csk_reset_xmit_timer(child, ICSK_TIME_RETRANS,
1404 	    TCP_TIMEOUT_INIT, TCP_RTO_MAX);
1405 
1406 	/* Add the child socket directly into the accept queue */
1407 	inet_csk_reqsk_queue_add(sk, req, child);
1408 
1409 	/* Now finish processing the fastopen child socket. */
1410 	inet_csk(child)->icsk_af_ops->rebuild_header(child);
1411 	tcp_init_congestion_control(child);
1412 	tcp_mtup_init(child);
1413 	tcp_init_buffer_space(child);
1414 	tcp_init_metrics(child);
1415 
1416 	/* Queue the data carried in the SYN packet. We need to first
1417 	 * bump skb's refcnt because the caller will attempt to free it.
1418 	 *
1419 	 * XXX (TFO) - we honor a zero-payload TFO request for now.
1420 	 * (Any reason not to?)
1421 	 */
1422 	if (TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq + 1) {
1423 		/* Don't queue the skb if there is no payload in SYN.
1424 		 * XXX (TFO) - How about SYN+FIN?
1425 		 */
1426 		tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1427 	} else {
1428 		skb = skb_get(skb);
1429 		skb_dst_drop(skb);
1430 		__skb_pull(skb, tcp_hdr(skb)->doff * 4);
1431 		skb_set_owner_r(skb, child);
1432 		__skb_queue_tail(&child->sk_receive_queue, skb);
1433 		tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1434 		tp->syn_data_acked = 1;
1435 	}
1436 	sk->sk_data_ready(sk, 0);
1437 	bh_unlock_sock(child);
1438 	sock_put(child);
1439 	WARN_ON(req->sk == NULL);
1440 	return 0;
1441 }
1442 
1443 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1444 {
1445 	struct tcp_options_received tmp_opt;
1446 	struct request_sock *req;
1447 	struct inet_request_sock *ireq;
1448 	struct tcp_sock *tp = tcp_sk(sk);
1449 	struct dst_entry *dst = NULL;
1450 	__be32 saddr = ip_hdr(skb)->saddr;
1451 	__be32 daddr = ip_hdr(skb)->daddr;
1452 	__u32 isn = TCP_SKB_CB(skb)->when;
1453 	bool want_cookie = false;
1454 	struct flowi4 fl4;
1455 	struct tcp_fastopen_cookie foc = { .len = -1 };
1456 	struct tcp_fastopen_cookie valid_foc = { .len = -1 };
1457 	struct sk_buff *skb_synack;
1458 	int do_fastopen;
1459 
1460 	/* Never answer SYNs sent to broadcast or multicast addresses */
1461 	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1462 		goto drop;
1463 
1464 	/* TW buckets are converted to open requests without
1465 	 * limitation; they conserve resources and the peer is
1466 	 * evidently a real one.
1467 	 */
1468 	if ((sysctl_tcp_syncookies == 2 ||
1469 	     inet_csk_reqsk_queue_is_full(sk)) && !isn) {
1470 		want_cookie = tcp_syn_flood_action(sk, skb, "TCP");
1471 		if (!want_cookie)
1472 			goto drop;
1473 	}
1474 
1475 	/* Accept backlog is full. If we have already queued enough
1476 	 * warm entries in the syn queue, drop the request. It is better than
1477 	 * clogging the syn queue with openreqs with exponentially increasing
1478 	 * timeouts.
1479 	 */
1480 	if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) {
1481 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1482 		goto drop;
1483 	}
1484 
1485 	req = inet_reqsk_alloc(&tcp_request_sock_ops);
1486 	if (!req)
1487 		goto drop;
1488 
1489 #ifdef CONFIG_TCP_MD5SIG
1490 	tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
1491 #endif
1492 
1493 	tcp_clear_options(&tmp_opt);
1494 	tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
1495 	tmp_opt.user_mss  = tp->rx_opt.user_mss;
1496 	tcp_parse_options(skb, &tmp_opt, 0, want_cookie ? NULL : &foc);
1497 
1498 	if (want_cookie && !tmp_opt.saw_tstamp)
1499 		tcp_clear_options(&tmp_opt);
1500 
1501 	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1502 	tcp_openreq_init(req, &tmp_opt, skb);
1503 
1504 	ireq = inet_rsk(req);
1505 	ireq->loc_addr = daddr;
1506 	ireq->rmt_addr = saddr;
1507 	ireq->no_srccheck = inet_sk(sk)->transparent;
1508 	ireq->opt = tcp_v4_save_options(skb);
1509 
1510 	if (security_inet_conn_request(sk, skb, req))
1511 		goto drop_and_free;
1512 
1513 	if (!want_cookie || tmp_opt.tstamp_ok)
1514 		TCP_ECN_create_request(req, skb, sock_net(sk));
1515 
1516 	if (want_cookie) {
1517 		isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1518 		req->cookie_ts = tmp_opt.tstamp_ok;
1519 	} else if (!isn) {
1520 		/* VJ's idea. We save the last timestamp seen
1521 		 * from the destination in the peer table when entering
1522 		 * TIME-WAIT state, and check against it before
1523 		 * accepting a new connection request.
1524 		 *
1525 		 * If "isn" is not zero, this request hit an alive
1526 		 * timewait bucket, so all the necessary checks
1527 		 * are made in the function processing the timewait state.
1528 		 */
1529 		if (tmp_opt.saw_tstamp &&
1530 		    tcp_death_row.sysctl_tw_recycle &&
1531 		    (dst = inet_csk_route_req(sk, &fl4, req)) != NULL &&
1532 		    fl4.daddr == saddr) {
1533 			if (!tcp_peer_is_proven(req, dst, true)) {
1534 				NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
1535 				goto drop_and_release;
1536 			}
1537 		}
1538 		/* Kill the following clause, if you dislike this way. */
1539 		else if (!sysctl_tcp_syncookies &&
1540 			 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
1541 			  (sysctl_max_syn_backlog >> 2)) &&
1542 			 !tcp_peer_is_proven(req, dst, false)) {
1543 			/* Without syncookies, the last quarter of the
1544 			 * backlog is filled with destinations
1545 			 * proven to be alive.
1546 			 * It means that we continue to communicate
1547 			 * with destinations already remembered
1548 			 * by the moment of the synflood.
1549 			 */
1550 			LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI4/%u\n"),
1551 				       &saddr, ntohs(tcp_hdr(skb)->source));
1552 			goto drop_and_release;
1553 		}
1554 
1555 		isn = tcp_v4_init_sequence(skb);
1556 	}
1557 	tcp_rsk(req)->snt_isn = isn;
1558 
1559 	if (dst == NULL) {
1560 		dst = inet_csk_route_req(sk, &fl4, req);
1561 		if (dst == NULL)
1562 			goto drop_and_free;
1563 	}
1564 	do_fastopen = tcp_fastopen_check(sk, skb, req, &foc, &valid_foc);
1565 
1566 	/* We don't call tcp_v4_send_synack() directly because we need
1567 	 * to make sure a child socket can be created successfully before
1568 	 * sending back synack!
1569 	 *
1570 	 * XXX (TFO) - Ideally one would simply call tcp_v4_send_synack()
1571 	 * (or better yet, call tcp_send_synack() in the child context
1572 	 * directly, but we will have to fix a bunch of other code first)
1573 	 * after syn_recv_sock() except one will need to first fix the
1574 	 * latter to remove its dependency on the current implementation
1575 	 * of tcp_v4_send_synack()->tcp_select_initial_window().
1576 	 */
1577 	skb_synack = tcp_make_synack(sk, dst, req,
1578 	    fastopen_cookie_present(&valid_foc) ? &valid_foc : NULL);
1579 
1580 	if (skb_synack) {
1581 		__tcp_v4_send_check(skb_synack, ireq->loc_addr, ireq->rmt_addr);
1582 		skb_set_queue_mapping(skb_synack, skb_get_queue_mapping(skb));
1583 	} else
1584 		goto drop_and_free;
1585 
1586 	if (likely(!do_fastopen)) {
1587 		int err;
1588 		err = ip_build_and_send_pkt(skb_synack, sk, ireq->loc_addr,
1589 		     ireq->rmt_addr, ireq->opt);
1590 		err = net_xmit_eval(err);
1591 		if (err || want_cookie)
1592 			goto drop_and_free;
1593 
1594 		tcp_rsk(req)->snt_synack = tcp_time_stamp;
1595 		tcp_rsk(req)->listener = NULL;
1596 		/* Add the request_sock to the SYN table */
1597 		inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1598 		if (fastopen_cookie_present(&foc) && foc.len != 0)
1599 			NET_INC_STATS_BH(sock_net(sk),
1600 			    LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
1601 	} else if (tcp_v4_conn_req_fastopen(sk, skb, skb_synack, req))
1602 		goto drop_and_free;
1603 
1604 	return 0;
1605 
1606 drop_and_release:
1607 	dst_release(dst);
1608 drop_and_free:
1609 	reqsk_free(req);
1610 drop:
1611 	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1612 	return 0;
1613 }
1614 EXPORT_SYMBOL(tcp_v4_conn_request);
1615 
1616 
1617 /*
1618  * The three way handshake has completed - we got a valid synack -
1619  * now create the new socket.
1620  */
1621 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1622 				  struct request_sock *req,
1623 				  struct dst_entry *dst)
1624 {
1625 	struct inet_request_sock *ireq;
1626 	struct inet_sock *newinet;
1627 	struct tcp_sock *newtp;
1628 	struct sock *newsk;
1629 #ifdef CONFIG_TCP_MD5SIG
1630 	struct tcp_md5sig_key *key;
1631 #endif
1632 	struct ip_options_rcu *inet_opt;
1633 
1634 	if (sk_acceptq_is_full(sk))
1635 		goto exit_overflow;
1636 
1637 	newsk = tcp_create_openreq_child(sk, req, skb);
1638 	if (!newsk)
1639 		goto exit_nonewsk;
1640 
1641 	newsk->sk_gso_type = SKB_GSO_TCPV4;
1642 	inet_sk_rx_dst_set(newsk, skb);
1643 
1644 	newtp		      = tcp_sk(newsk);
1645 	newinet		      = inet_sk(newsk);
1646 	ireq		      = inet_rsk(req);
1647 	newinet->inet_daddr   = ireq->rmt_addr;
1648 	newinet->inet_rcv_saddr = ireq->loc_addr;
1649 	newinet->inet_saddr	      = ireq->loc_addr;
1650 	inet_opt	      = ireq->opt;
1651 	rcu_assign_pointer(newinet->inet_opt, inet_opt);
1652 	ireq->opt	      = NULL;
1653 	newinet->mc_index     = inet_iif(skb);
1654 	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1655 	newinet->rcv_tos      = ip_hdr(skb)->tos;
1656 	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1657 	if (inet_opt)
1658 		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1659 	newinet->inet_id = newtp->write_seq ^ jiffies;
1660 
1661 	if (!dst) {
1662 		dst = inet_csk_route_child_sock(sk, newsk, req);
1663 		if (!dst)
1664 			goto put_and_exit;
1665 	} else {
1666 		/* syncookie case : see end of cookie_v4_check() */
1667 	}
1668 	sk_setup_caps(newsk, dst);
1669 
1670 	tcp_mtup_init(newsk);
1671 	tcp_sync_mss(newsk, dst_mtu(dst));
1672 	newtp->advmss = dst_metric_advmss(dst);
1673 	if (tcp_sk(sk)->rx_opt.user_mss &&
1674 	    tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1675 		newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1676 
1677 	tcp_initialize_rcv_mss(newsk);
1678 
1679 #ifdef CONFIG_TCP_MD5SIG
1680 	/* Copy over the MD5 key from the original socket */
1681 	key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1682 				AF_INET);
1683 	if (key != NULL) {
1684 		/*
1685 		 * We're using one, so create a matching key
1686 		 * on the newsk structure. If we fail to get
1687 		 * memory, then we end up not copying the key
1688 		 * across. Shucks.
1689 		 */
1690 		tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1691 			       AF_INET, key->key, key->keylen, GFP_ATOMIC);
1692 		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1693 	}
1694 #endif
1695 
1696 	if (__inet_inherit_port(sk, newsk) < 0)
1697 		goto put_and_exit;
1698 	__inet_hash_nolisten(newsk, NULL);
1699 
1700 	return newsk;
1701 
1702 exit_overflow:
1703 	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1704 exit_nonewsk:
1705 	dst_release(dst);
1706 exit:
1707 	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1708 	return NULL;
1709 put_and_exit:
1710 	inet_csk_prepare_forced_close(newsk);
1711 	tcp_done(newsk);
1712 	goto exit;
1713 }
1714 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1715 
1716 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1717 {
1718 	struct tcphdr *th = tcp_hdr(skb);
1719 	const struct iphdr *iph = ip_hdr(skb);
1720 	struct sock *nsk;
1721 	struct request_sock **prev;
1722 	/* Find possible connection requests. */
1723 	struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1724 						       iph->saddr, iph->daddr);
1725 	if (req)
1726 		return tcp_check_req(sk, skb, req, prev, false);
1727 
1728 	nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
1729 			th->source, iph->daddr, th->dest, inet_iif(skb));
1730 
1731 	if (nsk) {
1732 		if (nsk->sk_state != TCP_TIME_WAIT) {
1733 			bh_lock_sock(nsk);
1734 			return nsk;
1735 		}
1736 		inet_twsk_put(inet_twsk(nsk));
1737 		return NULL;
1738 	}
1739 
1740 #ifdef CONFIG_SYN_COOKIES
1741 	if (!th->syn)
1742 		sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1743 #endif
1744 	return sk;
1745 }
1746 
1747 static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1748 {
1749 	const struct iphdr *iph = ip_hdr(skb);
1750 
1751 	if (skb->ip_summed == CHECKSUM_COMPLETE) {
1752 		if (!tcp_v4_check(skb->len, iph->saddr,
1753 				  iph->daddr, skb->csum)) {
1754 			skb->ip_summed = CHECKSUM_UNNECESSARY;
1755 			return 0;
1756 		}
1757 	}
1758 
1759 	skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
1760 				       skb->len, IPPROTO_TCP, 0);
1761 
1762 	if (skb->len <= 76) {
1763 		return __skb_checksum_complete(skb);
1764 	}
1765 	return 0;
1766 }
1767 
1768 
1769 /* The socket must have its spinlock held when we get
1770  * here.
1771  *
1772  * We have a potential double-lock case here, so even when
1773  * doing backlog processing we use the BH locking scheme.
1774  * This is because we cannot sleep with the original spinlock
1775  * held.
1776  */
1777 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1778 {
1779 	struct sock *rsk;
1780 #ifdef CONFIG_TCP_MD5SIG
1781 	/*
1782 	 * We really want to reject the packet as early as possible
1783 	 * if:
1784 	 *  o We're expecting an MD5'd packet and there is no MD5 tcp option
1785 	 *  o There is an MD5 option and we're not expecting one
1786 	 */
1787 	if (tcp_v4_inbound_md5_hash(sk, skb))
1788 		goto discard;
1789 #endif
1790 
1791 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1792 		struct dst_entry *dst = sk->sk_rx_dst;
1793 
1794 		sock_rps_save_rxhash(sk, skb);
1795 		if (dst) {
1796 			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1797 			    dst->ops->check(dst, 0) == NULL) {
1798 				dst_release(dst);
1799 				sk->sk_rx_dst = NULL;
1800 			}
1801 		}
1802 		tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len);
1803 		return 0;
1804 	}
1805 
1806 	if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1807 		goto csum_err;
1808 
1809 	if (sk->sk_state == TCP_LISTEN) {
1810 		struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1811 		if (!nsk)
1812 			goto discard;
1813 
1814 		if (nsk != sk) {
1815 			sock_rps_save_rxhash(nsk, skb);
1816 			if (tcp_child_process(sk, nsk, skb)) {
1817 				rsk = nsk;
1818 				goto reset;
1819 			}
1820 			return 0;
1821 		}
1822 	} else
1823 		sock_rps_save_rxhash(sk, skb);
1824 
1825 	if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1826 		rsk = sk;
1827 		goto reset;
1828 	}
1829 	return 0;
1830 
1831 reset:
1832 	tcp_v4_send_reset(rsk, skb);
1833 discard:
1834 	kfree_skb(skb);
1835 	/* Be careful here. If this function gets more complicated and
1836 	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1837 	 * might be destroyed here. This current version compiles correctly,
1838 	 * but you have been warned.
1839 	 */
1840 	return 0;
1841 
1842 csum_err:
1843 	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS);
1844 	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1845 	goto discard;
1846 }
1847 EXPORT_SYMBOL(tcp_v4_do_rcv);
1848 
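/* Early demux: while still on the IP receive path, try to find the
 * established socket this packet belongs to, so that its cached input
 * route (sk_rx_dst) can be attached to the skb and the later socket
 * lookup and routing decision can be avoided.
 */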
1849 void tcp_v4_early_demux(struct sk_buff *skb)
1850 {
1851 	const struct iphdr *iph;
1852 	const struct tcphdr *th;
1853 	struct sock *sk;
1854 
1855 	if (skb->pkt_type != PACKET_HOST)
1856 		return;
1857 
1858 	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1859 		return;
1860 
1861 	iph = ip_hdr(skb);
1862 	th = tcp_hdr(skb);
1863 
1864 	if (th->doff < sizeof(struct tcphdr) / 4)
1865 		return;
1866 
1867 	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1868 				       iph->saddr, th->source,
1869 				       iph->daddr, ntohs(th->dest),
1870 				       skb->skb_iif);
1871 	if (sk) {
1872 		skb->sk = sk;
1873 		skb->destructor = sock_edemux;
1874 		if (sk->sk_state != TCP_TIME_WAIT) {
1875 			struct dst_entry *dst = sk->sk_rx_dst;
1876 
1877 			if (dst)
1878 				dst = dst_check(dst, 0);
1879 			if (dst &&
1880 			    inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1881 				skb_dst_set_noref(skb, dst);
1882 		}
1883 	}
1884 }
1885 
1886 /* Packet is added to VJ-style prequeue for processing in process
1887  * context, if a reader task is waiting. Apparently, this exciting
1888  * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93)
1889  * failed somewhere. Latency? Burstiness? Well, at least now we will
1890  * see why it failed. 8)8)				  --ANK
1891  *
1892  */
1893 bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
1894 {
1895 	struct tcp_sock *tp = tcp_sk(sk);
1896 
1897 	if (sysctl_tcp_low_latency || !tp->ucopy.task)
1898 		return false;
1899 
1900 	if (skb->len <= tcp_hdrlen(skb) &&
1901 	    skb_queue_len(&tp->ucopy.prequeue) == 0)
1902 		return false;
1903 
1904 	skb_dst_force(skb);
1905 	__skb_queue_tail(&tp->ucopy.prequeue, skb);
1906 	tp->ucopy.memory += skb->truesize;
1907 	if (tp->ucopy.memory > sk->sk_rcvbuf) {
1908 		struct sk_buff *skb1;
1909 
1910 		BUG_ON(sock_owned_by_user(sk));
1911 
1912 		while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) {
1913 			sk_backlog_rcv(sk, skb1);
1914 			NET_INC_STATS_BH(sock_net(sk),
1915 					 LINUX_MIB_TCPPREQUEUEDROPPED);
1916 		}
1917 
1918 		tp->ucopy.memory = 0;
1919 	} else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
1920 		wake_up_interruptible_sync_poll(sk_sleep(sk),
1921 					   POLLIN | POLLRDNORM | POLLRDBAND);
1922 		if (!inet_csk_ack_scheduled(sk))
1923 			inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
1924 						  (3 * tcp_rto_min(sk)) / 4,
1925 						  TCP_RTO_MAX);
1926 	}
1927 	return true;
1928 }
1929 EXPORT_SYMBOL(tcp_prequeue);
1930 
1931 /*
1932  *	From tcp_input.c
1933  */
1934 
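/* Main entry point for IPv4 TCP segments handed up from IP: validate
 * the header and checksum, fill in the TCP control block, look up the
 * owning socket, and then either process the segment directly, leave
 * it on the prequeue, or append it to the socket backlog when the
 * socket is currently owned by user context.
 */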
1935 int tcp_v4_rcv(struct sk_buff *skb)
1936 {
1937 	const struct iphdr *iph;
1938 	const struct tcphdr *th;
1939 	struct sock *sk;
1940 	int ret;
1941 	struct net *net = dev_net(skb->dev);
1942 
1943 	if (skb->pkt_type != PACKET_HOST)
1944 		goto discard_it;
1945 
1946 	/* Count it even if it's bad */
1947 	TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1948 
1949 	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1950 		goto discard_it;
1951 
1952 	th = tcp_hdr(skb);
1953 
1954 	if (th->doff < sizeof(struct tcphdr) / 4)
1955 		goto bad_packet;
1956 	if (!pskb_may_pull(skb, th->doff * 4))
1957 		goto discard_it;
1958 
1959 	/* An explanation is required here, I think.
1960 	 * Packet length and doff are validated by header prediction,
1961 	 * provided the case of th->doff==0 is eliminated.
1962 	 * So, we defer the checks. */
1963 	if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
1964 		goto csum_error;
1965 
1966 	th = tcp_hdr(skb);
1967 	iph = ip_hdr(skb);
1968 	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1969 	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1970 				    skb->len - th->doff * 4);
1971 	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1972 	TCP_SKB_CB(skb)->when	 = 0;
1973 	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1974 	TCP_SKB_CB(skb)->sacked	 = 0;
1975 
1976 	sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
1977 	if (!sk)
1978 		goto no_tcp_socket;
1979 
1980 process:
1981 	if (sk->sk_state == TCP_TIME_WAIT)
1982 		goto do_time_wait;
1983 
1984 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1985 		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
1986 		goto discard_and_relse;
1987 	}
1988 
1989 	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1990 		goto discard_and_relse;
1991 	nf_reset(skb);
1992 
1993 	if (sk_filter(sk, skb))
1994 		goto discard_and_relse;
1995 
1996 	sk_mark_napi_id(sk, skb);
1997 	skb->dev = NULL;
1998 
1999 	bh_lock_sock_nested(sk);
2000 	ret = 0;
2001 	if (!sock_owned_by_user(sk)) {
2002 #ifdef CONFIG_NET_DMA
2003 		struct tcp_sock *tp = tcp_sk(sk);
2004 		if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
2005 			tp->ucopy.dma_chan = net_dma_find_channel();
2006 		if (tp->ucopy.dma_chan)
2007 			ret = tcp_v4_do_rcv(sk, skb);
2008 		else
2009 #endif
2010 		{
2011 			if (!tcp_prequeue(sk, skb))
2012 				ret = tcp_v4_do_rcv(sk, skb);
2013 		}
2014 	} else if (unlikely(sk_add_backlog(sk, skb,
2015 					   sk->sk_rcvbuf + sk->sk_sndbuf))) {
2016 		bh_unlock_sock(sk);
2017 		NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
2018 		goto discard_and_relse;
2019 	}
2020 	bh_unlock_sock(sk);
2021 
2022 	sock_put(sk);
2023 
2024 	return ret;
2025 
2026 no_tcp_socket:
2027 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2028 		goto discard_it;
2029 
2030 	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
2031 csum_error:
2032 		TCP_INC_STATS_BH(net, TCP_MIB_CSUMERRORS);
2033 bad_packet:
2034 		TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
2035 	} else {
2036 		tcp_v4_send_reset(NULL, skb);
2037 	}
2038 
2039 discard_it:
2040 	/* Discard frame. */
2041 	kfree_skb(skb);
2042 	return 0;
2043 
2044 discard_and_relse:
2045 	sock_put(sk);
2046 	goto discard_it;
2047 
2048 do_time_wait:
2049 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2050 		inet_twsk_put(inet_twsk(sk));
2051 		goto discard_it;
2052 	}
2053 
2054 	if (skb->len < (th->doff << 2)) {
2055 		inet_twsk_put(inet_twsk(sk));
2056 		goto bad_packet;
2057 	}
2058 	if (tcp_checksum_complete(skb)) {
2059 		inet_twsk_put(inet_twsk(sk));
2060 		goto csum_error;
2061 	}
2062 	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2063 	case TCP_TW_SYN: {
2064 		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
2065 							&tcp_hashinfo,
2066 							iph->saddr, th->source,
2067 							iph->daddr, th->dest,
2068 							inet_iif(skb));
2069 		if (sk2) {
2070 			inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
2071 			inet_twsk_put(inet_twsk(sk));
2072 			sk = sk2;
2073 			goto process;
2074 		}
2075 		/* Fall through to ACK */
2076 	}
2077 	case TCP_TW_ACK:
2078 		tcp_v4_timewait_ack(sk, skb);
2079 		break;
2080 	case TCP_TW_RST:
2081 		goto no_tcp_socket;
2082 	case TCP_TW_SUCCESS:;
2083 	}
2084 	goto discard_it;
2085 }
2086 
2087 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2088 	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
2089 	.twsk_unique	= tcp_twsk_unique,
2090 	.twsk_destructor= tcp_twsk_destructor,
2091 };
2092 
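/* Cache the skb's input route and incoming interface on the socket;
 * the established fast path and early demux check these to avoid a
 * fresh route lookup for later packets of the same flow.
 */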
2093 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2094 {
2095 	struct dst_entry *dst = skb_dst(skb);
2096 
2097 	dst_hold(dst);
2098 	sk->sk_rx_dst = dst;
2099 	inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
2100 }
2101 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2102 
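/* Address-family specific operations used by the generic connection
 * code for TCP sockets running over plain IPv4.
 */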
2103 const struct inet_connection_sock_af_ops ipv4_specific = {
2104 	.queue_xmit	   = ip_queue_xmit,
2105 	.send_check	   = tcp_v4_send_check,
2106 	.rebuild_header	   = inet_sk_rebuild_header,
2107 	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
2108 	.conn_request	   = tcp_v4_conn_request,
2109 	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
2110 	.net_header_len	   = sizeof(struct iphdr),
2111 	.setsockopt	   = ip_setsockopt,
2112 	.getsockopt	   = ip_getsockopt,
2113 	.addr2sockaddr	   = inet_csk_addr2sockaddr,
2114 	.sockaddr_len	   = sizeof(struct sockaddr_in),
2115 	.bind_conflict	   = inet_csk_bind_conflict,
2116 #ifdef CONFIG_COMPAT
2117 	.compat_setsockopt = compat_ip_setsockopt,
2118 	.compat_getsockopt = compat_ip_getsockopt,
2119 #endif
2120 };
2121 EXPORT_SYMBOL(ipv4_specific);
2122 
2123 #ifdef CONFIG_TCP_MD5SIG
2124 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2125 	.md5_lookup		= tcp_v4_md5_lookup,
2126 	.calc_md5_hash		= tcp_v4_md5_hash_skb,
2127 	.md5_parse		= tcp_v4_parse_md5_keys,
2128 };
2129 #endif
2130 
2131 /* NOTE: A lot of things are set to zero explicitly by the call to
2132  *       sk_alloc(), so they need not be done here.
2133  */
2134 static int tcp_v4_init_sock(struct sock *sk)
2135 {
2136 	struct inet_connection_sock *icsk = inet_csk(sk);
2137 
2138 	tcp_init_sock(sk);
2139 
2140 	icsk->icsk_af_ops = &ipv4_specific;
2141 
2142 #ifdef CONFIG_TCP_MD5SIG
2143 	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2144 #endif
2145 
2146 	return 0;
2147 }
2148 
2149 void tcp_v4_destroy_sock(struct sock *sk)
2150 {
2151 	struct tcp_sock *tp = tcp_sk(sk);
2152 
2153 	tcp_clear_xmit_timers(sk);
2154 
2155 	tcp_cleanup_congestion_control(sk);
2156 
2157 	/* Clean up the write buffer. */
2158 	tcp_write_queue_purge(sk);
2159 
2160 	/* Cleans up our, hopefully empty, out_of_order_queue. */
2161 	__skb_queue_purge(&tp->out_of_order_queue);
2162 
2163 #ifdef CONFIG_TCP_MD5SIG
2164 	/* Clean up the MD5 key list, if any */
2165 	if (tp->md5sig_info) {
2166 		tcp_clear_md5_list(sk);
2167 		kfree_rcu(tp->md5sig_info, rcu);
2168 		tp->md5sig_info = NULL;
2169 	}
2170 #endif
2171 
2172 #ifdef CONFIG_NET_DMA
2173 	/* Cleans up our sk_async_wait_queue */
2174 	__skb_queue_purge(&sk->sk_async_wait_queue);
2175 #endif
2176 
2177 	/* Clean the prequeue; it should really be empty by now. */
2178 	__skb_queue_purge(&tp->ucopy.prequeue);
2179 
2180 	/* Clean up a referenced TCP bind bucket. */
2181 	if (inet_csk(sk)->icsk_bind_hash)
2182 		inet_put_port(sk);
2183 
2184 	BUG_ON(tp->fastopen_rsk != NULL);
2185 
2186 	/* If socket is aborted during connect operation */
2187 	tcp_free_fastopen_req(tp);
2188 
2189 	sk_sockets_allocated_dec(sk);
2190 	sock_release_memcg(sk);
2191 }
2192 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2193 
2194 #ifdef CONFIG_PROC_FS
2195 /* Proc filesystem TCP sock list dumping. */
2196 
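/* The seq_file iterator first walks the listening hash (and, for each
 * listener, its SYN queue) and then the established hash, whose buckets
 * also carry the TIME_WAIT chain; st->state records which phase we are
 * in so that start/next/stop take and drop the matching lock.
 */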
2197 static inline struct inet_timewait_sock *tw_head(struct hlist_nulls_head *head)
2198 {
2199 	return hlist_nulls_empty(head) ? NULL :
2200 		list_entry(head->first, struct inet_timewait_sock, tw_node);
2201 }
2202 
2203 static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
2204 {
2205 	return !is_a_nulls(tw->tw_node.next) ?
2206 		hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
2207 }
2208 
2209 /*
2210  * Get the next listener socket following cur.  If cur is NULL, get the first socket
2211  * starting from bucket given in st->bucket; when st->bucket is zero the
2212  * very first socket in the hash table is returned.
2213  */
2214 static void *listening_get_next(struct seq_file *seq, void *cur)
2215 {
2216 	struct inet_connection_sock *icsk;
2217 	struct hlist_nulls_node *node;
2218 	struct sock *sk = cur;
2219 	struct inet_listen_hashbucket *ilb;
2220 	struct tcp_iter_state *st = seq->private;
2221 	struct net *net = seq_file_net(seq);
2222 
2223 	if (!sk) {
2224 		ilb = &tcp_hashinfo.listening_hash[st->bucket];
2225 		spin_lock_bh(&ilb->lock);
2226 		sk = sk_nulls_head(&ilb->head);
2227 		st->offset = 0;
2228 		goto get_sk;
2229 	}
2230 	ilb = &tcp_hashinfo.listening_hash[st->bucket];
2231 	++st->num;
2232 	++st->offset;
2233 
2234 	if (st->state == TCP_SEQ_STATE_OPENREQ) {
2235 		struct request_sock *req = cur;
2236 
2237 		icsk = inet_csk(st->syn_wait_sk);
2238 		req = req->dl_next;
2239 		while (1) {
2240 			while (req) {
2241 				if (req->rsk_ops->family == st->family) {
2242 					cur = req;
2243 					goto out;
2244 				}
2245 				req = req->dl_next;
2246 			}
2247 			if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
2248 				break;
2249 get_req:
2250 			req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
2251 		}
2252 		sk	  = sk_nulls_next(st->syn_wait_sk);
2253 		st->state = TCP_SEQ_STATE_LISTENING;
2254 		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2255 	} else {
2256 		icsk = inet_csk(sk);
2257 		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2258 		if (reqsk_queue_len(&icsk->icsk_accept_queue))
2259 			goto start_req;
2260 		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2261 		sk = sk_nulls_next(sk);
2262 	}
2263 get_sk:
2264 	sk_nulls_for_each_from(sk, node) {
2265 		if (!net_eq(sock_net(sk), net))
2266 			continue;
2267 		if (sk->sk_family == st->family) {
2268 			cur = sk;
2269 			goto out;
2270 		}
2271 		icsk = inet_csk(sk);
2272 		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2273 		if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
2274 start_req:
2275 			st->uid		= sock_i_uid(sk);
2276 			st->syn_wait_sk = sk;
2277 			st->state	= TCP_SEQ_STATE_OPENREQ;
2278 			st->sbucket	= 0;
2279 			goto get_req;
2280 		}
2281 		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2282 	}
2283 	spin_unlock_bh(&ilb->lock);
2284 	st->offset = 0;
2285 	if (++st->bucket < INET_LHTABLE_SIZE) {
2286 		ilb = &tcp_hashinfo.listening_hash[st->bucket];
2287 		spin_lock_bh(&ilb->lock);
2288 		sk = sk_nulls_head(&ilb->head);
2289 		goto get_sk;
2290 	}
2291 	cur = NULL;
2292 out:
2293 	return cur;
2294 }
2295 
2296 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2297 {
2298 	struct tcp_iter_state *st = seq->private;
2299 	void *rc;
2300 
2301 	st->bucket = 0;
2302 	st->offset = 0;
2303 	rc = listening_get_next(seq, NULL);
2304 
2305 	while (rc && *pos) {
2306 		rc = listening_get_next(seq, rc);
2307 		--*pos;
2308 	}
2309 	return rc;
2310 }
2311 
2312 static inline bool empty_bucket(struct tcp_iter_state *st)
2313 {
2314 	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
2315 		hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
2316 }
2317 
2318 /*
2319  * Get the first established socket, starting from the bucket given in st->bucket.
2320  * If st->bucket is zero, the very first socket in the hash is returned.
2321  */
2322 static void *established_get_first(struct seq_file *seq)
2323 {
2324 	struct tcp_iter_state *st = seq->private;
2325 	struct net *net = seq_file_net(seq);
2326 	void *rc = NULL;
2327 
2328 	st->offset = 0;
2329 	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2330 		struct sock *sk;
2331 		struct hlist_nulls_node *node;
2332 		struct inet_timewait_sock *tw;
2333 		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2334 
2335 		/* Lockless fast path for the common case of empty buckets */
2336 		if (empty_bucket(st))
2337 			continue;
2338 
2339 		spin_lock_bh(lock);
2340 		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2341 			if (sk->sk_family != st->family ||
2342 			    !net_eq(sock_net(sk), net)) {
2343 				continue;
2344 			}
2345 			rc = sk;
2346 			goto out;
2347 		}
2348 		st->state = TCP_SEQ_STATE_TIME_WAIT;
2349 		inet_twsk_for_each(tw, node,
2350 				   &tcp_hashinfo.ehash[st->bucket].twchain) {
2351 			if (tw->tw_family != st->family ||
2352 			    !net_eq(twsk_net(tw), net)) {
2353 				continue;
2354 			}
2355 			rc = tw;
2356 			goto out;
2357 		}
2358 		spin_unlock_bh(lock);
2359 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2360 	}
2361 out:
2362 	return rc;
2363 }
2364 
2365 static void *established_get_next(struct seq_file *seq, void *cur)
2366 {
2367 	struct sock *sk = cur;
2368 	struct inet_timewait_sock *tw;
2369 	struct hlist_nulls_node *node;
2370 	struct tcp_iter_state *st = seq->private;
2371 	struct net *net = seq_file_net(seq);
2372 
2373 	++st->num;
2374 	++st->offset;
2375 
2376 	if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2377 		tw = cur;
2378 		tw = tw_next(tw);
2379 get_tw:
2380 		while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
2381 			tw = tw_next(tw);
2382 		}
2383 		if (tw) {
2384 			cur = tw;
2385 			goto out;
2386 		}
2387 		spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2388 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2389 
2390 		/* Look for the next non-empty bucket */
2391 		st->offset = 0;
2392 		while (++st->bucket <= tcp_hashinfo.ehash_mask &&
2393 				empty_bucket(st))
2394 			;
2395 		if (st->bucket > tcp_hashinfo.ehash_mask)
2396 			return NULL;
2397 
2398 		spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2399 		sk = sk_nulls_head(&tcp_hashinfo.ehash[st->bucket].chain);
2400 	} else
2401 		sk = sk_nulls_next(sk);
2402 
2403 	sk_nulls_for_each_from(sk, node) {
2404 		if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2405 			goto found;
2406 	}
2407 
2408 	st->state = TCP_SEQ_STATE_TIME_WAIT;
2409 	tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
2410 	goto get_tw;
2411 found:
2412 	cur = sk;
2413 out:
2414 	return cur;
2415 }
2416 
2417 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2418 {
2419 	struct tcp_iter_state *st = seq->private;
2420 	void *rc;
2421 
2422 	st->bucket = 0;
2423 	rc = established_get_first(seq);
2424 
2425 	while (rc && pos) {
2426 		rc = established_get_next(seq, rc);
2427 		--pos;
2428 	}
2429 	return rc;
2430 }
2431 
2432 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2433 {
2434 	void *rc;
2435 	struct tcp_iter_state *st = seq->private;
2436 
2437 	st->state = TCP_SEQ_STATE_LISTENING;
2438 	rc	  = listening_get_idx(seq, &pos);
2439 
2440 	if (!rc) {
2441 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2442 		rc	  = established_get_idx(seq, pos);
2443 	}
2444 
2445 	return rc;
2446 }
2447 
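/* Try to resume the dump at the bucket/offset saved by the previous
 * read instead of re-walking the whole table from the start; st->num
 * is preserved across the seek so entry numbering stays continuous.
 */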
2448 static void *tcp_seek_last_pos(struct seq_file *seq)
2449 {
2450 	struct tcp_iter_state *st = seq->private;
2451 	int offset = st->offset;
2452 	int orig_num = st->num;
2453 	void *rc = NULL;
2454 
2455 	switch (st->state) {
2456 	case TCP_SEQ_STATE_OPENREQ:
2457 	case TCP_SEQ_STATE_LISTENING:
2458 		if (st->bucket >= INET_LHTABLE_SIZE)
2459 			break;
2460 		st->state = TCP_SEQ_STATE_LISTENING;
2461 		rc = listening_get_next(seq, NULL);
2462 		while (offset-- && rc)
2463 			rc = listening_get_next(seq, rc);
2464 		if (rc)
2465 			break;
2466 		st->bucket = 0;
2467 		/* Fallthrough */
2468 	case TCP_SEQ_STATE_ESTABLISHED:
2469 	case TCP_SEQ_STATE_TIME_WAIT:
2470 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2471 		if (st->bucket > tcp_hashinfo.ehash_mask)
2472 			break;
2473 		rc = established_get_first(seq);
2474 		while (offset-- && rc)
2475 			rc = established_get_next(seq, rc);
2476 	}
2477 
2478 	st->num = orig_num;
2479 
2480 	return rc;
2481 }
2482 
2483 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2484 {
2485 	struct tcp_iter_state *st = seq->private;
2486 	void *rc;
2487 
2488 	if (*pos && *pos == st->last_pos) {
2489 		rc = tcp_seek_last_pos(seq);
2490 		if (rc)
2491 			goto out;
2492 	}
2493 
2494 	st->state = TCP_SEQ_STATE_LISTENING;
2495 	st->num = 0;
2496 	st->bucket = 0;
2497 	st->offset = 0;
2498 	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2499 
2500 out:
2501 	st->last_pos = *pos;
2502 	return rc;
2503 }
2504 
2505 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2506 {
2507 	struct tcp_iter_state *st = seq->private;
2508 	void *rc = NULL;
2509 
2510 	if (v == SEQ_START_TOKEN) {
2511 		rc = tcp_get_idx(seq, 0);
2512 		goto out;
2513 	}
2514 
2515 	switch (st->state) {
2516 	case TCP_SEQ_STATE_OPENREQ:
2517 	case TCP_SEQ_STATE_LISTENING:
2518 		rc = listening_get_next(seq, v);
2519 		if (!rc) {
2520 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2521 			st->bucket = 0;
2522 			st->offset = 0;
2523 			rc	  = established_get_first(seq);
2524 		}
2525 		break;
2526 	case TCP_SEQ_STATE_ESTABLISHED:
2527 	case TCP_SEQ_STATE_TIME_WAIT:
2528 		rc = established_get_next(seq, v);
2529 		break;
2530 	}
2531 out:
2532 	++*pos;
2533 	st->last_pos = *pos;
2534 	return rc;
2535 }
2536 
2537 static void tcp_seq_stop(struct seq_file *seq, void *v)
2538 {
2539 	struct tcp_iter_state *st = seq->private;
2540 
2541 	switch (st->state) {
2542 	case TCP_SEQ_STATE_OPENREQ:
2543 		if (v) {
2544 			struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2545 			read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2546 		}
2547 	case TCP_SEQ_STATE_LISTENING:
2548 		if (v != SEQ_START_TOKEN)
2549 			spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2550 		break;
2551 	case TCP_SEQ_STATE_TIME_WAIT:
2552 	case TCP_SEQ_STATE_ESTABLISHED:
2553 		if (v)
2554 			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2555 		break;
2556 	}
2557 }
2558 
2559 int tcp_seq_open(struct inode *inode, struct file *file)
2560 {
2561 	struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
2562 	struct tcp_iter_state *s;
2563 	int err;
2564 
2565 	err = seq_open_net(inode, file, &afinfo->seq_ops,
2566 			  sizeof(struct tcp_iter_state));
2567 	if (err < 0)
2568 		return err;
2569 
2570 	s = ((struct seq_file *)file->private_data)->private;
2571 	s->family		= afinfo->family;
2572 	s->last_pos 		= 0;
2573 	return 0;
2574 }
2575 EXPORT_SYMBOL(tcp_seq_open);
2576 
2577 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2578 {
2579 	int rc = 0;
2580 	struct proc_dir_entry *p;
2581 
2582 	afinfo->seq_ops.start		= tcp_seq_start;
2583 	afinfo->seq_ops.next		= tcp_seq_next;
2584 	afinfo->seq_ops.stop		= tcp_seq_stop;
2585 
2586 	p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2587 			     afinfo->seq_fops, afinfo);
2588 	if (!p)
2589 		rc = -ENOMEM;
2590 	return rc;
2591 }
2592 EXPORT_SYMBOL(tcp_proc_register);
2593 
2594 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2595 {
2596 	remove_proc_entry(afinfo->name, net->proc_net);
2597 }
2598 EXPORT_SYMBOL(tcp_proc_unregister);
2599 
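/* Emit one SYN_RECV (open request) entry in the /proc/net/tcp format:
 * addresses and ports, state, timer information, uid and the request
 * pointer; the caller pads the line out to a fixed width.
 */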
2600 static void get_openreq4(const struct sock *sk, const struct request_sock *req,
2601 			 struct seq_file *f, int i, kuid_t uid, int *len)
2602 {
2603 	const struct inet_request_sock *ireq = inet_rsk(req);
2604 	long delta = req->expires - jiffies;
2605 
2606 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2607 		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK%n",
2608 		i,
2609 		ireq->loc_addr,
2610 		ntohs(inet_sk(sk)->inet_sport),
2611 		ireq->rmt_addr,
2612 		ntohs(ireq->rmt_port),
2613 		TCP_SYN_RECV,
2614 		0, 0, /* could print option size, but that is af dependent. */
2615 		1,    /* timers active (only the expire timer) */
2616 		jiffies_delta_to_clock_t(delta),
2617 		req->num_timeout,
2618 		from_kuid_munged(seq_user_ns(f), uid),
2619 		0,  /* non standard timer */
2620 		0, /* open_requests have no inode */
2621 		atomic_read(&sk->sk_refcnt),
2622 		req,
2623 		len);
2624 }
2625 
2626 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2627 {
2628 	int timer_active;
2629 	unsigned long timer_expires;
2630 	const struct tcp_sock *tp = tcp_sk(sk);
2631 	const struct inet_connection_sock *icsk = inet_csk(sk);
2632 	const struct inet_sock *inet = inet_sk(sk);
2633 	struct fastopen_queue *fastopenq = icsk->icsk_accept_queue.fastopenq;
2634 	__be32 dest = inet->inet_daddr;
2635 	__be32 src = inet->inet_rcv_saddr;
2636 	__u16 destp = ntohs(inet->inet_dport);
2637 	__u16 srcp = ntohs(inet->inet_sport);
2638 	int rx_queue;
2639 
2640 	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2641 	    icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
2642 	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2643 		timer_active	= 1;
2644 		timer_expires	= icsk->icsk_timeout;
2645 	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2646 		timer_active	= 4;
2647 		timer_expires	= icsk->icsk_timeout;
2648 	} else if (timer_pending(&sk->sk_timer)) {
2649 		timer_active	= 2;
2650 		timer_expires	= sk->sk_timer.expires;
2651 	} else {
2652 		timer_active	= 0;
2653 		timer_expires = jiffies;
2654 	}
2655 
2656 	if (sk->sk_state == TCP_LISTEN)
2657 		rx_queue = sk->sk_ack_backlog;
2658 	else
2659 		/*
2660 		 * Because we don't lock the socket, we might find a transient negative value.
2661 		 */
2662 		rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2663 
2664 	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2665 			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d%n",
2666 		i, src, srcp, dest, destp, sk->sk_state,
2667 		tp->write_seq - tp->snd_una,
2668 		rx_queue,
2669 		timer_active,
2670 		jiffies_delta_to_clock_t(timer_expires - jiffies),
2671 		icsk->icsk_retransmits,
2672 		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2673 		icsk->icsk_probes_out,
2674 		sock_i_ino(sk),
2675 		atomic_read(&sk->sk_refcnt), sk,
2676 		jiffies_to_clock_t(icsk->icsk_rto),
2677 		jiffies_to_clock_t(icsk->icsk_ack.ato),
2678 		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2679 		tp->snd_cwnd,
2680 		sk->sk_state == TCP_LISTEN ?
2681 		    (fastopenq ? fastopenq->max_qlen : 0) :
2682 		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh),
2683 		len);
2684 }
2685 
2686 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2687 			       struct seq_file *f, int i, int *len)
2688 {
2689 	__be32 dest, src;
2690 	__u16 destp, srcp;
2691 	long delta = tw->tw_ttd - jiffies;
2692 
2693 	dest  = tw->tw_daddr;
2694 	src   = tw->tw_rcv_saddr;
2695 	destp = ntohs(tw->tw_dport);
2696 	srcp  = ntohs(tw->tw_sport);
2697 
2698 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2699 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK%n",
2700 		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2701 		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2702 		atomic_read(&tw->tw_refcnt), tw, len);
2703 }
2704 
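/* Each record emitted by tcp4_seq_show() is padded out to TMPSZ - 1
 * characters, so every line in /proc/net/tcp has the same width.
 */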
2705 #define TMPSZ 150
2706 
2707 static int tcp4_seq_show(struct seq_file *seq, void *v)
2708 {
2709 	struct tcp_iter_state *st;
2710 	int len;
2711 
2712 	if (v == SEQ_START_TOKEN) {
2713 		seq_printf(seq, "%-*s\n", TMPSZ - 1,
2714 			   "  sl  local_address rem_address   st tx_queue "
2715 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2716 			   "inode");
2717 		goto out;
2718 	}
2719 	st = seq->private;
2720 
2721 	switch (st->state) {
2722 	case TCP_SEQ_STATE_LISTENING:
2723 	case TCP_SEQ_STATE_ESTABLISHED:
2724 		get_tcp4_sock(v, seq, st->num, &len);
2725 		break;
2726 	case TCP_SEQ_STATE_OPENREQ:
2727 		get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
2728 		break;
2729 	case TCP_SEQ_STATE_TIME_WAIT:
2730 		get_timewait4_sock(v, seq, st->num, &len);
2731 		break;
2732 	}
2733 	seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
2734 out:
2735 	return 0;
2736 }
2737 
2738 static const struct file_operations tcp_afinfo_seq_fops = {
2739 	.owner   = THIS_MODULE,
2740 	.open    = tcp_seq_open,
2741 	.read    = seq_read,
2742 	.llseek  = seq_lseek,
2743 	.release = seq_release_net
2744 };
2745 
2746 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2747 	.name		= "tcp",
2748 	.family		= AF_INET,
2749 	.seq_fops	= &tcp_afinfo_seq_fops,
2750 	.seq_ops	= {
2751 		.show		= tcp4_seq_show,
2752 	},
2753 };
2754 
2755 static int __net_init tcp4_proc_init_net(struct net *net)
2756 {
2757 	return tcp_proc_register(net, &tcp4_seq_afinfo);
2758 }
2759 
2760 static void __net_exit tcp4_proc_exit_net(struct net *net)
2761 {
2762 	tcp_proc_unregister(net, &tcp4_seq_afinfo);
2763 }
2764 
2765 static struct pernet_operations tcp4_net_ops = {
2766 	.init = tcp4_proc_init_net,
2767 	.exit = tcp4_proc_exit_net,
2768 };
2769 
2770 int __init tcp4_proc_init(void)
2771 {
2772 	return register_pernet_subsys(&tcp4_net_ops);
2773 }
2774 
2775 void tcp4_proc_exit(void)
2776 {
2777 	unregister_pernet_subsys(&tcp4_net_ops);
2778 }
2779 #endif /* CONFIG_PROC_FS */
2780 
2781 struct proto tcp_prot = {
2782 	.name			= "TCP",
2783 	.owner			= THIS_MODULE,
2784 	.close			= tcp_close,
2785 	.connect		= tcp_v4_connect,
2786 	.disconnect		= tcp_disconnect,
2787 	.accept			= inet_csk_accept,
2788 	.ioctl			= tcp_ioctl,
2789 	.init			= tcp_v4_init_sock,
2790 	.destroy		= tcp_v4_destroy_sock,
2791 	.shutdown		= tcp_shutdown,
2792 	.setsockopt		= tcp_setsockopt,
2793 	.getsockopt		= tcp_getsockopt,
2794 	.recvmsg		= tcp_recvmsg,
2795 	.sendmsg		= tcp_sendmsg,
2796 	.sendpage		= tcp_sendpage,
2797 	.backlog_rcv		= tcp_v4_do_rcv,
2798 	.release_cb		= tcp_release_cb,
2799 	.mtu_reduced		= tcp_v4_mtu_reduced,
2800 	.hash			= inet_hash,
2801 	.unhash			= inet_unhash,
2802 	.get_port		= inet_csk_get_port,
2803 	.enter_memory_pressure	= tcp_enter_memory_pressure,
2804 	.stream_memory_free	= tcp_stream_memory_free,
2805 	.sockets_allocated	= &tcp_sockets_allocated,
2806 	.orphan_count		= &tcp_orphan_count,
2807 	.memory_allocated	= &tcp_memory_allocated,
2808 	.memory_pressure	= &tcp_memory_pressure,
2809 	.sysctl_wmem		= sysctl_tcp_wmem,
2810 	.sysctl_rmem		= sysctl_tcp_rmem,
2811 	.max_header		= MAX_TCP_HEADER,
2812 	.obj_size		= sizeof(struct tcp_sock),
2813 	.slab_flags		= SLAB_DESTROY_BY_RCU,
2814 	.twsk_prot		= &tcp_timewait_sock_ops,
2815 	.rsk_prot		= &tcp_request_sock_ops,
2816 	.h.hashinfo		= &tcp_hashinfo,
2817 	.no_autobind		= true,
2818 #ifdef CONFIG_COMPAT
2819 	.compat_setsockopt	= compat_tcp_setsockopt,
2820 	.compat_getsockopt	= compat_tcp_getsockopt,
2821 #endif
2822 #ifdef CONFIG_MEMCG_KMEM
2823 	.init_cgroup		= tcp_init_cgroup,
2824 	.destroy_cgroup		= tcp_destroy_cgroup,
2825 	.proto_cgroup		= tcp_proto_cgroup,
2826 #endif
2827 };
2828 EXPORT_SYMBOL(tcp_prot);
2829 
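/* Per-namespace initialisation.  sysctl_tcp_ecn defaults to 2, meaning
 * ECN is used when requested by the peer but is not requested on
 * outgoing connections.
 */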
2830 static int __net_init tcp_sk_init(struct net *net)
2831 {
2832 	net->ipv4.sysctl_tcp_ecn = 2;
2833 	return 0;
2834 }
2835 
2836 static void __net_exit tcp_sk_exit(struct net *net)
2837 {
2838 }
2839 
2840 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2841 {
2842 	inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2843 }
2844 
2845 static struct pernet_operations __net_initdata tcp_sk_ops = {
2846        .init	   = tcp_sk_init,
2847        .exit	   = tcp_sk_exit,
2848        .exit_batch = tcp_sk_exit_batch,
2849 };
2850 
2851 void __init tcp_v4_init(void)
2852 {
2853 	inet_hashinfo_init(&tcp_hashinfo);
2854 	if (register_pernet_subsys(&tcp_sk_ops))
2855 		panic("Failed to create the TCP control socket.\n");
2856 }
2857