xref: /openbmc/linux/net/ipv4/tcp_ipv4.c (revision e0f3d4c2)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		Implementation of the Transmission Control Protocol(TCP).
7  *
8  *		IPv4 specific functions
9  *
10  *
11  *		code split from:
12  *		linux/ipv4/tcp.c
13  *		linux/ipv4/tcp_input.c
14  *		linux/ipv4/tcp_output.c
15  *
16  *		See tcp.c for author information
17  *
18  *	This program is free software; you can redistribute it and/or
19  *      modify it under the terms of the GNU General Public License
20  *      as published by the Free Software Foundation; either version
21  *      2 of the License, or (at your option) any later version.
22  */
23 
24 /*
25  * Changes:
26  *		David S. Miller	:	New socket lookup architecture.
27  *					This code is dedicated to John Dyson.
28  *		David S. Miller :	Change semantics of established hash,
29  *					half is devoted to TIME_WAIT sockets
30  *					and the rest go in the other half.
31  *		Andi Kleen :		Add support for syncookies and fixed
32  *					some bugs: ip options weren't passed to
33  *					the TCP layer, missed a check for an
34  *					ACK bit.
35  *		Andi Kleen :		Implemented fast path mtu discovery.
36  *	     				Fixed many serious bugs in the
37  *					request_sock handling and moved
38  *					most of it into the af independent code.
39  *					Added tail drop and some other bugfixes.
40  *					Added new listen semantics.
41  *		Mike McLagan	:	Routing by source
42  *	Juan Jose Ciarlante:		ip_dynaddr bits
43  *		Andi Kleen:		various fixes.
44  *	Vitaly E. Lavrov	:	Transparent proxy revived after year
45  *					coma.
46  *	Andi Kleen		:	Fix new listen.
47  *	Andi Kleen		:	Fix accept error reporting.
48  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
49  *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
50  *					a single port at the same time.
51  */
52 
53 #define pr_fmt(fmt) "TCP: " fmt
54 
55 #include <linux/bottom_half.h>
56 #include <linux/types.h>
57 #include <linux/fcntl.h>
58 #include <linux/module.h>
59 #include <linux/random.h>
60 #include <linux/cache.h>
61 #include <linux/jhash.h>
62 #include <linux/init.h>
63 #include <linux/times.h>
64 #include <linux/slab.h>
65 
66 #include <net/net_namespace.h>
67 #include <net/icmp.h>
68 #include <net/inet_hashtables.h>
69 #include <net/tcp.h>
70 #include <net/transp_v6.h>
71 #include <net/ipv6.h>
72 #include <net/inet_common.h>
73 #include <net/timewait_sock.h>
74 #include <net/xfrm.h>
75 #include <net/secure_seq.h>
76 #include <net/busy_poll.h>
77 
78 #include <linux/inet.h>
79 #include <linux/ipv6.h>
80 #include <linux/stddef.h>
81 #include <linux/proc_fs.h>
82 #include <linux/seq_file.h>
83 #include <linux/inetdevice.h>
84 
85 #include <crypto/hash.h>
86 #include <linux/scatterlist.h>
87 
88 #include <trace/events/tcp.h>
89 
90 #ifdef CONFIG_TCP_MD5SIG
91 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
92 			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
93 #endif
94 
95 struct inet_hashinfo tcp_hashinfo;
96 EXPORT_SYMBOL(tcp_hashinfo);
97 
98 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
99 {
100 	return secure_tcp_seq(ip_hdr(skb)->daddr,
101 			      ip_hdr(skb)->saddr,
102 			      tcp_hdr(skb)->dest,
103 			      tcp_hdr(skb)->source);
104 }
105 
106 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
107 {
108 	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
109 }
110 
111 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
112 {
113 	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
114 	struct tcp_sock *tp = tcp_sk(sk);
115 
116 	/* With PAWS, it is safe from the viewpoint
117 	   of data integrity. Even without PAWS it is safe provided sequence
118 	   spaces do not overlap, i.e. at data rates <= 80Mbit/sec.
119 
120 	   Actually, the idea is close to VJ's: the timestamp cache is held
121 	   not per host but per port pair, and the TW bucket is used as the
122 	   state holder.
123 
124 	   If the TW bucket has already been destroyed, we fall back to VJ's
125 	   scheme and use the initial timestamp retrieved from the peer table.
126 	 */
127 	if (tcptw->tw_ts_recent_stamp &&
128 	    (!twp || (sock_net(sk)->ipv4.sysctl_tcp_tw_reuse &&
129 			     get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
130 		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
131 		if (tp->write_seq == 0)
132 			tp->write_seq = 1;
133 		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
134 		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
135 		sock_hold(sktw);
136 		return 1;
137 	}
138 
139 	return 0;
140 }
141 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
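/*
 * Illustration (user-space sketch, not kernel code): the reuse path above
 * only fires when the tcp_tw_reuse sysctl is enabled and the peer's last
 * timestamp is more than one second old.  A minimal way to flip that knob
 * from C is to write to procfs; the path below is the usual one, but
 * verify it on your system.
 */
#include <stdio.h>

static int enable_tcp_tw_reuse(void)
{
	FILE *f = fopen("/proc/sys/net/ipv4/tcp_tw_reuse", "w");

	if (!f)
		return -1;		/* typically needs root */
	fputs("1\n", f);		/* 1 = allow reusing TIME-WAIT port
					 * pairs for outgoing connections
					 * when PAWS says it is safe */
	return fclose(f);
}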
142 
143 /* This will initiate an outgoing connection. */
144 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
145 {
146 	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
147 	struct inet_sock *inet = inet_sk(sk);
148 	struct tcp_sock *tp = tcp_sk(sk);
149 	__be16 orig_sport, orig_dport;
150 	__be32 daddr, nexthop;
151 	struct flowi4 *fl4;
152 	struct rtable *rt;
153 	int err;
154 	struct ip_options_rcu *inet_opt;
155 	struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
156 
157 	if (addr_len < sizeof(struct sockaddr_in))
158 		return -EINVAL;
159 
160 	if (usin->sin_family != AF_INET)
161 		return -EAFNOSUPPORT;
162 
163 	nexthop = daddr = usin->sin_addr.s_addr;
164 	inet_opt = rcu_dereference_protected(inet->inet_opt,
165 					     lockdep_sock_is_held(sk));
166 	if (inet_opt && inet_opt->opt.srr) {
167 		if (!daddr)
168 			return -EINVAL;
169 		nexthop = inet_opt->opt.faddr;
170 	}
171 
172 	orig_sport = inet->inet_sport;
173 	orig_dport = usin->sin_port;
174 	fl4 = &inet->cork.fl.u.ip4;
175 	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
176 			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
177 			      IPPROTO_TCP,
178 			      orig_sport, orig_dport, sk);
179 	if (IS_ERR(rt)) {
180 		err = PTR_ERR(rt);
181 		if (err == -ENETUNREACH)
182 			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
183 		return err;
184 	}
185 
186 	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
187 		ip_rt_put(rt);
188 		return -ENETUNREACH;
189 	}
190 
191 	if (!inet_opt || !inet_opt->opt.srr)
192 		daddr = fl4->daddr;
193 
194 	if (!inet->inet_saddr)
195 		inet->inet_saddr = fl4->saddr;
196 	sk_rcv_saddr_set(sk, inet->inet_saddr);
197 
198 	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
199 		/* Reset inherited state */
200 		tp->rx_opt.ts_recent	   = 0;
201 		tp->rx_opt.ts_recent_stamp = 0;
202 		if (likely(!tp->repair))
203 			tp->write_seq	   = 0;
204 	}
205 
206 	inet->inet_dport = usin->sin_port;
207 	sk_daddr_set(sk, daddr);
208 
209 	inet_csk(sk)->icsk_ext_hdr_len = 0;
210 	if (inet_opt)
211 		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
212 
213 	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
214 
215 	/* Socket identity is still unknown (sport may be zero).
216 	 * However we set the state to SYN-SENT and, without releasing the
217 	 * socket lock, select a source port, enter ourselves into the hash
218 	 * tables and complete the initialization afterwards.
219 	 */
220 	tcp_set_state(sk, TCP_SYN_SENT);
221 	err = inet_hash_connect(tcp_death_row, sk);
222 	if (err)
223 		goto failure;
224 
225 	sk_set_txhash(sk);
226 
227 	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
228 			       inet->inet_sport, inet->inet_dport, sk);
229 	if (IS_ERR(rt)) {
230 		err = PTR_ERR(rt);
231 		rt = NULL;
232 		goto failure;
233 	}
234 	/* OK, now commit destination to socket.  */
235 	sk->sk_gso_type = SKB_GSO_TCPV4;
236 	sk_setup_caps(sk, &rt->dst);
237 	rt = NULL;
238 
239 	if (likely(!tp->repair)) {
240 		if (!tp->write_seq)
241 			tp->write_seq = secure_tcp_seq(inet->inet_saddr,
242 						       inet->inet_daddr,
243 						       inet->inet_sport,
244 						       usin->sin_port);
245 		tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
246 						 inet->inet_saddr,
247 						 inet->inet_daddr);
248 	}
249 
250 	inet->inet_id = tp->write_seq ^ jiffies;
251 
252 	if (tcp_fastopen_defer_connect(sk, &err))
253 		return err;
254 	if (err)
255 		goto failure;
256 
257 	err = tcp_connect(sk);
258 
259 	if (err)
260 		goto failure;
261 
262 	return 0;
263 
264 failure:
265 	/*
266 	 * This unhashes the socket and releases the local port,
267 	 * if necessary.
268 	 */
269 	tcp_set_state(sk, TCP_CLOSE);
270 	ip_rt_put(rt);
271 	sk->sk_route_caps = 0;
272 	inet->inet_dport = 0;
273 	return err;
274 }
275 EXPORT_SYMBOL(tcp_v4_connect);
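/*
 * Illustration (user-space sketch, not kernel code): tcp_v4_connect() above
 * is what a plain connect() on an AF_INET stream socket eventually reaches
 * via inet_stream_connect().  A minimal caller looks roughly like this;
 * the address and port are arbitrary examples.
 */
#include <arpa/inet.h>
#include <sys/socket.h>
#include <unistd.h>

static int sample_tcp_connect(void)
{
	struct sockaddr_in dst = {
		.sin_family = AF_INET,
		.sin_port   = htons(80),		/* example port */
	};
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	if (fd < 0)
		return -1;
	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);	/* example address */
	if (connect(fd, (struct sockaddr *)&dst, sizeof(dst)) < 0) {
		close(fd);				/* SYN failed or refused */
		return -1;
	}
	return fd;					/* established socket */
}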
276 
277 /*
278  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
279  * It can be called through tcp_release_cb() if the socket was owned by
280  * the user at the time tcp_v4_err() was called to handle the ICMP message.
281  */
282 void tcp_v4_mtu_reduced(struct sock *sk)
283 {
284 	struct inet_sock *inet = inet_sk(sk);
285 	struct dst_entry *dst;
286 	u32 mtu;
287 
288 	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
289 		return;
290 	mtu = tcp_sk(sk)->mtu_info;
291 	dst = inet_csk_update_pmtu(sk, mtu);
292 	if (!dst)
293 		return;
294 
295 	/* Something is about to go wrong... Remember the soft error
296 	 * in case this connection is not able to recover.
297 	 */
298 	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
299 		sk->sk_err_soft = EMSGSIZE;
300 
301 	mtu = dst_mtu(dst);
302 
303 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
304 	    ip_sk_accept_pmtu(sk) &&
305 	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
306 		tcp_sync_mss(sk, mtu);
307 
308 		/* Resend the TCP packet because it's
309 		 * clear that the old packet has been
310 		 * dropped. This is the new "fast" path mtu
311 		 * discovery.
312 		 */
313 		tcp_simple_retransmit(sk);
314 	} /* else let the usual retransmit timer handle it */
315 }
316 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
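/*
 * Illustration (user-space sketch, not kernel code): the pmtudisc check in
 * tcp_v4_mtu_reduced() above is driven by the per-socket IP_MTU_DISCOVER
 * setting.  A connected socket can request strict PMTU discovery and later
 * read back the cached path MTU roughly as follows (Linux-specific options).
 */
#include <netinet/in.h>
#include <sys/socket.h>

static int use_strict_pmtu(int fd)
{
	int val = IP_PMTUDISC_DO;	/* always set DF, never fragment locally */
	socklen_t len = sizeof(val);

	if (setsockopt(fd, IPPROTO_IP, IP_MTU_DISCOVER, &val, sizeof(val)))
		return -1;
	/* after an ICMP_FRAG_NEEDED has been processed, IP_MTU reflects the
	 * path MTU cached for the connected destination */
	if (getsockopt(fd, IPPROTO_IP, IP_MTU, &val, &len))
		return -1;
	return val;
}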
317 
318 static void do_redirect(struct sk_buff *skb, struct sock *sk)
319 {
320 	struct dst_entry *dst = __sk_dst_check(sk, 0);
321 
322 	if (dst)
323 		dst->ops->redirect(dst, sk, skb);
324 }
325 
326 
327 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
328 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
329 {
330 	struct request_sock *req = inet_reqsk(sk);
331 	struct net *net = sock_net(sk);
332 
333 	/* ICMPs are not backlogged, hence we cannot get
334 	 * an established socket here.
335 	 */
336 	if (seq != tcp_rsk(req)->snt_isn) {
337 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
338 	} else if (abort) {
339 		/*
340 		 * Still in SYN_RECV, just remove it silently.
341 		 * There is no good way to pass the error to the newly
342 		 * created socket, and POSIX does not want network
343 		 * errors returned from accept().
344 		 */
345 		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
346 		tcp_listendrop(req->rsk_listener);
347 	}
348 	reqsk_put(req);
349 }
350 EXPORT_SYMBOL(tcp_req_err);
351 
352 /*
353  * This routine is called by the ICMP module when it gets some
354  * sort of error condition.  If err < 0 then the socket should
355  * be closed and the error returned to the user.  If err > 0
356  * it's just the icmp type << 8 | icmp code.  After adjustment, the
357  * header points to the first 8 bytes of the tcp header.  We need
358  * to find the appropriate port.
359  *
360  * The locking strategy used here is very "optimistic". When
361  * someone else accesses the socket, the ICMP is just dropped,
362  * and for some paths there is no check at all.
363  * A more general error queue to queue errors for later handling
364  * would probably be better.
365  *
366  */
367 
368 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
369 {
370 	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
371 	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
372 	struct inet_connection_sock *icsk;
373 	struct tcp_sock *tp;
374 	struct inet_sock *inet;
375 	const int type = icmp_hdr(icmp_skb)->type;
376 	const int code = icmp_hdr(icmp_skb)->code;
377 	struct sock *sk;
378 	struct sk_buff *skb;
379 	struct request_sock *fastopen;
380 	u32 seq, snd_una;
381 	s32 remaining;
382 	u32 delta_us;
383 	int err;
384 	struct net *net = dev_net(icmp_skb->dev);
385 
386 	sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
387 				       th->dest, iph->saddr, ntohs(th->source),
388 				       inet_iif(icmp_skb), 0);
389 	if (!sk) {
390 		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
391 		return;
392 	}
393 	if (sk->sk_state == TCP_TIME_WAIT) {
394 		inet_twsk_put(inet_twsk(sk));
395 		return;
396 	}
397 	seq = ntohl(th->seq);
398 	if (sk->sk_state == TCP_NEW_SYN_RECV)
399 		return tcp_req_err(sk, seq,
400 				  type == ICMP_PARAMETERPROB ||
401 				  type == ICMP_TIME_EXCEEDED ||
402 				  (type == ICMP_DEST_UNREACH &&
403 				   (code == ICMP_NET_UNREACH ||
404 				    code == ICMP_HOST_UNREACH)));
405 
406 	bh_lock_sock(sk);
407 	/* If too many ICMPs get dropped on busy
408 	 * servers this needs to be solved differently.
409 	 * We do take care of the PMTU discovery (RFC1191) special case:
410 	 * we can receive locally generated ICMP messages while the socket is held.
411 	 */
412 	if (sock_owned_by_user(sk)) {
413 		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
414 			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
415 	}
416 	if (sk->sk_state == TCP_CLOSE)
417 		goto out;
418 
419 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
420 		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
421 		goto out;
422 	}
423 
424 	icsk = inet_csk(sk);
425 	tp = tcp_sk(sk);
426 	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
427 	fastopen = tp->fastopen_rsk;
428 	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
429 	if (sk->sk_state != TCP_LISTEN &&
430 	    !between(seq, snd_una, tp->snd_nxt)) {
431 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
432 		goto out;
433 	}
434 
435 	switch (type) {
436 	case ICMP_REDIRECT:
437 		if (!sock_owned_by_user(sk))
438 			do_redirect(icmp_skb, sk);
439 		goto out;
440 	case ICMP_SOURCE_QUENCH:
441 		/* Just silently ignore these. */
442 		goto out;
443 	case ICMP_PARAMETERPROB:
444 		err = EPROTO;
445 		break;
446 	case ICMP_DEST_UNREACH:
447 		if (code > NR_ICMP_UNREACH)
448 			goto out;
449 
450 		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
451 			/* We are not interested in TCP_LISTEN and open_requests
452 			 * (SYN-ACKs sent out by Linux are always < 576 bytes,
453 			 * so they should go through unfragmented).
454 			 */
455 			if (sk->sk_state == TCP_LISTEN)
456 				goto out;
457 
458 			tp->mtu_info = info;
459 			if (!sock_owned_by_user(sk)) {
460 				tcp_v4_mtu_reduced(sk);
461 			} else {
462 				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
463 					sock_hold(sk);
464 			}
465 			goto out;
466 		}
467 
468 		err = icmp_err_convert[code].errno;
469 		/* check if icmp_skb allows revert of backoff
470 		 * (see draft-zimmermann-tcp-lcd) */
471 		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
472 			break;
473 		if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
474 		    !icsk->icsk_backoff || fastopen)
475 			break;
476 
477 		if (sock_owned_by_user(sk))
478 			break;
479 
480 		icsk->icsk_backoff--;
481 		icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
482 					       TCP_TIMEOUT_INIT;
483 		icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
484 
485 		skb = tcp_rtx_queue_head(sk);
486 		BUG_ON(!skb);
487 
488 		tcp_mstamp_refresh(tp);
489 		delta_us = (u32)(tp->tcp_mstamp - skb->skb_mstamp);
490 		remaining = icsk->icsk_rto -
491 			    usecs_to_jiffies(delta_us);
492 
493 		if (remaining > 0) {
494 			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
495 						  remaining, TCP_RTO_MAX);
496 		} else {
497 			/* RTO revert clocked out retransmission.
498 			 * Will retransmit now */
499 			tcp_retransmit_timer(sk);
500 		}
501 
502 		break;
503 	case ICMP_TIME_EXCEEDED:
504 		err = EHOSTUNREACH;
505 		break;
506 	default:
507 		goto out;
508 	}
509 
510 	switch (sk->sk_state) {
511 	case TCP_SYN_SENT:
512 	case TCP_SYN_RECV:
513 		/* Only in fast or simultaneous open. If a fast open socket
514 		 * is already accepted it is treated as a connected one below.
515 		 */
516 		if (fastopen && !fastopen->sk)
517 			break;
518 
519 		if (!sock_owned_by_user(sk)) {
520 			sk->sk_err = err;
521 
522 			sk->sk_error_report(sk);
523 
524 			tcp_done(sk);
525 		} else {
526 			sk->sk_err_soft = err;
527 		}
528 		goto out;
529 	}
530 
531 	/* If we've already connected we will keep trying
532 	 * until we time out, or the user gives up.
533 	 *
534 	 * rfc1122 4.2.3.9 allows us to treat as hard errors
535 	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
536 	 * but it is obsoleted by pmtu discovery).
537 	 *
538 	 * Note that in the modern internet, where routing is unreliable
539 	 * and broken firewalls sit in every dark corner sending random
540 	 * errors ordered by their masters, even these two messages finally
541 	 * lose their original sense (even Linux sends invalid PORT_UNREACHs).
542 	 *
543 	 * Now we are in compliance with RFCs.
544 	 *							--ANK (980905)
545 	 */
546 
547 	inet = inet_sk(sk);
548 	if (!sock_owned_by_user(sk) && inet->recverr) {
549 		sk->sk_err = err;
550 		sk->sk_error_report(sk);
551 	} else	{ /* Only an error on timeout */
552 		sk->sk_err_soft = err;
553 	}
554 
555 out:
556 	bh_unlock_sock(sk);
557 	sock_put(sk);
558 }
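/*
 * Illustration (user-space sketch, not kernel code): the inet->recverr
 * branch above is what applications enable with IP_RECVERR.  With it set,
 * ICMP-derived errors are queued per socket and can be drained from the
 * error queue roughly like this.
 */
#include <linux/errqueue.h>
#include <netinet/in.h>
#include <string.h>
#include <sys/socket.h>

static int drain_icmp_errors(int fd)
{
	struct sock_extended_err *ee;
	char control[512];
	struct msghdr msg;
	struct cmsghdr *cm;
	int one = 1;

	if (setsockopt(fd, IPPROTO_IP, IP_RECVERR, &one, sizeof(one)))
		return -1;

	memset(&msg, 0, sizeof(msg));
	msg.msg_control = control;
	msg.msg_controllen = sizeof(control);
	if (recvmsg(fd, &msg, MSG_ERRQUEUE | MSG_DONTWAIT) < 0)
		return 0;				/* nothing queued yet */

	for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm)) {
		if (cm->cmsg_level != IPPROTO_IP || cm->cmsg_type != IP_RECVERR)
			continue;
		ee = (struct sock_extended_err *)CMSG_DATA(cm);
		if (ee->ee_origin == SO_EE_ORIGIN_ICMP)
			return ee->ee_errno;		/* e.g. EHOSTUNREACH */
	}
	return 0;
}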
559 
560 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
561 {
562 	struct tcphdr *th = tcp_hdr(skb);
563 
564 	if (skb->ip_summed == CHECKSUM_PARTIAL) {
565 		th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
566 		skb->csum_start = skb_transport_header(skb) - skb->head;
567 		skb->csum_offset = offsetof(struct tcphdr, check);
568 	} else {
569 		th->check = tcp_v4_check(skb->len, saddr, daddr,
570 					 csum_partial(th,
571 						      th->doff << 2,
572 						      skb->csum));
573 	}
574 }
575 
576 /* This routine computes an IPv4 TCP checksum. */
577 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
578 {
579 	const struct inet_sock *inet = inet_sk(sk);
580 
581 	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
582 }
583 EXPORT_SYMBOL(tcp_v4_send_check);
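/*
 * Illustration (stand-alone sketch, not kernel code): a simplified model of
 * the value the two helpers above produce -- a 16-bit one's-complement sum
 * over the IPv4 pseudo-header followed by the TCP header and payload
 * (RFC 793 / RFC 1071).  The kernel folds partial sums incrementally and
 * usually leaves the final fold to the NIC (CHECKSUM_PARTIAL).
 */
#include <stddef.h>
#include <stdint.h>

static uint16_t tcp4_checksum_model(const uint8_t saddr[4],
				    const uint8_t daddr[4],
				    const uint8_t *tcp, size_t len)
{
	uint32_t sum = 0;
	size_t i;

	/* pseudo-header: saddr, daddr, zero, protocol (6), TCP length */
	sum += ((uint32_t)saddr[0] << 8) | saddr[1];
	sum += ((uint32_t)saddr[2] << 8) | saddr[3];
	sum += ((uint32_t)daddr[0] << 8) | daddr[1];
	sum += ((uint32_t)daddr[2] << 8) | daddr[3];
	sum += 6;				/* IPPROTO_TCP */
	sum += (uint32_t)len;			/* TCP header + payload length */

	/* TCP header (check field zeroed) and payload, as 16-bit words */
	for (i = 0; i + 1 < len; i += 2)
		sum += ((uint32_t)tcp[i] << 8) | tcp[i + 1];
	if (len & 1)
		sum += (uint32_t)tcp[len - 1] << 8;

	while (sum >> 16)			/* fold the carries back in */
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;			/* stored big-endian in th->check */
}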
584 
585 /*
586  *	This routine will send an RST to the other tcp.
587  *
588  *	Someone asks: why do I NEVER use socket parameters (TOS, TTL, etc.)
589  *		      for the reset.
590  *	Answer: if a packet caused an RST, it is not for a socket
591  *		existing in our system; if it is matched to a socket,
592  *		it is just a duplicate segment or a bug in the other side's TCP.
593  *		So we build the reply based only on the parameters
594  *		that arrived with the segment.
595  *	Exception: precedence violation. We do not implement it in any case.
596  */
597 
598 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
599 {
600 	const struct tcphdr *th = tcp_hdr(skb);
601 	struct {
602 		struct tcphdr th;
603 #ifdef CONFIG_TCP_MD5SIG
604 		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
605 #endif
606 	} rep;
607 	struct ip_reply_arg arg;
608 #ifdef CONFIG_TCP_MD5SIG
609 	struct tcp_md5sig_key *key = NULL;
610 	const __u8 *hash_location = NULL;
611 	unsigned char newhash[16];
612 	int genhash;
613 	struct sock *sk1 = NULL;
614 #endif
615 	struct net *net;
616 
617 	/* Never send a reset in response to a reset. */
618 	if (th->rst)
619 		return;
620 
621 	/* If sk is not NULL, it means we did a successful lookup and the
622 	 * incoming route had to be correct. prequeue might have dropped our dst.
623 	 */
624 	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
625 		return;
626 
627 	/* Swap the send and the receive. */
628 	memset(&rep, 0, sizeof(rep));
629 	rep.th.dest   = th->source;
630 	rep.th.source = th->dest;
631 	rep.th.doff   = sizeof(struct tcphdr) / 4;
632 	rep.th.rst    = 1;
633 
634 	if (th->ack) {
635 		rep.th.seq = th->ack_seq;
636 	} else {
637 		rep.th.ack = 1;
638 		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
639 				       skb->len - (th->doff << 2));
640 	}
641 
642 	memset(&arg, 0, sizeof(arg));
643 	arg.iov[0].iov_base = (unsigned char *)&rep;
644 	arg.iov[0].iov_len  = sizeof(rep.th);
645 
646 	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
647 #ifdef CONFIG_TCP_MD5SIG
648 	rcu_read_lock();
649 	hash_location = tcp_parse_md5sig_option(th);
650 	if (sk && sk_fullsock(sk)) {
651 		key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
652 					&ip_hdr(skb)->saddr, AF_INET);
653 	} else if (hash_location) {
654 		/*
655 		 * The active side is lost. Try to find the listening socket
656 		 * through the source port, and then find the md5 key through
657 		 * the listening socket. We do not loosen security here:
658 		 * the incoming packet is checked against the md5 hash of the
659 		 * key we find; no RST is generated if the hash doesn't match.
660 		 */
661 		sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
662 					     ip_hdr(skb)->saddr,
663 					     th->source, ip_hdr(skb)->daddr,
664 					     ntohs(th->source), inet_iif(skb),
665 					     tcp_v4_sdif(skb));
666 		/* don't send an RST if we can't find a key */
667 		if (!sk1)
668 			goto out;
669 
670 		key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
671 					&ip_hdr(skb)->saddr, AF_INET);
672 		if (!key)
673 			goto out;
674 
675 
676 		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
677 		if (genhash || memcmp(hash_location, newhash, 16) != 0)
678 			goto out;
679 
680 	}
681 
682 	if (key) {
683 		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
684 				   (TCPOPT_NOP << 16) |
685 				   (TCPOPT_MD5SIG << 8) |
686 				   TCPOLEN_MD5SIG);
687 		/* Update length and the length the header thinks exists */
688 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
689 		rep.th.doff = arg.iov[0].iov_len / 4;
690 
691 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
692 				     key, ip_hdr(skb)->saddr,
693 				     ip_hdr(skb)->daddr, &rep.th);
694 	}
695 #endif
696 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
697 				      ip_hdr(skb)->saddr, /* XXX */
698 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
699 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
700 	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
701 
702 	/* When the socket is gone, all binding information is lost.
703 	 * Routing might fail in this case. No choice here: if we choose to force
704 	 * the input interface, we will misroute in case of an asymmetric route.
705 	 */
706 	if (sk) {
707 		arg.bound_dev_if = sk->sk_bound_dev_if;
708 		trace_tcp_send_reset(sk, skb);
709 	}
710 
711 	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
712 		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));
713 
714 	arg.tos = ip_hdr(skb)->tos;
715 	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
716 	local_bh_disable();
717 	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
718 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
719 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
720 			      &arg, arg.iov[0].iov_len);
721 
722 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
723 	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
724 	local_bh_enable();
725 
726 #ifdef CONFIG_TCP_MD5SIG
727 out:
728 	rcu_read_unlock();
729 #endif
730 }
731 
732 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
733    outside of socket context, is certainly ugly. What can I do?
734  */
735 
736 static void tcp_v4_send_ack(const struct sock *sk,
737 			    struct sk_buff *skb, u32 seq, u32 ack,
738 			    u32 win, u32 tsval, u32 tsecr, int oif,
739 			    struct tcp_md5sig_key *key,
740 			    int reply_flags, u8 tos)
741 {
742 	const struct tcphdr *th = tcp_hdr(skb);
743 	struct {
744 		struct tcphdr th;
745 		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
746 #ifdef CONFIG_TCP_MD5SIG
747 			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
748 #endif
749 			];
750 	} rep;
751 	struct net *net = sock_net(sk);
752 	struct ip_reply_arg arg;
753 
754 	memset(&rep.th, 0, sizeof(struct tcphdr));
755 	memset(&arg, 0, sizeof(arg));
756 
757 	arg.iov[0].iov_base = (unsigned char *)&rep;
758 	arg.iov[0].iov_len  = sizeof(rep.th);
759 	if (tsecr) {
760 		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
761 				   (TCPOPT_TIMESTAMP << 8) |
762 				   TCPOLEN_TIMESTAMP);
763 		rep.opt[1] = htonl(tsval);
764 		rep.opt[2] = htonl(tsecr);
765 		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
766 	}
767 
768 	/* Swap the send and the receive. */
769 	rep.th.dest    = th->source;
770 	rep.th.source  = th->dest;
771 	rep.th.doff    = arg.iov[0].iov_len / 4;
772 	rep.th.seq     = htonl(seq);
773 	rep.th.ack_seq = htonl(ack);
774 	rep.th.ack     = 1;
775 	rep.th.window  = htons(win);
776 
777 #ifdef CONFIG_TCP_MD5SIG
778 	if (key) {
779 		int offset = (tsecr) ? 3 : 0;
780 
781 		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
782 					  (TCPOPT_NOP << 16) |
783 					  (TCPOPT_MD5SIG << 8) |
784 					  TCPOLEN_MD5SIG);
785 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
786 		rep.th.doff = arg.iov[0].iov_len/4;
787 
788 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
789 				    key, ip_hdr(skb)->saddr,
790 				    ip_hdr(skb)->daddr, &rep.th);
791 	}
792 #endif
793 	arg.flags = reply_flags;
794 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
795 				      ip_hdr(skb)->saddr, /* XXX */
796 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
797 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
798 	if (oif)
799 		arg.bound_dev_if = oif;
800 	arg.tos = tos;
801 	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
802 	local_bh_disable();
803 	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
804 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
805 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
806 			      &arg, arg.iov[0].iov_len);
807 
808 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
809 	local_bh_enable();
810 }
811 
812 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
813 {
814 	struct inet_timewait_sock *tw = inet_twsk(sk);
815 	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
816 
817 	tcp_v4_send_ack(sk, skb,
818 			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
819 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
820 			tcp_time_stamp_raw() + tcptw->tw_ts_offset,
821 			tcptw->tw_ts_recent,
822 			tw->tw_bound_dev_if,
823 			tcp_twsk_md5_key(tcptw),
824 			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
825 			tw->tw_tos
826 			);
827 
828 	inet_twsk_put(tw);
829 }
830 
831 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
832 				  struct request_sock *req)
833 {
834 	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
835 	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
836 	 */
837 	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
838 					     tcp_sk(sk)->snd_nxt;
839 
840 	/* RFC 7323 2.3
841 	 * The window field (SEG.WND) of every outgoing segment, with the
842 	 * exception of <SYN> segments, MUST be right-shifted by
843 	 * Rcv.Wind.Shift bits:
844 	 */
845 	tcp_v4_send_ack(sk, skb, seq,
846 			tcp_rsk(req)->rcv_nxt,
847 			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
848 			tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
849 			req->ts_recent,
850 			0,
851 			tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->saddr,
852 					  AF_INET),
853 			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
854 			ip_hdr(skb)->tos);
855 }
856 
857 /*
858  *	Send a SYN-ACK after having received a SYN.
859  *	This still operates on a request_sock only, not on a big
860  *	socket.
861  */
862 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
863 			      struct flowi *fl,
864 			      struct request_sock *req,
865 			      struct tcp_fastopen_cookie *foc,
866 			      enum tcp_synack_type synack_type)
867 {
868 	const struct inet_request_sock *ireq = inet_rsk(req);
869 	struct flowi4 fl4;
870 	int err = -1;
871 	struct sk_buff *skb;
872 
873 	/* First, grab a route. */
874 	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
875 		return -1;
876 
877 	skb = tcp_make_synack(sk, dst, req, foc, synack_type);
878 
879 	if (skb) {
880 		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
881 
882 		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
883 					    ireq->ir_rmt_addr,
884 					    ireq_opt_deref(ireq));
885 		err = net_xmit_eval(err);
886 	}
887 
888 	return err;
889 }
890 
891 /*
892  *	IPv4 request_sock destructor.
893  */
894 static void tcp_v4_reqsk_destructor(struct request_sock *req)
895 {
896 	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
897 }
898 
899 #ifdef CONFIG_TCP_MD5SIG
900 /*
901  * RFC2385 MD5 checksumming requires a mapping of
902  * IP address->MD5 Key.
903  * We need to maintain these in the sk structure.
904  */
905 
906 /* Find the Key structure for an address.  */
907 struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
908 					 const union tcp_md5_addr *addr,
909 					 int family)
910 {
911 	const struct tcp_sock *tp = tcp_sk(sk);
912 	struct tcp_md5sig_key *key;
913 	const struct tcp_md5sig_info *md5sig;
914 	__be32 mask;
915 	struct tcp_md5sig_key *best_match = NULL;
916 	bool match;
917 
918 	/* caller either holds rcu_read_lock() or socket lock */
919 	md5sig = rcu_dereference_check(tp->md5sig_info,
920 				       lockdep_sock_is_held(sk));
921 	if (!md5sig)
922 		return NULL;
923 
924 	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
925 		if (key->family != family)
926 			continue;
927 
928 		if (family == AF_INET) {
929 			mask = inet_make_mask(key->prefixlen);
930 			match = (key->addr.a4.s_addr & mask) ==
931 				(addr->a4.s_addr & mask);
932 #if IS_ENABLED(CONFIG_IPV6)
933 		} else if (family == AF_INET6) {
934 			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
935 						  key->prefixlen);
936 #endif
937 		} else {
938 			match = false;
939 		}
940 
941 		if (match && (!best_match ||
942 			      key->prefixlen > best_match->prefixlen))
943 			best_match = key;
944 	}
945 	return best_match;
946 }
947 EXPORT_SYMBOL(tcp_md5_do_lookup);
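/*
 * Illustration (stand-alone sketch, not kernel code): the lookup above is a
 * longest-prefix match over the configured keys.  A stripped-down model of
 * the IPv4 branch, using a plain array instead of the RCU hlist, behaves as
 * follows; the struct name is made up for the example.
 */
#include <stddef.h>
#include <stdint.h>

struct md5_key_model {
	uint32_t addr;			/* IPv4 address, host byte order */
	uint8_t  prefixlen;		/* 0..32 */
};

static const struct md5_key_model *
md5_lookup_model(const struct md5_key_model *keys, size_t nkeys, uint32_t addr)
{
	const struct md5_key_model *best = NULL;
	size_t i;

	for (i = 0; i < nkeys; i++) {
		uint32_t mask = keys[i].prefixlen ?
				~0u << (32 - keys[i].prefixlen) : 0;

		if ((keys[i].addr & mask) != (addr & mask))
			continue;
		/* keep the most specific (longest prefix) match */
		if (!best || keys[i].prefixlen > best->prefixlen)
			best = &keys[i];
	}
	return best;
}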
948 
949 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
950 						      const union tcp_md5_addr *addr,
951 						      int family, u8 prefixlen)
952 {
953 	const struct tcp_sock *tp = tcp_sk(sk);
954 	struct tcp_md5sig_key *key;
955 	unsigned int size = sizeof(struct in_addr);
956 	const struct tcp_md5sig_info *md5sig;
957 
958 	/* caller either holds rcu_read_lock() or socket lock */
959 	md5sig = rcu_dereference_check(tp->md5sig_info,
960 				       lockdep_sock_is_held(sk));
961 	if (!md5sig)
962 		return NULL;
963 #if IS_ENABLED(CONFIG_IPV6)
964 	if (family == AF_INET6)
965 		size = sizeof(struct in6_addr);
966 #endif
967 	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
968 		if (key->family != family)
969 			continue;
970 		if (!memcmp(&key->addr, addr, size) &&
971 		    key->prefixlen == prefixlen)
972 			return key;
973 	}
974 	return NULL;
975 }
976 
977 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
978 					 const struct sock *addr_sk)
979 {
980 	const union tcp_md5_addr *addr;
981 
982 	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
983 	return tcp_md5_do_lookup(sk, addr, AF_INET);
984 }
985 EXPORT_SYMBOL(tcp_v4_md5_lookup);
986 
987 /* This can be called on a newly created socket, from other files */
988 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
989 		   int family, u8 prefixlen, const u8 *newkey, u8 newkeylen,
990 		   gfp_t gfp)
991 {
992 	/* Add Key to the list */
993 	struct tcp_md5sig_key *key;
994 	struct tcp_sock *tp = tcp_sk(sk);
995 	struct tcp_md5sig_info *md5sig;
996 
997 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
998 	if (key) {
999 		/* Pre-existing entry - just update that one. */
1000 		memcpy(key->key, newkey, newkeylen);
1001 		key->keylen = newkeylen;
1002 		return 0;
1003 	}
1004 
1005 	md5sig = rcu_dereference_protected(tp->md5sig_info,
1006 					   lockdep_sock_is_held(sk));
1007 	if (!md5sig) {
1008 		md5sig = kmalloc(sizeof(*md5sig), gfp);
1009 		if (!md5sig)
1010 			return -ENOMEM;
1011 
1012 		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1013 		INIT_HLIST_HEAD(&md5sig->head);
1014 		rcu_assign_pointer(tp->md5sig_info, md5sig);
1015 	}
1016 
1017 	key = sock_kmalloc(sk, sizeof(*key), gfp);
1018 	if (!key)
1019 		return -ENOMEM;
1020 	if (!tcp_alloc_md5sig_pool()) {
1021 		sock_kfree_s(sk, key, sizeof(*key));
1022 		return -ENOMEM;
1023 	}
1024 
1025 	memcpy(key->key, newkey, newkeylen);
1026 	key->keylen = newkeylen;
1027 	key->family = family;
1028 	key->prefixlen = prefixlen;
1029 	memcpy(&key->addr, addr,
1030 	       (family == AF_INET6) ? sizeof(struct in6_addr) :
1031 				      sizeof(struct in_addr));
1032 	hlist_add_head_rcu(&key->node, &md5sig->head);
1033 	return 0;
1034 }
1035 EXPORT_SYMBOL(tcp_md5_do_add);
1036 
1037 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1038 		   u8 prefixlen)
1039 {
1040 	struct tcp_md5sig_key *key;
1041 
1042 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
1043 	if (!key)
1044 		return -ENOENT;
1045 	hlist_del_rcu(&key->node);
1046 	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1047 	kfree_rcu(key, rcu);
1048 	return 0;
1049 }
1050 EXPORT_SYMBOL(tcp_md5_do_del);
1051 
1052 static void tcp_clear_md5_list(struct sock *sk)
1053 {
1054 	struct tcp_sock *tp = tcp_sk(sk);
1055 	struct tcp_md5sig_key *key;
1056 	struct hlist_node *n;
1057 	struct tcp_md5sig_info *md5sig;
1058 
1059 	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1060 
1061 	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1062 		hlist_del_rcu(&key->node);
1063 		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1064 		kfree_rcu(key, rcu);
1065 	}
1066 }
1067 
1068 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1069 				 char __user *optval, int optlen)
1070 {
1071 	struct tcp_md5sig cmd;
1072 	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1073 	u8 prefixlen = 32;
1074 
1075 	if (optlen < sizeof(cmd))
1076 		return -EINVAL;
1077 
1078 	if (copy_from_user(&cmd, optval, sizeof(cmd)))
1079 		return -EFAULT;
1080 
1081 	if (sin->sin_family != AF_INET)
1082 		return -EINVAL;
1083 
1084 	if (optname == TCP_MD5SIG_EXT &&
1085 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1086 		prefixlen = cmd.tcpm_prefixlen;
1087 		if (prefixlen > 32)
1088 			return -EINVAL;
1089 	}
1090 
1091 	if (!cmd.tcpm_keylen)
1092 		return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1093 				      AF_INET, prefixlen);
1094 
1095 	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1096 		return -EINVAL;
1097 
1098 	return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1099 			      AF_INET, prefixlen, cmd.tcpm_key, cmd.tcpm_keylen,
1100 			      GFP_KERNEL);
1101 }
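/*
 * Illustration (user-space sketch, not kernel code): the parser above
 * services the TCP_MD5SIG and TCP_MD5SIG_EXT socket options.  Installing a
 * key for a given peer looks roughly like this; struct tcp_md5sig comes
 * from <linux/tcp.h>, and the header combination may need adjusting on
 * some libcs.
 */
#include <linux/tcp.h>
#include <netinet/in.h>
#include <string.h>
#include <sys/socket.h>

static int install_md5_key(int fd, const struct sockaddr_in *peer,
			   const void *key, int keylen)
{
	struct tcp_md5sig md5;

	if (keylen > TCP_MD5SIG_MAXKEYLEN)
		return -1;
	memset(&md5, 0, sizeof(md5));
	memcpy(&md5.tcpm_addr, peer, sizeof(*peer));
	md5.tcpm_keylen = keylen;
	memcpy(md5.tcpm_key, key, keylen);

	/* a zero keylen would instead delete a previously installed key */
	return setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
}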
1102 
1103 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1104 				   __be32 daddr, __be32 saddr,
1105 				   const struct tcphdr *th, int nbytes)
1106 {
1107 	struct tcp4_pseudohdr *bp;
1108 	struct scatterlist sg;
1109 	struct tcphdr *_th;
1110 
1111 	bp = hp->scratch;
1112 	bp->saddr = saddr;
1113 	bp->daddr = daddr;
1114 	bp->pad = 0;
1115 	bp->protocol = IPPROTO_TCP;
1116 	bp->len = cpu_to_be16(nbytes);
1117 
1118 	_th = (struct tcphdr *)(bp + 1);
1119 	memcpy(_th, th, sizeof(*th));
1120 	_th->check = 0;
1121 
1122 	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1123 	ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1124 				sizeof(*bp) + sizeof(*th));
1125 	return crypto_ahash_update(hp->md5_req);
1126 }
1127 
1128 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1129 			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
1130 {
1131 	struct tcp_md5sig_pool *hp;
1132 	struct ahash_request *req;
1133 
1134 	hp = tcp_get_md5sig_pool();
1135 	if (!hp)
1136 		goto clear_hash_noput;
1137 	req = hp->md5_req;
1138 
1139 	if (crypto_ahash_init(req))
1140 		goto clear_hash;
1141 	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1142 		goto clear_hash;
1143 	if (tcp_md5_hash_key(hp, key))
1144 		goto clear_hash;
1145 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1146 	if (crypto_ahash_final(req))
1147 		goto clear_hash;
1148 
1149 	tcp_put_md5sig_pool();
1150 	return 0;
1151 
1152 clear_hash:
1153 	tcp_put_md5sig_pool();
1154 clear_hash_noput:
1155 	memset(md5_hash, 0, 16);
1156 	return 1;
1157 }
1158 
1159 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1160 			const struct sock *sk,
1161 			const struct sk_buff *skb)
1162 {
1163 	struct tcp_md5sig_pool *hp;
1164 	struct ahash_request *req;
1165 	const struct tcphdr *th = tcp_hdr(skb);
1166 	__be32 saddr, daddr;
1167 
1168 	if (sk) { /* valid for establish/request sockets */
1169 		saddr = sk->sk_rcv_saddr;
1170 		daddr = sk->sk_daddr;
1171 	} else {
1172 		const struct iphdr *iph = ip_hdr(skb);
1173 		saddr = iph->saddr;
1174 		daddr = iph->daddr;
1175 	}
1176 
1177 	hp = tcp_get_md5sig_pool();
1178 	if (!hp)
1179 		goto clear_hash_noput;
1180 	req = hp->md5_req;
1181 
1182 	if (crypto_ahash_init(req))
1183 		goto clear_hash;
1184 
1185 	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1186 		goto clear_hash;
1187 	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1188 		goto clear_hash;
1189 	if (tcp_md5_hash_key(hp, key))
1190 		goto clear_hash;
1191 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1192 	if (crypto_ahash_final(req))
1193 		goto clear_hash;
1194 
1195 	tcp_put_md5sig_pool();
1196 	return 0;
1197 
1198 clear_hash:
1199 	tcp_put_md5sig_pool();
1200 clear_hash_noput:
1201 	memset(md5_hash, 0, 16);
1202 	return 1;
1203 }
1204 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1205 
1206 #endif
1207 
1208 /* Called with rcu_read_lock() */
1209 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1210 				    const struct sk_buff *skb)
1211 {
1212 #ifdef CONFIG_TCP_MD5SIG
1213 	/*
1214 	 * This gets called for each TCP segment that arrives,
1215 	 * so we want to be efficient.
1216 	 * We have 3 drop cases:
1217 	 * o No MD5 hash and one expected.
1218 	 * o MD5 hash and we're not expecting one.
1219 	 * o MD5 hash and it's wrong.
1220 	 */
1221 	const __u8 *hash_location = NULL;
1222 	struct tcp_md5sig_key *hash_expected;
1223 	const struct iphdr *iph = ip_hdr(skb);
1224 	const struct tcphdr *th = tcp_hdr(skb);
1225 	int genhash;
1226 	unsigned char newhash[16];
1227 
1228 	hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1229 					  AF_INET);
1230 	hash_location = tcp_parse_md5sig_option(th);
1231 
1232 	/* We've parsed the options - do we have a hash? */
1233 	if (!hash_expected && !hash_location)
1234 		return false;
1235 
1236 	if (hash_expected && !hash_location) {
1237 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1238 		return true;
1239 	}
1240 
1241 	if (!hash_expected && hash_location) {
1242 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1243 		return true;
1244 	}
1245 
1246 	/* Okay, so this is hash_expected and hash_location -
1247 	 * so we need to calculate the checksum.
1248 	 */
1249 	genhash = tcp_v4_md5_hash_skb(newhash,
1250 				      hash_expected,
1251 				      NULL, skb);
1252 
1253 	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1254 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1255 		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1256 				     &iph->saddr, ntohs(th->source),
1257 				     &iph->daddr, ntohs(th->dest),
1258 				     genhash ? " tcp_v4_calc_md5_hash failed"
1259 				     : "");
1260 		return true;
1261 	}
1262 	return false;
1263 #endif
1264 	return false;
1265 }
1266 
1267 static void tcp_v4_init_req(struct request_sock *req,
1268 			    const struct sock *sk_listener,
1269 			    struct sk_buff *skb)
1270 {
1271 	struct inet_request_sock *ireq = inet_rsk(req);
1272 	struct net *net = sock_net(sk_listener);
1273 
1274 	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1275 	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1276 	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1277 }
1278 
1279 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1280 					  struct flowi *fl,
1281 					  const struct request_sock *req)
1282 {
1283 	return inet_csk_route_req(sk, &fl->u.ip4, req);
1284 }
1285 
1286 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1287 	.family		=	PF_INET,
1288 	.obj_size	=	sizeof(struct tcp_request_sock),
1289 	.rtx_syn_ack	=	tcp_rtx_synack,
1290 	.send_ack	=	tcp_v4_reqsk_send_ack,
1291 	.destructor	=	tcp_v4_reqsk_destructor,
1292 	.send_reset	=	tcp_v4_send_reset,
1293 	.syn_ack_timeout =	tcp_syn_ack_timeout,
1294 };
1295 
1296 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1297 	.mss_clamp	=	TCP_MSS_DEFAULT,
1298 #ifdef CONFIG_TCP_MD5SIG
1299 	.req_md5_lookup	=	tcp_v4_md5_lookup,
1300 	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1301 #endif
1302 	.init_req	=	tcp_v4_init_req,
1303 #ifdef CONFIG_SYN_COOKIES
1304 	.cookie_init_seq =	cookie_v4_init_sequence,
1305 #endif
1306 	.route_req	=	tcp_v4_route_req,
1307 	.init_seq	=	tcp_v4_init_seq,
1308 	.init_ts_off	=	tcp_v4_init_ts_off,
1309 	.send_synack	=	tcp_v4_send_synack,
1310 };
1311 
1312 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1313 {
1314 	/* Never answer SYNs sent to broadcast or multicast */
1315 	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1316 		goto drop;
1317 
1318 	return tcp_conn_request(&tcp_request_sock_ops,
1319 				&tcp_request_sock_ipv4_ops, sk, skb);
1320 
1321 drop:
1322 	tcp_listendrop(sk);
1323 	return 0;
1324 }
1325 EXPORT_SYMBOL(tcp_v4_conn_request);
1326 
1327 
1328 /*
1329  * The three way handshake has completed - we got a valid synack -
1330  * now create the new socket.
1331  */
1332 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1333 				  struct request_sock *req,
1334 				  struct dst_entry *dst,
1335 				  struct request_sock *req_unhash,
1336 				  bool *own_req)
1337 {
1338 	struct inet_request_sock *ireq;
1339 	struct inet_sock *newinet;
1340 	struct tcp_sock *newtp;
1341 	struct sock *newsk;
1342 #ifdef CONFIG_TCP_MD5SIG
1343 	struct tcp_md5sig_key *key;
1344 #endif
1345 	struct ip_options_rcu *inet_opt;
1346 
1347 	if (sk_acceptq_is_full(sk))
1348 		goto exit_overflow;
1349 
1350 	newsk = tcp_create_openreq_child(sk, req, skb);
1351 	if (!newsk)
1352 		goto exit_nonewsk;
1353 
1354 	newsk->sk_gso_type = SKB_GSO_TCPV4;
1355 	inet_sk_rx_dst_set(newsk, skb);
1356 
1357 	newtp		      = tcp_sk(newsk);
1358 	newinet		      = inet_sk(newsk);
1359 	ireq		      = inet_rsk(req);
1360 	sk_daddr_set(newsk, ireq->ir_rmt_addr);
1361 	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1362 	newsk->sk_bound_dev_if = ireq->ir_iif;
1363 	newinet->inet_saddr   = ireq->ir_loc_addr;
1364 	inet_opt	      = rcu_dereference(ireq->ireq_opt);
1365 	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1366 	newinet->mc_index     = inet_iif(skb);
1367 	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1368 	newinet->rcv_tos      = ip_hdr(skb)->tos;
1369 	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1370 	if (inet_opt)
1371 		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1372 	newinet->inet_id = newtp->write_seq ^ jiffies;
1373 
1374 	if (!dst) {
1375 		dst = inet_csk_route_child_sock(sk, newsk, req);
1376 		if (!dst)
1377 			goto put_and_exit;
1378 	} else {
1379 		/* syncookie case : see end of cookie_v4_check() */
1380 	}
1381 	sk_setup_caps(newsk, dst);
1382 
1383 	tcp_ca_openreq_child(newsk, dst);
1384 
1385 	tcp_sync_mss(newsk, dst_mtu(dst));
1386 	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1387 
1388 	tcp_initialize_rcv_mss(newsk);
1389 
1390 #ifdef CONFIG_TCP_MD5SIG
1391 	/* Copy over the MD5 key from the original socket */
1392 	key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1393 				AF_INET);
1394 	if (key) {
1395 		/*
1396 		 * We're using one, so create a matching key
1397 		 * on the newsk structure. If we fail to get
1398 		 * memory, then we end up not copying the key
1399 		 * across. Shucks.
1400 		 */
1401 		tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1402 			       AF_INET, 32, key->key, key->keylen, GFP_ATOMIC);
1403 		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1404 	}
1405 #endif
1406 
1407 	if (__inet_inherit_port(sk, newsk) < 0)
1408 		goto put_and_exit;
1409 	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
1410 	if (likely(*own_req)) {
1411 		tcp_move_syn(newtp, req);
1412 		ireq->ireq_opt = NULL;
1413 	} else {
1414 		newinet->inet_opt = NULL;
1415 	}
1416 	return newsk;
1417 
1418 exit_overflow:
1419 	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1420 exit_nonewsk:
1421 	dst_release(dst);
1422 exit:
1423 	tcp_listendrop(sk);
1424 	return NULL;
1425 put_and_exit:
1426 	newinet->inet_opt = NULL;
1427 	inet_csk_prepare_forced_close(newsk);
1428 	tcp_done(newsk);
1429 	goto exit;
1430 }
1431 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1432 
1433 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1434 {
1435 #ifdef CONFIG_SYN_COOKIES
1436 	const struct tcphdr *th = tcp_hdr(skb);
1437 
1438 	if (!th->syn)
1439 		sk = cookie_v4_check(sk, skb);
1440 #endif
1441 	return sk;
1442 }
1443 
1444 /* The socket must have its spinlock held when we get
1445  * here, unless it is a TCP_LISTEN socket.
1446  *
1447  * We have a potential double-lock case here, so even when
1448  * doing backlog processing we use the BH locking scheme.
1449  * This is because we cannot sleep with the original spinlock
1450  * held.
1451  */
1452 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1453 {
1454 	struct sock *rsk;
1455 
1456 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1457 		struct dst_entry *dst = sk->sk_rx_dst;
1458 
1459 		sock_rps_save_rxhash(sk, skb);
1460 		sk_mark_napi_id(sk, skb);
1461 		if (dst) {
1462 			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1463 			    !dst->ops->check(dst, 0)) {
1464 				dst_release(dst);
1465 				sk->sk_rx_dst = NULL;
1466 			}
1467 		}
1468 		tcp_rcv_established(sk, skb, tcp_hdr(skb));
1469 		return 0;
1470 	}
1471 
1472 	if (tcp_checksum_complete(skb))
1473 		goto csum_err;
1474 
1475 	if (sk->sk_state == TCP_LISTEN) {
1476 		struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1477 
1478 		if (!nsk)
1479 			goto discard;
1480 		if (nsk != sk) {
1481 			if (tcp_child_process(sk, nsk, skb)) {
1482 				rsk = nsk;
1483 				goto reset;
1484 			}
1485 			return 0;
1486 		}
1487 	} else
1488 		sock_rps_save_rxhash(sk, skb);
1489 
1490 	if (tcp_rcv_state_process(sk, skb)) {
1491 		rsk = sk;
1492 		goto reset;
1493 	}
1494 	return 0;
1495 
1496 reset:
1497 	tcp_v4_send_reset(rsk, skb);
1498 discard:
1499 	kfree_skb(skb);
1500 	/* Be careful here. If this function gets more complicated and
1501 	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1502 	 * might be destroyed here. This current version compiles correctly,
1503 	 * but you have been warned.
1504 	 */
1505 	return 0;
1506 
1507 csum_err:
1508 	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1509 	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1510 	goto discard;
1511 }
1512 EXPORT_SYMBOL(tcp_v4_do_rcv);
1513 
1514 int tcp_v4_early_demux(struct sk_buff *skb)
1515 {
1516 	const struct iphdr *iph;
1517 	const struct tcphdr *th;
1518 	struct sock *sk;
1519 
1520 	if (skb->pkt_type != PACKET_HOST)
1521 		return 0;
1522 
1523 	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1524 		return 0;
1525 
1526 	iph = ip_hdr(skb);
1527 	th = tcp_hdr(skb);
1528 
1529 	if (th->doff < sizeof(struct tcphdr) / 4)
1530 		return 0;
1531 
1532 	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1533 				       iph->saddr, th->source,
1534 				       iph->daddr, ntohs(th->dest),
1535 				       skb->skb_iif, inet_sdif(skb));
1536 	if (sk) {
1537 		skb->sk = sk;
1538 		skb->destructor = sock_edemux;
1539 		if (sk_fullsock(sk)) {
1540 			struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1541 
1542 			if (dst)
1543 				dst = dst_check(dst, 0);
1544 			if (dst &&
1545 			    inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1546 				skb_dst_set_noref(skb, dst);
1547 		}
1548 	}
1549 	return 0;
1550 }
1551 
1552 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1553 {
1554 	u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;
1555 
1556 	/* Only the socket owner can try to collapse/prune rx queues
1557 	 * to reduce memory overhead, so add a little headroom here.
1558 	 * Only a few socket backlogs are likely to be non-empty at once.
1559 	 */
1560 	limit += 64*1024;
1561 
1562 	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1563 	 * we can fix skb->truesize to its real value to avoid future drops.
1564 	 * This is valid because skb is not yet charged to the socket.
1565 	 * It has been noticed that pure SACK packets were sometimes dropped
1566 	 * (if cooked by drivers without copybreak feature).
1567 	 */
1568 	skb_condense(skb);
1569 
1570 	if (unlikely(sk_add_backlog(sk, skb, limit))) {
1571 		bh_unlock_sock(sk);
1572 		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1573 		return true;
1574 	}
1575 	return false;
1576 }
1577 EXPORT_SYMBOL(tcp_add_backlog);
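/*
 * Illustration (user-space sketch, not kernel code): the backlog limit above
 * is the socket's receive plus send buffer size with 64KB of headroom.  The
 * two figures feeding it are visible from user space:
 */
#include <stdio.h>
#include <sys/socket.h>

static void print_backlog_budget(int fd)
{
	int rcv = 0, snd = 0;
	socklen_t len = sizeof(rcv);

	getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &rcv, &len);
	len = sizeof(snd);
	getsockopt(fd, SOL_SOCKET, SO_SNDBUF, &snd, &len);

	/* mirrors: limit = sk_rcvbuf + sk_sndbuf + 64*1024 */
	printf("approx backlog limit: %d bytes\n", rcv + snd + 64 * 1024);
}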
1578 
1579 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1580 {
1581 	struct tcphdr *th = (struct tcphdr *)skb->data;
1582 	unsigned int eaten = skb->len;
1583 	int err;
1584 
1585 	err = sk_filter_trim_cap(sk, skb, th->doff * 4);
1586 	if (!err) {
1587 		eaten -= skb->len;
1588 		TCP_SKB_CB(skb)->end_seq -= eaten;
1589 	}
1590 	return err;
1591 }
1592 EXPORT_SYMBOL(tcp_filter);
1593 
1594 static void tcp_v4_restore_cb(struct sk_buff *skb)
1595 {
1596 	memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1597 		sizeof(struct inet_skb_parm));
1598 }
1599 
1600 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1601 			   const struct tcphdr *th)
1602 {
1603 	/* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
1604 	 * barrier() makes sure the compiler won't play fool^Waliasing games.
1605 	 */
1606 	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1607 		sizeof(struct inet_skb_parm));
1608 	barrier();
1609 
1610 	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1611 	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1612 				    skb->len - th->doff * 4);
1613 	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1614 	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1615 	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1616 	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1617 	TCP_SKB_CB(skb)->sacked	 = 0;
1618 	TCP_SKB_CB(skb)->has_rxtstamp =
1619 			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1620 }
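/*
 * Illustration (stand-alone sketch, not kernel code): the end_seq computed
 * above counts SYN and FIN as one unit of sequence space each, on top of
 * the payload bytes.  Restated on its own:
 */
#include <stdint.h>

static uint32_t tcp_end_seq_model(uint32_t seq, int syn, int fin,
				  uint32_t payload_len)
{
	/* e.g. a bare SYN with seq 1000 covers [1000, 1001) */
	return seq + (syn ? 1 : 0) + (fin ? 1 : 0) + payload_len;
}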
1621 
1622 /*
1623  *	From tcp_input.c
1624  */
1625 
1626 int tcp_v4_rcv(struct sk_buff *skb)
1627 {
1628 	struct net *net = dev_net(skb->dev);
1629 	int sdif = inet_sdif(skb);
1630 	const struct iphdr *iph;
1631 	const struct tcphdr *th;
1632 	bool refcounted;
1633 	struct sock *sk;
1634 	int ret;
1635 
1636 	if (skb->pkt_type != PACKET_HOST)
1637 		goto discard_it;
1638 
1639 	/* Count it even if it's bad */
1640 	__TCP_INC_STATS(net, TCP_MIB_INSEGS);
1641 
1642 	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1643 		goto discard_it;
1644 
1645 	th = (const struct tcphdr *)skb->data;
1646 
1647 	if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1648 		goto bad_packet;
1649 	if (!pskb_may_pull(skb, th->doff * 4))
1650 		goto discard_it;
1651 
1652 	/* An explanation is required here, I think.
1653 	 * Packet length and doff are validated by header prediction,
1654 	 * provided the case of th->doff==0 is eliminated.
1655 	 * So, we defer the checks. */
1656 
1657 	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1658 		goto csum_error;
1659 
1660 	th = (const struct tcphdr *)skb->data;
1661 	iph = ip_hdr(skb);
1662 lookup:
1663 	sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1664 			       th->dest, sdif, &refcounted);
1665 	if (!sk)
1666 		goto no_tcp_socket;
1667 
1668 process:
1669 	if (sk->sk_state == TCP_TIME_WAIT)
1670 		goto do_time_wait;
1671 
1672 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
1673 		struct request_sock *req = inet_reqsk(sk);
1674 		struct sock *nsk;
1675 
1676 		sk = req->rsk_listener;
1677 		if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
1678 			sk_drops_add(sk, skb);
1679 			reqsk_put(req);
1680 			goto discard_it;
1681 		}
1682 		if (unlikely(sk->sk_state != TCP_LISTEN)) {
1683 			inet_csk_reqsk_queue_drop_and_put(sk, req);
1684 			goto lookup;
1685 		}
1686 		/* We own a reference on the listener, increase it again
1687 		 * as we might lose it too soon.
1688 		 */
1689 		sock_hold(sk);
1690 		refcounted = true;
1691 		nsk = NULL;
1692 		if (!tcp_filter(sk, skb)) {
1693 			th = (const struct tcphdr *)skb->data;
1694 			iph = ip_hdr(skb);
1695 			tcp_v4_fill_cb(skb, iph, th);
1696 			nsk = tcp_check_req(sk, skb, req, false);
1697 		}
1698 		if (!nsk) {
1699 			reqsk_put(req);
1700 			goto discard_and_relse;
1701 		}
1702 		if (nsk == sk) {
1703 			reqsk_put(req);
1704 			tcp_v4_restore_cb(skb);
1705 		} else if (tcp_child_process(sk, nsk, skb)) {
1706 			tcp_v4_send_reset(nsk, skb);
1707 			goto discard_and_relse;
1708 		} else {
1709 			sock_put(sk);
1710 			return 0;
1711 		}
1712 	}
1713 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1714 		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
1715 		goto discard_and_relse;
1716 	}
1717 
1718 	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1719 		goto discard_and_relse;
1720 
1721 	if (tcp_v4_inbound_md5_hash(sk, skb))
1722 		goto discard_and_relse;
1723 
1724 	nf_reset(skb);
1725 
1726 	if (tcp_filter(sk, skb))
1727 		goto discard_and_relse;
1728 	th = (const struct tcphdr *)skb->data;
1729 	iph = ip_hdr(skb);
1730 	tcp_v4_fill_cb(skb, iph, th);
1731 
1732 	skb->dev = NULL;
1733 
1734 	if (sk->sk_state == TCP_LISTEN) {
1735 		ret = tcp_v4_do_rcv(sk, skb);
1736 		goto put_and_return;
1737 	}
1738 
1739 	sk_incoming_cpu_update(sk);
1740 
1741 	bh_lock_sock_nested(sk);
1742 	tcp_segs_in(tcp_sk(sk), skb);
1743 	ret = 0;
1744 	if (!sock_owned_by_user(sk)) {
1745 		ret = tcp_v4_do_rcv(sk, skb);
1746 	} else if (tcp_add_backlog(sk, skb)) {
1747 		goto discard_and_relse;
1748 	}
1749 	bh_unlock_sock(sk);
1750 
1751 put_and_return:
1752 	if (refcounted)
1753 		sock_put(sk);
1754 
1755 	return ret;
1756 
1757 no_tcp_socket:
1758 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1759 		goto discard_it;
1760 
1761 	tcp_v4_fill_cb(skb, iph, th);
1762 
1763 	if (tcp_checksum_complete(skb)) {
1764 csum_error:
1765 		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
1766 bad_packet:
1767 		__TCP_INC_STATS(net, TCP_MIB_INERRS);
1768 	} else {
1769 		tcp_v4_send_reset(NULL, skb);
1770 	}
1771 
1772 discard_it:
1773 	/* Discard frame. */
1774 	kfree_skb(skb);
1775 	return 0;
1776 
1777 discard_and_relse:
1778 	sk_drops_add(sk, skb);
1779 	if (refcounted)
1780 		sock_put(sk);
1781 	goto discard_it;
1782 
1783 do_time_wait:
1784 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1785 		inet_twsk_put(inet_twsk(sk));
1786 		goto discard_it;
1787 	}
1788 
1789 	tcp_v4_fill_cb(skb, iph, th);
1790 
1791 	if (tcp_checksum_complete(skb)) {
1792 		inet_twsk_put(inet_twsk(sk));
1793 		goto csum_error;
1794 	}
1795 	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1796 	case TCP_TW_SYN: {
1797 		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1798 							&tcp_hashinfo, skb,
1799 							__tcp_hdrlen(th),
1800 							iph->saddr, th->source,
1801 							iph->daddr, th->dest,
1802 							inet_iif(skb),
1803 							sdif);
1804 		if (sk2) {
1805 			inet_twsk_deschedule_put(inet_twsk(sk));
1806 			sk = sk2;
1807 			tcp_v4_restore_cb(skb);
1808 			refcounted = false;
1809 			goto process;
1810 		}
1811 	}
1812 		/* to ACK */
1813 		/* fall through */
1814 	case TCP_TW_ACK:
1815 		tcp_v4_timewait_ack(sk, skb);
1816 		break;
1817 	case TCP_TW_RST:
1818 		tcp_v4_send_reset(sk, skb);
1819 		inet_twsk_deschedule_put(inet_twsk(sk));
1820 		goto discard_it;
1821 	case TCP_TW_SUCCESS:;
1822 	}
1823 	goto discard_it;
1824 }
1825 
1826 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1827 	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
1828 	.twsk_unique	= tcp_twsk_unique,
1829 	.twsk_destructor= tcp_twsk_destructor,
1830 };
1831 
1832 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
1833 {
1834 	struct dst_entry *dst = skb_dst(skb);
1835 
1836 	if (dst && dst_hold_safe(dst)) {
1837 		sk->sk_rx_dst = dst;
1838 		inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
1839 	}
1840 }
1841 EXPORT_SYMBOL(inet_sk_rx_dst_set);
1842 
1843 const struct inet_connection_sock_af_ops ipv4_specific = {
1844 	.queue_xmit	   = ip_queue_xmit,
1845 	.send_check	   = tcp_v4_send_check,
1846 	.rebuild_header	   = inet_sk_rebuild_header,
1847 	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
1848 	.conn_request	   = tcp_v4_conn_request,
1849 	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
1850 	.net_header_len	   = sizeof(struct iphdr),
1851 	.setsockopt	   = ip_setsockopt,
1852 	.getsockopt	   = ip_getsockopt,
1853 	.addr2sockaddr	   = inet_csk_addr2sockaddr,
1854 	.sockaddr_len	   = sizeof(struct sockaddr_in),
1855 #ifdef CONFIG_COMPAT
1856 	.compat_setsockopt = compat_ip_setsockopt,
1857 	.compat_getsockopt = compat_ip_getsockopt,
1858 #endif
1859 	.mtu_reduced	   = tcp_v4_mtu_reduced,
1860 };
1861 EXPORT_SYMBOL(ipv4_specific);
1862 
1863 #ifdef CONFIG_TCP_MD5SIG
1864 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1865 	.md5_lookup		= tcp_v4_md5_lookup,
1866 	.calc_md5_hash		= tcp_v4_md5_hash_skb,
1867 	.md5_parse		= tcp_v4_parse_md5_keys,
1868 };
1869 #endif
1870 
1871 /* NOTE: A lot of things are set to zero explicitly by the call to
1872  *       sk_alloc(), so they need not be done here.
1873  */
1874 static int tcp_v4_init_sock(struct sock *sk)
1875 {
1876 	struct inet_connection_sock *icsk = inet_csk(sk);
1877 
1878 	tcp_init_sock(sk);
1879 
1880 	icsk->icsk_af_ops = &ipv4_specific;
1881 
1882 #ifdef CONFIG_TCP_MD5SIG
1883 	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
1884 #endif
1885 
1886 	return 0;
1887 }
1888 
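/*
 * Final per-socket teardown: stop the transmit timers, release
 * congestion-control and ULP state, purge the write and out-of-order
 * queues, free any MD5 keys, and drop the bind-bucket reference.
 */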
1889 void tcp_v4_destroy_sock(struct sock *sk)
1890 {
1891 	struct tcp_sock *tp = tcp_sk(sk);
1892 
1893 	trace_tcp_destroy_sock(sk);
1894 
1895 	tcp_clear_xmit_timers(sk);
1896 
1897 	tcp_cleanup_congestion_control(sk);
1898 
1899 	tcp_cleanup_ulp(sk);
1900 
1901 	/* Clean up the write buffer. */
1902 	tcp_write_queue_purge(sk);
1903 
1904 	/* Check if we want to disable active TFO */
1905 	tcp_fastopen_active_disable_ofo_check(sk);
1906 
1907 	/* Cleans up our, hopefully empty, out_of_order_queue. */
1908 	skb_rbtree_purge(&tp->out_of_order_queue);
1909 
1910 #ifdef CONFIG_TCP_MD5SIG
1911 	/* Clean up the MD5 key list, if any */
1912 	if (tp->md5sig_info) {
1913 		tcp_clear_md5_list(sk);
1914 		kfree_rcu(tp->md5sig_info, rcu);
1915 		tp->md5sig_info = NULL;
1916 	}
1917 #endif
1918 
1919 	/* Clean up a referenced TCP bind bucket. */
1920 	if (inet_csk(sk)->icsk_bind_hash)
1921 		inet_put_port(sk);
1922 
1923 	BUG_ON(tp->fastopen_rsk);
1924 
1925 	/* If the socket is aborted during a connect operation */
1926 	tcp_free_fastopen_req(tp);
1927 	tcp_fastopen_destroy_cipher(sk);
1928 	tcp_saved_syn_free(tp);
1929 
1930 	sk_sockets_allocated_dec(sk);
1931 }
1932 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1933 
1934 #ifdef CONFIG_PROC_FS
1935 /* Proc filesystem TCP sock list dumping. */
1936 
1937 /*
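/*
 * The /proc iterator walks the listening hash first and then the
 * established hash.  st->bucket and st->offset record the current
 * position so that a later read() can resume close to where the
 * previous one stopped (see tcp_seek_last_pos()).
 */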
1938  * Get next listener socket follow cur.  If cur is NULL, get first socket
1939  * Get the next listener socket following cur.  If cur is NULL, get the first
1940  * socket starting from the bucket given in st->bucket; when st->bucket is zero the
1941  */
1942 static void *listening_get_next(struct seq_file *seq, void *cur)
1943 {
1944 	struct tcp_iter_state *st = seq->private;
1945 	struct net *net = seq_file_net(seq);
1946 	struct inet_listen_hashbucket *ilb;
1947 	struct sock *sk = cur;
1948 
1949 	if (!sk) {
1950 get_head:
1951 		ilb = &tcp_hashinfo.listening_hash[st->bucket];
1952 		spin_lock(&ilb->lock);
1953 		sk = sk_head(&ilb->head);
1954 		st->offset = 0;
1955 		goto get_sk;
1956 	}
1957 	ilb = &tcp_hashinfo.listening_hash[st->bucket];
1958 	++st->num;
1959 	++st->offset;
1960 
1961 	sk = sk_next(sk);
1962 get_sk:
1963 	sk_for_each_from(sk) {
1964 		if (!net_eq(sock_net(sk), net))
1965 			continue;
1966 		if (sk->sk_family == st->family)
1967 			return sk;
1968 	}
1969 	spin_unlock(&ilb->lock);
1970 	st->offset = 0;
1971 	if (++st->bucket < INET_LHTABLE_SIZE)
1972 		goto get_head;
1973 	return NULL;
1974 }
1975 
1976 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1977 {
1978 	struct tcp_iter_state *st = seq->private;
1979 	void *rc;
1980 
1981 	st->bucket = 0;
1982 	st->offset = 0;
1983 	rc = listening_get_next(seq, NULL);
1984 
1985 	while (rc && *pos) {
1986 		rc = listening_get_next(seq, rc);
1987 		--*pos;
1988 	}
1989 	return rc;
1990 }
1991 
1992 static inline bool empty_bucket(const struct tcp_iter_state *st)
1993 {
1994 	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
1995 }
1996 
1997 /*
1998  * Get the first established socket starting from the bucket given in st->bucket.
1999  * If st->bucket is zero, the very first socket in the hash is returned.
2000  */
2001 static void *established_get_first(struct seq_file *seq)
2002 {
2003 	struct tcp_iter_state *st = seq->private;
2004 	struct net *net = seq_file_net(seq);
2005 	void *rc = NULL;
2006 
2007 	st->offset = 0;
2008 	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2009 		struct sock *sk;
2010 		struct hlist_nulls_node *node;
2011 		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2012 
2013 		/* Lockless fast path for the common case of empty buckets */
2014 		if (empty_bucket(st))
2015 			continue;
2016 
2017 		spin_lock_bh(lock);
2018 		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2019 			if (sk->sk_family != st->family ||
2020 			    !net_eq(sock_net(sk), net)) {
2021 				continue;
2022 			}
2023 			rc = sk;
2024 			goto out;
2025 		}
2026 		spin_unlock_bh(lock);
2027 	}
2028 out:
2029 	return rc;
2030 }
2031 
2032 static void *established_get_next(struct seq_file *seq, void *cur)
2033 {
2034 	struct sock *sk = cur;
2035 	struct hlist_nulls_node *node;
2036 	struct tcp_iter_state *st = seq->private;
2037 	struct net *net = seq_file_net(seq);
2038 
2039 	++st->num;
2040 	++st->offset;
2041 
2042 	sk = sk_nulls_next(sk);
2043 
2044 	sk_nulls_for_each_from(sk, node) {
2045 		if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2046 			return sk;
2047 	}
2048 
2049 	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2050 	++st->bucket;
2051 	return established_get_first(seq);
2052 }
2053 
2054 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2055 {
2056 	struct tcp_iter_state *st = seq->private;
2057 	void *rc;
2058 
2059 	st->bucket = 0;
2060 	rc = established_get_first(seq);
2061 
2062 	while (rc && pos) {
2063 		rc = established_get_next(seq, rc);
2064 		--pos;
2065 	}
2066 	return rc;
2067 }
2068 
2069 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2070 {
2071 	void *rc;
2072 	struct tcp_iter_state *st = seq->private;
2073 
2074 	st->state = TCP_SEQ_STATE_LISTENING;
2075 	rc	  = listening_get_idx(seq, &pos);
2076 
2077 	if (!rc) {
2078 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2079 		rc	  = established_get_idx(seq, pos);
2080 	}
2081 
2082 	return rc;
2083 }
2084 
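/*
 * Try to resume the walk at the bucket/offset saved by the previous
 * read instead of rescanning from the first bucket.  Returns NULL if
 * the saved position is no longer reachable, in which case the caller
 * falls back to a full walk from the requested *pos.
 */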
2085 static void *tcp_seek_last_pos(struct seq_file *seq)
2086 {
2087 	struct tcp_iter_state *st = seq->private;
2088 	int offset = st->offset;
2089 	int orig_num = st->num;
2090 	void *rc = NULL;
2091 
2092 	switch (st->state) {
2093 	case TCP_SEQ_STATE_LISTENING:
2094 		if (st->bucket >= INET_LHTABLE_SIZE)
2095 			break;
2096 		st->state = TCP_SEQ_STATE_LISTENING;
2097 		rc = listening_get_next(seq, NULL);
2098 		while (offset-- && rc)
2099 			rc = listening_get_next(seq, rc);
2100 		if (rc)
2101 			break;
2102 		st->bucket = 0;
2103 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2104 		/* Fallthrough */
2105 	case TCP_SEQ_STATE_ESTABLISHED:
2106 		if (st->bucket > tcp_hashinfo.ehash_mask)
2107 			break;
2108 		rc = established_get_first(seq);
2109 		while (offset-- && rc)
2110 			rc = established_get_next(seq, rc);
2111 	}
2112 
2113 	st->num = orig_num;
2114 
2115 	return rc;
2116 }
2117 
2118 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2119 {
2120 	struct tcp_iter_state *st = seq->private;
2121 	void *rc;
2122 
2123 	if (*pos && *pos == st->last_pos) {
2124 		rc = tcp_seek_last_pos(seq);
2125 		if (rc)
2126 			goto out;
2127 	}
2128 
2129 	st->state = TCP_SEQ_STATE_LISTENING;
2130 	st->num = 0;
2131 	st->bucket = 0;
2132 	st->offset = 0;
2133 	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2134 
2135 out:
2136 	st->last_pos = *pos;
2137 	return rc;
2138 }
2139 
2140 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2141 {
2142 	struct tcp_iter_state *st = seq->private;
2143 	void *rc = NULL;
2144 
2145 	if (v == SEQ_START_TOKEN) {
2146 		rc = tcp_get_idx(seq, 0);
2147 		goto out;
2148 	}
2149 
2150 	switch (st->state) {
2151 	case TCP_SEQ_STATE_LISTENING:
2152 		rc = listening_get_next(seq, v);
2153 		if (!rc) {
2154 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2155 			st->bucket = 0;
2156 			st->offset = 0;
2157 			rc	  = established_get_first(seq);
2158 		}
2159 		break;
2160 	case TCP_SEQ_STATE_ESTABLISHED:
2161 		rc = established_get_next(seq, v);
2162 		break;
2163 	}
2164 out:
2165 	++*pos;
2166 	st->last_pos = *pos;
2167 	return rc;
2168 }
2169 
2170 static void tcp_seq_stop(struct seq_file *seq, void *v)
2171 {
2172 	struct tcp_iter_state *st = seq->private;
2173 
2174 	switch (st->state) {
2175 	case TCP_SEQ_STATE_LISTENING:
2176 		if (v != SEQ_START_TOKEN)
2177 			spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
2178 		break;
2179 	case TCP_SEQ_STATE_ESTABLISHED:
2180 		if (v)
2181 			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2182 		break;
2183 	}
2184 }
2185 
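/*
 * Allocate the per-open iterator state (struct tcp_iter_state) and
 * record which address family this /proc file should dump.
 */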
2186 int tcp_seq_open(struct inode *inode, struct file *file)
2187 {
2188 	struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
2189 	struct tcp_iter_state *s;
2190 	int err;
2191 
2192 	err = seq_open_net(inode, file, &afinfo->seq_ops,
2193 			  sizeof(struct tcp_iter_state));
2194 	if (err < 0)
2195 		return err;
2196 
2197 	s = ((struct seq_file *)file->private_data)->private;
2198 	s->family		= afinfo->family;
2199 	s->last_pos		= 0;
2200 	return 0;
2201 }
2202 EXPORT_SYMBOL(tcp_seq_open);
2203 
2204 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2205 {
2206 	int rc = 0;
2207 	struct proc_dir_entry *p;
2208 
2209 	afinfo->seq_ops.start		= tcp_seq_start;
2210 	afinfo->seq_ops.next		= tcp_seq_next;
2211 	afinfo->seq_ops.stop		= tcp_seq_stop;
2212 
2213 	p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2214 			     afinfo->seq_fops, afinfo);
2215 	if (!p)
2216 		rc = -ENOMEM;
2217 	return rc;
2218 }
2219 EXPORT_SYMBOL(tcp_proc_register);
2220 
2221 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2222 {
2223 	remove_proc_entry(afinfo->name, net->proc_net);
2224 }
2225 EXPORT_SYMBOL(tcp_proc_unregister);
2226 
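/*
 * Dump one SYN_RECV request socket in /proc/net/tcp line format.  The
 * timer fields report the remaining lifetime of the request's SYN-ACK
 * retransmit timer (rsk_timer), converted to clock ticks.
 */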
2227 static void get_openreq4(const struct request_sock *req,
2228 			 struct seq_file *f, int i)
2229 {
2230 	const struct inet_request_sock *ireq = inet_rsk(req);
2231 	long delta = req->rsk_timer.expires - jiffies;
2232 
2233 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2234 		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2235 		i,
2236 		ireq->ir_loc_addr,
2237 		ireq->ir_num,
2238 		ireq->ir_rmt_addr,
2239 		ntohs(ireq->ir_rmt_port),
2240 		TCP_SYN_RECV,
2241 		0, 0, /* could print option size, but that is af dependent. */
2242 		1,    /* timers active (only the expire timer) */
2243 		jiffies_delta_to_clock_t(delta),
2244 		req->num_timeout,
2245 		from_kuid_munged(seq_user_ns(f),
2246 				 sock_i_uid(req->rsk_listener)),
2247 		0,  /* non-standard timer */
2248 		0, /* open_requests have no inode */
2249 		0,
2250 		req);
2251 }
2252 
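/*
 * Dump one full socket.  The "tr" column encodes which timer is
 * pending: 1 retransmit/loss-probe, 2 keepalive, 3 TIME-WAIT (see
 * get_timewait4_sock()), 4 zero-window probe, 0 none.
 */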
2253 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2254 {
2255 	int timer_active;
2256 	unsigned long timer_expires;
2257 	const struct tcp_sock *tp = tcp_sk(sk);
2258 	const struct inet_connection_sock *icsk = inet_csk(sk);
2259 	const struct inet_sock *inet = inet_sk(sk);
2260 	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2261 	__be32 dest = inet->inet_daddr;
2262 	__be32 src = inet->inet_rcv_saddr;
2263 	__u16 destp = ntohs(inet->inet_dport);
2264 	__u16 srcp = ntohs(inet->inet_sport);
2265 	int rx_queue;
2266 	int state;
2267 
2268 	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2269 	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2270 	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2271 		timer_active	= 1;
2272 		timer_expires	= icsk->icsk_timeout;
2273 	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2274 		timer_active	= 4;
2275 		timer_expires	= icsk->icsk_timeout;
2276 	} else if (timer_pending(&sk->sk_timer)) {
2277 		timer_active	= 2;
2278 		timer_expires	= sk->sk_timer.expires;
2279 	} else {
2280 		timer_active	= 0;
2281 		timer_expires = jiffies;
2282 	}
2283 
2284 	state = sk_state_load(sk);
2285 	if (state == TCP_LISTEN)
2286 		rx_queue = sk->sk_ack_backlog;
2287 	else
2288 		/* Because we don't lock the socket,
2289 		 * we might find a transient negative value.
2290 		 */
2291 		rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2292 
2293 	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2294 			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2295 		i, src, srcp, dest, destp, state,
2296 		tp->write_seq - tp->snd_una,
2297 		rx_queue,
2298 		timer_active,
2299 		jiffies_delta_to_clock_t(timer_expires - jiffies),
2300 		icsk->icsk_retransmits,
2301 		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2302 		icsk->icsk_probes_out,
2303 		sock_i_ino(sk),
2304 		refcount_read(&sk->sk_refcnt), sk,
2305 		jiffies_to_clock_t(icsk->icsk_rto),
2306 		jiffies_to_clock_t(icsk->icsk_ack.ato),
2307 		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2308 		tp->snd_cwnd,
2309 		state == TCP_LISTEN ?
2310 		    fastopenq->max_qlen :
2311 		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2312 }
2313 
2314 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2315 			       struct seq_file *f, int i)
2316 {
2317 	long delta = tw->tw_timer.expires - jiffies;
2318 	__be32 dest, src;
2319 	__u16 destp, srcp;
2320 
2321 	dest  = tw->tw_daddr;
2322 	src   = tw->tw_rcv_saddr;
2323 	destp = ntohs(tw->tw_dport);
2324 	srcp  = ntohs(tw->tw_sport);
2325 
2326 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2327 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2328 		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2329 		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2330 		refcount_read(&tw->tw_refcnt), tw);
2331 }
2332 
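/* Fixed width of one /proc/net/tcp record; shorter lines are padded. */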
2333 #define TMPSZ 150
2334 
2335 static int tcp4_seq_show(struct seq_file *seq, void *v)
2336 {
2337 	struct tcp_iter_state *st;
2338 	struct sock *sk = v;
2339 
2340 	seq_setwidth(seq, TMPSZ - 1);
2341 	if (v == SEQ_START_TOKEN) {
2342 		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2343 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2344 			   "inode");
2345 		goto out;
2346 	}
2347 	st = seq->private;
2348 
2349 	if (sk->sk_state == TCP_TIME_WAIT)
2350 		get_timewait4_sock(v, seq, st->num);
2351 	else if (sk->sk_state == TCP_NEW_SYN_RECV)
2352 		get_openreq4(v, seq, st->num);
2353 	else
2354 		get_tcp4_sock(v, seq, st->num);
2355 out:
2356 	seq_pad(seq, '\n');
2357 	return 0;
2358 }
2359 
2360 static const struct file_operations tcp_afinfo_seq_fops = {
2361 	.owner   = THIS_MODULE,
2362 	.open    = tcp_seq_open,
2363 	.read    = seq_read,
2364 	.llseek  = seq_lseek,
2365 	.release = seq_release_net
2366 };
2367 
2368 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2369 	.name		= "tcp",
2370 	.family		= AF_INET,
2371 	.seq_fops	= &tcp_afinfo_seq_fops,
2372 	.seq_ops	= {
2373 		.show		= tcp4_seq_show,
2374 	},
2375 };
2376 
2377 static int __net_init tcp4_proc_init_net(struct net *net)
2378 {
2379 	return tcp_proc_register(net, &tcp4_seq_afinfo);
2380 }
2381 
2382 static void __net_exit tcp4_proc_exit_net(struct net *net)
2383 {
2384 	tcp_proc_unregister(net, &tcp4_seq_afinfo);
2385 }
2386 
2387 static struct pernet_operations tcp4_net_ops = {
2388 	.init = tcp4_proc_init_net,
2389 	.exit = tcp4_proc_exit_net,
2390 };
2391 
2392 int __init tcp4_proc_init(void)
2393 {
2394 	return register_pernet_subsys(&tcp4_net_ops);
2395 }
2396 
2397 void tcp4_proc_exit(void)
2398 {
2399 	unregister_pernet_subsys(&tcp4_net_ops);
2400 }
2401 #endif /* CONFIG_PROC_FS */
2402 
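/*
 * Protocol descriptor through which the IPv4 socket layer dispatches
 * into TCP: connection setup and teardown, sendmsg/recvmsg, port and
 * hash management, and the memory-pressure accounting knobs.
 */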
2403 struct proto tcp_prot = {
2404 	.name			= "TCP",
2405 	.owner			= THIS_MODULE,
2406 	.close			= tcp_close,
2407 	.connect		= tcp_v4_connect,
2408 	.disconnect		= tcp_disconnect,
2409 	.accept			= inet_csk_accept,
2410 	.ioctl			= tcp_ioctl,
2411 	.init			= tcp_v4_init_sock,
2412 	.destroy		= tcp_v4_destroy_sock,
2413 	.shutdown		= tcp_shutdown,
2414 	.setsockopt		= tcp_setsockopt,
2415 	.getsockopt		= tcp_getsockopt,
2416 	.keepalive		= tcp_set_keepalive,
2417 	.recvmsg		= tcp_recvmsg,
2418 	.sendmsg		= tcp_sendmsg,
2419 	.sendpage		= tcp_sendpage,
2420 	.backlog_rcv		= tcp_v4_do_rcv,
2421 	.release_cb		= tcp_release_cb,
2422 	.hash			= inet_hash,
2423 	.unhash			= inet_unhash,
2424 	.get_port		= inet_csk_get_port,
2425 	.enter_memory_pressure	= tcp_enter_memory_pressure,
2426 	.leave_memory_pressure	= tcp_leave_memory_pressure,
2427 	.stream_memory_free	= tcp_stream_memory_free,
2428 	.sockets_allocated	= &tcp_sockets_allocated,
2429 	.orphan_count		= &tcp_orphan_count,
2430 	.memory_allocated	= &tcp_memory_allocated,
2431 	.memory_pressure	= &tcp_memory_pressure,
2432 	.sysctl_mem		= sysctl_tcp_mem,
2433 	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
2434 	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
2435 	.max_header		= MAX_TCP_HEADER,
2436 	.obj_size		= sizeof(struct tcp_sock),
2437 	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
2438 	.twsk_prot		= &tcp_timewait_sock_ops,
2439 	.rsk_prot		= &tcp_request_sock_ops,
2440 	.h.hashinfo		= &tcp_hashinfo,
2441 	.no_autobind		= true,
2442 #ifdef CONFIG_COMPAT
2443 	.compat_setsockopt	= compat_tcp_setsockopt,
2444 	.compat_getsockopt	= compat_tcp_getsockopt,
2445 #endif
2446 	.diag_destroy		= tcp_abort,
2447 };
2448 EXPORT_SYMBOL(tcp_prot);
2449 
2450 static void __net_exit tcp_sk_exit(struct net *net)
2451 {
2452 	int cpu;
2453 
2454 	module_put(net->ipv4.tcp_congestion_control->owner);
2455 
2456 	for_each_possible_cpu(cpu)
2457 		inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2458 	free_percpu(net->ipv4.tcp_sk);
2459 }
2460 
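/*
 * Per-namespace setup: create one kernel control socket per possible
 * CPU (used to send RSTs and ACKs that are not tied to a full socket)
 * and initialise the namespace's TCP sysctl defaults.
 */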
2461 static int __net_init tcp_sk_init(struct net *net)
2462 {
2463 	int res, cpu, cnt;
2464 
2465 	net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2466 	if (!net->ipv4.tcp_sk)
2467 		return -ENOMEM;
2468 
2469 	for_each_possible_cpu(cpu) {
2470 		struct sock *sk;
2471 
2472 		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2473 					   IPPROTO_TCP, net);
2474 		if (res)
2475 			goto fail;
2476 		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2477 		*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2478 	}
2479 
2480 	net->ipv4.sysctl_tcp_ecn = 2;
2481 	net->ipv4.sysctl_tcp_ecn_fallback = 1;
2482 
2483 	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2484 	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2485 	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2486 
2487 	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2488 	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2489 	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2490 
2491 	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2492 	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2493 	net->ipv4.sysctl_tcp_syncookies = 1;
2494 	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2495 	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2496 	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2497 	net->ipv4.sysctl_tcp_orphan_retries = 0;
2498 	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2499 	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
2500 	net->ipv4.sysctl_tcp_tw_reuse = 0;
2501 
2502 	cnt = tcp_hashinfo.ehash_mask + 1;
2503 	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = (cnt + 1) / 2;
2504 	net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
2505 
2506 	net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 256);
2507 	net->ipv4.sysctl_tcp_sack = 1;
2508 	net->ipv4.sysctl_tcp_window_scaling = 1;
2509 	net->ipv4.sysctl_tcp_timestamps = 1;
2510 	net->ipv4.sysctl_tcp_early_retrans = 3;
2511 	net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
2512 	net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
2513 	net->ipv4.sysctl_tcp_retrans_collapse = 1;
2514 	net->ipv4.sysctl_tcp_max_reordering = 300;
2515 	net->ipv4.sysctl_tcp_dsack = 1;
2516 	net->ipv4.sysctl_tcp_app_win = 31;
2517 	net->ipv4.sysctl_tcp_adv_win_scale = 1;
2518 	net->ipv4.sysctl_tcp_frto = 2;
2519 	net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
2520 	/* This limits the percentage of the congestion window which we
2521 	 * will allow a single TSO frame to consume.  Building TSO frames
2522 	 * which are too large can cause TCP streams to be bursty.
2523 	 */
2524 	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
2525 	/* Default TSQ limit of four TSO segments */
2526 	net->ipv4.sysctl_tcp_limit_output_bytes = 262144;
2527 	/* RFC 5961 challenge ACK rate limiting */
2528 	net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
2529 	net->ipv4.sysctl_tcp_min_tso_segs = 2;
2530 	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
2531 	net->ipv4.sysctl_tcp_autocorking = 1;
2532 	net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
2533 	net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
2534 	net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
2535 	if (net != &init_net) {
2536 		memcpy(net->ipv4.sysctl_tcp_rmem,
2537 		       init_net.ipv4.sysctl_tcp_rmem,
2538 		       sizeof(init_net.ipv4.sysctl_tcp_rmem));
2539 		memcpy(net->ipv4.sysctl_tcp_wmem,
2540 		       init_net.ipv4.sysctl_tcp_wmem,
2541 		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
2542 	}
2543 	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
2544 	spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
2545 	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
2546 	atomic_set(&net->ipv4.tfo_active_disable_times, 0);
2547 
2548 	/* Reno is always built in */
2549 	if (!net_eq(net, &init_net) &&
2550 	    try_module_get(init_net.ipv4.tcp_congestion_control->owner))
2551 		net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
2552 	else
2553 		net->ipv4.tcp_congestion_control = &tcp_reno;
2554 
2555 	return 0;
2556 fail:
2557 	tcp_sk_exit(net);
2558 
2559 	return res;
2560 }
2561 
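/*
 * Batched namespace teardown: purge the remaining TIME-WAIT sockets
 * once for all exiting namespaces, then drop each namespace's TCP
 * fastopen context.
 */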
2562 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2563 {
2564 	struct net *net;
2565 
2566 	inet_twsk_purge(&tcp_hashinfo, AF_INET);
2567 
2568 	list_for_each_entry(net, net_exit_list, exit_list)
2569 		tcp_fastopen_ctx_destroy(net);
2570 }
2571 
2572 static struct pernet_operations __net_initdata tcp_sk_ops = {
2573        .init	   = tcp_sk_init,
2574        .exit	   = tcp_sk_exit,
2575        .exit_batch = tcp_sk_exit_batch,
2576 };
2577 
2578 void __init tcp_v4_init(void)
2579 {
2580 	if (register_pernet_subsys(&tcp_sk_ops))
2581 		panic("Failed to create the TCP control socket.\n");
2582 }
2583