xref: /openbmc/linux/net/ipv4/tcp_ipv4.c (revision d2ba09c1)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		Implementation of the Transmission Control Protocol(TCP).
7  *
8  *		IPv4 specific functions
9  *
10  *
11  *		code split from:
12  *		linux/ipv4/tcp.c
13  *		linux/ipv4/tcp_input.c
14  *		linux/ipv4/tcp_output.c
15  *
16  *		See tcp.c for author information
17  *
18  *	This program is free software; you can redistribute it and/or
19  *      modify it under the terms of the GNU General Public License
20  *      as published by the Free Software Foundation; either version
21  *      2 of the License, or (at your option) any later version.
22  */
23 
24 /*
25  * Changes:
26  *		David S. Miller	:	New socket lookup architecture.
27  *					This code is dedicated to John Dyson.
28  *		David S. Miller :	Change semantics of established hash,
29  *					half is devoted to TIME_WAIT sockets
30  *					and the rest go in the other half.
31  *		Andi Kleen :		Add support for syncookies and fixed
32  *					some bugs: ip options weren't passed to
33  *					the TCP layer, missed a check for an
34  *					ACK bit.
35  *		Andi Kleen :		Implemented fast path mtu discovery.
36  *	     				Fixed many serious bugs in the
37  *					request_sock handling and moved
38  *					most of it into the af independent code.
39  *					Added tail drop and some other bugfixes.
40  *					Added new listen semantics.
41  *		Mike McLagan	:	Routing by source
42  *	Juan Jose Ciarlante:		ip_dynaddr bits
43  *		Andi Kleen:		various fixes.
44  *	Vitaly E. Lavrov	:	Transparent proxy revived after year
45  *					coma.
46  *	Andi Kleen		:	Fix new listen.
47  *	Andi Kleen		:	Fix accept error reporting.
48  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
49  *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
50  *					a single port at the same time.
51  */
52 
53 #define pr_fmt(fmt) "TCP: " fmt
54 
55 #include <linux/bottom_half.h>
56 #include <linux/types.h>
57 #include <linux/fcntl.h>
58 #include <linux/module.h>
59 #include <linux/random.h>
60 #include <linux/cache.h>
61 #include <linux/jhash.h>
62 #include <linux/init.h>
63 #include <linux/times.h>
64 #include <linux/slab.h>
65 
66 #include <net/net_namespace.h>
67 #include <net/icmp.h>
68 #include <net/inet_hashtables.h>
69 #include <net/tcp.h>
70 #include <net/transp_v6.h>
71 #include <net/ipv6.h>
72 #include <net/inet_common.h>
73 #include <net/timewait_sock.h>
74 #include <net/xfrm.h>
75 #include <net/secure_seq.h>
76 #include <net/busy_poll.h>
77 
78 #include <linux/inet.h>
79 #include <linux/ipv6.h>
80 #include <linux/stddef.h>
81 #include <linux/proc_fs.h>
82 #include <linux/seq_file.h>
83 #include <linux/inetdevice.h>
84 
85 #include <crypto/hash.h>
86 #include <linux/scatterlist.h>
87 
88 #include <trace/events/tcp.h>
89 
90 #ifdef CONFIG_TCP_MD5SIG
91 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
92 			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
93 #endif
94 
95 struct inet_hashinfo tcp_hashinfo;
96 EXPORT_SYMBOL(tcp_hashinfo);
97 
98 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
99 {
100 	return secure_tcp_seq(ip_hdr(skb)->daddr,
101 			      ip_hdr(skb)->saddr,
102 			      tcp_hdr(skb)->dest,
103 			      tcp_hdr(skb)->source);
104 }
105 
106 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
107 {
108 	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
109 }
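/* Both helpers above come down to keyed hashes of the connection endpoints:
 * secure_tcp_seq() covers the full 4-tuple when picking the initial sequence
 * number (in the spirit of RFC 6528 ISN randomization), while
 * secure_tcp_ts_off() derives a per-destination timestamp offset from the
 * address pair alone.
 */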
110 
111 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
112 {
113 	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
114 	struct tcp_sock *tp = tcp_sk(sk);
115 
116 	/* With PAWS, it is safe from the viewpoint
117 	   of data integrity. Even without PAWS it is safe provided the sequence
118 	   spaces do not overlap, i.e. at data rates <= 80 Mbit/sec.
119 
120 	   Actually, the idea is close to VJ's, except that the timestamp cache
121 	   is held not per host but per port pair, and the TW bucket is used as
122 	   the state holder.
123 
124 	   If the TW bucket has already been destroyed we fall back to VJ's
125 	   scheme and use the initial timestamp retrieved from the peer table.
126 	 */
127 	if (tcptw->tw_ts_recent_stamp &&
128 	    (!twp || (sock_net(sk)->ipv4.sysctl_tcp_tw_reuse &&
129 			     get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
130 		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
131 		if (tp->write_seq == 0)
132 			tp->write_seq = 1;
133 		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
134 		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
135 		sock_hold(sktw);
136 		return 1;
137 	}
138 
139 	return 0;
140 }
141 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
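/* Note the arithmetic above: when a TIME-WAIT port pair is reused, write_seq
 * restarts at tw_snd_nxt + 65535 + 2, i.e. a full unscaled window plus a
 * little slack beyond the last sequence number the old incarnation sent, so
 * the new data stream starts safely past the old one even without PAWS.
 */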
142 
143 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
144 			      int addr_len)
145 {
146 	/* This check is replicated from tcp_v4_connect() and intended to
147 	 * prevent the BPF program called below from accessing bytes that are
148 	 * outside the bound specified by the user in addr_len.
149 	 */
150 	if (addr_len < sizeof(struct sockaddr_in))
151 		return -EINVAL;
152 
153 	sock_owned_by_me(sk);
154 
155 	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
156 }
157 
158 /* This will initiate an outgoing connection. */
159 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
160 {
161 	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
162 	struct inet_sock *inet = inet_sk(sk);
163 	struct tcp_sock *tp = tcp_sk(sk);
164 	__be16 orig_sport, orig_dport;
165 	__be32 daddr, nexthop;
166 	struct flowi4 *fl4;
167 	struct rtable *rt;
168 	int err;
169 	struct ip_options_rcu *inet_opt;
170 	struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
171 
172 	if (addr_len < sizeof(struct sockaddr_in))
173 		return -EINVAL;
174 
175 	if (usin->sin_family != AF_INET)
176 		return -EAFNOSUPPORT;
177 
178 	nexthop = daddr = usin->sin_addr.s_addr;
179 	inet_opt = rcu_dereference_protected(inet->inet_opt,
180 					     lockdep_sock_is_held(sk));
181 	if (inet_opt && inet_opt->opt.srr) {
182 		if (!daddr)
183 			return -EINVAL;
184 		nexthop = inet_opt->opt.faddr;
185 	}
186 
187 	orig_sport = inet->inet_sport;
188 	orig_dport = usin->sin_port;
189 	fl4 = &inet->cork.fl.u.ip4;
190 	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
191 			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
192 			      IPPROTO_TCP,
193 			      orig_sport, orig_dport, sk);
194 	if (IS_ERR(rt)) {
195 		err = PTR_ERR(rt);
196 		if (err == -ENETUNREACH)
197 			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
198 		return err;
199 	}
200 
201 	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
202 		ip_rt_put(rt);
203 		return -ENETUNREACH;
204 	}
205 
206 	if (!inet_opt || !inet_opt->opt.srr)
207 		daddr = fl4->daddr;
208 
209 	if (!inet->inet_saddr)
210 		inet->inet_saddr = fl4->saddr;
211 	sk_rcv_saddr_set(sk, inet->inet_saddr);
212 
213 	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
214 		/* Reset inherited state */
215 		tp->rx_opt.ts_recent	   = 0;
216 		tp->rx_opt.ts_recent_stamp = 0;
217 		if (likely(!tp->repair))
218 			tp->write_seq	   = 0;
219 	}
220 
221 	inet->inet_dport = usin->sin_port;
222 	sk_daddr_set(sk, daddr);
223 
224 	inet_csk(sk)->icsk_ext_hdr_len = 0;
225 	if (inet_opt)
226 		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
227 
228 	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
229 
230 	/* Socket identity is still unknown (sport may be zero).
231 	 * However we set the state to SYN-SENT and, without releasing the
232 	 * socket lock, select a source port, enter ourselves into the hash
233 	 * tables and complete initialization after this.
234 	 */
235 	tcp_set_state(sk, TCP_SYN_SENT);
236 	err = inet_hash_connect(tcp_death_row, sk);
237 	if (err)
238 		goto failure;
239 
240 	sk_set_txhash(sk);
241 
242 	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
243 			       inet->inet_sport, inet->inet_dport, sk);
244 	if (IS_ERR(rt)) {
245 		err = PTR_ERR(rt);
246 		rt = NULL;
247 		goto failure;
248 	}
249 	/* OK, now commit destination to socket.  */
250 	sk->sk_gso_type = SKB_GSO_TCPV4;
251 	sk_setup_caps(sk, &rt->dst);
252 	rt = NULL;
253 
254 	if (likely(!tp->repair)) {
255 		if (!tp->write_seq)
256 			tp->write_seq = secure_tcp_seq(inet->inet_saddr,
257 						       inet->inet_daddr,
258 						       inet->inet_sport,
259 						       usin->sin_port);
260 		tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
261 						 inet->inet_saddr,
262 						 inet->inet_daddr);
263 	}
264 
265 	inet->inet_id = tp->write_seq ^ jiffies;
266 
267 	if (tcp_fastopen_defer_connect(sk, &err))
268 		return err;
269 	if (err)
270 		goto failure;
271 
272 	err = tcp_connect(sk);
273 
274 	if (err)
275 		goto failure;
276 
277 	return 0;
278 
279 failure:
280 	/*
281 	 * This unhashes the socket and releases the local port,
282 	 * if necessary.
283 	 */
284 	tcp_set_state(sk, TCP_CLOSE);
285 	ip_rt_put(rt);
286 	sk->sk_route_caps = 0;
287 	inet->inet_dport = 0;
288 	return err;
289 }
290 EXPORT_SYMBOL(tcp_v4_connect);
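/* For context, tcp_v4_connect() is reached from the connect() system call on
 * an unconnected IPv4 TCP socket.  A minimal userspace sketch (the headers
 * <sys/socket.h>, <netinet/in.h> and <arpa/inet.h> are assumed, and
 * 192.0.2.1:80 is purely illustrative):
 *
 *	int fd = socket(AF_INET, SOCK_STREAM, 0);
 *	struct sockaddr_in dst = {
 *		.sin_family = AF_INET,
 *		.sin_port   = htons(80),
 *	};
 *
 *	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);
 *	if (connect(fd, (struct sockaddr *)&dst, sizeof(dst)) < 0)
 *		perror("connect");
 */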
291 
292 /*
293  * This routine reacts to ICMP_FRAG_NEEDED MTU indications as defined in RFC 1191.
294  * It can be called through tcp_release_cb() if the socket was owned by the
295  * user at the time tcp_v4_err() was called to handle the ICMP message.
296  */
297 void tcp_v4_mtu_reduced(struct sock *sk)
298 {
299 	struct inet_sock *inet = inet_sk(sk);
300 	struct dst_entry *dst;
301 	u32 mtu;
302 
303 	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
304 		return;
305 	mtu = tcp_sk(sk)->mtu_info;
306 	dst = inet_csk_update_pmtu(sk, mtu);
307 	if (!dst)
308 		return;
309 
310 	/* Something is about to go wrong... Remember the soft error
311 	 * in case this connection is not able to recover.
312 	 */
313 	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
314 		sk->sk_err_soft = EMSGSIZE;
315 
316 	mtu = dst_mtu(dst);
317 
318 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
319 	    ip_sk_accept_pmtu(sk) &&
320 	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
321 		tcp_sync_mss(sk, mtu);
322 
323 		/* Resend the TCP packet because it's
324 		 * clear that the old packet has been
325 		 * dropped. This is the new "fast" path mtu
326 		 * discovery.
327 		 */
328 		tcp_simple_retransmit(sk);
329 	} /* else let the usual retransmit timer handle it */
330 }
331 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
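/* The mtu value consumed here is stashed in tp->mtu_info by tcp_v4_err() when
 * an ICMP_FRAG_NEEDED message arrives; if the socket was owned by the user at
 * that moment, the TCP_MTU_REDUCED_DEFERRED flag defers this call until
 * tcp_release_cb() runs (see the ICMP_DEST_UNREACH handling below).
 */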
332 
333 static void do_redirect(struct sk_buff *skb, struct sock *sk)
334 {
335 	struct dst_entry *dst = __sk_dst_check(sk, 0);
336 
337 	if (dst)
338 		dst->ops->redirect(dst, sk, skb);
339 }
340 
341 
342 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
343 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
344 {
345 	struct request_sock *req = inet_reqsk(sk);
346 	struct net *net = sock_net(sk);
347 
348 	/* ICMPs are not backlogged, hence we cannot get
349 	 * an established socket here.
350 	 */
351 	if (seq != tcp_rsk(req)->snt_isn) {
352 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
353 	} else if (abort) {
354 		/*
355 		 * Still in SYN_RECV, just remove it silently.
356 		 * There is no good way to pass the error to the newly
357 		 * created socket, and POSIX does not want network
358 		 * errors returned from accept().
359 		 */
360 		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
361 		tcp_listendrop(req->rsk_listener);
362 	}
363 	reqsk_put(req);
364 }
365 EXPORT_SYMBOL(tcp_req_err);
366 
367 /*
368  * This routine is called by the ICMP module when it gets some
369  * sort of error condition.  If err < 0 then the socket should
370  * be closed and the error returned to the user.  If err > 0
371  * it's just the icmp type << 8 | icmp code.  After adjustment
372  * header points to the first 8 bytes of the tcp header.  We need
373  * to find the appropriate port.
374  *
375  * The locking strategy used here is very "optimistic". When
376  * someone else accesses the socket the ICMP is just dropped
377  * and for some paths there is no check at all.
378  * A more general error queue to queue errors for later handling
379  * is probably better.
380  *
381  */
382 
383 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
384 {
385 	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
386 	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
387 	struct inet_connection_sock *icsk;
388 	struct tcp_sock *tp;
389 	struct inet_sock *inet;
390 	const int type = icmp_hdr(icmp_skb)->type;
391 	const int code = icmp_hdr(icmp_skb)->code;
392 	struct sock *sk;
393 	struct sk_buff *skb;
394 	struct request_sock *fastopen;
395 	u32 seq, snd_una;
396 	s32 remaining;
397 	u32 delta_us;
398 	int err;
399 	struct net *net = dev_net(icmp_skb->dev);
400 
401 	sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
402 				       th->dest, iph->saddr, ntohs(th->source),
403 				       inet_iif(icmp_skb), 0);
404 	if (!sk) {
405 		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
406 		return;
407 	}
408 	if (sk->sk_state == TCP_TIME_WAIT) {
409 		inet_twsk_put(inet_twsk(sk));
410 		return;
411 	}
412 	seq = ntohl(th->seq);
413 	if (sk->sk_state == TCP_NEW_SYN_RECV)
414 		return tcp_req_err(sk, seq,
415 				  type == ICMP_PARAMETERPROB ||
416 				  type == ICMP_TIME_EXCEEDED ||
417 				  (type == ICMP_DEST_UNREACH &&
418 				   (code == ICMP_NET_UNREACH ||
419 				    code == ICMP_HOST_UNREACH)));
420 
421 	bh_lock_sock(sk);
422 	/* If too many ICMPs get dropped on busy
423 	 * servers this needs to be solved differently.
424 	 * We do take care of the PMTU discovery (RFC 1191) special case:
425 	 * we can receive locally generated ICMP messages while the socket is held.
426 	 */
427 	if (sock_owned_by_user(sk)) {
428 		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
429 			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
430 	}
431 	if (sk->sk_state == TCP_CLOSE)
432 		goto out;
433 
434 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
435 		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
436 		goto out;
437 	}
438 
439 	icsk = inet_csk(sk);
440 	tp = tcp_sk(sk);
441 	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
442 	fastopen = tp->fastopen_rsk;
443 	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
444 	if (sk->sk_state != TCP_LISTEN &&
445 	    !between(seq, snd_una, tp->snd_nxt)) {
446 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
447 		goto out;
448 	}
449 
450 	switch (type) {
451 	case ICMP_REDIRECT:
452 		if (!sock_owned_by_user(sk))
453 			do_redirect(icmp_skb, sk);
454 		goto out;
455 	case ICMP_SOURCE_QUENCH:
456 		/* Just silently ignore these. */
457 		goto out;
458 	case ICMP_PARAMETERPROB:
459 		err = EPROTO;
460 		break;
461 	case ICMP_DEST_UNREACH:
462 		if (code > NR_ICMP_UNREACH)
463 			goto out;
464 
465 		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
466 			/* We are not interested in TCP_LISTEN and open_requests
467 			 * (SYN-ACKs sent out by Linux are always < 576 bytes, so
468 			 * they should go through unfragmented).
469 			 */
470 			if (sk->sk_state == TCP_LISTEN)
471 				goto out;
472 
473 			tp->mtu_info = info;
474 			if (!sock_owned_by_user(sk)) {
475 				tcp_v4_mtu_reduced(sk);
476 			} else {
477 				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
478 					sock_hold(sk);
479 			}
480 			goto out;
481 		}
482 
483 		err = icmp_err_convert[code].errno;
484 		/* check if icmp_skb allows revert of backoff
485 		 * (see draft-zimmermann-tcp-lcd) */
486 		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
487 			break;
488 		if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
489 		    !icsk->icsk_backoff || fastopen)
490 			break;
491 
492 		if (sock_owned_by_user(sk))
493 			break;
494 
495 		icsk->icsk_backoff--;
496 		icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
497 					       TCP_TIMEOUT_INIT;
498 		icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
499 
500 		skb = tcp_rtx_queue_head(sk);
501 		BUG_ON(!skb);
502 
503 		tcp_mstamp_refresh(tp);
504 		delta_us = (u32)(tp->tcp_mstamp - skb->skb_mstamp);
505 		remaining = icsk->icsk_rto -
506 			    usecs_to_jiffies(delta_us);
507 
508 		if (remaining > 0) {
509 			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
510 						  remaining, TCP_RTO_MAX);
511 		} else {
512 			/* RTO revert clocked out retransmission.
513 			 * Will retransmit now */
514 			tcp_retransmit_timer(sk);
515 		}
516 
517 		break;
518 	case ICMP_TIME_EXCEEDED:
519 		err = EHOSTUNREACH;
520 		break;
521 	default:
522 		goto out;
523 	}
524 
525 	switch (sk->sk_state) {
526 	case TCP_SYN_SENT:
527 	case TCP_SYN_RECV:
528 		/* Only in fast or simultaneous open. If a fast open socket is
529 		 * already accepted it is treated as a connected one below.
530 		 */
531 		if (fastopen && !fastopen->sk)
532 			break;
533 
534 		if (!sock_owned_by_user(sk)) {
535 			sk->sk_err = err;
536 
537 			sk->sk_error_report(sk);
538 
539 			tcp_done(sk);
540 		} else {
541 			sk->sk_err_soft = err;
542 		}
543 		goto out;
544 	}
545 
546 	/* If we've already connected we will keep trying
547 	 * until we time out, or the user gives up.
548 	 *
549 	 * RFC 1122 4.2.3.9 allows only PROTO_UNREACH and PORT_UNREACH to be
550 	 * considered hard errors (well, FRAG_FAILED too, but it is obsoleted
551 	 * by PMTU discovery).
552 	 *
553 	 * Note that in the modern internet, where routing is unreliable and
554 	 * broken firewalls sit in every dark corner sending random errors
555 	 * ordered by their masters, even these two messages finally lose
556 	 * their original sense (even Linux sends invalid PORT_UNREACHs).
557 	 *
558 	 * Now we are in compliance with RFCs.
559 	 *							--ANK (980905)
560 	 */
561 
562 	inet = inet_sk(sk);
563 	if (!sock_owned_by_user(sk) && inet->recverr) {
564 		sk->sk_err = err;
565 		sk->sk_error_report(sk);
566 	} else	{ /* Only an error on timeout */
567 		sk->sk_err_soft = err;
568 	}
569 
570 out:
571 	bh_unlock_sock(sk);
572 	sock_put(sk);
573 }
574 
575 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
576 {
577 	struct tcphdr *th = tcp_hdr(skb);
578 
579 	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
580 	skb->csum_start = skb_transport_header(skb) - skb->head;
581 	skb->csum_offset = offsetof(struct tcphdr, check);
582 }
583 
584 /* This routine computes an IPv4 TCP checksum. */
585 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
586 {
587 	const struct inet_sock *inet = inet_sk(sk);
588 
589 	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
590 }
591 EXPORT_SYMBOL(tcp_v4_send_check);
592 
593 /*
594  *	This routine will send an RST to the other tcp.
595  *
596  *	Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
597  *		      for the reset?
598  *	Answer: if a packet caused an RST, it is not for a socket
599  *		existing in our system; if it is matched to a socket,
600  *		it is just a duplicate segment or a bug in the other side's TCP.
601  *		So we build the reply based only on parameters that
602  *		arrived with the segment.
603  *	Exception: precedence violation. We do not implement it in any case.
604  */
605 
606 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
607 {
608 	const struct tcphdr *th = tcp_hdr(skb);
609 	struct {
610 		struct tcphdr th;
611 #ifdef CONFIG_TCP_MD5SIG
612 		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
613 #endif
614 	} rep;
615 	struct ip_reply_arg arg;
616 #ifdef CONFIG_TCP_MD5SIG
617 	struct tcp_md5sig_key *key = NULL;
618 	const __u8 *hash_location = NULL;
619 	unsigned char newhash[16];
620 	int genhash;
621 	struct sock *sk1 = NULL;
622 #endif
623 	struct net *net;
624 	struct sock *ctl_sk;
625 
626 	/* Never send a reset in response to a reset. */
627 	if (th->rst)
628 		return;
629 
630 	/* If sk is not NULL, it means we did a successful lookup and the
631 	 * incoming route had to be correct. prequeue might have dropped our dst.
632 	 */
633 	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
634 		return;
635 
636 	/* Swap the send and the receive. */
637 	memset(&rep, 0, sizeof(rep));
638 	rep.th.dest   = th->source;
639 	rep.th.source = th->dest;
640 	rep.th.doff   = sizeof(struct tcphdr) / 4;
641 	rep.th.rst    = 1;
642 
643 	if (th->ack) {
644 		rep.th.seq = th->ack_seq;
645 	} else {
646 		rep.th.ack = 1;
647 		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
648 				       skb->len - (th->doff << 2));
649 	}
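	/* This mirrors the RFC 793 reset rules: if the offending segment
	 * carried an ACK, the RST takes its sequence number from that ACK;
	 * otherwise the RST ACKs everything the segment occupied in sequence
	 * space (payload length plus the SYN and FIN flags).
	 */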
650 
651 	memset(&arg, 0, sizeof(arg));
652 	arg.iov[0].iov_base = (unsigned char *)&rep;
653 	arg.iov[0].iov_len  = sizeof(rep.th);
654 
655 	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
656 #ifdef CONFIG_TCP_MD5SIG
657 	rcu_read_lock();
658 	hash_location = tcp_parse_md5sig_option(th);
659 	if (sk && sk_fullsock(sk)) {
660 		key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
661 					&ip_hdr(skb)->saddr, AF_INET);
662 	} else if (hash_location) {
663 		/*
664 		 * The active side is lost. Try to find the listening socket via
665 		 * the source port, and then find the MD5 key via that socket.
666 		 * We do not loosen security here:
667 		 * the incoming packet is checked against the MD5 hash of the key
668 		 * we find, and no RST is generated if the hash doesn't match.
669 		 */
670 		sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
671 					     ip_hdr(skb)->saddr,
672 					     th->source, ip_hdr(skb)->daddr,
673 					     ntohs(th->source), inet_iif(skb),
674 					     tcp_v4_sdif(skb));
675 		/* don't send an RST if we can't find a key */
676 		if (!sk1)
677 			goto out;
678 
679 		key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
680 					&ip_hdr(skb)->saddr, AF_INET);
681 		if (!key)
682 			goto out;
683 
684 
685 		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
686 		if (genhash || memcmp(hash_location, newhash, 16) != 0)
687 			goto out;
688 
689 	}
690 
691 	if (key) {
692 		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
693 				   (TCPOPT_NOP << 16) |
694 				   (TCPOPT_MD5SIG << 8) |
695 				   TCPOLEN_MD5SIG);
696 		/* Update length and the length the header thinks exists */
697 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
698 		rep.th.doff = arg.iov[0].iov_len / 4;
699 
700 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
701 				     key, ip_hdr(skb)->saddr,
702 				     ip_hdr(skb)->daddr, &rep.th);
703 	}
704 #endif
705 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
706 				      ip_hdr(skb)->saddr, /* XXX */
707 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
708 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
709 	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
710 
711 	/* When the socket is gone, all binding information is lost and
712 	 * routing might fail. No choice here: if we force the
713 	 * input interface, we will misroute in the case of an asymmetric route.
714 	 */
715 	if (sk) {
716 		arg.bound_dev_if = sk->sk_bound_dev_if;
717 		if (sk_fullsock(sk))
718 			trace_tcp_send_reset(sk, skb);
719 	}
720 
721 	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
722 		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));
723 
724 	arg.tos = ip_hdr(skb)->tos;
725 	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
726 	local_bh_disable();
727 	ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk);
728 	if (sk)
729 		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
730 				   inet_twsk(sk)->tw_mark : sk->sk_mark;
731 	ip_send_unicast_reply(ctl_sk,
732 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
733 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
734 			      &arg, arg.iov[0].iov_len);
735 
736 	ctl_sk->sk_mark = 0;
737 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
738 	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
739 	local_bh_enable();
740 
741 #ifdef CONFIG_TCP_MD5SIG
742 out:
743 	rcu_read_unlock();
744 #endif
745 }
746 
747 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
748    outside socket context, is certainly ugly. What can I do?
749  */
750 
751 static void tcp_v4_send_ack(const struct sock *sk,
752 			    struct sk_buff *skb, u32 seq, u32 ack,
753 			    u32 win, u32 tsval, u32 tsecr, int oif,
754 			    struct tcp_md5sig_key *key,
755 			    int reply_flags, u8 tos)
756 {
757 	const struct tcphdr *th = tcp_hdr(skb);
758 	struct {
759 		struct tcphdr th;
760 		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
761 #ifdef CONFIG_TCP_MD5SIG
762 			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
763 #endif
764 			];
765 	} rep;
766 	struct net *net = sock_net(sk);
767 	struct ip_reply_arg arg;
768 	struct sock *ctl_sk;
769 
770 	memset(&rep.th, 0, sizeof(struct tcphdr));
771 	memset(&arg, 0, sizeof(arg));
772 
773 	arg.iov[0].iov_base = (unsigned char *)&rep;
774 	arg.iov[0].iov_len  = sizeof(rep.th);
775 	if (tsecr) {
776 		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
777 				   (TCPOPT_TIMESTAMP << 8) |
778 				   TCPOLEN_TIMESTAMP);
779 		rep.opt[1] = htonl(tsval);
780 		rep.opt[2] = htonl(tsecr);
781 		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
782 	}
783 
784 	/* Swap the send and the receive. */
785 	rep.th.dest    = th->source;
786 	rep.th.source  = th->dest;
787 	rep.th.doff    = arg.iov[0].iov_len / 4;
788 	rep.th.seq     = htonl(seq);
789 	rep.th.ack_seq = htonl(ack);
790 	rep.th.ack     = 1;
791 	rep.th.window  = htons(win);
792 
793 #ifdef CONFIG_TCP_MD5SIG
794 	if (key) {
795 		int offset = (tsecr) ? 3 : 0;
796 
797 		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
798 					  (TCPOPT_NOP << 16) |
799 					  (TCPOPT_MD5SIG << 8) |
800 					  TCPOLEN_MD5SIG);
801 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
802 		rep.th.doff = arg.iov[0].iov_len/4;
803 
804 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
805 				    key, ip_hdr(skb)->saddr,
806 				    ip_hdr(skb)->daddr, &rep.th);
807 	}
808 #endif
809 	arg.flags = reply_flags;
810 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
811 				      ip_hdr(skb)->saddr, /* XXX */
812 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
813 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
814 	if (oif)
815 		arg.bound_dev_if = oif;
816 	arg.tos = tos;
817 	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
818 	local_bh_disable();
819 	ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk);
820 	if (sk)
821 		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
822 				   inet_twsk(sk)->tw_mark : sk->sk_mark;
823 	ip_send_unicast_reply(ctl_sk,
824 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
825 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
826 			      &arg, arg.iov[0].iov_len);
827 
828 	ctl_sk->sk_mark = 0;
829 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
830 	local_bh_enable();
831 }
832 
833 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
834 {
835 	struct inet_timewait_sock *tw = inet_twsk(sk);
836 	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
837 
838 	tcp_v4_send_ack(sk, skb,
839 			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
840 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
841 			tcp_time_stamp_raw() + tcptw->tw_ts_offset,
842 			tcptw->tw_ts_recent,
843 			tw->tw_bound_dev_if,
844 			tcp_twsk_md5_key(tcptw),
845 			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
846 			tw->tw_tos
847 			);
848 
849 	inet_twsk_put(tw);
850 }
851 
852 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
853 				  struct request_sock *req)
854 {
855 	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
856 	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
857 	 */
858 	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
859 					     tcp_sk(sk)->snd_nxt;
860 
861 	/* RFC 7323 2.3
862 	 * The window field (SEG.WND) of every outgoing segment, with the
863 	 * exception of <SYN> segments, MUST be right-shifted by
864 	 * Rcv.Wind.Shift bits:
865 	 */
866 	tcp_v4_send_ack(sk, skb, seq,
867 			tcp_rsk(req)->rcv_nxt,
868 			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
869 			tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
870 			req->ts_recent,
871 			0,
872 			tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->saddr,
873 					  AF_INET),
874 			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
875 			ip_hdr(skb)->tos);
876 }
877 
878 /*
879  *	Send a SYN-ACK after having received a SYN.
880  *	This still operates on a request_sock only, not on a big
881  *	socket.
882  */
883 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
884 			      struct flowi *fl,
885 			      struct request_sock *req,
886 			      struct tcp_fastopen_cookie *foc,
887 			      enum tcp_synack_type synack_type)
888 {
889 	const struct inet_request_sock *ireq = inet_rsk(req);
890 	struct flowi4 fl4;
891 	int err = -1;
892 	struct sk_buff *skb;
893 
894 	/* First, grab a route. */
895 	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
896 		return -1;
897 
898 	skb = tcp_make_synack(sk, dst, req, foc, synack_type);
899 
900 	if (skb) {
901 		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
902 
903 		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
904 					    ireq->ir_rmt_addr,
905 					    ireq_opt_deref(ireq));
906 		err = net_xmit_eval(err);
907 	}
908 
909 	return err;
910 }
911 
912 /*
913  *	IPv4 request_sock destructor.
914  */
915 static void tcp_v4_reqsk_destructor(struct request_sock *req)
916 {
917 	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
918 }
919 
920 #ifdef CONFIG_TCP_MD5SIG
921 /*
922  * RFC2385 MD5 checksumming requires a mapping of
923  * IP address->MD5 Key.
924  * We need to maintain these in the sk structure.
925  */
926 
927 /* Find the Key structure for an address.  */
928 struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
929 					 const union tcp_md5_addr *addr,
930 					 int family)
931 {
932 	const struct tcp_sock *tp = tcp_sk(sk);
933 	struct tcp_md5sig_key *key;
934 	const struct tcp_md5sig_info *md5sig;
935 	__be32 mask;
936 	struct tcp_md5sig_key *best_match = NULL;
937 	bool match;
938 
939 	/* caller either holds rcu_read_lock() or socket lock */
940 	md5sig = rcu_dereference_check(tp->md5sig_info,
941 				       lockdep_sock_is_held(sk));
942 	if (!md5sig)
943 		return NULL;
944 
945 	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
946 		if (key->family != family)
947 			continue;
948 
949 		if (family == AF_INET) {
950 			mask = inet_make_mask(key->prefixlen);
951 			match = (key->addr.a4.s_addr & mask) ==
952 				(addr->a4.s_addr & mask);
953 #if IS_ENABLED(CONFIG_IPV6)
954 		} else if (family == AF_INET6) {
955 			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
956 						  key->prefixlen);
957 #endif
958 		} else {
959 			match = false;
960 		}
961 
962 		if (match && (!best_match ||
963 			      key->prefixlen > best_match->prefixlen))
964 			best_match = key;
965 	}
966 	return best_match;
967 }
968 EXPORT_SYMBOL(tcp_md5_do_lookup);
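/* The lookup above is a longest-prefix match: every configured key whose
 * prefix covers the peer address is considered and the most specific one
 * (largest prefixlen) wins.  tcp_md5_do_lookup_exact() below is the companion
 * used when adding or deleting keys, where an exact address/prefix match is
 * required.
 */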
969 
970 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
971 						      const union tcp_md5_addr *addr,
972 						      int family, u8 prefixlen)
973 {
974 	const struct tcp_sock *tp = tcp_sk(sk);
975 	struct tcp_md5sig_key *key;
976 	unsigned int size = sizeof(struct in_addr);
977 	const struct tcp_md5sig_info *md5sig;
978 
979 	/* caller either holds rcu_read_lock() or socket lock */
980 	md5sig = rcu_dereference_check(tp->md5sig_info,
981 				       lockdep_sock_is_held(sk));
982 	if (!md5sig)
983 		return NULL;
984 #if IS_ENABLED(CONFIG_IPV6)
985 	if (family == AF_INET6)
986 		size = sizeof(struct in6_addr);
987 #endif
988 	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
989 		if (key->family != family)
990 			continue;
991 		if (!memcmp(&key->addr, addr, size) &&
992 		    key->prefixlen == prefixlen)
993 			return key;
994 	}
995 	return NULL;
996 }
997 
998 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
999 					 const struct sock *addr_sk)
1000 {
1001 	const union tcp_md5_addr *addr;
1002 
1003 	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1004 	return tcp_md5_do_lookup(sk, addr, AF_INET);
1005 }
1006 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1007 
1008 /* This can be called on a newly created socket, from other files */
1009 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1010 		   int family, u8 prefixlen, const u8 *newkey, u8 newkeylen,
1011 		   gfp_t gfp)
1012 {
1013 	/* Add Key to the list */
1014 	struct tcp_md5sig_key *key;
1015 	struct tcp_sock *tp = tcp_sk(sk);
1016 	struct tcp_md5sig_info *md5sig;
1017 
1018 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
1019 	if (key) {
1020 		/* Pre-existing entry - just update that one. */
1021 		memcpy(key->key, newkey, newkeylen);
1022 		key->keylen = newkeylen;
1023 		return 0;
1024 	}
1025 
1026 	md5sig = rcu_dereference_protected(tp->md5sig_info,
1027 					   lockdep_sock_is_held(sk));
1028 	if (!md5sig) {
1029 		md5sig = kmalloc(sizeof(*md5sig), gfp);
1030 		if (!md5sig)
1031 			return -ENOMEM;
1032 
1033 		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1034 		INIT_HLIST_HEAD(&md5sig->head);
1035 		rcu_assign_pointer(tp->md5sig_info, md5sig);
1036 	}
1037 
1038 	key = sock_kmalloc(sk, sizeof(*key), gfp);
1039 	if (!key)
1040 		return -ENOMEM;
1041 	if (!tcp_alloc_md5sig_pool()) {
1042 		sock_kfree_s(sk, key, sizeof(*key));
1043 		return -ENOMEM;
1044 	}
1045 
1046 	memcpy(key->key, newkey, newkeylen);
1047 	key->keylen = newkeylen;
1048 	key->family = family;
1049 	key->prefixlen = prefixlen;
1050 	memcpy(&key->addr, addr,
1051 	       (family == AF_INET6) ? sizeof(struct in6_addr) :
1052 				      sizeof(struct in_addr));
1053 	hlist_add_head_rcu(&key->node, &md5sig->head);
1054 	return 0;
1055 }
1056 EXPORT_SYMBOL(tcp_md5_do_add);
1057 
1058 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1059 		   u8 prefixlen)
1060 {
1061 	struct tcp_md5sig_key *key;
1062 
1063 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
1064 	if (!key)
1065 		return -ENOENT;
1066 	hlist_del_rcu(&key->node);
1067 	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1068 	kfree_rcu(key, rcu);
1069 	return 0;
1070 }
1071 EXPORT_SYMBOL(tcp_md5_do_del);
1072 
1073 static void tcp_clear_md5_list(struct sock *sk)
1074 {
1075 	struct tcp_sock *tp = tcp_sk(sk);
1076 	struct tcp_md5sig_key *key;
1077 	struct hlist_node *n;
1078 	struct tcp_md5sig_info *md5sig;
1079 
1080 	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1081 
1082 	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1083 		hlist_del_rcu(&key->node);
1084 		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1085 		kfree_rcu(key, rcu);
1086 	}
1087 }
1088 
1089 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1090 				 char __user *optval, int optlen)
1091 {
1092 	struct tcp_md5sig cmd;
1093 	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1094 	u8 prefixlen = 32;
1095 
1096 	if (optlen < sizeof(cmd))
1097 		return -EINVAL;
1098 
1099 	if (copy_from_user(&cmd, optval, sizeof(cmd)))
1100 		return -EFAULT;
1101 
1102 	if (sin->sin_family != AF_INET)
1103 		return -EINVAL;
1104 
1105 	if (optname == TCP_MD5SIG_EXT &&
1106 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1107 		prefixlen = cmd.tcpm_prefixlen;
1108 		if (prefixlen > 32)
1109 			return -EINVAL;
1110 	}
1111 
1112 	if (!cmd.tcpm_keylen)
1113 		return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1114 				      AF_INET, prefixlen);
1115 
1116 	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1117 		return -EINVAL;
1118 
1119 	return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1120 			      AF_INET, prefixlen, cmd.tcpm_key, cmd.tcpm_keylen,
1121 			      GFP_KERNEL);
1122 }
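/* A minimal userspace sketch of how such a key is configured, assuming the
 * UAPI definitions from <linux/tcp.h> (struct tcp_md5sig, TCP_MD5SIG,
 * TCP_MD5SIG_MAXKEYLEN); the peer address and key below are purely
 * illustrative:
 *
 *	struct tcp_md5sig md5 = { .tcpm_keylen = 6 };
 *	struct sockaddr_in *peer = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	peer->sin_family = AF_INET;
 *	inet_pton(AF_INET, "192.0.2.1", &peer->sin_addr);
 *	memcpy(md5.tcpm_key, "secret", 6);
 *	if (setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5)) < 0)
 *		perror("setsockopt(TCP_MD5SIG)");
 */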
1123 
1124 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1125 				   __be32 daddr, __be32 saddr,
1126 				   const struct tcphdr *th, int nbytes)
1127 {
1128 	struct tcp4_pseudohdr *bp;
1129 	struct scatterlist sg;
1130 	struct tcphdr *_th;
1131 
1132 	bp = hp->scratch;
1133 	bp->saddr = saddr;
1134 	bp->daddr = daddr;
1135 	bp->pad = 0;
1136 	bp->protocol = IPPROTO_TCP;
1137 	bp->len = cpu_to_be16(nbytes);
1138 
1139 	_th = (struct tcphdr *)(bp + 1);
1140 	memcpy(_th, th, sizeof(*th));
1141 	_th->check = 0;
1142 
1143 	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1144 	ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1145 				sizeof(*bp) + sizeof(*th));
1146 	return crypto_ahash_update(hp->md5_req);
1147 }
1148 
1149 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1150 			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
1151 {
1152 	struct tcp_md5sig_pool *hp;
1153 	struct ahash_request *req;
1154 
1155 	hp = tcp_get_md5sig_pool();
1156 	if (!hp)
1157 		goto clear_hash_noput;
1158 	req = hp->md5_req;
1159 
1160 	if (crypto_ahash_init(req))
1161 		goto clear_hash;
1162 	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1163 		goto clear_hash;
1164 	if (tcp_md5_hash_key(hp, key))
1165 		goto clear_hash;
1166 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1167 	if (crypto_ahash_final(req))
1168 		goto clear_hash;
1169 
1170 	tcp_put_md5sig_pool();
1171 	return 0;
1172 
1173 clear_hash:
1174 	tcp_put_md5sig_pool();
1175 clear_hash_noput:
1176 	memset(md5_hash, 0, 16);
1177 	return 1;
1178 }
1179 
1180 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1181 			const struct sock *sk,
1182 			const struct sk_buff *skb)
1183 {
1184 	struct tcp_md5sig_pool *hp;
1185 	struct ahash_request *req;
1186 	const struct tcphdr *th = tcp_hdr(skb);
1187 	__be32 saddr, daddr;
1188 
1189 	if (sk) { /* valid for establish/request sockets */
1190 		saddr = sk->sk_rcv_saddr;
1191 		daddr = sk->sk_daddr;
1192 	} else {
1193 		const struct iphdr *iph = ip_hdr(skb);
1194 		saddr = iph->saddr;
1195 		daddr = iph->daddr;
1196 	}
1197 
1198 	hp = tcp_get_md5sig_pool();
1199 	if (!hp)
1200 		goto clear_hash_noput;
1201 	req = hp->md5_req;
1202 
1203 	if (crypto_ahash_init(req))
1204 		goto clear_hash;
1205 
1206 	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1207 		goto clear_hash;
1208 	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1209 		goto clear_hash;
1210 	if (tcp_md5_hash_key(hp, key))
1211 		goto clear_hash;
1212 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1213 	if (crypto_ahash_final(req))
1214 		goto clear_hash;
1215 
1216 	tcp_put_md5sig_pool();
1217 	return 0;
1218 
1219 clear_hash:
1220 	tcp_put_md5sig_pool();
1221 clear_hash_noput:
1222 	memset(md5_hash, 0, 16);
1223 	return 1;
1224 }
1225 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1226 
1227 #endif
1228 
1229 /* Called with rcu_read_lock() */
1230 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1231 				    const struct sk_buff *skb)
1232 {
1233 #ifdef CONFIG_TCP_MD5SIG
1234 	/*
1235 	 * This gets called for each TCP segment that arrives
1236 	 * so we want to be efficient.
1237 	 * We have 3 drop cases:
1238 	 * o No MD5 hash and one expected.
1239 	 * o MD5 hash and we're not expecting one.
1240 	 * o MD5 hash and it's wrong.
1241 	 */
1242 	const __u8 *hash_location = NULL;
1243 	struct tcp_md5sig_key *hash_expected;
1244 	const struct iphdr *iph = ip_hdr(skb);
1245 	const struct tcphdr *th = tcp_hdr(skb);
1246 	int genhash;
1247 	unsigned char newhash[16];
1248 
1249 	hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1250 					  AF_INET);
1251 	hash_location = tcp_parse_md5sig_option(th);
1252 
1253 	/* We've parsed the options - do we have a hash? */
1254 	if (!hash_expected && !hash_location)
1255 		return false;
1256 
1257 	if (hash_expected && !hash_location) {
1258 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1259 		return true;
1260 	}
1261 
1262 	if (!hash_expected && hash_location) {
1263 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1264 		return true;
1265 	}
1266 
1267 	/* Okay, so this is hash_expected and hash_location -
1268 	 * so we need to calculate the checksum.
1269 	 */
1270 	genhash = tcp_v4_md5_hash_skb(newhash,
1271 				      hash_expected,
1272 				      NULL, skb);
1273 
1274 	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1275 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1276 		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1277 				     &iph->saddr, ntohs(th->source),
1278 				     &iph->daddr, ntohs(th->dest),
1279 				     genhash ? " tcp_v4_calc_md5_hash failed"
1280 				     : "");
1281 		return true;
1282 	}
1283 	return false;
1284 #endif
1285 	return false;
1286 }
1287 
1288 static void tcp_v4_init_req(struct request_sock *req,
1289 			    const struct sock *sk_listener,
1290 			    struct sk_buff *skb)
1291 {
1292 	struct inet_request_sock *ireq = inet_rsk(req);
1293 	struct net *net = sock_net(sk_listener);
1294 
1295 	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1296 	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1297 	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1298 }
1299 
1300 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1301 					  struct flowi *fl,
1302 					  const struct request_sock *req)
1303 {
1304 	return inet_csk_route_req(sk, &fl->u.ip4, req);
1305 }
1306 
1307 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1308 	.family		=	PF_INET,
1309 	.obj_size	=	sizeof(struct tcp_request_sock),
1310 	.rtx_syn_ack	=	tcp_rtx_synack,
1311 	.send_ack	=	tcp_v4_reqsk_send_ack,
1312 	.destructor	=	tcp_v4_reqsk_destructor,
1313 	.send_reset	=	tcp_v4_send_reset,
1314 	.syn_ack_timeout =	tcp_syn_ack_timeout,
1315 };
1316 
1317 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1318 	.mss_clamp	=	TCP_MSS_DEFAULT,
1319 #ifdef CONFIG_TCP_MD5SIG
1320 	.req_md5_lookup	=	tcp_v4_md5_lookup,
1321 	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1322 #endif
1323 	.init_req	=	tcp_v4_init_req,
1324 #ifdef CONFIG_SYN_COOKIES
1325 	.cookie_init_seq =	cookie_v4_init_sequence,
1326 #endif
1327 	.route_req	=	tcp_v4_route_req,
1328 	.init_seq	=	tcp_v4_init_seq,
1329 	.init_ts_off	=	tcp_v4_init_ts_off,
1330 	.send_synack	=	tcp_v4_send_synack,
1331 };
1332 
1333 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1334 {
1335 	/* Never answer SYNs sent to broadcast or multicast */
1336 	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1337 		goto drop;
1338 
1339 	return tcp_conn_request(&tcp_request_sock_ops,
1340 				&tcp_request_sock_ipv4_ops, sk, skb);
1341 
1342 drop:
1343 	tcp_listendrop(sk);
1344 	return 0;
1345 }
1346 EXPORT_SYMBOL(tcp_v4_conn_request);
1347 
1348 
1349 /*
1350  * The three-way handshake has completed - we got a valid ACK -
1351  * now create the new socket.
1352  */
1353 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1354 				  struct request_sock *req,
1355 				  struct dst_entry *dst,
1356 				  struct request_sock *req_unhash,
1357 				  bool *own_req)
1358 {
1359 	struct inet_request_sock *ireq;
1360 	struct inet_sock *newinet;
1361 	struct tcp_sock *newtp;
1362 	struct sock *newsk;
1363 #ifdef CONFIG_TCP_MD5SIG
1364 	struct tcp_md5sig_key *key;
1365 #endif
1366 	struct ip_options_rcu *inet_opt;
1367 
1368 	if (sk_acceptq_is_full(sk))
1369 		goto exit_overflow;
1370 
1371 	newsk = tcp_create_openreq_child(sk, req, skb);
1372 	if (!newsk)
1373 		goto exit_nonewsk;
1374 
1375 	newsk->sk_gso_type = SKB_GSO_TCPV4;
1376 	inet_sk_rx_dst_set(newsk, skb);
1377 
1378 	newtp		      = tcp_sk(newsk);
1379 	newinet		      = inet_sk(newsk);
1380 	ireq		      = inet_rsk(req);
1381 	sk_daddr_set(newsk, ireq->ir_rmt_addr);
1382 	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1383 	newsk->sk_bound_dev_if = ireq->ir_iif;
1384 	newinet->inet_saddr   = ireq->ir_loc_addr;
1385 	inet_opt	      = rcu_dereference(ireq->ireq_opt);
1386 	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1387 	newinet->mc_index     = inet_iif(skb);
1388 	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1389 	newinet->rcv_tos      = ip_hdr(skb)->tos;
1390 	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1391 	if (inet_opt)
1392 		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1393 	newinet->inet_id = newtp->write_seq ^ jiffies;
1394 
1395 	if (!dst) {
1396 		dst = inet_csk_route_child_sock(sk, newsk, req);
1397 		if (!dst)
1398 			goto put_and_exit;
1399 	} else {
1400 		/* syncookie case : see end of cookie_v4_check() */
1401 	}
1402 	sk_setup_caps(newsk, dst);
1403 
1404 	tcp_ca_openreq_child(newsk, dst);
1405 
1406 	tcp_sync_mss(newsk, dst_mtu(dst));
1407 	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1408 
1409 	tcp_initialize_rcv_mss(newsk);
1410 
1411 #ifdef CONFIG_TCP_MD5SIG
1412 	/* Copy over the MD5 key from the original socket */
1413 	key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1414 				AF_INET);
1415 	if (key) {
1416 		/*
1417 		 * We're using one, so create a matching key
1418 		 * on the newsk structure. If we fail to get
1419 		 * memory, then we end up not copying the key
1420 		 * across. Shucks.
1421 		 */
1422 		tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1423 			       AF_INET, 32, key->key, key->keylen, GFP_ATOMIC);
1424 		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1425 	}
1426 #endif
1427 
1428 	if (__inet_inherit_port(sk, newsk) < 0)
1429 		goto put_and_exit;
1430 	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
1431 	if (likely(*own_req)) {
1432 		tcp_move_syn(newtp, req);
1433 		ireq->ireq_opt = NULL;
1434 	} else {
1435 		newinet->inet_opt = NULL;
1436 	}
1437 	return newsk;
1438 
1439 exit_overflow:
1440 	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1441 exit_nonewsk:
1442 	dst_release(dst);
1443 exit:
1444 	tcp_listendrop(sk);
1445 	return NULL;
1446 put_and_exit:
1447 	newinet->inet_opt = NULL;
1448 	inet_csk_prepare_forced_close(newsk);
1449 	tcp_done(newsk);
1450 	goto exit;
1451 }
1452 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1453 
1454 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1455 {
1456 #ifdef CONFIG_SYN_COOKIES
1457 	const struct tcphdr *th = tcp_hdr(skb);
1458 
1459 	if (!th->syn)
1460 		sk = cookie_v4_check(sk, skb);
1461 #endif
1462 	return sk;
1463 }
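/* On a listening socket this is only consulted for non-SYN segments: when SYN
 * cookies are in use, the ACK completing the handshake has no request sock to
 * match, so cookie_v4_check() may validate the cookie and hand back a freshly
 * created child socket, which tcp_v4_do_rcv() then passes to
 * tcp_child_process().
 */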
1464 
1465 /* The socket must have its spinlock held when we get
1466  * here, unless it is a TCP_LISTEN socket.
1467  *
1468  * We have a potential double-lock case here, so even when
1469  * doing backlog processing we use the BH locking scheme.
1470  * This is because we cannot sleep with the original spinlock
1471  * held.
1472  */
1473 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1474 {
1475 	struct sock *rsk;
1476 
1477 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1478 		struct dst_entry *dst = sk->sk_rx_dst;
1479 
1480 		sock_rps_save_rxhash(sk, skb);
1481 		sk_mark_napi_id(sk, skb);
1482 		if (dst) {
1483 			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1484 			    !dst->ops->check(dst, 0)) {
1485 				dst_release(dst);
1486 				sk->sk_rx_dst = NULL;
1487 			}
1488 		}
1489 		tcp_rcv_established(sk, skb, tcp_hdr(skb));
1490 		return 0;
1491 	}
1492 
1493 	if (tcp_checksum_complete(skb))
1494 		goto csum_err;
1495 
1496 	if (sk->sk_state == TCP_LISTEN) {
1497 		struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1498 
1499 		if (!nsk)
1500 			goto discard;
1501 		if (nsk != sk) {
1502 			if (tcp_child_process(sk, nsk, skb)) {
1503 				rsk = nsk;
1504 				goto reset;
1505 			}
1506 			return 0;
1507 		}
1508 	} else
1509 		sock_rps_save_rxhash(sk, skb);
1510 
1511 	if (tcp_rcv_state_process(sk, skb)) {
1512 		rsk = sk;
1513 		goto reset;
1514 	}
1515 	return 0;
1516 
1517 reset:
1518 	tcp_v4_send_reset(rsk, skb);
1519 discard:
1520 	kfree_skb(skb);
1521 	/* Be careful here. If this function gets more complicated and
1522 	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1523 	 * might be destroyed here. This current version compiles correctly,
1524 	 * but you have been warned.
1525 	 */
1526 	return 0;
1527 
1528 csum_err:
1529 	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1530 	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1531 	goto discard;
1532 }
1533 EXPORT_SYMBOL(tcp_v4_do_rcv);
1534 
1535 int tcp_v4_early_demux(struct sk_buff *skb)
1536 {
1537 	const struct iphdr *iph;
1538 	const struct tcphdr *th;
1539 	struct sock *sk;
1540 
1541 	if (skb->pkt_type != PACKET_HOST)
1542 		return 0;
1543 
1544 	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1545 		return 0;
1546 
1547 	iph = ip_hdr(skb);
1548 	th = tcp_hdr(skb);
1549 
1550 	if (th->doff < sizeof(struct tcphdr) / 4)
1551 		return 0;
1552 
1553 	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1554 				       iph->saddr, th->source,
1555 				       iph->daddr, ntohs(th->dest),
1556 				       skb->skb_iif, inet_sdif(skb));
1557 	if (sk) {
1558 		skb->sk = sk;
1559 		skb->destructor = sock_edemux;
1560 		if (sk_fullsock(sk)) {
1561 			struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1562 
1563 			if (dst)
1564 				dst = dst_check(dst, 0);
1565 			if (dst &&
1566 			    inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1567 				skb_dst_set_noref(skb, dst);
1568 		}
1569 	}
1570 	return 0;
1571 }
1572 
1573 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1574 {
1575 	u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;
1576 
1577 	/* Only the socket owner can try to collapse/prune rx queues
1578 	 * to reduce memory overhead, so add a little headroom here.
1579 	 * Only a few socket backlogs are likely to be non-empty at once.
1580 	 */
1581 	limit += 64*1024;
1582 
1583 	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1584 	 * we can fix skb->truesize to its real value to avoid future drops.
1585 	 * This is valid because skb is not yet charged to the socket.
1586 	 * It has been noticed that pure SACK packets were sometimes dropped
1587 	 * (if cooked by drivers without the copybreak feature).
1588 	 */
1589 	skb_condense(skb);
1590 
1591 	if (unlikely(sk_add_backlog(sk, skb, limit))) {
1592 		bh_unlock_sock(sk);
1593 		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1594 		return true;
1595 	}
1596 	return false;
1597 }
1598 EXPORT_SYMBOL(tcp_add_backlog);
1599 
1600 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1601 {
1602 	struct tcphdr *th = (struct tcphdr *)skb->data;
1603 	unsigned int eaten = skb->len;
1604 	int err;
1605 
1606 	err = sk_filter_trim_cap(sk, skb, th->doff * 4);
1607 	if (!err) {
1608 		eaten -= skb->len;
1609 		TCP_SKB_CB(skb)->end_seq -= eaten;
1610 	}
1611 	return err;
1612 }
1613 EXPORT_SYMBOL(tcp_filter);
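/* tcp_filter() runs any attached socket filter via sk_filter_trim_cap(),
 * capping the trim at th->doff * 4 so the TCP header itself is never cut
 * away; if the filter shortens the payload, end_seq is reduced by the number
 * of bytes removed so it matches the bytes that remain.
 */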
1614 
1615 static void tcp_v4_restore_cb(struct sk_buff *skb)
1616 {
1617 	memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1618 		sizeof(struct inet_skb_parm));
1619 }
1620 
1621 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1622 			   const struct tcphdr *th)
1623 {
1624 	/* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB();
1625 	 * barrier() makes sure the compiler won't play fool^Waliasing games.
1626 	 */
1627 	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1628 		sizeof(struct inet_skb_parm));
1629 	barrier();
1630 
1631 	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1632 	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1633 				    skb->len - th->doff * 4);
1634 	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1635 	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1636 	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1637 	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1638 	TCP_SKB_CB(skb)->sacked	 = 0;
1639 	TCP_SKB_CB(skb)->has_rxtstamp =
1640 			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1641 }
1642 
1643 /*
1644  *	From tcp_input.c
1645  */
1646 
1647 int tcp_v4_rcv(struct sk_buff *skb)
1648 {
1649 	struct net *net = dev_net(skb->dev);
1650 	int sdif = inet_sdif(skb);
1651 	const struct iphdr *iph;
1652 	const struct tcphdr *th;
1653 	bool refcounted;
1654 	struct sock *sk;
1655 	int ret;
1656 
1657 	if (skb->pkt_type != PACKET_HOST)
1658 		goto discard_it;
1659 
1660 	/* Count it even if it's bad */
1661 	__TCP_INC_STATS(net, TCP_MIB_INSEGS);
1662 
1663 	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1664 		goto discard_it;
1665 
1666 	th = (const struct tcphdr *)skb->data;
1667 
1668 	if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1669 		goto bad_packet;
1670 	if (!pskb_may_pull(skb, th->doff * 4))
1671 		goto discard_it;
1672 
1673 	/* An explanation is required here, I think.
1674 	 * Packet length and doff are validated by header prediction,
1675 	 * provided the case of th->doff==0 is eliminated.
1676 	 * So, we defer the checks. */
1677 
1678 	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1679 		goto csum_error;
1680 
1681 	th = (const struct tcphdr *)skb->data;
1682 	iph = ip_hdr(skb);
1683 lookup:
1684 	sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1685 			       th->dest, sdif, &refcounted);
1686 	if (!sk)
1687 		goto no_tcp_socket;
1688 
1689 process:
1690 	if (sk->sk_state == TCP_TIME_WAIT)
1691 		goto do_time_wait;
1692 
1693 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
1694 		struct request_sock *req = inet_reqsk(sk);
1695 		bool req_stolen = false;
1696 		struct sock *nsk;
1697 
1698 		sk = req->rsk_listener;
1699 		if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
1700 			sk_drops_add(sk, skb);
1701 			reqsk_put(req);
1702 			goto discard_it;
1703 		}
1704 		if (unlikely(sk->sk_state != TCP_LISTEN)) {
1705 			inet_csk_reqsk_queue_drop_and_put(sk, req);
1706 			goto lookup;
1707 		}
1708 		/* We own a reference on the listener, increase it again
1709 		 * as we might lose it too soon.
1710 		 */
1711 		sock_hold(sk);
1712 		refcounted = true;
1713 		nsk = NULL;
1714 		if (!tcp_filter(sk, skb)) {
1715 			th = (const struct tcphdr *)skb->data;
1716 			iph = ip_hdr(skb);
1717 			tcp_v4_fill_cb(skb, iph, th);
1718 			nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
1719 		}
1720 		if (!nsk) {
1721 			reqsk_put(req);
1722 			if (req_stolen) {
1723 				/* Another cpu got exclusive access to req
1724 				 * and created a full blown socket.
1725 				 * Try to feed this packet to this socket
1726 				 * instead of discarding it.
1727 				 */
1728 				tcp_v4_restore_cb(skb);
1729 				sock_put(sk);
1730 				goto lookup;
1731 			}
1732 			goto discard_and_relse;
1733 		}
1734 		if (nsk == sk) {
1735 			reqsk_put(req);
1736 			tcp_v4_restore_cb(skb);
1737 		} else if (tcp_child_process(sk, nsk, skb)) {
1738 			tcp_v4_send_reset(nsk, skb);
1739 			goto discard_and_relse;
1740 		} else {
1741 			sock_put(sk);
1742 			return 0;
1743 		}
1744 	}
1745 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1746 		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
1747 		goto discard_and_relse;
1748 	}
1749 
1750 	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1751 		goto discard_and_relse;
1752 
1753 	if (tcp_v4_inbound_md5_hash(sk, skb))
1754 		goto discard_and_relse;
1755 
1756 	nf_reset(skb);
1757 
1758 	if (tcp_filter(sk, skb))
1759 		goto discard_and_relse;
1760 	th = (const struct tcphdr *)skb->data;
1761 	iph = ip_hdr(skb);
1762 	tcp_v4_fill_cb(skb, iph, th);
1763 
1764 	skb->dev = NULL;
1765 
1766 	if (sk->sk_state == TCP_LISTEN) {
1767 		ret = tcp_v4_do_rcv(sk, skb);
1768 		goto put_and_return;
1769 	}
1770 
1771 	sk_incoming_cpu_update(sk);
1772 
1773 	bh_lock_sock_nested(sk);
1774 	tcp_segs_in(tcp_sk(sk), skb);
1775 	ret = 0;
1776 	if (!sock_owned_by_user(sk)) {
1777 		ret = tcp_v4_do_rcv(sk, skb);
1778 	} else if (tcp_add_backlog(sk, skb)) {
1779 		goto discard_and_relse;
1780 	}
1781 	bh_unlock_sock(sk);
1782 
1783 put_and_return:
1784 	if (refcounted)
1785 		sock_put(sk);
1786 
1787 	return ret;
1788 
1789 no_tcp_socket:
1790 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1791 		goto discard_it;
1792 
1793 	tcp_v4_fill_cb(skb, iph, th);
1794 
1795 	if (tcp_checksum_complete(skb)) {
1796 csum_error:
1797 		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
1798 bad_packet:
1799 		__TCP_INC_STATS(net, TCP_MIB_INERRS);
1800 	} else {
1801 		tcp_v4_send_reset(NULL, skb);
1802 	}
1803 
1804 discard_it:
1805 	/* Discard frame. */
1806 	kfree_skb(skb);
1807 	return 0;
1808 
1809 discard_and_relse:
1810 	sk_drops_add(sk, skb);
1811 	if (refcounted)
1812 		sock_put(sk);
1813 	goto discard_it;
1814 
1815 do_time_wait:
1816 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1817 		inet_twsk_put(inet_twsk(sk));
1818 		goto discard_it;
1819 	}
1820 
1821 	tcp_v4_fill_cb(skb, iph, th);
1822 
1823 	if (tcp_checksum_complete(skb)) {
1824 		inet_twsk_put(inet_twsk(sk));
1825 		goto csum_error;
1826 	}
1827 	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1828 	case TCP_TW_SYN: {
1829 		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1830 							&tcp_hashinfo, skb,
1831 							__tcp_hdrlen(th),
1832 							iph->saddr, th->source,
1833 							iph->daddr, th->dest,
1834 							inet_iif(skb),
1835 							sdif);
1836 		if (sk2) {
1837 			inet_twsk_deschedule_put(inet_twsk(sk));
1838 			sk = sk2;
1839 			tcp_v4_restore_cb(skb);
1840 			refcounted = false;
1841 			goto process;
1842 		}
1843 	}
1844 		/* to ACK */
1845 		/* fall through */
1846 	case TCP_TW_ACK:
1847 		tcp_v4_timewait_ack(sk, skb);
1848 		break;
1849 	case TCP_TW_RST:
1850 		tcp_v4_send_reset(sk, skb);
1851 		inet_twsk_deschedule_put(inet_twsk(sk));
1852 		goto discard_it;
1853 	case TCP_TW_SUCCESS:;
1854 	}
1855 	goto discard_it;
1856 }
1857 
1858 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1859 	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
1860 	.twsk_unique	= tcp_twsk_unique,
1861 	.twsk_destructor= tcp_twsk_destructor,
1862 };
1863 
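/* Cache the validated input route on the socket so later packets for this
 * connection can skip a full route lookup.  A reference is taken only if
 * dst_hold_safe() confirms the dst is still live.
 */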
1864 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
1865 {
1866 	struct dst_entry *dst = skb_dst(skb);
1867 
1868 	if (dst && dst_hold_safe(dst)) {
1869 		sk->sk_rx_dst = dst;
1870 		inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
1871 	}
1872 }
1873 EXPORT_SYMBOL(inet_sk_rx_dst_set);
1874 
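/* Address-family specific hooks used by the generic connection code when
 * TCP runs over IPv4; tcp_ipv6.c provides a table with the same layout.
 */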
1875 const struct inet_connection_sock_af_ops ipv4_specific = {
1876 	.queue_xmit	   = ip_queue_xmit,
1877 	.send_check	   = tcp_v4_send_check,
1878 	.rebuild_header	   = inet_sk_rebuild_header,
1879 	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
1880 	.conn_request	   = tcp_v4_conn_request,
1881 	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
1882 	.net_header_len	   = sizeof(struct iphdr),
1883 	.setsockopt	   = ip_setsockopt,
1884 	.getsockopt	   = ip_getsockopt,
1885 	.addr2sockaddr	   = inet_csk_addr2sockaddr,
1886 	.sockaddr_len	   = sizeof(struct sockaddr_in),
1887 #ifdef CONFIG_COMPAT
1888 	.compat_setsockopt = compat_ip_setsockopt,
1889 	.compat_getsockopt = compat_ip_getsockopt,
1890 #endif
1891 	.mtu_reduced	   = tcp_v4_mtu_reduced,
1892 };
1893 EXPORT_SYMBOL(ipv4_specific);
1894 
1895 #ifdef CONFIG_TCP_MD5SIG
1896 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1897 	.md5_lookup		= tcp_v4_md5_lookup,
1898 	.calc_md5_hash		= tcp_v4_md5_hash_skb,
1899 	.md5_parse		= tcp_v4_parse_md5_keys,
1900 };
1901 #endif
1902 
1903 /* NOTE: A lot of things are set to zero explicitly by the call to
1904  *       sk_alloc(), so they need not be done here.
1905  */
1906 static int tcp_v4_init_sock(struct sock *sk)
1907 {
1908 	struct inet_connection_sock *icsk = inet_csk(sk);
1909 
1910 	tcp_init_sock(sk);
1911 
1912 	icsk->icsk_af_ops = &ipv4_specific;
1913 
1914 #ifdef CONFIG_TCP_MD5SIG
1915 	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
1916 #endif
1917 
1918 	return 0;
1919 }
1920 
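/* Final per-socket cleanup: stop timers, release congestion control and
 * ULP state, purge pending queues, drop MD5 keys and the bound port, and
 * free any Fast Open state left over from connect().
 */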
1921 void tcp_v4_destroy_sock(struct sock *sk)
1922 {
1923 	struct tcp_sock *tp = tcp_sk(sk);
1924 
1925 	trace_tcp_destroy_sock(sk);
1926 
1927 	tcp_clear_xmit_timers(sk);
1928 
1929 	tcp_cleanup_congestion_control(sk);
1930 
1931 	tcp_cleanup_ulp(sk);
1932 
1933 	/* Clean up the write buffer. */
1934 	tcp_write_queue_purge(sk);
1935 
1936 	/* Check if we want to disable active TFO */
1937 	tcp_fastopen_active_disable_ofo_check(sk);
1938 
1939 	/* Cleans up our, hopefully empty, out_of_order_queue. */
1940 	skb_rbtree_purge(&tp->out_of_order_queue);
1941 
1942 #ifdef CONFIG_TCP_MD5SIG
1943 	/* Clean up the MD5 key list, if any */
1944 	if (tp->md5sig_info) {
1945 		tcp_clear_md5_list(sk);
1946 		kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
1947 		tp->md5sig_info = NULL;
1948 	}
1949 #endif
1950 
1951 	/* Clean up a referenced TCP bind bucket. */
1952 	if (inet_csk(sk)->icsk_bind_hash)
1953 		inet_put_port(sk);
1954 
1955 	BUG_ON(tp->fastopen_rsk);
1956 
1957 	/* If socket is aborted during connect operation */
1958 	tcp_free_fastopen_req(tp);
1959 	tcp_fastopen_destroy_cipher(sk);
1960 	tcp_saved_syn_free(tp);
1961 
1962 	sk_sockets_allocated_dec(sk);
1963 }
1964 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1965 
1966 #ifdef CONFIG_PROC_FS
1967 /* Proc filesystem TCP sock list dumping. */
1968 
1969 /*
1970  * Get next listener socket following cur.  If cur is NULL, get first socket
1971  * starting from bucket given in st->bucket; when st->bucket is zero the
1972  * very first socket in the hash table is returned.
1973  */
1974 static void *listening_get_next(struct seq_file *seq, void *cur)
1975 {
1976 	struct tcp_iter_state *st = seq->private;
1977 	struct net *net = seq_file_net(seq);
1978 	struct inet_listen_hashbucket *ilb;
1979 	struct sock *sk = cur;
1980 
1981 	if (!sk) {
1982 get_head:
1983 		ilb = &tcp_hashinfo.listening_hash[st->bucket];
1984 		spin_lock(&ilb->lock);
1985 		sk = sk_head(&ilb->head);
1986 		st->offset = 0;
1987 		goto get_sk;
1988 	}
1989 	ilb = &tcp_hashinfo.listening_hash[st->bucket];
1990 	++st->num;
1991 	++st->offset;
1992 
1993 	sk = sk_next(sk);
1994 get_sk:
1995 	sk_for_each_from(sk) {
1996 		if (!net_eq(sock_net(sk), net))
1997 			continue;
1998 		if (sk->sk_family == st->family)
1999 			return sk;
2000 	}
2001 	spin_unlock(&ilb->lock);
2002 	st->offset = 0;
2003 	if (++st->bucket < INET_LHTABLE_SIZE)
2004 		goto get_head;
2005 	return NULL;
2006 }
2007 
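/* Skip forward from the first listening socket until *pos entries have
 * been consumed; used when the iterator cannot resume from a cached
 * position.
 */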
2008 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2009 {
2010 	struct tcp_iter_state *st = seq->private;
2011 	void *rc;
2012 
2013 	st->bucket = 0;
2014 	st->offset = 0;
2015 	rc = listening_get_next(seq, NULL);
2016 
2017 	while (rc && *pos) {
2018 		rc = listening_get_next(seq, rc);
2019 		--*pos;
2020 	}
2021 	return rc;
2022 }
2023 
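/* The established hash uses nulls-terminated chains, so an empty bucket
 * can be detected without taking its lock.
 */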
2024 static inline bool empty_bucket(const struct tcp_iter_state *st)
2025 {
2026 	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2027 }
2028 
2029 /*
2030  * Get first established socket starting from bucket given in st->bucket.
2031  * If st->bucket is zero, the very first socket in the hash is returned.
2032  */
2033 static void *established_get_first(struct seq_file *seq)
2034 {
2035 	struct tcp_iter_state *st = seq->private;
2036 	struct net *net = seq_file_net(seq);
2037 	void *rc = NULL;
2038 
2039 	st->offset = 0;
2040 	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2041 		struct sock *sk;
2042 		struct hlist_nulls_node *node;
2043 		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2044 
2045 		/* Lockless fast path for the common case of empty buckets */
2046 		if (empty_bucket(st))
2047 			continue;
2048 
2049 		spin_lock_bh(lock);
2050 		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2051 			if (sk->sk_family != st->family ||
2052 			    !net_eq(sock_net(sk), net)) {
2053 				continue;
2054 			}
2055 			rc = sk;
2056 			goto out;
2057 		}
2058 		spin_unlock_bh(lock);
2059 	}
2060 out:
2061 	return rc;
2062 }
2063 
2064 static void *established_get_next(struct seq_file *seq, void *cur)
2065 {
2066 	struct sock *sk = cur;
2067 	struct hlist_nulls_node *node;
2068 	struct tcp_iter_state *st = seq->private;
2069 	struct net *net = seq_file_net(seq);
2070 
2071 	++st->num;
2072 	++st->offset;
2073 
2074 	sk = sk_nulls_next(sk);
2075 
2076 	sk_nulls_for_each_from(sk, node) {
2077 		if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2078 			return sk;
2079 	}
2080 
2081 	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2082 	++st->bucket;
2083 	return established_get_first(seq);
2084 }
2085 
2086 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2087 {
2088 	struct tcp_iter_state *st = seq->private;
2089 	void *rc;
2090 
2091 	st->bucket = 0;
2092 	rc = established_get_first(seq);
2093 
2094 	while (rc && pos) {
2095 		rc = established_get_next(seq, rc);
2096 		--pos;
2097 	}
2098 	return rc;
2099 }
2100 
2101 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2102 {
2103 	void *rc;
2104 	struct tcp_iter_state *st = seq->private;
2105 
2106 	st->state = TCP_SEQ_STATE_LISTENING;
2107 	rc	  = listening_get_idx(seq, &pos);
2108 
2109 	if (!rc) {
2110 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2111 		rc	  = established_get_idx(seq, pos);
2112 	}
2113 
2114 	return rc;
2115 }
2116 
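/* Try to resume the dump near where the previous read() stopped, so a
 * large /proc/net/tcp does not have to be rescanned from bucket 0 for
 * every chunk.  st->bucket and st->offset are reused; st->num is kept.
 */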
2117 static void *tcp_seek_last_pos(struct seq_file *seq)
2118 {
2119 	struct tcp_iter_state *st = seq->private;
2120 	int offset = st->offset;
2121 	int orig_num = st->num;
2122 	void *rc = NULL;
2123 
2124 	switch (st->state) {
2125 	case TCP_SEQ_STATE_LISTENING:
2126 		if (st->bucket >= INET_LHTABLE_SIZE)
2127 			break;
2128 		st->state = TCP_SEQ_STATE_LISTENING;
2129 		rc = listening_get_next(seq, NULL);
2130 		while (offset-- && rc)
2131 			rc = listening_get_next(seq, rc);
2132 		if (rc)
2133 			break;
2134 		st->bucket = 0;
2135 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2136 		/* Fallthrough */
2137 	case TCP_SEQ_STATE_ESTABLISHED:
2138 		if (st->bucket > tcp_hashinfo.ehash_mask)
2139 			break;
2140 		rc = established_get_first(seq);
2141 		while (offset-- && rc)
2142 			rc = established_get_next(seq, rc);
2143 	}
2144 
2145 	st->num = orig_num;
2146 
2147 	return rc;
2148 }
2149 
2150 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2151 {
2152 	struct tcp_iter_state *st = seq->private;
2153 	void *rc;
2154 
2155 	if (*pos && *pos == st->last_pos) {
2156 		rc = tcp_seek_last_pos(seq);
2157 		if (rc)
2158 			goto out;
2159 	}
2160 
2161 	st->state = TCP_SEQ_STATE_LISTENING;
2162 	st->num = 0;
2163 	st->bucket = 0;
2164 	st->offset = 0;
2165 	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2166 
2167 out:
2168 	st->last_pos = *pos;
2169 	return rc;
2170 }
2171 
2172 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2173 {
2174 	struct tcp_iter_state *st = seq->private;
2175 	void *rc = NULL;
2176 
2177 	if (v == SEQ_START_TOKEN) {
2178 		rc = tcp_get_idx(seq, 0);
2179 		goto out;
2180 	}
2181 
2182 	switch (st->state) {
2183 	case TCP_SEQ_STATE_LISTENING:
2184 		rc = listening_get_next(seq, v);
2185 		if (!rc) {
2186 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2187 			st->bucket = 0;
2188 			st->offset = 0;
2189 			rc	  = established_get_first(seq);
2190 		}
2191 		break;
2192 	case TCP_SEQ_STATE_ESTABLISHED:
2193 		rc = established_get_next(seq, v);
2194 		break;
2195 	}
2196 out:
2197 	++*pos;
2198 	st->last_pos = *pos;
2199 	return rc;
2200 }
2201 
2202 static void tcp_seq_stop(struct seq_file *seq, void *v)
2203 {
2204 	struct tcp_iter_state *st = seq->private;
2205 
2206 	switch (st->state) {
2207 	case TCP_SEQ_STATE_LISTENING:
2208 		if (v != SEQ_START_TOKEN)
2209 			spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
2210 		break;
2211 	case TCP_SEQ_STATE_ESTABLISHED:
2212 		if (v)
2213 			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2214 		break;
2215 	}
2216 }
2217 
2218 int tcp_seq_open(struct inode *inode, struct file *file)
2219 {
2220 	struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
2221 	struct tcp_iter_state *s;
2222 	int err;
2223 
2224 	err = seq_open_net(inode, file, &afinfo->seq_ops,
2225 			  sizeof(struct tcp_iter_state));
2226 	if (err < 0)
2227 		return err;
2228 
2229 	s = ((struct seq_file *)file->private_data)->private;
2230 	s->family		= afinfo->family;
2231 	s->last_pos		= 0;
2232 	return 0;
2233 }
2234 EXPORT_SYMBOL(tcp_seq_open);
2235 
2236 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2237 {
2238 	int rc = 0;
2239 	struct proc_dir_entry *p;
2240 
2241 	afinfo->seq_ops.start		= tcp_seq_start;
2242 	afinfo->seq_ops.next		= tcp_seq_next;
2243 	afinfo->seq_ops.stop		= tcp_seq_stop;
2244 
2245 	p = proc_create_data(afinfo->name, 0444, net->proc_net,
2246 			     afinfo->seq_fops, afinfo);
2247 	if (!p)
2248 		rc = -ENOMEM;
2249 	return rc;
2250 }
2251 EXPORT_SYMBOL(tcp_proc_register);
2252 
2253 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2254 {
2255 	remove_proc_entry(afinfo->name, net->proc_net);
2256 }
2257 EXPORT_SYMBOL(tcp_proc_unregister);
2258 
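/* Format one SYN_RECV request socket as a /proc/net/tcp line.  Fields
 * such as option size and inode have no meaning for a request sock and
 * are printed as zero.
 */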
2259 static void get_openreq4(const struct request_sock *req,
2260 			 struct seq_file *f, int i)
2261 {
2262 	const struct inet_request_sock *ireq = inet_rsk(req);
2263 	long delta = req->rsk_timer.expires - jiffies;
2264 
2265 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2266 		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2267 		i,
2268 		ireq->ir_loc_addr,
2269 		ireq->ir_num,
2270 		ireq->ir_rmt_addr,
2271 		ntohs(ireq->ir_rmt_port),
2272 		TCP_SYN_RECV,
2273 		0, 0, /* could print option size, but that is af-dependent. */
2274 		1,    /* timers active (only the expire timer) */
2275 		jiffies_delta_to_clock_t(delta),
2276 		req->num_timeout,
2277 		from_kuid_munged(seq_user_ns(f),
2278 				 sock_i_uid(req->rsk_listener)),
2279 		0,  /* non-standard timer */
2280 		0, /* open_requests have no inode */
2281 		0,
2282 		req);
2283 }
2284 
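/* Format one full TCP socket as a /proc/net/tcp line.  The timer fields
 * report which of the retransmit/probe0/keepalive timers is pending and
 * when it expires, in clock_t units relative to now.
 */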
2285 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2286 {
2287 	int timer_active;
2288 	unsigned long timer_expires;
2289 	const struct tcp_sock *tp = tcp_sk(sk);
2290 	const struct inet_connection_sock *icsk = inet_csk(sk);
2291 	const struct inet_sock *inet = inet_sk(sk);
2292 	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2293 	__be32 dest = inet->inet_daddr;
2294 	__be32 src = inet->inet_rcv_saddr;
2295 	__u16 destp = ntohs(inet->inet_dport);
2296 	__u16 srcp = ntohs(inet->inet_sport);
2297 	int rx_queue;
2298 	int state;
2299 
2300 	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2301 	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2302 	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2303 		timer_active	= 1;
2304 		timer_expires	= icsk->icsk_timeout;
2305 	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2306 		timer_active	= 4;
2307 		timer_expires	= icsk->icsk_timeout;
2308 	} else if (timer_pending(&sk->sk_timer)) {
2309 		timer_active	= 2;
2310 		timer_expires	= sk->sk_timer.expires;
2311 	} else {
2312 		timer_active	= 0;
2313 		timer_expires = jiffies;
2314 	}
2315 
2316 	state = inet_sk_state_load(sk);
2317 	if (state == TCP_LISTEN)
2318 		rx_queue = sk->sk_ack_backlog;
2319 	else
2320 		/* Because we don't lock the socket,
2321 		 * we might find a transient negative value.
2322 		 */
2323 		rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2324 
2325 	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2326 			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2327 		i, src, srcp, dest, destp, state,
2328 		tp->write_seq - tp->snd_una,
2329 		rx_queue,
2330 		timer_active,
2331 		jiffies_delta_to_clock_t(timer_expires - jiffies),
2332 		icsk->icsk_retransmits,
2333 		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2334 		icsk->icsk_probes_out,
2335 		sock_i_ino(sk),
2336 		refcount_read(&sk->sk_refcnt), sk,
2337 		jiffies_to_clock_t(icsk->icsk_rto),
2338 		jiffies_to_clock_t(icsk->icsk_ack.ato),
2339 		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2340 		tp->snd_cwnd,
2341 		state == TCP_LISTEN ?
2342 		    fastopenq->max_qlen :
2343 		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2344 }
2345 
2346 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2347 			       struct seq_file *f, int i)
2348 {
2349 	long delta = tw->tw_timer.expires - jiffies;
2350 	__be32 dest, src;
2351 	__u16 destp, srcp;
2352 
2353 	dest  = tw->tw_daddr;
2354 	src   = tw->tw_rcv_saddr;
2355 	destp = ntohs(tw->tw_dport);
2356 	srcp  = ntohs(tw->tw_sport);
2357 
2358 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2359 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2360 		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2361 		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2362 		refcount_read(&tw->tw_refcnt), tw);
2363 }
2364 
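/* Each /proc/net/tcp entry is padded to a fixed width of TMPSZ - 1
 * characters before the newline added by seq_pad().
 */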
2365 #define TMPSZ 150
2366 
2367 static int tcp4_seq_show(struct seq_file *seq, void *v)
2368 {
2369 	struct tcp_iter_state *st;
2370 	struct sock *sk = v;
2371 
2372 	seq_setwidth(seq, TMPSZ - 1);
2373 	if (v == SEQ_START_TOKEN) {
2374 		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2375 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2376 			   "inode");
2377 		goto out;
2378 	}
2379 	st = seq->private;
2380 
2381 	if (sk->sk_state == TCP_TIME_WAIT)
2382 		get_timewait4_sock(v, seq, st->num);
2383 	else if (sk->sk_state == TCP_NEW_SYN_RECV)
2384 		get_openreq4(v, seq, st->num);
2385 	else
2386 		get_tcp4_sock(v, seq, st->num);
2387 out:
2388 	seq_pad(seq, '\n');
2389 	return 0;
2390 }
2391 
2392 static const struct file_operations tcp_afinfo_seq_fops = {
2393 	.open    = tcp_seq_open,
2394 	.read    = seq_read,
2395 	.llseek  = seq_lseek,
2396 	.release = seq_release_net
2397 };
2398 
2399 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2400 	.name		= "tcp",
2401 	.family		= AF_INET,
2402 	.seq_fops	= &tcp_afinfo_seq_fops,
2403 	.seq_ops	= {
2404 		.show		= tcp4_seq_show,
2405 	},
2406 };
2407 
2408 static int __net_init tcp4_proc_init_net(struct net *net)
2409 {
2410 	return tcp_proc_register(net, &tcp4_seq_afinfo);
2411 }
2412 
2413 static void __net_exit tcp4_proc_exit_net(struct net *net)
2414 {
2415 	tcp_proc_unregister(net, &tcp4_seq_afinfo);
2416 }
2417 
2418 static struct pernet_operations tcp4_net_ops = {
2419 	.init = tcp4_proc_init_net,
2420 	.exit = tcp4_proc_exit_net,
2421 };
2422 
2423 int __init tcp4_proc_init(void)
2424 {
2425 	return register_pernet_subsys(&tcp4_net_ops);
2426 }
2427 
2428 void tcp4_proc_exit(void)
2429 {
2430 	unregister_pernet_subsys(&tcp4_net_ops);
2431 }
2432 #endif /* CONFIG_PROC_FS */
2433 
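/* Protocol descriptor that plugs TCP into the IPv4 socket layer; it is
 * registered from inet_init() and referenced by the SOCK_STREAM inetsw
 * entry in af_inet.c.
 */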
2434 struct proto tcp_prot = {
2435 	.name			= "TCP",
2436 	.owner			= THIS_MODULE,
2437 	.close			= tcp_close,
2438 	.pre_connect		= tcp_v4_pre_connect,
2439 	.connect		= tcp_v4_connect,
2440 	.disconnect		= tcp_disconnect,
2441 	.accept			= inet_csk_accept,
2442 	.ioctl			= tcp_ioctl,
2443 	.init			= tcp_v4_init_sock,
2444 	.destroy		= tcp_v4_destroy_sock,
2445 	.shutdown		= tcp_shutdown,
2446 	.setsockopt		= tcp_setsockopt,
2447 	.getsockopt		= tcp_getsockopt,
2448 	.keepalive		= tcp_set_keepalive,
2449 	.recvmsg		= tcp_recvmsg,
2450 	.sendmsg		= tcp_sendmsg,
2451 	.sendpage		= tcp_sendpage,
2452 	.backlog_rcv		= tcp_v4_do_rcv,
2453 	.release_cb		= tcp_release_cb,
2454 	.hash			= inet_hash,
2455 	.unhash			= inet_unhash,
2456 	.get_port		= inet_csk_get_port,
2457 	.enter_memory_pressure	= tcp_enter_memory_pressure,
2458 	.leave_memory_pressure	= tcp_leave_memory_pressure,
2459 	.stream_memory_free	= tcp_stream_memory_free,
2460 	.sockets_allocated	= &tcp_sockets_allocated,
2461 	.orphan_count		= &tcp_orphan_count,
2462 	.memory_allocated	= &tcp_memory_allocated,
2463 	.memory_pressure	= &tcp_memory_pressure,
2464 	.sysctl_mem		= sysctl_tcp_mem,
2465 	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
2466 	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
2467 	.max_header		= MAX_TCP_HEADER,
2468 	.obj_size		= sizeof(struct tcp_sock),
2469 	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
2470 	.twsk_prot		= &tcp_timewait_sock_ops,
2471 	.rsk_prot		= &tcp_request_sock_ops,
2472 	.h.hashinfo		= &tcp_hashinfo,
2473 	.no_autobind		= true,
2474 #ifdef CONFIG_COMPAT
2475 	.compat_setsockopt	= compat_tcp_setsockopt,
2476 	.compat_getsockopt	= compat_tcp_getsockopt,
2477 #endif
2478 	.diag_destroy		= tcp_abort,
2479 };
2480 EXPORT_SYMBOL(tcp_prot);
2481 
2482 static void __net_exit tcp_sk_exit(struct net *net)
2483 {
2484 	int cpu;
2485 
2486 	module_put(net->ipv4.tcp_congestion_control->owner);
2487 
2488 	for_each_possible_cpu(cpu)
2489 		inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2490 	free_percpu(net->ipv4.tcp_sk);
2491 }
2492 
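/* Per-netns setup: create one control socket per possible CPU (used to
 * send RSTs and ACKs that are not associated with any socket) and seed
 * the namespace's TCP sysctl defaults.
 */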
2493 static int __net_init tcp_sk_init(struct net *net)
2494 {
2495 	int res, cpu, cnt;
2496 
2497 	net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2498 	if (!net->ipv4.tcp_sk)
2499 		return -ENOMEM;
2500 
2501 	for_each_possible_cpu(cpu) {
2502 		struct sock *sk;
2503 
2504 		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2505 					   IPPROTO_TCP, net);
2506 		if (res)
2507 			goto fail;
2508 		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2509 		*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2510 	}
2511 
2512 	net->ipv4.sysctl_tcp_ecn = 2;
2513 	net->ipv4.sysctl_tcp_ecn_fallback = 1;
2514 
2515 	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2516 	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2517 	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2518 
2519 	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2520 	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2521 	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2522 
2523 	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2524 	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2525 	net->ipv4.sysctl_tcp_syncookies = 1;
2526 	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2527 	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2528 	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2529 	net->ipv4.sysctl_tcp_orphan_retries = 0;
2530 	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2531 	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
2532 	net->ipv4.sysctl_tcp_tw_reuse = 0;
2533 
2534 	cnt = tcp_hashinfo.ehash_mask + 1;
2535 	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = (cnt + 1) / 2;
2536 	net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
2537 
2538 	net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 256);
2539 	net->ipv4.sysctl_tcp_sack = 1;
2540 	net->ipv4.sysctl_tcp_window_scaling = 1;
2541 	net->ipv4.sysctl_tcp_timestamps = 1;
2542 	net->ipv4.sysctl_tcp_early_retrans = 3;
2543 	net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
2544 	net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
2545 	net->ipv4.sysctl_tcp_retrans_collapse = 1;
2546 	net->ipv4.sysctl_tcp_max_reordering = 300;
2547 	net->ipv4.sysctl_tcp_dsack = 1;
2548 	net->ipv4.sysctl_tcp_app_win = 31;
2549 	net->ipv4.sysctl_tcp_adv_win_scale = 1;
2550 	net->ipv4.sysctl_tcp_frto = 2;
2551 	net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
2552 	/* This limits the percentage of the congestion window which we
2553 	 * will allow a single TSO frame to consume.  Building TSO frames
2554 	 * which are too large can cause TCP streams to be bursty.
2555 	 */
2556 	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
2557 	/* Default TSQ limit of four TSO segments */
2558 	net->ipv4.sysctl_tcp_limit_output_bytes = 262144;
2559 	/* rfc5961 challenge ack rate limiting */
2560 	net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
2561 	net->ipv4.sysctl_tcp_min_tso_segs = 2;
2562 	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
2563 	net->ipv4.sysctl_tcp_autocorking = 1;
2564 	net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
2565 	net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
2566 	net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
2567 	if (net != &init_net) {
2568 		memcpy(net->ipv4.sysctl_tcp_rmem,
2569 		       init_net.ipv4.sysctl_tcp_rmem,
2570 		       sizeof(init_net.ipv4.sysctl_tcp_rmem));
2571 		memcpy(net->ipv4.sysctl_tcp_wmem,
2572 		       init_net.ipv4.sysctl_tcp_wmem,
2573 		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
2574 	}
2575 	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
2576 	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
2577 	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
2578 	spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
2579 	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
2580 	atomic_set(&net->ipv4.tfo_active_disable_times, 0);
2581 
2582 	/* Reno is always built in */
2583 	if (!net_eq(net, &init_net) &&
2584 	    try_module_get(init_net.ipv4.tcp_congestion_control->owner))
2585 		net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
2586 	else
2587 		net->ipv4.tcp_congestion_control = &tcp_reno;
2588 
2589 	return 0;
2590 fail:
2591 	tcp_sk_exit(net);
2592 
2593 	return res;
2594 }
2595 
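/* Batched netns teardown: flush remaining IPv4 TIME_WAIT sockets once for
 * the whole batch, then release each namespace's TCP Fast Open context.
 */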
2596 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2597 {
2598 	struct net *net;
2599 
2600 	inet_twsk_purge(&tcp_hashinfo, AF_INET);
2601 
2602 	list_for_each_entry(net, net_exit_list, exit_list)
2603 		tcp_fastopen_ctx_destroy(net);
2604 }
2605 
2606 static struct pernet_operations __net_initdata tcp_sk_ops = {
2607        .init	   = tcp_sk_init,
2608        .exit	   = tcp_sk_exit,
2609        .exit_batch = tcp_sk_exit_batch,
2610 };
2611 
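/* Called once at boot from inet_init(); failing to set up per-netns TCP
 * state this early is unrecoverable, hence the panic().
 */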
2612 void __init tcp_v4_init(void)
2613 {
2614 	if (register_pernet_subsys(&tcp_sk_ops))
2615 		panic("Failed to create the TCP control socket.\n");
2616 }
2617