xref: /openbmc/linux/net/ipv4/tcp_ipv4.c (revision afc74ce7)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		Implementation of the Transmission Control Protocol(TCP).
8  *
9  *		IPv4 specific functions
10  *
11  *		code split from:
12  *		linux/ipv4/tcp.c
13  *		linux/ipv4/tcp_input.c
14  *		linux/ipv4/tcp_output.c
15  *
16  *		See tcp.c for author information
17  */
18 
19 /*
20  * Changes:
21  *		David S. Miller	:	New socket lookup architecture.
22  *					This code is dedicated to John Dyson.
23  *		David S. Miller :	Change semantics of established hash,
24  *					half is devoted to TIME_WAIT sockets
25  *					and the rest go in the other half.
26  *		Andi Kleen :		Add support for syncookies and fixed
27  *					some bugs: ip options weren't passed to
28  *					the TCP layer, missed a check for an
29  *					ACK bit.
30  *		Andi Kleen :		Implemented fast path mtu discovery.
31  *	     				Fixed many serious bugs in the
32  *					request_sock handling and moved
33  *					most of it into the af independent code.
34  *					Added tail drop and some other bugfixes.
35  *					Added new listen semantics.
36  *		Mike McLagan	:	Routing by source
37  *	Juan Jose Ciarlante:		ip_dynaddr bits
38  *		Andi Kleen:		various fixes.
39  *	Vitaly E. Lavrov	:	Transparent proxy revived after year
40  *					coma.
41  *	Andi Kleen		:	Fix new listen.
42  *	Andi Kleen		:	Fix accept error reporting.
43  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
44  *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
45  *					a single port at the same time.
46  */
47 
48 #define pr_fmt(fmt) "TCP: " fmt
49 
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
60 
61 #include <net/net_namespace.h>
62 #include <net/icmp.h>
63 #include <net/inet_hashtables.h>
64 #include <net/tcp.h>
65 #include <net/transp_v6.h>
66 #include <net/ipv6.h>
67 #include <net/inet_common.h>
68 #include <net/timewait_sock.h>
69 #include <net/xfrm.h>
70 #include <net/secure_seq.h>
71 #include <net/busy_poll.h>
72 
73 #include <linux/inet.h>
74 #include <linux/ipv6.h>
75 #include <linux/stddef.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
78 #include <linux/inetdevice.h>
79 #include <linux/btf_ids.h>
80 
81 #include <crypto/hash.h>
82 #include <linux/scatterlist.h>
83 
84 #include <trace/events/tcp.h>
85 
#ifdef CONFIG_TCP_MD5SIG
/* Forward declaration: tcp_v4_send_reset() and tcp_v4_send_ack() below call
 * this to write the MD5 signature into the option area of a reply header.
 * The definition is not part of this section of the file.
 */
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
#endif
90 
/* Socket lookup state for TCP.  The code below hands it to
 * __inet_lookup_established() and __inet_lookup_listener(); it is also
 * exported to modules.
 */
struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);
93 
94 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
95 {
96 	return secure_tcp_seq(ip_hdr(skb)->daddr,
97 			      ip_hdr(skb)->saddr,
98 			      tcp_hdr(skb)->dest,
99 			      tcp_hdr(skb)->source);
100 }
101 
102 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
103 {
104 	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
105 }
106 
107 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
108 {
109 	const struct inet_timewait_sock *tw = inet_twsk(sktw);
110 	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
111 	struct tcp_sock *tp = tcp_sk(sk);
112 	int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;
113 
114 	if (reuse == 2) {
115 		/* Still does not detect *everything* that goes through
116 		 * lo, since we require a loopback src or dst address
117 		 * or direct binding to 'lo' interface.
118 		 */
119 		bool loopback = false;
120 		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
121 			loopback = true;
122 #if IS_ENABLED(CONFIG_IPV6)
123 		if (tw->tw_family == AF_INET6) {
124 			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
125 			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
126 			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
127 			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
128 				loopback = true;
129 		} else
130 #endif
131 		{
132 			if (ipv4_is_loopback(tw->tw_daddr) ||
133 			    ipv4_is_loopback(tw->tw_rcv_saddr))
134 				loopback = true;
135 		}
136 		if (!loopback)
137 			reuse = 0;
138 	}
139 
140 	/* With PAWS, it is safe from the viewpoint
141 	   of data integrity. Even without PAWS it is safe provided sequence
142 	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.
143 
144 	   Actually, the idea is close to VJ's one, only timestamp cache is
145 	   held not per host, but per port pair and TW bucket is used as state
146 	   holder.
147 
148 	   If TW bucket has been already destroyed we fall back to VJ's scheme
149 	   and use initial timestamp retrieved from peer table.
150 	 */
151 	if (tcptw->tw_ts_recent_stamp &&
152 	    (!twp || (reuse && time_after32(ktime_get_seconds(),
153 					    tcptw->tw_ts_recent_stamp)))) {
154 		/* In case of repair and re-using TIME-WAIT sockets we still
155 		 * want to be sure that it is safe as above but honor the
156 		 * sequence numbers and time stamps set as part of the repair
157 		 * process.
158 		 *
159 		 * Without this check re-using a TIME-WAIT socket with TCP
160 		 * repair would accumulate a -1 on the repair assigned
161 		 * sequence number. The first time it is reused the sequence
162 		 * is -1, the second time -2, etc. This fixes that issue
163 		 * without appearing to create any others.
164 		 */
165 		if (likely(!tp->repair)) {
166 			u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
167 
168 			if (!seq)
169 				seq = 1;
170 			WRITE_ONCE(tp->write_seq, seq);
171 			tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
172 			tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
173 		}
174 		sock_hold(sktw);
175 		return 1;
176 	}
177 
178 	return 0;
179 }
180 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
181 
182 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
183 			      int addr_len)
184 {
185 	/* This check is replicated from tcp_v4_connect() and intended to
186 	 * prevent BPF program called below from accessing bytes that are out
187 	 * of the bound specified by user in addr_len.
188 	 */
189 	if (addr_len < sizeof(struct sockaddr_in))
190 		return -EINVAL;
191 
192 	sock_owned_by_me(sk);
193 
194 	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
195 }
196 
/*
 * Initiate an outgoing IPv4 TCP connection on @sk.
 *
 * @sk:       socket to connect.  The IP options are read with
 *            lockdep_sock_is_held(), so the caller is expected to hold
 *            the socket lock.
 * @uaddr:    destination address, interpreted as a struct sockaddr_in.
 * @addr_len: length of @uaddr as supplied by the caller.
 *
 * Returns 0 on success or a negative errno:
 *   -EINVAL        @addr_len too short, or source routing is configured
 *                  and the destination address is zero;
 *   -EAFNOSUPPORT  @uaddr is not AF_INET;
 *   -ENETUNREACH   the route is multicast or broadcast;
 * or any error propagated from routing, inet_hash_connect(),
 * tcp_fastopen_defer_connect() or tcp_connect().
 *
 * Once the socket has been moved to SYN-SENT, failures go through the
 * "failure" label, which puts the socket back to TCP_CLOSE and clears the
 * destination port and route capabilities.
 */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	__be16 orig_sport, orig_dport;
	__be32 daddr, nexthop;
	struct flowi4 *fl4;
	struct rtable *rt;
	int err;
	struct ip_options_rcu *inet_opt;
	struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	inet_opt = rcu_dereference_protected(inet->inet_opt,
					     lockdep_sock_is_held(sk));
	/* With a source route option the first hop comes from the option,
	 * not from the user supplied destination.
	 */
	if (inet_opt && inet_opt->opt.srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet_opt->opt.faddr;
	}

	/* Remember the ports used for this first lookup; they are handed to
	 * ip_route_newports() once the final source port is known.
	 */
	orig_sport = inet->inet_sport;
	orig_dport = usin->sin_port;
	fl4 = &inet->cork.fl.u.ip4;
	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
			      IPPROTO_TCP,
			      orig_sport, orig_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		if (err == -ENETUNREACH)
			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
		return err;
	}

	/* TCP cannot connect to multicast or broadcast destinations. */
	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	/* Without source routing, use the destination the route lookup
	 * settled on.
	 */
	if (!inet_opt || !inet_opt->opt.srr)
		daddr = fl4->daddr;

	/* Adopt the source address chosen by routing if none was set yet. */
	if (!inet->inet_saddr)
		inet->inet_saddr = fl4->saddr;
	sk_rcv_saddr_set(sk, inet->inet_saddr);

	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent	   = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		if (likely(!tp->repair))
			WRITE_ONCE(tp->write_seq, 0);
	}

	inet->inet_dport = usin->sin_port;
	sk_daddr_set(sk, daddr);

	/* Account for the length of the IP options, if any. */
	inet_csk(sk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;

	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and not releasing socket
	 * lock select source port, enter ourselves into the hash tables and
	 * complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(tcp_death_row, sk);
	if (err)
		goto failure;

	sk_set_txhash(sk);

	/* Redo the route with the ports that are now final. */
	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
			       inet->inet_sport, inet->inet_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		/* Do not hand an error pointer to ip_rt_put() below. */
		rt = NULL;
		goto failure;
	}
	/* OK, now commit destination to socket.  */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->dst);
	/* The route was handed to sk_setup_caps(); clear the local pointer so
	 * the failure path does not ip_rt_put() it.
	 */
	rt = NULL;

	if (likely(!tp->repair)) {
		/* Keep a write_seq that is already non-zero (for example one
		 * installed by tcp_twsk_unique()); otherwise generate one.
		 */
		if (!tp->write_seq)
			WRITE_ONCE(tp->write_seq,
				   secure_tcp_seq(inet->inet_saddr,
						  inet->inet_daddr,
						  inet->inet_sport,
						  usin->sin_port));
		tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
						 inet->inet_saddr,
						 inet->inet_daddr);
	}

	inet->inet_id = prandom_u32();

	/* When Fast Open defers the connect, return its result directly
	 * without going through the failure path.
	 */
	if (tcp_fastopen_defer_connect(sk, &err))
		return err;
	if (err)
		goto failure;

	err = tcp_connect(sk);

	if (err)
		goto failure;

	return 0;

failure:
	/*
	 * This unhashes the socket and releases the local port,
	 * if necessary.
	 */
	tcp_set_state(sk, TCP_CLOSE);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->inet_dport = 0;
	return err;
}
EXPORT_SYMBOL(tcp_v4_connect);
331 
332 /*
333  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
334  * It can be called through tcp_release_cb() if socket was owned by user
335  * at the time tcp_v4_err() was called to handle ICMP message.
336  */
337 void tcp_v4_mtu_reduced(struct sock *sk)
338 {
339 	struct inet_sock *inet = inet_sk(sk);
340 	struct dst_entry *dst;
341 	u32 mtu;
342 
343 	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
344 		return;
345 	mtu = tcp_sk(sk)->mtu_info;
346 	dst = inet_csk_update_pmtu(sk, mtu);
347 	if (!dst)
348 		return;
349 
350 	/* Something is about to be wrong... Remember soft error
351 	 * for the case, if this connection will not able to recover.
352 	 */
353 	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
354 		sk->sk_err_soft = EMSGSIZE;
355 
356 	mtu = dst_mtu(dst);
357 
358 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
359 	    ip_sk_accept_pmtu(sk) &&
360 	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
361 		tcp_sync_mss(sk, mtu);
362 
363 		/* Resend the TCP packet because it's
364 		 * clear that the old packet has been
365 		 * dropped. This is the new "fast" path mtu
366 		 * discovery.
367 		 */
368 		tcp_simple_retransmit(sk);
369 	} /* else let the usual retransmit timer handle it */
370 }
371 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
372 
373 static void do_redirect(struct sk_buff *skb, struct sock *sk)
374 {
375 	struct dst_entry *dst = __sk_dst_check(sk, 0);
376 
377 	if (dst)
378 		dst->ops->redirect(dst, sk, skb);
379 }
380 
381 
382 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
383 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
384 {
385 	struct request_sock *req = inet_reqsk(sk);
386 	struct net *net = sock_net(sk);
387 
388 	/* ICMPs are not backlogged, hence we cannot get
389 	 * an established socket here.
390 	 */
391 	if (seq != tcp_rsk(req)->snt_isn) {
392 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
393 	} else if (abort) {
394 		/*
395 		 * Still in SYN_RECV, just remove it silently.
396 		 * There is no good way to pass the error to the newly
397 		 * created socket, and POSIX does not want network
398 		 * errors returned from accept().
399 		 */
400 		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
401 		tcp_listendrop(req->rsk_listener);
402 	}
403 	reqsk_put(req);
404 }
405 EXPORT_SYMBOL(tcp_req_err);
406 
407 /* TCP-LD (RFC 6069) logic */
408 void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
409 {
410 	struct inet_connection_sock *icsk = inet_csk(sk);
411 	struct tcp_sock *tp = tcp_sk(sk);
412 	struct sk_buff *skb;
413 	s32 remaining;
414 	u32 delta_us;
415 
416 	if (sock_owned_by_user(sk))
417 		return;
418 
419 	if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
420 	    !icsk->icsk_backoff)
421 		return;
422 
423 	skb = tcp_rtx_queue_head(sk);
424 	if (WARN_ON_ONCE(!skb))
425 		return;
426 
427 	icsk->icsk_backoff--;
428 	icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
429 	icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
430 
431 	tcp_mstamp_refresh(tp);
432 	delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
433 	remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
434 
435 	if (remaining > 0) {
436 		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
437 					  remaining, TCP_RTO_MAX);
438 	} else {
439 		/* RTO revert clocked out retransmission.
440 		 * Will retransmit now.
441 		 */
442 		tcp_retransmit_timer(sk);
443 	}
444 }
445 EXPORT_SYMBOL(tcp_ld_RTO_revert);
446 
/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 *
 * @skb:  the ICMP error; skb->data is read as an IPv4 header followed by
 *        a TCP header, and those addresses and ports are used to look up
 *        the socket.
 * @info: value supplied by the caller; stored as the new MTU for
 *        ICMP_FRAG_NEEDED and passed on to ip_icmp_error().
 *
 * Returns -ENOENT when no socket matches, 0 in every other case.
 */

int tcp_v4_err(struct sk_buff *skb, u32 info)
{
	const struct iphdr *iph = (const struct iphdr *)skb->data;
	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
	struct tcp_sock *tp;
	struct inet_sock *inet;
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	struct sock *sk;
	struct request_sock *fastopen;
	u32 seq, snd_una;
	int err;
	struct net *net = dev_net(skb->dev);

	/* The headers at skb->data are looked up with their destination as
	 * the remote end and their source as the local end.
	 */
	sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
				       th->dest, iph->saddr, ntohs(th->source),
				       inet_iif(skb), 0);
	if (!sk) {
		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
		return -ENOENT;
	}
	/* TIME-WAIT sockets ignore ICMP errors; just drop the reference. */
	if (sk->sk_state == TCP_TIME_WAIT) {
		inet_twsk_put(inet_twsk(sk));
		return 0;
	}
	seq = ntohl(th->seq);
	/* Request sockets are handled by tcp_req_err(), which also releases
	 * the reference.  The last argument says whether this type/code
	 * should abort the request.
	 */
	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
				     type == ICMP_TIME_EXCEEDED ||
				     (type == ICMP_DEST_UNREACH &&
				      (code == ICMP_NET_UNREACH ||
				       code == ICMP_HOST_UNREACH)));
		return 0;
	}

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 * We do take care of PMTU discovery (RFC1191) special case :
	 * we can receive locally generated ICMP messages while socket is held.
	 */
	if (sock_owned_by_user(sk)) {
		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
	}
	if (sk->sk_state == TCP_CLOSE)
		goto out;

	/* Drop errors whose quoted header has a TTL below the socket's
	 * configured minimum.
	 */
	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
		goto out;
	}

	tp = tcp_sk(sk);
	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
	fastopen = rcu_dereference(tp->fastopen_rsk);
	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
	/* The quoted sequence number must fall inside the window of data
	 * that is currently in flight, except for listening sockets.
	 */
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, snd_una, tp->snd_nxt)) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	/* Map the ICMP type/code to an errno in err, or finish early for the
	 * types that are handled completely inside this switch.
	 */
	switch (type) {
	case ICMP_REDIRECT:
		if (!sock_owned_by_user(sk))
			do_redirect(skb, sk);
		goto out;
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			/* We are not interested in TCP_LISTEN and open_requests
			 * (SYN-ACKs send out by Linux are always <576bytes so
			 * they should go through unfragmented).
			 */
			if (sk->sk_state == TCP_LISTEN)
				goto out;

			/* NOTE(review): this store happens whether or not the
			 * socket is owned by user, while tcp_v4_mtu_reduced()
			 * reads the field later on the deferred path.
			 */
			tp->mtu_info = info;
			if (!sock_owned_by_user(sk)) {
				tcp_v4_mtu_reduced(sk);
			} else {
				/* Defer the work; the extra reference is taken
				 * only when the flag was not already set.
				 */
				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
					sock_hold(sk);
			}
			goto out;
		}

		err = icmp_err_convert[code].errno;
		/* check if this ICMP message allows revert of backoff.
		 * (see RFC 6069)
		 */
		if (!fastopen &&
		    (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
			tcp_ld_RTO_revert(sk, seq);
		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->sk_state) {
	case TCP_SYN_SENT:
	case TCP_SYN_RECV:
		/* Only in fast or simultaneous open. If a fast open socket is
		 * already accepted it is treated as a connected one below.
		 */
		if (fastopen && !fastopen->sk)
			break;

		ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);

		if (!sock_owned_by_user(sk)) {
			/* Hard error: report it and tear the socket down. */
			sk->sk_err = err;

			sk->sk_error_report(sk);

			tcp_done(sk);
		} else {
			sk->sk_err_soft = err;
		}
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows to consider as hard errors
	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note, that in modern internet, where routing is unreliable
	 * and in each dark corner broken firewalls sit, sending random
	 * errors ordered by their masters even this two messages finally lose
	 * their original sense (even Linux sends invalid PORT_UNREACHs)
	 *
	 * Now we are in compliance with RFCs.
	 *							--ANK (980905)
	 */

	inet = inet_sk(sk);
	if (!sock_owned_by_user(sk) && inet->recverr) {
		sk->sk_err = err;
		sk->sk_error_report(sk);
	} else	{ /* Only an error on timeout */
		sk->sk_err_soft = err;
	}

out:
	/* Every path that took bh_lock_sock() ends here. */
	bh_unlock_sock(sk);
	sock_put(sk);
	return 0;
}
626 
627 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
628 {
629 	struct tcphdr *th = tcp_hdr(skb);
630 
631 	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
632 	skb->csum_start = skb_transport_header(skb) - skb->head;
633 	skb->csum_offset = offsetof(struct tcphdr, check);
634 }
635 
636 /* This routine computes an IPv4 TCP checksum. */
637 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
638 {
639 	const struct inet_sock *inet = inet_sk(sk);
640 
641 	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
642 }
643 EXPORT_SYMBOL(tcp_v4_send_check);
644 
/*
 *	This routine will send an RST to the other tcp.
 *
 *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 *		      for reset.
 *	Answer: if a packet caused RST, it is not for a socket
 *		existing in our system, if it is matched to a socket,
 *		it is just duplicate segment or bug in other side's TCP.
 *		So that we build reply only basing on parameters
 *		arrived with segment.
 *	Exception: precedence violation. We do not implement it in any case.
 *
 *	@sk:  socket the segment was matched to, or NULL.  It may be a
 *	      non-full socket, so sk_fullsock() is checked before fields of
 *	      a full socket are used.
 *	@skb: the received segment that triggers the reset.
 *
 *	With CONFIG_TCP_MD5SIG the reply is signed when a key is found, and
 *	no reset is sent when the segment carries an MD5 option that cannot
 *	be verified.
 */

static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
{
	const struct tcphdr *th = tcp_hdr(skb);
	/* Reply header, with room for an MD5 signature option if enabled. */
	struct {
		struct tcphdr th;
#ifdef CONFIG_TCP_MD5SIG
		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
#endif
	} rep;
	struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key = NULL;
	const __u8 *hash_location = NULL;
	unsigned char newhash[16];
	int genhash;
	struct sock *sk1 = NULL;
#endif
	u64 transmit_time = 0;
	struct sock *ctl_sk;
	struct net *net;

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	/* If sk not NULL, it means we did a successful lookup and incoming
	 * route had to be correct. prequeue might have dropped our dst.
	 */
	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rep, 0, sizeof(rep));
	rep.th.dest   = th->source;
	rep.th.source = th->dest;
	rep.th.doff   = sizeof(struct tcphdr) / 4;
	rep.th.rst    = 1;

	/* If the segment carried an ACK, the reset takes its sequence number
	 * from that ACK.  Otherwise the reset acknowledges everything the
	 * segment occupied: payload length plus one for each of SYN and FIN.
	 */
	if (th->ack) {
		rep.th.seq = th->ack_seq;
	} else {
		rep.th.ack = 1;
		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				       skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof(arg));
	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);

	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
#ifdef CONFIG_TCP_MD5SIG
	/* Held until the "out" label at the end of the function. */
	rcu_read_lock();
	hash_location = tcp_parse_md5sig_option(th);
	if (sk && sk_fullsock(sk)) {
		const union tcp_md5_addr *addr;
		int l3index;

		/* sdif set, means packet ingressed via a device
		 * in an L3 domain and inet_iif is set to it.
		 */
		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
	} else if (hash_location) {
		const union tcp_md5_addr *addr;
		int sdif = tcp_v4_sdif(skb);
		int dif = inet_iif(skb);
		int l3index;

		/*
		 * active side is lost. Try to find listening socket through
		 * source port, and then find md5 key through listening socket.
		 * we are not loose security here:
		 * Incoming packet is checked with md5 hash with finding key,
		 * no RST generated if md5 hash doesn't match.
		 */
		sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
					     ip_hdr(skb)->saddr,
					     th->source, ip_hdr(skb)->daddr,
					     ntohs(th->source), dif, sdif);
		/* don't send rst if it can't find key */
		if (!sk1)
			goto out;

		/* sdif set, means packet ingressed via a device
		 * in an L3 domain and dif is set to it.
		 */
		l3index = sdif ? dif : 0;
		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
		if (!key)
			goto out;


		/* Verify the signature of the incoming segment before
		 * answering it.
		 */
		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
		if (genhash || memcmp(hash_location, newhash, 16) != 0)
			goto out;

	}

	if (key) {
		/* Option word: NOP, NOP, MD5SIG kind, MD5SIG length. */
		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
				   (TCPOPT_NOP << 16) |
				   (TCPOPT_MD5SIG << 8) |
				   TCPOLEN_MD5SIG);
		/* Update length and the length the header thinks exists */
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
				     key, ip_hdr(skb)->saddr,
				     ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	/* Pseudo-header checksum for the reply, with the addresses of the
	 * received segment swapped.
	 */
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;

	/* When socket is gone, all binding information is lost.
	 * routing might fail in this case. No choice here, if we choose to force
	 * input interface, we will misroute in case of asymmetric route.
	 */
	if (sk) {
		arg.bound_dev_if = sk->sk_bound_dev_if;
		if (sk_fullsock(sk))
			trace_tcp_send_reset(sk, skb);
	}

	/* sk may be a timewait socket; reading sk_bound_dev_if above relies
	 * on both structures keeping that field at the same offset.
	 */
	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));

	arg.tos = ip_hdr(skb)->tos;
	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
	local_bh_disable();
	/* Per-cpu control socket used to transmit the reply. */
	ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
	if (sk) {
		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
				   inet_twsk(sk)->tw_mark : sk->sk_mark;
		ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
				   inet_twsk(sk)->tw_priority : sk->sk_priority;
		transmit_time = tcp_transmit_time(sk);
	}
	ip_send_unicast_reply(ctl_sk,
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len,
			      transmit_time);

	/* NOTE(review): sk_mark is cleared here but sk_priority is not, so
	 * a later reset sent with sk == NULL reuses the priority left by
	 * this call - confirm whether that is intended.
	 */
	ctl_sk->sk_mark = 0;
	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
	local_bh_enable();

#ifdef CONFIG_TCP_MD5SIG
out:
	rcu_read_unlock();
#endif
}
819 
/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
 * outside socket context is ugly, certainly. What can I do?
 *
 * Build and send a bare ACK in reply to @skb.
 *
 * @sk:          socket the reply is sent for; may be a timewait or
 *               request-owning socket (sk_fullsock() is checked before the
 *               uid is taken from it).
 * @skb:         received segment being answered; ports and addresses are
 *               taken from it and swapped.
 * @seq, @ack:   sequence and acknowledgment numbers, host byte order.
 * @win:         value for the window field, host byte order.
 * @tsval:       timestamp value for the timestamp option.
 * @tsecr:       timestamp echo; the timestamp option is added only when
 *               this is non-zero.
 * @oif:         when non-zero, output interface to bind the reply to.
 * @key:         MD5 key to sign the reply with, or NULL; only used with
 *               CONFIG_TCP_MD5SIG.
 * @reply_flags: flags copied to the ip_reply_arg.
 * @tos:         TOS value for the reply.
 */

static void tcp_v4_send_ack(const struct sock *sk,
			    struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 tsval, u32 tsecr, int oif,
			    struct tcp_md5sig_key *key,
			    int reply_flags, u8 tos)
{
	const struct tcphdr *th = tcp_hdr(skb);
	/* Reply header plus room for the timestamp option and, if enabled,
	 * the MD5 signature option.
	 */
	struct {
		struct tcphdr th;
		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
#ifdef CONFIG_TCP_MD5SIG
			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
#endif
			];
	} rep;
	struct net *net = sock_net(sk);
	struct ip_reply_arg arg;
	struct sock *ctl_sk;
	u64 transmit_time;

	/* Only the header is cleared; the option words are written below
	 * before they are included in iov_len.
	 */
	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof(arg));

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);
	if (tsecr) {
		/* Option words: NOP, NOP, TIMESTAMP kind/length, then the
		 * value and the echo.
		 */
		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				   (TCPOPT_TIMESTAMP << 8) |
				   TCPOLEN_TIMESTAMP);
		rep.opt[1] = htonl(tsval);
		rep.opt[2] = htonl(tsecr);
		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
	}

	/* Swap the send and the receive. */
	rep.th.dest    = th->source;
	rep.th.source  = th->dest;
	rep.th.doff    = arg.iov[0].iov_len / 4;
	rep.th.seq     = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack     = 1;
	rep.th.window  = htons(win);

#ifdef CONFIG_TCP_MD5SIG
	if (key) {
		/* The MD5 option follows the three timestamp words when the
		 * timestamp option is present.
		 */
		int offset = (tsecr) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
					  (TCPOPT_NOP << 16) |
					  (TCPOPT_MD5SIG << 8) |
					  TCPOLEN_MD5SIG);
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len/4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.flags = reply_flags;
	/* Pseudo-header checksum for the reply, with the addresses of the
	 * received segment swapped.
	 */
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	if (oif)
		arg.bound_dev_if = oif;
	arg.tos = tos;
	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
	local_bh_disable();
	/* Per-cpu control socket used to transmit the reply; it borrows the
	 * mark and priority of @sk for this send.
	 */
	ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
	ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
			   inet_twsk(sk)->tw_mark : sk->sk_mark;
	ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
			   inet_twsk(sk)->tw_priority : sk->sk_priority;
	transmit_time = tcp_transmit_time(sk);
	ip_send_unicast_reply(ctl_sk,
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len,
			      transmit_time);

	ctl_sk->sk_mark = 0;
	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	local_bh_enable();
}
909 
910 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
911 {
912 	struct inet_timewait_sock *tw = inet_twsk(sk);
913 	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
914 
915 	tcp_v4_send_ack(sk, skb,
916 			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
917 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
918 			tcp_time_stamp_raw() + tcptw->tw_ts_offset,
919 			tcptw->tw_ts_recent,
920 			tw->tw_bound_dev_if,
921 			tcp_twsk_md5_key(tcptw),
922 			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
923 			tw->tw_tos
924 			);
925 
926 	inet_twsk_put(tw);
927 }
928 
929 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
930 				  struct request_sock *req)
931 {
932 	const union tcp_md5_addr *addr;
933 	int l3index;
934 
935 	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
936 	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
937 	 */
938 	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
939 					     tcp_sk(sk)->snd_nxt;
940 
941 	/* RFC 7323 2.3
942 	 * The window field (SEG.WND) of every outgoing segment, with the
943 	 * exception of <SYN> segments, MUST be right-shifted by
944 	 * Rcv.Wind.Shift bits:
945 	 */
946 	addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
947 	l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
948 	tcp_v4_send_ack(sk, skb, seq,
949 			tcp_rsk(req)->rcv_nxt,
950 			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
951 			tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
952 			req->ts_recent,
953 			0,
954 			tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
955 			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
956 			ip_hdr(skb)->tos);
957 }
958 
959 /*
960  *	Send a SYN-ACK after having received a SYN.
961  *	This still operates on a request_sock only, not on a big
962  *	socket.
 *
 *	@dst may be NULL, in which case a route is looked up with
 *	inet_csk_route_req() into a local flowi4; @fl is not used here.
 *	@foc, @synack_type and @syn_skb are passed through unchanged to
 *	tcp_make_synack().
 *
 *	Returns -1 when no route is found or no SYN-ACK skb could be built,
 *	otherwise net_xmit_eval() of the ip_build_and_send_pkt() result.
963  */
964 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
965 			      struct flowi *fl,
966 			      struct request_sock *req,
967 			      struct tcp_fastopen_cookie *foc,
968 			      enum tcp_synack_type synack_type,
969 			      struct sk_buff *syn_skb)
970 {
971 	const struct inet_request_sock *ireq = inet_rsk(req);
972 	struct flowi4 fl4;
973 	int err = -1;
974 	struct sk_buff *skb;
975 	u8 tos;
976 
977 	/* First, grab a route. */
978 	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
979 		return -1;
980 
981 	skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
982 
	/* Reflect the ToS recorded from the SYN when the sysctl is set,
	 * otherwise use the listener's own ToS.
	 */
983 	tos = sock_net(sk)->ipv4.sysctl_tcp_reflect_tos ?
984 			tcp_rsk(req)->syn_tos : inet_sk(sk)->tos;
985 
986 	if (skb) {
987 		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
988 
		/* ireq_opt is read under RCU; the ECN bits are masked out
		 * of the ToS handed to the IP layer.
		 */
989 		rcu_read_lock();
990 		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
991 					    ireq->ir_rmt_addr,
992 					    rcu_dereference(ireq->ireq_opt),
993 					    tos & ~INET_ECN_MASK);
994 		rcu_read_unlock();
995 		err = net_xmit_eval(err);
996 	}
997 
998 	return err;
999 }
1000 
1001 /*
1002  *	IPv4 request_sock destructor.
 *	Frees the IP options attached to the request (ireq_opt).
 *	The pointer is fetched with rcu_dereference_protected(..., 1),
 *	i.e. without any lockdep condition.
1003  */
1004 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1005 {
1006 	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1007 }
1008 
1009 #ifdef CONFIG_TCP_MD5SIG
1010 /*
1011  * RFC2385 MD5 checksumming requires a mapping of
1012  * IP address->MD5 Key.
1013  * We need to maintain these in the sk structure.
1014  */
1015 
/* Static key, initially false, exported to other modules.
 * NOTE(review): presumably flipped on once MD5 keys are in use so that
 * lookups can be skipped otherwise - confirm at the enable site, which is
 * not in this part of the file.
 */
1016 DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
1017 EXPORT_SYMBOL(tcp_md5_needed);
1018 
1019 /* Find the Key structure for an address.
 *
 * Walks the socket's MD5 key list and returns the key of the given @family
 * whose prefix covers @addr with the longest prefix length, or NULL when the
 * socket has no MD5 info or nothing matches.  A key whose l3index is 0
 * matches any @l3index; a key with a non-zero l3index only matches the same
 * @l3index.  On equal prefix lengths the key met first in the list is kept.
 */
1020 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1021 					   const union tcp_md5_addr *addr,
1022 					   int family)
1023 {
1024 	const struct tcp_sock *tp = tcp_sk(sk);
1025 	struct tcp_md5sig_key *key;
1026 	const struct tcp_md5sig_info *md5sig;
1027 	__be32 mask;
1028 	struct tcp_md5sig_key *best_match = NULL;
1029 	bool match;
1030 
1031 	/* caller either holds rcu_read_lock() or socket lock */
1032 	md5sig = rcu_dereference_check(tp->md5sig_info,
1033 				       lockdep_sock_is_held(sk));
1034 	if (!md5sig)
1035 		return NULL;
1036 
1037 	hlist_for_each_entry_rcu(key, &md5sig->head, node,
1038 				 lockdep_sock_is_held(sk)) {
1039 		if (key->family != family)
1040 			continue;
		/* l3index 0 on the key acts as a wildcard */
1041 		if (key->l3index && key->l3index != l3index)
1042 			continue;
1043 		if (family == AF_INET) {
			/* compare only the bits covered by the key's prefix */
1044 			mask = inet_make_mask(key->prefixlen);
1045 			match = (key->addr.a4.s_addr & mask) ==
1046 				(addr->a4.s_addr & mask);
1047 #if IS_ENABLED(CONFIG_IPV6)
1048 		} else if (family == AF_INET6) {
1049 			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1050 						  key->prefixlen);
1051 #endif
1052 		} else {
1053 			match = false;
1054 		}
1055 
		/* strictly longer prefix wins */
1056 		if (match && (!best_match ||
1057 			      key->prefixlen > best_match->prefixlen))
1058 			best_match = key;
1059 	}
1060 	return best_match;
1061 }
1062 EXPORT_SYMBOL(__tcp_md5_do_lookup);
1063 
/*
 * Find the key on @sk whose family, address bytes, prefix length and L3
 * index are all equal to the arguments.  Returns NULL when the socket has
 * no MD5 info or no such key exists.
 *
 * Unlike __tcp_md5_do_lookup(), a key with l3index 0 is NOT treated as a
 * wildcard here: this helper backs tcp_md5_do_add()/tcp_md5_do_del(), and a
 * request for an L3-domain-bound key must never update or delete the
 * unbound key for the same address.
 */
1064 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1065 						      const union tcp_md5_addr *addr,
1066 						      int family, u8 prefixlen,
1067 						      int l3index)
1068 {
1069 	const struct tcp_sock *tp = tcp_sk(sk);
1070 	struct tcp_md5sig_key *key;
1071 	unsigned int size = sizeof(struct in_addr);
1072 	const struct tcp_md5sig_info *md5sig;
1073 
1074 	/* caller either holds rcu_read_lock() or socket lock */
1075 	md5sig = rcu_dereference_check(tp->md5sig_info,
1076 				       lockdep_sock_is_held(sk));
1077 	if (!md5sig)
1078 		return NULL;
1079 #if IS_ENABLED(CONFIG_IPV6)
	/* number of address bytes to compare depends on the family */
1080 	if (family == AF_INET6)
1081 		size = sizeof(struct in6_addr);
1082 #endif
1083 	hlist_for_each_entry_rcu(key, &md5sig->head, node,
1084 				 lockdep_sock_is_held(sk)) {
1085 		if (key->family != family)
1086 			continue;
		/* exact match: the L3 index must be identical, 0 included */
1087 		if (key->l3index != l3index)
1088 			continue;
1089 		if (!memcmp(&key->addr, addr, size) &&
1090 		    key->prefixlen == prefixlen)
1091 			return key;
1092 	}
1093 	return NULL;
1094 }
1095 
/*
 * Look up the MD5 key that @sk holds for the peer of @addr_sk.
 * The peer address is addr_sk->sk_daddr and the L3 index is obtained from
 * addr_sk->sk_bound_dev_if through l3mdev_master_ifindex_by_index().
 * Returns whatever tcp_md5_do_lookup() returns for AF_INET.
 */
1096 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1097 					 const struct sock *addr_sk)
1098 {
1099 	const union tcp_md5_addr *addr;
1100 	int l3index;
1101 
1102 	l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1103 						 addr_sk->sk_bound_dev_if);
1104 	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1105 	return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1106 }
1107 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1108 
1109 /* This can be called on a newly created socket, from other files
 *
 * Adds an MD5 key for (@addr, @family, @prefixlen, @l3index) to @sk, or,
 * when tcp_md5_do_lookup_exact() already finds an entry, overwrites that
 * entry's key bytes and length in place.
 *
 * Returns 0 on success and -ENOMEM when an allocation (md5sig info, key,
 * or the md5sig pool) fails.  @newkeylen bytes are copied from @newkey
 * with no bound check in this function; the length must have been
 * validated by the caller.
 */
1110 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1111 		   int family, u8 prefixlen, int l3index,
1112 		   const u8 *newkey, u8 newkeylen, gfp_t gfp)
1113 {
1114 	/* Add Key to the list */
1115 	struct tcp_md5sig_key *key;
1116 	struct tcp_sock *tp = tcp_sk(sk);
1117 	struct tcp_md5sig_info *md5sig;
1118 
1119 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
1120 	if (key) {
1121 		/* Pre-existing entry - just update that one.
1122 		 * Note that the key might be used concurrently.
1123 		 * data_race() is telling kcsan that we do not care of
1124 		 * key mismatches, since changing MD5 key on live flows
1125 		 * can lead to packet drops.
1126 		 */
1127 		data_race(memcpy(key->key, newkey, newkeylen));
1128 
1129 		/* Pairs with READ_ONCE() in tcp_md5_hash_key().
1130 		 * Also note that a reader could catch new key->keylen value
1131 		 * but old key->key[], this is the reason we use __GFP_ZERO
1132 		 * at sock_kmalloc() time below these lines.
1133 		 */
1134 		WRITE_ONCE(key->keylen, newkeylen);
1135 
1136 		return 0;
1137 	}
1138 
1139 	md5sig = rcu_dereference_protected(tp->md5sig_info,
1140 					   lockdep_sock_is_held(sk));
1141 	if (!md5sig) {
		/* first key on this socket: create the list head */
1142 		md5sig = kmalloc(sizeof(*md5sig), gfp);
1143 		if (!md5sig)
1144 			return -ENOMEM;
1145 
1146 		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1147 		INIT_HLIST_HEAD(&md5sig->head);
1148 		rcu_assign_pointer(tp->md5sig_info, md5sig);
1149 	}
1150 
	/* On the failure paths below a freshly created md5sig stays
	 * attached to the socket; only the key allocation is undone.
	 */
1151 	key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1152 	if (!key)
1153 		return -ENOMEM;
1154 	if (!tcp_alloc_md5sig_pool()) {
1155 		sock_kfree_s(sk, key, sizeof(*key));
1156 		return -ENOMEM;
1157 	}
1158 
	/* fill the key completely before publishing it on the RCU list */
1159 	memcpy(key->key, newkey, newkeylen);
1160 	key->keylen = newkeylen;
1161 	key->family = family;
1162 	key->prefixlen = prefixlen;
1163 	key->l3index = l3index;
1164 	memcpy(&key->addr, addr,
1165 	       (family == AF_INET6) ? sizeof(struct in6_addr) :
1166 				      sizeof(struct in_addr));
1167 	hlist_add_head_rcu(&key->node, &md5sig->head);
1168 	return 0;
1169 }
1170 EXPORT_SYMBOL(tcp_md5_do_add);
1171 
/*
 * Remove the key that tcp_md5_do_lookup_exact() finds for the given
 * address/family/prefix/L3 index.  Returns 0 on success, -ENOENT when no
 * such key exists.  The key is unlinked with hlist_del_rcu(), its size is
 * subtracted from sk->sk_omem_alloc and it is freed through kfree_rcu().
 */
1172 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1173 		   u8 prefixlen, int l3index)
1174 {
1175 	struct tcp_md5sig_key *key;
1176 
1177 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
1178 	if (!key)
1179 		return -ENOENT;
1180 	hlist_del_rcu(&key->node);
1181 	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1182 	kfree_rcu(key, rcu);
1183 	return 0;
1184 }
1185 EXPORT_SYMBOL(tcp_md5_do_del);
1186 
/*
 * Unlink and free every MD5 key on @sk, adjusting sk->sk_omem_alloc for
 * each one.  The md5sig info structure itself is not freed here.
 * NOTE(review): md5sig is dereferenced without a NULL check, so callers are
 * presumably expected to test tp->md5sig_info first - confirm at call sites.
 */
1187 static void tcp_clear_md5_list(struct sock *sk)
1188 {
1189 	struct tcp_sock *tp = tcp_sk(sk);
1190 	struct tcp_md5sig_key *key;
1191 	struct hlist_node *n;
1192 	struct tcp_md5sig_info *md5sig;
1193 
1194 	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1195 
	/* _safe iteration: entries are removed while walking */
1196 	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1197 		hlist_del_rcu(&key->node);
1198 		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1199 		kfree_rcu(key, rcu);
1200 	}
1201 }
1202 
/*
 * Parse a struct tcp_md5sig passed from user space and add or delete the
 * corresponding IPv4 MD5 key on @sk.
 *
 * @optname selects the variant: only for TCP_MD5SIG_EXT are the prefix
 * length (TCP_MD5SIG_FLAG_PREFIX) and interface index
 * (TCP_MD5SIG_FLAG_IFINDEX) fields honoured.  A zero key length deletes the
 * key, otherwise the key is added or updated.
 *
 * Returns -EINVAL for a short option, a non-AF_INET address, a prefix
 * longer than 32, an ifindex that is not an L3 master device, or a key
 * longer than TCP_MD5SIG_MAXKEYLEN; -EFAULT when the copy fails; otherwise
 * the result of tcp_md5_do_del()/tcp_md5_do_add().
 */
1203 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1204 				 sockptr_t optval, int optlen)
1205 {
1206 	struct tcp_md5sig cmd;
1207 	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1208 	const union tcp_md5_addr *addr;
1209 	u8 prefixlen = 32;
1210 	int l3index = 0;
1211 
1212 	if (optlen < sizeof(cmd))
1213 		return -EINVAL;
1214 
1215 	if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1216 		return -EFAULT;
1217 
1218 	if (sin->sin_family != AF_INET)
1219 		return -EINVAL;
1220 
	/* default is a host key (/32) unless an explicit prefix is given */
1221 	if (optname == TCP_MD5SIG_EXT &&
1222 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1223 		prefixlen = cmd.tcpm_prefixlen;
1224 		if (prefixlen > 32)
1225 			return -EINVAL;
1226 	}
1227 
1228 	if (optname == TCP_MD5SIG_EXT &&
1229 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1230 		struct net_device *dev;
1231 
1232 		rcu_read_lock();
1233 		dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1234 		if (dev && netif_is_l3_master(dev))
1235 			l3index = dev->ifindex;
1236 
1237 		rcu_read_unlock();
1238 
1239 		/* ok to reference set/not set outside of rcu;
1240 		 * right now device MUST be an L3 master
1241 		 */
1242 		if (!dev || !l3index)
1243 			return -EINVAL;
1244 	}
1245 
1246 	addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1247 
	/* zero key length means "delete this key" */
1248 	if (!cmd.tcpm_keylen)
1249 		return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index);
1250 
1251 	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1252 		return -EINVAL;
1253 
1254 	return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index,
1255 			      cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
1256 }
1257 
/*
 * Feed the IPv4 pseudo-header and the fixed TCP header into the running
 * hash of @hp.
 *
 * Both are assembled back to back in hp->scratch: a tcp4_pseudohdr built
 * from @saddr, @daddr and @nbytes (stored as the big-endian length field),
 * followed by a copy of sizeof(*th) bytes of @th with the checksum field
 * zeroed.  Returns the result of crypto_ahash_update().
 */
1258 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1259 				   __be32 daddr, __be32 saddr,
1260 				   const struct tcphdr *th, int nbytes)
1261 {
1262 	struct tcp4_pseudohdr *bp;
1263 	struct scatterlist sg;
1264 	struct tcphdr *_th;
1265 
1266 	bp = hp->scratch;
1267 	bp->saddr = saddr;
1268 	bp->daddr = daddr;
1269 	bp->pad = 0;
1270 	bp->protocol = IPPROTO_TCP;
1271 	bp->len = cpu_to_be16(nbytes);
1272 
	/* the TCP header copy lives right after the pseudo-header */
1273 	_th = (struct tcphdr *)(bp + 1);
1274 	memcpy(_th, th, sizeof(*th));
1275 	_th->check = 0;
1276 
1277 	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1278 	ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1279 				sizeof(*bp) + sizeof(*th));
1280 	return crypto_ahash_update(hp->md5_req);
1281 }
1282 
/*
 * Compute the MD5 signature over pseudo-header, TCP header and @key for a
 * segment described only by its header @th (the pseudo-header length is
 * th->doff << 2).
 *
 * Returns 0 with the digest written to @md5_hash, or 1 on any failure, in
 * which case the first 16 bytes of @md5_hash are zeroed.
 */
1283 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1284 			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
1285 {
1286 	struct tcp_md5sig_pool *hp;
1287 	struct ahash_request *req;
1288 
1289 	hp = tcp_get_md5sig_pool();
1290 	if (!hp)
1291 		goto clear_hash_noput;
1292 	req = hp->md5_req;
1293 
1294 	if (crypto_ahash_init(req))
1295 		goto clear_hash;
1296 	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1297 		goto clear_hash;
1298 	if (tcp_md5_hash_key(hp, key))
1299 		goto clear_hash;
1300 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1301 	if (crypto_ahash_final(req))
1302 		goto clear_hash;
1303 
1304 	tcp_put_md5sig_pool();
1305 	return 0;
1306 
	/* clear_hash releases the pool; clear_hash_noput is used when the
	 * pool was never obtained.
	 */
1307 clear_hash:
1308 	tcp_put_md5sig_pool();
1309 clear_hash_noput:
1310 	memset(md5_hash, 0, 16);
1311 	return 1;
1312 }
1313 
/*
 * Compute the MD5 signature of the TCP segment in @skb using @key.
 *
 * The hash covers the pseudo-header (length skb->len), the fixed TCP
 * header, the skb data starting at offset th->doff << 2, and the key.
 * Addresses come from @sk when it is non-NULL, otherwise from the IP
 * header of @skb.
 *
 * Returns 0 with the digest in @md5_hash, or 1 on failure with the first
 * 16 bytes of @md5_hash zeroed.
 */
1314 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1315 			const struct sock *sk,
1316 			const struct sk_buff *skb)
1317 {
1318 	struct tcp_md5sig_pool *hp;
1319 	struct ahash_request *req;
1320 	const struct tcphdr *th = tcp_hdr(skb);
1321 	__be32 saddr, daddr;
1322 
1323 	if (sk) { /* valid for establish/request sockets */
1324 		saddr = sk->sk_rcv_saddr;
1325 		daddr = sk->sk_daddr;
1326 	} else {
1327 		const struct iphdr *iph = ip_hdr(skb);
1328 		saddr = iph->saddr;
1329 		daddr = iph->daddr;
1330 	}
1331 
1332 	hp = tcp_get_md5sig_pool();
1333 	if (!hp)
1334 		goto clear_hash_noput;
1335 	req = hp->md5_req;
1336 
1337 	if (crypto_ahash_init(req))
1338 		goto clear_hash;
1339 
1340 	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1341 		goto clear_hash;
1342 	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1343 		goto clear_hash;
1344 	if (tcp_md5_hash_key(hp, key))
1345 		goto clear_hash;
1346 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1347 	if (crypto_ahash_final(req))
1348 		goto clear_hash;
1349 
1350 	tcp_put_md5sig_pool();
1351 	return 0;
1352 
	/* clear_hash releases the pool; clear_hash_noput is used when the
	 * pool was never obtained.
	 */
1353 clear_hash:
1354 	tcp_put_md5sig_pool();
1355 clear_hash_noput:
1356 	memset(md5_hash, 0, 16);
1357 	return 1;
1358 }
1359 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1360 
1361 #endif
1362 
1363 /* Called with rcu_read_lock()
 *
 * Validate the TCP MD5 signature option of an incoming segment against
 * the key @sk holds for the segment's source address.
 * Returns true when the segment must be dropped, false when it may be
 * processed.  Without CONFIG_TCP_MD5SIG this always returns false.
 */
1364 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1365 				    const struct sk_buff *skb,
1366 				    int dif, int sdif)
1367 {
1368 #ifdef CONFIG_TCP_MD5SIG
1369 	/*
1370 	 * This gets called for each TCP segment that arrives
1371 	 * so we want to be efficient.
1372 	 * We have 3 drop cases:
1373 	 * o No MD5 hash and one expected.
1374 	 * o MD5 hash and we're not expecting one.
1375 	 * o MD5 hash and it's wrong.
1376 	 */
1377 	const __u8 *hash_location = NULL;
1378 	struct tcp_md5sig_key *hash_expected;
1379 	const struct iphdr *iph = ip_hdr(skb);
1380 	const struct tcphdr *th = tcp_hdr(skb);
1381 	const union tcp_md5_addr *addr;
1382 	unsigned char newhash[16];
1383 	int genhash, l3index;
1384 
1385 	/* sdif set, means packet ingressed via a device
1386 	 * in an L3 domain and dif is set to the l3mdev
1387 	 */
1388 	l3index = sdif ? dif : 0;
1389 
1390 	addr = (union tcp_md5_addr *)&iph->saddr;
1391 	hash_expected = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1392 	hash_location = tcp_parse_md5sig_option(th);
1393 
1394 	/* We've parsed the options - do we have a hash? */
1395 	if (!hash_expected && !hash_location)
1396 		return false;
1397 
1398 	if (hash_expected && !hash_location) {
1399 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1400 		return true;
1401 	}
1402 
1403 	if (!hash_expected && hash_location) {
1404 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1405 		return true;
1406 	}
1407 
1408 	/* Okay, so this is hash_expected and hash_location -
1409 	 * so we need to calculate the checksum.
1410 	 */
	/* sk is passed as NULL so the addresses are taken from the
	 * packet's IP header.
	 */
1411 	genhash = tcp_v4_md5_hash_skb(newhash,
1412 				      hash_expected,
1413 				      NULL, skb);
1414 
1415 	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1416 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1417 		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s L3 index %d\n",
1418 				     &iph->saddr, ntohs(th->source),
1419 				     &iph->daddr, ntohs(th->dest),
1420 				     genhash ? " tcp_v4_calc_md5_hash failed"
1421 				     : "", l3index);
1422 		return true;
1423 	}
1424 	return false;
1425 #endif
1426 	return false;
1427 }
1428 
/*
 * Initialise the IPv4-specific part of a new request sock from the
 * incoming @skb: the request's local address is the packet's destination,
 * its remote address is the packet's source, and the IP options returned
 * by tcp_v4_save_options() are stored in ireq_opt.
 */
1429 static void tcp_v4_init_req(struct request_sock *req,
1430 			    const struct sock *sk_listener,
1431 			    struct sk_buff *skb)
1432 {
1433 	struct inet_request_sock *ireq = inet_rsk(req);
1434 	struct net *net = sock_net(sk_listener);
1435 
1436 	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1437 	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1438 	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1439 }
1440 
/*
 * route_req callback for IPv4: forwards to inet_csk_route_req() using the
 * IPv4 member of @fl and returns its result.
 */
1441 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1442 					  struct flowi *fl,
1443 					  const struct request_sock *req)
1444 {
1445 	return inet_csk_route_req(sk, &fl->u.ip4, req);
1446 }
1447 
/*
 * Generic request_sock operations for TCP over IPv4.  Objects are sized
 * as struct tcp_request_sock.
 */
1448 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1449 	.family		=	PF_INET,
1450 	.obj_size	=	sizeof(struct tcp_request_sock),
1451 	.rtx_syn_ack	=	tcp_rtx_synack,
1452 	.send_ack	=	tcp_v4_reqsk_send_ack,
1453 	.destructor	=	tcp_v4_reqsk_destructor,
1454 	.send_reset	=	tcp_v4_send_reset,
1455 	.syn_ack_timeout =	tcp_syn_ack_timeout,
1456 };
1457 
/*
 * Address-family specific request sock callbacks for IPv4.  The MD5 and
 * syncookie hooks are only present when the corresponding config options
 * are enabled.
 */
1458 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1459 	.mss_clamp	=	TCP_MSS_DEFAULT,
1460 #ifdef CONFIG_TCP_MD5SIG
1461 	.req_md5_lookup	=	tcp_v4_md5_lookup,
1462 	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1463 #endif
1464 	.init_req	=	tcp_v4_init_req,
1465 #ifdef CONFIG_SYN_COOKIES
1466 	.cookie_init_seq =	cookie_v4_init_sequence,
1467 #endif
1468 	.route_req	=	tcp_v4_route_req,
1469 	.init_seq	=	tcp_v4_init_seq,
1470 	.init_ts_off	=	tcp_v4_init_ts_off,
1471 	.send_synack	=	tcp_v4_send_synack,
1472 };
1473 
/*
 * Handle an incoming connection request on listener @sk.
 * A request whose route is flagged broadcast or multicast is counted as a
 * listen drop and 0 is returned; anything else is handed to
 * tcp_conn_request() with the IPv4 operation tables and its result is
 * returned.
 */
1474 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1475 {
1476 	/* Never answer to SYNs sent to broadcast or multicast */
1477 	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1478 		goto drop;
1479 
1480 	return tcp_conn_request(&tcp_request_sock_ops,
1481 				&tcp_request_sock_ipv4_ops, sk, skb);
1482 
1483 drop:
1484 	tcp_listendrop(sk);
1485 	return 0;
1486 }
1487 EXPORT_SYMBOL(tcp_v4_conn_request);
1488 
1489 
1490 /*
1491  * The three way handshake has completed - we got a valid synack -
1492  * now create the new socket.
 *
 * Creates the child socket for @req on listener @sk, fills in its IPv4
 * state from the request and @skb, routes it when @dst is NULL, copies a
 * matching MD5 key from the listener, and inserts it into the established
 * hash.  *@own_req receives the result of inet_ehash_nolisten().
 *
 * Returns the new socket, or NULL after calling tcp_listendrop(@sk) when
 * the accept queue is full, the child cannot be allocated, routing fails
 * or the port cannot be inherited.
1493  */
1494 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1495 				  struct request_sock *req,
1496 				  struct dst_entry *dst,
1497 				  struct request_sock *req_unhash,
1498 				  bool *own_req)
1499 {
1500 	struct inet_request_sock *ireq;
1501 	struct inet_sock *newinet;
1502 	struct tcp_sock *newtp;
1503 	struct sock *newsk;
1504 #ifdef CONFIG_TCP_MD5SIG
1505 	const union tcp_md5_addr *addr;
1506 	struct tcp_md5sig_key *key;
1507 	int l3index;
1508 #endif
1509 	struct ip_options_rcu *inet_opt;
1510 
1511 	if (sk_acceptq_is_full(sk))
1512 		goto exit_overflow;
1513 
1514 	newsk = tcp_create_openreq_child(sk, req, skb);
1515 	if (!newsk)
1516 		goto exit_nonewsk;
1517 
1518 	newsk->sk_gso_type = SKB_GSO_TCPV4;
1519 	inet_sk_rx_dst_set(newsk, skb);
1520 
1521 	newtp		      = tcp_sk(newsk);
1522 	newinet		      = inet_sk(newsk);
1523 	ireq		      = inet_rsk(req);
1524 	sk_daddr_set(newsk, ireq->ir_rmt_addr);
1525 	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1526 	newsk->sk_bound_dev_if = ireq->ir_iif;
1527 	newinet->inet_saddr   = ireq->ir_loc_addr;
	/* the child initially shares the request's saved IP options */
1528 	inet_opt	      = rcu_dereference(ireq->ireq_opt);
1529 	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1530 	newinet->mc_index     = inet_iif(skb);
1531 	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1532 	newinet->rcv_tos      = ip_hdr(skb)->tos;
1533 	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1534 	if (inet_opt)
1535 		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1536 	newinet->inet_id = prandom_u32();
1537 
1538 	/* Set ToS of the new socket based upon the value of incoming SYN. */
1539 	if (sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)
1540 		newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1541 
1542 	if (!dst) {
1543 		dst = inet_csk_route_child_sock(sk, newsk, req);
1544 		if (!dst)
1545 			goto put_and_exit;
1546 	} else {
1547 		/* syncookie case : see end of cookie_v4_check() */
1548 	}
1549 	sk_setup_caps(newsk, dst);
1550 
1551 	tcp_ca_openreq_child(newsk, dst);
1552 
1553 	tcp_sync_mss(newsk, dst_mtu(dst));
1554 	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1555 
1556 	tcp_initialize_rcv_mss(newsk);
1557 
1558 #ifdef CONFIG_TCP_MD5SIG
1559 	l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1560 	/* Copy over the MD5 key from the original socket */
1561 	addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1562 	key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1563 	if (key) {
1564 		/*
1565 		 * We're using one, so create a matching key
1566 		 * on the newsk structure. If we fail to get
1567 		 * memory, then we end up not copying the key
1568 		 * across. Shucks.
1569 		 */
		/* the child always gets a /32 key for its single peer */
1570 		tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index,
1571 			       key->key, key->keylen, GFP_ATOMIC);
1572 		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1573 	}
1574 #endif
1575 
1576 	if (__inet_inherit_port(sk, newsk) < 0)
1577 		goto put_and_exit;
1578 	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
	/* Exactly one of request and child keeps the options pointer:
	 * the child when it owns the request, the request otherwise.
	 */
1579 	if (likely(*own_req)) {
1580 		tcp_move_syn(newtp, req);
1581 		ireq->ireq_opt = NULL;
1582 	} else {
1583 		newinet->inet_opt = NULL;
1584 	}
1585 	return newsk;
1586 
1587 exit_overflow:
1588 	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1589 exit_nonewsk:
1590 	dst_release(dst);
1591 exit:
1592 	tcp_listendrop(sk);
1593 	return NULL;
1594 put_and_exit:
	/* options stay with the request; detach them before closing newsk */
1595 	newinet->inet_opt = NULL;
1596 	inet_csk_prepare_forced_close(newsk);
1597 	tcp_done(newsk);
1598 	goto exit;
1599 }
1600 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1601 
/*
 * With CONFIG_SYN_COOKIES, a segment without the SYN flag is passed to
 * cookie_v4_check() and its result is returned; otherwise @sk is returned
 * unchanged.
 */
1602 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1603 {
1604 #ifdef CONFIG_SYN_COOKIES
1605 	const struct tcphdr *th = tcp_hdr(skb);
1606 
1607 	if (!th->syn)
1608 		sk = cookie_v4_check(sk, skb);
1609 #endif
1610 	return sk;
1611 }
1612 
/*
 * Generate a syncookie for the SYN described by @iph/@th on listener @sk.
 * Returns the MSS obtained from tcp_get_syncookie_mss(); when it is
 * non-zero, *@cookie is set to the cookie sequence number and
 * tcp_synq_overflow() is called on @sk.  Returns 0 and leaves *@cookie
 * untouched when that MSS is 0 or CONFIG_SYN_COOKIES is not set.
 */
1613 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1614 			 struct tcphdr *th, u32 *cookie)
1615 {
1616 	u16 mss = 0;
1617 #ifdef CONFIG_SYN_COOKIES
1618 	mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1619 				    &tcp_request_sock_ipv4_ops, sk, th);
1620 	if (mss) {
1621 		*cookie = __cookie_v4_init_sequence(iph, th, &mss);
1622 		tcp_synq_overflow(sk);
1623 	}
1624 #endif
1625 	return mss;
1626 }
1627 
1628 /* The socket must have its spinlock held when we get
1629  * here, unless it is a TCP_LISTEN socket.
1630  *
1631  * We have a potential double-lock case here, so even when
1632  * doing backlog processing we use the BH locking scheme.
1633  * This is because we cannot sleep with the original spinlock
1634  * held.
 *
 * Processes one segment for @sk according to the socket state.  Always
 * returns 0; on the reset, discard and checksum-error paths the skb is
 * freed here.
1635  */
1636 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1637 {
1638 	struct sock *rsk;
1639 
1640 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1641 		struct dst_entry *dst = sk->sk_rx_dst;
1642 
1643 		sock_rps_save_rxhash(sk, skb);
1644 		sk_mark_napi_id(sk, skb);
1645 		if (dst) {
			/* drop the cached input route when the ingress
			 * interface changed or the route is no longer valid
			 */
1646 			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1647 			    !dst->ops->check(dst, 0)) {
1648 				dst_release(dst);
1649 				sk->sk_rx_dst = NULL;
1650 			}
1651 		}
1652 		tcp_rcv_established(sk, skb);
1653 		return 0;
1654 	}
1655 
1656 	if (tcp_checksum_complete(skb))
1657 		goto csum_err;
1658 
1659 	if (sk->sk_state == TCP_LISTEN) {
1660 		struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1661 
1662 		if (!nsk)
1663 			goto discard;
		/* a different socket came back: hand the segment to it */
1664 		if (nsk != sk) {
1665 			if (tcp_child_process(sk, nsk, skb)) {
1666 				rsk = nsk;
1667 				goto reset;
1668 			}
1669 			return 0;
1670 		}
1671 	} else
1672 		sock_rps_save_rxhash(sk, skb);
1673 
1674 	if (tcp_rcv_state_process(sk, skb)) {
1675 		rsk = sk;
1676 		goto reset;
1677 	}
1678 	return 0;
1679 
1680 reset:
1681 	tcp_v4_send_reset(rsk, skb);
1682 discard:
1683 	kfree_skb(skb);
1684 	/* Be careful here. If this function gets more complicated and
1685 	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1686 	 * might be destroyed here. This current version compiles correctly,
1687 	 * but you have been warned.
1688 	 */
1689 	return 0;
1690 
1691 csum_err:
1692 	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1693 	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1694 	goto discard;
1695 }
1696 EXPORT_SYMBOL(tcp_v4_do_rcv);
1697 
/*
 * Early demultiplexing for incoming TCP packets: look up an established
 * socket matching the packet and, when found, attach it to the skb
 * (skb->sk, destructor sock_edemux).  For a full socket whose cached
 * input route is still valid and was learned on the same ingress
 * interface, that route is attached to the skb without taking a
 * reference.  Always returns 0.
 */
1698 int tcp_v4_early_demux(struct sk_buff *skb)
1699 {
1700 	const struct iphdr *iph;
1701 	const struct tcphdr *th;
1702 	struct sock *sk;
1703 
1704 	if (skb->pkt_type != PACKET_HOST)
1705 		return 0;
1706 
1707 	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1708 		return 0;
1709 
1710 	iph = ip_hdr(skb);
1711 	th = tcp_hdr(skb);
1712 
	/* data offset smaller than the fixed header: ignore */
1713 	if (th->doff < sizeof(struct tcphdr) / 4)
1714 		return 0;
1715 
1716 	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1717 				       iph->saddr, th->source,
1718 				       iph->daddr, ntohs(th->dest),
1719 				       skb->skb_iif, inet_sdif(skb));
1720 	if (sk) {
1721 		skb->sk = sk;
1722 		skb->destructor = sock_edemux;
1723 		if (sk_fullsock(sk)) {
1724 			struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1725 
1726 			if (dst)
1727 				dst = dst_check(dst, 0);
1728 			if (dst &&
1729 			    inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1730 				skb_dst_set_noref(skb, dst);
1731 		}
1732 	}
1733 	return 0;
1734 }
1735 
/*
 * Queue @skb on the backlog of @sk, first trying to coalesce it with the
 * last skb already in the backlog.
 *
 * Returns false when the skb was queued or coalesced.  Returns true when
 * it was not queued (bad checksum or backlog limit exceeded); in that case
 * the socket has already been unlocked with bh_unlock_sock() and the
 * caller still owns the skb.
 *
 * The backlog limit is sk_rcvbuf + sk_sndbuf + 64KB.  It is computed in
 * 64 bits and clamped to UINT_MAX: both buffer sizes are signed ints in
 * struct sock, so adding them directly could overflow for very large
 * settings and yield a bogus (possibly tiny) limit.
 */
1736 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1737 {
1738 	u64 limit = (u64)(u32)READ_ONCE(sk->sk_rcvbuf) + (u32)READ_ONCE(sk->sk_sndbuf);
1739 	struct skb_shared_info *shinfo;
1740 	const struct tcphdr *th;
1741 	struct tcphdr *thtail;
1742 	struct sk_buff *tail;
1743 	unsigned int hdrlen;
1744 	bool fragstolen;
1745 	u32 gso_segs;
1746 	int delta;
1747 
1748 	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1749 	 * we can fix skb->truesize to its real value to avoid future drops.
1750 	 * This is valid because skb is not yet charged to the socket.
1751 	 * It has been noticed pure SACK packets were sometimes dropped
1752 	 * (if cooked by drivers without copybreak feature).
1753 	 */
1754 	skb_condense(skb);
1755 
1756 	skb_dst_drop(skb);
1757 
1758 	if (unlikely(tcp_checksum_complete(skb))) {
1759 		bh_unlock_sock(sk);
1760 		__TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1761 		__TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1762 		return true;
1763 	}
1764 
1765 	/* Attempt coalescing to last skb in backlog, even if we are
1766 	 * above the limits.
1767 	 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1768 	 */
1769 	th = (const struct tcphdr *)skb->data;
1770 	hdrlen = th->doff * 4;
1771 	shinfo = skb_shinfo(skb);
1772 
1773 	if (!shinfo->gso_size)
1774 		shinfo->gso_size = skb->len - hdrlen;
1775 
1776 	if (!shinfo->gso_segs)
1777 		shinfo->gso_segs = 1;
1778 
1779 	tail = sk->sk_backlog.tail;
1780 	if (!tail)
1781 		goto no_coalesce;
1782 	thtail = (struct tcphdr *)tail->data;
1783 
	/* Coalesce only contiguous segments with the same DS field, both
	 * carrying ACK, neither carrying SYN/RST/URG, with equal ECE/CWR
	 * bits and identical TCP options.
	 */
1784 	if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1785 	    TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1786 	    ((TCP_SKB_CB(tail)->tcp_flags |
1787 	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1788 	    !((TCP_SKB_CB(tail)->tcp_flags &
1789 	      TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1790 	    ((TCP_SKB_CB(tail)->tcp_flags ^
1791 	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1792 #ifdef CONFIG_TLS_DEVICE
1793 	    tail->decrypted != skb->decrypted ||
1794 #endif
1795 	    thtail->doff != th->doff ||
1796 	    memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1797 		goto no_coalesce;
1798 
	/* strip the TCP header for the merge; restored below on failure */
1799 	__skb_pull(skb, hdrlen);
1800 	if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1801 		TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1802 
1803 		if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
1804 			TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1805 			thtail->window = th->window;
1806 		}
1807 
1808 		/* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1809 		 * thtail->fin, so that the fast path in tcp_rcv_established()
1810 		 * is not entered if we append a packet with a FIN.
1811 		 * SYN, RST, URG are not present.
1812 		 * ACK is set on both packets.
1813 		 * PSH : we do not really care in TCP stack,
1814 		 *       at least for 'GRO' packets.
1815 		 */
1816 		thtail->fin |= th->fin;
1817 		TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1818 
1819 		if (TCP_SKB_CB(skb)->has_rxtstamp) {
1820 			TCP_SKB_CB(tail)->has_rxtstamp = true;
1821 			tail->tstamp = skb->tstamp;
1822 			skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1823 		}
1824 
1825 		/* Not as strict as GRO. We only need to carry mss max value */
1826 		skb_shinfo(tail)->gso_size = max(shinfo->gso_size,
1827 						 skb_shinfo(tail)->gso_size);
1828 
		/* saturate the merged segment count at 0xFFFF */
1829 		gso_segs = skb_shinfo(tail)->gso_segs + shinfo->gso_segs;
1830 		skb_shinfo(tail)->gso_segs = min_t(u32, gso_segs, 0xFFFF);
1831 
1832 		sk->sk_backlog.len += delta;
1833 		__NET_INC_STATS(sock_net(sk),
1834 				LINUX_MIB_TCPBACKLOGCOALESCE);
1835 		kfree_skb_partial(skb, fragstolen);
1836 		return false;
1837 	}
1838 	__skb_push(skb, hdrlen);
1839 
1840 no_coalesce:
1841 	/* Only socket owner can try to collapse/prune rx queues
1842 	 * to reduce memory overhead, so add a little headroom here.
1843 	 * Few sockets backlog are possibly concurrently non empty.
1844 	 */
1845 	limit = min_t(u64, limit + 64*1024, UINT_MAX);
1846 
1847 	if (unlikely(sk_add_backlog(sk, skb, limit))) {
1848 		bh_unlock_sock(sk);
1849 		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1850 		return true;
1851 	}
1852 	return false;
1853 }
1854 EXPORT_SYMBOL(tcp_add_backlog);
1855 
/*
 * Run the socket filter on @skb via sk_filter_trim_cap(), passing the TCP
 * header length (th->doff * 4) as the cap argument, and return its result.
 * skb->data is taken to point at the TCP header.
 */
1856 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1857 {
1858 	struct tcphdr *th = (struct tcphdr *)skb->data;
1859 
1860 	return sk_filter_trim_cap(sk, skb, th->doff * 4);
1861 }
1862 EXPORT_SYMBOL(tcp_filter);
1863 
/*
 * Undo the relocation done by tcp_v4_fill_cb(): copy the saved
 * inet_skb_parm from TCP_SKB_CB(skb)->header.h4 back to IPCB(skb).
 * memmove() is used, so the two locations may overlap.
 */
1864 static void tcp_v4_restore_cb(struct sk_buff *skb)
1865 {
1866 	memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1867 		sizeof(struct inet_skb_parm));
1868 }
1869 
/*
 * Populate the TCP control block of @skb from its IP header @iph and TCP
 * header @th.  The IP control block is first moved into
 * TCP_SKB_CB(skb)->header.h4; tcp_v4_restore_cb() reverses that move.
 */
1870 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1871 			   const struct tcphdr *th)
1872 {
1873 	/* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
1874 	 * barrier() makes sure compiler wont play fool^Waliasing games.
1875 	 */
1876 	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1877 		sizeof(struct inet_skb_parm));
1878 	barrier();
1879 
1880 	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
	/* SYN and FIN each count as one sequence number on top of the
	 * payload length (skb->len minus the TCP header length)
	 */
1881 	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1882 				    skb->len - th->doff * 4);
1883 	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1884 	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1885 	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1886 	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1887 	TCP_SKB_CB(skb)->sacked	 = 0;
	/* set when either a software or a hardware timestamp is present */
1888 	TCP_SKB_CB(skb)->has_rxtstamp =
1889 			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1890 }
1891 
1892 /*
1893  *	From tcp_input.c
1894  */
1895 
1896 int tcp_v4_rcv(struct sk_buff *skb)
1897 {
1898 	struct net *net = dev_net(skb->dev);
1899 	struct sk_buff *skb_to_free;
1900 	int sdif = inet_sdif(skb);
1901 	int dif = inet_iif(skb);
1902 	const struct iphdr *iph;
1903 	const struct tcphdr *th;
1904 	bool refcounted;
1905 	struct sock *sk;
1906 	int ret;
1907 
1908 	if (skb->pkt_type != PACKET_HOST)
1909 		goto discard_it;
1910 
1911 	/* Count it even if it's bad */
1912 	__TCP_INC_STATS(net, TCP_MIB_INSEGS);
1913 
1914 	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1915 		goto discard_it;
1916 
1917 	th = (const struct tcphdr *)skb->data;
1918 
1919 	if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1920 		goto bad_packet;
1921 	if (!pskb_may_pull(skb, th->doff * 4))
1922 		goto discard_it;
1923 
1924 	/* An explanation is required here, I think.
1925 	 * Packet length and doff are validated by header prediction,
1926 	 * provided case of th->doff==0 is eliminated.
1927 	 * So, we defer the checks. */
1928 
1929 	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1930 		goto csum_error;
1931 
1932 	th = (const struct tcphdr *)skb->data;
1933 	iph = ip_hdr(skb);
1934 lookup:
1935 	sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1936 			       th->dest, sdif, &refcounted);
1937 	if (!sk)
1938 		goto no_tcp_socket;
1939 
1940 process:
1941 	if (sk->sk_state == TCP_TIME_WAIT)
1942 		goto do_time_wait;
1943 
1944 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
1945 		struct request_sock *req = inet_reqsk(sk);
1946 		bool req_stolen = false;
1947 		struct sock *nsk;
1948 
1949 		sk = req->rsk_listener;
1950 		if (unlikely(tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))) {
1951 			sk_drops_add(sk, skb);
1952 			reqsk_put(req);
1953 			goto discard_it;
1954 		}
1955 		if (tcp_checksum_complete(skb)) {
1956 			reqsk_put(req);
1957 			goto csum_error;
1958 		}
1959 		if (unlikely(sk->sk_state != TCP_LISTEN)) {
1960 			inet_csk_reqsk_queue_drop_and_put(sk, req);
1961 			goto lookup;
1962 		}
1963 		/* We own a reference on the listener, increase it again
1964 		 * as we might lose it too soon.
1965 		 */
1966 		sock_hold(sk);
1967 		refcounted = true;
1968 		nsk = NULL;
1969 		if (!tcp_filter(sk, skb)) {
1970 			th = (const struct tcphdr *)skb->data;
1971 			iph = ip_hdr(skb);
1972 			tcp_v4_fill_cb(skb, iph, th);
1973 			nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
1974 		}
1975 		if (!nsk) {
1976 			reqsk_put(req);
1977 			if (req_stolen) {
1978 				/* Another cpu got exclusive access to req
1979 				 * and created a full blown socket.
1980 				 * Try to feed this packet to this socket
1981 				 * instead of discarding it.
1982 				 */
1983 				tcp_v4_restore_cb(skb);
1984 				sock_put(sk);
1985 				goto lookup;
1986 			}
1987 			goto discard_and_relse;
1988 		}
1989 		if (nsk == sk) {
1990 			reqsk_put(req);
1991 			tcp_v4_restore_cb(skb);
1992 		} else if (tcp_child_process(sk, nsk, skb)) {
1993 			tcp_v4_send_reset(nsk, skb);
1994 			goto discard_and_relse;
1995 		} else {
1996 			sock_put(sk);
1997 			return 0;
1998 		}
1999 	}
2000 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
2001 		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2002 		goto discard_and_relse;
2003 	}
2004 
2005 	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
2006 		goto discard_and_relse;
2007 
2008 	if (tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))
2009 		goto discard_and_relse;
2010 
2011 	nf_reset_ct(skb);
2012 
2013 	if (tcp_filter(sk, skb))
2014 		goto discard_and_relse;
2015 	th = (const struct tcphdr *)skb->data;
2016 	iph = ip_hdr(skb);
2017 	tcp_v4_fill_cb(skb, iph, th);
2018 
2019 	skb->dev = NULL;
2020 
2021 	if (sk->sk_state == TCP_LISTEN) {
2022 		ret = tcp_v4_do_rcv(sk, skb);
2023 		goto put_and_return;
2024 	}
2025 
2026 	sk_incoming_cpu_update(sk);
2027 
2028 	bh_lock_sock_nested(sk);
2029 	tcp_segs_in(tcp_sk(sk), skb);
2030 	ret = 0;
2031 	if (!sock_owned_by_user(sk)) {
2032 		skb_to_free = sk->sk_rx_skb_cache;
2033 		sk->sk_rx_skb_cache = NULL;
2034 		ret = tcp_v4_do_rcv(sk, skb);
2035 	} else {
2036 		if (tcp_add_backlog(sk, skb))
2037 			goto discard_and_relse;
2038 		skb_to_free = NULL;
2039 	}
2040 	bh_unlock_sock(sk);
2041 	if (skb_to_free)
2042 		__kfree_skb(skb_to_free);
2043 
2044 put_and_return:
2045 	if (refcounted)
2046 		sock_put(sk);
2047 
2048 	return ret;
2049 
2050 no_tcp_socket:
2051 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2052 		goto discard_it;
2053 
2054 	tcp_v4_fill_cb(skb, iph, th);
2055 
2056 	if (tcp_checksum_complete(skb)) {
2057 csum_error:
2058 		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2059 bad_packet:
2060 		__TCP_INC_STATS(net, TCP_MIB_INERRS);
2061 	} else {
2062 		tcp_v4_send_reset(NULL, skb);
2063 	}
2064 
2065 discard_it:
2066 	/* Discard frame. */
2067 	kfree_skb(skb);
2068 	return 0;
2069 
2070 discard_and_relse:
2071 	sk_drops_add(sk, skb);
2072 	if (refcounted)
2073 		sock_put(sk);
2074 	goto discard_it;
2075 
2076 do_time_wait:
2077 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2078 		inet_twsk_put(inet_twsk(sk));
2079 		goto discard_it;
2080 	}
2081 
2082 	tcp_v4_fill_cb(skb, iph, th);
2083 
2084 	if (tcp_checksum_complete(skb)) {
2085 		inet_twsk_put(inet_twsk(sk));
2086 		goto csum_error;
2087 	}
2088 	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2089 	case TCP_TW_SYN: {
2090 		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
2091 							&tcp_hashinfo, skb,
2092 							__tcp_hdrlen(th),
2093 							iph->saddr, th->source,
2094 							iph->daddr, th->dest,
2095 							inet_iif(skb),
2096 							sdif);
2097 		if (sk2) {
2098 			inet_twsk_deschedule_put(inet_twsk(sk));
2099 			sk = sk2;
2100 			tcp_v4_restore_cb(skb);
2101 			refcounted = false;
2102 			goto process;
2103 		}
2104 	}
2105 		/* to ACK */
2106 		fallthrough;
2107 	case TCP_TW_ACK:
2108 		tcp_v4_timewait_ack(sk, skb);
2109 		break;
2110 	case TCP_TW_RST:
2111 		tcp_v4_send_reset(sk, skb);
2112 		inet_twsk_deschedule_put(inet_twsk(sk));
2113 		goto discard_it;
2114 	case TCP_TW_SUCCESS:;
2115 	}
2116 	goto discard_it;
2117 }
2118 
2119 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2120 	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
2121 	.twsk_unique	= tcp_twsk_unique,
2122 	.twsk_destructor= tcp_twsk_destructor,
2123 };
2124 
2125 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2126 {
2127 	struct dst_entry *dst = skb_dst(skb);
2128 
2129 	if (dst && dst_hold_safe(dst)) {
2130 		sk->sk_rx_dst = dst;
2131 		inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
2132 	}
2133 }
2134 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2135 
2136 const struct inet_connection_sock_af_ops ipv4_specific = {
2137 	.queue_xmit	   = ip_queue_xmit,
2138 	.send_check	   = tcp_v4_send_check,
2139 	.rebuild_header	   = inet_sk_rebuild_header,
2140 	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
2141 	.conn_request	   = tcp_v4_conn_request,
2142 	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
2143 	.net_header_len	   = sizeof(struct iphdr),
2144 	.setsockopt	   = ip_setsockopt,
2145 	.getsockopt	   = ip_getsockopt,
2146 	.addr2sockaddr	   = inet_csk_addr2sockaddr,
2147 	.sockaddr_len	   = sizeof(struct sockaddr_in),
2148 	.mtu_reduced	   = tcp_v4_mtu_reduced,
2149 };
2150 EXPORT_SYMBOL(ipv4_specific);
2151 
#ifdef CONFIG_TCP_MD5SIG
/* IPv4 flavour of the TCP MD5 signature hooks; only built when
 * CONFIG_TCP_MD5SIG is enabled.
 */
static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
	.md5_parse		= tcp_v4_parse_md5_keys,
	.md5_lookup		= tcp_v4_md5_lookup,
	.calc_md5_hash		= tcp_v4_md5_hash_skb,
};
#endif
2159 
2160 /* NOTE: A lot of things set to zero explicitly by call to
2161  *       sk_alloc() so need not be done here.
2162  */
2163 static int tcp_v4_init_sock(struct sock *sk)
2164 {
2165 	struct inet_connection_sock *icsk = inet_csk(sk);
2166 
2167 	tcp_init_sock(sk);
2168 
2169 	icsk->icsk_af_ops = &ipv4_specific;
2170 
2171 #ifdef CONFIG_TCP_MD5SIG
2172 	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2173 #endif
2174 
2175 	return 0;
2176 }
2177 
2178 void tcp_v4_destroy_sock(struct sock *sk)
2179 {
2180 	struct tcp_sock *tp = tcp_sk(sk);
2181 
2182 	trace_tcp_destroy_sock(sk);
2183 
2184 	tcp_clear_xmit_timers(sk);
2185 
2186 	tcp_cleanup_congestion_control(sk);
2187 
2188 	tcp_cleanup_ulp(sk);
2189 
2190 	/* Cleanup up the write buffer. */
2191 	tcp_write_queue_purge(sk);
2192 
2193 	/* Check if we want to disable active TFO */
2194 	tcp_fastopen_active_disable_ofo_check(sk);
2195 
2196 	/* Cleans up our, hopefully empty, out_of_order_queue. */
2197 	skb_rbtree_purge(&tp->out_of_order_queue);
2198 
2199 #ifdef CONFIG_TCP_MD5SIG
2200 	/* Clean up the MD5 key list, if any */
2201 	if (tp->md5sig_info) {
2202 		tcp_clear_md5_list(sk);
2203 		kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2204 		tp->md5sig_info = NULL;
2205 	}
2206 #endif
2207 
2208 	/* Clean up a referenced TCP bind bucket. */
2209 	if (inet_csk(sk)->icsk_bind_hash)
2210 		inet_put_port(sk);
2211 
2212 	BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2213 
2214 	/* If socket is aborted during connect operation */
2215 	tcp_free_fastopen_req(tp);
2216 	tcp_fastopen_destroy_cipher(sk);
2217 	tcp_saved_syn_free(tp);
2218 
2219 	sk_sockets_allocated_dec(sk);
2220 }
2221 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2222 
2223 #ifdef CONFIG_PROC_FS
2224 /* Proc filesystem TCP sock list dumping. */
2225 
2226 /*
2227  * Get next listener socket follow cur.  If cur is NULL, get first socket
2228  * starting from bucket given in st->bucket; when st->bucket is zero the
2229  * very first socket in the hash table is returned.
2230  */
2231 static void *listening_get_next(struct seq_file *seq, void *cur)
2232 {
2233 	struct tcp_seq_afinfo *afinfo;
2234 	struct tcp_iter_state *st = seq->private;
2235 	struct net *net = seq_file_net(seq);
2236 	struct inet_listen_hashbucket *ilb;
2237 	struct hlist_nulls_node *node;
2238 	struct sock *sk = cur;
2239 
2240 	if (st->bpf_seq_afinfo)
2241 		afinfo = st->bpf_seq_afinfo;
2242 	else
2243 		afinfo = PDE_DATA(file_inode(seq->file));
2244 
2245 	if (!sk) {
2246 get_head:
2247 		ilb = &tcp_hashinfo.listening_hash[st->bucket];
2248 		spin_lock(&ilb->lock);
2249 		sk = sk_nulls_head(&ilb->nulls_head);
2250 		st->offset = 0;
2251 		goto get_sk;
2252 	}
2253 	ilb = &tcp_hashinfo.listening_hash[st->bucket];
2254 	++st->num;
2255 	++st->offset;
2256 
2257 	sk = sk_nulls_next(sk);
2258 get_sk:
2259 	sk_nulls_for_each_from(sk, node) {
2260 		if (!net_eq(sock_net(sk), net))
2261 			continue;
2262 		if (afinfo->family == AF_UNSPEC ||
2263 		    sk->sk_family == afinfo->family)
2264 			return sk;
2265 	}
2266 	spin_unlock(&ilb->lock);
2267 	st->offset = 0;
2268 	if (++st->bucket < INET_LHTABLE_SIZE)
2269 		goto get_head;
2270 	return NULL;
2271 }
2272 
2273 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2274 {
2275 	struct tcp_iter_state *st = seq->private;
2276 	void *rc;
2277 
2278 	st->bucket = 0;
2279 	st->offset = 0;
2280 	rc = listening_get_next(seq, NULL);
2281 
2282 	while (rc && *pos) {
2283 		rc = listening_get_next(seq, rc);
2284 		--*pos;
2285 	}
2286 	return rc;
2287 }
2288 
2289 static inline bool empty_bucket(const struct tcp_iter_state *st)
2290 {
2291 	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2292 }
2293 
2294 /*
2295  * Get first established socket starting from bucket given in st->bucket.
2296  * If st->bucket is zero, the very first socket in the hash is returned.
2297  */
2298 static void *established_get_first(struct seq_file *seq)
2299 {
2300 	struct tcp_seq_afinfo *afinfo;
2301 	struct tcp_iter_state *st = seq->private;
2302 	struct net *net = seq_file_net(seq);
2303 	void *rc = NULL;
2304 
2305 	if (st->bpf_seq_afinfo)
2306 		afinfo = st->bpf_seq_afinfo;
2307 	else
2308 		afinfo = PDE_DATA(file_inode(seq->file));
2309 
2310 	st->offset = 0;
2311 	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2312 		struct sock *sk;
2313 		struct hlist_nulls_node *node;
2314 		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2315 
2316 		/* Lockless fast path for the common case of empty buckets */
2317 		if (empty_bucket(st))
2318 			continue;
2319 
2320 		spin_lock_bh(lock);
2321 		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2322 			if ((afinfo->family != AF_UNSPEC &&
2323 			     sk->sk_family != afinfo->family) ||
2324 			    !net_eq(sock_net(sk), net)) {
2325 				continue;
2326 			}
2327 			rc = sk;
2328 			goto out;
2329 		}
2330 		spin_unlock_bh(lock);
2331 	}
2332 out:
2333 	return rc;
2334 }
2335 
2336 static void *established_get_next(struct seq_file *seq, void *cur)
2337 {
2338 	struct tcp_seq_afinfo *afinfo;
2339 	struct sock *sk = cur;
2340 	struct hlist_nulls_node *node;
2341 	struct tcp_iter_state *st = seq->private;
2342 	struct net *net = seq_file_net(seq);
2343 
2344 	if (st->bpf_seq_afinfo)
2345 		afinfo = st->bpf_seq_afinfo;
2346 	else
2347 		afinfo = PDE_DATA(file_inode(seq->file));
2348 
2349 	++st->num;
2350 	++st->offset;
2351 
2352 	sk = sk_nulls_next(sk);
2353 
2354 	sk_nulls_for_each_from(sk, node) {
2355 		if ((afinfo->family == AF_UNSPEC ||
2356 		     sk->sk_family == afinfo->family) &&
2357 		    net_eq(sock_net(sk), net))
2358 			return sk;
2359 	}
2360 
2361 	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2362 	++st->bucket;
2363 	return established_get_first(seq);
2364 }
2365 
2366 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2367 {
2368 	struct tcp_iter_state *st = seq->private;
2369 	void *rc;
2370 
2371 	st->bucket = 0;
2372 	rc = established_get_first(seq);
2373 
2374 	while (rc && pos) {
2375 		rc = established_get_next(seq, rc);
2376 		--pos;
2377 	}
2378 	return rc;
2379 }
2380 
2381 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2382 {
2383 	void *rc;
2384 	struct tcp_iter_state *st = seq->private;
2385 
2386 	st->state = TCP_SEQ_STATE_LISTENING;
2387 	rc	  = listening_get_idx(seq, &pos);
2388 
2389 	if (!rc) {
2390 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2391 		rc	  = established_get_idx(seq, pos);
2392 	}
2393 
2394 	return rc;
2395 }
2396 
2397 static void *tcp_seek_last_pos(struct seq_file *seq)
2398 {
2399 	struct tcp_iter_state *st = seq->private;
2400 	int offset = st->offset;
2401 	int orig_num = st->num;
2402 	void *rc = NULL;
2403 
2404 	switch (st->state) {
2405 	case TCP_SEQ_STATE_LISTENING:
2406 		if (st->bucket >= INET_LHTABLE_SIZE)
2407 			break;
2408 		st->state = TCP_SEQ_STATE_LISTENING;
2409 		rc = listening_get_next(seq, NULL);
2410 		while (offset-- && rc)
2411 			rc = listening_get_next(seq, rc);
2412 		if (rc)
2413 			break;
2414 		st->bucket = 0;
2415 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2416 		fallthrough;
2417 	case TCP_SEQ_STATE_ESTABLISHED:
2418 		if (st->bucket > tcp_hashinfo.ehash_mask)
2419 			break;
2420 		rc = established_get_first(seq);
2421 		while (offset-- && rc)
2422 			rc = established_get_next(seq, rc);
2423 	}
2424 
2425 	st->num = orig_num;
2426 
2427 	return rc;
2428 }
2429 
2430 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2431 {
2432 	struct tcp_iter_state *st = seq->private;
2433 	void *rc;
2434 
2435 	if (*pos && *pos == st->last_pos) {
2436 		rc = tcp_seek_last_pos(seq);
2437 		if (rc)
2438 			goto out;
2439 	}
2440 
2441 	st->state = TCP_SEQ_STATE_LISTENING;
2442 	st->num = 0;
2443 	st->bucket = 0;
2444 	st->offset = 0;
2445 	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2446 
2447 out:
2448 	st->last_pos = *pos;
2449 	return rc;
2450 }
2451 EXPORT_SYMBOL(tcp_seq_start);
2452 
2453 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2454 {
2455 	struct tcp_iter_state *st = seq->private;
2456 	void *rc = NULL;
2457 
2458 	if (v == SEQ_START_TOKEN) {
2459 		rc = tcp_get_idx(seq, 0);
2460 		goto out;
2461 	}
2462 
2463 	switch (st->state) {
2464 	case TCP_SEQ_STATE_LISTENING:
2465 		rc = listening_get_next(seq, v);
2466 		if (!rc) {
2467 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2468 			st->bucket = 0;
2469 			st->offset = 0;
2470 			rc	  = established_get_first(seq);
2471 		}
2472 		break;
2473 	case TCP_SEQ_STATE_ESTABLISHED:
2474 		rc = established_get_next(seq, v);
2475 		break;
2476 	}
2477 out:
2478 	++*pos;
2479 	st->last_pos = *pos;
2480 	return rc;
2481 }
2482 EXPORT_SYMBOL(tcp_seq_next);
2483 
2484 void tcp_seq_stop(struct seq_file *seq, void *v)
2485 {
2486 	struct tcp_iter_state *st = seq->private;
2487 
2488 	switch (st->state) {
2489 	case TCP_SEQ_STATE_LISTENING:
2490 		if (v != SEQ_START_TOKEN)
2491 			spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
2492 		break;
2493 	case TCP_SEQ_STATE_ESTABLISHED:
2494 		if (v)
2495 			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2496 		break;
2497 	}
2498 }
2499 EXPORT_SYMBOL(tcp_seq_stop);
2500 
2501 static void get_openreq4(const struct request_sock *req,
2502 			 struct seq_file *f, int i)
2503 {
2504 	const struct inet_request_sock *ireq = inet_rsk(req);
2505 	long delta = req->rsk_timer.expires - jiffies;
2506 
2507 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2508 		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2509 		i,
2510 		ireq->ir_loc_addr,
2511 		ireq->ir_num,
2512 		ireq->ir_rmt_addr,
2513 		ntohs(ireq->ir_rmt_port),
2514 		TCP_SYN_RECV,
2515 		0, 0, /* could print option size, but that is af dependent. */
2516 		1,    /* timers active (only the expire timer) */
2517 		jiffies_delta_to_clock_t(delta),
2518 		req->num_timeout,
2519 		from_kuid_munged(seq_user_ns(f),
2520 				 sock_i_uid(req->rsk_listener)),
2521 		0,  /* non standard timer */
2522 		0, /* open_requests have no inode */
2523 		0,
2524 		req);
2525 }
2526 
2527 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2528 {
2529 	int timer_active;
2530 	unsigned long timer_expires;
2531 	const struct tcp_sock *tp = tcp_sk(sk);
2532 	const struct inet_connection_sock *icsk = inet_csk(sk);
2533 	const struct inet_sock *inet = inet_sk(sk);
2534 	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2535 	__be32 dest = inet->inet_daddr;
2536 	__be32 src = inet->inet_rcv_saddr;
2537 	__u16 destp = ntohs(inet->inet_dport);
2538 	__u16 srcp = ntohs(inet->inet_sport);
2539 	int rx_queue;
2540 	int state;
2541 
2542 	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2543 	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2544 	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2545 		timer_active	= 1;
2546 		timer_expires	= icsk->icsk_timeout;
2547 	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2548 		timer_active	= 4;
2549 		timer_expires	= icsk->icsk_timeout;
2550 	} else if (timer_pending(&sk->sk_timer)) {
2551 		timer_active	= 2;
2552 		timer_expires	= sk->sk_timer.expires;
2553 	} else {
2554 		timer_active	= 0;
2555 		timer_expires = jiffies;
2556 	}
2557 
2558 	state = inet_sk_state_load(sk);
2559 	if (state == TCP_LISTEN)
2560 		rx_queue = READ_ONCE(sk->sk_ack_backlog);
2561 	else
2562 		/* Because we don't lock the socket,
2563 		 * we might find a transient negative value.
2564 		 */
2565 		rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2566 				      READ_ONCE(tp->copied_seq), 0);
2567 
2568 	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2569 			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2570 		i, src, srcp, dest, destp, state,
2571 		READ_ONCE(tp->write_seq) - tp->snd_una,
2572 		rx_queue,
2573 		timer_active,
2574 		jiffies_delta_to_clock_t(timer_expires - jiffies),
2575 		icsk->icsk_retransmits,
2576 		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2577 		icsk->icsk_probes_out,
2578 		sock_i_ino(sk),
2579 		refcount_read(&sk->sk_refcnt), sk,
2580 		jiffies_to_clock_t(icsk->icsk_rto),
2581 		jiffies_to_clock_t(icsk->icsk_ack.ato),
2582 		(icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2583 		tp->snd_cwnd,
2584 		state == TCP_LISTEN ?
2585 		    fastopenq->max_qlen :
2586 		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2587 }
2588 
2589 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2590 			       struct seq_file *f, int i)
2591 {
2592 	long delta = tw->tw_timer.expires - jiffies;
2593 	__be32 dest, src;
2594 	__u16 destp, srcp;
2595 
2596 	dest  = tw->tw_daddr;
2597 	src   = tw->tw_rcv_saddr;
2598 	destp = ntohs(tw->tw_dport);
2599 	srcp  = ntohs(tw->tw_sport);
2600 
2601 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2602 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2603 		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2604 		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2605 		refcount_read(&tw->tw_refcnt), tw);
2606 }
2607 
2608 #define TMPSZ 150
2609 
2610 static int tcp4_seq_show(struct seq_file *seq, void *v)
2611 {
2612 	struct tcp_iter_state *st;
2613 	struct sock *sk = v;
2614 
2615 	seq_setwidth(seq, TMPSZ - 1);
2616 	if (v == SEQ_START_TOKEN) {
2617 		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2618 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2619 			   "inode");
2620 		goto out;
2621 	}
2622 	st = seq->private;
2623 
2624 	if (sk->sk_state == TCP_TIME_WAIT)
2625 		get_timewait4_sock(v, seq, st->num);
2626 	else if (sk->sk_state == TCP_NEW_SYN_RECV)
2627 		get_openreq4(v, seq, st->num);
2628 	else
2629 		get_tcp4_sock(v, seq, st->num);
2630 out:
2631 	seq_pad(seq, '\n');
2632 	return 0;
2633 }
2634 
2635 #ifdef CONFIG_BPF_SYSCALL
/* Context handed to a BPF program attached to the "tcp" iterator;
 * filled in by tcp_prog_seq_show().
 */
struct bpf_iter__tcp {
	/* iterator metadata passed through from the caller */
	__bpf_md_ptr(struct bpf_iter_meta *, meta);
	/* socket being visited; NULL when invoked from the stop callback */
	__bpf_md_ptr(struct sock_common *, sk_common);
	/* uid computed for the socket by bpf_iter_tcp_seq_show() */
	uid_t uid __aligned(8);
};
2641 
2642 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
2643 			     struct sock_common *sk_common, uid_t uid)
2644 {
2645 	struct bpf_iter__tcp ctx;
2646 
2647 	meta->seq_num--;  /* skip SEQ_START_TOKEN */
2648 	ctx.meta = meta;
2649 	ctx.sk_common = sk_common;
2650 	ctx.uid = uid;
2651 	return bpf_iter_run_prog(prog, &ctx);
2652 }
2653 
2654 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
2655 {
2656 	struct bpf_iter_meta meta;
2657 	struct bpf_prog *prog;
2658 	struct sock *sk = v;
2659 	uid_t uid;
2660 
2661 	if (v == SEQ_START_TOKEN)
2662 		return 0;
2663 
2664 	if (sk->sk_state == TCP_TIME_WAIT) {
2665 		uid = 0;
2666 	} else if (sk->sk_state == TCP_NEW_SYN_RECV) {
2667 		const struct request_sock *req = v;
2668 
2669 		uid = from_kuid_munged(seq_user_ns(seq),
2670 				       sock_i_uid(req->rsk_listener));
2671 	} else {
2672 		uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
2673 	}
2674 
2675 	meta.seq = seq;
2676 	prog = bpf_iter_get_info(&meta, false);
2677 	return tcp_prog_seq_show(prog, &meta, v, uid);
2678 }
2679 
2680 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
2681 {
2682 	struct bpf_iter_meta meta;
2683 	struct bpf_prog *prog;
2684 
2685 	if (!v) {
2686 		meta.seq = seq;
2687 		prog = bpf_iter_get_info(&meta, true);
2688 		if (prog)
2689 			(void)tcp_prog_seq_show(prog, &meta, v, 0);
2690 	}
2691 
2692 	tcp_seq_stop(seq, v);
2693 }
2694 
2695 static const struct seq_operations bpf_iter_tcp_seq_ops = {
2696 	.show		= bpf_iter_tcp_seq_show,
2697 	.start		= tcp_seq_start,
2698 	.next		= tcp_seq_next,
2699 	.stop		= bpf_iter_tcp_seq_stop,
2700 };
2701 #endif
2702 
2703 static const struct seq_operations tcp4_seq_ops = {
2704 	.show		= tcp4_seq_show,
2705 	.start		= tcp_seq_start,
2706 	.next		= tcp_seq_next,
2707 	.stop		= tcp_seq_stop,
2708 };
2709 
/* Passed as the proc entry's data so the iterator helpers only return
 * AF_INET sockets for the "tcp" file.
 */
static struct tcp_seq_afinfo tcp4_seq_afinfo = {
	.family		= AF_INET,
};
2713 
2714 static int __net_init tcp4_proc_init_net(struct net *net)
2715 {
2716 	if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
2717 			sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
2718 		return -ENOMEM;
2719 	return 0;
2720 }
2721 
/* Per-netns exit: remove the "tcp" entry created by tcp4_proc_init_net(). */
static void __net_exit tcp4_proc_exit_net(struct net *net)
{
	remove_proc_entry("tcp", net->proc_net);
}
2726 
2727 static struct pernet_operations tcp4_net_ops = {
2728 	.init = tcp4_proc_init_net,
2729 	.exit = tcp4_proc_exit_net,
2730 };
2731 
2732 int __init tcp4_proc_init(void)
2733 {
2734 	return register_pernet_subsys(&tcp4_net_ops);
2735 }
2736 
/* Undo tcp4_proc_init(): unregister the per-netns proc hooks. */
void tcp4_proc_exit(void)
{
	unregister_pernet_subsys(&tcp4_net_ops);
}
2741 #endif /* CONFIG_PROC_FS */
2742 
2743 struct proto tcp_prot = {
2744 	.name			= "TCP",
2745 	.owner			= THIS_MODULE,
2746 	.close			= tcp_close,
2747 	.pre_connect		= tcp_v4_pre_connect,
2748 	.connect		= tcp_v4_connect,
2749 	.disconnect		= tcp_disconnect,
2750 	.accept			= inet_csk_accept,
2751 	.ioctl			= tcp_ioctl,
2752 	.init			= tcp_v4_init_sock,
2753 	.destroy		= tcp_v4_destroy_sock,
2754 	.shutdown		= tcp_shutdown,
2755 	.setsockopt		= tcp_setsockopt,
2756 	.getsockopt		= tcp_getsockopt,
2757 	.keepalive		= tcp_set_keepalive,
2758 	.recvmsg		= tcp_recvmsg,
2759 	.sendmsg		= tcp_sendmsg,
2760 	.sendpage		= tcp_sendpage,
2761 	.backlog_rcv		= tcp_v4_do_rcv,
2762 	.release_cb		= tcp_release_cb,
2763 	.hash			= inet_hash,
2764 	.unhash			= inet_unhash,
2765 	.get_port		= inet_csk_get_port,
2766 	.enter_memory_pressure	= tcp_enter_memory_pressure,
2767 	.leave_memory_pressure	= tcp_leave_memory_pressure,
2768 	.stream_memory_free	= tcp_stream_memory_free,
2769 	.sockets_allocated	= &tcp_sockets_allocated,
2770 	.orphan_count		= &tcp_orphan_count,
2771 	.memory_allocated	= &tcp_memory_allocated,
2772 	.memory_pressure	= &tcp_memory_pressure,
2773 	.sysctl_mem		= sysctl_tcp_mem,
2774 	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
2775 	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
2776 	.max_header		= MAX_TCP_HEADER,
2777 	.obj_size		= sizeof(struct tcp_sock),
2778 	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
2779 	.twsk_prot		= &tcp_timewait_sock_ops,
2780 	.rsk_prot		= &tcp_request_sock_ops,
2781 	.h.hashinfo		= &tcp_hashinfo,
2782 	.no_autobind		= true,
2783 	.diag_destroy		= tcp_abort,
2784 };
2785 EXPORT_SYMBOL(tcp_prot);
2786 
2787 static void __net_exit tcp_sk_exit(struct net *net)
2788 {
2789 	int cpu;
2790 
2791 	if (net->ipv4.tcp_congestion_control)
2792 		bpf_module_put(net->ipv4.tcp_congestion_control,
2793 			       net->ipv4.tcp_congestion_control->owner);
2794 
2795 	for_each_possible_cpu(cpu)
2796 		inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2797 	free_percpu(net->ipv4.tcp_sk);
2798 }
2799 
2800 static int __net_init tcp_sk_init(struct net *net)
2801 {
2802 	int res, cpu, cnt;
2803 
2804 	net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2805 	if (!net->ipv4.tcp_sk)
2806 		return -ENOMEM;
2807 
2808 	for_each_possible_cpu(cpu) {
2809 		struct sock *sk;
2810 
2811 		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2812 					   IPPROTO_TCP, net);
2813 		if (res)
2814 			goto fail;
2815 		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2816 
2817 		/* Please enforce IP_DF and IPID==0 for RST and
2818 		 * ACK sent in SYN-RECV and TIME-WAIT state.
2819 		 */
2820 		inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
2821 
2822 		*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2823 	}
2824 
2825 	net->ipv4.sysctl_tcp_ecn = 2;
2826 	net->ipv4.sysctl_tcp_ecn_fallback = 1;
2827 
2828 	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2829 	net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
2830 	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2831 	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2832 	net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
2833 
2834 	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2835 	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2836 	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2837 
2838 	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2839 	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2840 	net->ipv4.sysctl_tcp_syncookies = 1;
2841 	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2842 	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2843 	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2844 	net->ipv4.sysctl_tcp_orphan_retries = 0;
2845 	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2846 	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
2847 	net->ipv4.sysctl_tcp_tw_reuse = 2;
2848 	net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
2849 
2850 	cnt = tcp_hashinfo.ehash_mask + 1;
2851 	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
2852 	net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
2853 
2854 	net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
2855 	net->ipv4.sysctl_tcp_sack = 1;
2856 	net->ipv4.sysctl_tcp_window_scaling = 1;
2857 	net->ipv4.sysctl_tcp_timestamps = 1;
2858 	net->ipv4.sysctl_tcp_early_retrans = 3;
2859 	net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
2860 	net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
2861 	net->ipv4.sysctl_tcp_retrans_collapse = 1;
2862 	net->ipv4.sysctl_tcp_max_reordering = 300;
2863 	net->ipv4.sysctl_tcp_dsack = 1;
2864 	net->ipv4.sysctl_tcp_app_win = 31;
2865 	net->ipv4.sysctl_tcp_adv_win_scale = 1;
2866 	net->ipv4.sysctl_tcp_frto = 2;
2867 	net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
2868 	/* This limits the percentage of the congestion window which we
2869 	 * will allow a single TSO frame to consume.  Building TSO frames
2870 	 * which are too large can cause TCP streams to be bursty.
2871 	 */
2872 	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
2873 	/* Default TSQ limit of 16 TSO segments */
2874 	net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
2875 	/* rfc5961 challenge ack rate limiting */
2876 	net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
2877 	net->ipv4.sysctl_tcp_min_tso_segs = 2;
2878 	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
2879 	net->ipv4.sysctl_tcp_autocorking = 1;
2880 	net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
2881 	net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
2882 	net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
2883 	if (net != &init_net) {
2884 		memcpy(net->ipv4.sysctl_tcp_rmem,
2885 		       init_net.ipv4.sysctl_tcp_rmem,
2886 		       sizeof(init_net.ipv4.sysctl_tcp_rmem));
2887 		memcpy(net->ipv4.sysctl_tcp_wmem,
2888 		       init_net.ipv4.sysctl_tcp_wmem,
2889 		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
2890 	}
2891 	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
2892 	net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
2893 	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
2894 	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
2895 	spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
2896 	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
2897 	atomic_set(&net->ipv4.tfo_active_disable_times, 0);
2898 
2899 	/* Reno is always built in */
2900 	if (!net_eq(net, &init_net) &&
2901 	    bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
2902 			       init_net.ipv4.tcp_congestion_control->owner))
2903 		net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
2904 	else
2905 		net->ipv4.tcp_congestion_control = &tcp_reno;
2906 
2907 	return 0;
2908 fail:
2909 	tcp_sk_exit(net);
2910 
2911 	return res;
2912 }
2913 
2914 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2915 {
2916 	struct net *net;
2917 
2918 	inet_twsk_purge(&tcp_hashinfo, AF_INET);
2919 
2920 	list_for_each_entry(net, net_exit_list, exit_list)
2921 		tcp_fastopen_ctx_destroy(net);
2922 }
2923 
2924 static struct pernet_operations __net_initdata tcp_sk_ops = {
2925        .init	   = tcp_sk_init,
2926        .exit	   = tcp_sk_exit,
2927        .exit_batch = tcp_sk_exit_batch,
2928 };
2929 
2930 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
/* Declare the "tcp" BPF iterator entry point; its arguments mirror the
 * fields of struct bpf_iter__tcp.
 */
DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
		     struct sock_common *sk_common, uid_t uid)
2933 
2934 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
2935 {
2936 	struct tcp_iter_state *st = priv_data;
2937 	struct tcp_seq_afinfo *afinfo;
2938 	int ret;
2939 
2940 	afinfo = kmalloc(sizeof(*afinfo), GFP_USER | __GFP_NOWARN);
2941 	if (!afinfo)
2942 		return -ENOMEM;
2943 
2944 	afinfo->family = AF_UNSPEC;
2945 	st->bpf_seq_afinfo = afinfo;
2946 	ret = bpf_iter_init_seq_net(priv_data, aux);
2947 	if (ret)
2948 		kfree(afinfo);
2949 	return ret;
2950 }
2951 
2952 static void bpf_iter_fini_tcp(void *priv_data)
2953 {
2954 	struct tcp_iter_state *st = priv_data;
2955 
2956 	kfree(st->bpf_seq_afinfo);
2957 	bpf_iter_fini_seq_net(priv_data);
2958 }
2959 
2960 static const struct bpf_iter_seq_info tcp_seq_info = {
2961 	.seq_ops		= &bpf_iter_tcp_seq_ops,
2962 	.init_seq_private	= bpf_iter_init_tcp,
2963 	.fini_seq_private	= bpf_iter_fini_tcp,
2964 	.seq_priv_size		= sizeof(struct tcp_iter_state),
2965 };
2966 
2967 static struct bpf_iter_reg tcp_reg_info = {
2968 	.target			= "tcp",
2969 	.ctx_arg_info_size	= 1,
2970 	.ctx_arg_info		= {
2971 		{ offsetof(struct bpf_iter__tcp, sk_common),
2972 		  PTR_TO_BTF_ID_OR_NULL },
2973 	},
2974 	.seq_info		= &tcp_seq_info,
2975 };
2976 
2977 static void __init bpf_iter_register(void)
2978 {
2979 	tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
2980 	if (bpf_iter_reg_target(&tcp_reg_info))
2981 		pr_warn("Warning: could not register bpf iterator tcp\n");
2982 }
2983 
2984 #endif
2985 
2986 void __init tcp_v4_init(void)
2987 {
2988 	if (register_pernet_subsys(&tcp_sk_ops))
2989 		panic("Failed to create the TCP control socket.\n");
2990 
2991 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
2992 	bpf_iter_register();
2993 #endif
2994 }
2995