xref: /openbmc/linux/net/ipv4/tcp_ipv4.c (revision f97cee494dc92395a668445bcd24d34c89f4ff8c)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		Implementation of the Transmission Control Protocol(TCP).
8  *
9  *		IPv4 specific functions
10  *
11  *		code split from:
12  *		linux/ipv4/tcp.c
13  *		linux/ipv4/tcp_input.c
14  *		linux/ipv4/tcp_output.c
15  *
16  *		See tcp.c for author information
17  */
18 
19 /*
20  * Changes:
21  *		David S. Miller	:	New socket lookup architecture.
22  *					This code is dedicated to John Dyson.
23  *		David S. Miller :	Change semantics of established hash,
24  *					half is devoted to TIME_WAIT sockets
25  *					and the rest go in the other half.
26  *		Andi Kleen :		Add support for syncookies and fixed
27  *					some bugs: ip options weren't passed to
28  *					the TCP layer, missed a check for an
29  *					ACK bit.
30  *		Andi Kleen :		Implemented fast path mtu discovery.
31  *	     				Fixed many serious bugs in the
32  *					request_sock handling and moved
33  *					most of it into the af independent code.
34  *					Added tail drop and some other bugfixes.
35  *					Added new listen semantics.
36  *		Mike McLagan	:	Routing by source
37  *	Juan Jose Ciarlante:		ip_dynaddr bits
38  *		Andi Kleen:		various fixes.
39  *	Vitaly E. Lavrov	:	Transparent proxy revived after year
40  *					coma.
41  *	Andi Kleen		:	Fix new listen.
42  *	Andi Kleen		:	Fix accept error reporting.
43  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
44  *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
45  *					a single port at the same time.
46  */
47 
48 #define pr_fmt(fmt) "TCP: " fmt
49 
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
60 
61 #include <net/net_namespace.h>
62 #include <net/icmp.h>
63 #include <net/inet_hashtables.h>
64 #include <net/tcp.h>
65 #include <net/transp_v6.h>
66 #include <net/ipv6.h>
67 #include <net/inet_common.h>
68 #include <net/timewait_sock.h>
69 #include <net/xfrm.h>
70 #include <net/secure_seq.h>
71 #include <net/busy_poll.h>
72 
73 #include <linux/inet.h>
74 #include <linux/ipv6.h>
75 #include <linux/stddef.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
78 #include <linux/inetdevice.h>
79 #include <linux/btf_ids.h>
80 
81 #include <crypto/hash.h>
82 #include <linux/scatterlist.h>
83 
84 #include <trace/events/tcp.h>
85 
#ifdef CONFIG_TCP_MD5SIG
/* Forward declaration: the reply builders further down call this to sign a
 * bare TCP header before its definition is reached. The definition itself
 * lives elsewhere in this translation unit (not shown in this part).
 */
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
#endif

/* Hash-table state handed to the socket lookup helpers used in this file
 * (e.g. __inet_lookup_established(), __inet_lookup_listener()); exported so
 * that modules can reference it as well.
 */
struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);
93 
94 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
95 {
96 	return secure_tcp_seq(ip_hdr(skb)->daddr,
97 			      ip_hdr(skb)->saddr,
98 			      tcp_hdr(skb)->dest,
99 			      tcp_hdr(skb)->source);
100 }
101 
102 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
103 {
104 	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
105 }
106 
107 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
108 {
109 	const struct inet_timewait_sock *tw = inet_twsk(sktw);
110 	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
111 	struct tcp_sock *tp = tcp_sk(sk);
112 	int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;
113 
114 	if (reuse == 2) {
115 		/* Still does not detect *everything* that goes through
116 		 * lo, since we require a loopback src or dst address
117 		 * or direct binding to 'lo' interface.
118 		 */
119 		bool loopback = false;
120 		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
121 			loopback = true;
122 #if IS_ENABLED(CONFIG_IPV6)
123 		if (tw->tw_family == AF_INET6) {
124 			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
125 			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
126 			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
127 			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
128 				loopback = true;
129 		} else
130 #endif
131 		{
132 			if (ipv4_is_loopback(tw->tw_daddr) ||
133 			    ipv4_is_loopback(tw->tw_rcv_saddr))
134 				loopback = true;
135 		}
136 		if (!loopback)
137 			reuse = 0;
138 	}
139 
140 	/* With PAWS, it is safe from the viewpoint
141 	   of data integrity. Even without PAWS it is safe provided sequence
142 	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.
143 
144 	   Actually, the idea is close to VJ's one, only timestamp cache is
145 	   held not per host, but per port pair and TW bucket is used as state
146 	   holder.
147 
148 	   If TW bucket has been already destroyed we fall back to VJ's scheme
149 	   and use initial timestamp retrieved from peer table.
150 	 */
151 	if (tcptw->tw_ts_recent_stamp &&
152 	    (!twp || (reuse && time_after32(ktime_get_seconds(),
153 					    tcptw->tw_ts_recent_stamp)))) {
154 		/* In case of repair and re-using TIME-WAIT sockets we still
155 		 * want to be sure that it is safe as above but honor the
156 		 * sequence numbers and time stamps set as part of the repair
157 		 * process.
158 		 *
159 		 * Without this check re-using a TIME-WAIT socket with TCP
160 		 * repair would accumulate a -1 on the repair assigned
161 		 * sequence number. The first time it is reused the sequence
162 		 * is -1, the second time -2, etc. This fixes that issue
163 		 * without appearing to create any others.
164 		 */
165 		if (likely(!tp->repair)) {
166 			u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
167 
168 			if (!seq)
169 				seq = 1;
170 			WRITE_ONCE(tp->write_seq, seq);
171 			tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
172 			tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
173 		}
174 		sock_hold(sktw);
175 		return 1;
176 	}
177 
178 	return 0;
179 }
180 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
181 
182 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
183 			      int addr_len)
184 {
185 	/* This check is replicated from tcp_v4_connect() and intended to
186 	 * prevent BPF program called below from accessing bytes that are out
187 	 * of the bound specified by user in addr_len.
188 	 */
189 	if (addr_len < sizeof(struct sockaddr_in))
190 		return -EINVAL;
191 
192 	sock_owned_by_me(sk);
193 
194 	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
195 }
196 
197 /* This will initiate an outgoing connection. */
198 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
199 {
200 	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
201 	struct inet_sock *inet = inet_sk(sk);
202 	struct tcp_sock *tp = tcp_sk(sk);
203 	__be16 orig_sport, orig_dport;
204 	__be32 daddr, nexthop;
205 	struct flowi4 *fl4;
206 	struct rtable *rt;
207 	int err;
208 	struct ip_options_rcu *inet_opt;
209 	struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
210 
211 	if (addr_len < sizeof(struct sockaddr_in))
212 		return -EINVAL;
213 
214 	if (usin->sin_family != AF_INET)
215 		return -EAFNOSUPPORT;
216 
217 	nexthop = daddr = usin->sin_addr.s_addr;
218 	inet_opt = rcu_dereference_protected(inet->inet_opt,
219 					     lockdep_sock_is_held(sk));
220 	if (inet_opt && inet_opt->opt.srr) {
221 		if (!daddr)
222 			return -EINVAL;
223 		nexthop = inet_opt->opt.faddr;
224 	}
225 
226 	orig_sport = inet->inet_sport;
227 	orig_dport = usin->sin_port;
228 	fl4 = &inet->cork.fl.u.ip4;
229 	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
230 			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
231 			      IPPROTO_TCP,
232 			      orig_sport, orig_dport, sk);
233 	if (IS_ERR(rt)) {
234 		err = PTR_ERR(rt);
235 		if (err == -ENETUNREACH)
236 			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
237 		return err;
238 	}
239 
240 	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
241 		ip_rt_put(rt);
242 		return -ENETUNREACH;
243 	}
244 
245 	if (!inet_opt || !inet_opt->opt.srr)
246 		daddr = fl4->daddr;
247 
248 	if (!inet->inet_saddr)
249 		inet->inet_saddr = fl4->saddr;
250 	sk_rcv_saddr_set(sk, inet->inet_saddr);
251 
252 	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
253 		/* Reset inherited state */
254 		tp->rx_opt.ts_recent	   = 0;
255 		tp->rx_opt.ts_recent_stamp = 0;
256 		if (likely(!tp->repair))
257 			WRITE_ONCE(tp->write_seq, 0);
258 	}
259 
260 	inet->inet_dport = usin->sin_port;
261 	sk_daddr_set(sk, daddr);
262 
263 	inet_csk(sk)->icsk_ext_hdr_len = 0;
264 	if (inet_opt)
265 		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
266 
267 	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
268 
269 	/* Socket identity is still unknown (sport may be zero).
270 	 * However we set state to SYN-SENT and not releasing socket
271 	 * lock select source port, enter ourselves into the hash tables and
272 	 * complete initialization after this.
273 	 */
274 	tcp_set_state(sk, TCP_SYN_SENT);
275 	err = inet_hash_connect(tcp_death_row, sk);
276 	if (err)
277 		goto failure;
278 
279 	sk_set_txhash(sk);
280 
281 	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
282 			       inet->inet_sport, inet->inet_dport, sk);
283 	if (IS_ERR(rt)) {
284 		err = PTR_ERR(rt);
285 		rt = NULL;
286 		goto failure;
287 	}
288 	/* OK, now commit destination to socket.  */
289 	sk->sk_gso_type = SKB_GSO_TCPV4;
290 	sk_setup_caps(sk, &rt->dst);
291 	rt = NULL;
292 
293 	if (likely(!tp->repair)) {
294 		if (!tp->write_seq)
295 			WRITE_ONCE(tp->write_seq,
296 				   secure_tcp_seq(inet->inet_saddr,
297 						  inet->inet_daddr,
298 						  inet->inet_sport,
299 						  usin->sin_port));
300 		tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
301 						 inet->inet_saddr,
302 						 inet->inet_daddr);
303 	}
304 
305 	inet->inet_id = prandom_u32();
306 
307 	if (tcp_fastopen_defer_connect(sk, &err))
308 		return err;
309 	if (err)
310 		goto failure;
311 
312 	err = tcp_connect(sk);
313 
314 	if (err)
315 		goto failure;
316 
317 	return 0;
318 
319 failure:
320 	/*
321 	 * This unhashes the socket and releases the local port,
322 	 * if necessary.
323 	 */
324 	tcp_set_state(sk, TCP_CLOSE);
325 	ip_rt_put(rt);
326 	sk->sk_route_caps = 0;
327 	inet->inet_dport = 0;
328 	return err;
329 }
330 EXPORT_SYMBOL(tcp_v4_connect);
331 
332 /*
333  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
334  * It can be called through tcp_release_cb() if socket was owned by user
335  * at the time tcp_v4_err() was called to handle ICMP message.
336  */
337 void tcp_v4_mtu_reduced(struct sock *sk)
338 {
339 	struct inet_sock *inet = inet_sk(sk);
340 	struct dst_entry *dst;
341 	u32 mtu;
342 
343 	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
344 		return;
345 	mtu = tcp_sk(sk)->mtu_info;
346 	dst = inet_csk_update_pmtu(sk, mtu);
347 	if (!dst)
348 		return;
349 
350 	/* Something is about to be wrong... Remember soft error
351 	 * for the case, if this connection will not able to recover.
352 	 */
353 	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
354 		sk->sk_err_soft = EMSGSIZE;
355 
356 	mtu = dst_mtu(dst);
357 
358 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
359 	    ip_sk_accept_pmtu(sk) &&
360 	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
361 		tcp_sync_mss(sk, mtu);
362 
363 		/* Resend the TCP packet because it's
364 		 * clear that the old packet has been
365 		 * dropped. This is the new "fast" path mtu
366 		 * discovery.
367 		 */
368 		tcp_simple_retransmit(sk);
369 	} /* else let the usual retransmit timer handle it */
370 }
371 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
372 
373 static void do_redirect(struct sk_buff *skb, struct sock *sk)
374 {
375 	struct dst_entry *dst = __sk_dst_check(sk, 0);
376 
377 	if (dst)
378 		dst->ops->redirect(dst, sk, skb);
379 }
380 
381 
382 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
383 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
384 {
385 	struct request_sock *req = inet_reqsk(sk);
386 	struct net *net = sock_net(sk);
387 
388 	/* ICMPs are not backlogged, hence we cannot get
389 	 * an established socket here.
390 	 */
391 	if (seq != tcp_rsk(req)->snt_isn) {
392 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
393 	} else if (abort) {
394 		/*
395 		 * Still in SYN_RECV, just remove it silently.
396 		 * There is no good way to pass the error to the newly
397 		 * created socket, and POSIX does not want network
398 		 * errors returned from accept().
399 		 */
400 		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
401 		tcp_listendrop(req->rsk_listener);
402 	}
403 	reqsk_put(req);
404 }
405 EXPORT_SYMBOL(tcp_req_err);
406 
407 /* TCP-LD (RFC 6069) logic */
408 void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
409 {
410 	struct inet_connection_sock *icsk = inet_csk(sk);
411 	struct tcp_sock *tp = tcp_sk(sk);
412 	struct sk_buff *skb;
413 	s32 remaining;
414 	u32 delta_us;
415 
416 	if (sock_owned_by_user(sk))
417 		return;
418 
419 	if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
420 	    !icsk->icsk_backoff)
421 		return;
422 
423 	skb = tcp_rtx_queue_head(sk);
424 	if (WARN_ON_ONCE(!skb))
425 		return;
426 
427 	icsk->icsk_backoff--;
428 	icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
429 	icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
430 
431 	tcp_mstamp_refresh(tp);
432 	delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
433 	remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
434 
435 	if (remaining > 0) {
436 		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
437 					  remaining, TCP_RTO_MAX);
438 	} else {
439 		/* RTO revert clocked out retransmission.
440 		 * Will retransmit now.
441 		 */
442 		tcp_retransmit_timer(sk);
443 	}
444 }
445 EXPORT_SYMBOL(tcp_ld_RTO_revert);
446 
447 /*
448  * This routine is called by the ICMP module when it gets some
449  * sort of error condition.  If err < 0 then the socket should
450  * be closed and the error returned to the user.  If err > 0
451  * it's just the icmp type << 8 | icmp code.  After adjustment
452  * header points to the first 8 bytes of the tcp header.  We need
453  * to find the appropriate port.
454  *
455  * The locking strategy used here is very "optimistic". When
456  * someone else accesses the socket the ICMP is just dropped
457  * and for some paths there is no check at all.
458  * A more general error queue to queue errors for later handling
459  * is probably better.
460  *
461  */
462 
463 int tcp_v4_err(struct sk_buff *skb, u32 info)
464 {
465 	const struct iphdr *iph = (const struct iphdr *)skb->data;
466 	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
467 	struct tcp_sock *tp;
468 	struct inet_sock *inet;
469 	const int type = icmp_hdr(skb)->type;
470 	const int code = icmp_hdr(skb)->code;
471 	struct sock *sk;
472 	struct request_sock *fastopen;
473 	u32 seq, snd_una;
474 	int err;
475 	struct net *net = dev_net(skb->dev);
476 
477 	sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
478 				       th->dest, iph->saddr, ntohs(th->source),
479 				       inet_iif(skb), 0);
480 	if (!sk) {
481 		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
482 		return -ENOENT;
483 	}
484 	if (sk->sk_state == TCP_TIME_WAIT) {
485 		inet_twsk_put(inet_twsk(sk));
486 		return 0;
487 	}
488 	seq = ntohl(th->seq);
489 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
490 		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
491 				     type == ICMP_TIME_EXCEEDED ||
492 				     (type == ICMP_DEST_UNREACH &&
493 				      (code == ICMP_NET_UNREACH ||
494 				       code == ICMP_HOST_UNREACH)));
495 		return 0;
496 	}
497 
498 	bh_lock_sock(sk);
499 	/* If too many ICMPs get dropped on busy
500 	 * servers this needs to be solved differently.
501 	 * We do take care of PMTU discovery (RFC1191) special case :
502 	 * we can receive locally generated ICMP messages while socket is held.
503 	 */
504 	if (sock_owned_by_user(sk)) {
505 		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
506 			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
507 	}
508 	if (sk->sk_state == TCP_CLOSE)
509 		goto out;
510 
511 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
512 		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
513 		goto out;
514 	}
515 
516 	tp = tcp_sk(sk);
517 	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
518 	fastopen = rcu_dereference(tp->fastopen_rsk);
519 	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
520 	if (sk->sk_state != TCP_LISTEN &&
521 	    !between(seq, snd_una, tp->snd_nxt)) {
522 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
523 		goto out;
524 	}
525 
526 	switch (type) {
527 	case ICMP_REDIRECT:
528 		if (!sock_owned_by_user(sk))
529 			do_redirect(skb, sk);
530 		goto out;
531 	case ICMP_SOURCE_QUENCH:
532 		/* Just silently ignore these. */
533 		goto out;
534 	case ICMP_PARAMETERPROB:
535 		err = EPROTO;
536 		break;
537 	case ICMP_DEST_UNREACH:
538 		if (code > NR_ICMP_UNREACH)
539 			goto out;
540 
541 		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
542 			/* We are not interested in TCP_LISTEN and open_requests
543 			 * (SYN-ACKs send out by Linux are always <576bytes so
544 			 * they should go through unfragmented).
545 			 */
546 			if (sk->sk_state == TCP_LISTEN)
547 				goto out;
548 
549 			tp->mtu_info = info;
550 			if (!sock_owned_by_user(sk)) {
551 				tcp_v4_mtu_reduced(sk);
552 			} else {
553 				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
554 					sock_hold(sk);
555 			}
556 			goto out;
557 		}
558 
559 		err = icmp_err_convert[code].errno;
560 		/* check if this ICMP message allows revert of backoff.
561 		 * (see RFC 6069)
562 		 */
563 		if (!fastopen &&
564 		    (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
565 			tcp_ld_RTO_revert(sk, seq);
566 		break;
567 	case ICMP_TIME_EXCEEDED:
568 		err = EHOSTUNREACH;
569 		break;
570 	default:
571 		goto out;
572 	}
573 
574 	switch (sk->sk_state) {
575 	case TCP_SYN_SENT:
576 	case TCP_SYN_RECV:
577 		/* Only in fast or simultaneous open. If a fast open socket is
578 		 * is already accepted it is treated as a connected one below.
579 		 */
580 		if (fastopen && !fastopen->sk)
581 			break;
582 
583 		ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
584 
585 		if (!sock_owned_by_user(sk)) {
586 			sk->sk_err = err;
587 
588 			sk->sk_error_report(sk);
589 
590 			tcp_done(sk);
591 		} else {
592 			sk->sk_err_soft = err;
593 		}
594 		goto out;
595 	}
596 
597 	/* If we've already connected we will keep trying
598 	 * until we time out, or the user gives up.
599 	 *
600 	 * rfc1122 4.2.3.9 allows to consider as hard errors
601 	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
602 	 * but it is obsoleted by pmtu discovery).
603 	 *
604 	 * Note, that in modern internet, where routing is unreliable
605 	 * and in each dark corner broken firewalls sit, sending random
606 	 * errors ordered by their masters even this two messages finally lose
607 	 * their original sense (even Linux sends invalid PORT_UNREACHs)
608 	 *
609 	 * Now we are in compliance with RFCs.
610 	 *							--ANK (980905)
611 	 */
612 
613 	inet = inet_sk(sk);
614 	if (!sock_owned_by_user(sk) && inet->recverr) {
615 		sk->sk_err = err;
616 		sk->sk_error_report(sk);
617 	} else	{ /* Only an error on timeout */
618 		sk->sk_err_soft = err;
619 	}
620 
621 out:
622 	bh_unlock_sock(sk);
623 	sock_put(sk);
624 	return 0;
625 }
626 
627 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
628 {
629 	struct tcphdr *th = tcp_hdr(skb);
630 
631 	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
632 	skb->csum_start = skb_transport_header(skb) - skb->head;
633 	skb->csum_offset = offsetof(struct tcphdr, check);
634 }
635 
636 /* This routine computes an IPv4 TCP checksum. */
637 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
638 {
639 	const struct inet_sock *inet = inet_sk(sk);
640 
641 	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
642 }
643 EXPORT_SYMBOL(tcp_v4_send_check);
644 
645 /*
646  *	This routine will send an RST to the other tcp.
647  *
648  *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
649  *		      for reset.
650  *	Answer: if a packet caused RST, it is not for a socket
651  *		existing in our system, if it is matched to a socket,
652  *		it is just duplicate segment or bug in other side's TCP.
653  *		So that we build reply only basing on parameters
654  *		arrived with segment.
655  *	Exception: precedence violation. We do not implement it in any case.
656  */
657 
658 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
659 {
660 	const struct tcphdr *th = tcp_hdr(skb);
661 	struct {
662 		struct tcphdr th;
663 #ifdef CONFIG_TCP_MD5SIG
664 		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
665 #endif
666 	} rep;
667 	struct ip_reply_arg arg;
668 #ifdef CONFIG_TCP_MD5SIG
669 	struct tcp_md5sig_key *key = NULL;
670 	const __u8 *hash_location = NULL;
671 	unsigned char newhash[16];
672 	int genhash;
673 	struct sock *sk1 = NULL;
674 #endif
675 	u64 transmit_time = 0;
676 	struct sock *ctl_sk;
677 	struct net *net;
678 
679 	/* Never send a reset in response to a reset. */
680 	if (th->rst)
681 		return;
682 
683 	/* If sk not NULL, it means we did a successful lookup and incoming
684 	 * route had to be correct. prequeue might have dropped our dst.
685 	 */
686 	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
687 		return;
688 
689 	/* Swap the send and the receive. */
690 	memset(&rep, 0, sizeof(rep));
691 	rep.th.dest   = th->source;
692 	rep.th.source = th->dest;
693 	rep.th.doff   = sizeof(struct tcphdr) / 4;
694 	rep.th.rst    = 1;
695 
696 	if (th->ack) {
697 		rep.th.seq = th->ack_seq;
698 	} else {
699 		rep.th.ack = 1;
700 		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
701 				       skb->len - (th->doff << 2));
702 	}
703 
704 	memset(&arg, 0, sizeof(arg));
705 	arg.iov[0].iov_base = (unsigned char *)&rep;
706 	arg.iov[0].iov_len  = sizeof(rep.th);
707 
708 	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
709 #ifdef CONFIG_TCP_MD5SIG
710 	rcu_read_lock();
711 	hash_location = tcp_parse_md5sig_option(th);
712 	if (sk && sk_fullsock(sk)) {
713 		const union tcp_md5_addr *addr;
714 		int l3index;
715 
716 		/* sdif set, means packet ingressed via a device
717 		 * in an L3 domain and inet_iif is set to it.
718 		 */
719 		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
720 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
721 		key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
722 	} else if (hash_location) {
723 		const union tcp_md5_addr *addr;
724 		int sdif = tcp_v4_sdif(skb);
725 		int dif = inet_iif(skb);
726 		int l3index;
727 
728 		/*
729 		 * active side is lost. Try to find listening socket through
730 		 * source port, and then find md5 key through listening socket.
731 		 * we are not loose security here:
732 		 * Incoming packet is checked with md5 hash with finding key,
733 		 * no RST generated if md5 hash doesn't match.
734 		 */
735 		sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
736 					     ip_hdr(skb)->saddr,
737 					     th->source, ip_hdr(skb)->daddr,
738 					     ntohs(th->source), dif, sdif);
739 		/* don't send rst if it can't find key */
740 		if (!sk1)
741 			goto out;
742 
743 		/* sdif set, means packet ingressed via a device
744 		 * in an L3 domain and dif is set to it.
745 		 */
746 		l3index = sdif ? dif : 0;
747 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
748 		key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
749 		if (!key)
750 			goto out;
751 
752 
753 		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
754 		if (genhash || memcmp(hash_location, newhash, 16) != 0)
755 			goto out;
756 
757 	}
758 
759 	if (key) {
760 		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
761 				   (TCPOPT_NOP << 16) |
762 				   (TCPOPT_MD5SIG << 8) |
763 				   TCPOLEN_MD5SIG);
764 		/* Update length and the length the header thinks exists */
765 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
766 		rep.th.doff = arg.iov[0].iov_len / 4;
767 
768 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
769 				     key, ip_hdr(skb)->saddr,
770 				     ip_hdr(skb)->daddr, &rep.th);
771 	}
772 #endif
773 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
774 				      ip_hdr(skb)->saddr, /* XXX */
775 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
776 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
777 	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
778 
779 	/* When socket is gone, all binding information is lost.
780 	 * routing might fail in this case. No choice here, if we choose to force
781 	 * input interface, we will misroute in case of asymmetric route.
782 	 */
783 	if (sk) {
784 		arg.bound_dev_if = sk->sk_bound_dev_if;
785 		if (sk_fullsock(sk))
786 			trace_tcp_send_reset(sk, skb);
787 	}
788 
789 	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
790 		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));
791 
792 	arg.tos = ip_hdr(skb)->tos;
793 	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
794 	local_bh_disable();
795 	ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
796 	if (sk) {
797 		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
798 				   inet_twsk(sk)->tw_mark : sk->sk_mark;
799 		ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
800 				   inet_twsk(sk)->tw_priority : sk->sk_priority;
801 		transmit_time = tcp_transmit_time(sk);
802 	}
803 	ip_send_unicast_reply(ctl_sk,
804 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
805 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
806 			      &arg, arg.iov[0].iov_len,
807 			      transmit_time);
808 
809 	ctl_sk->sk_mark = 0;
810 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
811 	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
812 	local_bh_enable();
813 
814 #ifdef CONFIG_TCP_MD5SIG
815 out:
816 	rcu_read_unlock();
817 #endif
818 }
819 
820 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
821    outside socket context is ugly, certainly. What can I do?
822  */
823 
824 static void tcp_v4_send_ack(const struct sock *sk,
825 			    struct sk_buff *skb, u32 seq, u32 ack,
826 			    u32 win, u32 tsval, u32 tsecr, int oif,
827 			    struct tcp_md5sig_key *key,
828 			    int reply_flags, u8 tos)
829 {
830 	const struct tcphdr *th = tcp_hdr(skb);
831 	struct {
832 		struct tcphdr th;
833 		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
834 #ifdef CONFIG_TCP_MD5SIG
835 			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
836 #endif
837 			];
838 	} rep;
839 	struct net *net = sock_net(sk);
840 	struct ip_reply_arg arg;
841 	struct sock *ctl_sk;
842 	u64 transmit_time;
843 
844 	memset(&rep.th, 0, sizeof(struct tcphdr));
845 	memset(&arg, 0, sizeof(arg));
846 
847 	arg.iov[0].iov_base = (unsigned char *)&rep;
848 	arg.iov[0].iov_len  = sizeof(rep.th);
849 	if (tsecr) {
850 		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
851 				   (TCPOPT_TIMESTAMP << 8) |
852 				   TCPOLEN_TIMESTAMP);
853 		rep.opt[1] = htonl(tsval);
854 		rep.opt[2] = htonl(tsecr);
855 		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
856 	}
857 
858 	/* Swap the send and the receive. */
859 	rep.th.dest    = th->source;
860 	rep.th.source  = th->dest;
861 	rep.th.doff    = arg.iov[0].iov_len / 4;
862 	rep.th.seq     = htonl(seq);
863 	rep.th.ack_seq = htonl(ack);
864 	rep.th.ack     = 1;
865 	rep.th.window  = htons(win);
866 
867 #ifdef CONFIG_TCP_MD5SIG
868 	if (key) {
869 		int offset = (tsecr) ? 3 : 0;
870 
871 		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
872 					  (TCPOPT_NOP << 16) |
873 					  (TCPOPT_MD5SIG << 8) |
874 					  TCPOLEN_MD5SIG);
875 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
876 		rep.th.doff = arg.iov[0].iov_len/4;
877 
878 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
879 				    key, ip_hdr(skb)->saddr,
880 				    ip_hdr(skb)->daddr, &rep.th);
881 	}
882 #endif
883 	arg.flags = reply_flags;
884 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
885 				      ip_hdr(skb)->saddr, /* XXX */
886 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
887 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
888 	if (oif)
889 		arg.bound_dev_if = oif;
890 	arg.tos = tos;
891 	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
892 	local_bh_disable();
893 	ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
894 	ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
895 			   inet_twsk(sk)->tw_mark : sk->sk_mark;
896 	ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
897 			   inet_twsk(sk)->tw_priority : sk->sk_priority;
898 	transmit_time = tcp_transmit_time(sk);
899 	ip_send_unicast_reply(ctl_sk,
900 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
901 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
902 			      &arg, arg.iov[0].iov_len,
903 			      transmit_time);
904 
905 	ctl_sk->sk_mark = 0;
906 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
907 	local_bh_enable();
908 }
909 
910 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
911 {
912 	struct inet_timewait_sock *tw = inet_twsk(sk);
913 	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
914 
915 	tcp_v4_send_ack(sk, skb,
916 			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
917 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
918 			tcp_time_stamp_raw() + tcptw->tw_ts_offset,
919 			tcptw->tw_ts_recent,
920 			tw->tw_bound_dev_if,
921 			tcp_twsk_md5_key(tcptw),
922 			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
923 			tw->tw_tos
924 			);
925 
926 	inet_twsk_put(tw);
927 }
928 
929 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
930 				  struct request_sock *req)
931 {
932 	const union tcp_md5_addr *addr;
933 	int l3index;
934 
935 	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
936 	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
937 	 */
938 	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
939 					     tcp_sk(sk)->snd_nxt;
940 
941 	/* RFC 7323 2.3
942 	 * The window field (SEG.WND) of every outgoing segment, with the
943 	 * exception of <SYN> segments, MUST be right-shifted by
944 	 * Rcv.Wind.Shift bits:
945 	 */
946 	addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
947 	l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
948 	tcp_v4_send_ack(sk, skb, seq,
949 			tcp_rsk(req)->rcv_nxt,
950 			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
951 			tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
952 			req->ts_recent,
953 			0,
954 			tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
955 			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
956 			ip_hdr(skb)->tos);
957 }
958 
959 /*
960  *	Send a SYN-ACK after having received a SYN.
961  *	This still operates on a request_sock only, not on a big
962  *	socket.
963  */
964 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
965 			      struct flowi *fl,
966 			      struct request_sock *req,
967 			      struct tcp_fastopen_cookie *foc,
968 			      enum tcp_synack_type synack_type)
969 {
970 	const struct inet_request_sock *ireq = inet_rsk(req);
971 	struct flowi4 fl4;
972 	int err = -1;
973 	struct sk_buff *skb;
974 
975 	/* First, grab a route. */
976 	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
977 		return -1;
978 
979 	skb = tcp_make_synack(sk, dst, req, foc, synack_type);
980 
981 	if (skb) {
982 		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
983 
984 		rcu_read_lock();
985 		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
986 					    ireq->ir_rmt_addr,
987 					    rcu_dereference(ireq->ireq_opt));
988 		rcu_read_unlock();
989 		err = net_xmit_eval(err);
990 	}
991 
992 	return err;
993 }
994 
995 /*
996  *	IPv4 request_sock destructor.
997  */
998 static void tcp_v4_reqsk_destructor(struct request_sock *req)
999 {
1000 	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1001 }
1002 
1003 #ifdef CONFIG_TCP_MD5SIG
1004 /*
1005  * RFC2385 MD5 checksumming requires a mapping of
1006  * IP address->MD5 Key.
1007  * We need to maintain these in the sk structure.
1008  */
1009 
1010 DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
1011 EXPORT_SYMBOL(tcp_md5_needed);
1012 
1013 /* Find the Key structure for an address.  */
1014 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1015 					   const union tcp_md5_addr *addr,
1016 					   int family)
1017 {
1018 	const struct tcp_sock *tp = tcp_sk(sk);
1019 	struct tcp_md5sig_key *key;
1020 	const struct tcp_md5sig_info *md5sig;
1021 	__be32 mask;
1022 	struct tcp_md5sig_key *best_match = NULL;
1023 	bool match;
1024 
1025 	/* caller either holds rcu_read_lock() or socket lock */
1026 	md5sig = rcu_dereference_check(tp->md5sig_info,
1027 				       lockdep_sock_is_held(sk));
1028 	if (!md5sig)
1029 		return NULL;
1030 
1031 	hlist_for_each_entry_rcu(key, &md5sig->head, node,
1032 				 lockdep_sock_is_held(sk)) {
1033 		if (key->family != family)
1034 			continue;
1035 		if (key->l3index && key->l3index != l3index)
1036 			continue;
1037 		if (family == AF_INET) {
1038 			mask = inet_make_mask(key->prefixlen);
1039 			match = (key->addr.a4.s_addr & mask) ==
1040 				(addr->a4.s_addr & mask);
1041 #if IS_ENABLED(CONFIG_IPV6)
1042 		} else if (family == AF_INET6) {
1043 			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1044 						  key->prefixlen);
1045 #endif
1046 		} else {
1047 			match = false;
1048 		}
1049 
1050 		if (match && (!best_match ||
1051 			      key->prefixlen > best_match->prefixlen))
1052 			best_match = key;
1053 	}
1054 	return best_match;
1055 }
1056 EXPORT_SYMBOL(__tcp_md5_do_lookup);
1057 
1058 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1059 						      const union tcp_md5_addr *addr,
1060 						      int family, u8 prefixlen,
1061 						      int l3index)
1062 {
1063 	const struct tcp_sock *tp = tcp_sk(sk);
1064 	struct tcp_md5sig_key *key;
1065 	unsigned int size = sizeof(struct in_addr);
1066 	const struct tcp_md5sig_info *md5sig;
1067 
1068 	/* caller either holds rcu_read_lock() or socket lock */
1069 	md5sig = rcu_dereference_check(tp->md5sig_info,
1070 				       lockdep_sock_is_held(sk));
1071 	if (!md5sig)
1072 		return NULL;
1073 #if IS_ENABLED(CONFIG_IPV6)
1074 	if (family == AF_INET6)
1075 		size = sizeof(struct in6_addr);
1076 #endif
1077 	hlist_for_each_entry_rcu(key, &md5sig->head, node,
1078 				 lockdep_sock_is_held(sk)) {
1079 		if (key->family != family)
1080 			continue;
1081 		if (key->l3index && key->l3index != l3index)
1082 			continue;
1083 		if (!memcmp(&key->addr, addr, size) &&
1084 		    key->prefixlen == prefixlen)
1085 			return key;
1086 	}
1087 	return NULL;
1088 }
1089 
1090 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1091 					 const struct sock *addr_sk)
1092 {
1093 	const union tcp_md5_addr *addr;
1094 	int l3index;
1095 
1096 	l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1097 						 addr_sk->sk_bound_dev_if);
1098 	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1099 	return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1100 }
1101 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1102 
1103 /* This can be called on a newly created socket, from other files */
1104 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1105 		   int family, u8 prefixlen, int l3index,
1106 		   const u8 *newkey, u8 newkeylen, gfp_t gfp)
1107 {
1108 	/* Add Key to the list */
1109 	struct tcp_md5sig_key *key;
1110 	struct tcp_sock *tp = tcp_sk(sk);
1111 	struct tcp_md5sig_info *md5sig;
1112 
1113 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
1114 	if (key) {
1115 		/* Pre-existing entry - just update that one.
1116 		 * Note that the key might be used concurrently.
1117 		 * data_race() is telling kcsan that we do not care of
1118 		 * key mismatches, since changing MD5 key on live flows
1119 		 * can lead to packet drops.
1120 		 */
1121 		data_race(memcpy(key->key, newkey, newkeylen));
1122 
1123 		/* Pairs with READ_ONCE() in tcp_md5_hash_key().
1124 		 * Also note that a reader could catch new key->keylen value
1125 		 * but old key->key[], this is the reason we use __GFP_ZERO
1126 		 * at sock_kmalloc() time below these lines.
1127 		 */
1128 		WRITE_ONCE(key->keylen, newkeylen);
1129 
1130 		return 0;
1131 	}
1132 
1133 	md5sig = rcu_dereference_protected(tp->md5sig_info,
1134 					   lockdep_sock_is_held(sk));
1135 	if (!md5sig) {
1136 		md5sig = kmalloc(sizeof(*md5sig), gfp);
1137 		if (!md5sig)
1138 			return -ENOMEM;
1139 
1140 		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1141 		INIT_HLIST_HEAD(&md5sig->head);
1142 		rcu_assign_pointer(tp->md5sig_info, md5sig);
1143 	}
1144 
1145 	key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1146 	if (!key)
1147 		return -ENOMEM;
1148 	if (!tcp_alloc_md5sig_pool()) {
1149 		sock_kfree_s(sk, key, sizeof(*key));
1150 		return -ENOMEM;
1151 	}
1152 
1153 	memcpy(key->key, newkey, newkeylen);
1154 	key->keylen = newkeylen;
1155 	key->family = family;
1156 	key->prefixlen = prefixlen;
1157 	key->l3index = l3index;
1158 	memcpy(&key->addr, addr,
1159 	       (family == AF_INET6) ? sizeof(struct in6_addr) :
1160 				      sizeof(struct in_addr));
1161 	hlist_add_head_rcu(&key->node, &md5sig->head);
1162 	return 0;
1163 }
1164 EXPORT_SYMBOL(tcp_md5_do_add);
1165 
1166 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1167 		   u8 prefixlen, int l3index)
1168 {
1169 	struct tcp_md5sig_key *key;
1170 
1171 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
1172 	if (!key)
1173 		return -ENOENT;
1174 	hlist_del_rcu(&key->node);
1175 	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1176 	kfree_rcu(key, rcu);
1177 	return 0;
1178 }
1179 EXPORT_SYMBOL(tcp_md5_do_del);
1180 
1181 static void tcp_clear_md5_list(struct sock *sk)
1182 {
1183 	struct tcp_sock *tp = tcp_sk(sk);
1184 	struct tcp_md5sig_key *key;
1185 	struct hlist_node *n;
1186 	struct tcp_md5sig_info *md5sig;
1187 
1188 	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1189 
1190 	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1191 		hlist_del_rcu(&key->node);
1192 		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1193 		kfree_rcu(key, rcu);
1194 	}
1195 }
1196 
1197 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1198 				 sockptr_t optval, int optlen)
1199 {
1200 	struct tcp_md5sig cmd;
1201 	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1202 	const union tcp_md5_addr *addr;
1203 	u8 prefixlen = 32;
1204 	int l3index = 0;
1205 
1206 	if (optlen < sizeof(cmd))
1207 		return -EINVAL;
1208 
1209 	if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1210 		return -EFAULT;
1211 
1212 	if (sin->sin_family != AF_INET)
1213 		return -EINVAL;
1214 
1215 	if (optname == TCP_MD5SIG_EXT &&
1216 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1217 		prefixlen = cmd.tcpm_prefixlen;
1218 		if (prefixlen > 32)
1219 			return -EINVAL;
1220 	}
1221 
1222 	if (optname == TCP_MD5SIG_EXT &&
1223 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1224 		struct net_device *dev;
1225 
1226 		rcu_read_lock();
1227 		dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1228 		if (dev && netif_is_l3_master(dev))
1229 			l3index = dev->ifindex;
1230 
1231 		rcu_read_unlock();
1232 
1233 		/* ok to reference set/not set outside of rcu;
1234 		 * right now device MUST be an L3 master
1235 		 */
1236 		if (!dev || !l3index)
1237 			return -EINVAL;
1238 	}
1239 
1240 	addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1241 
1242 	if (!cmd.tcpm_keylen)
1243 		return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index);
1244 
1245 	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1246 		return -EINVAL;
1247 
1248 	return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index,
1249 			      cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
1250 }
1251 
1252 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1253 				   __be32 daddr, __be32 saddr,
1254 				   const struct tcphdr *th, int nbytes)
1255 {
1256 	struct tcp4_pseudohdr *bp;
1257 	struct scatterlist sg;
1258 	struct tcphdr *_th;
1259 
1260 	bp = hp->scratch;
1261 	bp->saddr = saddr;
1262 	bp->daddr = daddr;
1263 	bp->pad = 0;
1264 	bp->protocol = IPPROTO_TCP;
1265 	bp->len = cpu_to_be16(nbytes);
1266 
1267 	_th = (struct tcphdr *)(bp + 1);
1268 	memcpy(_th, th, sizeof(*th));
1269 	_th->check = 0;
1270 
1271 	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1272 	ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1273 				sizeof(*bp) + sizeof(*th));
1274 	return crypto_ahash_update(hp->md5_req);
1275 }
1276 
1277 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1278 			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
1279 {
1280 	struct tcp_md5sig_pool *hp;
1281 	struct ahash_request *req;
1282 
1283 	hp = tcp_get_md5sig_pool();
1284 	if (!hp)
1285 		goto clear_hash_noput;
1286 	req = hp->md5_req;
1287 
1288 	if (crypto_ahash_init(req))
1289 		goto clear_hash;
1290 	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1291 		goto clear_hash;
1292 	if (tcp_md5_hash_key(hp, key))
1293 		goto clear_hash;
1294 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1295 	if (crypto_ahash_final(req))
1296 		goto clear_hash;
1297 
1298 	tcp_put_md5sig_pool();
1299 	return 0;
1300 
1301 clear_hash:
1302 	tcp_put_md5sig_pool();
1303 clear_hash_noput:
1304 	memset(md5_hash, 0, 16);
1305 	return 1;
1306 }
1307 
1308 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1309 			const struct sock *sk,
1310 			const struct sk_buff *skb)
1311 {
1312 	struct tcp_md5sig_pool *hp;
1313 	struct ahash_request *req;
1314 	const struct tcphdr *th = tcp_hdr(skb);
1315 	__be32 saddr, daddr;
1316 
1317 	if (sk) { /* valid for establish/request sockets */
1318 		saddr = sk->sk_rcv_saddr;
1319 		daddr = sk->sk_daddr;
1320 	} else {
1321 		const struct iphdr *iph = ip_hdr(skb);
1322 		saddr = iph->saddr;
1323 		daddr = iph->daddr;
1324 	}
1325 
1326 	hp = tcp_get_md5sig_pool();
1327 	if (!hp)
1328 		goto clear_hash_noput;
1329 	req = hp->md5_req;
1330 
1331 	if (crypto_ahash_init(req))
1332 		goto clear_hash;
1333 
1334 	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1335 		goto clear_hash;
1336 	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1337 		goto clear_hash;
1338 	if (tcp_md5_hash_key(hp, key))
1339 		goto clear_hash;
1340 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1341 	if (crypto_ahash_final(req))
1342 		goto clear_hash;
1343 
1344 	tcp_put_md5sig_pool();
1345 	return 0;
1346 
1347 clear_hash:
1348 	tcp_put_md5sig_pool();
1349 clear_hash_noput:
1350 	memset(md5_hash, 0, 16);
1351 	return 1;
1352 }
1353 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1354 
1355 #endif
1356 
1357 /* Called with rcu_read_lock() */
1358 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1359 				    const struct sk_buff *skb,
1360 				    int dif, int sdif)
1361 {
1362 #ifdef CONFIG_TCP_MD5SIG
1363 	/*
1364 	 * This gets called for each TCP segment that arrives
1365 	 * so we want to be efficient.
1366 	 * We have 3 drop cases:
1367 	 * o No MD5 hash and one expected.
1368 	 * o MD5 hash and we're not expecting one.
1369 	 * o MD5 hash and its wrong.
1370 	 */
1371 	const __u8 *hash_location = NULL;
1372 	struct tcp_md5sig_key *hash_expected;
1373 	const struct iphdr *iph = ip_hdr(skb);
1374 	const struct tcphdr *th = tcp_hdr(skb);
1375 	const union tcp_md5_addr *addr;
1376 	unsigned char newhash[16];
1377 	int genhash, l3index;
1378 
1379 	/* sdif set, means packet ingressed via a device
1380 	 * in an L3 domain and dif is set to the l3mdev
1381 	 */
1382 	l3index = sdif ? dif : 0;
1383 
1384 	addr = (union tcp_md5_addr *)&iph->saddr;
1385 	hash_expected = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1386 	hash_location = tcp_parse_md5sig_option(th);
1387 
1388 	/* We've parsed the options - do we have a hash? */
1389 	if (!hash_expected && !hash_location)
1390 		return false;
1391 
1392 	if (hash_expected && !hash_location) {
1393 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1394 		return true;
1395 	}
1396 
1397 	if (!hash_expected && hash_location) {
1398 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1399 		return true;
1400 	}
1401 
1402 	/* Okay, so this is hash_expected and hash_location -
1403 	 * so we need to calculate the checksum.
1404 	 */
1405 	genhash = tcp_v4_md5_hash_skb(newhash,
1406 				      hash_expected,
1407 				      NULL, skb);
1408 
1409 	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1410 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1411 		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s L3 index %d\n",
1412 				     &iph->saddr, ntohs(th->source),
1413 				     &iph->daddr, ntohs(th->dest),
1414 				     genhash ? " tcp_v4_calc_md5_hash failed"
1415 				     : "", l3index);
1416 		return true;
1417 	}
1418 	return false;
1419 #endif
1420 	return false;
1421 }
1422 
1423 static void tcp_v4_init_req(struct request_sock *req,
1424 			    const struct sock *sk_listener,
1425 			    struct sk_buff *skb)
1426 {
1427 	struct inet_request_sock *ireq = inet_rsk(req);
1428 	struct net *net = sock_net(sk_listener);
1429 
1430 	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1431 	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1432 	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1433 }
1434 
1435 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1436 					  struct flowi *fl,
1437 					  const struct request_sock *req)
1438 {
1439 	return inet_csk_route_req(sk, &fl->u.ip4, req);
1440 }
1441 
1442 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1443 	.family		=	PF_INET,
1444 	.obj_size	=	sizeof(struct tcp_request_sock),
1445 	.rtx_syn_ack	=	tcp_rtx_synack,
1446 	.send_ack	=	tcp_v4_reqsk_send_ack,
1447 	.destructor	=	tcp_v4_reqsk_destructor,
1448 	.send_reset	=	tcp_v4_send_reset,
1449 	.syn_ack_timeout =	tcp_syn_ack_timeout,
1450 };
1451 
1452 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1453 	.mss_clamp	=	TCP_MSS_DEFAULT,
1454 #ifdef CONFIG_TCP_MD5SIG
1455 	.req_md5_lookup	=	tcp_v4_md5_lookup,
1456 	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1457 #endif
1458 	.init_req	=	tcp_v4_init_req,
1459 #ifdef CONFIG_SYN_COOKIES
1460 	.cookie_init_seq =	cookie_v4_init_sequence,
1461 #endif
1462 	.route_req	=	tcp_v4_route_req,
1463 	.init_seq	=	tcp_v4_init_seq,
1464 	.init_ts_off	=	tcp_v4_init_ts_off,
1465 	.send_synack	=	tcp_v4_send_synack,
1466 };
1467 
1468 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1469 {
1470 	/* Never answer to SYNs send to broadcast or multicast */
1471 	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1472 		goto drop;
1473 
1474 	return tcp_conn_request(&tcp_request_sock_ops,
1475 				&tcp_request_sock_ipv4_ops, sk, skb);
1476 
1477 drop:
1478 	tcp_listendrop(sk);
1479 	return 0;
1480 }
1481 EXPORT_SYMBOL(tcp_v4_conn_request);
1482 
1483 
1484 /*
1485  * The three way handshake has completed - we got a valid synack -
1486  * now create the new socket.
1487  */
1488 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1489 				  struct request_sock *req,
1490 				  struct dst_entry *dst,
1491 				  struct request_sock *req_unhash,
1492 				  bool *own_req)
1493 {
1494 	struct inet_request_sock *ireq;
1495 	struct inet_sock *newinet;
1496 	struct tcp_sock *newtp;
1497 	struct sock *newsk;
1498 #ifdef CONFIG_TCP_MD5SIG
1499 	const union tcp_md5_addr *addr;
1500 	struct tcp_md5sig_key *key;
1501 	int l3index;
1502 #endif
1503 	struct ip_options_rcu *inet_opt;
1504 
1505 	if (sk_acceptq_is_full(sk))
1506 		goto exit_overflow;
1507 
1508 	newsk = tcp_create_openreq_child(sk, req, skb);
1509 	if (!newsk)
1510 		goto exit_nonewsk;
1511 
1512 	newsk->sk_gso_type = SKB_GSO_TCPV4;
1513 	inet_sk_rx_dst_set(newsk, skb);
1514 
1515 	newtp		      = tcp_sk(newsk);
1516 	newinet		      = inet_sk(newsk);
1517 	ireq		      = inet_rsk(req);
1518 	sk_daddr_set(newsk, ireq->ir_rmt_addr);
1519 	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1520 	newsk->sk_bound_dev_if = ireq->ir_iif;
1521 	newinet->inet_saddr   = ireq->ir_loc_addr;
1522 	inet_opt	      = rcu_dereference(ireq->ireq_opt);
1523 	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1524 	newinet->mc_index     = inet_iif(skb);
1525 	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1526 	newinet->rcv_tos      = ip_hdr(skb)->tos;
1527 	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1528 	if (inet_opt)
1529 		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1530 	newinet->inet_id = prandom_u32();
1531 
1532 	if (!dst) {
1533 		dst = inet_csk_route_child_sock(sk, newsk, req);
1534 		if (!dst)
1535 			goto put_and_exit;
1536 	} else {
1537 		/* syncookie case : see end of cookie_v4_check() */
1538 	}
1539 	sk_setup_caps(newsk, dst);
1540 
1541 	tcp_ca_openreq_child(newsk, dst);
1542 
1543 	tcp_sync_mss(newsk, dst_mtu(dst));
1544 	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1545 
1546 	tcp_initialize_rcv_mss(newsk);
1547 
1548 #ifdef CONFIG_TCP_MD5SIG
1549 	l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1550 	/* Copy over the MD5 key from the original socket */
1551 	addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1552 	key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1553 	if (key) {
1554 		/*
1555 		 * We're using one, so create a matching key
1556 		 * on the newsk structure. If we fail to get
1557 		 * memory, then we end up not copying the key
1558 		 * across. Shucks.
1559 		 */
1560 		tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index,
1561 			       key->key, key->keylen, GFP_ATOMIC);
1562 		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1563 	}
1564 #endif
1565 
1566 	if (__inet_inherit_port(sk, newsk) < 0)
1567 		goto put_and_exit;
1568 	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
1569 	if (likely(*own_req)) {
1570 		tcp_move_syn(newtp, req);
1571 		ireq->ireq_opt = NULL;
1572 	} else {
1573 		newinet->inet_opt = NULL;
1574 	}
1575 	return newsk;
1576 
1577 exit_overflow:
1578 	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1579 exit_nonewsk:
1580 	dst_release(dst);
1581 exit:
1582 	tcp_listendrop(sk);
1583 	return NULL;
1584 put_and_exit:
1585 	newinet->inet_opt = NULL;
1586 	inet_csk_prepare_forced_close(newsk);
1587 	tcp_done(newsk);
1588 	goto exit;
1589 }
1590 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1591 
/* With CONFIG_SYN_COOKIES, pass non-SYN segments through cookie_v4_check()
 * and return its result; otherwise return @sk unchanged.
 */
static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
{
#ifdef CONFIG_SYN_COOKIES
	if (!tcp_hdr(skb)->syn)
		return cookie_v4_check(sk, skb);
#endif
	return sk;
}
1602 
1603 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1604 			 struct tcphdr *th, u32 *cookie)
1605 {
1606 	u16 mss = 0;
1607 #ifdef CONFIG_SYN_COOKIES
1608 	mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1609 				    &tcp_request_sock_ipv4_ops, sk, th);
1610 	if (mss) {
1611 		*cookie = __cookie_v4_init_sequence(iph, th, &mss);
1612 		tcp_synq_overflow(sk);
1613 	}
1614 #endif
1615 	return mss;
1616 }
1617 
1618 /* The socket must have it's spinlock held when we get
1619  * here, unless it is a TCP_LISTEN socket.
1620  *
1621  * We have a potential double-lock case here, so even when
1622  * doing backlog processing we use the BH locking scheme.
1623  * This is because we cannot sleep with the original spinlock
1624  * held.
1625  */
1626 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1627 {
1628 	struct sock *rsk;
1629 
1630 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1631 		struct dst_entry *dst = sk->sk_rx_dst;
1632 
1633 		sock_rps_save_rxhash(sk, skb);
1634 		sk_mark_napi_id(sk, skb);
1635 		if (dst) {
1636 			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1637 			    !dst->ops->check(dst, 0)) {
1638 				dst_release(dst);
1639 				sk->sk_rx_dst = NULL;
1640 			}
1641 		}
1642 		tcp_rcv_established(sk, skb);
1643 		return 0;
1644 	}
1645 
1646 	if (tcp_checksum_complete(skb))
1647 		goto csum_err;
1648 
1649 	if (sk->sk_state == TCP_LISTEN) {
1650 		struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1651 
1652 		if (!nsk)
1653 			goto discard;
1654 		if (nsk != sk) {
1655 			if (tcp_child_process(sk, nsk, skb)) {
1656 				rsk = nsk;
1657 				goto reset;
1658 			}
1659 			return 0;
1660 		}
1661 	} else
1662 		sock_rps_save_rxhash(sk, skb);
1663 
1664 	if (tcp_rcv_state_process(sk, skb)) {
1665 		rsk = sk;
1666 		goto reset;
1667 	}
1668 	return 0;
1669 
1670 reset:
1671 	tcp_v4_send_reset(rsk, skb);
1672 discard:
1673 	kfree_skb(skb);
1674 	/* Be careful here. If this function gets more complicated and
1675 	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1676 	 * might be destroyed here. This current version compiles correctly,
1677 	 * but you have been warned.
1678 	 */
1679 	return 0;
1680 
1681 csum_err:
1682 	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1683 	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1684 	goto discard;
1685 }
1686 EXPORT_SYMBOL(tcp_v4_do_rcv);
1687 
1688 int tcp_v4_early_demux(struct sk_buff *skb)
1689 {
1690 	const struct iphdr *iph;
1691 	const struct tcphdr *th;
1692 	struct sock *sk;
1693 
1694 	if (skb->pkt_type != PACKET_HOST)
1695 		return 0;
1696 
1697 	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1698 		return 0;
1699 
1700 	iph = ip_hdr(skb);
1701 	th = tcp_hdr(skb);
1702 
1703 	if (th->doff < sizeof(struct tcphdr) / 4)
1704 		return 0;
1705 
1706 	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1707 				       iph->saddr, th->source,
1708 				       iph->daddr, ntohs(th->dest),
1709 				       skb->skb_iif, inet_sdif(skb));
1710 	if (sk) {
1711 		skb->sk = sk;
1712 		skb->destructor = sock_edemux;
1713 		if (sk_fullsock(sk)) {
1714 			struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1715 
1716 			if (dst)
1717 				dst = dst_check(dst, 0);
1718 			if (dst &&
1719 			    inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1720 				skb_dst_set_noref(skb, dst);
1721 		}
1722 	}
1723 	return 0;
1724 }
1725 
1726 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1727 {
1728 	u32 limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf);
1729 	struct skb_shared_info *shinfo;
1730 	const struct tcphdr *th;
1731 	struct tcphdr *thtail;
1732 	struct sk_buff *tail;
1733 	unsigned int hdrlen;
1734 	bool fragstolen;
1735 	u32 gso_segs;
1736 	int delta;
1737 
1738 	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1739 	 * we can fix skb->truesize to its real value to avoid future drops.
1740 	 * This is valid because skb is not yet charged to the socket.
1741 	 * It has been noticed pure SACK packets were sometimes dropped
1742 	 * (if cooked by drivers without copybreak feature).
1743 	 */
1744 	skb_condense(skb);
1745 
1746 	skb_dst_drop(skb);
1747 
1748 	if (unlikely(tcp_checksum_complete(skb))) {
1749 		bh_unlock_sock(sk);
1750 		__TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1751 		__TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1752 		return true;
1753 	}
1754 
1755 	/* Attempt coalescing to last skb in backlog, even if we are
1756 	 * above the limits.
1757 	 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1758 	 */
1759 	th = (const struct tcphdr *)skb->data;
1760 	hdrlen = th->doff * 4;
1761 	shinfo = skb_shinfo(skb);
1762 
1763 	if (!shinfo->gso_size)
1764 		shinfo->gso_size = skb->len - hdrlen;
1765 
1766 	if (!shinfo->gso_segs)
1767 		shinfo->gso_segs = 1;
1768 
1769 	tail = sk->sk_backlog.tail;
1770 	if (!tail)
1771 		goto no_coalesce;
1772 	thtail = (struct tcphdr *)tail->data;
1773 
1774 	if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1775 	    TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1776 	    ((TCP_SKB_CB(tail)->tcp_flags |
1777 	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1778 	    !((TCP_SKB_CB(tail)->tcp_flags &
1779 	      TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1780 	    ((TCP_SKB_CB(tail)->tcp_flags ^
1781 	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1782 #ifdef CONFIG_TLS_DEVICE
1783 	    tail->decrypted != skb->decrypted ||
1784 #endif
1785 	    thtail->doff != th->doff ||
1786 	    memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1787 		goto no_coalesce;
1788 
1789 	__skb_pull(skb, hdrlen);
1790 	if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1791 		thtail->window = th->window;
1792 
1793 		TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1794 
1795 		if (after(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))
1796 			TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1797 
1798 		/* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1799 		 * thtail->fin, so that the fast path in tcp_rcv_established()
1800 		 * is not entered if we append a packet with a FIN.
1801 		 * SYN, RST, URG are not present.
1802 		 * ACK is set on both packets.
1803 		 * PSH : we do not really care in TCP stack,
1804 		 *       at least for 'GRO' packets.
1805 		 */
1806 		thtail->fin |= th->fin;
1807 		TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1808 
1809 		if (TCP_SKB_CB(skb)->has_rxtstamp) {
1810 			TCP_SKB_CB(tail)->has_rxtstamp = true;
1811 			tail->tstamp = skb->tstamp;
1812 			skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1813 		}
1814 
1815 		/* Not as strict as GRO. We only need to carry mss max value */
1816 		skb_shinfo(tail)->gso_size = max(shinfo->gso_size,
1817 						 skb_shinfo(tail)->gso_size);
1818 
1819 		gso_segs = skb_shinfo(tail)->gso_segs + shinfo->gso_segs;
1820 		skb_shinfo(tail)->gso_segs = min_t(u32, gso_segs, 0xFFFF);
1821 
1822 		sk->sk_backlog.len += delta;
1823 		__NET_INC_STATS(sock_net(sk),
1824 				LINUX_MIB_TCPBACKLOGCOALESCE);
1825 		kfree_skb_partial(skb, fragstolen);
1826 		return false;
1827 	}
1828 	__skb_push(skb, hdrlen);
1829 
1830 no_coalesce:
1831 	/* Only socket owner can try to collapse/prune rx queues
1832 	 * to reduce memory overhead, so add a little headroom here.
1833 	 * Few sockets backlog are possibly concurrently non empty.
1834 	 */
1835 	limit += 64*1024;
1836 
1837 	if (unlikely(sk_add_backlog(sk, skb, limit))) {
1838 		bh_unlock_sock(sk);
1839 		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1840 		return true;
1841 	}
1842 	return false;
1843 }
1844 EXPORT_SYMBOL(tcp_add_backlog);
1845 
1846 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1847 {
1848 	struct tcphdr *th = (struct tcphdr *)skb->data;
1849 
1850 	return sk_filter_trim_cap(sk, skb, th->doff * 4);
1851 }
1852 EXPORT_SYMBOL(tcp_filter);
1853 
1854 static void tcp_v4_restore_cb(struct sk_buff *skb)
1855 {
1856 	memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1857 		sizeof(struct inet_skb_parm));
1858 }
1859 
1860 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1861 			   const struct tcphdr *th)
1862 {
1863 	/* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
1864 	 * barrier() makes sure compiler wont play fool^Waliasing games.
1865 	 */
1866 	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1867 		sizeof(struct inet_skb_parm));
1868 	barrier();
1869 
1870 	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1871 	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1872 				    skb->len - th->doff * 4);
1873 	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1874 	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1875 	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1876 	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1877 	TCP_SKB_CB(skb)->sacked	 = 0;
1878 	TCP_SKB_CB(skb)->has_rxtstamp =
1879 			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1880 }
1881 
1882 /*
1883  *	From tcp_input.c
1884  */
1885 
1886 int tcp_v4_rcv(struct sk_buff *skb)
1887 {
1888 	struct net *net = dev_net(skb->dev);
1889 	struct sk_buff *skb_to_free;
1890 	int sdif = inet_sdif(skb);
1891 	int dif = inet_iif(skb);
1892 	const struct iphdr *iph;
1893 	const struct tcphdr *th;
1894 	bool refcounted;
1895 	struct sock *sk;
1896 	int ret;
1897 
1898 	if (skb->pkt_type != PACKET_HOST)
1899 		goto discard_it;
1900 
1901 	/* Count it even if it's bad */
1902 	__TCP_INC_STATS(net, TCP_MIB_INSEGS);
1903 
1904 	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1905 		goto discard_it;
1906 
1907 	th = (const struct tcphdr *)skb->data;
1908 
1909 	if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1910 		goto bad_packet;
1911 	if (!pskb_may_pull(skb, th->doff * 4))
1912 		goto discard_it;
1913 
1914 	/* An explanation is required here, I think.
1915 	 * Packet length and doff are validated by header prediction,
1916 	 * provided case of th->doff==0 is eliminated.
1917 	 * So, we defer the checks. */
1918 
1919 	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1920 		goto csum_error;
1921 
1922 	th = (const struct tcphdr *)skb->data;
1923 	iph = ip_hdr(skb);
1924 lookup:
1925 	sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1926 			       th->dest, sdif, &refcounted);
1927 	if (!sk)
1928 		goto no_tcp_socket;
1929 
1930 process:
1931 	if (sk->sk_state == TCP_TIME_WAIT)
1932 		goto do_time_wait;
1933 
1934 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
1935 		struct request_sock *req = inet_reqsk(sk);
1936 		bool req_stolen = false;
1937 		struct sock *nsk;
1938 
1939 		sk = req->rsk_listener;
1940 		if (unlikely(tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))) {
1941 			sk_drops_add(sk, skb);
1942 			reqsk_put(req);
1943 			goto discard_it;
1944 		}
1945 		if (tcp_checksum_complete(skb)) {
1946 			reqsk_put(req);
1947 			goto csum_error;
1948 		}
1949 		if (unlikely(sk->sk_state != TCP_LISTEN)) {
1950 			inet_csk_reqsk_queue_drop_and_put(sk, req);
1951 			goto lookup;
1952 		}
1953 		/* We own a reference on the listener, increase it again
1954 		 * as we might lose it too soon.
1955 		 */
1956 		sock_hold(sk);
1957 		refcounted = true;
1958 		nsk = NULL;
1959 		if (!tcp_filter(sk, skb)) {
1960 			th = (const struct tcphdr *)skb->data;
1961 			iph = ip_hdr(skb);
1962 			tcp_v4_fill_cb(skb, iph, th);
1963 			nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
1964 		}
1965 		if (!nsk) {
1966 			reqsk_put(req);
1967 			if (req_stolen) {
1968 				/* Another cpu got exclusive access to req
1969 				 * and created a full blown socket.
1970 				 * Try to feed this packet to this socket
1971 				 * instead of discarding it.
1972 				 */
1973 				tcp_v4_restore_cb(skb);
1974 				sock_put(sk);
1975 				goto lookup;
1976 			}
1977 			goto discard_and_relse;
1978 		}
1979 		if (nsk == sk) {
1980 			reqsk_put(req);
1981 			tcp_v4_restore_cb(skb);
1982 		} else if (tcp_child_process(sk, nsk, skb)) {
1983 			tcp_v4_send_reset(nsk, skb);
1984 			goto discard_and_relse;
1985 		} else {
1986 			sock_put(sk);
1987 			return 0;
1988 		}
1989 	}
1990 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1991 		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
1992 		goto discard_and_relse;
1993 	}
1994 
1995 	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1996 		goto discard_and_relse;
1997 
1998 	if (tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))
1999 		goto discard_and_relse;
2000 
2001 	nf_reset_ct(skb);
2002 
2003 	if (tcp_filter(sk, skb))
2004 		goto discard_and_relse;
2005 	th = (const struct tcphdr *)skb->data;
2006 	iph = ip_hdr(skb);
2007 	tcp_v4_fill_cb(skb, iph, th);
2008 
2009 	skb->dev = NULL;
2010 
2011 	if (sk->sk_state == TCP_LISTEN) {
2012 		ret = tcp_v4_do_rcv(sk, skb);
2013 		goto put_and_return;
2014 	}
2015 
2016 	sk_incoming_cpu_update(sk);
2017 
2018 	bh_lock_sock_nested(sk);
2019 	tcp_segs_in(tcp_sk(sk), skb);
2020 	ret = 0;
2021 	if (!sock_owned_by_user(sk)) {
2022 		skb_to_free = sk->sk_rx_skb_cache;
2023 		sk->sk_rx_skb_cache = NULL;
2024 		ret = tcp_v4_do_rcv(sk, skb);
2025 	} else {
2026 		if (tcp_add_backlog(sk, skb))
2027 			goto discard_and_relse;
2028 		skb_to_free = NULL;
2029 	}
2030 	bh_unlock_sock(sk);
2031 	if (skb_to_free)
2032 		__kfree_skb(skb_to_free);
2033 
2034 put_and_return:
2035 	if (refcounted)
2036 		sock_put(sk);
2037 
2038 	return ret;
2039 
2040 no_tcp_socket:
2041 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2042 		goto discard_it;
2043 
2044 	tcp_v4_fill_cb(skb, iph, th);
2045 
2046 	if (tcp_checksum_complete(skb)) {
2047 csum_error:
2048 		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2049 bad_packet:
2050 		__TCP_INC_STATS(net, TCP_MIB_INERRS);
2051 	} else {
2052 		tcp_v4_send_reset(NULL, skb);
2053 	}
2054 
2055 discard_it:
2056 	/* Discard frame. */
2057 	kfree_skb(skb);
2058 	return 0;
2059 
2060 discard_and_relse:
2061 	sk_drops_add(sk, skb);
2062 	if (refcounted)
2063 		sock_put(sk);
2064 	goto discard_it;
2065 
2066 do_time_wait:
2067 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2068 		inet_twsk_put(inet_twsk(sk));
2069 		goto discard_it;
2070 	}
2071 
2072 	tcp_v4_fill_cb(skb, iph, th);
2073 
2074 	if (tcp_checksum_complete(skb)) {
2075 		inet_twsk_put(inet_twsk(sk));
2076 		goto csum_error;
2077 	}
2078 	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2079 	case TCP_TW_SYN: {
2080 		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
2081 							&tcp_hashinfo, skb,
2082 							__tcp_hdrlen(th),
2083 							iph->saddr, th->source,
2084 							iph->daddr, th->dest,
2085 							inet_iif(skb),
2086 							sdif);
2087 		if (sk2) {
2088 			inet_twsk_deschedule_put(inet_twsk(sk));
2089 			sk = sk2;
2090 			tcp_v4_restore_cb(skb);
2091 			refcounted = false;
2092 			goto process;
2093 		}
2094 	}
2095 		/* to ACK */
2096 		fallthrough;
2097 	case TCP_TW_ACK:
2098 		tcp_v4_timewait_ack(sk, skb);
2099 		break;
2100 	case TCP_TW_RST:
2101 		tcp_v4_send_reset(sk, skb);
2102 		inet_twsk_deschedule_put(inet_twsk(sk));
2103 		goto discard_it;
2104 	case TCP_TW_SUCCESS:;
2105 	}
2106 	goto discard_it;
2107 }
2108 
2109 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2110 	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
2111 	.twsk_unique	= tcp_twsk_unique,
2112 	.twsk_destructor= tcp_twsk_destructor,
2113 };
2114 
2115 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2116 {
2117 	struct dst_entry *dst = skb_dst(skb);
2118 
2119 	if (dst && dst_hold_safe(dst)) {
2120 		sk->sk_rx_dst = dst;
2121 		inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
2122 	}
2123 }
2124 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2125 
2126 const struct inet_connection_sock_af_ops ipv4_specific = {
2127 	.queue_xmit	   = ip_queue_xmit,
2128 	.send_check	   = tcp_v4_send_check,
2129 	.rebuild_header	   = inet_sk_rebuild_header,
2130 	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
2131 	.conn_request	   = tcp_v4_conn_request,
2132 	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
2133 	.net_header_len	   = sizeof(struct iphdr),
2134 	.setsockopt	   = ip_setsockopt,
2135 	.getsockopt	   = ip_getsockopt,
2136 	.addr2sockaddr	   = inet_csk_addr2sockaddr,
2137 	.sockaddr_len	   = sizeof(struct sockaddr_in),
2138 	.mtu_reduced	   = tcp_v4_mtu_reduced,
2139 };
2140 EXPORT_SYMBOL(ipv4_specific);
2141 
2142 #ifdef CONFIG_TCP_MD5SIG
2143 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2144 	.md5_lookup		= tcp_v4_md5_lookup,
2145 	.calc_md5_hash		= tcp_v4_md5_hash_skb,
2146 	.md5_parse		= tcp_v4_parse_md5_keys,
2147 };
2148 #endif
2149 
2150 /* NOTE: A lot of things set to zero explicitly by call to
2151  *       sk_alloc() so need not be done here.
2152  */
2153 static int tcp_v4_init_sock(struct sock *sk)
2154 {
2155 	struct inet_connection_sock *icsk = inet_csk(sk);
2156 
2157 	tcp_init_sock(sk);
2158 
2159 	icsk->icsk_af_ops = &ipv4_specific;
2160 
2161 #ifdef CONFIG_TCP_MD5SIG
2162 	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2163 #endif
2164 
2165 	return 0;
2166 }
2167 
2168 void tcp_v4_destroy_sock(struct sock *sk)
2169 {
2170 	struct tcp_sock *tp = tcp_sk(sk);
2171 
2172 	trace_tcp_destroy_sock(sk);
2173 
2174 	tcp_clear_xmit_timers(sk);
2175 
2176 	tcp_cleanup_congestion_control(sk);
2177 
2178 	tcp_cleanup_ulp(sk);
2179 
2180 	/* Cleanup up the write buffer. */
2181 	tcp_write_queue_purge(sk);
2182 
2183 	/* Check if we want to disable active TFO */
2184 	tcp_fastopen_active_disable_ofo_check(sk);
2185 
2186 	/* Cleans up our, hopefully empty, out_of_order_queue. */
2187 	skb_rbtree_purge(&tp->out_of_order_queue);
2188 
2189 #ifdef CONFIG_TCP_MD5SIG
2190 	/* Clean up the MD5 key list, if any */
2191 	if (tp->md5sig_info) {
2192 		tcp_clear_md5_list(sk);
2193 		kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2194 		tp->md5sig_info = NULL;
2195 	}
2196 #endif
2197 
2198 	/* Clean up a referenced TCP bind bucket. */
2199 	if (inet_csk(sk)->icsk_bind_hash)
2200 		inet_put_port(sk);
2201 
2202 	BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2203 
2204 	/* If socket is aborted during connect operation */
2205 	tcp_free_fastopen_req(tp);
2206 	tcp_fastopen_destroy_cipher(sk);
2207 	tcp_saved_syn_free(tp);
2208 
2209 	sk_sockets_allocated_dec(sk);
2210 }
2211 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2212 
2213 #ifdef CONFIG_PROC_FS
2214 /* Proc filesystem TCP sock list dumping. */
2215 
2216 /*
2217  * Get next listener socket follow cur.  If cur is NULL, get first socket
2218  * starting from bucket given in st->bucket; when st->bucket is zero the
2219  * very first socket in the hash table is returned.
2220  */
2221 static void *listening_get_next(struct seq_file *seq, void *cur)
2222 {
2223 	struct tcp_seq_afinfo *afinfo;
2224 	struct tcp_iter_state *st = seq->private;
2225 	struct net *net = seq_file_net(seq);
2226 	struct inet_listen_hashbucket *ilb;
2227 	struct hlist_nulls_node *node;
2228 	struct sock *sk = cur;
2229 
2230 	if (st->bpf_seq_afinfo)
2231 		afinfo = st->bpf_seq_afinfo;
2232 	else
2233 		afinfo = PDE_DATA(file_inode(seq->file));
2234 
2235 	if (!sk) {
2236 get_head:
2237 		ilb = &tcp_hashinfo.listening_hash[st->bucket];
2238 		spin_lock(&ilb->lock);
2239 		sk = sk_nulls_head(&ilb->nulls_head);
2240 		st->offset = 0;
2241 		goto get_sk;
2242 	}
2243 	ilb = &tcp_hashinfo.listening_hash[st->bucket];
2244 	++st->num;
2245 	++st->offset;
2246 
2247 	sk = sk_nulls_next(sk);
2248 get_sk:
2249 	sk_nulls_for_each_from(sk, node) {
2250 		if (!net_eq(sock_net(sk), net))
2251 			continue;
2252 		if (afinfo->family == AF_UNSPEC ||
2253 		    sk->sk_family == afinfo->family)
2254 			return sk;
2255 	}
2256 	spin_unlock(&ilb->lock);
2257 	st->offset = 0;
2258 	if (++st->bucket < INET_LHTABLE_SIZE)
2259 		goto get_head;
2260 	return NULL;
2261 }
2262 
2263 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2264 {
2265 	struct tcp_iter_state *st = seq->private;
2266 	void *rc;
2267 
2268 	st->bucket = 0;
2269 	st->offset = 0;
2270 	rc = listening_get_next(seq, NULL);
2271 
2272 	while (rc && *pos) {
2273 		rc = listening_get_next(seq, rc);
2274 		--*pos;
2275 	}
2276 	return rc;
2277 }
2278 
2279 static inline bool empty_bucket(const struct tcp_iter_state *st)
2280 {
2281 	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2282 }
2283 
2284 /*
2285  * Get first established socket starting from bucket given in st->bucket.
2286  * If st->bucket is zero, the very first socket in the hash is returned.
2287  */
2288 static void *established_get_first(struct seq_file *seq)
2289 {
2290 	struct tcp_seq_afinfo *afinfo;
2291 	struct tcp_iter_state *st = seq->private;
2292 	struct net *net = seq_file_net(seq);
2293 	void *rc = NULL;
2294 
2295 	if (st->bpf_seq_afinfo)
2296 		afinfo = st->bpf_seq_afinfo;
2297 	else
2298 		afinfo = PDE_DATA(file_inode(seq->file));
2299 
2300 	st->offset = 0;
2301 	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2302 		struct sock *sk;
2303 		struct hlist_nulls_node *node;
2304 		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2305 
2306 		/* Lockless fast path for the common case of empty buckets */
2307 		if (empty_bucket(st))
2308 			continue;
2309 
2310 		spin_lock_bh(lock);
2311 		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2312 			if ((afinfo->family != AF_UNSPEC &&
2313 			     sk->sk_family != afinfo->family) ||
2314 			    !net_eq(sock_net(sk), net)) {
2315 				continue;
2316 			}
2317 			rc = sk;
2318 			goto out;
2319 		}
2320 		spin_unlock_bh(lock);
2321 	}
2322 out:
2323 	return rc;
2324 }
2325 
2326 static void *established_get_next(struct seq_file *seq, void *cur)
2327 {
2328 	struct tcp_seq_afinfo *afinfo;
2329 	struct sock *sk = cur;
2330 	struct hlist_nulls_node *node;
2331 	struct tcp_iter_state *st = seq->private;
2332 	struct net *net = seq_file_net(seq);
2333 
2334 	if (st->bpf_seq_afinfo)
2335 		afinfo = st->bpf_seq_afinfo;
2336 	else
2337 		afinfo = PDE_DATA(file_inode(seq->file));
2338 
2339 	++st->num;
2340 	++st->offset;
2341 
2342 	sk = sk_nulls_next(sk);
2343 
2344 	sk_nulls_for_each_from(sk, node) {
2345 		if ((afinfo->family == AF_UNSPEC ||
2346 		     sk->sk_family == afinfo->family) &&
2347 		    net_eq(sock_net(sk), net))
2348 			return sk;
2349 	}
2350 
2351 	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2352 	++st->bucket;
2353 	return established_get_first(seq);
2354 }
2355 
2356 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2357 {
2358 	struct tcp_iter_state *st = seq->private;
2359 	void *rc;
2360 
2361 	st->bucket = 0;
2362 	rc = established_get_first(seq);
2363 
2364 	while (rc && pos) {
2365 		rc = established_get_next(seq, rc);
2366 		--pos;
2367 	}
2368 	return rc;
2369 }
2370 
2371 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2372 {
2373 	void *rc;
2374 	struct tcp_iter_state *st = seq->private;
2375 
2376 	st->state = TCP_SEQ_STATE_LISTENING;
2377 	rc	  = listening_get_idx(seq, &pos);
2378 
2379 	if (!rc) {
2380 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2381 		rc	  = established_get_idx(seq, pos);
2382 	}
2383 
2384 	return rc;
2385 }
2386 
2387 static void *tcp_seek_last_pos(struct seq_file *seq)
2388 {
2389 	struct tcp_iter_state *st = seq->private;
2390 	int offset = st->offset;
2391 	int orig_num = st->num;
2392 	void *rc = NULL;
2393 
2394 	switch (st->state) {
2395 	case TCP_SEQ_STATE_LISTENING:
2396 		if (st->bucket >= INET_LHTABLE_SIZE)
2397 			break;
2398 		st->state = TCP_SEQ_STATE_LISTENING;
2399 		rc = listening_get_next(seq, NULL);
2400 		while (offset-- && rc)
2401 			rc = listening_get_next(seq, rc);
2402 		if (rc)
2403 			break;
2404 		st->bucket = 0;
2405 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2406 		fallthrough;
2407 	case TCP_SEQ_STATE_ESTABLISHED:
2408 		if (st->bucket > tcp_hashinfo.ehash_mask)
2409 			break;
2410 		rc = established_get_first(seq);
2411 		while (offset-- && rc)
2412 			rc = established_get_next(seq, rc);
2413 	}
2414 
2415 	st->num = orig_num;
2416 
2417 	return rc;
2418 }
2419 
2420 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2421 {
2422 	struct tcp_iter_state *st = seq->private;
2423 	void *rc;
2424 
2425 	if (*pos && *pos == st->last_pos) {
2426 		rc = tcp_seek_last_pos(seq);
2427 		if (rc)
2428 			goto out;
2429 	}
2430 
2431 	st->state = TCP_SEQ_STATE_LISTENING;
2432 	st->num = 0;
2433 	st->bucket = 0;
2434 	st->offset = 0;
2435 	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2436 
2437 out:
2438 	st->last_pos = *pos;
2439 	return rc;
2440 }
2441 EXPORT_SYMBOL(tcp_seq_start);
2442 
2443 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2444 {
2445 	struct tcp_iter_state *st = seq->private;
2446 	void *rc = NULL;
2447 
2448 	if (v == SEQ_START_TOKEN) {
2449 		rc = tcp_get_idx(seq, 0);
2450 		goto out;
2451 	}
2452 
2453 	switch (st->state) {
2454 	case TCP_SEQ_STATE_LISTENING:
2455 		rc = listening_get_next(seq, v);
2456 		if (!rc) {
2457 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2458 			st->bucket = 0;
2459 			st->offset = 0;
2460 			rc	  = established_get_first(seq);
2461 		}
2462 		break;
2463 	case TCP_SEQ_STATE_ESTABLISHED:
2464 		rc = established_get_next(seq, v);
2465 		break;
2466 	}
2467 out:
2468 	++*pos;
2469 	st->last_pos = *pos;
2470 	return rc;
2471 }
2472 EXPORT_SYMBOL(tcp_seq_next);
2473 
2474 void tcp_seq_stop(struct seq_file *seq, void *v)
2475 {
2476 	struct tcp_iter_state *st = seq->private;
2477 
2478 	switch (st->state) {
2479 	case TCP_SEQ_STATE_LISTENING:
2480 		if (v != SEQ_START_TOKEN)
2481 			spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
2482 		break;
2483 	case TCP_SEQ_STATE_ESTABLISHED:
2484 		if (v)
2485 			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2486 		break;
2487 	}
2488 }
2489 EXPORT_SYMBOL(tcp_seq_stop);
2490 
2491 static void get_openreq4(const struct request_sock *req,
2492 			 struct seq_file *f, int i)
2493 {
2494 	const struct inet_request_sock *ireq = inet_rsk(req);
2495 	long delta = req->rsk_timer.expires - jiffies;
2496 
2497 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2498 		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2499 		i,
2500 		ireq->ir_loc_addr,
2501 		ireq->ir_num,
2502 		ireq->ir_rmt_addr,
2503 		ntohs(ireq->ir_rmt_port),
2504 		TCP_SYN_RECV,
2505 		0, 0, /* could print option size, but that is af dependent. */
2506 		1,    /* timers active (only the expire timer) */
2507 		jiffies_delta_to_clock_t(delta),
2508 		req->num_timeout,
2509 		from_kuid_munged(seq_user_ns(f),
2510 				 sock_i_uid(req->rsk_listener)),
2511 		0,  /* non standard timer */
2512 		0, /* open_requests have no inode */
2513 		0,
2514 		req);
2515 }
2516 
2517 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2518 {
2519 	int timer_active;
2520 	unsigned long timer_expires;
2521 	const struct tcp_sock *tp = tcp_sk(sk);
2522 	const struct inet_connection_sock *icsk = inet_csk(sk);
2523 	const struct inet_sock *inet = inet_sk(sk);
2524 	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2525 	__be32 dest = inet->inet_daddr;
2526 	__be32 src = inet->inet_rcv_saddr;
2527 	__u16 destp = ntohs(inet->inet_dport);
2528 	__u16 srcp = ntohs(inet->inet_sport);
2529 	int rx_queue;
2530 	int state;
2531 
2532 	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2533 	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2534 	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2535 		timer_active	= 1;
2536 		timer_expires	= icsk->icsk_timeout;
2537 	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2538 		timer_active	= 4;
2539 		timer_expires	= icsk->icsk_timeout;
2540 	} else if (timer_pending(&sk->sk_timer)) {
2541 		timer_active	= 2;
2542 		timer_expires	= sk->sk_timer.expires;
2543 	} else {
2544 		timer_active	= 0;
2545 		timer_expires = jiffies;
2546 	}
2547 
2548 	state = inet_sk_state_load(sk);
2549 	if (state == TCP_LISTEN)
2550 		rx_queue = READ_ONCE(sk->sk_ack_backlog);
2551 	else
2552 		/* Because we don't lock the socket,
2553 		 * we might find a transient negative value.
2554 		 */
2555 		rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2556 				      READ_ONCE(tp->copied_seq), 0);
2557 
2558 	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2559 			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2560 		i, src, srcp, dest, destp, state,
2561 		READ_ONCE(tp->write_seq) - tp->snd_una,
2562 		rx_queue,
2563 		timer_active,
2564 		jiffies_delta_to_clock_t(timer_expires - jiffies),
2565 		icsk->icsk_retransmits,
2566 		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2567 		icsk->icsk_probes_out,
2568 		sock_i_ino(sk),
2569 		refcount_read(&sk->sk_refcnt), sk,
2570 		jiffies_to_clock_t(icsk->icsk_rto),
2571 		jiffies_to_clock_t(icsk->icsk_ack.ato),
2572 		(icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2573 		tp->snd_cwnd,
2574 		state == TCP_LISTEN ?
2575 		    fastopenq->max_qlen :
2576 		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2577 }
2578 
2579 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2580 			       struct seq_file *f, int i)
2581 {
2582 	long delta = tw->tw_timer.expires - jiffies;
2583 	__be32 dest, src;
2584 	__u16 destp, srcp;
2585 
2586 	dest  = tw->tw_daddr;
2587 	src   = tw->tw_rcv_saddr;
2588 	destp = ntohs(tw->tw_dport);
2589 	srcp  = ntohs(tw->tw_sport);
2590 
2591 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2592 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2593 		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2594 		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2595 		refcount_read(&tw->tw_refcnt), tw);
2596 }
2597 
2598 #define TMPSZ 150
2599 
2600 static int tcp4_seq_show(struct seq_file *seq, void *v)
2601 {
2602 	struct tcp_iter_state *st;
2603 	struct sock *sk = v;
2604 
2605 	seq_setwidth(seq, TMPSZ - 1);
2606 	if (v == SEQ_START_TOKEN) {
2607 		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2608 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2609 			   "inode");
2610 		goto out;
2611 	}
2612 	st = seq->private;
2613 
2614 	if (sk->sk_state == TCP_TIME_WAIT)
2615 		get_timewait4_sock(v, seq, st->num);
2616 	else if (sk->sk_state == TCP_NEW_SYN_RECV)
2617 		get_openreq4(v, seq, st->num);
2618 	else
2619 		get_tcp4_sock(v, seq, st->num);
2620 out:
2621 	seq_pad(seq, '\n');
2622 	return 0;
2623 }
2624 
2625 #ifdef CONFIG_BPF_SYSCALL
2626 struct bpf_iter__tcp {
2627 	__bpf_md_ptr(struct bpf_iter_meta *, meta);
2628 	__bpf_md_ptr(struct sock_common *, sk_common);
2629 	uid_t uid __aligned(8);
2630 };
2631 
2632 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
2633 			     struct sock_common *sk_common, uid_t uid)
2634 {
2635 	struct bpf_iter__tcp ctx;
2636 
2637 	meta->seq_num--;  /* skip SEQ_START_TOKEN */
2638 	ctx.meta = meta;
2639 	ctx.sk_common = sk_common;
2640 	ctx.uid = uid;
2641 	return bpf_iter_run_prog(prog, &ctx);
2642 }
2643 
2644 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
2645 {
2646 	struct bpf_iter_meta meta;
2647 	struct bpf_prog *prog;
2648 	struct sock *sk = v;
2649 	uid_t uid;
2650 
2651 	if (v == SEQ_START_TOKEN)
2652 		return 0;
2653 
2654 	if (sk->sk_state == TCP_TIME_WAIT) {
2655 		uid = 0;
2656 	} else if (sk->sk_state == TCP_NEW_SYN_RECV) {
2657 		const struct request_sock *req = v;
2658 
2659 		uid = from_kuid_munged(seq_user_ns(seq),
2660 				       sock_i_uid(req->rsk_listener));
2661 	} else {
2662 		uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
2663 	}
2664 
2665 	meta.seq = seq;
2666 	prog = bpf_iter_get_info(&meta, false);
2667 	return tcp_prog_seq_show(prog, &meta, v, uid);
2668 }
2669 
2670 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
2671 {
2672 	struct bpf_iter_meta meta;
2673 	struct bpf_prog *prog;
2674 
2675 	if (!v) {
2676 		meta.seq = seq;
2677 		prog = bpf_iter_get_info(&meta, true);
2678 		if (prog)
2679 			(void)tcp_prog_seq_show(prog, &meta, v, 0);
2680 	}
2681 
2682 	tcp_seq_stop(seq, v);
2683 }
2684 
2685 static const struct seq_operations bpf_iter_tcp_seq_ops = {
2686 	.show		= bpf_iter_tcp_seq_show,
2687 	.start		= tcp_seq_start,
2688 	.next		= tcp_seq_next,
2689 	.stop		= bpf_iter_tcp_seq_stop,
2690 };
2691 #endif
2692 
2693 static const struct seq_operations tcp4_seq_ops = {
2694 	.show		= tcp4_seq_show,
2695 	.start		= tcp_seq_start,
2696 	.next		= tcp_seq_next,
2697 	.stop		= tcp_seq_stop,
2698 };
2699 
2700 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2701 	.family		= AF_INET,
2702 };
2703 
2704 static int __net_init tcp4_proc_init_net(struct net *net)
2705 {
2706 	if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
2707 			sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
2708 		return -ENOMEM;
2709 	return 0;
2710 }
2711 
2712 static void __net_exit tcp4_proc_exit_net(struct net *net)
2713 {
2714 	remove_proc_entry("tcp", net->proc_net);
2715 }
2716 
2717 static struct pernet_operations tcp4_net_ops = {
2718 	.init = tcp4_proc_init_net,
2719 	.exit = tcp4_proc_exit_net,
2720 };
2721 
2722 int __init tcp4_proc_init(void)
2723 {
2724 	return register_pernet_subsys(&tcp4_net_ops);
2725 }
2726 
2727 void tcp4_proc_exit(void)
2728 {
2729 	unregister_pernet_subsys(&tcp4_net_ops);
2730 }
2731 #endif /* CONFIG_PROC_FS */
2732 
2733 struct proto tcp_prot = {
2734 	.name			= "TCP",
2735 	.owner			= THIS_MODULE,
2736 	.close			= tcp_close,
2737 	.pre_connect		= tcp_v4_pre_connect,
2738 	.connect		= tcp_v4_connect,
2739 	.disconnect		= tcp_disconnect,
2740 	.accept			= inet_csk_accept,
2741 	.ioctl			= tcp_ioctl,
2742 	.init			= tcp_v4_init_sock,
2743 	.destroy		= tcp_v4_destroy_sock,
2744 	.shutdown		= tcp_shutdown,
2745 	.setsockopt		= tcp_setsockopt,
2746 	.getsockopt		= tcp_getsockopt,
2747 	.keepalive		= tcp_set_keepalive,
2748 	.recvmsg		= tcp_recvmsg,
2749 	.sendmsg		= tcp_sendmsg,
2750 	.sendpage		= tcp_sendpage,
2751 	.backlog_rcv		= tcp_v4_do_rcv,
2752 	.release_cb		= tcp_release_cb,
2753 	.hash			= inet_hash,
2754 	.unhash			= inet_unhash,
2755 	.get_port		= inet_csk_get_port,
2756 	.enter_memory_pressure	= tcp_enter_memory_pressure,
2757 	.leave_memory_pressure	= tcp_leave_memory_pressure,
2758 	.stream_memory_free	= tcp_stream_memory_free,
2759 	.sockets_allocated	= &tcp_sockets_allocated,
2760 	.orphan_count		= &tcp_orphan_count,
2761 	.memory_allocated	= &tcp_memory_allocated,
2762 	.memory_pressure	= &tcp_memory_pressure,
2763 	.sysctl_mem		= sysctl_tcp_mem,
2764 	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
2765 	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
2766 	.max_header		= MAX_TCP_HEADER,
2767 	.obj_size		= sizeof(struct tcp_sock),
2768 	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
2769 	.twsk_prot		= &tcp_timewait_sock_ops,
2770 	.rsk_prot		= &tcp_request_sock_ops,
2771 	.h.hashinfo		= &tcp_hashinfo,
2772 	.no_autobind		= true,
2773 	.diag_destroy		= tcp_abort,
2774 };
2775 EXPORT_SYMBOL(tcp_prot);
2776 
2777 static void __net_exit tcp_sk_exit(struct net *net)
2778 {
2779 	int cpu;
2780 
2781 	if (net->ipv4.tcp_congestion_control)
2782 		bpf_module_put(net->ipv4.tcp_congestion_control,
2783 			       net->ipv4.tcp_congestion_control->owner);
2784 
2785 	for_each_possible_cpu(cpu)
2786 		inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2787 	free_percpu(net->ipv4.tcp_sk);
2788 }
2789 
2790 static int __net_init tcp_sk_init(struct net *net)
2791 {
2792 	int res, cpu, cnt;
2793 
2794 	net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2795 	if (!net->ipv4.tcp_sk)
2796 		return -ENOMEM;
2797 
2798 	for_each_possible_cpu(cpu) {
2799 		struct sock *sk;
2800 
2801 		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2802 					   IPPROTO_TCP, net);
2803 		if (res)
2804 			goto fail;
2805 		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2806 
2807 		/* Please enforce IP_DF and IPID==0 for RST and
2808 		 * ACK sent in SYN-RECV and TIME-WAIT state.
2809 		 */
2810 		inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
2811 
2812 		*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2813 	}
2814 
2815 	net->ipv4.sysctl_tcp_ecn = 2;
2816 	net->ipv4.sysctl_tcp_ecn_fallback = 1;
2817 
2818 	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2819 	net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
2820 	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2821 	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2822 	net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
2823 
2824 	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2825 	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2826 	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2827 
2828 	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2829 	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2830 	net->ipv4.sysctl_tcp_syncookies = 1;
2831 	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2832 	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2833 	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2834 	net->ipv4.sysctl_tcp_orphan_retries = 0;
2835 	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2836 	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
2837 	net->ipv4.sysctl_tcp_tw_reuse = 2;
2838 	net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
2839 
2840 	cnt = tcp_hashinfo.ehash_mask + 1;
2841 	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
2842 	net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
2843 
2844 	net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
2845 	net->ipv4.sysctl_tcp_sack = 1;
2846 	net->ipv4.sysctl_tcp_window_scaling = 1;
2847 	net->ipv4.sysctl_tcp_timestamps = 1;
2848 	net->ipv4.sysctl_tcp_early_retrans = 3;
2849 	net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
2850 	net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
2851 	net->ipv4.sysctl_tcp_retrans_collapse = 1;
2852 	net->ipv4.sysctl_tcp_max_reordering = 300;
2853 	net->ipv4.sysctl_tcp_dsack = 1;
2854 	net->ipv4.sysctl_tcp_app_win = 31;
2855 	net->ipv4.sysctl_tcp_adv_win_scale = 1;
2856 	net->ipv4.sysctl_tcp_frto = 2;
2857 	net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
2858 	/* This limits the percentage of the congestion window which we
2859 	 * will allow a single TSO frame to consume.  Building TSO frames
2860 	 * which are too large can cause TCP streams to be bursty.
2861 	 */
2862 	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
2863 	/* Default TSQ limit of 16 TSO segments */
2864 	net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
2865 	/* rfc5961 challenge ack rate limiting */
2866 	net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
2867 	net->ipv4.sysctl_tcp_min_tso_segs = 2;
2868 	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
2869 	net->ipv4.sysctl_tcp_autocorking = 1;
2870 	net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
2871 	net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
2872 	net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
2873 	if (net != &init_net) {
2874 		memcpy(net->ipv4.sysctl_tcp_rmem,
2875 		       init_net.ipv4.sysctl_tcp_rmem,
2876 		       sizeof(init_net.ipv4.sysctl_tcp_rmem));
2877 		memcpy(net->ipv4.sysctl_tcp_wmem,
2878 		       init_net.ipv4.sysctl_tcp_wmem,
2879 		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
2880 	}
2881 	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
2882 	net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
2883 	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
2884 	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
2885 	spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
2886 	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
2887 	atomic_set(&net->ipv4.tfo_active_disable_times, 0);
2888 
2889 	/* Reno is always built in */
2890 	if (!net_eq(net, &init_net) &&
2891 	    bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
2892 			       init_net.ipv4.tcp_congestion_control->owner))
2893 		net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
2894 	else
2895 		net->ipv4.tcp_congestion_control = &tcp_reno;
2896 
2897 	return 0;
2898 fail:
2899 	tcp_sk_exit(net);
2900 
2901 	return res;
2902 }
2903 
2904 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2905 {
2906 	struct net *net;
2907 
2908 	inet_twsk_purge(&tcp_hashinfo, AF_INET);
2909 
2910 	list_for_each_entry(net, net_exit_list, exit_list)
2911 		tcp_fastopen_ctx_destroy(net);
2912 }
2913 
2914 static struct pernet_operations __net_initdata tcp_sk_ops = {
2915        .init	   = tcp_sk_init,
2916        .exit	   = tcp_sk_exit,
2917        .exit_batch = tcp_sk_exit_batch,
2918 };
2919 
2920 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
2921 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
2922 		     struct sock_common *sk_common, uid_t uid)
2923 
2924 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
2925 {
2926 	struct tcp_iter_state *st = priv_data;
2927 	struct tcp_seq_afinfo *afinfo;
2928 	int ret;
2929 
2930 	afinfo = kmalloc(sizeof(*afinfo), GFP_USER | __GFP_NOWARN);
2931 	if (!afinfo)
2932 		return -ENOMEM;
2933 
2934 	afinfo->family = AF_UNSPEC;
2935 	st->bpf_seq_afinfo = afinfo;
2936 	ret = bpf_iter_init_seq_net(priv_data, aux);
2937 	if (ret)
2938 		kfree(afinfo);
2939 	return ret;
2940 }
2941 
2942 static void bpf_iter_fini_tcp(void *priv_data)
2943 {
2944 	struct tcp_iter_state *st = priv_data;
2945 
2946 	kfree(st->bpf_seq_afinfo);
2947 	bpf_iter_fini_seq_net(priv_data);
2948 }
2949 
2950 static const struct bpf_iter_seq_info tcp_seq_info = {
2951 	.seq_ops		= &bpf_iter_tcp_seq_ops,
2952 	.init_seq_private	= bpf_iter_init_tcp,
2953 	.fini_seq_private	= bpf_iter_fini_tcp,
2954 	.seq_priv_size		= sizeof(struct tcp_iter_state),
2955 };
2956 
2957 static struct bpf_iter_reg tcp_reg_info = {
2958 	.target			= "tcp",
2959 	.ctx_arg_info_size	= 1,
2960 	.ctx_arg_info		= {
2961 		{ offsetof(struct bpf_iter__tcp, sk_common),
2962 		  PTR_TO_BTF_ID_OR_NULL },
2963 	},
2964 	.seq_info		= &tcp_seq_info,
2965 };
2966 
2967 static void __init bpf_iter_register(void)
2968 {
2969 	tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
2970 	if (bpf_iter_reg_target(&tcp_reg_info))
2971 		pr_warn("Warning: could not register bpf iterator tcp\n");
2972 }
2973 
2974 #endif
2975 
2976 void __init tcp_v4_init(void)
2977 {
2978 	if (register_pernet_subsys(&tcp_sk_ops))
2979 		panic("Failed to create the TCP control socket.\n");
2980 
2981 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
2982 	bpf_iter_register();
2983 #endif
2984 }
2985