xref: /openbmc/linux/net/ipv4/tcp_ipv4.c (revision 80d0624d)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		Implementation of the Transmission Control Protocol(TCP).
8  *
9  *		IPv4 specific functions
10  *
11  *		code split from:
12  *		linux/ipv4/tcp.c
13  *		linux/ipv4/tcp_input.c
14  *		linux/ipv4/tcp_output.c
15  *
16  *		See tcp.c for author information
17  */
18 
19 /*
20  * Changes:
21  *		David S. Miller	:	New socket lookup architecture.
22  *					This code is dedicated to John Dyson.
23  *		David S. Miller :	Change semantics of established hash,
24  *					half is devoted to TIME_WAIT sockets
25  *					and the rest go in the other half.
26  *		Andi Kleen :		Add support for syncookies and fixed
27  *					some bugs: ip options weren't passed to
28  *					the TCP layer, missed a check for an
29  *					ACK bit.
30  *		Andi Kleen :		Implemented fast path mtu discovery.
31  *	     				Fixed many serious bugs in the
32  *					request_sock handling and moved
33  *					most of it into the af independent code.
34  *					Added tail drop and some other bugfixes.
35  *					Added new listen semantics.
36  *		Mike McLagan	:	Routing by source
37  *	Juan Jose Ciarlante:		ip_dynaddr bits
38  *		Andi Kleen:		various fixes.
39  *	Vitaly E. Lavrov	:	Transparent proxy revived after year
40  *					coma.
41  *	Andi Kleen		:	Fix new listen.
42  *	Andi Kleen		:	Fix accept error reporting.
43  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
44  *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
45  *					a single port at the same time.
46  */
47 
48 #define pr_fmt(fmt) "TCP: " fmt
49 
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
60 #include <linux/sched.h>
61 
62 #include <net/net_namespace.h>
63 #include <net/icmp.h>
64 #include <net/inet_hashtables.h>
65 #include <net/tcp.h>
66 #include <net/transp_v6.h>
67 #include <net/ipv6.h>
68 #include <net/inet_common.h>
69 #include <net/timewait_sock.h>
70 #include <net/xfrm.h>
71 #include <net/secure_seq.h>
72 #include <net/busy_poll.h>
73 
74 #include <linux/inet.h>
75 #include <linux/ipv6.h>
76 #include <linux/stddef.h>
77 #include <linux/proc_fs.h>
78 #include <linux/seq_file.h>
79 #include <linux/inetdevice.h>
80 #include <linux/btf_ids.h>
81 
82 #include <crypto/hash.h>
83 #include <linux/scatterlist.h>
84 
85 #include <trace/events/tcp.h>
86 
87 #ifdef CONFIG_TCP_MD5SIG
88 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
89 			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
90 #endif
91 
92 struct inet_hashinfo tcp_hashinfo;
93 EXPORT_SYMBOL(tcp_hashinfo);
94 
95 static DEFINE_PER_CPU(struct sock *, ipv4_tcp_sk);
96 
97 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
98 {
99 	return secure_tcp_seq(ip_hdr(skb)->daddr,
100 			      ip_hdr(skb)->saddr,
101 			      tcp_hdr(skb)->dest,
102 			      tcp_hdr(skb)->source);
103 }
104 
105 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
106 {
107 	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
108 }
109 
110 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
111 {
112 	int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
113 	const struct inet_timewait_sock *tw = inet_twsk(sktw);
114 	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
115 	struct tcp_sock *tp = tcp_sk(sk);
116 
117 	if (reuse == 2) {
118 		/* Still does not detect *everything* that goes through
119 		 * lo, since we require a loopback src or dst address
120 		 * or a direct binding to the 'lo' interface.
121 		 */
122 		bool loopback = false;
123 		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
124 			loopback = true;
125 #if IS_ENABLED(CONFIG_IPV6)
126 		if (tw->tw_family == AF_INET6) {
127 			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
128 			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
129 			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
130 			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
131 				loopback = true;
132 		} else
133 #endif
134 		{
135 			if (ipv4_is_loopback(tw->tw_daddr) ||
136 			    ipv4_is_loopback(tw->tw_rcv_saddr))
137 				loopback = true;
138 		}
139 		if (!loopback)
140 			reuse = 0;
141 	}
142 
143 	/* With PAWS, it is safe from the viewpoint
144 	   of data integrity. Even without PAWS it is safe provided the sequence
145 	   spaces do not overlap, i.e. at data rates <= 80Mbit/sec.
146 
147 	   Actually, the idea is close to VJ's, except that the timestamp cache
148 	   is held not per host but per port pair, and the TW bucket is used as
149 	   the state holder.
150 
151 	   If the TW bucket has already been destroyed we fall back to VJ's
152 	   scheme and use the initial timestamp retrieved from the peer table.
153 	 */
154 	if (tcptw->tw_ts_recent_stamp &&
155 	    (!twp || (reuse && time_after32(ktime_get_seconds(),
156 					    tcptw->tw_ts_recent_stamp)))) {
157 		/* inet_twsk_hashdance() sets sk_refcnt after putting twsk
158 		 * and releasing the bucket lock.
159 		 */
160 		if (unlikely(!refcount_inc_not_zero(&sktw->sk_refcnt)))
161 			return 0;
162 
163 		/* In case of repair and re-using TIME-WAIT sockets we still
164 		 * want to be sure that it is safe as above but honor the
165 		 * sequence numbers and time stamps set as part of the repair
166 		 * process.
167 		 *
168 		 * Without this check re-using a TIME-WAIT socket with TCP
169 		 * repair would accumulate a -1 on the repair assigned
170 		 * sequence number. The first time it is reused the sequence
171 		 * is -1, the second time -2, etc. This fixes that issue
172 		 * without appearing to create any others.
173 		 */
174 		if (likely(!tp->repair)) {
175 			u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
176 
177 			if (!seq)
178 				seq = 1;
179 			WRITE_ONCE(tp->write_seq, seq);
180 			tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
181 			tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
182 		}
183 
184 		return 1;
185 	}
186 
187 	return 0;
188 }
189 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
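
/*
 * The reuse decision above is gated by the net.ipv4.tcp_tw_reuse sysctl:
 * 0 disables it, 1 allows reusing a TIME-WAIT socket for a new outgoing
 * connection when the timestamp check says it is safe, and 2 restricts the
 * reuse to loopback traffic as detected by the checks above.
 */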
190 
191 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
192 			      int addr_len)
193 {
194 	/* This check is replicated from tcp_v4_connect() and intended to
195 	 * prevent the BPF program called below from accessing bytes that are
196 	 * out of the bound specified by the user in addr_len.
197 	 */
198 	if (addr_len < sizeof(struct sockaddr_in))
199 		return -EINVAL;
200 
201 	sock_owned_by_me(sk);
202 
203 	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr, &addr_len);
204 }
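
/*
 * The BPF_CGROUP_RUN_PROG_INET4_CONNECT() hook above runs any attached
 * cgroup/connect4 BPF program, which may rewrite the destination address
 * and port in uaddr before tcp_v4_connect() below ever sees them.
 */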
205 
206 /* This will initiate an outgoing connection. */
207 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
208 {
209 	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
210 	struct inet_timewait_death_row *tcp_death_row;
211 	struct inet_sock *inet = inet_sk(sk);
212 	struct tcp_sock *tp = tcp_sk(sk);
213 	struct ip_options_rcu *inet_opt;
214 	struct net *net = sock_net(sk);
215 	__be16 orig_sport, orig_dport;
216 	__be32 daddr, nexthop;
217 	struct flowi4 *fl4;
218 	struct rtable *rt;
219 	int err;
220 
221 	if (addr_len < sizeof(struct sockaddr_in))
222 		return -EINVAL;
223 
224 	if (usin->sin_family != AF_INET)
225 		return -EAFNOSUPPORT;
226 
227 	nexthop = daddr = usin->sin_addr.s_addr;
228 	inet_opt = rcu_dereference_protected(inet->inet_opt,
229 					     lockdep_sock_is_held(sk));
230 	if (inet_opt && inet_opt->opt.srr) {
231 		if (!daddr)
232 			return -EINVAL;
233 		nexthop = inet_opt->opt.faddr;
234 	}
235 
236 	orig_sport = inet->inet_sport;
237 	orig_dport = usin->sin_port;
238 	fl4 = &inet->cork.fl.u.ip4;
239 	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
240 			      sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
241 			      orig_dport, sk);
242 	if (IS_ERR(rt)) {
243 		err = PTR_ERR(rt);
244 		if (err == -ENETUNREACH)
245 			IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
246 		return err;
247 	}
248 
249 	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
250 		ip_rt_put(rt);
251 		return -ENETUNREACH;
252 	}
253 
254 	if (!inet_opt || !inet_opt->opt.srr)
255 		daddr = fl4->daddr;
256 
257 	tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
258 
259 	if (!inet->inet_saddr) {
260 		err = inet_bhash2_update_saddr(sk,  &fl4->saddr, AF_INET);
261 		if (err) {
262 			ip_rt_put(rt);
263 			return err;
264 		}
265 	} else {
266 		sk_rcv_saddr_set(sk, inet->inet_saddr);
267 	}
268 
269 	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
270 		/* Reset inherited state */
271 		tp->rx_opt.ts_recent	   = 0;
272 		tp->rx_opt.ts_recent_stamp = 0;
273 		if (likely(!tp->repair))
274 			WRITE_ONCE(tp->write_seq, 0);
275 	}
276 
277 	inet->inet_dport = usin->sin_port;
278 	sk_daddr_set(sk, daddr);
279 
280 	inet_csk(sk)->icsk_ext_hdr_len = 0;
281 	if (inet_opt)
282 		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
283 
284 	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
285 
286 	/* Socket identity is still unknown (sport may be zero).
287 	 * However we set the state to SYN-SENT and, without releasing the
288 	 * socket lock, select a source port, enter ourselves into the hash
289 	 * tables and complete initialization afterwards.
290 	 */
291 	tcp_set_state(sk, TCP_SYN_SENT);
292 	err = inet_hash_connect(tcp_death_row, sk);
293 	if (err)
294 		goto failure;
295 
296 	sk_set_txhash(sk);
297 
298 	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
299 			       inet->inet_sport, inet->inet_dport, sk);
300 	if (IS_ERR(rt)) {
301 		err = PTR_ERR(rt);
302 		rt = NULL;
303 		goto failure;
304 	}
305 	/* OK, now commit destination to socket.  */
306 	sk->sk_gso_type = SKB_GSO_TCPV4;
307 	sk_setup_caps(sk, &rt->dst);
308 	rt = NULL;
309 
310 	if (likely(!tp->repair)) {
311 		if (!tp->write_seq)
312 			WRITE_ONCE(tp->write_seq,
313 				   secure_tcp_seq(inet->inet_saddr,
314 						  inet->inet_daddr,
315 						  inet->inet_sport,
316 						  usin->sin_port));
317 		WRITE_ONCE(tp->tsoffset,
318 			   secure_tcp_ts_off(net, inet->inet_saddr,
319 					     inet->inet_daddr));
320 	}
321 
322 	atomic_set(&inet->inet_id, get_random_u16());
323 
324 	if (tcp_fastopen_defer_connect(sk, &err))
325 		return err;
326 	if (err)
327 		goto failure;
328 
329 	err = tcp_connect(sk);
330 
331 	if (err)
332 		goto failure;
333 
334 	return 0;
335 
336 failure:
337 	/*
338 	 * This unhashes the socket and releases the local port,
339 	 * if necessary.
340 	 */
341 	tcp_set_state(sk, TCP_CLOSE);
342 	inet_bhash2_reset_saddr(sk);
343 	ip_rt_put(rt);
344 	sk->sk_route_caps = 0;
345 	inet->inet_dport = 0;
346 	return err;
347 }
348 EXPORT_SYMBOL(tcp_v4_connect);
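
/*
 * For reference, this function is reached from the ordinary connect(2)
 * system call on an AF_INET stream socket. A minimal userspace sketch
 * (not kernel code; the address and port are placeholders):
 *
 *	int fd = socket(AF_INET, SOCK_STREAM, 0);
 *	struct sockaddr_in dst = {
 *		.sin_family = AF_INET,
 *		.sin_port   = htons(80),
 *	};
 *	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);
 *	connect(fd, (struct sockaddr *)&dst, sizeof(dst));
 *
 * The addr_len and sin_family checks at the top of tcp_v4_connect()
 * correspond directly to the arguments passed here.
 */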
349 
350 /*
351  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
352  * It can be called through tcp_release_cb() if socket was owned by user
353  * at the time tcp_v4_err() was called to handle ICMP message.
354  */
355 void tcp_v4_mtu_reduced(struct sock *sk)
356 {
357 	struct inet_sock *inet = inet_sk(sk);
358 	struct dst_entry *dst;
359 	u32 mtu;
360 
361 	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
362 		return;
363 	mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
364 	dst = inet_csk_update_pmtu(sk, mtu);
365 	if (!dst)
366 		return;
367 
368 	/* Something is about to go wrong... Remember the soft error
369 	 * in case this connection is not able to recover.
370 	 */
371 	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
372 		WRITE_ONCE(sk->sk_err_soft, EMSGSIZE);
373 
374 	mtu = dst_mtu(dst);
375 
376 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
377 	    ip_sk_accept_pmtu(sk) &&
378 	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
379 		tcp_sync_mss(sk, mtu);
380 
381 		/* Resend the TCP packet because it's
382 		 * clear that the old packet has been
383 		 * dropped. This is the new "fast" path mtu
384 		 * discovery.
385 		 */
386 		tcp_simple_retransmit(sk);
387 	} /* else let the usual retransmit timer handle it */
388 }
389 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
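
/*
 * When the socket is owned by the user at the time the ICMP arrives,
 * tcp_v4_err() does not call this directly; it sets TCP_MTU_REDUCED_DEFERRED
 * in sk->sk_tsq_flags instead, and tcp_release_cb() invokes this function
 * once the socket lock is released (see the ICMP_FRAG_NEEDED handling below).
 */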
390 
391 static void do_redirect(struct sk_buff *skb, struct sock *sk)
392 {
393 	struct dst_entry *dst = __sk_dst_check(sk, 0);
394 
395 	if (dst)
396 		dst->ops->redirect(dst, sk, skb);
397 }
398 
399 
400 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
401 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
402 {
403 	struct request_sock *req = inet_reqsk(sk);
404 	struct net *net = sock_net(sk);
405 
406 	/* ICMPs are not backlogged, hence we cannot get
407 	 * an established socket here.
408 	 */
409 	if (seq != tcp_rsk(req)->snt_isn) {
410 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
411 	} else if (abort) {
412 		/*
413 		 * Still in SYN_RECV, just remove it silently.
414 		 * There is no good way to pass the error to the newly
415 		 * created socket, and POSIX does not want network
416 		 * errors returned from accept().
417 		 */
418 		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
419 		tcp_listendrop(req->rsk_listener);
420 	}
421 	reqsk_put(req);
422 }
423 EXPORT_SYMBOL(tcp_req_err);
424 
425 /* TCP-LD (RFC 6069) logic */
426 void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
427 {
428 	struct inet_connection_sock *icsk = inet_csk(sk);
429 	struct tcp_sock *tp = tcp_sk(sk);
430 	struct sk_buff *skb;
431 	s32 remaining;
432 	u32 delta_us;
433 
434 	if (sock_owned_by_user(sk))
435 		return;
436 
437 	if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
438 	    !icsk->icsk_backoff)
439 		return;
440 
441 	skb = tcp_rtx_queue_head(sk);
442 	if (WARN_ON_ONCE(!skb))
443 		return;
444 
445 	icsk->icsk_backoff--;
446 	icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
447 	icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
448 
449 	tcp_mstamp_refresh(tp);
450 	delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
451 	remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
452 
453 	if (remaining > 0) {
454 		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
455 					  remaining, TCP_RTO_MAX);
456 	} else {
457 		/* RTO revert clocked out retransmission.
458 		 * Will retransmit now.
459 		 */
460 		tcp_retransmit_timer(sk);
461 	}
462 }
463 EXPORT_SYMBOL(tcp_ld_RTO_revert);
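
/*
 * A rough worked example of the revert above, assuming a base RTO of
 * 200 ms: with icsk_backoff == 3 the pending retransmit timer was armed
 * for roughly 200 ms << 3 = 1600 ms. After decrementing the backoff and
 * recomputing, inet_csk_rto_backoff() yields 200 ms << 2 = 800 ms, so the
 * timer is re-armed for 800 ms minus the time already elapsed since the
 * head of the retransmit queue was sent, or the retransmit runs
 * immediately if that difference is not positive.
 */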
464 
465 /*
466  * This routine is called by the ICMP module when it gets some
467  * sort of error condition.  If err < 0 then the socket should
468  * be closed and the error returned to the user.  If err > 0
469  * it's just the icmp type << 8 | icmp code.  After adjustment the
470  * header points to the first 8 bytes of the tcp header.  We need
471  * to find the appropriate port.
472  *
473  * The locking strategy used here is very "optimistic". When
474  * someone else is accessing the socket, the ICMP message is just
475  * dropped, and for some paths there is no check at all.
476  * A more general error queue to hold errors for later handling
477  * would probably be better.
478  *
479  */
480 
481 int tcp_v4_err(struct sk_buff *skb, u32 info)
482 {
483 	const struct iphdr *iph = (const struct iphdr *)skb->data;
484 	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
485 	struct tcp_sock *tp;
486 	const int type = icmp_hdr(skb)->type;
487 	const int code = icmp_hdr(skb)->code;
488 	struct sock *sk;
489 	struct request_sock *fastopen;
490 	u32 seq, snd_una;
491 	int err;
492 	struct net *net = dev_net(skb->dev);
493 
494 	sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
495 				       iph->daddr, th->dest, iph->saddr,
496 				       ntohs(th->source), inet_iif(skb), 0);
497 	if (!sk) {
498 		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
499 		return -ENOENT;
500 	}
501 	if (sk->sk_state == TCP_TIME_WAIT) {
502 		inet_twsk_put(inet_twsk(sk));
503 		return 0;
504 	}
505 	seq = ntohl(th->seq);
506 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
507 		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
508 				     type == ICMP_TIME_EXCEEDED ||
509 				     (type == ICMP_DEST_UNREACH &&
510 				      (code == ICMP_NET_UNREACH ||
511 				       code == ICMP_HOST_UNREACH)));
512 		return 0;
513 	}
514 
515 	bh_lock_sock(sk);
516 	/* If too many ICMPs get dropped on busy
517 	 * servers this needs to be solved differently.
518 	 * We do take care of the PMTU discovery (RFC1191) special case:
519 	 * we can receive locally generated ICMP messages while the socket is held.
520 	 */
521 	if (sock_owned_by_user(sk)) {
522 		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
523 			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
524 	}
525 	if (sk->sk_state == TCP_CLOSE)
526 		goto out;
527 
528 	if (static_branch_unlikely(&ip4_min_ttl)) {
529 		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
530 		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
531 			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
532 			goto out;
533 		}
534 	}
535 
536 	tp = tcp_sk(sk);
537 	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
538 	fastopen = rcu_dereference(tp->fastopen_rsk);
539 	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
540 	if (sk->sk_state != TCP_LISTEN &&
541 	    !between(seq, snd_una, tp->snd_nxt)) {
542 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
543 		goto out;
544 	}
545 
546 	switch (type) {
547 	case ICMP_REDIRECT:
548 		if (!sock_owned_by_user(sk))
549 			do_redirect(skb, sk);
550 		goto out;
551 	case ICMP_SOURCE_QUENCH:
552 		/* Just silently ignore these. */
553 		goto out;
554 	case ICMP_PARAMETERPROB:
555 		err = EPROTO;
556 		break;
557 	case ICMP_DEST_UNREACH:
558 		if (code > NR_ICMP_UNREACH)
559 			goto out;
560 
561 		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
562 			/* We are not interested in TCP_LISTEN and open_requests
563 			 * (SYN-ACKs sent out by Linux are always < 576 bytes so
564 			 * they should go through unfragmented).
565 			 */
566 			if (sk->sk_state == TCP_LISTEN)
567 				goto out;
568 
569 			WRITE_ONCE(tp->mtu_info, info);
570 			if (!sock_owned_by_user(sk)) {
571 				tcp_v4_mtu_reduced(sk);
572 			} else {
573 				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
574 					sock_hold(sk);
575 			}
576 			goto out;
577 		}
578 
579 		err = icmp_err_convert[code].errno;
580 		/* check if this ICMP message allows revert of backoff.
581 		 * (see RFC 6069)
582 		 */
583 		if (!fastopen &&
584 		    (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
585 			tcp_ld_RTO_revert(sk, seq);
586 		break;
587 	case ICMP_TIME_EXCEEDED:
588 		err = EHOSTUNREACH;
589 		break;
590 	default:
591 		goto out;
592 	}
593 
594 	switch (sk->sk_state) {
595 	case TCP_SYN_SENT:
596 	case TCP_SYN_RECV:
597 		/* Only in fast or simultaneous open. If a fast open socket is
598 		 * already accepted it is treated as a connected one below.
599 		 */
600 		if (fastopen && !fastopen->sk)
601 			break;
602 
603 		ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
604 
605 		if (!sock_owned_by_user(sk))
606 			tcp_done_with_error(sk, err);
607 		else
608 			WRITE_ONCE(sk->sk_err_soft, err);
609 		goto out;
610 	}
611 
612 	/* If we've already connected we will keep trying
613 	 * until we time out, or the user gives up.
614 	 *
615 	 * RFC 1122 4.2.3.9 allows us to consider as hard errors
616 	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
617 	 * but it is obsoleted by PMTU discovery).
618 	 *
619 	 * Note that in the modern internet, where routing is unreliable
620 	 * and broken firewalls sit in every dark corner sending random
621 	 * errors as ordered by their masters, even these two messages have
622 	 * finally lost their original sense (even Linux sends invalid PORT_UNREACHs).
623 	 *
624 	 * Now we are in compliance with RFCs.
625 	 *							--ANK (980905)
626 	 */
627 
628 	if (!sock_owned_by_user(sk) &&
629 	    inet_test_bit(RECVERR, sk)) {
630 		WRITE_ONCE(sk->sk_err, err);
631 		sk_error_report(sk);
632 	} else	{ /* Only an error on timeout */
633 		WRITE_ONCE(sk->sk_err_soft, err);
634 	}
635 
636 out:
637 	bh_unlock_sock(sk);
638 	sock_put(sk);
639 	return 0;
640 }
641 
642 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
643 {
644 	struct tcphdr *th = tcp_hdr(skb);
645 
646 	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
647 	skb->csum_start = skb_transport_header(skb) - skb->head;
648 	skb->csum_offset = offsetof(struct tcphdr, check);
649 }
650 
651 /* This routine computes an IPv4 TCP checksum. */
652 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
653 {
654 	const struct inet_sock *inet = inet_sk(sk);
655 
656 	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
657 }
658 EXPORT_SYMBOL(tcp_v4_send_check);
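
/*
 * Note that th->check above holds only the precomputed pseudo-header
 * portion of the checksum; the sum over the TCP header and payload is
 * completed later, either by hardware checksum offload or by
 * skb_checksum_help(), using the csum_start/csum_offset fields set in
 * __tcp_v4_send_check().
 */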
659 
660 /*
661  *	This routine will send an RST to the other tcp.
662  *
663  *	Someone asks: why do we NEVER use socket parameters (TOS, TTL etc.)
664  *		      for the reset?
665  *	Answer: if a packet caused an RST, it is not for a socket
666  *		existing in our system; if it is matched to a socket,
667  *		it is just a duplicate segment or a bug in the other side's TCP.
668  *		So we build the reply based only on parameters
669  *		that arrived with the segment.
670  *	Exception: precedence violation. We do not implement it in any case.
671  */
672 
673 #ifdef CONFIG_TCP_MD5SIG
674 #define OPTION_BYTES TCPOLEN_MD5SIG_ALIGNED
675 #else
676 #define OPTION_BYTES sizeof(__be32)
677 #endif
678 
679 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
680 {
681 	const struct tcphdr *th = tcp_hdr(skb);
682 	struct {
683 		struct tcphdr th;
684 		__be32 opt[OPTION_BYTES / sizeof(__be32)];
685 	} rep;
686 	struct ip_reply_arg arg;
687 #ifdef CONFIG_TCP_MD5SIG
688 	struct tcp_md5sig_key *key = NULL;
689 	const __u8 *hash_location = NULL;
690 	unsigned char newhash[16];
691 	int genhash;
692 	struct sock *sk1 = NULL;
693 #endif
694 	u64 transmit_time = 0;
695 	struct sock *ctl_sk;
696 	struct net *net;
697 	u32 txhash = 0;
698 
699 	/* Never send a reset in response to a reset. */
700 	if (th->rst)
701 		return;
702 
703 	/* If sk is not NULL, it means we did a successful lookup and the incoming
704 	 * route had to be correct. prequeue might have dropped our dst.
705 	 */
706 	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
707 		return;
708 
709 	/* Swap the send and the receive. */
710 	memset(&rep, 0, sizeof(rep));
711 	rep.th.dest   = th->source;
712 	rep.th.source = th->dest;
713 	rep.th.doff   = sizeof(struct tcphdr) / 4;
714 	rep.th.rst    = 1;
715 
716 	if (th->ack) {
717 		rep.th.seq = th->ack_seq;
718 	} else {
719 		rep.th.ack = 1;
720 		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
721 				       skb->len - (th->doff << 2));
722 	}
723 
724 	memset(&arg, 0, sizeof(arg));
725 	arg.iov[0].iov_base = (unsigned char *)&rep;
726 	arg.iov[0].iov_len  = sizeof(rep.th);
727 
728 	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
729 #ifdef CONFIG_TCP_MD5SIG
730 	rcu_read_lock();
731 	hash_location = tcp_parse_md5sig_option(th);
732 	if (sk && sk_fullsock(sk)) {
733 		const union tcp_md5_addr *addr;
734 		int l3index;
735 
736 		/* If sdif is set, the packet ingressed via a device
737 		 * in an L3 domain and inet_iif is set to it.
738 		 */
739 		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
740 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
741 		key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
742 	} else if (hash_location) {
743 		const union tcp_md5_addr *addr;
744 		int sdif = tcp_v4_sdif(skb);
745 		int dif = inet_iif(skb);
746 		int l3index;
747 
748 		/*
749 		 * The active side is lost. Try to find the listening socket
750 		 * through the source port, and then find the md5 key through
751 		 * the listening socket. We do not lose security here:
752 		 * the incoming packet is checked with the md5 hash of the found
753 		 * key, and no RST is generated if the md5 hash doesn't match.
754 		 */
755 		sk1 = __inet_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo,
756 					     NULL, 0, ip_hdr(skb)->saddr,
757 					     th->source, ip_hdr(skb)->daddr,
758 					     ntohs(th->source), dif, sdif);
759 		/* don't send an RST if we can't find a key */
760 		if (!sk1)
761 			goto out;
762 
763 		/* If sdif is set, the packet ingressed via a device
764 		 * in an L3 domain and dif is set to it.
765 		 */
766 		l3index = sdif ? dif : 0;
767 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
768 		key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
769 		if (!key)
770 			goto out;
771 
772 
773 		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
774 		if (genhash || memcmp(hash_location, newhash, 16) != 0)
775 			goto out;
776 
777 	}
778 
779 	if (key) {
780 		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
781 				   (TCPOPT_NOP << 16) |
782 				   (TCPOPT_MD5SIG << 8) |
783 				   TCPOLEN_MD5SIG);
784 		/* Update length and the length the header thinks exists */
785 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
786 		rep.th.doff = arg.iov[0].iov_len / 4;
787 
788 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
789 				     key, ip_hdr(skb)->saddr,
790 				     ip_hdr(skb)->daddr, &rep.th);
791 	}
792 #endif
793 	/* Can't co-exist with TCPMD5, hence check rep.opt[0] */
794 	if (rep.opt[0] == 0) {
795 		__be32 mrst = mptcp_reset_option(skb);
796 
797 		if (mrst) {
798 			rep.opt[0] = mrst;
799 			arg.iov[0].iov_len += sizeof(mrst);
800 			rep.th.doff = arg.iov[0].iov_len / 4;
801 		}
802 	}
803 
804 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
805 				      ip_hdr(skb)->saddr, /* XXX */
806 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
807 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
808 	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
809 
810 	/* When the socket is gone, all binding information is lost.
811 	 * Routing might fail in this case. No choice here: if we choose to force
812 	 * the input interface, we will misroute in the case of an asymmetric route.
813 	 */
814 	if (sk) {
815 		arg.bound_dev_if = sk->sk_bound_dev_if;
816 		if (sk_fullsock(sk))
817 			trace_tcp_send_reset(sk, skb);
818 	}
819 
820 	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
821 		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));
822 
823 	arg.tos = ip_hdr(skb)->tos;
824 	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
825 	local_bh_disable();
826 	ctl_sk = this_cpu_read(ipv4_tcp_sk);
827 	sock_net_set(ctl_sk, net);
828 	if (sk) {
829 		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
830 				   inet_twsk(sk)->tw_mark : sk->sk_mark;
831 		ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
832 				   inet_twsk(sk)->tw_priority : sk->sk_priority;
833 		transmit_time = tcp_transmit_time(sk);
834 		xfrm_sk_clone_policy(ctl_sk, sk);
835 		txhash = (sk->sk_state == TCP_TIME_WAIT) ?
836 			 inet_twsk(sk)->tw_txhash : sk->sk_txhash;
837 	} else {
838 		ctl_sk->sk_mark = 0;
839 		ctl_sk->sk_priority = 0;
840 	}
841 	ip_send_unicast_reply(ctl_sk,
842 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
843 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
844 			      &arg, arg.iov[0].iov_len,
845 			      transmit_time, txhash);
846 
847 	xfrm_sk_free_policy(ctl_sk);
848 	sock_net_set(ctl_sk, &init_net);
849 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
850 	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
851 	local_bh_enable();
852 
853 #ifdef CONFIG_TCP_MD5SIG
854 out:
855 	rcu_read_unlock();
856 #endif
857 }
858 
859 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
860    outside of socket context, is certainly ugly. What can I do?
861  */
862 
863 static void tcp_v4_send_ack(const struct sock *sk,
864 			    struct sk_buff *skb, u32 seq, u32 ack,
865 			    u32 win, u32 tsval, u32 tsecr, int oif,
866 			    struct tcp_md5sig_key *key,
867 			    int reply_flags, u8 tos, u32 txhash)
868 {
869 	const struct tcphdr *th = tcp_hdr(skb);
870 	struct {
871 		struct tcphdr th;
872 		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
873 #ifdef CONFIG_TCP_MD5SIG
874 			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
875 #endif
876 			];
877 	} rep;
878 	struct net *net = sock_net(sk);
879 	struct ip_reply_arg arg;
880 	struct sock *ctl_sk;
881 	u64 transmit_time;
882 
883 	memset(&rep.th, 0, sizeof(struct tcphdr));
884 	memset(&arg, 0, sizeof(arg));
885 
886 	arg.iov[0].iov_base = (unsigned char *)&rep;
887 	arg.iov[0].iov_len  = sizeof(rep.th);
888 	if (tsecr) {
889 		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
890 				   (TCPOPT_TIMESTAMP << 8) |
891 				   TCPOLEN_TIMESTAMP);
892 		rep.opt[1] = htonl(tsval);
893 		rep.opt[2] = htonl(tsecr);
894 		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
895 	}
896 
897 	/* Swap the send and the receive. */
898 	rep.th.dest    = th->source;
899 	rep.th.source  = th->dest;
900 	rep.th.doff    = arg.iov[0].iov_len / 4;
901 	rep.th.seq     = htonl(seq);
902 	rep.th.ack_seq = htonl(ack);
903 	rep.th.ack     = 1;
904 	rep.th.window  = htons(win);
905 
906 #ifdef CONFIG_TCP_MD5SIG
907 	if (key) {
908 		int offset = (tsecr) ? 3 : 0;
909 
910 		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
911 					  (TCPOPT_NOP << 16) |
912 					  (TCPOPT_MD5SIG << 8) |
913 					  TCPOLEN_MD5SIG);
914 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
915 		rep.th.doff = arg.iov[0].iov_len/4;
916 
917 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
918 				    key, ip_hdr(skb)->saddr,
919 				    ip_hdr(skb)->daddr, &rep.th);
920 	}
921 #endif
922 	arg.flags = reply_flags;
923 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
924 				      ip_hdr(skb)->saddr, /* XXX */
925 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
926 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
927 	if (oif)
928 		arg.bound_dev_if = oif;
929 	arg.tos = tos;
930 	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
931 	local_bh_disable();
932 	ctl_sk = this_cpu_read(ipv4_tcp_sk);
933 	sock_net_set(ctl_sk, net);
934 	ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
935 			   inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark);
936 	ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
937 			   inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
938 	transmit_time = tcp_transmit_time(sk);
939 	ip_send_unicast_reply(ctl_sk,
940 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
941 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
942 			      &arg, arg.iov[0].iov_len,
943 			      transmit_time, txhash);
944 
945 	sock_net_set(ctl_sk, &init_net);
946 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
947 	local_bh_enable();
948 }
949 
950 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
951 {
952 	struct inet_timewait_sock *tw = inet_twsk(sk);
953 	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
954 
955 	tcp_v4_send_ack(sk, skb,
956 			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
957 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
958 			tcp_time_stamp_raw() + tcptw->tw_ts_offset,
959 			tcptw->tw_ts_recent,
960 			tw->tw_bound_dev_if,
961 			tcp_twsk_md5_key(tcptw),
962 			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
963 			tw->tw_tos,
964 			tw->tw_txhash
965 			);
966 
967 	inet_twsk_put(tw);
968 }
969 
970 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
971 				  struct request_sock *req)
972 {
973 	const union tcp_md5_addr *addr;
974 	int l3index;
975 
976 	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
977 	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
978 	 */
979 	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
980 					     tcp_sk(sk)->snd_nxt;
981 
982 	/* RFC 7323 2.3
983 	 * The window field (SEG.WND) of every outgoing segment, with the
984 	 * exception of <SYN> segments, MUST be right-shifted by
985 	 * Rcv.Wind.Shift bits:
986 	 */
987 	addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
988 	l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
989 	tcp_v4_send_ack(sk, skb, seq,
990 			tcp_rsk(req)->rcv_nxt,
991 			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
992 			tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
993 			READ_ONCE(req->ts_recent),
994 			0,
995 			tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
996 			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
997 			ip_hdr(skb)->tos,
998 			READ_ONCE(tcp_rsk(req)->txhash));
999 }
1000 
1001 /*
1002  *	Send a SYN-ACK after having received a SYN.
1003  *	This still operates on a request_sock only, not on a big
1004  *	socket.
1005  */
1006 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
1007 			      struct flowi *fl,
1008 			      struct request_sock *req,
1009 			      struct tcp_fastopen_cookie *foc,
1010 			      enum tcp_synack_type synack_type,
1011 			      struct sk_buff *syn_skb)
1012 {
1013 	const struct inet_request_sock *ireq = inet_rsk(req);
1014 	struct flowi4 fl4;
1015 	int err = -1;
1016 	struct sk_buff *skb;
1017 	u8 tos;
1018 
1019 	/* First, grab a route. */
1020 	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
1021 		return -1;
1022 
1023 	skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
1024 
1025 	if (skb) {
1026 		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1027 
1028 		tos = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) ?
1029 				(tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1030 				(inet_sk(sk)->tos & INET_ECN_MASK) :
1031 				inet_sk(sk)->tos;
1032 
1033 		if (!INET_ECN_is_capable(tos) &&
1034 		    tcp_bpf_ca_needs_ecn((struct sock *)req))
1035 			tos |= INET_ECN_ECT_0;
1036 
1037 		rcu_read_lock();
1038 		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1039 					    ireq->ir_rmt_addr,
1040 					    rcu_dereference(ireq->ireq_opt),
1041 					    tos);
1042 		rcu_read_unlock();
1043 		err = net_xmit_eval(err);
1044 	}
1045 
1046 	return err;
1047 }
1048 
1049 /*
1050  *	IPv4 request_sock destructor.
1051  */
1052 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1053 {
1054 	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1055 }
1056 
1057 #ifdef CONFIG_TCP_MD5SIG
1058 /*
1059  * RFC2385 MD5 checksumming requires a mapping of
1060  * IP address->MD5 Key.
1061  * We need to maintain these in the sk structure.
1062  */
1063 
1064 DEFINE_STATIC_KEY_DEFERRED_FALSE(tcp_md5_needed, HZ);
1065 EXPORT_SYMBOL(tcp_md5_needed);
1066 
1067 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1068 {
1069 	if (!old)
1070 		return true;
1071 
1072 	/* l3index always overrides non-l3index */
1073 	if (old->l3index && new->l3index == 0)
1074 		return false;
1075 	if (old->l3index == 0 && new->l3index)
1076 		return true;
1077 
1078 	return old->prefixlen < new->prefixlen;
1079 }
1080 
1081 /* Find the Key structure for an address.  */
1082 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1083 					   const union tcp_md5_addr *addr,
1084 					   int family)
1085 {
1086 	const struct tcp_sock *tp = tcp_sk(sk);
1087 	struct tcp_md5sig_key *key;
1088 	const struct tcp_md5sig_info *md5sig;
1089 	__be32 mask;
1090 	struct tcp_md5sig_key *best_match = NULL;
1091 	bool match;
1092 
1093 	/* caller either holds rcu_read_lock() or socket lock */
1094 	md5sig = rcu_dereference_check(tp->md5sig_info,
1095 				       lockdep_sock_is_held(sk));
1096 	if (!md5sig)
1097 		return NULL;
1098 
1099 	hlist_for_each_entry_rcu(key, &md5sig->head, node,
1100 				 lockdep_sock_is_held(sk)) {
1101 		if (key->family != family)
1102 			continue;
1103 		if (key->flags & TCP_MD5SIG_FLAG_IFINDEX && key->l3index != l3index)
1104 			continue;
1105 		if (family == AF_INET) {
1106 			mask = inet_make_mask(key->prefixlen);
1107 			match = (key->addr.a4.s_addr & mask) ==
1108 				(addr->a4.s_addr & mask);
1109 #if IS_ENABLED(CONFIG_IPV6)
1110 		} else if (family == AF_INET6) {
1111 			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1112 						  key->prefixlen);
1113 #endif
1114 		} else {
1115 			match = false;
1116 		}
1117 
1118 		if (match && better_md5_match(best_match, key))
1119 			best_match = key;
1120 	}
1121 	return best_match;
1122 }
1123 EXPORT_SYMBOL(__tcp_md5_do_lookup);
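
/*
 * Lookup illustration: with a /16 key for 10.1.0.0 and a /24 key for
 * 10.1.2.0 both configured, a peer at 10.1.2.3 matches both entries and
 * better_md5_match() picks the /24 one (longest prefix). A key bound to an
 * L3 master device (nonzero l3index) is preferred over an unbound key,
 * regardless of prefix length.
 */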
1124 
1125 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1126 						      const union tcp_md5_addr *addr,
1127 						      int family, u8 prefixlen,
1128 						      int l3index, u8 flags)
1129 {
1130 	const struct tcp_sock *tp = tcp_sk(sk);
1131 	struct tcp_md5sig_key *key;
1132 	unsigned int size = sizeof(struct in_addr);
1133 	const struct tcp_md5sig_info *md5sig;
1134 
1135 	/* caller either holds rcu_read_lock() or socket lock */
1136 	md5sig = rcu_dereference_check(tp->md5sig_info,
1137 				       lockdep_sock_is_held(sk));
1138 	if (!md5sig)
1139 		return NULL;
1140 #if IS_ENABLED(CONFIG_IPV6)
1141 	if (family == AF_INET6)
1142 		size = sizeof(struct in6_addr);
1143 #endif
1144 	hlist_for_each_entry_rcu(key, &md5sig->head, node,
1145 				 lockdep_sock_is_held(sk)) {
1146 		if (key->family != family)
1147 			continue;
1148 		if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
1149 			continue;
1150 		if (key->l3index != l3index)
1151 			continue;
1152 		if (!memcmp(&key->addr, addr, size) &&
1153 		    key->prefixlen == prefixlen)
1154 			return key;
1155 	}
1156 	return NULL;
1157 }
1158 
1159 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1160 					 const struct sock *addr_sk)
1161 {
1162 	const union tcp_md5_addr *addr;
1163 	int l3index;
1164 
1165 	l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1166 						 addr_sk->sk_bound_dev_if);
1167 	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1168 	return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1169 }
1170 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1171 
1172 static int tcp_md5sig_info_add(struct sock *sk, gfp_t gfp)
1173 {
1174 	struct tcp_sock *tp = tcp_sk(sk);
1175 	struct tcp_md5sig_info *md5sig;
1176 
1177 	md5sig = kmalloc(sizeof(*md5sig), gfp);
1178 	if (!md5sig)
1179 		return -ENOMEM;
1180 
1181 	sk_gso_disable(sk);
1182 	INIT_HLIST_HEAD(&md5sig->head);
1183 	rcu_assign_pointer(tp->md5sig_info, md5sig);
1184 	return 0;
1185 }
1186 
1187 /* This can be called on a newly created socket, from other files */
1188 static int __tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1189 			    int family, u8 prefixlen, int l3index, u8 flags,
1190 			    const u8 *newkey, u8 newkeylen, gfp_t gfp)
1191 {
1192 	/* Add Key to the list */
1193 	struct tcp_md5sig_key *key;
1194 	struct tcp_sock *tp = tcp_sk(sk);
1195 	struct tcp_md5sig_info *md5sig;
1196 
1197 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1198 	if (key) {
1199 		/* Pre-existing entry - just update that one.
1200 		 * Note that the key might be used concurrently.
1201 		 * data_race() is telling KCSAN that we do not care about
1202 		 * key mismatches, since changing the MD5 key on live flows
1203 		 * can lead to packet drops.
1204 		 */
1205 		data_race(memcpy(key->key, newkey, newkeylen));
1206 
1207 		/* Pairs with READ_ONCE() in tcp_md5_hash_key().
1208 		 * Also note that a reader could catch the new key->keylen value
1209 		 * but the old key->key[]; this is the reason we use __GFP_ZERO
1210 		 * at sock_kmalloc() time below these lines.
1211 		 */
1212 		WRITE_ONCE(key->keylen, newkeylen);
1213 
1214 		return 0;
1215 	}
1216 
1217 	md5sig = rcu_dereference_protected(tp->md5sig_info,
1218 					   lockdep_sock_is_held(sk));
1219 
1220 	key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1221 	if (!key)
1222 		return -ENOMEM;
1223 	if (!tcp_alloc_md5sig_pool()) {
1224 		sock_kfree_s(sk, key, sizeof(*key));
1225 		return -ENOMEM;
1226 	}
1227 
1228 	memcpy(key->key, newkey, newkeylen);
1229 	key->keylen = newkeylen;
1230 	key->family = family;
1231 	key->prefixlen = prefixlen;
1232 	key->l3index = l3index;
1233 	key->flags = flags;
1234 	memcpy(&key->addr, addr,
1235 	       (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) :
1236 								 sizeof(struct in_addr));
1237 	hlist_add_head_rcu(&key->node, &md5sig->head);
1238 	return 0;
1239 }
1240 
1241 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1242 		   int family, u8 prefixlen, int l3index, u8 flags,
1243 		   const u8 *newkey, u8 newkeylen)
1244 {
1245 	struct tcp_sock *tp = tcp_sk(sk);
1246 
1247 	if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
1248 		if (tcp_md5sig_info_add(sk, GFP_KERNEL))
1249 			return -ENOMEM;
1250 
1251 		if (!static_branch_inc(&tcp_md5_needed.key)) {
1252 			struct tcp_md5sig_info *md5sig;
1253 
1254 			md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
1255 			rcu_assign_pointer(tp->md5sig_info, NULL);
1256 			kfree_rcu(md5sig, rcu);
1257 			return -EUSERS;
1258 		}
1259 	}
1260 
1261 	return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, flags,
1262 				newkey, newkeylen, GFP_KERNEL);
1263 }
1264 EXPORT_SYMBOL(tcp_md5_do_add);
1265 
1266 int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr,
1267 		     int family, u8 prefixlen, int l3index,
1268 		     struct tcp_md5sig_key *key)
1269 {
1270 	struct tcp_sock *tp = tcp_sk(sk);
1271 
1272 	if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
1273 		if (tcp_md5sig_info_add(sk, sk_gfp_mask(sk, GFP_ATOMIC)))
1274 			return -ENOMEM;
1275 
1276 		if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) {
1277 			struct tcp_md5sig_info *md5sig;
1278 
1279 			md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
1280 			net_warn_ratelimited("Too many TCP-MD5 keys in the system\n");
1281 			rcu_assign_pointer(tp->md5sig_info, NULL);
1282 			kfree_rcu(md5sig, rcu);
1283 			return -EUSERS;
1284 		}
1285 	}
1286 
1287 	return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index,
1288 				key->flags, key->key, key->keylen,
1289 				sk_gfp_mask(sk, GFP_ATOMIC));
1290 }
1291 EXPORT_SYMBOL(tcp_md5_key_copy);
1292 
1293 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1294 		   u8 prefixlen, int l3index, u8 flags)
1295 {
1296 	struct tcp_md5sig_key *key;
1297 
1298 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1299 	if (!key)
1300 		return -ENOENT;
1301 	hlist_del_rcu(&key->node);
1302 	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1303 	kfree_rcu(key, rcu);
1304 	return 0;
1305 }
1306 EXPORT_SYMBOL(tcp_md5_do_del);
1307 
1308 static void tcp_clear_md5_list(struct sock *sk)
1309 {
1310 	struct tcp_sock *tp = tcp_sk(sk);
1311 	struct tcp_md5sig_key *key;
1312 	struct hlist_node *n;
1313 	struct tcp_md5sig_info *md5sig;
1314 
1315 	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1316 
1317 	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1318 		hlist_del_rcu(&key->node);
1319 		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1320 		kfree_rcu(key, rcu);
1321 	}
1322 }
1323 
1324 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1325 				 sockptr_t optval, int optlen)
1326 {
1327 	struct tcp_md5sig cmd;
1328 	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1329 	const union tcp_md5_addr *addr;
1330 	u8 prefixlen = 32;
1331 	int l3index = 0;
1332 	u8 flags;
1333 
1334 	if (optlen < sizeof(cmd))
1335 		return -EINVAL;
1336 
1337 	if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1338 		return -EFAULT;
1339 
1340 	if (sin->sin_family != AF_INET)
1341 		return -EINVAL;
1342 
1343 	flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1344 
1345 	if (optname == TCP_MD5SIG_EXT &&
1346 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1347 		prefixlen = cmd.tcpm_prefixlen;
1348 		if (prefixlen > 32)
1349 			return -EINVAL;
1350 	}
1351 
1352 	if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
1353 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1354 		struct net_device *dev;
1355 
1356 		rcu_read_lock();
1357 		dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1358 		if (dev && netif_is_l3_master(dev))
1359 			l3index = dev->ifindex;
1360 
1361 		rcu_read_unlock();
1362 
1363 		/* ok to check dev set/not set outside of the rcu section;
1364 		 * right now the device MUST be an L3 master
1365 		 */
1366 		if (!dev || !l3index)
1367 			return -EINVAL;
1368 	}
1369 
1370 	addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1371 
1372 	if (!cmd.tcpm_keylen)
1373 		return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);
1374 
1375 	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1376 		return -EINVAL;
1377 
1378 	return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
1379 			      cmd.tcpm_key, cmd.tcpm_keylen);
1380 }
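
/*
 * For reference, the setsockopt() interface parsed above is used from
 * userspace roughly like this (values are placeholders):
 *
 *	struct tcp_md5sig md5 = { .tcpm_keylen = 6 };
 *	struct sockaddr_in *peer = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	peer->sin_family = AF_INET;
 *	inet_pton(AF_INET, "192.0.2.1", &peer->sin_addr);
 *	memcpy(md5.tcpm_key, "secret", 6);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 *
 * Passing tcpm_keylen == 0 deletes the key for that address, mirroring the
 * tcp_md5_do_del() branch above.
 */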
1381 
1382 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1383 				   __be32 daddr, __be32 saddr,
1384 				   const struct tcphdr *th, int nbytes)
1385 {
1386 	struct tcp4_pseudohdr *bp;
1387 	struct scatterlist sg;
1388 	struct tcphdr *_th;
1389 
1390 	bp = hp->scratch;
1391 	bp->saddr = saddr;
1392 	bp->daddr = daddr;
1393 	bp->pad = 0;
1394 	bp->protocol = IPPROTO_TCP;
1395 	bp->len = cpu_to_be16(nbytes);
1396 
1397 	_th = (struct tcphdr *)(bp + 1);
1398 	memcpy(_th, th, sizeof(*th));
1399 	_th->check = 0;
1400 
1401 	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1402 	ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1403 				sizeof(*bp) + sizeof(*th));
1404 	return crypto_ahash_update(hp->md5_req);
1405 }
1406 
1407 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1408 			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
1409 {
1410 	struct tcp_md5sig_pool *hp;
1411 	struct ahash_request *req;
1412 
1413 	hp = tcp_get_md5sig_pool();
1414 	if (!hp)
1415 		goto clear_hash_noput;
1416 	req = hp->md5_req;
1417 
1418 	if (crypto_ahash_init(req))
1419 		goto clear_hash;
1420 	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1421 		goto clear_hash;
1422 	if (tcp_md5_hash_key(hp, key))
1423 		goto clear_hash;
1424 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1425 	if (crypto_ahash_final(req))
1426 		goto clear_hash;
1427 
1428 	tcp_put_md5sig_pool();
1429 	return 0;
1430 
1431 clear_hash:
1432 	tcp_put_md5sig_pool();
1433 clear_hash_noput:
1434 	memset(md5_hash, 0, 16);
1435 	return 1;
1436 }
1437 
1438 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1439 			const struct sock *sk,
1440 			const struct sk_buff *skb)
1441 {
1442 	struct tcp_md5sig_pool *hp;
1443 	struct ahash_request *req;
1444 	const struct tcphdr *th = tcp_hdr(skb);
1445 	__be32 saddr, daddr;
1446 
1447 	if (sk) { /* valid for establish/request sockets */
1448 		saddr = sk->sk_rcv_saddr;
1449 		daddr = sk->sk_daddr;
1450 	} else {
1451 		const struct iphdr *iph = ip_hdr(skb);
1452 		saddr = iph->saddr;
1453 		daddr = iph->daddr;
1454 	}
1455 
1456 	hp = tcp_get_md5sig_pool();
1457 	if (!hp)
1458 		goto clear_hash_noput;
1459 	req = hp->md5_req;
1460 
1461 	if (crypto_ahash_init(req))
1462 		goto clear_hash;
1463 
1464 	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1465 		goto clear_hash;
1466 	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1467 		goto clear_hash;
1468 	if (tcp_md5_hash_key(hp, key))
1469 		goto clear_hash;
1470 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1471 	if (crypto_ahash_final(req))
1472 		goto clear_hash;
1473 
1474 	tcp_put_md5sig_pool();
1475 	return 0;
1476 
1477 clear_hash:
1478 	tcp_put_md5sig_pool();
1479 clear_hash_noput:
1480 	memset(md5_hash, 0, 16);
1481 	return 1;
1482 }
1483 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
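
/*
 * The digest computed above follows RFC 2385: the MD5 hash covers, in
 * order, the IPv4 pseudo-header, the TCP header excluding options and with
 * the checksum field zeroed, the TCP segment data, and finally the
 * connection key itself.
 */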
1484 
1485 #endif
1486 
1487 static void tcp_v4_init_req(struct request_sock *req,
1488 			    const struct sock *sk_listener,
1489 			    struct sk_buff *skb)
1490 {
1491 	struct inet_request_sock *ireq = inet_rsk(req);
1492 	struct net *net = sock_net(sk_listener);
1493 
1494 	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1495 	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1496 	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1497 }
1498 
1499 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1500 					  struct sk_buff *skb,
1501 					  struct flowi *fl,
1502 					  struct request_sock *req)
1503 {
1504 	tcp_v4_init_req(req, sk, skb);
1505 
1506 	if (security_inet_conn_request(sk, skb, req))
1507 		return NULL;
1508 
1509 	return inet_csk_route_req(sk, &fl->u.ip4, req);
1510 }
1511 
1512 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1513 	.family		=	PF_INET,
1514 	.obj_size	=	sizeof(struct tcp_request_sock),
1515 	.rtx_syn_ack	=	tcp_rtx_synack,
1516 	.send_ack	=	tcp_v4_reqsk_send_ack,
1517 	.destructor	=	tcp_v4_reqsk_destructor,
1518 	.send_reset	=	tcp_v4_send_reset,
1519 	.syn_ack_timeout =	tcp_syn_ack_timeout,
1520 };
1521 
1522 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1523 	.mss_clamp	=	TCP_MSS_DEFAULT,
1524 #ifdef CONFIG_TCP_MD5SIG
1525 	.req_md5_lookup	=	tcp_v4_md5_lookup,
1526 	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1527 #endif
1528 #ifdef CONFIG_SYN_COOKIES
1529 	.cookie_init_seq =	cookie_v4_init_sequence,
1530 #endif
1531 	.route_req	=	tcp_v4_route_req,
1532 	.init_seq	=	tcp_v4_init_seq,
1533 	.init_ts_off	=	tcp_v4_init_ts_off,
1534 	.send_synack	=	tcp_v4_send_synack,
1535 };
1536 
1537 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1538 {
1539 	/* Never answer SYNs sent to broadcast or multicast addresses */
1540 	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1541 		goto drop;
1542 
1543 	return tcp_conn_request(&tcp_request_sock_ops,
1544 				&tcp_request_sock_ipv4_ops, sk, skb);
1545 
1546 drop:
1547 	tcp_listendrop(sk);
1548 	return 0;
1549 }
1550 EXPORT_SYMBOL(tcp_v4_conn_request);
1551 
1552 
1553 /*
1554  * The three way handshake has completed - we got a valid synack -
1555  * now create the new socket.
1556  */
1557 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1558 				  struct request_sock *req,
1559 				  struct dst_entry *dst,
1560 				  struct request_sock *req_unhash,
1561 				  bool *own_req)
1562 {
1563 	struct inet_request_sock *ireq;
1564 	bool found_dup_sk = false;
1565 	struct inet_sock *newinet;
1566 	struct tcp_sock *newtp;
1567 	struct sock *newsk;
1568 #ifdef CONFIG_TCP_MD5SIG
1569 	const union tcp_md5_addr *addr;
1570 	struct tcp_md5sig_key *key;
1571 	int l3index;
1572 #endif
1573 	struct ip_options_rcu *inet_opt;
1574 
1575 	if (sk_acceptq_is_full(sk))
1576 		goto exit_overflow;
1577 
1578 	newsk = tcp_create_openreq_child(sk, req, skb);
1579 	if (!newsk)
1580 		goto exit_nonewsk;
1581 
1582 	newsk->sk_gso_type = SKB_GSO_TCPV4;
1583 	inet_sk_rx_dst_set(newsk, skb);
1584 
1585 	newtp		      = tcp_sk(newsk);
1586 	newinet		      = inet_sk(newsk);
1587 	ireq		      = inet_rsk(req);
1588 	sk_daddr_set(newsk, ireq->ir_rmt_addr);
1589 	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1590 	newsk->sk_bound_dev_if = ireq->ir_iif;
1591 	newinet->inet_saddr   = ireq->ir_loc_addr;
1592 	inet_opt	      = rcu_dereference(ireq->ireq_opt);
1593 	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1594 	newinet->mc_index     = inet_iif(skb);
1595 	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1596 	newinet->rcv_tos      = ip_hdr(skb)->tos;
1597 	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1598 	if (inet_opt)
1599 		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1600 	atomic_set(&newinet->inet_id, get_random_u16());
1601 
1602 	/* Set ToS of the new socket based upon the value of incoming SYN.
1603 	 * ECT bits are set later in tcp_init_transfer().
1604 	 */
1605 	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1606 		newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1607 
1608 	if (!dst) {
1609 		dst = inet_csk_route_child_sock(sk, newsk, req);
1610 		if (!dst)
1611 			goto put_and_exit;
1612 	} else {
1613 		/* syncookie case : see end of cookie_v4_check() */
1614 	}
1615 	sk_setup_caps(newsk, dst);
1616 
1617 	tcp_ca_openreq_child(newsk, dst);
1618 
1619 	tcp_sync_mss(newsk, dst_mtu(dst));
1620 	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1621 
1622 	tcp_initialize_rcv_mss(newsk);
1623 
1624 #ifdef CONFIG_TCP_MD5SIG
1625 	l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1626 	/* Copy over the MD5 key from the original socket */
1627 	addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1628 	key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1629 	if (key) {
1630 		if (tcp_md5_key_copy(newsk, addr, AF_INET, 32, l3index, key))
1631 			goto put_and_exit;
1632 		sk_gso_disable(newsk);
1633 	}
1634 #endif
1635 
1636 	if (__inet_inherit_port(sk, newsk) < 0)
1637 		goto put_and_exit;
1638 	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1639 				       &found_dup_sk);
1640 	if (likely(*own_req)) {
1641 		tcp_move_syn(newtp, req);
1642 		ireq->ireq_opt = NULL;
1643 	} else {
1644 		newinet->inet_opt = NULL;
1645 
1646 		if (!req_unhash && found_dup_sk) {
1647 			/* This code path should only be executed in the
1648 			 * syncookie case
1649 			 */
1650 			bh_unlock_sock(newsk);
1651 			sock_put(newsk);
1652 			newsk = NULL;
1653 		}
1654 	}
1655 	return newsk;
1656 
1657 exit_overflow:
1658 	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1659 exit_nonewsk:
1660 	dst_release(dst);
1661 exit:
1662 	tcp_listendrop(sk);
1663 	return NULL;
1664 put_and_exit:
1665 	newinet->inet_opt = NULL;
1666 	inet_csk_prepare_forced_close(newsk);
1667 	tcp_done(newsk);
1668 	goto exit;
1669 }
1670 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1671 
1672 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1673 {
1674 #ifdef CONFIG_SYN_COOKIES
1675 	const struct tcphdr *th = tcp_hdr(skb);
1676 
1677 	if (!th->syn)
1678 		sk = cookie_v4_check(sk, skb);
1679 #endif
1680 	return sk;
1681 }
1682 
1683 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1684 			 struct tcphdr *th, u32 *cookie)
1685 {
1686 	u16 mss = 0;
1687 #ifdef CONFIG_SYN_COOKIES
1688 	mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1689 				    &tcp_request_sock_ipv4_ops, sk, th);
1690 	if (mss) {
1691 		*cookie = __cookie_v4_init_sequence(iph, th, &mss);
1692 		tcp_synq_overflow(sk);
1693 	}
1694 #endif
1695 	return mss;
1696 }
1697 
1698 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
1699 							   u32));
1700 /* The socket must have its spinlock held when we get
1701  * here, unless it is a TCP_LISTEN socket.
1702  *
1703  * We have a potential double-lock case here, so even when
1704  * doing backlog processing we use the BH locking scheme.
1705  * This is because we cannot sleep with the original spinlock
1706  * held.
1707  */
1708 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1709 {
1710 	enum skb_drop_reason reason;
1711 	struct sock *rsk;
1712 
1713 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1714 		struct dst_entry *dst;
1715 
1716 		dst = rcu_dereference_protected(sk->sk_rx_dst,
1717 						lockdep_sock_is_held(sk));
1718 
1719 		sock_rps_save_rxhash(sk, skb);
1720 		sk_mark_napi_id(sk, skb);
1721 		if (dst) {
1722 			if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
1723 			    !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
1724 					     dst, 0)) {
1725 				RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
1726 				dst_release(dst);
1727 			}
1728 		}
1729 		tcp_rcv_established(sk, skb);
1730 		return 0;
1731 	}
1732 
1733 	reason = SKB_DROP_REASON_NOT_SPECIFIED;
1734 	if (tcp_checksum_complete(skb))
1735 		goto csum_err;
1736 
1737 	if (sk->sk_state == TCP_LISTEN) {
1738 		struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1739 
1740 		if (!nsk)
1741 			goto discard;
1742 		if (nsk != sk) {
1743 			if (tcp_child_process(sk, nsk, skb)) {
1744 				rsk = nsk;
1745 				goto reset;
1746 			}
1747 			return 0;
1748 		}
1749 	} else
1750 		sock_rps_save_rxhash(sk, skb);
1751 
1752 	if (tcp_rcv_state_process(sk, skb)) {
1753 		rsk = sk;
1754 		goto reset;
1755 	}
1756 	return 0;
1757 
1758 reset:
1759 	tcp_v4_send_reset(rsk, skb);
1760 discard:
1761 	kfree_skb_reason(skb, reason);
1762 	/* Be careful here. If this function gets more complicated and
1763 	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1764 	 * might be destroyed here. This current version compiles correctly,
1765 	 * but you have been warned.
1766 	 */
1767 	return 0;
1768 
1769 csum_err:
1770 	reason = SKB_DROP_REASON_TCP_CSUM;
1771 	trace_tcp_bad_csum(skb);
1772 	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1773 	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1774 	goto discard;
1775 }
1776 EXPORT_SYMBOL(tcp_v4_do_rcv);
1777 
1778 int tcp_v4_early_demux(struct sk_buff *skb)
1779 {
1780 	struct net *net = dev_net(skb->dev);
1781 	const struct iphdr *iph;
1782 	const struct tcphdr *th;
1783 	struct sock *sk;
1784 
1785 	if (skb->pkt_type != PACKET_HOST)
1786 		return 0;
1787 
1788 	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1789 		return 0;
1790 
1791 	iph = ip_hdr(skb);
1792 	th = tcp_hdr(skb);
1793 
1794 	if (th->doff < sizeof(struct tcphdr) / 4)
1795 		return 0;
1796 
1797 	sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
1798 				       iph->saddr, th->source,
1799 				       iph->daddr, ntohs(th->dest),
1800 				       skb->skb_iif, inet_sdif(skb));
1801 	if (sk) {
1802 		skb->sk = sk;
1803 		skb->destructor = sock_edemux;
1804 		if (sk_fullsock(sk)) {
1805 			struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
1806 
1807 			if (dst)
1808 				dst = dst_check(dst, 0);
1809 			if (dst &&
1810 			    sk->sk_rx_dst_ifindex == skb->skb_iif)
1811 				skb_dst_set_noref(skb, dst);
1812 		}
1813 	}
1814 	return 0;
1815 }
1816 
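/* Queue an skb on the backlog of a socket currently owned by user context.
 * The segment is checksummed first and, when possible, coalesced with the
 * backlog tail to limit memory usage.  Returns true if the skb was dropped,
 * in which case the socket has already been unlocked and *reason set;
 * returns false if the skb was queued or coalesced.
 */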
1817 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
1818 		     enum skb_drop_reason *reason)
1819 {
1820 	u32 tail_gso_size, tail_gso_segs;
1821 	struct skb_shared_info *shinfo;
1822 	const struct tcphdr *th;
1823 	struct tcphdr *thtail;
1824 	struct sk_buff *tail;
1825 	unsigned int hdrlen;
1826 	bool fragstolen;
1827 	u32 gso_segs;
1828 	u32 gso_size;
1829 	u64 limit;
1830 	int delta;
1831 
1832 	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1833 	 * we can fix skb->truesize to its real value to avoid future drops.
1834 	 * This is valid because skb is not yet charged to the socket.
1835 	 * It has been observed that pure SACK packets were sometimes dropped
1836 	 * (when built by drivers without the copybreak feature).
1837 	 */
1838 	skb_condense(skb);
1839 
1840 	skb_dst_drop(skb);
1841 
1842 	if (unlikely(tcp_checksum_complete(skb))) {
1843 		bh_unlock_sock(sk);
1844 		trace_tcp_bad_csum(skb);
1845 		*reason = SKB_DROP_REASON_TCP_CSUM;
1846 		__TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1847 		__TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1848 		return true;
1849 	}
1850 
1851 	/* Attempt coalescing to last skb in backlog, even if we are
1852 	 * above the limits.
1853 	 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1854 	 */
1855 	th = (const struct tcphdr *)skb->data;
1856 	hdrlen = th->doff * 4;
1857 
1858 	tail = sk->sk_backlog.tail;
1859 	if (!tail)
1860 		goto no_coalesce;
1861 	thtail = (struct tcphdr *)tail->data;
1862 
1863 	if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1864 	    TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1865 	    ((TCP_SKB_CB(tail)->tcp_flags |
1866 	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1867 	    !((TCP_SKB_CB(tail)->tcp_flags &
1868 	      TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1869 	    ((TCP_SKB_CB(tail)->tcp_flags ^
1870 	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1871 #ifdef CONFIG_TLS_DEVICE
1872 	    tail->decrypted != skb->decrypted ||
1873 #endif
1874 	    !mptcp_skb_can_collapse(tail, skb) ||
1875 	    thtail->doff != th->doff ||
1876 	    memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1877 		goto no_coalesce;
1878 
1879 	__skb_pull(skb, hdrlen);
1880 
1881 	shinfo = skb_shinfo(skb);
1882 	gso_size = shinfo->gso_size ?: skb->len;
1883 	gso_segs = shinfo->gso_segs ?: 1;
1884 
1885 	shinfo = skb_shinfo(tail);
1886 	tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
1887 	tail_gso_segs = shinfo->gso_segs ?: 1;
1888 
1889 	if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1890 		TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1891 
1892 		if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
1893 			TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1894 			thtail->window = th->window;
1895 		}
1896 
1897 		/* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1898 		 * thtail->fin, so that the fast path in tcp_rcv_established()
1899 		 * is not entered if we append a packet with a FIN.
1900 		 * SYN, RST, URG are not present.
1901 		 * ACK is set on both packets.
1902 		 * PSH : we do not really care in TCP stack,
1903 		 *       at least for 'GRO' packets.
1904 		 */
1905 		thtail->fin |= th->fin;
1906 		TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1907 
1908 		if (TCP_SKB_CB(skb)->has_rxtstamp) {
1909 			TCP_SKB_CB(tail)->has_rxtstamp = true;
1910 			tail->tstamp = skb->tstamp;
1911 			skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1912 		}
1913 
1914 		/* Not as strict as GRO. We only need to carry the max mss value */
1915 		shinfo->gso_size = max(gso_size, tail_gso_size);
1916 		shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
1917 
1918 		sk->sk_backlog.len += delta;
1919 		__NET_INC_STATS(sock_net(sk),
1920 				LINUX_MIB_TCPBACKLOGCOALESCE);
1921 		kfree_skb_partial(skb, fragstolen);
1922 		return false;
1923 	}
1924 	__skb_push(skb, hdrlen);
1925 
1926 no_coalesce:
1927 	/* sk->sk_backlog.len is reset only at the end of __release_sock().
1928 	 * Both sk->sk_backlog.len and sk->sk_rmem_alloc could reach
1929 	 * sk_rcvbuf in normal conditions.
1930 	 */
1931 	limit = ((u64)READ_ONCE(sk->sk_rcvbuf)) << 1;
1932 
1933 	limit += ((u32)READ_ONCE(sk->sk_sndbuf)) >> 1;
1934 
1935 	/* Only socket owner can try to collapse/prune rx queues
1936 	 * to reduce memory overhead, so add a little headroom here.
1937 	 * Only a few socket backlogs are likely to be non-empty concurrently.
1938 	 */
1939 	limit += 64 * 1024;
1940 
1941 	limit = min_t(u64, limit, UINT_MAX);
1942 
1943 	if (unlikely(sk_add_backlog(sk, skb, limit))) {
1944 		bh_unlock_sock(sk);
1945 		*reason = SKB_DROP_REASON_SOCKET_BACKLOG;
1946 		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1947 		return true;
1948 	}
1949 	return false;
1950 }
1951 EXPORT_SYMBOL(tcp_add_backlog);
1952 
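/* Run the attached socket filter on the skb, but never let it trim the
 * packet below the TCP header (th->doff * 4 bytes), so the header fields
 * re-read after tcp_filter() in tcp_v4_rcv() remain valid even when a
 * filter truncates the payload.
 */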
1953 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1954 {
1955 	struct tcphdr *th = (struct tcphdr *)skb->data;
1956 
1957 	return sk_filter_trim_cap(sk, skb, th->doff * 4);
1958 }
1959 EXPORT_SYMBOL(tcp_filter);
1960 
1961 static void tcp_v4_restore_cb(struct sk_buff *skb)
1962 {
1963 	memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1964 		sizeof(struct inet_skb_parm));
1965 }
1966 
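/* Populate TCP_SKB_CB() from the IP and TCP headers (seq, end_seq, ack_seq,
 * flags, DS field, rx timestamp presence) so the rest of the input path can
 * work from the control block alone.  Because this overlays IPCB(),
 * tcp_v4_restore_cb() above undoes the move whenever the skb must be handed
 * back to a path that expects IPCB() to be intact.
 */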
1967 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1968 			   const struct tcphdr *th)
1969 {
1970 	/* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
1971 	 * barrier() makes sure the compiler won't play aliasing games.
1972 	 */
1973 	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1974 		sizeof(struct inet_skb_parm));
1975 	barrier();
1976 
1977 	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1978 	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1979 				    skb->len - th->doff * 4);
1980 	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1981 	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1982 	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1983 	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1984 	TCP_SKB_CB(skb)->sacked	 = 0;
1985 	TCP_SKB_CB(skb)->has_rxtstamp =
1986 			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1987 }
1988 
1989 /*
1990  *	From tcp_input.c
1991  */
1992 
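/* Main IPv4 TCP receive routine, called from the IP layer for every segment
 * delivered to this host: validate header and checksum, look the segment up
 * in the established/listening hashes, then either process it directly,
 * queue it on the owning socket's backlog, handle the TIME_WAIT and
 * NEW_SYN_RECV special cases, or answer unknown connections with a RST.
 */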
1993 int tcp_v4_rcv(struct sk_buff *skb)
1994 {
1995 	struct net *net = dev_net(skb->dev);
1996 	enum skb_drop_reason drop_reason;
1997 	int sdif = inet_sdif(skb);
1998 	int dif = inet_iif(skb);
1999 	const struct iphdr *iph;
2000 	const struct tcphdr *th;
2001 	bool refcounted;
2002 	struct sock *sk;
2003 	int ret;
2004 
2005 	drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
2006 	if (skb->pkt_type != PACKET_HOST)
2007 		goto discard_it;
2008 
2009 	/* Count it even if it's bad */
2010 	__TCP_INC_STATS(net, TCP_MIB_INSEGS);
2011 
2012 	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
2013 		goto discard_it;
2014 
2015 	th = (const struct tcphdr *)skb->data;
2016 
2017 	if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
2018 		drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
2019 		goto bad_packet;
2020 	}
2021 	if (!pskb_may_pull(skb, th->doff * 4))
2022 		goto discard_it;
2023 
2024 	/* An explanation is required here, I think.
2025 	 * Packet length and doff are validated by header prediction,
2026 	 * provided the case of th->doff == 0 is eliminated.
2027 	 * So, we defer the checks. */
2028 
2029 	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
2030 		goto csum_error;
2031 
2032 	th = (const struct tcphdr *)skb->data;
2033 	iph = ip_hdr(skb);
2034 lookup:
2035 	sk = __inet_lookup_skb(net->ipv4.tcp_death_row.hashinfo,
2036 			       skb, __tcp_hdrlen(th), th->source,
2037 			       th->dest, sdif, &refcounted);
2038 	if (!sk)
2039 		goto no_tcp_socket;
2040 
2041 process:
2042 	if (sk->sk_state == TCP_TIME_WAIT)
2043 		goto do_time_wait;
2044 
2045 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
2046 		struct request_sock *req = inet_reqsk(sk);
2047 		bool req_stolen = false;
2048 		struct sock *nsk;
2049 
2050 		sk = req->rsk_listener;
2051 		if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
2052 			drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2053 		else
2054 			drop_reason = tcp_inbound_md5_hash(sk, skb,
2055 						   &iph->saddr, &iph->daddr,
2056 						   AF_INET, dif, sdif);
2057 		if (unlikely(drop_reason)) {
2058 			sk_drops_add(sk, skb);
2059 			reqsk_put(req);
2060 			goto discard_it;
2061 		}
2062 		if (tcp_checksum_complete(skb)) {
2063 			reqsk_put(req);
2064 			goto csum_error;
2065 		}
2066 		if (unlikely(sk->sk_state != TCP_LISTEN)) {
2067 			nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
2068 			if (!nsk) {
2069 				inet_csk_reqsk_queue_drop_and_put(sk, req);
2070 				goto lookup;
2071 			}
2072 			sk = nsk;
2073 			/* reuseport_migrate_sock() already holds one sk_refcnt
2074 			 * reference by the time it returns.
2075 			 */
2076 		} else {
2077 			/* We own a reference on the listener, increase it again
2078 			 * as we might lose it too soon.
2079 			 */
2080 			sock_hold(sk);
2081 		}
2082 		refcounted = true;
2083 		nsk = NULL;
2084 		if (!tcp_filter(sk, skb)) {
2085 			th = (const struct tcphdr *)skb->data;
2086 			iph = ip_hdr(skb);
2087 			tcp_v4_fill_cb(skb, iph, th);
2088 			nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
2089 		} else {
2090 			drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2091 		}
2092 		if (!nsk) {
2093 			reqsk_put(req);
2094 			if (req_stolen) {
2095 				/* Another cpu got exclusive access to req
2096 				 * and created a full blown socket.
2097 				 * and created a full-blown socket.
2098 				 * instead of discarding it.
2099 				 */
2100 				tcp_v4_restore_cb(skb);
2101 				sock_put(sk);
2102 				goto lookup;
2103 			}
2104 			goto discard_and_relse;
2105 		}
2106 		nf_reset_ct(skb);
2107 		if (nsk == sk) {
2108 			reqsk_put(req);
2109 			tcp_v4_restore_cb(skb);
2110 		} else if (tcp_child_process(sk, nsk, skb)) {
2111 			tcp_v4_send_reset(nsk, skb);
2112 			goto discard_and_relse;
2113 		} else {
2114 			sock_put(sk);
2115 			return 0;
2116 		}
2117 	}
2118 
2119 	if (static_branch_unlikely(&ip4_min_ttl)) {
2120 		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
2121 		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
2122 			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2123 			drop_reason = SKB_DROP_REASON_TCP_MINTTL;
2124 			goto discard_and_relse;
2125 		}
2126 	}
2127 
2128 	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
2129 		drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2130 		goto discard_and_relse;
2131 	}
2132 
2133 	drop_reason = tcp_inbound_md5_hash(sk, skb, &iph->saddr,
2134 					   &iph->daddr, AF_INET, dif, sdif);
2135 	if (drop_reason)
2136 		goto discard_and_relse;
2137 
2138 	nf_reset_ct(skb);
2139 
2140 	if (tcp_filter(sk, skb)) {
2141 		drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2142 		goto discard_and_relse;
2143 	}
2144 	th = (const struct tcphdr *)skb->data;
2145 	iph = ip_hdr(skb);
2146 	tcp_v4_fill_cb(skb, iph, th);
2147 
2148 	skb->dev = NULL;
2149 
2150 	if (sk->sk_state == TCP_LISTEN) {
2151 		ret = tcp_v4_do_rcv(sk, skb);
2152 		goto put_and_return;
2153 	}
2154 
2155 	sk_incoming_cpu_update(sk);
2156 
2157 	bh_lock_sock_nested(sk);
2158 	tcp_segs_in(tcp_sk(sk), skb);
2159 	ret = 0;
2160 	if (!sock_owned_by_user(sk)) {
2161 		ret = tcp_v4_do_rcv(sk, skb);
2162 	} else {
2163 		if (tcp_add_backlog(sk, skb, &drop_reason))
2164 			goto discard_and_relse;
2165 	}
2166 	bh_unlock_sock(sk);
2167 
2168 put_and_return:
2169 	if (refcounted)
2170 		sock_put(sk);
2171 
2172 	return ret;
2173 
2174 no_tcp_socket:
2175 	drop_reason = SKB_DROP_REASON_NO_SOCKET;
2176 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2177 		goto discard_it;
2178 
2179 	tcp_v4_fill_cb(skb, iph, th);
2180 
2181 	if (tcp_checksum_complete(skb)) {
2182 csum_error:
2183 		drop_reason = SKB_DROP_REASON_TCP_CSUM;
2184 		trace_tcp_bad_csum(skb);
2185 		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2186 bad_packet:
2187 		__TCP_INC_STATS(net, TCP_MIB_INERRS);
2188 	} else {
2189 		tcp_v4_send_reset(NULL, skb);
2190 	}
2191 
2192 discard_it:
2193 	SKB_DR_OR(drop_reason, NOT_SPECIFIED);
2194 	/* Discard frame. */
2195 	kfree_skb_reason(skb, drop_reason);
2196 	return 0;
2197 
2198 discard_and_relse:
2199 	sk_drops_add(sk, skb);
2200 	if (refcounted)
2201 		sock_put(sk);
2202 	goto discard_it;
2203 
2204 do_time_wait:
2205 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2206 		drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2207 		inet_twsk_put(inet_twsk(sk));
2208 		goto discard_it;
2209 	}
2210 
2211 	tcp_v4_fill_cb(skb, iph, th);
2212 
2213 	if (tcp_checksum_complete(skb)) {
2214 		inet_twsk_put(inet_twsk(sk));
2215 		goto csum_error;
2216 	}
2217 	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2218 	case TCP_TW_SYN: {
2219 		struct sock *sk2 = inet_lookup_listener(net,
2220 							net->ipv4.tcp_death_row.hashinfo,
2221 							skb, __tcp_hdrlen(th),
2222 							iph->saddr, th->source,
2223 							iph->daddr, th->dest,
2224 							inet_iif(skb),
2225 							sdif);
2226 		if (sk2) {
2227 			inet_twsk_deschedule_put(inet_twsk(sk));
2228 			sk = sk2;
2229 			tcp_v4_restore_cb(skb);
2230 			refcounted = false;
2231 			goto process;
2232 		}
2233 	}
2234 		/* to ACK */
2235 		fallthrough;
2236 	case TCP_TW_ACK:
2237 		tcp_v4_timewait_ack(sk, skb);
2238 		break;
2239 	case TCP_TW_RST:
2240 		tcp_v4_send_reset(sk, skb);
2241 		inet_twsk_deschedule_put(inet_twsk(sk));
2242 		goto discard_it;
2243 	case TCP_TW_SUCCESS:;
2244 	}
2245 	goto discard_it;
2246 }
2247 
2248 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2249 	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
2250 	.twsk_unique	= tcp_twsk_unique,
2251 	.twsk_destructor= tcp_twsk_destructor,
2252 };
2253 
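/* Cache the validated input route of @skb in the socket, along with the
 * incoming interface index, so the established fast path in tcp_v4_do_rcv()
 * can keep using it until dst_check() fails or the ifindex changes.
 */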
2254 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2255 {
2256 	struct dst_entry *dst = skb_dst(skb);
2257 
2258 	if (dst && dst_hold_safe(dst)) {
2259 		rcu_assign_pointer(sk->sk_rx_dst, dst);
2260 		sk->sk_rx_dst_ifindex = skb->skb_iif;
2261 	}
2262 }
2263 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2264 
2265 const struct inet_connection_sock_af_ops ipv4_specific = {
2266 	.queue_xmit	   = ip_queue_xmit,
2267 	.send_check	   = tcp_v4_send_check,
2268 	.rebuild_header	   = inet_sk_rebuild_header,
2269 	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
2270 	.conn_request	   = tcp_v4_conn_request,
2271 	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
2272 	.net_header_len	   = sizeof(struct iphdr),
2273 	.setsockopt	   = ip_setsockopt,
2274 	.getsockopt	   = ip_getsockopt,
2275 	.addr2sockaddr	   = inet_csk_addr2sockaddr,
2276 	.sockaddr_len	   = sizeof(struct sockaddr_in),
2277 	.mtu_reduced	   = tcp_v4_mtu_reduced,
2278 };
2279 EXPORT_SYMBOL(ipv4_specific);
2280 
2281 #ifdef CONFIG_TCP_MD5SIG
2282 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2283 	.md5_lookup		= tcp_v4_md5_lookup,
2284 	.calc_md5_hash		= tcp_v4_md5_hash_skb,
2285 	.md5_parse		= tcp_v4_parse_md5_keys,
2286 };
2287 #endif
2288 
2289 /* NOTE: A lot of things are set to zero explicitly by the call to
2290  *       sk_alloc(), so they need not be done here.
2291  */
2292 static int tcp_v4_init_sock(struct sock *sk)
2293 {
2294 	struct inet_connection_sock *icsk = inet_csk(sk);
2295 
2296 	tcp_init_sock(sk);
2297 
2298 	icsk->icsk_af_ops = &ipv4_specific;
2299 
2300 #ifdef CONFIG_TCP_MD5SIG
2301 	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2302 #endif
2303 
2304 	return 0;
2305 }
2306 
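/* Release everything a TCP socket accumulated during its lifetime: timers,
 * congestion control and ULP state, queued skbs, MD5 keys, the bound port
 * and any Fast Open state.  Invoked through the protocol's ->destroy() hook
 * once the socket is being torn down.
 */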
2307 void tcp_v4_destroy_sock(struct sock *sk)
2308 {
2309 	struct tcp_sock *tp = tcp_sk(sk);
2310 
2311 	trace_tcp_destroy_sock(sk);
2312 
2313 	tcp_clear_xmit_timers(sk);
2314 
2315 	tcp_cleanup_congestion_control(sk);
2316 
2317 	tcp_cleanup_ulp(sk);
2318 
2319 	/* Clean up the write buffer. */
2320 	tcp_write_queue_purge(sk);
2321 
2322 	/* Check if we want to disable active TFO */
2323 	tcp_fastopen_active_disable_ofo_check(sk);
2324 
2325 	/* Cleans up our, hopefully empty, out_of_order_queue. */
2326 	skb_rbtree_purge(&tp->out_of_order_queue);
2327 
2328 #ifdef CONFIG_TCP_MD5SIG
2329 	/* Clean up the MD5 key list, if any */
2330 	if (tp->md5sig_info) {
2331 		tcp_clear_md5_list(sk);
2332 		kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2333 		tp->md5sig_info = NULL;
2334 		static_branch_slow_dec_deferred(&tcp_md5_needed);
2335 	}
2336 #endif
2337 
2338 	/* Clean up a referenced TCP bind bucket. */
2339 	if (inet_csk(sk)->icsk_bind_hash)
2340 		inet_put_port(sk);
2341 
2342 	BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2343 
2344 	/* If socket is aborted during connect operation */
2345 	tcp_free_fastopen_req(tp);
2346 	tcp_fastopen_destroy_cipher(sk);
2347 	tcp_saved_syn_free(tp);
2348 
2349 	sk_sockets_allocated_dec(sk);
2350 }
2351 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2352 
2353 #ifdef CONFIG_PROC_FS
2354 /* Proc filesystem TCP sock list dumping. */
2355 
2356 static unsigned short seq_file_family(const struct seq_file *seq);
2357 
2358 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2359 {
2360 	unsigned short family = seq_file_family(seq);
2361 
2362 	/* AF_UNSPEC is used as a match-all */
2363 	return ((family == AF_UNSPEC || family == sk->sk_family) &&
2364 		net_eq(sock_net(sk), seq_file_net(seq)));
2365 }
2366 
2367 /* Find a non-empty bucket (starting from st->bucket)
2368  * and return the first sk from it.
2369  */
2370 static void *listening_get_first(struct seq_file *seq)
2371 {
2372 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2373 	struct tcp_iter_state *st = seq->private;
2374 
2375 	st->offset = 0;
2376 	for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) {
2377 		struct inet_listen_hashbucket *ilb2;
2378 		struct hlist_nulls_node *node;
2379 		struct sock *sk;
2380 
2381 		ilb2 = &hinfo->lhash2[st->bucket];
2382 		if (hlist_nulls_empty(&ilb2->nulls_head))
2383 			continue;
2384 
2385 		spin_lock(&ilb2->lock);
2386 		sk_nulls_for_each(sk, node, &ilb2->nulls_head) {
2387 			if (seq_sk_match(seq, sk))
2388 				return sk;
2389 		}
2390 		spin_unlock(&ilb2->lock);
2391 	}
2392 
2393 	return NULL;
2394 }
2395 
2396 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
2397  * If "cur" is the last one in the st->bucket,
2398  * call listening_get_first() to return the first sk of the next
2399  * non-empty bucket.
2400  */
2401 static void *listening_get_next(struct seq_file *seq, void *cur)
2402 {
2403 	struct tcp_iter_state *st = seq->private;
2404 	struct inet_listen_hashbucket *ilb2;
2405 	struct hlist_nulls_node *node;
2406 	struct inet_hashinfo *hinfo;
2407 	struct sock *sk = cur;
2408 
2409 	++st->num;
2410 	++st->offset;
2411 
2412 	sk = sk_nulls_next(sk);
2413 	sk_nulls_for_each_from(sk, node) {
2414 		if (seq_sk_match(seq, sk))
2415 			return sk;
2416 	}
2417 
2418 	hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2419 	ilb2 = &hinfo->lhash2[st->bucket];
2420 	spin_unlock(&ilb2->lock);
2421 	++st->bucket;
2422 	return listening_get_first(seq);
2423 }
2424 
2425 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2426 {
2427 	struct tcp_iter_state *st = seq->private;
2428 	void *rc;
2429 
2430 	st->bucket = 0;
2431 	st->offset = 0;
2432 	rc = listening_get_first(seq);
2433 
2434 	while (rc && *pos) {
2435 		rc = listening_get_next(seq, rc);
2436 		--*pos;
2437 	}
2438 	return rc;
2439 }
2440 
2441 static inline bool empty_bucket(struct inet_hashinfo *hinfo,
2442 				const struct tcp_iter_state *st)
2443 {
2444 	return hlist_nulls_empty(&hinfo->ehash[st->bucket].chain);
2445 }
2446 
2447 /*
2448  * Get first established socket starting from bucket given in st->bucket.
2449  * If st->bucket is zero, the very first socket in the hash is returned.
2450  */
2451 static void *established_get_first(struct seq_file *seq)
2452 {
2453 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2454 	struct tcp_iter_state *st = seq->private;
2455 
2456 	st->offset = 0;
2457 	for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) {
2458 		struct sock *sk;
2459 		struct hlist_nulls_node *node;
2460 		spinlock_t *lock = inet_ehash_lockp(hinfo, st->bucket);
2461 
2462 		cond_resched();
2463 
2464 		/* Lockless fast path for the common case of empty buckets */
2465 		if (empty_bucket(hinfo, st))
2466 			continue;
2467 
2468 		spin_lock_bh(lock);
2469 		sk_nulls_for_each(sk, node, &hinfo->ehash[st->bucket].chain) {
2470 			if (seq_sk_match(seq, sk))
2471 				return sk;
2472 		}
2473 		spin_unlock_bh(lock);
2474 	}
2475 
2476 	return NULL;
2477 }
2478 
2479 static void *established_get_next(struct seq_file *seq, void *cur)
2480 {
2481 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2482 	struct tcp_iter_state *st = seq->private;
2483 	struct hlist_nulls_node *node;
2484 	struct sock *sk = cur;
2485 
2486 	++st->num;
2487 	++st->offset;
2488 
2489 	sk = sk_nulls_next(sk);
2490 
2491 	sk_nulls_for_each_from(sk, node) {
2492 		if (seq_sk_match(seq, sk))
2493 			return sk;
2494 	}
2495 
2496 	spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2497 	++st->bucket;
2498 	return established_get_first(seq);
2499 }
2500 
2501 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2502 {
2503 	struct tcp_iter_state *st = seq->private;
2504 	void *rc;
2505 
2506 	st->bucket = 0;
2507 	rc = established_get_first(seq);
2508 
2509 	while (rc && pos) {
2510 		rc = established_get_next(seq, rc);
2511 		--pos;
2512 	}
2513 	return rc;
2514 }
2515 
2516 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2517 {
2518 	void *rc;
2519 	struct tcp_iter_state *st = seq->private;
2520 
2521 	st->state = TCP_SEQ_STATE_LISTENING;
2522 	rc	  = listening_get_idx(seq, &pos);
2523 
2524 	if (!rc) {
2525 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2526 		rc	  = established_get_idx(seq, pos);
2527 	}
2528 
2529 	return rc;
2530 }
2531 
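/* Resume a /proc scan at the position recorded in st->bucket/st->offset:
 * re-walk the bucket from its first entry and skip st->offset sockets,
 * falling through from the listening hash to the established hash once the
 * listening buckets are exhausted.
 */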
2532 static void *tcp_seek_last_pos(struct seq_file *seq)
2533 {
2534 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2535 	struct tcp_iter_state *st = seq->private;
2536 	int bucket = st->bucket;
2537 	int offset = st->offset;
2538 	int orig_num = st->num;
2539 	void *rc = NULL;
2540 
2541 	switch (st->state) {
2542 	case TCP_SEQ_STATE_LISTENING:
2543 		if (st->bucket > hinfo->lhash2_mask)
2544 			break;
2545 		rc = listening_get_first(seq);
2546 		while (offset-- && rc && bucket == st->bucket)
2547 			rc = listening_get_next(seq, rc);
2548 		if (rc)
2549 			break;
2550 		st->bucket = 0;
2551 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2552 		fallthrough;
2553 	case TCP_SEQ_STATE_ESTABLISHED:
2554 		if (st->bucket > hinfo->ehash_mask)
2555 			break;
2556 		rc = established_get_first(seq);
2557 		while (offset-- && rc && bucket == st->bucket)
2558 			rc = established_get_next(seq, rc);
2559 	}
2560 
2561 	st->num = orig_num;
2562 
2563 	return rc;
2564 }
2565 
2566 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2567 {
2568 	struct tcp_iter_state *st = seq->private;
2569 	void *rc;
2570 
2571 	if (*pos && *pos == st->last_pos) {
2572 		rc = tcp_seek_last_pos(seq);
2573 		if (rc)
2574 			goto out;
2575 	}
2576 
2577 	st->state = TCP_SEQ_STATE_LISTENING;
2578 	st->num = 0;
2579 	st->bucket = 0;
2580 	st->offset = 0;
2581 	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2582 
2583 out:
2584 	st->last_pos = *pos;
2585 	return rc;
2586 }
2587 EXPORT_SYMBOL(tcp_seq_start);
2588 
2589 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2590 {
2591 	struct tcp_iter_state *st = seq->private;
2592 	void *rc = NULL;
2593 
2594 	if (v == SEQ_START_TOKEN) {
2595 		rc = tcp_get_idx(seq, 0);
2596 		goto out;
2597 	}
2598 
2599 	switch (st->state) {
2600 	case TCP_SEQ_STATE_LISTENING:
2601 		rc = listening_get_next(seq, v);
2602 		if (!rc) {
2603 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2604 			st->bucket = 0;
2605 			st->offset = 0;
2606 			rc	  = established_get_first(seq);
2607 		}
2608 		break;
2609 	case TCP_SEQ_STATE_ESTABLISHED:
2610 		rc = established_get_next(seq, v);
2611 		break;
2612 	}
2613 out:
2614 	++*pos;
2615 	st->last_pos = *pos;
2616 	return rc;
2617 }
2618 EXPORT_SYMBOL(tcp_seq_next);
2619 
2620 void tcp_seq_stop(struct seq_file *seq, void *v)
2621 {
2622 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2623 	struct tcp_iter_state *st = seq->private;
2624 
2625 	switch (st->state) {
2626 	case TCP_SEQ_STATE_LISTENING:
2627 		if (v != SEQ_START_TOKEN)
2628 			spin_unlock(&hinfo->lhash2[st->bucket].lock);
2629 		break;
2630 	case TCP_SEQ_STATE_ESTABLISHED:
2631 		if (v)
2632 			spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2633 		break;
2634 	}
2635 }
2636 EXPORT_SYMBOL(tcp_seq_stop);
2637 
2638 static void get_openreq4(const struct request_sock *req,
2639 			 struct seq_file *f, int i)
2640 {
2641 	const struct inet_request_sock *ireq = inet_rsk(req);
2642 	long delta = req->rsk_timer.expires - jiffies;
2643 
2644 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2645 		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2646 		i,
2647 		ireq->ir_loc_addr,
2648 		ireq->ir_num,
2649 		ireq->ir_rmt_addr,
2650 		ntohs(ireq->ir_rmt_port),
2651 		TCP_SYN_RECV,
2652 		0, 0, /* could print option size, but that is af dependent. */
2653 		1,    /* timers active (only the expire timer) */
2654 		jiffies_delta_to_clock_t(delta),
2655 		req->num_timeout,
2656 		from_kuid_munged(seq_user_ns(f),
2657 				 sock_i_uid(req->rsk_listener)),
2658 		0,  /* non-standard timer */
2659 		0, /* open_requests have no inode */
2660 		0,
2661 		req);
2662 }
2663 
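/* Format one socket (listening or established) as a /proc/net/tcp line:
 * addresses and ports in hex, state, tx/rx queue sizes, pending timer type
 * and expiry, retransmit and probe counters, uid, inode, refcount and a few
 * congestion-related fields (rto, ato, cwnd, ssthresh or max_qlen).
 */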
2664 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2665 {
2666 	int timer_active;
2667 	unsigned long timer_expires;
2668 	const struct tcp_sock *tp = tcp_sk(sk);
2669 	const struct inet_connection_sock *icsk = inet_csk(sk);
2670 	const struct inet_sock *inet = inet_sk(sk);
2671 	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2672 	__be32 dest = inet->inet_daddr;
2673 	__be32 src = inet->inet_rcv_saddr;
2674 	__u16 destp = ntohs(inet->inet_dport);
2675 	__u16 srcp = ntohs(inet->inet_sport);
2676 	int rx_queue;
2677 	int state;
2678 
2679 	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2680 	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2681 	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2682 		timer_active	= 1;
2683 		timer_expires	= icsk->icsk_timeout;
2684 	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2685 		timer_active	= 4;
2686 		timer_expires	= icsk->icsk_timeout;
2687 	} else if (timer_pending(&sk->sk_timer)) {
2688 		timer_active	= 2;
2689 		timer_expires	= sk->sk_timer.expires;
2690 	} else {
2691 		timer_active	= 0;
2692 		timer_expires = jiffies;
2693 	}
2694 
2695 	state = inet_sk_state_load(sk);
2696 	if (state == TCP_LISTEN)
2697 		rx_queue = READ_ONCE(sk->sk_ack_backlog);
2698 	else
2699 		/* Because we don't lock the socket,
2700 		 * we might find a transient negative value.
2701 		 */
2702 		rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2703 				      READ_ONCE(tp->copied_seq), 0);
2704 
2705 	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2706 			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2707 		i, src, srcp, dest, destp, state,
2708 		READ_ONCE(tp->write_seq) - tp->snd_una,
2709 		rx_queue,
2710 		timer_active,
2711 		jiffies_delta_to_clock_t(timer_expires - jiffies),
2712 		icsk->icsk_retransmits,
2713 		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2714 		icsk->icsk_probes_out,
2715 		sock_i_ino(sk),
2716 		refcount_read(&sk->sk_refcnt), sk,
2717 		jiffies_to_clock_t(icsk->icsk_rto),
2718 		jiffies_to_clock_t(icsk->icsk_ack.ato),
2719 		(icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2720 		tcp_snd_cwnd(tp),
2721 		state == TCP_LISTEN ?
2722 		    fastopenq->max_qlen :
2723 		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2724 }
2725 
2726 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2727 			       struct seq_file *f, int i)
2728 {
2729 	long delta = tw->tw_timer.expires - jiffies;
2730 	__be32 dest, src;
2731 	__u16 destp, srcp;
2732 
2733 	dest  = tw->tw_daddr;
2734 	src   = tw->tw_rcv_saddr;
2735 	destp = ntohs(tw->tw_dport);
2736 	srcp  = ntohs(tw->tw_sport);
2737 
2738 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2739 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2740 		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2741 		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2742 		refcount_read(&tw->tw_refcnt), tw);
2743 }
2744 
2745 #define TMPSZ 150
2746 
2747 static int tcp4_seq_show(struct seq_file *seq, void *v)
2748 {
2749 	struct tcp_iter_state *st;
2750 	struct sock *sk = v;
2751 
2752 	seq_setwidth(seq, TMPSZ - 1);
2753 	if (v == SEQ_START_TOKEN) {
2754 		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2755 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2756 			   "inode");
2757 		goto out;
2758 	}
2759 	st = seq->private;
2760 
2761 	if (sk->sk_state == TCP_TIME_WAIT)
2762 		get_timewait4_sock(v, seq, st->num);
2763 	else if (sk->sk_state == TCP_NEW_SYN_RECV)
2764 		get_openreq4(v, seq, st->num);
2765 	else
2766 		get_tcp4_sock(v, seq, st->num);
2767 out:
2768 	seq_pad(seq, '\n');
2769 	return 0;
2770 }
2771 
2772 #ifdef CONFIG_BPF_SYSCALL
2773 struct bpf_tcp_iter_state {
2774 	struct tcp_iter_state state;
2775 	unsigned int cur_sk;
2776 	unsigned int end_sk;
2777 	unsigned int max_sk;
2778 	struct sock **batch;
2779 	bool st_bucket_done;
2780 };
2781 
2782 struct bpf_iter__tcp {
2783 	__bpf_md_ptr(struct bpf_iter_meta *, meta);
2784 	__bpf_md_ptr(struct sock_common *, sk_common);
2785 	uid_t uid __aligned(8);
2786 };
2787 
2788 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
2789 			     struct sock_common *sk_common, uid_t uid)
2790 {
2791 	struct bpf_iter__tcp ctx;
2792 
2793 	meta->seq_num--;  /* skip SEQ_START_TOKEN */
2794 	ctx.meta = meta;
2795 	ctx.sk_common = sk_common;
2796 	ctx.uid = uid;
2797 	return bpf_iter_run_prog(prog, &ctx);
2798 }
2799 
2800 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
2801 {
2802 	while (iter->cur_sk < iter->end_sk)
2803 		sock_gen_put(iter->batch[iter->cur_sk++]);
2804 }
2805 
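/* Grow the batch array used to hold referenced sockets between bucket lock
 * sections.  Any sockets still held in the old batch are released first, so
 * the caller is expected to re-walk the bucket after a resize.
 */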
2806 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
2807 				      unsigned int new_batch_sz)
2808 {
2809 	struct sock **new_batch;
2810 
2811 	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
2812 			     GFP_USER | __GFP_NOWARN);
2813 	if (!new_batch)
2814 		return -ENOMEM;
2815 
2816 	bpf_iter_tcp_put_batch(iter);
2817 	kvfree(iter->batch);
2818 	iter->batch = new_batch;
2819 	iter->max_sk = new_batch_sz;
2820 
2821 	return 0;
2822 }
2823 
2824 static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
2825 						 struct sock *start_sk)
2826 {
2827 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2828 	struct bpf_tcp_iter_state *iter = seq->private;
2829 	struct tcp_iter_state *st = &iter->state;
2830 	struct hlist_nulls_node *node;
2831 	unsigned int expected = 1;
2832 	struct sock *sk;
2833 
2834 	sock_hold(start_sk);
2835 	iter->batch[iter->end_sk++] = start_sk;
2836 
2837 	sk = sk_nulls_next(start_sk);
2838 	sk_nulls_for_each_from(sk, node) {
2839 		if (seq_sk_match(seq, sk)) {
2840 			if (iter->end_sk < iter->max_sk) {
2841 				sock_hold(sk);
2842 				iter->batch[iter->end_sk++] = sk;
2843 			}
2844 			expected++;
2845 		}
2846 	}
2847 	spin_unlock(&hinfo->lhash2[st->bucket].lock);
2848 
2849 	return expected;
2850 }
2851 
2852 static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
2853 						   struct sock *start_sk)
2854 {
2855 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2856 	struct bpf_tcp_iter_state *iter = seq->private;
2857 	struct tcp_iter_state *st = &iter->state;
2858 	struct hlist_nulls_node *node;
2859 	unsigned int expected = 1;
2860 	struct sock *sk;
2861 
2862 	sock_hold(start_sk);
2863 	iter->batch[iter->end_sk++] = start_sk;
2864 
2865 	sk = sk_nulls_next(start_sk);
2866 	sk_nulls_for_each_from(sk, node) {
2867 		if (seq_sk_match(seq, sk)) {
2868 			if (iter->end_sk < iter->max_sk) {
2869 				sock_hold(sk);
2870 				iter->batch[iter->end_sk++] = sk;
2871 			}
2872 			expected++;
2873 		}
2874 	}
2875 	spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2876 
2877 	return expected;
2878 }
2879 
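/* Collect the current bucket's sockets under the bucket lock, holding a
 * reference on each, so the bpf program in seq_show() can run with the lock
 * dropped (and can even call setsockopt on full sockets).  If the batch
 * array turns out to be too small for the bucket, resize it once and retry;
 * otherwise return whatever partial batch was gathered.
 */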
2880 static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
2881 {
2882 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2883 	struct bpf_tcp_iter_state *iter = seq->private;
2884 	struct tcp_iter_state *st = &iter->state;
2885 	unsigned int expected;
2886 	bool resized = false;
2887 	struct sock *sk;
2888 
2889 	/* The st->bucket is done.  Directly advance to the next
2890 	 * bucket instead of having tcp_seek_last_pos() skip sockets
2891 	 * one by one in the current bucket and eventually find out
2892 	 * that it has to advance to the next bucket.
2893 	 */
2894 	if (iter->st_bucket_done) {
2895 		st->offset = 0;
2896 		st->bucket++;
2897 		if (st->state == TCP_SEQ_STATE_LISTENING &&
2898 		    st->bucket > hinfo->lhash2_mask) {
2899 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2900 			st->bucket = 0;
2901 		}
2902 	}
2903 
2904 again:
2905 	/* Get a new batch */
2906 	iter->cur_sk = 0;
2907 	iter->end_sk = 0;
2908 	iter->st_bucket_done = false;
2909 
2910 	sk = tcp_seek_last_pos(seq);
2911 	if (!sk)
2912 		return NULL; /* Done */
2913 
2914 	if (st->state == TCP_SEQ_STATE_LISTENING)
2915 		expected = bpf_iter_tcp_listening_batch(seq, sk);
2916 	else
2917 		expected = bpf_iter_tcp_established_batch(seq, sk);
2918 
2919 	if (iter->end_sk == expected) {
2920 		iter->st_bucket_done = true;
2921 		return sk;
2922 	}
2923 
2924 	if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) {
2925 		resized = true;
2926 		goto again;
2927 	}
2928 
2929 	return sk;
2930 }
2931 
2932 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
2933 {
2934 	/* bpf iter does not support lseek, so it always
2935 	 * continues from where it was stop()-ped.
2936 	 */
2937 	if (*pos)
2938 		return bpf_iter_tcp_batch(seq);
2939 
2940 	return SEQ_START_TOKEN;
2941 }
2942 
2943 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2944 {
2945 	struct bpf_tcp_iter_state *iter = seq->private;
2946 	struct tcp_iter_state *st = &iter->state;
2947 	struct sock *sk;
2948 
2949 	/* Whenever seq_next() is called, the iter->cur_sk is
2950 	 * done with seq_show(), so advance to the next sk in
2951 	 * the batch.
2952 	 */
2953 	if (iter->cur_sk < iter->end_sk) {
2954 		/* Keeping st->num consistent in tcp_iter_state.
2955 		 * bpf_iter_tcp does not use st->num.
2956 		 * meta.seq_num is used instead.
2957 		 */
2958 		st->num++;
2959 		/* Move st->offset to the next sk in the bucket such that
2960 		 * the future start() will resume at st->offset in
2961 		 * st->bucket.  See tcp_seek_last_pos().
2962 		 */
2963 		st->offset++;
2964 		sock_gen_put(iter->batch[iter->cur_sk++]);
2965 	}
2966 
2967 	if (iter->cur_sk < iter->end_sk)
2968 		sk = iter->batch[iter->cur_sk];
2969 	else
2970 		sk = bpf_iter_tcp_batch(seq);
2971 
2972 	++*pos;
2973 	/* Keeping st->last_pos consistent in tcp_iter_state.
2974 	 * bpf iter does not do lseek, so st->last_pos always equals *pos.
2975 	 */
2976 	st->last_pos = *pos;
2977 	return sk;
2978 }
2979 
2980 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
2981 {
2982 	struct bpf_iter_meta meta;
2983 	struct bpf_prog *prog;
2984 	struct sock *sk = v;
2985 	uid_t uid;
2986 	int ret;
2987 
2988 	if (v == SEQ_START_TOKEN)
2989 		return 0;
2990 
2991 	if (sk_fullsock(sk))
2992 		lock_sock(sk);
2993 
2994 	if (unlikely(sk_unhashed(sk))) {
2995 		ret = SEQ_SKIP;
2996 		goto unlock;
2997 	}
2998 
2999 	if (sk->sk_state == TCP_TIME_WAIT) {
3000 		uid = 0;
3001 	} else if (sk->sk_state == TCP_NEW_SYN_RECV) {
3002 		const struct request_sock *req = v;
3003 
3004 		uid = from_kuid_munged(seq_user_ns(seq),
3005 				       sock_i_uid(req->rsk_listener));
3006 	} else {
3007 		uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
3008 	}
3009 
3010 	meta.seq = seq;
3011 	prog = bpf_iter_get_info(&meta, false);
3012 	ret = tcp_prog_seq_show(prog, &meta, v, uid);
3013 
3014 unlock:
3015 	if (sk_fullsock(sk))
3016 		release_sock(sk);
3017 	return ret;
3018 
3019 }
3020 
3021 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
3022 {
3023 	struct bpf_tcp_iter_state *iter = seq->private;
3024 	struct bpf_iter_meta meta;
3025 	struct bpf_prog *prog;
3026 
3027 	if (!v) {
3028 		meta.seq = seq;
3029 		prog = bpf_iter_get_info(&meta, true);
3030 		if (prog)
3031 			(void)tcp_prog_seq_show(prog, &meta, v, 0);
3032 	}
3033 
3034 	if (iter->cur_sk < iter->end_sk) {
3035 		bpf_iter_tcp_put_batch(iter);
3036 		iter->st_bucket_done = false;
3037 	}
3038 }
3039 
3040 static const struct seq_operations bpf_iter_tcp_seq_ops = {
3041 	.show		= bpf_iter_tcp_seq_show,
3042 	.start		= bpf_iter_tcp_seq_start,
3043 	.next		= bpf_iter_tcp_seq_next,
3044 	.stop		= bpf_iter_tcp_seq_stop,
3045 };
3046 #endif
3047 static unsigned short seq_file_family(const struct seq_file *seq)
3048 {
3049 	const struct tcp_seq_afinfo *afinfo;
3050 
3051 #ifdef CONFIG_BPF_SYSCALL
3052 	/* Iterated from bpf_iter.  Let the bpf prog filter instead. */
3053 	if (seq->op == &bpf_iter_tcp_seq_ops)
3054 		return AF_UNSPEC;
3055 #endif
3056 
3057 	/* Iterated from proc fs */
3058 	afinfo = pde_data(file_inode(seq->file));
3059 	return afinfo->family;
3060 }
3061 
3062 static const struct seq_operations tcp4_seq_ops = {
3063 	.show		= tcp4_seq_show,
3064 	.start		= tcp_seq_start,
3065 	.next		= tcp_seq_next,
3066 	.stop		= tcp_seq_stop,
3067 };
3068 
3069 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
3070 	.family		= AF_INET,
3071 };
3072 
3073 static int __net_init tcp4_proc_init_net(struct net *net)
3074 {
3075 	if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
3076 			sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
3077 		return -ENOMEM;
3078 	return 0;
3079 }
3080 
3081 static void __net_exit tcp4_proc_exit_net(struct net *net)
3082 {
3083 	remove_proc_entry("tcp", net->proc_net);
3084 }
3085 
3086 static struct pernet_operations tcp4_net_ops = {
3087 	.init = tcp4_proc_init_net,
3088 	.exit = tcp4_proc_exit_net,
3089 };
3090 
3091 int __init tcp4_proc_init(void)
3092 {
3093 	return register_pernet_subsys(&tcp4_net_ops);
3094 }
3095 
3096 void tcp4_proc_exit(void)
3097 {
3098 	unregister_pernet_subsys(&tcp4_net_ops);
3099 }
3100 #endif /* CONFIG_PROC_FS */
3101 
3102 /* @wake is one when sk_stream_write_space() calls us.
3103  * This sends EPOLLOUT only if notsent_bytes is less than half the limit.
3104  * This mimics the strategy used in sock_def_write_space().
3105  */
3106 bool tcp_stream_memory_free(const struct sock *sk, int wake)
3107 {
3108 	const struct tcp_sock *tp = tcp_sk(sk);
3109 	u32 notsent_bytes = READ_ONCE(tp->write_seq) -
3110 			    READ_ONCE(tp->snd_nxt);
3111 
3112 	return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
3113 }
3114 EXPORT_SYMBOL(tcp_stream_memory_free);
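/* Worked example for tcp_stream_memory_free(), assuming a notsent_lowat of
 * 128 kB and 96 kB of not-yet-sent data: a plain poll (wake == 0) sees
 * (96k << 0) < 128k and reports the socket writable, while the wakeup path
 * (wake == 1) needs (96k << 1) < 128k and stays quiet until the unsent
 * backlog drops below 64 kB.  Userspace typically sets the limit with
 * setsockopt(fd, IPPROTO_TCP, TCP_NOTSENT_LOWAT, &val, sizeof(val)).
 */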
3115 
3116 struct proto tcp_prot = {
3117 	.name			= "TCP",
3118 	.owner			= THIS_MODULE,
3119 	.close			= tcp_close,
3120 	.pre_connect		= tcp_v4_pre_connect,
3121 	.connect		= tcp_v4_connect,
3122 	.disconnect		= tcp_disconnect,
3123 	.accept			= inet_csk_accept,
3124 	.ioctl			= tcp_ioctl,
3125 	.init			= tcp_v4_init_sock,
3126 	.destroy		= tcp_v4_destroy_sock,
3127 	.shutdown		= tcp_shutdown,
3128 	.setsockopt		= tcp_setsockopt,
3129 	.getsockopt		= tcp_getsockopt,
3130 	.bpf_bypass_getsockopt	= tcp_bpf_bypass_getsockopt,
3131 	.keepalive		= tcp_set_keepalive,
3132 	.recvmsg		= tcp_recvmsg,
3133 	.sendmsg		= tcp_sendmsg,
3134 	.splice_eof		= tcp_splice_eof,
3135 	.backlog_rcv		= tcp_v4_do_rcv,
3136 	.release_cb		= tcp_release_cb,
3137 	.hash			= inet_hash,
3138 	.unhash			= inet_unhash,
3139 	.get_port		= inet_csk_get_port,
3140 	.put_port		= inet_put_port,
3141 #ifdef CONFIG_BPF_SYSCALL
3142 	.psock_update_sk_prot	= tcp_bpf_update_proto,
3143 #endif
3144 	.enter_memory_pressure	= tcp_enter_memory_pressure,
3145 	.leave_memory_pressure	= tcp_leave_memory_pressure,
3146 	.stream_memory_free	= tcp_stream_memory_free,
3147 	.sockets_allocated	= &tcp_sockets_allocated,
3148 	.orphan_count		= &tcp_orphan_count,
3149 
3150 	.memory_allocated	= &tcp_memory_allocated,
3151 	.per_cpu_fw_alloc	= &tcp_memory_per_cpu_fw_alloc,
3152 
3153 	.memory_pressure	= &tcp_memory_pressure,
3154 	.sysctl_mem		= sysctl_tcp_mem,
3155 	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
3156 	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
3157 	.max_header		= MAX_TCP_HEADER,
3158 	.obj_size		= sizeof(struct tcp_sock),
3159 	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
3160 	.twsk_prot		= &tcp_timewait_sock_ops,
3161 	.rsk_prot		= &tcp_request_sock_ops,
3162 	.h.hashinfo		= NULL,
3163 	.no_autobind		= true,
3164 	.diag_destroy		= tcp_abort,
3165 };
3166 EXPORT_SYMBOL(tcp_prot);
3167 
3168 static void __net_exit tcp_sk_exit(struct net *net)
3169 {
3170 	if (net->ipv4.tcp_congestion_control)
3171 		bpf_module_put(net->ipv4.tcp_congestion_control,
3172 			       net->ipv4.tcp_congestion_control->owner);
3173 }
3174 
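/* Pick the ehash table for a new netns: if the creating netns has
 * sysctl_tcp_child_ehash_entries set, allocate a private hash of that size
 * (rounded up to a power of two); otherwise, or on allocation failure, fall
 * back to the global tcp_hashinfo.  max_tw_buckets and max_syn_backlog are
 * then scaled from the resulting table size.
 */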
3175 static void __net_init tcp_set_hashinfo(struct net *net)
3176 {
3177 	struct inet_hashinfo *hinfo;
3178 	unsigned int ehash_entries;
3179 	struct net *old_net;
3180 
3181 	if (net_eq(net, &init_net))
3182 		goto fallback;
3183 
3184 	old_net = current->nsproxy->net_ns;
3185 	ehash_entries = READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries);
3186 	if (!ehash_entries)
3187 		goto fallback;
3188 
3189 	ehash_entries = roundup_pow_of_two(ehash_entries);
3190 	hinfo = inet_pernet_hashinfo_alloc(&tcp_hashinfo, ehash_entries);
3191 	if (!hinfo) {
3192 		pr_warn("Failed to allocate TCP ehash (entries: %u) "
3193 			"for a netns, fallback to the global one\n",
3194 			ehash_entries);
3195 fallback:
3196 		hinfo = &tcp_hashinfo;
3197 		ehash_entries = tcp_hashinfo.ehash_mask + 1;
3198 	}
3199 
3200 	net->ipv4.tcp_death_row.hashinfo = hinfo;
3201 	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2;
3202 	net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128);
3203 }
3204 
3205 static int __net_init tcp_sk_init(struct net *net)
3206 {
3207 	net->ipv4.sysctl_tcp_ecn = 2;
3208 	net->ipv4.sysctl_tcp_ecn_fallback = 1;
3209 
3210 	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
3211 	net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
3212 	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
3213 	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
3214 	net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
3215 
3216 	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
3217 	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
3218 	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
3219 
3220 	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
3221 	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
3222 	net->ipv4.sysctl_tcp_syncookies = 1;
3223 	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
3224 	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
3225 	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
3226 	net->ipv4.sysctl_tcp_orphan_retries = 0;
3227 	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
3228 	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
3229 	net->ipv4.sysctl_tcp_tw_reuse = 2;
3230 	net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
3231 
3232 	refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1);
3233 	tcp_set_hashinfo(net);
3234 
3235 	net->ipv4.sysctl_tcp_sack = 1;
3236 	net->ipv4.sysctl_tcp_window_scaling = 1;
3237 	net->ipv4.sysctl_tcp_timestamps = 1;
3238 	net->ipv4.sysctl_tcp_early_retrans = 3;
3239 	net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
3240 	net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
3241 	net->ipv4.sysctl_tcp_retrans_collapse = 1;
3242 	net->ipv4.sysctl_tcp_max_reordering = 300;
3243 	net->ipv4.sysctl_tcp_dsack = 1;
3244 	net->ipv4.sysctl_tcp_app_win = 31;
3245 	net->ipv4.sysctl_tcp_adv_win_scale = 1;
3246 	net->ipv4.sysctl_tcp_frto = 2;
3247 	net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
3248 	/* This limits the percentage of the congestion window which we
3249 	 * will allow a single TSO frame to consume.  Building TSO frames
3250 	 * which are too large can cause TCP streams to be bursty.
3251 	 */
3252 	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
3253 	/* Default TSQ limit of 16 TSO segments */
3254 	net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
3255 
3256 	/* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. */
3257 	net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX;
3258 
3259 	net->ipv4.sysctl_tcp_min_tso_segs = 2;
3260 	net->ipv4.sysctl_tcp_tso_rtt_log = 9;  /* 2^9 = 512 usec */
3261 	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
3262 	net->ipv4.sysctl_tcp_autocorking = 1;
3263 	net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
3264 	net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
3265 	net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
3266 	if (net != &init_net) {
3267 		memcpy(net->ipv4.sysctl_tcp_rmem,
3268 		       init_net.ipv4.sysctl_tcp_rmem,
3269 		       sizeof(init_net.ipv4.sysctl_tcp_rmem));
3270 		memcpy(net->ipv4.sysctl_tcp_wmem,
3271 		       init_net.ipv4.sysctl_tcp_wmem,
3272 		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
3273 	}
3274 	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
3275 	net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
3276 	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
3277 	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
3278 	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
3279 	atomic_set(&net->ipv4.tfo_active_disable_times, 0);
3280 
3281 	/* Set default values for PLB */
3282 	net->ipv4.sysctl_tcp_plb_enabled = 0; /* Disabled by default */
3283 	net->ipv4.sysctl_tcp_plb_idle_rehash_rounds = 3;
3284 	net->ipv4.sysctl_tcp_plb_rehash_rounds = 12;
3285 	net->ipv4.sysctl_tcp_plb_suspend_rto_sec = 60;
3286 	/* Default congestion threshold for PLB to mark a round is 50% */
3287 	net->ipv4.sysctl_tcp_plb_cong_thresh = (1 << TCP_PLB_SCALE) / 2;
3288 
3289 	/* Reno is always built in */
3290 	if (!net_eq(net, &init_net) &&
3291 	    bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
3292 			       init_net.ipv4.tcp_congestion_control->owner))
3293 		net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
3294 	else
3295 		net->ipv4.tcp_congestion_control = &tcp_reno;
3296 
3297 	net->ipv4.sysctl_tcp_syn_linear_timeouts = 4;
3298 	net->ipv4.sysctl_tcp_shrink_window = 0;
3299 
3300 	return 0;
3301 }
3302 
3303 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
3304 {
3305 	struct net *net;
3306 
3307 	tcp_twsk_purge(net_exit_list, AF_INET);
3308 
3309 	list_for_each_entry(net, net_exit_list, exit_list) {
3310 		inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo);
3311 		WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount));
3312 		tcp_fastopen_ctx_destroy(net);
3313 	}
3314 }
3315 
3316 static struct pernet_operations __net_initdata tcp_sk_ops = {
3317        .init	   = tcp_sk_init,
3318        .exit	   = tcp_sk_exit,
3319        .exit_batch = tcp_sk_exit_batch,
3320 };
3321 
3322 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3323 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
3324 		     struct sock_common *sk_common, uid_t uid)
3325 
3326 #define INIT_BATCH_SZ 16
3327 
3328 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
3329 {
3330 	struct bpf_tcp_iter_state *iter = priv_data;
3331 	int err;
3332 
3333 	err = bpf_iter_init_seq_net(priv_data, aux);
3334 	if (err)
3335 		return err;
3336 
3337 	err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ);
3338 	if (err) {
3339 		bpf_iter_fini_seq_net(priv_data);
3340 		return err;
3341 	}
3342 
3343 	return 0;
3344 }
3345 
3346 static void bpf_iter_fini_tcp(void *priv_data)
3347 {
3348 	struct bpf_tcp_iter_state *iter = priv_data;
3349 
3350 	bpf_iter_fini_seq_net(priv_data);
3351 	kvfree(iter->batch);
3352 }
3353 
3354 static const struct bpf_iter_seq_info tcp_seq_info = {
3355 	.seq_ops		= &bpf_iter_tcp_seq_ops,
3356 	.init_seq_private	= bpf_iter_init_tcp,
3357 	.fini_seq_private	= bpf_iter_fini_tcp,
3358 	.seq_priv_size		= sizeof(struct bpf_tcp_iter_state),
3359 };
3360 
3361 static const struct bpf_func_proto *
3362 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
3363 			    const struct bpf_prog *prog)
3364 {
3365 	switch (func_id) {
3366 	case BPF_FUNC_setsockopt:
3367 		return &bpf_sk_setsockopt_proto;
3368 	case BPF_FUNC_getsockopt:
3369 		return &bpf_sk_getsockopt_proto;
3370 	default:
3371 		return NULL;
3372 	}
3373 }
3374 
3375 static struct bpf_iter_reg tcp_reg_info = {
3376 	.target			= "tcp",
3377 	.ctx_arg_info_size	= 1,
3378 	.ctx_arg_info		= {
3379 		{ offsetof(struct bpf_iter__tcp, sk_common),
3380 		  PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED },
3381 	},
3382 	.get_func_proto		= bpf_iter_tcp_get_func_proto,
3383 	.seq_info		= &tcp_seq_info,
3384 };
3385 
3386 static void __init bpf_iter_register(void)
3387 {
3388 	tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3389 	if (bpf_iter_reg_target(&tcp_reg_info))
3390 		pr_warn("Warning: could not register bpf iterator tcp\n");
3391 }
3392 
3393 #endif
3394 
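/* Boot-time IPv4 TCP setup: create one raw control socket per possible CPU
 * (used by the stack when it must emit RSTs and ACKs that are not tied to a
 * full socket, hence the IP_PMTUDISC_DO setting below) and register the
 * per-netns init/exit operations defined above.
 */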
3395 void __init tcp_v4_init(void)
3396 {
3397 	int cpu, res;
3398 
3399 	for_each_possible_cpu(cpu) {
3400 		struct sock *sk;
3401 
3402 		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
3403 					   IPPROTO_TCP, &init_net);
3404 		if (res)
3405 			panic("Failed to create the TCP control socket.\n");
3406 		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
3407 
3408 		/* Enforce IP_DF and IPID == 0 for RSTs and
3409 		 * ACKs sent in SYN-RECV and TIME-WAIT state.
3410 		 */
3411 		inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
3412 
3413 		per_cpu(ipv4_tcp_sk, cpu) = sk;
3414 	}
3415 	if (register_pernet_subsys(&tcp_sk_ops))
3416 		panic("Failed to create the TCP control socket.\n");
3417 
3418 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3419 	bpf_iter_register();
3420 #endif
3421 }
3422