xref: /openbmc/linux/net/ipv4/tcp_ipv4.c (revision 997a5310)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		Implementation of the Transmission Control Protocol(TCP).
8  *
9  *		IPv4 specific functions
10  *
11  *		code split from:
12  *		linux/ipv4/tcp.c
13  *		linux/ipv4/tcp_input.c
14  *		linux/ipv4/tcp_output.c
15  *
16  *		See tcp.c for author information
17  */
18 
19 /*
20  * Changes:
21  *		David S. Miller	:	New socket lookup architecture.
22  *					This code is dedicated to John Dyson.
23  *		David S. Miller :	Change semantics of established hash,
24  *					half is devoted to TIME_WAIT sockets
25  *					and the rest go in the other half.
26  *		Andi Kleen :		Add support for syncookies and fixed
27  *					some bugs: ip options weren't passed to
28  *					the TCP layer, missed a check for an
29  *					ACK bit.
30  *		Andi Kleen :		Implemented fast path mtu discovery.
31  *	     				Fixed many serious bugs in the
32  *					request_sock handling and moved
33  *					most of it into the af independent code.
34  *					Added tail drop and some other bugfixes.
35  *					Added new listen semantics.
36  *		Mike McLagan	:	Routing by source
37  *	Juan Jose Ciarlante:		ip_dynaddr bits
38  *		Andi Kleen:		various fixes.
39  *	Vitaly E. Lavrov	:	Transparent proxy revived after year
40  *					coma.
41  *	Andi Kleen		:	Fix new listen.
42  *	Andi Kleen		:	Fix accept error reporting.
43  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
44  *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
45  *					a single port at the same time.
46  */
47 
48 #define pr_fmt(fmt) "TCP: " fmt
49 
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
60 #include <linux/sched.h>
61 
62 #include <net/net_namespace.h>
63 #include <net/icmp.h>
64 #include <net/inet_hashtables.h>
65 #include <net/tcp.h>
66 #include <net/transp_v6.h>
67 #include <net/ipv6.h>
68 #include <net/inet_common.h>
69 #include <net/timewait_sock.h>
70 #include <net/xfrm.h>
71 #include <net/secure_seq.h>
72 #include <net/busy_poll.h>
73 
74 #include <linux/inet.h>
75 #include <linux/ipv6.h>
76 #include <linux/stddef.h>
77 #include <linux/proc_fs.h>
78 #include <linux/seq_file.h>
79 #include <linux/inetdevice.h>
80 #include <linux/btf_ids.h>
81 
82 #include <crypto/hash.h>
83 #include <linux/scatterlist.h>
84 
85 #include <trace/events/tcp.h>
86 
87 #ifdef CONFIG_TCP_MD5SIG
88 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
89 			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
90 #endif
91 
92 struct inet_hashinfo tcp_hashinfo;
93 EXPORT_SYMBOL(tcp_hashinfo);
94 
95 static DEFINE_PER_CPU(struct sock *, ipv4_tcp_sk);
96 
97 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
98 {
99 	return secure_tcp_seq(ip_hdr(skb)->daddr,
100 			      ip_hdr(skb)->saddr,
101 			      tcp_hdr(skb)->dest,
102 			      tcp_hdr(skb)->source);
103 }
104 
105 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
106 {
107 	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
108 }
109 
/* Decide whether the TIME-WAIT socket @sktw occupying our 4-tuple may be
 * reused for a new outgoing connection from @sk.
 *
 * Returns 1 if reuse is permitted — in which case a reference on @sktw
 * has been taken — and 0 otherwise.  A recorded tw_ts_recent_stamp is
 * always required; when @twp is NULL the recency comparison against the
 * current time is skipped.
 */
int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
	int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
	const struct inet_timewait_sock *tw = inet_twsk(sktw);
	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
	struct tcp_sock *tp = tcp_sk(sk);

	/* sysctl_tcp_tw_reuse == 2: only allow reuse for loopback traffic. */
	if (reuse == 2) {
		/* Still does not detect *everything* that goes through
		 * lo, since we require a loopback src or dst address
		 * or direct binding to 'lo' interface.
		 */
		bool loopback = false;
		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
			loopback = true;
#if IS_ENABLED(CONFIG_IPV6)
		if (tw->tw_family == AF_INET6) {
			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
				loopback = true;
		} else
#endif
		{
			if (ipv4_is_loopback(tw->tw_daddr) ||
			    ipv4_is_loopback(tw->tw_rcv_saddr))
				loopback = true;
		}
		if (!loopback)
			reuse = 0;
	}

	/* With PAWS, it is safe from the viewpoint
	   of data integrity. Even without PAWS it is safe provided sequence
	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.

	   Actually, the idea is close to VJ's one, only timestamp cache is
	   held not per host, but per port pair and TW bucket is used as state
	   holder.

	   If TW bucket has been already destroyed we fall back to VJ's scheme
	   and use initial timestamp retrieved from peer table.
	 */
	if (tcptw->tw_ts_recent_stamp &&
	    (!twp || (reuse && time_after32(ktime_get_seconds(),
					    tcptw->tw_ts_recent_stamp)))) {
		/* inet_twsk_hashdance() sets sk_refcnt after putting twsk
		 * and releasing the bucket lock.
		 */
		if (unlikely(!refcount_inc_not_zero(&sktw->sk_refcnt)))
			return 0;

		/* In case of repair and re-using TIME-WAIT sockets we still
		 * want to be sure that it is safe as above but honor the
		 * sequence numbers and time stamps set as part of the repair
		 * process.
		 *
		 * Without this check re-using a TIME-WAIT socket with TCP
		 * repair would accumulate a -1 on the repair assigned
		 * sequence number. The first time it is reused the sequence
		 * is -1, the second time -2, etc. This fixes that issue
		 * without appearing to create any others.
		 */
		if (likely(!tp->repair)) {
			/* Skip past the old incarnation's maximum unscaled
			 * send window (65535); 0 is avoided, use 1 instead.
			 */
			u32 seq = tcptw->tw_snd_nxt + 65535 + 2;

			if (!seq)
				seq = 1;
			WRITE_ONCE(tp->write_seq, seq);
			tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
			tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
		}

		return 1;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(tcp_twsk_unique);
190 
/* Pre-connect hook: run any attached cgroup BPF INET4_CONNECT programs
 * before tcp_v4_connect() proper.  Returns 0 on success or a negative
 * errno (from the length check or from the BPF program).
 */
static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
			      int addr_len)
{
	/* This check is replicated from tcp_v4_connect() and intended to
	 * prevent BPF program called below from accessing bytes that are out
	 * of the bound specified by user in addr_len.
	 */
	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	/* Caller must hold the socket lock. */
	sock_owned_by_me(sk);

	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr, &addr_len);
}
205 
/* This will initiate an outgoing connection.
 *
 * Validates @uaddr, resolves a route to the destination (honoring any
 * source-route option), selects source address and port, hashes the
 * socket, initializes sequence number and timestamp offset, and finally
 * transmits the SYN via tcp_connect().  Returns 0 on success or a
 * negative errno; on failure the socket is returned to TCP_CLOSE and
 * its port binding released.
 */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct inet_timewait_death_row *tcp_death_row;
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct ip_options_rcu *inet_opt;
	struct net *net = sock_net(sk);
	__be16 orig_sport, orig_dport;
	__be32 daddr, nexthop;
	struct flowi4 *fl4;
	struct rtable *rt;
	int err;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	inet_opt = rcu_dereference_protected(inet->inet_opt,
					     lockdep_sock_is_held(sk));
	/* With a strict/loose source-route option the first hop differs
	 * from the final destination.
	 */
	if (inet_opt && inet_opt->opt.srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet_opt->opt.faddr;
	}

	orig_sport = inet->inet_sport;
	orig_dport = usin->sin_port;
	fl4 = &inet->cork.fl.u.ip4;
	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
			      sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
			      orig_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		if (err == -ENETUNREACH)
			IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
		return err;
	}

	/* TCP has no business with multicast/broadcast destinations. */
	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet_opt || !inet_opt->opt.srr)
		daddr = fl4->daddr;

	tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;

	if (!inet->inet_saddr) {
		/* No source address yet: adopt the one the route chose. */
		err = inet_bhash2_update_saddr(sk,  &fl4->saddr, AF_INET);
		if (err) {
			ip_rt_put(rt);
			return err;
		}
	} else {
		sk_rcv_saddr_set(sk, inet->inet_saddr);
	}

	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent	   = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		if (likely(!tp->repair))
			WRITE_ONCE(tp->write_seq, 0);
	}

	inet->inet_dport = usin->sin_port;
	sk_daddr_set(sk, daddr);

	inet_csk(sk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;

	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and not releasing socket
	 * lock select source port, enter ourselves into the hash tables and
	 * complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(tcp_death_row, sk);
	if (err)
		goto failure;

	sk_set_txhash(sk);

	/* Re-resolve the route now that both ports are final. */
	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
			       inet->inet_sport, inet->inet_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		rt = NULL;
		goto failure;
	}
	/* OK, now commit destination to socket.  */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->dst);
	rt = NULL;

	if (likely(!tp->repair)) {
		if (!tp->write_seq)
			WRITE_ONCE(tp->write_seq,
				   secure_tcp_seq(inet->inet_saddr,
						  inet->inet_daddr,
						  inet->inet_sport,
						  usin->sin_port));
		WRITE_ONCE(tp->tsoffset,
			   secure_tcp_ts_off(net, inet->inet_saddr,
					     inet->inet_daddr));
	}

	atomic_set(&inet->inet_id, get_random_u16());

	/* TFO defer-connect: pretend success now, send data with the SYN. */
	if (tcp_fastopen_defer_connect(sk, &err))
		return err;
	if (err)
		goto failure;

	err = tcp_connect(sk);

	if (err)
		goto failure;

	return 0;

failure:
	/*
	 * This unhashes the socket and releases the local port,
	 * if necessary.
	 */
	tcp_set_state(sk, TCP_CLOSE);
	inet_bhash2_reset_saddr(sk);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->inet_dport = 0;
	return err;
}
EXPORT_SYMBOL(tcp_v4_connect);
349 
/*
 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 * It can be called through tcp_release_cb() if socket was owned by user
 * at the time tcp_v4_err() was called to handle ICMP message.
 *
 * The new MTU is read from tp->mtu_info, which tcp_v4_err() stored
 * before deferring to us.
 */
void tcp_v4_mtu_reduced(struct sock *sk)
{
	struct inet_sock *inet = inet_sk(sk);
	struct dst_entry *dst;
	u32 mtu;

	/* Nothing to do for sockets with no established path. */
	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
		return;
	mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
	dst = inet_csk_update_pmtu(sk, mtu);
	if (!dst)
		return;

	/* Something is about to be wrong... Remember soft error
	 * for the case, if this connection will not able to recover.
	 */
	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
		WRITE_ONCE(sk->sk_err_soft, EMSGSIZE);

	mtu = dst_mtu(dst);

	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    ip_sk_accept_pmtu(sk) &&
	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
		tcp_sync_mss(sk, mtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}
EXPORT_SYMBOL(tcp_v4_mtu_reduced);
390 
391 static void do_redirect(struct sk_buff *skb, struct sock *sk)
392 {
393 	struct dst_entry *dst = __sk_dst_check(sk, 0);
394 
395 	if (dst)
396 		dst->ops->redirect(dst, sk, skb);
397 }
398 

/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets
 *
 * @sk:    the request socket the ICMP was matched to
 * @seq:   sequence number carried in the quoted TCP header
 * @abort: true if the ICMP type/code warrants dropping the request
 *
 * Consumes the caller's reference on the request socket.
 */
void tcp_req_err(struct sock *sk, u32 seq, bool abort)
{
	struct request_sock *req = inet_reqsk(sk);
	struct net *net = sock_net(sk);

	/* ICMPs are not backlogged, hence we cannot get
	 * an established socket here.
	 */
	if (seq != tcp_rsk(req)->snt_isn) {
		/* Quoted seq does not match our SYN's ISN: bogus/stale ICMP. */
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
	} else if (abort) {
		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
		tcp_listendrop(req->rsk_listener);
	}
	reqsk_put(req);
}
EXPORT_SYMBOL(tcp_req_err);
424 
/* TCP-LD (RFC 6069) logic: an ICMP unreachable hints the earlier loss
 * was due to a temporary routing problem, so undo one step of the RTO
 * exponential backoff and rearm (or fire) the retransmit timer.
 */
void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;
	s32 remaining;
	u32 delta_us;

	/* Cannot safely touch timers while the user owns the socket. */
	if (sock_owned_by_user(sk))
		return;

	/* Only act when the ICMP quotes the head of the unacked data and
	 * we are actually in backed-off retransmission.
	 */
	if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
	    !icsk->icsk_backoff)
		return;

	skb = tcp_rtx_queue_head(sk);
	if (WARN_ON_ONCE(!skb))
		return;

	/* Undo one backoff step and recompute the RTO from srtt. */
	icsk->icsk_backoff--;
	icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
	icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);

	tcp_mstamp_refresh(tp);
	/* Time already spent waiting since the head skb was (re)sent. */
	delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
	remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);

	if (remaining > 0) {
		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
					  remaining, TCP_RTO_MAX);
	} else {
		/* RTO revert clocked out retransmission.
		 * Will retransmit now.
		 */
		tcp_retransmit_timer(sk);
	}
}
EXPORT_SYMBOL(tcp_ld_RTO_revert);
464 
/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 *
 */

int tcp_v4_err(struct sk_buff *skb, u32 info)
{
	/* skb->data points at the quoted original IP header. */
	const struct iphdr *iph = (const struct iphdr *)skb->data;
	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
	struct tcp_sock *tp;
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	struct sock *sk;
	struct request_sock *fastopen;
	u32 seq, snd_una;
	int err;
	struct net *net = dev_net(skb->dev);

	/* Look up the socket the quoted segment belonged to.  Note the
	 * quoted header has our addresses/ports in the *send* direction.
	 */
	sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
				       iph->daddr, th->dest, iph->saddr,
				       ntohs(th->source), inet_iif(skb), 0);
	if (!sk) {
		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
		return -ENOENT;
	}
	if (sk->sk_state == TCP_TIME_WAIT) {
		inet_twsk_put(inet_twsk(sk));
		return 0;
	}
	seq = ntohl(th->seq);
	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		/* Request sockets are handled separately; abort only on
		 * hard errors.
		 */
		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
				     type == ICMP_TIME_EXCEEDED ||
				     (type == ICMP_DEST_UNREACH &&
				      (code == ICMP_NET_UNREACH ||
				       code == ICMP_HOST_UNREACH)));
		return 0;
	}

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 * We do take care of PMTU discovery (RFC1191) special case :
	 * we can receive locally generated ICMP messages while socket is held.
	 */
	if (sock_owned_by_user(sk)) {
		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
	}
	if (sk->sk_state == TCP_CLOSE)
		goto out;

	if (static_branch_unlikely(&ip4_min_ttl)) {
		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
			goto out;
		}
	}

	tp = tcp_sk(sk);
	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
	fastopen = rcu_dereference(tp->fastopen_rsk);
	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
	/* Quoted sequence must fall inside our outstanding window. */
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, snd_una, tp->snd_nxt)) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	switch (type) {
	case ICMP_REDIRECT:
		if (!sock_owned_by_user(sk))
			do_redirect(skb, sk);
		goto out;
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			/* We are not interested in TCP_LISTEN and open_requests
			 * (SYN-ACKs send out by Linux are always <576bytes so
			 * they should go through unfragmented).
			 */
			if (sk->sk_state == TCP_LISTEN)
				goto out;

			/* Stash the new MTU; apply now, or defer to
			 * tcp_release_cb() if the user owns the socket.
			 */
			WRITE_ONCE(tp->mtu_info, info);
			if (!sock_owned_by_user(sk)) {
				tcp_v4_mtu_reduced(sk);
			} else {
				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
					sock_hold(sk);
			}
			goto out;
		}

		err = icmp_err_convert[code].errno;
		/* check if this ICMP message allows revert of backoff.
		 * (see RFC 6069)
		 */
		if (!fastopen &&
		    (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
			tcp_ld_RTO_revert(sk, seq);
		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->sk_state) {
	case TCP_SYN_SENT:
	case TCP_SYN_RECV:
		/* Only in fast or simultaneous open. If a fast open socket is
		 * already accepted it is treated as a connected one below.
		 */
		if (fastopen && !fastopen->sk)
			break;

		ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);

		if (!sock_owned_by_user(sk)) {
			/* Connection attempt failed hard: report and close. */
			WRITE_ONCE(sk->sk_err, err);

			sk_error_report(sk);

			tcp_done(sk);
		} else {
			WRITE_ONCE(sk->sk_err_soft, err);
		}
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows to consider as hard errors
	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note, that in modern internet, where routing is unreliable
	 * and in each dark corner broken firewalls sit, sending random
	 * errors ordered by their masters even this two messages finally lose
	 * their original sense (even Linux sends invalid PORT_UNREACHs)
	 *
	 * Now we are in compliance with RFCs.
	 *							--ANK (980905)
	 */

	if (!sock_owned_by_user(sk) &&
	    inet_test_bit(RECVERR, sk)) {
		WRITE_ONCE(sk->sk_err, err);
		sk_error_report(sk);
	} else	{ /* Only an error on timeout */
		WRITE_ONCE(sk->sk_err_soft, err);
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
	return 0;
}
646 
647 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
648 {
649 	struct tcphdr *th = tcp_hdr(skb);
650 
651 	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
652 	skb->csum_start = skb_transport_header(skb) - skb->head;
653 	skb->csum_offset = offsetof(struct tcphdr, check);
654 }
655 
656 /* This routine computes an IPv4 TCP checksum. */
657 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
658 {
659 	const struct inet_sock *inet = inet_sk(sk);
660 
661 	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
662 }
663 EXPORT_SYMBOL(tcp_v4_send_check);
664 
665 /*
666  *	This routine will send an RST to the other tcp.
667  *
668  *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
669  *		      for reset.
670  *	Answer: if a packet caused RST, it is not for a socket
671  *		existing in our system, if it is matched to a socket,
672  *		it is just duplicate segment or bug in other side's TCP.
673  *		So that we build reply only basing on parameters
674  *		arrived with segment.
675  *	Exception: precedence violation. We do not implement it in any case.
676  */
677 
#ifdef CONFIG_TCP_MD5SIG
#define OPTION_BYTES TCPOLEN_MD5SIG_ALIGNED
#else
#define OPTION_BYTES sizeof(__be32)
#endif

/* Build and transmit a RST in response to @skb.  @sk may be NULL (no
 * matching socket), a full socket, or a timewait socket; it is only
 * used to pick mark/priority/txhash and transparent-mode flags.
 * Transmission goes through the per-CPU control socket.
 */
static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
		__be32 opt[OPTION_BYTES / sizeof(__be32)];
	} rep;
	struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key = NULL;
	const __u8 *hash_location = NULL;
	unsigned char newhash[16];
	int genhash;
	struct sock *sk1 = NULL;
#endif
	u64 transmit_time = 0;
	struct sock *ctl_sk;
	struct net *net;
	u32 txhash = 0;

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	/* If sk not NULL, it means we did a successful lookup and incoming
	 * route had to be correct. prequeue might have dropped our dst.
	 */
	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rep, 0, sizeof(rep));
	rep.th.dest   = th->source;
	rep.th.source = th->dest;
	rep.th.doff   = sizeof(struct tcphdr) / 4;
	rep.th.rst    = 1;

	if (th->ack) {
		rep.th.seq = th->ack_seq;
	} else {
		/* No ACK to mirror: ACK everything the segment covered,
		 * counting SYN and FIN as one sequence number each.
		 */
		rep.th.ack = 1;
		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				       skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof(arg));
	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);

	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
#ifdef CONFIG_TCP_MD5SIG
	rcu_read_lock();
	hash_location = tcp_parse_md5sig_option(th);
	if (sk && sk_fullsock(sk)) {
		const union tcp_md5_addr *addr;
		int l3index;

		/* sdif set, means packet ingressed via a device
		 * in an L3 domain and inet_iif is set to it.
		 */
		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
	} else if (hash_location) {
		const union tcp_md5_addr *addr;
		int sdif = tcp_v4_sdif(skb);
		int dif = inet_iif(skb);
		int l3index;

		/*
		 * active side is lost. Try to find listening socket through
		 * source port, and then find md5 key through listening socket.
		 * we are not loose security here:
		 * Incoming packet is checked with md5 hash with finding key,
		 * no RST generated if md5 hash doesn't match.
		 */
		sk1 = __inet_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo,
					     NULL, 0, ip_hdr(skb)->saddr,
					     th->source, ip_hdr(skb)->daddr,
					     ntohs(th->source), dif, sdif);
		/* don't send rst if it can't find key */
		if (!sk1)
			goto out;

		/* sdif set, means packet ingressed via a device
		 * in an L3 domain and dif is set to it.
		 */
		l3index = sdif ? dif : 0;
		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
		if (!key)
			goto out;


		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
		if (genhash || memcmp(hash_location, newhash, 16) != 0)
			goto out;

	}

	if (key) {
		/* Append an MD5SIG option (NOP-NOP-MD5SIG-len + digest). */
		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
				   (TCPOPT_NOP << 16) |
				   (TCPOPT_MD5SIG << 8) |
				   TCPOLEN_MD5SIG);
		/* Update length and the length the header thinks exists */
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
				     key, ip_hdr(skb)->saddr,
				     ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	/* Can't co-exist with TCPMD5, hence check rep.opt[0] */
	if (rep.opt[0] == 0) {
		__be32 mrst = mptcp_reset_option(skb);

		if (mrst) {
			rep.opt[0] = mrst;
			arg.iov[0].iov_len += sizeof(mrst);
			rep.th.doff = arg.iov[0].iov_len / 4;
		}
	}

	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;

	/* When socket is gone, all binding information is lost.
	 * routing might fail in this case. No choice here, if we choose to force
	 * input interface, we will misroute in case of asymmetric route.
	 */
	if (sk) {
		arg.bound_dev_if = sk->sk_bound_dev_if;
		if (sk_fullsock(sk))
			trace_tcp_send_reset(sk, skb);
	}

	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));

	arg.tos = ip_hdr(skb)->tos;
	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
	local_bh_disable();
	ctl_sk = this_cpu_read(ipv4_tcp_sk);
	sock_net_set(ctl_sk, net);
	if (sk) {
		/* Inherit mark/priority/txhash from the matched socket,
		 * using the timewait fields when it is a timewait socket.
		 */
		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
				   inet_twsk(sk)->tw_mark : sk->sk_mark;
		ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
				   inet_twsk(sk)->tw_priority : sk->sk_priority;
		transmit_time = tcp_transmit_time(sk);
		xfrm_sk_clone_policy(ctl_sk, sk);
		txhash = (sk->sk_state == TCP_TIME_WAIT) ?
			 inet_twsk(sk)->tw_txhash : sk->sk_txhash;
	} else {
		ctl_sk->sk_mark = 0;
		ctl_sk->sk_priority = 0;
	}
	ip_send_unicast_reply(ctl_sk,
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len,
			      transmit_time, txhash);

	xfrm_sk_free_policy(ctl_sk);
	sock_net_set(ctl_sk, &init_net);
	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
	local_bh_enable();

#ifdef CONFIG_TCP_MD5SIG
out:
	rcu_read_unlock();
#endif
}
863 
/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
   outside socket context is ugly, certainly. What can I do?
 */

/* Build and send a bare ACK via the per-CPU control socket.
 *
 * @sk may be a timewait socket or a (listener/full) socket; only its
 * mark, priority and netns are consulted.  @tsval/@tsecr, when @tsecr
 * is non-zero, are emitted as a TCP timestamp option; @key, if set,
 * appends an MD5 signature option.
 */
static void tcp_v4_send_ack(const struct sock *sk,
			    struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 tsval, u32 tsecr, int oif,
			    struct tcp_md5sig_key *key,
			    int reply_flags, u8 tos, u32 txhash)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
#ifdef CONFIG_TCP_MD5SIG
			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
#endif
			];
	} rep;
	struct net *net = sock_net(sk);
	struct ip_reply_arg arg;
	struct sock *ctl_sk;
	u64 transmit_time;

	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof(arg));

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);
	if (tsecr) {
		/* Timestamp option, NOP-padded to a 4-byte boundary. */
		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				   (TCPOPT_TIMESTAMP << 8) |
				   TCPOLEN_TIMESTAMP);
		rep.opt[1] = htonl(tsval);
		rep.opt[2] = htonl(tsecr);
		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
	}

	/* Swap the send and the receive. */
	rep.th.dest    = th->source;
	rep.th.source  = th->dest;
	rep.th.doff    = arg.iov[0].iov_len / 4;
	rep.th.seq     = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack     = 1;
	rep.th.window  = htons(win);

#ifdef CONFIG_TCP_MD5SIG
	if (key) {
		/* MD5 option goes after the timestamp option, if any. */
		int offset = (tsecr) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
					  (TCPOPT_NOP << 16) |
					  (TCPOPT_MD5SIG << 8) |
					  TCPOLEN_MD5SIG);
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len/4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.flags = reply_flags;
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	if (oif)
		arg.bound_dev_if = oif;
	arg.tos = tos;
	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
	local_bh_disable();
	ctl_sk = this_cpu_read(ipv4_tcp_sk);
	sock_net_set(ctl_sk, net);
	ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
			   inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark);
	ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
			   inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
	transmit_time = tcp_transmit_time(sk);
	ip_send_unicast_reply(ctl_sk,
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len,
			      transmit_time, txhash);

	sock_net_set(ctl_sk, &init_net);
	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	local_bh_enable();
}
954 
/* ACK a segment that matched a TIME-WAIT socket, echoing the state
 * (seq/ack/window/timestamps, mark-related fields) preserved in the
 * timewait sockets.  Consumes the timewait reference.
 */
static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
	struct inet_timewait_sock *tw = inet_twsk(sk);
	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);

	tcp_v4_send_ack(sk, skb,
			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
			tcp_time_stamp_raw() + tcptw->tw_ts_offset,
			tcptw->tw_ts_recent,
			tw->tw_bound_dev_if,
			tcp_twsk_md5_key(tcptw),
			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
			tw->tw_tos,
			tw->tw_txhash
			);

	inet_twsk_put(tw);
}
974 
/* Send an ACK on behalf of a request socket (SYN-RECV), e.g. a
 * retransmitted SYN-ACK's acknowledgment.
 */
static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req)
{
	const union tcp_md5_addr *addr;
	int l3index;

	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
	 */
	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
					     tcp_sk(sk)->snd_nxt;

	/* RFC 7323 2.3
	 * The window field (SEG.WND) of every outgoing segment, with the
	 * exception of <SYN> segments, MUST be right-shifted by
	 * Rcv.Wind.Shift bits:
	 */
	addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
	l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
	tcp_v4_send_ack(sk, skb, seq,
			tcp_rsk(req)->rcv_nxt,
			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
			tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
			READ_ONCE(req->ts_recent),
			0,
			tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
			ip_hdr(skb)->tos,
			READ_ONCE(tcp_rsk(req)->txhash));
}
1005 
1006 /*
1007  *	Send a SYN-ACK after having received a SYN.
1008  *	This still operates on a request_sock only, not on a big
1009  *	socket.
1010  */
1011 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
1012 			      struct flowi *fl,
1013 			      struct request_sock *req,
1014 			      struct tcp_fastopen_cookie *foc,
1015 			      enum tcp_synack_type synack_type,
1016 			      struct sk_buff *syn_skb)
1017 {
1018 	const struct inet_request_sock *ireq = inet_rsk(req);
1019 	struct flowi4 fl4;
1020 	int err = -1;
1021 	struct sk_buff *skb;
1022 	u8 tos;
1023 
1024 	/* First, grab a route. */
1025 	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
1026 		return -1;
1027 
1028 	skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
1029 
1030 	if (skb) {
1031 		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1032 
1033 		tos = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) ?
1034 				(tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1035 				(inet_sk(sk)->tos & INET_ECN_MASK) :
1036 				inet_sk(sk)->tos;
1037 
1038 		if (!INET_ECN_is_capable(tos) &&
1039 		    tcp_bpf_ca_needs_ecn((struct sock *)req))
1040 			tos |= INET_ECN_ECT_0;
1041 
1042 		rcu_read_lock();
1043 		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1044 					    ireq->ir_rmt_addr,
1045 					    rcu_dereference(ireq->ireq_opt),
1046 					    tos);
1047 		rcu_read_unlock();
1048 		err = net_xmit_eval(err);
1049 	}
1050 
1051 	return err;
1052 }
1053 
1054 /*
1055  *	IPv4 request_sock destructor.
1056  */
1057 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1058 {
1059 	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1060 }
1061 
1062 #ifdef CONFIG_TCP_MD5SIG
1063 /*
1064  * RFC2385 MD5 checksumming requires a mapping of
1065  * IP address->MD5 Key.
1066  * We need to maintain these in the sk structure.
1067  */
1068 
/* Static key, default false, guarding the MD5 fast paths; disabling is
 * deferred by HZ jiffies (per the macro arguments) so rapid key
 * add/remove cycles do not repeatedly repatch the kernel text.
 */
DEFINE_STATIC_KEY_DEFERRED_FALSE(tcp_md5_needed, HZ);
EXPORT_SYMBOL(tcp_md5_needed);
1071 
1072 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1073 {
1074 	if (!old)
1075 		return true;
1076 
1077 	/* l3index always overrides non-l3index */
1078 	if (old->l3index && new->l3index == 0)
1079 		return false;
1080 	if (old->l3index == 0 && new->l3index)
1081 		return true;
1082 
1083 	return old->prefixlen < new->prefixlen;
1084 }
1085 
/* Find the best-matching MD5 key for @addr on socket @sk.
 * "Best" means: L3-bound keys beat unbound ones, then longest prefix
 * wins (see better_md5_match()).  Returns NULL if no key matches.
 */
struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
					   const union tcp_md5_addr *addr,
					   int family)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	const struct tcp_md5sig_info *md5sig;
	__be32 mask;
	struct tcp_md5sig_key *best_match = NULL;
	bool match;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       lockdep_sock_is_held(sk));
	if (!md5sig)
		return NULL;

	hlist_for_each_entry_rcu(key, &md5sig->head, node,
				 lockdep_sock_is_held(sk)) {
		if (key->family != family)
			continue;
		/* Keys bound to a device (IFINDEX flag) must match l3index
		 * exactly; unbound keys match any incoming device.
		 */
		if (key->flags & TCP_MD5SIG_FLAG_IFINDEX && key->l3index != l3index)
			continue;
		if (family == AF_INET) {
			mask = inet_make_mask(key->prefixlen);
			match = (key->addr.a4.s_addr & mask) ==
				(addr->a4.s_addr & mask);
#if IS_ENABLED(CONFIG_IPV6)
		} else if (family == AF_INET6) {
			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
						  key->prefixlen);
#endif
		} else {
			match = false;
		}

		if (match && better_md5_match(best_match, key))
			best_match = key;
	}
	return best_match;
}
EXPORT_SYMBOL(__tcp_md5_do_lookup);
1129 
/* Exact-match MD5 key lookup: family, IFINDEX flag, l3index, address
 * and prefix length must all be identical.  Used by the key management
 * paths (add/replace/delete), not by segment validation.
 */
static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
						      const union tcp_md5_addr *addr,
						      int family, u8 prefixlen,
						      int l3index, u8 flags)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	unsigned int size = sizeof(struct in_addr);
	const struct tcp_md5sig_info *md5sig;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       lockdep_sock_is_held(sk));
	if (!md5sig)
		return NULL;
#if IS_ENABLED(CONFIG_IPV6)
	if (family == AF_INET6)
		size = sizeof(struct in6_addr);
#endif
	hlist_for_each_entry_rcu(key, &md5sig->head, node,
				 lockdep_sock_is_held(sk)) {
		if (key->family != family)
			continue;
		/* Only the IFINDEX bit of the flags takes part in matching. */
		if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
			continue;
		if (key->l3index != l3index)
			continue;
		if (!memcmp(&key->addr, addr, size) &&
		    key->prefixlen == prefixlen)
			return key;
	}
	return NULL;
}
1163 
/* Look up the MD5 key on @sk matching the remote address of @addr_sk,
 * scoped to the L3 master device (if any) @addr_sk is bound to.
 */
struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
					 const struct sock *addr_sk)
{
	const union tcp_md5_addr *addr;
	int l3index;

	l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
						 addr_sk->sk_bound_dev_if);
	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
	return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
}
EXPORT_SYMBOL(tcp_v4_md5_lookup);
1176 
/* Allocate and attach the per-socket MD5 key list head.
 * Also disables GSO on the socket.  Returns 0 or -ENOMEM.
 */
static int tcp_md5sig_info_add(struct sock *sk, gfp_t gfp)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_info *md5sig;

	md5sig = kmalloc(sizeof(*md5sig), gfp);
	if (!md5sig)
		return -ENOMEM;

	sk_gso_disable(sk);
	INIT_HLIST_HEAD(&md5sig->head);
	/* Publish after initialization so RCU readers see a valid list. */
	rcu_assign_pointer(tp->md5sig_info, md5sig);
	return 0;
}
1191 
/* This can be called on a newly created socket, from other files.
 * Adds a new key, or updates an existing key in place if an exact match
 * (family/prefixlen/l3index/flags) already exists.
 * Caller must have attached md5sig_info beforehand.  Returns 0 or -ENOMEM.
 */
static int __tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
			    int family, u8 prefixlen, int l3index, u8 flags,
			    const u8 *newkey, u8 newkeylen, gfp_t gfp)
{
	/* Add Key to the list */
	struct tcp_md5sig_key *key;
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_info *md5sig;

	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
	if (key) {
		/* Pre-existing entry - just update that one.
		 * Note that the key might be used concurrently.
		 * data_race() is telling kcsan that we do not care of
		 * key mismatches, since changing MD5 key on live flows
		 * can lead to packet drops.
		 */
		data_race(memcpy(key->key, newkey, newkeylen));

		/* Pairs with READ_ONCE() in tcp_md5_hash_key().
		 * Also note that a reader could catch new key->keylen value
		 * but old key->key[], this is the reason we use __GFP_ZERO
		 * at sock_kmalloc() time below these lines.
		 */
		WRITE_ONCE(key->keylen, newkeylen);

		return 0;
	}

	md5sig = rcu_dereference_protected(tp->md5sig_info,
					   lockdep_sock_is_held(sk));

	key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
	if (!key)
		return -ENOMEM;
	if (!tcp_alloc_md5sig_pool()) {
		sock_kfree_s(sk, key, sizeof(*key));
		return -ENOMEM;
	}

	memcpy(key->key, newkey, newkeylen);
	key->keylen = newkeylen;
	key->family = family;
	key->prefixlen = prefixlen;
	key->l3index = l3index;
	key->flags = flags;
	memcpy(&key->addr, addr,
	       (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) :
								 sizeof(struct in_addr));
	hlist_add_head_rcu(&key->node, &md5sig->head);
	return 0;
}
1245 
/* Install (or replace) an MD5 key on @sk, process context (GFP_KERNEL).
 * On first key for this socket, allocates md5sig_info and takes a
 * reference on the tcp_md5_needed static key.
 * Returns 0, -ENOMEM, or -EUSERS if the static key refcount could not
 * be raised.
 */
int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
		   int family, u8 prefixlen, int l3index, u8 flags,
		   const u8 *newkey, u8 newkeylen)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
		if (tcp_md5sig_info_add(sk, GFP_KERNEL))
			return -ENOMEM;

		if (!static_branch_inc(&tcp_md5_needed.key)) {
			struct tcp_md5sig_info *md5sig;

			/* Roll back the md5sig_info allocation made above. */
			md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
			rcu_assign_pointer(tp->md5sig_info, NULL);
			kfree_rcu(md5sig, rcu);
			return -EUSERS;
		}
	}

	return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, flags,
				newkey, newkeylen, GFP_KERNEL);
}
EXPORT_SYMBOL(tcp_md5_do_add);
1270 
/* Atomic-context variant of tcp_md5_do_add(): copy an existing @key
 * onto @sk (e.g. from listener to child in tcp_v4_syn_recv_sock()).
 * Returns 0, -ENOMEM, or -EUSERS if the static key could not be pinned.
 */
int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr,
		     int family, u8 prefixlen, int l3index,
		     struct tcp_md5sig_key *key)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
		if (tcp_md5sig_info_add(sk, sk_gfp_mask(sk, GFP_ATOMIC)))
			return -ENOMEM;

		if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) {
			struct tcp_md5sig_info *md5sig;

			/* Roll back the md5sig_info allocation made above. */
			md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
			net_warn_ratelimited("Too many TCP-MD5 keys in the system\n");
			rcu_assign_pointer(tp->md5sig_info, NULL);
			kfree_rcu(md5sig, rcu);
			return -EUSERS;
		}
	}

	return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index,
				key->flags, key->key, key->keylen,
				sk_gfp_mask(sk, GFP_ATOMIC));
}
EXPORT_SYMBOL(tcp_md5_key_copy);
1297 
/* Delete the exact-matching MD5 key from @sk, if present.
 * Returns 0 on success or -ENOENT if no such key exists.
 * The key is freed after a grace period (kfree_rcu) since readers may
 * still hold RCU references to it.
 */
int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
		   u8 prefixlen, int l3index, u8 flags)
{
	struct tcp_md5sig_key *key;

	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
	if (!key)
		return -ENOENT;
	hlist_del_rcu(&key->node);
	/* Return the memory accounted to the socket by sock_kmalloc(). */
	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
	kfree_rcu(key, rcu);
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_del);
1312 
/* Remove and free every MD5 key attached to @sk.
 * Plain (non-checked) dereference: the socket is no longer shared at
 * this point — hence rcu_dereference_protected(..., 1).
 */
static void tcp_clear_md5_list(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	struct hlist_node *n;
	struct tcp_md5sig_info *md5sig;

	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);

	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
		hlist_del_rcu(&key->node);
		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
		kfree_rcu(key, rcu);
	}
}
1328 
/* setsockopt(TCP_MD5SIG / TCP_MD5SIG_EXT) handler for IPv4 sockets.
 * Validates the user-supplied struct tcp_md5sig, then adds or (for a
 * zero key length) deletes the corresponding key.
 */
static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
				 sockptr_t optval, int optlen)
{
	struct tcp_md5sig cmd;
	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
	const union tcp_md5_addr *addr;
	u8 prefixlen = 32;	/* default: host match */
	int l3index = 0;
	u8 flags;

	if (optlen < sizeof(cmd))
		return -EINVAL;

	if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
		return -EFAULT;

	if (sin->sin_family != AF_INET)
		return -EINVAL;

	flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;

	/* Optional prefix match (TCP_MD5SIG_EXT only). */
	if (optname == TCP_MD5SIG_EXT &&
	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
		prefixlen = cmd.tcpm_prefixlen;
		if (prefixlen > 32)
			return -EINVAL;
	}

	/* Optional device binding (TCP_MD5SIG_EXT only). */
	if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
		struct net_device *dev;

		rcu_read_lock();
		dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
		if (dev && netif_is_l3_master(dev))
			l3index = dev->ifindex;

		rcu_read_unlock();

		/* ok to reference set/not set outside of rcu;
		 * right now device MUST be an L3 master
		 */
		if (!dev || !l3index)
			return -EINVAL;
	}

	addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;

	/* Zero key length means delete. */
	if (!cmd.tcpm_keylen)
		return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);

	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
		return -EINVAL;

	return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
			      cmd.tcpm_key, cmd.tcpm_keylen);
}
1386 
/* Feed the IPv4 pseudo-header plus the TCP header (with its checksum
 * field zeroed, per RFC 2385) into the running MD5 hash.
 * Returns the crypto_ahash_update() result (0 on success).
 */
static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
				   __be32 daddr, __be32 saddr,
				   const struct tcphdr *th, int nbytes)
{
	struct tcp4_pseudohdr *bp;
	struct scatterlist sg;
	struct tcphdr *_th;

	/* Build the pseudo-header in the per-CPU scratch buffer. */
	bp = hp->scratch;
	bp->saddr = saddr;
	bp->daddr = daddr;
	bp->pad = 0;
	bp->protocol = IPPROTO_TCP;
	bp->len = cpu_to_be16(nbytes);

	/* Copy the TCP header right after it and zero the checksum, as
	 * the signature must not cover it.
	 */
	_th = (struct tcphdr *)(bp + 1);
	memcpy(_th, th, sizeof(*th));
	_th->check = 0;

	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
	ahash_request_set_crypt(hp->md5_req, &sg, NULL,
				sizeof(*bp) + sizeof(*th));
	return crypto_ahash_update(hp->md5_req);
}
1411 
/* Compute the MD5 signature over pseudo-header + TCP header + key into
 * @md5_hash (16 bytes).  Used for headers-only segments (e.g. RST/ACK
 * replies).  Returns 0 on success; on failure returns 1 with @md5_hash
 * zeroed.
 */
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
{
	struct tcp_md5sig_pool *hp;
	struct ahash_request *req;

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	req = hp->md5_req;

	if (crypto_ahash_init(req))
		goto clear_hash;
	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	ahash_request_set_crypt(req, NULL, md5_hash, 0);
	if (crypto_ahash_final(req))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}
1442 
/* Compute the MD5 signature of a full segment (pseudo-header, TCP
 * header, payload, key) into @md5_hash (16 bytes).
 * Addresses come from @sk when available, else from the IP header.
 * Returns 0 on success; on failure returns 1 with @md5_hash zeroed.
 */
int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
			const struct sock *sk,
			const struct sk_buff *skb)
{
	struct tcp_md5sig_pool *hp;
	struct ahash_request *req;
	const struct tcphdr *th = tcp_hdr(skb);
	__be32 saddr, daddr;

	if (sk) { /* valid for establish/request sockets */
		saddr = sk->sk_rcv_saddr;
		daddr = sk->sk_daddr;
	} else {
		const struct iphdr *iph = ip_hdr(skb);
		saddr = iph->saddr;
		daddr = iph->daddr;
	}

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	req = hp->md5_req;

	if (crypto_ahash_init(req))
		goto clear_hash;

	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
		goto clear_hash;
	/* Hash the payload, skipping the TCP header already covered above. */
	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	ahash_request_set_crypt(req, NULL, md5_hash, 0);
	if (crypto_ahash_final(req))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}
EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1489 
1490 #endif
1491 
/* Initialize the IPv4 part of a freshly-minted request sock from the
 * incoming SYN: local/remote addresses and a copy of the IP options.
 */
static void tcp_v4_init_req(struct request_sock *req,
			    const struct sock *sk_listener,
			    struct sk_buff *skb)
{
	struct inet_request_sock *ireq = inet_rsk(req);
	struct net *net = sock_net(sk_listener);

	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
	/* Saved options are freed in tcp_v4_reqsk_destructor(). */
	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
}
1503 
/* .route_req callback: initialize the request sock from the SYN, run
 * the LSM hook, then compute the route for the reply.
 * Returns the route, or NULL if the security check or routing fails.
 */
static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
					  struct sk_buff *skb,
					  struct flowi *fl,
					  struct request_sock *req)
{
	tcp_v4_init_req(req, sk, skb);

	if (security_inet_conn_request(sk, skb, req))
		return NULL;

	return inet_csk_route_req(sk, &fl->u.ip4, req);
}
1516 
/* Family-independent request_sock operations for IPv4 TCP. */
struct request_sock_ops tcp_request_sock_ops __read_mostly = {
	.family		=	PF_INET,
	.obj_size	=	sizeof(struct tcp_request_sock),
	.rtx_syn_ack	=	tcp_rtx_synack,
	.send_ack	=	tcp_v4_reqsk_send_ack,
	.destructor	=	tcp_v4_reqsk_destructor,
	.send_reset	=	tcp_v4_send_reset,
	.syn_ack_timeout =	tcp_syn_ack_timeout,
};
1526 
/* IPv4-specific TCP request_sock operations (MD5, syncookies, routing,
 * ISN/timestamp generation, SYN-ACK transmission).
 */
const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
	.mss_clamp	=	TCP_MSS_DEFAULT,
#ifdef CONFIG_TCP_MD5SIG
	.req_md5_lookup	=	tcp_v4_md5_lookup,
	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
#endif
#ifdef CONFIG_SYN_COOKIES
	.cookie_init_seq =	cookie_v4_init_sequence,
#endif
	.route_req	=	tcp_v4_route_req,
	.init_seq	=	tcp_v4_init_seq,
	.init_ts_off	=	tcp_v4_init_ts_off,
	.send_synack	=	tcp_v4_send_synack,
};
1541 
/* Entry point for an incoming SYN on a listening IPv4 socket.
 * Returns 0 in all cases; a dropped SYN is accounted via tcp_listendrop().
 */
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	/* Never answer to SYNs sent to broadcast or multicast */
	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	return tcp_conn_request(&tcp_request_sock_ops,
				&tcp_request_sock_ipv4_ops, sk, skb);

drop:
	tcp_listendrop(sk);
	return 0;
}
EXPORT_SYMBOL(tcp_v4_conn_request);
1556 
1557 
1558 /*
1559  * The three way handshake has completed - we got a valid synack -
1560  * now create the new socket.
1561  */
1562 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1563 				  struct request_sock *req,
1564 				  struct dst_entry *dst,
1565 				  struct request_sock *req_unhash,
1566 				  bool *own_req)
1567 {
1568 	struct inet_request_sock *ireq;
1569 	bool found_dup_sk = false;
1570 	struct inet_sock *newinet;
1571 	struct tcp_sock *newtp;
1572 	struct sock *newsk;
1573 #ifdef CONFIG_TCP_MD5SIG
1574 	const union tcp_md5_addr *addr;
1575 	struct tcp_md5sig_key *key;
1576 	int l3index;
1577 #endif
1578 	struct ip_options_rcu *inet_opt;
1579 
1580 	if (sk_acceptq_is_full(sk))
1581 		goto exit_overflow;
1582 
1583 	newsk = tcp_create_openreq_child(sk, req, skb);
1584 	if (!newsk)
1585 		goto exit_nonewsk;
1586 
1587 	newsk->sk_gso_type = SKB_GSO_TCPV4;
1588 	inet_sk_rx_dst_set(newsk, skb);
1589 
1590 	newtp		      = tcp_sk(newsk);
1591 	newinet		      = inet_sk(newsk);
1592 	ireq		      = inet_rsk(req);
1593 	sk_daddr_set(newsk, ireq->ir_rmt_addr);
1594 	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1595 	newsk->sk_bound_dev_if = ireq->ir_iif;
1596 	newinet->inet_saddr   = ireq->ir_loc_addr;
1597 	inet_opt	      = rcu_dereference(ireq->ireq_opt);
1598 	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1599 	newinet->mc_index     = inet_iif(skb);
1600 	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1601 	newinet->rcv_tos      = ip_hdr(skb)->tos;
1602 	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1603 	if (inet_opt)
1604 		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1605 	atomic_set(&newinet->inet_id, get_random_u16());
1606 
1607 	/* Set ToS of the new socket based upon the value of incoming SYN.
1608 	 * ECT bits are set later in tcp_init_transfer().
1609 	 */
1610 	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1611 		newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1612 
1613 	if (!dst) {
1614 		dst = inet_csk_route_child_sock(sk, newsk, req);
1615 		if (!dst)
1616 			goto put_and_exit;
1617 	} else {
1618 		/* syncookie case : see end of cookie_v4_check() */
1619 	}
1620 	sk_setup_caps(newsk, dst);
1621 
1622 	tcp_ca_openreq_child(newsk, dst);
1623 
1624 	tcp_sync_mss(newsk, dst_mtu(dst));
1625 	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1626 
1627 	tcp_initialize_rcv_mss(newsk);
1628 
1629 #ifdef CONFIG_TCP_MD5SIG
1630 	l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1631 	/* Copy over the MD5 key from the original socket */
1632 	addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1633 	key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1634 	if (key) {
1635 		if (tcp_md5_key_copy(newsk, addr, AF_INET, 32, l3index, key))
1636 			goto put_and_exit;
1637 		sk_gso_disable(newsk);
1638 	}
1639 #endif
1640 
1641 	if (__inet_inherit_port(sk, newsk) < 0)
1642 		goto put_and_exit;
1643 	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1644 				       &found_dup_sk);
1645 	if (likely(*own_req)) {
1646 		tcp_move_syn(newtp, req);
1647 		ireq->ireq_opt = NULL;
1648 	} else {
1649 		newinet->inet_opt = NULL;
1650 
1651 		if (!req_unhash && found_dup_sk) {
1652 			/* This code path should only be executed in the
1653 			 * syncookie case only
1654 			 */
1655 			bh_unlock_sock(newsk);
1656 			sock_put(newsk);
1657 			newsk = NULL;
1658 		}
1659 	}
1660 	return newsk;
1661 
1662 exit_overflow:
1663 	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1664 exit_nonewsk:
1665 	dst_release(dst);
1666 exit:
1667 	tcp_listendrop(sk);
1668 	return NULL;
1669 put_and_exit:
1670 	newinet->inet_opt = NULL;
1671 	inet_csk_prepare_forced_close(newsk);
1672 	tcp_done(newsk);
1673 	goto exit;
1674 }
1675 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1676 
/* Syncookie validation for a segment reaching a listener.
 * Only non-SYN segments are candidates for cookie validation.
 * Returns the socket the segment should be processed on (possibly a
 * new child from cookie_v4_check()), or NULL to discard.  Without
 * CONFIG_SYN_COOKIES this is a no-op returning @sk.
 */
static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
{
#ifdef CONFIG_SYN_COOKIES
	const struct tcphdr *th = tcp_hdr(skb);

	if (!th->syn)
		sk = cookie_v4_check(sk, skb);
#endif
	return sk;
}
1687 
/* Generate a syncookie ISN for BPF helpers.
 * On success, stores the cookie in *@cookie and returns the clamped
 * MSS; returns 0 when cookies are unavailable or the SYN is invalid.
 */
u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
			 struct tcphdr *th, u32 *cookie)
{
	u16 mss = 0;
#ifdef CONFIG_SYN_COOKIES
	mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
				    &tcp_request_sock_ipv4_ops, sk, th);
	if (mss) {
		*cookie = __cookie_v4_init_sequence(iph, th, &mss);
		tcp_synq_overflow(sk);
	}
#endif
	return mss;
}
1702 
INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
							   u32));
/* The socket must have its spinlock held when we get
 * here, unless it is a TCP_LISTEN socket.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	enum skb_drop_reason reason;
	struct sock *rsk;

	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
		struct dst_entry *dst;

		dst = rcu_dereference_protected(sk->sk_rx_dst,
						lockdep_sock_is_held(sk));

		sock_rps_save_rxhash(sk, skb);
		sk_mark_napi_id(sk, skb);
		if (dst) {
			/* Drop the cached rx dst if the incoming device
			 * changed or the dst is no longer valid.
			 */
			if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
			    !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
					     dst, 0)) {
				RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
				dst_release(dst);
			}
		}
		tcp_rcv_established(sk, skb);
		return 0;
	}

	reason = SKB_DROP_REASON_NOT_SPECIFIED;
	if (tcp_checksum_complete(skb))
		goto csum_err;

	if (sk->sk_state == TCP_LISTEN) {
		struct sock *nsk = tcp_v4_cookie_check(sk, skb);

		if (!nsk)
			goto discard;
		if (nsk != sk) {
			/* Cookie check produced a child socket: let it
			 * process the segment; reset on its behalf on error.
			 */
			if (tcp_child_process(sk, nsk, skb)) {
				rsk = nsk;
				goto reset;
			}
			return 0;
		}
	} else
		sock_rps_save_rxhash(sk, skb);

	if (tcp_rcv_state_process(sk, skb)) {
		rsk = sk;
		goto reset;
	}
	return 0;

reset:
	tcp_v4_send_reset(rsk, skb);
discard:
	kfree_skb_reason(skb, reason);
	/* Be careful here. If this function gets more complicated and
	 * gcc suffers from register pressure on the x86, sk (in %ebx)
	 * might be destroyed here. This current version compiles correctly,
	 * but you have been warned.
	 */
	return 0;

csum_err:
	reason = SKB_DROP_REASON_TCP_CSUM;
	trace_tcp_bad_csum(skb);
	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
	goto discard;
}
EXPORT_SYMBOL(tcp_v4_do_rcv);
1782 
/* Early demux: before routing, try to find an established socket for
 * this segment and attach it (and its cached rx dst, when valid) to
 * the skb.  Best effort — always returns 0.
 */
int tcp_v4_early_demux(struct sk_buff *skb)
{
	struct net *net = dev_net(skb->dev);
	const struct iphdr *iph;
	const struct tcphdr *th;
	struct sock *sk;

	if (skb->pkt_type != PACKET_HOST)
		return 0;

	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
		return 0;

	iph = ip_hdr(skb);
	th = tcp_hdr(skb);

	/* Reject malformed headers (doff smaller than the base header). */
	if (th->doff < sizeof(struct tcphdr) / 4)
		return 0;

	sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
				       iph->saddr, th->source,
				       iph->daddr, ntohs(th->dest),
				       skb->skb_iif, inet_sdif(skb));
	if (sk) {
		skb->sk = sk;
		skb->destructor = sock_edemux;
		if (sk_fullsock(sk)) {
			struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);

			if (dst)
				dst = dst_check(dst, 0);
			if (dst &&
			    sk->sk_rx_dst_ifindex == skb->skb_iif)
				skb_dst_set_noref(skb, dst);
		}
	}
	return 0;
}
1821 
/* Queue @skb on @sk's backlog while the socket is owned by user context,
 * attempting to coalesce it with the backlog tail first.
 * Returns true if the skb was dropped (*reason set; socket already
 * unlocked on those paths), false if it was queued or coalesced.
 */
bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
		     enum skb_drop_reason *reason)
{
	u32 limit, tail_gso_size, tail_gso_segs;
	struct skb_shared_info *shinfo;
	const struct tcphdr *th;
	struct tcphdr *thtail;
	struct sk_buff *tail;
	unsigned int hdrlen;
	bool fragstolen;
	u32 gso_segs;
	u32 gso_size;
	int delta;

	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
	 * we can fix skb->truesize to its real value to avoid future drops.
	 * This is valid because skb is not yet charged to the socket.
	 * It has been noticed pure SACK packets were sometimes dropped
	 * (if cooked by drivers without copybreak feature).
	 */
	skb_condense(skb);

	skb_dst_drop(skb);

	if (unlikely(tcp_checksum_complete(skb))) {
		bh_unlock_sock(sk);
		trace_tcp_bad_csum(skb);
		*reason = SKB_DROP_REASON_TCP_CSUM;
		__TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
		__TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
		return true;
	}

	/* Attempt coalescing to last skb in backlog, even if we are
	 * above the limits.
	 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
	 */
	th = (const struct tcphdr *)skb->data;
	hdrlen = th->doff * 4;

	tail = sk->sk_backlog.tail;
	if (!tail)
		goto no_coalesce;
	thtail = (struct tcphdr *)tail->data;

	/* Coalescing requires contiguous sequence space, identical DSCP,
	 * compatible flags (ACK set, no SYN/RST/URG, matching ECE/CWR)
	 * and byte-identical TCP options.
	 */
	if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
	    TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
	    ((TCP_SKB_CB(tail)->tcp_flags |
	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
	    !((TCP_SKB_CB(tail)->tcp_flags &
	      TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
	    ((TCP_SKB_CB(tail)->tcp_flags ^
	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
#ifdef CONFIG_TLS_DEVICE
	    tail->decrypted != skb->decrypted ||
#endif
	    !mptcp_skb_can_collapse(tail, skb) ||
	    thtail->doff != th->doff ||
	    memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
		goto no_coalesce;

	__skb_pull(skb, hdrlen);

	shinfo = skb_shinfo(skb);
	gso_size = shinfo->gso_size ?: skb->len;
	gso_segs = shinfo->gso_segs ?: 1;

	shinfo = skb_shinfo(tail);
	tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
	tail_gso_segs = shinfo->gso_segs ?: 1;

	if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
		TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;

		if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
			TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
			thtail->window = th->window;
		}

		/* We have to update both TCP_SKB_CB(tail)->tcp_flags and
		 * thtail->fin, so that the fast path in tcp_rcv_established()
		 * is not entered if we append a packet with a FIN.
		 * SYN, RST, URG are not present.
		 * ACK is set on both packets.
		 * PSH : we do not really care in TCP stack,
		 *       at least for 'GRO' packets.
		 */
		thtail->fin |= th->fin;
		TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;

		if (TCP_SKB_CB(skb)->has_rxtstamp) {
			TCP_SKB_CB(tail)->has_rxtstamp = true;
			tail->tstamp = skb->tstamp;
			skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
		}

		/* Not as strict as GRO. We only need to carry mss max value */
		shinfo->gso_size = max(gso_size, tail_gso_size);
		shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);

		sk->sk_backlog.len += delta;
		__NET_INC_STATS(sock_net(sk),
				LINUX_MIB_TCPBACKLOGCOALESCE);
		kfree_skb_partial(skb, fragstolen);
		return false;
	}
	/* Coalescing failed: restore the header before queueing whole skb. */
	__skb_push(skb, hdrlen);

no_coalesce:
	limit = (u32)READ_ONCE(sk->sk_rcvbuf) + (u32)(READ_ONCE(sk->sk_sndbuf) >> 1);

	/* Only socket owner can try to collapse/prune rx queues
	 * to reduce memory overhead, so add a little headroom here.
	 * Few sockets backlog are possibly concurrently non empty.
	 */
	limit += 64 * 1024;

	if (unlikely(sk_add_backlog(sk, skb, limit))) {
		bh_unlock_sock(sk);
		*reason = SKB_DROP_REASON_SOCKET_BACKLOG;
		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
		return true;
	}
	return false;
}
EXPORT_SYMBOL(tcp_add_backlog);
1948 
1949 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1950 {
1951 	struct tcphdr *th = (struct tcphdr *)skb->data;
1952 
1953 	return sk_filter_trim_cap(sk, skb, th->doff * 4);
1954 }
1955 EXPORT_SYMBOL(tcp_filter);
1956 
/* Move the IPv4 control-block data saved by tcp_v4_fill_cb() back to
 * the front of skb->cb, restoring the IPCB layout.
 */
static void tcp_v4_restore_cb(struct sk_buff *skb)
{
	memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
		sizeof(struct inet_skb_parm));
}
1962 
/* Populate TCP_SKB_CB() from the IP/TCP headers, preserving the IPv4
 * control block at its new offset inside the TCP control block.
 */
static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
			   const struct tcphdr *th)
{
	/* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
	 * barrier() makes sure compiler wont play fool^Waliasing games.
	 */
	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
		sizeof(struct inet_skb_parm));
	barrier();

	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
	/* SYN and FIN each occupy one unit of sequence space. */
	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
				    skb->len - th->doff * 4);
	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
	TCP_SKB_CB(skb)->sacked	 = 0;
	TCP_SKB_CB(skb)->has_rxtstamp =
			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
}
1984 
1985 /*
1986  *	From tcp_input.c
1987  */
1988 
/* Main IPv4 TCP receive routine, called from the IP layer for every TCP
 * segment addressed to this host.  In order:
 *  - basic header length and checksum validation,
 *  - socket lookup (established, request, listener or timewait),
 *  - TCP_NEW_SYN_RECV: finish the 3WHS via tcp_check_req(),
 *  - TIME_WAIT handling (including reuse by a fresh SYN),
 *  - policy checks (XFRM, TCP-MD5, min TTL, socket filter),
 *  - delivery, either directly (tcp_v4_do_rcv()) or via the socket
 *    backlog when the socket is currently owned by user context.
 * Always returns 0; the skb is consumed (delivered, queued or freed).
 */
int tcp_v4_rcv(struct sk_buff *skb)
{
	struct net *net = dev_net(skb->dev);
	enum skb_drop_reason drop_reason;
	int sdif = inet_sdif(skb);
	int dif = inet_iif(skb);
	const struct iphdr *iph;
	const struct tcphdr *th;
	bool refcounted;
	struct sock *sk;
	int ret;

	drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
	if (skb->pkt_type != PACKET_HOST)
		goto discard_it;

	/* Count it even if it's bad */
	__TCP_INC_STATS(net, TCP_MIB_INSEGS);

	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
		goto discard_it;

	th = (const struct tcphdr *)skb->data;

	/* doff is in 32-bit words and cannot be smaller than the
	 * option-less TCP header.
	 */
	if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
		drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
		goto bad_packet;
	}
	if (!pskb_may_pull(skb, th->doff * 4))
		goto discard_it;

	/* An explanation is required here, I think.
	 * Packet length and doff are validated by header prediction,
	 * provided case of th->doff==0 is eliminated.
	 * So, we defer the checks. */

	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
		goto csum_error;

	/* Reload header pointers: the pulls above may have moved skb->data. */
	th = (const struct tcphdr *)skb->data;
	iph = ip_hdr(skb);
lookup:
	sk = __inet_lookup_skb(net->ipv4.tcp_death_row.hashinfo,
			       skb, __tcp_hdrlen(th), th->source,
			       th->dest, sdif, &refcounted);
	if (!sk)
		goto no_tcp_socket;

process:
	if (sk->sk_state == TCP_TIME_WAIT)
		goto do_time_wait;

	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		/* 3WHS in progress: @sk is a request_sock.  Operate on its
		 * listener and try to create the full child socket.
		 */
		struct request_sock *req = inet_reqsk(sk);
		bool req_stolen = false;
		struct sock *nsk;

		sk = req->rsk_listener;
		if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
			drop_reason = SKB_DROP_REASON_XFRM_POLICY;
		else
			drop_reason = tcp_inbound_md5_hash(sk, skb,
						   &iph->saddr, &iph->daddr,
						   AF_INET, dif, sdif);
		if (unlikely(drop_reason)) {
			sk_drops_add(sk, skb);
			reqsk_put(req);
			goto discard_it;
		}
		if (tcp_checksum_complete(skb)) {
			reqsk_put(req);
			goto csum_error;
		}
		if (unlikely(sk->sk_state != TCP_LISTEN)) {
			/* Listener went away: try to migrate the request to
			 * another reuseport socket, otherwise drop the req
			 * and redo the lookup.
			 */
			nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
			if (!nsk) {
				inet_csk_reqsk_queue_drop_and_put(sk, req);
				goto lookup;
			}
			sk = nsk;
			/* reuseport_migrate_sock() has already held one sk_refcnt
			 * before returning.
			 */
		} else {
			/* We own a reference on the listener, increase it again
			 * as we might lose it too soon.
			 */
			sock_hold(sk);
		}
		refcounted = true;
		nsk = NULL;
		if (!tcp_filter(sk, skb)) {
			/* tcp_filter() may have trimmed the skb: reload. */
			th = (const struct tcphdr *)skb->data;
			iph = ip_hdr(skb);
			tcp_v4_fill_cb(skb, iph, th);
			nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
		} else {
			drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
		}
		if (!nsk) {
			reqsk_put(req);
			if (req_stolen) {
				/* Another cpu got exclusive access to req
				 * and created a full blown socket.
				 * Try to feed this packet to this socket
				 * instead of discarding it.
				 */
				tcp_v4_restore_cb(skb);
				sock_put(sk);
				goto lookup;
			}
			goto discard_and_relse;
		}
		nf_reset_ct(skb);
		if (nsk == sk) {
			/* No child created (e.g. retransmitted SYN): keep
			 * processing on the listener below.
			 */
			reqsk_put(req);
			tcp_v4_restore_cb(skb);
		} else if (tcp_child_process(sk, nsk, skb)) {
			tcp_v4_send_reset(nsk, skb);
			goto discard_and_relse;
		} else {
			sock_put(sk);
			return 0;
		}
	}

	if (static_branch_unlikely(&ip4_min_ttl)) {
		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
			drop_reason = SKB_DROP_REASON_TCP_MINTTL;
			goto discard_and_relse;
		}
	}

	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
		drop_reason = SKB_DROP_REASON_XFRM_POLICY;
		goto discard_and_relse;
	}

	drop_reason = tcp_inbound_md5_hash(sk, skb, &iph->saddr,
					   &iph->daddr, AF_INET, dif, sdif);
	if (drop_reason)
		goto discard_and_relse;

	nf_reset_ct(skb);

	if (tcp_filter(sk, skb)) {
		drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
		goto discard_and_relse;
	}
	/* tcp_filter() may have trimmed the skb: reload the headers. */
	th = (const struct tcphdr *)skb->data;
	iph = ip_hdr(skb);
	tcp_v4_fill_cb(skb, iph, th);

	skb->dev = NULL;

	if (sk->sk_state == TCP_LISTEN) {
		ret = tcp_v4_do_rcv(sk, skb);
		goto put_and_return;
	}

	sk_incoming_cpu_update(sk);

	bh_lock_sock_nested(sk);
	tcp_segs_in(tcp_sk(sk), skb);
	ret = 0;
	if (!sock_owned_by_user(sk)) {
		ret = tcp_v4_do_rcv(sk, skb);
	} else {
		/* Socket owned by user context: queue on the backlog,
		 * drained when the owner releases the socket lock.
		 */
		if (tcp_add_backlog(sk, skb, &drop_reason))
			goto discard_and_relse;
	}
	bh_unlock_sock(sk);

put_and_return:
	if (refcounted)
		sock_put(sk);

	return ret;

no_tcp_socket:
	drop_reason = SKB_DROP_REASON_NO_SOCKET;
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
		goto discard_it;

	tcp_v4_fill_cb(skb, iph, th);

	if (tcp_checksum_complete(skb)) {
csum_error:
		drop_reason = SKB_DROP_REASON_TCP_CSUM;
		trace_tcp_bad_csum(skb);
		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
bad_packet:
		__TCP_INC_STATS(net, TCP_MIB_INERRS);
	} else {
		/* Valid segment, no matching socket: answer with a RST. */
		tcp_v4_send_reset(NULL, skb);
	}

discard_it:
	SKB_DR_OR(drop_reason, NOT_SPECIFIED);
	/* Discard frame. */
	kfree_skb_reason(skb, drop_reason);
	return 0;

discard_and_relse:
	sk_drops_add(sk, skb);
	if (refcounted)
		sock_put(sk);
	goto discard_it;

do_time_wait:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
		drop_reason = SKB_DROP_REASON_XFRM_POLICY;
		inet_twsk_put(inet_twsk(sk));
		goto discard_it;
	}

	tcp_v4_fill_cb(skb, iph, th);

	if (tcp_checksum_complete(skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto csum_error;
	}
	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
	case TCP_TW_SYN: {
		/* A new SYN hit a TIME_WAIT socket: look for a listener
		 * willing to accept a new incarnation of the connection.
		 */
		struct sock *sk2 = inet_lookup_listener(net,
							net->ipv4.tcp_death_row.hashinfo,
							skb, __tcp_hdrlen(th),
							iph->saddr, th->source,
							iph->daddr, th->dest,
							inet_iif(skb),
							sdif);
		if (sk2) {
			inet_twsk_deschedule_put(inet_twsk(sk));
			sk = sk2;
			tcp_v4_restore_cb(skb);
			refcounted = false;
			goto process;
		}
	}
		/* to ACK */
		fallthrough;
	case TCP_TW_ACK:
		tcp_v4_timewait_ack(sk, skb);
		break;
	case TCP_TW_RST:
		tcp_v4_send_reset(sk, skb);
		inet_twsk_deschedule_put(inet_twsk(sk));
		goto discard_it;
	case TCP_TW_SUCCESS:;
	}
	goto discard_it;
}
2243 
/* timewait_sock glue for IPv4 TCP: object size plus the hooks run when
 * a socket enters/leaves TIME_WAIT.
 */
static struct timewait_sock_ops tcp_timewait_sock_ops = {
	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
	.twsk_unique	= tcp_twsk_unique,
	.twsk_destructor= tcp_twsk_destructor,
};
2249 
2250 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2251 {
2252 	struct dst_entry *dst = skb_dst(skb);
2253 
2254 	if (dst && dst_hold_safe(dst)) {
2255 		rcu_assign_pointer(sk->sk_rx_dst, dst);
2256 		sk->sk_rx_dst_ifindex = skb->skb_iif;
2257 	}
2258 }
2259 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2260 
/* IPv4-specific operations for TCP connection sockets: transmit path,
 * header sizes and per-family sockopt/address helpers.
 */
const struct inet_connection_sock_af_ops ipv4_specific = {
	.queue_xmit	   = ip_queue_xmit,
	.send_check	   = tcp_v4_send_check,
	.rebuild_header	   = inet_sk_rebuild_header,
	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
	.conn_request	   = tcp_v4_conn_request,
	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
	.net_header_len	   = sizeof(struct iphdr),
	.setsockopt	   = ip_setsockopt,
	.getsockopt	   = ip_getsockopt,
	.addr2sockaddr	   = inet_csk_addr2sockaddr,
	.sockaddr_len	   = sizeof(struct sockaddr_in),
	.mtu_reduced	   = tcp_v4_mtu_reduced,
};
EXPORT_SYMBOL(ipv4_specific);

#ifdef CONFIG_TCP_MD5SIG
/* TCP-MD5 signature (RFC 2385) helpers for IPv4 sockets. */
static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
	.md5_lookup		= tcp_v4_md5_lookup,
	.calc_md5_hash		= tcp_v4_md5_hash_skb,
	.md5_parse		= tcp_v4_parse_md5_keys,
};
#endif
2284 
2285 /* NOTE: A lot of things set to zero explicitly by call to
2286  *       sk_alloc() so need not be done here.
2287  */
2288 static int tcp_v4_init_sock(struct sock *sk)
2289 {
2290 	struct inet_connection_sock *icsk = inet_csk(sk);
2291 
2292 	tcp_init_sock(sk);
2293 
2294 	icsk->icsk_af_ops = &ipv4_specific;
2295 
2296 #ifdef CONFIG_TCP_MD5SIG
2297 	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2298 #endif
2299 
2300 	return 0;
2301 }
2302 
/* Per-socket teardown (proto->destroy) for IPv4 TCP: stop timers and
 * release everything TCP attached to the socket - congestion control
 * state, ULP, queued skbs, MD5 keys, the bound port and fastopen state.
 */
void tcp_v4_destroy_sock(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	trace_tcp_destroy_sock(sk);

	tcp_clear_xmit_timers(sk);

	tcp_cleanup_congestion_control(sk);

	tcp_cleanup_ulp(sk);

	/* Cleanup up the write buffer. */
	tcp_write_queue_purge(sk);

	/* Check if we want to disable active TFO */
	tcp_fastopen_active_disable_ofo_check(sk);

	/* Cleans up our, hopefully empty, out_of_order_queue. */
	skb_rbtree_purge(&tp->out_of_order_queue);

#ifdef CONFIG_TCP_MD5SIG
	/* Clean up the MD5 key list, if any */
	if (tp->md5sig_info) {
		tcp_clear_md5_list(sk);
		kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
		tp->md5sig_info = NULL;
		static_branch_slow_dec_deferred(&tcp_md5_needed);
	}
#endif

	/* Clean up a referenced TCP bind bucket. */
	if (inet_csk(sk)->icsk_bind_hash)
		inet_put_port(sk);

	BUG_ON(rcu_access_pointer(tp->fastopen_rsk));

	/* If socket is aborted during connect operation */
	tcp_free_fastopen_req(tp);
	tcp_fastopen_destroy_cipher(sk);
	tcp_saved_syn_free(tp);

	sk_sockets_allocated_dec(sk);
}
EXPORT_SYMBOL(tcp_v4_destroy_sock);
2348 
2349 #ifdef CONFIG_PROC_FS
2350 /* Proc filesystem TCP sock list dumping. */
2351 
2352 static unsigned short seq_file_family(const struct seq_file *seq);
2353 
2354 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2355 {
2356 	unsigned short family = seq_file_family(seq);
2357 
2358 	/* AF_UNSPEC is used as a match all */
2359 	return ((family == AF_UNSPEC || family == sk->sk_family) &&
2360 		net_eq(sock_net(sk), seq_file_net(seq)));
2361 }
2362 
2363 /* Find a non empty bucket (starting from st->bucket)
2364  * and return the first sk from it.
2365  */
static void *listening_get_first(struct seq_file *seq)
{
	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
	struct tcp_iter_state *st = seq->private;

	st->offset = 0;
	for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) {
		struct inet_listen_hashbucket *ilb2;
		struct hlist_nulls_node *node;
		struct sock *sk;

		ilb2 = &hinfo->lhash2[st->bucket];
		/* Lockless peek to skip empty buckets cheaply. */
		if (hlist_nulls_empty(&ilb2->nulls_head))
			continue;

		spin_lock(&ilb2->lock);
		sk_nulls_for_each(sk, node, &ilb2->nulls_head) {
			if (seq_sk_match(seq, sk))
				/* Bucket lock stays held; it is released
				 * by listening_get_next() or tcp_seq_stop().
				 */
				return sk;
		}
		spin_unlock(&ilb2->lock);
	}

	return NULL;
}
2391 
2392 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
2393  * If "cur" is the last one in the st->bucket,
2394  * call listening_get_first() to return the first sk of the next
2395  * non empty bucket.
2396  */
static void *listening_get_next(struct seq_file *seq, void *cur)
{
	struct tcp_iter_state *st = seq->private;
	struct inet_listen_hashbucket *ilb2;
	struct hlist_nulls_node *node;
	struct inet_hashinfo *hinfo;
	struct sock *sk = cur;

	/* Runs with the current bucket lock held (see listening_get_first()). */
	++st->num;
	++st->offset;

	sk = sk_nulls_next(sk);
	sk_nulls_for_each_from(sk, node) {
		if (seq_sk_match(seq, sk))
			return sk;
	}

	/* Bucket exhausted: drop its lock and move to the next one. */
	hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
	ilb2 = &hinfo->lhash2[st->bucket];
	spin_unlock(&ilb2->lock);
	++st->bucket;
	return listening_get_first(seq);
}
2420 
2421 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2422 {
2423 	struct tcp_iter_state *st = seq->private;
2424 	void *rc;
2425 
2426 	st->bucket = 0;
2427 	st->offset = 0;
2428 	rc = listening_get_first(seq);
2429 
2430 	while (rc && *pos) {
2431 		rc = listening_get_next(seq, rc);
2432 		--*pos;
2433 	}
2434 	return rc;
2435 }
2436 
/* Lockless check: true if established-hash bucket st->bucket holds no
 * sockets.  Used to skip empty buckets without taking their lock.
 */
static inline bool empty_bucket(struct inet_hashinfo *hinfo,
				const struct tcp_iter_state *st)
{
	return hlist_nulls_empty(&hinfo->ehash[st->bucket].chain);
}
2442 
2443 /*
2444  * Get first established socket starting from bucket given in st->bucket.
2445  * If st->bucket is zero, the very first socket in the hash is returned.
2446  */
static void *established_get_first(struct seq_file *seq)
{
	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
	struct tcp_iter_state *st = seq->private;

	st->offset = 0;
	for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) {
		struct sock *sk;
		struct hlist_nulls_node *node;
		spinlock_t *lock = inet_ehash_lockp(hinfo, st->bucket);

		/* The established hash can be large; don't hog the CPU. */
		cond_resched();

		/* Lockless fast path for the common case of empty buckets */
		if (empty_bucket(hinfo, st))
			continue;

		spin_lock_bh(lock);
		sk_nulls_for_each(sk, node, &hinfo->ehash[st->bucket].chain) {
			if (seq_sk_match(seq, sk))
				/* Bucket lock stays held; released by
				 * established_get_next() or tcp_seq_stop().
				 */
				return sk;
		}
		spin_unlock_bh(lock);
	}

	return NULL;
}
2474 
/* Return the sk following "cur" in the current established bucket;
 * when the bucket is exhausted, release its lock and continue with
 * established_get_first() on the next non-empty bucket.
 */
static void *established_get_next(struct seq_file *seq, void *cur)
{
	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
	struct tcp_iter_state *st = seq->private;
	struct hlist_nulls_node *node;
	struct sock *sk = cur;

	++st->num;
	++st->offset;

	sk = sk_nulls_next(sk);

	sk_nulls_for_each_from(sk, node) {
		if (seq_sk_match(seq, sk))
			return sk;
	}

	/* Bucket exhausted: drop its lock and move on. */
	spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
	++st->bucket;
	return established_get_first(seq);
}
2496 
2497 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2498 {
2499 	struct tcp_iter_state *st = seq->private;
2500 	void *rc;
2501 
2502 	st->bucket = 0;
2503 	rc = established_get_first(seq);
2504 
2505 	while (rc && pos) {
2506 		rc = established_get_next(seq, rc);
2507 		--pos;
2508 	}
2509 	return rc;
2510 }
2511 
2512 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2513 {
2514 	void *rc;
2515 	struct tcp_iter_state *st = seq->private;
2516 
2517 	st->state = TCP_SEQ_STATE_LISTENING;
2518 	rc	  = listening_get_idx(seq, &pos);
2519 
2520 	if (!rc) {
2521 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2522 		rc	  = established_get_idx(seq, pos);
2523 	}
2524 
2525 	return rc;
2526 }
2527 
/* Re-seek to the iterator position recorded in st (state/bucket/offset)
 * after a seq_file restart at the same *pos.  Returns the sk at that
 * position, or NULL when it can no longer be reached.  st->num is
 * restored afterwards so record numbering is unaffected by the replay.
 */
static void *tcp_seek_last_pos(struct seq_file *seq)
{
	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
	struct tcp_iter_state *st = seq->private;
	int bucket = st->bucket;
	int offset = st->offset;
	int orig_num = st->num;
	void *rc = NULL;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		if (st->bucket > hinfo->lhash2_mask)
			break;
		rc = listening_get_first(seq);
		/* Replay the saved in-bucket offset; stop if the walk
		 * spilled into a different bucket.
		 */
		while (offset-- && rc && bucket == st->bucket)
			rc = listening_get_next(seq, rc);
		if (rc)
			break;
		st->bucket = 0;
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		fallthrough;
	case TCP_SEQ_STATE_ESTABLISHED:
		if (st->bucket > hinfo->ehash_mask)
			break;
		rc = established_get_first(seq);
		while (offset-- && rc && bucket == st->bucket)
			rc = established_get_next(seq, rc);
	}

	st->num = orig_num;

	return rc;
}
2561 
2562 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2563 {
2564 	struct tcp_iter_state *st = seq->private;
2565 	void *rc;
2566 
2567 	if (*pos && *pos == st->last_pos) {
2568 		rc = tcp_seek_last_pos(seq);
2569 		if (rc)
2570 			goto out;
2571 	}
2572 
2573 	st->state = TCP_SEQ_STATE_LISTENING;
2574 	st->num = 0;
2575 	st->bucket = 0;
2576 	st->offset = 0;
2577 	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2578 
2579 out:
2580 	st->last_pos = *pos;
2581 	return rc;
2582 }
2583 EXPORT_SYMBOL(tcp_seq_start);
2584 
/* seq_file ->next(): advance the iterator, switching from the listening
 * table to the established table once the former is exhausted.
 */
void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc = NULL;

	if (v == SEQ_START_TOKEN) {
		rc = tcp_get_idx(seq, 0);
		goto out;
	}

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		rc = listening_get_next(seq, v);
		if (!rc) {
			st->state = TCP_SEQ_STATE_ESTABLISHED;
			st->bucket = 0;
			st->offset = 0;
			rc	  = established_get_first(seq);
		}
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		rc = established_get_next(seq, v);
		break;
	}
out:
	++*pos;
	/* Remember the position so tcp_seq_start() can fast-resume. */
	st->last_pos = *pos;
	return rc;
}
EXPORT_SYMBOL(tcp_seq_next);
2615 
/* seq_file ->stop(): release the bucket lock that the get_first/get_next
 * helpers leave held while a valid element (v) is being shown.
 */
void tcp_seq_stop(struct seq_file *seq, void *v)
{
	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
	struct tcp_iter_state *st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		if (v != SEQ_START_TOKEN)
			spin_unlock(&hinfo->lhash2[st->bucket].lock);
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		if (v)
			spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
		break;
	}
}
EXPORT_SYMBOL(tcp_seq_stop);
2633 
/* Emit one /proc/net/tcp line for a pending (SYN_RECV) request socket.
 * Columns that have no meaning for a request are printed as fixed
 * values (see the inline comments in the argument list).
 */
static void get_openreq4(const struct request_sock *req,
			 struct seq_file *f, int i)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	long delta = req->rsk_timer.expires - jiffies;

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
		i,
		ireq->ir_loc_addr,
		ireq->ir_num,
		ireq->ir_rmt_addr,
		ntohs(ireq->ir_rmt_port),
		TCP_SYN_RECV,
		0, 0, /* could print option size, but that is af dependent. */
		1,    /* timers active (only the expire timer) */
		jiffies_delta_to_clock_t(delta),
		req->num_timeout,
		from_kuid_munged(seq_user_ns(f),
				 sock_i_uid(req->rsk_listener)),
		0,  /* non standard timer */
		0, /* open_requests have no inode */
		0,
		req);
}
2659 
/* Emit one /proc/net/tcp line for a full TCP socket.  Runs without the
 * socket lock, so some fields may be transiently inconsistent (see the
 * rx_queue comment below).
 */
static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
{
	int timer_active;
	unsigned long timer_expires;
	const struct tcp_sock *tp = tcp_sk(sk);
	const struct inet_connection_sock *icsk = inet_csk(sk);
	const struct inet_sock *inet = inet_sk(sk);
	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
	__be32 dest = inet->inet_daddr;
	__be32 src = inet->inet_rcv_saddr;
	__u16 destp = ntohs(inet->inet_dport);
	__u16 srcp = ntohs(inet->inet_sport);
	int rx_queue;
	int state;

	/* "tr" column codes: 1 = retransmit/reo/loss-probe timer,
	 * 4 = zero window probe, 2 = sk_timer pending, 0 = none.
	 */
	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
		timer_active	= 1;
		timer_expires	= icsk->icsk_timeout;
	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
		timer_active	= 4;
		timer_expires	= icsk->icsk_timeout;
	} else if (timer_pending(&sk->sk_timer)) {
		timer_active	= 2;
		timer_expires	= sk->sk_timer.expires;
	} else {
		timer_active	= 0;
		timer_expires = jiffies;
	}

	state = inet_sk_state_load(sk);
	if (state == TCP_LISTEN)
		rx_queue = READ_ONCE(sk->sk_ack_backlog);
	else
		/* Because we don't lock the socket,
		 * we might find a transient negative value.
		 */
		rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
				      READ_ONCE(tp->copied_seq), 0);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
		i, src, srcp, dest, destp, state,
		READ_ONCE(tp->write_seq) - tp->snd_una,
		rx_queue,
		timer_active,
		jiffies_delta_to_clock_t(timer_expires - jiffies),
		icsk->icsk_retransmits,
		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
		icsk->icsk_probes_out,
		sock_i_ino(sk),
		refcount_read(&sk->sk_refcnt), sk,
		jiffies_to_clock_t(icsk->icsk_rto),
		jiffies_to_clock_t(icsk->icsk_ack.ato),
		(icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
		tcp_snd_cwnd(tp),
		state == TCP_LISTEN ?
		    fastopenq->max_qlen :
		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
}
2721 
/* Emit one /proc/net/tcp line for a TIME_WAIT socket.  Most per-socket
 * state no longer exists, so the corresponding columns are zeros.
 */
static void get_timewait4_sock(const struct inet_timewait_sock *tw,
			       struct seq_file *f, int i)
{
	long delta = tw->tw_timer.expires - jiffies;
	__be32 dest, src;
	__u16 destp, srcp;

	dest  = tw->tw_daddr;
	src   = tw->tw_rcv_saddr;
	destp = ntohs(tw->tw_dport);
	srcp  = ntohs(tw->tw_sport);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
		refcount_read(&tw->tw_refcnt), tw);
}
2740 
2741 #define TMPSZ 150
2742 
2743 static int tcp4_seq_show(struct seq_file *seq, void *v)
2744 {
2745 	struct tcp_iter_state *st;
2746 	struct sock *sk = v;
2747 
2748 	seq_setwidth(seq, TMPSZ - 1);
2749 	if (v == SEQ_START_TOKEN) {
2750 		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2751 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2752 			   "inode");
2753 		goto out;
2754 	}
2755 	st = seq->private;
2756 
2757 	if (sk->sk_state == TCP_TIME_WAIT)
2758 		get_timewait4_sock(v, seq, st->num);
2759 	else if (sk->sk_state == TCP_NEW_SYN_RECV)
2760 		get_openreq4(v, seq, st->num);
2761 	else
2762 		get_tcp4_sock(v, seq, st->num);
2763 out:
2764 	seq_pad(seq, '\n');
2765 	return 0;
2766 }
2767 
2768 #ifdef CONFIG_BPF_SYSCALL
/* State for the bpf tcp iterator: extends tcp_iter_state with a batch
 * of referenced sockets grabbed from the current bucket, so the bpf
 * prog can run on each socket without the bucket lock held.
 */
struct bpf_tcp_iter_state {
	struct tcp_iter_state state;
	unsigned int cur_sk;	/* next batch entry to hand to seq_show() */
	unsigned int end_sk;	/* number of valid entries in batch[] */
	unsigned int max_sk;	/* capacity of batch[] */
	struct sock **batch;	/* sockets, one reference held on each */
	bool st_bucket_done;	/* did the batch capture the whole bucket? */
};

/* Context handed to bpf iterator programs; uid is 8-byte aligned for
 * a stable bpf-visible layout.
 */
struct bpf_iter__tcp {
	__bpf_md_ptr(struct bpf_iter_meta *, meta);
	__bpf_md_ptr(struct sock_common *, sk_common);
	uid_t uid __aligned(8);
};
2783 
2784 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
2785 			     struct sock_common *sk_common, uid_t uid)
2786 {
2787 	struct bpf_iter__tcp ctx;
2788 
2789 	meta->seq_num--;  /* skip SEQ_START_TOKEN */
2790 	ctx.meta = meta;
2791 	ctx.sk_common = sk_common;
2792 	ctx.uid = uid;
2793 	return bpf_iter_run_prog(prog, &ctx);
2794 }
2795 
2796 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
2797 {
2798 	while (iter->cur_sk < iter->end_sk)
2799 		sock_gen_put(iter->batch[iter->cur_sk++]);
2800 }
2801 
2802 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
2803 				      unsigned int new_batch_sz)
2804 {
2805 	struct sock **new_batch;
2806 
2807 	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
2808 			     GFP_USER | __GFP_NOWARN);
2809 	if (!new_batch)
2810 		return -ENOMEM;
2811 
2812 	bpf_iter_tcp_put_batch(iter);
2813 	kvfree(iter->batch);
2814 	iter->batch = new_batch;
2815 	iter->max_sk = new_batch_sz;
2816 
2817 	return 0;
2818 }
2819 
/* Grab references on the sockets of the current listening bucket,
 * starting from start_sk, storing at most iter->max_sk of them in the
 * batch.  Returns the total number of matching sockets seen, which can
 * exceed what fit in the batch (the caller compares it with
 * iter->end_sk to detect a too-small batch).  The bucket lock, held on
 * entry, is dropped before returning.
 */
static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
						 struct sock *start_sk)
{
	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
	struct bpf_tcp_iter_state *iter = seq->private;
	struct tcp_iter_state *st = &iter->state;
	struct hlist_nulls_node *node;
	unsigned int expected = 1;
	struct sock *sk;

	sock_hold(start_sk);
	iter->batch[iter->end_sk++] = start_sk;

	sk = sk_nulls_next(start_sk);
	sk_nulls_for_each_from(sk, node) {
		if (seq_sk_match(seq, sk)) {
			if (iter->end_sk < iter->max_sk) {
				sock_hold(sk);
				iter->batch[iter->end_sk++] = sk;
			}
			expected++;
		}
	}
	spin_unlock(&hinfo->lhash2[st->bucket].lock);

	return expected;
}
2847 
/* Same as bpf_iter_tcp_listening_batch(), but for a bucket of the
 * established hash (which is locked with spin_lock_bh()).  Returns the
 * number of matching sockets in the bucket and drops the bucket lock.
 */
static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
						   struct sock *start_sk)
{
	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
	struct bpf_tcp_iter_state *iter = seq->private;
	struct tcp_iter_state *st = &iter->state;
	struct hlist_nulls_node *node;
	unsigned int expected = 1;
	struct sock *sk;

	sock_hold(start_sk);
	iter->batch[iter->end_sk++] = start_sk;

	sk = sk_nulls_next(start_sk);
	sk_nulls_for_each_from(sk, node) {
		if (seq_sk_match(seq, sk)) {
			if (iter->end_sk < iter->max_sk) {
				sock_hold(sk);
				iter->batch[iter->end_sk++] = sk;
			}
			expected++;
		}
	}
	spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));

	return expected;
}
2875 
/* Fill iter->batch with the sockets of the bucket at the current
 * iterator position.  If the batch turns out too small for the whole
 * bucket, it is reallocated to 1.5x the required size and the bucket is
 * re-read once; a second shortfall is tolerated and the partial batch
 * is used.  Returns the first socket of the new batch, or NULL when the
 * iteration is complete.
 */
static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
{
	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
	struct bpf_tcp_iter_state *iter = seq->private;
	struct tcp_iter_state *st = &iter->state;
	unsigned int expected;
	bool resized = false;
	struct sock *sk;

	/* The st->bucket is done.  Directly advance to the next
	 * bucket instead of having the tcp_seek_last_pos() to skip
	 * one by one in the current bucket and eventually find out
	 * it has to advance to the next bucket.
	 */
	if (iter->st_bucket_done) {
		st->offset = 0;
		st->bucket++;
		if (st->state == TCP_SEQ_STATE_LISTENING &&
		    st->bucket > hinfo->lhash2_mask) {
			st->state = TCP_SEQ_STATE_ESTABLISHED;
			st->bucket = 0;
		}
	}

again:
	/* Get a new batch */
	iter->cur_sk = 0;
	iter->end_sk = 0;
	iter->st_bucket_done = false;

	sk = tcp_seek_last_pos(seq);
	if (!sk)
		return NULL; /* Done */

	/* sk is returned with its bucket lock held; the batch helpers
	 * release it.
	 */
	if (st->state == TCP_SEQ_STATE_LISTENING)
		expected = bpf_iter_tcp_listening_batch(seq, sk);
	else
		expected = bpf_iter_tcp_established_batch(seq, sk);

	if (iter->end_sk == expected) {
		iter->st_bucket_done = true;
		return sk;
	}

	/* Batch was too small: grow it once and re-read the bucket. */
	if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) {
		resized = true;
		goto again;
	}

	return sk;
}
2927 
2928 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
2929 {
2930 	/* bpf iter does not support lseek, so it always
2931 	 * continue from where it was stop()-ped.
2932 	 */
2933 	if (*pos)
2934 		return bpf_iter_tcp_batch(seq);
2935 
2936 	return SEQ_START_TOKEN;
2937 }
2938 
/* seq_file ->next() for the bpf iterator: release the socket that was
 * just shown, then return the next one from the batch, refilling the
 * batch from the hash tables once it runs out.
 */
static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct bpf_tcp_iter_state *iter = seq->private;
	struct tcp_iter_state *st = &iter->state;
	struct sock *sk;

	/* Whenever seq_next() is called, the iter->cur_sk is
	 * done with seq_show(), so advance to the next sk in
	 * the batch.
	 */
	if (iter->cur_sk < iter->end_sk) {
		/* Keeping st->num consistent in tcp_iter_state.
		 * bpf_iter_tcp does not use st->num.
		 * meta.seq_num is used instead.
		 */
		st->num++;
		/* Move st->offset to the next sk in the bucket such that
		 * the future start() will resume at st->offset in
		 * st->bucket.  See tcp_seek_last_pos().
		 */
		st->offset++;
		sock_gen_put(iter->batch[iter->cur_sk++]);
	}

	if (iter->cur_sk < iter->end_sk)
		sk = iter->batch[iter->cur_sk];
	else
		sk = bpf_iter_tcp_batch(seq);

	++*pos;
	/* Keeping st->last_pos consistent in tcp_iter_state.
	 * bpf iter does not do lseek, so st->last_pos always equals to *pos.
	 */
	st->last_pos = *pos;
	return sk;
}
2975 
/* seq_file ->show() for the bpf iterator: run the bpf prog on one
 * socket.  Only full sockets (sk_fullsock()) are shown under
 * lock_sock(); request and timewait sockets are passed as-is.
 * Returns SEQ_SKIP for sockets unhashed since they were batched.
 */
static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
{
	struct bpf_iter_meta meta;
	struct bpf_prog *prog;
	struct sock *sk = v;
	uid_t uid;
	int ret;

	if (v == SEQ_START_TOKEN)
		return 0;

	if (sk_fullsock(sk))
		lock_sock(sk);

	if (unlikely(sk_unhashed(sk))) {
		ret = SEQ_SKIP;
		goto unlock;
	}

	/* uid: 0 for timewait, the listener's uid for a request socket,
	 * the socket's own uid otherwise.
	 */
	if (sk->sk_state == TCP_TIME_WAIT) {
		uid = 0;
	} else if (sk->sk_state == TCP_NEW_SYN_RECV) {
		const struct request_sock *req = v;

		uid = from_kuid_munged(seq_user_ns(seq),
				       sock_i_uid(req->rsk_listener));
	} else {
		uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
	}

	meta.seq = seq;
	prog = bpf_iter_get_info(&meta, false);
	ret = tcp_prog_seq_show(prog, &meta, v, uid);

unlock:
	if (sk_fullsock(sk))
		release_sock(sk);
	return ret;

}
3016 
/* seq_file ->stop() for the bpf iterator.  When iteration finished
 * (v == NULL), give the bpf prog one final call with a NULL socket so
 * it can emit trailing output; then release any unshown sockets left
 * in the batch.
 */
static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
{
	struct bpf_tcp_iter_state *iter = seq->private;
	struct bpf_iter_meta meta;
	struct bpf_prog *prog;

	if (!v) {
		meta.seq = seq;
		prog = bpf_iter_get_info(&meta, true);
		if (prog)
			(void)tcp_prog_seq_show(prog, &meta, v, 0);
	}

	if (iter->cur_sk < iter->end_sk) {
		bpf_iter_tcp_put_batch(iter);
		iter->st_bucket_done = false;
	}
}
3035 
/* seq_file operations for the bpf tcp iterator. */
static const struct seq_operations bpf_iter_tcp_seq_ops = {
	.show		= bpf_iter_tcp_seq_show,
	.start		= bpf_iter_tcp_seq_start,
	.next		= bpf_iter_tcp_seq_next,
	.stop		= bpf_iter_tcp_seq_stop,
};
#endif
/* Address family this seq_file should dump: AF_UNSPEC (match all) for
 * the bpf iterator, otherwise the family stored in the proc entry's
 * tcp_seq_afinfo.
 */
static unsigned short seq_file_family(const struct seq_file *seq)
{
	const struct tcp_seq_afinfo *afinfo;

#ifdef CONFIG_BPF_SYSCALL
	/* Iterated from bpf_iter.  Let the bpf prog to filter instead. */
	if (seq->op == &bpf_iter_tcp_seq_ops)
		return AF_UNSPEC;
#endif

	/* Iterated from proc fs */
	afinfo = pde_data(file_inode(seq->file));
	return afinfo->family;
}
3057 
/* seq_file operations for /proc/net/tcp. */
static const struct seq_operations tcp4_seq_ops = {
	.show		= tcp4_seq_show,
	.start		= tcp_seq_start,
	.next		= tcp_seq_next,
	.stop		= tcp_seq_stop,
};

/* Restricts /proc/net/tcp to IPv4 sockets (see seq_file_family()). */
static struct tcp_seq_afinfo tcp4_seq_afinfo = {
	.family		= AF_INET,
};
3068 
3069 static int __net_init tcp4_proc_init_net(struct net *net)
3070 {
3071 	if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
3072 			sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
3073 		return -ENOMEM;
3074 	return 0;
3075 }
3076 
/* Remove the per-netns /proc/net/tcp entry. */
static void __net_exit tcp4_proc_exit_net(struct net *net)
{
	remove_proc_entry("tcp", net->proc_net);
}

static struct pernet_operations tcp4_net_ops = {
	.init = tcp4_proc_init_net,
	.exit = tcp4_proc_exit_net,
};

/* Register /proc/net/tcp creation for all network namespaces. */
int __init tcp4_proc_init(void)
{
	return register_pernet_subsys(&tcp4_net_ops);
}

void tcp4_proc_exit(void)
{
	unregister_pernet_subsys(&tcp4_net_ops);
}
3096 #endif /* CONFIG_PROC_FS */
3097 
3098 /* @wake is one when sk_stream_write_space() calls us.
3099  * This sends EPOLLOUT only if notsent_bytes is half the limit.
3100  * This mimics the strategy used in sock_def_write_space().
3101  */
3102 bool tcp_stream_memory_free(const struct sock *sk, int wake)
3103 {
3104 	const struct tcp_sock *tp = tcp_sk(sk);
3105 	u32 notsent_bytes = READ_ONCE(tp->write_seq) -
3106 			    READ_ONCE(tp->snd_nxt);
3107 
3108 	return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
3109 }
3110 EXPORT_SYMBOL(tcp_stream_memory_free);
3111 
/* IPv4 TCP protocol descriptor registered with the socket layer;
 * maps generic socket operations onto their TCP/IPv4 implementations.
 */
struct proto tcp_prot = {
	.name			= "TCP",
	.owner			= THIS_MODULE,
	.close			= tcp_close,
	.pre_connect		= tcp_v4_pre_connect,
	.connect		= tcp_v4_connect,
	.disconnect		= tcp_disconnect,
	.accept			= inet_csk_accept,
	.ioctl			= tcp_ioctl,
	.init			= tcp_v4_init_sock,
	.destroy		= tcp_v4_destroy_sock,
	.shutdown		= tcp_shutdown,
	.setsockopt		= tcp_setsockopt,
	.getsockopt		= tcp_getsockopt,
	.bpf_bypass_getsockopt	= tcp_bpf_bypass_getsockopt,
	.keepalive		= tcp_set_keepalive,
	.recvmsg		= tcp_recvmsg,
	.sendmsg		= tcp_sendmsg,
	.splice_eof		= tcp_splice_eof,
	.backlog_rcv		= tcp_v4_do_rcv,
	.release_cb		= tcp_release_cb,
	.hash			= inet_hash,
	.unhash			= inet_unhash,
	.get_port		= inet_csk_get_port,
	.put_port		= inet_put_port,
#ifdef CONFIG_BPF_SYSCALL
	.psock_update_sk_prot	= tcp_bpf_update_proto,
#endif
	.enter_memory_pressure	= tcp_enter_memory_pressure,
	.leave_memory_pressure	= tcp_leave_memory_pressure,
	.stream_memory_free	= tcp_stream_memory_free,
	.sockets_allocated	= &tcp_sockets_allocated,
	.orphan_count		= &tcp_orphan_count,

	.memory_allocated	= &tcp_memory_allocated,
	.per_cpu_fw_alloc	= &tcp_memory_per_cpu_fw_alloc,

	.memory_pressure	= &tcp_memory_pressure,
	.sysctl_mem		= sysctl_tcp_mem,
	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
	.max_header		= MAX_TCP_HEADER,
	.obj_size		= sizeof(struct tcp_sock),
	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
	.twsk_prot		= &tcp_timewait_sock_ops,
	.rsk_prot		= &tcp_request_sock_ops,
	.h.hashinfo		= NULL,	/* per-netns; set via tcp_set_hashinfo() */
	.no_autobind		= true,
	.diag_destroy		= tcp_abort,
};
EXPORT_SYMBOL(tcp_prot);
3163 
/* Per-netns exit: release the reference on the netns' congestion
 * control module taken in tcp_sk_init().
 */
static void __net_exit tcp_sk_exit(struct net *net)
{
	if (net->ipv4.tcp_congestion_control)
		bpf_module_put(net->ipv4.tcp_congestion_control,
			       net->ipv4.tcp_congestion_control->owner);
}
3170 
/* Choose the established-connections hash table for a new netns:
 * a private one sized from the creating netns'
 * sysctl_tcp_child_ehash_entries, or the global tcp_hashinfo when that
 * sysctl is unset, @net is init_net, or the allocation fails.  Also
 * derives the TIME-WAIT bucket cap and SYN backlog from the table size.
 */
static void __net_init tcp_set_hashinfo(struct net *net)
{
	struct inet_hashinfo *hinfo;
	unsigned int ehash_entries;
	struct net *old_net;

	if (net_eq(net, &init_net))
		goto fallback;

	/* Inherit the sysctl from the netns of the creating process. */
	old_net = current->nsproxy->net_ns;
	ehash_entries = READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries);
	if (!ehash_entries)
		goto fallback;

	ehash_entries = roundup_pow_of_two(ehash_entries);
	hinfo = inet_pernet_hashinfo_alloc(&tcp_hashinfo, ehash_entries);
	if (!hinfo) {
		pr_warn("Failed to allocate TCP ehash (entries: %u) "
			"for a netns, fallback to the global one\n",
			ehash_entries);
/* Jumped to from above whenever the global table must be used. */
fallback:
		hinfo = &tcp_hashinfo;
		ehash_entries = tcp_hashinfo.ehash_mask + 1;
	}

	net->ipv4.tcp_death_row.hashinfo = hinfo;
	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2;
	net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128);
}
3200 
/* Per-netns init: set the default value of every ipv4.sysctl_tcp_*
 * knob, pick the established hash table via tcp_set_hashinfo(), and
 * select the congestion control module.  Always returns 0 (hash table
 * allocation failure falls back to the global table).
 */
static int __net_init tcp_sk_init(struct net *net)
{
	net->ipv4.sysctl_tcp_ecn = 2;
	net->ipv4.sysctl_tcp_ecn_fallback = 1;

	/* Path MTU discovery defaults. */
	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
	net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
	net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;

	/* Keepalive defaults. */
	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;

	/* Retry/retransmission defaults. */
	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
	net->ipv4.sysctl_tcp_syncookies = 1;
	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
	net->ipv4.sysctl_tcp_orphan_retries = 0;
	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
	net->ipv4.sysctl_tcp_tw_reuse = 2;
	net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;

	refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1);
	tcp_set_hashinfo(net);

	net->ipv4.sysctl_tcp_sack = 1;
	net->ipv4.sysctl_tcp_window_scaling = 1;
	net->ipv4.sysctl_tcp_timestamps = 1;
	net->ipv4.sysctl_tcp_early_retrans = 3;
	net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
	net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
	net->ipv4.sysctl_tcp_retrans_collapse = 1;
	net->ipv4.sysctl_tcp_max_reordering = 300;
	net->ipv4.sysctl_tcp_dsack = 1;
	net->ipv4.sysctl_tcp_app_win = 31;
	net->ipv4.sysctl_tcp_adv_win_scale = 1;
	net->ipv4.sysctl_tcp_frto = 2;
	net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
	/* This limits the percentage of the congestion window which we
	 * will allow a single TSO frame to consume.  Building TSO frames
	 * which are too large can cause TCP streams to be bursty.
	 */
	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
	/* Default TSQ limit of 16 TSO segments */
	net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;

	/* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. */
	net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX;

	net->ipv4.sysctl_tcp_min_tso_segs = 2;
	net->ipv4.sysctl_tcp_tso_rtt_log = 9;  /* 2^9 = 512 usec */
	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
	net->ipv4.sysctl_tcp_autocorking = 1;
	net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
	net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
	net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
	/* Child netns inherit init_net's current rmem/wmem settings. */
	if (net != &init_net) {
		memcpy(net->ipv4.sysctl_tcp_rmem,
		       init_net.ipv4.sysctl_tcp_rmem,
		       sizeof(init_net.ipv4.sysctl_tcp_rmem));
		memcpy(net->ipv4.sysctl_tcp_wmem,
		       init_net.ipv4.sysctl_tcp_wmem,
		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
	}
	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
	net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
	atomic_set(&net->ipv4.tfo_active_disable_times, 0);

	/* Set default values for PLB */
	net->ipv4.sysctl_tcp_plb_enabled = 0; /* Disabled by default */
	net->ipv4.sysctl_tcp_plb_idle_rehash_rounds = 3;
	net->ipv4.sysctl_tcp_plb_rehash_rounds = 12;
	net->ipv4.sysctl_tcp_plb_suspend_rto_sec = 60;
	/* Default congestion threshold for PLB to mark a round is 50% */
	net->ipv4.sysctl_tcp_plb_cong_thresh = (1 << TCP_PLB_SCALE) / 2;

	/* Reno is always built in */
	if (!net_eq(net, &init_net) &&
	    bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
			       init_net.ipv4.tcp_congestion_control->owner))
		net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
	else
		net->ipv4.tcp_congestion_control = &tcp_reno;

	net->ipv4.sysctl_tcp_syn_linear_timeouts = 4;
	net->ipv4.sysctl_tcp_shrink_window = 0;

	return 0;
}
3298 
/* Batched per-netns exit: purge TIME-WAIT sockets first, then free each
 * netns' private ehash and its fastopen context.  The WARN fires if
 * anything still holds a tw_refcount beyond the initial reference set
 * in tcp_sk_init().
 */
static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
{
	struct net *net;

	tcp_twsk_purge(net_exit_list, AF_INET);

	list_for_each_entry(net, net_exit_list, exit_list) {
		inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo);
		WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount));
		tcp_fastopen_ctx_destroy(net);
	}
}
3311 
/* Pernet hooks wiring up TCP state per network namespace. */
static struct pernet_operations __net_initdata tcp_sk_ops = {
       .init	   = tcp_sk_init,
       .exit	   = tcp_sk_exit,
       .exit_batch = tcp_sk_exit_batch,
};
3317 
3318 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
/* Declare the "tcp" bpf iterator target; the attached bpf prog is
 * invoked with (meta, sk_common, uid) for each visited socket.
 */
DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
		     struct sock_common *sk_common, uid_t uid)

/* Initial capacity of the iterator's socket batch. */
#define INIT_BATCH_SZ 16
3323 
3324 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
3325 {
3326 	struct bpf_tcp_iter_state *iter = priv_data;
3327 	int err;
3328 
3329 	err = bpf_iter_init_seq_net(priv_data, aux);
3330 	if (err)
3331 		return err;
3332 
3333 	err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ);
3334 	if (err) {
3335 		bpf_iter_fini_seq_net(priv_data);
3336 		return err;
3337 	}
3338 
3339 	return 0;
3340 }
3341 
/* Tear down a tcp bpf iterator's private state: release the pernet seq
 * state and free the socket batch allocated in bpf_iter_init_tcp().
 */
static void bpf_iter_fini_tcp(void *priv_data)
{
	struct bpf_tcp_iter_state *iter = priv_data;

	bpf_iter_fini_seq_net(priv_data);
	kvfree(iter->batch);
}
3349 
/* seq_file glue for the tcp bpf iterator target. */
static const struct bpf_iter_seq_info tcp_seq_info = {
	.seq_ops		= &bpf_iter_tcp_seq_ops,
	.init_seq_private	= bpf_iter_init_tcp,
	.fini_seq_private	= bpf_iter_fini_tcp,
	.seq_priv_size		= sizeof(struct bpf_tcp_iter_state),
};
3356 
3357 static const struct bpf_func_proto *
3358 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
3359 			    const struct bpf_prog *prog)
3360 {
3361 	switch (func_id) {
3362 	case BPF_FUNC_setsockopt:
3363 		return &bpf_sk_setsockopt_proto;
3364 	case BPF_FUNC_getsockopt:
3365 		return &bpf_sk_getsockopt_proto;
3366 	default:
3367 		return NULL;
3368 	}
3369 }
3370 
/* Registration record for the "tcp" bpf iterator target; the single
 * ctx arg is the visited socket (may be NULL at end of iteration).
 * ctx_arg_info[0].btf_id is filled in at boot by bpf_iter_register().
 */
static struct bpf_iter_reg tcp_reg_info = {
	.target			= "tcp",
	.ctx_arg_info_size	= 1,
	.ctx_arg_info		= {
		{ offsetof(struct bpf_iter__tcp, sk_common),
		  PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED },
	},
	.get_func_proto		= bpf_iter_tcp_get_func_proto,
	.seq_info		= &tcp_seq_info,
};
3381 
/* Boot-time registration of the tcp bpf iterator target.  Resolves the
 * BTF id for struct sock_common before registering; failure is only
 * warned about since the iterator is an optional feature.
 */
static void __init bpf_iter_register(void)
{
	tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
	if (bpf_iter_reg_target(&tcp_reg_info))
		pr_warn("Warning: could not register bpf iterator tcp\n");
}
3388 
3389 #endif
3390 
/* Boot-time TCP/IPv4 initialization: create one kernel control socket
 * per possible CPU (stored in per-cpu ipv4_tcp_sk) and register the
 * TCP pernet operations.  Panics on failure since TCP is mandatory.
 */
void __init tcp_v4_init(void)
{
	int cpu, res;

	for_each_possible_cpu(cpu) {
		struct sock *sk;

		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
					   IPPROTO_TCP, &init_net);
		if (res)
			panic("Failed to create the TCP control socket.\n");
		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);

		/* Please enforce IP_DF and IPID==0 for RST and
		 * ACK sent in SYN-RECV and TIME-WAIT state.
		 */
		inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;

		per_cpu(ipv4_tcp_sk, cpu) = sk;
	}
	if (register_pernet_subsys(&tcp_sk_ops))
		panic("Failed to create the TCP control socket.\n");

#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
	/* The bpf tcp iterator reuses the proc seq machinery, so it is
	 * only available when both BPF and PROC_FS are built in.
	 */
	bpf_iter_register();
#endif
}
3418