xref: /openbmc/linux/net/ipv4/tcp_ipv4.c (revision 38857318)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		Implementation of the Transmission Control Protocol(TCP).
8  *
9  *		IPv4 specific functions
10  *
11  *		code split from:
12  *		linux/ipv4/tcp.c
13  *		linux/ipv4/tcp_input.c
14  *		linux/ipv4/tcp_output.c
15  *
16  *		See tcp.c for author information
17  */
18 
19 /*
20  * Changes:
21  *		David S. Miller	:	New socket lookup architecture.
22  *					This code is dedicated to John Dyson.
23  *		David S. Miller :	Change semantics of established hash,
24  *					half is devoted to TIME_WAIT sockets
25  *					and the rest go in the other half.
26  *		Andi Kleen :		Add support for syncookies and fixed
27  *					some bugs: ip options weren't passed to
28  *					the TCP layer, missed a check for an
29  *					ACK bit.
30  *		Andi Kleen :		Implemented fast path mtu discovery.
31  *	     				Fixed many serious bugs in the
32  *					request_sock handling and moved
33  *					most of it into the af independent code.
34  *					Added tail drop and some other bugfixes.
35  *					Added new listen semantics.
36  *		Mike McLagan	:	Routing by source
37  *	Juan Jose Ciarlante:		ip_dynaddr bits
38  *		Andi Kleen:		various fixes.
39  *	Vitaly E. Lavrov	:	Transparent proxy revived after year
40  *					coma.
41  *	Andi Kleen		:	Fix new listen.
42  *	Andi Kleen		:	Fix accept error reporting.
43  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
44  *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
45  *					a single port at the same time.
46  */
47 
48 #define pr_fmt(fmt) "TCP: " fmt
49 
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
60 
61 #include <net/net_namespace.h>
62 #include <net/icmp.h>
63 #include <net/inet_hashtables.h>
64 #include <net/tcp.h>
65 #include <net/transp_v6.h>
66 #include <net/ipv6.h>
67 #include <net/inet_common.h>
68 #include <net/timewait_sock.h>
69 #include <net/xfrm.h>
70 #include <net/secure_seq.h>
71 #include <net/busy_poll.h>
72 
73 #include <linux/inet.h>
74 #include <linux/ipv6.h>
75 #include <linux/stddef.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
78 #include <linux/inetdevice.h>
79 #include <linux/btf_ids.h>
80 
81 #include <crypto/hash.h>
82 #include <linux/scatterlist.h>
83 
84 #include <trace/events/tcp.h>
85 
86 #ifdef CONFIG_TCP_MD5SIG
87 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
88 			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
89 #endif
90 
91 struct inet_hashinfo tcp_hashinfo;
92 EXPORT_SYMBOL(tcp_hashinfo);
93 
94 static DEFINE_PER_CPU(struct sock *, ipv4_tcp_sk);
95 
96 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
97 {
98 	return secure_tcp_seq(ip_hdr(skb)->daddr,
99 			      ip_hdr(skb)->saddr,
100 			      tcp_hdr(skb)->dest,
101 			      tcp_hdr(skb)->source);
102 }
103 
104 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
105 {
106 	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
107 }
108 
109 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
110 {
111 	const struct inet_timewait_sock *tw = inet_twsk(sktw);
112 	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
113 	struct tcp_sock *tp = tcp_sk(sk);
114 	int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;
115 
116 	if (reuse == 2) {
117 		/* Still does not detect *everything* that goes through
118 		 * lo, since we require a loopback src or dst address
119 		 * or direct binding to 'lo' interface.
120 		 */
121 		bool loopback = false;
122 		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
123 			loopback = true;
124 #if IS_ENABLED(CONFIG_IPV6)
125 		if (tw->tw_family == AF_INET6) {
126 			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
127 			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
128 			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
129 			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
130 				loopback = true;
131 		} else
132 #endif
133 		{
134 			if (ipv4_is_loopback(tw->tw_daddr) ||
135 			    ipv4_is_loopback(tw->tw_rcv_saddr))
136 				loopback = true;
137 		}
138 		if (!loopback)
139 			reuse = 0;
140 	}
141 
142 	/* With PAWS, it is safe from the viewpoint
143 	   of data integrity. Even without PAWS it is safe provided sequence
144 	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.
145 
146 	   Actually, the idea is close to VJ's one, only timestamp cache is
147 	   held not per host, but per port pair and TW bucket is used as state
148 	   holder.
149 
150 	   If TW bucket has been already destroyed we fall back to VJ's scheme
151 	   and use initial timestamp retrieved from peer table.
152 	 */
153 	if (tcptw->tw_ts_recent_stamp &&
154 	    (!twp || (reuse && time_after32(ktime_get_seconds(),
155 					    tcptw->tw_ts_recent_stamp)))) {
156 		/* In case of repair and re-using TIME-WAIT sockets we still
157 		 * want to be sure that it is safe as above but honor the
158 		 * sequence numbers and time stamps set as part of the repair
159 		 * process.
160 		 *
161 		 * Without this check re-using a TIME-WAIT socket with TCP
162 		 * repair would accumulate a -1 on the repair assigned
163 		 * sequence number. The first time it is reused the sequence
164 		 * is -1, the second time -2, etc. This fixes that issue
165 		 * without appearing to create any others.
166 		 */
167 		if (likely(!tp->repair)) {
168 			u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
169 
170 			if (!seq)
171 				seq = 1;
172 			WRITE_ONCE(tp->write_seq, seq);
173 			tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
174 			tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
175 		}
176 		sock_hold(sktw);
177 		return 1;
178 	}
179 
180 	return 0;
181 }
182 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
183 
184 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
185 			      int addr_len)
186 {
187 	/* This check is replicated from tcp_v4_connect() and intended to
188 	 * prevent BPF program called below from accessing bytes that are out
189 	 * of the bound specified by user in addr_len.
190 	 */
191 	if (addr_len < sizeof(struct sockaddr_in))
192 		return -EINVAL;
193 
194 	sock_owned_by_me(sk);
195 
196 	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
197 }
198 
199 /* This will initiate an outgoing connection. */
200 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
201 {
202 	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
203 	struct inet_sock *inet = inet_sk(sk);
204 	struct tcp_sock *tp = tcp_sk(sk);
205 	__be16 orig_sport, orig_dport;
206 	__be32 daddr, nexthop;
207 	struct flowi4 *fl4;
208 	struct rtable *rt;
209 	int err;
210 	struct ip_options_rcu *inet_opt;
211 	struct inet_timewait_death_row *tcp_death_row = sock_net(sk)->ipv4.tcp_death_row;
212 
213 	if (addr_len < sizeof(struct sockaddr_in))
214 		return -EINVAL;
215 
216 	if (usin->sin_family != AF_INET)
217 		return -EAFNOSUPPORT;
218 
219 	nexthop = daddr = usin->sin_addr.s_addr;
220 	inet_opt = rcu_dereference_protected(inet->inet_opt,
221 					     lockdep_sock_is_held(sk));
222 	if (inet_opt && inet_opt->opt.srr) {
223 		if (!daddr)
224 			return -EINVAL;
225 		nexthop = inet_opt->opt.faddr;
226 	}
227 
228 	orig_sport = inet->inet_sport;
229 	orig_dport = usin->sin_port;
230 	fl4 = &inet->cork.fl.u.ip4;
231 	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
232 			      sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
233 			      orig_dport, sk);
234 	if (IS_ERR(rt)) {
235 		err = PTR_ERR(rt);
236 		if (err == -ENETUNREACH)
237 			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
238 		return err;
239 	}
240 
241 	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
242 		ip_rt_put(rt);
243 		return -ENETUNREACH;
244 	}
245 
246 	if (!inet_opt || !inet_opt->opt.srr)
247 		daddr = fl4->daddr;
248 
249 	if (!inet->inet_saddr)
250 		inet->inet_saddr = fl4->saddr;
251 	sk_rcv_saddr_set(sk, inet->inet_saddr);
252 
253 	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
254 		/* Reset inherited state */
255 		tp->rx_opt.ts_recent	   = 0;
256 		tp->rx_opt.ts_recent_stamp = 0;
257 		if (likely(!tp->repair))
258 			WRITE_ONCE(tp->write_seq, 0);
259 	}
260 
261 	inet->inet_dport = usin->sin_port;
262 	sk_daddr_set(sk, daddr);
263 
264 	inet_csk(sk)->icsk_ext_hdr_len = 0;
265 	if (inet_opt)
266 		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
267 
268 	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
269 
270 	/* Socket identity is still unknown (sport may be zero).
271 	 * However we set state to SYN-SENT and not releasing socket
272 	 * lock select source port, enter ourselves into the hash tables and
273 	 * complete initialization after this.
274 	 */
275 	tcp_set_state(sk, TCP_SYN_SENT);
276 	err = inet_hash_connect(tcp_death_row, sk);
277 	if (err)
278 		goto failure;
279 
280 	sk_set_txhash(sk);
281 
282 	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
283 			       inet->inet_sport, inet->inet_dport, sk);
284 	if (IS_ERR(rt)) {
285 		err = PTR_ERR(rt);
286 		rt = NULL;
287 		goto failure;
288 	}
289 	/* OK, now commit destination to socket.  */
290 	sk->sk_gso_type = SKB_GSO_TCPV4;
291 	sk_setup_caps(sk, &rt->dst);
292 	rt = NULL;
293 
294 	if (likely(!tp->repair)) {
295 		if (!tp->write_seq)
296 			WRITE_ONCE(tp->write_seq,
297 				   secure_tcp_seq(inet->inet_saddr,
298 						  inet->inet_daddr,
299 						  inet->inet_sport,
300 						  usin->sin_port));
301 		tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
302 						 inet->inet_saddr,
303 						 inet->inet_daddr);
304 	}
305 
306 	inet->inet_id = prandom_u32();
307 
308 	if (tcp_fastopen_defer_connect(sk, &err))
309 		return err;
310 	if (err)
311 		goto failure;
312 
313 	err = tcp_connect(sk);
314 
315 	if (err)
316 		goto failure;
317 
318 	return 0;
319 
320 failure:
321 	/*
322 	 * This unhashes the socket and releases the local port,
323 	 * if necessary.
324 	 */
325 	tcp_set_state(sk, TCP_CLOSE);
326 	ip_rt_put(rt);
327 	sk->sk_route_caps = 0;
328 	inet->inet_dport = 0;
329 	return err;
330 }
331 EXPORT_SYMBOL(tcp_v4_connect);
332 
333 /*
334  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
335  * It can be called through tcp_release_cb() if socket was owned by user
336  * at the time tcp_v4_err() was called to handle ICMP message.
337  */
338 void tcp_v4_mtu_reduced(struct sock *sk)
339 {
340 	struct inet_sock *inet = inet_sk(sk);
341 	struct dst_entry *dst;
342 	u32 mtu;
343 
344 	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
345 		return;
346 	mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
347 	dst = inet_csk_update_pmtu(sk, mtu);
348 	if (!dst)
349 		return;
350 
351 	/* Something is about to be wrong... Remember soft error
352 	 * for the case, if this connection will not able to recover.
353 	 */
354 	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
355 		sk->sk_err_soft = EMSGSIZE;
356 
357 	mtu = dst_mtu(dst);
358 
359 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
360 	    ip_sk_accept_pmtu(sk) &&
361 	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
362 		tcp_sync_mss(sk, mtu);
363 
364 		/* Resend the TCP packet because it's
365 		 * clear that the old packet has been
366 		 * dropped. This is the new "fast" path mtu
367 		 * discovery.
368 		 */
369 		tcp_simple_retransmit(sk);
370 	} /* else let the usual retransmit timer handle it */
371 }
372 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
373 
374 static void do_redirect(struct sk_buff *skb, struct sock *sk)
375 {
376 	struct dst_entry *dst = __sk_dst_check(sk, 0);
377 
378 	if (dst)
379 		dst->ops->redirect(dst, sk, skb);
380 }
381 
382 
383 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
384 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
385 {
386 	struct request_sock *req = inet_reqsk(sk);
387 	struct net *net = sock_net(sk);
388 
389 	/* ICMPs are not backlogged, hence we cannot get
390 	 * an established socket here.
391 	 */
392 	if (seq != tcp_rsk(req)->snt_isn) {
393 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
394 	} else if (abort) {
395 		/*
396 		 * Still in SYN_RECV, just remove it silently.
397 		 * There is no good way to pass the error to the newly
398 		 * created socket, and POSIX does not want network
399 		 * errors returned from accept().
400 		 */
401 		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
402 		tcp_listendrop(req->rsk_listener);
403 	}
404 	reqsk_put(req);
405 }
406 EXPORT_SYMBOL(tcp_req_err);
407 
408 /* TCP-LD (RFC 6069) logic */
409 void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
410 {
411 	struct inet_connection_sock *icsk = inet_csk(sk);
412 	struct tcp_sock *tp = tcp_sk(sk);
413 	struct sk_buff *skb;
414 	s32 remaining;
415 	u32 delta_us;
416 
417 	if (sock_owned_by_user(sk))
418 		return;
419 
420 	if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
421 	    !icsk->icsk_backoff)
422 		return;
423 
424 	skb = tcp_rtx_queue_head(sk);
425 	if (WARN_ON_ONCE(!skb))
426 		return;
427 
428 	icsk->icsk_backoff--;
429 	icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
430 	icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
431 
432 	tcp_mstamp_refresh(tp);
433 	delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
434 	remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
435 
436 	if (remaining > 0) {
437 		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
438 					  remaining, TCP_RTO_MAX);
439 	} else {
440 		/* RTO revert clocked out retransmission.
441 		 * Will retransmit now.
442 		 */
443 		tcp_retransmit_timer(sk);
444 	}
445 }
446 EXPORT_SYMBOL(tcp_ld_RTO_revert);
447 
448 /*
449  * This routine is called by the ICMP module when it gets some
450  * sort of error condition.  If err < 0 then the socket should
451  * be closed and the error returned to the user.  If err > 0
452  * it's just the icmp type << 8 | icmp code.  After adjustment
453  * header points to the first 8 bytes of the tcp header.  We need
454  * to find the appropriate port.
455  *
456  * The locking strategy used here is very "optimistic". When
457  * someone else accesses the socket the ICMP is just dropped
458  * and for some paths there is no check at all.
459  * A more general error queue to queue errors for later handling
460  * is probably better.
461  *
462  */
463 
464 int tcp_v4_err(struct sk_buff *skb, u32 info)
465 {
466 	const struct iphdr *iph = (const struct iphdr *)skb->data;
467 	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
468 	struct tcp_sock *tp;
469 	struct inet_sock *inet;
470 	const int type = icmp_hdr(skb)->type;
471 	const int code = icmp_hdr(skb)->code;
472 	struct sock *sk;
473 	struct request_sock *fastopen;
474 	u32 seq, snd_una;
475 	int err;
476 	struct net *net = dev_net(skb->dev);
477 
478 	sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
479 				       th->dest, iph->saddr, ntohs(th->source),
480 				       inet_iif(skb), 0);
481 	if (!sk) {
482 		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
483 		return -ENOENT;
484 	}
485 	if (sk->sk_state == TCP_TIME_WAIT) {
486 		inet_twsk_put(inet_twsk(sk));
487 		return 0;
488 	}
489 	seq = ntohl(th->seq);
490 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
491 		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
492 				     type == ICMP_TIME_EXCEEDED ||
493 				     (type == ICMP_DEST_UNREACH &&
494 				      (code == ICMP_NET_UNREACH ||
495 				       code == ICMP_HOST_UNREACH)));
496 		return 0;
497 	}
498 
499 	bh_lock_sock(sk);
500 	/* If too many ICMPs get dropped on busy
501 	 * servers this needs to be solved differently.
502 	 * We do take care of PMTU discovery (RFC1191) special case :
503 	 * we can receive locally generated ICMP messages while socket is held.
504 	 */
505 	if (sock_owned_by_user(sk)) {
506 		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
507 			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
508 	}
509 	if (sk->sk_state == TCP_CLOSE)
510 		goto out;
511 
512 	if (static_branch_unlikely(&ip4_min_ttl)) {
513 		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
514 		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
515 			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
516 			goto out;
517 		}
518 	}
519 
520 	tp = tcp_sk(sk);
521 	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
522 	fastopen = rcu_dereference(tp->fastopen_rsk);
523 	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
524 	if (sk->sk_state != TCP_LISTEN &&
525 	    !between(seq, snd_una, tp->snd_nxt)) {
526 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
527 		goto out;
528 	}
529 
530 	switch (type) {
531 	case ICMP_REDIRECT:
532 		if (!sock_owned_by_user(sk))
533 			do_redirect(skb, sk);
534 		goto out;
535 	case ICMP_SOURCE_QUENCH:
536 		/* Just silently ignore these. */
537 		goto out;
538 	case ICMP_PARAMETERPROB:
539 		err = EPROTO;
540 		break;
541 	case ICMP_DEST_UNREACH:
542 		if (code > NR_ICMP_UNREACH)
543 			goto out;
544 
545 		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
546 			/* We are not interested in TCP_LISTEN and open_requests
547 			 * (SYN-ACKs send out by Linux are always <576bytes so
548 			 * they should go through unfragmented).
549 			 */
550 			if (sk->sk_state == TCP_LISTEN)
551 				goto out;
552 
553 			WRITE_ONCE(tp->mtu_info, info);
554 			if (!sock_owned_by_user(sk)) {
555 				tcp_v4_mtu_reduced(sk);
556 			} else {
557 				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
558 					sock_hold(sk);
559 			}
560 			goto out;
561 		}
562 
563 		err = icmp_err_convert[code].errno;
564 		/* check if this ICMP message allows revert of backoff.
565 		 * (see RFC 6069)
566 		 */
567 		if (!fastopen &&
568 		    (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
569 			tcp_ld_RTO_revert(sk, seq);
570 		break;
571 	case ICMP_TIME_EXCEEDED:
572 		err = EHOSTUNREACH;
573 		break;
574 	default:
575 		goto out;
576 	}
577 
578 	switch (sk->sk_state) {
579 	case TCP_SYN_SENT:
580 	case TCP_SYN_RECV:
581 		/* Only in fast or simultaneous open. If a fast open socket is
582 		 * already accepted it is treated as a connected one below.
583 		 */
584 		if (fastopen && !fastopen->sk)
585 			break;
586 
587 		ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
588 
589 		if (!sock_owned_by_user(sk)) {
590 			sk->sk_err = err;
591 
592 			sk_error_report(sk);
593 
594 			tcp_done(sk);
595 		} else {
596 			sk->sk_err_soft = err;
597 		}
598 		goto out;
599 	}
600 
601 	/* If we've already connected we will keep trying
602 	 * until we time out, or the user gives up.
603 	 *
604 	 * rfc1122 4.2.3.9 allows to consider as hard errors
605 	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
606 	 * but it is obsoleted by pmtu discovery).
607 	 *
608 	 * Note, that in modern internet, where routing is unreliable
609 	 * and in each dark corner broken firewalls sit, sending random
610 	 * errors ordered by their masters even this two messages finally lose
611 	 * their original sense (even Linux sends invalid PORT_UNREACHs)
612 	 *
613 	 * Now we are in compliance with RFCs.
614 	 *							--ANK (980905)
615 	 */
616 
617 	inet = inet_sk(sk);
618 	if (!sock_owned_by_user(sk) && inet->recverr) {
619 		sk->sk_err = err;
620 		sk_error_report(sk);
621 	} else	{ /* Only an error on timeout */
622 		sk->sk_err_soft = err;
623 	}
624 
625 out:
626 	bh_unlock_sock(sk);
627 	sock_put(sk);
628 	return 0;
629 }
630 
631 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
632 {
633 	struct tcphdr *th = tcp_hdr(skb);
634 
635 	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
636 	skb->csum_start = skb_transport_header(skb) - skb->head;
637 	skb->csum_offset = offsetof(struct tcphdr, check);
638 }
639 
640 /* This routine computes an IPv4 TCP checksum. */
641 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
642 {
643 	const struct inet_sock *inet = inet_sk(sk);
644 
645 	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
646 }
647 EXPORT_SYMBOL(tcp_v4_send_check);
648 
649 /*
650  *	This routine will send an RST to the other tcp.
651  *
652  *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
653  *		      for reset.
654  *	Answer: if a packet caused RST, it is not for a socket
655  *		existing in our system, if it is matched to a socket,
656  *		it is just duplicate segment or bug in other side's TCP.
657  *		So that we build reply only basing on parameters
658  *		arrived with segment.
659  *	Exception: precedence violation. We do not implement it in any case.
660  */
661 
/* Bytes of TCP option space reserved in the RST reply built below: room for
 * an aligned MD5 signature option when MD5 support is built in, otherwise a
 * single 32-bit word.
 */
#if defined(CONFIG_TCP_MD5SIG)
# define OPTION_BYTES TCPOLEN_MD5SIG_ALIGNED
#else
# define OPTION_BYTES sizeof(__be32)
#endif
667 
668 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
669 {
670 	const struct tcphdr *th = tcp_hdr(skb);
671 	struct {
672 		struct tcphdr th;
673 		__be32 opt[OPTION_BYTES / sizeof(__be32)];
674 	} rep;
675 	struct ip_reply_arg arg;
676 #ifdef CONFIG_TCP_MD5SIG
677 	struct tcp_md5sig_key *key = NULL;
678 	const __u8 *hash_location = NULL;
679 	unsigned char newhash[16];
680 	int genhash;
681 	struct sock *sk1 = NULL;
682 #endif
683 	u64 transmit_time = 0;
684 	struct sock *ctl_sk;
685 	struct net *net;
686 
687 	/* Never send a reset in response to a reset. */
688 	if (th->rst)
689 		return;
690 
691 	/* If sk not NULL, it means we did a successful lookup and incoming
692 	 * route had to be correct. prequeue might have dropped our dst.
693 	 */
694 	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
695 		return;
696 
697 	/* Swap the send and the receive. */
698 	memset(&rep, 0, sizeof(rep));
699 	rep.th.dest   = th->source;
700 	rep.th.source = th->dest;
701 	rep.th.doff   = sizeof(struct tcphdr) / 4;
702 	rep.th.rst    = 1;
703 
704 	if (th->ack) {
705 		rep.th.seq = th->ack_seq;
706 	} else {
707 		rep.th.ack = 1;
708 		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
709 				       skb->len - (th->doff << 2));
710 	}
711 
712 	memset(&arg, 0, sizeof(arg));
713 	arg.iov[0].iov_base = (unsigned char *)&rep;
714 	arg.iov[0].iov_len  = sizeof(rep.th);
715 
716 	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
717 #ifdef CONFIG_TCP_MD5SIG
718 	rcu_read_lock();
719 	hash_location = tcp_parse_md5sig_option(th);
720 	if (sk && sk_fullsock(sk)) {
721 		const union tcp_md5_addr *addr;
722 		int l3index;
723 
724 		/* sdif set, means packet ingressed via a device
725 		 * in an L3 domain and inet_iif is set to it.
726 		 */
727 		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
728 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
729 		key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
730 	} else if (hash_location) {
731 		const union tcp_md5_addr *addr;
732 		int sdif = tcp_v4_sdif(skb);
733 		int dif = inet_iif(skb);
734 		int l3index;
735 
736 		/*
737 		 * active side is lost. Try to find listening socket through
738 		 * source port, and then find md5 key through listening socket.
739 		 * we are not loose security here:
740 		 * Incoming packet is checked with md5 hash with finding key,
741 		 * no RST generated if md5 hash doesn't match.
742 		 */
743 		sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
744 					     ip_hdr(skb)->saddr,
745 					     th->source, ip_hdr(skb)->daddr,
746 					     ntohs(th->source), dif, sdif);
747 		/* don't send rst if it can't find key */
748 		if (!sk1)
749 			goto out;
750 
751 		/* sdif set, means packet ingressed via a device
752 		 * in an L3 domain and dif is set to it.
753 		 */
754 		l3index = sdif ? dif : 0;
755 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
756 		key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
757 		if (!key)
758 			goto out;
759 
760 
761 		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
762 		if (genhash || memcmp(hash_location, newhash, 16) != 0)
763 			goto out;
764 
765 	}
766 
767 	if (key) {
768 		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
769 				   (TCPOPT_NOP << 16) |
770 				   (TCPOPT_MD5SIG << 8) |
771 				   TCPOLEN_MD5SIG);
772 		/* Update length and the length the header thinks exists */
773 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
774 		rep.th.doff = arg.iov[0].iov_len / 4;
775 
776 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
777 				     key, ip_hdr(skb)->saddr,
778 				     ip_hdr(skb)->daddr, &rep.th);
779 	}
780 #endif
781 	/* Can't co-exist with TCPMD5, hence check rep.opt[0] */
782 	if (rep.opt[0] == 0) {
783 		__be32 mrst = mptcp_reset_option(skb);
784 
785 		if (mrst) {
786 			rep.opt[0] = mrst;
787 			arg.iov[0].iov_len += sizeof(mrst);
788 			rep.th.doff = arg.iov[0].iov_len / 4;
789 		}
790 	}
791 
792 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
793 				      ip_hdr(skb)->saddr, /* XXX */
794 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
795 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
796 	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
797 
798 	/* When socket is gone, all binding information is lost.
799 	 * routing might fail in this case. No choice here, if we choose to force
800 	 * input interface, we will misroute in case of asymmetric route.
801 	 */
802 	if (sk) {
803 		arg.bound_dev_if = sk->sk_bound_dev_if;
804 		if (sk_fullsock(sk))
805 			trace_tcp_send_reset(sk, skb);
806 	}
807 
808 	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
809 		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));
810 
811 	arg.tos = ip_hdr(skb)->tos;
812 	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
813 	local_bh_disable();
814 	ctl_sk = this_cpu_read(ipv4_tcp_sk);
815 	sock_net_set(ctl_sk, net);
816 	if (sk) {
817 		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
818 				   inet_twsk(sk)->tw_mark : sk->sk_mark;
819 		ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
820 				   inet_twsk(sk)->tw_priority : sk->sk_priority;
821 		transmit_time = tcp_transmit_time(sk);
822 	}
823 	ip_send_unicast_reply(ctl_sk,
824 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
825 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
826 			      &arg, arg.iov[0].iov_len,
827 			      transmit_time);
828 
829 	ctl_sk->sk_mark = 0;
830 	sock_net_set(ctl_sk, &init_net);
831 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
832 	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
833 	local_bh_enable();
834 
835 #ifdef CONFIG_TCP_MD5SIG
836 out:
837 	rcu_read_unlock();
838 #endif
839 }
840 
841 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
842    outside socket context is ugly, certainly. What can I do?
843  */
844 
845 static void tcp_v4_send_ack(const struct sock *sk,
846 			    struct sk_buff *skb, u32 seq, u32 ack,
847 			    u32 win, u32 tsval, u32 tsecr, int oif,
848 			    struct tcp_md5sig_key *key,
849 			    int reply_flags, u8 tos)
850 {
851 	const struct tcphdr *th = tcp_hdr(skb);
852 	struct {
853 		struct tcphdr th;
854 		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
855 #ifdef CONFIG_TCP_MD5SIG
856 			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
857 #endif
858 			];
859 	} rep;
860 	struct net *net = sock_net(sk);
861 	struct ip_reply_arg arg;
862 	struct sock *ctl_sk;
863 	u64 transmit_time;
864 
865 	memset(&rep.th, 0, sizeof(struct tcphdr));
866 	memset(&arg, 0, sizeof(arg));
867 
868 	arg.iov[0].iov_base = (unsigned char *)&rep;
869 	arg.iov[0].iov_len  = sizeof(rep.th);
870 	if (tsecr) {
871 		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
872 				   (TCPOPT_TIMESTAMP << 8) |
873 				   TCPOLEN_TIMESTAMP);
874 		rep.opt[1] = htonl(tsval);
875 		rep.opt[2] = htonl(tsecr);
876 		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
877 	}
878 
879 	/* Swap the send and the receive. */
880 	rep.th.dest    = th->source;
881 	rep.th.source  = th->dest;
882 	rep.th.doff    = arg.iov[0].iov_len / 4;
883 	rep.th.seq     = htonl(seq);
884 	rep.th.ack_seq = htonl(ack);
885 	rep.th.ack     = 1;
886 	rep.th.window  = htons(win);
887 
888 #ifdef CONFIG_TCP_MD5SIG
889 	if (key) {
890 		int offset = (tsecr) ? 3 : 0;
891 
892 		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
893 					  (TCPOPT_NOP << 16) |
894 					  (TCPOPT_MD5SIG << 8) |
895 					  TCPOLEN_MD5SIG);
896 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
897 		rep.th.doff = arg.iov[0].iov_len/4;
898 
899 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
900 				    key, ip_hdr(skb)->saddr,
901 				    ip_hdr(skb)->daddr, &rep.th);
902 	}
903 #endif
904 	arg.flags = reply_flags;
905 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
906 				      ip_hdr(skb)->saddr, /* XXX */
907 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
908 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
909 	if (oif)
910 		arg.bound_dev_if = oif;
911 	arg.tos = tos;
912 	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
913 	local_bh_disable();
914 	ctl_sk = this_cpu_read(ipv4_tcp_sk);
915 	sock_net_set(ctl_sk, net);
916 	ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
917 			   inet_twsk(sk)->tw_mark : sk->sk_mark;
918 	ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
919 			   inet_twsk(sk)->tw_priority : sk->sk_priority;
920 	transmit_time = tcp_transmit_time(sk);
921 	ip_send_unicast_reply(ctl_sk,
922 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
923 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
924 			      &arg, arg.iov[0].iov_len,
925 			      transmit_time);
926 
927 	ctl_sk->sk_mark = 0;
928 	sock_net_set(ctl_sk, &init_net);
929 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
930 	local_bh_enable();
931 }
932 
933 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
934 {
935 	struct inet_timewait_sock *tw = inet_twsk(sk);
936 	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
937 
938 	tcp_v4_send_ack(sk, skb,
939 			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
940 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
941 			tcp_time_stamp_raw() + tcptw->tw_ts_offset,
942 			tcptw->tw_ts_recent,
943 			tw->tw_bound_dev_if,
944 			tcp_twsk_md5_key(tcptw),
945 			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
946 			tw->tw_tos
947 			);
948 
949 	inet_twsk_put(tw);
950 }
951 
952 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
953 				  struct request_sock *req)
954 {
955 	const union tcp_md5_addr *addr;
956 	int l3index;
957 
958 	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
959 	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
960 	 */
961 	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
962 					     tcp_sk(sk)->snd_nxt;
963 
964 	/* RFC 7323 2.3
965 	 * The window field (SEG.WND) of every outgoing segment, with the
966 	 * exception of <SYN> segments, MUST be right-shifted by
967 	 * Rcv.Wind.Shift bits:
968 	 */
969 	addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
970 	l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
971 	tcp_v4_send_ack(sk, skb, seq,
972 			tcp_rsk(req)->rcv_nxt,
973 			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
974 			tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
975 			req->ts_recent,
976 			0,
977 			tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
978 			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
979 			ip_hdr(skb)->tos);
980 }
981 
982 /*
983  *	Send a SYN-ACK after having received a SYN.
984  *	This still operates on a request_sock only, not on a big
985  *	socket.
986  */
987 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
988 			      struct flowi *fl,
989 			      struct request_sock *req,
990 			      struct tcp_fastopen_cookie *foc,
991 			      enum tcp_synack_type synack_type,
992 			      struct sk_buff *syn_skb)
993 {
994 	const struct inet_request_sock *ireq = inet_rsk(req);
995 	struct flowi4 fl4;
996 	int err = -1;
997 	struct sk_buff *skb;
998 	u8 tos;
999 
1000 	/* First, grab a route. */
1001 	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
1002 		return -1;
1003 
1004 	skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
1005 
1006 	if (skb) {
1007 		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1008 
1009 		tos = sock_net(sk)->ipv4.sysctl_tcp_reflect_tos ?
1010 				(tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1011 				(inet_sk(sk)->tos & INET_ECN_MASK) :
1012 				inet_sk(sk)->tos;
1013 
1014 		if (!INET_ECN_is_capable(tos) &&
1015 		    tcp_bpf_ca_needs_ecn((struct sock *)req))
1016 			tos |= INET_ECN_ECT_0;
1017 
1018 		rcu_read_lock();
1019 		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1020 					    ireq->ir_rmt_addr,
1021 					    rcu_dereference(ireq->ireq_opt),
1022 					    tos);
1023 		rcu_read_unlock();
1024 		err = net_xmit_eval(err);
1025 	}
1026 
1027 	return err;
1028 }
1029 
1030 /*
1031  *	IPv4 request_sock destructor.
1032  */
1033 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1034 {
1035 	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1036 }
1037 
1038 #ifdef CONFIG_TCP_MD5SIG
/*
 * RFC2385 MD5 checksumming requires a mapping of
 * IP address -> MD5 key.
 * These mappings are kept in the sk structure.
 *
 * tcp_md5_needed is a static key that starts out false; it is exported
 * for use outside this file.
 */
DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
EXPORT_SYMBOL(tcp_md5_needed);
1047 
1048 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1049 {
1050 	if (!old)
1051 		return true;
1052 
1053 	/* l3index always overrides non-l3index */
1054 	if (old->l3index && new->l3index == 0)
1055 		return false;
1056 	if (old->l3index == 0 && new->l3index)
1057 		return true;
1058 
1059 	return old->prefixlen < new->prefixlen;
1060 }
1061 
1062 /* Find the Key structure for an address.  */
1063 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1064 					   const union tcp_md5_addr *addr,
1065 					   int family)
1066 {
1067 	const struct tcp_sock *tp = tcp_sk(sk);
1068 	struct tcp_md5sig_key *key;
1069 	const struct tcp_md5sig_info *md5sig;
1070 	__be32 mask;
1071 	struct tcp_md5sig_key *best_match = NULL;
1072 	bool match;
1073 
1074 	/* caller either holds rcu_read_lock() or socket lock */
1075 	md5sig = rcu_dereference_check(tp->md5sig_info,
1076 				       lockdep_sock_is_held(sk));
1077 	if (!md5sig)
1078 		return NULL;
1079 
1080 	hlist_for_each_entry_rcu(key, &md5sig->head, node,
1081 				 lockdep_sock_is_held(sk)) {
1082 		if (key->family != family)
1083 			continue;
1084 		if (key->flags & TCP_MD5SIG_FLAG_IFINDEX && key->l3index != l3index)
1085 			continue;
1086 		if (family == AF_INET) {
1087 			mask = inet_make_mask(key->prefixlen);
1088 			match = (key->addr.a4.s_addr & mask) ==
1089 				(addr->a4.s_addr & mask);
1090 #if IS_ENABLED(CONFIG_IPV6)
1091 		} else if (family == AF_INET6) {
1092 			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1093 						  key->prefixlen);
1094 #endif
1095 		} else {
1096 			match = false;
1097 		}
1098 
1099 		if (match && better_md5_match(best_match, key))
1100 			best_match = key;
1101 	}
1102 	return best_match;
1103 }
1104 EXPORT_SYMBOL(__tcp_md5_do_lookup);
1105 
1106 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1107 						      const union tcp_md5_addr *addr,
1108 						      int family, u8 prefixlen,
1109 						      int l3index, u8 flags)
1110 {
1111 	const struct tcp_sock *tp = tcp_sk(sk);
1112 	struct tcp_md5sig_key *key;
1113 	unsigned int size = sizeof(struct in_addr);
1114 	const struct tcp_md5sig_info *md5sig;
1115 
1116 	/* caller either holds rcu_read_lock() or socket lock */
1117 	md5sig = rcu_dereference_check(tp->md5sig_info,
1118 				       lockdep_sock_is_held(sk));
1119 	if (!md5sig)
1120 		return NULL;
1121 #if IS_ENABLED(CONFIG_IPV6)
1122 	if (family == AF_INET6)
1123 		size = sizeof(struct in6_addr);
1124 #endif
1125 	hlist_for_each_entry_rcu(key, &md5sig->head, node,
1126 				 lockdep_sock_is_held(sk)) {
1127 		if (key->family != family)
1128 			continue;
1129 		if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
1130 			continue;
1131 		if (key->l3index != l3index)
1132 			continue;
1133 		if (!memcmp(&key->addr, addr, size) &&
1134 		    key->prefixlen == prefixlen)
1135 			return key;
1136 	}
1137 	return NULL;
1138 }
1139 
1140 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1141 					 const struct sock *addr_sk)
1142 {
1143 	const union tcp_md5_addr *addr;
1144 	int l3index;
1145 
1146 	l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1147 						 addr_sk->sk_bound_dev_if);
1148 	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1149 	return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1150 }
1151 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1152 
1153 /* This can be called on a newly created socket, from other files */
1154 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1155 		   int family, u8 prefixlen, int l3index, u8 flags,
1156 		   const u8 *newkey, u8 newkeylen, gfp_t gfp)
1157 {
1158 	/* Add Key to the list */
1159 	struct tcp_md5sig_key *key;
1160 	struct tcp_sock *tp = tcp_sk(sk);
1161 	struct tcp_md5sig_info *md5sig;
1162 
1163 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1164 	if (key) {
1165 		/* Pre-existing entry - just update that one.
1166 		 * Note that the key might be used concurrently.
1167 		 * data_race() is telling kcsan that we do not care of
1168 		 * key mismatches, since changing MD5 key on live flows
1169 		 * can lead to packet drops.
1170 		 */
1171 		data_race(memcpy(key->key, newkey, newkeylen));
1172 
1173 		/* Pairs with READ_ONCE() in tcp_md5_hash_key().
1174 		 * Also note that a reader could catch new key->keylen value
1175 		 * but old key->key[], this is the reason we use __GFP_ZERO
1176 		 * at sock_kmalloc() time below these lines.
1177 		 */
1178 		WRITE_ONCE(key->keylen, newkeylen);
1179 
1180 		return 0;
1181 	}
1182 
1183 	md5sig = rcu_dereference_protected(tp->md5sig_info,
1184 					   lockdep_sock_is_held(sk));
1185 	if (!md5sig) {
1186 		md5sig = kmalloc(sizeof(*md5sig), gfp);
1187 		if (!md5sig)
1188 			return -ENOMEM;
1189 
1190 		sk_gso_disable(sk);
1191 		INIT_HLIST_HEAD(&md5sig->head);
1192 		rcu_assign_pointer(tp->md5sig_info, md5sig);
1193 	}
1194 
1195 	key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1196 	if (!key)
1197 		return -ENOMEM;
1198 	if (!tcp_alloc_md5sig_pool()) {
1199 		sock_kfree_s(sk, key, sizeof(*key));
1200 		return -ENOMEM;
1201 	}
1202 
1203 	memcpy(key->key, newkey, newkeylen);
1204 	key->keylen = newkeylen;
1205 	key->family = family;
1206 	key->prefixlen = prefixlen;
1207 	key->l3index = l3index;
1208 	key->flags = flags;
1209 	memcpy(&key->addr, addr,
1210 	       (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) :
1211 								 sizeof(struct in_addr));
1212 	hlist_add_head_rcu(&key->node, &md5sig->head);
1213 	return 0;
1214 }
1215 EXPORT_SYMBOL(tcp_md5_do_add);
1216 
1217 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1218 		   u8 prefixlen, int l3index, u8 flags)
1219 {
1220 	struct tcp_md5sig_key *key;
1221 
1222 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1223 	if (!key)
1224 		return -ENOENT;
1225 	hlist_del_rcu(&key->node);
1226 	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1227 	kfree_rcu(key, rcu);
1228 	return 0;
1229 }
1230 EXPORT_SYMBOL(tcp_md5_do_del);
1231 
1232 static void tcp_clear_md5_list(struct sock *sk)
1233 {
1234 	struct tcp_sock *tp = tcp_sk(sk);
1235 	struct tcp_md5sig_key *key;
1236 	struct hlist_node *n;
1237 	struct tcp_md5sig_info *md5sig;
1238 
1239 	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1240 
1241 	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1242 		hlist_del_rcu(&key->node);
1243 		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1244 		kfree_rcu(key, rcu);
1245 	}
1246 }
1247 
1248 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1249 				 sockptr_t optval, int optlen)
1250 {
1251 	struct tcp_md5sig cmd;
1252 	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1253 	const union tcp_md5_addr *addr;
1254 	u8 prefixlen = 32;
1255 	int l3index = 0;
1256 	u8 flags;
1257 
1258 	if (optlen < sizeof(cmd))
1259 		return -EINVAL;
1260 
1261 	if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1262 		return -EFAULT;
1263 
1264 	if (sin->sin_family != AF_INET)
1265 		return -EINVAL;
1266 
1267 	flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1268 
1269 	if (optname == TCP_MD5SIG_EXT &&
1270 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1271 		prefixlen = cmd.tcpm_prefixlen;
1272 		if (prefixlen > 32)
1273 			return -EINVAL;
1274 	}
1275 
1276 	if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
1277 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1278 		struct net_device *dev;
1279 
1280 		rcu_read_lock();
1281 		dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1282 		if (dev && netif_is_l3_master(dev))
1283 			l3index = dev->ifindex;
1284 
1285 		rcu_read_unlock();
1286 
1287 		/* ok to reference set/not set outside of rcu;
1288 		 * right now device MUST be an L3 master
1289 		 */
1290 		if (!dev || !l3index)
1291 			return -EINVAL;
1292 	}
1293 
1294 	addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1295 
1296 	if (!cmd.tcpm_keylen)
1297 		return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);
1298 
1299 	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1300 		return -EINVAL;
1301 
1302 	return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
1303 			      cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
1304 }
1305 
1306 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1307 				   __be32 daddr, __be32 saddr,
1308 				   const struct tcphdr *th, int nbytes)
1309 {
1310 	struct tcp4_pseudohdr *bp;
1311 	struct scatterlist sg;
1312 	struct tcphdr *_th;
1313 
1314 	bp = hp->scratch;
1315 	bp->saddr = saddr;
1316 	bp->daddr = daddr;
1317 	bp->pad = 0;
1318 	bp->protocol = IPPROTO_TCP;
1319 	bp->len = cpu_to_be16(nbytes);
1320 
1321 	_th = (struct tcphdr *)(bp + 1);
1322 	memcpy(_th, th, sizeof(*th));
1323 	_th->check = 0;
1324 
1325 	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1326 	ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1327 				sizeof(*bp) + sizeof(*th));
1328 	return crypto_ahash_update(hp->md5_req);
1329 }
1330 
1331 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1332 			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
1333 {
1334 	struct tcp_md5sig_pool *hp;
1335 	struct ahash_request *req;
1336 
1337 	hp = tcp_get_md5sig_pool();
1338 	if (!hp)
1339 		goto clear_hash_noput;
1340 	req = hp->md5_req;
1341 
1342 	if (crypto_ahash_init(req))
1343 		goto clear_hash;
1344 	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1345 		goto clear_hash;
1346 	if (tcp_md5_hash_key(hp, key))
1347 		goto clear_hash;
1348 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1349 	if (crypto_ahash_final(req))
1350 		goto clear_hash;
1351 
1352 	tcp_put_md5sig_pool();
1353 	return 0;
1354 
1355 clear_hash:
1356 	tcp_put_md5sig_pool();
1357 clear_hash_noput:
1358 	memset(md5_hash, 0, 16);
1359 	return 1;
1360 }
1361 
1362 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1363 			const struct sock *sk,
1364 			const struct sk_buff *skb)
1365 {
1366 	struct tcp_md5sig_pool *hp;
1367 	struct ahash_request *req;
1368 	const struct tcphdr *th = tcp_hdr(skb);
1369 	__be32 saddr, daddr;
1370 
1371 	if (sk) { /* valid for establish/request sockets */
1372 		saddr = sk->sk_rcv_saddr;
1373 		daddr = sk->sk_daddr;
1374 	} else {
1375 		const struct iphdr *iph = ip_hdr(skb);
1376 		saddr = iph->saddr;
1377 		daddr = iph->daddr;
1378 	}
1379 
1380 	hp = tcp_get_md5sig_pool();
1381 	if (!hp)
1382 		goto clear_hash_noput;
1383 	req = hp->md5_req;
1384 
1385 	if (crypto_ahash_init(req))
1386 		goto clear_hash;
1387 
1388 	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1389 		goto clear_hash;
1390 	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1391 		goto clear_hash;
1392 	if (tcp_md5_hash_key(hp, key))
1393 		goto clear_hash;
1394 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1395 	if (crypto_ahash_final(req))
1396 		goto clear_hash;
1397 
1398 	tcp_put_md5sig_pool();
1399 	return 0;
1400 
1401 clear_hash:
1402 	tcp_put_md5sig_pool();
1403 clear_hash_noput:
1404 	memset(md5_hash, 0, 16);
1405 	return 1;
1406 }
1407 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1408 
1409 #endif
1410 
1411 static void tcp_v4_init_req(struct request_sock *req,
1412 			    const struct sock *sk_listener,
1413 			    struct sk_buff *skb)
1414 {
1415 	struct inet_request_sock *ireq = inet_rsk(req);
1416 	struct net *net = sock_net(sk_listener);
1417 
1418 	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1419 	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1420 	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1421 }
1422 
1423 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1424 					  struct sk_buff *skb,
1425 					  struct flowi *fl,
1426 					  struct request_sock *req)
1427 {
1428 	tcp_v4_init_req(req, sk, skb);
1429 
1430 	if (security_inet_conn_request(sk, skb, req))
1431 		return NULL;
1432 
1433 	return inet_csk_route_req(sk, &fl->u.ip4, req);
1434 }
1435 
1436 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1437 	.family		=	PF_INET,
1438 	.obj_size	=	sizeof(struct tcp_request_sock),
1439 	.rtx_syn_ack	=	tcp_rtx_synack,
1440 	.send_ack	=	tcp_v4_reqsk_send_ack,
1441 	.destructor	=	tcp_v4_reqsk_destructor,
1442 	.send_reset	=	tcp_v4_send_reset,
1443 	.syn_ack_timeout =	tcp_syn_ack_timeout,
1444 };
1445 
1446 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1447 	.mss_clamp	=	TCP_MSS_DEFAULT,
1448 #ifdef CONFIG_TCP_MD5SIG
1449 	.req_md5_lookup	=	tcp_v4_md5_lookup,
1450 	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1451 #endif
1452 #ifdef CONFIG_SYN_COOKIES
1453 	.cookie_init_seq =	cookie_v4_init_sequence,
1454 #endif
1455 	.route_req	=	tcp_v4_route_req,
1456 	.init_seq	=	tcp_v4_init_seq,
1457 	.init_ts_off	=	tcp_v4_init_ts_off,
1458 	.send_synack	=	tcp_v4_send_synack,
1459 };
1460 
1461 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1462 {
1463 	/* Never answer to SYNs send to broadcast or multicast */
1464 	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1465 		goto drop;
1466 
1467 	return tcp_conn_request(&tcp_request_sock_ops,
1468 				&tcp_request_sock_ipv4_ops, sk, skb);
1469 
1470 drop:
1471 	tcp_listendrop(sk);
1472 	return 0;
1473 }
1474 EXPORT_SYMBOL(tcp_v4_conn_request);
1475 
1476 
1477 /*
1478  * The three way handshake has completed - we got a valid synack -
1479  * now create the new socket.
1480  */
1481 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1482 				  struct request_sock *req,
1483 				  struct dst_entry *dst,
1484 				  struct request_sock *req_unhash,
1485 				  bool *own_req)
1486 {
1487 	struct inet_request_sock *ireq;
1488 	bool found_dup_sk = false;
1489 	struct inet_sock *newinet;
1490 	struct tcp_sock *newtp;
1491 	struct sock *newsk;
1492 #ifdef CONFIG_TCP_MD5SIG
1493 	const union tcp_md5_addr *addr;
1494 	struct tcp_md5sig_key *key;
1495 	int l3index;
1496 #endif
1497 	struct ip_options_rcu *inet_opt;
1498 
1499 	if (sk_acceptq_is_full(sk))
1500 		goto exit_overflow;
1501 
1502 	newsk = tcp_create_openreq_child(sk, req, skb);
1503 	if (!newsk)
1504 		goto exit_nonewsk;
1505 
1506 	newsk->sk_gso_type = SKB_GSO_TCPV4;
1507 	inet_sk_rx_dst_set(newsk, skb);
1508 
1509 	newtp		      = tcp_sk(newsk);
1510 	newinet		      = inet_sk(newsk);
1511 	ireq		      = inet_rsk(req);
1512 	sk_daddr_set(newsk, ireq->ir_rmt_addr);
1513 	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1514 	newsk->sk_bound_dev_if = ireq->ir_iif;
1515 	newinet->inet_saddr   = ireq->ir_loc_addr;
1516 	inet_opt	      = rcu_dereference(ireq->ireq_opt);
1517 	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1518 	newinet->mc_index     = inet_iif(skb);
1519 	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1520 	newinet->rcv_tos      = ip_hdr(skb)->tos;
1521 	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1522 	if (inet_opt)
1523 		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1524 	newinet->inet_id = prandom_u32();
1525 
1526 	/* Set ToS of the new socket based upon the value of incoming SYN.
1527 	 * ECT bits are set later in tcp_init_transfer().
1528 	 */
1529 	if (sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)
1530 		newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1531 
1532 	if (!dst) {
1533 		dst = inet_csk_route_child_sock(sk, newsk, req);
1534 		if (!dst)
1535 			goto put_and_exit;
1536 	} else {
1537 		/* syncookie case : see end of cookie_v4_check() */
1538 	}
1539 	sk_setup_caps(newsk, dst);
1540 
1541 	tcp_ca_openreq_child(newsk, dst);
1542 
1543 	tcp_sync_mss(newsk, dst_mtu(dst));
1544 	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1545 
1546 	tcp_initialize_rcv_mss(newsk);
1547 
1548 #ifdef CONFIG_TCP_MD5SIG
1549 	l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1550 	/* Copy over the MD5 key from the original socket */
1551 	addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1552 	key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1553 	if (key) {
1554 		/*
1555 		 * We're using one, so create a matching key
1556 		 * on the newsk structure. If we fail to get
1557 		 * memory, then we end up not copying the key
1558 		 * across. Shucks.
1559 		 */
1560 		tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index, key->flags,
1561 			       key->key, key->keylen, GFP_ATOMIC);
1562 		sk_gso_disable(newsk);
1563 	}
1564 #endif
1565 
1566 	if (__inet_inherit_port(sk, newsk) < 0)
1567 		goto put_and_exit;
1568 	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1569 				       &found_dup_sk);
1570 	if (likely(*own_req)) {
1571 		tcp_move_syn(newtp, req);
1572 		ireq->ireq_opt = NULL;
1573 	} else {
1574 		newinet->inet_opt = NULL;
1575 
1576 		if (!req_unhash && found_dup_sk) {
1577 			/* This code path should only be executed in the
1578 			 * syncookie case only
1579 			 */
1580 			bh_unlock_sock(newsk);
1581 			sock_put(newsk);
1582 			newsk = NULL;
1583 		}
1584 	}
1585 	return newsk;
1586 
1587 exit_overflow:
1588 	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1589 exit_nonewsk:
1590 	dst_release(dst);
1591 exit:
1592 	tcp_listendrop(sk);
1593 	return NULL;
1594 put_and_exit:
1595 	newinet->inet_opt = NULL;
1596 	inet_csk_prepare_forced_close(newsk);
1597 	tcp_done(newsk);
1598 	goto exit;
1599 }
1600 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1601 
/*
 * With CONFIG_SYN_COOKIES, pass every segment without the SYN flag to
 * cookie_v4_check() and return its result; otherwise (or for SYNs)
 * return @sk unchanged.
 */
static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
{
#ifdef CONFIG_SYN_COOKIES
	if (!tcp_hdr(skb)->syn)
		return cookie_v4_check(sk, skb);
#endif
	return sk;
}
1612 
1613 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1614 			 struct tcphdr *th, u32 *cookie)
1615 {
1616 	u16 mss = 0;
1617 #ifdef CONFIG_SYN_COOKIES
1618 	mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1619 				    &tcp_request_sock_ipv4_ops, sk, th);
1620 	if (mss) {
1621 		*cookie = __cookie_v4_init_sequence(iph, th, &mss);
1622 		tcp_synq_overflow(sk);
1623 	}
1624 #endif
1625 	return mss;
1626 }
1627 
1628 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
1629 							   u32));
1630 /* The socket must have it's spinlock held when we get
1631  * here, unless it is a TCP_LISTEN socket.
1632  *
1633  * We have a potential double-lock case here, so even when
1634  * doing backlog processing we use the BH locking scheme.
1635  * This is because we cannot sleep with the original spinlock
1636  * held.
1637  */
1638 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1639 {
1640 	enum skb_drop_reason reason;
1641 	struct sock *rsk;
1642 
1643 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1644 		struct dst_entry *dst;
1645 
1646 		dst = rcu_dereference_protected(sk->sk_rx_dst,
1647 						lockdep_sock_is_held(sk));
1648 
1649 		sock_rps_save_rxhash(sk, skb);
1650 		sk_mark_napi_id(sk, skb);
1651 		if (dst) {
1652 			if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
1653 			    !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
1654 					     dst, 0)) {
1655 				RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
1656 				dst_release(dst);
1657 			}
1658 		}
1659 		tcp_rcv_established(sk, skb);
1660 		return 0;
1661 	}
1662 
1663 	reason = SKB_DROP_REASON_NOT_SPECIFIED;
1664 	if (tcp_checksum_complete(skb))
1665 		goto csum_err;
1666 
1667 	if (sk->sk_state == TCP_LISTEN) {
1668 		struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1669 
1670 		if (!nsk)
1671 			goto discard;
1672 		if (nsk != sk) {
1673 			if (tcp_child_process(sk, nsk, skb)) {
1674 				rsk = nsk;
1675 				goto reset;
1676 			}
1677 			return 0;
1678 		}
1679 	} else
1680 		sock_rps_save_rxhash(sk, skb);
1681 
1682 	if (tcp_rcv_state_process(sk, skb)) {
1683 		rsk = sk;
1684 		goto reset;
1685 	}
1686 	return 0;
1687 
1688 reset:
1689 	tcp_v4_send_reset(rsk, skb);
1690 discard:
1691 	kfree_skb_reason(skb, reason);
1692 	/* Be careful here. If this function gets more complicated and
1693 	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1694 	 * might be destroyed here. This current version compiles correctly,
1695 	 * but you have been warned.
1696 	 */
1697 	return 0;
1698 
1699 csum_err:
1700 	reason = SKB_DROP_REASON_TCP_CSUM;
1701 	trace_tcp_bad_csum(skb);
1702 	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1703 	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1704 	goto discard;
1705 }
1706 EXPORT_SYMBOL(tcp_v4_do_rcv);
1707 
1708 int tcp_v4_early_demux(struct sk_buff *skb)
1709 {
1710 	const struct iphdr *iph;
1711 	const struct tcphdr *th;
1712 	struct sock *sk;
1713 
1714 	if (skb->pkt_type != PACKET_HOST)
1715 		return 0;
1716 
1717 	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1718 		return 0;
1719 
1720 	iph = ip_hdr(skb);
1721 	th = tcp_hdr(skb);
1722 
1723 	if (th->doff < sizeof(struct tcphdr) / 4)
1724 		return 0;
1725 
1726 	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1727 				       iph->saddr, th->source,
1728 				       iph->daddr, ntohs(th->dest),
1729 				       skb->skb_iif, inet_sdif(skb));
1730 	if (sk) {
1731 		skb->sk = sk;
1732 		skb->destructor = sock_edemux;
1733 		if (sk_fullsock(sk)) {
1734 			struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
1735 
1736 			if (dst)
1737 				dst = dst_check(dst, 0);
1738 			if (dst &&
1739 			    sk->sk_rx_dst_ifindex == skb->skb_iif)
1740 				skb_dst_set_noref(skb, dst);
1741 		}
1742 	}
1743 	return 0;
1744 }
1745 
1746 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
1747 		     enum skb_drop_reason *reason)
1748 {
1749 	u32 limit, tail_gso_size, tail_gso_segs;
1750 	struct skb_shared_info *shinfo;
1751 	const struct tcphdr *th;
1752 	struct tcphdr *thtail;
1753 	struct sk_buff *tail;
1754 	unsigned int hdrlen;
1755 	bool fragstolen;
1756 	u32 gso_segs;
1757 	u32 gso_size;
1758 	int delta;
1759 
1760 	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1761 	 * we can fix skb->truesize to its real value to avoid future drops.
1762 	 * This is valid because skb is not yet charged to the socket.
1763 	 * It has been noticed pure SACK packets were sometimes dropped
1764 	 * (if cooked by drivers without copybreak feature).
1765 	 */
1766 	skb_condense(skb);
1767 
1768 	skb_dst_drop(skb);
1769 
1770 	if (unlikely(tcp_checksum_complete(skb))) {
1771 		bh_unlock_sock(sk);
1772 		trace_tcp_bad_csum(skb);
1773 		*reason = SKB_DROP_REASON_TCP_CSUM;
1774 		__TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1775 		__TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1776 		return true;
1777 	}
1778 
1779 	/* Attempt coalescing to last skb in backlog, even if we are
1780 	 * above the limits.
1781 	 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1782 	 */
1783 	th = (const struct tcphdr *)skb->data;
1784 	hdrlen = th->doff * 4;
1785 
1786 	tail = sk->sk_backlog.tail;
1787 	if (!tail)
1788 		goto no_coalesce;
1789 	thtail = (struct tcphdr *)tail->data;
1790 
1791 	if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1792 	    TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1793 	    ((TCP_SKB_CB(tail)->tcp_flags |
1794 	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1795 	    !((TCP_SKB_CB(tail)->tcp_flags &
1796 	      TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1797 	    ((TCP_SKB_CB(tail)->tcp_flags ^
1798 	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1799 #ifdef CONFIG_TLS_DEVICE
1800 	    tail->decrypted != skb->decrypted ||
1801 #endif
1802 	    thtail->doff != th->doff ||
1803 	    memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1804 		goto no_coalesce;
1805 
1806 	__skb_pull(skb, hdrlen);
1807 
1808 	shinfo = skb_shinfo(skb);
1809 	gso_size = shinfo->gso_size ?: skb->len;
1810 	gso_segs = shinfo->gso_segs ?: 1;
1811 
1812 	shinfo = skb_shinfo(tail);
1813 	tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
1814 	tail_gso_segs = shinfo->gso_segs ?: 1;
1815 
1816 	if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1817 		TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1818 
1819 		if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
1820 			TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1821 			thtail->window = th->window;
1822 		}
1823 
1824 		/* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1825 		 * thtail->fin, so that the fast path in tcp_rcv_established()
1826 		 * is not entered if we append a packet with a FIN.
1827 		 * SYN, RST, URG are not present.
1828 		 * ACK is set on both packets.
1829 		 * PSH : we do not really care in TCP stack,
1830 		 *       at least for 'GRO' packets.
1831 		 */
1832 		thtail->fin |= th->fin;
1833 		TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1834 
1835 		if (TCP_SKB_CB(skb)->has_rxtstamp) {
1836 			TCP_SKB_CB(tail)->has_rxtstamp = true;
1837 			tail->tstamp = skb->tstamp;
1838 			skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1839 		}
1840 
1841 		/* Not as strict as GRO. We only need to carry mss max value */
1842 		shinfo->gso_size = max(gso_size, tail_gso_size);
1843 		shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
1844 
1845 		sk->sk_backlog.len += delta;
1846 		__NET_INC_STATS(sock_net(sk),
1847 				LINUX_MIB_TCPBACKLOGCOALESCE);
1848 		kfree_skb_partial(skb, fragstolen);
1849 		return false;
1850 	}
1851 	__skb_push(skb, hdrlen);
1852 
1853 no_coalesce:
1854 	/* Only socket owner can try to collapse/prune rx queues
1855 	 * to reduce memory overhead, so add a little headroom here.
1856 	 * Few sockets backlog are possibly concurrently non empty.
1857 	 */
1858 	limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf) + 64*1024;
1859 
1860 	if (unlikely(sk_add_backlog(sk, skb, limit))) {
1861 		bh_unlock_sock(sk);
1862 		*reason = SKB_DROP_REASON_SOCKET_BACKLOG;
1863 		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1864 		return true;
1865 	}
1866 	return false;
1867 }
1868 EXPORT_SYMBOL(tcp_add_backlog);
1869 
1870 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1871 {
1872 	struct tcphdr *th = (struct tcphdr *)skb->data;
1873 
1874 	return sk_filter_trim_cap(sk, skb, th->doff * 4);
1875 }
1876 EXPORT_SYMBOL(tcp_filter);
1877 
1878 static void tcp_v4_restore_cb(struct sk_buff *skb)
1879 {
1880 	memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1881 		sizeof(struct inet_skb_parm));
1882 }
1883 
1884 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1885 			   const struct tcphdr *th)
1886 {
1887 	/* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
1888 	 * barrier() makes sure compiler wont play fool^Waliasing games.
1889 	 */
1890 	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1891 		sizeof(struct inet_skb_parm));
1892 	barrier();
1893 
1894 	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1895 	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1896 				    skb->len - th->doff * 4);
1897 	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1898 	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1899 	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1900 	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1901 	TCP_SKB_CB(skb)->sacked	 = 0;
1902 	TCP_SKB_CB(skb)->has_rxtstamp =
1903 			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1904 }
1905 
1906 /*
1907  *	From tcp_input.c
1908  */
1909 
1910 int tcp_v4_rcv(struct sk_buff *skb)
1911 {
1912 	struct net *net = dev_net(skb->dev);
1913 	enum skb_drop_reason drop_reason;
1914 	int sdif = inet_sdif(skb);
1915 	int dif = inet_iif(skb);
1916 	const struct iphdr *iph;
1917 	const struct tcphdr *th;
1918 	bool refcounted;
1919 	struct sock *sk;
1920 	int ret;
1921 
1922 	drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
1923 	if (skb->pkt_type != PACKET_HOST)
1924 		goto discard_it;
1925 
1926 	/* Count it even if it's bad */
1927 	__TCP_INC_STATS(net, TCP_MIB_INSEGS);
1928 
1929 	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1930 		goto discard_it;
1931 
1932 	th = (const struct tcphdr *)skb->data;
1933 
1934 	if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
1935 		drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
1936 		goto bad_packet;
1937 	}
1938 	if (!pskb_may_pull(skb, th->doff * 4))
1939 		goto discard_it;
1940 
1941 	/* An explanation is required here, I think.
1942 	 * Packet length and doff are validated by header prediction,
1943 	 * provided case of th->doff==0 is eliminated.
1944 	 * So, we defer the checks. */
1945 
1946 	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1947 		goto csum_error;
1948 
1949 	th = (const struct tcphdr *)skb->data;
1950 	iph = ip_hdr(skb);
1951 lookup:
1952 	sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1953 			       th->dest, sdif, &refcounted);
1954 	if (!sk)
1955 		goto no_tcp_socket;
1956 
1957 process:
1958 	if (sk->sk_state == TCP_TIME_WAIT)
1959 		goto do_time_wait;
1960 
1961 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
1962 		struct request_sock *req = inet_reqsk(sk);
1963 		bool req_stolen = false;
1964 		struct sock *nsk;
1965 
1966 		sk = req->rsk_listener;
1967 		if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1968 			drop_reason = SKB_DROP_REASON_XFRM_POLICY;
1969 		else
1970 			drop_reason = tcp_inbound_md5_hash(sk, skb,
1971 						   &iph->saddr, &iph->daddr,
1972 						   AF_INET, dif, sdif);
1973 		if (unlikely(drop_reason)) {
1974 			sk_drops_add(sk, skb);
1975 			reqsk_put(req);
1976 			goto discard_it;
1977 		}
1978 		if (tcp_checksum_complete(skb)) {
1979 			reqsk_put(req);
1980 			goto csum_error;
1981 		}
1982 		if (unlikely(sk->sk_state != TCP_LISTEN)) {
1983 			nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
1984 			if (!nsk) {
1985 				inet_csk_reqsk_queue_drop_and_put(sk, req);
1986 				goto lookup;
1987 			}
1988 			sk = nsk;
1989 			/* reuseport_migrate_sock() has already held one sk_refcnt
1990 			 * before returning.
1991 			 */
1992 		} else {
1993 			/* We own a reference on the listener, increase it again
1994 			 * as we might lose it too soon.
1995 			 */
1996 			sock_hold(sk);
1997 		}
1998 		refcounted = true;
1999 		nsk = NULL;
2000 		if (!tcp_filter(sk, skb)) {
2001 			th = (const struct tcphdr *)skb->data;
2002 			iph = ip_hdr(skb);
2003 			tcp_v4_fill_cb(skb, iph, th);
2004 			nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
2005 		} else {
2006 			drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2007 		}
2008 		if (!nsk) {
2009 			reqsk_put(req);
2010 			if (req_stolen) {
2011 				/* Another cpu got exclusive access to req
2012 				 * and created a full blown socket.
2013 				 * Try to feed this packet to this socket
2014 				 * instead of discarding it.
2015 				 */
2016 				tcp_v4_restore_cb(skb);
2017 				sock_put(sk);
2018 				goto lookup;
2019 			}
2020 			goto discard_and_relse;
2021 		}
2022 		nf_reset_ct(skb);
2023 		if (nsk == sk) {
2024 			reqsk_put(req);
2025 			tcp_v4_restore_cb(skb);
2026 		} else if (tcp_child_process(sk, nsk, skb)) {
2027 			tcp_v4_send_reset(nsk, skb);
2028 			goto discard_and_relse;
2029 		} else {
2030 			sock_put(sk);
2031 			return 0;
2032 		}
2033 	}
2034 
2035 	if (static_branch_unlikely(&ip4_min_ttl)) {
2036 		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
2037 		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
2038 			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2039 			goto discard_and_relse;
2040 		}
2041 	}
2042 
2043 	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
2044 		drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2045 		goto discard_and_relse;
2046 	}
2047 
2048 	drop_reason = tcp_inbound_md5_hash(sk, skb, &iph->saddr,
2049 					   &iph->daddr, AF_INET, dif, sdif);
2050 	if (drop_reason)
2051 		goto discard_and_relse;
2052 
2053 	nf_reset_ct(skb);
2054 
2055 	if (tcp_filter(sk, skb)) {
2056 		drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2057 		goto discard_and_relse;
2058 	}
2059 	th = (const struct tcphdr *)skb->data;
2060 	iph = ip_hdr(skb);
2061 	tcp_v4_fill_cb(skb, iph, th);
2062 
2063 	skb->dev = NULL;
2064 
2065 	if (sk->sk_state == TCP_LISTEN) {
2066 		ret = tcp_v4_do_rcv(sk, skb);
2067 		goto put_and_return;
2068 	}
2069 
2070 	sk_incoming_cpu_update(sk);
2071 
2072 	bh_lock_sock_nested(sk);
2073 	tcp_segs_in(tcp_sk(sk), skb);
2074 	ret = 0;
2075 	if (!sock_owned_by_user(sk)) {
2076 		ret = tcp_v4_do_rcv(sk, skb);
2077 	} else {
2078 		if (tcp_add_backlog(sk, skb, &drop_reason))
2079 			goto discard_and_relse;
2080 	}
2081 	bh_unlock_sock(sk);
2082 
2083 put_and_return:
2084 	if (refcounted)
2085 		sock_put(sk);
2086 
2087 	return ret;
2088 
2089 no_tcp_socket:
2090 	drop_reason = SKB_DROP_REASON_NO_SOCKET;
2091 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2092 		goto discard_it;
2093 
2094 	tcp_v4_fill_cb(skb, iph, th);
2095 
2096 	if (tcp_checksum_complete(skb)) {
2097 csum_error:
2098 		drop_reason = SKB_DROP_REASON_TCP_CSUM;
2099 		trace_tcp_bad_csum(skb);
2100 		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2101 bad_packet:
2102 		__TCP_INC_STATS(net, TCP_MIB_INERRS);
2103 	} else {
2104 		tcp_v4_send_reset(NULL, skb);
2105 	}
2106 
2107 discard_it:
2108 	SKB_DR_OR(drop_reason, NOT_SPECIFIED);
2109 	/* Discard frame. */
2110 	kfree_skb_reason(skb, drop_reason);
2111 	return 0;
2112 
2113 discard_and_relse:
2114 	sk_drops_add(sk, skb);
2115 	if (refcounted)
2116 		sock_put(sk);
2117 	goto discard_it;
2118 
2119 do_time_wait:
2120 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2121 		drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2122 		inet_twsk_put(inet_twsk(sk));
2123 		goto discard_it;
2124 	}
2125 
2126 	tcp_v4_fill_cb(skb, iph, th);
2127 
2128 	if (tcp_checksum_complete(skb)) {
2129 		inet_twsk_put(inet_twsk(sk));
2130 		goto csum_error;
2131 	}
2132 	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2133 	case TCP_TW_SYN: {
2134 		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
2135 							&tcp_hashinfo, skb,
2136 							__tcp_hdrlen(th),
2137 							iph->saddr, th->source,
2138 							iph->daddr, th->dest,
2139 							inet_iif(skb),
2140 							sdif);
2141 		if (sk2) {
2142 			inet_twsk_deschedule_put(inet_twsk(sk));
2143 			sk = sk2;
2144 			tcp_v4_restore_cb(skb);
2145 			refcounted = false;
2146 			goto process;
2147 		}
2148 	}
2149 		/* to ACK */
2150 		fallthrough;
2151 	case TCP_TW_ACK:
2152 		tcp_v4_timewait_ack(sk, skb);
2153 		break;
2154 	case TCP_TW_RST:
2155 		tcp_v4_send_reset(sk, skb);
2156 		inet_twsk_deschedule_put(inet_twsk(sk));
2157 		goto discard_it;
2158 	case TCP_TW_SUCCESS:;
2159 	}
2160 	goto discard_it;
2161 }
2162 
2163 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2164 	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
2165 	.twsk_unique	= tcp_twsk_unique,
2166 	.twsk_destructor= tcp_twsk_destructor,
2167 };
2168 
2169 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2170 {
2171 	struct dst_entry *dst = skb_dst(skb);
2172 
2173 	if (dst && dst_hold_safe(dst)) {
2174 		rcu_assign_pointer(sk->sk_rx_dst, dst);
2175 		sk->sk_rx_dst_ifindex = skb->skb_iif;
2176 	}
2177 }
2178 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2179 
2180 const struct inet_connection_sock_af_ops ipv4_specific = {
2181 	.queue_xmit	   = ip_queue_xmit,
2182 	.send_check	   = tcp_v4_send_check,
2183 	.rebuild_header	   = inet_sk_rebuild_header,
2184 	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
2185 	.conn_request	   = tcp_v4_conn_request,
2186 	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
2187 	.net_header_len	   = sizeof(struct iphdr),
2188 	.setsockopt	   = ip_setsockopt,
2189 	.getsockopt	   = ip_getsockopt,
2190 	.addr2sockaddr	   = inet_csk_addr2sockaddr,
2191 	.sockaddr_len	   = sizeof(struct sockaddr_in),
2192 	.mtu_reduced	   = tcp_v4_mtu_reduced,
2193 };
2194 EXPORT_SYMBOL(ipv4_specific);
2195 
2196 #ifdef CONFIG_TCP_MD5SIG
2197 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2198 	.md5_lookup		= tcp_v4_md5_lookup,
2199 	.calc_md5_hash		= tcp_v4_md5_hash_skb,
2200 	.md5_parse		= tcp_v4_parse_md5_keys,
2201 };
2202 #endif
2203 
2204 /* NOTE: A lot of things set to zero explicitly by call to
2205  *       sk_alloc() so need not be done here.
2206  */
2207 static int tcp_v4_init_sock(struct sock *sk)
2208 {
2209 	struct inet_connection_sock *icsk = inet_csk(sk);
2210 
2211 	tcp_init_sock(sk);
2212 
2213 	icsk->icsk_af_ops = &ipv4_specific;
2214 
2215 #ifdef CONFIG_TCP_MD5SIG
2216 	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2217 #endif
2218 
2219 	return 0;
2220 }
2221 
2222 void tcp_v4_destroy_sock(struct sock *sk)
2223 {
2224 	struct tcp_sock *tp = tcp_sk(sk);
2225 
2226 	trace_tcp_destroy_sock(sk);
2227 
2228 	tcp_clear_xmit_timers(sk);
2229 
2230 	tcp_cleanup_congestion_control(sk);
2231 
2232 	tcp_cleanup_ulp(sk);
2233 
2234 	/* Cleanup up the write buffer. */
2235 	tcp_write_queue_purge(sk);
2236 
2237 	/* Check if we want to disable active TFO */
2238 	tcp_fastopen_active_disable_ofo_check(sk);
2239 
2240 	/* Cleans up our, hopefully empty, out_of_order_queue. */
2241 	skb_rbtree_purge(&tp->out_of_order_queue);
2242 
2243 #ifdef CONFIG_TCP_MD5SIG
2244 	/* Clean up the MD5 key list, if any */
2245 	if (tp->md5sig_info) {
2246 		tcp_clear_md5_list(sk);
2247 		kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2248 		tp->md5sig_info = NULL;
2249 	}
2250 #endif
2251 
2252 	/* Clean up a referenced TCP bind bucket. */
2253 	if (inet_csk(sk)->icsk_bind_hash)
2254 		inet_put_port(sk);
2255 
2256 	BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2257 
2258 	/* If socket is aborted during connect operation */
2259 	tcp_free_fastopen_req(tp);
2260 	tcp_fastopen_destroy_cipher(sk);
2261 	tcp_saved_syn_free(tp);
2262 
2263 	sk_sockets_allocated_dec(sk);
2264 }
2265 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2266 
2267 #ifdef CONFIG_PROC_FS
2268 /* Proc filesystem TCP sock list dumping. */
2269 
2270 static unsigned short seq_file_family(const struct seq_file *seq);
2271 
2272 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2273 {
2274 	unsigned short family = seq_file_family(seq);
2275 
2276 	/* AF_UNSPEC is used as a match all */
2277 	return ((family == AF_UNSPEC || family == sk->sk_family) &&
2278 		net_eq(sock_net(sk), seq_file_net(seq)));
2279 }
2280 
2281 /* Find a non empty bucket (starting from st->bucket)
2282  * and return the first sk from it.
2283  */
2284 static void *listening_get_first(struct seq_file *seq)
2285 {
2286 	struct tcp_iter_state *st = seq->private;
2287 
2288 	st->offset = 0;
2289 	for (; st->bucket <= tcp_hashinfo.lhash2_mask; st->bucket++) {
2290 		struct inet_listen_hashbucket *ilb2;
2291 		struct hlist_nulls_node *node;
2292 		struct sock *sk;
2293 
2294 		ilb2 = &tcp_hashinfo.lhash2[st->bucket];
2295 		if (hlist_nulls_empty(&ilb2->nulls_head))
2296 			continue;
2297 
2298 		spin_lock(&ilb2->lock);
2299 		sk_nulls_for_each(sk, node, &ilb2->nulls_head) {
2300 			if (seq_sk_match(seq, sk))
2301 				return sk;
2302 		}
2303 		spin_unlock(&ilb2->lock);
2304 	}
2305 
2306 	return NULL;
2307 }
2308 
2309 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
2310  * If "cur" is the last one in the st->bucket,
2311  * call listening_get_first() to return the first sk of the next
2312  * non empty bucket.
2313  */
2314 static void *listening_get_next(struct seq_file *seq, void *cur)
2315 {
2316 	struct tcp_iter_state *st = seq->private;
2317 	struct inet_listen_hashbucket *ilb2;
2318 	struct hlist_nulls_node *node;
2319 	struct sock *sk = cur;
2320 
2321 	++st->num;
2322 	++st->offset;
2323 
2324 	sk = sk_nulls_next(sk);
2325 	sk_nulls_for_each_from(sk, node) {
2326 		if (seq_sk_match(seq, sk))
2327 			return sk;
2328 	}
2329 
2330 	ilb2 = &tcp_hashinfo.lhash2[st->bucket];
2331 	spin_unlock(&ilb2->lock);
2332 	++st->bucket;
2333 	return listening_get_first(seq);
2334 }
2335 
2336 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2337 {
2338 	struct tcp_iter_state *st = seq->private;
2339 	void *rc;
2340 
2341 	st->bucket = 0;
2342 	st->offset = 0;
2343 	rc = listening_get_first(seq);
2344 
2345 	while (rc && *pos) {
2346 		rc = listening_get_next(seq, rc);
2347 		--*pos;
2348 	}
2349 	return rc;
2350 }
2351 
2352 static inline bool empty_bucket(const struct tcp_iter_state *st)
2353 {
2354 	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2355 }
2356 
2357 /*
2358  * Get first established socket starting from bucket given in st->bucket.
2359  * If st->bucket is zero, the very first socket in the hash is returned.
2360  */
2361 static void *established_get_first(struct seq_file *seq)
2362 {
2363 	struct tcp_iter_state *st = seq->private;
2364 
2365 	st->offset = 0;
2366 	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2367 		struct sock *sk;
2368 		struct hlist_nulls_node *node;
2369 		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2370 
2371 		/* Lockless fast path for the common case of empty buckets */
2372 		if (empty_bucket(st))
2373 			continue;
2374 
2375 		spin_lock_bh(lock);
2376 		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2377 			if (seq_sk_match(seq, sk))
2378 				return sk;
2379 		}
2380 		spin_unlock_bh(lock);
2381 	}
2382 
2383 	return NULL;
2384 }
2385 
2386 static void *established_get_next(struct seq_file *seq, void *cur)
2387 {
2388 	struct sock *sk = cur;
2389 	struct hlist_nulls_node *node;
2390 	struct tcp_iter_state *st = seq->private;
2391 
2392 	++st->num;
2393 	++st->offset;
2394 
2395 	sk = sk_nulls_next(sk);
2396 
2397 	sk_nulls_for_each_from(sk, node) {
2398 		if (seq_sk_match(seq, sk))
2399 			return sk;
2400 	}
2401 
2402 	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2403 	++st->bucket;
2404 	return established_get_first(seq);
2405 }
2406 
2407 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2408 {
2409 	struct tcp_iter_state *st = seq->private;
2410 	void *rc;
2411 
2412 	st->bucket = 0;
2413 	rc = established_get_first(seq);
2414 
2415 	while (rc && pos) {
2416 		rc = established_get_next(seq, rc);
2417 		--pos;
2418 	}
2419 	return rc;
2420 }
2421 
2422 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2423 {
2424 	void *rc;
2425 	struct tcp_iter_state *st = seq->private;
2426 
2427 	st->state = TCP_SEQ_STATE_LISTENING;
2428 	rc	  = listening_get_idx(seq, &pos);
2429 
2430 	if (!rc) {
2431 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2432 		rc	  = established_get_idx(seq, pos);
2433 	}
2434 
2435 	return rc;
2436 }
2437 
2438 static void *tcp_seek_last_pos(struct seq_file *seq)
2439 {
2440 	struct tcp_iter_state *st = seq->private;
2441 	int bucket = st->bucket;
2442 	int offset = st->offset;
2443 	int orig_num = st->num;
2444 	void *rc = NULL;
2445 
2446 	switch (st->state) {
2447 	case TCP_SEQ_STATE_LISTENING:
2448 		if (st->bucket > tcp_hashinfo.lhash2_mask)
2449 			break;
2450 		st->state = TCP_SEQ_STATE_LISTENING;
2451 		rc = listening_get_first(seq);
2452 		while (offset-- && rc && bucket == st->bucket)
2453 			rc = listening_get_next(seq, rc);
2454 		if (rc)
2455 			break;
2456 		st->bucket = 0;
2457 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2458 		fallthrough;
2459 	case TCP_SEQ_STATE_ESTABLISHED:
2460 		if (st->bucket > tcp_hashinfo.ehash_mask)
2461 			break;
2462 		rc = established_get_first(seq);
2463 		while (offset-- && rc && bucket == st->bucket)
2464 			rc = established_get_next(seq, rc);
2465 	}
2466 
2467 	st->num = orig_num;
2468 
2469 	return rc;
2470 }
2471 
2472 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2473 {
2474 	struct tcp_iter_state *st = seq->private;
2475 	void *rc;
2476 
2477 	if (*pos && *pos == st->last_pos) {
2478 		rc = tcp_seek_last_pos(seq);
2479 		if (rc)
2480 			goto out;
2481 	}
2482 
2483 	st->state = TCP_SEQ_STATE_LISTENING;
2484 	st->num = 0;
2485 	st->bucket = 0;
2486 	st->offset = 0;
2487 	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2488 
2489 out:
2490 	st->last_pos = *pos;
2491 	return rc;
2492 }
2493 EXPORT_SYMBOL(tcp_seq_start);
2494 
2495 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2496 {
2497 	struct tcp_iter_state *st = seq->private;
2498 	void *rc = NULL;
2499 
2500 	if (v == SEQ_START_TOKEN) {
2501 		rc = tcp_get_idx(seq, 0);
2502 		goto out;
2503 	}
2504 
2505 	switch (st->state) {
2506 	case TCP_SEQ_STATE_LISTENING:
2507 		rc = listening_get_next(seq, v);
2508 		if (!rc) {
2509 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2510 			st->bucket = 0;
2511 			st->offset = 0;
2512 			rc	  = established_get_first(seq);
2513 		}
2514 		break;
2515 	case TCP_SEQ_STATE_ESTABLISHED:
2516 		rc = established_get_next(seq, v);
2517 		break;
2518 	}
2519 out:
2520 	++*pos;
2521 	st->last_pos = *pos;
2522 	return rc;
2523 }
2524 EXPORT_SYMBOL(tcp_seq_next);
2525 
2526 void tcp_seq_stop(struct seq_file *seq, void *v)
2527 {
2528 	struct tcp_iter_state *st = seq->private;
2529 
2530 	switch (st->state) {
2531 	case TCP_SEQ_STATE_LISTENING:
2532 		if (v != SEQ_START_TOKEN)
2533 			spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock);
2534 		break;
2535 	case TCP_SEQ_STATE_ESTABLISHED:
2536 		if (v)
2537 			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2538 		break;
2539 	}
2540 }
2541 EXPORT_SYMBOL(tcp_seq_stop);
2542 
2543 static void get_openreq4(const struct request_sock *req,
2544 			 struct seq_file *f, int i)
2545 {
2546 	const struct inet_request_sock *ireq = inet_rsk(req);
2547 	long delta = req->rsk_timer.expires - jiffies;
2548 
2549 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2550 		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2551 		i,
2552 		ireq->ir_loc_addr,
2553 		ireq->ir_num,
2554 		ireq->ir_rmt_addr,
2555 		ntohs(ireq->ir_rmt_port),
2556 		TCP_SYN_RECV,
2557 		0, 0, /* could print option size, but that is af dependent. */
2558 		1,    /* timers active (only the expire timer) */
2559 		jiffies_delta_to_clock_t(delta),
2560 		req->num_timeout,
2561 		from_kuid_munged(seq_user_ns(f),
2562 				 sock_i_uid(req->rsk_listener)),
2563 		0,  /* non standard timer */
2564 		0, /* open_requests have no inode */
2565 		0,
2566 		req);
2567 }
2568 
2569 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2570 {
2571 	int timer_active;
2572 	unsigned long timer_expires;
2573 	const struct tcp_sock *tp = tcp_sk(sk);
2574 	const struct inet_connection_sock *icsk = inet_csk(sk);
2575 	const struct inet_sock *inet = inet_sk(sk);
2576 	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2577 	__be32 dest = inet->inet_daddr;
2578 	__be32 src = inet->inet_rcv_saddr;
2579 	__u16 destp = ntohs(inet->inet_dport);
2580 	__u16 srcp = ntohs(inet->inet_sport);
2581 	int rx_queue;
2582 	int state;
2583 
2584 	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2585 	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2586 	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2587 		timer_active	= 1;
2588 		timer_expires	= icsk->icsk_timeout;
2589 	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2590 		timer_active	= 4;
2591 		timer_expires	= icsk->icsk_timeout;
2592 	} else if (timer_pending(&sk->sk_timer)) {
2593 		timer_active	= 2;
2594 		timer_expires	= sk->sk_timer.expires;
2595 	} else {
2596 		timer_active	= 0;
2597 		timer_expires = jiffies;
2598 	}
2599 
2600 	state = inet_sk_state_load(sk);
2601 	if (state == TCP_LISTEN)
2602 		rx_queue = READ_ONCE(sk->sk_ack_backlog);
2603 	else
2604 		/* Because we don't lock the socket,
2605 		 * we might find a transient negative value.
2606 		 */
2607 		rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2608 				      READ_ONCE(tp->copied_seq), 0);
2609 
2610 	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2611 			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2612 		i, src, srcp, dest, destp, state,
2613 		READ_ONCE(tp->write_seq) - tp->snd_una,
2614 		rx_queue,
2615 		timer_active,
2616 		jiffies_delta_to_clock_t(timer_expires - jiffies),
2617 		icsk->icsk_retransmits,
2618 		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2619 		icsk->icsk_probes_out,
2620 		sock_i_ino(sk),
2621 		refcount_read(&sk->sk_refcnt), sk,
2622 		jiffies_to_clock_t(icsk->icsk_rto),
2623 		jiffies_to_clock_t(icsk->icsk_ack.ato),
2624 		(icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2625 		tcp_snd_cwnd(tp),
2626 		state == TCP_LISTEN ?
2627 		    fastopenq->max_qlen :
2628 		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2629 }
2630 
2631 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2632 			       struct seq_file *f, int i)
2633 {
2634 	long delta = tw->tw_timer.expires - jiffies;
2635 	__be32 dest, src;
2636 	__u16 destp, srcp;
2637 
2638 	dest  = tw->tw_daddr;
2639 	src   = tw->tw_rcv_saddr;
2640 	destp = ntohs(tw->tw_dport);
2641 	srcp  = ntohs(tw->tw_sport);
2642 
2643 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2644 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2645 		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2646 		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2647 		refcount_read(&tw->tw_refcnt), tw);
2648 }
2649 
2650 #define TMPSZ 150
2651 
2652 static int tcp4_seq_show(struct seq_file *seq, void *v)
2653 {
2654 	struct tcp_iter_state *st;
2655 	struct sock *sk = v;
2656 
2657 	seq_setwidth(seq, TMPSZ - 1);
2658 	if (v == SEQ_START_TOKEN) {
2659 		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2660 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2661 			   "inode");
2662 		goto out;
2663 	}
2664 	st = seq->private;
2665 
2666 	if (sk->sk_state == TCP_TIME_WAIT)
2667 		get_timewait4_sock(v, seq, st->num);
2668 	else if (sk->sk_state == TCP_NEW_SYN_RECV)
2669 		get_openreq4(v, seq, st->num);
2670 	else
2671 		get_tcp4_sock(v, seq, st->num);
2672 out:
2673 	seq_pad(seq, '\n');
2674 	return 0;
2675 }
2676 
2677 #ifdef CONFIG_BPF_SYSCALL
2678 struct bpf_tcp_iter_state {
2679 	struct tcp_iter_state state;
2680 	unsigned int cur_sk;
2681 	unsigned int end_sk;
2682 	unsigned int max_sk;
2683 	struct sock **batch;
2684 	bool st_bucket_done;
2685 };
2686 
2687 struct bpf_iter__tcp {
2688 	__bpf_md_ptr(struct bpf_iter_meta *, meta);
2689 	__bpf_md_ptr(struct sock_common *, sk_common);
2690 	uid_t uid __aligned(8);
2691 };
2692 
2693 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
2694 			     struct sock_common *sk_common, uid_t uid)
2695 {
2696 	struct bpf_iter__tcp ctx;
2697 
2698 	meta->seq_num--;  /* skip SEQ_START_TOKEN */
2699 	ctx.meta = meta;
2700 	ctx.sk_common = sk_common;
2701 	ctx.uid = uid;
2702 	return bpf_iter_run_prog(prog, &ctx);
2703 }
2704 
2705 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
2706 {
2707 	while (iter->cur_sk < iter->end_sk)
2708 		sock_put(iter->batch[iter->cur_sk++]);
2709 }
2710 
2711 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
2712 				      unsigned int new_batch_sz)
2713 {
2714 	struct sock **new_batch;
2715 
2716 	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
2717 			     GFP_USER | __GFP_NOWARN);
2718 	if (!new_batch)
2719 		return -ENOMEM;
2720 
2721 	bpf_iter_tcp_put_batch(iter);
2722 	kvfree(iter->batch);
2723 	iter->batch = new_batch;
2724 	iter->max_sk = new_batch_sz;
2725 
2726 	return 0;
2727 }
2728 
2729 static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
2730 						 struct sock *start_sk)
2731 {
2732 	struct bpf_tcp_iter_state *iter = seq->private;
2733 	struct tcp_iter_state *st = &iter->state;
2734 	struct hlist_nulls_node *node;
2735 	unsigned int expected = 1;
2736 	struct sock *sk;
2737 
2738 	sock_hold(start_sk);
2739 	iter->batch[iter->end_sk++] = start_sk;
2740 
2741 	sk = sk_nulls_next(start_sk);
2742 	sk_nulls_for_each_from(sk, node) {
2743 		if (seq_sk_match(seq, sk)) {
2744 			if (iter->end_sk < iter->max_sk) {
2745 				sock_hold(sk);
2746 				iter->batch[iter->end_sk++] = sk;
2747 			}
2748 			expected++;
2749 		}
2750 	}
2751 	spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock);
2752 
2753 	return expected;
2754 }
2755 
2756 static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
2757 						   struct sock *start_sk)
2758 {
2759 	struct bpf_tcp_iter_state *iter = seq->private;
2760 	struct tcp_iter_state *st = &iter->state;
2761 	struct hlist_nulls_node *node;
2762 	unsigned int expected = 1;
2763 	struct sock *sk;
2764 
2765 	sock_hold(start_sk);
2766 	iter->batch[iter->end_sk++] = start_sk;
2767 
2768 	sk = sk_nulls_next(start_sk);
2769 	sk_nulls_for_each_from(sk, node) {
2770 		if (seq_sk_match(seq, sk)) {
2771 			if (iter->end_sk < iter->max_sk) {
2772 				sock_hold(sk);
2773 				iter->batch[iter->end_sk++] = sk;
2774 			}
2775 			expected++;
2776 		}
2777 	}
2778 	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2779 
2780 	return expected;
2781 }
2782 
2783 static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
2784 {
2785 	struct bpf_tcp_iter_state *iter = seq->private;
2786 	struct tcp_iter_state *st = &iter->state;
2787 	unsigned int expected;
2788 	bool resized = false;
2789 	struct sock *sk;
2790 
2791 	/* The st->bucket is done.  Directly advance to the next
2792 	 * bucket instead of having the tcp_seek_last_pos() to skip
2793 	 * one by one in the current bucket and eventually find out
2794 	 * it has to advance to the next bucket.
2795 	 */
2796 	if (iter->st_bucket_done) {
2797 		st->offset = 0;
2798 		st->bucket++;
2799 		if (st->state == TCP_SEQ_STATE_LISTENING &&
2800 		    st->bucket > tcp_hashinfo.lhash2_mask) {
2801 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2802 			st->bucket = 0;
2803 		}
2804 	}
2805 
2806 again:
2807 	/* Get a new batch */
2808 	iter->cur_sk = 0;
2809 	iter->end_sk = 0;
2810 	iter->st_bucket_done = false;
2811 
2812 	sk = tcp_seek_last_pos(seq);
2813 	if (!sk)
2814 		return NULL; /* Done */
2815 
2816 	if (st->state == TCP_SEQ_STATE_LISTENING)
2817 		expected = bpf_iter_tcp_listening_batch(seq, sk);
2818 	else
2819 		expected = bpf_iter_tcp_established_batch(seq, sk);
2820 
2821 	if (iter->end_sk == expected) {
2822 		iter->st_bucket_done = true;
2823 		return sk;
2824 	}
2825 
2826 	if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) {
2827 		resized = true;
2828 		goto again;
2829 	}
2830 
2831 	return sk;
2832 }
2833 
2834 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
2835 {
2836 	/* bpf iter does not support lseek, so it always
2837 	 * continue from where it was stop()-ped.
2838 	 */
2839 	if (*pos)
2840 		return bpf_iter_tcp_batch(seq);
2841 
2842 	return SEQ_START_TOKEN;
2843 }
2844 
2845 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2846 {
2847 	struct bpf_tcp_iter_state *iter = seq->private;
2848 	struct tcp_iter_state *st = &iter->state;
2849 	struct sock *sk;
2850 
2851 	/* Whenever seq_next() is called, the iter->cur_sk is
2852 	 * done with seq_show(), so advance to the next sk in
2853 	 * the batch.
2854 	 */
2855 	if (iter->cur_sk < iter->end_sk) {
2856 		/* Keeping st->num consistent in tcp_iter_state.
2857 		 * bpf_iter_tcp does not use st->num.
2858 		 * meta.seq_num is used instead.
2859 		 */
2860 		st->num++;
2861 		/* Move st->offset to the next sk in the bucket such that
2862 		 * the future start() will resume at st->offset in
2863 		 * st->bucket.  See tcp_seek_last_pos().
2864 		 */
2865 		st->offset++;
2866 		sock_put(iter->batch[iter->cur_sk++]);
2867 	}
2868 
2869 	if (iter->cur_sk < iter->end_sk)
2870 		sk = iter->batch[iter->cur_sk];
2871 	else
2872 		sk = bpf_iter_tcp_batch(seq);
2873 
2874 	++*pos;
2875 	/* Keeping st->last_pos consistent in tcp_iter_state.
2876 	 * bpf iter does not do lseek, so st->last_pos always equals to *pos.
2877 	 */
2878 	st->last_pos = *pos;
2879 	return sk;
2880 }
2881 
2882 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
2883 {
2884 	struct bpf_iter_meta meta;
2885 	struct bpf_prog *prog;
2886 	struct sock *sk = v;
2887 	bool slow;
2888 	uid_t uid;
2889 	int ret;
2890 
2891 	if (v == SEQ_START_TOKEN)
2892 		return 0;
2893 
2894 	if (sk_fullsock(sk))
2895 		slow = lock_sock_fast(sk);
2896 
2897 	if (unlikely(sk_unhashed(sk))) {
2898 		ret = SEQ_SKIP;
2899 		goto unlock;
2900 	}
2901 
2902 	if (sk->sk_state == TCP_TIME_WAIT) {
2903 		uid = 0;
2904 	} else if (sk->sk_state == TCP_NEW_SYN_RECV) {
2905 		const struct request_sock *req = v;
2906 
2907 		uid = from_kuid_munged(seq_user_ns(seq),
2908 				       sock_i_uid(req->rsk_listener));
2909 	} else {
2910 		uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
2911 	}
2912 
2913 	meta.seq = seq;
2914 	prog = bpf_iter_get_info(&meta, false);
2915 	ret = tcp_prog_seq_show(prog, &meta, v, uid);
2916 
2917 unlock:
2918 	if (sk_fullsock(sk))
2919 		unlock_sock_fast(sk, slow);
2920 	return ret;
2921 
2922 }
2923 
2924 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
2925 {
2926 	struct bpf_tcp_iter_state *iter = seq->private;
2927 	struct bpf_iter_meta meta;
2928 	struct bpf_prog *prog;
2929 
2930 	if (!v) {
2931 		meta.seq = seq;
2932 		prog = bpf_iter_get_info(&meta, true);
2933 		if (prog)
2934 			(void)tcp_prog_seq_show(prog, &meta, v, 0);
2935 	}
2936 
2937 	if (iter->cur_sk < iter->end_sk) {
2938 		bpf_iter_tcp_put_batch(iter);
2939 		iter->st_bucket_done = false;
2940 	}
2941 }
2942 
2943 static const struct seq_operations bpf_iter_tcp_seq_ops = {
2944 	.show		= bpf_iter_tcp_seq_show,
2945 	.start		= bpf_iter_tcp_seq_start,
2946 	.next		= bpf_iter_tcp_seq_next,
2947 	.stop		= bpf_iter_tcp_seq_stop,
2948 };
2949 #endif
2950 static unsigned short seq_file_family(const struct seq_file *seq)
2951 {
2952 	const struct tcp_seq_afinfo *afinfo;
2953 
2954 #ifdef CONFIG_BPF_SYSCALL
2955 	/* Iterated from bpf_iter.  Let the bpf prog to filter instead. */
2956 	if (seq->op == &bpf_iter_tcp_seq_ops)
2957 		return AF_UNSPEC;
2958 #endif
2959 
2960 	/* Iterated from proc fs */
2961 	afinfo = pde_data(file_inode(seq->file));
2962 	return afinfo->family;
2963 }
2964 
2965 static const struct seq_operations tcp4_seq_ops = {
2966 	.show		= tcp4_seq_show,
2967 	.start		= tcp_seq_start,
2968 	.next		= tcp_seq_next,
2969 	.stop		= tcp_seq_stop,
2970 };
2971 
2972 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2973 	.family		= AF_INET,
2974 };
2975 
2976 static int __net_init tcp4_proc_init_net(struct net *net)
2977 {
2978 	if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
2979 			sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
2980 		return -ENOMEM;
2981 	return 0;
2982 }
2983 
2984 static void __net_exit tcp4_proc_exit_net(struct net *net)
2985 {
2986 	remove_proc_entry("tcp", net->proc_net);
2987 }
2988 
2989 static struct pernet_operations tcp4_net_ops = {
2990 	.init = tcp4_proc_init_net,
2991 	.exit = tcp4_proc_exit_net,
2992 };
2993 
2994 int __init tcp4_proc_init(void)
2995 {
2996 	return register_pernet_subsys(&tcp4_net_ops);
2997 }
2998 
2999 void tcp4_proc_exit(void)
3000 {
3001 	unregister_pernet_subsys(&tcp4_net_ops);
3002 }
3003 #endif /* CONFIG_PROC_FS */
3004 
3005 /* @wake is one when sk_stream_write_space() calls us.
3006  * This sends EPOLLOUT only if notsent_bytes is half the limit.
3007  * This mimics the strategy used in sock_def_write_space().
3008  */
3009 bool tcp_stream_memory_free(const struct sock *sk, int wake)
3010 {
3011 	const struct tcp_sock *tp = tcp_sk(sk);
3012 	u32 notsent_bytes = READ_ONCE(tp->write_seq) -
3013 			    READ_ONCE(tp->snd_nxt);
3014 
3015 	return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
3016 }
3017 EXPORT_SYMBOL(tcp_stream_memory_free);
3018 
3019 struct proto tcp_prot = {
3020 	.name			= "TCP",
3021 	.owner			= THIS_MODULE,
3022 	.close			= tcp_close,
3023 	.pre_connect		= tcp_v4_pre_connect,
3024 	.connect		= tcp_v4_connect,
3025 	.disconnect		= tcp_disconnect,
3026 	.accept			= inet_csk_accept,
3027 	.ioctl			= tcp_ioctl,
3028 	.init			= tcp_v4_init_sock,
3029 	.destroy		= tcp_v4_destroy_sock,
3030 	.shutdown		= tcp_shutdown,
3031 	.setsockopt		= tcp_setsockopt,
3032 	.getsockopt		= tcp_getsockopt,
3033 	.bpf_bypass_getsockopt	= tcp_bpf_bypass_getsockopt,
3034 	.keepalive		= tcp_set_keepalive,
3035 	.recvmsg		= tcp_recvmsg,
3036 	.sendmsg		= tcp_sendmsg,
3037 	.sendpage		= tcp_sendpage,
3038 	.backlog_rcv		= tcp_v4_do_rcv,
3039 	.release_cb		= tcp_release_cb,
3040 	.hash			= inet_hash,
3041 	.unhash			= inet_unhash,
3042 	.get_port		= inet_csk_get_port,
3043 	.put_port		= inet_put_port,
3044 #ifdef CONFIG_BPF_SYSCALL
3045 	.psock_update_sk_prot	= tcp_bpf_update_proto,
3046 #endif
3047 	.enter_memory_pressure	= tcp_enter_memory_pressure,
3048 	.leave_memory_pressure	= tcp_leave_memory_pressure,
3049 	.stream_memory_free	= tcp_stream_memory_free,
3050 	.sockets_allocated	= &tcp_sockets_allocated,
3051 	.orphan_count		= &tcp_orphan_count,
3052 	.memory_allocated	= &tcp_memory_allocated,
3053 	.memory_pressure	= &tcp_memory_pressure,
3054 	.sysctl_mem		= sysctl_tcp_mem,
3055 	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
3056 	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
3057 	.max_header		= MAX_TCP_HEADER,
3058 	.obj_size		= sizeof(struct tcp_sock),
3059 	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
3060 	.twsk_prot		= &tcp_timewait_sock_ops,
3061 	.rsk_prot		= &tcp_request_sock_ops,
3062 	.h.hashinfo		= &tcp_hashinfo,
3063 	.no_autobind		= true,
3064 	.diag_destroy		= tcp_abort,
3065 };
3066 EXPORT_SYMBOL(tcp_prot);
3067 
3068 static void __net_exit tcp_sk_exit(struct net *net)
3069 {
3070 	struct inet_timewait_death_row *tcp_death_row = net->ipv4.tcp_death_row;
3071 
3072 	if (net->ipv4.tcp_congestion_control)
3073 		bpf_module_put(net->ipv4.tcp_congestion_control,
3074 			       net->ipv4.tcp_congestion_control->owner);
3075 	if (refcount_dec_and_test(&tcp_death_row->tw_refcount))
3076 		kfree(tcp_death_row);
3077 }
3078 
3079 static int __net_init tcp_sk_init(struct net *net)
3080 {
3081 	int cnt;
3082 
3083 	net->ipv4.sysctl_tcp_ecn = 2;
3084 	net->ipv4.sysctl_tcp_ecn_fallback = 1;
3085 
3086 	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
3087 	net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
3088 	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
3089 	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
3090 	net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
3091 
3092 	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
3093 	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
3094 	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
3095 
3096 	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
3097 	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
3098 	net->ipv4.sysctl_tcp_syncookies = 1;
3099 	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
3100 	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
3101 	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
3102 	net->ipv4.sysctl_tcp_orphan_retries = 0;
3103 	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
3104 	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
3105 	net->ipv4.sysctl_tcp_tw_reuse = 2;
3106 	net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
3107 
3108 	net->ipv4.tcp_death_row = kzalloc(sizeof(struct inet_timewait_death_row), GFP_KERNEL);
3109 	if (!net->ipv4.tcp_death_row)
3110 		return -ENOMEM;
3111 	refcount_set(&net->ipv4.tcp_death_row->tw_refcount, 1);
3112 	cnt = tcp_hashinfo.ehash_mask + 1;
3113 	net->ipv4.tcp_death_row->sysctl_max_tw_buckets = cnt / 2;
3114 	net->ipv4.tcp_death_row->hashinfo = &tcp_hashinfo;
3115 
3116 	net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
3117 	net->ipv4.sysctl_tcp_sack = 1;
3118 	net->ipv4.sysctl_tcp_window_scaling = 1;
3119 	net->ipv4.sysctl_tcp_timestamps = 1;
3120 	net->ipv4.sysctl_tcp_early_retrans = 3;
3121 	net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
3122 	net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
3123 	net->ipv4.sysctl_tcp_retrans_collapse = 1;
3124 	net->ipv4.sysctl_tcp_max_reordering = 300;
3125 	net->ipv4.sysctl_tcp_dsack = 1;
3126 	net->ipv4.sysctl_tcp_app_win = 31;
3127 	net->ipv4.sysctl_tcp_adv_win_scale = 1;
3128 	net->ipv4.sysctl_tcp_frto = 2;
3129 	net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
3130 	/* This limits the percentage of the congestion window which we
3131 	 * will allow a single TSO frame to consume.  Building TSO frames
3132 	 * which are too large can cause TCP streams to be bursty.
3133 	 */
3134 	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
3135 	/* Default TSQ limit of 16 TSO segments */
3136 	net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
3137 	/* rfc5961 challenge ack rate limiting */
3138 	net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
3139 	net->ipv4.sysctl_tcp_min_tso_segs = 2;
3140 	net->ipv4.sysctl_tcp_tso_rtt_log = 9;  /* 2^9 = 512 usec */
3141 	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
3142 	net->ipv4.sysctl_tcp_autocorking = 1;
3143 	net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
3144 	net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
3145 	net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
3146 	if (net != &init_net) {
3147 		memcpy(net->ipv4.sysctl_tcp_rmem,
3148 		       init_net.ipv4.sysctl_tcp_rmem,
3149 		       sizeof(init_net.ipv4.sysctl_tcp_rmem));
3150 		memcpy(net->ipv4.sysctl_tcp_wmem,
3151 		       init_net.ipv4.sysctl_tcp_wmem,
3152 		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
3153 	}
3154 	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
3155 	net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
3156 	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
3157 	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
3158 	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
3159 	atomic_set(&net->ipv4.tfo_active_disable_times, 0);
3160 
3161 	/* Reno is always built in */
3162 	if (!net_eq(net, &init_net) &&
3163 	    bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
3164 			       init_net.ipv4.tcp_congestion_control->owner))
3165 		net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
3166 	else
3167 		net->ipv4.tcp_congestion_control = &tcp_reno;
3168 
3169 	return 0;
3170 }
3171 
3172 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
3173 {
3174 	struct net *net;
3175 
3176 	inet_twsk_purge(&tcp_hashinfo, AF_INET);
3177 
3178 	list_for_each_entry(net, net_exit_list, exit_list)
3179 		tcp_fastopen_ctx_destroy(net);
3180 }
3181 
3182 static struct pernet_operations __net_initdata tcp_sk_ops = {
3183        .init	   = tcp_sk_init,
3184        .exit	   = tcp_sk_exit,
3185        .exit_batch = tcp_sk_exit_batch,
3186 };
3187 
3188 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3189 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
3190 		     struct sock_common *sk_common, uid_t uid)
3191 
/* Batch size requested when an iterator's private state is first set up. */
#define INIT_BATCH_SZ 16

/* Set up the private state of a "tcp" BPF iterator.
 *
 * Initializes the seq_net part of @priv_data, then sizes the batch to
 * INIT_BATCH_SZ. If the batch allocation fails, the seq_net
 * initialization is undone before the error is returned.
 *
 * Returns 0 on success or the error from the step that failed.
 */
static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
{
	struct bpf_tcp_iter_state *state = priv_data;
	int ret;

	ret = bpf_iter_init_seq_net(priv_data, aux);
	if (ret)
		return ret;

	ret = bpf_iter_tcp_realloc_batch(state, INIT_BATCH_SZ);
	if (ret)
		bpf_iter_fini_seq_net(priv_data);

	return ret;
}
3211 
3212 static void bpf_iter_fini_tcp(void *priv_data)
3213 {
3214 	struct bpf_tcp_iter_state *iter = priv_data;
3215 
3216 	bpf_iter_fini_seq_net(priv_data);
3217 	kvfree(iter->batch);
3218 }
3219 
3220 static const struct bpf_iter_seq_info tcp_seq_info = {
3221 	.seq_ops		= &bpf_iter_tcp_seq_ops,
3222 	.init_seq_private	= bpf_iter_init_tcp,
3223 	.fini_seq_private	= bpf_iter_fini_tcp,
3224 	.seq_priv_size		= sizeof(struct bpf_tcp_iter_state),
3225 };
3226 
3227 static const struct bpf_func_proto *
3228 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
3229 			    const struct bpf_prog *prog)
3230 {
3231 	switch (func_id) {
3232 	case BPF_FUNC_setsockopt:
3233 		return &bpf_sk_setsockopt_proto;
3234 	case BPF_FUNC_getsockopt:
3235 		return &bpf_sk_getsockopt_proto;
3236 	default:
3237 		return NULL;
3238 	}
3239 }
3240 
3241 static struct bpf_iter_reg tcp_reg_info = {
3242 	.target			= "tcp",
3243 	.ctx_arg_info_size	= 1,
3244 	.ctx_arg_info		= {
3245 		{ offsetof(struct bpf_iter__tcp, sk_common),
3246 		  PTR_TO_BTF_ID_OR_NULL },
3247 	},
3248 	.get_func_proto		= bpf_iter_tcp_get_func_proto,
3249 	.seq_info		= &tcp_seq_info,
3250 };
3251 
3252 static void __init bpf_iter_register(void)
3253 {
3254 	tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3255 	if (bpf_iter_reg_target(&tcp_reg_info))
3256 		pr_warn("Warning: could not register bpf iterator tcp\n");
3257 }
3258 
3259 #endif
3260 
3261 void __init tcp_v4_init(void)
3262 {
3263 	int cpu, res;
3264 
3265 	for_each_possible_cpu(cpu) {
3266 		struct sock *sk;
3267 
3268 		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
3269 					   IPPROTO_TCP, &init_net);
3270 		if (res)
3271 			panic("Failed to create the TCP control socket.\n");
3272 		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
3273 
3274 		/* Please enforce IP_DF and IPID==0 for RST and
3275 		 * ACK sent in SYN-RECV and TIME-WAIT state.
3276 		 */
3277 		inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
3278 
3279 		per_cpu(ipv4_tcp_sk, cpu) = sk;
3280 	}
3281 	if (register_pernet_subsys(&tcp_sk_ops))
3282 		panic("Failed to create the TCP control socket.\n");
3283 
3284 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3285 	bpf_iter_register();
3286 #endif
3287 }
3288