xref: /openbmc/linux/net/ipv4/tcp_ipv4.c (revision a48acad7)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		Implementation of the Transmission Control Protocol(TCP).
8  *
9  *		IPv4 specific functions
10  *
11  *		code split from:
12  *		linux/ipv4/tcp.c
13  *		linux/ipv4/tcp_input.c
14  *		linux/ipv4/tcp_output.c
15  *
16  *		See tcp.c for author information
17  */
18 
19 /*
20  * Changes:
21  *		David S. Miller	:	New socket lookup architecture.
22  *					This code is dedicated to John Dyson.
23  *		David S. Miller :	Change semantics of established hash,
24  *					half is devoted to TIME_WAIT sockets
25  *					and the rest go in the other half.
26  *		Andi Kleen :		Add support for syncookies and fixed
27  *					some bugs: ip options weren't passed to
28  *					the TCP layer, missed a check for an
29  *					ACK bit.
30  *		Andi Kleen :		Implemented fast path mtu discovery.
31  *					Fixed many serious bugs in the
32  *					request_sock handling and moved
33  *					most of it into the af independent code.
34  *					Added tail drop and some other bugfixes.
35  *					Added new listen semantics.
36  *		Mike McLagan	:	Routing by source
37  *	Juan Jose Ciarlante:		ip_dynaddr bits
38  *		Andi Kleen:		various fixes.
39  *	Vitaly E. Lavrov	:	Transparent proxy revived after year
40  *					coma.
41  *	Andi Kleen		:	Fix new listen.
42  *	Andi Kleen		:	Fix accept error reporting.
43  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
44  *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
45  *					a single port at the same time.
46  */
47 
48 #define pr_fmt(fmt) "TCP: " fmt
49 
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
60 
61 #include <net/net_namespace.h>
62 #include <net/icmp.h>
63 #include <net/inet_hashtables.h>
64 #include <net/tcp.h>
65 #include <net/transp_v6.h>
66 #include <net/ipv6.h>
67 #include <net/inet_common.h>
68 #include <net/timewait_sock.h>
69 #include <net/xfrm.h>
70 #include <net/secure_seq.h>
71 #include <net/busy_poll.h>
72 
73 #include <linux/inet.h>
74 #include <linux/ipv6.h>
75 #include <linux/stddef.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
78 #include <linux/inetdevice.h>
79 #include <linux/btf_ids.h>
80 
81 #include <crypto/hash.h>
82 #include <linux/scatterlist.h>
83 
84 #include <trace/events/tcp.h>
85 
86 #ifdef CONFIG_TCP_MD5SIG
87 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
88 			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
89 #endif
90 
91 struct inet_hashinfo tcp_hashinfo;
92 EXPORT_SYMBOL(tcp_hashinfo);
93 
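/* Per-cpu kernel control socket, used by tcp_v4_send_reset() and
 * tcp_v4_send_ack() to transmit replies (RST, out-of-state ACK) without
 * holding any user socket. Callers run with BHs disabled and temporarily
 * switch the socket's netns via sock_net_set().
 */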
94 static DEFINE_PER_CPU(struct sock *, ipv4_tcp_sk);
95 
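/* Initial sequence number for a passive connection, derived from the
 * packet's 4-tuple (addresses and ports) via secure_tcp_seq(); the
 * timestamp offset below is derived from the addresses alone.
 */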
96 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
97 {
98 	return secure_tcp_seq(ip_hdr(skb)->daddr,
99 			      ip_hdr(skb)->saddr,
100 			      tcp_hdr(skb)->dest,
101 			      tcp_hdr(skb)->source);
102 }
103 
104 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
105 {
106 	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
107 }
108 
109 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
110 {
111 	int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
112 	const struct inet_timewait_sock *tw = inet_twsk(sktw);
113 	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
114 	struct tcp_sock *tp = tcp_sk(sk);
115 
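	/* tcp_tw_reuse == 2 restricts TIME-WAIT reuse to loopback traffic. */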
116 	if (reuse == 2) {
117 		/* Still does not detect *everything* that goes through
118 		 * lo, since we require a loopback src or dst address
119 		 * or direct binding to 'lo' interface.
120 		 */
121 		bool loopback = false;
122 		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
123 			loopback = true;
124 #if IS_ENABLED(CONFIG_IPV6)
125 		if (tw->tw_family == AF_INET6) {
126 			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
127 			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
128 			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
129 			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
130 				loopback = true;
131 		} else
132 #endif
133 		{
134 			if (ipv4_is_loopback(tw->tw_daddr) ||
135 			    ipv4_is_loopback(tw->tw_rcv_saddr))
136 				loopback = true;
137 		}
138 		if (!loopback)
139 			reuse = 0;
140 	}
141 
142 	/* With PAWS, it is safe from the viewpoint
143 	   of data integrity. Even without PAWS it is safe provided sequence
144 	   spaces do not overlap, i.e. at data rates <= 80 Mbit/sec.
145 
146 	   Actually, the idea is close to VJ's: only the timestamp cache is
147 	   held not per host but per port pair, and the TW bucket is used as
148 	   the state holder.
149 
150 	   If the TW bucket has already been destroyed, we fall back to VJ's
151 	   scheme and use the initial timestamp retrieved from the peer table.
152 	 */
153 	if (tcptw->tw_ts_recent_stamp &&
154 	    (!twp || (reuse && time_after32(ktime_get_seconds(),
155 					    tcptw->tw_ts_recent_stamp)))) {
156 		/* In case of repair and re-using TIME-WAIT sockets we still
157 		 * want to be sure that it is safe as above but honor the
158 		 * sequence numbers and time stamps set as part of the repair
159 		 * process.
160 		 *
161 		 * Without this check re-using a TIME-WAIT socket with TCP
162 		 * repair would accumulate a -1 on the repair assigned
163 		 * sequence number. The first time it is reused the sequence
164 		 * is -1, the second time -2, etc. This fixes that issue
165 		 * without appearing to create any others.
166 		 */
167 		if (likely(!tp->repair)) {
168 			u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
169 
170 			if (!seq)
171 				seq = 1;
172 			WRITE_ONCE(tp->write_seq, seq);
173 			tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
174 			tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
175 		}
176 		sock_hold(sktw);
177 		return 1;
178 	}
179 
180 	return 0;
181 }
182 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
183 
184 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
185 			      int addr_len)
186 {
187 	/* This check is replicated from tcp_v4_connect() and intended to
188 	 * prevent the BPF program called below from accessing bytes that are
189 	 * outside the bound specified by the user in addr_len.
190 	 */
191 	if (addr_len < sizeof(struct sockaddr_in))
192 		return -EINVAL;
193 
194 	sock_owned_by_me(sk);
195 
196 	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
197 }
198 
199 /* This will initiate an outgoing connection. */
200 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
201 {
202 	struct inet_bind_hashbucket *prev_addr_hashbucket = NULL;
203 	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
204 	struct inet_timewait_death_row *tcp_death_row;
205 	__be32 daddr, nexthop, prev_sk_rcv_saddr;
206 	struct inet_sock *inet = inet_sk(sk);
207 	struct tcp_sock *tp = tcp_sk(sk);
208 	struct ip_options_rcu *inet_opt;
209 	struct net *net = sock_net(sk);
210 	__be16 orig_sport, orig_dport;
211 	struct flowi4 *fl4;
212 	struct rtable *rt;
213 	int err;
214 
215 	if (addr_len < sizeof(struct sockaddr_in))
216 		return -EINVAL;
217 
218 	if (usin->sin_family != AF_INET)
219 		return -EAFNOSUPPORT;
220 
221 	nexthop = daddr = usin->sin_addr.s_addr;
222 	inet_opt = rcu_dereference_protected(inet->inet_opt,
223 					     lockdep_sock_is_held(sk));
224 	if (inet_opt && inet_opt->opt.srr) {
225 		if (!daddr)
226 			return -EINVAL;
227 		nexthop = inet_opt->opt.faddr;
228 	}
229 
230 	orig_sport = inet->inet_sport;
231 	orig_dport = usin->sin_port;
232 	fl4 = &inet->cork.fl.u.ip4;
233 	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
234 			      sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
235 			      orig_dport, sk);
236 	if (IS_ERR(rt)) {
237 		err = PTR_ERR(rt);
238 		if (err == -ENETUNREACH)
239 			IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
240 		return err;
241 	}
242 
243 	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
244 		ip_rt_put(rt);
245 		return -ENETUNREACH;
246 	}
247 
248 	if (!inet_opt || !inet_opt->opt.srr)
249 		daddr = fl4->daddr;
250 
251 	tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
252 
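	/* If no source address is bound yet, adopt the one picked by routing.
	 * When the socket already sits in a bhash2 bucket, remember the old
	 * bucket and rcv_saddr so the rehash below can be rolled back on error.
	 */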
253 	if (!inet->inet_saddr) {
254 		if (inet_csk(sk)->icsk_bind2_hash) {
255 			prev_addr_hashbucket = inet_bhashfn_portaddr(tcp_death_row->hashinfo,
256 								     sk, net, inet->inet_num);
257 			prev_sk_rcv_saddr = sk->sk_rcv_saddr;
258 		}
259 		inet->inet_saddr = fl4->saddr;
260 	}
261 
262 	sk_rcv_saddr_set(sk, inet->inet_saddr);
263 
264 	if (prev_addr_hashbucket) {
265 		err = inet_bhash2_update_saddr(prev_addr_hashbucket, sk);
266 		if (err) {
267 			inet->inet_saddr = 0;
268 			sk_rcv_saddr_set(sk, prev_sk_rcv_saddr);
269 			ip_rt_put(rt);
270 			return err;
271 		}
272 	}
273 
274 	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
275 		/* Reset inherited state */
276 		tp->rx_opt.ts_recent	   = 0;
277 		tp->rx_opt.ts_recent_stamp = 0;
278 		if (likely(!tp->repair))
279 			WRITE_ONCE(tp->write_seq, 0);
280 	}
281 
282 	inet->inet_dport = usin->sin_port;
283 	sk_daddr_set(sk, daddr);
284 
285 	inet_csk(sk)->icsk_ext_hdr_len = 0;
286 	if (inet_opt)
287 		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
288 
289 	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
290 
291 	/* Socket identity is still unknown (sport may be zero).
292 	 * However, we set the state to SYN-SENT and, without releasing the
293 	 * socket lock, select a source port, enter ourselves into the hash
294 	 * tables and complete initialization after this.
295 	 */
296 	tcp_set_state(sk, TCP_SYN_SENT);
297 	err = inet_hash_connect(tcp_death_row, sk);
298 	if (err)
299 		goto failure;
300 
301 	sk_set_txhash(sk);
302 
303 	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
304 			       inet->inet_sport, inet->inet_dport, sk);
305 	if (IS_ERR(rt)) {
306 		err = PTR_ERR(rt);
307 		rt = NULL;
308 		goto failure;
309 	}
310 	/* OK, now commit destination to socket.  */
311 	sk->sk_gso_type = SKB_GSO_TCPV4;
312 	sk_setup_caps(sk, &rt->dst);
313 	rt = NULL;
314 
315 	if (likely(!tp->repair)) {
316 		if (!tp->write_seq)
317 			WRITE_ONCE(tp->write_seq,
318 				   secure_tcp_seq(inet->inet_saddr,
319 						  inet->inet_daddr,
320 						  inet->inet_sport,
321 						  usin->sin_port));
322 		tp->tsoffset = secure_tcp_ts_off(net, inet->inet_saddr,
323 						 inet->inet_daddr);
324 	}
325 
326 	inet->inet_id = get_random_u16();
327 
328 	if (tcp_fastopen_defer_connect(sk, &err))
329 		return err;
330 	if (err)
331 		goto failure;
332 
333 	err = tcp_connect(sk);
334 
335 	if (err)
336 		goto failure;
337 
338 	return 0;
339 
340 failure:
341 	/*
342 	 * This unhashes the socket and releases the local port,
343 	 * if necessary.
344 	 */
345 	tcp_set_state(sk, TCP_CLOSE);
346 	ip_rt_put(rt);
347 	sk->sk_route_caps = 0;
348 	inet->inet_dport = 0;
349 	return err;
350 }
351 EXPORT_SYMBOL(tcp_v4_connect);
352 
353 /*
354  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
355  * It can be called through tcp_release_cb() if socket was owned by user
356  * at the time tcp_v4_err() was called to handle ICMP message.
357  */
358 void tcp_v4_mtu_reduced(struct sock *sk)
359 {
360 	struct inet_sock *inet = inet_sk(sk);
361 	struct dst_entry *dst;
362 	u32 mtu;
363 
364 	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
365 		return;
366 	mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
367 	dst = inet_csk_update_pmtu(sk, mtu);
368 	if (!dst)
369 		return;
370 
371 	/* Something is about to go wrong... Remember the soft error
372 	 * in case this connection is not able to recover.
373 	 */
374 	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
375 		sk->sk_err_soft = EMSGSIZE;
376 
377 	mtu = dst_mtu(dst);
378 
379 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
380 	    ip_sk_accept_pmtu(sk) &&
381 	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
382 		tcp_sync_mss(sk, mtu);
383 
384 		/* Resend the TCP packet because it's
385 		 * clear that the old packet has been
386 		 * dropped. This is the new "fast" path mtu
387 		 * discovery.
388 		 */
389 		tcp_simple_retransmit(sk);
390 	} /* else let the usual retransmit timer handle it */
391 }
392 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
393 
394 static void do_redirect(struct sk_buff *skb, struct sock *sk)
395 {
396 	struct dst_entry *dst = __sk_dst_check(sk, 0);
397 
398 	if (dst)
399 		dst->ops->redirect(dst, sk, skb);
400 }
401 
402 
403 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
404 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
405 {
406 	struct request_sock *req = inet_reqsk(sk);
407 	struct net *net = sock_net(sk);
408 
409 	/* ICMPs are not backlogged, hence we cannot get
410 	 * an established socket here.
411 	 */
412 	if (seq != tcp_rsk(req)->snt_isn) {
413 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
414 	} else if (abort) {
415 		/*
416 		 * Still in SYN_RECV, just remove it silently.
417 		 * There is no good way to pass the error to the newly
418 		 * created socket, and POSIX does not want network
419 		 * errors returned from accept().
420 		 */
421 		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
422 		tcp_listendrop(req->rsk_listener);
423 	}
424 	reqsk_put(req);
425 }
426 EXPORT_SYMBOL(tcp_req_err);
427 
428 /* TCP-LD (RFC 6069) logic */
429 void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
430 {
431 	struct inet_connection_sock *icsk = inet_csk(sk);
432 	struct tcp_sock *tp = tcp_sk(sk);
433 	struct sk_buff *skb;
434 	s32 remaining;
435 	u32 delta_us;
436 
437 	if (sock_owned_by_user(sk))
438 		return;
439 
440 	if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
441 	    !icsk->icsk_backoff)
442 		return;
443 
444 	skb = tcp_rtx_queue_head(sk);
445 	if (WARN_ON_ONCE(!skb))
446 		return;
447 
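	/* Undo one exponential backoff step and recompute the RTO, then
	 * re-arm the retransmit timer with whatever time remains (RFC 6069).
	 */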
448 	icsk->icsk_backoff--;
449 	icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
450 	icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
451 
452 	tcp_mstamp_refresh(tp);
453 	delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
454 	remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
455 
456 	if (remaining > 0) {
457 		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
458 					  remaining, TCP_RTO_MAX);
459 	} else {
460 		/* RTO revert clocked out retransmission.
461 		 * Will retransmit now.
462 		 */
463 		tcp_retransmit_timer(sk);
464 	}
465 }
466 EXPORT_SYMBOL(tcp_ld_RTO_revert);
467 
468 /*
469  * This routine is called by the ICMP module when it gets some
470  * sort of error condition.  If err < 0 then the socket should
471  * be closed and the error returned to the user.  If err > 0
472  * it's just the icmp type << 8 | icmp code.  After adjustment
473  * header points to the first 8 bytes of the tcp header.  We need
474  * to find the appropriate port.
475  *
476  * The locking strategy used here is very "optimistic". When
477  * someone else accesses the socket the ICMP is just dropped
478  * and for some paths there is no check at all.
479  * A more general error queue to queue errors for later handling
480  * is probably better.
481  *
482  */
483 
484 int tcp_v4_err(struct sk_buff *skb, u32 info)
485 {
486 	const struct iphdr *iph = (const struct iphdr *)skb->data;
487 	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
488 	struct tcp_sock *tp;
489 	struct inet_sock *inet;
490 	const int type = icmp_hdr(skb)->type;
491 	const int code = icmp_hdr(skb)->code;
492 	struct sock *sk;
493 	struct request_sock *fastopen;
494 	u32 seq, snd_una;
495 	int err;
496 	struct net *net = dev_net(skb->dev);
497 
498 	sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
499 				       iph->daddr, th->dest, iph->saddr,
500 				       ntohs(th->source), inet_iif(skb), 0);
501 	if (!sk) {
502 		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
503 		return -ENOENT;
504 	}
505 	if (sk->sk_state == TCP_TIME_WAIT) {
506 		inet_twsk_put(inet_twsk(sk));
507 		return 0;
508 	}
509 	seq = ntohl(th->seq);
510 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
511 		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
512 				     type == ICMP_TIME_EXCEEDED ||
513 				     (type == ICMP_DEST_UNREACH &&
514 				      (code == ICMP_NET_UNREACH ||
515 				       code == ICMP_HOST_UNREACH)));
516 		return 0;
517 	}
518 
519 	bh_lock_sock(sk);
520 	/* If too many ICMPs get dropped on busy
521 	 * servers this needs to be solved differently.
522 	 * We do take care of the PMTU discovery (RFC 1191) special case:
523 	 * we can receive locally generated ICMP messages while the socket is held.
524 	 */
525 	if (sock_owned_by_user(sk)) {
526 		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
527 			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
528 	}
529 	if (sk->sk_state == TCP_CLOSE)
530 		goto out;
531 
532 	if (static_branch_unlikely(&ip4_min_ttl)) {
533 		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
534 		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
535 			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
536 			goto out;
537 		}
538 	}
539 
540 	tp = tcp_sk(sk);
541 	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
542 	fastopen = rcu_dereference(tp->fastopen_rsk);
543 	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
544 	if (sk->sk_state != TCP_LISTEN &&
545 	    !between(seq, snd_una, tp->snd_nxt)) {
546 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
547 		goto out;
548 	}
549 
550 	switch (type) {
551 	case ICMP_REDIRECT:
552 		if (!sock_owned_by_user(sk))
553 			do_redirect(skb, sk);
554 		goto out;
555 	case ICMP_SOURCE_QUENCH:
556 		/* Just silently ignore these. */
557 		goto out;
558 	case ICMP_PARAMETERPROB:
559 		err = EPROTO;
560 		break;
561 	case ICMP_DEST_UNREACH:
562 		if (code > NR_ICMP_UNREACH)
563 			goto out;
564 
565 		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
566 			/* We are not interested in TCP_LISTEN and open_requests
567 			 * (SYN-ACKs sent out by Linux are always < 576 bytes, so
568 			 * they should go through unfragmented).
569 			 */
570 			if (sk->sk_state == TCP_LISTEN)
571 				goto out;
572 
573 			WRITE_ONCE(tp->mtu_info, info);
574 			if (!sock_owned_by_user(sk)) {
575 				tcp_v4_mtu_reduced(sk);
576 			} else {
577 				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
578 					sock_hold(sk);
579 			}
580 			goto out;
581 		}
582 
583 		err = icmp_err_convert[code].errno;
584 		/* check if this ICMP message allows revert of backoff.
585 		 * (see RFC 6069)
586 		 */
587 		if (!fastopen &&
588 		    (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
589 			tcp_ld_RTO_revert(sk, seq);
590 		break;
591 	case ICMP_TIME_EXCEEDED:
592 		err = EHOSTUNREACH;
593 		break;
594 	default:
595 		goto out;
596 	}
597 
598 	switch (sk->sk_state) {
599 	case TCP_SYN_SENT:
600 	case TCP_SYN_RECV:
601 		/* Only in fast or simultaneous open. If a fast open socket is
602 		 * already accepted it is treated as a connected one below.
603 		 */
604 		if (fastopen && !fastopen->sk)
605 			break;
606 
607 		ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
608 
609 		if (!sock_owned_by_user(sk)) {
610 			sk->sk_err = err;
611 
612 			sk_error_report(sk);
613 
614 			tcp_done(sk);
615 		} else {
616 			sk->sk_err_soft = err;
617 		}
618 		goto out;
619 	}
620 
621 	/* If we've already connected we will keep trying
622 	 * until we time out, or the user gives up.
623 	 *
624 	 * RFC 1122 4.2.3.9 allows us to treat as hard errors
625 	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
626 	 * but it is obsoleted by PMTU discovery).
627 	 *
628 	 * Note that on the modern Internet, where routing is unreliable
629 	 * and broken firewalls sit in every dark corner sending random
630 	 * errors at their masters' behest, even these two messages have
631 	 * finally lost their original sense (even Linux sends invalid PORT_UNREACHs).
632 	 *
633 	 * Now we are in compliance with RFCs.
634 	 *							--ANK (980905)
635 	 */
636 
637 	inet = inet_sk(sk);
638 	if (!sock_owned_by_user(sk) && inet->recverr) {
639 		sk->sk_err = err;
640 		sk_error_report(sk);
641 	} else	{ /* Only an error on timeout */
642 		sk->sk_err_soft = err;
643 	}
644 
645 out:
646 	bh_unlock_sock(sk);
647 	sock_put(sk);
648 	return 0;
649 }
650 
651 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
652 {
653 	struct tcphdr *th = tcp_hdr(skb);
654 
655 	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
656 	skb->csum_start = skb_transport_header(skb) - skb->head;
657 	skb->csum_offset = offsetof(struct tcphdr, check);
658 }
659 
660 /* This routine computes an IPv4 TCP checksum. */
661 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
662 {
663 	const struct inet_sock *inet = inet_sk(sk);
664 
665 	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
666 }
667 EXPORT_SYMBOL(tcp_v4_send_check);
668 
669 /*
670  *	This routine will send an RST to the other tcp.
671  *
672  *	Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
673  *		      for the reset?
674  *	Answer: if a packet caused an RST, it is not for a socket
675  *		existing in our system; if it is matched to a socket,
676  *		it is just a duplicate segment or a bug in the other side's
677  *		TCP. So we build the reply based only on the parameters
678  *		that arrived with the segment.
679  *	Exception: precedence violation. We do not implement it in any case.
680  */
681 
682 #ifdef CONFIG_TCP_MD5SIG
683 #define OPTION_BYTES TCPOLEN_MD5SIG_ALIGNED
684 #else
685 #define OPTION_BYTES sizeof(__be32)
686 #endif
687 
688 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
689 {
690 	const struct tcphdr *th = tcp_hdr(skb);
691 	struct {
692 		struct tcphdr th;
693 		__be32 opt[OPTION_BYTES / sizeof(__be32)];
694 	} rep;
695 	struct ip_reply_arg arg;
696 #ifdef CONFIG_TCP_MD5SIG
697 	struct tcp_md5sig_key *key = NULL;
698 	const __u8 *hash_location = NULL;
699 	unsigned char newhash[16];
700 	int genhash;
701 	struct sock *sk1 = NULL;
702 #endif
703 	u64 transmit_time = 0;
704 	struct sock *ctl_sk;
705 	struct net *net;
706 
707 	/* Never send a reset in response to a reset. */
708 	if (th->rst)
709 		return;
710 
711 	/* If sk is not NULL, it means we did a successful lookup and the
712 	 * incoming route had to be correct. prequeue might have dropped our dst.
713 	 */
714 	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
715 		return;
716 
717 	/* Swap the send and the receive. */
718 	memset(&rep, 0, sizeof(rep));
719 	rep.th.dest   = th->source;
720 	rep.th.source = th->dest;
721 	rep.th.doff   = sizeof(struct tcphdr) / 4;
722 	rep.th.rst    = 1;
723 
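	/* Per RFC 793: if the incoming segment has an ACK, the RST carries
	 * SEQ = SEG.ACK; otherwise we set ACK and compute ACK_SEQ from
	 * SEG.SEQ plus the segment length (counting SYN/FIN).
	 */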
724 	if (th->ack) {
725 		rep.th.seq = th->ack_seq;
726 	} else {
727 		rep.th.ack = 1;
728 		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
729 				       skb->len - (th->doff << 2));
730 	}
731 
732 	memset(&arg, 0, sizeof(arg));
733 	arg.iov[0].iov_base = (unsigned char *)&rep;
734 	arg.iov[0].iov_len  = sizeof(rep.th);
735 
736 	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
737 #ifdef CONFIG_TCP_MD5SIG
738 	rcu_read_lock();
739 	hash_location = tcp_parse_md5sig_option(th);
740 	if (sk && sk_fullsock(sk)) {
741 		const union tcp_md5_addr *addr;
742 		int l3index;
743 
744 		/* If sdif is set, the packet ingressed via a device
745 		 * in an L3 domain and inet_iif is set to it.
746 		 */
747 		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
748 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
749 		key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
750 	} else if (hash_location) {
751 		const union tcp_md5_addr *addr;
752 		int sdif = tcp_v4_sdif(skb);
753 		int dif = inet_iif(skb);
754 		int l3index;
755 
756 		/*
757 		 * The active side is lost. Try to find the listening socket
758 		 * through the source port, and then find the MD5 key through
759 		 * the listening socket. We do not loosen security here:
760 		 * the incoming packet is checked against the MD5 hash of the
761 		 * found key, and no RST is generated if the hash doesn't match.
762 		 */
763 		sk1 = __inet_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo,
764 					     NULL, 0, ip_hdr(skb)->saddr,
765 					     th->source, ip_hdr(skb)->daddr,
766 					     ntohs(th->source), dif, sdif);
767 		/* don't send an RST if we can't find a key */
768 		if (!sk1)
769 			goto out;
770 
771 		/* If sdif is set, the packet ingressed via a device
772 		 * in an L3 domain and dif is set to it.
773 		 */
774 		l3index = sdif ? dif : 0;
775 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
776 		key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
777 		if (!key)
778 			goto out;
779 
780 
781 		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
782 		if (genhash || memcmp(hash_location, newhash, 16) != 0)
783 			goto out;
784 
785 	}
786 
787 	if (key) {
788 		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
789 				   (TCPOPT_NOP << 16) |
790 				   (TCPOPT_MD5SIG << 8) |
791 				   TCPOLEN_MD5SIG);
792 		/* Update length and the length the header thinks exists */
793 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
794 		rep.th.doff = arg.iov[0].iov_len / 4;
795 
796 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
797 				     key, ip_hdr(skb)->saddr,
798 				     ip_hdr(skb)->daddr, &rep.th);
799 	}
800 #endif
801 	/* Can't co-exist with TCPMD5, hence check rep.opt[0] */
802 	if (rep.opt[0] == 0) {
803 		__be32 mrst = mptcp_reset_option(skb);
804 
805 		if (mrst) {
806 			rep.opt[0] = mrst;
807 			arg.iov[0].iov_len += sizeof(mrst);
808 			rep.th.doff = arg.iov[0].iov_len / 4;
809 		}
810 	}
811 
812 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
813 				      ip_hdr(skb)->saddr, /* XXX */
814 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
815 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
816 	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
817 
818 	/* When the socket is gone, all binding information is lost and
819 	 * routing might fail in this case. No choice here: if we force the
820 	 * input interface, we will misroute in case of an asymmetric route.
821 	 */
822 	if (sk) {
823 		arg.bound_dev_if = sk->sk_bound_dev_if;
824 		if (sk_fullsock(sk))
825 			trace_tcp_send_reset(sk, skb);
826 	}
827 
828 	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
829 		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));
830 
831 	arg.tos = ip_hdr(skb)->tos;
832 	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
833 	local_bh_disable();
834 	ctl_sk = this_cpu_read(ipv4_tcp_sk);
835 	sock_net_set(ctl_sk, net);
836 	if (sk) {
837 		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
838 				   inet_twsk(sk)->tw_mark : sk->sk_mark;
839 		ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
840 				   inet_twsk(sk)->tw_priority : sk->sk_priority;
841 		transmit_time = tcp_transmit_time(sk);
842 		xfrm_sk_clone_policy(ctl_sk, sk);
843 	}
844 	ip_send_unicast_reply(ctl_sk,
845 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
846 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
847 			      &arg, arg.iov[0].iov_len,
848 			      transmit_time);
849 
850 	ctl_sk->sk_mark = 0;
851 	xfrm_sk_free_policy(ctl_sk);
852 	sock_net_set(ctl_sk, &init_net);
853 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
854 	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
855 	local_bh_enable();
856 
857 #ifdef CONFIG_TCP_MD5SIG
858 out:
859 	rcu_read_unlock();
860 #endif
861 }
862 
863 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
864    outside socket context, is certainly ugly. What can I do?
865  */
866 
867 static void tcp_v4_send_ack(const struct sock *sk,
868 			    struct sk_buff *skb, u32 seq, u32 ack,
869 			    u32 win, u32 tsval, u32 tsecr, int oif,
870 			    struct tcp_md5sig_key *key,
871 			    int reply_flags, u8 tos)
872 {
873 	const struct tcphdr *th = tcp_hdr(skb);
874 	struct {
875 		struct tcphdr th;
876 		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
877 #ifdef CONFIG_TCP_MD5SIG
878 			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
879 #endif
880 			];
881 	} rep;
882 	struct net *net = sock_net(sk);
883 	struct ip_reply_arg arg;
884 	struct sock *ctl_sk;
885 	u64 transmit_time;
886 
887 	memset(&rep.th, 0, sizeof(struct tcphdr));
888 	memset(&arg, 0, sizeof(arg));
889 
890 	arg.iov[0].iov_base = (unsigned char *)&rep;
891 	arg.iov[0].iov_len  = sizeof(rep.th);
892 	if (tsecr) {
893 		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
894 				   (TCPOPT_TIMESTAMP << 8) |
895 				   TCPOLEN_TIMESTAMP);
896 		rep.opt[1] = htonl(tsval);
897 		rep.opt[2] = htonl(tsecr);
898 		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
899 	}
900 
901 	/* Swap the send and the receive. */
902 	rep.th.dest    = th->source;
903 	rep.th.source  = th->dest;
904 	rep.th.doff    = arg.iov[0].iov_len / 4;
905 	rep.th.seq     = htonl(seq);
906 	rep.th.ack_seq = htonl(ack);
907 	rep.th.ack     = 1;
908 	rep.th.window  = htons(win);
909 
910 #ifdef CONFIG_TCP_MD5SIG
911 	if (key) {
912 		int offset = (tsecr) ? 3 : 0;
913 
914 		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
915 					  (TCPOPT_NOP << 16) |
916 					  (TCPOPT_MD5SIG << 8) |
917 					  TCPOLEN_MD5SIG);
918 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
919 		rep.th.doff = arg.iov[0].iov_len/4;
920 
921 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
922 				    key, ip_hdr(skb)->saddr,
923 				    ip_hdr(skb)->daddr, &rep.th);
924 	}
925 #endif
926 	arg.flags = reply_flags;
927 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
928 				      ip_hdr(skb)->saddr, /* XXX */
929 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
930 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
931 	if (oif)
932 		arg.bound_dev_if = oif;
933 	arg.tos = tos;
934 	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
935 	local_bh_disable();
936 	ctl_sk = this_cpu_read(ipv4_tcp_sk);
937 	sock_net_set(ctl_sk, net);
938 	ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
939 			   inet_twsk(sk)->tw_mark : sk->sk_mark;
940 	ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
941 			   inet_twsk(sk)->tw_priority : sk->sk_priority;
942 	transmit_time = tcp_transmit_time(sk);
943 	ip_send_unicast_reply(ctl_sk,
944 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
945 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
946 			      &arg, arg.iov[0].iov_len,
947 			      transmit_time);
948 
949 	ctl_sk->sk_mark = 0;
950 	sock_net_set(ctl_sk, &init_net);
951 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
952 	local_bh_enable();
953 }
954 
955 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
956 {
957 	struct inet_timewait_sock *tw = inet_twsk(sk);
958 	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
959 
960 	tcp_v4_send_ack(sk, skb,
961 			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
962 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
963 			tcp_time_stamp_raw() + tcptw->tw_ts_offset,
964 			tcptw->tw_ts_recent,
965 			tw->tw_bound_dev_if,
966 			tcp_twsk_md5_key(tcptw),
967 			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
968 			tw->tw_tos
969 			);
970 
971 	inet_twsk_put(tw);
972 }
973 
974 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
975 				  struct request_sock *req)
976 {
977 	const union tcp_md5_addr *addr;
978 	int l3index;
979 
980 	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
981 	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
982 	 */
983 	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
984 					     tcp_sk(sk)->snd_nxt;
985 
986 	/* RFC 7323 2.3
987 	 * The window field (SEG.WND) of every outgoing segment, with the
988 	 * exception of <SYN> segments, MUST be right-shifted by
989 	 * Rcv.Wind.Shift bits:
990 	 */
991 	addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
992 	l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
993 	tcp_v4_send_ack(sk, skb, seq,
994 			tcp_rsk(req)->rcv_nxt,
995 			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
996 			tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
997 			req->ts_recent,
998 			0,
999 			tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
1000 			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
1001 			ip_hdr(skb)->tos);
1002 }
1003 
1004 /*
1005  *	Send a SYN-ACK after having received a SYN.
1006  *	This still operates on a request_sock only, not on a big
1007  *	socket.
1008  */
1009 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
1010 			      struct flowi *fl,
1011 			      struct request_sock *req,
1012 			      struct tcp_fastopen_cookie *foc,
1013 			      enum tcp_synack_type synack_type,
1014 			      struct sk_buff *syn_skb)
1015 {
1016 	const struct inet_request_sock *ireq = inet_rsk(req);
1017 	struct flowi4 fl4;
1018 	int err = -1;
1019 	struct sk_buff *skb;
1020 	u8 tos;
1021 
1022 	/* First, grab a route. */
1023 	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
1024 		return -1;
1025 
1026 	skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
1027 
1028 	if (skb) {
1029 		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1030 
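		/* With sysctl_tcp_reflect_tos, reflect the DSCP from the SYN
		 * while keeping this socket's own ECN bits; otherwise use the
		 * listener's TOS unchanged.
		 */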
1031 		tos = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) ?
1032 				(tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1033 				(inet_sk(sk)->tos & INET_ECN_MASK) :
1034 				inet_sk(sk)->tos;
1035 
1036 		if (!INET_ECN_is_capable(tos) &&
1037 		    tcp_bpf_ca_needs_ecn((struct sock *)req))
1038 			tos |= INET_ECN_ECT_0;
1039 
1040 		rcu_read_lock();
1041 		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1042 					    ireq->ir_rmt_addr,
1043 					    rcu_dereference(ireq->ireq_opt),
1044 					    tos);
1045 		rcu_read_unlock();
1046 		err = net_xmit_eval(err);
1047 	}
1048 
1049 	return err;
1050 }
1051 
1052 /*
1053  *	IPv4 request_sock destructor.
1054  */
1055 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1056 {
1057 	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1058 }
1059 
1060 #ifdef CONFIG_TCP_MD5SIG
1061 /*
1062  * RFC2385 MD5 checksumming requires a mapping of
1063  * IP address->MD5 Key.
1064  * We need to maintain these in the sk structure.
1065  */
1066 
1067 DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
1068 EXPORT_SYMBOL(tcp_md5_needed);
1069 
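/* Key preference when several prefixes match: a key bound to an L3 device
 * (nonzero l3index) beats an unbound one; otherwise the longer prefix wins.
 */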
1070 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1071 {
1072 	if (!old)
1073 		return true;
1074 
1075 	/* l3index always overrides non-l3index */
1076 	if (old->l3index && new->l3index == 0)
1077 		return false;
1078 	if (old->l3index == 0 && new->l3index)
1079 		return true;
1080 
1081 	return old->prefixlen < new->prefixlen;
1082 }
1083 
1084 /* Find the Key structure for an address.  */
1085 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1086 					   const union tcp_md5_addr *addr,
1087 					   int family)
1088 {
1089 	const struct tcp_sock *tp = tcp_sk(sk);
1090 	struct tcp_md5sig_key *key;
1091 	const struct tcp_md5sig_info *md5sig;
1092 	__be32 mask;
1093 	struct tcp_md5sig_key *best_match = NULL;
1094 	bool match;
1095 
1096 	/* caller either holds rcu_read_lock() or socket lock */
1097 	md5sig = rcu_dereference_check(tp->md5sig_info,
1098 				       lockdep_sock_is_held(sk));
1099 	if (!md5sig)
1100 		return NULL;
1101 
1102 	hlist_for_each_entry_rcu(key, &md5sig->head, node,
1103 				 lockdep_sock_is_held(sk)) {
1104 		if (key->family != family)
1105 			continue;
1106 		if (key->flags & TCP_MD5SIG_FLAG_IFINDEX && key->l3index != l3index)
1107 			continue;
1108 		if (family == AF_INET) {
1109 			mask = inet_make_mask(key->prefixlen);
1110 			match = (key->addr.a4.s_addr & mask) ==
1111 				(addr->a4.s_addr & mask);
1112 #if IS_ENABLED(CONFIG_IPV6)
1113 		} else if (family == AF_INET6) {
1114 			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1115 						  key->prefixlen);
1116 #endif
1117 		} else {
1118 			match = false;
1119 		}
1120 
1121 		if (match && better_md5_match(best_match, key))
1122 			best_match = key;
1123 	}
1124 	return best_match;
1125 }
1126 EXPORT_SYMBOL(__tcp_md5_do_lookup);
1127 
1128 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1129 						      const union tcp_md5_addr *addr,
1130 						      int family, u8 prefixlen,
1131 						      int l3index, u8 flags)
1132 {
1133 	const struct tcp_sock *tp = tcp_sk(sk);
1134 	struct tcp_md5sig_key *key;
1135 	unsigned int size = sizeof(struct in_addr);
1136 	const struct tcp_md5sig_info *md5sig;
1137 
1138 	/* caller either holds rcu_read_lock() or socket lock */
1139 	md5sig = rcu_dereference_check(tp->md5sig_info,
1140 				       lockdep_sock_is_held(sk));
1141 	if (!md5sig)
1142 		return NULL;
1143 #if IS_ENABLED(CONFIG_IPV6)
1144 	if (family == AF_INET6)
1145 		size = sizeof(struct in6_addr);
1146 #endif
1147 	hlist_for_each_entry_rcu(key, &md5sig->head, node,
1148 				 lockdep_sock_is_held(sk)) {
1149 		if (key->family != family)
1150 			continue;
1151 		if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
1152 			continue;
1153 		if (key->l3index != l3index)
1154 			continue;
1155 		if (!memcmp(&key->addr, addr, size) &&
1156 		    key->prefixlen == prefixlen)
1157 			return key;
1158 	}
1159 	return NULL;
1160 }
1161 
1162 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1163 					 const struct sock *addr_sk)
1164 {
1165 	const union tcp_md5_addr *addr;
1166 	int l3index;
1167 
1168 	l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1169 						 addr_sk->sk_bound_dev_if);
1170 	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1171 	return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1172 }
1173 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1174 
1175 /* This can be called on a newly created socket, from other files */
1176 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1177 		   int family, u8 prefixlen, int l3index, u8 flags,
1178 		   const u8 *newkey, u8 newkeylen, gfp_t gfp)
1179 {
1180 	/* Add Key to the list */
1181 	struct tcp_md5sig_key *key;
1182 	struct tcp_sock *tp = tcp_sk(sk);
1183 	struct tcp_md5sig_info *md5sig;
1184 
1185 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1186 	if (key) {
1187 		/* Pre-existing entry - just update that one.
1188 		 * Note that the key might be used concurrently.
1189 		 * data_race() tells KCSAN that we do not care about
1190 		 * key mismatches, since changing the MD5 key on live flows
1191 		 * can lead to packet drops.
1192 		 */
1193 		data_race(memcpy(key->key, newkey, newkeylen));
1194 
1195 		/* Pairs with READ_ONCE() in tcp_md5_hash_key().
1196 		 * Also note that a reader could catch the new key->keylen value
1197 		 * but the old key->key[]; this is the reason we use __GFP_ZERO
1198 		 * at sock_kmalloc() time below these lines.
1199 		 */
1200 		WRITE_ONCE(key->keylen, newkeylen);
1201 
1202 		return 0;
1203 	}
1204 
1205 	md5sig = rcu_dereference_protected(tp->md5sig_info,
1206 					   lockdep_sock_is_held(sk));
1207 	if (!md5sig) {
1208 		md5sig = kmalloc(sizeof(*md5sig), gfp);
1209 		if (!md5sig)
1210 			return -ENOMEM;
1211 
1212 		sk_gso_disable(sk);
1213 		INIT_HLIST_HEAD(&md5sig->head);
1214 		rcu_assign_pointer(tp->md5sig_info, md5sig);
1215 	}
1216 
1217 	key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1218 	if (!key)
1219 		return -ENOMEM;
1220 	if (!tcp_alloc_md5sig_pool()) {
1221 		sock_kfree_s(sk, key, sizeof(*key));
1222 		return -ENOMEM;
1223 	}
1224 
1225 	memcpy(key->key, newkey, newkeylen);
1226 	key->keylen = newkeylen;
1227 	key->family = family;
1228 	key->prefixlen = prefixlen;
1229 	key->l3index = l3index;
1230 	key->flags = flags;
1231 	memcpy(&key->addr, addr,
1232 	       (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) :
1233 								 sizeof(struct in_addr));
1234 	hlist_add_head_rcu(&key->node, &md5sig->head);
1235 	return 0;
1236 }
1237 EXPORT_SYMBOL(tcp_md5_do_add);
1238 
1239 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1240 		   u8 prefixlen, int l3index, u8 flags)
1241 {
1242 	struct tcp_md5sig_key *key;
1243 
1244 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1245 	if (!key)
1246 		return -ENOENT;
1247 	hlist_del_rcu(&key->node);
1248 	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1249 	kfree_rcu(key, rcu);
1250 	return 0;
1251 }
1252 EXPORT_SYMBOL(tcp_md5_do_del);
1253 
1254 static void tcp_clear_md5_list(struct sock *sk)
1255 {
1256 	struct tcp_sock *tp = tcp_sk(sk);
1257 	struct tcp_md5sig_key *key;
1258 	struct hlist_node *n;
1259 	struct tcp_md5sig_info *md5sig;
1260 
1261 	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1262 
1263 	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1264 		hlist_del_rcu(&key->node);
1265 		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1266 		kfree_rcu(key, rcu);
1267 	}
1268 }
1269 
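/* setsockopt(TCP_MD5SIG / TCP_MD5SIG_EXT) handler. A rough userspace
 * sketch of adding a key for a peer (illustrative only; peer_sin and
 * secret are placeholders):
 *
 *	struct tcp_md5sig md5 = { .tcpm_keylen = 16 };
 *	memcpy(&md5.tcpm_addr, &peer_sin, sizeof(peer_sin));
 *	memcpy(md5.tcpm_key, secret, 16);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 */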
1270 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1271 				 sockptr_t optval, int optlen)
1272 {
1273 	struct tcp_md5sig cmd;
1274 	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1275 	const union tcp_md5_addr *addr;
1276 	u8 prefixlen = 32;
1277 	int l3index = 0;
1278 	u8 flags;
1279 
1280 	if (optlen < sizeof(cmd))
1281 		return -EINVAL;
1282 
1283 	if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1284 		return -EFAULT;
1285 
1286 	if (sin->sin_family != AF_INET)
1287 		return -EINVAL;
1288 
1289 	flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1290 
1291 	if (optname == TCP_MD5SIG_EXT &&
1292 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1293 		prefixlen = cmd.tcpm_prefixlen;
1294 		if (prefixlen > 32)
1295 			return -EINVAL;
1296 	}
1297 
1298 	if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
1299 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1300 		struct net_device *dev;
1301 
1302 		rcu_read_lock();
1303 		dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1304 		if (dev && netif_is_l3_master(dev))
1305 			l3index = dev->ifindex;
1306 
1307 		rcu_read_unlock();
1308 
1309 		/* ok to reference set/not set outside of rcu;
1310 		 * right now device MUST be an L3 master
1311 		 */
1312 		if (!dev || !l3index)
1313 			return -EINVAL;
1314 	}
1315 
1316 	addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1317 
1318 	if (!cmd.tcpm_keylen)
1319 		return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);
1320 
1321 	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1322 		return -EINVAL;
1323 
1324 	return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
1325 			      cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
1326 }
1327 
1328 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1329 				   __be32 daddr, __be32 saddr,
1330 				   const struct tcphdr *th, int nbytes)
1331 {
1332 	struct tcp4_pseudohdr *bp;
1333 	struct scatterlist sg;
1334 	struct tcphdr *_th;
1335 
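	/* Build the IPv4 pseudo-header followed by a copy of the TCP header
	 * (checksum zeroed) in the per-cpu scratch buffer and feed both to
	 * the ahash request in one scatterlist entry.
	 */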
1336 	bp = hp->scratch;
1337 	bp->saddr = saddr;
1338 	bp->daddr = daddr;
1339 	bp->pad = 0;
1340 	bp->protocol = IPPROTO_TCP;
1341 	bp->len = cpu_to_be16(nbytes);
1342 
1343 	_th = (struct tcphdr *)(bp + 1);
1344 	memcpy(_th, th, sizeof(*th));
1345 	_th->check = 0;
1346 
1347 	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1348 	ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1349 				sizeof(*bp) + sizeof(*th));
1350 	return crypto_ahash_update(hp->md5_req);
1351 }
1352 
1353 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1354 			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
1355 {
1356 	struct tcp_md5sig_pool *hp;
1357 	struct ahash_request *req;
1358 
1359 	hp = tcp_get_md5sig_pool();
1360 	if (!hp)
1361 		goto clear_hash_noput;
1362 	req = hp->md5_req;
1363 
1364 	if (crypto_ahash_init(req))
1365 		goto clear_hash;
1366 	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1367 		goto clear_hash;
1368 	if (tcp_md5_hash_key(hp, key))
1369 		goto clear_hash;
1370 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1371 	if (crypto_ahash_final(req))
1372 		goto clear_hash;
1373 
1374 	tcp_put_md5sig_pool();
1375 	return 0;
1376 
1377 clear_hash:
1378 	tcp_put_md5sig_pool();
1379 clear_hash_noput:
1380 	memset(md5_hash, 0, 16);
1381 	return 1;
1382 }
1383 
1384 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1385 			const struct sock *sk,
1386 			const struct sk_buff *skb)
1387 {
1388 	struct tcp_md5sig_pool *hp;
1389 	struct ahash_request *req;
1390 	const struct tcphdr *th = tcp_hdr(skb);
1391 	__be32 saddr, daddr;
1392 
1393 	if (sk) { /* valid for establish/request sockets */
1394 		saddr = sk->sk_rcv_saddr;
1395 		daddr = sk->sk_daddr;
1396 	} else {
1397 		const struct iphdr *iph = ip_hdr(skb);
1398 		saddr = iph->saddr;
1399 		daddr = iph->daddr;
1400 	}
1401 
1402 	hp = tcp_get_md5sig_pool();
1403 	if (!hp)
1404 		goto clear_hash_noput;
1405 	req = hp->md5_req;
1406 
1407 	if (crypto_ahash_init(req))
1408 		goto clear_hash;
1409 
1410 	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1411 		goto clear_hash;
1412 	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1413 		goto clear_hash;
1414 	if (tcp_md5_hash_key(hp, key))
1415 		goto clear_hash;
1416 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1417 	if (crypto_ahash_final(req))
1418 		goto clear_hash;
1419 
1420 	tcp_put_md5sig_pool();
1421 	return 0;
1422 
1423 clear_hash:
1424 	tcp_put_md5sig_pool();
1425 clear_hash_noput:
1426 	memset(md5_hash, 0, 16);
1427 	return 1;
1428 }
1429 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1430 
1431 #endif
1432 
1433 static void tcp_v4_init_req(struct request_sock *req,
1434 			    const struct sock *sk_listener,
1435 			    struct sk_buff *skb)
1436 {
1437 	struct inet_request_sock *ireq = inet_rsk(req);
1438 	struct net *net = sock_net(sk_listener);
1439 
1440 	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1441 	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1442 	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1443 }
1444 
1445 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1446 					  struct sk_buff *skb,
1447 					  struct flowi *fl,
1448 					  struct request_sock *req)
1449 {
1450 	tcp_v4_init_req(req, sk, skb);
1451 
1452 	if (security_inet_conn_request(sk, skb, req))
1453 		return NULL;
1454 
1455 	return inet_csk_route_req(sk, &fl->u.ip4, req);
1456 }
1457 
1458 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1459 	.family		=	PF_INET,
1460 	.obj_size	=	sizeof(struct tcp_request_sock),
1461 	.rtx_syn_ack	=	tcp_rtx_synack,
1462 	.send_ack	=	tcp_v4_reqsk_send_ack,
1463 	.destructor	=	tcp_v4_reqsk_destructor,
1464 	.send_reset	=	tcp_v4_send_reset,
1465 	.syn_ack_timeout =	tcp_syn_ack_timeout,
1466 };
1467 
1468 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1469 	.mss_clamp	=	TCP_MSS_DEFAULT,
1470 #ifdef CONFIG_TCP_MD5SIG
1471 	.req_md5_lookup	=	tcp_v4_md5_lookup,
1472 	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1473 #endif
1474 #ifdef CONFIG_SYN_COOKIES
1475 	.cookie_init_seq =	cookie_v4_init_sequence,
1476 #endif
1477 	.route_req	=	tcp_v4_route_req,
1478 	.init_seq	=	tcp_v4_init_seq,
1479 	.init_ts_off	=	tcp_v4_init_ts_off,
1480 	.send_synack	=	tcp_v4_send_synack,
1481 };
1482 
1483 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1484 {
1485 	/* Never answer SYNs sent to broadcast or multicast addresses */
1486 	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1487 		goto drop;
1488 
1489 	return tcp_conn_request(&tcp_request_sock_ops,
1490 				&tcp_request_sock_ipv4_ops, sk, skb);
1491 
1492 drop:
1493 	tcp_listendrop(sk);
1494 	return 0;
1495 }
1496 EXPORT_SYMBOL(tcp_v4_conn_request);
1497 
1498 
1499 /*
1500  * The three way handshake has completed - we got a valid synack -
1501  * now create the new socket.
1502  */
1503 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1504 				  struct request_sock *req,
1505 				  struct dst_entry *dst,
1506 				  struct request_sock *req_unhash,
1507 				  bool *own_req)
1508 {
1509 	struct inet_request_sock *ireq;
1510 	bool found_dup_sk = false;
1511 	struct inet_sock *newinet;
1512 	struct tcp_sock *newtp;
1513 	struct sock *newsk;
1514 #ifdef CONFIG_TCP_MD5SIG
1515 	const union tcp_md5_addr *addr;
1516 	struct tcp_md5sig_key *key;
1517 	int l3index;
1518 #endif
1519 	struct ip_options_rcu *inet_opt;
1520 
1521 	if (sk_acceptq_is_full(sk))
1522 		goto exit_overflow;
1523 
1524 	newsk = tcp_create_openreq_child(sk, req, skb);
1525 	if (!newsk)
1526 		goto exit_nonewsk;
1527 
1528 	newsk->sk_gso_type = SKB_GSO_TCPV4;
1529 	inet_sk_rx_dst_set(newsk, skb);
1530 
1531 	newtp		      = tcp_sk(newsk);
1532 	newinet		      = inet_sk(newsk);
1533 	ireq		      = inet_rsk(req);
1534 	sk_daddr_set(newsk, ireq->ir_rmt_addr);
1535 	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1536 	newsk->sk_bound_dev_if = ireq->ir_iif;
1537 	newinet->inet_saddr   = ireq->ir_loc_addr;
1538 	inet_opt	      = rcu_dereference(ireq->ireq_opt);
1539 	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1540 	newinet->mc_index     = inet_iif(skb);
1541 	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1542 	newinet->rcv_tos      = ip_hdr(skb)->tos;
1543 	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1544 	if (inet_opt)
1545 		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1546 	newinet->inet_id = get_random_u16();
1547 
1548 	/* Set ToS of the new socket based upon the value of incoming SYN.
1549 	 * ECT bits are set later in tcp_init_transfer().
1550 	 */
1551 	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1552 		newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1553 
1554 	if (!dst) {
1555 		dst = inet_csk_route_child_sock(sk, newsk, req);
1556 		if (!dst)
1557 			goto put_and_exit;
1558 	} else {
1559 		/* syncookie case : see end of cookie_v4_check() */
1560 	}
1561 	sk_setup_caps(newsk, dst);
1562 
1563 	tcp_ca_openreq_child(newsk, dst);
1564 
1565 	tcp_sync_mss(newsk, dst_mtu(dst));
1566 	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1567 
1568 	tcp_initialize_rcv_mss(newsk);
1569 
1570 #ifdef CONFIG_TCP_MD5SIG
1571 	l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1572 	/* Copy over the MD5 key from the original socket */
1573 	addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1574 	key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1575 	if (key) {
1576 		/*
1577 		 * We're using one, so create a matching key
1578 		 * on the newsk structure. If we fail to get
1579 		 * memory, then we end up not copying the key
1580 		 * across. Shucks.
1581 		 */
1582 		tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index, key->flags,
1583 			       key->key, key->keylen, GFP_ATOMIC);
1584 		sk_gso_disable(newsk);
1585 	}
1586 #endif
1587 
1588 	if (__inet_inherit_port(sk, newsk) < 0)
1589 		goto put_and_exit;
1590 	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1591 				       &found_dup_sk);
1592 	if (likely(*own_req)) {
1593 		tcp_move_syn(newtp, req);
1594 		ireq->ireq_opt = NULL;
1595 	} else {
1596 		newinet->inet_opt = NULL;
1597 
1598 		if (!req_unhash && found_dup_sk) {
1599 			/* This code path should be executed only in the
1600 			 * syncookie case
1601 			 */
1602 			bh_unlock_sock(newsk);
1603 			sock_put(newsk);
1604 			newsk = NULL;
1605 		}
1606 	}
1607 	return newsk;
1608 
1609 exit_overflow:
1610 	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1611 exit_nonewsk:
1612 	dst_release(dst);
1613 exit:
1614 	tcp_listendrop(sk);
1615 	return NULL;
1616 put_and_exit:
1617 	newinet->inet_opt = NULL;
1618 	inet_csk_prepare_forced_close(newsk);
1619 	tcp_done(newsk);
1620 	goto exit;
1621 }
1622 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1623 
1624 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1625 {
1626 #ifdef CONFIG_SYN_COOKIES
1627 	const struct tcphdr *th = tcp_hdr(skb);
1628 
1629 	if (!th->syn)
1630 		sk = cookie_v4_check(sk, skb);
1631 #endif
1632 	return sk;
1633 }
1634 
1635 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1636 			 struct tcphdr *th, u32 *cookie)
1637 {
1638 	u16 mss = 0;
1639 #ifdef CONFIG_SYN_COOKIES
1640 	mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1641 				    &tcp_request_sock_ipv4_ops, sk, th);
1642 	if (mss) {
1643 		*cookie = __cookie_v4_init_sequence(iph, th, &mss);
1644 		tcp_synq_overflow(sk);
1645 	}
1646 #endif
1647 	return mss;
1648 }
1649 
1650 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
1651 							   u32));
1652 /* The socket must have its spinlock held when we get
1653  * here, unless it is a TCP_LISTEN socket.
1654  *
1655  * We have a potential double-lock case here, so even when
1656  * doing backlog processing we use the BH locking scheme.
1657  * This is because we cannot sleep with the original spinlock
1658  * held.
1659  */
1660 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1661 {
1662 	enum skb_drop_reason reason;
1663 	struct sock *rsk;
1664 
1665 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1666 		struct dst_entry *dst;
1667 
1668 		dst = rcu_dereference_protected(sk->sk_rx_dst,
1669 						lockdep_sock_is_held(sk));
1670 
1671 		sock_rps_save_rxhash(sk, skb);
1672 		sk_mark_napi_id(sk, skb);
1673 		if (dst) {
1674 			if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
1675 			    !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
1676 					     dst, 0)) {
1677 				RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
1678 				dst_release(dst);
1679 			}
1680 		}
1681 		tcp_rcv_established(sk, skb);
1682 		return 0;
1683 	}
1684 
1685 	reason = SKB_DROP_REASON_NOT_SPECIFIED;
1686 	if (tcp_checksum_complete(skb))
1687 		goto csum_err;
1688 
1689 	if (sk->sk_state == TCP_LISTEN) {
1690 		struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1691 
1692 		if (!nsk)
1693 			goto discard;
1694 		if (nsk != sk) {
1695 			if (tcp_child_process(sk, nsk, skb)) {
1696 				rsk = nsk;
1697 				goto reset;
1698 			}
1699 			return 0;
1700 		}
1701 	} else
1702 		sock_rps_save_rxhash(sk, skb);
1703 
1704 	if (tcp_rcv_state_process(sk, skb)) {
1705 		rsk = sk;
1706 		goto reset;
1707 	}
1708 	return 0;
1709 
1710 reset:
1711 	tcp_v4_send_reset(rsk, skb);
1712 discard:
1713 	kfree_skb_reason(skb, reason);
1714 	/* Be careful here. If this function gets more complicated and
1715 	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1716 	 * might be destroyed here. This current version compiles correctly,
1717 	 * but you have been warned.
1718 	 */
1719 	return 0;
1720 
1721 csum_err:
1722 	reason = SKB_DROP_REASON_TCP_CSUM;
1723 	trace_tcp_bad_csum(skb);
1724 	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1725 	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1726 	goto discard;
1727 }
1728 EXPORT_SYMBOL(tcp_v4_do_rcv);
1729 
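/* Early demux: look up an established socket straight from the IP/TCP
 * headers so skb->sk and, when still valid, the socket's cached rx dst
 * can be attached before the normal receive path runs.
 */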
1730 int tcp_v4_early_demux(struct sk_buff *skb)
1731 {
1732 	struct net *net = dev_net(skb->dev);
1733 	const struct iphdr *iph;
1734 	const struct tcphdr *th;
1735 	struct sock *sk;
1736 
1737 	if (skb->pkt_type != PACKET_HOST)
1738 		return 0;
1739 
1740 	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1741 		return 0;
1742 
1743 	iph = ip_hdr(skb);
1744 	th = tcp_hdr(skb);
1745 
1746 	if (th->doff < sizeof(struct tcphdr) / 4)
1747 		return 0;
1748 
1749 	sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
1750 				       iph->saddr, th->source,
1751 				       iph->daddr, ntohs(th->dest),
1752 				       skb->skb_iif, inet_sdif(skb));
1753 	if (sk) {
1754 		skb->sk = sk;
1755 		skb->destructor = sock_edemux;
1756 		if (sk_fullsock(sk)) {
1757 			struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
1758 
1759 			if (dst)
1760 				dst = dst_check(dst, 0);
1761 			if (dst &&
1762 			    sk->sk_rx_dst_ifindex == skb->skb_iif)
1763 				skb_dst_set_noref(skb, dst);
1764 		}
1765 	}
1766 	return 0;
1767 }
1768 
1769 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
1770 		     enum skb_drop_reason *reason)
1771 {
1772 	u32 limit, tail_gso_size, tail_gso_segs;
1773 	struct skb_shared_info *shinfo;
1774 	const struct tcphdr *th;
1775 	struct tcphdr *thtail;
1776 	struct sk_buff *tail;
1777 	unsigned int hdrlen;
1778 	bool fragstolen;
1779 	u32 gso_segs;
1780 	u32 gso_size;
1781 	int delta;
1782 
1783 	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1784 	 * we can fix skb->truesize to its real value to avoid future drops.
1785 	 * This is valid because skb is not yet charged to the socket.
1786 	 * It has been noticed that pure SACK packets were sometimes dropped
1787 	 * (if cooked by drivers without the copybreak feature).
1788 	 */
1789 	skb_condense(skb);
1790 
1791 	skb_dst_drop(skb);
1792 
1793 	if (unlikely(tcp_checksum_complete(skb))) {
1794 		bh_unlock_sock(sk);
1795 		trace_tcp_bad_csum(skb);
1796 		*reason = SKB_DROP_REASON_TCP_CSUM;
1797 		__TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1798 		__TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1799 		return true;
1800 	}
1801 
1802 	/* Attempt coalescing to last skb in backlog, even if we are
1803 	 * above the limits.
1804 	 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1805 	 */
1806 	th = (const struct tcphdr *)skb->data;
1807 	hdrlen = th->doff * 4;
1808 
1809 	tail = sk->sk_backlog.tail;
1810 	if (!tail)
1811 		goto no_coalesce;
1812 	thtail = (struct tcphdr *)tail->data;
1813 
1814 	if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1815 	    TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1816 	    ((TCP_SKB_CB(tail)->tcp_flags |
1817 	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1818 	    !((TCP_SKB_CB(tail)->tcp_flags &
1819 	      TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1820 	    ((TCP_SKB_CB(tail)->tcp_flags ^
1821 	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1822 #ifdef CONFIG_TLS_DEVICE
1823 	    tail->decrypted != skb->decrypted ||
1824 #endif
1825 	    thtail->doff != th->doff ||
1826 	    memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1827 		goto no_coalesce;
1828 
1829 	__skb_pull(skb, hdrlen);
1830 
1831 	shinfo = skb_shinfo(skb);
1832 	gso_size = shinfo->gso_size ?: skb->len;
1833 	gso_segs = shinfo->gso_segs ?: 1;
1834 
1835 	shinfo = skb_shinfo(tail);
1836 	tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
1837 	tail_gso_segs = shinfo->gso_segs ?: 1;
1838 
1839 	if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1840 		TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1841 
1842 		if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
1843 			TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1844 			thtail->window = th->window;
1845 		}
1846 
1847 		/* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1848 		 * thtail->fin, so that the fast path in tcp_rcv_established()
1849 		 * is not entered if we append a packet with a FIN.
1850 		 * SYN, RST, URG are not present.
1851 		 * ACK is set on both packets.
1852 		 * PSH : we do not really care about it in the TCP stack,
1853 		 *       at least for 'GRO' packets.
1854 		 */
1855 		thtail->fin |= th->fin;
1856 		TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1857 
1858 		if (TCP_SKB_CB(skb)->has_rxtstamp) {
1859 			TCP_SKB_CB(tail)->has_rxtstamp = true;
1860 			tail->tstamp = skb->tstamp;
1861 			skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1862 		}
1863 
1864 		/* Not as strict as GRO. We only need to carry mss max value */
1865 		shinfo->gso_size = max(gso_size, tail_gso_size);
1866 		shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
1867 
1868 		sk->sk_backlog.len += delta;
1869 		__NET_INC_STATS(sock_net(sk),
1870 				LINUX_MIB_TCPBACKLOGCOALESCE);
1871 		kfree_skb_partial(skb, fragstolen);
1872 		return false;
1873 	}
1874 	__skb_push(skb, hdrlen);
1875 
1876 no_coalesce:
1877 	limit = (u32)READ_ONCE(sk->sk_rcvbuf) + (u32)(READ_ONCE(sk->sk_sndbuf) >> 1);
1878 
1879 	/* Only the socket owner can try to collapse/prune rx queues
1880 	 * to reduce memory overhead, so add a little headroom here.
1881 	 * Only a few socket backlogs are likely to be non-empty at any given time.
1882 	 */
1883 	limit += 64 * 1024;
1884 
1885 	if (unlikely(sk_add_backlog(sk, skb, limit))) {
1886 		bh_unlock_sock(sk);
1887 		*reason = SKB_DROP_REASON_SOCKET_BACKLOG;
1888 		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1889 		return true;
1890 	}
1891 	return false;
1892 }
1893 EXPORT_SYMBOL(tcp_add_backlog);
1894 
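/* Worked example for the backlog limit computed in tcp_add_backlog()
 * (illustrative values only, assuming sk_rcvbuf = 131072 and
 * sk_sndbuf = 16384, i.e. common tcp_rmem/tcp_wmem defaults):
 *
 *	limit = 131072 + (16384 >> 1) + 64 * 1024
 *	      = 131072 + 8192 + 65536 = 204800 bytes
 *
 * Once sk_backlog.len would exceed this, sk_add_backlog() fails and the
 * skb is dropped with SKB_DROP_REASON_SOCKET_BACKLOG.
 */
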
1895 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1896 {
1897 	struct tcphdr *th = (struct tcphdr *)skb->data;
1898 
1899 	return sk_filter_trim_cap(sk, skb, th->doff * 4);
1900 }
1901 EXPORT_SYMBOL(tcp_filter);
1902 
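/* tcp_filter() above simply runs the socket filter (if one was attached,
 * e.g. via SO_ATTACH_FILTER or SO_ATTACH_BPF) through
 * sk_filter_trim_cap(), with a trim cap of th->doff * 4 so that an
 * accepted packet always keeps at least its full TCP header.
 */
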
1903 static void tcp_v4_restore_cb(struct sk_buff *skb)
1904 {
1905 	memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1906 		sizeof(struct inet_skb_parm));
1907 }
1908 
1909 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1910 			   const struct tcphdr *th)
1911 {
1912 	/* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
1913 	 * barrier() makes sure the compiler won't play aliasing games.
1914 	 */
1915 	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1916 		sizeof(struct inet_skb_parm));
1917 	barrier();
1918 
1919 	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1920 	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1921 				    skb->len - th->doff * 4);
1922 	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1923 	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1924 	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1925 	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1926 	TCP_SKB_CB(skb)->sacked	 = 0;
1927 	TCP_SKB_CB(skb)->has_rxtstamp =
1928 			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1929 }
1930 
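/* Quick sanity check of the end_seq arithmetic in tcp_v4_fill_cb():
 * a pure SYN (no payload) yields end_seq = seq + 1, and a segment
 * carrying 1000 bytes of payload plus a FIN yields
 * end_seq = seq + 1000 + 1, matching the sequence space consumed by
 * the SYN/FIN flags.
 */
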
1931 /*
1932  *	From tcp_input.c
1933  */
1934 
1935 int tcp_v4_rcv(struct sk_buff *skb)
1936 {
1937 	struct net *net = dev_net(skb->dev);
1938 	enum skb_drop_reason drop_reason;
1939 	int sdif = inet_sdif(skb);
1940 	int dif = inet_iif(skb);
1941 	const struct iphdr *iph;
1942 	const struct tcphdr *th;
1943 	bool refcounted;
1944 	struct sock *sk;
1945 	int ret;
1946 
1947 	drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
1948 	if (skb->pkt_type != PACKET_HOST)
1949 		goto discard_it;
1950 
1951 	/* Count it even if it's bad */
1952 	__TCP_INC_STATS(net, TCP_MIB_INSEGS);
1953 
1954 	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1955 		goto discard_it;
1956 
1957 	th = (const struct tcphdr *)skb->data;
1958 
1959 	if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
1960 		drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
1961 		goto bad_packet;
1962 	}
1963 	if (!pskb_may_pull(skb, th->doff * 4))
1964 		goto discard_it;
1965 
1966 	/* An explanation is required here, I think.
1967 	 * Packet length and doff are validated by header prediction,
1968 	 * provided the case of th->doff == 0 has been eliminated above.
1969 	 * So, we defer the remaining checks. */
1970 
1971 	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1972 		goto csum_error;
1973 
1974 	th = (const struct tcphdr *)skb->data;
1975 	iph = ip_hdr(skb);
1976 lookup:
1977 	sk = __inet_lookup_skb(net->ipv4.tcp_death_row.hashinfo,
1978 			       skb, __tcp_hdrlen(th), th->source,
1979 			       th->dest, sdif, &refcounted);
1980 	if (!sk)
1981 		goto no_tcp_socket;
1982 
1983 process:
1984 	if (sk->sk_state == TCP_TIME_WAIT)
1985 		goto do_time_wait;
1986 
1987 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
1988 		struct request_sock *req = inet_reqsk(sk);
1989 		bool req_stolen = false;
1990 		struct sock *nsk;
1991 
1992 		sk = req->rsk_listener;
1993 		if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1994 			drop_reason = SKB_DROP_REASON_XFRM_POLICY;
1995 		else
1996 			drop_reason = tcp_inbound_md5_hash(sk, skb,
1997 						   &iph->saddr, &iph->daddr,
1998 						   AF_INET, dif, sdif);
1999 		if (unlikely(drop_reason)) {
2000 			sk_drops_add(sk, skb);
2001 			reqsk_put(req);
2002 			goto discard_it;
2003 		}
2004 		if (tcp_checksum_complete(skb)) {
2005 			reqsk_put(req);
2006 			goto csum_error;
2007 		}
2008 		if (unlikely(sk->sk_state != TCP_LISTEN)) {
2009 			nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
2010 			if (!nsk) {
2011 				inet_csk_reqsk_queue_drop_and_put(sk, req);
2012 				goto lookup;
2013 			}
2014 			sk = nsk;
2015 			/* reuseport_migrate_sock() has already held one sk_refcnt
2016 			 * before returning.
2017 			 */
2018 		} else {
2019 			/* We own a reference on the listener, increase it again
2020 			 * as we might lose it too soon.
2021 			 */
2022 			sock_hold(sk);
2023 		}
2024 		refcounted = true;
2025 		nsk = NULL;
2026 		if (!tcp_filter(sk, skb)) {
2027 			th = (const struct tcphdr *)skb->data;
2028 			iph = ip_hdr(skb);
2029 			tcp_v4_fill_cb(skb, iph, th);
2030 			nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
2031 		} else {
2032 			drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2033 		}
2034 		if (!nsk) {
2035 			reqsk_put(req);
2036 			if (req_stolen) {
2037 				/* Another cpu got exclusive access to req
2038 				 * and created a full blown socket.
2039 				 * Try to feed this packet to this socket
2040 				 * instead of discarding it.
2041 				 */
2042 				tcp_v4_restore_cb(skb);
2043 				sock_put(sk);
2044 				goto lookup;
2045 			}
2046 			goto discard_and_relse;
2047 		}
2048 		nf_reset_ct(skb);
2049 		if (nsk == sk) {
2050 			reqsk_put(req);
2051 			tcp_v4_restore_cb(skb);
2052 		} else if (tcp_child_process(sk, nsk, skb)) {
2053 			tcp_v4_send_reset(nsk, skb);
2054 			goto discard_and_relse;
2055 		} else {
2056 			sock_put(sk);
2057 			return 0;
2058 		}
2059 	}
2060 
2061 	if (static_branch_unlikely(&ip4_min_ttl)) {
2062 		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
2063 		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
2064 			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2065 			goto discard_and_relse;
2066 		}
2067 	}
2068 
2069 	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
2070 		drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2071 		goto discard_and_relse;
2072 	}
2073 
2074 	drop_reason = tcp_inbound_md5_hash(sk, skb, &iph->saddr,
2075 					   &iph->daddr, AF_INET, dif, sdif);
2076 	if (drop_reason)
2077 		goto discard_and_relse;
2078 
2079 	nf_reset_ct(skb);
2080 
2081 	if (tcp_filter(sk, skb)) {
2082 		drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2083 		goto discard_and_relse;
2084 	}
2085 	th = (const struct tcphdr *)skb->data;
2086 	iph = ip_hdr(skb);
2087 	tcp_v4_fill_cb(skb, iph, th);
2088 
2089 	skb->dev = NULL;
2090 
2091 	if (sk->sk_state == TCP_LISTEN) {
2092 		ret = tcp_v4_do_rcv(sk, skb);
2093 		goto put_and_return;
2094 	}
2095 
2096 	sk_incoming_cpu_update(sk);
2097 
2098 	bh_lock_sock_nested(sk);
2099 	tcp_segs_in(tcp_sk(sk), skb);
2100 	ret = 0;
2101 	if (!sock_owned_by_user(sk)) {
2102 		ret = tcp_v4_do_rcv(sk, skb);
2103 	} else {
2104 		if (tcp_add_backlog(sk, skb, &drop_reason))
2105 			goto discard_and_relse;
2106 	}
2107 	bh_unlock_sock(sk);
2108 
2109 put_and_return:
2110 	if (refcounted)
2111 		sock_put(sk);
2112 
2113 	return ret;
2114 
2115 no_tcp_socket:
2116 	drop_reason = SKB_DROP_REASON_NO_SOCKET;
2117 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2118 		goto discard_it;
2119 
2120 	tcp_v4_fill_cb(skb, iph, th);
2121 
2122 	if (tcp_checksum_complete(skb)) {
2123 csum_error:
2124 		drop_reason = SKB_DROP_REASON_TCP_CSUM;
2125 		trace_tcp_bad_csum(skb);
2126 		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2127 bad_packet:
2128 		__TCP_INC_STATS(net, TCP_MIB_INERRS);
2129 	} else {
2130 		tcp_v4_send_reset(NULL, skb);
2131 	}
2132 
2133 discard_it:
2134 	SKB_DR_OR(drop_reason, NOT_SPECIFIED);
2135 	/* Discard frame. */
2136 	kfree_skb_reason(skb, drop_reason);
2137 	return 0;
2138 
2139 discard_and_relse:
2140 	sk_drops_add(sk, skb);
2141 	if (refcounted)
2142 		sock_put(sk);
2143 	goto discard_it;
2144 
2145 do_time_wait:
2146 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2147 		drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2148 		inet_twsk_put(inet_twsk(sk));
2149 		goto discard_it;
2150 	}
2151 
2152 	tcp_v4_fill_cb(skb, iph, th);
2153 
2154 	if (tcp_checksum_complete(skb)) {
2155 		inet_twsk_put(inet_twsk(sk));
2156 		goto csum_error;
2157 	}
2158 	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2159 	case TCP_TW_SYN: {
2160 		struct sock *sk2 = inet_lookup_listener(net,
2161 							net->ipv4.tcp_death_row.hashinfo,
2162 							skb, __tcp_hdrlen(th),
2163 							iph->saddr, th->source,
2164 							iph->daddr, th->dest,
2165 							inet_iif(skb),
2166 							sdif);
2167 		if (sk2) {
2168 			inet_twsk_deschedule_put(inet_twsk(sk));
2169 			sk = sk2;
2170 			tcp_v4_restore_cb(skb);
2171 			refcounted = false;
2172 			goto process;
2173 		}
2174 	}
2175 		/* to ACK */
2176 		fallthrough;
2177 	case TCP_TW_ACK:
2178 		tcp_v4_timewait_ack(sk, skb);
2179 		break;
2180 	case TCP_TW_RST:
2181 		tcp_v4_send_reset(sk, skb);
2182 		inet_twsk_deschedule_put(inet_twsk(sk));
2183 		goto discard_it;
2184 	case TCP_TW_SUCCESS:;
2185 	}
2186 	goto discard_it;
2187 }
2188 
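/* Rough control flow of tcp_v4_rcv() above, as a reading aid:
 *
 *  1. basic sanity checks (pkt_type, header length, checksum init)
 *  2. socket lookup via __inet_lookup_skb()
 *  3. TCP_TIME_WAIT     -> do_time_wait / tcp_timewait_state_process()
 *     TCP_NEW_SYN_RECV  -> tcp_check_req() against the listener
 *  4. per-socket checks: min TTL, xfrm policy, TCP-MD5, socket filter
 *  5. if the socket is not owned by user context, tcp_v4_do_rcv() runs
 *     directly under the bh lock; otherwise the skb is queued with
 *     tcp_add_backlog() and processed later by the owner.
 */
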
2189 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2190 	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
2191 	.twsk_unique	= tcp_twsk_unique,
2192 	.twsk_destructor= tcp_twsk_destructor,
2193 };
2194 
2195 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2196 {
2197 	struct dst_entry *dst = skb_dst(skb);
2198 
2199 	if (dst && dst_hold_safe(dst)) {
2200 		rcu_assign_pointer(sk->sk_rx_dst, dst);
2201 		sk->sk_rx_dst_ifindex = skb->skb_iif;
2202 	}
2203 }
2204 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2205 
2206 const struct inet_connection_sock_af_ops ipv4_specific = {
2207 	.queue_xmit	   = ip_queue_xmit,
2208 	.send_check	   = tcp_v4_send_check,
2209 	.rebuild_header	   = inet_sk_rebuild_header,
2210 	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
2211 	.conn_request	   = tcp_v4_conn_request,
2212 	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
2213 	.net_header_len	   = sizeof(struct iphdr),
2214 	.setsockopt	   = ip_setsockopt,
2215 	.getsockopt	   = ip_getsockopt,
2216 	.addr2sockaddr	   = inet_csk_addr2sockaddr,
2217 	.sockaddr_len	   = sizeof(struct sockaddr_in),
2218 	.mtu_reduced	   = tcp_v4_mtu_reduced,
2219 };
2220 EXPORT_SYMBOL(ipv4_specific);
2221 
2222 #ifdef CONFIG_TCP_MD5SIG
2223 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2224 	.md5_lookup		= tcp_v4_md5_lookup,
2225 	.calc_md5_hash		= tcp_v4_md5_hash_skb,
2226 	.md5_parse		= tcp_v4_parse_md5_keys,
2227 };
2228 #endif
2229 
2230 /* NOTE: A lot of fields are set to zero explicitly by the call to
2231  *       sk_alloc(), so they need not be initialized here.
2232  */
2233 static int tcp_v4_init_sock(struct sock *sk)
2234 {
2235 	struct inet_connection_sock *icsk = inet_csk(sk);
2236 
2237 	tcp_init_sock(sk);
2238 
2239 	icsk->icsk_af_ops = &ipv4_specific;
2240 
2241 #ifdef CONFIG_TCP_MD5SIG
2242 	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2243 #endif
2244 
2245 	return 0;
2246 }
2247 
2248 void tcp_v4_destroy_sock(struct sock *sk)
2249 {
2250 	struct tcp_sock *tp = tcp_sk(sk);
2251 
2252 	trace_tcp_destroy_sock(sk);
2253 
2254 	tcp_clear_xmit_timers(sk);
2255 
2256 	tcp_cleanup_congestion_control(sk);
2257 
2258 	tcp_cleanup_ulp(sk);
2259 
2260 	/* Clean up the write buffer. */
2261 	tcp_write_queue_purge(sk);
2262 
2263 	/* Check if we want to disable active TFO */
2264 	tcp_fastopen_active_disable_ofo_check(sk);
2265 
2266 	/* Cleans up our, hopefully empty, out_of_order_queue. */
2267 	skb_rbtree_purge(&tp->out_of_order_queue);
2268 
2269 #ifdef CONFIG_TCP_MD5SIG
2270 	/* Clean up the MD5 key list, if any */
2271 	if (tp->md5sig_info) {
2272 		tcp_clear_md5_list(sk);
2273 		kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2274 		tp->md5sig_info = NULL;
2275 	}
2276 #endif
2277 
2278 	/* Clean up a referenced TCP bind bucket. */
2279 	if (inet_csk(sk)->icsk_bind_hash)
2280 		inet_put_port(sk);
2281 
2282 	BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2283 
2284 	/* If socket is aborted during connect operation */
2285 	tcp_free_fastopen_req(tp);
2286 	tcp_fastopen_destroy_cipher(sk);
2287 	tcp_saved_syn_free(tp);
2288 
2289 	sk_sockets_allocated_dec(sk);
2290 }
2291 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2292 
2293 #ifdef CONFIG_PROC_FS
2294 /* Proc filesystem TCP sock list dumping. */
2295 
2296 static unsigned short seq_file_family(const struct seq_file *seq);
2297 
2298 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2299 {
2300 	unsigned short family = seq_file_family(seq);
2301 
2302 	/* AF_UNSPEC is used as a match all */
2303 	return ((family == AF_UNSPEC || family == sk->sk_family) &&
2304 		net_eq(sock_net(sk), seq_file_net(seq)));
2305 }
2306 
2307 /* Find a non-empty bucket (starting from st->bucket)
2308  * and return the first sk from it.
2309  */
2310 static void *listening_get_first(struct seq_file *seq)
2311 {
2312 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2313 	struct tcp_iter_state *st = seq->private;
2314 
2315 	st->offset = 0;
2316 	for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) {
2317 		struct inet_listen_hashbucket *ilb2;
2318 		struct hlist_nulls_node *node;
2319 		struct sock *sk;
2320 
2321 		ilb2 = &hinfo->lhash2[st->bucket];
2322 		if (hlist_nulls_empty(&ilb2->nulls_head))
2323 			continue;
2324 
2325 		spin_lock(&ilb2->lock);
2326 		sk_nulls_for_each(sk, node, &ilb2->nulls_head) {
2327 			if (seq_sk_match(seq, sk))
2328 				return sk;
2329 		}
2330 		spin_unlock(&ilb2->lock);
2331 	}
2332 
2333 	return NULL;
2334 }
2335 
2336 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
2337  * If "cur" is the last one in the st->bucket,
2338  * call listening_get_first() to return the first sk of the next
2339  * non-empty bucket.
2340  */
2341 static void *listening_get_next(struct seq_file *seq, void *cur)
2342 {
2343 	struct tcp_iter_state *st = seq->private;
2344 	struct inet_listen_hashbucket *ilb2;
2345 	struct hlist_nulls_node *node;
2346 	struct inet_hashinfo *hinfo;
2347 	struct sock *sk = cur;
2348 
2349 	++st->num;
2350 	++st->offset;
2351 
2352 	sk = sk_nulls_next(sk);
2353 	sk_nulls_for_each_from(sk, node) {
2354 		if (seq_sk_match(seq, sk))
2355 			return sk;
2356 	}
2357 
2358 	hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2359 	ilb2 = &hinfo->lhash2[st->bucket];
2360 	spin_unlock(&ilb2->lock);
2361 	++st->bucket;
2362 	return listening_get_first(seq);
2363 }
2364 
2365 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2366 {
2367 	struct tcp_iter_state *st = seq->private;
2368 	void *rc;
2369 
2370 	st->bucket = 0;
2371 	st->offset = 0;
2372 	rc = listening_get_first(seq);
2373 
2374 	while (rc && *pos) {
2375 		rc = listening_get_next(seq, rc);
2376 		--*pos;
2377 	}
2378 	return rc;
2379 }
2380 
2381 static inline bool empty_bucket(struct inet_hashinfo *hinfo,
2382 				const struct tcp_iter_state *st)
2383 {
2384 	return hlist_nulls_empty(&hinfo->ehash[st->bucket].chain);
2385 }
2386 
2387 /*
2388  * Get first established socket starting from bucket given in st->bucket.
2389  * If st->bucket is zero, the very first socket in the hash is returned.
2390  */
2391 static void *established_get_first(struct seq_file *seq)
2392 {
2393 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2394 	struct tcp_iter_state *st = seq->private;
2395 
2396 	st->offset = 0;
2397 	for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) {
2398 		struct sock *sk;
2399 		struct hlist_nulls_node *node;
2400 		spinlock_t *lock = inet_ehash_lockp(hinfo, st->bucket);
2401 
2402 		/* Lockless fast path for the common case of empty buckets */
2403 		if (empty_bucket(hinfo, st))
2404 			continue;
2405 
2406 		spin_lock_bh(lock);
2407 		sk_nulls_for_each(sk, node, &hinfo->ehash[st->bucket].chain) {
2408 			if (seq_sk_match(seq, sk))
2409 				return sk;
2410 		}
2411 		spin_unlock_bh(lock);
2412 	}
2413 
2414 	return NULL;
2415 }
2416 
2417 static void *established_get_next(struct seq_file *seq, void *cur)
2418 {
2419 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2420 	struct tcp_iter_state *st = seq->private;
2421 	struct hlist_nulls_node *node;
2422 	struct sock *sk = cur;
2423 
2424 	++st->num;
2425 	++st->offset;
2426 
2427 	sk = sk_nulls_next(sk);
2428 
2429 	sk_nulls_for_each_from(sk, node) {
2430 		if (seq_sk_match(seq, sk))
2431 			return sk;
2432 	}
2433 
2434 	spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2435 	++st->bucket;
2436 	return established_get_first(seq);
2437 }
2438 
2439 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2440 {
2441 	struct tcp_iter_state *st = seq->private;
2442 	void *rc;
2443 
2444 	st->bucket = 0;
2445 	rc = established_get_first(seq);
2446 
2447 	while (rc && pos) {
2448 		rc = established_get_next(seq, rc);
2449 		--pos;
2450 	}
2451 	return rc;
2452 }
2453 
2454 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2455 {
2456 	void *rc;
2457 	struct tcp_iter_state *st = seq->private;
2458 
2459 	st->state = TCP_SEQ_STATE_LISTENING;
2460 	rc	  = listening_get_idx(seq, &pos);
2461 
2462 	if (!rc) {
2463 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2464 		rc	  = established_get_idx(seq, pos);
2465 	}
2466 
2467 	return rc;
2468 }
2469 
2470 static void *tcp_seek_last_pos(struct seq_file *seq)
2471 {
2472 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2473 	struct tcp_iter_state *st = seq->private;
2474 	int bucket = st->bucket;
2475 	int offset = st->offset;
2476 	int orig_num = st->num;
2477 	void *rc = NULL;
2478 
2479 	switch (st->state) {
2480 	case TCP_SEQ_STATE_LISTENING:
2481 		if (st->bucket > hinfo->lhash2_mask)
2482 			break;
2483 		rc = listening_get_first(seq);
2484 		while (offset-- && rc && bucket == st->bucket)
2485 			rc = listening_get_next(seq, rc);
2486 		if (rc)
2487 			break;
2488 		st->bucket = 0;
2489 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2490 		fallthrough;
2491 	case TCP_SEQ_STATE_ESTABLISHED:
2492 		if (st->bucket > hinfo->ehash_mask)
2493 			break;
2494 		rc = established_get_first(seq);
2495 		while (offset-- && rc && bucket == st->bucket)
2496 			rc = established_get_next(seq, rc);
2497 	}
2498 
2499 	st->num = orig_num;
2500 
2501 	return rc;
2502 }
2503 
2504 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2505 {
2506 	struct tcp_iter_state *st = seq->private;
2507 	void *rc;
2508 
2509 	if (*pos && *pos == st->last_pos) {
2510 		rc = tcp_seek_last_pos(seq);
2511 		if (rc)
2512 			goto out;
2513 	}
2514 
2515 	st->state = TCP_SEQ_STATE_LISTENING;
2516 	st->num = 0;
2517 	st->bucket = 0;
2518 	st->offset = 0;
2519 	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2520 
2521 out:
2522 	st->last_pos = *pos;
2523 	return rc;
2524 }
2525 EXPORT_SYMBOL(tcp_seq_start);
2526 
2527 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2528 {
2529 	struct tcp_iter_state *st = seq->private;
2530 	void *rc = NULL;
2531 
2532 	if (v == SEQ_START_TOKEN) {
2533 		rc = tcp_get_idx(seq, 0);
2534 		goto out;
2535 	}
2536 
2537 	switch (st->state) {
2538 	case TCP_SEQ_STATE_LISTENING:
2539 		rc = listening_get_next(seq, v);
2540 		if (!rc) {
2541 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2542 			st->bucket = 0;
2543 			st->offset = 0;
2544 			rc	  = established_get_first(seq);
2545 		}
2546 		break;
2547 	case TCP_SEQ_STATE_ESTABLISHED:
2548 		rc = established_get_next(seq, v);
2549 		break;
2550 	}
2551 out:
2552 	++*pos;
2553 	st->last_pos = *pos;
2554 	return rc;
2555 }
2556 EXPORT_SYMBOL(tcp_seq_next);
2557 
2558 void tcp_seq_stop(struct seq_file *seq, void *v)
2559 {
2560 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2561 	struct tcp_iter_state *st = seq->private;
2562 
2563 	switch (st->state) {
2564 	case TCP_SEQ_STATE_LISTENING:
2565 		if (v != SEQ_START_TOKEN)
2566 			spin_unlock(&hinfo->lhash2[st->bucket].lock);
2567 		break;
2568 	case TCP_SEQ_STATE_ESTABLISHED:
2569 		if (v)
2570 			spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2571 		break;
2572 	}
2573 }
2574 EXPORT_SYMBOL(tcp_seq_stop);
2575 
2576 static void get_openreq4(const struct request_sock *req,
2577 			 struct seq_file *f, int i)
2578 {
2579 	const struct inet_request_sock *ireq = inet_rsk(req);
2580 	long delta = req->rsk_timer.expires - jiffies;
2581 
2582 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2583 		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2584 		i,
2585 		ireq->ir_loc_addr,
2586 		ireq->ir_num,
2587 		ireq->ir_rmt_addr,
2588 		ntohs(ireq->ir_rmt_port),
2589 		TCP_SYN_RECV,
2590 		0, 0, /* could print option size, but that is af dependent. */
2591 		1,    /* timers active (only the expire timer) */
2592 		jiffies_delta_to_clock_t(delta),
2593 		req->num_timeout,
2594 		from_kuid_munged(seq_user_ns(f),
2595 				 sock_i_uid(req->rsk_listener)),
2596 		0,  /* non standard timer */
2597 		0, /* open_requests have no inode */
2598 		0,
2599 		req);
2600 }
2601 
2602 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2603 {
2604 	int timer_active;
2605 	unsigned long timer_expires;
2606 	const struct tcp_sock *tp = tcp_sk(sk);
2607 	const struct inet_connection_sock *icsk = inet_csk(sk);
2608 	const struct inet_sock *inet = inet_sk(sk);
2609 	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2610 	__be32 dest = inet->inet_daddr;
2611 	__be32 src = inet->inet_rcv_saddr;
2612 	__u16 destp = ntohs(inet->inet_dport);
2613 	__u16 srcp = ntohs(inet->inet_sport);
2614 	int rx_queue;
2615 	int state;
2616 
2617 	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2618 	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2619 	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2620 		timer_active	= 1;
2621 		timer_expires	= icsk->icsk_timeout;
2622 	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2623 		timer_active	= 4;
2624 		timer_expires	= icsk->icsk_timeout;
2625 	} else if (timer_pending(&sk->sk_timer)) {
2626 		timer_active	= 2;
2627 		timer_expires	= sk->sk_timer.expires;
2628 	} else {
2629 		timer_active	= 0;
2630 		timer_expires = jiffies;
2631 	}
2632 
2633 	state = inet_sk_state_load(sk);
2634 	if (state == TCP_LISTEN)
2635 		rx_queue = READ_ONCE(sk->sk_ack_backlog);
2636 	else
2637 		/* Because we don't lock the socket,
2638 		 * we might find a transient negative value.
2639 		 */
2640 		rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2641 				      READ_ONCE(tp->copied_seq), 0);
2642 
2643 	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2644 			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2645 		i, src, srcp, dest, destp, state,
2646 		READ_ONCE(tp->write_seq) - tp->snd_una,
2647 		rx_queue,
2648 		timer_active,
2649 		jiffies_delta_to_clock_t(timer_expires - jiffies),
2650 		icsk->icsk_retransmits,
2651 		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2652 		icsk->icsk_probes_out,
2653 		sock_i_ino(sk),
2654 		refcount_read(&sk->sk_refcnt), sk,
2655 		jiffies_to_clock_t(icsk->icsk_rto),
2656 		jiffies_to_clock_t(icsk->icsk_ack.ato),
2657 		(icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2658 		tcp_snd_cwnd(tp),
2659 		state == TCP_LISTEN ?
2660 		    fastopenq->max_qlen :
2661 		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2662 }
2663 
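/* Reading aid for the /proc/net/tcp line emitted by get_tcp4_sock():
 * after "sl local_address rem_address st", tx_queue:rx_queue is
 * write_seq - snd_una and rcv_nxt - copied_seq (or the accept backlog
 * for listeners), tr:tm->when describes the pending timer, followed by
 * retrnsmt, uid, timeout (probes out), inode, refcount, socket pointer,
 * rto, ato, (quick ack << 1 | pingpong), snd_cwnd and ssthresh (or
 * max_qlen for listeners).
 */
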
2664 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2665 			       struct seq_file *f, int i)
2666 {
2667 	long delta = tw->tw_timer.expires - jiffies;
2668 	__be32 dest, src;
2669 	__u16 destp, srcp;
2670 
2671 	dest  = tw->tw_daddr;
2672 	src   = tw->tw_rcv_saddr;
2673 	destp = ntohs(tw->tw_dport);
2674 	srcp  = ntohs(tw->tw_sport);
2675 
2676 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2677 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2678 		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2679 		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2680 		refcount_read(&tw->tw_refcnt), tw);
2681 }
2682 
2683 #define TMPSZ 150
2684 
2685 static int tcp4_seq_show(struct seq_file *seq, void *v)
2686 {
2687 	struct tcp_iter_state *st;
2688 	struct sock *sk = v;
2689 
2690 	seq_setwidth(seq, TMPSZ - 1);
2691 	if (v == SEQ_START_TOKEN) {
2692 		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2693 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2694 			   "inode");
2695 		goto out;
2696 	}
2697 	st = seq->private;
2698 
2699 	if (sk->sk_state == TCP_TIME_WAIT)
2700 		get_timewait4_sock(v, seq, st->num);
2701 	else if (sk->sk_state == TCP_NEW_SYN_RECV)
2702 		get_openreq4(v, seq, st->num);
2703 	else
2704 		get_tcp4_sock(v, seq, st->num);
2705 out:
2706 	seq_pad(seq, '\n');
2707 	return 0;
2708 }
2709 
2710 #ifdef CONFIG_BPF_SYSCALL
2711 struct bpf_tcp_iter_state {
2712 	struct tcp_iter_state state;
2713 	unsigned int cur_sk;
2714 	unsigned int end_sk;
2715 	unsigned int max_sk;
2716 	struct sock **batch;
2717 	bool st_bucket_done;
2718 };
2719 
2720 struct bpf_iter__tcp {
2721 	__bpf_md_ptr(struct bpf_iter_meta *, meta);
2722 	__bpf_md_ptr(struct sock_common *, sk_common);
2723 	uid_t uid __aligned(8);
2724 };
2725 
2726 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
2727 			     struct sock_common *sk_common, uid_t uid)
2728 {
2729 	struct bpf_iter__tcp ctx;
2730 
2731 	meta->seq_num--;  /* skip SEQ_START_TOKEN */
2732 	ctx.meta = meta;
2733 	ctx.sk_common = sk_common;
2734 	ctx.uid = uid;
2735 	return bpf_iter_run_prog(prog, &ctx);
2736 }
2737 
2738 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
2739 {
2740 	while (iter->cur_sk < iter->end_sk)
2741 		sock_put(iter->batch[iter->cur_sk++]);
2742 }
2743 
2744 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
2745 				      unsigned int new_batch_sz)
2746 {
2747 	struct sock **new_batch;
2748 
2749 	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
2750 			     GFP_USER | __GFP_NOWARN);
2751 	if (!new_batch)
2752 		return -ENOMEM;
2753 
2754 	bpf_iter_tcp_put_batch(iter);
2755 	kvfree(iter->batch);
2756 	iter->batch = new_batch;
2757 	iter->max_sk = new_batch_sz;
2758 
2759 	return 0;
2760 }
2761 
2762 static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
2763 						 struct sock *start_sk)
2764 {
2765 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2766 	struct bpf_tcp_iter_state *iter = seq->private;
2767 	struct tcp_iter_state *st = &iter->state;
2768 	struct hlist_nulls_node *node;
2769 	unsigned int expected = 1;
2770 	struct sock *sk;
2771 
2772 	sock_hold(start_sk);
2773 	iter->batch[iter->end_sk++] = start_sk;
2774 
2775 	sk = sk_nulls_next(start_sk);
2776 	sk_nulls_for_each_from(sk, node) {
2777 		if (seq_sk_match(seq, sk)) {
2778 			if (iter->end_sk < iter->max_sk) {
2779 				sock_hold(sk);
2780 				iter->batch[iter->end_sk++] = sk;
2781 			}
2782 			expected++;
2783 		}
2784 	}
2785 	spin_unlock(&hinfo->lhash2[st->bucket].lock);
2786 
2787 	return expected;
2788 }
2789 
2790 static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
2791 						   struct sock *start_sk)
2792 {
2793 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2794 	struct bpf_tcp_iter_state *iter = seq->private;
2795 	struct tcp_iter_state *st = &iter->state;
2796 	struct hlist_nulls_node *node;
2797 	unsigned int expected = 1;
2798 	struct sock *sk;
2799 
2800 	sock_hold(start_sk);
2801 	iter->batch[iter->end_sk++] = start_sk;
2802 
2803 	sk = sk_nulls_next(start_sk);
2804 	sk_nulls_for_each_from(sk, node) {
2805 		if (seq_sk_match(seq, sk)) {
2806 			if (iter->end_sk < iter->max_sk) {
2807 				sock_hold(sk);
2808 				iter->batch[iter->end_sk++] = sk;
2809 			}
2810 			expected++;
2811 		}
2812 	}
2813 	spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2814 
2815 	return expected;
2816 }
2817 
2818 static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
2819 {
2820 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2821 	struct bpf_tcp_iter_state *iter = seq->private;
2822 	struct tcp_iter_state *st = &iter->state;
2823 	unsigned int expected;
2824 	bool resized = false;
2825 	struct sock *sk;
2826 
2827 	/* The st->bucket is done.  Directly advance to the next
2828 	 * bucket instead of having tcp_seek_last_pos() skip sockets
2829 	 * one by one in the current bucket, only to eventually find
2830 	 * out that it has to advance to the next bucket anyway.
2831 	 */
2832 	if (iter->st_bucket_done) {
2833 		st->offset = 0;
2834 		st->bucket++;
2835 		if (st->state == TCP_SEQ_STATE_LISTENING &&
2836 		    st->bucket > hinfo->lhash2_mask) {
2837 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2838 			st->bucket = 0;
2839 		}
2840 	}
2841 
2842 again:
2843 	/* Get a new batch */
2844 	iter->cur_sk = 0;
2845 	iter->end_sk = 0;
2846 	iter->st_bucket_done = false;
2847 
2848 	sk = tcp_seek_last_pos(seq);
2849 	if (!sk)
2850 		return NULL; /* Done */
2851 
2852 	if (st->state == TCP_SEQ_STATE_LISTENING)
2853 		expected = bpf_iter_tcp_listening_batch(seq, sk);
2854 	else
2855 		expected = bpf_iter_tcp_established_batch(seq, sk);
2856 
2857 	if (iter->end_sk == expected) {
2858 		iter->st_bucket_done = true;
2859 		return sk;
2860 	}
2861 
2862 	if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) {
2863 		resized = true;
2864 		goto again;
2865 	}
2866 
2867 	return sk;
2868 }
2869 
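/* Note on bpf_iter_tcp_batch() above: if the current bucket holds more
 * sockets than iter->batch can carry, the batch is reallocated to
 * expected * 3 / 2 entries and the bucket is re-scanned exactly once
 * (the "resized" flag); a still-partial batch is then returned as-is.
 */
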
2870 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
2871 {
2872 	/* bpf iter does not support lseek, so it always
2873 	 * continues from where it was stop()-ped.
2874 	 */
2875 	if (*pos)
2876 		return bpf_iter_tcp_batch(seq);
2877 
2878 	return SEQ_START_TOKEN;
2879 }
2880 
2881 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2882 {
2883 	struct bpf_tcp_iter_state *iter = seq->private;
2884 	struct tcp_iter_state *st = &iter->state;
2885 	struct sock *sk;
2886 
2887 	/* Whenever seq_next() is called, iter->cur_sk has already
2888 	 * been through seq_show(), so advance to the next sk in
2889 	 * the batch.
2890 	 */
2891 	if (iter->cur_sk < iter->end_sk) {
2892 		/* Keeping st->num consistent in tcp_iter_state.
2893 		 * bpf_iter_tcp does not use st->num.
2894 		 * meta.seq_num is used instead.
2895 		 */
2896 		st->num++;
2897 		/* Move st->offset to the next sk in the bucket such that
2898 		 * the future start() will resume at st->offset in
2899 		 * st->bucket.  See tcp_seek_last_pos().
2900 		 */
2901 		st->offset++;
2902 		sock_put(iter->batch[iter->cur_sk++]);
2903 	}
2904 
2905 	if (iter->cur_sk < iter->end_sk)
2906 		sk = iter->batch[iter->cur_sk];
2907 	else
2908 		sk = bpf_iter_tcp_batch(seq);
2909 
2910 	++*pos;
2911 	/* Keeping st->last_pos consistent in tcp_iter_state.
2912 	 * bpf iter does not do lseek, so st->last_pos always equals *pos.
2913 	 */
2914 	st->last_pos = *pos;
2915 	return sk;
2916 }
2917 
2918 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
2919 {
2920 	struct bpf_iter_meta meta;
2921 	struct bpf_prog *prog;
2922 	struct sock *sk = v;
2923 	bool slow;
2924 	uid_t uid;
2925 	int ret;
2926 
2927 	if (v == SEQ_START_TOKEN)
2928 		return 0;
2929 
2930 	if (sk_fullsock(sk))
2931 		slow = lock_sock_fast(sk);
2932 
2933 	if (unlikely(sk_unhashed(sk))) {
2934 		ret = SEQ_SKIP;
2935 		goto unlock;
2936 	}
2937 
2938 	if (sk->sk_state == TCP_TIME_WAIT) {
2939 		uid = 0;
2940 	} else if (sk->sk_state == TCP_NEW_SYN_RECV) {
2941 		const struct request_sock *req = v;
2942 
2943 		uid = from_kuid_munged(seq_user_ns(seq),
2944 				       sock_i_uid(req->rsk_listener));
2945 	} else {
2946 		uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
2947 	}
2948 
2949 	meta.seq = seq;
2950 	prog = bpf_iter_get_info(&meta, false);
2951 	ret = tcp_prog_seq_show(prog, &meta, v, uid);
2952 
2953 unlock:
2954 	if (sk_fullsock(sk))
2955 		unlock_sock_fast(sk, slow);
2956 	return ret;
2958 }
2959 
2960 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
2961 {
2962 	struct bpf_tcp_iter_state *iter = seq->private;
2963 	struct bpf_iter_meta meta;
2964 	struct bpf_prog *prog;
2965 
2966 	if (!v) {
2967 		meta.seq = seq;
2968 		prog = bpf_iter_get_info(&meta, true);
2969 		if (prog)
2970 			(void)tcp_prog_seq_show(prog, &meta, v, 0);
2971 	}
2972 
2973 	if (iter->cur_sk < iter->end_sk) {
2974 		bpf_iter_tcp_put_batch(iter);
2975 		iter->st_bucket_done = false;
2976 	}
2977 }
2978 
2979 static const struct seq_operations bpf_iter_tcp_seq_ops = {
2980 	.show		= bpf_iter_tcp_seq_show,
2981 	.start		= bpf_iter_tcp_seq_start,
2982 	.next		= bpf_iter_tcp_seq_next,
2983 	.stop		= bpf_iter_tcp_seq_stop,
2984 };
2985 #endif

2986 static unsigned short seq_file_family(const struct seq_file *seq)
2987 {
2988 	const struct tcp_seq_afinfo *afinfo;
2989 
2990 #ifdef CONFIG_BPF_SYSCALL
2991 	/* Iterated from bpf_iter.  Let the bpf prog filter instead. */
2992 	if (seq->op == &bpf_iter_tcp_seq_ops)
2993 		return AF_UNSPEC;
2994 #endif
2995 
2996 	/* Iterated from proc fs */
2997 	afinfo = pde_data(file_inode(seq->file));
2998 	return afinfo->family;
2999 }
3000 
3001 static const struct seq_operations tcp4_seq_ops = {
3002 	.show		= tcp4_seq_show,
3003 	.start		= tcp_seq_start,
3004 	.next		= tcp_seq_next,
3005 	.stop		= tcp_seq_stop,
3006 };
3007 
3008 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
3009 	.family		= AF_INET,
3010 };
3011 
3012 static int __net_init tcp4_proc_init_net(struct net *net)
3013 {
3014 	if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
3015 			sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
3016 		return -ENOMEM;
3017 	return 0;
3018 }
3019 
3020 static void __net_exit tcp4_proc_exit_net(struct net *net)
3021 {
3022 	remove_proc_entry("tcp", net->proc_net);
3023 }
3024 
3025 static struct pernet_operations tcp4_net_ops = {
3026 	.init = tcp4_proc_init_net,
3027 	.exit = tcp4_proc_exit_net,
3028 };
3029 
3030 int __init tcp4_proc_init(void)
3031 {
3032 	return register_pernet_subsys(&tcp4_net_ops);
3033 }
3034 
3035 void tcp4_proc_exit(void)
3036 {
3037 	unregister_pernet_subsys(&tcp4_net_ops);
3038 }
3039 #endif /* CONFIG_PROC_FS */
3040 
3041 /* @wake is one when sk_stream_write_space() calls us.
3042  * In that case, EPOLLOUT is sent only once notsent_bytes drops below half the limit.
3043  * This mimics the strategy used in sock_def_write_space().
3044  */
3045 bool tcp_stream_memory_free(const struct sock *sk, int wake)
3046 {
3047 	const struct tcp_sock *tp = tcp_sk(sk);
3048 	u32 notsent_bytes = READ_ONCE(tp->write_seq) -
3049 			    READ_ONCE(tp->snd_nxt);
3050 
3051 	return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
3052 }
3053 EXPORT_SYMBOL(tcp_stream_memory_free);
3054 
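/* A minimal userspace sketch of how the notsent limit used above is
 * typically configured (illustrative only; the 128 kB value is
 * arbitrary):
 *
 *	int lowat = 128 * 1024;
 *	setsockopt(fd, IPPROTO_TCP, TCP_NOTSENT_LOWAT, &lowat, sizeof(lowat));
 *
 * With @wake == 1 (the sk_stream_write_space() path), the test above is
 * effectively "notsent_bytes < lowat / 2", so EPOLLOUT is only signalled
 * once at least half of that budget has drained.
 */
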
3055 struct proto tcp_prot = {
3056 	.name			= "TCP",
3057 	.owner			= THIS_MODULE,
3058 	.close			= tcp_close,
3059 	.pre_connect		= tcp_v4_pre_connect,
3060 	.connect		= tcp_v4_connect,
3061 	.disconnect		= tcp_disconnect,
3062 	.accept			= inet_csk_accept,
3063 	.ioctl			= tcp_ioctl,
3064 	.init			= tcp_v4_init_sock,
3065 	.destroy		= tcp_v4_destroy_sock,
3066 	.shutdown		= tcp_shutdown,
3067 	.setsockopt		= tcp_setsockopt,
3068 	.getsockopt		= tcp_getsockopt,
3069 	.bpf_bypass_getsockopt	= tcp_bpf_bypass_getsockopt,
3070 	.keepalive		= tcp_set_keepalive,
3071 	.recvmsg		= tcp_recvmsg,
3072 	.sendmsg		= tcp_sendmsg,
3073 	.sendpage		= tcp_sendpage,
3074 	.backlog_rcv		= tcp_v4_do_rcv,
3075 	.release_cb		= tcp_release_cb,
3076 	.hash			= inet_hash,
3077 	.unhash			= inet_unhash,
3078 	.get_port		= inet_csk_get_port,
3079 	.put_port		= inet_put_port,
3080 #ifdef CONFIG_BPF_SYSCALL
3081 	.psock_update_sk_prot	= tcp_bpf_update_proto,
3082 #endif
3083 	.enter_memory_pressure	= tcp_enter_memory_pressure,
3084 	.leave_memory_pressure	= tcp_leave_memory_pressure,
3085 	.stream_memory_free	= tcp_stream_memory_free,
3086 	.sockets_allocated	= &tcp_sockets_allocated,
3087 	.orphan_count		= &tcp_orphan_count,
3088 
3089 	.memory_allocated	= &tcp_memory_allocated,
3090 	.per_cpu_fw_alloc	= &tcp_memory_per_cpu_fw_alloc,
3091 
3092 	.memory_pressure	= &tcp_memory_pressure,
3093 	.sysctl_mem		= sysctl_tcp_mem,
3094 	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
3095 	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
3096 	.max_header		= MAX_TCP_HEADER,
3097 	.obj_size		= sizeof(struct tcp_sock),
3098 	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
3099 	.twsk_prot		= &tcp_timewait_sock_ops,
3100 	.rsk_prot		= &tcp_request_sock_ops,
3101 	.h.hashinfo		= NULL,
3102 	.no_autobind		= true,
3103 	.diag_destroy		= tcp_abort,
3104 };
3105 EXPORT_SYMBOL(tcp_prot);
3106 
3107 static void __net_exit tcp_sk_exit(struct net *net)
3108 {
3109 	if (net->ipv4.tcp_congestion_control)
3110 		bpf_module_put(net->ipv4.tcp_congestion_control,
3111 			       net->ipv4.tcp_congestion_control->owner);
3112 }
3113 
3114 static void __net_init tcp_set_hashinfo(struct net *net)
3115 {
3116 	struct inet_hashinfo *hinfo;
3117 	unsigned int ehash_entries;
3118 	struct net *old_net;
3119 
3120 	if (net_eq(net, &init_net))
3121 		goto fallback;
3122 
3123 	old_net = current->nsproxy->net_ns;
3124 	ehash_entries = READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries);
3125 	if (!ehash_entries)
3126 		goto fallback;
3127 
3128 	ehash_entries = roundup_pow_of_two(ehash_entries);
3129 	hinfo = inet_pernet_hashinfo_alloc(&tcp_hashinfo, ehash_entries);
3130 	if (!hinfo) {
3131 		pr_warn("Failed to allocate TCP ehash (entries: %u) "
3132 			"for a netns, falling back to the global one\n",
3133 			ehash_entries);
3134 fallback:
3135 		hinfo = &tcp_hashinfo;
3136 		ehash_entries = tcp_hashinfo.ehash_mask + 1;
3137 	}
3138 
3139 	net->ipv4.tcp_death_row.hashinfo = hinfo;
3140 	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2;
3141 	net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128);
3142 }
3143 
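/* Illustrative sizing example for tcp_set_hashinfo() above: with the
 * creating netns having sysctl_tcp_child_ehash_entries = 1000 (an
 * arbitrary value), roundup_pow_of_two() gives 1024 ehash entries, so
 *
 *	sysctl_max_tw_buckets  = 1024 / 2              = 512
 *	sysctl_max_syn_backlog = max(128, 1024 / 128)  = 128
 *
 * When that sysctl is 0 or the allocation fails, the netns falls back
 * to sharing the global tcp_hashinfo.
 */
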
3144 static int __net_init tcp_sk_init(struct net *net)
3145 {
3146 	net->ipv4.sysctl_tcp_ecn = 2;
3147 	net->ipv4.sysctl_tcp_ecn_fallback = 1;
3148 
3149 	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
3150 	net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
3151 	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
3152 	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
3153 	net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
3154 
3155 	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
3156 	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
3157 	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
3158 
3159 	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
3160 	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
3161 	net->ipv4.sysctl_tcp_syncookies = 1;
3162 	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
3163 	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
3164 	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
3165 	net->ipv4.sysctl_tcp_orphan_retries = 0;
3166 	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
3167 	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
3168 	net->ipv4.sysctl_tcp_tw_reuse = 2;
3169 	net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
3170 
3171 	refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1);
3172 	tcp_set_hashinfo(net);
3173 
3174 	net->ipv4.sysctl_tcp_sack = 1;
3175 	net->ipv4.sysctl_tcp_window_scaling = 1;
3176 	net->ipv4.sysctl_tcp_timestamps = 1;
3177 	net->ipv4.sysctl_tcp_early_retrans = 3;
3178 	net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
3179 	net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
3180 	net->ipv4.sysctl_tcp_retrans_collapse = 1;
3181 	net->ipv4.sysctl_tcp_max_reordering = 300;
3182 	net->ipv4.sysctl_tcp_dsack = 1;
3183 	net->ipv4.sysctl_tcp_app_win = 31;
3184 	net->ipv4.sysctl_tcp_adv_win_scale = 1;
3185 	net->ipv4.sysctl_tcp_frto = 2;
3186 	net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
3187 	/* This limits the percentage of the congestion window which we
3188 	 * will allow a single TSO frame to consume.  Building TSO frames
3189 	 * which are too large can cause TCP streams to be bursty.
3190 	 */
3191 	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
3192 	/* Default TSQ limit of 16 TSO segments */
3193 	net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
3194 
3195 	/* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. */
3196 	net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX;
3197 
3198 	net->ipv4.sysctl_tcp_min_tso_segs = 2;
3199 	net->ipv4.sysctl_tcp_tso_rtt_log = 9;  /* 2^9 = 512 usec */
3200 	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
3201 	net->ipv4.sysctl_tcp_autocorking = 1;
3202 	net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
3203 	net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
3204 	net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
3205 	if (net != &init_net) {
3206 		memcpy(net->ipv4.sysctl_tcp_rmem,
3207 		       init_net.ipv4.sysctl_tcp_rmem,
3208 		       sizeof(init_net.ipv4.sysctl_tcp_rmem));
3209 		memcpy(net->ipv4.sysctl_tcp_wmem,
3210 		       init_net.ipv4.sysctl_tcp_wmem,
3211 		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
3212 	}
3213 	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
3214 	net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
3215 	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
3216 	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
3217 	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
3218 	atomic_set(&net->ipv4.tfo_active_disable_times, 0);
3219 
3220 	/* Set default values for PLB */
3221 	net->ipv4.sysctl_tcp_plb_enabled = 0; /* Disabled by default */
3222 	net->ipv4.sysctl_tcp_plb_idle_rehash_rounds = 3;
3223 	net->ipv4.sysctl_tcp_plb_rehash_rounds = 12;
3224 	net->ipv4.sysctl_tcp_plb_suspend_rto_sec = 60;
3225 	/* Default congestion threshold for PLB to mark a round is 50% */
3226 	net->ipv4.sysctl_tcp_plb_cong_thresh = (1 << TCP_PLB_SCALE) / 2;
3227 
3228 	/* Reno is always built in */
3229 	if (!net_eq(net, &init_net) &&
3230 	    bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
3231 			       init_net.ipv4.tcp_congestion_control->owner))
3232 		net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
3233 	else
3234 		net->ipv4.tcp_congestion_control = &tcp_reno;
3235 
3236 	return 0;
3237 }
3238 
3239 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
3240 {
3241 	struct net *net;
3242 
3243 	tcp_twsk_purge(net_exit_list, AF_INET);
3244 
3245 	list_for_each_entry(net, net_exit_list, exit_list) {
3246 		inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo);
3247 		WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount));
3248 		tcp_fastopen_ctx_destroy(net);
3249 	}
3250 }
3251 
3252 static struct pernet_operations __net_initdata tcp_sk_ops = {
3253        .init	   = tcp_sk_init,
3254        .exit	   = tcp_sk_exit,
3255        .exit_batch = tcp_sk_exit_batch,
3256 };
3257 
3258 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3259 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
3260 		     struct sock_common *sk_common, uid_t uid)
3261 
3262 #define INIT_BATCH_SZ 16
3263 
3264 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
3265 {
3266 	struct bpf_tcp_iter_state *iter = priv_data;
3267 	int err;
3268 
3269 	err = bpf_iter_init_seq_net(priv_data, aux);
3270 	if (err)
3271 		return err;
3272 
3273 	err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ);
3274 	if (err) {
3275 		bpf_iter_fini_seq_net(priv_data);
3276 		return err;
3277 	}
3278 
3279 	return 0;
3280 }
3281 
3282 static void bpf_iter_fini_tcp(void *priv_data)
3283 {
3284 	struct bpf_tcp_iter_state *iter = priv_data;
3285 
3286 	bpf_iter_fini_seq_net(priv_data);
3287 	kvfree(iter->batch);
3288 }
3289 
3290 static const struct bpf_iter_seq_info tcp_seq_info = {
3291 	.seq_ops		= &bpf_iter_tcp_seq_ops,
3292 	.init_seq_private	= bpf_iter_init_tcp,
3293 	.fini_seq_private	= bpf_iter_fini_tcp,
3294 	.seq_priv_size		= sizeof(struct bpf_tcp_iter_state),
3295 };
3296 
3297 static const struct bpf_func_proto *
3298 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
3299 			    const struct bpf_prog *prog)
3300 {
3301 	switch (func_id) {
3302 	case BPF_FUNC_setsockopt:
3303 		return &bpf_sk_setsockopt_proto;
3304 	case BPF_FUNC_getsockopt:
3305 		return &bpf_sk_getsockopt_proto;
3306 	default:
3307 		return NULL;
3308 	}
3309 }
3310 
3311 static struct bpf_iter_reg tcp_reg_info = {
3312 	.target			= "tcp",
3313 	.ctx_arg_info_size	= 1,
3314 	.ctx_arg_info		= {
3315 		{ offsetof(struct bpf_iter__tcp, sk_common),
3316 		  PTR_TO_BTF_ID_OR_NULL },
3317 	},
3318 	.get_func_proto		= bpf_iter_tcp_get_func_proto,
3319 	.seq_info		= &tcp_seq_info,
3320 };
3321 
3322 static void __init bpf_iter_register(void)
3323 {
3324 	tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3325 	if (bpf_iter_reg_target(&tcp_reg_info))
3326 		pr_warn("Warning: could not register bpf iterator tcp\n");
3327 }
3328 
3329 #endif
3330 
3331 void __init tcp_v4_init(void)
3332 {
3333 	int cpu, res;
3334 
3335 	for_each_possible_cpu(cpu) {
3336 		struct sock *sk;
3337 
3338 		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
3339 					   IPPROTO_TCP, &init_net);
3340 		if (res)
3341 			panic("Failed to create the TCP control socket.\n");
3342 		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
3343 
3344 		/* Please enforce IP_DF and IPID==0 for RST and
3345 		 * ACK sent in SYN-RECV and TIME-WAIT state.
3346 		 */
3347 		inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
3348 
3349 		per_cpu(ipv4_tcp_sk, cpu) = sk;
3350 	}
3351 	if (register_pernet_subsys(&tcp_sk_ops))
3352 		panic("Failed to create the TCP control socket.\n");
3353 
3354 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3355 	bpf_iter_register();
3356 #endif
3357 }
3358