xref: /openbmc/linux/net/ipv4/tcp_ipv4.c (revision 8957261c)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		Implementation of the Transmission Control Protocol(TCP).
8  *
9  *		IPv4 specific functions
10  *
11  *		code split from:
12  *		linux/ipv4/tcp.c
13  *		linux/ipv4/tcp_input.c
14  *		linux/ipv4/tcp_output.c
15  *
16  *		See tcp.c for author information
17  */
18 
19 /*
20  * Changes:
21  *		David S. Miller	:	New socket lookup architecture.
22  *					This code is dedicated to John Dyson.
23  *		David S. Miller :	Change semantics of established hash,
24  *					half is devoted to TIME_WAIT sockets
25  *					and the rest go in the other half.
26  *		Andi Kleen :		Add support for syncookies and fixed
27  *					some bugs: ip options weren't passed to
28  *					the TCP layer, missed a check for an
29  *					ACK bit.
30  *		Andi Kleen :		Implemented fast path mtu discovery.
31  *	     				Fixed many serious bugs in the
32  *					request_sock handling and moved
33  *					most of it into the af independent code.
34  *					Added tail drop and some other bugfixes.
35  *					Added new listen semantics.
36  *		Mike McLagan	:	Routing by source
37  *	Juan Jose Ciarlante:		ip_dynaddr bits
38  *		Andi Kleen:		various fixes.
39  *	Vitaly E. Lavrov	:	Transparent proxy revived after year
40  *					coma.
41  *	Andi Kleen		:	Fix new listen.
42  *	Andi Kleen		:	Fix accept error reporting.
43  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
44  *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
45  *					a single port at the same time.
46  */
47 
48 #define pr_fmt(fmt) "TCP: " fmt
49 
#include <linux/bottom_half.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/refcount.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <linux/sched.h>

#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/secure_seq.h>
#include <net/busy_poll.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/inetdevice.h>
#include <linux/btf_ids.h>

#include <crypto/hash.h>
#include <linux/scatterlist.h>

#include <trace/events/tcp.h>
86 
#ifdef CONFIG_TCP_MD5SIG
/* Forward declaration: tcp_v4_send_reset() and tcp_v4_send_ack() below call
 * this to fill in the MD5 signature option of the reply header they build.
 */
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
#endif

/* The inet_hashinfo instance for TCP; exported to modules. */
struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);

/* Per-CPU socket pointer. tcp_v4_send_reset() and tcp_v4_send_ack() fetch it
 * with this_cpu_read() and pass it to ip_send_unicast_reply() as the socket
 * that emits the reply.
 */
static DEFINE_PER_CPU(struct sock *, ipv4_tcp_sk);
96 
97 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
98 {
99 	return secure_tcp_seq(ip_hdr(skb)->daddr,
100 			      ip_hdr(skb)->saddr,
101 			      tcp_hdr(skb)->dest,
102 			      tcp_hdr(skb)->source);
103 }
104 
105 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
106 {
107 	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
108 }
109 
110 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
111 {
112 	int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
113 	const struct inet_timewait_sock *tw = inet_twsk(sktw);
114 	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
115 	struct tcp_sock *tp = tcp_sk(sk);
116 
117 	if (reuse == 2) {
118 		/* Still does not detect *everything* that goes through
119 		 * lo, since we require a loopback src or dst address
120 		 * or direct binding to 'lo' interface.
121 		 */
122 		bool loopback = false;
123 		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
124 			loopback = true;
125 #if IS_ENABLED(CONFIG_IPV6)
126 		if (tw->tw_family == AF_INET6) {
127 			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
128 			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
129 			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
130 			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
131 				loopback = true;
132 		} else
133 #endif
134 		{
135 			if (ipv4_is_loopback(tw->tw_daddr) ||
136 			    ipv4_is_loopback(tw->tw_rcv_saddr))
137 				loopback = true;
138 		}
139 		if (!loopback)
140 			reuse = 0;
141 	}
142 
143 	/* With PAWS, it is safe from the viewpoint
144 	   of data integrity. Even without PAWS it is safe provided sequence
145 	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.
146 
147 	   Actually, the idea is close to VJ's one, only timestamp cache is
148 	   held not per host, but per port pair and TW bucket is used as state
149 	   holder.
150 
151 	   If TW bucket has been already destroyed we fall back to VJ's scheme
152 	   and use initial timestamp retrieved from peer table.
153 	 */
154 	if (tcptw->tw_ts_recent_stamp &&
155 	    (!twp || (reuse && time_after32(ktime_get_seconds(),
156 					    tcptw->tw_ts_recent_stamp)))) {
157 		/* In case of repair and re-using TIME-WAIT sockets we still
158 		 * want to be sure that it is safe as above but honor the
159 		 * sequence numbers and time stamps set as part of the repair
160 		 * process.
161 		 *
162 		 * Without this check re-using a TIME-WAIT socket with TCP
163 		 * repair would accumulate a -1 on the repair assigned
164 		 * sequence number. The first time it is reused the sequence
165 		 * is -1, the second time -2, etc. This fixes that issue
166 		 * without appearing to create any others.
167 		 */
168 		if (likely(!tp->repair)) {
169 			u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
170 
171 			if (!seq)
172 				seq = 1;
173 			WRITE_ONCE(tp->write_seq, seq);
174 			tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
175 			tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
176 		}
177 		sock_hold(sktw);
178 		return 1;
179 	}
180 
181 	return 0;
182 }
183 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
184 
185 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
186 			      int addr_len)
187 {
188 	/* This check is replicated from tcp_v4_connect() and intended to
189 	 * prevent BPF program called below from accessing bytes that are out
190 	 * of the bound specified by user in addr_len.
191 	 */
192 	if (addr_len < sizeof(struct sockaddr_in))
193 		return -EINVAL;
194 
195 	sock_owned_by_me(sk);
196 
197 	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
198 }
199 
200 /* This will initiate an outgoing connection. */
201 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
202 {
203 	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
204 	struct inet_timewait_death_row *tcp_death_row;
205 	struct inet_sock *inet = inet_sk(sk);
206 	struct tcp_sock *tp = tcp_sk(sk);
207 	struct ip_options_rcu *inet_opt;
208 	struct net *net = sock_net(sk);
209 	__be16 orig_sport, orig_dport;
210 	__be32 daddr, nexthop;
211 	struct flowi4 *fl4;
212 	struct rtable *rt;
213 	int err;
214 
215 	if (addr_len < sizeof(struct sockaddr_in))
216 		return -EINVAL;
217 
218 	if (usin->sin_family != AF_INET)
219 		return -EAFNOSUPPORT;
220 
221 	nexthop = daddr = usin->sin_addr.s_addr;
222 	inet_opt = rcu_dereference_protected(inet->inet_opt,
223 					     lockdep_sock_is_held(sk));
224 	if (inet_opt && inet_opt->opt.srr) {
225 		if (!daddr)
226 			return -EINVAL;
227 		nexthop = inet_opt->opt.faddr;
228 	}
229 
230 	orig_sport = inet->inet_sport;
231 	orig_dport = usin->sin_port;
232 	fl4 = &inet->cork.fl.u.ip4;
233 	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
234 			      sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
235 			      orig_dport, sk);
236 	if (IS_ERR(rt)) {
237 		err = PTR_ERR(rt);
238 		if (err == -ENETUNREACH)
239 			IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
240 		return err;
241 	}
242 
243 	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
244 		ip_rt_put(rt);
245 		return -ENETUNREACH;
246 	}
247 
248 	if (!inet_opt || !inet_opt->opt.srr)
249 		daddr = fl4->daddr;
250 
251 	tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
252 
253 	if (!inet->inet_saddr) {
254 		err = inet_bhash2_update_saddr(sk,  &fl4->saddr, AF_INET);
255 		if (err) {
256 			ip_rt_put(rt);
257 			return err;
258 		}
259 	} else {
260 		sk_rcv_saddr_set(sk, inet->inet_saddr);
261 	}
262 
263 	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
264 		/* Reset inherited state */
265 		tp->rx_opt.ts_recent	   = 0;
266 		tp->rx_opt.ts_recent_stamp = 0;
267 		if (likely(!tp->repair))
268 			WRITE_ONCE(tp->write_seq, 0);
269 	}
270 
271 	inet->inet_dport = usin->sin_port;
272 	sk_daddr_set(sk, daddr);
273 
274 	inet_csk(sk)->icsk_ext_hdr_len = 0;
275 	if (inet_opt)
276 		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
277 
278 	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
279 
280 	/* Socket identity is still unknown (sport may be zero).
281 	 * However we set state to SYN-SENT and not releasing socket
282 	 * lock select source port, enter ourselves into the hash tables and
283 	 * complete initialization after this.
284 	 */
285 	tcp_set_state(sk, TCP_SYN_SENT);
286 	err = inet_hash_connect(tcp_death_row, sk);
287 	if (err)
288 		goto failure;
289 
290 	sk_set_txhash(sk);
291 
292 	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
293 			       inet->inet_sport, inet->inet_dport, sk);
294 	if (IS_ERR(rt)) {
295 		err = PTR_ERR(rt);
296 		rt = NULL;
297 		goto failure;
298 	}
299 	/* OK, now commit destination to socket.  */
300 	sk->sk_gso_type = SKB_GSO_TCPV4;
301 	sk_setup_caps(sk, &rt->dst);
302 	rt = NULL;
303 
304 	if (likely(!tp->repair)) {
305 		if (!tp->write_seq)
306 			WRITE_ONCE(tp->write_seq,
307 				   secure_tcp_seq(inet->inet_saddr,
308 						  inet->inet_daddr,
309 						  inet->inet_sport,
310 						  usin->sin_port));
311 		WRITE_ONCE(tp->tsoffset,
312 			   secure_tcp_ts_off(net, inet->inet_saddr,
313 					     inet->inet_daddr));
314 	}
315 
316 	atomic_set(&inet->inet_id, get_random_u16());
317 
318 	if (tcp_fastopen_defer_connect(sk, &err))
319 		return err;
320 	if (err)
321 		goto failure;
322 
323 	err = tcp_connect(sk);
324 
325 	if (err)
326 		goto failure;
327 
328 	return 0;
329 
330 failure:
331 	/*
332 	 * This unhashes the socket and releases the local port,
333 	 * if necessary.
334 	 */
335 	tcp_set_state(sk, TCP_CLOSE);
336 	inet_bhash2_reset_saddr(sk);
337 	ip_rt_put(rt);
338 	sk->sk_route_caps = 0;
339 	inet->inet_dport = 0;
340 	return err;
341 }
342 EXPORT_SYMBOL(tcp_v4_connect);
343 
344 /*
345  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
346  * It can be called through tcp_release_cb() if socket was owned by user
347  * at the time tcp_v4_err() was called to handle ICMP message.
348  */
349 void tcp_v4_mtu_reduced(struct sock *sk)
350 {
351 	struct inet_sock *inet = inet_sk(sk);
352 	struct dst_entry *dst;
353 	u32 mtu;
354 
355 	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
356 		return;
357 	mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
358 	dst = inet_csk_update_pmtu(sk, mtu);
359 	if (!dst)
360 		return;
361 
362 	/* Something is about to be wrong... Remember soft error
363 	 * for the case, if this connection will not able to recover.
364 	 */
365 	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
366 		WRITE_ONCE(sk->sk_err_soft, EMSGSIZE);
367 
368 	mtu = dst_mtu(dst);
369 
370 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
371 	    ip_sk_accept_pmtu(sk) &&
372 	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
373 		tcp_sync_mss(sk, mtu);
374 
375 		/* Resend the TCP packet because it's
376 		 * clear that the old packet has been
377 		 * dropped. This is the new "fast" path mtu
378 		 * discovery.
379 		 */
380 		tcp_simple_retransmit(sk);
381 	} /* else let the usual retransmit timer handle it */
382 }
383 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
384 
385 static void do_redirect(struct sk_buff *skb, struct sock *sk)
386 {
387 	struct dst_entry *dst = __sk_dst_check(sk, 0);
388 
389 	if (dst)
390 		dst->ops->redirect(dst, sk, skb);
391 }
392 
393 
394 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
395 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
396 {
397 	struct request_sock *req = inet_reqsk(sk);
398 	struct net *net = sock_net(sk);
399 
400 	/* ICMPs are not backlogged, hence we cannot get
401 	 * an established socket here.
402 	 */
403 	if (seq != tcp_rsk(req)->snt_isn) {
404 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
405 	} else if (abort) {
406 		/*
407 		 * Still in SYN_RECV, just remove it silently.
408 		 * There is no good way to pass the error to the newly
409 		 * created socket, and POSIX does not want network
410 		 * errors returned from accept().
411 		 */
412 		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
413 		tcp_listendrop(req->rsk_listener);
414 	}
415 	reqsk_put(req);
416 }
417 EXPORT_SYMBOL(tcp_req_err);
418 
419 /* TCP-LD (RFC 6069) logic */
420 void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
421 {
422 	struct inet_connection_sock *icsk = inet_csk(sk);
423 	struct tcp_sock *tp = tcp_sk(sk);
424 	struct sk_buff *skb;
425 	s32 remaining;
426 	u32 delta_us;
427 
428 	if (sock_owned_by_user(sk))
429 		return;
430 
431 	if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
432 	    !icsk->icsk_backoff)
433 		return;
434 
435 	skb = tcp_rtx_queue_head(sk);
436 	if (WARN_ON_ONCE(!skb))
437 		return;
438 
439 	icsk->icsk_backoff--;
440 	icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
441 	icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
442 
443 	tcp_mstamp_refresh(tp);
444 	delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
445 	remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
446 
447 	if (remaining > 0) {
448 		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
449 					  remaining, TCP_RTO_MAX);
450 	} else {
451 		/* RTO revert clocked out retransmission.
452 		 * Will retransmit now.
453 		 */
454 		tcp_retransmit_timer(sk);
455 	}
456 }
457 EXPORT_SYMBOL(tcp_ld_RTO_revert);
458 
459 /*
460  * This routine is called by the ICMP module when it gets some
461  * sort of error condition.  If err < 0 then the socket should
462  * be closed and the error returned to the user.  If err > 0
463  * it's just the icmp type << 8 | icmp code.  After adjustment
464  * header points to the first 8 bytes of the tcp header.  We need
465  * to find the appropriate port.
466  *
467  * The locking strategy used here is very "optimistic". When
468  * someone else accesses the socket the ICMP is just dropped
469  * and for some paths there is no check at all.
470  * A more general error queue to queue errors for later handling
471  * is probably better.
472  *
473  */
474 
475 int tcp_v4_err(struct sk_buff *skb, u32 info)
476 {
477 	const struct iphdr *iph = (const struct iphdr *)skb->data;
478 	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
479 	struct tcp_sock *tp;
480 	const int type = icmp_hdr(skb)->type;
481 	const int code = icmp_hdr(skb)->code;
482 	struct sock *sk;
483 	struct request_sock *fastopen;
484 	u32 seq, snd_una;
485 	int err;
486 	struct net *net = dev_net(skb->dev);
487 
488 	sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
489 				       iph->daddr, th->dest, iph->saddr,
490 				       ntohs(th->source), inet_iif(skb), 0);
491 	if (!sk) {
492 		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
493 		return -ENOENT;
494 	}
495 	if (sk->sk_state == TCP_TIME_WAIT) {
496 		inet_twsk_put(inet_twsk(sk));
497 		return 0;
498 	}
499 	seq = ntohl(th->seq);
500 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
501 		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
502 				     type == ICMP_TIME_EXCEEDED ||
503 				     (type == ICMP_DEST_UNREACH &&
504 				      (code == ICMP_NET_UNREACH ||
505 				       code == ICMP_HOST_UNREACH)));
506 		return 0;
507 	}
508 
509 	bh_lock_sock(sk);
510 	/* If too many ICMPs get dropped on busy
511 	 * servers this needs to be solved differently.
512 	 * We do take care of PMTU discovery (RFC1191) special case :
513 	 * we can receive locally generated ICMP messages while socket is held.
514 	 */
515 	if (sock_owned_by_user(sk)) {
516 		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
517 			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
518 	}
519 	if (sk->sk_state == TCP_CLOSE)
520 		goto out;
521 
522 	if (static_branch_unlikely(&ip4_min_ttl)) {
523 		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
524 		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
525 			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
526 			goto out;
527 		}
528 	}
529 
530 	tp = tcp_sk(sk);
531 	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
532 	fastopen = rcu_dereference(tp->fastopen_rsk);
533 	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
534 	if (sk->sk_state != TCP_LISTEN &&
535 	    !between(seq, snd_una, tp->snd_nxt)) {
536 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
537 		goto out;
538 	}
539 
540 	switch (type) {
541 	case ICMP_REDIRECT:
542 		if (!sock_owned_by_user(sk))
543 			do_redirect(skb, sk);
544 		goto out;
545 	case ICMP_SOURCE_QUENCH:
546 		/* Just silently ignore these. */
547 		goto out;
548 	case ICMP_PARAMETERPROB:
549 		err = EPROTO;
550 		break;
551 	case ICMP_DEST_UNREACH:
552 		if (code > NR_ICMP_UNREACH)
553 			goto out;
554 
555 		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
556 			/* We are not interested in TCP_LISTEN and open_requests
557 			 * (SYN-ACKs send out by Linux are always <576bytes so
558 			 * they should go through unfragmented).
559 			 */
560 			if (sk->sk_state == TCP_LISTEN)
561 				goto out;
562 
563 			WRITE_ONCE(tp->mtu_info, info);
564 			if (!sock_owned_by_user(sk)) {
565 				tcp_v4_mtu_reduced(sk);
566 			} else {
567 				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
568 					sock_hold(sk);
569 			}
570 			goto out;
571 		}
572 
573 		err = icmp_err_convert[code].errno;
574 		/* check if this ICMP message allows revert of backoff.
575 		 * (see RFC 6069)
576 		 */
577 		if (!fastopen &&
578 		    (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
579 			tcp_ld_RTO_revert(sk, seq);
580 		break;
581 	case ICMP_TIME_EXCEEDED:
582 		err = EHOSTUNREACH;
583 		break;
584 	default:
585 		goto out;
586 	}
587 
588 	switch (sk->sk_state) {
589 	case TCP_SYN_SENT:
590 	case TCP_SYN_RECV:
591 		/* Only in fast or simultaneous open. If a fast open socket is
592 		 * already accepted it is treated as a connected one below.
593 		 */
594 		if (fastopen && !fastopen->sk)
595 			break;
596 
597 		ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
598 
599 		if (!sock_owned_by_user(sk)) {
600 			WRITE_ONCE(sk->sk_err, err);
601 
602 			sk_error_report(sk);
603 
604 			tcp_done(sk);
605 		} else {
606 			WRITE_ONCE(sk->sk_err_soft, err);
607 		}
608 		goto out;
609 	}
610 
611 	/* If we've already connected we will keep trying
612 	 * until we time out, or the user gives up.
613 	 *
614 	 * rfc1122 4.2.3.9 allows to consider as hard errors
615 	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
616 	 * but it is obsoleted by pmtu discovery).
617 	 *
618 	 * Note, that in modern internet, where routing is unreliable
619 	 * and in each dark corner broken firewalls sit, sending random
620 	 * errors ordered by their masters even this two messages finally lose
621 	 * their original sense (even Linux sends invalid PORT_UNREACHs)
622 	 *
623 	 * Now we are in compliance with RFCs.
624 	 *							--ANK (980905)
625 	 */
626 
627 	if (!sock_owned_by_user(sk) &&
628 	    inet_test_bit(RECVERR, sk)) {
629 		WRITE_ONCE(sk->sk_err, err);
630 		sk_error_report(sk);
631 	} else	{ /* Only an error on timeout */
632 		WRITE_ONCE(sk->sk_err_soft, err);
633 	}
634 
635 out:
636 	bh_unlock_sock(sk);
637 	sock_put(sk);
638 	return 0;
639 }
640 
641 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
642 {
643 	struct tcphdr *th = tcp_hdr(skb);
644 
645 	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
646 	skb->csum_start = skb_transport_header(skb) - skb->head;
647 	skb->csum_offset = offsetof(struct tcphdr, check);
648 }
649 
650 /* This routine computes an IPv4 TCP checksum. */
651 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
652 {
653 	const struct inet_sock *inet = inet_sk(sk);
654 
655 	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
656 }
657 EXPORT_SYMBOL(tcp_v4_send_check);
658 
659 /*
660  *	This routine will send an RST to the other tcp.
661  *
662  *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
663  *		      for reset.
664  *	Answer: if a packet caused RST, it is not for a socket
665  *		existing in our system, if it is matched to a socket,
666  *		it is just duplicate segment or bug in other side's TCP.
667  *		So that we build reply only basing on parameters
668  *		arrived with segment.
669  *	Exception: precedence violation. We do not implement it in any case.
670  */
671 
672 #ifdef CONFIG_TCP_MD5SIG
673 #define OPTION_BYTES TCPOLEN_MD5SIG_ALIGNED
674 #else
675 #define OPTION_BYTES sizeof(__be32)
676 #endif
677 
678 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
679 {
680 	const struct tcphdr *th = tcp_hdr(skb);
681 	struct {
682 		struct tcphdr th;
683 		__be32 opt[OPTION_BYTES / sizeof(__be32)];
684 	} rep;
685 	struct ip_reply_arg arg;
686 #ifdef CONFIG_TCP_MD5SIG
687 	struct tcp_md5sig_key *key = NULL;
688 	const __u8 *hash_location = NULL;
689 	unsigned char newhash[16];
690 	int genhash;
691 	struct sock *sk1 = NULL;
692 #endif
693 	u64 transmit_time = 0;
694 	struct sock *ctl_sk;
695 	struct net *net;
696 	u32 txhash = 0;
697 
698 	/* Never send a reset in response to a reset. */
699 	if (th->rst)
700 		return;
701 
702 	/* If sk not NULL, it means we did a successful lookup and incoming
703 	 * route had to be correct. prequeue might have dropped our dst.
704 	 */
705 	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
706 		return;
707 
708 	/* Swap the send and the receive. */
709 	memset(&rep, 0, sizeof(rep));
710 	rep.th.dest   = th->source;
711 	rep.th.source = th->dest;
712 	rep.th.doff   = sizeof(struct tcphdr) / 4;
713 	rep.th.rst    = 1;
714 
715 	if (th->ack) {
716 		rep.th.seq = th->ack_seq;
717 	} else {
718 		rep.th.ack = 1;
719 		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
720 				       skb->len - (th->doff << 2));
721 	}
722 
723 	memset(&arg, 0, sizeof(arg));
724 	arg.iov[0].iov_base = (unsigned char *)&rep;
725 	arg.iov[0].iov_len  = sizeof(rep.th);
726 
727 	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
728 #ifdef CONFIG_TCP_MD5SIG
729 	rcu_read_lock();
730 	hash_location = tcp_parse_md5sig_option(th);
731 	if (sk && sk_fullsock(sk)) {
732 		const union tcp_md5_addr *addr;
733 		int l3index;
734 
735 		/* sdif set, means packet ingressed via a device
736 		 * in an L3 domain and inet_iif is set to it.
737 		 */
738 		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
739 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
740 		key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
741 	} else if (hash_location) {
742 		const union tcp_md5_addr *addr;
743 		int sdif = tcp_v4_sdif(skb);
744 		int dif = inet_iif(skb);
745 		int l3index;
746 
747 		/*
748 		 * active side is lost. Try to find listening socket through
749 		 * source port, and then find md5 key through listening socket.
750 		 * we are not loose security here:
751 		 * Incoming packet is checked with md5 hash with finding key,
752 		 * no RST generated if md5 hash doesn't match.
753 		 */
754 		sk1 = __inet_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo,
755 					     NULL, 0, ip_hdr(skb)->saddr,
756 					     th->source, ip_hdr(skb)->daddr,
757 					     ntohs(th->source), dif, sdif);
758 		/* don't send rst if it can't find key */
759 		if (!sk1)
760 			goto out;
761 
762 		/* sdif set, means packet ingressed via a device
763 		 * in an L3 domain and dif is set to it.
764 		 */
765 		l3index = sdif ? dif : 0;
766 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
767 		key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
768 		if (!key)
769 			goto out;
770 
771 
772 		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
773 		if (genhash || memcmp(hash_location, newhash, 16) != 0)
774 			goto out;
775 
776 	}
777 
778 	if (key) {
779 		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
780 				   (TCPOPT_NOP << 16) |
781 				   (TCPOPT_MD5SIG << 8) |
782 				   TCPOLEN_MD5SIG);
783 		/* Update length and the length the header thinks exists */
784 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
785 		rep.th.doff = arg.iov[0].iov_len / 4;
786 
787 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
788 				     key, ip_hdr(skb)->saddr,
789 				     ip_hdr(skb)->daddr, &rep.th);
790 	}
791 #endif
792 	/* Can't co-exist with TCPMD5, hence check rep.opt[0] */
793 	if (rep.opt[0] == 0) {
794 		__be32 mrst = mptcp_reset_option(skb);
795 
796 		if (mrst) {
797 			rep.opt[0] = mrst;
798 			arg.iov[0].iov_len += sizeof(mrst);
799 			rep.th.doff = arg.iov[0].iov_len / 4;
800 		}
801 	}
802 
803 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
804 				      ip_hdr(skb)->saddr, /* XXX */
805 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
806 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
807 	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
808 
809 	/* When socket is gone, all binding information is lost.
810 	 * routing might fail in this case. No choice here, if we choose to force
811 	 * input interface, we will misroute in case of asymmetric route.
812 	 */
813 	if (sk) {
814 		arg.bound_dev_if = sk->sk_bound_dev_if;
815 		if (sk_fullsock(sk))
816 			trace_tcp_send_reset(sk, skb);
817 	}
818 
819 	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
820 		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));
821 
822 	arg.tos = ip_hdr(skb)->tos;
823 	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
824 	local_bh_disable();
825 	ctl_sk = this_cpu_read(ipv4_tcp_sk);
826 	sock_net_set(ctl_sk, net);
827 	if (sk) {
828 		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
829 				   inet_twsk(sk)->tw_mark : sk->sk_mark;
830 		ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
831 				   inet_twsk(sk)->tw_priority : sk->sk_priority;
832 		transmit_time = tcp_transmit_time(sk);
833 		xfrm_sk_clone_policy(ctl_sk, sk);
834 		txhash = (sk->sk_state == TCP_TIME_WAIT) ?
835 			 inet_twsk(sk)->tw_txhash : sk->sk_txhash;
836 	} else {
837 		ctl_sk->sk_mark = 0;
838 		ctl_sk->sk_priority = 0;
839 	}
840 	ip_send_unicast_reply(ctl_sk,
841 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
842 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
843 			      &arg, arg.iov[0].iov_len,
844 			      transmit_time, txhash);
845 
846 	xfrm_sk_free_policy(ctl_sk);
847 	sock_net_set(ctl_sk, &init_net);
848 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
849 	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
850 	local_bh_enable();
851 
852 #ifdef CONFIG_TCP_MD5SIG
853 out:
854 	rcu_read_unlock();
855 #endif
856 }
857 
858 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
859    outside socket context is ugly, certainly. What can I do?
860  */
861 
862 static void tcp_v4_send_ack(const struct sock *sk,
863 			    struct sk_buff *skb, u32 seq, u32 ack,
864 			    u32 win, u32 tsval, u32 tsecr, int oif,
865 			    struct tcp_md5sig_key *key,
866 			    int reply_flags, u8 tos, u32 txhash)
867 {
868 	const struct tcphdr *th = tcp_hdr(skb);
869 	struct {
870 		struct tcphdr th;
871 		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
872 #ifdef CONFIG_TCP_MD5SIG
873 			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
874 #endif
875 			];
876 	} rep;
877 	struct net *net = sock_net(sk);
878 	struct ip_reply_arg arg;
879 	struct sock *ctl_sk;
880 	u64 transmit_time;
881 
882 	memset(&rep.th, 0, sizeof(struct tcphdr));
883 	memset(&arg, 0, sizeof(arg));
884 
885 	arg.iov[0].iov_base = (unsigned char *)&rep;
886 	arg.iov[0].iov_len  = sizeof(rep.th);
887 	if (tsecr) {
888 		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
889 				   (TCPOPT_TIMESTAMP << 8) |
890 				   TCPOLEN_TIMESTAMP);
891 		rep.opt[1] = htonl(tsval);
892 		rep.opt[2] = htonl(tsecr);
893 		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
894 	}
895 
896 	/* Swap the send and the receive. */
897 	rep.th.dest    = th->source;
898 	rep.th.source  = th->dest;
899 	rep.th.doff    = arg.iov[0].iov_len / 4;
900 	rep.th.seq     = htonl(seq);
901 	rep.th.ack_seq = htonl(ack);
902 	rep.th.ack     = 1;
903 	rep.th.window  = htons(win);
904 
905 #ifdef CONFIG_TCP_MD5SIG
906 	if (key) {
907 		int offset = (tsecr) ? 3 : 0;
908 
909 		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
910 					  (TCPOPT_NOP << 16) |
911 					  (TCPOPT_MD5SIG << 8) |
912 					  TCPOLEN_MD5SIG);
913 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
914 		rep.th.doff = arg.iov[0].iov_len/4;
915 
916 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
917 				    key, ip_hdr(skb)->saddr,
918 				    ip_hdr(skb)->daddr, &rep.th);
919 	}
920 #endif
921 	arg.flags = reply_flags;
922 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
923 				      ip_hdr(skb)->saddr, /* XXX */
924 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
925 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
926 	if (oif)
927 		arg.bound_dev_if = oif;
928 	arg.tos = tos;
929 	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
930 	local_bh_disable();
931 	ctl_sk = this_cpu_read(ipv4_tcp_sk);
932 	sock_net_set(ctl_sk, net);
933 	ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
934 			   inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark);
935 	ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
936 			   inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
937 	transmit_time = tcp_transmit_time(sk);
938 	ip_send_unicast_reply(ctl_sk,
939 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
940 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
941 			      &arg, arg.iov[0].iov_len,
942 			      transmit_time, txhash);
943 
944 	sock_net_set(ctl_sk, &init_net);
945 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
946 	local_bh_enable();
947 }
948 
949 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
950 {
951 	struct inet_timewait_sock *tw = inet_twsk(sk);
952 	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
953 
954 	tcp_v4_send_ack(sk, skb,
955 			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
956 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
957 			tcp_time_stamp_raw() + tcptw->tw_ts_offset,
958 			tcptw->tw_ts_recent,
959 			tw->tw_bound_dev_if,
960 			tcp_twsk_md5_key(tcptw),
961 			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
962 			tw->tw_tos,
963 			tw->tw_txhash
964 			);
965 
966 	inet_twsk_put(tw);
967 }
968 
969 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
970 				  struct request_sock *req)
971 {
972 	const union tcp_md5_addr *addr;
973 	int l3index;
974 
975 	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
976 	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
977 	 */
978 	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
979 					     tcp_sk(sk)->snd_nxt;
980 
981 	/* RFC 7323 2.3
982 	 * The window field (SEG.WND) of every outgoing segment, with the
983 	 * exception of <SYN> segments, MUST be right-shifted by
984 	 * Rcv.Wind.Shift bits:
985 	 */
986 	addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
987 	l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
988 	tcp_v4_send_ack(sk, skb, seq,
989 			tcp_rsk(req)->rcv_nxt,
990 			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
991 			tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
992 			READ_ONCE(req->ts_recent),
993 			0,
994 			tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
995 			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
996 			ip_hdr(skb)->tos,
997 			READ_ONCE(tcp_rsk(req)->txhash));
998 }
999 
1000 /*
1001  *	Send a SYN-ACK after having received a SYN.
1002  *	This still operates on a request_sock only, not on a big
1003  *	socket.
1004  */
1005 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
1006 			      struct flowi *fl,
1007 			      struct request_sock *req,
1008 			      struct tcp_fastopen_cookie *foc,
1009 			      enum tcp_synack_type synack_type,
1010 			      struct sk_buff *syn_skb)
1011 {
1012 	const struct inet_request_sock *ireq = inet_rsk(req);
1013 	struct flowi4 fl4;
1014 	int err = -1;
1015 	struct sk_buff *skb;
1016 	u8 tos;
1017 
1018 	/* First, grab a route. */
1019 	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
1020 		return -1;
1021 
1022 	skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
1023 
1024 	if (skb) {
1025 		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1026 
1027 		tos = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) ?
1028 				(tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1029 				(inet_sk(sk)->tos & INET_ECN_MASK) :
1030 				inet_sk(sk)->tos;
1031 
1032 		if (!INET_ECN_is_capable(tos) &&
1033 		    tcp_bpf_ca_needs_ecn((struct sock *)req))
1034 			tos |= INET_ECN_ECT_0;
1035 
1036 		rcu_read_lock();
1037 		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1038 					    ireq->ir_rmt_addr,
1039 					    rcu_dereference(ireq->ireq_opt),
1040 					    tos);
1041 		rcu_read_unlock();
1042 		err = net_xmit_eval(err);
1043 	}
1044 
1045 	return err;
1046 }
1047 
1048 /*
1049  *	IPv4 request_sock destructor.
1050  */
1051 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1052 {
1053 	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1054 }
1055 
1056 #ifdef CONFIG_TCP_MD5SIG
/*
 * RFC2385 MD5 checksumming requires a mapping of
 * IP address->MD5 Key.
 * We need to maintain these in the sk structure.
 */

/* Static key bumped by tcp_md5_do_add() and tcp_md5_key_copy() when a
 * socket gets its md5sig_info attached for the first time.
 * NOTE(review): the HZ argument is presumably the deferred-disable
 * timeout of the key - confirm against the jump label API.
 */
DEFINE_STATIC_KEY_DEFERRED_FALSE(tcp_md5_needed, HZ);
EXPORT_SYMBOL(tcp_md5_needed);
1065 
1066 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1067 {
1068 	if (!old)
1069 		return true;
1070 
1071 	/* l3index always overrides non-l3index */
1072 	if (old->l3index && new->l3index == 0)
1073 		return false;
1074 	if (old->l3index == 0 && new->l3index)
1075 		return true;
1076 
1077 	return old->prefixlen < new->prefixlen;
1078 }
1079 
1080 /* Find the Key structure for an address.  */
1081 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1082 					   const union tcp_md5_addr *addr,
1083 					   int family)
1084 {
1085 	const struct tcp_sock *tp = tcp_sk(sk);
1086 	struct tcp_md5sig_key *key;
1087 	const struct tcp_md5sig_info *md5sig;
1088 	__be32 mask;
1089 	struct tcp_md5sig_key *best_match = NULL;
1090 	bool match;
1091 
1092 	/* caller either holds rcu_read_lock() or socket lock */
1093 	md5sig = rcu_dereference_check(tp->md5sig_info,
1094 				       lockdep_sock_is_held(sk));
1095 	if (!md5sig)
1096 		return NULL;
1097 
1098 	hlist_for_each_entry_rcu(key, &md5sig->head, node,
1099 				 lockdep_sock_is_held(sk)) {
1100 		if (key->family != family)
1101 			continue;
1102 		if (key->flags & TCP_MD5SIG_FLAG_IFINDEX && key->l3index != l3index)
1103 			continue;
1104 		if (family == AF_INET) {
1105 			mask = inet_make_mask(key->prefixlen);
1106 			match = (key->addr.a4.s_addr & mask) ==
1107 				(addr->a4.s_addr & mask);
1108 #if IS_ENABLED(CONFIG_IPV6)
1109 		} else if (family == AF_INET6) {
1110 			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1111 						  key->prefixlen);
1112 #endif
1113 		} else {
1114 			match = false;
1115 		}
1116 
1117 		if (match && better_md5_match(best_match, key))
1118 			best_match = key;
1119 	}
1120 	return best_match;
1121 }
1122 EXPORT_SYMBOL(__tcp_md5_do_lookup);
1123 
1124 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1125 						      const union tcp_md5_addr *addr,
1126 						      int family, u8 prefixlen,
1127 						      int l3index, u8 flags)
1128 {
1129 	const struct tcp_sock *tp = tcp_sk(sk);
1130 	struct tcp_md5sig_key *key;
1131 	unsigned int size = sizeof(struct in_addr);
1132 	const struct tcp_md5sig_info *md5sig;
1133 
1134 	/* caller either holds rcu_read_lock() or socket lock */
1135 	md5sig = rcu_dereference_check(tp->md5sig_info,
1136 				       lockdep_sock_is_held(sk));
1137 	if (!md5sig)
1138 		return NULL;
1139 #if IS_ENABLED(CONFIG_IPV6)
1140 	if (family == AF_INET6)
1141 		size = sizeof(struct in6_addr);
1142 #endif
1143 	hlist_for_each_entry_rcu(key, &md5sig->head, node,
1144 				 lockdep_sock_is_held(sk)) {
1145 		if (key->family != family)
1146 			continue;
1147 		if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
1148 			continue;
1149 		if (key->l3index != l3index)
1150 			continue;
1151 		if (!memcmp(&key->addr, addr, size) &&
1152 		    key->prefixlen == prefixlen)
1153 			return key;
1154 	}
1155 	return NULL;
1156 }
1157 
1158 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1159 					 const struct sock *addr_sk)
1160 {
1161 	const union tcp_md5_addr *addr;
1162 	int l3index;
1163 
1164 	l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1165 						 addr_sk->sk_bound_dev_if);
1166 	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1167 	return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1168 }
1169 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1170 
1171 static int tcp_md5sig_info_add(struct sock *sk, gfp_t gfp)
1172 {
1173 	struct tcp_sock *tp = tcp_sk(sk);
1174 	struct tcp_md5sig_info *md5sig;
1175 
1176 	md5sig = kmalloc(sizeof(*md5sig), gfp);
1177 	if (!md5sig)
1178 		return -ENOMEM;
1179 
1180 	sk_gso_disable(sk);
1181 	INIT_HLIST_HEAD(&md5sig->head);
1182 	rcu_assign_pointer(tp->md5sig_info, md5sig);
1183 	return 0;
1184 }
1185 
/* Add a key to the socket's MD5 key list, or update it in place when an
 * exactly matching entry already exists.
 *
 * Called from tcp_md5_do_add() and tcp_md5_key_copy(), which attach
 * tp->md5sig_info before calling here; that pointer is dereferenced
 * without a NULL check when a new entry is linked in.
 *
 * Returns 0 on success, -ENOMEM when the key or the md5sig pool cannot
 * be allocated.
 */
static int __tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
			    int family, u8 prefixlen, int l3index, u8 flags,
			    const u8 *newkey, u8 newkeylen, gfp_t gfp)
{
	/* Add Key to the list */
	struct tcp_md5sig_key *key;
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_info *md5sig;

	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
	if (key) {
		/* Pre-existing entry - just update that one.
		 * Note that the key might be used concurrently.
		 * data_race() is telling kcsan that we do not care of
		 * key mismatches, since changing MD5 key on live flows
		 * can lead to packet drops.
		 */
		data_race(memcpy(key->key, newkey, newkeylen));

		/* Pairs with READ_ONCE() in tcp_md5_hash_key().
		 * Also note that a reader could catch new key->keylen value
		 * but old key->key[], this is the reason we use __GFP_ZERO
		 * at sock_kmalloc() time below these lines.
		 */
		WRITE_ONCE(key->keylen, newkeylen);

		return 0;
	}

	md5sig = rcu_dereference_protected(tp->md5sig_info,
					   lockdep_sock_is_held(sk));

	/* Zeroed allocation: see the keylen/key[] note above. */
	key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
	if (!key)
		return -ENOMEM;
	if (!tcp_alloc_md5sig_pool()) {
		sock_kfree_s(sk, key, sizeof(*key));
		return -ENOMEM;
	}

	/* Fill in the whole entry before it becomes visible on the list. */
	memcpy(key->key, newkey, newkeylen);
	key->keylen = newkeylen;
	key->family = family;
	key->prefixlen = prefixlen;
	key->l3index = l3index;
	key->flags = flags;
	/* Copy only as many address bytes as the family uses. */
	memcpy(&key->addr, addr,
	       (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) :
								 sizeof(struct in_addr));
	hlist_add_head_rcu(&key->node, &md5sig->head);
	return 0;
}
1239 
1240 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1241 		   int family, u8 prefixlen, int l3index, u8 flags,
1242 		   const u8 *newkey, u8 newkeylen)
1243 {
1244 	struct tcp_sock *tp = tcp_sk(sk);
1245 
1246 	if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
1247 		if (tcp_md5sig_info_add(sk, GFP_KERNEL))
1248 			return -ENOMEM;
1249 
1250 		if (!static_branch_inc(&tcp_md5_needed.key)) {
1251 			struct tcp_md5sig_info *md5sig;
1252 
1253 			md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
1254 			rcu_assign_pointer(tp->md5sig_info, NULL);
1255 			kfree_rcu(md5sig, rcu);
1256 			return -EUSERS;
1257 		}
1258 	}
1259 
1260 	return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, flags,
1261 				newkey, newkeylen, GFP_KERNEL);
1262 }
1263 EXPORT_SYMBOL(tcp_md5_do_add);
1264 
1265 int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr,
1266 		     int family, u8 prefixlen, int l3index,
1267 		     struct tcp_md5sig_key *key)
1268 {
1269 	struct tcp_sock *tp = tcp_sk(sk);
1270 
1271 	if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
1272 		if (tcp_md5sig_info_add(sk, sk_gfp_mask(sk, GFP_ATOMIC)))
1273 			return -ENOMEM;
1274 
1275 		if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) {
1276 			struct tcp_md5sig_info *md5sig;
1277 
1278 			md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
1279 			net_warn_ratelimited("Too many TCP-MD5 keys in the system\n");
1280 			rcu_assign_pointer(tp->md5sig_info, NULL);
1281 			kfree_rcu(md5sig, rcu);
1282 			return -EUSERS;
1283 		}
1284 	}
1285 
1286 	return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index,
1287 				key->flags, key->key, key->keylen,
1288 				sk_gfp_mask(sk, GFP_ATOMIC));
1289 }
1290 EXPORT_SYMBOL(tcp_md5_key_copy);
1291 
1292 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1293 		   u8 prefixlen, int l3index, u8 flags)
1294 {
1295 	struct tcp_md5sig_key *key;
1296 
1297 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1298 	if (!key)
1299 		return -ENOENT;
1300 	hlist_del_rcu(&key->node);
1301 	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1302 	kfree_rcu(key, rcu);
1303 	return 0;
1304 }
1305 EXPORT_SYMBOL(tcp_md5_do_del);
1306 
1307 static void tcp_clear_md5_list(struct sock *sk)
1308 {
1309 	struct tcp_sock *tp = tcp_sk(sk);
1310 	struct tcp_md5sig_key *key;
1311 	struct hlist_node *n;
1312 	struct tcp_md5sig_info *md5sig;
1313 
1314 	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1315 
1316 	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1317 		hlist_del_rcu(&key->node);
1318 		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1319 		kfree_rcu(key, rcu);
1320 	}
1321 }
1322 
1323 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1324 				 sockptr_t optval, int optlen)
1325 {
1326 	struct tcp_md5sig cmd;
1327 	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1328 	const union tcp_md5_addr *addr;
1329 	u8 prefixlen = 32;
1330 	int l3index = 0;
1331 	u8 flags;
1332 
1333 	if (optlen < sizeof(cmd))
1334 		return -EINVAL;
1335 
1336 	if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1337 		return -EFAULT;
1338 
1339 	if (sin->sin_family != AF_INET)
1340 		return -EINVAL;
1341 
1342 	flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1343 
1344 	if (optname == TCP_MD5SIG_EXT &&
1345 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1346 		prefixlen = cmd.tcpm_prefixlen;
1347 		if (prefixlen > 32)
1348 			return -EINVAL;
1349 	}
1350 
1351 	if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
1352 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1353 		struct net_device *dev;
1354 
1355 		rcu_read_lock();
1356 		dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1357 		if (dev && netif_is_l3_master(dev))
1358 			l3index = dev->ifindex;
1359 
1360 		rcu_read_unlock();
1361 
1362 		/* ok to reference set/not set outside of rcu;
1363 		 * right now device MUST be an L3 master
1364 		 */
1365 		if (!dev || !l3index)
1366 			return -EINVAL;
1367 	}
1368 
1369 	addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1370 
1371 	if (!cmd.tcpm_keylen)
1372 		return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);
1373 
1374 	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1375 		return -EINVAL;
1376 
1377 	return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
1378 			      cmd.tcpm_key, cmd.tcpm_keylen);
1379 }
1380 
1381 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1382 				   __be32 daddr, __be32 saddr,
1383 				   const struct tcphdr *th, int nbytes)
1384 {
1385 	struct tcp4_pseudohdr *bp;
1386 	struct scatterlist sg;
1387 	struct tcphdr *_th;
1388 
1389 	bp = hp->scratch;
1390 	bp->saddr = saddr;
1391 	bp->daddr = daddr;
1392 	bp->pad = 0;
1393 	bp->protocol = IPPROTO_TCP;
1394 	bp->len = cpu_to_be16(nbytes);
1395 
1396 	_th = (struct tcphdr *)(bp + 1);
1397 	memcpy(_th, th, sizeof(*th));
1398 	_th->check = 0;
1399 
1400 	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1401 	ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1402 				sizeof(*bp) + sizeof(*th));
1403 	return crypto_ahash_update(hp->md5_req);
1404 }
1405 
1406 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1407 			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
1408 {
1409 	struct tcp_md5sig_pool *hp;
1410 	struct ahash_request *req;
1411 
1412 	hp = tcp_get_md5sig_pool();
1413 	if (!hp)
1414 		goto clear_hash_noput;
1415 	req = hp->md5_req;
1416 
1417 	if (crypto_ahash_init(req))
1418 		goto clear_hash;
1419 	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1420 		goto clear_hash;
1421 	if (tcp_md5_hash_key(hp, key))
1422 		goto clear_hash;
1423 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1424 	if (crypto_ahash_final(req))
1425 		goto clear_hash;
1426 
1427 	tcp_put_md5sig_pool();
1428 	return 0;
1429 
1430 clear_hash:
1431 	tcp_put_md5sig_pool();
1432 clear_hash_noput:
1433 	memset(md5_hash, 0, 16);
1434 	return 1;
1435 }
1436 
1437 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1438 			const struct sock *sk,
1439 			const struct sk_buff *skb)
1440 {
1441 	struct tcp_md5sig_pool *hp;
1442 	struct ahash_request *req;
1443 	const struct tcphdr *th = tcp_hdr(skb);
1444 	__be32 saddr, daddr;
1445 
1446 	if (sk) { /* valid for establish/request sockets */
1447 		saddr = sk->sk_rcv_saddr;
1448 		daddr = sk->sk_daddr;
1449 	} else {
1450 		const struct iphdr *iph = ip_hdr(skb);
1451 		saddr = iph->saddr;
1452 		daddr = iph->daddr;
1453 	}
1454 
1455 	hp = tcp_get_md5sig_pool();
1456 	if (!hp)
1457 		goto clear_hash_noput;
1458 	req = hp->md5_req;
1459 
1460 	if (crypto_ahash_init(req))
1461 		goto clear_hash;
1462 
1463 	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1464 		goto clear_hash;
1465 	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1466 		goto clear_hash;
1467 	if (tcp_md5_hash_key(hp, key))
1468 		goto clear_hash;
1469 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1470 	if (crypto_ahash_final(req))
1471 		goto clear_hash;
1472 
1473 	tcp_put_md5sig_pool();
1474 	return 0;
1475 
1476 clear_hash:
1477 	tcp_put_md5sig_pool();
1478 clear_hash_noput:
1479 	memset(md5_hash, 0, 16);
1480 	return 1;
1481 }
1482 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1483 
1484 #endif
1485 
1486 static void tcp_v4_init_req(struct request_sock *req,
1487 			    const struct sock *sk_listener,
1488 			    struct sk_buff *skb)
1489 {
1490 	struct inet_request_sock *ireq = inet_rsk(req);
1491 	struct net *net = sock_net(sk_listener);
1492 
1493 	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1494 	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1495 	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1496 }
1497 
1498 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1499 					  struct sk_buff *skb,
1500 					  struct flowi *fl,
1501 					  struct request_sock *req)
1502 {
1503 	tcp_v4_init_req(req, sk, skb);
1504 
1505 	if (security_inet_conn_request(sk, skb, req))
1506 		return NULL;
1507 
1508 	return inet_csk_route_req(sk, &fl->u.ip4, req);
1509 }
1510 
/* Request sock operations for TCP over IPv4; passed to tcp_conn_request()
 * by tcp_v4_conn_request().
 */
struct request_sock_ops tcp_request_sock_ops __read_mostly = {
	.family		=	PF_INET,
	.obj_size	=	sizeof(struct tcp_request_sock),
	.rtx_syn_ack	=	tcp_rtx_synack,
	.send_ack	=	tcp_v4_reqsk_send_ack,
	.destructor	=	tcp_v4_reqsk_destructor,
	.send_reset	=	tcp_v4_send_reset,
	.syn_ack_timeout =	tcp_syn_ack_timeout,
};
1520 
/* IPv4 specific hooks used while a connection request is processed.
 * The MD5 and syncookie hooks exist only when the matching config option
 * is enabled.
 */
const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
	.mss_clamp	=	TCP_MSS_DEFAULT,
#ifdef CONFIG_TCP_MD5SIG
	.req_md5_lookup	=	tcp_v4_md5_lookup,
	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
#endif
#ifdef CONFIG_SYN_COOKIES
	.cookie_init_seq =	cookie_v4_init_sequence,
#endif
	.route_req	=	tcp_v4_route_req,
	.init_seq	=	tcp_v4_init_seq,
	.init_ts_off	=	tcp_v4_init_ts_off,
	.send_synack	=	tcp_v4_send_synack,
};
1535 
1536 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1537 {
1538 	/* Never answer to SYNs send to broadcast or multicast */
1539 	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1540 		goto drop;
1541 
1542 	return tcp_conn_request(&tcp_request_sock_ops,
1543 				&tcp_request_sock_ipv4_ops, sk, skb);
1544 
1545 drop:
1546 	tcp_listendrop(sk);
1547 	return 0;
1548 }
1549 EXPORT_SYMBOL(tcp_v4_conn_request);
1550 
1551 
/*
 * The three way handshake has completed - we got a valid synack -
 * now create the new socket.
 *
 * Creates the child socket for @req on listener @sk, fills in its IPv4
 * state from the request and from @skb, routes it (unless @dst was
 * supplied), copies a matching MD5 key from the listener, inherits the
 * listener's port and inserts the child into the established hash.
 *
 * *@own_req receives the result of inet_ehash_nolisten().
 *
 * Returns the new socket, or NULL when the accept queue is full, the
 * child cannot be created, or a later step fails; every NULL return made
 * through the exit labels is counted with tcp_listendrop(). NULL is also
 * returned, without that accounting, when hashing finds a duplicate
 * socket while @req_unhash is NULL.
 */
struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct dst_entry *dst,
				  struct request_sock *req_unhash,
				  bool *own_req)
{
	struct inet_request_sock *ireq;
	bool found_dup_sk = false;
	struct inet_sock *newinet;
	struct tcp_sock *newtp;
	struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
	const union tcp_md5_addr *addr;
	struct tcp_md5sig_key *key;
	int l3index;
#endif
	struct ip_options_rcu *inet_opt;

	if (sk_acceptq_is_full(sk))
		goto exit_overflow;

	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit_nonewsk;

	newsk->sk_gso_type = SKB_GSO_TCPV4;
	inet_sk_rx_dst_set(newsk, skb);

	newtp		      = tcp_sk(newsk);
	newinet		      = inet_sk(newsk);
	ireq		      = inet_rsk(req);
	sk_daddr_set(newsk, ireq->ir_rmt_addr);
	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
	newsk->sk_bound_dev_if = ireq->ir_iif;
	newinet->inet_saddr   = ireq->ir_loc_addr;
	/* The child shares the request's saved IP options for now; which of
	 * the two keeps the pointer is settled after hashing below.
	 */
	inet_opt	      = rcu_dereference(ireq->ireq_opt);
	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
	newinet->mc_index     = inet_iif(skb);
	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
	newinet->rcv_tos      = ip_hdr(skb)->tos;
	inet_csk(newsk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
	atomic_set(&newinet->inet_id, get_random_u16());

	/* Set ToS of the new socket based upon the value of incoming SYN.
	 * ECT bits are set later in tcp_init_transfer().
	 */
	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
		newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;

	if (!dst) {
		dst = inet_csk_route_child_sock(sk, newsk, req);
		if (!dst)
			goto put_and_exit;
	} else {
		/* syncookie case : see end of cookie_v4_check() */
	}
	sk_setup_caps(newsk, dst);

	tcp_ca_openreq_child(newsk, dst);

	tcp_sync_mss(newsk, dst_mtu(dst));
	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));

	tcp_initialize_rcv_mss(newsk);

#ifdef CONFIG_TCP_MD5SIG
	l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
	/* Copy over the MD5 key from the original socket */
	addr = (union tcp_md5_addr *)&newinet->inet_daddr;
	key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
	if (key) {
		/* The child gets the key as a /32 entry for its peer. */
		if (tcp_md5_key_copy(newsk, addr, AF_INET, 32, l3index, key))
			goto put_and_exit;
		sk_gso_disable(newsk);
	}
#endif

	if (__inet_inherit_port(sk, newsk) < 0)
		goto put_and_exit;
	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
				       &found_dup_sk);
	if (likely(*own_req)) {
		tcp_move_syn(newtp, req);
		/* The options pointer now belongs to the child only. */
		ireq->ireq_opt = NULL;
	} else {
		/* The options pointer stays with the request only. */
		newinet->inet_opt = NULL;

		if (!req_unhash && found_dup_sk) {
			/* This code path should only be executed in the
			 * syncookie case only
			 */
			bh_unlock_sock(newsk);
			sock_put(newsk);
			newsk = NULL;
		}
	}
	return newsk;

exit_overflow:
	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
exit_nonewsk:
	dst_release(dst);
exit:
	tcp_listendrop(sk);
	return NULL;
put_and_exit:
	/* Detach the shared options from the child before it is closed. */
	newinet->inet_opt = NULL;
	inet_csk_prepare_forced_close(newsk);
	tcp_done(newsk);
	goto exit;
}
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1670 
/* With CONFIG_SYN_COOKIES, pass non-SYN segments through
 * cookie_v4_check() and return its result; otherwise return @sk unchanged.
 */
static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
{
#ifdef CONFIG_SYN_COOKIES
	if (!tcp_hdr(skb)->syn)
		return cookie_v4_check(sk, skb);
#endif
	return sk;
}
1681 
1682 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1683 			 struct tcphdr *th, u32 *cookie)
1684 {
1685 	u16 mss = 0;
1686 #ifdef CONFIG_SYN_COOKIES
1687 	mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1688 				    &tcp_request_sock_ipv4_ops, sk, th);
1689 	if (mss) {
1690 		*cookie = __cookie_v4_init_sequence(iph, th, &mss);
1691 		tcp_synq_overflow(sk);
1692 	}
1693 #endif
1694 	return mss;
1695 }
1696 
INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
							   u32));
/* The socket must have its spinlock held when we get
 * here, unless it is a TCP_LISTEN socket.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 *
 * Always returns 0. Segments that are not consumed by the state
 * machine are freed here with a drop reason.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	enum skb_drop_reason reason;
	struct sock *rsk;

	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
		struct dst_entry *dst;

		dst = rcu_dereference_protected(sk->sk_rx_dst,
						lockdep_sock_is_held(sk));

		sock_rps_save_rxhash(sk, skb);
		sk_mark_napi_id(sk, skb);
		if (dst) {
			/* Drop the cached rx dst when the packet came in on
			 * another interface or the dst fails its check.
			 */
			if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
			    !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
					     dst, 0)) {
				RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
				dst_release(dst);
			}
		}
		tcp_rcv_established(sk, skb);
		return 0;
	}

	reason = SKB_DROP_REASON_NOT_SPECIFIED;
	if (tcp_checksum_complete(skb))
		goto csum_err;

	if (sk->sk_state == TCP_LISTEN) {
		struct sock *nsk = tcp_v4_cookie_check(sk, skb);

		if (!nsk)
			goto discard;
		/* A different socket came back: process the segment on
		 * that child instead of the listener.
		 */
		if (nsk != sk) {
			if (tcp_child_process(sk, nsk, skb)) {
				rsk = nsk;
				goto reset;
			}
			return 0;
		}
	} else
		sock_rps_save_rxhash(sk, skb);

	if (tcp_rcv_state_process(sk, skb)) {
		rsk = sk;
		goto reset;
	}
	return 0;

reset:
	/* rsk is the socket on whose behalf the reset is sent. */
	tcp_v4_send_reset(rsk, skb);
discard:
	kfree_skb_reason(skb, reason);
	/* Be careful here. If this function gets more complicated and
	 * gcc suffers from register pressure on the x86, sk (in %ebx)
	 * might be destroyed here. This current version compiles correctly,
	 * but you have been warned.
	 */
	return 0;

csum_err:
	reason = SKB_DROP_REASON_TCP_CSUM;
	trace_tcp_bad_csum(skb);
	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
	goto discard;
}
EXPORT_SYMBOL(tcp_v4_do_rcv);
1776 
1777 int tcp_v4_early_demux(struct sk_buff *skb)
1778 {
1779 	struct net *net = dev_net(skb->dev);
1780 	const struct iphdr *iph;
1781 	const struct tcphdr *th;
1782 	struct sock *sk;
1783 
1784 	if (skb->pkt_type != PACKET_HOST)
1785 		return 0;
1786 
1787 	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1788 		return 0;
1789 
1790 	iph = ip_hdr(skb);
1791 	th = tcp_hdr(skb);
1792 
1793 	if (th->doff < sizeof(struct tcphdr) / 4)
1794 		return 0;
1795 
1796 	sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
1797 				       iph->saddr, th->source,
1798 				       iph->daddr, ntohs(th->dest),
1799 				       skb->skb_iif, inet_sdif(skb));
1800 	if (sk) {
1801 		skb->sk = sk;
1802 		skb->destructor = sock_edemux;
1803 		if (sk_fullsock(sk)) {
1804 			struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
1805 
1806 			if (dst)
1807 				dst = dst_check(dst, 0);
1808 			if (dst &&
1809 			    sk->sk_rx_dst_ifindex == skb->skb_iif)
1810 				skb_dst_set_noref(skb, dst);
1811 		}
1812 	}
1813 	return 0;
1814 }
1815 
/* Queue @skb on the backlog of @sk, first trying to merge it into the
 * last backlog skb.
 *
 * Returns false when the skb was coalesced into the tail or added to the
 * backlog. Returns true when it was not queued (bad checksum or backlog
 * limit exceeded); in that case the socket has been unlocked with
 * bh_unlock_sock() and *@reason holds the drop reason.
 */
bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
		     enum skb_drop_reason *reason)
{
	u32 limit, tail_gso_size, tail_gso_segs;
	struct skb_shared_info *shinfo;
	const struct tcphdr *th;
	struct tcphdr *thtail;
	struct sk_buff *tail;
	unsigned int hdrlen;
	bool fragstolen;
	u32 gso_segs;
	u32 gso_size;
	int delta;

	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
	 * we can fix skb->truesize to its real value to avoid future drops.
	 * This is valid because skb is not yet charged to the socket.
	 * It has been noticed pure SACK packets were sometimes dropped
	 * (if cooked by drivers without copybreak feature).
	 */
	skb_condense(skb);

	skb_dst_drop(skb);

	if (unlikely(tcp_checksum_complete(skb))) {
		bh_unlock_sock(sk);
		trace_tcp_bad_csum(skb);
		*reason = SKB_DROP_REASON_TCP_CSUM;
		__TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
		__TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
		return true;
	}

	/* Attempt coalescing to last skb in backlog, even if we are
	 * above the limits.
	 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
	 */
	th = (const struct tcphdr *)skb->data;
	hdrlen = th->doff * 4;

	tail = sk->sk_backlog.tail;
	if (!tail)
		goto no_coalesce;
	thtail = (struct tcphdr *)tail->data;

	/* Coalesce only when the new skb directly follows the tail in
	 * sequence space, both carry the same DS field, neither has
	 * SYN/RST/URG, both have ACK, their ECE/CWR bits agree and their
	 * TCP header length and option bytes are identical.
	 */
	if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
	    TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
	    ((TCP_SKB_CB(tail)->tcp_flags |
	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
	    !((TCP_SKB_CB(tail)->tcp_flags &
	      TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
	    ((TCP_SKB_CB(tail)->tcp_flags ^
	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
#ifdef CONFIG_TLS_DEVICE
	    tail->decrypted != skb->decrypted ||
#endif
	    thtail->doff != th->doff ||
	    memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
		goto no_coalesce;

	/* Strip the TCP header so only payload is merged; it is pushed
	 * back below if the merge does not happen.
	 */
	__skb_pull(skb, hdrlen);

	/* Sample GSO values of both skbs before merging; the tail still
	 * includes its TCP header, hence the hdrlen subtraction.
	 */
	shinfo = skb_shinfo(skb);
	gso_size = shinfo->gso_size ?: skb->len;
	gso_segs = shinfo->gso_segs ?: 1;

	shinfo = skb_shinfo(tail);
	tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
	tail_gso_segs = shinfo->gso_segs ?: 1;

	if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
		TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;

		/* Take ack_seq and window from the new skb unless its
		 * ack_seq is older than the tail's.
		 */
		if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
			TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
			thtail->window = th->window;
		}

		/* We have to update both TCP_SKB_CB(tail)->tcp_flags and
		 * thtail->fin, so that the fast path in tcp_rcv_established()
		 * is not entered if we append a packet with a FIN.
		 * SYN, RST, URG are not present.
		 * ACK is set on both packets.
		 * PSH : we do not really care in TCP stack,
		 *       at least for 'GRO' packets.
		 */
		thtail->fin |= th->fin;
		TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;

		if (TCP_SKB_CB(skb)->has_rxtstamp) {
			TCP_SKB_CB(tail)->has_rxtstamp = true;
			tail->tstamp = skb->tstamp;
			skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
		}

		/* Not as strict as GRO. We only need to carry mss max value */
		shinfo->gso_size = max(gso_size, tail_gso_size);
		shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);

		sk->sk_backlog.len += delta;
		__NET_INC_STATS(sock_net(sk),
				LINUX_MIB_TCPBACKLOGCOALESCE);
		kfree_skb_partial(skb, fragstolen);
		return false;
	}
	/* Merge refused: restore the TCP header before queueing. */
	__skb_push(skb, hdrlen);

no_coalesce:
	limit = (u32)READ_ONCE(sk->sk_rcvbuf) + (u32)(READ_ONCE(sk->sk_sndbuf) >> 1);

	/* Only socket owner can try to collapse/prune rx queues
	 * to reduce memory overhead, so add a little headroom here.
	 * Few sockets backlog are possibly concurrently non empty.
	 */
	limit += 64 * 1024;

	if (unlikely(sk_add_backlog(sk, skb, limit))) {
		bh_unlock_sock(sk);
		*reason = SKB_DROP_REASON_SOCKET_BACKLOG;
		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
		return true;
	}
	return false;
}
EXPORT_SYMBOL(tcp_add_backlog);
1941 
1942 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1943 {
1944 	struct tcphdr *th = (struct tcphdr *)skb->data;
1945 
1946 	return sk_filter_trim_cap(sk, skb, th->doff * 4);
1947 }
1948 EXPORT_SYMBOL(tcp_filter);
1949 
1950 static void tcp_v4_restore_cb(struct sk_buff *skb)
1951 {
1952 	memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1953 		sizeof(struct inet_skb_parm));
1954 }
1955 
/* Fill TCP_SKB_CB() of a received segment from its IP and TCP headers,
 * after saving the IPv4 control block inside it.
 */
static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
			   const struct tcphdr *th)
{
	/* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
	 * barrier() makes sure compiler won't play fool^Waliasing games.
	 */
	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
		sizeof(struct inet_skb_parm));
	barrier();

	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
	/* SYN and FIN each take one unit of sequence space on top of the
	 * payload (skb->len minus the TCP header length).
	 */
	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
				    skb->len - th->doff * 4);
	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
	TCP_SKB_CB(skb)->sacked	 = 0;
	/* Set when either a software or a hardware timestamp is present. */
	TCP_SKB_CB(skb)->has_rxtstamp =
			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
}
1977 
1978 /*
1979  *	From tcp_input.c
1980  */
1981 
/* Receive entry point for an IPv4 TCP segment.
 *
 * Validates the TCP header and checksum, looks up the socket matching the
 * segment and dispatches the skb according to the socket state (request
 * socket, listener, timewait, or any other state).
 *
 * Returns 0 on every discard path and when a child socket consumed the
 * segment; otherwise returns the result of tcp_v4_do_rcv(), or 0 when the
 * skb was queued on the socket backlog.
 */
int tcp_v4_rcv(struct sk_buff *skb)
{
	struct net *net = dev_net(skb->dev);
	enum skb_drop_reason drop_reason;
	int sdif = inet_sdif(skb);
	int dif = inet_iif(skb);
	const struct iphdr *iph;
	const struct tcphdr *th;
	bool refcounted;
	struct sock *sk;
	int ret;

	drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
	/* Packets not addressed to this host are dropped before being counted. */
	if (skb->pkt_type != PACKET_HOST)
		goto discard_it;

	/* Count it even if it's bad */
	__TCP_INC_STATS(net, TCP_MIB_INSEGS);

	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
		goto discard_it;

	th = (const struct tcphdr *)skb->data;

	/* doff is in 32-bit words; it cannot be smaller than the base header. */
	if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
		drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
		goto bad_packet;
	}
	if (!pskb_may_pull(skb, th->doff * 4))
		goto discard_it;

	/* An explanation is required here, I think.
	 * Packet length and doff are validated by header prediction,
	 * provided case of th->doff==0 is eliminated.
	 * So, we defer the checks. */

	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
		goto csum_error;

	/* Re-read the header pointers after the pskb_may_pull() calls above. */
	th = (const struct tcphdr *)skb->data;
	iph = ip_hdr(skb);
lookup:
	sk = __inet_lookup_skb(net->ipv4.tcp_death_row.hashinfo,
			       skb, __tcp_hdrlen(th), th->source,
			       th->dest, sdif, &refcounted);
	if (!sk)
		goto no_tcp_socket;

process:
	if (sk->sk_state == TCP_TIME_WAIT)
		goto do_time_wait;

	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		struct request_sock *req = inet_reqsk(sk);
		bool req_stolen = false;
		struct sock *nsk;

		/* From here on, sk is the listener the request belongs to. */
		sk = req->rsk_listener;
		if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
			drop_reason = SKB_DROP_REASON_XFRM_POLICY;
		else
			drop_reason = tcp_inbound_md5_hash(sk, skb,
						   &iph->saddr, &iph->daddr,
						   AF_INET, dif, sdif);
		if (unlikely(drop_reason)) {
			sk_drops_add(sk, skb);
			reqsk_put(req);
			goto discard_it;
		}
		if (tcp_checksum_complete(skb)) {
			reqsk_put(req);
			goto csum_error;
		}
		if (unlikely(sk->sk_state != TCP_LISTEN)) {
			/* The listener is gone; try to migrate the request to
			 * another socket, or drop the request and look up again.
			 */
			nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
			if (!nsk) {
				inet_csk_reqsk_queue_drop_and_put(sk, req);
				goto lookup;
			}
			sk = nsk;
			/* reuseport_migrate_sock() has already held one sk_refcnt
			 * before returning.
			 */
		} else {
			/* We own a reference on the listener, increase it again
			 * as we might lose it too soon.
			 */
			sock_hold(sk);
		}
		refcounted = true;
		nsk = NULL;
		if (!tcp_filter(sk, skb)) {
			/* Header pointers are re-read after the filter ran. */
			th = (const struct tcphdr *)skb->data;
			iph = ip_hdr(skb);
			tcp_v4_fill_cb(skb, iph, th);
			nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
		} else {
			drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
		}
		if (!nsk) {
			reqsk_put(req);
			if (req_stolen) {
				/* Another cpu got exclusive access to req
				 * and created a full blown socket.
				 * Try to feed this packet to this socket
				 * instead of discarding it.
				 */
				tcp_v4_restore_cb(skb);
				sock_put(sk);
				goto lookup;
			}
			goto discard_and_relse;
		}
		nf_reset_ct(skb);
		if (nsk == sk) {
			/* tcp_check_req() returned the listener itself: fall
			 * through to the regular processing below with the IP
			 * control block restored.
			 */
			reqsk_put(req);
			tcp_v4_restore_cb(skb);
		} else if (tcp_child_process(sk, nsk, skb)) {
			tcp_v4_send_reset(nsk, skb);
			goto discard_and_relse;
		} else {
			sock_put(sk);
			return 0;
		}
	}

	if (static_branch_unlikely(&ip4_min_ttl)) {
		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
			drop_reason = SKB_DROP_REASON_TCP_MINTTL;
			goto discard_and_relse;
		}
	}

	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
		drop_reason = SKB_DROP_REASON_XFRM_POLICY;
		goto discard_and_relse;
	}

	drop_reason = tcp_inbound_md5_hash(sk, skb, &iph->saddr,
					   &iph->daddr, AF_INET, dif, sdif);
	if (drop_reason)
		goto discard_and_relse;

	nf_reset_ct(skb);

	if (tcp_filter(sk, skb)) {
		drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
		goto discard_and_relse;
	}
	/* Header pointers are re-read after the filter ran. */
	th = (const struct tcphdr *)skb->data;
	iph = ip_hdr(skb);
	tcp_v4_fill_cb(skb, iph, th);

	skb->dev = NULL;

	/* Listeners are processed without taking the socket lock here. */
	if (sk->sk_state == TCP_LISTEN) {
		ret = tcp_v4_do_rcv(sk, skb);
		goto put_and_return;
	}

	sk_incoming_cpu_update(sk);

	bh_lock_sock_nested(sk);
	tcp_segs_in(tcp_sk(sk), skb);
	ret = 0;
	/* Process directly unless a user context owns the socket, in which
	 * case the skb is queued on the backlog instead.
	 */
	if (!sock_owned_by_user(sk)) {
		ret = tcp_v4_do_rcv(sk, skb);
	} else {
		/* NOTE(review): this path skips bh_unlock_sock(); presumably
		 * tcp_add_backlog() unlocks on failure - confirm in its body.
		 */
		if (tcp_add_backlog(sk, skb, &drop_reason))
			goto discard_and_relse;
	}
	bh_unlock_sock(sk);

put_and_return:
	if (refcounted)
		sock_put(sk);

	return ret;

no_tcp_socket:
	drop_reason = SKB_DROP_REASON_NO_SOCKET;
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
		goto discard_it;

	tcp_v4_fill_cb(skb, iph, th);

	/* The csum_error and bad_packet labels live inside this branch so that
	 * earlier failures share the same statistics accounting.  A segment
	 * with a valid checksum and no socket is answered with a reset.
	 */
	if (tcp_checksum_complete(skb)) {
csum_error:
		drop_reason = SKB_DROP_REASON_TCP_CSUM;
		trace_tcp_bad_csum(skb);
		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
bad_packet:
		__TCP_INC_STATS(net, TCP_MIB_INERRS);
	} else {
		tcp_v4_send_reset(NULL, skb);
	}

discard_it:
	SKB_DR_OR(drop_reason, NOT_SPECIFIED);
	/* Discard frame. */
	kfree_skb_reason(skb, drop_reason);
	return 0;

discard_and_relse:
	sk_drops_add(sk, skb);
	if (refcounted)
		sock_put(sk);
	goto discard_it;

do_time_wait:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
		drop_reason = SKB_DROP_REASON_XFRM_POLICY;
		inet_twsk_put(inet_twsk(sk));
		goto discard_it;
	}

	tcp_v4_fill_cb(skb, iph, th);

	if (tcp_checksum_complete(skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto csum_error;
	}
	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
	case TCP_TW_SYN: {
		/* A SYN hit a timewait socket: if a listener exists, retire
		 * the timewait socket and process the SYN on the listener.
		 */
		struct sock *sk2 = inet_lookup_listener(net,
							net->ipv4.tcp_death_row.hashinfo,
							skb, __tcp_hdrlen(th),
							iph->saddr, th->source,
							iph->daddr, th->dest,
							inet_iif(skb),
							sdif);
		if (sk2) {
			inet_twsk_deschedule_put(inet_twsk(sk));
			sk = sk2;
			tcp_v4_restore_cb(skb);
			refcounted = false;
			goto process;
		}
	}
		/* to ACK */
		fallthrough;
	case TCP_TW_ACK:
		tcp_v4_timewait_ack(sk, skb);
		break;
	case TCP_TW_RST:
		tcp_v4_send_reset(sk, skb);
		inet_twsk_deschedule_put(inet_twsk(sk));
		goto discard_it;
	case TCP_TW_SUCCESS:;
	}
	goto discard_it;
}
2236 
2237 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2238 	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
2239 	.twsk_unique	= tcp_twsk_unique,
2240 	.twsk_destructor= tcp_twsk_destructor,
2241 };
2242 
2243 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2244 {
2245 	struct dst_entry *dst = skb_dst(skb);
2246 
2247 	if (dst && dst_hold_safe(dst)) {
2248 		rcu_assign_pointer(sk->sk_rx_dst, dst);
2249 		sk->sk_rx_dst_ifindex = skb->skb_iif;
2250 	}
2251 }
2252 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2253 
2254 const struct inet_connection_sock_af_ops ipv4_specific = {
2255 	.queue_xmit	   = ip_queue_xmit,
2256 	.send_check	   = tcp_v4_send_check,
2257 	.rebuild_header	   = inet_sk_rebuild_header,
2258 	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
2259 	.conn_request	   = tcp_v4_conn_request,
2260 	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
2261 	.net_header_len	   = sizeof(struct iphdr),
2262 	.setsockopt	   = ip_setsockopt,
2263 	.getsockopt	   = ip_getsockopt,
2264 	.addr2sockaddr	   = inet_csk_addr2sockaddr,
2265 	.sockaddr_len	   = sizeof(struct sockaddr_in),
2266 	.mtu_reduced	   = tcp_v4_mtu_reduced,
2267 };
2268 EXPORT_SYMBOL(ipv4_specific);
2269 
2270 #ifdef CONFIG_TCP_MD5SIG
2271 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2272 	.md5_lookup		= tcp_v4_md5_lookup,
2273 	.calc_md5_hash		= tcp_v4_md5_hash_skb,
2274 	.md5_parse		= tcp_v4_parse_md5_keys,
2275 };
2276 #endif
2277 
2278 /* NOTE: A lot of things set to zero explicitly by call to
2279  *       sk_alloc() so need not be done here.
2280  */
2281 static int tcp_v4_init_sock(struct sock *sk)
2282 {
2283 	struct inet_connection_sock *icsk = inet_csk(sk);
2284 
2285 	tcp_init_sock(sk);
2286 
2287 	icsk->icsk_af_ops = &ipv4_specific;
2288 
2289 #ifdef CONFIG_TCP_MD5SIG
2290 	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2291 #endif
2292 
2293 	return 0;
2294 }
2295 
/* Release the TCP-level resources attached to @sk: timers, congestion
 * control and ULP state, queued skbs, MD5 keys, the bound port and
 * fastopen/saved-SYN data.  The socket accounting counter is decremented
 * last.
 */
void tcp_v4_destroy_sock(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	trace_tcp_destroy_sock(sk);

	tcp_clear_xmit_timers(sk);

	tcp_cleanup_congestion_control(sk);

	tcp_cleanup_ulp(sk);

	/* Clean up the write buffer. */
	tcp_write_queue_purge(sk);

	/* Check if we want to disable active TFO */
	tcp_fastopen_active_disable_ofo_check(sk);

	/* Cleans up our, hopefully empty, out_of_order_queue. */
	skb_rbtree_purge(&tp->out_of_order_queue);

#ifdef CONFIG_TCP_MD5SIG
	/* Clean up the MD5 key list, if any */
	if (tp->md5sig_info) {
		tcp_clear_md5_list(sk);
		/* The info block itself is freed after an RCU grace period. */
		kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
		tp->md5sig_info = NULL;
		static_branch_slow_dec_deferred(&tcp_md5_needed);
	}
#endif

	/* Clean up a referenced TCP bind bucket. */
	if (inet_csk(sk)->icsk_bind_hash)
		inet_put_port(sk);

	/* A fastopen request socket must not still be attached at this point. */
	BUG_ON(rcu_access_pointer(tp->fastopen_rsk));

	/* If socket is aborted during connect operation */
	tcp_free_fastopen_req(tp);
	tcp_fastopen_destroy_cipher(sk);
	tcp_saved_syn_free(tp);

	sk_sockets_allocated_dec(sk);
}
2340 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2341 
2342 #ifdef CONFIG_PROC_FS
2343 /* Proc filesystem TCP sock list dumping. */
2344 
2345 static unsigned short seq_file_family(const struct seq_file *seq);
2346 
2347 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2348 {
2349 	unsigned short family = seq_file_family(seq);
2350 
2351 	/* AF_UNSPEC is used as a match all */
2352 	return ((family == AF_UNSPEC || family == sk->sk_family) &&
2353 		net_eq(sock_net(sk), seq_file_net(seq)));
2354 }
2355 
2356 /* Find a non empty bucket (starting from st->bucket)
2357  * and return the first sk from it.
2358  */
2359 static void *listening_get_first(struct seq_file *seq)
2360 {
2361 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2362 	struct tcp_iter_state *st = seq->private;
2363 
2364 	st->offset = 0;
2365 	for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) {
2366 		struct inet_listen_hashbucket *ilb2;
2367 		struct hlist_nulls_node *node;
2368 		struct sock *sk;
2369 
2370 		ilb2 = &hinfo->lhash2[st->bucket];
2371 		if (hlist_nulls_empty(&ilb2->nulls_head))
2372 			continue;
2373 
2374 		spin_lock(&ilb2->lock);
2375 		sk_nulls_for_each(sk, node, &ilb2->nulls_head) {
2376 			if (seq_sk_match(seq, sk))
2377 				return sk;
2378 		}
2379 		spin_unlock(&ilb2->lock);
2380 	}
2381 
2382 	return NULL;
2383 }
2384 
2385 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
2386  * If "cur" is the last one in the st->bucket,
2387  * call listening_get_first() to return the first sk of the next
2388  * non empty bucket.
2389  */
2390 static void *listening_get_next(struct seq_file *seq, void *cur)
2391 {
2392 	struct tcp_iter_state *st = seq->private;
2393 	struct inet_listen_hashbucket *ilb2;
2394 	struct hlist_nulls_node *node;
2395 	struct inet_hashinfo *hinfo;
2396 	struct sock *sk = cur;
2397 
2398 	++st->num;
2399 	++st->offset;
2400 
2401 	sk = sk_nulls_next(sk);
2402 	sk_nulls_for_each_from(sk, node) {
2403 		if (seq_sk_match(seq, sk))
2404 			return sk;
2405 	}
2406 
2407 	hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2408 	ilb2 = &hinfo->lhash2[st->bucket];
2409 	spin_unlock(&ilb2->lock);
2410 	++st->bucket;
2411 	return listening_get_first(seq);
2412 }
2413 
2414 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2415 {
2416 	struct tcp_iter_state *st = seq->private;
2417 	void *rc;
2418 
2419 	st->bucket = 0;
2420 	st->offset = 0;
2421 	rc = listening_get_first(seq);
2422 
2423 	while (rc && *pos) {
2424 		rc = listening_get_next(seq, rc);
2425 		--*pos;
2426 	}
2427 	return rc;
2428 }
2429 
2430 static inline bool empty_bucket(struct inet_hashinfo *hinfo,
2431 				const struct tcp_iter_state *st)
2432 {
2433 	return hlist_nulls_empty(&hinfo->ehash[st->bucket].chain);
2434 }
2435 
2436 /*
2437  * Get first established socket starting from bucket given in st->bucket.
2438  * If st->bucket is zero, the very first socket in the hash is returned.
2439  */
2440 static void *established_get_first(struct seq_file *seq)
2441 {
2442 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2443 	struct tcp_iter_state *st = seq->private;
2444 
2445 	st->offset = 0;
2446 	for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) {
2447 		struct sock *sk;
2448 		struct hlist_nulls_node *node;
2449 		spinlock_t *lock = inet_ehash_lockp(hinfo, st->bucket);
2450 
2451 		cond_resched();
2452 
2453 		/* Lockless fast path for the common case of empty buckets */
2454 		if (empty_bucket(hinfo, st))
2455 			continue;
2456 
2457 		spin_lock_bh(lock);
2458 		sk_nulls_for_each(sk, node, &hinfo->ehash[st->bucket].chain) {
2459 			if (seq_sk_match(seq, sk))
2460 				return sk;
2461 		}
2462 		spin_unlock_bh(lock);
2463 	}
2464 
2465 	return NULL;
2466 }
2467 
2468 static void *established_get_next(struct seq_file *seq, void *cur)
2469 {
2470 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2471 	struct tcp_iter_state *st = seq->private;
2472 	struct hlist_nulls_node *node;
2473 	struct sock *sk = cur;
2474 
2475 	++st->num;
2476 	++st->offset;
2477 
2478 	sk = sk_nulls_next(sk);
2479 
2480 	sk_nulls_for_each_from(sk, node) {
2481 		if (seq_sk_match(seq, sk))
2482 			return sk;
2483 	}
2484 
2485 	spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2486 	++st->bucket;
2487 	return established_get_first(seq);
2488 }
2489 
2490 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2491 {
2492 	struct tcp_iter_state *st = seq->private;
2493 	void *rc;
2494 
2495 	st->bucket = 0;
2496 	rc = established_get_first(seq);
2497 
2498 	while (rc && pos) {
2499 		rc = established_get_next(seq, rc);
2500 		--pos;
2501 	}
2502 	return rc;
2503 }
2504 
2505 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2506 {
2507 	void *rc;
2508 	struct tcp_iter_state *st = seq->private;
2509 
2510 	st->state = TCP_SEQ_STATE_LISTENING;
2511 	rc	  = listening_get_idx(seq, &pos);
2512 
2513 	if (!rc) {
2514 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2515 		rc	  = established_get_idx(seq, pos);
2516 	}
2517 
2518 	return rc;
2519 }
2520 
/* Try to resume the iteration at the position recorded in the iterator
 * state (st->state, st->bucket, st->offset) instead of rescanning from
 * the beginning.
 *
 * The saved bucket is re-entered and up to 'offset' sockets are skipped
 * while the walk stays inside that bucket.  Returns the socket found, or
 * NULL when nothing is left to iterate.
 */
static void *tcp_seek_last_pos(struct seq_file *seq)
{
	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
	struct tcp_iter_state *st = seq->private;
	int bucket = st->bucket;
	int offset = st->offset;
	int orig_num = st->num;
	void *rc = NULL;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		if (st->bucket > hinfo->lhash2_mask)
			break;
		rc = listening_get_first(seq);
		/* Skip forward only while still in the saved bucket. */
		while (offset-- && rc && bucket == st->bucket)
			rc = listening_get_next(seq, rc);
		if (rc)
			break;
		/* Listening table exhausted: continue with established sockets. */
		st->bucket = 0;
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		fallthrough;
	case TCP_SEQ_STATE_ESTABLISHED:
		if (st->bucket > hinfo->ehash_mask)
			break;
		rc = established_get_first(seq);
		while (offset-- && rc && bucket == st->bucket)
			rc = established_get_next(seq, rc);
	}

	/* The *_get_next() helpers bump st->num; undo that side effect. */
	st->num = orig_num;

	return rc;
}
2554 
2555 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2556 {
2557 	struct tcp_iter_state *st = seq->private;
2558 	void *rc;
2559 
2560 	if (*pos && *pos == st->last_pos) {
2561 		rc = tcp_seek_last_pos(seq);
2562 		if (rc)
2563 			goto out;
2564 	}
2565 
2566 	st->state = TCP_SEQ_STATE_LISTENING;
2567 	st->num = 0;
2568 	st->bucket = 0;
2569 	st->offset = 0;
2570 	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2571 
2572 out:
2573 	st->last_pos = *pos;
2574 	return rc;
2575 }
2576 EXPORT_SYMBOL(tcp_seq_start);
2577 
2578 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2579 {
2580 	struct tcp_iter_state *st = seq->private;
2581 	void *rc = NULL;
2582 
2583 	if (v == SEQ_START_TOKEN) {
2584 		rc = tcp_get_idx(seq, 0);
2585 		goto out;
2586 	}
2587 
2588 	switch (st->state) {
2589 	case TCP_SEQ_STATE_LISTENING:
2590 		rc = listening_get_next(seq, v);
2591 		if (!rc) {
2592 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2593 			st->bucket = 0;
2594 			st->offset = 0;
2595 			rc	  = established_get_first(seq);
2596 		}
2597 		break;
2598 	case TCP_SEQ_STATE_ESTABLISHED:
2599 		rc = established_get_next(seq, v);
2600 		break;
2601 	}
2602 out:
2603 	++*pos;
2604 	st->last_pos = *pos;
2605 	return rc;
2606 }
2607 EXPORT_SYMBOL(tcp_seq_next);
2608 
2609 void tcp_seq_stop(struct seq_file *seq, void *v)
2610 {
2611 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2612 	struct tcp_iter_state *st = seq->private;
2613 
2614 	switch (st->state) {
2615 	case TCP_SEQ_STATE_LISTENING:
2616 		if (v != SEQ_START_TOKEN)
2617 			spin_unlock(&hinfo->lhash2[st->bucket].lock);
2618 		break;
2619 	case TCP_SEQ_STATE_ESTABLISHED:
2620 		if (v)
2621 			spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2622 		break;
2623 	}
2624 }
2625 EXPORT_SYMBOL(tcp_seq_stop);
2626 
2627 static void get_openreq4(const struct request_sock *req,
2628 			 struct seq_file *f, int i)
2629 {
2630 	const struct inet_request_sock *ireq = inet_rsk(req);
2631 	long delta = req->rsk_timer.expires - jiffies;
2632 
2633 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2634 		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2635 		i,
2636 		ireq->ir_loc_addr,
2637 		ireq->ir_num,
2638 		ireq->ir_rmt_addr,
2639 		ntohs(ireq->ir_rmt_port),
2640 		TCP_SYN_RECV,
2641 		0, 0, /* could print option size, but that is af dependent. */
2642 		1,    /* timers active (only the expire timer) */
2643 		jiffies_delta_to_clock_t(delta),
2644 		req->num_timeout,
2645 		from_kuid_munged(seq_user_ns(f),
2646 				 sock_i_uid(req->rsk_listener)),
2647 		0,  /* non standard timer */
2648 		0, /* open_requests have no inode */
2649 		0,
2650 		req);
2651 }
2652 
2653 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2654 {
2655 	int timer_active;
2656 	unsigned long timer_expires;
2657 	const struct tcp_sock *tp = tcp_sk(sk);
2658 	const struct inet_connection_sock *icsk = inet_csk(sk);
2659 	const struct inet_sock *inet = inet_sk(sk);
2660 	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2661 	__be32 dest = inet->inet_daddr;
2662 	__be32 src = inet->inet_rcv_saddr;
2663 	__u16 destp = ntohs(inet->inet_dport);
2664 	__u16 srcp = ntohs(inet->inet_sport);
2665 	int rx_queue;
2666 	int state;
2667 
2668 	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2669 	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2670 	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2671 		timer_active	= 1;
2672 		timer_expires	= icsk->icsk_timeout;
2673 	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2674 		timer_active	= 4;
2675 		timer_expires	= icsk->icsk_timeout;
2676 	} else if (timer_pending(&sk->sk_timer)) {
2677 		timer_active	= 2;
2678 		timer_expires	= sk->sk_timer.expires;
2679 	} else {
2680 		timer_active	= 0;
2681 		timer_expires = jiffies;
2682 	}
2683 
2684 	state = inet_sk_state_load(sk);
2685 	if (state == TCP_LISTEN)
2686 		rx_queue = READ_ONCE(sk->sk_ack_backlog);
2687 	else
2688 		/* Because we don't lock the socket,
2689 		 * we might find a transient negative value.
2690 		 */
2691 		rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2692 				      READ_ONCE(tp->copied_seq), 0);
2693 
2694 	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2695 			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2696 		i, src, srcp, dest, destp, state,
2697 		READ_ONCE(tp->write_seq) - tp->snd_una,
2698 		rx_queue,
2699 		timer_active,
2700 		jiffies_delta_to_clock_t(timer_expires - jiffies),
2701 		icsk->icsk_retransmits,
2702 		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2703 		icsk->icsk_probes_out,
2704 		sock_i_ino(sk),
2705 		refcount_read(&sk->sk_refcnt), sk,
2706 		jiffies_to_clock_t(icsk->icsk_rto),
2707 		jiffies_to_clock_t(icsk->icsk_ack.ato),
2708 		(icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2709 		tcp_snd_cwnd(tp),
2710 		state == TCP_LISTEN ?
2711 		    fastopenq->max_qlen :
2712 		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2713 }
2714 
2715 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2716 			       struct seq_file *f, int i)
2717 {
2718 	long delta = tw->tw_timer.expires - jiffies;
2719 	__be32 dest, src;
2720 	__u16 destp, srcp;
2721 
2722 	dest  = tw->tw_daddr;
2723 	src   = tw->tw_rcv_saddr;
2724 	destp = ntohs(tw->tw_dport);
2725 	srcp  = ntohs(tw->tw_sport);
2726 
2727 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2728 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2729 		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2730 		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2731 		refcount_read(&tw->tw_refcnt), tw);
2732 }
2733 
2734 #define TMPSZ 150
2735 
2736 static int tcp4_seq_show(struct seq_file *seq, void *v)
2737 {
2738 	struct tcp_iter_state *st;
2739 	struct sock *sk = v;
2740 
2741 	seq_setwidth(seq, TMPSZ - 1);
2742 	if (v == SEQ_START_TOKEN) {
2743 		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2744 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2745 			   "inode");
2746 		goto out;
2747 	}
2748 	st = seq->private;
2749 
2750 	if (sk->sk_state == TCP_TIME_WAIT)
2751 		get_timewait4_sock(v, seq, st->num);
2752 	else if (sk->sk_state == TCP_NEW_SYN_RECV)
2753 		get_openreq4(v, seq, st->num);
2754 	else
2755 		get_tcp4_sock(v, seq, st->num);
2756 out:
2757 	seq_pad(seq, '\n');
2758 	return 0;
2759 }
2760 
2761 #ifdef CONFIG_BPF_SYSCALL
/* Private state of the BPF TCP socket iterator.  Sockets of one hash
 * bucket are collected (with a reference held) into @batch and then shown
 * one by one.
 */
struct bpf_tcp_iter_state {
	/* Generic TCP iteration state.
	 * NOTE(review): tcp_seek_last_pos() reads seq->private as a
	 * struct tcp_iter_state, so this member presumably has to stay first.
	 */
	struct tcp_iter_state state;
	/* Index in batch[] of the socket currently being shown. */
	unsigned int cur_sk;
	/* Number of valid entries in batch[]. */
	unsigned int end_sk;
	/* Capacity of batch[], in entries. */
	unsigned int max_sk;
	/* Array of referenced sockets taken from the current bucket. */
	struct sock **batch;
	/* True when the whole current bucket fit into batch[]. */
	bool st_bucket_done;
};
2770 
/* Context passed to the BPF program by tcp_prog_seq_show(). */
struct bpf_iter__tcp {
	/* Iterator metadata for the current invocation. */
	__bpf_md_ptr(struct bpf_iter_meta *, meta);
	/* Socket being visited. */
	__bpf_md_ptr(struct sock_common *, sk_common);
	/* Owner uid reported for the socket. */
	uid_t uid __aligned(8);
};
2776 
2777 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
2778 			     struct sock_common *sk_common, uid_t uid)
2779 {
2780 	struct bpf_iter__tcp ctx;
2781 
2782 	meta->seq_num--;  /* skip SEQ_START_TOKEN */
2783 	ctx.meta = meta;
2784 	ctx.sk_common = sk_common;
2785 	ctx.uid = uid;
2786 	return bpf_iter_run_prog(prog, &ctx);
2787 }
2788 
2789 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
2790 {
2791 	while (iter->cur_sk < iter->end_sk)
2792 		sock_gen_put(iter->batch[iter->cur_sk++]);
2793 }
2794 
2795 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
2796 				      unsigned int new_batch_sz)
2797 {
2798 	struct sock **new_batch;
2799 
2800 	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
2801 			     GFP_USER | __GFP_NOWARN);
2802 	if (!new_batch)
2803 		return -ENOMEM;
2804 
2805 	bpf_iter_tcp_put_batch(iter);
2806 	kvfree(iter->batch);
2807 	iter->batch = new_batch;
2808 	iter->max_sk = new_batch_sz;
2809 
2810 	return 0;
2811 }
2812 
2813 static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
2814 						 struct sock *start_sk)
2815 {
2816 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2817 	struct bpf_tcp_iter_state *iter = seq->private;
2818 	struct tcp_iter_state *st = &iter->state;
2819 	struct hlist_nulls_node *node;
2820 	unsigned int expected = 1;
2821 	struct sock *sk;
2822 
2823 	sock_hold(start_sk);
2824 	iter->batch[iter->end_sk++] = start_sk;
2825 
2826 	sk = sk_nulls_next(start_sk);
2827 	sk_nulls_for_each_from(sk, node) {
2828 		if (seq_sk_match(seq, sk)) {
2829 			if (iter->end_sk < iter->max_sk) {
2830 				sock_hold(sk);
2831 				iter->batch[iter->end_sk++] = sk;
2832 			}
2833 			expected++;
2834 		}
2835 	}
2836 	spin_unlock(&hinfo->lhash2[st->bucket].lock);
2837 
2838 	return expected;
2839 }
2840 
2841 static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
2842 						   struct sock *start_sk)
2843 {
2844 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2845 	struct bpf_tcp_iter_state *iter = seq->private;
2846 	struct tcp_iter_state *st = &iter->state;
2847 	struct hlist_nulls_node *node;
2848 	unsigned int expected = 1;
2849 	struct sock *sk;
2850 
2851 	sock_hold(start_sk);
2852 	iter->batch[iter->end_sk++] = start_sk;
2853 
2854 	sk = sk_nulls_next(start_sk);
2855 	sk_nulls_for_each_from(sk, node) {
2856 		if (seq_sk_match(seq, sk)) {
2857 			if (iter->end_sk < iter->max_sk) {
2858 				sock_hold(sk);
2859 				iter->batch[iter->end_sk++] = sk;
2860 			}
2861 			expected++;
2862 		}
2863 	}
2864 	spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2865 
2866 	return expected;
2867 }
2868 
/* Fill the iterator batch with the sockets of the next bucket to visit
 * and return the first of them, or NULL when the iteration is complete.
 *
 * If the bucket holds more matching sockets than the batch array can
 * store, the array is grown once and the bucket is batched again.  When
 * growing fails, or the second attempt still does not fit, the partial
 * batch is returned with st_bucket_done left false.
 */
static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
{
	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
	struct bpf_tcp_iter_state *iter = seq->private;
	struct tcp_iter_state *st = &iter->state;
	unsigned int expected;
	bool resized = false;
	struct sock *sk;

	/* The st->bucket is done.  Directly advance to the next
	 * bucket instead of having the tcp_seek_last_pos() to skip
	 * one by one in the current bucket and eventually find out
	 * it has to advance to the next bucket.
	 */
	if (iter->st_bucket_done) {
		st->offset = 0;
		st->bucket++;
		/* Past the last listening bucket: move to established ones. */
		if (st->state == TCP_SEQ_STATE_LISTENING &&
		    st->bucket > hinfo->lhash2_mask) {
			st->state = TCP_SEQ_STATE_ESTABLISHED;
			st->bucket = 0;
		}
	}

again:
	/* Get a new batch */
	iter->cur_sk = 0;
	iter->end_sk = 0;
	iter->st_bucket_done = false;

	sk = tcp_seek_last_pos(seq);
	if (!sk)
		return NULL; /* Done */

	/* Both helpers release the bucket lock taken by tcp_seek_last_pos(). */
	if (st->state == TCP_SEQ_STATE_LISTENING)
		expected = bpf_iter_tcp_listening_batch(seq, sk);
	else
		expected = bpf_iter_tcp_established_batch(seq, sk);

	/* Every matching socket of the bucket made it into the batch. */
	if (iter->end_sk == expected) {
		iter->st_bucket_done = true;
		return sk;
	}

	/* Batch too small: grow it to 1.5x the needed size and retry once.
	 * A successful realloc drops the references taken above.
	 */
	if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) {
		resized = true;
		goto again;
	}

	return sk;
}
2920 
2921 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
2922 {
2923 	/* bpf iter does not support lseek, so it always
2924 	 * continue from where it was stop()-ped.
2925 	 */
2926 	if (*pos)
2927 		return bpf_iter_tcp_batch(seq);
2928 
2929 	return SEQ_START_TOKEN;
2930 }
2931 
2932 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2933 {
2934 	struct bpf_tcp_iter_state *iter = seq->private;
2935 	struct tcp_iter_state *st = &iter->state;
2936 	struct sock *sk;
2937 
2938 	/* Whenever seq_next() is called, the iter->cur_sk is
2939 	 * done with seq_show(), so advance to the next sk in
2940 	 * the batch.
2941 	 */
2942 	if (iter->cur_sk < iter->end_sk) {
2943 		/* Keeping st->num consistent in tcp_iter_state.
2944 		 * bpf_iter_tcp does not use st->num.
2945 		 * meta.seq_num is used instead.
2946 		 */
2947 		st->num++;
2948 		/* Move st->offset to the next sk in the bucket such that
2949 		 * the future start() will resume at st->offset in
2950 		 * st->bucket.  See tcp_seek_last_pos().
2951 		 */
2952 		st->offset++;
2953 		sock_gen_put(iter->batch[iter->cur_sk++]);
2954 	}
2955 
2956 	if (iter->cur_sk < iter->end_sk)
2957 		sk = iter->batch[iter->cur_sk];
2958 	else
2959 		sk = bpf_iter_tcp_batch(seq);
2960 
2961 	++*pos;
2962 	/* Keeping st->last_pos consistent in tcp_iter_state.
2963 	 * bpf iter does not do lseek, so st->last_pos always equals to *pos.
2964 	 */
2965 	st->last_pos = *pos;
2966 	return sk;
2967 }
2968 
2969 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
2970 {
2971 	struct bpf_iter_meta meta;
2972 	struct bpf_prog *prog;
2973 	struct sock *sk = v;
2974 	uid_t uid;
2975 	int ret;
2976 
2977 	if (v == SEQ_START_TOKEN)
2978 		return 0;
2979 
2980 	if (sk_fullsock(sk))
2981 		lock_sock(sk);
2982 
2983 	if (unlikely(sk_unhashed(sk))) {
2984 		ret = SEQ_SKIP;
2985 		goto unlock;
2986 	}
2987 
2988 	if (sk->sk_state == TCP_TIME_WAIT) {
2989 		uid = 0;
2990 	} else if (sk->sk_state == TCP_NEW_SYN_RECV) {
2991 		const struct request_sock *req = v;
2992 
2993 		uid = from_kuid_munged(seq_user_ns(seq),
2994 				       sock_i_uid(req->rsk_listener));
2995 	} else {
2996 		uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
2997 	}
2998 
2999 	meta.seq = seq;
3000 	prog = bpf_iter_get_info(&meta, false);
3001 	ret = tcp_prog_seq_show(prog, &meta, v, uid);
3002 
3003 unlock:
3004 	if (sk_fullsock(sk))
3005 		release_sock(sk);
3006 	return ret;
3007 
3008 }
3009 
3010 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
3011 {
3012 	struct bpf_tcp_iter_state *iter = seq->private;
3013 	struct bpf_iter_meta meta;
3014 	struct bpf_prog *prog;
3015 
3016 	if (!v) {
3017 		meta.seq = seq;
3018 		prog = bpf_iter_get_info(&meta, true);
3019 		if (prog)
3020 			(void)tcp_prog_seq_show(prog, &meta, v, 0);
3021 	}
3022 
3023 	if (iter->cur_sk < iter->end_sk) {
3024 		bpf_iter_tcp_put_batch(iter);
3025 		iter->st_bucket_done = false;
3026 	}
3027 }
3028 
3029 static const struct seq_operations bpf_iter_tcp_seq_ops = {
3030 	.show		= bpf_iter_tcp_seq_show,
3031 	.start		= bpf_iter_tcp_seq_start,
3032 	.next		= bpf_iter_tcp_seq_next,
3033 	.stop		= bpf_iter_tcp_seq_stop,
3034 };
3035 #endif
3036 static unsigned short seq_file_family(const struct seq_file *seq)
3037 {
3038 	const struct tcp_seq_afinfo *afinfo;
3039 
3040 #ifdef CONFIG_BPF_SYSCALL
3041 	/* Iterated from bpf_iter.  Let the bpf prog to filter instead. */
3042 	if (seq->op == &bpf_iter_tcp_seq_ops)
3043 		return AF_UNSPEC;
3044 #endif
3045 
3046 	/* Iterated from proc fs */
3047 	afinfo = pde_data(file_inode(seq->file));
3048 	return afinfo->family;
3049 }
3050 
3051 static const struct seq_operations tcp4_seq_ops = {
3052 	.show		= tcp4_seq_show,
3053 	.start		= tcp_seq_start,
3054 	.next		= tcp_seq_next,
3055 	.stop		= tcp_seq_stop,
3056 };
3057 
3058 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
3059 	.family		= AF_INET,
3060 };
3061 
3062 static int __net_init tcp4_proc_init_net(struct net *net)
3063 {
3064 	if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
3065 			sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
3066 		return -ENOMEM;
3067 	return 0;
3068 }
3069 
/* Remove the per-namespace "tcp" proc entry created by
 * tcp4_proc_init_net().
 */
static void __net_exit tcp4_proc_exit_net(struct net *net)
{
	remove_proc_entry("tcp", net->proc_net);
}
3074 
/* Per-network-namespace hooks that create and remove the "tcp" proc entry. */
static struct pernet_operations tcp4_net_ops = {
	.init = tcp4_proc_init_net,
	.exit = tcp4_proc_exit_net,
};
3079 
/* Register the per-namespace proc operations for IPv4 TCP.
 * Returns the result of register_pernet_subsys().
 */
int __init tcp4_proc_init(void)
{
	return register_pernet_subsys(&tcp4_net_ops);
}
3084 
/* Unregister the per-namespace proc operations for IPv4 TCP. */
void tcp4_proc_exit(void)
{
	unregister_pernet_subsys(&tcp4_net_ops);
}
3089 #endif /* CONFIG_PROC_FS */
3090 
3091 /* @wake is one when sk_stream_write_space() calls us.
3092  * This sends EPOLLOUT only if notsent_bytes is half the limit.
3093  * This mimics the strategy used in sock_def_write_space().
3094  */
3095 bool tcp_stream_memory_free(const struct sock *sk, int wake)
3096 {
3097 	const struct tcp_sock *tp = tcp_sk(sk);
3098 	u32 notsent_bytes = READ_ONCE(tp->write_seq) -
3099 			    READ_ONCE(tp->snd_nxt);
3100 
3101 	return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
3102 }
3103 EXPORT_SYMBOL(tcp_stream_memory_free);
3104 
3105 struct proto tcp_prot = {
3106 	.name			= "TCP",
3107 	.owner			= THIS_MODULE,
3108 	.close			= tcp_close,
3109 	.pre_connect		= tcp_v4_pre_connect,
3110 	.connect		= tcp_v4_connect,
3111 	.disconnect		= tcp_disconnect,
3112 	.accept			= inet_csk_accept,
3113 	.ioctl			= tcp_ioctl,
3114 	.init			= tcp_v4_init_sock,
3115 	.destroy		= tcp_v4_destroy_sock,
3116 	.shutdown		= tcp_shutdown,
3117 	.setsockopt		= tcp_setsockopt,
3118 	.getsockopt		= tcp_getsockopt,
3119 	.bpf_bypass_getsockopt	= tcp_bpf_bypass_getsockopt,
3120 	.keepalive		= tcp_set_keepalive,
3121 	.recvmsg		= tcp_recvmsg,
3122 	.sendmsg		= tcp_sendmsg,
3123 	.splice_eof		= tcp_splice_eof,
3124 	.backlog_rcv		= tcp_v4_do_rcv,
3125 	.release_cb		= tcp_release_cb,
3126 	.hash			= inet_hash,
3127 	.unhash			= inet_unhash,
3128 	.get_port		= inet_csk_get_port,
3129 	.put_port		= inet_put_port,
3130 #ifdef CONFIG_BPF_SYSCALL
3131 	.psock_update_sk_prot	= tcp_bpf_update_proto,
3132 #endif
3133 	.enter_memory_pressure	= tcp_enter_memory_pressure,
3134 	.leave_memory_pressure	= tcp_leave_memory_pressure,
3135 	.stream_memory_free	= tcp_stream_memory_free,
3136 	.sockets_allocated	= &tcp_sockets_allocated,
3137 	.orphan_count		= &tcp_orphan_count,
3138 
3139 	.memory_allocated	= &tcp_memory_allocated,
3140 	.per_cpu_fw_alloc	= &tcp_memory_per_cpu_fw_alloc,
3141 
3142 	.memory_pressure	= &tcp_memory_pressure,
3143 	.sysctl_mem		= sysctl_tcp_mem,
3144 	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
3145 	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
3146 	.max_header		= MAX_TCP_HEADER,
3147 	.obj_size		= sizeof(struct tcp_sock),
3148 	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
3149 	.twsk_prot		= &tcp_timewait_sock_ops,
3150 	.rsk_prot		= &tcp_request_sock_ops,
3151 	.h.hashinfo		= NULL,
3152 	.no_autobind		= true,
3153 	.diag_destroy		= tcp_abort,
3154 };
3155 EXPORT_SYMBOL(tcp_prot);
3156 
3157 static void __net_exit tcp_sk_exit(struct net *net)
3158 {
3159 	if (net->ipv4.tcp_congestion_control)
3160 		bpf_module_put(net->ipv4.tcp_congestion_control,
3161 			       net->ipv4.tcp_congestion_control->owner);
3162 }
3163 
3164 static void __net_init tcp_set_hashinfo(struct net *net)
3165 {
3166 	struct inet_hashinfo *hinfo;
3167 	unsigned int ehash_entries;
3168 	struct net *old_net;
3169 
3170 	if (net_eq(net, &init_net))
3171 		goto fallback;
3172 
3173 	old_net = current->nsproxy->net_ns;
3174 	ehash_entries = READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries);
3175 	if (!ehash_entries)
3176 		goto fallback;
3177 
3178 	ehash_entries = roundup_pow_of_two(ehash_entries);
3179 	hinfo = inet_pernet_hashinfo_alloc(&tcp_hashinfo, ehash_entries);
3180 	if (!hinfo) {
3181 		pr_warn("Failed to allocate TCP ehash (entries: %u) "
3182 			"for a netns, fallback to the global one\n",
3183 			ehash_entries);
3184 fallback:
3185 		hinfo = &tcp_hashinfo;
3186 		ehash_entries = tcp_hashinfo.ehash_mask + 1;
3187 	}
3188 
3189 	net->ipv4.tcp_death_row.hashinfo = hinfo;
3190 	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2;
3191 	net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128);
3192 }
3193 
3194 static int __net_init tcp_sk_init(struct net *net)
3195 {
3196 	net->ipv4.sysctl_tcp_ecn = 2;
3197 	net->ipv4.sysctl_tcp_ecn_fallback = 1;
3198 
3199 	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
3200 	net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
3201 	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
3202 	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
3203 	net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
3204 
3205 	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
3206 	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
3207 	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
3208 
3209 	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
3210 	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
3211 	net->ipv4.sysctl_tcp_syncookies = 1;
3212 	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
3213 	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
3214 	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
3215 	net->ipv4.sysctl_tcp_orphan_retries = 0;
3216 	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
3217 	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
3218 	net->ipv4.sysctl_tcp_tw_reuse = 2;
3219 	net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
3220 
3221 	refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1);
3222 	tcp_set_hashinfo(net);
3223 
3224 	net->ipv4.sysctl_tcp_sack = 1;
3225 	net->ipv4.sysctl_tcp_window_scaling = 1;
3226 	net->ipv4.sysctl_tcp_timestamps = 1;
3227 	net->ipv4.sysctl_tcp_early_retrans = 3;
3228 	net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
3229 	net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
3230 	net->ipv4.sysctl_tcp_retrans_collapse = 1;
3231 	net->ipv4.sysctl_tcp_max_reordering = 300;
3232 	net->ipv4.sysctl_tcp_dsack = 1;
3233 	net->ipv4.sysctl_tcp_app_win = 31;
3234 	net->ipv4.sysctl_tcp_adv_win_scale = 1;
3235 	net->ipv4.sysctl_tcp_frto = 2;
3236 	net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
3237 	/* This limits the percentage of the congestion window which we
3238 	 * will allow a single TSO frame to consume.  Building TSO frames
3239 	 * which are too large can cause TCP streams to be bursty.
3240 	 */
3241 	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
3242 	/* Default TSQ limit of 16 TSO segments */
3243 	net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
3244 
3245 	/* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. */
3246 	net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX;
3247 
3248 	net->ipv4.sysctl_tcp_min_tso_segs = 2;
3249 	net->ipv4.sysctl_tcp_tso_rtt_log = 9;  /* 2^9 = 512 usec */
3250 	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
3251 	net->ipv4.sysctl_tcp_autocorking = 1;
3252 	net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
3253 	net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
3254 	net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
3255 	if (net != &init_net) {
3256 		memcpy(net->ipv4.sysctl_tcp_rmem,
3257 		       init_net.ipv4.sysctl_tcp_rmem,
3258 		       sizeof(init_net.ipv4.sysctl_tcp_rmem));
3259 		memcpy(net->ipv4.sysctl_tcp_wmem,
3260 		       init_net.ipv4.sysctl_tcp_wmem,
3261 		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
3262 	}
3263 	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
3264 	net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
3265 	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
3266 	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
3267 	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
3268 	atomic_set(&net->ipv4.tfo_active_disable_times, 0);
3269 
3270 	/* Set default values for PLB */
3271 	net->ipv4.sysctl_tcp_plb_enabled = 0; /* Disabled by default */
3272 	net->ipv4.sysctl_tcp_plb_idle_rehash_rounds = 3;
3273 	net->ipv4.sysctl_tcp_plb_rehash_rounds = 12;
3274 	net->ipv4.sysctl_tcp_plb_suspend_rto_sec = 60;
3275 	/* Default congestion threshold for PLB to mark a round is 50% */
3276 	net->ipv4.sysctl_tcp_plb_cong_thresh = (1 << TCP_PLB_SCALE) / 2;
3277 
3278 	/* Reno is always built in */
3279 	if (!net_eq(net, &init_net) &&
3280 	    bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
3281 			       init_net.ipv4.tcp_congestion_control->owner))
3282 		net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
3283 	else
3284 		net->ipv4.tcp_congestion_control = &tcp_reno;
3285 
3286 	net->ipv4.sysctl_tcp_syn_linear_timeouts = 4;
3287 	net->ipv4.sysctl_tcp_shrink_window = 0;
3288 
3289 	return 0;
3290 }
3291 
3292 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
3293 {
3294 	struct net *net;
3295 
3296 	tcp_twsk_purge(net_exit_list, AF_INET);
3297 
3298 	list_for_each_entry(net, net_exit_list, exit_list) {
3299 		inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo);
3300 		WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount));
3301 		tcp_fastopen_ctx_destroy(net);
3302 	}
3303 }
3304 
3305 static struct pernet_operations __net_initdata tcp_sk_ops = {
3306        .init	   = tcp_sk_init,
3307        .exit	   = tcp_sk_exit,
3308        .exit_batch = tcp_sk_exit_batch,
3309 };
3310 
3311 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3312 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
3313 		     struct sock_common *sk_common, uid_t uid)
3314 
/* Batch size requested when a tcp bpf iterator is initialised. */
#define INIT_BATCH_SZ 16

/* Set up the private state of a tcp bpf iterator.
 *
 * @priv_data: iterator private area, used as a struct bpf_tcp_iter_state.
 * @aux:       auxiliary info forwarded to bpf_iter_init_seq_net().
 *
 * Initialises the seq_net part, then sizes the batch to INIT_BATCH_SZ.  If
 * the batch cannot be sized, the seq_net part is torn down again.
 *
 * Returns 0 on success or the error of the step that failed.
 */
static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
{
	struct bpf_tcp_iter_state *iter = priv_data;
	int err = bpf_iter_init_seq_net(priv_data, aux);

	if (!err) {
		err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ);
		if (err)
			bpf_iter_fini_seq_net(priv_data);
	}

	return err;
}
3334 
3335 static void bpf_iter_fini_tcp(void *priv_data)
3336 {
3337 	struct bpf_tcp_iter_state *iter = priv_data;
3338 
3339 	bpf_iter_fini_seq_net(priv_data);
3340 	kvfree(iter->batch);
3341 }
3342 
3343 static const struct bpf_iter_seq_info tcp_seq_info = {
3344 	.seq_ops		= &bpf_iter_tcp_seq_ops,
3345 	.init_seq_private	= bpf_iter_init_tcp,
3346 	.fini_seq_private	= bpf_iter_fini_tcp,
3347 	.seq_priv_size		= sizeof(struct bpf_tcp_iter_state),
3348 };
3349 
3350 static const struct bpf_func_proto *
3351 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
3352 			    const struct bpf_prog *prog)
3353 {
3354 	switch (func_id) {
3355 	case BPF_FUNC_setsockopt:
3356 		return &bpf_sk_setsockopt_proto;
3357 	case BPF_FUNC_getsockopt:
3358 		return &bpf_sk_getsockopt_proto;
3359 	default:
3360 		return NULL;
3361 	}
3362 }
3363 
3364 static struct bpf_iter_reg tcp_reg_info = {
3365 	.target			= "tcp",
3366 	.ctx_arg_info_size	= 1,
3367 	.ctx_arg_info		= {
3368 		{ offsetof(struct bpf_iter__tcp, sk_common),
3369 		  PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED },
3370 	},
3371 	.get_func_proto		= bpf_iter_tcp_get_func_proto,
3372 	.seq_info		= &tcp_seq_info,
3373 };
3374 
3375 static void __init bpf_iter_register(void)
3376 {
3377 	tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3378 	if (bpf_iter_reg_target(&tcp_reg_info))
3379 		pr_warn("Warning: could not register bpf iterator tcp\n");
3380 }
3381 
3382 #endif
3383 
3384 void __init tcp_v4_init(void)
3385 {
3386 	int cpu, res;
3387 
3388 	for_each_possible_cpu(cpu) {
3389 		struct sock *sk;
3390 
3391 		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
3392 					   IPPROTO_TCP, &init_net);
3393 		if (res)
3394 			panic("Failed to create the TCP control socket.\n");
3395 		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
3396 
3397 		/* Please enforce IP_DF and IPID==0 for RST and
3398 		 * ACK sent in SYN-RECV and TIME-WAIT state.
3399 		 */
3400 		inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
3401 
3402 		per_cpu(ipv4_tcp_sk, cpu) = sk;
3403 	}
3404 	if (register_pernet_subsys(&tcp_sk_ops))
3405 		panic("Failed to create the TCP control socket.\n");
3406 
3407 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3408 	bpf_iter_register();
3409 #endif
3410 }
3411