xref: /openbmc/linux/net/ipv4/tcp_ipv4.c (revision dd21bfa4)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		Implementation of the Transmission Control Protocol(TCP).
8  *
9  *		IPv4 specific functions
10  *
11  *		code split from:
12  *		linux/ipv4/tcp.c
13  *		linux/ipv4/tcp_input.c
14  *		linux/ipv4/tcp_output.c
15  *
16  *		See tcp.c for author information
17  */
18 
19 /*
20  * Changes:
21  *		David S. Miller	:	New socket lookup architecture.
22  *					This code is dedicated to John Dyson.
23  *		David S. Miller :	Change semantics of established hash,
24  *					half is devoted to TIME_WAIT sockets
25  *					and the rest go in the other half.
26  *		Andi Kleen :		Add support for syncookies and fixed
27  *					some bugs: ip options weren't passed to
28  *					the TCP layer, missed a check for an
29  *					ACK bit.
30  *		Andi Kleen :		Implemented fast path mtu discovery.
31  *	     				Fixed many serious bugs in the
32  *					request_sock handling and moved
33  *					most of it into the af independent code.
34  *					Added tail drop and some other bugfixes.
35  *					Added new listen semantics.
36  *		Mike McLagan	:	Routing by source
37  *	Juan Jose Ciarlante:		ip_dynaddr bits
38  *		Andi Kleen:		various fixes.
39  *	Vitaly E. Lavrov	:	Transparent proxy revived after year
40  *					coma.
41  *	Andi Kleen		:	Fix new listen.
42  *	Andi Kleen		:	Fix accept error reporting.
43  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
44  *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
45  *					a single port at the same time.
46  */
47 
48 #define pr_fmt(fmt) "TCP: " fmt
49 
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
60 
61 #include <net/net_namespace.h>
62 #include <net/icmp.h>
63 #include <net/inet_hashtables.h>
64 #include <net/tcp.h>
65 #include <net/transp_v6.h>
66 #include <net/ipv6.h>
67 #include <net/inet_common.h>
68 #include <net/timewait_sock.h>
69 #include <net/xfrm.h>
70 #include <net/secure_seq.h>
71 #include <net/busy_poll.h>
72 
73 #include <linux/inet.h>
74 #include <linux/ipv6.h>
75 #include <linux/stddef.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
78 #include <linux/inetdevice.h>
79 #include <linux/btf_ids.h>
80 
81 #include <crypto/hash.h>
82 #include <linux/scatterlist.h>
83 
84 #include <trace/events/tcp.h>
85 
86 #ifdef CONFIG_TCP_MD5SIG
87 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
88 			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
89 #endif
90 
91 struct inet_hashinfo tcp_hashinfo;
92 EXPORT_SYMBOL(tcp_hashinfo);
93 
94 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
95 {
96 	return secure_tcp_seq(ip_hdr(skb)->daddr,
97 			      ip_hdr(skb)->saddr,
98 			      tcp_hdr(skb)->dest,
99 			      tcp_hdr(skb)->source);
100 }
101 
102 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
103 {
104 	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
105 }
106 
107 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
108 {
109 	const struct inet_timewait_sock *tw = inet_twsk(sktw);
110 	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
111 	struct tcp_sock *tp = tcp_sk(sk);
112 	int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;
113 
114 	if (reuse == 2) {
115 		/* Still does not detect *everything* that goes through
116 		 * lo, since we require a loopback src or dst address
117 		 * or direct binding to 'lo' interface.
118 		 */
119 		bool loopback = false;
120 		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
121 			loopback = true;
122 #if IS_ENABLED(CONFIG_IPV6)
123 		if (tw->tw_family == AF_INET6) {
124 			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
125 			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
126 			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
127 			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
128 				loopback = true;
129 		} else
130 #endif
131 		{
132 			if (ipv4_is_loopback(tw->tw_daddr) ||
133 			    ipv4_is_loopback(tw->tw_rcv_saddr))
134 				loopback = true;
135 		}
136 		if (!loopback)
137 			reuse = 0;
138 	}
139 
140 	/* With PAWS, it is safe from the viewpoint
141 	   of data integrity. Even without PAWS it is safe provided sequence
142 	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.
143 
144 	   Actually, the idea is close to VJ's one, only timestamp cache is
145 	   held not per host, but per port pair and TW bucket is used as state
146 	   holder.
147 
148 	   If TW bucket has been already destroyed we fall back to VJ's scheme
149 	   and use initial timestamp retrieved from peer table.
150 	 */
151 	if (tcptw->tw_ts_recent_stamp &&
152 	    (!twp || (reuse && time_after32(ktime_get_seconds(),
153 					    tcptw->tw_ts_recent_stamp)))) {
154 		/* In case of repair and re-using TIME-WAIT sockets we still
155 		 * want to be sure that it is safe as above but honor the
156 		 * sequence numbers and time stamps set as part of the repair
157 		 * process.
158 		 *
159 		 * Without this check re-using a TIME-WAIT socket with TCP
160 		 * repair would accumulate a -1 on the repair assigned
161 		 * sequence number. The first time it is reused the sequence
162 		 * is -1, the second time -2, etc. This fixes that issue
163 		 * without appearing to create any others.
164 		 */
165 		if (likely(!tp->repair)) {
166 			u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
167 
168 			if (!seq)
169 				seq = 1;
170 			WRITE_ONCE(tp->write_seq, seq);
171 			tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
172 			tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
173 		}
174 		sock_hold(sktw);
175 		return 1;
176 	}
177 
178 	return 0;
179 }
180 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
181 
182 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
183 			      int addr_len)
184 {
185 	/* This check is replicated from tcp_v4_connect() and intended to
186 	 * prevent BPF program called below from accessing bytes that are out
187 	 * of the bound specified by user in addr_len.
188 	 */
189 	if (addr_len < sizeof(struct sockaddr_in))
190 		return -EINVAL;
191 
192 	sock_owned_by_me(sk);
193 
194 	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
195 }
196 
197 /* This will initiate an outgoing connection. */
198 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
199 {
200 	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
201 	struct inet_sock *inet = inet_sk(sk);
202 	struct tcp_sock *tp = tcp_sk(sk);
203 	__be16 orig_sport, orig_dport;
204 	__be32 daddr, nexthop;
205 	struct flowi4 *fl4;
206 	struct rtable *rt;
207 	int err;
208 	struct ip_options_rcu *inet_opt;
209 	struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
210 
211 	if (addr_len < sizeof(struct sockaddr_in))
212 		return -EINVAL;
213 
214 	if (usin->sin_family != AF_INET)
215 		return -EAFNOSUPPORT;
216 
217 	nexthop = daddr = usin->sin_addr.s_addr;
218 	inet_opt = rcu_dereference_protected(inet->inet_opt,
219 					     lockdep_sock_is_held(sk));
220 	if (inet_opt && inet_opt->opt.srr) {
221 		if (!daddr)
222 			return -EINVAL;
223 		nexthop = inet_opt->opt.faddr;
224 	}
225 
226 	orig_sport = inet->inet_sport;
227 	orig_dport = usin->sin_port;
228 	fl4 = &inet->cork.fl.u.ip4;
229 	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
230 			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
231 			      IPPROTO_TCP,
232 			      orig_sport, orig_dport, sk);
233 	if (IS_ERR(rt)) {
234 		err = PTR_ERR(rt);
235 		if (err == -ENETUNREACH)
236 			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
237 		return err;
238 	}
239 
240 	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
241 		ip_rt_put(rt);
242 		return -ENETUNREACH;
243 	}
244 
245 	if (!inet_opt || !inet_opt->opt.srr)
246 		daddr = fl4->daddr;
247 
248 	if (!inet->inet_saddr)
249 		inet->inet_saddr = fl4->saddr;
250 	sk_rcv_saddr_set(sk, inet->inet_saddr);
251 
252 	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
253 		/* Reset inherited state */
254 		tp->rx_opt.ts_recent	   = 0;
255 		tp->rx_opt.ts_recent_stamp = 0;
256 		if (likely(!tp->repair))
257 			WRITE_ONCE(tp->write_seq, 0);
258 	}
259 
260 	inet->inet_dport = usin->sin_port;
261 	sk_daddr_set(sk, daddr);
262 
263 	inet_csk(sk)->icsk_ext_hdr_len = 0;
264 	if (inet_opt)
265 		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
266 
267 	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
268 
269 	/* Socket identity is still unknown (sport may be zero).
270 	 * However we set state to SYN-SENT and not releasing socket
271 	 * lock select source port, enter ourselves into the hash tables and
272 	 * complete initialization after this.
273 	 */
274 	tcp_set_state(sk, TCP_SYN_SENT);
275 	err = inet_hash_connect(tcp_death_row, sk);
276 	if (err)
277 		goto failure;
278 
279 	sk_set_txhash(sk);
280 
281 	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
282 			       inet->inet_sport, inet->inet_dport, sk);
283 	if (IS_ERR(rt)) {
284 		err = PTR_ERR(rt);
285 		rt = NULL;
286 		goto failure;
287 	}
288 	/* OK, now commit destination to socket.  */
289 	sk->sk_gso_type = SKB_GSO_TCPV4;
290 	sk_setup_caps(sk, &rt->dst);
291 	rt = NULL;
292 
293 	if (likely(!tp->repair)) {
294 		if (!tp->write_seq)
295 			WRITE_ONCE(tp->write_seq,
296 				   secure_tcp_seq(inet->inet_saddr,
297 						  inet->inet_daddr,
298 						  inet->inet_sport,
299 						  usin->sin_port));
300 		tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
301 						 inet->inet_saddr,
302 						 inet->inet_daddr);
303 	}
304 
305 	inet->inet_id = prandom_u32();
306 
307 	if (tcp_fastopen_defer_connect(sk, &err))
308 		return err;
309 	if (err)
310 		goto failure;
311 
312 	err = tcp_connect(sk);
313 
314 	if (err)
315 		goto failure;
316 
317 	return 0;
318 
319 failure:
320 	/*
321 	 * This unhashes the socket and releases the local port,
322 	 * if necessary.
323 	 */
324 	tcp_set_state(sk, TCP_CLOSE);
325 	ip_rt_put(rt);
326 	sk->sk_route_caps = 0;
327 	inet->inet_dport = 0;
328 	return err;
329 }
330 EXPORT_SYMBOL(tcp_v4_connect);
331 
332 /*
333  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
334  * It can be called through tcp_release_cb() if socket was owned by user
335  * at the time tcp_v4_err() was called to handle ICMP message.
336  */
337 void tcp_v4_mtu_reduced(struct sock *sk)
338 {
339 	struct inet_sock *inet = inet_sk(sk);
340 	struct dst_entry *dst;
341 	u32 mtu;
342 
343 	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
344 		return;
345 	mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
346 	dst = inet_csk_update_pmtu(sk, mtu);
347 	if (!dst)
348 		return;
349 
350 	/* Something is about to be wrong... Remember soft error
351 	 * for the case, if this connection will not able to recover.
352 	 */
353 	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
354 		sk->sk_err_soft = EMSGSIZE;
355 
356 	mtu = dst_mtu(dst);
357 
358 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
359 	    ip_sk_accept_pmtu(sk) &&
360 	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
361 		tcp_sync_mss(sk, mtu);
362 
363 		/* Resend the TCP packet because it's
364 		 * clear that the old packet has been
365 		 * dropped. This is the new "fast" path mtu
366 		 * discovery.
367 		 */
368 		tcp_simple_retransmit(sk);
369 	} /* else let the usual retransmit timer handle it */
370 }
371 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
372 
373 static void do_redirect(struct sk_buff *skb, struct sock *sk)
374 {
375 	struct dst_entry *dst = __sk_dst_check(sk, 0);
376 
377 	if (dst)
378 		dst->ops->redirect(dst, sk, skb);
379 }
380 
381 
382 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
383 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
384 {
385 	struct request_sock *req = inet_reqsk(sk);
386 	struct net *net = sock_net(sk);
387 
388 	/* ICMPs are not backlogged, hence we cannot get
389 	 * an established socket here.
390 	 */
391 	if (seq != tcp_rsk(req)->snt_isn) {
392 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
393 	} else if (abort) {
394 		/*
395 		 * Still in SYN_RECV, just remove it silently.
396 		 * There is no good way to pass the error to the newly
397 		 * created socket, and POSIX does not want network
398 		 * errors returned from accept().
399 		 */
400 		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
401 		tcp_listendrop(req->rsk_listener);
402 	}
403 	reqsk_put(req);
404 }
405 EXPORT_SYMBOL(tcp_req_err);
406 
407 /* TCP-LD (RFC 6069) logic */
408 void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
409 {
410 	struct inet_connection_sock *icsk = inet_csk(sk);
411 	struct tcp_sock *tp = tcp_sk(sk);
412 	struct sk_buff *skb;
413 	s32 remaining;
414 	u32 delta_us;
415 
416 	if (sock_owned_by_user(sk))
417 		return;
418 
419 	if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
420 	    !icsk->icsk_backoff)
421 		return;
422 
423 	skb = tcp_rtx_queue_head(sk);
424 	if (WARN_ON_ONCE(!skb))
425 		return;
426 
427 	icsk->icsk_backoff--;
428 	icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
429 	icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
430 
431 	tcp_mstamp_refresh(tp);
432 	delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
433 	remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
434 
435 	if (remaining > 0) {
436 		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
437 					  remaining, TCP_RTO_MAX);
438 	} else {
439 		/* RTO revert clocked out retransmission.
440 		 * Will retransmit now.
441 		 */
442 		tcp_retransmit_timer(sk);
443 	}
444 }
445 EXPORT_SYMBOL(tcp_ld_RTO_revert);
446 
447 /*
448  * This routine is called by the ICMP module when it gets some
449  * sort of error condition.  If err < 0 then the socket should
450  * be closed and the error returned to the user.  If err > 0
451  * it's just the icmp type << 8 | icmp code.  After adjustment
452  * header points to the first 8 bytes of the tcp header.  We need
453  * to find the appropriate port.
454  *
455  * The locking strategy used here is very "optimistic". When
456  * someone else accesses the socket the ICMP is just dropped
457  * and for some paths there is no check at all.
458  * A more general error queue to queue errors for later handling
459  * is probably better.
460  *
461  */
462 
463 int tcp_v4_err(struct sk_buff *skb, u32 info)
464 {
465 	const struct iphdr *iph = (const struct iphdr *)skb->data;
466 	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
467 	struct tcp_sock *tp;
468 	struct inet_sock *inet;
469 	const int type = icmp_hdr(skb)->type;
470 	const int code = icmp_hdr(skb)->code;
471 	struct sock *sk;
472 	struct request_sock *fastopen;
473 	u32 seq, snd_una;
474 	int err;
475 	struct net *net = dev_net(skb->dev);
476 
477 	sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
478 				       th->dest, iph->saddr, ntohs(th->source),
479 				       inet_iif(skb), 0);
480 	if (!sk) {
481 		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
482 		return -ENOENT;
483 	}
484 	if (sk->sk_state == TCP_TIME_WAIT) {
485 		inet_twsk_put(inet_twsk(sk));
486 		return 0;
487 	}
488 	seq = ntohl(th->seq);
489 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
490 		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
491 				     type == ICMP_TIME_EXCEEDED ||
492 				     (type == ICMP_DEST_UNREACH &&
493 				      (code == ICMP_NET_UNREACH ||
494 				       code == ICMP_HOST_UNREACH)));
495 		return 0;
496 	}
497 
498 	bh_lock_sock(sk);
499 	/* If too many ICMPs get dropped on busy
500 	 * servers this needs to be solved differently.
501 	 * We do take care of PMTU discovery (RFC1191) special case :
502 	 * we can receive locally generated ICMP messages while socket is held.
503 	 */
504 	if (sock_owned_by_user(sk)) {
505 		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
506 			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
507 	}
508 	if (sk->sk_state == TCP_CLOSE)
509 		goto out;
510 
511 	if (static_branch_unlikely(&ip4_min_ttl)) {
512 		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
513 		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
514 			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
515 			goto out;
516 		}
517 	}
518 
519 	tp = tcp_sk(sk);
520 	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
521 	fastopen = rcu_dereference(tp->fastopen_rsk);
522 	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
523 	if (sk->sk_state != TCP_LISTEN &&
524 	    !between(seq, snd_una, tp->snd_nxt)) {
525 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
526 		goto out;
527 	}
528 
529 	switch (type) {
530 	case ICMP_REDIRECT:
531 		if (!sock_owned_by_user(sk))
532 			do_redirect(skb, sk);
533 		goto out;
534 	case ICMP_SOURCE_QUENCH:
535 		/* Just silently ignore these. */
536 		goto out;
537 	case ICMP_PARAMETERPROB:
538 		err = EPROTO;
539 		break;
540 	case ICMP_DEST_UNREACH:
541 		if (code > NR_ICMP_UNREACH)
542 			goto out;
543 
544 		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
545 			/* We are not interested in TCP_LISTEN and open_requests
546 			 * (SYN-ACKs send out by Linux are always <576bytes so
547 			 * they should go through unfragmented).
548 			 */
549 			if (sk->sk_state == TCP_LISTEN)
550 				goto out;
551 
552 			WRITE_ONCE(tp->mtu_info, info);
553 			if (!sock_owned_by_user(sk)) {
554 				tcp_v4_mtu_reduced(sk);
555 			} else {
556 				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
557 					sock_hold(sk);
558 			}
559 			goto out;
560 		}
561 
562 		err = icmp_err_convert[code].errno;
563 		/* check if this ICMP message allows revert of backoff.
564 		 * (see RFC 6069)
565 		 */
566 		if (!fastopen &&
567 		    (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
568 			tcp_ld_RTO_revert(sk, seq);
569 		break;
570 	case ICMP_TIME_EXCEEDED:
571 		err = EHOSTUNREACH;
572 		break;
573 	default:
574 		goto out;
575 	}
576 
577 	switch (sk->sk_state) {
578 	case TCP_SYN_SENT:
579 	case TCP_SYN_RECV:
580 		/* Only in fast or simultaneous open. If a fast open socket is
581 		 * already accepted it is treated as a connected one below.
582 		 */
583 		if (fastopen && !fastopen->sk)
584 			break;
585 
586 		ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
587 
588 		if (!sock_owned_by_user(sk)) {
589 			sk->sk_err = err;
590 
591 			sk_error_report(sk);
592 
593 			tcp_done(sk);
594 		} else {
595 			sk->sk_err_soft = err;
596 		}
597 		goto out;
598 	}
599 
600 	/* If we've already connected we will keep trying
601 	 * until we time out, or the user gives up.
602 	 *
603 	 * rfc1122 4.2.3.9 allows to consider as hard errors
604 	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
605 	 * but it is obsoleted by pmtu discovery).
606 	 *
607 	 * Note, that in modern internet, where routing is unreliable
608 	 * and in each dark corner broken firewalls sit, sending random
609 	 * errors ordered by their masters even this two messages finally lose
610 	 * their original sense (even Linux sends invalid PORT_UNREACHs)
611 	 *
612 	 * Now we are in compliance with RFCs.
613 	 *							--ANK (980905)
614 	 */
615 
616 	inet = inet_sk(sk);
617 	if (!sock_owned_by_user(sk) && inet->recverr) {
618 		sk->sk_err = err;
619 		sk_error_report(sk);
620 	} else	{ /* Only an error on timeout */
621 		sk->sk_err_soft = err;
622 	}
623 
624 out:
625 	bh_unlock_sock(sk);
626 	sock_put(sk);
627 	return 0;
628 }
629 
630 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
631 {
632 	struct tcphdr *th = tcp_hdr(skb);
633 
634 	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
635 	skb->csum_start = skb_transport_header(skb) - skb->head;
636 	skb->csum_offset = offsetof(struct tcphdr, check);
637 }
638 
639 /* This routine computes an IPv4 TCP checksum. */
640 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
641 {
642 	const struct inet_sock *inet = inet_sk(sk);
643 
644 	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
645 }
646 EXPORT_SYMBOL(tcp_v4_send_check);
647 
648 /*
649  *	This routine will send an RST to the other tcp.
650  *
651  *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
652  *		      for reset.
653  *	Answer: if a packet caused RST, it is not for a socket
654  *		existing in our system, if it is matched to a socket,
655  *		it is just duplicate segment or bug in other side's TCP.
656  *		So that we build reply only basing on parameters
657  *		arrived with segment.
658  *	Exception: precedence violation. We do not implement it in any case.
659  */
660 
661 #ifdef CONFIG_TCP_MD5SIG
662 #define OPTION_BYTES TCPOLEN_MD5SIG_ALIGNED
663 #else
664 #define OPTION_BYTES sizeof(__be32)
665 #endif
666 
667 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
668 {
669 	const struct tcphdr *th = tcp_hdr(skb);
670 	struct {
671 		struct tcphdr th;
672 		__be32 opt[OPTION_BYTES / sizeof(__be32)];
673 	} rep;
674 	struct ip_reply_arg arg;
675 #ifdef CONFIG_TCP_MD5SIG
676 	struct tcp_md5sig_key *key = NULL;
677 	const __u8 *hash_location = NULL;
678 	unsigned char newhash[16];
679 	int genhash;
680 	struct sock *sk1 = NULL;
681 #endif
682 	u64 transmit_time = 0;
683 	struct sock *ctl_sk;
684 	struct net *net;
685 
686 	/* Never send a reset in response to a reset. */
687 	if (th->rst)
688 		return;
689 
690 	/* If sk not NULL, it means we did a successful lookup and incoming
691 	 * route had to be correct. prequeue might have dropped our dst.
692 	 */
693 	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
694 		return;
695 
696 	/* Swap the send and the receive. */
697 	memset(&rep, 0, sizeof(rep));
698 	rep.th.dest   = th->source;
699 	rep.th.source = th->dest;
700 	rep.th.doff   = sizeof(struct tcphdr) / 4;
701 	rep.th.rst    = 1;
702 
703 	if (th->ack) {
704 		rep.th.seq = th->ack_seq;
705 	} else {
706 		rep.th.ack = 1;
707 		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
708 				       skb->len - (th->doff << 2));
709 	}
710 
711 	memset(&arg, 0, sizeof(arg));
712 	arg.iov[0].iov_base = (unsigned char *)&rep;
713 	arg.iov[0].iov_len  = sizeof(rep.th);
714 
715 	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
716 #ifdef CONFIG_TCP_MD5SIG
717 	rcu_read_lock();
718 	hash_location = tcp_parse_md5sig_option(th);
719 	if (sk && sk_fullsock(sk)) {
720 		const union tcp_md5_addr *addr;
721 		int l3index;
722 
723 		/* sdif set, means packet ingressed via a device
724 		 * in an L3 domain and inet_iif is set to it.
725 		 */
726 		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
727 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
728 		key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
729 	} else if (hash_location) {
730 		const union tcp_md5_addr *addr;
731 		int sdif = tcp_v4_sdif(skb);
732 		int dif = inet_iif(skb);
733 		int l3index;
734 
735 		/*
736 		 * active side is lost. Try to find listening socket through
737 		 * source port, and then find md5 key through listening socket.
738 		 * we are not loose security here:
739 		 * Incoming packet is checked with md5 hash with finding key,
740 		 * no RST generated if md5 hash doesn't match.
741 		 */
742 		sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
743 					     ip_hdr(skb)->saddr,
744 					     th->source, ip_hdr(skb)->daddr,
745 					     ntohs(th->source), dif, sdif);
746 		/* don't send rst if it can't find key */
747 		if (!sk1)
748 			goto out;
749 
750 		/* sdif set, means packet ingressed via a device
751 		 * in an L3 domain and dif is set to it.
752 		 */
753 		l3index = sdif ? dif : 0;
754 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
755 		key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
756 		if (!key)
757 			goto out;
758 
759 
760 		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
761 		if (genhash || memcmp(hash_location, newhash, 16) != 0)
762 			goto out;
763 
764 	}
765 
766 	if (key) {
767 		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
768 				   (TCPOPT_NOP << 16) |
769 				   (TCPOPT_MD5SIG << 8) |
770 				   TCPOLEN_MD5SIG);
771 		/* Update length and the length the header thinks exists */
772 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
773 		rep.th.doff = arg.iov[0].iov_len / 4;
774 
775 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
776 				     key, ip_hdr(skb)->saddr,
777 				     ip_hdr(skb)->daddr, &rep.th);
778 	}
779 #endif
780 	/* Can't co-exist with TCPMD5, hence check rep.opt[0] */
781 	if (rep.opt[0] == 0) {
782 		__be32 mrst = mptcp_reset_option(skb);
783 
784 		if (mrst) {
785 			rep.opt[0] = mrst;
786 			arg.iov[0].iov_len += sizeof(mrst);
787 			rep.th.doff = arg.iov[0].iov_len / 4;
788 		}
789 	}
790 
791 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
792 				      ip_hdr(skb)->saddr, /* XXX */
793 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
794 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
795 	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
796 
797 	/* When socket is gone, all binding information is lost.
798 	 * routing might fail in this case. No choice here, if we choose to force
799 	 * input interface, we will misroute in case of asymmetric route.
800 	 */
801 	if (sk) {
802 		arg.bound_dev_if = sk->sk_bound_dev_if;
803 		if (sk_fullsock(sk))
804 			trace_tcp_send_reset(sk, skb);
805 	}
806 
807 	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
808 		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));
809 
810 	arg.tos = ip_hdr(skb)->tos;
811 	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
812 	local_bh_disable();
813 	ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
814 	if (sk) {
815 		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
816 				   inet_twsk(sk)->tw_mark : sk->sk_mark;
817 		ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
818 				   inet_twsk(sk)->tw_priority : sk->sk_priority;
819 		transmit_time = tcp_transmit_time(sk);
820 	}
821 	ip_send_unicast_reply(ctl_sk,
822 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
823 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
824 			      &arg, arg.iov[0].iov_len,
825 			      transmit_time);
826 
827 	ctl_sk->sk_mark = 0;
828 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
829 	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
830 	local_bh_enable();
831 
832 #ifdef CONFIG_TCP_MD5SIG
833 out:
834 	rcu_read_unlock();
835 #endif
836 }
837 
838 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
839    outside socket context is ugly, certainly. What can I do?
840  */
841 
842 static void tcp_v4_send_ack(const struct sock *sk,
843 			    struct sk_buff *skb, u32 seq, u32 ack,
844 			    u32 win, u32 tsval, u32 tsecr, int oif,
845 			    struct tcp_md5sig_key *key,
846 			    int reply_flags, u8 tos)
847 {
848 	const struct tcphdr *th = tcp_hdr(skb);
849 	struct {
850 		struct tcphdr th;
851 		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
852 #ifdef CONFIG_TCP_MD5SIG
853 			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
854 #endif
855 			];
856 	} rep;
857 	struct net *net = sock_net(sk);
858 	struct ip_reply_arg arg;
859 	struct sock *ctl_sk;
860 	u64 transmit_time;
861 
862 	memset(&rep.th, 0, sizeof(struct tcphdr));
863 	memset(&arg, 0, sizeof(arg));
864 
865 	arg.iov[0].iov_base = (unsigned char *)&rep;
866 	arg.iov[0].iov_len  = sizeof(rep.th);
867 	if (tsecr) {
868 		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
869 				   (TCPOPT_TIMESTAMP << 8) |
870 				   TCPOLEN_TIMESTAMP);
871 		rep.opt[1] = htonl(tsval);
872 		rep.opt[2] = htonl(tsecr);
873 		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
874 	}
875 
876 	/* Swap the send and the receive. */
877 	rep.th.dest    = th->source;
878 	rep.th.source  = th->dest;
879 	rep.th.doff    = arg.iov[0].iov_len / 4;
880 	rep.th.seq     = htonl(seq);
881 	rep.th.ack_seq = htonl(ack);
882 	rep.th.ack     = 1;
883 	rep.th.window  = htons(win);
884 
885 #ifdef CONFIG_TCP_MD5SIG
886 	if (key) {
887 		int offset = (tsecr) ? 3 : 0;
888 
889 		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
890 					  (TCPOPT_NOP << 16) |
891 					  (TCPOPT_MD5SIG << 8) |
892 					  TCPOLEN_MD5SIG);
893 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
894 		rep.th.doff = arg.iov[0].iov_len/4;
895 
896 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
897 				    key, ip_hdr(skb)->saddr,
898 				    ip_hdr(skb)->daddr, &rep.th);
899 	}
900 #endif
901 	arg.flags = reply_flags;
902 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
903 				      ip_hdr(skb)->saddr, /* XXX */
904 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
905 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
906 	if (oif)
907 		arg.bound_dev_if = oif;
908 	arg.tos = tos;
909 	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
910 	local_bh_disable();
911 	ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
912 	ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
913 			   inet_twsk(sk)->tw_mark : sk->sk_mark;
914 	ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
915 			   inet_twsk(sk)->tw_priority : sk->sk_priority;
916 	transmit_time = tcp_transmit_time(sk);
917 	ip_send_unicast_reply(ctl_sk,
918 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
919 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
920 			      &arg, arg.iov[0].iov_len,
921 			      transmit_time);
922 
923 	ctl_sk->sk_mark = 0;
924 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
925 	local_bh_enable();
926 }
927 
928 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
929 {
930 	struct inet_timewait_sock *tw = inet_twsk(sk);
931 	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
932 
933 	tcp_v4_send_ack(sk, skb,
934 			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
935 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
936 			tcp_time_stamp_raw() + tcptw->tw_ts_offset,
937 			tcptw->tw_ts_recent,
938 			tw->tw_bound_dev_if,
939 			tcp_twsk_md5_key(tcptw),
940 			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
941 			tw->tw_tos
942 			);
943 
944 	inet_twsk_put(tw);
945 }
946 
947 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
948 				  struct request_sock *req)
949 {
950 	const union tcp_md5_addr *addr;
951 	int l3index;
952 
953 	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
954 	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
955 	 */
956 	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
957 					     tcp_sk(sk)->snd_nxt;
958 
959 	/* RFC 7323 2.3
960 	 * The window field (SEG.WND) of every outgoing segment, with the
961 	 * exception of <SYN> segments, MUST be right-shifted by
962 	 * Rcv.Wind.Shift bits:
963 	 */
964 	addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
965 	l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
966 	tcp_v4_send_ack(sk, skb, seq,
967 			tcp_rsk(req)->rcv_nxt,
968 			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
969 			tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
970 			req->ts_recent,
971 			0,
972 			tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
973 			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
974 			ip_hdr(skb)->tos);
975 }
976 
977 /*
978  *	Send a SYN-ACK after having received a SYN.
979  *	This still operates on a request_sock only, not on a big
980  *	socket.
981  */
982 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
983 			      struct flowi *fl,
984 			      struct request_sock *req,
985 			      struct tcp_fastopen_cookie *foc,
986 			      enum tcp_synack_type synack_type,
987 			      struct sk_buff *syn_skb)
988 {
989 	const struct inet_request_sock *ireq = inet_rsk(req);
990 	struct flowi4 fl4;
991 	int err = -1;
992 	struct sk_buff *skb;
993 	u8 tos;
994 
995 	/* First, grab a route. */
996 	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
997 		return -1;
998 
999 	skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
1000 
1001 	if (skb) {
1002 		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1003 
1004 		tos = sock_net(sk)->ipv4.sysctl_tcp_reflect_tos ?
1005 				(tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1006 				(inet_sk(sk)->tos & INET_ECN_MASK) :
1007 				inet_sk(sk)->tos;
1008 
1009 		if (!INET_ECN_is_capable(tos) &&
1010 		    tcp_bpf_ca_needs_ecn((struct sock *)req))
1011 			tos |= INET_ECN_ECT_0;
1012 
1013 		rcu_read_lock();
1014 		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1015 					    ireq->ir_rmt_addr,
1016 					    rcu_dereference(ireq->ireq_opt),
1017 					    tos);
1018 		rcu_read_unlock();
1019 		err = net_xmit_eval(err);
1020 	}
1021 
1022 	return err;
1023 }
1024 
1025 /*
1026  *	IPv4 request_sock destructor.
1027  */
1028 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1029 {
1030 	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1031 }
1032 
1033 #ifdef CONFIG_TCP_MD5SIG
1034 /*
1035  * RFC2385 MD5 checksumming requires a mapping of
1036  * IP address->MD5 Key.
1037  * We need to maintain these in the sk structure.
1038  */
1039 
/* Static key, initially false, exported for use outside this file.
 * NOTE(review): presumably enabled once an MD5 key has been configured so
 * that lookups can be skipped otherwise - confirm against its users.
 */
DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
EXPORT_SYMBOL(tcp_md5_needed);
1042 
1043 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1044 {
1045 	if (!old)
1046 		return true;
1047 
1048 	/* l3index always overrides non-l3index */
1049 	if (old->l3index && new->l3index == 0)
1050 		return false;
1051 	if (old->l3index == 0 && new->l3index)
1052 		return true;
1053 
1054 	return old->prefixlen < new->prefixlen;
1055 }
1056 
1057 /* Find the Key structure for an address.  */
1058 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1059 					   const union tcp_md5_addr *addr,
1060 					   int family)
1061 {
1062 	const struct tcp_sock *tp = tcp_sk(sk);
1063 	struct tcp_md5sig_key *key;
1064 	const struct tcp_md5sig_info *md5sig;
1065 	__be32 mask;
1066 	struct tcp_md5sig_key *best_match = NULL;
1067 	bool match;
1068 
1069 	/* caller either holds rcu_read_lock() or socket lock */
1070 	md5sig = rcu_dereference_check(tp->md5sig_info,
1071 				       lockdep_sock_is_held(sk));
1072 	if (!md5sig)
1073 		return NULL;
1074 
1075 	hlist_for_each_entry_rcu(key, &md5sig->head, node,
1076 				 lockdep_sock_is_held(sk)) {
1077 		if (key->family != family)
1078 			continue;
1079 		if (key->flags & TCP_MD5SIG_FLAG_IFINDEX && key->l3index != l3index)
1080 			continue;
1081 		if (family == AF_INET) {
1082 			mask = inet_make_mask(key->prefixlen);
1083 			match = (key->addr.a4.s_addr & mask) ==
1084 				(addr->a4.s_addr & mask);
1085 #if IS_ENABLED(CONFIG_IPV6)
1086 		} else if (family == AF_INET6) {
1087 			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1088 						  key->prefixlen);
1089 #endif
1090 		} else {
1091 			match = false;
1092 		}
1093 
1094 		if (match && better_md5_match(best_match, key))
1095 			best_match = key;
1096 	}
1097 	return best_match;
1098 }
1099 EXPORT_SYMBOL(__tcp_md5_do_lookup);
1100 
1101 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1102 						      const union tcp_md5_addr *addr,
1103 						      int family, u8 prefixlen,
1104 						      int l3index, u8 flags)
1105 {
1106 	const struct tcp_sock *tp = tcp_sk(sk);
1107 	struct tcp_md5sig_key *key;
1108 	unsigned int size = sizeof(struct in_addr);
1109 	const struct tcp_md5sig_info *md5sig;
1110 
1111 	/* caller either holds rcu_read_lock() or socket lock */
1112 	md5sig = rcu_dereference_check(tp->md5sig_info,
1113 				       lockdep_sock_is_held(sk));
1114 	if (!md5sig)
1115 		return NULL;
1116 #if IS_ENABLED(CONFIG_IPV6)
1117 	if (family == AF_INET6)
1118 		size = sizeof(struct in6_addr);
1119 #endif
1120 	hlist_for_each_entry_rcu(key, &md5sig->head, node,
1121 				 lockdep_sock_is_held(sk)) {
1122 		if (key->family != family)
1123 			continue;
1124 		if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
1125 			continue;
1126 		if (key->l3index != l3index)
1127 			continue;
1128 		if (!memcmp(&key->addr, addr, size) &&
1129 		    key->prefixlen == prefixlen)
1130 			return key;
1131 	}
1132 	return NULL;
1133 }
1134 
1135 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1136 					 const struct sock *addr_sk)
1137 {
1138 	const union tcp_md5_addr *addr;
1139 	int l3index;
1140 
1141 	l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1142 						 addr_sk->sk_bound_dev_if);
1143 	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1144 	return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1145 }
1146 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1147 
/* Add an MD5 key to @sk, or update the existing key that matches exactly.
 *
 * This can be called on a newly created socket, from other files.
 *
 * An exact match (see tcp_md5_do_lookup_exact()) is updated in place.
 * Otherwise a new key is allocated and linked at the head of the socket's
 * key list; the list container itself is allocated on first use, which
 * also disables GSO on the socket.
 *
 * Returns 0 on success or -ENOMEM if an allocation fails.
 */
int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
		   int family, u8 prefixlen, int l3index, u8 flags,
		   const u8 *newkey, u8 newkeylen, gfp_t gfp)
{
	/* Add Key to the list */
	struct tcp_md5sig_key *key;
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_info *md5sig;

	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
	if (key) {
		/* Pre-existing entry - just update that one.
		 * Note that the key might be used concurrently.
		 * data_race() is telling kcsan that we do not care of
		 * key mismatches, since changing MD5 key on live flows
		 * can lead to packet drops.
		 */
		data_race(memcpy(key->key, newkey, newkeylen));

		/* Pairs with READ_ONCE() in tcp_md5_hash_key().
		 * Also note that a reader could catch new key->keylen value
		 * but old key->key[], this is the reason we use __GFP_ZERO
		 * at sock_kmalloc() time below these lines.
		 */
		WRITE_ONCE(key->keylen, newkeylen);

		return 0;
	}

	md5sig = rcu_dereference_protected(tp->md5sig_info,
					   lockdep_sock_is_held(sk));
	if (!md5sig) {
		/* First key on this socket: create the list container. */
		md5sig = kmalloc(sizeof(*md5sig), gfp);
		if (!md5sig)
			return -ENOMEM;

		sk_gso_disable(sk);
		INIT_HLIST_HEAD(&md5sig->head);
		rcu_assign_pointer(tp->md5sig_info, md5sig);
	}

	/* Zeroed allocation, see the keylen/key[] note above. */
	key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
	if (!key)
		return -ENOMEM;
	if (!tcp_alloc_md5sig_pool()) {
		sock_kfree_s(sk, key, sizeof(*key));
		return -ENOMEM;
	}

	/* Fill in the key completely before publishing it on the list. */
	memcpy(key->key, newkey, newkeylen);
	key->keylen = newkeylen;
	key->family = family;
	key->prefixlen = prefixlen;
	key->l3index = l3index;
	key->flags = flags;
	memcpy(&key->addr, addr,
	       (family == AF_INET6) ? sizeof(struct in6_addr) :
				      sizeof(struct in_addr));
	hlist_add_head_rcu(&key->node, &md5sig->head);
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_add);
1211 
1212 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1213 		   u8 prefixlen, int l3index, u8 flags)
1214 {
1215 	struct tcp_md5sig_key *key;
1216 
1217 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1218 	if (!key)
1219 		return -ENOENT;
1220 	hlist_del_rcu(&key->node);
1221 	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1222 	kfree_rcu(key, rcu);
1223 	return 0;
1224 }
1225 EXPORT_SYMBOL(tcp_md5_do_del);
1226 
1227 static void tcp_clear_md5_list(struct sock *sk)
1228 {
1229 	struct tcp_sock *tp = tcp_sk(sk);
1230 	struct tcp_md5sig_key *key;
1231 	struct hlist_node *n;
1232 	struct tcp_md5sig_info *md5sig;
1233 
1234 	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1235 
1236 	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1237 		hlist_del_rcu(&key->node);
1238 		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1239 		kfree_rcu(key, rcu);
1240 	}
1241 }
1242 
1243 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1244 				 sockptr_t optval, int optlen)
1245 {
1246 	struct tcp_md5sig cmd;
1247 	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1248 	const union tcp_md5_addr *addr;
1249 	u8 prefixlen = 32;
1250 	int l3index = 0;
1251 	u8 flags;
1252 
1253 	if (optlen < sizeof(cmd))
1254 		return -EINVAL;
1255 
1256 	if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1257 		return -EFAULT;
1258 
1259 	if (sin->sin_family != AF_INET)
1260 		return -EINVAL;
1261 
1262 	flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1263 
1264 	if (optname == TCP_MD5SIG_EXT &&
1265 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1266 		prefixlen = cmd.tcpm_prefixlen;
1267 		if (prefixlen > 32)
1268 			return -EINVAL;
1269 	}
1270 
1271 	if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
1272 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1273 		struct net_device *dev;
1274 
1275 		rcu_read_lock();
1276 		dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1277 		if (dev && netif_is_l3_master(dev))
1278 			l3index = dev->ifindex;
1279 
1280 		rcu_read_unlock();
1281 
1282 		/* ok to reference set/not set outside of rcu;
1283 		 * right now device MUST be an L3 master
1284 		 */
1285 		if (!dev || !l3index)
1286 			return -EINVAL;
1287 	}
1288 
1289 	addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1290 
1291 	if (!cmd.tcpm_keylen)
1292 		return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);
1293 
1294 	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1295 		return -EINVAL;
1296 
1297 	return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
1298 			      cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
1299 }
1300 
1301 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1302 				   __be32 daddr, __be32 saddr,
1303 				   const struct tcphdr *th, int nbytes)
1304 {
1305 	struct tcp4_pseudohdr *bp;
1306 	struct scatterlist sg;
1307 	struct tcphdr *_th;
1308 
1309 	bp = hp->scratch;
1310 	bp->saddr = saddr;
1311 	bp->daddr = daddr;
1312 	bp->pad = 0;
1313 	bp->protocol = IPPROTO_TCP;
1314 	bp->len = cpu_to_be16(nbytes);
1315 
1316 	_th = (struct tcphdr *)(bp + 1);
1317 	memcpy(_th, th, sizeof(*th));
1318 	_th->check = 0;
1319 
1320 	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1321 	ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1322 				sizeof(*bp) + sizeof(*th));
1323 	return crypto_ahash_update(hp->md5_req);
1324 }
1325 
1326 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1327 			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
1328 {
1329 	struct tcp_md5sig_pool *hp;
1330 	struct ahash_request *req;
1331 
1332 	hp = tcp_get_md5sig_pool();
1333 	if (!hp)
1334 		goto clear_hash_noput;
1335 	req = hp->md5_req;
1336 
1337 	if (crypto_ahash_init(req))
1338 		goto clear_hash;
1339 	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1340 		goto clear_hash;
1341 	if (tcp_md5_hash_key(hp, key))
1342 		goto clear_hash;
1343 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1344 	if (crypto_ahash_final(req))
1345 		goto clear_hash;
1346 
1347 	tcp_put_md5sig_pool();
1348 	return 0;
1349 
1350 clear_hash:
1351 	tcp_put_md5sig_pool();
1352 clear_hash_noput:
1353 	memset(md5_hash, 0, 16);
1354 	return 1;
1355 }
1356 
1357 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1358 			const struct sock *sk,
1359 			const struct sk_buff *skb)
1360 {
1361 	struct tcp_md5sig_pool *hp;
1362 	struct ahash_request *req;
1363 	const struct tcphdr *th = tcp_hdr(skb);
1364 	__be32 saddr, daddr;
1365 
1366 	if (sk) { /* valid for establish/request sockets */
1367 		saddr = sk->sk_rcv_saddr;
1368 		daddr = sk->sk_daddr;
1369 	} else {
1370 		const struct iphdr *iph = ip_hdr(skb);
1371 		saddr = iph->saddr;
1372 		daddr = iph->daddr;
1373 	}
1374 
1375 	hp = tcp_get_md5sig_pool();
1376 	if (!hp)
1377 		goto clear_hash_noput;
1378 	req = hp->md5_req;
1379 
1380 	if (crypto_ahash_init(req))
1381 		goto clear_hash;
1382 
1383 	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1384 		goto clear_hash;
1385 	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1386 		goto clear_hash;
1387 	if (tcp_md5_hash_key(hp, key))
1388 		goto clear_hash;
1389 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1390 	if (crypto_ahash_final(req))
1391 		goto clear_hash;
1392 
1393 	tcp_put_md5sig_pool();
1394 	return 0;
1395 
1396 clear_hash:
1397 	tcp_put_md5sig_pool();
1398 clear_hash_noput:
1399 	memset(md5_hash, 0, 16);
1400 	return 1;
1401 }
1402 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1403 
1404 #endif
1405 
1406 /* Called with rcu_read_lock() */
1407 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1408 				    const struct sk_buff *skb,
1409 				    int dif, int sdif)
1410 {
1411 #ifdef CONFIG_TCP_MD5SIG
1412 	/*
1413 	 * This gets called for each TCP segment that arrives
1414 	 * so we want to be efficient.
1415 	 * We have 3 drop cases:
1416 	 * o No MD5 hash and one expected.
1417 	 * o MD5 hash and we're not expecting one.
1418 	 * o MD5 hash and its wrong.
1419 	 */
1420 	const __u8 *hash_location = NULL;
1421 	struct tcp_md5sig_key *hash_expected;
1422 	const struct iphdr *iph = ip_hdr(skb);
1423 	const struct tcphdr *th = tcp_hdr(skb);
1424 	const union tcp_md5_addr *addr;
1425 	unsigned char newhash[16];
1426 	int genhash, l3index;
1427 
1428 	/* sdif set, means packet ingressed via a device
1429 	 * in an L3 domain and dif is set to the l3mdev
1430 	 */
1431 	l3index = sdif ? dif : 0;
1432 
1433 	addr = (union tcp_md5_addr *)&iph->saddr;
1434 	hash_expected = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1435 	hash_location = tcp_parse_md5sig_option(th);
1436 
1437 	/* We've parsed the options - do we have a hash? */
1438 	if (!hash_expected && !hash_location)
1439 		return false;
1440 
1441 	if (hash_expected && !hash_location) {
1442 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1443 		return true;
1444 	}
1445 
1446 	if (!hash_expected && hash_location) {
1447 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1448 		return true;
1449 	}
1450 
1451 	/* Okay, so this is hash_expected and hash_location -
1452 	 * so we need to calculate the checksum.
1453 	 */
1454 	genhash = tcp_v4_md5_hash_skb(newhash,
1455 				      hash_expected,
1456 				      NULL, skb);
1457 
1458 	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1459 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1460 		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s L3 index %d\n",
1461 				     &iph->saddr, ntohs(th->source),
1462 				     &iph->daddr, ntohs(th->dest),
1463 				     genhash ? " tcp_v4_calc_md5_hash failed"
1464 				     : "", l3index);
1465 		return true;
1466 	}
1467 	return false;
1468 #endif
1469 	return false;
1470 }
1471 
1472 static void tcp_v4_init_req(struct request_sock *req,
1473 			    const struct sock *sk_listener,
1474 			    struct sk_buff *skb)
1475 {
1476 	struct inet_request_sock *ireq = inet_rsk(req);
1477 	struct net *net = sock_net(sk_listener);
1478 
1479 	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1480 	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1481 	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1482 }
1483 
1484 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1485 					  struct sk_buff *skb,
1486 					  struct flowi *fl,
1487 					  struct request_sock *req)
1488 {
1489 	tcp_v4_init_req(req, sk, skb);
1490 
1491 	if (security_inet_conn_request(sk, skb, req))
1492 		return NULL;
1493 
1494 	return inet_csk_route_req(sk, &fl->u.ip4, req);
1495 }
1496 
/* Generic request_sock operations for IPv4 TCP.
 * Passed to tcp_conn_request() and tcp_get_syncookie_mss() in this file,
 * together with tcp_request_sock_ipv4_ops.
 */
struct request_sock_ops tcp_request_sock_ops __read_mostly = {
	.family		=	PF_INET,
	.obj_size	=	sizeof(struct tcp_request_sock),
	.rtx_syn_ack	=	tcp_rtx_synack,
	.send_ack	=	tcp_v4_reqsk_send_ack,
	.destructor	=	tcp_v4_reqsk_destructor,
	.send_reset	=	tcp_v4_send_reset,
	.syn_ack_timeout =	tcp_syn_ack_timeout,
};
1506 
/* IPv4-specific TCP request_sock operations.
 * The MD5 and syncookie hooks are only present when the corresponding
 * config options are enabled.
 */
const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
	.mss_clamp	=	TCP_MSS_DEFAULT,
#ifdef CONFIG_TCP_MD5SIG
	.req_md5_lookup	=	tcp_v4_md5_lookup,
	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
#endif
#ifdef CONFIG_SYN_COOKIES
	.cookie_init_seq =	cookie_v4_init_sequence,
#endif
	.route_req	=	tcp_v4_route_req,
	.init_seq	=	tcp_v4_init_seq,
	.init_ts_off	=	tcp_v4_init_ts_off,
	.send_synack	=	tcp_v4_send_synack,
};
1521 
1522 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1523 {
1524 	/* Never answer to SYNs send to broadcast or multicast */
1525 	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1526 		goto drop;
1527 
1528 	return tcp_conn_request(&tcp_request_sock_ops,
1529 				&tcp_request_sock_ipv4_ops, sk, skb);
1530 
1531 drop:
1532 	tcp_listendrop(sk);
1533 	return 0;
1534 }
1535 EXPORT_SYMBOL(tcp_v4_conn_request);
1536 
1537 
/*
 * The three way handshake has completed - we got a valid synack -
 * now create the new socket.
 *
 * Creates the child socket for @req on listener @sk, fills in its IPv4
 * state, copies a matching MD5 key from the listener and inserts the
 * child into the established hash.
 *
 * @dst may be NULL, in which case a route is looked up here.
 * *@own_req is set to the result of inet_ehash_nolisten().
 *
 * Returns the new socket, or NULL on failure. Every failure path calls
 * tcp_listendrop() on @sk. NULL is also returned, without a listen drop,
 * when a duplicate socket is found and @req_unhash is NULL.
 */
struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct dst_entry *dst,
				  struct request_sock *req_unhash,
				  bool *own_req)
{
	struct inet_request_sock *ireq;
	bool found_dup_sk = false;
	struct inet_sock *newinet;
	struct tcp_sock *newtp;
	struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
	const union tcp_md5_addr *addr;
	struct tcp_md5sig_key *key;
	int l3index;
#endif
	struct ip_options_rcu *inet_opt;

	if (sk_acceptq_is_full(sk))
		goto exit_overflow;

	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit_nonewsk;

	newsk->sk_gso_type = SKB_GSO_TCPV4;
	inet_sk_rx_dst_set(newsk, skb);

	/* Copy addressing and IP-level state from the request and the skb. */
	newtp		      = tcp_sk(newsk);
	newinet		      = inet_sk(newsk);
	ireq		      = inet_rsk(req);
	sk_daddr_set(newsk, ireq->ir_rmt_addr);
	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
	newsk->sk_bound_dev_if = ireq->ir_iif;
	newinet->inet_saddr   = ireq->ir_loc_addr;
	inet_opt	      = rcu_dereference(ireq->ireq_opt);
	/* The options pointer is shared with the request until ownership
	 * is settled after the hash insertion below.
	 */
	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
	newinet->mc_index     = inet_iif(skb);
	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
	newinet->rcv_tos      = ip_hdr(skb)->tos;
	inet_csk(newsk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
	newinet->inet_id = prandom_u32();

	/* Set ToS of the new socket based upon the value of incoming SYN.
	 * ECT bits are set later in tcp_init_transfer().
	 */
	if (sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)
		newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;

	if (!dst) {
		dst = inet_csk_route_child_sock(sk, newsk, req);
		if (!dst)
			goto put_and_exit;
	} else {
		/* syncookie case : see end of cookie_v4_check() */
	}
	sk_setup_caps(newsk, dst);

	tcp_ca_openreq_child(newsk, dst);

	tcp_sync_mss(newsk, dst_mtu(dst));
	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));

	tcp_initialize_rcv_mss(newsk);

#ifdef CONFIG_TCP_MD5SIG
	l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
	/* Copy over the MD5 key from the original socket */
	addr = (union tcp_md5_addr *)&newinet->inet_daddr;
	key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
	if (key) {
		/*
		 * We're using one, so create a matching key
		 * on the newsk structure. If we fail to get
		 * memory, then we end up not copying the key
		 * across. Shucks.
		 */
		tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index, key->flags,
			       key->key, key->keylen, GFP_ATOMIC);
		sk_gso_disable(newsk);
	}
#endif

	if (__inet_inherit_port(sk, newsk) < 0)
		goto put_and_exit;
	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
				       &found_dup_sk);
	if (likely(*own_req)) {
		tcp_move_syn(newtp, req);
		/* The child now owns the IP options. */
		ireq->ireq_opt = NULL;
	} else {
		/* The request keeps the IP options. */
		newinet->inet_opt = NULL;

		if (!req_unhash && found_dup_sk) {
			/* This code path should only be executed in the
			 * syncookie case only
			 */
			bh_unlock_sock(newsk);
			sock_put(newsk);
			newsk = NULL;
		}
	}
	return newsk;

exit_overflow:
	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
exit_nonewsk:
	dst_release(dst);
exit:
	tcp_listendrop(sk);
	return NULL;
put_and_exit:
	/* The options still belong to the request; detach them from the
	 * child before it is torn down.
	 */
	newinet->inet_opt = NULL;
	inet_csk_prepare_forced_close(newsk);
	tcp_done(newsk);
	goto exit;
}
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1662 
/* With CONFIG_SYN_COOKIES, run cookie_v4_check() on segments that do not
 * carry SYN and return its result; otherwise return @sk unchanged.
 */
static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
{
#ifdef CONFIG_SYN_COOKIES
	if (!tcp_hdr(skb)->syn)
		return cookie_v4_check(sk, skb);
#endif
	return sk;
}
1673 
1674 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1675 			 struct tcphdr *th, u32 *cookie)
1676 {
1677 	u16 mss = 0;
1678 #ifdef CONFIG_SYN_COOKIES
1679 	mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1680 				    &tcp_request_sock_ipv4_ops, sk, th);
1681 	if (mss) {
1682 		*cookie = __cookie_v4_init_sequence(iph, th, &mss);
1683 		tcp_synq_overflow(sk);
1684 	}
1685 #endif
1686 	return mss;
1687 }
1688 
INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
							   u32));
/* The socket must have its spinlock held when we get
 * here, unless it is a TCP_LISTEN socket.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 *
 * Processes one segment for @sk. Always returns 0; on the reset, discard
 * and checksum-error paths the skb is freed here.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	struct sock *rsk;

	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
		struct dst_entry *dst;

		dst = rcu_dereference_protected(sk->sk_rx_dst,
						lockdep_sock_is_held(sk));

		sock_rps_save_rxhash(sk, skb);
		sk_mark_napi_id(sk, skb);
		if (dst) {
			/* Drop the cached input route if the packet came in
			 * on another interface or the route check fails.
			 */
			if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
			    !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
					     dst, 0)) {
				RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
				dst_release(dst);
			}
		}
		tcp_rcv_established(sk, skb);
		return 0;
	}

	if (tcp_checksum_complete(skb))
		goto csum_err;

	if (sk->sk_state == TCP_LISTEN) {
		struct sock *nsk = tcp_v4_cookie_check(sk, skb);

		if (!nsk)
			goto discard;
		if (nsk != sk) {
			/* The cookie check produced a different socket:
			 * let it process the segment.
			 */
			if (tcp_child_process(sk, nsk, skb)) {
				rsk = nsk;
				goto reset;
			}
			return 0;
		}
	} else
		sock_rps_save_rxhash(sk, skb);

	if (tcp_rcv_state_process(sk, skb)) {
		rsk = sk;
		goto reset;
	}
	return 0;

reset:
	tcp_v4_send_reset(rsk, skb);
	/* falls through to discard */
discard:
	kfree_skb(skb);
	/* Be careful here. If this function gets more complicated and
	 * gcc suffers from register pressure on the x86, sk (in %ebx)
	 * might be destroyed here. This current version compiles correctly,
	 * but you have been warned.
	 */
	return 0;

csum_err:
	trace_tcp_bad_csum(skb);
	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
	goto discard;
}
EXPORT_SYMBOL(tcp_v4_do_rcv);
1765 
1766 int tcp_v4_early_demux(struct sk_buff *skb)
1767 {
1768 	const struct iphdr *iph;
1769 	const struct tcphdr *th;
1770 	struct sock *sk;
1771 
1772 	if (skb->pkt_type != PACKET_HOST)
1773 		return 0;
1774 
1775 	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1776 		return 0;
1777 
1778 	iph = ip_hdr(skb);
1779 	th = tcp_hdr(skb);
1780 
1781 	if (th->doff < sizeof(struct tcphdr) / 4)
1782 		return 0;
1783 
1784 	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1785 				       iph->saddr, th->source,
1786 				       iph->daddr, ntohs(th->dest),
1787 				       skb->skb_iif, inet_sdif(skb));
1788 	if (sk) {
1789 		skb->sk = sk;
1790 		skb->destructor = sock_edemux;
1791 		if (sk_fullsock(sk)) {
1792 			struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
1793 
1794 			if (dst)
1795 				dst = dst_check(dst, 0);
1796 			if (dst &&
1797 			    sk->sk_rx_dst_ifindex == skb->skb_iif)
1798 				skb_dst_set_noref(skb, dst);
1799 		}
1800 	}
1801 	return 0;
1802 }
1803 
1804 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1805 {
1806 	u32 limit, tail_gso_size, tail_gso_segs;
1807 	struct skb_shared_info *shinfo;
1808 	const struct tcphdr *th;
1809 	struct tcphdr *thtail;
1810 	struct sk_buff *tail;
1811 	unsigned int hdrlen;
1812 	bool fragstolen;
1813 	u32 gso_segs;
1814 	u32 gso_size;
1815 	int delta;
1816 
1817 	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1818 	 * we can fix skb->truesize to its real value to avoid future drops.
1819 	 * This is valid because skb is not yet charged to the socket.
1820 	 * It has been noticed pure SACK packets were sometimes dropped
1821 	 * (if cooked by drivers without copybreak feature).
1822 	 */
1823 	skb_condense(skb);
1824 
1825 	skb_dst_drop(skb);
1826 
1827 	if (unlikely(tcp_checksum_complete(skb))) {
1828 		bh_unlock_sock(sk);
1829 		trace_tcp_bad_csum(skb);
1830 		__TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1831 		__TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1832 		return true;
1833 	}
1834 
1835 	/* Attempt coalescing to last skb in backlog, even if we are
1836 	 * above the limits.
1837 	 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1838 	 */
1839 	th = (const struct tcphdr *)skb->data;
1840 	hdrlen = th->doff * 4;
1841 
1842 	tail = sk->sk_backlog.tail;
1843 	if (!tail)
1844 		goto no_coalesce;
1845 	thtail = (struct tcphdr *)tail->data;
1846 
1847 	if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1848 	    TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1849 	    ((TCP_SKB_CB(tail)->tcp_flags |
1850 	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1851 	    !((TCP_SKB_CB(tail)->tcp_flags &
1852 	      TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1853 	    ((TCP_SKB_CB(tail)->tcp_flags ^
1854 	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1855 #ifdef CONFIG_TLS_DEVICE
1856 	    tail->decrypted != skb->decrypted ||
1857 #endif
1858 	    thtail->doff != th->doff ||
1859 	    memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1860 		goto no_coalesce;
1861 
1862 	__skb_pull(skb, hdrlen);
1863 
1864 	shinfo = skb_shinfo(skb);
1865 	gso_size = shinfo->gso_size ?: skb->len;
1866 	gso_segs = shinfo->gso_segs ?: 1;
1867 
1868 	shinfo = skb_shinfo(tail);
1869 	tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
1870 	tail_gso_segs = shinfo->gso_segs ?: 1;
1871 
1872 	if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1873 		TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1874 
1875 		if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
1876 			TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1877 			thtail->window = th->window;
1878 		}
1879 
1880 		/* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1881 		 * thtail->fin, so that the fast path in tcp_rcv_established()
1882 		 * is not entered if we append a packet with a FIN.
1883 		 * SYN, RST, URG are not present.
1884 		 * ACK is set on both packets.
1885 		 * PSH : we do not really care in TCP stack,
1886 		 *       at least for 'GRO' packets.
1887 		 */
1888 		thtail->fin |= th->fin;
1889 		TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1890 
1891 		if (TCP_SKB_CB(skb)->has_rxtstamp) {
1892 			TCP_SKB_CB(tail)->has_rxtstamp = true;
1893 			tail->tstamp = skb->tstamp;
1894 			skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1895 		}
1896 
1897 		/* Not as strict as GRO. We only need to carry mss max value */
1898 		shinfo->gso_size = max(gso_size, tail_gso_size);
1899 		shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
1900 
1901 		sk->sk_backlog.len += delta;
1902 		__NET_INC_STATS(sock_net(sk),
1903 				LINUX_MIB_TCPBACKLOGCOALESCE);
1904 		kfree_skb_partial(skb, fragstolen);
1905 		return false;
1906 	}
1907 	__skb_push(skb, hdrlen);
1908 
1909 no_coalesce:
1910 	/* Only socket owner can try to collapse/prune rx queues
1911 	 * to reduce memory overhead, so add a little headroom here.
1912 	 * Few sockets backlog are possibly concurrently non empty.
1913 	 */
1914 	limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf) + 64*1024;
1915 
1916 	if (unlikely(sk_add_backlog(sk, skb, limit))) {
1917 		bh_unlock_sock(sk);
1918 		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1919 		return true;
1920 	}
1921 	return false;
1922 }
1923 EXPORT_SYMBOL(tcp_add_backlog);
1924 
1925 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1926 {
1927 	struct tcphdr *th = (struct tcphdr *)skb->data;
1928 
1929 	return sk_filter_trim_cap(sk, skb, th->doff * 4);
1930 }
1931 EXPORT_SYMBOL(tcp_filter);
1932 
1933 static void tcp_v4_restore_cb(struct sk_buff *skb)
1934 {
1935 	memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1936 		sizeof(struct inet_skb_parm));
1937 }
1938 
1939 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1940 			   const struct tcphdr *th)
1941 {
1942 	/* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
1943 	 * barrier() makes sure compiler wont play fool^Waliasing games.
1944 	 */
1945 	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1946 		sizeof(struct inet_skb_parm));
1947 	barrier();
1948 
1949 	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1950 	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1951 				    skb->len - th->doff * 4);
1952 	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1953 	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1954 	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1955 	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1956 	TCP_SKB_CB(skb)->sacked	 = 0;
1957 	TCP_SKB_CB(skb)->has_rxtstamp =
1958 			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1959 }
1960 
1961 /*
1962  *	From tcp_input.c
1963  */
1964 
/*
 * tcp_v4_rcv - receive-path entry point for one IPv4 TCP segment.
 * @skb: incoming segment; skb->data is read as the TCP header.
 *
 * Validates the packet type, header length and checksum, looks up the
 * owning socket and dispatches on its state (TIME_WAIT, NEW_SYN_RECV,
 * LISTEN, or anything else).
 *
 * On the drop paths in this function the skb is freed through
 * kfree_skb_reason() and 0 is returned.  Otherwise the return value is
 * the result of tcp_v4_do_rcv(), or 0 when the segment was queued on the
 * backlog or handed to a child socket.
 */
int tcp_v4_rcv(struct sk_buff *skb)
{
	struct net *net = dev_net(skb->dev);
	int sdif = inet_sdif(skb);
	int dif = inet_iif(skb);
	const struct iphdr *iph;
	const struct tcphdr *th;
	/* Tells the exit paths whether a sock_put() on sk is owed. */
	bool refcounted;
	struct sock *sk;
	int drop_reason;
	int ret;

	drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
	if (skb->pkt_type != PACKET_HOST)
		goto discard_it;

	/* Count it even if it's bad */
	__TCP_INC_STATS(net, TCP_MIB_INSEGS);

	/* Make sure the fixed part of the TCP header is in the linear area. */
	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
		goto discard_it;

	th = (const struct tcphdr *)skb->data;

	/* doff counts 32-bit words; reject headers shorter than the minimum. */
	if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
		drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
		goto bad_packet;
	}
	/* Now pull the full header, options included. */
	if (!pskb_may_pull(skb, th->doff * 4))
		goto discard_it;

	/* An explanation is required here, I think.
	 * Packet length and doff are validated by header prediction,
	 * provided the case of th->doff == 0 is eliminated.
	 * So, we defer the checks.
	 */

	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
		goto csum_error;

	/* Reload the header pointers after the pulls above. */
	th = (const struct tcphdr *)skb->data;
	iph = ip_hdr(skb);
lookup:
	sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
			       th->dest, sdif, &refcounted);
	if (!sk)
		goto no_tcp_socket;

process:
	if (sk->sk_state == TCP_TIME_WAIT)
		goto do_time_wait;

	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		/* sk is really a request sock; work on its listener instead. */
		struct request_sock *req = inet_reqsk(sk);
		bool req_stolen = false;
		struct sock *nsk;

		sk = req->rsk_listener;
		if (unlikely(tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))) {
			sk_drops_add(sk, skb);
			reqsk_put(req);
			goto discard_it;
		}
		if (tcp_checksum_complete(skb)) {
			reqsk_put(req);
			goto csum_error;
		}
		if (unlikely(sk->sk_state != TCP_LISTEN)) {
			/* The listener is no longer listening: try to migrate
			 * the request to another socket, else drop the request
			 * and redo the lookup.
			 */
			nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
			if (!nsk) {
				inet_csk_reqsk_queue_drop_and_put(sk, req);
				goto lookup;
			}
			sk = nsk;
			/* reuseport_migrate_sock() has already held one sk_refcnt
			 * before returning.
			 */
		} else {
			/* We own a reference on the listener, increase it again
			 * as we might lose it too soon.
			 */
			sock_hold(sk);
		}
		refcounted = true;
		nsk = NULL;
		if (!tcp_filter(sk, skb)) {
			/* Reload the header pointers after tcp_filter() ran. */
			th = (const struct tcphdr *)skb->data;
			iph = ip_hdr(skb);
			tcp_v4_fill_cb(skb, iph, th);
			nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
		}
		if (!nsk) {
			reqsk_put(req);
			if (req_stolen) {
				/* Another cpu got exclusive access to req
				 * and created a full blown socket.
				 * Try to feed this packet to this socket
				 * instead of discarding it.
				 */
				tcp_v4_restore_cb(skb);
				sock_put(sk);
				goto lookup;
			}
			goto discard_and_relse;
		}
		if (nsk == sk) {
			/* tcp_check_req() returned the listener itself: undo
			 * the cb fill and fall through to the common path below.
			 */
			reqsk_put(req);
			tcp_v4_restore_cb(skb);
		} else if (tcp_child_process(sk, nsk, skb)) {
			tcp_v4_send_reset(nsk, skb);
			goto discard_and_relse;
		} else {
			/* The child socket took the segment. */
			sock_put(sk);
			return 0;
		}
	}

	if (static_branch_unlikely(&ip4_min_ttl)) {
		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
			goto discard_and_relse;
		}
	}

	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
		goto discard_and_relse;

	if (tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))
		goto discard_and_relse;

	nf_reset_ct(skb);

	if (tcp_filter(sk, skb)) {
		drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
		goto discard_and_relse;
	}
	/* Reload the header pointers after tcp_filter() ran. */
	th = (const struct tcphdr *)skb->data;
	iph = ip_hdr(skb);
	tcp_v4_fill_cb(skb, iph, th);

	skb->dev = NULL;

	/* Listeners are processed here without taking the socket lock. */
	if (sk->sk_state == TCP_LISTEN) {
		ret = tcp_v4_do_rcv(sk, skb);
		goto put_and_return;
	}

	sk_incoming_cpu_update(sk);

	sk_defer_free_flush(sk);
	bh_lock_sock_nested(sk);
	tcp_segs_in(tcp_sk(sk), skb);
	ret = 0;
	if (!sock_owned_by_user(sk)) {
		ret = tcp_v4_do_rcv(sk, skb);
	} else {
		/* The socket is owned by user context: queue on the backlog.
		 * A non-zero return sends the segment down the drop path.
		 */
		if (tcp_add_backlog(sk, skb))
			goto discard_and_relse;
	}
	bh_unlock_sock(sk);

put_and_return:
	if (refcounted)
		sock_put(sk);

	return ret;

no_tcp_socket:
	drop_reason = SKB_DROP_REASON_NO_SOCKET;
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
		goto discard_it;

	tcp_v4_fill_cb(skb, iph, th);

	/* csum_error and bad_packet are jump targets inside this branch so
	 * that they share its statistics accounting; only a segment with a
	 * good checksum and no socket gets a reset.
	 */
	if (tcp_checksum_complete(skb)) {
csum_error:
		drop_reason = SKB_DROP_REASON_TCP_CSUM;
		trace_tcp_bad_csum(skb);
		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
bad_packet:
		__TCP_INC_STATS(net, TCP_MIB_INERRS);
	} else {
		tcp_v4_send_reset(NULL, skb);
	}

discard_it:
	/* Discard frame. */
	kfree_skb_reason(skb, drop_reason);
	return 0;

discard_and_relse:
	sk_drops_add(sk, skb);
	if (refcounted)
		sock_put(sk);
	goto discard_it;

do_time_wait:
	/* Every exit from this section must release the timewait socket,
	 * either with inet_twsk_put() or inet_twsk_deschedule_put().
	 */
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto discard_it;
	}

	tcp_v4_fill_cb(skb, iph, th);

	if (tcp_checksum_complete(skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto csum_error;
	}
	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
	case TCP_TW_SYN: {
		/* Look for a listener that could take this SYN. */
		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
							&tcp_hashinfo, skb,
							__tcp_hdrlen(th),
							iph->saddr, th->source,
							iph->daddr, th->dest,
							inet_iif(skb),
							sdif);
		if (sk2) {
			/* Drop the timewait socket and restart processing on
			 * the listener, with no sock_put() owed for it.
			 */
			inet_twsk_deschedule_put(inet_twsk(sk));
			sk = sk2;
			tcp_v4_restore_cb(skb);
			refcounted = false;
			goto process;
		}
	}
		/* to ACK */
		fallthrough;
	case TCP_TW_ACK:
		tcp_v4_timewait_ack(sk, skb);
		break;
	case TCP_TW_RST:
		tcp_v4_send_reset(sk, skb);
		inet_twsk_deschedule_put(inet_twsk(sk));
		goto discard_it;
	case TCP_TW_SUCCESS:;
	}
	goto discard_it;
}
2203 
/* Timewait socket operations for TCP; referenced from tcp_prot.twsk_prot. */
static struct timewait_sock_ops tcp_timewait_sock_ops = {
	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
	.twsk_unique	= tcp_twsk_unique,
	.twsk_destructor= tcp_twsk_destructor,
};
2209 
2210 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2211 {
2212 	struct dst_entry *dst = skb_dst(skb);
2213 
2214 	if (dst && dst_hold_safe(dst)) {
2215 		rcu_assign_pointer(sk->sk_rx_dst, dst);
2216 		sk->sk_rx_dst_ifindex = skb->skb_iif;
2217 	}
2218 }
2219 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2220 
/*
 * Address-family specific connection-socket operations for TCP over IPv4.
 * Installed as icsk->icsk_af_ops by tcp_v4_init_sock().
 */
const struct inet_connection_sock_af_ops ipv4_specific = {
	.queue_xmit	   = ip_queue_xmit,
	.send_check	   = tcp_v4_send_check,
	.rebuild_header	   = inet_sk_rebuild_header,
	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
	.conn_request	   = tcp_v4_conn_request,
	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
	.net_header_len	   = sizeof(struct iphdr),
	.setsockopt	   = ip_setsockopt,
	.getsockopt	   = ip_getsockopt,
	.addr2sockaddr	   = inet_csk_addr2sockaddr,
	.sockaddr_len	   = sizeof(struct sockaddr_in),
	.mtu_reduced	   = tcp_v4_mtu_reduced,
};
EXPORT_SYMBOL(ipv4_specific);
2236 
#ifdef CONFIG_TCP_MD5SIG
/*
 * IPv4 flavour of the TCP MD5 signature hooks; only built with
 * CONFIG_TCP_MD5SIG and installed as tp->af_specific by tcp_v4_init_sock().
 */
static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
	.md5_lookup		= tcp_v4_md5_lookup,
	.calc_md5_hash		= tcp_v4_md5_hash_skb,
	.md5_parse		= tcp_v4_parse_md5_keys,
};
#endif
2244 
2245 /* NOTE: A lot of things set to zero explicitly by call to
2246  *       sk_alloc() so need not be done here.
2247  */
2248 static int tcp_v4_init_sock(struct sock *sk)
2249 {
2250 	struct inet_connection_sock *icsk = inet_csk(sk);
2251 
2252 	tcp_init_sock(sk);
2253 
2254 	icsk->icsk_af_ops = &ipv4_specific;
2255 
2256 #ifdef CONFIG_TCP_MD5SIG
2257 	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2258 #endif
2259 
2260 	return 0;
2261 }
2262 
/*
 * Release the TCP-specific state hanging off @sk: timers, congestion
 * control and ULP state, queued data, MD5 keys, the bind bucket and any
 * fastopen leftovers.  Finishes by decrementing the allocated-sockets
 * counter.
 */
void tcp_v4_destroy_sock(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	trace_tcp_destroy_sock(sk);

	tcp_clear_xmit_timers(sk);

	tcp_cleanup_congestion_control(sk);

	tcp_cleanup_ulp(sk);

	/* Clean up the write buffer. */
	tcp_write_queue_purge(sk);

	/* Check if we want to disable active TFO */
	tcp_fastopen_active_disable_ofo_check(sk);

	/* Cleans up our, hopefully empty, out_of_order_queue. */
	skb_rbtree_purge(&tp->out_of_order_queue);

#ifdef CONFIG_TCP_MD5SIG
	/* Clean up the MD5 key list, if any */
	if (tp->md5sig_info) {
		tcp_clear_md5_list(sk);
		/* The info block itself is freed with kfree_rcu(). */
		kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
		tp->md5sig_info = NULL;
	}
#endif

	/* Clean up a referenced TCP bind bucket. */
	if (inet_csk(sk)->icsk_bind_hash)
		inet_put_port(sk);

	/* A fastopen request sock still attached at this point is a bug. */
	BUG_ON(rcu_access_pointer(tp->fastopen_rsk));

	/* If socket is aborted during connect operation */
	tcp_free_fastopen_req(tp);
	tcp_fastopen_destroy_cipher(sk);
	tcp_saved_syn_free(tp);

	sk_sockets_allocated_dec(sk);
}
EXPORT_SYMBOL(tcp_v4_destroy_sock);
2307 
2308 #ifdef CONFIG_PROC_FS
2309 /* Proc filesystem TCP sock list dumping. */
2310 
2311 static unsigned short seq_file_family(const struct seq_file *seq);
2312 
2313 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2314 {
2315 	unsigned short family = seq_file_family(seq);
2316 
2317 	/* AF_UNSPEC is used as a match all */
2318 	return ((family == AF_UNSPEC || family == sk->sk_family) &&
2319 		net_eq(sock_net(sk), seq_file_net(seq)));
2320 }
2321 
2322 /* Find a non empty bucket (starting from st->bucket)
2323  * and return the first sk from it.
2324  */
2325 static void *listening_get_first(struct seq_file *seq)
2326 {
2327 	struct tcp_iter_state *st = seq->private;
2328 
2329 	st->offset = 0;
2330 	for (; st->bucket <= tcp_hashinfo.lhash2_mask; st->bucket++) {
2331 		struct inet_listen_hashbucket *ilb2;
2332 		struct inet_connection_sock *icsk;
2333 		struct sock *sk;
2334 
2335 		ilb2 = &tcp_hashinfo.lhash2[st->bucket];
2336 		if (hlist_empty(&ilb2->head))
2337 			continue;
2338 
2339 		spin_lock(&ilb2->lock);
2340 		inet_lhash2_for_each_icsk(icsk, &ilb2->head) {
2341 			sk = (struct sock *)icsk;
2342 			if (seq_sk_match(seq, sk))
2343 				return sk;
2344 		}
2345 		spin_unlock(&ilb2->lock);
2346 	}
2347 
2348 	return NULL;
2349 }
2350 
2351 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
2352  * If "cur" is the last one in the st->bucket,
2353  * call listening_get_first() to return the first sk of the next
2354  * non empty bucket.
2355  */
2356 static void *listening_get_next(struct seq_file *seq, void *cur)
2357 {
2358 	struct tcp_iter_state *st = seq->private;
2359 	struct inet_listen_hashbucket *ilb2;
2360 	struct inet_connection_sock *icsk;
2361 	struct sock *sk = cur;
2362 
2363 	++st->num;
2364 	++st->offset;
2365 
2366 	icsk = inet_csk(sk);
2367 	inet_lhash2_for_each_icsk_continue(icsk) {
2368 		sk = (struct sock *)icsk;
2369 		if (seq_sk_match(seq, sk))
2370 			return sk;
2371 	}
2372 
2373 	ilb2 = &tcp_hashinfo.lhash2[st->bucket];
2374 	spin_unlock(&ilb2->lock);
2375 	++st->bucket;
2376 	return listening_get_first(seq);
2377 }
2378 
2379 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2380 {
2381 	struct tcp_iter_state *st = seq->private;
2382 	void *rc;
2383 
2384 	st->bucket = 0;
2385 	st->offset = 0;
2386 	rc = listening_get_first(seq);
2387 
2388 	while (rc && *pos) {
2389 		rc = listening_get_next(seq, rc);
2390 		--*pos;
2391 	}
2392 	return rc;
2393 }
2394 
2395 static inline bool empty_bucket(const struct tcp_iter_state *st)
2396 {
2397 	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2398 }
2399 
2400 /*
2401  * Get first established socket starting from bucket given in st->bucket.
2402  * If st->bucket is zero, the very first socket in the hash is returned.
2403  */
2404 static void *established_get_first(struct seq_file *seq)
2405 {
2406 	struct tcp_iter_state *st = seq->private;
2407 
2408 	st->offset = 0;
2409 	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2410 		struct sock *sk;
2411 		struct hlist_nulls_node *node;
2412 		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2413 
2414 		/* Lockless fast path for the common case of empty buckets */
2415 		if (empty_bucket(st))
2416 			continue;
2417 
2418 		spin_lock_bh(lock);
2419 		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2420 			if (seq_sk_match(seq, sk))
2421 				return sk;
2422 		}
2423 		spin_unlock_bh(lock);
2424 	}
2425 
2426 	return NULL;
2427 }
2428 
2429 static void *established_get_next(struct seq_file *seq, void *cur)
2430 {
2431 	struct sock *sk = cur;
2432 	struct hlist_nulls_node *node;
2433 	struct tcp_iter_state *st = seq->private;
2434 
2435 	++st->num;
2436 	++st->offset;
2437 
2438 	sk = sk_nulls_next(sk);
2439 
2440 	sk_nulls_for_each_from(sk, node) {
2441 		if (seq_sk_match(seq, sk))
2442 			return sk;
2443 	}
2444 
2445 	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2446 	++st->bucket;
2447 	return established_get_first(seq);
2448 }
2449 
2450 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2451 {
2452 	struct tcp_iter_state *st = seq->private;
2453 	void *rc;
2454 
2455 	st->bucket = 0;
2456 	rc = established_get_first(seq);
2457 
2458 	while (rc && pos) {
2459 		rc = established_get_next(seq, rc);
2460 		--pos;
2461 	}
2462 	return rc;
2463 }
2464 
2465 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2466 {
2467 	void *rc;
2468 	struct tcp_iter_state *st = seq->private;
2469 
2470 	st->state = TCP_SEQ_STATE_LISTENING;
2471 	rc	  = listening_get_idx(seq, &pos);
2472 
2473 	if (!rc) {
2474 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2475 		rc	  = established_get_idx(seq, pos);
2476 	}
2477 
2478 	return rc;
2479 }
2480 
/*
 * Try to resume iteration at the position recorded in the iterator state
 * (st->state, st->bucket, st->offset) instead of rescanning from the
 * beginning.
 *
 * Returns the socket found, or NULL when nothing is left.  st->num is
 * restored before returning because the *_get_next() helpers used for
 * skipping increment it.
 */
static void *tcp_seek_last_pos(struct seq_file *seq)
{
	struct tcp_iter_state *st = seq->private;
	int bucket = st->bucket;
	int offset = st->offset;
	int orig_num = st->num;
	void *rc = NULL;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		if (st->bucket > tcp_hashinfo.lhash2_mask)
			break;
		st->state = TCP_SEQ_STATE_LISTENING;
		rc = listening_get_first(seq);
		/* Skip up to "offset" entries, but stop as soon as the walk
		 * leaves the saved bucket.
		 */
		while (offset-- && rc && bucket == st->bucket)
			rc = listening_get_next(seq, rc);
		if (rc)
			break;
		/* Listening table exhausted: continue with the established one. */
		st->bucket = 0;
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		fallthrough;
	case TCP_SEQ_STATE_ESTABLISHED:
		if (st->bucket > tcp_hashinfo.ehash_mask)
			break;
		rc = established_get_first(seq);
		while (offset-- && rc && bucket == st->bucket)
			rc = established_get_next(seq, rc);
	}

	st->num = orig_num;

	return rc;
}
2514 
2515 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2516 {
2517 	struct tcp_iter_state *st = seq->private;
2518 	void *rc;
2519 
2520 	if (*pos && *pos == st->last_pos) {
2521 		rc = tcp_seek_last_pos(seq);
2522 		if (rc)
2523 			goto out;
2524 	}
2525 
2526 	st->state = TCP_SEQ_STATE_LISTENING;
2527 	st->num = 0;
2528 	st->bucket = 0;
2529 	st->offset = 0;
2530 	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2531 
2532 out:
2533 	st->last_pos = *pos;
2534 	return rc;
2535 }
2536 EXPORT_SYMBOL(tcp_seq_start);
2537 
2538 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2539 {
2540 	struct tcp_iter_state *st = seq->private;
2541 	void *rc = NULL;
2542 
2543 	if (v == SEQ_START_TOKEN) {
2544 		rc = tcp_get_idx(seq, 0);
2545 		goto out;
2546 	}
2547 
2548 	switch (st->state) {
2549 	case TCP_SEQ_STATE_LISTENING:
2550 		rc = listening_get_next(seq, v);
2551 		if (!rc) {
2552 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2553 			st->bucket = 0;
2554 			st->offset = 0;
2555 			rc	  = established_get_first(seq);
2556 		}
2557 		break;
2558 	case TCP_SEQ_STATE_ESTABLISHED:
2559 		rc = established_get_next(seq, v);
2560 		break;
2561 	}
2562 out:
2563 	++*pos;
2564 	st->last_pos = *pos;
2565 	return rc;
2566 }
2567 EXPORT_SYMBOL(tcp_seq_next);
2568 
2569 void tcp_seq_stop(struct seq_file *seq, void *v)
2570 {
2571 	struct tcp_iter_state *st = seq->private;
2572 
2573 	switch (st->state) {
2574 	case TCP_SEQ_STATE_LISTENING:
2575 		if (v != SEQ_START_TOKEN)
2576 			spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock);
2577 		break;
2578 	case TCP_SEQ_STATE_ESTABLISHED:
2579 		if (v)
2580 			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2581 		break;
2582 	}
2583 }
2584 EXPORT_SYMBOL(tcp_seq_stop);
2585 
/*
 * Print one table line for a request sock (an open request on a
 * listener).  @i is the entry number shown in the first column.  The
 * state column is always printed as TCP_SYN_RECV and the uid is taken
 * from the listener.
 */
static void get_openreq4(const struct request_sock *req,
			 struct seq_file *f, int i)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	/* Time left on the request timer, in jiffies. */
	long delta = req->rsk_timer.expires - jiffies;

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
		i,
		ireq->ir_loc_addr,
		ireq->ir_num,
		ireq->ir_rmt_addr,
		ntohs(ireq->ir_rmt_port),
		TCP_SYN_RECV,
		0, 0, /* could print option size, but that is af dependent. */
		1,    /* timers active (only the expire timer) */
		jiffies_delta_to_clock_t(delta),
		req->num_timeout,
		from_kuid_munged(seq_user_ns(f),
				 sock_i_uid(req->rsk_listener)),
		0,  /* non standard timer */
		0, /* open_requests have no inode */
		0,
		req);
}
2611 
/*
 * Print one table line for a full TCP socket.  @i is the entry number
 * shown in the first column.  The socket is not locked here, so the
 * values printed are a best-effort snapshot.
 */
static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
{
	int timer_active;
	unsigned long timer_expires;
	const struct tcp_sock *tp = tcp_sk(sk);
	const struct inet_connection_sock *icsk = inet_csk(sk);
	const struct inet_sock *inet = inet_sk(sk);
	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
	__be32 dest = inet->inet_daddr;
	__be32 src = inet->inet_rcv_saddr;
	__u16 destp = ntohs(inet->inet_dport);
	__u16 srcp = ntohs(inet->inet_sport);
	int rx_queue;
	int state;

	/* Pick the "tr" column code and matching expiry time:
	 * 1 = retransmit / reordering / loss-probe timer, 4 = zero-window
	 * probe timer, 2 = sk_timer pending, 0 = none of these.
	 */
	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
		timer_active	= 1;
		timer_expires	= icsk->icsk_timeout;
	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
		timer_active	= 4;
		timer_expires	= icsk->icsk_timeout;
	} else if (timer_pending(&sk->sk_timer)) {
		timer_active	= 2;
		timer_expires	= sk->sk_timer.expires;
	} else {
		timer_active	= 0;
		timer_expires = jiffies;
	}

	state = inet_sk_state_load(sk);
	/* For listeners the rx_queue column shows the accept backlog. */
	if (state == TCP_LISTEN)
		rx_queue = READ_ONCE(sk->sk_ack_backlog);
	else
		/* Because we don't lock the socket,
		 * we might find a transient negative value.
		 */
		rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
				      READ_ONCE(tp->copied_seq), 0);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
		i, src, srcp, dest, destp, state,
		READ_ONCE(tp->write_seq) - tp->snd_una,
		rx_queue,
		timer_active,
		jiffies_delta_to_clock_t(timer_expires - jiffies),
		icsk->icsk_retransmits,
		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
		icsk->icsk_probes_out,
		sock_i_ino(sk),
		refcount_read(&sk->sk_refcnt), sk,
		jiffies_to_clock_t(icsk->icsk_rto),
		jiffies_to_clock_t(icsk->icsk_ack.ato),
		(icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
		tp->snd_cwnd,
		/* Last column: fastopen queue limit for listeners, otherwise
		 * ssthresh, or -1 while still in initial slow start.
		 */
		state == TCP_LISTEN ?
		    fastopenq->max_qlen :
		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
}
2673 
/*
 * Print one table line for a timewait socket.  @i is the entry number
 * shown in the first column.  The state column shows tw_substate, the
 * timer code is fixed at 3, and the queue, retransmit, uid, timeout and
 * inode columns are printed as zero.
 */
static void get_timewait4_sock(const struct inet_timewait_sock *tw,
			       struct seq_file *f, int i)
{
	/* Time left on the timewait timer, in jiffies. */
	long delta = tw->tw_timer.expires - jiffies;
	__be32 dest, src;
	__u16 destp, srcp;

	dest  = tw->tw_daddr;
	src   = tw->tw_rcv_saddr;
	destp = ntohs(tw->tw_dport);
	srcp  = ntohs(tw->tw_sport);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
		refcount_read(&tw->tw_refcnt), tw);
}
2692 
2693 #define TMPSZ 150
2694 
2695 static int tcp4_seq_show(struct seq_file *seq, void *v)
2696 {
2697 	struct tcp_iter_state *st;
2698 	struct sock *sk = v;
2699 
2700 	seq_setwidth(seq, TMPSZ - 1);
2701 	if (v == SEQ_START_TOKEN) {
2702 		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2703 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2704 			   "inode");
2705 		goto out;
2706 	}
2707 	st = seq->private;
2708 
2709 	if (sk->sk_state == TCP_TIME_WAIT)
2710 		get_timewait4_sock(v, seq, st->num);
2711 	else if (sk->sk_state == TCP_NEW_SYN_RECV)
2712 		get_openreq4(v, seq, st->num);
2713 	else
2714 		get_tcp4_sock(v, seq, st->num);
2715 out:
2716 	seq_pad(seq, '\n');
2717 	return 0;
2718 }
2719 
2720 #ifdef CONFIG_BPF_SYSCALL
/*
 * Per-seq_file state of the bpf TCP iterator.  The generic
 * tcp_iter_state comes first; the remaining fields manage a batch of
 * referenced sockets collected from one hash bucket.
 */
struct bpf_tcp_iter_state {
	struct tcp_iter_state state;
	unsigned int cur_sk;	/* index of the batch entry being shown */
	unsigned int end_sk;	/* number of valid entries in batch[] */
	unsigned int max_sk;	/* capacity of batch[] */
	struct sock **batch;	/* sockets held with sock_hold() */
	bool st_bucket_done;	/* the whole bucket fit into the batch */
};
2729 
/* Context structure handed to the bpf program by tcp_prog_seq_show(). */
struct bpf_iter__tcp {
	__bpf_md_ptr(struct bpf_iter_meta *, meta);
	__bpf_md_ptr(struct sock_common *, sk_common);
	uid_t uid __aligned(8);
};
2735 
2736 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
2737 			     struct sock_common *sk_common, uid_t uid)
2738 {
2739 	struct bpf_iter__tcp ctx;
2740 
2741 	meta->seq_num--;  /* skip SEQ_START_TOKEN */
2742 	ctx.meta = meta;
2743 	ctx.sk_common = sk_common;
2744 	ctx.uid = uid;
2745 	return bpf_iter_run_prog(prog, &ctx);
2746 }
2747 
2748 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
2749 {
2750 	while (iter->cur_sk < iter->end_sk)
2751 		sock_put(iter->batch[iter->cur_sk++]);
2752 }
2753 
2754 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
2755 				      unsigned int new_batch_sz)
2756 {
2757 	struct sock **new_batch;
2758 
2759 	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
2760 			     GFP_USER | __GFP_NOWARN);
2761 	if (!new_batch)
2762 		return -ENOMEM;
2763 
2764 	bpf_iter_tcp_put_batch(iter);
2765 	kvfree(iter->batch);
2766 	iter->batch = new_batch;
2767 	iter->max_sk = new_batch_sz;
2768 
2769 	return 0;
2770 }
2771 
2772 static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
2773 						 struct sock *start_sk)
2774 {
2775 	struct bpf_tcp_iter_state *iter = seq->private;
2776 	struct tcp_iter_state *st = &iter->state;
2777 	struct inet_connection_sock *icsk;
2778 	unsigned int expected = 1;
2779 	struct sock *sk;
2780 
2781 	sock_hold(start_sk);
2782 	iter->batch[iter->end_sk++] = start_sk;
2783 
2784 	icsk = inet_csk(start_sk);
2785 	inet_lhash2_for_each_icsk_continue(icsk) {
2786 		sk = (struct sock *)icsk;
2787 		if (seq_sk_match(seq, sk)) {
2788 			if (iter->end_sk < iter->max_sk) {
2789 				sock_hold(sk);
2790 				iter->batch[iter->end_sk++] = sk;
2791 			}
2792 			expected++;
2793 		}
2794 	}
2795 	spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock);
2796 
2797 	return expected;
2798 }
2799 
2800 static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
2801 						   struct sock *start_sk)
2802 {
2803 	struct bpf_tcp_iter_state *iter = seq->private;
2804 	struct tcp_iter_state *st = &iter->state;
2805 	struct hlist_nulls_node *node;
2806 	unsigned int expected = 1;
2807 	struct sock *sk;
2808 
2809 	sock_hold(start_sk);
2810 	iter->batch[iter->end_sk++] = start_sk;
2811 
2812 	sk = sk_nulls_next(start_sk);
2813 	sk_nulls_for_each_from(sk, node) {
2814 		if (seq_sk_match(seq, sk)) {
2815 			if (iter->end_sk < iter->max_sk) {
2816 				sock_hold(sk);
2817 				iter->batch[iter->end_sk++] = sk;
2818 			}
2819 			expected++;
2820 		}
2821 	}
2822 	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2823 
2824 	return expected;
2825 }
2826 
/*
 * Fill iter->batch with the sockets of the next bucket to visit and
 * return the first one, or NULL when iteration is complete.
 *
 * If the bucket holds more matching sockets than the batch can store,
 * the batch is grown once and the bucket is collected again.  Should it
 * still not fit (or the resize fail), the partial batch is returned with
 * st_bucket_done left false.
 */
static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
{
	struct bpf_tcp_iter_state *iter = seq->private;
	struct tcp_iter_state *st = &iter->state;
	unsigned int expected;
	bool resized = false;
	struct sock *sk;

	/* The st->bucket is done.  Directly advance to the next
	 * bucket instead of having the tcp_seek_last_pos() to skip
	 * one by one in the current bucket and eventually find out
	 * it has to advance to the next bucket.
	 */
	if (iter->st_bucket_done) {
		st->offset = 0;
		st->bucket++;
		if (st->state == TCP_SEQ_STATE_LISTENING &&
		    st->bucket > tcp_hashinfo.lhash2_mask) {
			st->state = TCP_SEQ_STATE_ESTABLISHED;
			st->bucket = 0;
		}
	}

again:
	/* Get a new batch */
	iter->cur_sk = 0;
	iter->end_sk = 0;
	iter->st_bucket_done = false;

	sk = tcp_seek_last_pos(seq);
	if (!sk)
		return NULL; /* Done */

	/* Both batch helpers release the bucket lock before returning. */
	if (st->state == TCP_SEQ_STATE_LISTENING)
		expected = bpf_iter_tcp_listening_batch(seq, sk);
	else
		expected = bpf_iter_tcp_established_batch(seq, sk);

	/* Everything that matched was stored: the bucket is complete. */
	if (iter->end_sk == expected) {
		iter->st_bucket_done = true;
		return sk;
	}

	/* The batch was too small.  Grow it to 1.5 times what this bucket
	 * needed and retry, at most once; a successful resize also drops
	 * the references taken for the partial batch.
	 */
	if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) {
		resized = true;
		goto again;
	}

	return sk;
}
2877 
2878 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
2879 {
2880 	/* bpf iter does not support lseek, so it always
2881 	 * continue from where it was stop()-ped.
2882 	 */
2883 	if (*pos)
2884 		return bpf_iter_tcp_batch(seq);
2885 
2886 	return SEQ_START_TOKEN;
2887 }
2888 
2889 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2890 {
2891 	struct bpf_tcp_iter_state *iter = seq->private;
2892 	struct tcp_iter_state *st = &iter->state;
2893 	struct sock *sk;
2894 
2895 	/* Whenever seq_next() is called, the iter->cur_sk is
2896 	 * done with seq_show(), so advance to the next sk in
2897 	 * the batch.
2898 	 */
2899 	if (iter->cur_sk < iter->end_sk) {
2900 		/* Keeping st->num consistent in tcp_iter_state.
2901 		 * bpf_iter_tcp does not use st->num.
2902 		 * meta.seq_num is used instead.
2903 		 */
2904 		st->num++;
2905 		/* Move st->offset to the next sk in the bucket such that
2906 		 * the future start() will resume at st->offset in
2907 		 * st->bucket.  See tcp_seek_last_pos().
2908 		 */
2909 		st->offset++;
2910 		sock_put(iter->batch[iter->cur_sk++]);
2911 	}
2912 
2913 	if (iter->cur_sk < iter->end_sk)
2914 		sk = iter->batch[iter->cur_sk];
2915 	else
2916 		sk = bpf_iter_tcp_batch(seq);
2917 
2918 	++*pos;
2919 	/* Keeping st->last_pos consistent in tcp_iter_state.
2920 	 * bpf iter does not do lseek, so st->last_pos always equals to *pos.
2921 	 */
2922 	st->last_pos = *pos;
2923 	return sk;
2924 }
2925 
/*
 * seq_file ->show() for the bpf TCP iterator: run the attached bpf
 * program on the socket @v.  Full sockets are locked with
 * lock_sock_fast() around the call.  Returns 0 for SEQ_START_TOKEN,
 * SEQ_SKIP for a socket that is no longer hashed, otherwise the result
 * of tcp_prog_seq_show().
 */
static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
{
	struct bpf_iter_meta meta;
	struct bpf_prog *prog;
	struct sock *sk = v;
	/* Only assigned, and only read, when sk is a full socket. */
	bool slow;
	uid_t uid;
	int ret;

	if (v == SEQ_START_TOKEN)
		return 0;

	if (sk_fullsock(sk))
		slow = lock_sock_fast(sk);

	/* The socket was unhashed after it was batched: skip it. */
	if (unlikely(sk_unhashed(sk))) {
		ret = SEQ_SKIP;
		goto unlock;
	}

	/* Timewait sockets report uid 0; request socks report the uid of
	 * their listener.
	 */
	if (sk->sk_state == TCP_TIME_WAIT) {
		uid = 0;
	} else if (sk->sk_state == TCP_NEW_SYN_RECV) {
		const struct request_sock *req = v;

		uid = from_kuid_munged(seq_user_ns(seq),
				       sock_i_uid(req->rsk_listener));
	} else {
		uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
	}

	meta.seq = seq;
	prog = bpf_iter_get_info(&meta, false);
	ret = tcp_prog_seq_show(prog, &meta, v, uid);

unlock:
	if (sk_fullsock(sk))
		unlock_sock_fast(sk, slow);
	return ret;

}
2967 
2968 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
2969 {
2970 	struct bpf_tcp_iter_state *iter = seq->private;
2971 	struct bpf_iter_meta meta;
2972 	struct bpf_prog *prog;
2973 
2974 	if (!v) {
2975 		meta.seq = seq;
2976 		prog = bpf_iter_get_info(&meta, true);
2977 		if (prog)
2978 			(void)tcp_prog_seq_show(prog, &meta, v, 0);
2979 	}
2980 
2981 	if (iter->cur_sk < iter->end_sk) {
2982 		bpf_iter_tcp_put_batch(iter);
2983 		iter->st_bucket_done = false;
2984 	}
2985 }
2986 
/*
 * seq_file operations of the bpf TCP iterator.  seq_file_family() also
 * compares seq->op against this table to recognise bpf iteration.
 */
static const struct seq_operations bpf_iter_tcp_seq_ops = {
	.show		= bpf_iter_tcp_seq_show,
	.start		= bpf_iter_tcp_seq_start,
	.next		= bpf_iter_tcp_seq_next,
	.stop		= bpf_iter_tcp_seq_stop,
};
2993 #endif
2994 static unsigned short seq_file_family(const struct seq_file *seq)
2995 {
2996 	const struct tcp_seq_afinfo *afinfo;
2997 
2998 #ifdef CONFIG_BPF_SYSCALL
2999 	/* Iterated from bpf_iter.  Let the bpf prog to filter instead. */
3000 	if (seq->op == &bpf_iter_tcp_seq_ops)
3001 		return AF_UNSPEC;
3002 #endif
3003 
3004 	/* Iterated from proc fs */
3005 	afinfo = pde_data(file_inode(seq->file));
3006 	return afinfo->family;
3007 }
3008 
/* seq_file operations behind the per-netns "tcp" proc entry. */
static const struct seq_operations tcp4_seq_ops = {
	.show		= tcp4_seq_show,
	.start		= tcp_seq_start,
	.next		= tcp_seq_next,
	.stop		= tcp_seq_stop,
};
3015 
/*
 * Proc entry data for the "tcp" file; seq_file_family() reads .family
 * from it, which limits that file to AF_INET sockets.
 */
static struct tcp_seq_afinfo tcp4_seq_afinfo = {
	.family		= AF_INET,
};
3019 
3020 static int __net_init tcp4_proc_init_net(struct net *net)
3021 {
3022 	if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
3023 			sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
3024 		return -ENOMEM;
3025 	return 0;
3026 }
3027 
/* Per-netns exit: remove the "tcp" entry created by tcp4_proc_init_net(). */
static void __net_exit tcp4_proc_exit_net(struct net *net)
{
	remove_proc_entry("tcp", net->proc_net);
}
3032 
/* Per-netns hooks that create and remove the "tcp" proc entry. */
static struct pernet_operations tcp4_net_ops = {
	.init = tcp4_proc_init_net,
	.exit = tcp4_proc_exit_net,
};
3037 
/*
 * Register the per-netns proc hooks for the "tcp" file.
 * Returns the result of register_pernet_subsys().
 */
int __init tcp4_proc_init(void)
{
	return register_pernet_subsys(&tcp4_net_ops);
}
3042 
/* Unregister the per-netns proc hooks registered by tcp4_proc_init(). */
void tcp4_proc_exit(void)
{
	unregister_pernet_subsys(&tcp4_net_ops);
}
3047 #endif /* CONFIG_PROC_FS */
3048 
3049 /* @wake is one when sk_stream_write_space() calls us.
3050  * This sends EPOLLOUT only if notsent_bytes is half the limit.
3051  * This mimics the strategy used in sock_def_write_space().
3052  */
3053 bool tcp_stream_memory_free(const struct sock *sk, int wake)
3054 {
3055 	const struct tcp_sock *tp = tcp_sk(sk);
3056 	u32 notsent_bytes = READ_ONCE(tp->write_seq) -
3057 			    READ_ONCE(tp->snd_nxt);
3058 
3059 	return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
3060 }
3061 EXPORT_SYMBOL(tcp_stream_memory_free);
3062 
/*
 * Protocol descriptor for TCP over IPv4: socket operations, memory
 * accounting hooks, object sizes and the hash table used for lookups.
 */
struct proto tcp_prot = {
	.name			= "TCP",
	.owner			= THIS_MODULE,
	.close			= tcp_close,
	.pre_connect		= tcp_v4_pre_connect,
	.connect		= tcp_v4_connect,
	.disconnect		= tcp_disconnect,
	.accept			= inet_csk_accept,
	.ioctl			= tcp_ioctl,
	.init			= tcp_v4_init_sock,
	.destroy		= tcp_v4_destroy_sock,
	.shutdown		= tcp_shutdown,
	.setsockopt		= tcp_setsockopt,
	.getsockopt		= tcp_getsockopt,
	.bpf_bypass_getsockopt	= tcp_bpf_bypass_getsockopt,
	.keepalive		= tcp_set_keepalive,
	.recvmsg		= tcp_recvmsg,
	.sendmsg		= tcp_sendmsg,
	.sendpage		= tcp_sendpage,
	/* Same handler that tcp_v4_rcv() calls directly for unowned sockets. */
	.backlog_rcv		= tcp_v4_do_rcv,
	.release_cb		= tcp_release_cb,
	.hash			= inet_hash,
	.unhash			= inet_unhash,
	.get_port		= inet_csk_get_port,
	.put_port		= inet_put_port,
#ifdef CONFIG_BPF_SYSCALL
	.psock_update_sk_prot	= tcp_bpf_update_proto,
#endif
	.enter_memory_pressure	= tcp_enter_memory_pressure,
	.leave_memory_pressure	= tcp_leave_memory_pressure,
	.stream_memory_free	= tcp_stream_memory_free,
	.sockets_allocated	= &tcp_sockets_allocated,
	.orphan_count		= &tcp_orphan_count,
	.memory_allocated	= &tcp_memory_allocated,
	.memory_pressure	= &tcp_memory_pressure,
	.sysctl_mem		= sysctl_tcp_mem,
	/* The wmem/rmem limits are located by offset into struct net. */
	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
	.max_header		= MAX_TCP_HEADER,
	.obj_size		= sizeof(struct tcp_sock),
	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
	.twsk_prot		= &tcp_timewait_sock_ops,
	.rsk_prot		= &tcp_request_sock_ops,
	.h.hashinfo		= &tcp_hashinfo,
	.no_autobind		= true,
	.diag_destroy		= tcp_abort,
};
EXPORT_SYMBOL(tcp_prot);
3111 
3112 static void __net_exit tcp_sk_exit(struct net *net)
3113 {
3114 	int cpu;
3115 
3116 	if (net->ipv4.tcp_congestion_control)
3117 		bpf_module_put(net->ipv4.tcp_congestion_control,
3118 			       net->ipv4.tcp_congestion_control->owner);
3119 
3120 	for_each_possible_cpu(cpu)
3121 		inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
3122 	free_percpu(net->ipv4.tcp_sk);
3123 }
3124 
/* Per-netns initialisation for IPv4 TCP.
 *
 * Allocates a per-CPU array of socket pointers, creates one raw
 * IPPROTO_TCP control socket for each possible CPU, fills in this
 * namespace's TCP sysctl defaults and selects its congestion control.
 *
 * Returns 0 on success, -ENOMEM if the per-CPU array cannot be allocated,
 * or the error from inet_ctl_sock_create(); in the latter case
 * tcp_sk_exit() is called first to undo the partial setup.
 */
static int __net_init tcp_sk_init(struct net *net)
{
	int res, cpu, cnt;

	net->ipv4.tcp_sk = alloc_percpu(struct sock *);
	if (!net->ipv4.tcp_sk)
		return -ENOMEM;

	for_each_possible_cpu(cpu) {
		struct sock *sk;

		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
					   IPPROTO_TCP, net);
		if (res)
			goto fail;
		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);

		/* Please enforce IP_DF and IPID==0 for RST and
		 * ACK sent in SYN-RECV and TIME-WAIT state.
		 */
		inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;

		*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
	}

	/* From here on: default values for this namespace's TCP sysctls. */
	net->ipv4.sysctl_tcp_ecn = 2;
	net->ipv4.sysctl_tcp_ecn_fallback = 1;

	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
	net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
	net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;

	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;

	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
	net->ipv4.sysctl_tcp_syncookies = 1;
	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
	net->ipv4.sysctl_tcp_orphan_retries = 0;
	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
	net->ipv4.sysctl_tcp_tw_reuse = 2;
	net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;

	/* cnt is the number of buckets in the established hash; the
	 * timewait bucket limit and SYN backlog default are scaled from it.
	 */
	cnt = tcp_hashinfo.ehash_mask + 1;
	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
	net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;

	net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
	net->ipv4.sysctl_tcp_sack = 1;
	net->ipv4.sysctl_tcp_window_scaling = 1;
	net->ipv4.sysctl_tcp_timestamps = 1;
	net->ipv4.sysctl_tcp_early_retrans = 3;
	net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
	net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
	net->ipv4.sysctl_tcp_retrans_collapse = 1;
	net->ipv4.sysctl_tcp_max_reordering = 300;
	net->ipv4.sysctl_tcp_dsack = 1;
	net->ipv4.sysctl_tcp_app_win = 31;
	net->ipv4.sysctl_tcp_adv_win_scale = 1;
	net->ipv4.sysctl_tcp_frto = 2;
	net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
	/* This limits the percentage of the congestion window which we
	 * will allow a single TSO frame to consume.  Building TSO frames
	 * which are too large can cause TCP streams to be bursty.
	 */
	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
	/* Default TSQ limit of 16 TSO segments */
	net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
	/* rfc5961 challenge ack rate limiting */
	net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
	net->ipv4.sysctl_tcp_min_tso_segs = 2;
	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
	net->ipv4.sysctl_tcp_autocorking = 1;
	net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
	net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
	net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
	/* Any namespace other than init_net starts from a copy of
	 * init_net's current tcp_rmem / tcp_wmem values.
	 */
	if (net != &init_net) {
		memcpy(net->ipv4.sysctl_tcp_rmem,
		       init_net.ipv4.sysctl_tcp_rmem,
		       sizeof(init_net.ipv4.sysctl_tcp_rmem));
		memcpy(net->ipv4.sysctl_tcp_wmem,
		       init_net.ipv4.sysctl_tcp_wmem,
		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
	}
	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
	net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
	atomic_set(&net->ipv4.tfo_active_disable_times, 0);

	/* Reno is always built in */
	/* A non-initial namespace takes init_net's congestion control when
	 * bpf_try_module_get() on it succeeds; init_net itself, or a
	 * namespace for which that call fails, uses tcp_reno.
	 */
	if (!net_eq(net, &init_net) &&
	    bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
			       init_net.ipv4.tcp_congestion_control->owner))
		net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
	else
		net->ipv4.tcp_congestion_control = &tcp_reno;

	return 0;
fail:
	/* Only reached from the socket-creation loop, so some per-CPU slots
	 * may not have been filled in yet.
	 * NOTE(review): relies on tcp_sk_exit() coping with unset slots and
	 * an unset congestion control - confirm.
	 */
	tcp_sk_exit(net);

	return res;
}
3237 
3238 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
3239 {
3240 	struct net *net;
3241 
3242 	inet_twsk_purge(&tcp_hashinfo, AF_INET);
3243 
3244 	list_for_each_entry(net, net_exit_list, exit_list)
3245 		tcp_fastopen_ctx_destroy(net);
3246 }
3247 
3248 static struct pernet_operations __net_initdata tcp_sk_ops = {
3249        .init	   = tcp_sk_init,
3250        .exit	   = tcp_sk_exit,
3251        .exit_batch = tcp_sk_exit_batch,
3252 };
3253 
3254 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
/* Declare the "tcp" BPF iterator function; its arguments are the iterator
 * metadata, the socket being visited (as a struct sock_common) and a uid.
 */
DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
		     struct sock_common *sk_common, uid_t uid)

/* Batch size requested from bpf_iter_tcp_realloc_batch() when an iterator
 * is first initialised (see bpf_iter_init_tcp()).
 */
#define INIT_BATCH_SZ 16
3259 
3260 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
3261 {
3262 	struct bpf_tcp_iter_state *iter = priv_data;
3263 	int err;
3264 
3265 	err = bpf_iter_init_seq_net(priv_data, aux);
3266 	if (err)
3267 		return err;
3268 
3269 	err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ);
3270 	if (err) {
3271 		bpf_iter_fini_seq_net(priv_data);
3272 		return err;
3273 	}
3274 
3275 	return 0;
3276 }
3277 
3278 static void bpf_iter_fini_tcp(void *priv_data)
3279 {
3280 	struct bpf_tcp_iter_state *iter = priv_data;
3281 
3282 	bpf_iter_fini_seq_net(priv_data);
3283 	kvfree(iter->batch);
3284 }
3285 
3286 static const struct bpf_iter_seq_info tcp_seq_info = {
3287 	.seq_ops		= &bpf_iter_tcp_seq_ops,
3288 	.init_seq_private	= bpf_iter_init_tcp,
3289 	.fini_seq_private	= bpf_iter_fini_tcp,
3290 	.seq_priv_size		= sizeof(struct bpf_tcp_iter_state),
3291 };
3292 
3293 static const struct bpf_func_proto *
3294 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
3295 			    const struct bpf_prog *prog)
3296 {
3297 	switch (func_id) {
3298 	case BPF_FUNC_setsockopt:
3299 		return &bpf_sk_setsockopt_proto;
3300 	case BPF_FUNC_getsockopt:
3301 		return &bpf_sk_getsockopt_proto;
3302 	default:
3303 		return NULL;
3304 	}
3305 }
3306 
/* Registration record for the "tcp" BPF iterator target. It describes one
 * context argument, the sk_common member of struct bpf_iter__tcp, typed
 * PTR_TO_BTF_ID_OR_NULL. That entry's btf_id is not set here; it is filled
 * in by bpf_iter_register() before registration, which is why this object
 * is not const.
 */
static struct bpf_iter_reg tcp_reg_info = {
	.target			= "tcp",
	.ctx_arg_info_size	= 1,
	.ctx_arg_info		= {
		{ offsetof(struct bpf_iter__tcp, sk_common),
		  PTR_TO_BTF_ID_OR_NULL },
	},
	.get_func_proto		= bpf_iter_tcp_get_func_proto,
	.seq_info		= &tcp_seq_info,
};
3317 
3318 static void __init bpf_iter_register(void)
3319 {
3320 	tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3321 	if (bpf_iter_reg_target(&tcp_reg_info))
3322 		pr_warn("Warning: could not register bpf iterator tcp\n");
3323 }
3324 
3325 #endif
3326 
3327 void __init tcp_v4_init(void)
3328 {
3329 	if (register_pernet_subsys(&tcp_sk_ops))
3330 		panic("Failed to create the TCP control socket.\n");
3331 
3332 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3333 	bpf_iter_register();
3334 #endif
3335 }
3336