xref: /openbmc/linux/net/ipv4/tcp_ipv4.c (revision fa5d824ce5dd8306c66f45c34fd78536e6ce2488)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		Implementation of the Transmission Control Protocol(TCP).
8  *
9  *		IPv4 specific functions
10  *
11  *		code split from:
12  *		linux/ipv4/tcp.c
13  *		linux/ipv4/tcp_input.c
14  *		linux/ipv4/tcp_output.c
15  *
16  *		See tcp.c for author information
17  */
18 
19 /*
20  * Changes:
21  *		David S. Miller	:	New socket lookup architecture.
22  *					This code is dedicated to John Dyson.
23  *		David S. Miller :	Change semantics of established hash,
24  *					half is devoted to TIME_WAIT sockets
25  *					and the rest go in the other half.
26  *		Andi Kleen :		Add support for syncookies and fixed
27  *					some bugs: ip options weren't passed to
28  *					the TCP layer, missed a check for an
29  *					ACK bit.
30  *		Andi Kleen :		Implemented fast path mtu discovery.
31  *	     				Fixed many serious bugs in the
32  *					request_sock handling and moved
33  *					most of it into the af independent code.
34  *					Added tail drop and some other bugfixes.
35  *					Added new listen semantics.
36  *		Mike McLagan	:	Routing by source
37  *	Juan Jose Ciarlante:		ip_dynaddr bits
38  *		Andi Kleen:		various fixes.
39  *	Vitaly E. Lavrov	:	Transparent proxy revived after year
40  *					coma.
41  *	Andi Kleen		:	Fix new listen.
42  *	Andi Kleen		:	Fix accept error reporting.
43  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
44  *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
45  *					a single port at the same time.
46  */
47 
48 #define pr_fmt(fmt) "TCP: " fmt
49 
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
60 
61 #include <net/net_namespace.h>
62 #include <net/icmp.h>
63 #include <net/inet_hashtables.h>
64 #include <net/tcp.h>
65 #include <net/transp_v6.h>
66 #include <net/ipv6.h>
67 #include <net/inet_common.h>
68 #include <net/timewait_sock.h>
69 #include <net/xfrm.h>
70 #include <net/secure_seq.h>
71 #include <net/busy_poll.h>
72 
73 #include <linux/inet.h>
74 #include <linux/ipv6.h>
75 #include <linux/stddef.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
78 #include <linux/inetdevice.h>
79 #include <linux/btf_ids.h>
80 
81 #include <crypto/hash.h>
82 #include <linux/scatterlist.h>
83 
84 #include <trace/events/tcp.h>
85 
86 #ifdef CONFIG_TCP_MD5SIG
87 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
88 			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
89 #endif
90 
91 struct inet_hashinfo tcp_hashinfo;
92 EXPORT_SYMBOL(tcp_hashinfo);
93 
94 static DEFINE_PER_CPU(struct sock *, ipv4_tcp_sk);
95 
96 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
97 {
98 	return secure_tcp_seq(ip_hdr(skb)->daddr,
99 			      ip_hdr(skb)->saddr,
100 			      tcp_hdr(skb)->dest,
101 			      tcp_hdr(skb)->source);
102 }
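
/* secure_tcp_seq() follows the spirit of RFC 6528: the initial sequence
 * number is (roughly, our summary) a keyed hash of the connection 4-tuple
 * plus a clock component, so ISNs are hard for off-path attackers to guess
 * while still advancing monotonically for a given tuple.
 */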
103 
104 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
105 {
106 	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
107 }
108 
109 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
110 {
111 	const struct inet_timewait_sock *tw = inet_twsk(sktw);
112 	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
113 	struct tcp_sock *tp = tcp_sk(sk);
114 	int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;
115 
116 	if (reuse == 2) {
117 		/* Still does not detect *everything* that goes through
118 		 * lo, since we require a loopback src or dst address
119 		 * or direct binding to 'lo' interface.
120 		 */
121 		bool loopback = false;
122 		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
123 			loopback = true;
124 #if IS_ENABLED(CONFIG_IPV6)
125 		if (tw->tw_family == AF_INET6) {
126 			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
127 			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
128 			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
129 			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
130 				loopback = true;
131 		} else
132 #endif
133 		{
134 			if (ipv4_is_loopback(tw->tw_daddr) ||
135 			    ipv4_is_loopback(tw->tw_rcv_saddr))
136 				loopback = true;
137 		}
138 		if (!loopback)
139 			reuse = 0;
140 	}
141 
142 	/* With PAWS, it is safe from the viewpoint
143 	   of data integrity. Even without PAWS it is safe provided sequence
144 	   spaces do not overlap, i.e. at data rates <= 80Mbit/sec.
145 
146 	   Actually, the idea is close to VJ's, only the timestamp cache is
147 	   held not per host but per port pair, and the TW bucket is used as
148 	   the state holder.
149 
150 	   If the TW bucket has already been destroyed we fall back to VJ's
151 	   scheme and use the initial timestamp retrieved from the peer table.
152 	 */
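	/*
	 * A rough sanity check on the 80 Mbit/sec figure above (our reading,
	 * not the original author's derivation): at 80 Mbit/s (~10 MB/s) the
	 * 2^32-byte sequence space takes roughly 430 seconds to wrap, which
	 * comfortably exceeds 2*MSL, so old duplicates from the previous
	 * incarnation cannot be mistaken for new data even without PAWS.
	 */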
153 	if (tcptw->tw_ts_recent_stamp &&
154 	    (!twp || (reuse && time_after32(ktime_get_seconds(),
155 					    tcptw->tw_ts_recent_stamp)))) {
156 		/* In case of repair and re-using TIME-WAIT sockets we still
157 		 * want to be sure that it is safe as above but honor the
158 		 * sequence numbers and time stamps set as part of the repair
159 		 * process.
160 		 *
161 		 * Without this check re-using a TIME-WAIT socket with TCP
162 		 * repair would accumulate a -1 on the repair assigned
163 		 * sequence number. The first time it is reused the sequence
164 		 * is -1, the second time -2, etc. This fixes that issue
165 		 * without appearing to create any others.
166 		 */
167 		if (likely(!tp->repair)) {
168 			u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
169 
170 			if (!seq)
171 				seq = 1;
172 			WRITE_ONCE(tp->write_seq, seq);
173 			tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
174 			tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
175 		}
176 		sock_hold(sktw);
177 		return 1;
178 	}
179 
180 	return 0;
181 }
182 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
183 
184 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
185 			      int addr_len)
186 {
187 	/* This check is replicated from tcp_v4_connect() and intended to
188 	 * prevent the BPF program called below from accessing bytes that are
189 	 * outside of the bound specified by the user in addr_len.
190 	 */
191 	if (addr_len < sizeof(struct sockaddr_in))
192 		return -EINVAL;
193 
194 	sock_owned_by_me(sk);
195 
196 	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
197 }
198 
199 /* This will initiate an outgoing connection. */
200 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
201 {
202 	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
203 	struct inet_sock *inet = inet_sk(sk);
204 	struct tcp_sock *tp = tcp_sk(sk);
205 	__be16 orig_sport, orig_dport;
206 	__be32 daddr, nexthop;
207 	struct flowi4 *fl4;
208 	struct rtable *rt;
209 	int err;
210 	struct ip_options_rcu *inet_opt;
211 	struct inet_timewait_death_row *tcp_death_row = sock_net(sk)->ipv4.tcp_death_row;
212 
213 	if (addr_len < sizeof(struct sockaddr_in))
214 		return -EINVAL;
215 
216 	if (usin->sin_family != AF_INET)
217 		return -EAFNOSUPPORT;
218 
219 	nexthop = daddr = usin->sin_addr.s_addr;
220 	inet_opt = rcu_dereference_protected(inet->inet_opt,
221 					     lockdep_sock_is_held(sk));
222 	if (inet_opt && inet_opt->opt.srr) {
223 		if (!daddr)
224 			return -EINVAL;
225 		nexthop = inet_opt->opt.faddr;
226 	}
227 
228 	orig_sport = inet->inet_sport;
229 	orig_dport = usin->sin_port;
230 	fl4 = &inet->cork.fl.u.ip4;
231 	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
232 			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
233 			      IPPROTO_TCP,
234 			      orig_sport, orig_dport, sk);
235 	if (IS_ERR(rt)) {
236 		err = PTR_ERR(rt);
237 		if (err == -ENETUNREACH)
238 			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
239 		return err;
240 	}
241 
242 	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
243 		ip_rt_put(rt);
244 		return -ENETUNREACH;
245 	}
246 
247 	if (!inet_opt || !inet_opt->opt.srr)
248 		daddr = fl4->daddr;
249 
250 	if (!inet->inet_saddr)
251 		inet->inet_saddr = fl4->saddr;
252 	sk_rcv_saddr_set(sk, inet->inet_saddr);
253 
254 	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
255 		/* Reset inherited state */
256 		tp->rx_opt.ts_recent	   = 0;
257 		tp->rx_opt.ts_recent_stamp = 0;
258 		if (likely(!tp->repair))
259 			WRITE_ONCE(tp->write_seq, 0);
260 	}
261 
262 	inet->inet_dport = usin->sin_port;
263 	sk_daddr_set(sk, daddr);
264 
265 	inet_csk(sk)->icsk_ext_hdr_len = 0;
266 	if (inet_opt)
267 		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
268 
269 	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
270 
271 	/* Socket identity is still unknown (sport may be zero).
272 	 * However we set the state to SYN-SENT and, without releasing the
273 	 * socket lock, select a source port, enter ourselves into the hash
274 	 * tables and complete initialization after this.
275 	 */
276 	tcp_set_state(sk, TCP_SYN_SENT);
277 	err = inet_hash_connect(tcp_death_row, sk);
278 	if (err)
279 		goto failure;
280 
281 	sk_set_txhash(sk);
282 
283 	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
284 			       inet->inet_sport, inet->inet_dport, sk);
285 	if (IS_ERR(rt)) {
286 		err = PTR_ERR(rt);
287 		rt = NULL;
288 		goto failure;
289 	}
290 	/* OK, now commit destination to socket.  */
291 	sk->sk_gso_type = SKB_GSO_TCPV4;
292 	sk_setup_caps(sk, &rt->dst);
293 	rt = NULL;
294 
295 	if (likely(!tp->repair)) {
296 		if (!tp->write_seq)
297 			WRITE_ONCE(tp->write_seq,
298 				   secure_tcp_seq(inet->inet_saddr,
299 						  inet->inet_daddr,
300 						  inet->inet_sport,
301 						  usin->sin_port));
302 		tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
303 						 inet->inet_saddr,
304 						 inet->inet_daddr);
305 	}
306 
307 	inet->inet_id = prandom_u32();
308 
309 	if (tcp_fastopen_defer_connect(sk, &err))
310 		return err;
311 	if (err)
312 		goto failure;
313 
314 	err = tcp_connect(sk);
315 
316 	if (err)
317 		goto failure;
318 
319 	return 0;
320 
321 failure:
322 	/*
323 	 * This unhashes the socket and releases the local port,
324 	 * if necessary.
325 	 */
326 	tcp_set_state(sk, TCP_CLOSE);
327 	ip_rt_put(rt);
328 	sk->sk_route_caps = 0;
329 	inet->inet_dport = 0;
330 	return err;
331 }
332 EXPORT_SYMBOL(tcp_v4_connect);
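
/* For illustration only (not part of this file's logic): the kind of
 * userspace call that ends up here, via __sys_connect() and
 * inet_stream_connect(). A minimal sketch, error handling omitted and the
 * destination made up:
 *
 *	int fd = socket(AF_INET, SOCK_STREAM, 0);
 *	struct sockaddr_in dst = {
 *		.sin_family = AF_INET,
 *		.sin_port   = htons(80),
 *		.sin_addr   = { .s_addr = htonl(INADDR_LOOPBACK) },
 *	};
 *	connect(fd, (struct sockaddr *)&dst, sizeof(dst));
 *
 * The addr_len and sin_family checks at the top of tcp_v4_connect() guard
 * exactly this calling convention.
 */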
333 
334 /*
335  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
336  * It can be called through tcp_release_cb() if socket was owned by user
337  * at the time tcp_v4_err() was called to handle ICMP message.
338  */
339 void tcp_v4_mtu_reduced(struct sock *sk)
340 {
341 	struct inet_sock *inet = inet_sk(sk);
342 	struct dst_entry *dst;
343 	u32 mtu;
344 
345 	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
346 		return;
347 	mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
348 	dst = inet_csk_update_pmtu(sk, mtu);
349 	if (!dst)
350 		return;
351 
352 	/* Something is about to go wrong... Remember the soft error
353 	 * in case this connection is not able to recover.
354 	 */
355 	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
356 		sk->sk_err_soft = EMSGSIZE;
357 
358 	mtu = dst_mtu(dst);
359 
360 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
361 	    ip_sk_accept_pmtu(sk) &&
362 	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
363 		tcp_sync_mss(sk, mtu);
364 
365 		/* Resend the TCP packet because it's
366 		 * clear that the old packet has been
367 		 * dropped. This is the new "fast" path mtu
368 		 * discovery.
369 		 */
370 		tcp_simple_retransmit(sk);
371 	} /* else let the usual retransmit timer handle it */
372 }
373 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
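
/* For illustration (a rough sketch, ignoring TCP options and encapsulation):
 * if the ICMP_FRAG_NEEDED message reports a path MTU of 1400 where 1500 was
 * previously assumed, tcp_sync_mss() ends up with an MSS of roughly
 * 1400 - 20 (IPv4) - 20 (TCP) = 1360 bytes, and tcp_simple_retransmit()
 * re-sends the outstanding data sliced to that size.
 */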
374 
375 static void do_redirect(struct sk_buff *skb, struct sock *sk)
376 {
377 	struct dst_entry *dst = __sk_dst_check(sk, 0);
378 
379 	if (dst)
380 		dst->ops->redirect(dst, sk, skb);
381 }
382 
383 
384 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
385 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
386 {
387 	struct request_sock *req = inet_reqsk(sk);
388 	struct net *net = sock_net(sk);
389 
390 	/* ICMPs are not backlogged, hence we cannot get
391 	 * an established socket here.
392 	 */
393 	if (seq != tcp_rsk(req)->snt_isn) {
394 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
395 	} else if (abort) {
396 		/*
397 		 * Still in SYN_RECV, just remove it silently.
398 		 * There is no good way to pass the error to the newly
399 		 * created socket, and POSIX does not want network
400 		 * errors returned from accept().
401 		 */
402 		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
403 		tcp_listendrop(req->rsk_listener);
404 	}
405 	reqsk_put(req);
406 }
407 EXPORT_SYMBOL(tcp_req_err);
408 
409 /* TCP-LD (RFC 6069) logic */
410 void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
411 {
412 	struct inet_connection_sock *icsk = inet_csk(sk);
413 	struct tcp_sock *tp = tcp_sk(sk);
414 	struct sk_buff *skb;
415 	s32 remaining;
416 	u32 delta_us;
417 
418 	if (sock_owned_by_user(sk))
419 		return;
420 
421 	if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
422 	    !icsk->icsk_backoff)
423 		return;
424 
425 	skb = tcp_rtx_queue_head(sk);
426 	if (WARN_ON_ONCE(!skb))
427 		return;
428 
429 	icsk->icsk_backoff--;
430 	icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
431 	icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
432 
433 	tcp_mstamp_refresh(tp);
434 	delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
435 	remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
436 
437 	if (remaining > 0) {
438 		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
439 					  remaining, TCP_RTO_MAX);
440 	} else {
441 		/* RTO revert clocked out retransmission.
442 		 * Will retransmit now.
443 		 */
444 		tcp_retransmit_timer(sk);
445 	}
446 }
447 EXPORT_SYMBOL(tcp_ld_RTO_revert);
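
/* A small worked example of the revert above (illustrative values, ignoring
 * clamping to TCP_RTO_MAX): with a base RTO of 200 ms and icsk_backoff == 3
 * the timer currently stands at 200 ms << 3 == 1.6 s. One qualifying ICMP
 * unreachable undoes exactly one doubling, so the timer is re-armed around
 * 200 ms << 2 == 800 ms, minus whatever has already elapsed since the head
 * of the retransmit queue was sent.
 */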
448 
449 /*
450  * This routine is called by the ICMP module when it gets some
451  * sort of error condition.  If err < 0 then the socket should
452  * be closed and the error returned to the user.  If err > 0
453  * it's just the icmp type << 8 | icmp code.  After adjustment
454  * header points to the first 8 bytes of the tcp header.  We need
455  * to find the appropriate port.
456  *
457  * The locking strategy used here is very "optimistic". When
458  * someone else accesses the socket the ICMP is just dropped
459  * and for some paths there is no check at all.
460  * A more general error queue to queue errors for later handling
461  * is probably better.
462  *
463  */
464 
465 int tcp_v4_err(struct sk_buff *skb, u32 info)
466 {
467 	const struct iphdr *iph = (const struct iphdr *)skb->data;
468 	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
469 	struct tcp_sock *tp;
470 	struct inet_sock *inet;
471 	const int type = icmp_hdr(skb)->type;
472 	const int code = icmp_hdr(skb)->code;
473 	struct sock *sk;
474 	struct request_sock *fastopen;
475 	u32 seq, snd_una;
476 	int err;
477 	struct net *net = dev_net(skb->dev);
478 
479 	sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
480 				       th->dest, iph->saddr, ntohs(th->source),
481 				       inet_iif(skb), 0);
482 	if (!sk) {
483 		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
484 		return -ENOENT;
485 	}
486 	if (sk->sk_state == TCP_TIME_WAIT) {
487 		inet_twsk_put(inet_twsk(sk));
488 		return 0;
489 	}
490 	seq = ntohl(th->seq);
491 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
492 		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
493 				     type == ICMP_TIME_EXCEEDED ||
494 				     (type == ICMP_DEST_UNREACH &&
495 				      (code == ICMP_NET_UNREACH ||
496 				       code == ICMP_HOST_UNREACH)));
497 		return 0;
498 	}
499 
500 	bh_lock_sock(sk);
501 	/* If too many ICMPs get dropped on busy
502 	 * servers this needs to be solved differently.
503 	 * We do take care of the PMTU discovery (RFC 1191) special case:
504 	 * we can receive locally generated ICMP messages while the socket is held.
505 	 */
506 	if (sock_owned_by_user(sk)) {
507 		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
508 			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
509 	}
510 	if (sk->sk_state == TCP_CLOSE)
511 		goto out;
512 
513 	if (static_branch_unlikely(&ip4_min_ttl)) {
514 		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
515 		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
516 			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
517 			goto out;
518 		}
519 	}
520 
521 	tp = tcp_sk(sk);
522 	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
523 	fastopen = rcu_dereference(tp->fastopen_rsk);
524 	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
525 	if (sk->sk_state != TCP_LISTEN &&
526 	    !between(seq, snd_una, tp->snd_nxt)) {
527 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
528 		goto out;
529 	}
530 
531 	switch (type) {
532 	case ICMP_REDIRECT:
533 		if (!sock_owned_by_user(sk))
534 			do_redirect(skb, sk);
535 		goto out;
536 	case ICMP_SOURCE_QUENCH:
537 		/* Just silently ignore these. */
538 		goto out;
539 	case ICMP_PARAMETERPROB:
540 		err = EPROTO;
541 		break;
542 	case ICMP_DEST_UNREACH:
543 		if (code > NR_ICMP_UNREACH)
544 			goto out;
545 
546 		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
547 			/* We are not interested in TCP_LISTEN and open_requests
548 			 * (SYN-ACKs sent out by Linux are always < 576 bytes, so
549 			 * they should go through unfragmented).
550 			 */
551 			if (sk->sk_state == TCP_LISTEN)
552 				goto out;
553 
554 			WRITE_ONCE(tp->mtu_info, info);
555 			if (!sock_owned_by_user(sk)) {
556 				tcp_v4_mtu_reduced(sk);
557 			} else {
558 				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
559 					sock_hold(sk);
560 			}
561 			goto out;
562 		}
563 
564 		err = icmp_err_convert[code].errno;
565 		/* check if this ICMP message allows revert of backoff.
566 		 * (see RFC 6069)
567 		 */
568 		if (!fastopen &&
569 		    (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
570 			tcp_ld_RTO_revert(sk, seq);
571 		break;
572 	case ICMP_TIME_EXCEEDED:
573 		err = EHOSTUNREACH;
574 		break;
575 	default:
576 		goto out;
577 	}
578 
579 	switch (sk->sk_state) {
580 	case TCP_SYN_SENT:
581 	case TCP_SYN_RECV:
582 		/* Only in fast or simultaneous open. If a fast open socket is
583 		 * already accepted it is treated as a connected one below.
584 		 */
585 		if (fastopen && !fastopen->sk)
586 			break;
587 
588 		ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
589 
590 		if (!sock_owned_by_user(sk)) {
591 			sk->sk_err = err;
592 
593 			sk_error_report(sk);
594 
595 			tcp_done(sk);
596 		} else {
597 			sk->sk_err_soft = err;
598 		}
599 		goto out;
600 	}
601 
602 	/* If we've already connected we will keep trying
603 	 * until we time out, or the user gives up.
604 	 *
605 	 * RFC 1122 4.2.3.9 allows considering only PROTO_UNREACH and
606 	 * PORT_UNREACH as hard errors (well, FRAG_FAILED too, but it is
607 	 * obsoleted by PMTU discovery).
608 	 *
609 	 * Note that on the modern internet, where routing is unreliable and
610 	 * broken firewalls sit in every dark corner sending random errors
611 	 * as ordered by their masters, even these two messages finally lose
612 	 * their original sense (even Linux sends invalid PORT_UNREACHs).
613 	 *
614 	 * Now we are in compliance with RFCs.
615 	 *							--ANK (980905)
616 	 */
617 
618 	inet = inet_sk(sk);
619 	if (!sock_owned_by_user(sk) && inet->recverr) {
620 		sk->sk_err = err;
621 		sk_error_report(sk);
622 	} else	{ /* Only an error on timeout */
623 		sk->sk_err_soft = err;
624 	}
625 
626 out:
627 	bh_unlock_sock(sk);
628 	sock_put(sk);
629 	return 0;
630 }
631 
632 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
633 {
634 	struct tcphdr *th = tcp_hdr(skb);
635 
636 	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
637 	skb->csum_start = skb_transport_header(skb) - skb->head;
638 	skb->csum_offset = offsetof(struct tcphdr, check);
639 }
640 
641 /* This routine computes an IPv4 TCP checksum. */
642 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
643 {
644 	const struct inet_sock *inet = inet_sk(sk);
645 
646 	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
647 }
648 EXPORT_SYMBOL(tcp_v4_send_check);
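
/* A rough sketch (our summary, not code from this file) of how the partial
 * checksum seeded above is completed, either by the NIC or by
 * skb_checksum_help() when the device lacks offload:
 *
 *	csum = csum_partial(skb->head + skb->csum_start, len, 0);
 *	*(__sum16 *)(skb->head + skb->csum_start + skb->csum_offset) =
 *		csum_fold(csum);
 *
 * i.e. the pseudo-header sum placed in th->check here gets folded together
 * with the sum over the TCP header and payload.
 */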
649 
650 /*
651  *	This routine will send an RST to the other tcp.
652  *
653  *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
654  *		      for reset.
655  *	Answer: if a packet caused an RST, it is not for a socket
656  *		existing in our system; if it is matched to a socket,
657  *		it is just a duplicate segment or a bug in the other side's
658  *		TCP. So we build the reply based only on the parameters
659  *		that arrived with the segment.
660  *	Exception: precedence violation. We do not implement it in any case.
661  */
662 
663 #ifdef CONFIG_TCP_MD5SIG
664 #define OPTION_BYTES TCPOLEN_MD5SIG_ALIGNED
665 #else
666 #define OPTION_BYTES sizeof(__be32)
667 #endif
668 
669 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
670 {
671 	const struct tcphdr *th = tcp_hdr(skb);
672 	struct {
673 		struct tcphdr th;
674 		__be32 opt[OPTION_BYTES / sizeof(__be32)];
675 	} rep;
676 	struct ip_reply_arg arg;
677 #ifdef CONFIG_TCP_MD5SIG
678 	struct tcp_md5sig_key *key = NULL;
679 	const __u8 *hash_location = NULL;
680 	unsigned char newhash[16];
681 	int genhash;
682 	struct sock *sk1 = NULL;
683 #endif
684 	u64 transmit_time = 0;
685 	struct sock *ctl_sk;
686 	struct net *net;
687 
688 	/* Never send a reset in response to a reset. */
689 	if (th->rst)
690 		return;
691 
692 	/* If sk is not NULL, it means we did a successful lookup and the incoming
693 	 * route had to be correct. prequeue might have dropped our dst.
694 	 */
695 	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
696 		return;
697 
698 	/* Swap the send and the receive. */
699 	memset(&rep, 0, sizeof(rep));
700 	rep.th.dest   = th->source;
701 	rep.th.source = th->dest;
702 	rep.th.doff   = sizeof(struct tcphdr) / 4;
703 	rep.th.rst    = 1;
704 
705 	if (th->ack) {
706 		rep.th.seq = th->ack_seq;
707 	} else {
708 		rep.th.ack = 1;
709 		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
710 				       skb->len - (th->doff << 2));
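		/* e.g. for a bare SYN carrying no data this reduces to
		 * ack_seq = seq + 1, which is the sequence number the peer
		 * expects to see acknowledged.
		 */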
711 	}
712 
713 	memset(&arg, 0, sizeof(arg));
714 	arg.iov[0].iov_base = (unsigned char *)&rep;
715 	arg.iov[0].iov_len  = sizeof(rep.th);
716 
717 	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
718 #ifdef CONFIG_TCP_MD5SIG
719 	rcu_read_lock();
720 	hash_location = tcp_parse_md5sig_option(th);
721 	if (sk && sk_fullsock(sk)) {
722 		const union tcp_md5_addr *addr;
723 		int l3index;
724 
725 		/* sdif set means the packet ingressed via a device
726 		 * in an L3 domain and inet_iif is set to it.
727 		 */
728 		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
729 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
730 		key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
731 	} else if (hash_location) {
732 		const union tcp_md5_addr *addr;
733 		int sdif = tcp_v4_sdif(skb);
734 		int dif = inet_iif(skb);
735 		int l3index;
736 
737 		/*
738 		 * The active side is lost. Try to find the listening socket via
739 		 * the source port, and then find the md5 key via that listening
740 		 * socket. We do not loosen security here: the incoming packet
741 		 * is checked against the md5 hash computed with the found key,
742 		 * and no RST is generated if the hash doesn't match.
743 		 */
744 		sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
745 					     ip_hdr(skb)->saddr,
746 					     th->source, ip_hdr(skb)->daddr,
747 					     ntohs(th->source), dif, sdif);
748 		/* don't send rst if it can't find key */
749 		/* don't send an rst if no key can be found */
750 			goto out;
751 
752 		/* sdif set means the packet ingressed via a device
753 		 * in an L3 domain and dif is set to it.
754 		 */
755 		l3index = sdif ? dif : 0;
756 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
757 		key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
758 		if (!key)
759 			goto out;
760 
761 
762 		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
763 		if (genhash || memcmp(hash_location, newhash, 16) != 0)
764 			goto out;
765 
766 	}
767 
768 	if (key) {
769 		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
770 				   (TCPOPT_NOP << 16) |
771 				   (TCPOPT_MD5SIG << 8) |
772 				   TCPOLEN_MD5SIG);
773 		/* Update length and the length the header thinks exists */
774 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
775 		rep.th.doff = arg.iov[0].iov_len / 4;
776 
777 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
778 				     key, ip_hdr(skb)->saddr,
779 				     ip_hdr(skb)->daddr, &rep.th);
780 	}
781 #endif
782 	/* Can't co-exist with TCPMD5, hence check rep.opt[0] */
783 	if (rep.opt[0] == 0) {
784 		__be32 mrst = mptcp_reset_option(skb);
785 
786 		if (mrst) {
787 			rep.opt[0] = mrst;
788 			arg.iov[0].iov_len += sizeof(mrst);
789 			rep.th.doff = arg.iov[0].iov_len / 4;
790 		}
791 	}
792 
793 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
794 				      ip_hdr(skb)->saddr, /* XXX */
795 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
796 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
797 	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
798 
799 	/* When the socket is gone, all binding information is lost and
800 	 * routing might fail in this case. No choice here: if we force the
801 	 * input interface, we will misroute in case of asymmetric routes.
802 	 */
803 	if (sk) {
804 		arg.bound_dev_if = sk->sk_bound_dev_if;
805 		if (sk_fullsock(sk))
806 			trace_tcp_send_reset(sk, skb);
807 	}
808 
809 	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
810 		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));
811 
812 	arg.tos = ip_hdr(skb)->tos;
813 	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
814 	local_bh_disable();
815 	ctl_sk = this_cpu_read(ipv4_tcp_sk);
816 	sock_net_set(ctl_sk, net);
817 	if (sk) {
818 		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
819 				   inet_twsk(sk)->tw_mark : sk->sk_mark;
820 		ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
821 				   inet_twsk(sk)->tw_priority : sk->sk_priority;
822 		transmit_time = tcp_transmit_time(sk);
823 	}
824 	ip_send_unicast_reply(ctl_sk,
825 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
826 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
827 			      &arg, arg.iov[0].iov_len,
828 			      transmit_time);
829 
830 	ctl_sk->sk_mark = 0;
831 	sock_net_set(ctl_sk, &init_net);
832 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
833 	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
834 	local_bh_enable();
835 
836 #ifdef CONFIG_TCP_MD5SIG
837 out:
838 	rcu_read_unlock();
839 #endif
840 }
841 
842 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
843    outside of socket context, is certainly ugly. What can I do?
844  */
845 
846 static void tcp_v4_send_ack(const struct sock *sk,
847 			    struct sk_buff *skb, u32 seq, u32 ack,
848 			    u32 win, u32 tsval, u32 tsecr, int oif,
849 			    struct tcp_md5sig_key *key,
850 			    int reply_flags, u8 tos)
851 {
852 	const struct tcphdr *th = tcp_hdr(skb);
853 	struct {
854 		struct tcphdr th;
855 		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
856 #ifdef CONFIG_TCP_MD5SIG
857 			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
858 #endif
859 			];
860 	} rep;
861 	struct net *net = sock_net(sk);
862 	struct ip_reply_arg arg;
863 	struct sock *ctl_sk;
864 	u64 transmit_time;
865 
866 	memset(&rep.th, 0, sizeof(struct tcphdr));
867 	memset(&arg, 0, sizeof(arg));
868 
869 	arg.iov[0].iov_base = (unsigned char *)&rep;
870 	arg.iov[0].iov_len  = sizeof(rep.th);
871 	if (tsecr) {
872 		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
873 				   (TCPOPT_TIMESTAMP << 8) |
874 				   TCPOLEN_TIMESTAMP);
875 		rep.opt[1] = htonl(tsval);
876 		rep.opt[2] = htonl(tsecr);
877 		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
878 	}
879 
880 	/* Swap the send and the receive. */
881 	rep.th.dest    = th->source;
882 	rep.th.source  = th->dest;
883 	rep.th.doff    = arg.iov[0].iov_len / 4;
884 	rep.th.seq     = htonl(seq);
885 	rep.th.ack_seq = htonl(ack);
886 	rep.th.ack     = 1;
887 	rep.th.window  = htons(win);
888 
889 #ifdef CONFIG_TCP_MD5SIG
890 	if (key) {
891 		int offset = (tsecr) ? 3 : 0;
892 
893 		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
894 					  (TCPOPT_NOP << 16) |
895 					  (TCPOPT_MD5SIG << 8) |
896 					  TCPOLEN_MD5SIG);
897 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
898 		rep.th.doff = arg.iov[0].iov_len/4;
899 
900 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
901 				    key, ip_hdr(skb)->saddr,
902 				    ip_hdr(skb)->daddr, &rep.th);
903 	}
904 #endif
905 	arg.flags = reply_flags;
906 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
907 				      ip_hdr(skb)->saddr, /* XXX */
908 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
909 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
910 	if (oif)
911 		arg.bound_dev_if = oif;
912 	arg.tos = tos;
913 	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
914 	local_bh_disable();
915 	ctl_sk = this_cpu_read(ipv4_tcp_sk);
916 	sock_net_set(ctl_sk, net);
917 	ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
918 			   inet_twsk(sk)->tw_mark : sk->sk_mark;
919 	ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
920 			   inet_twsk(sk)->tw_priority : sk->sk_priority;
921 	transmit_time = tcp_transmit_time(sk);
922 	ip_send_unicast_reply(ctl_sk,
923 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
924 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
925 			      &arg, arg.iov[0].iov_len,
926 			      transmit_time);
927 
928 	ctl_sk->sk_mark = 0;
929 	sock_net_set(ctl_sk, &init_net);
930 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
931 	local_bh_enable();
932 }
933 
934 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
935 {
936 	struct inet_timewait_sock *tw = inet_twsk(sk);
937 	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
938 
939 	tcp_v4_send_ack(sk, skb,
940 			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
941 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
942 			tcp_time_stamp_raw() + tcptw->tw_ts_offset,
943 			tcptw->tw_ts_recent,
944 			tw->tw_bound_dev_if,
945 			tcp_twsk_md5_key(tcptw),
946 			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
947 			tw->tw_tos
948 			);
949 
950 	inet_twsk_put(tw);
951 }
952 
953 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
954 				  struct request_sock *req)
955 {
956 	const union tcp_md5_addr *addr;
957 	int l3index;
958 
959 	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
960 	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
961 	 */
962 	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
963 					     tcp_sk(sk)->snd_nxt;
964 
965 	/* RFC 7323 2.3
966 	 * The window field (SEG.WND) of every outgoing segment, with the
967 	 * exception of <SYN> segments, MUST be right-shifted by
968 	 * Rcv.Wind.Shift bits:
969 	 */
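	/* e.g. (illustrative numbers): a 128 KB receive window with
	 * rcv_wscale == 2 goes out on the wire as 131072 >> 2 == 32768.
	 */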
970 	addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
971 	l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
972 	tcp_v4_send_ack(sk, skb, seq,
973 			tcp_rsk(req)->rcv_nxt,
974 			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
975 			tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
976 			req->ts_recent,
977 			0,
978 			tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
979 			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
980 			ip_hdr(skb)->tos);
981 }
982 
983 /*
984  *	Send a SYN-ACK after having received a SYN.
985  *	This still operates on a request_sock only, not on a big
986  *	socket.
987  */
988 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
989 			      struct flowi *fl,
990 			      struct request_sock *req,
991 			      struct tcp_fastopen_cookie *foc,
992 			      enum tcp_synack_type synack_type,
993 			      struct sk_buff *syn_skb)
994 {
995 	const struct inet_request_sock *ireq = inet_rsk(req);
996 	struct flowi4 fl4;
997 	int err = -1;
998 	struct sk_buff *skb;
999 	u8 tos;
1000 
1001 	/* First, grab a route. */
1002 	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
1003 		return -1;
1004 
1005 	skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
1006 
1007 	if (skb) {
1008 		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1009 
1010 		tos = sock_net(sk)->ipv4.sysctl_tcp_reflect_tos ?
1011 				(tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1012 				(inet_sk(sk)->tos & INET_ECN_MASK) :
1013 				inet_sk(sk)->tos;
1014 
1015 		if (!INET_ECN_is_capable(tos) &&
1016 		    tcp_bpf_ca_needs_ecn((struct sock *)req))
1017 			tos |= INET_ECN_ECT_0;
1018 
1019 		rcu_read_lock();
1020 		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1021 					    ireq->ir_rmt_addr,
1022 					    rcu_dereference(ireq->ireq_opt),
1023 					    tos);
1024 		rcu_read_unlock();
1025 		err = net_xmit_eval(err);
1026 	}
1027 
1028 	return err;
1029 }
1030 
1031 /*
1032  *	IPv4 request_sock destructor.
1033  */
1034 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1035 {
1036 	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1037 }
1038 
1039 #ifdef CONFIG_TCP_MD5SIG
1040 /*
1041  * RFC2385 MD5 checksumming requires a mapping of
1042  * IP address->MD5 Key.
1043  * We need to maintain these in the sk structure.
1044  */
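
/* For illustration only, a hedged sketch of the userspace side (peer address
 * and key are made up, error handling omitted): a key is installed with
 * setsockopt(TCP_MD5SIG) before connect() or listen(), and
 * tcp_v4_parse_md5_keys() below handles it in the kernel:
 *
 *	struct tcp_md5sig md5 = { .tcpm_keylen = 6 };
 *	struct sockaddr_in *peer = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	peer->sin_family = AF_INET;
 *	peer->sin_addr.s_addr = inet_addr("192.0.2.1");
 *	memcpy(md5.tcpm_key, "secret", 6);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 */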
1045 
1046 DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
1047 EXPORT_SYMBOL(tcp_md5_needed);
1048 
1049 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1050 {
1051 	if (!old)
1052 		return true;
1053 
1054 	/* l3index always overrides non-l3index */
1055 	if (old->l3index && new->l3index == 0)
1056 		return false;
1057 	if (old->l3index == 0 && new->l3index)
1058 		return true;
1059 
1060 	return old->prefixlen < new->prefixlen;
1061 }
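
/* For example (illustrative addresses): with keys installed for 10.0.0.0/8
 * and 10.1.2.3/32, a lookup for peer 10.1.2.3 matches both and the /32 key
 * wins on prefix length; a key bound to a matching L3 master device
 * (nonzero l3index) is preferred over any device-unscoped key.
 */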
1062 
1063 /* Find the Key structure for an address.  */
1064 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1065 					   const union tcp_md5_addr *addr,
1066 					   int family)
1067 {
1068 	const struct tcp_sock *tp = tcp_sk(sk);
1069 	struct tcp_md5sig_key *key;
1070 	const struct tcp_md5sig_info *md5sig;
1071 	__be32 mask;
1072 	struct tcp_md5sig_key *best_match = NULL;
1073 	bool match;
1074 
1075 	/* caller either holds rcu_read_lock() or socket lock */
1076 	md5sig = rcu_dereference_check(tp->md5sig_info,
1077 				       lockdep_sock_is_held(sk));
1078 	if (!md5sig)
1079 		return NULL;
1080 
1081 	hlist_for_each_entry_rcu(key, &md5sig->head, node,
1082 				 lockdep_sock_is_held(sk)) {
1083 		if (key->family != family)
1084 			continue;
1085 		if (key->flags & TCP_MD5SIG_FLAG_IFINDEX && key->l3index != l3index)
1086 			continue;
1087 		if (family == AF_INET) {
1088 			mask = inet_make_mask(key->prefixlen);
1089 			match = (key->addr.a4.s_addr & mask) ==
1090 				(addr->a4.s_addr & mask);
1091 #if IS_ENABLED(CONFIG_IPV6)
1092 		} else if (family == AF_INET6) {
1093 			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1094 						  key->prefixlen);
1095 #endif
1096 		} else {
1097 			match = false;
1098 		}
1099 
1100 		if (match && better_md5_match(best_match, key))
1101 			best_match = key;
1102 	}
1103 	return best_match;
1104 }
1105 EXPORT_SYMBOL(__tcp_md5_do_lookup);
1106 
1107 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1108 						      const union tcp_md5_addr *addr,
1109 						      int family, u8 prefixlen,
1110 						      int l3index, u8 flags)
1111 {
1112 	const struct tcp_sock *tp = tcp_sk(sk);
1113 	struct tcp_md5sig_key *key;
1114 	unsigned int size = sizeof(struct in_addr);
1115 	const struct tcp_md5sig_info *md5sig;
1116 
1117 	/* caller either holds rcu_read_lock() or socket lock */
1118 	md5sig = rcu_dereference_check(tp->md5sig_info,
1119 				       lockdep_sock_is_held(sk));
1120 	if (!md5sig)
1121 		return NULL;
1122 #if IS_ENABLED(CONFIG_IPV6)
1123 	if (family == AF_INET6)
1124 		size = sizeof(struct in6_addr);
1125 #endif
1126 	hlist_for_each_entry_rcu(key, &md5sig->head, node,
1127 				 lockdep_sock_is_held(sk)) {
1128 		if (key->family != family)
1129 			continue;
1130 		if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
1131 			continue;
1132 		if (key->l3index != l3index)
1133 			continue;
1134 		if (!memcmp(&key->addr, addr, size) &&
1135 		    key->prefixlen == prefixlen)
1136 			return key;
1137 	}
1138 	return NULL;
1139 }
1140 
1141 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1142 					 const struct sock *addr_sk)
1143 {
1144 	const union tcp_md5_addr *addr;
1145 	int l3index;
1146 
1147 	l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1148 						 addr_sk->sk_bound_dev_if);
1149 	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1150 	return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1151 }
1152 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1153 
1154 /* This can be called on a newly created socket, from other files */
1155 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1156 		   int family, u8 prefixlen, int l3index, u8 flags,
1157 		   const u8 *newkey, u8 newkeylen, gfp_t gfp)
1158 {
1159 	/* Add Key to the list */
1160 	struct tcp_md5sig_key *key;
1161 	struct tcp_sock *tp = tcp_sk(sk);
1162 	struct tcp_md5sig_info *md5sig;
1163 
1164 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1165 	if (key) {
1166 		/* Pre-existing entry - just update that one.
1167 		 * Note that the key might be used concurrently.
1168 		 * data_race() is telling KCSAN that we do not care about
1169 		 * key mismatches, since changing MD5 key on live flows
1170 		 * can lead to packet drops.
1171 		 */
1172 		data_race(memcpy(key->key, newkey, newkeylen));
1173 
1174 		/* Pairs with READ_ONCE() in tcp_md5_hash_key().
1175 		 * Also note that a reader could catch new key->keylen value
1176 		 * but old key->key[]; this is the reason we use __GFP_ZERO
1177 		 * at sock_kmalloc() time below these lines.
1178 		 */
1179 		WRITE_ONCE(key->keylen, newkeylen);
1180 
1181 		return 0;
1182 	}
1183 
1184 	md5sig = rcu_dereference_protected(tp->md5sig_info,
1185 					   lockdep_sock_is_held(sk));
1186 	if (!md5sig) {
1187 		md5sig = kmalloc(sizeof(*md5sig), gfp);
1188 		if (!md5sig)
1189 			return -ENOMEM;
1190 
1191 		sk_gso_disable(sk);
1192 		INIT_HLIST_HEAD(&md5sig->head);
1193 		rcu_assign_pointer(tp->md5sig_info, md5sig);
1194 	}
1195 
1196 	key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1197 	if (!key)
1198 		return -ENOMEM;
1199 	if (!tcp_alloc_md5sig_pool()) {
1200 		sock_kfree_s(sk, key, sizeof(*key));
1201 		return -ENOMEM;
1202 	}
1203 
1204 	memcpy(key->key, newkey, newkeylen);
1205 	key->keylen = newkeylen;
1206 	key->family = family;
1207 	key->prefixlen = prefixlen;
1208 	key->l3index = l3index;
1209 	key->flags = flags;
1210 	memcpy(&key->addr, addr,
1211 	       (family == AF_INET6) ? sizeof(struct in6_addr) :
1212 				      sizeof(struct in_addr));
1213 	hlist_add_head_rcu(&key->node, &md5sig->head);
1214 	return 0;
1215 }
1216 EXPORT_SYMBOL(tcp_md5_do_add);
1217 
1218 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1219 		   u8 prefixlen, int l3index, u8 flags)
1220 {
1221 	struct tcp_md5sig_key *key;
1222 
1223 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1224 	if (!key)
1225 		return -ENOENT;
1226 	hlist_del_rcu(&key->node);
1227 	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1228 	kfree_rcu(key, rcu);
1229 	return 0;
1230 }
1231 EXPORT_SYMBOL(tcp_md5_do_del);
1232 
1233 static void tcp_clear_md5_list(struct sock *sk)
1234 {
1235 	struct tcp_sock *tp = tcp_sk(sk);
1236 	struct tcp_md5sig_key *key;
1237 	struct hlist_node *n;
1238 	struct tcp_md5sig_info *md5sig;
1239 
1240 	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1241 
1242 	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1243 		hlist_del_rcu(&key->node);
1244 		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1245 		kfree_rcu(key, rcu);
1246 	}
1247 }
1248 
1249 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1250 				 sockptr_t optval, int optlen)
1251 {
1252 	struct tcp_md5sig cmd;
1253 	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1254 	const union tcp_md5_addr *addr;
1255 	u8 prefixlen = 32;
1256 	int l3index = 0;
1257 	u8 flags;
1258 
1259 	if (optlen < sizeof(cmd))
1260 		return -EINVAL;
1261 
1262 	if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1263 		return -EFAULT;
1264 
1265 	if (sin->sin_family != AF_INET)
1266 		return -EINVAL;
1267 
1268 	flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1269 
1270 	if (optname == TCP_MD5SIG_EXT &&
1271 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1272 		prefixlen = cmd.tcpm_prefixlen;
1273 		if (prefixlen > 32)
1274 			return -EINVAL;
1275 	}
1276 
1277 	if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
1278 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1279 		struct net_device *dev;
1280 
1281 		rcu_read_lock();
1282 		dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1283 		if (dev && netif_is_l3_master(dev))
1284 			l3index = dev->ifindex;
1285 
1286 		rcu_read_unlock();
1287 
1288 		/* ok to reference set/not set outside of rcu;
1289 		 * right now device MUST be an L3 master
1290 		 */
1291 		if (!dev || !l3index)
1292 			return -EINVAL;
1293 	}
1294 
1295 	addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1296 
1297 	if (!cmd.tcpm_keylen)
1298 		return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);
1299 
1300 	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1301 		return -EINVAL;
1302 
1303 	return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
1304 			      cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
1305 }
1306 
1307 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1308 				   __be32 daddr, __be32 saddr,
1309 				   const struct tcphdr *th, int nbytes)
1310 {
1311 	struct tcp4_pseudohdr *bp;
1312 	struct scatterlist sg;
1313 	struct tcphdr *_th;
1314 
1315 	bp = hp->scratch;
1316 	bp->saddr = saddr;
1317 	bp->daddr = daddr;
1318 	bp->pad = 0;
1319 	bp->protocol = IPPROTO_TCP;
1320 	bp->len = cpu_to_be16(nbytes);
1321 
1322 	_th = (struct tcphdr *)(bp + 1);
1323 	memcpy(_th, th, sizeof(*th));
1324 	_th->check = 0;
1325 
1326 	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1327 	ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1328 				sizeof(*bp) + sizeof(*th));
1329 	return crypto_ahash_update(hp->md5_req);
1330 }
1331 
1332 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1333 			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
1334 {
1335 	struct tcp_md5sig_pool *hp;
1336 	struct ahash_request *req;
1337 
1338 	hp = tcp_get_md5sig_pool();
1339 	if (!hp)
1340 		goto clear_hash_noput;
1341 	req = hp->md5_req;
1342 
1343 	if (crypto_ahash_init(req))
1344 		goto clear_hash;
1345 	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1346 		goto clear_hash;
1347 	if (tcp_md5_hash_key(hp, key))
1348 		goto clear_hash;
1349 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1350 	if (crypto_ahash_final(req))
1351 		goto clear_hash;
1352 
1353 	tcp_put_md5sig_pool();
1354 	return 0;
1355 
1356 clear_hash:
1357 	tcp_put_md5sig_pool();
1358 clear_hash_noput:
1359 	memset(md5_hash, 0, 16);
1360 	return 1;
1361 }
1362 
1363 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1364 			const struct sock *sk,
1365 			const struct sk_buff *skb)
1366 {
1367 	struct tcp_md5sig_pool *hp;
1368 	struct ahash_request *req;
1369 	const struct tcphdr *th = tcp_hdr(skb);
1370 	__be32 saddr, daddr;
1371 
1372 	if (sk) { /* valid for establish/request sockets */
1373 		saddr = sk->sk_rcv_saddr;
1374 		daddr = sk->sk_daddr;
1375 	} else {
1376 		const struct iphdr *iph = ip_hdr(skb);
1377 		saddr = iph->saddr;
1378 		daddr = iph->daddr;
1379 	}
1380 
1381 	hp = tcp_get_md5sig_pool();
1382 	if (!hp)
1383 		goto clear_hash_noput;
1384 	req = hp->md5_req;
1385 
1386 	if (crypto_ahash_init(req))
1387 		goto clear_hash;
1388 
1389 	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1390 		goto clear_hash;
1391 	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1392 		goto clear_hash;
1393 	if (tcp_md5_hash_key(hp, key))
1394 		goto clear_hash;
1395 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1396 	if (crypto_ahash_final(req))
1397 		goto clear_hash;
1398 
1399 	tcp_put_md5sig_pool();
1400 	return 0;
1401 
1402 clear_hash:
1403 	tcp_put_md5sig_pool();
1404 clear_hash_noput:
1405 	memset(md5_hash, 0, 16);
1406 	return 1;
1407 }
1408 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1409 
1410 #endif
1411 
1412 /* Called with rcu_read_lock() */
1413 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1414 				    const struct sk_buff *skb,
1415 				    int dif, int sdif)
1416 {
1417 #ifdef CONFIG_TCP_MD5SIG
1418 	/*
1419 	 * This gets called for each TCP segment that arrives
1420 	 * so we want to be efficient.
1421 	 * We have 3 drop cases:
1422 	 * o No MD5 hash and one expected.
1423 	 * o MD5 hash and we're not expecting one.
1424 	 * o MD5 hash and it's wrong.
1425 	 */
1426 	const __u8 *hash_location = NULL;
1427 	struct tcp_md5sig_key *hash_expected;
1428 	const struct iphdr *iph = ip_hdr(skb);
1429 	const struct tcphdr *th = tcp_hdr(skb);
1430 	const union tcp_md5_addr *addr;
1431 	unsigned char newhash[16];
1432 	int genhash, l3index;
1433 
1434 	/* sdif set means the packet ingressed via a device
1435 	 * in an L3 domain and dif is set to the l3mdev
1436 	 */
1437 	l3index = sdif ? dif : 0;
1438 
1439 	addr = (union tcp_md5_addr *)&iph->saddr;
1440 	hash_expected = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1441 	hash_location = tcp_parse_md5sig_option(th);
1442 
1443 	/* We've parsed the options - do we have a hash? */
1444 	if (!hash_expected && !hash_location)
1445 		return false;
1446 
1447 	if (hash_expected && !hash_location) {
1448 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1449 		return true;
1450 	}
1451 
1452 	if (!hash_expected && hash_location) {
1453 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1454 		return true;
1455 	}
1456 
1457 	/* Okay, so this is hash_expected and hash_location -
1458 	 * so we need to calculate the checksum.
1459 	 */
1460 	genhash = tcp_v4_md5_hash_skb(newhash,
1461 				      hash_expected,
1462 				      NULL, skb);
1463 
1464 	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1465 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1466 		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s L3 index %d\n",
1467 				     &iph->saddr, ntohs(th->source),
1468 				     &iph->daddr, ntohs(th->dest),
1469 				     genhash ? " tcp_v4_calc_md5_hash failed"
1470 				     : "", l3index);
1471 		return true;
1472 	}
1473 	return false;
1474 #endif
1475 	return false;
1476 }
1477 
1478 static void tcp_v4_init_req(struct request_sock *req,
1479 			    const struct sock *sk_listener,
1480 			    struct sk_buff *skb)
1481 {
1482 	struct inet_request_sock *ireq = inet_rsk(req);
1483 	struct net *net = sock_net(sk_listener);
1484 
1485 	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1486 	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1487 	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1488 }
1489 
1490 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1491 					  struct sk_buff *skb,
1492 					  struct flowi *fl,
1493 					  struct request_sock *req)
1494 {
1495 	tcp_v4_init_req(req, sk, skb);
1496 
1497 	if (security_inet_conn_request(sk, skb, req))
1498 		return NULL;
1499 
1500 	return inet_csk_route_req(sk, &fl->u.ip4, req);
1501 }
1502 
1503 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1504 	.family		=	PF_INET,
1505 	.obj_size	=	sizeof(struct tcp_request_sock),
1506 	.rtx_syn_ack	=	tcp_rtx_synack,
1507 	.send_ack	=	tcp_v4_reqsk_send_ack,
1508 	.destructor	=	tcp_v4_reqsk_destructor,
1509 	.send_reset	=	tcp_v4_send_reset,
1510 	.syn_ack_timeout =	tcp_syn_ack_timeout,
1511 };
1512 
1513 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1514 	.mss_clamp	=	TCP_MSS_DEFAULT,
1515 #ifdef CONFIG_TCP_MD5SIG
1516 	.req_md5_lookup	=	tcp_v4_md5_lookup,
1517 	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1518 #endif
1519 #ifdef CONFIG_SYN_COOKIES
1520 	.cookie_init_seq =	cookie_v4_init_sequence,
1521 #endif
1522 	.route_req	=	tcp_v4_route_req,
1523 	.init_seq	=	tcp_v4_init_seq,
1524 	.init_ts_off	=	tcp_v4_init_ts_off,
1525 	.send_synack	=	tcp_v4_send_synack,
1526 };
1527 
1528 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1529 {
1530 	/* Never answer SYNs sent to broadcast or multicast addresses */
1531 	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1532 		goto drop;
1533 
1534 	return tcp_conn_request(&tcp_request_sock_ops,
1535 				&tcp_request_sock_ipv4_ops, sk, skb);
1536 
1537 drop:
1538 	tcp_listendrop(sk);
1539 	return 0;
1540 }
1541 EXPORT_SYMBOL(tcp_v4_conn_request);
1542 
1543 
1544 /*
1545  * The three-way handshake has completed - we got a valid synack -
1546  * now create the new socket.
1547  */
1548 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1549 				  struct request_sock *req,
1550 				  struct dst_entry *dst,
1551 				  struct request_sock *req_unhash,
1552 				  bool *own_req)
1553 {
1554 	struct inet_request_sock *ireq;
1555 	bool found_dup_sk = false;
1556 	struct inet_sock *newinet;
1557 	struct tcp_sock *newtp;
1558 	struct sock *newsk;
1559 #ifdef CONFIG_TCP_MD5SIG
1560 	const union tcp_md5_addr *addr;
1561 	struct tcp_md5sig_key *key;
1562 	int l3index;
1563 #endif
1564 	struct ip_options_rcu *inet_opt;
1565 
1566 	if (sk_acceptq_is_full(sk))
1567 		goto exit_overflow;
1568 
1569 	newsk = tcp_create_openreq_child(sk, req, skb);
1570 	if (!newsk)
1571 		goto exit_nonewsk;
1572 
1573 	newsk->sk_gso_type = SKB_GSO_TCPV4;
1574 	inet_sk_rx_dst_set(newsk, skb);
1575 
1576 	newtp		      = tcp_sk(newsk);
1577 	newinet		      = inet_sk(newsk);
1578 	ireq		      = inet_rsk(req);
1579 	sk_daddr_set(newsk, ireq->ir_rmt_addr);
1580 	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1581 	newsk->sk_bound_dev_if = ireq->ir_iif;
1582 	newinet->inet_saddr   = ireq->ir_loc_addr;
1583 	inet_opt	      = rcu_dereference(ireq->ireq_opt);
1584 	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1585 	newinet->mc_index     = inet_iif(skb);
1586 	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1587 	newinet->rcv_tos      = ip_hdr(skb)->tos;
1588 	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1589 	if (inet_opt)
1590 		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1591 	newinet->inet_id = prandom_u32();
1592 
1593 	/* Set ToS of the new socket based upon the value of incoming SYN.
1594 	 * ECT bits are set later in tcp_init_transfer().
1595 	 */
1596 	if (sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)
1597 		newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1598 
1599 	if (!dst) {
1600 		dst = inet_csk_route_child_sock(sk, newsk, req);
1601 		if (!dst)
1602 			goto put_and_exit;
1603 	} else {
1604 		/* syncookie case: see end of cookie_v4_check() */
1605 	}
1606 	sk_setup_caps(newsk, dst);
1607 
1608 	tcp_ca_openreq_child(newsk, dst);
1609 
1610 	tcp_sync_mss(newsk, dst_mtu(dst));
1611 	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1612 
1613 	tcp_initialize_rcv_mss(newsk);
1614 
1615 #ifdef CONFIG_TCP_MD5SIG
1616 	l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1617 	/* Copy over the MD5 key from the original socket */
1618 	addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1619 	key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1620 	if (key) {
1621 		/*
1622 		 * We're using one, so create a matching key
1623 		 * on the newsk structure. If we fail to get
1624 		 * memory, then we end up not copying the key
1625 		 * across. Shucks.
1626 		 */
1627 		tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index, key->flags,
1628 			       key->key, key->keylen, GFP_ATOMIC);
1629 		sk_gso_disable(newsk);
1630 	}
1631 #endif
1632 
1633 	if (__inet_inherit_port(sk, newsk) < 0)
1634 		goto put_and_exit;
1635 	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1636 				       &found_dup_sk);
1637 	if (likely(*own_req)) {
1638 		tcp_move_syn(newtp, req);
1639 		ireq->ireq_opt = NULL;
1640 	} else {
1641 		newinet->inet_opt = NULL;
1642 
1643 		if (!req_unhash && found_dup_sk) {
1644 			/* This code path should only be executed in the
1645 			 * syncookie case
1646 			 */
1647 			bh_unlock_sock(newsk);
1648 			sock_put(newsk);
1649 			newsk = NULL;
1650 		}
1651 	}
1652 	return newsk;
1653 
1654 exit_overflow:
1655 	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1656 exit_nonewsk:
1657 	dst_release(dst);
1658 exit:
1659 	tcp_listendrop(sk);
1660 	return NULL;
1661 put_and_exit:
1662 	newinet->inet_opt = NULL;
1663 	inet_csk_prepare_forced_close(newsk);
1664 	tcp_done(newsk);
1665 	goto exit;
1666 }
1667 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1668 
1669 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1670 {
1671 #ifdef CONFIG_SYN_COOKIES
1672 	const struct tcphdr *th = tcp_hdr(skb);
1673 
1674 	if (!th->syn)
1675 		sk = cookie_v4_check(sk, skb);
1676 #endif
1677 	return sk;
1678 }
1679 
1680 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1681 			 struct tcphdr *th, u32 *cookie)
1682 {
1683 	u16 mss = 0;
1684 #ifdef CONFIG_SYN_COOKIES
1685 	mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1686 				    &tcp_request_sock_ipv4_ops, sk, th);
1687 	if (mss) {
1688 		*cookie = __cookie_v4_init_sequence(iph, th, &mss);
1689 		tcp_synq_overflow(sk);
1690 	}
1691 #endif
1692 	return mss;
1693 }
1694 
1695 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
1696 							   u32));
1697 /* The socket must have its spinlock held when we get
1698  * here, unless it is a TCP_LISTEN socket.
1699  *
1700  * We have a potential double-lock case here, so even when
1701  * doing backlog processing we use the BH locking scheme.
1702  * This is because we cannot sleep with the original spinlock
1703  * held.
1704  */
1705 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1706 {
1707 	struct sock *rsk;
1708 
1709 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1710 		struct dst_entry *dst;
1711 
1712 		dst = rcu_dereference_protected(sk->sk_rx_dst,
1713 						lockdep_sock_is_held(sk));
1714 
1715 		sock_rps_save_rxhash(sk, skb);
1716 		sk_mark_napi_id(sk, skb);
1717 		if (dst) {
1718 			if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
1719 			    !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
1720 					     dst, 0)) {
1721 				RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
1722 				dst_release(dst);
1723 			}
1724 		}
1725 		tcp_rcv_established(sk, skb);
1726 		return 0;
1727 	}
1728 
1729 	if (tcp_checksum_complete(skb))
1730 		goto csum_err;
1731 
1732 	if (sk->sk_state == TCP_LISTEN) {
1733 		struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1734 
1735 		if (!nsk)
1736 			goto discard;
1737 		if (nsk != sk) {
1738 			if (tcp_child_process(sk, nsk, skb)) {
1739 				rsk = nsk;
1740 				goto reset;
1741 			}
1742 			return 0;
1743 		}
1744 	} else
1745 		sock_rps_save_rxhash(sk, skb);
1746 
1747 	if (tcp_rcv_state_process(sk, skb)) {
1748 		rsk = sk;
1749 		goto reset;
1750 	}
1751 	return 0;
1752 
1753 reset:
1754 	tcp_v4_send_reset(rsk, skb);
1755 discard:
1756 	kfree_skb(skb);
1757 	/* Be careful here. If this function gets more complicated and
1758 	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1759 	 * might be destroyed here. This current version compiles correctly,
1760 	 * but you have been warned.
1761 	 */
1762 	return 0;
1763 
1764 csum_err:
1765 	trace_tcp_bad_csum(skb);
1766 	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1767 	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1768 	goto discard;
1769 }
1770 EXPORT_SYMBOL(tcp_v4_do_rcv);
1771 
1772 int tcp_v4_early_demux(struct sk_buff *skb)
1773 {
1774 	const struct iphdr *iph;
1775 	const struct tcphdr *th;
1776 	struct sock *sk;
1777 
1778 	if (skb->pkt_type != PACKET_HOST)
1779 		return 0;
1780 
1781 	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1782 		return 0;
1783 
1784 	iph = ip_hdr(skb);
1785 	th = tcp_hdr(skb);
1786 
1787 	if (th->doff < sizeof(struct tcphdr) / 4)
1788 		return 0;
1789 
1790 	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1791 				       iph->saddr, th->source,
1792 				       iph->daddr, ntohs(th->dest),
1793 				       skb->skb_iif, inet_sdif(skb));
1794 	if (sk) {
1795 		skb->sk = sk;
1796 		skb->destructor = sock_edemux;
1797 		if (sk_fullsock(sk)) {
1798 			struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
1799 
1800 			if (dst)
1801 				dst = dst_check(dst, 0);
1802 			if (dst &&
1803 			    sk->sk_rx_dst_ifindex == skb->skb_iif)
1804 				skb_dst_set_noref(skb, dst);
1805 		}
1806 	}
1807 	return 0;
1808 }
1809 
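/* Queue @skb on the backlog of a socket currently owned by user context.
 * Returns false when the skb has been queued (possibly coalesced with
 * the backlog tail); returns true when the caller must drop it, either
 * because the checksum is bad or because the backlog limit was
 * exceeded -- in both of those cases the socket spinlock has already
 * been released here.
 */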
1810 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1811 {
1812 	u32 limit, tail_gso_size, tail_gso_segs;
1813 	struct skb_shared_info *shinfo;
1814 	const struct tcphdr *th;
1815 	struct tcphdr *thtail;
1816 	struct sk_buff *tail;
1817 	unsigned int hdrlen;
1818 	bool fragstolen;
1819 	u32 gso_segs;
1820 	u32 gso_size;
1821 	int delta;
1822 
1823 	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1824 	 * we can fix skb->truesize to its real value to avoid future drops.
1825 	 * This is valid because skb is not yet charged to the socket.
1826 	 * It has been noticed that pure SACK packets were sometimes dropped
1827 	 * (when built by drivers without the copybreak feature).
1828 	 */
1829 	skb_condense(skb);
1830 
1831 	skb_dst_drop(skb);
1832 
1833 	if (unlikely(tcp_checksum_complete(skb))) {
1834 		bh_unlock_sock(sk);
1835 		trace_tcp_bad_csum(skb);
1836 		__TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1837 		__TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1838 		return true;
1839 	}
1840 
1841 	/* Attempt coalescing to last skb in backlog, even if we are
1842 	 * above the limits.
1843 	 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1844 	 */
1845 	th = (const struct tcphdr *)skb->data;
1846 	hdrlen = th->doff * 4;
1847 
1848 	tail = sk->sk_backlog.tail;
1849 	if (!tail)
1850 		goto no_coalesce;
1851 	thtail = (struct tcphdr *)tail->data;
1852 
1853 	if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1854 	    TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1855 	    ((TCP_SKB_CB(tail)->tcp_flags |
1856 	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1857 	    !((TCP_SKB_CB(tail)->tcp_flags &
1858 	      TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1859 	    ((TCP_SKB_CB(tail)->tcp_flags ^
1860 	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1861 #ifdef CONFIG_TLS_DEVICE
1862 	    tail->decrypted != skb->decrypted ||
1863 #endif
1864 	    thtail->doff != th->doff ||
1865 	    memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1866 		goto no_coalesce;
1867 
1868 	__skb_pull(skb, hdrlen);
1869 
1870 	shinfo = skb_shinfo(skb);
1871 	gso_size = shinfo->gso_size ?: skb->len;
1872 	gso_segs = shinfo->gso_segs ?: 1;
1873 
1874 	shinfo = skb_shinfo(tail);
1875 	tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
1876 	tail_gso_segs = shinfo->gso_segs ?: 1;
1877 
1878 	if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1879 		TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1880 
1881 		if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
1882 			TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1883 			thtail->window = th->window;
1884 		}
1885 
1886 		/* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1887 		 * thtail->fin, so that the fast path in tcp_rcv_established()
1888 		 * is not entered if we append a packet with a FIN.
1889 		 * SYN, RST, URG are not present.
1890 		 * ACK is set on both packets.
1891 		 * PSH : we do not really care about it in the TCP stack,
1892 		 *       at least for 'GRO' packets.
1893 		 */
1894 		thtail->fin |= th->fin;
1895 		TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1896 
1897 		if (TCP_SKB_CB(skb)->has_rxtstamp) {
1898 			TCP_SKB_CB(tail)->has_rxtstamp = true;
1899 			tail->tstamp = skb->tstamp;
1900 			skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1901 		}
1902 
1903 		/* Not as strict as GRO. We only need to carry mss max value */
1904 		shinfo->gso_size = max(gso_size, tail_gso_size);
1905 		shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
1906 
1907 		sk->sk_backlog.len += delta;
1908 		__NET_INC_STATS(sock_net(sk),
1909 				LINUX_MIB_TCPBACKLOGCOALESCE);
1910 		kfree_skb_partial(skb, fragstolen);
1911 		return false;
1912 	}
1913 	__skb_push(skb, hdrlen);
1914 
1915 no_coalesce:
1916 	/* Only the socket owner can try to collapse/prune rx queues
1917 	 * to reduce memory overhead, so add a little headroom here.
1918 	 * Only a few socket backlogs are likely to be non-empty concurrently.
1919 	 */
1920 	limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf) + 64*1024;
1921 
1922 	if (unlikely(sk_add_backlog(sk, skb, limit))) {
1923 		bh_unlock_sock(sk);
1924 		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1925 		return true;
1926 	}
1927 	return false;
1928 }
1929 EXPORT_SYMBOL(tcp_add_backlog);
1930 
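/* Run the attached socket filter (if any) on the skb.  The trim cap of
 * th->doff * 4 guarantees the TCP header itself is never trimmed away.
 */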
1931 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1932 {
1933 	struct tcphdr *th = (struct tcphdr *)skb->data;
1934 
1935 	return sk_filter_trim_cap(sk, skb, th->doff * 4);
1936 }
1937 EXPORT_SYMBOL(tcp_filter);
1938 
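/* tcp_v4_fill_cb() below overlays TCP_SKB_CB() on top of the IP control
 * block.  tcp_v4_restore_cb() undoes that move so the original IPCB
 * contents are intact again before the skb is re-looked-up or handed to
 * another socket.
 */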
1939 static void tcp_v4_restore_cb(struct sk_buff *skb)
1940 {
1941 	memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1942 		sizeof(struct inet_skb_parm));
1943 }
1944 
1945 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1946 			   const struct tcphdr *th)
1947 {
1948 	/* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
1949 	 * barrier() makes sure the compiler won't play fool^Waliasing games.
1950 	 */
1951 	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1952 		sizeof(struct inet_skb_parm));
1953 	barrier();
1954 
1955 	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1956 	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1957 				    skb->len - th->doff * 4);
1958 	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1959 	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1960 	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1961 	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1962 	TCP_SKB_CB(skb)->sacked	 = 0;
1963 	TCP_SKB_CB(skb)->has_rxtstamp =
1964 			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1965 }
1966 
1967 /*
1968  *	From tcp_input.c
1969  */
1970 
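/* Main IPv4 receive path, called in softirq context: validate the TCP
 * header and checksum, look up the owning socket, give NEW_SYN_RECV and
 * TIME_WAIT mini-sockets their special handling, then either process
 * the segment directly or, if the socket is owned by user context,
 * queue it on the backlog.
 */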
1971 int tcp_v4_rcv(struct sk_buff *skb)
1972 {
1973 	struct net *net = dev_net(skb->dev);
1974 	int sdif = inet_sdif(skb);
1975 	int dif = inet_iif(skb);
1976 	const struct iphdr *iph;
1977 	const struct tcphdr *th;
1978 	bool refcounted;
1979 	struct sock *sk;
1980 	int drop_reason;
1981 	int ret;
1982 
1983 	drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
1984 	if (skb->pkt_type != PACKET_HOST)
1985 		goto discard_it;
1986 
1987 	/* Count it even if it's bad */
1988 	__TCP_INC_STATS(net, TCP_MIB_INSEGS);
1989 
1990 	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1991 		goto discard_it;
1992 
1993 	th = (const struct tcphdr *)skb->data;
1994 
1995 	if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
1996 		drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
1997 		goto bad_packet;
1998 	}
1999 	if (!pskb_may_pull(skb, th->doff * 4))
2000 		goto discard_it;
2001 
2002 	/* An explanation is required here, I think.
2003 	 * Packet length and doff are validated by header prediction,
2004 	 * provided the case of th->doff == 0 is eliminated.
2005 	 * So, we defer the checks. */
2006 
2007 	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
2008 		goto csum_error;
2009 
2010 	th = (const struct tcphdr *)skb->data;
2011 	iph = ip_hdr(skb);
2012 lookup:
2013 	sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
2014 			       th->dest, sdif, &refcounted);
2015 	if (!sk)
2016 		goto no_tcp_socket;
2017 
2018 process:
2019 	if (sk->sk_state == TCP_TIME_WAIT)
2020 		goto do_time_wait;
2021 
2022 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
2023 		struct request_sock *req = inet_reqsk(sk);
2024 		bool req_stolen = false;
2025 		struct sock *nsk;
2026 
2027 		sk = req->rsk_listener;
2028 		if (unlikely(tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))) {
2029 			sk_drops_add(sk, skb);
2030 			reqsk_put(req);
2031 			goto discard_it;
2032 		}
2033 		if (tcp_checksum_complete(skb)) {
2034 			reqsk_put(req);
2035 			goto csum_error;
2036 		}
2037 		if (unlikely(sk->sk_state != TCP_LISTEN)) {
2038 			nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
2039 			if (!nsk) {
2040 				inet_csk_reqsk_queue_drop_and_put(sk, req);
2041 				goto lookup;
2042 			}
2043 			sk = nsk;
2044 			/* reuseport_migrate_sock() has already taken one sk_refcnt
2045 			 * before returning.
2046 			 */
2047 		} else {
2048 			/* We own a reference on the listener, increase it again
2049 			 * as we might lose it too soon.
2050 			 */
2051 			sock_hold(sk);
2052 		}
2053 		refcounted = true;
2054 		nsk = NULL;
2055 		if (!tcp_filter(sk, skb)) {
2056 			th = (const struct tcphdr *)skb->data;
2057 			iph = ip_hdr(skb);
2058 			tcp_v4_fill_cb(skb, iph, th);
2059 			nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
2060 		}
2061 		if (!nsk) {
2062 			reqsk_put(req);
2063 			if (req_stolen) {
2064 				/* Another cpu got exclusive access to req
2065 				 * and created a full blown socket.
2066 				 * Try to feed this packet to this socket
2067 				 * instead of discarding it.
2068 				 */
2069 				tcp_v4_restore_cb(skb);
2070 				sock_put(sk);
2071 				goto lookup;
2072 			}
2073 			goto discard_and_relse;
2074 		}
2075 		if (nsk == sk) {
2076 			reqsk_put(req);
2077 			tcp_v4_restore_cb(skb);
2078 		} else if (tcp_child_process(sk, nsk, skb)) {
2079 			tcp_v4_send_reset(nsk, skb);
2080 			goto discard_and_relse;
2081 		} else {
2082 			sock_put(sk);
2083 			return 0;
2084 		}
2085 	}
2086 
2087 	if (static_branch_unlikely(&ip4_min_ttl)) {
2088 		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
2089 		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
2090 			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2091 			goto discard_and_relse;
2092 		}
2093 	}
2094 
2095 	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
2096 		goto discard_and_relse;
2097 
2098 	if (tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))
2099 		goto discard_and_relse;
2100 
2101 	nf_reset_ct(skb);
2102 
2103 	if (tcp_filter(sk, skb)) {
2104 		drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2105 		goto discard_and_relse;
2106 	}
2107 	th = (const struct tcphdr *)skb->data;
2108 	iph = ip_hdr(skb);
2109 	tcp_v4_fill_cb(skb, iph, th);
2110 
2111 	skb->dev = NULL;
2112 
2113 	if (sk->sk_state == TCP_LISTEN) {
2114 		ret = tcp_v4_do_rcv(sk, skb);
2115 		goto put_and_return;
2116 	}
2117 
2118 	sk_incoming_cpu_update(sk);
2119 
2120 	sk_defer_free_flush(sk);
2121 	bh_lock_sock_nested(sk);
2122 	tcp_segs_in(tcp_sk(sk), skb);
2123 	ret = 0;
2124 	if (!sock_owned_by_user(sk)) {
2125 		ret = tcp_v4_do_rcv(sk, skb);
2126 	} else {
2127 		if (tcp_add_backlog(sk, skb))
2128 			goto discard_and_relse;
2129 	}
2130 	bh_unlock_sock(sk);
2131 
2132 put_and_return:
2133 	if (refcounted)
2134 		sock_put(sk);
2135 
2136 	return ret;
2137 
2138 no_tcp_socket:
2139 	drop_reason = SKB_DROP_REASON_NO_SOCKET;
2140 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2141 		goto discard_it;
2142 
2143 	tcp_v4_fill_cb(skb, iph, th);
2144 
2145 	if (tcp_checksum_complete(skb)) {
2146 csum_error:
2147 		drop_reason = SKB_DROP_REASON_TCP_CSUM;
2148 		trace_tcp_bad_csum(skb);
2149 		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2150 bad_packet:
2151 		__TCP_INC_STATS(net, TCP_MIB_INERRS);
2152 	} else {
2153 		tcp_v4_send_reset(NULL, skb);
2154 	}
2155 
2156 discard_it:
2157 	/* Discard frame. */
2158 	kfree_skb_reason(skb, drop_reason);
2159 	return 0;
2160 
2161 discard_and_relse:
2162 	sk_drops_add(sk, skb);
2163 	if (refcounted)
2164 		sock_put(sk);
2165 	goto discard_it;
2166 
2167 do_time_wait:
2168 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2169 		inet_twsk_put(inet_twsk(sk));
2170 		goto discard_it;
2171 	}
2172 
2173 	tcp_v4_fill_cb(skb, iph, th);
2174 
2175 	if (tcp_checksum_complete(skb)) {
2176 		inet_twsk_put(inet_twsk(sk));
2177 		goto csum_error;
2178 	}
2179 	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2180 	case TCP_TW_SYN: {
2181 		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
2182 							&tcp_hashinfo, skb,
2183 							__tcp_hdrlen(th),
2184 							iph->saddr, th->source,
2185 							iph->daddr, th->dest,
2186 							inet_iif(skb),
2187 							sdif);
2188 		if (sk2) {
2189 			inet_twsk_deschedule_put(inet_twsk(sk));
2190 			sk = sk2;
2191 			tcp_v4_restore_cb(skb);
2192 			refcounted = false;
2193 			goto process;
2194 		}
2195 	}
2196 		/* to ACK */
2197 		fallthrough;
2198 	case TCP_TW_ACK:
2199 		tcp_v4_timewait_ack(sk, skb);
2200 		break;
2201 	case TCP_TW_RST:
2202 		tcp_v4_send_reset(sk, skb);
2203 		inet_twsk_deschedule_put(inet_twsk(sk));
2204 		goto discard_it;
2205 	case TCP_TW_SUCCESS:;
2206 	}
2207 	goto discard_it;
2208 }
2209 
2210 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2211 	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
2212 	.twsk_unique	= tcp_twsk_unique,
2213 	.twsk_destructor= tcp_twsk_destructor,
2214 };
2215 
2216 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2217 {
2218 	struct dst_entry *dst = skb_dst(skb);
2219 
2220 	if (dst && dst_hold_safe(dst)) {
2221 		rcu_assign_pointer(sk->sk_rx_dst, dst);
2222 		sk->sk_rx_dst_ifindex = skb->skb_iif;
2223 	}
2224 }
2225 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2226 
2227 const struct inet_connection_sock_af_ops ipv4_specific = {
2228 	.queue_xmit	   = ip_queue_xmit,
2229 	.send_check	   = tcp_v4_send_check,
2230 	.rebuild_header	   = inet_sk_rebuild_header,
2231 	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
2232 	.conn_request	   = tcp_v4_conn_request,
2233 	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
2234 	.net_header_len	   = sizeof(struct iphdr),
2235 	.setsockopt	   = ip_setsockopt,
2236 	.getsockopt	   = ip_getsockopt,
2237 	.addr2sockaddr	   = inet_csk_addr2sockaddr,
2238 	.sockaddr_len	   = sizeof(struct sockaddr_in),
2239 	.mtu_reduced	   = tcp_v4_mtu_reduced,
2240 };
2241 EXPORT_SYMBOL(ipv4_specific);
2242 
2243 #ifdef CONFIG_TCP_MD5SIG
2244 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2245 	.md5_lookup		= tcp_v4_md5_lookup,
2246 	.calc_md5_hash		= tcp_v4_md5_hash_skb,
2247 	.md5_parse		= tcp_v4_parse_md5_keys,
2248 };
2249 #endif
2250 
2251 /* NOTE: A lot of things are set to zero explicitly by the call to
2252  *       sk_alloc(), so they need not be done here.
2253  */
2254 static int tcp_v4_init_sock(struct sock *sk)
2255 {
2256 	struct inet_connection_sock *icsk = inet_csk(sk);
2257 
2258 	tcp_init_sock(sk);
2259 
2260 	icsk->icsk_af_ops = &ipv4_specific;
2261 
2262 #ifdef CONFIG_TCP_MD5SIG
2263 	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2264 #endif
2265 
2266 	return 0;
2267 }
2268 
2269 void tcp_v4_destroy_sock(struct sock *sk)
2270 {
2271 	struct tcp_sock *tp = tcp_sk(sk);
2272 
2273 	trace_tcp_destroy_sock(sk);
2274 
2275 	tcp_clear_xmit_timers(sk);
2276 
2277 	tcp_cleanup_congestion_control(sk);
2278 
2279 	tcp_cleanup_ulp(sk);
2280 
2281 	/* Clean up the write buffer. */
2282 	tcp_write_queue_purge(sk);
2283 
2284 	/* Check if we want to disable active TFO */
2285 	tcp_fastopen_active_disable_ofo_check(sk);
2286 
2287 	/* Cleans up our, hopefully empty, out_of_order_queue. */
2288 	skb_rbtree_purge(&tp->out_of_order_queue);
2289 
2290 #ifdef CONFIG_TCP_MD5SIG
2291 	/* Clean up the MD5 key list, if any */
2292 	if (tp->md5sig_info) {
2293 		tcp_clear_md5_list(sk);
2294 		kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2295 		tp->md5sig_info = NULL;
2296 	}
2297 #endif
2298 
2299 	/* Clean up a referenced TCP bind bucket. */
2300 	if (inet_csk(sk)->icsk_bind_hash)
2301 		inet_put_port(sk);
2302 
2303 	BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2304 
2305 	/* If socket is aborted during connect operation */
2306 	tcp_free_fastopen_req(tp);
2307 	tcp_fastopen_destroy_cipher(sk);
2308 	tcp_saved_syn_free(tp);
2309 
2310 	sk_sockets_allocated_dec(sk);
2311 }
2312 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2313 
2314 #ifdef CONFIG_PROC_FS
2315 /* Proc filesystem TCP sock list dumping. */
2316 
2317 static unsigned short seq_file_family(const struct seq_file *seq);
2318 
2319 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2320 {
2321 	unsigned short family = seq_file_family(seq);
2322 
2323 	/* AF_UNSPEC is used as a match all */
2324 	return ((family == AF_UNSPEC || family == sk->sk_family) &&
2325 		net_eq(sock_net(sk), seq_file_net(seq)));
2326 }
2327 
2328 /* Find a non empty bucket (starting from st->bucket)
2329  * and return the first sk from it.
2330  */
2331 static void *listening_get_first(struct seq_file *seq)
2332 {
2333 	struct tcp_iter_state *st = seq->private;
2334 
2335 	st->offset = 0;
2336 	for (; st->bucket <= tcp_hashinfo.lhash2_mask; st->bucket++) {
2337 		struct inet_listen_hashbucket *ilb2;
2338 		struct inet_connection_sock *icsk;
2339 		struct sock *sk;
2340 
2341 		ilb2 = &tcp_hashinfo.lhash2[st->bucket];
2342 		if (hlist_empty(&ilb2->head))
2343 			continue;
2344 
2345 		spin_lock(&ilb2->lock);
2346 		inet_lhash2_for_each_icsk(icsk, &ilb2->head) {
2347 			sk = (struct sock *)icsk;
2348 			if (seq_sk_match(seq, sk))
2349 				return sk;
2350 		}
2351 		spin_unlock(&ilb2->lock);
2352 	}
2353 
2354 	return NULL;
2355 }
2356 
2357 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
2358  * If "cur" is the last one in the st->bucket,
2359  * call listening_get_first() to return the first sk of the next
2360  * non empty bucket.
2361  */
2362 static void *listening_get_next(struct seq_file *seq, void *cur)
2363 {
2364 	struct tcp_iter_state *st = seq->private;
2365 	struct inet_listen_hashbucket *ilb2;
2366 	struct inet_connection_sock *icsk;
2367 	struct sock *sk = cur;
2368 
2369 	++st->num;
2370 	++st->offset;
2371 
2372 	icsk = inet_csk(sk);
2373 	inet_lhash2_for_each_icsk_continue(icsk) {
2374 		sk = (struct sock *)icsk;
2375 		if (seq_sk_match(seq, sk))
2376 			return sk;
2377 	}
2378 
2379 	ilb2 = &tcp_hashinfo.lhash2[st->bucket];
2380 	spin_unlock(&ilb2->lock);
2381 	++st->bucket;
2382 	return listening_get_first(seq);
2383 }
2384 
2385 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2386 {
2387 	struct tcp_iter_state *st = seq->private;
2388 	void *rc;
2389 
2390 	st->bucket = 0;
2391 	st->offset = 0;
2392 	rc = listening_get_first(seq);
2393 
2394 	while (rc && *pos) {
2395 		rc = listening_get_next(seq, rc);
2396 		--*pos;
2397 	}
2398 	return rc;
2399 }
2400 
2401 static inline bool empty_bucket(const struct tcp_iter_state *st)
2402 {
2403 	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2404 }
2405 
2406 /*
2407  * Get first established socket starting from bucket given in st->bucket.
2408  * If st->bucket is zero, the very first socket in the hash is returned.
2409  */
2410 static void *established_get_first(struct seq_file *seq)
2411 {
2412 	struct tcp_iter_state *st = seq->private;
2413 
2414 	st->offset = 0;
2415 	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2416 		struct sock *sk;
2417 		struct hlist_nulls_node *node;
2418 		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2419 
2420 		/* Lockless fast path for the common case of empty buckets */
2421 		if (empty_bucket(st))
2422 			continue;
2423 
2424 		spin_lock_bh(lock);
2425 		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2426 			if (seq_sk_match(seq, sk))
2427 				return sk;
2428 		}
2429 		spin_unlock_bh(lock);
2430 	}
2431 
2432 	return NULL;
2433 }
2434 
2435 static void *established_get_next(struct seq_file *seq, void *cur)
2436 {
2437 	struct sock *sk = cur;
2438 	struct hlist_nulls_node *node;
2439 	struct tcp_iter_state *st = seq->private;
2440 
2441 	++st->num;
2442 	++st->offset;
2443 
2444 	sk = sk_nulls_next(sk);
2445 
2446 	sk_nulls_for_each_from(sk, node) {
2447 		if (seq_sk_match(seq, sk))
2448 			return sk;
2449 	}
2450 
2451 	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2452 	++st->bucket;
2453 	return established_get_first(seq);
2454 }
2455 
2456 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2457 {
2458 	struct tcp_iter_state *st = seq->private;
2459 	void *rc;
2460 
2461 	st->bucket = 0;
2462 	rc = established_get_first(seq);
2463 
2464 	while (rc && pos) {
2465 		rc = established_get_next(seq, rc);
2466 		--pos;
2467 	}
2468 	return rc;
2469 }
2470 
2471 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2472 {
2473 	void *rc;
2474 	struct tcp_iter_state *st = seq->private;
2475 
2476 	st->state = TCP_SEQ_STATE_LISTENING;
2477 	rc	  = listening_get_idx(seq, &pos);
2478 
2479 	if (!rc) {
2480 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2481 		rc	  = established_get_idx(seq, pos);
2482 	}
2483 
2484 	return rc;
2485 }
2486 
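/* Re-position the iterator at st->bucket/st->offset after seq_file
 * restarted iteration.  st->num is saved and restored so the "sl"
 * numbering of already-dumped entries is not disturbed.
 */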
2487 static void *tcp_seek_last_pos(struct seq_file *seq)
2488 {
2489 	struct tcp_iter_state *st = seq->private;
2490 	int bucket = st->bucket;
2491 	int offset = st->offset;
2492 	int orig_num = st->num;
2493 	void *rc = NULL;
2494 
2495 	switch (st->state) {
2496 	case TCP_SEQ_STATE_LISTENING:
2497 		if (st->bucket > tcp_hashinfo.lhash2_mask)
2498 			break;
2499 		st->state = TCP_SEQ_STATE_LISTENING;
2500 		rc = listening_get_first(seq);
2501 		while (offset-- && rc && bucket == st->bucket)
2502 			rc = listening_get_next(seq, rc);
2503 		if (rc)
2504 			break;
2505 		st->bucket = 0;
2506 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2507 		fallthrough;
2508 	case TCP_SEQ_STATE_ESTABLISHED:
2509 		if (st->bucket > tcp_hashinfo.ehash_mask)
2510 			break;
2511 		rc = established_get_first(seq);
2512 		while (offset-- && rc && bucket == st->bucket)
2513 			rc = established_get_next(seq, rc);
2514 	}
2515 
2516 	st->num = orig_num;
2517 
2518 	return rc;
2519 }
2520 
2521 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2522 {
2523 	struct tcp_iter_state *st = seq->private;
2524 	void *rc;
2525 
2526 	if (*pos && *pos == st->last_pos) {
2527 		rc = tcp_seek_last_pos(seq);
2528 		if (rc)
2529 			goto out;
2530 	}
2531 
2532 	st->state = TCP_SEQ_STATE_LISTENING;
2533 	st->num = 0;
2534 	st->bucket = 0;
2535 	st->offset = 0;
2536 	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2537 
2538 out:
2539 	st->last_pos = *pos;
2540 	return rc;
2541 }
2542 EXPORT_SYMBOL(tcp_seq_start);
2543 
2544 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2545 {
2546 	struct tcp_iter_state *st = seq->private;
2547 	void *rc = NULL;
2548 
2549 	if (v == SEQ_START_TOKEN) {
2550 		rc = tcp_get_idx(seq, 0);
2551 		goto out;
2552 	}
2553 
2554 	switch (st->state) {
2555 	case TCP_SEQ_STATE_LISTENING:
2556 		rc = listening_get_next(seq, v);
2557 		if (!rc) {
2558 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2559 			st->bucket = 0;
2560 			st->offset = 0;
2561 			rc	  = established_get_first(seq);
2562 		}
2563 		break;
2564 	case TCP_SEQ_STATE_ESTABLISHED:
2565 		rc = established_get_next(seq, v);
2566 		break;
2567 	}
2568 out:
2569 	++*pos;
2570 	st->last_pos = *pos;
2571 	return rc;
2572 }
2573 EXPORT_SYMBOL(tcp_seq_next);
2574 
2575 void tcp_seq_stop(struct seq_file *seq, void *v)
2576 {
2577 	struct tcp_iter_state *st = seq->private;
2578 
2579 	switch (st->state) {
2580 	case TCP_SEQ_STATE_LISTENING:
2581 		if (v != SEQ_START_TOKEN)
2582 			spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock);
2583 		break;
2584 	case TCP_SEQ_STATE_ESTABLISHED:
2585 		if (v)
2586 			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2587 		break;
2588 	}
2589 }
2590 EXPORT_SYMBOL(tcp_seq_stop);
2591 
2592 static void get_openreq4(const struct request_sock *req,
2593 			 struct seq_file *f, int i)
2594 {
2595 	const struct inet_request_sock *ireq = inet_rsk(req);
2596 	long delta = req->rsk_timer.expires - jiffies;
2597 
2598 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2599 		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2600 		i,
2601 		ireq->ir_loc_addr,
2602 		ireq->ir_num,
2603 		ireq->ir_rmt_addr,
2604 		ntohs(ireq->ir_rmt_port),
2605 		TCP_SYN_RECV,
2606 		0, 0, /* could print option size, but that is af dependent. */
2607 		1,    /* timers active (only the expire timer) */
2608 		jiffies_delta_to_clock_t(delta),
2609 		req->num_timeout,
2610 		from_kuid_munged(seq_user_ns(f),
2611 				 sock_i_uid(req->rsk_listener)),
2612 		0,  /* non standard timer */
2613 		0, /* open_requests have no inode */
2614 		0,
2615 		req);
2616 }
2617 
2618 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2619 {
2620 	int timer_active;
2621 	unsigned long timer_expires;
2622 	const struct tcp_sock *tp = tcp_sk(sk);
2623 	const struct inet_connection_sock *icsk = inet_csk(sk);
2624 	const struct inet_sock *inet = inet_sk(sk);
2625 	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2626 	__be32 dest = inet->inet_daddr;
2627 	__be32 src = inet->inet_rcv_saddr;
2628 	__u16 destp = ntohs(inet->inet_dport);
2629 	__u16 srcp = ntohs(inet->inet_sport);
2630 	int rx_queue;
2631 	int state;
2632 
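	/* Encode the "tr" column of /proc/net/tcp: 1 - a retransmit-style
	 * timer (retransmit, reorder or loss probe), 2 - another pending
	 * timer (the keepalive sk_timer), 4 - the zero window probe timer,
	 * 0 - no timer pending.
	 */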
2633 	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2634 	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2635 	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2636 		timer_active	= 1;
2637 		timer_expires	= icsk->icsk_timeout;
2638 	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2639 		timer_active	= 4;
2640 		timer_expires	= icsk->icsk_timeout;
2641 	} else if (timer_pending(&sk->sk_timer)) {
2642 		timer_active	= 2;
2643 		timer_expires	= sk->sk_timer.expires;
2644 	} else {
2645 		timer_active	= 0;
2646 		timer_expires = jiffies;
2647 	}
2648 
2649 	state = inet_sk_state_load(sk);
2650 	if (state == TCP_LISTEN)
2651 		rx_queue = READ_ONCE(sk->sk_ack_backlog);
2652 	else
2653 		/* Because we don't lock the socket,
2654 		 * we might find a transient negative value.
2655 		 */
2656 		rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2657 				      READ_ONCE(tp->copied_seq), 0);
2658 
2659 	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2660 			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2661 		i, src, srcp, dest, destp, state,
2662 		READ_ONCE(tp->write_seq) - tp->snd_una,
2663 		rx_queue,
2664 		timer_active,
2665 		jiffies_delta_to_clock_t(timer_expires - jiffies),
2666 		icsk->icsk_retransmits,
2667 		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2668 		icsk->icsk_probes_out,
2669 		sock_i_ino(sk),
2670 		refcount_read(&sk->sk_refcnt), sk,
2671 		jiffies_to_clock_t(icsk->icsk_rto),
2672 		jiffies_to_clock_t(icsk->icsk_ack.ato),
2673 		(icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2674 		tp->snd_cwnd,
2675 		state == TCP_LISTEN ?
2676 		    fastopenq->max_qlen :
2677 		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2678 }
2679 
2680 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2681 			       struct seq_file *f, int i)
2682 {
2683 	long delta = tw->tw_timer.expires - jiffies;
2684 	__be32 dest, src;
2685 	__u16 destp, srcp;
2686 
2687 	dest  = tw->tw_daddr;
2688 	src   = tw->tw_rcv_saddr;
2689 	destp = ntohs(tw->tw_dport);
2690 	srcp  = ntohs(tw->tw_sport);
2691 
2692 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2693 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2694 		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2695 		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2696 		refcount_read(&tw->tw_refcnt), tw);
2697 }
2698 
2699 #define TMPSZ 150
2700 
2701 static int tcp4_seq_show(struct seq_file *seq, void *v)
2702 {
2703 	struct tcp_iter_state *st;
2704 	struct sock *sk = v;
2705 
2706 	seq_setwidth(seq, TMPSZ - 1);
2707 	if (v == SEQ_START_TOKEN) {
2708 		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2709 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2710 			   "inode");
2711 		goto out;
2712 	}
2713 	st = seq->private;
2714 
2715 	if (sk->sk_state == TCP_TIME_WAIT)
2716 		get_timewait4_sock(v, seq, st->num);
2717 	else if (sk->sk_state == TCP_NEW_SYN_RECV)
2718 		get_openreq4(v, seq, st->num);
2719 	else
2720 		get_tcp4_sock(v, seq, st->num);
2721 out:
2722 	seq_pad(seq, '\n');
2723 	return 0;
2724 }
2725 
2726 #ifdef CONFIG_BPF_SYSCALL
2727 struct bpf_tcp_iter_state {
2728 	struct tcp_iter_state state;
2729 	unsigned int cur_sk;
2730 	unsigned int end_sk;
2731 	unsigned int max_sk;
2732 	struct sock **batch;
2733 	bool st_bucket_done;
2734 };
2735 
2736 struct bpf_iter__tcp {
2737 	__bpf_md_ptr(struct bpf_iter_meta *, meta);
2738 	__bpf_md_ptr(struct sock_common *, sk_common);
2739 	uid_t uid __aligned(8);
2740 };
2741 
2742 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
2743 			     struct sock_common *sk_common, uid_t uid)
2744 {
2745 	struct bpf_iter__tcp ctx;
2746 
2747 	meta->seq_num--;  /* skip SEQ_START_TOKEN */
2748 	ctx.meta = meta;
2749 	ctx.sk_common = sk_common;
2750 	ctx.uid = uid;
2751 	return bpf_iter_run_prog(prog, &ctx);
2752 }
2753 
2754 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
2755 {
2756 	while (iter->cur_sk < iter->end_sk)
2757 		sock_put(iter->batch[iter->cur_sk++]);
2758 }
2759 
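/* Grow the batch array to @new_batch_sz entries.  Any sockets still held
 * in the old batch are released first, so the caller re-walks the bucket
 * after a resize (see the "again" loop in bpf_iter_tcp_batch()).
 */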
2760 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
2761 				      unsigned int new_batch_sz)
2762 {
2763 	struct sock **new_batch;
2764 
2765 	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
2766 			     GFP_USER | __GFP_NOWARN);
2767 	if (!new_batch)
2768 		return -ENOMEM;
2769 
2770 	bpf_iter_tcp_put_batch(iter);
2771 	kvfree(iter->batch);
2772 	iter->batch = new_batch;
2773 	iter->max_sk = new_batch_sz;
2774 
2775 	return 0;
2776 }
2777 
2778 static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
2779 						 struct sock *start_sk)
2780 {
2781 	struct bpf_tcp_iter_state *iter = seq->private;
2782 	struct tcp_iter_state *st = &iter->state;
2783 	struct inet_connection_sock *icsk;
2784 	unsigned int expected = 1;
2785 	struct sock *sk;
2786 
2787 	sock_hold(start_sk);
2788 	iter->batch[iter->end_sk++] = start_sk;
2789 
2790 	icsk = inet_csk(start_sk);
2791 	inet_lhash2_for_each_icsk_continue(icsk) {
2792 		sk = (struct sock *)icsk;
2793 		if (seq_sk_match(seq, sk)) {
2794 			if (iter->end_sk < iter->max_sk) {
2795 				sock_hold(sk);
2796 				iter->batch[iter->end_sk++] = sk;
2797 			}
2798 			expected++;
2799 		}
2800 	}
2801 	spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock);
2802 
2803 	return expected;
2804 }
2805 
2806 static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
2807 						   struct sock *start_sk)
2808 {
2809 	struct bpf_tcp_iter_state *iter = seq->private;
2810 	struct tcp_iter_state *st = &iter->state;
2811 	struct hlist_nulls_node *node;
2812 	unsigned int expected = 1;
2813 	struct sock *sk;
2814 
2815 	sock_hold(start_sk);
2816 	iter->batch[iter->end_sk++] = start_sk;
2817 
2818 	sk = sk_nulls_next(start_sk);
2819 	sk_nulls_for_each_from(sk, node) {
2820 		if (seq_sk_match(seq, sk)) {
2821 			if (iter->end_sk < iter->max_sk) {
2822 				sock_hold(sk);
2823 				iter->batch[iter->end_sk++] = sk;
2824 			}
2825 			expected++;
2826 		}
2827 	}
2828 	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2829 
2830 	return expected;
2831 }
2832 
2833 static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
2834 {
2835 	struct bpf_tcp_iter_state *iter = seq->private;
2836 	struct tcp_iter_state *st = &iter->state;
2837 	unsigned int expected;
2838 	bool resized = false;
2839 	struct sock *sk;
2840 
2841 	/* The st->bucket is done.  Directly advance to the next
2842 	 * bucket instead of having tcp_seek_last_pos() skip entries
2843 	 * one by one in the current bucket, only to find out that it
2844 	 * has to advance to the next bucket anyway.
2845 	 */
2846 	if (iter->st_bucket_done) {
2847 		st->offset = 0;
2848 		st->bucket++;
2849 		if (st->state == TCP_SEQ_STATE_LISTENING &&
2850 		    st->bucket > tcp_hashinfo.lhash2_mask) {
2851 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2852 			st->bucket = 0;
2853 		}
2854 	}
2855 
2856 again:
2857 	/* Get a new batch */
2858 	iter->cur_sk = 0;
2859 	iter->end_sk = 0;
2860 	iter->st_bucket_done = false;
2861 
2862 	sk = tcp_seek_last_pos(seq);
2863 	if (!sk)
2864 		return NULL; /* Done */
2865 
2866 	if (st->state == TCP_SEQ_STATE_LISTENING)
2867 		expected = bpf_iter_tcp_listening_batch(seq, sk);
2868 	else
2869 		expected = bpf_iter_tcp_established_batch(seq, sk);
2870 
2871 	if (iter->end_sk == expected) {
2872 		iter->st_bucket_done = true;
2873 		return sk;
2874 	}
2875 
2876 	if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) {
2877 		resized = true;
2878 		goto again;
2879 	}
2880 
2881 	return sk;
2882 }
2883 
2884 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
2885 {
2886 	/* bpf iter does not support lseek, so it always
2887 	 * continues from where it was stop()-ped.
2888 	 */
2889 	if (*pos)
2890 		return bpf_iter_tcp_batch(seq);
2891 
2892 	return SEQ_START_TOKEN;
2893 }
2894 
2895 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2896 {
2897 	struct bpf_tcp_iter_state *iter = seq->private;
2898 	struct tcp_iter_state *st = &iter->state;
2899 	struct sock *sk;
2900 
2901 	/* Whenever seq_next() is called, the sk at iter->cur_sk has
2902 	 * been handled by seq_show(), so advance to the next sk in
2903 	 * the batch.
2904 	 */
2905 	if (iter->cur_sk < iter->end_sk) {
2906 		/* Keeping st->num consistent in tcp_iter_state.
2907 		 * bpf_iter_tcp does not use st->num.
2908 		 * meta.seq_num is used instead.
2909 		 */
2910 		st->num++;
2911 		/* Move st->offset to the next sk in the bucket such that
2912 		 * the future start() will resume at st->offset in
2913 		 * st->bucket.  See tcp_seek_last_pos().
2914 		 */
2915 		st->offset++;
2916 		sock_put(iter->batch[iter->cur_sk++]);
2917 	}
2918 
2919 	if (iter->cur_sk < iter->end_sk)
2920 		sk = iter->batch[iter->cur_sk];
2921 	else
2922 		sk = bpf_iter_tcp_batch(seq);
2923 
2924 	++*pos;
2925 	/* Keeping st->last_pos consistent in tcp_iter_state.
2926 	 * bpf iter does not do lseek, so st->last_pos always equals *pos.
2927 	 */
2928 	st->last_pos = *pos;
2929 	return sk;
2930 }
2931 
2932 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
2933 {
2934 	struct bpf_iter_meta meta;
2935 	struct bpf_prog *prog;
2936 	struct sock *sk = v;
2937 	bool slow;
2938 	uid_t uid;
2939 	int ret;
2940 
2941 	if (v == SEQ_START_TOKEN)
2942 		return 0;
2943 
2944 	if (sk_fullsock(sk))
2945 		slow = lock_sock_fast(sk);
2946 
2947 	if (unlikely(sk_unhashed(sk))) {
2948 		ret = SEQ_SKIP;
2949 		goto unlock;
2950 	}
2951 
2952 	if (sk->sk_state == TCP_TIME_WAIT) {
2953 		uid = 0;
2954 	} else if (sk->sk_state == TCP_NEW_SYN_RECV) {
2955 		const struct request_sock *req = v;
2956 
2957 		uid = from_kuid_munged(seq_user_ns(seq),
2958 				       sock_i_uid(req->rsk_listener));
2959 	} else {
2960 		uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
2961 	}
2962 
2963 	meta.seq = seq;
2964 	prog = bpf_iter_get_info(&meta, false);
2965 	ret = tcp_prog_seq_show(prog, &meta, v, uid);
2966 
2967 unlock:
2968 	if (sk_fullsock(sk))
2969 		unlock_sock_fast(sk, slow);
2970 	return ret;
2971 
2972 }
2973 
2974 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
2975 {
2976 	struct bpf_tcp_iter_state *iter = seq->private;
2977 	struct bpf_iter_meta meta;
2978 	struct bpf_prog *prog;
2979 
2980 	if (!v) {
2981 		meta.seq = seq;
2982 		prog = bpf_iter_get_info(&meta, true);
2983 		if (prog)
2984 			(void)tcp_prog_seq_show(prog, &meta, v, 0);
2985 	}
2986 
2987 	if (iter->cur_sk < iter->end_sk) {
2988 		bpf_iter_tcp_put_batch(iter);
2989 		iter->st_bucket_done = false;
2990 	}
2991 }
2992 
2993 static const struct seq_operations bpf_iter_tcp_seq_ops = {
2994 	.show		= bpf_iter_tcp_seq_show,
2995 	.start		= bpf_iter_tcp_seq_start,
2996 	.next		= bpf_iter_tcp_seq_next,
2997 	.stop		= bpf_iter_tcp_seq_stop,
2998 };
2999 #endif
3000 static unsigned short seq_file_family(const struct seq_file *seq)
3001 {
3002 	const struct tcp_seq_afinfo *afinfo;
3003 
3004 #ifdef CONFIG_BPF_SYSCALL
3005 	/* Iterated from bpf_iter.  Let the bpf prog filter instead. */
3006 	if (seq->op == &bpf_iter_tcp_seq_ops)
3007 		return AF_UNSPEC;
3008 #endif
3009 
3010 	/* Iterated from proc fs */
3011 	afinfo = pde_data(file_inode(seq->file));
3012 	return afinfo->family;
3013 }
3014 
3015 static const struct seq_operations tcp4_seq_ops = {
3016 	.show		= tcp4_seq_show,
3017 	.start		= tcp_seq_start,
3018 	.next		= tcp_seq_next,
3019 	.stop		= tcp_seq_stop,
3020 };
3021 
3022 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
3023 	.family		= AF_INET,
3024 };
3025 
3026 static int __net_init tcp4_proc_init_net(struct net *net)
3027 {
3028 	if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
3029 			sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
3030 		return -ENOMEM;
3031 	return 0;
3032 }
3033 
3034 static void __net_exit tcp4_proc_exit_net(struct net *net)
3035 {
3036 	remove_proc_entry("tcp", net->proc_net);
3037 }
3038 
3039 static struct pernet_operations tcp4_net_ops = {
3040 	.init = tcp4_proc_init_net,
3041 	.exit = tcp4_proc_exit_net,
3042 };
3043 
3044 int __init tcp4_proc_init(void)
3045 {
3046 	return register_pernet_subsys(&tcp4_net_ops);
3047 }
3048 
3049 void tcp4_proc_exit(void)
3050 {
3051 	unregister_pernet_subsys(&tcp4_net_ops);
3052 }
3053 #endif /* CONFIG_PROC_FS */
3054 
3055 /* @wake is one when sk_stream_write_space() calls us.
3056  * This sends EPOLLOUT only if notsent_bytes is below half the limit.
3057  * This mimics the strategy used in sock_def_write_space().
3058  */
3059 bool tcp_stream_memory_free(const struct sock *sk, int wake)
3060 {
3061 	const struct tcp_sock *tp = tcp_sk(sk);
3062 	u32 notsent_bytes = READ_ONCE(tp->write_seq) -
3063 			    READ_ONCE(tp->snd_nxt);
3064 
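	/* With @wake == 1, doubling notsent_bytes halves the effective
	 * threshold: EPOLLOUT is only signalled once the not-yet-sent
	 * data drops below half of tcp_notsent_lowat().
	 */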
3065 	return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
3066 }
3067 EXPORT_SYMBOL(tcp_stream_memory_free);
3068 
3069 struct proto tcp_prot = {
3070 	.name			= "TCP",
3071 	.owner			= THIS_MODULE,
3072 	.close			= tcp_close,
3073 	.pre_connect		= tcp_v4_pre_connect,
3074 	.connect		= tcp_v4_connect,
3075 	.disconnect		= tcp_disconnect,
3076 	.accept			= inet_csk_accept,
3077 	.ioctl			= tcp_ioctl,
3078 	.init			= tcp_v4_init_sock,
3079 	.destroy		= tcp_v4_destroy_sock,
3080 	.shutdown		= tcp_shutdown,
3081 	.setsockopt		= tcp_setsockopt,
3082 	.getsockopt		= tcp_getsockopt,
3083 	.bpf_bypass_getsockopt	= tcp_bpf_bypass_getsockopt,
3084 	.keepalive		= tcp_set_keepalive,
3085 	.recvmsg		= tcp_recvmsg,
3086 	.sendmsg		= tcp_sendmsg,
3087 	.sendpage		= tcp_sendpage,
3088 	.backlog_rcv		= tcp_v4_do_rcv,
3089 	.release_cb		= tcp_release_cb,
3090 	.hash			= inet_hash,
3091 	.unhash			= inet_unhash,
3092 	.get_port		= inet_csk_get_port,
3093 	.put_port		= inet_put_port,
3094 #ifdef CONFIG_BPF_SYSCALL
3095 	.psock_update_sk_prot	= tcp_bpf_update_proto,
3096 #endif
3097 	.enter_memory_pressure	= tcp_enter_memory_pressure,
3098 	.leave_memory_pressure	= tcp_leave_memory_pressure,
3099 	.stream_memory_free	= tcp_stream_memory_free,
3100 	.sockets_allocated	= &tcp_sockets_allocated,
3101 	.orphan_count		= &tcp_orphan_count,
3102 	.memory_allocated	= &tcp_memory_allocated,
3103 	.memory_pressure	= &tcp_memory_pressure,
3104 	.sysctl_mem		= sysctl_tcp_mem,
3105 	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
3106 	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
3107 	.max_header		= MAX_TCP_HEADER,
3108 	.obj_size		= sizeof(struct tcp_sock),
3109 	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
3110 	.twsk_prot		= &tcp_timewait_sock_ops,
3111 	.rsk_prot		= &tcp_request_sock_ops,
3112 	.h.hashinfo		= &tcp_hashinfo,
3113 	.no_autobind		= true,
3114 	.diag_destroy		= tcp_abort,
3115 };
3116 EXPORT_SYMBOL(tcp_prot);
3117 
3118 static void __net_exit tcp_sk_exit(struct net *net)
3119 {
3120 	struct inet_timewait_death_row *tcp_death_row = net->ipv4.tcp_death_row;
3121 
3122 	if (net->ipv4.tcp_congestion_control)
3123 		bpf_module_put(net->ipv4.tcp_congestion_control,
3124 			       net->ipv4.tcp_congestion_control->owner);
3125 	if (refcount_dec_and_test(&tcp_death_row->tw_refcount))
3126 		kfree(tcp_death_row);
3127 }
3128 
3129 static int __net_init tcp_sk_init(struct net *net)
3130 {
3131 	int cnt;
3132 
3133 	net->ipv4.sysctl_tcp_ecn = 2;
3134 	net->ipv4.sysctl_tcp_ecn_fallback = 1;
3135 
3136 	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
3137 	net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
3138 	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
3139 	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
3140 	net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
3141 
3142 	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
3143 	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
3144 	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
3145 
3146 	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
3147 	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
3148 	net->ipv4.sysctl_tcp_syncookies = 1;
3149 	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
3150 	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
3151 	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
3152 	net->ipv4.sysctl_tcp_orphan_retries = 0;
3153 	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
3154 	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
3155 	net->ipv4.sysctl_tcp_tw_reuse = 2;
3156 	net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
3157 
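	/* The TIME-WAIT death row is per netns and refcounted: the
	 * refcount starts at 1 for the netns itself and the structure is
	 * freed from tcp_sk_exit() once it drops to zero.
	 */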
3158 	net->ipv4.tcp_death_row = kzalloc(sizeof(struct inet_timewait_death_row), GFP_KERNEL);
3159 	if (!net->ipv4.tcp_death_row)
3160 		return -ENOMEM;
3161 	refcount_set(&net->ipv4.tcp_death_row->tw_refcount, 1);
3162 	cnt = tcp_hashinfo.ehash_mask + 1;
3163 	net->ipv4.tcp_death_row->sysctl_max_tw_buckets = cnt / 2;
3164 	net->ipv4.tcp_death_row->hashinfo = &tcp_hashinfo;
3165 
3166 	net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
3167 	net->ipv4.sysctl_tcp_sack = 1;
3168 	net->ipv4.sysctl_tcp_window_scaling = 1;
3169 	net->ipv4.sysctl_tcp_timestamps = 1;
3170 	net->ipv4.sysctl_tcp_early_retrans = 3;
3171 	net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
3172 	net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
3173 	net->ipv4.sysctl_tcp_retrans_collapse = 1;
3174 	net->ipv4.sysctl_tcp_max_reordering = 300;
3175 	net->ipv4.sysctl_tcp_dsack = 1;
3176 	net->ipv4.sysctl_tcp_app_win = 31;
3177 	net->ipv4.sysctl_tcp_adv_win_scale = 1;
3178 	net->ipv4.sysctl_tcp_frto = 2;
3179 	net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
3180 	/* This limits the percentage of the congestion window which we
3181 	 * will allow a single TSO frame to consume.  Building TSO frames
3182 	 * which are too large can cause TCP streams to be bursty.
3183 	 */
3184 	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
3185 	/* Default TSQ limit of 16 TSO segments */
3186 	net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
3187 	/* rfc5961 challenge ack rate limiting */
3188 	net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
3189 	net->ipv4.sysctl_tcp_min_tso_segs = 2;
3190 	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
3191 	net->ipv4.sysctl_tcp_autocorking = 1;
3192 	net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
3193 	net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
3194 	net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
3195 	if (net != &init_net) {
3196 		memcpy(net->ipv4.sysctl_tcp_rmem,
3197 		       init_net.ipv4.sysctl_tcp_rmem,
3198 		       sizeof(init_net.ipv4.sysctl_tcp_rmem));
3199 		memcpy(net->ipv4.sysctl_tcp_wmem,
3200 		       init_net.ipv4.sysctl_tcp_wmem,
3201 		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
3202 	}
3203 	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
3204 	net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
3205 	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
3206 	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
3207 	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
3208 	atomic_set(&net->ipv4.tfo_active_disable_times, 0);
3209 
3210 	/* Reno is always built in */
3211 	if (!net_eq(net, &init_net) &&
3212 	    bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
3213 			       init_net.ipv4.tcp_congestion_control->owner))
3214 		net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
3215 	else
3216 		net->ipv4.tcp_congestion_control = &tcp_reno;
3217 
3218 	return 0;
3219 }
3220 
3221 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
3222 {
3223 	struct net *net;
3224 
3225 	list_for_each_entry(net, net_exit_list, exit_list)
3226 		tcp_fastopen_ctx_destroy(net);
3227 }
3228 
3229 static struct pernet_operations __net_initdata tcp_sk_ops = {
3230        .init	   = tcp_sk_init,
3231        .exit	   = tcp_sk_exit,
3232        .exit_batch = tcp_sk_exit_batch,
3233 };
3234 
3235 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3236 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
3237 		     struct sock_common *sk_common, uid_t uid)
3238 
3239 #define INIT_BATCH_SZ 16
3240 
3241 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
3242 {
3243 	struct bpf_tcp_iter_state *iter = priv_data;
3244 	int err;
3245 
3246 	err = bpf_iter_init_seq_net(priv_data, aux);
3247 	if (err)
3248 		return err;
3249 
3250 	err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ);
3251 	if (err) {
3252 		bpf_iter_fini_seq_net(priv_data);
3253 		return err;
3254 	}
3255 
3256 	return 0;
3257 }
3258 
3259 static void bpf_iter_fini_tcp(void *priv_data)
3260 {
3261 	struct bpf_tcp_iter_state *iter = priv_data;
3262 
3263 	bpf_iter_fini_seq_net(priv_data);
3264 	kvfree(iter->batch);
3265 }
3266 
3267 static const struct bpf_iter_seq_info tcp_seq_info = {
3268 	.seq_ops		= &bpf_iter_tcp_seq_ops,
3269 	.init_seq_private	= bpf_iter_init_tcp,
3270 	.fini_seq_private	= bpf_iter_fini_tcp,
3271 	.seq_priv_size		= sizeof(struct bpf_tcp_iter_state),
3272 };
3273 
3274 static const struct bpf_func_proto *
3275 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
3276 			    const struct bpf_prog *prog)
3277 {
3278 	switch (func_id) {
3279 	case BPF_FUNC_setsockopt:
3280 		return &bpf_sk_setsockopt_proto;
3281 	case BPF_FUNC_getsockopt:
3282 		return &bpf_sk_getsockopt_proto;
3283 	default:
3284 		return NULL;
3285 	}
3286 }
3287 
3288 static struct bpf_iter_reg tcp_reg_info = {
3289 	.target			= "tcp",
3290 	.ctx_arg_info_size	= 1,
3291 	.ctx_arg_info		= {
3292 		{ offsetof(struct bpf_iter__tcp, sk_common),
3293 		  PTR_TO_BTF_ID_OR_NULL },
3294 	},
3295 	.get_func_proto		= bpf_iter_tcp_get_func_proto,
3296 	.seq_info		= &tcp_seq_info,
3297 };
3298 
3299 static void __init bpf_iter_register(void)
3300 {
3301 	tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3302 	if (bpf_iter_reg_target(&tcp_reg_info))
3303 		pr_warn("Warning: could not register bpf iterator tcp\n");
3304 }
3305 
3306 #endif
3307 
3308 void __init tcp_v4_init(void)
3309 {
3310 	int cpu, res;
3311 
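	/* Create one kernel-internal TCP control socket per possible CPU
	 * (stored in the ipv4_tcp_sk per-cpu variable).  These are used to
	 * transmit control packets such as the RSTs and ACKs referred to
	 * in the comment below.
	 */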
3312 	for_each_possible_cpu(cpu) {
3313 		struct sock *sk;
3314 
3315 		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
3316 					   IPPROTO_TCP, &init_net);
3317 		if (res)
3318 			panic("Failed to create the TCP control socket.\n");
3319 		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
3320 
3321 		/* Please enforce IP_DF and IPID==0 for RST and
3322 		 * ACK sent in SYN-RECV and TIME-WAIT state.
3323 		 */
3324 		inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
3325 
3326 		per_cpu(ipv4_tcp_sk, cpu) = sk;
3327 	}
3328 	if (register_pernet_subsys(&tcp_sk_ops))
3329 		panic("Failed to create the TCP control socket.\n");
3330 
3331 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3332 	bpf_iter_register();
3333 #endif
3334 }
3335