xref: /openbmc/linux/net/ipv4/tcp_ipv4.c (revision 07d9a767)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		Implementation of the Transmission Control Protocol(TCP).
8  *
9  *		IPv4 specific functions
10  *
11  *		code split from:
12  *		linux/ipv4/tcp.c
13  *		linux/ipv4/tcp_input.c
14  *		linux/ipv4/tcp_output.c
15  *
16  *		See tcp.c for author information
17  */
18 
19 /*
20  * Changes:
21  *		David S. Miller	:	New socket lookup architecture.
22  *					This code is dedicated to John Dyson.
23  *		David S. Miller :	Change semantics of established hash,
24  *					half is devoted to TIME_WAIT sockets
25  *					and the rest go in the other half.
26  *		Andi Kleen :		Add support for syncookies and fixed
27  *					some bugs: ip options weren't passed to
28  *					the TCP layer, missed a check for an
29  *					ACK bit.
30  *		Andi Kleen :		Implemented fast path mtu discovery.
31  *	     				Fixed many serious bugs in the
32  *					request_sock handling and moved
33  *					most of it into the af independent code.
34  *					Added tail drop and some other bugfixes.
35  *					Added new listen semantics.
36  *		Mike McLagan	:	Routing by source
37  *	Juan Jose Ciarlante:		ip_dynaddr bits
38  *		Andi Kleen:		various fixes.
39  *	Vitaly E. Lavrov	:	Transparent proxy revived after year
40  *					coma.
41  *	Andi Kleen		:	Fix new listen.
42  *	Andi Kleen		:	Fix accept error reporting.
43  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
44  *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
45  *					a single port at the same time.
46  */
47 
48 #define pr_fmt(fmt) "TCP: " fmt
49 
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
60 
61 #include <net/net_namespace.h>
62 #include <net/icmp.h>
63 #include <net/inet_hashtables.h>
64 #include <net/tcp.h>
65 #include <net/transp_v6.h>
66 #include <net/ipv6.h>
67 #include <net/inet_common.h>
68 #include <net/timewait_sock.h>
69 #include <net/xfrm.h>
70 #include <net/secure_seq.h>
71 #include <net/busy_poll.h>
72 
73 #include <linux/inet.h>
74 #include <linux/ipv6.h>
75 #include <linux/stddef.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
78 #include <linux/inetdevice.h>
79 #include <linux/btf_ids.h>
80 
81 #include <crypto/hash.h>
82 #include <linux/scatterlist.h>
83 
84 #include <trace/events/tcp.h>
85 
86 #ifdef CONFIG_TCP_MD5SIG
87 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
88 			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
89 #endif
90 
91 struct inet_hashinfo tcp_hashinfo;
92 EXPORT_SYMBOL(tcp_hashinfo);
93 
94 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
95 {
96 	return secure_tcp_seq(ip_hdr(skb)->daddr,
97 			      ip_hdr(skb)->saddr,
98 			      tcp_hdr(skb)->dest,
99 			      tcp_hdr(skb)->source);
100 }
101 
102 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
103 {
104 	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
105 }
106 
107 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
108 {
109 	const struct inet_timewait_sock *tw = inet_twsk(sktw);
110 	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
111 	struct tcp_sock *tp = tcp_sk(sk);
112 	int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;
113 
114 	if (reuse == 2) {
115 		/* Still does not detect *everything* that goes through
116 		 * lo, since we require a loopback src or dst address
117 		 * or direct binding to 'lo' interface.
118 		 */
119 		bool loopback = false;
120 		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
121 			loopback = true;
122 #if IS_ENABLED(CONFIG_IPV6)
123 		if (tw->tw_family == AF_INET6) {
124 			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
125 			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
126 			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
127 			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
128 				loopback = true;
129 		} else
130 #endif
131 		{
132 			if (ipv4_is_loopback(tw->tw_daddr) ||
133 			    ipv4_is_loopback(tw->tw_rcv_saddr))
134 				loopback = true;
135 		}
136 		if (!loopback)
137 			reuse = 0;
138 	}
139 
140 	/* With PAWS, it is safe from the viewpoint
141 	   of data integrity. Even without PAWS it is safe provided sequence
142 	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.
143 
144 	   Actually, the idea is close to VJ's one, only timestamp cache is
145 	   held not per host, but per port pair and TW bucket is used as state
146 	   holder.
147 
148 	   If TW bucket has been already destroyed we fall back to VJ's scheme
149 	   and use initial timestamp retrieved from peer table.
150 	 */
151 	if (tcptw->tw_ts_recent_stamp &&
152 	    (!twp || (reuse && time_after32(ktime_get_seconds(),
153 					    tcptw->tw_ts_recent_stamp)))) {
154 		/* In case of repair and re-using TIME-WAIT sockets we still
155 		 * want to be sure that it is safe as above but honor the
156 		 * sequence numbers and time stamps set as part of the repair
157 		 * process.
158 		 *
159 		 * Without this check re-using a TIME-WAIT socket with TCP
160 		 * repair would accumulate a -1 on the repair assigned
161 		 * sequence number. The first time it is reused the sequence
162 		 * is -1, the second time -2, etc. This fixes that issue
163 		 * without appearing to create any others.
164 		 */
165 		if (likely(!tp->repair)) {
166 			u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
167 
168 			if (!seq)
169 				seq = 1;
170 			WRITE_ONCE(tp->write_seq, seq);
171 			tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
172 			tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
173 		}
174 		sock_hold(sktw);
175 		return 1;
176 	}
177 
178 	return 0;
179 }
180 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
181 
182 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
183 			      int addr_len)
184 {
185 	/* This check is replicated from tcp_v4_connect() and intended to
186 	 * prevent BPF program called below from accessing bytes that are out
187 	 * of the bound specified by user in addr_len.
188 	 */
189 	if (addr_len < sizeof(struct sockaddr_in))
190 		return -EINVAL;
191 
192 	sock_owned_by_me(sk);
193 
194 	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
195 }
196 
197 /* This will initiate an outgoing connection. */
198 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
199 {
200 	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
201 	struct inet_sock *inet = inet_sk(sk);
202 	struct tcp_sock *tp = tcp_sk(sk);
203 	__be16 orig_sport, orig_dport;
204 	__be32 daddr, nexthop;
205 	struct flowi4 *fl4;
206 	struct rtable *rt;
207 	int err;
208 	struct ip_options_rcu *inet_opt;
209 	struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
210 
211 	if (addr_len < sizeof(struct sockaddr_in))
212 		return -EINVAL;
213 
214 	if (usin->sin_family != AF_INET)
215 		return -EAFNOSUPPORT;
216 
217 	nexthop = daddr = usin->sin_addr.s_addr;
218 	inet_opt = rcu_dereference_protected(inet->inet_opt,
219 					     lockdep_sock_is_held(sk));
220 	if (inet_opt && inet_opt->opt.srr) {
221 		if (!daddr)
222 			return -EINVAL;
223 		nexthop = inet_opt->opt.faddr;
224 	}
225 
226 	orig_sport = inet->inet_sport;
227 	orig_dport = usin->sin_port;
228 	fl4 = &inet->cork.fl.u.ip4;
229 	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
230 			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
231 			      IPPROTO_TCP,
232 			      orig_sport, orig_dport, sk);
233 	if (IS_ERR(rt)) {
234 		err = PTR_ERR(rt);
235 		if (err == -ENETUNREACH)
236 			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
237 		return err;
238 	}
239 
240 	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
241 		ip_rt_put(rt);
242 		return -ENETUNREACH;
243 	}
244 
245 	if (!inet_opt || !inet_opt->opt.srr)
246 		daddr = fl4->daddr;
247 
248 	if (!inet->inet_saddr)
249 		inet->inet_saddr = fl4->saddr;
250 	sk_rcv_saddr_set(sk, inet->inet_saddr);
251 
252 	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
253 		/* Reset inherited state */
254 		tp->rx_opt.ts_recent	   = 0;
255 		tp->rx_opt.ts_recent_stamp = 0;
256 		if (likely(!tp->repair))
257 			WRITE_ONCE(tp->write_seq, 0);
258 	}
259 
260 	inet->inet_dport = usin->sin_port;
261 	sk_daddr_set(sk, daddr);
262 
263 	inet_csk(sk)->icsk_ext_hdr_len = 0;
264 	if (inet_opt)
265 		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
266 
267 	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
268 
269 	/* Socket identity is still unknown (sport may be zero).
270 	 * However we set state to SYN-SENT and not releasing socket
271 	 * lock select source port, enter ourselves into the hash tables and
272 	 * complete initialization after this.
273 	 */
274 	tcp_set_state(sk, TCP_SYN_SENT);
275 	err = inet_hash_connect(tcp_death_row, sk);
276 	if (err)
277 		goto failure;
278 
279 	sk_set_txhash(sk);
280 
281 	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
282 			       inet->inet_sport, inet->inet_dport, sk);
283 	if (IS_ERR(rt)) {
284 		err = PTR_ERR(rt);
285 		rt = NULL;
286 		goto failure;
287 	}
288 	/* OK, now commit destination to socket.  */
289 	sk->sk_gso_type = SKB_GSO_TCPV4;
290 	sk_setup_caps(sk, &rt->dst);
291 	rt = NULL;
292 
293 	if (likely(!tp->repair)) {
294 		if (!tp->write_seq)
295 			WRITE_ONCE(tp->write_seq,
296 				   secure_tcp_seq(inet->inet_saddr,
297 						  inet->inet_daddr,
298 						  inet->inet_sport,
299 						  usin->sin_port));
300 		tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
301 						 inet->inet_saddr,
302 						 inet->inet_daddr);
303 	}
304 
305 	inet->inet_id = prandom_u32();
306 
307 	if (tcp_fastopen_defer_connect(sk, &err))
308 		return err;
309 	if (err)
310 		goto failure;
311 
312 	err = tcp_connect(sk);
313 
314 	if (err)
315 		goto failure;
316 
317 	return 0;
318 
319 failure:
320 	/*
321 	 * This unhashes the socket and releases the local port,
322 	 * if necessary.
323 	 */
324 	tcp_set_state(sk, TCP_CLOSE);
325 	ip_rt_put(rt);
326 	sk->sk_route_caps = 0;
327 	inet->inet_dport = 0;
328 	return err;
329 }
330 EXPORT_SYMBOL(tcp_v4_connect);
331 
332 /*
333  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
334  * It can be called through tcp_release_cb() if socket was owned by user
335  * at the time tcp_v4_err() was called to handle ICMP message.
336  */
337 void tcp_v4_mtu_reduced(struct sock *sk)
338 {
339 	struct inet_sock *inet = inet_sk(sk);
340 	struct dst_entry *dst;
341 	u32 mtu;
342 
343 	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
344 		return;
345 	mtu = tcp_sk(sk)->mtu_info;
346 	dst = inet_csk_update_pmtu(sk, mtu);
347 	if (!dst)
348 		return;
349 
350 	/* Something is about to be wrong... Remember soft error
351 	 * for the case, if this connection will not able to recover.
352 	 */
353 	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
354 		sk->sk_err_soft = EMSGSIZE;
355 
356 	mtu = dst_mtu(dst);
357 
358 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
359 	    ip_sk_accept_pmtu(sk) &&
360 	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
361 		tcp_sync_mss(sk, mtu);
362 
363 		/* Resend the TCP packet because it's
364 		 * clear that the old packet has been
365 		 * dropped. This is the new "fast" path mtu
366 		 * discovery.
367 		 */
368 		tcp_simple_retransmit(sk);
369 	} /* else let the usual retransmit timer handle it */
370 }
371 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
372 
373 static void do_redirect(struct sk_buff *skb, struct sock *sk)
374 {
375 	struct dst_entry *dst = __sk_dst_check(sk, 0);
376 
377 	if (dst)
378 		dst->ops->redirect(dst, sk, skb);
379 }
380 
381 
382 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
383 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
384 {
385 	struct request_sock *req = inet_reqsk(sk);
386 	struct net *net = sock_net(sk);
387 
388 	/* ICMPs are not backlogged, hence we cannot get
389 	 * an established socket here.
390 	 */
391 	if (seq != tcp_rsk(req)->snt_isn) {
392 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
393 	} else if (abort) {
394 		/*
395 		 * Still in SYN_RECV, just remove it silently.
396 		 * There is no good way to pass the error to the newly
397 		 * created socket, and POSIX does not want network
398 		 * errors returned from accept().
399 		 */
400 		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
401 		tcp_listendrop(req->rsk_listener);
402 	}
403 	reqsk_put(req);
404 }
405 EXPORT_SYMBOL(tcp_req_err);
406 
407 /* TCP-LD (RFC 6069) logic */
408 void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
409 {
410 	struct inet_connection_sock *icsk = inet_csk(sk);
411 	struct tcp_sock *tp = tcp_sk(sk);
412 	struct sk_buff *skb;
413 	s32 remaining;
414 	u32 delta_us;
415 
416 	if (sock_owned_by_user(sk))
417 		return;
418 
419 	if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
420 	    !icsk->icsk_backoff)
421 		return;
422 
423 	skb = tcp_rtx_queue_head(sk);
424 	if (WARN_ON_ONCE(!skb))
425 		return;
426 
427 	icsk->icsk_backoff--;
428 	icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
429 	icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
430 
431 	tcp_mstamp_refresh(tp);
432 	delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
433 	remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
434 
435 	if (remaining > 0) {
436 		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
437 					  remaining, TCP_RTO_MAX);
438 	} else {
439 		/* RTO revert clocked out retransmission.
440 		 * Will retransmit now.
441 		 */
442 		tcp_retransmit_timer(sk);
443 	}
444 }
445 EXPORT_SYMBOL(tcp_ld_RTO_revert);
446 
447 /*
448  * This routine is called by the ICMP module when it gets some
449  * sort of error condition.  If err < 0 then the socket should
450  * be closed and the error returned to the user.  If err > 0
451  * it's just the icmp type << 8 | icmp code.  After adjustment
452  * header points to the first 8 bytes of the tcp header.  We need
453  * to find the appropriate port.
454  *
455  * The locking strategy used here is very "optimistic". When
456  * someone else accesses the socket the ICMP is just dropped
457  * and for some paths there is no check at all.
458  * A more general error queue to queue errors for later handling
459  * is probably better.
460  *
461  */
462 
463 int tcp_v4_err(struct sk_buff *skb, u32 info)
464 {
465 	const struct iphdr *iph = (const struct iphdr *)skb->data;
466 	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
467 	struct tcp_sock *tp;
468 	struct inet_sock *inet;
469 	const int type = icmp_hdr(skb)->type;
470 	const int code = icmp_hdr(skb)->code;
471 	struct sock *sk;
472 	struct request_sock *fastopen;
473 	u32 seq, snd_una;
474 	int err;
475 	struct net *net = dev_net(skb->dev);
476 
477 	sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
478 				       th->dest, iph->saddr, ntohs(th->source),
479 				       inet_iif(skb), 0);
480 	if (!sk) {
481 		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
482 		return -ENOENT;
483 	}
484 	if (sk->sk_state == TCP_TIME_WAIT) {
485 		inet_twsk_put(inet_twsk(sk));
486 		return 0;
487 	}
488 	seq = ntohl(th->seq);
489 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
490 		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
491 				     type == ICMP_TIME_EXCEEDED ||
492 				     (type == ICMP_DEST_UNREACH &&
493 				      (code == ICMP_NET_UNREACH ||
494 				       code == ICMP_HOST_UNREACH)));
495 		return 0;
496 	}
497 
498 	bh_lock_sock(sk);
499 	/* If too many ICMPs get dropped on busy
500 	 * servers this needs to be solved differently.
501 	 * We do take care of PMTU discovery (RFC1191) special case :
502 	 * we can receive locally generated ICMP messages while socket is held.
503 	 */
504 	if (sock_owned_by_user(sk)) {
505 		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
506 			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
507 	}
508 	if (sk->sk_state == TCP_CLOSE)
509 		goto out;
510 
511 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
512 		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
513 		goto out;
514 	}
515 
516 	tp = tcp_sk(sk);
517 	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
518 	fastopen = rcu_dereference(tp->fastopen_rsk);
519 	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
520 	if (sk->sk_state != TCP_LISTEN &&
521 	    !between(seq, snd_una, tp->snd_nxt)) {
522 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
523 		goto out;
524 	}
525 
526 	switch (type) {
527 	case ICMP_REDIRECT:
528 		if (!sock_owned_by_user(sk))
529 			do_redirect(skb, sk);
530 		goto out;
531 	case ICMP_SOURCE_QUENCH:
532 		/* Just silently ignore these. */
533 		goto out;
534 	case ICMP_PARAMETERPROB:
535 		err = EPROTO;
536 		break;
537 	case ICMP_DEST_UNREACH:
538 		if (code > NR_ICMP_UNREACH)
539 			goto out;
540 
541 		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
542 			/* We are not interested in TCP_LISTEN and open_requests
543 			 * (SYN-ACKs send out by Linux are always <576bytes so
544 			 * they should go through unfragmented).
545 			 */
546 			if (sk->sk_state == TCP_LISTEN)
547 				goto out;
548 
549 			tp->mtu_info = info;
550 			if (!sock_owned_by_user(sk)) {
551 				tcp_v4_mtu_reduced(sk);
552 			} else {
553 				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
554 					sock_hold(sk);
555 			}
556 			goto out;
557 		}
558 
559 		err = icmp_err_convert[code].errno;
560 		/* check if this ICMP message allows revert of backoff.
561 		 * (see RFC 6069)
562 		 */
563 		if (!fastopen &&
564 		    (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
565 			tcp_ld_RTO_revert(sk, seq);
566 		break;
567 	case ICMP_TIME_EXCEEDED:
568 		err = EHOSTUNREACH;
569 		break;
570 	default:
571 		goto out;
572 	}
573 
574 	switch (sk->sk_state) {
575 	case TCP_SYN_SENT:
576 	case TCP_SYN_RECV:
577 		/* Only in fast or simultaneous open. If a fast open socket is
578 		 * already accepted it is treated as a connected one below.
579 		 */
580 		if (fastopen && !fastopen->sk)
581 			break;
582 
583 		ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
584 
585 		if (!sock_owned_by_user(sk)) {
586 			sk->sk_err = err;
587 
588 			sk->sk_error_report(sk);
589 
590 			tcp_done(sk);
591 		} else {
592 			sk->sk_err_soft = err;
593 		}
594 		goto out;
595 	}
596 
597 	/* If we've already connected we will keep trying
598 	 * until we time out, or the user gives up.
599 	 *
600 	 * rfc1122 4.2.3.9 allows to consider as hard errors
601 	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
602 	 * but it is obsoleted by pmtu discovery).
603 	 *
604 	 * Note, that in modern internet, where routing is unreliable
605 	 * and in each dark corner broken firewalls sit, sending random
606 	 * errors ordered by their masters even this two messages finally lose
607 	 * their original sense (even Linux sends invalid PORT_UNREACHs)
608 	 *
609 	 * Now we are in compliance with RFCs.
610 	 *							--ANK (980905)
611 	 */
612 
613 	inet = inet_sk(sk);
614 	if (!sock_owned_by_user(sk) && inet->recverr) {
615 		sk->sk_err = err;
616 		sk->sk_error_report(sk);
617 	} else	{ /* Only an error on timeout */
618 		sk->sk_err_soft = err;
619 	}
620 
621 out:
622 	bh_unlock_sock(sk);
623 	sock_put(sk);
624 	return 0;
625 }
626 
627 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
628 {
629 	struct tcphdr *th = tcp_hdr(skb);
630 
631 	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
632 	skb->csum_start = skb_transport_header(skb) - skb->head;
633 	skb->csum_offset = offsetof(struct tcphdr, check);
634 }
635 
636 /* This routine computes an IPv4 TCP checksum. */
637 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
638 {
639 	const struct inet_sock *inet = inet_sk(sk);
640 
641 	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
642 }
643 EXPORT_SYMBOL(tcp_v4_send_check);
644 
645 /*
646  *	This routine will send an RST to the other tcp.
647  *
648  *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
649  *		      for reset.
650  *	Answer: if a packet caused RST, it is not for a socket
651  *		existing in our system, if it is matched to a socket,
652  *		it is just duplicate segment or bug in other side's TCP.
653  *		So that we build reply only basing on parameters
654  *		arrived with segment.
655  *	Exception: precedence violation. We do not implement it in any case.
656  */
657 
658 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
659 {
660 	const struct tcphdr *th = tcp_hdr(skb);
661 	struct {
662 		struct tcphdr th;
663 #ifdef CONFIG_TCP_MD5SIG
664 		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
665 #endif
666 	} rep;
667 	struct ip_reply_arg arg;
668 #ifdef CONFIG_TCP_MD5SIG
669 	struct tcp_md5sig_key *key = NULL;
670 	const __u8 *hash_location = NULL;
671 	unsigned char newhash[16];
672 	int genhash;
673 	struct sock *sk1 = NULL;
674 #endif
675 	u64 transmit_time = 0;
676 	struct sock *ctl_sk;
677 	struct net *net;
678 
679 	/* Never send a reset in response to a reset. */
680 	if (th->rst)
681 		return;
682 
683 	/* If sk not NULL, it means we did a successful lookup and incoming
684 	 * route had to be correct. prequeue might have dropped our dst.
685 	 */
686 	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
687 		return;
688 
689 	/* Swap the send and the receive. */
690 	memset(&rep, 0, sizeof(rep));
691 	rep.th.dest   = th->source;
692 	rep.th.source = th->dest;
693 	rep.th.doff   = sizeof(struct tcphdr) / 4;
694 	rep.th.rst    = 1;
695 
696 	if (th->ack) {
697 		rep.th.seq = th->ack_seq;
698 	} else {
699 		rep.th.ack = 1;
700 		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
701 				       skb->len - (th->doff << 2));
702 	}
703 
704 	memset(&arg, 0, sizeof(arg));
705 	arg.iov[0].iov_base = (unsigned char *)&rep;
706 	arg.iov[0].iov_len  = sizeof(rep.th);
707 
708 	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
709 #ifdef CONFIG_TCP_MD5SIG
710 	rcu_read_lock();
711 	hash_location = tcp_parse_md5sig_option(th);
712 	if (sk && sk_fullsock(sk)) {
713 		const union tcp_md5_addr *addr;
714 		int l3index;
715 
716 		/* sdif set, means packet ingressed via a device
717 		 * in an L3 domain and inet_iif is set to it.
718 		 */
719 		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
720 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
721 		key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
722 	} else if (hash_location) {
723 		const union tcp_md5_addr *addr;
724 		int sdif = tcp_v4_sdif(skb);
725 		int dif = inet_iif(skb);
726 		int l3index;
727 
728 		/*
729 		 * active side is lost. Try to find listening socket through
730 		 * source port, and then find md5 key through listening socket.
731 		 * we are not loose security here:
732 		 * Incoming packet is checked with md5 hash with finding key,
733 		 * no RST generated if md5 hash doesn't match.
734 		 */
735 		sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
736 					     ip_hdr(skb)->saddr,
737 					     th->source, ip_hdr(skb)->daddr,
738 					     ntohs(th->source), dif, sdif);
739 		/* don't send rst if it can't find key */
740 		if (!sk1)
741 			goto out;
742 
743 		/* sdif set, means packet ingressed via a device
744 		 * in an L3 domain and dif is set to it.
745 		 */
746 		l3index = sdif ? dif : 0;
747 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
748 		key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
749 		if (!key)
750 			goto out;
751 
752 
753 		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
754 		if (genhash || memcmp(hash_location, newhash, 16) != 0)
755 			goto out;
756 
757 	}
758 
759 	if (key) {
760 		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
761 				   (TCPOPT_NOP << 16) |
762 				   (TCPOPT_MD5SIG << 8) |
763 				   TCPOLEN_MD5SIG);
764 		/* Update length and the length the header thinks exists */
765 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
766 		rep.th.doff = arg.iov[0].iov_len / 4;
767 
768 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
769 				     key, ip_hdr(skb)->saddr,
770 				     ip_hdr(skb)->daddr, &rep.th);
771 	}
772 #endif
773 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
774 				      ip_hdr(skb)->saddr, /* XXX */
775 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
776 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
777 	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
778 
779 	/* When socket is gone, all binding information is lost.
780 	 * routing might fail in this case. No choice here, if we choose to force
781 	 * input interface, we will misroute in case of asymmetric route.
782 	 */
783 	if (sk) {
784 		arg.bound_dev_if = sk->sk_bound_dev_if;
785 		if (sk_fullsock(sk))
786 			trace_tcp_send_reset(sk, skb);
787 	}
788 
789 	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
790 		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));
791 
792 	arg.tos = ip_hdr(skb)->tos;
793 	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
794 	local_bh_disable();
795 	ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
796 	if (sk) {
797 		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
798 				   inet_twsk(sk)->tw_mark : sk->sk_mark;
799 		ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
800 				   inet_twsk(sk)->tw_priority : sk->sk_priority;
801 		transmit_time = tcp_transmit_time(sk);
802 	}
803 	ip_send_unicast_reply(ctl_sk,
804 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
805 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
806 			      &arg, arg.iov[0].iov_len,
807 			      transmit_time);
808 
809 	ctl_sk->sk_mark = 0;
810 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
811 	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
812 	local_bh_enable();
813 
814 #ifdef CONFIG_TCP_MD5SIG
815 out:
816 	rcu_read_unlock();
817 #endif
818 }
819 
820 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
821    outside socket context is ugly, certainly. What can I do?
822  */
823 
824 static void tcp_v4_send_ack(const struct sock *sk,
825 			    struct sk_buff *skb, u32 seq, u32 ack,
826 			    u32 win, u32 tsval, u32 tsecr, int oif,
827 			    struct tcp_md5sig_key *key,
828 			    int reply_flags, u8 tos)
829 {
830 	const struct tcphdr *th = tcp_hdr(skb);
831 	struct {
832 		struct tcphdr th;
833 		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
834 #ifdef CONFIG_TCP_MD5SIG
835 			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
836 #endif
837 			];
838 	} rep;
839 	struct net *net = sock_net(sk);
840 	struct ip_reply_arg arg;
841 	struct sock *ctl_sk;
842 	u64 transmit_time;
843 
844 	memset(&rep.th, 0, sizeof(struct tcphdr));
845 	memset(&arg, 0, sizeof(arg));
846 
847 	arg.iov[0].iov_base = (unsigned char *)&rep;
848 	arg.iov[0].iov_len  = sizeof(rep.th);
849 	if (tsecr) {
850 		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
851 				   (TCPOPT_TIMESTAMP << 8) |
852 				   TCPOLEN_TIMESTAMP);
853 		rep.opt[1] = htonl(tsval);
854 		rep.opt[2] = htonl(tsecr);
855 		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
856 	}
857 
858 	/* Swap the send and the receive. */
859 	rep.th.dest    = th->source;
860 	rep.th.source  = th->dest;
861 	rep.th.doff    = arg.iov[0].iov_len / 4;
862 	rep.th.seq     = htonl(seq);
863 	rep.th.ack_seq = htonl(ack);
864 	rep.th.ack     = 1;
865 	rep.th.window  = htons(win);
866 
867 #ifdef CONFIG_TCP_MD5SIG
868 	if (key) {
869 		int offset = (tsecr) ? 3 : 0;
870 
871 		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
872 					  (TCPOPT_NOP << 16) |
873 					  (TCPOPT_MD5SIG << 8) |
874 					  TCPOLEN_MD5SIG);
875 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
876 		rep.th.doff = arg.iov[0].iov_len/4;
877 
878 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
879 				    key, ip_hdr(skb)->saddr,
880 				    ip_hdr(skb)->daddr, &rep.th);
881 	}
882 #endif
883 	arg.flags = reply_flags;
884 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
885 				      ip_hdr(skb)->saddr, /* XXX */
886 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
887 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
888 	if (oif)
889 		arg.bound_dev_if = oif;
890 	arg.tos = tos;
891 	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
892 	local_bh_disable();
893 	ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
894 	ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
895 			   inet_twsk(sk)->tw_mark : sk->sk_mark;
896 	ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
897 			   inet_twsk(sk)->tw_priority : sk->sk_priority;
898 	transmit_time = tcp_transmit_time(sk);
899 	ip_send_unicast_reply(ctl_sk,
900 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
901 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
902 			      &arg, arg.iov[0].iov_len,
903 			      transmit_time);
904 
905 	ctl_sk->sk_mark = 0;
906 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
907 	local_bh_enable();
908 }
909 
910 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
911 {
912 	struct inet_timewait_sock *tw = inet_twsk(sk);
913 	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
914 
915 	tcp_v4_send_ack(sk, skb,
916 			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
917 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
918 			tcp_time_stamp_raw() + tcptw->tw_ts_offset,
919 			tcptw->tw_ts_recent,
920 			tw->tw_bound_dev_if,
921 			tcp_twsk_md5_key(tcptw),
922 			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
923 			tw->tw_tos
924 			);
925 
926 	inet_twsk_put(tw);
927 }
928 
929 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
930 				  struct request_sock *req)
931 {
932 	const union tcp_md5_addr *addr;
933 	int l3index;
934 
935 	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
936 	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
937 	 */
938 	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
939 					     tcp_sk(sk)->snd_nxt;
940 
941 	/* RFC 7323 2.3
942 	 * The window field (SEG.WND) of every outgoing segment, with the
943 	 * exception of <SYN> segments, MUST be right-shifted by
944 	 * Rcv.Wind.Shift bits:
945 	 */
946 	addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
947 	l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
948 	tcp_v4_send_ack(sk, skb, seq,
949 			tcp_rsk(req)->rcv_nxt,
950 			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
951 			tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
952 			req->ts_recent,
953 			0,
954 			tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
955 			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
956 			ip_hdr(skb)->tos);
957 }
958 
959 /*
960  *	Send a SYN-ACK after having received a SYN.
961  *	This still operates on a request_sock only, not on a big
962  *	socket.
963  */
964 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
965 			      struct flowi *fl,
966 			      struct request_sock *req,
967 			      struct tcp_fastopen_cookie *foc,
968 			      enum tcp_synack_type synack_type,
969 			      struct sk_buff *syn_skb)
970 {
971 	const struct inet_request_sock *ireq = inet_rsk(req);
972 	struct flowi4 fl4;
973 	int err = -1;
974 	struct sk_buff *skb;
975 	u8 tos;
976 
977 	/* First, grab a route. */
978 	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
979 		return -1;
980 
981 	skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
982 
983 	if (skb) {
984 		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
985 
986 		tos = sock_net(sk)->ipv4.sysctl_tcp_reflect_tos ?
987 				tcp_rsk(req)->syn_tos & ~INET_ECN_MASK :
988 				inet_sk(sk)->tos;
989 
990 		if (!INET_ECN_is_capable(tos) &&
991 		    tcp_bpf_ca_needs_ecn((struct sock *)req))
992 			tos |= INET_ECN_ECT_0;
993 
994 		rcu_read_lock();
995 		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
996 					    ireq->ir_rmt_addr,
997 					    rcu_dereference(ireq->ireq_opt),
998 					    tos);
999 		rcu_read_unlock();
1000 		err = net_xmit_eval(err);
1001 	}
1002 
1003 	return err;
1004 }
1005 
1006 /*
1007  *	IPv4 request_sock destructor.
1008  */
1009 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1010 {
1011 	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1012 }
1013 
1014 #ifdef CONFIG_TCP_MD5SIG
1015 /*
1016  * RFC2385 MD5 checksumming requires a mapping of
1017  * IP address->MD5 Key.
1018  * We need to maintain these in the sk structure.
1019  */
1020 
1021 DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
1022 EXPORT_SYMBOL(tcp_md5_needed);
1023 
1024 /* Find the Key structure for an address.  */
1025 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1026 					   const union tcp_md5_addr *addr,
1027 					   int family)
1028 {
1029 	const struct tcp_sock *tp = tcp_sk(sk);
1030 	struct tcp_md5sig_key *key;
1031 	const struct tcp_md5sig_info *md5sig;
1032 	__be32 mask;
1033 	struct tcp_md5sig_key *best_match = NULL;
1034 	bool match;
1035 
1036 	/* caller either holds rcu_read_lock() or socket lock */
1037 	md5sig = rcu_dereference_check(tp->md5sig_info,
1038 				       lockdep_sock_is_held(sk));
1039 	if (!md5sig)
1040 		return NULL;
1041 
1042 	hlist_for_each_entry_rcu(key, &md5sig->head, node,
1043 				 lockdep_sock_is_held(sk)) {
1044 		if (key->family != family)
1045 			continue;
1046 		if (key->l3index && key->l3index != l3index)
1047 			continue;
1048 		if (family == AF_INET) {
1049 			mask = inet_make_mask(key->prefixlen);
1050 			match = (key->addr.a4.s_addr & mask) ==
1051 				(addr->a4.s_addr & mask);
1052 #if IS_ENABLED(CONFIG_IPV6)
1053 		} else if (family == AF_INET6) {
1054 			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1055 						  key->prefixlen);
1056 #endif
1057 		} else {
1058 			match = false;
1059 		}
1060 
1061 		if (match && (!best_match ||
1062 			      key->prefixlen > best_match->prefixlen))
1063 			best_match = key;
1064 	}
1065 	return best_match;
1066 }
1067 EXPORT_SYMBOL(__tcp_md5_do_lookup);
1068 
1069 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1070 						      const union tcp_md5_addr *addr,
1071 						      int family, u8 prefixlen,
1072 						      int l3index)
1073 {
1074 	const struct tcp_sock *tp = tcp_sk(sk);
1075 	struct tcp_md5sig_key *key;
1076 	unsigned int size = sizeof(struct in_addr);
1077 	const struct tcp_md5sig_info *md5sig;
1078 
1079 	/* caller either holds rcu_read_lock() or socket lock */
1080 	md5sig = rcu_dereference_check(tp->md5sig_info,
1081 				       lockdep_sock_is_held(sk));
1082 	if (!md5sig)
1083 		return NULL;
1084 #if IS_ENABLED(CONFIG_IPV6)
1085 	if (family == AF_INET6)
1086 		size = sizeof(struct in6_addr);
1087 #endif
1088 	hlist_for_each_entry_rcu(key, &md5sig->head, node,
1089 				 lockdep_sock_is_held(sk)) {
1090 		if (key->family != family)
1091 			continue;
1092 		if (key->l3index && key->l3index != l3index)
1093 			continue;
1094 		if (!memcmp(&key->addr, addr, size) &&
1095 		    key->prefixlen == prefixlen)
1096 			return key;
1097 	}
1098 	return NULL;
1099 }
1100 
1101 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1102 					 const struct sock *addr_sk)
1103 {
1104 	const union tcp_md5_addr *addr;
1105 	int l3index;
1106 
1107 	l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1108 						 addr_sk->sk_bound_dev_if);
1109 	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1110 	return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1111 }
1112 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1113 
1114 /* This can be called on a newly created socket, from other files */
1115 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1116 		   int family, u8 prefixlen, int l3index,
1117 		   const u8 *newkey, u8 newkeylen, gfp_t gfp)
1118 {
1119 	/* Add Key to the list */
1120 	struct tcp_md5sig_key *key;
1121 	struct tcp_sock *tp = tcp_sk(sk);
1122 	struct tcp_md5sig_info *md5sig;
1123 
1124 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
1125 	if (key) {
1126 		/* Pre-existing entry - just update that one.
1127 		 * Note that the key might be used concurrently.
1128 		 * data_race() is telling kcsan that we do not care of
1129 		 * key mismatches, since changing MD5 key on live flows
1130 		 * can lead to packet drops.
1131 		 */
1132 		data_race(memcpy(key->key, newkey, newkeylen));
1133 
1134 		/* Pairs with READ_ONCE() in tcp_md5_hash_key().
1135 		 * Also note that a reader could catch new key->keylen value
1136 		 * but old key->key[], this is the reason we use __GFP_ZERO
1137 		 * at sock_kmalloc() time below these lines.
1138 		 */
1139 		WRITE_ONCE(key->keylen, newkeylen);
1140 
1141 		return 0;
1142 	}
1143 
1144 	md5sig = rcu_dereference_protected(tp->md5sig_info,
1145 					   lockdep_sock_is_held(sk));
1146 	if (!md5sig) {
1147 		md5sig = kmalloc(sizeof(*md5sig), gfp);
1148 		if (!md5sig)
1149 			return -ENOMEM;
1150 
1151 		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1152 		INIT_HLIST_HEAD(&md5sig->head);
1153 		rcu_assign_pointer(tp->md5sig_info, md5sig);
1154 	}
1155 
1156 	key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1157 	if (!key)
1158 		return -ENOMEM;
1159 	if (!tcp_alloc_md5sig_pool()) {
1160 		sock_kfree_s(sk, key, sizeof(*key));
1161 		return -ENOMEM;
1162 	}
1163 
1164 	memcpy(key->key, newkey, newkeylen);
1165 	key->keylen = newkeylen;
1166 	key->family = family;
1167 	key->prefixlen = prefixlen;
1168 	key->l3index = l3index;
1169 	memcpy(&key->addr, addr,
1170 	       (family == AF_INET6) ? sizeof(struct in6_addr) :
1171 				      sizeof(struct in_addr));
1172 	hlist_add_head_rcu(&key->node, &md5sig->head);
1173 	return 0;
1174 }
1175 EXPORT_SYMBOL(tcp_md5_do_add);
1176 
1177 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1178 		   u8 prefixlen, int l3index)
1179 {
1180 	struct tcp_md5sig_key *key;
1181 
1182 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
1183 	if (!key)
1184 		return -ENOENT;
1185 	hlist_del_rcu(&key->node);
1186 	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1187 	kfree_rcu(key, rcu);
1188 	return 0;
1189 }
1190 EXPORT_SYMBOL(tcp_md5_do_del);
1191 
1192 static void tcp_clear_md5_list(struct sock *sk)
1193 {
1194 	struct tcp_sock *tp = tcp_sk(sk);
1195 	struct tcp_md5sig_key *key;
1196 	struct hlist_node *n;
1197 	struct tcp_md5sig_info *md5sig;
1198 
1199 	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1200 
1201 	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1202 		hlist_del_rcu(&key->node);
1203 		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1204 		kfree_rcu(key, rcu);
1205 	}
1206 }
1207 
1208 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1209 				 sockptr_t optval, int optlen)
1210 {
1211 	struct tcp_md5sig cmd;
1212 	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1213 	const union tcp_md5_addr *addr;
1214 	u8 prefixlen = 32;
1215 	int l3index = 0;
1216 
1217 	if (optlen < sizeof(cmd))
1218 		return -EINVAL;
1219 
1220 	if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1221 		return -EFAULT;
1222 
1223 	if (sin->sin_family != AF_INET)
1224 		return -EINVAL;
1225 
1226 	if (optname == TCP_MD5SIG_EXT &&
1227 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1228 		prefixlen = cmd.tcpm_prefixlen;
1229 		if (prefixlen > 32)
1230 			return -EINVAL;
1231 	}
1232 
1233 	if (optname == TCP_MD5SIG_EXT &&
1234 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1235 		struct net_device *dev;
1236 
1237 		rcu_read_lock();
1238 		dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1239 		if (dev && netif_is_l3_master(dev))
1240 			l3index = dev->ifindex;
1241 
1242 		rcu_read_unlock();
1243 
1244 		/* ok to reference set/not set outside of rcu;
1245 		 * right now device MUST be an L3 master
1246 		 */
1247 		if (!dev || !l3index)
1248 			return -EINVAL;
1249 	}
1250 
1251 	addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1252 
1253 	if (!cmd.tcpm_keylen)
1254 		return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index);
1255 
1256 	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1257 		return -EINVAL;
1258 
1259 	return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index,
1260 			      cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
1261 }
1262 
1263 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1264 				   __be32 daddr, __be32 saddr,
1265 				   const struct tcphdr *th, int nbytes)
1266 {
1267 	struct tcp4_pseudohdr *bp;
1268 	struct scatterlist sg;
1269 	struct tcphdr *_th;
1270 
1271 	bp = hp->scratch;
1272 	bp->saddr = saddr;
1273 	bp->daddr = daddr;
1274 	bp->pad = 0;
1275 	bp->protocol = IPPROTO_TCP;
1276 	bp->len = cpu_to_be16(nbytes);
1277 
1278 	_th = (struct tcphdr *)(bp + 1);
1279 	memcpy(_th, th, sizeof(*th));
1280 	_th->check = 0;
1281 
1282 	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1283 	ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1284 				sizeof(*bp) + sizeof(*th));
1285 	return crypto_ahash_update(hp->md5_req);
1286 }
1287 
1288 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1289 			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
1290 {
1291 	struct tcp_md5sig_pool *hp;
1292 	struct ahash_request *req;
1293 
1294 	hp = tcp_get_md5sig_pool();
1295 	if (!hp)
1296 		goto clear_hash_noput;
1297 	req = hp->md5_req;
1298 
1299 	if (crypto_ahash_init(req))
1300 		goto clear_hash;
1301 	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1302 		goto clear_hash;
1303 	if (tcp_md5_hash_key(hp, key))
1304 		goto clear_hash;
1305 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1306 	if (crypto_ahash_final(req))
1307 		goto clear_hash;
1308 
1309 	tcp_put_md5sig_pool();
1310 	return 0;
1311 
1312 clear_hash:
1313 	tcp_put_md5sig_pool();
1314 clear_hash_noput:
1315 	memset(md5_hash, 0, 16);
1316 	return 1;
1317 }
1318 
1319 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1320 			const struct sock *sk,
1321 			const struct sk_buff *skb)
1322 {
1323 	struct tcp_md5sig_pool *hp;
1324 	struct ahash_request *req;
1325 	const struct tcphdr *th = tcp_hdr(skb);
1326 	__be32 saddr, daddr;
1327 
1328 	if (sk) { /* valid for establish/request sockets */
1329 		saddr = sk->sk_rcv_saddr;
1330 		daddr = sk->sk_daddr;
1331 	} else {
1332 		const struct iphdr *iph = ip_hdr(skb);
1333 		saddr = iph->saddr;
1334 		daddr = iph->daddr;
1335 	}
1336 
1337 	hp = tcp_get_md5sig_pool();
1338 	if (!hp)
1339 		goto clear_hash_noput;
1340 	req = hp->md5_req;
1341 
1342 	if (crypto_ahash_init(req))
1343 		goto clear_hash;
1344 
1345 	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1346 		goto clear_hash;
1347 	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1348 		goto clear_hash;
1349 	if (tcp_md5_hash_key(hp, key))
1350 		goto clear_hash;
1351 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1352 	if (crypto_ahash_final(req))
1353 		goto clear_hash;
1354 
1355 	tcp_put_md5sig_pool();
1356 	return 0;
1357 
1358 clear_hash:
1359 	tcp_put_md5sig_pool();
1360 clear_hash_noput:
1361 	memset(md5_hash, 0, 16);
1362 	return 1;
1363 }
1364 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1365 
1366 #endif
1367 
1368 /* Called with rcu_read_lock() */
1369 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1370 				    const struct sk_buff *skb,
1371 				    int dif, int sdif)
1372 {
1373 #ifdef CONFIG_TCP_MD5SIG
1374 	/*
1375 	 * This gets called for each TCP segment that arrives
1376 	 * so we want to be efficient.
1377 	 * We have 3 drop cases:
1378 	 * o No MD5 hash and one expected.
1379 	 * o MD5 hash and we're not expecting one.
1380 	 * o MD5 hash and its wrong.
1381 	 */
1382 	const __u8 *hash_location = NULL;
1383 	struct tcp_md5sig_key *hash_expected;
1384 	const struct iphdr *iph = ip_hdr(skb);
1385 	const struct tcphdr *th = tcp_hdr(skb);
1386 	const union tcp_md5_addr *addr;
1387 	unsigned char newhash[16];
1388 	int genhash, l3index;
1389 
1390 	/* sdif set, means packet ingressed via a device
1391 	 * in an L3 domain and dif is set to the l3mdev
1392 	 */
1393 	l3index = sdif ? dif : 0;
1394 
1395 	addr = (union tcp_md5_addr *)&iph->saddr;
1396 	hash_expected = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1397 	hash_location = tcp_parse_md5sig_option(th);
1398 
1399 	/* We've parsed the options - do we have a hash? */
1400 	if (!hash_expected && !hash_location)
1401 		return false;
1402 
1403 	if (hash_expected && !hash_location) {
1404 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1405 		return true;
1406 	}
1407 
1408 	if (!hash_expected && hash_location) {
1409 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1410 		return true;
1411 	}
1412 
1413 	/* Okay, so this is hash_expected and hash_location -
1414 	 * so we need to calculate the checksum.
1415 	 */
1416 	genhash = tcp_v4_md5_hash_skb(newhash,
1417 				      hash_expected,
1418 				      NULL, skb);
1419 
1420 	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1421 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1422 		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s L3 index %d\n",
1423 				     &iph->saddr, ntohs(th->source),
1424 				     &iph->daddr, ntohs(th->dest),
1425 				     genhash ? " tcp_v4_calc_md5_hash failed"
1426 				     : "", l3index);
1427 		return true;
1428 	}
1429 	return false;
1430 #endif
1431 	return false;
1432 }
1433 
1434 static void tcp_v4_init_req(struct request_sock *req,
1435 			    const struct sock *sk_listener,
1436 			    struct sk_buff *skb)
1437 {
1438 	struct inet_request_sock *ireq = inet_rsk(req);
1439 	struct net *net = sock_net(sk_listener);
1440 
1441 	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1442 	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1443 	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1444 }
1445 
1446 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1447 					  struct flowi *fl,
1448 					  const struct request_sock *req)
1449 {
1450 	return inet_csk_route_req(sk, &fl->u.ip4, req);
1451 }
1452 
1453 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1454 	.family		=	PF_INET,
1455 	.obj_size	=	sizeof(struct tcp_request_sock),
1456 	.rtx_syn_ack	=	tcp_rtx_synack,
1457 	.send_ack	=	tcp_v4_reqsk_send_ack,
1458 	.destructor	=	tcp_v4_reqsk_destructor,
1459 	.send_reset	=	tcp_v4_send_reset,
1460 	.syn_ack_timeout =	tcp_syn_ack_timeout,
1461 };
1462 
1463 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1464 	.mss_clamp	=	TCP_MSS_DEFAULT,
1465 #ifdef CONFIG_TCP_MD5SIG
1466 	.req_md5_lookup	=	tcp_v4_md5_lookup,
1467 	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1468 #endif
1469 	.init_req	=	tcp_v4_init_req,
1470 #ifdef CONFIG_SYN_COOKIES
1471 	.cookie_init_seq =	cookie_v4_init_sequence,
1472 #endif
1473 	.route_req	=	tcp_v4_route_req,
1474 	.init_seq	=	tcp_v4_init_seq,
1475 	.init_ts_off	=	tcp_v4_init_ts_off,
1476 	.send_synack	=	tcp_v4_send_synack,
1477 };
1478 
1479 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1480 {
1481 	/* Never answer to SYNs send to broadcast or multicast */
1482 	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1483 		goto drop;
1484 
1485 	return tcp_conn_request(&tcp_request_sock_ops,
1486 				&tcp_request_sock_ipv4_ops, sk, skb);
1487 
1488 drop:
1489 	tcp_listendrop(sk);
1490 	return 0;
1491 }
1492 EXPORT_SYMBOL(tcp_v4_conn_request);
1493 
1494 
1495 /*
1496  * The three way handshake has completed - we got a valid synack -
1497  * now create the new socket.
1498  */
1499 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1500 				  struct request_sock *req,
1501 				  struct dst_entry *dst,
1502 				  struct request_sock *req_unhash,
1503 				  bool *own_req)
1504 {
1505 	struct inet_request_sock *ireq;
1506 	bool found_dup_sk = false;
1507 	struct inet_sock *newinet;
1508 	struct tcp_sock *newtp;
1509 	struct sock *newsk;
1510 #ifdef CONFIG_TCP_MD5SIG
1511 	const union tcp_md5_addr *addr;
1512 	struct tcp_md5sig_key *key;
1513 	int l3index;
1514 #endif
1515 	struct ip_options_rcu *inet_opt;
1516 
1517 	if (sk_acceptq_is_full(sk))
1518 		goto exit_overflow;
1519 
1520 	newsk = tcp_create_openreq_child(sk, req, skb);
1521 	if (!newsk)
1522 		goto exit_nonewsk;
1523 
1524 	newsk->sk_gso_type = SKB_GSO_TCPV4;
1525 	inet_sk_rx_dst_set(newsk, skb);
1526 
1527 	newtp		      = tcp_sk(newsk);
1528 	newinet		      = inet_sk(newsk);
1529 	ireq		      = inet_rsk(req);
1530 	sk_daddr_set(newsk, ireq->ir_rmt_addr);
1531 	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1532 	newsk->sk_bound_dev_if = ireq->ir_iif;
1533 	newinet->inet_saddr   = ireq->ir_loc_addr;
1534 	inet_opt	      = rcu_dereference(ireq->ireq_opt);
1535 	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1536 	newinet->mc_index     = inet_iif(skb);
1537 	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1538 	newinet->rcv_tos      = ip_hdr(skb)->tos;
1539 	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1540 	if (inet_opt)
1541 		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1542 	newinet->inet_id = prandom_u32();
1543 
1544 	/* Set ToS of the new socket based upon the value of incoming SYN. */
1545 	if (sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)
1546 		newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1547 
1548 	if (!dst) {
1549 		dst = inet_csk_route_child_sock(sk, newsk, req);
1550 		if (!dst)
1551 			goto put_and_exit;
1552 	} else {
1553 		/* syncookie case : see end of cookie_v4_check() */
1554 	}
1555 	sk_setup_caps(newsk, dst);
1556 
1557 	tcp_ca_openreq_child(newsk, dst);
1558 
1559 	tcp_sync_mss(newsk, dst_mtu(dst));
1560 	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1561 
1562 	tcp_initialize_rcv_mss(newsk);
1563 
1564 #ifdef CONFIG_TCP_MD5SIG
1565 	l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1566 	/* Copy over the MD5 key from the original socket */
1567 	addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1568 	key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1569 	if (key) {
1570 		/*
1571 		 * We're using one, so create a matching key
1572 		 * on the newsk structure. If we fail to get
1573 		 * memory, then we end up not copying the key
1574 		 * across. Shucks.
1575 		 */
1576 		tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index,
1577 			       key->key, key->keylen, GFP_ATOMIC);
1578 		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1579 	}
1580 #endif
1581 
1582 	if (__inet_inherit_port(sk, newsk) < 0)
1583 		goto put_and_exit;
1584 	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1585 				       &found_dup_sk);
1586 	if (likely(*own_req)) {
1587 		tcp_move_syn(newtp, req);
1588 		ireq->ireq_opt = NULL;
1589 	} else {
1590 		if (!req_unhash && found_dup_sk) {
1591 			/* This code path should only be executed in the
1592 			 * syncookie case only
1593 			 */
1594 			bh_unlock_sock(newsk);
1595 			sock_put(newsk);
1596 			newsk = NULL;
1597 		} else {
1598 			newinet->inet_opt = NULL;
1599 		}
1600 	}
1601 	return newsk;
1602 
1603 exit_overflow:
1604 	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1605 exit_nonewsk:
1606 	dst_release(dst);
1607 exit:
1608 	tcp_listendrop(sk);
1609 	return NULL;
1610 put_and_exit:
1611 	newinet->inet_opt = NULL;
1612 	inet_csk_prepare_forced_close(newsk);
1613 	tcp_done(newsk);
1614 	goto exit;
1615 }
1616 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1617 
1618 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1619 {
1620 #ifdef CONFIG_SYN_COOKIES
1621 	const struct tcphdr *th = tcp_hdr(skb);
1622 
1623 	if (!th->syn)
1624 		sk = cookie_v4_check(sk, skb);
1625 #endif
1626 	return sk;
1627 }
1628 
1629 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1630 			 struct tcphdr *th, u32 *cookie)
1631 {
1632 	u16 mss = 0;
1633 #ifdef CONFIG_SYN_COOKIES
1634 	mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1635 				    &tcp_request_sock_ipv4_ops, sk, th);
1636 	if (mss) {
1637 		*cookie = __cookie_v4_init_sequence(iph, th, &mss);
1638 		tcp_synq_overflow(sk);
1639 	}
1640 #endif
1641 	return mss;
1642 }
1643 
1644 /* The socket must have it's spinlock held when we get
1645  * here, unless it is a TCP_LISTEN socket.
1646  *
1647  * We have a potential double-lock case here, so even when
1648  * doing backlog processing we use the BH locking scheme.
1649  * This is because we cannot sleep with the original spinlock
1650  * held.
1651  */
1652 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1653 {
1654 	struct sock *rsk;
1655 
1656 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1657 		struct dst_entry *dst = sk->sk_rx_dst;
1658 
1659 		sock_rps_save_rxhash(sk, skb);
1660 		sk_mark_napi_id(sk, skb);
1661 		if (dst) {
1662 			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1663 			    !dst->ops->check(dst, 0)) {
1664 				dst_release(dst);
1665 				sk->sk_rx_dst = NULL;
1666 			}
1667 		}
1668 		tcp_rcv_established(sk, skb);
1669 		return 0;
1670 	}
1671 
1672 	if (tcp_checksum_complete(skb))
1673 		goto csum_err;
1674 
1675 	if (sk->sk_state == TCP_LISTEN) {
1676 		struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1677 
1678 		if (!nsk)
1679 			goto discard;
1680 		if (nsk != sk) {
1681 			if (tcp_child_process(sk, nsk, skb)) {
1682 				rsk = nsk;
1683 				goto reset;
1684 			}
1685 			return 0;
1686 		}
1687 	} else
1688 		sock_rps_save_rxhash(sk, skb);
1689 
1690 	if (tcp_rcv_state_process(sk, skb)) {
1691 		rsk = sk;
1692 		goto reset;
1693 	}
1694 	return 0;
1695 
1696 reset:
1697 	tcp_v4_send_reset(rsk, skb);
1698 discard:
1699 	kfree_skb(skb);
1700 	/* Be careful here. If this function gets more complicated and
1701 	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1702 	 * might be destroyed here. This current version compiles correctly,
1703 	 * but you have been warned.
1704 	 */
1705 	return 0;
1706 
1707 csum_err:
1708 	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1709 	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1710 	goto discard;
1711 }
1712 EXPORT_SYMBOL(tcp_v4_do_rcv);
1713 
1714 int tcp_v4_early_demux(struct sk_buff *skb)
1715 {
1716 	const struct iphdr *iph;
1717 	const struct tcphdr *th;
1718 	struct sock *sk;
1719 
1720 	if (skb->pkt_type != PACKET_HOST)
1721 		return 0;
1722 
1723 	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1724 		return 0;
1725 
1726 	iph = ip_hdr(skb);
1727 	th = tcp_hdr(skb);
1728 
1729 	if (th->doff < sizeof(struct tcphdr) / 4)
1730 		return 0;
1731 
1732 	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1733 				       iph->saddr, th->source,
1734 				       iph->daddr, ntohs(th->dest),
1735 				       skb->skb_iif, inet_sdif(skb));
1736 	if (sk) {
1737 		skb->sk = sk;
1738 		skb->destructor = sock_edemux;
1739 		if (sk_fullsock(sk)) {
1740 			struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1741 
1742 			if (dst)
1743 				dst = dst_check(dst, 0);
1744 			if (dst &&
1745 			    inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1746 				skb_dst_set_noref(skb, dst);
1747 		}
1748 	}
1749 	return 0;
1750 }
1751 
1752 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1753 {
1754 	u32 limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf);
1755 	struct skb_shared_info *shinfo;
1756 	const struct tcphdr *th;
1757 	struct tcphdr *thtail;
1758 	struct sk_buff *tail;
1759 	unsigned int hdrlen;
1760 	bool fragstolen;
1761 	u32 gso_segs;
1762 	int delta;
1763 
1764 	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1765 	 * we can fix skb->truesize to its real value to avoid future drops.
1766 	 * This is valid because skb is not yet charged to the socket.
1767 	 * It has been noticed pure SACK packets were sometimes dropped
1768 	 * (if cooked by drivers without copybreak feature).
1769 	 */
1770 	skb_condense(skb);
1771 
1772 	skb_dst_drop(skb);
1773 
1774 	if (unlikely(tcp_checksum_complete(skb))) {
1775 		bh_unlock_sock(sk);
1776 		__TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1777 		__TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1778 		return true;
1779 	}
1780 
1781 	/* Attempt coalescing to last skb in backlog, even if we are
1782 	 * above the limits.
1783 	 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1784 	 */
1785 	th = (const struct tcphdr *)skb->data;
1786 	hdrlen = th->doff * 4;
1787 	shinfo = skb_shinfo(skb);
1788 
1789 	if (!shinfo->gso_size)
1790 		shinfo->gso_size = skb->len - hdrlen;
1791 
1792 	if (!shinfo->gso_segs)
1793 		shinfo->gso_segs = 1;
1794 
1795 	tail = sk->sk_backlog.tail;
1796 	if (!tail)
1797 		goto no_coalesce;
1798 	thtail = (struct tcphdr *)tail->data;
1799 
1800 	if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1801 	    TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1802 	    ((TCP_SKB_CB(tail)->tcp_flags |
1803 	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1804 	    !((TCP_SKB_CB(tail)->tcp_flags &
1805 	      TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1806 	    ((TCP_SKB_CB(tail)->tcp_flags ^
1807 	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1808 #ifdef CONFIG_TLS_DEVICE
1809 	    tail->decrypted != skb->decrypted ||
1810 #endif
1811 	    thtail->doff != th->doff ||
1812 	    memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1813 		goto no_coalesce;
1814 
1815 	__skb_pull(skb, hdrlen);
1816 	if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1817 		TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1818 
1819 		if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
1820 			TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1821 			thtail->window = th->window;
1822 		}
1823 
1824 		/* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1825 		 * thtail->fin, so that the fast path in tcp_rcv_established()
1826 		 * is not entered if we append a packet with a FIN.
1827 		 * SYN, RST, URG are not present.
1828 		 * ACK is set on both packets.
1829 		 * PSH : we do not really care in TCP stack,
1830 		 *       at least for 'GRO' packets.
1831 		 */
1832 		thtail->fin |= th->fin;
1833 		TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1834 
1835 		if (TCP_SKB_CB(skb)->has_rxtstamp) {
1836 			TCP_SKB_CB(tail)->has_rxtstamp = true;
1837 			tail->tstamp = skb->tstamp;
1838 			skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1839 		}
1840 
1841 		/* Not as strict as GRO. We only need to carry mss max value */
1842 		skb_shinfo(tail)->gso_size = max(shinfo->gso_size,
1843 						 skb_shinfo(tail)->gso_size);
1844 
1845 		gso_segs = skb_shinfo(tail)->gso_segs + shinfo->gso_segs;
1846 		skb_shinfo(tail)->gso_segs = min_t(u32, gso_segs, 0xFFFF);
1847 
1848 		sk->sk_backlog.len += delta;
1849 		__NET_INC_STATS(sock_net(sk),
1850 				LINUX_MIB_TCPBACKLOGCOALESCE);
1851 		kfree_skb_partial(skb, fragstolen);
1852 		return false;
1853 	}
1854 	__skb_push(skb, hdrlen);
1855 
1856 no_coalesce:
1857 	/* Only socket owner can try to collapse/prune rx queues
1858 	 * to reduce memory overhead, so add a little headroom here.
1859 	 * Few sockets backlog are possibly concurrently non empty.
1860 	 */
1861 	limit += 64*1024;
1862 
1863 	if (unlikely(sk_add_backlog(sk, skb, limit))) {
1864 		bh_unlock_sock(sk);
1865 		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1866 		return true;
1867 	}
1868 	return false;
1869 }
1870 EXPORT_SYMBOL(tcp_add_backlog);
1871 
1872 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1873 {
1874 	struct tcphdr *th = (struct tcphdr *)skb->data;
1875 
1876 	return sk_filter_trim_cap(sk, skb, th->doff * 4);
1877 }
1878 EXPORT_SYMBOL(tcp_filter);
1879 
1880 static void tcp_v4_restore_cb(struct sk_buff *skb)
1881 {
1882 	memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1883 		sizeof(struct inet_skb_parm));
1884 }
1885 
1886 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1887 			   const struct tcphdr *th)
1888 {
1889 	/* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
1890 	 * barrier() makes sure compiler wont play fool^Waliasing games.
1891 	 */
1892 	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1893 		sizeof(struct inet_skb_parm));
1894 	barrier();
1895 
1896 	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1897 	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1898 				    skb->len - th->doff * 4);
1899 	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1900 	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1901 	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1902 	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1903 	TCP_SKB_CB(skb)->sacked	 = 0;
1904 	TCP_SKB_CB(skb)->has_rxtstamp =
1905 			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1906 }
1907 
1908 /*
1909  *	From tcp_input.c
1910  */
1911 
1912 int tcp_v4_rcv(struct sk_buff *skb)
1913 {
1914 	struct net *net = dev_net(skb->dev);
1915 	struct sk_buff *skb_to_free;
1916 	int sdif = inet_sdif(skb);
1917 	int dif = inet_iif(skb);
1918 	const struct iphdr *iph;
1919 	const struct tcphdr *th;
1920 	bool refcounted;
1921 	struct sock *sk;
1922 	int ret;
1923 
1924 	if (skb->pkt_type != PACKET_HOST)
1925 		goto discard_it;
1926 
1927 	/* Count it even if it's bad */
1928 	__TCP_INC_STATS(net, TCP_MIB_INSEGS);
1929 
1930 	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1931 		goto discard_it;
1932 
1933 	th = (const struct tcphdr *)skb->data;
1934 
1935 	if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1936 		goto bad_packet;
1937 	if (!pskb_may_pull(skb, th->doff * 4))
1938 		goto discard_it;
1939 
1940 	/* An explanation is required here, I think.
1941 	 * Packet length and doff are validated by header prediction,
1942 	 * provided case of th->doff==0 is eliminated.
1943 	 * So, we defer the checks. */
1944 
1945 	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1946 		goto csum_error;
1947 
1948 	th = (const struct tcphdr *)skb->data;
1949 	iph = ip_hdr(skb);
1950 lookup:
1951 	sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1952 			       th->dest, sdif, &refcounted);
1953 	if (!sk)
1954 		goto no_tcp_socket;
1955 
1956 process:
1957 	if (sk->sk_state == TCP_TIME_WAIT)
1958 		goto do_time_wait;
1959 
1960 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
1961 		struct request_sock *req = inet_reqsk(sk);
1962 		bool req_stolen = false;
1963 		struct sock *nsk;
1964 
1965 		sk = req->rsk_listener;
1966 		if (unlikely(tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))) {
1967 			sk_drops_add(sk, skb);
1968 			reqsk_put(req);
1969 			goto discard_it;
1970 		}
1971 		if (tcp_checksum_complete(skb)) {
1972 			reqsk_put(req);
1973 			goto csum_error;
1974 		}
1975 		if (unlikely(sk->sk_state != TCP_LISTEN)) {
1976 			inet_csk_reqsk_queue_drop_and_put(sk, req);
1977 			goto lookup;
1978 		}
1979 		/* We own a reference on the listener, increase it again
1980 		 * as we might lose it too soon.
1981 		 */
1982 		sock_hold(sk);
1983 		refcounted = true;
1984 		nsk = NULL;
1985 		if (!tcp_filter(sk, skb)) {
1986 			th = (const struct tcphdr *)skb->data;
1987 			iph = ip_hdr(skb);
1988 			tcp_v4_fill_cb(skb, iph, th);
1989 			nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
1990 		}
1991 		if (!nsk) {
1992 			reqsk_put(req);
1993 			if (req_stolen) {
1994 				/* Another cpu got exclusive access to req
1995 				 * and created a full blown socket.
1996 				 * Try to feed this packet to this socket
1997 				 * instead of discarding it.
1998 				 */
1999 				tcp_v4_restore_cb(skb);
2000 				sock_put(sk);
2001 				goto lookup;
2002 			}
2003 			goto discard_and_relse;
2004 		}
2005 		if (nsk == sk) {
2006 			reqsk_put(req);
2007 			tcp_v4_restore_cb(skb);
2008 		} else if (tcp_child_process(sk, nsk, skb)) {
2009 			tcp_v4_send_reset(nsk, skb);
2010 			goto discard_and_relse;
2011 		} else {
2012 			sock_put(sk);
2013 			return 0;
2014 		}
2015 	}
2016 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
2017 		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2018 		goto discard_and_relse;
2019 	}
2020 
2021 	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
2022 		goto discard_and_relse;
2023 
2024 	if (tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))
2025 		goto discard_and_relse;
2026 
2027 	nf_reset_ct(skb);
2028 
2029 	if (tcp_filter(sk, skb))
2030 		goto discard_and_relse;
2031 	th = (const struct tcphdr *)skb->data;
2032 	iph = ip_hdr(skb);
2033 	tcp_v4_fill_cb(skb, iph, th);
2034 
2035 	skb->dev = NULL;
2036 
2037 	if (sk->sk_state == TCP_LISTEN) {
2038 		ret = tcp_v4_do_rcv(sk, skb);
2039 		goto put_and_return;
2040 	}
2041 
2042 	sk_incoming_cpu_update(sk);
2043 
2044 	bh_lock_sock_nested(sk);
2045 	tcp_segs_in(tcp_sk(sk), skb);
2046 	ret = 0;
2047 	if (!sock_owned_by_user(sk)) {
2048 		skb_to_free = sk->sk_rx_skb_cache;
2049 		sk->sk_rx_skb_cache = NULL;
2050 		ret = tcp_v4_do_rcv(sk, skb);
2051 	} else {
2052 		if (tcp_add_backlog(sk, skb))
2053 			goto discard_and_relse;
2054 		skb_to_free = NULL;
2055 	}
2056 	bh_unlock_sock(sk);
2057 	if (skb_to_free)
2058 		__kfree_skb(skb_to_free);
2059 
2060 put_and_return:
2061 	if (refcounted)
2062 		sock_put(sk);
2063 
2064 	return ret;
2065 
2066 no_tcp_socket:
2067 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2068 		goto discard_it;
2069 
2070 	tcp_v4_fill_cb(skb, iph, th);
2071 
2072 	if (tcp_checksum_complete(skb)) {
2073 csum_error:
2074 		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2075 bad_packet:
2076 		__TCP_INC_STATS(net, TCP_MIB_INERRS);
2077 	} else {
2078 		tcp_v4_send_reset(NULL, skb);
2079 	}
2080 
2081 discard_it:
2082 	/* Discard frame. */
2083 	kfree_skb(skb);
2084 	return 0;
2085 
2086 discard_and_relse:
2087 	sk_drops_add(sk, skb);
2088 	if (refcounted)
2089 		sock_put(sk);
2090 	goto discard_it;
2091 
2092 do_time_wait:
2093 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2094 		inet_twsk_put(inet_twsk(sk));
2095 		goto discard_it;
2096 	}
2097 
2098 	tcp_v4_fill_cb(skb, iph, th);
2099 
2100 	if (tcp_checksum_complete(skb)) {
2101 		inet_twsk_put(inet_twsk(sk));
2102 		goto csum_error;
2103 	}
2104 	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2105 	case TCP_TW_SYN: {
2106 		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
2107 							&tcp_hashinfo, skb,
2108 							__tcp_hdrlen(th),
2109 							iph->saddr, th->source,
2110 							iph->daddr, th->dest,
2111 							inet_iif(skb),
2112 							sdif);
2113 		if (sk2) {
2114 			inet_twsk_deschedule_put(inet_twsk(sk));
2115 			sk = sk2;
2116 			tcp_v4_restore_cb(skb);
2117 			refcounted = false;
2118 			goto process;
2119 		}
2120 	}
2121 		/* to ACK */
2122 		fallthrough;
2123 	case TCP_TW_ACK:
2124 		tcp_v4_timewait_ack(sk, skb);
2125 		break;
2126 	case TCP_TW_RST:
2127 		tcp_v4_send_reset(sk, skb);
2128 		inet_twsk_deschedule_put(inet_twsk(sk));
2129 		goto discard_it;
2130 	case TCP_TW_SUCCESS:;
2131 	}
2132 	goto discard_it;
2133 }
2134 
2135 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2136 	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
2137 	.twsk_unique	= tcp_twsk_unique,
2138 	.twsk_destructor= tcp_twsk_destructor,
2139 };
2140 
2141 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2142 {
2143 	struct dst_entry *dst = skb_dst(skb);
2144 
2145 	if (dst && dst_hold_safe(dst)) {
2146 		sk->sk_rx_dst = dst;
2147 		inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
2148 	}
2149 }
2150 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2151 
2152 const struct inet_connection_sock_af_ops ipv4_specific = {
2153 	.queue_xmit	   = ip_queue_xmit,
2154 	.send_check	   = tcp_v4_send_check,
2155 	.rebuild_header	   = inet_sk_rebuild_header,
2156 	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
2157 	.conn_request	   = tcp_v4_conn_request,
2158 	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
2159 	.net_header_len	   = sizeof(struct iphdr),
2160 	.setsockopt	   = ip_setsockopt,
2161 	.getsockopt	   = ip_getsockopt,
2162 	.addr2sockaddr	   = inet_csk_addr2sockaddr,
2163 	.sockaddr_len	   = sizeof(struct sockaddr_in),
2164 	.mtu_reduced	   = tcp_v4_mtu_reduced,
2165 };
2166 EXPORT_SYMBOL(ipv4_specific);
2167 
2168 #ifdef CONFIG_TCP_MD5SIG
2169 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2170 	.md5_lookup		= tcp_v4_md5_lookup,
2171 	.calc_md5_hash		= tcp_v4_md5_hash_skb,
2172 	.md5_parse		= tcp_v4_parse_md5_keys,
2173 };
2174 #endif
2175 
2176 /* NOTE: A lot of things set to zero explicitly by call to
2177  *       sk_alloc() so need not be done here.
2178  */
2179 static int tcp_v4_init_sock(struct sock *sk)
2180 {
2181 	struct inet_connection_sock *icsk = inet_csk(sk);
2182 
2183 	tcp_init_sock(sk);
2184 
2185 	icsk->icsk_af_ops = &ipv4_specific;
2186 
2187 #ifdef CONFIG_TCP_MD5SIG
2188 	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2189 #endif
2190 
2191 	return 0;
2192 }
2193 
2194 void tcp_v4_destroy_sock(struct sock *sk)
2195 {
2196 	struct tcp_sock *tp = tcp_sk(sk);
2197 
2198 	trace_tcp_destroy_sock(sk);
2199 
2200 	tcp_clear_xmit_timers(sk);
2201 
2202 	tcp_cleanup_congestion_control(sk);
2203 
2204 	tcp_cleanup_ulp(sk);
2205 
2206 	/* Cleanup up the write buffer. */
2207 	tcp_write_queue_purge(sk);
2208 
2209 	/* Check if we want to disable active TFO */
2210 	tcp_fastopen_active_disable_ofo_check(sk);
2211 
2212 	/* Cleans up our, hopefully empty, out_of_order_queue. */
2213 	skb_rbtree_purge(&tp->out_of_order_queue);
2214 
2215 #ifdef CONFIG_TCP_MD5SIG
2216 	/* Clean up the MD5 key list, if any */
2217 	if (tp->md5sig_info) {
2218 		tcp_clear_md5_list(sk);
2219 		kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2220 		tp->md5sig_info = NULL;
2221 	}
2222 #endif
2223 
2224 	/* Clean up a referenced TCP bind bucket. */
2225 	if (inet_csk(sk)->icsk_bind_hash)
2226 		inet_put_port(sk);
2227 
2228 	BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2229 
2230 	/* If socket is aborted during connect operation */
2231 	tcp_free_fastopen_req(tp);
2232 	tcp_fastopen_destroy_cipher(sk);
2233 	tcp_saved_syn_free(tp);
2234 
2235 	sk_sockets_allocated_dec(sk);
2236 }
2237 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2238 
2239 #ifdef CONFIG_PROC_FS
2240 /* Proc filesystem TCP sock list dumping. */
2241 
2242 /*
2243  * Get next listener socket follow cur.  If cur is NULL, get first socket
2244  * starting from bucket given in st->bucket; when st->bucket is zero the
2245  * very first socket in the hash table is returned.
2246  */
2247 static void *listening_get_next(struct seq_file *seq, void *cur)
2248 {
2249 	struct tcp_seq_afinfo *afinfo;
2250 	struct tcp_iter_state *st = seq->private;
2251 	struct net *net = seq_file_net(seq);
2252 	struct inet_listen_hashbucket *ilb;
2253 	struct hlist_nulls_node *node;
2254 	struct sock *sk = cur;
2255 
2256 	if (st->bpf_seq_afinfo)
2257 		afinfo = st->bpf_seq_afinfo;
2258 	else
2259 		afinfo = PDE_DATA(file_inode(seq->file));
2260 
2261 	if (!sk) {
2262 get_head:
2263 		ilb = &tcp_hashinfo.listening_hash[st->bucket];
2264 		spin_lock(&ilb->lock);
2265 		sk = sk_nulls_head(&ilb->nulls_head);
2266 		st->offset = 0;
2267 		goto get_sk;
2268 	}
2269 	ilb = &tcp_hashinfo.listening_hash[st->bucket];
2270 	++st->num;
2271 	++st->offset;
2272 
2273 	sk = sk_nulls_next(sk);
2274 get_sk:
2275 	sk_nulls_for_each_from(sk, node) {
2276 		if (!net_eq(sock_net(sk), net))
2277 			continue;
2278 		if (afinfo->family == AF_UNSPEC ||
2279 		    sk->sk_family == afinfo->family)
2280 			return sk;
2281 	}
2282 	spin_unlock(&ilb->lock);
2283 	st->offset = 0;
2284 	if (++st->bucket < INET_LHTABLE_SIZE)
2285 		goto get_head;
2286 	return NULL;
2287 }
2288 
2289 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2290 {
2291 	struct tcp_iter_state *st = seq->private;
2292 	void *rc;
2293 
2294 	st->bucket = 0;
2295 	st->offset = 0;
2296 	rc = listening_get_next(seq, NULL);
2297 
2298 	while (rc && *pos) {
2299 		rc = listening_get_next(seq, rc);
2300 		--*pos;
2301 	}
2302 	return rc;
2303 }
2304 
2305 static inline bool empty_bucket(const struct tcp_iter_state *st)
2306 {
2307 	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2308 }
2309 
2310 /*
2311  * Get first established socket starting from bucket given in st->bucket.
2312  * If st->bucket is zero, the very first socket in the hash is returned.
2313  */
2314 static void *established_get_first(struct seq_file *seq)
2315 {
2316 	struct tcp_seq_afinfo *afinfo;
2317 	struct tcp_iter_state *st = seq->private;
2318 	struct net *net = seq_file_net(seq);
2319 	void *rc = NULL;
2320 
2321 	if (st->bpf_seq_afinfo)
2322 		afinfo = st->bpf_seq_afinfo;
2323 	else
2324 		afinfo = PDE_DATA(file_inode(seq->file));
2325 
2326 	st->offset = 0;
2327 	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2328 		struct sock *sk;
2329 		struct hlist_nulls_node *node;
2330 		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2331 
2332 		/* Lockless fast path for the common case of empty buckets */
2333 		if (empty_bucket(st))
2334 			continue;
2335 
2336 		spin_lock_bh(lock);
2337 		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2338 			if ((afinfo->family != AF_UNSPEC &&
2339 			     sk->sk_family != afinfo->family) ||
2340 			    !net_eq(sock_net(sk), net)) {
2341 				continue;
2342 			}
2343 			rc = sk;
2344 			goto out;
2345 		}
2346 		spin_unlock_bh(lock);
2347 	}
2348 out:
2349 	return rc;
2350 }
2351 
2352 static void *established_get_next(struct seq_file *seq, void *cur)
2353 {
2354 	struct tcp_seq_afinfo *afinfo;
2355 	struct sock *sk = cur;
2356 	struct hlist_nulls_node *node;
2357 	struct tcp_iter_state *st = seq->private;
2358 	struct net *net = seq_file_net(seq);
2359 
2360 	if (st->bpf_seq_afinfo)
2361 		afinfo = st->bpf_seq_afinfo;
2362 	else
2363 		afinfo = PDE_DATA(file_inode(seq->file));
2364 
2365 	++st->num;
2366 	++st->offset;
2367 
2368 	sk = sk_nulls_next(sk);
2369 
2370 	sk_nulls_for_each_from(sk, node) {
2371 		if ((afinfo->family == AF_UNSPEC ||
2372 		     sk->sk_family == afinfo->family) &&
2373 		    net_eq(sock_net(sk), net))
2374 			return sk;
2375 	}
2376 
2377 	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2378 	++st->bucket;
2379 	return established_get_first(seq);
2380 }
2381 
2382 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2383 {
2384 	struct tcp_iter_state *st = seq->private;
2385 	void *rc;
2386 
2387 	st->bucket = 0;
2388 	rc = established_get_first(seq);
2389 
2390 	while (rc && pos) {
2391 		rc = established_get_next(seq, rc);
2392 		--pos;
2393 	}
2394 	return rc;
2395 }
2396 
2397 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2398 {
2399 	void *rc;
2400 	struct tcp_iter_state *st = seq->private;
2401 
2402 	st->state = TCP_SEQ_STATE_LISTENING;
2403 	rc	  = listening_get_idx(seq, &pos);
2404 
2405 	if (!rc) {
2406 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2407 		rc	  = established_get_idx(seq, pos);
2408 	}
2409 
2410 	return rc;
2411 }
2412 
2413 static void *tcp_seek_last_pos(struct seq_file *seq)
2414 {
2415 	struct tcp_iter_state *st = seq->private;
2416 	int offset = st->offset;
2417 	int orig_num = st->num;
2418 	void *rc = NULL;
2419 
2420 	switch (st->state) {
2421 	case TCP_SEQ_STATE_LISTENING:
2422 		if (st->bucket >= INET_LHTABLE_SIZE)
2423 			break;
2424 		st->state = TCP_SEQ_STATE_LISTENING;
2425 		rc = listening_get_next(seq, NULL);
2426 		while (offset-- && rc)
2427 			rc = listening_get_next(seq, rc);
2428 		if (rc)
2429 			break;
2430 		st->bucket = 0;
2431 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2432 		fallthrough;
2433 	case TCP_SEQ_STATE_ESTABLISHED:
2434 		if (st->bucket > tcp_hashinfo.ehash_mask)
2435 			break;
2436 		rc = established_get_first(seq);
2437 		while (offset-- && rc)
2438 			rc = established_get_next(seq, rc);
2439 	}
2440 
2441 	st->num = orig_num;
2442 
2443 	return rc;
2444 }
2445 
2446 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2447 {
2448 	struct tcp_iter_state *st = seq->private;
2449 	void *rc;
2450 
2451 	if (*pos && *pos == st->last_pos) {
2452 		rc = tcp_seek_last_pos(seq);
2453 		if (rc)
2454 			goto out;
2455 	}
2456 
2457 	st->state = TCP_SEQ_STATE_LISTENING;
2458 	st->num = 0;
2459 	st->bucket = 0;
2460 	st->offset = 0;
2461 	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2462 
2463 out:
2464 	st->last_pos = *pos;
2465 	return rc;
2466 }
2467 EXPORT_SYMBOL(tcp_seq_start);
2468 
2469 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2470 {
2471 	struct tcp_iter_state *st = seq->private;
2472 	void *rc = NULL;
2473 
2474 	if (v == SEQ_START_TOKEN) {
2475 		rc = tcp_get_idx(seq, 0);
2476 		goto out;
2477 	}
2478 
2479 	switch (st->state) {
2480 	case TCP_SEQ_STATE_LISTENING:
2481 		rc = listening_get_next(seq, v);
2482 		if (!rc) {
2483 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2484 			st->bucket = 0;
2485 			st->offset = 0;
2486 			rc	  = established_get_first(seq);
2487 		}
2488 		break;
2489 	case TCP_SEQ_STATE_ESTABLISHED:
2490 		rc = established_get_next(seq, v);
2491 		break;
2492 	}
2493 out:
2494 	++*pos;
2495 	st->last_pos = *pos;
2496 	return rc;
2497 }
2498 EXPORT_SYMBOL(tcp_seq_next);
2499 
2500 void tcp_seq_stop(struct seq_file *seq, void *v)
2501 {
2502 	struct tcp_iter_state *st = seq->private;
2503 
2504 	switch (st->state) {
2505 	case TCP_SEQ_STATE_LISTENING:
2506 		if (v != SEQ_START_TOKEN)
2507 			spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
2508 		break;
2509 	case TCP_SEQ_STATE_ESTABLISHED:
2510 		if (v)
2511 			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2512 		break;
2513 	}
2514 }
2515 EXPORT_SYMBOL(tcp_seq_stop);
2516 
2517 static void get_openreq4(const struct request_sock *req,
2518 			 struct seq_file *f, int i)
2519 {
2520 	const struct inet_request_sock *ireq = inet_rsk(req);
2521 	long delta = req->rsk_timer.expires - jiffies;
2522 
2523 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2524 		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2525 		i,
2526 		ireq->ir_loc_addr,
2527 		ireq->ir_num,
2528 		ireq->ir_rmt_addr,
2529 		ntohs(ireq->ir_rmt_port),
2530 		TCP_SYN_RECV,
2531 		0, 0, /* could print option size, but that is af dependent. */
2532 		1,    /* timers active (only the expire timer) */
2533 		jiffies_delta_to_clock_t(delta),
2534 		req->num_timeout,
2535 		from_kuid_munged(seq_user_ns(f),
2536 				 sock_i_uid(req->rsk_listener)),
2537 		0,  /* non standard timer */
2538 		0, /* open_requests have no inode */
2539 		0,
2540 		req);
2541 }
2542 
2543 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2544 {
2545 	int timer_active;
2546 	unsigned long timer_expires;
2547 	const struct tcp_sock *tp = tcp_sk(sk);
2548 	const struct inet_connection_sock *icsk = inet_csk(sk);
2549 	const struct inet_sock *inet = inet_sk(sk);
2550 	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2551 	__be32 dest = inet->inet_daddr;
2552 	__be32 src = inet->inet_rcv_saddr;
2553 	__u16 destp = ntohs(inet->inet_dport);
2554 	__u16 srcp = ntohs(inet->inet_sport);
2555 	int rx_queue;
2556 	int state;
2557 
2558 	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2559 	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2560 	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2561 		timer_active	= 1;
2562 		timer_expires	= icsk->icsk_timeout;
2563 	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2564 		timer_active	= 4;
2565 		timer_expires	= icsk->icsk_timeout;
2566 	} else if (timer_pending(&sk->sk_timer)) {
2567 		timer_active	= 2;
2568 		timer_expires	= sk->sk_timer.expires;
2569 	} else {
2570 		timer_active	= 0;
2571 		timer_expires = jiffies;
2572 	}
2573 
2574 	state = inet_sk_state_load(sk);
2575 	if (state == TCP_LISTEN)
2576 		rx_queue = READ_ONCE(sk->sk_ack_backlog);
2577 	else
2578 		/* Because we don't lock the socket,
2579 		 * we might find a transient negative value.
2580 		 */
2581 		rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2582 				      READ_ONCE(tp->copied_seq), 0);
2583 
2584 	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2585 			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2586 		i, src, srcp, dest, destp, state,
2587 		READ_ONCE(tp->write_seq) - tp->snd_una,
2588 		rx_queue,
2589 		timer_active,
2590 		jiffies_delta_to_clock_t(timer_expires - jiffies),
2591 		icsk->icsk_retransmits,
2592 		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2593 		icsk->icsk_probes_out,
2594 		sock_i_ino(sk),
2595 		refcount_read(&sk->sk_refcnt), sk,
2596 		jiffies_to_clock_t(icsk->icsk_rto),
2597 		jiffies_to_clock_t(icsk->icsk_ack.ato),
2598 		(icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2599 		tp->snd_cwnd,
2600 		state == TCP_LISTEN ?
2601 		    fastopenq->max_qlen :
2602 		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2603 }
2604 
2605 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2606 			       struct seq_file *f, int i)
2607 {
2608 	long delta = tw->tw_timer.expires - jiffies;
2609 	__be32 dest, src;
2610 	__u16 destp, srcp;
2611 
2612 	dest  = tw->tw_daddr;
2613 	src   = tw->tw_rcv_saddr;
2614 	destp = ntohs(tw->tw_dport);
2615 	srcp  = ntohs(tw->tw_sport);
2616 
2617 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2618 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2619 		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2620 		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2621 		refcount_read(&tw->tw_refcnt), tw);
2622 }
2623 
2624 #define TMPSZ 150
2625 
2626 static int tcp4_seq_show(struct seq_file *seq, void *v)
2627 {
2628 	struct tcp_iter_state *st;
2629 	struct sock *sk = v;
2630 
2631 	seq_setwidth(seq, TMPSZ - 1);
2632 	if (v == SEQ_START_TOKEN) {
2633 		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2634 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2635 			   "inode");
2636 		goto out;
2637 	}
2638 	st = seq->private;
2639 
2640 	if (sk->sk_state == TCP_TIME_WAIT)
2641 		get_timewait4_sock(v, seq, st->num);
2642 	else if (sk->sk_state == TCP_NEW_SYN_RECV)
2643 		get_openreq4(v, seq, st->num);
2644 	else
2645 		get_tcp4_sock(v, seq, st->num);
2646 out:
2647 	seq_pad(seq, '\n');
2648 	return 0;
2649 }
2650 
2651 #ifdef CONFIG_BPF_SYSCALL
2652 struct bpf_iter__tcp {
2653 	__bpf_md_ptr(struct bpf_iter_meta *, meta);
2654 	__bpf_md_ptr(struct sock_common *, sk_common);
2655 	uid_t uid __aligned(8);
2656 };
2657 
2658 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
2659 			     struct sock_common *sk_common, uid_t uid)
2660 {
2661 	struct bpf_iter__tcp ctx;
2662 
2663 	meta->seq_num--;  /* skip SEQ_START_TOKEN */
2664 	ctx.meta = meta;
2665 	ctx.sk_common = sk_common;
2666 	ctx.uid = uid;
2667 	return bpf_iter_run_prog(prog, &ctx);
2668 }
2669 
2670 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
2671 {
2672 	struct bpf_iter_meta meta;
2673 	struct bpf_prog *prog;
2674 	struct sock *sk = v;
2675 	uid_t uid;
2676 
2677 	if (v == SEQ_START_TOKEN)
2678 		return 0;
2679 
2680 	if (sk->sk_state == TCP_TIME_WAIT) {
2681 		uid = 0;
2682 	} else if (sk->sk_state == TCP_NEW_SYN_RECV) {
2683 		const struct request_sock *req = v;
2684 
2685 		uid = from_kuid_munged(seq_user_ns(seq),
2686 				       sock_i_uid(req->rsk_listener));
2687 	} else {
2688 		uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
2689 	}
2690 
2691 	meta.seq = seq;
2692 	prog = bpf_iter_get_info(&meta, false);
2693 	return tcp_prog_seq_show(prog, &meta, v, uid);
2694 }
2695 
2696 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
2697 {
2698 	struct bpf_iter_meta meta;
2699 	struct bpf_prog *prog;
2700 
2701 	if (!v) {
2702 		meta.seq = seq;
2703 		prog = bpf_iter_get_info(&meta, true);
2704 		if (prog)
2705 			(void)tcp_prog_seq_show(prog, &meta, v, 0);
2706 	}
2707 
2708 	tcp_seq_stop(seq, v);
2709 }
2710 
2711 static const struct seq_operations bpf_iter_tcp_seq_ops = {
2712 	.show		= bpf_iter_tcp_seq_show,
2713 	.start		= tcp_seq_start,
2714 	.next		= tcp_seq_next,
2715 	.stop		= bpf_iter_tcp_seq_stop,
2716 };
2717 #endif
2718 
2719 static const struct seq_operations tcp4_seq_ops = {
2720 	.show		= tcp4_seq_show,
2721 	.start		= tcp_seq_start,
2722 	.next		= tcp_seq_next,
2723 	.stop		= tcp_seq_stop,
2724 };
2725 
2726 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2727 	.family		= AF_INET,
2728 };
2729 
2730 static int __net_init tcp4_proc_init_net(struct net *net)
2731 {
2732 	if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
2733 			sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
2734 		return -ENOMEM;
2735 	return 0;
2736 }
2737 
2738 static void __net_exit tcp4_proc_exit_net(struct net *net)
2739 {
2740 	remove_proc_entry("tcp", net->proc_net);
2741 }
2742 
2743 static struct pernet_operations tcp4_net_ops = {
2744 	.init = tcp4_proc_init_net,
2745 	.exit = tcp4_proc_exit_net,
2746 };
2747 
2748 int __init tcp4_proc_init(void)
2749 {
2750 	return register_pernet_subsys(&tcp4_net_ops);
2751 }
2752 
2753 void tcp4_proc_exit(void)
2754 {
2755 	unregister_pernet_subsys(&tcp4_net_ops);
2756 }
2757 #endif /* CONFIG_PROC_FS */
2758 
2759 struct proto tcp_prot = {
2760 	.name			= "TCP",
2761 	.owner			= THIS_MODULE,
2762 	.close			= tcp_close,
2763 	.pre_connect		= tcp_v4_pre_connect,
2764 	.connect		= tcp_v4_connect,
2765 	.disconnect		= tcp_disconnect,
2766 	.accept			= inet_csk_accept,
2767 	.ioctl			= tcp_ioctl,
2768 	.init			= tcp_v4_init_sock,
2769 	.destroy		= tcp_v4_destroy_sock,
2770 	.shutdown		= tcp_shutdown,
2771 	.setsockopt		= tcp_setsockopt,
2772 	.getsockopt		= tcp_getsockopt,
2773 	.keepalive		= tcp_set_keepalive,
2774 	.recvmsg		= tcp_recvmsg,
2775 	.sendmsg		= tcp_sendmsg,
2776 	.sendpage		= tcp_sendpage,
2777 	.backlog_rcv		= tcp_v4_do_rcv,
2778 	.release_cb		= tcp_release_cb,
2779 	.hash			= inet_hash,
2780 	.unhash			= inet_unhash,
2781 	.get_port		= inet_csk_get_port,
2782 	.enter_memory_pressure	= tcp_enter_memory_pressure,
2783 	.leave_memory_pressure	= tcp_leave_memory_pressure,
2784 	.stream_memory_free	= tcp_stream_memory_free,
2785 	.sockets_allocated	= &tcp_sockets_allocated,
2786 	.orphan_count		= &tcp_orphan_count,
2787 	.memory_allocated	= &tcp_memory_allocated,
2788 	.memory_pressure	= &tcp_memory_pressure,
2789 	.sysctl_mem		= sysctl_tcp_mem,
2790 	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
2791 	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
2792 	.max_header		= MAX_TCP_HEADER,
2793 	.obj_size		= sizeof(struct tcp_sock),
2794 	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
2795 	.twsk_prot		= &tcp_timewait_sock_ops,
2796 	.rsk_prot		= &tcp_request_sock_ops,
2797 	.h.hashinfo		= &tcp_hashinfo,
2798 	.no_autobind		= true,
2799 	.diag_destroy		= tcp_abort,
2800 };
2801 EXPORT_SYMBOL(tcp_prot);
2802 
2803 static void __net_exit tcp_sk_exit(struct net *net)
2804 {
2805 	int cpu;
2806 
2807 	if (net->ipv4.tcp_congestion_control)
2808 		bpf_module_put(net->ipv4.tcp_congestion_control,
2809 			       net->ipv4.tcp_congestion_control->owner);
2810 
2811 	for_each_possible_cpu(cpu)
2812 		inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2813 	free_percpu(net->ipv4.tcp_sk);
2814 }
2815 
2816 static int __net_init tcp_sk_init(struct net *net)
2817 {
2818 	int res, cpu, cnt;
2819 
2820 	net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2821 	if (!net->ipv4.tcp_sk)
2822 		return -ENOMEM;
2823 
2824 	for_each_possible_cpu(cpu) {
2825 		struct sock *sk;
2826 
2827 		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2828 					   IPPROTO_TCP, net);
2829 		if (res)
2830 			goto fail;
2831 		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2832 
2833 		/* Please enforce IP_DF and IPID==0 for RST and
2834 		 * ACK sent in SYN-RECV and TIME-WAIT state.
2835 		 */
2836 		inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
2837 
2838 		*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2839 	}
2840 
2841 	net->ipv4.sysctl_tcp_ecn = 2;
2842 	net->ipv4.sysctl_tcp_ecn_fallback = 1;
2843 
2844 	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2845 	net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
2846 	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2847 	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2848 	net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
2849 
2850 	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2851 	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2852 	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2853 
2854 	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2855 	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2856 	net->ipv4.sysctl_tcp_syncookies = 1;
2857 	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2858 	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2859 	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2860 	net->ipv4.sysctl_tcp_orphan_retries = 0;
2861 	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2862 	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
2863 	net->ipv4.sysctl_tcp_tw_reuse = 2;
2864 	net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
2865 
2866 	cnt = tcp_hashinfo.ehash_mask + 1;
2867 	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
2868 	net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
2869 
2870 	net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
2871 	net->ipv4.sysctl_tcp_sack = 1;
2872 	net->ipv4.sysctl_tcp_window_scaling = 1;
2873 	net->ipv4.sysctl_tcp_timestamps = 1;
2874 	net->ipv4.sysctl_tcp_early_retrans = 3;
2875 	net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
2876 	net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
2877 	net->ipv4.sysctl_tcp_retrans_collapse = 1;
2878 	net->ipv4.sysctl_tcp_max_reordering = 300;
2879 	net->ipv4.sysctl_tcp_dsack = 1;
2880 	net->ipv4.sysctl_tcp_app_win = 31;
2881 	net->ipv4.sysctl_tcp_adv_win_scale = 1;
2882 	net->ipv4.sysctl_tcp_frto = 2;
2883 	net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
2884 	/* This limits the percentage of the congestion window which we
2885 	 * will allow a single TSO frame to consume.  Building TSO frames
2886 	 * which are too large can cause TCP streams to be bursty.
2887 	 */
2888 	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
2889 	/* Default TSQ limit of 16 TSO segments */
2890 	net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
2891 	/* rfc5961 challenge ack rate limiting */
2892 	net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
2893 	net->ipv4.sysctl_tcp_min_tso_segs = 2;
2894 	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
2895 	net->ipv4.sysctl_tcp_autocorking = 1;
2896 	net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
2897 	net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
2898 	net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
2899 	if (net != &init_net) {
2900 		memcpy(net->ipv4.sysctl_tcp_rmem,
2901 		       init_net.ipv4.sysctl_tcp_rmem,
2902 		       sizeof(init_net.ipv4.sysctl_tcp_rmem));
2903 		memcpy(net->ipv4.sysctl_tcp_wmem,
2904 		       init_net.ipv4.sysctl_tcp_wmem,
2905 		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
2906 	}
2907 	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
2908 	net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
2909 	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
2910 	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
2911 	spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
2912 	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
2913 	atomic_set(&net->ipv4.tfo_active_disable_times, 0);
2914 
2915 	/* Reno is always built in */
2916 	if (!net_eq(net, &init_net) &&
2917 	    bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
2918 			       init_net.ipv4.tcp_congestion_control->owner))
2919 		net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
2920 	else
2921 		net->ipv4.tcp_congestion_control = &tcp_reno;
2922 
2923 	return 0;
2924 fail:
2925 	tcp_sk_exit(net);
2926 
2927 	return res;
2928 }
2929 
2930 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2931 {
2932 	struct net *net;
2933 
2934 	inet_twsk_purge(&tcp_hashinfo, AF_INET);
2935 
2936 	list_for_each_entry(net, net_exit_list, exit_list)
2937 		tcp_fastopen_ctx_destroy(net);
2938 }
2939 
2940 static struct pernet_operations __net_initdata tcp_sk_ops = {
2941        .init	   = tcp_sk_init,
2942        .exit	   = tcp_sk_exit,
2943        .exit_batch = tcp_sk_exit_batch,
2944 };
2945 
2946 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
2947 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
2948 		     struct sock_common *sk_common, uid_t uid)
2949 
2950 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
2951 {
2952 	struct tcp_iter_state *st = priv_data;
2953 	struct tcp_seq_afinfo *afinfo;
2954 	int ret;
2955 
2956 	afinfo = kmalloc(sizeof(*afinfo), GFP_USER | __GFP_NOWARN);
2957 	if (!afinfo)
2958 		return -ENOMEM;
2959 
2960 	afinfo->family = AF_UNSPEC;
2961 	st->bpf_seq_afinfo = afinfo;
2962 	ret = bpf_iter_init_seq_net(priv_data, aux);
2963 	if (ret)
2964 		kfree(afinfo);
2965 	return ret;
2966 }
2967 
2968 static void bpf_iter_fini_tcp(void *priv_data)
2969 {
2970 	struct tcp_iter_state *st = priv_data;
2971 
2972 	kfree(st->bpf_seq_afinfo);
2973 	bpf_iter_fini_seq_net(priv_data);
2974 }
2975 
2976 static const struct bpf_iter_seq_info tcp_seq_info = {
2977 	.seq_ops		= &bpf_iter_tcp_seq_ops,
2978 	.init_seq_private	= bpf_iter_init_tcp,
2979 	.fini_seq_private	= bpf_iter_fini_tcp,
2980 	.seq_priv_size		= sizeof(struct tcp_iter_state),
2981 };
2982 
2983 static struct bpf_iter_reg tcp_reg_info = {
2984 	.target			= "tcp",
2985 	.ctx_arg_info_size	= 1,
2986 	.ctx_arg_info		= {
2987 		{ offsetof(struct bpf_iter__tcp, sk_common),
2988 		  PTR_TO_BTF_ID_OR_NULL },
2989 	},
2990 	.seq_info		= &tcp_seq_info,
2991 };
2992 
2993 static void __init bpf_iter_register(void)
2994 {
2995 	tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
2996 	if (bpf_iter_reg_target(&tcp_reg_info))
2997 		pr_warn("Warning: could not register bpf iterator tcp\n");
2998 }
2999 
3000 #endif
3001 
3002 void __init tcp_v4_init(void)
3003 {
3004 	if (register_pernet_subsys(&tcp_sk_ops))
3005 		panic("Failed to create the TCP control socket.\n");
3006 
3007 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3008 	bpf_iter_register();
3009 #endif
3010 }
3011