xref: /openbmc/linux/net/ipv4/tcp_ipv4.c (revision 99580ae8)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		Implementation of the Transmission Control Protocol(TCP).
8  *
9  *		IPv4 specific functions
10  *
11  *		code split from:
12  *		linux/ipv4/tcp.c
13  *		linux/ipv4/tcp_input.c
14  *		linux/ipv4/tcp_output.c
15  *
16  *		See tcp.c for author information
17  */
18 
19 /*
20  * Changes:
21  *		David S. Miller	:	New socket lookup architecture.
22  *					This code is dedicated to John Dyson.
23  *		David S. Miller :	Change semantics of established hash,
24  *					half is devoted to TIME_WAIT sockets
25  *					and the rest go in the other half.
26  *		Andi Kleen :		Add support for syncookies and fixed
27  *					some bugs: ip options weren't passed to
28  *					the TCP layer, missed a check for an
29  *					ACK bit.
30  *		Andi Kleen :		Implemented fast path mtu discovery.
31  *	     				Fixed many serious bugs in the
32  *					request_sock handling and moved
33  *					most of it into the af independent code.
34  *					Added tail drop and some other bugfixes.
35  *					Added new listen semantics.
36  *		Mike McLagan	:	Routing by source
37  *	Juan Jose Ciarlante:		ip_dynaddr bits
38  *		Andi Kleen:		various fixes.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after a
 *					year-long coma.
41  *	Andi Kleen		:	Fix new listen.
42  *	Andi Kleen		:	Fix accept error reporting.
43  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
 *	Alexey Kuznetsov		allows both IPv4 and IPv6 sockets to bind
45  *					a single port at the same time.
46  */
47 
48 #define pr_fmt(fmt) "TCP: " fmt
49 
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
60 #include <linux/sched.h>
61 
62 #include <net/net_namespace.h>
63 #include <net/icmp.h>
64 #include <net/inet_hashtables.h>
65 #include <net/tcp.h>
66 #include <net/transp_v6.h>
67 #include <net/ipv6.h>
68 #include <net/inet_common.h>
69 #include <net/timewait_sock.h>
70 #include <net/xfrm.h>
71 #include <net/secure_seq.h>
72 #include <net/busy_poll.h>
73 
74 #include <linux/inet.h>
75 #include <linux/ipv6.h>
76 #include <linux/stddef.h>
77 #include <linux/proc_fs.h>
78 #include <linux/seq_file.h>
79 #include <linux/inetdevice.h>
80 #include <linux/btf_ids.h>
81 
82 #include <crypto/hash.h>
83 #include <linux/scatterlist.h>
84 
85 #include <trace/events/tcp.h>
86 
87 #ifdef CONFIG_TCP_MD5SIG
88 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
89 			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
90 #endif
91 
92 struct inet_hashinfo tcp_hashinfo;
93 EXPORT_SYMBOL(tcp_hashinfo);
94 
95 static DEFINE_PER_CPU(struct sock *, ipv4_tcp_sk);
96 
97 static DEFINE_MUTEX(tcp_exit_batch_mutex);
98 
99 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
100 {
101 	return secure_tcp_seq(ip_hdr(skb)->daddr,
102 			      ip_hdr(skb)->saddr,
103 			      tcp_hdr(skb)->dest,
104 			      tcp_hdr(skb)->source);
105 }
106 
107 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
108 {
109 	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
110 }
111 
112 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
113 {
114 	int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
115 	const struct inet_timewait_sock *tw = inet_twsk(sktw);
116 	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
117 	struct tcp_sock *tp = tcp_sk(sk);
118 
119 	if (reuse == 2) {
120 		/* Still does not detect *everything* that goes through
121 		 * lo, since we require a loopback src or dst address
122 		 * or direct binding to 'lo' interface.
123 		 */
124 		bool loopback = false;
125 		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
126 			loopback = true;
127 #if IS_ENABLED(CONFIG_IPV6)
128 		if (tw->tw_family == AF_INET6) {
129 			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
130 			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
131 			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
132 			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
133 				loopback = true;
134 		} else
135 #endif
136 		{
137 			if (ipv4_is_loopback(tw->tw_daddr) ||
138 			    ipv4_is_loopback(tw->tw_rcv_saddr))
139 				loopback = true;
140 		}
141 		if (!loopback)
142 			reuse = 0;
143 	}
144 
145 	/* With PAWS, it is safe from the viewpoint
146 	   of data integrity. Even without PAWS it is safe provided sequence
147 	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.
148 
149 	   Actually, the idea is close to VJ's: only the timestamp cache is
150 	   held not per host but per port pair, and the TW bucket is used as the
151 	   state holder.
152 
153 	   If the TW bucket has already been destroyed we fall back to VJ's
154 	   scheme and use the initial timestamp retrieved from the peer table.
155 	 */
156 	if (tcptw->tw_ts_recent_stamp &&
157 	    (!twp || (reuse && time_after32(ktime_get_seconds(),
158 					    tcptw->tw_ts_recent_stamp)))) {
159 		/* inet_twsk_hashdance() sets sk_refcnt after putting twsk
160 		 * and releasing the bucket lock.
161 		 */
162 		if (unlikely(!refcount_inc_not_zero(&sktw->sk_refcnt)))
163 			return 0;
164 
165 		/* In case of repair and re-using TIME-WAIT sockets we still
166 		 * want to be sure that it is safe as above but honor the
167 		 * sequence numbers and time stamps set as part of the repair
168 		 * process.
169 		 *
170 		 * Without this check re-using a TIME-WAIT socket with TCP
171 		 * repair would accumulate a -1 on the repair assigned
172 		 * sequence number. The first time it is reused the sequence
173 		 * is -1, the second time -2, etc. This fixes that issue
174 		 * without appearing to create any others.
175 		 */
176 		if (likely(!tp->repair)) {
177 			u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
178 
179 			if (!seq)
180 				seq = 1;
181 			WRITE_ONCE(tp->write_seq, seq);
182 			tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
183 			tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
184 		}
185 
186 		return 1;
187 	}
188 
189 	return 0;
190 }
191 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
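/* Illustrative summary of the net.ipv4.tcp_tw_reuse values handled above
 * (Documentation/networking/ip-sysctl.rst is authoritative):
 *
 *	0 - never reuse TIME-WAIT sockets for new outgoing connections
 *	1 - reuse when it is considered safe (the timestamp checks above)
 *	2 - like 1, but only for loopback traffic (the reuse == 2 branch)
 */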
192 
193 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
194 			      int addr_len)
195 {
196 	/* This check is replicated from tcp_v4_connect() and intended to
197 	 * prevent BPF program called below from accessing bytes that are out
198 	 * of the bound specified by user in addr_len.
199 	 */
200 	if (addr_len < sizeof(struct sockaddr_in))
201 		return -EINVAL;
202 
203 	sock_owned_by_me(sk);
204 
205 	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr, &addr_len);
206 }
207 
208 /* This will initiate an outgoing connection. */
209 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
210 {
211 	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
212 	struct inet_timewait_death_row *tcp_death_row;
213 	struct inet_sock *inet = inet_sk(sk);
214 	struct tcp_sock *tp = tcp_sk(sk);
215 	struct ip_options_rcu *inet_opt;
216 	struct net *net = sock_net(sk);
217 	__be16 orig_sport, orig_dport;
218 	__be32 daddr, nexthop;
219 	struct flowi4 *fl4;
220 	struct rtable *rt;
221 	int err;
222 
223 	if (addr_len < sizeof(struct sockaddr_in))
224 		return -EINVAL;
225 
226 	if (usin->sin_family != AF_INET)
227 		return -EAFNOSUPPORT;
228 
229 	nexthop = daddr = usin->sin_addr.s_addr;
230 	inet_opt = rcu_dereference_protected(inet->inet_opt,
231 					     lockdep_sock_is_held(sk));
232 	if (inet_opt && inet_opt->opt.srr) {
233 		if (!daddr)
234 			return -EINVAL;
235 		nexthop = inet_opt->opt.faddr;
236 	}
237 
238 	orig_sport = inet->inet_sport;
239 	orig_dport = usin->sin_port;
240 	fl4 = &inet->cork.fl.u.ip4;
241 	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
242 			      sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
243 			      orig_dport, sk);
244 	if (IS_ERR(rt)) {
245 		err = PTR_ERR(rt);
246 		if (err == -ENETUNREACH)
247 			IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
248 		return err;
249 	}
250 
251 	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
252 		ip_rt_put(rt);
253 		return -ENETUNREACH;
254 	}
255 
256 	if (!inet_opt || !inet_opt->opt.srr)
257 		daddr = fl4->daddr;
258 
259 	tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
260 
261 	if (!inet->inet_saddr) {
262 		err = inet_bhash2_update_saddr(sk,  &fl4->saddr, AF_INET);
263 		if (err) {
264 			ip_rt_put(rt);
265 			return err;
266 		}
267 	} else {
268 		sk_rcv_saddr_set(sk, inet->inet_saddr);
269 	}
270 
271 	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
272 		/* Reset inherited state */
273 		tp->rx_opt.ts_recent	   = 0;
274 		tp->rx_opt.ts_recent_stamp = 0;
275 		if (likely(!tp->repair))
276 			WRITE_ONCE(tp->write_seq, 0);
277 	}
278 
279 	inet->inet_dport = usin->sin_port;
280 	sk_daddr_set(sk, daddr);
281 
282 	inet_csk(sk)->icsk_ext_hdr_len = 0;
283 	if (inet_opt)
284 		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
285 
286 	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
287 
288 	/* Socket identity is still unknown (sport may be zero).
289 	 * However we set state to SYN-SENT and, without releasing the socket
290 	 * lock, select a source port, enter ourselves into the hash tables and
291 	 * complete initialization after this.
292 	 */
293 	tcp_set_state(sk, TCP_SYN_SENT);
294 	err = inet_hash_connect(tcp_death_row, sk);
295 	if (err)
296 		goto failure;
297 
298 	sk_set_txhash(sk);
299 
300 	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
301 			       inet->inet_sport, inet->inet_dport, sk);
302 	if (IS_ERR(rt)) {
303 		err = PTR_ERR(rt);
304 		rt = NULL;
305 		goto failure;
306 	}
307 	/* OK, now commit destination to socket.  */
308 	sk->sk_gso_type = SKB_GSO_TCPV4;
309 	sk_setup_caps(sk, &rt->dst);
310 	rt = NULL;
311 
312 	if (likely(!tp->repair)) {
313 		if (!tp->write_seq)
314 			WRITE_ONCE(tp->write_seq,
315 				   secure_tcp_seq(inet->inet_saddr,
316 						  inet->inet_daddr,
317 						  inet->inet_sport,
318 						  usin->sin_port));
319 		WRITE_ONCE(tp->tsoffset,
320 			   secure_tcp_ts_off(net, inet->inet_saddr,
321 					     inet->inet_daddr));
322 	}
323 
324 	atomic_set(&inet->inet_id, get_random_u16());
325 
326 	if (tcp_fastopen_defer_connect(sk, &err))
327 		return err;
328 	if (err)
329 		goto failure;
330 
331 	err = tcp_connect(sk);
332 
333 	if (err)
334 		goto failure;
335 
336 	return 0;
337 
338 failure:
339 	/*
340 	 * This unhashes the socket and releases the local port,
341 	 * if necessary.
342 	 */
343 	tcp_set_state(sk, TCP_CLOSE);
344 	inet_bhash2_reset_saddr(sk);
345 	ip_rt_put(rt);
346 	sk->sk_route_caps = 0;
347 	inet->inet_dport = 0;
348 	return err;
349 }
350 EXPORT_SYMBOL(tcp_v4_connect);
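/* Rough call path sketch (details may differ between kernel versions):
 * a userspace connect(2) on an AF_INET/SOCK_STREAM socket reaches
 * inet_stream_connect() -> __inet_stream_connect(), which invokes
 * sk->sk_prot->connect, wired to tcp_v4_connect() via tcp_prot, with
 * tcp_v4_pre_connect() run first for the BPF cgroup hook.
 */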
351 
352 /*
353  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
354  * It can be called through tcp_release_cb() if socket was owned by user
355  * at the time tcp_v4_err() was called to handle ICMP message.
356  */
357 void tcp_v4_mtu_reduced(struct sock *sk)
358 {
359 	struct inet_sock *inet = inet_sk(sk);
360 	struct dst_entry *dst;
361 	u32 mtu;
362 
363 	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
364 		return;
365 	mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
366 	dst = inet_csk_update_pmtu(sk, mtu);
367 	if (!dst)
368 		return;
369 
370 	/* Something is about to go wrong... Remember the soft error
371 	 * in case this connection is not able to recover.
372 	 */
373 	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
374 		WRITE_ONCE(sk->sk_err_soft, EMSGSIZE);
375 
376 	mtu = dst_mtu(dst);
377 
378 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
379 	    ip_sk_accept_pmtu(sk) &&
380 	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
381 		tcp_sync_mss(sk, mtu);
382 
383 		/* Resend the TCP packet because it's
384 		 * clear that the old packet has been
385 		 * dropped. This is the new "fast" path mtu
386 		 * discovery.
387 		 */
388 		tcp_simple_retransmit(sk);
389 	} /* else let the usual retransmit timer handle it */
390 }
391 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
392 
393 static void do_redirect(struct sk_buff *skb, struct sock *sk)
394 {
395 	struct dst_entry *dst = __sk_dst_check(sk, 0);
396 
397 	if (dst)
398 		dst->ops->redirect(dst, sk, skb);
399 }
400 
401 
402 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
403 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
404 {
405 	struct request_sock *req = inet_reqsk(sk);
406 	struct net *net = sock_net(sk);
407 
408 	/* ICMPs are not backlogged, hence we cannot get
409 	 * an established socket here.
410 	 */
411 	if (seq != tcp_rsk(req)->snt_isn) {
412 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
413 	} else if (abort) {
414 		/*
415 		 * Still in SYN_RECV, just remove it silently.
416 		 * There is no good way to pass the error to the newly
417 		 * created socket, and POSIX does not want network
418 		 * errors returned from accept().
419 		 */
420 		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
421 		tcp_listendrop(req->rsk_listener);
422 	}
423 	reqsk_put(req);
424 }
425 EXPORT_SYMBOL(tcp_req_err);
426 
427 /* TCP-LD (RFC 6069) logic */
428 void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
429 {
430 	struct inet_connection_sock *icsk = inet_csk(sk);
431 	struct tcp_sock *tp = tcp_sk(sk);
432 	struct sk_buff *skb;
433 	s32 remaining;
434 	u32 delta_us;
435 
436 	if (sock_owned_by_user(sk))
437 		return;
438 
439 	if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
440 	    !icsk->icsk_backoff)
441 		return;
442 
443 	skb = tcp_rtx_queue_head(sk);
444 	if (WARN_ON_ONCE(!skb))
445 		return;
446 
447 	icsk->icsk_backoff--;
448 	icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
449 	icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
450 
451 	tcp_mstamp_refresh(tp);
452 	delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
453 	remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
454 
455 	if (remaining > 0) {
456 		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
457 					  remaining, TCP_RTO_MAX);
458 	} else {
459 		/* RTO revert clocked out retransmission.
460 		 * Will retransmit now.
461 		 */
462 		tcp_retransmit_timer(sk);
463 	}
464 }
465 EXPORT_SYMBOL(tcp_ld_RTO_revert);
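/* Worked example of the revert arithmetic above, with purely illustrative
 * numbers: if srtt yields a base RTO of 200 ms and icsk_backoff was 3,
 * decrementing the backoff leaves an effective RTO of 200 ms << 2 = 800 ms.
 * With 300 ms already elapsed since the head-of-queue skb was sent, the
 * retransmit timer is re-armed for the remaining 500 ms; had 800 ms or more
 * elapsed, tcp_retransmit_timer() would fire immediately instead.
 */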
466 
467 /*
468  * This routine is called by the ICMP module when it gets some
469  * sort of error condition.  If err < 0 then the socket should
470  * be closed and the error returned to the user.  If err > 0
471  * it's just the icmp type << 8 | icmp code.  After adjustment
472  * header points to the first 8 bytes of the tcp header.  We need
473  * to find the appropriate port.
474  *
475  * The locking strategy used here is very "optimistic". When
476  * someone else accesses the socket the ICMP is just dropped
477  * and for some paths there is no check at all.
478  * A more general error queue to queue errors for later handling
479  * is probably better.
480  *
481  */
482 
483 int tcp_v4_err(struct sk_buff *skb, u32 info)
484 {
485 	const struct iphdr *iph = (const struct iphdr *)skb->data;
486 	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
487 	struct tcp_sock *tp;
488 	const int type = icmp_hdr(skb)->type;
489 	const int code = icmp_hdr(skb)->code;
490 	struct sock *sk;
491 	struct request_sock *fastopen;
492 	u32 seq, snd_una;
493 	int err;
494 	struct net *net = dev_net(skb->dev);
495 
496 	sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
497 				       iph->daddr, th->dest, iph->saddr,
498 				       ntohs(th->source), inet_iif(skb), 0);
499 	if (!sk) {
500 		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
501 		return -ENOENT;
502 	}
503 	if (sk->sk_state == TCP_TIME_WAIT) {
504 		inet_twsk_put(inet_twsk(sk));
505 		return 0;
506 	}
507 	seq = ntohl(th->seq);
508 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
509 		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
510 				     type == ICMP_TIME_EXCEEDED ||
511 				     (type == ICMP_DEST_UNREACH &&
512 				      (code == ICMP_NET_UNREACH ||
513 				       code == ICMP_HOST_UNREACH)));
514 		return 0;
515 	}
516 
517 	bh_lock_sock(sk);
518 	/* If too many ICMPs get dropped on busy
519 	 * servers this needs to be solved differently.
520 	 * We do take care of the PMTU discovery (RFC1191) special case:
521 	 * we can receive locally generated ICMP messages while the socket is held.
522 	 */
523 	if (sock_owned_by_user(sk)) {
524 		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
525 			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
526 	}
527 	if (sk->sk_state == TCP_CLOSE)
528 		goto out;
529 
530 	if (static_branch_unlikely(&ip4_min_ttl)) {
531 		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
532 		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
533 			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
534 			goto out;
535 		}
536 	}
537 
538 	tp = tcp_sk(sk);
539 	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
540 	fastopen = rcu_dereference(tp->fastopen_rsk);
541 	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
542 	if (sk->sk_state != TCP_LISTEN &&
543 	    !between(seq, snd_una, tp->snd_nxt)) {
544 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
545 		goto out;
546 	}
547 
548 	switch (type) {
549 	case ICMP_REDIRECT:
550 		if (!sock_owned_by_user(sk))
551 			do_redirect(skb, sk);
552 		goto out;
553 	case ICMP_SOURCE_QUENCH:
554 		/* Just silently ignore these. */
555 		goto out;
556 	case ICMP_PARAMETERPROB:
557 		err = EPROTO;
558 		break;
559 	case ICMP_DEST_UNREACH:
560 		if (code > NR_ICMP_UNREACH)
561 			goto out;
562 
563 		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
564 			/* We are not interested in TCP_LISTEN and open_requests
565 			 * (SYN-ACKs sent out by Linux are always < 576 bytes, so
566 			 * they should go through unfragmented).
567 			 */
568 			if (sk->sk_state == TCP_LISTEN)
569 				goto out;
570 
571 			WRITE_ONCE(tp->mtu_info, info);
572 			if (!sock_owned_by_user(sk)) {
573 				tcp_v4_mtu_reduced(sk);
574 			} else {
575 				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
576 					sock_hold(sk);
577 			}
578 			goto out;
579 		}
580 
581 		err = icmp_err_convert[code].errno;
582 		/* check if this ICMP message allows revert of backoff.
583 		 * (see RFC 6069)
584 		 */
585 		if (!fastopen &&
586 		    (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
587 			tcp_ld_RTO_revert(sk, seq);
588 		break;
589 	case ICMP_TIME_EXCEEDED:
590 		err = EHOSTUNREACH;
591 		break;
592 	default:
593 		goto out;
594 	}
595 
596 	switch (sk->sk_state) {
597 	case TCP_SYN_SENT:
598 	case TCP_SYN_RECV:
599 		/* Only in fast or simultaneous open. If a fast open socket is
600 		 * already accepted it is treated as a connected one below.
601 		 */
602 		if (fastopen && !fastopen->sk)
603 			break;
604 
605 		ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
606 
607 		if (!sock_owned_by_user(sk))
608 			tcp_done_with_error(sk, err);
609 		else
610 			WRITE_ONCE(sk->sk_err_soft, err);
611 		goto out;
612 	}
613 
614 	/* If we've already connected we will keep trying
615 	 * until we time out, or the user gives up.
616 	 *
617 	 * rfc1122 4.2.3.9 allows to consider as hard errors
618 	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
619 	 * but it is obsoleted by pmtu discovery).
620 	 *
621 	 * Note that in the modern internet, where routing is unreliable
622 	 * and broken firewalls sit in every dark corner sending random
623 	 * errors ordered by their masters, even these two messages finally lose
624 	 * their original sense (even Linux sends invalid PORT_UNREACHs)
625 	 *
626 	 * Now we are in compliance with RFCs.
627 	 *							--ANK (980905)
628 	 */
629 
630 	if (!sock_owned_by_user(sk) &&
631 	    inet_test_bit(RECVERR, sk)) {
632 		WRITE_ONCE(sk->sk_err, err);
633 		sk_error_report(sk);
634 	} else	{ /* Only an error on timeout */
635 		WRITE_ONCE(sk->sk_err_soft, err);
636 	}
637 
638 out:
639 	bh_unlock_sock(sk);
640 	sock_put(sk);
641 	return 0;
642 }
643 
644 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
645 {
646 	struct tcphdr *th = tcp_hdr(skb);
647 
648 	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
649 	skb->csum_start = skb_transport_header(skb) - skb->head;
650 	skb->csum_offset = offsetof(struct tcphdr, check);
651 }
652 
653 /* This routine computes an IPv4 TCP checksum. */
654 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
655 {
656 	const struct inet_sock *inet = inet_sk(sk);
657 
658 	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
659 }
660 EXPORT_SYMBOL(tcp_v4_send_check);
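/* Sketch of what the two helpers above set up (standard CHECKSUM_PARTIAL
 * offload, simplified): th->check is seeded with only the pseudo-header
 * sum, and csum_start/csum_offset tell the NIC - or skb_checksum_help()
 * when the device lacks the feature - where to fold in the checksum
 * computed over the TCP header and payload before transmission.
 */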
661 
662 /*
663  *	This routine will send an RST to the other tcp.
664  *
 *	Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
 *		      for the reset?
 *	Answer: if a packet caused an RST, it is not for a socket
 *		existing in our system; if it is matched to a socket,
 *		it is just a duplicate segment or a bug in the other side's TCP.
 *		So we build the reply based only on the parameters
 *		that arrived with the segment.
672  *	Exception: precedence violation. We do not implement it in any case.
673  */
674 
675 #ifdef CONFIG_TCP_MD5SIG
676 #define OPTION_BYTES TCPOLEN_MD5SIG_ALIGNED
677 #else
678 #define OPTION_BYTES sizeof(__be32)
679 #endif
680 
681 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
682 {
683 	const struct tcphdr *th = tcp_hdr(skb);
684 	struct {
685 		struct tcphdr th;
686 		__be32 opt[OPTION_BYTES / sizeof(__be32)];
687 	} rep;
688 	struct ip_reply_arg arg;
689 #ifdef CONFIG_TCP_MD5SIG
690 	struct tcp_md5sig_key *key = NULL;
691 	const __u8 *hash_location = NULL;
692 	unsigned char newhash[16];
693 	int genhash;
694 	struct sock *sk1 = NULL;
695 #endif
696 	u64 transmit_time = 0;
697 	struct sock *ctl_sk;
698 	struct net *net;
699 	u32 txhash = 0;
700 
701 	/* Never send a reset in response to a reset. */
702 	if (th->rst)
703 		return;
704 
705 	/* If sk not NULL, it means we did a successful lookup and incoming
706 	 * route had to be correct. prequeue might have dropped our dst.
707 	 */
708 	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
709 		return;
710 
711 	/* Swap the send and the receive. */
712 	memset(&rep, 0, sizeof(rep));
713 	rep.th.dest   = th->source;
714 	rep.th.source = th->dest;
715 	rep.th.doff   = sizeof(struct tcphdr) / 4;
716 	rep.th.rst    = 1;
717 
718 	if (th->ack) {
719 		rep.th.seq = th->ack_seq;
720 	} else {
721 		rep.th.ack = 1;
722 		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
723 				       skb->len - (th->doff << 2));
724 	}
725 
726 	memset(&arg, 0, sizeof(arg));
727 	arg.iov[0].iov_base = (unsigned char *)&rep;
728 	arg.iov[0].iov_len  = sizeof(rep.th);
729 
730 	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
731 #ifdef CONFIG_TCP_MD5SIG
732 	rcu_read_lock();
733 	hash_location = tcp_parse_md5sig_option(th);
734 	if (sk && sk_fullsock(sk)) {
735 		const union tcp_md5_addr *addr;
736 		int l3index;
737 
738 		/* sdif set, means packet ingressed via a device
739 		 * in an L3 domain and inet_iif is set to it.
740 		 */
741 		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
742 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
743 		key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
744 	} else if (hash_location) {
745 		const union tcp_md5_addr *addr;
746 		int sdif = tcp_v4_sdif(skb);
747 		int dif = inet_iif(skb);
748 		int l3index;
749 
750 		/*
751 		 * The active side is lost. Try to find the listening socket through
752 		 * the source port, and then find the md5 key through the listening socket.
753 		 * We do not lose security here:
754 		 * the incoming packet is checked against the md5 hash of the found key,
755 		 * and no RST is generated if the md5 hash doesn't match.
756 		 */
757 		sk1 = __inet_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo,
758 					     NULL, 0, ip_hdr(skb)->saddr,
759 					     th->source, ip_hdr(skb)->daddr,
760 					     ntohs(th->source), dif, sdif);
761 		/* don't send rst if it can't find key */
762 		if (!sk1)
763 			goto out;
764 
765 		/* sdif set, means packet ingressed via a device
766 		 * in an L3 domain and dif is set to it.
767 		 */
768 		l3index = sdif ? dif : 0;
769 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
770 		key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
771 		if (!key)
772 			goto out;
773 
774 
775 		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
776 		if (genhash || memcmp(hash_location, newhash, 16) != 0)
777 			goto out;
778 
779 	}
780 
781 	if (key) {
782 		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
783 				   (TCPOPT_NOP << 16) |
784 				   (TCPOPT_MD5SIG << 8) |
785 				   TCPOLEN_MD5SIG);
786 		/* Update length and the length the header thinks exists */
787 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
788 		rep.th.doff = arg.iov[0].iov_len / 4;
789 
790 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
791 				     key, ip_hdr(skb)->saddr,
792 				     ip_hdr(skb)->daddr, &rep.th);
793 	}
794 #endif
795 	/* Can't co-exist with TCPMD5, hence check rep.opt[0] */
796 	if (rep.opt[0] == 0) {
797 		__be32 mrst = mptcp_reset_option(skb);
798 
799 		if (mrst) {
800 			rep.opt[0] = mrst;
801 			arg.iov[0].iov_len += sizeof(mrst);
802 			rep.th.doff = arg.iov[0].iov_len / 4;
803 		}
804 	}
805 
806 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
807 				      ip_hdr(skb)->saddr, /* XXX */
808 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
809 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
810 	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
811 
812 	/* When the socket is gone, all binding information is lost.
813 	 * Routing might fail in this case. No choice here: if we force the
814 	 * input interface, we will misroute in case of an asymmetric route.
815 	 */
816 	if (sk) {
817 		arg.bound_dev_if = sk->sk_bound_dev_if;
818 		if (sk_fullsock(sk))
819 			trace_tcp_send_reset(sk, skb);
820 	}
821 
822 	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
823 		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));
824 
825 	arg.tos = ip_hdr(skb)->tos;
826 	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
827 	local_bh_disable();
828 	ctl_sk = this_cpu_read(ipv4_tcp_sk);
829 	sock_net_set(ctl_sk, net);
830 	if (sk) {
831 		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
832 				   inet_twsk(sk)->tw_mark : sk->sk_mark;
833 		ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
834 				   inet_twsk(sk)->tw_priority : sk->sk_priority;
835 		transmit_time = tcp_transmit_time(sk);
836 		xfrm_sk_clone_policy(ctl_sk, sk);
837 		txhash = (sk->sk_state == TCP_TIME_WAIT) ?
838 			 inet_twsk(sk)->tw_txhash : sk->sk_txhash;
839 	} else {
840 		ctl_sk->sk_mark = 0;
841 		ctl_sk->sk_priority = 0;
842 	}
843 	ip_send_unicast_reply(ctl_sk,
844 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
845 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
846 			      &arg, arg.iov[0].iov_len,
847 			      transmit_time, txhash);
848 
849 	xfrm_sk_free_policy(ctl_sk);
850 	sock_net_set(ctl_sk, &init_net);
851 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
852 	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
853 	local_bh_enable();
854 
855 #ifdef CONFIG_TCP_MD5SIG
856 out:
857 	rcu_read_unlock();
858 #endif
859 }
860 
861 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
862    outside of socket context, is certainly ugly. What can I do?
863  */
864 
865 static void tcp_v4_send_ack(const struct sock *sk,
866 			    struct sk_buff *skb, u32 seq, u32 ack,
867 			    u32 win, u32 tsval, u32 tsecr, int oif,
868 			    struct tcp_md5sig_key *key,
869 			    int reply_flags, u8 tos, u32 txhash)
870 {
871 	const struct tcphdr *th = tcp_hdr(skb);
872 	struct {
873 		struct tcphdr th;
874 		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
875 #ifdef CONFIG_TCP_MD5SIG
876 			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
877 #endif
878 			];
879 	} rep;
880 	struct net *net = sock_net(sk);
881 	struct ip_reply_arg arg;
882 	struct sock *ctl_sk;
883 	u64 transmit_time;
884 
885 	memset(&rep.th, 0, sizeof(struct tcphdr));
886 	memset(&arg, 0, sizeof(arg));
887 
888 	arg.iov[0].iov_base = (unsigned char *)&rep;
889 	arg.iov[0].iov_len  = sizeof(rep.th);
890 	if (tsecr) {
891 		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
892 				   (TCPOPT_TIMESTAMP << 8) |
893 				   TCPOLEN_TIMESTAMP);
894 		rep.opt[1] = htonl(tsval);
895 		rep.opt[2] = htonl(tsecr);
896 		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
897 	}
898 
899 	/* Swap the send and the receive. */
900 	rep.th.dest    = th->source;
901 	rep.th.source  = th->dest;
902 	rep.th.doff    = arg.iov[0].iov_len / 4;
903 	rep.th.seq     = htonl(seq);
904 	rep.th.ack_seq = htonl(ack);
905 	rep.th.ack     = 1;
906 	rep.th.window  = htons(win);
907 
908 #ifdef CONFIG_TCP_MD5SIG
909 	if (key) {
910 		int offset = (tsecr) ? 3 : 0;
911 
912 		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
913 					  (TCPOPT_NOP << 16) |
914 					  (TCPOPT_MD5SIG << 8) |
915 					  TCPOLEN_MD5SIG);
916 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
917 		rep.th.doff = arg.iov[0].iov_len/4;
918 
919 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
920 				    key, ip_hdr(skb)->saddr,
921 				    ip_hdr(skb)->daddr, &rep.th);
922 	}
923 #endif
924 	arg.flags = reply_flags;
925 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
926 				      ip_hdr(skb)->saddr, /* XXX */
927 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
928 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
929 	if (oif)
930 		arg.bound_dev_if = oif;
931 	arg.tos = tos;
932 	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
933 	local_bh_disable();
934 	ctl_sk = this_cpu_read(ipv4_tcp_sk);
935 	sock_net_set(ctl_sk, net);
936 	ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
937 			   inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark);
938 	ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
939 			   inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
940 	transmit_time = tcp_transmit_time(sk);
941 	ip_send_unicast_reply(ctl_sk,
942 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
943 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
944 			      &arg, arg.iov[0].iov_len,
945 			      transmit_time, txhash);
946 
947 	sock_net_set(ctl_sk, &init_net);
948 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
949 	local_bh_enable();
950 }
951 
952 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
953 {
954 	struct inet_timewait_sock *tw = inet_twsk(sk);
955 	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
956 
957 	tcp_v4_send_ack(sk, skb,
958 			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
959 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
960 			tcp_time_stamp_raw() + tcptw->tw_ts_offset,
961 			tcptw->tw_ts_recent,
962 			tw->tw_bound_dev_if,
963 			tcp_twsk_md5_key(tcptw),
964 			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
965 			tw->tw_tos,
966 			tw->tw_txhash
967 			);
968 
969 	inet_twsk_put(tw);
970 }
971 
972 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
973 				  struct request_sock *req)
974 {
975 	const union tcp_md5_addr *addr;
976 	int l3index;
977 
978 	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
979 	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
980 	 */
981 	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
982 					     tcp_sk(sk)->snd_nxt;
983 
984 	/* RFC 7323 2.3
985 	 * The window field (SEG.WND) of every outgoing segment, with the
986 	 * exception of <SYN> segments, MUST be right-shifted by
987 	 * Rcv.Wind.Shift bits:
988 	 */
989 	addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
990 	l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
991 	tcp_v4_send_ack(sk, skb, seq,
992 			tcp_rsk(req)->rcv_nxt,
993 			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
994 			tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
995 			READ_ONCE(req->ts_recent),
996 			0,
997 			tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
998 			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
999 			ip_hdr(skb)->tos,
1000 			READ_ONCE(tcp_rsk(req)->txhash));
1001 }
1002 
1003 /*
1004  *	Send a SYN-ACK after having received a SYN.
1005  *	This still operates on a request_sock only, not on a big
1006  *	socket.
1007  */
1008 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
1009 			      struct flowi *fl,
1010 			      struct request_sock *req,
1011 			      struct tcp_fastopen_cookie *foc,
1012 			      enum tcp_synack_type synack_type,
1013 			      struct sk_buff *syn_skb)
1014 {
1015 	const struct inet_request_sock *ireq = inet_rsk(req);
1016 	struct flowi4 fl4;
1017 	int err = -1;
1018 	struct sk_buff *skb;
1019 	u8 tos;
1020 
1021 	/* First, grab a route. */
1022 	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
1023 		return -1;
1024 
1025 	skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
1026 
1027 	if (skb) {
1028 		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1029 
1030 		tos = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) ?
1031 				(tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1032 				(inet_sk(sk)->tos & INET_ECN_MASK) :
1033 				inet_sk(sk)->tos;
1034 
1035 		if (!INET_ECN_is_capable(tos) &&
1036 		    tcp_bpf_ca_needs_ecn((struct sock *)req))
1037 			tos |= INET_ECN_ECT_0;
1038 
1039 		rcu_read_lock();
1040 		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1041 					    ireq->ir_rmt_addr,
1042 					    rcu_dereference(ireq->ireq_opt),
1043 					    tos);
1044 		rcu_read_unlock();
1045 		err = net_xmit_eval(err);
1046 	}
1047 
1048 	return err;
1049 }
1050 
1051 /*
1052  *	IPv4 request_sock destructor.
1053  */
1054 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1055 {
1056 	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1057 }
1058 
1059 #ifdef CONFIG_TCP_MD5SIG
1060 /*
1061  * RFC2385 MD5 checksumming requires a mapping of
1062  * IP address->MD5 Key.
1063  * We need to maintain these in the sk structure.
1064  */
1065 
1066 DEFINE_STATIC_KEY_DEFERRED_FALSE(tcp_md5_needed, HZ);
1067 EXPORT_SYMBOL(tcp_md5_needed);
1068 
1069 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1070 {
1071 	if (!old)
1072 		return true;
1073 
1074 	/* l3index always overrides non-l3index */
1075 	if (old->l3index && new->l3index == 0)
1076 		return false;
1077 	if (old->l3index == 0 && new->l3index)
1078 		return true;
1079 
1080 	return old->prefixlen < new->prefixlen;
1081 }
1082 
1083 /* Find the Key structure for an address.  */
1084 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1085 					   const union tcp_md5_addr *addr,
1086 					   int family)
1087 {
1088 	const struct tcp_sock *tp = tcp_sk(sk);
1089 	struct tcp_md5sig_key *key;
1090 	const struct tcp_md5sig_info *md5sig;
1091 	__be32 mask;
1092 	struct tcp_md5sig_key *best_match = NULL;
1093 	bool match;
1094 
1095 	/* caller either holds rcu_read_lock() or socket lock */
1096 	md5sig = rcu_dereference_check(tp->md5sig_info,
1097 				       lockdep_sock_is_held(sk));
1098 	if (!md5sig)
1099 		return NULL;
1100 
1101 	hlist_for_each_entry_rcu(key, &md5sig->head, node,
1102 				 lockdep_sock_is_held(sk)) {
1103 		if (key->family != family)
1104 			continue;
1105 		if (key->flags & TCP_MD5SIG_FLAG_IFINDEX && key->l3index != l3index)
1106 			continue;
1107 		if (family == AF_INET) {
1108 			mask = inet_make_mask(key->prefixlen);
1109 			match = (key->addr.a4.s_addr & mask) ==
1110 				(addr->a4.s_addr & mask);
1111 #if IS_ENABLED(CONFIG_IPV6)
1112 		} else if (family == AF_INET6) {
1113 			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1114 						  key->prefixlen);
1115 #endif
1116 		} else {
1117 			match = false;
1118 		}
1119 
1120 		if (match && better_md5_match(best_match, key))
1121 			best_match = key;
1122 	}
1123 	return best_match;
1124 }
1125 EXPORT_SYMBOL(__tcp_md5_do_lookup);
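/* Illustrative lookup behaviour, assuming two configured keys: one for
 * 10.0.0.0 with prefixlen 8 and one for 10.1.0.0 with prefixlen 16.  A
 * lookup for peer 10.1.2.3 matches both, and better_md5_match() keeps the
 * /16 entry as the longer prefix; a key bound to an L3 master device
 * (nonzero l3index) is always preferred over an unbound one.
 */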
1126 
1127 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1128 						      const union tcp_md5_addr *addr,
1129 						      int family, u8 prefixlen,
1130 						      int l3index, u8 flags)
1131 {
1132 	const struct tcp_sock *tp = tcp_sk(sk);
1133 	struct tcp_md5sig_key *key;
1134 	unsigned int size = sizeof(struct in_addr);
1135 	const struct tcp_md5sig_info *md5sig;
1136 
1137 	/* caller either holds rcu_read_lock() or socket lock */
1138 	md5sig = rcu_dereference_check(tp->md5sig_info,
1139 				       lockdep_sock_is_held(sk));
1140 	if (!md5sig)
1141 		return NULL;
1142 #if IS_ENABLED(CONFIG_IPV6)
1143 	if (family == AF_INET6)
1144 		size = sizeof(struct in6_addr);
1145 #endif
1146 	hlist_for_each_entry_rcu(key, &md5sig->head, node,
1147 				 lockdep_sock_is_held(sk)) {
1148 		if (key->family != family)
1149 			continue;
1150 		if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
1151 			continue;
1152 		if (key->l3index != l3index)
1153 			continue;
1154 		if (!memcmp(&key->addr, addr, size) &&
1155 		    key->prefixlen == prefixlen)
1156 			return key;
1157 	}
1158 	return NULL;
1159 }
1160 
1161 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1162 					 const struct sock *addr_sk)
1163 {
1164 	const union tcp_md5_addr *addr;
1165 	int l3index;
1166 
1167 	l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1168 						 addr_sk->sk_bound_dev_if);
1169 	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1170 	return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1171 }
1172 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1173 
1174 static int tcp_md5sig_info_add(struct sock *sk, gfp_t gfp)
1175 {
1176 	struct tcp_sock *tp = tcp_sk(sk);
1177 	struct tcp_md5sig_info *md5sig;
1178 
1179 	md5sig = kmalloc(sizeof(*md5sig), gfp);
1180 	if (!md5sig)
1181 		return -ENOMEM;
1182 
1183 	sk_gso_disable(sk);
1184 	INIT_HLIST_HEAD(&md5sig->head);
1185 	rcu_assign_pointer(tp->md5sig_info, md5sig);
1186 	return 0;
1187 }
1188 
1189 /* This can be called on a newly created socket, from other files */
1190 static int __tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1191 			    int family, u8 prefixlen, int l3index, u8 flags,
1192 			    const u8 *newkey, u8 newkeylen, gfp_t gfp)
1193 {
1194 	/* Add Key to the list */
1195 	struct tcp_md5sig_key *key;
1196 	struct tcp_sock *tp = tcp_sk(sk);
1197 	struct tcp_md5sig_info *md5sig;
1198 
1199 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1200 	if (key) {
1201 		/* Pre-existing entry - just update that one.
1202 		 * Note that the key might be used concurrently.
1203 		 * data_race() is telling kcsan that we do not care of
1204 		 * key mismatches, since changing MD5 key on live flows
1205 		 * can lead to packet drops.
1206 		 */
1207 		data_race(memcpy(key->key, newkey, newkeylen));
1208 
1209 		/* Pairs with READ_ONCE() in tcp_md5_hash_key().
1210 		 * Also note that a reader could catch new key->keylen value
1211 		 * but old key->key[], this is the reason we use __GFP_ZERO
1212 		 * at sock_kmalloc() time below these lines.
1213 		 */
1214 		WRITE_ONCE(key->keylen, newkeylen);
1215 
1216 		return 0;
1217 	}
1218 
1219 	md5sig = rcu_dereference_protected(tp->md5sig_info,
1220 					   lockdep_sock_is_held(sk));
1221 
1222 	key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1223 	if (!key)
1224 		return -ENOMEM;
1225 	if (!tcp_alloc_md5sig_pool()) {
1226 		sock_kfree_s(sk, key, sizeof(*key));
1227 		return -ENOMEM;
1228 	}
1229 
1230 	memcpy(key->key, newkey, newkeylen);
1231 	key->keylen = newkeylen;
1232 	key->family = family;
1233 	key->prefixlen = prefixlen;
1234 	key->l3index = l3index;
1235 	key->flags = flags;
1236 	memcpy(&key->addr, addr,
1237 	       (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) :
1238 								 sizeof(struct in_addr));
1239 	hlist_add_head_rcu(&key->node, &md5sig->head);
1240 	return 0;
1241 }
1242 
1243 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1244 		   int family, u8 prefixlen, int l3index, u8 flags,
1245 		   const u8 *newkey, u8 newkeylen)
1246 {
1247 	struct tcp_sock *tp = tcp_sk(sk);
1248 
1249 	if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
1250 		if (tcp_md5sig_info_add(sk, GFP_KERNEL))
1251 			return -ENOMEM;
1252 
1253 		if (!static_branch_inc(&tcp_md5_needed.key)) {
1254 			struct tcp_md5sig_info *md5sig;
1255 
1256 			md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
1257 			rcu_assign_pointer(tp->md5sig_info, NULL);
1258 			kfree_rcu(md5sig, rcu);
1259 			return -EUSERS;
1260 		}
1261 	}
1262 
1263 	return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, flags,
1264 				newkey, newkeylen, GFP_KERNEL);
1265 }
1266 EXPORT_SYMBOL(tcp_md5_do_add);
1267 
1268 int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr,
1269 		     int family, u8 prefixlen, int l3index,
1270 		     struct tcp_md5sig_key *key)
1271 {
1272 	struct tcp_sock *tp = tcp_sk(sk);
1273 
1274 	if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
1275 		if (tcp_md5sig_info_add(sk, sk_gfp_mask(sk, GFP_ATOMIC)))
1276 			return -ENOMEM;
1277 
1278 		if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) {
1279 			struct tcp_md5sig_info *md5sig;
1280 
1281 			md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
1282 			net_warn_ratelimited("Too many TCP-MD5 keys in the system\n");
1283 			rcu_assign_pointer(tp->md5sig_info, NULL);
1284 			kfree_rcu(md5sig, rcu);
1285 			return -EUSERS;
1286 		}
1287 	}
1288 
1289 	return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index,
1290 				key->flags, key->key, key->keylen,
1291 				sk_gfp_mask(sk, GFP_ATOMIC));
1292 }
1293 EXPORT_SYMBOL(tcp_md5_key_copy);
1294 
1295 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1296 		   u8 prefixlen, int l3index, u8 flags)
1297 {
1298 	struct tcp_md5sig_key *key;
1299 
1300 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1301 	if (!key)
1302 		return -ENOENT;
1303 	hlist_del_rcu(&key->node);
1304 	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1305 	kfree_rcu(key, rcu);
1306 	return 0;
1307 }
1308 EXPORT_SYMBOL(tcp_md5_do_del);
1309 
1310 static void tcp_clear_md5_list(struct sock *sk)
1311 {
1312 	struct tcp_sock *tp = tcp_sk(sk);
1313 	struct tcp_md5sig_key *key;
1314 	struct hlist_node *n;
1315 	struct tcp_md5sig_info *md5sig;
1316 
1317 	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1318 
1319 	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1320 		hlist_del_rcu(&key->node);
1321 		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1322 		kfree_rcu(key, rcu);
1323 	}
1324 }
1325 
1326 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1327 				 sockptr_t optval, int optlen)
1328 {
1329 	struct tcp_md5sig cmd;
1330 	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1331 	const union tcp_md5_addr *addr;
1332 	u8 prefixlen = 32;
1333 	int l3index = 0;
1334 	u8 flags;
1335 
1336 	if (optlen < sizeof(cmd))
1337 		return -EINVAL;
1338 
1339 	if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1340 		return -EFAULT;
1341 
1342 	if (sin->sin_family != AF_INET)
1343 		return -EINVAL;
1344 
1345 	flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1346 
1347 	if (optname == TCP_MD5SIG_EXT &&
1348 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1349 		prefixlen = cmd.tcpm_prefixlen;
1350 		if (prefixlen > 32)
1351 			return -EINVAL;
1352 	}
1353 
1354 	if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
1355 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1356 		struct net_device *dev;
1357 
1358 		rcu_read_lock();
1359 		dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1360 		if (dev && netif_is_l3_master(dev))
1361 			l3index = dev->ifindex;
1362 
1363 		rcu_read_unlock();
1364 
1365 		/* ok to reference set/not set outside of rcu;
1366 		 * right now device MUST be an L3 master
1367 		 */
1368 		if (!dev || !l3index)
1369 			return -EINVAL;
1370 	}
1371 
1372 	addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1373 
1374 	if (!cmd.tcpm_keylen)
1375 		return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);
1376 
1377 	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1378 		return -EINVAL;
1379 
1380 	return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
1381 			      cmd.tcpm_key, cmd.tcpm_keylen);
1382 }
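/* Userspace counterpart of the parser above, as a minimal sketch using the
 * uapi struct tcp_md5sig (error handling omitted; address and key are made
 * up for illustration):
 *
 *	struct tcp_md5sig md5 = { };
 *	struct sockaddr_in *sin = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	sin->sin_family = AF_INET;
 *	sin->sin_addr.s_addr = inet_addr("192.0.2.1");
 *	md5.tcpm_keylen = 6;
 *	memcpy(md5.tcpm_key, "secret", md5.tcpm_keylen);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 *
 * TCP_MD5SIG_EXT with tcpm_flags/tcpm_prefixlen/tcpm_ifindex reaches the
 * prefix and L3-master handling above.
 */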
1383 
1384 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1385 				   __be32 daddr, __be32 saddr,
1386 				   const struct tcphdr *th, int nbytes)
1387 {
1388 	struct tcp4_pseudohdr *bp;
1389 	struct scatterlist sg;
1390 	struct tcphdr *_th;
1391 
1392 	bp = hp->scratch;
1393 	bp->saddr = saddr;
1394 	bp->daddr = daddr;
1395 	bp->pad = 0;
1396 	bp->protocol = IPPROTO_TCP;
1397 	bp->len = cpu_to_be16(nbytes);
1398 
1399 	_th = (struct tcphdr *)(bp + 1);
1400 	memcpy(_th, th, sizeof(*th));
1401 	_th->check = 0;
1402 
1403 	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1404 	ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1405 				sizeof(*bp) + sizeof(*th));
1406 	return crypto_ahash_update(hp->md5_req);
1407 }
1408 
1409 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1410 			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
1411 {
1412 	struct tcp_md5sig_pool *hp;
1413 	struct ahash_request *req;
1414 
1415 	hp = tcp_get_md5sig_pool();
1416 	if (!hp)
1417 		goto clear_hash_noput;
1418 	req = hp->md5_req;
1419 
1420 	if (crypto_ahash_init(req))
1421 		goto clear_hash;
1422 	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1423 		goto clear_hash;
1424 	if (tcp_md5_hash_key(hp, key))
1425 		goto clear_hash;
1426 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1427 	if (crypto_ahash_final(req))
1428 		goto clear_hash;
1429 
1430 	tcp_put_md5sig_pool();
1431 	return 0;
1432 
1433 clear_hash:
1434 	tcp_put_md5sig_pool();
1435 clear_hash_noput:
1436 	memset(md5_hash, 0, 16);
1437 	return 1;
1438 }
1439 
1440 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1441 			const struct sock *sk,
1442 			const struct sk_buff *skb)
1443 {
1444 	struct tcp_md5sig_pool *hp;
1445 	struct ahash_request *req;
1446 	const struct tcphdr *th = tcp_hdr(skb);
1447 	__be32 saddr, daddr;
1448 
1449 	if (sk) { /* valid for establish/request sockets */
1450 		saddr = sk->sk_rcv_saddr;
1451 		daddr = sk->sk_daddr;
1452 	} else {
1453 		const struct iphdr *iph = ip_hdr(skb);
1454 		saddr = iph->saddr;
1455 		daddr = iph->daddr;
1456 	}
1457 
1458 	hp = tcp_get_md5sig_pool();
1459 	if (!hp)
1460 		goto clear_hash_noput;
1461 	req = hp->md5_req;
1462 
1463 	if (crypto_ahash_init(req))
1464 		goto clear_hash;
1465 
1466 	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1467 		goto clear_hash;
1468 	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1469 		goto clear_hash;
1470 	if (tcp_md5_hash_key(hp, key))
1471 		goto clear_hash;
1472 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1473 	if (crypto_ahash_final(req))
1474 		goto clear_hash;
1475 
1476 	tcp_put_md5sig_pool();
1477 	return 0;
1478 
1479 clear_hash:
1480 	tcp_put_md5sig_pool();
1481 clear_hash_noput:
1482 	memset(md5_hash, 0, 16);
1483 	return 1;
1484 }
1485 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1486 
1487 #endif
1488 
1489 static void tcp_v4_init_req(struct request_sock *req,
1490 			    const struct sock *sk_listener,
1491 			    struct sk_buff *skb)
1492 {
1493 	struct inet_request_sock *ireq = inet_rsk(req);
1494 	struct net *net = sock_net(sk_listener);
1495 
1496 	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1497 	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1498 	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1499 }
1500 
1501 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1502 					  struct sk_buff *skb,
1503 					  struct flowi *fl,
1504 					  struct request_sock *req)
1505 {
1506 	tcp_v4_init_req(req, sk, skb);
1507 
1508 	if (security_inet_conn_request(sk, skb, req))
1509 		return NULL;
1510 
1511 	return inet_csk_route_req(sk, &fl->u.ip4, req);
1512 }
1513 
1514 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1515 	.family		=	PF_INET,
1516 	.obj_size	=	sizeof(struct tcp_request_sock),
1517 	.rtx_syn_ack	=	tcp_rtx_synack,
1518 	.send_ack	=	tcp_v4_reqsk_send_ack,
1519 	.destructor	=	tcp_v4_reqsk_destructor,
1520 	.send_reset	=	tcp_v4_send_reset,
1521 	.syn_ack_timeout =	tcp_syn_ack_timeout,
1522 };
1523 
1524 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1525 	.mss_clamp	=	TCP_MSS_DEFAULT,
1526 #ifdef CONFIG_TCP_MD5SIG
1527 	.req_md5_lookup	=	tcp_v4_md5_lookup,
1528 	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1529 #endif
1530 #ifdef CONFIG_SYN_COOKIES
1531 	.cookie_init_seq =	cookie_v4_init_sequence,
1532 #endif
1533 	.route_req	=	tcp_v4_route_req,
1534 	.init_seq	=	tcp_v4_init_seq,
1535 	.init_ts_off	=	tcp_v4_init_ts_off,
1536 	.send_synack	=	tcp_v4_send_synack,
1537 };
1538 
1539 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1540 {
1541 	/* Never answer SYNs sent to broadcast or multicast addresses */
1542 	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1543 		goto drop;
1544 
1545 	return tcp_conn_request(&tcp_request_sock_ops,
1546 				&tcp_request_sock_ipv4_ops, sk, skb);
1547 
1548 drop:
1549 	tcp_listendrop(sk);
1550 	return 0;
1551 }
1552 EXPORT_SYMBOL(tcp_v4_conn_request);
1553 
1554 
1555 /*
1556  * The three way handshake has completed - we got a valid synack -
1557  * now create the new socket.
1558  */
1559 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1560 				  struct request_sock *req,
1561 				  struct dst_entry *dst,
1562 				  struct request_sock *req_unhash,
1563 				  bool *own_req)
1564 {
1565 	struct inet_request_sock *ireq;
1566 	bool found_dup_sk = false;
1567 	struct inet_sock *newinet;
1568 	struct tcp_sock *newtp;
1569 	struct sock *newsk;
1570 #ifdef CONFIG_TCP_MD5SIG
1571 	const union tcp_md5_addr *addr;
1572 	struct tcp_md5sig_key *key;
1573 	int l3index;
1574 #endif
1575 	struct ip_options_rcu *inet_opt;
1576 
1577 	if (sk_acceptq_is_full(sk))
1578 		goto exit_overflow;
1579 
1580 	newsk = tcp_create_openreq_child(sk, req, skb);
1581 	if (!newsk)
1582 		goto exit_nonewsk;
1583 
1584 	newsk->sk_gso_type = SKB_GSO_TCPV4;
1585 	inet_sk_rx_dst_set(newsk, skb);
1586 
1587 	newtp		      = tcp_sk(newsk);
1588 	newinet		      = inet_sk(newsk);
1589 	ireq		      = inet_rsk(req);
1590 	sk_daddr_set(newsk, ireq->ir_rmt_addr);
1591 	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1592 	newsk->sk_bound_dev_if = ireq->ir_iif;
1593 	newinet->inet_saddr   = ireq->ir_loc_addr;
1594 	inet_opt	      = rcu_dereference(ireq->ireq_opt);
1595 	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1596 	newinet->mc_index     = inet_iif(skb);
1597 	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1598 	newinet->rcv_tos      = ip_hdr(skb)->tos;
1599 	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1600 	if (inet_opt)
1601 		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1602 	atomic_set(&newinet->inet_id, get_random_u16());
1603 
1604 	/* Set ToS of the new socket based upon the value of incoming SYN.
1605 	 * ECT bits are set later in tcp_init_transfer().
1606 	 */
1607 	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1608 		newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1609 
1610 	if (!dst) {
1611 		dst = inet_csk_route_child_sock(sk, newsk, req);
1612 		if (!dst)
1613 			goto put_and_exit;
1614 	} else {
1615 		/* syncookie case : see end of cookie_v4_check() */
1616 	}
1617 	sk_setup_caps(newsk, dst);
1618 
1619 	tcp_ca_openreq_child(newsk, dst);
1620 
1621 	tcp_sync_mss(newsk, dst_mtu(dst));
1622 	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1623 
1624 	tcp_initialize_rcv_mss(newsk);
1625 
1626 #ifdef CONFIG_TCP_MD5SIG
1627 	l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1628 	/* Copy over the MD5 key from the original socket */
1629 	addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1630 	key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1631 	if (key) {
1632 		if (tcp_md5_key_copy(newsk, addr, AF_INET, 32, l3index, key))
1633 			goto put_and_exit;
1634 		sk_gso_disable(newsk);
1635 	}
1636 #endif
1637 
1638 	if (__inet_inherit_port(sk, newsk) < 0)
1639 		goto put_and_exit;
1640 	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1641 				       &found_dup_sk);
1642 	if (likely(*own_req)) {
1643 		tcp_move_syn(newtp, req);
1644 		ireq->ireq_opt = NULL;
1645 	} else {
1646 		newinet->inet_opt = NULL;
1647 
1648 		if (!req_unhash && found_dup_sk) {
1649 			/* This code path should only be executed in the
1650 			 * syncookie case
1651 			 */
1652 			bh_unlock_sock(newsk);
1653 			sock_put(newsk);
1654 			newsk = NULL;
1655 		}
1656 	}
1657 	return newsk;
1658 
1659 exit_overflow:
1660 	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1661 exit_nonewsk:
1662 	dst_release(dst);
1663 exit:
1664 	tcp_listendrop(sk);
1665 	return NULL;
1666 put_and_exit:
1667 	newinet->inet_opt = NULL;
1668 	inet_csk_prepare_forced_close(newsk);
1669 	tcp_done(newsk);
1670 	goto exit;
1671 }
1672 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
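/* For orientation (simplified): this is reached from tcp_check_req() once
 * the final ACK of the handshake arrives for a pending request_sock, or
 * from cookie_v4_check() in the syncookie case, and the new socket it
 * returns is what a later accept(2) hands back to userspace.
 */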
1673 
1674 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1675 {
1676 #ifdef CONFIG_SYN_COOKIES
1677 	const struct tcphdr *th = tcp_hdr(skb);
1678 
1679 	if (!th->syn)
1680 		sk = cookie_v4_check(sk, skb);
1681 #endif
1682 	return sk;
1683 }
1684 
1685 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1686 			 struct tcphdr *th, u32 *cookie)
1687 {
1688 	u16 mss = 0;
1689 #ifdef CONFIG_SYN_COOKIES
1690 	mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1691 				    &tcp_request_sock_ipv4_ops, sk, th);
1692 	if (mss) {
1693 		*cookie = __cookie_v4_init_sequence(iph, th, &mss);
1694 		tcp_synq_overflow(sk);
1695 	}
1696 #endif
1697 	return mss;
1698 }
1699 
1700 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
1701 							   u32));
1702 /* The socket must have its spinlock held when we get
1703  * here, unless it is a TCP_LISTEN socket.
1704  *
1705  * We have a potential double-lock case here, so even when
1706  * doing backlog processing we use the BH locking scheme.
1707  * This is because we cannot sleep with the original spinlock
1708  * held.
1709  */
1710 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1711 {
1712 	enum skb_drop_reason reason;
1713 	struct sock *rsk;
1714 
1715 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1716 		struct dst_entry *dst;
1717 
1718 		dst = rcu_dereference_protected(sk->sk_rx_dst,
1719 						lockdep_sock_is_held(sk));
1720 
1721 		sock_rps_save_rxhash(sk, skb);
1722 		sk_mark_napi_id(sk, skb);
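		/* Validate the cached rx dst: drop it if the packet arrived on a
		 * different interface or the route is no longer valid, so that a
		 * fresh one can be cached later.
		 */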
1723 		if (dst) {
1724 			if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
1725 			    !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
1726 					     dst, 0)) {
1727 				RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
1728 				dst_release(dst);
1729 			}
1730 		}
1731 		tcp_rcv_established(sk, skb);
1732 		return 0;
1733 	}
1734 
1735 	reason = SKB_DROP_REASON_NOT_SPECIFIED;
1736 	if (tcp_checksum_complete(skb))
1737 		goto csum_err;
1738 
1739 	if (sk->sk_state == TCP_LISTEN) {
1740 		struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1741 
1742 		if (!nsk)
1743 			goto discard;
1744 		if (nsk != sk) {
1745 			if (tcp_child_process(sk, nsk, skb)) {
1746 				rsk = nsk;
1747 				goto reset;
1748 			}
1749 			return 0;
1750 		}
1751 	} else
1752 		sock_rps_save_rxhash(sk, skb);
1753 
1754 	if (tcp_rcv_state_process(sk, skb)) {
1755 		rsk = sk;
1756 		goto reset;
1757 	}
1758 	return 0;
1759 
1760 reset:
1761 	tcp_v4_send_reset(rsk, skb);
1762 discard:
1763 	kfree_skb_reason(skb, reason);
1764 	/* Be careful here. If this function gets more complicated and
1765 	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1766 	 * might be destroyed here. This current version compiles correctly,
1767 	 * but you have been warned.
1768 	 */
1769 	return 0;
1770 
1771 csum_err:
1772 	reason = SKB_DROP_REASON_TCP_CSUM;
1773 	trace_tcp_bad_csum(skb);
1774 	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1775 	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1776 	goto discard;
1777 }
1778 EXPORT_SYMBOL(tcp_v4_do_rcv);
1779 
1780 int tcp_v4_early_demux(struct sk_buff *skb)
1781 {
1782 	struct net *net = dev_net(skb->dev);
1783 	const struct iphdr *iph;
1784 	const struct tcphdr *th;
1785 	struct sock *sk;
1786 
1787 	if (skb->pkt_type != PACKET_HOST)
1788 		return 0;
1789 
1790 	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1791 		return 0;
1792 
1793 	iph = ip_hdr(skb);
1794 	th = tcp_hdr(skb);
1795 
1796 	if (th->doff < sizeof(struct tcphdr) / 4)
1797 		return 0;
1798 
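	/* Look up the established hash only and cache the socket on the skb,
	 * so tcp_v4_rcv() can skip a second hash lookup; for full sockets the
	 * validated rx dst is reused as well.
	 */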
1799 	sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
1800 				       iph->saddr, th->source,
1801 				       iph->daddr, ntohs(th->dest),
1802 				       skb->skb_iif, inet_sdif(skb));
1803 	if (sk) {
1804 		skb->sk = sk;
1805 		skb->destructor = sock_edemux;
1806 		if (sk_fullsock(sk)) {
1807 			struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
1808 
1809 			if (dst)
1810 				dst = dst_check(dst, 0);
1811 			if (dst &&
1812 			    sk->sk_rx_dst_ifindex == skb->skb_iif)
1813 				skb_dst_set_noref(skb, dst);
1814 		}
1815 	}
1816 	return 0;
1817 }
1818 
1819 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
1820 		     enum skb_drop_reason *reason)
1821 {
1822 	u32 tail_gso_size, tail_gso_segs;
1823 	struct skb_shared_info *shinfo;
1824 	const struct tcphdr *th;
1825 	struct tcphdr *thtail;
1826 	struct sk_buff *tail;
1827 	unsigned int hdrlen;
1828 	bool fragstolen;
1829 	u32 gso_segs;
1830 	u32 gso_size;
1831 	u64 limit;
1832 	int delta;
1833 
1834 	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1835 	 * we can fix skb->truesize to its real value to avoid future drops.
1836 	 * This is valid because skb is not yet charged to the socket.
1837 	 * It has been noticed that pure SACK packets were sometimes dropped
1838 	 * (if cooked by drivers without the copybreak feature).
1839 	 */
1840 	skb_condense(skb);
1841 
1842 	skb_dst_drop(skb);
1843 
1844 	if (unlikely(tcp_checksum_complete(skb))) {
1845 		bh_unlock_sock(sk);
1846 		trace_tcp_bad_csum(skb);
1847 		*reason = SKB_DROP_REASON_TCP_CSUM;
1848 		__TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1849 		__TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1850 		return true;
1851 	}
1852 
1853 	/* Attempt coalescing to last skb in backlog, even if we are
1854 	 * above the limits.
1855 	 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1856 	 */
1857 	th = (const struct tcphdr *)skb->data;
1858 	hdrlen = th->doff * 4;
1859 
1860 	tail = sk->sk_backlog.tail;
1861 	if (!tail)
1862 		goto no_coalesce;
1863 	thtail = (struct tcphdr *)tail->data;
1864 
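	/* Coalescing is refused, among other checks, if the new segment is not
	 * contiguous with the tail, differs in DS field, ECN bits or TCP
	 * options, carries SYN/RST/URG, or lacks ACK.
	 */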
1865 	if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1866 	    TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1867 	    ((TCP_SKB_CB(tail)->tcp_flags |
1868 	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1869 	    !((TCP_SKB_CB(tail)->tcp_flags &
1870 	      TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1871 	    ((TCP_SKB_CB(tail)->tcp_flags ^
1872 	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1873 #ifdef CONFIG_TLS_DEVICE
1874 	    tail->decrypted != skb->decrypted ||
1875 #endif
1876 	    !mptcp_skb_can_collapse(tail, skb) ||
1877 	    thtail->doff != th->doff ||
1878 	    memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1879 		goto no_coalesce;
1880 
1881 	__skb_pull(skb, hdrlen);
1882 
1883 	shinfo = skb_shinfo(skb);
1884 	gso_size = shinfo->gso_size ?: skb->len;
1885 	gso_segs = shinfo->gso_segs ?: 1;
1886 
1887 	shinfo = skb_shinfo(tail);
1888 	tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
1889 	tail_gso_segs = shinfo->gso_segs ?: 1;
1890 
1891 	if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1892 		TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1893 
1894 		if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
1895 			TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1896 			thtail->window = th->window;
1897 		}
1898 
1899 		/* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1900 		 * thtail->fin, so that the fast path in tcp_rcv_established()
1901 		 * is not entered if we append a packet with a FIN.
1902 		 * SYN, RST, URG are not present.
1903 		 * ACK is set on both packets.
1904 		 * PSH : we do not really care in TCP stack,
1905 		 *       at least for 'GRO' packets.
1906 		 */
1907 		thtail->fin |= th->fin;
1908 		TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1909 
1910 		if (TCP_SKB_CB(skb)->has_rxtstamp) {
1911 			TCP_SKB_CB(tail)->has_rxtstamp = true;
1912 			tail->tstamp = skb->tstamp;
1913 			skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1914 		}
1915 
1916 		/* Not as strict as GRO. We only need to carry the max mss value */
1917 		shinfo->gso_size = max(gso_size, tail_gso_size);
1918 		shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
1919 
1920 		sk->sk_backlog.len += delta;
1921 		__NET_INC_STATS(sock_net(sk),
1922 				LINUX_MIB_TCPBACKLOGCOALESCE);
1923 		kfree_skb_partial(skb, fragstolen);
1924 		return false;
1925 	}
1926 	__skb_push(skb, hdrlen);
1927 
1928 no_coalesce:
1929 	/* sk->sk_backlog.len is reset only at the end of __release_sock().
1930 	 * Both sk->sk_backlog.len and sk->sk_rmem_alloc could reach
1931 	 * sk_rcvbuf in normal conditions.
1932 	 */
1933 	limit = ((u64)READ_ONCE(sk->sk_rcvbuf)) << 1;
1934 
1935 	limit += ((u32)READ_ONCE(sk->sk_sndbuf)) >> 1;
1936 
1937 	/* Only the socket owner can try to collapse/prune rx queues
1938 	 * to reduce memory overhead, so add a little headroom here.
1939 	 * Only a few socket backlogs are likely to be non-empty at the same time.
1940 	 */
1941 	limit += 64 * 1024;
1942 
1943 	limit = min_t(u64, limit, UINT_MAX);
1944 
1945 	if (unlikely(sk_add_backlog(sk, skb, limit))) {
1946 		bh_unlock_sock(sk);
1947 		*reason = SKB_DROP_REASON_SOCKET_BACKLOG;
1948 		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1949 		return true;
1950 	}
1951 	return false;
1952 }
1953 EXPORT_SYMBOL(tcp_add_backlog);
1954 
1955 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1956 {
1957 	struct tcphdr *th = (struct tcphdr *)skb->data;
1958 
1959 	return sk_filter_trim_cap(sk, skb, th->doff * 4);
1960 }
1961 EXPORT_SYMBOL(tcp_filter);
1962 
1963 static void tcp_v4_restore_cb(struct sk_buff *skb)
1964 {
1965 	memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1966 		sizeof(struct inet_skb_parm));
1967 }
1968 
1969 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1970 			   const struct tcphdr *th)
1971 {
1972 	/* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB();
1973 	 * barrier() makes sure the compiler won't play fool^Waliasing games.
1974 	 */
1975 	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1976 		sizeof(struct inet_skb_parm));
1977 	barrier();
1978 
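	/* end_seq accounts for the payload plus one sequence number each for
	 * SYN and FIN, since both consume sequence space.
	 */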
1979 	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1980 	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1981 				    skb->len - th->doff * 4);
1982 	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1983 	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1984 	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1985 	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1986 	TCP_SKB_CB(skb)->sacked	 = 0;
1987 	TCP_SKB_CB(skb)->has_rxtstamp =
1988 			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1989 }
1990 
1991 /*
1992  *	From tcp_input.c
1993  */
1994 
1995 int tcp_v4_rcv(struct sk_buff *skb)
1996 {
1997 	struct net *net = dev_net(skb->dev);
1998 	enum skb_drop_reason drop_reason;
1999 	int sdif = inet_sdif(skb);
2000 	int dif = inet_iif(skb);
2001 	const struct iphdr *iph;
2002 	const struct tcphdr *th;
2003 	bool refcounted;
2004 	struct sock *sk;
2005 	int ret;
2006 
2007 	drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
2008 	if (skb->pkt_type != PACKET_HOST)
2009 		goto discard_it;
2010 
2011 	/* Count it even if it's bad */
2012 	__TCP_INC_STATS(net, TCP_MIB_INSEGS);
2013 
2014 	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
2015 		goto discard_it;
2016 
2017 	th = (const struct tcphdr *)skb->data;
2018 
2019 	if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
2020 		drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
2021 		goto bad_packet;
2022 	}
2023 	if (!pskb_may_pull(skb, th->doff * 4))
2024 		goto discard_it;
2025 
2026 	/* An explanation is required here, I think.
2027 	 * Packet length and doff are validated by header prediction,
2028 	 * provided the th->doff == 0 case is eliminated.
2029 	 * So, we defer the checks. */
2030 
2031 	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
2032 		goto csum_error;
2033 
2034 	th = (const struct tcphdr *)skb->data;
2035 	iph = ip_hdr(skb);
2036 lookup:
2037 	sk = __inet_lookup_skb(net->ipv4.tcp_death_row.hashinfo,
2038 			       skb, __tcp_hdrlen(th), th->source,
2039 			       th->dest, sdif, &refcounted);
2040 	if (!sk)
2041 		goto no_tcp_socket;
2042 
2043 process:
2044 	if (sk->sk_state == TCP_TIME_WAIT)
2045 		goto do_time_wait;
2046 
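	/* A request socket (SYN_RECV mini socket) matched.  Validate the skb
	 * against its listener, then let tcp_check_req() either complete the
	 * handshake and create the child socket or hand processing back to
	 * the listener.
	 */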
2047 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
2048 		struct request_sock *req = inet_reqsk(sk);
2049 		bool req_stolen = false;
2050 		struct sock *nsk;
2051 
2052 		sk = req->rsk_listener;
2053 		if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
2054 			drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2055 		else
2056 			drop_reason = tcp_inbound_md5_hash(sk, skb,
2057 						   &iph->saddr, &iph->daddr,
2058 						   AF_INET, dif, sdif);
2059 		if (unlikely(drop_reason)) {
2060 			sk_drops_add(sk, skb);
2061 			reqsk_put(req);
2062 			goto discard_it;
2063 		}
2064 		if (tcp_checksum_complete(skb)) {
2065 			reqsk_put(req);
2066 			goto csum_error;
2067 		}
2068 		if (unlikely(sk->sk_state != TCP_LISTEN)) {
2069 			nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
2070 			if (!nsk) {
2071 				inet_csk_reqsk_queue_drop_and_put(sk, req);
2072 				goto lookup;
2073 			}
2074 			sk = nsk;
2075 			/* reuseport_migrate_sock() has already held one sk_refcnt
2076 			 * before returning.
2077 			 */
2078 		} else {
2079 			/* We own a reference on the listener, increase it again
2080 			 * as we might lose it too soon.
2081 			 */
2082 			sock_hold(sk);
2083 		}
2084 		refcounted = true;
2085 		nsk = NULL;
2086 		if (!tcp_filter(sk, skb)) {
2087 			th = (const struct tcphdr *)skb->data;
2088 			iph = ip_hdr(skb);
2089 			tcp_v4_fill_cb(skb, iph, th);
2090 			nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
2091 		} else {
2092 			drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2093 		}
2094 		if (!nsk) {
2095 			reqsk_put(req);
2096 			if (req_stolen) {
2097 				/* Another cpu got exclusive access to req
2098 				 * and created a full blown socket.
2099 				 * Try to feed this packet to this socket
2100 				 * instead of discarding it.
2101 				 */
2102 				tcp_v4_restore_cb(skb);
2103 				sock_put(sk);
2104 				goto lookup;
2105 			}
2106 			goto discard_and_relse;
2107 		}
2108 		nf_reset_ct(skb);
2109 		if (nsk == sk) {
2110 			reqsk_put(req);
2111 			tcp_v4_restore_cb(skb);
2112 		} else if (tcp_child_process(sk, nsk, skb)) {
2113 			tcp_v4_send_reset(nsk, skb);
2114 			goto discard_and_relse;
2115 		} else {
2116 			sock_put(sk);
2117 			return 0;
2118 		}
2119 	}
2120 
2121 	if (static_branch_unlikely(&ip4_min_ttl)) {
2122 		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
2123 		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
2124 			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2125 			drop_reason = SKB_DROP_REASON_TCP_MINTTL;
2126 			goto discard_and_relse;
2127 		}
2128 	}
2129 
2130 	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
2131 		drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2132 		goto discard_and_relse;
2133 	}
2134 
2135 	drop_reason = tcp_inbound_md5_hash(sk, skb, &iph->saddr,
2136 					   &iph->daddr, AF_INET, dif, sdif);
2137 	if (drop_reason)
2138 		goto discard_and_relse;
2139 
2140 	nf_reset_ct(skb);
2141 
2142 	if (tcp_filter(sk, skb)) {
2143 		drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2144 		goto discard_and_relse;
2145 	}
2146 	th = (const struct tcphdr *)skb->data;
2147 	iph = ip_hdr(skb);
2148 	tcp_v4_fill_cb(skb, iph, th);
2149 
2150 	skb->dev = NULL;
2151 
2152 	if (sk->sk_state == TCP_LISTEN) {
2153 		ret = tcp_v4_do_rcv(sk, skb);
2154 		goto put_and_return;
2155 	}
2156 
2157 	sk_incoming_cpu_update(sk);
2158 
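	/* Process the segment right away if the socket is not owned by a user
	 * context; otherwise queue it to the backlog, to be run from
	 * release_sock().
	 */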
2159 	bh_lock_sock_nested(sk);
2160 	tcp_segs_in(tcp_sk(sk), skb);
2161 	ret = 0;
2162 	if (!sock_owned_by_user(sk)) {
2163 		ret = tcp_v4_do_rcv(sk, skb);
2164 	} else {
2165 		if (tcp_add_backlog(sk, skb, &drop_reason))
2166 			goto discard_and_relse;
2167 	}
2168 	bh_unlock_sock(sk);
2169 
2170 put_and_return:
2171 	if (refcounted)
2172 		sock_put(sk);
2173 
2174 	return ret;
2175 
2176 no_tcp_socket:
2177 	drop_reason = SKB_DROP_REASON_NO_SOCKET;
2178 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2179 		goto discard_it;
2180 
2181 	tcp_v4_fill_cb(skb, iph, th);
2182 
2183 	if (tcp_checksum_complete(skb)) {
2184 csum_error:
2185 		drop_reason = SKB_DROP_REASON_TCP_CSUM;
2186 		trace_tcp_bad_csum(skb);
2187 		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2188 bad_packet:
2189 		__TCP_INC_STATS(net, TCP_MIB_INERRS);
2190 	} else {
2191 		tcp_v4_send_reset(NULL, skb);
2192 	}
2193 
2194 discard_it:
2195 	SKB_DR_OR(drop_reason, NOT_SPECIFIED);
2196 	/* Discard frame. */
2197 	kfree_skb_reason(skb, drop_reason);
2198 	return 0;
2199 
2200 discard_and_relse:
2201 	sk_drops_add(sk, skb);
2202 	if (refcounted)
2203 		sock_put(sk);
2204 	goto discard_it;
2205 
2206 do_time_wait:
2207 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2208 		drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2209 		inet_twsk_put(inet_twsk(sk));
2210 		goto discard_it;
2211 	}
2212 
2213 	tcp_v4_fill_cb(skb, iph, th);
2214 
2215 	if (tcp_checksum_complete(skb)) {
2216 		inet_twsk_put(inet_twsk(sk));
2217 		goto csum_error;
2218 	}
2219 	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2220 	case TCP_TW_SYN: {
2221 		struct sock *sk2 = inet_lookup_listener(net,
2222 							net->ipv4.tcp_death_row.hashinfo,
2223 							skb, __tcp_hdrlen(th),
2224 							iph->saddr, th->source,
2225 							iph->daddr, th->dest,
2226 							inet_iif(skb),
2227 							sdif);
2228 		if (sk2) {
2229 			inet_twsk_deschedule_put(inet_twsk(sk));
2230 			sk = sk2;
2231 			tcp_v4_restore_cb(skb);
2232 			refcounted = false;
2233 			goto process;
2234 		}
2235 	}
2236 		/* to ACK */
2237 		fallthrough;
2238 	case TCP_TW_ACK:
2239 		tcp_v4_timewait_ack(sk, skb);
2240 		break;
2241 	case TCP_TW_RST:
2242 		tcp_v4_send_reset(sk, skb);
2243 		inet_twsk_deschedule_put(inet_twsk(sk));
2244 		goto discard_it;
2245 	case TCP_TW_SUCCESS:;
2246 	}
2247 	goto discard_it;
2248 }
2249 
2250 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2251 	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
2252 	.twsk_unique	= tcp_twsk_unique,
2253 	.twsk_destructor= tcp_twsk_destructor,
2254 };
2255 
2256 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2257 {
2258 	struct dst_entry *dst = skb_dst(skb);
2259 
2260 	if (dst && dst_hold_safe(dst)) {
2261 		rcu_assign_pointer(sk->sk_rx_dst, dst);
2262 		sk->sk_rx_dst_ifindex = skb->skb_iif;
2263 	}
2264 }
2265 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2266 
2267 const struct inet_connection_sock_af_ops ipv4_specific = {
2268 	.queue_xmit	   = ip_queue_xmit,
2269 	.send_check	   = tcp_v4_send_check,
2270 	.rebuild_header	   = inet_sk_rebuild_header,
2271 	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
2272 	.conn_request	   = tcp_v4_conn_request,
2273 	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
2274 	.net_header_len	   = sizeof(struct iphdr),
2275 	.setsockopt	   = ip_setsockopt,
2276 	.getsockopt	   = ip_getsockopt,
2277 	.addr2sockaddr	   = inet_csk_addr2sockaddr,
2278 	.sockaddr_len	   = sizeof(struct sockaddr_in),
2279 	.mtu_reduced	   = tcp_v4_mtu_reduced,
2280 };
2281 EXPORT_SYMBOL(ipv4_specific);
2282 
2283 #ifdef CONFIG_TCP_MD5SIG
2284 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2285 	.md5_lookup		= tcp_v4_md5_lookup,
2286 	.calc_md5_hash		= tcp_v4_md5_hash_skb,
2287 	.md5_parse		= tcp_v4_parse_md5_keys,
2288 };
2289 #endif
2290 
2291 /* NOTE: A lot of things are set to zero explicitly by the call to
2292  *       sk_alloc(), so they need not be done here.
2293  */
2294 static int tcp_v4_init_sock(struct sock *sk)
2295 {
2296 	struct inet_connection_sock *icsk = inet_csk(sk);
2297 
2298 	tcp_init_sock(sk);
2299 
2300 	icsk->icsk_af_ops = &ipv4_specific;
2301 
2302 #ifdef CONFIG_TCP_MD5SIG
2303 	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2304 #endif
2305 
2306 	return 0;
2307 }
2308 
2309 void tcp_v4_destroy_sock(struct sock *sk)
2310 {
2311 	struct tcp_sock *tp = tcp_sk(sk);
2312 
2313 	trace_tcp_destroy_sock(sk);
2314 
2315 	tcp_clear_xmit_timers(sk);
2316 
2317 	tcp_cleanup_congestion_control(sk);
2318 
2319 	tcp_cleanup_ulp(sk);
2320 
2321 	/* Clean up the write buffer. */
2322 	tcp_write_queue_purge(sk);
2323 
2324 	/* Check if we want to disable active TFO */
2325 	tcp_fastopen_active_disable_ofo_check(sk);
2326 
2327 	/* Cleans up our, hopefully empty, out_of_order_queue. */
2328 	skb_rbtree_purge(&tp->out_of_order_queue);
2329 
2330 #ifdef CONFIG_TCP_MD5SIG
2331 	/* Clean up the MD5 key list, if any */
2332 	if (tp->md5sig_info) {
2333 		tcp_clear_md5_list(sk);
2334 		kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2335 		tp->md5sig_info = NULL;
2336 		static_branch_slow_dec_deferred(&tcp_md5_needed);
2337 	}
2338 #endif
2339 
2340 	/* Clean up a referenced TCP bind bucket. */
2341 	if (inet_csk(sk)->icsk_bind_hash)
2342 		inet_put_port(sk);
2343 
2344 	BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2345 
2346 	/* If socket is aborted during connect operation */
2347 	tcp_free_fastopen_req(tp);
2348 	tcp_fastopen_destroy_cipher(sk);
2349 	tcp_saved_syn_free(tp);
2350 
2351 	sk_sockets_allocated_dec(sk);
2352 }
2353 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2354 
2355 #ifdef CONFIG_PROC_FS
2356 /* Proc filesystem TCP sock list dumping. */
2357 
2358 static unsigned short seq_file_family(const struct seq_file *seq);
2359 
2360 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2361 {
2362 	unsigned short family = seq_file_family(seq);
2363 
2364 	/* AF_UNSPEC is used as a match-all */
2365 	return ((family == AF_UNSPEC || family == sk->sk_family) &&
2366 		net_eq(sock_net(sk), seq_file_net(seq)));
2367 }
2368 
2369 /* Find a non-empty bucket (starting from st->bucket)
2370  * and return the first sk from it.
2371  */
2372 static void *listening_get_first(struct seq_file *seq)
2373 {
2374 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2375 	struct tcp_iter_state *st = seq->private;
2376 
2377 	st->offset = 0;
2378 	for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) {
2379 		struct inet_listen_hashbucket *ilb2;
2380 		struct hlist_nulls_node *node;
2381 		struct sock *sk;
2382 
2383 		ilb2 = &hinfo->lhash2[st->bucket];
2384 		if (hlist_nulls_empty(&ilb2->nulls_head))
2385 			continue;
2386 
2387 		spin_lock(&ilb2->lock);
2388 		sk_nulls_for_each(sk, node, &ilb2->nulls_head) {
2389 			if (seq_sk_match(seq, sk))
2390 				return sk;
2391 		}
2392 		spin_unlock(&ilb2->lock);
2393 	}
2394 
2395 	return NULL;
2396 }
2397 
2398 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
2399  * If "cur" is the last one in the st->bucket,
2400  * call listening_get_first() to return the first sk of the next
2401  * non-empty bucket.
2402  */
2403 static void *listening_get_next(struct seq_file *seq, void *cur)
2404 {
2405 	struct tcp_iter_state *st = seq->private;
2406 	struct inet_listen_hashbucket *ilb2;
2407 	struct hlist_nulls_node *node;
2408 	struct inet_hashinfo *hinfo;
2409 	struct sock *sk = cur;
2410 
2411 	++st->num;
2412 	++st->offset;
2413 
2414 	sk = sk_nulls_next(sk);
2415 	sk_nulls_for_each_from(sk, node) {
2416 		if (seq_sk_match(seq, sk))
2417 			return sk;
2418 	}
2419 
2420 	hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2421 	ilb2 = &hinfo->lhash2[st->bucket];
2422 	spin_unlock(&ilb2->lock);
2423 	++st->bucket;
2424 	return listening_get_first(seq);
2425 }
2426 
2427 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2428 {
2429 	struct tcp_iter_state *st = seq->private;
2430 	void *rc;
2431 
2432 	st->bucket = 0;
2433 	st->offset = 0;
2434 	rc = listening_get_first(seq);
2435 
2436 	while (rc && *pos) {
2437 		rc = listening_get_next(seq, rc);
2438 		--*pos;
2439 	}
2440 	return rc;
2441 }
2442 
2443 static inline bool empty_bucket(struct inet_hashinfo *hinfo,
2444 				const struct tcp_iter_state *st)
2445 {
2446 	return hlist_nulls_empty(&hinfo->ehash[st->bucket].chain);
2447 }
2448 
2449 /*
2450  * Get first established socket starting from bucket given in st->bucket.
2451  * If st->bucket is zero, the very first socket in the hash is returned.
2452  */
2453 static void *established_get_first(struct seq_file *seq)
2454 {
2455 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2456 	struct tcp_iter_state *st = seq->private;
2457 
2458 	st->offset = 0;
2459 	for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) {
2460 		struct sock *sk;
2461 		struct hlist_nulls_node *node;
2462 		spinlock_t *lock = inet_ehash_lockp(hinfo, st->bucket);
2463 
2464 		cond_resched();
2465 
2466 		/* Lockless fast path for the common case of empty buckets */
2467 		if (empty_bucket(hinfo, st))
2468 			continue;
2469 
2470 		spin_lock_bh(lock);
2471 		sk_nulls_for_each(sk, node, &hinfo->ehash[st->bucket].chain) {
2472 			if (seq_sk_match(seq, sk))
2473 				return sk;
2474 		}
2475 		spin_unlock_bh(lock);
2476 	}
2477 
2478 	return NULL;
2479 }
2480 
2481 static void *established_get_next(struct seq_file *seq, void *cur)
2482 {
2483 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2484 	struct tcp_iter_state *st = seq->private;
2485 	struct hlist_nulls_node *node;
2486 	struct sock *sk = cur;
2487 
2488 	++st->num;
2489 	++st->offset;
2490 
2491 	sk = sk_nulls_next(sk);
2492 
2493 	sk_nulls_for_each_from(sk, node) {
2494 		if (seq_sk_match(seq, sk))
2495 			return sk;
2496 	}
2497 
2498 	spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2499 	++st->bucket;
2500 	return established_get_first(seq);
2501 }
2502 
2503 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2504 {
2505 	struct tcp_iter_state *st = seq->private;
2506 	void *rc;
2507 
2508 	st->bucket = 0;
2509 	rc = established_get_first(seq);
2510 
2511 	while (rc && pos) {
2512 		rc = established_get_next(seq, rc);
2513 		--pos;
2514 	}
2515 	return rc;
2516 }
2517 
2518 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2519 {
2520 	void *rc;
2521 	struct tcp_iter_state *st = seq->private;
2522 
2523 	st->state = TCP_SEQ_STATE_LISTENING;
2524 	rc	  = listening_get_idx(seq, &pos);
2525 
2526 	if (!rc) {
2527 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2528 		rc	  = established_get_idx(seq, pos);
2529 	}
2530 
2531 	return rc;
2532 }
2533 
2534 static void *tcp_seek_last_pos(struct seq_file *seq)
2535 {
2536 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2537 	struct tcp_iter_state *st = seq->private;
2538 	int bucket = st->bucket;
2539 	int offset = st->offset;
2540 	int orig_num = st->num;
2541 	void *rc = NULL;
2542 
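	/* Resume from the bucket/offset recorded on the previous stop(); the
	 * offset replay stops early if we end up walking into another bucket.
	 */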
2543 	switch (st->state) {
2544 	case TCP_SEQ_STATE_LISTENING:
2545 		if (st->bucket > hinfo->lhash2_mask)
2546 			break;
2547 		rc = listening_get_first(seq);
2548 		while (offset-- && rc && bucket == st->bucket)
2549 			rc = listening_get_next(seq, rc);
2550 		if (rc)
2551 			break;
2552 		st->bucket = 0;
2553 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2554 		fallthrough;
2555 	case TCP_SEQ_STATE_ESTABLISHED:
2556 		if (st->bucket > hinfo->ehash_mask)
2557 			break;
2558 		rc = established_get_first(seq);
2559 		while (offset-- && rc && bucket == st->bucket)
2560 			rc = established_get_next(seq, rc);
2561 	}
2562 
2563 	st->num = orig_num;
2564 
2565 	return rc;
2566 }
2567 
2568 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2569 {
2570 	struct tcp_iter_state *st = seq->private;
2571 	void *rc;
2572 
2573 	if (*pos && *pos == st->last_pos) {
2574 		rc = tcp_seek_last_pos(seq);
2575 		if (rc)
2576 			goto out;
2577 	}
2578 
2579 	st->state = TCP_SEQ_STATE_LISTENING;
2580 	st->num = 0;
2581 	st->bucket = 0;
2582 	st->offset = 0;
2583 	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2584 
2585 out:
2586 	st->last_pos = *pos;
2587 	return rc;
2588 }
2589 EXPORT_SYMBOL(tcp_seq_start);
2590 
2591 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2592 {
2593 	struct tcp_iter_state *st = seq->private;
2594 	void *rc = NULL;
2595 
2596 	if (v == SEQ_START_TOKEN) {
2597 		rc = tcp_get_idx(seq, 0);
2598 		goto out;
2599 	}
2600 
2601 	switch (st->state) {
2602 	case TCP_SEQ_STATE_LISTENING:
2603 		rc = listening_get_next(seq, v);
2604 		if (!rc) {
2605 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2606 			st->bucket = 0;
2607 			st->offset = 0;
2608 			rc	  = established_get_first(seq);
2609 		}
2610 		break;
2611 	case TCP_SEQ_STATE_ESTABLISHED:
2612 		rc = established_get_next(seq, v);
2613 		break;
2614 	}
2615 out:
2616 	++*pos;
2617 	st->last_pos = *pos;
2618 	return rc;
2619 }
2620 EXPORT_SYMBOL(tcp_seq_next);
2621 
2622 void tcp_seq_stop(struct seq_file *seq, void *v)
2623 {
2624 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2625 	struct tcp_iter_state *st = seq->private;
2626 
2627 	switch (st->state) {
2628 	case TCP_SEQ_STATE_LISTENING:
2629 		if (v != SEQ_START_TOKEN)
2630 			spin_unlock(&hinfo->lhash2[st->bucket].lock);
2631 		break;
2632 	case TCP_SEQ_STATE_ESTABLISHED:
2633 		if (v)
2634 			spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2635 		break;
2636 	}
2637 }
2638 EXPORT_SYMBOL(tcp_seq_stop);
2639 
2640 static void get_openreq4(const struct request_sock *req,
2641 			 struct seq_file *f, int i)
2642 {
2643 	const struct inet_request_sock *ireq = inet_rsk(req);
2644 	long delta = req->rsk_timer.expires - jiffies;
2645 
2646 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2647 		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2648 		i,
2649 		ireq->ir_loc_addr,
2650 		ireq->ir_num,
2651 		ireq->ir_rmt_addr,
2652 		ntohs(ireq->ir_rmt_port),
2653 		TCP_SYN_RECV,
2654 		0, 0, /* could print option size, but that is af dependent. */
2655 		1,    /* timers active (only the expire timer) */
2656 		jiffies_delta_to_clock_t(delta),
2657 		req->num_timeout,
2658 		from_kuid_munged(seq_user_ns(f),
2659 				 sock_i_uid(req->rsk_listener)),
2660 		0,  /* non standard timer */
2661 		0, /* open_requests have no inode */
2662 		0,
2663 		req);
2664 }
2665 
2666 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2667 {
2668 	int timer_active;
2669 	unsigned long timer_expires;
2670 	const struct tcp_sock *tp = tcp_sk(sk);
2671 	const struct inet_connection_sock *icsk = inet_csk(sk);
2672 	const struct inet_sock *inet = inet_sk(sk);
2673 	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2674 	__be32 dest = inet->inet_daddr;
2675 	__be32 src = inet->inet_rcv_saddr;
2676 	__u16 destp = ntohs(inet->inet_dport);
2677 	__u16 srcp = ntohs(inet->inet_sport);
2678 	int rx_queue;
2679 	int state;
2680 
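	/* Timer codes reported in the "tr" column of /proc/net/tcp:
	 * 1 retransmit/loss-probe, 2 keepalive (sk_timer), 4 zero-window
	 * probe, 0 none.
	 */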
2681 	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2682 	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2683 	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2684 		timer_active	= 1;
2685 		timer_expires	= icsk->icsk_timeout;
2686 	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2687 		timer_active	= 4;
2688 		timer_expires	= icsk->icsk_timeout;
2689 	} else if (timer_pending(&sk->sk_timer)) {
2690 		timer_active	= 2;
2691 		timer_expires	= sk->sk_timer.expires;
2692 	} else {
2693 		timer_active	= 0;
2694 		timer_expires = jiffies;
2695 	}
2696 
2697 	state = inet_sk_state_load(sk);
2698 	if (state == TCP_LISTEN)
2699 		rx_queue = READ_ONCE(sk->sk_ack_backlog);
2700 	else
2701 		/* Because we don't lock the socket,
2702 		 * we might find a transient negative value.
2703 		 */
2704 		rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2705 				      READ_ONCE(tp->copied_seq), 0);
2706 
2707 	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2708 			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2709 		i, src, srcp, dest, destp, state,
2710 		READ_ONCE(tp->write_seq) - tp->snd_una,
2711 		rx_queue,
2712 		timer_active,
2713 		jiffies_delta_to_clock_t(timer_expires - jiffies),
2714 		icsk->icsk_retransmits,
2715 		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2716 		icsk->icsk_probes_out,
2717 		sock_i_ino(sk),
2718 		refcount_read(&sk->sk_refcnt), sk,
2719 		jiffies_to_clock_t(icsk->icsk_rto),
2720 		jiffies_to_clock_t(icsk->icsk_ack.ato),
2721 		(icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2722 		tcp_snd_cwnd(tp),
2723 		state == TCP_LISTEN ?
2724 		    fastopenq->max_qlen :
2725 		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2726 }
2727 
2728 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2729 			       struct seq_file *f, int i)
2730 {
2731 	long delta = tw->tw_timer.expires - jiffies;
2732 	__be32 dest, src;
2733 	__u16 destp, srcp;
2734 
2735 	dest  = tw->tw_daddr;
2736 	src   = tw->tw_rcv_saddr;
2737 	destp = ntohs(tw->tw_dport);
2738 	srcp  = ntohs(tw->tw_sport);
2739 
2740 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2741 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2742 		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2743 		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2744 		refcount_read(&tw->tw_refcnt), tw);
2745 }
2746 
2747 #define TMPSZ 150
2748 
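/* Each record printed for /proc/net/tcp is padded to TMPSZ - 1 characters
 * before the trailing newline added by seq_pad().
 */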
2749 static int tcp4_seq_show(struct seq_file *seq, void *v)
2750 {
2751 	struct tcp_iter_state *st;
2752 	struct sock *sk = v;
2753 
2754 	seq_setwidth(seq, TMPSZ - 1);
2755 	if (v == SEQ_START_TOKEN) {
2756 		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2757 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2758 			   "inode");
2759 		goto out;
2760 	}
2761 	st = seq->private;
2762 
2763 	if (sk->sk_state == TCP_TIME_WAIT)
2764 		get_timewait4_sock(v, seq, st->num);
2765 	else if (sk->sk_state == TCP_NEW_SYN_RECV)
2766 		get_openreq4(v, seq, st->num);
2767 	else
2768 		get_tcp4_sock(v, seq, st->num);
2769 out:
2770 	seq_pad(seq, '\n');
2771 	return 0;
2772 }
2773 
2774 #ifdef CONFIG_BPF_SYSCALL
2775 struct bpf_tcp_iter_state {
2776 	struct tcp_iter_state state;
2777 	unsigned int cur_sk;
2778 	unsigned int end_sk;
2779 	unsigned int max_sk;
2780 	struct sock **batch;
2781 	bool st_bucket_done;
2782 };
2783 
2784 struct bpf_iter__tcp {
2785 	__bpf_md_ptr(struct bpf_iter_meta *, meta);
2786 	__bpf_md_ptr(struct sock_common *, sk_common);
2787 	uid_t uid __aligned(8);
2788 };
2789 
2790 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
2791 			     struct sock_common *sk_common, uid_t uid)
2792 {
2793 	struct bpf_iter__tcp ctx;
2794 
2795 	meta->seq_num--;  /* skip SEQ_START_TOKEN */
2796 	ctx.meta = meta;
2797 	ctx.sk_common = sk_common;
2798 	ctx.uid = uid;
2799 	return bpf_iter_run_prog(prog, &ctx);
2800 }
2801 
2802 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
2803 {
2804 	while (iter->cur_sk < iter->end_sk)
2805 		sock_gen_put(iter->batch[iter->cur_sk++]);
2806 }
2807 
2808 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
2809 				      unsigned int new_batch_sz)
2810 {
2811 	struct sock **new_batch;
2812 
2813 	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
2814 			     GFP_USER | __GFP_NOWARN);
2815 	if (!new_batch)
2816 		return -ENOMEM;
2817 
2818 	bpf_iter_tcp_put_batch(iter);
2819 	kvfree(iter->batch);
2820 	iter->batch = new_batch;
2821 	iter->max_sk = new_batch_sz;
2822 
2823 	return 0;
2824 }
2825 
2826 static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
2827 						 struct sock *start_sk)
2828 {
2829 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2830 	struct bpf_tcp_iter_state *iter = seq->private;
2831 	struct tcp_iter_state *st = &iter->state;
2832 	struct hlist_nulls_node *node;
2833 	unsigned int expected = 1;
2834 	struct sock *sk;
2835 
2836 	sock_hold(start_sk);
2837 	iter->batch[iter->end_sk++] = start_sk;
2838 
2839 	sk = sk_nulls_next(start_sk);
2840 	sk_nulls_for_each_from(sk, node) {
2841 		if (seq_sk_match(seq, sk)) {
2842 			if (iter->end_sk < iter->max_sk) {
2843 				sock_hold(sk);
2844 				iter->batch[iter->end_sk++] = sk;
2845 			}
2846 			expected++;
2847 		}
2848 	}
2849 	spin_unlock(&hinfo->lhash2[st->bucket].lock);
2850 
2851 	return expected;
2852 }
2853 
2854 static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
2855 						   struct sock *start_sk)
2856 {
2857 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2858 	struct bpf_tcp_iter_state *iter = seq->private;
2859 	struct tcp_iter_state *st = &iter->state;
2860 	struct hlist_nulls_node *node;
2861 	unsigned int expected = 1;
2862 	struct sock *sk;
2863 
2864 	sock_hold(start_sk);
2865 	iter->batch[iter->end_sk++] = start_sk;
2866 
2867 	sk = sk_nulls_next(start_sk);
2868 	sk_nulls_for_each_from(sk, node) {
2869 		if (seq_sk_match(seq, sk)) {
2870 			if (iter->end_sk < iter->max_sk) {
2871 				sock_hold(sk);
2872 				iter->batch[iter->end_sk++] = sk;
2873 			}
2874 			expected++;
2875 		}
2876 	}
2877 	spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2878 
2879 	return expected;
2880 }
2881 
2882 static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
2883 {
2884 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2885 	struct bpf_tcp_iter_state *iter = seq->private;
2886 	struct tcp_iter_state *st = &iter->state;
2887 	unsigned int expected;
2888 	bool resized = false;
2889 	struct sock *sk;
2890 
2891 	/* The st->bucket is done.  Directly advance to the next
2892 	 * bucket instead of having tcp_seek_last_pos() skip entries
2893 	 * one by one in the current bucket, only to find out it has
2894 	 * to advance to the next bucket.
2895 	 */
2896 	if (iter->st_bucket_done) {
2897 		st->offset = 0;
2898 		st->bucket++;
2899 		if (st->state == TCP_SEQ_STATE_LISTENING &&
2900 		    st->bucket > hinfo->lhash2_mask) {
2901 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2902 			st->bucket = 0;
2903 		}
2904 	}
2905 
2906 again:
2907 	/* Get a new batch */
2908 	iter->cur_sk = 0;
2909 	iter->end_sk = 0;
2910 	iter->st_bucket_done = false;
2911 
2912 	sk = tcp_seek_last_pos(seq);
2913 	if (!sk)
2914 		return NULL; /* Done */
2915 
2916 	if (st->state == TCP_SEQ_STATE_LISTENING)
2917 		expected = bpf_iter_tcp_listening_batch(seq, sk);
2918 	else
2919 		expected = bpf_iter_tcp_established_batch(seq, sk);
2920 
2921 	if (iter->end_sk == expected) {
2922 		iter->st_bucket_done = true;
2923 		return sk;
2924 	}
2925 
2926 	if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) {
2927 		resized = true;
2928 		goto again;
2929 	}
2930 
2931 	return sk;
2932 }
2933 
2934 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
2935 {
2936 	/* bpf iter does not support lseek, so it always
2937 	 * continues from where it was stop()-ped.
2938 	 */
2939 	if (*pos)
2940 		return bpf_iter_tcp_batch(seq);
2941 
2942 	return SEQ_START_TOKEN;
2943 }
2944 
2945 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2946 {
2947 	struct bpf_tcp_iter_state *iter = seq->private;
2948 	struct tcp_iter_state *st = &iter->state;
2949 	struct sock *sk;
2950 
2951 	/* Whenever seq_next() is called, the sk at iter->cur_sk is
2952 	 * done with seq_show(), so advance to the next sk in
2953 	 * the batch.
2954 	 */
2955 	if (iter->cur_sk < iter->end_sk) {
2956 		/* Keeping st->num consistent in tcp_iter_state.
2957 		 * bpf_iter_tcp does not use st->num.
2958 		 * meta.seq_num is used instead.
2959 		 */
2960 		st->num++;
2961 		/* Move st->offset to the next sk in the bucket such that
2962 		 * the future start() will resume at st->offset in
2963 		 * st->bucket.  See tcp_seek_last_pos().
2964 		 */
2965 		st->offset++;
2966 		sock_gen_put(iter->batch[iter->cur_sk++]);
2967 	}
2968 
2969 	if (iter->cur_sk < iter->end_sk)
2970 		sk = iter->batch[iter->cur_sk];
2971 	else
2972 		sk = bpf_iter_tcp_batch(seq);
2973 
2974 	++*pos;
2975 	/* Keeping st->last_pos consistent in tcp_iter_state.
2976 	 * bpf iter does not do lseek, so st->last_pos always equals *pos.
2977 	 */
2978 	st->last_pos = *pos;
2979 	return sk;
2980 }
2981 
2982 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
2983 {
2984 	struct bpf_iter_meta meta;
2985 	struct bpf_prog *prog;
2986 	struct sock *sk = v;
2987 	uid_t uid;
2988 	int ret;
2989 
2990 	if (v == SEQ_START_TOKEN)
2991 		return 0;
2992 
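	/* Only full sockets can be locked; holding the lock lets the attached
	 * bpf program safely use socket helpers such as bpf_setsockopt() on
	 * this socket.
	 */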
2993 	if (sk_fullsock(sk))
2994 		lock_sock(sk);
2995 
2996 	if (unlikely(sk_unhashed(sk))) {
2997 		ret = SEQ_SKIP;
2998 		goto unlock;
2999 	}
3000 
3001 	if (sk->sk_state == TCP_TIME_WAIT) {
3002 		uid = 0;
3003 	} else if (sk->sk_state == TCP_NEW_SYN_RECV) {
3004 		const struct request_sock *req = v;
3005 
3006 		uid = from_kuid_munged(seq_user_ns(seq),
3007 				       sock_i_uid(req->rsk_listener));
3008 	} else {
3009 		uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
3010 	}
3011 
3012 	meta.seq = seq;
3013 	prog = bpf_iter_get_info(&meta, false);
3014 	ret = tcp_prog_seq_show(prog, &meta, v, uid);
3015 
3016 unlock:
3017 	if (sk_fullsock(sk))
3018 		release_sock(sk);
3019 	return ret;
3020 
3021 }
3022 
3023 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
3024 {
3025 	struct bpf_tcp_iter_state *iter = seq->private;
3026 	struct bpf_iter_meta meta;
3027 	struct bpf_prog *prog;
3028 
3029 	if (!v) {
3030 		meta.seq = seq;
3031 		prog = bpf_iter_get_info(&meta, true);
3032 		if (prog)
3033 			(void)tcp_prog_seq_show(prog, &meta, v, 0);
3034 	}
3035 
3036 	if (iter->cur_sk < iter->end_sk) {
3037 		bpf_iter_tcp_put_batch(iter);
3038 		iter->st_bucket_done = false;
3039 	}
3040 }
3041 
3042 static const struct seq_operations bpf_iter_tcp_seq_ops = {
3043 	.show		= bpf_iter_tcp_seq_show,
3044 	.start		= bpf_iter_tcp_seq_start,
3045 	.next		= bpf_iter_tcp_seq_next,
3046 	.stop		= bpf_iter_tcp_seq_stop,
3047 };
3048 #endif
3049 static unsigned short seq_file_family(const struct seq_file *seq)
3050 {
3051 	const struct tcp_seq_afinfo *afinfo;
3052 
3053 #ifdef CONFIG_BPF_SYSCALL
3054 	/* Iterated from bpf_iter.  Let the bpf prog filter instead. */
3055 	if (seq->op == &bpf_iter_tcp_seq_ops)
3056 		return AF_UNSPEC;
3057 #endif
3058 
3059 	/* Iterated from proc fs */
3060 	afinfo = pde_data(file_inode(seq->file));
3061 	return afinfo->family;
3062 }
3063 
3064 static const struct seq_operations tcp4_seq_ops = {
3065 	.show		= tcp4_seq_show,
3066 	.start		= tcp_seq_start,
3067 	.next		= tcp_seq_next,
3068 	.stop		= tcp_seq_stop,
3069 };
3070 
3071 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
3072 	.family		= AF_INET,
3073 };
3074 
3075 static int __net_init tcp4_proc_init_net(struct net *net)
3076 {
3077 	if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
3078 			sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
3079 		return -ENOMEM;
3080 	return 0;
3081 }
3082 
3083 static void __net_exit tcp4_proc_exit_net(struct net *net)
3084 {
3085 	remove_proc_entry("tcp", net->proc_net);
3086 }
3087 
3088 static struct pernet_operations tcp4_net_ops = {
3089 	.init = tcp4_proc_init_net,
3090 	.exit = tcp4_proc_exit_net,
3091 };
3092 
3093 int __init tcp4_proc_init(void)
3094 {
3095 	return register_pernet_subsys(&tcp4_net_ops);
3096 }
3097 
3098 void tcp4_proc_exit(void)
3099 {
3100 	unregister_pernet_subsys(&tcp4_net_ops);
3101 }
3102 #endif /* CONFIG_PROC_FS */
3103 
3104 /* @wake is one when sk_stream_write_space() calls us.
3105  * This sends EPOLLOUT only if notsent_bytes is below half of the limit.
3106  * This mimics the strategy used in sock_def_write_space().
3107  */
3108 bool tcp_stream_memory_free(const struct sock *sk, int wake)
3109 {
3110 	const struct tcp_sock *tp = tcp_sk(sk);
3111 	u32 notsent_bytes = READ_ONCE(tp->write_seq) -
3112 			    READ_ONCE(tp->snd_nxt);
3113 
3114 	return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
3115 }
3116 EXPORT_SYMBOL(tcp_stream_memory_free);
3117 
3118 struct proto tcp_prot = {
3119 	.name			= "TCP",
3120 	.owner			= THIS_MODULE,
3121 	.close			= tcp_close,
3122 	.pre_connect		= tcp_v4_pre_connect,
3123 	.connect		= tcp_v4_connect,
3124 	.disconnect		= tcp_disconnect,
3125 	.accept			= inet_csk_accept,
3126 	.ioctl			= tcp_ioctl,
3127 	.init			= tcp_v4_init_sock,
3128 	.destroy		= tcp_v4_destroy_sock,
3129 	.shutdown		= tcp_shutdown,
3130 	.setsockopt		= tcp_setsockopt,
3131 	.getsockopt		= tcp_getsockopt,
3132 	.bpf_bypass_getsockopt	= tcp_bpf_bypass_getsockopt,
3133 	.keepalive		= tcp_set_keepalive,
3134 	.recvmsg		= tcp_recvmsg,
3135 	.sendmsg		= tcp_sendmsg,
3136 	.splice_eof		= tcp_splice_eof,
3137 	.backlog_rcv		= tcp_v4_do_rcv,
3138 	.release_cb		= tcp_release_cb,
3139 	.hash			= inet_hash,
3140 	.unhash			= inet_unhash,
3141 	.get_port		= inet_csk_get_port,
3142 	.put_port		= inet_put_port,
3143 #ifdef CONFIG_BPF_SYSCALL
3144 	.psock_update_sk_prot	= tcp_bpf_update_proto,
3145 #endif
3146 	.enter_memory_pressure	= tcp_enter_memory_pressure,
3147 	.leave_memory_pressure	= tcp_leave_memory_pressure,
3148 	.stream_memory_free	= tcp_stream_memory_free,
3149 	.sockets_allocated	= &tcp_sockets_allocated,
3150 	.orphan_count		= &tcp_orphan_count,
3151 
3152 	.memory_allocated	= &tcp_memory_allocated,
3153 	.per_cpu_fw_alloc	= &tcp_memory_per_cpu_fw_alloc,
3154 
3155 	.memory_pressure	= &tcp_memory_pressure,
3156 	.sysctl_mem		= sysctl_tcp_mem,
3157 	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
3158 	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
3159 	.max_header		= MAX_TCP_HEADER,
3160 	.obj_size		= sizeof(struct tcp_sock),
3161 	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
3162 	.twsk_prot		= &tcp_timewait_sock_ops,
3163 	.rsk_prot		= &tcp_request_sock_ops,
3164 	.h.hashinfo		= NULL,
3165 	.no_autobind		= true,
3166 	.diag_destroy		= tcp_abort,
3167 };
3168 EXPORT_SYMBOL(tcp_prot);
3169 
3170 static void __net_exit tcp_sk_exit(struct net *net)
3171 {
3172 	if (net->ipv4.tcp_congestion_control)
3173 		bpf_module_put(net->ipv4.tcp_congestion_control,
3174 			       net->ipv4.tcp_congestion_control->owner);
3175 }
3176 
3177 static void __net_init tcp_set_hashinfo(struct net *net)
3178 {
3179 	struct inet_hashinfo *hinfo;
3180 	unsigned int ehash_entries;
3181 	struct net *old_net;
3182 
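	/* A child netns gets its own ehash, sized by the creating netns'
	 * sysctl_tcp_child_ehash_entries; otherwise it shares the global
	 * tcp_hashinfo.
	 */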
3183 	if (net_eq(net, &init_net))
3184 		goto fallback;
3185 
3186 	old_net = current->nsproxy->net_ns;
3187 	ehash_entries = READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries);
3188 	if (!ehash_entries)
3189 		goto fallback;
3190 
3191 	ehash_entries = roundup_pow_of_two(ehash_entries);
3192 	hinfo = inet_pernet_hashinfo_alloc(&tcp_hashinfo, ehash_entries);
3193 	if (!hinfo) {
3194 		pr_warn("Failed to allocate TCP ehash (entries: %u) "
3195 			"for a netns, fallback to the global one\n",
3196 			ehash_entries);
3197 fallback:
3198 		hinfo = &tcp_hashinfo;
3199 		ehash_entries = tcp_hashinfo.ehash_mask + 1;
3200 	}
3201 
3202 	net->ipv4.tcp_death_row.hashinfo = hinfo;
3203 	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2;
3204 	net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128);
3205 }
3206 
3207 static int __net_init tcp_sk_init(struct net *net)
3208 {
3209 	net->ipv4.sysctl_tcp_ecn = 2;
3210 	net->ipv4.sysctl_tcp_ecn_fallback = 1;
3211 
3212 	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
3213 	net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
3214 	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
3215 	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
3216 	net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
3217 
3218 	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
3219 	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
3220 	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
3221 
3222 	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
3223 	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
3224 	net->ipv4.sysctl_tcp_syncookies = 1;
3225 	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
3226 	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
3227 	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
3228 	net->ipv4.sysctl_tcp_orphan_retries = 0;
3229 	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
3230 	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
3231 	net->ipv4.sysctl_tcp_tw_reuse = 2;
3232 	net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
3233 
3234 	refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1);
3235 	tcp_set_hashinfo(net);
3236 
3237 	net->ipv4.sysctl_tcp_sack = 1;
3238 	net->ipv4.sysctl_tcp_window_scaling = 1;
3239 	net->ipv4.sysctl_tcp_timestamps = 1;
3240 	net->ipv4.sysctl_tcp_early_retrans = 3;
3241 	net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
3242 	net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
3243 	net->ipv4.sysctl_tcp_retrans_collapse = 1;
3244 	net->ipv4.sysctl_tcp_max_reordering = 300;
3245 	net->ipv4.sysctl_tcp_dsack = 1;
3246 	net->ipv4.sysctl_tcp_app_win = 31;
3247 	net->ipv4.sysctl_tcp_adv_win_scale = 1;
3248 	net->ipv4.sysctl_tcp_frto = 2;
3249 	net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
3250 	/* This limits the percentage of the congestion window which we
3251 	 * will allow a single TSO frame to consume.  Building TSO frames
3252 	 * which are too large can cause TCP streams to be bursty.
3253 	 */
3254 	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
3255 	/* Default TSQ limit of 16 TSO segments */
3256 	net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
3257 
3258 	/* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. */
3259 	net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX;
3260 
3261 	net->ipv4.sysctl_tcp_min_tso_segs = 2;
3262 	net->ipv4.sysctl_tcp_tso_rtt_log = 9;  /* 2^9 = 512 usec */
3263 	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
3264 	net->ipv4.sysctl_tcp_autocorking = 1;
3265 	net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
3266 	net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
3267 	net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
3268 	if (net != &init_net) {
3269 		memcpy(net->ipv4.sysctl_tcp_rmem,
3270 		       init_net.ipv4.sysctl_tcp_rmem,
3271 		       sizeof(init_net.ipv4.sysctl_tcp_rmem));
3272 		memcpy(net->ipv4.sysctl_tcp_wmem,
3273 		       init_net.ipv4.sysctl_tcp_wmem,
3274 		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
3275 	}
3276 	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
3277 	net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
3278 	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
3279 	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
3280 	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
3281 	atomic_set(&net->ipv4.tfo_active_disable_times, 0);
3282 
3283 	/* Set default values for PLB */
3284 	net->ipv4.sysctl_tcp_plb_enabled = 0; /* Disabled by default */
3285 	net->ipv4.sysctl_tcp_plb_idle_rehash_rounds = 3;
3286 	net->ipv4.sysctl_tcp_plb_rehash_rounds = 12;
3287 	net->ipv4.sysctl_tcp_plb_suspend_rto_sec = 60;
3288 	/* Default congestion threshold for PLB to mark a round is 50% */
3289 	net->ipv4.sysctl_tcp_plb_cong_thresh = (1 << TCP_PLB_SCALE) / 2;
3290 
3291 	/* Reno is always built in */
3292 	if (!net_eq(net, &init_net) &&
3293 	    bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
3294 			       init_net.ipv4.tcp_congestion_control->owner))
3295 		net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
3296 	else
3297 		net->ipv4.tcp_congestion_control = &tcp_reno;
3298 
3299 	net->ipv4.sysctl_tcp_syn_linear_timeouts = 4;
3300 	net->ipv4.sysctl_tcp_shrink_window = 0;
3301 
3302 	return 0;
3303 }
3304 
3305 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
3306 {
3307 	struct net *net;
3308 
3309 	/* make sure concurrent calls to tcp_sk_exit_batch from net_cleanup_work
3310 	 * and failed setup_net error unwinding path are serialized.
3311 	 *
3312 	 * tcp_twsk_purge() handles twsk in any dead netns, not just those in
3313 	 * net_exit_list, so the thread that dismantles a particular twsk must
3314 	 * do so without another thread progressing to refcount_dec_and_test() of
3315 	 * tcp_death_row.tw_refcount.
3316 	 */
3317 	mutex_lock(&tcp_exit_batch_mutex);
3318 
3319 	tcp_twsk_purge(net_exit_list);
3320 
3321 	list_for_each_entry(net, net_exit_list, exit_list) {
3322 		inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo);
3323 		WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount));
3324 		tcp_fastopen_ctx_destroy(net);
3325 	}
3326 
3327 	mutex_unlock(&tcp_exit_batch_mutex);
3328 }
3329 
3330 static struct pernet_operations __net_initdata tcp_sk_ops = {
3331        .init	   = tcp_sk_init,
3332        .exit	   = tcp_sk_exit,
3333        .exit_batch = tcp_sk_exit_batch,
3334 };
3335 
3336 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3337 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
3338 		     struct sock_common *sk_common, uid_t uid)
3339 
3340 #define INIT_BATCH_SZ 16
3341 
3342 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
3343 {
3344 	struct bpf_tcp_iter_state *iter = priv_data;
3345 	int err;
3346 
3347 	err = bpf_iter_init_seq_net(priv_data, aux);
3348 	if (err)
3349 		return err;
3350 
3351 	err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ);
3352 	if (err) {
3353 		bpf_iter_fini_seq_net(priv_data);
3354 		return err;
3355 	}
3356 
3357 	return 0;
3358 }
3359 
3360 static void bpf_iter_fini_tcp(void *priv_data)
3361 {
3362 	struct bpf_tcp_iter_state *iter = priv_data;
3363 
3364 	bpf_iter_fini_seq_net(priv_data);
3365 	kvfree(iter->batch);
3366 }
3367 
3368 static const struct bpf_iter_seq_info tcp_seq_info = {
3369 	.seq_ops		= &bpf_iter_tcp_seq_ops,
3370 	.init_seq_private	= bpf_iter_init_tcp,
3371 	.fini_seq_private	= bpf_iter_fini_tcp,
3372 	.seq_priv_size		= sizeof(struct bpf_tcp_iter_state),
3373 };
3374 
3375 static const struct bpf_func_proto *
3376 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
3377 			    const struct bpf_prog *prog)
3378 {
3379 	switch (func_id) {
3380 	case BPF_FUNC_setsockopt:
3381 		return &bpf_sk_setsockopt_proto;
3382 	case BPF_FUNC_getsockopt:
3383 		return &bpf_sk_getsockopt_proto;
3384 	default:
3385 		return NULL;
3386 	}
3387 }
3388 
3389 static struct bpf_iter_reg tcp_reg_info = {
3390 	.target			= "tcp",
3391 	.ctx_arg_info_size	= 1,
3392 	.ctx_arg_info		= {
3393 		{ offsetof(struct bpf_iter__tcp, sk_common),
3394 		  PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED },
3395 	},
3396 	.get_func_proto		= bpf_iter_tcp_get_func_proto,
3397 	.seq_info		= &tcp_seq_info,
3398 };
3399 
3400 static void __init bpf_iter_register(void)
3401 {
3402 	tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3403 	if (bpf_iter_reg_target(&tcp_reg_info))
3404 		pr_warn("Warning: could not register bpf iterator tcp\n");
3405 }
3406 
3407 #endif
3408 
3409 void __init tcp_v4_init(void)
3410 {
3411 	int cpu, res;
3412 
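	/* One control socket per possible CPU: these are used to emit RSTs and
	 * ACKs on behalf of sockets we do not own (TIME_WAIT, SYN_RECV, or no
	 * socket at all).
	 */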
3413 	for_each_possible_cpu(cpu) {
3414 		struct sock *sk;
3415 
3416 		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
3417 					   IPPROTO_TCP, &init_net);
3418 		if (res)
3419 			panic("Failed to create the TCP control socket.\n");
3420 		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
3421 
3422 		/* Please enforce IP_DF and IPID==0 for RST and
3423 		 * ACK sent in SYN-RECV and TIME-WAIT state.
3424 		 */
3425 		inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
3426 
3427 		per_cpu(ipv4_tcp_sk, cpu) = sk;
3428 	}
3429 	if (register_pernet_subsys(&tcp_sk_ops))
3430 		panic("Failed to create the TCP control socket.\n");
3431 
3432 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3433 	bpf_iter_register();
3434 #endif
3435 }
3436