xref: /openbmc/linux/net/ipv4/tcp_ipv4.c (revision 00bb9335)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		Implementation of the Transmission Control Protocol(TCP).
8  *
9  *		IPv4 specific functions
10  *
11  *		code split from:
12  *		linux/ipv4/tcp.c
13  *		linux/ipv4/tcp_input.c
14  *		linux/ipv4/tcp_output.c
15  *
16  *		See tcp.c for author information
17  */
18 
19 /*
20  * Changes:
21  *		David S. Miller	:	New socket lookup architecture.
22  *					This code is dedicated to John Dyson.
23  *		David S. Miller :	Change semantics of established hash,
24  *					half is devoted to TIME_WAIT sockets
25  *					and the rest go in the other half.
26  *		Andi Kleen :		Add support for syncookies and fixed
27  *					some bugs: ip options weren't passed to
28  *					the TCP layer, missed a check for an
29  *					ACK bit.
30  *		Andi Kleen :		Implemented fast path mtu discovery.
31  *	     				Fixed many serious bugs in the
32  *					request_sock handling and moved
33  *					most of it into the af independent code.
34  *					Added tail drop and some other bugfixes.
35  *					Added new listen semantics.
36  *		Mike McLagan	:	Routing by source
37  *	Juan Jose Ciarlante:		ip_dynaddr bits
38  *		Andi Kleen:		various fixes.
39  *	Vitaly E. Lavrov	:	Transparent proxy revived after year
40  *					coma.
41  *	Andi Kleen		:	Fix new listen.
42  *	Andi Kleen		:	Fix accept error reporting.
43  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
44  *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
45  *					a single port at the same time.
46  */
47 
48 #define pr_fmt(fmt) "TCP: " fmt
49 
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
60 #include <linux/sched.h>
61 
62 #include <net/net_namespace.h>
63 #include <net/icmp.h>
64 #include <net/inet_hashtables.h>
65 #include <net/tcp.h>
66 #include <net/transp_v6.h>
67 #include <net/ipv6.h>
68 #include <net/inet_common.h>
69 #include <net/timewait_sock.h>
70 #include <net/xfrm.h>
71 #include <net/secure_seq.h>
72 #include <net/busy_poll.h>
73 
74 #include <linux/inet.h>
75 #include <linux/ipv6.h>
76 #include <linux/stddef.h>
77 #include <linux/proc_fs.h>
78 #include <linux/seq_file.h>
79 #include <linux/inetdevice.h>
80 #include <linux/btf_ids.h>
81 
82 #include <crypto/hash.h>
83 #include <linux/scatterlist.h>
84 
85 #include <trace/events/tcp.h>
86 
87 #ifdef CONFIG_TCP_MD5SIG
/* Forward declaration of a file-local helper: it is called by the RST and
 * ACK reply builders further down to fill in the MD5 signature option of
 * the reply header they construct.
 */
88 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
89 			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
90 #endif
91 
/* TCP socket hash tables; exported for use outside this file. */
92 struct inet_hashinfo tcp_hashinfo;
93 EXPORT_SYMBOL(tcp_hashinfo);
94 
/* Per-CPU socket pointer. The reply builders in this file fetch it with
 * this_cpu_read() and use it as the sending socket for
 * ip_send_unicast_reply().
 */
95 static DEFINE_PER_CPU(struct sock *, ipv4_tcp_sk);
96 
tcp_v4_init_seq(const struct sk_buff * skb)97 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
98 {
99 	return secure_tcp_seq(ip_hdr(skb)->daddr,
100 			      ip_hdr(skb)->saddr,
101 			      tcp_hdr(skb)->dest,
102 			      tcp_hdr(skb)->source);
103 }
104 
tcp_v4_init_ts_off(const struct net * net,const struct sk_buff * skb)105 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
106 {
107 	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
108 }
109 
tcp_twsk_unique(struct sock * sk,struct sock * sktw,void * twp)110 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
111 {
112 	int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
113 	const struct inet_timewait_sock *tw = inet_twsk(sktw);
114 	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
115 	struct tcp_sock *tp = tcp_sk(sk);
116 
117 	if (reuse == 2) {
118 		/* Still does not detect *everything* that goes through
119 		 * lo, since we require a loopback src or dst address
120 		 * or direct binding to 'lo' interface.
121 		 */
122 		bool loopback = false;
123 		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
124 			loopback = true;
125 #if IS_ENABLED(CONFIG_IPV6)
126 		if (tw->tw_family == AF_INET6) {
127 			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
128 			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
129 			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
130 			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
131 				loopback = true;
132 		} else
133 #endif
134 		{
135 			if (ipv4_is_loopback(tw->tw_daddr) ||
136 			    ipv4_is_loopback(tw->tw_rcv_saddr))
137 				loopback = true;
138 		}
139 		if (!loopback)
140 			reuse = 0;
141 	}
142 
143 	/* With PAWS, it is safe from the viewpoint
144 	   of data integrity. Even without PAWS it is safe provided sequence
145 	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.
146 
147 	   Actually, the idea is close to VJ's one, only timestamp cache is
148 	   held not per host, but per port pair and TW bucket is used as state
149 	   holder.
150 
151 	   If TW bucket has been already destroyed we fall back to VJ's scheme
152 	   and use initial timestamp retrieved from peer table.
153 	 */
154 	if (tcptw->tw_ts_recent_stamp &&
155 	    (!twp || (reuse && time_after32(ktime_get_seconds(),
156 					    tcptw->tw_ts_recent_stamp)))) {
157 		/* inet_twsk_hashdance() sets sk_refcnt after putting twsk
158 		 * and releasing the bucket lock.
159 		 */
160 		if (unlikely(!refcount_inc_not_zero(&sktw->sk_refcnt)))
161 			return 0;
162 
163 		/* In case of repair and re-using TIME-WAIT sockets we still
164 		 * want to be sure that it is safe as above but honor the
165 		 * sequence numbers and time stamps set as part of the repair
166 		 * process.
167 		 *
168 		 * Without this check re-using a TIME-WAIT socket with TCP
169 		 * repair would accumulate a -1 on the repair assigned
170 		 * sequence number. The first time it is reused the sequence
171 		 * is -1, the second time -2, etc. This fixes that issue
172 		 * without appearing to create any others.
173 		 */
174 		if (likely(!tp->repair)) {
175 			u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
176 
177 			if (!seq)
178 				seq = 1;
179 			WRITE_ONCE(tp->write_seq, seq);
180 			tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
181 			tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
182 		}
183 
184 		return 1;
185 	}
186 
187 	return 0;
188 }
189 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
190 
tcp_v4_pre_connect(struct sock * sk,struct sockaddr * uaddr,int addr_len)191 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
192 			      int addr_len)
193 {
194 	/* This check is replicated from tcp_v4_connect() and intended to
195 	 * prevent BPF program called below from accessing bytes that are out
196 	 * of the bound specified by user in addr_len.
197 	 */
198 	if (addr_len < sizeof(struct sockaddr_in))
199 		return -EINVAL;
200 
201 	sock_owned_by_me(sk);
202 
203 	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr, &addr_len);
204 }
205 
206 /* This will initiate an outgoing connection. */
/*
 * @sk:       socket to connect
 * @uaddr:    destination, interpreted as a struct sockaddr_in
 * @addr_len: length of @uaddr supplied by the caller
 *
 * Returns 0 on success, or whatever tcp_fastopen_defer_connect() stored in
 * err when it asks for an early return. Errors produced directly here:
 *   -EINVAL        @addr_len too short, or a source-route option is set
 *                  and the destination address is zero
 *   -EAFNOSUPPORT  sin_family is not AF_INET
 *   -ENETUNREACH   the route found is multicast or broadcast
 * Any other error is propagated from the routing, hashing or transmit
 * helpers. After the socket has been moved to SYN-SENT, every failure goes
 * through the "failure" label, which puts it back to TCP_CLOSE.
 */
tcp_v4_connect(struct sock * sk,struct sockaddr * uaddr,int addr_len)207 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
208 {
209 	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
210 	struct inet_timewait_death_row *tcp_death_row;
211 	struct inet_sock *inet = inet_sk(sk);
212 	struct tcp_sock *tp = tcp_sk(sk);
213 	struct ip_options_rcu *inet_opt;
214 	struct net *net = sock_net(sk);
215 	__be16 orig_sport, orig_dport;
216 	__be32 daddr, nexthop;
217 	struct flowi4 *fl4;
218 	struct rtable *rt;
219 	int err;
220 
221 	if (addr_len < sizeof(struct sockaddr_in))
222 		return -EINVAL;
223 
224 	if (usin->sin_family != AF_INET)
225 		return -EAFNOSUPPORT;
226 
	/* By default we route towards the destination itself; with a
	 * source-route option the route lookup targets opt.faddr instead.
	 */
227 	nexthop = daddr = usin->sin_addr.s_addr;
228 	inet_opt = rcu_dereference_protected(inet->inet_opt,
229 					     lockdep_sock_is_held(sk));
230 	if (inet_opt && inet_opt->opt.srr) {
231 		if (!daddr)
232 			return -EINVAL;
233 		nexthop = inet_opt->opt.faddr;
234 	}
235 
	/* Remember the ports used for this first lookup; they are handed to
	 * ip_route_newports() once the final source port is known.
	 */
236 	orig_sport = inet->inet_sport;
237 	orig_dport = usin->sin_port;
238 	fl4 = &inet->cork.fl.u.ip4;
239 	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
240 			      sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
241 			      orig_dport, sk);
242 	if (IS_ERR(rt)) {
243 		err = PTR_ERR(rt);
244 		if (err == -ENETUNREACH)
245 			IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
246 		return err;
247 	}
248 
249 	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
250 		ip_rt_put(rt);
251 		return -ENETUNREACH;
252 	}
253 
	/* Without source routing, take the destination from the flow filled
	 * in by the route lookup.
	 */
254 	if (!inet_opt || !inet_opt->opt.srr)
255 		daddr = fl4->daddr;
256 
257 	tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
258 
	/* No source address set on the socket yet: adopt fl4->saddr from the
	 * route lookup. Otherwise keep the one already set.
	 */
259 	if (!inet->inet_saddr) {
260 		err = inet_bhash2_update_saddr(sk,  &fl4->saddr, AF_INET);
261 		if (err) {
262 			ip_rt_put(rt);
263 			return err;
264 		}
265 	} else {
266 		sk_rcv_saddr_set(sk, inet->inet_saddr);
267 	}
268 
269 	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
270 		/* Reset inherited state */
271 		tp->rx_opt.ts_recent	   = 0;
272 		tp->rx_opt.ts_recent_stamp = 0;
273 		if (likely(!tp->repair))
274 			WRITE_ONCE(tp->write_seq, 0);
275 	}
276 
277 	inet->inet_dport = usin->sin_port;
278 	sk_daddr_set(sk, daddr);
279 
280 	inet_csk(sk)->icsk_ext_hdr_len = 0;
281 	if (inet_opt)
282 		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
283 
284 	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
285 
286 	/* Socket identity is still unknown (sport may be zero).
287 	 * However we set state to SYN-SENT and not releasing socket
288 	 * lock select source port, enter ourselves into the hash tables and
289 	 * complete initialization after this.
290 	 */
291 	tcp_set_state(sk, TCP_SYN_SENT);
292 	err = inet_hash_connect(tcp_death_row, sk);
293 	if (err)
294 		goto failure;
295 
296 	sk_set_txhash(sk);
297 
	/* Redo the route with the original and the now-final port pair. On
	 * error rt is cleared so that the failure path's ip_rt_put() gets
	 * NULL rather than an error pointer.
	 */
298 	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
299 			       inet->inet_sport, inet->inet_dport, sk);
300 	if (IS_ERR(rt)) {
301 		err = PTR_ERR(rt);
302 		rt = NULL;
303 		goto failure;
304 	}
305 	/* OK, now commit destination to socket.  */
306 	sk->sk_gso_type = SKB_GSO_TCPV4;
307 	sk_setup_caps(sk, &rt->dst);
	/* From here on the failure path must not ip_rt_put() this route. */
308 	rt = NULL;
309 
	/* Under TCP repair the sequence number and timestamp offset are left
	 * as they are. Otherwise pick a sequence number only if none is set
	 * yet, and always compute the timestamp offset.
	 */
310 	if (likely(!tp->repair)) {
311 		if (!tp->write_seq)
312 			WRITE_ONCE(tp->write_seq,
313 				   secure_tcp_seq(inet->inet_saddr,
314 						  inet->inet_daddr,
315 						  inet->inet_sport,
316 						  usin->sin_port));
317 		WRITE_ONCE(tp->tsoffset,
318 			   secure_tcp_ts_off(net, inet->inet_saddr,
319 					     inet->inet_daddr));
320 	}
321 
322 	atomic_set(&inet->inet_id, get_random_u16());
323 
	/* A true return means: stop here and hand err back as-is, without
	 * going through the failure unwind.
	 */
324 	if (tcp_fastopen_defer_connect(sk, &err))
325 		return err;
326 	if (err)
327 		goto failure;
328 
329 	err = tcp_connect(sk);
330 
331 	if (err)
332 		goto failure;
333 
334 	return 0;
335 
336 failure:
337 	/*
338 	 * This unhashes the socket and releases the local port,
339 	 * if necessary.
340 	 */
341 	tcp_set_state(sk, TCP_CLOSE);
342 	inet_bhash2_reset_saddr(sk);
343 	ip_rt_put(rt);
344 	sk->sk_route_caps = 0;
345 	inet->inet_dport = 0;
346 	return err;
347 }
348 EXPORT_SYMBOL(tcp_v4_connect);
349 
350 /*
351  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
352  * It can be called through tcp_release_cb() if socket was owned by user
353  * at the time tcp_v4_err() was called to handle ICMP message.
354  */
tcp_v4_mtu_reduced(struct sock * sk)355 void tcp_v4_mtu_reduced(struct sock *sk)
356 {
357 	struct inet_sock *inet = inet_sk(sk);
358 	struct dst_entry *dst;
359 	u32 mtu;
360 
361 	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
362 		return;
363 	mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
364 	dst = inet_csk_update_pmtu(sk, mtu);
365 	if (!dst)
366 		return;
367 
368 	/* Something is about to be wrong... Remember soft error
369 	 * for the case, if this connection will not able to recover.
370 	 */
371 	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
372 		WRITE_ONCE(sk->sk_err_soft, EMSGSIZE);
373 
374 	mtu = dst_mtu(dst);
375 
376 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
377 	    ip_sk_accept_pmtu(sk) &&
378 	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
379 		tcp_sync_mss(sk, mtu);
380 
381 		/* Resend the TCP packet because it's
382 		 * clear that the old packet has been
383 		 * dropped. This is the new "fast" path mtu
384 		 * discovery.
385 		 */
386 		tcp_simple_retransmit(sk);
387 	} /* else let the usual retransmit timer handle it */
388 }
389 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
390 
do_redirect(struct sk_buff * skb,struct sock * sk)391 static void do_redirect(struct sk_buff *skb, struct sock *sk)
392 {
393 	struct dst_entry *dst = __sk_dst_check(sk, 0);
394 
395 	if (dst)
396 		dst->ops->redirect(dst, sk, skb);
397 }
398 
399 
400 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
tcp_req_err(struct sock * sk,u32 seq,bool abort)401 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
402 {
403 	struct request_sock *req = inet_reqsk(sk);
404 	struct net *net = sock_net(sk);
405 
406 	/* ICMPs are not backlogged, hence we cannot get
407 	 * an established socket here.
408 	 */
409 	if (seq != tcp_rsk(req)->snt_isn) {
410 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
411 	} else if (abort) {
412 		/*
413 		 * Still in SYN_RECV, just remove it silently.
414 		 * There is no good way to pass the error to the newly
415 		 * created socket, and POSIX does not want network
416 		 * errors returned from accept().
417 		 */
418 		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
419 		tcp_listendrop(req->rsk_listener);
420 	}
421 	reqsk_put(req);
422 }
423 EXPORT_SYMBOL(tcp_req_err);
424 
425 /* TCP-LD (RFC 6069) logic */
tcp_ld_RTO_revert(struct sock * sk,u32 seq)426 void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
427 {
428 	struct inet_connection_sock *icsk = inet_csk(sk);
429 	struct tcp_sock *tp = tcp_sk(sk);
430 	struct sk_buff *skb;
431 	s32 remaining;
432 	u32 delta_us;
433 
434 	if (sock_owned_by_user(sk))
435 		return;
436 
437 	if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
438 	    !icsk->icsk_backoff)
439 		return;
440 
441 	skb = tcp_rtx_queue_head(sk);
442 	if (WARN_ON_ONCE(!skb))
443 		return;
444 
445 	icsk->icsk_backoff--;
446 	icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
447 	icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
448 
449 	tcp_mstamp_refresh(tp);
450 	delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
451 	remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
452 
453 	if (remaining > 0) {
454 		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
455 					  remaining, TCP_RTO_MAX);
456 	} else {
457 		/* RTO revert clocked out retransmission.
458 		 * Will retransmit now.
459 		 */
460 		tcp_retransmit_timer(sk);
461 	}
462 }
463 EXPORT_SYMBOL(tcp_ld_RTO_revert);
464 
465 /*
466  * This routine is called by the ICMP module when it gets some
467  * sort of error condition.  If err < 0 then the socket should
468  * be closed and the error returned to the user.  If err > 0
469  * it's just the icmp type << 8 | icmp code.  After adjustment
470  * header points to the first 8 bytes of the tcp header.  We need
471  * to find the appropriate port.
472  *
473  * The locking strategy used here is very "optimistic". When
474  * someone else accesses the socket the ICMP is just dropped
475  * and for some paths there is no check at all.
476  * A more general error queue to queue errors for later handling
477  * is probably better.
478  *
479  */
480 
/*
 * @skb:  the ICMP error; skb->data is read as an IP header followed, after
 *        ihl 32-bit words, by a TCP header. The ICMP type and code are taken
 *        from icmp_hdr(skb).
 * @info: extra value of the ICMP message; stored as tp->mtu_info for
 *        FRAG_NEEDED and forwarded to ip_icmp_error().
 *
 * Returns -ENOENT when no socket matches, 0 in every other case.
 */
tcp_v4_err(struct sk_buff * skb,u32 info)481 int tcp_v4_err(struct sk_buff *skb, u32 info)
482 {
483 	const struct iphdr *iph = (const struct iphdr *)skb->data;
484 	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
485 	struct tcp_sock *tp;
486 	const int type = icmp_hdr(skb)->type;
487 	const int code = icmp_hdr(skb)->code;
488 	struct sock *sk;
489 	struct request_sock *fastopen;
490 	u32 seq, snd_una;
491 	int err;
492 	struct net *net = dev_net(skb->dev);
493 
	/* Look the socket up using the addresses and ports of the header
	 * found at skb->data.
	 */
494 	sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
495 				       iph->daddr, th->dest, iph->saddr,
496 				       ntohs(th->source), inet_iif(skb), 0);
497 	if (!sk) {
498 		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
499 		return -ENOENT;
500 	}
	/* TIME-WAIT sockets ignore the error; just drop the reference. */
501 	if (sk->sk_state == TCP_TIME_WAIT) {
502 		inet_twsk_put(inet_twsk(sk));
503 		return 0;
504 	}
505 	seq = ntohl(th->seq);
	/* Request sockets are handled by tcp_req_err(), which also releases
	 * the request reference. The third argument selects which errors
	 * abort the request.
	 */
506 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
507 		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
508 				     type == ICMP_TIME_EXCEEDED ||
509 				     (type == ICMP_DEST_UNREACH &&
510 				      (code == ICMP_NET_UNREACH ||
511 				       code == ICMP_HOST_UNREACH)));
512 		return 0;
513 	}
514 
515 	bh_lock_sock(sk);
516 	/* If too many ICMPs get dropped on busy
517 	 * servers this needs to be solved differently.
518 	 * We do take care of PMTU discovery (RFC1191) special case :
519 	 * we can receive locally generated ICMP messages while socket is held.
520 	 */
	/* Only the statistic is updated here; processing continues, and each
	 * action below checks sock_owned_by_user() again before acting.
	 */
521 	if (sock_owned_by_user(sk)) {
522 		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
523 			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
524 	}
525 	if (sk->sk_state == TCP_CLOSE)
526 		goto out;
527 
528 	if (static_branch_unlikely(&ip4_min_ttl)) {
529 		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
530 		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
531 			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
532 			goto out;
533 		}
534 	}
535 
536 	tp = tcp_sk(sk);
537 	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
538 	fastopen = rcu_dereference(tp->fastopen_rsk);
539 	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
	/* Except for listeners, the quoted sequence number must fall between
	 * snd_una and snd_nxt; anything else is counted and ignored.
	 */
540 	if (sk->sk_state != TCP_LISTEN &&
541 	    !between(seq, snd_una, tp->snd_nxt)) {
542 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
543 		goto out;
544 	}
545 
	/* First switch: translate the ICMP type/code into an errno in err,
	 * or finish early for messages that do not report an error.
	 */
546 	switch (type) {
547 	case ICMP_REDIRECT:
548 		if (!sock_owned_by_user(sk))
549 			do_redirect(skb, sk);
550 		goto out;
551 	case ICMP_SOURCE_QUENCH:
552 		/* Just silently ignore these. */
553 		goto out;
554 	case ICMP_PARAMETERPROB:
555 		err = EPROTO;
556 		break;
557 	case ICMP_DEST_UNREACH:
558 		if (code > NR_ICMP_UNREACH)
559 			goto out;
560 
561 		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
562 			/* We are not interested in TCP_LISTEN and open_requests
563 			 * (SYN-ACKs send out by Linux are always <576bytes so
564 			 * they should go through unfragmented).
565 			 */
566 			if (sk->sk_state == TCP_LISTEN)
567 				goto out;
568 
			/* Record the MTU, then either act on it now or flag
			 * it as deferred. A reference is taken only when the
			 * deferred flag was not already set.
			 */
569 			WRITE_ONCE(tp->mtu_info, info);
570 			if (!sock_owned_by_user(sk)) {
571 				tcp_v4_mtu_reduced(sk);
572 			} else {
573 				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
574 					sock_hold(sk);
575 			}
576 			goto out;
577 		}
578 
579 		err = icmp_err_convert[code].errno;
580 		/* check if this ICMP message allows revert of backoff.
581 		 * (see RFC 6069)
582 		 */
583 		if (!fastopen &&
584 		    (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
585 			tcp_ld_RTO_revert(sk, seq);
586 		break;
587 	case ICMP_TIME_EXCEEDED:
588 		err = EHOSTUNREACH;
589 		break;
590 	default:
591 		goto out;
592 	}
593 
	/* Second switch: connections still being set up are failed at once
	 * (or get a soft error if the user owns the socket). All other
	 * states fall through to the handling after the switch.
	 */
594 	switch (sk->sk_state) {
595 	case TCP_SYN_SENT:
596 	case TCP_SYN_RECV:
597 		/* Only in fast or simultaneous open. If a fast open socket is
598 		 * already accepted it is treated as a connected one below.
599 		 */
600 		if (fastopen && !fastopen->sk)
601 			break;
602 
603 		ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
604 
605 		if (!sock_owned_by_user(sk)) {
606 			WRITE_ONCE(sk->sk_err, err);
607 
608 			sk_error_report(sk);
609 
610 			tcp_done(sk);
611 		} else {
612 			WRITE_ONCE(sk->sk_err_soft, err);
613 		}
614 		goto out;
615 	}
616 
617 	/* If we've already connected we will keep trying
618 	 * until we time out, or the user gives up.
619 	 *
620 	 * rfc1122 4.2.3.9 allows to consider as hard errors
621 	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
622 	 * but it is obsoleted by pmtu discovery).
623 	 *
624 	 * Note, that in modern internet, where routing is unreliable
625 	 * and in each dark corner broken firewalls sit, sending random
626 	 * errors ordered by their masters even this two messages finally lose
627 	 * their original sense (even Linux sends invalid PORT_UNREACHs)
628 	 *
629 	 * Now we are in compliance with RFCs.
630 	 *							--ANK (980905)
631 	 */
632 
	/* The error is reported immediately only if the socket is not owned
	 * by the user and has the RECVERR bit set.
	 */
633 	if (!sock_owned_by_user(sk) &&
634 	    inet_test_bit(RECVERR, sk)) {
635 		WRITE_ONCE(sk->sk_err, err);
636 		sk_error_report(sk);
637 	} else	{ /* Only an error on timeout */
638 		WRITE_ONCE(sk->sk_err_soft, err);
639 	}
640 
641 out:
	/* Common exit for full sockets: unlock and drop the socket reference. */
642 	bh_unlock_sock(sk);
643 	sock_put(sk);
644 	return 0;
645 }
646 
/* Prepare @skb's TCP checksum fields for the given address pair.
 *
 * Stores the complemented result of tcp_v4_check() over skb->len, @saddr and
 * @daddr in the TCP header's check field, and records where the transport
 * header starts (relative to skb->head) and where the check field sits
 * inside it.
 */
void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
{
	skb->csum_offset = offsetof(struct tcphdr, check);
	skb->csum_start = skb_transport_header(skb) - skb->head;

	tcp_hdr(skb)->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
}
655 
656 /* This routine computes an IPv4 TCP checksum. */
tcp_v4_send_check(struct sock * sk,struct sk_buff * skb)657 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
658 {
659 	const struct inet_sock *inet = inet_sk(sk);
660 
661 	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
662 }
663 EXPORT_SYMBOL(tcp_v4_send_check);
664 
665 /*
666  *	This routine will send an RST to the other tcp.
667  *
668  *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
669  *		      for reset.
670  *	Answer: if a packet caused RST, it is not for a socket
671  *		existing in our system, if it is matched to a socket,
672  *		it is just duplicate segment or bug in other side's TCP.
673  *		So that we build reply only basing on parameters
674  *		arrived with segment.
675  *	Exception: precedence violation. We do not implement it in any case.
676  */
677 
678 #ifdef CONFIG_TCP_MD5SIG
679 #define OPTION_BYTES TCPOLEN_MD5SIG_ALIGNED
680 #else
681 #define OPTION_BYTES sizeof(__be32)
682 #endif
683 
tcp_v4_send_reset(const struct sock * sk,struct sk_buff * skb)684 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
685 {
686 	const struct tcphdr *th = tcp_hdr(skb);
687 	struct {
688 		struct tcphdr th;
689 		__be32 opt[OPTION_BYTES / sizeof(__be32)];
690 	} rep;
691 	struct ip_reply_arg arg;
692 #ifdef CONFIG_TCP_MD5SIG
693 	struct tcp_md5sig_key *key = NULL;
694 	const __u8 *hash_location = NULL;
695 	unsigned char newhash[16];
696 	int genhash;
697 	struct sock *sk1 = NULL;
698 #endif
699 	u64 transmit_time = 0;
700 	struct sock *ctl_sk;
701 	struct net *net;
702 	u32 txhash = 0;
703 
704 	/* Never send a reset in response to a reset. */
705 	if (th->rst)
706 		return;
707 
708 	/* If sk not NULL, it means we did a successful lookup and incoming
709 	 * route had to be correct. prequeue might have dropped our dst.
710 	 */
711 	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
712 		return;
713 
714 	/* Swap the send and the receive. */
715 	memset(&rep, 0, sizeof(rep));
716 	rep.th.dest   = th->source;
717 	rep.th.source = th->dest;
718 	rep.th.doff   = sizeof(struct tcphdr) / 4;
719 	rep.th.rst    = 1;
720 
721 	if (th->ack) {
722 		rep.th.seq = th->ack_seq;
723 	} else {
724 		rep.th.ack = 1;
725 		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
726 				       skb->len - (th->doff << 2));
727 	}
728 
729 	memset(&arg, 0, sizeof(arg));
730 	arg.iov[0].iov_base = (unsigned char *)&rep;
731 	arg.iov[0].iov_len  = sizeof(rep.th);
732 
733 	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
734 #ifdef CONFIG_TCP_MD5SIG
735 	rcu_read_lock();
736 	hash_location = tcp_parse_md5sig_option(th);
737 	if (sk && sk_fullsock(sk)) {
738 		const union tcp_md5_addr *addr;
739 		int l3index;
740 
741 		/* sdif set, means packet ingressed via a device
742 		 * in an L3 domain and inet_iif is set to it.
743 		 */
744 		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
745 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
746 		key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
747 	} else if (hash_location) {
748 		const union tcp_md5_addr *addr;
749 		int sdif = tcp_v4_sdif(skb);
750 		int dif = inet_iif(skb);
751 		int l3index;
752 
753 		/*
754 		 * active side is lost. Try to find listening socket through
755 		 * source port, and then find md5 key through listening socket.
756 		 * we are not loose security here:
757 		 * Incoming packet is checked with md5 hash with finding key,
758 		 * no RST generated if md5 hash doesn't match.
759 		 */
760 		sk1 = __inet_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo,
761 					     NULL, 0, ip_hdr(skb)->saddr,
762 					     th->source, ip_hdr(skb)->daddr,
763 					     ntohs(th->source), dif, sdif);
764 		/* don't send rst if it can't find key */
765 		if (!sk1)
766 			goto out;
767 
768 		/* sdif set, means packet ingressed via a device
769 		 * in an L3 domain and dif is set to it.
770 		 */
771 		l3index = sdif ? dif : 0;
772 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
773 		key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
774 		if (!key)
775 			goto out;
776 
777 
778 		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
779 		if (genhash || memcmp(hash_location, newhash, 16) != 0)
780 			goto out;
781 
782 	}
783 
784 	if (key) {
785 		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
786 				   (TCPOPT_NOP << 16) |
787 				   (TCPOPT_MD5SIG << 8) |
788 				   TCPOLEN_MD5SIG);
789 		/* Update length and the length the header thinks exists */
790 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
791 		rep.th.doff = arg.iov[0].iov_len / 4;
792 
793 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
794 				     key, ip_hdr(skb)->saddr,
795 				     ip_hdr(skb)->daddr, &rep.th);
796 	}
797 #endif
798 	/* Can't co-exist with TCPMD5, hence check rep.opt[0] */
799 	if (rep.opt[0] == 0) {
800 		__be32 mrst = mptcp_reset_option(skb);
801 
802 		if (mrst) {
803 			rep.opt[0] = mrst;
804 			arg.iov[0].iov_len += sizeof(mrst);
805 			rep.th.doff = arg.iov[0].iov_len / 4;
806 		}
807 	}
808 
809 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
810 				      ip_hdr(skb)->saddr, /* XXX */
811 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
812 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
813 	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
814 
815 	/* When socket is gone, all binding information is lost.
816 	 * routing might fail in this case. No choice here, if we choose to force
817 	 * input interface, we will misroute in case of asymmetric route.
818 	 */
819 	if (sk) {
820 		arg.bound_dev_if = sk->sk_bound_dev_if;
821 		if (sk_fullsock(sk))
822 			trace_tcp_send_reset(sk, skb);
823 	}
824 
825 	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
826 		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));
827 
828 	arg.tos = ip_hdr(skb)->tos;
829 	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
830 	local_bh_disable();
831 	ctl_sk = this_cpu_read(ipv4_tcp_sk);
832 	sock_net_set(ctl_sk, net);
833 	if (sk) {
834 		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
835 				   inet_twsk(sk)->tw_mark : sk->sk_mark;
836 		ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
837 				   inet_twsk(sk)->tw_priority : sk->sk_priority;
838 		transmit_time = tcp_transmit_time(sk);
839 		xfrm_sk_clone_policy(ctl_sk, sk);
840 		txhash = (sk->sk_state == TCP_TIME_WAIT) ?
841 			 inet_twsk(sk)->tw_txhash : sk->sk_txhash;
842 	} else {
843 		ctl_sk->sk_mark = 0;
844 		ctl_sk->sk_priority = 0;
845 	}
846 	ip_send_unicast_reply(ctl_sk,
847 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
848 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
849 			      &arg, arg.iov[0].iov_len,
850 			      transmit_time, txhash);
851 
852 	xfrm_sk_free_policy(ctl_sk);
853 	sock_net_set(ctl_sk, &init_net);
854 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
855 	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
856 	local_bh_enable();
857 
858 #ifdef CONFIG_TCP_MD5SIG
859 out:
860 	rcu_read_unlock();
861 #endif
862 }
863 
864 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
865    outside socket context is ugly, certainly. What can I do?
866  */
867 
/*
 * tcp_v4_send_ack - send a bare ACK in reply to @skb, outside socket context
 * @sk:          socket (possibly a timewait socket) the reply is sent for
 * @skb:         received segment being answered
 * @seq:         sequence number of the reply
 * @ack:         acknowledgment number of the reply
 * @win:         value for the window field
 * @tsval:       timestamp value; only used when @tsecr is non-zero
 * @tsecr:       timestamp echo; non-zero adds a timestamp option
 * @oif:         if non-zero, output interface to bind the reply to
 * @key:         MD5 key, or NULL (only used with CONFIG_TCP_MD5SIG)
 * @reply_flags: flags stored in the ip_reply_arg
 * @tos:         TOS for the reply
 * @txhash:      hash passed on to ip_send_unicast_reply()
 */
static void tcp_v4_send_ack(const struct sock *sk,
			    struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 tsval, u32 tsecr, int oif,
			    struct tcp_md5sig_key *key,
			    int reply_flags, u8 tos, u32 txhash)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
#ifdef CONFIG_TCP_MD5SIG
			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
#endif
			];
	} rep;
	struct net *net = sock_net(sk);
	struct ip_reply_arg arg;
	struct sock *ctl_sk;
	u64 transmit_time;

	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof(arg));

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);

	/* Optional timestamp option: NOP, NOP, kind/length, then the values. */
	if (tsecr) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				   (TCPOPT_TIMESTAMP << 8) |
				   TCPOLEN_TIMESTAMP);
		rep.opt[1] = htonl(tsval);
		rep.opt[2] = htonl(tsecr);
		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
	}

	/* Swap the send and the receive. */
	rep.th.source  = th->dest;
	rep.th.dest    = th->source;
	rep.th.seq     = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack     = 1;
	rep.th.window  = htons(win);
	rep.th.doff    = arg.iov[0].iov_len / 4;

#ifdef CONFIG_TCP_MD5SIG
	if (key) {
		/* The MD5 option goes after the timestamp words, if any. */
		int idx = tsecr ? 3 : 0;

		rep.opt[idx] = htonl((TCPOPT_NOP << 24) |
				     (TCPOPT_NOP << 16) |
				     (TCPOPT_MD5SIG << 8) |
				     TCPOLEN_MD5SIG);
		idx++;
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_md5_hash_hdr((__u8 *)&rep.opt[idx],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif

	arg.flags = reply_flags;
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	if (oif)
		arg.bound_dev_if = oif;
	arg.tos = tos;
	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);

	local_bh_disable();
	ctl_sk = this_cpu_read(ipv4_tcp_sk);
	sock_net_set(ctl_sk, net);

	/* Mark and priority come from the timewait fields for a TIME-WAIT
	 * socket, and from the socket itself otherwise.
	 */
	if (sk->sk_state == TCP_TIME_WAIT)
		ctl_sk->sk_mark = inet_twsk(sk)->tw_mark;
	else
		ctl_sk->sk_mark = READ_ONCE(sk->sk_mark);

	if (sk->sk_state == TCP_TIME_WAIT)
		ctl_sk->sk_priority = inet_twsk(sk)->tw_priority;
	else
		ctl_sk->sk_priority = READ_ONCE(sk->sk_priority);

	transmit_time = tcp_transmit_time(sk);
	ip_send_unicast_reply(ctl_sk,
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len,
			      transmit_time, txhash);

	sock_net_set(ctl_sk, &init_net);
	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	local_bh_enable();
}
954 
tcp_v4_timewait_ack(struct sock * sk,struct sk_buff * skb)955 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
956 {
957 	struct inet_timewait_sock *tw = inet_twsk(sk);
958 	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
959 
960 	tcp_v4_send_ack(sk, skb,
961 			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
962 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
963 			tcp_time_stamp_raw() + tcptw->tw_ts_offset,
964 			tcptw->tw_ts_recent,
965 			tw->tw_bound_dev_if,
966 			tcp_twsk_md5_key(tcptw),
967 			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
968 			tw->tw_tos,
969 			tw->tw_txhash
970 			);
971 
972 	inet_twsk_put(tw);
973 }
974 
tcp_v4_reqsk_send_ack(const struct sock * sk,struct sk_buff * skb,struct request_sock * req)975 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
976 				  struct request_sock *req)
977 {
978 	const union tcp_md5_addr *addr;
979 	int l3index;
980 
981 	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
982 	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
983 	 */
984 	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
985 					     tcp_sk(sk)->snd_nxt;
986 
987 	/* RFC 7323 2.3
988 	 * The window field (SEG.WND) of every outgoing segment, with the
989 	 * exception of <SYN> segments, MUST be right-shifted by
990 	 * Rcv.Wind.Shift bits:
991 	 */
992 	addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
993 	l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
994 	tcp_v4_send_ack(sk, skb, seq,
995 			tcp_rsk(req)->rcv_nxt,
996 			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
997 			tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
998 			READ_ONCE(req->ts_recent),
999 			0,
1000 			tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
1001 			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
1002 			ip_hdr(skb)->tos,
1003 			READ_ONCE(tcp_rsk(req)->txhash));
1004 }
1005 
1006 /*
1007  *	Send a SYN-ACK after having received a SYN.
1008  *	This still operates on a request_sock only, not on a big
1009  *	socket.
1010  */
tcp_v4_send_synack(const struct sock * sk,struct dst_entry * dst,struct flowi * fl,struct request_sock * req,struct tcp_fastopen_cookie * foc,enum tcp_synack_type synack_type,struct sk_buff * syn_skb)1011 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
1012 			      struct flowi *fl,
1013 			      struct request_sock *req,
1014 			      struct tcp_fastopen_cookie *foc,
1015 			      enum tcp_synack_type synack_type,
1016 			      struct sk_buff *syn_skb)
1017 {
1018 	const struct inet_request_sock *ireq = inet_rsk(req);
1019 	struct flowi4 fl4;
1020 	int err = -1;
1021 	struct sk_buff *skb;
1022 	u8 tos;
1023 
1024 	/* First, grab a route. */
1025 	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
1026 		return -1;
1027 
1028 	skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
1029 
1030 	if (skb) {
1031 		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1032 
1033 		tos = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) ?
1034 				(tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1035 				(inet_sk(sk)->tos & INET_ECN_MASK) :
1036 				inet_sk(sk)->tos;
1037 
1038 		if (!INET_ECN_is_capable(tos) &&
1039 		    tcp_bpf_ca_needs_ecn((struct sock *)req))
1040 			tos |= INET_ECN_ECT_0;
1041 
1042 		rcu_read_lock();
1043 		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1044 					    ireq->ir_rmt_addr,
1045 					    rcu_dereference(ireq->ireq_opt),
1046 					    tos);
1047 		rcu_read_unlock();
1048 		err = net_xmit_eval(err);
1049 	}
1050 
1051 	return err;
1052 }
1053 
1054 /*
1055  *	IPv4 request_sock destructor.
1056  */
tcp_v4_reqsk_destructor(struct request_sock * req)1057 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1058 {
1059 	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1060 }
1061 
1062 #ifdef CONFIG_TCP_MD5SIG
/*
 * RFC2385 MD5 checksumming requires a mapping of
 * IP address->MD5 Key.
 * We need to maintain these in the sk structure.
 */

/* Deferred static key, initially false. It is incremented by
 * tcp_md5_do_add() and tcp_md5_key_copy() right after a socket gets its
 * md5sig_info allocated.
 */
DEFINE_STATIC_KEY_DEFERRED_FALSE(tcp_md5_needed, HZ);
EXPORT_SYMBOL(tcp_md5_needed);
1071 
better_md5_match(struct tcp_md5sig_key * old,struct tcp_md5sig_key * new)1072 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1073 {
1074 	if (!old)
1075 		return true;
1076 
1077 	/* l3index always overrides non-l3index */
1078 	if (old->l3index && new->l3index == 0)
1079 		return false;
1080 	if (old->l3index == 0 && new->l3index)
1081 		return true;
1082 
1083 	return old->prefixlen < new->prefixlen;
1084 }
1085 
1086 /* Find the Key structure for an address.  */
__tcp_md5_do_lookup(const struct sock * sk,int l3index,const union tcp_md5_addr * addr,int family)1087 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1088 					   const union tcp_md5_addr *addr,
1089 					   int family)
1090 {
1091 	const struct tcp_sock *tp = tcp_sk(sk);
1092 	struct tcp_md5sig_key *key;
1093 	const struct tcp_md5sig_info *md5sig;
1094 	__be32 mask;
1095 	struct tcp_md5sig_key *best_match = NULL;
1096 	bool match;
1097 
1098 	/* caller either holds rcu_read_lock() or socket lock */
1099 	md5sig = rcu_dereference_check(tp->md5sig_info,
1100 				       lockdep_sock_is_held(sk));
1101 	if (!md5sig)
1102 		return NULL;
1103 
1104 	hlist_for_each_entry_rcu(key, &md5sig->head, node,
1105 				 lockdep_sock_is_held(sk)) {
1106 		if (key->family != family)
1107 			continue;
1108 		if (key->flags & TCP_MD5SIG_FLAG_IFINDEX && key->l3index != l3index)
1109 			continue;
1110 		if (family == AF_INET) {
1111 			mask = inet_make_mask(key->prefixlen);
1112 			match = (key->addr.a4.s_addr & mask) ==
1113 				(addr->a4.s_addr & mask);
1114 #if IS_ENABLED(CONFIG_IPV6)
1115 		} else if (family == AF_INET6) {
1116 			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1117 						  key->prefixlen);
1118 #endif
1119 		} else {
1120 			match = false;
1121 		}
1122 
1123 		if (match && better_md5_match(best_match, key))
1124 			best_match = key;
1125 	}
1126 	return best_match;
1127 }
1128 EXPORT_SYMBOL(__tcp_md5_do_lookup);
1129 
/* Find the key whose family, address, prefix length, l3index and
 * TCP_MD5SIG_FLAG_IFINDEX flag all equal the given values exactly.
 * Returns NULL when there is no such key.
 */
static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
						      const union tcp_md5_addr *addr,
						      int family, u8 prefixlen,
						      int l3index, u8 flags)
{
	const struct tcp_md5sig_info *md5sig;
	struct tcp_md5sig_key *key;
	unsigned int addr_len;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tcp_sk(sk)->md5sig_info,
				       lockdep_sock_is_held(sk));
	if (!md5sig)
		return NULL;

	/* Only compare as many address bytes as the family uses. */
	if (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6)
		addr_len = sizeof(struct in6_addr);
	else
		addr_len = sizeof(struct in_addr);

	hlist_for_each_entry_rcu(key, &md5sig->head, node,
				 lockdep_sock_is_held(sk)) {
		if (key->family == family &&
		    (key->flags & TCP_MD5SIG_FLAG_IFINDEX) ==
		    (flags & TCP_MD5SIG_FLAG_IFINDEX) &&
		    key->l3index == l3index &&
		    key->prefixlen == prefixlen &&
		    !memcmp(&key->addr, addr, addr_len))
			return key;
	}
	return NULL;
}
1163 
tcp_v4_md5_lookup(const struct sock * sk,const struct sock * addr_sk)1164 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1165 					 const struct sock *addr_sk)
1166 {
1167 	const union tcp_md5_addr *addr;
1168 	int l3index;
1169 
1170 	l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1171 						 addr_sk->sk_bound_dev_if);
1172 	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1173 	return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1174 }
1175 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1176 
/* Allocate an empty MD5 key list for @sk and publish it in
 * tp->md5sig_info. GSO is disabled on the socket as part of this.
 * Returns 0, or -ENOMEM if the allocation fails.
 */
static int tcp_md5sig_info_add(struct sock *sk, gfp_t gfp)
{
	struct tcp_md5sig_info *info = kmalloc(sizeof(*info), gfp);

	if (!info)
		return -ENOMEM;

	sk_gso_disable(sk);

	/* Initialise the list head before the pointer becomes visible. */
	INIT_HLIST_HEAD(&info->head);
	rcu_assign_pointer(tcp_sk(sk)->md5sig_info, info);

	return 0;
}
1191 
1192 /* This can be called on a newly created socket, from other files */
__tcp_md5_do_add(struct sock * sk,const union tcp_md5_addr * addr,int family,u8 prefixlen,int l3index,u8 flags,const u8 * newkey,u8 newkeylen,gfp_t gfp)1193 static int __tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1194 			    int family, u8 prefixlen, int l3index, u8 flags,
1195 			    const u8 *newkey, u8 newkeylen, gfp_t gfp)
1196 {
1197 	/* Add Key to the list */
1198 	struct tcp_md5sig_key *key;
1199 	struct tcp_sock *tp = tcp_sk(sk);
1200 	struct tcp_md5sig_info *md5sig;
1201 
1202 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1203 	if (key) {
1204 		/* Pre-existing entry - just update that one.
1205 		 * Note that the key might be used concurrently.
1206 		 * data_race() is telling kcsan that we do not care of
1207 		 * key mismatches, since changing MD5 key on live flows
1208 		 * can lead to packet drops.
1209 		 */
1210 		data_race(memcpy(key->key, newkey, newkeylen));
1211 
1212 		/* Pairs with READ_ONCE() in tcp_md5_hash_key().
1213 		 * Also note that a reader could catch new key->keylen value
1214 		 * but old key->key[], this is the reason we use __GFP_ZERO
1215 		 * at sock_kmalloc() time below these lines.
1216 		 */
1217 		WRITE_ONCE(key->keylen, newkeylen);
1218 
1219 		return 0;
1220 	}
1221 
1222 	md5sig = rcu_dereference_protected(tp->md5sig_info,
1223 					   lockdep_sock_is_held(sk));
1224 
1225 	key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1226 	if (!key)
1227 		return -ENOMEM;
1228 	if (!tcp_alloc_md5sig_pool()) {
1229 		sock_kfree_s(sk, key, sizeof(*key));
1230 		return -ENOMEM;
1231 	}
1232 
1233 	memcpy(key->key, newkey, newkeylen);
1234 	key->keylen = newkeylen;
1235 	key->family = family;
1236 	key->prefixlen = prefixlen;
1237 	key->l3index = l3index;
1238 	key->flags = flags;
1239 	memcpy(&key->addr, addr,
1240 	       (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) :
1241 								 sizeof(struct in_addr));
1242 	hlist_add_head_rcu(&key->node, &md5sig->head);
1243 	return 0;
1244 }
1245 
/* Add (or update) an MD5 key on @sk, allocating with GFP_KERNEL.
 *
 * When the socket has no md5sig_info yet, one is allocated and the
 * tcp_md5_needed static key is incremented; if that increment fails the
 * fresh md5sig_info is torn down again and -EUSERS is returned.
 */
int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
		   int family, u8 prefixlen, int l3index, u8 flags,
		   const u8 *newkey, u8 newkeylen)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_info *md5sig;

	md5sig = rcu_dereference_protected(tp->md5sig_info,
					   lockdep_sock_is_held(sk));
	if (md5sig)
		goto add_key;

	if (tcp_md5sig_info_add(sk, GFP_KERNEL))
		return -ENOMEM;

	if (!static_branch_inc(&tcp_md5_needed.key)) {
		md5sig = rcu_dereference_protected(tp->md5sig_info,
						   lockdep_sock_is_held(sk));
		rcu_assign_pointer(tp->md5sig_info, NULL);
		kfree_rcu(md5sig, rcu);
		return -EUSERS;
	}

add_key:
	return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, flags,
				newkey, newkeylen, GFP_KERNEL);
}
EXPORT_SYMBOL(tcp_md5_do_add);
1270 
/* Copy an existing MD5 @key onto @sk for the given address, using atomic
 * allocations (GFP_ATOMIC passed through sk_gfp_mask()).
 *
 * Mirrors tcp_md5_do_add(): md5sig_info is created on demand, and when
 * the tcp_md5_needed static key cannot be incremented a rate-limited
 * warning is logged and -EUSERS is returned.
 */
int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr,
		     int family, u8 prefixlen, int l3index,
		     struct tcp_md5sig_key *key)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_info *md5sig;

	md5sig = rcu_dereference_protected(tp->md5sig_info,
					   lockdep_sock_is_held(sk));
	if (md5sig)
		goto copy_key;

	if (tcp_md5sig_info_add(sk, sk_gfp_mask(sk, GFP_ATOMIC)))
		return -ENOMEM;

	if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) {
		md5sig = rcu_dereference_protected(tp->md5sig_info,
						   lockdep_sock_is_held(sk));
		net_warn_ratelimited("Too many TCP-MD5 keys in the system\n");
		rcu_assign_pointer(tp->md5sig_info, NULL);
		kfree_rcu(md5sig, rcu);
		return -EUSERS;
	}

copy_key:
	return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index,
				key->flags, key->key, key->keylen,
				sk_gfp_mask(sk, GFP_ATOMIC));
}
EXPORT_SYMBOL(tcp_md5_key_copy);
1297 
/* Remove the key that exactly matches the given parameters from @sk.
 * Returns 0 on success, or -ENOENT when there is no such key.
 */
int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
		   u8 prefixlen, int l3index, u8 flags)
{
	struct tcp_md5sig_key *key =
		tcp_md5_do_lookup_exact(sk, addr, family, prefixlen,
					l3index, flags);

	if (key) {
		/* Unlink first, undo the sk_omem_alloc charge, then free
		 * the entry through RCU.
		 */
		hlist_del_rcu(&key->node);
		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
		kfree_rcu(key, rcu);
		return 0;
	}

	return -ENOENT;
}
EXPORT_SYMBOL(tcp_md5_do_del);
1312 
tcp_clear_md5_list(struct sock * sk)1313 static void tcp_clear_md5_list(struct sock *sk)
1314 {
1315 	struct tcp_sock *tp = tcp_sk(sk);
1316 	struct tcp_md5sig_key *key;
1317 	struct hlist_node *n;
1318 	struct tcp_md5sig_info *md5sig;
1319 
1320 	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1321 
1322 	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1323 		hlist_del_rcu(&key->node);
1324 		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1325 		kfree_rcu(key, rcu);
1326 	}
1327 }
1328 
/* Parse a struct tcp_md5sig copied from @optval and add or delete the
 * corresponding IPv4 key on @sk.
 *
 * A zero tcpm_keylen deletes the key, anything else adds/updates it.
 * The prefix length and interface index are only honoured for
 * TCP_MD5SIG_EXT. Returns 0 or a negative errno.
 */
static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
				 sockptr_t optval, int optlen)
{
	bool is_ext = optname == TCP_MD5SIG_EXT;
	const union tcp_md5_addr *addr;
	struct sockaddr_in *sin;
	struct tcp_md5sig cmd;
	u8 prefixlen = 32;
	int l3index = 0;
	u8 flags;

	if (optlen < sizeof(cmd))
		return -EINVAL;

	if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
		return -EFAULT;

	sin = (struct sockaddr_in *)&cmd.tcpm_addr;
	if (sin->sin_family != AF_INET)
		return -EINVAL;

	/* Only the IFINDEX flag is stored with the key. */
	flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;

	if (is_ext && (cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX)) {
		prefixlen = cmd.tcpm_prefixlen;
		if (prefixlen > 32)
			return -EINVAL;
	}

	if (is_ext && cmd.tcpm_ifindex && flags) {
		struct net_device *dev;

		rcu_read_lock();
		dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
		if (dev && netif_is_l3_master(dev))
			l3index = dev->ifindex;
		rcu_read_unlock();

		/* right now device MUST be an L3 master; l3index stays 0
		 * when the device is missing or is not one.
		 */
		if (!l3index)
			return -EINVAL;
	}

	addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;

	if (!cmd.tcpm_keylen)
		return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);

	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
		return -EINVAL;

	return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
			      cmd.tcpm_key, cmd.tcpm_keylen);
}
1386 
/* Feed the IPv4 pseudo header followed by a copy of the TCP header (with
 * its checksum field zeroed) into the hash request of @hp.
 *
 * Both are assembled back to back in hp->scratch. @nbytes is stored as
 * the pseudo header length. Returns what crypto_ahash_update() returns.
 */
static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
				   __be32 daddr, __be32 saddr,
				   const struct tcphdr *th, int nbytes)
{
	struct tcp4_pseudohdr *pseudo = hp->scratch;
	struct tcphdr *th_copy = (struct tcphdr *)(pseudo + 1);
	struct scatterlist sg;

	pseudo->saddr = saddr;
	pseudo->daddr = daddr;
	pseudo->pad = 0;
	pseudo->protocol = IPPROTO_TCP;
	pseudo->len = cpu_to_be16(nbytes);

	/* The checksum is excluded from the digest: hash a zeroed copy. */
	memcpy(th_copy, th, sizeof(*th));
	th_copy->check = 0;

	sg_init_one(&sg, pseudo, sizeof(*pseudo) + sizeof(*th));
	ahash_request_set_crypt(hp->md5_req, &sg, NULL,
				sizeof(*pseudo) + sizeof(*th));

	return crypto_ahash_update(hp->md5_req);
}
1411 
/* Compute the MD5 signature over the pseudo header, the TCP header @th
 * and @key, writing the digest to @md5_hash.
 *
 * Returns 0 on success. On any failure the 16 bytes at @md5_hash are
 * zeroed and 1 is returned.
 */
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
{
	struct tcp_md5sig_pool *hp = tcp_get_md5sig_pool();
	struct ahash_request *req;
	bool done = false;

	if (hp) {
		req = hp->md5_req;

		if (!crypto_ahash_init(req) &&
		    !tcp_v4_md5_hash_headers(hp, daddr, saddr, th,
					     th->doff << 2) &&
		    !tcp_md5_hash_key(hp, key)) {
			ahash_request_set_crypt(req, NULL, md5_hash, 0);
			done = !crypto_ahash_final(req);
		}

		/* The pool is released on success and failure alike. */
		tcp_put_md5sig_pool();
	}

	if (done)
		return 0;

	memset(md5_hash, 0, 16);
	return 1;
}
1442 
tcp_v4_md5_hash_skb(char * md5_hash,const struct tcp_md5sig_key * key,const struct sock * sk,const struct sk_buff * skb)1443 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1444 			const struct sock *sk,
1445 			const struct sk_buff *skb)
1446 {
1447 	struct tcp_md5sig_pool *hp;
1448 	struct ahash_request *req;
1449 	const struct tcphdr *th = tcp_hdr(skb);
1450 	__be32 saddr, daddr;
1451 
1452 	if (sk) { /* valid for establish/request sockets */
1453 		saddr = sk->sk_rcv_saddr;
1454 		daddr = sk->sk_daddr;
1455 	} else {
1456 		const struct iphdr *iph = ip_hdr(skb);
1457 		saddr = iph->saddr;
1458 		daddr = iph->daddr;
1459 	}
1460 
1461 	hp = tcp_get_md5sig_pool();
1462 	if (!hp)
1463 		goto clear_hash_noput;
1464 	req = hp->md5_req;
1465 
1466 	if (crypto_ahash_init(req))
1467 		goto clear_hash;
1468 
1469 	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1470 		goto clear_hash;
1471 	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1472 		goto clear_hash;
1473 	if (tcp_md5_hash_key(hp, key))
1474 		goto clear_hash;
1475 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1476 	if (crypto_ahash_final(req))
1477 		goto clear_hash;
1478 
1479 	tcp_put_md5sig_pool();
1480 	return 0;
1481 
1482 clear_hash:
1483 	tcp_put_md5sig_pool();
1484 clear_hash_noput:
1485 	memset(md5_hash, 0, 16);
1486 	return 1;
1487 }
1488 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1489 
1490 #endif
1491 
tcp_v4_init_req(struct request_sock * req,const struct sock * sk_listener,struct sk_buff * skb)1492 static void tcp_v4_init_req(struct request_sock *req,
1493 			    const struct sock *sk_listener,
1494 			    struct sk_buff *skb)
1495 {
1496 	struct inet_request_sock *ireq = inet_rsk(req);
1497 	struct net *net = sock_net(sk_listener);
1498 
1499 	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1500 	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1501 	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1502 }
1503 
tcp_v4_route_req(const struct sock * sk,struct sk_buff * skb,struct flowi * fl,struct request_sock * req)1504 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1505 					  struct sk_buff *skb,
1506 					  struct flowi *fl,
1507 					  struct request_sock *req)
1508 {
1509 	tcp_v4_init_req(req, sk, skb);
1510 
1511 	if (security_inet_conn_request(sk, skb, req))
1512 		return NULL;
1513 
1514 	return inet_csk_route_req(sk, &fl->u.ip4, req);
1515 }
1516 
1517 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1518 	.family		=	PF_INET,
1519 	.obj_size	=	sizeof(struct tcp_request_sock),
1520 	.rtx_syn_ack	=	tcp_rtx_synack,
1521 	.send_ack	=	tcp_v4_reqsk_send_ack,
1522 	.destructor	=	tcp_v4_reqsk_destructor,
1523 	.send_reset	=	tcp_v4_send_reset,
1524 	.syn_ack_timeout =	tcp_syn_ack_timeout,
1525 };
1526 
1527 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1528 	.mss_clamp	=	TCP_MSS_DEFAULT,
1529 #ifdef CONFIG_TCP_MD5SIG
1530 	.req_md5_lookup	=	tcp_v4_md5_lookup,
1531 	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1532 #endif
1533 #ifdef CONFIG_SYN_COOKIES
1534 	.cookie_init_seq =	cookie_v4_init_sequence,
1535 #endif
1536 	.route_req	=	tcp_v4_route_req,
1537 	.init_seq	=	tcp_v4_init_seq,
1538 	.init_ts_off	=	tcp_v4_init_ts_off,
1539 	.send_synack	=	tcp_v4_send_synack,
1540 };
1541 
tcp_v4_conn_request(struct sock * sk,struct sk_buff * skb)1542 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1543 {
1544 	/* Never answer to SYNs send to broadcast or multicast */
1545 	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1546 		goto drop;
1547 
1548 	return tcp_conn_request(&tcp_request_sock_ops,
1549 				&tcp_request_sock_ipv4_ops, sk, skb);
1550 
1551 drop:
1552 	tcp_listendrop(sk);
1553 	return 0;
1554 }
1555 EXPORT_SYMBOL(tcp_v4_conn_request);
1556 
1557 
1558 /*
1559  * The three way handshake has completed - we got a valid synack -
1560  * now create the new socket.
1561  */
tcp_v4_syn_recv_sock(const struct sock * sk,struct sk_buff * skb,struct request_sock * req,struct dst_entry * dst,struct request_sock * req_unhash,bool * own_req)1562 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1563 				  struct request_sock *req,
1564 				  struct dst_entry *dst,
1565 				  struct request_sock *req_unhash,
1566 				  bool *own_req)
1567 {
1568 	struct inet_request_sock *ireq;
1569 	bool found_dup_sk = false;
1570 	struct inet_sock *newinet;
1571 	struct tcp_sock *newtp;
1572 	struct sock *newsk;
1573 #ifdef CONFIG_TCP_MD5SIG
1574 	const union tcp_md5_addr *addr;
1575 	struct tcp_md5sig_key *key;
1576 	int l3index;
1577 #endif
1578 	struct ip_options_rcu *inet_opt;
1579 
1580 	if (sk_acceptq_is_full(sk))
1581 		goto exit_overflow;
1582 
1583 	newsk = tcp_create_openreq_child(sk, req, skb);
1584 	if (!newsk)
1585 		goto exit_nonewsk;
1586 
1587 	newsk->sk_gso_type = SKB_GSO_TCPV4;
1588 	inet_sk_rx_dst_set(newsk, skb);
1589 
1590 	newtp		      = tcp_sk(newsk);
1591 	newinet		      = inet_sk(newsk);
1592 	ireq		      = inet_rsk(req);
1593 	sk_daddr_set(newsk, ireq->ir_rmt_addr);
1594 	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1595 	newsk->sk_bound_dev_if = ireq->ir_iif;
1596 	newinet->inet_saddr   = ireq->ir_loc_addr;
1597 	inet_opt	      = rcu_dereference(ireq->ireq_opt);
1598 	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1599 	newinet->mc_index     = inet_iif(skb);
1600 	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1601 	newinet->rcv_tos      = ip_hdr(skb)->tos;
1602 	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1603 	if (inet_opt)
1604 		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1605 	atomic_set(&newinet->inet_id, get_random_u16());
1606 
1607 	/* Set ToS of the new socket based upon the value of incoming SYN.
1608 	 * ECT bits are set later in tcp_init_transfer().
1609 	 */
1610 	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1611 		newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1612 
1613 	if (!dst) {
1614 		dst = inet_csk_route_child_sock(sk, newsk, req);
1615 		if (!dst)
1616 			goto put_and_exit;
1617 	} else {
1618 		/* syncookie case : see end of cookie_v4_check() */
1619 	}
1620 	sk_setup_caps(newsk, dst);
1621 
1622 	tcp_ca_openreq_child(newsk, dst);
1623 
1624 	tcp_sync_mss(newsk, dst_mtu(dst));
1625 	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1626 
1627 	tcp_initialize_rcv_mss(newsk);
1628 
1629 #ifdef CONFIG_TCP_MD5SIG
1630 	l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1631 	/* Copy over the MD5 key from the original socket */
1632 	addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1633 	key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1634 	if (key) {
1635 		if (tcp_md5_key_copy(newsk, addr, AF_INET, 32, l3index, key))
1636 			goto put_and_exit;
1637 		sk_gso_disable(newsk);
1638 	}
1639 #endif
1640 
1641 	if (__inet_inherit_port(sk, newsk) < 0)
1642 		goto put_and_exit;
1643 	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1644 				       &found_dup_sk);
1645 	if (likely(*own_req)) {
1646 		tcp_move_syn(newtp, req);
1647 		ireq->ireq_opt = NULL;
1648 	} else {
1649 		newinet->inet_opt = NULL;
1650 
1651 		if (!req_unhash && found_dup_sk) {
1652 			/* This code path should only be executed in the
1653 			 * syncookie case only
1654 			 */
1655 			bh_unlock_sock(newsk);
1656 			sock_put(newsk);
1657 			newsk = NULL;
1658 		}
1659 	}
1660 	return newsk;
1661 
1662 exit_overflow:
1663 	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1664 exit_nonewsk:
1665 	dst_release(dst);
1666 exit:
1667 	tcp_listendrop(sk);
1668 	return NULL;
1669 put_and_exit:
1670 	newinet->inet_opt = NULL;
1671 	inet_csk_prepare_forced_close(newsk);
1672 	tcp_done(newsk);
1673 	goto exit;
1674 }
1675 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1676 
/* With CONFIG_SYN_COOKIES, pass segments without the SYN flag to
 * cookie_v4_check() and return its result; in every other case return
 * @sk unchanged.
 */
static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
{
#ifdef CONFIG_SYN_COOKIES
	if (!tcp_hdr(skb)->syn)
		return cookie_v4_check(sk, skb);
#endif
	return sk;
}
1687 
tcp_v4_get_syncookie(struct sock * sk,struct iphdr * iph,struct tcphdr * th,u32 * cookie)1688 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1689 			 struct tcphdr *th, u32 *cookie)
1690 {
1691 	u16 mss = 0;
1692 #ifdef CONFIG_SYN_COOKIES
1693 	mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1694 				    &tcp_request_sock_ipv4_ops, sk, th);
1695 	if (mss) {
1696 		*cookie = __cookie_v4_init_sequence(iph, th, &mss);
1697 		tcp_synq_overflow(sk);
1698 	}
1699 #endif
1700 	return mss;
1701 }
1702 
/* Declares ipv4_dst_check() so that tcp_v4_do_rcv() can name it as the
 * expected target of its INDIRECT_CALL_1() on dst->ops->check.
 */
INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
							   u32));
1705 /* The socket must have it's spinlock held when we get
1706  * here, unless it is a TCP_LISTEN socket.
1707  *
1708  * We have a potential double-lock case here, so even when
1709  * doing backlog processing we use the BH locking scheme.
1710  * This is because we cannot sleep with the original spinlock
1711  * held.
1712  */
tcp_v4_do_rcv(struct sock * sk,struct sk_buff * skb)1713 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1714 {
1715 	enum skb_drop_reason reason;
1716 	struct sock *rsk;
1717 
1718 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1719 		struct dst_entry *dst;
1720 
1721 		dst = rcu_dereference_protected(sk->sk_rx_dst,
1722 						lockdep_sock_is_held(sk));
1723 
1724 		sock_rps_save_rxhash(sk, skb);
1725 		sk_mark_napi_id(sk, skb);
1726 		if (dst) {
1727 			if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
1728 			    !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
1729 					     dst, 0)) {
1730 				RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
1731 				dst_release(dst);
1732 			}
1733 		}
1734 		tcp_rcv_established(sk, skb);
1735 		return 0;
1736 	}
1737 
1738 	reason = SKB_DROP_REASON_NOT_SPECIFIED;
1739 	if (tcp_checksum_complete(skb))
1740 		goto csum_err;
1741 
1742 	if (sk->sk_state == TCP_LISTEN) {
1743 		struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1744 
1745 		if (!nsk)
1746 			goto discard;
1747 		if (nsk != sk) {
1748 			if (tcp_child_process(sk, nsk, skb)) {
1749 				rsk = nsk;
1750 				goto reset;
1751 			}
1752 			return 0;
1753 		}
1754 	} else
1755 		sock_rps_save_rxhash(sk, skb);
1756 
1757 	if (tcp_rcv_state_process(sk, skb)) {
1758 		rsk = sk;
1759 		goto reset;
1760 	}
1761 	return 0;
1762 
1763 reset:
1764 	tcp_v4_send_reset(rsk, skb);
1765 discard:
1766 	kfree_skb_reason(skb, reason);
1767 	/* Be careful here. If this function gets more complicated and
1768 	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1769 	 * might be destroyed here. This current version compiles correctly,
1770 	 * but you have been warned.
1771 	 */
1772 	return 0;
1773 
1774 csum_err:
1775 	reason = SKB_DROP_REASON_TCP_CSUM;
1776 	trace_tcp_bad_csum(skb);
1777 	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1778 	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1779 	goto discard;
1780 }
1781 EXPORT_SYMBOL(tcp_v4_do_rcv);
1782 
tcp_v4_early_demux(struct sk_buff * skb)1783 int tcp_v4_early_demux(struct sk_buff *skb)
1784 {
1785 	struct net *net = dev_net(skb->dev);
1786 	const struct iphdr *iph;
1787 	const struct tcphdr *th;
1788 	struct sock *sk;
1789 
1790 	if (skb->pkt_type != PACKET_HOST)
1791 		return 0;
1792 
1793 	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1794 		return 0;
1795 
1796 	iph = ip_hdr(skb);
1797 	th = tcp_hdr(skb);
1798 
1799 	if (th->doff < sizeof(struct tcphdr) / 4)
1800 		return 0;
1801 
1802 	sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
1803 				       iph->saddr, th->source,
1804 				       iph->daddr, ntohs(th->dest),
1805 				       skb->skb_iif, inet_sdif(skb));
1806 	if (sk) {
1807 		skb->sk = sk;
1808 		skb->destructor = sock_edemux;
1809 		if (sk_fullsock(sk)) {
1810 			struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
1811 
1812 			if (dst)
1813 				dst = dst_check(dst, 0);
1814 			if (dst &&
1815 			    sk->sk_rx_dst_ifindex == skb->skb_iif)
1816 				skb_dst_set_noref(skb, dst);
1817 		}
1818 	}
1819 	return 0;
1820 }
1821 
tcp_add_backlog(struct sock * sk,struct sk_buff * skb,enum skb_drop_reason * reason)1822 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
1823 		     enum skb_drop_reason *reason)
1824 {
1825 	u32 tail_gso_size, tail_gso_segs;
1826 	struct skb_shared_info *shinfo;
1827 	const struct tcphdr *th;
1828 	struct tcphdr *thtail;
1829 	struct sk_buff *tail;
1830 	unsigned int hdrlen;
1831 	bool fragstolen;
1832 	u32 gso_segs;
1833 	u32 gso_size;
1834 	u64 limit;
1835 	int delta;
1836 
1837 	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1838 	 * we can fix skb->truesize to its real value to avoid future drops.
1839 	 * This is valid because skb is not yet charged to the socket.
1840 	 * It has been noticed pure SACK packets were sometimes dropped
1841 	 * (if cooked by drivers without copybreak feature).
1842 	 */
1843 	skb_condense(skb);
1844 
1845 	skb_dst_drop(skb);
1846 
1847 	if (unlikely(tcp_checksum_complete(skb))) {
1848 		bh_unlock_sock(sk);
1849 		trace_tcp_bad_csum(skb);
1850 		*reason = SKB_DROP_REASON_TCP_CSUM;
1851 		__TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1852 		__TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1853 		return true;
1854 	}
1855 
1856 	/* Attempt coalescing to last skb in backlog, even if we are
1857 	 * above the limits.
1858 	 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1859 	 */
1860 	th = (const struct tcphdr *)skb->data;
1861 	hdrlen = th->doff * 4;
1862 
1863 	tail = sk->sk_backlog.tail;
1864 	if (!tail)
1865 		goto no_coalesce;
1866 	thtail = (struct tcphdr *)tail->data;
1867 
1868 	if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1869 	    TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1870 	    ((TCP_SKB_CB(tail)->tcp_flags |
1871 	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1872 	    !((TCP_SKB_CB(tail)->tcp_flags &
1873 	      TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1874 	    ((TCP_SKB_CB(tail)->tcp_flags ^
1875 	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1876 #ifdef CONFIG_TLS_DEVICE
1877 	    tail->decrypted != skb->decrypted ||
1878 #endif
1879 	    !mptcp_skb_can_collapse(tail, skb) ||
1880 	    thtail->doff != th->doff ||
1881 	    memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1882 		goto no_coalesce;
1883 
1884 	__skb_pull(skb, hdrlen);
1885 
1886 	shinfo = skb_shinfo(skb);
1887 	gso_size = shinfo->gso_size ?: skb->len;
1888 	gso_segs = shinfo->gso_segs ?: 1;
1889 
1890 	shinfo = skb_shinfo(tail);
1891 	tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
1892 	tail_gso_segs = shinfo->gso_segs ?: 1;
1893 
1894 	if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1895 		TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1896 
1897 		if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
1898 			TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1899 			thtail->window = th->window;
1900 		}
1901 
1902 		/* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1903 		 * thtail->fin, so that the fast path in tcp_rcv_established()
1904 		 * is not entered if we append a packet with a FIN.
1905 		 * SYN, RST, URG are not present.
1906 		 * ACK is set on both packets.
1907 		 * PSH : we do not really care in TCP stack,
1908 		 *       at least for 'GRO' packets.
1909 		 */
1910 		thtail->fin |= th->fin;
1911 		TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1912 
1913 		if (TCP_SKB_CB(skb)->has_rxtstamp) {
1914 			TCP_SKB_CB(tail)->has_rxtstamp = true;
1915 			tail->tstamp = skb->tstamp;
1916 			skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1917 		}
1918 
1919 		/* Not as strict as GRO. We only need to carry mss max value */
1920 		shinfo->gso_size = max(gso_size, tail_gso_size);
1921 		shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
1922 
1923 		sk->sk_backlog.len += delta;
1924 		__NET_INC_STATS(sock_net(sk),
1925 				LINUX_MIB_TCPBACKLOGCOALESCE);
1926 		kfree_skb_partial(skb, fragstolen);
1927 		return false;
1928 	}
1929 	__skb_push(skb, hdrlen);
1930 
1931 no_coalesce:
1932 	/* sk->sk_backlog.len is reset only at the end of __release_sock().
1933 	 * Both sk->sk_backlog.len and sk->sk_rmem_alloc could reach
1934 	 * sk_rcvbuf in normal conditions.
1935 	 */
1936 	limit = ((u64)READ_ONCE(sk->sk_rcvbuf)) << 1;
1937 
1938 	limit += ((u32)READ_ONCE(sk->sk_sndbuf)) >> 1;
1939 
1940 	/* Only socket owner can try to collapse/prune rx queues
1941 	 * to reduce memory overhead, so add a little headroom here.
1942 	 * Few sockets backlog are possibly concurrently non empty.
1943 	 */
1944 	limit += 64 * 1024;
1945 
1946 	limit = min_t(u64, limit, UINT_MAX);
1947 
1948 	if (unlikely(sk_add_backlog(sk, skb, limit))) {
1949 		bh_unlock_sock(sk);
1950 		*reason = SKB_DROP_REASON_SOCKET_BACKLOG;
1951 		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1952 		return true;
1953 	}
1954 	return false;
1955 }
1956 EXPORT_SYMBOL(tcp_add_backlog);
1957 
tcp_filter(struct sock * sk,struct sk_buff * skb)1958 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1959 {
1960 	struct tcphdr *th = (struct tcphdr *)skb->data;
1961 
1962 	return sk_filter_trim_cap(sk, skb, th->doff * 4);
1963 }
1964 EXPORT_SYMBOL(tcp_filter);
1965 
tcp_v4_restore_cb(struct sk_buff * skb)1966 static void tcp_v4_restore_cb(struct sk_buff *skb)
1967 {
1968 	memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1969 		sizeof(struct inet_skb_parm));
1970 }
1971 
/* Populate TCP_SKB_CB(skb) from the IPv4 header @iph and TCP header @th.
 *
 * The IPv4 control block is first moved from IPCB(skb) into
 * TCP_SKB_CB(skb)->header.h4; tcp_v4_restore_cb() undoes that move.
 * The remaining fields are then derived from the headers:
 *  - seq / ack_seq are converted to host byte order,
 *  - end_seq is seq plus the SYN and FIN flags plus the payload length
 *    (skb->len minus the TCP header length),
 *  - has_rxtstamp is set when either the software or the hardware
 *    timestamp of @skb is non-zero.
 */
static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
			   const struct tcphdr *th)
{
	/* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
	 * barrier() makes sure compiler won't play fool^Waliasing games.
	 */
	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
		sizeof(struct inet_skb_parm));
	barrier();

	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
				    skb->len - th->doff * 4);
	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
	TCP_SKB_CB(skb)->sacked	 = 0;
	TCP_SKB_CB(skb)->has_rxtstamp =
			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
}
1993 
1994 /*
1995  *	From tcp_input.c
1996  */
1997 
/* Receive one IPv4 TCP segment.
 *
 * Steps, in order:
 *  1. Basic validation: packet type, TCP header length, checksum setup.
 *  2. Socket lookup; a miss goes to no_tcp_socket.
 *  3. TIME_WAIT sockets are handled at do_time_wait, request sockets
 *     (TCP_NEW_SYN_RECV) in the block right after the "process" label.
 *  4. Per-socket checks: min TTL, xfrm policy, MD5 signature, socket filter.
 *  5. Delivery: listeners and unowned sockets go through tcp_v4_do_rcv();
 *     sockets currently owned by user context get the skb on their backlog.
 *
 * @refcounted records whether a reference on @sk must be released before
 * returning.
 *
 * Returns the value of tcp_v4_do_rcv() on the delivery paths, and 0 when
 * the segment was backlogged, consumed by a child socket, or dropped.
 */
int tcp_v4_rcv(struct sk_buff *skb)
{
	struct net *net = dev_net(skb->dev);
	enum skb_drop_reason drop_reason;
	int sdif = inet_sdif(skb);
	int dif = inet_iif(skb);
	const struct iphdr *iph;
	const struct tcphdr *th;
	bool refcounted;
	struct sock *sk;
	int ret;

	drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
	/* Only packets addressed to this host are processed. */
	if (skb->pkt_type != PACKET_HOST)
		goto discard_it;

	/* Count it even if it's bad */
	__TCP_INC_STATS(net, TCP_MIB_INSEGS);

	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
		goto discard_it;

	th = (const struct tcphdr *)skb->data;

	/* doff is in 32-bit words; it must cover at least struct tcphdr. */
	if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
		drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
		goto bad_packet;
	}
	if (!pskb_may_pull(skb, th->doff * 4))
		goto discard_it;

	/* An explanation is required here, I think.
	 * Packet length and doff are validated by header prediction,
	 * provided case of th->doff==0 is eliminated.
	 * So, we defer the checks. */

	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
		goto csum_error;

	/* Header pointers are re-read here, after the pull calls above. */
	th = (const struct tcphdr *)skb->data;
	iph = ip_hdr(skb);
lookup:
	sk = __inet_lookup_skb(net->ipv4.tcp_death_row.hashinfo,
			       skb, __tcp_hdrlen(th), th->source,
			       th->dest, sdif, &refcounted);
	if (!sk)
		goto no_tcp_socket;

process:
	if (sk->sk_state == TCP_TIME_WAIT)
		goto do_time_wait;

	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		/* The lookup returned a request sock; work on its listener. */
		struct request_sock *req = inet_reqsk(sk);
		bool req_stolen = false;
		struct sock *nsk;

		sk = req->rsk_listener;
		if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
			drop_reason = SKB_DROP_REASON_XFRM_POLICY;
		else
			drop_reason = tcp_inbound_md5_hash(sk, skb,
						   &iph->saddr, &iph->daddr,
						   AF_INET, dif, sdif);
		if (unlikely(drop_reason)) {
			sk_drops_add(sk, skb);
			reqsk_put(req);
			goto discard_it;
		}
		if (tcp_checksum_complete(skb)) {
			reqsk_put(req);
			goto csum_error;
		}
		if (unlikely(sk->sk_state != TCP_LISTEN)) {
			/* The listener is no longer listening: try to migrate
			 * the request to another socket, else drop the request
			 * and redo the lookup.
			 */
			nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
			if (!nsk) {
				inet_csk_reqsk_queue_drop_and_put(sk, req);
				goto lookup;
			}
			sk = nsk;
			/* reuseport_migrate_sock() has already held one sk_refcnt
			 * before returning.
			 */
		} else {
			/* We own a reference on the listener, increase it again
			 * as we might lose it too soon.
			 */
			sock_hold(sk);
		}
		refcounted = true;
		nsk = NULL;
		if (!tcp_filter(sk, skb)) {
			th = (const struct tcphdr *)skb->data;
			iph = ip_hdr(skb);
			tcp_v4_fill_cb(skb, iph, th);
			nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
		} else {
			drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
		}
		if (!nsk) {
			reqsk_put(req);
			if (req_stolen) {
				/* Another cpu got exclusive access to req
				 * and created a full blown socket.
				 * Try to feed this packet to this socket
				 * instead of discarding it.
				 */
				tcp_v4_restore_cb(skb);
				sock_put(sk);
				goto lookup;
			}
			goto discard_and_relse;
		}
		nf_reset_ct(skb);
		if (nsk == sk) {
			/* tcp_check_req() handed back the listener itself:
			 * undo the cb move and continue with the checks below.
			 */
			reqsk_put(req);
			tcp_v4_restore_cb(skb);
		} else if (tcp_child_process(sk, nsk, skb)) {
			tcp_v4_send_reset(nsk, skb);
			goto discard_and_relse;
		} else {
			sock_put(sk);
			return 0;
		}
	}

	if (static_branch_unlikely(&ip4_min_ttl)) {
		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
			drop_reason = SKB_DROP_REASON_TCP_MINTTL;
			goto discard_and_relse;
		}
	}

	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
		drop_reason = SKB_DROP_REASON_XFRM_POLICY;
		goto discard_and_relse;
	}

	drop_reason = tcp_inbound_md5_hash(sk, skb, &iph->saddr,
					   &iph->daddr, AF_INET, dif, sdif);
	if (drop_reason)
		goto discard_and_relse;

	nf_reset_ct(skb);

	if (tcp_filter(sk, skb)) {
		drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
		goto discard_and_relse;
	}
	/* Header pointers are re-read after the filter ran on the skb. */
	th = (const struct tcphdr *)skb->data;
	iph = ip_hdr(skb);
	tcp_v4_fill_cb(skb, iph, th);

	skb->dev = NULL;

	/* Listeners are processed without taking the socket lock here. */
	if (sk->sk_state == TCP_LISTEN) {
		ret = tcp_v4_do_rcv(sk, skb);
		goto put_and_return;
	}

	sk_incoming_cpu_update(sk);

	bh_lock_sock_nested(sk);
	tcp_segs_in(tcp_sk(sk), skb);
	ret = 0;
	if (!sock_owned_by_user(sk)) {
		ret = tcp_v4_do_rcv(sk, skb);
	} else {
		/* When tcp_add_backlog() returns true it has already
		 * released the socket lock and set drop_reason.
		 */
		if (tcp_add_backlog(sk, skb, &drop_reason))
			goto discard_and_relse;
	}
	bh_unlock_sock(sk);

put_and_return:
	if (refcounted)
		sock_put(sk);

	return ret;

no_tcp_socket:
	drop_reason = SKB_DROP_REASON_NO_SOCKET;
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
		goto discard_it;

	tcp_v4_fill_cb(skb, iph, th);

	/* No socket: count a checksum error, or answer with a reset. */
	if (tcp_checksum_complete(skb)) {
csum_error:
		drop_reason = SKB_DROP_REASON_TCP_CSUM;
		trace_tcp_bad_csum(skb);
		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
bad_packet:
		__TCP_INC_STATS(net, TCP_MIB_INERRS);
	} else {
		tcp_v4_send_reset(NULL, skb);
	}

discard_it:
	SKB_DR_OR(drop_reason, NOT_SPECIFIED);
	/* Discard frame. */
	kfree_skb_reason(skb, drop_reason);
	return 0;

	/* Drop path for when a socket is attached: account the drop on the
	 * socket and release the reference if one is held.
	 */
discard_and_relse:
	sk_drops_add(sk, skb);
	if (refcounted)
		sock_put(sk);
	goto discard_it;

do_time_wait:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
		drop_reason = SKB_DROP_REASON_XFRM_POLICY;
		inet_twsk_put(inet_twsk(sk));
		goto discard_it;
	}

	tcp_v4_fill_cb(skb, iph, th);

	if (tcp_checksum_complete(skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto csum_error;
	}
	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
	case TCP_TW_SYN: {
		/* Look for a listener that can take this SYN; if one exists,
		 * retire the timewait socket and restart at "process" with it.
		 */
		struct sock *sk2 = inet_lookup_listener(net,
							net->ipv4.tcp_death_row.hashinfo,
							skb, __tcp_hdrlen(th),
							iph->saddr, th->source,
							iph->daddr, th->dest,
							inet_iif(skb),
							sdif);
		if (sk2) {
			inet_twsk_deschedule_put(inet_twsk(sk));
			sk = sk2;
			tcp_v4_restore_cb(skb);
			refcounted = false;
			goto process;
		}
	}
		/* to ACK */
		fallthrough;
	case TCP_TW_ACK:
		tcp_v4_timewait_ack(sk, skb);
		break;
	case TCP_TW_RST:
		tcp_v4_send_reset(sk, skb);
		inet_twsk_deschedule_put(inet_twsk(sk));
		goto discard_it;
	case TCP_TW_SUCCESS:;
	}
	goto discard_it;
}
2252 
/* Time-wait socket operations for TCP: the object size of a TCP timewait
 * socket plus its uniqueness-check and destructor callbacks.
 */
static struct timewait_sock_ops tcp_timewait_sock_ops = {
	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
	.twsk_unique	= tcp_twsk_unique,
	.twsk_destructor= tcp_twsk_destructor,
};
2258 
inet_sk_rx_dst_set(struct sock * sk,const struct sk_buff * skb)2259 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2260 {
2261 	struct dst_entry *dst = skb_dst(skb);
2262 
2263 	if (dst && dst_hold_safe(dst)) {
2264 		rcu_assign_pointer(sk->sk_rx_dst, dst);
2265 		sk->sk_rx_dst_ifindex = skb->skb_iif;
2266 	}
2267 }
2268 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2269 
/* IPv4 address-family operations for TCP connection sockets.
 * tcp_v4_init_sock() installs this table as icsk->icsk_af_ops.
 */
const struct inet_connection_sock_af_ops ipv4_specific = {
	.queue_xmit	   = ip_queue_xmit,
	.send_check	   = tcp_v4_send_check,
	.rebuild_header	   = inet_sk_rebuild_header,
	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
	.conn_request	   = tcp_v4_conn_request,
	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
	.net_header_len	   = sizeof(struct iphdr),
	.setsockopt	   = ip_setsockopt,
	.getsockopt	   = ip_getsockopt,
	.addr2sockaddr	   = inet_csk_addr2sockaddr,
	.sockaddr_len	   = sizeof(struct sockaddr_in),
	.mtu_reduced	   = tcp_v4_mtu_reduced,
};
EXPORT_SYMBOL(ipv4_specific);
2285 
2286 #ifdef CONFIG_TCP_MD5SIG
/* TCP-MD5 signature hooks for IPv4 sockets (key lookup, hash computation
 * and key parsing).  tcp_v4_init_sock() installs this table as
 * tcp_sk(sk)->af_specific.
 */
static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
	.md5_lookup		= tcp_v4_md5_lookup,
	.calc_md5_hash		= tcp_v4_md5_hash_skb,
	.md5_parse		= tcp_v4_parse_md5_keys,
};
2292 #endif
2293 
2294 /* NOTE: A lot of things set to zero explicitly by call to
2295  *       sk_alloc() so need not be done here.
2296  */
tcp_v4_init_sock(struct sock * sk)2297 static int tcp_v4_init_sock(struct sock *sk)
2298 {
2299 	struct inet_connection_sock *icsk = inet_csk(sk);
2300 
2301 	tcp_init_sock(sk);
2302 
2303 	icsk->icsk_af_ops = &ipv4_specific;
2304 
2305 #ifdef CONFIG_TCP_MD5SIG
2306 	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2307 #endif
2308 
2309 	return 0;
2310 }
2311 
/* Release the TCP-specific resources attached to @sk: timers, congestion
 * control and ULP state, the write and out-of-order queues, MD5 keys,
 * the bind bucket and fastopen state.  Finally the allocated-sockets
 * accounting is decremented.
 *
 * A fastopen request sock still attached at this point is a BUG.
 */
void tcp_v4_destroy_sock(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	trace_tcp_destroy_sock(sk);

	tcp_clear_xmit_timers(sk);

	tcp_cleanup_congestion_control(sk);

	tcp_cleanup_ulp(sk);

	/* Clean up the write buffer. */
	tcp_write_queue_purge(sk);

	/* Check if we want to disable active TFO */
	tcp_fastopen_active_disable_ofo_check(sk);

	/* Cleans up our, hopefully empty, out_of_order_queue. */
	skb_rbtree_purge(&tp->out_of_order_queue);

#ifdef CONFIG_TCP_MD5SIG
	/* Clean up the MD5 key list, if any */
	if (tp->md5sig_info) {
		tcp_clear_md5_list(sk);
		/* The info block itself is freed through kfree_rcu(). */
		kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
		tp->md5sig_info = NULL;
		static_branch_slow_dec_deferred(&tcp_md5_needed);
	}
#endif

	/* Clean up a referenced TCP bind bucket. */
	if (inet_csk(sk)->icsk_bind_hash)
		inet_put_port(sk);

	BUG_ON(rcu_access_pointer(tp->fastopen_rsk));

	/* If socket is aborted during connect operation */
	tcp_free_fastopen_req(tp);
	tcp_fastopen_destroy_cipher(sk);
	tcp_saved_syn_free(tp);

	sk_sockets_allocated_dec(sk);
}
EXPORT_SYMBOL(tcp_v4_destroy_sock);
2357 
2358 #ifdef CONFIG_PROC_FS
2359 /* Proc filesystem TCP sock list dumping. */
2360 
2361 static unsigned short seq_file_family(const struct seq_file *seq);
2362 
seq_sk_match(struct seq_file * seq,const struct sock * sk)2363 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2364 {
2365 	unsigned short family = seq_file_family(seq);
2366 
2367 	/* AF_UNSPEC is used as a match all */
2368 	return ((family == AF_UNSPEC || family == sk->sk_family) &&
2369 		net_eq(sock_net(sk), seq_file_net(seq)));
2370 }
2371 
2372 /* Find a non empty bucket (starting from st->bucket)
2373  * and return the first sk from it.
2374  */
listening_get_first(struct seq_file * seq)2375 static void *listening_get_first(struct seq_file *seq)
2376 {
2377 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2378 	struct tcp_iter_state *st = seq->private;
2379 
2380 	st->offset = 0;
2381 	for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) {
2382 		struct inet_listen_hashbucket *ilb2;
2383 		struct hlist_nulls_node *node;
2384 		struct sock *sk;
2385 
2386 		ilb2 = &hinfo->lhash2[st->bucket];
2387 		if (hlist_nulls_empty(&ilb2->nulls_head))
2388 			continue;
2389 
2390 		spin_lock(&ilb2->lock);
2391 		sk_nulls_for_each(sk, node, &ilb2->nulls_head) {
2392 			if (seq_sk_match(seq, sk))
2393 				return sk;
2394 		}
2395 		spin_unlock(&ilb2->lock);
2396 	}
2397 
2398 	return NULL;
2399 }
2400 
2401 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
2402  * If "cur" is the last one in the st->bucket,
2403  * call listening_get_first() to return the first sk of the next
2404  * non empty bucket.
2405  */
listening_get_next(struct seq_file * seq,void * cur)2406 static void *listening_get_next(struct seq_file *seq, void *cur)
2407 {
2408 	struct tcp_iter_state *st = seq->private;
2409 	struct inet_listen_hashbucket *ilb2;
2410 	struct hlist_nulls_node *node;
2411 	struct inet_hashinfo *hinfo;
2412 	struct sock *sk = cur;
2413 
2414 	++st->num;
2415 	++st->offset;
2416 
2417 	sk = sk_nulls_next(sk);
2418 	sk_nulls_for_each_from(sk, node) {
2419 		if (seq_sk_match(seq, sk))
2420 			return sk;
2421 	}
2422 
2423 	hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2424 	ilb2 = &hinfo->lhash2[st->bucket];
2425 	spin_unlock(&ilb2->lock);
2426 	++st->bucket;
2427 	return listening_get_first(seq);
2428 }
2429 
/* Return the listening socket at index *pos, counting from the start of
 * the listening hash.  *pos is decremented once per socket skipped, so
 * on a NULL return it holds the number of positions still to consume.
 */
static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *sk;

	st->bucket = 0;
	st->offset = 0;

	for (sk = listening_get_first(seq); sk && *pos; --*pos)
		sk = listening_get_next(seq, sk);

	return sk;
}
2445 
empty_bucket(struct inet_hashinfo * hinfo,const struct tcp_iter_state * st)2446 static inline bool empty_bucket(struct inet_hashinfo *hinfo,
2447 				const struct tcp_iter_state *st)
2448 {
2449 	return hlist_nulls_empty(&hinfo->ehash[st->bucket].chain);
2450 }
2451 
2452 /*
2453  * Get first established socket starting from bucket given in st->bucket.
2454  * If st->bucket is zero, the very first socket in the hash is returned.
2455  */
established_get_first(struct seq_file * seq)2456 static void *established_get_first(struct seq_file *seq)
2457 {
2458 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2459 	struct tcp_iter_state *st = seq->private;
2460 
2461 	st->offset = 0;
2462 	for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) {
2463 		struct sock *sk;
2464 		struct hlist_nulls_node *node;
2465 		spinlock_t *lock = inet_ehash_lockp(hinfo, st->bucket);
2466 
2467 		cond_resched();
2468 
2469 		/* Lockless fast path for the common case of empty buckets */
2470 		if (empty_bucket(hinfo, st))
2471 			continue;
2472 
2473 		spin_lock_bh(lock);
2474 		sk_nulls_for_each(sk, node, &hinfo->ehash[st->bucket].chain) {
2475 			if (seq_sk_match(seq, sk))
2476 				return sk;
2477 		}
2478 		spin_unlock_bh(lock);
2479 	}
2480 
2481 	return NULL;
2482 }
2483 
established_get_next(struct seq_file * seq,void * cur)2484 static void *established_get_next(struct seq_file *seq, void *cur)
2485 {
2486 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2487 	struct tcp_iter_state *st = seq->private;
2488 	struct hlist_nulls_node *node;
2489 	struct sock *sk = cur;
2490 
2491 	++st->num;
2492 	++st->offset;
2493 
2494 	sk = sk_nulls_next(sk);
2495 
2496 	sk_nulls_for_each_from(sk, node) {
2497 		if (seq_sk_match(seq, sk))
2498 			return sk;
2499 	}
2500 
2501 	spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2502 	++st->bucket;
2503 	return established_get_first(seq);
2504 }
2505 
/* Return the established-hash socket at index @pos, counting from the
 * first bucket, or NULL when fewer than @pos + 1 sockets match.
 */
static void *established_get_idx(struct seq_file *seq, loff_t pos)
{
	struct tcp_iter_state *st = seq->private;
	void *sk;

	st->bucket = 0;

	for (sk = established_get_first(seq); sk && pos; --pos)
		sk = established_get_next(seq, sk);

	return sk;
}
2520 
/* Return the socket at overall index @pos: listening sockets come
 * first, then the established hash.  st->state is set to the table the
 * search last looked at.
 */
static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
{
	struct tcp_iter_state *st = seq->private;
	void *sk;

	st->state = TCP_SEQ_STATE_LISTENING;
	sk = listening_get_idx(seq, &pos);
	if (sk)
		return sk;

	/* listening_get_idx() left the remaining distance in pos. */
	st->state = TCP_SEQ_STATE_ESTABLISHED;
	return established_get_idx(seq, pos);
}
2536 
/* Try to get back to the position recorded in the iterator state
 * (st->state, st->bucket, st->offset) without rescanning from the start.
 *
 * The walk starts at the first matching socket of st->bucket and steps
 * forward up to "offset" times, stopping early once the helpers have
 * moved on to a different bucket.  If the listening table is exhausted,
 * the search switches to the established table from bucket 0.
 *
 * st->num is restored on exit, undoing the increments made by the
 * *_get_next() helpers.
 *
 * Returns the socket found, or NULL when none is left.
 */
static void *tcp_seek_last_pos(struct seq_file *seq)
{
	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
	struct tcp_iter_state *st = seq->private;
	int bucket = st->bucket;
	int offset = st->offset;
	int orig_num = st->num;
	void *rc = NULL;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		if (st->bucket > hinfo->lhash2_mask)
			break;
		rc = listening_get_first(seq);
		while (offset-- && rc && bucket == st->bucket)
			rc = listening_get_next(seq, rc);
		if (rc)
			break;
		/* Listening table exhausted: continue in the established one. */
		st->bucket = 0;
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		fallthrough;
	case TCP_SEQ_STATE_ESTABLISHED:
		if (st->bucket > hinfo->ehash_mask)
			break;
		rc = established_get_first(seq);
		while (offset-- && rc && bucket == st->bucket)
			rc = established_get_next(seq, rc);
	}

	st->num = orig_num;

	return rc;
}
2570 
/* seq_file ->start() handler for the TCP socket tables.
 *
 * When *pos equals the position remembered from the previous call, the
 * iterator first tries to resume through tcp_seek_last_pos().  Otherwise,
 * or if that finds nothing, the state is reset and the walk restarts from
 * the beginning: position 0 yields SEQ_START_TOKEN, any other position is
 * looked up with tcp_get_idx().
 */
void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc = NULL;

	if (*pos && *pos == st->last_pos)
		rc = tcp_seek_last_pos(seq);

	if (!rc) {
		st->state = TCP_SEQ_STATE_LISTENING;
		st->num = 0;
		st->bucket = 0;
		st->offset = 0;
		if (*pos)
			rc = tcp_get_idx(seq, *pos - 1);
		else
			rc = SEQ_START_TOKEN;
	}

	st->last_pos = *pos;
	return rc;
}
EXPORT_SYMBOL(tcp_seq_start);
2593 
/* seq_file ->next() handler: return the entry that follows @v.
 *
 * After the start token comes the very first socket.  Once the listening
 * table is exhausted the walk switches to the established table.  *pos is
 * always advanced and remembered in st->last_pos.
 */
void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *next = NULL;

	if (v == SEQ_START_TOKEN) {
		next = tcp_get_idx(seq, 0);
	} else if (st->state == TCP_SEQ_STATE_LISTENING) {
		next = listening_get_next(seq, v);
		if (!next) {
			st->state = TCP_SEQ_STATE_ESTABLISHED;
			st->bucket = 0;
			st->offset = 0;
			next = established_get_first(seq);
		}
	} else if (st->state == TCP_SEQ_STATE_ESTABLISHED) {
		next = established_get_next(seq, v);
	}

	++*pos;
	st->last_pos = *pos;
	return next;
}
EXPORT_SYMBOL(tcp_seq_next);
2624 
tcp_seq_stop(struct seq_file * seq,void * v)2625 void tcp_seq_stop(struct seq_file *seq, void *v)
2626 {
2627 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2628 	struct tcp_iter_state *st = seq->private;
2629 
2630 	switch (st->state) {
2631 	case TCP_SEQ_STATE_LISTENING:
2632 		if (v != SEQ_START_TOKEN)
2633 			spin_unlock(&hinfo->lhash2[st->bucket].lock);
2634 		break;
2635 	case TCP_SEQ_STATE_ESTABLISHED:
2636 		if (v)
2637 			spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2638 		break;
2639 	}
2640 }
2641 EXPORT_SYMBOL(tcp_seq_stop);
2642 
get_openreq4(const struct request_sock * req,struct seq_file * f,int i)2643 static void get_openreq4(const struct request_sock *req,
2644 			 struct seq_file *f, int i)
2645 {
2646 	const struct inet_request_sock *ireq = inet_rsk(req);
2647 	long delta = req->rsk_timer.expires - jiffies;
2648 
2649 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2650 		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2651 		i,
2652 		ireq->ir_loc_addr,
2653 		ireq->ir_num,
2654 		ireq->ir_rmt_addr,
2655 		ntohs(ireq->ir_rmt_port),
2656 		TCP_SYN_RECV,
2657 		0, 0, /* could print option size, but that is af dependent. */
2658 		1,    /* timers active (only the expire timer) */
2659 		jiffies_delta_to_clock_t(delta),
2660 		req->num_timeout,
2661 		from_kuid_munged(seq_user_ns(f),
2662 				 sock_i_uid(req->rsk_listener)),
2663 		0,  /* non standard timer */
2664 		0, /* open_requests have no inode */
2665 		0,
2666 		req);
2667 }
2668 
/* Print the /proc line of full socket @sk as entry number @i.
 *
 * The timer column ("tr") is encoded as:
 *   1 - retransmit, reordering or loss-probe timer pending
 *   4 - zero-window probe timer pending
 *   2 - sk->sk_timer pending
 *   0 - none of the above
 *
 * The rx_queue column holds the accept backlog for listeners and
 * rcv_nxt - copied_seq (clamped at 0) for every other state.
 * The last column is the fastopen queue limit for listeners; otherwise
 * it is -1 while in initial slow start, else snd_ssthresh.
 */
static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
{
	int timer_active;
	unsigned long timer_expires;
	const struct tcp_sock *tp = tcp_sk(sk);
	const struct inet_connection_sock *icsk = inet_csk(sk);
	const struct inet_sock *inet = inet_sk(sk);
	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
	__be32 dest = inet->inet_daddr;
	__be32 src = inet->inet_rcv_saddr;
	__u16 destp = ntohs(inet->inet_dport);
	__u16 srcp = ntohs(inet->inet_sport);
	int rx_queue;
	int state;

	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
		timer_active	= 1;
		timer_expires	= icsk->icsk_timeout;
	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
		timer_active	= 4;
		timer_expires	= icsk->icsk_timeout;
	} else if (timer_pending(&sk->sk_timer)) {
		timer_active	= 2;
		timer_expires	= sk->sk_timer.expires;
	} else {
		/* No timer: use "now" so the printed delta is zero. */
		timer_active	= 0;
		timer_expires = jiffies;
	}

	state = inet_sk_state_load(sk);
	if (state == TCP_LISTEN)
		rx_queue = READ_ONCE(sk->sk_ack_backlog);
	else
		/* Because we don't lock the socket,
		 * we might find a transient negative value.
		 */
		rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
				      READ_ONCE(tp->copied_seq), 0);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
		i, src, srcp, dest, destp, state,
		READ_ONCE(tp->write_seq) - tp->snd_una,
		rx_queue,
		timer_active,
		jiffies_delta_to_clock_t(timer_expires - jiffies),
		icsk->icsk_retransmits,
		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
		icsk->icsk_probes_out,
		sock_i_ino(sk),
		refcount_read(&sk->sk_refcnt), sk,
		jiffies_to_clock_t(icsk->icsk_rto),
		jiffies_to_clock_t(icsk->icsk_ack.ato),
		(icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
		tcp_snd_cwnd(tp),
		state == TCP_LISTEN ?
		    fastopenq->max_qlen :
		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
}
2730 
get_timewait4_sock(const struct inet_timewait_sock * tw,struct seq_file * f,int i)2731 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2732 			       struct seq_file *f, int i)
2733 {
2734 	long delta = tw->tw_timer.expires - jiffies;
2735 	__be32 dest, src;
2736 	__u16 destp, srcp;
2737 
2738 	dest  = tw->tw_daddr;
2739 	src   = tw->tw_rcv_saddr;
2740 	destp = ntohs(tw->tw_dport);
2741 	srcp  = ntohs(tw->tw_sport);
2742 
2743 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2744 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2745 		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2746 		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2747 		refcount_read(&tw->tw_refcnt), tw);
2748 }
2749 
2750 #define TMPSZ 150
2751 
tcp4_seq_show(struct seq_file * seq,void * v)2752 static int tcp4_seq_show(struct seq_file *seq, void *v)
2753 {
2754 	struct tcp_iter_state *st;
2755 	struct sock *sk = v;
2756 
2757 	seq_setwidth(seq, TMPSZ - 1);
2758 	if (v == SEQ_START_TOKEN) {
2759 		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2760 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2761 			   "inode");
2762 		goto out;
2763 	}
2764 	st = seq->private;
2765 
2766 	if (sk->sk_state == TCP_TIME_WAIT)
2767 		get_timewait4_sock(v, seq, st->num);
2768 	else if (sk->sk_state == TCP_NEW_SYN_RECV)
2769 		get_openreq4(v, seq, st->num);
2770 	else
2771 		get_tcp4_sock(v, seq, st->num);
2772 out:
2773 	seq_pad(seq, '\n');
2774 	return 0;
2775 }
2776 
2777 #ifdef CONFIG_BPF_SYSCALL
/* Private state of the BPF TCP iterator: the regular /proc iterator
 * state plus a batch of sockets on which references are held.
 */
struct bpf_tcp_iter_state {
	struct tcp_iter_state state;	/* bucket/offset/num bookkeeping */
	unsigned int cur_sk;		/* index of the next batch entry to consume */
	unsigned int end_sk;		/* number of entries stored in batch[] */
	unsigned int max_sk;		/* capacity of batch[] */
	struct sock **batch;		/* sockets of the current bucket, each held */
	bool st_bucket_done;		/* batch[] captured every match of the bucket */
};
2786 
/* Context handed to the BPF program by tcp_prog_seq_show(). */
struct bpf_iter__tcp {
	__bpf_md_ptr(struct bpf_iter_meta *, meta);	/* iterator metadata */
	__bpf_md_ptr(struct sock_common *, sk_common);	/* socket being shown */
	uid_t uid __aligned(8);				/* uid passed by the caller */
};
2792 
/* Build the bpf_iter__tcp context for @sk_common / @uid and run @prog on
 * it.  meta->seq_num is decremented first so that the start token is not
 * counted.
 *
 * Returns the result of bpf_iter_run_prog().
 */
static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
			     struct sock_common *sk_common, uid_t uid)
{
	struct bpf_iter__tcp iter_ctx;

	meta->seq_num -= 1;	/* skip SEQ_START_TOKEN */

	iter_ctx.meta = meta;
	iter_ctx.sk_common = sk_common;
	iter_ctx.uid = uid;

	return bpf_iter_run_prog(prog, &iter_ctx);
}
2804 
bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state * iter)2805 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
2806 {
2807 	while (iter->cur_sk < iter->end_sk)
2808 		sock_gen_put(iter->batch[iter->cur_sk++]);
2809 }
2810 
bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state * iter,unsigned int new_batch_sz)2811 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
2812 				      unsigned int new_batch_sz)
2813 {
2814 	struct sock **new_batch;
2815 
2816 	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
2817 			     GFP_USER | __GFP_NOWARN);
2818 	if (!new_batch)
2819 		return -ENOMEM;
2820 
2821 	bpf_iter_tcp_put_batch(iter);
2822 	kvfree(iter->batch);
2823 	iter->batch = new_batch;
2824 	iter->max_sk = new_batch_sz;
2825 
2826 	return 0;
2827 }
2828 
bpf_iter_tcp_listening_batch(struct seq_file * seq,struct sock * start_sk)2829 static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
2830 						 struct sock *start_sk)
2831 {
2832 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2833 	struct bpf_tcp_iter_state *iter = seq->private;
2834 	struct tcp_iter_state *st = &iter->state;
2835 	struct hlist_nulls_node *node;
2836 	unsigned int expected = 1;
2837 	struct sock *sk;
2838 
2839 	sock_hold(start_sk);
2840 	iter->batch[iter->end_sk++] = start_sk;
2841 
2842 	sk = sk_nulls_next(start_sk);
2843 	sk_nulls_for_each_from(sk, node) {
2844 		if (seq_sk_match(seq, sk)) {
2845 			if (iter->end_sk < iter->max_sk) {
2846 				sock_hold(sk);
2847 				iter->batch[iter->end_sk++] = sk;
2848 			}
2849 			expected++;
2850 		}
2851 	}
2852 	spin_unlock(&hinfo->lhash2[st->bucket].lock);
2853 
2854 	return expected;
2855 }
2856 
bpf_iter_tcp_established_batch(struct seq_file * seq,struct sock * start_sk)2857 static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
2858 						   struct sock *start_sk)
2859 {
2860 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2861 	struct bpf_tcp_iter_state *iter = seq->private;
2862 	struct tcp_iter_state *st = &iter->state;
2863 	struct hlist_nulls_node *node;
2864 	unsigned int expected = 1;
2865 	struct sock *sk;
2866 
2867 	sock_hold(start_sk);
2868 	iter->batch[iter->end_sk++] = start_sk;
2869 
2870 	sk = sk_nulls_next(start_sk);
2871 	sk_nulls_for_each_from(sk, node) {
2872 		if (seq_sk_match(seq, sk)) {
2873 			if (iter->end_sk < iter->max_sk) {
2874 				sock_hold(sk);
2875 				iter->batch[iter->end_sk++] = sk;
2876 			}
2877 			expected++;
2878 		}
2879 	}
2880 	spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2881 
2882 	return expected;
2883 }
2884 
/* Collect the next batch of sockets for the BPF iterator.
 *
 * The batch is taken from the bucket found by tcp_seek_last_pos().  If
 * the batch array was too small to hold every matching socket of that
 * bucket, the array is grown once to 1.5 times the number of matches and
 * the bucket is batched again.  If growing fails, or the retry still does
 * not fit, the partial batch is returned and st_bucket_done stays false.
 *
 * Returns the first socket of the batch, or NULL when iteration is done.
 */
static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
{
	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
	struct bpf_tcp_iter_state *iter = seq->private;
	struct tcp_iter_state *st = &iter->state;
	unsigned int expected;
	bool resized = false;
	struct sock *sk;

	/* The st->bucket is done.  Directly advance to the next
	 * bucket instead of having the tcp_seek_last_pos() to skip
	 * one by one in the current bucket and eventually find out
	 * it has to advance to the next bucket.
	 */
	if (iter->st_bucket_done) {
		st->offset = 0;
		st->bucket++;
		if (st->state == TCP_SEQ_STATE_LISTENING &&
		    st->bucket > hinfo->lhash2_mask) {
			st->state = TCP_SEQ_STATE_ESTABLISHED;
			st->bucket = 0;
		}
	}

again:
	/* Get a new batch */
	iter->cur_sk = 0;
	iter->end_sk = 0;
	iter->st_bucket_done = false;

	sk = tcp_seek_last_pos(seq);
	if (!sk)
		return NULL; /* Done */

	/* Both helpers release the bucket lock taken by the seek above. */
	if (st->state == TCP_SEQ_STATE_LISTENING)
		expected = bpf_iter_tcp_listening_batch(seq, sk);
	else
		expected = bpf_iter_tcp_established_batch(seq, sk);

	/* Every matching socket of the bucket made it into the batch. */
	if (iter->end_sk == expected) {
		iter->st_bucket_done = true;
		return sk;
	}

	/* Batch array too small: grow it once and redo the bucket.  A
	 * successful realloc also drops the references of the old batch.
	 */
	if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) {
		resized = true;
		goto again;
	}

	return sk;
}
2936 
/* seq_file ->start(): return SEQ_START_TOKEN at position 0, otherwise
 * build the next batch and return its first socket (NULL when done).
 */
static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
{
	if (!*pos)
		return SEQ_START_TOKEN;

	/* bpf iter does not support lseek, so a non-zero position always
	 * means "continue from where the iterator was stop()-ped".
	 */
	return bpf_iter_tcp_batch(seq);
}
2947 
/* seq_file ->next(): release the socket that was just shown and return
 * the following one, refilling the batch when it has been used up.
 *
 * Returns the next socket, or NULL when bpf_iter_tcp_batch() finds none.
 */
static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct bpf_tcp_iter_state *iter = seq->private;
	struct tcp_iter_state *st = &iter->state;
	struct sock *next_sk;

	/* seq_next() is only called once seq_show() is finished with
	 * iter->cur_sk, so that entry can be dropped from the batch.
	 */
	if (iter->cur_sk < iter->end_sk) {
		struct sock *shown = iter->batch[iter->cur_sk];

		iter->cur_sk++;

		/* bpf_iter_tcp uses meta.seq_num rather than st->num; the
		 * counter is only kept consistent within tcp_iter_state.
		 */
		st->num++;

		/* Advance st->offset so that a later start() resumes at the
		 * right socket of st->bucket.  See tcp_seek_last_pos().
		 */
		st->offset++;

		sock_gen_put(shown);
	}

	if (iter->cur_sk < iter->end_sk)
		next_sk = iter->batch[iter->cur_sk];
	else
		next_sk = bpf_iter_tcp_batch(seq);

	*pos += 1;
	/* bpf iter never lseeks, so st->last_pos always equals *pos; it is
	 * maintained only to keep tcp_iter_state consistent.
	 */
	st->last_pos = *pos;

	return next_sk;
}
2984 
bpf_iter_tcp_seq_show(struct seq_file * seq,void * v)2985 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
2986 {
2987 	struct bpf_iter_meta meta;
2988 	struct bpf_prog *prog;
2989 	struct sock *sk = v;
2990 	uid_t uid;
2991 	int ret;
2992 
2993 	if (v == SEQ_START_TOKEN)
2994 		return 0;
2995 
2996 	if (sk_fullsock(sk))
2997 		lock_sock(sk);
2998 
2999 	if (unlikely(sk_unhashed(sk))) {
3000 		ret = SEQ_SKIP;
3001 		goto unlock;
3002 	}
3003 
3004 	if (sk->sk_state == TCP_TIME_WAIT) {
3005 		uid = 0;
3006 	} else if (sk->sk_state == TCP_NEW_SYN_RECV) {
3007 		const struct request_sock *req = v;
3008 
3009 		uid = from_kuid_munged(seq_user_ns(seq),
3010 				       sock_i_uid(req->rsk_listener));
3011 	} else {
3012 		uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
3013 	}
3014 
3015 	meta.seq = seq;
3016 	prog = bpf_iter_get_info(&meta, false);
3017 	ret = tcp_prog_seq_show(prog, &meta, v, uid);
3018 
3019 unlock:
3020 	if (sk_fullsock(sk))
3021 		release_sock(sk);
3022 	return ret;
3023 
3024 }
3025 
bpf_iter_tcp_seq_stop(struct seq_file * seq,void * v)3026 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
3027 {
3028 	struct bpf_tcp_iter_state *iter = seq->private;
3029 	struct bpf_iter_meta meta;
3030 	struct bpf_prog *prog;
3031 
3032 	if (!v) {
3033 		meta.seq = seq;
3034 		prog = bpf_iter_get_info(&meta, true);
3035 		if (prog)
3036 			(void)tcp_prog_seq_show(prog, &meta, v, 0);
3037 	}
3038 
3039 	if (iter->cur_sk < iter->end_sk) {
3040 		bpf_iter_tcp_put_batch(iter);
3041 		iter->st_bucket_done = false;
3042 	}
3043 }
3044 
3045 static const struct seq_operations bpf_iter_tcp_seq_ops = {
3046 	.show		= bpf_iter_tcp_seq_show,
3047 	.start		= bpf_iter_tcp_seq_start,
3048 	.next		= bpf_iter_tcp_seq_next,
3049 	.stop		= bpf_iter_tcp_seq_stop,
3050 };
3051 #endif
seq_file_family(const struct seq_file * seq)3052 static unsigned short seq_file_family(const struct seq_file *seq)
3053 {
3054 	const struct tcp_seq_afinfo *afinfo;
3055 
3056 #ifdef CONFIG_BPF_SYSCALL
3057 	/* Iterated from bpf_iter.  Let the bpf prog to filter instead. */
3058 	if (seq->op == &bpf_iter_tcp_seq_ops)
3059 		return AF_UNSPEC;
3060 #endif
3061 
3062 	/* Iterated from proc fs */
3063 	afinfo = pde_data(file_inode(seq->file));
3064 	return afinfo->family;
3065 }
3066 
3067 static const struct seq_operations tcp4_seq_ops = {
3068 	.show		= tcp4_seq_show,
3069 	.start		= tcp_seq_start,
3070 	.next		= tcp_seq_next,
3071 	.stop		= tcp_seq_stop,
3072 };
3073 
3074 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
3075 	.family		= AF_INET,
3076 };
3077 
tcp4_proc_init_net(struct net * net)3078 static int __net_init tcp4_proc_init_net(struct net *net)
3079 {
3080 	if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
3081 			sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
3082 		return -ENOMEM;
3083 	return 0;
3084 }
3085 
tcp4_proc_exit_net(struct net * net)3086 static void __net_exit tcp4_proc_exit_net(struct net *net)
3087 {
3088 	remove_proc_entry("tcp", net->proc_net);
3089 }
3090 
3091 static struct pernet_operations tcp4_net_ops = {
3092 	.init = tcp4_proc_init_net,
3093 	.exit = tcp4_proc_exit_net,
3094 };
3095 
tcp4_proc_init(void)3096 int __init tcp4_proc_init(void)
3097 {
3098 	return register_pernet_subsys(&tcp4_net_ops);
3099 }
3100 
tcp4_proc_exit(void)3101 void tcp4_proc_exit(void)
3102 {
3103 	unregister_pernet_subsys(&tcp4_net_ops);
3104 }
3105 #endif /* CONFIG_PROC_FS */
3106 
3107 /* @wake is one when sk_stream_write_space() calls us.
3108  * This sends EPOLLOUT only if notsent_bytes is half the limit.
3109  * This mimics the strategy used in sock_def_write_space().
3110  */
tcp_stream_memory_free(const struct sock * sk,int wake)3111 bool tcp_stream_memory_free(const struct sock *sk, int wake)
3112 {
3113 	const struct tcp_sock *tp = tcp_sk(sk);
3114 	u32 notsent_bytes = READ_ONCE(tp->write_seq) -
3115 			    READ_ONCE(tp->snd_nxt);
3116 
3117 	return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
3118 }
3119 EXPORT_SYMBOL(tcp_stream_memory_free);
3120 
3121 struct proto tcp_prot = {
3122 	.name			= "TCP",
3123 	.owner			= THIS_MODULE,
3124 	.close			= tcp_close,
3125 	.pre_connect		= tcp_v4_pre_connect,
3126 	.connect		= tcp_v4_connect,
3127 	.disconnect		= tcp_disconnect,
3128 	.accept			= inet_csk_accept,
3129 	.ioctl			= tcp_ioctl,
3130 	.init			= tcp_v4_init_sock,
3131 	.destroy		= tcp_v4_destroy_sock,
3132 	.shutdown		= tcp_shutdown,
3133 	.setsockopt		= tcp_setsockopt,
3134 	.getsockopt		= tcp_getsockopt,
3135 	.bpf_bypass_getsockopt	= tcp_bpf_bypass_getsockopt,
3136 	.keepalive		= tcp_set_keepalive,
3137 	.recvmsg		= tcp_recvmsg,
3138 	.sendmsg		= tcp_sendmsg,
3139 	.splice_eof		= tcp_splice_eof,
3140 	.backlog_rcv		= tcp_v4_do_rcv,
3141 	.release_cb		= tcp_release_cb,
3142 	.hash			= inet_hash,
3143 	.unhash			= inet_unhash,
3144 	.get_port		= inet_csk_get_port,
3145 	.put_port		= inet_put_port,
3146 #ifdef CONFIG_BPF_SYSCALL
3147 	.psock_update_sk_prot	= tcp_bpf_update_proto,
3148 #endif
3149 	.enter_memory_pressure	= tcp_enter_memory_pressure,
3150 	.leave_memory_pressure	= tcp_leave_memory_pressure,
3151 	.stream_memory_free	= tcp_stream_memory_free,
3152 	.sockets_allocated	= &tcp_sockets_allocated,
3153 	.orphan_count		= &tcp_orphan_count,
3154 
3155 	.memory_allocated	= &tcp_memory_allocated,
3156 	.per_cpu_fw_alloc	= &tcp_memory_per_cpu_fw_alloc,
3157 
3158 	.memory_pressure	= &tcp_memory_pressure,
3159 	.sysctl_mem		= sysctl_tcp_mem,
3160 	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
3161 	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
3162 	.max_header		= MAX_TCP_HEADER,
3163 	.obj_size		= sizeof(struct tcp_sock),
3164 	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
3165 	.twsk_prot		= &tcp_timewait_sock_ops,
3166 	.rsk_prot		= &tcp_request_sock_ops,
3167 	.h.hashinfo		= NULL,
3168 	.no_autobind		= true,
3169 	.diag_destroy		= tcp_abort,
3170 };
3171 EXPORT_SYMBOL(tcp_prot);
3172 
tcp_sk_exit(struct net * net)3173 static void __net_exit tcp_sk_exit(struct net *net)
3174 {
3175 	if (net->ipv4.tcp_congestion_control)
3176 		bpf_module_put(net->ipv4.tcp_congestion_control,
3177 			       net->ipv4.tcp_congestion_control->owner);
3178 }
3179 
tcp_set_hashinfo(struct net * net)3180 static void __net_init tcp_set_hashinfo(struct net *net)
3181 {
3182 	struct inet_hashinfo *hinfo;
3183 	unsigned int ehash_entries;
3184 	struct net *old_net;
3185 
3186 	if (net_eq(net, &init_net))
3187 		goto fallback;
3188 
3189 	old_net = current->nsproxy->net_ns;
3190 	ehash_entries = READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries);
3191 	if (!ehash_entries)
3192 		goto fallback;
3193 
3194 	ehash_entries = roundup_pow_of_two(ehash_entries);
3195 	hinfo = inet_pernet_hashinfo_alloc(&tcp_hashinfo, ehash_entries);
3196 	if (!hinfo) {
3197 		pr_warn("Failed to allocate TCP ehash (entries: %u) "
3198 			"for a netns, fallback to the global one\n",
3199 			ehash_entries);
3200 fallback:
3201 		hinfo = &tcp_hashinfo;
3202 		ehash_entries = tcp_hashinfo.ehash_mask + 1;
3203 	}
3204 
3205 	net->ipv4.tcp_death_row.hashinfo = hinfo;
3206 	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2;
3207 	net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128);
3208 }
3209 
tcp_sk_init(struct net * net)3210 static int __net_init tcp_sk_init(struct net *net)
3211 {
3212 	net->ipv4.sysctl_tcp_ecn = 2;
3213 	net->ipv4.sysctl_tcp_ecn_fallback = 1;
3214 
3215 	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
3216 	net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
3217 	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
3218 	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
3219 	net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
3220 
3221 	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
3222 	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
3223 	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
3224 
3225 	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
3226 	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
3227 	net->ipv4.sysctl_tcp_syncookies = 1;
3228 	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
3229 	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
3230 	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
3231 	net->ipv4.sysctl_tcp_orphan_retries = 0;
3232 	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
3233 	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
3234 	net->ipv4.sysctl_tcp_tw_reuse = 2;
3235 	net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
3236 
3237 	refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1);
3238 	tcp_set_hashinfo(net);
3239 
3240 	net->ipv4.sysctl_tcp_sack = 1;
3241 	net->ipv4.sysctl_tcp_window_scaling = 1;
3242 	net->ipv4.sysctl_tcp_timestamps = 1;
3243 	net->ipv4.sysctl_tcp_early_retrans = 3;
3244 	net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
3245 	net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
3246 	net->ipv4.sysctl_tcp_retrans_collapse = 1;
3247 	net->ipv4.sysctl_tcp_max_reordering = 300;
3248 	net->ipv4.sysctl_tcp_dsack = 1;
3249 	net->ipv4.sysctl_tcp_app_win = 31;
3250 	net->ipv4.sysctl_tcp_adv_win_scale = 1;
3251 	net->ipv4.sysctl_tcp_frto = 2;
3252 	net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
3253 	/* This limits the percentage of the congestion window which we
3254 	 * will allow a single TSO frame to consume.  Building TSO frames
3255 	 * which are too large can cause TCP streams to be bursty.
3256 	 */
3257 	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
3258 	/* Default TSQ limit of 16 TSO segments */
3259 	net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
3260 
3261 	/* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. */
3262 	net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX;
3263 
3264 	net->ipv4.sysctl_tcp_min_tso_segs = 2;
3265 	net->ipv4.sysctl_tcp_tso_rtt_log = 9;  /* 2^9 = 512 usec */
3266 	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
3267 	net->ipv4.sysctl_tcp_autocorking = 1;
3268 	net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
3269 	net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
3270 	net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
3271 	if (net != &init_net) {
3272 		memcpy(net->ipv4.sysctl_tcp_rmem,
3273 		       init_net.ipv4.sysctl_tcp_rmem,
3274 		       sizeof(init_net.ipv4.sysctl_tcp_rmem));
3275 		memcpy(net->ipv4.sysctl_tcp_wmem,
3276 		       init_net.ipv4.sysctl_tcp_wmem,
3277 		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
3278 	}
3279 	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
3280 	net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
3281 	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
3282 	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
3283 	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
3284 	atomic_set(&net->ipv4.tfo_active_disable_times, 0);
3285 
3286 	/* Set default values for PLB */
3287 	net->ipv4.sysctl_tcp_plb_enabled = 0; /* Disabled by default */
3288 	net->ipv4.sysctl_tcp_plb_idle_rehash_rounds = 3;
3289 	net->ipv4.sysctl_tcp_plb_rehash_rounds = 12;
3290 	net->ipv4.sysctl_tcp_plb_suspend_rto_sec = 60;
3291 	/* Default congestion threshold for PLB to mark a round is 50% */
3292 	net->ipv4.sysctl_tcp_plb_cong_thresh = (1 << TCP_PLB_SCALE) / 2;
3293 
3294 	/* Reno is always built in */
3295 	if (!net_eq(net, &init_net) &&
3296 	    bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
3297 			       init_net.ipv4.tcp_congestion_control->owner))
3298 		net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
3299 	else
3300 		net->ipv4.tcp_congestion_control = &tcp_reno;
3301 
3302 	net->ipv4.sysctl_tcp_syn_linear_timeouts = 4;
3303 	net->ipv4.sysctl_tcp_shrink_window = 0;
3304 
3305 	return 0;
3306 }
3307 
tcp_sk_exit_batch(struct list_head * net_exit_list)3308 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
3309 {
3310 	struct net *net;
3311 
3312 	tcp_twsk_purge(net_exit_list, AF_INET);
3313 
3314 	list_for_each_entry(net, net_exit_list, exit_list) {
3315 		inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo);
3316 		WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount));
3317 		tcp_fastopen_ctx_destroy(net);
3318 	}
3319 }
3320 
3321 static struct pernet_operations __net_initdata tcp_sk_ops = {
3322        .init	   = tcp_sk_init,
3323        .exit	   = tcp_sk_exit,
3324        .exit_batch = tcp_sk_exit_batch,
3325 };
3326 
3327 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
/* Context arguments of the "tcp" bpf iterator target. */
DEFINE_BPF_ITER_FUNC(tcp,
		     struct bpf_iter_meta *meta,
		     struct sock_common *sk_common,
		     uid_t uid)

/* Batch size requested when a tcp iterator is initialised; see
 * bpf_iter_init_tcp().
 */
#define INIT_BATCH_SZ 16
3332 
3333 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
3334 {
3335 	struct bpf_tcp_iter_state *iter = priv_data;
3336 	int err;
3337 
3338 	err = bpf_iter_init_seq_net(priv_data, aux);
3339 	if (err)
3340 		return err;
3341 
3342 	err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ);
3343 	if (err) {
3344 		bpf_iter_fini_seq_net(priv_data);
3345 		return err;
3346 	}
3347 
3348 	return 0;
3349 }
3350 
bpf_iter_fini_tcp(void * priv_data)3351 static void bpf_iter_fini_tcp(void *priv_data)
3352 {
3353 	struct bpf_tcp_iter_state *iter = priv_data;
3354 
3355 	bpf_iter_fini_seq_net(priv_data);
3356 	kvfree(iter->batch);
3357 }
3358 
3359 static const struct bpf_iter_seq_info tcp_seq_info = {
3360 	.seq_ops		= &bpf_iter_tcp_seq_ops,
3361 	.init_seq_private	= bpf_iter_init_tcp,
3362 	.fini_seq_private	= bpf_iter_fini_tcp,
3363 	.seq_priv_size		= sizeof(struct bpf_tcp_iter_state),
3364 };
3365 
3366 static const struct bpf_func_proto *
bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,const struct bpf_prog * prog)3367 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
3368 			    const struct bpf_prog *prog)
3369 {
3370 	switch (func_id) {
3371 	case BPF_FUNC_setsockopt:
3372 		return &bpf_sk_setsockopt_proto;
3373 	case BPF_FUNC_getsockopt:
3374 		return &bpf_sk_getsockopt_proto;
3375 	default:
3376 		return NULL;
3377 	}
3378 }
3379 
3380 static struct bpf_iter_reg tcp_reg_info = {
3381 	.target			= "tcp",
3382 	.ctx_arg_info_size	= 1,
3383 	.ctx_arg_info		= {
3384 		{ offsetof(struct bpf_iter__tcp, sk_common),
3385 		  PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED },
3386 	},
3387 	.get_func_proto		= bpf_iter_tcp_get_func_proto,
3388 	.seq_info		= &tcp_seq_info,
3389 };
3390 
bpf_iter_register(void)3391 static void __init bpf_iter_register(void)
3392 {
3393 	tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3394 	if (bpf_iter_reg_target(&tcp_reg_info))
3395 		pr_warn("Warning: could not register bpf iterator tcp\n");
3396 }
3397 
3398 #endif
3399 
tcp_v4_init(void)3400 void __init tcp_v4_init(void)
3401 {
3402 	int cpu, res;
3403 
3404 	for_each_possible_cpu(cpu) {
3405 		struct sock *sk;
3406 
3407 		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
3408 					   IPPROTO_TCP, &init_net);
3409 		if (res)
3410 			panic("Failed to create the TCP control socket.\n");
3411 		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
3412 
3413 		/* Please enforce IP_DF and IPID==0 for RST and
3414 		 * ACK sent in SYN-RECV and TIME-WAIT state.
3415 		 */
3416 		inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
3417 
3418 		per_cpu(ipv4_tcp_sk, cpu) = sk;
3419 	}
3420 	if (register_pernet_subsys(&tcp_sk_ops))
3421 		panic("Failed to create the TCP control socket.\n");
3422 
3423 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3424 	bpf_iter_register();
3425 #endif
3426 }
3427