xref: /openbmc/linux/net/ipv4/tcp_ipv4.c (revision a971b42c)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		Implementation of the Transmission Control Protocol(TCP).
8  *
9  *		IPv4 specific functions
10  *
11  *		code split from:
12  *		linux/ipv4/tcp.c
13  *		linux/ipv4/tcp_input.c
14  *		linux/ipv4/tcp_output.c
15  *
16  *		See tcp.c for author information
17  */
18 
19 /*
20  * Changes:
21  *		David S. Miller	:	New socket lookup architecture.
22  *					This code is dedicated to John Dyson.
23  *		David S. Miller :	Change semantics of established hash,
24  *					half is devoted to TIME_WAIT sockets
25  *					and the rest go in the other half.
26  *		Andi Kleen :		Add support for syncookies and fixed
27  *					some bugs: ip options weren't passed to
28  *					the TCP layer, missed a check for an
29  *					ACK bit.
30  *		Andi Kleen :		Implemented fast path mtu discovery.
31  *	     				Fixed many serious bugs in the
32  *					request_sock handling and moved
33  *					most of it into the af independent code.
34  *					Added tail drop and some other bugfixes.
35  *					Added new listen semantics.
36  *		Mike McLagan	:	Routing by source
37  *	Juan Jose Ciarlante:		ip_dynaddr bits
38  *		Andi Kleen:		various fixes.
39  *	Vitaly E. Lavrov	:	Transparent proxy revived after year
40  *					coma.
41  *	Andi Kleen		:	Fix new listen.
42  *	Andi Kleen		:	Fix accept error reporting.
43  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
44  *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
45  *					a single port at the same time.
46  */
47 
48 #define pr_fmt(fmt) "TCP: " fmt
49 
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
60 
61 #include <net/net_namespace.h>
62 #include <net/icmp.h>
63 #include <net/inet_hashtables.h>
64 #include <net/tcp.h>
65 #include <net/transp_v6.h>
66 #include <net/ipv6.h>
67 #include <net/inet_common.h>
68 #include <net/timewait_sock.h>
69 #include <net/xfrm.h>
70 #include <net/secure_seq.h>
71 #include <net/busy_poll.h>
72 
73 #include <linux/inet.h>
74 #include <linux/ipv6.h>
75 #include <linux/stddef.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
78 #include <linux/inetdevice.h>
79 
80 #include <crypto/hash.h>
81 #include <linux/scatterlist.h>
82 
83 #include <trace/events/tcp.h>
84 
85 #ifdef CONFIG_TCP_MD5SIG
86 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
87 			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
88 #endif
89 
90 struct inet_hashinfo tcp_hashinfo;
91 EXPORT_SYMBOL(tcp_hashinfo);
92 
93 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
94 {
95 	return secure_tcp_seq(ip_hdr(skb)->daddr,
96 			      ip_hdr(skb)->saddr,
97 			      tcp_hdr(skb)->dest,
98 			      tcp_hdr(skb)->source);
99 }
100 
101 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
102 {
103 	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
104 }
105 
/* Decide whether TIME-WAIT socket @sktw may be reused for a new outgoing
 * connection from @sk that collides with it on the 4-tuple.
 *
 * Returns 1 if the caller may take over the tuple (a reference on @sktw is
 * taken via sock_hold() before returning), 0 otherwise.  @twp is the
 * timewait-bucket pointer from the caller; a NULL @twp relaxes the reuse
 * condition (see the check below).
 */
int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
	const struct inet_timewait_sock *tw = inet_twsk(sktw);
	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
	struct tcp_sock *tp = tcp_sk(sk);
	/* sysctl net.ipv4.tcp_tw_reuse: 0 = off, 1 = on, 2 = loopback only */
	int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;

	if (reuse == 2) {
		/* Still does not detect *everything* that goes through
		 * lo, since we require a loopback src or dst address
		 * or direct binding to 'lo' interface.
		 */
		bool loopback = false;
		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
			loopback = true;
#if IS_ENABLED(CONFIG_IPV6)
		if (tw->tw_family == AF_INET6) {
			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
				loopback = true;
		} else
#endif
		{
			if (ipv4_is_loopback(tw->tw_daddr) ||
			    ipv4_is_loopback(tw->tw_rcv_saddr))
				loopback = true;
		}
		if (!loopback)
			reuse = 0;
	}

	/* With PAWS, it is safe from the viewpoint
	   of data integrity. Even without PAWS it is safe provided sequence
	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.

	   Actually, the idea is close to VJ's one, only timestamp cache is
	   held not per host, but per port pair and TW bucket is used as state
	   holder.

	   If TW bucket has been already destroyed we fall back to VJ's scheme
	   and use initial timestamp retrieved from peer table.
	 */
	if (tcptw->tw_ts_recent_stamp &&
	    (!twp || (reuse && time_after32(ktime_get_seconds(),
					    tcptw->tw_ts_recent_stamp)))) {
		/* In case of repair and re-using TIME-WAIT sockets we still
		 * want to be sure that it is safe as above but honor the
		 * sequence numbers and time stamps set as part of the repair
		 * process.
		 *
		 * Without this check re-using a TIME-WAIT socket with TCP
		 * repair would accumulate a -1 on the repair assigned
		 * sequence number. The first time it is reused the sequence
		 * is -1, the second time -2, etc. This fixes that issue
		 * without appearing to create any others.
		 */
		if (likely(!tp->repair)) {
			/* Start well past the old connection's snd_nxt;
			 * 0 is reserved as "not set", so bump to 1.
			 */
			u32 seq = tcptw->tw_snd_nxt + 65535 + 2;

			if (!seq)
				seq = 1;
			WRITE_ONCE(tp->write_seq, seq);
			/* Inherit the timestamp state so PAWS keeps working
			 * across the reuse.
			 */
			tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
			tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
		}
		sock_hold(sktw);
		return 1;
	}

	return 0;
}
179 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
180 
/* Pre-connect hook: run the cgroup-BPF INET4_CONNECT program (which may
 * rewrite the destination address) before tcp_v4_connect() proper.
 * Returns 0 or a negative errno from the BPF program.
 */
static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
			      int addr_len)
{
	/* This check is replicated from tcp_v4_connect() and intended to
	 * prevent BPF program called below from accessing bytes that are out
	 * of the bound specified by user in addr_len.
	 */
	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	/* Lockdep assertion: caller must hold the socket lock. */
	sock_owned_by_me(sk);

	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
}
195 
/* This will initiate an outgoing connection.
 *
 * Resolves a route to @uaddr, picks a source port (inet_hash_connect()),
 * initializes sequence/timestamp state and sends the SYN via tcp_connect().
 * Caller holds the socket lock.  Returns 0 or a negative errno; on failure
 * the socket is moved back to TCP_CLOSE and unhashed.
 */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	__be16 orig_sport, orig_dport;
	__be32 daddr, nexthop;
	struct flowi4 *fl4;
	struct rtable *rt;
	int err;
	struct ip_options_rcu *inet_opt;
	struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	inet_opt = rcu_dereference_protected(inet->inet_opt,
					     lockdep_sock_is_held(sk));
	if (inet_opt && inet_opt->opt.srr) {
		/* Source routing: route to the first hop, not the final
		 * destination.
		 */
		if (!daddr)
			return -EINVAL;
		nexthop = inet_opt->opt.faddr;
	}

	orig_sport = inet->inet_sport;
	orig_dport = usin->sin_port;
	fl4 = &inet->cork.fl.u.ip4;
	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
			      IPPROTO_TCP,
			      orig_sport, orig_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		if (err == -ENETUNREACH)
			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
		return err;
	}

	/* TCP never connects to multicast/broadcast addresses. */
	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet_opt || !inet_opt->opt.srr)
		daddr = fl4->daddr;

	/* Adopt the route's preferred source address if none was bound. */
	if (!inet->inet_saddr)
		inet->inet_saddr = fl4->saddr;
	sk_rcv_saddr_set(sk, inet->inet_saddr);

	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent	   = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		if (likely(!tp->repair))
			WRITE_ONCE(tp->write_seq, 0);
	}

	inet->inet_dport = usin->sin_port;
	sk_daddr_set(sk, daddr);

	inet_csk(sk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;

	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and not releasing socket
	 * lock select source port, enter ourselves into the hash tables and
	 * complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(tcp_death_row, sk);
	if (err)
		goto failure;

	sk_set_txhash(sk);

	/* Re-validate the route now that the source port is known. */
	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
			       inet->inet_sport, inet->inet_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		rt = NULL;
		goto failure;
	}
	/* OK, now commit destination to socket.  */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->dst);
	rt = NULL;	/* ownership passed to sk_setup_caps() */

	if (likely(!tp->repair)) {
		if (!tp->write_seq)
			WRITE_ONCE(tp->write_seq,
				   secure_tcp_seq(inet->inet_saddr,
						  inet->inet_daddr,
						  inet->inet_sport,
						  usin->sin_port));
		tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
						 inet->inet_saddr,
						 inet->inet_daddr);
	}

	inet->inet_id = prandom_u32();

	/* TFO may defer the actual connect until sendmsg() supplies data. */
	if (tcp_fastopen_defer_connect(sk, &err))
		return err;
	if (err)
		goto failure;

	err = tcp_connect(sk);

	if (err)
		goto failure;

	return 0;

failure:
	/*
	 * This unhashes the socket and releases the local port,
	 * if necessary.
	 */
	tcp_set_state(sk, TCP_CLOSE);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->inet_dport = 0;
	return err;
}
329 EXPORT_SYMBOL(tcp_v4_connect);
330 
331 /*
332  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
333  * It can be called through tcp_release_cb() if socket was owned by user
334  * at the time tcp_v4_err() was called to handle ICMP message.
335  */
336 void tcp_v4_mtu_reduced(struct sock *sk)
337 {
338 	struct inet_sock *inet = inet_sk(sk);
339 	struct dst_entry *dst;
340 	u32 mtu;
341 
342 	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
343 		return;
344 	mtu = tcp_sk(sk)->mtu_info;
345 	dst = inet_csk_update_pmtu(sk, mtu);
346 	if (!dst)
347 		return;
348 
349 	/* Something is about to be wrong... Remember soft error
350 	 * for the case, if this connection will not able to recover.
351 	 */
352 	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
353 		sk->sk_err_soft = EMSGSIZE;
354 
355 	mtu = dst_mtu(dst);
356 
357 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
358 	    ip_sk_accept_pmtu(sk) &&
359 	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
360 		tcp_sync_mss(sk, mtu);
361 
362 		/* Resend the TCP packet because it's
363 		 * clear that the old packet has been
364 		 * dropped. This is the new "fast" path mtu
365 		 * discovery.
366 		 */
367 		tcp_simple_retransmit(sk);
368 	} /* else let the usual retransmit timer handle it */
369 }
370 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
371 
372 static void do_redirect(struct sk_buff *skb, struct sock *sk)
373 {
374 	struct dst_entry *dst = __sk_dst_check(sk, 0);
375 
376 	if (dst)
377 		dst->ops->redirect(dst, sk, skb);
378 }
379 
380 
381 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
382 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
383 {
384 	struct request_sock *req = inet_reqsk(sk);
385 	struct net *net = sock_net(sk);
386 
387 	/* ICMPs are not backlogged, hence we cannot get
388 	 * an established socket here.
389 	 */
390 	if (seq != tcp_rsk(req)->snt_isn) {
391 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
392 	} else if (abort) {
393 		/*
394 		 * Still in SYN_RECV, just remove it silently.
395 		 * There is no good way to pass the error to the newly
396 		 * created socket, and POSIX does not want network
397 		 * errors returned from accept().
398 		 */
399 		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
400 		tcp_listendrop(req->rsk_listener);
401 	}
402 	reqsk_put(req);
403 }
404 EXPORT_SYMBOL(tcp_req_err);
405 
/* TCP-LD (RFC 6069) logic: on receipt of an ICMP unreachable that
 * acknowledges our oldest outstanding data (@seq == snd_una), undo one
 * step of exponential RTO backoff and re-arm (or immediately fire) the
 * retransmit timer, since the path apparently still forwards packets.
 */
void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;
	s32 remaining;
	u32 delta_us;

	/* Only safe to touch timer state when the socket is not owned. */
	if (sock_owned_by_user(sk))
		return;

	/* Nothing to revert unless we are actually backing off on the
	 * segment the ICMP refers to.
	 */
	if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
	    !icsk->icsk_backoff)
		return;

	skb = tcp_rtx_queue_head(sk);
	if (WARN_ON_ONCE(!skb))
		return;

	/* Undo one backoff step and recompute the RTO from srtt. */
	icsk->icsk_backoff--;
	icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
	icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);

	/* Time already elapsed since the head skb was (re)transmitted. */
	tcp_mstamp_refresh(tp);
	delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
	remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);

	if (remaining > 0) {
		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
					  remaining, TCP_RTO_MAX);
	} else {
		/* RTO revert clocked out retransmission.
		 * Will retransmit now.
		 */
		tcp_retransmit_timer(sk);
	}
}
444 EXPORT_SYMBOL(tcp_ld_RTO_revert);
445 
446 /*
447  * This routine is called by the ICMP module when it gets some
448  * sort of error condition.  If err < 0 then the socket should
449  * be closed and the error returned to the user.  If err > 0
450  * it's just the icmp type << 8 | icmp code.  After adjustment
451  * header points to the first 8 bytes of the tcp header.  We need
452  * to find the appropriate port.
453  *
454  * The locking strategy used here is very "optimistic". When
455  * someone else accesses the socket the ICMP is just dropped
456  * and for some paths there is no check at all.
457  * A more general error queue to queue errors for later handling
458  * is probably better.
459  *
460  */
461 
/* ICMP error handler for TCP over IPv4; see the locking notes in the
 * comment block above.  @skb is the ICMP packet, whose payload starts with
 * the offending IP header; @info carries type-specific data (e.g. the next
 * hop MTU for ICMP_FRAG_NEEDED).  Returns 0, or -ENOENT if no socket
 * matches the embedded 4-tuple.
 */
int tcp_v4_err(struct sk_buff *skb, u32 info)
{
	/* skb->data points at the IP header of the packet that caused
	 * the error (first 8 bytes of its TCP header follow it).
	 */
	const struct iphdr *iph = (const struct iphdr *)skb->data;
	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
	struct tcp_sock *tp;
	struct inet_sock *inet;
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	struct sock *sk;
	struct request_sock *fastopen;
	u32 seq, snd_una;
	int err;
	struct net *net = dev_net(skb->dev);

	sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
				       th->dest, iph->saddr, ntohs(th->source),
				       inet_iif(skb), 0);
	if (!sk) {
		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
		return -ENOENT;
	}
	/* TIME-WAIT sockets ignore ICMP errors entirely. */
	if (sk->sk_state == TCP_TIME_WAIT) {
		inet_twsk_put(inet_twsk(sk));
		return 0;
	}
	seq = ntohl(th->seq);
	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		/* Abort the request only for hard errors. */
		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
				     type == ICMP_TIME_EXCEEDED ||
				     (type == ICMP_DEST_UNREACH &&
				      (code == ICMP_NET_UNREACH ||
				       code == ICMP_HOST_UNREACH)));
		return 0;
	}

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 * We do take care of PMTU discovery (RFC1191) special case :
	 * we can receive locally generated ICMP messages while socket is held.
	 */
	if (sock_owned_by_user(sk)) {
		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
	}
	if (sk->sk_state == TCP_CLOSE)
		goto out;

	/* Honor IP_MINTTL: drop errors whose TTL is suspiciously low. */
	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
		goto out;
	}

	tp = tcp_sk(sk);
	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
	fastopen = rcu_dereference(tp->fastopen_rsk);
	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
	/* The quoted sequence number must fall in our send window. */
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, snd_una, tp->snd_nxt)) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	switch (type) {
	case ICMP_REDIRECT:
		if (!sock_owned_by_user(sk))
			do_redirect(skb, sk);
		goto out;
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			/* We are not interested in TCP_LISTEN and open_requests
			 * (SYN-ACKs sent out by Linux are always <576bytes so
			 * they should go through unfragmented).
			 */
			if (sk->sk_state == TCP_LISTEN)
				goto out;

			tp->mtu_info = info;
			if (!sock_owned_by_user(sk)) {
				tcp_v4_mtu_reduced(sk);
			} else {
				/* Defer to tcp_release_cb(); the extra
				 * reference keeps sk alive until then.
				 */
				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
					sock_hold(sk);
			}
			goto out;
		}

		err = icmp_err_convert[code].errno;
		/* check if this ICMP message allows revert of backoff.
		 * (see RFC 6069)
		 */
		if (!fastopen &&
		    (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
			tcp_ld_RTO_revert(sk, seq);
		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->sk_state) {
	case TCP_SYN_SENT:
	case TCP_SYN_RECV:
		/* Only in fast or simultaneous open. If a fast open socket
		 * is already accepted it is treated as a connected one below.
		 */
		if (fastopen && !fastopen->sk)
			break;

		ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);

		if (!sock_owned_by_user(sk)) {
			sk->sk_err = err;

			sk->sk_error_report(sk);

			tcp_done(sk);
		} else {
			sk->sk_err_soft = err;
		}
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows to consider as hard errors
	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note, that in modern internet, where routing is unreliable
	 * and in each dark corner broken firewalls sit, sending random
	 * errors ordered by their masters even this two messages finally lose
	 * their original sense (even Linux sends invalid PORT_UNREACHs)
	 *
	 * Now we are in compliance with RFCs.
	 *							--ANK (980905)
	 */

	inet = inet_sk(sk);
	if (!sock_owned_by_user(sk) && inet->recverr) {
		sk->sk_err = err;
		sk->sk_error_report(sk);
	} else	{ /* Only an error on timeout */
		sk->sk_err_soft = err;
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
	return 0;
}
625 
626 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
627 {
628 	struct tcphdr *th = tcp_hdr(skb);
629 
630 	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
631 	skb->csum_start = skb_transport_header(skb) - skb->head;
632 	skb->csum_offset = offsetof(struct tcphdr, check);
633 }
634 
635 /* This routine computes an IPv4 TCP checksum. */
636 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
637 {
638 	const struct inet_sock *inet = inet_sk(sk);
639 
640 	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
641 }
642 EXPORT_SYMBOL(tcp_v4_send_check);
643 
644 /*
645  *	This routine will send an RST to the other tcp.
646  *
647  *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
648  *		      for reset.
649  *	Answer: if a packet caused RST, it is not for a socket
650  *		existing in our system, if it is matched to a socket,
651  *		it is just duplicate segment or bug in other side's TCP.
652  *		So that we build reply only basing on parameters
653  *		arrived with segment.
654  *	Exception: precedence violation. We do not implement it in any case.
655  */
656 
/* Build and send an RST in reply to @skb.  @sk may be NULL (no matching
 * socket), a full socket, a timewait socket or a request socket; it is
 * used only for routing/marking hints and (when MD5 is on) key lookup.
 * The reply is sent from the per-cpu control socket, so no socket state
 * is consumed.
 */
static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
{
	const struct tcphdr *th = tcp_hdr(skb);
	/* On-stack reply: a bare TCP header plus (optionally) one
	 * MD5 option.
	 */
	struct {
		struct tcphdr th;
#ifdef CONFIG_TCP_MD5SIG
		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
#endif
	} rep;
	struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key = NULL;
	const __u8 *hash_location = NULL;
	unsigned char newhash[16];
	int genhash;
	struct sock *sk1 = NULL;
#endif
	u64 transmit_time = 0;
	struct sock *ctl_sk;
	struct net *net;

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	/* If sk not NULL, it means we did a successful lookup and incoming
	 * route had to be correct. prequeue might have dropped our dst.
	 */
	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rep, 0, sizeof(rep));
	rep.th.dest   = th->source;
	rep.th.source = th->dest;
	rep.th.doff   = sizeof(struct tcphdr) / 4;
	rep.th.rst    = 1;

	if (th->ack) {
		rep.th.seq = th->ack_seq;
	} else {
		/* RFC 793: RST acknowledges everything the offending
		 * segment occupied (SYN/FIN each count as one).
		 */
		rep.th.ack = 1;
		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				       skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof(arg));
	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);

	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
#ifdef CONFIG_TCP_MD5SIG
	rcu_read_lock();
	hash_location = tcp_parse_md5sig_option(th);
	if (sk && sk_fullsock(sk)) {
		const union tcp_md5_addr *addr;
		int l3index;

		/* sdif set, means packet ingressed via a device
		 * in an L3 domain and inet_iif is set to it.
		 */
		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
	} else if (hash_location) {
		const union tcp_md5_addr *addr;
		int sdif = tcp_v4_sdif(skb);
		int dif = inet_iif(skb);
		int l3index;

		/*
		 * active side is lost. Try to find listening socket through
		 * source port, and then find md5 key through listening socket.
		 * we are not losing security here:
		 * Incoming packet is checked with md5 hash with finding key,
		 * no RST generated if md5 hash doesn't match.
		 */
		sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
					     ip_hdr(skb)->saddr,
					     th->source, ip_hdr(skb)->daddr,
					     ntohs(th->source), dif, sdif);
		/* don't send rst if it can't find key */
		if (!sk1)
			goto out;

		/* sdif set, means packet ingressed via a device
		 * in an L3 domain and dif is set to it.
		 */
		l3index = sdif ? dif : 0;
		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
		if (!key)
			goto out;


		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
		if (genhash || memcmp(hash_location, newhash, 16) != 0)
			goto out;

	}

	if (key) {
		/* Append an MD5 option (NOP-padded to 4 bytes). */
		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
				   (TCPOPT_NOP << 16) |
				   (TCPOPT_MD5SIG << 8) |
				   TCPOLEN_MD5SIG);
		/* Update length and the length the header thinks exists */
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
				     key, ip_hdr(skb)->saddr,
				     ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;

	/* When socket is gone, all binding information is lost.
	 * routing might fail in this case. No choice here, if we choose to force
	 * input interface, we will misroute in case of asymmetric route.
	 */
	if (sk) {
		arg.bound_dev_if = sk->sk_bound_dev_if;
		if (sk_fullsock(sk))
			trace_tcp_send_reset(sk, skb);
	}

	/* arg.bound_dev_if above works for both full and timewait socks. */
	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));

	arg.tos = ip_hdr(skb)->tos;
	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
	local_bh_disable();
	/* Borrow this cpu's control socket; inherit mark/priority from
	 * the (possibly timewait) socket if we have one.
	 */
	ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
	if (sk) {
		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
				   inet_twsk(sk)->tw_mark : sk->sk_mark;
		ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
				   inet_twsk(sk)->tw_priority : sk->sk_priority;
		transmit_time = tcp_transmit_time(sk);
	}
	ip_send_unicast_reply(ctl_sk,
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len,
			      transmit_time);

	ctl_sk->sk_mark = 0;
	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
	local_bh_enable();

#ifdef CONFIG_TCP_MD5SIG
out:
	rcu_read_unlock();
#endif
}
818 
819 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
820    outside socket context is ugly, certainly. What can I do?
821  */
822 
/* Send a bare ACK in reply to @skb with the given sequence/ack numbers,
 * window, timestamps and (optional) MD5 @key, via this cpu's control
 * socket.  Used for SYN-RECV and TIME-WAIT replies where no full socket
 * context exists (see comment above).
 */
static void tcp_v4_send_ack(const struct sock *sk,
			    struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 tsval, u32 tsecr, int oif,
			    struct tcp_md5sig_key *key,
			    int reply_flags, u8 tos)
{
	const struct tcphdr *th = tcp_hdr(skb);
	/* On-stack reply: TCP header plus room for timestamp and
	 * (optionally) MD5 options.
	 */
	struct {
		struct tcphdr th;
		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
#ifdef CONFIG_TCP_MD5SIG
			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
#endif
			];
	} rep;
	struct net *net = sock_net(sk);
	struct ip_reply_arg arg;
	struct sock *ctl_sk;
	u64 transmit_time;

	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof(arg));

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);
	if (tsecr) {
		/* Timestamp option, NOP-padded to a 4-byte boundary. */
		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				   (TCPOPT_TIMESTAMP << 8) |
				   TCPOLEN_TIMESTAMP);
		rep.opt[1] = htonl(tsval);
		rep.opt[2] = htonl(tsecr);
		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
	}

	/* Swap the send and the receive. */
	rep.th.dest    = th->source;
	rep.th.source  = th->dest;
	rep.th.doff    = arg.iov[0].iov_len / 4;
	rep.th.seq     = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack     = 1;
	rep.th.window  = htons(win);

#ifdef CONFIG_TCP_MD5SIG
	if (key) {
		/* MD5 option goes after the timestamp option, if any. */
		int offset = (tsecr) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
					  (TCPOPT_NOP << 16) |
					  (TCPOPT_MD5SIG << 8) |
					  TCPOLEN_MD5SIG);
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len/4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.flags = reply_flags;
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	if (oif)
		arg.bound_dev_if = oif;
	arg.tos = tos;
	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
	local_bh_disable();
	/* Borrow this cpu's control socket; inherit mark/priority from
	 * the (possibly timewait) socket.
	 */
	ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
	ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
			   inet_twsk(sk)->tw_mark : sk->sk_mark;
	ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
			   inet_twsk(sk)->tw_priority : sk->sk_priority;
	transmit_time = tcp_transmit_time(sk);
	ip_send_unicast_reply(ctl_sk,
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len,
			      transmit_time);

	ctl_sk->sk_mark = 0;
	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	local_bh_enable();
}
908 
909 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
910 {
911 	struct inet_timewait_sock *tw = inet_twsk(sk);
912 	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
913 
914 	tcp_v4_send_ack(sk, skb,
915 			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
916 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
917 			tcp_time_stamp_raw() + tcptw->tw_ts_offset,
918 			tcptw->tw_ts_recent,
919 			tw->tw_bound_dev_if,
920 			tcp_twsk_md5_key(tcptw),
921 			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
922 			tw->tw_tos
923 			);
924 
925 	inet_twsk_put(tw);
926 }
927 
928 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
929 				  struct request_sock *req)
930 {
931 	const union tcp_md5_addr *addr;
932 	int l3index;
933 
934 	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
935 	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
936 	 */
937 	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
938 					     tcp_sk(sk)->snd_nxt;
939 
940 	/* RFC 7323 2.3
941 	 * The window field (SEG.WND) of every outgoing segment, with the
942 	 * exception of <SYN> segments, MUST be right-shifted by
943 	 * Rcv.Wind.Shift bits:
944 	 */
945 	addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
946 	l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
947 	tcp_v4_send_ack(sk, skb, seq,
948 			tcp_rsk(req)->rcv_nxt,
949 			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
950 			tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
951 			req->ts_recent,
952 			0,
953 			tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
954 			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
955 			ip_hdr(skb)->tos);
956 }
957 
958 /*
959  *	Send a SYN-ACK after having received a SYN.
960  *	This still operates on a request_sock only, not on a big
961  *	socket.
962  */
963 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
964 			      struct flowi *fl,
965 			      struct request_sock *req,
966 			      struct tcp_fastopen_cookie *foc,
967 			      enum tcp_synack_type synack_type)
968 {
969 	const struct inet_request_sock *ireq = inet_rsk(req);
970 	struct flowi4 fl4;
971 	int err = -1;
972 	struct sk_buff *skb;
973 
974 	/* First, grab a route. */
975 	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
976 		return -1;
977 
978 	skb = tcp_make_synack(sk, dst, req, foc, synack_type);
979 
980 	if (skb) {
981 		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
982 
983 		rcu_read_lock();
984 		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
985 					    ireq->ir_rmt_addr,
986 					    rcu_dereference(ireq->ireq_opt));
987 		rcu_read_unlock();
988 		err = net_xmit_eval(err);
989 	}
990 
991 	return err;
992 }
993 
994 /*
995  *	IPv4 request_sock destructor.
996  */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
	/* The '1' asserts to RCU lockdep that this is the final access:
	 * the request sock is being freed, no other user can remain.
	 */
	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
}
1001 
1002 #ifdef CONFIG_TCP_MD5SIG
1003 /*
1004  * RFC2385 MD5 checksumming requires a mapping of
1005  * IP address->MD5 Key.
1006  * We need to maintain these in the sk structure.
1007  */
1008 
/* Static branch letting fast paths skip MD5 processing entirely while no
 * MD5 keys exist; enabled elsewhere once a key is installed.
 */
DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
EXPORT_SYMBOL(tcp_md5_needed);
1011 
1012 /* Find the Key structure for an address.  */
struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
					   const union tcp_md5_addr *addr,
					   int family)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	const struct tcp_md5sig_info *md5sig;
	__be32 mask;
	struct tcp_md5sig_key *best_match = NULL;
	bool match;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       lockdep_sock_is_held(sk));
	if (!md5sig)
		return NULL;

	hlist_for_each_entry_rcu(key, &md5sig->head, node,
				 lockdep_sock_is_held(sk)) {
		if (key->family != family)
			continue;
		/* key->l3index == 0 means the key matches any L3 domain */
		if (key->l3index && key->l3index != l3index)
			continue;
		if (family == AF_INET) {
			/* Prefix match: compare only the masked network bits */
			mask = inet_make_mask(key->prefixlen);
			match = (key->addr.a4.s_addr & mask) ==
				(addr->a4.s_addr & mask);
#if IS_ENABLED(CONFIG_IPV6)
		} else if (family == AF_INET6) {
			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
						  key->prefixlen);
#endif
		} else {
			match = false;
		}

		/* Longest matching prefix wins when several keys apply */
		if (match && (!best_match ||
			      key->prefixlen > best_match->prefixlen))
			best_match = key;
	}
	return best_match;
}
EXPORT_SYMBOL(__tcp_md5_do_lookup);
1056 
/* Like __tcp_md5_do_lookup(), but require addr, prefixlen and l3index to
 * match exactly instead of doing a longest-prefix match.  Used by
 * tcp_md5_do_add()/tcp_md5_do_del() to find the one key a request refers to.
 */
static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
						      const union tcp_md5_addr *addr,
						      int family, u8 prefixlen,
						      int l3index)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	unsigned int size = sizeof(struct in_addr);
	const struct tcp_md5sig_info *md5sig;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       lockdep_sock_is_held(sk));
	if (!md5sig)
		return NULL;
#if IS_ENABLED(CONFIG_IPV6)
	if (family == AF_INET6)
		size = sizeof(struct in6_addr);
#endif
	hlist_for_each_entry_rcu(key, &md5sig->head, node,
				 lockdep_sock_is_held(sk)) {
		if (key->family != family)
			continue;
		/* here l3index must match exactly, including 0 ("any") */
		if (key->l3index && key->l3index != l3index)
			continue;
		if (!memcmp(&key->addr, addr, size) &&
		    key->prefixlen == prefixlen)
			return key;
	}
	return NULL;
}
1088 
/* Look up the MD5 key to use when talking to addr_sk's peer address. */
struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
					 const struct sock *addr_sk)
{
	const union tcp_md5_addr *addr;
	int l3index;

	/* Resolve a bound L3 master (VRF) device to its ifindex, 0 if none */
	l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
						 addr_sk->sk_bound_dev_if);
	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
	return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
}
EXPORT_SYMBOL(tcp_v4_md5_lookup);
1101 
1102 /* This can be called on a newly created socket, from other files */
/* This can be called on a newly created socket, from other files.
 * Adds a new key for (addr, family, prefixlen, l3index), or updates the
 * key material in place if such a key already exists.  Returns 0 on
 * success or -ENOMEM.  Caller holds the socket lock.
 */
int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
		   int family, u8 prefixlen, int l3index,
		   const u8 *newkey, u8 newkeylen, gfp_t gfp)
{
	/* Add Key to the list */
	struct tcp_md5sig_key *key;
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_info *md5sig;

	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
	if (key) {
		/* Pre-existing entry - just update that one.
		 * Note that the key might be used concurrently.
		 * data_race() is telling kcsan that we do not care of
		 * key mismatches, since changing MD5 key on live flows
		 * can lead to packet drops.
		 */
		data_race(memcpy(key->key, newkey, newkeylen));

		/* Pairs with READ_ONCE() in tcp_md5_hash_key().
		 * Also note that a reader could catch new key->keylen value
		 * but old key->key[], this is the reason we use __GFP_ZERO
		 * at sock_kmalloc() time below these lines.
		 */
		WRITE_ONCE(key->keylen, newkeylen);

		return 0;
	}

	md5sig = rcu_dereference_protected(tp->md5sig_info,
					   lockdep_sock_is_held(sk));
	if (!md5sig) {
		/* First key on this socket: allocate the info block and
		 * turn off GSO offloads for this socket.
		 */
		md5sig = kmalloc(sizeof(*md5sig), gfp);
		if (!md5sig)
			return -ENOMEM;

		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
		INIT_HLIST_HEAD(&md5sig->head);
		rcu_assign_pointer(tp->md5sig_info, md5sig);
	}

	/* __GFP_ZERO: see the keylen/key[] ordering comment above */
	key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
	if (!key)
		return -ENOMEM;
	if (!tcp_alloc_md5sig_pool()) {
		sock_kfree_s(sk, key, sizeof(*key));
		return -ENOMEM;
	}

	memcpy(key->key, newkey, newkeylen);
	key->keylen = newkeylen;
	key->family = family;
	key->prefixlen = prefixlen;
	key->l3index = l3index;
	memcpy(&key->addr, addr,
	       (family == AF_INET6) ? sizeof(struct in6_addr) :
				      sizeof(struct in_addr));
	hlist_add_head_rcu(&key->node, &md5sig->head);
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_add);
1164 
/* Delete the key exactly matching (addr, family, prefixlen, l3index).
 * Returns 0 on success, -ENOENT if no such key exists.  The key memory
 * is freed after a grace period since RCU readers may still hold it.
 */
int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
		   u8 prefixlen, int l3index)
{
	struct tcp_md5sig_key *key;

	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
	if (!key)
		return -ENOENT;
	hlist_del_rcu(&key->node);
	/* give the sock_kmalloc() charge back to the socket */
	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
	kfree_rcu(key, rcu);
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_del);
1179 
/* Drop every MD5 key attached to sk.  The '1' in
 * rcu_dereference_protected() asserts the caller is the sole remaining
 * user (e.g. socket teardown), so no locking is needed.
 */
static void tcp_clear_md5_list(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	struct hlist_node *n;
	struct tcp_md5sig_info *md5sig;

	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);

	/* _safe variant: entries are unlinked while iterating */
	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
		hlist_del_rcu(&key->node);
		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
		kfree_rcu(key, rcu);
	}
}
1195 
/* TCP_MD5SIG / TCP_MD5SIG_EXT setsockopt() handler: copy the user's
 * struct tcp_md5sig and add, replace or (tcpm_keylen == 0) delete the
 * matching key.
 */
static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
				 char __user *optval, int optlen)
{
	struct tcp_md5sig cmd;
	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
	const union tcp_md5_addr *addr;
	u8 prefixlen = 32;	/* default: exact-host key */
	int l3index = 0;	/* default: not bound to an L3 domain */

	if (optlen < sizeof(cmd))
		return -EINVAL;

	if (copy_from_user(&cmd, optval, sizeof(cmd)))
		return -EFAULT;

	if (sin->sin_family != AF_INET)
		return -EINVAL;

	/* The _EXT variant may carry an address prefix length */
	if (optname == TCP_MD5SIG_EXT &&
	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
		prefixlen = cmd.tcpm_prefixlen;
		if (prefixlen > 32)
			return -EINVAL;
	}

	/* The _EXT variant may bind the key to an L3 master (VRF) device */
	if (optname == TCP_MD5SIG_EXT &&
	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
		struct net_device *dev;

		rcu_read_lock();
		dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
		if (dev && netif_is_l3_master(dev))
			l3index = dev->ifindex;

		rcu_read_unlock();

		/* ok to reference set/not set outside of rcu;
		 * right now device MUST be an L3 master
		 */
		if (!dev || !l3index)
			return -EINVAL;
	}

	addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;

	/* A zero key length means "delete this key" */
	if (!cmd.tcpm_keylen)
		return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index);

	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
		return -EINVAL;

	return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index,
			      cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
}
1250 
/* Feed the RFC 2385 pseudo header plus the base TCP header (checksum
 * field zeroed, options excluded) into the running MD5 hash.  nbytes is
 * the segment length recorded in the pseudo header, not the amount
 * hashed here.  Returns the crypto_ahash_update() result (0 on success).
 */
static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
				   __be32 daddr, __be32 saddr,
				   const struct tcphdr *th, int nbytes)
{
	struct tcp4_pseudohdr *bp;
	struct scatterlist sg;
	struct tcphdr *_th;

	/* build pseudo header in the pool's scratch area */
	bp = hp->scratch;
	bp->saddr = saddr;
	bp->daddr = daddr;
	bp->pad = 0;
	bp->protocol = IPPROTO_TCP;
	bp->len = cpu_to_be16(nbytes);

	/* copy the TCP header right after it, with checksum zeroed */
	_th = (struct tcphdr *)(bp + 1);
	memcpy(_th, th, sizeof(*th));
	_th->check = 0;

	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
	ahash_request_set_crypt(hp->md5_req, &sg, NULL,
				sizeof(*bp) + sizeof(*th));
	return crypto_ahash_update(hp->md5_req);
}
1275 
/* Compute an MD5 signature over pseudo header + TCP header + key (no
 * payload).  On success writes the 16-byte digest to md5_hash and
 * returns 0; on any failure zeroes the digest and returns 1.
 */
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
{
	struct tcp_md5sig_pool *hp;
	struct ahash_request *req;

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	req = hp->md5_req;

	if (crypto_ahash_init(req))
		goto clear_hash;
	/* th->doff << 2 = full header length, used for the pseudo header */
	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	ahash_request_set_crypt(req, NULL, md5_hash, 0);
	if (crypto_ahash_final(req))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}
1306 
/* Compute the MD5 signature for a full segment: pseudo header, TCP
 * header, payload, then the key.  Addresses come from sk when one is
 * supplied (request/established socket), otherwise from the IP header.
 * On success writes 16 digest bytes to md5_hash and returns 0; on any
 * failure zeroes the digest and returns 1.
 */
int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
			const struct sock *sk,
			const struct sk_buff *skb)
{
	struct tcp_md5sig_pool *hp;
	struct ahash_request *req;
	const struct tcphdr *th = tcp_hdr(skb);
	__be32 saddr, daddr;

	if (sk) { /* valid for establish/request sockets */
		saddr = sk->sk_rcv_saddr;
		daddr = sk->sk_daddr;
	} else {
		const struct iphdr *iph = ip_hdr(skb);
		saddr = iph->saddr;
		daddr = iph->daddr;
	}

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	req = hp->md5_req;

	if (crypto_ahash_init(req))
		goto clear_hash;

	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
		goto clear_hash;
	/* payload starts after the (options-included) TCP header */
	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	ahash_request_set_crypt(req, NULL, md5_hash, 0);
	if (crypto_ahash_final(req))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}
EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1353 
1354 #endif
1355 
/* Called with rcu_read_lock().
 * Returns true if the segment must be dropped (MD5 policy violation),
 * false if it may be processed.  Always false when MD5 support is
 * compiled out.
 */
static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
				    const struct sk_buff *skb,
				    int dif, int sdif)
{
#ifdef CONFIG_TCP_MD5SIG
	/*
	 * This gets called for each TCP segment that arrives
	 * so we want to be efficient.
	 * We have 3 drop cases:
	 * o No MD5 hash and one expected.
	 * o MD5 hash and we're not expecting one.
	 * o MD5 hash and it's wrong.
	 */
	const __u8 *hash_location = NULL;
	struct tcp_md5sig_key *hash_expected;
	const struct iphdr *iph = ip_hdr(skb);
	const struct tcphdr *th = tcp_hdr(skb);
	const union tcp_md5_addr *addr;
	unsigned char newhash[16];
	int genhash, l3index;

	/* sdif set, means packet ingressed via a device
	 * in an L3 domain and dif is set to the l3mdev
	 */
	l3index = sdif ? dif : 0;

	/* look up the key configured for the segment's source address */
	addr = (union tcp_md5_addr *)&iph->saddr;
	hash_expected = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
	hash_location = tcp_parse_md5sig_option(th);

	/* We've parsed the options - do we have a hash? */
	if (!hash_expected && !hash_location)
		return false;

	if (hash_expected && !hash_location) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
		return true;
	}

	if (!hash_expected && hash_location) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
		return true;
	}

	/* Okay, so this is hash_expected and hash_location -
	 * so we need to calculate the checksum.
	 */
	genhash = tcp_v4_md5_hash_skb(newhash,
				      hash_expected,
				      NULL, skb);

	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s L3 index %d\n",
				     &iph->saddr, ntohs(th->source),
				     &iph->daddr, ntohs(th->dest),
				     genhash ? " tcp_v4_calc_md5_hash failed"
				     : "", l3index);
		return true;
	}
	return false;
#endif
	/* MD5 support compiled out: never drop */
	return false;
}
1421 
/* Fill the IPv4-specific parts of a fresh request sock from the incoming
 * SYN: mirror the packet's addresses (our local address is the packet's
 * daddr) and stash any IP options for the eventual replies.
 */
static void tcp_v4_init_req(struct request_sock *req,
			    const struct sock *sk_listener,
			    struct sk_buff *skb)
{
	struct inet_request_sock *ireq = inet_rsk(req);
	struct net *net = sock_net(sk_listener);

	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
	/* req is not yet visible to readers, plain RCU init is enough */
	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
}
1433 
/* af-independent hook: route the reply for this request sock */
static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
					  struct flowi *fl,
					  const struct request_sock *req)
{
	return inet_csk_route_req(sk, &fl->u.ip4, req);
}
1440 
/* request_sock (SYN_RECV mini-socket) operations for TCP over IPv4 */
struct request_sock_ops tcp_request_sock_ops __read_mostly = {
	.family		=	PF_INET,
	.obj_size	=	sizeof(struct tcp_request_sock),
	.rtx_syn_ack	=	tcp_rtx_synack,
	.send_ack	=	tcp_v4_reqsk_send_ack,
	.destructor	=	tcp_v4_reqsk_destructor,
	.send_reset	=	tcp_v4_send_reset,
	.syn_ack_timeout =	tcp_syn_ack_timeout,
};
1450 
/* TCP-specific (af-dependent) helpers used by the generic
 * tcp_conn_request() path for IPv4 listeners.
 */
const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
	.mss_clamp	=	TCP_MSS_DEFAULT,
#ifdef CONFIG_TCP_MD5SIG
	.req_md5_lookup	=	tcp_v4_md5_lookup,
	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
#endif
	.init_req	=	tcp_v4_init_req,
#ifdef CONFIG_SYN_COOKIES
	.cookie_init_seq =	cookie_v4_init_sequence,
#endif
	.route_req	=	tcp_v4_route_req,
	.init_seq	=	tcp_v4_init_seq,
	.init_ts_off	=	tcp_v4_init_ts_off,
	.send_synack	=	tcp_v4_send_synack,
};
1466 
/* Handle an incoming SYN on a listening socket.  Always returns 0; a
 * rejected SYN is just counted and dropped.
 */
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	/* Never answer to SYNs sent to broadcast or multicast */
	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	return tcp_conn_request(&tcp_request_sock_ops,
				&tcp_request_sock_ipv4_ops, sk, skb);

drop:
	tcp_listendrop(sk);
	return 0;
}
EXPORT_SYMBOL(tcp_v4_conn_request);
1481 
1482 
1483 /*
1484  * The three way handshake has completed - we got a valid synack -
1485  * now create the new socket.
1486  */
struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct dst_entry *dst,
				  struct request_sock *req_unhash,
				  bool *own_req)
{
	struct inet_request_sock *ireq;
	struct inet_sock *newinet;
	struct tcp_sock *newtp;
	struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
	const union tcp_md5_addr *addr;
	struct tcp_md5sig_key *key;
	int l3index;
#endif
	struct ip_options_rcu *inet_opt;

	if (sk_acceptq_is_full(sk))
		goto exit_overflow;

	/* clone the listener into a child in TCP_SYN_RECV state */
	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit_nonewsk;

	newsk->sk_gso_type = SKB_GSO_TCPV4;
	inet_sk_rx_dst_set(newsk, skb);

	/* copy addressing and IP options from the request sock */
	newtp		      = tcp_sk(newsk);
	newinet		      = inet_sk(newsk);
	ireq		      = inet_rsk(req);
	sk_daddr_set(newsk, ireq->ir_rmt_addr);
	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
	newsk->sk_bound_dev_if = ireq->ir_iif;
	newinet->inet_saddr   = ireq->ir_loc_addr;
	inet_opt	      = rcu_dereference(ireq->ireq_opt);
	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
	newinet->mc_index     = inet_iif(skb);
	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
	newinet->rcv_tos      = ip_hdr(skb)->tos;
	inet_csk(newsk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
	/* random IP ID base for this connection */
	newinet->inet_id = prandom_u32();

	if (!dst) {
		dst = inet_csk_route_child_sock(sk, newsk, req);
		if (!dst)
			goto put_and_exit;
	} else {
		/* syncookie case : see end of cookie_v4_check() */
	}
	sk_setup_caps(newsk, dst);

	tcp_ca_openreq_child(newsk, dst);

	tcp_sync_mss(newsk, dst_mtu(dst));
	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));

	tcp_initialize_rcv_mss(newsk);

#ifdef CONFIG_TCP_MD5SIG
	l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
	/* Copy over the MD5 key from the original socket */
	addr = (union tcp_md5_addr *)&newinet->inet_daddr;
	key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
	if (key) {
		/*
		 * We're using one, so create a matching key
		 * on the newsk structure. If we fail to get
		 * memory, then we end up not copying the key
		 * across. Shucks.
		 */
		tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index,
			       key->key, key->keylen, GFP_ATOMIC);
		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
	}
#endif

	if (__inet_inherit_port(sk, newsk) < 0)
		goto put_and_exit;
	/* hash the child; *own_req is false if another cpu beat us to it */
	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
	if (likely(*own_req)) {
		tcp_move_syn(newtp, req);
		/* ownership of ireq_opt was transferred to newsk above */
		ireq->ireq_opt = NULL;
	} else {
		newinet->inet_opt = NULL;
	}
	return newsk;

exit_overflow:
	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
exit_nonewsk:
	dst_release(dst);
exit:
	tcp_listendrop(sk);
	return NULL;
put_and_exit:
	/* newsk must not free the options the request sock still owns */
	newinet->inet_opt = NULL;
	inet_csk_prepare_forced_close(newsk);
	tcp_done(newsk);
	goto exit;
}
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1590 
/* On a listener, a non-SYN segment may be the ACK that completes a
 * syncookie handshake; hand it to cookie_v4_check() and return whatever
 * socket that chooses.  Without CONFIG_SYN_COOKIES, sk is returned
 * unchanged.
 */
static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
{
#ifdef CONFIG_SYN_COOKIES
	const struct tcphdr *th = tcp_hdr(skb);

	if (!th->syn)
		sk = cookie_v4_check(sk, skb);
#endif
	return sk;
}
1601 
/* Compute a syncookie sequence number for this SYN in *cookie and return
 * the MSS to advertise; returns 0 when no cookie can be generated (no
 * CONFIG_SYN_COOKIES, or tcp_get_syncookie_mss() declined).
 */
u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
			 struct tcphdr *th, u32 *cookie)
{
	u16 mss = 0;
#ifdef CONFIG_SYN_COOKIES
	mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
				    &tcp_request_sock_ipv4_ops, sk, th);
	if (mss) {
		*cookie = __cookie_v4_init_sequence(iph, th, &mss);
		/* record that this listener is (treated as) overflowing */
		tcp_synq_overflow(sk);
	}
#endif
	return mss;
}
1616 
/* The socket must have its spinlock held when we get
1618  * here, unless it is a TCP_LISTEN socket.
1619  *
1620  * We have a potential double-lock case here, so even when
1621  * doing backlog processing we use the BH locking scheme.
1622  * This is because we cannot sleep with the original spinlock
1623  * held.
1624  */
1625 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1626 {
1627 	struct sock *rsk;
1628 
1629 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1630 		struct dst_entry *dst = sk->sk_rx_dst;
1631 
1632 		sock_rps_save_rxhash(sk, skb);
1633 		sk_mark_napi_id(sk, skb);
1634 		if (dst) {
1635 			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1636 			    !dst->ops->check(dst, 0)) {
1637 				dst_release(dst);
1638 				sk->sk_rx_dst = NULL;
1639 			}
1640 		}
1641 		tcp_rcv_established(sk, skb);
1642 		return 0;
1643 	}
1644 
1645 	if (tcp_checksum_complete(skb))
1646 		goto csum_err;
1647 
1648 	if (sk->sk_state == TCP_LISTEN) {
1649 		struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1650 
1651 		if (!nsk)
1652 			goto discard;
1653 		if (nsk != sk) {
1654 			if (tcp_child_process(sk, nsk, skb)) {
1655 				rsk = nsk;
1656 				goto reset;
1657 			}
1658 			return 0;
1659 		}
1660 	} else
1661 		sock_rps_save_rxhash(sk, skb);
1662 
1663 	if (tcp_rcv_state_process(sk, skb)) {
1664 		rsk = sk;
1665 		goto reset;
1666 	}
1667 	return 0;
1668 
1669 reset:
1670 	tcp_v4_send_reset(rsk, skb);
1671 discard:
1672 	kfree_skb(skb);
1673 	/* Be careful here. If this function gets more complicated and
1674 	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1675 	 * might be destroyed here. This current version compiles correctly,
1676 	 * but you have been warned.
1677 	 */
1678 	return 0;
1679 
1680 csum_err:
1681 	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1682 	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1683 	goto discard;
1684 }
1685 EXPORT_SYMBOL(tcp_v4_do_rcv);
1686 
/* Best-effort lockless lookup of an established socket for an incoming
 * segment; on a hit, attach the socket (and, when still valid, its
 * cached input route) to the skb so later processing can skip the full
 * lookup.  Always returns 0.
 */
int tcp_v4_early_demux(struct sk_buff *skb)
{
	const struct iphdr *iph;
	const struct tcphdr *th;
	struct sock *sk;

	if (skb->pkt_type != PACKET_HOST)
		return 0;

	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
		return 0;

	iph = ip_hdr(skb);
	th = tcp_hdr(skb);

	/* reject impossible header length */
	if (th->doff < sizeof(struct tcphdr) / 4)
		return 0;

	/* only fully established connections, not listeners */
	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
				       iph->saddr, th->source,
				       iph->daddr, ntohs(th->dest),
				       skb->skb_iif, inet_sdif(skb));
	if (sk) {
		skb->sk = sk;
		skb->destructor = sock_edemux;
		if (sk_fullsock(sk)) {
			/* lockless read: owner may change sk_rx_dst */
			struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);

			if (dst)
				dst = dst_check(dst, 0);
			if (dst &&
			    inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
				skb_dst_set_noref(skb, dst);
		}
	}
	return 0;
}
1724 
/* Queue skb onto the backlog of a user-owned socket, trying first to
 * coalesce it into the backlog's tail skb.  Returns true if the packet
 * was dropped — in that case the socket spinlock has already been
 * released (bh_unlock_sock()) here; returns false if the packet was
 * queued or coalesced (socket still locked).
 */
bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
{
	u32 limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf);
	struct skb_shared_info *shinfo;
	const struct tcphdr *th;
	struct tcphdr *thtail;
	struct sk_buff *tail;
	unsigned int hdrlen;
	bool fragstolen;
	u32 gso_segs;
	int delta;

	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
	 * we can fix skb->truesize to its real value to avoid future drops.
	 * This is valid because skb is not yet charged to the socket.
	 * It has been noticed pure SACK packets were sometimes dropped
	 * (if cooked by drivers without copybreak feature).
	 */
	skb_condense(skb);

	skb_dst_drop(skb);

	if (unlikely(tcp_checksum_complete(skb))) {
		bh_unlock_sock(sk);
		__TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
		__TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
		return true;
	}

	/* Attempt coalescing to last skb in backlog, even if we are
	 * above the limits.
	 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
	 */
	th = (const struct tcphdr *)skb->data;
	hdrlen = th->doff * 4;
	shinfo = skb_shinfo(skb);

	/* make sure gso accounting fields are populated even for
	 * non-GSO packets, so the aggregate counters stay meaningful
	 */
	if (!shinfo->gso_size)
		shinfo->gso_size = skb->len - hdrlen;

	if (!shinfo->gso_segs)
		shinfo->gso_segs = 1;

	tail = sk->sk_backlog.tail;
	if (!tail)
		goto no_coalesce;
	thtail = (struct tcphdr *)tail->data;

	/* Only coalesce segments that are strictly contiguous and whose
	 * headers are compatible (same DSCP, same options, plain ACKs
	 * with matching ECE/CWR, no SYN/RST/URG on either side).
	 */
	if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
	    TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
	    ((TCP_SKB_CB(tail)->tcp_flags |
	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
	    !((TCP_SKB_CB(tail)->tcp_flags &
	      TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
	    ((TCP_SKB_CB(tail)->tcp_flags ^
	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
#ifdef CONFIG_TLS_DEVICE
	    tail->decrypted != skb->decrypted ||
#endif
	    thtail->doff != th->doff ||
	    memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
		goto no_coalesce;

	__skb_pull(skb, hdrlen);
	if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
		/* merged: fold the new segment's header info into tail */
		thtail->window = th->window;

		TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;

		if (after(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))
			TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;

		/* We have to update both TCP_SKB_CB(tail)->tcp_flags and
		 * thtail->fin, so that the fast path in tcp_rcv_established()
		 * is not entered if we append a packet with a FIN.
		 * SYN, RST, URG are not present.
		 * ACK is set on both packets.
		 * PSH : we do not really care in TCP stack,
		 *       at least for 'GRO' packets.
		 */
		thtail->fin |= th->fin;
		TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;

		if (TCP_SKB_CB(skb)->has_rxtstamp) {
			TCP_SKB_CB(tail)->has_rxtstamp = true;
			tail->tstamp = skb->tstamp;
			skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
		}

		/* Not as strict as GRO. We only need to carry mss max value */
		skb_shinfo(tail)->gso_size = max(shinfo->gso_size,
						 skb_shinfo(tail)->gso_size);

		gso_segs = skb_shinfo(tail)->gso_segs + shinfo->gso_segs;
		skb_shinfo(tail)->gso_segs = min_t(u32, gso_segs, 0xFFFF);

		sk->sk_backlog.len += delta;
		__NET_INC_STATS(sock_net(sk),
				LINUX_MIB_TCPBACKLOGCOALESCE);
		kfree_skb_partial(skb, fragstolen);
		return false;
	}
	/* coalesce failed: restore the header we pulled */
	__skb_push(skb, hdrlen);

no_coalesce:
	/* Only socket owner can try to collapse/prune rx queues
	 * to reduce memory overhead, so add a little headroom here.
	 * Few sockets backlog are possibly concurrently non empty.
	 */
	limit += 64*1024;

	if (unlikely(sk_add_backlog(sk, skb, limit))) {
		bh_unlock_sock(sk);
		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
		return true;
	}
	return false;
}
EXPORT_SYMBOL(tcp_add_backlog);
1844 
1845 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1846 {
1847 	struct tcphdr *th = (struct tcphdr *)skb->data;
1848 
1849 	return sk_filter_trim_cap(sk, skb, th->doff * 4);
1850 }
1851 EXPORT_SYMBOL(tcp_filter);
1852 
/* Undo tcp_v4_fill_cb(): move the IP control block back to its normal
 * position in skb->cb[] so the skb can be fed to another lookup pass.
 */
static void tcp_v4_restore_cb(struct sk_buff *skb)
{
	memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
		sizeof(struct inet_skb_parm));
}
1858 
/* Populate TCP_SKB_CB() from the TCP/IP headers, relocating the IP
 * control block first so both can share skb->cb[].
 */
static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
			   const struct tcphdr *th)
{
	/* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
	 * barrier() makes sure compiler wont play fool^Waliasing games.
	 */
	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
		sizeof(struct inet_skb_parm));
	barrier();

	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
	/* SYN and FIN each consume one sequence number */
	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
				    skb->len - th->doff * 4);
	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
	TCP_SKB_CB(skb)->sacked	 = 0;
	TCP_SKB_CB(skb)->has_rxtstamp =
			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
}
1880 
1881 /*
1882  *	From tcp_input.c
1883  */
1884 
1885 int tcp_v4_rcv(struct sk_buff *skb)
1886 {
1887 	struct net *net = dev_net(skb->dev);
1888 	struct sk_buff *skb_to_free;
1889 	int sdif = inet_sdif(skb);
1890 	int dif = inet_iif(skb);
1891 	const struct iphdr *iph;
1892 	const struct tcphdr *th;
1893 	bool refcounted;
1894 	struct sock *sk;
1895 	int ret;
1896 
1897 	if (skb->pkt_type != PACKET_HOST)
1898 		goto discard_it;
1899 
1900 	/* Count it even if it's bad */
1901 	__TCP_INC_STATS(net, TCP_MIB_INSEGS);
1902 
1903 	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1904 		goto discard_it;
1905 
1906 	th = (const struct tcphdr *)skb->data;
1907 
1908 	if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1909 		goto bad_packet;
1910 	if (!pskb_may_pull(skb, th->doff * 4))
1911 		goto discard_it;
1912 
1913 	/* An explanation is required here, I think.
1914 	 * Packet length and doff are validated by header prediction,
1915 	 * provided case of th->doff==0 is eliminated.
1916 	 * So, we defer the checks. */
1917 
1918 	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1919 		goto csum_error;
1920 
1921 	th = (const struct tcphdr *)skb->data;
1922 	iph = ip_hdr(skb);
1923 lookup:
1924 	sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1925 			       th->dest, sdif, &refcounted);
1926 	if (!sk)
1927 		goto no_tcp_socket;
1928 
1929 process:
1930 	if (sk->sk_state == TCP_TIME_WAIT)
1931 		goto do_time_wait;
1932 
1933 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
1934 		struct request_sock *req = inet_reqsk(sk);
1935 		bool req_stolen = false;
1936 		struct sock *nsk;
1937 
1938 		sk = req->rsk_listener;
1939 		if (unlikely(tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))) {
1940 			sk_drops_add(sk, skb);
1941 			reqsk_put(req);
1942 			goto discard_it;
1943 		}
1944 		if (tcp_checksum_complete(skb)) {
1945 			reqsk_put(req);
1946 			goto csum_error;
1947 		}
1948 		if (unlikely(sk->sk_state != TCP_LISTEN)) {
1949 			inet_csk_reqsk_queue_drop_and_put(sk, req);
1950 			goto lookup;
1951 		}
1952 		/* We own a reference on the listener, increase it again
1953 		 * as we might lose it too soon.
1954 		 */
1955 		sock_hold(sk);
1956 		refcounted = true;
1957 		nsk = NULL;
1958 		if (!tcp_filter(sk, skb)) {
1959 			th = (const struct tcphdr *)skb->data;
1960 			iph = ip_hdr(skb);
1961 			tcp_v4_fill_cb(skb, iph, th);
1962 			nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
1963 		}
1964 		if (!nsk) {
1965 			reqsk_put(req);
1966 			if (req_stolen) {
1967 				/* Another cpu got exclusive access to req
1968 				 * and created a full blown socket.
1969 				 * Try to feed this packet to this socket
1970 				 * instead of discarding it.
1971 				 */
1972 				tcp_v4_restore_cb(skb);
1973 				sock_put(sk);
1974 				goto lookup;
1975 			}
1976 			goto discard_and_relse;
1977 		}
1978 		if (nsk == sk) {
1979 			reqsk_put(req);
1980 			tcp_v4_restore_cb(skb);
1981 		} else if (tcp_child_process(sk, nsk, skb)) {
1982 			tcp_v4_send_reset(nsk, skb);
1983 			goto discard_and_relse;
1984 		} else {
1985 			sock_put(sk);
1986 			return 0;
1987 		}
1988 	}
1989 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1990 		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
1991 		goto discard_and_relse;
1992 	}
1993 
1994 	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1995 		goto discard_and_relse;
1996 
1997 	if (tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))
1998 		goto discard_and_relse;
1999 
2000 	nf_reset_ct(skb);
2001 
2002 	if (tcp_filter(sk, skb))
2003 		goto discard_and_relse;
2004 	th = (const struct tcphdr *)skb->data;
2005 	iph = ip_hdr(skb);
2006 	tcp_v4_fill_cb(skb, iph, th);
2007 
2008 	skb->dev = NULL;
2009 
2010 	if (sk->sk_state == TCP_LISTEN) {
2011 		ret = tcp_v4_do_rcv(sk, skb);
2012 		goto put_and_return;
2013 	}
2014 
2015 	sk_incoming_cpu_update(sk);
2016 
2017 	bh_lock_sock_nested(sk);
2018 	tcp_segs_in(tcp_sk(sk), skb);
2019 	ret = 0;
2020 	if (!sock_owned_by_user(sk)) {
2021 		skb_to_free = sk->sk_rx_skb_cache;
2022 		sk->sk_rx_skb_cache = NULL;
2023 		ret = tcp_v4_do_rcv(sk, skb);
2024 	} else {
2025 		if (tcp_add_backlog(sk, skb))
2026 			goto discard_and_relse;
2027 		skb_to_free = NULL;
2028 	}
2029 	bh_unlock_sock(sk);
2030 	if (skb_to_free)
2031 		__kfree_skb(skb_to_free);
2032 
2033 put_and_return:
2034 	if (refcounted)
2035 		sock_put(sk);
2036 
2037 	return ret;
2038 
2039 no_tcp_socket:
2040 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2041 		goto discard_it;
2042 
2043 	tcp_v4_fill_cb(skb, iph, th);
2044 
2045 	if (tcp_checksum_complete(skb)) {
2046 csum_error:
2047 		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2048 bad_packet:
2049 		__TCP_INC_STATS(net, TCP_MIB_INERRS);
2050 	} else {
2051 		tcp_v4_send_reset(NULL, skb);
2052 	}
2053 
2054 discard_it:
2055 	/* Discard frame. */
2056 	kfree_skb(skb);
2057 	return 0;
2058 
2059 discard_and_relse:
2060 	sk_drops_add(sk, skb);
2061 	if (refcounted)
2062 		sock_put(sk);
2063 	goto discard_it;
2064 
2065 do_time_wait:
2066 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2067 		inet_twsk_put(inet_twsk(sk));
2068 		goto discard_it;
2069 	}
2070 
2071 	tcp_v4_fill_cb(skb, iph, th);
2072 
2073 	if (tcp_checksum_complete(skb)) {
2074 		inet_twsk_put(inet_twsk(sk));
2075 		goto csum_error;
2076 	}
2077 	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2078 	case TCP_TW_SYN: {
2079 		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
2080 							&tcp_hashinfo, skb,
2081 							__tcp_hdrlen(th),
2082 							iph->saddr, th->source,
2083 							iph->daddr, th->dest,
2084 							inet_iif(skb),
2085 							sdif);
2086 		if (sk2) {
2087 			inet_twsk_deschedule_put(inet_twsk(sk));
2088 			sk = sk2;
2089 			tcp_v4_restore_cb(skb);
2090 			refcounted = false;
2091 			goto process;
2092 		}
2093 	}
2094 		/* to ACK */
2095 		fallthrough;
2096 	case TCP_TW_ACK:
2097 		tcp_v4_timewait_ack(sk, skb);
2098 		break;
2099 	case TCP_TW_RST:
2100 		tcp_v4_send_reset(sk, skb);
2101 		inet_twsk_deschedule_put(inet_twsk(sk));
2102 		goto discard_it;
2103 	case TCP_TW_SUCCESS:;
2104 	}
2105 	goto discard_it;
2106 }
2107 
/* timewait_sock glue for TCP: size of the TIME_WAIT sock object plus the
 * uniqueness and destructor hooks used for tcp_timewait_sock instances.
 */
static struct timewait_sock_ops tcp_timewait_sock_ops = {
	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
	.twsk_unique	= tcp_twsk_unique,
	.twsk_destructor= tcp_twsk_destructor,
};
2113 
2114 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2115 {
2116 	struct dst_entry *dst = skb_dst(skb);
2117 
2118 	if (dst && dst_hold_safe(dst)) {
2119 		sk->sk_rx_dst = dst;
2120 		inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
2121 	}
2122 }
2123 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2124 
/* Address-family specific operations for TCP over IPv4: transmit,
 * header rebuild, connection setup and sockopt plumbing.  Installed
 * on the socket as icsk_af_ops by tcp_v4_init_sock().
 */
const struct inet_connection_sock_af_ops ipv4_specific = {
	.queue_xmit	   = ip_queue_xmit,
	.send_check	   = tcp_v4_send_check,
	.rebuild_header	   = inet_sk_rebuild_header,
	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
	.conn_request	   = tcp_v4_conn_request,
	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
	.net_header_len	   = sizeof(struct iphdr),
	.setsockopt	   = ip_setsockopt,
	.getsockopt	   = ip_getsockopt,
	.addr2sockaddr	   = inet_csk_addr2sockaddr,
	.sockaddr_len	   = sizeof(struct sockaddr_in),
#ifdef CONFIG_COMPAT
	.compat_setsockopt = compat_ip_setsockopt,
	.compat_getsockopt = compat_ip_getsockopt,
#endif
	.mtu_reduced	   = tcp_v4_mtu_reduced,
};
EXPORT_SYMBOL(ipv4_specific);
2144 
#ifdef CONFIG_TCP_MD5SIG
/* IPv4 flavours of the TCP-MD5 (RFC 2385) key lookup, hashing and
 * setsockopt parsing hooks; installed by tcp_v4_init_sock().
 */
static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
	.md5_lookup		= tcp_v4_md5_lookup,
	.calc_md5_hash		= tcp_v4_md5_hash_skb,
	.md5_parse		= tcp_v4_parse_md5_keys,
};
#endif
2152 
/* NOTE: A lot of things set to zero explicitly by call to
 *       sk_alloc() so need not be done here.
 */
static int tcp_v4_init_sock(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	/* Address-family independent TCP initialization. */
	tcp_init_sock(sk);

	/* Wire up the IPv4-specific connection-socket operations. */
	icsk->icsk_af_ops = &ipv4_specific;

#ifdef CONFIG_TCP_MD5SIG
	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
#endif

	return 0;
}
2170 
/* Final TCP-specific teardown of @sk: stop timers, release congestion
 * control and ULP state, purge pending queues, drop the bound port and
 * any MD5/fastopen state, then decrement the allocated-sockets count.
 * The order is deliberate (timers first, accounting last).
 */
void tcp_v4_destroy_sock(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	trace_tcp_destroy_sock(sk);

	tcp_clear_xmit_timers(sk);

	tcp_cleanup_congestion_control(sk);

	tcp_cleanup_ulp(sk);

	/* Cleanup up the write buffer. */
	tcp_write_queue_purge(sk);

	/* Check if we want to disable active TFO */
	tcp_fastopen_active_disable_ofo_check(sk);

	/* Cleans up our, hopefully empty, out_of_order_queue. */
	skb_rbtree_purge(&tp->out_of_order_queue);

#ifdef CONFIG_TCP_MD5SIG
	/* Clean up the MD5 key list, if any */
	if (tp->md5sig_info) {
		tcp_clear_md5_list(sk);
		/* The '1' asserts exclusive access: no RCU readers can
		 * still see md5sig_info at destroy time.
		 */
		kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
		tp->md5sig_info = NULL;
	}
#endif

	/* Clean up a referenced TCP bind bucket. */
	if (inet_csk(sk)->icsk_bind_hash)
		inet_put_port(sk);

	BUG_ON(rcu_access_pointer(tp->fastopen_rsk));

	/* If socket is aborted during connect operation */
	tcp_free_fastopen_req(tp);
	tcp_fastopen_destroy_cipher(sk);
	tcp_saved_syn_free(tp);

	sk_sockets_allocated_dec(sk);
}
EXPORT_SYMBOL(tcp_v4_destroy_sock);
2215 
2216 #ifdef CONFIG_PROC_FS
2217 /* Proc filesystem TCP sock list dumping. */
2218 
2219 /*
2220  * Get next listener socket follow cur.  If cur is NULL, get first socket
2221  * starting from bucket given in st->bucket; when st->bucket is zero the
2222  * very first socket in the hash table is returned.
2223  */
static void *listening_get_next(struct seq_file *seq, void *cur)
{
	struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);
	struct inet_listen_hashbucket *ilb;
	struct hlist_nulls_node *node;
	struct sock *sk = cur;

	if (!sk) {
get_head:
		/* Start at the head of bucket st->bucket, taking its lock.
		 * The lock stays held while a socket from this bucket is
		 * returned; it is dropped below when the chain runs out
		 * (tcp_seq_stop() drops it on early termination).
		 */
		ilb = &tcp_hashinfo.listening_hash[st->bucket];
		spin_lock(&ilb->lock);
		sk = sk_nulls_head(&ilb->nulls_head);
		st->offset = 0;
		goto get_sk;
	}
	ilb = &tcp_hashinfo.listening_hash[st->bucket];
	++st->num;
	++st->offset;

	sk = sk_nulls_next(sk);
get_sk:
	sk_nulls_for_each_from(sk, node) {
		/* Skip sockets belonging to other network namespaces
		 * or to a different address family than requested.
		 */
		if (!net_eq(sock_net(sk), net))
			continue;
		if (sk->sk_family == afinfo->family)
			return sk;
	}
	/* Bucket exhausted: release its lock and try the next bucket. */
	spin_unlock(&ilb->lock);
	st->offset = 0;
	if (++st->bucket < INET_LHTABLE_SIZE)
		goto get_head;
	return NULL;
}
2259 
2260 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2261 {
2262 	struct tcp_iter_state *st = seq->private;
2263 	void *rc;
2264 
2265 	st->bucket = 0;
2266 	st->offset = 0;
2267 	rc = listening_get_next(seq, NULL);
2268 
2269 	while (rc && *pos) {
2270 		rc = listening_get_next(seq, rc);
2271 		--*pos;
2272 	}
2273 	return rc;
2274 }
2275 
/* True if the established-hash bucket selected by st->bucket holds no
 * sockets; used as a lockless fast path before taking the bucket lock.
 */
static inline bool empty_bucket(const struct tcp_iter_state *st)
{
	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
}
2280 
2281 /*
2282  * Get first established socket starting from bucket given in st->bucket.
2283  * If st->bucket is zero, the very first socket in the hash is returned.
2284  */
static void *established_get_first(struct seq_file *seq)
{
	struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);
	void *rc = NULL;

	st->offset = 0;
	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
		struct sock *sk;
		struct hlist_nulls_node *node;
		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);

		/* Lockless fast path for the common case of empty buckets */
		if (empty_bucket(st))
			continue;

		spin_lock_bh(lock);
		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
			if (sk->sk_family != afinfo->family ||
			    !net_eq(sock_net(sk), net)) {
				continue;
			}
			/* Match: return with this bucket's lock still held;
			 * it is released by established_get_next() or
			 * tcp_seq_stop().
			 */
			rc = sk;
			goto out;
		}
		spin_unlock_bh(lock);
	}
out:
	return rc;
}
2316 
/* Advance to the socket after @cur in the established hash.  When the
 * current chain is exhausted, drop that bucket's lock and restart from
 * the next bucket via established_get_first().
 */
static void *established_get_next(struct seq_file *seq, void *cur)
{
	struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
	struct sock *sk = cur;
	struct hlist_nulls_node *node;
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);

	++st->num;
	++st->offset;

	sk = sk_nulls_next(sk);

	sk_nulls_for_each_from(sk, node) {
		/* Only report sockets of the requested family within
		 * this network namespace.
		 */
		if (sk->sk_family == afinfo->family &&
		    net_eq(sock_net(sk), net))
			return sk;
	}

	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
	++st->bucket;
	return established_get_first(seq);
}
2340 
2341 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2342 {
2343 	struct tcp_iter_state *st = seq->private;
2344 	void *rc;
2345 
2346 	st->bucket = 0;
2347 	rc = established_get_first(seq);
2348 
2349 	while (rc && pos) {
2350 		rc = established_get_next(seq, rc);
2351 		--pos;
2352 	}
2353 	return rc;
2354 }
2355 
2356 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2357 {
2358 	void *rc;
2359 	struct tcp_iter_state *st = seq->private;
2360 
2361 	st->state = TCP_SEQ_STATE_LISTENING;
2362 	rc	  = listening_get_idx(seq, &pos);
2363 
2364 	if (!rc) {
2365 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2366 		rc	  = established_get_idx(seq, pos);
2367 	}
2368 
2369 	return rc;
2370 }
2371 
/* Resume iteration at the position saved in st (state/bucket/offset)
 * without rescanning from the start.  st->num is preserved across the
 * re-seek.  If the saved listening position no longer yields a socket,
 * fall through into the established table.
 */
static void *tcp_seek_last_pos(struct seq_file *seq)
{
	struct tcp_iter_state *st = seq->private;
	int offset = st->offset;
	int orig_num = st->num;
	void *rc = NULL;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		if (st->bucket >= INET_LHTABLE_SIZE)
			break;
		st->state = TCP_SEQ_STATE_LISTENING;
		/* Walk the saved bucket forward by the saved offset. */
		rc = listening_get_next(seq, NULL);
		while (offset-- && rc)
			rc = listening_get_next(seq, rc);
		if (rc)
			break;
		st->bucket = 0;
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		fallthrough;
	case TCP_SEQ_STATE_ESTABLISHED:
		if (st->bucket > tcp_hashinfo.ehash_mask)
			break;
		rc = established_get_first(seq);
		while (offset-- && rc)
			rc = established_get_next(seq, rc);
	}

	/* Restore the running socket count clobbered by the helpers. */
	st->num = orig_num;

	return rc;
}
2404 
/* seq_file ->start(): position the iterator at *pos.  Uses the cached
 * last position as a fast path for sequential reads, otherwise rewinds
 * and seeks from the beginning.
 */
void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	/* Fast path: resuming exactly where the previous read stopped. */
	if (*pos && *pos == st->last_pos) {
		rc = tcp_seek_last_pos(seq);
		if (rc)
			goto out;
	}

	/* Slow path: reset the iterator and walk forward to *pos. */
	st->state = TCP_SEQ_STATE_LISTENING;
	st->num = 0;
	st->bucket = 0;
	st->offset = 0;
	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;

out:
	st->last_pos = *pos;
	return rc;
}
EXPORT_SYMBOL(tcp_seq_start);
2427 
/* seq_file ->next(): advance past @v.  After the header token, walk the
 * listening table; when it runs dry, switch to the established table.
 */
void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc = NULL;

	/* First call after the header line: fetch socket index 0. */
	if (v == SEQ_START_TOKEN) {
		rc = tcp_get_idx(seq, 0);
		goto out;
	}

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		rc = listening_get_next(seq, v);
		if (!rc) {
			/* Listening table exhausted: restart in the
			 * established table from bucket 0.
			 */
			st->state = TCP_SEQ_STATE_ESTABLISHED;
			st->bucket = 0;
			st->offset = 0;
			rc	  = established_get_first(seq);
		}
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		rc = established_get_next(seq, v);
		break;
	}
out:
	++*pos;
	st->last_pos = *pos;
	return rc;
}
EXPORT_SYMBOL(tcp_seq_next);
2458 
/* seq_file ->stop(): release whichever hash-bucket lock the iterator is
 * still holding (taken in listening_get_next()/established_get_first()).
 * Nothing is held when v is the header token or NULL.
 */
void tcp_seq_stop(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		if (v != SEQ_START_TOKEN)
			spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		if (v)
			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
		break;
	}
}
EXPORT_SYMBOL(tcp_seq_stop);
2475 
/* Emit one SYN_RECV request socket as a /proc/net/tcp row.  Several
 * columns are constant because a request_sock carries no full socket
 * state; the uid comes from the listener the request belongs to.
 */
static void get_openreq4(const struct request_sock *req,
			 struct seq_file *f, int i)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	/* Time until the SYN-ACK retransmit timer fires. */
	long delta = req->rsk_timer.expires - jiffies;

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
		i,
		ireq->ir_loc_addr,
		ireq->ir_num,
		ireq->ir_rmt_addr,
		ntohs(ireq->ir_rmt_port),
		TCP_SYN_RECV,
		0, 0, /* could print option size, but that is af dependent. */
		1,    /* timers active (only the expire timer) */
		jiffies_delta_to_clock_t(delta),
		req->num_timeout,
		from_kuid_munged(seq_user_ns(f),
				 sock_i_uid(req->rsk_listener)),
		0,  /* non standard timer */
		0, /* open_requests have no inode */
		0,
		req);
}
2501 
/* Emit one full TCP socket as a /proc/net/tcp row: addresses, state,
 * queue depths, pending timer and expiry, plus assorted accounting.
 * Runs without the socket lock, hence the READ_ONCE() annotations on
 * fields that may change concurrently.
 */
static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
{
	int timer_active;
	unsigned long timer_expires;
	const struct tcp_sock *tp = tcp_sk(sk);
	const struct inet_connection_sock *icsk = inet_csk(sk);
	const struct inet_sock *inet = inet_sk(sk);
	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
	__be32 dest = inet->inet_daddr;
	__be32 src = inet->inet_rcv_saddr;
	__u16 destp = ntohs(inet->inet_dport);
	__u16 srcp = ntohs(inet->inet_sport);
	int rx_queue;
	int state;

	/* Map the pending timer to the numeric code shown in the "tr"
	 * column: 1 for the retransmit-class timers, 4 for ICSK_TIME_PROBE0,
	 * 2 when sk_timer is pending, 0 when nothing is armed.
	 */
	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
		timer_active	= 1;
		timer_expires	= icsk->icsk_timeout;
	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
		timer_active	= 4;
		timer_expires	= icsk->icsk_timeout;
	} else if (timer_pending(&sk->sk_timer)) {
		timer_active	= 2;
		timer_expires	= sk->sk_timer.expires;
	} else {
		timer_active	= 0;
		timer_expires = jiffies;
	}

	state = inet_sk_state_load(sk);
	if (state == TCP_LISTEN)
		rx_queue = READ_ONCE(sk->sk_ack_backlog);
	else
		/* Because we don't lock the socket,
		 * we might find a transient negative value.
		 */
		rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
				      READ_ONCE(tp->copied_seq), 0);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
		i, src, srcp, dest, destp, state,
		READ_ONCE(tp->write_seq) - tp->snd_una,
		rx_queue,
		timer_active,
		jiffies_delta_to_clock_t(timer_expires - jiffies),
		icsk->icsk_retransmits,
		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
		icsk->icsk_probes_out,
		sock_i_ino(sk),
		refcount_read(&sk->sk_refcnt), sk,
		jiffies_to_clock_t(icsk->icsk_rto),
		jiffies_to_clock_t(icsk->icsk_ack.ato),
		(icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
		tp->snd_cwnd,
		state == TCP_LISTEN ?
		    fastopenq->max_qlen :
		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh))
;
}
2563 
/* Emit one TIME_WAIT socket as a /proc/net/tcp row.  Most columns are
 * zero since a timewait sock keeps only addressing and timer state; the
 * constant 3 in the timer column presumably denotes the TIME_WAIT timer
 * slot -- confirm against userspace parsers before changing.
 */
static void get_timewait4_sock(const struct inet_timewait_sock *tw,
			       struct seq_file *f, int i)
{
	/* Time until the TIME_WAIT timer fires. */
	long delta = tw->tw_timer.expires - jiffies;
	__be32 dest, src;
	__u16 destp, srcp;

	dest  = tw->tw_daddr;
	src   = tw->tw_rcv_saddr;
	destp = ntohs(tw->tw_dport);
	srcp  = ntohs(tw->tw_sport);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
		refcount_read(&tw->tw_refcnt), tw);
}
2582 
2583 #define TMPSZ 150
2584 
2585 static int tcp4_seq_show(struct seq_file *seq, void *v)
2586 {
2587 	struct tcp_iter_state *st;
2588 	struct sock *sk = v;
2589 
2590 	seq_setwidth(seq, TMPSZ - 1);
2591 	if (v == SEQ_START_TOKEN) {
2592 		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2593 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2594 			   "inode");
2595 		goto out;
2596 	}
2597 	st = seq->private;
2598 
2599 	if (sk->sk_state == TCP_TIME_WAIT)
2600 		get_timewait4_sock(v, seq, st->num);
2601 	else if (sk->sk_state == TCP_NEW_SYN_RECV)
2602 		get_openreq4(v, seq, st->num);
2603 	else
2604 		get_tcp4_sock(v, seq, st->num);
2605 out:
2606 	seq_pad(seq, '\n');
2607 	return 0;
2608 }
2609 
/* seq_file iteration callbacks backing /proc/net/tcp. */
static const struct seq_operations tcp4_seq_ops = {
	.show		= tcp4_seq_show,
	.start		= tcp_seq_start,
	.next		= tcp_seq_next,
	.stop		= tcp_seq_stop,
};
2616 
/* Restricts the shared TCP seq iterator to AF_INET sockets; passed as
 * PDE data to proc_create_net_data() below.
 */
static struct tcp_seq_afinfo tcp4_seq_afinfo = {
	.family		= AF_INET,
};
2620 
2621 static int __net_init tcp4_proc_init_net(struct net *net)
2622 {
2623 	if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
2624 			sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
2625 		return -ENOMEM;
2626 	return 0;
2627 }
2628 
/* Remove the per-netns /proc/net/tcp entry created by tcp4_proc_init_net(). */
static void __net_exit tcp4_proc_exit_net(struct net *net)
{
	remove_proc_entry("tcp", net->proc_net);
}
2633 
/* Pernet hooks that create/remove /proc/net/tcp for each namespace. */
static struct pernet_operations tcp4_net_ops = {
	.init = tcp4_proc_init_net,
	.exit = tcp4_proc_exit_net,
};
2638 
/* Register the /proc/net/tcp pernet operations at boot. */
int __init tcp4_proc_init(void)
{
	return register_pernet_subsys(&tcp4_net_ops);
}
2643 
/* Unregister the /proc/net/tcp pernet operations. */
void tcp4_proc_exit(void)
{
	unregister_pernet_subsys(&tcp4_net_ops);
}
2648 #endif /* CONFIG_PROC_FS */
2649 
/* The IPv4 TCP protocol descriptor: maps the generic socket-layer
 * operations (connect/accept/sendmsg/...) onto their TCP
 * implementations and points at the shared hash tables, sysctl limits
 * and memory-accounting state.
 */
struct proto tcp_prot = {
	.name			= "TCP",
	.owner			= THIS_MODULE,
	.close			= tcp_close,
	.pre_connect		= tcp_v4_pre_connect,
	.connect		= tcp_v4_connect,
	.disconnect		= tcp_disconnect,
	.accept			= inet_csk_accept,
	.ioctl			= tcp_ioctl,
	.init			= tcp_v4_init_sock,
	.destroy		= tcp_v4_destroy_sock,
	.shutdown		= tcp_shutdown,
	.setsockopt		= tcp_setsockopt,
	.getsockopt		= tcp_getsockopt,
	.keepalive		= tcp_set_keepalive,
	.recvmsg		= tcp_recvmsg,
	.sendmsg		= tcp_sendmsg,
	.sendpage		= tcp_sendpage,
	.backlog_rcv		= tcp_v4_do_rcv,
	.release_cb		= tcp_release_cb,
	.hash			= inet_hash,
	.unhash			= inet_unhash,
	.get_port		= inet_csk_get_port,
	.enter_memory_pressure	= tcp_enter_memory_pressure,
	.leave_memory_pressure	= tcp_leave_memory_pressure,
	.stream_memory_free	= tcp_stream_memory_free,
	.sockets_allocated	= &tcp_sockets_allocated,
	.orphan_count		= &tcp_orphan_count,
	.memory_allocated	= &tcp_memory_allocated,
	.memory_pressure	= &tcp_memory_pressure,
	.sysctl_mem		= sysctl_tcp_mem,
	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
	.max_header		= MAX_TCP_HEADER,
	.obj_size		= sizeof(struct tcp_sock),
	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
	.twsk_prot		= &tcp_timewait_sock_ops,
	.rsk_prot		= &tcp_request_sock_ops,
	.h.hashinfo		= &tcp_hashinfo,
	.no_autobind		= true,
#ifdef CONFIG_COMPAT
	.compat_setsockopt	= compat_tcp_setsockopt,
	.compat_getsockopt	= compat_tcp_getsockopt,
#endif
	.diag_destroy		= tcp_abort,
};
EXPORT_SYMBOL(tcp_prot);
2697 
/* Per-netns teardown: drop the reference taken on this namespace's
 * congestion control module and destroy the per-cpu control sockets
 * allocated by tcp_sk_init().
 */
static void __net_exit tcp_sk_exit(struct net *net)
{
	int cpu;

	if (net->ipv4.tcp_congestion_control)
		bpf_module_put(net->ipv4.tcp_congestion_control,
			       net->ipv4.tcp_congestion_control->owner);

	for_each_possible_cpu(cpu)
		inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
	free_percpu(net->ipv4.tcp_sk);
}
2710 
/* Per-netns setup: allocate one control socket per possible CPU (used
 * for RSTs/ACKs sent without a full socket, see the IP_DF note below),
 * then initialize every TCP sysctl to its default value.  On failure
 * tcp_sk_exit() is called to unwind whatever was allocated.
 */
static int __net_init tcp_sk_init(struct net *net)
{
	int res, cpu, cnt;

	net->ipv4.tcp_sk = alloc_percpu(struct sock *);
	if (!net->ipv4.tcp_sk)
		return -ENOMEM;

	for_each_possible_cpu(cpu) {
		struct sock *sk;

		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
					   IPPROTO_TCP, net);
		if (res)
			goto fail;
		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);

		/* Please enforce IP_DF and IPID==0 for RST and
		 * ACK sent in SYN-RECV and TIME-WAIT state.
		 */
		inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;

		*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
	}

	net->ipv4.sysctl_tcp_ecn = 2;
	net->ipv4.sysctl_tcp_ecn_fallback = 1;

	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
	net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
	net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;

	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;

	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
	net->ipv4.sysctl_tcp_syncookies = 1;
	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
	net->ipv4.sysctl_tcp_orphan_retries = 0;
	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
	net->ipv4.sysctl_tcp_tw_reuse = 2;
	net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;

	/* Scale the TIME_WAIT bucket limit and SYN backlog with the size
	 * of the established hash table.
	 */
	cnt = tcp_hashinfo.ehash_mask + 1;
	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
	net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;

	net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
	net->ipv4.sysctl_tcp_sack = 1;
	net->ipv4.sysctl_tcp_window_scaling = 1;
	net->ipv4.sysctl_tcp_timestamps = 1;
	net->ipv4.sysctl_tcp_early_retrans = 3;
	net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
	net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
	net->ipv4.sysctl_tcp_retrans_collapse = 1;
	net->ipv4.sysctl_tcp_max_reordering = 300;
	net->ipv4.sysctl_tcp_dsack = 1;
	net->ipv4.sysctl_tcp_app_win = 31;
	net->ipv4.sysctl_tcp_adv_win_scale = 1;
	net->ipv4.sysctl_tcp_frto = 2;
	net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
	/* This limits the percentage of the congestion window which we
	 * will allow a single TSO frame to consume.  Building TSO frames
	 * which are too large can cause TCP streams to be bursty.
	 */
	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
	/* Default TSQ limit of 16 TSO segments */
	net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
	/* rfc5961 challenge ack rate limiting */
	net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
	net->ipv4.sysctl_tcp_min_tso_segs = 2;
	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
	net->ipv4.sysctl_tcp_autocorking = 1;
	net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
	net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
	net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
	/* Child namespaces inherit init_net's rmem/wmem limits. */
	if (net != &init_net) {
		memcpy(net->ipv4.sysctl_tcp_rmem,
		       init_net.ipv4.sysctl_tcp_rmem,
		       sizeof(init_net.ipv4.sysctl_tcp_rmem));
		memcpy(net->ipv4.sysctl_tcp_wmem,
		       init_net.ipv4.sysctl_tcp_wmem,
		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
	}
	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
	net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
	spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
	atomic_set(&net->ipv4.tfo_active_disable_times, 0);

	/* Reno is always built in */
	if (!net_eq(net, &init_net) &&
	    bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
			       init_net.ipv4.tcp_congestion_control->owner))
		net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
	else
		net->ipv4.tcp_congestion_control = &tcp_reno;

	return 0;
fail:
	tcp_sk_exit(net);

	return res;
}
2824 
/* Batched netns exit: purge IPv4 TIME_WAIT sockets once for the whole
 * batch, then destroy each exiting namespace's fastopen context.
 */
static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
{
	struct net *net;

	inet_twsk_purge(&tcp_hashinfo, AF_INET);

	list_for_each_entry(net, net_exit_list, exit_list)
		tcp_fastopen_ctx_destroy(net);
}
2834 
/* Pernet lifecycle hooks for TCP state: per-netns init/exit plus the
 * batched exit pass above.
 */
static struct pernet_operations __net_initdata tcp_sk_ops = {
       .init	   = tcp_sk_init,
       .exit	   = tcp_sk_exit,
       .exit_batch = tcp_sk_exit_batch,
};
2840 
/* Boot-time IPv4 TCP initialization: register the pernet operations.
 * Failure is fatal since TCP cannot operate without its control sockets.
 */
void __init tcp_v4_init(void)
{
	if (register_pernet_subsys(&tcp_sk_ops))
		panic("Failed to create the TCP control socket.\n");
}
2846