xref: /openbmc/linux/net/ipv4/tcp_ipv4.c (revision 82df5b73)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		Implementation of the Transmission Control Protocol(TCP).
8  *
9  *		IPv4 specific functions
10  *
11  *		code split from:
12  *		linux/ipv4/tcp.c
13  *		linux/ipv4/tcp_input.c
14  *		linux/ipv4/tcp_output.c
15  *
16  *		See tcp.c for author information
17  */
18 
19 /*
20  * Changes:
21  *		David S. Miller	:	New socket lookup architecture.
22  *					This code is dedicated to John Dyson.
23  *		David S. Miller :	Change semantics of established hash,
24  *					half is devoted to TIME_WAIT sockets
25  *					and the rest go in the other half.
26  *		Andi Kleen :		Add support for syncookies and fixed
27  *					some bugs: ip options weren't passed to
28  *					the TCP layer, missed a check for an
29  *					ACK bit.
30  *		Andi Kleen :		Implemented fast path mtu discovery.
31  *	     				Fixed many serious bugs in the
32  *					request_sock handling and moved
33  *					most of it into the af independent code.
34  *					Added tail drop and some other bugfixes.
35  *					Added new listen semantics.
36  *		Mike McLagan	:	Routing by source
37  *	Juan Jose Ciarlante:		ip_dynaddr bits
38  *		Andi Kleen:		various fixes.
39  *	Vitaly E. Lavrov	:	Transparent proxy revived after year
40  *					coma.
41  *	Andi Kleen		:	Fix new listen.
42  *	Andi Kleen		:	Fix accept error reporting.
43  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
44  *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
45  *					a single port at the same time.
46  */
47 
48 #define pr_fmt(fmt) "TCP: " fmt
49 
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
60 
61 #include <net/net_namespace.h>
62 #include <net/icmp.h>
63 #include <net/inet_hashtables.h>
64 #include <net/tcp.h>
65 #include <net/transp_v6.h>
66 #include <net/ipv6.h>
67 #include <net/inet_common.h>
68 #include <net/timewait_sock.h>
69 #include <net/xfrm.h>
70 #include <net/secure_seq.h>
71 #include <net/busy_poll.h>
72 
73 #include <linux/inet.h>
74 #include <linux/ipv6.h>
75 #include <linux/stddef.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
78 #include <linux/inetdevice.h>
79 
80 #include <crypto/hash.h>
81 #include <linux/scatterlist.h>
82 
83 #include <trace/events/tcp.h>
84 
85 #ifdef CONFIG_TCP_MD5SIG
/* Forward declaration: tcp_v4_send_reset() and tcp_v4_send_ack() below call
 * this helper to sign the TCP header of a reply; its definition is not in
 * this part of the file.
 */
86 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
87 			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
88 #endif
89 
/* Socket hash tables for TCP.  In this file it is handed to
 * __inet_lookup_established() and __inet_lookup_listener() to find the
 * socket a packet belongs to.  Exported for use by modules.
 */
90 struct inet_hashinfo tcp_hashinfo;
91 EXPORT_SYMBOL(tcp_hashinfo);
92 
93 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
94 {
95 	return secure_tcp_seq(ip_hdr(skb)->daddr,
96 			      ip_hdr(skb)->saddr,
97 			      tcp_hdr(skb)->dest,
98 			      tcp_hdr(skb)->source);
99 }
100 
101 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
102 {
103 	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
104 }
105 
106 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
107 {
108 	const struct inet_timewait_sock *tw = inet_twsk(sktw);
109 	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
110 	struct tcp_sock *tp = tcp_sk(sk);
111 	int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;
112 
113 	if (reuse == 2) {
114 		/* Still does not detect *everything* that goes through
115 		 * lo, since we require a loopback src or dst address
116 		 * or direct binding to 'lo' interface.
117 		 */
118 		bool loopback = false;
119 		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
120 			loopback = true;
121 #if IS_ENABLED(CONFIG_IPV6)
122 		if (tw->tw_family == AF_INET6) {
123 			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
124 			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
125 			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
126 			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
127 				loopback = true;
128 		} else
129 #endif
130 		{
131 			if (ipv4_is_loopback(tw->tw_daddr) ||
132 			    ipv4_is_loopback(tw->tw_rcv_saddr))
133 				loopback = true;
134 		}
135 		if (!loopback)
136 			reuse = 0;
137 	}
138 
139 	/* With PAWS, it is safe from the viewpoint
140 	   of data integrity. Even without PAWS it is safe provided sequence
141 	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.
142 
143 	   Actually, the idea is close to VJ's one, only timestamp cache is
144 	   held not per host, but per port pair and TW bucket is used as state
145 	   holder.
146 
147 	   If TW bucket has been already destroyed we fall back to VJ's scheme
148 	   and use initial timestamp retrieved from peer table.
149 	 */
150 	if (tcptw->tw_ts_recent_stamp &&
151 	    (!twp || (reuse && time_after32(ktime_get_seconds(),
152 					    tcptw->tw_ts_recent_stamp)))) {
153 		/* In case of repair and re-using TIME-WAIT sockets we still
154 		 * want to be sure that it is safe as above but honor the
155 		 * sequence numbers and time stamps set as part of the repair
156 		 * process.
157 		 *
158 		 * Without this check re-using a TIME-WAIT socket with TCP
159 		 * repair would accumulate a -1 on the repair assigned
160 		 * sequence number. The first time it is reused the sequence
161 		 * is -1, the second time -2, etc. This fixes that issue
162 		 * without appearing to create any others.
163 		 */
164 		if (likely(!tp->repair)) {
165 			u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
166 
167 			if (!seq)
168 				seq = 1;
169 			WRITE_ONCE(tp->write_seq, seq);
170 			tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
171 			tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
172 		}
173 		sock_hold(sktw);
174 		return 1;
175 	}
176 
177 	return 0;
178 }
179 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
180 
181 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
182 			      int addr_len)
183 {
184 	/* This check is replicated from tcp_v4_connect() and intended to
185 	 * prevent BPF program called below from accessing bytes that are out
186 	 * of the bound specified by user in addr_len.
187 	 */
188 	if (addr_len < sizeof(struct sockaddr_in))
189 		return -EINVAL;
190 
191 	sock_owned_by_me(sk);
192 
193 	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
194 }
195 
196 /* This will initiate an outgoing connection. */
197 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
198 {
199 	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
200 	struct inet_sock *inet = inet_sk(sk);
201 	struct tcp_sock *tp = tcp_sk(sk);
202 	__be16 orig_sport, orig_dport;
203 	__be32 daddr, nexthop;
204 	struct flowi4 *fl4;
205 	struct rtable *rt;
206 	int err;
207 	struct ip_options_rcu *inet_opt;
208 	struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
209 
210 	if (addr_len < sizeof(struct sockaddr_in))
211 		return -EINVAL;
212 
213 	if (usin->sin_family != AF_INET)
214 		return -EAFNOSUPPORT;
215 
216 	nexthop = daddr = usin->sin_addr.s_addr;
217 	inet_opt = rcu_dereference_protected(inet->inet_opt,
218 					     lockdep_sock_is_held(sk));
219 	if (inet_opt && inet_opt->opt.srr) {
220 		if (!daddr)
221 			return -EINVAL;
222 		nexthop = inet_opt->opt.faddr;
223 	}
224 
225 	orig_sport = inet->inet_sport;
226 	orig_dport = usin->sin_port;
227 	fl4 = &inet->cork.fl.u.ip4;
228 	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
229 			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
230 			      IPPROTO_TCP,
231 			      orig_sport, orig_dport, sk);
232 	if (IS_ERR(rt)) {
233 		err = PTR_ERR(rt);
234 		if (err == -ENETUNREACH)
235 			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
236 		return err;
237 	}
238 
239 	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
240 		ip_rt_put(rt);
241 		return -ENETUNREACH;
242 	}
243 
244 	if (!inet_opt || !inet_opt->opt.srr)
245 		daddr = fl4->daddr;
246 
247 	if (!inet->inet_saddr)
248 		inet->inet_saddr = fl4->saddr;
249 	sk_rcv_saddr_set(sk, inet->inet_saddr);
250 
251 	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
252 		/* Reset inherited state */
253 		tp->rx_opt.ts_recent	   = 0;
254 		tp->rx_opt.ts_recent_stamp = 0;
255 		if (likely(!tp->repair))
256 			WRITE_ONCE(tp->write_seq, 0);
257 	}
258 
259 	inet->inet_dport = usin->sin_port;
260 	sk_daddr_set(sk, daddr);
261 
262 	inet_csk(sk)->icsk_ext_hdr_len = 0;
263 	if (inet_opt)
264 		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
265 
266 	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
267 
268 	/* Socket identity is still unknown (sport may be zero).
269 	 * However we set state to SYN-SENT and not releasing socket
270 	 * lock select source port, enter ourselves into the hash tables and
271 	 * complete initialization after this.
272 	 */
273 	tcp_set_state(sk, TCP_SYN_SENT);
274 	err = inet_hash_connect(tcp_death_row, sk);
275 	if (err)
276 		goto failure;
277 
278 	sk_set_txhash(sk);
279 
280 	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
281 			       inet->inet_sport, inet->inet_dport, sk);
282 	if (IS_ERR(rt)) {
283 		err = PTR_ERR(rt);
284 		rt = NULL;
285 		goto failure;
286 	}
287 	/* OK, now commit destination to socket.  */
288 	sk->sk_gso_type = SKB_GSO_TCPV4;
289 	sk_setup_caps(sk, &rt->dst);
290 	rt = NULL;
291 
292 	if (likely(!tp->repair)) {
293 		if (!tp->write_seq)
294 			WRITE_ONCE(tp->write_seq,
295 				   secure_tcp_seq(inet->inet_saddr,
296 						  inet->inet_daddr,
297 						  inet->inet_sport,
298 						  usin->sin_port));
299 		tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
300 						 inet->inet_saddr,
301 						 inet->inet_daddr);
302 	}
303 
304 	inet->inet_id = prandom_u32();
305 
306 	if (tcp_fastopen_defer_connect(sk, &err))
307 		return err;
308 	if (err)
309 		goto failure;
310 
311 	err = tcp_connect(sk);
312 
313 	if (err)
314 		goto failure;
315 
316 	return 0;
317 
318 failure:
319 	/*
320 	 * This unhashes the socket and releases the local port,
321 	 * if necessary.
322 	 */
323 	tcp_set_state(sk, TCP_CLOSE);
324 	ip_rt_put(rt);
325 	sk->sk_route_caps = 0;
326 	inet->inet_dport = 0;
327 	return err;
328 }
329 EXPORT_SYMBOL(tcp_v4_connect);
330 
331 /*
332  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
333  * It can be called through tcp_release_cb() if socket was owned by user
334  * at the time tcp_v4_err() was called to handle ICMP message.
335  */
336 void tcp_v4_mtu_reduced(struct sock *sk)
337 {
338 	struct inet_sock *inet = inet_sk(sk);
339 	struct dst_entry *dst;
340 	u32 mtu;
341 
342 	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
343 		return;
344 	mtu = tcp_sk(sk)->mtu_info;
345 	dst = inet_csk_update_pmtu(sk, mtu);
346 	if (!dst)
347 		return;
348 
349 	/* Something is about to be wrong... Remember soft error
350 	 * for the case, if this connection will not able to recover.
351 	 */
352 	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
353 		sk->sk_err_soft = EMSGSIZE;
354 
355 	mtu = dst_mtu(dst);
356 
357 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
358 	    ip_sk_accept_pmtu(sk) &&
359 	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
360 		tcp_sync_mss(sk, mtu);
361 
362 		/* Resend the TCP packet because it's
363 		 * clear that the old packet has been
364 		 * dropped. This is the new "fast" path mtu
365 		 * discovery.
366 		 */
367 		tcp_simple_retransmit(sk);
368 	} /* else let the usual retransmit timer handle it */
369 }
370 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
371 
372 static void do_redirect(struct sk_buff *skb, struct sock *sk)
373 {
374 	struct dst_entry *dst = __sk_dst_check(sk, 0);
375 
376 	if (dst)
377 		dst->ops->redirect(dst, sk, skb);
378 }
379 
380 
381 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
382 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
383 {
384 	struct request_sock *req = inet_reqsk(sk);
385 	struct net *net = sock_net(sk);
386 
387 	/* ICMPs are not backlogged, hence we cannot get
388 	 * an established socket here.
389 	 */
390 	if (seq != tcp_rsk(req)->snt_isn) {
391 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
392 	} else if (abort) {
393 		/*
394 		 * Still in SYN_RECV, just remove it silently.
395 		 * There is no good way to pass the error to the newly
396 		 * created socket, and POSIX does not want network
397 		 * errors returned from accept().
398 		 */
399 		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
400 		tcp_listendrop(req->rsk_listener);
401 	}
402 	reqsk_put(req);
403 }
404 EXPORT_SYMBOL(tcp_req_err);
405 
406 /* TCP-LD (RFC 6069) logic */
407 void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
408 {
409 	struct inet_connection_sock *icsk = inet_csk(sk);
410 	struct tcp_sock *tp = tcp_sk(sk);
411 	struct sk_buff *skb;
412 	s32 remaining;
413 	u32 delta_us;
414 
415 	if (sock_owned_by_user(sk))
416 		return;
417 
418 	if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
419 	    !icsk->icsk_backoff)
420 		return;
421 
422 	skb = tcp_rtx_queue_head(sk);
423 	if (WARN_ON_ONCE(!skb))
424 		return;
425 
426 	icsk->icsk_backoff--;
427 	icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
428 	icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
429 
430 	tcp_mstamp_refresh(tp);
431 	delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
432 	remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
433 
434 	if (remaining > 0) {
435 		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
436 					  remaining, TCP_RTO_MAX);
437 	} else {
438 		/* RTO revert clocked out retransmission.
439 		 * Will retransmit now.
440 		 */
441 		tcp_retransmit_timer(sk);
442 	}
443 }
444 EXPORT_SYMBOL(tcp_ld_RTO_revert);
445 
446 /*
447  * This routine is called by the ICMP module when it gets some
448  * sort of error condition.  If err < 0 then the socket should
449  * be closed and the error returned to the user.  If err > 0
450  * it's just the icmp type << 8 | icmp code.  After adjustment
451  * header points to the first 8 bytes of the tcp header.  We need
452  * to find the appropriate port.
453  *
454  * The locking strategy used here is very "optimistic". When
455  * someone else accesses the socket the ICMP is just dropped
456  * and for some paths there is no check at all.
457  * A more general error queue to queue errors for later handling
458  * is probably better.
459  *
460  */
461 
462 int tcp_v4_err(struct sk_buff *skb, u32 info)
463 {
464 	const struct iphdr *iph = (const struct iphdr *)skb->data;
465 	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
466 	struct tcp_sock *tp;
467 	struct inet_sock *inet;
468 	const int type = icmp_hdr(skb)->type;
469 	const int code = icmp_hdr(skb)->code;
470 	struct sock *sk;
471 	struct request_sock *fastopen;
472 	u32 seq, snd_una;
473 	int err;
474 	struct net *net = dev_net(skb->dev);
475 
476 	sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
477 				       th->dest, iph->saddr, ntohs(th->source),
478 				       inet_iif(skb), 0);
479 	if (!sk) {
480 		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
481 		return -ENOENT;
482 	}
483 	if (sk->sk_state == TCP_TIME_WAIT) {
484 		inet_twsk_put(inet_twsk(sk));
485 		return 0;
486 	}
487 	seq = ntohl(th->seq);
488 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
489 		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
490 				     type == ICMP_TIME_EXCEEDED ||
491 				     (type == ICMP_DEST_UNREACH &&
492 				      (code == ICMP_NET_UNREACH ||
493 				       code == ICMP_HOST_UNREACH)));
494 		return 0;
495 	}
496 
497 	bh_lock_sock(sk);
498 	/* If too many ICMPs get dropped on busy
499 	 * servers this needs to be solved differently.
500 	 * We do take care of PMTU discovery (RFC1191) special case :
501 	 * we can receive locally generated ICMP messages while socket is held.
502 	 */
503 	if (sock_owned_by_user(sk)) {
504 		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
505 			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
506 	}
507 	if (sk->sk_state == TCP_CLOSE)
508 		goto out;
509 
510 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
511 		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
512 		goto out;
513 	}
514 
515 	tp = tcp_sk(sk);
516 	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
517 	fastopen = rcu_dereference(tp->fastopen_rsk);
518 	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
519 	if (sk->sk_state != TCP_LISTEN &&
520 	    !between(seq, snd_una, tp->snd_nxt)) {
521 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
522 		goto out;
523 	}
524 
525 	switch (type) {
526 	case ICMP_REDIRECT:
527 		if (!sock_owned_by_user(sk))
528 			do_redirect(skb, sk);
529 		goto out;
530 	case ICMP_SOURCE_QUENCH:
531 		/* Just silently ignore these. */
532 		goto out;
533 	case ICMP_PARAMETERPROB:
534 		err = EPROTO;
535 		break;
536 	case ICMP_DEST_UNREACH:
537 		if (code > NR_ICMP_UNREACH)
538 			goto out;
539 
540 		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
541 			/* We are not interested in TCP_LISTEN and open_requests
542 			 * (SYN-ACKs send out by Linux are always <576bytes so
543 			 * they should go through unfragmented).
544 			 */
545 			if (sk->sk_state == TCP_LISTEN)
546 				goto out;
547 
548 			tp->mtu_info = info;
549 			if (!sock_owned_by_user(sk)) {
550 				tcp_v4_mtu_reduced(sk);
551 			} else {
552 				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
553 					sock_hold(sk);
554 			}
555 			goto out;
556 		}
557 
558 		err = icmp_err_convert[code].errno;
559 		/* check if this ICMP message allows revert of backoff.
560 		 * (see RFC 6069)
561 		 */
562 		if (!fastopen &&
563 		    (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
564 			tcp_ld_RTO_revert(sk, seq);
565 		break;
566 	case ICMP_TIME_EXCEEDED:
567 		err = EHOSTUNREACH;
568 		break;
569 	default:
570 		goto out;
571 	}
572 
573 	switch (sk->sk_state) {
574 	case TCP_SYN_SENT:
575 	case TCP_SYN_RECV:
576 		/* Only in fast or simultaneous open. If a fast open socket is
577 		 * is already accepted it is treated as a connected one below.
578 		 */
579 		if (fastopen && !fastopen->sk)
580 			break;
581 
582 		ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
583 
584 		if (!sock_owned_by_user(sk)) {
585 			sk->sk_err = err;
586 
587 			sk->sk_error_report(sk);
588 
589 			tcp_done(sk);
590 		} else {
591 			sk->sk_err_soft = err;
592 		}
593 		goto out;
594 	}
595 
596 	/* If we've already connected we will keep trying
597 	 * until we time out, or the user gives up.
598 	 *
599 	 * rfc1122 4.2.3.9 allows to consider as hard errors
600 	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
601 	 * but it is obsoleted by pmtu discovery).
602 	 *
603 	 * Note, that in modern internet, where routing is unreliable
604 	 * and in each dark corner broken firewalls sit, sending random
605 	 * errors ordered by their masters even this two messages finally lose
606 	 * their original sense (even Linux sends invalid PORT_UNREACHs)
607 	 *
608 	 * Now we are in compliance with RFCs.
609 	 *							--ANK (980905)
610 	 */
611 
612 	inet = inet_sk(sk);
613 	if (!sock_owned_by_user(sk) && inet->recverr) {
614 		sk->sk_err = err;
615 		sk->sk_error_report(sk);
616 	} else	{ /* Only an error on timeout */
617 		sk->sk_err_soft = err;
618 	}
619 
620 out:
621 	bh_unlock_sock(sk);
622 	sock_put(sk);
623 	return 0;
624 }
625 
626 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
627 {
628 	struct tcphdr *th = tcp_hdr(skb);
629 
630 	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
631 	skb->csum_start = skb_transport_header(skb) - skb->head;
632 	skb->csum_offset = offsetof(struct tcphdr, check);
633 }
634 
635 /* This routine computes an IPv4 TCP checksum. */
636 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
637 {
638 	const struct inet_sock *inet = inet_sk(sk);
639 
640 	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
641 }
642 EXPORT_SYMBOL(tcp_v4_send_check);
643 
644 /*
645  *	This routine will send an RST to the other tcp.
646  *
647  *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
648  *		      for reset.
649  *	Answer: if a packet caused RST, it is not for a socket
650  *		existing in our system, if it is matched to a socket,
651  *		it is just duplicate segment or bug in other side's TCP.
652  *		So that we build reply only basing on parameters
653  *		arrived with segment.
654  *	Exception: precedence violation. We do not implement it in any case.
655  */
656 
657 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
658 {
659 	const struct tcphdr *th = tcp_hdr(skb);
660 	struct {
661 		struct tcphdr th;
662 #ifdef CONFIG_TCP_MD5SIG
663 		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
664 #endif
665 	} rep;
666 	struct ip_reply_arg arg;
667 #ifdef CONFIG_TCP_MD5SIG
668 	struct tcp_md5sig_key *key = NULL;
669 	const __u8 *hash_location = NULL;
670 	unsigned char newhash[16];
671 	int genhash;
672 	struct sock *sk1 = NULL;
673 #endif
674 	u64 transmit_time = 0;
675 	struct sock *ctl_sk;
676 	struct net *net;
677 
678 	/* Never send a reset in response to a reset. */
679 	if (th->rst)
680 		return;
681 
682 	/* If sk not NULL, it means we did a successful lookup and incoming
683 	 * route had to be correct. prequeue might have dropped our dst.
684 	 */
685 	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
686 		return;
687 
688 	/* Swap the send and the receive. */
689 	memset(&rep, 0, sizeof(rep));
690 	rep.th.dest   = th->source;
691 	rep.th.source = th->dest;
692 	rep.th.doff   = sizeof(struct tcphdr) / 4;
693 	rep.th.rst    = 1;
694 
695 	if (th->ack) {
696 		rep.th.seq = th->ack_seq;
697 	} else {
698 		rep.th.ack = 1;
699 		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
700 				       skb->len - (th->doff << 2));
701 	}
702 
703 	memset(&arg, 0, sizeof(arg));
704 	arg.iov[0].iov_base = (unsigned char *)&rep;
705 	arg.iov[0].iov_len  = sizeof(rep.th);
706 
707 	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
708 #ifdef CONFIG_TCP_MD5SIG
709 	rcu_read_lock();
710 	hash_location = tcp_parse_md5sig_option(th);
711 	if (sk && sk_fullsock(sk)) {
712 		const union tcp_md5_addr *addr;
713 		int l3index;
714 
715 		/* sdif set, means packet ingressed via a device
716 		 * in an L3 domain and inet_iif is set to it.
717 		 */
718 		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
719 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
720 		key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
721 	} else if (hash_location) {
722 		const union tcp_md5_addr *addr;
723 		int sdif = tcp_v4_sdif(skb);
724 		int dif = inet_iif(skb);
725 		int l3index;
726 
727 		/*
728 		 * active side is lost. Try to find listening socket through
729 		 * source port, and then find md5 key through listening socket.
730 		 * we are not loose security here:
731 		 * Incoming packet is checked with md5 hash with finding key,
732 		 * no RST generated if md5 hash doesn't match.
733 		 */
734 		sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
735 					     ip_hdr(skb)->saddr,
736 					     th->source, ip_hdr(skb)->daddr,
737 					     ntohs(th->source), dif, sdif);
738 		/* don't send rst if it can't find key */
739 		if (!sk1)
740 			goto out;
741 
742 		/* sdif set, means packet ingressed via a device
743 		 * in an L3 domain and dif is set to it.
744 		 */
745 		l3index = sdif ? dif : 0;
746 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
747 		key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
748 		if (!key)
749 			goto out;
750 
751 
752 		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
753 		if (genhash || memcmp(hash_location, newhash, 16) != 0)
754 			goto out;
755 
756 	}
757 
758 	if (key) {
759 		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
760 				   (TCPOPT_NOP << 16) |
761 				   (TCPOPT_MD5SIG << 8) |
762 				   TCPOLEN_MD5SIG);
763 		/* Update length and the length the header thinks exists */
764 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
765 		rep.th.doff = arg.iov[0].iov_len / 4;
766 
767 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
768 				     key, ip_hdr(skb)->saddr,
769 				     ip_hdr(skb)->daddr, &rep.th);
770 	}
771 #endif
772 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
773 				      ip_hdr(skb)->saddr, /* XXX */
774 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
775 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
776 	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
777 
778 	/* When socket is gone, all binding information is lost.
779 	 * routing might fail in this case. No choice here, if we choose to force
780 	 * input interface, we will misroute in case of asymmetric route.
781 	 */
782 	if (sk) {
783 		arg.bound_dev_if = sk->sk_bound_dev_if;
784 		if (sk_fullsock(sk))
785 			trace_tcp_send_reset(sk, skb);
786 	}
787 
788 	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
789 		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));
790 
791 	arg.tos = ip_hdr(skb)->tos;
792 	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
793 	local_bh_disable();
794 	ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
795 	if (sk) {
796 		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
797 				   inet_twsk(sk)->tw_mark : sk->sk_mark;
798 		ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
799 				   inet_twsk(sk)->tw_priority : sk->sk_priority;
800 		transmit_time = tcp_transmit_time(sk);
801 	}
802 	ip_send_unicast_reply(ctl_sk,
803 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
804 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
805 			      &arg, arg.iov[0].iov_len,
806 			      transmit_time);
807 
808 	ctl_sk->sk_mark = 0;
809 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
810 	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
811 	local_bh_enable();
812 
813 #ifdef CONFIG_TCP_MD5SIG
814 out:
815 	rcu_read_unlock();
816 #endif
817 }
818 
819 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
820    outside socket context is ugly, certainly. What can I do?
821  */
822 
823 static void tcp_v4_send_ack(const struct sock *sk,
824 			    struct sk_buff *skb, u32 seq, u32 ack,
825 			    u32 win, u32 tsval, u32 tsecr, int oif,
826 			    struct tcp_md5sig_key *key,
827 			    int reply_flags, u8 tos)
828 {
829 	const struct tcphdr *th = tcp_hdr(skb);
830 	struct {
831 		struct tcphdr th;
832 		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
833 #ifdef CONFIG_TCP_MD5SIG
834 			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
835 #endif
836 			];
837 	} rep;
838 	struct net *net = sock_net(sk);
839 	struct ip_reply_arg arg;
840 	struct sock *ctl_sk;
841 	u64 transmit_time;
842 
843 	memset(&rep.th, 0, sizeof(struct tcphdr));
844 	memset(&arg, 0, sizeof(arg));
845 
846 	arg.iov[0].iov_base = (unsigned char *)&rep;
847 	arg.iov[0].iov_len  = sizeof(rep.th);
848 	if (tsecr) {
849 		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
850 				   (TCPOPT_TIMESTAMP << 8) |
851 				   TCPOLEN_TIMESTAMP);
852 		rep.opt[1] = htonl(tsval);
853 		rep.opt[2] = htonl(tsecr);
854 		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
855 	}
856 
857 	/* Swap the send and the receive. */
858 	rep.th.dest    = th->source;
859 	rep.th.source  = th->dest;
860 	rep.th.doff    = arg.iov[0].iov_len / 4;
861 	rep.th.seq     = htonl(seq);
862 	rep.th.ack_seq = htonl(ack);
863 	rep.th.ack     = 1;
864 	rep.th.window  = htons(win);
865 
866 #ifdef CONFIG_TCP_MD5SIG
867 	if (key) {
868 		int offset = (tsecr) ? 3 : 0;
869 
870 		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
871 					  (TCPOPT_NOP << 16) |
872 					  (TCPOPT_MD5SIG << 8) |
873 					  TCPOLEN_MD5SIG);
874 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
875 		rep.th.doff = arg.iov[0].iov_len/4;
876 
877 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
878 				    key, ip_hdr(skb)->saddr,
879 				    ip_hdr(skb)->daddr, &rep.th);
880 	}
881 #endif
882 	arg.flags = reply_flags;
883 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
884 				      ip_hdr(skb)->saddr, /* XXX */
885 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
886 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
887 	if (oif)
888 		arg.bound_dev_if = oif;
889 	arg.tos = tos;
890 	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
891 	local_bh_disable();
892 	ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
893 	ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
894 			   inet_twsk(sk)->tw_mark : sk->sk_mark;
895 	ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
896 			   inet_twsk(sk)->tw_priority : sk->sk_priority;
897 	transmit_time = tcp_transmit_time(sk);
898 	ip_send_unicast_reply(ctl_sk,
899 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
900 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
901 			      &arg, arg.iov[0].iov_len,
902 			      transmit_time);
903 
904 	ctl_sk->sk_mark = 0;
905 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
906 	local_bh_enable();
907 }
908 
909 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
910 {
911 	struct inet_timewait_sock *tw = inet_twsk(sk);
912 	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
913 
914 	tcp_v4_send_ack(sk, skb,
915 			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
916 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
917 			tcp_time_stamp_raw() + tcptw->tw_ts_offset,
918 			tcptw->tw_ts_recent,
919 			tw->tw_bound_dev_if,
920 			tcp_twsk_md5_key(tcptw),
921 			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
922 			tw->tw_tos
923 			);
924 
925 	inet_twsk_put(tw);
926 }
927 
928 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
929 				  struct request_sock *req)
930 {
931 	const union tcp_md5_addr *addr;
932 	int l3index;
933 
934 	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
935 	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
936 	 */
937 	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
938 					     tcp_sk(sk)->snd_nxt;
939 
940 	/* RFC 7323 2.3
941 	 * The window field (SEG.WND) of every outgoing segment, with the
942 	 * exception of <SYN> segments, MUST be right-shifted by
943 	 * Rcv.Wind.Shift bits:
944 	 */
945 	addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
946 	l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
947 	tcp_v4_send_ack(sk, skb, seq,
948 			tcp_rsk(req)->rcv_nxt,
949 			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
950 			tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
951 			req->ts_recent,
952 			0,
953 			tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
954 			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
955 			ip_hdr(skb)->tos);
956 }
957 
958 /*
959  *	Send a SYN-ACK after having received a SYN.
960  *	This still operates on a request_sock only, not on a big
961  *	socket.
962  */
963 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
964 			      struct flowi *fl,
965 			      struct request_sock *req,
966 			      struct tcp_fastopen_cookie *foc,
967 			      enum tcp_synack_type synack_type)
968 {
969 	const struct inet_request_sock *ireq = inet_rsk(req);
970 	struct flowi4 fl4;
971 	int err = -1;
972 	struct sk_buff *skb;
973 
974 	/* First, grab a route. */
975 	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
976 		return -1;
977 
978 	skb = tcp_make_synack(sk, dst, req, foc, synack_type);
979 
980 	if (skb) {
981 		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
982 
983 		rcu_read_lock();
984 		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
985 					    ireq->ir_rmt_addr,
986 					    rcu_dereference(ireq->ireq_opt));
987 		rcu_read_unlock();
988 		err = net_xmit_eval(err);
989 	}
990 
991 	return err;
992 }
993 
994 /*
995  *	IPv4 request_sock destructor.
996  */
997 static void tcp_v4_reqsk_destructor(struct request_sock *req)
998 {
999 	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1000 }
1001 
1002 #ifdef CONFIG_TCP_MD5SIG
1003 /*
1004  * RFC2385 MD5 checksumming requires a mapping of
1005  * IP address->MD5 Key.
1006  * We need to maintain these in the sk structure.
1007  */
1008 
1009 DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
1010 EXPORT_SYMBOL(tcp_md5_needed);
1011 
1012 /* Find the Key structure for an address.  */
1013 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1014 					   const union tcp_md5_addr *addr,
1015 					   int family)
1016 {
1017 	const struct tcp_sock *tp = tcp_sk(sk);
1018 	struct tcp_md5sig_key *key;
1019 	const struct tcp_md5sig_info *md5sig;
1020 	__be32 mask;
1021 	struct tcp_md5sig_key *best_match = NULL;
1022 	bool match;
1023 
1024 	/* caller either holds rcu_read_lock() or socket lock */
1025 	md5sig = rcu_dereference_check(tp->md5sig_info,
1026 				       lockdep_sock_is_held(sk));
1027 	if (!md5sig)
1028 		return NULL;
1029 
1030 	hlist_for_each_entry_rcu(key, &md5sig->head, node,
1031 				 lockdep_sock_is_held(sk)) {
1032 		if (key->family != family)
1033 			continue;
1034 		if (key->l3index && key->l3index != l3index)
1035 			continue;
1036 		if (family == AF_INET) {
1037 			mask = inet_make_mask(key->prefixlen);
1038 			match = (key->addr.a4.s_addr & mask) ==
1039 				(addr->a4.s_addr & mask);
1040 #if IS_ENABLED(CONFIG_IPV6)
1041 		} else if (family == AF_INET6) {
1042 			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1043 						  key->prefixlen);
1044 #endif
1045 		} else {
1046 			match = false;
1047 		}
1048 
1049 		if (match && (!best_match ||
1050 			      key->prefixlen > best_match->prefixlen))
1051 			best_match = key;
1052 	}
1053 	return best_match;
1054 }
1055 EXPORT_SYMBOL(__tcp_md5_do_lookup);
1056 
1057 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1058 						      const union tcp_md5_addr *addr,
1059 						      int family, u8 prefixlen,
1060 						      int l3index)
1061 {
1062 	const struct tcp_sock *tp = tcp_sk(sk);
1063 	struct tcp_md5sig_key *key;
1064 	unsigned int size = sizeof(struct in_addr);
1065 	const struct tcp_md5sig_info *md5sig;
1066 
1067 	/* caller either holds rcu_read_lock() or socket lock */
1068 	md5sig = rcu_dereference_check(tp->md5sig_info,
1069 				       lockdep_sock_is_held(sk));
1070 	if (!md5sig)
1071 		return NULL;
1072 #if IS_ENABLED(CONFIG_IPV6)
1073 	if (family == AF_INET6)
1074 		size = sizeof(struct in6_addr);
1075 #endif
1076 	hlist_for_each_entry_rcu(key, &md5sig->head, node,
1077 				 lockdep_sock_is_held(sk)) {
1078 		if (key->family != family)
1079 			continue;
1080 		if (key->l3index && key->l3index != l3index)
1081 			continue;
1082 		if (!memcmp(&key->addr, addr, size) &&
1083 		    key->prefixlen == prefixlen)
1084 			return key;
1085 	}
1086 	return NULL;
1087 }
1088 
1089 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1090 					 const struct sock *addr_sk)
1091 {
1092 	const union tcp_md5_addr *addr;
1093 	int l3index;
1094 
1095 	l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1096 						 addr_sk->sk_bound_dev_if);
1097 	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1098 	return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1099 }
1100 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1101 
1102 /* This can be called on a newly created socket, from other files */
1103 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1104 		   int family, u8 prefixlen, int l3index,
1105 		   const u8 *newkey, u8 newkeylen, gfp_t gfp)
1106 {
1107 	/* Add Key to the list */
1108 	struct tcp_md5sig_key *key;
1109 	struct tcp_sock *tp = tcp_sk(sk);
1110 	struct tcp_md5sig_info *md5sig;
1111 
1112 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
1113 	if (key) {
1114 		/* Pre-existing entry - just update that one. */
1115 		memcpy(key->key, newkey, newkeylen);
1116 		key->keylen = newkeylen;
1117 		return 0;
1118 	}
1119 
1120 	md5sig = rcu_dereference_protected(tp->md5sig_info,
1121 					   lockdep_sock_is_held(sk));
1122 	if (!md5sig) {
1123 		md5sig = kmalloc(sizeof(*md5sig), gfp);
1124 		if (!md5sig)
1125 			return -ENOMEM;
1126 
1127 		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1128 		INIT_HLIST_HEAD(&md5sig->head);
1129 		rcu_assign_pointer(tp->md5sig_info, md5sig);
1130 	}
1131 
1132 	key = sock_kmalloc(sk, sizeof(*key), gfp);
1133 	if (!key)
1134 		return -ENOMEM;
1135 	if (!tcp_alloc_md5sig_pool()) {
1136 		sock_kfree_s(sk, key, sizeof(*key));
1137 		return -ENOMEM;
1138 	}
1139 
1140 	memcpy(key->key, newkey, newkeylen);
1141 	key->keylen = newkeylen;
1142 	key->family = family;
1143 	key->prefixlen = prefixlen;
1144 	key->l3index = l3index;
1145 	memcpy(&key->addr, addr,
1146 	       (family == AF_INET6) ? sizeof(struct in6_addr) :
1147 				      sizeof(struct in_addr));
1148 	hlist_add_head_rcu(&key->node, &md5sig->head);
1149 	return 0;
1150 }
1151 EXPORT_SYMBOL(tcp_md5_do_add);
1152 
1153 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1154 		   u8 prefixlen, int l3index)
1155 {
1156 	struct tcp_md5sig_key *key;
1157 
1158 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
1159 	if (!key)
1160 		return -ENOENT;
1161 	hlist_del_rcu(&key->node);
1162 	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1163 	kfree_rcu(key, rcu);
1164 	return 0;
1165 }
1166 EXPORT_SYMBOL(tcp_md5_do_del);
1167 
1168 static void tcp_clear_md5_list(struct sock *sk)
1169 {
1170 	struct tcp_sock *tp = tcp_sk(sk);
1171 	struct tcp_md5sig_key *key;
1172 	struct hlist_node *n;
1173 	struct tcp_md5sig_info *md5sig;
1174 
1175 	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1176 
1177 	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1178 		hlist_del_rcu(&key->node);
1179 		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1180 		kfree_rcu(key, rcu);
1181 	}
1182 }
1183 
1184 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1185 				 char __user *optval, int optlen)
1186 {
1187 	struct tcp_md5sig cmd;
1188 	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1189 	const union tcp_md5_addr *addr;
1190 	u8 prefixlen = 32;
1191 	int l3index = 0;
1192 
1193 	if (optlen < sizeof(cmd))
1194 		return -EINVAL;
1195 
1196 	if (copy_from_user(&cmd, optval, sizeof(cmd)))
1197 		return -EFAULT;
1198 
1199 	if (sin->sin_family != AF_INET)
1200 		return -EINVAL;
1201 
1202 	if (optname == TCP_MD5SIG_EXT &&
1203 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1204 		prefixlen = cmd.tcpm_prefixlen;
1205 		if (prefixlen > 32)
1206 			return -EINVAL;
1207 	}
1208 
1209 	if (optname == TCP_MD5SIG_EXT &&
1210 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1211 		struct net_device *dev;
1212 
1213 		rcu_read_lock();
1214 		dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1215 		if (dev && netif_is_l3_master(dev))
1216 			l3index = dev->ifindex;
1217 
1218 		rcu_read_unlock();
1219 
1220 		/* ok to reference set/not set outside of rcu;
1221 		 * right now device MUST be an L3 master
1222 		 */
1223 		if (!dev || !l3index)
1224 			return -EINVAL;
1225 	}
1226 
1227 	addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1228 
1229 	if (!cmd.tcpm_keylen)
1230 		return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index);
1231 
1232 	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1233 		return -EINVAL;
1234 
1235 	return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index,
1236 			      cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
1237 }
1238 
1239 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1240 				   __be32 daddr, __be32 saddr,
1241 				   const struct tcphdr *th, int nbytes)
1242 {
1243 	struct tcp4_pseudohdr *bp;
1244 	struct scatterlist sg;
1245 	struct tcphdr *_th;
1246 
1247 	bp = hp->scratch;
1248 	bp->saddr = saddr;
1249 	bp->daddr = daddr;
1250 	bp->pad = 0;
1251 	bp->protocol = IPPROTO_TCP;
1252 	bp->len = cpu_to_be16(nbytes);
1253 
1254 	_th = (struct tcphdr *)(bp + 1);
1255 	memcpy(_th, th, sizeof(*th));
1256 	_th->check = 0;
1257 
1258 	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1259 	ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1260 				sizeof(*bp) + sizeof(*th));
1261 	return crypto_ahash_update(hp->md5_req);
1262 }
1263 
1264 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1265 			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
1266 {
1267 	struct tcp_md5sig_pool *hp;
1268 	struct ahash_request *req;
1269 
1270 	hp = tcp_get_md5sig_pool();
1271 	if (!hp)
1272 		goto clear_hash_noput;
1273 	req = hp->md5_req;
1274 
1275 	if (crypto_ahash_init(req))
1276 		goto clear_hash;
1277 	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1278 		goto clear_hash;
1279 	if (tcp_md5_hash_key(hp, key))
1280 		goto clear_hash;
1281 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1282 	if (crypto_ahash_final(req))
1283 		goto clear_hash;
1284 
1285 	tcp_put_md5sig_pool();
1286 	return 0;
1287 
1288 clear_hash:
1289 	tcp_put_md5sig_pool();
1290 clear_hash_noput:
1291 	memset(md5_hash, 0, 16);
1292 	return 1;
1293 }
1294 
1295 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1296 			const struct sock *sk,
1297 			const struct sk_buff *skb)
1298 {
1299 	struct tcp_md5sig_pool *hp;
1300 	struct ahash_request *req;
1301 	const struct tcphdr *th = tcp_hdr(skb);
1302 	__be32 saddr, daddr;
1303 
1304 	if (sk) { /* valid for establish/request sockets */
1305 		saddr = sk->sk_rcv_saddr;
1306 		daddr = sk->sk_daddr;
1307 	} else {
1308 		const struct iphdr *iph = ip_hdr(skb);
1309 		saddr = iph->saddr;
1310 		daddr = iph->daddr;
1311 	}
1312 
1313 	hp = tcp_get_md5sig_pool();
1314 	if (!hp)
1315 		goto clear_hash_noput;
1316 	req = hp->md5_req;
1317 
1318 	if (crypto_ahash_init(req))
1319 		goto clear_hash;
1320 
1321 	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1322 		goto clear_hash;
1323 	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1324 		goto clear_hash;
1325 	if (tcp_md5_hash_key(hp, key))
1326 		goto clear_hash;
1327 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1328 	if (crypto_ahash_final(req))
1329 		goto clear_hash;
1330 
1331 	tcp_put_md5sig_pool();
1332 	return 0;
1333 
1334 clear_hash:
1335 	tcp_put_md5sig_pool();
1336 clear_hash_noput:
1337 	memset(md5_hash, 0, 16);
1338 	return 1;
1339 }
1340 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1341 
1342 #endif
1343 
1344 /* Called with rcu_read_lock() */
1345 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1346 				    const struct sk_buff *skb,
1347 				    int dif, int sdif)
1348 {
1349 #ifdef CONFIG_TCP_MD5SIG
1350 	/*
1351 	 * This gets called for each TCP segment that arrives
1352 	 * so we want to be efficient.
1353 	 * We have 3 drop cases:
1354 	 * o No MD5 hash and one expected.
1355 	 * o MD5 hash and we're not expecting one.
1356 	 * o MD5 hash and its wrong.
1357 	 */
1358 	const __u8 *hash_location = NULL;
1359 	struct tcp_md5sig_key *hash_expected;
1360 	const struct iphdr *iph = ip_hdr(skb);
1361 	const struct tcphdr *th = tcp_hdr(skb);
1362 	const union tcp_md5_addr *addr;
1363 	unsigned char newhash[16];
1364 	int genhash, l3index;
1365 
1366 	/* sdif set, means packet ingressed via a device
1367 	 * in an L3 domain and dif is set to the l3mdev
1368 	 */
1369 	l3index = sdif ? dif : 0;
1370 
1371 	addr = (union tcp_md5_addr *)&iph->saddr;
1372 	hash_expected = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1373 	hash_location = tcp_parse_md5sig_option(th);
1374 
1375 	/* We've parsed the options - do we have a hash? */
1376 	if (!hash_expected && !hash_location)
1377 		return false;
1378 
1379 	if (hash_expected && !hash_location) {
1380 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1381 		return true;
1382 	}
1383 
1384 	if (!hash_expected && hash_location) {
1385 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1386 		return true;
1387 	}
1388 
1389 	/* Okay, so this is hash_expected and hash_location -
1390 	 * so we need to calculate the checksum.
1391 	 */
1392 	genhash = tcp_v4_md5_hash_skb(newhash,
1393 				      hash_expected,
1394 				      NULL, skb);
1395 
1396 	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1397 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1398 		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s L3 index %d\n",
1399 				     &iph->saddr, ntohs(th->source),
1400 				     &iph->daddr, ntohs(th->dest),
1401 				     genhash ? " tcp_v4_calc_md5_hash failed"
1402 				     : "", l3index);
1403 		return true;
1404 	}
1405 	return false;
1406 #endif
1407 	return false;
1408 }
1409 
1410 static void tcp_v4_init_req(struct request_sock *req,
1411 			    const struct sock *sk_listener,
1412 			    struct sk_buff *skb)
1413 {
1414 	struct inet_request_sock *ireq = inet_rsk(req);
1415 	struct net *net = sock_net(sk_listener);
1416 
1417 	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1418 	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1419 	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1420 }
1421 
1422 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1423 					  struct flowi *fl,
1424 					  const struct request_sock *req)
1425 {
1426 	return inet_csk_route_req(sk, &fl->u.ip4, req);
1427 }
1428 
1429 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1430 	.family		=	PF_INET,
1431 	.obj_size	=	sizeof(struct tcp_request_sock),
1432 	.rtx_syn_ack	=	tcp_rtx_synack,
1433 	.send_ack	=	tcp_v4_reqsk_send_ack,
1434 	.destructor	=	tcp_v4_reqsk_destructor,
1435 	.send_reset	=	tcp_v4_send_reset,
1436 	.syn_ack_timeout =	tcp_syn_ack_timeout,
1437 };
1438 
1439 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1440 	.mss_clamp	=	TCP_MSS_DEFAULT,
1441 #ifdef CONFIG_TCP_MD5SIG
1442 	.req_md5_lookup	=	tcp_v4_md5_lookup,
1443 	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1444 #endif
1445 	.init_req	=	tcp_v4_init_req,
1446 #ifdef CONFIG_SYN_COOKIES
1447 	.cookie_init_seq =	cookie_v4_init_sequence,
1448 #endif
1449 	.route_req	=	tcp_v4_route_req,
1450 	.init_seq	=	tcp_v4_init_seq,
1451 	.init_ts_off	=	tcp_v4_init_ts_off,
1452 	.send_synack	=	tcp_v4_send_synack,
1453 };
1454 
1455 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1456 {
1457 	/* Never answer to SYNs send to broadcast or multicast */
1458 	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1459 		goto drop;
1460 
1461 	return tcp_conn_request(&tcp_request_sock_ops,
1462 				&tcp_request_sock_ipv4_ops, sk, skb);
1463 
1464 drop:
1465 	tcp_listendrop(sk);
1466 	return 0;
1467 }
1468 EXPORT_SYMBOL(tcp_v4_conn_request);
1469 
1470 
/*
 * The three way handshake has completed - we got a valid ACK for our
 * SYN-ACK - now create the new socket.
 *
 * @sk is the listener, @req the request the child is created from and
 * @dst an optional, already resolved route (a route is looked up here
 * when it is NULL). *@own_req is set to the result of inserting the
 * child into the established hash.
 *
 * Returns the new child socket, or NULL after accounting a listen drop
 * when the accept queue is full, the child cannot be allocated, no route
 * is found or the local port cannot be inherited.
 */
struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct dst_entry *dst,
				  struct request_sock *req_unhash,
				  bool *own_req)
{
	struct inet_request_sock *ireq;
	struct inet_sock *newinet;
	struct tcp_sock *newtp;
	struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
	const union tcp_md5_addr *addr;
	struct tcp_md5sig_key *key;
	int l3index;
#endif
	struct ip_options_rcu *inet_opt;

	if (sk_acceptq_is_full(sk))
		goto exit_overflow;

	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit_nonewsk;

	newsk->sk_gso_type = SKB_GSO_TCPV4;
	inet_sk_rx_dst_set(newsk, skb);

	/* Populate the child's addressing from the request sock. */
	newtp		      = tcp_sk(newsk);
	newinet		      = inet_sk(newsk);
	ireq		      = inet_rsk(req);
	sk_daddr_set(newsk, ireq->ir_rmt_addr);
	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
	newsk->sk_bound_dev_if = ireq->ir_iif;
	newinet->inet_saddr   = ireq->ir_loc_addr;
	/* The child shares the request's saved IP options for now; which
	 * of the two keeps the pointer is settled at the end.
	 */
	inet_opt	      = rcu_dereference(ireq->ireq_opt);
	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
	newinet->mc_index     = inet_iif(skb);
	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
	newinet->rcv_tos      = ip_hdr(skb)->tos;
	inet_csk(newsk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
	newinet->inet_id = prandom_u32();

	if (!dst) {
		dst = inet_csk_route_child_sock(sk, newsk, req);
		if (!dst)
			goto put_and_exit;
	} else {
		/* syncookie case : see end of cookie_v4_check() */
	}
	sk_setup_caps(newsk, dst);

	tcp_ca_openreq_child(newsk, dst);

	tcp_sync_mss(newsk, dst_mtu(dst));
	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));

	tcp_initialize_rcv_mss(newsk);

#ifdef CONFIG_TCP_MD5SIG
	l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
	/* Copy over the MD5 key from the original socket */
	addr = (union tcp_md5_addr *)&newinet->inet_daddr;
	key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
	if (key) {
		/*
		 * We're using one, so create a matching key
		 * on the newsk structure. If we fail to get
		 * memory, then we end up not copying the key
		 * across. Shucks.
		 */
		/* The child gets a host (/32) key for its peer, whatever
		 * prefix length the listener's matching key had.
		 */
		tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index,
			       key->key, key->keylen, GFP_ATOMIC);
		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
	}
#endif

	if (__inet_inherit_port(sk, newsk) < 0)
		goto put_and_exit;
	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
	if (likely(*own_req)) {
		tcp_move_syn(newtp, req);
		/* The child keeps the options: clear the request's pointer
		 * so the request destructor does not free them.
		 */
		ireq->ireq_opt = NULL;
	} else {
		/* The request keeps the options: detach them from the child. */
		newinet->inet_opt = NULL;
	}
	return newsk;

exit_overflow:
	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
exit_nonewsk:
	dst_release(dst);
exit:
	tcp_listendrop(sk);
	return NULL;
put_and_exit:
	/* The options still belong to the request; detach them from the
	 * child before it is torn down.
	 */
	newinet->inet_opt = NULL;
	inet_csk_prepare_forced_close(newsk);
	tcp_done(newsk);
	goto exit;
}
1577 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1578 
/* With syncookies built in, segments without the SYN flag that reach a
 * listener are handed to cookie_v4_check() and its result is returned.
 * In every other case @sk is returned unchanged.
 */
static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
{
#ifdef CONFIG_SYN_COOKIES
	if (!tcp_hdr(skb)->syn)
		return cookie_v4_check(sk, skb);
#endif
	return sk;
}
1589 
1590 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1591 			 struct tcphdr *th, u32 *cookie)
1592 {
1593 	u16 mss = 0;
1594 #ifdef CONFIG_SYN_COOKIES
1595 	mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1596 				    &tcp_request_sock_ipv4_ops, sk, th);
1597 	if (mss) {
1598 		*cookie = __cookie_v4_init_sequence(iph, th, &mss);
1599 		tcp_synq_overflow(sk);
1600 	}
1601 #endif
1602 	return mss;
1603 }
1604 
1605 /* The socket must have it's spinlock held when we get
1606  * here, unless it is a TCP_LISTEN socket.
1607  *
1608  * We have a potential double-lock case here, so even when
1609  * doing backlog processing we use the BH locking scheme.
1610  * This is because we cannot sleep with the original spinlock
1611  * held.
1612  */
1613 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1614 {
1615 	struct sock *rsk;
1616 
1617 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1618 		struct dst_entry *dst = sk->sk_rx_dst;
1619 
1620 		sock_rps_save_rxhash(sk, skb);
1621 		sk_mark_napi_id(sk, skb);
1622 		if (dst) {
1623 			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1624 			    !dst->ops->check(dst, 0)) {
1625 				dst_release(dst);
1626 				sk->sk_rx_dst = NULL;
1627 			}
1628 		}
1629 		tcp_rcv_established(sk, skb);
1630 		return 0;
1631 	}
1632 
1633 	if (tcp_checksum_complete(skb))
1634 		goto csum_err;
1635 
1636 	if (sk->sk_state == TCP_LISTEN) {
1637 		struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1638 
1639 		if (!nsk)
1640 			goto discard;
1641 		if (nsk != sk) {
1642 			if (tcp_child_process(sk, nsk, skb)) {
1643 				rsk = nsk;
1644 				goto reset;
1645 			}
1646 			return 0;
1647 		}
1648 	} else
1649 		sock_rps_save_rxhash(sk, skb);
1650 
1651 	if (tcp_rcv_state_process(sk, skb)) {
1652 		rsk = sk;
1653 		goto reset;
1654 	}
1655 	return 0;
1656 
1657 reset:
1658 	tcp_v4_send_reset(rsk, skb);
1659 discard:
1660 	kfree_skb(skb);
1661 	/* Be careful here. If this function gets more complicated and
1662 	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1663 	 * might be destroyed here. This current version compiles correctly,
1664 	 * but you have been warned.
1665 	 */
1666 	return 0;
1667 
1668 csum_err:
1669 	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1670 	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1671 	goto discard;
1672 }
1673 EXPORT_SYMBOL(tcp_v4_do_rcv);
1674 
1675 int tcp_v4_early_demux(struct sk_buff *skb)
1676 {
1677 	const struct iphdr *iph;
1678 	const struct tcphdr *th;
1679 	struct sock *sk;
1680 
1681 	if (skb->pkt_type != PACKET_HOST)
1682 		return 0;
1683 
1684 	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1685 		return 0;
1686 
1687 	iph = ip_hdr(skb);
1688 	th = tcp_hdr(skb);
1689 
1690 	if (th->doff < sizeof(struct tcphdr) / 4)
1691 		return 0;
1692 
1693 	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1694 				       iph->saddr, th->source,
1695 				       iph->daddr, ntohs(th->dest),
1696 				       skb->skb_iif, inet_sdif(skb));
1697 	if (sk) {
1698 		skb->sk = sk;
1699 		skb->destructor = sock_edemux;
1700 		if (sk_fullsock(sk)) {
1701 			struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1702 
1703 			if (dst)
1704 				dst = dst_check(dst, 0);
1705 			if (dst &&
1706 			    inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1707 				skb_dst_set_noref(skb, dst);
1708 		}
1709 	}
1710 	return 0;
1711 }
1712 
1713 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1714 {
1715 	u32 limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf);
1716 	struct skb_shared_info *shinfo;
1717 	const struct tcphdr *th;
1718 	struct tcphdr *thtail;
1719 	struct sk_buff *tail;
1720 	unsigned int hdrlen;
1721 	bool fragstolen;
1722 	u32 gso_segs;
1723 	int delta;
1724 
1725 	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1726 	 * we can fix skb->truesize to its real value to avoid future drops.
1727 	 * This is valid because skb is not yet charged to the socket.
1728 	 * It has been noticed pure SACK packets were sometimes dropped
1729 	 * (if cooked by drivers without copybreak feature).
1730 	 */
1731 	skb_condense(skb);
1732 
1733 	skb_dst_drop(skb);
1734 
1735 	if (unlikely(tcp_checksum_complete(skb))) {
1736 		bh_unlock_sock(sk);
1737 		__TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1738 		__TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1739 		return true;
1740 	}
1741 
1742 	/* Attempt coalescing to last skb in backlog, even if we are
1743 	 * above the limits.
1744 	 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1745 	 */
1746 	th = (const struct tcphdr *)skb->data;
1747 	hdrlen = th->doff * 4;
1748 	shinfo = skb_shinfo(skb);
1749 
1750 	if (!shinfo->gso_size)
1751 		shinfo->gso_size = skb->len - hdrlen;
1752 
1753 	if (!shinfo->gso_segs)
1754 		shinfo->gso_segs = 1;
1755 
1756 	tail = sk->sk_backlog.tail;
1757 	if (!tail)
1758 		goto no_coalesce;
1759 	thtail = (struct tcphdr *)tail->data;
1760 
1761 	if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1762 	    TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1763 	    ((TCP_SKB_CB(tail)->tcp_flags |
1764 	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1765 	    !((TCP_SKB_CB(tail)->tcp_flags &
1766 	      TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1767 	    ((TCP_SKB_CB(tail)->tcp_flags ^
1768 	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1769 #ifdef CONFIG_TLS_DEVICE
1770 	    tail->decrypted != skb->decrypted ||
1771 #endif
1772 	    thtail->doff != th->doff ||
1773 	    memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1774 		goto no_coalesce;
1775 
1776 	__skb_pull(skb, hdrlen);
1777 	if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1778 		thtail->window = th->window;
1779 
1780 		TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1781 
1782 		if (after(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))
1783 			TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1784 
1785 		/* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1786 		 * thtail->fin, so that the fast path in tcp_rcv_established()
1787 		 * is not entered if we append a packet with a FIN.
1788 		 * SYN, RST, URG are not present.
1789 		 * ACK is set on both packets.
1790 		 * PSH : we do not really care in TCP stack,
1791 		 *       at least for 'GRO' packets.
1792 		 */
1793 		thtail->fin |= th->fin;
1794 		TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1795 
1796 		if (TCP_SKB_CB(skb)->has_rxtstamp) {
1797 			TCP_SKB_CB(tail)->has_rxtstamp = true;
1798 			tail->tstamp = skb->tstamp;
1799 			skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1800 		}
1801 
1802 		/* Not as strict as GRO. We only need to carry mss max value */
1803 		skb_shinfo(tail)->gso_size = max(shinfo->gso_size,
1804 						 skb_shinfo(tail)->gso_size);
1805 
1806 		gso_segs = skb_shinfo(tail)->gso_segs + shinfo->gso_segs;
1807 		skb_shinfo(tail)->gso_segs = min_t(u32, gso_segs, 0xFFFF);
1808 
1809 		sk->sk_backlog.len += delta;
1810 		__NET_INC_STATS(sock_net(sk),
1811 				LINUX_MIB_TCPBACKLOGCOALESCE);
1812 		kfree_skb_partial(skb, fragstolen);
1813 		return false;
1814 	}
1815 	__skb_push(skb, hdrlen);
1816 
1817 no_coalesce:
1818 	/* Only socket owner can try to collapse/prune rx queues
1819 	 * to reduce memory overhead, so add a little headroom here.
1820 	 * Few sockets backlog are possibly concurrently non empty.
1821 	 */
1822 	limit += 64*1024;
1823 
1824 	if (unlikely(sk_add_backlog(sk, skb, limit))) {
1825 		bh_unlock_sock(sk);
1826 		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1827 		return true;
1828 	}
1829 	return false;
1830 }
1831 EXPORT_SYMBOL(tcp_add_backlog);
1832 
1833 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1834 {
1835 	struct tcphdr *th = (struct tcphdr *)skb->data;
1836 
1837 	return sk_filter_trim_cap(sk, skb, th->doff * 4);
1838 }
1839 EXPORT_SYMBOL(tcp_filter);
1840 
1841 static void tcp_v4_restore_cb(struct sk_buff *skb)
1842 {
1843 	memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1844 		sizeof(struct inet_skb_parm));
1845 }
1846 
1847 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1848 			   const struct tcphdr *th)
1849 {
1850 	/* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
1851 	 * barrier() makes sure compiler wont play fool^Waliasing games.
1852 	 */
1853 	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1854 		sizeof(struct inet_skb_parm));
1855 	barrier();
1856 
1857 	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1858 	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1859 				    skb->len - th->doff * 4);
1860 	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1861 	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1862 	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1863 	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1864 	TCP_SKB_CB(skb)->sacked	 = 0;
1865 	TCP_SKB_CB(skb)->has_rxtstamp =
1866 			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1867 }
1868 
/*
 *	From tcp_input.c
 *
 *	Receive entry point for IPv4 TCP segments.
 *
 *	Validates the TCP header, looks up the owning socket and then
 *	dispatches on the socket state: request socks (TCP_NEW_SYN_RECV)
 *	and TIME_WAIT socks have dedicated paths, listeners are processed
 *	without taking the socket lock, and all other sockets are either
 *	processed directly or queued on the backlog when owned by user
 *	context.
 *
 *	Returns 0 on every discard path, otherwise the result of
 *	tcp_v4_do_rcv() (or 0 when the segment went to the backlog).
 */

int tcp_v4_rcv(struct sk_buff *skb)
{
	struct net *net = dev_net(skb->dev);
	struct sk_buff *skb_to_free;
	int sdif = inet_sdif(skb);
	int dif = inet_iif(skb);
	const struct iphdr *iph;
	const struct tcphdr *th;
	bool refcounted;
	struct sock *sk;
	int ret;

	if (skb->pkt_type != PACKET_HOST)
		goto discard_it;

	/* Count it even if it's bad */
	__TCP_INC_STATS(net, TCP_MIB_INSEGS);

	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
		goto discard_it;

	th = (const struct tcphdr *)skb->data;

	/* A data offset smaller than the base header is malformed. */
	if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
		goto bad_packet;
	if (!pskb_may_pull(skb, th->doff * 4))
		goto discard_it;

	/* An explanation is required here, I think.
	 * Packet length and doff are validated by header prediction,
	 * provided case of th->doff==0 is eliminated.
	 * So, we defer the checks. */

	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
		goto csum_error;

	/* Reload the header pointers (presumably because the calls above
	 * may have changed skb->data).
	 */
	th = (const struct tcphdr *)skb->data;
	iph = ip_hdr(skb);
lookup:
	sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
			       th->dest, sdif, &refcounted);
	if (!sk)
		goto no_tcp_socket;

process:
	if (sk->sk_state == TCP_TIME_WAIT)
		goto do_time_wait;

	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		/* sk is a request sock here; continue with its listener. */
		struct request_sock *req = inet_reqsk(sk);
		bool req_stolen = false;
		struct sock *nsk;

		sk = req->rsk_listener;
		if (unlikely(tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))) {
			sk_drops_add(sk, skb);
			reqsk_put(req);
			goto discard_it;
		}
		if (tcp_checksum_complete(skb)) {
			reqsk_put(req);
			goto csum_error;
		}
		/* The listener is no longer listening: drop the request
		 * and redo the lookup.
		 */
		if (unlikely(sk->sk_state != TCP_LISTEN)) {
			inet_csk_reqsk_queue_drop_and_put(sk, req);
			goto lookup;
		}
		/* We own a reference on the listener, increase it again
		 * as we might lose it too soon.
		 */
		sock_hold(sk);
		refcounted = true;
		nsk = NULL;
		if (!tcp_filter(sk, skb)) {
			th = (const struct tcphdr *)skb->data;
			iph = ip_hdr(skb);
			tcp_v4_fill_cb(skb, iph, th);
			nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
		}
		if (!nsk) {
			reqsk_put(req);
			if (req_stolen) {
				/* Another cpu got exclusive access to req
				 * and created a full blown socket.
				 * Try to feed this packet to this socket
				 * instead of discarding it.
				 */
				tcp_v4_restore_cb(skb);
				sock_put(sk);
				goto lookup;
			}
			goto discard_and_relse;
		}
		if (nsk == sk) {
			/* Continue below with the listener itself; the IP
			 * control block is restored because tcp_v4_fill_cb()
			 * runs again further down.
			 */
			reqsk_put(req);
			tcp_v4_restore_cb(skb);
		} else if (tcp_child_process(sk, nsk, skb)) {
			tcp_v4_send_reset(nsk, skb);
			goto discard_and_relse;
		} else {
			sock_put(sk);
			return 0;
		}
	}
	/* Drop segments whose TTL is below the socket's configured minimum. */
	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
		goto discard_and_relse;
	}

	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
		goto discard_and_relse;

	if (tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))
		goto discard_and_relse;

	nf_reset_ct(skb);

	if (tcp_filter(sk, skb))
		goto discard_and_relse;
	/* Reload the header pointers after running the filter. */
	th = (const struct tcphdr *)skb->data;
	iph = ip_hdr(skb);
	tcp_v4_fill_cb(skb, iph, th);

	skb->dev = NULL;

	/* Listeners are processed without taking the socket lock. */
	if (sk->sk_state == TCP_LISTEN) {
		ret = tcp_v4_do_rcv(sk, skb);
		goto put_and_return;
	}

	sk_incoming_cpu_update(sk);

	bh_lock_sock_nested(sk);
	tcp_segs_in(tcp_sk(sk), skb);
	ret = 0;
	if (!sock_owned_by_user(sk)) {
		/* Detach the cached rx skb; it is freed after unlocking. */
		skb_to_free = sk->sk_rx_skb_cache;
		sk->sk_rx_skb_cache = NULL;
		ret = tcp_v4_do_rcv(sk, skb);
	} else {
		/* tcp_add_backlog() has already unlocked the socket when it
		 * returns true.
		 */
		if (tcp_add_backlog(sk, skb))
			goto discard_and_relse;
		skb_to_free = NULL;
	}
	bh_unlock_sock(sk);
	if (skb_to_free)
		__kfree_skb(skb_to_free);

put_and_return:
	if (refcounted)
		sock_put(sk);

	return ret;

no_tcp_socket:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
		goto discard_it;

	tcp_v4_fill_cb(skb, iph, th);

	/* Note: csum_error and bad_packet are jumped to from the paths
	 * above; they only update statistics and fall through to the
	 * discard below. A segment with a good checksum but no socket is
	 * answered with a reset.
	 */
	if (tcp_checksum_complete(skb)) {
csum_error:
		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
bad_packet:
		__TCP_INC_STATS(net, TCP_MIB_INERRS);
	} else {
		tcp_v4_send_reset(NULL, skb);
	}

discard_it:
	/* Discard frame. */
	kfree_skb(skb);
	return 0;

discard_and_relse:
	sk_drops_add(sk, skb);
	if (refcounted)
		sock_put(sk);
	goto discard_it;

do_time_wait:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto discard_it;
	}

	tcp_v4_fill_cb(skb, iph, th);

	if (tcp_checksum_complete(skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto csum_error;
	}
	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
	case TCP_TW_SYN: {
		/* If a listener matches this segment, retire the TIME_WAIT
		 * sock and process the segment on the listener instead.
		 */
		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
							&tcp_hashinfo, skb,
							__tcp_hdrlen(th),
							iph->saddr, th->source,
							iph->daddr, th->dest,
							inet_iif(skb),
							sdif);
		if (sk2) {
			inet_twsk_deschedule_put(inet_twsk(sk));
			sk = sk2;
			tcp_v4_restore_cb(skb);
			refcounted = false;
			goto process;
		}
	}
		/* to ACK */
		fallthrough;
	case TCP_TW_ACK:
		tcp_v4_timewait_ack(sk, skb);
		break;
	case TCP_TW_RST:
		tcp_v4_send_reset(sk, skb);
		inet_twsk_deschedule_put(inet_twsk(sk));
		goto discard_it;
	case TCP_TW_SUCCESS:;
	}
	goto discard_it;
}
2095 
2096 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2097 	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
2098 	.twsk_unique	= tcp_twsk_unique,
2099 	.twsk_destructor= tcp_twsk_destructor,
2100 };
2101 
2102 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2103 {
2104 	struct dst_entry *dst = skb_dst(skb);
2105 
2106 	if (dst && dst_hold_safe(dst)) {
2107 		sk->sk_rx_dst = dst;
2108 		inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
2109 	}
2110 }
2111 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2112 
2113 const struct inet_connection_sock_af_ops ipv4_specific = {
2114 	.queue_xmit	   = ip_queue_xmit,
2115 	.send_check	   = tcp_v4_send_check,
2116 	.rebuild_header	   = inet_sk_rebuild_header,
2117 	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
2118 	.conn_request	   = tcp_v4_conn_request,
2119 	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
2120 	.net_header_len	   = sizeof(struct iphdr),
2121 	.setsockopt	   = ip_setsockopt,
2122 	.getsockopt	   = ip_getsockopt,
2123 	.addr2sockaddr	   = inet_csk_addr2sockaddr,
2124 	.sockaddr_len	   = sizeof(struct sockaddr_in),
2125 #ifdef CONFIG_COMPAT
2126 	.compat_setsockopt = compat_ip_setsockopt,
2127 	.compat_getsockopt = compat_ip_getsockopt,
2128 #endif
2129 	.mtu_reduced	   = tcp_v4_mtu_reduced,
2130 };
2131 EXPORT_SYMBOL(ipv4_specific);
2132 
#ifdef CONFIG_TCP_MD5SIG
/* IPv4 MD5 signature helpers; installed as tp->af_specific by tcp_v4_init_sock(). */
static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
	.md5_parse	= tcp_v4_parse_md5_keys,
	.md5_lookup	= tcp_v4_md5_lookup,
	.calc_md5_hash	= tcp_v4_md5_hash_skb,
};
#endif
2140 
2141 /* NOTE: A lot of things set to zero explicitly by call to
2142  *       sk_alloc() so need not be done here.
2143  */
2144 static int tcp_v4_init_sock(struct sock *sk)
2145 {
2146 	struct inet_connection_sock *icsk = inet_csk(sk);
2147 
2148 	tcp_init_sock(sk);
2149 
2150 	icsk->icsk_af_ops = &ipv4_specific;
2151 
2152 #ifdef CONFIG_TCP_MD5SIG
2153 	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2154 #endif
2155 
2156 	return 0;
2157 }
2158 
2159 void tcp_v4_destroy_sock(struct sock *sk)
2160 {
2161 	struct tcp_sock *tp = tcp_sk(sk);
2162 
2163 	trace_tcp_destroy_sock(sk);
2164 
2165 	tcp_clear_xmit_timers(sk);
2166 
2167 	tcp_cleanup_congestion_control(sk);
2168 
2169 	tcp_cleanup_ulp(sk);
2170 
2171 	/* Cleanup up the write buffer. */
2172 	tcp_write_queue_purge(sk);
2173 
2174 	/* Check if we want to disable active TFO */
2175 	tcp_fastopen_active_disable_ofo_check(sk);
2176 
2177 	/* Cleans up our, hopefully empty, out_of_order_queue. */
2178 	skb_rbtree_purge(&tp->out_of_order_queue);
2179 
2180 #ifdef CONFIG_TCP_MD5SIG
2181 	/* Clean up the MD5 key list, if any */
2182 	if (tp->md5sig_info) {
2183 		tcp_clear_md5_list(sk);
2184 		kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2185 		tp->md5sig_info = NULL;
2186 	}
2187 #endif
2188 
2189 	/* Clean up a referenced TCP bind bucket. */
2190 	if (inet_csk(sk)->icsk_bind_hash)
2191 		inet_put_port(sk);
2192 
2193 	BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2194 
2195 	/* If socket is aborted during connect operation */
2196 	tcp_free_fastopen_req(tp);
2197 	tcp_fastopen_destroy_cipher(sk);
2198 	tcp_saved_syn_free(tp);
2199 
2200 	sk_sockets_allocated_dec(sk);
2201 }
2202 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2203 
2204 #ifdef CONFIG_PROC_FS
2205 /* Proc filesystem TCP sock list dumping. */
2206 
2207 /*
2208  * Get next listener socket follow cur.  If cur is NULL, get first socket
2209  * starting from bucket given in st->bucket; when st->bucket is zero the
2210  * very first socket in the hash table is returned.
2211  */
2212 static void *listening_get_next(struct seq_file *seq, void *cur)
2213 {
2214 	struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2215 	struct tcp_iter_state *st = seq->private;
2216 	struct net *net = seq_file_net(seq);
2217 	struct inet_listen_hashbucket *ilb;
2218 	struct hlist_nulls_node *node;
2219 	struct sock *sk = cur;
2220 
2221 	if (!sk) {
2222 get_head:
2223 		ilb = &tcp_hashinfo.listening_hash[st->bucket];
2224 		spin_lock(&ilb->lock);
2225 		sk = sk_nulls_head(&ilb->nulls_head);
2226 		st->offset = 0;
2227 		goto get_sk;
2228 	}
2229 	ilb = &tcp_hashinfo.listening_hash[st->bucket];
2230 	++st->num;
2231 	++st->offset;
2232 
2233 	sk = sk_nulls_next(sk);
2234 get_sk:
2235 	sk_nulls_for_each_from(sk, node) {
2236 		if (!net_eq(sock_net(sk), net))
2237 			continue;
2238 		if (sk->sk_family == afinfo->family)
2239 			return sk;
2240 	}
2241 	spin_unlock(&ilb->lock);
2242 	st->offset = 0;
2243 	if (++st->bucket < INET_LHTABLE_SIZE)
2244 		goto get_head;
2245 	return NULL;
2246 }
2247 
2248 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2249 {
2250 	struct tcp_iter_state *st = seq->private;
2251 	void *rc;
2252 
2253 	st->bucket = 0;
2254 	st->offset = 0;
2255 	rc = listening_get_next(seq, NULL);
2256 
2257 	while (rc && *pos) {
2258 		rc = listening_get_next(seq, rc);
2259 		--*pos;
2260 	}
2261 	return rc;
2262 }
2263 
2264 static inline bool empty_bucket(const struct tcp_iter_state *st)
2265 {
2266 	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2267 }
2268 
2269 /*
2270  * Get first established socket starting from bucket given in st->bucket.
2271  * If st->bucket is zero, the very first socket in the hash is returned.
2272  */
2273 static void *established_get_first(struct seq_file *seq)
2274 {
2275 	struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2276 	struct tcp_iter_state *st = seq->private;
2277 	struct net *net = seq_file_net(seq);
2278 	void *rc = NULL;
2279 
2280 	st->offset = 0;
2281 	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2282 		struct sock *sk;
2283 		struct hlist_nulls_node *node;
2284 		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2285 
2286 		/* Lockless fast path for the common case of empty buckets */
2287 		if (empty_bucket(st))
2288 			continue;
2289 
2290 		spin_lock_bh(lock);
2291 		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2292 			if (sk->sk_family != afinfo->family ||
2293 			    !net_eq(sock_net(sk), net)) {
2294 				continue;
2295 			}
2296 			rc = sk;
2297 			goto out;
2298 		}
2299 		spin_unlock_bh(lock);
2300 	}
2301 out:
2302 	return rc;
2303 }
2304 
2305 static void *established_get_next(struct seq_file *seq, void *cur)
2306 {
2307 	struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2308 	struct sock *sk = cur;
2309 	struct hlist_nulls_node *node;
2310 	struct tcp_iter_state *st = seq->private;
2311 	struct net *net = seq_file_net(seq);
2312 
2313 	++st->num;
2314 	++st->offset;
2315 
2316 	sk = sk_nulls_next(sk);
2317 
2318 	sk_nulls_for_each_from(sk, node) {
2319 		if (sk->sk_family == afinfo->family &&
2320 		    net_eq(sock_net(sk), net))
2321 			return sk;
2322 	}
2323 
2324 	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2325 	++st->bucket;
2326 	return established_get_first(seq);
2327 }
2328 
2329 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2330 {
2331 	struct tcp_iter_state *st = seq->private;
2332 	void *rc;
2333 
2334 	st->bucket = 0;
2335 	rc = established_get_first(seq);
2336 
2337 	while (rc && pos) {
2338 		rc = established_get_next(seq, rc);
2339 		--pos;
2340 	}
2341 	return rc;
2342 }
2343 
2344 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2345 {
2346 	void *rc;
2347 	struct tcp_iter_state *st = seq->private;
2348 
2349 	st->state = TCP_SEQ_STATE_LISTENING;
2350 	rc	  = listening_get_idx(seq, &pos);
2351 
2352 	if (!rc) {
2353 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2354 		rc	  = established_get_idx(seq, pos);
2355 	}
2356 
2357 	return rc;
2358 }
2359 
2360 static void *tcp_seek_last_pos(struct seq_file *seq)
2361 {
2362 	struct tcp_iter_state *st = seq->private;
2363 	int offset = st->offset;
2364 	int orig_num = st->num;
2365 	void *rc = NULL;
2366 
2367 	switch (st->state) {
2368 	case TCP_SEQ_STATE_LISTENING:
2369 		if (st->bucket >= INET_LHTABLE_SIZE)
2370 			break;
2371 		st->state = TCP_SEQ_STATE_LISTENING;
2372 		rc = listening_get_next(seq, NULL);
2373 		while (offset-- && rc)
2374 			rc = listening_get_next(seq, rc);
2375 		if (rc)
2376 			break;
2377 		st->bucket = 0;
2378 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2379 		fallthrough;
2380 	case TCP_SEQ_STATE_ESTABLISHED:
2381 		if (st->bucket > tcp_hashinfo.ehash_mask)
2382 			break;
2383 		rc = established_get_first(seq);
2384 		while (offset-- && rc)
2385 			rc = established_get_next(seq, rc);
2386 	}
2387 
2388 	st->num = orig_num;
2389 
2390 	return rc;
2391 }
2392 
2393 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2394 {
2395 	struct tcp_iter_state *st = seq->private;
2396 	void *rc;
2397 
2398 	if (*pos && *pos == st->last_pos) {
2399 		rc = tcp_seek_last_pos(seq);
2400 		if (rc)
2401 			goto out;
2402 	}
2403 
2404 	st->state = TCP_SEQ_STATE_LISTENING;
2405 	st->num = 0;
2406 	st->bucket = 0;
2407 	st->offset = 0;
2408 	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2409 
2410 out:
2411 	st->last_pos = *pos;
2412 	return rc;
2413 }
2414 EXPORT_SYMBOL(tcp_seq_start);
2415 
2416 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2417 {
2418 	struct tcp_iter_state *st = seq->private;
2419 	void *rc = NULL;
2420 
2421 	if (v == SEQ_START_TOKEN) {
2422 		rc = tcp_get_idx(seq, 0);
2423 		goto out;
2424 	}
2425 
2426 	switch (st->state) {
2427 	case TCP_SEQ_STATE_LISTENING:
2428 		rc = listening_get_next(seq, v);
2429 		if (!rc) {
2430 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2431 			st->bucket = 0;
2432 			st->offset = 0;
2433 			rc	  = established_get_first(seq);
2434 		}
2435 		break;
2436 	case TCP_SEQ_STATE_ESTABLISHED:
2437 		rc = established_get_next(seq, v);
2438 		break;
2439 	}
2440 out:
2441 	++*pos;
2442 	st->last_pos = *pos;
2443 	return rc;
2444 }
2445 EXPORT_SYMBOL(tcp_seq_next);
2446 
2447 void tcp_seq_stop(struct seq_file *seq, void *v)
2448 {
2449 	struct tcp_iter_state *st = seq->private;
2450 
2451 	switch (st->state) {
2452 	case TCP_SEQ_STATE_LISTENING:
2453 		if (v != SEQ_START_TOKEN)
2454 			spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
2455 		break;
2456 	case TCP_SEQ_STATE_ESTABLISHED:
2457 		if (v)
2458 			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2459 		break;
2460 	}
2461 }
2462 EXPORT_SYMBOL(tcp_seq_stop);
2463 
2464 static void get_openreq4(const struct request_sock *req,
2465 			 struct seq_file *f, int i)
2466 {
2467 	const struct inet_request_sock *ireq = inet_rsk(req);
2468 	long delta = req->rsk_timer.expires - jiffies;
2469 
2470 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2471 		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2472 		i,
2473 		ireq->ir_loc_addr,
2474 		ireq->ir_num,
2475 		ireq->ir_rmt_addr,
2476 		ntohs(ireq->ir_rmt_port),
2477 		TCP_SYN_RECV,
2478 		0, 0, /* could print option size, but that is af dependent. */
2479 		1,    /* timers active (only the expire timer) */
2480 		jiffies_delta_to_clock_t(delta),
2481 		req->num_timeout,
2482 		from_kuid_munged(seq_user_ns(f),
2483 				 sock_i_uid(req->rsk_listener)),
2484 		0,  /* non standard timer */
2485 		0, /* open_requests have no inode */
2486 		0,
2487 		req);
2488 }
2489 
2490 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2491 {
2492 	int timer_active;
2493 	unsigned long timer_expires;
2494 	const struct tcp_sock *tp = tcp_sk(sk);
2495 	const struct inet_connection_sock *icsk = inet_csk(sk);
2496 	const struct inet_sock *inet = inet_sk(sk);
2497 	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2498 	__be32 dest = inet->inet_daddr;
2499 	__be32 src = inet->inet_rcv_saddr;
2500 	__u16 destp = ntohs(inet->inet_dport);
2501 	__u16 srcp = ntohs(inet->inet_sport);
2502 	int rx_queue;
2503 	int state;
2504 
2505 	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2506 	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2507 	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2508 		timer_active	= 1;
2509 		timer_expires	= icsk->icsk_timeout;
2510 	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2511 		timer_active	= 4;
2512 		timer_expires	= icsk->icsk_timeout;
2513 	} else if (timer_pending(&sk->sk_timer)) {
2514 		timer_active	= 2;
2515 		timer_expires	= sk->sk_timer.expires;
2516 	} else {
2517 		timer_active	= 0;
2518 		timer_expires = jiffies;
2519 	}
2520 
2521 	state = inet_sk_state_load(sk);
2522 	if (state == TCP_LISTEN)
2523 		rx_queue = READ_ONCE(sk->sk_ack_backlog);
2524 	else
2525 		/* Because we don't lock the socket,
2526 		 * we might find a transient negative value.
2527 		 */
2528 		rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2529 				      READ_ONCE(tp->copied_seq), 0);
2530 
2531 	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2532 			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2533 		i, src, srcp, dest, destp, state,
2534 		READ_ONCE(tp->write_seq) - tp->snd_una,
2535 		rx_queue,
2536 		timer_active,
2537 		jiffies_delta_to_clock_t(timer_expires - jiffies),
2538 		icsk->icsk_retransmits,
2539 		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2540 		icsk->icsk_probes_out,
2541 		sock_i_ino(sk),
2542 		refcount_read(&sk->sk_refcnt), sk,
2543 		jiffies_to_clock_t(icsk->icsk_rto),
2544 		jiffies_to_clock_t(icsk->icsk_ack.ato),
2545 		(icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2546 		tp->snd_cwnd,
2547 		state == TCP_LISTEN ?
2548 		    fastopenq->max_qlen :
2549 		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2550 }
2551 
2552 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2553 			       struct seq_file *f, int i)
2554 {
2555 	long delta = tw->tw_timer.expires - jiffies;
2556 	__be32 dest, src;
2557 	__u16 destp, srcp;
2558 
2559 	dest  = tw->tw_daddr;
2560 	src   = tw->tw_rcv_saddr;
2561 	destp = ntohs(tw->tw_dport);
2562 	srcp  = ntohs(tw->tw_sport);
2563 
2564 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2565 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2566 		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2567 		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2568 		refcount_read(&tw->tw_refcnt), tw);
2569 }
2570 
2571 #define TMPSZ 150
2572 
2573 static int tcp4_seq_show(struct seq_file *seq, void *v)
2574 {
2575 	struct tcp_iter_state *st;
2576 	struct sock *sk = v;
2577 
2578 	seq_setwidth(seq, TMPSZ - 1);
2579 	if (v == SEQ_START_TOKEN) {
2580 		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2581 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2582 			   "inode");
2583 		goto out;
2584 	}
2585 	st = seq->private;
2586 
2587 	if (sk->sk_state == TCP_TIME_WAIT)
2588 		get_timewait4_sock(v, seq, st->num);
2589 	else if (sk->sk_state == TCP_NEW_SYN_RECV)
2590 		get_openreq4(v, seq, st->num);
2591 	else
2592 		get_tcp4_sock(v, seq, st->num);
2593 out:
2594 	seq_pad(seq, '\n');
2595 	return 0;
2596 }
2597 
2598 static const struct seq_operations tcp4_seq_ops = {
2599 	.show		= tcp4_seq_show,
2600 	.start		= tcp_seq_start,
2601 	.next		= tcp_seq_next,
2602 	.stop		= tcp_seq_stop,
2603 };
2604 
2605 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2606 	.family		= AF_INET,
2607 };
2608 
2609 static int __net_init tcp4_proc_init_net(struct net *net)
2610 {
2611 	if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
2612 			sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
2613 		return -ENOMEM;
2614 	return 0;
2615 }
2616 
2617 static void __net_exit tcp4_proc_exit_net(struct net *net)
2618 {
2619 	remove_proc_entry("tcp", net->proc_net);
2620 }
2621 
2622 static struct pernet_operations tcp4_net_ops = {
2623 	.init = tcp4_proc_init_net,
2624 	.exit = tcp4_proc_exit_net,
2625 };
2626 
2627 int __init tcp4_proc_init(void)
2628 {
2629 	return register_pernet_subsys(&tcp4_net_ops);
2630 }
2631 
2632 void tcp4_proc_exit(void)
2633 {
2634 	unregister_pernet_subsys(&tcp4_net_ops);
2635 }
2636 #endif /* CONFIG_PROC_FS */
2637 
2638 struct proto tcp_prot = {
2639 	.name			= "TCP",
2640 	.owner			= THIS_MODULE,
2641 	.close			= tcp_close,
2642 	.pre_connect		= tcp_v4_pre_connect,
2643 	.connect		= tcp_v4_connect,
2644 	.disconnect		= tcp_disconnect,
2645 	.accept			= inet_csk_accept,
2646 	.ioctl			= tcp_ioctl,
2647 	.init			= tcp_v4_init_sock,
2648 	.destroy		= tcp_v4_destroy_sock,
2649 	.shutdown		= tcp_shutdown,
2650 	.setsockopt		= tcp_setsockopt,
2651 	.getsockopt		= tcp_getsockopt,
2652 	.keepalive		= tcp_set_keepalive,
2653 	.recvmsg		= tcp_recvmsg,
2654 	.sendmsg		= tcp_sendmsg,
2655 	.sendpage		= tcp_sendpage,
2656 	.backlog_rcv		= tcp_v4_do_rcv,
2657 	.release_cb		= tcp_release_cb,
2658 	.hash			= inet_hash,
2659 	.unhash			= inet_unhash,
2660 	.get_port		= inet_csk_get_port,
2661 	.enter_memory_pressure	= tcp_enter_memory_pressure,
2662 	.leave_memory_pressure	= tcp_leave_memory_pressure,
2663 	.stream_memory_free	= tcp_stream_memory_free,
2664 	.sockets_allocated	= &tcp_sockets_allocated,
2665 	.orphan_count		= &tcp_orphan_count,
2666 	.memory_allocated	= &tcp_memory_allocated,
2667 	.memory_pressure	= &tcp_memory_pressure,
2668 	.sysctl_mem		= sysctl_tcp_mem,
2669 	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
2670 	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
2671 	.max_header		= MAX_TCP_HEADER,
2672 	.obj_size		= sizeof(struct tcp_sock),
2673 	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
2674 	.twsk_prot		= &tcp_timewait_sock_ops,
2675 	.rsk_prot		= &tcp_request_sock_ops,
2676 	.h.hashinfo		= &tcp_hashinfo,
2677 	.no_autobind		= true,
2678 #ifdef CONFIG_COMPAT
2679 	.compat_setsockopt	= compat_tcp_setsockopt,
2680 	.compat_getsockopt	= compat_tcp_getsockopt,
2681 #endif
2682 	.diag_destroy		= tcp_abort,
2683 };
2684 EXPORT_SYMBOL(tcp_prot);
2685 
2686 static void __net_exit tcp_sk_exit(struct net *net)
2687 {
2688 	int cpu;
2689 
2690 	if (net->ipv4.tcp_congestion_control)
2691 		bpf_module_put(net->ipv4.tcp_congestion_control,
2692 			       net->ipv4.tcp_congestion_control->owner);
2693 
2694 	for_each_possible_cpu(cpu)
2695 		inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2696 	free_percpu(net->ipv4.tcp_sk);
2697 }
2698 
2699 static int __net_init tcp_sk_init(struct net *net)
2700 {
2701 	int res, cpu, cnt;
2702 
2703 	net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2704 	if (!net->ipv4.tcp_sk)
2705 		return -ENOMEM;
2706 
2707 	for_each_possible_cpu(cpu) {
2708 		struct sock *sk;
2709 
2710 		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2711 					   IPPROTO_TCP, net);
2712 		if (res)
2713 			goto fail;
2714 		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2715 
2716 		/* Please enforce IP_DF and IPID==0 for RST and
2717 		 * ACK sent in SYN-RECV and TIME-WAIT state.
2718 		 */
2719 		inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
2720 
2721 		*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2722 	}
2723 
2724 	net->ipv4.sysctl_tcp_ecn = 2;
2725 	net->ipv4.sysctl_tcp_ecn_fallback = 1;
2726 
2727 	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2728 	net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
2729 	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2730 	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2731 	net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
2732 
2733 	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2734 	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2735 	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2736 
2737 	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2738 	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2739 	net->ipv4.sysctl_tcp_syncookies = 1;
2740 	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2741 	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2742 	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2743 	net->ipv4.sysctl_tcp_orphan_retries = 0;
2744 	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2745 	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
2746 	net->ipv4.sysctl_tcp_tw_reuse = 2;
2747 	net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
2748 
2749 	cnt = tcp_hashinfo.ehash_mask + 1;
2750 	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
2751 	net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
2752 
2753 	net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
2754 	net->ipv4.sysctl_tcp_sack = 1;
2755 	net->ipv4.sysctl_tcp_window_scaling = 1;
2756 	net->ipv4.sysctl_tcp_timestamps = 1;
2757 	net->ipv4.sysctl_tcp_early_retrans = 3;
2758 	net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
2759 	net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
2760 	net->ipv4.sysctl_tcp_retrans_collapse = 1;
2761 	net->ipv4.sysctl_tcp_max_reordering = 300;
2762 	net->ipv4.sysctl_tcp_dsack = 1;
2763 	net->ipv4.sysctl_tcp_app_win = 31;
2764 	net->ipv4.sysctl_tcp_adv_win_scale = 1;
2765 	net->ipv4.sysctl_tcp_frto = 2;
2766 	net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
2767 	/* This limits the percentage of the congestion window which we
2768 	 * will allow a single TSO frame to consume.  Building TSO frames
2769 	 * which are too large can cause TCP streams to be bursty.
2770 	 */
2771 	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
2772 	/* Default TSQ limit of 16 TSO segments */
2773 	net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
2774 	/* rfc5961 challenge ack rate limiting */
2775 	net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
2776 	net->ipv4.sysctl_tcp_min_tso_segs = 2;
2777 	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
2778 	net->ipv4.sysctl_tcp_autocorking = 1;
2779 	net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
2780 	net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
2781 	net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
2782 	if (net != &init_net) {
2783 		memcpy(net->ipv4.sysctl_tcp_rmem,
2784 		       init_net.ipv4.sysctl_tcp_rmem,
2785 		       sizeof(init_net.ipv4.sysctl_tcp_rmem));
2786 		memcpy(net->ipv4.sysctl_tcp_wmem,
2787 		       init_net.ipv4.sysctl_tcp_wmem,
2788 		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
2789 	}
2790 	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
2791 	net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
2792 	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
2793 	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
2794 	spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
2795 	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
2796 	atomic_set(&net->ipv4.tfo_active_disable_times, 0);
2797 
2798 	/* Reno is always built in */
2799 	if (!net_eq(net, &init_net) &&
2800 	    bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
2801 			       init_net.ipv4.tcp_congestion_control->owner))
2802 		net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
2803 	else
2804 		net->ipv4.tcp_congestion_control = &tcp_reno;
2805 
2806 	return 0;
2807 fail:
2808 	tcp_sk_exit(net);
2809 
2810 	return res;
2811 }
2812 
2813 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2814 {
2815 	struct net *net;
2816 
2817 	inet_twsk_purge(&tcp_hashinfo, AF_INET);
2818 
2819 	list_for_each_entry(net, net_exit_list, exit_list)
2820 		tcp_fastopen_ctx_destroy(net);
2821 }
2822 
2823 static struct pernet_operations __net_initdata tcp_sk_ops = {
2824        .init	   = tcp_sk_init,
2825        .exit	   = tcp_sk_exit,
2826        .exit_batch = tcp_sk_exit_batch,
2827 };
2828 
2829 void __init tcp_v4_init(void)
2830 {
2831 	if (register_pernet_subsys(&tcp_sk_ops))
2832 		panic("Failed to create the TCP control socket.\n");
2833 }
2834