xref: /openbmc/linux/net/ipv4/tcp_ipv4.c (revision 81de3bf3)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		Implementation of the Transmission Control Protocol(TCP).
8  *
9  *		IPv4 specific functions
10  *
11  *		code split from:
12  *		linux/ipv4/tcp.c
13  *		linux/ipv4/tcp_input.c
14  *		linux/ipv4/tcp_output.c
15  *
16  *		See tcp.c for author information
17  */
18 
19 /*
20  * Changes:
21  *		David S. Miller	:	New socket lookup architecture.
22  *					This code is dedicated to John Dyson.
23  *		David S. Miller :	Change semantics of established hash,
24  *					half is devoted to TIME_WAIT sockets
25  *					and the rest go in the other half.
26  *		Andi Kleen :		Add support for syncookies and fixed
27  *					some bugs: ip options weren't passed to
28  *					the TCP layer, missed a check for an
29  *					ACK bit.
30  *		Andi Kleen :		Implemented fast path mtu discovery.
31  *	     				Fixed many serious bugs in the
32  *					request_sock handling and moved
33  *					most of it into the af independent code.
34  *					Added tail drop and some other bugfixes.
35  *					Added new listen semantics.
36  *		Mike McLagan	:	Routing by source
37  *	Juan Jose Ciarlante:		ip_dynaddr bits
38  *		Andi Kleen:		various fixes.
39  *	Vitaly E. Lavrov	:	Transparent proxy revived after year
40  *					coma.
41  *	Andi Kleen		:	Fix new listen.
42  *	Andi Kleen		:	Fix accept error reporting.
43  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
44  *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
45  *					a single port at the same time.
46  */
47 
48 #define pr_fmt(fmt) "TCP: " fmt
49 
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
60 
61 #include <net/net_namespace.h>
62 #include <net/icmp.h>
63 #include <net/inet_hashtables.h>
64 #include <net/tcp.h>
65 #include <net/transp_v6.h>
66 #include <net/ipv6.h>
67 #include <net/inet_common.h>
68 #include <net/timewait_sock.h>
69 #include <net/xfrm.h>
70 #include <net/secure_seq.h>
71 #include <net/busy_poll.h>
72 
73 #include <linux/inet.h>
74 #include <linux/ipv6.h>
75 #include <linux/stddef.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
78 #include <linux/inetdevice.h>
79 
80 #include <crypto/hash.h>
81 #include <linux/scatterlist.h>
82 
83 #include <trace/events/tcp.h>
84 
85 #ifdef CONFIG_TCP_MD5SIG
86 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
87 			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
88 #endif
89 
90 struct inet_hashinfo tcp_hashinfo;
91 EXPORT_SYMBOL(tcp_hashinfo);
92 
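/* Generate the initial sequence number for a passive connection from the
 * address/port four-tuple of the incoming segment.
 */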
93 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
94 {
95 	return secure_tcp_seq(ip_hdr(skb)->daddr,
96 			      ip_hdr(skb)->saddr,
97 			      tcp_hdr(skb)->dest,
98 			      tcp_hdr(skb)->source);
99 }
100 
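/* Compute the per-peer TCP timestamp offset for the incoming segment's
 * address pair.
 */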
101 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
102 {
103 	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
104 }
105 
106 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
107 {
108 	const struct inet_timewait_sock *tw = inet_twsk(sktw);
109 	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
110 	struct tcp_sock *tp = tcp_sk(sk);
111 	int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;
112 
113 	if (reuse == 2) {
114 		/* Still does not detect *everything* that goes through
115 		 * lo, since we require a loopback src or dst address
116 		 * or direct binding to 'lo' interface.
117 		 */
118 		bool loopback = false;
119 		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
120 			loopback = true;
121 #if IS_ENABLED(CONFIG_IPV6)
122 		if (tw->tw_family == AF_INET6) {
123 			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
124 			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
125 			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
126 			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
127 				loopback = true;
128 		} else
129 #endif
130 		{
131 			if (ipv4_is_loopback(tw->tw_daddr) ||
132 			    ipv4_is_loopback(tw->tw_rcv_saddr))
133 				loopback = true;
134 		}
135 		if (!loopback)
136 			reuse = 0;
137 	}
138 
139 	/* With PAWS, it is safe from the viewpoint
140 	   of data integrity. Even without PAWS it is safe provided sequence
141 	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.
142 
143 	   Actually, the idea is close to VJ's one, only timestamp cache is
144 	   held not per host, but per port pair and TW bucket is used as state
145 	   holder.
146 
147 	   If TW bucket has been already destroyed we fall back to VJ's scheme
148 	   and use initial timestamp retrieved from peer table.
149 	 */
150 	if (tcptw->tw_ts_recent_stamp &&
151 	    (!twp || (reuse && time_after32(ktime_get_seconds(),
152 					    tcptw->tw_ts_recent_stamp)))) {
153 		/* In case of repair and re-using TIME-WAIT sockets we still
154 		 * want to be sure that it is safe as above but honor the
155 		 * sequence numbers and time stamps set as part of the repair
156 		 * process.
157 		 *
158 		 * Without this check re-using a TIME-WAIT socket with TCP
159 		 * repair would accumulate a -1 on the repair assigned
160 		 * sequence number. The first time it is reused the sequence
161 		 * is -1, the second time -2, etc. This fixes that issue
162 		 * without appearing to create any others.
163 		 */
164 		if (likely(!tp->repair)) {
165 			u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
166 
167 			if (!seq)
168 				seq = 1;
169 			WRITE_ONCE(tp->write_seq, seq);
170 			tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
171 			tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
172 		}
173 		sock_hold(sktw);
174 		return 1;
175 	}
176 
177 	return 0;
178 }
179 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
180 
181 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
182 			      int addr_len)
183 {
184 	/* This check is replicated from tcp_v4_connect() and intended to
185 	 * prevent the BPF program called below from accessing bytes that are
186 	 * outside the bound specified by the user in addr_len.
187 	 */
188 	if (addr_len < sizeof(struct sockaddr_in))
189 		return -EINVAL;
190 
191 	sock_owned_by_me(sk);
192 
193 	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
194 }
195 
196 /* This will initiate an outgoing connection. */
197 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
198 {
199 	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
200 	struct inet_sock *inet = inet_sk(sk);
201 	struct tcp_sock *tp = tcp_sk(sk);
202 	__be16 orig_sport, orig_dport;
203 	__be32 daddr, nexthop;
204 	struct flowi4 *fl4;
205 	struct rtable *rt;
206 	int err;
207 	struct ip_options_rcu *inet_opt;
208 	struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
209 
210 	if (addr_len < sizeof(struct sockaddr_in))
211 		return -EINVAL;
212 
213 	if (usin->sin_family != AF_INET)
214 		return -EAFNOSUPPORT;
215 
216 	nexthop = daddr = usin->sin_addr.s_addr;
217 	inet_opt = rcu_dereference_protected(inet->inet_opt,
218 					     lockdep_sock_is_held(sk));
219 	if (inet_opt && inet_opt->opt.srr) {
220 		if (!daddr)
221 			return -EINVAL;
222 		nexthop = inet_opt->opt.faddr;
223 	}
224 
225 	orig_sport = inet->inet_sport;
226 	orig_dport = usin->sin_port;
227 	fl4 = &inet->cork.fl.u.ip4;
228 	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
229 			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
230 			      IPPROTO_TCP,
231 			      orig_sport, orig_dport, sk);
232 	if (IS_ERR(rt)) {
233 		err = PTR_ERR(rt);
234 		if (err == -ENETUNREACH)
235 			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
236 		return err;
237 	}
238 
239 	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
240 		ip_rt_put(rt);
241 		return -ENETUNREACH;
242 	}
243 
244 	if (!inet_opt || !inet_opt->opt.srr)
245 		daddr = fl4->daddr;
246 
247 	if (!inet->inet_saddr)
248 		inet->inet_saddr = fl4->saddr;
249 	sk_rcv_saddr_set(sk, inet->inet_saddr);
250 
251 	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
252 		/* Reset inherited state */
253 		tp->rx_opt.ts_recent	   = 0;
254 		tp->rx_opt.ts_recent_stamp = 0;
255 		if (likely(!tp->repair))
256 			WRITE_ONCE(tp->write_seq, 0);
257 	}
258 
259 	inet->inet_dport = usin->sin_port;
260 	sk_daddr_set(sk, daddr);
261 
262 	inet_csk(sk)->icsk_ext_hdr_len = 0;
263 	if (inet_opt)
264 		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
265 
266 	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
267 
268 	/* Socket identity is still unknown (sport may be zero).
269 	 * However, we set the state to SYN-SENT and, without releasing the socket
270 	 * lock, select a source port, enter ourselves into the hash tables and
271 	 * complete initialization after this.
272 	 */
273 	tcp_set_state(sk, TCP_SYN_SENT);
274 	err = inet_hash_connect(tcp_death_row, sk);
275 	if (err)
276 		goto failure;
277 
278 	sk_set_txhash(sk);
279 
280 	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
281 			       inet->inet_sport, inet->inet_dport, sk);
282 	if (IS_ERR(rt)) {
283 		err = PTR_ERR(rt);
284 		rt = NULL;
285 		goto failure;
286 	}
287 	/* OK, now commit destination to socket.  */
288 	sk->sk_gso_type = SKB_GSO_TCPV4;
289 	sk_setup_caps(sk, &rt->dst);
290 	rt = NULL;
291 
292 	if (likely(!tp->repair)) {
293 		if (!tp->write_seq)
294 			WRITE_ONCE(tp->write_seq,
295 				   secure_tcp_seq(inet->inet_saddr,
296 						  inet->inet_daddr,
297 						  inet->inet_sport,
298 						  usin->sin_port));
299 		tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
300 						 inet->inet_saddr,
301 						 inet->inet_daddr);
302 	}
303 
304 	inet->inet_id = prandom_u32();
305 
306 	if (tcp_fastopen_defer_connect(sk, &err))
307 		return err;
308 	if (err)
309 		goto failure;
310 
311 	err = tcp_connect(sk);
312 
313 	if (err)
314 		goto failure;
315 
316 	return 0;
317 
318 failure:
319 	/*
320 	 * This unhashes the socket and releases the local port,
321 	 * if necessary.
322 	 */
323 	tcp_set_state(sk, TCP_CLOSE);
324 	ip_rt_put(rt);
325 	sk->sk_route_caps = 0;
326 	inet->inet_dport = 0;
327 	return err;
328 }
329 EXPORT_SYMBOL(tcp_v4_connect);
330 
331 /*
332  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
333  * It can be called through tcp_release_cb() if the socket was owned by user
334  * at the time tcp_v4_err() was called to handle the ICMP message.
335  */
336 void tcp_v4_mtu_reduced(struct sock *sk)
337 {
338 	struct inet_sock *inet = inet_sk(sk);
339 	struct dst_entry *dst;
340 	u32 mtu;
341 
342 	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
343 		return;
344 	mtu = tcp_sk(sk)->mtu_info;
345 	dst = inet_csk_update_pmtu(sk, mtu);
346 	if (!dst)
347 		return;
348 
349 	/* Something is about to go wrong... Remember the soft error
350 	 * in case this connection is not able to recover.
351 	 */
352 	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
353 		sk->sk_err_soft = EMSGSIZE;
354 
355 	mtu = dst_mtu(dst);
356 
357 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
358 	    ip_sk_accept_pmtu(sk) &&
359 	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
360 		tcp_sync_mss(sk, mtu);
361 
362 		/* Resend the TCP packet because it's
363 		 * clear that the old packet has been
364 		 * dropped. This is the new "fast" path mtu
365 		 * discovery.
366 		 */
367 		tcp_simple_retransmit(sk);
368 	} /* else let the usual retransmit timer handle it */
369 }
370 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
371 
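/* Handle an ICMP redirect: let the route's redirect handler update the
 * cached destination for this socket, if one is present.
 */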
372 static void do_redirect(struct sk_buff *skb, struct sock *sk)
373 {
374 	struct dst_entry *dst = __sk_dst_check(sk, 0);
375 
376 	if (dst)
377 		dst->ops->redirect(dst, sk, skb);
378 }
379 
380 
381 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
382 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
383 {
384 	struct request_sock *req = inet_reqsk(sk);
385 	struct net *net = sock_net(sk);
386 
387 	/* ICMPs are not backlogged, hence we cannot get
388 	 * an established socket here.
389 	 */
390 	if (seq != tcp_rsk(req)->snt_isn) {
391 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
392 	} else if (abort) {
393 		/*
394 		 * Still in SYN_RECV, just remove it silently.
395 		 * There is no good way to pass the error to the newly
396 		 * created socket, and POSIX does not want network
397 		 * errors returned from accept().
398 		 */
399 		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
400 		tcp_listendrop(req->rsk_listener);
401 	}
402 	reqsk_put(req);
403 }
404 EXPORT_SYMBOL(tcp_req_err);
405 
406 /*
407  * This routine is called by the ICMP module when it gets some
408  * sort of error condition.  If err < 0 then the socket should
409  * be closed and the error returned to the user.  If err > 0
410  * it's just the icmp type << 8 | icmp code.  After adjustment
411  * header points to the first 8 bytes of the tcp header.  We need
412  * to find the appropriate port.
413  *
414  * The locking strategy used here is very "optimistic". When
415  * someone else accesses the socket the ICMP is just dropped
416  * and for some paths there is no check at all.
417  * A more general error queue to queue errors for later handling
418  * is probably better.
419  *
420  */
421 
422 int tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
423 {
424 	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
425 	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
426 	struct inet_connection_sock *icsk;
427 	struct tcp_sock *tp;
428 	struct inet_sock *inet;
429 	const int type = icmp_hdr(icmp_skb)->type;
430 	const int code = icmp_hdr(icmp_skb)->code;
431 	struct sock *sk;
432 	struct sk_buff *skb;
433 	struct request_sock *fastopen;
434 	u32 seq, snd_una;
435 	s32 remaining;
436 	u32 delta_us;
437 	int err;
438 	struct net *net = dev_net(icmp_skb->dev);
439 
440 	sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
441 				       th->dest, iph->saddr, ntohs(th->source),
442 				       inet_iif(icmp_skb), 0);
443 	if (!sk) {
444 		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
445 		return -ENOENT;
446 	}
447 	if (sk->sk_state == TCP_TIME_WAIT) {
448 		inet_twsk_put(inet_twsk(sk));
449 		return 0;
450 	}
451 	seq = ntohl(th->seq);
452 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
453 		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
454 				     type == ICMP_TIME_EXCEEDED ||
455 				     (type == ICMP_DEST_UNREACH &&
456 				      (code == ICMP_NET_UNREACH ||
457 				       code == ICMP_HOST_UNREACH)));
458 		return 0;
459 	}
460 
461 	bh_lock_sock(sk);
462 	/* If too many ICMPs get dropped on busy
463 	 * servers this needs to be solved differently.
464 	 * We do take care of the PMTU discovery (RFC 1191) special case:
465 	 * we can receive locally generated ICMP messages while the socket is held.
466 	 */
467 	if (sock_owned_by_user(sk)) {
468 		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
469 			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
470 	}
471 	if (sk->sk_state == TCP_CLOSE)
472 		goto out;
473 
474 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
475 		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
476 		goto out;
477 	}
478 
479 	icsk = inet_csk(sk);
480 	tp = tcp_sk(sk);
481 	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
482 	fastopen = rcu_dereference(tp->fastopen_rsk);
483 	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
484 	if (sk->sk_state != TCP_LISTEN &&
485 	    !between(seq, snd_una, tp->snd_nxt)) {
486 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
487 		goto out;
488 	}
489 
490 	switch (type) {
491 	case ICMP_REDIRECT:
492 		if (!sock_owned_by_user(sk))
493 			do_redirect(icmp_skb, sk);
494 		goto out;
495 	case ICMP_SOURCE_QUENCH:
496 		/* Just silently ignore these. */
497 		goto out;
498 	case ICMP_PARAMETERPROB:
499 		err = EPROTO;
500 		break;
501 	case ICMP_DEST_UNREACH:
502 		if (code > NR_ICMP_UNREACH)
503 			goto out;
504 
505 		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
506 			/* We are not interested in TCP_LISTEN and open_requests
507 			 * (SYN-ACKs sent out by Linux are always < 576 bytes, so
508 			 * they should go through unfragmented).
509 			 */
510 			if (sk->sk_state == TCP_LISTEN)
511 				goto out;
512 
513 			tp->mtu_info = info;
514 			if (!sock_owned_by_user(sk)) {
515 				tcp_v4_mtu_reduced(sk);
516 			} else {
517 				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
518 					sock_hold(sk);
519 			}
520 			goto out;
521 		}
522 
523 		err = icmp_err_convert[code].errno;
524 		/* check if icmp_skb allows revert of backoff
525 		 * (see draft-zimmermann-tcp-lcd) */
526 		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
527 			break;
528 		if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
529 		    !icsk->icsk_backoff || fastopen)
530 			break;
531 
532 		if (sock_owned_by_user(sk))
533 			break;
534 
535 		skb = tcp_rtx_queue_head(sk);
536 		if (WARN_ON_ONCE(!skb))
537 			break;
538 
539 		icsk->icsk_backoff--;
540 		icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
541 					       TCP_TIMEOUT_INIT;
542 		icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
543 
544 
545 		tcp_mstamp_refresh(tp);
546 		delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
547 		remaining = icsk->icsk_rto -
548 			    usecs_to_jiffies(delta_us);
549 
550 		if (remaining > 0) {
551 			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
552 						  remaining, TCP_RTO_MAX);
553 		} else {
554 			/* RTO revert clocked out retransmission.
555 			 * Will retransmit now */
556 			tcp_retransmit_timer(sk);
557 		}
558 
559 		break;
560 	case ICMP_TIME_EXCEEDED:
561 		err = EHOSTUNREACH;
562 		break;
563 	default:
564 		goto out;
565 	}
566 
567 	switch (sk->sk_state) {
568 	case TCP_SYN_SENT:
569 	case TCP_SYN_RECV:
570 		/* Only in fast or simultaneous open. If a fast open socket
571 		 * is already accepted it is treated as a connected one below.
572 		 */
573 		if (fastopen && !fastopen->sk)
574 			break;
575 
576 		if (!sock_owned_by_user(sk)) {
577 			sk->sk_err = err;
578 
579 			sk->sk_error_report(sk);
580 
581 			tcp_done(sk);
582 		} else {
583 			sk->sk_err_soft = err;
584 		}
585 		goto out;
586 	}
587 
588 	/* If we've already connected we will keep trying
589 	 * until we time out, or the user gives up.
590 	 *
591 	 * RFC 1122 4.2.3.9 allows us to consider as hard errors
592 	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
593 	 * but it is obsoleted by PMTU discovery).
594 	 *
595 	 * Note that in the modern internet, where routing is unreliable
596 	 * and broken firewalls sit in every dark corner, sending random
597 	 * errors ordered by their masters, even these two messages finally lose
598 	 * their original sense (even Linux sends invalid PORT_UNREACHs).
599 	 *
600 	 * Now we are in compliance with RFCs.
601 	 *							--ANK (980905)
602 	 */
603 
604 	inet = inet_sk(sk);
605 	if (!sock_owned_by_user(sk) && inet->recverr) {
606 		sk->sk_err = err;
607 		sk->sk_error_report(sk);
608 	} else	{ /* Only an error on timeout */
609 		sk->sk_err_soft = err;
610 	}
611 
612 out:
613 	bh_unlock_sock(sk);
614 	sock_put(sk);
615 	return 0;
616 }
617 
618 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
619 {
620 	struct tcphdr *th = tcp_hdr(skb);
621 
622 	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
623 	skb->csum_start = skb_transport_header(skb) - skb->head;
624 	skb->csum_offset = offsetof(struct tcphdr, check);
625 }
626 
627 /* This routine computes an IPv4 TCP checksum. */
628 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
629 {
630 	const struct inet_sock *inet = inet_sk(sk);
631 
632 	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
633 }
634 EXPORT_SYMBOL(tcp_v4_send_check);
635 
636 /*
637  *	This routine will send an RST to the other tcp.
638  *
639  *	Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
640  *		      for the reset?
641  *	Answer: if a packet caused the RST, it is not for a socket
642  *		existing in our system; if it is matched to a socket,
643  *		it is just a duplicate segment or a bug in the other side's TCP.
644  *		So we build the reply based only on the parameters that
645  *		arrived with the segment.
646  *	Exception: precedence violation. We do not implement it in any case.
647  */
648 
649 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
650 {
651 	const struct tcphdr *th = tcp_hdr(skb);
652 	struct {
653 		struct tcphdr th;
654 #ifdef CONFIG_TCP_MD5SIG
655 		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
656 #endif
657 	} rep;
658 	struct ip_reply_arg arg;
659 #ifdef CONFIG_TCP_MD5SIG
660 	struct tcp_md5sig_key *key = NULL;
661 	const __u8 *hash_location = NULL;
662 	unsigned char newhash[16];
663 	int genhash;
664 	struct sock *sk1 = NULL;
665 #endif
666 	u64 transmit_time = 0;
667 	struct sock *ctl_sk;
668 	struct net *net;
669 
670 	/* Never send a reset in response to a reset. */
671 	if (th->rst)
672 		return;
673 
674 	/* If sk is not NULL, it means we did a successful lookup and the incoming
675 	 * route had to be correct. prequeue might have dropped our dst.
676 	 */
677 	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
678 		return;
679 
680 	/* Swap the send and the receive. */
681 	memset(&rep, 0, sizeof(rep));
682 	rep.th.dest   = th->source;
683 	rep.th.source = th->dest;
684 	rep.th.doff   = sizeof(struct tcphdr) / 4;
685 	rep.th.rst    = 1;
686 
687 	if (th->ack) {
688 		rep.th.seq = th->ack_seq;
689 	} else {
690 		rep.th.ack = 1;
691 		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
692 				       skb->len - (th->doff << 2));
693 	}
694 
695 	memset(&arg, 0, sizeof(arg));
696 	arg.iov[0].iov_base = (unsigned char *)&rep;
697 	arg.iov[0].iov_len  = sizeof(rep.th);
698 
699 	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
700 #ifdef CONFIG_TCP_MD5SIG
701 	rcu_read_lock();
702 	hash_location = tcp_parse_md5sig_option(th);
703 	if (sk && sk_fullsock(sk)) {
704 		key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
705 					&ip_hdr(skb)->saddr, AF_INET);
706 	} else if (hash_location) {
707 		/*
708 		 * The active side is lost. Try to find the listening socket through
709 		 * the source port, and then find the md5 key through that socket.
710 		 * We do not lose security here:
711 		 * the incoming packet is checked with the md5 hash of the found key;
712 		 * no RST is generated if the md5 hash doesn't match.
713 		 */
714 		sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
715 					     ip_hdr(skb)->saddr,
716 					     th->source, ip_hdr(skb)->daddr,
717 					     ntohs(th->source), inet_iif(skb),
718 					     tcp_v4_sdif(skb));
719 		/* don't send an rst if we can't find a key */
720 		if (!sk1)
721 			goto out;
722 
723 		key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
724 					&ip_hdr(skb)->saddr, AF_INET);
725 		if (!key)
726 			goto out;
727 
728 
729 		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
730 		if (genhash || memcmp(hash_location, newhash, 16) != 0)
731 			goto out;
732 
733 	}
734 
735 	if (key) {
736 		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
737 				   (TCPOPT_NOP << 16) |
738 				   (TCPOPT_MD5SIG << 8) |
739 				   TCPOLEN_MD5SIG);
740 		/* Update length and the length the header thinks exists */
741 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
742 		rep.th.doff = arg.iov[0].iov_len / 4;
743 
744 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
745 				     key, ip_hdr(skb)->saddr,
746 				     ip_hdr(skb)->daddr, &rep.th);
747 	}
748 #endif
749 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
750 				      ip_hdr(skb)->saddr, /* XXX */
751 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
752 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
753 	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
754 
755 	/* When the socket is gone, all binding information is lost and
756 	 * routing might fail in this case. No choice here: if we choose to force
757 	 * the input interface, we will misroute in case of an asymmetric route.
758 	 */
759 	if (sk) {
760 		arg.bound_dev_if = sk->sk_bound_dev_if;
761 		if (sk_fullsock(sk))
762 			trace_tcp_send_reset(sk, skb);
763 	}
764 
765 	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
766 		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));
767 
768 	arg.tos = ip_hdr(skb)->tos;
769 	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
770 	local_bh_disable();
771 	ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
772 	if (sk) {
773 		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
774 				   inet_twsk(sk)->tw_mark : sk->sk_mark;
775 		ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
776 				   inet_twsk(sk)->tw_priority : sk->sk_priority;
777 		transmit_time = tcp_transmit_time(sk);
778 	}
779 	ip_send_unicast_reply(ctl_sk,
780 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
781 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
782 			      &arg, arg.iov[0].iov_len,
783 			      transmit_time);
784 
785 	ctl_sk->sk_mark = 0;
786 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
787 	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
788 	local_bh_enable();
789 
790 #ifdef CONFIG_TCP_MD5SIG
791 out:
792 	rcu_read_unlock();
793 #endif
794 }
795 
796 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
797    outside socket context, is certainly ugly. What can I do?
798  */
799 
800 static void tcp_v4_send_ack(const struct sock *sk,
801 			    struct sk_buff *skb, u32 seq, u32 ack,
802 			    u32 win, u32 tsval, u32 tsecr, int oif,
803 			    struct tcp_md5sig_key *key,
804 			    int reply_flags, u8 tos)
805 {
806 	const struct tcphdr *th = tcp_hdr(skb);
807 	struct {
808 		struct tcphdr th;
809 		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
810 #ifdef CONFIG_TCP_MD5SIG
811 			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
812 #endif
813 			];
814 	} rep;
815 	struct net *net = sock_net(sk);
816 	struct ip_reply_arg arg;
817 	struct sock *ctl_sk;
818 	u64 transmit_time;
819 
820 	memset(&rep.th, 0, sizeof(struct tcphdr));
821 	memset(&arg, 0, sizeof(arg));
822 
823 	arg.iov[0].iov_base = (unsigned char *)&rep;
824 	arg.iov[0].iov_len  = sizeof(rep.th);
825 	if (tsecr) {
826 		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
827 				   (TCPOPT_TIMESTAMP << 8) |
828 				   TCPOLEN_TIMESTAMP);
829 		rep.opt[1] = htonl(tsval);
830 		rep.opt[2] = htonl(tsecr);
831 		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
832 	}
833 
834 	/* Swap the send and the receive. */
835 	rep.th.dest    = th->source;
836 	rep.th.source  = th->dest;
837 	rep.th.doff    = arg.iov[0].iov_len / 4;
838 	rep.th.seq     = htonl(seq);
839 	rep.th.ack_seq = htonl(ack);
840 	rep.th.ack     = 1;
841 	rep.th.window  = htons(win);
842 
843 #ifdef CONFIG_TCP_MD5SIG
844 	if (key) {
845 		int offset = (tsecr) ? 3 : 0;
846 
847 		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
848 					  (TCPOPT_NOP << 16) |
849 					  (TCPOPT_MD5SIG << 8) |
850 					  TCPOLEN_MD5SIG);
851 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
852 		rep.th.doff = arg.iov[0].iov_len/4;
853 
854 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
855 				    key, ip_hdr(skb)->saddr,
856 				    ip_hdr(skb)->daddr, &rep.th);
857 	}
858 #endif
859 	arg.flags = reply_flags;
860 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
861 				      ip_hdr(skb)->saddr, /* XXX */
862 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
863 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
864 	if (oif)
865 		arg.bound_dev_if = oif;
866 	arg.tos = tos;
867 	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
868 	local_bh_disable();
869 	ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
870 	ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
871 			   inet_twsk(sk)->tw_mark : sk->sk_mark;
872 	ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
873 			   inet_twsk(sk)->tw_priority : sk->sk_priority;
874 	transmit_time = tcp_transmit_time(sk);
875 	ip_send_unicast_reply(ctl_sk,
876 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
877 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
878 			      &arg, arg.iov[0].iov_len,
879 			      transmit_time);
880 
881 	ctl_sk->sk_mark = 0;
882 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
883 	local_bh_enable();
884 }
885 
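/* ACK a segment received for a TIME-WAIT socket, echoing the saved sequence,
 * window and timestamp state, then drop the TIME-WAIT reference.
 */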
886 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
887 {
888 	struct inet_timewait_sock *tw = inet_twsk(sk);
889 	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
890 
891 	tcp_v4_send_ack(sk, skb,
892 			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
893 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
894 			tcp_time_stamp_raw() + tcptw->tw_ts_offset,
895 			tcptw->tw_ts_recent,
896 			tw->tw_bound_dev_if,
897 			tcp_twsk_md5_key(tcptw),
898 			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
899 			tw->tw_tos
900 			);
901 
902 	inet_twsk_put(tw);
903 }
904 
905 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
906 				  struct request_sock *req)
907 {
908 	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
909 	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
910 	 */
911 	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
912 					     tcp_sk(sk)->snd_nxt;
913 
914 	/* RFC 7323 2.3
915 	 * The window field (SEG.WND) of every outgoing segment, with the
916 	 * exception of <SYN> segments, MUST be right-shifted by
917 	 * Rcv.Wind.Shift bits:
918 	 */
919 	tcp_v4_send_ack(sk, skb, seq,
920 			tcp_rsk(req)->rcv_nxt,
921 			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
922 			tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
923 			req->ts_recent,
924 			0,
925 			tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->saddr,
926 					  AF_INET),
927 			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
928 			ip_hdr(skb)->tos);
929 }
930 
931 /*
932  *	Send a SYN-ACK after having received a SYN.
933  *	This still operates on a request_sock only, not on a big
934  *	socket.
935  */
936 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
937 			      struct flowi *fl,
938 			      struct request_sock *req,
939 			      struct tcp_fastopen_cookie *foc,
940 			      enum tcp_synack_type synack_type)
941 {
942 	const struct inet_request_sock *ireq = inet_rsk(req);
943 	struct flowi4 fl4;
944 	int err = -1;
945 	struct sk_buff *skb;
946 
947 	/* First, grab a route. */
948 	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
949 		return -1;
950 
951 	skb = tcp_make_synack(sk, dst, req, foc, synack_type);
952 
953 	if (skb) {
954 		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
955 
956 		rcu_read_lock();
957 		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
958 					    ireq->ir_rmt_addr,
959 					    rcu_dereference(ireq->ireq_opt));
960 		rcu_read_unlock();
961 		err = net_xmit_eval(err);
962 	}
963 
964 	return err;
965 }
966 
967 /*
968  *	IPv4 request_sock destructor.
969  */
970 static void tcp_v4_reqsk_destructor(struct request_sock *req)
971 {
972 	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
973 }
974 
975 #ifdef CONFIG_TCP_MD5SIG
976 /*
977  * RFC2385 MD5 checksumming requires a mapping of
978  * IP address->MD5 Key.
979  * We need to maintain these in the sk structure.
980  */
981 
982 DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
983 EXPORT_SYMBOL(tcp_md5_needed);
984 
985 /* Find the Key structure for an address.  */
986 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk,
987 					   const union tcp_md5_addr *addr,
988 					   int family)
989 {
990 	const struct tcp_sock *tp = tcp_sk(sk);
991 	struct tcp_md5sig_key *key;
992 	const struct tcp_md5sig_info *md5sig;
993 	__be32 mask;
994 	struct tcp_md5sig_key *best_match = NULL;
995 	bool match;
996 
997 	/* caller either holds rcu_read_lock() or socket lock */
998 	md5sig = rcu_dereference_check(tp->md5sig_info,
999 				       lockdep_sock_is_held(sk));
1000 	if (!md5sig)
1001 		return NULL;
1002 
1003 	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
1004 		if (key->family != family)
1005 			continue;
1006 
1007 		if (family == AF_INET) {
1008 			mask = inet_make_mask(key->prefixlen);
1009 			match = (key->addr.a4.s_addr & mask) ==
1010 				(addr->a4.s_addr & mask);
1011 #if IS_ENABLED(CONFIG_IPV6)
1012 		} else if (family == AF_INET6) {
1013 			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1014 						  key->prefixlen);
1015 #endif
1016 		} else {
1017 			match = false;
1018 		}
1019 
1020 		if (match && (!best_match ||
1021 			      key->prefixlen > best_match->prefixlen))
1022 			best_match = key;
1023 	}
1024 	return best_match;
1025 }
1026 EXPORT_SYMBOL(__tcp_md5_do_lookup);
1027 
1028 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1029 						      const union tcp_md5_addr *addr,
1030 						      int family, u8 prefixlen)
1031 {
1032 	const struct tcp_sock *tp = tcp_sk(sk);
1033 	struct tcp_md5sig_key *key;
1034 	unsigned int size = sizeof(struct in_addr);
1035 	const struct tcp_md5sig_info *md5sig;
1036 
1037 	/* caller either holds rcu_read_lock() or socket lock */
1038 	md5sig = rcu_dereference_check(tp->md5sig_info,
1039 				       lockdep_sock_is_held(sk));
1040 	if (!md5sig)
1041 		return NULL;
1042 #if IS_ENABLED(CONFIG_IPV6)
1043 	if (family == AF_INET6)
1044 		size = sizeof(struct in6_addr);
1045 #endif
1046 	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
1047 		if (key->family != family)
1048 			continue;
1049 		if (!memcmp(&key->addr, addr, size) &&
1050 		    key->prefixlen == prefixlen)
1051 			return key;
1052 	}
1053 	return NULL;
1054 }
1055 
1056 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1057 					 const struct sock *addr_sk)
1058 {
1059 	const union tcp_md5_addr *addr;
1060 
1061 	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1062 	return tcp_md5_do_lookup(sk, addr, AF_INET);
1063 }
1064 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1065 
1066 /* This can be called on a newly created socket, from other files */
1067 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1068 		   int family, u8 prefixlen, const u8 *newkey, u8 newkeylen,
1069 		   gfp_t gfp)
1070 {
1071 	/* Add Key to the list */
1072 	struct tcp_md5sig_key *key;
1073 	struct tcp_sock *tp = tcp_sk(sk);
1074 	struct tcp_md5sig_info *md5sig;
1075 
1076 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
1077 	if (key) {
1078 		/* Pre-existing entry - just update that one. */
1079 		memcpy(key->key, newkey, newkeylen);
1080 		key->keylen = newkeylen;
1081 		return 0;
1082 	}
1083 
1084 	md5sig = rcu_dereference_protected(tp->md5sig_info,
1085 					   lockdep_sock_is_held(sk));
1086 	if (!md5sig) {
1087 		md5sig = kmalloc(sizeof(*md5sig), gfp);
1088 		if (!md5sig)
1089 			return -ENOMEM;
1090 
1091 		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1092 		INIT_HLIST_HEAD(&md5sig->head);
1093 		rcu_assign_pointer(tp->md5sig_info, md5sig);
1094 	}
1095 
1096 	key = sock_kmalloc(sk, sizeof(*key), gfp);
1097 	if (!key)
1098 		return -ENOMEM;
1099 	if (!tcp_alloc_md5sig_pool()) {
1100 		sock_kfree_s(sk, key, sizeof(*key));
1101 		return -ENOMEM;
1102 	}
1103 
1104 	memcpy(key->key, newkey, newkeylen);
1105 	key->keylen = newkeylen;
1106 	key->family = family;
1107 	key->prefixlen = prefixlen;
1108 	memcpy(&key->addr, addr,
1109 	       (family == AF_INET6) ? sizeof(struct in6_addr) :
1110 				      sizeof(struct in_addr));
1111 	hlist_add_head_rcu(&key->node, &md5sig->head);
1112 	return 0;
1113 }
1114 EXPORT_SYMBOL(tcp_md5_do_add);
1115 
1116 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1117 		   u8 prefixlen)
1118 {
1119 	struct tcp_md5sig_key *key;
1120 
1121 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
1122 	if (!key)
1123 		return -ENOENT;
1124 	hlist_del_rcu(&key->node);
1125 	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1126 	kfree_rcu(key, rcu);
1127 	return 0;
1128 }
1129 EXPORT_SYMBOL(tcp_md5_do_del);
1130 
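/* Remove and free every MD5 key attached to this socket. */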
1131 static void tcp_clear_md5_list(struct sock *sk)
1132 {
1133 	struct tcp_sock *tp = tcp_sk(sk);
1134 	struct tcp_md5sig_key *key;
1135 	struct hlist_node *n;
1136 	struct tcp_md5sig_info *md5sig;
1137 
1138 	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1139 
1140 	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1141 		hlist_del_rcu(&key->node);
1142 		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1143 		kfree_rcu(key, rcu);
1144 	}
1145 }
1146 
1147 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1148 				 char __user *optval, int optlen)
1149 {
1150 	struct tcp_md5sig cmd;
1151 	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1152 	u8 prefixlen = 32;
1153 
1154 	if (optlen < sizeof(cmd))
1155 		return -EINVAL;
1156 
1157 	if (copy_from_user(&cmd, optval, sizeof(cmd)))
1158 		return -EFAULT;
1159 
1160 	if (sin->sin_family != AF_INET)
1161 		return -EINVAL;
1162 
1163 	if (optname == TCP_MD5SIG_EXT &&
1164 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1165 		prefixlen = cmd.tcpm_prefixlen;
1166 		if (prefixlen > 32)
1167 			return -EINVAL;
1168 	}
1169 
1170 	if (!cmd.tcpm_keylen)
1171 		return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1172 				      AF_INET, prefixlen);
1173 
1174 	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1175 		return -EINVAL;
1176 
1177 	return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1178 			      AF_INET, prefixlen, cmd.tcpm_key, cmd.tcpm_keylen,
1179 			      GFP_KERNEL);
1180 }
1181 
1182 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1183 				   __be32 daddr, __be32 saddr,
1184 				   const struct tcphdr *th, int nbytes)
1185 {
1186 	struct tcp4_pseudohdr *bp;
1187 	struct scatterlist sg;
1188 	struct tcphdr *_th;
1189 
1190 	bp = hp->scratch;
1191 	bp->saddr = saddr;
1192 	bp->daddr = daddr;
1193 	bp->pad = 0;
1194 	bp->protocol = IPPROTO_TCP;
1195 	bp->len = cpu_to_be16(nbytes);
1196 
1197 	_th = (struct tcphdr *)(bp + 1);
1198 	memcpy(_th, th, sizeof(*th));
1199 	_th->check = 0;
1200 
1201 	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1202 	ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1203 				sizeof(*bp) + sizeof(*th));
1204 	return crypto_ahash_update(hp->md5_req);
1205 }
1206 
1207 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1208 			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
1209 {
1210 	struct tcp_md5sig_pool *hp;
1211 	struct ahash_request *req;
1212 
1213 	hp = tcp_get_md5sig_pool();
1214 	if (!hp)
1215 		goto clear_hash_noput;
1216 	req = hp->md5_req;
1217 
1218 	if (crypto_ahash_init(req))
1219 		goto clear_hash;
1220 	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1221 		goto clear_hash;
1222 	if (tcp_md5_hash_key(hp, key))
1223 		goto clear_hash;
1224 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1225 	if (crypto_ahash_final(req))
1226 		goto clear_hash;
1227 
1228 	tcp_put_md5sig_pool();
1229 	return 0;
1230 
1231 clear_hash:
1232 	tcp_put_md5sig_pool();
1233 clear_hash_noput:
1234 	memset(md5_hash, 0, 16);
1235 	return 1;
1236 }
1237 
1238 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1239 			const struct sock *sk,
1240 			const struct sk_buff *skb)
1241 {
1242 	struct tcp_md5sig_pool *hp;
1243 	struct ahash_request *req;
1244 	const struct tcphdr *th = tcp_hdr(skb);
1245 	__be32 saddr, daddr;
1246 
1247 	if (sk) { /* valid for establish/request sockets */
1248 		saddr = sk->sk_rcv_saddr;
1249 		daddr = sk->sk_daddr;
1250 	} else {
1251 		const struct iphdr *iph = ip_hdr(skb);
1252 		saddr = iph->saddr;
1253 		daddr = iph->daddr;
1254 	}
1255 
1256 	hp = tcp_get_md5sig_pool();
1257 	if (!hp)
1258 		goto clear_hash_noput;
1259 	req = hp->md5_req;
1260 
1261 	if (crypto_ahash_init(req))
1262 		goto clear_hash;
1263 
1264 	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1265 		goto clear_hash;
1266 	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1267 		goto clear_hash;
1268 	if (tcp_md5_hash_key(hp, key))
1269 		goto clear_hash;
1270 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1271 	if (crypto_ahash_final(req))
1272 		goto clear_hash;
1273 
1274 	tcp_put_md5sig_pool();
1275 	return 0;
1276 
1277 clear_hash:
1278 	tcp_put_md5sig_pool();
1279 clear_hash_noput:
1280 	memset(md5_hash, 0, 16);
1281 	return 1;
1282 }
1283 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1284 
1285 #endif
1286 
1287 /* Called with rcu_read_lock() */
1288 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1289 				    const struct sk_buff *skb)
1290 {
1291 #ifdef CONFIG_TCP_MD5SIG
1292 	/*
1293 	 * This gets called for each TCP segment that arrives
1294 	 * so we want to be efficient.
1295 	 * We have 3 drop cases:
1296 	 * o No MD5 hash and one expected.
1297 	 * o MD5 hash and we're not expecting one.
1298 	 * o MD5 hash and it's wrong.
1299 	 */
1300 	const __u8 *hash_location = NULL;
1301 	struct tcp_md5sig_key *hash_expected;
1302 	const struct iphdr *iph = ip_hdr(skb);
1303 	const struct tcphdr *th = tcp_hdr(skb);
1304 	int genhash;
1305 	unsigned char newhash[16];
1306 
1307 	hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1308 					  AF_INET);
1309 	hash_location = tcp_parse_md5sig_option(th);
1310 
1311 	/* We've parsed the options - do we have a hash? */
1312 	if (!hash_expected && !hash_location)
1313 		return false;
1314 
1315 	if (hash_expected && !hash_location) {
1316 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1317 		return true;
1318 	}
1319 
1320 	if (!hash_expected && hash_location) {
1321 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1322 		return true;
1323 	}
1324 
1325 	/* Okay, so this is hash_expected and hash_location -
1326 	 * so we need to calculate the checksum.
1327 	 */
1328 	genhash = tcp_v4_md5_hash_skb(newhash,
1329 				      hash_expected,
1330 				      NULL, skb);
1331 
1332 	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1333 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1334 		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1335 				     &iph->saddr, ntohs(th->source),
1336 				     &iph->daddr, ntohs(th->dest),
1337 				     genhash ? " tcp_v4_calc_md5_hash failed"
1338 				     : "");
1339 		return true;
1340 	}
1341 	return false;
1342 #endif
1343 	return false;
1344 }
1345 
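/* Initialize the request sock from the incoming SYN: record the local and
 * remote addresses and stash any IP options carried by the packet.
 */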
1346 static void tcp_v4_init_req(struct request_sock *req,
1347 			    const struct sock *sk_listener,
1348 			    struct sk_buff *skb)
1349 {
1350 	struct inet_request_sock *ireq = inet_rsk(req);
1351 	struct net *net = sock_net(sk_listener);
1352 
1353 	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1354 	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1355 	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1356 }
1357 
1358 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1359 					  struct flowi *fl,
1360 					  const struct request_sock *req)
1361 {
1362 	return inet_csk_route_req(sk, &fl->u.ip4, req);
1363 }
1364 
1365 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1366 	.family		=	PF_INET,
1367 	.obj_size	=	sizeof(struct tcp_request_sock),
1368 	.rtx_syn_ack	=	tcp_rtx_synack,
1369 	.send_ack	=	tcp_v4_reqsk_send_ack,
1370 	.destructor	=	tcp_v4_reqsk_destructor,
1371 	.send_reset	=	tcp_v4_send_reset,
1372 	.syn_ack_timeout =	tcp_syn_ack_timeout,
1373 };
1374 
1375 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1376 	.mss_clamp	=	TCP_MSS_DEFAULT,
1377 #ifdef CONFIG_TCP_MD5SIG
1378 	.req_md5_lookup	=	tcp_v4_md5_lookup,
1379 	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1380 #endif
1381 	.init_req	=	tcp_v4_init_req,
1382 #ifdef CONFIG_SYN_COOKIES
1383 	.cookie_init_seq =	cookie_v4_init_sequence,
1384 #endif
1385 	.route_req	=	tcp_v4_route_req,
1386 	.init_seq	=	tcp_v4_init_seq,
1387 	.init_ts_off	=	tcp_v4_init_ts_off,
1388 	.send_synack	=	tcp_v4_send_synack,
1389 };
1390 
1391 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1392 {
1393 	/* Never answer SYNs sent to broadcast or multicast */
1394 	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1395 		goto drop;
1396 
1397 	return tcp_conn_request(&tcp_request_sock_ops,
1398 				&tcp_request_sock_ipv4_ops, sk, skb);
1399 
1400 drop:
1401 	tcp_listendrop(sk);
1402 	return 0;
1403 }
1404 EXPORT_SYMBOL(tcp_v4_conn_request);
1405 
1406 
1407 /*
1408  * The three way handshake has completed - we got a valid synack -
1409  * now create the new socket.
1410  */
1411 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1412 				  struct request_sock *req,
1413 				  struct dst_entry *dst,
1414 				  struct request_sock *req_unhash,
1415 				  bool *own_req)
1416 {
1417 	struct inet_request_sock *ireq;
1418 	struct inet_sock *newinet;
1419 	struct tcp_sock *newtp;
1420 	struct sock *newsk;
1421 #ifdef CONFIG_TCP_MD5SIG
1422 	struct tcp_md5sig_key *key;
1423 #endif
1424 	struct ip_options_rcu *inet_opt;
1425 
1426 	if (sk_acceptq_is_full(sk))
1427 		goto exit_overflow;
1428 
1429 	newsk = tcp_create_openreq_child(sk, req, skb);
1430 	if (!newsk)
1431 		goto exit_nonewsk;
1432 
1433 	newsk->sk_gso_type = SKB_GSO_TCPV4;
1434 	inet_sk_rx_dst_set(newsk, skb);
1435 
1436 	newtp		      = tcp_sk(newsk);
1437 	newinet		      = inet_sk(newsk);
1438 	ireq		      = inet_rsk(req);
1439 	sk_daddr_set(newsk, ireq->ir_rmt_addr);
1440 	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1441 	newsk->sk_bound_dev_if = ireq->ir_iif;
1442 	newinet->inet_saddr   = ireq->ir_loc_addr;
1443 	inet_opt	      = rcu_dereference(ireq->ireq_opt);
1444 	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1445 	newinet->mc_index     = inet_iif(skb);
1446 	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1447 	newinet->rcv_tos      = ip_hdr(skb)->tos;
1448 	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1449 	if (inet_opt)
1450 		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1451 	newinet->inet_id = prandom_u32();
1452 
1453 	if (!dst) {
1454 		dst = inet_csk_route_child_sock(sk, newsk, req);
1455 		if (!dst)
1456 			goto put_and_exit;
1457 	} else {
1458 		/* syncookie case : see end of cookie_v4_check() */
1459 	}
1460 	sk_setup_caps(newsk, dst);
1461 
1462 	tcp_ca_openreq_child(newsk, dst);
1463 
1464 	tcp_sync_mss(newsk, dst_mtu(dst));
1465 	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1466 
1467 	tcp_initialize_rcv_mss(newsk);
1468 
1469 #ifdef CONFIG_TCP_MD5SIG
1470 	/* Copy over the MD5 key from the original socket */
1471 	key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1472 				AF_INET);
1473 	if (key) {
1474 		/*
1475 		 * We're using one, so create a matching key
1476 		 * on the newsk structure. If we fail to get
1477 		 * memory, then we end up not copying the key
1478 		 * across. Shucks.
1479 		 */
1480 		tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1481 			       AF_INET, 32, key->key, key->keylen, GFP_ATOMIC);
1482 		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1483 	}
1484 #endif
1485 
1486 	if (__inet_inherit_port(sk, newsk) < 0)
1487 		goto put_and_exit;
1488 	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
1489 	if (likely(*own_req)) {
1490 		tcp_move_syn(newtp, req);
1491 		ireq->ireq_opt = NULL;
1492 	} else {
1493 		newinet->inet_opt = NULL;
1494 	}
1495 	return newsk;
1496 
1497 exit_overflow:
1498 	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1499 exit_nonewsk:
1500 	dst_release(dst);
1501 exit:
1502 	tcp_listendrop(sk);
1503 	return NULL;
1504 put_and_exit:
1505 	newinet->inet_opt = NULL;
1506 	inet_csk_prepare_forced_close(newsk);
1507 	tcp_done(newsk);
1508 	goto exit;
1509 }
1510 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1511 
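/* On a listener, a non-SYN segment may complete a syncookie handshake:
 * validate the cookie and create the child socket if so (no-op when
 * CONFIG_SYN_COOKIES is not set).
 */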
1512 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1513 {
1514 #ifdef CONFIG_SYN_COOKIES
1515 	const struct tcphdr *th = tcp_hdr(skb);
1516 
1517 	if (!th->syn)
1518 		sk = cookie_v4_check(sk, skb);
1519 #endif
1520 	return sk;
1521 }
1522 
1523 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1524 			 struct tcphdr *th, u32 *cookie)
1525 {
1526 	u16 mss = 0;
1527 #ifdef CONFIG_SYN_COOKIES
1528 	mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1529 				    &tcp_request_sock_ipv4_ops, sk, th);
1530 	if (mss) {
1531 		*cookie = __cookie_v4_init_sequence(iph, th, &mss);
1532 		tcp_synq_overflow(sk);
1533 	}
1534 #endif
1535 	return mss;
1536 }
1537 
1538 /* The socket must have its spinlock held when we get
1539  * here, unless it is a TCP_LISTEN socket.
1540  *
1541  * We have a potential double-lock case here, so even when
1542  * doing backlog processing we use the BH locking scheme.
1543  * This is because we cannot sleep with the original spinlock
1544  * held.
1545  */
1546 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1547 {
1548 	struct sock *rsk;
1549 
1550 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1551 		struct dst_entry *dst = sk->sk_rx_dst;
1552 
1553 		sock_rps_save_rxhash(sk, skb);
1554 		sk_mark_napi_id(sk, skb);
1555 		if (dst) {
1556 			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1557 			    !dst->ops->check(dst, 0)) {
1558 				dst_release(dst);
1559 				sk->sk_rx_dst = NULL;
1560 			}
1561 		}
1562 		tcp_rcv_established(sk, skb);
1563 		return 0;
1564 	}
1565 
1566 	if (tcp_checksum_complete(skb))
1567 		goto csum_err;
1568 
1569 	if (sk->sk_state == TCP_LISTEN) {
1570 		struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1571 
1572 		if (!nsk)
1573 			goto discard;
1574 		if (nsk != sk) {
1575 			if (tcp_child_process(sk, nsk, skb)) {
1576 				rsk = nsk;
1577 				goto reset;
1578 			}
1579 			return 0;
1580 		}
1581 	} else
1582 		sock_rps_save_rxhash(sk, skb);
1583 
1584 	if (tcp_rcv_state_process(sk, skb)) {
1585 		rsk = sk;
1586 		goto reset;
1587 	}
1588 	return 0;
1589 
1590 reset:
1591 	tcp_v4_send_reset(rsk, skb);
1592 discard:
1593 	kfree_skb(skb);
1594 	/* Be careful here. If this function gets more complicated and
1595 	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1596 	 * might be destroyed here. This current version compiles correctly,
1597 	 * but you have been warned.
1598 	 */
1599 	return 0;
1600 
1601 csum_err:
1602 	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1603 	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1604 	goto discard;
1605 }
1606 EXPORT_SYMBOL(tcp_v4_do_rcv);
1607 
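/* Early demux: look up an established socket before routing so that its
 * cached input route (sk_rx_dst) can be reused for this packet.
 */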
1608 int tcp_v4_early_demux(struct sk_buff *skb)
1609 {
1610 	const struct iphdr *iph;
1611 	const struct tcphdr *th;
1612 	struct sock *sk;
1613 
1614 	if (skb->pkt_type != PACKET_HOST)
1615 		return 0;
1616 
1617 	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1618 		return 0;
1619 
1620 	iph = ip_hdr(skb);
1621 	th = tcp_hdr(skb);
1622 
1623 	if (th->doff < sizeof(struct tcphdr) / 4)
1624 		return 0;
1625 
1626 	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1627 				       iph->saddr, th->source,
1628 				       iph->daddr, ntohs(th->dest),
1629 				       skb->skb_iif, inet_sdif(skb));
1630 	if (sk) {
1631 		skb->sk = sk;
1632 		skb->destructor = sock_edemux;
1633 		if (sk_fullsock(sk)) {
1634 			struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1635 
1636 			if (dst)
1637 				dst = dst_check(dst, 0);
1638 			if (dst &&
1639 			    inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1640 				skb_dst_set_noref(skb, dst);
1641 		}
1642 	}
1643 	return 0;
1644 }
1645 
1646 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1647 {
1648 	u32 limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf);
1649 	struct skb_shared_info *shinfo;
1650 	const struct tcphdr *th;
1651 	struct tcphdr *thtail;
1652 	struct sk_buff *tail;
1653 	unsigned int hdrlen;
1654 	bool fragstolen;
1655 	u32 gso_segs;
1656 	int delta;
1657 
1658 	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1659 	 * we can fix skb->truesize to its real value to avoid future drops.
1660 	 * This is valid because skb is not yet charged to the socket.
1661 	 * It has been noticed that pure SACK packets were sometimes dropped
1662 	 * (if cooked by drivers without copybreak feature).
1663 	 */
1664 	skb_condense(skb);
1665 
1666 	skb_dst_drop(skb);
1667 
1668 	if (unlikely(tcp_checksum_complete(skb))) {
1669 		bh_unlock_sock(sk);
1670 		__TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1671 		__TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1672 		return true;
1673 	}
1674 
1675 	/* Attempt coalescing to last skb in backlog, even if we are
1676 	 * above the limits.
1677 	 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1678 	 */
1679 	th = (const struct tcphdr *)skb->data;
1680 	hdrlen = th->doff * 4;
1681 	shinfo = skb_shinfo(skb);
1682 
1683 	if (!shinfo->gso_size)
1684 		shinfo->gso_size = skb->len - hdrlen;
1685 
1686 	if (!shinfo->gso_segs)
1687 		shinfo->gso_segs = 1;
1688 
1689 	tail = sk->sk_backlog.tail;
1690 	if (!tail)
1691 		goto no_coalesce;
1692 	thtail = (struct tcphdr *)tail->data;
1693 
1694 	if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1695 	    TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1696 	    ((TCP_SKB_CB(tail)->tcp_flags |
1697 	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1698 	    !((TCP_SKB_CB(tail)->tcp_flags &
1699 	      TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1700 	    ((TCP_SKB_CB(tail)->tcp_flags ^
1701 	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1702 #ifdef CONFIG_TLS_DEVICE
1703 	    tail->decrypted != skb->decrypted ||
1704 #endif
1705 	    thtail->doff != th->doff ||
1706 	    memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1707 		goto no_coalesce;
1708 
1709 	__skb_pull(skb, hdrlen);
1710 	if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1711 		thtail->window = th->window;
1712 
1713 		TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1714 
1715 		if (after(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))
1716 			TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1717 
1718 		/* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1719 		 * thtail->fin, so that the fast path in tcp_rcv_established()
1720 		 * is not entered if we append a packet with a FIN.
1721 		 * SYN, RST, URG are not present.
1722 		 * ACK is set on both packets.
1723 		 * PSH : we do not really care in TCP stack,
1724 		 *       at least for 'GRO' packets.
1725 		 */
1726 		thtail->fin |= th->fin;
1727 		TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1728 
1729 		if (TCP_SKB_CB(skb)->has_rxtstamp) {
1730 			TCP_SKB_CB(tail)->has_rxtstamp = true;
1731 			tail->tstamp = skb->tstamp;
1732 			skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1733 		}
1734 
1735 		/* Not as strict as GRO. We only need to carry mss max value */
1736 		skb_shinfo(tail)->gso_size = max(shinfo->gso_size,
1737 						 skb_shinfo(tail)->gso_size);
1738 
1739 		gso_segs = skb_shinfo(tail)->gso_segs + shinfo->gso_segs;
1740 		skb_shinfo(tail)->gso_segs = min_t(u32, gso_segs, 0xFFFF);
1741 
1742 		sk->sk_backlog.len += delta;
1743 		__NET_INC_STATS(sock_net(sk),
1744 				LINUX_MIB_TCPBACKLOGCOALESCE);
1745 		kfree_skb_partial(skb, fragstolen);
1746 		return false;
1747 	}
1748 	__skb_push(skb, hdrlen);
1749 
1750 no_coalesce:
1751 	/* Only socket owner can try to collapse/prune rx queues
1752 	 * to reduce memory overhead, so add a little headroom here.
1753 	 * Few socket backlogs are possibly concurrently non-empty.
1754 	 */
1755 	limit += 64*1024;
1756 
1757 	if (unlikely(sk_add_backlog(sk, skb, limit))) {
1758 		bh_unlock_sock(sk);
1759 		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1760 		return true;
1761 	}
1762 	return false;
1763 }
1764 EXPORT_SYMBOL(tcp_add_backlog);
1765 
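/* Run the socket filter on the skb, never trimming it below the TCP header. */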
1766 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1767 {
1768 	struct tcphdr *th = (struct tcphdr *)skb->data;
1769 
1770 	return sk_filter_trim_cap(sk, skb, th->doff * 4);
1771 }
1772 EXPORT_SYMBOL(tcp_filter);
1773 
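/* Undo tcp_v4_fill_cb(): restore the IP control block that was saved in
 * TCP_SKB_CB() so the skb can be looked up or handed off again.
 */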
1774 static void tcp_v4_restore_cb(struct sk_buff *skb)
1775 {
1776 	memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1777 		sizeof(struct inet_skb_parm));
1778 }
1779 
1780 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1781 			   const struct tcphdr *th)
1782 {
1783 	/* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
1784 	 * barrier() makes sure the compiler won't play fool^Waliasing games.
1785 	 */
1786 	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1787 		sizeof(struct inet_skb_parm));
1788 	barrier();
1789 
1790 	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1791 	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1792 				    skb->len - th->doff * 4);
1793 	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1794 	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1795 	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1796 	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1797 	TCP_SKB_CB(skb)->sacked	 = 0;
1798 	TCP_SKB_CB(skb)->has_rxtstamp =
1799 			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1800 }
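
/* end_seq above counts SYN and FIN as one sequence number each on top of
 * the payload: e.g. a pure data segment with seq S and 100 payload bytes
 * gets end_seq = S + 100, while a SYN carrying no data gets end_seq = S + 1.
 */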
1801 
1802 /*
1803  *	From tcp_input.c
1804  */
1805 
1806 int tcp_v4_rcv(struct sk_buff *skb)
1807 {
1808 	struct net *net = dev_net(skb->dev);
1809 	struct sk_buff *skb_to_free;
1810 	int sdif = inet_sdif(skb);
1811 	const struct iphdr *iph;
1812 	const struct tcphdr *th;
1813 	bool refcounted;
1814 	struct sock *sk;
1815 	int ret;
1816 
1817 	if (skb->pkt_type != PACKET_HOST)
1818 		goto discard_it;
1819 
1820 	/* Count it even if it's bad */
1821 	__TCP_INC_STATS(net, TCP_MIB_INSEGS);
1822 
1823 	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1824 		goto discard_it;
1825 
1826 	th = (const struct tcphdr *)skb->data;
1827 
1828 	if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1829 		goto bad_packet;
1830 	if (!pskb_may_pull(skb, th->doff * 4))
1831 		goto discard_it;
1832 
1833 	/* An explanation is required here, I think.
1834 	 * Packet length and doff are validated by header prediction,
1835 	 * provided the case of th->doff==0 is eliminated.
1836 	 * So, we defer the checks. */
1837 
1838 	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1839 		goto csum_error;
1840 
1841 	th = (const struct tcphdr *)skb->data;
1842 	iph = ip_hdr(skb);
1843 lookup:
1844 	sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1845 			       th->dest, sdif, &refcounted);
1846 	if (!sk)
1847 		goto no_tcp_socket;
1848 
1849 process:
1850 	if (sk->sk_state == TCP_TIME_WAIT)
1851 		goto do_time_wait;
1852 
1853 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
1854 		struct request_sock *req = inet_reqsk(sk);
1855 		bool req_stolen = false;
1856 		struct sock *nsk;
1857 
1858 		sk = req->rsk_listener;
1859 		if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
1860 			sk_drops_add(sk, skb);
1861 			reqsk_put(req);
1862 			goto discard_it;
1863 		}
1864 		if (tcp_checksum_complete(skb)) {
1865 			reqsk_put(req);
1866 			goto csum_error;
1867 		}
1868 		if (unlikely(sk->sk_state != TCP_LISTEN)) {
1869 			inet_csk_reqsk_queue_drop_and_put(sk, req);
1870 			goto lookup;
1871 		}
1872 		/* We own a reference on the listener, increase it again
1873 		 * as we might lose it too soon.
1874 		 */
1875 		sock_hold(sk);
1876 		refcounted = true;
1877 		nsk = NULL;
1878 		if (!tcp_filter(sk, skb)) {
1879 			th = (const struct tcphdr *)skb->data;
1880 			iph = ip_hdr(skb);
1881 			tcp_v4_fill_cb(skb, iph, th);
1882 			nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
1883 		}
1884 		if (!nsk) {
1885 			reqsk_put(req);
1886 			if (req_stolen) {
1887 				/* Another cpu got exclusive access to req
1888 				 * and created a full blown socket.
1889 				 * Try to feed this packet to this socket
1890 				 * instead of discarding it.
1891 				 */
1892 				tcp_v4_restore_cb(skb);
1893 				sock_put(sk);
1894 				goto lookup;
1895 			}
1896 			goto discard_and_relse;
1897 		}
1898 		if (nsk == sk) {
1899 			reqsk_put(req);
1900 			tcp_v4_restore_cb(skb);
1901 		} else if (tcp_child_process(sk, nsk, skb)) {
1902 			tcp_v4_send_reset(nsk, skb);
1903 			goto discard_and_relse;
1904 		} else {
1905 			sock_put(sk);
1906 			return 0;
1907 		}
1908 	}
1909 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1910 		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
1911 		goto discard_and_relse;
1912 	}
1913 
1914 	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1915 		goto discard_and_relse;
1916 
1917 	if (tcp_v4_inbound_md5_hash(sk, skb))
1918 		goto discard_and_relse;
1919 
1920 	nf_reset_ct(skb);
1921 
1922 	if (tcp_filter(sk, skb))
1923 		goto discard_and_relse;
1924 	th = (const struct tcphdr *)skb->data;
1925 	iph = ip_hdr(skb);
1926 	tcp_v4_fill_cb(skb, iph, th);
1927 
1928 	skb->dev = NULL;
1929 
1930 	if (sk->sk_state == TCP_LISTEN) {
1931 		ret = tcp_v4_do_rcv(sk, skb);
1932 		goto put_and_return;
1933 	}
1934 
1935 	sk_incoming_cpu_update(sk);
1936 
1937 	bh_lock_sock_nested(sk);
1938 	tcp_segs_in(tcp_sk(sk), skb);
1939 	ret = 0;
1940 	if (!sock_owned_by_user(sk)) {
1941 		skb_to_free = sk->sk_rx_skb_cache;
1942 		sk->sk_rx_skb_cache = NULL;
1943 		ret = tcp_v4_do_rcv(sk, skb);
1944 	} else {
1945 		if (tcp_add_backlog(sk, skb))
1946 			goto discard_and_relse;
1947 		skb_to_free = NULL;
1948 	}
1949 	bh_unlock_sock(sk);
1950 	if (skb_to_free)
1951 		__kfree_skb(skb_to_free);
1952 
1953 put_and_return:
1954 	if (refcounted)
1955 		sock_put(sk);
1956 
1957 	return ret;
1958 
1959 no_tcp_socket:
1960 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1961 		goto discard_it;
1962 
1963 	tcp_v4_fill_cb(skb, iph, th);
1964 
1965 	if (tcp_checksum_complete(skb)) {
1966 csum_error:
1967 		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
1968 bad_packet:
1969 		__TCP_INC_STATS(net, TCP_MIB_INERRS);
1970 	} else {
1971 		tcp_v4_send_reset(NULL, skb);
1972 	}
1973 
1974 discard_it:
1975 	/* Discard frame. */
1976 	kfree_skb(skb);
1977 	return 0;
1978 
1979 discard_and_relse:
1980 	sk_drops_add(sk, skb);
1981 	if (refcounted)
1982 		sock_put(sk);
1983 	goto discard_it;
1984 
1985 do_time_wait:
1986 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1987 		inet_twsk_put(inet_twsk(sk));
1988 		goto discard_it;
1989 	}
1990 
1991 	tcp_v4_fill_cb(skb, iph, th);
1992 
1993 	if (tcp_checksum_complete(skb)) {
1994 		inet_twsk_put(inet_twsk(sk));
1995 		goto csum_error;
1996 	}
1997 	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1998 	case TCP_TW_SYN: {
1999 		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
2000 							&tcp_hashinfo, skb,
2001 							__tcp_hdrlen(th),
2002 							iph->saddr, th->source,
2003 							iph->daddr, th->dest,
2004 							inet_iif(skb),
2005 							sdif);
2006 		if (sk2) {
2007 			inet_twsk_deschedule_put(inet_twsk(sk));
2008 			sk = sk2;
2009 			tcp_v4_restore_cb(skb);
2010 			refcounted = false;
2011 			goto process;
2012 		}
2013 	}
2014 		/* to ACK */
2015 		/* fall through */
2016 	case TCP_TW_ACK:
2017 		tcp_v4_timewait_ack(sk, skb);
2018 		break;
2019 	case TCP_TW_RST:
2020 		tcp_v4_send_reset(sk, skb);
2021 		inet_twsk_deschedule_put(inet_twsk(sk));
2022 		goto discard_it;
2023 	case TCP_TW_SUCCESS:;
2024 	}
2025 	goto discard_it;
2026 }
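
/* Rough shape of tcp_v4_rcv() above: validate the header and checksum,
 * look the segment up in the hash tables, handle the TCP_TIME_WAIT and
 * TCP_NEW_SYN_RECV special cases, run policy/MD5/filter checks, then
 * either process the segment directly via tcp_v4_do_rcv() or defer it to
 * the backlog when the socket is owned by user context.  The csum_error
 * and bad_packet labels only bump MIB counters before discard_it; a RST
 * for an unmatched segment is sent only when its checksum is valid.
 */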
2027 
2028 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2029 	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
2030 	.twsk_unique	= tcp_twsk_unique,
2031 	.twsk_destructor = tcp_twsk_destructor,
2032 };
2033 
2034 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2035 {
2036 	struct dst_entry *dst = skb_dst(skb);
2037 
2038 	if (dst && dst_hold_safe(dst)) {
2039 		sk->sk_rx_dst = dst;
2040 		inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
2041 	}
2042 }
2043 EXPORT_SYMBOL(inet_sk_rx_dst_set);
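
/* Caching skb_dst() on the socket lets later packets for this flow skip a
 * route lookup; rx_dst_ifindex records the incoming interface so that the
 * cached route can later be revalidated elsewhere in the receive path when
 * packets arrive on a different interface.
 */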
2044 
2045 const struct inet_connection_sock_af_ops ipv4_specific = {
2046 	.queue_xmit	   = ip_queue_xmit,
2047 	.send_check	   = tcp_v4_send_check,
2048 	.rebuild_header	   = inet_sk_rebuild_header,
2049 	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
2050 	.conn_request	   = tcp_v4_conn_request,
2051 	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
2052 	.net_header_len	   = sizeof(struct iphdr),
2053 	.setsockopt	   = ip_setsockopt,
2054 	.getsockopt	   = ip_getsockopt,
2055 	.addr2sockaddr	   = inet_csk_addr2sockaddr,
2056 	.sockaddr_len	   = sizeof(struct sockaddr_in),
2057 #ifdef CONFIG_COMPAT
2058 	.compat_setsockopt = compat_ip_setsockopt,
2059 	.compat_getsockopt = compat_ip_getsockopt,
2060 #endif
2061 	.mtu_reduced	   = tcp_v4_mtu_reduced,
2062 };
2063 EXPORT_SYMBOL(ipv4_specific);
2064 
2065 #ifdef CONFIG_TCP_MD5SIG
2066 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2067 	.md5_lookup		= tcp_v4_md5_lookup,
2068 	.calc_md5_hash		= tcp_v4_md5_hash_skb,
2069 	.md5_parse		= tcp_v4_parse_md5_keys,
2070 };
2071 #endif
2072 
2073 /* NOTE: A lot of things are set to zero explicitly by the call to
2074  *       sk_alloc(), so they need not be done here.
2075  */
2076 static int tcp_v4_init_sock(struct sock *sk)
2077 {
2078 	struct inet_connection_sock *icsk = inet_csk(sk);
2079 
2080 	tcp_init_sock(sk);
2081 
2082 	icsk->icsk_af_ops = &ipv4_specific;
2083 
2084 #ifdef CONFIG_TCP_MD5SIG
2085 	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2086 #endif
2087 
2088 	return 0;
2089 }
2090 
2091 void tcp_v4_destroy_sock(struct sock *sk)
2092 {
2093 	struct tcp_sock *tp = tcp_sk(sk);
2094 
2095 	trace_tcp_destroy_sock(sk);
2096 
2097 	tcp_clear_xmit_timers(sk);
2098 
2099 	tcp_cleanup_congestion_control(sk);
2100 
2101 	tcp_cleanup_ulp(sk);
2102 
2103 	/* Clean up the write buffer. */
2104 	tcp_write_queue_purge(sk);
2105 
2106 	/* Check if we want to disable active TFO */
2107 	tcp_fastopen_active_disable_ofo_check(sk);
2108 
2109 	/* Cleans up our, hopefully empty, out_of_order_queue. */
2110 	skb_rbtree_purge(&tp->out_of_order_queue);
2111 
2112 #ifdef CONFIG_TCP_MD5SIG
2113 	/* Clean up the MD5 key list, if any */
2114 	if (tp->md5sig_info) {
2115 		tcp_clear_md5_list(sk);
2116 		kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2117 		tp->md5sig_info = NULL;
2118 	}
2119 #endif
2120 
2121 	/* Clean up a referenced TCP bind bucket. */
2122 	if (inet_csk(sk)->icsk_bind_hash)
2123 		inet_put_port(sk);
2124 
2125 	BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2126 
2127 	/* If socket is aborted during connect operation */
2128 	tcp_free_fastopen_req(tp);
2129 	tcp_fastopen_destroy_cipher(sk);
2130 	tcp_saved_syn_free(tp);
2131 
2132 	sk_sockets_allocated_dec(sk);
2133 }
2134 EXPORT_SYMBOL(tcp_v4_destroy_sock);
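
/* Teardown order above: stop timers and congestion control state, purge
 * the write and out-of-order queues, then release MD5 keys.  The md5sig
 * info is freed via kfree_rcu() since lockless readers may still be
 * traversing the key list under RCU; finally the bind bucket and
 * fastopen state are dropped.
 */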
2135 
2136 #ifdef CONFIG_PROC_FS
2137 /* Proc filesystem TCP sock list dumping. */
2138 
2139 /*
2140  * Get the next listener socket following cur.  If cur is NULL, get the first
2141  * socket starting from the bucket given in st->bucket; when st->bucket is zero,
2142  * the very first socket in the hash table is returned.
2143  */
2144 static void *listening_get_next(struct seq_file *seq, void *cur)
2145 {
2146 	struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2147 	struct tcp_iter_state *st = seq->private;
2148 	struct net *net = seq_file_net(seq);
2149 	struct inet_listen_hashbucket *ilb;
2150 	struct hlist_nulls_node *node;
2151 	struct sock *sk = cur;
2152 
2153 	if (!sk) {
2154 get_head:
2155 		ilb = &tcp_hashinfo.listening_hash[st->bucket];
2156 		spin_lock(&ilb->lock);
2157 		sk = sk_nulls_head(&ilb->nulls_head);
2158 		st->offset = 0;
2159 		goto get_sk;
2160 	}
2161 	ilb = &tcp_hashinfo.listening_hash[st->bucket];
2162 	++st->num;
2163 	++st->offset;
2164 
2165 	sk = sk_nulls_next(sk);
2166 get_sk:
2167 	sk_nulls_for_each_from(sk, node) {
2168 		if (!net_eq(sock_net(sk), net))
2169 			continue;
2170 		if (sk->sk_family == afinfo->family)
2171 			return sk;
2172 	}
2173 	spin_unlock(&ilb->lock);
2174 	st->offset = 0;
2175 	if (++st->bucket < INET_LHTABLE_SIZE)
2176 		goto get_head;
2177 	return NULL;
2178 }
2179 
2180 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2181 {
2182 	struct tcp_iter_state *st = seq->private;
2183 	void *rc;
2184 
2185 	st->bucket = 0;
2186 	st->offset = 0;
2187 	rc = listening_get_next(seq, NULL);
2188 
2189 	while (rc && *pos) {
2190 		rc = listening_get_next(seq, rc);
2191 		--*pos;
2192 	}
2193 	return rc;
2194 }
2195 
2196 static inline bool empty_bucket(const struct tcp_iter_state *st)
2197 {
2198 	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2199 }
2200 
2201 /*
2202  * Get the first established socket, starting from the bucket given in st->bucket.
2203  * If st->bucket is zero, the very first socket in the hash is returned.
2204  */
2205 static void *established_get_first(struct seq_file *seq)
2206 {
2207 	struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2208 	struct tcp_iter_state *st = seq->private;
2209 	struct net *net = seq_file_net(seq);
2210 	void *rc = NULL;
2211 
2212 	st->offset = 0;
2213 	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2214 		struct sock *sk;
2215 		struct hlist_nulls_node *node;
2216 		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2217 
2218 		/* Lockless fast path for the common case of empty buckets */
2219 		if (empty_bucket(st))
2220 			continue;
2221 
2222 		spin_lock_bh(lock);
2223 		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2224 			if (sk->sk_family != afinfo->family ||
2225 			    !net_eq(sock_net(sk), net)) {
2226 				continue;
2227 			}
2228 			rc = sk;
2229 			goto out;
2230 		}
2231 		spin_unlock_bh(lock);
2232 	}
2233 out:
2234 	return rc;
2235 }
2236 
2237 static void *established_get_next(struct seq_file *seq, void *cur)
2238 {
2239 	struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2240 	struct sock *sk = cur;
2241 	struct hlist_nulls_node *node;
2242 	struct tcp_iter_state *st = seq->private;
2243 	struct net *net = seq_file_net(seq);
2244 
2245 	++st->num;
2246 	++st->offset;
2247 
2248 	sk = sk_nulls_next(sk);
2249 
2250 	sk_nulls_for_each_from(sk, node) {
2251 		if (sk->sk_family == afinfo->family &&
2252 		    net_eq(sock_net(sk), net))
2253 			return sk;
2254 	}
2255 
2256 	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2257 	++st->bucket;
2258 	return established_get_first(seq);
2259 }
2260 
2261 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2262 {
2263 	struct tcp_iter_state *st = seq->private;
2264 	void *rc;
2265 
2266 	st->bucket = 0;
2267 	rc = established_get_first(seq);
2268 
2269 	while (rc && pos) {
2270 		rc = established_get_next(seq, rc);
2271 		--pos;
2272 	}
2273 	return rc;
2274 }
2275 
2276 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2277 {
2278 	void *rc;
2279 	struct tcp_iter_state *st = seq->private;
2280 
2281 	st->state = TCP_SEQ_STATE_LISTENING;
2282 	rc	  = listening_get_idx(seq, &pos);
2283 
2284 	if (!rc) {
2285 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2286 		rc	  = established_get_idx(seq, pos);
2287 	}
2288 
2289 	return rc;
2290 }
2291 
2292 static void *tcp_seek_last_pos(struct seq_file *seq)
2293 {
2294 	struct tcp_iter_state *st = seq->private;
2295 	int offset = st->offset;
2296 	int orig_num = st->num;
2297 	void *rc = NULL;
2298 
2299 	switch (st->state) {
2300 	case TCP_SEQ_STATE_LISTENING:
2301 		if (st->bucket >= INET_LHTABLE_SIZE)
2302 			break;
2303 		st->state = TCP_SEQ_STATE_LISTENING;
2304 		rc = listening_get_next(seq, NULL);
2305 		while (offset-- && rc)
2306 			rc = listening_get_next(seq, rc);
2307 		if (rc)
2308 			break;
2309 		st->bucket = 0;
2310 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2311 		/* Fallthrough */
2312 	case TCP_SEQ_STATE_ESTABLISHED:
2313 		if (st->bucket > tcp_hashinfo.ehash_mask)
2314 			break;
2315 		rc = established_get_first(seq);
2316 		while (offset-- && rc)
2317 			rc = established_get_next(seq, rc);
2318 	}
2319 
2320 	st->num = orig_num;
2321 
2322 	return rc;
2323 }
2324 
2325 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2326 {
2327 	struct tcp_iter_state *st = seq->private;
2328 	void *rc;
2329 
2330 	if (*pos && *pos == st->last_pos) {
2331 		rc = tcp_seek_last_pos(seq);
2332 		if (rc)
2333 			goto out;
2334 	}
2335 
2336 	st->state = TCP_SEQ_STATE_LISTENING;
2337 	st->num = 0;
2338 	st->bucket = 0;
2339 	st->offset = 0;
2340 	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2341 
2342 out:
2343 	st->last_pos = *pos;
2344 	return rc;
2345 }
2346 EXPORT_SYMBOL(tcp_seq_start);
2347 
2348 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2349 {
2350 	struct tcp_iter_state *st = seq->private;
2351 	void *rc = NULL;
2352 
2353 	if (v == SEQ_START_TOKEN) {
2354 		rc = tcp_get_idx(seq, 0);
2355 		goto out;
2356 	}
2357 
2358 	switch (st->state) {
2359 	case TCP_SEQ_STATE_LISTENING:
2360 		rc = listening_get_next(seq, v);
2361 		if (!rc) {
2362 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2363 			st->bucket = 0;
2364 			st->offset = 0;
2365 			rc	  = established_get_first(seq);
2366 		}
2367 		break;
2368 	case TCP_SEQ_STATE_ESTABLISHED:
2369 		rc = established_get_next(seq, v);
2370 		break;
2371 	}
2372 out:
2373 	++*pos;
2374 	st->last_pos = *pos;
2375 	return rc;
2376 }
2377 EXPORT_SYMBOL(tcp_seq_next);
2378 
2379 void tcp_seq_stop(struct seq_file *seq, void *v)
2380 {
2381 	struct tcp_iter_state *st = seq->private;
2382 
2383 	switch (st->state) {
2384 	case TCP_SEQ_STATE_LISTENING:
2385 		if (v != SEQ_START_TOKEN)
2386 			spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
2387 		break;
2388 	case TCP_SEQ_STATE_ESTABLISHED:
2389 		if (v)
2390 			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2391 		break;
2392 	}
2393 }
2394 EXPORT_SYMBOL(tcp_seq_stop);
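
/* The /proc iteration above walks the listening hash first and then the
 * established hash.  st->bucket, st->offset and st->last_pos are kept so
 * that a read() resuming at the previous position can be satisfied by
 * tcp_seek_last_pos() without rescanning the tables from the start.
 */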
2395 
2396 static void get_openreq4(const struct request_sock *req,
2397 			 struct seq_file *f, int i)
2398 {
2399 	const struct inet_request_sock *ireq = inet_rsk(req);
2400 	long delta = req->rsk_timer.expires - jiffies;
2401 
2402 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2403 		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2404 		i,
2405 		ireq->ir_loc_addr,
2406 		ireq->ir_num,
2407 		ireq->ir_rmt_addr,
2408 		ntohs(ireq->ir_rmt_port),
2409 		TCP_SYN_RECV,
2410 		0, 0, /* could print option size, but that is af dependent. */
2411 		1,    /* timers active (only the expire timer) */
2412 		jiffies_delta_to_clock_t(delta),
2413 		req->num_timeout,
2414 		from_kuid_munged(seq_user_ns(f),
2415 				 sock_i_uid(req->rsk_listener)),
2416 		0,  /* non standard timer */
2417 		0, /* open_requests have no inode */
2418 		0,
2419 		req);
2420 }
2421 
2422 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2423 {
2424 	int timer_active;
2425 	unsigned long timer_expires;
2426 	const struct tcp_sock *tp = tcp_sk(sk);
2427 	const struct inet_connection_sock *icsk = inet_csk(sk);
2428 	const struct inet_sock *inet = inet_sk(sk);
2429 	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2430 	__be32 dest = inet->inet_daddr;
2431 	__be32 src = inet->inet_rcv_saddr;
2432 	__u16 destp = ntohs(inet->inet_dport);
2433 	__u16 srcp = ntohs(inet->inet_sport);
2434 	int rx_queue;
2435 	int state;
2436 
2437 	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2438 	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2439 	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2440 		timer_active	= 1;
2441 		timer_expires	= icsk->icsk_timeout;
2442 	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2443 		timer_active	= 4;
2444 		timer_expires	= icsk->icsk_timeout;
2445 	} else if (timer_pending(&sk->sk_timer)) {
2446 		timer_active	= 2;
2447 		timer_expires	= sk->sk_timer.expires;
2448 	} else {
2449 		timer_active	= 0;
2450 		timer_expires = jiffies;
2451 	}
2452 
2453 	state = inet_sk_state_load(sk);
2454 	if (state == TCP_LISTEN)
2455 		rx_queue = READ_ONCE(sk->sk_ack_backlog);
2456 	else
2457 		/* Because we don't lock the socket,
2458 		 * we might find a transient negative value.
2459 		 */
2460 		rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2461 				      READ_ONCE(tp->copied_seq), 0);
2462 
2463 	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2464 			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2465 		i, src, srcp, dest, destp, state,
2466 		READ_ONCE(tp->write_seq) - tp->snd_una,
2467 		rx_queue,
2468 		timer_active,
2469 		jiffies_delta_to_clock_t(timer_expires - jiffies),
2470 		icsk->icsk_retransmits,
2471 		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2472 		icsk->icsk_probes_out,
2473 		sock_i_ino(sk),
2474 		refcount_read(&sk->sk_refcnt), sk,
2475 		jiffies_to_clock_t(icsk->icsk_rto),
2476 		jiffies_to_clock_t(icsk->icsk_ack.ato),
2477 		(icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2478 		tp->snd_cwnd,
2479 		state == TCP_LISTEN ?
2480 		    fastopenq->max_qlen :
2481 		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2482 }
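
/* The addresses above are printed with %08X straight from the __be32
 * values and the ports from the ntohs()ed shorts, so on a little-endian
 * host a /proc/net/tcp field such as "0100007F:0016" reads as 127.0.0.1
 * port 0x16 = 22.
 */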
2483 
2484 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2485 			       struct seq_file *f, int i)
2486 {
2487 	long delta = tw->tw_timer.expires - jiffies;
2488 	__be32 dest, src;
2489 	__u16 destp, srcp;
2490 
2491 	dest  = tw->tw_daddr;
2492 	src   = tw->tw_rcv_saddr;
2493 	destp = ntohs(tw->tw_dport);
2494 	srcp  = ntohs(tw->tw_sport);
2495 
2496 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2497 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2498 		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2499 		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2500 		refcount_read(&tw->tw_refcnt), tw);
2501 }
2502 
2503 #define TMPSZ 150
2504 
2505 static int tcp4_seq_show(struct seq_file *seq, void *v)
2506 {
2507 	struct tcp_iter_state *st;
2508 	struct sock *sk = v;
2509 
2510 	seq_setwidth(seq, TMPSZ - 1);
2511 	if (v == SEQ_START_TOKEN) {
2512 		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2513 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2514 			   "inode");
2515 		goto out;
2516 	}
2517 	st = seq->private;
2518 
2519 	if (sk->sk_state == TCP_TIME_WAIT)
2520 		get_timewait4_sock(v, seq, st->num);
2521 	else if (sk->sk_state == TCP_NEW_SYN_RECV)
2522 		get_openreq4(v, seq, st->num);
2523 	else
2524 		get_tcp4_sock(v, seq, st->num);
2525 out:
2526 	seq_pad(seq, '\n');
2527 	return 0;
2528 }
2529 
2530 static const struct seq_operations tcp4_seq_ops = {
2531 	.show		= tcp4_seq_show,
2532 	.start		= tcp_seq_start,
2533 	.next		= tcp_seq_next,
2534 	.stop		= tcp_seq_stop,
2535 };
2536 
2537 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2538 	.family		= AF_INET,
2539 };
2540 
2541 static int __net_init tcp4_proc_init_net(struct net *net)
2542 {
2543 	if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
2544 			sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
2545 		return -ENOMEM;
2546 	return 0;
2547 }
2548 
2549 static void __net_exit tcp4_proc_exit_net(struct net *net)
2550 {
2551 	remove_proc_entry("tcp", net->proc_net);
2552 }
2553 
2554 static struct pernet_operations tcp4_net_ops = {
2555 	.init = tcp4_proc_init_net,
2556 	.exit = tcp4_proc_exit_net,
2557 };
2558 
2559 int __init tcp4_proc_init(void)
2560 {
2561 	return register_pernet_subsys(&tcp4_net_ops);
2562 }
2563 
2564 void tcp4_proc_exit(void)
2565 {
2566 	unregister_pernet_subsys(&tcp4_net_ops);
2567 }
2568 #endif /* CONFIG_PROC_FS */
2569 
2570 struct proto tcp_prot = {
2571 	.name			= "TCP",
2572 	.owner			= THIS_MODULE,
2573 	.close			= tcp_close,
2574 	.pre_connect		= tcp_v4_pre_connect,
2575 	.connect		= tcp_v4_connect,
2576 	.disconnect		= tcp_disconnect,
2577 	.accept			= inet_csk_accept,
2578 	.ioctl			= tcp_ioctl,
2579 	.init			= tcp_v4_init_sock,
2580 	.destroy		= tcp_v4_destroy_sock,
2581 	.shutdown		= tcp_shutdown,
2582 	.setsockopt		= tcp_setsockopt,
2583 	.getsockopt		= tcp_getsockopt,
2584 	.keepalive		= tcp_set_keepalive,
2585 	.recvmsg		= tcp_recvmsg,
2586 	.sendmsg		= tcp_sendmsg,
2587 	.sendpage		= tcp_sendpage,
2588 	.backlog_rcv		= tcp_v4_do_rcv,
2589 	.release_cb		= tcp_release_cb,
2590 	.hash			= inet_hash,
2591 	.unhash			= inet_unhash,
2592 	.get_port		= inet_csk_get_port,
2593 	.enter_memory_pressure	= tcp_enter_memory_pressure,
2594 	.leave_memory_pressure	= tcp_leave_memory_pressure,
2595 	.stream_memory_free	= tcp_stream_memory_free,
2596 	.sockets_allocated	= &tcp_sockets_allocated,
2597 	.orphan_count		= &tcp_orphan_count,
2598 	.memory_allocated	= &tcp_memory_allocated,
2599 	.memory_pressure	= &tcp_memory_pressure,
2600 	.sysctl_mem		= sysctl_tcp_mem,
2601 	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
2602 	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
2603 	.max_header		= MAX_TCP_HEADER,
2604 	.obj_size		= sizeof(struct tcp_sock),
2605 	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
2606 	.twsk_prot		= &tcp_timewait_sock_ops,
2607 	.rsk_prot		= &tcp_request_sock_ops,
2608 	.h.hashinfo		= &tcp_hashinfo,
2609 	.no_autobind		= true,
2610 #ifdef CONFIG_COMPAT
2611 	.compat_setsockopt	= compat_tcp_setsockopt,
2612 	.compat_getsockopt	= compat_tcp_getsockopt,
2613 #endif
2614 	.diag_destroy		= tcp_abort,
2615 };
2616 EXPORT_SYMBOL(tcp_prot);
2617 
2618 static void __net_exit tcp_sk_exit(struct net *net)
2619 {
2620 	int cpu;
2621 
2622 	if (net->ipv4.tcp_congestion_control)
2623 		module_put(net->ipv4.tcp_congestion_control->owner);
2624 
2625 	for_each_possible_cpu(cpu)
2626 		inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2627 	free_percpu(net->ipv4.tcp_sk);
2628 }
2629 
2630 static int __net_init tcp_sk_init(struct net *net)
2631 {
2632 	int res, cpu, cnt;
2633 
2634 	net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2635 	if (!net->ipv4.tcp_sk)
2636 		return -ENOMEM;
2637 
2638 	for_each_possible_cpu(cpu) {
2639 		struct sock *sk;
2640 
2641 		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2642 					   IPPROTO_TCP, net);
2643 		if (res)
2644 			goto fail;
2645 		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2646 
2647 		/* Please enforce IP_DF and IPID==0 for RST and
2648 		 * ACK sent in SYN-RECV and TIME-WAIT state.
2649 		 */
2650 		inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
2651 
2652 		*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2653 	}
2654 
2655 	net->ipv4.sysctl_tcp_ecn = 2;
2656 	net->ipv4.sysctl_tcp_ecn_fallback = 1;
2657 
2658 	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2659 	net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
2660 	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2661 	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2662 	net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
2663 
2664 	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2665 	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2666 	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2667 
2668 	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2669 	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2670 	net->ipv4.sysctl_tcp_syncookies = 1;
2671 	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2672 	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2673 	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2674 	net->ipv4.sysctl_tcp_orphan_retries = 0;
2675 	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2676 	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
2677 	net->ipv4.sysctl_tcp_tw_reuse = 2;
2678 
2679 	cnt = tcp_hashinfo.ehash_mask + 1;
2680 	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
2681 	net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
2682 
2683 	net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
2684 	net->ipv4.sysctl_tcp_sack = 1;
2685 	net->ipv4.sysctl_tcp_window_scaling = 1;
2686 	net->ipv4.sysctl_tcp_timestamps = 1;
2687 	net->ipv4.sysctl_tcp_early_retrans = 3;
2688 	net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
2689 	net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
2690 	net->ipv4.sysctl_tcp_retrans_collapse = 1;
2691 	net->ipv4.sysctl_tcp_max_reordering = 300;
2692 	net->ipv4.sysctl_tcp_dsack = 1;
2693 	net->ipv4.sysctl_tcp_app_win = 31;
2694 	net->ipv4.sysctl_tcp_adv_win_scale = 1;
2695 	net->ipv4.sysctl_tcp_frto = 2;
2696 	net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
2697 	/* This limits the percentage of the congestion window which we
2698 	 * will allow a single TSO frame to consume.  Building TSO frames
2699 	 * which are too large can cause TCP streams to be bursty.
2700 	 */
2701 	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
2702 	/* Default TSQ limit of 16 TSO segments */
2703 	net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
2704 	/* rfc5961 challenge ack rate limiting */
2705 	net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
2706 	net->ipv4.sysctl_tcp_min_tso_segs = 2;
2707 	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
2708 	net->ipv4.sysctl_tcp_autocorking = 1;
2709 	net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
2710 	net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
2711 	net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
2712 	if (net != &init_net) {
2713 		memcpy(net->ipv4.sysctl_tcp_rmem,
2714 		       init_net.ipv4.sysctl_tcp_rmem,
2715 		       sizeof(init_net.ipv4.sysctl_tcp_rmem));
2716 		memcpy(net->ipv4.sysctl_tcp_wmem,
2717 		       init_net.ipv4.sysctl_tcp_wmem,
2718 		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
2719 	}
2720 	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
2721 	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
2722 	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
2723 	spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
2724 	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
2725 	atomic_set(&net->ipv4.tfo_active_disable_times, 0);
2726 
2727 	/* Reno is always built in */
2728 	if (!net_eq(net, &init_net) &&
2729 	    try_module_get(init_net.ipv4.tcp_congestion_control->owner))
2730 		net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
2731 	else
2732 		net->ipv4.tcp_congestion_control = &tcp_reno;
2733 
2734 	return 0;
2735 fail:
2736 	tcp_sk_exit(net);
2737 
2738 	return res;
2739 }
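
/* The per-netns defaults set above are what later appears under
 * /proc/sys/net/ipv4/ (e.g. sysctl_tcp_syncookies maps to
 * net.ipv4.tcp_syncookies).  A child namespace inherits init_net's
 * congestion control module when a reference can be taken on it and
 * falls back to the built-in Reno otherwise.
 */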
2740 
2741 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2742 {
2743 	struct net *net;
2744 
2745 	inet_twsk_purge(&tcp_hashinfo, AF_INET);
2746 
2747 	list_for_each_entry(net, net_exit_list, exit_list)
2748 		tcp_fastopen_ctx_destroy(net);
2749 }
2750 
2751 static struct pernet_operations __net_initdata tcp_sk_ops = {
2752 	.init	   = tcp_sk_init,
2753 	.exit	   = tcp_sk_exit,
2754 	.exit_batch = tcp_sk_exit_batch,
2755 };
2756 
2757 void __init tcp_v4_init(void)
2758 {
2759 	if (register_pernet_subsys(&tcp_sk_ops))
2760 		panic("Failed to create the TCP control socket.\n");
2761 }
2762