xref: /openbmc/linux/net/ipv4/tcp_ipv4.c (revision 1802d0beecafe581ad584634ba92f8a471d8a63a)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		Implementation of the Transmission Control Protocol(TCP).
8  *
9  *		IPv4 specific functions
10  *
11  *		code split from:
12  *		linux/ipv4/tcp.c
13  *		linux/ipv4/tcp_input.c
14  *		linux/ipv4/tcp_output.c
15  *
16  *		See tcp.c for author information
17  */
18 
19 /*
20  * Changes:
21  *		David S. Miller	:	New socket lookup architecture.
22  *					This code is dedicated to John Dyson.
23  *		David S. Miller :	Change semantics of established hash,
24  *					half is devoted to TIME_WAIT sockets
25  *					and the rest go in the other half.
26  *		Andi Kleen :		Add support for syncookies and fixed
27  *					some bugs: ip options weren't passed to
28  *					the TCP layer, missed a check for an
29  *					ACK bit.
30  *		Andi Kleen :		Implemented fast path mtu discovery.
31  *	     				Fixed many serious bugs in the
32  *					request_sock handling and moved
33  *					most of it into the af independent code.
34  *					Added tail drop and some other bugfixes.
35  *					Added new listen semantics.
36  *		Mike McLagan	:	Routing by source
37  *	Juan Jose Ciarlante:		ip_dynaddr bits
38  *		Andi Kleen:		various fixes.
39  *	Vitaly E. Lavrov	:	Transparent proxy revived after year
40  *					coma.
41  *	Andi Kleen		:	Fix new listen.
42  *	Andi Kleen		:	Fix accept error reporting.
43  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
44  *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
45  *					a single port at the same time.
46  */
47 
48 #define pr_fmt(fmt) "TCP: " fmt
49 
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
60 
61 #include <net/net_namespace.h>
62 #include <net/icmp.h>
63 #include <net/inet_hashtables.h>
64 #include <net/tcp.h>
65 #include <net/transp_v6.h>
66 #include <net/ipv6.h>
67 #include <net/inet_common.h>
68 #include <net/timewait_sock.h>
69 #include <net/xfrm.h>
70 #include <net/secure_seq.h>
71 #include <net/busy_poll.h>
72 
73 #include <linux/inet.h>
74 #include <linux/ipv6.h>
75 #include <linux/stddef.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
78 #include <linux/inetdevice.h>
79 
80 #include <crypto/hash.h>
81 #include <linux/scatterlist.h>
82 
83 #include <trace/events/tcp.h>
84 
85 #ifdef CONFIG_TCP_MD5SIG
86 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
87 			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
88 #endif
89 
/* Socket lookup tables for TCP.  Within this file they are handed to
 * __inet_lookup_established() and __inet_lookup_listener().
 */
struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);
92 
93 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
94 {
95 	return secure_tcp_seq(ip_hdr(skb)->daddr,
96 			      ip_hdr(skb)->saddr,
97 			      tcp_hdr(skb)->dest,
98 			      tcp_hdr(skb)->source);
99 }
100 
101 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
102 {
103 	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
104 }
105 
106 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
107 {
108 	const struct inet_timewait_sock *tw = inet_twsk(sktw);
109 	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
110 	struct tcp_sock *tp = tcp_sk(sk);
111 	int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;
112 
113 	if (reuse == 2) {
114 		/* Still does not detect *everything* that goes through
115 		 * lo, since we require a loopback src or dst address
116 		 * or direct binding to 'lo' interface.
117 		 */
118 		bool loopback = false;
119 		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
120 			loopback = true;
121 #if IS_ENABLED(CONFIG_IPV6)
122 		if (tw->tw_family == AF_INET6) {
123 			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
124 			    (ipv6_addr_v4mapped(&tw->tw_v6_daddr) &&
125 			     (tw->tw_v6_daddr.s6_addr[12] == 127)) ||
126 			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
127 			    (ipv6_addr_v4mapped(&tw->tw_v6_rcv_saddr) &&
128 			     (tw->tw_v6_rcv_saddr.s6_addr[12] == 127)))
129 				loopback = true;
130 		} else
131 #endif
132 		{
133 			if (ipv4_is_loopback(tw->tw_daddr) ||
134 			    ipv4_is_loopback(tw->tw_rcv_saddr))
135 				loopback = true;
136 		}
137 		if (!loopback)
138 			reuse = 0;
139 	}
140 
141 	/* With PAWS, it is safe from the viewpoint
142 	   of data integrity. Even without PAWS it is safe provided sequence
143 	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.
144 
145 	   Actually, the idea is close to VJ's one, only timestamp cache is
146 	   held not per host, but per port pair and TW bucket is used as state
147 	   holder.
148 
149 	   If TW bucket has been already destroyed we fall back to VJ's scheme
150 	   and use initial timestamp retrieved from peer table.
151 	 */
152 	if (tcptw->tw_ts_recent_stamp &&
153 	    (!twp || (reuse && time_after32(ktime_get_seconds(),
154 					    tcptw->tw_ts_recent_stamp)))) {
155 		/* In case of repair and re-using TIME-WAIT sockets we still
156 		 * want to be sure that it is safe as above but honor the
157 		 * sequence numbers and time stamps set as part of the repair
158 		 * process.
159 		 *
160 		 * Without this check re-using a TIME-WAIT socket with TCP
161 		 * repair would accumulate a -1 on the repair assigned
162 		 * sequence number. The first time it is reused the sequence
163 		 * is -1, the second time -2, etc. This fixes that issue
164 		 * without appearing to create any others.
165 		 */
166 		if (likely(!tp->repair)) {
167 			tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
168 			if (tp->write_seq == 0)
169 				tp->write_seq = 1;
170 			tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
171 			tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
172 		}
173 		sock_hold(sktw);
174 		return 1;
175 	}
176 
177 	return 0;
178 }
179 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
180 
181 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
182 			      int addr_len)
183 {
184 	/* This check is replicated from tcp_v4_connect() and intended to
185 	 * prevent BPF program called below from accessing bytes that are out
186 	 * of the bound specified by user in addr_len.
187 	 */
188 	if (addr_len < sizeof(struct sockaddr_in))
189 		return -EINVAL;
190 
191 	sock_owned_by_me(sk);
192 
193 	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
194 }
195 
196 /* This will initiate an outgoing connection. */
197 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
198 {
199 	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
200 	struct inet_sock *inet = inet_sk(sk);
201 	struct tcp_sock *tp = tcp_sk(sk);
202 	__be16 orig_sport, orig_dport;
203 	__be32 daddr, nexthop;
204 	struct flowi4 *fl4;
205 	struct rtable *rt;
206 	int err;
207 	struct ip_options_rcu *inet_opt;
208 	struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
209 
210 	if (addr_len < sizeof(struct sockaddr_in))
211 		return -EINVAL;
212 
213 	if (usin->sin_family != AF_INET)
214 		return -EAFNOSUPPORT;
215 
216 	nexthop = daddr = usin->sin_addr.s_addr;
217 	inet_opt = rcu_dereference_protected(inet->inet_opt,
218 					     lockdep_sock_is_held(sk));
219 	if (inet_opt && inet_opt->opt.srr) {
220 		if (!daddr)
221 			return -EINVAL;
222 		nexthop = inet_opt->opt.faddr;
223 	}
224 
225 	orig_sport = inet->inet_sport;
226 	orig_dport = usin->sin_port;
227 	fl4 = &inet->cork.fl.u.ip4;
228 	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
229 			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
230 			      IPPROTO_TCP,
231 			      orig_sport, orig_dport, sk);
232 	if (IS_ERR(rt)) {
233 		err = PTR_ERR(rt);
234 		if (err == -ENETUNREACH)
235 			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
236 		return err;
237 	}
238 
239 	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
240 		ip_rt_put(rt);
241 		return -ENETUNREACH;
242 	}
243 
244 	if (!inet_opt || !inet_opt->opt.srr)
245 		daddr = fl4->daddr;
246 
247 	if (!inet->inet_saddr)
248 		inet->inet_saddr = fl4->saddr;
249 	sk_rcv_saddr_set(sk, inet->inet_saddr);
250 
251 	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
252 		/* Reset inherited state */
253 		tp->rx_opt.ts_recent	   = 0;
254 		tp->rx_opt.ts_recent_stamp = 0;
255 		if (likely(!tp->repair))
256 			tp->write_seq	   = 0;
257 	}
258 
259 	inet->inet_dport = usin->sin_port;
260 	sk_daddr_set(sk, daddr);
261 
262 	inet_csk(sk)->icsk_ext_hdr_len = 0;
263 	if (inet_opt)
264 		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
265 
266 	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
267 
268 	/* Socket identity is still unknown (sport may be zero).
269 	 * However we set state to SYN-SENT and not releasing socket
270 	 * lock select source port, enter ourselves into the hash tables and
271 	 * complete initialization after this.
272 	 */
273 	tcp_set_state(sk, TCP_SYN_SENT);
274 	err = inet_hash_connect(tcp_death_row, sk);
275 	if (err)
276 		goto failure;
277 
278 	sk_set_txhash(sk);
279 
280 	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
281 			       inet->inet_sport, inet->inet_dport, sk);
282 	if (IS_ERR(rt)) {
283 		err = PTR_ERR(rt);
284 		rt = NULL;
285 		goto failure;
286 	}
287 	/* OK, now commit destination to socket.  */
288 	sk->sk_gso_type = SKB_GSO_TCPV4;
289 	sk_setup_caps(sk, &rt->dst);
290 	rt = NULL;
291 
292 	if (likely(!tp->repair)) {
293 		if (!tp->write_seq)
294 			tp->write_seq = secure_tcp_seq(inet->inet_saddr,
295 						       inet->inet_daddr,
296 						       inet->inet_sport,
297 						       usin->sin_port);
298 		tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
299 						 inet->inet_saddr,
300 						 inet->inet_daddr);
301 	}
302 
303 	inet->inet_id = tp->write_seq ^ jiffies;
304 
305 	if (tcp_fastopen_defer_connect(sk, &err))
306 		return err;
307 	if (err)
308 		goto failure;
309 
310 	err = tcp_connect(sk);
311 
312 	if (err)
313 		goto failure;
314 
315 	return 0;
316 
317 failure:
318 	/*
319 	 * This unhashes the socket and releases the local port,
320 	 * if necessary.
321 	 */
322 	tcp_set_state(sk, TCP_CLOSE);
323 	ip_rt_put(rt);
324 	sk->sk_route_caps = 0;
325 	inet->inet_dport = 0;
326 	return err;
327 }
328 EXPORT_SYMBOL(tcp_v4_connect);
329 
330 /*
331  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
332  * It can be called through tcp_release_cb() if socket was owned by user
333  * at the time tcp_v4_err() was called to handle ICMP message.
334  */
335 void tcp_v4_mtu_reduced(struct sock *sk)
336 {
337 	struct inet_sock *inet = inet_sk(sk);
338 	struct dst_entry *dst;
339 	u32 mtu;
340 
341 	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
342 		return;
343 	mtu = tcp_sk(sk)->mtu_info;
344 	dst = inet_csk_update_pmtu(sk, mtu);
345 	if (!dst)
346 		return;
347 
348 	/* Something is about to be wrong... Remember soft error
349 	 * for the case, if this connection will not able to recover.
350 	 */
351 	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
352 		sk->sk_err_soft = EMSGSIZE;
353 
354 	mtu = dst_mtu(dst);
355 
356 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
357 	    ip_sk_accept_pmtu(sk) &&
358 	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
359 		tcp_sync_mss(sk, mtu);
360 
361 		/* Resend the TCP packet because it's
362 		 * clear that the old packet has been
363 		 * dropped. This is the new "fast" path mtu
364 		 * discovery.
365 		 */
366 		tcp_simple_retransmit(sk);
367 	} /* else let the usual retransmit timer handle it */
368 }
369 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
370 
371 static void do_redirect(struct sk_buff *skb, struct sock *sk)
372 {
373 	struct dst_entry *dst = __sk_dst_check(sk, 0);
374 
375 	if (dst)
376 		dst->ops->redirect(dst, sk, skb);
377 }
378 
379 
380 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
381 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
382 {
383 	struct request_sock *req = inet_reqsk(sk);
384 	struct net *net = sock_net(sk);
385 
386 	/* ICMPs are not backlogged, hence we cannot get
387 	 * an established socket here.
388 	 */
389 	if (seq != tcp_rsk(req)->snt_isn) {
390 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
391 	} else if (abort) {
392 		/*
393 		 * Still in SYN_RECV, just remove it silently.
394 		 * There is no good way to pass the error to the newly
395 		 * created socket, and POSIX does not want network
396 		 * errors returned from accept().
397 		 */
398 		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
399 		tcp_listendrop(req->rsk_listener);
400 	}
401 	reqsk_put(req);
402 }
403 EXPORT_SYMBOL(tcp_req_err);
404 
/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 *
 * @icmp_skb: ICMP message; icmp_skb->data is read as an IPv4 header
 *            followed by a TCP header.
 * @info:     stored in tp->mtu_info when the message is ICMP_FRAG_NEEDED.
 *
 * Returns -ENOENT when no socket matches the quoted addresses and ports,
 * 0 in every other case.
 */

int tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
{
	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
	struct inet_connection_sock *icsk;
	struct tcp_sock *tp;
	struct inet_sock *inet;
	const int type = icmp_hdr(icmp_skb)->type;
	const int code = icmp_hdr(icmp_skb)->code;
	struct sock *sk;
	struct sk_buff *skb;
	struct request_sock *fastopen;
	u32 seq, snd_una;
	s32 remaining;
	u32 delta_us;
	int err;
	struct net *net = dev_net(icmp_skb->dev);

	/* Look the socket up from the addresses and ports of the quoted
	 * headers.
	 */
	sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
				       th->dest, iph->saddr, ntohs(th->source),
				       inet_iif(icmp_skb), 0);
	if (!sk) {
		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
		return -ENOENT;
	}
	/* Nothing is reported to a TIME_WAIT socket; just release it. */
	if (sk->sk_state == TCP_TIME_WAIT) {
		inet_twsk_put(inet_twsk(sk));
		return 0;
	}
	seq = ntohl(th->seq);
	/* Request sockets are handled, and released, by tcp_req_err(). */
	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
				     type == ICMP_TIME_EXCEEDED ||
				     (type == ICMP_DEST_UNREACH &&
				      (code == ICMP_NET_UNREACH ||
				       code == ICMP_HOST_UNREACH)));
		return 0;
	}

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 * We do take care of PMTU discovery (RFC1191) special case :
	 * we can receive locally generated ICMP messages while socket is held.
	 */
	if (sock_owned_by_user(sk)) {
		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
	}
	if (sk->sk_state == TCP_CLOSE)
		goto out;

	/* Drop messages whose quoted TTL is below the socket's minimum. */
	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
		goto out;
	}

	icsk = inet_csk(sk);
	tp = tcp_sk(sk);
	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
	fastopen = tp->fastopen_rsk;
	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
	/* Except for listeners, the quoted sequence number must fall
	 * between snd_una and snd_nxt.
	 */
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, snd_una, tp->snd_nxt)) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	/* First switch: handle the ICMP type.  Cases that 'break' have set
	 * err and continue to the per-state handling below; the others
	 * leave through 'out'.
	 */
	switch (type) {
	case ICMP_REDIRECT:
		if (!sock_owned_by_user(sk))
			do_redirect(icmp_skb, sk);
		goto out;
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		/* code is used as an index into icmp_err_convert[] below. */
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			/* We are not interested in TCP_LISTEN and open_requests
			 * (SYN-ACKs send out by Linux are always <576bytes so
			 * they should go through unfragmented).
			 */
			if (sk->sk_state == TCP_LISTEN)
				goto out;

			tp->mtu_info = info;
			if (!sock_owned_by_user(sk)) {
				tcp_v4_mtu_reduced(sk);
			} else {
				/* Defer the work; the extra reference is taken
				 * only when the flag was not already set.
				 */
				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
					sock_hold(sk);
			}
			goto out;
		}

		err = icmp_err_convert[code].errno;
		/* check if icmp_skb allows revert of backoff
		 * (see draft-zimmermann-tcp-lcd) */
		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
			break;
		if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
		    !icsk->icsk_backoff || fastopen)
			break;

		if (sock_owned_by_user(sk))
			break;

		skb = tcp_rtx_queue_head(sk);
		if (WARN_ON_ONCE(!skb))
			break;

		/* Undo one backoff step and recompute the RTO from it. */
		icsk->icsk_backoff--;
		icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
					       TCP_TIMEOUT_INIT;
		icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);


		/* Subtract the time elapsed since the timestamp of the head
		 * of the retransmit queue from the new RTO.
		 */
		tcp_mstamp_refresh(tp);
		delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
		remaining = icsk->icsk_rto -
			    usecs_to_jiffies(delta_us);

		if (remaining > 0) {
			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
						  remaining, TCP_RTO_MAX);
		} else {
			/* RTO revert clocked out retransmission.
			 * Will retransmit now */
			tcp_retransmit_timer(sk);
		}

		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	/* Second switch: sockets still in the handshake take the error as
	 * fatal when the socket is not owned by the user.
	 */
	switch (sk->sk_state) {
	case TCP_SYN_SENT:
	case TCP_SYN_RECV:
		/* Only in fast or simultaneous open. If a fast open socket is
		 * already accepted it is treated as a connected one below.
		 */
		if (fastopen && !fastopen->sk)
			break;

		if (!sock_owned_by_user(sk)) {
			sk->sk_err = err;

			sk->sk_error_report(sk);

			tcp_done(sk);
		} else {
			sk->sk_err_soft = err;
		}
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows to consider as hard errors
	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note, that in modern internet, where routing is unreliable
	 * and in each dark corner broken firewalls sit, sending random
	 * errors ordered by their masters even this two messages finally lose
	 * their original sense (even Linux sends invalid PORT_UNREACHs)
	 *
	 * Now we are in compliance with RFCs.
	 *							--ANK (980905)
	 */

	inet = inet_sk(sk);
	if (!sock_owned_by_user(sk) && inet->recverr) {
		sk->sk_err = err;
		sk->sk_error_report(sk);
	} else	{ /* Only an error on timeout */
		sk->sk_err_soft = err;
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
	return 0;
}
616 
617 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
618 {
619 	struct tcphdr *th = tcp_hdr(skb);
620 
621 	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
622 	skb->csum_start = skb_transport_header(skb) - skb->head;
623 	skb->csum_offset = offsetof(struct tcphdr, check);
624 }
625 
626 /* This routine computes an IPv4 TCP checksum. */
627 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
628 {
629 	const struct inet_sock *inet = inet_sk(sk);
630 
631 	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
632 }
633 EXPORT_SYMBOL(tcp_v4_send_check);
634 
/*
 *	This routine will send an RST to the other tcp.
 *
 *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 *		      for reset.
 *	Answer: if a packet caused RST, it is not for a socket
 *		existing in our system, if it is matched to a socket,
 *		it is just duplicate segment or bug in other side's TCP.
 *		So that we build reply only basing on parameters
 *		arrived with segment.
 *	Exception: precedence violation. We do not implement it in any case.
 *
 *	@sk:  socket the segment matched, or NULL when there is none.
 *	@skb: the segment being answered.
 */

static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
{
	const struct tcphdr *th = tcp_hdr(skb);
	/* Reply buffer: a TCP header, plus room for an MD5 option. */
	struct {
		struct tcphdr th;
#ifdef CONFIG_TCP_MD5SIG
		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
#endif
	} rep;
	struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key = NULL;
	const __u8 *hash_location = NULL;
	unsigned char newhash[16];
	int genhash;
	struct sock *sk1 = NULL;
#endif
	struct net *net;
	struct sock *ctl_sk;

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	/* If sk not NULL, it means we did a successful lookup and incoming
	 * route had to be correct. prequeue might have dropped our dst.
	 */
	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rep, 0, sizeof(rep));
	rep.th.dest   = th->source;
	rep.th.source = th->dest;
	rep.th.doff   = sizeof(struct tcphdr) / 4;
	rep.th.rst    = 1;

	/* If the segment carried an ACK, the RST takes its sequence number
	 * from that ACK.  Otherwise the RST acknowledges everything the
	 * segment occupied: its payload length plus one for each of SYN
	 * and FIN.
	 */
	if (th->ack) {
		rep.th.seq = th->ack_seq;
	} else {
		rep.th.ack = 1;
		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				       skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof(arg));
	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);

	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
#ifdef CONFIG_TCP_MD5SIG
	/* Held until the 'out' label at the end of the function. */
	rcu_read_lock();
	hash_location = tcp_parse_md5sig_option(th);
	if (sk && sk_fullsock(sk)) {
		key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
					&ip_hdr(skb)->saddr, AF_INET);
	} else if (hash_location) {
		/*
		 * active side is lost. Try to find listening socket through
		 * source port, and then find md5 key through listening socket.
		 * we are not loose security here:
		 * Incoming packet is checked with md5 hash with finding key,
		 * no RST generated if md5 hash doesn't match.
		 */
		sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
					     ip_hdr(skb)->saddr,
					     th->source, ip_hdr(skb)->daddr,
					     ntohs(th->source), inet_iif(skb),
					     tcp_v4_sdif(skb));
		/* don't send rst if it can't find key */
		if (!sk1)
			goto out;

		key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
					&ip_hdr(skb)->saddr, AF_INET);
		if (!key)
			goto out;


		/* Stay silent unless the segment's own signature verifies. */
		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
		if (genhash || memcmp(hash_location, newhash, 16) != 0)
			goto out;

	}

	if (key) {
		/* Append a signed MD5 option: NOP, NOP, kind, length, hash. */
		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
				   (TCPOPT_NOP << 16) |
				   (TCPOPT_MD5SIG << 8) |
				   TCPOLEN_MD5SIG);
		/* Update length and the length the header thinks exists */
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
				     key, ip_hdr(skb)->saddr,
				     ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;

	/* When socket is gone, all binding information is lost.
	 * routing might fail in this case. No choice here, if we choose to force
	 * input interface, we will misroute in case of asymmetric route.
	 */
	if (sk) {
		arg.bound_dev_if = sk->sk_bound_dev_if;
		if (sk_fullsock(sk))
			trace_tcp_send_reset(sk, skb);
	}

	/* sk may be a timewait socket (see the TCP_TIME_WAIT test below);
	 * this asserts that the sk_bound_dev_if read above lands on
	 * tw_bound_dev_if in that case.
	 */
	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));

	arg.tos = ip_hdr(skb)->tos;
	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
	local_bh_disable();
	/* The reply is sent through this CPU's control socket, which
	 * borrows the mark of sk for the duration of the send.
	 */
	ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk);
	if (sk)
		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
				   inet_twsk(sk)->tw_mark : sk->sk_mark;
	ip_send_unicast_reply(ctl_sk,
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len);

	ctl_sk->sk_mark = 0;
	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
	local_bh_enable();

#ifdef CONFIG_TCP_MD5SIG
out:
	rcu_read_unlock();
#endif
}
788 
/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
   outside of socket context, is certainly ugly. What can I do?
 */
792 
/* Build a bare ACK in reply to @skb and send it through this CPU's
 * control socket.
 *
 * @sk:          socket the reply is sent on behalf of; may be a timewait
 *               socket (see the TCP_TIME_WAIT test below)
 * @skb:         segment being answered; ports and addresses are swapped
 * @seq, @ack:   sequence and acknowledgment numbers, host byte order
 * @win:         value for the window field, host byte order
 * @tsval:       timestamp value to send
 * @tsecr:       timestamp echo; the timestamp option is only added when
 *               this is non-zero
 * @oif:         when non-zero, copied to the reply's bound_dev_if
 * @key:         MD5 key to sign with, or NULL (only used when
 *               CONFIG_TCP_MD5SIG is set)
 * @reply_flags: copied to the reply's flags
 * @tos:         TOS for the reply
 */
static void tcp_v4_send_ack(const struct sock *sk,
			    struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 tsval, u32 tsecr, int oif,
			    struct tcp_md5sig_key *key,
			    int reply_flags, u8 tos)
{
	const struct tcphdr *th = tcp_hdr(skb);
	/* Reply buffer: TCP header, room for a timestamp option and, with
	 * CONFIG_TCP_MD5SIG, for an MD5 option after it.
	 */
	struct {
		struct tcphdr th;
		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
#ifdef CONFIG_TCP_MD5SIG
			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
#endif
			];
	} rep;
	struct net *net = sock_net(sk);
	struct ip_reply_arg arg;
	struct sock *ctl_sk;

	/* Only the header is cleared; option words are written when used. */
	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof(arg));

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);
	if (tsecr) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				   (TCPOPT_TIMESTAMP << 8) |
				   TCPOLEN_TIMESTAMP);
		rep.opt[1] = htonl(tsval);
		rep.opt[2] = htonl(tsecr);
		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
	}

	/* Swap the send and the receive. */
	rep.th.dest    = th->source;
	rep.th.source  = th->dest;
	rep.th.doff    = arg.iov[0].iov_len / 4;
	rep.th.seq     = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack     = 1;
	rep.th.window  = htons(win);

#ifdef CONFIG_TCP_MD5SIG
	if (key) {
		/* The MD5 option follows the three timestamp words, if any. */
		int offset = (tsecr) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
					  (TCPOPT_NOP << 16) |
					  (TCPOPT_MD5SIG << 8) |
					  TCPOLEN_MD5SIG);
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len/4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.flags = reply_flags;
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	if (oif)
		arg.bound_dev_if = oif;
	arg.tos = tos;
	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
	local_bh_disable();
	ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk);
	/* NOTE(review): sk was already dereferenced by sock_net(sk) and
	 * sk_fullsock(sk) above, so this test does not guard against NULL.
	 */
	if (sk)
		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
				   inet_twsk(sk)->tw_mark : sk->sk_mark;
	ip_send_unicast_reply(ctl_sk,
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len);

	/* Do not leave the borrowed mark on the shared control socket. */
	ctl_sk->sk_mark = 0;
	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	local_bh_enable();
}
874 
875 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
876 {
877 	struct inet_timewait_sock *tw = inet_twsk(sk);
878 	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
879 
880 	tcp_v4_send_ack(sk, skb,
881 			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
882 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
883 			tcp_time_stamp_raw() + tcptw->tw_ts_offset,
884 			tcptw->tw_ts_recent,
885 			tw->tw_bound_dev_if,
886 			tcp_twsk_md5_key(tcptw),
887 			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
888 			tw->tw_tos
889 			);
890 
891 	inet_twsk_put(tw);
892 }
893 
894 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
895 				  struct request_sock *req)
896 {
897 	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
898 	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
899 	 */
900 	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
901 					     tcp_sk(sk)->snd_nxt;
902 
903 	/* RFC 7323 2.3
904 	 * The window field (SEG.WND) of every outgoing segment, with the
905 	 * exception of <SYN> segments, MUST be right-shifted by
906 	 * Rcv.Wind.Shift bits:
907 	 */
908 	tcp_v4_send_ack(sk, skb, seq,
909 			tcp_rsk(req)->rcv_nxt,
910 			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
911 			tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
912 			req->ts_recent,
913 			0,
914 			tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->saddr,
915 					  AF_INET),
916 			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
917 			ip_hdr(skb)->tos);
918 }
919 
920 /*
921  *	Send a SYN-ACK after having received a SYN.
922  *	This still operates on a request_sock only, not on a big
923  *	socket.
924  */
925 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
926 			      struct flowi *fl,
927 			      struct request_sock *req,
928 			      struct tcp_fastopen_cookie *foc,
929 			      enum tcp_synack_type synack_type)
930 {
931 	const struct inet_request_sock *ireq = inet_rsk(req);
932 	struct flowi4 fl4;
933 	int err = -1;
934 	struct sk_buff *skb;
935 
936 	/* First, grab a route. */
937 	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
938 		return -1;
939 
940 	skb = tcp_make_synack(sk, dst, req, foc, synack_type);
941 
942 	if (skb) {
943 		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
944 
945 		rcu_read_lock();
946 		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
947 					    ireq->ir_rmt_addr,
948 					    rcu_dereference(ireq->ireq_opt));
949 		rcu_read_unlock();
950 		err = net_xmit_eval(err);
951 	}
952 
953 	return err;
954 }
955 
956 /*
957  *	IPv4 request_sock destructor.
958  */
959 static void tcp_v4_reqsk_destructor(struct request_sock *req)
960 {
961 	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
962 }
963 
964 #ifdef CONFIG_TCP_MD5SIG
965 /*
966  * RFC2385 MD5 checksumming requires a mapping of
967  * IP address->MD5 Key.
968  * We need to maintain these in the sk structure.
969  */
970 
/* Static key, initially false, exported to other code.
 * NOTE(review): presumably switched on once an MD5 key is configured;
 * confirm against the code that enables it.
 */
DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
EXPORT_SYMBOL(tcp_md5_needed);
973 
974 /* Find the Key structure for an address.  */
975 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk,
976 					   const union tcp_md5_addr *addr,
977 					   int family)
978 {
979 	const struct tcp_sock *tp = tcp_sk(sk);
980 	struct tcp_md5sig_key *key;
981 	const struct tcp_md5sig_info *md5sig;
982 	__be32 mask;
983 	struct tcp_md5sig_key *best_match = NULL;
984 	bool match;
985 
986 	/* caller either holds rcu_read_lock() or socket lock */
987 	md5sig = rcu_dereference_check(tp->md5sig_info,
988 				       lockdep_sock_is_held(sk));
989 	if (!md5sig)
990 		return NULL;
991 
992 	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
993 		if (key->family != family)
994 			continue;
995 
996 		if (family == AF_INET) {
997 			mask = inet_make_mask(key->prefixlen);
998 			match = (key->addr.a4.s_addr & mask) ==
999 				(addr->a4.s_addr & mask);
1000 #if IS_ENABLED(CONFIG_IPV6)
1001 		} else if (family == AF_INET6) {
1002 			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1003 						  key->prefixlen);
1004 #endif
1005 		} else {
1006 			match = false;
1007 		}
1008 
1009 		if (match && (!best_match ||
1010 			      key->prefixlen > best_match->prefixlen))
1011 			best_match = key;
1012 	}
1013 	return best_match;
1014 }
1015 EXPORT_SYMBOL(__tcp_md5_do_lookup);
1016 
1017 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1018 						      const union tcp_md5_addr *addr,
1019 						      int family, u8 prefixlen)
1020 {
1021 	const struct tcp_sock *tp = tcp_sk(sk);
1022 	struct tcp_md5sig_key *key;
1023 	unsigned int size = sizeof(struct in_addr);
1024 	const struct tcp_md5sig_info *md5sig;
1025 
1026 	/* caller either holds rcu_read_lock() or socket lock */
1027 	md5sig = rcu_dereference_check(tp->md5sig_info,
1028 				       lockdep_sock_is_held(sk));
1029 	if (!md5sig)
1030 		return NULL;
1031 #if IS_ENABLED(CONFIG_IPV6)
1032 	if (family == AF_INET6)
1033 		size = sizeof(struct in6_addr);
1034 #endif
1035 	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
1036 		if (key->family != family)
1037 			continue;
1038 		if (!memcmp(&key->addr, addr, size) &&
1039 		    key->prefixlen == prefixlen)
1040 			return key;
1041 	}
1042 	return NULL;
1043 }
1044 
1045 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1046 					 const struct sock *addr_sk)
1047 {
1048 	const union tcp_md5_addr *addr;
1049 
1050 	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1051 	return tcp_md5_do_lookup(sk, addr, AF_INET);
1052 }
1053 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1054 
1055 /* This can be called on a newly created socket, from other files */
1056 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1057 		   int family, u8 prefixlen, const u8 *newkey, u8 newkeylen,
1058 		   gfp_t gfp)
1059 {
1060 	/* Add Key to the list */
1061 	struct tcp_md5sig_key *key;
1062 	struct tcp_sock *tp = tcp_sk(sk);
1063 	struct tcp_md5sig_info *md5sig;
1064 
1065 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
1066 	if (key) {
1067 		/* Pre-existing entry - just update that one. */
1068 		memcpy(key->key, newkey, newkeylen);
1069 		key->keylen = newkeylen;
1070 		return 0;
1071 	}
1072 
1073 	md5sig = rcu_dereference_protected(tp->md5sig_info,
1074 					   lockdep_sock_is_held(sk));
1075 	if (!md5sig) {
1076 		md5sig = kmalloc(sizeof(*md5sig), gfp);
1077 		if (!md5sig)
1078 			return -ENOMEM;
1079 
1080 		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1081 		INIT_HLIST_HEAD(&md5sig->head);
1082 		rcu_assign_pointer(tp->md5sig_info, md5sig);
1083 	}
1084 
1085 	key = sock_kmalloc(sk, sizeof(*key), gfp);
1086 	if (!key)
1087 		return -ENOMEM;
1088 	if (!tcp_alloc_md5sig_pool()) {
1089 		sock_kfree_s(sk, key, sizeof(*key));
1090 		return -ENOMEM;
1091 	}
1092 
1093 	memcpy(key->key, newkey, newkeylen);
1094 	key->keylen = newkeylen;
1095 	key->family = family;
1096 	key->prefixlen = prefixlen;
1097 	memcpy(&key->addr, addr,
1098 	       (family == AF_INET6) ? sizeof(struct in6_addr) :
1099 				      sizeof(struct in_addr));
1100 	hlist_add_head_rcu(&key->node, &md5sig->head);
1101 	return 0;
1102 }
1103 EXPORT_SYMBOL(tcp_md5_do_add);
1104 
1105 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1106 		   u8 prefixlen)
1107 {
1108 	struct tcp_md5sig_key *key;
1109 
1110 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
1111 	if (!key)
1112 		return -ENOENT;
1113 	hlist_del_rcu(&key->node);
1114 	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1115 	kfree_rcu(key, rcu);
1116 	return 0;
1117 }
1118 EXPORT_SYMBOL(tcp_md5_do_del);
1119 
1120 static void tcp_clear_md5_list(struct sock *sk)
1121 {
1122 	struct tcp_sock *tp = tcp_sk(sk);
1123 	struct tcp_md5sig_key *key;
1124 	struct hlist_node *n;
1125 	struct tcp_md5sig_info *md5sig;
1126 
1127 	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1128 
1129 	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1130 		hlist_del_rcu(&key->node);
1131 		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1132 		kfree_rcu(key, rcu);
1133 	}
1134 }
1135 
1136 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1137 				 char __user *optval, int optlen)
1138 {
1139 	struct tcp_md5sig cmd;
1140 	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1141 	u8 prefixlen = 32;
1142 
1143 	if (optlen < sizeof(cmd))
1144 		return -EINVAL;
1145 
1146 	if (copy_from_user(&cmd, optval, sizeof(cmd)))
1147 		return -EFAULT;
1148 
1149 	if (sin->sin_family != AF_INET)
1150 		return -EINVAL;
1151 
1152 	if (optname == TCP_MD5SIG_EXT &&
1153 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1154 		prefixlen = cmd.tcpm_prefixlen;
1155 		if (prefixlen > 32)
1156 			return -EINVAL;
1157 	}
1158 
1159 	if (!cmd.tcpm_keylen)
1160 		return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1161 				      AF_INET, prefixlen);
1162 
1163 	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1164 		return -EINVAL;
1165 
1166 	return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1167 			      AF_INET, prefixlen, cmd.tcpm_key, cmd.tcpm_keylen,
1168 			      GFP_KERNEL);
1169 }
1170 
1171 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1172 				   __be32 daddr, __be32 saddr,
1173 				   const struct tcphdr *th, int nbytes)
1174 {
1175 	struct tcp4_pseudohdr *bp;
1176 	struct scatterlist sg;
1177 	struct tcphdr *_th;
1178 
1179 	bp = hp->scratch;
1180 	bp->saddr = saddr;
1181 	bp->daddr = daddr;
1182 	bp->pad = 0;
1183 	bp->protocol = IPPROTO_TCP;
1184 	bp->len = cpu_to_be16(nbytes);
1185 
1186 	_th = (struct tcphdr *)(bp + 1);
1187 	memcpy(_th, th, sizeof(*th));
1188 	_th->check = 0;
1189 
1190 	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1191 	ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1192 				sizeof(*bp) + sizeof(*th));
1193 	return crypto_ahash_update(hp->md5_req);
1194 }
1195 
1196 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1197 			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
1198 {
1199 	struct tcp_md5sig_pool *hp;
1200 	struct ahash_request *req;
1201 
1202 	hp = tcp_get_md5sig_pool();
1203 	if (!hp)
1204 		goto clear_hash_noput;
1205 	req = hp->md5_req;
1206 
1207 	if (crypto_ahash_init(req))
1208 		goto clear_hash;
1209 	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1210 		goto clear_hash;
1211 	if (tcp_md5_hash_key(hp, key))
1212 		goto clear_hash;
1213 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1214 	if (crypto_ahash_final(req))
1215 		goto clear_hash;
1216 
1217 	tcp_put_md5sig_pool();
1218 	return 0;
1219 
1220 clear_hash:
1221 	tcp_put_md5sig_pool();
1222 clear_hash_noput:
1223 	memset(md5_hash, 0, 16);
1224 	return 1;
1225 }
1226 
1227 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1228 			const struct sock *sk,
1229 			const struct sk_buff *skb)
1230 {
1231 	struct tcp_md5sig_pool *hp;
1232 	struct ahash_request *req;
1233 	const struct tcphdr *th = tcp_hdr(skb);
1234 	__be32 saddr, daddr;
1235 
1236 	if (sk) { /* valid for establish/request sockets */
1237 		saddr = sk->sk_rcv_saddr;
1238 		daddr = sk->sk_daddr;
1239 	} else {
1240 		const struct iphdr *iph = ip_hdr(skb);
1241 		saddr = iph->saddr;
1242 		daddr = iph->daddr;
1243 	}
1244 
1245 	hp = tcp_get_md5sig_pool();
1246 	if (!hp)
1247 		goto clear_hash_noput;
1248 	req = hp->md5_req;
1249 
1250 	if (crypto_ahash_init(req))
1251 		goto clear_hash;
1252 
1253 	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1254 		goto clear_hash;
1255 	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1256 		goto clear_hash;
1257 	if (tcp_md5_hash_key(hp, key))
1258 		goto clear_hash;
1259 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1260 	if (crypto_ahash_final(req))
1261 		goto clear_hash;
1262 
1263 	tcp_put_md5sig_pool();
1264 	return 0;
1265 
1266 clear_hash:
1267 	tcp_put_md5sig_pool();
1268 clear_hash_noput:
1269 	memset(md5_hash, 0, 16);
1270 	return 1;
1271 }
1272 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1273 
1274 #endif
1275 
1276 /* Called with rcu_read_lock() */
1277 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1278 				    const struct sk_buff *skb)
1279 {
1280 #ifdef CONFIG_TCP_MD5SIG
1281 	/*
1282 	 * This gets called for each TCP segment that arrives
1283 	 * so we want to be efficient.
1284 	 * We have 3 drop cases:
1285 	 * o No MD5 hash and one expected.
1286 	 * o MD5 hash and we're not expecting one.
1287 	 * o MD5 hash and its wrong.
1288 	 */
1289 	const __u8 *hash_location = NULL;
1290 	struct tcp_md5sig_key *hash_expected;
1291 	const struct iphdr *iph = ip_hdr(skb);
1292 	const struct tcphdr *th = tcp_hdr(skb);
1293 	int genhash;
1294 	unsigned char newhash[16];
1295 
1296 	hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1297 					  AF_INET);
1298 	hash_location = tcp_parse_md5sig_option(th);
1299 
1300 	/* We've parsed the options - do we have a hash? */
1301 	if (!hash_expected && !hash_location)
1302 		return false;
1303 
1304 	if (hash_expected && !hash_location) {
1305 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1306 		return true;
1307 	}
1308 
1309 	if (!hash_expected && hash_location) {
1310 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1311 		return true;
1312 	}
1313 
1314 	/* Okay, so this is hash_expected and hash_location -
1315 	 * so we need to calculate the checksum.
1316 	 */
1317 	genhash = tcp_v4_md5_hash_skb(newhash,
1318 				      hash_expected,
1319 				      NULL, skb);
1320 
1321 	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1322 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1323 		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1324 				     &iph->saddr, ntohs(th->source),
1325 				     &iph->daddr, ntohs(th->dest),
1326 				     genhash ? " tcp_v4_calc_md5_hash failed"
1327 				     : "");
1328 		return true;
1329 	}
1330 	return false;
1331 #endif
1332 	return false;
1333 }
1334 
1335 static void tcp_v4_init_req(struct request_sock *req,
1336 			    const struct sock *sk_listener,
1337 			    struct sk_buff *skb)
1338 {
1339 	struct inet_request_sock *ireq = inet_rsk(req);
1340 	struct net *net = sock_net(sk_listener);
1341 
1342 	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1343 	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1344 	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1345 }
1346 
1347 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1348 					  struct flowi *fl,
1349 					  const struct request_sock *req)
1350 {
1351 	return inet_csk_route_req(sk, &fl->u.ip4, req);
1352 }
1353 
/* Generic request-sock operations for PF_INET TCP. This table is handed to
 * tcp_conn_request() by tcp_v4_conn_request(). The destructor entry is the
 * function in this file that frees the request's saved IP options.
 */
1354 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1355 	.family		=	PF_INET,
1356 	.obj_size	=	sizeof(struct tcp_request_sock),
1357 	.rtx_syn_ack	=	tcp_rtx_synack,
1358 	.send_ack	=	tcp_v4_reqsk_send_ack,
1359 	.destructor	=	tcp_v4_reqsk_destructor,
1360 	.send_reset	=	tcp_v4_send_reset,
1361 	.syn_ack_timeout =	tcp_syn_ack_timeout,
1362 };
1363 
/* IPv4-specific request-sock hooks, handed to tcp_conn_request() together
 * with tcp_request_sock_ops. The MD5 and syncookie entries exist only when
 * the corresponding config option is enabled.
 */
1364 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1365 	.mss_clamp	=	TCP_MSS_DEFAULT,
1366 #ifdef CONFIG_TCP_MD5SIG
1367 	.req_md5_lookup	=	tcp_v4_md5_lookup,
1368 	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1369 #endif
1370 	.init_req	=	tcp_v4_init_req,
1371 #ifdef CONFIG_SYN_COOKIES
1372 	.cookie_init_seq =	cookie_v4_init_sequence,
1373 #endif
1374 	.route_req	=	tcp_v4_route_req,
1375 	.init_seq	=	tcp_v4_init_seq,
1376 	.init_ts_off	=	tcp_v4_init_ts_off,
1377 	.send_synack	=	tcp_v4_send_synack,
1378 };
1379 
1380 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1381 {
1382 	/* Never answer to SYNs send to broadcast or multicast */
1383 	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1384 		goto drop;
1385 
1386 	return tcp_conn_request(&tcp_request_sock_ops,
1387 				&tcp_request_sock_ipv4_ops, sk, skb);
1388 
1389 drop:
1390 	tcp_listendrop(sk);
1391 	return 0;
1392 }
1393 EXPORT_SYMBOL(tcp_v4_conn_request);
1394 
1395 
1396 /*
1397  * The three way handshake has completed - we got a valid synack -
1398  * now create the new socket.
1399  */
/*
 * @sk:         listening socket
 * @skb:        segment being processed; its IP header supplies mc_ttl and
 *              rcv_tos, and its dst is cached on the child
 * @req:        request sock whose addresses and IP options are copied
 * @dst:        route to use, or NULL to have one looked up here
 * @req_unhash: request passed to inet_ehash_nolisten() when the child is
 *              hashed
 * @own_req:    out parameter, set to the result of inet_ehash_nolisten()
 *
 * Returns the new child socket, or NULL after counting a listen drop.
 */
1400 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1401 				  struct request_sock *req,
1402 				  struct dst_entry *dst,
1403 				  struct request_sock *req_unhash,
1404 				  bool *own_req)
1405 {
1406 	struct inet_request_sock *ireq;
1407 	struct inet_sock *newinet;
1408 	struct tcp_sock *newtp;
1409 	struct sock *newsk;
1410 #ifdef CONFIG_TCP_MD5SIG
1411 	struct tcp_md5sig_key *key;
1412 #endif
1413 	struct ip_options_rcu *inet_opt;
1414 
1415 	if (sk_acceptq_is_full(sk))
1416 		goto exit_overflow;
1417 
1418 	newsk = tcp_create_openreq_child(sk, req, skb);
1419 	if (!newsk)
1420 		goto exit_nonewsk;
1421 
1422 	newsk->sk_gso_type = SKB_GSO_TCPV4;
1423 	inet_sk_rx_dst_set(newsk, skb);
1424 
1425 	newtp		      = tcp_sk(newsk);
1426 	newinet		      = inet_sk(newsk);
1427 	ireq		      = inet_rsk(req);
1428 	sk_daddr_set(newsk, ireq->ir_rmt_addr);
1429 	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1430 	newsk->sk_bound_dev_if = ireq->ir_iif;
1431 	newinet->inet_saddr   = ireq->ir_loc_addr;
	/* From here on the child and the request point at the same option
	 * block; every exit path below clears exactly one of the two pointers.
	 */
1432 	inet_opt	      = rcu_dereference(ireq->ireq_opt);
1433 	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1434 	newinet->mc_index     = inet_iif(skb);
1435 	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1436 	newinet->rcv_tos      = ip_hdr(skb)->tos;
1437 	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1438 	if (inet_opt)
1439 		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1440 	newinet->inet_id = newtp->write_seq ^ jiffies;
1441 
1442 	if (!dst) {
1443 		dst = inet_csk_route_child_sock(sk, newsk, req);
1444 		if (!dst)
1445 			goto put_and_exit;
1446 	} else {
1447 		/* syncookie case : see end of cookie_v4_check() */
1448 	}
1449 	sk_setup_caps(newsk, dst);
1450 
1451 	tcp_ca_openreq_child(newsk, dst);
1452 
1453 	tcp_sync_mss(newsk, dst_mtu(dst));
1454 	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1455 
1456 	tcp_initialize_rcv_mss(newsk);
1457 
1458 #ifdef CONFIG_TCP_MD5SIG
1459 	/* Copy over the MD5 key from the original socket */
1460 	key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1461 				AF_INET);
1462 	if (key) {
1463 		/*
1464 		 * We're using one, so create a matching key
1465 		 * on the newsk structure. If we fail to get
1466 		 * memory, then we end up not copying the key
1467 		 * across. Shucks.
1468 		 */
		/* The copy is always installed as a /32 entry, whatever prefix
		 * length the listener's matching key had.
		 */
1469 		tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1470 			       AF_INET, 32, key->key, key->keylen, GFP_ATOMIC);
1471 		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1472 	}
1473 #endif
1474 
1475 	if (__inet_inherit_port(sk, newsk) < 0)
1476 		goto put_and_exit;
1477 	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
1478 	if (likely(*own_req)) {
1479 		tcp_move_syn(newtp, req);
		/* The child keeps the options; detach them from the request so
		 * that the request destructor does not free them.
		 */
1480 		ireq->ireq_opt = NULL;
1481 	} else {
		/* The request keeps the options; drop the child's pointer. */
1482 		newinet->inet_opt = NULL;
1483 	}
1484 	return newsk;
1485 
1486 exit_overflow:
1487 	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1488 exit_nonewsk:
1489 	dst_release(dst);
1490 exit:
1491 	tcp_listendrop(sk);
1492 	return NULL;
1493 put_and_exit:
	/* The options still belong to the request: clear the child's pointer
	 * before the child is torn down. This path continues at "exit", so
	 * dst_release() is not called from here.
	 */
1494 	newinet->inet_opt = NULL;
1495 	inet_csk_prepare_forced_close(newsk);
1496 	tcp_done(newsk);
1497 	goto exit;
1498 }
1499 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1500 
/* Give a non-SYN segment that reached a listener to the syncookie check.
 * Returns whatever cookie_v4_check() returns in that case, and @sk itself
 * for SYN segments or when syncookies are compiled out.
 */
static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
{
#ifdef CONFIG_SYN_COOKIES
	if (!tcp_hdr(skb)->syn)
		return cookie_v4_check(sk, skb);
#endif
	return sk;
}
1511 
1512 /* The socket must have its spinlock held when we get
1513  * here, unless it is a TCP_LISTEN socket.
1514  *
1515  * We have a potential double-lock case here, so even when
1516  * doing backlog processing we use the BH locking scheme.
1517  * This is because we cannot sleep with the original spinlock
1518  * held.
1519  */
/*
 * Run one segment through the receive processing of @sk.
 *
 * Established sockets take the fast path; listeners first go through the
 * syncookie check; every other state goes to tcp_rcv_state_process().
 * Every path visible here returns 0. On the reset, discard and
 * checksum-error paths the skb is freed here with kfree_skb().
 */
1520 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1521 {
1522 	struct sock *rsk;
1523 
1524 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1525 		struct dst_entry *dst = sk->sk_rx_dst;
1526 
1527 		sock_rps_save_rxhash(sk, skb);
1528 		sk_mark_napi_id(sk, skb);
1529 		if (dst) {
			/* Forget the cached input route if the segment arrived
			 * on another interface or the route fails its check().
			 */
1530 			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1531 			    !dst->ops->check(dst, 0)) {
1532 				dst_release(dst);
1533 				sk->sk_rx_dst = NULL;
1534 			}
1535 		}
1536 		tcp_rcv_established(sk, skb);
1537 		return 0;
1538 	}
1539 
1540 	if (tcp_checksum_complete(skb))
1541 		goto csum_err;
1542 
1543 	if (sk->sk_state == TCP_LISTEN) {
		/* The result is NULL (discard), sk itself (carry on below), or
		 * a different socket that is processed as a child.
		 */
1544 		struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1545 
1546 		if (!nsk)
1547 			goto discard;
1548 		if (nsk != sk) {
1549 			if (tcp_child_process(sk, nsk, skb)) {
1550 				rsk = nsk;
1551 				goto reset;
1552 			}
1553 			return 0;
1554 		}
1555 	} else
1556 		sock_rps_save_rxhash(sk, skb);
1557 
1558 	if (tcp_rcv_state_process(sk, skb)) {
1559 		rsk = sk;
1560 		goto reset;
1561 	}
1562 	return 0;
1563 
1564 reset:
	/* rsk is the socket the reset is sent on behalf of. */
1565 	tcp_v4_send_reset(rsk, skb);
1566 discard:
1567 	kfree_skb(skb);
1568 	/* Be careful here. If this function gets more complicated and
1569 	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1570 	 * might be destroyed here. This current version compiles correctly,
1571 	 * but you have been warned.
1572 	 */
1573 	return 0;
1574 
1575 csum_err:
1576 	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1577 	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1578 	goto discard;
1579 }
1580 EXPORT_SYMBOL(tcp_v4_do_rcv);
1581 
1582 int tcp_v4_early_demux(struct sk_buff *skb)
1583 {
1584 	const struct iphdr *iph;
1585 	const struct tcphdr *th;
1586 	struct sock *sk;
1587 
1588 	if (skb->pkt_type != PACKET_HOST)
1589 		return 0;
1590 
1591 	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1592 		return 0;
1593 
1594 	iph = ip_hdr(skb);
1595 	th = tcp_hdr(skb);
1596 
1597 	if (th->doff < sizeof(struct tcphdr) / 4)
1598 		return 0;
1599 
1600 	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1601 				       iph->saddr, th->source,
1602 				       iph->daddr, ntohs(th->dest),
1603 				       skb->skb_iif, inet_sdif(skb));
1604 	if (sk) {
1605 		skb->sk = sk;
1606 		skb->destructor = sock_edemux;
1607 		if (sk_fullsock(sk)) {
1608 			struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1609 
1610 			if (dst)
1611 				dst = dst_check(dst, 0);
1612 			if (dst &&
1613 			    inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1614 				skb_dst_set_noref(skb, dst);
1615 		}
1616 	}
1617 	return 0;
1618 }
1619 
1620 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1621 {
1622 	u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;
1623 	struct skb_shared_info *shinfo;
1624 	const struct tcphdr *th;
1625 	struct tcphdr *thtail;
1626 	struct sk_buff *tail;
1627 	unsigned int hdrlen;
1628 	bool fragstolen;
1629 	u32 gso_segs;
1630 	int delta;
1631 
1632 	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1633 	 * we can fix skb->truesize to its real value to avoid future drops.
1634 	 * This is valid because skb is not yet charged to the socket.
1635 	 * It has been noticed pure SACK packets were sometimes dropped
1636 	 * (if cooked by drivers without copybreak feature).
1637 	 */
1638 	skb_condense(skb);
1639 
1640 	skb_dst_drop(skb);
1641 
1642 	if (unlikely(tcp_checksum_complete(skb))) {
1643 		bh_unlock_sock(sk);
1644 		__TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1645 		__TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1646 		return true;
1647 	}
1648 
1649 	/* Attempt coalescing to last skb in backlog, even if we are
1650 	 * above the limits.
1651 	 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1652 	 */
1653 	th = (const struct tcphdr *)skb->data;
1654 	hdrlen = th->doff * 4;
1655 	shinfo = skb_shinfo(skb);
1656 
1657 	if (!shinfo->gso_size)
1658 		shinfo->gso_size = skb->len - hdrlen;
1659 
1660 	if (!shinfo->gso_segs)
1661 		shinfo->gso_segs = 1;
1662 
1663 	tail = sk->sk_backlog.tail;
1664 	if (!tail)
1665 		goto no_coalesce;
1666 	thtail = (struct tcphdr *)tail->data;
1667 
1668 	if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1669 	    TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1670 	    ((TCP_SKB_CB(tail)->tcp_flags |
1671 	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1672 	    !((TCP_SKB_CB(tail)->tcp_flags &
1673 	      TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1674 	    ((TCP_SKB_CB(tail)->tcp_flags ^
1675 	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1676 #ifdef CONFIG_TLS_DEVICE
1677 	    tail->decrypted != skb->decrypted ||
1678 #endif
1679 	    thtail->doff != th->doff ||
1680 	    memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1681 		goto no_coalesce;
1682 
1683 	__skb_pull(skb, hdrlen);
1684 	if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1685 		thtail->window = th->window;
1686 
1687 		TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1688 
1689 		if (after(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))
1690 			TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1691 
1692 		/* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1693 		 * thtail->fin, so that the fast path in tcp_rcv_established()
1694 		 * is not entered if we append a packet with a FIN.
1695 		 * SYN, RST, URG are not present.
1696 		 * ACK is set on both packets.
1697 		 * PSH : we do not really care in TCP stack,
1698 		 *       at least for 'GRO' packets.
1699 		 */
1700 		thtail->fin |= th->fin;
1701 		TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1702 
1703 		if (TCP_SKB_CB(skb)->has_rxtstamp) {
1704 			TCP_SKB_CB(tail)->has_rxtstamp = true;
1705 			tail->tstamp = skb->tstamp;
1706 			skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1707 		}
1708 
1709 		/* Not as strict as GRO. We only need to carry mss max value */
1710 		skb_shinfo(tail)->gso_size = max(shinfo->gso_size,
1711 						 skb_shinfo(tail)->gso_size);
1712 
1713 		gso_segs = skb_shinfo(tail)->gso_segs + shinfo->gso_segs;
1714 		skb_shinfo(tail)->gso_segs = min_t(u32, gso_segs, 0xFFFF);
1715 
1716 		sk->sk_backlog.len += delta;
1717 		__NET_INC_STATS(sock_net(sk),
1718 				LINUX_MIB_TCPBACKLOGCOALESCE);
1719 		kfree_skb_partial(skb, fragstolen);
1720 		return false;
1721 	}
1722 	__skb_push(skb, hdrlen);
1723 
1724 no_coalesce:
1725 	/* Only socket owner can try to collapse/prune rx queues
1726 	 * to reduce memory overhead, so add a little headroom here.
1727 	 * Few sockets backlog are possibly concurrently non empty.
1728 	 */
1729 	limit += 64*1024;
1730 
1731 	if (unlikely(sk_add_backlog(sk, skb, limit))) {
1732 		bh_unlock_sock(sk);
1733 		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1734 		return true;
1735 	}
1736 	return false;
1737 }
1738 EXPORT_SYMBOL(tcp_add_backlog);
1739 
1740 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1741 {
1742 	struct tcphdr *th = (struct tcphdr *)skb->data;
1743 
1744 	return sk_filter_trim_cap(sk, skb, th->doff * 4);
1745 }
1746 EXPORT_SYMBOL(tcp_filter);
1747 
1748 static void tcp_v4_restore_cb(struct sk_buff *skb)
1749 {
1750 	memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1751 		sizeof(struct inet_skb_parm));
1752 }
1753 
1754 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1755 			   const struct tcphdr *th)
1756 {
1757 	/* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
1758 	 * barrier() makes sure compiler wont play fool^Waliasing games.
1759 	 */
1760 	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1761 		sizeof(struct inet_skb_parm));
1762 	barrier();
1763 
1764 	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1765 	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1766 				    skb->len - th->doff * 4);
1767 	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1768 	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1769 	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1770 	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1771 	TCP_SKB_CB(skb)->sacked	 = 0;
1772 	TCP_SKB_CB(skb)->has_rxtstamp =
1773 			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1774 }
1775 
1776 /*
1777  *	From tcp_input.c
1778  */
1779 
/*
 * Receive entry point for an IPv4 TCP segment.
 *
 * Checks the header, finds the socket the segment belongs to and hands the
 * segment to it: directly, via the backlog when the socket is owned by
 * user context, or through the request-sock and time-wait special cases.
 *
 * Returns the result of tcp_v4_do_rcv() on the normal delivery path and 0
 * everywhere else.
 */
1780 int tcp_v4_rcv(struct sk_buff *skb)
1781 {
1782 	struct net *net = dev_net(skb->dev);
1783 	struct sk_buff *skb_to_free;
1784 	int sdif = inet_sdif(skb);
1785 	const struct iphdr *iph;
1786 	const struct tcphdr *th;
1787 	bool refcounted;
1788 	struct sock *sk;
1789 	int ret;
1790 
1791 	if (skb->pkt_type != PACKET_HOST)
1792 		goto discard_it;
1793 
1794 	/* Count it even if it's bad */
1795 	__TCP_INC_STATS(net, TCP_MIB_INSEGS);
1796 
1797 	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1798 		goto discard_it;
1799 
1800 	th = (const struct tcphdr *)skb->data;
1801 
1802 	if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1803 		goto bad_packet;
1804 	if (!pskb_may_pull(skb, th->doff * 4))
1805 		goto discard_it;
1806 
1807 	/* An explanation is required here, I think.
1808 	 * Packet length and doff are validated by header prediction,
1809 	 * provided case of th->doff==0 is eliminated.
1810 	 * So, we defer the checks. */
1811 
1812 	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1813 		goto csum_error;
1814 
	/* Header pointers are loaded again after the calls above. */
1815 	th = (const struct tcphdr *)skb->data;
1816 	iph = ip_hdr(skb);
1817 lookup:
1818 	sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1819 			       th->dest, sdif, &refcounted);
1820 	if (!sk)
1821 		goto no_tcp_socket;
1822 
1823 process:
1824 	if (sk->sk_state == TCP_TIME_WAIT)
1825 		goto do_time_wait;
1826 
1827 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		/* The lookup returned a request sock; from here on sk is the
		 * listener that request belongs to.
		 */
1828 		struct request_sock *req = inet_reqsk(sk);
1829 		bool req_stolen = false;
1830 		struct sock *nsk;
1831 
1832 		sk = req->rsk_listener;
1833 		if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
1834 			sk_drops_add(sk, skb);
1835 			reqsk_put(req);
1836 			goto discard_it;
1837 		}
1838 		if (tcp_checksum_complete(skb)) {
1839 			reqsk_put(req);
1840 			goto csum_error;
1841 		}
		/* The listener is no longer listening: drop the request and
		 * repeat the lookup.
		 */
1842 		if (unlikely(sk->sk_state != TCP_LISTEN)) {
1843 			inet_csk_reqsk_queue_drop_and_put(sk, req);
1844 			goto lookup;
1845 		}
1846 		/* We own a reference on the listener, increase it again
1847 		 * as we might lose it too soon.
1848 		 */
1849 		sock_hold(sk);
1850 		refcounted = true;
1851 		nsk = NULL;
1852 		if (!tcp_filter(sk, skb)) {
1853 			th = (const struct tcphdr *)skb->data;
1854 			iph = ip_hdr(skb);
1855 			tcp_v4_fill_cb(skb, iph, th);
1856 			nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
1857 		}
1858 		if (!nsk) {
1859 			reqsk_put(req);
1860 			if (req_stolen) {
1861 				/* Another cpu got exclusive access to req
1862 				 * and created a full blown socket.
1863 				 * Try to feed this packet to this socket
1864 				 * instead of discarding it.
1865 				 */
1866 				tcp_v4_restore_cb(skb);
1867 				sock_put(sk);
1868 				goto lookup;
1869 			}
1870 			goto discard_and_relse;
1871 		}
		/* nsk == sk: continue below with the listener itself, after
		 * putting the control block back into its IP form.
		 * Otherwise nsk is a child socket and is processed right here.
		 */
1872 		if (nsk == sk) {
1873 			reqsk_put(req);
1874 			tcp_v4_restore_cb(skb);
1875 		} else if (tcp_child_process(sk, nsk, skb)) {
1876 			tcp_v4_send_reset(nsk, skb);
1877 			goto discard_and_relse;
1878 		} else {
1879 			sock_put(sk);
1880 			return 0;
1881 		}
1882 	}
1883 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1884 		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
1885 		goto discard_and_relse;
1886 	}
1887 
1888 	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1889 		goto discard_and_relse;
1890 
1891 	if (tcp_v4_inbound_md5_hash(sk, skb))
1892 		goto discard_and_relse;
1893 
1894 	nf_reset(skb);
1895 
1896 	if (tcp_filter(sk, skb))
1897 		goto discard_and_relse;
1898 	th = (const struct tcphdr *)skb->data;
1899 	iph = ip_hdr(skb);
1900 	tcp_v4_fill_cb(skb, iph, th);
1901 
1902 	skb->dev = NULL;
1903 
	/* Listeners are processed without taking the socket lock here. */
1904 	if (sk->sk_state == TCP_LISTEN) {
1905 		ret = tcp_v4_do_rcv(sk, skb);
1906 		goto put_and_return;
1907 	}
1908 
1909 	sk_incoming_cpu_update(sk);
1910 
1911 	bh_lock_sock_nested(sk);
1912 	tcp_segs_in(tcp_sk(sk), skb);
1913 	ret = 0;
1914 	if (!sock_owned_by_user(sk)) {
1915 		skb_to_free = sk->sk_rx_skb_cache;
1916 		sk->sk_rx_skb_cache = NULL;
1917 		ret = tcp_v4_do_rcv(sk, skb);
1918 	} else {
		/* When tcp_add_backlog() returns true it has already unlocked
		 * the socket, so this path must not unlock again.
		 */
1919 		if (tcp_add_backlog(sk, skb))
1920 			goto discard_and_relse;
1921 		skb_to_free = NULL;
1922 	}
1923 	bh_unlock_sock(sk);
1924 	if (skb_to_free)
1925 		__kfree_skb(skb_to_free);
1926 
1927 put_and_return:
1928 	if (refcounted)
1929 		sock_put(sk);
1930 
1931 	return ret;
1932 
1933 no_tcp_socket:
1934 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1935 		goto discard_it;
1936 
1937 	tcp_v4_fill_cb(skb, iph, th);
1938 
	/* The csum_error and bad_packet labels sit inside this if-body, so
	 * jumps to them share the statistics updates and then fall through
	 * to discard_it without sending a reset.
	 */
1939 	if (tcp_checksum_complete(skb)) {
1940 csum_error:
1941 		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
1942 bad_packet:
1943 		__TCP_INC_STATS(net, TCP_MIB_INERRS);
1944 	} else {
1945 		tcp_v4_send_reset(NULL, skb);
1946 	}
1947 
1948 discard_it:
1949 	/* Discard frame. */
1950 	kfree_skb(skb);
1951 	return 0;
1952 
1953 discard_and_relse:
1954 	sk_drops_add(sk, skb);
1955 	if (refcounted)
1956 		sock_put(sk);
1957 	goto discard_it;
1958 
1959 do_time_wait:
1960 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1961 		inet_twsk_put(inet_twsk(sk));
1962 		goto discard_it;
1963 	}
1964 
1965 	tcp_v4_fill_cb(skb, iph, th);
1966 
1967 	if (tcp_checksum_complete(skb)) {
1968 		inet_twsk_put(inet_twsk(sk));
1969 		goto csum_error;
1970 	}
1971 	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1972 	case TCP_TW_SYN: {
		/* A SYN arrived for a time-wait socket: if a listener exists,
		 * retire the time-wait socket and restart processing on the
		 * listener. refcounted is cleared, so no sock_put() is done
		 * for that listener later on.
		 */
1973 		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1974 							&tcp_hashinfo, skb,
1975 							__tcp_hdrlen(th),
1976 							iph->saddr, th->source,
1977 							iph->daddr, th->dest,
1978 							inet_iif(skb),
1979 							sdif);
1980 		if (sk2) {
1981 			inet_twsk_deschedule_put(inet_twsk(sk));
1982 			sk = sk2;
1983 			tcp_v4_restore_cb(skb);
1984 			refcounted = false;
1985 			goto process;
1986 		}
1987 	}
1988 		/* to ACK */
1989 		/* fall through */
1990 	case TCP_TW_ACK:
1991 		tcp_v4_timewait_ack(sk, skb);
1992 		break;
1993 	case TCP_TW_RST:
1994 		tcp_v4_send_reset(sk, skb);
1995 		inet_twsk_deschedule_put(inet_twsk(sk));
1996 		goto discard_it;
1997 	case TCP_TW_SUCCESS:;
1998 	}
1999 	goto discard_it;
2000 }
2001 
/* Time-wait socket operations for TCP: object size plus the unique and
 * destructor hooks.
 */
2002 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2003 	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
2004 	.twsk_unique	= tcp_twsk_unique,
2005 	.twsk_destructor= tcp_twsk_destructor,
2006 };
2007 
2008 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2009 {
2010 	struct dst_entry *dst = skb_dst(skb);
2011 
2012 	if (dst && dst_hold_safe(dst)) {
2013 		sk->sk_rx_dst = dst;
2014 		inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
2015 	}
2016 }
2017 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2018 
/* Address-family operations for TCP over IPv4. tcp_v4_init_sock() installs
 * this table as icsk_af_ops. The compat socket-option hooks exist only
 * with CONFIG_COMPAT.
 */
2019 const struct inet_connection_sock_af_ops ipv4_specific = {
2020 	.queue_xmit	   = ip_queue_xmit,
2021 	.send_check	   = tcp_v4_send_check,
2022 	.rebuild_header	   = inet_sk_rebuild_header,
2023 	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
2024 	.conn_request	   = tcp_v4_conn_request,
2025 	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
2026 	.net_header_len	   = sizeof(struct iphdr),
2027 	.setsockopt	   = ip_setsockopt,
2028 	.getsockopt	   = ip_getsockopt,
2029 	.addr2sockaddr	   = inet_csk_addr2sockaddr,
2030 	.sockaddr_len	   = sizeof(struct sockaddr_in),
2031 #ifdef CONFIG_COMPAT
2032 	.compat_setsockopt = compat_ip_setsockopt,
2033 	.compat_getsockopt = compat_ip_getsockopt,
2034 #endif
2035 	.mtu_reduced	   = tcp_v4_mtu_reduced,
2036 };
2037 EXPORT_SYMBOL(ipv4_specific);
2038 
2039 #ifdef CONFIG_TCP_MD5SIG
/* MD5 signature hooks for IPv4 sockets (key lookup, hash computation,
 * socket-option parsing). tcp_v4_init_sock() installs this table as
 * af_specific.
 */
2040 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2041 	.md5_lookup		= tcp_v4_md5_lookup,
2042 	.calc_md5_hash		= tcp_v4_md5_hash_skb,
2043 	.md5_parse		= tcp_v4_parse_md5_keys,
2044 };
2045 #endif
2046 
2047 /* NOTE: A lot of things set to zero explicitly by call to
2048  *       sk_alloc() so need not be done here.
2049  */
2050 static int tcp_v4_init_sock(struct sock *sk)
2051 {
2052 	struct inet_connection_sock *icsk = inet_csk(sk);
2053 
2054 	tcp_init_sock(sk);
2055 
2056 	icsk->icsk_af_ops = &ipv4_specific;
2057 
2058 #ifdef CONFIG_TCP_MD5SIG
2059 	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2060 #endif
2061 
2062 	return 0;
2063 }
2064 
2065 void tcp_v4_destroy_sock(struct sock *sk)
2066 {
2067 	struct tcp_sock *tp = tcp_sk(sk);
2068 
2069 	trace_tcp_destroy_sock(sk);
2070 
2071 	tcp_clear_xmit_timers(sk);
2072 
2073 	tcp_cleanup_congestion_control(sk);
2074 
2075 	tcp_cleanup_ulp(sk);
2076 
2077 	/* Cleanup up the write buffer. */
2078 	tcp_write_queue_purge(sk);
2079 
2080 	/* Check if we want to disable active TFO */
2081 	tcp_fastopen_active_disable_ofo_check(sk);
2082 
2083 	/* Cleans up our, hopefully empty, out_of_order_queue. */
2084 	skb_rbtree_purge(&tp->out_of_order_queue);
2085 
2086 #ifdef CONFIG_TCP_MD5SIG
2087 	/* Clean up the MD5 key list, if any */
2088 	if (tp->md5sig_info) {
2089 		tcp_clear_md5_list(sk);
2090 		kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2091 		tp->md5sig_info = NULL;
2092 	}
2093 #endif
2094 
2095 	/* Clean up a referenced TCP bind bucket. */
2096 	if (inet_csk(sk)->icsk_bind_hash)
2097 		inet_put_port(sk);
2098 
2099 	BUG_ON(tp->fastopen_rsk);
2100 
2101 	/* If socket is aborted during connect operation */
2102 	tcp_free_fastopen_req(tp);
2103 	tcp_fastopen_destroy_cipher(sk);
2104 	tcp_saved_syn_free(tp);
2105 
2106 	sk_sockets_allocated_dec(sk);
2107 }
2108 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2109 
2110 #ifdef CONFIG_PROC_FS
2111 /* Proc filesystem TCP sock list dumping. */
2112 
2113 /*
2114  * Get next listener socket follow cur.  If cur is NULL, get first socket
2115  * starting from bucket given in st->bucket; when st->bucket is zero the
2116  * very first socket in the hash table is returned.
2117  */
2118 static void *listening_get_next(struct seq_file *seq, void *cur)
2119 {
2120 	struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2121 	struct tcp_iter_state *st = seq->private;
2122 	struct net *net = seq_file_net(seq);
2123 	struct inet_listen_hashbucket *ilb;
2124 	struct sock *sk = cur;
2125 
2126 	if (!sk) {
2127 get_head:
2128 		ilb = &tcp_hashinfo.listening_hash[st->bucket];
2129 		spin_lock(&ilb->lock);
2130 		sk = sk_head(&ilb->head);
2131 		st->offset = 0;
2132 		goto get_sk;
2133 	}
2134 	ilb = &tcp_hashinfo.listening_hash[st->bucket];
2135 	++st->num;
2136 	++st->offset;
2137 
2138 	sk = sk_next(sk);
2139 get_sk:
2140 	sk_for_each_from(sk) {
2141 		if (!net_eq(sock_net(sk), net))
2142 			continue;
2143 		if (sk->sk_family == afinfo->family)
2144 			return sk;
2145 	}
2146 	spin_unlock(&ilb->lock);
2147 	st->offset = 0;
2148 	if (++st->bucket < INET_LHTABLE_SIZE)
2149 		goto get_head;
2150 	return NULL;
2151 }
2152 
2153 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2154 {
2155 	struct tcp_iter_state *st = seq->private;
2156 	void *rc;
2157 
2158 	st->bucket = 0;
2159 	st->offset = 0;
2160 	rc = listening_get_next(seq, NULL);
2161 
2162 	while (rc && *pos) {
2163 		rc = listening_get_next(seq, rc);
2164 		--*pos;
2165 	}
2166 	return rc;
2167 }
2168 
2169 static inline bool empty_bucket(const struct tcp_iter_state *st)
2170 {
2171 	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2172 }
2173 
2174 /*
2175  * Get first established socket starting from bucket given in st->bucket.
2176  * If st->bucket is zero, the very first socket in the hash is returned.
2177  */
2178 static void *established_get_first(struct seq_file *seq)
2179 {
2180 	struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2181 	struct tcp_iter_state *st = seq->private;
2182 	struct net *net = seq_file_net(seq);
2183 	void *rc = NULL;
2184 
2185 	st->offset = 0;
2186 	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2187 		struct sock *sk;
2188 		struct hlist_nulls_node *node;
2189 		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2190 
2191 		/* Lockless fast path for the common case of empty buckets */
2192 		if (empty_bucket(st))
2193 			continue;
2194 
2195 		spin_lock_bh(lock);
2196 		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2197 			if (sk->sk_family != afinfo->family ||
2198 			    !net_eq(sock_net(sk), net)) {
2199 				continue;
2200 			}
2201 			rc = sk;
2202 			goto out;
2203 		}
2204 		spin_unlock_bh(lock);
2205 	}
2206 out:
2207 	return rc;
2208 }
2209 
2210 static void *established_get_next(struct seq_file *seq, void *cur)
2211 {
2212 	struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2213 	struct sock *sk = cur;
2214 	struct hlist_nulls_node *node;
2215 	struct tcp_iter_state *st = seq->private;
2216 	struct net *net = seq_file_net(seq);
2217 
2218 	++st->num;
2219 	++st->offset;
2220 
2221 	sk = sk_nulls_next(sk);
2222 
2223 	sk_nulls_for_each_from(sk, node) {
2224 		if (sk->sk_family == afinfo->family &&
2225 		    net_eq(sock_net(sk), net))
2226 			return sk;
2227 	}
2228 
2229 	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2230 	++st->bucket;
2231 	return established_get_first(seq);
2232 }
2233 
2234 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2235 {
2236 	struct tcp_iter_state *st = seq->private;
2237 	void *rc;
2238 
2239 	st->bucket = 0;
2240 	rc = established_get_first(seq);
2241 
2242 	while (rc && pos) {
2243 		rc = established_get_next(seq, rc);
2244 		--pos;
2245 	}
2246 	return rc;
2247 }
2248 
2249 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2250 {
2251 	void *rc;
2252 	struct tcp_iter_state *st = seq->private;
2253 
2254 	st->state = TCP_SEQ_STATE_LISTENING;
2255 	rc	  = listening_get_idx(seq, &pos);
2256 
2257 	if (!rc) {
2258 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2259 		rc	  = established_get_idx(seq, pos);
2260 	}
2261 
2262 	return rc;
2263 }
2264 
2265 static void *tcp_seek_last_pos(struct seq_file *seq)
2266 {
2267 	struct tcp_iter_state *st = seq->private;
2268 	int offset = st->offset;
2269 	int orig_num = st->num;
2270 	void *rc = NULL;
2271 
2272 	switch (st->state) {
2273 	case TCP_SEQ_STATE_LISTENING:
2274 		if (st->bucket >= INET_LHTABLE_SIZE)
2275 			break;
2276 		st->state = TCP_SEQ_STATE_LISTENING;
2277 		rc = listening_get_next(seq, NULL);
2278 		while (offset-- && rc)
2279 			rc = listening_get_next(seq, rc);
2280 		if (rc)
2281 			break;
2282 		st->bucket = 0;
2283 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2284 		/* Fallthrough */
2285 	case TCP_SEQ_STATE_ESTABLISHED:
2286 		if (st->bucket > tcp_hashinfo.ehash_mask)
2287 			break;
2288 		rc = established_get_first(seq);
2289 		while (offset-- && rc)
2290 			rc = established_get_next(seq, rc);
2291 	}
2292 
2293 	st->num = orig_num;
2294 
2295 	return rc;
2296 }
2297 
2298 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2299 {
2300 	struct tcp_iter_state *st = seq->private;
2301 	void *rc;
2302 
2303 	if (*pos && *pos == st->last_pos) {
2304 		rc = tcp_seek_last_pos(seq);
2305 		if (rc)
2306 			goto out;
2307 	}
2308 
2309 	st->state = TCP_SEQ_STATE_LISTENING;
2310 	st->num = 0;
2311 	st->bucket = 0;
2312 	st->offset = 0;
2313 	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2314 
2315 out:
2316 	st->last_pos = *pos;
2317 	return rc;
2318 }
2319 EXPORT_SYMBOL(tcp_seq_start);
2320 
2321 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2322 {
2323 	struct tcp_iter_state *st = seq->private;
2324 	void *rc = NULL;
2325 
2326 	if (v == SEQ_START_TOKEN) {
2327 		rc = tcp_get_idx(seq, 0);
2328 		goto out;
2329 	}
2330 
2331 	switch (st->state) {
2332 	case TCP_SEQ_STATE_LISTENING:
2333 		rc = listening_get_next(seq, v);
2334 		if (!rc) {
2335 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2336 			st->bucket = 0;
2337 			st->offset = 0;
2338 			rc	  = established_get_first(seq);
2339 		}
2340 		break;
2341 	case TCP_SEQ_STATE_ESTABLISHED:
2342 		rc = established_get_next(seq, v);
2343 		break;
2344 	}
2345 out:
2346 	++*pos;
2347 	st->last_pos = *pos;
2348 	return rc;
2349 }
2350 EXPORT_SYMBOL(tcp_seq_next);
2351 
2352 void tcp_seq_stop(struct seq_file *seq, void *v)
2353 {
2354 	struct tcp_iter_state *st = seq->private;
2355 
2356 	switch (st->state) {
2357 	case TCP_SEQ_STATE_LISTENING:
2358 		if (v != SEQ_START_TOKEN)
2359 			spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
2360 		break;
2361 	case TCP_SEQ_STATE_ESTABLISHED:
2362 		if (v)
2363 			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2364 		break;
2365 	}
2366 }
2367 EXPORT_SYMBOL(tcp_seq_stop);
2368 
2369 static void get_openreq4(const struct request_sock *req,
2370 			 struct seq_file *f, int i)
2371 {
2372 	const struct inet_request_sock *ireq = inet_rsk(req);
2373 	long delta = req->rsk_timer.expires - jiffies;
2374 
2375 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2376 		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2377 		i,
2378 		ireq->ir_loc_addr,
2379 		ireq->ir_num,
2380 		ireq->ir_rmt_addr,
2381 		ntohs(ireq->ir_rmt_port),
2382 		TCP_SYN_RECV,
2383 		0, 0, /* could print option size, but that is af dependent. */
2384 		1,    /* timers active (only the expire timer) */
2385 		jiffies_delta_to_clock_t(delta),
2386 		req->num_timeout,
2387 		from_kuid_munged(seq_user_ns(f),
2388 				 sock_i_uid(req->rsk_listener)),
2389 		0,  /* non standard timer */
2390 		0, /* open_requests have no inode */
2391 		0,
2392 		req);
2393 }
2394 
2395 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2396 {
2397 	int timer_active;
2398 	unsigned long timer_expires;
2399 	const struct tcp_sock *tp = tcp_sk(sk);
2400 	const struct inet_connection_sock *icsk = inet_csk(sk);
2401 	const struct inet_sock *inet = inet_sk(sk);
2402 	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2403 	__be32 dest = inet->inet_daddr;
2404 	__be32 src = inet->inet_rcv_saddr;
2405 	__u16 destp = ntohs(inet->inet_dport);
2406 	__u16 srcp = ntohs(inet->inet_sport);
2407 	int rx_queue;
2408 	int state;
2409 
2410 	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2411 	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2412 	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2413 		timer_active	= 1;
2414 		timer_expires	= icsk->icsk_timeout;
2415 	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2416 		timer_active	= 4;
2417 		timer_expires	= icsk->icsk_timeout;
2418 	} else if (timer_pending(&sk->sk_timer)) {
2419 		timer_active	= 2;
2420 		timer_expires	= sk->sk_timer.expires;
2421 	} else {
2422 		timer_active	= 0;
2423 		timer_expires = jiffies;
2424 	}
2425 
2426 	state = inet_sk_state_load(sk);
2427 	if (state == TCP_LISTEN)
2428 		rx_queue = sk->sk_ack_backlog;
2429 	else
2430 		/* Because we don't lock the socket,
2431 		 * we might find a transient negative value.
2432 		 */
2433 		rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2434 
2435 	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2436 			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2437 		i, src, srcp, dest, destp, state,
2438 		tp->write_seq - tp->snd_una,
2439 		rx_queue,
2440 		timer_active,
2441 		jiffies_delta_to_clock_t(timer_expires - jiffies),
2442 		icsk->icsk_retransmits,
2443 		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2444 		icsk->icsk_probes_out,
2445 		sock_i_ino(sk),
2446 		refcount_read(&sk->sk_refcnt), sk,
2447 		jiffies_to_clock_t(icsk->icsk_rto),
2448 		jiffies_to_clock_t(icsk->icsk_ack.ato),
2449 		(icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2450 		tp->snd_cwnd,
2451 		state == TCP_LISTEN ?
2452 		    fastopenq->max_qlen :
2453 		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2454 }
2455 
2456 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2457 			       struct seq_file *f, int i)
2458 {
2459 	long delta = tw->tw_timer.expires - jiffies;
2460 	__be32 dest, src;
2461 	__u16 destp, srcp;
2462 
2463 	dest  = tw->tw_daddr;
2464 	src   = tw->tw_rcv_saddr;
2465 	destp = ntohs(tw->tw_dport);
2466 	srcp  = ntohs(tw->tw_sport);
2467 
2468 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2469 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2470 		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2471 		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2472 		refcount_read(&tw->tw_refcnt), tw);
2473 }
2474 
2475 #define TMPSZ 150
2476 
2477 static int tcp4_seq_show(struct seq_file *seq, void *v)
2478 {
2479 	struct tcp_iter_state *st;
2480 	struct sock *sk = v;
2481 
2482 	seq_setwidth(seq, TMPSZ - 1);
2483 	if (v == SEQ_START_TOKEN) {
2484 		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2485 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2486 			   "inode");
2487 		goto out;
2488 	}
2489 	st = seq->private;
2490 
2491 	if (sk->sk_state == TCP_TIME_WAIT)
2492 		get_timewait4_sock(v, seq, st->num);
2493 	else if (sk->sk_state == TCP_NEW_SYN_RECV)
2494 		get_openreq4(v, seq, st->num);
2495 	else
2496 		get_tcp4_sock(v, seq, st->num);
2497 out:
2498 	seq_pad(seq, '\n');
2499 	return 0;
2500 }
2501 
2502 static const struct seq_operations tcp4_seq_ops = {
2503 	.show		= tcp4_seq_show,
2504 	.start		= tcp_seq_start,
2505 	.next		= tcp_seq_next,
2506 	.stop		= tcp_seq_stop,
2507 };
2508 
2509 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2510 	.family		= AF_INET,
2511 };
2512 
2513 static int __net_init tcp4_proc_init_net(struct net *net)
2514 {
2515 	if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
2516 			sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
2517 		return -ENOMEM;
2518 	return 0;
2519 }
2520 
2521 static void __net_exit tcp4_proc_exit_net(struct net *net)
2522 {
2523 	remove_proc_entry("tcp", net->proc_net);
2524 }
2525 
2526 static struct pernet_operations tcp4_net_ops = {
2527 	.init = tcp4_proc_init_net,
2528 	.exit = tcp4_proc_exit_net,
2529 };
2530 
2531 int __init tcp4_proc_init(void)
2532 {
2533 	return register_pernet_subsys(&tcp4_net_ops);
2534 }
2535 
2536 void tcp4_proc_exit(void)
2537 {
2538 	unregister_pernet_subsys(&tcp4_net_ops);
2539 }
2540 #endif /* CONFIG_PROC_FS */
2541 
2542 struct proto tcp_prot = {
2543 	.name			= "TCP",
2544 	.owner			= THIS_MODULE,
2545 	.close			= tcp_close,
2546 	.pre_connect		= tcp_v4_pre_connect,
2547 	.connect		= tcp_v4_connect,
2548 	.disconnect		= tcp_disconnect,
2549 	.accept			= inet_csk_accept,
2550 	.ioctl			= tcp_ioctl,
2551 	.init			= tcp_v4_init_sock,
2552 	.destroy		= tcp_v4_destroy_sock,
2553 	.shutdown		= tcp_shutdown,
2554 	.setsockopt		= tcp_setsockopt,
2555 	.getsockopt		= tcp_getsockopt,
2556 	.keepalive		= tcp_set_keepalive,
2557 	.recvmsg		= tcp_recvmsg,
2558 	.sendmsg		= tcp_sendmsg,
2559 	.sendpage		= tcp_sendpage,
2560 	.backlog_rcv		= tcp_v4_do_rcv,
2561 	.release_cb		= tcp_release_cb,
2562 	.hash			= inet_hash,
2563 	.unhash			= inet_unhash,
2564 	.get_port		= inet_csk_get_port,
2565 	.enter_memory_pressure	= tcp_enter_memory_pressure,
2566 	.leave_memory_pressure	= tcp_leave_memory_pressure,
2567 	.stream_memory_free	= tcp_stream_memory_free,
2568 	.sockets_allocated	= &tcp_sockets_allocated,
2569 	.orphan_count		= &tcp_orphan_count,
2570 	.memory_allocated	= &tcp_memory_allocated,
2571 	.memory_pressure	= &tcp_memory_pressure,
2572 	.sysctl_mem		= sysctl_tcp_mem,
2573 	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
2574 	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
2575 	.max_header		= MAX_TCP_HEADER,
2576 	.obj_size		= sizeof(struct tcp_sock),
2577 	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
2578 	.twsk_prot		= &tcp_timewait_sock_ops,
2579 	.rsk_prot		= &tcp_request_sock_ops,
2580 	.h.hashinfo		= &tcp_hashinfo,
2581 	.no_autobind		= true,
2582 #ifdef CONFIG_COMPAT
2583 	.compat_setsockopt	= compat_tcp_setsockopt,
2584 	.compat_getsockopt	= compat_tcp_getsockopt,
2585 #endif
2586 	.diag_destroy		= tcp_abort,
2587 };
2588 EXPORT_SYMBOL(tcp_prot);
2589 
2590 static void __net_exit tcp_sk_exit(struct net *net)
2591 {
2592 	int cpu;
2593 
2594 	if (net->ipv4.tcp_congestion_control)
2595 		module_put(net->ipv4.tcp_congestion_control->owner);
2596 
2597 	for_each_possible_cpu(cpu)
2598 		inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2599 	free_percpu(net->ipv4.tcp_sk);
2600 }
2601 
2602 static int __net_init tcp_sk_init(struct net *net)
2603 {
2604 	int res, cpu, cnt;
2605 
2606 	net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2607 	if (!net->ipv4.tcp_sk)
2608 		return -ENOMEM;
2609 
2610 	for_each_possible_cpu(cpu) {
2611 		struct sock *sk;
2612 
2613 		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2614 					   IPPROTO_TCP, net);
2615 		if (res)
2616 			goto fail;
2617 		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2618 
2619 		/* Please enforce IP_DF and IPID==0 for RST and
2620 		 * ACK sent in SYN-RECV and TIME-WAIT state.
2621 		 */
2622 		inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
2623 
2624 		*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2625 	}
2626 
2627 	net->ipv4.sysctl_tcp_ecn = 2;
2628 	net->ipv4.sysctl_tcp_ecn_fallback = 1;
2629 
2630 	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2631 	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2632 	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2633 
2634 	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2635 	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2636 	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2637 
2638 	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2639 	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2640 	net->ipv4.sysctl_tcp_syncookies = 1;
2641 	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2642 	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2643 	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2644 	net->ipv4.sysctl_tcp_orphan_retries = 0;
2645 	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2646 	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
2647 	net->ipv4.sysctl_tcp_tw_reuse = 2;
2648 
2649 	cnt = tcp_hashinfo.ehash_mask + 1;
2650 	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
2651 	net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
2652 
2653 	net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 256);
2654 	net->ipv4.sysctl_tcp_sack = 1;
2655 	net->ipv4.sysctl_tcp_window_scaling = 1;
2656 	net->ipv4.sysctl_tcp_timestamps = 1;
2657 	net->ipv4.sysctl_tcp_early_retrans = 3;
2658 	net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
2659 	net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
2660 	net->ipv4.sysctl_tcp_retrans_collapse = 1;
2661 	net->ipv4.sysctl_tcp_max_reordering = 300;
2662 	net->ipv4.sysctl_tcp_dsack = 1;
2663 	net->ipv4.sysctl_tcp_app_win = 31;
2664 	net->ipv4.sysctl_tcp_adv_win_scale = 1;
2665 	net->ipv4.sysctl_tcp_frto = 2;
2666 	net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
2667 	/* This limits the percentage of the congestion window which we
2668 	 * will allow a single TSO frame to consume.  Building TSO frames
2669 	 * which are too large can cause TCP streams to be bursty.
2670 	 */
2671 	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
2672 	/* Default TSQ limit of 16 TSO segments */
2673 	net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
2674 	/* rfc5961 challenge ack rate limiting */
2675 	net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
2676 	net->ipv4.sysctl_tcp_min_tso_segs = 2;
2677 	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
2678 	net->ipv4.sysctl_tcp_autocorking = 1;
2679 	net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
2680 	net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
2681 	net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
2682 	if (net != &init_net) {
2683 		memcpy(net->ipv4.sysctl_tcp_rmem,
2684 		       init_net.ipv4.sysctl_tcp_rmem,
2685 		       sizeof(init_net.ipv4.sysctl_tcp_rmem));
2686 		memcpy(net->ipv4.sysctl_tcp_wmem,
2687 		       init_net.ipv4.sysctl_tcp_wmem,
2688 		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
2689 	}
2690 	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
2691 	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
2692 	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
2693 	spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
2694 	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
2695 	atomic_set(&net->ipv4.tfo_active_disable_times, 0);
2696 
2697 	/* Reno is always built in */
2698 	if (!net_eq(net, &init_net) &&
2699 	    try_module_get(init_net.ipv4.tcp_congestion_control->owner))
2700 		net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
2701 	else
2702 		net->ipv4.tcp_congestion_control = &tcp_reno;
2703 
2704 	return 0;
2705 fail:
2706 	tcp_sk_exit(net);
2707 
2708 	return res;
2709 }
2710 
2711 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2712 {
2713 	struct net *net;
2714 
2715 	inet_twsk_purge(&tcp_hashinfo, AF_INET);
2716 
2717 	list_for_each_entry(net, net_exit_list, exit_list)
2718 		tcp_fastopen_ctx_destroy(net);
2719 }
2720 
2721 static struct pernet_operations __net_initdata tcp_sk_ops = {
2722        .init	   = tcp_sk_init,
2723        .exit	   = tcp_sk_exit,
2724        .exit_batch = tcp_sk_exit_batch,
2725 };
2726 
2727 void __init tcp_v4_init(void)
2728 {
2729 	if (register_pernet_subsys(&tcp_sk_ops))
2730 		panic("Failed to create the TCP control socket.\n");
2731 }
2732