xref: /openbmc/linux/net/ipv4/tcp_ipv4.c (revision f79e4d5f92a129a1159c973735007d4ddc8541f3)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		Implementation of the Transmission Control Protocol(TCP).
7  *
8  *		IPv4 specific functions
9  *
10  *
11  *		code split from:
12  *		linux/ipv4/tcp.c
13  *		linux/ipv4/tcp_input.c
14  *		linux/ipv4/tcp_output.c
15  *
16  *		See tcp.c for author information
17  *
18  *	This program is free software; you can redistribute it and/or
19  *      modify it under the terms of the GNU General Public License
20  *      as published by the Free Software Foundation; either version
21  *      2 of the License, or (at your option) any later version.
22  */
23 
24 /*
25  * Changes:
26  *		David S. Miller	:	New socket lookup architecture.
27  *					This code is dedicated to John Dyson.
28  *		David S. Miller :	Change semantics of established hash,
29  *					half is devoted to TIME_WAIT sockets
30  *					and the rest go in the other half.
31  *		Andi Kleen :		Add support for syncookies and fixed
32  *					some bugs: ip options weren't passed to
33  *					the TCP layer, missed a check for an
34  *					ACK bit.
35  *		Andi Kleen :		Implemented fast path mtu discovery.
36  *	     				Fixed many serious bugs in the
37  *					request_sock handling and moved
38  *					most of it into the af independent code.
39  *					Added tail drop and some other bugfixes.
40  *					Added new listen semantics.
41  *		Mike McLagan	:	Routing by source
42  *	Juan Jose Ciarlante:		ip_dynaddr bits
43  *		Andi Kleen:		various fixes.
44  *	Vitaly E. Lavrov	:	Transparent proxy revived after year
45  *					coma.
46  *	Andi Kleen		:	Fix new listen.
47  *	Andi Kleen		:	Fix accept error reporting.
48  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
49  *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
50  *					a single port at the same time.
51  */
52 
53 #define pr_fmt(fmt) "TCP: " fmt
54 
55 #include <linux/bottom_half.h>
56 #include <linux/types.h>
57 #include <linux/fcntl.h>
58 #include <linux/module.h>
59 #include <linux/random.h>
60 #include <linux/cache.h>
61 #include <linux/jhash.h>
62 #include <linux/init.h>
63 #include <linux/times.h>
64 #include <linux/slab.h>
65 
66 #include <net/net_namespace.h>
67 #include <net/icmp.h>
68 #include <net/inet_hashtables.h>
69 #include <net/tcp.h>
70 #include <net/transp_v6.h>
71 #include <net/ipv6.h>
72 #include <net/inet_common.h>
73 #include <net/timewait_sock.h>
74 #include <net/xfrm.h>
75 #include <net/secure_seq.h>
76 #include <net/busy_poll.h>
77 
78 #include <linux/inet.h>
79 #include <linux/ipv6.h>
80 #include <linux/stddef.h>
81 #include <linux/proc_fs.h>
82 #include <linux/seq_file.h>
83 #include <linux/inetdevice.h>
84 
85 #include <crypto/hash.h>
86 #include <linux/scatterlist.h>
87 
88 #include <trace/events/tcp.h>
89 
#ifdef CONFIG_TCP_MD5SIG
/* Forward declaration: tcp_v4_send_reset() and tcp_v4_send_ack() below call
 * this helper to write an MD5 signature over a bare TCP header into the
 * option space of the reply they build.
 */
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
#endif

/* Hash info handed to __inet_lookup_established() and
 * __inet_lookup_listener() in this file when looking up TCP sockets.
 * Exported for use by other modules.
 */
struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);
97 
98 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
99 {
100 	return secure_tcp_seq(ip_hdr(skb)->daddr,
101 			      ip_hdr(skb)->saddr,
102 			      tcp_hdr(skb)->dest,
103 			      tcp_hdr(skb)->source);
104 }
105 
106 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
107 {
108 	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
109 }
110 
111 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
112 {
113 	const struct inet_timewait_sock *tw = inet_twsk(sktw);
114 	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
115 	struct tcp_sock *tp = tcp_sk(sk);
116 	int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;
117 
118 	if (reuse == 2) {
119 		/* Still does not detect *everything* that goes through
120 		 * lo, since we require a loopback src or dst address
121 		 * or direct binding to 'lo' interface.
122 		 */
123 		bool loopback = false;
124 		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
125 			loopback = true;
126 #if IS_ENABLED(CONFIG_IPV6)
127 		if (tw->tw_family == AF_INET6) {
128 			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
129 			    (ipv6_addr_v4mapped(&tw->tw_v6_daddr) &&
130 			     (tw->tw_v6_daddr.s6_addr[12] == 127)) ||
131 			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
132 			    (ipv6_addr_v4mapped(&tw->tw_v6_rcv_saddr) &&
133 			     (tw->tw_v6_rcv_saddr.s6_addr[12] == 127)))
134 				loopback = true;
135 		} else
136 #endif
137 		{
138 			if (ipv4_is_loopback(tw->tw_daddr) ||
139 			    ipv4_is_loopback(tw->tw_rcv_saddr))
140 				loopback = true;
141 		}
142 		if (!loopback)
143 			reuse = 0;
144 	}
145 
146 	/* With PAWS, it is safe from the viewpoint
147 	   of data integrity. Even without PAWS it is safe provided sequence
148 	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.
149 
150 	   Actually, the idea is close to VJ's one, only timestamp cache is
151 	   held not per host, but per port pair and TW bucket is used as state
152 	   holder.
153 
154 	   If TW bucket has been already destroyed we fall back to VJ's scheme
155 	   and use initial timestamp retrieved from peer table.
156 	 */
157 	if (tcptw->tw_ts_recent_stamp &&
158 	    (!twp || (reuse && get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
159 		/* In case of repair and re-using TIME-WAIT sockets we still
160 		 * want to be sure that it is safe as above but honor the
161 		 * sequence numbers and time stamps set as part of the repair
162 		 * process.
163 		 *
164 		 * Without this check re-using a TIME-WAIT socket with TCP
165 		 * repair would accumulate a -1 on the repair assigned
166 		 * sequence number. The first time it is reused the sequence
167 		 * is -1, the second time -2, etc. This fixes that issue
168 		 * without appearing to create any others.
169 		 */
170 		if (likely(!tp->repair)) {
171 			tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
172 			if (tp->write_seq == 0)
173 				tp->write_seq = 1;
174 			tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
175 			tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
176 		}
177 		sock_hold(sktw);
178 		return 1;
179 	}
180 
181 	return 0;
182 }
183 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
184 
185 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
186 			      int addr_len)
187 {
188 	/* This check is replicated from tcp_v4_connect() and intended to
189 	 * prevent BPF program called below from accessing bytes that are out
190 	 * of the bound specified by user in addr_len.
191 	 */
192 	if (addr_len < sizeof(struct sockaddr_in))
193 		return -EINVAL;
194 
195 	sock_owned_by_me(sk);
196 
197 	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
198 }
199 
200 /* This will initiate an outgoing connection. */
201 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
202 {
203 	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
204 	struct inet_sock *inet = inet_sk(sk);
205 	struct tcp_sock *tp = tcp_sk(sk);
206 	__be16 orig_sport, orig_dport;
207 	__be32 daddr, nexthop;
208 	struct flowi4 *fl4;
209 	struct rtable *rt;
210 	int err;
211 	struct ip_options_rcu *inet_opt;
212 	struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
213 
214 	if (addr_len < sizeof(struct sockaddr_in))
215 		return -EINVAL;
216 
217 	if (usin->sin_family != AF_INET)
218 		return -EAFNOSUPPORT;
219 
220 	nexthop = daddr = usin->sin_addr.s_addr;
221 	inet_opt = rcu_dereference_protected(inet->inet_opt,
222 					     lockdep_sock_is_held(sk));
223 	if (inet_opt && inet_opt->opt.srr) {
224 		if (!daddr)
225 			return -EINVAL;
226 		nexthop = inet_opt->opt.faddr;
227 	}
228 
229 	orig_sport = inet->inet_sport;
230 	orig_dport = usin->sin_port;
231 	fl4 = &inet->cork.fl.u.ip4;
232 	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
233 			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
234 			      IPPROTO_TCP,
235 			      orig_sport, orig_dport, sk);
236 	if (IS_ERR(rt)) {
237 		err = PTR_ERR(rt);
238 		if (err == -ENETUNREACH)
239 			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
240 		return err;
241 	}
242 
243 	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
244 		ip_rt_put(rt);
245 		return -ENETUNREACH;
246 	}
247 
248 	if (!inet_opt || !inet_opt->opt.srr)
249 		daddr = fl4->daddr;
250 
251 	if (!inet->inet_saddr)
252 		inet->inet_saddr = fl4->saddr;
253 	sk_rcv_saddr_set(sk, inet->inet_saddr);
254 
255 	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
256 		/* Reset inherited state */
257 		tp->rx_opt.ts_recent	   = 0;
258 		tp->rx_opt.ts_recent_stamp = 0;
259 		if (likely(!tp->repair))
260 			tp->write_seq	   = 0;
261 	}
262 
263 	inet->inet_dport = usin->sin_port;
264 	sk_daddr_set(sk, daddr);
265 
266 	inet_csk(sk)->icsk_ext_hdr_len = 0;
267 	if (inet_opt)
268 		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
269 
270 	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
271 
272 	/* Socket identity is still unknown (sport may be zero).
273 	 * However we set state to SYN-SENT and not releasing socket
274 	 * lock select source port, enter ourselves into the hash tables and
275 	 * complete initialization after this.
276 	 */
277 	tcp_set_state(sk, TCP_SYN_SENT);
278 	err = inet_hash_connect(tcp_death_row, sk);
279 	if (err)
280 		goto failure;
281 
282 	sk_set_txhash(sk);
283 
284 	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
285 			       inet->inet_sport, inet->inet_dport, sk);
286 	if (IS_ERR(rt)) {
287 		err = PTR_ERR(rt);
288 		rt = NULL;
289 		goto failure;
290 	}
291 	/* OK, now commit destination to socket.  */
292 	sk->sk_gso_type = SKB_GSO_TCPV4;
293 	sk_setup_caps(sk, &rt->dst);
294 	rt = NULL;
295 
296 	if (likely(!tp->repair)) {
297 		if (!tp->write_seq)
298 			tp->write_seq = secure_tcp_seq(inet->inet_saddr,
299 						       inet->inet_daddr,
300 						       inet->inet_sport,
301 						       usin->sin_port);
302 		tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
303 						 inet->inet_saddr,
304 						 inet->inet_daddr);
305 	}
306 
307 	inet->inet_id = tp->write_seq ^ jiffies;
308 
309 	if (tcp_fastopen_defer_connect(sk, &err))
310 		return err;
311 	if (err)
312 		goto failure;
313 
314 	err = tcp_connect(sk);
315 
316 	if (err)
317 		goto failure;
318 
319 	return 0;
320 
321 failure:
322 	/*
323 	 * This unhashes the socket and releases the local port,
324 	 * if necessary.
325 	 */
326 	tcp_set_state(sk, TCP_CLOSE);
327 	ip_rt_put(rt);
328 	sk->sk_route_caps = 0;
329 	inet->inet_dport = 0;
330 	return err;
331 }
332 EXPORT_SYMBOL(tcp_v4_connect);
333 
334 /*
335  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
336  * It can be called through tcp_release_cb() if socket was owned by user
337  * at the time tcp_v4_err() was called to handle ICMP message.
338  */
339 void tcp_v4_mtu_reduced(struct sock *sk)
340 {
341 	struct inet_sock *inet = inet_sk(sk);
342 	struct dst_entry *dst;
343 	u32 mtu;
344 
345 	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
346 		return;
347 	mtu = tcp_sk(sk)->mtu_info;
348 	dst = inet_csk_update_pmtu(sk, mtu);
349 	if (!dst)
350 		return;
351 
352 	/* Something is about to be wrong... Remember soft error
353 	 * for the case, if this connection will not able to recover.
354 	 */
355 	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
356 		sk->sk_err_soft = EMSGSIZE;
357 
358 	mtu = dst_mtu(dst);
359 
360 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
361 	    ip_sk_accept_pmtu(sk) &&
362 	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
363 		tcp_sync_mss(sk, mtu);
364 
365 		/* Resend the TCP packet because it's
366 		 * clear that the old packet has been
367 		 * dropped. This is the new "fast" path mtu
368 		 * discovery.
369 		 */
370 		tcp_simple_retransmit(sk);
371 	} /* else let the usual retransmit timer handle it */
372 }
373 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
374 
375 static void do_redirect(struct sk_buff *skb, struct sock *sk)
376 {
377 	struct dst_entry *dst = __sk_dst_check(sk, 0);
378 
379 	if (dst)
380 		dst->ops->redirect(dst, sk, skb);
381 }
382 
383 
384 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
385 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
386 {
387 	struct request_sock *req = inet_reqsk(sk);
388 	struct net *net = sock_net(sk);
389 
390 	/* ICMPs are not backlogged, hence we cannot get
391 	 * an established socket here.
392 	 */
393 	if (seq != tcp_rsk(req)->snt_isn) {
394 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
395 	} else if (abort) {
396 		/*
397 		 * Still in SYN_RECV, just remove it silently.
398 		 * There is no good way to pass the error to the newly
399 		 * created socket, and POSIX does not want network
400 		 * errors returned from accept().
401 		 */
402 		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
403 		tcp_listendrop(req->rsk_listener);
404 	}
405 	reqsk_put(req);
406 }
407 EXPORT_SYMBOL(tcp_req_err);
408 
409 /*
410  * This routine is called by the ICMP module when it gets some
411  * sort of error condition.  If err < 0 then the socket should
412  * be closed and the error returned to the user.  If err > 0
413  * it's just the icmp type << 8 | icmp code.  After adjustment
414  * header points to the first 8 bytes of the tcp header.  We need
415  * to find the appropriate port.
416  *
417  * The locking strategy used here is very "optimistic". When
418  * someone else accesses the socket the ICMP is just dropped
419  * and for some paths there is no check at all.
420  * A more general error queue to queue errors for later handling
421  * is probably better.
422  *
423  */
424 
425 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
426 {
427 	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
428 	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
429 	struct inet_connection_sock *icsk;
430 	struct tcp_sock *tp;
431 	struct inet_sock *inet;
432 	const int type = icmp_hdr(icmp_skb)->type;
433 	const int code = icmp_hdr(icmp_skb)->code;
434 	struct sock *sk;
435 	struct sk_buff *skb;
436 	struct request_sock *fastopen;
437 	u32 seq, snd_una;
438 	s32 remaining;
439 	u32 delta_us;
440 	int err;
441 	struct net *net = dev_net(icmp_skb->dev);
442 
443 	sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
444 				       th->dest, iph->saddr, ntohs(th->source),
445 				       inet_iif(icmp_skb), 0);
446 	if (!sk) {
447 		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
448 		return;
449 	}
450 	if (sk->sk_state == TCP_TIME_WAIT) {
451 		inet_twsk_put(inet_twsk(sk));
452 		return;
453 	}
454 	seq = ntohl(th->seq);
455 	if (sk->sk_state == TCP_NEW_SYN_RECV)
456 		return tcp_req_err(sk, seq,
457 				  type == ICMP_PARAMETERPROB ||
458 				  type == ICMP_TIME_EXCEEDED ||
459 				  (type == ICMP_DEST_UNREACH &&
460 				   (code == ICMP_NET_UNREACH ||
461 				    code == ICMP_HOST_UNREACH)));
462 
463 	bh_lock_sock(sk);
464 	/* If too many ICMPs get dropped on busy
465 	 * servers this needs to be solved differently.
466 	 * We do take care of PMTU discovery (RFC1191) special case :
467 	 * we can receive locally generated ICMP messages while socket is held.
468 	 */
469 	if (sock_owned_by_user(sk)) {
470 		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
471 			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
472 	}
473 	if (sk->sk_state == TCP_CLOSE)
474 		goto out;
475 
476 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
477 		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
478 		goto out;
479 	}
480 
481 	icsk = inet_csk(sk);
482 	tp = tcp_sk(sk);
483 	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
484 	fastopen = tp->fastopen_rsk;
485 	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
486 	if (sk->sk_state != TCP_LISTEN &&
487 	    !between(seq, snd_una, tp->snd_nxt)) {
488 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
489 		goto out;
490 	}
491 
492 	switch (type) {
493 	case ICMP_REDIRECT:
494 		if (!sock_owned_by_user(sk))
495 			do_redirect(icmp_skb, sk);
496 		goto out;
497 	case ICMP_SOURCE_QUENCH:
498 		/* Just silently ignore these. */
499 		goto out;
500 	case ICMP_PARAMETERPROB:
501 		err = EPROTO;
502 		break;
503 	case ICMP_DEST_UNREACH:
504 		if (code > NR_ICMP_UNREACH)
505 			goto out;
506 
507 		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
508 			/* We are not interested in TCP_LISTEN and open_requests
509 			 * (SYN-ACKs send out by Linux are always <576bytes so
510 			 * they should go through unfragmented).
511 			 */
512 			if (sk->sk_state == TCP_LISTEN)
513 				goto out;
514 
515 			tp->mtu_info = info;
516 			if (!sock_owned_by_user(sk)) {
517 				tcp_v4_mtu_reduced(sk);
518 			} else {
519 				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
520 					sock_hold(sk);
521 			}
522 			goto out;
523 		}
524 
525 		err = icmp_err_convert[code].errno;
526 		/* check if icmp_skb allows revert of backoff
527 		 * (see draft-zimmermann-tcp-lcd) */
528 		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
529 			break;
530 		if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
531 		    !icsk->icsk_backoff || fastopen)
532 			break;
533 
534 		if (sock_owned_by_user(sk))
535 			break;
536 
537 		icsk->icsk_backoff--;
538 		icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
539 					       TCP_TIMEOUT_INIT;
540 		icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
541 
542 		skb = tcp_rtx_queue_head(sk);
543 		BUG_ON(!skb);
544 
545 		tcp_mstamp_refresh(tp);
546 		delta_us = (u32)(tp->tcp_mstamp - skb->skb_mstamp);
547 		remaining = icsk->icsk_rto -
548 			    usecs_to_jiffies(delta_us);
549 
550 		if (remaining > 0) {
551 			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
552 						  remaining, TCP_RTO_MAX);
553 		} else {
554 			/* RTO revert clocked out retransmission.
555 			 * Will retransmit now */
556 			tcp_retransmit_timer(sk);
557 		}
558 
559 		break;
560 	case ICMP_TIME_EXCEEDED:
561 		err = EHOSTUNREACH;
562 		break;
563 	default:
564 		goto out;
565 	}
566 
567 	switch (sk->sk_state) {
568 	case TCP_SYN_SENT:
569 	case TCP_SYN_RECV:
570 		/* Only in fast or simultaneous open. If a fast open socket is
571 		 * is already accepted it is treated as a connected one below.
572 		 */
573 		if (fastopen && !fastopen->sk)
574 			break;
575 
576 		if (!sock_owned_by_user(sk)) {
577 			sk->sk_err = err;
578 
579 			sk->sk_error_report(sk);
580 
581 			tcp_done(sk);
582 		} else {
583 			sk->sk_err_soft = err;
584 		}
585 		goto out;
586 	}
587 
588 	/* If we've already connected we will keep trying
589 	 * until we time out, or the user gives up.
590 	 *
591 	 * rfc1122 4.2.3.9 allows to consider as hard errors
592 	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
593 	 * but it is obsoleted by pmtu discovery).
594 	 *
595 	 * Note, that in modern internet, where routing is unreliable
596 	 * and in each dark corner broken firewalls sit, sending random
597 	 * errors ordered by their masters even this two messages finally lose
598 	 * their original sense (even Linux sends invalid PORT_UNREACHs)
599 	 *
600 	 * Now we are in compliance with RFCs.
601 	 *							--ANK (980905)
602 	 */
603 
604 	inet = inet_sk(sk);
605 	if (!sock_owned_by_user(sk) && inet->recverr) {
606 		sk->sk_err = err;
607 		sk->sk_error_report(sk);
608 	} else	{ /* Only an error on timeout */
609 		sk->sk_err_soft = err;
610 	}
611 
612 out:
613 	bh_unlock_sock(sk);
614 	sock_put(sk);
615 }
616 
617 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
618 {
619 	struct tcphdr *th = tcp_hdr(skb);
620 
621 	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
622 	skb->csum_start = skb_transport_header(skb) - skb->head;
623 	skb->csum_offset = offsetof(struct tcphdr, check);
624 }
625 
626 /* This routine computes an IPv4 TCP checksum. */
627 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
628 {
629 	const struct inet_sock *inet = inet_sk(sk);
630 
631 	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
632 }
633 EXPORT_SYMBOL(tcp_v4_send_check);
634 
635 /*
636  *	This routine will send an RST to the other tcp.
637  *
638  *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
639  *		      for reset.
640  *	Answer: if a packet caused RST, it is not for a socket
641  *		existing in our system, if it is matched to a socket,
642  *		it is just duplicate segment or bug in other side's TCP.
643  *		So that we build reply only basing on parameters
644  *		arrived with segment.
645  *	Exception: precedence violation. We do not implement it in any case.
646  */
647 
648 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
649 {
650 	const struct tcphdr *th = tcp_hdr(skb);
651 	struct {
652 		struct tcphdr th;
653 #ifdef CONFIG_TCP_MD5SIG
654 		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
655 #endif
656 	} rep;
657 	struct ip_reply_arg arg;
658 #ifdef CONFIG_TCP_MD5SIG
659 	struct tcp_md5sig_key *key = NULL;
660 	const __u8 *hash_location = NULL;
661 	unsigned char newhash[16];
662 	int genhash;
663 	struct sock *sk1 = NULL;
664 #endif
665 	struct net *net;
666 	struct sock *ctl_sk;
667 
668 	/* Never send a reset in response to a reset. */
669 	if (th->rst)
670 		return;
671 
672 	/* If sk not NULL, it means we did a successful lookup and incoming
673 	 * route had to be correct. prequeue might have dropped our dst.
674 	 */
675 	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
676 		return;
677 
678 	/* Swap the send and the receive. */
679 	memset(&rep, 0, sizeof(rep));
680 	rep.th.dest   = th->source;
681 	rep.th.source = th->dest;
682 	rep.th.doff   = sizeof(struct tcphdr) / 4;
683 	rep.th.rst    = 1;
684 
685 	if (th->ack) {
686 		rep.th.seq = th->ack_seq;
687 	} else {
688 		rep.th.ack = 1;
689 		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
690 				       skb->len - (th->doff << 2));
691 	}
692 
693 	memset(&arg, 0, sizeof(arg));
694 	arg.iov[0].iov_base = (unsigned char *)&rep;
695 	arg.iov[0].iov_len  = sizeof(rep.th);
696 
697 	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
698 #ifdef CONFIG_TCP_MD5SIG
699 	rcu_read_lock();
700 	hash_location = tcp_parse_md5sig_option(th);
701 	if (sk && sk_fullsock(sk)) {
702 		key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
703 					&ip_hdr(skb)->saddr, AF_INET);
704 	} else if (hash_location) {
705 		/*
706 		 * active side is lost. Try to find listening socket through
707 		 * source port, and then find md5 key through listening socket.
708 		 * we are not loose security here:
709 		 * Incoming packet is checked with md5 hash with finding key,
710 		 * no RST generated if md5 hash doesn't match.
711 		 */
712 		sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
713 					     ip_hdr(skb)->saddr,
714 					     th->source, ip_hdr(skb)->daddr,
715 					     ntohs(th->source), inet_iif(skb),
716 					     tcp_v4_sdif(skb));
717 		/* don't send rst if it can't find key */
718 		if (!sk1)
719 			goto out;
720 
721 		key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
722 					&ip_hdr(skb)->saddr, AF_INET);
723 		if (!key)
724 			goto out;
725 
726 
727 		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
728 		if (genhash || memcmp(hash_location, newhash, 16) != 0)
729 			goto out;
730 
731 	}
732 
733 	if (key) {
734 		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
735 				   (TCPOPT_NOP << 16) |
736 				   (TCPOPT_MD5SIG << 8) |
737 				   TCPOLEN_MD5SIG);
738 		/* Update length and the length the header thinks exists */
739 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
740 		rep.th.doff = arg.iov[0].iov_len / 4;
741 
742 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
743 				     key, ip_hdr(skb)->saddr,
744 				     ip_hdr(skb)->daddr, &rep.th);
745 	}
746 #endif
747 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
748 				      ip_hdr(skb)->saddr, /* XXX */
749 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
750 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
751 	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
752 
753 	/* When socket is gone, all binding information is lost.
754 	 * routing might fail in this case. No choice here, if we choose to force
755 	 * input interface, we will misroute in case of asymmetric route.
756 	 */
757 	if (sk) {
758 		arg.bound_dev_if = sk->sk_bound_dev_if;
759 		if (sk_fullsock(sk))
760 			trace_tcp_send_reset(sk, skb);
761 	}
762 
763 	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
764 		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));
765 
766 	arg.tos = ip_hdr(skb)->tos;
767 	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
768 	local_bh_disable();
769 	ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk);
770 	if (sk)
771 		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
772 				   inet_twsk(sk)->tw_mark : sk->sk_mark;
773 	ip_send_unicast_reply(ctl_sk,
774 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
775 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
776 			      &arg, arg.iov[0].iov_len);
777 
778 	ctl_sk->sk_mark = 0;
779 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
780 	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
781 	local_bh_enable();
782 
783 #ifdef CONFIG_TCP_MD5SIG
784 out:
785 	rcu_read_unlock();
786 #endif
787 }
788 
789 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
790    outside socket context is ugly, certainly. What can I do?
791  */
792 
793 static void tcp_v4_send_ack(const struct sock *sk,
794 			    struct sk_buff *skb, u32 seq, u32 ack,
795 			    u32 win, u32 tsval, u32 tsecr, int oif,
796 			    struct tcp_md5sig_key *key,
797 			    int reply_flags, u8 tos)
798 {
799 	const struct tcphdr *th = tcp_hdr(skb);
800 	struct {
801 		struct tcphdr th;
802 		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
803 #ifdef CONFIG_TCP_MD5SIG
804 			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
805 #endif
806 			];
807 	} rep;
808 	struct net *net = sock_net(sk);
809 	struct ip_reply_arg arg;
810 	struct sock *ctl_sk;
811 
812 	memset(&rep.th, 0, sizeof(struct tcphdr));
813 	memset(&arg, 0, sizeof(arg));
814 
815 	arg.iov[0].iov_base = (unsigned char *)&rep;
816 	arg.iov[0].iov_len  = sizeof(rep.th);
817 	if (tsecr) {
818 		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
819 				   (TCPOPT_TIMESTAMP << 8) |
820 				   TCPOLEN_TIMESTAMP);
821 		rep.opt[1] = htonl(tsval);
822 		rep.opt[2] = htonl(tsecr);
823 		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
824 	}
825 
826 	/* Swap the send and the receive. */
827 	rep.th.dest    = th->source;
828 	rep.th.source  = th->dest;
829 	rep.th.doff    = arg.iov[0].iov_len / 4;
830 	rep.th.seq     = htonl(seq);
831 	rep.th.ack_seq = htonl(ack);
832 	rep.th.ack     = 1;
833 	rep.th.window  = htons(win);
834 
835 #ifdef CONFIG_TCP_MD5SIG
836 	if (key) {
837 		int offset = (tsecr) ? 3 : 0;
838 
839 		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
840 					  (TCPOPT_NOP << 16) |
841 					  (TCPOPT_MD5SIG << 8) |
842 					  TCPOLEN_MD5SIG);
843 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
844 		rep.th.doff = arg.iov[0].iov_len/4;
845 
846 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
847 				    key, ip_hdr(skb)->saddr,
848 				    ip_hdr(skb)->daddr, &rep.th);
849 	}
850 #endif
851 	arg.flags = reply_flags;
852 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
853 				      ip_hdr(skb)->saddr, /* XXX */
854 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
855 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
856 	if (oif)
857 		arg.bound_dev_if = oif;
858 	arg.tos = tos;
859 	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
860 	local_bh_disable();
861 	ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk);
862 	if (sk)
863 		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
864 				   inet_twsk(sk)->tw_mark : sk->sk_mark;
865 	ip_send_unicast_reply(ctl_sk,
866 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
867 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
868 			      &arg, arg.iov[0].iov_len);
869 
870 	ctl_sk->sk_mark = 0;
871 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
872 	local_bh_enable();
873 }
874 
875 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
876 {
877 	struct inet_timewait_sock *tw = inet_twsk(sk);
878 	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
879 
880 	tcp_v4_send_ack(sk, skb,
881 			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
882 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
883 			tcp_time_stamp_raw() + tcptw->tw_ts_offset,
884 			tcptw->tw_ts_recent,
885 			tw->tw_bound_dev_if,
886 			tcp_twsk_md5_key(tcptw),
887 			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
888 			tw->tw_tos
889 			);
890 
891 	inet_twsk_put(tw);
892 }
893 
894 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
895 				  struct request_sock *req)
896 {
897 	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
898 	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
899 	 */
900 	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
901 					     tcp_sk(sk)->snd_nxt;
902 
903 	/* RFC 7323 2.3
904 	 * The window field (SEG.WND) of every outgoing segment, with the
905 	 * exception of <SYN> segments, MUST be right-shifted by
906 	 * Rcv.Wind.Shift bits:
907 	 */
908 	tcp_v4_send_ack(sk, skb, seq,
909 			tcp_rsk(req)->rcv_nxt,
910 			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
911 			tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
912 			req->ts_recent,
913 			0,
914 			tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->saddr,
915 					  AF_INET),
916 			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
917 			ip_hdr(skb)->tos);
918 }
919 
920 /*
921  *	Send a SYN-ACK after having received a SYN.
922  *	This still operates on a request_sock only, not on a big
923  *	socket.
924  */
925 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
926 			      struct flowi *fl,
927 			      struct request_sock *req,
928 			      struct tcp_fastopen_cookie *foc,
929 			      enum tcp_synack_type synack_type)
930 {
931 	const struct inet_request_sock *ireq = inet_rsk(req);
932 	struct flowi4 fl4;
933 	int err = -1;
934 	struct sk_buff *skb;
935 
936 	/* First, grab a route. */
937 	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
938 		return -1;
939 
940 	skb = tcp_make_synack(sk, dst, req, foc, synack_type);
941 
942 	if (skb) {
943 		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
944 
945 		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
946 					    ireq->ir_rmt_addr,
947 					    ireq_opt_deref(ireq));
948 		err = net_xmit_eval(err);
949 	}
950 
951 	return err;
952 }
953 
954 /*
955  *	IPv4 request_sock destructor.
956  */
957 static void tcp_v4_reqsk_destructor(struct request_sock *req)
958 {
959 	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
960 }
961 
962 #ifdef CONFIG_TCP_MD5SIG
963 /*
964  * RFC2385 MD5 checksumming requires a mapping of
965  * IP address->MD5 Key.
966  * We need to maintain these in the sk structure.
967  */
968 
969 /* Find the Key structure for an address.  */
970 struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
971 					 const union tcp_md5_addr *addr,
972 					 int family)
973 {
974 	const struct tcp_sock *tp = tcp_sk(sk);
975 	struct tcp_md5sig_key *key;
976 	const struct tcp_md5sig_info *md5sig;
977 	__be32 mask;
978 	struct tcp_md5sig_key *best_match = NULL;
979 	bool match;
980 
981 	/* caller either holds rcu_read_lock() or socket lock */
982 	md5sig = rcu_dereference_check(tp->md5sig_info,
983 				       lockdep_sock_is_held(sk));
984 	if (!md5sig)
985 		return NULL;
986 
987 	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
988 		if (key->family != family)
989 			continue;
990 
991 		if (family == AF_INET) {
992 			mask = inet_make_mask(key->prefixlen);
993 			match = (key->addr.a4.s_addr & mask) ==
994 				(addr->a4.s_addr & mask);
995 #if IS_ENABLED(CONFIG_IPV6)
996 		} else if (family == AF_INET6) {
997 			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
998 						  key->prefixlen);
999 #endif
1000 		} else {
1001 			match = false;
1002 		}
1003 
1004 		if (match && (!best_match ||
1005 			      key->prefixlen > best_match->prefixlen))
1006 			best_match = key;
1007 	}
1008 	return best_match;
1009 }
1010 EXPORT_SYMBOL(tcp_md5_do_lookup);
1011 
1012 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1013 						      const union tcp_md5_addr *addr,
1014 						      int family, u8 prefixlen)
1015 {
1016 	const struct tcp_sock *tp = tcp_sk(sk);
1017 	struct tcp_md5sig_key *key;
1018 	unsigned int size = sizeof(struct in_addr);
1019 	const struct tcp_md5sig_info *md5sig;
1020 
1021 	/* caller either holds rcu_read_lock() or socket lock */
1022 	md5sig = rcu_dereference_check(tp->md5sig_info,
1023 				       lockdep_sock_is_held(sk));
1024 	if (!md5sig)
1025 		return NULL;
1026 #if IS_ENABLED(CONFIG_IPV6)
1027 	if (family == AF_INET6)
1028 		size = sizeof(struct in6_addr);
1029 #endif
1030 	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
1031 		if (key->family != family)
1032 			continue;
1033 		if (!memcmp(&key->addr, addr, size) &&
1034 		    key->prefixlen == prefixlen)
1035 			return key;
1036 	}
1037 	return NULL;
1038 }
1039 
1040 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1041 					 const struct sock *addr_sk)
1042 {
1043 	const union tcp_md5_addr *addr;
1044 
1045 	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1046 	return tcp_md5_do_lookup(sk, addr, AF_INET);
1047 }
1048 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1049 
1050 /* This can be called on a newly created socket, from other files */
1051 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1052 		   int family, u8 prefixlen, const u8 *newkey, u8 newkeylen,
1053 		   gfp_t gfp)
1054 {
1055 	/* Add Key to the list */
1056 	struct tcp_md5sig_key *key;
1057 	struct tcp_sock *tp = tcp_sk(sk);
1058 	struct tcp_md5sig_info *md5sig;
1059 
1060 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
1061 	if (key) {
1062 		/* Pre-existing entry - just update that one. */
1063 		memcpy(key->key, newkey, newkeylen);
1064 		key->keylen = newkeylen;
1065 		return 0;
1066 	}
1067 
1068 	md5sig = rcu_dereference_protected(tp->md5sig_info,
1069 					   lockdep_sock_is_held(sk));
1070 	if (!md5sig) {
1071 		md5sig = kmalloc(sizeof(*md5sig), gfp);
1072 		if (!md5sig)
1073 			return -ENOMEM;
1074 
1075 		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1076 		INIT_HLIST_HEAD(&md5sig->head);
1077 		rcu_assign_pointer(tp->md5sig_info, md5sig);
1078 	}
1079 
1080 	key = sock_kmalloc(sk, sizeof(*key), gfp);
1081 	if (!key)
1082 		return -ENOMEM;
1083 	if (!tcp_alloc_md5sig_pool()) {
1084 		sock_kfree_s(sk, key, sizeof(*key));
1085 		return -ENOMEM;
1086 	}
1087 
1088 	memcpy(key->key, newkey, newkeylen);
1089 	key->keylen = newkeylen;
1090 	key->family = family;
1091 	key->prefixlen = prefixlen;
1092 	memcpy(&key->addr, addr,
1093 	       (family == AF_INET6) ? sizeof(struct in6_addr) :
1094 				      sizeof(struct in_addr));
1095 	hlist_add_head_rcu(&key->node, &md5sig->head);
1096 	return 0;
1097 }
1098 EXPORT_SYMBOL(tcp_md5_do_add);
1099 
1100 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1101 		   u8 prefixlen)
1102 {
1103 	struct tcp_md5sig_key *key;
1104 
1105 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
1106 	if (!key)
1107 		return -ENOENT;
1108 	hlist_del_rcu(&key->node);
1109 	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1110 	kfree_rcu(key, rcu);
1111 	return 0;
1112 }
1113 EXPORT_SYMBOL(tcp_md5_do_del);
1114 
1115 static void tcp_clear_md5_list(struct sock *sk)
1116 {
1117 	struct tcp_sock *tp = tcp_sk(sk);
1118 	struct tcp_md5sig_key *key;
1119 	struct hlist_node *n;
1120 	struct tcp_md5sig_info *md5sig;
1121 
1122 	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1123 
1124 	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1125 		hlist_del_rcu(&key->node);
1126 		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1127 		kfree_rcu(key, rcu);
1128 	}
1129 }
1130 
1131 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1132 				 char __user *optval, int optlen)
1133 {
1134 	struct tcp_md5sig cmd;
1135 	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1136 	u8 prefixlen = 32;
1137 
1138 	if (optlen < sizeof(cmd))
1139 		return -EINVAL;
1140 
1141 	if (copy_from_user(&cmd, optval, sizeof(cmd)))
1142 		return -EFAULT;
1143 
1144 	if (sin->sin_family != AF_INET)
1145 		return -EINVAL;
1146 
1147 	if (optname == TCP_MD5SIG_EXT &&
1148 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1149 		prefixlen = cmd.tcpm_prefixlen;
1150 		if (prefixlen > 32)
1151 			return -EINVAL;
1152 	}
1153 
1154 	if (!cmd.tcpm_keylen)
1155 		return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1156 				      AF_INET, prefixlen);
1157 
1158 	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1159 		return -EINVAL;
1160 
1161 	return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1162 			      AF_INET, prefixlen, cmd.tcpm_key, cmd.tcpm_keylen,
1163 			      GFP_KERNEL);
1164 }
1165 
1166 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1167 				   __be32 daddr, __be32 saddr,
1168 				   const struct tcphdr *th, int nbytes)
1169 {
1170 	struct tcp4_pseudohdr *bp;
1171 	struct scatterlist sg;
1172 	struct tcphdr *_th;
1173 
1174 	bp = hp->scratch;
1175 	bp->saddr = saddr;
1176 	bp->daddr = daddr;
1177 	bp->pad = 0;
1178 	bp->protocol = IPPROTO_TCP;
1179 	bp->len = cpu_to_be16(nbytes);
1180 
1181 	_th = (struct tcphdr *)(bp + 1);
1182 	memcpy(_th, th, sizeof(*th));
1183 	_th->check = 0;
1184 
1185 	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1186 	ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1187 				sizeof(*bp) + sizeof(*th));
1188 	return crypto_ahash_update(hp->md5_req);
1189 }
1190 
1191 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1192 			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
1193 {
1194 	struct tcp_md5sig_pool *hp;
1195 	struct ahash_request *req;
1196 
1197 	hp = tcp_get_md5sig_pool();
1198 	if (!hp)
1199 		goto clear_hash_noput;
1200 	req = hp->md5_req;
1201 
1202 	if (crypto_ahash_init(req))
1203 		goto clear_hash;
1204 	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1205 		goto clear_hash;
1206 	if (tcp_md5_hash_key(hp, key))
1207 		goto clear_hash;
1208 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1209 	if (crypto_ahash_final(req))
1210 		goto clear_hash;
1211 
1212 	tcp_put_md5sig_pool();
1213 	return 0;
1214 
1215 clear_hash:
1216 	tcp_put_md5sig_pool();
1217 clear_hash_noput:
1218 	memset(md5_hash, 0, 16);
1219 	return 1;
1220 }
1221 
1222 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1223 			const struct sock *sk,
1224 			const struct sk_buff *skb)
1225 {
1226 	struct tcp_md5sig_pool *hp;
1227 	struct ahash_request *req;
1228 	const struct tcphdr *th = tcp_hdr(skb);
1229 	__be32 saddr, daddr;
1230 
1231 	if (sk) { /* valid for establish/request sockets */
1232 		saddr = sk->sk_rcv_saddr;
1233 		daddr = sk->sk_daddr;
1234 	} else {
1235 		const struct iphdr *iph = ip_hdr(skb);
1236 		saddr = iph->saddr;
1237 		daddr = iph->daddr;
1238 	}
1239 
1240 	hp = tcp_get_md5sig_pool();
1241 	if (!hp)
1242 		goto clear_hash_noput;
1243 	req = hp->md5_req;
1244 
1245 	if (crypto_ahash_init(req))
1246 		goto clear_hash;
1247 
1248 	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1249 		goto clear_hash;
1250 	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1251 		goto clear_hash;
1252 	if (tcp_md5_hash_key(hp, key))
1253 		goto clear_hash;
1254 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1255 	if (crypto_ahash_final(req))
1256 		goto clear_hash;
1257 
1258 	tcp_put_md5sig_pool();
1259 	return 0;
1260 
1261 clear_hash:
1262 	tcp_put_md5sig_pool();
1263 clear_hash_noput:
1264 	memset(md5_hash, 0, 16);
1265 	return 1;
1266 }
1267 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1268 
1269 #endif
1270 
1271 /* Called with rcu_read_lock() */
1272 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1273 				    const struct sk_buff *skb)
1274 {
1275 #ifdef CONFIG_TCP_MD5SIG
1276 	/*
1277 	 * This gets called for each TCP segment that arrives
1278 	 * so we want to be efficient.
1279 	 * We have 3 drop cases:
1280 	 * o No MD5 hash and one expected.
1281 	 * o MD5 hash and we're not expecting one.
1282 	 * o MD5 hash and its wrong.
1283 	 */
1284 	const __u8 *hash_location = NULL;
1285 	struct tcp_md5sig_key *hash_expected;
1286 	const struct iphdr *iph = ip_hdr(skb);
1287 	const struct tcphdr *th = tcp_hdr(skb);
1288 	int genhash;
1289 	unsigned char newhash[16];
1290 
1291 	hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1292 					  AF_INET);
1293 	hash_location = tcp_parse_md5sig_option(th);
1294 
1295 	/* We've parsed the options - do we have a hash? */
1296 	if (!hash_expected && !hash_location)
1297 		return false;
1298 
1299 	if (hash_expected && !hash_location) {
1300 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1301 		return true;
1302 	}
1303 
1304 	if (!hash_expected && hash_location) {
1305 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1306 		return true;
1307 	}
1308 
1309 	/* Okay, so this is hash_expected and hash_location -
1310 	 * so we need to calculate the checksum.
1311 	 */
1312 	genhash = tcp_v4_md5_hash_skb(newhash,
1313 				      hash_expected,
1314 				      NULL, skb);
1315 
1316 	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1317 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1318 		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1319 				     &iph->saddr, ntohs(th->source),
1320 				     &iph->daddr, ntohs(th->dest),
1321 				     genhash ? " tcp_v4_calc_md5_hash failed"
1322 				     : "");
1323 		return true;
1324 	}
1325 	return false;
1326 #endif
1327 	return false;
1328 }
1329 
1330 static void tcp_v4_init_req(struct request_sock *req,
1331 			    const struct sock *sk_listener,
1332 			    struct sk_buff *skb)
1333 {
1334 	struct inet_request_sock *ireq = inet_rsk(req);
1335 	struct net *net = sock_net(sk_listener);
1336 
1337 	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1338 	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1339 	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1340 }
1341 
1342 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1343 					  struct flowi *fl,
1344 					  const struct request_sock *req)
1345 {
1346 	return inet_csk_route_req(sk, &fl->u.ip4, req);
1347 }
1348 
1349 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1350 	.family		=	PF_INET,
1351 	.obj_size	=	sizeof(struct tcp_request_sock),
1352 	.rtx_syn_ack	=	tcp_rtx_synack,
1353 	.send_ack	=	tcp_v4_reqsk_send_ack,
1354 	.destructor	=	tcp_v4_reqsk_destructor,
1355 	.send_reset	=	tcp_v4_send_reset,
1356 	.syn_ack_timeout =	tcp_syn_ack_timeout,
1357 };
1358 
1359 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1360 	.mss_clamp	=	TCP_MSS_DEFAULT,
1361 #ifdef CONFIG_TCP_MD5SIG
1362 	.req_md5_lookup	=	tcp_v4_md5_lookup,
1363 	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1364 #endif
1365 	.init_req	=	tcp_v4_init_req,
1366 #ifdef CONFIG_SYN_COOKIES
1367 	.cookie_init_seq =	cookie_v4_init_sequence,
1368 #endif
1369 	.route_req	=	tcp_v4_route_req,
1370 	.init_seq	=	tcp_v4_init_seq,
1371 	.init_ts_off	=	tcp_v4_init_ts_off,
1372 	.send_synack	=	tcp_v4_send_synack,
1373 };
1374 
1375 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1376 {
1377 	/* Never answer to SYNs send to broadcast or multicast */
1378 	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1379 		goto drop;
1380 
1381 	return tcp_conn_request(&tcp_request_sock_ops,
1382 				&tcp_request_sock_ipv4_ops, sk, skb);
1383 
1384 drop:
1385 	tcp_listendrop(sk);
1386 	return 0;
1387 }
1388 EXPORT_SYMBOL(tcp_v4_conn_request);
1389 
1390 
/*
 * The three way handshake has completed - the peer has acknowledged
 * our SYN-ACK - now create the new socket.
 *
 * @sk:		listening socket
 * @skb:	packet that completed the handshake
 * @req:	request sock the child is created from
 * @dst:	route to use, or NULL to look one up here
 * @req_unhash:	request sock passed to inet_ehash_nolisten()
 * @own_req:	out: result of inet_ehash_nolisten()
 *
 * Returns the new child socket, or NULL after accounting a listen drop
 * (accept queue full, allocation or routing failure, port inheritance
 * failure).
 */
struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct dst_entry *dst,
				  struct request_sock *req_unhash,
				  bool *own_req)
{
	struct inet_request_sock *ireq;
	struct inet_sock *newinet;
	struct tcp_sock *newtp;
	struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key;
#endif
	struct ip_options_rcu *inet_opt;

	if (sk_acceptq_is_full(sk))
		goto exit_overflow;

	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit_nonewsk;

	newsk->sk_gso_type = SKB_GSO_TCPV4;
	inet_sk_rx_dst_set(newsk, skb);

	/* Copy addressing and IP-level state from the request and packet. */
	newtp		      = tcp_sk(newsk);
	newinet		      = inet_sk(newsk);
	ireq		      = inet_rsk(req);
	sk_daddr_set(newsk, ireq->ir_rmt_addr);
	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
	newsk->sk_bound_dev_if = ireq->ir_iif;
	newinet->inet_saddr   = ireq->ir_loc_addr;
	/* The child shares the request's IP options block for now; which of
	 * the two keeps the pointer is settled at the end of this function.
	 */
	inet_opt	      = rcu_dereference(ireq->ireq_opt);
	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
	newinet->mc_index     = inet_iif(skb);
	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
	newinet->rcv_tos      = ip_hdr(skb)->tos;
	inet_csk(newsk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
	/* NOTE(review): the initial IP ID is derived from jiffies - confirm
	 * this is acceptable from an information-leak point of view.
	 */
	newinet->inet_id = newtp->write_seq ^ jiffies;

	if (!dst) {
		dst = inet_csk_route_child_sock(sk, newsk, req);
		if (!dst)
			goto put_and_exit;
	} else {
		/* syncookie case : see end of cookie_v4_check() */
	}
	sk_setup_caps(newsk, dst);

	tcp_ca_openreq_child(newsk, dst);

	tcp_sync_mss(newsk, dst_mtu(dst));
	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));

	tcp_initialize_rcv_mss(newsk);

#ifdef CONFIG_TCP_MD5SIG
	/* Copy over the MD5 key from the original socket */
	key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
				AF_INET);
	if (key) {
		/*
		 * We're using one, so create a matching key
		 * on the newsk structure. If we fail to get
		 * memory, then we end up not copying the key
		 * across. Shucks.
		 *
		 * The child always gets a host (/32) key, whatever prefix
		 * length matched on the listener.
		 */
		tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
			       AF_INET, 32, key->key, key->keylen, GFP_ATOMIC);
		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
	}
#endif

	if (__inet_inherit_port(sk, newsk) < 0)
		goto put_and_exit;
	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
	if (likely(*own_req)) {
		/* Child keeps the options: detach them from the request. */
		tcp_move_syn(newtp, req);
		ireq->ireq_opt = NULL;
	} else {
		/* Request keeps the options: detach them from the child. */
		newinet->inet_opt = NULL;
	}
	return newsk;

exit_overflow:
	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
exit_nonewsk:
	dst_release(dst);
exit:
	tcp_listendrop(sk);
	return NULL;
put_and_exit:
	/* The options still belong to the request; drop the child's
	 * pointer to them before the child is torn down.
	 */
	newinet->inet_opt = NULL;
	inet_csk_prepare_forced_close(newsk);
	tcp_done(newsk);
	goto exit;
}
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1495 
/* With CONFIG_SYN_COOKIES, a segment without the SYN flag is passed to
 * cookie_v4_check() and its result is returned; in every other case the
 * listener @sk itself is returned.
 */
static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
{
#ifdef CONFIG_SYN_COOKIES
	if (!tcp_hdr(skb)->syn)
		return cookie_v4_check(sk, skb);
#endif
	return sk;
}
1506 
/* The socket must have its spinlock held when we get
 * here, unless it is a TCP_LISTEN socket.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 *
 * Process one segment for @sk.  Always returns 0; on the reset, discard
 * and checksum-error paths the skb is freed here.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	struct sock *rsk;

	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
		struct dst_entry *dst = sk->sk_rx_dst;

		sock_rps_save_rxhash(sk, skb);
		sk_mark_napi_id(sk, skb);
		if (dst) {
			/* Drop the cached input route if the packet came in
			 * on another interface or the route fails its check.
			 */
			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
			    !dst->ops->check(dst, 0)) {
				dst_release(dst);
				sk->sk_rx_dst = NULL;
			}
		}
		tcp_rcv_established(sk, skb);
		return 0;
	}

	if (tcp_checksum_complete(skb))
		goto csum_err;

	if (sk->sk_state == TCP_LISTEN) {
		struct sock *nsk = tcp_v4_cookie_check(sk, skb);

		if (!nsk)
			goto discard;
		if (nsk != sk) {
			/* The cookie check produced a different (child)
			 * socket: let it process the segment.
			 */
			if (tcp_child_process(sk, nsk, skb)) {
				rsk = nsk;
				goto reset;
			}
			return 0;
		}
	} else
		sock_rps_save_rxhash(sk, skb);

	if (tcp_rcv_state_process(sk, skb)) {
		rsk = sk;
		goto reset;
	}
	return 0;

reset:
	/* rsk is the socket on whose behalf the reset is sent. */
	tcp_v4_send_reset(rsk, skb);
discard:
	kfree_skb(skb);
	/* Be careful here. If this function gets more complicated and
	 * gcc suffers from register pressure on the x86, sk (in %ebx)
	 * might be destroyed here. This current version compiles correctly,
	 * but you have been warned.
	 */
	return 0;

csum_err:
	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
	goto discard;
}
EXPORT_SYMBOL(tcp_v4_do_rcv);
1576 
1577 int tcp_v4_early_demux(struct sk_buff *skb)
1578 {
1579 	const struct iphdr *iph;
1580 	const struct tcphdr *th;
1581 	struct sock *sk;
1582 
1583 	if (skb->pkt_type != PACKET_HOST)
1584 		return 0;
1585 
1586 	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1587 		return 0;
1588 
1589 	iph = ip_hdr(skb);
1590 	th = tcp_hdr(skb);
1591 
1592 	if (th->doff < sizeof(struct tcphdr) / 4)
1593 		return 0;
1594 
1595 	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1596 				       iph->saddr, th->source,
1597 				       iph->daddr, ntohs(th->dest),
1598 				       skb->skb_iif, inet_sdif(skb));
1599 	if (sk) {
1600 		skb->sk = sk;
1601 		skb->destructor = sock_edemux;
1602 		if (sk_fullsock(sk)) {
1603 			struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1604 
1605 			if (dst)
1606 				dst = dst_check(dst, 0);
1607 			if (dst &&
1608 			    inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1609 				skb_dst_set_noref(skb, dst);
1610 		}
1611 	}
1612 	return 0;
1613 }
1614 
1615 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1616 {
1617 	u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;
1618 
1619 	/* Only socket owner can try to collapse/prune rx queues
1620 	 * to reduce memory overhead, so add a little headroom here.
1621 	 * Few sockets backlog are possibly concurrently non empty.
1622 	 */
1623 	limit += 64*1024;
1624 
1625 	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1626 	 * we can fix skb->truesize to its real value to avoid future drops.
1627 	 * This is valid because skb is not yet charged to the socket.
1628 	 * It has been noticed pure SACK packets were sometimes dropped
1629 	 * (if cooked by drivers without copybreak feature).
1630 	 */
1631 	skb_condense(skb);
1632 
1633 	if (unlikely(sk_add_backlog(sk, skb, limit))) {
1634 		bh_unlock_sock(sk);
1635 		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1636 		return true;
1637 	}
1638 	return false;
1639 }
1640 EXPORT_SYMBOL(tcp_add_backlog);
1641 
1642 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1643 {
1644 	struct tcphdr *th = (struct tcphdr *)skb->data;
1645 	unsigned int eaten = skb->len;
1646 	int err;
1647 
1648 	err = sk_filter_trim_cap(sk, skb, th->doff * 4);
1649 	if (!err) {
1650 		eaten -= skb->len;
1651 		TCP_SKB_CB(skb)->end_seq -= eaten;
1652 	}
1653 	return err;
1654 }
1655 EXPORT_SYMBOL(tcp_filter);
1656 
1657 static void tcp_v4_restore_cb(struct sk_buff *skb)
1658 {
1659 	memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1660 		sizeof(struct inet_skb_parm));
1661 }
1662 
1663 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1664 			   const struct tcphdr *th)
1665 {
1666 	/* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
1667 	 * barrier() makes sure compiler wont play fool^Waliasing games.
1668 	 */
1669 	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1670 		sizeof(struct inet_skb_parm));
1671 	barrier();
1672 
1673 	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1674 	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1675 				    skb->len - th->doff * 4);
1676 	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1677 	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1678 	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1679 	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1680 	TCP_SKB_CB(skb)->sacked	 = 0;
1681 	TCP_SKB_CB(skb)->has_rxtstamp =
1682 			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1683 }
1684 
/*
 *	From tcp_input.c
 *
 *	Receive entry point for IPv4 TCP segments: validate the header,
 *	find the owning socket and either process the segment directly,
 *	queue it on the socket backlog, or discard it.  Returns 0 on all
 *	discard paths and after a child socket consumed the segment;
 *	otherwise the result of tcp_v4_do_rcv() (0 when only backlogged).
 */

int tcp_v4_rcv(struct sk_buff *skb)
{
	struct net *net = dev_net(skb->dev);
	int sdif = inet_sdif(skb);
	const struct iphdr *iph;
	const struct tcphdr *th;
	bool refcounted;	/* set by the lookup; gates sock_put() below */
	struct sock *sk;
	int ret;

	if (skb->pkt_type != PACKET_HOST)
		goto discard_it;

	/* Count it even if it's bad */
	__TCP_INC_STATS(net, TCP_MIB_INSEGS);

	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
		goto discard_it;

	th = (const struct tcphdr *)skb->data;

	if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
		goto bad_packet;
	if (!pskb_may_pull(skb, th->doff * 4))
		goto discard_it;

	/* An explanation is required here, I think.
	 * Packet length and doff are validated by header prediction,
	 * provided case of th->doff==0 is eliminated.
	 * So, we defer the checks. */

	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
		goto csum_error;

	/* Header pointers are re-read here - presumably because the pulls
	 * above may have changed skb->data; confirm.
	 */
	th = (const struct tcphdr *)skb->data;
	iph = ip_hdr(skb);
lookup:
	sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
			       th->dest, sdif, &refcounted);
	if (!sk)
		goto no_tcp_socket;

process:
	if (sk->sk_state == TCP_TIME_WAIT)
		goto do_time_wait;

	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		/* The lookup returned a request sock; continue on behalf of
		 * its listener.
		 */
		struct request_sock *req = inet_reqsk(sk);
		bool req_stolen = false;
		struct sock *nsk;

		sk = req->rsk_listener;
		if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
			sk_drops_add(sk, skb);
			reqsk_put(req);
			goto discard_it;
		}
		if (tcp_checksum_complete(skb)) {
			reqsk_put(req);
			goto csum_error;
		}
		if (unlikely(sk->sk_state != TCP_LISTEN)) {
			/* Listener is no longer listening: drop the request
			 * and redo the lookup.
			 */
			inet_csk_reqsk_queue_drop_and_put(sk, req);
			goto lookup;
		}
		/* We own a reference on the listener, increase it again
		 * as we might lose it too soon.
		 */
		sock_hold(sk);
		refcounted = true;
		nsk = NULL;
		if (!tcp_filter(sk, skb)) {
			th = (const struct tcphdr *)skb->data;
			iph = ip_hdr(skb);
			tcp_v4_fill_cb(skb, iph, th);
			nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
		}
		if (!nsk) {
			reqsk_put(req);
			if (req_stolen) {
				/* Another cpu got exclusive access to req
				 * and created a full blown socket.
				 * Try to feed this packet to this socket
				 * instead of discarding it.
				 */
				tcp_v4_restore_cb(skb);
				sock_put(sk);
				goto lookup;
			}
			goto discard_and_relse;
		}
		if (nsk == sk) {
			/* Segment is for the listener itself: undo the cb
			 * conversion and continue with the normal path.
			 */
			reqsk_put(req);
			tcp_v4_restore_cb(skb);
		} else if (tcp_child_process(sk, nsk, skb)) {
			tcp_v4_send_reset(nsk, skb);
			goto discard_and_relse;
		} else {
			/* Child consumed the segment. */
			sock_put(sk);
			return 0;
		}
	}
	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
		goto discard_and_relse;
	}

	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
		goto discard_and_relse;

	if (tcp_v4_inbound_md5_hash(sk, skb))
		goto discard_and_relse;

	nf_reset(skb);

	if (tcp_filter(sk, skb))
		goto discard_and_relse;
	th = (const struct tcphdr *)skb->data;
	iph = ip_hdr(skb);
	tcp_v4_fill_cb(skb, iph, th);

	skb->dev = NULL;

	if (sk->sk_state == TCP_LISTEN) {
		/* Listeners are processed without taking the socket lock. */
		ret = tcp_v4_do_rcv(sk, skb);
		goto put_and_return;
	}

	sk_incoming_cpu_update(sk);

	bh_lock_sock_nested(sk);
	tcp_segs_in(tcp_sk(sk), skb);
	ret = 0;
	if (!sock_owned_by_user(sk)) {
		ret = tcp_v4_do_rcv(sk, skb);
	} else if (tcp_add_backlog(sk, skb)) {
		/* Backlog full: tcp_add_backlog() already unlocked sk. */
		goto discard_and_relse;
	}
	bh_unlock_sock(sk);

put_and_return:
	if (refcounted)
		sock_put(sk);

	return ret;

no_tcp_socket:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
		goto discard_it;

	tcp_v4_fill_cb(skb, iph, th);

	/* The csum_error and bad_packet labels sit inside this if-body so
	 * that the error counters fall through into each other.
	 */
	if (tcp_checksum_complete(skb)) {
csum_error:
		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
bad_packet:
		__TCP_INC_STATS(net, TCP_MIB_INERRS);
	} else {
		/* Valid segment but no socket: answer with a reset. */
		tcp_v4_send_reset(NULL, skb);
	}

discard_it:
	/* Discard frame. */
	kfree_skb(skb);
	return 0;

discard_and_relse:
	sk_drops_add(sk, skb);
	if (refcounted)
		sock_put(sk);
	goto discard_it;

do_time_wait:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto discard_it;
	}

	tcp_v4_fill_cb(skb, iph, th);

	if (tcp_checksum_complete(skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto csum_error;
	}
	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
	case TCP_TW_SYN: {
		/* If a listener exists for this SYN, retire the timewait
		 * socket and process the segment against the listener.
		 */
		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
							&tcp_hashinfo, skb,
							__tcp_hdrlen(th),
							iph->saddr, th->source,
							iph->daddr, th->dest,
							inet_iif(skb),
							sdif);
		if (sk2) {
			inet_twsk_deschedule_put(inet_twsk(sk));
			sk = sk2;
			tcp_v4_restore_cb(skb);
			refcounted = false;
			goto process;
		}
	}
		/* to ACK */
		/* fall through */
	case TCP_TW_ACK:
		tcp_v4_timewait_ack(sk, skb);
		break;
	case TCP_TW_RST:
		tcp_v4_send_reset(sk, skb);
		inet_twsk_deschedule_put(inet_twsk(sk));
		goto discard_it;
	case TCP_TW_SUCCESS:;
	}
	goto discard_it;
}
1903 
1904 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1905 	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
1906 	.twsk_unique	= tcp_twsk_unique,
1907 	.twsk_destructor= tcp_twsk_destructor,
1908 };
1909 
1910 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
1911 {
1912 	struct dst_entry *dst = skb_dst(skb);
1913 
1914 	if (dst && dst_hold_safe(dst)) {
1915 		sk->sk_rx_dst = dst;
1916 		inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
1917 	}
1918 }
1919 EXPORT_SYMBOL(inet_sk_rx_dst_set);
1920 
1921 const struct inet_connection_sock_af_ops ipv4_specific = {
1922 	.queue_xmit	   = ip_queue_xmit,
1923 	.send_check	   = tcp_v4_send_check,
1924 	.rebuild_header	   = inet_sk_rebuild_header,
1925 	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
1926 	.conn_request	   = tcp_v4_conn_request,
1927 	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
1928 	.net_header_len	   = sizeof(struct iphdr),
1929 	.setsockopt	   = ip_setsockopt,
1930 	.getsockopt	   = ip_getsockopt,
1931 	.addr2sockaddr	   = inet_csk_addr2sockaddr,
1932 	.sockaddr_len	   = sizeof(struct sockaddr_in),
1933 #ifdef CONFIG_COMPAT
1934 	.compat_setsockopt = compat_ip_setsockopt,
1935 	.compat_getsockopt = compat_ip_getsockopt,
1936 #endif
1937 	.mtu_reduced	   = tcp_v4_mtu_reduced,
1938 };
1939 EXPORT_SYMBOL(ipv4_specific);
1940 
1941 #ifdef CONFIG_TCP_MD5SIG
1942 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1943 	.md5_lookup		= tcp_v4_md5_lookup,
1944 	.calc_md5_hash		= tcp_v4_md5_hash_skb,
1945 	.md5_parse		= tcp_v4_parse_md5_keys,
1946 };
1947 #endif
1948 
1949 /* NOTE: A lot of things set to zero explicitly by call to
1950  *       sk_alloc() so need not be done here.
1951  */
1952 static int tcp_v4_init_sock(struct sock *sk)
1953 {
1954 	struct inet_connection_sock *icsk = inet_csk(sk);
1955 
1956 	tcp_init_sock(sk);
1957 
1958 	icsk->icsk_af_ops = &ipv4_specific;
1959 
1960 #ifdef CONFIG_TCP_MD5SIG
1961 	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
1962 #endif
1963 
1964 	return 0;
1965 }
1966 
/* Release the TCP-level resources of @sk: timers, congestion control and
 * ULP state, queued data, MD5 keys, the bound port, fast open state and
 * the saved SYN, then decrement the allocated-sockets counter.
 */
void tcp_v4_destroy_sock(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	trace_tcp_destroy_sock(sk);

	tcp_clear_xmit_timers(sk);

	tcp_cleanup_congestion_control(sk);

	tcp_cleanup_ulp(sk);

	/* Cleanup up the write buffer. */
	tcp_write_queue_purge(sk);

	/* Check if we want to disable active TFO */
	tcp_fastopen_active_disable_ofo_check(sk);

	/* Cleans up our, hopefully empty, out_of_order_queue. */
	skb_rbtree_purge(&tp->out_of_order_queue);

#ifdef CONFIG_TCP_MD5SIG
	/* Clean up the MD5 key list, if any */
	if (tp->md5sig_info) {
		/* Keys first, then the container; both are freed via RCU. */
		tcp_clear_md5_list(sk);
		kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
		tp->md5sig_info = NULL;
	}
#endif

	/* Clean up a referenced TCP bind bucket. */
	if (inet_csk(sk)->icsk_bind_hash)
		inet_put_port(sk);

	/* A fast open request sock must not be attached any more here. */
	BUG_ON(tp->fastopen_rsk);

	/* If socket is aborted during connect operation */
	tcp_free_fastopen_req(tp);
	tcp_fastopen_destroy_cipher(sk);
	tcp_saved_syn_free(tp);

	sk_sockets_allocated_dec(sk);
}
EXPORT_SYMBOL(tcp_v4_destroy_sock);
2011 
2012 #ifdef CONFIG_PROC_FS
2013 /* Proc filesystem TCP sock list dumping. */
2014 
/*
 * Get the next listening socket following cur.  If cur is NULL, get the
 * first socket starting from the bucket given in st->bucket; when
 * st->bucket is zero the very first socket in the hash table is returned.
 *
 * Only sockets of the seq file's network namespace and of the address
 * family in afinfo are returned.  When a socket is returned, the lock of
 * its bucket (st->bucket) is still held; when NULL is returned, no
 * bucket lock is held.
 */
static void *listening_get_next(struct seq_file *seq, void *cur)
{
	struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);
	struct inet_listen_hashbucket *ilb;
	struct sock *sk = cur;

	if (!sk) {
		/* Start of a bucket: lock it and begin at its head.  Also
		 * jumped to from below when moving on to the next bucket.
		 */
get_head:
		ilb = &tcp_hashinfo.listening_hash[st->bucket];
		spin_lock(&ilb->lock);
		sk = sk_head(&ilb->head);
		st->offset = 0;
		goto get_sk;
	}
	/* Continuing inside the current bucket; its lock is not taken here
	 * (presumably still held from the call that returned cur).
	 */
	ilb = &tcp_hashinfo.listening_hash[st->bucket];
	++st->num;
	++st->offset;

	sk = sk_next(sk);
get_sk:
	sk_for_each_from(sk) {
		if (!net_eq(sock_net(sk), net))
			continue;
		if (sk->sk_family == afinfo->family)
			return sk;
	}
	/* Bucket exhausted: release it and try the next one, if any. */
	spin_unlock(&ilb->lock);
	st->offset = 0;
	if (++st->bucket < INET_LHTABLE_SIZE)
		goto get_head;
	return NULL;
}
2054 
2055 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2056 {
2057 	struct tcp_iter_state *st = seq->private;
2058 	void *rc;
2059 
2060 	st->bucket = 0;
2061 	st->offset = 0;
2062 	rc = listening_get_next(seq, NULL);
2063 
2064 	while (rc && *pos) {
2065 		rc = listening_get_next(seq, rc);
2066 		--*pos;
2067 	}
2068 	return rc;
2069 }
2070 
2071 static inline bool empty_bucket(const struct tcp_iter_state *st)
2072 {
2073 	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2074 }
2075 
2076 /*
2077  * Get first established socket starting from bucket given in st->bucket.
2078  * If st->bucket is zero, the very first socket in the hash is returned.
2079  */
2080 static void *established_get_first(struct seq_file *seq)
2081 {
2082 	struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2083 	struct tcp_iter_state *st = seq->private;
2084 	struct net *net = seq_file_net(seq);
2085 	void *rc = NULL;
2086 
2087 	st->offset = 0;
2088 	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2089 		struct sock *sk;
2090 		struct hlist_nulls_node *node;
2091 		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2092 
2093 		/* Lockless fast path for the common case of empty buckets */
2094 		if (empty_bucket(st))
2095 			continue;
2096 
2097 		spin_lock_bh(lock);
2098 		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2099 			if (sk->sk_family != afinfo->family ||
2100 			    !net_eq(sock_net(sk), net)) {
2101 				continue;
2102 			}
2103 			rc = sk;
2104 			goto out;
2105 		}
2106 		spin_unlock_bh(lock);
2107 	}
2108 out:
2109 	return rc;
2110 }
2111 
2112 static void *established_get_next(struct seq_file *seq, void *cur)
2113 {
2114 	struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2115 	struct sock *sk = cur;
2116 	struct hlist_nulls_node *node;
2117 	struct tcp_iter_state *st = seq->private;
2118 	struct net *net = seq_file_net(seq);
2119 
2120 	++st->num;
2121 	++st->offset;
2122 
2123 	sk = sk_nulls_next(sk);
2124 
2125 	sk_nulls_for_each_from(sk, node) {
2126 		if (sk->sk_family == afinfo->family &&
2127 		    net_eq(sock_net(sk), net))
2128 			return sk;
2129 	}
2130 
2131 	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2132 	++st->bucket;
2133 	return established_get_first(seq);
2134 }
2135 
2136 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2137 {
2138 	struct tcp_iter_state *st = seq->private;
2139 	void *rc;
2140 
2141 	st->bucket = 0;
2142 	rc = established_get_first(seq);
2143 
2144 	while (rc && pos) {
2145 		rc = established_get_next(seq, rc);
2146 		--pos;
2147 	}
2148 	return rc;
2149 }
2150 
2151 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2152 {
2153 	void *rc;
2154 	struct tcp_iter_state *st = seq->private;
2155 
2156 	st->state = TCP_SEQ_STATE_LISTENING;
2157 	rc	  = listening_get_idx(seq, &pos);
2158 
2159 	if (!rc) {
2160 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2161 		rc	  = established_get_idx(seq, pos);
2162 	}
2163 
2164 	return rc;
2165 }
2166 
2167 static void *tcp_seek_last_pos(struct seq_file *seq)
2168 {
2169 	struct tcp_iter_state *st = seq->private;
2170 	int offset = st->offset;
2171 	int orig_num = st->num;
2172 	void *rc = NULL;
2173 
2174 	switch (st->state) {
2175 	case TCP_SEQ_STATE_LISTENING:
2176 		if (st->bucket >= INET_LHTABLE_SIZE)
2177 			break;
2178 		st->state = TCP_SEQ_STATE_LISTENING;
2179 		rc = listening_get_next(seq, NULL);
2180 		while (offset-- && rc)
2181 			rc = listening_get_next(seq, rc);
2182 		if (rc)
2183 			break;
2184 		st->bucket = 0;
2185 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2186 		/* Fallthrough */
2187 	case TCP_SEQ_STATE_ESTABLISHED:
2188 		if (st->bucket > tcp_hashinfo.ehash_mask)
2189 			break;
2190 		rc = established_get_first(seq);
2191 		while (offset-- && rc)
2192 			rc = established_get_next(seq, rc);
2193 	}
2194 
2195 	st->num = orig_num;
2196 
2197 	return rc;
2198 }
2199 
2200 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2201 {
2202 	struct tcp_iter_state *st = seq->private;
2203 	void *rc;
2204 
2205 	if (*pos && *pos == st->last_pos) {
2206 		rc = tcp_seek_last_pos(seq);
2207 		if (rc)
2208 			goto out;
2209 	}
2210 
2211 	st->state = TCP_SEQ_STATE_LISTENING;
2212 	st->num = 0;
2213 	st->bucket = 0;
2214 	st->offset = 0;
2215 	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2216 
2217 out:
2218 	st->last_pos = *pos;
2219 	return rc;
2220 }
2221 EXPORT_SYMBOL(tcp_seq_start);
2222 
2223 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2224 {
2225 	struct tcp_iter_state *st = seq->private;
2226 	void *rc = NULL;
2227 
2228 	if (v == SEQ_START_TOKEN) {
2229 		rc = tcp_get_idx(seq, 0);
2230 		goto out;
2231 	}
2232 
2233 	switch (st->state) {
2234 	case TCP_SEQ_STATE_LISTENING:
2235 		rc = listening_get_next(seq, v);
2236 		if (!rc) {
2237 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2238 			st->bucket = 0;
2239 			st->offset = 0;
2240 			rc	  = established_get_first(seq);
2241 		}
2242 		break;
2243 	case TCP_SEQ_STATE_ESTABLISHED:
2244 		rc = established_get_next(seq, v);
2245 		break;
2246 	}
2247 out:
2248 	++*pos;
2249 	st->last_pos = *pos;
2250 	return rc;
2251 }
2252 EXPORT_SYMBOL(tcp_seq_next);
2253 
2254 void tcp_seq_stop(struct seq_file *seq, void *v)
2255 {
2256 	struct tcp_iter_state *st = seq->private;
2257 
2258 	switch (st->state) {
2259 	case TCP_SEQ_STATE_LISTENING:
2260 		if (v != SEQ_START_TOKEN)
2261 			spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
2262 		break;
2263 	case TCP_SEQ_STATE_ESTABLISHED:
2264 		if (v)
2265 			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2266 		break;
2267 	}
2268 }
2269 EXPORT_SYMBOL(tcp_seq_stop);
2270 
2271 static void get_openreq4(const struct request_sock *req,
2272 			 struct seq_file *f, int i)
2273 {
2274 	const struct inet_request_sock *ireq = inet_rsk(req);
2275 	long delta = req->rsk_timer.expires - jiffies;
2276 
2277 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2278 		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2279 		i,
2280 		ireq->ir_loc_addr,
2281 		ireq->ir_num,
2282 		ireq->ir_rmt_addr,
2283 		ntohs(ireq->ir_rmt_port),
2284 		TCP_SYN_RECV,
2285 		0, 0, /* could print option size, but that is af dependent. */
2286 		1,    /* timers active (only the expire timer) */
2287 		jiffies_delta_to_clock_t(delta),
2288 		req->num_timeout,
2289 		from_kuid_munged(seq_user_ns(f),
2290 				 sock_i_uid(req->rsk_listener)),
2291 		0,  /* non standard timer */
2292 		0, /* open_requests have no inode */
2293 		0,
2294 		req);
2295 }
2296 
2297 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2298 {
2299 	int timer_active;
2300 	unsigned long timer_expires;
2301 	const struct tcp_sock *tp = tcp_sk(sk);
2302 	const struct inet_connection_sock *icsk = inet_csk(sk);
2303 	const struct inet_sock *inet = inet_sk(sk);
2304 	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2305 	__be32 dest = inet->inet_daddr;
2306 	__be32 src = inet->inet_rcv_saddr;
2307 	__u16 destp = ntohs(inet->inet_dport);
2308 	__u16 srcp = ntohs(inet->inet_sport);
2309 	int rx_queue;
2310 	int state;
2311 
2312 	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2313 	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2314 	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2315 		timer_active	= 1;
2316 		timer_expires	= icsk->icsk_timeout;
2317 	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2318 		timer_active	= 4;
2319 		timer_expires	= icsk->icsk_timeout;
2320 	} else if (timer_pending(&sk->sk_timer)) {
2321 		timer_active	= 2;
2322 		timer_expires	= sk->sk_timer.expires;
2323 	} else {
2324 		timer_active	= 0;
2325 		timer_expires = jiffies;
2326 	}
2327 
2328 	state = inet_sk_state_load(sk);
2329 	if (state == TCP_LISTEN)
2330 		rx_queue = sk->sk_ack_backlog;
2331 	else
2332 		/* Because we don't lock the socket,
2333 		 * we might find a transient negative value.
2334 		 */
2335 		rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2336 
2337 	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2338 			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2339 		i, src, srcp, dest, destp, state,
2340 		tp->write_seq - tp->snd_una,
2341 		rx_queue,
2342 		timer_active,
2343 		jiffies_delta_to_clock_t(timer_expires - jiffies),
2344 		icsk->icsk_retransmits,
2345 		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2346 		icsk->icsk_probes_out,
2347 		sock_i_ino(sk),
2348 		refcount_read(&sk->sk_refcnt), sk,
2349 		jiffies_to_clock_t(icsk->icsk_rto),
2350 		jiffies_to_clock_t(icsk->icsk_ack.ato),
2351 		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2352 		tp->snd_cwnd,
2353 		state == TCP_LISTEN ?
2354 		    fastopenq->max_qlen :
2355 		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2356 }
2357 
2358 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2359 			       struct seq_file *f, int i)
2360 {
2361 	long delta = tw->tw_timer.expires - jiffies;
2362 	__be32 dest, src;
2363 	__u16 destp, srcp;
2364 
2365 	dest  = tw->tw_daddr;
2366 	src   = tw->tw_rcv_saddr;
2367 	destp = ntohs(tw->tw_dport);
2368 	srcp  = ntohs(tw->tw_sport);
2369 
2370 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2371 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2372 		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2373 		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2374 		refcount_read(&tw->tw_refcnt), tw);
2375 }
2376 
2377 #define TMPSZ 150
2378 
2379 static int tcp4_seq_show(struct seq_file *seq, void *v)
2380 {
2381 	struct tcp_iter_state *st;
2382 	struct sock *sk = v;
2383 
2384 	seq_setwidth(seq, TMPSZ - 1);
2385 	if (v == SEQ_START_TOKEN) {
2386 		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2387 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2388 			   "inode");
2389 		goto out;
2390 	}
2391 	st = seq->private;
2392 
2393 	if (sk->sk_state == TCP_TIME_WAIT)
2394 		get_timewait4_sock(v, seq, st->num);
2395 	else if (sk->sk_state == TCP_NEW_SYN_RECV)
2396 		get_openreq4(v, seq, st->num);
2397 	else
2398 		get_tcp4_sock(v, seq, st->num);
2399 out:
2400 	seq_pad(seq, '\n');
2401 	return 0;
2402 }
2403 
2404 static const struct seq_operations tcp4_seq_ops = {
2405 	.show		= tcp4_seq_show,
2406 	.start		= tcp_seq_start,
2407 	.next		= tcp_seq_next,
2408 	.stop		= tcp_seq_stop,
2409 };
2410 
2411 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2412 	.family		= AF_INET,
2413 };
2414 
2415 static int __net_init tcp4_proc_init_net(struct net *net)
2416 {
2417 	if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
2418 			sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
2419 		return -ENOMEM;
2420 	return 0;
2421 }
2422 
2423 static void __net_exit tcp4_proc_exit_net(struct net *net)
2424 {
2425 	remove_proc_entry("tcp", net->proc_net);
2426 }
2427 
2428 static struct pernet_operations tcp4_net_ops = {
2429 	.init = tcp4_proc_init_net,
2430 	.exit = tcp4_proc_exit_net,
2431 };
2432 
2433 int __init tcp4_proc_init(void)
2434 {
2435 	return register_pernet_subsys(&tcp4_net_ops);
2436 }
2437 
2438 void tcp4_proc_exit(void)
2439 {
2440 	unregister_pernet_subsys(&tcp4_net_ops);
2441 }
2442 #endif /* CONFIG_PROC_FS */
2443 
2444 struct proto tcp_prot = {
2445 	.name			= "TCP",
2446 	.owner			= THIS_MODULE,
2447 	.close			= tcp_close,
2448 	.pre_connect		= tcp_v4_pre_connect,
2449 	.connect		= tcp_v4_connect,
2450 	.disconnect		= tcp_disconnect,
2451 	.accept			= inet_csk_accept,
2452 	.ioctl			= tcp_ioctl,
2453 	.init			= tcp_v4_init_sock,
2454 	.destroy		= tcp_v4_destroy_sock,
2455 	.shutdown		= tcp_shutdown,
2456 	.setsockopt		= tcp_setsockopt,
2457 	.getsockopt		= tcp_getsockopt,
2458 	.keepalive		= tcp_set_keepalive,
2459 	.recvmsg		= tcp_recvmsg,
2460 	.sendmsg		= tcp_sendmsg,
2461 	.sendpage		= tcp_sendpage,
2462 	.backlog_rcv		= tcp_v4_do_rcv,
2463 	.release_cb		= tcp_release_cb,
2464 	.hash			= inet_hash,
2465 	.unhash			= inet_unhash,
2466 	.get_port		= inet_csk_get_port,
2467 	.enter_memory_pressure	= tcp_enter_memory_pressure,
2468 	.leave_memory_pressure	= tcp_leave_memory_pressure,
2469 	.stream_memory_free	= tcp_stream_memory_free,
2470 	.sockets_allocated	= &tcp_sockets_allocated,
2471 	.orphan_count		= &tcp_orphan_count,
2472 	.memory_allocated	= &tcp_memory_allocated,
2473 	.memory_pressure	= &tcp_memory_pressure,
2474 	.sysctl_mem		= sysctl_tcp_mem,
2475 	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
2476 	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
2477 	.max_header		= MAX_TCP_HEADER,
2478 	.obj_size		= sizeof(struct tcp_sock),
2479 	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
2480 	.twsk_prot		= &tcp_timewait_sock_ops,
2481 	.rsk_prot		= &tcp_request_sock_ops,
2482 	.h.hashinfo		= &tcp_hashinfo,
2483 	.no_autobind		= true,
2484 #ifdef CONFIG_COMPAT
2485 	.compat_setsockopt	= compat_tcp_setsockopt,
2486 	.compat_getsockopt	= compat_tcp_getsockopt,
2487 #endif
2488 	.diag_destroy		= tcp_abort,
2489 };
2490 EXPORT_SYMBOL(tcp_prot);
2491 
2492 static void __net_exit tcp_sk_exit(struct net *net)
2493 {
2494 	int cpu;
2495 
2496 	module_put(net->ipv4.tcp_congestion_control->owner);
2497 
2498 	for_each_possible_cpu(cpu)
2499 		inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2500 	free_percpu(net->ipv4.tcp_sk);
2501 }
2502 
2503 static int __net_init tcp_sk_init(struct net *net)
2504 {
2505 	int res, cpu, cnt;
2506 
2507 	net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2508 	if (!net->ipv4.tcp_sk)
2509 		return -ENOMEM;
2510 
2511 	for_each_possible_cpu(cpu) {
2512 		struct sock *sk;
2513 
2514 		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2515 					   IPPROTO_TCP, net);
2516 		if (res)
2517 			goto fail;
2518 		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2519 		*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2520 	}
2521 
2522 	net->ipv4.sysctl_tcp_ecn = 2;
2523 	net->ipv4.sysctl_tcp_ecn_fallback = 1;
2524 
2525 	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2526 	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2527 	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2528 
2529 	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2530 	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2531 	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2532 
2533 	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2534 	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2535 	net->ipv4.sysctl_tcp_syncookies = 1;
2536 	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2537 	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2538 	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2539 	net->ipv4.sysctl_tcp_orphan_retries = 0;
2540 	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2541 	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
2542 	net->ipv4.sysctl_tcp_tw_reuse = 2;
2543 
2544 	cnt = tcp_hashinfo.ehash_mask + 1;
2545 	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = (cnt + 1) / 2;
2546 	net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
2547 
2548 	net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 256);
2549 	net->ipv4.sysctl_tcp_sack = 1;
2550 	net->ipv4.sysctl_tcp_window_scaling = 1;
2551 	net->ipv4.sysctl_tcp_timestamps = 1;
2552 	net->ipv4.sysctl_tcp_early_retrans = 3;
2553 	net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
2554 	net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
2555 	net->ipv4.sysctl_tcp_retrans_collapse = 1;
2556 	net->ipv4.sysctl_tcp_max_reordering = 300;
2557 	net->ipv4.sysctl_tcp_dsack = 1;
2558 	net->ipv4.sysctl_tcp_app_win = 31;
2559 	net->ipv4.sysctl_tcp_adv_win_scale = 1;
2560 	net->ipv4.sysctl_tcp_frto = 2;
2561 	net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
2562 	/* This limits the percentage of the congestion window which we
2563 	 * will allow a single TSO frame to consume.  Building TSO frames
2564 	 * which are too large can cause TCP streams to be bursty.
2565 	 */
2566 	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
2567 	/* Default TSQ limit of four TSO segments */
2568 	net->ipv4.sysctl_tcp_limit_output_bytes = 262144;
2569 	/* rfc5961 challenge ack rate limiting */
2570 	net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
2571 	net->ipv4.sysctl_tcp_min_tso_segs = 2;
2572 	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
2573 	net->ipv4.sysctl_tcp_autocorking = 1;
2574 	net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
2575 	net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
2576 	net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
2577 	if (net != &init_net) {
2578 		memcpy(net->ipv4.sysctl_tcp_rmem,
2579 		       init_net.ipv4.sysctl_tcp_rmem,
2580 		       sizeof(init_net.ipv4.sysctl_tcp_rmem));
2581 		memcpy(net->ipv4.sysctl_tcp_wmem,
2582 		       init_net.ipv4.sysctl_tcp_wmem,
2583 		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
2584 	}
2585 	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
2586 	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
2587 	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
2588 	spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
2589 	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
2590 	atomic_set(&net->ipv4.tfo_active_disable_times, 0);
2591 
2592 	/* Reno is always built in */
2593 	if (!net_eq(net, &init_net) &&
2594 	    try_module_get(init_net.ipv4.tcp_congestion_control->owner))
2595 		net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
2596 	else
2597 		net->ipv4.tcp_congestion_control = &tcp_reno;
2598 
2599 	return 0;
2600 fail:
2601 	tcp_sk_exit(net);
2602 
2603 	return res;
2604 }
2605 
2606 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2607 {
2608 	struct net *net;
2609 
2610 	inet_twsk_purge(&tcp_hashinfo, AF_INET);
2611 
2612 	list_for_each_entry(net, net_exit_list, exit_list)
2613 		tcp_fastopen_ctx_destroy(net);
2614 }
2615 
2616 static struct pernet_operations __net_initdata tcp_sk_ops = {
2617        .init	   = tcp_sk_init,
2618        .exit	   = tcp_sk_exit,
2619        .exit_batch = tcp_sk_exit_batch,
2620 };
2621 
2622 void __init tcp_v4_init(void)
2623 {
2624 	if (register_pernet_subsys(&tcp_sk_ops))
2625 		panic("Failed to create the TCP control socket.\n");
2626 }
2627