xref: /openbmc/linux/net/ipv4/tcp_ipv4.c (revision dc6a81c3)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		Implementation of the Transmission Control Protocol(TCP).
8  *
9  *		IPv4 specific functions
10  *
11  *		code split from:
12  *		linux/ipv4/tcp.c
13  *		linux/ipv4/tcp_input.c
14  *		linux/ipv4/tcp_output.c
15  *
16  *		See tcp.c for author information
17  */
18 
19 /*
20  * Changes:
21  *		David S. Miller	:	New socket lookup architecture.
22  *					This code is dedicated to John Dyson.
23  *		David S. Miller :	Change semantics of established hash,
24  *					half is devoted to TIME_WAIT sockets
25  *					and the rest go in the other half.
26  *		Andi Kleen :		Add support for syncookies and fixed
27  *					some bugs: ip options weren't passed to
28  *					the TCP layer, missed a check for an
29  *					ACK bit.
30  *		Andi Kleen :		Implemented fast path mtu discovery.
31  *	     				Fixed many serious bugs in the
32  *					request_sock handling and moved
33  *					most of it into the af independent code.
34  *					Added tail drop and some other bugfixes.
35  *					Added new listen semantics.
36  *		Mike McLagan	:	Routing by source
37  *	Juan Jose Ciarlante:		ip_dynaddr bits
38  *		Andi Kleen:		various fixes.
39  *	Vitaly E. Lavrov	:	Transparent proxy revived after year
40  *					coma.
41  *	Andi Kleen		:	Fix new listen.
42  *	Andi Kleen		:	Fix accept error reporting.
43  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
44  *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
45  *					a single port at the same time.
46  */
47 
48 #define pr_fmt(fmt) "TCP: " fmt
49 
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
60 
61 #include <net/net_namespace.h>
62 #include <net/icmp.h>
63 #include <net/inet_hashtables.h>
64 #include <net/tcp.h>
65 #include <net/transp_v6.h>
66 #include <net/ipv6.h>
67 #include <net/inet_common.h>
68 #include <net/timewait_sock.h>
69 #include <net/xfrm.h>
70 #include <net/secure_seq.h>
71 #include <net/busy_poll.h>
72 
73 #include <linux/inet.h>
74 #include <linux/ipv6.h>
75 #include <linux/stddef.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
78 #include <linux/inetdevice.h>
79 
80 #include <crypto/hash.h>
81 #include <linux/scatterlist.h>
82 
83 #include <trace/events/tcp.h>
84 
85 #ifdef CONFIG_TCP_MD5SIG
86 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
87 			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
88 #endif
89 
90 struct inet_hashinfo tcp_hashinfo;
91 EXPORT_SYMBOL(tcp_hashinfo);
92 
93 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
94 {
95 	return secure_tcp_seq(ip_hdr(skb)->daddr,
96 			      ip_hdr(skb)->saddr,
97 			      tcp_hdr(skb)->dest,
98 			      tcp_hdr(skb)->source);
99 }
100 
101 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
102 {
103 	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
104 }
105 
106 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
107 {
108 	const struct inet_timewait_sock *tw = inet_twsk(sktw);
109 	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
110 	struct tcp_sock *tp = tcp_sk(sk);
111 	int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;
112 
113 	if (reuse == 2) {
114 		/* Still does not detect *everything* that goes through
115 		 * lo, since we require a loopback src or dst address
116 		 * or direct binding to 'lo' interface.
117 		 */
118 		bool loopback = false;
119 		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
120 			loopback = true;
121 #if IS_ENABLED(CONFIG_IPV6)
122 		if (tw->tw_family == AF_INET6) {
123 			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
124 			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
125 			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
126 			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
127 				loopback = true;
128 		} else
129 #endif
130 		{
131 			if (ipv4_is_loopback(tw->tw_daddr) ||
132 			    ipv4_is_loopback(tw->tw_rcv_saddr))
133 				loopback = true;
134 		}
135 		if (!loopback)
136 			reuse = 0;
137 	}
138 
139 	/* With PAWS, it is safe from the viewpoint
140 	   of data integrity. Even without PAWS it is safe provided sequence
141 	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.
142 
143 	   Actually, the idea is close to VJ's one, only timestamp cache is
144 	   held not per host, but per port pair and TW bucket is used as state
145 	   holder.
146 
147 	   If TW bucket has been already destroyed we fall back to VJ's scheme
148 	   and use initial timestamp retrieved from peer table.
149 	 */
150 	if (tcptw->tw_ts_recent_stamp &&
151 	    (!twp || (reuse && time_after32(ktime_get_seconds(),
152 					    tcptw->tw_ts_recent_stamp)))) {
153 		/* In case of repair and re-using TIME-WAIT sockets we still
154 		 * want to be sure that it is safe as above but honor the
155 		 * sequence numbers and time stamps set as part of the repair
156 		 * process.
157 		 *
158 		 * Without this check re-using a TIME-WAIT socket with TCP
159 		 * repair would accumulate a -1 on the repair assigned
160 		 * sequence number. The first time it is reused the sequence
161 		 * is -1, the second time -2, etc. This fixes that issue
162 		 * without appearing to create any others.
163 		 */
164 		if (likely(!tp->repair)) {
165 			u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
166 
167 			if (!seq)
168 				seq = 1;
169 			WRITE_ONCE(tp->write_seq, seq);
170 			tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
171 			tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
172 		}
173 		sock_hold(sktw);
174 		return 1;
175 	}
176 
177 	return 0;
178 }
179 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
180 
181 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
182 			      int addr_len)
183 {
184 	/* This check is replicated from tcp_v4_connect() and intended to
185 	 * prevent BPF program called below from accessing bytes that are out
186 	 * of the bound specified by user in addr_len.
187 	 */
188 	if (addr_len < sizeof(struct sockaddr_in))
189 		return -EINVAL;
190 
191 	sock_owned_by_me(sk);
192 
193 	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
194 }
195 
196 /* This will initiate an outgoing connection. */
197 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
198 {
199 	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
200 	struct inet_sock *inet = inet_sk(sk);
201 	struct tcp_sock *tp = tcp_sk(sk);
202 	__be16 orig_sport, orig_dport;
203 	__be32 daddr, nexthop;
204 	struct flowi4 *fl4;
205 	struct rtable *rt;
206 	int err;
207 	struct ip_options_rcu *inet_opt;
208 	struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
209 
210 	if (addr_len < sizeof(struct sockaddr_in))
211 		return -EINVAL;
212 
213 	if (usin->sin_family != AF_INET)
214 		return -EAFNOSUPPORT;
215 
216 	nexthop = daddr = usin->sin_addr.s_addr;
217 	inet_opt = rcu_dereference_protected(inet->inet_opt,
218 					     lockdep_sock_is_held(sk));
219 	if (inet_opt && inet_opt->opt.srr) {
220 		if (!daddr)
221 			return -EINVAL;
222 		nexthop = inet_opt->opt.faddr;
223 	}
224 
225 	orig_sport = inet->inet_sport;
226 	orig_dport = usin->sin_port;
227 	fl4 = &inet->cork.fl.u.ip4;
228 	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
229 			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
230 			      IPPROTO_TCP,
231 			      orig_sport, orig_dport, sk);
232 	if (IS_ERR(rt)) {
233 		err = PTR_ERR(rt);
234 		if (err == -ENETUNREACH)
235 			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
236 		return err;
237 	}
238 
239 	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
240 		ip_rt_put(rt);
241 		return -ENETUNREACH;
242 	}
243 
244 	if (!inet_opt || !inet_opt->opt.srr)
245 		daddr = fl4->daddr;
246 
247 	if (!inet->inet_saddr)
248 		inet->inet_saddr = fl4->saddr;
249 	sk_rcv_saddr_set(sk, inet->inet_saddr);
250 
251 	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
252 		/* Reset inherited state */
253 		tp->rx_opt.ts_recent	   = 0;
254 		tp->rx_opt.ts_recent_stamp = 0;
255 		if (likely(!tp->repair))
256 			WRITE_ONCE(tp->write_seq, 0);
257 	}
258 
259 	inet->inet_dport = usin->sin_port;
260 	sk_daddr_set(sk, daddr);
261 
262 	inet_csk(sk)->icsk_ext_hdr_len = 0;
263 	if (inet_opt)
264 		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
265 
266 	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
267 
268 	/* Socket identity is still unknown (sport may be zero).
269 	 * However we set state to SYN-SENT and not releasing socket
270 	 * lock select source port, enter ourselves into the hash tables and
271 	 * complete initialization after this.
272 	 */
273 	tcp_set_state(sk, TCP_SYN_SENT);
274 	err = inet_hash_connect(tcp_death_row, sk);
275 	if (err)
276 		goto failure;
277 
278 	sk_set_txhash(sk);
279 
280 	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
281 			       inet->inet_sport, inet->inet_dport, sk);
282 	if (IS_ERR(rt)) {
283 		err = PTR_ERR(rt);
284 		rt = NULL;
285 		goto failure;
286 	}
287 	/* OK, now commit destination to socket.  */
288 	sk->sk_gso_type = SKB_GSO_TCPV4;
289 	sk_setup_caps(sk, &rt->dst);
290 	rt = NULL;
291 
292 	if (likely(!tp->repair)) {
293 		if (!tp->write_seq)
294 			WRITE_ONCE(tp->write_seq,
295 				   secure_tcp_seq(inet->inet_saddr,
296 						  inet->inet_daddr,
297 						  inet->inet_sport,
298 						  usin->sin_port));
299 		tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
300 						 inet->inet_saddr,
301 						 inet->inet_daddr);
302 	}
303 
304 	inet->inet_id = prandom_u32();
305 
306 	if (tcp_fastopen_defer_connect(sk, &err))
307 		return err;
308 	if (err)
309 		goto failure;
310 
311 	err = tcp_connect(sk);
312 
313 	if (err)
314 		goto failure;
315 
316 	return 0;
317 
318 failure:
319 	/*
320 	 * This unhashes the socket and releases the local port,
321 	 * if necessary.
322 	 */
323 	tcp_set_state(sk, TCP_CLOSE);
324 	ip_rt_put(rt);
325 	sk->sk_route_caps = 0;
326 	inet->inet_dport = 0;
327 	return err;
328 }
329 EXPORT_SYMBOL(tcp_v4_connect);
330 
331 /*
332  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
333  * It can be called through tcp_release_cb() if socket was owned by user
334  * at the time tcp_v4_err() was called to handle ICMP message.
335  */
336 void tcp_v4_mtu_reduced(struct sock *sk)
337 {
338 	struct inet_sock *inet = inet_sk(sk);
339 	struct dst_entry *dst;
340 	u32 mtu;
341 
342 	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
343 		return;
344 	mtu = tcp_sk(sk)->mtu_info;
345 	dst = inet_csk_update_pmtu(sk, mtu);
346 	if (!dst)
347 		return;
348 
349 	/* Something is about to be wrong... Remember soft error
350 	 * for the case, if this connection will not able to recover.
351 	 */
352 	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
353 		sk->sk_err_soft = EMSGSIZE;
354 
355 	mtu = dst_mtu(dst);
356 
357 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
358 	    ip_sk_accept_pmtu(sk) &&
359 	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
360 		tcp_sync_mss(sk, mtu);
361 
362 		/* Resend the TCP packet because it's
363 		 * clear that the old packet has been
364 		 * dropped. This is the new "fast" path mtu
365 		 * discovery.
366 		 */
367 		tcp_simple_retransmit(sk);
368 	} /* else let the usual retransmit timer handle it */
369 }
370 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
371 
372 static void do_redirect(struct sk_buff *skb, struct sock *sk)
373 {
374 	struct dst_entry *dst = __sk_dst_check(sk, 0);
375 
376 	if (dst)
377 		dst->ops->redirect(dst, sk, skb);
378 }
379 
380 
381 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
382 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
383 {
384 	struct request_sock *req = inet_reqsk(sk);
385 	struct net *net = sock_net(sk);
386 
387 	/* ICMPs are not backlogged, hence we cannot get
388 	 * an established socket here.
389 	 */
390 	if (seq != tcp_rsk(req)->snt_isn) {
391 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
392 	} else if (abort) {
393 		/*
394 		 * Still in SYN_RECV, just remove it silently.
395 		 * There is no good way to pass the error to the newly
396 		 * created socket, and POSIX does not want network
397 		 * errors returned from accept().
398 		 */
399 		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
400 		tcp_listendrop(req->rsk_listener);
401 	}
402 	reqsk_put(req);
403 }
404 EXPORT_SYMBOL(tcp_req_err);
405 
406 /*
407  * This routine is called by the ICMP module when it gets some
408  * sort of error condition.  If err < 0 then the socket should
409  * be closed and the error returned to the user.  If err > 0
410  * it's just the icmp type << 8 | icmp code.  After adjustment
411  * header points to the first 8 bytes of the tcp header.  We need
412  * to find the appropriate port.
413  *
414  * The locking strategy used here is very "optimistic". When
415  * someone else accesses the socket the ICMP is just dropped
416  * and for some paths there is no check at all.
417  * A more general error queue to queue errors for later handling
418  * is probably better.
419  *
420  */
421 
422 int tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
423 {
424 	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
425 	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
426 	struct inet_connection_sock *icsk;
427 	struct tcp_sock *tp;
428 	struct inet_sock *inet;
429 	const int type = icmp_hdr(icmp_skb)->type;
430 	const int code = icmp_hdr(icmp_skb)->code;
431 	struct sock *sk;
432 	struct sk_buff *skb;
433 	struct request_sock *fastopen;
434 	u32 seq, snd_una;
435 	s32 remaining;
436 	u32 delta_us;
437 	int err;
438 	struct net *net = dev_net(icmp_skb->dev);
439 
440 	sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
441 				       th->dest, iph->saddr, ntohs(th->source),
442 				       inet_iif(icmp_skb), 0);
443 	if (!sk) {
444 		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
445 		return -ENOENT;
446 	}
447 	if (sk->sk_state == TCP_TIME_WAIT) {
448 		inet_twsk_put(inet_twsk(sk));
449 		return 0;
450 	}
451 	seq = ntohl(th->seq);
452 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
453 		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
454 				     type == ICMP_TIME_EXCEEDED ||
455 				     (type == ICMP_DEST_UNREACH &&
456 				      (code == ICMP_NET_UNREACH ||
457 				       code == ICMP_HOST_UNREACH)));
458 		return 0;
459 	}
460 
461 	bh_lock_sock(sk);
462 	/* If too many ICMPs get dropped on busy
463 	 * servers this needs to be solved differently.
464 	 * We do take care of PMTU discovery (RFC1191) special case :
465 	 * we can receive locally generated ICMP messages while socket is held.
466 	 */
467 	if (sock_owned_by_user(sk)) {
468 		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
469 			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
470 	}
471 	if (sk->sk_state == TCP_CLOSE)
472 		goto out;
473 
474 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
475 		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
476 		goto out;
477 	}
478 
479 	icsk = inet_csk(sk);
480 	tp = tcp_sk(sk);
481 	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
482 	fastopen = rcu_dereference(tp->fastopen_rsk);
483 	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
484 	if (sk->sk_state != TCP_LISTEN &&
485 	    !between(seq, snd_una, tp->snd_nxt)) {
486 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
487 		goto out;
488 	}
489 
490 	switch (type) {
491 	case ICMP_REDIRECT:
492 		if (!sock_owned_by_user(sk))
493 			do_redirect(icmp_skb, sk);
494 		goto out;
495 	case ICMP_SOURCE_QUENCH:
496 		/* Just silently ignore these. */
497 		goto out;
498 	case ICMP_PARAMETERPROB:
499 		err = EPROTO;
500 		break;
501 	case ICMP_DEST_UNREACH:
502 		if (code > NR_ICMP_UNREACH)
503 			goto out;
504 
505 		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
506 			/* We are not interested in TCP_LISTEN and open_requests
507 			 * (SYN-ACKs send out by Linux are always <576bytes so
508 			 * they should go through unfragmented).
509 			 */
510 			if (sk->sk_state == TCP_LISTEN)
511 				goto out;
512 
513 			tp->mtu_info = info;
514 			if (!sock_owned_by_user(sk)) {
515 				tcp_v4_mtu_reduced(sk);
516 			} else {
517 				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
518 					sock_hold(sk);
519 			}
520 			goto out;
521 		}
522 
523 		err = icmp_err_convert[code].errno;
524 		/* check if icmp_skb allows revert of backoff
525 		 * (see draft-zimmermann-tcp-lcd) */
526 		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
527 			break;
528 		if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
529 		    !icsk->icsk_backoff || fastopen)
530 			break;
531 
532 		if (sock_owned_by_user(sk))
533 			break;
534 
535 		skb = tcp_rtx_queue_head(sk);
536 		if (WARN_ON_ONCE(!skb))
537 			break;
538 
539 		icsk->icsk_backoff--;
540 		icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
541 					       TCP_TIMEOUT_INIT;
542 		icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
543 
544 
545 		tcp_mstamp_refresh(tp);
546 		delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
547 		remaining = icsk->icsk_rto -
548 			    usecs_to_jiffies(delta_us);
549 
550 		if (remaining > 0) {
551 			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
552 						  remaining, TCP_RTO_MAX);
553 		} else {
554 			/* RTO revert clocked out retransmission.
555 			 * Will retransmit now */
556 			tcp_retransmit_timer(sk);
557 		}
558 
559 		break;
560 	case ICMP_TIME_EXCEEDED:
561 		err = EHOSTUNREACH;
562 		break;
563 	default:
564 		goto out;
565 	}
566 
567 	switch (sk->sk_state) {
568 	case TCP_SYN_SENT:
569 	case TCP_SYN_RECV:
570 		/* Only in fast or simultaneous open. If a fast open socket is
571 		 * is already accepted it is treated as a connected one below.
572 		 */
573 		if (fastopen && !fastopen->sk)
574 			break;
575 
576 		if (!sock_owned_by_user(sk)) {
577 			sk->sk_err = err;
578 
579 			sk->sk_error_report(sk);
580 
581 			tcp_done(sk);
582 		} else {
583 			sk->sk_err_soft = err;
584 		}
585 		goto out;
586 	}
587 
588 	/* If we've already connected we will keep trying
589 	 * until we time out, or the user gives up.
590 	 *
591 	 * rfc1122 4.2.3.9 allows to consider as hard errors
592 	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
593 	 * but it is obsoleted by pmtu discovery).
594 	 *
595 	 * Note, that in modern internet, where routing is unreliable
596 	 * and in each dark corner broken firewalls sit, sending random
597 	 * errors ordered by their masters even this two messages finally lose
598 	 * their original sense (even Linux sends invalid PORT_UNREACHs)
599 	 *
600 	 * Now we are in compliance with RFCs.
601 	 *							--ANK (980905)
602 	 */
603 
604 	inet = inet_sk(sk);
605 	if (!sock_owned_by_user(sk) && inet->recverr) {
606 		sk->sk_err = err;
607 		sk->sk_error_report(sk);
608 	} else	{ /* Only an error on timeout */
609 		sk->sk_err_soft = err;
610 	}
611 
612 out:
613 	bh_unlock_sock(sk);
614 	sock_put(sk);
615 	return 0;
616 }
617 
618 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
619 {
620 	struct tcphdr *th = tcp_hdr(skb);
621 
622 	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
623 	skb->csum_start = skb_transport_header(skb) - skb->head;
624 	skb->csum_offset = offsetof(struct tcphdr, check);
625 }
626 
627 /* This routine computes an IPv4 TCP checksum. */
628 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
629 {
630 	const struct inet_sock *inet = inet_sk(sk);
631 
632 	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
633 }
634 EXPORT_SYMBOL(tcp_v4_send_check);
635 
636 /*
637  *	This routine will send an RST to the other tcp.
638  *
639  *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
640  *		      for reset.
641  *	Answer: if a packet caused RST, it is not for a socket
642  *		existing in our system, if it is matched to a socket,
643  *		it is just duplicate segment or bug in other side's TCP.
644  *		So that we build reply only basing on parameters
645  *		arrived with segment.
646  *	Exception: precedence violation. We do not implement it in any case.
647  */
648 
649 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
650 {
651 	const struct tcphdr *th = tcp_hdr(skb);
652 	struct {
653 		struct tcphdr th;
654 #ifdef CONFIG_TCP_MD5SIG
655 		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
656 #endif
657 	} rep;
658 	struct ip_reply_arg arg;
659 #ifdef CONFIG_TCP_MD5SIG
660 	struct tcp_md5sig_key *key = NULL;
661 	const __u8 *hash_location = NULL;
662 	unsigned char newhash[16];
663 	int genhash;
664 	struct sock *sk1 = NULL;
665 #endif
666 	u64 transmit_time = 0;
667 	struct sock *ctl_sk;
668 	struct net *net;
669 
670 	/* Never send a reset in response to a reset. */
671 	if (th->rst)
672 		return;
673 
674 	/* If sk not NULL, it means we did a successful lookup and incoming
675 	 * route had to be correct. prequeue might have dropped our dst.
676 	 */
677 	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
678 		return;
679 
680 	/* Swap the send and the receive. */
681 	memset(&rep, 0, sizeof(rep));
682 	rep.th.dest   = th->source;
683 	rep.th.source = th->dest;
684 	rep.th.doff   = sizeof(struct tcphdr) / 4;
685 	rep.th.rst    = 1;
686 
687 	if (th->ack) {
688 		rep.th.seq = th->ack_seq;
689 	} else {
690 		rep.th.ack = 1;
691 		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
692 				       skb->len - (th->doff << 2));
693 	}
694 
695 	memset(&arg, 0, sizeof(arg));
696 	arg.iov[0].iov_base = (unsigned char *)&rep;
697 	arg.iov[0].iov_len  = sizeof(rep.th);
698 
699 	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
700 #ifdef CONFIG_TCP_MD5SIG
701 	rcu_read_lock();
702 	hash_location = tcp_parse_md5sig_option(th);
703 	if (sk && sk_fullsock(sk)) {
704 		const union tcp_md5_addr *addr;
705 		int l3index;
706 
707 		/* sdif set, means packet ingressed via a device
708 		 * in an L3 domain and inet_iif is set to it.
709 		 */
710 		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
711 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
712 		key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
713 	} else if (hash_location) {
714 		const union tcp_md5_addr *addr;
715 		int sdif = tcp_v4_sdif(skb);
716 		int dif = inet_iif(skb);
717 		int l3index;
718 
719 		/*
720 		 * active side is lost. Try to find listening socket through
721 		 * source port, and then find md5 key through listening socket.
722 		 * we are not loose security here:
723 		 * Incoming packet is checked with md5 hash with finding key,
724 		 * no RST generated if md5 hash doesn't match.
725 		 */
726 		sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
727 					     ip_hdr(skb)->saddr,
728 					     th->source, ip_hdr(skb)->daddr,
729 					     ntohs(th->source), dif, sdif);
730 		/* don't send rst if it can't find key */
731 		if (!sk1)
732 			goto out;
733 
734 		/* sdif set, means packet ingressed via a device
735 		 * in an L3 domain and dif is set to it.
736 		 */
737 		l3index = sdif ? dif : 0;
738 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
739 		key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
740 		if (!key)
741 			goto out;
742 
743 
744 		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
745 		if (genhash || memcmp(hash_location, newhash, 16) != 0)
746 			goto out;
747 
748 	}
749 
750 	if (key) {
751 		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
752 				   (TCPOPT_NOP << 16) |
753 				   (TCPOPT_MD5SIG << 8) |
754 				   TCPOLEN_MD5SIG);
755 		/* Update length and the length the header thinks exists */
756 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
757 		rep.th.doff = arg.iov[0].iov_len / 4;
758 
759 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
760 				     key, ip_hdr(skb)->saddr,
761 				     ip_hdr(skb)->daddr, &rep.th);
762 	}
763 #endif
764 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
765 				      ip_hdr(skb)->saddr, /* XXX */
766 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
767 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
768 	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
769 
770 	/* When socket is gone, all binding information is lost.
771 	 * routing might fail in this case. No choice here, if we choose to force
772 	 * input interface, we will misroute in case of asymmetric route.
773 	 */
774 	if (sk) {
775 		arg.bound_dev_if = sk->sk_bound_dev_if;
776 		if (sk_fullsock(sk))
777 			trace_tcp_send_reset(sk, skb);
778 	}
779 
780 	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
781 		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));
782 
783 	arg.tos = ip_hdr(skb)->tos;
784 	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
785 	local_bh_disable();
786 	ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
787 	if (sk) {
788 		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
789 				   inet_twsk(sk)->tw_mark : sk->sk_mark;
790 		ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
791 				   inet_twsk(sk)->tw_priority : sk->sk_priority;
792 		transmit_time = tcp_transmit_time(sk);
793 	}
794 	ip_send_unicast_reply(ctl_sk,
795 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
796 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
797 			      &arg, arg.iov[0].iov_len,
798 			      transmit_time);
799 
800 	ctl_sk->sk_mark = 0;
801 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
802 	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
803 	local_bh_enable();
804 
805 #ifdef CONFIG_TCP_MD5SIG
806 out:
807 	rcu_read_unlock();
808 #endif
809 }
810 
811 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
812    outside socket context is ugly, certainly. What can I do?
813  */
814 
815 static void tcp_v4_send_ack(const struct sock *sk,
816 			    struct sk_buff *skb, u32 seq, u32 ack,
817 			    u32 win, u32 tsval, u32 tsecr, int oif,
818 			    struct tcp_md5sig_key *key,
819 			    int reply_flags, u8 tos)
820 {
821 	const struct tcphdr *th = tcp_hdr(skb);
822 	struct {
823 		struct tcphdr th;
824 		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
825 #ifdef CONFIG_TCP_MD5SIG
826 			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
827 #endif
828 			];
829 	} rep;
830 	struct net *net = sock_net(sk);
831 	struct ip_reply_arg arg;
832 	struct sock *ctl_sk;
833 	u64 transmit_time;
834 
835 	memset(&rep.th, 0, sizeof(struct tcphdr));
836 	memset(&arg, 0, sizeof(arg));
837 
838 	arg.iov[0].iov_base = (unsigned char *)&rep;
839 	arg.iov[0].iov_len  = sizeof(rep.th);
840 	if (tsecr) {
841 		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
842 				   (TCPOPT_TIMESTAMP << 8) |
843 				   TCPOLEN_TIMESTAMP);
844 		rep.opt[1] = htonl(tsval);
845 		rep.opt[2] = htonl(tsecr);
846 		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
847 	}
848 
849 	/* Swap the send and the receive. */
850 	rep.th.dest    = th->source;
851 	rep.th.source  = th->dest;
852 	rep.th.doff    = arg.iov[0].iov_len / 4;
853 	rep.th.seq     = htonl(seq);
854 	rep.th.ack_seq = htonl(ack);
855 	rep.th.ack     = 1;
856 	rep.th.window  = htons(win);
857 
858 #ifdef CONFIG_TCP_MD5SIG
859 	if (key) {
860 		int offset = (tsecr) ? 3 : 0;
861 
862 		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
863 					  (TCPOPT_NOP << 16) |
864 					  (TCPOPT_MD5SIG << 8) |
865 					  TCPOLEN_MD5SIG);
866 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
867 		rep.th.doff = arg.iov[0].iov_len/4;
868 
869 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
870 				    key, ip_hdr(skb)->saddr,
871 				    ip_hdr(skb)->daddr, &rep.th);
872 	}
873 #endif
874 	arg.flags = reply_flags;
875 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
876 				      ip_hdr(skb)->saddr, /* XXX */
877 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
878 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
879 	if (oif)
880 		arg.bound_dev_if = oif;
881 	arg.tos = tos;
882 	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
883 	local_bh_disable();
884 	ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
885 	ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
886 			   inet_twsk(sk)->tw_mark : sk->sk_mark;
887 	ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
888 			   inet_twsk(sk)->tw_priority : sk->sk_priority;
889 	transmit_time = tcp_transmit_time(sk);
890 	ip_send_unicast_reply(ctl_sk,
891 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
892 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
893 			      &arg, arg.iov[0].iov_len,
894 			      transmit_time);
895 
896 	ctl_sk->sk_mark = 0;
897 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
898 	local_bh_enable();
899 }
900 
901 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
902 {
903 	struct inet_timewait_sock *tw = inet_twsk(sk);
904 	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
905 
906 	tcp_v4_send_ack(sk, skb,
907 			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
908 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
909 			tcp_time_stamp_raw() + tcptw->tw_ts_offset,
910 			tcptw->tw_ts_recent,
911 			tw->tw_bound_dev_if,
912 			tcp_twsk_md5_key(tcptw),
913 			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
914 			tw->tw_tos
915 			);
916 
917 	inet_twsk_put(tw);
918 }
919 
920 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
921 				  struct request_sock *req)
922 {
923 	const union tcp_md5_addr *addr;
924 	int l3index;
925 
926 	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
927 	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
928 	 */
929 	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
930 					     tcp_sk(sk)->snd_nxt;
931 
932 	/* RFC 7323 2.3
933 	 * The window field (SEG.WND) of every outgoing segment, with the
934 	 * exception of <SYN> segments, MUST be right-shifted by
935 	 * Rcv.Wind.Shift bits:
936 	 */
937 	addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
938 	l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
939 	tcp_v4_send_ack(sk, skb, seq,
940 			tcp_rsk(req)->rcv_nxt,
941 			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
942 			tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
943 			req->ts_recent,
944 			0,
945 			tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
946 			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
947 			ip_hdr(skb)->tos);
948 }
949 
950 /*
951  *	Send a SYN-ACK after having received a SYN.
952  *	This still operates on a request_sock only, not on a big
953  *	socket.
954  */
955 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
956 			      struct flowi *fl,
957 			      struct request_sock *req,
958 			      struct tcp_fastopen_cookie *foc,
959 			      enum tcp_synack_type synack_type)
960 {
961 	const struct inet_request_sock *ireq = inet_rsk(req);
962 	struct flowi4 fl4;
963 	int err = -1;
964 	struct sk_buff *skb;
965 
966 	/* First, grab a route. */
967 	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
968 		return -1;
969 
970 	skb = tcp_make_synack(sk, dst, req, foc, synack_type);
971 
972 	if (skb) {
973 		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
974 
975 		rcu_read_lock();
976 		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
977 					    ireq->ir_rmt_addr,
978 					    rcu_dereference(ireq->ireq_opt));
979 		rcu_read_unlock();
980 		err = net_xmit_eval(err);
981 	}
982 
983 	return err;
984 }
985 
986 /*
987  *	IPv4 request_sock destructor.
988  */
989 static void tcp_v4_reqsk_destructor(struct request_sock *req)
990 {
991 	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
992 }
993 
994 #ifdef CONFIG_TCP_MD5SIG
995 /*
996  * RFC2385 MD5 checksumming requires a mapping of
997  * IP address->MD5 Key.
998  * We need to maintain these in the sk structure.
999  */
1000 
/* Static branch key, defined in its "false" state and exported for use
 * outside this file.
 * NOTE(review): presumably flipped once the first MD5 key is installed so
 * that the lookup helpers can be skipped cheaply - confirm against the
 * users of tcp_md5_needed.
 */
1001 DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
1002 EXPORT_SYMBOL(tcp_md5_needed);
1003 
1004 /* Find the Key structure for an address.  */
1005 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1006 					   const union tcp_md5_addr *addr,
1007 					   int family)
1008 {
1009 	const struct tcp_sock *tp = tcp_sk(sk);
1010 	struct tcp_md5sig_key *key;
1011 	const struct tcp_md5sig_info *md5sig;
1012 	__be32 mask;
1013 	struct tcp_md5sig_key *best_match = NULL;
1014 	bool match;
1015 
1016 	/* caller either holds rcu_read_lock() or socket lock */
1017 	md5sig = rcu_dereference_check(tp->md5sig_info,
1018 				       lockdep_sock_is_held(sk));
1019 	if (!md5sig)
1020 		return NULL;
1021 
1022 	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
1023 		if (key->family != family)
1024 			continue;
1025 		if (key->l3index && key->l3index != l3index)
1026 			continue;
1027 		if (family == AF_INET) {
1028 			mask = inet_make_mask(key->prefixlen);
1029 			match = (key->addr.a4.s_addr & mask) ==
1030 				(addr->a4.s_addr & mask);
1031 #if IS_ENABLED(CONFIG_IPV6)
1032 		} else if (family == AF_INET6) {
1033 			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1034 						  key->prefixlen);
1035 #endif
1036 		} else {
1037 			match = false;
1038 		}
1039 
1040 		if (match && (!best_match ||
1041 			      key->prefixlen > best_match->prefixlen))
1042 			best_match = key;
1043 	}
1044 	return best_match;
1045 }
1046 EXPORT_SYMBOL(__tcp_md5_do_lookup);
1047 
1048 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1049 						      const union tcp_md5_addr *addr,
1050 						      int family, u8 prefixlen,
1051 						      int l3index)
1052 {
1053 	const struct tcp_sock *tp = tcp_sk(sk);
1054 	struct tcp_md5sig_key *key;
1055 	unsigned int size = sizeof(struct in_addr);
1056 	const struct tcp_md5sig_info *md5sig;
1057 
1058 	/* caller either holds rcu_read_lock() or socket lock */
1059 	md5sig = rcu_dereference_check(tp->md5sig_info,
1060 				       lockdep_sock_is_held(sk));
1061 	if (!md5sig)
1062 		return NULL;
1063 #if IS_ENABLED(CONFIG_IPV6)
1064 	if (family == AF_INET6)
1065 		size = sizeof(struct in6_addr);
1066 #endif
1067 	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
1068 		if (key->family != family)
1069 			continue;
1070 		if (key->l3index && key->l3index != l3index)
1071 			continue;
1072 		if (!memcmp(&key->addr, addr, size) &&
1073 		    key->prefixlen == prefixlen)
1074 			return key;
1075 	}
1076 	return NULL;
1077 }
1078 
1079 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1080 					 const struct sock *addr_sk)
1081 {
1082 	const union tcp_md5_addr *addr;
1083 	int l3index;
1084 
1085 	l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1086 						 addr_sk->sk_bound_dev_if);
1087 	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1088 	return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1089 }
1090 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1091 
1092 /* This can be called on a newly created socket, from other files */
1093 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1094 		   int family, u8 prefixlen, int l3index,
1095 		   const u8 *newkey, u8 newkeylen, gfp_t gfp)
1096 {
1097 	/* Add Key to the list */
1098 	struct tcp_md5sig_key *key;
1099 	struct tcp_sock *tp = tcp_sk(sk);
1100 	struct tcp_md5sig_info *md5sig;
1101 
1102 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
1103 	if (key) {
1104 		/* Pre-existing entry - just update that one. */
1105 		memcpy(key->key, newkey, newkeylen);
1106 		key->keylen = newkeylen;
1107 		return 0;
1108 	}
1109 
1110 	md5sig = rcu_dereference_protected(tp->md5sig_info,
1111 					   lockdep_sock_is_held(sk));
1112 	if (!md5sig) {
1113 		md5sig = kmalloc(sizeof(*md5sig), gfp);
1114 		if (!md5sig)
1115 			return -ENOMEM;
1116 
1117 		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1118 		INIT_HLIST_HEAD(&md5sig->head);
1119 		rcu_assign_pointer(tp->md5sig_info, md5sig);
1120 	}
1121 
1122 	key = sock_kmalloc(sk, sizeof(*key), gfp);
1123 	if (!key)
1124 		return -ENOMEM;
1125 	if (!tcp_alloc_md5sig_pool()) {
1126 		sock_kfree_s(sk, key, sizeof(*key));
1127 		return -ENOMEM;
1128 	}
1129 
1130 	memcpy(key->key, newkey, newkeylen);
1131 	key->keylen = newkeylen;
1132 	key->family = family;
1133 	key->prefixlen = prefixlen;
1134 	key->l3index = l3index;
1135 	memcpy(&key->addr, addr,
1136 	       (family == AF_INET6) ? sizeof(struct in6_addr) :
1137 				      sizeof(struct in_addr));
1138 	hlist_add_head_rcu(&key->node, &md5sig->head);
1139 	return 0;
1140 }
1141 EXPORT_SYMBOL(tcp_md5_do_add);
1142 
1143 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1144 		   u8 prefixlen, int l3index)
1145 {
1146 	struct tcp_md5sig_key *key;
1147 
1148 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
1149 	if (!key)
1150 		return -ENOENT;
1151 	hlist_del_rcu(&key->node);
1152 	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1153 	kfree_rcu(key, rcu);
1154 	return 0;
1155 }
1156 EXPORT_SYMBOL(tcp_md5_do_del);
1157 
1158 static void tcp_clear_md5_list(struct sock *sk)
1159 {
1160 	struct tcp_sock *tp = tcp_sk(sk);
1161 	struct tcp_md5sig_key *key;
1162 	struct hlist_node *n;
1163 	struct tcp_md5sig_info *md5sig;
1164 
1165 	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1166 
1167 	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1168 		hlist_del_rcu(&key->node);
1169 		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1170 		kfree_rcu(key, rcu);
1171 	}
1172 }
1173 
1174 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1175 				 char __user *optval, int optlen)
1176 {
1177 	struct tcp_md5sig cmd;
1178 	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1179 	const union tcp_md5_addr *addr;
1180 	u8 prefixlen = 32;
1181 	int l3index = 0;
1182 
1183 	if (optlen < sizeof(cmd))
1184 		return -EINVAL;
1185 
1186 	if (copy_from_user(&cmd, optval, sizeof(cmd)))
1187 		return -EFAULT;
1188 
1189 	if (sin->sin_family != AF_INET)
1190 		return -EINVAL;
1191 
1192 	if (optname == TCP_MD5SIG_EXT &&
1193 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1194 		prefixlen = cmd.tcpm_prefixlen;
1195 		if (prefixlen > 32)
1196 			return -EINVAL;
1197 	}
1198 
1199 	if (optname == TCP_MD5SIG_EXT &&
1200 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1201 		struct net_device *dev;
1202 
1203 		rcu_read_lock();
1204 		dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1205 		if (dev && netif_is_l3_master(dev))
1206 			l3index = dev->ifindex;
1207 
1208 		rcu_read_unlock();
1209 
1210 		/* ok to reference set/not set outside of rcu;
1211 		 * right now device MUST be an L3 master
1212 		 */
1213 		if (!dev || !l3index)
1214 			return -EINVAL;
1215 	}
1216 
1217 	addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1218 
1219 	if (!cmd.tcpm_keylen)
1220 		return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index);
1221 
1222 	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1223 		return -EINVAL;
1224 
1225 	return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index,
1226 			      cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
1227 }
1228 
1229 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1230 				   __be32 daddr, __be32 saddr,
1231 				   const struct tcphdr *th, int nbytes)
1232 {
1233 	struct tcp4_pseudohdr *bp;
1234 	struct scatterlist sg;
1235 	struct tcphdr *_th;
1236 
1237 	bp = hp->scratch;
1238 	bp->saddr = saddr;
1239 	bp->daddr = daddr;
1240 	bp->pad = 0;
1241 	bp->protocol = IPPROTO_TCP;
1242 	bp->len = cpu_to_be16(nbytes);
1243 
1244 	_th = (struct tcphdr *)(bp + 1);
1245 	memcpy(_th, th, sizeof(*th));
1246 	_th->check = 0;
1247 
1248 	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1249 	ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1250 				sizeof(*bp) + sizeof(*th));
1251 	return crypto_ahash_update(hp->md5_req);
1252 }
1253 
1254 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1255 			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
1256 {
1257 	struct tcp_md5sig_pool *hp;
1258 	struct ahash_request *req;
1259 
1260 	hp = tcp_get_md5sig_pool();
1261 	if (!hp)
1262 		goto clear_hash_noput;
1263 	req = hp->md5_req;
1264 
1265 	if (crypto_ahash_init(req))
1266 		goto clear_hash;
1267 	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1268 		goto clear_hash;
1269 	if (tcp_md5_hash_key(hp, key))
1270 		goto clear_hash;
1271 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1272 	if (crypto_ahash_final(req))
1273 		goto clear_hash;
1274 
1275 	tcp_put_md5sig_pool();
1276 	return 0;
1277 
1278 clear_hash:
1279 	tcp_put_md5sig_pool();
1280 clear_hash_noput:
1281 	memset(md5_hash, 0, 16);
1282 	return 1;
1283 }
1284 
1285 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1286 			const struct sock *sk,
1287 			const struct sk_buff *skb)
1288 {
1289 	struct tcp_md5sig_pool *hp;
1290 	struct ahash_request *req;
1291 	const struct tcphdr *th = tcp_hdr(skb);
1292 	__be32 saddr, daddr;
1293 
1294 	if (sk) { /* valid for establish/request sockets */
1295 		saddr = sk->sk_rcv_saddr;
1296 		daddr = sk->sk_daddr;
1297 	} else {
1298 		const struct iphdr *iph = ip_hdr(skb);
1299 		saddr = iph->saddr;
1300 		daddr = iph->daddr;
1301 	}
1302 
1303 	hp = tcp_get_md5sig_pool();
1304 	if (!hp)
1305 		goto clear_hash_noput;
1306 	req = hp->md5_req;
1307 
1308 	if (crypto_ahash_init(req))
1309 		goto clear_hash;
1310 
1311 	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1312 		goto clear_hash;
1313 	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1314 		goto clear_hash;
1315 	if (tcp_md5_hash_key(hp, key))
1316 		goto clear_hash;
1317 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1318 	if (crypto_ahash_final(req))
1319 		goto clear_hash;
1320 
1321 	tcp_put_md5sig_pool();
1322 	return 0;
1323 
1324 clear_hash:
1325 	tcp_put_md5sig_pool();
1326 clear_hash_noput:
1327 	memset(md5_hash, 0, 16);
1328 	return 1;
1329 }
1330 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1331 
1332 #endif
1333 
1334 /* Called with rcu_read_lock() */
1335 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1336 				    const struct sk_buff *skb,
1337 				    int dif, int sdif)
1338 {
1339 #ifdef CONFIG_TCP_MD5SIG
1340 	/*
1341 	 * This gets called for each TCP segment that arrives
1342 	 * so we want to be efficient.
1343 	 * We have 3 drop cases:
1344 	 * o No MD5 hash and one expected.
1345 	 * o MD5 hash and we're not expecting one.
1346 	 * o MD5 hash and its wrong.
1347 	 */
1348 	const __u8 *hash_location = NULL;
1349 	struct tcp_md5sig_key *hash_expected;
1350 	const struct iphdr *iph = ip_hdr(skb);
1351 	const struct tcphdr *th = tcp_hdr(skb);
1352 	const union tcp_md5_addr *addr;
1353 	unsigned char newhash[16];
1354 	int genhash, l3index;
1355 
1356 	/* sdif set, means packet ingressed via a device
1357 	 * in an L3 domain and dif is set to the l3mdev
1358 	 */
1359 	l3index = sdif ? dif : 0;
1360 
1361 	addr = (union tcp_md5_addr *)&iph->saddr;
1362 	hash_expected = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1363 	hash_location = tcp_parse_md5sig_option(th);
1364 
1365 	/* We've parsed the options - do we have a hash? */
1366 	if (!hash_expected && !hash_location)
1367 		return false;
1368 
1369 	if (hash_expected && !hash_location) {
1370 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1371 		return true;
1372 	}
1373 
1374 	if (!hash_expected && hash_location) {
1375 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1376 		return true;
1377 	}
1378 
1379 	/* Okay, so this is hash_expected and hash_location -
1380 	 * so we need to calculate the checksum.
1381 	 */
1382 	genhash = tcp_v4_md5_hash_skb(newhash,
1383 				      hash_expected,
1384 				      NULL, skb);
1385 
1386 	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1387 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1388 		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s L3 index %d\n",
1389 				     &iph->saddr, ntohs(th->source),
1390 				     &iph->daddr, ntohs(th->dest),
1391 				     genhash ? " tcp_v4_calc_md5_hash failed"
1392 				     : "", l3index);
1393 		return true;
1394 	}
1395 	return false;
1396 #endif
1397 	return false;
1398 }
1399 
1400 static void tcp_v4_init_req(struct request_sock *req,
1401 			    const struct sock *sk_listener,
1402 			    struct sk_buff *skb)
1403 {
1404 	struct inet_request_sock *ireq = inet_rsk(req);
1405 	struct net *net = sock_net(sk_listener);
1406 
1407 	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1408 	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1409 	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1410 }
1411 
1412 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1413 					  struct flowi *fl,
1414 					  const struct request_sock *req)
1415 {
1416 	return inet_csk_route_req(sk, &fl->u.ip4, req);
1417 }
1418 
1419 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1420 	.family		=	PF_INET,
1421 	.obj_size	=	sizeof(struct tcp_request_sock),
1422 	.rtx_syn_ack	=	tcp_rtx_synack,
1423 	.send_ack	=	tcp_v4_reqsk_send_ack,
1424 	.destructor	=	tcp_v4_reqsk_destructor,
1425 	.send_reset	=	tcp_v4_send_reset,
1426 	.syn_ack_timeout =	tcp_syn_ack_timeout,
1427 };
1428 
1429 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1430 	.mss_clamp	=	TCP_MSS_DEFAULT,
1431 #ifdef CONFIG_TCP_MD5SIG
1432 	.req_md5_lookup	=	tcp_v4_md5_lookup,
1433 	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1434 #endif
1435 	.init_req	=	tcp_v4_init_req,
1436 #ifdef CONFIG_SYN_COOKIES
1437 	.cookie_init_seq =	cookie_v4_init_sequence,
1438 #endif
1439 	.route_req	=	tcp_v4_route_req,
1440 	.init_seq	=	tcp_v4_init_seq,
1441 	.init_ts_off	=	tcp_v4_init_ts_off,
1442 	.send_synack	=	tcp_v4_send_synack,
1443 };
1444 
1445 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1446 {
1447 	/* Never answer to SYNs send to broadcast or multicast */
1448 	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1449 		goto drop;
1450 
1451 	return tcp_conn_request(&tcp_request_sock_ops,
1452 				&tcp_request_sock_ipv4_ops, sk, skb);
1453 
1454 drop:
1455 	tcp_listendrop(sk);
1456 	return 0;
1457 }
1458 EXPORT_SYMBOL(tcp_v4_conn_request);
1459 
1460 
1461 /*
1462  * The three way handshake has completed - we got a valid synack -
1463  * now create the new socket.
1464  */
/* Create the child socket for request @req on listener @sk.
 *
 * @skb:        segment that triggered the creation; its IP header supplies
 *              mc_index, ttl and tos of the new socket
 * @dst:        route to use, or NULL to have one looked up here
 * @req_unhash: request whose hash entry inet_ehash_nolisten() is given
 * @own_req:    out parameter, receives the result of inet_ehash_nolisten()
 *
 * Returns the new socket, or NULL after accounting a listen drop.
 */
1465 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1466 				  struct request_sock *req,
1467 				  struct dst_entry *dst,
1468 				  struct request_sock *req_unhash,
1469 				  bool *own_req)
1470 {
1471 	struct inet_request_sock *ireq;
1472 	struct inet_sock *newinet;
1473 	struct tcp_sock *newtp;
1474 	struct sock *newsk;
1475 #ifdef CONFIG_TCP_MD5SIG
1476 	const union tcp_md5_addr *addr;
1477 	struct tcp_md5sig_key *key;
1478 	int l3index;
1479 #endif
1480 	struct ip_options_rcu *inet_opt;
1481 
1482 	if (sk_acceptq_is_full(sk))
1483 		goto exit_overflow;
1484 
1485 	newsk = tcp_create_openreq_child(sk, req, skb);
1486 	if (!newsk)
1487 		goto exit_nonewsk;
1488 
1489 	newsk->sk_gso_type = SKB_GSO_TCPV4;
1490 	inet_sk_rx_dst_set(newsk, skb);
1491 
	/* Copy addressing, bound device and IP options from the request. */
1492 	newtp		      = tcp_sk(newsk);
1493 	newinet		      = inet_sk(newsk);
1494 	ireq		      = inet_rsk(req);
1495 	sk_daddr_set(newsk, ireq->ir_rmt_addr);
1496 	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1497 	newsk->sk_bound_dev_if = ireq->ir_iif;
1498 	newinet->inet_saddr   = ireq->ir_loc_addr;
1499 	inet_opt	      = rcu_dereference(ireq->ireq_opt);
	/* From here on the options are referenced by both req and newsk;
	 * exactly one of the two pointers is cleared before returning.
	 */
1500 	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1501 	newinet->mc_index     = inet_iif(skb);
1502 	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1503 	newinet->rcv_tos      = ip_hdr(skb)->tos;
1504 	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1505 	if (inet_opt)
1506 		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1507 	newinet->inet_id = prandom_u32();
1508 
1509 	if (!dst) {
1510 		dst = inet_csk_route_child_sock(sk, newsk, req);
1511 		if (!dst)
1512 			goto put_and_exit;
1513 	} else {
1514 		/* syncookie case : see end of cookie_v4_check() */
1515 	}
1516 	sk_setup_caps(newsk, dst);
1517 
1518 	tcp_ca_openreq_child(newsk, dst);
1519 
1520 	tcp_sync_mss(newsk, dst_mtu(dst));
1521 	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1522 
1523 	tcp_initialize_rcv_mss(newsk);
1524 
1525 #ifdef CONFIG_TCP_MD5SIG
1526 	l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1527 	/* Copy over the MD5 key from the original socket */
1528 	addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1529 	key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1530 	if (key) {
1531 		/*
1532 		 * We're using one, so create a matching key
1533 		 * on the newsk structure. If we fail to get
1534 		 * memory, then we end up not copying the key
1535 		 * across. Shucks.
1536 		 */
		/* The child always gets a host key (prefix length 32),
		 * whatever prefix the listener's key was configured with.
		 */
1537 		tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index,
1538 			       key->key, key->keylen, GFP_ATOMIC);
1539 		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1540 	}
1541 #endif
1542 
1543 	if (__inet_inherit_port(sk, newsk) < 0)
1544 		goto put_and_exit;
1545 	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
	/* Settle who keeps the IP options: the child when *own_req is set,
	 * the request otherwise.
	 */
1546 	if (likely(*own_req)) {
1547 		tcp_move_syn(newtp, req);
1548 		ireq->ireq_opt = NULL;
1549 	} else {
1550 		newinet->inet_opt = NULL;
1551 	}
1552 	return newsk;
1553 
	/* Failure exits: each label falls through into the next one. */
1554 exit_overflow:
1555 	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1556 exit_nonewsk:
1557 	dst_release(dst);
1558 exit:
1559 	tcp_listendrop(sk);
1560 	return NULL;
1561 put_and_exit:
	/* Detach the options from the child before it is torn down; the
	 * request still points at them.
	 */
1562 	newinet->inet_opt = NULL;
1563 	inet_csk_prepare_forced_close(newsk);
1564 	tcp_done(newsk);
1565 	goto exit;
1566 }
1567 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1568 
/* With CONFIG_SYN_COOKIES, pass every non-SYN segment to
 * cookie_v4_check() and return its result.  In all other cases the
 * socket is returned unchanged.
 */
static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
{
#ifdef CONFIG_SYN_COOKIES
	if (!tcp_hdr(skb)->syn)
		return cookie_v4_check(sk, skb);
#endif
	return sk;
}
1579 
1580 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1581 			 struct tcphdr *th, u32 *cookie)
1582 {
1583 	u16 mss = 0;
1584 #ifdef CONFIG_SYN_COOKIES
1585 	mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1586 				    &tcp_request_sock_ipv4_ops, sk, th);
1587 	if (mss) {
1588 		*cookie = __cookie_v4_init_sequence(iph, th, &mss);
1589 		tcp_synq_overflow(sk);
1590 	}
1591 #endif
1592 	return mss;
1593 }
1594 
1595 /* The socket must have it's spinlock held when we get
1596  * here, unless it is a TCP_LISTEN socket.
1597  *
1598  * We have a potential double-lock case here, so even when
1599  * doing backlog processing we use the BH locking scheme.
1600  * This is because we cannot sleep with the original spinlock
1601  * held.
1602  */
1603 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1604 {
1605 	struct sock *rsk;
1606 
1607 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1608 		struct dst_entry *dst = sk->sk_rx_dst;
1609 
1610 		sock_rps_save_rxhash(sk, skb);
1611 		sk_mark_napi_id(sk, skb);
1612 		if (dst) {
1613 			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1614 			    !dst->ops->check(dst, 0)) {
1615 				dst_release(dst);
1616 				sk->sk_rx_dst = NULL;
1617 			}
1618 		}
1619 		tcp_rcv_established(sk, skb);
1620 		return 0;
1621 	}
1622 
1623 	if (tcp_checksum_complete(skb))
1624 		goto csum_err;
1625 
1626 	if (sk->sk_state == TCP_LISTEN) {
1627 		struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1628 
1629 		if (!nsk)
1630 			goto discard;
1631 		if (nsk != sk) {
1632 			if (tcp_child_process(sk, nsk, skb)) {
1633 				rsk = nsk;
1634 				goto reset;
1635 			}
1636 			return 0;
1637 		}
1638 	} else
1639 		sock_rps_save_rxhash(sk, skb);
1640 
1641 	if (tcp_rcv_state_process(sk, skb)) {
1642 		rsk = sk;
1643 		goto reset;
1644 	}
1645 	return 0;
1646 
1647 reset:
1648 	tcp_v4_send_reset(rsk, skb);
1649 discard:
1650 	kfree_skb(skb);
1651 	/* Be careful here. If this function gets more complicated and
1652 	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1653 	 * might be destroyed here. This current version compiles correctly,
1654 	 * but you have been warned.
1655 	 */
1656 	return 0;
1657 
1658 csum_err:
1659 	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1660 	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1661 	goto discard;
1662 }
1663 EXPORT_SYMBOL(tcp_v4_do_rcv);
1664 
1665 int tcp_v4_early_demux(struct sk_buff *skb)
1666 {
1667 	const struct iphdr *iph;
1668 	const struct tcphdr *th;
1669 	struct sock *sk;
1670 
1671 	if (skb->pkt_type != PACKET_HOST)
1672 		return 0;
1673 
1674 	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1675 		return 0;
1676 
1677 	iph = ip_hdr(skb);
1678 	th = tcp_hdr(skb);
1679 
1680 	if (th->doff < sizeof(struct tcphdr) / 4)
1681 		return 0;
1682 
1683 	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1684 				       iph->saddr, th->source,
1685 				       iph->daddr, ntohs(th->dest),
1686 				       skb->skb_iif, inet_sdif(skb));
1687 	if (sk) {
1688 		skb->sk = sk;
1689 		skb->destructor = sock_edemux;
1690 		if (sk_fullsock(sk)) {
1691 			struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1692 
1693 			if (dst)
1694 				dst = dst_check(dst, 0);
1695 			if (dst &&
1696 			    inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1697 				skb_dst_set_noref(skb, dst);
1698 		}
1699 	}
1700 	return 0;
1701 }
1702 
1703 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1704 {
1705 	u32 limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf);
1706 	struct skb_shared_info *shinfo;
1707 	const struct tcphdr *th;
1708 	struct tcphdr *thtail;
1709 	struct sk_buff *tail;
1710 	unsigned int hdrlen;
1711 	bool fragstolen;
1712 	u32 gso_segs;
1713 	int delta;
1714 
1715 	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1716 	 * we can fix skb->truesize to its real value to avoid future drops.
1717 	 * This is valid because skb is not yet charged to the socket.
1718 	 * It has been noticed pure SACK packets were sometimes dropped
1719 	 * (if cooked by drivers without copybreak feature).
1720 	 */
1721 	skb_condense(skb);
1722 
1723 	skb_dst_drop(skb);
1724 
1725 	if (unlikely(tcp_checksum_complete(skb))) {
1726 		bh_unlock_sock(sk);
1727 		__TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1728 		__TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1729 		return true;
1730 	}
1731 
1732 	/* Attempt coalescing to last skb in backlog, even if we are
1733 	 * above the limits.
1734 	 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1735 	 */
1736 	th = (const struct tcphdr *)skb->data;
1737 	hdrlen = th->doff * 4;
1738 	shinfo = skb_shinfo(skb);
1739 
1740 	if (!shinfo->gso_size)
1741 		shinfo->gso_size = skb->len - hdrlen;
1742 
1743 	if (!shinfo->gso_segs)
1744 		shinfo->gso_segs = 1;
1745 
1746 	tail = sk->sk_backlog.tail;
1747 	if (!tail)
1748 		goto no_coalesce;
1749 	thtail = (struct tcphdr *)tail->data;
1750 
1751 	if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1752 	    TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1753 	    ((TCP_SKB_CB(tail)->tcp_flags |
1754 	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1755 	    !((TCP_SKB_CB(tail)->tcp_flags &
1756 	      TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1757 	    ((TCP_SKB_CB(tail)->tcp_flags ^
1758 	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1759 #ifdef CONFIG_TLS_DEVICE
1760 	    tail->decrypted != skb->decrypted ||
1761 #endif
1762 	    thtail->doff != th->doff ||
1763 	    memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1764 		goto no_coalesce;
1765 
1766 	__skb_pull(skb, hdrlen);
1767 	if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1768 		thtail->window = th->window;
1769 
1770 		TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1771 
1772 		if (after(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))
1773 			TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1774 
1775 		/* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1776 		 * thtail->fin, so that the fast path in tcp_rcv_established()
1777 		 * is not entered if we append a packet with a FIN.
1778 		 * SYN, RST, URG are not present.
1779 		 * ACK is set on both packets.
1780 		 * PSH : we do not really care in TCP stack,
1781 		 *       at least for 'GRO' packets.
1782 		 */
1783 		thtail->fin |= th->fin;
1784 		TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1785 
1786 		if (TCP_SKB_CB(skb)->has_rxtstamp) {
1787 			TCP_SKB_CB(tail)->has_rxtstamp = true;
1788 			tail->tstamp = skb->tstamp;
1789 			skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1790 		}
1791 
1792 		/* Not as strict as GRO. We only need to carry mss max value */
1793 		skb_shinfo(tail)->gso_size = max(shinfo->gso_size,
1794 						 skb_shinfo(tail)->gso_size);
1795 
1796 		gso_segs = skb_shinfo(tail)->gso_segs + shinfo->gso_segs;
1797 		skb_shinfo(tail)->gso_segs = min_t(u32, gso_segs, 0xFFFF);
1798 
1799 		sk->sk_backlog.len += delta;
1800 		__NET_INC_STATS(sock_net(sk),
1801 				LINUX_MIB_TCPBACKLOGCOALESCE);
1802 		kfree_skb_partial(skb, fragstolen);
1803 		return false;
1804 	}
1805 	__skb_push(skb, hdrlen);
1806 
1807 no_coalesce:
1808 	/* Only socket owner can try to collapse/prune rx queues
1809 	 * to reduce memory overhead, so add a little headroom here.
1810 	 * Few sockets backlog are possibly concurrently non empty.
1811 	 */
1812 	limit += 64*1024;
1813 
1814 	if (unlikely(sk_add_backlog(sk, skb, limit))) {
1815 		bh_unlock_sock(sk);
1816 		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1817 		return true;
1818 	}
1819 	return false;
1820 }
1821 EXPORT_SYMBOL(tcp_add_backlog);
1822 
1823 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1824 {
1825 	struct tcphdr *th = (struct tcphdr *)skb->data;
1826 
1827 	return sk_filter_trim_cap(sk, skb, th->doff * 4);
1828 }
1829 EXPORT_SYMBOL(tcp_filter);
1830 
1831 static void tcp_v4_restore_cb(struct sk_buff *skb)
1832 {
1833 	memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1834 		sizeof(struct inet_skb_parm));
1835 }
1836 
1837 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1838 			   const struct tcphdr *th)
1839 {
1840 	/* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
1841 	 * barrier() makes sure compiler wont play fool^Waliasing games.
1842 	 */
1843 	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1844 		sizeof(struct inet_skb_parm));
1845 	barrier();
1846 
1847 	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1848 	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1849 				    skb->len - th->doff * 4);
1850 	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1851 	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1852 	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1853 	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1854 	TCP_SKB_CB(skb)->sacked	 = 0;
1855 	TCP_SKB_CB(skb)->has_rxtstamp =
1856 			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1857 }
1858 
1859 /*
1860  *	From tcp_input.c
1861  */
1862 
/* Receive one IPv4 TCP segment: validate the header, find the socket it
 * belongs to and dispatch according to the socket state.
 *
 * Returns the result of tcp_v4_do_rcv() when the segment is processed
 * directly, and 0 on every other path (backlogged, handed to a child
 * socket, or discarded).
 */
1863 int tcp_v4_rcv(struct sk_buff *skb)
1864 {
1865 	struct net *net = dev_net(skb->dev);
1866 	struct sk_buff *skb_to_free;
1867 	int sdif = inet_sdif(skb);
1868 	int dif = inet_iif(skb);
1869 	const struct iphdr *iph;
1870 	const struct tcphdr *th;
1871 	bool refcounted;
1872 	struct sock *sk;
1873 	int ret;
1874 
1875 	if (skb->pkt_type != PACKET_HOST)
1876 		goto discard_it;
1877 
1878 	/* Count it even if it's bad */
1879 	__TCP_INC_STATS(net, TCP_MIB_INSEGS);
1880 
1881 	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1882 		goto discard_it;
1883 
1884 	th = (const struct tcphdr *)skb->data;
1885 
1886 	if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1887 		goto bad_packet;
	/* Second pull: this time for the full header including options. */
1888 	if (!pskb_may_pull(skb, th->doff * 4))
1889 		goto discard_it;
1890 
1891 	/* An explanation is required here, I think.
1892 	 * Packet length and doff are validated by header prediction,
1893 	 * provided case of th->doff==0 is eliminated.
1894 	 * So, we defer the checks. */
1895 
1896 	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1897 		goto csum_error;
1898 
	/* Header pointers are re-read after the pull calls above. */
1899 	th = (const struct tcphdr *)skb->data;
1900 	iph = ip_hdr(skb);
	/* Re-entered from the request-sock handling below when the listener
	 * has gone away or another CPU took over the request.
	 */
1901 lookup:
1902 	sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1903 			       th->dest, sdif, &refcounted);
1904 	if (!sk)
1905 		goto no_tcp_socket;
1906 
	/* Re-entered from do_time_wait with a listener when a SYN hits a
	 * TIME_WAIT socket.
	 */
1907 process:
1908 	if (sk->sk_state == TCP_TIME_WAIT)
1909 		goto do_time_wait;
1910 
1911 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
1912 		struct request_sock *req = inet_reqsk(sk);
1913 		bool req_stolen = false;
1914 		struct sock *nsk;
1915 
		/* From here on sk is the listener the request belongs to. */
1916 		sk = req->rsk_listener;
1917 		if (unlikely(tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))) {
1918 			sk_drops_add(sk, skb);
1919 			reqsk_put(req);
1920 			goto discard_it;
1921 		}
1922 		if (tcp_checksum_complete(skb)) {
1923 			reqsk_put(req);
1924 			goto csum_error;
1925 		}
		/* Listener is not listening any more: drop the request and
		 * redo the lookup.
		 */
1926 		if (unlikely(sk->sk_state != TCP_LISTEN)) {
1927 			inet_csk_reqsk_queue_drop_and_put(sk, req);
1928 			goto lookup;
1929 		}
1930 		/* We own a reference on the listener, increase it again
1931 		 * as we might lose it too soon.
1932 		 */
1933 		sock_hold(sk);
1934 		refcounted = true;
1935 		nsk = NULL;
1936 		if (!tcp_filter(sk, skb)) {
1937 			th = (const struct tcphdr *)skb->data;
1938 			iph = ip_hdr(skb);
1939 			tcp_v4_fill_cb(skb, iph, th);
1940 			nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
1941 		}
1942 		if (!nsk) {
1943 			reqsk_put(req);
1944 			if (req_stolen) {
1945 				/* Another cpu got exclusive access to req
1946 				 * and created a full blown socket.
1947 				 * Try to feed this packet to this socket
1948 				 * instead of discarding it.
1949 				 */
1950 				tcp_v4_restore_cb(skb);
1951 				sock_put(sk);
1952 				goto lookup;
1953 			}
1954 			goto discard_and_relse;
1955 		}
		/* tcp_check_req() returned the listener itself: release the
		 * request, undo tcp_v4_fill_cb() and continue with the
		 * regular processing below.
		 */
1956 		if (nsk == sk) {
1957 			reqsk_put(req);
1958 			tcp_v4_restore_cb(skb);
1959 		} else if (tcp_child_process(sk, nsk, skb)) {
1960 			tcp_v4_send_reset(nsk, skb);
1961 			goto discard_and_relse;
1962 		} else {
1963 			sock_put(sk);
1964 			return 0;
1965 		}
1966 	}
1967 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1968 		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
1969 		goto discard_and_relse;
1970 	}
1971 
1972 	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1973 		goto discard_and_relse;
1974 
1975 	if (tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))
1976 		goto discard_and_relse;
1977 
1978 	nf_reset_ct(skb);
1979 
1980 	if (tcp_filter(sk, skb))
1981 		goto discard_and_relse;
	/* Header pointers are re-read after the filter has run. */
1982 	th = (const struct tcphdr *)skb->data;
1983 	iph = ip_hdr(skb);
1984 	tcp_v4_fill_cb(skb, iph, th);
1985 
1986 	skb->dev = NULL;
1987 
	/* Listeners are processed without taking the socket lock. */
1988 	if (sk->sk_state == TCP_LISTEN) {
1989 		ret = tcp_v4_do_rcv(sk, skb);
1990 		goto put_and_return;
1991 	}
1992 
1993 	sk_incoming_cpu_update(sk);
1994 
1995 	bh_lock_sock_nested(sk);
1996 	tcp_segs_in(tcp_sk(sk), skb);
1997 	ret = 0;
	/* Process right away unless a user context owns the socket, in
	 * which case the segment goes to the backlog.
	 */
1998 	if (!sock_owned_by_user(sk)) {
1999 		skb_to_free = sk->sk_rx_skb_cache;
2000 		sk->sk_rx_skb_cache = NULL;
2001 		ret = tcp_v4_do_rcv(sk, skb);
2002 	} else {
		/* On rejection tcp_add_backlog() has already unlocked the
		 * socket, so jumping past bh_unlock_sock() is correct.
		 */
2003 		if (tcp_add_backlog(sk, skb))
2004 			goto discard_and_relse;
2005 		skb_to_free = NULL;
2006 	}
2007 	bh_unlock_sock(sk);
2008 	if (skb_to_free)
2009 		__kfree_skb(skb_to_free);
2010 
2011 put_and_return:
2012 	if (refcounted)
2013 		sock_put(sk);
2014 
2015 	return ret;
2016 
	/* No socket matched: a segment with a valid checksum is answered
	 * with a reset.
	 */
2017 no_tcp_socket:
2018 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2019 		goto discard_it;
2020 
2021 	tcp_v4_fill_cb(skb, iph, th);
2022 
	/* csum_error and bad_packet are jump targets placed inside this
	 * branch so that all paths share the statistics updates.
	 */
2023 	if (tcp_checksum_complete(skb)) {
2024 csum_error:
2025 		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2026 bad_packet:
2027 		__TCP_INC_STATS(net, TCP_MIB_INERRS);
2028 	} else {
2029 		tcp_v4_send_reset(NULL, skb);
2030 	}
2031 
2032 discard_it:
2033 	/* Discard frame. */
2034 	kfree_skb(skb);
2035 	return 0;
2036 
2037 discard_and_relse:
2038 	sk_drops_add(sk, skb);
2039 	if (refcounted)
2040 		sock_put(sk);
2041 	goto discard_it;
2042 
	/* sk is a timewait socket here; every exit from this section must
	 * drop the reference obtained by the lookup.
	 */
2043 do_time_wait:
2044 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2045 		inet_twsk_put(inet_twsk(sk));
2046 		goto discard_it;
2047 	}
2048 
2049 	tcp_v4_fill_cb(skb, iph, th);
2050 
2051 	if (tcp_checksum_complete(skb)) {
2052 		inet_twsk_put(inet_twsk(sk));
2053 		goto csum_error;
2054 	}
2055 	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2056 	case TCP_TW_SYN: {
2057 		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
2058 							&tcp_hashinfo, skb,
2059 							__tcp_hdrlen(th),
2060 							iph->saddr, th->source,
2061 							iph->daddr, th->dest,
2062 							inet_iif(skb),
2063 							sdif);
		/* A listener exists: retire the timewait socket and process
		 * the SYN against the listener instead.
		 */
2064 		if (sk2) {
2065 			inet_twsk_deschedule_put(inet_twsk(sk));
2066 			sk = sk2;
2067 			tcp_v4_restore_cb(skb);
2068 			refcounted = false;
2069 			goto process;
2070 		}
2071 	}
2072 		/* to ACK */
2073 		/* fall through */
2074 	case TCP_TW_ACK:
2075 		tcp_v4_timewait_ack(sk, skb);
2076 		break;
2077 	case TCP_TW_RST:
2078 		tcp_v4_send_reset(sk, skb);
2079 		inet_twsk_deschedule_put(inet_twsk(sk));
2080 		goto discard_it;
2081 	case TCP_TW_SUCCESS:;
2082 	}
2083 	goto discard_it;
2084 }
2085 
2086 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2087 	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
2088 	.twsk_unique	= tcp_twsk_unique,
2089 	.twsk_destructor= tcp_twsk_destructor,
2090 };
2091 
2092 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2093 {
2094 	struct dst_entry *dst = skb_dst(skb);
2095 
2096 	if (dst && dst_hold_safe(dst)) {
2097 		sk->sk_rx_dst = dst;
2098 		inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
2099 	}
2100 }
2101 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2102 
2103 const struct inet_connection_sock_af_ops ipv4_specific = {
2104 	.queue_xmit	   = ip_queue_xmit,
2105 	.send_check	   = tcp_v4_send_check,
2106 	.rebuild_header	   = inet_sk_rebuild_header,
2107 	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
2108 	.conn_request	   = tcp_v4_conn_request,
2109 	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
2110 	.net_header_len	   = sizeof(struct iphdr),
2111 	.setsockopt	   = ip_setsockopt,
2112 	.getsockopt	   = ip_getsockopt,
2113 	.addr2sockaddr	   = inet_csk_addr2sockaddr,
2114 	.sockaddr_len	   = sizeof(struct sockaddr_in),
2115 #ifdef CONFIG_COMPAT
2116 	.compat_setsockopt = compat_ip_setsockopt,
2117 	.compat_getsockopt = compat_ip_getsockopt,
2118 #endif
2119 	.mtu_reduced	   = tcp_v4_mtu_reduced,
2120 };
2121 EXPORT_SYMBOL(ipv4_specific);
2122 
2123 #ifdef CONFIG_TCP_MD5SIG
2124 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2125 	.md5_lookup		= tcp_v4_md5_lookup,
2126 	.calc_md5_hash		= tcp_v4_md5_hash_skb,
2127 	.md5_parse		= tcp_v4_parse_md5_keys,
2128 };
2129 #endif
2130 
2131 /* NOTE: A lot of things set to zero explicitly by call to
2132  *       sk_alloc() so need not be done here.
2133  */
2134 static int tcp_v4_init_sock(struct sock *sk)
2135 {
2136 	struct inet_connection_sock *icsk = inet_csk(sk);
2137 
2138 	tcp_init_sock(sk);
2139 
2140 	icsk->icsk_af_ops = &ipv4_specific;
2141 
2142 #ifdef CONFIG_TCP_MD5SIG
2143 	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2144 #endif
2145 
2146 	return 0;
2147 }
2148 
2149 void tcp_v4_destroy_sock(struct sock *sk)
2150 {
2151 	struct tcp_sock *tp = tcp_sk(sk);
2152 
2153 	trace_tcp_destroy_sock(sk);
2154 
2155 	tcp_clear_xmit_timers(sk);
2156 
2157 	tcp_cleanup_congestion_control(sk);
2158 
2159 	tcp_cleanup_ulp(sk);
2160 
2161 	/* Cleanup up the write buffer. */
2162 	tcp_write_queue_purge(sk);
2163 
2164 	/* Check if we want to disable active TFO */
2165 	tcp_fastopen_active_disable_ofo_check(sk);
2166 
2167 	/* Cleans up our, hopefully empty, out_of_order_queue. */
2168 	skb_rbtree_purge(&tp->out_of_order_queue);
2169 
2170 #ifdef CONFIG_TCP_MD5SIG
2171 	/* Clean up the MD5 key list, if any */
2172 	if (tp->md5sig_info) {
2173 		tcp_clear_md5_list(sk);
2174 		kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2175 		tp->md5sig_info = NULL;
2176 	}
2177 #endif
2178 
2179 	/* Clean up a referenced TCP bind bucket. */
2180 	if (inet_csk(sk)->icsk_bind_hash)
2181 		inet_put_port(sk);
2182 
2183 	BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2184 
2185 	/* If socket is aborted during connect operation */
2186 	tcp_free_fastopen_req(tp);
2187 	tcp_fastopen_destroy_cipher(sk);
2188 	tcp_saved_syn_free(tp);
2189 
2190 	sk_sockets_allocated_dec(sk);
2191 }
2192 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2193 
2194 #ifdef CONFIG_PROC_FS
2195 /* Proc filesystem TCP sock list dumping. */
2196 
2197 /*
2198  * Get next listener socket follow cur.  If cur is NULL, get first socket
2199  * starting from bucket given in st->bucket; when st->bucket is zero the
2200  * very first socket in the hash table is returned.
2201  */
2202 static void *listening_get_next(struct seq_file *seq, void *cur)
2203 {
2204 	struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2205 	struct tcp_iter_state *st = seq->private;
2206 	struct net *net = seq_file_net(seq);
2207 	struct inet_listen_hashbucket *ilb;
2208 	struct hlist_nulls_node *node;
2209 	struct sock *sk = cur;
2210 
2211 	if (!sk) {
2212 get_head:
2213 		ilb = &tcp_hashinfo.listening_hash[st->bucket];
2214 		spin_lock(&ilb->lock);
2215 		sk = sk_nulls_head(&ilb->nulls_head);
2216 		st->offset = 0;
2217 		goto get_sk;
2218 	}
2219 	ilb = &tcp_hashinfo.listening_hash[st->bucket];
2220 	++st->num;
2221 	++st->offset;
2222 
2223 	sk = sk_nulls_next(sk);
2224 get_sk:
2225 	sk_nulls_for_each_from(sk, node) {
2226 		if (!net_eq(sock_net(sk), net))
2227 			continue;
2228 		if (sk->sk_family == afinfo->family)
2229 			return sk;
2230 	}
2231 	spin_unlock(&ilb->lock);
2232 	st->offset = 0;
2233 	if (++st->bucket < INET_LHTABLE_SIZE)
2234 		goto get_head;
2235 	return NULL;
2236 }
2237 
2238 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2239 {
2240 	struct tcp_iter_state *st = seq->private;
2241 	void *rc;
2242 
2243 	st->bucket = 0;
2244 	st->offset = 0;
2245 	rc = listening_get_next(seq, NULL);
2246 
2247 	while (rc && *pos) {
2248 		rc = listening_get_next(seq, rc);
2249 		--*pos;
2250 	}
2251 	return rc;
2252 }
2253 
2254 static inline bool empty_bucket(const struct tcp_iter_state *st)
2255 {
2256 	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2257 }
2258 
2259 /*
2260  * Get first established socket starting from bucket given in st->bucket.
2261  * If st->bucket is zero, the very first socket in the hash is returned.
2262  */
2263 static void *established_get_first(struct seq_file *seq)
2264 {
2265 	struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2266 	struct tcp_iter_state *st = seq->private;
2267 	struct net *net = seq_file_net(seq);
2268 	void *rc = NULL;
2269 
2270 	st->offset = 0;
2271 	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2272 		struct sock *sk;
2273 		struct hlist_nulls_node *node;
2274 		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2275 
2276 		/* Lockless fast path for the common case of empty buckets */
2277 		if (empty_bucket(st))
2278 			continue;
2279 
2280 		spin_lock_bh(lock);
2281 		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2282 			if (sk->sk_family != afinfo->family ||
2283 			    !net_eq(sock_net(sk), net)) {
2284 				continue;
2285 			}
2286 			rc = sk;
2287 			goto out;
2288 		}
2289 		spin_unlock_bh(lock);
2290 	}
2291 out:
2292 	return rc;
2293 }
2294 
2295 static void *established_get_next(struct seq_file *seq, void *cur)
2296 {
2297 	struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2298 	struct sock *sk = cur;
2299 	struct hlist_nulls_node *node;
2300 	struct tcp_iter_state *st = seq->private;
2301 	struct net *net = seq_file_net(seq);
2302 
2303 	++st->num;
2304 	++st->offset;
2305 
2306 	sk = sk_nulls_next(sk);
2307 
2308 	sk_nulls_for_each_from(sk, node) {
2309 		if (sk->sk_family == afinfo->family &&
2310 		    net_eq(sock_net(sk), net))
2311 			return sk;
2312 	}
2313 
2314 	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2315 	++st->bucket;
2316 	return established_get_first(seq);
2317 }
2318 
2319 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2320 {
2321 	struct tcp_iter_state *st = seq->private;
2322 	void *rc;
2323 
2324 	st->bucket = 0;
2325 	rc = established_get_first(seq);
2326 
2327 	while (rc && pos) {
2328 		rc = established_get_next(seq, rc);
2329 		--pos;
2330 	}
2331 	return rc;
2332 }
2333 
2334 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2335 {
2336 	void *rc;
2337 	struct tcp_iter_state *st = seq->private;
2338 
2339 	st->state = TCP_SEQ_STATE_LISTENING;
2340 	rc	  = listening_get_idx(seq, &pos);
2341 
2342 	if (!rc) {
2343 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2344 		rc	  = established_get_idx(seq, pos);
2345 	}
2346 
2347 	return rc;
2348 }
2349 
2350 static void *tcp_seek_last_pos(struct seq_file *seq)
2351 {
2352 	struct tcp_iter_state *st = seq->private;
2353 	int offset = st->offset;
2354 	int orig_num = st->num;
2355 	void *rc = NULL;
2356 
2357 	switch (st->state) {
2358 	case TCP_SEQ_STATE_LISTENING:
2359 		if (st->bucket >= INET_LHTABLE_SIZE)
2360 			break;
2361 		st->state = TCP_SEQ_STATE_LISTENING;
2362 		rc = listening_get_next(seq, NULL);
2363 		while (offset-- && rc)
2364 			rc = listening_get_next(seq, rc);
2365 		if (rc)
2366 			break;
2367 		st->bucket = 0;
2368 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2369 		/* Fallthrough */
2370 	case TCP_SEQ_STATE_ESTABLISHED:
2371 		if (st->bucket > tcp_hashinfo.ehash_mask)
2372 			break;
2373 		rc = established_get_first(seq);
2374 		while (offset-- && rc)
2375 			rc = established_get_next(seq, rc);
2376 	}
2377 
2378 	st->num = orig_num;
2379 
2380 	return rc;
2381 }
2382 
2383 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2384 {
2385 	struct tcp_iter_state *st = seq->private;
2386 	void *rc;
2387 
2388 	if (*pos && *pos == st->last_pos) {
2389 		rc = tcp_seek_last_pos(seq);
2390 		if (rc)
2391 			goto out;
2392 	}
2393 
2394 	st->state = TCP_SEQ_STATE_LISTENING;
2395 	st->num = 0;
2396 	st->bucket = 0;
2397 	st->offset = 0;
2398 	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2399 
2400 out:
2401 	st->last_pos = *pos;
2402 	return rc;
2403 }
2404 EXPORT_SYMBOL(tcp_seq_start);
2405 
2406 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2407 {
2408 	struct tcp_iter_state *st = seq->private;
2409 	void *rc = NULL;
2410 
2411 	if (v == SEQ_START_TOKEN) {
2412 		rc = tcp_get_idx(seq, 0);
2413 		goto out;
2414 	}
2415 
2416 	switch (st->state) {
2417 	case TCP_SEQ_STATE_LISTENING:
2418 		rc = listening_get_next(seq, v);
2419 		if (!rc) {
2420 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2421 			st->bucket = 0;
2422 			st->offset = 0;
2423 			rc	  = established_get_first(seq);
2424 		}
2425 		break;
2426 	case TCP_SEQ_STATE_ESTABLISHED:
2427 		rc = established_get_next(seq, v);
2428 		break;
2429 	}
2430 out:
2431 	++*pos;
2432 	st->last_pos = *pos;
2433 	return rc;
2434 }
2435 EXPORT_SYMBOL(tcp_seq_next);
2436 
2437 void tcp_seq_stop(struct seq_file *seq, void *v)
2438 {
2439 	struct tcp_iter_state *st = seq->private;
2440 
2441 	switch (st->state) {
2442 	case TCP_SEQ_STATE_LISTENING:
2443 		if (v != SEQ_START_TOKEN)
2444 			spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
2445 		break;
2446 	case TCP_SEQ_STATE_ESTABLISHED:
2447 		if (v)
2448 			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2449 		break;
2450 	}
2451 }
2452 EXPORT_SYMBOL(tcp_seq_stop);
2453 
2454 static void get_openreq4(const struct request_sock *req,
2455 			 struct seq_file *f, int i)
2456 {
2457 	const struct inet_request_sock *ireq = inet_rsk(req);
2458 	long delta = req->rsk_timer.expires - jiffies;
2459 
2460 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2461 		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2462 		i,
2463 		ireq->ir_loc_addr,
2464 		ireq->ir_num,
2465 		ireq->ir_rmt_addr,
2466 		ntohs(ireq->ir_rmt_port),
2467 		TCP_SYN_RECV,
2468 		0, 0, /* could print option size, but that is af dependent. */
2469 		1,    /* timers active (only the expire timer) */
2470 		jiffies_delta_to_clock_t(delta),
2471 		req->num_timeout,
2472 		from_kuid_munged(seq_user_ns(f),
2473 				 sock_i_uid(req->rsk_listener)),
2474 		0,  /* non standard timer */
2475 		0, /* open_requests have no inode */
2476 		0,
2477 		req);
2478 }
2479 
2480 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2481 {
2482 	int timer_active;
2483 	unsigned long timer_expires;
2484 	const struct tcp_sock *tp = tcp_sk(sk);
2485 	const struct inet_connection_sock *icsk = inet_csk(sk);
2486 	const struct inet_sock *inet = inet_sk(sk);
2487 	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2488 	__be32 dest = inet->inet_daddr;
2489 	__be32 src = inet->inet_rcv_saddr;
2490 	__u16 destp = ntohs(inet->inet_dport);
2491 	__u16 srcp = ntohs(inet->inet_sport);
2492 	int rx_queue;
2493 	int state;
2494 
2495 	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2496 	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2497 	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2498 		timer_active	= 1;
2499 		timer_expires	= icsk->icsk_timeout;
2500 	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2501 		timer_active	= 4;
2502 		timer_expires	= icsk->icsk_timeout;
2503 	} else if (timer_pending(&sk->sk_timer)) {
2504 		timer_active	= 2;
2505 		timer_expires	= sk->sk_timer.expires;
2506 	} else {
2507 		timer_active	= 0;
2508 		timer_expires = jiffies;
2509 	}
2510 
2511 	state = inet_sk_state_load(sk);
2512 	if (state == TCP_LISTEN)
2513 		rx_queue = READ_ONCE(sk->sk_ack_backlog);
2514 	else
2515 		/* Because we don't lock the socket,
2516 		 * we might find a transient negative value.
2517 		 */
2518 		rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2519 				      READ_ONCE(tp->copied_seq), 0);
2520 
2521 	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2522 			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2523 		i, src, srcp, dest, destp, state,
2524 		READ_ONCE(tp->write_seq) - tp->snd_una,
2525 		rx_queue,
2526 		timer_active,
2527 		jiffies_delta_to_clock_t(timer_expires - jiffies),
2528 		icsk->icsk_retransmits,
2529 		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2530 		icsk->icsk_probes_out,
2531 		sock_i_ino(sk),
2532 		refcount_read(&sk->sk_refcnt), sk,
2533 		jiffies_to_clock_t(icsk->icsk_rto),
2534 		jiffies_to_clock_t(icsk->icsk_ack.ato),
2535 		(icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2536 		tp->snd_cwnd,
2537 		state == TCP_LISTEN ?
2538 		    fastopenq->max_qlen :
2539 		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2540 }
2541 
2542 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2543 			       struct seq_file *f, int i)
2544 {
2545 	long delta = tw->tw_timer.expires - jiffies;
2546 	__be32 dest, src;
2547 	__u16 destp, srcp;
2548 
2549 	dest  = tw->tw_daddr;
2550 	src   = tw->tw_rcv_saddr;
2551 	destp = ntohs(tw->tw_dport);
2552 	srcp  = ntohs(tw->tw_sport);
2553 
2554 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2555 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2556 		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2557 		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2558 		refcount_read(&tw->tw_refcnt), tw);
2559 }
2560 
2561 #define TMPSZ 150
2562 
2563 static int tcp4_seq_show(struct seq_file *seq, void *v)
2564 {
2565 	struct tcp_iter_state *st;
2566 	struct sock *sk = v;
2567 
2568 	seq_setwidth(seq, TMPSZ - 1);
2569 	if (v == SEQ_START_TOKEN) {
2570 		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2571 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2572 			   "inode");
2573 		goto out;
2574 	}
2575 	st = seq->private;
2576 
2577 	if (sk->sk_state == TCP_TIME_WAIT)
2578 		get_timewait4_sock(v, seq, st->num);
2579 	else if (sk->sk_state == TCP_NEW_SYN_RECV)
2580 		get_openreq4(v, seq, st->num);
2581 	else
2582 		get_tcp4_sock(v, seq, st->num);
2583 out:
2584 	seq_pad(seq, '\n');
2585 	return 0;
2586 }
2587 
2588 static const struct seq_operations tcp4_seq_ops = {
2589 	.show		= tcp4_seq_show,
2590 	.start		= tcp_seq_start,
2591 	.next		= tcp_seq_next,
2592 	.stop		= tcp_seq_stop,
2593 };
2594 
2595 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2596 	.family		= AF_INET,
2597 };
2598 
2599 static int __net_init tcp4_proc_init_net(struct net *net)
2600 {
2601 	if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
2602 			sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
2603 		return -ENOMEM;
2604 	return 0;
2605 }
2606 
2607 static void __net_exit tcp4_proc_exit_net(struct net *net)
2608 {
2609 	remove_proc_entry("tcp", net->proc_net);
2610 }
2611 
2612 static struct pernet_operations tcp4_net_ops = {
2613 	.init = tcp4_proc_init_net,
2614 	.exit = tcp4_proc_exit_net,
2615 };
2616 
2617 int __init tcp4_proc_init(void)
2618 {
2619 	return register_pernet_subsys(&tcp4_net_ops);
2620 }
2621 
2622 void tcp4_proc_exit(void)
2623 {
2624 	unregister_pernet_subsys(&tcp4_net_ops);
2625 }
2626 #endif /* CONFIG_PROC_FS */
2627 
2628 struct proto tcp_prot = {
2629 	.name			= "TCP",
2630 	.owner			= THIS_MODULE,
2631 	.close			= tcp_close,
2632 	.pre_connect		= tcp_v4_pre_connect,
2633 	.connect		= tcp_v4_connect,
2634 	.disconnect		= tcp_disconnect,
2635 	.accept			= inet_csk_accept,
2636 	.ioctl			= tcp_ioctl,
2637 	.init			= tcp_v4_init_sock,
2638 	.destroy		= tcp_v4_destroy_sock,
2639 	.shutdown		= tcp_shutdown,
2640 	.setsockopt		= tcp_setsockopt,
2641 	.getsockopt		= tcp_getsockopt,
2642 	.keepalive		= tcp_set_keepalive,
2643 	.recvmsg		= tcp_recvmsg,
2644 	.sendmsg		= tcp_sendmsg,
2645 	.sendpage		= tcp_sendpage,
2646 	.backlog_rcv		= tcp_v4_do_rcv,
2647 	.release_cb		= tcp_release_cb,
2648 	.hash			= inet_hash,
2649 	.unhash			= inet_unhash,
2650 	.get_port		= inet_csk_get_port,
2651 	.enter_memory_pressure	= tcp_enter_memory_pressure,
2652 	.leave_memory_pressure	= tcp_leave_memory_pressure,
2653 	.stream_memory_free	= tcp_stream_memory_free,
2654 	.sockets_allocated	= &tcp_sockets_allocated,
2655 	.orphan_count		= &tcp_orphan_count,
2656 	.memory_allocated	= &tcp_memory_allocated,
2657 	.memory_pressure	= &tcp_memory_pressure,
2658 	.sysctl_mem		= sysctl_tcp_mem,
2659 	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
2660 	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
2661 	.max_header		= MAX_TCP_HEADER,
2662 	.obj_size		= sizeof(struct tcp_sock),
2663 	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
2664 	.twsk_prot		= &tcp_timewait_sock_ops,
2665 	.rsk_prot		= &tcp_request_sock_ops,
2666 	.h.hashinfo		= &tcp_hashinfo,
2667 	.no_autobind		= true,
2668 #ifdef CONFIG_COMPAT
2669 	.compat_setsockopt	= compat_tcp_setsockopt,
2670 	.compat_getsockopt	= compat_tcp_getsockopt,
2671 #endif
2672 	.diag_destroy		= tcp_abort,
2673 };
2674 EXPORT_SYMBOL(tcp_prot);
2675 
2676 static void __net_exit tcp_sk_exit(struct net *net)
2677 {
2678 	int cpu;
2679 
2680 	if (net->ipv4.tcp_congestion_control)
2681 		bpf_module_put(net->ipv4.tcp_congestion_control,
2682 			       net->ipv4.tcp_congestion_control->owner);
2683 
2684 	for_each_possible_cpu(cpu)
2685 		inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2686 	free_percpu(net->ipv4.tcp_sk);
2687 }
2688 
2689 static int __net_init tcp_sk_init(struct net *net)
2690 {
2691 	int res, cpu, cnt;
2692 
2693 	net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2694 	if (!net->ipv4.tcp_sk)
2695 		return -ENOMEM;
2696 
2697 	for_each_possible_cpu(cpu) {
2698 		struct sock *sk;
2699 
2700 		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2701 					   IPPROTO_TCP, net);
2702 		if (res)
2703 			goto fail;
2704 		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2705 
2706 		/* Please enforce IP_DF and IPID==0 for RST and
2707 		 * ACK sent in SYN-RECV and TIME-WAIT state.
2708 		 */
2709 		inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
2710 
2711 		*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2712 	}
2713 
2714 	net->ipv4.sysctl_tcp_ecn = 2;
2715 	net->ipv4.sysctl_tcp_ecn_fallback = 1;
2716 
2717 	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2718 	net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
2719 	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2720 	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2721 	net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
2722 
2723 	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2724 	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2725 	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2726 
2727 	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2728 	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2729 	net->ipv4.sysctl_tcp_syncookies = 1;
2730 	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2731 	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2732 	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2733 	net->ipv4.sysctl_tcp_orphan_retries = 0;
2734 	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2735 	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
2736 	net->ipv4.sysctl_tcp_tw_reuse = 2;
2737 	net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
2738 
2739 	cnt = tcp_hashinfo.ehash_mask + 1;
2740 	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
2741 	net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
2742 
2743 	net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
2744 	net->ipv4.sysctl_tcp_sack = 1;
2745 	net->ipv4.sysctl_tcp_window_scaling = 1;
2746 	net->ipv4.sysctl_tcp_timestamps = 1;
2747 	net->ipv4.sysctl_tcp_early_retrans = 3;
2748 	net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
2749 	net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
2750 	net->ipv4.sysctl_tcp_retrans_collapse = 1;
2751 	net->ipv4.sysctl_tcp_max_reordering = 300;
2752 	net->ipv4.sysctl_tcp_dsack = 1;
2753 	net->ipv4.sysctl_tcp_app_win = 31;
2754 	net->ipv4.sysctl_tcp_adv_win_scale = 1;
2755 	net->ipv4.sysctl_tcp_frto = 2;
2756 	net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
2757 	/* This limits the percentage of the congestion window which we
2758 	 * will allow a single TSO frame to consume.  Building TSO frames
2759 	 * which are too large can cause TCP streams to be bursty.
2760 	 */
2761 	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
2762 	/* Default TSQ limit of 16 TSO segments */
2763 	net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
2764 	/* rfc5961 challenge ack rate limiting */
2765 	net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
2766 	net->ipv4.sysctl_tcp_min_tso_segs = 2;
2767 	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
2768 	net->ipv4.sysctl_tcp_autocorking = 1;
2769 	net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
2770 	net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
2771 	net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
2772 	if (net != &init_net) {
2773 		memcpy(net->ipv4.sysctl_tcp_rmem,
2774 		       init_net.ipv4.sysctl_tcp_rmem,
2775 		       sizeof(init_net.ipv4.sysctl_tcp_rmem));
2776 		memcpy(net->ipv4.sysctl_tcp_wmem,
2777 		       init_net.ipv4.sysctl_tcp_wmem,
2778 		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
2779 	}
2780 	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
2781 	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
2782 	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
2783 	spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
2784 	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
2785 	atomic_set(&net->ipv4.tfo_active_disable_times, 0);
2786 
2787 	/* Reno is always built in */
2788 	if (!net_eq(net, &init_net) &&
2789 	    bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
2790 			       init_net.ipv4.tcp_congestion_control->owner))
2791 		net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
2792 	else
2793 		net->ipv4.tcp_congestion_control = &tcp_reno;
2794 
2795 	return 0;
2796 fail:
2797 	tcp_sk_exit(net);
2798 
2799 	return res;
2800 }
2801 
2802 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2803 {
2804 	struct net *net;
2805 
2806 	inet_twsk_purge(&tcp_hashinfo, AF_INET);
2807 
2808 	list_for_each_entry(net, net_exit_list, exit_list)
2809 		tcp_fastopen_ctx_destroy(net);
2810 }
2811 
2812 static struct pernet_operations __net_initdata tcp_sk_ops = {
2813        .init	   = tcp_sk_init,
2814        .exit	   = tcp_sk_exit,
2815        .exit_batch = tcp_sk_exit_batch,
2816 };
2817 
2818 void __init tcp_v4_init(void)
2819 {
2820 	if (register_pernet_subsys(&tcp_sk_ops))
2821 		panic("Failed to create the TCP control socket.\n");
2822 }
2823