xref: /openbmc/linux/net/ipv4/tcp_ipv4.c (revision 9c1f8594)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		Implementation of the Transmission Control Protocol (TCP).
7  *
8  *		IPv4 specific functions
9  *
10  *
11  *		code split from:
12  *		linux/ipv4/tcp.c
13  *		linux/ipv4/tcp_input.c
14  *		linux/ipv4/tcp_output.c
15  *
16  *		See tcp.c for author information
17  *
18  *	This program is free software; you can redistribute it and/or
19  *      modify it under the terms of the GNU General Public License
20  *      as published by the Free Software Foundation; either version
21  *      2 of the License, or (at your option) any later version.
22  */
23 
24 /*
25  * Changes:
26  *		David S. Miller	:	New socket lookup architecture.
27  *					This code is dedicated to John Dyson.
28  *		David S. Miller :	Change semantics of established hash,
29  *					half is devoted to TIME_WAIT sockets
30  *					and the rest go in the other half.
31  *		Andi Kleen :		Add support for syncookies and fixed
32  *					some bugs: ip options weren't passed to
33  *					the TCP layer, missed a check for an
34  *					ACK bit.
35  *		Andi Kleen :		Implemented fast path mtu discovery.
36  *	     				Fixed many serious bugs in the
37  *					request_sock handling and moved
38  *					most of it into the af independent code.
39  *					Added tail drop and some other bugfixes.
40  *					Added new listen semantics.
41  *		Mike McLagan	:	Routing by source
42  *	Juan Jose Ciarlante:		ip_dynaddr bits
43  *		Andi Kleen:		various fixes.
44  *	Vitaly E. Lavrov	:	Transparent proxy revived after a
45  *					year-long coma.
46  *	Andi Kleen		:	Fix new listen.
47  *	Andi Kleen		:	Fix accept error reporting.
48  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
49  *	Alexey Kuznetsov		allows both IPv4 and IPv6 sockets to bind
50  *					a single port at the same time.
51  */
52 
53 
54 #include <linux/bottom_half.h>
55 #include <linux/types.h>
56 #include <linux/fcntl.h>
57 #include <linux/module.h>
58 #include <linux/random.h>
59 #include <linux/cache.h>
60 #include <linux/jhash.h>
61 #include <linux/init.h>
62 #include <linux/times.h>
63 #include <linux/slab.h>
64 
65 #include <net/net_namespace.h>
66 #include <net/icmp.h>
67 #include <net/inet_hashtables.h>
68 #include <net/tcp.h>
69 #include <net/transp_v6.h>
70 #include <net/ipv6.h>
71 #include <net/inet_common.h>
72 #include <net/timewait_sock.h>
73 #include <net/xfrm.h>
74 #include <net/netdma.h>
75 #include <net/secure_seq.h>
76 
77 #include <linux/inet.h>
78 #include <linux/ipv6.h>
79 #include <linux/stddef.h>
80 #include <linux/proc_fs.h>
81 #include <linux/seq_file.h>
82 
83 #include <linux/crypto.h>
84 #include <linux/scatterlist.h>
85 
86 int sysctl_tcp_tw_reuse __read_mostly;
87 int sysctl_tcp_low_latency __read_mostly;
88 EXPORT_SYMBOL(sysctl_tcp_low_latency);
89 
90 
91 #ifdef CONFIG_TCP_MD5SIG
92 static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk,
93 						   __be32 addr);
94 static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
95 			       __be32 daddr, __be32 saddr, struct tcphdr *th);
96 #else
97 static inline
98 struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
99 {
100 	return NULL;
101 }
102 #endif
103 
104 struct inet_hashinfo tcp_hashinfo;
105 EXPORT_SYMBOL(tcp_hashinfo);
106 
107 static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
108 {
109 	return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
110 					  ip_hdr(skb)->saddr,
111 					  tcp_hdr(skb)->dest,
112 					  tcp_hdr(skb)->source);
113 }
114 
115 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
116 {
117 	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
118 	struct tcp_sock *tp = tcp_sk(sk);
119 
120 	/* With PAWS, it is safe from the viewpoint
121 	   of data integrity. Even without PAWS it is safe provided the sequence
122 	   spaces do not overlap, i.e. at data rates <= 80 Mbit/sec.
123 
124 	   Actually, the idea is close to VJ's: the timestamp cache is
125 	   held not per host but per port pair, and the TW bucket is used as
126 	   the state holder.
127 
128 	   If the TW bucket has already been destroyed we fall back to VJ's
129 	   scheme and use the initial timestamp retrieved from the peer table.
130 	 */
131 	if (tcptw->tw_ts_recent_stamp &&
132 	    (twp == NULL || (sysctl_tcp_tw_reuse &&
133 			     get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
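		/* Start the new connection's sequence space just past
		 * everything the old TIME-WAIT socket could have sent
		 * (tw_snd_nxt plus the maximum unscaled window of 64K),
		 * so the old and new sequence spaces cannot overlap.
		 */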
134 		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
135 		if (tp->write_seq == 0)
136 			tp->write_seq = 1;
137 		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
138 		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
139 		sock_hold(sktw);
140 		return 1;
141 	}
142 
143 	return 0;
144 }
145 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
146 
147 /* This will initiate an outgoing connection. */
148 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
149 {
150 	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
151 	struct inet_sock *inet = inet_sk(sk);
152 	struct tcp_sock *tp = tcp_sk(sk);
153 	__be16 orig_sport, orig_dport;
154 	__be32 daddr, nexthop;
155 	struct flowi4 *fl4;
156 	struct rtable *rt;
157 	int err;
158 	struct ip_options_rcu *inet_opt;
159 
160 	if (addr_len < sizeof(struct sockaddr_in))
161 		return -EINVAL;
162 
163 	if (usin->sin_family != AF_INET)
164 		return -EAFNOSUPPORT;
165 
166 	nexthop = daddr = usin->sin_addr.s_addr;
167 	inet_opt = rcu_dereference_protected(inet->inet_opt,
168 					     sock_owned_by_user(sk));
169 	if (inet_opt && inet_opt->opt.srr) {
170 		if (!daddr)
171 			return -EINVAL;
172 		nexthop = inet_opt->opt.faddr;
173 	}
174 
175 	orig_sport = inet->inet_sport;
176 	orig_dport = usin->sin_port;
177 	fl4 = &inet->cork.fl.u.ip4;
178 	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
179 			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
180 			      IPPROTO_TCP,
181 			      orig_sport, orig_dport, sk, true);
182 	if (IS_ERR(rt)) {
183 		err = PTR_ERR(rt);
184 		if (err == -ENETUNREACH)
185 			IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
186 		return err;
187 	}
188 
189 	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
190 		ip_rt_put(rt);
191 		return -ENETUNREACH;
192 	}
193 
194 	if (!inet_opt || !inet_opt->opt.srr)
195 		daddr = fl4->daddr;
196 
197 	if (!inet->inet_saddr)
198 		inet->inet_saddr = fl4->saddr;
199 	inet->inet_rcv_saddr = inet->inet_saddr;
200 
201 	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
202 		/* Reset inherited state */
203 		tp->rx_opt.ts_recent	   = 0;
204 		tp->rx_opt.ts_recent_stamp = 0;
205 		tp->write_seq		   = 0;
206 	}
207 
208 	if (tcp_death_row.sysctl_tw_recycle &&
209 	    !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr) {
210 		struct inet_peer *peer = rt_get_peer(rt, fl4->daddr);
211 		/*
212 		 * VJ's idea. We save the last timestamp seen from
213 		 * the destination in the peer table when entering
214 		 * TIME-WAIT state, and initialize rx_opt.ts_recent from it
215 		 * when trying a new connection.
216 		 */
217 		if (peer) {
218 			inet_peer_refcheck(peer);
219 			if ((u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) {
220 				tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
221 				tp->rx_opt.ts_recent = peer->tcp_ts;
222 			}
223 		}
224 	}
225 
226 	inet->inet_dport = usin->sin_port;
227 	inet->inet_daddr = daddr;
228 
229 	inet_csk(sk)->icsk_ext_hdr_len = 0;
230 	if (inet_opt)
231 		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
232 
233 	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
234 
235 	/* Socket identity is still unknown (sport may be zero).
236 	 * However, we set the state to SYN-SENT and, without releasing the
237 	 * socket lock, select a source port, enter ourselves into the hash
238 	 * tables and complete initialization after this.
239 	 */
240 	tcp_set_state(sk, TCP_SYN_SENT);
241 	err = inet_hash_connect(&tcp_death_row, sk);
242 	if (err)
243 		goto failure;
244 
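	/* A local port is bound now; redo the route lookup with the
	 * complete four-tuple, since the chosen source port was not
	 * known for the initial ip_route_connect() call.
	 */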
245 	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
246 			       inet->inet_sport, inet->inet_dport, sk);
247 	if (IS_ERR(rt)) {
248 		err = PTR_ERR(rt);
249 		rt = NULL;
250 		goto failure;
251 	}
252 	/* OK, now commit destination to socket.  */
253 	sk->sk_gso_type = SKB_GSO_TCPV4;
254 	sk_setup_caps(sk, &rt->dst);
255 
256 	if (!tp->write_seq)
257 		tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
258 							   inet->inet_daddr,
259 							   inet->inet_sport,
260 							   usin->sin_port);
261 
262 	inet->inet_id = tp->write_seq ^ jiffies;
263 
264 	err = tcp_connect(sk);
265 	rt = NULL;
266 	if (err)
267 		goto failure;
268 
269 	return 0;
270 
271 failure:
272 	/*
273 	 * This unhashes the socket and releases the local port,
274 	 * if necessary.
275 	 */
276 	tcp_set_state(sk, TCP_CLOSE);
277 	ip_rt_put(rt);
278 	sk->sk_route_caps = 0;
279 	inet->inet_dport = 0;
280 	return err;
281 }
282 EXPORT_SYMBOL(tcp_v4_connect);
283 
284 /*
285  * This routine does path mtu discovery as defined in RFC1191.
286  */
287 static void do_pmtu_discovery(struct sock *sk, const struct iphdr *iph, u32 mtu)
288 {
289 	struct dst_entry *dst;
290 	struct inet_sock *inet = inet_sk(sk);
291 
292 	/* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
293 	 * sent out by Linux are always < 576 bytes, so they should go through
294 	 * unfragmented).
295 	 */
296 	if (sk->sk_state == TCP_LISTEN)
297 		return;
298 
299 	/* We don't check in the dst entry whether pmtu discovery is forbidden
300 	 * on this route. We just assume that no packet-too-big packets
301 	 * are sent back when pmtu discovery is not active.
302 	 * There is a small race when the user changes this flag in the
303 	 * route, but I think that's acceptable.
304 	 */
305 	if ((dst = __sk_dst_check(sk, 0)) == NULL)
306 		return;
307 
308 	dst->ops->update_pmtu(dst, mtu);
309 
310 	/* Something is about to go wrong... Remember the soft error
311 	 * in case this connection is not able to recover.
312 	 */
313 	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
314 		sk->sk_err_soft = EMSGSIZE;
315 
316 	mtu = dst_mtu(dst);
317 
318 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
319 	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
320 		tcp_sync_mss(sk, mtu);
321 
322 		/* Resend the TCP packet because it's
323 		 * clear that the old packet has been
324 		 * dropped. This is the new "fast" path mtu
325 		 * discovery.
326 		 */
327 		tcp_simple_retransmit(sk);
328 	} /* else let the usual retransmit timer handle it */
329 }
330 
331 /*
332  * This routine is called by the ICMP module when it gets some
333  * sort of error condition.  If err < 0 then the socket should
334  * be closed and the error returned to the user.  If err > 0
335  * it's just the icmp type << 8 | icmp code.  After adjustment,
336  * the header points to the first 8 bytes of the TCP header.  We need
337  * to find the appropriate port.
338  *
339  * The locking strategy used here is very "optimistic". When
340  * someone else accesses the socket the ICMP is just dropped
341  * and for some paths there is no check at all.
342  * A more general error queue to queue errors for later handling
343  * is probably better.
344  *
345  */
346 
347 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
348 {
349 	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
350 	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
351 	struct inet_connection_sock *icsk;
352 	struct tcp_sock *tp;
353 	struct inet_sock *inet;
354 	const int type = icmp_hdr(icmp_skb)->type;
355 	const int code = icmp_hdr(icmp_skb)->code;
356 	struct sock *sk;
357 	struct sk_buff *skb;
358 	__u32 seq;
359 	__u32 remaining;
360 	int err;
361 	struct net *net = dev_net(icmp_skb->dev);
362 
363 	if (icmp_skb->len < (iph->ihl << 2) + 8) {
364 		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
365 		return;
366 	}
367 
368 	sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
369 			iph->saddr, th->source, inet_iif(icmp_skb));
370 	if (!sk) {
371 		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
372 		return;
373 	}
374 	if (sk->sk_state == TCP_TIME_WAIT) {
375 		inet_twsk_put(inet_twsk(sk));
376 		return;
377 	}
378 
379 	bh_lock_sock(sk);
380 	/* If too many ICMPs get dropped on busy
381 	 * servers this needs to be solved differently.
382 	 */
383 	if (sock_owned_by_user(sk))
384 		NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
385 
386 	if (sk->sk_state == TCP_CLOSE)
387 		goto out;
388 
389 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
390 		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
391 		goto out;
392 	}
393 
394 	icsk = inet_csk(sk);
395 	tp = tcp_sk(sk);
396 	seq = ntohl(th->seq);
397 	if (sk->sk_state != TCP_LISTEN &&
398 	    !between(seq, tp->snd_una, tp->snd_nxt)) {
399 		NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
400 		goto out;
401 	}
402 
403 	switch (type) {
404 	case ICMP_SOURCE_QUENCH:
405 		/* Just silently ignore these. */
406 		goto out;
407 	case ICMP_PARAMETERPROB:
408 		err = EPROTO;
409 		break;
410 	case ICMP_DEST_UNREACH:
411 		if (code > NR_ICMP_UNREACH)
412 			goto out;
413 
414 		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
415 			if (!sock_owned_by_user(sk))
416 				do_pmtu_discovery(sk, iph, info);
417 			goto out;
418 		}
419 
420 		err = icmp_err_convert[code].errno;
421 		/* check if icmp_skb allows revert of backoff
422 		 * (see draft-zimmermann-tcp-lcd) */
423 		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
424 			break;
425 		if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
426 		    !icsk->icsk_backoff)
427 			break;
428 
429 		if (sock_owned_by_user(sk))
430 			break;
431 
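		/* Revert one step of exponential backoff: the ICMP
		 * unreachable suggests the earlier losses were a routing
		 * problem rather than congestion, per the
		 * draft-zimmermann-tcp-lcd scheme referenced above.
		 */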
432 		icsk->icsk_backoff--;
433 		inet_csk(sk)->icsk_rto = (tp->srtt ? __tcp_set_rto(tp) :
434 			TCP_TIMEOUT_INIT) << icsk->icsk_backoff;
435 		tcp_bound_rto(sk);
436 
437 		skb = tcp_write_queue_head(sk);
438 		BUG_ON(!skb);
439 
440 		remaining = icsk->icsk_rto - min(icsk->icsk_rto,
441 				tcp_time_stamp - TCP_SKB_CB(skb)->when);
442 
443 		if (remaining) {
444 			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
445 						  remaining, TCP_RTO_MAX);
446 		} else {
447 			/* RTO revert clocked out retransmission.
448 			 * Will retransmit now */
449 			tcp_retransmit_timer(sk);
450 		}
451 
452 		break;
453 	case ICMP_TIME_EXCEEDED:
454 		err = EHOSTUNREACH;
455 		break;
456 	default:
457 		goto out;
458 	}
459 
460 	switch (sk->sk_state) {
461 		struct request_sock *req, **prev;
462 	case TCP_LISTEN:
463 		if (sock_owned_by_user(sk))
464 			goto out;
465 
466 		req = inet_csk_search_req(sk, &prev, th->dest,
467 					  iph->daddr, iph->saddr);
468 		if (!req)
469 			goto out;
470 
471 		/* ICMPs are not backlogged, hence we cannot get
472 		   an established socket here.
473 		 */
474 		WARN_ON(req->sk);
475 
476 		if (seq != tcp_rsk(req)->snt_isn) {
477 			NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
478 			goto out;
479 		}
480 
481 		/*
482 		 * Still in SYN_RECV, just remove it silently.
483 		 * There is no good way to pass the error to the newly
484 		 * created socket, and POSIX does not want network
485 		 * errors returned from accept().
486 		 */
487 		inet_csk_reqsk_queue_drop(sk, req, prev);
488 		goto out;
489 
490 	case TCP_SYN_SENT:
491 	case TCP_SYN_RECV:  /* Cannot happen normally;
492 			       it can, for example, if SYNs crossed.
493 			     */
494 		if (!sock_owned_by_user(sk)) {
495 			sk->sk_err = err;
496 
497 			sk->sk_error_report(sk);
498 
499 			tcp_done(sk);
500 		} else {
501 			sk->sk_err_soft = err;
502 		}
503 		goto out;
504 	}
505 
506 	/* If we've already connected we will keep trying
507 	 * until we time out, or the user gives up.
508 	 *
509 	 * RFC 1122 4.2.3.9 allows us to consider as hard errors
510 	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
511 	 * but it is obsoleted by pmtu discovery).
512 	 *
513 	 * Note that in the modern internet, where routing is unreliable
514 	 * and broken firewalls sit in every dark corner sending random
515 	 * errors as ordered by their masters, even these two messages finally
516 	 * lose their original sense (even Linux sends invalid PORT_UNREACHs).
517 	 *
518 	 * Now we are in compliance with RFCs.
519 	 *							--ANK (980905)
520 	 */
521 
522 	inet = inet_sk(sk);
523 	if (!sock_owned_by_user(sk) && inet->recverr) {
524 		sk->sk_err = err;
525 		sk->sk_error_report(sk);
526 	} else	{ /* Only an error on timeout */
527 		sk->sk_err_soft = err;
528 	}
529 
530 out:
531 	bh_unlock_sock(sk);
532 	sock_put(sk);
533 }
534 
535 static void __tcp_v4_send_check(struct sk_buff *skb,
536 				__be32 saddr, __be32 daddr)
537 {
538 	struct tcphdr *th = tcp_hdr(skb);
539 
540 	if (skb->ip_summed == CHECKSUM_PARTIAL) {
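		/* Hardware (or the GSO path) will finish the checksum:
		 * seed it with the pseudo-header sum only and record where
		 * the final 16-bit result must be written.
		 */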
541 		th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
542 		skb->csum_start = skb_transport_header(skb) - skb->head;
543 		skb->csum_offset = offsetof(struct tcphdr, check);
544 	} else {
545 		th->check = tcp_v4_check(skb->len, saddr, daddr,
546 					 csum_partial(th,
547 						      th->doff << 2,
548 						      skb->csum));
549 	}
550 }
551 
552 /* This routine computes an IPv4 TCP checksum. */
553 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
554 {
555 	struct inet_sock *inet = inet_sk(sk);
556 
557 	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
558 }
559 EXPORT_SYMBOL(tcp_v4_send_check);
560 
561 int tcp_v4_gso_send_check(struct sk_buff *skb)
562 {
563 	const struct iphdr *iph;
564 	struct tcphdr *th;
565 
566 	if (!pskb_may_pull(skb, sizeof(*th)))
567 		return -EINVAL;
568 
569 	iph = ip_hdr(skb);
570 	th = tcp_hdr(skb);
571 
572 	th->check = 0;
573 	skb->ip_summed = CHECKSUM_PARTIAL;
574 	__tcp_v4_send_check(skb, iph->saddr, iph->daddr);
575 	return 0;
576 }
577 
578 /*
579  *	This routine will send an RST to the other tcp.
580  *
581  *	Someone asks: why do I NEVER use socket parameters (TOS, TTL, etc.)
582  *		      for the reset?
583  *	Answer: if a packet caused an RST, it is not for a socket
584  *		existing in our system; if it is matched to a socket,
585  *		it is just a duplicate segment or a bug in the other side's TCP.
586  *		So we build the reply based only on the parameters that
587  *		arrived with the segment.
588  *	Exception: precedence violation. We do not implement it in any case.
589  */
590 
591 static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
592 {
593 	struct tcphdr *th = tcp_hdr(skb);
594 	struct {
595 		struct tcphdr th;
596 #ifdef CONFIG_TCP_MD5SIG
597 		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
598 #endif
599 	} rep;
600 	struct ip_reply_arg arg;
601 #ifdef CONFIG_TCP_MD5SIG
602 	struct tcp_md5sig_key *key;
603 #endif
604 	struct net *net;
605 
606 	/* Never send a reset in response to a reset. */
607 	if (th->rst)
608 		return;
609 
610 	if (skb_rtable(skb)->rt_type != RTN_LOCAL)
611 		return;
612 
613 	/* Swap the send and the receive. */
614 	memset(&rep, 0, sizeof(rep));
615 	rep.th.dest   = th->source;
616 	rep.th.source = th->dest;
617 	rep.th.doff   = sizeof(struct tcphdr) / 4;
618 	rep.th.rst    = 1;
619 
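	/* RFC 793 reset generation: if the offending segment carried an
	 * ACK, the RST takes its sequence number from that ACK and needs
	 * no ACK of its own; otherwise the RST uses sequence zero and
	 * ACKs everything the segment occupied.
	 */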
620 	if (th->ack) {
621 		rep.th.seq = th->ack_seq;
622 	} else {
623 		rep.th.ack = 1;
624 		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
625 				       skb->len - (th->doff << 2));
626 	}
627 
628 	memset(&arg, 0, sizeof(arg));
629 	arg.iov[0].iov_base = (unsigned char *)&rep;
630 	arg.iov[0].iov_len  = sizeof(rep.th);
631 
632 #ifdef CONFIG_TCP_MD5SIG
633 	key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr) : NULL;
634 	if (key) {
635 		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
636 				   (TCPOPT_NOP << 16) |
637 				   (TCPOPT_MD5SIG << 8) |
638 				   TCPOLEN_MD5SIG);
639 		/* Update length and the length the header thinks exists */
640 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
641 		rep.th.doff = arg.iov[0].iov_len / 4;
642 
643 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
644 				     key, ip_hdr(skb)->saddr,
645 				     ip_hdr(skb)->daddr, &rep.th);
646 	}
647 #endif
648 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
649 				      ip_hdr(skb)->saddr, /* XXX */
650 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
651 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
652 	arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
653 
654 	net = dev_net(skb_dst(skb)->dev);
655 	ip_send_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr,
656 		      &arg, arg.iov[0].iov_len);
657 
658 	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
659 	TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
660 }
661 
662 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
663    outside of socket context, is certainly ugly. What can I do?
664  */
665 
666 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
667 			    u32 win, u32 ts, int oif,
668 			    struct tcp_md5sig_key *key,
669 			    int reply_flags)
670 {
671 	struct tcphdr *th = tcp_hdr(skb);
672 	struct {
673 		struct tcphdr th;
674 		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
675 #ifdef CONFIG_TCP_MD5SIG
676 			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
677 #endif
678 			];
679 	} rep;
680 	struct ip_reply_arg arg;
681 	struct net *net = dev_net(skb_dst(skb)->dev);
682 
683 	memset(&rep.th, 0, sizeof(struct tcphdr));
684 	memset(&arg, 0, sizeof(arg));
685 
686 	arg.iov[0].iov_base = (unsigned char *)&rep;
687 	arg.iov[0].iov_len  = sizeof(rep.th);
688 	if (ts) {
689 		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
690 				   (TCPOPT_TIMESTAMP << 8) |
691 				   TCPOLEN_TIMESTAMP);
692 		rep.opt[1] = htonl(tcp_time_stamp);
693 		rep.opt[2] = htonl(ts);
694 		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
695 	}
696 
697 	/* Swap the send and the receive. */
698 	rep.th.dest    = th->source;
699 	rep.th.source  = th->dest;
700 	rep.th.doff    = arg.iov[0].iov_len / 4;
701 	rep.th.seq     = htonl(seq);
702 	rep.th.ack_seq = htonl(ack);
703 	rep.th.ack     = 1;
704 	rep.th.window  = htons(win);
705 
706 #ifdef CONFIG_TCP_MD5SIG
707 	if (key) {
708 		int offset = (ts) ? 3 : 0;
709 
710 		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
711 					  (TCPOPT_NOP << 16) |
712 					  (TCPOPT_MD5SIG << 8) |
713 					  TCPOLEN_MD5SIG);
714 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
715 		rep.th.doff = arg.iov[0].iov_len/4;
716 
717 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
718 				    key, ip_hdr(skb)->saddr,
719 				    ip_hdr(skb)->daddr, &rep.th);
720 	}
721 #endif
722 	arg.flags = reply_flags;
723 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
724 				      ip_hdr(skb)->saddr, /* XXX */
725 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
726 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
727 	if (oif)
728 		arg.bound_dev_if = oif;
729 
730 	ip_send_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr,
731 		      &arg, arg.iov[0].iov_len);
732 
733 	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
734 }
735 
736 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
737 {
738 	struct inet_timewait_sock *tw = inet_twsk(sk);
739 	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
740 
741 	tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
742 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
743 			tcptw->tw_ts_recent,
744 			tw->tw_bound_dev_if,
745 			tcp_twsk_md5_key(tcptw),
746 			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0
747 			);
748 
749 	inet_twsk_put(tw);
750 }
751 
752 static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
753 				  struct request_sock *req)
754 {
755 	tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1,
756 			tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
757 			req->ts_recent,
758 			0,
759 			tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr),
760 			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0);
761 }
762 
763 /*
764  *	Send a SYN-ACK after having received a SYN.
765  *	This still operates on a request_sock only, not on a big
766  *	socket.
767  */
768 static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
769 			      struct request_sock *req,
770 			      struct request_values *rvp)
771 {
772 	const struct inet_request_sock *ireq = inet_rsk(req);
773 	struct flowi4 fl4;
774 	int err = -1;
775 	struct sk_buff * skb;
776 
777 	/* First, grab a route. */
778 	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
779 		return -1;
780 
781 	skb = tcp_make_synack(sk, dst, req, rvp);
782 
783 	if (skb) {
784 		__tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr);
785 
786 		err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
787 					    ireq->rmt_addr,
788 					    ireq->opt);
789 		err = net_xmit_eval(err);
790 	}
791 
792 	dst_release(dst);
793 	return err;
794 }
795 
796 static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req,
797 			      struct request_values *rvp)
798 {
799 	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
800 	return tcp_v4_send_synack(sk, NULL, req, rvp);
801 }
802 
803 /*
804  *	IPv4 request_sock destructor.
805  */
806 static void tcp_v4_reqsk_destructor(struct request_sock *req)
807 {
808 	kfree(inet_rsk(req)->opt);
809 }
810 
811 /*
812  * Return 1 if a syncookie should be sent
813  */
814 int tcp_syn_flood_action(struct sock *sk,
815 			 const struct sk_buff *skb,
816 			 const char *proto)
817 {
818 	const char *msg = "Dropping request";
819 	int want_cookie = 0;
820 	struct listen_sock *lopt;
821 
822 
823 
824 #ifdef CONFIG_SYN_COOKIES
825 	if (sysctl_tcp_syncookies) {
826 		msg = "Sending cookies";
827 		want_cookie = 1;
828 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES);
829 	} else
830 #endif
831 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);
832 
833 	lopt = inet_csk(sk)->icsk_accept_queue.listen_opt;
834 	if (!lopt->synflood_warned) {
835 		lopt->synflood_warned = 1;
836 		pr_info("%s: Possible SYN flooding on port %d. %s. "
837 			" Check SNMP counters.\n",
838 			proto, ntohs(tcp_hdr(skb)->dest), msg);
839 	}
840 	return want_cookie;
841 }
842 EXPORT_SYMBOL(tcp_syn_flood_action);
843 
844 /*
845  * Save and compile IPv4 options into the request_sock if needed.
846  */
847 static struct ip_options_rcu *tcp_v4_save_options(struct sock *sk,
848 						  struct sk_buff *skb)
849 {
850 	const struct ip_options *opt = &(IPCB(skb)->opt);
851 	struct ip_options_rcu *dopt = NULL;
852 
853 	if (opt && opt->optlen) {
854 		int opt_size = sizeof(*dopt) + opt->optlen;
855 
856 		dopt = kmalloc(opt_size, GFP_ATOMIC);
857 		if (dopt) {
858 			if (ip_options_echo(&dopt->opt, skb)) {
859 				kfree(dopt);
860 				dopt = NULL;
861 			}
862 		}
863 	}
864 	return dopt;
865 }
866 
867 #ifdef CONFIG_TCP_MD5SIG
868 /*
869  * RFC2385 MD5 checksumming requires a mapping of
870  * IP address->MD5 Key.
871  * We need to maintain these in the sk structure.
872  */
873 
874 /* Find the Key structure for an address.  */
875 static struct tcp_md5sig_key *
876 			tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
877 {
878 	struct tcp_sock *tp = tcp_sk(sk);
879 	int i;
880 
881 	if (!tp->md5sig_info || !tp->md5sig_info->entries4)
882 		return NULL;
883 	for (i = 0; i < tp->md5sig_info->entries4; i++) {
884 		if (tp->md5sig_info->keys4[i].addr == addr)
885 			return &tp->md5sig_info->keys4[i].base;
886 	}
887 	return NULL;
888 }
889 
890 struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
891 					 struct sock *addr_sk)
892 {
893 	return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->inet_daddr);
894 }
895 EXPORT_SYMBOL(tcp_v4_md5_lookup);
896 
897 static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
898 						      struct request_sock *req)
899 {
900 	return tcp_v4_md5_do_lookup(sk, inet_rsk(req)->rmt_addr);
901 }
902 
903 /* This can be called on a newly created socket, from other files */
904 int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
905 		      u8 *newkey, u8 newkeylen)
906 {
907 	/* Add Key to the list */
908 	struct tcp_md5sig_key *key;
909 	struct tcp_sock *tp = tcp_sk(sk);
910 	struct tcp4_md5sig_key *keys;
911 
912 	key = tcp_v4_md5_do_lookup(sk, addr);
913 	if (key) {
914 		/* Pre-existing entry - just update that one. */
915 		kfree(key->key);
916 		key->key = newkey;
917 		key->keylen = newkeylen;
918 	} else {
919 		struct tcp_md5sig_info *md5sig;
920 
921 		if (!tp->md5sig_info) {
922 			tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info),
923 						  GFP_ATOMIC);
924 			if (!tp->md5sig_info) {
925 				kfree(newkey);
926 				return -ENOMEM;
927 			}
928 			sk_nocaps_add(sk, NETIF_F_GSO_MASK);
929 		}
930 		if (tcp_alloc_md5sig_pool(sk) == NULL) {
931 			kfree(newkey);
932 			return -ENOMEM;
933 		}
934 		md5sig = tp->md5sig_info;
935 
936 		if (md5sig->alloced4 == md5sig->entries4) {
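			/* The flat key array is full, so grow it by one
			 * slot; key additions are presumably rare enough
			 * that reallocating per insert is acceptable.
			 */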
937 			keys = kmalloc((sizeof(*keys) *
938 					(md5sig->entries4 + 1)), GFP_ATOMIC);
939 			if (!keys) {
940 				kfree(newkey);
941 				tcp_free_md5sig_pool();
942 				return -ENOMEM;
943 			}
944 
945 			if (md5sig->entries4)
946 				memcpy(keys, md5sig->keys4,
947 				       sizeof(*keys) * md5sig->entries4);
948 
949 			/* Free old key list, and reference new one */
950 			kfree(md5sig->keys4);
951 			md5sig->keys4 = keys;
952 			md5sig->alloced4++;
953 		}
954 		md5sig->entries4++;
955 		md5sig->keys4[md5sig->entries4 - 1].addr        = addr;
956 		md5sig->keys4[md5sig->entries4 - 1].base.key    = newkey;
957 		md5sig->keys4[md5sig->entries4 - 1].base.keylen = newkeylen;
958 	}
959 	return 0;
960 }
961 EXPORT_SYMBOL(tcp_v4_md5_do_add);
962 
963 static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
964 			       u8 *newkey, u8 newkeylen)
965 {
966 	return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->inet_daddr,
967 				 newkey, newkeylen);
968 }
969 
970 int tcp_v4_md5_do_del(struct sock *sk, __be32 addr)
971 {
972 	struct tcp_sock *tp = tcp_sk(sk);
973 	int i;
974 
975 	for (i = 0; i < tp->md5sig_info->entries4; i++) {
976 		if (tp->md5sig_info->keys4[i].addr == addr) {
977 			/* Free the key */
978 			kfree(tp->md5sig_info->keys4[i].base.key);
979 			tp->md5sig_info->entries4--;
980 
981 			if (tp->md5sig_info->entries4 == 0) {
982 				kfree(tp->md5sig_info->keys4);
983 				tp->md5sig_info->keys4 = NULL;
984 				tp->md5sig_info->alloced4 = 0;
985 			} else if (tp->md5sig_info->entries4 != i) {
986 				/* Shift the remaining keys down to fill the gap */
987 				memmove(&tp->md5sig_info->keys4[i],
988 					&tp->md5sig_info->keys4[i+1],
989 					(tp->md5sig_info->entries4 - i) *
990 					 sizeof(struct tcp4_md5sig_key));
991 			}
992 			tcp_free_md5sig_pool();
993 			return 0;
994 		}
995 	}
996 	return -ENOENT;
997 }
998 EXPORT_SYMBOL(tcp_v4_md5_do_del);
999 
1000 static void tcp_v4_clear_md5_list(struct sock *sk)
1001 {
1002 	struct tcp_sock *tp = tcp_sk(sk);
1003 
1004 	/* Free each key, then the set of keys,
1005 	 * the crypto element, and then decrement our
1006 	 * hold on the last resort crypto.
1007 	 */
1008 	if (tp->md5sig_info->entries4) {
1009 		int i;
1010 		for (i = 0; i < tp->md5sig_info->entries4; i++)
1011 			kfree(tp->md5sig_info->keys4[i].base.key);
1012 		tp->md5sig_info->entries4 = 0;
1013 		tcp_free_md5sig_pool();
1014 	}
1015 	if (tp->md5sig_info->keys4) {
1016 		kfree(tp->md5sig_info->keys4);
1017 		tp->md5sig_info->keys4 = NULL;
1018 		tp->md5sig_info->alloced4  = 0;
1019 	}
1020 }
1021 
1022 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
1023 				 int optlen)
1024 {
1025 	struct tcp_md5sig cmd;
1026 	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1027 	u8 *newkey;
1028 
1029 	if (optlen < sizeof(cmd))
1030 		return -EINVAL;
1031 
1032 	if (copy_from_user(&cmd, optval, sizeof(cmd)))
1033 		return -EFAULT;
1034 
1035 	if (sin->sin_family != AF_INET)
1036 		return -EINVAL;
1037 
1038 	if (!cmd.tcpm_key || !cmd.tcpm_keylen) {
1039 		if (!tcp_sk(sk)->md5sig_info)
1040 			return -ENOENT;
1041 		return tcp_v4_md5_do_del(sk, sin->sin_addr.s_addr);
1042 	}
1043 
1044 	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1045 		return -EINVAL;
1046 
1047 	if (!tcp_sk(sk)->md5sig_info) {
1048 		struct tcp_sock *tp = tcp_sk(sk);
1049 		struct tcp_md5sig_info *p;
1050 
1051 		p = kzalloc(sizeof(*p), sk->sk_allocation);
1052 		if (!p)
1053 			return -EINVAL;
1054 
1055 		tp->md5sig_info = p;
1056 		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1057 	}
1058 
1059 	newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, sk->sk_allocation);
1060 	if (!newkey)
1061 		return -ENOMEM;
1062 	return tcp_v4_md5_do_add(sk, sin->sin_addr.s_addr,
1063 				 newkey, cmd.tcpm_keylen);
1064 }
1065 
1066 static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
1067 					__be32 daddr, __be32 saddr, int nbytes)
1068 {
1069 	struct tcp4_pseudohdr *bp;
1070 	struct scatterlist sg;
1071 
1072 	bp = &hp->md5_blk.ip4;
1073 
1074 	/*
1075 	 * 1. the TCP pseudo-header (in the order: source IP address,
1076 	 * destination IP address, zero-padded protocol number, and
1077 	 * segment length)
1078 	 */
1079 	bp->saddr = saddr;
1080 	bp->daddr = daddr;
1081 	bp->pad = 0;
1082 	bp->protocol = IPPROTO_TCP;
1083 	bp->len = cpu_to_be16(nbytes);
1084 
1085 	sg_init_one(&sg, bp, sizeof(*bp));
1086 	return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
1087 }
1088 
1089 static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
1090 			       __be32 daddr, __be32 saddr, struct tcphdr *th)
1091 {
1092 	struct tcp_md5sig_pool *hp;
1093 	struct hash_desc *desc;
1094 
1095 	hp = tcp_get_md5sig_pool();
1096 	if (!hp)
1097 		goto clear_hash_noput;
1098 	desc = &hp->md5_desc;
1099 
1100 	if (crypto_hash_init(desc))
1101 		goto clear_hash;
1102 	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
1103 		goto clear_hash;
1104 	if (tcp_md5_hash_header(hp, th))
1105 		goto clear_hash;
1106 	if (tcp_md5_hash_key(hp, key))
1107 		goto clear_hash;
1108 	if (crypto_hash_final(desc, md5_hash))
1109 		goto clear_hash;
1110 
1111 	tcp_put_md5sig_pool();
1112 	return 0;
1113 
1114 clear_hash:
1115 	tcp_put_md5sig_pool();
1116 clear_hash_noput:
1117 	memset(md5_hash, 0, 16);
1118 	return 1;
1119 }
1120 
1121 int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
1122 			struct sock *sk, struct request_sock *req,
1123 			struct sk_buff *skb)
1124 {
1125 	struct tcp_md5sig_pool *hp;
1126 	struct hash_desc *desc;
1127 	struct tcphdr *th = tcp_hdr(skb);
1128 	__be32 saddr, daddr;
1129 
1130 	if (sk) {
1131 		saddr = inet_sk(sk)->inet_saddr;
1132 		daddr = inet_sk(sk)->inet_daddr;
1133 	} else if (req) {
1134 		saddr = inet_rsk(req)->loc_addr;
1135 		daddr = inet_rsk(req)->rmt_addr;
1136 	} else {
1137 		const struct iphdr *iph = ip_hdr(skb);
1138 		saddr = iph->saddr;
1139 		daddr = iph->daddr;
1140 	}
1141 
1142 	hp = tcp_get_md5sig_pool();
1143 	if (!hp)
1144 		goto clear_hash_noput;
1145 	desc = &hp->md5_desc;
1146 
1147 	if (crypto_hash_init(desc))
1148 		goto clear_hash;
1149 
1150 	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
1151 		goto clear_hash;
1152 	if (tcp_md5_hash_header(hp, th))
1153 		goto clear_hash;
1154 	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1155 		goto clear_hash;
1156 	if (tcp_md5_hash_key(hp, key))
1157 		goto clear_hash;
1158 	if (crypto_hash_final(desc, md5_hash))
1159 		goto clear_hash;
1160 
1161 	tcp_put_md5sig_pool();
1162 	return 0;
1163 
1164 clear_hash:
1165 	tcp_put_md5sig_pool();
1166 clear_hash_noput:
1167 	memset(md5_hash, 0, 16);
1168 	return 1;
1169 }
1170 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1171 
1172 static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
1173 {
1174 	/*
1175 	 * This gets called for each TCP segment that arrives
1176 	 * so we want to be efficient.
1177 	 * We have 3 drop cases:
1178 	 * o No MD5 hash and one expected.
1179 	 * o MD5 hash and we're not expecting one.
1180 	 * o MD5 hash and it's wrong.
1181 	 */
1182 	__u8 *hash_location = NULL;
1183 	struct tcp_md5sig_key *hash_expected;
1184 	const struct iphdr *iph = ip_hdr(skb);
1185 	struct tcphdr *th = tcp_hdr(skb);
1186 	int genhash;
1187 	unsigned char newhash[16];
1188 
1189 	hash_expected = tcp_v4_md5_do_lookup(sk, iph->saddr);
1190 	hash_location = tcp_parse_md5sig_option(th);
1191 
1192 	/* We've parsed the options - do we have a hash? */
1193 	if (!hash_expected && !hash_location)
1194 		return 0;
1195 
1196 	if (hash_expected && !hash_location) {
1197 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1198 		return 1;
1199 	}
1200 
1201 	if (!hash_expected && hash_location) {
1202 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1203 		return 1;
1204 	}
1205 
1206 	/* Okay, so this is hash_expected and hash_location -
1207 	 * so we need to calculate the checksum.
1208 	 */
1209 	genhash = tcp_v4_md5_hash_skb(newhash,
1210 				      hash_expected,
1211 				      NULL, NULL, skb);
1212 
1213 	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1214 		if (net_ratelimit()) {
1215 			printk(KERN_INFO "MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1216 			       &iph->saddr, ntohs(th->source),
1217 			       &iph->daddr, ntohs(th->dest),
1218 			       genhash ? " tcp_v4_calc_md5_hash failed" : "");
1219 		}
1220 		return 1;
1221 	}
1222 	return 0;
1223 }
1224 
1225 #endif
1226 
1227 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1228 	.family		=	PF_INET,
1229 	.obj_size	=	sizeof(struct tcp_request_sock),
1230 	.rtx_syn_ack	=	tcp_v4_rtx_synack,
1231 	.send_ack	=	tcp_v4_reqsk_send_ack,
1232 	.destructor	=	tcp_v4_reqsk_destructor,
1233 	.send_reset	=	tcp_v4_send_reset,
1234 	.syn_ack_timeout = 	tcp_syn_ack_timeout,
1235 };
1236 
1237 #ifdef CONFIG_TCP_MD5SIG
1238 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1239 	.md5_lookup	=	tcp_v4_reqsk_md5_lookup,
1240 	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1241 };
1242 #endif
1243 
1244 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1245 {
1246 	struct tcp_extend_values tmp_ext;
1247 	struct tcp_options_received tmp_opt;
1248 	u8 *hash_location;
1249 	struct request_sock *req;
1250 	struct inet_request_sock *ireq;
1251 	struct tcp_sock *tp = tcp_sk(sk);
1252 	struct dst_entry *dst = NULL;
1253 	__be32 saddr = ip_hdr(skb)->saddr;
1254 	__be32 daddr = ip_hdr(skb)->daddr;
1255 	__u32 isn = TCP_SKB_CB(skb)->when;
1256 	int want_cookie = 0;
1257 
1258 	/* Never answer SYNs sent to broadcast or multicast */
1259 	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1260 		goto drop;
1261 
1262 	/* TW buckets are converted to open requests without
1263 	 * limitation: they conserve resources and the peer is
1264 	 * evidently a real one.
1265 	 */
1266 	if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
1267 		want_cookie = tcp_syn_flood_action(sk, skb, "TCP");
1268 		if (!want_cookie)
1269 			goto drop;
1270 	}
1271 
1272 	/* Accept backlog is full. If we have already queued enough
1273 	 * warm entries in the SYN queue, drop the request. That is better than
1274 	 * clogging the SYN queue with openreqs whose timeouts increase
1275 	 * exponentially.
1276 	 */
1277 	if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
1278 		goto drop;
1279 
1280 	req = inet_reqsk_alloc(&tcp_request_sock_ops);
1281 	if (!req)
1282 		goto drop;
1283 
1284 #ifdef CONFIG_TCP_MD5SIG
1285 	tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
1286 #endif
1287 
1288 	tcp_clear_options(&tmp_opt);
1289 	tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
1290 	tmp_opt.user_mss  = tp->rx_opt.user_mss;
1291 	tcp_parse_options(skb, &tmp_opt, &hash_location, 0);
1292 
1293 	if (tmp_opt.cookie_plus > 0 &&
1294 	    tmp_opt.saw_tstamp &&
1295 	    !tp->rx_opt.cookie_out_never &&
1296 	    (sysctl_tcp_cookie_size > 0 ||
1297 	     (tp->cookie_values != NULL &&
1298 	      tp->cookie_values->cookie_desired > 0))) {
1299 		u8 *c;
1300 		u32 *mess = &tmp_ext.cookie_bakery[COOKIE_DIGEST_WORDS];
1301 		int l = tmp_opt.cookie_plus - TCPOLEN_COOKIE_BASE;
1302 
1303 		if (tcp_cookie_generator(&tmp_ext.cookie_bakery[0]) != 0)
1304 			goto drop_and_release;
1305 
1306 		/* Secret recipe starts with IP addresses */
1307 		*mess++ ^= (__force u32)daddr;
1308 		*mess++ ^= (__force u32)saddr;
1309 
1310 		/* plus variable length Initiator Cookie */
1311 		c = (u8 *)mess;
1312 		while (l-- > 0)
1313 			*c++ ^= *hash_location++;
1314 
1315 		want_cookie = 0;	/* not our kind of cookie */
1316 		tmp_ext.cookie_out_never = 0; /* false */
1317 		tmp_ext.cookie_plus = tmp_opt.cookie_plus;
1318 	} else if (!tp->rx_opt.cookie_in_always) {
1319 		/* redundant indications, but ensure initialization. */
1320 		tmp_ext.cookie_out_never = 1; /* true */
1321 		tmp_ext.cookie_plus = 0;
1322 	} else {
1323 		goto drop_and_release;
1324 	}
1325 	tmp_ext.cookie_in_always = tp->rx_opt.cookie_in_always;
1326 
1327 	if (want_cookie && !tmp_opt.saw_tstamp)
1328 		tcp_clear_options(&tmp_opt);
1329 
1330 	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1331 	tcp_openreq_init(req, &tmp_opt, skb);
1332 
1333 	ireq = inet_rsk(req);
1334 	ireq->loc_addr = daddr;
1335 	ireq->rmt_addr = saddr;
1336 	ireq->no_srccheck = inet_sk(sk)->transparent;
1337 	ireq->opt = tcp_v4_save_options(sk, skb);
1338 
1339 	if (security_inet_conn_request(sk, skb, req))
1340 		goto drop_and_free;
1341 
1342 	if (!want_cookie || tmp_opt.tstamp_ok)
1343 		TCP_ECN_create_request(req, tcp_hdr(skb));
1344 
1345 	if (want_cookie) {
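		/* SYN-cookie path: encode the connection parameters into the
		 * ISN so this request can be dropped now and reconstructed
		 * later from the returning ACK.
		 */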
1346 		isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1347 		req->cookie_ts = tmp_opt.tstamp_ok;
1348 	} else if (!isn) {
1349 		struct inet_peer *peer = NULL;
1350 		struct flowi4 fl4;
1351 
1352 		/* VJ's idea. We save the last timestamp seen
1353 		 * from the destination in the peer table when entering
1354 		 * TIME-WAIT state, and check against it before
1355 		 * accepting a new connection request.
1356 		 *
1357 		 * If "isn" is not zero, this request hit an alive
1358 		 * timewait bucket, so all the necessary checks
1359 		 * are made in the function processing the timewait state.
1360 		 */
1361 		if (tmp_opt.saw_tstamp &&
1362 		    tcp_death_row.sysctl_tw_recycle &&
1363 		    (dst = inet_csk_route_req(sk, &fl4, req)) != NULL &&
1364 		    fl4.daddr == saddr &&
1365 		    (peer = rt_get_peer((struct rtable *)dst, fl4.daddr)) != NULL) {
1366 			inet_peer_refcheck(peer);
1367 			if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL &&
1368 			    (s32)(peer->tcp_ts - req->ts_recent) >
1369 							TCP_PAWS_WINDOW) {
1370 				NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
1371 				goto drop_and_release;
1372 			}
1373 		}
1374 		/* Kill the following clause, if you dislike this way. */
1375 		else if (!sysctl_tcp_syncookies &&
1376 			 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
1377 			  (sysctl_max_syn_backlog >> 2)) &&
1378 			 (!peer || !peer->tcp_ts_stamp) &&
1379 			 (!dst || !dst_metric(dst, RTAX_RTT))) {
1380 			/* Without syncookies, the last quarter of the
1381 			 * backlog is filled with destinations
1382 			 * proven to be alive.
1383 			 * It means that we continue to communicate
1384 			 * with destinations already remembered
1385 			 * at the moment of the synflood.
1386 			 */
1387 			LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open request from %pI4/%u\n",
1388 				       &saddr, ntohs(tcp_hdr(skb)->source));
1389 			goto drop_and_release;
1390 		}
1391 
1392 		isn = tcp_v4_init_sequence(skb);
1393 	}
1394 	tcp_rsk(req)->snt_isn = isn;
1395 	tcp_rsk(req)->snt_synack = tcp_time_stamp;
1396 
1397 	if (tcp_v4_send_synack(sk, dst, req,
1398 			       (struct request_values *)&tmp_ext) ||
1399 	    want_cookie)
1400 		goto drop_and_free;
1401 
1402 	inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1403 	return 0;
1404 
1405 drop_and_release:
1406 	dst_release(dst);
1407 drop_and_free:
1408 	reqsk_free(req);
1409 drop:
1410 	return 0;
1411 }
1412 EXPORT_SYMBOL(tcp_v4_conn_request);
1413 
1414 
1415 /*
1416  * The three-way handshake has completed - we got a valid synack -
1417  * now create the new socket.
1418  */
1419 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1420 				  struct request_sock *req,
1421 				  struct dst_entry *dst)
1422 {
1423 	struct inet_request_sock *ireq;
1424 	struct inet_sock *newinet;
1425 	struct tcp_sock *newtp;
1426 	struct sock *newsk;
1427 #ifdef CONFIG_TCP_MD5SIG
1428 	struct tcp_md5sig_key *key;
1429 #endif
1430 	struct ip_options_rcu *inet_opt;
1431 
1432 	if (sk_acceptq_is_full(sk))
1433 		goto exit_overflow;
1434 
1435 	newsk = tcp_create_openreq_child(sk, req, skb);
1436 	if (!newsk)
1437 		goto exit_nonewsk;
1438 
1439 	newsk->sk_gso_type = SKB_GSO_TCPV4;
1440 
1441 	newtp		      = tcp_sk(newsk);
1442 	newinet		      = inet_sk(newsk);
1443 	ireq		      = inet_rsk(req);
1444 	newinet->inet_daddr   = ireq->rmt_addr;
1445 	newinet->inet_rcv_saddr = ireq->loc_addr;
1446 	newinet->inet_saddr	      = ireq->loc_addr;
1447 	inet_opt	      = ireq->opt;
1448 	rcu_assign_pointer(newinet->inet_opt, inet_opt);
1449 	ireq->opt	      = NULL;
1450 	newinet->mc_index     = inet_iif(skb);
1451 	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1452 	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1453 	if (inet_opt)
1454 		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1455 	newinet->inet_id = newtp->write_seq ^ jiffies;
1456 
1457 	if (!dst && (dst = inet_csk_route_child_sock(sk, newsk, req)) == NULL)
1458 		goto put_and_exit;
1459 
1460 	sk_setup_caps(newsk, dst);
1461 
1462 	tcp_mtup_init(newsk);
1463 	tcp_sync_mss(newsk, dst_mtu(dst));
1464 	newtp->advmss = dst_metric_advmss(dst);
1465 	if (tcp_sk(sk)->rx_opt.user_mss &&
1466 	    tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1467 		newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1468 
1469 	tcp_initialize_rcv_mss(newsk);
1470 	if (tcp_rsk(req)->snt_synack)
1471 		tcp_valid_rtt_meas(newsk,
1472 		    tcp_time_stamp - tcp_rsk(req)->snt_synack);
1473 	newtp->total_retrans = req->retrans;
1474 
1475 #ifdef CONFIG_TCP_MD5SIG
1476 	/* Copy over the MD5 key from the original socket */
1477 	key = tcp_v4_md5_do_lookup(sk, newinet->inet_daddr);
1478 	if (key != NULL) {
1479 		/*
1480 		 * We're using one, so create a matching key
1481 		 * on the newsk structure. If we fail to get
1482 		 * memory, then we end up not copying the key
1483 		 * across. Shucks.
1484 		 */
1485 		char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);
1486 		if (newkey != NULL)
1487 			tcp_v4_md5_do_add(newsk, newinet->inet_daddr,
1488 					  newkey, key->keylen);
1489 		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1490 	}
1491 #endif
1492 
1493 	if (__inet_inherit_port(sk, newsk) < 0)
1494 		goto put_and_exit;
1495 	__inet_hash_nolisten(newsk, NULL);
1496 
1497 	return newsk;
1498 
1499 exit_overflow:
1500 	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1501 exit_nonewsk:
1502 	dst_release(dst);
1503 exit:
1504 	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1505 	return NULL;
1506 put_and_exit:
1507 	sock_put(newsk);
1508 	goto exit;
1509 }
1510 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1511 
1512 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1513 {
1514 	struct tcphdr *th = tcp_hdr(skb);
1515 	const struct iphdr *iph = ip_hdr(skb);
1516 	struct sock *nsk;
1517 	struct request_sock **prev;
1518 	/* Find possible connection requests. */
1519 	struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1520 						       iph->saddr, iph->daddr);
1521 	if (req)
1522 		return tcp_check_req(sk, skb, req, prev);
1523 
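	/* No pending request matched; the segment may instead belong to an
	 * already established child socket spawned from this listener.
	 */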
1524 	nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
1525 			th->source, iph->daddr, th->dest, inet_iif(skb));
1526 
1527 	if (nsk) {
1528 		if (nsk->sk_state != TCP_TIME_WAIT) {
1529 			bh_lock_sock(nsk);
1530 			return nsk;
1531 		}
1532 		inet_twsk_put(inet_twsk(nsk));
1533 		return NULL;
1534 	}
1535 
1536 #ifdef CONFIG_SYN_COOKIES
1537 	if (!th->syn)
1538 		sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1539 #endif
1540 	return sk;
1541 }
1542 
1543 static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1544 {
1545 	const struct iphdr *iph = ip_hdr(skb);
1546 
1547 	if (skb->ip_summed == CHECKSUM_COMPLETE) {
1548 		if (!tcp_v4_check(skb->len, iph->saddr,
1549 				  iph->daddr, skb->csum)) {
1550 			skb->ip_summed = CHECKSUM_UNNECESSARY;
1551 			return 0;
1552 		}
1553 	}
1554 
1555 	skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
1556 				       skb->len, IPPROTO_TCP, 0);
1557 
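	/* Short segments are cheap to verify immediately in software;
	 * longer ones keep only the pseudo-header sum here and are fully
	 * checked later, once we know the segment is wanted.
	 */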
1558 	if (skb->len <= 76) {
1559 		return __skb_checksum_complete(skb);
1560 	}
1561 	return 0;
1562 }
1563 
1564 
1565 /* The socket must have its spinlock held when we get
1566  * here.
1567  *
1568  * We have a potential double-lock case here, so even when
1569  * doing backlog processing we use the BH locking scheme.
1570  * This is because we cannot sleep with the original spinlock
1571  * held.
1572  */
1573 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1574 {
1575 	struct sock *rsk;
1576 #ifdef CONFIG_TCP_MD5SIG
1577 	/*
1578 	 * We really want to reject the packet as early as possible
1579 	 * if:
1580 	 *  o We're expecting an MD5'd packet and there is no MD5 TCP option
1581 	 *  o There is an MD5 option and we're not expecting one
1582 	 */
1583 	if (tcp_v4_inbound_md5_hash(sk, skb))
1584 		goto discard;
1585 #endif
1586 
1587 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1588 		sock_rps_save_rxhash(sk, skb->rxhash);
1589 		if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1590 			rsk = sk;
1591 			goto reset;
1592 		}
1593 		return 0;
1594 	}
1595 
1596 	if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1597 		goto csum_err;
1598 
1599 	if (sk->sk_state == TCP_LISTEN) {
1600 		struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1601 		if (!nsk)
1602 			goto discard;
1603 
1604 		if (nsk != sk) {
1605 			sock_rps_save_rxhash(nsk, skb->rxhash);
1606 			if (tcp_child_process(sk, nsk, skb)) {
1607 				rsk = nsk;
1608 				goto reset;
1609 			}
1610 			return 0;
1611 		}
1612 	} else
1613 		sock_rps_save_rxhash(sk, skb->rxhash);
1614 
1615 	if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1616 		rsk = sk;
1617 		goto reset;
1618 	}
1619 	return 0;
1620 
1621 reset:
1622 	tcp_v4_send_reset(rsk, skb);
1623 discard:
1624 	kfree_skb(skb);
1625 	/* Be careful here. If this function gets more complicated and
1626 	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1627 	 * might be destroyed here. This current version compiles correctly,
1628 	 * but you have been warned.
1629 	 */
1630 	return 0;
1631 
1632 csum_err:
1633 	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1634 	goto discard;
1635 }
1636 EXPORT_SYMBOL(tcp_v4_do_rcv);
1637 
1638 /*
1639  *	From tcp_input.c
1640  */
1641 
1642 int tcp_v4_rcv(struct sk_buff *skb)
1643 {
1644 	const struct iphdr *iph;
1645 	struct tcphdr *th;
1646 	struct sock *sk;
1647 	int ret;
1648 	struct net *net = dev_net(skb->dev);
1649 
1650 	if (skb->pkt_type != PACKET_HOST)
1651 		goto discard_it;
1652 
1653 	/* Count it even if it's bad */
1654 	TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1655 
1656 	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1657 		goto discard_it;
1658 
1659 	th = tcp_hdr(skb);
1660 
1661 	if (th->doff < sizeof(struct tcphdr) / 4)
1662 		goto bad_packet;
1663 	if (!pskb_may_pull(skb, th->doff * 4))
1664 		goto discard_it;
1665 
1666 	/* An explanation is required here, I think.
1667 	 * Packet length and doff are validated by header prediction,
1668 	 * provided the case of th->doff==0 is eliminated.
1669 	 * So, we defer the checks. */
1670 	if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
1671 		goto bad_packet;
1672 
1673 	th = tcp_hdr(skb);
1674 	iph = ip_hdr(skb);
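	/* Cache the parsed sequence numbers and flags in the skb control
	 * block so later receive processing can use them without
	 * re-parsing the headers.
	 */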
1675 	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1676 	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1677 				    skb->len - th->doff * 4);
1678 	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1679 	TCP_SKB_CB(skb)->when	 = 0;
1680 	TCP_SKB_CB(skb)->flags	 = iph->tos;
1681 	TCP_SKB_CB(skb)->sacked	 = 0;
1682 
1683 	sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
1684 	if (!sk)
1685 		goto no_tcp_socket;
1686 
1687 process:
1688 	if (sk->sk_state == TCP_TIME_WAIT)
1689 		goto do_time_wait;
1690 
1691 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1692 		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
1693 		goto discard_and_relse;
1694 	}
1695 
1696 	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1697 		goto discard_and_relse;
1698 	nf_reset(skb);
1699 
1700 	if (sk_filter(sk, skb))
1701 		goto discard_and_relse;
1702 
1703 	skb->dev = NULL;
1704 
1705 	bh_lock_sock_nested(sk);
1706 	ret = 0;
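	/* If no user context owns the socket, hand the segment to the
	 * prequeue (so the receiving task does the copy/checksum work) or
	 * process it here in softirq context; if the socket is locked,
	 * park the segment on the backlog for release_sock() to replay.
	 */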
1707 	if (!sock_owned_by_user(sk)) {
1708 #ifdef CONFIG_NET_DMA
1709 		struct tcp_sock *tp = tcp_sk(sk);
1710 		if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1711 			tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY);
1712 		if (tp->ucopy.dma_chan)
1713 			ret = tcp_v4_do_rcv(sk, skb);
1714 		else
1715 #endif
1716 		{
1717 			if (!tcp_prequeue(sk, skb))
1718 				ret = tcp_v4_do_rcv(sk, skb);
1719 		}
1720 	} else if (unlikely(sk_add_backlog(sk, skb))) {
1721 		bh_unlock_sock(sk);
1722 		NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
1723 		goto discard_and_relse;
1724 	}
1725 	bh_unlock_sock(sk);
1726 
1727 	sock_put(sk);
1728 
1729 	return ret;
1730 
1731 no_tcp_socket:
1732 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1733 		goto discard_it;
1734 
1735 	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1736 bad_packet:
1737 		TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1738 	} else {
1739 		tcp_v4_send_reset(NULL, skb);
1740 	}
1741 
1742 discard_it:
1743 	/* Discard frame. */
1744 	kfree_skb(skb);
1745 	return 0;
1746 
1747 discard_and_relse:
1748 	sock_put(sk);
1749 	goto discard_it;
1750 
1751 do_time_wait:
1752 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1753 		inet_twsk_put(inet_twsk(sk));
1754 		goto discard_it;
1755 	}
1756 
1757 	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1758 		TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1759 		inet_twsk_put(inet_twsk(sk));
1760 		goto discard_it;
1761 	}
1762 	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1763 	case TCP_TW_SYN: {
1764 		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1765 							&tcp_hashinfo,
1766 							iph->daddr, th->dest,
1767 							inet_iif(skb));
1768 		if (sk2) {
1769 			inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
1770 			inet_twsk_put(inet_twsk(sk));
1771 			sk = sk2;
1772 			goto process;
1773 		}
1774 		/* Fall through to ACK */
1775 	}
1776 	case TCP_TW_ACK:
1777 		tcp_v4_timewait_ack(sk, skb);
1778 		break;
1779 	case TCP_TW_RST:
1780 		goto no_tcp_socket;
1781 	case TCP_TW_SUCCESS:;
1782 	}
1783 	goto discard_it;
1784 }
1785 
1786 struct inet_peer *tcp_v4_get_peer(struct sock *sk, bool *release_it)
1787 {
1788 	struct rtable *rt = (struct rtable *) __sk_dst_get(sk);
1789 	struct inet_sock *inet = inet_sk(sk);
1790 	struct inet_peer *peer;
1791 
1792 	if (!rt ||
1793 	    inet->cork.fl.u.ip4.daddr != inet->inet_daddr) {
1794 		peer = inet_getpeer_v4(inet->inet_daddr, 1);
1795 		*release_it = true;
1796 	} else {
1797 		if (!rt->peer)
1798 			rt_bind_peer(rt, inet->inet_daddr, 1);
1799 		peer = rt->peer;
1800 		*release_it = false;
1801 	}
1802 
1803 	return peer;
1804 }
1805 EXPORT_SYMBOL(tcp_v4_get_peer);
1806 
1807 void *tcp_v4_tw_get_peer(struct sock *sk)
1808 {
1809 	struct inet_timewait_sock *tw = inet_twsk(sk);
1810 
1811 	return inet_getpeer_v4(tw->tw_daddr, 1);
1812 }
1813 EXPORT_SYMBOL(tcp_v4_tw_get_peer);
1814 
1815 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1816 	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
1817 	.twsk_unique	= tcp_twsk_unique,
1818 	.twsk_destructor= tcp_twsk_destructor,
1819 	.twsk_getpeer	= tcp_v4_tw_get_peer,
1820 };
1821 
1822 const struct inet_connection_sock_af_ops ipv4_specific = {
1823 	.queue_xmit	   = ip_queue_xmit,
1824 	.send_check	   = tcp_v4_send_check,
1825 	.rebuild_header	   = inet_sk_rebuild_header,
1826 	.conn_request	   = tcp_v4_conn_request,
1827 	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
1828 	.get_peer	   = tcp_v4_get_peer,
1829 	.net_header_len	   = sizeof(struct iphdr),
1830 	.setsockopt	   = ip_setsockopt,
1831 	.getsockopt	   = ip_getsockopt,
1832 	.addr2sockaddr	   = inet_csk_addr2sockaddr,
1833 	.sockaddr_len	   = sizeof(struct sockaddr_in),
1834 	.bind_conflict	   = inet_csk_bind_conflict,
1835 #ifdef CONFIG_COMPAT
1836 	.compat_setsockopt = compat_ip_setsockopt,
1837 	.compat_getsockopt = compat_ip_getsockopt,
1838 #endif
1839 };
1840 EXPORT_SYMBOL(ipv4_specific);
1841 
1842 #ifdef CONFIG_TCP_MD5SIG
1843 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1844 	.md5_lookup		= tcp_v4_md5_lookup,
1845 	.calc_md5_hash		= tcp_v4_md5_hash_skb,
1846 	.md5_add		= tcp_v4_md5_add_func,
1847 	.md5_parse		= tcp_v4_parse_md5_keys,
1848 };
1849 #endif
1850 
1851 /* NOTE: A lot of things are set to zero explicitly by the call to
1852  *       sk_alloc(), so they need not be done here.
1853  */
1854 static int tcp_v4_init_sock(struct sock *sk)
1855 {
1856 	struct inet_connection_sock *icsk = inet_csk(sk);
1857 	struct tcp_sock *tp = tcp_sk(sk);
1858 
1859 	skb_queue_head_init(&tp->out_of_order_queue);
1860 	tcp_init_xmit_timers(sk);
1861 	tcp_prequeue_init(tp);
1862 
1863 	icsk->icsk_rto = TCP_TIMEOUT_INIT;
1864 	tp->mdev = TCP_TIMEOUT_INIT;
1865 
1866 	/* So many TCP implementations out there (incorrectly) count the
1867 	 * initial SYN frame in their delayed-ACK and congestion control
1868 	 * algorithms that we must have the following bandaid to talk
1869 	 * efficiently to them.  -DaveM
1870 	 */
1871 	tp->snd_cwnd = TCP_INIT_CWND;
1872 
1873 	/* See draft-stevens-tcpca-spec-01 for discussion of the
1874 	 * initialization of these values.
1875 	 */
1876 	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
1877 	tp->snd_cwnd_clamp = ~0;
1878 	tp->mss_cache = TCP_MSS_DEFAULT;
1879 
1880 	tp->reordering = sysctl_tcp_reordering;
1881 	icsk->icsk_ca_ops = &tcp_init_congestion_ops;
1882 
1883 	sk->sk_state = TCP_CLOSE;
1884 
1885 	sk->sk_write_space = sk_stream_write_space;
1886 	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1887 
1888 	icsk->icsk_af_ops = &ipv4_specific;
1889 	icsk->icsk_sync_mss = tcp_sync_mss;
1890 #ifdef CONFIG_TCP_MD5SIG
1891 	tp->af_specific = &tcp_sock_ipv4_specific;
1892 #endif
1893 
1894 	/* TCP Cookie Transactions */
1895 	if (sysctl_tcp_cookie_size > 0) {
1896 		/* Default, cookies without s_data_payload. */
1897 		tp->cookie_values =
1898 			kzalloc(sizeof(*tp->cookie_values),
1899 				sk->sk_allocation);
1900 		if (tp->cookie_values != NULL)
1901 			kref_init(&tp->cookie_values->kref);
1902 	}
1903 	/* Presumed zeroed, in order of appearance:
1904 	 *	cookie_in_always, cookie_out_never,
1905 	 *	s_data_constant, s_data_in, s_data_out
1906 	 */
1907 	sk->sk_sndbuf = sysctl_tcp_wmem[1];
1908 	sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1909 
1910 	local_bh_disable();
1911 	percpu_counter_inc(&tcp_sockets_allocated);
1912 	local_bh_enable();
1913 
1914 	return 0;
1915 }
1916 
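/* Final cleanup of a TCP socket: stop timers, release congestion control
 * state, purge the pending queues, drop the bound port and free any
 * remaining per-socket allocations.
 */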
1917 void tcp_v4_destroy_sock(struct sock *sk)
1918 {
1919 	struct tcp_sock *tp = tcp_sk(sk);
1920 
1921 	tcp_clear_xmit_timers(sk);
1922 
1923 	tcp_cleanup_congestion_control(sk);
1924 
1925 	/* Clean up the write buffer. */
1926 	tcp_write_queue_purge(sk);
1927 
1928 	/* Cleans up our, hopefully empty, out_of_order_queue. */
1929 	__skb_queue_purge(&tp->out_of_order_queue);
1930 
1931 #ifdef CONFIG_TCP_MD5SIG
1932 	/* Clean up the MD5 key list, if any */
1933 	if (tp->md5sig_info) {
1934 		tcp_v4_clear_md5_list(sk);
1935 		kfree(tp->md5sig_info);
1936 		tp->md5sig_info = NULL;
1937 	}
1938 #endif
1939 
1940 #ifdef CONFIG_NET_DMA
1941 	/* Cleans up our sk_async_wait_queue */
1942 	__skb_queue_purge(&sk->sk_async_wait_queue);
1943 #endif
1944 
1945 	/* Clean up the prequeue; it really should be empty by now. */
1946 	__skb_queue_purge(&tp->ucopy.prequeue);
1947 
1948 	/* Clean up a referenced TCP bind bucket. */
1949 	if (inet_csk(sk)->icsk_bind_hash)
1950 		inet_put_port(sk);
1951 
1952 	/*
1953 	 * If a cached sendmsg page exists, free it.
1954 	 */
1955 	if (sk->sk_sndmsg_page) {
1956 		__free_page(sk->sk_sndmsg_page);
1957 		sk->sk_sndmsg_page = NULL;
1958 	}
1959 
1960 	/* TCP Cookie Transactions */
1961 	if (tp->cookie_values != NULL) {
1962 		kref_put(&tp->cookie_values->kref,
1963 			 tcp_cookie_values_release);
1964 		tp->cookie_values = NULL;
1965 	}
1966 
1967 	percpu_counter_dec(&tcp_sockets_allocated);
1968 }
1969 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1970 
1971 #ifdef CONFIG_PROC_FS
1972 /* Proc filesystem TCP sock list dumping. */
1973 
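/* Helpers for walking the TIME_WAIT chain of an established-hash bucket. */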
1974 static inline struct inet_timewait_sock *tw_head(struct hlist_nulls_head *head)
1975 {
1976 	return hlist_nulls_empty(head) ? NULL :
1977 		list_entry(head->first, struct inet_timewait_sock, tw_node);
1978 }
1979 
1980 static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1981 {
1982 	return !is_a_nulls(tw->tw_node.next) ?
1983 		hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1984 }
1985 
1986 /*
1987  * Get the next listening socket following cur.  If cur is NULL, get the
1988  * first socket starting from the bucket given in st->bucket; when
1989  * st->bucket is zero, the very first socket in the hash table is returned.
1990  */
1991 static void *listening_get_next(struct seq_file *seq, void *cur)
1992 {
1993 	struct inet_connection_sock *icsk;
1994 	struct hlist_nulls_node *node;
1995 	struct sock *sk = cur;
1996 	struct inet_listen_hashbucket *ilb;
1997 	struct tcp_iter_state *st = seq->private;
1998 	struct net *net = seq_file_net(seq);
1999 
2000 	if (!sk) {
2001 		ilb = &tcp_hashinfo.listening_hash[st->bucket];
2002 		spin_lock_bh(&ilb->lock);
2003 		sk = sk_nulls_head(&ilb->head);
2004 		st->offset = 0;
2005 		goto get_sk;
2006 	}
2007 	ilb = &tcp_hashinfo.listening_hash[st->bucket];
2008 	++st->num;
2009 	++st->offset;
2010 
2011 	if (st->state == TCP_SEQ_STATE_OPENREQ) {
2012 		struct request_sock *req = cur;
2013 
2014 		icsk = inet_csk(st->syn_wait_sk);
2015 		req = req->dl_next;
2016 		while (1) {
2017 			while (req) {
2018 				if (req->rsk_ops->family == st->family) {
2019 					cur = req;
2020 					goto out;
2021 				}
2022 				req = req->dl_next;
2023 			}
2024 			if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
2025 				break;
2026 get_req:
2027 			req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
2028 		}
2029 		sk	  = sk_nulls_next(st->syn_wait_sk);
2030 		st->state = TCP_SEQ_STATE_LISTENING;
2031 		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2032 	} else {
2033 		icsk = inet_csk(sk);
2034 		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2035 		if (reqsk_queue_len(&icsk->icsk_accept_queue))
2036 			goto start_req;
2037 		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2038 		sk = sk_nulls_next(sk);
2039 	}
2040 get_sk:
2041 	sk_nulls_for_each_from(sk, node) {
2042 		if (!net_eq(sock_net(sk), net))
2043 			continue;
2044 		if (sk->sk_family == st->family) {
2045 			cur = sk;
2046 			goto out;
2047 		}
2048 		icsk = inet_csk(sk);
2049 		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2050 		if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
2051 start_req:
2052 			st->uid		= sock_i_uid(sk);
2053 			st->syn_wait_sk = sk;
2054 			st->state	= TCP_SEQ_STATE_OPENREQ;
2055 			st->sbucket	= 0;
2056 			goto get_req;
2057 		}
2058 		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2059 	}
2060 	spin_unlock_bh(&ilb->lock);
2061 	st->offset = 0;
2062 	if (++st->bucket < INET_LHTABLE_SIZE) {
2063 		ilb = &tcp_hashinfo.listening_hash[st->bucket];
2064 		spin_lock_bh(&ilb->lock);
2065 		sk = sk_nulls_head(&ilb->head);
2066 		goto get_sk;
2067 	}
2068 	cur = NULL;
2069 out:
2070 	return cur;
2071 }
2072 
2073 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2074 {
2075 	struct tcp_iter_state *st = seq->private;
2076 	void *rc;
2077 
2078 	st->bucket = 0;
2079 	st->offset = 0;
2080 	rc = listening_get_next(seq, NULL);
2081 
2082 	while (rc && *pos) {
2083 		rc = listening_get_next(seq, rc);
2084 		--*pos;
2085 	}
2086 	return rc;
2087 }
2088 
2089 static inline int empty_bucket(struct tcp_iter_state *st)
2090 {
2091 	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
2092 		hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
2093 }
2094 
2095 /*
2096  * Get the first established socket, starting from the bucket given in st->bucket.
2097  * If st->bucket is zero, the very first socket in the hash is returned.
2098  */
2099 static void *established_get_first(struct seq_file *seq)
2100 {
2101 	struct tcp_iter_state *st = seq->private;
2102 	struct net *net = seq_file_net(seq);
2103 	void *rc = NULL;
2104 
2105 	st->offset = 0;
2106 	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2107 		struct sock *sk;
2108 		struct hlist_nulls_node *node;
2109 		struct inet_timewait_sock *tw;
2110 		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2111 
2112 		/* Lockless fast path for the common case of empty buckets */
2113 		if (empty_bucket(st))
2114 			continue;
2115 
2116 		spin_lock_bh(lock);
2117 		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2118 			if (sk->sk_family != st->family ||
2119 			    !net_eq(sock_net(sk), net)) {
2120 				continue;
2121 			}
2122 			rc = sk;
2123 			goto out;
2124 		}
2125 		st->state = TCP_SEQ_STATE_TIME_WAIT;
2126 		inet_twsk_for_each(tw, node,
2127 				   &tcp_hashinfo.ehash[st->bucket].twchain) {
2128 			if (tw->tw_family != st->family ||
2129 			    !net_eq(twsk_net(tw), net)) {
2130 				continue;
2131 			}
2132 			rc = tw;
2133 			goto out;
2134 		}
2135 		spin_unlock_bh(lock);
2136 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2137 	}
2138 out:
2139 	return rc;
2140 }
2141 
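/* Get the next established (or TIME_WAIT) socket after cur, advancing to
 * the next non-empty bucket once the current chains are exhausted.
 */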
2142 static void *established_get_next(struct seq_file *seq, void *cur)
2143 {
2144 	struct sock *sk = cur;
2145 	struct inet_timewait_sock *tw;
2146 	struct hlist_nulls_node *node;
2147 	struct tcp_iter_state *st = seq->private;
2148 	struct net *net = seq_file_net(seq);
2149 
2150 	++st->num;
2151 	++st->offset;
2152 
2153 	if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2154 		tw = cur;
2155 		tw = tw_next(tw);
2156 get_tw:
2157 		while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
2158 			tw = tw_next(tw);
2159 		}
2160 		if (tw) {
2161 			cur = tw;
2162 			goto out;
2163 		}
2164 		spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2165 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2166 
2167 		/* Look for the next non-empty bucket */
2168 		st->offset = 0;
2169 		while (++st->bucket <= tcp_hashinfo.ehash_mask &&
2170 				empty_bucket(st))
2171 			;
2172 		if (st->bucket > tcp_hashinfo.ehash_mask)
2173 			return NULL;
2174 
2175 		spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2176 		sk = sk_nulls_head(&tcp_hashinfo.ehash[st->bucket].chain);
2177 	} else
2178 		sk = sk_nulls_next(sk);
2179 
2180 	sk_nulls_for_each_from(sk, node) {
2181 		if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2182 			goto found;
2183 	}
2184 
2185 	st->state = TCP_SEQ_STATE_TIME_WAIT;
2186 	tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
2187 	goto get_tw;
2188 found:
2189 	cur = sk;
2190 out:
2191 	return cur;
2192 }
2193 
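/* Return the pos'th entry (0-based) of the established/TIME_WAIT walk. */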
2194 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2195 {
2196 	struct tcp_iter_state *st = seq->private;
2197 	void *rc;
2198 
2199 	st->bucket = 0;
2200 	rc = established_get_first(seq);
2201 
2202 	while (rc && pos) {
2203 		rc = established_get_next(seq, rc);
2204 		--pos;
2205 	}
2206 	return rc;
2207 }
2208 
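/* Position the iterator at entry pos, walking listening sockets first and
 * then the established/TIME_WAIT hash.
 */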
2209 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2210 {
2211 	void *rc;
2212 	struct tcp_iter_state *st = seq->private;
2213 
2214 	st->state = TCP_SEQ_STATE_LISTENING;
2215 	rc	  = listening_get_idx(seq, &pos);
2216 
2217 	if (!rc) {
2218 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2219 		rc	  = established_get_idx(seq, pos);
2220 	}
2221 
2222 	return rc;
2223 }
2224 
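/* Try to resume the walk from the bucket/offset recorded on the previous
 * pass, so sequential reads of the seq_file do not rescan everything that
 * has already been dumped.
 */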
2225 static void *tcp_seek_last_pos(struct seq_file *seq)
2226 {
2227 	struct tcp_iter_state *st = seq->private;
2228 	int offset = st->offset;
2229 	int orig_num = st->num;
2230 	void *rc = NULL;
2231 
2232 	switch (st->state) {
2233 	case TCP_SEQ_STATE_OPENREQ:
2234 	case TCP_SEQ_STATE_LISTENING:
2235 		if (st->bucket >= INET_LHTABLE_SIZE)
2236 			break;
2237 		st->state = TCP_SEQ_STATE_LISTENING;
2238 		rc = listening_get_next(seq, NULL);
2239 		while (offset-- && rc)
2240 			rc = listening_get_next(seq, rc);
2241 		if (rc)
2242 			break;
2243 		st->bucket = 0;
2244 		/* Fallthrough */
2245 	case TCP_SEQ_STATE_ESTABLISHED:
2246 	case TCP_SEQ_STATE_TIME_WAIT:
2247 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2248 		if (st->bucket > tcp_hashinfo.ehash_mask)
2249 			break;
2250 		rc = established_get_first(seq);
2251 		while (offset-- && rc)
2252 			rc = established_get_next(seq, rc);
2253 	}
2254 
2255 	st->num = orig_num;
2256 
2257 	return rc;
2258 }
2259 
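/* seq_file ->start: reuse the saved position when the caller continues a
 * previous read, otherwise restart and walk forward to *pos.
 */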
2260 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2261 {
2262 	struct tcp_iter_state *st = seq->private;
2263 	void *rc;
2264 
2265 	if (*pos && *pos == st->last_pos) {
2266 		rc = tcp_seek_last_pos(seq);
2267 		if (rc)
2268 			goto out;
2269 	}
2270 
2271 	st->state = TCP_SEQ_STATE_LISTENING;
2272 	st->num = 0;
2273 	st->bucket = 0;
2274 	st->offset = 0;
2275 	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2276 
2277 out:
2278 	st->last_pos = *pos;
2279 	return rc;
2280 }
2281 
2282 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2283 {
2284 	struct tcp_iter_state *st = seq->private;
2285 	void *rc = NULL;
2286 
2287 	if (v == SEQ_START_TOKEN) {
2288 		rc = tcp_get_idx(seq, 0);
2289 		goto out;
2290 	}
2291 
2292 	switch (st->state) {
2293 	case TCP_SEQ_STATE_OPENREQ:
2294 	case TCP_SEQ_STATE_LISTENING:
2295 		rc = listening_get_next(seq, v);
2296 		if (!rc) {
2297 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2298 			st->bucket = 0;
2299 			st->offset = 0;
2300 			rc	  = established_get_first(seq);
2301 		}
2302 		break;
2303 	case TCP_SEQ_STATE_ESTABLISHED:
2304 	case TCP_SEQ_STATE_TIME_WAIT:
2305 		rc = established_get_next(seq, v);
2306 		break;
2307 	}
2308 out:
2309 	++*pos;
2310 	st->last_pos = *pos;
2311 	return rc;
2312 }
2313 
2314 static void tcp_seq_stop(struct seq_file *seq, void *v)
2315 {
2316 	struct tcp_iter_state *st = seq->private;
2317 
2318 	switch (st->state) {
2319 	case TCP_SEQ_STATE_OPENREQ:
2320 		if (v) {
2321 			struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2322 			read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2323 		}
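		/* fall through */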
2324 	case TCP_SEQ_STATE_LISTENING:
2325 		if (v != SEQ_START_TOKEN)
2326 			spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2327 		break;
2328 	case TCP_SEQ_STATE_TIME_WAIT:
2329 	case TCP_SEQ_STATE_ESTABLISHED:
2330 		if (v)
2331 			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2332 		break;
2333 	}
2334 }
2335 
2336 static int tcp_seq_open(struct inode *inode, struct file *file)
2337 {
2338 	struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2339 	struct tcp_iter_state *s;
2340 	int err;
2341 
2342 	err = seq_open_net(inode, file, &afinfo->seq_ops,
2343 			  sizeof(struct tcp_iter_state));
2344 	if (err < 0)
2345 		return err;
2346 
2347 	s = ((struct seq_file *)file->private_data)->private;
2348 	s->family		= afinfo->family;
2349 	s->last_pos 		= 0;
2350 	return 0;
2351 }
2352 
2353 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2354 {
2355 	int rc = 0;
2356 	struct proc_dir_entry *p;
2357 
2358 	afinfo->seq_fops.open		= tcp_seq_open;
2359 	afinfo->seq_fops.read		= seq_read;
2360 	afinfo->seq_fops.llseek		= seq_lseek;
2361 	afinfo->seq_fops.release	= seq_release_net;
2362 
2363 	afinfo->seq_ops.start		= tcp_seq_start;
2364 	afinfo->seq_ops.next		= tcp_seq_next;
2365 	afinfo->seq_ops.stop		= tcp_seq_stop;
2366 
2367 	p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2368 			     &afinfo->seq_fops, afinfo);
2369 	if (!p)
2370 		rc = -ENOMEM;
2371 	return rc;
2372 }
2373 EXPORT_SYMBOL(tcp_proc_register);
2374 
2375 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2376 {
2377 	proc_net_remove(net, afinfo->name);
2378 }
2379 EXPORT_SYMBOL(tcp_proc_unregister);
2380 
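/* Format one SYN_RECV request socket as a /proc/net/tcp line. */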
2381 static void get_openreq4(struct sock *sk, struct request_sock *req,
2382 			 struct seq_file *f, int i, int uid, int *len)
2383 {
2384 	const struct inet_request_sock *ireq = inet_rsk(req);
2385 	int ttd = req->expires - jiffies;
2386 
2387 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2388 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %pK%n",
2389 		i,
2390 		ireq->loc_addr,
2391 		ntohs(inet_sk(sk)->inet_sport),
2392 		ireq->rmt_addr,
2393 		ntohs(ireq->rmt_port),
2394 		TCP_SYN_RECV,
2395 		0, 0, /* could print option size, but that is af dependent. */
2396 		1,    /* timers active (only the expire timer) */
2397 		jiffies_to_clock_t(ttd),
2398 		req->retrans,
2399 		uid,
2400 		0,  /* non standard timer */
2401 		0, /* open_requests have no inode */
2402 		atomic_read(&sk->sk_refcnt),
2403 		req,
2404 		len);
2405 }
2406 
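/* Format one listening or established socket as a /proc/net/tcp line,
 * including its queue sizes and pending timer, if any.
 */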
2407 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2408 {
2409 	int timer_active;
2410 	unsigned long timer_expires;
2411 	struct tcp_sock *tp = tcp_sk(sk);
2412 	const struct inet_connection_sock *icsk = inet_csk(sk);
2413 	struct inet_sock *inet = inet_sk(sk);
2414 	__be32 dest = inet->inet_daddr;
2415 	__be32 src = inet->inet_rcv_saddr;
2416 	__u16 destp = ntohs(inet->inet_dport);
2417 	__u16 srcp = ntohs(inet->inet_sport);
2418 	int rx_queue;
2419 
2420 	if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
2421 		timer_active	= 1;
2422 		timer_expires	= icsk->icsk_timeout;
2423 	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2424 		timer_active	= 4;
2425 		timer_expires	= icsk->icsk_timeout;
2426 	} else if (timer_pending(&sk->sk_timer)) {
2427 		timer_active	= 2;
2428 		timer_expires	= sk->sk_timer.expires;
2429 	} else {
2430 		timer_active	= 0;
2431 		timer_expires = jiffies;
2432 	}
2433 
2434 	if (sk->sk_state == TCP_LISTEN)
2435 		rx_queue = sk->sk_ack_backlog;
2436 	else
2437 		/*
2438 		 * Because we don't lock the socket, we might find a transient negative value
2439 		 */
2440 		rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2441 
2442 	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2443 			"%08X %5d %8d %lu %d %pK %lu %lu %u %u %d%n",
2444 		i, src, srcp, dest, destp, sk->sk_state,
2445 		tp->write_seq - tp->snd_una,
2446 		rx_queue,
2447 		timer_active,
2448 		jiffies_to_clock_t(timer_expires - jiffies),
2449 		icsk->icsk_retransmits,
2450 		sock_i_uid(sk),
2451 		icsk->icsk_probes_out,
2452 		sock_i_ino(sk),
2453 		atomic_read(&sk->sk_refcnt), sk,
2454 		jiffies_to_clock_t(icsk->icsk_rto),
2455 		jiffies_to_clock_t(icsk->icsk_ack.ato),
2456 		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2457 		tp->snd_cwnd,
2458 		tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh,
2459 		len);
2460 }
2461 
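/* Format one TIME_WAIT socket as a /proc/net/tcp line. */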
2462 static void get_timewait4_sock(struct inet_timewait_sock *tw,
2463 			       struct seq_file *f, int i, int *len)
2464 {
2465 	__be32 dest, src;
2466 	__u16 destp, srcp;
2467 	int ttd = tw->tw_ttd - jiffies;
2468 
2469 	if (ttd < 0)
2470 		ttd = 0;
2471 
2472 	dest  = tw->tw_daddr;
2473 	src   = tw->tw_rcv_saddr;
2474 	destp = ntohs(tw->tw_dport);
2475 	srcp  = ntohs(tw->tw_sport);
2476 
2477 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2478 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK%n",
2479 		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2480 		3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2481 		atomic_read(&tw->tw_refcnt), tw, len);
2482 }
2483 
2484 #define TMPSZ 150
2485 
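/* Emit either the header line or one socket entry, padded to TMPSZ - 1
 * characters so every line has the same width.
 */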
2486 static int tcp4_seq_show(struct seq_file *seq, void *v)
2487 {
2488 	struct tcp_iter_state *st;
2489 	int len;
2490 
2491 	if (v == SEQ_START_TOKEN) {
2492 		seq_printf(seq, "%-*s\n", TMPSZ - 1,
2493 			   "  sl  local_address rem_address   st tx_queue "
2494 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2495 			   "inode");
2496 		goto out;
2497 	}
2498 	st = seq->private;
2499 
2500 	switch (st->state) {
2501 	case TCP_SEQ_STATE_LISTENING:
2502 	case TCP_SEQ_STATE_ESTABLISHED:
2503 		get_tcp4_sock(v, seq, st->num, &len);
2504 		break;
2505 	case TCP_SEQ_STATE_OPENREQ:
2506 		get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
2507 		break;
2508 	case TCP_SEQ_STATE_TIME_WAIT:
2509 		get_timewait4_sock(v, seq, st->num, &len);
2510 		break;
2511 	}
2512 	seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
2513 out:
2514 	return 0;
2515 }
2516 
2517 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2518 	.name		= "tcp",
2519 	.family		= AF_INET,
2520 	.seq_fops	= {
2521 		.owner		= THIS_MODULE,
2522 	},
2523 	.seq_ops	= {
2524 		.show		= tcp4_seq_show,
2525 	},
2526 };
2527 
2528 static int __net_init tcp4_proc_init_net(struct net *net)
2529 {
2530 	return tcp_proc_register(net, &tcp4_seq_afinfo);
2531 }
2532 
2533 static void __net_exit tcp4_proc_exit_net(struct net *net)
2534 {
2535 	tcp_proc_unregister(net, &tcp4_seq_afinfo);
2536 }
2537 
2538 static struct pernet_operations tcp4_net_ops = {
2539 	.init = tcp4_proc_init_net,
2540 	.exit = tcp4_proc_exit_net,
2541 };
2542 
2543 int __init tcp4_proc_init(void)
2544 {
2545 	return register_pernet_subsys(&tcp4_net_ops);
2546 }
2547 
2548 void tcp4_proc_exit(void)
2549 {
2550 	unregister_pernet_subsys(&tcp4_net_ops);
2551 }
2552 #endif /* CONFIG_PROC_FS */
2553 
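/* GRO receive for TCP over IPv4: verify the pseudo-header checksum when the
 * device supplied CHECKSUM_COMPLETE, then hand the packet to the generic
 * TCP GRO code; unverifiable packets are flagged so GRO flushes them.
 */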
2554 struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
2555 {
2556 	const struct iphdr *iph = skb_gro_network_header(skb);
2557 
2558 	switch (skb->ip_summed) {
2559 	case CHECKSUM_COMPLETE:
2560 		if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr,
2561 				  skb->csum)) {
2562 			skb->ip_summed = CHECKSUM_UNNECESSARY;
2563 			break;
2564 		}
2565 
2566 		/* fall through */
2567 	case CHECKSUM_NONE:
2568 		NAPI_GRO_CB(skb)->flush = 1;
2569 		return NULL;
2570 	}
2571 
2572 	return tcp_gro_receive(head, skb);
2573 }
2574 
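/* Finish a merged GRO packet: seed th->check with the IPv4 pseudo-header
 * checksum and mark the skb for TCPv4 segmentation offload.
 */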
2575 int tcp4_gro_complete(struct sk_buff *skb)
2576 {
2577 	const struct iphdr *iph = ip_hdr(skb);
2578 	struct tcphdr *th = tcp_hdr(skb);
2579 
2580 	th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb),
2581 				  iph->saddr, iph->daddr, 0);
2582 	skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
2583 
2584 	return tcp_gro_complete(skb);
2585 }
2586 
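/* The IPv4 TCP protocol: binds TCP's socket operations into the generic
 * inet socket layer.
 */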
2587 struct proto tcp_prot = {
2588 	.name			= "TCP",
2589 	.owner			= THIS_MODULE,
2590 	.close			= tcp_close,
2591 	.connect		= tcp_v4_connect,
2592 	.disconnect		= tcp_disconnect,
2593 	.accept			= inet_csk_accept,
2594 	.ioctl			= tcp_ioctl,
2595 	.init			= tcp_v4_init_sock,
2596 	.destroy		= tcp_v4_destroy_sock,
2597 	.shutdown		= tcp_shutdown,
2598 	.setsockopt		= tcp_setsockopt,
2599 	.getsockopt		= tcp_getsockopt,
2600 	.recvmsg		= tcp_recvmsg,
2601 	.sendmsg		= tcp_sendmsg,
2602 	.sendpage		= tcp_sendpage,
2603 	.backlog_rcv		= tcp_v4_do_rcv,
2604 	.hash			= inet_hash,
2605 	.unhash			= inet_unhash,
2606 	.get_port		= inet_csk_get_port,
2607 	.enter_memory_pressure	= tcp_enter_memory_pressure,
2608 	.sockets_allocated	= &tcp_sockets_allocated,
2609 	.orphan_count		= &tcp_orphan_count,
2610 	.memory_allocated	= &tcp_memory_allocated,
2611 	.memory_pressure	= &tcp_memory_pressure,
2612 	.sysctl_mem		= sysctl_tcp_mem,
2613 	.sysctl_wmem		= sysctl_tcp_wmem,
2614 	.sysctl_rmem		= sysctl_tcp_rmem,
2615 	.max_header		= MAX_TCP_HEADER,
2616 	.obj_size		= sizeof(struct tcp_sock),
2617 	.slab_flags		= SLAB_DESTROY_BY_RCU,
2618 	.twsk_prot		= &tcp_timewait_sock_ops,
2619 	.rsk_prot		= &tcp_request_sock_ops,
2620 	.h.hashinfo		= &tcp_hashinfo,
2621 	.no_autobind		= true,
2622 #ifdef CONFIG_COMPAT
2623 	.compat_setsockopt	= compat_tcp_setsockopt,
2624 	.compat_getsockopt	= compat_tcp_getsockopt,
2625 #endif
2626 };
2627 EXPORT_SYMBOL(tcp_prot);
2628 
2629 
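/* Per-namespace setup and teardown: create the kernel control socket that
 * TCP uses to send resets and ACKs not owned by any socket, and flush any
 * remaining TIME_WAIT sockets when namespaces go away.
 */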
2630 static int __net_init tcp_sk_init(struct net *net)
2631 {
2632 	return inet_ctl_sock_create(&net->ipv4.tcp_sock,
2633 				    PF_INET, SOCK_RAW, IPPROTO_TCP, net);
2634 }
2635 
2636 static void __net_exit tcp_sk_exit(struct net *net)
2637 {
2638 	inet_ctl_sock_destroy(net->ipv4.tcp_sock);
2639 }
2640 
2641 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2642 {
2643 	inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2644 }
2645 
2646 static struct pernet_operations __net_initdata tcp_sk_ops = {
2647 	.init	   = tcp_sk_init,
2648 	.exit	   = tcp_sk_exit,
2649 	.exit_batch = tcp_sk_exit_batch,
2650 };
2651 
2652 void __init tcp_v4_init(void)
2653 {
2654 	inet_hashinfo_init(&tcp_hashinfo);
2655 	if (register_pernet_subsys(&tcp_sk_ops))
2656 		panic("Failed to create the TCP control socket.\n");
2657 }
2658