xref: /openbmc/linux/net/ipv4/tcp_ipv4.c (revision 384740dc)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		Implementation of the Transmission Control Protocol(TCP).
7  *
8  *		IPv4 specific functions
9  *
10  *
11  *		code split from:
12  *		linux/ipv4/tcp.c
13  *		linux/ipv4/tcp_input.c
14  *		linux/ipv4/tcp_output.c
15  *
16  *		See tcp.c for author information
17  *
18  *	This program is free software; you can redistribute it and/or
19  *      modify it under the terms of the GNU General Public License
20  *      as published by the Free Software Foundation; either version
21  *      2 of the License, or (at your option) any later version.
22  */
23 
24 /*
25  * Changes:
26  *		David S. Miller	:	New socket lookup architecture.
27  *					This code is dedicated to John Dyson.
28  *		David S. Miller :	Change semantics of established hash,
29  *					half is devoted to TIME_WAIT sockets
30  *					and the rest go in the other half.
31  *		Andi Kleen :		Add support for syncookies and fixed
32  *					some bugs: ip options weren't passed to
33  *					the TCP layer, missed a check for an
34  *					ACK bit.
35  *		Andi Kleen :		Implemented fast path mtu discovery.
36  *	     				Fixed many serious bugs in the
37  *					request_sock handling and moved
38  *					most of it into the af independent code.
39  *					Added tail drop and some other bugfixes.
40  *					Added new listen semantics.
41  *		Mike McLagan	:	Routing by source
42  *	Juan Jose Ciarlante:		ip_dynaddr bits
43  *		Andi Kleen:		various fixes.
44  *	Vitaly E. Lavrov	:	Transparent proxy revived after year
45  *					coma.
46  *	Andi Kleen		:	Fix new listen.
47  *	Andi Kleen		:	Fix accept error reporting.
48  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
49  *	Alexey Kuznetsov		allows both IPv4 and IPv6 sockets to bind
50  *					a single port at the same time.
51  */
52 
53 
54 #include <linux/types.h>
55 #include <linux/fcntl.h>
56 #include <linux/module.h>
57 #include <linux/random.h>
58 #include <linux/cache.h>
59 #include <linux/jhash.h>
60 #include <linux/init.h>
61 #include <linux/times.h>
62 
63 #include <net/net_namespace.h>
64 #include <net/icmp.h>
65 #include <net/inet_hashtables.h>
66 #include <net/tcp.h>
67 #include <net/transp_v6.h>
68 #include <net/ipv6.h>
69 #include <net/inet_common.h>
70 #include <net/timewait_sock.h>
71 #include <net/xfrm.h>
72 #include <net/netdma.h>
73 
74 #include <linux/inet.h>
75 #include <linux/ipv6.h>
76 #include <linux/stddef.h>
77 #include <linux/proc_fs.h>
78 #include <linux/seq_file.h>
79 
80 #include <linux/crypto.h>
81 #include <linux/scatterlist.h>
82 
83 int sysctl_tcp_tw_reuse __read_mostly;
84 int sysctl_tcp_low_latency __read_mostly;
85 
86 
87 #ifdef CONFIG_TCP_MD5SIG
88 static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk,
89 						   __be32 addr);
90 static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
91 			       __be32 daddr, __be32 saddr, struct tcphdr *th);
92 #else
93 static inline
94 struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
95 {
96 	return NULL;
97 }
98 #endif
99 
100 struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
101 	.lhash_lock  = __RW_LOCK_UNLOCKED(tcp_hashinfo.lhash_lock),
102 	.lhash_users = ATOMIC_INIT(0),
103 	.lhash_wait  = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
104 };
105 
106 static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
107 {
108 	return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
109 					  ip_hdr(skb)->saddr,
110 					  tcp_hdr(skb)->dest,
111 					  tcp_hdr(skb)->source);
112 }
113 
114 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
115 {
116 	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
117 	struct tcp_sock *tp = tcp_sk(sk);
118 
119 	/* With PAWS, it is safe from the viewpoint
120 	   of data integrity. Even without PAWS it is safe provided sequence
121 	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.
122 
123 	   Actually, the idea is close to VJ's, only the timestamp cache is
124 	   held not per host but per port pair, and the TW bucket is used as
125 	   the state holder.
126 
127 	   If TW bucket has been already destroyed we fall back to VJ's scheme
128 	   and use initial timestamp retrieved from peer table.
129 	 */
130 	if (tcptw->tw_ts_recent_stamp &&
131 	    (twp == NULL || (sysctl_tcp_tw_reuse &&
132 			     get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
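		/* Start the new connection's sequence space well beyond
		 * anything the dying TIME-WAIT socket could still have
		 * in flight. */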
133 		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
134 		if (tp->write_seq == 0)
135 			tp->write_seq = 1;
136 		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
137 		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
138 		sock_hold(sktw);
139 		return 1;
140 	}
141 
142 	return 0;
143 }
144 
145 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
146 
147 /* This will initiate an outgoing connection. */
148 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
149 {
150 	struct inet_sock *inet = inet_sk(sk);
151 	struct tcp_sock *tp = tcp_sk(sk);
152 	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
153 	struct rtable *rt;
154 	__be32 daddr, nexthop;
155 	int tmp;
156 	int err;
157 
158 	if (addr_len < sizeof(struct sockaddr_in))
159 		return -EINVAL;
160 
161 	if (usin->sin_family != AF_INET)
162 		return -EAFNOSUPPORT;
163 
164 	nexthop = daddr = usin->sin_addr.s_addr;
165 	if (inet->opt && inet->opt->srr) {
166 		if (!daddr)
167 			return -EINVAL;
168 		nexthop = inet->opt->faddr;
169 	}
170 
171 	tmp = ip_route_connect(&rt, nexthop, inet->saddr,
172 			       RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
173 			       IPPROTO_TCP,
174 			       inet->sport, usin->sin_port, sk, 1);
175 	if (tmp < 0) {
176 		if (tmp == -ENETUNREACH)
177 			IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
178 		return tmp;
179 	}
180 
181 	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
182 		ip_rt_put(rt);
183 		return -ENETUNREACH;
184 	}
185 
186 	if (!inet->opt || !inet->opt->srr)
187 		daddr = rt->rt_dst;
188 
189 	if (!inet->saddr)
190 		inet->saddr = rt->rt_src;
191 	inet->rcv_saddr = inet->saddr;
192 
193 	if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
194 		/* Reset inherited state */
195 		tp->rx_opt.ts_recent	   = 0;
196 		tp->rx_opt.ts_recent_stamp = 0;
197 		tp->write_seq		   = 0;
198 	}
199 
200 	if (tcp_death_row.sysctl_tw_recycle &&
201 	    !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
202 		struct inet_peer *peer = rt_get_peer(rt);
203 		/*
204 		 * VJ's idea. We save last timestamp seen from
205 		 * the destination in peer table, when entering state
206 		 * TIME-WAIT, and initialize rx_opt.ts_recent from it,
207 		 * when trying new connection.
208 		 */
209 		if (peer != NULL &&
210 		    peer->tcp_ts_stamp + TCP_PAWS_MSL >= get_seconds()) {
211 			tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
212 			tp->rx_opt.ts_recent = peer->tcp_ts;
213 		}
214 	}
215 
216 	inet->dport = usin->sin_port;
217 	inet->daddr = daddr;
218 
219 	inet_csk(sk)->icsk_ext_hdr_len = 0;
220 	if (inet->opt)
221 		inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;
222 
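	/* 536 is the conservative RFC 1122 default MSS: the 576-byte minimum
	 * reassembly size minus 40 bytes of IPv4 and TCP headers. */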
223 	tp->rx_opt.mss_clamp = 536;
224 
225 	/* Socket identity is still unknown (sport may be zero).
226 	 * However we set state to SYN-SENT and, without releasing the socket
227 	 * lock, select a source port, enter ourselves into the hash tables and
228 	 * complete initialization after this.
229 	 */
230 	tcp_set_state(sk, TCP_SYN_SENT);
231 	err = inet_hash_connect(&tcp_death_row, sk);
232 	if (err)
233 		goto failure;
234 
235 	err = ip_route_newports(&rt, IPPROTO_TCP,
236 				inet->sport, inet->dport, sk);
237 	if (err)
238 		goto failure;
239 
240 	/* OK, now commit destination to socket.  */
241 	sk->sk_gso_type = SKB_GSO_TCPV4;
242 	sk_setup_caps(sk, &rt->u.dst);
243 
244 	if (!tp->write_seq)
245 		tp->write_seq = secure_tcp_sequence_number(inet->saddr,
246 							   inet->daddr,
247 							   inet->sport,
248 							   usin->sin_port);
249 
250 	inet->id = tp->write_seq ^ jiffies;
251 
252 	err = tcp_connect(sk);
253 	rt = NULL;
254 	if (err)
255 		goto failure;
256 
257 	return 0;
258 
259 failure:
260 	/*
261 	 * This unhashes the socket and releases the local port,
262 	 * if necessary.
263 	 */
264 	tcp_set_state(sk, TCP_CLOSE);
265 	ip_rt_put(rt);
266 	sk->sk_route_caps = 0;
267 	inet->dport = 0;
268 	return err;
269 }
270 
271 /*
272  * This routine does path mtu discovery as defined in RFC1191.
273  */
274 static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu)
275 {
276 	struct dst_entry *dst;
277 	struct inet_sock *inet = inet_sk(sk);
278 
279 	/* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
280 	 * sent out by Linux are always < 576 bytes, so they should go through
281 	 * unfragmented).
282 	 */
283 	if (sk->sk_state == TCP_LISTEN)
284 		return;
285 
286 	/* We don't check in the dst entry if pmtu discovery is forbidden
287 	 * on this route. We just assume that no packet-too-big packets
288 	 * are sent back when pmtu discovery is not active.
289 	 * There is a small race when the user changes this flag in the
290 	 * route, but I think that's acceptable.
291 	 */
292 	if ((dst = __sk_dst_check(sk, 0)) == NULL)
293 		return;
294 
295 	dst->ops->update_pmtu(dst, mtu);
296 
297 	/* Something is about to go wrong... Remember the soft error
298 	 * for the case that this connection will not be able to recover.
299 	 */
300 	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
301 		sk->sk_err_soft = EMSGSIZE;
302 
303 	mtu = dst_mtu(dst);
304 
305 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
306 	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
307 		tcp_sync_mss(sk, mtu);
308 
309 		/* Resend the TCP packet because it's
310 		 * clear that the old packet has been
311 		 * dropped. This is the new "fast" path mtu
312 		 * discovery.
313 		 */
314 		tcp_simple_retransmit(sk);
315 	} /* else let the usual retransmit timer handle it */
316 }
317 
318 /*
319  * This routine is called by the ICMP module when it gets some
320  * sort of error condition.  If err < 0 then the socket should
321  * be closed and the error returned to the user.  If err > 0
322  * it's just the icmp type << 8 | icmp code.  After adjustment
323  * header points to the first 8 bytes of the tcp header.  We need
324  * to find the appropriate port.
325  *
326  * The locking strategy used here is very "optimistic". When
327  * someone else accesses the socket the ICMP is just dropped
328  * and for some paths there is no check at all.
329  * A more general error queue to queue errors for later handling
330  * is probably better.
331  *
332  */
333 
334 void tcp_v4_err(struct sk_buff *skb, u32 info)
335 {
336 	struct iphdr *iph = (struct iphdr *)skb->data;
337 	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
338 	struct tcp_sock *tp;
339 	struct inet_sock *inet;
340 	const int type = icmp_hdr(skb)->type;
341 	const int code = icmp_hdr(skb)->code;
342 	struct sock *sk;
343 	__u32 seq;
344 	int err;
345 	struct net *net = dev_net(skb->dev);
346 
347 	if (skb->len < (iph->ihl << 2) + 8) {
348 		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
349 		return;
350 	}
351 
352 	sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
353 			iph->saddr, th->source, inet_iif(skb));
354 	if (!sk) {
355 		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
356 		return;
357 	}
358 	if (sk->sk_state == TCP_TIME_WAIT) {
359 		inet_twsk_put(inet_twsk(sk));
360 		return;
361 	}
362 
363 	bh_lock_sock(sk);
364 	/* If too many ICMPs get dropped on busy
365 	 * servers this needs to be solved differently.
366 	 */
367 	if (sock_owned_by_user(sk))
368 		NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
369 
370 	if (sk->sk_state == TCP_CLOSE)
371 		goto out;
372 
373 	tp = tcp_sk(sk);
374 	seq = ntohl(th->seq);
375 	if (sk->sk_state != TCP_LISTEN &&
376 	    !between(seq, tp->snd_una, tp->snd_nxt)) {
377 		NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
378 		goto out;
379 	}
380 
381 	switch (type) {
382 	case ICMP_SOURCE_QUENCH:
383 		/* Just silently ignore these. */
384 		goto out;
385 	case ICMP_PARAMETERPROB:
386 		err = EPROTO;
387 		break;
388 	case ICMP_DEST_UNREACH:
389 		if (code > NR_ICMP_UNREACH)
390 			goto out;
391 
392 		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
393 			if (!sock_owned_by_user(sk))
394 				do_pmtu_discovery(sk, iph, info);
395 			goto out;
396 		}
397 
398 		err = icmp_err_convert[code].errno;
399 		break;
400 	case ICMP_TIME_EXCEEDED:
401 		err = EHOSTUNREACH;
402 		break;
403 	default:
404 		goto out;
405 	}
406 
407 	switch (sk->sk_state) {
408 		struct request_sock *req, **prev;
409 	case TCP_LISTEN:
410 		if (sock_owned_by_user(sk))
411 			goto out;
412 
413 		req = inet_csk_search_req(sk, &prev, th->dest,
414 					  iph->daddr, iph->saddr);
415 		if (!req)
416 			goto out;
417 
418 		/* ICMPs are not backlogged, hence we cannot get
419 		   an established socket here.
420 		 */
421 		WARN_ON(req->sk);
422 
423 		if (seq != tcp_rsk(req)->snt_isn) {
424 			NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
425 			goto out;
426 		}
427 
428 		/*
429 		 * Still in SYN_RECV, just remove it silently.
430 		 * There is no good way to pass the error to the newly
431 		 * created socket, and POSIX does not want network
432 		 * errors returned from accept().
433 		 */
434 		inet_csk_reqsk_queue_drop(sk, req, prev);
435 		goto out;
436 
437 	case TCP_SYN_SENT:
438 	case TCP_SYN_RECV:  /* Cannot happen.
439 			       It can, for example, happen if SYNs crossed.
440 			     */
441 		if (!sock_owned_by_user(sk)) {
442 			sk->sk_err = err;
443 
444 			sk->sk_error_report(sk);
445 
446 			tcp_done(sk);
447 		} else {
448 			sk->sk_err_soft = err;
449 		}
450 		goto out;
451 	}
452 
453 	/* If we've already connected we will keep trying
454 	 * until we time out, or the user gives up.
455 	 *
456 	 * rfc1122 4.2.3.9 allows us to consider as hard errors
457 	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
458 	 * but it is obsoleted by pmtu discovery).
459 	 *
460 	 * Note that in the modern internet, where routing is unreliable
461 	 * and broken firewalls sit in every dark corner sending random
462 	 * errors ordered by their masters, even these two messages finally
463 	 * lose their original sense (even Linux sends invalid PORT_UNREACHs).
464 	 *
465 	 * Now we are in compliance with RFCs.
466 	 *							--ANK (980905)
467 	 */
468 
469 	inet = inet_sk(sk);
470 	if (!sock_owned_by_user(sk) && inet->recverr) {
471 		sk->sk_err = err;
472 		sk->sk_error_report(sk);
473 	} else	{ /* Only an error on timeout */
474 		sk->sk_err_soft = err;
475 	}
476 
477 out:
478 	bh_unlock_sock(sk);
479 	sock_put(sk);
480 }
481 
482 /* This routine computes an IPv4 TCP checksum. */
483 void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb)
484 {
485 	struct inet_sock *inet = inet_sk(sk);
486 	struct tcphdr *th = tcp_hdr(skb);
487 
488 	if (skb->ip_summed == CHECKSUM_PARTIAL) {
489 		th->check = ~tcp_v4_check(len, inet->saddr,
490 					  inet->daddr, 0);
491 		skb->csum_start = skb_transport_header(skb) - skb->head;
492 		skb->csum_offset = offsetof(struct tcphdr, check);
493 	} else {
494 		th->check = tcp_v4_check(len, inet->saddr, inet->daddr,
495 					 csum_partial((char *)th,
496 						      th->doff << 2,
497 						      skb->csum));
498 	}
499 }
500 
501 int tcp_v4_gso_send_check(struct sk_buff *skb)
502 {
503 	const struct iphdr *iph;
504 	struct tcphdr *th;
505 
506 	if (!pskb_may_pull(skb, sizeof(*th)))
507 		return -EINVAL;
508 
509 	iph = ip_hdr(skb);
510 	th = tcp_hdr(skb);
511 
512 	th->check = 0;
513 	th->check = ~tcp_v4_check(skb->len, iph->saddr, iph->daddr, 0);
514 	skb->csum_start = skb_transport_header(skb) - skb->head;
515 	skb->csum_offset = offsetof(struct tcphdr, check);
516 	skb->ip_summed = CHECKSUM_PARTIAL;
517 	return 0;
518 }
519 
520 /*
521  *	This routine will send an RST to the other tcp.
522  *
523  *	Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
524  *		      for the reset?
525  *	Answer: if a packet caused the RST, it is not for a socket
526  *		existing in our system; if it is matched to a socket,
527  *		it is just a duplicate segment or a bug in the other side's TCP.
528  *		So we build the reply based only on the parameters
529  *		that arrived with the segment.
530  *	Exception: precedence violation. We do not implement it in any case.
531  */
532 
533 static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
534 {
535 	struct tcphdr *th = tcp_hdr(skb);
536 	struct {
537 		struct tcphdr th;
538 #ifdef CONFIG_TCP_MD5SIG
539 		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
540 #endif
541 	} rep;
542 	struct ip_reply_arg arg;
543 #ifdef CONFIG_TCP_MD5SIG
544 	struct tcp_md5sig_key *key;
545 #endif
546 	struct net *net;
547 
548 	/* Never send a reset in response to a reset. */
549 	if (th->rst)
550 		return;
551 
552 	if (skb->rtable->rt_type != RTN_LOCAL)
553 		return;
554 
555 	/* Swap the send and the receive. */
556 	memset(&rep, 0, sizeof(rep));
557 	rep.th.dest   = th->source;
558 	rep.th.source = th->dest;
559 	rep.th.doff   = sizeof(struct tcphdr) / 4;
560 	rep.th.rst    = 1;
561 
562 	if (th->ack) {
563 		rep.th.seq = th->ack_seq;
564 	} else {
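		/* The offending segment carried no ACK: acknowledge
		 * everything it covered, counting SYN and FIN as one
		 * sequence number each. */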
565 		rep.th.ack = 1;
566 		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
567 				       skb->len - (th->doff << 2));
568 	}
569 
570 	memset(&arg, 0, sizeof(arg));
571 	arg.iov[0].iov_base = (unsigned char *)&rep;
572 	arg.iov[0].iov_len  = sizeof(rep.th);
573 
574 #ifdef CONFIG_TCP_MD5SIG
575 	key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr) : NULL;
576 	if (key) {
577 		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
578 				   (TCPOPT_NOP << 16) |
579 				   (TCPOPT_MD5SIG << 8) |
580 				   TCPOLEN_MD5SIG);
581 		/* Update length and the length the header thinks exists */
582 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
583 		rep.th.doff = arg.iov[0].iov_len / 4;
584 
585 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
586 				     key, ip_hdr(skb)->daddr,
587 				     ip_hdr(skb)->saddr, &rep.th);
588 	}
589 #endif
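	/* Seed the checksum with the TCP pseudo-header; ip_send_reply()
	 * is expected to fold in the payload and store the result at
	 * csumoffset. */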
590 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
591 				      ip_hdr(skb)->saddr, /* XXX */
592 				      sizeof(struct tcphdr), IPPROTO_TCP, 0);
593 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
594 
595 	net = dev_net(skb->dst->dev);
596 	ip_send_reply(net->ipv4.tcp_sock, skb,
597 		      &arg, arg.iov[0].iov_len);
598 
599 	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
600 	TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
601 }
602 
603 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
604    outside of socket context, is certainly ugly. What can I do?
605  */
606 
607 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
608 			    u32 win, u32 ts, int oif,
609 			    struct tcp_md5sig_key *key)
610 {
611 	struct tcphdr *th = tcp_hdr(skb);
612 	struct {
613 		struct tcphdr th;
614 		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
615 #ifdef CONFIG_TCP_MD5SIG
616 			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
617 #endif
618 			];
619 	} rep;
620 	struct ip_reply_arg arg;
621 	struct net *net = dev_net(skb->dst->dev);
622 
623 	memset(&rep.th, 0, sizeof(struct tcphdr));
624 	memset(&arg, 0, sizeof(arg));
625 
626 	arg.iov[0].iov_base = (unsigned char *)&rep;
627 	arg.iov[0].iov_len  = sizeof(rep.th);
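	/* When a timestamp echo is requested, emit an aligned
	 * TCPOPT_TIMESTAMP option: two leading NOPs, our tcp_time_stamp
	 * as TSval and ts as TSecr. */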
628 	if (ts) {
629 		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
630 				   (TCPOPT_TIMESTAMP << 8) |
631 				   TCPOLEN_TIMESTAMP);
632 		rep.opt[1] = htonl(tcp_time_stamp);
633 		rep.opt[2] = htonl(ts);
634 		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
635 	}
636 
637 	/* Swap the send and the receive. */
638 	rep.th.dest    = th->source;
639 	rep.th.source  = th->dest;
640 	rep.th.doff    = arg.iov[0].iov_len / 4;
641 	rep.th.seq     = htonl(seq);
642 	rep.th.ack_seq = htonl(ack);
643 	rep.th.ack     = 1;
644 	rep.th.window  = htons(win);
645 
646 #ifdef CONFIG_TCP_MD5SIG
647 	if (key) {
648 		int offset = (ts) ? 3 : 0;
649 
650 		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
651 					  (TCPOPT_NOP << 16) |
652 					  (TCPOPT_MD5SIG << 8) |
653 					  TCPOLEN_MD5SIG);
654 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
655 		rep.th.doff = arg.iov[0].iov_len/4;
656 
657 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
658 				    key, ip_hdr(skb)->saddr,
659 				    ip_hdr(skb)->daddr, &rep.th);
660 	}
661 #endif
662 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
663 				      ip_hdr(skb)->saddr, /* XXX */
664 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
665 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
666 	if (oif)
667 		arg.bound_dev_if = oif;
668 
669 	ip_send_reply(net->ipv4.tcp_sock, skb,
670 		      &arg, arg.iov[0].iov_len);
671 
672 	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
673 }
674 
675 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
676 {
677 	struct inet_timewait_sock *tw = inet_twsk(sk);
678 	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
679 
680 	tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
681 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
682 			tcptw->tw_ts_recent,
683 			tw->tw_bound_dev_if,
684 			tcp_twsk_md5_key(tcptw)
685 			);
686 
687 	inet_twsk_put(tw);
688 }
689 
690 static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
691 				  struct request_sock *req)
692 {
693 	tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1,
694 			tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
695 			req->ts_recent,
696 			0,
697 			tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr));
698 }
699 
700 /*
701  *	Send a SYN-ACK after having received a SYN.
702  *	This still operates on a request_sock only, not on a big
703  *	socket.
704  */
705 static int __tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
706 				struct dst_entry *dst)
707 {
708 	const struct inet_request_sock *ireq = inet_rsk(req);
709 	int err = -1;
710 	struct sk_buff * skb;
711 
712 	/* First, grab a route. */
713 	if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
714 		return -1;
715 
716 	skb = tcp_make_synack(sk, dst, req);
717 
718 	if (skb) {
719 		struct tcphdr *th = tcp_hdr(skb);
720 
721 		th->check = tcp_v4_check(skb->len,
722 					 ireq->loc_addr,
723 					 ireq->rmt_addr,
724 					 csum_partial((char *)th, skb->len,
725 						      skb->csum));
726 
727 		err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
728 					    ireq->rmt_addr,
729 					    ireq->opt);
730 		err = net_xmit_eval(err);
731 	}
732 
733 	dst_release(dst);
734 	return err;
735 }
736 
737 static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req)
738 {
739 	return __tcp_v4_send_synack(sk, req, NULL);
740 }
741 
742 /*
743  *	IPv4 request_sock destructor.
744  */
745 static void tcp_v4_reqsk_destructor(struct request_sock *req)
746 {
747 	kfree(inet_rsk(req)->opt);
748 }
749 
750 #ifdef CONFIG_SYN_COOKIES
751 static void syn_flood_warning(struct sk_buff *skb)
752 {
753 	static unsigned long warntime;
754 
755 	if (time_after(jiffies, (warntime + HZ * 60))) {
756 		warntime = jiffies;
757 		printk(KERN_INFO
758 		       "possible SYN flooding on port %d. Sending cookies.\n",
759 		       ntohs(tcp_hdr(skb)->dest));
760 	}
761 }
762 #endif
763 
764 /*
765  * Save and compile IPv4 options into the request_sock if needed.
766  */
767 static struct ip_options *tcp_v4_save_options(struct sock *sk,
768 					      struct sk_buff *skb)
769 {
770 	struct ip_options *opt = &(IPCB(skb)->opt);
771 	struct ip_options *dopt = NULL;
772 
773 	if (opt && opt->optlen) {
774 		int opt_size = optlength(opt);
775 		dopt = kmalloc(opt_size, GFP_ATOMIC);
776 		if (dopt) {
777 			if (ip_options_echo(dopt, skb)) {
778 				kfree(dopt);
779 				dopt = NULL;
780 			}
781 		}
782 	}
783 	return dopt;
784 }
785 
786 #ifdef CONFIG_TCP_MD5SIG
787 /*
788  * RFC2385 MD5 checksumming requires a mapping of
789  * IP address->MD5 Key.
790  * We need to maintain these in the sk structure.
791  */
792 
793 /* Find the Key structure for an address.  */
794 static struct tcp_md5sig_key *
795 			tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
796 {
797 	struct tcp_sock *tp = tcp_sk(sk);
798 	int i;
799 
800 	if (!tp->md5sig_info || !tp->md5sig_info->entries4)
801 		return NULL;
802 	for (i = 0; i < tp->md5sig_info->entries4; i++) {
803 		if (tp->md5sig_info->keys4[i].addr == addr)
804 			return &tp->md5sig_info->keys4[i].base;
805 	}
806 	return NULL;
807 }
808 
809 struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
810 					 struct sock *addr_sk)
811 {
812 	return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->daddr);
813 }
814 
815 EXPORT_SYMBOL(tcp_v4_md5_lookup);
816 
817 static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
818 						      struct request_sock *req)
819 {
820 	return tcp_v4_md5_do_lookup(sk, inet_rsk(req)->rmt_addr);
821 }
822 
823 /* This can be called on a newly created socket, from other files */
824 int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
825 		      u8 *newkey, u8 newkeylen)
826 {
827 	/* Add Key to the list */
828 	struct tcp_md5sig_key *key;
829 	struct tcp_sock *tp = tcp_sk(sk);
830 	struct tcp4_md5sig_key *keys;
831 
832 	key = tcp_v4_md5_do_lookup(sk, addr);
833 	if (key) {
834 		/* Pre-existing entry - just update that one. */
835 		kfree(key->key);
836 		key->key = newkey;
837 		key->keylen = newkeylen;
838 	} else {
839 		struct tcp_md5sig_info *md5sig;
840 
841 		if (!tp->md5sig_info) {
842 			tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info),
843 						  GFP_ATOMIC);
844 			if (!tp->md5sig_info) {
845 				kfree(newkey);
846 				return -ENOMEM;
847 			}
848 			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
849 		}
850 		if (tcp_alloc_md5sig_pool() == NULL) {
851 			kfree(newkey);
852 			return -ENOMEM;
853 		}
854 		md5sig = tp->md5sig_info;
855 
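		/* If the flat key array is full, grow it by one slot,
		 * copying the old entries across before freeing the old
		 * array. */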
856 		if (md5sig->alloced4 == md5sig->entries4) {
857 			keys = kmalloc((sizeof(*keys) *
858 					(md5sig->entries4 + 1)), GFP_ATOMIC);
859 			if (!keys) {
860 				kfree(newkey);
861 				tcp_free_md5sig_pool();
862 				return -ENOMEM;
863 			}
864 
865 			if (md5sig->entries4)
866 				memcpy(keys, md5sig->keys4,
867 				       sizeof(*keys) * md5sig->entries4);
868 
869 			/* Free old key list, and reference new one */
870 			kfree(md5sig->keys4);
871 			md5sig->keys4 = keys;
872 			md5sig->alloced4++;
873 		}
874 		md5sig->entries4++;
875 		md5sig->keys4[md5sig->entries4 - 1].addr        = addr;
876 		md5sig->keys4[md5sig->entries4 - 1].base.key    = newkey;
877 		md5sig->keys4[md5sig->entries4 - 1].base.keylen = newkeylen;
878 	}
879 	return 0;
880 }
881 
882 EXPORT_SYMBOL(tcp_v4_md5_do_add);
883 
884 static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
885 			       u8 *newkey, u8 newkeylen)
886 {
887 	return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->daddr,
888 				 newkey, newkeylen);
889 }
890 
891 int tcp_v4_md5_do_del(struct sock *sk, __be32 addr)
892 {
893 	struct tcp_sock *tp = tcp_sk(sk);
894 	int i;
895 
896 	for (i = 0; i < tp->md5sig_info->entries4; i++) {
897 		if (tp->md5sig_info->keys4[i].addr == addr) {
898 			/* Free the key */
899 			kfree(tp->md5sig_info->keys4[i].base.key);
900 			tp->md5sig_info->entries4--;
901 
902 			if (tp->md5sig_info->entries4 == 0) {
903 				kfree(tp->md5sig_info->keys4);
904 				tp->md5sig_info->keys4 = NULL;
905 				tp->md5sig_info->alloced4 = 0;
906 			} else if (tp->md5sig_info->entries4 != i) {
907 				/* Need to do some manipulation */
908 				memmove(&tp->md5sig_info->keys4[i],
909 					&tp->md5sig_info->keys4[i+1],
910 					(tp->md5sig_info->entries4 - i) *
911 					 sizeof(struct tcp4_md5sig_key));
912 			}
913 			tcp_free_md5sig_pool();
914 			return 0;
915 		}
916 	}
917 	return -ENOENT;
918 }
919 
920 EXPORT_SYMBOL(tcp_v4_md5_do_del);
921 
922 static void tcp_v4_clear_md5_list(struct sock *sk)
923 {
924 	struct tcp_sock *tp = tcp_sk(sk);
925 
926 	/* Free each key, then the set of keys,
927 	 * the crypto element, and then decrement our
928 	 * hold on the last resort crypto.
929 	 */
930 	if (tp->md5sig_info->entries4) {
931 		int i;
932 		for (i = 0; i < tp->md5sig_info->entries4; i++)
933 			kfree(tp->md5sig_info->keys4[i].base.key);
934 		tp->md5sig_info->entries4 = 0;
935 		tcp_free_md5sig_pool();
936 	}
937 	if (tp->md5sig_info->keys4) {
938 		kfree(tp->md5sig_info->keys4);
939 		tp->md5sig_info->keys4 = NULL;
940 		tp->md5sig_info->alloced4  = 0;
941 	}
942 }
943 
944 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
945 				 int optlen)
946 {
947 	struct tcp_md5sig cmd;
948 	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
949 	u8 *newkey;
950 
951 	if (optlen < sizeof(cmd))
952 		return -EINVAL;
953 
954 	if (copy_from_user(&cmd, optval, sizeof(cmd)))
955 		return -EFAULT;
956 
957 	if (sin->sin_family != AF_INET)
958 		return -EINVAL;
959 
960 	if (!cmd.tcpm_key || !cmd.tcpm_keylen) {
961 		if (!tcp_sk(sk)->md5sig_info)
962 			return -ENOENT;
963 		return tcp_v4_md5_do_del(sk, sin->sin_addr.s_addr);
964 	}
965 
966 	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
967 		return -EINVAL;
968 
969 	if (!tcp_sk(sk)->md5sig_info) {
970 		struct tcp_sock *tp = tcp_sk(sk);
971 		struct tcp_md5sig_info *p = kzalloc(sizeof(*p), GFP_KERNEL);
972 
973 		if (!p)
974 			return -EINVAL;
975 
976 		tp->md5sig_info = p;
977 		sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
978 	}
979 
980 	newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
981 	if (!newkey)
982 		return -ENOMEM;
983 	return tcp_v4_md5_do_add(sk, sin->sin_addr.s_addr,
984 				 newkey, cmd.tcpm_keylen);
985 }
986 
987 static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
988 					__be32 daddr, __be32 saddr, int nbytes)
989 {
990 	struct tcp4_pseudohdr *bp;
991 	struct scatterlist sg;
992 
993 	bp = &hp->md5_blk.ip4;
994 
995 	/*
996 	 * 1. the TCP pseudo-header (in the order: source IP address,
997 	 * destination IP address, zero-padded protocol number, and
998 	 * segment length)
999 	 */
1000 	bp->saddr = saddr;
1001 	bp->daddr = daddr;
1002 	bp->pad = 0;
1003 	bp->protocol = IPPROTO_TCP;
1004 	bp->len = cpu_to_be16(nbytes);
1005 
1006 	sg_init_one(&sg, bp, sizeof(*bp));
1007 	return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
1008 }
1009 
1010 static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
1011 			       __be32 daddr, __be32 saddr, struct tcphdr *th)
1012 {
1013 	struct tcp_md5sig_pool *hp;
1014 	struct hash_desc *desc;
1015 
1016 	hp = tcp_get_md5sig_pool();
1017 	if (!hp)
1018 		goto clear_hash_noput;
1019 	desc = &hp->md5_desc;
1020 
1021 	if (crypto_hash_init(desc))
1022 		goto clear_hash;
1023 	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
1024 		goto clear_hash;
1025 	if (tcp_md5_hash_header(hp, th))
1026 		goto clear_hash;
1027 	if (tcp_md5_hash_key(hp, key))
1028 		goto clear_hash;
1029 	if (crypto_hash_final(desc, md5_hash))
1030 		goto clear_hash;
1031 
1032 	tcp_put_md5sig_pool();
1033 	return 0;
1034 
1035 clear_hash:
1036 	tcp_put_md5sig_pool();
1037 clear_hash_noput:
1038 	memset(md5_hash, 0, 16);
1039 	return 1;
1040 }
1041 
1042 int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
1043 			struct sock *sk, struct request_sock *req,
1044 			struct sk_buff *skb)
1045 {
1046 	struct tcp_md5sig_pool *hp;
1047 	struct hash_desc *desc;
1048 	struct tcphdr *th = tcp_hdr(skb);
1049 	__be32 saddr, daddr;
1050 
1051 	if (sk) {
1052 		saddr = inet_sk(sk)->saddr;
1053 		daddr = inet_sk(sk)->daddr;
1054 	} else if (req) {
1055 		saddr = inet_rsk(req)->loc_addr;
1056 		daddr = inet_rsk(req)->rmt_addr;
1057 	} else {
1058 		const struct iphdr *iph = ip_hdr(skb);
1059 		saddr = iph->saddr;
1060 		daddr = iph->daddr;
1061 	}
1062 
1063 	hp = tcp_get_md5sig_pool();
1064 	if (!hp)
1065 		goto clear_hash_noput;
1066 	desc = &hp->md5_desc;
1067 
1068 	if (crypto_hash_init(desc))
1069 		goto clear_hash;
1070 
1071 	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
1072 		goto clear_hash;
1073 	if (tcp_md5_hash_header(hp, th))
1074 		goto clear_hash;
1075 	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1076 		goto clear_hash;
1077 	if (tcp_md5_hash_key(hp, key))
1078 		goto clear_hash;
1079 	if (crypto_hash_final(desc, md5_hash))
1080 		goto clear_hash;
1081 
1082 	tcp_put_md5sig_pool();
1083 	return 0;
1084 
1085 clear_hash:
1086 	tcp_put_md5sig_pool();
1087 clear_hash_noput:
1088 	memset(md5_hash, 0, 16);
1089 	return 1;
1090 }
1091 
1092 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1093 
1094 static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
1095 {
1096 	/*
1097 	 * This gets called for each TCP segment that arrives
1098 	 * so we want to be efficient.
1099 	 * We have 3 drop cases:
1100 	 * o No MD5 hash and one expected.
1101 	 * o MD5 hash and we're not expecting one.
1102 	 * o MD5 hash and it's wrong.
1103 	 */
1104 	__u8 *hash_location = NULL;
1105 	struct tcp_md5sig_key *hash_expected;
1106 	const struct iphdr *iph = ip_hdr(skb);
1107 	struct tcphdr *th = tcp_hdr(skb);
1108 	int genhash;
1109 	unsigned char newhash[16];
1110 
1111 	hash_expected = tcp_v4_md5_do_lookup(sk, iph->saddr);
1112 	hash_location = tcp_parse_md5sig_option(th);
1113 
1114 	/* We've parsed the options - do we have a hash? */
1115 	if (!hash_expected && !hash_location)
1116 		return 0;
1117 
1118 	if (hash_expected && !hash_location) {
1119 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1120 		return 1;
1121 	}
1122 
1123 	if (!hash_expected && hash_location) {
1124 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1125 		return 1;
1126 	}
1127 
1128 	/* Okay, we have both hash_expected and hash_location,
1129 	 * so we need to calculate the hash and compare.
1130 	 */
1131 	genhash = tcp_v4_md5_hash_skb(newhash,
1132 				      hash_expected,
1133 				      NULL, NULL, skb);
1134 
1135 	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1136 		if (net_ratelimit()) {
1137 			printk(KERN_INFO "MD5 Hash failed for "
1138 			       "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)%s\n",
1139 			       NIPQUAD(iph->saddr), ntohs(th->source),
1140 			       NIPQUAD(iph->daddr), ntohs(th->dest),
1141 			       genhash ? " tcp_v4_calc_md5_hash failed" : "");
1142 		}
1143 		return 1;
1144 	}
1145 	return 0;
1146 }
1147 
1148 #endif
1149 
1150 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1151 	.family		=	PF_INET,
1152 	.obj_size	=	sizeof(struct tcp_request_sock),
1153 	.rtx_syn_ack	=	tcp_v4_send_synack,
1154 	.send_ack	=	tcp_v4_reqsk_send_ack,
1155 	.destructor	=	tcp_v4_reqsk_destructor,
1156 	.send_reset	=	tcp_v4_send_reset,
1157 };
1158 
1159 #ifdef CONFIG_TCP_MD5SIG
1160 static struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1161 	.md5_lookup	=	tcp_v4_reqsk_md5_lookup,
1162 };
1163 #endif
1164 
1165 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1166 	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
1167 	.twsk_unique	= tcp_twsk_unique,
1168 	.twsk_destructor= tcp_twsk_destructor,
1169 };
1170 
1171 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1172 {
1173 	struct inet_request_sock *ireq;
1174 	struct tcp_options_received tmp_opt;
1175 	struct request_sock *req;
1176 	__be32 saddr = ip_hdr(skb)->saddr;
1177 	__be32 daddr = ip_hdr(skb)->daddr;
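	/* A non-zero ->when here carries an initial sequence number handed
	 * over when this SYN hit a live TIME-WAIT bucket; see the comment
	 * further below. */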
1178 	__u32 isn = TCP_SKB_CB(skb)->when;
1179 	struct dst_entry *dst = NULL;
1180 #ifdef CONFIG_SYN_COOKIES
1181 	int want_cookie = 0;
1182 #else
1183 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1184 #endif
1185 
1186 	/* Never answer SYNs sent to broadcast or multicast */
1187 	if (skb->rtable->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1188 		goto drop;
1189 
1190 	/* TW buckets are converted to open requests without
1191 	 * limitations; they conserve resources and the peer is
1192 	 * evidently a real one.
1193 	 */
1194 	if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
1195 #ifdef CONFIG_SYN_COOKIES
1196 		if (sysctl_tcp_syncookies) {
1197 			want_cookie = 1;
1198 		} else
1199 #endif
1200 		goto drop;
1201 	}
1202 
1203 	/* Accept backlog is full. If we have already queued enough
1204 	 * of warm entries in the syn queue, drop the request. It is better than
1205 	 * clogging the syn queue with openreqs with exponentially increasing
1206 	 * timeouts.
1207 	 */
1208 	if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
1209 		goto drop;
1210 
1211 	req = inet_reqsk_alloc(&tcp_request_sock_ops);
1212 	if (!req)
1213 		goto drop;
1214 
1215 #ifdef CONFIG_TCP_MD5SIG
1216 	tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
1217 #endif
1218 
1219 	tcp_clear_options(&tmp_opt);
1220 	tmp_opt.mss_clamp = 536;
1221 	tmp_opt.user_mss  = tcp_sk(sk)->rx_opt.user_mss;
1222 
1223 	tcp_parse_options(skb, &tmp_opt, 0);
1224 
1225 	if (want_cookie && !tmp_opt.saw_tstamp)
1226 		tcp_clear_options(&tmp_opt);
1227 
1228 	if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
1229 		/* Some OSes (unknown ones, but I see them on web servers which
1230 		 * contain information interesting only to Windows
1231 		 * users) do not send their timestamp in the SYN. It is an easy case:
1232 		 * we simply do not advertise TS support.
1233 		 */
1234 		tmp_opt.saw_tstamp = 0;
1235 		tmp_opt.tstamp_ok  = 0;
1236 	}
1237 	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1238 
1239 	tcp_openreq_init(req, &tmp_opt, skb);
1240 
1241 	if (security_inet_conn_request(sk, skb, req))
1242 		goto drop_and_free;
1243 
1244 	ireq = inet_rsk(req);
1245 	ireq->loc_addr = daddr;
1246 	ireq->rmt_addr = saddr;
1247 	ireq->opt = tcp_v4_save_options(sk, skb);
1248 	if (!want_cookie)
1249 		TCP_ECN_create_request(req, tcp_hdr(skb));
1250 
1251 	if (want_cookie) {
1252 #ifdef CONFIG_SYN_COOKIES
1253 		syn_flood_warning(skb);
1254 		req->cookie_ts = tmp_opt.tstamp_ok;
1255 #endif
1256 		isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1257 	} else if (!isn) {
1258 		struct inet_peer *peer = NULL;
1259 
1260 		/* VJ's idea. We save last timestamp seen
1261 		 * from the destination in peer table, when entering
1262 		 * state TIME-WAIT, and check against it before
1263 		 * accepting new connection request.
1264 		 *
1265 		 * If "isn" is not zero, this request hit alive
1266 		 * timewait bucket, so that all the necessary checks
1267 		 * are made in the function processing timewait state.
1268 		 */
1269 		if (tmp_opt.saw_tstamp &&
1270 		    tcp_death_row.sysctl_tw_recycle &&
1271 		    (dst = inet_csk_route_req(sk, req)) != NULL &&
1272 		    (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1273 		    peer->v4daddr == saddr) {
1274 			if (get_seconds() < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
1275 			    (s32)(peer->tcp_ts - req->ts_recent) >
1276 							TCP_PAWS_WINDOW) {
1277 				NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
1278 				goto drop_and_release;
1279 			}
1280 		}
1281 		/* Kill the following clause, if you dislike this way. */
1282 		else if (!sysctl_tcp_syncookies &&
1283 			 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
1284 			  (sysctl_max_syn_backlog >> 2)) &&
1285 			 (!peer || !peer->tcp_ts_stamp) &&
1286 			 (!dst || !dst_metric(dst, RTAX_RTT))) {
1287 			/* Without syncookies, the last quarter of the
1288 			 * backlog is filled with destinations
1289 			 * proven to be alive.
1290 			 * It means that we continue to communicate
1291 			 * with destinations already remembered
1292 			 * at the moment of the synflood.
1293 			 */
1294 			LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open "
1295 				       "request from " NIPQUAD_FMT "/%u\n",
1296 				       NIPQUAD(saddr),
1297 				       ntohs(tcp_hdr(skb)->source));
1298 			goto drop_and_release;
1299 		}
1300 
1301 		isn = tcp_v4_init_sequence(skb);
1302 	}
1303 	tcp_rsk(req)->snt_isn = isn;
1304 
1305 	if (__tcp_v4_send_synack(sk, req, dst) || want_cookie)
1306 		goto drop_and_free;
1307 
1308 	inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1309 	return 0;
1310 
1311 drop_and_release:
1312 	dst_release(dst);
1313 drop_and_free:
1314 	reqsk_free(req);
1315 drop:
1316 	return 0;
1317 }
1318 
1319 
1320 /*
1321  * The three way handshake has completed - we got a valid synack -
1322  * now create the new socket.
1323  */
1324 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1325 				  struct request_sock *req,
1326 				  struct dst_entry *dst)
1327 {
1328 	struct inet_request_sock *ireq;
1329 	struct inet_sock *newinet;
1330 	struct tcp_sock *newtp;
1331 	struct sock *newsk;
1332 #ifdef CONFIG_TCP_MD5SIG
1333 	struct tcp_md5sig_key *key;
1334 #endif
1335 
1336 	if (sk_acceptq_is_full(sk))
1337 		goto exit_overflow;
1338 
1339 	if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
1340 		goto exit;
1341 
1342 	newsk = tcp_create_openreq_child(sk, req, skb);
1343 	if (!newsk)
1344 		goto exit;
1345 
1346 	newsk->sk_gso_type = SKB_GSO_TCPV4;
1347 	sk_setup_caps(newsk, dst);
1348 
1349 	newtp		      = tcp_sk(newsk);
1350 	newinet		      = inet_sk(newsk);
1351 	ireq		      = inet_rsk(req);
1352 	newinet->daddr	      = ireq->rmt_addr;
1353 	newinet->rcv_saddr    = ireq->loc_addr;
1354 	newinet->saddr	      = ireq->loc_addr;
1355 	newinet->opt	      = ireq->opt;
1356 	ireq->opt	      = NULL;
1357 	newinet->mc_index     = inet_iif(skb);
1358 	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1359 	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1360 	if (newinet->opt)
1361 		inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
1362 	newinet->id = newtp->write_seq ^ jiffies;
1363 
1364 	tcp_mtup_init(newsk);
1365 	tcp_sync_mss(newsk, dst_mtu(dst));
1366 	newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1367 	tcp_initialize_rcv_mss(newsk);
1368 
1369 #ifdef CONFIG_TCP_MD5SIG
1370 	/* Copy over the MD5 key from the original socket */
1371 	if ((key = tcp_v4_md5_do_lookup(sk, newinet->daddr)) != NULL) {
1372 		/*
1373 		 * We're using one, so create a matching key
1374 		 * on the newsk structure. If we fail to get
1375 		 * memory, then we end up not copying the key
1376 		 * across. Shucks.
1377 		 */
1378 		char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);
1379 		if (newkey != NULL)
1380 			tcp_v4_md5_do_add(newsk, inet_sk(sk)->daddr,
1381 					  newkey, key->keylen);
1382 		newsk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1383 	}
1384 #endif
1385 
1386 	__inet_hash_nolisten(newsk);
1387 	__inet_inherit_port(sk, newsk);
1388 
1389 	return newsk;
1390 
1391 exit_overflow:
1392 	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1393 exit:
1394 	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1395 	dst_release(dst);
1396 	return NULL;
1397 }
1398 
1399 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1400 {
1401 	struct tcphdr *th = tcp_hdr(skb);
1402 	const struct iphdr *iph = ip_hdr(skb);
1403 	struct sock *nsk;
1404 	struct request_sock **prev;
1405 	/* Find possible connection requests. */
1406 	struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1407 						       iph->saddr, iph->daddr);
1408 	if (req)
1409 		return tcp_check_req(sk, skb, req, prev);
1410 
1411 	nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
1412 			th->source, iph->daddr, th->dest, inet_iif(skb));
1413 
1414 	if (nsk) {
1415 		if (nsk->sk_state != TCP_TIME_WAIT) {
1416 			bh_lock_sock(nsk);
1417 			return nsk;
1418 		}
1419 		inet_twsk_put(inet_twsk(nsk));
1420 		return NULL;
1421 	}
1422 
1423 #ifdef CONFIG_SYN_COOKIES
1424 	if (!th->rst && !th->syn && th->ack)
1425 		sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1426 #endif
1427 	return sk;
1428 }
1429 
1430 static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1431 {
1432 	const struct iphdr *iph = ip_hdr(skb);
1433 
1434 	if (skb->ip_summed == CHECKSUM_COMPLETE) {
1435 		if (!tcp_v4_check(skb->len, iph->saddr,
1436 				  iph->daddr, skb->csum)) {
1437 			skb->ip_summed = CHECKSUM_UNNECESSARY;
1438 			return 0;
1439 		}
1440 	}
1441 
1442 	skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
1443 				       skb->len, IPPROTO_TCP, 0);
1444 
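	/* Short segments are cheap to verify immediately; longer ones keep
	 * the seeded csum and are checked later, when the data is actually
	 * consumed. */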
1445 	if (skb->len <= 76) {
1446 		return __skb_checksum_complete(skb);
1447 	}
1448 	return 0;
1449 }
1450 
1451 
1452 /* The socket must have its spinlock held when we get
1453  * here.
1454  *
1455  * We have a potential double-lock case here, so even when
1456  * doing backlog processing we use the BH locking scheme.
1457  * This is because we cannot sleep with the original spinlock
1458  * held.
1459  */
1460 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1461 {
1462 	struct sock *rsk;
1463 #ifdef CONFIG_TCP_MD5SIG
1464 	/*
1465 	 * We really want to reject the packet as early as possible
1466 	 * if:
1467 	 *  o We're expecting an MD5'd packet and there is no MD5 TCP option
1468 	 *  o There is an MD5 option and we're not expecting one
1469 	 */
1470 	if (tcp_v4_inbound_md5_hash(sk, skb))
1471 		goto discard;
1472 #endif
1473 
1474 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1475 		TCP_CHECK_TIMER(sk);
1476 		if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1477 			rsk = sk;
1478 			goto reset;
1479 		}
1480 		TCP_CHECK_TIMER(sk);
1481 		return 0;
1482 	}
1483 
1484 	if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1485 		goto csum_err;
1486 
1487 	if (sk->sk_state == TCP_LISTEN) {
1488 		struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1489 		if (!nsk)
1490 			goto discard;
1491 
1492 		if (nsk != sk) {
1493 			if (tcp_child_process(sk, nsk, skb)) {
1494 				rsk = nsk;
1495 				goto reset;
1496 			}
1497 			return 0;
1498 		}
1499 	}
1500 
1501 	TCP_CHECK_TIMER(sk);
1502 	if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1503 		rsk = sk;
1504 		goto reset;
1505 	}
1506 	TCP_CHECK_TIMER(sk);
1507 	return 0;
1508 
1509 reset:
1510 	tcp_v4_send_reset(rsk, skb);
1511 discard:
1512 	kfree_skb(skb);
1513 	/* Be careful here. If this function gets more complicated and
1514 	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1515 	 * might be destroyed here. This current version compiles correctly,
1516 	 * but you have been warned.
1517 	 */
1518 	return 0;
1519 
1520 csum_err:
1521 	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1522 	goto discard;
1523 }
1524 
1525 /*
1526  *	From tcp_input.c
1527  */
1528 
1529 int tcp_v4_rcv(struct sk_buff *skb)
1530 {
1531 	const struct iphdr *iph;
1532 	struct tcphdr *th;
1533 	struct sock *sk;
1534 	int ret;
1535 	struct net *net = dev_net(skb->dev);
1536 
1537 	if (skb->pkt_type != PACKET_HOST)
1538 		goto discard_it;
1539 
1540 	/* Count it even if it's bad */
1541 	TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1542 
1543 	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1544 		goto discard_it;
1545 
1546 	th = tcp_hdr(skb);
1547 
1548 	if (th->doff < sizeof(struct tcphdr) / 4)
1549 		goto bad_packet;
1550 	if (!pskb_may_pull(skb, th->doff * 4))
1551 		goto discard_it;
1552 
1553 	/* An explanation is required here, I think.
1554 	 * Packet length and doff are validated by header prediction,
1555 	 * provided the case of th->doff==0 is eliminated.
1556 	 * So, we defer the checks. */
1557 	if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
1558 		goto bad_packet;
1559 
1560 	th = tcp_hdr(skb);
1561 	iph = ip_hdr(skb);
1562 	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1563 	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1564 				    skb->len - th->doff * 4);
1565 	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1566 	TCP_SKB_CB(skb)->when	 = 0;
1567 	TCP_SKB_CB(skb)->flags	 = iph->tos;
1568 	TCP_SKB_CB(skb)->sacked	 = 0;
1569 
1570 	sk = __inet_lookup(net, &tcp_hashinfo, iph->saddr,
1571 			th->source, iph->daddr, th->dest, inet_iif(skb));
1572 	if (!sk)
1573 		goto no_tcp_socket;
1574 
1575 process:
1576 	if (sk->sk_state == TCP_TIME_WAIT)
1577 		goto do_time_wait;
1578 
1579 	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1580 		goto discard_and_relse;
1581 	nf_reset(skb);
1582 
1583 	if (sk_filter(sk, skb))
1584 		goto discard_and_relse;
1585 
1586 	skb->dev = NULL;
1587 
1588 	bh_lock_sock_nested(sk);
1589 	ret = 0;
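	/* If no user context owns the socket, process the segment now
	 * (via the prequeue or directly); otherwise leave it on the
	 * backlog for the owner to handle. */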
1590 	if (!sock_owned_by_user(sk)) {
1591 #ifdef CONFIG_NET_DMA
1592 		struct tcp_sock *tp = tcp_sk(sk);
1593 		if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1594 			tp->ucopy.dma_chan = get_softnet_dma();
1595 		if (tp->ucopy.dma_chan)
1596 			ret = tcp_v4_do_rcv(sk, skb);
1597 		else
1598 #endif
1599 		{
1600 			if (!tcp_prequeue(sk, skb))
1601 				ret = tcp_v4_do_rcv(sk, skb);
1602 		}
1603 	} else
1604 		sk_add_backlog(sk, skb);
1605 	bh_unlock_sock(sk);
1606 
1607 	sock_put(sk);
1608 
1609 	return ret;
1610 
1611 no_tcp_socket:
1612 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1613 		goto discard_it;
1614 
1615 	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1616 bad_packet:
1617 		TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1618 	} else {
1619 		tcp_v4_send_reset(NULL, skb);
1620 	}
1621 
1622 discard_it:
1623 	/* Discard frame. */
1624 	kfree_skb(skb);
1625 	return 0;
1626 
1627 discard_and_relse:
1628 	sock_put(sk);
1629 	goto discard_it;
1630 
1631 do_time_wait:
1632 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1633 		inet_twsk_put(inet_twsk(sk));
1634 		goto discard_it;
1635 	}
1636 
1637 	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1638 		TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1639 		inet_twsk_put(inet_twsk(sk));
1640 		goto discard_it;
1641 	}
1642 	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1643 	case TCP_TW_SYN: {
1644 		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1645 							&tcp_hashinfo,
1646 							iph->daddr, th->dest,
1647 							inet_iif(skb));
1648 		if (sk2) {
1649 			inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
1650 			inet_twsk_put(inet_twsk(sk));
1651 			sk = sk2;
1652 			goto process;
1653 		}
1654 		/* Fall through to ACK */
1655 	}
1656 	case TCP_TW_ACK:
1657 		tcp_v4_timewait_ack(sk, skb);
1658 		break;
1659 	case TCP_TW_RST:
1660 		goto no_tcp_socket;
1661 	case TCP_TW_SUCCESS:;
1662 	}
1663 	goto discard_it;
1664 }
1665 
1666 /* VJ's idea. Save last timestamp seen from this destination
1667  * and hold it at least for the normal timewait interval, to use for duplicate
1668  * segment detection in subsequent connections, before they enter synchronized
1669  * state.
1670  */
1671 
1672 int tcp_v4_remember_stamp(struct sock *sk)
1673 {
1674 	struct inet_sock *inet = inet_sk(sk);
1675 	struct tcp_sock *tp = tcp_sk(sk);
1676 	struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1677 	struct inet_peer *peer = NULL;
1678 	int release_it = 0;
1679 
1680 	if (!rt || rt->rt_dst != inet->daddr) {
1681 		peer = inet_getpeer(inet->daddr, 1);
1682 		release_it = 1;
1683 	} else {
1684 		if (!rt->peer)
1685 			rt_bind_peer(rt, 1);
1686 		peer = rt->peer;
1687 	}
1688 
1689 	if (peer) {
1690 		if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1691 		    (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() &&
1692 		     peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
1693 			peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
1694 			peer->tcp_ts = tp->rx_opt.ts_recent;
1695 		}
1696 		if (release_it)
1697 			inet_putpeer(peer);
1698 		return 1;
1699 	}
1700 
1701 	return 0;
1702 }
1703 
1704 int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
1705 {
1706 	struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);
1707 
1708 	if (peer) {
1709 		const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
1710 
1711 		if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
1712 		    (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() &&
1713 		     peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) {
1714 			peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp;
1715 			peer->tcp_ts	   = tcptw->tw_ts_recent;
1716 		}
1717 		inet_putpeer(peer);
1718 		return 1;
1719 	}
1720 
1721 	return 0;
1722 }
1723 
1724 struct inet_connection_sock_af_ops ipv4_specific = {
1725 	.queue_xmit	   = ip_queue_xmit,
1726 	.send_check	   = tcp_v4_send_check,
1727 	.rebuild_header	   = inet_sk_rebuild_header,
1728 	.conn_request	   = tcp_v4_conn_request,
1729 	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
1730 	.remember_stamp	   = tcp_v4_remember_stamp,
1731 	.net_header_len	   = sizeof(struct iphdr),
1732 	.setsockopt	   = ip_setsockopt,
1733 	.getsockopt	   = ip_getsockopt,
1734 	.addr2sockaddr	   = inet_csk_addr2sockaddr,
1735 	.sockaddr_len	   = sizeof(struct sockaddr_in),
1736 	.bind_conflict	   = inet_csk_bind_conflict,
1737 #ifdef CONFIG_COMPAT
1738 	.compat_setsockopt = compat_ip_setsockopt,
1739 	.compat_getsockopt = compat_ip_getsockopt,
1740 #endif
1741 };
1742 
1743 #ifdef CONFIG_TCP_MD5SIG
1744 static struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1745 	.md5_lookup		= tcp_v4_md5_lookup,
1746 	.calc_md5_hash		= tcp_v4_md5_hash_skb,
1747 	.md5_add		= tcp_v4_md5_add_func,
1748 	.md5_parse		= tcp_v4_parse_md5_keys,
1749 };
1750 #endif
1751 
1752 /* NOTE: A lot of things are set to zero explicitly by the call to
1753  *       sk_alloc(), so they need not be done here.
1754  */
1755 static int tcp_v4_init_sock(struct sock *sk)
1756 {
1757 	struct inet_connection_sock *icsk = inet_csk(sk);
1758 	struct tcp_sock *tp = tcp_sk(sk);
1759 
1760 	skb_queue_head_init(&tp->out_of_order_queue);
1761 	tcp_init_xmit_timers(sk);
1762 	tcp_prequeue_init(tp);
1763 
1764 	icsk->icsk_rto = TCP_TIMEOUT_INIT;
1765 	tp->mdev = TCP_TIMEOUT_INIT;
1766 
1767 	/* So many TCP implementations out there (incorrectly) count the
1768 	 * initial SYN frame in their delayed-ACK and congestion control
1769 	 * algorithms that we must have the following bandaid to talk
1770 	 * efficiently to them.  -DaveM
1771 	 */
1772 	tp->snd_cwnd = 2;
1773 
1774 	/* See draft-stevens-tcpca-spec-01 for discussion of the
1775 	 * initialization of these values.
1776 	 */
1777 	tp->snd_ssthresh = 0x7fffffff;	/* Infinity */
1778 	tp->snd_cwnd_clamp = ~0;
1779 	tp->mss_cache = 536;
1780 
1781 	tp->reordering = sysctl_tcp_reordering;
1782 	icsk->icsk_ca_ops = &tcp_init_congestion_ops;
1783 
1784 	sk->sk_state = TCP_CLOSE;
1785 
1786 	sk->sk_write_space = sk_stream_write_space;
1787 	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1788 
1789 	icsk->icsk_af_ops = &ipv4_specific;
1790 	icsk->icsk_sync_mss = tcp_sync_mss;
1791 #ifdef CONFIG_TCP_MD5SIG
1792 	tp->af_specific = &tcp_sock_ipv4_specific;
1793 #endif
1794 
1795 	sk->sk_sndbuf = sysctl_tcp_wmem[1];
1796 	sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1797 
1798 	atomic_inc(&tcp_sockets_allocated);
1799 
1800 	return 0;
1801 }
1802 
1803 void tcp_v4_destroy_sock(struct sock *sk)
1804 {
1805 	struct tcp_sock *tp = tcp_sk(sk);
1806 
1807 	tcp_clear_xmit_timers(sk);
1808 
1809 	tcp_cleanup_congestion_control(sk);
1810 
1811 	/* Clean up the write buffer. */
1812 	tcp_write_queue_purge(sk);
1813 
1814 	/* Cleans up our, hopefully empty, out_of_order_queue. */
1815 	__skb_queue_purge(&tp->out_of_order_queue);
1816 
1817 #ifdef CONFIG_TCP_MD5SIG
1818 	/* Clean up the MD5 key list, if any */
1819 	if (tp->md5sig_info) {
1820 		tcp_v4_clear_md5_list(sk);
1821 		kfree(tp->md5sig_info);
1822 		tp->md5sig_info = NULL;
1823 	}
1824 #endif
1825 
1826 #ifdef CONFIG_NET_DMA
1827 	/* Cleans up our sk_async_wait_queue */
1828 	__skb_queue_purge(&sk->sk_async_wait_queue);
1829 #endif
1830 
1831 	/* Clean prequeue, it must be empty really */
1832 	__skb_queue_purge(&tp->ucopy.prequeue);
1833 
1834 	/* Clean up a referenced TCP bind bucket. */
1835 	if (inet_csk(sk)->icsk_bind_hash)
1836 		inet_put_port(sk);
1837 
1838 	/*
1839 	 * If sendmsg cached page exists, toss it.
1840 	 */
1841 	if (sk->sk_sndmsg_page) {
1842 		__free_page(sk->sk_sndmsg_page);
1843 		sk->sk_sndmsg_page = NULL;
1844 	}
1845 
1846 	atomic_dec(&tcp_sockets_allocated);
1847 }
1848 
1849 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1850 
1851 #ifdef CONFIG_PROC_FS
1852 /* Proc filesystem TCP sock list dumping. */
1853 
1854 static inline struct inet_timewait_sock *tw_head(struct hlist_head *head)
1855 {
1856 	return hlist_empty(head) ? NULL :
1857 		list_entry(head->first, struct inet_timewait_sock, tw_node);
1858 }
1859 
1860 static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1861 {
1862 	return tw->tw_node.next ?
1863 		hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1864 }
1865 
1866 static void *listening_get_next(struct seq_file *seq, void *cur)
1867 {
1868 	struct inet_connection_sock *icsk;
1869 	struct hlist_node *node;
1870 	struct sock *sk = cur;
1871 	struct tcp_iter_state* st = seq->private;
1872 	struct net *net = seq_file_net(seq);
1873 
1874 	if (!sk) {
1875 		st->bucket = 0;
1876 		sk = sk_head(&tcp_hashinfo.listening_hash[0]);
1877 		goto get_sk;
1878 	}
1879 
1880 	++st->num;
1881 
1882 	if (st->state == TCP_SEQ_STATE_OPENREQ) {
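	/* The previous stop was inside a listener's SYN table: resume from
	 * the next request_sock, moving on to later syn_table buckets of the
	 * same listener as needed. */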
1883 		struct request_sock *req = cur;
1884 
1885 		icsk = inet_csk(st->syn_wait_sk);
1886 		req = req->dl_next;
1887 		while (1) {
1888 			while (req) {
1889 				if (req->rsk_ops->family == st->family) {
1890 					cur = req;
1891 					goto out;
1892 				}
1893 				req = req->dl_next;
1894 			}
1895 			if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
1896 				break;
1897 get_req:
1898 			req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
1899 		}
1900 		sk	  = sk_next(st->syn_wait_sk);
1901 		st->state = TCP_SEQ_STATE_LISTENING;
1902 		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1903 	} else {
1904 		icsk = inet_csk(sk);
1905 		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1906 		if (reqsk_queue_len(&icsk->icsk_accept_queue))
1907 			goto start_req;
1908 		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1909 		sk = sk_next(sk);
1910 	}
1911 get_sk:
1912 	sk_for_each_from(sk, node) {
1913 		if (sk->sk_family == st->family && net_eq(sock_net(sk), net)) {
1914 			cur = sk;
1915 			goto out;
1916 		}
1917 		icsk = inet_csk(sk);
1918 		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1919 		if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
1920 start_req:
1921 			st->uid		= sock_i_uid(sk);
1922 			st->syn_wait_sk = sk;
1923 			st->state	= TCP_SEQ_STATE_OPENREQ;
1924 			st->sbucket	= 0;
1925 			goto get_req;
1926 		}
1927 		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1928 	}
1929 	if (++st->bucket < INET_LHTABLE_SIZE) {
1930 		sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]);
1931 		goto get_sk;
1932 	}
1933 	cur = NULL;
1934 out:
1935 	return cur;
1936 }
1937 
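/*
 * Return the *pos'th entry of the listening walk, decrementing *pos as
 * entries are consumed so the caller can continue into the established
 * hash with the remainder.
 */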
1938 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1939 {
1940 	void *rc = listening_get_next(seq, NULL);
1941 
1942 	while (rc && *pos) {
1943 		rc = listening_get_next(seq, rc);
1944 		--*pos;
1945 	}
1946 	return rc;
1947 }
1948 
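/*
 * Find the first established (or TIME_WAIT) socket matching the
 * family/namespace.  On success the bucket's ehash lock is left held;
 * the next-iterator or tcp_seq_stop() drops it.
 */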
1949 static void *established_get_first(struct seq_file *seq)
1950 {
1951 	struct tcp_iter_state* st = seq->private;
1952 	struct net *net = seq_file_net(seq);
1953 	void *rc = NULL;
1954 
1955 	for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
1956 		struct sock *sk;
1957 		struct hlist_node *node;
1958 		struct inet_timewait_sock *tw;
1959 		rwlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
1960 
1961 		read_lock_bh(lock);
1962 		sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
1963 			if (sk->sk_family != st->family ||
1964 			    !net_eq(sock_net(sk), net)) {
1965 				continue;
1966 			}
1967 			rc = sk;
1968 			goto out;
1969 		}
1970 		st->state = TCP_SEQ_STATE_TIME_WAIT;
1971 		inet_twsk_for_each(tw, node,
1972 				   &tcp_hashinfo.ehash[st->bucket].twchain) {
1973 			if (tw->tw_family != st->family ||
1974 			    !net_eq(twsk_net(tw), net)) {
1975 				continue;
1976 			}
1977 			rc = tw;
1978 			goto out;
1979 		}
1980 		read_unlock_bh(lock);
1981 		st->state = TCP_SEQ_STATE_ESTABLISHED;
1982 	}
1983 out:
1984 	return rc;
1985 }
1986 
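/*
 * Advance within the established/TIME_WAIT walk, dropping and
 * re-taking the per-bucket ehash lock when crossing bucket boundaries.
 */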
1987 static void *established_get_next(struct seq_file *seq, void *cur)
1988 {
1989 	struct sock *sk = cur;
1990 	struct inet_timewait_sock *tw;
1991 	struct hlist_node *node;
1992 	struct tcp_iter_state* st = seq->private;
1993 	struct net *net = seq_file_net(seq);
1994 
1995 	++st->num;
1996 
1997 	if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
1998 		tw = cur;
1999 		tw = tw_next(tw);
2000 get_tw:
2001 		while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
2002 			tw = tw_next(tw);
2003 		}
2004 		if (tw) {
2005 			cur = tw;
2006 			goto out;
2007 		}
2008 		read_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2009 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2010 
2011 		if (++st->bucket < tcp_hashinfo.ehash_size) {
2012 			read_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2013 			sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
2014 		} else {
2015 			cur = NULL;
2016 			goto out;
2017 		}
2018 	} else
2019 		sk = sk_next(sk);
2020 
2021 	sk_for_each_from(sk, node) {
2022 		if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2023 			goto found;
2024 	}
2025 
2026 	st->state = TCP_SEQ_STATE_TIME_WAIT;
2027 	tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
2028 	goto get_tw;
2029 found:
2030 	cur = sk;
2031 out:
2032 	return cur;
2033 }
2034 
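/* Skip 'pos' matching entries into the established/TIME_WAIT walk. */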
2035 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2036 {
2037 	void *rc = established_get_first(seq);
2038 
2039 	while (rc && pos) {
2040 		rc = established_get_next(seq, rc);
2041 		--pos;
2042 	}
2043 	return rc;
2044 }
2045 
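/*
 * Position the iterator at absolute offset 'pos': listening sockets
 * first (under the listen lock), then established/TIME_WAIT sockets.
 */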
2046 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2047 {
2048 	void *rc;
2049 	struct tcp_iter_state* st = seq->private;
2050 
2051 	inet_listen_lock(&tcp_hashinfo);
2052 	st->state = TCP_SEQ_STATE_LISTENING;
2053 	rc	  = listening_get_idx(seq, &pos);
2054 
2055 	if (!rc) {
2056 		inet_listen_unlock(&tcp_hashinfo);
2057 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2058 		rc	  = established_get_idx(seq, pos);
2059 	}
2060 
2061 	return rc;
2062 }
2063 
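/*
 * seq_file ->start(): return SEQ_START_TOKEN for the header line,
 * otherwise seek to entry *pos - 1.
 */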
2064 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2065 {
2066 	struct tcp_iter_state* st = seq->private;
2067 	st->state = TCP_SEQ_STATE_LISTENING;
2068 	st->num = 0;
2069 	return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2070 }
2071 
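/*
 * seq_file ->next(): step to the following entry, spilling from the
 * listening walk into the established walk once the former is done.
 */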
2072 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2073 {
2074 	void *rc = NULL;
2075 	struct tcp_iter_state* st;
2076 
2077 	if (v == SEQ_START_TOKEN) {
2078 		rc = tcp_get_idx(seq, 0);
2079 		goto out;
2080 	}
2081 	st = seq->private;
2082 
2083 	switch (st->state) {
2084 	case TCP_SEQ_STATE_OPENREQ:
2085 	case TCP_SEQ_STATE_LISTENING:
2086 		rc = listening_get_next(seq, v);
2087 		if (!rc) {
2088 			inet_listen_unlock(&tcp_hashinfo);
2089 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2090 			rc	  = established_get_first(seq);
2091 		}
2092 		break;
2093 	case TCP_SEQ_STATE_ESTABLISHED:
2094 	case TCP_SEQ_STATE_TIME_WAIT:
2095 		rc = established_get_next(seq, v);
2096 		break;
2097 	}
2098 out:
2099 	++*pos;
2100 	return rc;
2101 }
2102 
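/*
 * seq_file ->stop(): release whichever locks the current iterator
 * state still holds (syn_wait_lock, listen lock or ehash bucket lock).
 */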
2103 static void tcp_seq_stop(struct seq_file *seq, void *v)
2104 {
2105 	struct tcp_iter_state* st = seq->private;
2106 
2107 	switch (st->state) {
2108 	case TCP_SEQ_STATE_OPENREQ:
2109 		if (v) {
2110 			struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2111 			read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2112 		}
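		/* fall through - the listen lock must be released as well */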
2113 	case TCP_SEQ_STATE_LISTENING:
2114 		if (v != SEQ_START_TOKEN)
2115 			inet_listen_unlock(&tcp_hashinfo);
2116 		break;
2117 	case TCP_SEQ_STATE_TIME_WAIT:
2118 	case TCP_SEQ_STATE_ESTABLISHED:
2119 		if (v)
2120 			read_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2121 		break;
2122 	}
2123 }
2124 
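/*
 * ->open() for the per-family /proc/net entry: open the net-aware seq
 * file and stash the address family in the iterator state.
 */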
2125 static int tcp_seq_open(struct inode *inode, struct file *file)
2126 {
2127 	struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2128 	struct tcp_iter_state *s;
2129 	int err;
2130 
2131 	err = seq_open_net(inode, file, &afinfo->seq_ops,
2132 			  sizeof(struct tcp_iter_state));
2133 	if (err < 0)
2134 		return err;
2135 
2136 	s = ((struct seq_file *)file->private_data)->private;
2137 	s->family		= afinfo->family;
2138 	return 0;
2139 }
2140 
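/* Create the per-namespace /proc/net/<name> entry for one address family. */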
2141 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2142 {
2143 	int rc = 0;
2144 	struct proc_dir_entry *p;
2145 
2146 	afinfo->seq_fops.open		= tcp_seq_open;
2147 	afinfo->seq_fops.read		= seq_read;
2148 	afinfo->seq_fops.llseek		= seq_lseek;
2149 	afinfo->seq_fops.release	= seq_release_net;
2150 
2151 	afinfo->seq_ops.start		= tcp_seq_start;
2152 	afinfo->seq_ops.next		= tcp_seq_next;
2153 	afinfo->seq_ops.stop		= tcp_seq_stop;
2154 
2155 	p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2156 			     &afinfo->seq_fops, afinfo);
2157 	if (!p)
2158 		rc = -ENOMEM;
2159 	return rc;
2160 }
2161 
2162 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2163 {
2164 	proc_net_remove(net, afinfo->name);
2165 }
2166 
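/* Format one embryonic (SYN_RECV) connection as a /proc/net/tcp line. */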
2167 static void get_openreq4(struct sock *sk, struct request_sock *req,
2168 			 struct seq_file *f, int i, int uid, int *len)
2169 {
2170 	const struct inet_request_sock *ireq = inet_rsk(req);
2171 	int ttd = req->expires - jiffies;
2172 
2173 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2174 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p%n",
2175 		i,
2176 		ireq->loc_addr,
2177 		ntohs(inet_sk(sk)->sport),
2178 		ireq->rmt_addr,
2179 		ntohs(ireq->rmt_port),
2180 		TCP_SYN_RECV,
2181 		0, 0, /* could print option size, but that is af dependent. */
2182 		1,    /* timers active (only the expire timer) */
2183 		jiffies_to_clock_t(ttd),
2184 		req->retrans,
2185 		uid,
2186 		0,  /* non standard timer */
2187 		0, /* open_requests have no inode */
2188 		atomic_read(&sk->sk_refcnt),
2189 		req,
2190 		len);
2191 }
2192 
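/* Format one full socket (listening or established) as a /proc/net/tcp line. */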
2193 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2194 {
2195 	int timer_active;
2196 	unsigned long timer_expires;
2197 	struct tcp_sock *tp = tcp_sk(sk);
2198 	const struct inet_connection_sock *icsk = inet_csk(sk);
2199 	struct inet_sock *inet = inet_sk(sk);
2200 	__be32 dest = inet->daddr;
2201 	__be32 src = inet->rcv_saddr;
2202 	__u16 destp = ntohs(inet->dport);
2203 	__u16 srcp = ntohs(inet->sport);
2204 
2205 	if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
2206 		timer_active	= 1;
2207 		timer_expires	= icsk->icsk_timeout;
2208 	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2209 		timer_active	= 4;
2210 		timer_expires	= icsk->icsk_timeout;
2211 	} else if (timer_pending(&sk->sk_timer)) {
2212 		timer_active	= 2;
2213 		timer_expires	= sk->sk_timer.expires;
2214 	} else {
2215 		timer_active	= 0;
2216 		timer_expires = jiffies;
2217 	}
2218 
2219 	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2220 			"%08X %5d %8d %lu %d %p %lu %lu %u %u %d%n",
2221 		i, src, srcp, dest, destp, sk->sk_state,
2222 		tp->write_seq - tp->snd_una,
2223 		sk->sk_state == TCP_LISTEN ? sk->sk_ack_backlog :
2224 					     (tp->rcv_nxt - tp->copied_seq),
2225 		timer_active,
2226 		jiffies_to_clock_t(timer_expires - jiffies),
2227 		icsk->icsk_retransmits,
2228 		sock_i_uid(sk),
2229 		icsk->icsk_probes_out,
2230 		sock_i_ino(sk),
2231 		atomic_read(&sk->sk_refcnt), sk,
2232 		jiffies_to_clock_t(icsk->icsk_rto),
2233 		jiffies_to_clock_t(icsk->icsk_ack.ato),
2234 		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2235 		tp->snd_cwnd,
2236 		tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh,
2237 		len);
2238 }
2239 
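/* Format one TIME_WAIT socket as a /proc/net/tcp line. */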
2240 static void get_timewait4_sock(struct inet_timewait_sock *tw,
2241 			       struct seq_file *f, int i, int *len)
2242 {
2243 	__be32 dest, src;
2244 	__u16 destp, srcp;
2245 	int ttd = tw->tw_ttd - jiffies;
2246 
2247 	if (ttd < 0)
2248 		ttd = 0;
2249 
2250 	dest  = tw->tw_daddr;
2251 	src   = tw->tw_rcv_saddr;
2252 	destp = ntohs(tw->tw_dport);
2253 	srcp  = ntohs(tw->tw_sport);
2254 
2255 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2256 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p%n",
2257 		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2258 		3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2259 		atomic_read(&tw->tw_refcnt), tw, len);
2260 }
2261 
2262 #define TMPSZ 150
2263 
2264 static int tcp4_seq_show(struct seq_file *seq, void *v)
2265 {
2266 	struct tcp_iter_state* st;
2267 	int len;
2268 
2269 	if (v == SEQ_START_TOKEN) {
2270 		seq_printf(seq, "%-*s\n", TMPSZ - 1,
2271 			   "  sl  local_address rem_address   st tx_queue "
2272 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2273 			   "inode");
2274 		goto out;
2275 	}
2276 	st = seq->private;
2277 
2278 	switch (st->state) {
2279 	case TCP_SEQ_STATE_LISTENING:
2280 	case TCP_SEQ_STATE_ESTABLISHED:
2281 		get_tcp4_sock(v, seq, st->num, &len);
2282 		break;
2283 	case TCP_SEQ_STATE_OPENREQ:
2284 		get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
2285 		break;
2286 	case TCP_SEQ_STATE_TIME_WAIT:
2287 		get_timewait4_sock(v, seq, st->num, &len);
2288 		break;
2289 	}
2290 	seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
2291 out:
2292 	return 0;
2293 }
2294 
2295 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2296 	.name		= "tcp",
2297 	.family		= AF_INET,
2298 	.seq_fops	= {
2299 		.owner		= THIS_MODULE,
2300 	},
2301 	.seq_ops	= {
2302 		.show		= tcp4_seq_show,
2303 	},
2304 };
2305 
2306 static int tcp4_proc_init_net(struct net *net)
2307 {
2308 	return tcp_proc_register(net, &tcp4_seq_afinfo);
2309 }
2310 
2311 static void tcp4_proc_exit_net(struct net *net)
2312 {
2313 	tcp_proc_unregister(net, &tcp4_seq_afinfo);
2314 }
2315 
2316 static struct pernet_operations tcp4_net_ops = {
2317 	.init = tcp4_proc_init_net,
2318 	.exit = tcp4_proc_exit_net,
2319 };
2320 
2321 int __init tcp4_proc_init(void)
2322 {
2323 	return register_pernet_subsys(&tcp4_net_ops);
2324 }
2325 
2326 void tcp4_proc_exit(void)
2327 {
2328 	unregister_pernet_subsys(&tcp4_net_ops);
2329 }
2330 #endif /* CONFIG_PROC_FS */
2331 
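/* TCP's protocol operations for IPv4 sockets, hooked up by the inet socket layer. */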
2332 struct proto tcp_prot = {
2333 	.name			= "TCP",
2334 	.owner			= THIS_MODULE,
2335 	.close			= tcp_close,
2336 	.connect		= tcp_v4_connect,
2337 	.disconnect		= tcp_disconnect,
2338 	.accept			= inet_csk_accept,
2339 	.ioctl			= tcp_ioctl,
2340 	.init			= tcp_v4_init_sock,
2341 	.destroy		= tcp_v4_destroy_sock,
2342 	.shutdown		= tcp_shutdown,
2343 	.setsockopt		= tcp_setsockopt,
2344 	.getsockopt		= tcp_getsockopt,
2345 	.recvmsg		= tcp_recvmsg,
2346 	.backlog_rcv		= tcp_v4_do_rcv,
2347 	.hash			= inet_hash,
2348 	.unhash			= inet_unhash,
2349 	.get_port		= inet_csk_get_port,
2350 	.enter_memory_pressure	= tcp_enter_memory_pressure,
2351 	.sockets_allocated	= &tcp_sockets_allocated,
2352 	.orphan_count		= &tcp_orphan_count,
2353 	.memory_allocated	= &tcp_memory_allocated,
2354 	.memory_pressure	= &tcp_memory_pressure,
2355 	.sysctl_mem		= sysctl_tcp_mem,
2356 	.sysctl_wmem		= sysctl_tcp_wmem,
2357 	.sysctl_rmem		= sysctl_tcp_rmem,
2358 	.max_header		= MAX_TCP_HEADER,
2359 	.obj_size		= sizeof(struct tcp_sock),
2360 	.twsk_prot		= &tcp_timewait_sock_ops,
2361 	.rsk_prot		= &tcp_request_sock_ops,
2362 	.h.hashinfo		= &tcp_hashinfo,
2363 #ifdef CONFIG_COMPAT
2364 	.compat_setsockopt	= compat_tcp_setsockopt,
2365 	.compat_getsockopt	= compat_tcp_getsockopt,
2366 #endif
2367 };
2368 
2369 
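/*
 * Per-namespace init: create the control socket used for sending
 * RSTs/ACKs that are not associated with any local socket.
 */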
2370 static int __net_init tcp_sk_init(struct net *net)
2371 {
2372 	return inet_ctl_sock_create(&net->ipv4.tcp_sock,
2373 				    PF_INET, SOCK_RAW, IPPROTO_TCP, net);
2374 }
2375 
2376 static void __net_exit tcp_sk_exit(struct net *net)
2377 {
2378 	inet_ctl_sock_destroy(net->ipv4.tcp_sock);
2379 	inet_twsk_purge(net, &tcp_hashinfo, &tcp_death_row, AF_INET);
2380 }
2381 
2382 static struct pernet_operations __net_initdata tcp_sk_ops = {
2383 	.init = tcp_sk_init,
2384 	.exit = tcp_sk_exit,
2385 };
2386 
2387 void __init tcp_v4_init(void)
2388 {
2389 	if (register_pernet_device(&tcp_sk_ops))
2390 		panic("Failed to create the TCP control socket.\n");
2391 }
2392 
2393 EXPORT_SYMBOL(ipv4_specific);
2394 EXPORT_SYMBOL(tcp_hashinfo);
2395 EXPORT_SYMBOL(tcp_prot);
2396 EXPORT_SYMBOL(tcp_v4_conn_request);
2397 EXPORT_SYMBOL(tcp_v4_connect);
2398 EXPORT_SYMBOL(tcp_v4_do_rcv);
2399 EXPORT_SYMBOL(tcp_v4_remember_stamp);
2400 EXPORT_SYMBOL(tcp_v4_send_check);
2401 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
2402 
2403 #ifdef CONFIG_PROC_FS
2404 EXPORT_SYMBOL(tcp_proc_register);
2405 EXPORT_SYMBOL(tcp_proc_unregister);
2406 #endif
2407 EXPORT_SYMBOL(sysctl_tcp_low_latency);
2408 
2409