xref: /openbmc/linux/net/ipv4/tcp.c (revision 1da177e4c3f41524e886b7f1b8a0c1fc7321cac2)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		Implementation of the Transmission Control Protocol(TCP).
7  *
8  * Version:	$Id: tcp.c,v 1.216 2002/02/01 22:01:04 davem Exp $
9  *
10  * Authors:	Ross Biro, <bir7@leland.Stanford.Edu>
11  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *		Mark Evans, <evansmp@uhura.aston.ac.uk>
13  *		Corey Minyard <wf-rch!minyard@relay.EU.net>
14  *		Florian La Roche, <flla@stud.uni-sb.de>
15  *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
16  *		Linus Torvalds, <torvalds@cs.helsinki.fi>
17  *		Alan Cox, <gw4pts@gw4pts.ampr.org>
18  *		Matthew Dillon, <dillon@apollo.west.oic.com>
19  *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
20  *		Jorge Cwik, <jorge@laser.satlink.net>
21  *
22  * Fixes:
23  *		Alan Cox	:	Numerous verify_area() calls
24  *		Alan Cox	:	Set the ACK bit on a reset
25  *		Alan Cox	:	Stopped it crashing if it closed while
26  *					sk->inuse=1 and was trying to connect
27  *					(tcp_err()).
28  *		Alan Cox	:	All icmp error handling was broken
29  *					pointers passed were wrong and the
30  *					socket was looked up backwards. Nobody
31  *					tested any icmp error code obviously.
32  *		Alan Cox	:	tcp_err() now handled properly. It
33  *					wakes people on errors. poll
34  *					behaves and the icmp error race
35  *					has gone by moving it into sock.c
36  *		Alan Cox	:	tcp_send_reset() fixed to work for
37  *					everything not just packets for
38  *					unknown sockets.
39  *		Alan Cox	:	tcp option processing.
40  *		Alan Cox	:	Reset tweaked (still not 100%) [Had
41  *					syn rule wrong]
42  *		Herp Rosmanith  :	More reset fixes
43  *		Alan Cox	:	No longer acks invalid rst frames.
44  *					Acking any kind of RST is right out.
45  *		Alan Cox	:	Sets an ignore me flag on an rst
46  *					receive otherwise odd bits of prattle
47  *					escape still
48  *		Alan Cox	:	Fixed another acking RST frame bug.
49  *					Should stop LAN workplace lockups.
50  *		Alan Cox	: 	Some tidyups using the new skb list
51  *					facilities
52  *		Alan Cox	:	sk->keepopen now seems to work
53  *		Alan Cox	:	Pulls options out correctly on accepts
54  *		Alan Cox	:	Fixed assorted sk->rqueue->next errors
55  *		Alan Cox	:	PSH doesn't end a TCP read. Switched a
56  *					bit to skb ops.
57  *		Alan Cox	:	Tidied tcp_data to avoid a potential
58  *					nasty.
59  *		Alan Cox	:	Added some better commenting, as the
60  *					tcp is hard to follow
61  *		Alan Cox	:	Removed incorrect check for 20 * psh
62  *	Michael O'Reilly	:	ack < copied bug fix.
63  *	Johannes Stille		:	Misc tcp fixes (not all in yet).
64  *		Alan Cox	:	FIN with no memory -> CRASH
65  *		Alan Cox	:	Added socket option proto entries.
66  *					Also added awareness of them to accept.
67  *		Alan Cox	:	Added TCP options (SOL_TCP)
68  *		Alan Cox	:	Switched wakeup calls to callbacks,
69  *					so the kernel can layer network
70  *					sockets.
71  *		Alan Cox	:	Use ip_tos/ip_ttl settings.
72  *		Alan Cox	:	Handle FIN (more) properly (we hope).
73  *		Alan Cox	:	RST frames sent on unsynchronised
74  *					state ack error.
75  *		Alan Cox	:	Put in missing check for SYN bit.
76  *		Alan Cox	:	Added tcp_select_window() aka NET2E
77  *					window non shrink trick.
78  *		Alan Cox	:	Added a couple of small NET2E timer
79  *					fixes
80  *		Charles Hedrick :	TCP fixes
81  *		Toomas Tamm	:	TCP window fixes
82  *		Alan Cox	:	Small URG fix to rlogin ^C ack fight
83  *		Charles Hedrick	:	Rewrote most of it to actually work
84  *		Linus		:	Rewrote tcp_read() and URG handling
85  *					completely
86  *		Gerhard Koerting:	Fixed some missing timer handling
87  *		Matthew Dillon  :	Reworked TCP machine states as per RFC
88  *		Gerhard Koerting:	PC/TCP workarounds
89  *		Adam Caldwell	:	Assorted timer/timing errors
90  *		Matthew Dillon	:	Fixed another RST bug
91  *		Alan Cox	:	Move to kernel side addressing changes.
92  *		Alan Cox	:	Beginning work on TCP fastpathing
93  *					(not yet usable)
94  *		Arnt Gulbrandsen:	Turbocharged tcp_check() routine.
95  *		Alan Cox	:	TCP fast path debugging
96  *		Alan Cox	:	Window clamping
97  *		Michael Riepe	:	Bug in tcp_check()
98  *		Matt Dillon	:	More TCP improvements and RST bug fixes
99  *		Matt Dillon	:	Yet more small nasties removed from the
100  *					TCP code (Be very nice to this man if
101  *					tcp finally works 100%) 8)
102  *		Alan Cox	:	BSD accept semantics.
103  *		Alan Cox	:	Reset on closedown bug.
104  *	Peter De Schrijver	:	ENOTCONN check missing in tcp_sendto().
105  *		Michael Pall	:	Handle poll() after URG properly in
106  *					all cases.
107  *		Michael Pall	:	Undo the last fix in tcp_read_urg()
108  *					(multi URG PUSH broke rlogin).
109  *		Michael Pall	:	Fix the multi URG PUSH problem in
110  *					tcp_readable(), poll() after URG
111  *					works now.
112  *		Michael Pall	:	recv(...,MSG_OOB) never blocks in the
113  *					BSD api.
114  *		Alan Cox	:	Changed the semantics of sk->socket to
115  *					fix a race and a signal problem with
116  *					accept() and async I/O.
117  *		Alan Cox	:	Relaxed the rules on tcp_sendto().
118  *		Yury Shevchuk	:	Really fixed accept() blocking problem.
119  *		Craig I. Hagan  :	Allow for BSD compatible TIME_WAIT for
120  *					clients/servers which listen in on
121  *					fixed ports.
122  *		Alan Cox	:	Cleaned the above up and shrank it to
123  *					a sensible code size.
124  *		Alan Cox	:	Self connect lockup fix.
125  *		Alan Cox	:	No connect to multicast.
126  *		Ross Biro	:	Close unaccepted children on master
127  *					socket close.
128  *		Alan Cox	:	Reset tracing code.
129  *		Alan Cox	:	Spurious resets on shutdown.
130  *		Alan Cox	:	Giant 15 minute/60 second timer error
131  *		Alan Cox	:	Small whoops in polling before an
132  *					accept.
133  *		Alan Cox	:	Kept the state trace facility since
134  *					it's handy for debugging.
135  *		Alan Cox	:	More reset handler fixes.
136  *		Alan Cox	:	Started rewriting the code based on
137  *					the RFC's for other useful protocol
138  *					references see: Comer, KA9Q NOS, and
139  *					for a reference on the difference
140  *					between specifications and how BSD
141  *					works see the 4.4lite source.
142  *		A.N.Kuznetsov	:	Don't time wait on completion of tidy
143  *					close.
144  *		Linus Torvalds	:	Fin/Shutdown & copied_seq changes.
145  *		Linus Torvalds	:	Fixed BSD port reuse to work first syn
146  *		Alan Cox	:	Reimplemented timers as per the RFC
147  *					and using multiple timers for sanity.
148  *		Alan Cox	:	Small bug fixes, and a lot of new
149  *					comments.
150  *		Alan Cox	:	Fixed dual reader crash by locking
151  *					the buffers (much like datagram.c)
152  *		Alan Cox	:	Fixed stuck sockets in probe. A probe
153  *					now gets fed up of retrying without
154  *					(even a no space) answer.
155  *		Alan Cox	:	Extracted closing code better
156  *		Alan Cox	:	Fixed the closing state machine to
157  *					resemble the RFC.
158  *		Alan Cox	:	More 'per spec' fixes.
159  *		Jorge Cwik	:	Even faster checksumming.
160  *		Alan Cox	:	tcp_data() doesn't ack illegal PSH
161  *					only frames. At least one pc tcp stack
162  *					generates them.
163  *		Alan Cox	:	Cache last socket.
164  *		Alan Cox	:	Per route irtt.
165  *		Matt Day	:	poll()->select() match BSD precisely on error
166  *		Alan Cox	:	New buffers
167  *		Marc Tamsky	:	Various sk->prot->retransmits and
168  *					sk->retransmits misupdating fixed.
169  *					Fixed tcp_write_timeout: stuck close,
170  *					and TCP syn retries gets used now.
171  *		Mark Yarvis	:	In tcp_read_wakeup(), don't send an
172  *					ack if state is TCP_CLOSED.
173  *		Alan Cox	:	Look up device on a retransmit - routes may
174  *					change. Doesn't yet cope with MSS shrink right
175  *					but it's a start!
176  *		Marc Tamsky	:	Closing in closing fixes.
177  *		Mike Shaver	:	RFC1122 verifications.
178  *		Alan Cox	:	rcv_saddr errors.
179  *		Alan Cox	:	Block double connect().
180  *		Alan Cox	:	Small hooks for enSKIP.
181  *		Alexey Kuznetsov:	Path MTU discovery.
182  *		Alan Cox	:	Support soft errors.
183  *		Alan Cox	:	Fix MTU discovery pathological case
184  *					when the remote claims no mtu!
185  *		Marc Tamsky	:	TCP_CLOSE fix.
186  *		Colin (G3TNE)	:	Send a reset on syn ack replies in
187  *					window but wrong (fixes NT lpd problems)
188  *		Pedro Roque	:	Better TCP window handling, delayed ack.
189  *		Joerg Reuter	:	No modification of locked buffers in
190  *					tcp_do_retransmit()
191  *		Eric Schenk	:	Changed receiver side silly window
192  *					avoidance algorithm to BSD style
193  *					algorithm. This doubles throughput
194  *					against machines running Solaris,
195  *					and seems to result in general
196  *					improvement.
197  *	Stefan Magdalinski	:	adjusted tcp_readable() to fix FIONREAD
198  *	Willy Konynenberg	:	Transparent proxying support.
199  *	Mike McLagan		:	Routing by source
200  *		Keith Owens	:	Do proper merging with partial SKB's in
201  *					tcp_do_sendmsg to avoid burstiness.
202  *		Eric Schenk	:	Fix fast close down bug with
203  *					shutdown() followed by close().
204  *		Andi Kleen 	:	Make poll agree with SIGIO
205  *	Salvatore Sanfilippo	:	Support SO_LINGER with linger == 1 and
206  *					lingertime == 0 (RFC 793 ABORT Call)
207  *	Hirokazu Takahashi	:	Use copy_from_user() instead of
208  *					csum_and_copy_from_user() if possible.
209  *
210  *		This program is free software; you can redistribute it and/or
211  *		modify it under the terms of the GNU General Public License
212  *		as published by the Free Software Foundation; either version
213  *		2 of the License, or (at your option) any later version.
214  *
215  * Description of States:
216  *
217  *	TCP_SYN_SENT		sent a connection request, waiting for ack
218  *
219  *	TCP_SYN_RECV		received a connection request, sent ack,
220  *				waiting for final ack in three-way handshake.
221  *
222  *	TCP_ESTABLISHED		connection established
223  *
224  *	TCP_FIN_WAIT1		our side has shutdown, waiting to complete
225  *				transmission of remaining buffered data
226  *
227  *	TCP_FIN_WAIT2		all buffered data sent, waiting for remote
228  *				to shutdown
229  *
230  *	TCP_CLOSING		both sides have shutdown but we still have
231  *				data we have to finish sending
232  *
233  *	TCP_TIME_WAIT		timeout to catch resent junk before entering
234  *				closed, can only be entered from FIN_WAIT2
235  *				or CLOSING.  Required because the other end
236  *				may not have gotten our last ACK causing it
237  *				to retransmit the data packet (which we ignore)
238  *
239  *	TCP_CLOSE_WAIT		remote side has shutdown and is waiting for
240  *				us to finish writing our data and to shutdown
241  *				(we have to close() to move on to LAST_ACK)
242  *
243  *	TCP_LAST_ACK		our side has shutdown after remote has
244  *				shutdown.  There may still be data in our
245  *				buffer that we have to finish sending
246  *
247  *	TCP_CLOSE		socket is finished
248  */
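/*
 * A rough sketch of the two common close sequences implied by the states
 * above (a simultaneous close additionally passes through TCP_CLOSING on
 * its way to TIME_WAIT):
 *
 *	active close:  ESTABLISHED -> FIN_WAIT1 -> FIN_WAIT2 -> TIME_WAIT -> CLOSE
 *	passive close: ESTABLISHED -> CLOSE_WAIT -> LAST_ACK -> CLOSE
 */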
249 
250 #include <linux/config.h>
251 #include <linux/module.h>
252 #include <linux/types.h>
253 #include <linux/fcntl.h>
254 #include <linux/poll.h>
255 #include <linux/init.h>
256 #include <linux/smp_lock.h>
257 #include <linux/fs.h>
258 #include <linux/random.h>
259 #include <linux/bootmem.h>
260 
261 #include <net/icmp.h>
262 #include <net/tcp.h>
263 #include <net/xfrm.h>
264 #include <net/ip.h>
265 
266 
267 #include <asm/uaccess.h>
268 #include <asm/ioctls.h>
269 
270 int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
271 
272 DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics);
273 
274 kmem_cache_t *tcp_openreq_cachep;
275 kmem_cache_t *tcp_bucket_cachep;
276 kmem_cache_t *tcp_timewait_cachep;
277 
278 atomic_t tcp_orphan_count = ATOMIC_INIT(0);
279 
280 int sysctl_tcp_mem[3];
281 int sysctl_tcp_wmem[3] = { 4 * 1024, 16 * 1024, 128 * 1024 };
282 int sysctl_tcp_rmem[3] = { 4 * 1024, 87380, 87380 * 2 };
283 
284 EXPORT_SYMBOL(sysctl_tcp_mem);
285 EXPORT_SYMBOL(sysctl_tcp_rmem);
286 EXPORT_SYMBOL(sysctl_tcp_wmem);
287 
288 atomic_t tcp_memory_allocated;	/* Current allocated memory. */
289 atomic_t tcp_sockets_allocated;	/* Current number of TCP sockets. */
290 
291 EXPORT_SYMBOL(tcp_memory_allocated);
292 EXPORT_SYMBOL(tcp_sockets_allocated);
293 
294 /*
295  * Pressure flag: try to collapse.
296  * Technical note: it is used by multiple contexts non atomically.
297  * All the sk_stream_mem_schedule() is of this nature: accounting
298  * is strict, actions are advisory and have some latency.
299  */
300 int tcp_memory_pressure;
301 
302 EXPORT_SYMBOL(tcp_memory_pressure);
303 
304 void tcp_enter_memory_pressure(void)
305 {
306 	if (!tcp_memory_pressure) {
307 		NET_INC_STATS(LINUX_MIB_TCPMEMORYPRESSURES);
308 		tcp_memory_pressure = 1;
309 	}
310 }
311 
312 EXPORT_SYMBOL(tcp_enter_memory_pressure);
313 
314 /*
315  * LISTEN is a special case for poll..
316  */
317 static __inline__ unsigned int tcp_listen_poll(struct sock *sk,
318 					       poll_table *wait)
319 {
320 	return tcp_sk(sk)->accept_queue ? (POLLIN | POLLRDNORM) : 0;
321 }
322 
323 /*
324  *	Wait for a TCP event.
325  *
326  *	Note that we don't need to lock the socket, as the upper poll layers
327  *	take care of normal races (between the test and the event) and we don't
328  *	go look at any of the socket buffers directly.
329  */
330 unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
331 {
332 	unsigned int mask;
333 	struct sock *sk = sock->sk;
334 	struct tcp_sock *tp = tcp_sk(sk);
335 
336 	poll_wait(file, sk->sk_sleep, wait);
337 	if (sk->sk_state == TCP_LISTEN)
338 		return tcp_listen_poll(sk, wait);
339 
340 	/* Socket is not locked. We are protected from async events
341 	   by the poll logic; correct handling of state changes made
342 	   by other threads is impossible in any case.
343 	 */
344 
345 	mask = 0;
346 	if (sk->sk_err)
347 		mask = POLLERR;
348 
349 	/*
350 	 * POLLHUP is certainly not done right. But poll() doesn't
351 	 * have a notion of HUP in just one direction, and for a
352 	 * socket the read side is more interesting.
353 	 *
354 	 * Some poll() documentation says that POLLHUP is incompatible
355 	 * with the POLLOUT/POLLWR flags, so somebody should check this
356 	 * all. But careful, it tends to be safer to return too many
357 	 * bits than too few, and you can easily break real applications
358 	 * if you don't tell them that something has hung up!
359 	 *
360 	 * Check-me.
361 	 *
362 	 * Check number 1. POLLHUP is an _UNMASKABLE_ event (see UNIX98 and
363 	 * our fs/select.c). It means that after we received EOF,
364 	 * poll always returns immediately, making it impossible to poll()
365 	 * for write() in state CLOSE_WAIT. One solution is evident --- to set
366 	 * POLLHUP if and only if shutdown has been made in both directions.
367 	 * Actually, it is interesting to look at how Solaris and DUX
368 	 * solve this dilemma. I would prefer it if POLLHUP were maskable;
369 	 * then we could set it on SND_SHUTDOWN. BTW the examples given
370 	 * in Stevens' books assume exactly this behaviour, which explains
371 	 * why POLLHUP is incompatible with POLLOUT.	--ANK
372 	 *
373 	 * NOTE. Check for TCP_CLOSE is added. The goal is to prevent
374 	 * blocking on fresh not-connected or disconnected socket. --ANK
375 	 */
376 	if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE)
377 		mask |= POLLHUP;
378 	if (sk->sk_shutdown & RCV_SHUTDOWN)
379 		mask |= POLLIN | POLLRDNORM;
380 
381 	/* Connected? */
382 	if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
383 		/* Potential race condition. If the read of tp below
384 		 * escapes above sk->sk_state, we can be illegally awakened
385 		 * in SYN_* states. */
386 		if ((tp->rcv_nxt != tp->copied_seq) &&
387 		    (tp->urg_seq != tp->copied_seq ||
388 		     tp->rcv_nxt != tp->copied_seq + 1 ||
389 		     sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data))
390 			mask |= POLLIN | POLLRDNORM;
391 
392 		if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
393 			if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
394 				mask |= POLLOUT | POLLWRNORM;
395 			} else {  /* send SIGIO later */
396 				set_bit(SOCK_ASYNC_NOSPACE,
397 					&sk->sk_socket->flags);
398 				set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
399 
400 				/* Race breaker. If space is freed after
401 				 * wspace test but before the flags are set,
402 				 * IO signal will be lost.
403 				 */
404 				if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
405 					mask |= POLLOUT | POLLWRNORM;
406 			}
407 		}
408 
409 		if (tp->urg_data & TCP_URG_VALID)
410 			mask |= POLLPRI;
411 	}
412 	return mask;
413 }
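/*
 * A minimal user-space sketch of how the mask computed above is consumed
 * (illustrative only; fd is assumed to be a connected TCP socket):
 *
 *	struct pollfd pfd = { .fd = fd, .events = POLLIN | POLLOUT | POLLPRI };
 *
 *	if (poll(&pfd, 1, -1) > 0) {
 *		POLLPRI in revents  -> urgent data pending (tp->urg_data valid)
 *		POLLIN  in revents  -> data readable, or RCV_SHUTDOWN seen
 *		POLLOUT in revents  -> enough free space in the send buffer
 *		POLLHUP in revents  -> both directions shut down, or TCP_CLOSE
 *	}
 */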
414 
415 int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
416 {
417 	struct tcp_sock *tp = tcp_sk(sk);
418 	int answ;
419 
420 	switch (cmd) {
421 	case SIOCINQ:
422 		if (sk->sk_state == TCP_LISTEN)
423 			return -EINVAL;
424 
425 		lock_sock(sk);
426 		if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
427 			answ = 0;
428 		else if (sock_flag(sk, SOCK_URGINLINE) ||
429 			 !tp->urg_data ||
430 			 before(tp->urg_seq, tp->copied_seq) ||
431 			 !before(tp->urg_seq, tp->rcv_nxt)) {
432 			answ = tp->rcv_nxt - tp->copied_seq;
433 
434 			/* Subtract 1, if FIN is in queue. */
435 			if (answ && !skb_queue_empty(&sk->sk_receive_queue))
436 				answ -=
437 		       ((struct sk_buff *)sk->sk_receive_queue.prev)->h.th->fin;
438 		} else
439 			answ = tp->urg_seq - tp->copied_seq;
440 		release_sock(sk);
441 		break;
442 	case SIOCATMARK:
443 		answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
444 		break;
445 	case SIOCOUTQ:
446 		if (sk->sk_state == TCP_LISTEN)
447 			return -EINVAL;
448 
449 		if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
450 			answ = 0;
451 		else
452 			answ = tp->write_seq - tp->snd_una;
453 		break;
454 	default:
455 		return -ENOIOCTLCMD;
456 	}
457 
458 	return put_user(answ, (int __user *)arg);
459 }
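/*
 * User-space view of the ioctls handled above (a sketch; SIOCINQ aliases
 * FIONREAD and the constants live in <linux/sockios.h>):
 *
 *	int inq, outq, atmark;
 *
 *	ioctl(fd, SIOCINQ,   &inq);	bytes received but not yet read
 *					(rcv_nxt - copied_seq, minus a queued FIN)
 *	ioctl(fd, SIOCOUTQ,  &outq);	bytes written but not yet acked
 *					(write_seq - snd_una)
 *	ioctl(fd, SIOCATMARK, &atmark);	non-zero when the stream is at the
 *					urgent mark (urg_seq == copied_seq)
 */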
460 
461 
462 int tcp_listen_start(struct sock *sk)
463 {
464 	struct inet_sock *inet = inet_sk(sk);
465 	struct tcp_sock *tp = tcp_sk(sk);
466 	struct tcp_listen_opt *lopt;
467 
468 	sk->sk_max_ack_backlog = 0;
469 	sk->sk_ack_backlog = 0;
470 	tp->accept_queue = tp->accept_queue_tail = NULL;
471 	rwlock_init(&tp->syn_wait_lock);
472 	tcp_delack_init(tp);
473 
474 	lopt = kmalloc(sizeof(struct tcp_listen_opt), GFP_KERNEL);
475 	if (!lopt)
476 		return -ENOMEM;
477 
478 	memset(lopt, 0, sizeof(struct tcp_listen_opt));
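	/*
	 * The loop below rounds the configured SYN backlog up to the next
	 * power of two and stores its log2, with a floor of 2^6 = 64
	 * entries.  For example, sysctl_max_syn_backlog == 1000 yields
	 * max_qlen_log = 10, since 1 << 10 = 1024 >= 1000.
	 */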
479 	for (lopt->max_qlen_log = 6; ; lopt->max_qlen_log++)
480 		if ((1 << lopt->max_qlen_log) >= sysctl_max_syn_backlog)
481 			break;
482 	get_random_bytes(&lopt->hash_rnd, 4);
483 
484 	write_lock_bh(&tp->syn_wait_lock);
485 	tp->listen_opt = lopt;
486 	write_unlock_bh(&tp->syn_wait_lock);
487 
488 	/* There is a race window here: we announce ourselves as listening,
489 	 * but this transition is still not validated by get_port().
490 	 * It is OK, because this socket enters the hash table only
491 	 * after validation is complete.
492 	 */
493 	sk->sk_state = TCP_LISTEN;
494 	if (!sk->sk_prot->get_port(sk, inet->num)) {
495 		inet->sport = htons(inet->num);
496 
497 		sk_dst_reset(sk);
498 		sk->sk_prot->hash(sk);
499 
500 		return 0;
501 	}
502 
503 	sk->sk_state = TCP_CLOSE;
504 	write_lock_bh(&tp->syn_wait_lock);
505 	tp->listen_opt = NULL;
506 	write_unlock_bh(&tp->syn_wait_lock);
507 	kfree(lopt);
508 	return -EADDRINUSE;
509 }
510 
511 /*
512  *	This routine closes sockets which have been at least partially
513  *	opened, but not yet accepted.
514  */
515 
516 static void tcp_listen_stop (struct sock *sk)
517 {
518 	struct tcp_sock *tp = tcp_sk(sk);
519 	struct tcp_listen_opt *lopt = tp->listen_opt;
520 	struct open_request *acc_req = tp->accept_queue;
521 	struct open_request *req;
522 	int i;
523 
524 	tcp_delete_keepalive_timer(sk);
525 
526 	/* make all the listen_opt local to us */
527 	write_lock_bh(&tp->syn_wait_lock);
528 	tp->listen_opt = NULL;
529 	write_unlock_bh(&tp->syn_wait_lock);
530 	tp->accept_queue = tp->accept_queue_tail = NULL;
531 
532 	if (lopt->qlen) {
533 		for (i = 0; i < TCP_SYNQ_HSIZE; i++) {
534 			while ((req = lopt->syn_table[i]) != NULL) {
535 				lopt->syn_table[i] = req->dl_next;
536 				lopt->qlen--;
537 				tcp_openreq_free(req);
538 
539 		/* Following the specs, it would be better either to send a FIN
540 		 * (and enter FIN-WAIT-1; it is a normal close)
541 		 * or to send an active reset (abort).
542 		 * Certainly, it is pretty dangerous during a synflood, but that
543 		 * is a bad justification for our negligence 8)
544 		 * To be honest, we are not able to implement either
545 		 * of the variants now.			--ANK
546 		 */
547 			}
548 		}
549 	}
550 	BUG_TRAP(!lopt->qlen);
551 
552 	kfree(lopt);
553 
554 	while ((req = acc_req) != NULL) {
555 		struct sock *child = req->sk;
556 
557 		acc_req = req->dl_next;
558 
559 		local_bh_disable();
560 		bh_lock_sock(child);
561 		BUG_TRAP(!sock_owned_by_user(child));
562 		sock_hold(child);
563 
564 		tcp_disconnect(child, O_NONBLOCK);
565 
566 		sock_orphan(child);
567 
568 		atomic_inc(&tcp_orphan_count);
569 
570 		tcp_destroy_sock(child);
571 
572 		bh_unlock_sock(child);
573 		local_bh_enable();
574 		sock_put(child);
575 
576 		sk_acceptq_removed(sk);
577 		tcp_openreq_fastfree(req);
578 	}
579 	BUG_TRAP(!sk->sk_ack_backlog);
580 }
581 
582 static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
583 {
584 	TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
585 	tp->pushed_seq = tp->write_seq;
586 }
587 
588 static inline int forced_push(struct tcp_sock *tp)
589 {
590 	return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
591 }
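/*
 * Worked example for forced_push(): with tp->max_window at 64 KB, a PSH is
 * forced once more than 32 KB (max_window >> 1) has been queued beyond
 * tp->pushed_seq, the point last marked by tcp_mark_push().
 */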
592 
593 static inline void skb_entail(struct sock *sk, struct tcp_sock *tp,
594 			      struct sk_buff *skb)
595 {
596 	skb->csum = 0;
597 	TCP_SKB_CB(skb)->seq = tp->write_seq;
598 	TCP_SKB_CB(skb)->end_seq = tp->write_seq;
599 	TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
600 	TCP_SKB_CB(skb)->sacked = 0;
601 	skb_header_release(skb);
602 	__skb_queue_tail(&sk->sk_write_queue, skb);
603 	sk_charge_skb(sk, skb);
604 	if (!sk->sk_send_head)
605 		sk->sk_send_head = skb;
606 	else if (tp->nonagle&TCP_NAGLE_PUSH)
607 		tp->nonagle &= ~TCP_NAGLE_PUSH;
608 }
609 
610 static inline void tcp_mark_urg(struct tcp_sock *tp, int flags,
611 				struct sk_buff *skb)
612 {
613 	if (flags & MSG_OOB) {
614 		tp->urg_mode = 1;
615 		tp->snd_up = tp->write_seq;
616 		TCP_SKB_CB(skb)->sacked |= TCPCB_URG;
617 	}
618 }
619 
620 static inline void tcp_push(struct sock *sk, struct tcp_sock *tp, int flags,
621 			    int mss_now, int nonagle)
622 {
623 	if (sk->sk_send_head) {
624 		struct sk_buff *skb = sk->sk_write_queue.prev;
625 		if (!(flags & MSG_MORE) || forced_push(tp))
626 			tcp_mark_push(tp, skb);
627 		tcp_mark_urg(tp, flags, skb);
628 		__tcp_push_pending_frames(sk, tp, mss_now,
629 					  (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle);
630 	}
631 }
632 
633 static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset,
634 			 size_t psize, int flags)
635 {
636 	struct tcp_sock *tp = tcp_sk(sk);
637 	int mss_now;
638 	int err;
639 	ssize_t copied;
640 	long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
641 
642 	/* Wait for a connection to finish. */
643 	if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
644 		if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
645 			goto out_err;
646 
647 	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
648 
649 	mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
650 	copied = 0;
651 
652 	err = -EPIPE;
653 	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
654 		goto do_error;
655 
656 	while (psize > 0) {
657 		struct sk_buff *skb = sk->sk_write_queue.prev;
658 		struct page *page = pages[poffset / PAGE_SIZE];
659 		int copy, i, can_coalesce;
660 		int offset = poffset % PAGE_SIZE;
661 		int size = min_t(size_t, psize, PAGE_SIZE - offset);
662 
663 		if (!sk->sk_send_head || (copy = mss_now - skb->len) <= 0) {
664 new_segment:
665 			if (!sk_stream_memory_free(sk))
666 				goto wait_for_sndbuf;
667 
668 			skb = sk_stream_alloc_pskb(sk, 0, 0,
669 						   sk->sk_allocation);
670 			if (!skb)
671 				goto wait_for_memory;
672 
673 			skb_entail(sk, tp, skb);
674 			copy = mss_now;
675 		}
676 
677 		if (copy > size)
678 			copy = size;
679 
680 		i = skb_shinfo(skb)->nr_frags;
681 		can_coalesce = skb_can_coalesce(skb, i, page, offset);
682 		if (!can_coalesce && i >= MAX_SKB_FRAGS) {
683 			tcp_mark_push(tp, skb);
684 			goto new_segment;
685 		}
686 		if (sk->sk_forward_alloc < copy &&
687 		    !sk_stream_mem_schedule(sk, copy, 0))
688 			goto wait_for_memory;
689 
690 		if (can_coalesce) {
691 			skb_shinfo(skb)->frags[i - 1].size += copy;
692 		} else {
693 			get_page(page);
694 			skb_fill_page_desc(skb, i, page, offset, copy);
695 		}
696 
697 		skb->len += copy;
698 		skb->data_len += copy;
699 		skb->truesize += copy;
700 		sk->sk_wmem_queued += copy;
701 		sk->sk_forward_alloc -= copy;
702 		skb->ip_summed = CHECKSUM_HW;
703 		tp->write_seq += copy;
704 		TCP_SKB_CB(skb)->end_seq += copy;
705 		skb_shinfo(skb)->tso_segs = 0;
706 
707 		if (!copied)
708 			TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
709 
710 		copied += copy;
711 		poffset += copy;
712 		if (!(psize -= copy))
713 			goto out;
714 
715 		if (skb->len != mss_now || (flags & MSG_OOB))
716 			continue;
717 
718 		if (forced_push(tp)) {
719 			tcp_mark_push(tp, skb);
720 			__tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
721 		} else if (skb == sk->sk_send_head)
722 			tcp_push_one(sk, mss_now);
723 		continue;
724 
725 wait_for_sndbuf:
726 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
727 wait_for_memory:
728 		if (copied)
729 			tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
730 
731 		if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
732 			goto do_error;
733 
734 		mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
735 	}
736 
737 out:
738 	if (copied)
739 		tcp_push(sk, tp, flags, mss_now, tp->nonagle);
740 	return copied;
741 
742 do_error:
743 	if (copied)
744 		goto out;
745 out_err:
746 	return sk_stream_error(sk, flags, err);
747 }
748 
749 ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
750 		     size_t size, int flags)
751 {
752 	ssize_t res;
753 	struct sock *sk = sock->sk;
754 
755 #define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)
756 
757 	if (!(sk->sk_route_caps & NETIF_F_SG) ||
758 	    !(sk->sk_route_caps & TCP_ZC_CSUM_FLAGS))
759 		return sock_no_sendpage(sock, page, offset, size, flags);
760 
761 #undef TCP_ZC_CSUM_FLAGS
762 
763 	lock_sock(sk);
764 	TCP_CHECK_TIMER(sk);
765 	res = do_tcp_sendpages(sk, &page, offset, size, flags);
766 	TCP_CHECK_TIMER(sk);
767 	release_sock(sk);
768 	return res;
769 }
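/*
 * The usual user-space path into tcp_sendpage() is sendfile(2) on a TCP
 * socket (a sketch; in_fd is a regular file, out_fd a connected socket):
 *
 *	off_t off = 0;
 *	ssize_t n = sendfile(out_fd, in_fd, &off, count);
 *
 * When the route's device lacks scatter-gather or checksum offload
 * (the NETIF_F_SG / TCP_ZC_CSUM_FLAGS test above), the call falls back to
 * sock_no_sendpage(), i.e. an ordinary copying send.
 */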
770 
771 #define TCP_PAGE(sk)	(sk->sk_sndmsg_page)
772 #define TCP_OFF(sk)	(sk->sk_sndmsg_off)
773 
774 static inline int select_size(struct sock *sk, struct tcp_sock *tp)
775 {
776 	int tmp = tp->mss_cache_std;
777 
778 	if (sk->sk_route_caps & NETIF_F_SG) {
779 		int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
780 
781 		if (tmp >= pgbreak &&
782 		    tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
783 			tmp = pgbreak;
784 	}
785 	return tmp;
786 }
787 
788 int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
789 		size_t size)
790 {
791 	struct iovec *iov;
792 	struct tcp_sock *tp = tcp_sk(sk);
793 	struct sk_buff *skb;
794 	int iovlen, flags;
795 	int mss_now;
796 	int err, copied;
797 	long timeo;
798 
799 	lock_sock(sk);
800 	TCP_CHECK_TIMER(sk);
801 
802 	flags = msg->msg_flags;
803 	timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
804 
805 	/* Wait for a connection to finish. */
806 	if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
807 		if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
808 			goto out_err;
809 
810 	/* This should be in poll */
811 	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
812 
813 	mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
814 
815 	/* Ok commence sending. */
816 	iovlen = msg->msg_iovlen;
817 	iov = msg->msg_iov;
818 	copied = 0;
819 
820 	err = -EPIPE;
821 	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
822 		goto do_error;
823 
824 	while (--iovlen >= 0) {
825 		int seglen = iov->iov_len;
826 		unsigned char __user *from = iov->iov_base;
827 
828 		iov++;
829 
830 		while (seglen > 0) {
831 			int copy;
832 
833 			skb = sk->sk_write_queue.prev;
834 
835 			if (!sk->sk_send_head ||
836 			    (copy = mss_now - skb->len) <= 0) {
837 
838 new_segment:
839 				/* Allocate new segment. If the interface is SG,
840 				 * allocate skb fitting to single page.
841 				 */
842 				if (!sk_stream_memory_free(sk))
843 					goto wait_for_sndbuf;
844 
845 				skb = sk_stream_alloc_pskb(sk, select_size(sk, tp),
846 							   0, sk->sk_allocation);
847 				if (!skb)
848 					goto wait_for_memory;
849 
850 				/*
851 				 * Check whether we can use HW checksum.
852 				 */
853 				if (sk->sk_route_caps &
854 				    (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM |
855 				     NETIF_F_HW_CSUM))
856 					skb->ip_summed = CHECKSUM_HW;
857 
858 				skb_entail(sk, tp, skb);
859 				copy = mss_now;
860 			}
861 
862 			/* Try to append data to the end of skb. */
863 			if (copy > seglen)
864 				copy = seglen;
865 
866 			/* Where to copy to? */
867 			if (skb_tailroom(skb) > 0) {
868 				/* We have some space in skb head. Superb! */
869 				if (copy > skb_tailroom(skb))
870 					copy = skb_tailroom(skb);
871 				if ((err = skb_add_data(skb, from, copy)) != 0)
872 					goto do_fault;
873 			} else {
874 				int merge = 0;
875 				int i = skb_shinfo(skb)->nr_frags;
876 				struct page *page = TCP_PAGE(sk);
877 				int off = TCP_OFF(sk);
878 
879 				if (skb_can_coalesce(skb, i, page, off) &&
880 				    off != PAGE_SIZE) {
881 					/* We can extend the last page
882 					 * fragment. */
883 					merge = 1;
884 				} else if (i == MAX_SKB_FRAGS ||
885 					   (!i &&
886 					   !(sk->sk_route_caps & NETIF_F_SG))) {
887 					/* Need to add new fragment and cannot
888 					 * do this because interface is non-SG,
889 					 * or because all the page slots are
890 					 * busy. */
891 					tcp_mark_push(tp, skb);
892 					goto new_segment;
893 				} else if (page) {
894 					/* If page is cached, align
895 					 * offset to L1 cache boundary
896 					 */
897 					off = (off + L1_CACHE_BYTES - 1) &
898 					      ~(L1_CACHE_BYTES - 1);
899 					if (off == PAGE_SIZE) {
900 						put_page(page);
901 						TCP_PAGE(sk) = page = NULL;
902 					}
903 				}
904 
905 				if (!page) {
906 					/* Allocate new cache page. */
907 					if (!(page = sk_stream_alloc_page(sk)))
908 						goto wait_for_memory;
909 					off = 0;
910 				}
911 
912 				if (copy > PAGE_SIZE - off)
913 					copy = PAGE_SIZE - off;
914 
915 				/* Time to copy data. We are close to
916 				 * the end! */
917 				err = skb_copy_to_page(sk, from, skb, page,
918 						       off, copy);
919 				if (err) {
920 					/* If this page was new, give it to the
921 					 * socket so it does not get leaked.
922 					 */
923 					if (!TCP_PAGE(sk)) {
924 						TCP_PAGE(sk) = page;
925 						TCP_OFF(sk) = 0;
926 					}
927 					goto do_error;
928 				}
929 
930 				/* Update the skb. */
931 				if (merge) {
932 					skb_shinfo(skb)->frags[i - 1].size +=
933 									copy;
934 				} else {
935 					skb_fill_page_desc(skb, i, page, off, copy);
936 					if (TCP_PAGE(sk)) {
937 						get_page(page);
938 					} else if (off + copy < PAGE_SIZE) {
939 						get_page(page);
940 						TCP_PAGE(sk) = page;
941 					}
942 				}
943 
944 				TCP_OFF(sk) = off + copy;
945 			}
946 
947 			if (!copied)
948 				TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
949 
950 			tp->write_seq += copy;
951 			TCP_SKB_CB(skb)->end_seq += copy;
952 			skb_shinfo(skb)->tso_segs = 0;
953 
954 			from += copy;
955 			copied += copy;
956 			if ((seglen -= copy) == 0 && iovlen == 0)
957 				goto out;
958 
959 			if (skb->len != mss_now || (flags & MSG_OOB))
960 				continue;
961 
962 			if (forced_push(tp)) {
963 				tcp_mark_push(tp, skb);
964 				__tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
965 			} else if (skb == sk->sk_send_head)
966 				tcp_push_one(sk, mss_now);
967 			continue;
968 
969 wait_for_sndbuf:
970 			set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
971 wait_for_memory:
972 			if (copied)
973 				tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
974 
975 			if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
976 				goto do_error;
977 
978 			mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
979 		}
980 	}
981 
982 out:
983 	if (copied)
984 		tcp_push(sk, tp, flags, mss_now, tp->nonagle);
985 	TCP_CHECK_TIMER(sk);
986 	release_sock(sk);
987 	return copied;
988 
989 do_fault:
990 	if (!skb->len) {
991 		if (sk->sk_send_head == skb)
992 			sk->sk_send_head = NULL;
993 		__skb_unlink(skb, skb->list);
994 		sk_stream_free_skb(sk, skb);
995 	}
996 
997 do_error:
998 	if (copied)
999 		goto out;
1000 out_err:
1001 	err = sk_stream_error(sk, flags, err);
1002 	TCP_CHECK_TIMER(sk);
1003 	release_sock(sk);
1004 	return err;
1005 }
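/*
 * Flag handling in tcp_sendmsg(), as seen from user space (a sketch;
 * buf and len are placeholders):
 *
 *	send(fd, buf, len, 0);		normal Nagle behaviour (tp->nonagle)
 *	send(fd, buf, len, MSG_MORE);	cork: tcp_push() runs with
 *					TCP_NAGLE_CORK, so partial segments
 *					are held back and merged
 *	send(fd, buf, len, MSG_OOB);	mark the data urgent via
 *					tcp_mark_urg() and skip the
 *					push-per-full-segment shortcut
 */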
1006 
1007 /*
1008  *	Handle reading urgent data. BSD has very simple semantics for
1009  *	this, no blocking and very strange errors 8)
1010  */
1011 
1012 static int tcp_recv_urg(struct sock *sk, long timeo,
1013 			struct msghdr *msg, int len, int flags,
1014 			int *addr_len)
1015 {
1016 	struct tcp_sock *tp = tcp_sk(sk);
1017 
1018 	/* No URG data to read. */
1019 	if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data ||
1020 	    tp->urg_data == TCP_URG_READ)
1021 		return -EINVAL;	/* Yes this is right ! */
1022 
1023 	if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE))
1024 		return -ENOTCONN;
1025 
1026 	if (tp->urg_data & TCP_URG_VALID) {
1027 		int err = 0;
1028 		char c = tp->urg_data;
1029 
1030 		if (!(flags & MSG_PEEK))
1031 			tp->urg_data = TCP_URG_READ;
1032 
1033 		/* Read urgent data. */
1034 		msg->msg_flags |= MSG_OOB;
1035 
1036 		if (len > 0) {
1037 			if (!(flags & MSG_TRUNC))
1038 				err = memcpy_toiovec(msg->msg_iov, &c, 1);
1039 			len = 1;
1040 		} else
1041 			msg->msg_flags |= MSG_TRUNC;
1042 
1043 		return err ? -EFAULT : len;
1044 	}
1045 
1046 	if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN))
1047 		return 0;
1048 
1049 	/* Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
1050 	 * the available implementations agree in this case:
1051 	 * this call should never block, independent of the
1052 	 * blocking state of the socket.
1053 	 * Mike <pall@rz.uni-karlsruhe.de>
1054 	 */
1055 	return -EAGAIN;
1056 }
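/*
 * User-space counterpart of tcp_recv_urg() (a sketch):
 *
 *	char c;
 *	recv(fd, &c, 1, MSG_OOB);	returns the single urgent byte and
 *					never blocks: EINVAL if no urgent
 *					data is pending or it was already
 *					read, EAGAIN if the mark was seen
 *					but the byte has not arrived yet
 *
 * With setsockopt(fd, SOL_SOCKET, SO_OOBINLINE, ...) the urgent byte is
 * left in the normal data stream (SOCK_URGINLINE above) and MSG_OOB reads
 * fail with EINVAL.
 */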
1057 
1058 /* Clean up the receive buffer for full frames taken by the user,
1059  * then send an ACK if necessary.  COPIED is the number of bytes
1060  * tcp_recvmsg has given to the user so far; it speeds up the
1061  * calculation of whether or not we must ACK for the sake of
1062  * a window update.
1063  */
1064 static void cleanup_rbuf(struct sock *sk, int copied)
1065 {
1066 	struct tcp_sock *tp = tcp_sk(sk);
1067 	int time_to_ack = 0;
1068 
1069 #if TCP_DEBUG
1070 	struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
1071 
1072 	BUG_TRAP(!skb || before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq));
1073 #endif
1074 
1075 	if (tcp_ack_scheduled(tp)) {
1076 		   /* Delayed ACKs frequently hit locked sockets during bulk
1077 		    * receive. */
1078 		if (tp->ack.blocked ||
1079 		    /* Once-per-two-segments ACK was not sent by tcp_input.c */
1080 		    tp->rcv_nxt - tp->rcv_wup > tp->ack.rcv_mss ||
1081 		    /*
1082 		     * If this read emptied the read buffer, we send an ACK when
1083 		     * the connection is not bidirectional, the user drained the
1084 		     * receive buffer, and there was a small segment
1085 		     * in the queue.
1086 		     */
1087 		    (copied > 0 && (tp->ack.pending & TCP_ACK_PUSHED) &&
1088 		     !tp->ack.pingpong && !atomic_read(&sk->sk_rmem_alloc)))
1089 			time_to_ack = 1;
1090 	}
1091 
1092 	/* We send an ACK if we can now advertise a non-zero window
1093 	 * which has been raised "significantly".
1094 	 *
1095 	 * Even if the window is raised up to infinity, do not send a window
1096 	 * open ACK in states where we will not receive more. It is useless.
1097 	 */
1098 	if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
1099 		__u32 rcv_window_now = tcp_receive_window(tp);
1100 
1101 		/* Optimize, __tcp_select_window() is not cheap. */
1102 		if (2*rcv_window_now <= tp->window_clamp) {
1103 			__u32 new_window = __tcp_select_window(sk);
1104 
1105 			/* Send an ACK now if this read freed lots of space
1106 			 * in our buffer. We can advertise the new window now,
1107 			 * provided it is not smaller than the current one.
1108 			 * "Lots" means "at least twice" here.
1109 			 */
1110 			if (new_window && new_window >= 2 * rcv_window_now)
1111 				time_to_ack = 1;
1112 		}
1113 	}
1114 	if (time_to_ack)
1115 		tcp_send_ack(sk);
1116 }
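/*
 * Worked example for the window-update test above: with tp->window_clamp
 * at 64 KB and only 16 KB currently advertised (2 * 16 KB <= 64 KB), a
 * read that lets __tcp_select_window() offer 48 KB (>= 2 * 16 KB) sends
 * an ACK immediately; freeing less than twice the advertised window
 * does not.
 */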
1117 
1118 static void tcp_prequeue_process(struct sock *sk)
1119 {
1120 	struct sk_buff *skb;
1121 	struct tcp_sock *tp = tcp_sk(sk);
1122 
1123 	NET_ADD_STATS_USER(LINUX_MIB_TCPPREQUEUED, skb_queue_len(&tp->ucopy.prequeue));
1124 
1125 	/* The RX process wants to run with BHs disabled, though it
1126 	 * is not necessary */
1127 	local_bh_disable();
1128 	while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
1129 		sk->sk_backlog_rcv(sk, skb);
1130 	local_bh_enable();
1131 
1132 	/* Clear memory counter. */
1133 	tp->ucopy.memory = 0;
1134 }
1135 
1136 static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
1137 {
1138 	struct sk_buff *skb;
1139 	u32 offset;
1140 
1141 	skb_queue_walk(&sk->sk_receive_queue, skb) {
1142 		offset = seq - TCP_SKB_CB(skb)->seq;
1143 		if (skb->h.th->syn)
1144 			offset--;
1145 		if (offset < skb->len || skb->h.th->fin) {
1146 			*off = offset;
1147 			return skb;
1148 		}
1149 	}
1150 	return NULL;
1151 }
1152 
1153 /*
1154  * This routine provides an alternative to tcp_recvmsg() for routines
1155  * that would like to handle copying from skbuffs directly in 'sendfile'
1156  * fashion.
1157  * Note:
1158  *	- It is assumed that the socket was locked by the caller.
1159  *	- The routine does not block.
1160  *	- At present, there is no support for reading OOB data
1161  *	  or for 'peeking' the socket using this routine
1162  *	  (although both would be easy to implement).
1163  */
1164 int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
1165 		  sk_read_actor_t recv_actor)
1166 {
1167 	struct sk_buff *skb;
1168 	struct tcp_sock *tp = tcp_sk(sk);
1169 	u32 seq = tp->copied_seq;
1170 	u32 offset;
1171 	int copied = 0;
1172 
1173 	if (sk->sk_state == TCP_LISTEN)
1174 		return -ENOTCONN;
1175 	while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
1176 		if (offset < skb->len) {
1177 			size_t used, len;
1178 
1179 			len = skb->len - offset;
1180 			/* Stop reading if we hit a patch of urgent data */
1181 			if (tp->urg_data) {
1182 				u32 urg_offset = tp->urg_seq - seq;
1183 				if (urg_offset < len)
1184 					len = urg_offset;
1185 				if (!len)
1186 					break;
1187 			}
1188 			used = recv_actor(desc, skb, offset, len);
1189 			if (used <= len) {
1190 				seq += used;
1191 				copied += used;
1192 				offset += used;
1193 			}
1194 			if (offset != skb->len)
1195 				break;
1196 		}
1197 		if (skb->h.th->fin) {
1198 			sk_eat_skb(sk, skb);
1199 			++seq;
1200 			break;
1201 		}
1202 		sk_eat_skb(sk, skb);
1203 		if (!desc->count)
1204 			break;
1205 	}
1206 	tp->copied_seq = seq;
1207 
1208 	tcp_rcv_space_adjust(sk);
1209 
1210 	/* Clean up data we have read: This will do ACK frames. */
1211 	if (copied)
1212 		cleanup_rbuf(sk, copied);
1213 	return copied;
1214 }
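/*
 * A sketch of a recv_actor as tcp_read_sock() expects one (names below are
 * illustrative only): the callback receives the skb, the offset into it and
 * the number of bytes available, and returns how many bytes it consumed.
 *
 *	static int example_actor(read_descriptor_t *desc, struct sk_buff *skb,
 *				 unsigned int offset, size_t len)
 *	{
 *		size_t want = min_t(size_t, len, desc->count);
 *
 *		... consume "want" bytes starting at offset ...
 *		desc->count -= want;
 *		return want;
 *	}
 */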
1215 
1216 /*
1217  *	This routine copies from a sock struct into the user buffer.
1218  *
1219  *	Technical note: in 2.3 we work on _locked_ socket, so that
1220  *	tricks with *seq access order and skb->users are not required.
1221  *	Probably, code can be easily improved even more.
1222  */
1223 
1224 int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1225 		size_t len, int nonblock, int flags, int *addr_len)
1226 {
1227 	struct tcp_sock *tp = tcp_sk(sk);
1228 	int copied = 0;
1229 	u32 peek_seq;
1230 	u32 *seq;
1231 	unsigned long used;
1232 	int err;
1233 	int target;		/* Read at least this many bytes */
1234 	long timeo;
1235 	struct task_struct *user_recv = NULL;
1236 
1237 	lock_sock(sk);
1238 
1239 	TCP_CHECK_TIMER(sk);
1240 
1241 	err = -ENOTCONN;
1242 	if (sk->sk_state == TCP_LISTEN)
1243 		goto out;
1244 
1245 	timeo = sock_rcvtimeo(sk, nonblock);
1246 
1247 	/* Urgent data needs to be handled specially. */
1248 	if (flags & MSG_OOB)
1249 		goto recv_urg;
1250 
1251 	seq = &tp->copied_seq;
1252 	if (flags & MSG_PEEK) {
1253 		peek_seq = tp->copied_seq;
1254 		seq = &peek_seq;
1255 	}
1256 
1257 	target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
1258 
1259 	do {
1260 		struct sk_buff *skb;
1261 		u32 offset;
1262 
1263 		/* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
1264 		if (tp->urg_data && tp->urg_seq == *seq) {
1265 			if (copied)
1266 				break;
1267 			if (signal_pending(current)) {
1268 				copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
1269 				break;
1270 			}
1271 		}
1272 
1273 		/* Next get a buffer. */
1274 
1275 		skb = skb_peek(&sk->sk_receive_queue);
1276 		do {
1277 			if (!skb)
1278 				break;
1279 
1280 			/* Now that we have two receive queues this
1281 			 * shouldn't happen.
1282 			 */
1283 			if (before(*seq, TCP_SKB_CB(skb)->seq)) {
1284 				printk(KERN_INFO "recvmsg bug: copied %X "
1285 				       "seq %X\n", *seq, TCP_SKB_CB(skb)->seq);
1286 				break;
1287 			}
1288 			offset = *seq - TCP_SKB_CB(skb)->seq;
1289 			if (skb->h.th->syn)
1290 				offset--;
1291 			if (offset < skb->len)
1292 				goto found_ok_skb;
1293 			if (skb->h.th->fin)
1294 				goto found_fin_ok;
1295 			BUG_TRAP(flags & MSG_PEEK);
1296 			skb = skb->next;
1297 		} while (skb != (struct sk_buff *)&sk->sk_receive_queue);
1298 
1299 		/* Well, if we have backlog, try to process it now. */
1300 
1301 		if (copied >= target && !sk->sk_backlog.tail)
1302 			break;
1303 
1304 		if (copied) {
1305 			if (sk->sk_err ||
1306 			    sk->sk_state == TCP_CLOSE ||
1307 			    (sk->sk_shutdown & RCV_SHUTDOWN) ||
1308 			    !timeo ||
1309 			    signal_pending(current) ||
1310 			    (flags & MSG_PEEK))
1311 				break;
1312 		} else {
1313 			if (sock_flag(sk, SOCK_DONE))
1314 				break;
1315 
1316 			if (sk->sk_err) {
1317 				copied = sock_error(sk);
1318 				break;
1319 			}
1320 
1321 			if (sk->sk_shutdown & RCV_SHUTDOWN)
1322 				break;
1323 
1324 			if (sk->sk_state == TCP_CLOSE) {
1325 				if (!sock_flag(sk, SOCK_DONE)) {
1326 					/* This occurs when the user tries to
1327 					 * read from a never-connected socket.
1328 					 */
1329 					copied = -ENOTCONN;
1330 					break;
1331 				}
1332 				break;
1333 			}
1334 
1335 			if (!timeo) {
1336 				copied = -EAGAIN;
1337 				break;
1338 			}
1339 
1340 			if (signal_pending(current)) {
1341 				copied = sock_intr_errno(timeo);
1342 				break;
1343 			}
1344 		}
1345 
1346 		cleanup_rbuf(sk, copied);
1347 
1348 		if (tp->ucopy.task == user_recv) {
1349 			/* Install new reader */
1350 			if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) {
1351 				user_recv = current;
1352 				tp->ucopy.task = user_recv;
1353 				tp->ucopy.iov = msg->msg_iov;
1354 			}
1355 
1356 			tp->ucopy.len = len;
1357 
1358 			BUG_TRAP(tp->copied_seq == tp->rcv_nxt ||
1359 				 (flags & (MSG_PEEK | MSG_TRUNC)));
1360 
1361 			/* Ugly... If the prequeue is not empty, we have to
1362 			 * process it before releasing the socket, otherwise
1363 			 * ordering will be broken on the second iteration.
1364 			 * A more elegant solution is required!!!
1365 			 *
1366 			 * Look: we have the following (pseudo)queues:
1367 			 *
1368 			 * 1. packets in flight
1369 			 * 2. backlog
1370 			 * 3. prequeue
1371 			 * 4. receive_queue
1372 			 *
1373 			 * Each queue can be processed only if the next ones
1374 			 * are empty. At this point we have empty receive_queue.
1375 			 * But prequeue _can_ be not empty after 2nd iteration,
1376 			 * when we jumped to start of loop because backlog
1377 			 * processing added something to receive_queue.
1378 			 * We cannot release_sock(), because backlog contains
1379 			 * packets arrived _after_ prequeued ones.
1380 			 *
1381 			 * In short, the algorithm is clear --- process all
1382 			 * the queues in order. We could do it more directly,
1383 			 * requeueing packets from the backlog to the prequeue
1384 			 * if it is not empty. That is more elegant, but eats
1385 			 * cycles, unfortunately.
1386 			 */
1387 			if (skb_queue_len(&tp->ucopy.prequeue))
1388 				goto do_prequeue;
1389 
1390 			/* __ Set realtime policy in scheduler __ */
1391 		}
1392 
1393 		if (copied >= target) {
1394 			/* Do not sleep, just process backlog. */
1395 			release_sock(sk);
1396 			lock_sock(sk);
1397 		} else
1398 			sk_wait_data(sk, &timeo);
1399 
1400 		if (user_recv) {
1401 			int chunk;
1402 
1403 			/* __ Restore normal policy in scheduler __ */
1404 
1405 			if ((chunk = len - tp->ucopy.len) != 0) {
1406 				NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk);
1407 				len -= chunk;
1408 				copied += chunk;
1409 			}
1410 
1411 			if (tp->rcv_nxt == tp->copied_seq &&
1412 			    skb_queue_len(&tp->ucopy.prequeue)) {
1413 do_prequeue:
1414 				tcp_prequeue_process(sk);
1415 
1416 				if ((chunk = len - tp->ucopy.len) != 0) {
1417 					NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
1418 					len -= chunk;
1419 					copied += chunk;
1420 				}
1421 			}
1422 		}
1423 		if ((flags & MSG_PEEK) && peek_seq != tp->copied_seq) {
1424 			if (net_ratelimit())
1425 				printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n",
1426 				       current->comm, current->pid);
1427 			peek_seq = tp->copied_seq;
1428 		}
1429 		continue;
1430 
1431 	found_ok_skb:
1432 		/* Ok so how much can we use? */
1433 		used = skb->len - offset;
1434 		if (len < used)
1435 			used = len;
1436 
1437 		/* Do we have urgent data here? */
1438 		if (tp->urg_data) {
1439 			u32 urg_offset = tp->urg_seq - *seq;
1440 			if (urg_offset < used) {
1441 				if (!urg_offset) {
1442 					if (!sock_flag(sk, SOCK_URGINLINE)) {
1443 						++*seq;
1444 						offset++;
1445 						used--;
1446 						if (!used)
1447 							goto skip_copy;
1448 					}
1449 				} else
1450 					used = urg_offset;
1451 			}
1452 		}
1453 
1454 		if (!(flags & MSG_TRUNC)) {
1455 			err = skb_copy_datagram_iovec(skb, offset,
1456 						      msg->msg_iov, used);
1457 			if (err) {
1458 				/* Exception. Bailout! */
1459 				if (!copied)
1460 					copied = -EFAULT;
1461 				break;
1462 			}
1463 		}
1464 
1465 		*seq += used;
1466 		copied += used;
1467 		len -= used;
1468 
1469 		tcp_rcv_space_adjust(sk);
1470 
1471 skip_copy:
1472 		if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
1473 			tp->urg_data = 0;
1474 			tcp_fast_path_check(sk, tp);
1475 		}
1476 		if (used + offset < skb->len)
1477 			continue;
1478 
1479 		if (skb->h.th->fin)
1480 			goto found_fin_ok;
1481 		if (!(flags & MSG_PEEK))
1482 			sk_eat_skb(sk, skb);
1483 		continue;
1484 
1485 	found_fin_ok:
1486 		/* Process the FIN. */
1487 		++*seq;
1488 		if (!(flags & MSG_PEEK))
1489 			sk_eat_skb(sk, skb);
1490 		break;
1491 	} while (len > 0);
1492 
1493 	if (user_recv) {
1494 		if (skb_queue_len(&tp->ucopy.prequeue)) {
1495 			int chunk;
1496 
1497 			tp->ucopy.len = copied > 0 ? len : 0;
1498 
1499 			tcp_prequeue_process(sk);
1500 
1501 			if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
1502 				NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
1503 				len -= chunk;
1504 				copied += chunk;
1505 			}
1506 		}
1507 
1508 		tp->ucopy.task = NULL;
1509 		tp->ucopy.len = 0;
1510 	}
1511 
1512 	/* According to UNIX98, msg_name/msg_namelen are ignored
1513 	 * on a connected socket. I was just happy when I found this 8) --ANK
1514 	 */
1515 
1516 	/* Clean up data we have read: This will do ACK frames. */
1517 	cleanup_rbuf(sk, copied);
1518 
1519 	TCP_CHECK_TIMER(sk);
1520 	release_sock(sk);
1521 	return copied;
1522 
1523 out:
1524 	TCP_CHECK_TIMER(sk);
1525 	release_sock(sk);
1526 	return err;
1527 
1528 recv_urg:
1529 	err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len);
1530 	goto out;
1531 }
1532 
1533 /*
1534  *	State processing on a close. This implements the state shift for
1535  *	sending our FIN frame. Note that we only send a FIN for some
1536  *	states. A shutdown() may have already sent the FIN, or we may be
1537  *	closed.
1538  */
1539 
1540 static unsigned char new_state[16] = {
1541   /* current state:        new state:      action:	*/
1542   /* (Invalid)		*/ TCP_CLOSE,
1543   /* TCP_ESTABLISHED	*/ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1544   /* TCP_SYN_SENT	*/ TCP_CLOSE,
1545   /* TCP_SYN_RECV	*/ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1546   /* TCP_FIN_WAIT1	*/ TCP_FIN_WAIT1,
1547   /* TCP_FIN_WAIT2	*/ TCP_FIN_WAIT2,
1548   /* TCP_TIME_WAIT	*/ TCP_CLOSE,
1549   /* TCP_CLOSE		*/ TCP_CLOSE,
1550   /* TCP_CLOSE_WAIT	*/ TCP_LAST_ACK  | TCP_ACTION_FIN,
1551   /* TCP_LAST_ACK	*/ TCP_LAST_ACK,
1552   /* TCP_LISTEN		*/ TCP_CLOSE,
1553   /* TCP_CLOSING	*/ TCP_CLOSING,
1554 };
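/*
 * Reading the table above: new_state[TCP_CLOSE_WAIT] is
 * TCP_LAST_ACK | TCP_ACTION_FIN, so a close() on a socket whose peer has
 * already sent its FIN moves us to LAST_ACK and tells tcp_close_state()'s
 * caller to transmit our own FIN.  States such as FIN_WAIT1 map to
 * themselves with no action, because our FIN is already on the wire.
 */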
1555 
1556 static int tcp_close_state(struct sock *sk)
1557 {
1558 	int next = (int)new_state[sk->sk_state];
1559 	int ns = next & TCP_STATE_MASK;
1560 
1561 	tcp_set_state(sk, ns);
1562 
1563 	return next & TCP_ACTION_FIN;
1564 }
1565 
1566 /*
1567  *	Shutdown the sending side of a connection. Much like close except
1568  *	that we don't receive shut down or set_sock_flag(sk, SOCK_DEAD).
1569  */
1570 
1571 void tcp_shutdown(struct sock *sk, int how)
1572 {
1573 	/*	We need to grab some memory, and put together a FIN,
1574 	 *	and then put it into the queue to be sent.
1575 	 *		Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
1576 	 */
1577 	if (!(how & SEND_SHUTDOWN))
1578 		return;
1579 
1580 	/* If we've already sent a FIN, or it's a closed state, skip this. */
1581 	if ((1 << sk->sk_state) &
1582 	    (TCPF_ESTABLISHED | TCPF_SYN_SENT |
1583 	     TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
1584 		/* Clear out any half completed packets.  FIN if needed. */
1585 		if (tcp_close_state(sk))
1586 			tcp_send_fin(sk);
1587 	}
1588 }
1589 
1590 /*
1591  * At this point, there should be no process reference to this
1592  * socket, and thus no user references at all.  Therefore we
1593  * can assume the socket waitqueue is inactive and nobody will
1594  * try to jump onto it.
1595  */
1596 void tcp_destroy_sock(struct sock *sk)
1597 {
1598 	BUG_TRAP(sk->sk_state == TCP_CLOSE);
1599 	BUG_TRAP(sock_flag(sk, SOCK_DEAD));
1600 
1601 	/* It cannot be in hash table! */
1602 	BUG_TRAP(sk_unhashed(sk));
1603 
1604 	/* If inet_sk(sk)->num is nonzero, the socket must be bound */
1605 	BUG_TRAP(!inet_sk(sk)->num || tcp_sk(sk)->bind_hash);
1606 
1607 	sk->sk_prot->destroy(sk);
1608 
1609 	sk_stream_kill_queues(sk);
1610 
1611 	xfrm_sk_free_policy(sk);
1612 
1613 #ifdef INET_REFCNT_DEBUG
1614 	if (atomic_read(&sk->sk_refcnt) != 1) {
1615 		printk(KERN_DEBUG "Destruction TCP %p delayed, c=%d\n",
1616 		       sk, atomic_read(&sk->sk_refcnt));
1617 	}
1618 #endif
1619 
1620 	atomic_dec(&tcp_orphan_count);
1621 	sock_put(sk);
1622 }
1623 
1624 void tcp_close(struct sock *sk, long timeout)
1625 {
1626 	struct sk_buff *skb;
1627 	int data_was_unread = 0;
1628 
1629 	lock_sock(sk);
1630 	sk->sk_shutdown = SHUTDOWN_MASK;
1631 
1632 	if (sk->sk_state == TCP_LISTEN) {
1633 		tcp_set_state(sk, TCP_CLOSE);
1634 
1635 		/* Special case. */
1636 		tcp_listen_stop(sk);
1637 
1638 		goto adjudge_to_death;
1639 	}
1640 
1641 	/*  We need to flush the recv. buffs.  We do this only on the
1642 	 *  descriptor close, not protocol-sourced closes, because the
1643 	 *  reader process may not have drained the data yet!
1644 	 */
1645 	while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
1646 		u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq -
1647 			  skb->h.th->fin;
1648 		data_was_unread += len;
1649 		__kfree_skb(skb);
1650 	}
1651 
1652 	sk_stream_mem_reclaim(sk);
1653 
1654 	/* As outlined in draft-ietf-tcpimpl-prob-03.txt, section
1655 	 * 3.10, we send a RST here because data was lost.  To
1656 	 * witness the awful effects of the old behavior of always
1657 	 * doing a FIN, run an older 2.1.x kernel or 2.0.x, start
1658 	 * a bulk GET in an FTP client, suspend the process, wait
1659 	 * for the client to advertise a zero window, then kill -9
1660 	 * the FTP client, wheee...  Note: timeout is always zero
1661 	 * in such a case.
1662 	 */
1663 	if (data_was_unread) {
1664 		/* Unread data was tossed, zap the connection. */
1665 		NET_INC_STATS_USER(LINUX_MIB_TCPABORTONCLOSE);
1666 		tcp_set_state(sk, TCP_CLOSE);
1667 		tcp_send_active_reset(sk, GFP_KERNEL);
1668 	} else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
1669 		/* Check zero linger _after_ checking for unread data. */
1670 		sk->sk_prot->disconnect(sk, 0);
1671 		NET_INC_STATS_USER(LINUX_MIB_TCPABORTONDATA);
1672 	} else if (tcp_close_state(sk)) {
1673 		/* We FIN if the application ate all the data before
1674 		 * zapping the connection.
1675 		 */
1676 
1677 		/* RED-PEN. Formally speaking, we have broken TCP state
1678 		 * machine. State transitions:
1679 		 *
1680 		 * TCP_ESTABLISHED -> TCP_FIN_WAIT1
1681 		 * TCP_SYN_RECV	-> TCP_FIN_WAIT1 (forget it, it's impossible)
1682 		 * TCP_CLOSE_WAIT -> TCP_LAST_ACK
1683 		 *
1684 		 * are legal only when FIN has been sent (i.e. in window),
1685 		 * rather than queued out of window. Purists blame.
1686 		 *
1687 		 * F.e. "RFC state" is ESTABLISHED,
1688 		 * if Linux state is FIN-WAIT-1, but FIN is still not sent.
1689 		 *
1690 		 * The visible deviations are that we sometimes
1691 		 * enter the time-wait state when it is not really required
1692 		 * (harmless), and do not send active resets when they are
1693 		 * required by the specs (TCP_ESTABLISHED, TCP_CLOSE_WAIT, when
1694 		 * they look like CLOSING or LAST_ACK to Linux).
1695 		 * Probably, I missed some more holelets.
1696 		 * 						--ANK
1697 		 */
1698 		tcp_send_fin(sk);
1699 	}
1700 
1701 	sk_stream_wait_close(sk, timeout);
1702 
1703 adjudge_to_death:
1704 	/* It is the last release_sock in its life. It will remove backlog. */
1705 	release_sock(sk);
1706 
1707 
1708 	/* Now socket is owned by kernel and we acquire BH lock
1709 	   to finish close. No need to check for user refs.
1710 	 */
1711 	local_bh_disable();
1712 	bh_lock_sock(sk);
1713 	BUG_TRAP(!sock_owned_by_user(sk));
1714 
1715 	sock_hold(sk);
1716 	sock_orphan(sk);
1717 
1718 	/*	This is a (useful) BSD violation of the RFC. There is a
1719 	 *	problem with TCP as specified, in that the other end could
1720 	 *	keep a socket open forever with no application left at this end.
1721 	 *	We use a 3 minute timeout (about the same as BSD) and then kill
1722 	 *	our end. If they send after that then tough - BUT: long enough
1723 	 *	that we won't repeat the old "4*rto = almost no time - whoops,
1724 	 *	reset" mistake.
1725 	 *
1726 	 *	Nope, it was not a mistake. It is really the desired behaviour
1727 	 *	e.g. on HTTP servers, where such sockets are useless but
1728 	 *	consume significant resources. Let's do it with the special
1729 	 *	linger2	option.					--ANK
1730 	 */
1731 
1732 	if (sk->sk_state == TCP_FIN_WAIT2) {
1733 		struct tcp_sock *tp = tcp_sk(sk);
1734 		if (tp->linger2 < 0) {
1735 			tcp_set_state(sk, TCP_CLOSE);
1736 			tcp_send_active_reset(sk, GFP_ATOMIC);
1737 			NET_INC_STATS_BH(LINUX_MIB_TCPABORTONLINGER);
1738 		} else {
1739 			int tmo = tcp_fin_time(tp);
1740 
1741 			if (tmo > TCP_TIMEWAIT_LEN) {
1742 				tcp_reset_keepalive_timer(sk, tcp_fin_time(tp));
1743 			} else {
1744 				atomic_inc(&tcp_orphan_count);
1745 				tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
1746 				goto out;
1747 			}
1748 		}
1749 	}
1750 	if (sk->sk_state != TCP_CLOSE) {
1751 		sk_stream_mem_reclaim(sk);
1752 		if (atomic_read(&tcp_orphan_count) > sysctl_tcp_max_orphans ||
1753 		    (sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
1754 		     atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) {
1755 			if (net_ratelimit())
1756 				printk(KERN_INFO "TCP: too many orphaned "
1757 				       "sockets\n");
1758 			tcp_set_state(sk, TCP_CLOSE);
1759 			tcp_send_active_reset(sk, GFP_ATOMIC);
1760 			NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY);
1761 		}
1762 	}
1763 	atomic_inc(&tcp_orphan_count);
1764 
1765 	if (sk->sk_state == TCP_CLOSE)
1766 		tcp_destroy_sock(sk);
1767 	/* Otherwise, socket is reprieved until protocol close. */
1768 
1769 out:
1770 	bh_unlock_sock(sk);
1771 	local_bh_enable();
1772 	sock_put(sk);
1773 }
1774 
1775 /* These states need RST on ABORT according to RFC793 */
1776 
1777 static inline int tcp_need_reset(int state)
1778 {
1779 	return (1 << state) &
1780 	       (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
1781 		TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
1782 }
1783 
1784 int tcp_disconnect(struct sock *sk, int flags)
1785 {
1786 	struct inet_sock *inet = inet_sk(sk);
1787 	struct tcp_sock *tp = tcp_sk(sk);
1788 	int err = 0;
1789 	int old_state = sk->sk_state;
1790 
1791 	if (old_state != TCP_CLOSE)
1792 		tcp_set_state(sk, TCP_CLOSE);
1793 
1794 	/* ABORT function of RFC793 */
1795 	if (old_state == TCP_LISTEN) {
1796 		tcp_listen_stop(sk);
1797 	} else if (tcp_need_reset(old_state) ||
1798 		   (tp->snd_nxt != tp->write_seq &&
1799 		    (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
1800 		/* The last check adjusts for the discrepancy between Linux
1801 		 * and the RFC 793 states.
1802 		 */
1803 		tcp_send_active_reset(sk, gfp_any());
1804 		sk->sk_err = ECONNRESET;
1805 	} else if (old_state == TCP_SYN_SENT)
1806 		sk->sk_err = ECONNRESET;
1807 
1808 	tcp_clear_xmit_timers(sk);
1809 	__skb_queue_purge(&sk->sk_receive_queue);
1810 	sk_stream_writequeue_purge(sk);
1811 	__skb_queue_purge(&tp->out_of_order_queue);
1812 
1813 	inet->dport = 0;
1814 
1815 	if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
1816 		inet_reset_saddr(sk);
1817 
1818 	sk->sk_shutdown = 0;
1819 	sock_reset_flag(sk, SOCK_DONE);
1820 	tp->srtt = 0;
1821 	if ((tp->write_seq += tp->max_window + 2) == 0)
1822 		tp->write_seq = 1;
1823 	tp->backoff = 0;
1824 	tp->snd_cwnd = 2;
1825 	tp->probes_out = 0;
1826 	tp->packets_out = 0;
1827 	tp->snd_ssthresh = 0x7fffffff;
1828 	tp->snd_cwnd_cnt = 0;
1829 	tcp_set_ca_state(tp, TCP_CA_Open);
1830 	tcp_clear_retrans(tp);
1831 	tcp_delack_init(tp);
1832 	sk->sk_send_head = NULL;
1833 	tp->rx_opt.saw_tstamp = 0;
1834 	tcp_sack_reset(&tp->rx_opt);
1835 	__sk_dst_reset(sk);
1836 
1837 	BUG_TRAP(!inet->num || tp->bind_hash);
1838 
1839 	sk->sk_error_report(sk);
1840 	return err;
1841 }
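
/* tcp_disconnect() is reached from userspace by calling connect() with an
 * address whose family is AF_UNSPEC; inet_stream_connect() maps that to
 * sk->sk_prot->disconnect().  A minimal sketch (illustration only) that
 * aborts the current connection and returns the socket to a reusable
 * state:
 *
 *	struct sockaddr unspec = { .sa_family = AF_UNSPEC };
 *
 *	connect(fd, &unspec, sizeof(unspec));
 */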
1842 
1843 /*
1844  *	Wait for an incoming connection, avoid race
1845  *	conditions. This must be called with the socket locked.
1846  */
1847 static int wait_for_connect(struct sock *sk, long timeo)
1848 {
1849 	struct tcp_sock *tp = tcp_sk(sk);
1850 	DEFINE_WAIT(wait);
1851 	int err;
1852 
1853 	/*
1854 	 * True wake-one mechanism for incoming connections: only
1855 	 * one process gets woken up, not the 'whole herd'.
1856 	 * Since we do not 'race & poll' for established sockets
1857 	 * anymore, the common case will execute the loop only once.
1858 	 *
1859 	 * Subtle issue: the entry added by "add_wait_queue_exclusive()"
1860 	 * is placed after any current non-exclusive waiters, and we know that
1861 	 * it will always _stay_ after any new non-exclusive waiters
1862 	 * because all non-exclusive waiters are added at the
1863 	 * beginning of the wait-queue. As such, it's ok to "drop"
1864 	 * our exclusiveness temporarily when we get woken up without
1865 	 * having to remove and re-insert us on the wait queue.
1866 	 */
1867 	for (;;) {
1868 		prepare_to_wait_exclusive(sk->sk_sleep, &wait,
1869 					  TASK_INTERRUPTIBLE);
1870 		release_sock(sk);
1871 		if (!tp->accept_queue)
1872 			timeo = schedule_timeout(timeo);
1873 		lock_sock(sk);
1874 		err = 0;
1875 		if (tp->accept_queue)
1876 			break;
1877 		err = -EINVAL;
1878 		if (sk->sk_state != TCP_LISTEN)
1879 			break;
1880 		err = sock_intr_errno(timeo);
1881 		if (signal_pending(current))
1882 			break;
1883 		err = -EAGAIN;
1884 		if (!timeo)
1885 			break;
1886 	}
1887 	finish_wait(sk->sk_sleep, &wait);
1888 	return err;
1889 }
1890 
1891 /*
1892  *	This will accept the next outstanding connection.
1893  */
1894 
1895 struct sock *tcp_accept(struct sock *sk, int flags, int *err)
1896 {
1897 	struct tcp_sock *tp = tcp_sk(sk);
1898 	struct open_request *req;
1899 	struct sock *newsk;
1900 	int error;
1901 
1902 	lock_sock(sk);
1903 
1904 	/* We need to make sure that this socket is listening,
1905 	 * and that it has something pending.
1906 	 */
1907 	error = -EINVAL;
1908 	if (sk->sk_state != TCP_LISTEN)
1909 		goto out;
1910 
1911 	/* Find already established connection */
1912 	if (!tp->accept_queue) {
1913 		long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
1914 
1915 		/* If this is a non-blocking socket, don't sleep */
1916 		error = -EAGAIN;
1917 		if (!timeo)
1918 			goto out;
1919 
1920 		error = wait_for_connect(sk, timeo);
1921 		if (error)
1922 			goto out;
1923 	}
1924 
1925 	req = tp->accept_queue;
1926 	if ((tp->accept_queue = req->dl_next) == NULL)
1927 		tp->accept_queue_tail = NULL;
1928 
1929 	newsk = req->sk;
1930 	sk_acceptq_removed(sk);
1931 	tcp_openreq_fastfree(req);
1932 	BUG_TRAP(newsk->sk_state != TCP_SYN_RECV);
1933 	release_sock(sk);
1934 	return newsk;
1935 
1936 out:
1937 	release_sock(sk);
1938 	*err = error;
1939 	return NULL;
1940 }
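
/* From userspace, the error codes produced above surface through accept():
 * -EAGAIN when the socket is non-blocking and no connection has completed,
 * and an interrupted-by-signal error (via sock_intr_errno()) when a signal
 * arrives during the wait.  A minimal blocking-accept sketch (illustration
 * only):
 *
 *	int conn;
 *
 *	do {
 *		conn = accept(listen_fd, NULL, NULL);
 *	} while (conn < 0 && errno == EINTR);
 *
 * Non-blocking callers typically wait for readability with poll() or
 * select() before retrying on EAGAIN.
 */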
1941 
1942 /*
1943  *	Socket option code for TCP.
1944  */
1945 int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
1946 		   int optlen)
1947 {
1948 	struct tcp_sock *tp = tcp_sk(sk);
1949 	int val;
1950 	int err = 0;
1951 
1952 	if (level != SOL_TCP)
1953 		return tp->af_specific->setsockopt(sk, level, optname,
1954 						   optval, optlen);
1955 
1956 	if (optlen < sizeof(int))
1957 		return -EINVAL;
1958 
1959 	if (get_user(val, (int __user *)optval))
1960 		return -EFAULT;
1961 
1962 	lock_sock(sk);
1963 
1964 	switch (optname) {
1965 	case TCP_MAXSEG:
1966 		/* Values greater than the interface MTU won't take effect.
1967 		 * However, at the point when this call is made we typically
1968 		 * don't yet know which interface is going to be used. */
1969 		if (val < 8 || val > MAX_TCP_WINDOW) {
1970 			err = -EINVAL;
1971 			break;
1972 		}
1973 		tp->rx_opt.user_mss = val;
1974 		break;
1975 
1976 	case TCP_NODELAY:
1977 		if (val) {
1978 			/* TCP_NODELAY is weaker than TCP_CORK, so that
1979 			 * this option set on a corked socket is remembered,
1980 			 * but it is not activated until the cork is cleared.
1981 			 *
1982 			 * However, when TCP_NODELAY is set we make
1983 			 * an explicit push, which overrides even TCP_CORK
1984 			 * for currently queued segments.
1985 			 */
1986 			tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
1987 			tcp_push_pending_frames(sk, tp);
1988 		} else {
1989 			tp->nonagle &= ~TCP_NAGLE_OFF;
1990 		}
1991 		break;
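		/* A minimal userspace sketch of disabling Nagle with
		 * TCP_NODELAY (illustration only; small_request and
		 * small_request_len are hypothetical):
		 *
		 *	int one = 1;
		 *
		 *	setsockopt(fd, IPPROTO_TCP, TCP_NODELAY,
		 *		   &one, sizeof(one));
		 *	write(fd, small_request, small_request_len);
		 *
		 * The write() above goes out immediately instead of possibly
		 * waiting for previously sent data to be acknowledged.
		 */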
1992 
1993 	case TCP_CORK:
1994 		/* When set, this indicates that non-full frames should
1995 		 * always be queued. Later the user clears this option and
1996 		 * we transmit any pending partial frames in the queue. This
1997 		 * is meant to be used alongside sendfile() to get properly
1998 		 * filled frames when the user (for example) must write out
1999 		 * headers with a write() call first and then use
2000 		 * sendfile() to send out the data parts.
2001 		 *
2002 		 * TCP_CORK can be set together with TCP_NODELAY and it is
2003 		 * stronger than TCP_NODELAY.
2004 		 */
2005 		if (val) {
2006 			tp->nonagle |= TCP_NAGLE_CORK;
2007 		} else {
2008 			tp->nonagle &= ~TCP_NAGLE_CORK;
2009 			if (tp->nonagle&TCP_NAGLE_OFF)
2010 				tp->nonagle |= TCP_NAGLE_PUSH;
2011 			tcp_push_pending_frames(sk, tp);
2012 		}
2013 		break;
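		/* A minimal userspace sketch of the write()+sendfile()
		 * pattern described above (illustration only; hdr, hdr_len,
		 * file_fd and file_len are hypothetical):
		 *
		 *	int on = 1, off = 0;
		 *
		 *	setsockopt(fd, IPPROTO_TCP, TCP_CORK, &on, sizeof(on));
		 *	write(fd, hdr, hdr_len);
		 *	sendfile(fd, file_fd, NULL, file_len);
		 *	setsockopt(fd, IPPROTO_TCP, TCP_CORK, &off, sizeof(off));
		 *
		 * Clearing TCP_CORK at the end pushes out the final, possibly
		 * partial, frame.
		 */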
2014 
2015 	case TCP_KEEPIDLE:
2016 		if (val < 1 || val > MAX_TCP_KEEPIDLE)
2017 			err = -EINVAL;
2018 		else {
2019 			tp->keepalive_time = val * HZ;
2020 			if (sock_flag(sk, SOCK_KEEPOPEN) &&
2021 			    !((1 << sk->sk_state) &
2022 			      (TCPF_CLOSE | TCPF_LISTEN))) {
2023 				__u32 elapsed = tcp_time_stamp - tp->rcv_tstamp;
2024 				if (tp->keepalive_time > elapsed)
2025 					elapsed = tp->keepalive_time - elapsed;
2026 				else
2027 					elapsed = 0;
2028 				tcp_reset_keepalive_timer(sk, elapsed);
2029 			}
2030 		}
2031 		break;
2032 	case TCP_KEEPINTVL:
2033 		if (val < 1 || val > MAX_TCP_KEEPINTVL)
2034 			err = -EINVAL;
2035 		else
2036 			tp->keepalive_intvl = val * HZ;
2037 		break;
2038 	case TCP_KEEPCNT:
2039 		if (val < 1 || val > MAX_TCP_KEEPCNT)
2040 			err = -EINVAL;
2041 		else
2042 			tp->keepalive_probes = val;
2043 		break;
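		/* The three keepalive options above only change the timer
		 * behaviour once SO_KEEPALIVE has been enabled on the socket.
		 * A minimal userspace sketch (illustration only; TCP_KEEPIDLE
		 * and TCP_KEEPINTVL are in seconds, TCP_KEEPCNT is a probe
		 * count):
		 *
		 *	int on = 1, idle = 60, intvl = 10, cnt = 5;
		 *
		 *	setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &on, sizeof(on));
		 *	setsockopt(fd, IPPROTO_TCP, TCP_KEEPIDLE, &idle, sizeof(idle));
		 *	setsockopt(fd, IPPROTO_TCP, TCP_KEEPINTVL, &intvl, sizeof(intvl));
		 *	setsockopt(fd, IPPROTO_TCP, TCP_KEEPCNT, &cnt, sizeof(cnt));
		 */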
2044 	case TCP_SYNCNT:
2045 		if (val < 1 || val > MAX_TCP_SYNCNT)
2046 			err = -EINVAL;
2047 		else
2048 			tp->syn_retries = val;
2049 		break;
2050 
2051 	case TCP_LINGER2:
2052 		if (val < 0)
2053 			tp->linger2 = -1;
2054 		else if (val > sysctl_tcp_fin_timeout / HZ)
2055 			tp->linger2 = 0;
2056 		else
2057 			tp->linger2 = val * HZ;
2058 		break;
2059 
2060 	case TCP_DEFER_ACCEPT:
2061 		tp->defer_accept = 0;
2062 		if (val > 0) {
2063 			/* Translate value in seconds to number of
2064 			 * retransmits */
2065 			while (tp->defer_accept < 32 &&
2066 			       val > ((TCP_TIMEOUT_INIT / HZ) <<
2067 				       tp->defer_accept))
2068 				tp->defer_accept++;
2069 			tp->defer_accept++;
2070 		}
2071 		break;
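		/* Worked example of the translation above, assuming
		 * TCP_TIMEOUT_INIT is 3*HZ: for val = 10 seconds the loop
		 * stops once 10 <= (3 << 2) = 12, leaving tp->defer_accept
		 * at 2, and the final increment stores 3.  Reading the
		 * option back (see TCP_DEFER_ACCEPT in tcp_getsockopt()
		 * below) then reports (3 << (3 - 1)) = 12 seconds, i.e. the
		 * requested time is rounded up to a whole number of SYN-ACK
		 * retransmission intervals.
		 */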
2072 
2073 	case TCP_WINDOW_CLAMP:
2074 		if (!val) {
2075 			if (sk->sk_state != TCP_CLOSE) {
2076 				err = -EINVAL;
2077 				break;
2078 			}
2079 			tp->window_clamp = 0;
2080 		} else
2081 			tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
2082 						SOCK_MIN_RCVBUF / 2 : val;
2083 		break;
2084 
2085 	case TCP_QUICKACK:
2086 		if (!val) {
2087 			tp->ack.pingpong = 1;
2088 		} else {
2089 			tp->ack.pingpong = 0;
2090 			if ((1 << sk->sk_state) &
2091 			    (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
2092 			    tcp_ack_scheduled(tp)) {
2093 				tp->ack.pending |= TCP_ACK_PUSHED;
2094 				cleanup_rbuf(sk, 1);
2095 				if (!(val & 1))
2096 					tp->ack.pingpong = 1;
2097 			}
2098 		}
2099 		break;
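		/* A minimal userspace sketch of requesting quick ACKs for the
		 * current exchange (illustration only; the effect is not
		 * permanent, since delayed-ACK "pingpong" mode may be
		 * re-entered later):
		 *
		 *	int one = 1;
		 *
		 *	setsockopt(fd, IPPROTO_TCP, TCP_QUICKACK,
		 *		   &one, sizeof(one));
		 */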
2100 
2101 	default:
2102 		err = -ENOPROTOOPT;
2103 		break;
2104 	}
2105 	release_sock(sk);
2106 	return err;
2107 }
2108 
2109 /* Return information about state of tcp endpoint in API format. */
2110 void tcp_get_info(struct sock *sk, struct tcp_info *info)
2111 {
2112 	struct tcp_sock *tp = tcp_sk(sk);
2113 	u32 now = tcp_time_stamp;
2114 
2115 	memset(info, 0, sizeof(*info));
2116 
2117 	info->tcpi_state = sk->sk_state;
2118 	info->tcpi_ca_state = tp->ca_state;
2119 	info->tcpi_retransmits = tp->retransmits;
2120 	info->tcpi_probes = tp->probes_out;
2121 	info->tcpi_backoff = tp->backoff;
2122 
2123 	if (tp->rx_opt.tstamp_ok)
2124 		info->tcpi_options |= TCPI_OPT_TIMESTAMPS;
2125 	if (tp->rx_opt.sack_ok)
2126 		info->tcpi_options |= TCPI_OPT_SACK;
2127 	if (tp->rx_opt.wscale_ok) {
2128 		info->tcpi_options |= TCPI_OPT_WSCALE;
2129 		info->tcpi_snd_wscale = tp->rx_opt.snd_wscale;
2130 		info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale;
2131 	}
2132 
2133 	if (tp->ecn_flags&TCP_ECN_OK)
2134 		info->tcpi_options |= TCPI_OPT_ECN;
2135 
2136 	info->tcpi_rto = jiffies_to_usecs(tp->rto);
2137 	info->tcpi_ato = jiffies_to_usecs(tp->ack.ato);
2138 	info->tcpi_snd_mss = tp->mss_cache_std;
2139 	info->tcpi_rcv_mss = tp->ack.rcv_mss;
2140 
2141 	info->tcpi_unacked = tp->packets_out;
2142 	info->tcpi_sacked = tp->sacked_out;
2143 	info->tcpi_lost = tp->lost_out;
2144 	info->tcpi_retrans = tp->retrans_out;
2145 	info->tcpi_fackets = tp->fackets_out;
2146 
2147 	info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime);
2148 	info->tcpi_last_data_recv = jiffies_to_msecs(now - tp->ack.lrcvtime);
2149 	info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp);
2150 
2151 	info->tcpi_pmtu = tp->pmtu_cookie;
2152 	info->tcpi_rcv_ssthresh = tp->rcv_ssthresh;
2153 	info->tcpi_rtt = jiffies_to_usecs(tp->srtt)>>3;
2154 	info->tcpi_rttvar = jiffies_to_usecs(tp->mdev)>>2;
2155 	info->tcpi_snd_ssthresh = tp->snd_ssthresh;
2156 	info->tcpi_snd_cwnd = tp->snd_cwnd;
2157 	info->tcpi_advmss = tp->advmss;
2158 	info->tcpi_reordering = tp->reordering;
2159 
2160 	info->tcpi_rcv_rtt = jiffies_to_usecs(tp->rcv_rtt_est.rtt)>>3;
2161 	info->tcpi_rcv_space = tp->rcvq_space.space;
2162 
2163 	info->tcpi_total_retrans = tp->total_retrans;
2164 }
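
/* Userspace reads the structure filled in here through the TCP_INFO socket
 * option (handled in tcp_getsockopt() below).  A minimal sketch
 * (illustration only):
 *
 *	struct tcp_info info;
 *	socklen_t len = sizeof(info);
 *
 *	if (getsockopt(fd, IPPROTO_TCP, TCP_INFO, &info, &len) == 0)
 *		printf("rtt=%uus snd_cwnd=%u retrans=%u\n",
 *		       info.tcpi_rtt, info.tcpi_snd_cwnd,
 *		       info.tcpi_total_retrans);
 */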
2165 
2166 EXPORT_SYMBOL_GPL(tcp_get_info);
2167 
2168 int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
2169 		   int __user *optlen)
2170 {
2171 	struct tcp_sock *tp = tcp_sk(sk);
2172 	int val, len;
2173 
2174 	if (level != SOL_TCP)
2175 		return tp->af_specific->getsockopt(sk, level, optname,
2176 						   optval, optlen);
2177 
2178 	if (get_user(len, optlen))
2179 		return -EFAULT;
2180 
2181 	if (len < 0)
2182 		return -EINVAL;
2183 
2184 	len = min_t(unsigned int, len, sizeof(int));
2185 
2186 	switch (optname) {
2187 	case TCP_MAXSEG:
2188 		val = tp->mss_cache_std;
2189 		if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
2190 			val = tp->rx_opt.user_mss;
2191 		break;
2192 	case TCP_NODELAY:
2193 		val = !!(tp->nonagle&TCP_NAGLE_OFF);
2194 		break;
2195 	case TCP_CORK:
2196 		val = !!(tp->nonagle&TCP_NAGLE_CORK);
2197 		break;
2198 	case TCP_KEEPIDLE:
2199 		val = (tp->keepalive_time ? : sysctl_tcp_keepalive_time) / HZ;
2200 		break;
2201 	case TCP_KEEPINTVL:
2202 		val = (tp->keepalive_intvl ? : sysctl_tcp_keepalive_intvl) / HZ;
2203 		break;
2204 	case TCP_KEEPCNT:
2205 		val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes;
2206 		break;
2207 	case TCP_SYNCNT:
2208 		val = tp->syn_retries ? : sysctl_tcp_syn_retries;
2209 		break;
2210 	case TCP_LINGER2:
2211 		val = tp->linger2;
2212 		if (val >= 0)
2213 			val = (val ? : sysctl_tcp_fin_timeout) / HZ;
2214 		break;
2215 	case TCP_DEFER_ACCEPT:
2216 		val = !tp->defer_accept ? 0 : ((TCP_TIMEOUT_INIT / HZ) <<
2217 					       (tp->defer_accept - 1));
2218 		break;
2219 	case TCP_WINDOW_CLAMP:
2220 		val = tp->window_clamp;
2221 		break;
2222 	case TCP_INFO: {
2223 		struct tcp_info info;
2224 
2225 		if (get_user(len, optlen))
2226 			return -EFAULT;
2227 
2228 		tcp_get_info(sk, &info);
2229 
2230 		len = min_t(unsigned int, len, sizeof(info));
2231 		if (put_user(len, optlen))
2232 			return -EFAULT;
2233 		if (copy_to_user(optval, &info, len))
2234 			return -EFAULT;
2235 		return 0;
2236 	}
2237 	case TCP_QUICKACK:
2238 		val = !tp->ack.pingpong;
2239 		break;
2240 	default:
2241 		return -ENOPROTOOPT;
2242 	}
2243 
2244 	if (put_user(len, optlen))
2245 		return -EFAULT;
2246 	if (copy_to_user(optval, &val, len))
2247 		return -EFAULT;
2248 	return 0;
2249 }
2250 
2251 
2252 extern void __skb_cb_too_small_for_tcp(int, int);
2253 extern void tcpdiag_init(void);
2254 
2255 static __initdata unsigned long thash_entries;
2256 static int __init set_thash_entries(char *str)
2257 {
2258 	if (!str)
2259 		return 0;
2260 	thash_entries = simple_strtoul(str, &str, 0);
2261 	return 1;
2262 }
2263 __setup("thash_entries=", set_thash_entries);
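
/* "thash_entries" can therefore be supplied on the kernel command line to
 * force a particular established-hash size instead of the memory-based
 * default computed in tcp_init() below, e.g. (illustration only):
 *
 *	thash_entries=131072
 */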
2264 
2265 void __init tcp_init(void)
2266 {
2267 	struct sk_buff *skb = NULL;
2268 	int order, i;
2269 
2270 	if (sizeof(struct tcp_skb_cb) > sizeof(skb->cb))
2271 		__skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb),
2272 					   sizeof(skb->cb));
2273 
2274 	tcp_openreq_cachep = kmem_cache_create("tcp_open_request",
2275 						   sizeof(struct open_request),
2276 					       0, SLAB_HWCACHE_ALIGN,
2277 					       NULL, NULL);
2278 	if (!tcp_openreq_cachep)
2279 		panic("tcp_init: Cannot alloc open_request cache.");
2280 
2281 	tcp_bucket_cachep = kmem_cache_create("tcp_bind_bucket",
2282 					      sizeof(struct tcp_bind_bucket),
2283 					      0, SLAB_HWCACHE_ALIGN,
2284 					      NULL, NULL);
2285 	if (!tcp_bucket_cachep)
2286 		panic("tcp_init: Cannot alloc tcp_bind_bucket cache.");
2287 
2288 	tcp_timewait_cachep = kmem_cache_create("tcp_tw_bucket",
2289 						sizeof(struct tcp_tw_bucket),
2290 						0, SLAB_HWCACHE_ALIGN,
2291 						NULL, NULL);
2292 	if (!tcp_timewait_cachep)
2293 		panic("tcp_init: Cannot alloc tcp_tw_bucket cache.");
2294 
2295 	/* Size and allocate the main established and bind bucket
2296 	 * hash tables.
2297 	 *
2298 	 * The methodology is similar to that of the buffer cache.
2299 	 */
2300 	tcp_ehash = (struct tcp_ehash_bucket *)
2301 		alloc_large_system_hash("TCP established",
2302 					sizeof(struct tcp_ehash_bucket),
2303 					thash_entries,
2304 					(num_physpages >= 128 * 1024) ?
2305 						(25 - PAGE_SHIFT) :
2306 						(27 - PAGE_SHIFT),
2307 					HASH_HIGHMEM,
2308 					&tcp_ehash_size,
2309 					NULL,
2310 					0);
2311 	tcp_ehash_size = (1 << tcp_ehash_size) >> 1;
2312 	for (i = 0; i < (tcp_ehash_size << 1); i++) {
2313 		rwlock_init(&tcp_ehash[i].lock);
2314 		INIT_HLIST_HEAD(&tcp_ehash[i].chain);
2315 	}
2316 
2317 	tcp_bhash = (struct tcp_bind_hashbucket *)
2318 		alloc_large_system_hash("TCP bind",
2319 					sizeof(struct tcp_bind_hashbucket),
2320 					tcp_ehash_size,
2321 					(num_physpages >= 128 * 1024) ?
2322 						(25 - PAGE_SHIFT) :
2323 						(27 - PAGE_SHIFT),
2324 					HASH_HIGHMEM,
2325 					&tcp_bhash_size,
2326 					NULL,
2327 					64 * 1024);
2328 	tcp_bhash_size = 1 << tcp_bhash_size;
2329 	for (i = 0; i < tcp_bhash_size; i++) {
2330 		spin_lock_init(&tcp_bhash[i].lock);
2331 		INIT_HLIST_HEAD(&tcp_bhash[i].chain);
2332 	}
2333 
2334 	/* Try to be a bit smarter and adjust defaults depending
2335 	 * on available memory.
2336 	 */
2337 	for (order = 0; ((1 << order) << PAGE_SHIFT) <
2338 			(tcp_bhash_size * sizeof(struct tcp_bind_hashbucket));
2339 			order++)
2340 		;
2341 	if (order > 4) {
2342 		sysctl_local_port_range[0] = 32768;
2343 		sysctl_local_port_range[1] = 61000;
2344 		sysctl_tcp_max_tw_buckets = 180000;
2345 		sysctl_tcp_max_orphans = 4096 << (order - 4);
2346 		sysctl_max_syn_backlog = 1024;
2347 	} else if (order < 3) {
2348 		sysctl_local_port_range[0] = 1024 * (3 - order);
2349 		sysctl_tcp_max_tw_buckets >>= (3 - order);
2350 		sysctl_tcp_max_orphans >>= (3 - order);
2351 		sysctl_max_syn_backlog = 128;
2352 	}
2353 	tcp_port_rover = sysctl_local_port_range[0] - 1;
2354 
2355 	sysctl_tcp_mem[0] =  768 << order;
2356 	sysctl_tcp_mem[1] = 1024 << order;
2357 	sysctl_tcp_mem[2] = 1536 << order;
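	/* Worked example (illustration only): assuming 4 KB pages and
	 * order == 4, this gives tcp_mem = { 12288, 16384, 24576 } pages,
	 * i.e. TCP considers itself under memory pressure above roughly
	 * 64 MB of socket buffer memory and treats 96 MB as the hard limit.
	 */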
2358 
2359 	if (order < 3) {
2360 		sysctl_tcp_wmem[2] = 64 * 1024;
2361 		sysctl_tcp_rmem[0] = PAGE_SIZE;
2362 		sysctl_tcp_rmem[1] = 43689;
2363 		sysctl_tcp_rmem[2] = 2 * 43689;
2364 	}
2365 
2366 	printk(KERN_INFO "TCP: Hash tables configured "
2367 	       "(established %d bind %d)\n",
2368 	       tcp_ehash_size << 1, tcp_bhash_size);
2369 }
2370 
2371 EXPORT_SYMBOL(tcp_accept);
2372 EXPORT_SYMBOL(tcp_close);
2373 EXPORT_SYMBOL(tcp_destroy_sock);
2374 EXPORT_SYMBOL(tcp_disconnect);
2375 EXPORT_SYMBOL(tcp_getsockopt);
2376 EXPORT_SYMBOL(tcp_ioctl);
2377 EXPORT_SYMBOL(tcp_openreq_cachep);
2378 EXPORT_SYMBOL(tcp_poll);
2379 EXPORT_SYMBOL(tcp_read_sock);
2380 EXPORT_SYMBOL(tcp_recvmsg);
2381 EXPORT_SYMBOL(tcp_sendmsg);
2382 EXPORT_SYMBOL(tcp_sendpage);
2383 EXPORT_SYMBOL(tcp_setsockopt);
2384 EXPORT_SYMBOL(tcp_shutdown);
2385 EXPORT_SYMBOL(tcp_statistics);
2386 EXPORT_SYMBOL(tcp_timewait_cachep);
2387