xref: /openbmc/linux/net/ipv4/tcp.c (revision d5cb9783536a41df9f9cba5b0a1d78047ed787f7)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		Implementation of the Transmission Control Protocol(TCP).
7  *
8  * Version:	$Id: tcp.c,v 1.216 2002/02/01 22:01:04 davem Exp $
9  *
10  * Authors:	Ross Biro
11  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *		Mark Evans, <evansmp@uhura.aston.ac.uk>
13  *		Corey Minyard <wf-rch!minyard@relay.EU.net>
14  *		Florian La Roche, <flla@stud.uni-sb.de>
15  *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
16  *		Linus Torvalds, <torvalds@cs.helsinki.fi>
17  *		Alan Cox, <gw4pts@gw4pts.ampr.org>
18  *		Matthew Dillon, <dillon@apollo.west.oic.com>
19  *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
20  *		Jorge Cwik, <jorge@laser.satlink.net>
21  *
22  * Fixes:
23  *		Alan Cox	:	Numerous verify_area() calls
24  *		Alan Cox	:	Set the ACK bit on a reset
25  *		Alan Cox	:	Stopped it crashing if it closed while
26  *					sk->inuse=1 and was trying to connect
27  *					(tcp_err()).
28  *		Alan Cox	:	All icmp error handling was broken;
29  *					pointers passed were wrong and the
30  *					socket was looked up backwards. Nobody
31  *					tested any icmp error code obviously.
32  *		Alan Cox	:	tcp_err() now handled properly. It
33  *					wakes people on errors. poll
34  *					behaves and the icmp error race
35  *					has gone by moving it into sock.c
36  *		Alan Cox	:	tcp_send_reset() fixed to work for
37  *					everything not just packets for
38  *					unknown sockets.
39  *		Alan Cox	:	tcp option processing.
40  *		Alan Cox	:	Reset tweaked (still not 100%) [Had
41  *					syn rule wrong]
42  *		Herp Rosmanith  :	More reset fixes
43  *		Alan Cox	:	No longer acks invalid rst frames.
44  *					Acking any kind of RST is right out.
45  *		Alan Cox	:	Sets an ignore me flag on an rst
46  *					receive otherwise odd bits of prattle
47  *					escape still
48  *		Alan Cox	:	Fixed another acking RST frame bug.
49  *					Should stop LAN workplace lockups.
50  *		Alan Cox	: 	Some tidyups using the new skb list
51  *					facilities
52  *		Alan Cox	:	sk->keepopen now seems to work
53  *		Alan Cox	:	Pulls options out correctly on accepts
54  *		Alan Cox	:	Fixed assorted sk->rqueue->next errors
55  *		Alan Cox	:	PSH doesn't end a TCP read. Switched a
56  *					bit to skb ops.
57  *		Alan Cox	:	Tidied tcp_data to avoid a potential
58  *					nasty.
59  *		Alan Cox	:	Added some better commenting, as the
60  *					tcp is hard to follow
61  *		Alan Cox	:	Removed incorrect check for 20 * psh
62  *	Michael O'Reilly	:	ack < copied bug fix.
63  *	Johannes Stille		:	Misc tcp fixes (not all in yet).
64  *		Alan Cox	:	FIN with no memory -> CRASH
65  *		Alan Cox	:	Added socket option proto entries.
66  *					Also added awareness of them to accept.
67  *		Alan Cox	:	Added TCP options (SOL_TCP)
68  *		Alan Cox	:	Switched wakeup calls to callbacks,
69  *					so the kernel can layer network
70  *					sockets.
71  *		Alan Cox	:	Use ip_tos/ip_ttl settings.
72  *		Alan Cox	:	Handle FIN (more) properly (we hope).
73  *		Alan Cox	:	RST frames sent on unsynchronised
74  *					state ack error.
75  *		Alan Cox	:	Put in missing check for SYN bit.
76  *		Alan Cox	:	Added tcp_select_window() aka NET2E
77  *					window non shrink trick.
78  *		Alan Cox	:	Added a couple of small NET2E timer
79  *					fixes
80  *		Charles Hedrick :	TCP fixes
81  *		Toomas Tamm	:	TCP window fixes
82  *		Alan Cox	:	Small URG fix to rlogin ^C ack fight
83  *		Charles Hedrick	:	Rewrote most of it to actually work
84  *		Linus		:	Rewrote tcp_read() and URG handling
85  *					completely
86  *		Gerhard Koerting:	Fixed some missing timer handling
87  *		Matthew Dillon  :	Reworked TCP machine states as per RFC
88  *		Gerhard Koerting:	PC/TCP workarounds
89  *		Adam Caldwell	:	Assorted timer/timing errors
90  *		Matthew Dillon	:	Fixed another RST bug
91  *		Alan Cox	:	Move to kernel side addressing changes.
92  *		Alan Cox	:	Beginning work on TCP fastpathing
93  *					(not yet usable)
94  *		Arnt Gulbrandsen:	Turbocharged tcp_check() routine.
95  *		Alan Cox	:	TCP fast path debugging
96  *		Alan Cox	:	Window clamping
97  *		Michael Riepe	:	Bug in tcp_check()
98  *		Matt Dillon	:	More TCP improvements and RST bug fixes
99  *		Matt Dillon	:	Yet more small nasties removed from the
100  *					TCP code (Be very nice to this man if
101  *					tcp finally works 100%) 8)
102  *		Alan Cox	:	BSD accept semantics.
103  *		Alan Cox	:	Reset on closedown bug.
104  *	Peter De Schrijver	:	ENOTCONN check missing in tcp_sendto().
105  *		Michael Pall	:	Handle poll() after URG properly in
106  *					all cases.
107  *		Michael Pall	:	Undo the last fix in tcp_read_urg()
108  *					(multi URG PUSH broke rlogin).
109  *		Michael Pall	:	Fix the multi URG PUSH problem in
110  *					tcp_readable(), poll() after URG
111  *					works now.
112  *		Michael Pall	:	recv(...,MSG_OOB) never blocks in the
113  *					BSD api.
114  *		Alan Cox	:	Changed the semantics of sk->socket to
115  *					fix a race and a signal problem with
116  *					accept() and async I/O.
117  *		Alan Cox	:	Relaxed the rules on tcp_sendto().
118  *		Yury Shevchuk	:	Really fixed accept() blocking problem.
119  *		Craig I. Hagan  :	Allow for BSD compatible TIME_WAIT for
120  *					clients/servers which listen in on
121  *					fixed ports.
122  *		Alan Cox	:	Cleaned the above up and shrank it to
123  *					a sensible code size.
124  *		Alan Cox	:	Self connect lockup fix.
125  *		Alan Cox	:	No connect to multicast.
126  *		Ross Biro	:	Close unaccepted children on master
127  *					socket close.
128  *		Alan Cox	:	Reset tracing code.
129  *		Alan Cox	:	Spurious resets on shutdown.
130  *		Alan Cox	:	Giant 15 minute/60 second timer error
131  *		Alan Cox	:	Small whoops in polling before an
132  *					accept.
133  *		Alan Cox	:	Kept the state trace facility since
134  *					it's handy for debugging.
135  *		Alan Cox	:	More reset handler fixes.
136  *		Alan Cox	:	Started rewriting the code based on
137  *					the RFC's for other useful protocol
138  *					references see: Comer, KA9Q NOS, and
139  *					for a reference on the difference
140  *					between specifications and how BSD
141  *					works see the 4.4lite source.
142  *		A.N.Kuznetsov	:	Don't time wait on completion of tidy
143  *					close.
144  *		Linus Torvalds	:	Fin/Shutdown & copied_seq changes.
145  *		Linus Torvalds	:	Fixed BSD port reuse to work first syn
146  *		Alan Cox	:	Reimplemented timers as per the RFC
147  *					and using multiple timers for sanity.
148  *		Alan Cox	:	Small bug fixes, and a lot of new
149  *					comments.
150  *		Alan Cox	:	Fixed dual reader crash by locking
151  *					the buffers (much like datagram.c)
152  *		Alan Cox	:	Fixed stuck sockets in probe. A probe
153  *					now gets fed up of retrying without
154  *					(even a no space) answer.
155  *		Alan Cox	:	Extracted closing code better
156  *		Alan Cox	:	Fixed the closing state machine to
157  *					resemble the RFC.
158  *		Alan Cox	:	More 'per spec' fixes.
159  *		Jorge Cwik	:	Even faster checksumming.
160  *		Alan Cox	:	tcp_data() doesn't ack illegal PSH
161  *					only frames. At least one pc tcp stack
162  *					generates them.
163  *		Alan Cox	:	Cache last socket.
164  *		Alan Cox	:	Per route irtt.
165  *		Matt Day	:	poll()->select() match BSD precisely on error
166  *		Alan Cox	:	New buffers
167  *		Marc Tamsky	:	Various sk->prot->retransmits and
168  *					sk->retransmits misupdating fixed.
169  *					Fixed tcp_write_timeout: stuck close,
170  *					and TCP syn retries gets used now.
171  *		Mark Yarvis	:	In tcp_read_wakeup(), don't send an
172  *					ack if state is TCP_CLOSED.
173  *		Alan Cox	:	Look up device on a retransmit - routes may
174  *					change. Doesn't yet cope with MSS shrink right
175  *					but it's a start!
176  *		Marc Tamsky	:	Closing in closing fixes.
177  *		Mike Shaver	:	RFC1122 verifications.
178  *		Alan Cox	:	rcv_saddr errors.
179  *		Alan Cox	:	Block double connect().
180  *		Alan Cox	:	Small hooks for enSKIP.
181  *		Alexey Kuznetsov:	Path MTU discovery.
182  *		Alan Cox	:	Support soft errors.
183  *		Alan Cox	:	Fix MTU discovery pathological case
184  *					when the remote claims no mtu!
185  *		Marc Tamsky	:	TCP_CLOSE fix.
186  *		Colin (G3TNE)	:	Send a reset on syn ack replies in
187  *					window but wrong (fixes NT lpd problems)
188  *		Pedro Roque	:	Better TCP window handling, delayed ack.
189  *		Joerg Reuter	:	No modification of locked buffers in
190  *					tcp_do_retransmit()
191  *		Eric Schenk	:	Changed receiver side silly window
192  *					avoidance algorithm to BSD style
193  *					algorithm. This doubles throughput
194  *					against machines running Solaris,
195  *					and seems to result in general
196  *					improvement.
197  *	Stefan Magdalinski	:	adjusted tcp_readable() to fix FIONREAD
198  *	Willy Konynenberg	:	Transparent proxying support.
199  *	Mike McLagan		:	Routing by source
200  *		Keith Owens	:	Do proper merging with partial SKB's in
201  *					tcp_do_sendmsg to avoid burstiness.
202  *		Eric Schenk	:	Fix fast close down bug with
203  *					shutdown() followed by close().
204  *		Andi Kleen 	:	Make poll agree with SIGIO
205  *	Salvatore Sanfilippo	:	Support SO_LINGER with linger == 1 and
206  *					lingertime == 0 (RFC 793 ABORT Call)
207  *	Hirokazu Takahashi	:	Use copy_from_user() instead of
208  *					csum_and_copy_from_user() if possible.
209  *
210  *		This program is free software; you can redistribute it and/or
211  *		modify it under the terms of the GNU General Public License
212  *		as published by the Free Software Foundation; either version
213  *		2 of the License, or(at your option) any later version.
214  *
215  * Description of States:
216  *
217  *	TCP_SYN_SENT		sent a connection request, waiting for ack
218  *
219  *	TCP_SYN_RECV		received a connection request, sent ack,
220  *				waiting for final ack in three-way handshake.
221  *
222  *	TCP_ESTABLISHED		connection established
223  *
224  *	TCP_FIN_WAIT1		our side has shutdown, waiting to complete
225  *				transmission of remaining buffered data
226  *
227  *	TCP_FIN_WAIT2		all buffered data sent, waiting for remote
228  *				to shutdown
229  *
230  *	TCP_CLOSING		both sides have shutdown but we still have
231  *				data we have to finish sending
232  *
233  *	TCP_TIME_WAIT		timeout to catch resent junk before entering
234  *				closed, can only be entered from FIN_WAIT2
235  *				or CLOSING.  Required because the other end
236  *				may not have gotten our last ACK causing it
237  *				to retransmit the data packet (which we ignore)
238  *
239  *	TCP_CLOSE_WAIT		remote side has shutdown and is waiting for
240  *				us to finish writing our data and to shutdown
241  *				(we have to close() to move on to LAST_ACK)
242  *
243  *	TCP_LAST_ACK		our side has shutdown after remote has
244  *				shutdown.  There may still be data in our
245  *				buffer that we have to finish sending
246  *
247  *	TCP_CLOSE		socket is finished
248  */
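
/*
 * Editor's sketch (not part of the original file): a minimal user-space
 * client, assuming a cooperative peer, walking through the states described
 * above during an active open and an active close.  peer and buf are
 * assumed to be set up elsewhere.
 *
 *	int fd = socket(AF_INET, SOCK_STREAM, 0);
 *	connect(fd, (struct sockaddr *)&peer, sizeof(peer));
 *				// SYN_SENT, then ESTABLISHED on the SYN-ACK
 *	write(fd, "ping", 4);	// data flows in ESTABLISHED
 *	shutdown(fd, SHUT_WR);	// our FIN: FIN_WAIT1, then FIN_WAIT2 once ACKed
 *	while (read(fd, buf, sizeof(buf)) > 0)
 *		;		// drain until the peer's FIN (read() returns 0);
 *				// ACKing that FIN moves us to TIME_WAIT
 *	close(fd);		// descriptor gone; TIME_WAIT expires into CLOSE
 */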
249 
250 #include <linux/config.h>
251 #include <linux/module.h>
252 #include <linux/types.h>
253 #include <linux/fcntl.h>
254 #include <linux/poll.h>
255 #include <linux/init.h>
256 #include <linux/smp_lock.h>
257 #include <linux/fs.h>
258 #include <linux/random.h>
259 #include <linux/bootmem.h>
260 
261 #include <net/icmp.h>
262 #include <net/tcp.h>
263 #include <net/xfrm.h>
264 #include <net/ip.h>
265 
266 
267 #include <asm/uaccess.h>
268 #include <asm/ioctls.h>
269 
270 int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
271 
272 DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics) __read_mostly;
273 
274 atomic_t tcp_orphan_count = ATOMIC_INIT(0);
275 
276 EXPORT_SYMBOL_GPL(tcp_orphan_count);
277 
278 int sysctl_tcp_mem[3];
279 int sysctl_tcp_wmem[3] = { 4 * 1024, 16 * 1024, 128 * 1024 };
280 int sysctl_tcp_rmem[3] = { 4 * 1024, 87380, 87380 * 2 };
281 
282 EXPORT_SYMBOL(sysctl_tcp_mem);
283 EXPORT_SYMBOL(sysctl_tcp_rmem);
284 EXPORT_SYMBOL(sysctl_tcp_wmem);
285 
286 atomic_t tcp_memory_allocated;	/* Current allocated memory. */
287 atomic_t tcp_sockets_allocated;	/* Current number of TCP sockets. */
288 
289 EXPORT_SYMBOL(tcp_memory_allocated);
290 EXPORT_SYMBOL(tcp_sockets_allocated);
291 
292 /*
293  * Pressure flag: try to collapse.
294  * Technical note: it is used by multiple contexts non atomically.
295  * All of sk_stream_mem_schedule() is of this nature: accounting
296  * is strict, actions are advisory and have some latency.
297  */
298 int tcp_memory_pressure;
299 
300 EXPORT_SYMBOL(tcp_memory_pressure);
301 
302 void tcp_enter_memory_pressure(void)
303 {
304 	if (!tcp_memory_pressure) {
305 		NET_INC_STATS(LINUX_MIB_TCPMEMORYPRESSURES);
306 		tcp_memory_pressure = 1;
307 	}
308 }
309 
310 EXPORT_SYMBOL(tcp_enter_memory_pressure);
311 
312 /*
313  *	Wait for a TCP event.
314  *
315  *	Note that we don't need to lock the socket, as the upper poll layers
316  *	take care of normal races (between the test and the event) and we don't
317  *	go look at any of the socket buffers directly.
318  */
319 unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
320 {
321 	unsigned int mask;
322 	struct sock *sk = sock->sk;
323 	struct tcp_sock *tp = tcp_sk(sk);
324 
325 	poll_wait(file, sk->sk_sleep, wait);
326 	if (sk->sk_state == TCP_LISTEN)
327 		return inet_csk_listen_poll(sk);
328 
329 	/* Socket is not locked. We are protected from async events
330 	   by poll logic and correct handling of state changes
331 	   made by other threads is impossible in any case.
332 	 */
333 
334 	mask = 0;
335 	if (sk->sk_err)
336 		mask = POLLERR;
337 
338 	/*
339 	 * POLLHUP is certainly not done right. But poll() doesn't
340 	 * have a notion of HUP in just one direction, and for a
341 	 * socket the read side is more interesting.
342 	 *
343 	 * Some poll() documentation says that POLLHUP is incompatible
344 	 * with the POLLOUT/POLLWR flags, so somebody should check this
345 	 * all. But careful, it tends to be safer to return too many
346 	 * bits than too few, and you can easily break real applications
347 	 * if you don't tell them that something has hung up!
348 	 *
349 	 * Check-me.
350 	 *
351 	 * Check number 1. POLLHUP is an _UNMASKABLE_ event (see UNIX98 and
352 	 * our fs/select.c). It means that after we received EOF,
353 	 * poll always returns immediately, making it impossible to poll() for
354 	 * write() in state CLOSE_WAIT. One solution is evident --- to set POLLHUP
355 	 * if and only if shutdown has been made in both directions.
356 	 * Actually, it is interesting to look at how Solaris and DUX
357 	 * solve this dilemma. I would prefer, if POLLHUP were maskable,
358 	 * then we could set it on SND_SHUTDOWN. BTW the examples given
359 	 * in Stevens' books assume exactly this behaviour, which explains
360 	 * why POLLHUP is incompatible with POLLOUT.	--ANK
361 	 *
362 	 * NOTE. Check for TCP_CLOSE is added. The goal is to prevent
363 	 * blocking on fresh not-connected or disconnected socket. --ANK
364 	 */
365 	if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE)
366 		mask |= POLLHUP;
367 	if (sk->sk_shutdown & RCV_SHUTDOWN)
368 		mask |= POLLIN | POLLRDNORM;
369 
370 	/* Connected? */
371 	if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
372 		/* Potential race condition. If the read of tp below is
373 		 * reordered above the sk->sk_state check, we can be
374 		 * illegally awakened in SYN_* states. */
375 		if ((tp->rcv_nxt != tp->copied_seq) &&
376 		    (tp->urg_seq != tp->copied_seq ||
377 		     tp->rcv_nxt != tp->copied_seq + 1 ||
378 		     sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data))
379 			mask |= POLLIN | POLLRDNORM;
380 
381 		if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
382 			if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
383 				mask |= POLLOUT | POLLWRNORM;
384 			} else {  /* send SIGIO later */
385 				set_bit(SOCK_ASYNC_NOSPACE,
386 					&sk->sk_socket->flags);
387 				set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
388 
389 				/* Race breaker. If space is freed after
390 				 * wspace test but before the flags are set,
391 				 * IO signal will be lost.
392 				 */
393 				if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
394 					mask |= POLLOUT | POLLWRNORM;
395 			}
396 		}
397 
398 		if (tp->urg_data & TCP_URG_VALID)
399 			mask |= POLLPRI;
400 	}
401 	return mask;
402 }
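
/*
 * Editor's sketch (not part of the original file): how an application
 * typically consumes the mask built above.  handle_oob(), drain(),
 * flush_pending() and teardown() are hypothetical helpers.
 *
 *	struct pollfd pfd = { .fd = fd, .events = POLLIN | POLLOUT | POLLPRI };
 *
 *	if (poll(&pfd, 1, -1) > 0) {
 *		if (pfd.revents & POLLPRI)
 *			handle_oob(fd);		// urgent data pending (TCP_URG_VALID)
 *		if (pfd.revents & POLLIN)
 *			drain(fd);		// readable data, or EOF after RCV_SHUTDOWN
 *		if (pfd.revents & POLLOUT)
 *			flush_pending(fd);	// enough send space for another write()
 *		if (pfd.revents & POLLHUP)
 *			teardown(fd);		// both directions shut down, or TCP_CLOSE
 *	}
 */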
403 
404 int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
405 {
406 	struct tcp_sock *tp = tcp_sk(sk);
407 	int answ;
408 
409 	switch (cmd) {
410 	case SIOCINQ:
411 		if (sk->sk_state == TCP_LISTEN)
412 			return -EINVAL;
413 
414 		lock_sock(sk);
415 		if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
416 			answ = 0;
417 		else if (sock_flag(sk, SOCK_URGINLINE) ||
418 			 !tp->urg_data ||
419 			 before(tp->urg_seq, tp->copied_seq) ||
420 			 !before(tp->urg_seq, tp->rcv_nxt)) {
421 			answ = tp->rcv_nxt - tp->copied_seq;
422 
423 			/* Subtract 1, if FIN is in queue. */
424 			if (answ && !skb_queue_empty(&sk->sk_receive_queue))
425 				answ -=
426 		       ((struct sk_buff *)sk->sk_receive_queue.prev)->h.th->fin;
427 		} else
428 			answ = tp->urg_seq - tp->copied_seq;
429 		release_sock(sk);
430 		break;
431 	case SIOCATMARK:
432 		answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
433 		break;
434 	case SIOCOUTQ:
435 		if (sk->sk_state == TCP_LISTEN)
436 			return -EINVAL;
437 
438 		if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
439 			answ = 0;
440 		else
441 			answ = tp->write_seq - tp->snd_una;
442 		break;
443 	default:
444 		return -ENOIOCTLCMD;
445 	};
446 
447 	return put_user(answ, (int __user *)arg);
448 }
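
/*
 * Editor's sketch (not part of the original file): the three ioctls handled
 * above as seen from user space.  SIOCINQ is the same value as FIONREAD.
 *
 *	#include <sys/ioctl.h>
 *	#include <linux/sockios.h>	// SIOCINQ, SIOCOUTQ
 *
 *	int inq, outq, atmark;
 *
 *	ioctl(fd, SIOCINQ, &inq);	// bytes readable now (rcv_nxt - copied_seq)
 *	ioctl(fd, SIOCOUTQ, &outq);	// bytes not yet acked (write_seq - snd_una)
 *	ioctl(fd, SIOCATMARK, &atmark);	// non-zero when the next byte is the urgent mark
 */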
449 
450 static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
451 {
452 	TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
453 	tp->pushed_seq = tp->write_seq;
454 }
455 
456 static inline int forced_push(struct tcp_sock *tp)
457 {
458 	return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
459 }
460 
461 static inline void skb_entail(struct sock *sk, struct tcp_sock *tp,
462 			      struct sk_buff *skb)
463 {
464 	skb->csum = 0;
465 	TCP_SKB_CB(skb)->seq = tp->write_seq;
466 	TCP_SKB_CB(skb)->end_seq = tp->write_seq;
467 	TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
468 	TCP_SKB_CB(skb)->sacked = 0;
469 	skb_header_release(skb);
470 	__skb_queue_tail(&sk->sk_write_queue, skb);
471 	sk_charge_skb(sk, skb);
472 	if (!sk->sk_send_head)
473 		sk->sk_send_head = skb;
474 	if (tp->nonagle & TCP_NAGLE_PUSH)
475 		tp->nonagle &= ~TCP_NAGLE_PUSH;
476 }
477 
478 static inline void tcp_mark_urg(struct tcp_sock *tp, int flags,
479 				struct sk_buff *skb)
480 {
481 	if (flags & MSG_OOB) {
482 		tp->urg_mode = 1;
483 		tp->snd_up = tp->write_seq;
484 		TCP_SKB_CB(skb)->sacked |= TCPCB_URG;
485 	}
486 }
487 
488 static inline void tcp_push(struct sock *sk, struct tcp_sock *tp, int flags,
489 			    int mss_now, int nonagle)
490 {
491 	if (sk->sk_send_head) {
492 		struct sk_buff *skb = sk->sk_write_queue.prev;
493 		if (!(flags & MSG_MORE) || forced_push(tp))
494 			tcp_mark_push(tp, skb);
495 		tcp_mark_urg(tp, flags, skb);
496 		__tcp_push_pending_frames(sk, tp, mss_now,
497 					  (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle);
498 	}
499 }
500 
501 static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset,
502 			 size_t psize, int flags)
503 {
504 	struct tcp_sock *tp = tcp_sk(sk);
505 	int mss_now, size_goal;
506 	int err;
507 	ssize_t copied;
508 	long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
509 
510 	/* Wait for a connection to finish. */
511 	if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
512 		if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
513 			goto out_err;
514 
515 	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
516 
517 	mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
518 	size_goal = tp->xmit_size_goal;
519 	copied = 0;
520 
521 	err = -EPIPE;
522 	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
523 		goto do_error;
524 
525 	while (psize > 0) {
526 		struct sk_buff *skb = sk->sk_write_queue.prev;
527 		struct page *page = pages[poffset / PAGE_SIZE];
528 		int copy, i, can_coalesce;
529 		int offset = poffset % PAGE_SIZE;
530 		int size = min_t(size_t, psize, PAGE_SIZE - offset);
531 
532 		if (!sk->sk_send_head || (copy = size_goal - skb->len) <= 0) {
533 new_segment:
534 			if (!sk_stream_memory_free(sk))
535 				goto wait_for_sndbuf;
536 
537 			skb = sk_stream_alloc_pskb(sk, 0, 0,
538 						   sk->sk_allocation);
539 			if (!skb)
540 				goto wait_for_memory;
541 
542 			skb_entail(sk, tp, skb);
543 			copy = size_goal;
544 		}
545 
546 		if (copy > size)
547 			copy = size;
548 
549 		i = skb_shinfo(skb)->nr_frags;
550 		can_coalesce = skb_can_coalesce(skb, i, page, offset);
551 		if (!can_coalesce && i >= MAX_SKB_FRAGS) {
552 			tcp_mark_push(tp, skb);
553 			goto new_segment;
554 		}
555 		if (!sk_stream_wmem_schedule(sk, copy))
556 			goto wait_for_memory;
557 
558 		if (can_coalesce) {
559 			skb_shinfo(skb)->frags[i - 1].size += copy;
560 		} else {
561 			get_page(page);
562 			skb_fill_page_desc(skb, i, page, offset, copy);
563 		}
564 
565 		skb->len += copy;
566 		skb->data_len += copy;
567 		skb->truesize += copy;
568 		sk->sk_wmem_queued += copy;
569 		sk->sk_forward_alloc -= copy;
570 		skb->ip_summed = CHECKSUM_HW;
571 		tp->write_seq += copy;
572 		TCP_SKB_CB(skb)->end_seq += copy;
573 		skb_shinfo(skb)->tso_segs = 0;
574 
575 		if (!copied)
576 			TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
577 
578 		copied += copy;
579 		poffset += copy;
580 		if (!(psize -= copy))
581 			goto out;
582 
583 		if (skb->len < mss_now || (flags & MSG_OOB))
584 			continue;
585 
586 		if (forced_push(tp)) {
587 			tcp_mark_push(tp, skb);
588 			__tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
589 		} else if (skb == sk->sk_send_head)
590 			tcp_push_one(sk, mss_now);
591 		continue;
592 
593 wait_for_sndbuf:
594 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
595 wait_for_memory:
596 		if (copied)
597 			tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
598 
599 		if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
600 			goto do_error;
601 
602 		mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
603 		size_goal = tp->xmit_size_goal;
604 	}
605 
606 out:
607 	if (copied)
608 		tcp_push(sk, tp, flags, mss_now, tp->nonagle);
609 	return copied;
610 
611 do_error:
612 	if (copied)
613 		goto out;
614 out_err:
615 	return sk_stream_error(sk, flags, err);
616 }
617 
618 ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
619 		     size_t size, int flags)
620 {
621 	ssize_t res;
622 	struct sock *sk = sock->sk;
623 
624 #define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)
625 
626 	if (!(sk->sk_route_caps & NETIF_F_SG) ||
627 	    !(sk->sk_route_caps & TCP_ZC_CSUM_FLAGS))
628 		return sock_no_sendpage(sock, page, offset, size, flags);
629 
630 #undef TCP_ZC_CSUM_FLAGS
631 
632 	lock_sock(sk);
633 	TCP_CHECK_TIMER(sk);
634 	res = do_tcp_sendpages(sk, &page, offset, size, flags);
635 	TCP_CHECK_TIMER(sk);
636 	release_sock(sk);
637 	return res;
638 }
639 
640 #define TCP_PAGE(sk)	(sk->sk_sndmsg_page)
641 #define TCP_OFF(sk)	(sk->sk_sndmsg_off)
642 
643 static inline int select_size(struct sock *sk, struct tcp_sock *tp)
644 {
645 	int tmp = tp->mss_cache;
646 
647 	if (sk->sk_route_caps & NETIF_F_SG) {
648 		if (sk->sk_route_caps & NETIF_F_TSO)
649 			tmp = 0;
650 		else {
651 			int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
652 
653 			if (tmp >= pgbreak &&
654 			    tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
655 				tmp = pgbreak;
656 		}
657 	}
658 
659 	return tmp;
660 }
661 
662 int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
663 		size_t size)
664 {
665 	struct iovec *iov;
666 	struct tcp_sock *tp = tcp_sk(sk);
667 	struct sk_buff *skb;
668 	int iovlen, flags;
669 	int mss_now, size_goal;
670 	int err, copied;
671 	long timeo;
672 
673 	lock_sock(sk);
674 	TCP_CHECK_TIMER(sk);
675 
676 	flags = msg->msg_flags;
677 	timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
678 
679 	/* Wait for a connection to finish. */
680 	if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
681 		if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
682 			goto out_err;
683 
684 	/* This should be in poll */
685 	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
686 
687 	mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
688 	size_goal = tp->xmit_size_goal;
689 
690 	/* Ok commence sending. */
691 	iovlen = msg->msg_iovlen;
692 	iov = msg->msg_iov;
693 	copied = 0;
694 
695 	err = -EPIPE;
696 	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
697 		goto do_error;
698 
699 	while (--iovlen >= 0) {
700 		int seglen = iov->iov_len;
701 		unsigned char __user *from = iov->iov_base;
702 
703 		iov++;
704 
705 		while (seglen > 0) {
706 			int copy;
707 
708 			skb = sk->sk_write_queue.prev;
709 
710 			if (!sk->sk_send_head ||
711 			    (copy = size_goal - skb->len) <= 0) {
712 
713 new_segment:
714 				/* Allocate new segment. If the interface is SG,
715 				 * allocate skb fitting to single page.
716 				 */
717 				if (!sk_stream_memory_free(sk))
718 					goto wait_for_sndbuf;
719 
720 				skb = sk_stream_alloc_pskb(sk, select_size(sk, tp),
721 							   0, sk->sk_allocation);
722 				if (!skb)
723 					goto wait_for_memory;
724 
725 				/*
726 				 * Check whether we can use HW checksum.
727 				 */
728 				if (sk->sk_route_caps &
729 				    (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM |
730 				     NETIF_F_HW_CSUM))
731 					skb->ip_summed = CHECKSUM_HW;
732 
733 				skb_entail(sk, tp, skb);
734 				copy = size_goal;
735 			}
736 
737 			/* Try to append data to the end of skb. */
738 			if (copy > seglen)
739 				copy = seglen;
740 
741 			/* Where to copy to? */
742 			if (skb_tailroom(skb) > 0) {
743 				/* We have some space in skb head. Superb! */
744 				if (copy > skb_tailroom(skb))
745 					copy = skb_tailroom(skb);
746 				if ((err = skb_add_data(skb, from, copy)) != 0)
747 					goto do_fault;
748 			} else {
749 				int merge = 0;
750 				int i = skb_shinfo(skb)->nr_frags;
751 				struct page *page = TCP_PAGE(sk);
752 				int off = TCP_OFF(sk);
753 
754 				if (skb_can_coalesce(skb, i, page, off) &&
755 				    off != PAGE_SIZE) {
756 					/* We can extend the last page
757 					 * fragment. */
758 					merge = 1;
759 				} else if (i == MAX_SKB_FRAGS ||
760 					   (!i &&
761 					   !(sk->sk_route_caps & NETIF_F_SG))) {
762 					/* Need to add new fragment and cannot
763 					 * do this because interface is non-SG,
764 					 * or because all the page slots are
765 					 * busy. */
766 					tcp_mark_push(tp, skb);
767 					goto new_segment;
768 				} else if (page) {
769 					if (off == PAGE_SIZE) {
770 						put_page(page);
771 						TCP_PAGE(sk) = page = NULL;
772 						off = 0;
773 					}
774 				} else
775 					off = 0;
776 
777 				if (copy > PAGE_SIZE - off)
778 					copy = PAGE_SIZE - off;
779 
780 				if (!sk_stream_wmem_schedule(sk, copy))
781 					goto wait_for_memory;
782 
783 				if (!page) {
784 					/* Allocate new cache page. */
785 					if (!(page = sk_stream_alloc_page(sk)))
786 						goto wait_for_memory;
787 				}
788 
789 				/* Time to copy data. We are close to
790 				 * the end! */
791 				err = skb_copy_to_page(sk, from, skb, page,
792 						       off, copy);
793 				if (err) {
794 					/* If this page was new, give it to the
795 					 * socket so it does not get leaked.
796 					 */
797 					if (!TCP_PAGE(sk)) {
798 						TCP_PAGE(sk) = page;
799 						TCP_OFF(sk) = 0;
800 					}
801 					goto do_error;
802 				}
803 
804 				/* Update the skb. */
805 				if (merge) {
806 					skb_shinfo(skb)->frags[i - 1].size +=
807 									copy;
808 				} else {
809 					skb_fill_page_desc(skb, i, page, off, copy);
810 					if (TCP_PAGE(sk)) {
811 						get_page(page);
812 					} else if (off + copy < PAGE_SIZE) {
813 						get_page(page);
814 						TCP_PAGE(sk) = page;
815 					}
816 				}
817 
818 				TCP_OFF(sk) = off + copy;
819 			}
820 
821 			if (!copied)
822 				TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
823 
824 			tp->write_seq += copy;
825 			TCP_SKB_CB(skb)->end_seq += copy;
826 			skb_shinfo(skb)->tso_segs = 0;
827 
828 			from += copy;
829 			copied += copy;
830 			if ((seglen -= copy) == 0 && iovlen == 0)
831 				goto out;
832 
833 			if (skb->len < mss_now || (flags & MSG_OOB))
834 				continue;
835 
836 			if (forced_push(tp)) {
837 				tcp_mark_push(tp, skb);
838 				__tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
839 			} else if (skb == sk->sk_send_head)
840 				tcp_push_one(sk, mss_now);
841 			continue;
842 
843 wait_for_sndbuf:
844 			set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
845 wait_for_memory:
846 			if (copied)
847 				tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
848 
849 			if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
850 				goto do_error;
851 
852 			mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
853 			size_goal = tp->xmit_size_goal;
854 		}
855 	}
856 
857 out:
858 	if (copied)
859 		tcp_push(sk, tp, flags, mss_now, tp->nonagle);
860 	TCP_CHECK_TIMER(sk);
861 	release_sock(sk);
862 	return copied;
863 
864 do_fault:
865 	if (!skb->len) {
866 		if (sk->sk_send_head == skb)
867 			sk->sk_send_head = NULL;
868 		__skb_unlink(skb, &sk->sk_write_queue);
869 		sk_stream_free_skb(sk, skb);
870 	}
871 
872 do_error:
873 	if (copied)
874 		goto out;
875 out_err:
876 	err = sk_stream_error(sk, flags, err);
877 	TCP_CHECK_TIMER(sk);
878 	release_sock(sk);
879 	return err;
880 }
881 
882 /*
883  *	Handle reading urgent data. BSD has very simple semantics for
884  *	this, no blocking and very strange errors 8)
885  */
886 
887 static int tcp_recv_urg(struct sock *sk, long timeo,
888 			struct msghdr *msg, int len, int flags,
889 			int *addr_len)
890 {
891 	struct tcp_sock *tp = tcp_sk(sk);
892 
893 	/* No URG data to read. */
894 	if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data ||
895 	    tp->urg_data == TCP_URG_READ)
896 		return -EINVAL;	/* Yes this is right ! */
897 
898 	if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE))
899 		return -ENOTCONN;
900 
901 	if (tp->urg_data & TCP_URG_VALID) {
902 		int err = 0;
903 		char c = tp->urg_data;
904 
905 		if (!(flags & MSG_PEEK))
906 			tp->urg_data = TCP_URG_READ;
907 
908 		/* Read urgent data. */
909 		msg->msg_flags |= MSG_OOB;
910 
911 		if (len > 0) {
912 			if (!(flags & MSG_TRUNC))
913 				err = memcpy_toiovec(msg->msg_iov, &c, 1);
914 			len = 1;
915 		} else
916 			msg->msg_flags |= MSG_TRUNC;
917 
918 		return err ? -EFAULT : len;
919 	}
920 
921 	if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN))
922 		return 0;
923 
924 	/* Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
925 	 * the available implementations agree in this case:
926 	 * this call should never block, independent of the
927 	 * blocking state of the socket.
928 	 * Mike <pall@rz.uni-karlsruhe.de>
929 	 */
930 	return -EAGAIN;
931 }
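
/*
 * Editor's sketch (not part of the original file): the BSD-style urgent data
 * API implemented above, from user space.  consume_urgent() is a hypothetical
 * helper.
 *
 *	char c;
 *	ssize_t n = recv(fd, &c, 1, MSG_OOB);	// never blocks, see the comment above
 *
 *	if (n == 1)
 *		consume_urgent(c);	// got the out-of-band byte
 *	else if (n < 0 && errno == EINVAL)
 *		;	// no urgent data, already read, or SO_OOBINLINE is set
 *	else if (n < 0 && errno == EAGAIN)
 *		;	// urgent pointer seen, but the byte has not arrived yet
 */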
932 
933 /* Clean up the receive buffer for full frames taken by the user,
934  * then send an ACK if necessary.  COPIED is the number of bytes
935  * tcp_recvmsg has given to the user so far, it speeds up the
936  * calculation of whether or not we must ACK for the sake of
937  * a window update.
938  */
939 static void cleanup_rbuf(struct sock *sk, int copied)
940 {
941 	struct tcp_sock *tp = tcp_sk(sk);
942 	int time_to_ack = 0;
943 
944 #if TCP_DEBUG
945 	struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
946 
947 	BUG_TRAP(!skb || before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq));
948 #endif
949 
950 	if (inet_csk_ack_scheduled(sk)) {
951 		const struct inet_connection_sock *icsk = inet_csk(sk);
952 		   /* Delayed ACKs frequently hit locked sockets during bulk
953 		    * receive. */
954 		if (icsk->icsk_ack.blocked ||
955 		    /* Once-per-two-segments ACK was not sent by tcp_input.c */
956 		    tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss ||
957 		    /*
958 		     * If this read emptied the read buffer, we send an ACK if
959 		     * the connection is not bidirectional, the user drained the
960 		     * receive buffer and there was a small segment
961 		     * in the queue.
962 		     */
963 		    (copied > 0 && (icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
964 		     !icsk->icsk_ack.pingpong && !atomic_read(&sk->sk_rmem_alloc)))
965 			time_to_ack = 1;
966 	}
967 
968 	/* We send an ACK if we can now advertise a non-zero window
969 	 * which has been raised "significantly".
970 	 *
971  * Even if the window is raised up to infinity, do not send a window open ACK
972  * in states where we will not receive more. It is useless.
973 	 */
974 	if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
975 		__u32 rcv_window_now = tcp_receive_window(tp);
976 
977 		/* Optimize, __tcp_select_window() is not cheap. */
978 		if (2*rcv_window_now <= tp->window_clamp) {
979 			__u32 new_window = __tcp_select_window(sk);
980 
981 			/* Send ACK now, if this read freed lots of space
982 			 * in our buffer. Certainly, new_window is the new window.
983 			 * We can advertise it now, if it is not less than the current one.
984 			 * "Lots" means "at least twice" here.
985 			 */
986 			if (new_window && new_window >= 2 * rcv_window_now)
987 				time_to_ack = 1;
988 		}
989 	}
990 	if (time_to_ack)
991 		tcp_send_ack(sk);
992 }
993 
994 static void tcp_prequeue_process(struct sock *sk)
995 {
996 	struct sk_buff *skb;
997 	struct tcp_sock *tp = tcp_sk(sk);
998 
999 	NET_INC_STATS_USER(LINUX_MIB_TCPPREQUEUED);
1000 
1001 	/* RX process wants to run with disabled BHs, though it is not
1002 	 * necessary */
1003 	local_bh_disable();
1004 	while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
1005 		sk->sk_backlog_rcv(sk, skb);
1006 	local_bh_enable();
1007 
1008 	/* Clear memory counter. */
1009 	tp->ucopy.memory = 0;
1010 }
1011 
1012 static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
1013 {
1014 	struct sk_buff *skb;
1015 	u32 offset;
1016 
1017 	skb_queue_walk(&sk->sk_receive_queue, skb) {
1018 		offset = seq - TCP_SKB_CB(skb)->seq;
1019 		if (skb->h.th->syn)
1020 			offset--;
1021 		if (offset < skb->len || skb->h.th->fin) {
1022 			*off = offset;
1023 			return skb;
1024 		}
1025 	}
1026 	return NULL;
1027 }
1028 
1029 /*
1030  * This routine provides an alternative to tcp_recvmsg() for routines
1031  * that would like to handle copying from skbuffs directly in 'sendfile'
1032  * fashion.
1033  * Note:
1034  *	- It is assumed that the socket was locked by the caller.
1035  *	- The routine does not block.
1036  *	- At present, there is no support for reading OOB data
1037  *	  or for 'peeking' the socket using this routine
1038  *	  (although both would be easy to implement).
1039  */
1040 int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
1041 		  sk_read_actor_t recv_actor)
1042 {
1043 	struct sk_buff *skb;
1044 	struct tcp_sock *tp = tcp_sk(sk);
1045 	u32 seq = tp->copied_seq;
1046 	u32 offset;
1047 	int copied = 0;
1048 
1049 	if (sk->sk_state == TCP_LISTEN)
1050 		return -ENOTCONN;
1051 	while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
1052 		if (offset < skb->len) {
1053 			size_t used, len;
1054 
1055 			len = skb->len - offset;
1056 			/* Stop reading if we hit a patch of urgent data */
1057 			if (tp->urg_data) {
1058 				u32 urg_offset = tp->urg_seq - seq;
1059 				if (urg_offset < len)
1060 					len = urg_offset;
1061 				if (!len)
1062 					break;
1063 			}
1064 			used = recv_actor(desc, skb, offset, len);
1065 			if (used <= len) {
1066 				seq += used;
1067 				copied += used;
1068 				offset += used;
1069 			}
1070 			if (offset != skb->len)
1071 				break;
1072 		}
1073 		if (skb->h.th->fin) {
1074 			sk_eat_skb(sk, skb);
1075 			++seq;
1076 			break;
1077 		}
1078 		sk_eat_skb(sk, skb);
1079 		if (!desc->count)
1080 			break;
1081 	}
1082 	tp->copied_seq = seq;
1083 
1084 	tcp_rcv_space_adjust(sk);
1085 
1086 	/* Clean up data we have read: This will do ACK frames. */
1087 	if (copied)
1088 		cleanup_rbuf(sk, copied);
1089 	return copied;
1090 }
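
/*
 * Editor's sketch (not part of the original file): a minimal recv_actor for
 * tcp_read_sock(), copying into a caller-supplied buffer.  struct demo_buf,
 * demo_actor(), kbuf and kbuf_len are hypothetical; the socket must be locked
 * by the caller, as noted above.
 *
 *	struct demo_buf { char *ptr; size_t space; };
 *
 *	static int demo_actor(read_descriptor_t *desc, struct sk_buff *skb,
 *			      unsigned int offset, size_t len)
 *	{
 *		struct demo_buf *b = desc->arg.data;
 *		size_t want = min(len, b->space);
 *
 *		if (skb_copy_bits(skb, offset, b->ptr, want))
 *			return 0;		// consume nothing, stop the walk
 *		b->ptr += want;
 *		b->space -= want;
 *		if (!b->space)
 *			desc->count = 0;	// ask tcp_read_sock() to stop
 *		return want;			// bytes consumed from this skb
 *	}
 *
 *	struct demo_buf buf = { .ptr = kbuf, .space = kbuf_len };
 *	read_descriptor_t desc = { .arg.data = &buf, .count = 1 };
 *	int copied = tcp_read_sock(sk, &desc, demo_actor);
 */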
1091 
1092 /*
1093  *	This routine copies from a sock struct into the user buffer.
1094  *
1095  *	Technical note: in 2.3 we work on _locked_ socket, so that
1096  *	tricks with *seq access order and skb->users are not required.
1097  *	Probably, code can be easily improved even more.
1098  */
1099 
1100 int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1101 		size_t len, int nonblock, int flags, int *addr_len)
1102 {
1103 	struct tcp_sock *tp = tcp_sk(sk);
1104 	int copied = 0;
1105 	u32 peek_seq;
1106 	u32 *seq;
1107 	unsigned long used;
1108 	int err;
1109 	int target;		/* Read at least this many bytes */
1110 	long timeo;
1111 	struct task_struct *user_recv = NULL;
1112 
1113 	lock_sock(sk);
1114 
1115 	TCP_CHECK_TIMER(sk);
1116 
1117 	err = -ENOTCONN;
1118 	if (sk->sk_state == TCP_LISTEN)
1119 		goto out;
1120 
1121 	timeo = sock_rcvtimeo(sk, nonblock);
1122 
1123 	/* Urgent data needs to be handled specially. */
1124 	if (flags & MSG_OOB)
1125 		goto recv_urg;
1126 
1127 	seq = &tp->copied_seq;
1128 	if (flags & MSG_PEEK) {
1129 		peek_seq = tp->copied_seq;
1130 		seq = &peek_seq;
1131 	}
1132 
1133 	target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
1134 
1135 	do {
1136 		struct sk_buff *skb;
1137 		u32 offset;
1138 
1139 		/* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
1140 		if (tp->urg_data && tp->urg_seq == *seq) {
1141 			if (copied)
1142 				break;
1143 			if (signal_pending(current)) {
1144 				copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
1145 				break;
1146 			}
1147 		}
1148 
1149 		/* Next get a buffer. */
1150 
1151 		skb = skb_peek(&sk->sk_receive_queue);
1152 		do {
1153 			if (!skb)
1154 				break;
1155 
1156 			/* Now that we have two receive queues this
1157 			 * shouldn't happen.
1158 			 */
1159 			if (before(*seq, TCP_SKB_CB(skb)->seq)) {
1160 				printk(KERN_INFO "recvmsg bug: copied %X "
1161 				       "seq %X\n", *seq, TCP_SKB_CB(skb)->seq);
1162 				break;
1163 			}
1164 			offset = *seq - TCP_SKB_CB(skb)->seq;
1165 			if (skb->h.th->syn)
1166 				offset--;
1167 			if (offset < skb->len)
1168 				goto found_ok_skb;
1169 			if (skb->h.th->fin)
1170 				goto found_fin_ok;
1171 			BUG_TRAP(flags & MSG_PEEK);
1172 			skb = skb->next;
1173 		} while (skb != (struct sk_buff *)&sk->sk_receive_queue);
1174 
1175 		/* Well, if we have backlog, try to process it now. */
1176 
1177 		if (copied >= target && !sk->sk_backlog.tail)
1178 			break;
1179 
1180 		if (copied) {
1181 			if (sk->sk_err ||
1182 			    sk->sk_state == TCP_CLOSE ||
1183 			    (sk->sk_shutdown & RCV_SHUTDOWN) ||
1184 			    !timeo ||
1185 			    signal_pending(current) ||
1186 			    (flags & MSG_PEEK))
1187 				break;
1188 		} else {
1189 			if (sock_flag(sk, SOCK_DONE))
1190 				break;
1191 
1192 			if (sk->sk_err) {
1193 				copied = sock_error(sk);
1194 				break;
1195 			}
1196 
1197 			if (sk->sk_shutdown & RCV_SHUTDOWN)
1198 				break;
1199 
1200 			if (sk->sk_state == TCP_CLOSE) {
1201 				if (!sock_flag(sk, SOCK_DONE)) {
1202 					/* This occurs when user tries to read
1203 					 * from never connected socket.
1204 					 */
1205 					copied = -ENOTCONN;
1206 					break;
1207 				}
1208 				break;
1209 			}
1210 
1211 			if (!timeo) {
1212 				copied = -EAGAIN;
1213 				break;
1214 			}
1215 
1216 			if (signal_pending(current)) {
1217 				copied = sock_intr_errno(timeo);
1218 				break;
1219 			}
1220 		}
1221 
1222 		cleanup_rbuf(sk, copied);
1223 
1224 		if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) {
1225 			/* Install new reader */
1226 			if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) {
1227 				user_recv = current;
1228 				tp->ucopy.task = user_recv;
1229 				tp->ucopy.iov = msg->msg_iov;
1230 			}
1231 
1232 			tp->ucopy.len = len;
1233 
1234 			BUG_TRAP(tp->copied_seq == tp->rcv_nxt ||
1235 				 (flags & (MSG_PEEK | MSG_TRUNC)));
1236 
1237 			/* Ugly... If prequeue is not empty, we have to
1238 			 * process it before releasing socket, otherwise
1239 			 * order will be broken at second iteration.
1240 			 * More elegant solution is required!!!
1241 			 *
1242 			 * Look: we have the following (pseudo)queues:
1243 			 *
1244 			 * 1. packets in flight
1245 			 * 2. backlog
1246 			 * 3. prequeue
1247 			 * 4. receive_queue
1248 			 *
1249 			 * Each queue can be processed only if the next ones
1250 			 * are empty. At this point we have empty receive_queue.
1251 			 * But prequeue _can_ be not empty after 2nd iteration,
1252 			 * when we jumped to start of loop because backlog
1253 			 * processing added something to receive_queue.
1254 			 * We cannot release_sock(), because backlog contains
1255 			 * packets arrived _after_ prequeued ones.
1256 			 *
1257 			 * Shortly, algorithm is clear --- to process all
1258 			 * the queues in order. We could do it more directly,
1259 			 * requeueing packets from backlog to prequeue, if it
1260 			 * is not empty. It is more elegant, but eats cycles,
1261 			 * unfortunately.
1262 			 */
1263 			if (!skb_queue_empty(&tp->ucopy.prequeue))
1264 				goto do_prequeue;
1265 
1266 			/* __ Set realtime policy in scheduler __ */
1267 		}
1268 
1269 		if (copied >= target) {
1270 			/* Do not sleep, just process backlog. */
1271 			release_sock(sk);
1272 			lock_sock(sk);
1273 		} else
1274 			sk_wait_data(sk, &timeo);
1275 
1276 		if (user_recv) {
1277 			int chunk;
1278 
1279 			/* __ Restore normal policy in scheduler __ */
1280 
1281 			if ((chunk = len - tp->ucopy.len) != 0) {
1282 				NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk);
1283 				len -= chunk;
1284 				copied += chunk;
1285 			}
1286 
1287 			if (tp->rcv_nxt == tp->copied_seq &&
1288 			    !skb_queue_empty(&tp->ucopy.prequeue)) {
1289 do_prequeue:
1290 				tcp_prequeue_process(sk);
1291 
1292 				if ((chunk = len - tp->ucopy.len) != 0) {
1293 					NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
1294 					len -= chunk;
1295 					copied += chunk;
1296 				}
1297 			}
1298 		}
1299 		if ((flags & MSG_PEEK) && peek_seq != tp->copied_seq) {
1300 			if (net_ratelimit())
1301 				printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n",
1302 				       current->comm, current->pid);
1303 			peek_seq = tp->copied_seq;
1304 		}
1305 		continue;
1306 
1307 	found_ok_skb:
1308 		/* Ok so how much can we use? */
1309 		used = skb->len - offset;
1310 		if (len < used)
1311 			used = len;
1312 
1313 		/* Do we have urgent data here? */
1314 		if (tp->urg_data) {
1315 			u32 urg_offset = tp->urg_seq - *seq;
1316 			if (urg_offset < used) {
1317 				if (!urg_offset) {
1318 					if (!sock_flag(sk, SOCK_URGINLINE)) {
1319 						++*seq;
1320 						offset++;
1321 						used--;
1322 						if (!used)
1323 							goto skip_copy;
1324 					}
1325 				} else
1326 					used = urg_offset;
1327 			}
1328 		}
1329 
1330 		if (!(flags & MSG_TRUNC)) {
1331 			err = skb_copy_datagram_iovec(skb, offset,
1332 						      msg->msg_iov, used);
1333 			if (err) {
1334 				/* Exception. Bailout! */
1335 				if (!copied)
1336 					copied = -EFAULT;
1337 				break;
1338 			}
1339 		}
1340 
1341 		*seq += used;
1342 		copied += used;
1343 		len -= used;
1344 
1345 		tcp_rcv_space_adjust(sk);
1346 
1347 skip_copy:
1348 		if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
1349 			tp->urg_data = 0;
1350 			tcp_fast_path_check(sk, tp);
1351 		}
1352 		if (used + offset < skb->len)
1353 			continue;
1354 
1355 		if (skb->h.th->fin)
1356 			goto found_fin_ok;
1357 		if (!(flags & MSG_PEEK))
1358 			sk_eat_skb(sk, skb);
1359 		continue;
1360 
1361 	found_fin_ok:
1362 		/* Process the FIN. */
1363 		++*seq;
1364 		if (!(flags & MSG_PEEK))
1365 			sk_eat_skb(sk, skb);
1366 		break;
1367 	} while (len > 0);
1368 
1369 	if (user_recv) {
1370 		if (!skb_queue_empty(&tp->ucopy.prequeue)) {
1371 			int chunk;
1372 
1373 			tp->ucopy.len = copied > 0 ? len : 0;
1374 
1375 			tcp_prequeue_process(sk);
1376 
1377 			if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
1378 				NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
1379 				len -= chunk;
1380 				copied += chunk;
1381 			}
1382 		}
1383 
1384 		tp->ucopy.task = NULL;
1385 		tp->ucopy.len = 0;
1386 	}
1387 
1388 	/* According to UNIX98, msg_name/msg_namelen are ignored
1389 	 * on a connected socket. I was just happy when I found this 8) --ANK
1390 	 */
1391 
1392 	/* Clean up data we have read: This will do ACK frames. */
1393 	cleanup_rbuf(sk, copied);
1394 
1395 	TCP_CHECK_TIMER(sk);
1396 	release_sock(sk);
1397 	return copied;
1398 
1399 out:
1400 	TCP_CHECK_TIMER(sk);
1401 	release_sock(sk);
1402 	return err;
1403 
1404 recv_urg:
1405 	err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len);
1406 	goto out;
1407 }
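
/*
 * Editor's sketch (not part of the original file): how the "target" computed
 * from sock_rcvlowat() above shows up at the socket API level.
 *
 *	char buf[4096];
 *
 *	recv(fd, buf, sizeof(buf), 0);
 *		// returns once at least SO_RCVLOWAT bytes (default 1) are copied
 *	recv(fd, buf, sizeof(buf), MSG_WAITALL);
 *		// target becomes sizeof(buf): blocks until the buffer is full,
 *		// or EOF, an error, urgent data or a signal cuts the read short
 */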
1408 
1409 /*
1410  *	State processing on a close. This implements the state shift for
1411  *	sending our FIN frame. Note that we only send a FIN for some
1412  *	states. A shutdown() may have already sent the FIN, or we may be
1413  *	closed.
1414  */
1415 
1416 static unsigned char new_state[16] = {
1417   /* current state:        new state:      action:	*/
1418   /* (Invalid)		*/ TCP_CLOSE,
1419   /* TCP_ESTABLISHED	*/ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1420   /* TCP_SYN_SENT	*/ TCP_CLOSE,
1421   /* TCP_SYN_RECV	*/ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1422   /* TCP_FIN_WAIT1	*/ TCP_FIN_WAIT1,
1423   /* TCP_FIN_WAIT2	*/ TCP_FIN_WAIT2,
1424   /* TCP_TIME_WAIT	*/ TCP_CLOSE,
1425   /* TCP_CLOSE		*/ TCP_CLOSE,
1426   /* TCP_CLOSE_WAIT	*/ TCP_LAST_ACK  | TCP_ACTION_FIN,
1427   /* TCP_LAST_ACK	*/ TCP_LAST_ACK,
1428   /* TCP_LISTEN		*/ TCP_CLOSE,
1429   /* TCP_CLOSING	*/ TCP_CLOSING,
1430 };
1431 
1432 static int tcp_close_state(struct sock *sk)
1433 {
1434 	int next = (int)new_state[sk->sk_state];
1435 	int ns = next & TCP_STATE_MASK;
1436 
1437 	tcp_set_state(sk, ns);
1438 
1439 	return next & TCP_ACTION_FIN;
1440 }
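
/*
 * Worked example (editor's note): closing an ESTABLISHED socket looks up
 * new_state[TCP_ESTABLISHED] == TCP_FIN_WAIT1 | TCP_ACTION_FIN, so
 * tcp_close_state() moves the socket to FIN_WAIT1 and returns non-zero,
 * telling the caller to transmit a FIN.  CLOSE_WAIT maps to
 * TCP_LAST_ACK | TCP_ACTION_FIN, while FIN_WAIT2 maps to itself with no
 * action bit, since our FIN has already been sent.
 */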
1441 
1442 /*
1443  *	Shutdown the sending side of a connection. Much like close except
1444  *	that we don't shut down the receive side or mark the socket SOCK_DEAD.
1445  */
1446 
1447 void tcp_shutdown(struct sock *sk, int how)
1448 {
1449 	/*	We need to grab some memory, and put together a FIN,
1450 	 *	and then put it into the queue to be sent.
1451 	 *		Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
1452 	 */
1453 	if (!(how & SEND_SHUTDOWN))
1454 		return;
1455 
1456 	/* If we've already sent a FIN, or it's a closed state, skip this. */
1457 	if ((1 << sk->sk_state) &
1458 	    (TCPF_ESTABLISHED | TCPF_SYN_SENT |
1459 	     TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
1460 		/* Clear out any half completed packets.  FIN if needed. */
1461 		if (tcp_close_state(sk))
1462 			tcp_send_fin(sk);
1463 	}
1464 }
1465 
1466 void tcp_close(struct sock *sk, long timeout)
1467 {
1468 	struct sk_buff *skb;
1469 	int data_was_unread = 0;
1470 
1471 	lock_sock(sk);
1472 	sk->sk_shutdown = SHUTDOWN_MASK;
1473 
1474 	if (sk->sk_state == TCP_LISTEN) {
1475 		tcp_set_state(sk, TCP_CLOSE);
1476 
1477 		/* Special case. */
1478 		inet_csk_listen_stop(sk);
1479 
1480 		goto adjudge_to_death;
1481 	}
1482 
1483 	/*  We need to flush the recv. buffs.  We do this only on the
1484 	 *  descriptor close, not protocol-sourced closes, because the
1485 	 *  reader process may not have drained the data yet!
1486 	 */
1487 	while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
1488 		u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq -
1489 			  skb->h.th->fin;
1490 		data_was_unread += len;
1491 		__kfree_skb(skb);
1492 	}
1493 
1494 	sk_stream_mem_reclaim(sk);
1495 
1496 	/* As outlined in draft-ietf-tcpimpl-prob-03.txt, section
1497 	 * 3.10, we send a RST here because data was lost.  To
1498 	 * witness the awful effects of the old behavior of always
1499 	 * doing a FIN, run an older 2.1.x kernel or 2.0.x, start
1500 	 * a bulk GET in an FTP client, suspend the process, wait
1501 	 * for the client to advertise a zero window, then kill -9
1502 	 * the FTP client, wheee...  Note: timeout is always zero
1503 	 * in such a case.
1504 	 */
1505 	if (data_was_unread) {
1506 		/* Unread data was tossed, zap the connection. */
1507 		NET_INC_STATS_USER(LINUX_MIB_TCPABORTONCLOSE);
1508 		tcp_set_state(sk, TCP_CLOSE);
1509 		tcp_send_active_reset(sk, GFP_KERNEL);
1510 	} else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
1511 		/* Check zero linger _after_ checking for unread data. */
1512 		sk->sk_prot->disconnect(sk, 0);
1513 		NET_INC_STATS_USER(LINUX_MIB_TCPABORTONDATA);
1514 	} else if (tcp_close_state(sk)) {
1515 		/* We FIN if the application ate all the data before
1516 		 * zapping the connection.
1517 		 */
1518 
1519 		/* RED-PEN. Formally speaking, we have broken TCP state
1520 		 * machine. State transitions:
1521 		 *
1522 		 * TCP_ESTABLISHED -> TCP_FIN_WAIT1
1523 		 * TCP_SYN_RECV	-> TCP_FIN_WAIT1 (forget it, it's impossible)
1524 		 * TCP_CLOSE_WAIT -> TCP_LAST_ACK
1525 		 *
1526 		 * are legal only when FIN has been sent (i.e. in window),
1527 		 * rather than queued out of window. Purists blame.
1528 		 *
1529 		 * F.e. "RFC state" is ESTABLISHED,
1530 		 * if Linux state is FIN-WAIT-1, but FIN is still not sent.
1531 		 *
1532 		 * The visible deviations are that sometimes
1533 		 * we enter the time-wait state when it is not really required
1534 		 * (harmless), and do not send active resets when they are
1535 		 * required by the specs (TCP_ESTABLISHED, TCP_CLOSE_WAIT, when
1536 		 * they look like CLOSING or LAST_ACK to Linux).
1537 		 * Probably, I missed some more holelets.
1538 		 * 						--ANK
1539 		 */
1540 		tcp_send_fin(sk);
1541 	}
1542 
1543 	sk_stream_wait_close(sk, timeout);
1544 
1545 adjudge_to_death:
1546 	/* It is the last release_sock in its life. It will remove backlog. */
1547 	release_sock(sk);
1548 
1549 
1550 	/* Now socket is owned by kernel and we acquire BH lock
1551 	   to finish close. No need to check for user refs.
1552 	 */
1553 	local_bh_disable();
1554 	bh_lock_sock(sk);
1555 	BUG_TRAP(!sock_owned_by_user(sk));
1556 
1557 	sock_hold(sk);
1558 	sock_orphan(sk);
1559 
1560 	/*	This is a (useful) BSD violation of the RFC. There is a
1561 	 *	problem with TCP as specified in that the other end could
1562 	 *	keep a socket open forever with no application left at this end.
1563 	 *	We use a 3 minute timeout (about the same as BSD) and then kill
1564 	 *	our end. If they send after that then tough - BUT: long enough
1565 	 *	that we won't make the old 4*rto = almost no time - whoops
1566 	 *	reset mistake.
1567 	 *
1568 	 *	Nope, it was not a mistake. It is really desired behaviour,
1569 	 *	e.g. on http servers, where such sockets are useless, but
1570 	 *	consume significant resources. Let's do it with the special
1571 	 *	linger2	option.					--ANK
1572 	 */
1573 
1574 	if (sk->sk_state == TCP_FIN_WAIT2) {
1575 		struct tcp_sock *tp = tcp_sk(sk);
1576 		if (tp->linger2 < 0) {
1577 			tcp_set_state(sk, TCP_CLOSE);
1578 			tcp_send_active_reset(sk, GFP_ATOMIC);
1579 			NET_INC_STATS_BH(LINUX_MIB_TCPABORTONLINGER);
1580 		} else {
1581 			const int tmo = tcp_fin_time(sk);
1582 
1583 			if (tmo > TCP_TIMEWAIT_LEN) {
1584 				inet_csk_reset_keepalive_timer(sk, tcp_fin_time(sk));
1585 			} else {
1586 				atomic_inc(sk->sk_prot->orphan_count);
1587 				tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
1588 				goto out;
1589 			}
1590 		}
1591 	}
1592 	if (sk->sk_state != TCP_CLOSE) {
1593 		sk_stream_mem_reclaim(sk);
1594 		if (atomic_read(sk->sk_prot->orphan_count) > sysctl_tcp_max_orphans ||
1595 		    (sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
1596 		     atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) {
1597 			if (net_ratelimit())
1598 				printk(KERN_INFO "TCP: too many orphaned "
1599 				       "sockets\n");
1600 			tcp_set_state(sk, TCP_CLOSE);
1601 			tcp_send_active_reset(sk, GFP_ATOMIC);
1602 			NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY);
1603 		}
1604 	}
1605 	atomic_inc(sk->sk_prot->orphan_count);
1606 
1607 	if (sk->sk_state == TCP_CLOSE)
1608 		inet_csk_destroy_sock(sk);
1609 	/* Otherwise, socket is reprieved until protocol close. */
1610 
1611 out:
1612 	bh_unlock_sock(sk);
1613 	local_bh_enable();
1614 	sock_put(sk);
1615 }
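
/*
 * Editor's sketch (not part of the original file): the zero-linger abort
 * handled above (RFC 793 ABORT), as requested from user space.
 *
 *	struct linger lg = { .l_onoff = 1, .l_linger = 0 };
 *
 *	setsockopt(fd, SOL_SOCKET, SO_LINGER, &lg, sizeof(lg));
 *	close(fd);	// disconnect path: RST instead of FIN, no TIME_WAIT
 *
 * Without the option, close() on a drained socket takes the tcp_close_state()
 * path instead and sends a FIN.
 */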
1616 
1617 /* These states need RST on ABORT according to RFC793 */
1618 
1619 static inline int tcp_need_reset(int state)
1620 {
1621 	return (1 << state) &
1622 	       (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
1623 		TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
1624 }
1625 
1626 int tcp_disconnect(struct sock *sk, int flags)
1627 {
1628 	struct inet_sock *inet = inet_sk(sk);
1629 	struct inet_connection_sock *icsk = inet_csk(sk);
1630 	struct tcp_sock *tp = tcp_sk(sk);
1631 	int err = 0;
1632 	int old_state = sk->sk_state;
1633 
1634 	if (old_state != TCP_CLOSE)
1635 		tcp_set_state(sk, TCP_CLOSE);
1636 
1637 	/* ABORT function of RFC793 */
1638 	if (old_state == TCP_LISTEN) {
1639 		inet_csk_listen_stop(sk);
1640 	} else if (tcp_need_reset(old_state) ||
1641 		   (tp->snd_nxt != tp->write_seq &&
1642 		    (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
1643 		/* The last check adjusts for the discrepancy of Linux wrt. RFC
1644 		 * states
1645 		 */
1646 		tcp_send_active_reset(sk, gfp_any());
1647 		sk->sk_err = ECONNRESET;
1648 	} else if (old_state == TCP_SYN_SENT)
1649 		sk->sk_err = ECONNRESET;
1650 
1651 	tcp_clear_xmit_timers(sk);
1652 	__skb_queue_purge(&sk->sk_receive_queue);
1653 	sk_stream_writequeue_purge(sk);
1654 	__skb_queue_purge(&tp->out_of_order_queue);
1655 
1656 	inet->dport = 0;
1657 
1658 	if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
1659 		inet_reset_saddr(sk);
1660 
1661 	sk->sk_shutdown = 0;
1662 	sock_reset_flag(sk, SOCK_DONE);
1663 	tp->srtt = 0;
1664 	if ((tp->write_seq += tp->max_window + 2) == 0)
1665 		tp->write_seq = 1;
1666 	icsk->icsk_backoff = 0;
1667 	tp->snd_cwnd = 2;
1668 	icsk->icsk_probes_out = 0;
1669 	tp->packets_out = 0;
1670 	tp->snd_ssthresh = 0x7fffffff;
1671 	tp->snd_cwnd_cnt = 0;
1672 	tcp_set_ca_state(sk, TCP_CA_Open);
1673 	tcp_clear_retrans(tp);
1674 	inet_csk_delack_init(sk);
1675 	sk->sk_send_head = NULL;
1676 	tp->rx_opt.saw_tstamp = 0;
1677 	tcp_sack_reset(&tp->rx_opt);
1678 	__sk_dst_reset(sk);
1679 
1680 	BUG_TRAP(!inet->num || icsk->icsk_bind_hash);
1681 
1682 	sk->sk_error_report(sk);
1683 	return err;
1684 }
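
/*
 * Editor's sketch (not part of the original file): for TCP, user space
 * usually reaches tcp_disconnect() by calling connect() with an AF_UNSPEC
 * address (dispatched via inet_stream_connect()); that caller is an
 * assumption, it is not shown in this file.
 *
 *	struct sockaddr sa = { .sa_family = AF_UNSPEC };
 *
 *	connect(fd, &sa, sizeof(sa));	// abort the association; the socket can
 *					// then be bound or connected again
 */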
1685 
1686 /*
1687  *	Socket option code for TCP.
1688  */
1689 int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
1690 		   int optlen)
1691 {
1692 	struct tcp_sock *tp = tcp_sk(sk);
1693 	struct inet_connection_sock *icsk = inet_csk(sk);
1694 	int val;
1695 	int err = 0;
1696 
1697 	if (level != SOL_TCP)
1698 		return tp->af_specific->setsockopt(sk, level, optname,
1699 						   optval, optlen);
1700 
1701 	/* This is a string value; all the others are ints */
1702 	if (optname == TCP_CONGESTION) {
1703 		char name[TCP_CA_NAME_MAX];
1704 
1705 		if (optlen < 1)
1706 			return -EINVAL;
1707 
1708 		val = strncpy_from_user(name, optval,
1709 					min(TCP_CA_NAME_MAX-1, optlen));
1710 		if (val < 0)
1711 			return -EFAULT;
1712 		name[val] = 0;
1713 
1714 		lock_sock(sk);
1715 		err = tcp_set_congestion_control(sk, name);
1716 		release_sock(sk);
1717 		return err;
1718 	}
1719 
1720 	if (optlen < sizeof(int))
1721 		return -EINVAL;
1722 
1723 	if (get_user(val, (int __user *)optval))
1724 		return -EFAULT;
1725 
1726 	lock_sock(sk);
1727 
1728 	switch (optname) {
1729 	case TCP_MAXSEG:
1730 		/* Values greater than the interface MTU won't take effect. However,
1731 		 * at the point when this call is done we typically don't yet
1732 		 * know which interface is going to be used. */
1733 		if (val < 8 || val > MAX_TCP_WINDOW) {
1734 			err = -EINVAL;
1735 			break;
1736 		}
1737 		tp->rx_opt.user_mss = val;
1738 		break;
1739 
1740 	case TCP_NODELAY:
1741 		if (val) {
1742 			/* TCP_NODELAY is weaker than TCP_CORK, so that
1743 			 * this option on corked socket is remembered, but
1744 			 * it is not activated until cork is cleared.
1745 			 *
1746 			 * However, when TCP_NODELAY is set we make
1747 			 * an explicit push, which overrides even TCP_CORK
1748 			 * for currently queued segments.
1749 			 */
1750 			tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
1751 			tcp_push_pending_frames(sk, tp);
1752 		} else {
1753 			tp->nonagle &= ~TCP_NAGLE_OFF;
1754 		}
1755 		break;
1756 
1757 	case TCP_CORK:
1758 		/* When set indicates to always queue non-full frames.
1759 		 * Later the user clears this option and we transmit
1760 		 * any pending partial frames in the queue.  This is
1761 		 * meant to be used alongside sendfile() to get properly
1762 		 * filled frames when the user (for example) must write
1763 		 * out headers with a write() call first and then use
1764 		 * sendfile to send out the data parts.
1765 		 *
1766 		 * TCP_CORK can be set together with TCP_NODELAY and it is
1767 		 * stronger than TCP_NODELAY.
1768 		 */
1769 		if (val) {
1770 			tp->nonagle |= TCP_NAGLE_CORK;
1771 		} else {
1772 			tp->nonagle &= ~TCP_NAGLE_CORK;
1773 			if (tp->nonagle&TCP_NAGLE_OFF)
1774 				tp->nonagle |= TCP_NAGLE_PUSH;
1775 			tcp_push_pending_frames(sk, tp);
1776 		}
1777 		break;
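	/*
	 * Userspace sketch of the write()-then-sendfile() pattern the
	 * comment above describes; "fd", "filefd", "hdr", "hdr_len" and
	 * "file_len" are assumed and error handling is omitted:
	 *
	 *	int on = 1, off = 0;
	 *
	 *	setsockopt(fd, IPPROTO_TCP, TCP_CORK, &on, sizeof(on));
	 *	write(fd, hdr, hdr_len);                (headers are queued)
	 *	sendfile(fd, filefd, NULL, file_len);   (body is queued)
	 *	setsockopt(fd, IPPROTO_TCP, TCP_CORK, &off, sizeof(off));
	 *	                        (uncorking flushes any partial frame)
	 */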
1778 
1779 	case TCP_KEEPIDLE:
1780 		if (val < 1 || val > MAX_TCP_KEEPIDLE)
1781 			err = -EINVAL;
1782 		else {
1783 			tp->keepalive_time = val * HZ;
1784 			if (sock_flag(sk, SOCK_KEEPOPEN) &&
1785 			    !((1 << sk->sk_state) &
1786 			      (TCPF_CLOSE | TCPF_LISTEN))) {
1787 				__u32 elapsed = tcp_time_stamp - tp->rcv_tstamp;
1788 				if (tp->keepalive_time > elapsed)
1789 					elapsed = tp->keepalive_time - elapsed;
1790 				else
1791 					elapsed = 0;
1792 				inet_csk_reset_keepalive_timer(sk, elapsed);
1793 			}
1794 		}
1795 		break;
1796 	case TCP_KEEPINTVL:
1797 		if (val < 1 || val > MAX_TCP_KEEPINTVL)
1798 			err = -EINVAL;
1799 		else
1800 			tp->keepalive_intvl = val * HZ;
1801 		break;
1802 	case TCP_KEEPCNT:
1803 		if (val < 1 || val > MAX_TCP_KEEPCNT)
1804 			err = -EINVAL;
1805 		else
1806 			tp->keepalive_probes = val;
1807 		break;
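	/*
	 * Userspace sketch of keepalive tuning.  TCP_KEEPIDLE and
	 * TCP_KEEPINTVL are given in seconds, TCP_KEEPCNT is a probe
	 * count, and they only influence the connection once SO_KEEPALIVE
	 * is turned on ("fd" is assumed to be a TCP socket):
	 *
	 *	int on = 1, idle = 60, intvl = 10, cnt = 5;
	 *
	 *	setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &on, sizeof(on));
	 *	setsockopt(fd, IPPROTO_TCP, TCP_KEEPIDLE, &idle, sizeof(idle));
	 *	setsockopt(fd, IPPROTO_TCP, TCP_KEEPINTVL, &intvl, sizeof(intvl));
	 *	setsockopt(fd, IPPROTO_TCP, TCP_KEEPCNT, &cnt, sizeof(cnt));
	 */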
1808 	case TCP_SYNCNT:
1809 		if (val < 1 || val > MAX_TCP_SYNCNT)
1810 			err = -EINVAL;
1811 		else
1812 			icsk->icsk_syn_retries = val;
1813 		break;
1814 
1815 	case TCP_LINGER2:
1816 		if (val < 0)
1817 			tp->linger2 = -1;
1818 		else if (val > sysctl_tcp_fin_timeout / HZ)
1819 			tp->linger2 = 0;
1820 		else
1821 			tp->linger2 = val * HZ;
1822 		break;
1823 
1824 	case TCP_DEFER_ACCEPT:
1825 		icsk->icsk_accept_queue.rskq_defer_accept = 0;
1826 		if (val > 0) {
1827 			/* Translate the value from seconds into a number of
1828 			 * retransmits. */
1829 			while (icsk->icsk_accept_queue.rskq_defer_accept < 32 &&
1830 			       val > ((TCP_TIMEOUT_INIT / HZ) <<
1831 				       icsk->icsk_accept_queue.rskq_defer_accept))
1832 				icsk->icsk_accept_queue.rskq_defer_accept++;
1833 			icsk->icsk_accept_queue.rskq_defer_accept++;
1834 		}
1835 		break;
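	/*
	 * Worked example of the translation above, assuming
	 * TCP_TIMEOUT_INIT / HZ == 3 seconds: for val == 10 the loop
	 * stops at rskq_defer_accept == 2 (because 10 <= 3 << 2) and the
	 * final increment stores 3.  tcp_getsockopt() reports this back
	 * as 3 << (3 - 1) = 12 seconds, i.e. the requested time is
	 * rounded up to a whole number of retransmission periods.
	 */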
1836 
1837 	case TCP_WINDOW_CLAMP:
1838 		if (!val) {
1839 			if (sk->sk_state != TCP_CLOSE) {
1840 				err = -EINVAL;
1841 				break;
1842 			}
1843 			tp->window_clamp = 0;
1844 		} else
1845 			tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
1846 						SOCK_MIN_RCVBUF / 2 : val;
1847 		break;
1848 
1849 	case TCP_QUICKACK:
1850 		if (!val) {
1851 			icsk->icsk_ack.pingpong = 1;
1852 		} else {
1853 			icsk->icsk_ack.pingpong = 0;
1854 			if ((1 << sk->sk_state) &
1855 			    (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
1856 			    inet_csk_ack_scheduled(sk)) {
1857 				icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
1858 				cleanup_rbuf(sk, 1);
1859 				if (!(val & 1))
1860 					icsk->icsk_ack.pingpong = 1;
1861 			}
1862 		}
1863 		break;
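	/*
	 * Userspace sketch: TCP_QUICKACK temporarily leaves delayed-ACK
	 * (pingpong) mode; as the code above shows, it is not a permanent
	 * setting and the socket may re-enter pingpong mode later ("fd"
	 * is assumed to be a TCP socket):
	 *
	 *	int one = 1;
	 *
	 *	setsockopt(fd, IPPROTO_TCP, TCP_QUICKACK, &one, sizeof(one));
	 */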
1864 
1865 	default:
1866 		err = -ENOPROTOOPT;
1867 		break;
1868 	}
1869 	release_sock(sk);
1870 	return err;
1871 }
1872 
1873 /* Return information about the state of a TCP endpoint in API format. */
1874 void tcp_get_info(struct sock *sk, struct tcp_info *info)
1875 {
1876 	struct tcp_sock *tp = tcp_sk(sk);
1877 	const struct inet_connection_sock *icsk = inet_csk(sk);
1878 	u32 now = tcp_time_stamp;
1879 
1880 	memset(info, 0, sizeof(*info));
1881 
1882 	info->tcpi_state = sk->sk_state;
1883 	info->tcpi_ca_state = icsk->icsk_ca_state;
1884 	info->tcpi_retransmits = icsk->icsk_retransmits;
1885 	info->tcpi_probes = icsk->icsk_probes_out;
1886 	info->tcpi_backoff = icsk->icsk_backoff;
1887 
1888 	if (tp->rx_opt.tstamp_ok)
1889 		info->tcpi_options |= TCPI_OPT_TIMESTAMPS;
1890 	if (tp->rx_opt.sack_ok)
1891 		info->tcpi_options |= TCPI_OPT_SACK;
1892 	if (tp->rx_opt.wscale_ok) {
1893 		info->tcpi_options |= TCPI_OPT_WSCALE;
1894 		info->tcpi_snd_wscale = tp->rx_opt.snd_wscale;
1895 		info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale;
1896 	}
1897 
1898 	if (tp->ecn_flags&TCP_ECN_OK)
1899 		info->tcpi_options |= TCPI_OPT_ECN;
1900 
1901 	info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto);
1902 	info->tcpi_ato = jiffies_to_usecs(icsk->icsk_ack.ato);
1903 	info->tcpi_snd_mss = tp->mss_cache;
1904 	info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss;
1905 
1906 	info->tcpi_unacked = tp->packets_out;
1907 	info->tcpi_sacked = tp->sacked_out;
1908 	info->tcpi_lost = tp->lost_out;
1909 	info->tcpi_retrans = tp->retrans_out;
1910 	info->tcpi_fackets = tp->fackets_out;
1911 
1912 	info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime);
1913 	info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime);
1914 	info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp);
1915 
1916 	info->tcpi_pmtu = tp->pmtu_cookie;
1917 	info->tcpi_rcv_ssthresh = tp->rcv_ssthresh;
1918 	info->tcpi_rtt = jiffies_to_usecs(tp->srtt)>>3;
1919 	info->tcpi_rttvar = jiffies_to_usecs(tp->mdev)>>2;
1920 	info->tcpi_snd_ssthresh = tp->snd_ssthresh;
1921 	info->tcpi_snd_cwnd = tp->snd_cwnd;
1922 	info->tcpi_advmss = tp->advmss;
1923 	info->tcpi_reordering = tp->reordering;
1924 
1925 	info->tcpi_rcv_rtt = jiffies_to_usecs(tp->rcv_rtt_est.rtt)>>3;
1926 	info->tcpi_rcv_space = tp->rcvq_space.space;
1927 
1928 	info->tcpi_total_retrans = tp->total_retrans;
1929 }
1930 
1931 EXPORT_SYMBOL_GPL(tcp_get_info);
1932 
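/*
 * Userspace sketch of retrieving the structure filled in above via the
 * TCP_INFO option handled by tcp_getsockopt() below ("fd" is assumed
 * to be a TCP socket; struct tcp_info comes from <netinet/tcp.h> or
 * <linux/tcp.h>):
 *
 *	struct tcp_info info;
 *	socklen_t len = sizeof(info);
 *
 *	if (getsockopt(fd, IPPROTO_TCP, TCP_INFO, &info, &len) == 0)
 *		printf("rtt %u us, cwnd %u\n",
 *		       info.tcpi_rtt, info.tcpi_snd_cwnd);
 */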
1933 int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
1934 		   int __user *optlen)
1935 {
1936 	struct inet_connection_sock *icsk = inet_csk(sk);
1937 	struct tcp_sock *tp = tcp_sk(sk);
1938 	int val, len;
1939 
1940 	if (level != SOL_TCP)
1941 		return tp->af_specific->getsockopt(sk, level, optname,
1942 						   optval, optlen);
1943 
1944 	if (get_user(len, optlen))
1945 		return -EFAULT;
1946 
1947 	len = min_t(unsigned int, len, sizeof(int));
1948 
1949 	if (len < 0)
1950 		return -EINVAL;
1951 
1952 	switch (optname) {
1953 	case TCP_MAXSEG:
1954 		val = tp->mss_cache;
1955 		if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
1956 			val = tp->rx_opt.user_mss;
1957 		break;
1958 	case TCP_NODELAY:
1959 		val = !!(tp->nonagle&TCP_NAGLE_OFF);
1960 		break;
1961 	case TCP_CORK:
1962 		val = !!(tp->nonagle&TCP_NAGLE_CORK);
1963 		break;
1964 	case TCP_KEEPIDLE:
1965 		val = (tp->keepalive_time ? : sysctl_tcp_keepalive_time) / HZ;
1966 		break;
1967 	case TCP_KEEPINTVL:
1968 		val = (tp->keepalive_intvl ? : sysctl_tcp_keepalive_intvl) / HZ;
1969 		break;
1970 	case TCP_KEEPCNT:
1971 		val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes;
1972 		break;
1973 	case TCP_SYNCNT:
1974 		val = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
1975 		break;
1976 	case TCP_LINGER2:
1977 		val = tp->linger2;
1978 		if (val >= 0)
1979 			val = (val ? : sysctl_tcp_fin_timeout) / HZ;
1980 		break;
1981 	case TCP_DEFER_ACCEPT:
1982 		val = !icsk->icsk_accept_queue.rskq_defer_accept ? 0 :
1983 			((TCP_TIMEOUT_INIT / HZ) << (icsk->icsk_accept_queue.rskq_defer_accept - 1));
1984 		break;
1985 	case TCP_WINDOW_CLAMP:
1986 		val = tp->window_clamp;
1987 		break;
1988 	case TCP_INFO: {
1989 		struct tcp_info info;
1990 
1991 		if (get_user(len, optlen))
1992 			return -EFAULT;
1993 
1994 		tcp_get_info(sk, &info);
1995 
1996 		len = min_t(unsigned int, len, sizeof(info));
1997 		if (put_user(len, optlen))
1998 			return -EFAULT;
1999 		if (copy_to_user(optval, &info, len))
2000 			return -EFAULT;
2001 		return 0;
2002 	}
2003 	case TCP_QUICKACK:
2004 		val = !icsk->icsk_ack.pingpong;
2005 		break;
2006 
2007 	case TCP_CONGESTION:
2008 		if (get_user(len, optlen))
2009 			return -EFAULT;
2010 		len = min_t(unsigned int, len, TCP_CA_NAME_MAX);
2011 		if (put_user(len, optlen))
2012 			return -EFAULT;
2013 		if (copy_to_user(optval, icsk->icsk_ca_ops->name, len))
2014 			return -EFAULT;
2015 		return 0;
2016 	default:
2017 		return -ENOPROTOOPT;
2018 	}
2019 
2020 	if (put_user(len, optlen))
2021 		return -EFAULT;
2022 	if (copy_to_user(optval, &val, len))
2023 		return -EFAULT;
2024 	return 0;
2025 }
2026 
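/*
 * Sketch of reading the current congestion control algorithm back
 * through the string-valued TCP_CONGESTION case above (16 is
 * TCP_CA_NAME_MAX; "fd" is assumed to be a TCP socket):
 *
 *	char name[16];
 *	socklen_t len = sizeof(name);
 *
 *	if (getsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, name, &len) == 0)
 *		printf("%.*s\n", (int)len, name);
 */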
2027 
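/*
 * __skb_cb_too_small_for_tcp() is deliberately declared but never
 * defined: the size check in tcp_init() below is a compile-time
 * constant, so when struct tcp_skb_cb fits in skb->cb the call is
 * optimized away entirely, and when it does not fit the build fails
 * at link time.
 */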
2028 extern void __skb_cb_too_small_for_tcp(int, int);
2029 extern struct tcp_congestion_ops tcp_reno;
2030 
2031 static __initdata unsigned long thash_entries;
2032 static int __init set_thash_entries(char *str)
2033 {
2034 	if (!str)
2035 		return 0;
2036 	thash_entries = simple_strtoul(str, &str, 0);
2037 	return 1;
2038 }
2039 __setup("thash_entries=", set_thash_entries);
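/*
 * The established-hash size chosen in tcp_init() below can be
 * overridden at boot with the parameter registered above, e.g. by
 * adding something like "thash_entries=131072" to the kernel command
 * line (the example value is arbitrary).
 */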
2040 
2041 void __init tcp_init(void)
2042 {
2043 	struct sk_buff *skb = NULL;
2044 	int order, i;
2045 
2046 	if (sizeof(struct tcp_skb_cb) > sizeof(skb->cb))
2047 		__skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb),
2048 					   sizeof(skb->cb));
2049 
2050 	tcp_hashinfo.bind_bucket_cachep =
2051 		kmem_cache_create("tcp_bind_bucket",
2052 				  sizeof(struct inet_bind_bucket), 0,
2053 				  SLAB_HWCACHE_ALIGN, NULL, NULL);
2054 	if (!tcp_hashinfo.bind_bucket_cachep)
2055 		panic("tcp_init: Cannot alloc tcp_bind_bucket cache.");
2056 
2057 	/* Size and allocate the main established and bind bucket
2058 	 * hash tables.
2059 	 *
2060 	 * The methodology is similar to that of the buffer cache.
2061 	 */
2062 	tcp_hashinfo.ehash =
2063 		alloc_large_system_hash("TCP established",
2064 					sizeof(struct inet_ehash_bucket),
2065 					thash_entries,
2066 					(num_physpages >= 128 * 1024) ?
2067 						(25 - PAGE_SHIFT) :
2068 						(27 - PAGE_SHIFT),
2069 					HASH_HIGHMEM,
2070 					&tcp_hashinfo.ehash_size,
2071 					NULL,
2072 					0);
2073 	tcp_hashinfo.ehash_size = (1 << tcp_hashinfo.ehash_size) >> 1;
2074 	for (i = 0; i < (tcp_hashinfo.ehash_size << 1); i++) {
2075 		rwlock_init(&tcp_hashinfo.ehash[i].lock);
2076 		INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].chain);
2077 	}
2078 
2079 	tcp_hashinfo.bhash =
2080 		alloc_large_system_hash("TCP bind",
2081 					sizeof(struct inet_bind_hashbucket),
2082 					tcp_hashinfo.ehash_size,
2083 					(num_physpages >= 128 * 1024) ?
2084 						(25 - PAGE_SHIFT) :
2085 						(27 - PAGE_SHIFT),
2086 					HASH_HIGHMEM,
2087 					&tcp_hashinfo.bhash_size,
2088 					NULL,
2089 					64 * 1024);
2090 	tcp_hashinfo.bhash_size = 1 << tcp_hashinfo.bhash_size;
2091 	for (i = 0; i < tcp_hashinfo.bhash_size; i++) {
2092 		spin_lock_init(&tcp_hashinfo.bhash[i].lock);
2093 		INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
2094 	}
2095 
2096 	/* Try to be a bit smarter and adjust defaults depending
2097 	 * on available memory.
2098 	 */
2099 	for (order = 0; ((1 << order) << PAGE_SHIFT) <
2100 			(tcp_hashinfo.bhash_size * sizeof(struct inet_bind_hashbucket));
2101 			order++)
2102 		;
2103 	if (order >= 4) {
2104 		sysctl_local_port_range[0] = 32768;
2105 		sysctl_local_port_range[1] = 61000;
2106 		tcp_death_row.sysctl_max_tw_buckets = 180000;
2107 		sysctl_tcp_max_orphans = 4096 << (order - 4);
2108 		sysctl_max_syn_backlog = 1024;
2109 	} else if (order < 3) {
2110 		sysctl_local_port_range[0] = 1024 * (3 - order);
2111 		tcp_death_row.sysctl_max_tw_buckets >>= (3 - order);
2112 		sysctl_tcp_max_orphans >>= (3 - order);
2113 		sysctl_max_syn_backlog = 128;
2114 	}
2115 	tcp_hashinfo.port_rover = sysctl_local_port_range[0] - 1;
2116 
2117 	sysctl_tcp_mem[0] =  768 << order;
2118 	sysctl_tcp_mem[1] = 1024 << order;
2119 	sysctl_tcp_mem[2] = 1536 << order;
2120 
2121 	if (order < 3) {
2122 		sysctl_tcp_wmem[2] = 64 * 1024;
2123 		sysctl_tcp_rmem[0] = PAGE_SIZE;
2124 		sysctl_tcp_rmem[1] = 43689;
2125 		sysctl_tcp_rmem[2] = 2 * 43689;
2126 	}
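/*
 * Worked example of the sizing above (hypothetical machine): if the
 * bind-hash table occupies 16 pages, the loop leaves order == 4, so
 * the local port range becomes 32768-61000 and sysctl_tcp_mem is set
 * to { 768 << 4, 1024 << 4, 1536 << 4 } = { 12288, 16384, 24576 }
 * pages, i.e. roughly 48/64/96 MB of TCP memory with 4 KB pages.
 */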
2127 
2128 	printk(KERN_INFO "TCP: Hash tables configured "
2129 	       "(established %d bind %d)\n",
2130 	       tcp_hashinfo.ehash_size << 1, tcp_hashinfo.bhash_size);
2131 
2132 	tcp_register_congestion_control(&tcp_reno);
2133 }
2134 
2135 EXPORT_SYMBOL(tcp_close);
2136 EXPORT_SYMBOL(tcp_disconnect);
2137 EXPORT_SYMBOL(tcp_getsockopt);
2138 EXPORT_SYMBOL(tcp_ioctl);
2139 EXPORT_SYMBOL(tcp_poll);
2140 EXPORT_SYMBOL(tcp_read_sock);
2141 EXPORT_SYMBOL(tcp_recvmsg);
2142 EXPORT_SYMBOL(tcp_sendmsg);
2143 EXPORT_SYMBOL(tcp_sendpage);
2144 EXPORT_SYMBOL(tcp_setsockopt);
2145 EXPORT_SYMBOL(tcp_shutdown);
2146 EXPORT_SYMBOL(tcp_statistics);
2147