xref: /openbmc/linux/net/ipv4/tcp_timer.c (revision d5cb9783536a41df9f9cba5b0a1d78047ed787f7)
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 * Version:	$Id: tcp_timer.c,v 1.88 2002/02/01 22:01:04 davem Exp $
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Mark Evans, <evansmp@uhura.aston.ac.uk>
 *		Corey Minyard <wf-rch!minyard@relay.EU.net>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
 *		Linus Torvalds, <torvalds@cs.helsinki.fi>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Matthew Dillon, <dillon@apollo.west.oic.com>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 */

#include <linux/module.h>
#include <net/tcp.h>

int sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
int sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
int sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
int sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
int sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
int sysctl_tcp_retries1 = TCP_RETR1;
int sysctl_tcp_retries2 = TCP_RETR2;
int sysctl_tcp_orphan_retries;

static void tcp_write_timer(unsigned long);
static void tcp_delack_timer(unsigned long);
static void tcp_keepalive_timer (unsigned long data);

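/*
 * Register the TCP timer callbacks (retransmit/probe, delayed ACK and
 * keepalive) with the generic inet_connection_sock timer infrastructure.
 */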
void tcp_init_xmit_timers(struct sock *sk)
{
	inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer,
				  &tcp_keepalive_timer);
}

EXPORT_SYMBOL(tcp_init_xmit_timers);

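/*
 * A timer-driven timeout has become fatal: report ETIMEDOUT (or a pending
 * soft error) to the application and tear the connection down.
 */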
static void tcp_write_err(struct sock *sk)
{
	sk->sk_err = sk->sk_err_soft ? : ETIMEDOUT;
	sk->sk_error_report(sk);

	tcp_done(sk);
	NET_INC_STATS_BH(LINUX_MIB_TCPABORTONTIMEOUT);
}

/* Do not allow orphaned sockets to eat all our resources.
 * This is a direct violation of the TCP specs, but it is required
 * to prevent DoS attacks. It is called when a retransmission timeout
 * or zero window probe timeout occurs on an orphaned socket.
 *
 * The criterion is still not confirmed experimentally and may change.
 * We kill the socket if:
 * 1. The number of orphaned sockets exceeds an administratively configured
 *    limit.
 * 2. We are under strong memory pressure.
 */
static int tcp_out_of_resources(struct sock *sk, int do_reset)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int orphans = atomic_read(&tcp_orphan_count);

	/* If the peer has not opened its window for a long time, or has not
	 * transmitted anything for a long time, penalize it. */
	if ((s32)(tcp_time_stamp - tp->lsndtime) > 2*TCP_RTO_MAX || !do_reset)
		orphans <<= 1;
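	/* Note: doubling the local orphan count effectively halves the
	 * limit checked against sysctl_tcp_max_orphans below. */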

	/* If some dubious ICMP arrived, penalize even more. */
	if (sk->sk_err_soft)
		orphans <<= 1;

	if (orphans >= sysctl_tcp_max_orphans ||
	    (sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
	     atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) {
		if (net_ratelimit())
			printk(KERN_INFO "Out of socket memory\n");

		/* Catch exceptional cases when the connection requires a reset.
		 *      1. Last segment was sent recently. */
		if ((s32)(tcp_time_stamp - tp->lsndtime) <= TCP_TIMEWAIT_LEN ||
		    /*  2. Window is closed. */
		    (!tp->snd_wnd && !tp->packets_out))
			do_reset = 1;
		if (do_reset)
			tcp_send_active_reset(sk, GFP_ATOMIC);
		tcp_done(sk);
		NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY);
		return 1;
	}
	return 0;
}

/* Calculate the maximal number of retries on an orphaned socket. */
static int tcp_orphan_retries(struct sock *sk, int alive)
{
	int retries = sysctl_tcp_orphan_retries; /* May be zero. */

	/* We know from an ICMP that something is wrong. */
	if (sk->sk_err_soft && !alive)
		retries = 0;

	/* However, if the socket sent something recently, select some safe
	 * number of retries. 8 corresponds to >100 seconds with minimal
	 * RTO of 200msec. */
	if (retries == 0 && alive)
		retries = 8;
	return retries;
}

/* A write timeout has occurred. Process the after effects. */
static int tcp_write_timeout(struct sock *sk)
{
	const struct inet_connection_sock *icsk = inet_csk(sk);
	int retry_until;

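	/* Connections still in the handshake (SYN_SENT/SYN_RECV) retry up to
	 * icsk_syn_retries (or the tcp_syn_retries sysctl); established
	 * connections use tcp_retries2, possibly reduced below for orphans. */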
	if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
		if (icsk->icsk_retransmits)
			dst_negative_advice(&sk->sk_dst_cache);
		retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
	} else {
		if (icsk->icsk_retransmits >= sysctl_tcp_retries1) {
			/* NOTE. draft-ietf-tcpimpl-pmtud-01.txt requires PMTU black
			   hole detection. :-(

			   This would be the place to implement it, but it is not
			   implemented. I do not want to implement it. It is
			   disgusting. It does not work in any case. Let me cite
			   the same draft, which requires us to implement this:

   "The one security concern raised by this memo is that ICMP black holes
   are often caused by over-zealous security administrators who block
   all ICMP messages.  It is vitally important that those who design and
   deploy security systems understand the impact of strict filtering on
   upper-layer protocols.  The safest web site in the world is worthless
   if most TCP implementations cannot transfer data from it.  It would
   be far nicer to have all of the black holes fixed rather than fixing
   all of the TCP implementations."

                           Golden words :-).
		   */

			dst_negative_advice(&sk->sk_dst_cache);
		}

		retry_until = sysctl_tcp_retries2;
		if (sock_flag(sk, SOCK_DEAD)) {
			const int alive = (icsk->icsk_rto < TCP_RTO_MAX);
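			/* "alive": exponential backoff has not yet pushed the
			 * RTO up to its TCP_RTO_MAX ceiling. */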

			retry_until = tcp_orphan_retries(sk, alive);

			if (tcp_out_of_resources(sk, alive || icsk->icsk_retransmits < retry_until))
				return 1;
		}
	}

	if (icsk->icsk_retransmits >= retry_until) {
		/* Has it gone just too far? */
		tcp_write_err(sk);
		return 1;
	}
	return 0;
}

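/*
 * Delayed ACK timer: fires when an ACK has been deferred longer than the
 * ATO.  It also drains any segments left on the prequeue when the user
 * task failed to process them in time.
 */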
static void tcp_delack_timer(unsigned long data)
{
	struct sock *sk = (struct sock*)data;
	struct tcp_sock *tp = tcp_sk(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);

	bh_lock_sock(sk);
	if (sock_owned_by_user(sk)) {
		/* Try again later. */
		icsk->icsk_ack.blocked = 1;
		NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKLOCKED);
		sk_reset_timer(sk, &icsk->icsk_delack_timer, jiffies + TCP_DELACK_MIN);
		goto out_unlock;
	}

	sk_stream_mem_reclaim(sk);

	if (sk->sk_state == TCP_CLOSE || !(icsk->icsk_ack.pending & ICSK_ACK_TIMER))
		goto out;

	if (time_after(icsk->icsk_ack.timeout, jiffies)) {
		sk_reset_timer(sk, &icsk->icsk_delack_timer, icsk->icsk_ack.timeout);
		goto out;
	}
	icsk->icsk_ack.pending &= ~ICSK_ACK_TIMER;

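	/* The user task did not run in time to drain the prequeue, so process
	 * the queued segments here in softirq context instead. */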
	if (!skb_queue_empty(&tp->ucopy.prequeue)) {
		struct sk_buff *skb;

		NET_INC_STATS_BH(LINUX_MIB_TCPSCHEDULERFAILED);

		while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
			sk->sk_backlog_rcv(sk, skb);

		tp->ucopy.memory = 0;
	}

	if (inet_csk_ack_scheduled(sk)) {
		if (!icsk->icsk_ack.pingpong) {
			/* Delayed ACK missed: inflate ATO. */
			icsk->icsk_ack.ato = min(icsk->icsk_ack.ato << 1, icsk->icsk_rto);
		} else {
			/* Delayed ACK missed: leave pingpong mode and
			 * deflate ATO.
			 */
			icsk->icsk_ack.pingpong = 0;
			icsk->icsk_ack.ato      = TCP_ATO_MIN;
		}
		tcp_send_ack(sk);
		NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKS);
	}
	TCP_CHECK_TIMER(sk);

out:
	if (tcp_memory_pressure)
		sk_stream_mem_reclaim(sk);
out_unlock:
	bh_unlock_sock(sk);
	sock_put(sk);
}

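/*
 * Zero window probe timer: keep poking a peer that advertises a zero
 * window so we notice when the window reopens, and give up if the peer
 * stops acknowledging our probes altogether.
 */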
static void tcp_probe_timer(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	int max_probes;

	if (tp->packets_out || !sk->sk_send_head) {
		icsk->icsk_probes_out = 0;
		return;
	}

	/* *WARNING* RFC 1122 forbids this
	 *
	 * It doesn't AFAIK, because we kill the retransmit timer -AK
	 *
	 * FIXME: We ought not to do it; Solaris 2.5 actually lists fixing
	 * this behaviour as a bug fix. [AC]
	 *
	 * Let me explain. icsk_probes_out is zeroed by incoming ACKs
	 * even if they advertise zero window. Hence, the connection is killed only
	 * if we received no ACKs for the normal connection timeout. It is not killed
	 * merely because the window stays zero for some time; the window may stay
	 * zero until armageddon and even later. We are in full accordance
	 * with the RFCs; only the probe timer combines both the retransmission
	 * timeout and the probe timeout in one bottle.			--ANK
	 */
	max_probes = sysctl_tcp_retries2;

	if (sock_flag(sk, SOCK_DEAD)) {
		const int alive = ((icsk->icsk_rto << icsk->icsk_backoff) < TCP_RTO_MAX);

		max_probes = tcp_orphan_retries(sk, alive);

		if (tcp_out_of_resources(sk, alive || icsk->icsk_probes_out <= max_probes))
			return;
	}

	if (icsk->icsk_probes_out > max_probes) {
		tcp_write_err(sk);
	} else {
		/* Only send another probe if we didn't close things up. */
		tcp_send_probe0(sk);
	}
}

/*
 *	The TCP retransmit timer.
 */

static void tcp_retransmit_timer(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);

	if (!tp->packets_out)
		goto out;

	BUG_TRAP(!skb_queue_empty(&sk->sk_write_queue));

	if (!tp->snd_wnd && !sock_flag(sk, SOCK_DEAD) &&
	    !((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))) {
		/* The receiver dastardly shrinks the window. Our retransmits
		 * become zero window probes, but we should not time out this
		 * connection. If the socket is an orphan, time it out;
		 * we cannot allow such beasts to hang around indefinitely.
		 */
#ifdef TCP_DEBUG
		if (net_ratelimit()) {
			struct inet_sock *inet = inet_sk(sk);
			printk(KERN_DEBUG "TCP: Treason uncloaked! Peer %u.%u.%u.%u:%u/%u shrinks window %u:%u. Repaired.\n",
			       NIPQUAD(inet->daddr), htons(inet->dport),
			       inet->num, tp->snd_una, tp->snd_nxt);
		}
#endif
		if (tcp_time_stamp - tp->rcv_tstamp > TCP_RTO_MAX) {
			tcp_write_err(sk);
			goto out;
		}
		tcp_enter_loss(sk, 0);
		tcp_retransmit_skb(sk, skb_peek(&sk->sk_write_queue));
		__sk_dst_reset(sk);
		goto out_reset_timer;
	}

	if (tcp_write_timeout(sk))
		goto out;

	if (icsk->icsk_retransmits == 0) {
		if (icsk->icsk_ca_state == TCP_CA_Disorder ||
		    icsk->icsk_ca_state == TCP_CA_Recovery) {
			if (tp->rx_opt.sack_ok) {
				if (icsk->icsk_ca_state == TCP_CA_Recovery)
					NET_INC_STATS_BH(LINUX_MIB_TCPSACKRECOVERYFAIL);
				else
					NET_INC_STATS_BH(LINUX_MIB_TCPSACKFAILURES);
			} else {
				if (icsk->icsk_ca_state == TCP_CA_Recovery)
					NET_INC_STATS_BH(LINUX_MIB_TCPRENORECOVERYFAIL);
				else
					NET_INC_STATS_BH(LINUX_MIB_TCPRENOFAILURES);
			}
		} else if (icsk->icsk_ca_state == TCP_CA_Loss) {
			NET_INC_STATS_BH(LINUX_MIB_TCPLOSSFAILURES);
		} else {
			NET_INC_STATS_BH(LINUX_MIB_TCPTIMEOUTS);
		}
	}

	if (tcp_use_frto(sk)) {
		tcp_enter_frto(sk);
	} else {
		tcp_enter_loss(sk, 0);
	}

	if (tcp_retransmit_skb(sk, skb_peek(&sk->sk_write_queue)) > 0) {
		/* Retransmission failed because of local congestion,
		 * do not backoff.
		 */
		if (!icsk->icsk_retransmits)
			icsk->icsk_retransmits = 1;
		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
					  min(icsk->icsk_rto, TCP_RESOURCE_PROBE_INTERVAL),
					  TCP_RTO_MAX);
		goto out;
	}

	/* Increase the timeout each time we retransmit.  Note that
	 * we do not increase the rtt estimate.  rto is initialized
	 * from rtt, but increases here.  Jacobson (SIGCOMM 88) suggests
	 * that doubling rto each time is the least we can get away with.
	 * In KA9Q, Karn uses this for the first few times, and then
	 * goes to quadratic.  netBSD doubles, but only goes up to *64,
	 * and clamps at 1 to 64 sec afterwards.  Note that 120 sec is
	 * defined in the protocol as the maximum possible RTT.  I guess
	 * we'll have to use something other than TCP to talk to the
	 * University of Mars.
	 *
	 * PAWS allows us longer timeouts and large windows, so once
	 * implemented ftp to mars will work nicely. We will have to fix
	 * the 120 second clamps though!
	 */
	icsk->icsk_backoff++;
	icsk->icsk_retransmits++;

out_reset_timer:
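	/* Exponential backoff: the RTO doubles on each timeout (for example
	 * 3, 6, 12, 24, ... seconds when starting from the 3 s initial
	 * timeout) and is clamped at TCP_RTO_MAX (120 s). */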
	icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX);
	if (icsk->icsk_retransmits > sysctl_tcp_retries1)
		__sk_dst_reset(sk);

out:;
}

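/*
 * Common write timer callback: dispatches to the retransmit or zero window
 * probe handler depending on which event is pending.
 */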
static void tcp_write_timer(unsigned long data)
{
	struct sock *sk = (struct sock*)data;
	struct inet_connection_sock *icsk = inet_csk(sk);
	int event;

	bh_lock_sock(sk);
	if (sock_owned_by_user(sk)) {
		/* Try again later */
		sk_reset_timer(sk, &icsk->icsk_retransmit_timer, jiffies + (HZ / 20));
		goto out_unlock;
	}

	if (sk->sk_state == TCP_CLOSE || !icsk->icsk_pending)
		goto out;

	if (time_after(icsk->icsk_timeout, jiffies)) {
		sk_reset_timer(sk, &icsk->icsk_retransmit_timer, icsk->icsk_timeout);
		goto out;
	}

	event = icsk->icsk_pending;
	icsk->icsk_pending = 0;

	switch (event) {
	case ICSK_TIME_RETRANS:
		tcp_retransmit_timer(sk);
		break;
	case ICSK_TIME_PROBE0:
		tcp_probe_timer(sk);
		break;
	}
	TCP_CHECK_TIMER(sk);

out:
	sk_stream_mem_reclaim(sk);
out_unlock:
	bh_unlock_sock(sk);
	sock_put(sk);
}

/*
 *	Timer for listening sockets
 */

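/*
 * Prune the SYN queue of a listening socket: retransmit SYN-ACKs that are
 * due and drop connection requests that have exceeded their retry limit.
 */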
static void tcp_synack_timer(struct sock *sk)
{
	inet_csk_reqsk_queue_prune(sk, TCP_SYNQ_INTERVAL,
				   TCP_TIMEOUT_INIT, TCP_RTO_MAX);
}

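/*
 * Called when the SO_KEEPALIVE socket option is changed: arm or disarm the
 * keepalive timer accordingly.
 */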
void tcp_set_keepalive(struct sock *sk, int val)
{
	if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))
		return;

	if (val && !sock_flag(sk, SOCK_KEEPOPEN))
		inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tcp_sk(sk)));
	else if (!val)
		inet_csk_delete_keepalive_timer(sk);
}

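/*
 * The keepalive timer serves three roles: it drives SYN-ACK retransmission
 * for listening sockets, times out orphaned FIN_WAIT2 sockets, and sends
 * keepalive probes on otherwise idle established connections.
 */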
static void tcp_keepalive_timer (unsigned long data)
{
	struct sock *sk = (struct sock *) data;
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	__u32 elapsed;

	/* Only process if socket is not in use. */
	bh_lock_sock(sk);
	if (sock_owned_by_user(sk)) {
		/* Try again later. */
		inet_csk_reset_keepalive_timer (sk, HZ/20);
		goto out;
	}

	if (sk->sk_state == TCP_LISTEN) {
		tcp_synack_timer(sk);
		goto out;
	}

	if (sk->sk_state == TCP_FIN_WAIT2 && sock_flag(sk, SOCK_DEAD)) {
		if (tp->linger2 >= 0) {
			const int tmo = tcp_fin_time(sk) - TCP_TIMEWAIT_LEN;

			if (tmo > 0) {
				tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
				goto out;
			}
		}
		tcp_send_active_reset(sk, GFP_ATOMIC);
		goto death;
	}

	if (!sock_flag(sk, SOCK_KEEPOPEN) || sk->sk_state == TCP_CLOSE)
		goto out;

	elapsed = keepalive_time_when(tp);

	/* It is alive without keepalive 8) */
	if (tp->packets_out || sk->sk_send_head)
		goto resched;

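	/* Idle time since we last heard from the peer. */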
	elapsed = tcp_time_stamp - tp->rcv_tstamp;

	if (elapsed >= keepalive_time_when(tp)) {
		if ((!tp->keepalive_probes && icsk->icsk_probes_out >= sysctl_tcp_keepalive_probes) ||
		     (tp->keepalive_probes && icsk->icsk_probes_out >= tp->keepalive_probes)) {
			tcp_send_active_reset(sk, GFP_ATOMIC);
			tcp_write_err(sk);
			goto out;
		}
		if (tcp_write_wakeup(sk) <= 0) {
			icsk->icsk_probes_out++;
			elapsed = keepalive_intvl_when(tp);
		} else {
			/* If keepalive was lost due to local congestion,
			 * try harder.
			 */
			elapsed = TCP_RESOURCE_PROBE_INTERVAL;
		}
	} else {
		/* Fire again at tp->rcv_tstamp + keepalive_time_when(tp). */
		elapsed = keepalive_time_when(tp) - elapsed;
	}

	TCP_CHECK_TIMER(sk);
	sk_stream_mem_reclaim(sk);

resched:
	inet_csk_reset_keepalive_timer (sk, elapsed);
	goto out;

death:
	tcp_done(sk);

out:
	bh_unlock_sock(sk);
	sock_put(sk);
}
529