/*
 * ip_vs_proto_tcp.c:	TCP load balancing support for IPVS
 *
 * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
 *              Julian Anastasov <ja@ssi.bg>
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 *
 * Changes:     Hans Schillstrom <hans.schillstrom@ericsson.com>
 *
 *              Network name space (netns) aware.
 *              Global data moved to netns, i.e. struct netns_ipvs.
 *              The tcp_timeouts table now has a per-netns copy, kept in the
 *              per-protocol ip_vs_proto_data and managed per netns.
 */

#define KMSG_COMPONENT "IPVS"
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt

#include <linux/kernel.h>
#include <linux/ip.h>
#include <linux/tcp.h>                  /* for tcphdr */
#include <net/ip.h>
#include <net/tcp.h>                    /* for csum_tcpudp_magic */
#include <net/ip6_checksum.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>

#include <net/ip_vs.h>

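/*
 *	Decide whether a new connection should be scheduled for this packet.
 *	Returns 0 with *verdict set when IPVS has handled the packet itself,
 *	and 1 to let the caller continue (NF_ACCEPT).  Only SYN packets (or
 *	any non-RST packet when sloppy_tcp is enabled) may create a new
 *	connection entry.
 */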
static int
tcp_conn_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb,
		  struct ip_vs_proto_data *pd,
		  int *verdict, struct ip_vs_conn **cpp,
		  struct ip_vs_iphdr *iph)
{
	struct ip_vs_service *svc;
	struct tcphdr _tcph, *th;
	__be16 _ports[2], *ports = NULL;

	/* In the event of ICMP, we're only guaranteed to have the first 8
	 * bytes of the transport header, so we only check the full TCP
	 * header for non-ICMP packets.
	 */
	if (likely(!ip_vs_iph_icmp(iph))) {
		th = skb_header_pointer(skb, iph->len, sizeof(_tcph), &_tcph);
		if (th) {
			if (th->rst || !(sysctl_sloppy_tcp(ipvs) || th->syn))
				return 1;
			ports = &th->source;
		}
	} else {
		ports = skb_header_pointer(
			skb, iph->len, sizeof(_ports), &_ports);
	}

	if (!ports) {
		*verdict = NF_DROP;
		return 0;
	}

	/* No !th->ack check to allow scheduling on SYN+ACK for Active FTP */
	rcu_read_lock();

	if (likely(!ip_vs_iph_inverse(iph)))
		svc = ip_vs_service_find(ipvs, af, skb->mark, iph->protocol,
					 &iph->daddr, ports[1]);
	else
		svc = ip_vs_service_find(ipvs, af, skb->mark, iph->protocol,
					 &iph->saddr, ports[0]);

	if (svc) {
		int ignored;

		if (ip_vs_todrop(ipvs)) {
			/*
			 * It seems that we are very loaded.
			 * We have to drop this packet :(
			 */
			rcu_read_unlock();
			*verdict = NF_DROP;
			return 0;
		}

		/*
		 * Let the virtual server select a real server for the
		 * incoming connection, and create a connection entry.
		 */
		*cpp = ip_vs_schedule(svc, skb, pd, &ignored, iph);
		if (!*cpp && ignored <= 0) {
			if (!ignored)
				*verdict = ip_vs_leave(svc, skb, pd, iph);
			else
				*verdict = NF_DROP;
			rcu_read_unlock();
			return 0;
		}
	}
	rcu_read_unlock();
	/* NF_ACCEPT */
	return 1;
}


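/*
 *	Incremental checksum update (cf. RFC 1624): fold the difference
 *	between the old and new address/port into the existing TCP checksum
 *	instead of recomputing it over the whole segment.
 */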
static inline void
tcp_fast_csum_update(int af, struct tcphdr *tcph,
		     const union nf_inet_addr *oldip,
		     const union nf_inet_addr *newip,
		     __be16 oldport, __be16 newport)
{
#ifdef CONFIG_IP_VS_IPV6
	if (af == AF_INET6)
		tcph->check =
			csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
					 ip_vs_check_diff2(oldport, newport,
						~csum_unfold(tcph->check))));
	else
#endif
	tcph->check =
		csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
				 ip_vs_check_diff2(oldport, newport,
						~csum_unfold(tcph->check))));
}


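/*
 *	For CHECKSUM_PARTIAL skbs only the pseudo-header checksum is kept in
 *	tcph->check (the rest is finished later in hardware or by
 *	skb_checksum_help), so only the address and length changes need to be
 *	folded in here.
 */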
static inline void
tcp_partial_csum_update(int af, struct tcphdr *tcph,
			const union nf_inet_addr *oldip,
			const union nf_inet_addr *newip,
			__be16 oldlen, __be16 newlen)
{
#ifdef CONFIG_IP_VS_IPV6
	if (af == AF_INET6)
		tcph->check =
			~csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
					 ip_vs_check_diff2(oldlen, newlen,
						csum_unfold(tcph->check))));
	else
#endif
	tcph->check =
		~csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
				ip_vs_check_diff2(oldlen, newlen,
						csum_unfold(tcph->check))));
}


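/*
 *	SNAT handler (reply direction): rewrite the TCP source port to the
 *	virtual port and adjust the checksum accordingly, invoking the
 *	application helper (if any) before mangling.  Returns 0 to drop the
 *	packet, 1 on success.
 */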
static int
tcp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
		 struct ip_vs_conn *cp, struct ip_vs_iphdr *iph)
{
	struct tcphdr *tcph;
	unsigned int tcphoff = iph->len;
	int oldlen;
	int payload_csum = 0;

#ifdef CONFIG_IP_VS_IPV6
	if (cp->af == AF_INET6 && iph->fragoffs)
		return 1;
#endif
	oldlen = skb->len - tcphoff;

	/* csum_check requires unshared skb */
	if (!skb_make_writable(skb, tcphoff + sizeof(*tcph)))
		return 0;

	if (unlikely(cp->app != NULL)) {
		int ret;

		/* Some checks before mangling */
		if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
			return 0;

		/* Call application helper if needed */
		if (!(ret = ip_vs_app_pkt_out(cp, skb)))
			return 0;
		/* ret=2: csum update is needed after payload mangling */
		if (ret == 1)
			oldlen = skb->len - tcphoff;
		else
			payload_csum = 1;
	}

	tcph = (void *)skb_network_header(skb) + tcphoff;
	tcph->source = cp->vport;

	/* Adjust TCP checksums */
	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		tcp_partial_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr,
					htons(oldlen),
					htons(skb->len - tcphoff));
	} else if (!payload_csum) {
		/* Only port and addr are changed, do fast csum update */
		tcp_fast_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr,
				     cp->dport, cp->vport);
		if (skb->ip_summed == CHECKSUM_COMPLETE)
			skb->ip_summed = (cp->app && pp->csum_check) ?
					 CHECKSUM_UNNECESSARY : CHECKSUM_NONE;
	} else {
		/* full checksum calculation */
		tcph->check = 0;
		skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
#ifdef CONFIG_IP_VS_IPV6
		if (cp->af == AF_INET6)
			tcph->check = csum_ipv6_magic(&cp->vaddr.in6,
						      &cp->caddr.in6,
						      skb->len - tcphoff,
						      cp->protocol, skb->csum);
		else
#endif
			tcph->check = csum_tcpudp_magic(cp->vaddr.ip,
							cp->caddr.ip,
							skb->len - tcphoff,
							cp->protocol,
							skb->csum);
		skb->ip_summed = CHECKSUM_UNNECESSARY;

		IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n",
			  pp->name, tcph->check,
			  (char *)&(tcph->check) - (char *)tcph);
	}
	return 1;
}


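/*
 *	DNAT handler (request direction): rewrite the TCP destination port to
 *	the real server's port and adjust the checksum, invoking the
 *	application helper (if any) before mangling.  Returns 0 to drop the
 *	packet, 1 on success.
 */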
static int
tcp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
		 struct ip_vs_conn *cp, struct ip_vs_iphdr *iph)
{
	struct tcphdr *tcph;
	unsigned int tcphoff = iph->len;
	int oldlen;
	int payload_csum = 0;

#ifdef CONFIG_IP_VS_IPV6
	if (cp->af == AF_INET6 && iph->fragoffs)
		return 1;
#endif
	oldlen = skb->len - tcphoff;

	/* csum_check requires unshared skb */
	if (!skb_make_writable(skb, tcphoff + sizeof(*tcph)))
		return 0;

	if (unlikely(cp->app != NULL)) {
		int ret;

		/* Some checks before mangling */
		if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
			return 0;

		/*
		 *	Attempt ip_vs_app call.
		 *	It will fix ip_vs_conn and iph ack_seq stuff
		 */
		if (!(ret = ip_vs_app_pkt_in(cp, skb)))
			return 0;
		/* ret=2: csum update is needed after payload mangling */
		if (ret == 1)
			oldlen = skb->len - tcphoff;
		else
			payload_csum = 1;
	}

	tcph = (void *)skb_network_header(skb) + tcphoff;
	tcph->dest = cp->dport;

	/*
	 *	Adjust TCP checksums
	 */
	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		tcp_partial_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr,
					htons(oldlen),
					htons(skb->len - tcphoff));
	} else if (!payload_csum) {
		/* Only port and addr are changed, do fast csum update */
		tcp_fast_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr,
				     cp->vport, cp->dport);
		if (skb->ip_summed == CHECKSUM_COMPLETE)
			skb->ip_summed = (cp->app && pp->csum_check) ?
					 CHECKSUM_UNNECESSARY : CHECKSUM_NONE;
	} else {
		/* full checksum calculation */
		tcph->check = 0;
		skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
#ifdef CONFIG_IP_VS_IPV6
		if (cp->af == AF_INET6)
			tcph->check = csum_ipv6_magic(&cp->caddr.in6,
						      &cp->daddr.in6,
						      skb->len - tcphoff,
						      cp->protocol, skb->csum);
		else
#endif
			tcph->check = csum_tcpudp_magic(cp->caddr.ip,
							cp->daddr.ip,
							skb->len - tcphoff,
							cp->protocol,
							skb->csum);
		skb->ip_summed = CHECKSUM_UNNECESSARY;
	}
	return 1;
}


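/*
 *	Verify the TCP checksum of a packet.  For CHECKSUM_NONE the checksum
 *	over the TCP header and payload is computed in software first;
 *	CHECKSUM_UNNECESSARY and CHECKSUM_PARTIAL packets are trusted as-is.
 *	Returns 1 if the checksum is valid, 0 otherwise.
 */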
static int
tcp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
{
	unsigned int tcphoff;

#ifdef CONFIG_IP_VS_IPV6
	if (af == AF_INET6)
		tcphoff = sizeof(struct ipv6hdr);
	else
#endif
		tcphoff = ip_hdrlen(skb);

	switch (skb->ip_summed) {
	case CHECKSUM_NONE:
		skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
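		/* fall through */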
	case CHECKSUM_COMPLETE:
#ifdef CONFIG_IP_VS_IPV6
		if (af == AF_INET6) {
			if (csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
					    &ipv6_hdr(skb)->daddr,
					    skb->len - tcphoff,
					    ipv6_hdr(skb)->nexthdr,
					    skb->csum)) {
				IP_VS_DBG_RL_PKT(0, af, pp, skb, 0,
						 "Failed checksum for");
				return 0;
			}
		} else
#endif
			if (csum_tcpudp_magic(ip_hdr(skb)->saddr,
					      ip_hdr(skb)->daddr,
					      skb->len - tcphoff,
					      ip_hdr(skb)->protocol,
					      skb->csum)) {
				IP_VS_DBG_RL_PKT(0, af, pp, skb, 0,
						 "Failed checksum for");
				return 0;
			}
		break;
	default:
		/* No need to checksum. */
		break;
	}

	return 1;
}


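/*
 *	The state transition tables below are indexed as
 *	tcp_state_table[state_off + event], where state_off selects the
 *	direction block (INPUT, OUTPUT or INPUT_ONLY, four rows each) and
 *	event is the flag-derived row from tcp_state_idx() (SYN, FIN, ACK,
 *	RST).
 */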
#define TCP_DIR_INPUT		0
#define TCP_DIR_OUTPUT		4
#define TCP_DIR_INPUT_ONLY	8

static const int tcp_state_off[IP_VS_DIR_LAST] = {
	[IP_VS_DIR_INPUT]		=	TCP_DIR_INPUT,
	[IP_VS_DIR_OUTPUT]		=	TCP_DIR_OUTPUT,
	[IP_VS_DIR_INPUT_ONLY]		=	TCP_DIR_INPUT_ONLY,
};

/*
 *	Timeout table[state]
 */
static const int tcp_timeouts[IP_VS_TCP_S_LAST+1] = {
	[IP_VS_TCP_S_NONE]		=	2*HZ,
	[IP_VS_TCP_S_ESTABLISHED]	=	15*60*HZ,
	[IP_VS_TCP_S_SYN_SENT]		=	2*60*HZ,
	[IP_VS_TCP_S_SYN_RECV]		=	1*60*HZ,
	[IP_VS_TCP_S_FIN_WAIT]		=	2*60*HZ,
	[IP_VS_TCP_S_TIME_WAIT]		=	2*60*HZ,
	[IP_VS_TCP_S_CLOSE]		=	10*HZ,
	[IP_VS_TCP_S_CLOSE_WAIT]	=	60*HZ,
	[IP_VS_TCP_S_LAST_ACK]		=	30*HZ,
	[IP_VS_TCP_S_LISTEN]		=	2*60*HZ,
	[IP_VS_TCP_S_SYNACK]		=	120*HZ,
	[IP_VS_TCP_S_LAST]		=	2*HZ,
};

static const char *const tcp_state_name_table[IP_VS_TCP_S_LAST+1] = {
	[IP_VS_TCP_S_NONE]		=	"NONE",
	[IP_VS_TCP_S_ESTABLISHED]	=	"ESTABLISHED",
	[IP_VS_TCP_S_SYN_SENT]		=	"SYN_SENT",
	[IP_VS_TCP_S_SYN_RECV]		=	"SYN_RECV",
	[IP_VS_TCP_S_FIN_WAIT]		=	"FIN_WAIT",
	[IP_VS_TCP_S_TIME_WAIT]		=	"TIME_WAIT",
	[IP_VS_TCP_S_CLOSE]		=	"CLOSE",
	[IP_VS_TCP_S_CLOSE_WAIT]	=	"CLOSE_WAIT",
	[IP_VS_TCP_S_LAST_ACK]		=	"LAST_ACK",
	[IP_VS_TCP_S_LISTEN]		=	"LISTEN",
	[IP_VS_TCP_S_SYNACK]		=	"SYNACK",
	[IP_VS_TCP_S_LAST]		=	"BUG!",
};

#define sNO IP_VS_TCP_S_NONE
#define sES IP_VS_TCP_S_ESTABLISHED
#define sSS IP_VS_TCP_S_SYN_SENT
#define sSR IP_VS_TCP_S_SYN_RECV
#define sFW IP_VS_TCP_S_FIN_WAIT
#define sTW IP_VS_TCP_S_TIME_WAIT
#define sCL IP_VS_TCP_S_CLOSE
#define sCW IP_VS_TCP_S_CLOSE_WAIT
#define sLA IP_VS_TCP_S_LAST_ACK
#define sLI IP_VS_TCP_S_LISTEN
#define sSA IP_VS_TCP_S_SYNACK

struct tcp_states_t {
	int next_state[IP_VS_TCP_S_LAST];
};

static const char *tcp_state_name(int state)
{
	if (state >= IP_VS_TCP_S_LAST)
		return "ERR!";
	return tcp_state_name_table[state] ? tcp_state_name_table[state] : "?";
}

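/*
 *	State transition tables: rows are events (syn, fin, ack, rst) within
 *	each direction block, columns are the current connection state.  The
 *	tcp_states_dos variant is installed by tcp_timeout_change() when the
 *	secure_tcp defence is enabled and uses stricter transitions (note the
 *	separate sSA handling) as a DoS mitigation.
 */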
static struct tcp_states_t tcp_states[] = {
/*	INPUT */
/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/
/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sTW }},
/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sSR }},

/*	OUTPUT */
/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/
/*syn*/ {{sSS, sES, sSS, sSR, sSS, sSS, sSS, sSS, sSS, sLI, sSR }},
/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }},
/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }},
/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }},

/*	INPUT-ONLY */
/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/
/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
};

static struct tcp_states_t tcp_states_dos[] = {
/*	INPUT */
/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/
/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSA }},
/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sSA }},
/*ack*/ {{sES, sES, sSS, sSR, sFW, sTW, sCL, sCW, sCL, sLI, sSA }},
/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},

/*	OUTPUT */
/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/
/*syn*/ {{sSS, sES, sSS, sSA, sSS, sSS, sSS, sSS, sSS, sLI, sSA }},
/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }},
/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }},
/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }},

/*	INPUT-ONLY */
/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/
/*syn*/ {{sSA, sES, sES, sSR, sSA, sSA, sSA, sSA, sSA, sSA, sSA }},
/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
};

static void tcp_timeout_change(struct ip_vs_proto_data *pd, int flags)
{
	int on = (flags & 1);		/* secure_tcp */

	/*
	** FIXME: change secure_tcp to independent sysctl var
	** or make it per-service or per-app because it is valid
	** for most if not for all of the applications. Something
	** like "capabilities" (flags) for each object.
	*/
	pd->tcp_state_table = (on ? tcp_states_dos : tcp_states);
}

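/*
 *	Map TCP flags to a row index in the state tables: SYN=0, FIN=1,
 *	ACK=2, RST=3 (RST checked first), or -1 if none of them is set.
 */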
static inline int tcp_state_idx(struct tcphdr *th)
{
	if (th->rst)
		return 3;
	if (th->syn)
		return 0;
	if (th->fin)
		return 1;
	if (th->ack)
		return 2;
	return -1;
}

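/*
 *	Look up the connection's next state in the transition table and apply
 *	it: update the destination's active/inactive connection counters when
 *	the connection enters or leaves ESTABLISHED, and reload cp->timeout
 *	from the per-netns timeout table.  Called with cp->lock held.
 */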
static inline void
set_tcp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp,
	      int direction, struct tcphdr *th)
{
	int state_idx;
	int new_state = IP_VS_TCP_S_CLOSE;
	int state_off = tcp_state_off[direction];

	/*
	 *    Update state offset to INPUT_ONLY if necessary
	 *    or delete NO_OUTPUT flag if output packet detected
	 */
	if (cp->flags & IP_VS_CONN_F_NOOUTPUT) {
		if (state_off == TCP_DIR_OUTPUT)
			cp->flags &= ~IP_VS_CONN_F_NOOUTPUT;
		else
			state_off = TCP_DIR_INPUT_ONLY;
	}

	if ((state_idx = tcp_state_idx(th)) < 0) {
		IP_VS_DBG(8, "tcp_state_idx=%d!!!\n", state_idx);
		goto tcp_state_out;
	}

	new_state =
		pd->tcp_state_table[state_off+state_idx].next_state[cp->state];

  tcp_state_out:
	if (new_state != cp->state) {
		struct ip_vs_dest *dest = cp->dest;

		IP_VS_DBG_BUF(8, "%s %s [%c%c%c%c] %s:%d->"
			      "%s:%d state: %s->%s conn->refcnt:%d\n",
			      pd->pp->name,
			      ((state_off == TCP_DIR_OUTPUT) ?
			       "output " : "input "),
			      th->syn ? 'S' : '.',
			      th->fin ? 'F' : '.',
			      th->ack ? 'A' : '.',
			      th->rst ? 'R' : '.',
			      IP_VS_DBG_ADDR(cp->daf, &cp->daddr),
			      ntohs(cp->dport),
			      IP_VS_DBG_ADDR(cp->af, &cp->caddr),
			      ntohs(cp->cport),
			      tcp_state_name(cp->state),
			      tcp_state_name(new_state),
			      atomic_read(&cp->refcnt));

		if (dest) {
			if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
			    (new_state != IP_VS_TCP_S_ESTABLISHED)) {
				atomic_dec(&dest->activeconns);
				atomic_inc(&dest->inactconns);
				cp->flags |= IP_VS_CONN_F_INACTIVE;
			} else if ((cp->flags & IP_VS_CONN_F_INACTIVE) &&
				   (new_state == IP_VS_TCP_S_ESTABLISHED)) {
				atomic_inc(&dest->activeconns);
				atomic_dec(&dest->inactconns);
				cp->flags &= ~IP_VS_CONN_F_INACTIVE;
			}
		}
	}

	if (likely(pd))
		cp->timeout = pd->timeout_table[cp->state = new_state];
	else	/* What to do ? */
		cp->timeout = tcp_timeouts[cp->state = new_state];
}

/*
 *	Handle state transitions
 */
static void
tcp_state_transition(struct ip_vs_conn *cp, int direction,
		     const struct sk_buff *skb,
		     struct ip_vs_proto_data *pd)
{
	struct tcphdr _tcph, *th;

#ifdef CONFIG_IP_VS_IPV6
	int ihl = cp->af == AF_INET ? ip_hdrlen(skb) : sizeof(struct ipv6hdr);
#else
	int ihl = ip_hdrlen(skb);
#endif

	th = skb_header_pointer(skb, ihl, sizeof(_tcph), &_tcph);
	if (th == NULL)
		return;

	spin_lock_bh(&cp->lock);
	set_tcp_state(pd, cp, direction, th);
	spin_unlock_bh(&cp->lock);
}

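/*
 *	Hash a TCP port into the per-netns tcp_apps table by XOR-folding the
 *	port with itself shifted right by TCP_APP_TAB_BITS and masking with
 *	TCP_APP_TAB_MASK.
 */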
static inline __u16 tcp_app_hashkey(__be16 port)
{
	return (((__force u16)port >> TCP_APP_TAB_BITS) ^ (__force u16)port)
		& TCP_APP_TAB_MASK;
}


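/*
 *	Register a TCP application incarnation (e.g. the FTP helper) in the
 *	per-netns hash table, keyed by its port.  Returns -EEXIST if another
 *	incarnation is already registered on the same port.
 */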
static int tcp_register_app(struct netns_ipvs *ipvs, struct ip_vs_app *inc)
{
	struct ip_vs_app *i;
	__u16 hash;
	__be16 port = inc->port;
	int ret = 0;
	struct ip_vs_proto_data *pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP);

	hash = tcp_app_hashkey(port);

	list_for_each_entry(i, &ipvs->tcp_apps[hash], p_list) {
		if (i->port == port) {
			ret = -EEXIST;
			goto out;
		}
	}
	list_add_rcu(&inc->p_list, &ipvs->tcp_apps[hash]);
	atomic_inc(&pd->appcnt);

  out:
	return ret;
}


static void
tcp_unregister_app(struct netns_ipvs *ipvs, struct ip_vs_app *inc)
{
	struct ip_vs_proto_data *pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP);

	atomic_dec(&pd->appcnt);
	list_del_rcu(&inc->p_list);
}


static int
tcp_app_conn_bind(struct ip_vs_conn *cp)
{
	struct netns_ipvs *ipvs = cp->ipvs;
	int hash;
	struct ip_vs_app *inc;
	int result = 0;

	/* Default binding: bind app only for NAT */
	if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
		return 0;

	/* Lookup application incarnations and bind the right one */
	hash = tcp_app_hashkey(cp->vport);

	rcu_read_lock();
	list_for_each_entry_rcu(inc, &ipvs->tcp_apps[hash], p_list) {
		if (inc->port == cp->vport) {
			if (unlikely(!ip_vs_app_inc_get(inc)))
				break;
			rcu_read_unlock();

			IP_VS_DBG_BUF(9, "%s(): Binding conn %s:%u->"
				      "%s:%u to app %s on port %u\n",
				      __func__,
				      IP_VS_DBG_ADDR(cp->af, &cp->caddr),
				      ntohs(cp->cport),
				      IP_VS_DBG_ADDR(cp->af, &cp->vaddr),
				      ntohs(cp->vport),
				      inc->name, ntohs(inc->port));

			cp->app = inc;
			if (inc->init_conn)
				result = inc->init_conn(inc, cp);
			goto out;
		}
	}
	rcu_read_unlock();

  out:
	return result;
}


/*
 *	Set LISTEN timeout. (ip_vs_conn_put will set up the timer)
 */
void ip_vs_tcp_conn_listen(struct ip_vs_conn *cp)
{
	struct ip_vs_proto_data *pd = ip_vs_proto_data_get(cp->ipvs, IPPROTO_TCP);

	spin_lock_bh(&cp->lock);
	cp->state = IP_VS_TCP_S_LISTEN;
	cp->timeout = (pd ? pd->timeout_table[IP_VS_TCP_S_LISTEN]
			   : tcp_timeouts[IP_VS_TCP_S_LISTEN]);
	spin_unlock_bh(&cp->lock);
}

/* ---------------------------------------------
 *   Timeouts are per netns now.
 * ---------------------------------------------
 */
static int __ip_vs_tcp_init(struct netns_ipvs *ipvs, struct ip_vs_proto_data *pd)
{
	ip_vs_init_hash_table(ipvs->tcp_apps, TCP_APP_TAB_SIZE);
	pd->timeout_table = ip_vs_create_timeout_table((int *)tcp_timeouts,
							sizeof(tcp_timeouts));
	if (!pd->timeout_table)
		return -ENOMEM;
	pd->tcp_state_table = tcp_states;
	return 0;
}

static void __ip_vs_tcp_exit(struct netns_ipvs *ipvs, struct ip_vs_proto_data *pd)
{
	kfree(pd->timeout_table);
}


struct ip_vs_protocol ip_vs_protocol_tcp = {
	.name =			"TCP",
	.protocol =		IPPROTO_TCP,
	.num_states =		IP_VS_TCP_S_LAST,
	.dont_defrag =		0,
	.init =			NULL,
	.exit =			NULL,
	.init_netns =		__ip_vs_tcp_init,
	.exit_netns =		__ip_vs_tcp_exit,
	.register_app =		tcp_register_app,
	.unregister_app =	tcp_unregister_app,
	.conn_schedule =	tcp_conn_schedule,
	.conn_in_get =		ip_vs_conn_in_get_proto,
	.conn_out_get =		ip_vs_conn_out_get_proto,
	.snat_handler =		tcp_snat_handler,
	.dnat_handler =		tcp_dnat_handler,
	.csum_check =		tcp_csum_check,
	.state_name =		tcp_state_name,
	.state_transition =	tcp_state_transition,
	.app_conn_bind =	tcp_app_conn_bind,
	.debug_packet =		ip_vs_tcpudp_debug_packet,
	.timeout_change =	tcp_timeout_change,
};