xref: /openbmc/linux/net/netfilter/ipvs/ip_vs_proto_tcp.c (revision cbecf716ca618fd44feda6bd9a64a8179d031fc5)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * ip_vs_proto_tcp.c:	TCP load balancing support for IPVS
4  *
5  * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
6  *              Julian Anastasov <ja@ssi.bg>
7  *
8  * Changes:     Hans Schillstrom <hans.schillstrom@ericsson.com>
9  *
10  *              Network name space (netns) aware.
11  *              Global data moved to netns i.e struct netns_ipvs
12  *              tcp_timeouts table has copy per netns in a hash table per
13  *              protocol ip_vs_proto_data and is handled by netns
14  */
15 
16 #define KMSG_COMPONENT "IPVS"
17 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
18 
19 #include <linux/kernel.h>
20 #include <linux/ip.h>
21 #include <linux/tcp.h>                  /* for tcphdr */
22 #include <net/ip.h>
23 #include <net/tcp.h>                    /* for csum_tcpudp_magic */
24 #include <net/ip6_checksum.h>
25 #include <linux/netfilter.h>
26 #include <linux/netfilter_ipv4.h>
27 #include <linux/indirect_call_wrapper.h>
28 
29 #include <net/ip_vs.h>
30 
31 static int
32 tcp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp);
33 
34 static int
tcp_conn_schedule(struct netns_ipvs * ipvs,int af,struct sk_buff * skb,struct ip_vs_proto_data * pd,int * verdict,struct ip_vs_conn ** cpp,struct ip_vs_iphdr * iph)35 tcp_conn_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb,
36 		  struct ip_vs_proto_data *pd,
37 		  int *verdict, struct ip_vs_conn **cpp,
38 		  struct ip_vs_iphdr *iph)
39 {
40 	struct ip_vs_service *svc;
41 	struct tcphdr _tcph, *th;
42 	__be16 _ports[2], *ports = NULL;
43 
44 	/* In the event of icmp, we're only guaranteed to have the first 8
45 	 * bytes of the transport header, so we only check the rest of the
46 	 * TCP packet for non-ICMP packets
47 	 */
48 	if (likely(!ip_vs_iph_icmp(iph))) {
49 		th = skb_header_pointer(skb, iph->len, sizeof(_tcph), &_tcph);
50 		if (th) {
51 			if (th->rst || !(sysctl_sloppy_tcp(ipvs) || th->syn))
52 				return 1;
53 			ports = &th->source;
54 		}
55 	} else {
56 		ports = skb_header_pointer(
57 			skb, iph->len, sizeof(_ports), &_ports);
58 	}
59 
60 	if (!ports) {
61 		*verdict = NF_DROP;
62 		return 0;
63 	}
64 
65 	/* No !th->ack check to allow scheduling on SYN+ACK for Active FTP */
66 
67 	if (likely(!ip_vs_iph_inverse(iph)))
68 		svc = ip_vs_service_find(ipvs, af, skb->mark, iph->protocol,
69 					 &iph->daddr, ports[1]);
70 	else
71 		svc = ip_vs_service_find(ipvs, af, skb->mark, iph->protocol,
72 					 &iph->saddr, ports[0]);
73 
74 	if (svc) {
75 		int ignored;
76 
77 		if (ip_vs_todrop(ipvs)) {
78 			/*
79 			 * It seems that we are very loaded.
80 			 * We have to drop this packet :(
81 			 */
82 			*verdict = NF_DROP;
83 			return 0;
84 		}
85 
86 		/*
87 		 * Let the virtual server select a real server for the
88 		 * incoming connection, and create a connection entry.
89 		 */
90 		*cpp = ip_vs_schedule(svc, skb, pd, &ignored, iph);
91 		if (!*cpp && ignored <= 0) {
92 			if (!ignored)
93 				*verdict = ip_vs_leave(svc, skb, pd, iph);
94 			else
95 				*verdict = NF_DROP;
96 			return 0;
97 		}
98 	}
99 	/* NF_ACCEPT */
100 	return 1;
101 }
102 
103 
104 static inline void
tcp_fast_csum_update(int af,struct tcphdr * tcph,const union nf_inet_addr * oldip,const union nf_inet_addr * newip,__be16 oldport,__be16 newport)105 tcp_fast_csum_update(int af, struct tcphdr *tcph,
106 		     const union nf_inet_addr *oldip,
107 		     const union nf_inet_addr *newip,
108 		     __be16 oldport, __be16 newport)
109 {
110 #ifdef CONFIG_IP_VS_IPV6
111 	if (af == AF_INET6)
112 		tcph->check =
113 			csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
114 					 ip_vs_check_diff2(oldport, newport,
115 						~csum_unfold(tcph->check))));
116 	else
117 #endif
118 	tcph->check =
119 		csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
120 				 ip_vs_check_diff2(oldport, newport,
121 						~csum_unfold(tcph->check))));
122 }
123 
124 
125 static inline void
tcp_partial_csum_update(int af,struct tcphdr * tcph,const union nf_inet_addr * oldip,const union nf_inet_addr * newip,__be16 oldlen,__be16 newlen)126 tcp_partial_csum_update(int af, struct tcphdr *tcph,
127 		     const union nf_inet_addr *oldip,
128 		     const union nf_inet_addr *newip,
129 		     __be16 oldlen, __be16 newlen)
130 {
131 #ifdef CONFIG_IP_VS_IPV6
132 	if (af == AF_INET6)
133 		tcph->check =
134 			~csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
135 					 ip_vs_check_diff2(oldlen, newlen,
136 						csum_unfold(tcph->check))));
137 	else
138 #endif
139 	tcph->check =
140 		~csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
141 				ip_vs_check_diff2(oldlen, newlen,
142 						csum_unfold(tcph->check))));
143 }
144 
145 
146 INDIRECT_CALLABLE_SCOPE int
tcp_snat_handler(struct sk_buff * skb,struct ip_vs_protocol * pp,struct ip_vs_conn * cp,struct ip_vs_iphdr * iph)147 tcp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
148 		 struct ip_vs_conn *cp, struct ip_vs_iphdr *iph)
149 {
150 	struct tcphdr *tcph;
151 	unsigned int tcphoff = iph->len;
152 	bool payload_csum = false;
153 	int oldlen;
154 
155 #ifdef CONFIG_IP_VS_IPV6
156 	if (cp->af == AF_INET6 && iph->fragoffs)
157 		return 1;
158 #endif
159 	oldlen = skb->len - tcphoff;
160 
161 	/* csum_check requires unshared skb */
162 	if (skb_ensure_writable(skb, tcphoff + sizeof(*tcph)))
163 		return 0;
164 
165 	if (unlikely(cp->app != NULL)) {
166 		int ret;
167 
168 		/* Some checks before mangling */
169 		if (!tcp_csum_check(cp->af, skb, pp))
170 			return 0;
171 
172 		/* Call application helper if needed */
173 		if (!(ret = ip_vs_app_pkt_out(cp, skb, iph)))
174 			return 0;
175 		/* ret=2: csum update is needed after payload mangling */
176 		if (ret == 1)
177 			oldlen = skb->len - tcphoff;
178 		else
179 			payload_csum = true;
180 	}
181 
182 	tcph = (void *)skb_network_header(skb) + tcphoff;
183 	tcph->source = cp->vport;
184 
185 	/* Adjust TCP checksums */
186 	if (skb->ip_summed == CHECKSUM_PARTIAL) {
187 		tcp_partial_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr,
188 					htons(oldlen),
189 					htons(skb->len - tcphoff));
190 	} else if (!payload_csum) {
191 		/* Only port and addr are changed, do fast csum update */
192 		tcp_fast_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr,
193 				     cp->dport, cp->vport);
194 		if (skb->ip_summed == CHECKSUM_COMPLETE)
195 			skb->ip_summed = cp->app ?
196 					 CHECKSUM_UNNECESSARY : CHECKSUM_NONE;
197 	} else {
198 		/* full checksum calculation */
199 		tcph->check = 0;
200 		skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
201 #ifdef CONFIG_IP_VS_IPV6
202 		if (cp->af == AF_INET6)
203 			tcph->check = csum_ipv6_magic(&cp->vaddr.in6,
204 						      &cp->caddr.in6,
205 						      skb->len - tcphoff,
206 						      cp->protocol, skb->csum);
207 		else
208 #endif
209 			tcph->check = csum_tcpudp_magic(cp->vaddr.ip,
210 							cp->caddr.ip,
211 							skb->len - tcphoff,
212 							cp->protocol,
213 							skb->csum);
214 		skb->ip_summed = CHECKSUM_UNNECESSARY;
215 
216 		IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n",
217 			  pp->name, tcph->check,
218 			  (char*)&(tcph->check) - (char*)tcph);
219 	}
220 	return 1;
221 }
222 
223 
224 static int
tcp_dnat_handler(struct sk_buff * skb,struct ip_vs_protocol * pp,struct ip_vs_conn * cp,struct ip_vs_iphdr * iph)225 tcp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
226 		 struct ip_vs_conn *cp, struct ip_vs_iphdr *iph)
227 {
228 	struct tcphdr *tcph;
229 	unsigned int tcphoff = iph->len;
230 	bool payload_csum = false;
231 	int oldlen;
232 
233 #ifdef CONFIG_IP_VS_IPV6
234 	if (cp->af == AF_INET6 && iph->fragoffs)
235 		return 1;
236 #endif
237 	oldlen = skb->len - tcphoff;
238 
239 	/* csum_check requires unshared skb */
240 	if (skb_ensure_writable(skb, tcphoff + sizeof(*tcph)))
241 		return 0;
242 
243 	if (unlikely(cp->app != NULL)) {
244 		int ret;
245 
246 		/* Some checks before mangling */
247 		if (!tcp_csum_check(cp->af, skb, pp))
248 			return 0;
249 
250 		/*
251 		 *	Attempt ip_vs_app call.
252 		 *	It will fix ip_vs_conn and iph ack_seq stuff
253 		 */
254 		if (!(ret = ip_vs_app_pkt_in(cp, skb, iph)))
255 			return 0;
256 		/* ret=2: csum update is needed after payload mangling */
257 		if (ret == 1)
258 			oldlen = skb->len - tcphoff;
259 		else
260 			payload_csum = true;
261 	}
262 
263 	tcph = (void *)skb_network_header(skb) + tcphoff;
264 	tcph->dest = cp->dport;
265 
266 	/*
267 	 *	Adjust TCP checksums
268 	 */
269 	if (skb->ip_summed == CHECKSUM_PARTIAL) {
270 		tcp_partial_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr,
271 					htons(oldlen),
272 					htons(skb->len - tcphoff));
273 	} else if (!payload_csum) {
274 		/* Only port and addr are changed, do fast csum update */
275 		tcp_fast_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr,
276 				     cp->vport, cp->dport);
277 		if (skb->ip_summed == CHECKSUM_COMPLETE)
278 			skb->ip_summed = cp->app ?
279 					 CHECKSUM_UNNECESSARY : CHECKSUM_NONE;
280 	} else {
281 		/* full checksum calculation */
282 		tcph->check = 0;
283 		skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
284 #ifdef CONFIG_IP_VS_IPV6
285 		if (cp->af == AF_INET6)
286 			tcph->check = csum_ipv6_magic(&cp->caddr.in6,
287 						      &cp->daddr.in6,
288 						      skb->len - tcphoff,
289 						      cp->protocol, skb->csum);
290 		else
291 #endif
292 			tcph->check = csum_tcpudp_magic(cp->caddr.ip,
293 							cp->daddr.ip,
294 							skb->len - tcphoff,
295 							cp->protocol,
296 							skb->csum);
297 		skb->ip_summed = CHECKSUM_UNNECESSARY;
298 	}
299 	return 1;
300 }
301 
302 
303 static int
tcp_csum_check(int af,struct sk_buff * skb,struct ip_vs_protocol * pp)304 tcp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
305 {
306 	unsigned int tcphoff;
307 
308 #ifdef CONFIG_IP_VS_IPV6
309 	if (af == AF_INET6)
310 		tcphoff = sizeof(struct ipv6hdr);
311 	else
312 #endif
313 		tcphoff = ip_hdrlen(skb);
314 
315 	switch (skb->ip_summed) {
316 	case CHECKSUM_NONE:
317 		skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
318 		fallthrough;
319 	case CHECKSUM_COMPLETE:
320 #ifdef CONFIG_IP_VS_IPV6
321 		if (af == AF_INET6) {
322 			if (csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
323 					    &ipv6_hdr(skb)->daddr,
324 					    skb->len - tcphoff,
325 					    ipv6_hdr(skb)->nexthdr,
326 					    skb->csum)) {
327 				IP_VS_DBG_RL_PKT(0, af, pp, skb, 0,
328 						 "Failed checksum for");
329 				return 0;
330 			}
331 		} else
332 #endif
333 			if (csum_tcpudp_magic(ip_hdr(skb)->saddr,
334 					      ip_hdr(skb)->daddr,
335 					      skb->len - tcphoff,
336 					      ip_hdr(skb)->protocol,
337 					      skb->csum)) {
338 				IP_VS_DBG_RL_PKT(0, af, pp, skb, 0,
339 						 "Failed checksum for");
340 				return 0;
341 			}
342 		break;
343 	default:
344 		/* No need to checksum. */
345 		break;
346 	}
347 
348 	return 1;
349 }
350 
351 
/*
 * Row offsets into a TCP state table (tcp_states / tcp_states_dos).  Each
 * direction owns four consecutive rows, one per value returned by
 * tcp_state_idx(); set_tcp_state() indexes the table with
 * state_off + state_idx.
 */
352 #define TCP_DIR_INPUT		0
353 #define TCP_DIR_OUTPUT		4
354 #define TCP_DIR_INPUT_ONLY	8
355 
/* Maps an IP_VS_DIR_* direction to its TCP_DIR_* row offset. */
356 static const int tcp_state_off[IP_VS_DIR_LAST] = {
357 	[IP_VS_DIR_INPUT]		=	TCP_DIR_INPUT,
358 	[IP_VS_DIR_OUTPUT]		=	TCP_DIR_OUTPUT,
359 	[IP_VS_DIR_INPUT_ONLY]		=	TCP_DIR_INPUT_ONLY,
360 };
361 
362 /*
363  *	Timeout table[state]
364  */
/*
 * Compiled-in default timeout for each TCP state, expressed as multiples of
 * HZ.  __ip_vs_tcp_init() passes this table to ip_vs_create_timeout_table()
 * to build pd->timeout_table; the table is also read directly as a fallback
 * where pd is NULL (set_tcp_state(), ip_vs_tcp_conn_listen()).
 */
365 static const int tcp_timeouts[IP_VS_TCP_S_LAST+1] = {
366 	[IP_VS_TCP_S_NONE]		=	2*HZ,
367 	[IP_VS_TCP_S_ESTABLISHED]	=	15*60*HZ,
368 	[IP_VS_TCP_S_SYN_SENT]		=	2*60*HZ,
369 	[IP_VS_TCP_S_SYN_RECV]		=	1*60*HZ,
370 	[IP_VS_TCP_S_FIN_WAIT]		=	2*60*HZ,
371 	[IP_VS_TCP_S_TIME_WAIT]		=	2*60*HZ,
372 	[IP_VS_TCP_S_CLOSE]		=	10*HZ,
373 	[IP_VS_TCP_S_CLOSE_WAIT]	=	60*HZ,
374 	[IP_VS_TCP_S_LAST_ACK]		=	30*HZ,
375 	[IP_VS_TCP_S_LISTEN]		=	2*60*HZ,
376 	[IP_VS_TCP_S_SYNACK]		=	120*HZ,
377 	[IP_VS_TCP_S_LAST]		=	2*HZ,
378 };
379 
/*
 * Printable name for each TCP state, looked up by tcp_state_name().
 * tcp_state_name() returns "ERR!" for any state >= IP_VS_TCP_S_LAST, so the
 * "BUG!" entry is not reachable through that function.
 */
380 static const char *const tcp_state_name_table[IP_VS_TCP_S_LAST+1] = {
381 	[IP_VS_TCP_S_NONE]		=	"NONE",
382 	[IP_VS_TCP_S_ESTABLISHED]	=	"ESTABLISHED",
383 	[IP_VS_TCP_S_SYN_SENT]		=	"SYN_SENT",
384 	[IP_VS_TCP_S_SYN_RECV]		=	"SYN_RECV",
385 	[IP_VS_TCP_S_FIN_WAIT]		=	"FIN_WAIT",
386 	[IP_VS_TCP_S_TIME_WAIT]		=	"TIME_WAIT",
387 	[IP_VS_TCP_S_CLOSE]		=	"CLOSE",
388 	[IP_VS_TCP_S_CLOSE_WAIT]	=	"CLOSE_WAIT",
389 	[IP_VS_TCP_S_LAST_ACK]		=	"LAST_ACK",
390 	[IP_VS_TCP_S_LISTEN]		=	"LISTEN",
391 	[IP_VS_TCP_S_SYNACK]		=	"SYNACK",
392 	[IP_VS_TCP_S_LAST]		=	"BUG!",
393 };
394 
/*
 * Marks which TCP states count as "active".  set_tcp_state() consults it
 * (via tcp_state_active()) to move a connection between dest->activeconns
 * and dest->inactconns and to maintain IP_VS_CONN_F_INACTIVE.
 */
395 static const bool tcp_state_active_table[IP_VS_TCP_S_LAST] = {
396 	[IP_VS_TCP_S_NONE]		=	false,
397 	[IP_VS_TCP_S_ESTABLISHED]	=	true,
398 	[IP_VS_TCP_S_SYN_SENT]		=	true,
399 	[IP_VS_TCP_S_SYN_RECV]		=	true,
400 	[IP_VS_TCP_S_FIN_WAIT]		=	false,
401 	[IP_VS_TCP_S_TIME_WAIT]		=	false,
402 	[IP_VS_TCP_S_CLOSE]		=	false,
403 	[IP_VS_TCP_S_CLOSE_WAIT]	=	false,
404 	[IP_VS_TCP_S_LAST_ACK]		=	false,
405 	[IP_VS_TCP_S_LISTEN]		=	false,
406 	[IP_VS_TCP_S_SYNACK]		=	true,
407 };
408 
/*
 * Three-letter aliases for the IP_VS_TCP_S_* states.  They keep the rows of
 * the tcp_states / tcp_states_dos transition tables narrow enough to read as
 * a grid.
 */
409 #define sNO IP_VS_TCP_S_NONE
410 #define sES IP_VS_TCP_S_ESTABLISHED
411 #define sSS IP_VS_TCP_S_SYN_SENT
412 #define sSR IP_VS_TCP_S_SYN_RECV
413 #define sFW IP_VS_TCP_S_FIN_WAIT
414 #define sTW IP_VS_TCP_S_TIME_WAIT
415 #define sCL IP_VS_TCP_S_CLOSE
416 #define sCW IP_VS_TCP_S_CLOSE_WAIT
417 #define sLA IP_VS_TCP_S_LAST_ACK
418 #define sLI IP_VS_TCP_S_LISTEN
419 #define sSA IP_VS_TCP_S_SYNACK
420 
/*
 * One row of a TCP state table: next_state[] is indexed by the connection's
 * current state and yields the state to move to.
 */
421 struct tcp_states_t {
422 	int next_state[IP_VS_TCP_S_LAST];
423 };
424 
tcp_state_name(int state)425 static const char * tcp_state_name(int state)
426 {
427 	if (state >= IP_VS_TCP_S_LAST)
428 		return "ERR!";
429 	return tcp_state_name_table[state] ? tcp_state_name_table[state] : "?";
430 }
431 
tcp_state_active(int state)432 static bool tcp_state_active(int state)
433 {
434 	if (state >= IP_VS_TCP_S_LAST)
435 		return false;
436 	return tcp_state_active_table[state];
437 }
438 
/*
 * Default TCP state transition table; __ip_vs_tcp_init() installs it and
 * tcp_timeout_change() re-installs it when bit 0 of its flags is clear.
 *
 * Layout: three groups of four rows.  The group is chosen by the TCP_DIR_*
 * offset (INPUT = 0, OUTPUT = 4, INPUT-ONLY = 8) and the row inside the group
 * by tcp_state_idx() (syn = 0, fin = 1, ack = 2, rst = 3).  The column is the
 * connection's current state; the cell is the next state.
 */
439 static struct tcp_states_t tcp_states[] = {
440 /*	INPUT */
441 /*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/
442 /*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
443 /*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sTW }},
444 /*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
445 /*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sSR }},
446 
447 /*	OUTPUT */
448 /*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/
449 /*syn*/ {{sSS, sES, sSS, sSR, sSS, sSS, sSS, sSS, sSS, sLI, sSR }},
450 /*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }},
451 /*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }},
452 /*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }},
453 
454 /*	INPUT-ONLY */
455 /*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/
456 /*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
457 /*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
458 /*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
459 /*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
460 };
461 
/*
 * Alternative TCP state transition table; tcp_timeout_change() installs it
 * when bit 0 of its flags argument (secure_tcp) is set.
 *
 * Same layout as tcp_states: three groups of four rows selected by the
 * TCP_DIR_* offset (INPUT = 0, OUTPUT = 4, INPUT-ONLY = 8) plus the
 * tcp_state_idx() value (syn = 0, fin = 1, ack = 2, rst = 3); the column is
 * the current state and the cell is the next state.
 */
462 static struct tcp_states_t tcp_states_dos[] = {
463 /*	INPUT */
464 /*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/
465 /*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSA }},
466 /*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sSA }},
467 /*ack*/ {{sES, sES, sSS, sSR, sFW, sTW, sCL, sCW, sCL, sLI, sSA }},
468 /*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
469 
470 /*	OUTPUT */
471 /*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/
472 /*syn*/ {{sSS, sES, sSS, sSA, sSS, sSS, sSS, sSS, sSS, sLI, sSA }},
473 /*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }},
474 /*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }},
475 /*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }},
476 
477 /*	INPUT-ONLY */
478 /*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/
479 /*syn*/ {{sSA, sES, sES, sSR, sSA, sSA, sSA, sSA, sSA, sSA, sSA }},
480 /*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
481 /*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
482 /*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
483 };
484 
tcp_timeout_change(struct ip_vs_proto_data * pd,int flags)485 static void tcp_timeout_change(struct ip_vs_proto_data *pd, int flags)
486 {
487 	int on = (flags & 1);		/* secure_tcp */
488 
489 	/*
490 	** FIXME: change secure_tcp to independent sysctl var
491 	** or make it per-service or per-app because it is valid
492 	** for most if not for all of the applications. Something
493 	** like "capabilities" (flags) for each object.
494 	*/
495 	pd->tcp_state_table = (on ? tcp_states_dos : tcp_states);
496 }
497 
tcp_state_idx(struct tcphdr * th)498 static inline int tcp_state_idx(struct tcphdr *th)
499 {
500 	if (th->rst)
501 		return 3;
502 	if (th->syn)
503 		return 0;
504 	if (th->fin)
505 		return 1;
506 	if (th->ack)
507 		return 2;
508 	return -1;
509 }
510 
511 static inline void
set_tcp_state(struct ip_vs_proto_data * pd,struct ip_vs_conn * cp,int direction,struct tcphdr * th)512 set_tcp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp,
513 	      int direction, struct tcphdr *th)
514 {
515 	int state_idx;
516 	int new_state = IP_VS_TCP_S_CLOSE;
517 	int state_off = tcp_state_off[direction];
518 
519 	/*
520 	 *    Update state offset to INPUT_ONLY if necessary
521 	 *    or delete NO_OUTPUT flag if output packet detected
522 	 */
523 	if (cp->flags & IP_VS_CONN_F_NOOUTPUT) {
524 		if (state_off == TCP_DIR_OUTPUT)
525 			cp->flags &= ~IP_VS_CONN_F_NOOUTPUT;
526 		else
527 			state_off = TCP_DIR_INPUT_ONLY;
528 	}
529 
530 	if ((state_idx = tcp_state_idx(th)) < 0) {
531 		IP_VS_DBG(8, "tcp_state_idx=%d!!!\n", state_idx);
532 		goto tcp_state_out;
533 	}
534 
535 	new_state =
536 		pd->tcp_state_table[state_off+state_idx].next_state[cp->state];
537 
538   tcp_state_out:
539 	if (new_state != cp->state) {
540 		struct ip_vs_dest *dest = cp->dest;
541 
542 		IP_VS_DBG_BUF(8, "%s %s [%c%c%c%c] c:%s:%d v:%s:%d "
543 			      "d:%s:%d state: %s->%s conn->refcnt:%d\n",
544 			      pd->pp->name,
545 			      ((state_off == TCP_DIR_OUTPUT) ?
546 			       "output " : "input "),
547 			      th->syn ? 'S' : '.',
548 			      th->fin ? 'F' : '.',
549 			      th->ack ? 'A' : '.',
550 			      th->rst ? 'R' : '.',
551 			      IP_VS_DBG_ADDR(cp->af, &cp->caddr),
552 			      ntohs(cp->cport),
553 			      IP_VS_DBG_ADDR(cp->af, &cp->vaddr),
554 			      ntohs(cp->vport),
555 			      IP_VS_DBG_ADDR(cp->daf, &cp->daddr),
556 			      ntohs(cp->dport),
557 			      tcp_state_name(cp->state),
558 			      tcp_state_name(new_state),
559 			      refcount_read(&cp->refcnt));
560 
561 		if (dest) {
562 			if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
563 			    !tcp_state_active(new_state)) {
564 				atomic_dec(&dest->activeconns);
565 				atomic_inc(&dest->inactconns);
566 				cp->flags |= IP_VS_CONN_F_INACTIVE;
567 			} else if ((cp->flags & IP_VS_CONN_F_INACTIVE) &&
568 				   tcp_state_active(new_state)) {
569 				atomic_inc(&dest->activeconns);
570 				atomic_dec(&dest->inactconns);
571 				cp->flags &= ~IP_VS_CONN_F_INACTIVE;
572 			}
573 		}
574 		if (new_state == IP_VS_TCP_S_ESTABLISHED)
575 			ip_vs_control_assure_ct(cp);
576 	}
577 
578 	if (likely(pd))
579 		cp->timeout = pd->timeout_table[cp->state = new_state];
580 	else	/* What to do ? */
581 		cp->timeout = tcp_timeouts[cp->state = new_state];
582 }
583 
584 /*
585  *	Handle state transitions
586  */
587 static void
tcp_state_transition(struct ip_vs_conn * cp,int direction,const struct sk_buff * skb,struct ip_vs_proto_data * pd)588 tcp_state_transition(struct ip_vs_conn *cp, int direction,
589 		     const struct sk_buff *skb,
590 		     struct ip_vs_proto_data *pd)
591 {
592 	struct tcphdr _tcph, *th;
593 
594 #ifdef CONFIG_IP_VS_IPV6
595 	int ihl = cp->af == AF_INET ? ip_hdrlen(skb) : sizeof(struct ipv6hdr);
596 #else
597 	int ihl = ip_hdrlen(skb);
598 #endif
599 
600 	th = skb_header_pointer(skb, ihl, sizeof(_tcph), &_tcph);
601 	if (th == NULL)
602 		return;
603 
604 	spin_lock_bh(&cp->lock);
605 	set_tcp_state(pd, cp, direction, th);
606 	spin_unlock_bh(&cp->lock);
607 }
608 
tcp_app_hashkey(__be16 port)609 static inline __u16 tcp_app_hashkey(__be16 port)
610 {
611 	return (((__force u16)port >> TCP_APP_TAB_BITS) ^ (__force u16)port)
612 		& TCP_APP_TAB_MASK;
613 }
614 
615 
tcp_register_app(struct netns_ipvs * ipvs,struct ip_vs_app * inc)616 static int tcp_register_app(struct netns_ipvs *ipvs, struct ip_vs_app *inc)
617 {
618 	struct ip_vs_app *i;
619 	__u16 hash;
620 	__be16 port = inc->port;
621 	int ret = 0;
622 	struct ip_vs_proto_data *pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP);
623 
624 	hash = tcp_app_hashkey(port);
625 
626 	list_for_each_entry(i, &ipvs->tcp_apps[hash], p_list) {
627 		if (i->port == port) {
628 			ret = -EEXIST;
629 			goto out;
630 		}
631 	}
632 	list_add_rcu(&inc->p_list, &ipvs->tcp_apps[hash]);
633 	atomic_inc(&pd->appcnt);
634 
635   out:
636 	return ret;
637 }
638 
639 
640 static void
tcp_unregister_app(struct netns_ipvs * ipvs,struct ip_vs_app * inc)641 tcp_unregister_app(struct netns_ipvs *ipvs, struct ip_vs_app *inc)
642 {
643 	struct ip_vs_proto_data *pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP);
644 
645 	atomic_dec(&pd->appcnt);
646 	list_del_rcu(&inc->p_list);
647 }
648 
649 
650 static int
tcp_app_conn_bind(struct ip_vs_conn * cp)651 tcp_app_conn_bind(struct ip_vs_conn *cp)
652 {
653 	struct netns_ipvs *ipvs = cp->ipvs;
654 	int hash;
655 	struct ip_vs_app *inc;
656 	int result = 0;
657 
658 	/* Default binding: bind app only for NAT */
659 	if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
660 		return 0;
661 
662 	/* Lookup application incarnations and bind the right one */
663 	hash = tcp_app_hashkey(cp->vport);
664 
665 	list_for_each_entry_rcu(inc, &ipvs->tcp_apps[hash], p_list) {
666 		if (inc->port == cp->vport) {
667 			if (unlikely(!ip_vs_app_inc_get(inc)))
668 				break;
669 
670 			IP_VS_DBG_BUF(9, "%s(): Binding conn %s:%u->"
671 				      "%s:%u to app %s on port %u\n",
672 				      __func__,
673 				      IP_VS_DBG_ADDR(cp->af, &cp->caddr),
674 				      ntohs(cp->cport),
675 				      IP_VS_DBG_ADDR(cp->af, &cp->vaddr),
676 				      ntohs(cp->vport),
677 				      inc->name, ntohs(inc->port));
678 
679 			cp->app = inc;
680 			if (inc->init_conn)
681 				result = inc->init_conn(inc, cp);
682 			break;
683 		}
684 	}
685 
686 	return result;
687 }
688 
689 
690 /*
691  *	Set LISTEN timeout. (ip_vs_conn_put will setup timer)
692  */
ip_vs_tcp_conn_listen(struct ip_vs_conn * cp)693 void ip_vs_tcp_conn_listen(struct ip_vs_conn *cp)
694 {
695 	struct ip_vs_proto_data *pd = ip_vs_proto_data_get(cp->ipvs, IPPROTO_TCP);
696 
697 	spin_lock_bh(&cp->lock);
698 	cp->state = IP_VS_TCP_S_LISTEN;
699 	cp->timeout = (pd ? pd->timeout_table[IP_VS_TCP_S_LISTEN]
700 			   : tcp_timeouts[IP_VS_TCP_S_LISTEN]);
701 	spin_unlock_bh(&cp->lock);
702 }
703 
704 /* ---------------------------------------------
705  *   timeouts is netns related now.
706  * ---------------------------------------------
707  */
__ip_vs_tcp_init(struct netns_ipvs * ipvs,struct ip_vs_proto_data * pd)708 static int __ip_vs_tcp_init(struct netns_ipvs *ipvs, struct ip_vs_proto_data *pd)
709 {
710 	ip_vs_init_hash_table(ipvs->tcp_apps, TCP_APP_TAB_SIZE);
711 	pd->timeout_table = ip_vs_create_timeout_table((int *)tcp_timeouts,
712 							sizeof(tcp_timeouts));
713 	if (!pd->timeout_table)
714 		return -ENOMEM;
715 	pd->tcp_state_table = tcp_states;
716 	return 0;
717 }
718 
__ip_vs_tcp_exit(struct netns_ipvs * ipvs,struct ip_vs_proto_data * pd)719 static void __ip_vs_tcp_exit(struct netns_ipvs *ipvs, struct ip_vs_proto_data *pd)
720 {
721 	kfree(pd->timeout_table);
722 }
723 
724 
/*
 * TCP protocol descriptor: wires the handlers defined in this file (plus the
 * shared ip_vs_conn_*_get_proto and ip_vs_tcpudp_debug_packet helpers) into
 * the struct ip_vs_protocol operations table.  There is no global init/exit
 * hook; setup and teardown happen per netns.
 */
725 struct ip_vs_protocol ip_vs_protocol_tcp = {
726 	.name =			"TCP",
727 	.protocol =		IPPROTO_TCP,
728 	.num_states =		IP_VS_TCP_S_LAST,
729 	.dont_defrag =		0,
730 	.init =			NULL,
731 	.exit =			NULL,
732 	.init_netns =		__ip_vs_tcp_init,
733 	.exit_netns =		__ip_vs_tcp_exit,
734 	.register_app =		tcp_register_app,
735 	.unregister_app =	tcp_unregister_app,
736 	.conn_schedule =	tcp_conn_schedule,
737 	.conn_in_get =		ip_vs_conn_in_get_proto,
738 	.conn_out_get =		ip_vs_conn_out_get_proto,
739 	.snat_handler =		tcp_snat_handler,
740 	.dnat_handler =		tcp_dnat_handler,
741 	.state_name =		tcp_state_name,
742 	.state_transition =	tcp_state_transition,
743 	.app_conn_bind =	tcp_app_conn_bind,
744 	.debug_packet =		ip_vs_tcpudp_debug_packet,
745 	.timeout_change =	tcp_timeout_change,
746 };
747