1 /* 2 * ip_vs_proto_udp.c: UDP load balancing support for IPVS 3 * 4 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> 5 * Julian Anastasov <ja@ssi.bg> 6 * 7 * This program is free software; you can redistribute it and/or 8 * modify it under the terms of the GNU General Public License 9 * as published by the Free Software Foundation; either version 10 * 2 of the License, or (at your option) any later version. 11 * 12 * Changes: Hans Schillstrom <hans.schillstrom@ericsson.com> 13 * Network name space (netns) aware. 14 * 15 */ 16 17 #define KMSG_COMPONENT "IPVS" 18 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt 19 20 #include <linux/in.h> 21 #include <linux/ip.h> 22 #include <linux/kernel.h> 23 #include <linux/netfilter.h> 24 #include <linux/netfilter_ipv4.h> 25 #include <linux/udp.h> 26 27 #include <net/ip_vs.h> 28 #include <net/ip.h> 29 #include <net/ip6_checksum.h> 30 31 static int 32 udp_conn_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb, 33 struct ip_vs_proto_data *pd, 34 int *verdict, struct ip_vs_conn **cpp, 35 struct ip_vs_iphdr *iph) 36 { 37 struct ip_vs_service *svc; 38 struct udphdr _udph, *uh; 39 __be16 _ports[2], *ports = NULL; 40 41 if (likely(!ip_vs_iph_icmp(iph))) { 42 /* IPv6 fragments, only first fragment will hit this */ 43 uh = skb_header_pointer(skb, iph->len, sizeof(_udph), &_udph); 44 if (uh) 45 ports = &uh->source; 46 } else { 47 ports = skb_header_pointer( 48 skb, iph->len, sizeof(_ports), &_ports); 49 } 50 51 if (!ports) { 52 *verdict = NF_DROP; 53 return 0; 54 } 55 56 rcu_read_lock(); 57 if (likely(!ip_vs_iph_inverse(iph))) 58 svc = ip_vs_service_find(ipvs, af, skb->mark, iph->protocol, 59 &iph->daddr, ports[1]); 60 else 61 svc = ip_vs_service_find(ipvs, af, skb->mark, iph->protocol, 62 &iph->saddr, ports[0]); 63 64 if (svc) { 65 int ignored; 66 67 if (ip_vs_todrop(ipvs)) { 68 /* 69 * It seems that we are very loaded. 70 * We have to drop this packet :( 71 */ 72 rcu_read_unlock(); 73 *verdict = NF_DROP; 74 return 0; 75 } 76 77 /* 78 * Let the virtual server select a real server for the 79 * incoming connection, and create a connection entry. 80 */ 81 *cpp = ip_vs_schedule(svc, skb, pd, &ignored, iph); 82 if (!*cpp && ignored <= 0) { 83 if (!ignored) 84 *verdict = ip_vs_leave(svc, skb, pd, iph); 85 else 86 *verdict = NF_DROP; 87 rcu_read_unlock(); 88 return 0; 89 } 90 } 91 rcu_read_unlock(); 92 /* NF_ACCEPT */ 93 return 1; 94 } 95 96 97 static inline void 98 udp_fast_csum_update(int af, struct udphdr *uhdr, 99 const union nf_inet_addr *oldip, 100 const union nf_inet_addr *newip, 101 __be16 oldport, __be16 newport) 102 { 103 #ifdef CONFIG_IP_VS_IPV6 104 if (af == AF_INET6) 105 uhdr->check = 106 csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6, 107 ip_vs_check_diff2(oldport, newport, 108 ~csum_unfold(uhdr->check)))); 109 else 110 #endif 111 uhdr->check = 112 csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip, 113 ip_vs_check_diff2(oldport, newport, 114 ~csum_unfold(uhdr->check)))); 115 if (!uhdr->check) 116 uhdr->check = CSUM_MANGLED_0; 117 } 118 119 static inline void 120 udp_partial_csum_update(int af, struct udphdr *uhdr, 121 const union nf_inet_addr *oldip, 122 const union nf_inet_addr *newip, 123 __be16 oldlen, __be16 newlen) 124 { 125 #ifdef CONFIG_IP_VS_IPV6 126 if (af == AF_INET6) 127 uhdr->check = 128 ~csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6, 129 ip_vs_check_diff2(oldlen, newlen, 130 csum_unfold(uhdr->check)))); 131 else 132 #endif 133 uhdr->check = 134 ~csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip, 135 ip_vs_check_diff2(oldlen, newlen, 136 csum_unfold(uhdr->check)))); 137 } 138 139 140 static int 141 udp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, 142 struct ip_vs_conn *cp, struct ip_vs_iphdr *iph) 143 { 144 struct udphdr *udph; 145 unsigned int udphoff = iph->len; 146 int oldlen; 147 int payload_csum = 0; 148 149 #ifdef CONFIG_IP_VS_IPV6 150 if (cp->af == AF_INET6 && iph->fragoffs) 151 return 1; 152 #endif 153 oldlen = skb->len - udphoff; 154 155 /* csum_check requires unshared skb */ 156 if (!skb_make_writable(skb, udphoff+sizeof(*udph))) 157 return 0; 158 159 if (unlikely(cp->app != NULL)) { 160 int ret; 161 162 /* Some checks before mangling */ 163 if (pp->csum_check && !pp->csum_check(cp->af, skb, pp)) 164 return 0; 165 166 /* 167 * Call application helper if needed 168 */ 169 if (!(ret = ip_vs_app_pkt_out(cp, skb))) 170 return 0; 171 /* ret=2: csum update is needed after payload mangling */ 172 if (ret == 1) 173 oldlen = skb->len - udphoff; 174 else 175 payload_csum = 1; 176 } 177 178 udph = (void *)skb_network_header(skb) + udphoff; 179 udph->source = cp->vport; 180 181 /* 182 * Adjust UDP checksums 183 */ 184 if (skb->ip_summed == CHECKSUM_PARTIAL) { 185 udp_partial_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr, 186 htons(oldlen), 187 htons(skb->len - udphoff)); 188 } else if (!payload_csum && (udph->check != 0)) { 189 /* Only port and addr are changed, do fast csum update */ 190 udp_fast_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr, 191 cp->dport, cp->vport); 192 if (skb->ip_summed == CHECKSUM_COMPLETE) 193 skb->ip_summed = (cp->app && pp->csum_check) ? 194 CHECKSUM_UNNECESSARY : CHECKSUM_NONE; 195 } else { 196 /* full checksum calculation */ 197 udph->check = 0; 198 skb->csum = skb_checksum(skb, udphoff, skb->len - udphoff, 0); 199 #ifdef CONFIG_IP_VS_IPV6 200 if (cp->af == AF_INET6) 201 udph->check = csum_ipv6_magic(&cp->vaddr.in6, 202 &cp->caddr.in6, 203 skb->len - udphoff, 204 cp->protocol, skb->csum); 205 else 206 #endif 207 udph->check = csum_tcpudp_magic(cp->vaddr.ip, 208 cp->caddr.ip, 209 skb->len - udphoff, 210 cp->protocol, 211 skb->csum); 212 if (udph->check == 0) 213 udph->check = CSUM_MANGLED_0; 214 skb->ip_summed = CHECKSUM_UNNECESSARY; 215 IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n", 216 pp->name, udph->check, 217 (char*)&(udph->check) - (char*)udph); 218 } 219 return 1; 220 } 221 222 223 static int 224 udp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, 225 struct ip_vs_conn *cp, struct ip_vs_iphdr *iph) 226 { 227 struct udphdr *udph; 228 unsigned int udphoff = iph->len; 229 int oldlen; 230 int payload_csum = 0; 231 232 #ifdef CONFIG_IP_VS_IPV6 233 if (cp->af == AF_INET6 && iph->fragoffs) 234 return 1; 235 #endif 236 oldlen = skb->len - udphoff; 237 238 /* csum_check requires unshared skb */ 239 if (!skb_make_writable(skb, udphoff+sizeof(*udph))) 240 return 0; 241 242 if (unlikely(cp->app != NULL)) { 243 int ret; 244 245 /* Some checks before mangling */ 246 if (pp->csum_check && !pp->csum_check(cp->af, skb, pp)) 247 return 0; 248 249 /* 250 * Attempt ip_vs_app call. 251 * It will fix ip_vs_conn 252 */ 253 if (!(ret = ip_vs_app_pkt_in(cp, skb))) 254 return 0; 255 /* ret=2: csum update is needed after payload mangling */ 256 if (ret == 1) 257 oldlen = skb->len - udphoff; 258 else 259 payload_csum = 1; 260 } 261 262 udph = (void *)skb_network_header(skb) + udphoff; 263 udph->dest = cp->dport; 264 265 /* 266 * Adjust UDP checksums 267 */ 268 if (skb->ip_summed == CHECKSUM_PARTIAL) { 269 udp_partial_csum_update(cp->af, udph, &cp->vaddr, &cp->daddr, 270 htons(oldlen), 271 htons(skb->len - udphoff)); 272 } else if (!payload_csum && (udph->check != 0)) { 273 /* Only port and addr are changed, do fast csum update */ 274 udp_fast_csum_update(cp->af, udph, &cp->vaddr, &cp->daddr, 275 cp->vport, cp->dport); 276 if (skb->ip_summed == CHECKSUM_COMPLETE) 277 skb->ip_summed = (cp->app && pp->csum_check) ? 278 CHECKSUM_UNNECESSARY : CHECKSUM_NONE; 279 } else { 280 /* full checksum calculation */ 281 udph->check = 0; 282 skb->csum = skb_checksum(skb, udphoff, skb->len - udphoff, 0); 283 #ifdef CONFIG_IP_VS_IPV6 284 if (cp->af == AF_INET6) 285 udph->check = csum_ipv6_magic(&cp->caddr.in6, 286 &cp->daddr.in6, 287 skb->len - udphoff, 288 cp->protocol, skb->csum); 289 else 290 #endif 291 udph->check = csum_tcpudp_magic(cp->caddr.ip, 292 cp->daddr.ip, 293 skb->len - udphoff, 294 cp->protocol, 295 skb->csum); 296 if (udph->check == 0) 297 udph->check = CSUM_MANGLED_0; 298 skb->ip_summed = CHECKSUM_UNNECESSARY; 299 } 300 return 1; 301 } 302 303 304 static int 305 udp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp) 306 { 307 struct udphdr _udph, *uh; 308 unsigned int udphoff; 309 310 #ifdef CONFIG_IP_VS_IPV6 311 if (af == AF_INET6) 312 udphoff = sizeof(struct ipv6hdr); 313 else 314 #endif 315 udphoff = ip_hdrlen(skb); 316 317 uh = skb_header_pointer(skb, udphoff, sizeof(_udph), &_udph); 318 if (uh == NULL) 319 return 0; 320 321 if (uh->check != 0) { 322 switch (skb->ip_summed) { 323 case CHECKSUM_NONE: 324 skb->csum = skb_checksum(skb, udphoff, 325 skb->len - udphoff, 0); 326 case CHECKSUM_COMPLETE: 327 #ifdef CONFIG_IP_VS_IPV6 328 if (af == AF_INET6) { 329 if (csum_ipv6_magic(&ipv6_hdr(skb)->saddr, 330 &ipv6_hdr(skb)->daddr, 331 skb->len - udphoff, 332 ipv6_hdr(skb)->nexthdr, 333 skb->csum)) { 334 IP_VS_DBG_RL_PKT(0, af, pp, skb, 0, 335 "Failed checksum for"); 336 return 0; 337 } 338 } else 339 #endif 340 if (csum_tcpudp_magic(ip_hdr(skb)->saddr, 341 ip_hdr(skb)->daddr, 342 skb->len - udphoff, 343 ip_hdr(skb)->protocol, 344 skb->csum)) { 345 IP_VS_DBG_RL_PKT(0, af, pp, skb, 0, 346 "Failed checksum for"); 347 return 0; 348 } 349 break; 350 default: 351 /* No need to checksum. */ 352 break; 353 } 354 } 355 return 1; 356 } 357 358 static inline __u16 udp_app_hashkey(__be16 port) 359 { 360 return (((__force u16)port >> UDP_APP_TAB_BITS) ^ (__force u16)port) 361 & UDP_APP_TAB_MASK; 362 } 363 364 365 static int udp_register_app(struct netns_ipvs *ipvs, struct ip_vs_app *inc) 366 { 367 struct ip_vs_app *i; 368 __u16 hash; 369 __be16 port = inc->port; 370 int ret = 0; 371 struct ip_vs_proto_data *pd = ip_vs_proto_data_get(ipvs, IPPROTO_UDP); 372 373 hash = udp_app_hashkey(port); 374 375 list_for_each_entry(i, &ipvs->udp_apps[hash], p_list) { 376 if (i->port == port) { 377 ret = -EEXIST; 378 goto out; 379 } 380 } 381 list_add_rcu(&inc->p_list, &ipvs->udp_apps[hash]); 382 atomic_inc(&pd->appcnt); 383 384 out: 385 return ret; 386 } 387 388 389 static void 390 udp_unregister_app(struct netns_ipvs *ipvs, struct ip_vs_app *inc) 391 { 392 struct ip_vs_proto_data *pd = ip_vs_proto_data_get(ipvs, IPPROTO_UDP); 393 394 atomic_dec(&pd->appcnt); 395 list_del_rcu(&inc->p_list); 396 } 397 398 399 static int udp_app_conn_bind(struct ip_vs_conn *cp) 400 { 401 struct netns_ipvs *ipvs = cp->ipvs; 402 int hash; 403 struct ip_vs_app *inc; 404 int result = 0; 405 406 /* Default binding: bind app only for NAT */ 407 if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) 408 return 0; 409 410 /* Lookup application incarnations and bind the right one */ 411 hash = udp_app_hashkey(cp->vport); 412 413 rcu_read_lock(); 414 list_for_each_entry_rcu(inc, &ipvs->udp_apps[hash], p_list) { 415 if (inc->port == cp->vport) { 416 if (unlikely(!ip_vs_app_inc_get(inc))) 417 break; 418 rcu_read_unlock(); 419 420 IP_VS_DBG_BUF(9, "%s(): Binding conn %s:%u->" 421 "%s:%u to app %s on port %u\n", 422 __func__, 423 IP_VS_DBG_ADDR(cp->af, &cp->caddr), 424 ntohs(cp->cport), 425 IP_VS_DBG_ADDR(cp->af, &cp->vaddr), 426 ntohs(cp->vport), 427 inc->name, ntohs(inc->port)); 428 429 cp->app = inc; 430 if (inc->init_conn) 431 result = inc->init_conn(inc, cp); 432 goto out; 433 } 434 } 435 rcu_read_unlock(); 436 437 out: 438 return result; 439 } 440 441 442 static const int udp_timeouts[IP_VS_UDP_S_LAST+1] = { 443 [IP_VS_UDP_S_NORMAL] = 5*60*HZ, 444 [IP_VS_UDP_S_LAST] = 2*HZ, 445 }; 446 447 static const char *const udp_state_name_table[IP_VS_UDP_S_LAST+1] = { 448 [IP_VS_UDP_S_NORMAL] = "UDP", 449 [IP_VS_UDP_S_LAST] = "BUG!", 450 }; 451 452 static const char * udp_state_name(int state) 453 { 454 if (state >= IP_VS_UDP_S_LAST) 455 return "ERR!"; 456 return udp_state_name_table[state] ? udp_state_name_table[state] : "?"; 457 } 458 459 static void 460 udp_state_transition(struct ip_vs_conn *cp, int direction, 461 const struct sk_buff *skb, 462 struct ip_vs_proto_data *pd) 463 { 464 if (unlikely(!pd)) { 465 pr_err("UDP no ns data\n"); 466 return; 467 } 468 469 cp->timeout = pd->timeout_table[IP_VS_UDP_S_NORMAL]; 470 } 471 472 static int __udp_init(struct netns_ipvs *ipvs, struct ip_vs_proto_data *pd) 473 { 474 ip_vs_init_hash_table(ipvs->udp_apps, UDP_APP_TAB_SIZE); 475 pd->timeout_table = ip_vs_create_timeout_table((int *)udp_timeouts, 476 sizeof(udp_timeouts)); 477 if (!pd->timeout_table) 478 return -ENOMEM; 479 return 0; 480 } 481 482 static void __udp_exit(struct netns_ipvs *ipvs, struct ip_vs_proto_data *pd) 483 { 484 kfree(pd->timeout_table); 485 } 486 487 488 struct ip_vs_protocol ip_vs_protocol_udp = { 489 .name = "UDP", 490 .protocol = IPPROTO_UDP, 491 .num_states = IP_VS_UDP_S_LAST, 492 .dont_defrag = 0, 493 .init = NULL, 494 .exit = NULL, 495 .init_netns = __udp_init, 496 .exit_netns = __udp_exit, 497 .conn_schedule = udp_conn_schedule, 498 .conn_in_get = ip_vs_conn_in_get_proto, 499 .conn_out_get = ip_vs_conn_out_get_proto, 500 .snat_handler = udp_snat_handler, 501 .dnat_handler = udp_dnat_handler, 502 .csum_check = udp_csum_check, 503 .state_transition = udp_state_transition, 504 .state_name = udp_state_name, 505 .register_app = udp_register_app, 506 .unregister_app = udp_unregister_app, 507 .app_conn_bind = udp_app_conn_bind, 508 .debug_packet = ip_vs_tcpudp_debug_packet, 509 .timeout_change = NULL, 510 }; 511