1 /* 2 * IPVS An implementation of the IP virtual server support for the 3 * LINUX operating system. IPVS is now implemented as a module 4 * over the Netfilter framework. IPVS can be used to build a 5 * high-performance and highly available server based on a 6 * cluster of servers. 7 * 8 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> 9 * Peter Kese <peter.kese@ijs.si> 10 * Julian Anastasov <ja@ssi.bg> 11 * 12 * This program is free software; you can redistribute it and/or 13 * modify it under the terms of the GNU General Public License 14 * as published by the Free Software Foundation; either version 15 * 2 of the License, or (at your option) any later version. 16 * 17 * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese, 18 * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms 19 * and others. 20 * 21 * Changes: 22 * Paul `Rusty' Russell properly handle non-linear skbs 23 * Harald Welte don't use nfcache 24 * 25 */ 26 27 #define KMSG_COMPONENT "IPVS" 28 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt 29 30 #include <linux/module.h> 31 #include <linux/kernel.h> 32 #include <linux/ip.h> 33 #include <linux/tcp.h> 34 #include <linux/sctp.h> 35 #include <linux/icmp.h> 36 #include <linux/slab.h> 37 38 #include <net/ip.h> 39 #include <net/tcp.h> 40 #include <net/udp.h> 41 #include <net/icmp.h> /* for icmp_send */ 42 #include <net/route.h> 43 #include <net/ip6_checksum.h> 44 #include <net/netns/generic.h> /* net_generic() */ 45 46 #include <linux/netfilter.h> 47 #include <linux/netfilter_ipv4.h> 48 49 #ifdef CONFIG_IP_VS_IPV6 50 #include <net/ipv6.h> 51 #include <linux/netfilter_ipv6.h> 52 #include <net/ip6_route.h> 53 #endif 54 55 #include <net/ip_vs.h> 56 57 58 EXPORT_SYMBOL(register_ip_vs_scheduler); 59 EXPORT_SYMBOL(unregister_ip_vs_scheduler); 60 EXPORT_SYMBOL(ip_vs_proto_name); 61 EXPORT_SYMBOL(ip_vs_conn_new); 62 EXPORT_SYMBOL(ip_vs_conn_in_get); 63 EXPORT_SYMBOL(ip_vs_conn_out_get); 64 #ifdef CONFIG_IP_VS_PROTO_TCP 65 
EXPORT_SYMBOL(ip_vs_tcp_conn_listen); 66 #endif 67 EXPORT_SYMBOL(ip_vs_conn_put); 68 #ifdef CONFIG_IP_VS_DEBUG 69 EXPORT_SYMBOL(ip_vs_get_debug_level); 70 #endif 71 EXPORT_SYMBOL(ip_vs_new_conn_out); 72 73 static unsigned int ip_vs_net_id __read_mostly; 74 /* netns cnt used for uniqueness */ 75 static atomic_t ipvs_netns_cnt = ATOMIC_INIT(0); 76 77 /* ID used in ICMP lookups */ 78 #define icmp_id(icmph) (((icmph)->un).echo.id) 79 #define icmpv6_id(icmph) (icmph->icmp6_dataun.u_echo.identifier) 80 81 const char *ip_vs_proto_name(unsigned int proto) 82 { 83 static char buf[20]; 84 85 switch (proto) { 86 case IPPROTO_IP: 87 return "IP"; 88 case IPPROTO_UDP: 89 return "UDP"; 90 case IPPROTO_TCP: 91 return "TCP"; 92 case IPPROTO_SCTP: 93 return "SCTP"; 94 case IPPROTO_ICMP: 95 return "ICMP"; 96 #ifdef CONFIG_IP_VS_IPV6 97 case IPPROTO_ICMPV6: 98 return "ICMPv6"; 99 #endif 100 default: 101 sprintf(buf, "IP_%u", proto); 102 return buf; 103 } 104 } 105 106 void ip_vs_init_hash_table(struct list_head *table, int rows) 107 { 108 while (--rows >= 0) 109 INIT_LIST_HEAD(&table[rows]); 110 } 111 112 static inline void 113 ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb) 114 { 115 struct ip_vs_dest *dest = cp->dest; 116 struct netns_ipvs *ipvs = cp->ipvs; 117 118 if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { 119 struct ip_vs_cpu_stats *s; 120 struct ip_vs_service *svc; 121 122 s = this_cpu_ptr(dest->stats.cpustats); 123 u64_stats_update_begin(&s->syncp); 124 s->cnt.inpkts++; 125 s->cnt.inbytes += skb->len; 126 u64_stats_update_end(&s->syncp); 127 128 svc = rcu_dereference(dest->svc); 129 s = this_cpu_ptr(svc->stats.cpustats); 130 u64_stats_update_begin(&s->syncp); 131 s->cnt.inpkts++; 132 s->cnt.inbytes += skb->len; 133 u64_stats_update_end(&s->syncp); 134 135 s = this_cpu_ptr(ipvs->tot_stats.cpustats); 136 u64_stats_update_begin(&s->syncp); 137 s->cnt.inpkts++; 138 s->cnt.inbytes += skb->len; 139 u64_stats_update_end(&s->syncp); 140 } 141 } 142 143 144 static 
inline void 145 ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb) 146 { 147 struct ip_vs_dest *dest = cp->dest; 148 struct netns_ipvs *ipvs = cp->ipvs; 149 150 if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { 151 struct ip_vs_cpu_stats *s; 152 struct ip_vs_service *svc; 153 154 s = this_cpu_ptr(dest->stats.cpustats); 155 u64_stats_update_begin(&s->syncp); 156 s->cnt.outpkts++; 157 s->cnt.outbytes += skb->len; 158 u64_stats_update_end(&s->syncp); 159 160 svc = rcu_dereference(dest->svc); 161 s = this_cpu_ptr(svc->stats.cpustats); 162 u64_stats_update_begin(&s->syncp); 163 s->cnt.outpkts++; 164 s->cnt.outbytes += skb->len; 165 u64_stats_update_end(&s->syncp); 166 167 s = this_cpu_ptr(ipvs->tot_stats.cpustats); 168 u64_stats_update_begin(&s->syncp); 169 s->cnt.outpkts++; 170 s->cnt.outbytes += skb->len; 171 u64_stats_update_end(&s->syncp); 172 } 173 } 174 175 176 static inline void 177 ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc) 178 { 179 struct netns_ipvs *ipvs = svc->ipvs; 180 struct ip_vs_cpu_stats *s; 181 182 s = this_cpu_ptr(cp->dest->stats.cpustats); 183 u64_stats_update_begin(&s->syncp); 184 s->cnt.conns++; 185 u64_stats_update_end(&s->syncp); 186 187 s = this_cpu_ptr(svc->stats.cpustats); 188 u64_stats_update_begin(&s->syncp); 189 s->cnt.conns++; 190 u64_stats_update_end(&s->syncp); 191 192 s = this_cpu_ptr(ipvs->tot_stats.cpustats); 193 u64_stats_update_begin(&s->syncp); 194 s->cnt.conns++; 195 u64_stats_update_end(&s->syncp); 196 } 197 198 199 static inline void 200 ip_vs_set_state(struct ip_vs_conn *cp, int direction, 201 const struct sk_buff *skb, 202 struct ip_vs_proto_data *pd) 203 { 204 if (likely(pd->pp->state_transition)) 205 pd->pp->state_transition(cp, direction, skb, pd); 206 } 207 208 static inline int 209 ip_vs_conn_fill_param_persist(const struct ip_vs_service *svc, 210 struct sk_buff *skb, int protocol, 211 const union nf_inet_addr *caddr, __be16 cport, 212 const union nf_inet_addr *vaddr, __be16 vport, 213 
struct ip_vs_conn_param *p) 214 { 215 ip_vs_conn_fill_param(svc->ipvs, svc->af, protocol, caddr, cport, vaddr, 216 vport, p); 217 p->pe = rcu_dereference(svc->pe); 218 if (p->pe && p->pe->fill_param) 219 return p->pe->fill_param(p, skb); 220 221 return 0; 222 } 223 224 /* 225 * IPVS persistent scheduling function 226 * It creates a connection entry according to its template if exists, 227 * or selects a server and creates a connection entry plus a template. 228 * Locking: we are svc user (svc->refcnt), so we hold all dests too 229 * Protocols supported: TCP, UDP 230 */ 231 static struct ip_vs_conn * 232 ip_vs_sched_persist(struct ip_vs_service *svc, 233 struct sk_buff *skb, __be16 src_port, __be16 dst_port, 234 int *ignored, struct ip_vs_iphdr *iph) 235 { 236 struct ip_vs_conn *cp = NULL; 237 struct ip_vs_dest *dest; 238 struct ip_vs_conn *ct; 239 __be16 dport = 0; /* destination port to forward */ 240 unsigned int flags; 241 struct ip_vs_conn_param param; 242 const union nf_inet_addr fwmark = { .ip = htonl(svc->fwmark) }; 243 union nf_inet_addr snet; /* source network of the client, 244 after masking */ 245 const union nf_inet_addr *src_addr, *dst_addr; 246 247 if (likely(!ip_vs_iph_inverse(iph))) { 248 src_addr = &iph->saddr; 249 dst_addr = &iph->daddr; 250 } else { 251 src_addr = &iph->daddr; 252 dst_addr = &iph->saddr; 253 } 254 255 256 /* Mask saddr with the netmask to adjust template granularity */ 257 #ifdef CONFIG_IP_VS_IPV6 258 if (svc->af == AF_INET6) 259 ipv6_addr_prefix(&snet.in6, &src_addr->in6, 260 (__force __u32) svc->netmask); 261 else 262 #endif 263 snet.ip = src_addr->ip & svc->netmask; 264 265 IP_VS_DBG_BUF(6, "p-schedule: src %s:%u dest %s:%u " 266 "mnet %s\n", 267 IP_VS_DBG_ADDR(svc->af, src_addr), ntohs(src_port), 268 IP_VS_DBG_ADDR(svc->af, dst_addr), ntohs(dst_port), 269 IP_VS_DBG_ADDR(svc->af, &snet)); 270 271 /* 272 * As far as we know, FTP is a very complicated network protocol, and 273 * it uses control connection and data connections. 
For active FTP, 274 * FTP server initialize data connection to the client, its source port 275 * is often 20. For passive FTP, FTP server tells the clients the port 276 * that it passively listens to, and the client issues the data 277 * connection. In the tunneling or direct routing mode, the load 278 * balancer is on the client-to-server half of connection, the port 279 * number is unknown to the load balancer. So, a conn template like 280 * <caddr, 0, vaddr, 0, daddr, 0> is created for persistent FTP 281 * service, and a template like <caddr, 0, vaddr, vport, daddr, dport> 282 * is created for other persistent services. 283 */ 284 { 285 int protocol = iph->protocol; 286 const union nf_inet_addr *vaddr = dst_addr; 287 __be16 vport = 0; 288 289 if (dst_port == svc->port) { 290 /* non-FTP template: 291 * <protocol, caddr, 0, vaddr, vport, daddr, dport> 292 * FTP template: 293 * <protocol, caddr, 0, vaddr, 0, daddr, 0> 294 */ 295 if (svc->port != FTPPORT) 296 vport = dst_port; 297 } else { 298 /* Note: persistent fwmark-based services and 299 * persistent port zero service are handled here. 300 * fwmark template: 301 * <IPPROTO_IP,caddr,0,fwmark,0,daddr,0> 302 * port zero template: 303 * <protocol,caddr,0,vaddr,0,daddr,0> 304 */ 305 if (svc->fwmark) { 306 protocol = IPPROTO_IP; 307 vaddr = &fwmark; 308 } 309 } 310 /* return *ignored = -1 so NF_DROP can be used */ 311 if (ip_vs_conn_fill_param_persist(svc, skb, protocol, &snet, 0, 312 vaddr, vport, ¶m) < 0) { 313 *ignored = -1; 314 return NULL; 315 } 316 } 317 318 /* Check if a template already exists */ 319 ct = ip_vs_ct_in_get(¶m); 320 if (!ct || !ip_vs_check_template(ct, NULL)) { 321 struct ip_vs_scheduler *sched; 322 323 /* 324 * No template found or the dest of the connection 325 * template is not available. 326 * return *ignored=0 i.e. 
ICMP and NF_DROP 327 */ 328 sched = rcu_dereference(svc->scheduler); 329 if (sched) { 330 /* read svc->sched_data after svc->scheduler */ 331 smp_rmb(); 332 dest = sched->schedule(svc, skb, iph); 333 } else { 334 dest = NULL; 335 } 336 if (!dest) { 337 IP_VS_DBG(1, "p-schedule: no dest found.\n"); 338 kfree(param.pe_data); 339 *ignored = 0; 340 return NULL; 341 } 342 343 if (dst_port == svc->port && svc->port != FTPPORT) 344 dport = dest->port; 345 346 /* Create a template 347 * This adds param.pe_data to the template, 348 * and thus param.pe_data will be destroyed 349 * when the template expires */ 350 ct = ip_vs_conn_new(¶m, dest->af, &dest->addr, dport, 351 IP_VS_CONN_F_TEMPLATE, dest, skb->mark); 352 if (ct == NULL) { 353 kfree(param.pe_data); 354 *ignored = -1; 355 return NULL; 356 } 357 358 ct->timeout = svc->timeout; 359 } else { 360 /* set destination with the found template */ 361 dest = ct->dest; 362 kfree(param.pe_data); 363 } 364 365 dport = dst_port; 366 if (dport == svc->port && dest->port) 367 dport = dest->port; 368 369 flags = (svc->flags & IP_VS_SVC_F_ONEPACKET 370 && iph->protocol == IPPROTO_UDP) ? 371 IP_VS_CONN_F_ONE_PACKET : 0; 372 373 /* 374 * Create a new connection according to the template 375 */ 376 ip_vs_conn_fill_param(svc->ipvs, svc->af, iph->protocol, src_addr, 377 src_port, dst_addr, dst_port, ¶m); 378 379 cp = ip_vs_conn_new(¶m, dest->af, &dest->addr, dport, flags, dest, 380 skb->mark); 381 if (cp == NULL) { 382 ip_vs_conn_put(ct); 383 *ignored = -1; 384 return NULL; 385 } 386 387 /* 388 * Add its control 389 */ 390 ip_vs_control_add(cp, ct); 391 ip_vs_conn_put(ct); 392 393 ip_vs_conn_stats(cp, svc); 394 return cp; 395 } 396 397 398 /* 399 * IPVS main scheduling function 400 * It selects a server according to the virtual service, and 401 * creates a connection entry. 402 * Protocols supported: TCP, UDP 403 * 404 * Usage of *ignored 405 * 406 * 1 : protocol tried to schedule (eg. 
on SYN), found svc but the
 *       svc/scheduler decides that this packet should be accepted with
 *       NF_ACCEPT because it must not be scheduled.
 *
 * 0 :   scheduler can not find destination, so try bypass or
 *       return ICMP and then NF_DROP (ip_vs_leave).
 *
 * -1 :  scheduler tried to schedule but fatal error occurred, eg.
 *       ip_vs_conn_new failure (ENOMEM) or ip_vs_sip_fill_param
 *       failure such as missing Call-ID, ENOMEM on skb_linearize
 *       or pe_data. In this case we should return NF_DROP without
 *       any attempts to send ICMP with ip_vs_leave.
 */
struct ip_vs_conn *
ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
	       struct ip_vs_proto_data *pd, int *ignored,
	       struct ip_vs_iphdr *iph)
{
	struct ip_vs_protocol *pp = pd->pp;
	struct ip_vs_conn *cp = NULL;
	struct ip_vs_scheduler *sched;
	struct ip_vs_dest *dest;
	__be16 _ports[2], *pptr, cport, vport;
	const void *caddr, *vaddr;
	unsigned int flags;

	/* until decided otherwise, a NULL return means "accept" */
	*ignored = 1;
	/*
	 * IPv6 frags, only the first hit here.
	 */
	pptr = frag_safe_skb_hp(skb, iph->len, sizeof(_ports), _ports);
	if (pptr == NULL)
		return NULL;

	/* for an inverse header, source and destination swap roles */
	if (likely(!ip_vs_iph_inverse(iph))) {
		cport = pptr[0];
		caddr = &iph->saddr;
		vport = pptr[1];
		vaddr = &iph->daddr;
	} else {
		cport = pptr[1];
		caddr = &iph->daddr;
		vport = pptr[0];
		vaddr = &iph->saddr;
	}

	/*
	 * FTPDATA needs this check when using local real server.
	 * Never schedule Active FTPDATA connections from real server.
	 * For LVS-NAT they must be already created. For other methods
	 * with persistence the connection is created on SYN+ACK.
	 */
	if (cport == FTPDATA) {
		IP_VS_DBG_PKT(12, svc->af, pp, skb, iph->off,
			      "Not scheduling FTPDATA");
		return NULL;
	}

	/*
	 *    Do not schedule replies from local real server.
	 */
	if ((!skb->dev || skb->dev->flags & IFF_LOOPBACK)) {
		/* look the packet up in the opposite direction, then
		 * restore the flag
		 */
		iph->hdr_flags ^= IP_VS_HDR_INVERSE;
		cp = pp->conn_in_get(svc->ipvs, svc->af, skb, iph);
		iph->hdr_flags ^= IP_VS_HDR_INVERSE;

		if (cp) {
			IP_VS_DBG_PKT(12, svc->af, pp, skb, iph->off,
				      "Not scheduling reply for existing"
				      " connection");
			__ip_vs_conn_put(cp);
			return NULL;
		}
	}

	/*
	 *    Persistent service
	 */
	if (svc->flags & IP_VS_SVC_F_PERSISTENT)
		return ip_vs_sched_persist(svc, skb, cport, vport, ignored,
					   iph);

	*ignored = 0;

	/*
	 *    Non-persistent service
	 */
	if (!svc->fwmark && vport != svc->port) {
		if (!svc->port)
			pr_err("Schedule: port zero only supported "
			       "in persistent services, "
			       "check your ipvs configuration\n");
		return NULL;
	}

	sched = rcu_dereference(svc->scheduler);
	if (sched) {
		/* read svc->sched_data after svc->scheduler */
		smp_rmb();
		dest = sched->schedule(svc, skb, iph);
	} else {
		dest = NULL;
	}
	if (dest == NULL) {
		IP_VS_DBG(1, "Schedule: no dest found.\n");
		return NULL;
	}

	flags = (svc->flags & IP_VS_SVC_F_ONEPACKET
		 && iph->protocol == IPPROTO_UDP) ?
		IP_VS_CONN_F_ONE_PACKET : 0;

	/*
	 *    Create a connection entry.
	 */
	{
		struct ip_vs_conn_param p;

		ip_vs_conn_fill_param(svc->ipvs, svc->af, iph->protocol,
				      caddr, cport, vaddr, vport, &p);
		/* a destination without a port keeps the virtual port */
		cp = ip_vs_conn_new(&p, dest->af, &dest->addr,
				    dest->port ? dest->port : vport,
				    flags, dest, skb->mark);
		if (!cp) {
			*ignored = -1;
			return NULL;
		}
	}

	IP_VS_DBG_BUF(6, "Schedule fwd:%c c:%s:%u v:%s:%u "
		      "d:%s:%u conn->flags:%X conn->refcnt:%d\n",
		      ip_vs_fwd_tag(cp),
		      IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport),
		      IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport),
		      IP_VS_DBG_ADDR(cp->daf, &cp->daddr), ntohs(cp->dport),
		      cp->flags, refcount_read(&cp->refcnt));

	ip_vs_conn_stats(cp, svc);
	return cp;
}

/*
 *	Non-zero when addr is unicast: for AF_INET6 (when compiled in) as
 *	classified by ipv6_addr_type(), otherwise when inet_addr_type()
 *	reports RTN_UNICAST for the IPv4 address.
 */
static inline int ip_vs_addr_is_unicast(struct net *net, int af,
					union nf_inet_addr *addr)
{
#ifdef CONFIG_IP_VS_IPV6
	if (af == AF_INET6)
		return ipv6_addr_type(&addr->in6) & IPV6_ADDR_UNICAST;
#endif
	return (inet_addr_type(net, addr->ip) == RTN_UNICAST);
}

/*
 *  Pass or drop the packet.
 *  Called by ip_vs_in, when the virtual service is available but
 *  no destination is available for a new connection.
 *
 *  Returns a netfilter verdict: the result of the bypass transmit,
 *  NF_ACCEPT for non-FTP ports of an FTP virtual service, else NF_DROP
 *  (after sending a port-unreachable ICMP error unless the packet is
 *  itself ICMP-related).
 */
int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
		struct ip_vs_proto_data *pd, struct ip_vs_iphdr *iph)
{
	__be16 _ports[2], *pptr, dport;
	struct netns_ipvs *ipvs = svc->ipvs;
	struct net *net = ipvs->net;

	pptr = frag_safe_skb_hp(skb, iph->len, sizeof(_ports), _ports);
	if (!pptr)
		return NF_DROP;
	dport = likely(!ip_vs_iph_inverse(iph)) ? pptr[1] : pptr[0];

	/* if it is fwmark-based service, the cache_bypass sysctl is up
	   and the destination is a non-local unicast, then create
	   a cache_bypass connection entry */
	if (sysctl_cache_bypass(ipvs) && svc->fwmark &&
	    !(iph->hdr_flags & (IP_VS_HDR_INVERSE | IP_VS_HDR_ICMP)) &&
	    ip_vs_addr_is_unicast(net, svc->af, &iph->daddr)) {
		int ret;
		struct ip_vs_conn *cp;
		unsigned int flags = (svc->flags & IP_VS_SVC_F_ONEPACKET &&
				      iph->protocol == IPPROTO_UDP) ?
				      IP_VS_CONN_F_ONE_PACKET : 0;
		union nf_inet_addr daddr =  { .all = { 0, 0, 0, 0 } };

		/* create a new connection entry */
		IP_VS_DBG(6, "%s(): create a cache_bypass entry\n", __func__);
		{
			struct ip_vs_conn_param p;
			ip_vs_conn_fill_param(svc->ipvs, svc->af, iph->protocol,
					      &iph->saddr, pptr[0],
					      &iph->daddr, pptr[1], &p);
			/* bypass entry: zero daddr/dport and no dest */
			cp = ip_vs_conn_new(&p, svc->af, &daddr, 0,
					    IP_VS_CONN_F_BYPASS | flags,
					    NULL, skb->mark);
			if (!cp)
				return NF_DROP;
		}

		/* statistics */
		ip_vs_in_stats(cp, skb);

		/* set state */
		ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pd);

		/* transmit the first SYN packet */
		ret = cp->packet_xmit(skb, cp, pd->pp, iph);
		/* do not touch skb anymore */

		if ((cp->flags & IP_VS_CONN_F_ONE_PACKET) && cp->control)
			atomic_inc(&cp->control->in_pkts);
		else
			atomic_inc(&cp->in_pkts);
		ip_vs_conn_put(cp);
		return ret;
	}

	/*
	 * When the virtual ftp service is presented, packets destined
	 * for other services on the VIP may get here (except services
	 * listed in the ipvs table), pass the packets, because it is
	 * not ipvs job to decide to drop the packets.
	 */
	if (svc->port == FTPPORT && dport != FTPPORT)
		return NF_ACCEPT;

	/* no ICMP error is generated for an ICMP-related packet */
	if (unlikely(ip_vs_iph_icmp(iph)))
		return NF_DROP;

	/*
	 * Notify the client that the destination is unreachable, and
	 * release the socket buffer.
	 * Since it is in IP layer, the TCP socket is not actually
	 * created, the TCP RST packet cannot be sent, instead that
	 * ICMP_PORT_UNREACH is sent here no matter it is TCP/UDP. --WZ
	 */
#ifdef CONFIG_IP_VS_IPV6
	if (svc->af == AF_INET6) {
		if (!skb->dev)
			skb->dev = net->loopback_dev;
		icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0);
	} else
#endif
		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);

	return NF_DROP;
}

/*
 * Accessors for per-netns sysctl values. Without CONFIG_SYSCTL each of
 * them is a stub that returns 0.
 */
#ifdef CONFIG_SYSCTL

static int sysctl_snat_reroute(struct netns_ipvs *ipvs)
{
	return ipvs->sysctl_snat_reroute;
}

static int sysctl_nat_icmp_send(struct netns_ipvs *ipvs)
{
	return ipvs->sysctl_nat_icmp_send;
}

static int sysctl_expire_nodest_conn(struct netns_ipvs *ipvs)
{
	return ipvs->sysctl_expire_nodest_conn;
}

#else

static int sysctl_snat_reroute(struct netns_ipvs *ipvs) { return 0; }
static int sysctl_nat_icmp_send(struct netns_ipvs *ipvs) { return 0; }
static int sysctl_expire_nodest_conn(struct netns_ipvs *ipvs) { return 0; }

#endif

/*
 * Folded checksum over the skb data from 'offset' to the end of the
 * packet. Callers in this file treat a non-zero result as a failed
 * checksum.
 */
__sum16 ip_vs_checksum_complete(struct sk_buff *skb, int offset)
{
	return csum_fold(skb_checksum(skb, offset, skb->len - offset, 0));
}

/*
 * Select the defragmentation user id for a netfilter hook: LOCAL_IN and
 * FORWARD have their own ids, every other hook uses IP_DEFRAG_VS_OUT.
 */
static inline enum ip_defrag_users ip_vs_defrag_user(unsigned int hooknum)
{
	if (NF_INET_LOCAL_IN == hooknum)
		return IP_DEFRAG_VS_IN;
	if (NF_INET_FORWARD == hooknum)
		return IP_DEFRAG_VS_FWD;
	return IP_DEFRAG_VS_OUT;
}

/*
 * Run ip_defrag() on the skb with bottom halves disabled. When it
 * returns 0 the IP header checksum is recomputed. Returns ip_defrag()'s
 * result; callers in this file treat non-zero as "skb is gone".
 */
static inline int ip_vs_gather_frags(struct netns_ipvs *ipvs,
				     struct sk_buff *skb, u_int32_t user)
{
	int err;

	local_bh_disable();
	err = ip_defrag(ipvs->net, skb, user);
	local_bh_enable();
	if (!err)
		ip_send_check(ip_hdr(skb));

	return err;
}

/*
 * Re-route a packet whose source address has been rewritten.
 * Does nothing (returns 0) when the snat_reroute sysctl is off or on the
 * LOCAL_IN hook. Returns 1 when re-routing was attempted and failed,
 * 0 otherwise.
 */
static int ip_vs_route_me_harder(struct netns_ipvs *ipvs, int af,
				 struct sk_buff *skb, unsigned int hooknum)
{
	if (!sysctl_snat_reroute(ipvs))
		return 0;
	/* Reroute replies only to remote clients (FORWARD and LOCAL_OUT) */
	if (NF_INET_LOCAL_IN == hooknum)
		return 0;
#ifdef CONFIG_IP_VS_IPV6
	if (af == AF_INET6) {
		struct dst_entry *dst = skb_dst(skb);

		/* skip packets routed to a loopback device */
		if (dst->dev && !(dst->dev->flags & IFF_LOOPBACK) &&
		    ip6_route_me_harder(ipvs->net, skb) != 0)
			return 1;
	} else
#endif
		/* skip packets whose route is flagged RTCF_LOCAL */
		if (!(skb_rtable(skb)->rt_flags & RTCF_LOCAL) &&
		    ip_route_me_harder(ipvs->net, skb, RTN_LOCAL) != 0)
			return 1;

	return 0;
}

/*
 * Packet has been made sufficiently writable in caller
 * - inout: 1=in->out, 0=out->in
 *
 * Rewrites the outer IPv4 header, the IPv4 header embedded in the ICMP
 * payload and, for TCP/UDP/SCTP, the embedded port, then recomputes the
 * ICMP checksum over the whole message.
 */
void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp,
		    struct ip_vs_conn *cp, int inout)
{
	struct iphdr *iph	 = ip_hdr(skb);
	unsigned int icmp_offset = iph->ihl*4;
	struct icmphdr *icmph	 = (struct icmphdr *)(skb_network_header(skb) +
						      icmp_offset);
	/* the embedded header starts right after the ICMP header */
	struct iphdr *ciph	 = (struct iphdr *)(icmph + 1);

	if (inout) {
		iph->saddr = cp->vaddr.ip;
		ip_send_check(iph);
		ciph->daddr = cp->vaddr.ip;
		ip_send_check(ciph);
	} else {
		iph->daddr = cp->daddr.ip;
		ip_send_check(iph);
		ciph->saddr = cp->daddr.ip;
		ip_send_check(ciph);
	}

	/* the TCP/UDP/SCTP port */
	if (IPPROTO_TCP == ciph->protocol || IPPROTO_UDP == ciph->protocol ||
	    IPPROTO_SCTP == ciph->protocol) {
		__be16 *ports = (void *)ciph + ciph->ihl*4;

		if (inout)
			ports[1] = cp->vport;
		else
			ports[0] = cp->dport;
	}

	/* And finally the ICMP checksum */
	icmph->checksum = 0;
	icmph->checksum = ip_vs_checksum_complete(skb, icmp_offset);
	skb->ip_summed = CHECKSUM_UNNECESSARY;

	if (inout)
		IP_VS_DBG_PKT(11, AF_INET, pp, skb, (void *)ciph - (void *)iph,
			"Forwarding altered outgoing ICMP");
	else
		IP_VS_DBG_PKT(11, AF_INET, pp, skb, (void *)ciph - (void *)iph,
			"Forwarding altered incoming ICMP");
}

#ifdef CONFIG_IP_VS_IPV6
/*
 * IPv6 variant: rewrites the outer and embedded IPv6 addresses and, for
 * an unfragmented embedded TCP/UDP/SCTP header, the port. The ICMPv6
 * checksum is not computed here: the pseudo-header sum is stored and
 * the skb is marked CHECKSUM_PARTIAL.
 * - inout: 1 rewrites towards cp->vaddr/vport, 0 towards cp->daddr/dport
 */
void ip_vs_nat_icmp_v6(struct sk_buff *skb, struct ip_vs_protocol *pp,
		    struct ip_vs_conn *cp, int inout)
{
	struct ipv6hdr *iph	 = ipv6_hdr(skb);
	unsigned int icmp_offset = 0;
	unsigned int offs	 = 0; /* header offset*/
	int protocol;
	struct icmp6hdr *icmph;
	struct ipv6hdr *ciph;
	unsigned short fragoffs;

	/* locate the ICMPv6 header; its offset lands in icmp_offset */
	ipv6_find_hdr(skb, &icmp_offset, IPPROTO_ICMPV6, &fragoffs, NULL);
	icmph = (struct icmp6hdr *)(skb_network_header(skb) + icmp_offset);
	offs = icmp_offset + sizeof(struct icmp6hdr);
	ciph = (struct ipv6hdr *)(skb_network_header(skb) + offs);

	/* starting at the embedded header, advance offs to the header
	 * that ipv6_find_hdr() reports as 'protocol'
	 */
	protocol = ipv6_find_hdr(skb, &offs, -1, &fragoffs, NULL);

	if (inout) {
		iph->saddr = cp->vaddr.in6;
		ciph->daddr = cp->vaddr.in6;
	} else {
		iph->daddr = cp->daddr.in6;
		ciph->saddr = cp->daddr.in6;
	}

	/* the TCP/UDP/SCTP port */
	if (!fragoffs && (IPPROTO_TCP == protocol || IPPROTO_UDP == protocol ||
			  IPPROTO_SCTP == protocol)) {
		__be16 *ports = (void *)(skb_network_header(skb) + offs);

		IP_VS_DBG(11, "%s() changed port %d to %d\n", __func__,
			  ntohs(inout ? ports[1] : ports[0]),
			  ntohs(inout ? cp->vport : cp->dport));
		if (inout)
			ports[1] = cp->vport;
		else
			ports[0] = cp->dport;
	}

	/* And finally the ICMP checksum */
	icmph->icmp6_cksum = ~csum_ipv6_magic(&iph->saddr, &iph->daddr,
					      skb->len - icmp_offset,
					      IPPROTO_ICMPV6, 0);
	skb->csum_start = skb_network_header(skb) - skb->head + icmp_offset;
	skb->csum_offset = offsetof(struct icmp6hdr, icmp6_cksum);
	skb->ip_summed = CHECKSUM_PARTIAL;

	if (inout)
		IP_VS_DBG_PKT(11, AF_INET6, pp, skb,
			      (void *)ciph - (void *)iph,
			      "Forwarding altered outgoing ICMPv6");
	else
		IP_VS_DBG_PKT(11, AF_INET6, pp, skb,
			      (void *)ciph - (void *)iph,
			      "Forwarding altered incoming ICMPv6");
}
#endif

/* Handle relevant response ICMP messages - forward to the right
 * destination host.
838 */ 839 static int handle_response_icmp(int af, struct sk_buff *skb, 840 union nf_inet_addr *snet, 841 __u8 protocol, struct ip_vs_conn *cp, 842 struct ip_vs_protocol *pp, 843 unsigned int offset, unsigned int ihl, 844 unsigned int hooknum) 845 { 846 unsigned int verdict = NF_DROP; 847 848 if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) 849 goto ignore_cp; 850 851 /* Ensure the checksum is correct */ 852 if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) { 853 /* Failed checksum! */ 854 IP_VS_DBG_BUF(1, "Forward ICMP: failed checksum from %s!\n", 855 IP_VS_DBG_ADDR(af, snet)); 856 goto out; 857 } 858 859 if (IPPROTO_TCP == protocol || IPPROTO_UDP == protocol || 860 IPPROTO_SCTP == protocol) 861 offset += 2 * sizeof(__u16); 862 if (!skb_make_writable(skb, offset)) 863 goto out; 864 865 #ifdef CONFIG_IP_VS_IPV6 866 if (af == AF_INET6) 867 ip_vs_nat_icmp_v6(skb, pp, cp, 1); 868 else 869 #endif 870 ip_vs_nat_icmp(skb, pp, cp, 1); 871 872 if (ip_vs_route_me_harder(cp->ipvs, af, skb, hooknum)) 873 goto out; 874 875 /* do the statistics and put it back */ 876 ip_vs_out_stats(cp, skb); 877 878 skb->ipvs_property = 1; 879 if (!(cp->flags & IP_VS_CONN_F_NFCT)) 880 ip_vs_notrack(skb); 881 else 882 ip_vs_update_conntrack(skb, cp, 0); 883 884 ignore_cp: 885 verdict = NF_ACCEPT; 886 887 out: 888 __ip_vs_conn_put(cp); 889 890 return verdict; 891 } 892 893 /* 894 * Handle ICMP messages in the inside-to-outside direction (outgoing). 895 * Find any that might be relevant, check against existing connections. 896 * Currently handles error types - unreachable, quench, ttl exceeded. 
897 */ 898 static int ip_vs_out_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, 899 int *related, unsigned int hooknum) 900 { 901 struct iphdr *iph; 902 struct icmphdr _icmph, *ic; 903 struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */ 904 struct ip_vs_iphdr ciph; 905 struct ip_vs_conn *cp; 906 struct ip_vs_protocol *pp; 907 unsigned int offset, ihl; 908 union nf_inet_addr snet; 909 910 *related = 1; 911 912 /* reassemble IP fragments */ 913 if (ip_is_fragment(ip_hdr(skb))) { 914 if (ip_vs_gather_frags(ipvs, skb, ip_vs_defrag_user(hooknum))) 915 return NF_STOLEN; 916 } 917 918 iph = ip_hdr(skb); 919 offset = ihl = iph->ihl * 4; 920 ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph); 921 if (ic == NULL) 922 return NF_DROP; 923 924 IP_VS_DBG(12, "Outgoing ICMP (%d,%d) %pI4->%pI4\n", 925 ic->type, ntohs(icmp_id(ic)), 926 &iph->saddr, &iph->daddr); 927 928 /* 929 * Work through seeing if this is for us. 930 * These checks are supposed to be in an order that means easy 931 * things are checked first to speed up processing.... however 932 * this means that some packets will manage to get a long way 933 * down this stack and then be rejected, but that's life. 934 */ 935 if ((ic->type != ICMP_DEST_UNREACH) && 936 (ic->type != ICMP_SOURCE_QUENCH) && 937 (ic->type != ICMP_TIME_EXCEEDED)) { 938 *related = 0; 939 return NF_ACCEPT; 940 } 941 942 /* Now find the contained IP header */ 943 offset += sizeof(_icmph); 944 cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph); 945 if (cih == NULL) 946 return NF_ACCEPT; /* The packet looks wrong, ignore */ 947 948 pp = ip_vs_proto_get(cih->protocol); 949 if (!pp) 950 return NF_ACCEPT; 951 952 /* Is the embedded protocol header present? 
*/ 953 if (unlikely(cih->frag_off & htons(IP_OFFSET) && 954 pp->dont_defrag)) 955 return NF_ACCEPT; 956 957 IP_VS_DBG_PKT(11, AF_INET, pp, skb, offset, 958 "Checking outgoing ICMP for"); 959 960 ip_vs_fill_iph_skb_icmp(AF_INET, skb, offset, true, &ciph); 961 962 /* The embedded headers contain source and dest in reverse order */ 963 cp = pp->conn_out_get(ipvs, AF_INET, skb, &ciph); 964 if (!cp) 965 return NF_ACCEPT; 966 967 snet.ip = iph->saddr; 968 return handle_response_icmp(AF_INET, skb, &snet, cih->protocol, cp, 969 pp, ciph.len, ihl, hooknum); 970 } 971 972 #ifdef CONFIG_IP_VS_IPV6 973 static int ip_vs_out_icmp_v6(struct netns_ipvs *ipvs, struct sk_buff *skb, 974 int *related, unsigned int hooknum, 975 struct ip_vs_iphdr *ipvsh) 976 { 977 struct icmp6hdr _icmph, *ic; 978 struct ip_vs_iphdr ciph = {.flags = 0, .fragoffs = 0};/*Contained IP */ 979 struct ip_vs_conn *cp; 980 struct ip_vs_protocol *pp; 981 union nf_inet_addr snet; 982 unsigned int offset; 983 984 *related = 1; 985 ic = frag_safe_skb_hp(skb, ipvsh->len, sizeof(_icmph), &_icmph); 986 if (ic == NULL) 987 return NF_DROP; 988 989 /* 990 * Work through seeing if this is for us. 991 * These checks are supposed to be in an order that means easy 992 * things are checked first to speed up processing.... however 993 * this means that some packets will manage to get a long way 994 * down this stack and then be rejected, but that's life. 995 */ 996 if (ic->icmp6_type & ICMPV6_INFOMSG_MASK) { 997 *related = 0; 998 return NF_ACCEPT; 999 } 1000 /* Fragment header that is before ICMP header tells us that: 1001 * it's not an error message since they can't be fragmented. 
1002 */ 1003 if (ipvsh->flags & IP6_FH_F_FRAG) 1004 return NF_DROP; 1005 1006 IP_VS_DBG(8, "Outgoing ICMPv6 (%d,%d) %pI6c->%pI6c\n", 1007 ic->icmp6_type, ntohs(icmpv6_id(ic)), 1008 &ipvsh->saddr, &ipvsh->daddr); 1009 1010 if (!ip_vs_fill_iph_skb_icmp(AF_INET6, skb, ipvsh->len + sizeof(_icmph), 1011 true, &ciph)) 1012 return NF_ACCEPT; /* The packet looks wrong, ignore */ 1013 1014 pp = ip_vs_proto_get(ciph.protocol); 1015 if (!pp) 1016 return NF_ACCEPT; 1017 1018 /* The embedded headers contain source and dest in reverse order */ 1019 cp = pp->conn_out_get(ipvs, AF_INET6, skb, &ciph); 1020 if (!cp) 1021 return NF_ACCEPT; 1022 1023 snet.in6 = ciph.saddr.in6; 1024 offset = ciph.len; 1025 return handle_response_icmp(AF_INET6, skb, &snet, ciph.protocol, cp, 1026 pp, offset, sizeof(struct ipv6hdr), 1027 hooknum); 1028 } 1029 #endif 1030 1031 /* 1032 * Check if sctp chunc is ABORT chunk 1033 */ 1034 static inline int is_sctp_abort(const struct sk_buff *skb, int nh_len) 1035 { 1036 struct sctp_chunkhdr *sch, schunk; 1037 sch = skb_header_pointer(skb, nh_len + sizeof(struct sctphdr), 1038 sizeof(schunk), &schunk); 1039 if (sch == NULL) 1040 return 0; 1041 if (sch->type == SCTP_CID_ABORT) 1042 return 1; 1043 return 0; 1044 } 1045 1046 static inline int is_tcp_reset(const struct sk_buff *skb, int nh_len) 1047 { 1048 struct tcphdr _tcph, *th; 1049 1050 th = skb_header_pointer(skb, nh_len, sizeof(_tcph), &_tcph); 1051 if (th == NULL) 1052 return 0; 1053 return th->rst; 1054 } 1055 1056 static inline bool is_new_conn(const struct sk_buff *skb, 1057 struct ip_vs_iphdr *iph) 1058 { 1059 switch (iph->protocol) { 1060 case IPPROTO_TCP: { 1061 struct tcphdr _tcph, *th; 1062 1063 th = skb_header_pointer(skb, iph->len, sizeof(_tcph), &_tcph); 1064 if (th == NULL) 1065 return false; 1066 return th->syn; 1067 } 1068 case IPPROTO_SCTP: { 1069 struct sctp_chunkhdr *sch, schunk; 1070 1071 sch = skb_header_pointer(skb, iph->len + sizeof(struct sctphdr), 1072 sizeof(schunk), &schunk); 1073 
if (sch == NULL) 1074 return false; 1075 return sch->type == SCTP_CID_INIT; 1076 } 1077 default: 1078 return false; 1079 } 1080 } 1081 1082 static inline bool is_new_conn_expected(const struct ip_vs_conn *cp, 1083 int conn_reuse_mode) 1084 { 1085 /* Controlled (FTP DATA or persistence)? */ 1086 if (cp->control) 1087 return false; 1088 1089 switch (cp->protocol) { 1090 case IPPROTO_TCP: 1091 return (cp->state == IP_VS_TCP_S_TIME_WAIT) || 1092 (cp->state == IP_VS_TCP_S_CLOSE) || 1093 ((conn_reuse_mode & 2) && 1094 (cp->state == IP_VS_TCP_S_FIN_WAIT) && 1095 (cp->flags & IP_VS_CONN_F_NOOUTPUT)); 1096 case IPPROTO_SCTP: 1097 return cp->state == IP_VS_SCTP_S_CLOSED; 1098 default: 1099 return false; 1100 } 1101 } 1102 1103 /* Generic function to create new connections for outgoing RS packets 1104 * 1105 * Pre-requisites for successful connection creation: 1106 * 1) Virtual Service is NOT fwmark based: 1107 * In fwmark-VS actual vaddr and vport are unknown to IPVS 1108 * 2) Real Server and Virtual Service were NOT configured without port: 1109 * This is to allow match of different VS to the same RS ip-addr 1110 */ 1111 struct ip_vs_conn *ip_vs_new_conn_out(struct ip_vs_service *svc, 1112 struct ip_vs_dest *dest, 1113 struct sk_buff *skb, 1114 const struct ip_vs_iphdr *iph, 1115 __be16 dport, 1116 __be16 cport) 1117 { 1118 struct ip_vs_conn_param param; 1119 struct ip_vs_conn *ct = NULL, *cp = NULL; 1120 const union nf_inet_addr *vaddr, *daddr, *caddr; 1121 union nf_inet_addr snet; 1122 __be16 vport; 1123 unsigned int flags; 1124 1125 EnterFunction(12); 1126 vaddr = &svc->addr; 1127 vport = svc->port; 1128 daddr = &iph->saddr; 1129 caddr = &iph->daddr; 1130 1131 /* check pre-requisites are satisfied */ 1132 if (svc->fwmark) 1133 return NULL; 1134 if (!vport || !dport) 1135 return NULL; 1136 1137 /* for persistent service first create connection template */ 1138 if (svc->flags & IP_VS_SVC_F_PERSISTENT) { 1139 /* apply netmask the same way ingress-side does */ 1140 #ifdef 
CONFIG_IP_VS_IPV6 1141 if (svc->af == AF_INET6) 1142 ipv6_addr_prefix(&snet.in6, &caddr->in6, 1143 (__force __u32)svc->netmask); 1144 else 1145 #endif 1146 snet.ip = caddr->ip & svc->netmask; 1147 /* fill params and create template if not existent */ 1148 if (ip_vs_conn_fill_param_persist(svc, skb, iph->protocol, 1149 &snet, 0, vaddr, 1150 vport, ¶m) < 0) 1151 return NULL; 1152 ct = ip_vs_ct_in_get(¶m); 1153 /* check if template exists and points to the same dest */ 1154 if (!ct || !ip_vs_check_template(ct, dest)) { 1155 ct = ip_vs_conn_new(¶m, dest->af, daddr, dport, 1156 IP_VS_CONN_F_TEMPLATE, dest, 0); 1157 if (!ct) { 1158 kfree(param.pe_data); 1159 return NULL; 1160 } 1161 ct->timeout = svc->timeout; 1162 } else { 1163 kfree(param.pe_data); 1164 } 1165 } 1166 1167 /* connection flags */ 1168 flags = ((svc->flags & IP_VS_SVC_F_ONEPACKET) && 1169 iph->protocol == IPPROTO_UDP) ? IP_VS_CONN_F_ONE_PACKET : 0; 1170 /* create connection */ 1171 ip_vs_conn_fill_param(svc->ipvs, svc->af, iph->protocol, 1172 caddr, cport, vaddr, vport, ¶m); 1173 cp = ip_vs_conn_new(¶m, dest->af, daddr, dport, flags, dest, 0); 1174 if (!cp) { 1175 if (ct) 1176 ip_vs_conn_put(ct); 1177 return NULL; 1178 } 1179 if (ct) { 1180 ip_vs_control_add(cp, ct); 1181 ip_vs_conn_put(ct); 1182 } 1183 ip_vs_conn_stats(cp, svc); 1184 1185 /* return connection (will be used to handle outgoing packet) */ 1186 IP_VS_DBG_BUF(6, "New connection RS-initiated:%c c:%s:%u v:%s:%u " 1187 "d:%s:%u conn->flags:%X conn->refcnt:%d\n", 1188 ip_vs_fwd_tag(cp), 1189 IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), 1190 IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport), 1191 IP_VS_DBG_ADDR(cp->af, &cp->daddr), ntohs(cp->dport), 1192 cp->flags, refcount_read(&cp->refcnt)); 1193 LeaveFunction(12); 1194 return cp; 1195 } 1196 1197 /* Handle outgoing packets which are considered requests initiated by 1198 * real servers, so that subsequent responses from external client can be 1199 * routed to the right real server. 
1200 * Used also for outgoing responses in OPS mode. 1201 * 1202 * Connection management is handled by persistent-engine specific callback. 1203 */ 1204 static struct ip_vs_conn *__ip_vs_rs_conn_out(unsigned int hooknum, 1205 struct netns_ipvs *ipvs, 1206 int af, struct sk_buff *skb, 1207 const struct ip_vs_iphdr *iph) 1208 { 1209 struct ip_vs_dest *dest; 1210 struct ip_vs_conn *cp = NULL; 1211 __be16 _ports[2], *pptr; 1212 1213 if (hooknum == NF_INET_LOCAL_IN) 1214 return NULL; 1215 1216 pptr = frag_safe_skb_hp(skb, iph->len, 1217 sizeof(_ports), _ports); 1218 if (!pptr) 1219 return NULL; 1220 1221 dest = ip_vs_find_real_service(ipvs, af, iph->protocol, 1222 &iph->saddr, pptr[0]); 1223 if (dest) { 1224 struct ip_vs_service *svc; 1225 struct ip_vs_pe *pe; 1226 1227 svc = rcu_dereference(dest->svc); 1228 if (svc) { 1229 pe = rcu_dereference(svc->pe); 1230 if (pe && pe->conn_out) 1231 cp = pe->conn_out(svc, dest, skb, iph, 1232 pptr[0], pptr[1]); 1233 } 1234 } 1235 1236 return cp; 1237 } 1238 1239 /* Handle response packets: rewrite addresses and send away... 1240 */ 1241 static unsigned int 1242 handle_response(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, 1243 struct ip_vs_conn *cp, struct ip_vs_iphdr *iph, 1244 unsigned int hooknum) 1245 { 1246 struct ip_vs_protocol *pp = pd->pp; 1247 1248 IP_VS_DBG_PKT(11, af, pp, skb, iph->off, "Outgoing packet"); 1249 1250 if (!skb_make_writable(skb, iph->len)) 1251 goto drop; 1252 1253 /* mangle the packet */ 1254 if (pp->snat_handler && !pp->snat_handler(skb, pp, cp, iph)) 1255 goto drop; 1256 1257 #ifdef CONFIG_IP_VS_IPV6 1258 if (af == AF_INET6) 1259 ipv6_hdr(skb)->saddr = cp->vaddr.in6; 1260 else 1261 #endif 1262 { 1263 ip_hdr(skb)->saddr = cp->vaddr.ip; 1264 ip_send_check(ip_hdr(skb)); 1265 } 1266 1267 /* 1268 * nf_iterate does not expect change in the skb->dst->dev. 
1269 * It looks like it is not fatal to enable this code for hooks 1270 * where our handlers are at the end of the chain list and 1271 * when all next handlers use skb->dst->dev and not outdev. 1272 * It will definitely route properly the inout NAT traffic 1273 * when multiple paths are used. 1274 */ 1275 1276 /* For policy routing, packets originating from this 1277 * machine itself may be routed differently to packets 1278 * passing through. We want this packet to be routed as 1279 * if it came from this machine itself. So re-compute 1280 * the routing information. 1281 */ 1282 if (ip_vs_route_me_harder(cp->ipvs, af, skb, hooknum)) 1283 goto drop; 1284 1285 IP_VS_DBG_PKT(10, af, pp, skb, iph->off, "After SNAT"); 1286 1287 ip_vs_out_stats(cp, skb); 1288 ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pd); 1289 skb->ipvs_property = 1; 1290 if (!(cp->flags & IP_VS_CONN_F_NFCT)) 1291 ip_vs_notrack(skb); 1292 else 1293 ip_vs_update_conntrack(skb, cp, 0); 1294 ip_vs_conn_put(cp); 1295 1296 LeaveFunction(11); 1297 return NF_ACCEPT; 1298 1299 drop: 1300 ip_vs_conn_put(cp); 1301 kfree_skb(skb); 1302 LeaveFunction(11); 1303 return NF_STOLEN; 1304 } 1305 1306 /* 1307 * Check if outgoing packet belongs to the established ip_vs_conn. 1308 */ 1309 static unsigned int 1310 ip_vs_out(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int af) 1311 { 1312 struct ip_vs_iphdr iph; 1313 struct ip_vs_protocol *pp; 1314 struct ip_vs_proto_data *pd; 1315 struct ip_vs_conn *cp; 1316 struct sock *sk; 1317 1318 EnterFunction(11); 1319 1320 /* Already marked as IPVS request or reply? */ 1321 if (skb->ipvs_property) 1322 return NF_ACCEPT; 1323 1324 sk = skb_to_full_sk(skb); 1325 /* Bad... 
Do not break raw sockets */ 1326 if (unlikely(sk && hooknum == NF_INET_LOCAL_OUT && 1327 af == AF_INET)) { 1328 1329 if (sk->sk_family == PF_INET && inet_sk(sk)->nodefrag) 1330 return NF_ACCEPT; 1331 } 1332 1333 if (unlikely(!skb_dst(skb))) 1334 return NF_ACCEPT; 1335 1336 if (!ipvs->enable) 1337 return NF_ACCEPT; 1338 1339 ip_vs_fill_iph_skb(af, skb, false, &iph); 1340 #ifdef CONFIG_IP_VS_IPV6 1341 if (af == AF_INET6) { 1342 if (unlikely(iph.protocol == IPPROTO_ICMPV6)) { 1343 int related; 1344 int verdict = ip_vs_out_icmp_v6(ipvs, skb, &related, 1345 hooknum, &iph); 1346 1347 if (related) 1348 return verdict; 1349 } 1350 } else 1351 #endif 1352 if (unlikely(iph.protocol == IPPROTO_ICMP)) { 1353 int related; 1354 int verdict = ip_vs_out_icmp(ipvs, skb, &related, hooknum); 1355 1356 if (related) 1357 return verdict; 1358 } 1359 1360 pd = ip_vs_proto_data_get(ipvs, iph.protocol); 1361 if (unlikely(!pd)) 1362 return NF_ACCEPT; 1363 pp = pd->pp; 1364 1365 /* reassemble IP fragments */ 1366 #ifdef CONFIG_IP_VS_IPV6 1367 if (af == AF_INET) 1368 #endif 1369 if (unlikely(ip_is_fragment(ip_hdr(skb)) && !pp->dont_defrag)) { 1370 if (ip_vs_gather_frags(ipvs, skb, 1371 ip_vs_defrag_user(hooknum))) 1372 return NF_STOLEN; 1373 1374 ip_vs_fill_iph_skb(AF_INET, skb, false, &iph); 1375 } 1376 1377 /* 1378 * Check if the packet belongs to an existing entry 1379 */ 1380 cp = pp->conn_out_get(ipvs, af, skb, &iph); 1381 1382 if (likely(cp)) { 1383 if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) 1384 goto ignore_cp; 1385 return handle_response(af, skb, pd, cp, &iph, hooknum); 1386 } 1387 1388 /* Check for real-server-started requests */ 1389 if (atomic_read(&ipvs->conn_out_counter)) { 1390 /* Currently only for UDP: 1391 * connection oriented protocols typically use 1392 * ephemeral ports for outgoing connections, so 1393 * related incoming responses would not match any VS 1394 */ 1395 if (pp->protocol == IPPROTO_UDP) { 1396 cp = __ip_vs_rs_conn_out(hooknum, ipvs, af, skb, &iph); 1397 
if (likely(cp)) 1398 return handle_response(af, skb, pd, cp, &iph, 1399 hooknum); 1400 } 1401 } 1402 1403 if (sysctl_nat_icmp_send(ipvs) && 1404 (pp->protocol == IPPROTO_TCP || 1405 pp->protocol == IPPROTO_UDP || 1406 pp->protocol == IPPROTO_SCTP)) { 1407 __be16 _ports[2], *pptr; 1408 1409 pptr = frag_safe_skb_hp(skb, iph.len, 1410 sizeof(_ports), _ports); 1411 if (pptr == NULL) 1412 return NF_ACCEPT; /* Not for me */ 1413 if (ip_vs_has_real_service(ipvs, af, iph.protocol, &iph.saddr, 1414 pptr[0])) { 1415 /* 1416 * Notify the real server: there is no 1417 * existing entry if it is not RST 1418 * packet or not TCP packet. 1419 */ 1420 if ((iph.protocol != IPPROTO_TCP && 1421 iph.protocol != IPPROTO_SCTP) 1422 || ((iph.protocol == IPPROTO_TCP 1423 && !is_tcp_reset(skb, iph.len)) 1424 || (iph.protocol == IPPROTO_SCTP 1425 && !is_sctp_abort(skb, 1426 iph.len)))) { 1427 #ifdef CONFIG_IP_VS_IPV6 1428 if (af == AF_INET6) { 1429 if (!skb->dev) 1430 skb->dev = ipvs->net->loopback_dev; 1431 icmpv6_send(skb, 1432 ICMPV6_DEST_UNREACH, 1433 ICMPV6_PORT_UNREACH, 1434 0); 1435 } else 1436 #endif 1437 icmp_send(skb, 1438 ICMP_DEST_UNREACH, 1439 ICMP_PORT_UNREACH, 0); 1440 return NF_DROP; 1441 } 1442 } 1443 } 1444 1445 out: 1446 IP_VS_DBG_PKT(12, af, pp, skb, iph.off, 1447 "ip_vs_out: packet continues traversal as normal"); 1448 return NF_ACCEPT; 1449 1450 ignore_cp: 1451 __ip_vs_conn_put(cp); 1452 goto out; 1453 } 1454 1455 /* 1456 * It is hooked at the NF_INET_FORWARD and NF_INET_LOCAL_IN chain, 1457 * used only for VS/NAT. 1458 * Check if packet is reply for established ip_vs_conn. 1459 */ 1460 static unsigned int 1461 ip_vs_reply4(void *priv, struct sk_buff *skb, 1462 const struct nf_hook_state *state) 1463 { 1464 return ip_vs_out(net_ipvs(state->net), state->hook, skb, AF_INET); 1465 } 1466 1467 /* 1468 * It is hooked at the NF_INET_LOCAL_OUT chain, used only for VS/NAT. 1469 * Check if packet is reply for established ip_vs_conn. 
1470 */ 1471 static unsigned int 1472 ip_vs_local_reply4(void *priv, struct sk_buff *skb, 1473 const struct nf_hook_state *state) 1474 { 1475 return ip_vs_out(net_ipvs(state->net), state->hook, skb, AF_INET); 1476 } 1477 1478 #ifdef CONFIG_IP_VS_IPV6 1479 1480 /* 1481 * It is hooked at the NF_INET_FORWARD and NF_INET_LOCAL_IN chain, 1482 * used only for VS/NAT. 1483 * Check if packet is reply for established ip_vs_conn. 1484 */ 1485 static unsigned int 1486 ip_vs_reply6(void *priv, struct sk_buff *skb, 1487 const struct nf_hook_state *state) 1488 { 1489 return ip_vs_out(net_ipvs(state->net), state->hook, skb, AF_INET6); 1490 } 1491 1492 /* 1493 * It is hooked at the NF_INET_LOCAL_OUT chain, used only for VS/NAT. 1494 * Check if packet is reply for established ip_vs_conn. 1495 */ 1496 static unsigned int 1497 ip_vs_local_reply6(void *priv, struct sk_buff *skb, 1498 const struct nf_hook_state *state) 1499 { 1500 return ip_vs_out(net_ipvs(state->net), state->hook, skb, AF_INET6); 1501 } 1502 1503 #endif 1504 1505 static unsigned int 1506 ip_vs_try_to_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb, 1507 struct ip_vs_proto_data *pd, 1508 int *verdict, struct ip_vs_conn **cpp, 1509 struct ip_vs_iphdr *iph) 1510 { 1511 struct ip_vs_protocol *pp = pd->pp; 1512 1513 if (!iph->fragoffs) { 1514 /* No (second) fragments need to enter here, as nf_defrag_ipv6 1515 * replayed fragment zero will already have created the cp 1516 */ 1517 1518 /* Schedule and create new connection entry into cpp */ 1519 if (!pp->conn_schedule(ipvs, af, skb, pd, verdict, cpp, iph)) 1520 return 0; 1521 } 1522 1523 if (unlikely(!*cpp)) { 1524 /* sorry, all this trouble for a no-hit :) */ 1525 IP_VS_DBG_PKT(12, af, pp, skb, iph->off, 1526 "ip_vs_in: packet continues traversal as normal"); 1527 if (iph->fragoffs) { 1528 /* Fragment that couldn't be mapped to a conn entry 1529 * is missing module nf_defrag_ipv6 1530 */ 1531 IP_VS_DBG_RL("Unhandled frag, load nf_defrag_ipv6\n"); 1532 
IP_VS_DBG_PKT(7, af, pp, skb, iph->off, 1533 "unhandled fragment"); 1534 } 1535 *verdict = NF_ACCEPT; 1536 return 0; 1537 } 1538 1539 return 1; 1540 } 1541 1542 /* 1543 * Handle ICMP messages in the outside-to-inside direction (incoming). 1544 * Find any that might be relevant, check against existing connections, 1545 * forward to the right destination host if relevant. 1546 * Currently handles error types - unreachable, quench, ttl exceeded. 1547 */ 1548 static int 1549 ip_vs_in_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related, 1550 unsigned int hooknum) 1551 { 1552 struct iphdr *iph; 1553 struct icmphdr _icmph, *ic; 1554 struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */ 1555 struct ip_vs_iphdr ciph; 1556 struct ip_vs_conn *cp; 1557 struct ip_vs_protocol *pp; 1558 struct ip_vs_proto_data *pd; 1559 unsigned int offset, offset2, ihl, verdict; 1560 bool ipip, new_cp = false; 1561 1562 *related = 1; 1563 1564 /* reassemble IP fragments */ 1565 if (ip_is_fragment(ip_hdr(skb))) { 1566 if (ip_vs_gather_frags(ipvs, skb, ip_vs_defrag_user(hooknum))) 1567 return NF_STOLEN; 1568 } 1569 1570 iph = ip_hdr(skb); 1571 offset = ihl = iph->ihl * 4; 1572 ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph); 1573 if (ic == NULL) 1574 return NF_DROP; 1575 1576 IP_VS_DBG(12, "Incoming ICMP (%d,%d) %pI4->%pI4\n", 1577 ic->type, ntohs(icmp_id(ic)), 1578 &iph->saddr, &iph->daddr); 1579 1580 /* 1581 * Work through seeing if this is for us. 1582 * These checks are supposed to be in an order that means easy 1583 * things are checked first to speed up processing.... however 1584 * this means that some packets will manage to get a long way 1585 * down this stack and then be rejected, but that's life. 
1586 */ 1587 if ((ic->type != ICMP_DEST_UNREACH) && 1588 (ic->type != ICMP_SOURCE_QUENCH) && 1589 (ic->type != ICMP_TIME_EXCEEDED)) { 1590 *related = 0; 1591 return NF_ACCEPT; 1592 } 1593 1594 /* Now find the contained IP header */ 1595 offset += sizeof(_icmph); 1596 cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph); 1597 if (cih == NULL) 1598 return NF_ACCEPT; /* The packet looks wrong, ignore */ 1599 1600 /* Special case for errors for IPIP packets */ 1601 ipip = false; 1602 if (cih->protocol == IPPROTO_IPIP) { 1603 if (unlikely(cih->frag_off & htons(IP_OFFSET))) 1604 return NF_ACCEPT; 1605 /* Error for our IPIP must arrive at LOCAL_IN */ 1606 if (!(skb_rtable(skb)->rt_flags & RTCF_LOCAL)) 1607 return NF_ACCEPT; 1608 offset += cih->ihl * 4; 1609 cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph); 1610 if (cih == NULL) 1611 return NF_ACCEPT; /* The packet looks wrong, ignore */ 1612 ipip = true; 1613 } 1614 1615 pd = ip_vs_proto_data_get(ipvs, cih->protocol); 1616 if (!pd) 1617 return NF_ACCEPT; 1618 pp = pd->pp; 1619 1620 /* Is the embedded protocol header present? */ 1621 if (unlikely(cih->frag_off & htons(IP_OFFSET) && 1622 pp->dont_defrag)) 1623 return NF_ACCEPT; 1624 1625 IP_VS_DBG_PKT(11, AF_INET, pp, skb, offset, 1626 "Checking incoming ICMP for"); 1627 1628 offset2 = offset; 1629 ip_vs_fill_iph_skb_icmp(AF_INET, skb, offset, !ipip, &ciph); 1630 offset = ciph.len; 1631 1632 /* The embedded headers contain source and dest in reverse order. 1633 * For IPIP this is error for request, not for reply. 
1634 */ 1635 cp = pp->conn_in_get(ipvs, AF_INET, skb, &ciph); 1636 1637 if (!cp) { 1638 int v; 1639 1640 if (!sysctl_schedule_icmp(ipvs)) 1641 return NF_ACCEPT; 1642 1643 if (!ip_vs_try_to_schedule(ipvs, AF_INET, skb, pd, &v, &cp, &ciph)) 1644 return v; 1645 new_cp = true; 1646 } 1647 1648 verdict = NF_DROP; 1649 1650 /* Ensure the checksum is correct */ 1651 if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) { 1652 /* Failed checksum! */ 1653 IP_VS_DBG(1, "Incoming ICMP: failed checksum from %pI4!\n", 1654 &iph->saddr); 1655 goto out; 1656 } 1657 1658 if (ipip) { 1659 __be32 info = ic->un.gateway; 1660 __u8 type = ic->type; 1661 __u8 code = ic->code; 1662 1663 /* Update the MTU */ 1664 if (ic->type == ICMP_DEST_UNREACH && 1665 ic->code == ICMP_FRAG_NEEDED) { 1666 struct ip_vs_dest *dest = cp->dest; 1667 u32 mtu = ntohs(ic->un.frag.mtu); 1668 __be16 frag_off = cih->frag_off; 1669 1670 /* Strip outer IP and ICMP, go to IPIP header */ 1671 if (pskb_pull(skb, ihl + sizeof(_icmph)) == NULL) 1672 goto ignore_ipip; 1673 offset2 -= ihl + sizeof(_icmph); 1674 skb_reset_network_header(skb); 1675 IP_VS_DBG(12, "ICMP for IPIP %pI4->%pI4: mtu=%u\n", 1676 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, mtu); 1677 ipv4_update_pmtu(skb, ipvs->net, 1678 mtu, 0, 0, 0, 0); 1679 /* Client uses PMTUD? */ 1680 if (!(frag_off & htons(IP_DF))) 1681 goto ignore_ipip; 1682 /* Prefer the resulting PMTU */ 1683 if (dest) { 1684 struct ip_vs_dest_dst *dest_dst; 1685 1686 dest_dst = rcu_dereference(dest->dest_dst); 1687 if (dest_dst) 1688 mtu = dst_mtu(dest_dst->dst_cache); 1689 } 1690 if (mtu > 68 + sizeof(struct iphdr)) 1691 mtu -= sizeof(struct iphdr); 1692 info = htonl(mtu); 1693 } 1694 /* Strip outer IP, ICMP and IPIP, go to IP header of 1695 * original request. 
1696 */ 1697 if (pskb_pull(skb, offset2) == NULL) 1698 goto ignore_ipip; 1699 skb_reset_network_header(skb); 1700 IP_VS_DBG(12, "Sending ICMP for %pI4->%pI4: t=%u, c=%u, i=%u\n", 1701 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, 1702 type, code, ntohl(info)); 1703 icmp_send(skb, type, code, info); 1704 /* ICMP can be shorter but anyways, account it */ 1705 ip_vs_out_stats(cp, skb); 1706 1707 ignore_ipip: 1708 consume_skb(skb); 1709 verdict = NF_STOLEN; 1710 goto out; 1711 } 1712 1713 /* do the statistics and put it back */ 1714 ip_vs_in_stats(cp, skb); 1715 if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol || 1716 IPPROTO_SCTP == cih->protocol) 1717 offset += 2 * sizeof(__u16); 1718 verdict = ip_vs_icmp_xmit(skb, cp, pp, offset, hooknum, &ciph); 1719 1720 out: 1721 if (likely(!new_cp)) 1722 __ip_vs_conn_put(cp); 1723 else 1724 ip_vs_conn_put(cp); 1725 1726 return verdict; 1727 } 1728 1729 #ifdef CONFIG_IP_VS_IPV6 1730 static int ip_vs_in_icmp_v6(struct netns_ipvs *ipvs, struct sk_buff *skb, 1731 int *related, unsigned int hooknum, 1732 struct ip_vs_iphdr *iph) 1733 { 1734 struct icmp6hdr _icmph, *ic; 1735 struct ip_vs_iphdr ciph = {.flags = 0, .fragoffs = 0};/*Contained IP */ 1736 struct ip_vs_conn *cp; 1737 struct ip_vs_protocol *pp; 1738 struct ip_vs_proto_data *pd; 1739 unsigned int offset, verdict; 1740 bool new_cp = false; 1741 1742 *related = 1; 1743 1744 ic = frag_safe_skb_hp(skb, iph->len, sizeof(_icmph), &_icmph); 1745 if (ic == NULL) 1746 return NF_DROP; 1747 1748 /* 1749 * Work through seeing if this is for us. 1750 * These checks are supposed to be in an order that means easy 1751 * things are checked first to speed up processing.... however 1752 * this means that some packets will manage to get a long way 1753 * down this stack and then be rejected, but that's life. 
1754 */ 1755 if (ic->icmp6_type & ICMPV6_INFOMSG_MASK) { 1756 *related = 0; 1757 return NF_ACCEPT; 1758 } 1759 /* Fragment header that is before ICMP header tells us that: 1760 * it's not an error message since they can't be fragmented. 1761 */ 1762 if (iph->flags & IP6_FH_F_FRAG) 1763 return NF_DROP; 1764 1765 IP_VS_DBG(8, "Incoming ICMPv6 (%d,%d) %pI6c->%pI6c\n", 1766 ic->icmp6_type, ntohs(icmpv6_id(ic)), 1767 &iph->saddr, &iph->daddr); 1768 1769 offset = iph->len + sizeof(_icmph); 1770 if (!ip_vs_fill_iph_skb_icmp(AF_INET6, skb, offset, true, &ciph)) 1771 return NF_ACCEPT; 1772 1773 pd = ip_vs_proto_data_get(ipvs, ciph.protocol); 1774 if (!pd) 1775 return NF_ACCEPT; 1776 pp = pd->pp; 1777 1778 /* Cannot handle fragmented embedded protocol */ 1779 if (ciph.fragoffs) 1780 return NF_ACCEPT; 1781 1782 IP_VS_DBG_PKT(11, AF_INET6, pp, skb, offset, 1783 "Checking incoming ICMPv6 for"); 1784 1785 /* The embedded headers contain source and dest in reverse order 1786 * if not from localhost 1787 */ 1788 cp = pp->conn_in_get(ipvs, AF_INET6, skb, &ciph); 1789 1790 if (!cp) { 1791 int v; 1792 1793 if (!sysctl_schedule_icmp(ipvs)) 1794 return NF_ACCEPT; 1795 1796 if (!ip_vs_try_to_schedule(ipvs, AF_INET6, skb, pd, &v, &cp, &ciph)) 1797 return v; 1798 1799 new_cp = true; 1800 } 1801 1802 /* VS/TUN, VS/DR and LOCALNODE just let it go */ 1803 if ((hooknum == NF_INET_LOCAL_OUT) && 1804 (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)) { 1805 verdict = NF_ACCEPT; 1806 goto out; 1807 } 1808 1809 /* do the statistics and put it back */ 1810 ip_vs_in_stats(cp, skb); 1811 1812 /* Need to mangle contained IPv6 header in ICMPv6 packet */ 1813 offset = ciph.len; 1814 if (IPPROTO_TCP == ciph.protocol || IPPROTO_UDP == ciph.protocol || 1815 IPPROTO_SCTP == ciph.protocol) 1816 offset += 2 * sizeof(__u16); /* Also mangle ports */ 1817 1818 verdict = ip_vs_icmp_xmit_v6(skb, cp, pp, offset, hooknum, &ciph); 1819 1820 out: 1821 if (likely(!new_cp)) 1822 __ip_vs_conn_put(cp); 1823 else 1824 
ip_vs_conn_put(cp); 1825 1826 return verdict; 1827 } 1828 #endif 1829 1830 1831 /* 1832 * Check if it's for virtual services, look it up, 1833 * and send it on its way... 1834 */ 1835 static unsigned int 1836 ip_vs_in(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int af) 1837 { 1838 struct ip_vs_iphdr iph; 1839 struct ip_vs_protocol *pp; 1840 struct ip_vs_proto_data *pd; 1841 struct ip_vs_conn *cp; 1842 int ret, pkts; 1843 int conn_reuse_mode; 1844 struct sock *sk; 1845 1846 /* Already marked as IPVS request or reply? */ 1847 if (skb->ipvs_property) 1848 return NF_ACCEPT; 1849 1850 /* 1851 * Big tappo: 1852 * - remote client: only PACKET_HOST 1853 * - route: used for struct net when skb->dev is unset 1854 */ 1855 if (unlikely((skb->pkt_type != PACKET_HOST && 1856 hooknum != NF_INET_LOCAL_OUT) || 1857 !skb_dst(skb))) { 1858 ip_vs_fill_iph_skb(af, skb, false, &iph); 1859 IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s" 1860 " ignored in hook %u\n", 1861 skb->pkt_type, iph.protocol, 1862 IP_VS_DBG_ADDR(af, &iph.daddr), hooknum); 1863 return NF_ACCEPT; 1864 } 1865 /* ipvs enabled in this netns ? */ 1866 if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable)) 1867 return NF_ACCEPT; 1868 1869 ip_vs_fill_iph_skb(af, skb, false, &iph); 1870 1871 /* Bad... 
Do not break raw sockets */ 1872 sk = skb_to_full_sk(skb); 1873 if (unlikely(sk && hooknum == NF_INET_LOCAL_OUT && 1874 af == AF_INET)) { 1875 1876 if (sk->sk_family == PF_INET && inet_sk(sk)->nodefrag) 1877 return NF_ACCEPT; 1878 } 1879 1880 #ifdef CONFIG_IP_VS_IPV6 1881 if (af == AF_INET6) { 1882 if (unlikely(iph.protocol == IPPROTO_ICMPV6)) { 1883 int related; 1884 int verdict = ip_vs_in_icmp_v6(ipvs, skb, &related, 1885 hooknum, &iph); 1886 1887 if (related) 1888 return verdict; 1889 } 1890 } else 1891 #endif 1892 if (unlikely(iph.protocol == IPPROTO_ICMP)) { 1893 int related; 1894 int verdict = ip_vs_in_icmp(ipvs, skb, &related, 1895 hooknum); 1896 1897 if (related) 1898 return verdict; 1899 } 1900 1901 /* Protocol supported? */ 1902 pd = ip_vs_proto_data_get(ipvs, iph.protocol); 1903 if (unlikely(!pd)) { 1904 /* The only way we'll see this packet again is if it's 1905 * encapsulated, so mark it with ipvs_property=1 so we 1906 * skip it if we're ignoring tunneled packets 1907 */ 1908 if (sysctl_ignore_tunneled(ipvs)) 1909 skb->ipvs_property = 1; 1910 1911 return NF_ACCEPT; 1912 } 1913 pp = pd->pp; 1914 /* 1915 * Check if the packet belongs to an existing connection entry 1916 */ 1917 cp = pp->conn_in_get(ipvs, af, skb, &iph); 1918 1919 conn_reuse_mode = sysctl_conn_reuse_mode(ipvs); 1920 if (conn_reuse_mode && !iph.fragoffs && is_new_conn(skb, &iph) && cp) { 1921 bool uses_ct = false, resched = false; 1922 1923 if (unlikely(sysctl_expire_nodest_conn(ipvs)) && cp->dest && 1924 unlikely(!atomic_read(&cp->dest->weight))) { 1925 resched = true; 1926 uses_ct = ip_vs_conn_uses_conntrack(cp, skb); 1927 } else if (is_new_conn_expected(cp, conn_reuse_mode)) { 1928 uses_ct = ip_vs_conn_uses_conntrack(cp, skb); 1929 if (!atomic_read(&cp->n_control)) { 1930 resched = true; 1931 } else { 1932 /* Do not reschedule controlling connection 1933 * that uses conntrack while it is still 1934 * referenced by controlled connection(s). 
1935 */ 1936 resched = !uses_ct; 1937 } 1938 } 1939 1940 if (resched) { 1941 if (!atomic_read(&cp->n_control)) 1942 ip_vs_conn_expire_now(cp); 1943 __ip_vs_conn_put(cp); 1944 if (uses_ct) 1945 return NF_DROP; 1946 cp = NULL; 1947 } 1948 } 1949 1950 if (unlikely(!cp)) { 1951 int v; 1952 1953 if (!ip_vs_try_to_schedule(ipvs, af, skb, pd, &v, &cp, &iph)) 1954 return v; 1955 } 1956 1957 IP_VS_DBG_PKT(11, af, pp, skb, iph.off, "Incoming packet"); 1958 1959 /* Check the server status */ 1960 if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) { 1961 /* the destination server is not available */ 1962 1963 if (sysctl_expire_nodest_conn(ipvs)) { 1964 /* try to expire the connection immediately */ 1965 ip_vs_conn_expire_now(cp); 1966 } 1967 /* don't restart its timer, and silently 1968 drop the packet. */ 1969 __ip_vs_conn_put(cp); 1970 return NF_DROP; 1971 } 1972 1973 ip_vs_in_stats(cp, skb); 1974 ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pd); 1975 if (cp->packet_xmit) 1976 ret = cp->packet_xmit(skb, cp, pp, &iph); 1977 /* do not touch skb anymore */ 1978 else { 1979 IP_VS_DBG_RL("warning: packet_xmit is null"); 1980 ret = NF_ACCEPT; 1981 } 1982 1983 /* Increase its packet counter and check if it is needed 1984 * to be synchronized 1985 * 1986 * Sync connection if it is about to close to 1987 * encorage the standby servers to update the connections timeout 1988 * 1989 * For ONE_PKT let ip_vs_sync_conn() do the filter work. 
1990 */ 1991 1992 if (cp->flags & IP_VS_CONN_F_ONE_PACKET) 1993 pkts = sysctl_sync_threshold(ipvs); 1994 else 1995 pkts = atomic_add_return(1, &cp->in_pkts); 1996 1997 if (ipvs->sync_state & IP_VS_STATE_MASTER) 1998 ip_vs_sync_conn(ipvs, cp, pkts); 1999 else if ((cp->flags & IP_VS_CONN_F_ONE_PACKET) && cp->control) 2000 /* increment is done inside ip_vs_sync_conn too */ 2001 atomic_inc(&cp->control->in_pkts); 2002 2003 ip_vs_conn_put(cp); 2004 return ret; 2005 } 2006 2007 /* 2008 * AF_INET handler in NF_INET_LOCAL_IN chain 2009 * Schedule and forward packets from remote clients 2010 */ 2011 static unsigned int 2012 ip_vs_remote_request4(void *priv, struct sk_buff *skb, 2013 const struct nf_hook_state *state) 2014 { 2015 return ip_vs_in(net_ipvs(state->net), state->hook, skb, AF_INET); 2016 } 2017 2018 /* 2019 * AF_INET handler in NF_INET_LOCAL_OUT chain 2020 * Schedule and forward packets from local clients 2021 */ 2022 static unsigned int 2023 ip_vs_local_request4(void *priv, struct sk_buff *skb, 2024 const struct nf_hook_state *state) 2025 { 2026 return ip_vs_in(net_ipvs(state->net), state->hook, skb, AF_INET); 2027 } 2028 2029 #ifdef CONFIG_IP_VS_IPV6 2030 2031 /* 2032 * AF_INET6 handler in NF_INET_LOCAL_IN chain 2033 * Schedule and forward packets from remote clients 2034 */ 2035 static unsigned int 2036 ip_vs_remote_request6(void *priv, struct sk_buff *skb, 2037 const struct nf_hook_state *state) 2038 { 2039 return ip_vs_in(net_ipvs(state->net), state->hook, skb, AF_INET6); 2040 } 2041 2042 /* 2043 * AF_INET6 handler in NF_INET_LOCAL_OUT chain 2044 * Schedule and forward packets from local clients 2045 */ 2046 static unsigned int 2047 ip_vs_local_request6(void *priv, struct sk_buff *skb, 2048 const struct nf_hook_state *state) 2049 { 2050 return ip_vs_in(net_ipvs(state->net), state->hook, skb, AF_INET6); 2051 } 2052 2053 #endif 2054 2055 2056 /* 2057 * It is hooked at the NF_INET_FORWARD chain, in order to catch ICMP 2058 * related packets destined for 
0.0.0.0/0. 2059 * When fwmark-based virtual service is used, such as transparent 2060 * cache cluster, TCP packets can be marked and routed to ip_vs_in, 2061 * but ICMP destined for 0.0.0.0/0 cannot not be easily marked and 2062 * sent to ip_vs_in_icmp. So, catch them at the NF_INET_FORWARD chain 2063 * and send them to ip_vs_in_icmp. 2064 */ 2065 static unsigned int 2066 ip_vs_forward_icmp(void *priv, struct sk_buff *skb, 2067 const struct nf_hook_state *state) 2068 { 2069 int r; 2070 struct netns_ipvs *ipvs = net_ipvs(state->net); 2071 2072 if (ip_hdr(skb)->protocol != IPPROTO_ICMP) 2073 return NF_ACCEPT; 2074 2075 /* ipvs enabled in this netns ? */ 2076 if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable)) 2077 return NF_ACCEPT; 2078 2079 return ip_vs_in_icmp(ipvs, skb, &r, state->hook); 2080 } 2081 2082 #ifdef CONFIG_IP_VS_IPV6 2083 static unsigned int 2084 ip_vs_forward_icmp_v6(void *priv, struct sk_buff *skb, 2085 const struct nf_hook_state *state) 2086 { 2087 int r; 2088 struct netns_ipvs *ipvs = net_ipvs(state->net); 2089 struct ip_vs_iphdr iphdr; 2090 2091 ip_vs_fill_iph_skb(AF_INET6, skb, false, &iphdr); 2092 if (iphdr.protocol != IPPROTO_ICMPV6) 2093 return NF_ACCEPT; 2094 2095 /* ipvs enabled in this netns ? */ 2096 if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable)) 2097 return NF_ACCEPT; 2098 2099 return ip_vs_in_icmp_v6(ipvs, skb, &r, state->hook, &iphdr); 2100 } 2101 #endif 2102 2103 2104 static const struct nf_hook_ops ip_vs_ops[] = { 2105 /* After packet filtering, change source only for VS/NAT */ 2106 { 2107 .hook = ip_vs_reply4, 2108 .pf = NFPROTO_IPV4, 2109 .hooknum = NF_INET_LOCAL_IN, 2110 .priority = NF_IP_PRI_NAT_SRC - 2, 2111 }, 2112 /* After packet filtering, forward packet through VS/DR, VS/TUN, 2113 * or VS/NAT(change destination), so that filtering rules can be 2114 * applied to IPVS. 
*/ 2115 { 2116 .hook = ip_vs_remote_request4, 2117 .pf = NFPROTO_IPV4, 2118 .hooknum = NF_INET_LOCAL_IN, 2119 .priority = NF_IP_PRI_NAT_SRC - 1, 2120 }, 2121 /* Before ip_vs_in, change source only for VS/NAT */ 2122 { 2123 .hook = ip_vs_local_reply4, 2124 .pf = NFPROTO_IPV4, 2125 .hooknum = NF_INET_LOCAL_OUT, 2126 .priority = NF_IP_PRI_NAT_DST + 1, 2127 }, 2128 /* After mangle, schedule and forward local requests */ 2129 { 2130 .hook = ip_vs_local_request4, 2131 .pf = NFPROTO_IPV4, 2132 .hooknum = NF_INET_LOCAL_OUT, 2133 .priority = NF_IP_PRI_NAT_DST + 2, 2134 }, 2135 /* After packet filtering (but before ip_vs_out_icmp), catch icmp 2136 * destined for 0.0.0.0/0, which is for incoming IPVS connections */ 2137 { 2138 .hook = ip_vs_forward_icmp, 2139 .pf = NFPROTO_IPV4, 2140 .hooknum = NF_INET_FORWARD, 2141 .priority = 99, 2142 }, 2143 /* After packet filtering, change source only for VS/NAT */ 2144 { 2145 .hook = ip_vs_reply4, 2146 .pf = NFPROTO_IPV4, 2147 .hooknum = NF_INET_FORWARD, 2148 .priority = 100, 2149 }, 2150 #ifdef CONFIG_IP_VS_IPV6 2151 /* After packet filtering, change source only for VS/NAT */ 2152 { 2153 .hook = ip_vs_reply6, 2154 .pf = NFPROTO_IPV6, 2155 .hooknum = NF_INET_LOCAL_IN, 2156 .priority = NF_IP6_PRI_NAT_SRC - 2, 2157 }, 2158 /* After packet filtering, forward packet through VS/DR, VS/TUN, 2159 * or VS/NAT(change destination), so that filtering rules can be 2160 * applied to IPVS. 
*/ 2161 { 2162 .hook = ip_vs_remote_request6, 2163 .pf = NFPROTO_IPV6, 2164 .hooknum = NF_INET_LOCAL_IN, 2165 .priority = NF_IP6_PRI_NAT_SRC - 1, 2166 }, 2167 /* Before ip_vs_in, change source only for VS/NAT */ 2168 { 2169 .hook = ip_vs_local_reply6, 2170 .pf = NFPROTO_IPV6, 2171 .hooknum = NF_INET_LOCAL_OUT, 2172 .priority = NF_IP6_PRI_NAT_DST + 1, 2173 }, 2174 /* After mangle, schedule and forward local requests */ 2175 { 2176 .hook = ip_vs_local_request6, 2177 .pf = NFPROTO_IPV6, 2178 .hooknum = NF_INET_LOCAL_OUT, 2179 .priority = NF_IP6_PRI_NAT_DST + 2, 2180 }, 2181 /* After packet filtering (but before ip_vs_out_icmp), catch icmp 2182 * destined for 0.0.0.0/0, which is for incoming IPVS connections */ 2183 { 2184 .hook = ip_vs_forward_icmp_v6, 2185 .pf = NFPROTO_IPV6, 2186 .hooknum = NF_INET_FORWARD, 2187 .priority = 99, 2188 }, 2189 /* After packet filtering, change source only for VS/NAT */ 2190 { 2191 .hook = ip_vs_reply6, 2192 .pf = NFPROTO_IPV6, 2193 .hooknum = NF_INET_FORWARD, 2194 .priority = 100, 2195 }, 2196 #endif 2197 }; 2198 /* 2199 * Initialize IP Virtual Server netns mem. 
2200 */ 2201 static int __net_init __ip_vs_init(struct net *net) 2202 { 2203 struct netns_ipvs *ipvs; 2204 int ret; 2205 2206 ipvs = net_generic(net, ip_vs_net_id); 2207 if (ipvs == NULL) 2208 return -ENOMEM; 2209 2210 /* Hold the beast until a service is registerd */ 2211 ipvs->enable = 0; 2212 ipvs->net = net; 2213 /* Counters used for creating unique names */ 2214 ipvs->gen = atomic_read(&ipvs_netns_cnt); 2215 atomic_inc(&ipvs_netns_cnt); 2216 net->ipvs = ipvs; 2217 2218 if (ip_vs_estimator_net_init(ipvs) < 0) 2219 goto estimator_fail; 2220 2221 if (ip_vs_control_net_init(ipvs) < 0) 2222 goto control_fail; 2223 2224 if (ip_vs_protocol_net_init(ipvs) < 0) 2225 goto protocol_fail; 2226 2227 if (ip_vs_app_net_init(ipvs) < 0) 2228 goto app_fail; 2229 2230 if (ip_vs_conn_net_init(ipvs) < 0) 2231 goto conn_fail; 2232 2233 if (ip_vs_sync_net_init(ipvs) < 0) 2234 goto sync_fail; 2235 2236 ret = nf_register_net_hooks(net, ip_vs_ops, ARRAY_SIZE(ip_vs_ops)); 2237 if (ret < 0) 2238 goto hook_fail; 2239 2240 return 0; 2241 /* 2242 * Error handling 2243 */ 2244 2245 hook_fail: 2246 ip_vs_sync_net_cleanup(ipvs); 2247 sync_fail: 2248 ip_vs_conn_net_cleanup(ipvs); 2249 conn_fail: 2250 ip_vs_app_net_cleanup(ipvs); 2251 app_fail: 2252 ip_vs_protocol_net_cleanup(ipvs); 2253 protocol_fail: 2254 ip_vs_control_net_cleanup(ipvs); 2255 control_fail: 2256 ip_vs_estimator_net_cleanup(ipvs); 2257 estimator_fail: 2258 net->ipvs = NULL; 2259 return -ENOMEM; 2260 } 2261 2262 static void __net_exit __ip_vs_cleanup(struct net *net) 2263 { 2264 struct netns_ipvs *ipvs = net_ipvs(net); 2265 2266 nf_unregister_net_hooks(net, ip_vs_ops, ARRAY_SIZE(ip_vs_ops)); 2267 ip_vs_service_net_cleanup(ipvs); /* ip_vs_flush() with locks */ 2268 ip_vs_conn_net_cleanup(ipvs); 2269 ip_vs_app_net_cleanup(ipvs); 2270 ip_vs_protocol_net_cleanup(ipvs); 2271 ip_vs_control_net_cleanup(ipvs); 2272 ip_vs_estimator_net_cleanup(ipvs); 2273 IP_VS_DBG(2, "ipvs netns %d released\n", ipvs->gen); 2274 net->ipvs = NULL; 2275 } 
2276 2277 static void __net_exit __ip_vs_dev_cleanup(struct net *net) 2278 { 2279 struct netns_ipvs *ipvs = net_ipvs(net); 2280 EnterFunction(2); 2281 ipvs->enable = 0; /* Disable packet reception */ 2282 smp_wmb(); 2283 ip_vs_sync_net_cleanup(ipvs); 2284 LeaveFunction(2); 2285 } 2286 2287 static struct pernet_operations ipvs_core_ops = { 2288 .init = __ip_vs_init, 2289 .exit = __ip_vs_cleanup, 2290 .id = &ip_vs_net_id, 2291 .size = sizeof(struct netns_ipvs), 2292 }; 2293 2294 static struct pernet_operations ipvs_core_dev_ops = { 2295 .exit = __ip_vs_dev_cleanup, 2296 }; 2297 2298 /* 2299 * Initialize IP Virtual Server 2300 */ 2301 static int __init ip_vs_init(void) 2302 { 2303 int ret; 2304 2305 ret = ip_vs_control_init(); 2306 if (ret < 0) { 2307 pr_err("can't setup control.\n"); 2308 goto exit; 2309 } 2310 2311 ip_vs_protocol_init(); 2312 2313 ret = ip_vs_conn_init(); 2314 if (ret < 0) { 2315 pr_err("can't setup connection table.\n"); 2316 goto cleanup_protocol; 2317 } 2318 2319 ret = register_pernet_subsys(&ipvs_core_ops); /* Alloc ip_vs struct */ 2320 if (ret < 0) 2321 goto cleanup_conn; 2322 2323 ret = register_pernet_device(&ipvs_core_dev_ops); 2324 if (ret < 0) 2325 goto cleanup_sub; 2326 2327 ret = ip_vs_register_nl_ioctl(); 2328 if (ret < 0) { 2329 pr_err("can't register netlink/ioctl.\n"); 2330 goto cleanup_dev; 2331 } 2332 2333 pr_info("ipvs loaded.\n"); 2334 2335 return ret; 2336 2337 cleanup_dev: 2338 unregister_pernet_device(&ipvs_core_dev_ops); 2339 cleanup_sub: 2340 unregister_pernet_subsys(&ipvs_core_ops); 2341 cleanup_conn: 2342 ip_vs_conn_cleanup(); 2343 cleanup_protocol: 2344 ip_vs_protocol_cleanup(); 2345 ip_vs_control_cleanup(); 2346 exit: 2347 return ret; 2348 } 2349 2350 static void __exit ip_vs_cleanup(void) 2351 { 2352 ip_vs_unregister_nl_ioctl(); 2353 unregister_pernet_device(&ipvs_core_dev_ops); 2354 unregister_pernet_subsys(&ipvs_core_ops); /* free ip_vs struct */ 2355 ip_vs_conn_cleanup(); 2356 ip_vs_protocol_cleanup(); 2357 
ip_vs_control_cleanup(); 2358 pr_info("ipvs unloaded.\n"); 2359 } 2360 2361 module_init(ip_vs_init); 2362 module_exit(ip_vs_cleanup); 2363 MODULE_LICENSE("GPL"); 2364