1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * IPVS An implementation of the IP virtual server support for the 4 * LINUX operating system. IPVS is now implemented as a module 5 * over the Netfilter framework. IPVS can be used to build a 6 * high-performance and highly available server based on a 7 * cluster of servers. 8 * 9 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> 10 * Peter Kese <peter.kese@ijs.si> 11 * Julian Anastasov <ja@ssi.bg> 12 * 13 * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese, 14 * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms 15 * and others. 16 * 17 * Changes: 18 * Paul `Rusty' Russell properly handle non-linear skbs 19 * Harald Welte don't use nfcache 20 */ 21 22 #define KMSG_COMPONENT "IPVS" 23 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt 24 25 #include <linux/module.h> 26 #include <linux/kernel.h> 27 #include <linux/ip.h> 28 #include <linux/tcp.h> 29 #include <linux/sctp.h> 30 #include <linux/icmp.h> 31 #include <linux/slab.h> 32 33 #include <net/ip.h> 34 #include <net/tcp.h> 35 #include <net/udp.h> 36 #include <net/icmp.h> /* for icmp_send */ 37 #include <net/gue.h> 38 #include <net/gre.h> 39 #include <net/route.h> 40 #include <net/ip6_checksum.h> 41 #include <net/netns/generic.h> /* net_generic() */ 42 43 #include <linux/netfilter.h> 44 #include <linux/netfilter_ipv4.h> 45 46 #ifdef CONFIG_IP_VS_IPV6 47 #include <net/ipv6.h> 48 #include <linux/netfilter_ipv6.h> 49 #include <net/ip6_route.h> 50 #endif 51 52 #include <net/ip_vs.h> 53 #include <linux/indirect_call_wrapper.h> 54 55 56 EXPORT_SYMBOL(register_ip_vs_scheduler); 57 EXPORT_SYMBOL(unregister_ip_vs_scheduler); 58 EXPORT_SYMBOL(ip_vs_proto_name); 59 EXPORT_SYMBOL(ip_vs_conn_new); 60 EXPORT_SYMBOL(ip_vs_conn_in_get); 61 EXPORT_SYMBOL(ip_vs_conn_out_get); 62 #ifdef CONFIG_IP_VS_PROTO_TCP 63 EXPORT_SYMBOL(ip_vs_tcp_conn_listen); 64 #endif 65 EXPORT_SYMBOL(ip_vs_conn_put); 66 #ifdef CONFIG_IP_VS_DEBUG 67 EXPORT_SYMBOL(ip_vs_get_debug_level); 68 #endif 69 EXPORT_SYMBOL(ip_vs_new_conn_out); 70 71 #ifdef CONFIG_IP_VS_PROTO_TCP 72 INDIRECT_CALLABLE_DECLARE(int 73 tcp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, 74 struct ip_vs_conn *cp, struct ip_vs_iphdr *iph)); 75 #endif 76 77 #ifdef CONFIG_IP_VS_PROTO_UDP 78 INDIRECT_CALLABLE_DECLARE(int 79 udp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, 80 struct ip_vs_conn *cp, struct ip_vs_iphdr *iph)); 81 #endif 82 83 #if defined(CONFIG_IP_VS_PROTO_TCP) && defined(CONFIG_IP_VS_PROTO_UDP) 84 #define SNAT_CALL(f, ...) \ 85 INDIRECT_CALL_2(f, tcp_snat_handler, udp_snat_handler, __VA_ARGS__) 86 #elif defined(CONFIG_IP_VS_PROTO_TCP) 87 #define SNAT_CALL(f, ...) INDIRECT_CALL_1(f, tcp_snat_handler, __VA_ARGS__) 88 #elif defined(CONFIG_IP_VS_PROTO_UDP) 89 #define SNAT_CALL(f, ...) INDIRECT_CALL_1(f, udp_snat_handler, __VA_ARGS__) 90 #else 91 #define SNAT_CALL(f, ...) f(__VA_ARGS__) 92 #endif 93 94 static unsigned int ip_vs_net_id __read_mostly; 95 /* netns cnt used for uniqueness */ 96 static atomic_t ipvs_netns_cnt = ATOMIC_INIT(0); 97 98 /* ID used in ICMP lookups */ 99 #define icmp_id(icmph) (((icmph)->un).echo.id) 100 #define icmpv6_id(icmph) (icmph->icmp6_dataun.u_echo.identifier) 101 102 const char *ip_vs_proto_name(unsigned int proto) 103 { 104 static char buf[20]; 105 106 switch (proto) { 107 case IPPROTO_IP: 108 return "IP"; 109 case IPPROTO_UDP: 110 return "UDP"; 111 case IPPROTO_TCP: 112 return "TCP"; 113 case IPPROTO_SCTP: 114 return "SCTP"; 115 case IPPROTO_ICMP: 116 return "ICMP"; 117 #ifdef CONFIG_IP_VS_IPV6 118 case IPPROTO_ICMPV6: 119 return "ICMPv6"; 120 #endif 121 default: 122 sprintf(buf, "IP_%u", proto); 123 return buf; 124 } 125 } 126 127 void ip_vs_init_hash_table(struct list_head *table, int rows) 128 { 129 while (--rows >= 0) 130 INIT_LIST_HEAD(&table[rows]); 131 } 132 133 static inline void 134 ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb) 135 { 136 struct ip_vs_dest *dest = cp->dest; 137 struct netns_ipvs *ipvs = cp->ipvs; 138 139 if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { 140 struct ip_vs_cpu_stats *s; 141 struct ip_vs_service *svc; 142 143 local_bh_disable(); 144 145 s = this_cpu_ptr(dest->stats.cpustats); 146 u64_stats_update_begin(&s->syncp); 147 s->cnt.inpkts++; 148 s->cnt.inbytes += skb->len; 149 u64_stats_update_end(&s->syncp); 150 151 svc = rcu_dereference(dest->svc); 152 s = this_cpu_ptr(svc->stats.cpustats); 153 u64_stats_update_begin(&s->syncp); 154 s->cnt.inpkts++; 155 s->cnt.inbytes += skb->len; 156 u64_stats_update_end(&s->syncp); 157 158 s = this_cpu_ptr(ipvs->tot_stats.cpustats); 159 u64_stats_update_begin(&s->syncp); 160 s->cnt.inpkts++; 161 s->cnt.inbytes += skb->len; 162 u64_stats_update_end(&s->syncp); 163 164 local_bh_enable(); 165 } 166 } 167 168 169 static inline void 170 ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb) 171 { 172 struct ip_vs_dest *dest = cp->dest; 173 struct netns_ipvs *ipvs = cp->ipvs; 174 175 if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { 176 struct ip_vs_cpu_stats *s; 177 struct ip_vs_service *svc; 178 179 local_bh_disable(); 180 181 s = this_cpu_ptr(dest->stats.cpustats); 182 u64_stats_update_begin(&s->syncp); 183 s->cnt.outpkts++; 184 s->cnt.outbytes += skb->len; 185 u64_stats_update_end(&s->syncp); 186 187 svc = rcu_dereference(dest->svc); 188 s = this_cpu_ptr(svc->stats.cpustats); 189 u64_stats_update_begin(&s->syncp); 190 s->cnt.outpkts++; 191 s->cnt.outbytes += skb->len; 192 u64_stats_update_end(&s->syncp); 193 194 s = this_cpu_ptr(ipvs->tot_stats.cpustats); 195 u64_stats_update_begin(&s->syncp); 196 s->cnt.outpkts++; 197 s->cnt.outbytes += skb->len; 198 u64_stats_update_end(&s->syncp); 199 200 local_bh_enable(); 201 } 202 } 203 204 205 static inline void 206 ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc) 207 { 208 struct netns_ipvs *ipvs = svc->ipvs; 209 struct ip_vs_cpu_stats *s; 210 211 local_bh_disable(); 212 213 s = this_cpu_ptr(cp->dest->stats.cpustats); 214 u64_stats_update_begin(&s->syncp); 215 s->cnt.conns++; 216 u64_stats_update_end(&s->syncp); 217 218 s = this_cpu_ptr(svc->stats.cpustats); 219 u64_stats_update_begin(&s->syncp); 220 s->cnt.conns++; 221 u64_stats_update_end(&s->syncp); 222 223 s = this_cpu_ptr(ipvs->tot_stats.cpustats); 224 u64_stats_update_begin(&s->syncp); 225 s->cnt.conns++; 226 u64_stats_update_end(&s->syncp); 227 228 local_bh_enable(); 229 } 230 231 232 static inline void 233 ip_vs_set_state(struct ip_vs_conn *cp, int direction, 234 const struct sk_buff *skb, 235 struct ip_vs_proto_data *pd) 236 { 237 if (likely(pd->pp->state_transition)) 238 pd->pp->state_transition(cp, direction, skb, pd); 239 } 240 241 static inline int 242 ip_vs_conn_fill_param_persist(const struct ip_vs_service *svc, 243 struct sk_buff *skb, int protocol, 244 const union nf_inet_addr *caddr, __be16 cport, 245 const union nf_inet_addr *vaddr, __be16 vport, 246 struct ip_vs_conn_param *p) 247 { 248 ip_vs_conn_fill_param(svc->ipvs, svc->af, protocol, caddr, cport, vaddr, 249 vport, p); 250 p->pe = rcu_dereference(svc->pe); 251 if (p->pe && p->pe->fill_param) 252 return p->pe->fill_param(p, skb); 253 254 return 0; 255 } 256 257 /* 258 * IPVS persistent scheduling function 259 * It creates a connection entry according to its template if exists, 260 * or selects a server and creates a connection entry plus a template. 261 * Locking: we are svc user (svc->refcnt), so we hold all dests too 262 * Protocols supported: TCP, UDP 263 */ 264 static struct ip_vs_conn * 265 ip_vs_sched_persist(struct ip_vs_service *svc, 266 struct sk_buff *skb, __be16 src_port, __be16 dst_port, 267 int *ignored, struct ip_vs_iphdr *iph) 268 { 269 struct ip_vs_conn *cp = NULL; 270 struct ip_vs_dest *dest; 271 struct ip_vs_conn *ct; 272 __be16 dport = 0; /* destination port to forward */ 273 unsigned int flags; 274 struct ip_vs_conn_param param; 275 const union nf_inet_addr fwmark = { .ip = htonl(svc->fwmark) }; 276 union nf_inet_addr snet; /* source network of the client, 277 after masking */ 278 const union nf_inet_addr *src_addr, *dst_addr; 279 280 if (likely(!ip_vs_iph_inverse(iph))) { 281 src_addr = &iph->saddr; 282 dst_addr = &iph->daddr; 283 } else { 284 src_addr = &iph->daddr; 285 dst_addr = &iph->saddr; 286 } 287 288 289 /* Mask saddr with the netmask to adjust template granularity */ 290 #ifdef CONFIG_IP_VS_IPV6 291 if (svc->af == AF_INET6) 292 ipv6_addr_prefix(&snet.in6, &src_addr->in6, 293 (__force __u32) svc->netmask); 294 else 295 #endif 296 snet.ip = src_addr->ip & svc->netmask; 297 298 IP_VS_DBG_BUF(6, "p-schedule: src %s:%u dest %s:%u " 299 "mnet %s\n", 300 IP_VS_DBG_ADDR(svc->af, src_addr), ntohs(src_port), 301 IP_VS_DBG_ADDR(svc->af, dst_addr), ntohs(dst_port), 302 IP_VS_DBG_ADDR(svc->af, &snet)); 303 304 /* 305 * As far as we know, FTP is a very complicated network protocol, and 306 * it uses control connection and data connections. For active FTP, 307 * FTP server initialize data connection to the client, its source port 308 * is often 20. For passive FTP, FTP server tells the clients the port 309 * that it passively listens to, and the client issues the data 310 * connection. In the tunneling or direct routing mode, the load 311 * balancer is on the client-to-server half of connection, the port 312 * number is unknown to the load balancer. So, a conn template like 313 * <caddr, 0, vaddr, 0, daddr, 0> is created for persistent FTP 314 * service, and a template like <caddr, 0, vaddr, vport, daddr, dport> 315 * is created for other persistent services. 316 */ 317 { 318 int protocol = iph->protocol; 319 const union nf_inet_addr *vaddr = dst_addr; 320 __be16 vport = 0; 321 322 if (dst_port == svc->port) { 323 /* non-FTP template: 324 * <protocol, caddr, 0, vaddr, vport, daddr, dport> 325 * FTP template: 326 * <protocol, caddr, 0, vaddr, 0, daddr, 0> 327 */ 328 if (svc->port != FTPPORT) 329 vport = dst_port; 330 } else { 331 /* Note: persistent fwmark-based services and 332 * persistent port zero service are handled here. 333 * fwmark template: 334 * <IPPROTO_IP,caddr,0,fwmark,0,daddr,0> 335 * port zero template: 336 * <protocol,caddr,0,vaddr,0,daddr,0> 337 */ 338 if (svc->fwmark) { 339 protocol = IPPROTO_IP; 340 vaddr = &fwmark; 341 } 342 } 343 /* return *ignored = -1 so NF_DROP can be used */ 344 if (ip_vs_conn_fill_param_persist(svc, skb, protocol, &snet, 0, 345 vaddr, vport, ¶m) < 0) { 346 *ignored = -1; 347 return NULL; 348 } 349 } 350 351 /* Check if a template already exists */ 352 ct = ip_vs_ct_in_get(¶m); 353 if (!ct || !ip_vs_check_template(ct, NULL)) { 354 struct ip_vs_scheduler *sched; 355 356 /* 357 * No template found or the dest of the connection 358 * template is not available. 359 * return *ignored=0 i.e. ICMP and NF_DROP 360 */ 361 sched = rcu_dereference(svc->scheduler); 362 if (sched) { 363 /* read svc->sched_data after svc->scheduler */ 364 smp_rmb(); 365 dest = sched->schedule(svc, skb, iph); 366 } else { 367 dest = NULL; 368 } 369 if (!dest) { 370 IP_VS_DBG(1, "p-schedule: no dest found.\n"); 371 kfree(param.pe_data); 372 *ignored = 0; 373 return NULL; 374 } 375 376 if (dst_port == svc->port && svc->port != FTPPORT) 377 dport = dest->port; 378 379 /* Create a template 380 * This adds param.pe_data to the template, 381 * and thus param.pe_data will be destroyed 382 * when the template expires */ 383 ct = ip_vs_conn_new(¶m, dest->af, &dest->addr, dport, 384 IP_VS_CONN_F_TEMPLATE, dest, skb->mark); 385 if (ct == NULL) { 386 kfree(param.pe_data); 387 *ignored = -1; 388 return NULL; 389 } 390 391 ct->timeout = svc->timeout; 392 } else { 393 /* set destination with the found template */ 394 dest = ct->dest; 395 kfree(param.pe_data); 396 } 397 398 dport = dst_port; 399 if (dport == svc->port && dest->port) 400 dport = dest->port; 401 402 flags = (svc->flags & IP_VS_SVC_F_ONEPACKET 403 && iph->protocol == IPPROTO_UDP) ? 404 IP_VS_CONN_F_ONE_PACKET : 0; 405 406 /* 407 * Create a new connection according to the template 408 */ 409 ip_vs_conn_fill_param(svc->ipvs, svc->af, iph->protocol, src_addr, 410 src_port, dst_addr, dst_port, ¶m); 411 412 cp = ip_vs_conn_new(¶m, dest->af, &dest->addr, dport, flags, dest, 413 skb->mark); 414 if (cp == NULL) { 415 ip_vs_conn_put(ct); 416 *ignored = -1; 417 return NULL; 418 } 419 420 /* 421 * Add its control 422 */ 423 ip_vs_control_add(cp, ct); 424 ip_vs_conn_put(ct); 425 426 ip_vs_conn_stats(cp, svc); 427 return cp; 428 } 429 430 431 /* 432 * IPVS main scheduling function 433 * It selects a server according to the virtual service, and 434 * creates a connection entry. 435 * Protocols supported: TCP, UDP 436 * 437 * Usage of *ignored 438 * 439 * 1 : protocol tried to schedule (eg. on SYN), found svc but the 440 * svc/scheduler decides that this packet should be accepted with 441 * NF_ACCEPT because it must not be scheduled. 442 * 443 * 0 : scheduler can not find destination, so try bypass or 444 * return ICMP and then NF_DROP (ip_vs_leave). 445 * 446 * -1 : scheduler tried to schedule but fatal error occurred, eg. 447 * ip_vs_conn_new failure (ENOMEM) or ip_vs_sip_fill_param 448 * failure such as missing Call-ID, ENOMEM on skb_linearize 449 * or pe_data. In this case we should return NF_DROP without 450 * any attempts to send ICMP with ip_vs_leave. 451 */ 452 struct ip_vs_conn * 453 ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, 454 struct ip_vs_proto_data *pd, int *ignored, 455 struct ip_vs_iphdr *iph) 456 { 457 struct ip_vs_protocol *pp = pd->pp; 458 struct ip_vs_conn *cp = NULL; 459 struct ip_vs_scheduler *sched; 460 struct ip_vs_dest *dest; 461 __be16 _ports[2], *pptr, cport, vport; 462 const void *caddr, *vaddr; 463 unsigned int flags; 464 465 *ignored = 1; 466 /* 467 * IPv6 frags, only the first hit here. 468 */ 469 pptr = frag_safe_skb_hp(skb, iph->len, sizeof(_ports), _ports); 470 if (pptr == NULL) 471 return NULL; 472 473 if (likely(!ip_vs_iph_inverse(iph))) { 474 cport = pptr[0]; 475 caddr = &iph->saddr; 476 vport = pptr[1]; 477 vaddr = &iph->daddr; 478 } else { 479 cport = pptr[1]; 480 caddr = &iph->daddr; 481 vport = pptr[0]; 482 vaddr = &iph->saddr; 483 } 484 485 /* 486 * FTPDATA needs this check when using local real server. 487 * Never schedule Active FTPDATA connections from real server. 488 * For LVS-NAT they must be already created. For other methods 489 * with persistence the connection is created on SYN+ACK. 490 */ 491 if (cport == FTPDATA) { 492 IP_VS_DBG_PKT(12, svc->af, pp, skb, iph->off, 493 "Not scheduling FTPDATA"); 494 return NULL; 495 } 496 497 /* 498 * Do not schedule replies from local real server. 499 */ 500 if ((!skb->dev || skb->dev->flags & IFF_LOOPBACK)) { 501 iph->hdr_flags ^= IP_VS_HDR_INVERSE; 502 cp = INDIRECT_CALL_1(pp->conn_in_get, 503 ip_vs_conn_in_get_proto, svc->ipvs, 504 svc->af, skb, iph); 505 iph->hdr_flags ^= IP_VS_HDR_INVERSE; 506 507 if (cp) { 508 IP_VS_DBG_PKT(12, svc->af, pp, skb, iph->off, 509 "Not scheduling reply for existing" 510 " connection"); 511 __ip_vs_conn_put(cp); 512 return NULL; 513 } 514 } 515 516 /* 517 * Persistent service 518 */ 519 if (svc->flags & IP_VS_SVC_F_PERSISTENT) 520 return ip_vs_sched_persist(svc, skb, cport, vport, ignored, 521 iph); 522 523 *ignored = 0; 524 525 /* 526 * Non-persistent service 527 */ 528 if (!svc->fwmark && vport != svc->port) { 529 if (!svc->port) 530 pr_err("Schedule: port zero only supported " 531 "in persistent services, " 532 "check your ipvs configuration\n"); 533 return NULL; 534 } 535 536 sched = rcu_dereference(svc->scheduler); 537 if (sched) { 538 /* read svc->sched_data after svc->scheduler */ 539 smp_rmb(); 540 dest = sched->schedule(svc, skb, iph); 541 } else { 542 dest = NULL; 543 } 544 if (dest == NULL) { 545 IP_VS_DBG(1, "Schedule: no dest found.\n"); 546 return NULL; 547 } 548 549 flags = (svc->flags & IP_VS_SVC_F_ONEPACKET 550 && iph->protocol == IPPROTO_UDP) ? 551 IP_VS_CONN_F_ONE_PACKET : 0; 552 553 /* 554 * Create a connection entry. 555 */ 556 { 557 struct ip_vs_conn_param p; 558 559 ip_vs_conn_fill_param(svc->ipvs, svc->af, iph->protocol, 560 caddr, cport, vaddr, vport, &p); 561 cp = ip_vs_conn_new(&p, dest->af, &dest->addr, 562 dest->port ? dest->port : vport, 563 flags, dest, skb->mark); 564 if (!cp) { 565 *ignored = -1; 566 return NULL; 567 } 568 } 569 570 IP_VS_DBG_BUF(6, "Schedule fwd:%c c:%s:%u v:%s:%u " 571 "d:%s:%u conn->flags:%X conn->refcnt:%d\n", 572 ip_vs_fwd_tag(cp), 573 IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), 574 IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport), 575 IP_VS_DBG_ADDR(cp->daf, &cp->daddr), ntohs(cp->dport), 576 cp->flags, refcount_read(&cp->refcnt)); 577 578 ip_vs_conn_stats(cp, svc); 579 return cp; 580 } 581 582 static inline int ip_vs_addr_is_unicast(struct net *net, int af, 583 union nf_inet_addr *addr) 584 { 585 #ifdef CONFIG_IP_VS_IPV6 586 if (af == AF_INET6) 587 return ipv6_addr_type(&addr->in6) & IPV6_ADDR_UNICAST; 588 #endif 589 return (inet_addr_type(net, addr->ip) == RTN_UNICAST); 590 } 591 592 /* 593 * Pass or drop the packet. 594 * Called by ip_vs_in, when the virtual service is available but 595 * no destination is available for a new connection. 596 */ 597 int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, 598 struct ip_vs_proto_data *pd, struct ip_vs_iphdr *iph) 599 { 600 __be16 _ports[2], *pptr, dport; 601 struct netns_ipvs *ipvs = svc->ipvs; 602 struct net *net = ipvs->net; 603 604 pptr = frag_safe_skb_hp(skb, iph->len, sizeof(_ports), _ports); 605 if (!pptr) 606 return NF_DROP; 607 dport = likely(!ip_vs_iph_inverse(iph)) ? pptr[1] : pptr[0]; 608 609 /* if it is fwmark-based service, the cache_bypass sysctl is up 610 and the destination is a non-local unicast, then create 611 a cache_bypass connection entry */ 612 if (sysctl_cache_bypass(ipvs) && svc->fwmark && 613 !(iph->hdr_flags & (IP_VS_HDR_INVERSE | IP_VS_HDR_ICMP)) && 614 ip_vs_addr_is_unicast(net, svc->af, &iph->daddr)) { 615 int ret; 616 struct ip_vs_conn *cp; 617 unsigned int flags = (svc->flags & IP_VS_SVC_F_ONEPACKET && 618 iph->protocol == IPPROTO_UDP) ? 619 IP_VS_CONN_F_ONE_PACKET : 0; 620 union nf_inet_addr daddr = { .all = { 0, 0, 0, 0 } }; 621 622 /* create a new connection entry */ 623 IP_VS_DBG(6, "%s(): create a cache_bypass entry\n", __func__); 624 { 625 struct ip_vs_conn_param p; 626 ip_vs_conn_fill_param(svc->ipvs, svc->af, iph->protocol, 627 &iph->saddr, pptr[0], 628 &iph->daddr, pptr[1], &p); 629 cp = ip_vs_conn_new(&p, svc->af, &daddr, 0, 630 IP_VS_CONN_F_BYPASS | flags, 631 NULL, skb->mark); 632 if (!cp) 633 return NF_DROP; 634 } 635 636 /* statistics */ 637 ip_vs_in_stats(cp, skb); 638 639 /* set state */ 640 ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pd); 641 642 /* transmit the first SYN packet */ 643 ret = cp->packet_xmit(skb, cp, pd->pp, iph); 644 /* do not touch skb anymore */ 645 646 if ((cp->flags & IP_VS_CONN_F_ONE_PACKET) && cp->control) 647 atomic_inc(&cp->control->in_pkts); 648 else 649 atomic_inc(&cp->in_pkts); 650 ip_vs_conn_put(cp); 651 return ret; 652 } 653 654 /* 655 * When the virtual ftp service is presented, packets destined 656 * for other services on the VIP may get here (except services 657 * listed in the ipvs table), pass the packets, because it is 658 * not ipvs job to decide to drop the packets. 659 */ 660 if (svc->port == FTPPORT && dport != FTPPORT) 661 return NF_ACCEPT; 662 663 if (unlikely(ip_vs_iph_icmp(iph))) 664 return NF_DROP; 665 666 /* 667 * Notify the client that the destination is unreachable, and 668 * release the socket buffer. 669 * Since it is in IP layer, the TCP socket is not actually 670 * created, the TCP RST packet cannot be sent, instead that 671 * ICMP_PORT_UNREACH is sent here no matter it is TCP/UDP. --WZ 672 */ 673 #ifdef CONFIG_IP_VS_IPV6 674 if (svc->af == AF_INET6) { 675 if (!skb->dev) 676 skb->dev = net->loopback_dev; 677 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0); 678 } else 679 #endif 680 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); 681 682 return NF_DROP; 683 } 684 685 #ifdef CONFIG_SYSCTL 686 687 static int sysctl_snat_reroute(struct netns_ipvs *ipvs) 688 { 689 return ipvs->sysctl_snat_reroute; 690 } 691 692 static int sysctl_nat_icmp_send(struct netns_ipvs *ipvs) 693 { 694 return ipvs->sysctl_nat_icmp_send; 695 } 696 697 #else 698 699 static int sysctl_snat_reroute(struct netns_ipvs *ipvs) { return 0; } 700 static int sysctl_nat_icmp_send(struct netns_ipvs *ipvs) { return 0; } 701 702 #endif 703 704 __sum16 ip_vs_checksum_complete(struct sk_buff *skb, int offset) 705 { 706 return csum_fold(skb_checksum(skb, offset, skb->len - offset, 0)); 707 } 708 709 static inline enum ip_defrag_users ip_vs_defrag_user(unsigned int hooknum) 710 { 711 if (NF_INET_LOCAL_IN == hooknum) 712 return IP_DEFRAG_VS_IN; 713 if (NF_INET_FORWARD == hooknum) 714 return IP_DEFRAG_VS_FWD; 715 return IP_DEFRAG_VS_OUT; 716 } 717 718 static inline int ip_vs_gather_frags(struct netns_ipvs *ipvs, 719 struct sk_buff *skb, u_int32_t user) 720 { 721 int err; 722 723 local_bh_disable(); 724 err = ip_defrag(ipvs->net, skb, user); 725 local_bh_enable(); 726 if (!err) 727 ip_send_check(ip_hdr(skb)); 728 729 return err; 730 } 731 732 static int ip_vs_route_me_harder(struct netns_ipvs *ipvs, int af, 733 struct sk_buff *skb, unsigned int hooknum) 734 { 735 if (!sysctl_snat_reroute(ipvs)) 736 return 0; 737 /* Reroute replies only to remote clients (FORWARD and LOCAL_OUT) */ 738 if (NF_INET_LOCAL_IN == hooknum) 739 return 0; 740 #ifdef CONFIG_IP_VS_IPV6 741 if (af == AF_INET6) { 742 struct dst_entry *dst = skb_dst(skb); 743 744 if (dst->dev && !(dst->dev->flags & IFF_LOOPBACK) && 745 ip6_route_me_harder(ipvs->net, skb) != 0) 746 return 1; 747 } else 748 #endif 749 if (!(skb_rtable(skb)->rt_flags & RTCF_LOCAL) && 750 ip_route_me_harder(ipvs->net, skb, RTN_LOCAL) != 0) 751 return 1; 752 753 return 0; 754 } 755 756 /* 757 * Packet has been made sufficiently writable in caller 758 * - inout: 1=in->out, 0=out->in 759 */ 760 void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp, 761 struct ip_vs_conn *cp, int inout) 762 { 763 struct iphdr *iph = ip_hdr(skb); 764 unsigned int icmp_offset = iph->ihl*4; 765 struct icmphdr *icmph = (struct icmphdr *)(skb_network_header(skb) + 766 icmp_offset); 767 struct iphdr *ciph = (struct iphdr *)(icmph + 1); 768 769 if (inout) { 770 iph->saddr = cp->vaddr.ip; 771 ip_send_check(iph); 772 ciph->daddr = cp->vaddr.ip; 773 ip_send_check(ciph); 774 } else { 775 iph->daddr = cp->daddr.ip; 776 ip_send_check(iph); 777 ciph->saddr = cp->daddr.ip; 778 ip_send_check(ciph); 779 } 780 781 /* the TCP/UDP/SCTP port */ 782 if (IPPROTO_TCP == ciph->protocol || IPPROTO_UDP == ciph->protocol || 783 IPPROTO_SCTP == ciph->protocol) { 784 __be16 *ports = (void *)ciph + ciph->ihl*4; 785 786 if (inout) 787 ports[1] = cp->vport; 788 else 789 ports[0] = cp->dport; 790 } 791 792 /* And finally the ICMP checksum */ 793 icmph->checksum = 0; 794 icmph->checksum = ip_vs_checksum_complete(skb, icmp_offset); 795 skb->ip_summed = CHECKSUM_UNNECESSARY; 796 797 if (inout) 798 IP_VS_DBG_PKT(11, AF_INET, pp, skb, (void *)ciph - (void *)iph, 799 "Forwarding altered outgoing ICMP"); 800 else 801 IP_VS_DBG_PKT(11, AF_INET, pp, skb, (void *)ciph - (void *)iph, 802 "Forwarding altered incoming ICMP"); 803 } 804 805 #ifdef CONFIG_IP_VS_IPV6 806 void ip_vs_nat_icmp_v6(struct sk_buff *skb, struct ip_vs_protocol *pp, 807 struct ip_vs_conn *cp, int inout) 808 { 809 struct ipv6hdr *iph = ipv6_hdr(skb); 810 unsigned int icmp_offset = 0; 811 unsigned int offs = 0; /* header offset*/ 812 int protocol; 813 struct icmp6hdr *icmph; 814 struct ipv6hdr *ciph; 815 unsigned short fragoffs; 816 817 ipv6_find_hdr(skb, &icmp_offset, IPPROTO_ICMPV6, &fragoffs, NULL); 818 icmph = (struct icmp6hdr *)(skb_network_header(skb) + icmp_offset); 819 offs = icmp_offset + sizeof(struct icmp6hdr); 820 ciph = (struct ipv6hdr *)(skb_network_header(skb) + offs); 821 822 protocol = ipv6_find_hdr(skb, &offs, -1, &fragoffs, NULL); 823 824 if (inout) { 825 iph->saddr = cp->vaddr.in6; 826 ciph->daddr = cp->vaddr.in6; 827 } else { 828 iph->daddr = cp->daddr.in6; 829 ciph->saddr = cp->daddr.in6; 830 } 831 832 /* the TCP/UDP/SCTP port */ 833 if (!fragoffs && (IPPROTO_TCP == protocol || IPPROTO_UDP == protocol || 834 IPPROTO_SCTP == protocol)) { 835 __be16 *ports = (void *)(skb_network_header(skb) + offs); 836 837 IP_VS_DBG(11, "%s() changed port %d to %d\n", __func__, 838 ntohs(inout ? ports[1] : ports[0]), 839 ntohs(inout ? cp->vport : cp->dport)); 840 if (inout) 841 ports[1] = cp->vport; 842 else 843 ports[0] = cp->dport; 844 } 845 846 /* And finally the ICMP checksum */ 847 icmph->icmp6_cksum = ~csum_ipv6_magic(&iph->saddr, &iph->daddr, 848 skb->len - icmp_offset, 849 IPPROTO_ICMPV6, 0); 850 skb->csum_start = skb_network_header(skb) - skb->head + icmp_offset; 851 skb->csum_offset = offsetof(struct icmp6hdr, icmp6_cksum); 852 skb->ip_summed = CHECKSUM_PARTIAL; 853 854 if (inout) 855 IP_VS_DBG_PKT(11, AF_INET6, pp, skb, 856 (void *)ciph - (void *)iph, 857 "Forwarding altered outgoing ICMPv6"); 858 else 859 IP_VS_DBG_PKT(11, AF_INET6, pp, skb, 860 (void *)ciph - (void *)iph, 861 "Forwarding altered incoming ICMPv6"); 862 } 863 #endif 864 865 /* Handle relevant response ICMP messages - forward to the right 866 * destination host. 867 */ 868 static int handle_response_icmp(int af, struct sk_buff *skb, 869 union nf_inet_addr *snet, 870 __u8 protocol, struct ip_vs_conn *cp, 871 struct ip_vs_protocol *pp, 872 unsigned int offset, unsigned int ihl, 873 unsigned int hooknum) 874 { 875 unsigned int verdict = NF_DROP; 876 877 if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) 878 goto ignore_cp; 879 880 /* Ensure the checksum is correct */ 881 if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) { 882 /* Failed checksum! */ 883 IP_VS_DBG_BUF(1, "Forward ICMP: failed checksum from %s!\n", 884 IP_VS_DBG_ADDR(af, snet)); 885 goto out; 886 } 887 888 if (IPPROTO_TCP == protocol || IPPROTO_UDP == protocol || 889 IPPROTO_SCTP == protocol) 890 offset += 2 * sizeof(__u16); 891 if (skb_ensure_writable(skb, offset)) 892 goto out; 893 894 #ifdef CONFIG_IP_VS_IPV6 895 if (af == AF_INET6) 896 ip_vs_nat_icmp_v6(skb, pp, cp, 1); 897 else 898 #endif 899 ip_vs_nat_icmp(skb, pp, cp, 1); 900 901 if (ip_vs_route_me_harder(cp->ipvs, af, skb, hooknum)) 902 goto out; 903 904 /* do the statistics and put it back */ 905 ip_vs_out_stats(cp, skb); 906 907 skb->ipvs_property = 1; 908 if (!(cp->flags & IP_VS_CONN_F_NFCT)) 909 ip_vs_notrack(skb); 910 else 911 ip_vs_update_conntrack(skb, cp, 0); 912 913 ignore_cp: 914 verdict = NF_ACCEPT; 915 916 out: 917 __ip_vs_conn_put(cp); 918 919 return verdict; 920 } 921 922 /* 923 * Handle ICMP messages in the inside-to-outside direction (outgoing). 924 * Find any that might be relevant, check against existing connections. 925 * Currently handles error types - unreachable, quench, ttl exceeded. 926 */ 927 static int ip_vs_out_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, 928 int *related, unsigned int hooknum) 929 { 930 struct iphdr *iph; 931 struct icmphdr _icmph, *ic; 932 struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */ 933 struct ip_vs_iphdr ciph; 934 struct ip_vs_conn *cp; 935 struct ip_vs_protocol *pp; 936 unsigned int offset, ihl; 937 union nf_inet_addr snet; 938 939 *related = 1; 940 941 /* reassemble IP fragments */ 942 if (ip_is_fragment(ip_hdr(skb))) { 943 if (ip_vs_gather_frags(ipvs, skb, ip_vs_defrag_user(hooknum))) 944 return NF_STOLEN; 945 } 946 947 iph = ip_hdr(skb); 948 offset = ihl = iph->ihl * 4; 949 ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph); 950 if (ic == NULL) 951 return NF_DROP; 952 953 IP_VS_DBG(12, "Outgoing ICMP (%d,%d) %pI4->%pI4\n", 954 ic->type, ntohs(icmp_id(ic)), 955 &iph->saddr, &iph->daddr); 956 957 /* 958 * Work through seeing if this is for us. 959 * These checks are supposed to be in an order that means easy 960 * things are checked first to speed up processing.... however 961 * this means that some packets will manage to get a long way 962 * down this stack and then be rejected, but that's life. 963 */ 964 if ((ic->type != ICMP_DEST_UNREACH) && 965 (ic->type != ICMP_SOURCE_QUENCH) && 966 (ic->type != ICMP_TIME_EXCEEDED)) { 967 *related = 0; 968 return NF_ACCEPT; 969 } 970 971 /* Now find the contained IP header */ 972 offset += sizeof(_icmph); 973 cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph); 974 if (cih == NULL) 975 return NF_ACCEPT; /* The packet looks wrong, ignore */ 976 977 pp = ip_vs_proto_get(cih->protocol); 978 if (!pp) 979 return NF_ACCEPT; 980 981 /* Is the embedded protocol header present? */ 982 if (unlikely(cih->frag_off & htons(IP_OFFSET) && 983 pp->dont_defrag)) 984 return NF_ACCEPT; 985 986 IP_VS_DBG_PKT(11, AF_INET, pp, skb, offset, 987 "Checking outgoing ICMP for"); 988 989 ip_vs_fill_iph_skb_icmp(AF_INET, skb, offset, true, &ciph); 990 991 /* The embedded headers contain source and dest in reverse order */ 992 cp = INDIRECT_CALL_1(pp->conn_out_get, ip_vs_conn_out_get_proto, 993 ipvs, AF_INET, skb, &ciph); 994 if (!cp) 995 return NF_ACCEPT; 996 997 snet.ip = iph->saddr; 998 return handle_response_icmp(AF_INET, skb, &snet, cih->protocol, cp, 999 pp, ciph.len, ihl, hooknum); 1000 } 1001 1002 #ifdef CONFIG_IP_VS_IPV6 1003 static int ip_vs_out_icmp_v6(struct netns_ipvs *ipvs, struct sk_buff *skb, 1004 int *related, unsigned int hooknum, 1005 struct ip_vs_iphdr *ipvsh) 1006 { 1007 struct icmp6hdr _icmph, *ic; 1008 struct ip_vs_iphdr ciph = {.flags = 0, .fragoffs = 0};/*Contained IP */ 1009 struct ip_vs_conn *cp; 1010 struct ip_vs_protocol *pp; 1011 union nf_inet_addr snet; 1012 unsigned int offset; 1013 1014 *related = 1; 1015 ic = frag_safe_skb_hp(skb, ipvsh->len, sizeof(_icmph), &_icmph); 1016 if (ic == NULL) 1017 return NF_DROP; 1018 1019 /* 1020 * Work through seeing if this is for us. 1021 * These checks are supposed to be in an order that means easy 1022 * things are checked first to speed up processing.... however 1023 * this means that some packets will manage to get a long way 1024 * down this stack and then be rejected, but that's life. 1025 */ 1026 if (ic->icmp6_type & ICMPV6_INFOMSG_MASK) { 1027 *related = 0; 1028 return NF_ACCEPT; 1029 } 1030 /* Fragment header that is before ICMP header tells us that: 1031 * it's not an error message since they can't be fragmented. 1032 */ 1033 if (ipvsh->flags & IP6_FH_F_FRAG) 1034 return NF_DROP; 1035 1036 IP_VS_DBG(8, "Outgoing ICMPv6 (%d,%d) %pI6c->%pI6c\n", 1037 ic->icmp6_type, ntohs(icmpv6_id(ic)), 1038 &ipvsh->saddr, &ipvsh->daddr); 1039 1040 if (!ip_vs_fill_iph_skb_icmp(AF_INET6, skb, ipvsh->len + sizeof(_icmph), 1041 true, &ciph)) 1042 return NF_ACCEPT; /* The packet looks wrong, ignore */ 1043 1044 pp = ip_vs_proto_get(ciph.protocol); 1045 if (!pp) 1046 return NF_ACCEPT; 1047 1048 /* The embedded headers contain source and dest in reverse order */ 1049 cp = INDIRECT_CALL_1(pp->conn_out_get, ip_vs_conn_out_get_proto, 1050 ipvs, AF_INET6, skb, &ciph); 1051 if (!cp) 1052 return NF_ACCEPT; 1053 1054 snet.in6 = ciph.saddr.in6; 1055 offset = ciph.len; 1056 return handle_response_icmp(AF_INET6, skb, &snet, ciph.protocol, cp, 1057 pp, offset, sizeof(struct ipv6hdr), 1058 hooknum); 1059 } 1060 #endif 1061 1062 /* 1063 * Check if sctp chunc is ABORT chunk 1064 */ 1065 static inline int is_sctp_abort(const struct sk_buff *skb, int nh_len) 1066 { 1067 struct sctp_chunkhdr *sch, schunk; 1068 sch = skb_header_pointer(skb, nh_len + sizeof(struct sctphdr), 1069 sizeof(schunk), &schunk); 1070 if (sch == NULL) 1071 return 0; 1072 if (sch->type == SCTP_CID_ABORT) 1073 return 1; 1074 return 0; 1075 } 1076 1077 static inline int is_tcp_reset(const struct sk_buff *skb, int nh_len) 1078 { 1079 struct tcphdr _tcph, *th; 1080 1081 th = skb_header_pointer(skb, nh_len, sizeof(_tcph), &_tcph); 1082 if (th == NULL) 1083 return 0; 1084 return th->rst; 1085 } 1086 1087 static inline bool is_new_conn(const struct sk_buff *skb, 1088 struct ip_vs_iphdr *iph) 1089 { 1090 switch (iph->protocol) { 1091 case IPPROTO_TCP: { 1092 struct tcphdr _tcph, *th; 1093 1094 th = skb_header_pointer(skb, iph->len, sizeof(_tcph), &_tcph); 1095 if (th == NULL) 1096 return false; 1097 return th->syn; 1098 } 1099 case IPPROTO_SCTP: { 1100 struct sctp_chunkhdr *sch, schunk; 1101 1102 sch = skb_header_pointer(skb, iph->len + sizeof(struct sctphdr), 1103 sizeof(schunk), &schunk); 1104 if (sch == NULL) 1105 return false; 1106 return sch->type == SCTP_CID_INIT; 1107 } 1108 default: 1109 return false; 1110 } 1111 } 1112 1113 static inline bool is_new_conn_expected(const struct ip_vs_conn *cp, 1114 int conn_reuse_mode) 1115 { 1116 /* Controlled (FTP DATA or persistence)? */ 1117 if (cp->control) 1118 return false; 1119 1120 switch (cp->protocol) { 1121 case IPPROTO_TCP: 1122 return (cp->state == IP_VS_TCP_S_TIME_WAIT) || 1123 (cp->state == IP_VS_TCP_S_CLOSE) || 1124 ((conn_reuse_mode & 2) && 1125 (cp->state == IP_VS_TCP_S_FIN_WAIT) && 1126 (cp->flags & IP_VS_CONN_F_NOOUTPUT)); 1127 case IPPROTO_SCTP: 1128 return cp->state == IP_VS_SCTP_S_CLOSED; 1129 default: 1130 return false; 1131 } 1132 } 1133 1134 /* Generic function to create new connections for outgoing RS packets 1135 * 1136 * Pre-requisites for successful connection creation: 1137 * 1) Virtual Service is NOT fwmark based: 1138 * In fwmark-VS actual vaddr and vport are unknown to IPVS 1139 * 2) Real Server and Virtual Service were NOT configured without port: 1140 * This is to allow match of different VS to the same RS ip-addr 1141 */ 1142 struct ip_vs_conn *ip_vs_new_conn_out(struct ip_vs_service *svc, 1143 struct ip_vs_dest *dest, 1144 struct sk_buff *skb, 1145 const struct ip_vs_iphdr *iph, 1146 __be16 dport, 1147 __be16 cport) 1148 { 1149 struct ip_vs_conn_param param; 1150 struct ip_vs_conn *ct = NULL, *cp = NULL; 1151 const union nf_inet_addr *vaddr, *daddr, *caddr; 1152 union nf_inet_addr snet; 1153 __be16 vport; 1154 unsigned int flags; 1155 1156 EnterFunction(12); 1157 vaddr = &svc->addr; 1158 vport = svc->port; 1159 daddr = &iph->saddr; 1160 caddr = &iph->daddr; 1161 1162 /* check pre-requisites are satisfied */ 1163 if (svc->fwmark) 1164 return NULL; 1165 if (!vport || !dport) 1166 return NULL; 1167 1168 /* for persistent service first create connection template */ 1169 if (svc->flags & IP_VS_SVC_F_PERSISTENT) { 1170 /* apply netmask the same way ingress-side does */ 1171 #ifdef CONFIG_IP_VS_IPV6 1172 if (svc->af == AF_INET6) 1173 ipv6_addr_prefix(&snet.in6, &caddr->in6, 1174 (__force __u32)svc->netmask); 1175 else 1176 #endif 1177 snet.ip = caddr->ip & svc->netmask; 1178 /* fill params and create template if not existent */ 1179 if (ip_vs_conn_fill_param_persist(svc, skb, iph->protocol, 1180 &snet, 0, vaddr, 1181 vport, ¶m) < 0) 1182 return NULL; 1183 ct = ip_vs_ct_in_get(¶m); 1184 /* check if template exists and points to the same dest */ 1185 if (!ct || !ip_vs_check_template(ct, dest)) { 1186 ct = ip_vs_conn_new(¶m, dest->af, daddr, dport, 1187 IP_VS_CONN_F_TEMPLATE, dest, 0); 1188 if (!ct) { 1189 kfree(param.pe_data); 1190 return NULL; 1191 } 1192 ct->timeout = svc->timeout; 1193 } else { 1194 kfree(param.pe_data); 1195 } 1196 } 1197 1198 /* connection flags */ 1199 flags = ((svc->flags & IP_VS_SVC_F_ONEPACKET) && 1200 iph->protocol == IPPROTO_UDP) ? IP_VS_CONN_F_ONE_PACKET : 0; 1201 /* create connection */ 1202 ip_vs_conn_fill_param(svc->ipvs, svc->af, iph->protocol, 1203 caddr, cport, vaddr, vport, ¶m); 1204 cp = ip_vs_conn_new(¶m, dest->af, daddr, dport, flags, dest, 0); 1205 if (!cp) { 1206 if (ct) 1207 ip_vs_conn_put(ct); 1208 return NULL; 1209 } 1210 if (ct) { 1211 ip_vs_control_add(cp, ct); 1212 ip_vs_conn_put(ct); 1213 } 1214 ip_vs_conn_stats(cp, svc); 1215 1216 /* return connection (will be used to handle outgoing packet) */ 1217 IP_VS_DBG_BUF(6, "New connection RS-initiated:%c c:%s:%u v:%s:%u " 1218 "d:%s:%u conn->flags:%X conn->refcnt:%d\n", 1219 ip_vs_fwd_tag(cp), 1220 IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), 1221 IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport), 1222 IP_VS_DBG_ADDR(cp->af, &cp->daddr), ntohs(cp->dport), 1223 cp->flags, refcount_read(&cp->refcnt)); 1224 LeaveFunction(12); 1225 return cp; 1226 } 1227 1228 /* Handle outgoing packets which are considered requests initiated by 1229 * real servers, so that subsequent responses from external client can be 1230 * routed to the right real server. 1231 * Used also for outgoing responses in OPS mode. 1232 * 1233 * Connection management is handled by persistent-engine specific callback. 1234 */ 1235 static struct ip_vs_conn *__ip_vs_rs_conn_out(unsigned int hooknum, 1236 struct netns_ipvs *ipvs, 1237 int af, struct sk_buff *skb, 1238 const struct ip_vs_iphdr *iph) 1239 { 1240 struct ip_vs_dest *dest; 1241 struct ip_vs_conn *cp = NULL; 1242 __be16 _ports[2], *pptr; 1243 1244 if (hooknum == NF_INET_LOCAL_IN) 1245 return NULL; 1246 1247 pptr = frag_safe_skb_hp(skb, iph->len, 1248 sizeof(_ports), _ports); 1249 if (!pptr) 1250 return NULL; 1251 1252 dest = ip_vs_find_real_service(ipvs, af, iph->protocol, 1253 &iph->saddr, pptr[0]); 1254 if (dest) { 1255 struct ip_vs_service *svc; 1256 struct ip_vs_pe *pe; 1257 1258 svc = rcu_dereference(dest->svc); 1259 if (svc) { 1260 pe = rcu_dereference(svc->pe); 1261 if (pe && pe->conn_out) 1262 cp = pe->conn_out(svc, dest, skb, iph, 1263 pptr[0], pptr[1]); 1264 } 1265 } 1266 1267 return cp; 1268 } 1269 1270 /* Handle response packets: rewrite addresses and send away... 1271 */ 1272 static unsigned int 1273 handle_response(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, 1274 struct ip_vs_conn *cp, struct ip_vs_iphdr *iph, 1275 unsigned int hooknum) 1276 { 1277 struct ip_vs_protocol *pp = pd->pp; 1278 1279 IP_VS_DBG_PKT(11, af, pp, skb, iph->off, "Outgoing packet"); 1280 1281 if (skb_ensure_writable(skb, iph->len)) 1282 goto drop; 1283 1284 /* mangle the packet */ 1285 if (pp->snat_handler && 1286 !SNAT_CALL(pp->snat_handler, skb, pp, cp, iph)) 1287 goto drop; 1288 1289 #ifdef CONFIG_IP_VS_IPV6 1290 if (af == AF_INET6) 1291 ipv6_hdr(skb)->saddr = cp->vaddr.in6; 1292 else 1293 #endif 1294 { 1295 ip_hdr(skb)->saddr = cp->vaddr.ip; 1296 ip_send_check(ip_hdr(skb)); 1297 } 1298 1299 /* 1300 * nf_iterate does not expect change in the skb->dst->dev. 1301 * It looks like it is not fatal to enable this code for hooks 1302 * where our handlers are at the end of the chain list and 1303 * when all next handlers use skb->dst->dev and not outdev. 1304 * It will definitely route properly the inout NAT traffic 1305 * when multiple paths are used. 1306 */ 1307 1308 /* For policy routing, packets originating from this 1309 * machine itself may be routed differently to packets 1310 * passing through. We want this packet to be routed as 1311 * if it came from this machine itself. So re-compute 1312 * the routing information. 1313 */ 1314 if (ip_vs_route_me_harder(cp->ipvs, af, skb, hooknum)) 1315 goto drop; 1316 1317 IP_VS_DBG_PKT(10, af, pp, skb, iph->off, "After SNAT"); 1318 1319 ip_vs_out_stats(cp, skb); 1320 ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pd); 1321 skb->ipvs_property = 1; 1322 if (!(cp->flags & IP_VS_CONN_F_NFCT)) 1323 ip_vs_notrack(skb); 1324 else 1325 ip_vs_update_conntrack(skb, cp, 0); 1326 ip_vs_conn_put(cp); 1327 1328 LeaveFunction(11); 1329 return NF_ACCEPT; 1330 1331 drop: 1332 ip_vs_conn_put(cp); 1333 kfree_skb(skb); 1334 LeaveFunction(11); 1335 return NF_STOLEN; 1336 } 1337 1338 /* 1339 * Check if outgoing packet belongs to the established ip_vs_conn. 1340 */ 1341 static unsigned int 1342 ip_vs_out(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int af) 1343 { 1344 struct ip_vs_iphdr iph; 1345 struct ip_vs_protocol *pp; 1346 struct ip_vs_proto_data *pd; 1347 struct ip_vs_conn *cp; 1348 struct sock *sk; 1349 1350 EnterFunction(11); 1351 1352 /* Already marked as IPVS request or reply? */ 1353 if (skb->ipvs_property) 1354 return NF_ACCEPT; 1355 1356 sk = skb_to_full_sk(skb); 1357 /* Bad... Do not break raw sockets */ 1358 if (unlikely(sk && hooknum == NF_INET_LOCAL_OUT && 1359 af == AF_INET)) { 1360 1361 if (sk->sk_family == PF_INET && inet_sk(sk)->nodefrag) 1362 return NF_ACCEPT; 1363 } 1364 1365 if (unlikely(!skb_dst(skb))) 1366 return NF_ACCEPT; 1367 1368 if (!ipvs->enable) 1369 return NF_ACCEPT; 1370 1371 ip_vs_fill_iph_skb(af, skb, false, &iph); 1372 #ifdef CONFIG_IP_VS_IPV6 1373 if (af == AF_INET6) { 1374 if (unlikely(iph.protocol == IPPROTO_ICMPV6)) { 1375 int related; 1376 int verdict = ip_vs_out_icmp_v6(ipvs, skb, &related, 1377 hooknum, &iph); 1378 1379 if (related) 1380 return verdict; 1381 } 1382 } else 1383 #endif 1384 if (unlikely(iph.protocol == IPPROTO_ICMP)) { 1385 int related; 1386 int verdict = ip_vs_out_icmp(ipvs, skb, &related, hooknum); 1387 1388 if (related) 1389 return verdict; 1390 } 1391 1392 pd = ip_vs_proto_data_get(ipvs, iph.protocol); 1393 if (unlikely(!pd)) 1394 return NF_ACCEPT; 1395 pp = pd->pp; 1396 1397 /* reassemble IP fragments */ 1398 #ifdef CONFIG_IP_VS_IPV6 1399 if (af == AF_INET) 1400 #endif 1401 if (unlikely(ip_is_fragment(ip_hdr(skb)) && !pp->dont_defrag)) { 1402 if (ip_vs_gather_frags(ipvs, skb, 1403 ip_vs_defrag_user(hooknum))) 1404 return NF_STOLEN; 1405 1406 ip_vs_fill_iph_skb(AF_INET, skb, false, &iph); 1407 } 1408 1409 /* 1410 * Check if the packet belongs to an existing entry 1411 */ 1412 cp = INDIRECT_CALL_1(pp->conn_out_get, ip_vs_conn_out_get_proto, 1413 ipvs, af, skb, &iph); 1414 1415 if (likely(cp)) { 1416 if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) 1417 goto ignore_cp; 1418 return handle_response(af, skb, pd, cp, &iph, hooknum); 1419 } 1420 1421 /* Check for real-server-started requests */ 1422 if (atomic_read(&ipvs->conn_out_counter)) { 1423 /* Currently only for UDP: 1424 * connection oriented protocols typically use 1425 * ephemeral ports for outgoing connections, so 1426 * related incoming responses would not match any VS 1427 */ 1428 if (pp->protocol == IPPROTO_UDP) { 1429 cp = __ip_vs_rs_conn_out(hooknum, ipvs, af, skb, &iph); 1430 if (likely(cp)) 1431 return handle_response(af, skb, pd, cp, &iph, 1432 hooknum); 1433 } 1434 } 1435 1436 if (sysctl_nat_icmp_send(ipvs) && 1437 (pp->protocol == IPPROTO_TCP || 1438 pp->protocol == IPPROTO_UDP || 1439 pp->protocol == IPPROTO_SCTP)) { 1440 __be16 _ports[2], *pptr; 1441 1442 pptr = frag_safe_skb_hp(skb, iph.len, 1443 sizeof(_ports), _ports); 1444 if (pptr == NULL) 1445 return NF_ACCEPT; /* Not for me */ 1446 if (ip_vs_has_real_service(ipvs, af, iph.protocol, &iph.saddr, 1447 pptr[0])) { 1448 /* 1449 * Notify the real server: there is no 1450 * existing entry if it is not RST 1451 * packet or not TCP packet. 1452 */ 1453 if ((iph.protocol != IPPROTO_TCP && 1454 iph.protocol != IPPROTO_SCTP) 1455 || ((iph.protocol == IPPROTO_TCP 1456 && !is_tcp_reset(skb, iph.len)) 1457 || (iph.protocol == IPPROTO_SCTP 1458 && !is_sctp_abort(skb, 1459 iph.len)))) { 1460 #ifdef CONFIG_IP_VS_IPV6 1461 if (af == AF_INET6) { 1462 if (!skb->dev) 1463 skb->dev = ipvs->net->loopback_dev; 1464 icmpv6_send(skb, 1465 ICMPV6_DEST_UNREACH, 1466 ICMPV6_PORT_UNREACH, 1467 0); 1468 } else 1469 #endif 1470 icmp_send(skb, 1471 ICMP_DEST_UNREACH, 1472 ICMP_PORT_UNREACH, 0); 1473 return NF_DROP; 1474 } 1475 } 1476 } 1477 1478 out: 1479 IP_VS_DBG_PKT(12, af, pp, skb, iph.off, 1480 "ip_vs_out: packet continues traversal as normal"); 1481 return NF_ACCEPT; 1482 1483 ignore_cp: 1484 __ip_vs_conn_put(cp); 1485 goto out; 1486 } 1487 1488 /* 1489 * It is hooked at the NF_INET_FORWARD and NF_INET_LOCAL_IN chain, 1490 * used only for VS/NAT. 1491 * Check if packet is reply for established ip_vs_conn. 1492 */ 1493 static unsigned int 1494 ip_vs_reply4(void *priv, struct sk_buff *skb, 1495 const struct nf_hook_state *state) 1496 { 1497 return ip_vs_out(net_ipvs(state->net), state->hook, skb, AF_INET); 1498 } 1499 1500 /* 1501 * It is hooked at the NF_INET_LOCAL_OUT chain, used only for VS/NAT. 1502 * Check if packet is reply for established ip_vs_conn. 1503 */ 1504 static unsigned int 1505 ip_vs_local_reply4(void *priv, struct sk_buff *skb, 1506 const struct nf_hook_state *state) 1507 { 1508 return ip_vs_out(net_ipvs(state->net), state->hook, skb, AF_INET); 1509 } 1510 1511 #ifdef CONFIG_IP_VS_IPV6 1512 1513 /* 1514 * It is hooked at the NF_INET_FORWARD and NF_INET_LOCAL_IN chain, 1515 * used only for VS/NAT. 1516 * Check if packet is reply for established ip_vs_conn. 1517 */ 1518 static unsigned int 1519 ip_vs_reply6(void *priv, struct sk_buff *skb, 1520 const struct nf_hook_state *state) 1521 { 1522 return ip_vs_out(net_ipvs(state->net), state->hook, skb, AF_INET6); 1523 } 1524 1525 /* 1526 * It is hooked at the NF_INET_LOCAL_OUT chain, used only for VS/NAT. 1527 * Check if packet is reply for established ip_vs_conn. 1528 */ 1529 static unsigned int 1530 ip_vs_local_reply6(void *priv, struct sk_buff *skb, 1531 const struct nf_hook_state *state) 1532 { 1533 return ip_vs_out(net_ipvs(state->net), state->hook, skb, AF_INET6); 1534 } 1535 1536 #endif 1537 1538 static unsigned int 1539 ip_vs_try_to_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb, 1540 struct ip_vs_proto_data *pd, 1541 int *verdict, struct ip_vs_conn **cpp, 1542 struct ip_vs_iphdr *iph) 1543 { 1544 struct ip_vs_protocol *pp = pd->pp; 1545 1546 if (!iph->fragoffs) { 1547 /* No (second) fragments need to enter here, as nf_defrag_ipv6 1548 * replayed fragment zero will already have created the cp 1549 */ 1550 1551 /* Schedule and create new connection entry into cpp */ 1552 if (!pp->conn_schedule(ipvs, af, skb, pd, verdict, cpp, iph)) 1553 return 0; 1554 } 1555 1556 if (unlikely(!*cpp)) { 1557 /* sorry, all this trouble for a no-hit :) */ 1558 IP_VS_DBG_PKT(12, af, pp, skb, iph->off, 1559 "ip_vs_in: packet continues traversal as normal"); 1560 1561 /* Fragment couldn't be mapped to a conn entry */ 1562 if (iph->fragoffs) 1563 IP_VS_DBG_PKT(7, af, pp, skb, iph->off, 1564 "unhandled fragment"); 1565 1566 *verdict = NF_ACCEPT; 1567 return 0; 1568 } 1569 1570 return 1; 1571 } 1572 1573 /* Check the UDP tunnel and return its header length */ 1574 static int ipvs_udp_decap(struct netns_ipvs *ipvs, struct sk_buff *skb, 1575 unsigned int offset, __u16 af, 1576 const union nf_inet_addr *daddr, __u8 *proto) 1577 { 1578 struct udphdr _udph, *udph; 1579 struct ip_vs_dest *dest; 1580 1581 udph = skb_header_pointer(skb, offset, sizeof(_udph), &_udph); 1582 if (!udph) 1583 goto unk; 1584 offset += sizeof(struct udphdr); 1585 dest = ip_vs_find_tunnel(ipvs, af, daddr, udph->dest); 1586 if (!dest) 1587 goto unk; 1588 if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { 1589 struct guehdr _gueh, *gueh; 1590 1591 gueh = skb_header_pointer(skb, offset, sizeof(_gueh), &_gueh); 1592 if (!gueh) 1593 goto unk; 1594 if (gueh->control != 0 || gueh->version != 0) 1595 goto unk; 1596 /* Later we can support also IPPROTO_IPV6 */ 1597 if (gueh->proto_ctype != IPPROTO_IPIP) 1598 goto unk; 1599 *proto = gueh->proto_ctype; 1600 return sizeof(struct udphdr) + sizeof(struct guehdr) + 1601 (gueh->hlen << 2); 1602 } 1603 1604 unk: 1605 return 0; 1606 } 1607 1608 /* Check the GRE tunnel and return its header length */ 1609 static int ipvs_gre_decap(struct netns_ipvs *ipvs, struct sk_buff *skb, 1610 unsigned int offset, __u16 af, 1611 const union nf_inet_addr *daddr, __u8 *proto) 1612 { 1613 struct gre_base_hdr _greh, *greh; 1614 struct ip_vs_dest *dest; 1615 1616 greh = skb_header_pointer(skb, offset, sizeof(_greh), &_greh); 1617 if (!greh) 1618 goto unk; 1619 dest = ip_vs_find_tunnel(ipvs, af, daddr, 0); 1620 if (!dest) 1621 goto unk; 1622 if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) { 1623 __be16 type; 1624 1625 /* Only support version 0 and C (csum) */ 1626 if ((greh->flags & ~GRE_CSUM) != 0) 1627 goto unk; 1628 type = greh->protocol; 1629 /* Later we can support also IPPROTO_IPV6 */ 1630 if (type != htons(ETH_P_IP)) 1631 goto unk; 1632 *proto = IPPROTO_IPIP; 1633 return gre_calc_hlen(gre_flags_to_tnl_flags(greh->flags)); 1634 } 1635 1636 unk: 1637 return 0; 1638 } 1639 1640 /* 1641 * Handle ICMP messages in the outside-to-inside direction (incoming). 1642 * Find any that might be relevant, check against existing connections, 1643 * forward to the right destination host if relevant. 1644 * Currently handles error types - unreachable, quench, ttl exceeded. 1645 */ 1646 static int 1647 ip_vs_in_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related, 1648 unsigned int hooknum) 1649 { 1650 struct iphdr *iph; 1651 struct icmphdr _icmph, *ic; 1652 struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */ 1653 struct ip_vs_iphdr ciph; 1654 struct ip_vs_conn *cp; 1655 struct ip_vs_protocol *pp; 1656 struct ip_vs_proto_data *pd; 1657 unsigned int offset, offset2, ihl, verdict; 1658 bool tunnel, new_cp = false; 1659 union nf_inet_addr *raddr; 1660 char *outer_proto = "IPIP"; 1661 1662 *related = 1; 1663 1664 /* reassemble IP fragments */ 1665 if (ip_is_fragment(ip_hdr(skb))) { 1666 if (ip_vs_gather_frags(ipvs, skb, ip_vs_defrag_user(hooknum))) 1667 return NF_STOLEN; 1668 } 1669 1670 iph = ip_hdr(skb); 1671 offset = ihl = iph->ihl * 4; 1672 ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph); 1673 if (ic == NULL) 1674 return NF_DROP; 1675 1676 IP_VS_DBG(12, "Incoming ICMP (%d,%d) %pI4->%pI4\n", 1677 ic->type, ntohs(icmp_id(ic)), 1678 &iph->saddr, &iph->daddr); 1679 1680 /* 1681 * Work through seeing if this is for us. 1682 * These checks are supposed to be in an order that means easy 1683 * things are checked first to speed up processing.... however 1684 * this means that some packets will manage to get a long way 1685 * down this stack and then be rejected, but that's life. 1686 */ 1687 if ((ic->type != ICMP_DEST_UNREACH) && 1688 (ic->type != ICMP_SOURCE_QUENCH) && 1689 (ic->type != ICMP_TIME_EXCEEDED)) { 1690 *related = 0; 1691 return NF_ACCEPT; 1692 } 1693 1694 /* Now find the contained IP header */ 1695 offset += sizeof(_icmph); 1696 cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph); 1697 if (cih == NULL) 1698 return NF_ACCEPT; /* The packet looks wrong, ignore */ 1699 raddr = (union nf_inet_addr *)&cih->daddr; 1700 1701 /* Special case for errors for IPIP/UDP/GRE tunnel packets */ 1702 tunnel = false; 1703 if (cih->protocol == IPPROTO_IPIP) { 1704 struct ip_vs_dest *dest; 1705 1706 if (unlikely(cih->frag_off & htons(IP_OFFSET))) 1707 return NF_ACCEPT; 1708 /* Error for our IPIP must arrive at LOCAL_IN */ 1709 if (!(skb_rtable(skb)->rt_flags & RTCF_LOCAL)) 1710 return NF_ACCEPT; 1711 dest = ip_vs_find_tunnel(ipvs, AF_INET, raddr, 0); 1712 /* Only for known tunnel */ 1713 if (!dest || dest->tun_type != IP_VS_CONN_F_TUNNEL_TYPE_IPIP) 1714 return NF_ACCEPT; 1715 offset += cih->ihl * 4; 1716 cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph); 1717 if (cih == NULL) 1718 return NF_ACCEPT; /* The packet looks wrong, ignore */ 1719 tunnel = true; 1720 } else if ((cih->protocol == IPPROTO_UDP || /* Can be UDP encap */ 1721 cih->protocol == IPPROTO_GRE) && /* Can be GRE encap */ 1722 /* Error for our tunnel must arrive at LOCAL_IN */ 1723 (skb_rtable(skb)->rt_flags & RTCF_LOCAL)) { 1724 __u8 iproto; 1725 int ulen; 1726 1727 /* Non-first fragment has no UDP/GRE header */ 1728 if (unlikely(cih->frag_off & htons(IP_OFFSET))) 1729 return NF_ACCEPT; 1730 offset2 = offset + cih->ihl * 4; 1731 if (cih->protocol == IPPROTO_UDP) { 1732 ulen = ipvs_udp_decap(ipvs, skb, offset2, AF_INET, 1733 raddr, &iproto); 1734 outer_proto = "UDP"; 1735 } else { 1736 ulen = ipvs_gre_decap(ipvs, skb, offset2, AF_INET, 1737 raddr, &iproto); 1738 outer_proto = "GRE"; 1739 } 1740 if (ulen > 0) { 1741 /* Skip IP and UDP/GRE tunnel headers */ 1742 offset = offset2 + ulen; 1743 /* Now we should be at the original IP header */ 1744 cih = skb_header_pointer(skb, offset, sizeof(_ciph), 1745 &_ciph); 1746 if (cih && cih->version == 4 && cih->ihl >= 5 && 1747 iproto == IPPROTO_IPIP) 1748 tunnel = true; 1749 else 1750 return NF_ACCEPT; 1751 } 1752 } 1753 1754 pd = ip_vs_proto_data_get(ipvs, cih->protocol); 1755 if (!pd) 1756 return NF_ACCEPT; 1757 pp = pd->pp; 1758 1759 /* Is the embedded protocol header present? */ 1760 if (unlikely(cih->frag_off & htons(IP_OFFSET) && 1761 pp->dont_defrag)) 1762 return NF_ACCEPT; 1763 1764 IP_VS_DBG_PKT(11, AF_INET, pp, skb, offset, 1765 "Checking incoming ICMP for"); 1766 1767 offset2 = offset; 1768 ip_vs_fill_iph_skb_icmp(AF_INET, skb, offset, !tunnel, &ciph); 1769 offset = ciph.len; 1770 1771 /* The embedded headers contain source and dest in reverse order. 1772 * For IPIP/UDP/GRE tunnel this is error for request, not for reply. 1773 */ 1774 cp = INDIRECT_CALL_1(pp->conn_in_get, ip_vs_conn_in_get_proto, 1775 ipvs, AF_INET, skb, &ciph); 1776 1777 if (!cp) { 1778 int v; 1779 1780 if (tunnel || !sysctl_schedule_icmp(ipvs)) 1781 return NF_ACCEPT; 1782 1783 if (!ip_vs_try_to_schedule(ipvs, AF_INET, skb, pd, &v, &cp, &ciph)) 1784 return v; 1785 new_cp = true; 1786 } 1787 1788 verdict = NF_DROP; 1789 1790 /* Ensure the checksum is correct */ 1791 if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) { 1792 /* Failed checksum! */ 1793 IP_VS_DBG(1, "Incoming ICMP: failed checksum from %pI4!\n", 1794 &iph->saddr); 1795 goto out; 1796 } 1797 1798 if (tunnel) { 1799 __be32 info = ic->un.gateway; 1800 __u8 type = ic->type; 1801 __u8 code = ic->code; 1802 1803 /* Update the MTU */ 1804 if (ic->type == ICMP_DEST_UNREACH && 1805 ic->code == ICMP_FRAG_NEEDED) { 1806 struct ip_vs_dest *dest = cp->dest; 1807 u32 mtu = ntohs(ic->un.frag.mtu); 1808 __be16 frag_off = cih->frag_off; 1809 1810 /* Strip outer IP and ICMP, go to IPIP/UDP/GRE header */ 1811 if (pskb_pull(skb, ihl + sizeof(_icmph)) == NULL) 1812 goto ignore_tunnel; 1813 offset2 -= ihl + sizeof(_icmph); 1814 skb_reset_network_header(skb); 1815 IP_VS_DBG(12, "ICMP for %s %pI4->%pI4: mtu=%u\n", 1816 outer_proto, &ip_hdr(skb)->saddr, 1817 &ip_hdr(skb)->daddr, mtu); 1818 ipv4_update_pmtu(skb, ipvs->net, mtu, 0, 0); 1819 /* Client uses PMTUD? */ 1820 if (!(frag_off & htons(IP_DF))) 1821 goto ignore_tunnel; 1822 /* Prefer the resulting PMTU */ 1823 if (dest) { 1824 struct ip_vs_dest_dst *dest_dst; 1825 1826 dest_dst = rcu_dereference(dest->dest_dst); 1827 if (dest_dst) 1828 mtu = dst_mtu(dest_dst->dst_cache); 1829 } 1830 if (mtu > 68 + sizeof(struct iphdr)) 1831 mtu -= sizeof(struct iphdr); 1832 info = htonl(mtu); 1833 } 1834 /* Strip outer IP, ICMP and IPIP/UDP/GRE, go to IP header of 1835 * original request. 1836 */ 1837 if (pskb_pull(skb, offset2) == NULL) 1838 goto ignore_tunnel; 1839 skb_reset_network_header(skb); 1840 IP_VS_DBG(12, "Sending ICMP for %pI4->%pI4: t=%u, c=%u, i=%u\n", 1841 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, 1842 type, code, ntohl(info)); 1843 icmp_send(skb, type, code, info); 1844 /* ICMP can be shorter but anyways, account it */ 1845 ip_vs_out_stats(cp, skb); 1846 1847 ignore_tunnel: 1848 consume_skb(skb); 1849 verdict = NF_STOLEN; 1850 goto out; 1851 } 1852 1853 /* do the statistics and put it back */ 1854 ip_vs_in_stats(cp, skb); 1855 if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol || 1856 IPPROTO_SCTP == cih->protocol) 1857 offset += 2 * sizeof(__u16); 1858 verdict = ip_vs_icmp_xmit(skb, cp, pp, offset, hooknum, &ciph); 1859 1860 out: 1861 if (likely(!new_cp)) 1862 __ip_vs_conn_put(cp); 1863 else 1864 ip_vs_conn_put(cp); 1865 1866 return verdict; 1867 } 1868 1869 #ifdef CONFIG_IP_VS_IPV6 1870 static int ip_vs_in_icmp_v6(struct netns_ipvs *ipvs, struct sk_buff *skb, 1871 int *related, unsigned int hooknum, 1872 struct ip_vs_iphdr *iph) 1873 { 1874 struct icmp6hdr _icmph, *ic; 1875 struct ip_vs_iphdr ciph = {.flags = 0, .fragoffs = 0};/*Contained IP */ 1876 struct ip_vs_conn *cp; 1877 struct ip_vs_protocol *pp; 1878 struct ip_vs_proto_data *pd; 1879 unsigned int offset, verdict; 1880 bool new_cp = false; 1881 1882 *related = 1; 1883 1884 ic = frag_safe_skb_hp(skb, iph->len, sizeof(_icmph), &_icmph); 1885 if (ic == NULL) 1886 return NF_DROP; 1887 1888 /* 1889 * Work through seeing if this is for us. 1890 * These checks are supposed to be in an order that means easy 1891 * things are checked first to speed up processing.... however 1892 * this means that some packets will manage to get a long way 1893 * down this stack and then be rejected, but that's life. 1894 */ 1895 if (ic->icmp6_type & ICMPV6_INFOMSG_MASK) { 1896 *related = 0; 1897 return NF_ACCEPT; 1898 } 1899 /* Fragment header that is before ICMP header tells us that: 1900 * it's not an error message since they can't be fragmented. 1901 */ 1902 if (iph->flags & IP6_FH_F_FRAG) 1903 return NF_DROP; 1904 1905 IP_VS_DBG(8, "Incoming ICMPv6 (%d,%d) %pI6c->%pI6c\n", 1906 ic->icmp6_type, ntohs(icmpv6_id(ic)), 1907 &iph->saddr, &iph->daddr); 1908 1909 offset = iph->len + sizeof(_icmph); 1910 if (!ip_vs_fill_iph_skb_icmp(AF_INET6, skb, offset, true, &ciph)) 1911 return NF_ACCEPT; 1912 1913 pd = ip_vs_proto_data_get(ipvs, ciph.protocol); 1914 if (!pd) 1915 return NF_ACCEPT; 1916 pp = pd->pp; 1917 1918 /* Cannot handle fragmented embedded protocol */ 1919 if (ciph.fragoffs) 1920 return NF_ACCEPT; 1921 1922 IP_VS_DBG_PKT(11, AF_INET6, pp, skb, offset, 1923 "Checking incoming ICMPv6 for"); 1924 1925 /* The embedded headers contain source and dest in reverse order 1926 * if not from localhost 1927 */ 1928 cp = INDIRECT_CALL_1(pp->conn_in_get, ip_vs_conn_in_get_proto, 1929 ipvs, AF_INET6, skb, &ciph); 1930 1931 if (!cp) { 1932 int v; 1933 1934 if (!sysctl_schedule_icmp(ipvs)) 1935 return NF_ACCEPT; 1936 1937 if (!ip_vs_try_to_schedule(ipvs, AF_INET6, skb, pd, &v, &cp, &ciph)) 1938 return v; 1939 1940 new_cp = true; 1941 } 1942 1943 /* VS/TUN, VS/DR and LOCALNODE just let it go */ 1944 if ((hooknum == NF_INET_LOCAL_OUT) && 1945 (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)) { 1946 verdict = NF_ACCEPT; 1947 goto out; 1948 } 1949 1950 /* do the statistics and put it back */ 1951 ip_vs_in_stats(cp, skb); 1952 1953 /* Need to mangle contained IPv6 header in ICMPv6 packet */ 1954 offset = ciph.len; 1955 if (IPPROTO_TCP == ciph.protocol || IPPROTO_UDP == ciph.protocol || 1956 IPPROTO_SCTP == ciph.protocol) 1957 offset += 2 * sizeof(__u16); /* Also mangle ports */ 1958 1959 verdict = ip_vs_icmp_xmit_v6(skb, cp, pp, offset, hooknum, &ciph); 1960 1961 out: 1962 if (likely(!new_cp)) 1963 __ip_vs_conn_put(cp); 1964 else 1965 ip_vs_conn_put(cp); 1966 1967 return verdict; 1968 } 1969 #endif 1970 1971 1972 /* 1973 * Check if it's for virtual services, look it up, 1974 * and send it on its way... 1975 */ 1976 static unsigned int 1977 ip_vs_in(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int af) 1978 { 1979 struct ip_vs_iphdr iph; 1980 struct ip_vs_protocol *pp; 1981 struct ip_vs_proto_data *pd; 1982 struct ip_vs_conn *cp; 1983 int ret, pkts; 1984 int conn_reuse_mode; 1985 struct sock *sk; 1986 1987 /* Already marked as IPVS request or reply? */ 1988 if (skb->ipvs_property) 1989 return NF_ACCEPT; 1990 1991 /* 1992 * Big tappo: 1993 * - remote client: only PACKET_HOST 1994 * - route: used for struct net when skb->dev is unset 1995 */ 1996 if (unlikely((skb->pkt_type != PACKET_HOST && 1997 hooknum != NF_INET_LOCAL_OUT) || 1998 !skb_dst(skb))) { 1999 ip_vs_fill_iph_skb(af, skb, false, &iph); 2000 IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s" 2001 " ignored in hook %u\n", 2002 skb->pkt_type, iph.protocol, 2003 IP_VS_DBG_ADDR(af, &iph.daddr), hooknum); 2004 return NF_ACCEPT; 2005 } 2006 /* ipvs enabled in this netns ? */ 2007 if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable)) 2008 return NF_ACCEPT; 2009 2010 ip_vs_fill_iph_skb(af, skb, false, &iph); 2011 2012 /* Bad... Do not break raw sockets */ 2013 sk = skb_to_full_sk(skb); 2014 if (unlikely(sk && hooknum == NF_INET_LOCAL_OUT && 2015 af == AF_INET)) { 2016 2017 if (sk->sk_family == PF_INET && inet_sk(sk)->nodefrag) 2018 return NF_ACCEPT; 2019 } 2020 2021 #ifdef CONFIG_IP_VS_IPV6 2022 if (af == AF_INET6) { 2023 if (unlikely(iph.protocol == IPPROTO_ICMPV6)) { 2024 int related; 2025 int verdict = ip_vs_in_icmp_v6(ipvs, skb, &related, 2026 hooknum, &iph); 2027 2028 if (related) 2029 return verdict; 2030 } 2031 } else 2032 #endif 2033 if (unlikely(iph.protocol == IPPROTO_ICMP)) { 2034 int related; 2035 int verdict = ip_vs_in_icmp(ipvs, skb, &related, 2036 hooknum); 2037 2038 if (related) 2039 return verdict; 2040 } 2041 2042 /* Protocol supported? */ 2043 pd = ip_vs_proto_data_get(ipvs, iph.protocol); 2044 if (unlikely(!pd)) { 2045 /* The only way we'll see this packet again is if it's 2046 * encapsulated, so mark it with ipvs_property=1 so we 2047 * skip it if we're ignoring tunneled packets 2048 */ 2049 if (sysctl_ignore_tunneled(ipvs)) 2050 skb->ipvs_property = 1; 2051 2052 return NF_ACCEPT; 2053 } 2054 pp = pd->pp; 2055 /* 2056 * Check if the packet belongs to an existing connection entry 2057 */ 2058 cp = INDIRECT_CALL_1(pp->conn_in_get, ip_vs_conn_in_get_proto, 2059 ipvs, af, skb, &iph); 2060 2061 conn_reuse_mode = sysctl_conn_reuse_mode(ipvs); 2062 if (conn_reuse_mode && !iph.fragoffs && is_new_conn(skb, &iph) && cp) { 2063 bool old_ct = false, resched = false; 2064 2065 if (unlikely(sysctl_expire_nodest_conn(ipvs)) && cp->dest && 2066 unlikely(!atomic_read(&cp->dest->weight))) { 2067 resched = true; 2068 old_ct = ip_vs_conn_uses_old_conntrack(cp, skb); 2069 } else if (is_new_conn_expected(cp, conn_reuse_mode)) { 2070 old_ct = ip_vs_conn_uses_old_conntrack(cp, skb); 2071 if (!atomic_read(&cp->n_control)) { 2072 resched = true; 2073 } else { 2074 /* Do not reschedule controlling connection 2075 * that uses conntrack while it is still 2076 * referenced by controlled connection(s). 2077 */ 2078 resched = !old_ct; 2079 } 2080 } 2081 2082 if (resched) { 2083 if (!old_ct) 2084 cp->flags &= ~IP_VS_CONN_F_NFCT; 2085 if (!atomic_read(&cp->n_control)) 2086 ip_vs_conn_expire_now(cp); 2087 __ip_vs_conn_put(cp); 2088 if (old_ct) 2089 return NF_DROP; 2090 cp = NULL; 2091 } 2092 } 2093 2094 /* Check the server status */ 2095 if (cp && cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) { 2096 /* the destination server is not available */ 2097 if (sysctl_expire_nodest_conn(ipvs)) { 2098 bool old_ct = ip_vs_conn_uses_old_conntrack(cp, skb); 2099 2100 if (!old_ct) 2101 cp->flags &= ~IP_VS_CONN_F_NFCT; 2102 2103 ip_vs_conn_expire_now(cp); 2104 __ip_vs_conn_put(cp); 2105 if (old_ct) 2106 return NF_DROP; 2107 cp = NULL; 2108 } else { 2109 __ip_vs_conn_put(cp); 2110 return NF_DROP; 2111 } 2112 } 2113 2114 if (unlikely(!cp)) { 2115 int v; 2116 2117 if (!ip_vs_try_to_schedule(ipvs, af, skb, pd, &v, &cp, &iph)) 2118 return v; 2119 } 2120 2121 IP_VS_DBG_PKT(11, af, pp, skb, iph.off, "Incoming packet"); 2122 2123 ip_vs_in_stats(cp, skb); 2124 ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pd); 2125 if (cp->packet_xmit) 2126 ret = cp->packet_xmit(skb, cp, pp, &iph); 2127 /* do not touch skb anymore */ 2128 else { 2129 IP_VS_DBG_RL("warning: packet_xmit is null"); 2130 ret = NF_ACCEPT; 2131 } 2132 2133 /* Increase its packet counter and check if it is needed 2134 * to be synchronized 2135 * 2136 * Sync connection if it is about to close to 2137 * encorage the standby servers to update the connections timeout 2138 * 2139 * For ONE_PKT let ip_vs_sync_conn() do the filter work. 2140 */ 2141 2142 if (cp->flags & IP_VS_CONN_F_ONE_PACKET) 2143 pkts = sysctl_sync_threshold(ipvs); 2144 else 2145 pkts = atomic_add_return(1, &cp->in_pkts); 2146 2147 if (ipvs->sync_state & IP_VS_STATE_MASTER) 2148 ip_vs_sync_conn(ipvs, cp, pkts); 2149 else if ((cp->flags & IP_VS_CONN_F_ONE_PACKET) && cp->control) 2150 /* increment is done inside ip_vs_sync_conn too */ 2151 atomic_inc(&cp->control->in_pkts); 2152 2153 ip_vs_conn_put(cp); 2154 return ret; 2155 } 2156 2157 /* 2158 * AF_INET handler in NF_INET_LOCAL_IN chain 2159 * Schedule and forward packets from remote clients 2160 */ 2161 static unsigned int 2162 ip_vs_remote_request4(void *priv, struct sk_buff *skb, 2163 const struct nf_hook_state *state) 2164 { 2165 return ip_vs_in(net_ipvs(state->net), state->hook, skb, AF_INET); 2166 } 2167 2168 /* 2169 * AF_INET handler in NF_INET_LOCAL_OUT chain 2170 * Schedule and forward packets from local clients 2171 */ 2172 static unsigned int 2173 ip_vs_local_request4(void *priv, struct sk_buff *skb, 2174 const struct nf_hook_state *state) 2175 { 2176 return ip_vs_in(net_ipvs(state->net), state->hook, skb, AF_INET); 2177 } 2178 2179 #ifdef CONFIG_IP_VS_IPV6 2180 2181 /* 2182 * AF_INET6 handler in NF_INET_LOCAL_IN chain 2183 * Schedule and forward packets from remote clients 2184 */ 2185 static unsigned int 2186 ip_vs_remote_request6(void *priv, struct sk_buff *skb, 2187 const struct nf_hook_state *state) 2188 { 2189 return ip_vs_in(net_ipvs(state->net), state->hook, skb, AF_INET6); 2190 } 2191 2192 /* 2193 * AF_INET6 handler in NF_INET_LOCAL_OUT chain 2194 * Schedule and forward packets from local clients 2195 */ 2196 static unsigned int 2197 ip_vs_local_request6(void *priv, struct sk_buff *skb, 2198 const struct nf_hook_state *state) 2199 { 2200 return ip_vs_in(net_ipvs(state->net), state->hook, skb, AF_INET6); 2201 } 2202 2203 #endif 2204 2205 2206 /* 2207 * It is hooked at the NF_INET_FORWARD chain, in order to catch ICMP 2208 * related packets destined for 0.0.0.0/0. 2209 * When fwmark-based virtual service is used, such as transparent 2210 * cache cluster, TCP packets can be marked and routed to ip_vs_in, 2211 * but ICMP destined for 0.0.0.0/0 cannot not be easily marked and 2212 * sent to ip_vs_in_icmp. So, catch them at the NF_INET_FORWARD chain 2213 * and send them to ip_vs_in_icmp. 2214 */ 2215 static unsigned int 2216 ip_vs_forward_icmp(void *priv, struct sk_buff *skb, 2217 const struct nf_hook_state *state) 2218 { 2219 int r; 2220 struct netns_ipvs *ipvs = net_ipvs(state->net); 2221 2222 if (ip_hdr(skb)->protocol != IPPROTO_ICMP) 2223 return NF_ACCEPT; 2224 2225 /* ipvs enabled in this netns ? */ 2226 if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable)) 2227 return NF_ACCEPT; 2228 2229 return ip_vs_in_icmp(ipvs, skb, &r, state->hook); 2230 } 2231 2232 #ifdef CONFIG_IP_VS_IPV6 2233 static unsigned int 2234 ip_vs_forward_icmp_v6(void *priv, struct sk_buff *skb, 2235 const struct nf_hook_state *state) 2236 { 2237 int r; 2238 struct netns_ipvs *ipvs = net_ipvs(state->net); 2239 struct ip_vs_iphdr iphdr; 2240 2241 ip_vs_fill_iph_skb(AF_INET6, skb, false, &iphdr); 2242 if (iphdr.protocol != IPPROTO_ICMPV6) 2243 return NF_ACCEPT; 2244 2245 /* ipvs enabled in this netns ? */ 2246 if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable)) 2247 return NF_ACCEPT; 2248 2249 return ip_vs_in_icmp_v6(ipvs, skb, &r, state->hook, &iphdr); 2250 } 2251 #endif 2252 2253 2254 static const struct nf_hook_ops ip_vs_ops4[] = { 2255 /* After packet filtering, change source only for VS/NAT */ 2256 { 2257 .hook = ip_vs_reply4, 2258 .pf = NFPROTO_IPV4, 2259 .hooknum = NF_INET_LOCAL_IN, 2260 .priority = NF_IP_PRI_NAT_SRC - 2, 2261 }, 2262 /* After packet filtering, forward packet through VS/DR, VS/TUN, 2263 * or VS/NAT(change destination), so that filtering rules can be 2264 * applied to IPVS. */ 2265 { 2266 .hook = ip_vs_remote_request4, 2267 .pf = NFPROTO_IPV4, 2268 .hooknum = NF_INET_LOCAL_IN, 2269 .priority = NF_IP_PRI_NAT_SRC - 1, 2270 }, 2271 /* Before ip_vs_in, change source only for VS/NAT */ 2272 { 2273 .hook = ip_vs_local_reply4, 2274 .pf = NFPROTO_IPV4, 2275 .hooknum = NF_INET_LOCAL_OUT, 2276 .priority = NF_IP_PRI_NAT_DST + 1, 2277 }, 2278 /* After mangle, schedule and forward local requests */ 2279 { 2280 .hook = ip_vs_local_request4, 2281 .pf = NFPROTO_IPV4, 2282 .hooknum = NF_INET_LOCAL_OUT, 2283 .priority = NF_IP_PRI_NAT_DST + 2, 2284 }, 2285 /* After packet filtering (but before ip_vs_out_icmp), catch icmp 2286 * destined for 0.0.0.0/0, which is for incoming IPVS connections */ 2287 { 2288 .hook = ip_vs_forward_icmp, 2289 .pf = NFPROTO_IPV4, 2290 .hooknum = NF_INET_FORWARD, 2291 .priority = 99, 2292 }, 2293 /* After packet filtering, change source only for VS/NAT */ 2294 { 2295 .hook = ip_vs_reply4, 2296 .pf = NFPROTO_IPV4, 2297 .hooknum = NF_INET_FORWARD, 2298 .priority = 100, 2299 }, 2300 }; 2301 2302 #ifdef CONFIG_IP_VS_IPV6 2303 static const struct nf_hook_ops ip_vs_ops6[] = { 2304 /* After packet filtering, change source only for VS/NAT */ 2305 { 2306 .hook = ip_vs_reply6, 2307 .pf = NFPROTO_IPV6, 2308 .hooknum = NF_INET_LOCAL_IN, 2309 .priority = NF_IP6_PRI_NAT_SRC - 2, 2310 }, 2311 /* After packet filtering, forward packet through VS/DR, VS/TUN, 2312 * or VS/NAT(change destination), so that filtering rules can be 2313 * applied to IPVS. */ 2314 { 2315 .hook = ip_vs_remote_request6, 2316 .pf = NFPROTO_IPV6, 2317 .hooknum = NF_INET_LOCAL_IN, 2318 .priority = NF_IP6_PRI_NAT_SRC - 1, 2319 }, 2320 /* Before ip_vs_in, change source only for VS/NAT */ 2321 { 2322 .hook = ip_vs_local_reply6, 2323 .pf = NFPROTO_IPV6, 2324 .hooknum = NF_INET_LOCAL_OUT, 2325 .priority = NF_IP6_PRI_NAT_DST + 1, 2326 }, 2327 /* After mangle, schedule and forward local requests */ 2328 { 2329 .hook = ip_vs_local_request6, 2330 .pf = NFPROTO_IPV6, 2331 .hooknum = NF_INET_LOCAL_OUT, 2332 .priority = NF_IP6_PRI_NAT_DST + 2, 2333 }, 2334 /* After packet filtering (but before ip_vs_out_icmp), catch icmp 2335 * destined for 0.0.0.0/0, which is for incoming IPVS connections */ 2336 { 2337 .hook = ip_vs_forward_icmp_v6, 2338 .pf = NFPROTO_IPV6, 2339 .hooknum = NF_INET_FORWARD, 2340 .priority = 99, 2341 }, 2342 /* After packet filtering, change source only for VS/NAT */ 2343 { 2344 .hook = ip_vs_reply6, 2345 .pf = NFPROTO_IPV6, 2346 .hooknum = NF_INET_FORWARD, 2347 .priority = 100, 2348 }, 2349 }; 2350 #endif 2351 2352 int ip_vs_register_hooks(struct netns_ipvs *ipvs, unsigned int af) 2353 { 2354 const struct nf_hook_ops *ops; 2355 unsigned int count; 2356 unsigned int afmask; 2357 int ret = 0; 2358 2359 if (af == AF_INET6) { 2360 #ifdef CONFIG_IP_VS_IPV6 2361 ops = ip_vs_ops6; 2362 count = ARRAY_SIZE(ip_vs_ops6); 2363 afmask = 2; 2364 #else 2365 return -EINVAL; 2366 #endif 2367 } else { 2368 ops = ip_vs_ops4; 2369 count = ARRAY_SIZE(ip_vs_ops4); 2370 afmask = 1; 2371 } 2372 2373 if (!(ipvs->hooks_afmask & afmask)) { 2374 ret = nf_register_net_hooks(ipvs->net, ops, count); 2375 if (ret >= 0) 2376 ipvs->hooks_afmask |= afmask; 2377 } 2378 return ret; 2379 } 2380 2381 void ip_vs_unregister_hooks(struct netns_ipvs *ipvs, unsigned int af) 2382 { 2383 const struct nf_hook_ops *ops; 2384 unsigned int count; 2385 unsigned int afmask; 2386 2387 if (af == AF_INET6) { 2388 #ifdef CONFIG_IP_VS_IPV6 2389 ops = ip_vs_ops6; 2390 count = ARRAY_SIZE(ip_vs_ops6); 2391 afmask = 2; 2392 #else 2393 return; 2394 #endif 2395 } else { 2396 ops = ip_vs_ops4; 2397 count = ARRAY_SIZE(ip_vs_ops4); 2398 afmask = 1; 2399 } 2400 2401 if (ipvs->hooks_afmask & afmask) { 2402 nf_unregister_net_hooks(ipvs->net, ops, count); 2403 ipvs->hooks_afmask &= ~afmask; 2404 } 2405 } 2406 2407 /* 2408 * Initialize IP Virtual Server netns mem. 2409 */ 2410 static int __net_init __ip_vs_init(struct net *net) 2411 { 2412 struct netns_ipvs *ipvs; 2413 2414 ipvs = net_generic(net, ip_vs_net_id); 2415 if (ipvs == NULL) 2416 return -ENOMEM; 2417 2418 /* Hold the beast until a service is registerd */ 2419 ipvs->enable = 0; 2420 ipvs->net = net; 2421 /* Counters used for creating unique names */ 2422 ipvs->gen = atomic_read(&ipvs_netns_cnt); 2423 atomic_inc(&ipvs_netns_cnt); 2424 net->ipvs = ipvs; 2425 2426 if (ip_vs_estimator_net_init(ipvs) < 0) 2427 goto estimator_fail; 2428 2429 if (ip_vs_control_net_init(ipvs) < 0) 2430 goto control_fail; 2431 2432 if (ip_vs_protocol_net_init(ipvs) < 0) 2433 goto protocol_fail; 2434 2435 if (ip_vs_app_net_init(ipvs) < 0) 2436 goto app_fail; 2437 2438 if (ip_vs_conn_net_init(ipvs) < 0) 2439 goto conn_fail; 2440 2441 if (ip_vs_sync_net_init(ipvs) < 0) 2442 goto sync_fail; 2443 2444 return 0; 2445 /* 2446 * Error handling 2447 */ 2448 2449 sync_fail: 2450 ip_vs_conn_net_cleanup(ipvs); 2451 conn_fail: 2452 ip_vs_app_net_cleanup(ipvs); 2453 app_fail: 2454 ip_vs_protocol_net_cleanup(ipvs); 2455 protocol_fail: 2456 ip_vs_control_net_cleanup(ipvs); 2457 control_fail: 2458 ip_vs_estimator_net_cleanup(ipvs); 2459 estimator_fail: 2460 net->ipvs = NULL; 2461 return -ENOMEM; 2462 } 2463 2464 static void __net_exit __ip_vs_cleanup_batch(struct list_head *net_list) 2465 { 2466 struct netns_ipvs *ipvs; 2467 struct net *net; 2468 2469 ip_vs_service_nets_cleanup(net_list); /* ip_vs_flush() with locks */ 2470 list_for_each_entry(net, net_list, exit_list) { 2471 ipvs = net_ipvs(net); 2472 ip_vs_conn_net_cleanup(ipvs); 2473 ip_vs_app_net_cleanup(ipvs); 2474 ip_vs_protocol_net_cleanup(ipvs); 2475 ip_vs_control_net_cleanup(ipvs); 2476 ip_vs_estimator_net_cleanup(ipvs); 2477 IP_VS_DBG(2, "ipvs netns %d released\n", ipvs->gen); 2478 net->ipvs = NULL; 2479 } 2480 } 2481 2482 static void __net_exit __ip_vs_dev_cleanup_batch(struct list_head *net_list) 2483 { 2484 struct netns_ipvs *ipvs; 2485 struct net *net; 2486 2487 EnterFunction(2); 2488 list_for_each_entry(net, net_list, exit_list) { 2489 ipvs = net_ipvs(net); 2490 ip_vs_unregister_hooks(ipvs, AF_INET); 2491 ip_vs_unregister_hooks(ipvs, AF_INET6); 2492 ipvs->enable = 0; /* Disable packet reception */ 2493 smp_wmb(); 2494 ip_vs_sync_net_cleanup(ipvs); 2495 } 2496 LeaveFunction(2); 2497 } 2498 2499 static struct pernet_operations ipvs_core_ops = { 2500 .init = __ip_vs_init, 2501 .exit_batch = __ip_vs_cleanup_batch, 2502 .id = &ip_vs_net_id, 2503 .size = sizeof(struct netns_ipvs), 2504 }; 2505 2506 static struct pernet_operations ipvs_core_dev_ops = { 2507 .exit_batch = __ip_vs_dev_cleanup_batch, 2508 }; 2509 2510 /* 2511 * Initialize IP Virtual Server 2512 */ 2513 static int __init ip_vs_init(void) 2514 { 2515 int ret; 2516 2517 ret = ip_vs_control_init(); 2518 if (ret < 0) { 2519 pr_err("can't setup control.\n"); 2520 goto exit; 2521 } 2522 2523 ip_vs_protocol_init(); 2524 2525 ret = ip_vs_conn_init(); 2526 if (ret < 0) { 2527 pr_err("can't setup connection table.\n"); 2528 goto cleanup_protocol; 2529 } 2530 2531 ret = register_pernet_subsys(&ipvs_core_ops); /* Alloc ip_vs struct */ 2532 if (ret < 0) 2533 goto cleanup_conn; 2534 2535 ret = register_pernet_device(&ipvs_core_dev_ops); 2536 if (ret < 0) 2537 goto cleanup_sub; 2538 2539 ret = ip_vs_register_nl_ioctl(); 2540 if (ret < 0) { 2541 pr_err("can't register netlink/ioctl.\n"); 2542 goto cleanup_dev; 2543 } 2544 2545 pr_info("ipvs loaded.\n"); 2546 2547 return ret; 2548 2549 cleanup_dev: 2550 unregister_pernet_device(&ipvs_core_dev_ops); 2551 cleanup_sub: 2552 unregister_pernet_subsys(&ipvs_core_ops); 2553 cleanup_conn: 2554 ip_vs_conn_cleanup(); 2555 cleanup_protocol: 2556 ip_vs_protocol_cleanup(); 2557 ip_vs_control_cleanup(); 2558 exit: 2559 return ret; 2560 } 2561 2562 static void __exit ip_vs_cleanup(void) 2563 { 2564 ip_vs_unregister_nl_ioctl(); 2565 unregister_pernet_device(&ipvs_core_dev_ops); 2566 unregister_pernet_subsys(&ipvs_core_ops); /* free ip_vs struct */ 2567 ip_vs_conn_cleanup(); 2568 ip_vs_protocol_cleanup(); 2569 ip_vs_control_cleanup(); 2570 pr_info("ipvs unloaded.\n"); 2571 } 2572 2573 module_init(ip_vs_init); 2574 module_exit(ip_vs_cleanup); 2575 MODULE_LICENSE("GPL"); 2576