1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * IPVS An implementation of the IP virtual server support for the 4 * LINUX operating system. IPVS is now implemented as a module 5 * over the Netfilter framework. IPVS can be used to build a 6 * high-performance and highly available server based on a 7 * cluster of servers. 8 * 9 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> 10 * Peter Kese <peter.kese@ijs.si> 11 * Julian Anastasov <ja@ssi.bg> 12 * 13 * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese, 14 * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms 15 * and others. 16 * 17 * Changes: 18 * Paul `Rusty' Russell properly handle non-linear skbs 19 * Harald Welte don't use nfcache 20 */ 21 22 #define KMSG_COMPONENT "IPVS" 23 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt 24 25 #include <linux/module.h> 26 #include <linux/kernel.h> 27 #include <linux/ip.h> 28 #include <linux/tcp.h> 29 #include <linux/sctp.h> 30 #include <linux/icmp.h> 31 #include <linux/slab.h> 32 33 #include <net/ip.h> 34 #include <net/tcp.h> 35 #include <net/udp.h> 36 #include <net/icmp.h> /* for icmp_send */ 37 #include <net/gue.h> 38 #include <net/gre.h> 39 #include <net/route.h> 40 #include <net/ip6_checksum.h> 41 #include <net/netns/generic.h> /* net_generic() */ 42 43 #include <linux/netfilter.h> 44 #include <linux/netfilter_ipv4.h> 45 46 #ifdef CONFIG_IP_VS_IPV6 47 #include <net/ipv6.h> 48 #include <linux/netfilter_ipv6.h> 49 #include <net/ip6_route.h> 50 #endif 51 52 #include <net/ip_vs.h> 53 #include <linux/indirect_call_wrapper.h> 54 55 56 EXPORT_SYMBOL(register_ip_vs_scheduler); 57 EXPORT_SYMBOL(unregister_ip_vs_scheduler); 58 EXPORT_SYMBOL(ip_vs_proto_name); 59 EXPORT_SYMBOL(ip_vs_conn_new); 60 EXPORT_SYMBOL(ip_vs_conn_in_get); 61 EXPORT_SYMBOL(ip_vs_conn_out_get); 62 #ifdef CONFIG_IP_VS_PROTO_TCP 63 EXPORT_SYMBOL(ip_vs_tcp_conn_listen); 64 #endif 65 EXPORT_SYMBOL(ip_vs_conn_put); 66 #ifdef CONFIG_IP_VS_DEBUG 67 EXPORT_SYMBOL(ip_vs_get_debug_level); 68 #endif 69 EXPORT_SYMBOL(ip_vs_new_conn_out); 70 71 #ifdef CONFIG_IP_VS_PROTO_TCP 72 INDIRECT_CALLABLE_DECLARE(int 73 tcp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, 74 struct ip_vs_conn *cp, struct ip_vs_iphdr *iph)); 75 #endif 76 77 #ifdef CONFIG_IP_VS_PROTO_UDP 78 INDIRECT_CALLABLE_DECLARE(int 79 udp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, 80 struct ip_vs_conn *cp, struct ip_vs_iphdr *iph)); 81 #endif 82 83 #if defined(CONFIG_IP_VS_PROTO_TCP) && defined(CONFIG_IP_VS_PROTO_UDP) 84 #define SNAT_CALL(f, ...) \ 85 INDIRECT_CALL_2(f, tcp_snat_handler, udp_snat_handler, __VA_ARGS__) 86 #elif defined(CONFIG_IP_VS_PROTO_TCP) 87 #define SNAT_CALL(f, ...) INDIRECT_CALL_1(f, tcp_snat_handler, __VA_ARGS__) 88 #elif defined(CONFIG_IP_VS_PROTO_UDP) 89 #define SNAT_CALL(f, ...) INDIRECT_CALL_1(f, udp_snat_handler, __VA_ARGS__) 90 #else 91 #define SNAT_CALL(f, ...) f(__VA_ARGS__) 92 #endif 93 94 static unsigned int ip_vs_net_id __read_mostly; 95 /* netns cnt used for uniqueness */ 96 static atomic_t ipvs_netns_cnt = ATOMIC_INIT(0); 97 98 /* ID used in ICMP lookups */ 99 #define icmp_id(icmph) (((icmph)->un).echo.id) 100 #define icmpv6_id(icmph) (icmph->icmp6_dataun.u_echo.identifier) 101 102 const char *ip_vs_proto_name(unsigned int proto) 103 { 104 static char buf[20]; 105 106 switch (proto) { 107 case IPPROTO_IP: 108 return "IP"; 109 case IPPROTO_UDP: 110 return "UDP"; 111 case IPPROTO_TCP: 112 return "TCP"; 113 case IPPROTO_SCTP: 114 return "SCTP"; 115 case IPPROTO_ICMP: 116 return "ICMP"; 117 #ifdef CONFIG_IP_VS_IPV6 118 case IPPROTO_ICMPV6: 119 return "ICMPv6"; 120 #endif 121 default: 122 sprintf(buf, "IP_%u", proto); 123 return buf; 124 } 125 } 126 127 void ip_vs_init_hash_table(struct list_head *table, int rows) 128 { 129 while (--rows >= 0) 130 INIT_LIST_HEAD(&table[rows]); 131 } 132 133 static inline void 134 ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb) 135 { 136 struct ip_vs_dest *dest = cp->dest; 137 struct netns_ipvs *ipvs = cp->ipvs; 138 139 if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { 140 struct ip_vs_cpu_stats *s; 141 struct ip_vs_service *svc; 142 143 local_bh_disable(); 144 145 s = this_cpu_ptr(dest->stats.cpustats); 146 u64_stats_update_begin(&s->syncp); 147 s->cnt.inpkts++; 148 s->cnt.inbytes += skb->len; 149 u64_stats_update_end(&s->syncp); 150 151 svc = rcu_dereference(dest->svc); 152 s = this_cpu_ptr(svc->stats.cpustats); 153 u64_stats_update_begin(&s->syncp); 154 s->cnt.inpkts++; 155 s->cnt.inbytes += skb->len; 156 u64_stats_update_end(&s->syncp); 157 158 s = this_cpu_ptr(ipvs->tot_stats.cpustats); 159 u64_stats_update_begin(&s->syncp); 160 s->cnt.inpkts++; 161 s->cnt.inbytes += skb->len; 162 u64_stats_update_end(&s->syncp); 163 164 local_bh_enable(); 165 } 166 } 167 168 169 static inline void 170 ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb) 171 { 172 struct ip_vs_dest *dest = cp->dest; 173 struct netns_ipvs *ipvs = cp->ipvs; 174 175 if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { 176 struct ip_vs_cpu_stats *s; 177 struct ip_vs_service *svc; 178 179 local_bh_disable(); 180 181 s = this_cpu_ptr(dest->stats.cpustats); 182 u64_stats_update_begin(&s->syncp); 183 s->cnt.outpkts++; 184 s->cnt.outbytes += skb->len; 185 u64_stats_update_end(&s->syncp); 186 187 svc = rcu_dereference(dest->svc); 188 s = this_cpu_ptr(svc->stats.cpustats); 189 u64_stats_update_begin(&s->syncp); 190 s->cnt.outpkts++; 191 s->cnt.outbytes += skb->len; 192 u64_stats_update_end(&s->syncp); 193 194 s = this_cpu_ptr(ipvs->tot_stats.cpustats); 195 u64_stats_update_begin(&s->syncp); 196 s->cnt.outpkts++; 197 s->cnt.outbytes += skb->len; 198 u64_stats_update_end(&s->syncp); 199 200 local_bh_enable(); 201 } 202 } 203 204 205 static inline void 206 ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc) 207 { 208 struct netns_ipvs *ipvs = svc->ipvs; 209 struct ip_vs_cpu_stats *s; 210 211 local_bh_disable(); 212 213 s = this_cpu_ptr(cp->dest->stats.cpustats); 214 u64_stats_update_begin(&s->syncp); 215 s->cnt.conns++; 216 u64_stats_update_end(&s->syncp); 217 218 s = this_cpu_ptr(svc->stats.cpustats); 219 u64_stats_update_begin(&s->syncp); 220 s->cnt.conns++; 221 u64_stats_update_end(&s->syncp); 222 223 s = this_cpu_ptr(ipvs->tot_stats.cpustats); 224 u64_stats_update_begin(&s->syncp); 225 s->cnt.conns++; 226 u64_stats_update_end(&s->syncp); 227 228 local_bh_enable(); 229 } 230 231 232 static inline void 233 ip_vs_set_state(struct ip_vs_conn *cp, int direction, 234 const struct sk_buff *skb, 235 struct ip_vs_proto_data *pd) 236 { 237 if (likely(pd->pp->state_transition)) 238 pd->pp->state_transition(cp, direction, skb, pd); 239 } 240 241 static inline int 242 ip_vs_conn_fill_param_persist(const struct ip_vs_service *svc, 243 struct sk_buff *skb, int protocol, 244 const union nf_inet_addr *caddr, __be16 cport, 245 const union nf_inet_addr *vaddr, __be16 vport, 246 struct ip_vs_conn_param *p) 247 { 248 ip_vs_conn_fill_param(svc->ipvs, svc->af, protocol, caddr, cport, vaddr, 249 vport, p); 250 p->pe = rcu_dereference(svc->pe); 251 if (p->pe && p->pe->fill_param) 252 return p->pe->fill_param(p, skb); 253 254 return 0; 255 } 256 257 /* 258 * IPVS persistent scheduling function 259 * It creates a connection entry according to its template if exists, 260 * or selects a server and creates a connection entry plus a template. 261 * Locking: we are svc user (svc->refcnt), so we hold all dests too 262 * Protocols supported: TCP, UDP 263 */ 264 static struct ip_vs_conn * 265 ip_vs_sched_persist(struct ip_vs_service *svc, 266 struct sk_buff *skb, __be16 src_port, __be16 dst_port, 267 int *ignored, struct ip_vs_iphdr *iph) 268 { 269 struct ip_vs_conn *cp = NULL; 270 struct ip_vs_dest *dest; 271 struct ip_vs_conn *ct; 272 __be16 dport = 0; /* destination port to forward */ 273 unsigned int flags; 274 struct ip_vs_conn_param param; 275 const union nf_inet_addr fwmark = { .ip = htonl(svc->fwmark) }; 276 union nf_inet_addr snet; /* source network of the client, 277 after masking */ 278 const union nf_inet_addr *src_addr, *dst_addr; 279 280 if (likely(!ip_vs_iph_inverse(iph))) { 281 src_addr = &iph->saddr; 282 dst_addr = &iph->daddr; 283 } else { 284 src_addr = &iph->daddr; 285 dst_addr = &iph->saddr; 286 } 287 288 289 /* Mask saddr with the netmask to adjust template granularity */ 290 #ifdef CONFIG_IP_VS_IPV6 291 if (svc->af == AF_INET6) 292 ipv6_addr_prefix(&snet.in6, &src_addr->in6, 293 (__force __u32) svc->netmask); 294 else 295 #endif 296 snet.ip = src_addr->ip & svc->netmask; 297 298 IP_VS_DBG_BUF(6, "p-schedule: src %s:%u dest %s:%u " 299 "mnet %s\n", 300 IP_VS_DBG_ADDR(svc->af, src_addr), ntohs(src_port), 301 IP_VS_DBG_ADDR(svc->af, dst_addr), ntohs(dst_port), 302 IP_VS_DBG_ADDR(svc->af, &snet)); 303 304 /* 305 * As far as we know, FTP is a very complicated network protocol, and 306 * it uses control connection and data connections. For active FTP, 307 * FTP server initialize data connection to the client, its source port 308 * is often 20. For passive FTP, FTP server tells the clients the port 309 * that it passively listens to, and the client issues the data 310 * connection. In the tunneling or direct routing mode, the load 311 * balancer is on the client-to-server half of connection, the port 312 * number is unknown to the load balancer. So, a conn template like 313 * <caddr, 0, vaddr, 0, daddr, 0> is created for persistent FTP 314 * service, and a template like <caddr, 0, vaddr, vport, daddr, dport> 315 * is created for other persistent services. 316 */ 317 { 318 int protocol = iph->protocol; 319 const union nf_inet_addr *vaddr = dst_addr; 320 __be16 vport = 0; 321 322 if (dst_port == svc->port) { 323 /* non-FTP template: 324 * <protocol, caddr, 0, vaddr, vport, daddr, dport> 325 * FTP template: 326 * <protocol, caddr, 0, vaddr, 0, daddr, 0> 327 */ 328 if (svc->port != FTPPORT) 329 vport = dst_port; 330 } else { 331 /* Note: persistent fwmark-based services and 332 * persistent port zero service are handled here. 333 * fwmark template: 334 * <IPPROTO_IP,caddr,0,fwmark,0,daddr,0> 335 * port zero template: 336 * <protocol,caddr,0,vaddr,0,daddr,0> 337 */ 338 if (svc->fwmark) { 339 protocol = IPPROTO_IP; 340 vaddr = &fwmark; 341 } 342 } 343 /* return *ignored = -1 so NF_DROP can be used */ 344 if (ip_vs_conn_fill_param_persist(svc, skb, protocol, &snet, 0, 345 vaddr, vport, ¶m) < 0) { 346 *ignored = -1; 347 return NULL; 348 } 349 } 350 351 /* Check if a template already exists */ 352 ct = ip_vs_ct_in_get(¶m); 353 if (!ct || !ip_vs_check_template(ct, NULL)) { 354 struct ip_vs_scheduler *sched; 355 356 /* 357 * No template found or the dest of the connection 358 * template is not available. 359 * return *ignored=0 i.e. ICMP and NF_DROP 360 */ 361 sched = rcu_dereference(svc->scheduler); 362 if (sched) { 363 /* read svc->sched_data after svc->scheduler */ 364 smp_rmb(); 365 dest = sched->schedule(svc, skb, iph); 366 } else { 367 dest = NULL; 368 } 369 if (!dest) { 370 IP_VS_DBG(1, "p-schedule: no dest found.\n"); 371 kfree(param.pe_data); 372 *ignored = 0; 373 return NULL; 374 } 375 376 if (dst_port == svc->port && svc->port != FTPPORT) 377 dport = dest->port; 378 379 /* Create a template 380 * This adds param.pe_data to the template, 381 * and thus param.pe_data will be destroyed 382 * when the template expires */ 383 ct = ip_vs_conn_new(¶m, dest->af, &dest->addr, dport, 384 IP_VS_CONN_F_TEMPLATE, dest, skb->mark); 385 if (ct == NULL) { 386 kfree(param.pe_data); 387 *ignored = -1; 388 return NULL; 389 } 390 391 ct->timeout = svc->timeout; 392 } else { 393 /* set destination with the found template */ 394 dest = ct->dest; 395 kfree(param.pe_data); 396 } 397 398 dport = dst_port; 399 if (dport == svc->port && dest->port) 400 dport = dest->port; 401 402 flags = (svc->flags & IP_VS_SVC_F_ONEPACKET 403 && iph->protocol == IPPROTO_UDP) ? 404 IP_VS_CONN_F_ONE_PACKET : 0; 405 406 /* 407 * Create a new connection according to the template 408 */ 409 ip_vs_conn_fill_param(svc->ipvs, svc->af, iph->protocol, src_addr, 410 src_port, dst_addr, dst_port, ¶m); 411 412 cp = ip_vs_conn_new(¶m, dest->af, &dest->addr, dport, flags, dest, 413 skb->mark); 414 if (cp == NULL) { 415 ip_vs_conn_put(ct); 416 *ignored = -1; 417 return NULL; 418 } 419 420 /* 421 * Add its control 422 */ 423 ip_vs_control_add(cp, ct); 424 ip_vs_conn_put(ct); 425 426 ip_vs_conn_stats(cp, svc); 427 return cp; 428 } 429 430 431 /* 432 * IPVS main scheduling function 433 * It selects a server according to the virtual service, and 434 * creates a connection entry. 435 * Protocols supported: TCP, UDP 436 * 437 * Usage of *ignored 438 * 439 * 1 : protocol tried to schedule (eg. on SYN), found svc but the 440 * svc/scheduler decides that this packet should be accepted with 441 * NF_ACCEPT because it must not be scheduled. 442 * 443 * 0 : scheduler can not find destination, so try bypass or 444 * return ICMP and then NF_DROP (ip_vs_leave). 445 * 446 * -1 : scheduler tried to schedule but fatal error occurred, eg. 447 * ip_vs_conn_new failure (ENOMEM) or ip_vs_sip_fill_param 448 * failure such as missing Call-ID, ENOMEM on skb_linearize 449 * or pe_data. In this case we should return NF_DROP without 450 * any attempts to send ICMP with ip_vs_leave. 451 */ 452 struct ip_vs_conn * 453 ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, 454 struct ip_vs_proto_data *pd, int *ignored, 455 struct ip_vs_iphdr *iph) 456 { 457 struct ip_vs_protocol *pp = pd->pp; 458 struct ip_vs_conn *cp = NULL; 459 struct ip_vs_scheduler *sched; 460 struct ip_vs_dest *dest; 461 __be16 _ports[2], *pptr, cport, vport; 462 const void *caddr, *vaddr; 463 unsigned int flags; 464 465 *ignored = 1; 466 /* 467 * IPv6 frags, only the first hit here. 468 */ 469 pptr = frag_safe_skb_hp(skb, iph->len, sizeof(_ports), _ports); 470 if (pptr == NULL) 471 return NULL; 472 473 if (likely(!ip_vs_iph_inverse(iph))) { 474 cport = pptr[0]; 475 caddr = &iph->saddr; 476 vport = pptr[1]; 477 vaddr = &iph->daddr; 478 } else { 479 cport = pptr[1]; 480 caddr = &iph->daddr; 481 vport = pptr[0]; 482 vaddr = &iph->saddr; 483 } 484 485 /* 486 * FTPDATA needs this check when using local real server. 487 * Never schedule Active FTPDATA connections from real server. 488 * For LVS-NAT they must be already created. For other methods 489 * with persistence the connection is created on SYN+ACK. 490 */ 491 if (cport == FTPDATA) { 492 IP_VS_DBG_PKT(12, svc->af, pp, skb, iph->off, 493 "Not scheduling FTPDATA"); 494 return NULL; 495 } 496 497 /* 498 * Do not schedule replies from local real server. 499 */ 500 if ((!skb->dev || skb->dev->flags & IFF_LOOPBACK)) { 501 iph->hdr_flags ^= IP_VS_HDR_INVERSE; 502 cp = INDIRECT_CALL_1(pp->conn_in_get, 503 ip_vs_conn_in_get_proto, svc->ipvs, 504 svc->af, skb, iph); 505 iph->hdr_flags ^= IP_VS_HDR_INVERSE; 506 507 if (cp) { 508 IP_VS_DBG_PKT(12, svc->af, pp, skb, iph->off, 509 "Not scheduling reply for existing" 510 " connection"); 511 __ip_vs_conn_put(cp); 512 return NULL; 513 } 514 } 515 516 /* 517 * Persistent service 518 */ 519 if (svc->flags & IP_VS_SVC_F_PERSISTENT) 520 return ip_vs_sched_persist(svc, skb, cport, vport, ignored, 521 iph); 522 523 *ignored = 0; 524 525 /* 526 * Non-persistent service 527 */ 528 if (!svc->fwmark && vport != svc->port) { 529 if (!svc->port) 530 pr_err("Schedule: port zero only supported " 531 "in persistent services, " 532 "check your ipvs configuration\n"); 533 return NULL; 534 } 535 536 sched = rcu_dereference(svc->scheduler); 537 if (sched) { 538 /* read svc->sched_data after svc->scheduler */ 539 smp_rmb(); 540 dest = sched->schedule(svc, skb, iph); 541 } else { 542 dest = NULL; 543 } 544 if (dest == NULL) { 545 IP_VS_DBG(1, "Schedule: no dest found.\n"); 546 return NULL; 547 } 548 549 flags = (svc->flags & IP_VS_SVC_F_ONEPACKET 550 && iph->protocol == IPPROTO_UDP) ? 551 IP_VS_CONN_F_ONE_PACKET : 0; 552 553 /* 554 * Create a connection entry. 555 */ 556 { 557 struct ip_vs_conn_param p; 558 559 ip_vs_conn_fill_param(svc->ipvs, svc->af, iph->protocol, 560 caddr, cport, vaddr, vport, &p); 561 cp = ip_vs_conn_new(&p, dest->af, &dest->addr, 562 dest->port ? dest->port : vport, 563 flags, dest, skb->mark); 564 if (!cp) { 565 *ignored = -1; 566 return NULL; 567 } 568 } 569 570 IP_VS_DBG_BUF(6, "Schedule fwd:%c c:%s:%u v:%s:%u " 571 "d:%s:%u conn->flags:%X conn->refcnt:%d\n", 572 ip_vs_fwd_tag(cp), 573 IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), 574 IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport), 575 IP_VS_DBG_ADDR(cp->daf, &cp->daddr), ntohs(cp->dport), 576 cp->flags, refcount_read(&cp->refcnt)); 577 578 ip_vs_conn_stats(cp, svc); 579 return cp; 580 } 581 582 static inline int ip_vs_addr_is_unicast(struct net *net, int af, 583 union nf_inet_addr *addr) 584 { 585 #ifdef CONFIG_IP_VS_IPV6 586 if (af == AF_INET6) 587 return ipv6_addr_type(&addr->in6) & IPV6_ADDR_UNICAST; 588 #endif 589 return (inet_addr_type(net, addr->ip) == RTN_UNICAST); 590 } 591 592 /* 593 * Pass or drop the packet. 594 * Called by ip_vs_in, when the virtual service is available but 595 * no destination is available for a new connection. 596 */ 597 int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, 598 struct ip_vs_proto_data *pd, struct ip_vs_iphdr *iph) 599 { 600 __be16 _ports[2], *pptr, dport; 601 struct netns_ipvs *ipvs = svc->ipvs; 602 struct net *net = ipvs->net; 603 604 pptr = frag_safe_skb_hp(skb, iph->len, sizeof(_ports), _ports); 605 if (!pptr) 606 return NF_DROP; 607 dport = likely(!ip_vs_iph_inverse(iph)) ? pptr[1] : pptr[0]; 608 609 /* if it is fwmark-based service, the cache_bypass sysctl is up 610 and the destination is a non-local unicast, then create 611 a cache_bypass connection entry */ 612 if (sysctl_cache_bypass(ipvs) && svc->fwmark && 613 !(iph->hdr_flags & (IP_VS_HDR_INVERSE | IP_VS_HDR_ICMP)) && 614 ip_vs_addr_is_unicast(net, svc->af, &iph->daddr)) { 615 int ret; 616 struct ip_vs_conn *cp; 617 unsigned int flags = (svc->flags & IP_VS_SVC_F_ONEPACKET && 618 iph->protocol == IPPROTO_UDP) ? 619 IP_VS_CONN_F_ONE_PACKET : 0; 620 union nf_inet_addr daddr = { .all = { 0, 0, 0, 0 } }; 621 622 /* create a new connection entry */ 623 IP_VS_DBG(6, "%s(): create a cache_bypass entry\n", __func__); 624 { 625 struct ip_vs_conn_param p; 626 ip_vs_conn_fill_param(svc->ipvs, svc->af, iph->protocol, 627 &iph->saddr, pptr[0], 628 &iph->daddr, pptr[1], &p); 629 cp = ip_vs_conn_new(&p, svc->af, &daddr, 0, 630 IP_VS_CONN_F_BYPASS | flags, 631 NULL, skb->mark); 632 if (!cp) 633 return NF_DROP; 634 } 635 636 /* statistics */ 637 ip_vs_in_stats(cp, skb); 638 639 /* set state */ 640 ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pd); 641 642 /* transmit the first SYN packet */ 643 ret = cp->packet_xmit(skb, cp, pd->pp, iph); 644 /* do not touch skb anymore */ 645 646 if ((cp->flags & IP_VS_CONN_F_ONE_PACKET) && cp->control) 647 atomic_inc(&cp->control->in_pkts); 648 else 649 atomic_inc(&cp->in_pkts); 650 ip_vs_conn_put(cp); 651 return ret; 652 } 653 654 /* 655 * When the virtual ftp service is presented, packets destined 656 * for other services on the VIP may get here (except services 657 * listed in the ipvs table), pass the packets, because it is 658 * not ipvs job to decide to drop the packets. 659 */ 660 if (svc->port == FTPPORT && dport != FTPPORT) 661 return NF_ACCEPT; 662 663 if (unlikely(ip_vs_iph_icmp(iph))) 664 return NF_DROP; 665 666 /* 667 * Notify the client that the destination is unreachable, and 668 * release the socket buffer. 669 * Since it is in IP layer, the TCP socket is not actually 670 * created, the TCP RST packet cannot be sent, instead that 671 * ICMP_PORT_UNREACH is sent here no matter it is TCP/UDP. --WZ 672 */ 673 #ifdef CONFIG_IP_VS_IPV6 674 if (svc->af == AF_INET6) { 675 if (!skb->dev) 676 skb->dev = net->loopback_dev; 677 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0); 678 } else 679 #endif 680 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); 681 682 return NF_DROP; 683 } 684 685 #ifdef CONFIG_SYSCTL 686 687 static int sysctl_snat_reroute(struct netns_ipvs *ipvs) 688 { 689 return ipvs->sysctl_snat_reroute; 690 } 691 692 static int sysctl_nat_icmp_send(struct netns_ipvs *ipvs) 693 { 694 return ipvs->sysctl_nat_icmp_send; 695 } 696 697 #else 698 699 static int sysctl_snat_reroute(struct netns_ipvs *ipvs) { return 0; } 700 static int sysctl_nat_icmp_send(struct netns_ipvs *ipvs) { return 0; } 701 702 #endif 703 704 __sum16 ip_vs_checksum_complete(struct sk_buff *skb, int offset) 705 { 706 return csum_fold(skb_checksum(skb, offset, skb->len - offset, 0)); 707 } 708 709 static inline enum ip_defrag_users ip_vs_defrag_user(unsigned int hooknum) 710 { 711 if (NF_INET_LOCAL_IN == hooknum) 712 return IP_DEFRAG_VS_IN; 713 if (NF_INET_FORWARD == hooknum) 714 return IP_DEFRAG_VS_FWD; 715 return IP_DEFRAG_VS_OUT; 716 } 717 718 static inline int ip_vs_gather_frags(struct netns_ipvs *ipvs, 719 struct sk_buff *skb, u_int32_t user) 720 { 721 int err; 722 723 local_bh_disable(); 724 err = ip_defrag(ipvs->net, skb, user); 725 local_bh_enable(); 726 if (!err) 727 ip_send_check(ip_hdr(skb)); 728 729 return err; 730 } 731 732 static int ip_vs_route_me_harder(struct netns_ipvs *ipvs, int af, 733 struct sk_buff *skb, unsigned int hooknum) 734 { 735 if (!sysctl_snat_reroute(ipvs)) 736 return 0; 737 /* Reroute replies only to remote clients (FORWARD and LOCAL_OUT) */ 738 if (NF_INET_LOCAL_IN == hooknum) 739 return 0; 740 #ifdef CONFIG_IP_VS_IPV6 741 if (af == AF_INET6) { 742 struct dst_entry *dst = skb_dst(skb); 743 744 if (dst->dev && !(dst->dev->flags & IFF_LOOPBACK) && 745 ip6_route_me_harder(ipvs->net, skb) != 0) 746 return 1; 747 } else 748 #endif 749 if (!(skb_rtable(skb)->rt_flags & RTCF_LOCAL) && 750 ip_route_me_harder(ipvs->net, skb, RTN_LOCAL) != 0) 751 return 1; 752 753 return 0; 754 } 755 756 /* 757 * Packet has been made sufficiently writable in caller 758 * - inout: 1=in->out, 0=out->in 759 */ 760 void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp, 761 struct ip_vs_conn *cp, int inout) 762 { 763 struct iphdr *iph = ip_hdr(skb); 764 unsigned int icmp_offset = iph->ihl*4; 765 struct icmphdr *icmph = (struct icmphdr *)(skb_network_header(skb) + 766 icmp_offset); 767 struct iphdr *ciph = (struct iphdr *)(icmph + 1); 768 769 if (inout) { 770 iph->saddr = cp->vaddr.ip; 771 ip_send_check(iph); 772 ciph->daddr = cp->vaddr.ip; 773 ip_send_check(ciph); 774 } else { 775 iph->daddr = cp->daddr.ip; 776 ip_send_check(iph); 777 ciph->saddr = cp->daddr.ip; 778 ip_send_check(ciph); 779 } 780 781 /* the TCP/UDP/SCTP port */ 782 if (IPPROTO_TCP == ciph->protocol || IPPROTO_UDP == ciph->protocol || 783 IPPROTO_SCTP == ciph->protocol) { 784 __be16 *ports = (void *)ciph + ciph->ihl*4; 785 786 if (inout) 787 ports[1] = cp->vport; 788 else 789 ports[0] = cp->dport; 790 } 791 792 /* And finally the ICMP checksum */ 793 icmph->checksum = 0; 794 icmph->checksum = ip_vs_checksum_complete(skb, icmp_offset); 795 skb->ip_summed = CHECKSUM_UNNECESSARY; 796 797 if (inout) 798 IP_VS_DBG_PKT(11, AF_INET, pp, skb, (void *)ciph - (void *)iph, 799 "Forwarding altered outgoing ICMP"); 800 else 801 IP_VS_DBG_PKT(11, AF_INET, pp, skb, (void *)ciph - (void *)iph, 802 "Forwarding altered incoming ICMP"); 803 } 804 805 #ifdef CONFIG_IP_VS_IPV6 806 void ip_vs_nat_icmp_v6(struct sk_buff *skb, struct ip_vs_protocol *pp, 807 struct ip_vs_conn *cp, int inout) 808 { 809 struct ipv6hdr *iph = ipv6_hdr(skb); 810 unsigned int icmp_offset = 0; 811 unsigned int offs = 0; /* header offset*/ 812 int protocol; 813 struct icmp6hdr *icmph; 814 struct ipv6hdr *ciph; 815 unsigned short fragoffs; 816 817 ipv6_find_hdr(skb, &icmp_offset, IPPROTO_ICMPV6, &fragoffs, NULL); 818 icmph = (struct icmp6hdr *)(skb_network_header(skb) + icmp_offset); 819 offs = icmp_offset + sizeof(struct icmp6hdr); 820 ciph = (struct ipv6hdr *)(skb_network_header(skb) + offs); 821 822 protocol = ipv6_find_hdr(skb, &offs, -1, &fragoffs, NULL); 823 824 if (inout) { 825 iph->saddr = cp->vaddr.in6; 826 ciph->daddr = cp->vaddr.in6; 827 } else { 828 iph->daddr = cp->daddr.in6; 829 ciph->saddr = cp->daddr.in6; 830 } 831 832 /* the TCP/UDP/SCTP port */ 833 if (!fragoffs && (IPPROTO_TCP == protocol || IPPROTO_UDP == protocol || 834 IPPROTO_SCTP == protocol)) { 835 __be16 *ports = (void *)(skb_network_header(skb) + offs); 836 837 IP_VS_DBG(11, "%s() changed port %d to %d\n", __func__, 838 ntohs(inout ? ports[1] : ports[0]), 839 ntohs(inout ? cp->vport : cp->dport)); 840 if (inout) 841 ports[1] = cp->vport; 842 else 843 ports[0] = cp->dport; 844 } 845 846 /* And finally the ICMP checksum */ 847 icmph->icmp6_cksum = ~csum_ipv6_magic(&iph->saddr, &iph->daddr, 848 skb->len - icmp_offset, 849 IPPROTO_ICMPV6, 0); 850 skb->csum_start = skb_network_header(skb) - skb->head + icmp_offset; 851 skb->csum_offset = offsetof(struct icmp6hdr, icmp6_cksum); 852 skb->ip_summed = CHECKSUM_PARTIAL; 853 854 if (inout) 855 IP_VS_DBG_PKT(11, AF_INET6, pp, skb, 856 (void *)ciph - (void *)iph, 857 "Forwarding altered outgoing ICMPv6"); 858 else 859 IP_VS_DBG_PKT(11, AF_INET6, pp, skb, 860 (void *)ciph - (void *)iph, 861 "Forwarding altered incoming ICMPv6"); 862 } 863 #endif 864 865 /* Handle relevant response ICMP messages - forward to the right 866 * destination host. 867 */ 868 static int handle_response_icmp(int af, struct sk_buff *skb, 869 union nf_inet_addr *snet, 870 __u8 protocol, struct ip_vs_conn *cp, 871 struct ip_vs_protocol *pp, 872 unsigned int offset, unsigned int ihl, 873 unsigned int hooknum) 874 { 875 unsigned int verdict = NF_DROP; 876 877 if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) 878 goto after_nat; 879 880 /* Ensure the checksum is correct */ 881 if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) { 882 /* Failed checksum! */ 883 IP_VS_DBG_BUF(1, "Forward ICMP: failed checksum from %s!\n", 884 IP_VS_DBG_ADDR(af, snet)); 885 goto out; 886 } 887 888 if (IPPROTO_TCP == protocol || IPPROTO_UDP == protocol || 889 IPPROTO_SCTP == protocol) 890 offset += 2 * sizeof(__u16); 891 if (skb_ensure_writable(skb, offset)) 892 goto out; 893 894 #ifdef CONFIG_IP_VS_IPV6 895 if (af == AF_INET6) 896 ip_vs_nat_icmp_v6(skb, pp, cp, 1); 897 else 898 #endif 899 ip_vs_nat_icmp(skb, pp, cp, 1); 900 901 if (ip_vs_route_me_harder(cp->ipvs, af, skb, hooknum)) 902 goto out; 903 904 after_nat: 905 /* do the statistics and put it back */ 906 ip_vs_out_stats(cp, skb); 907 908 skb->ipvs_property = 1; 909 if (!(cp->flags & IP_VS_CONN_F_NFCT)) 910 ip_vs_notrack(skb); 911 else 912 ip_vs_update_conntrack(skb, cp, 0); 913 verdict = NF_ACCEPT; 914 915 out: 916 __ip_vs_conn_put(cp); 917 918 return verdict; 919 } 920 921 /* 922 * Handle ICMP messages in the inside-to-outside direction (outgoing). 923 * Find any that might be relevant, check against existing connections. 924 * Currently handles error types - unreachable, quench, ttl exceeded. 925 */ 926 static int ip_vs_out_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, 927 int *related, unsigned int hooknum) 928 { 929 struct iphdr *iph; 930 struct icmphdr _icmph, *ic; 931 struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */ 932 struct ip_vs_iphdr ciph; 933 struct ip_vs_conn *cp; 934 struct ip_vs_protocol *pp; 935 unsigned int offset, ihl; 936 union nf_inet_addr snet; 937 938 *related = 1; 939 940 /* reassemble IP fragments */ 941 if (ip_is_fragment(ip_hdr(skb))) { 942 if (ip_vs_gather_frags(ipvs, skb, ip_vs_defrag_user(hooknum))) 943 return NF_STOLEN; 944 } 945 946 iph = ip_hdr(skb); 947 offset = ihl = iph->ihl * 4; 948 ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph); 949 if (ic == NULL) 950 return NF_DROP; 951 952 IP_VS_DBG(12, "Outgoing ICMP (%d,%d) %pI4->%pI4\n", 953 ic->type, ntohs(icmp_id(ic)), 954 &iph->saddr, &iph->daddr); 955 956 /* 957 * Work through seeing if this is for us. 958 * These checks are supposed to be in an order that means easy 959 * things are checked first to speed up processing.... however 960 * this means that some packets will manage to get a long way 961 * down this stack and then be rejected, but that's life. 962 */ 963 if ((ic->type != ICMP_DEST_UNREACH) && 964 (ic->type != ICMP_SOURCE_QUENCH) && 965 (ic->type != ICMP_TIME_EXCEEDED)) { 966 *related = 0; 967 return NF_ACCEPT; 968 } 969 970 /* Now find the contained IP header */ 971 offset += sizeof(_icmph); 972 cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph); 973 if (cih == NULL) 974 return NF_ACCEPT; /* The packet looks wrong, ignore */ 975 976 pp = ip_vs_proto_get(cih->protocol); 977 if (!pp) 978 return NF_ACCEPT; 979 980 /* Is the embedded protocol header present? */ 981 if (unlikely(cih->frag_off & htons(IP_OFFSET) && 982 pp->dont_defrag)) 983 return NF_ACCEPT; 984 985 IP_VS_DBG_PKT(11, AF_INET, pp, skb, offset, 986 "Checking outgoing ICMP for"); 987 988 ip_vs_fill_iph_skb_icmp(AF_INET, skb, offset, true, &ciph); 989 990 /* The embedded headers contain source and dest in reverse order */ 991 cp = INDIRECT_CALL_1(pp->conn_out_get, ip_vs_conn_out_get_proto, 992 ipvs, AF_INET, skb, &ciph); 993 if (!cp) 994 return NF_ACCEPT; 995 996 snet.ip = iph->saddr; 997 return handle_response_icmp(AF_INET, skb, &snet, cih->protocol, cp, 998 pp, ciph.len, ihl, hooknum); 999 } 1000 1001 #ifdef CONFIG_IP_VS_IPV6 1002 static int ip_vs_out_icmp_v6(struct netns_ipvs *ipvs, struct sk_buff *skb, 1003 int *related, unsigned int hooknum, 1004 struct ip_vs_iphdr *ipvsh) 1005 { 1006 struct icmp6hdr _icmph, *ic; 1007 struct ip_vs_iphdr ciph = {.flags = 0, .fragoffs = 0};/*Contained IP */ 1008 struct ip_vs_conn *cp; 1009 struct ip_vs_protocol *pp; 1010 union nf_inet_addr snet; 1011 unsigned int offset; 1012 1013 *related = 1; 1014 ic = frag_safe_skb_hp(skb, ipvsh->len, sizeof(_icmph), &_icmph); 1015 if (ic == NULL) 1016 return NF_DROP; 1017 1018 /* 1019 * Work through seeing if this is for us. 1020 * These checks are supposed to be in an order that means easy 1021 * things are checked first to speed up processing.... however 1022 * this means that some packets will manage to get a long way 1023 * down this stack and then be rejected, but that's life. 1024 */ 1025 if (ic->icmp6_type & ICMPV6_INFOMSG_MASK) { 1026 *related = 0; 1027 return NF_ACCEPT; 1028 } 1029 /* Fragment header that is before ICMP header tells us that: 1030 * it's not an error message since they can't be fragmented. 1031 */ 1032 if (ipvsh->flags & IP6_FH_F_FRAG) 1033 return NF_DROP; 1034 1035 IP_VS_DBG(8, "Outgoing ICMPv6 (%d,%d) %pI6c->%pI6c\n", 1036 ic->icmp6_type, ntohs(icmpv6_id(ic)), 1037 &ipvsh->saddr, &ipvsh->daddr); 1038 1039 if (!ip_vs_fill_iph_skb_icmp(AF_INET6, skb, ipvsh->len + sizeof(_icmph), 1040 true, &ciph)) 1041 return NF_ACCEPT; /* The packet looks wrong, ignore */ 1042 1043 pp = ip_vs_proto_get(ciph.protocol); 1044 if (!pp) 1045 return NF_ACCEPT; 1046 1047 /* The embedded headers contain source and dest in reverse order */ 1048 cp = INDIRECT_CALL_1(pp->conn_out_get, ip_vs_conn_out_get_proto, 1049 ipvs, AF_INET6, skb, &ciph); 1050 if (!cp) 1051 return NF_ACCEPT; 1052 1053 snet.in6 = ciph.saddr.in6; 1054 offset = ciph.len; 1055 return handle_response_icmp(AF_INET6, skb, &snet, ciph.protocol, cp, 1056 pp, offset, sizeof(struct ipv6hdr), 1057 hooknum); 1058 } 1059 #endif 1060 1061 /* 1062 * Check if sctp chunc is ABORT chunk 1063 */ 1064 static inline int is_sctp_abort(const struct sk_buff *skb, int nh_len) 1065 { 1066 struct sctp_chunkhdr *sch, schunk; 1067 sch = skb_header_pointer(skb, nh_len + sizeof(struct sctphdr), 1068 sizeof(schunk), &schunk); 1069 if (sch == NULL) 1070 return 0; 1071 if (sch->type == SCTP_CID_ABORT) 1072 return 1; 1073 return 0; 1074 } 1075 1076 static inline int is_tcp_reset(const struct sk_buff *skb, int nh_len) 1077 { 1078 struct tcphdr _tcph, *th; 1079 1080 th = skb_header_pointer(skb, nh_len, sizeof(_tcph), &_tcph); 1081 if (th == NULL) 1082 return 0; 1083 return th->rst; 1084 } 1085 1086 static inline bool is_new_conn(const struct sk_buff *skb, 1087 struct ip_vs_iphdr *iph) 1088 { 1089 switch (iph->protocol) { 1090 case IPPROTO_TCP: { 1091 struct tcphdr _tcph, *th; 1092 1093 th = skb_header_pointer(skb, iph->len, sizeof(_tcph), &_tcph); 1094 if (th == NULL) 1095 return false; 1096 return th->syn; 1097 } 1098 case IPPROTO_SCTP: { 1099 struct sctp_chunkhdr *sch, schunk; 1100 1101 sch = skb_header_pointer(skb, iph->len + sizeof(struct sctphdr), 1102 sizeof(schunk), &schunk); 1103 if (sch == NULL) 1104 return false; 1105 return sch->type == SCTP_CID_INIT; 1106 } 1107 default: 1108 return false; 1109 } 1110 } 1111 1112 static inline bool is_new_conn_expected(const struct ip_vs_conn *cp, 1113 int conn_reuse_mode) 1114 { 1115 /* Controlled (FTP DATA or persistence)? */ 1116 if (cp->control) 1117 return false; 1118 1119 switch (cp->protocol) { 1120 case IPPROTO_TCP: 1121 return (cp->state == IP_VS_TCP_S_TIME_WAIT) || 1122 (cp->state == IP_VS_TCP_S_CLOSE) || 1123 ((conn_reuse_mode & 2) && 1124 (cp->state == IP_VS_TCP_S_FIN_WAIT) && 1125 (cp->flags & IP_VS_CONN_F_NOOUTPUT)); 1126 case IPPROTO_SCTP: 1127 return cp->state == IP_VS_SCTP_S_CLOSED; 1128 default: 1129 return false; 1130 } 1131 } 1132 1133 /* Generic function to create new connections for outgoing RS packets 1134 * 1135 * Pre-requisites for successful connection creation: 1136 * 1) Virtual Service is NOT fwmark based: 1137 * In fwmark-VS actual vaddr and vport are unknown to IPVS 1138 * 2) Real Server and Virtual Service were NOT configured without port: 1139 * This is to allow match of different VS to the same RS ip-addr 1140 */ 1141 struct ip_vs_conn *ip_vs_new_conn_out(struct ip_vs_service *svc, 1142 struct ip_vs_dest *dest, 1143 struct sk_buff *skb, 1144 const struct ip_vs_iphdr *iph, 1145 __be16 dport, 1146 __be16 cport) 1147 { 1148 struct ip_vs_conn_param param; 1149 struct ip_vs_conn *ct = NULL, *cp = NULL; 1150 const union nf_inet_addr *vaddr, *daddr, *caddr; 1151 union nf_inet_addr snet; 1152 __be16 vport; 1153 unsigned int flags; 1154 1155 EnterFunction(12); 1156 vaddr = &svc->addr; 1157 vport = svc->port; 1158 daddr = &iph->saddr; 1159 caddr = &iph->daddr; 1160 1161 /* check pre-requisites are satisfied */ 1162 if (svc->fwmark) 1163 return NULL; 1164 if (!vport || !dport) 1165 return NULL; 1166 1167 /* for persistent service first create connection template */ 1168 if (svc->flags & IP_VS_SVC_F_PERSISTENT) { 1169 /* apply netmask the same way ingress-side does */ 1170 #ifdef CONFIG_IP_VS_IPV6 1171 if (svc->af == AF_INET6) 1172 ipv6_addr_prefix(&snet.in6, &caddr->in6, 1173 (__force __u32)svc->netmask); 1174 else 1175 #endif 1176 snet.ip = caddr->ip & svc->netmask; 1177 /* fill params and create template if not existent */ 1178 if (ip_vs_conn_fill_param_persist(svc, skb, iph->protocol, 1179 &snet, 0, vaddr, 1180 vport, ¶m) < 0) 1181 return NULL; 1182 ct = ip_vs_ct_in_get(¶m); 1183 /* check if template exists and points to the same dest */ 1184 if (!ct || !ip_vs_check_template(ct, dest)) { 1185 ct = ip_vs_conn_new(¶m, dest->af, daddr, dport, 1186 IP_VS_CONN_F_TEMPLATE, dest, 0); 1187 if (!ct) { 1188 kfree(param.pe_data); 1189 return NULL; 1190 } 1191 ct->timeout = svc->timeout; 1192 } else { 1193 kfree(param.pe_data); 1194 } 1195 } 1196 1197 /* connection flags */ 1198 flags = ((svc->flags & IP_VS_SVC_F_ONEPACKET) && 1199 iph->protocol == IPPROTO_UDP) ? IP_VS_CONN_F_ONE_PACKET : 0; 1200 /* create connection */ 1201 ip_vs_conn_fill_param(svc->ipvs, svc->af, iph->protocol, 1202 caddr, cport, vaddr, vport, ¶m); 1203 cp = ip_vs_conn_new(¶m, dest->af, daddr, dport, flags, dest, 0); 1204 if (!cp) { 1205 if (ct) 1206 ip_vs_conn_put(ct); 1207 return NULL; 1208 } 1209 if (ct) { 1210 ip_vs_control_add(cp, ct); 1211 ip_vs_conn_put(ct); 1212 } 1213 ip_vs_conn_stats(cp, svc); 1214 1215 /* return connection (will be used to handle outgoing packet) */ 1216 IP_VS_DBG_BUF(6, "New connection RS-initiated:%c c:%s:%u v:%s:%u " 1217 "d:%s:%u conn->flags:%X conn->refcnt:%d\n", 1218 ip_vs_fwd_tag(cp), 1219 IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), 1220 IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport), 1221 IP_VS_DBG_ADDR(cp->af, &cp->daddr), ntohs(cp->dport), 1222 cp->flags, refcount_read(&cp->refcnt)); 1223 LeaveFunction(12); 1224 return cp; 1225 } 1226 1227 /* Handle outgoing packets which are considered requests initiated by 1228 * real servers, so that subsequent responses from external client can be 1229 * routed to the right real server. 1230 * Used also for outgoing responses in OPS mode. 1231 * 1232 * Connection management is handled by persistent-engine specific callback. 1233 */ 1234 static struct ip_vs_conn *__ip_vs_rs_conn_out(unsigned int hooknum, 1235 struct netns_ipvs *ipvs, 1236 int af, struct sk_buff *skb, 1237 const struct ip_vs_iphdr *iph) 1238 { 1239 struct ip_vs_dest *dest; 1240 struct ip_vs_conn *cp = NULL; 1241 __be16 _ports[2], *pptr; 1242 1243 if (hooknum == NF_INET_LOCAL_IN) 1244 return NULL; 1245 1246 pptr = frag_safe_skb_hp(skb, iph->len, 1247 sizeof(_ports), _ports); 1248 if (!pptr) 1249 return NULL; 1250 1251 dest = ip_vs_find_real_service(ipvs, af, iph->protocol, 1252 &iph->saddr, pptr[0]); 1253 if (dest) { 1254 struct ip_vs_service *svc; 1255 struct ip_vs_pe *pe; 1256 1257 svc = rcu_dereference(dest->svc); 1258 if (svc) { 1259 pe = rcu_dereference(svc->pe); 1260 if (pe && pe->conn_out) 1261 cp = pe->conn_out(svc, dest, skb, iph, 1262 pptr[0], pptr[1]); 1263 } 1264 } 1265 1266 return cp; 1267 } 1268 1269 /* Handle response packets: rewrite addresses and send away... 1270 */ 1271 static unsigned int 1272 handle_response(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, 1273 struct ip_vs_conn *cp, struct ip_vs_iphdr *iph, 1274 unsigned int hooknum) 1275 { 1276 struct ip_vs_protocol *pp = pd->pp; 1277 1278 if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) 1279 goto after_nat; 1280 1281 IP_VS_DBG_PKT(11, af, pp, skb, iph->off, "Outgoing packet"); 1282 1283 if (skb_ensure_writable(skb, iph->len)) 1284 goto drop; 1285 1286 /* mangle the packet */ 1287 if (pp->snat_handler && 1288 !SNAT_CALL(pp->snat_handler, skb, pp, cp, iph)) 1289 goto drop; 1290 1291 #ifdef CONFIG_IP_VS_IPV6 1292 if (af == AF_INET6) 1293 ipv6_hdr(skb)->saddr = cp->vaddr.in6; 1294 else 1295 #endif 1296 { 1297 ip_hdr(skb)->saddr = cp->vaddr.ip; 1298 ip_send_check(ip_hdr(skb)); 1299 } 1300 1301 /* 1302 * nf_iterate does not expect change in the skb->dst->dev. 1303 * It looks like it is not fatal to enable this code for hooks 1304 * where our handlers are at the end of the chain list and 1305 * when all next handlers use skb->dst->dev and not outdev. 1306 * It will definitely route properly the inout NAT traffic 1307 * when multiple paths are used. 1308 */ 1309 1310 /* For policy routing, packets originating from this 1311 * machine itself may be routed differently to packets 1312 * passing through. We want this packet to be routed as 1313 * if it came from this machine itself. So re-compute 1314 * the routing information. 1315 */ 1316 if (ip_vs_route_me_harder(cp->ipvs, af, skb, hooknum)) 1317 goto drop; 1318 1319 IP_VS_DBG_PKT(10, af, pp, skb, iph->off, "After SNAT"); 1320 1321 after_nat: 1322 ip_vs_out_stats(cp, skb); 1323 ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pd); 1324 skb->ipvs_property = 1; 1325 if (!(cp->flags & IP_VS_CONN_F_NFCT)) 1326 ip_vs_notrack(skb); 1327 else 1328 ip_vs_update_conntrack(skb, cp, 0); 1329 ip_vs_conn_put(cp); 1330 1331 LeaveFunction(11); 1332 return NF_ACCEPT; 1333 1334 drop: 1335 ip_vs_conn_put(cp); 1336 kfree_skb(skb); 1337 LeaveFunction(11); 1338 return NF_STOLEN; 1339 } 1340 1341 /* 1342 * Check if outgoing packet belongs to the established ip_vs_conn. 1343 */ 1344 static unsigned int 1345 ip_vs_out(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int af) 1346 { 1347 struct ip_vs_iphdr iph; 1348 struct ip_vs_protocol *pp; 1349 struct ip_vs_proto_data *pd; 1350 struct ip_vs_conn *cp; 1351 struct sock *sk; 1352 1353 EnterFunction(11); 1354 1355 /* Already marked as IPVS request or reply? */ 1356 if (skb->ipvs_property) 1357 return NF_ACCEPT; 1358 1359 sk = skb_to_full_sk(skb); 1360 /* Bad... Do not break raw sockets */ 1361 if (unlikely(sk && hooknum == NF_INET_LOCAL_OUT && 1362 af == AF_INET)) { 1363 1364 if (sk->sk_family == PF_INET && inet_sk(sk)->nodefrag) 1365 return NF_ACCEPT; 1366 } 1367 1368 if (unlikely(!skb_dst(skb))) 1369 return NF_ACCEPT; 1370 1371 if (!ipvs->enable) 1372 return NF_ACCEPT; 1373 1374 ip_vs_fill_iph_skb(af, skb, false, &iph); 1375 #ifdef CONFIG_IP_VS_IPV6 1376 if (af == AF_INET6) { 1377 if (unlikely(iph.protocol == IPPROTO_ICMPV6)) { 1378 int related; 1379 int verdict = ip_vs_out_icmp_v6(ipvs, skb, &related, 1380 hooknum, &iph); 1381 1382 if (related) 1383 return verdict; 1384 } 1385 } else 1386 #endif 1387 if (unlikely(iph.protocol == IPPROTO_ICMP)) { 1388 int related; 1389 int verdict = ip_vs_out_icmp(ipvs, skb, &related, hooknum); 1390 1391 if (related) 1392 return verdict; 1393 } 1394 1395 pd = ip_vs_proto_data_get(ipvs, iph.protocol); 1396 if (unlikely(!pd)) 1397 return NF_ACCEPT; 1398 pp = pd->pp; 1399 1400 /* reassemble IP fragments */ 1401 #ifdef CONFIG_IP_VS_IPV6 1402 if (af == AF_INET) 1403 #endif 1404 if (unlikely(ip_is_fragment(ip_hdr(skb)) && !pp->dont_defrag)) { 1405 if (ip_vs_gather_frags(ipvs, skb, 1406 ip_vs_defrag_user(hooknum))) 1407 return NF_STOLEN; 1408 1409 ip_vs_fill_iph_skb(AF_INET, skb, false, &iph); 1410 } 1411 1412 /* 1413 * Check if the packet belongs to an existing entry 1414 */ 1415 cp = INDIRECT_CALL_1(pp->conn_out_get, ip_vs_conn_out_get_proto, 1416 ipvs, af, skb, &iph); 1417 1418 if (likely(cp)) 1419 return handle_response(af, skb, pd, cp, &iph, hooknum); 1420 1421 /* Check for real-server-started requests */ 1422 if (atomic_read(&ipvs->conn_out_counter)) { 1423 /* Currently only for UDP: 1424 * connection oriented protocols typically use 1425 * ephemeral ports for outgoing connections, so 1426 * related incoming responses would not match any VS 1427 */ 1428 if (pp->protocol == IPPROTO_UDP) { 1429 cp = __ip_vs_rs_conn_out(hooknum, ipvs, af, skb, &iph); 1430 if (likely(cp)) 1431 return handle_response(af, skb, pd, cp, &iph, 1432 hooknum); 1433 } 1434 } 1435 1436 if (sysctl_nat_icmp_send(ipvs) && 1437 (pp->protocol == IPPROTO_TCP || 1438 pp->protocol == IPPROTO_UDP || 1439 pp->protocol == IPPROTO_SCTP)) { 1440 __be16 _ports[2], *pptr; 1441 1442 pptr = frag_safe_skb_hp(skb, iph.len, 1443 sizeof(_ports), _ports); 1444 if (pptr == NULL) 1445 return NF_ACCEPT; /* Not for me */ 1446 if (ip_vs_has_real_service(ipvs, af, iph.protocol, &iph.saddr, 1447 pptr[0])) { 1448 /* 1449 * Notify the real server: there is no 1450 * existing entry if it is not RST 1451 * packet or not TCP packet. 1452 */ 1453 if ((iph.protocol != IPPROTO_TCP && 1454 iph.protocol != IPPROTO_SCTP) 1455 || ((iph.protocol == IPPROTO_TCP 1456 && !is_tcp_reset(skb, iph.len)) 1457 || (iph.protocol == IPPROTO_SCTP 1458 && !is_sctp_abort(skb, 1459 iph.len)))) { 1460 #ifdef CONFIG_IP_VS_IPV6 1461 if (af == AF_INET6) { 1462 if (!skb->dev) 1463 skb->dev = ipvs->net->loopback_dev; 1464 icmpv6_send(skb, 1465 ICMPV6_DEST_UNREACH, 1466 ICMPV6_PORT_UNREACH, 1467 0); 1468 } else 1469 #endif 1470 icmp_send(skb, 1471 ICMP_DEST_UNREACH, 1472 ICMP_PORT_UNREACH, 0); 1473 return NF_DROP; 1474 } 1475 } 1476 } 1477 1478 IP_VS_DBG_PKT(12, af, pp, skb, iph.off, 1479 "ip_vs_out: packet continues traversal as normal"); 1480 return NF_ACCEPT; 1481 } 1482 1483 /* 1484 * It is hooked at the NF_INET_FORWARD and NF_INET_LOCAL_IN chain, 1485 * used only for VS/NAT. 1486 * Check if packet is reply for established ip_vs_conn. 1487 */ 1488 static unsigned int 1489 ip_vs_reply4(void *priv, struct sk_buff *skb, 1490 const struct nf_hook_state *state) 1491 { 1492 return ip_vs_out(net_ipvs(state->net), state->hook, skb, AF_INET); 1493 } 1494 1495 /* 1496 * It is hooked at the NF_INET_LOCAL_OUT chain, used only for VS/NAT. 1497 * Check if packet is reply for established ip_vs_conn. 1498 */ 1499 static unsigned int 1500 ip_vs_local_reply4(void *priv, struct sk_buff *skb, 1501 const struct nf_hook_state *state) 1502 { 1503 return ip_vs_out(net_ipvs(state->net), state->hook, skb, AF_INET); 1504 } 1505 1506 #ifdef CONFIG_IP_VS_IPV6 1507 1508 /* 1509 * It is hooked at the NF_INET_FORWARD and NF_INET_LOCAL_IN chain, 1510 * used only for VS/NAT. 1511 * Check if packet is reply for established ip_vs_conn. 1512 */ 1513 static unsigned int 1514 ip_vs_reply6(void *priv, struct sk_buff *skb, 1515 const struct nf_hook_state *state) 1516 { 1517 return ip_vs_out(net_ipvs(state->net), state->hook, skb, AF_INET6); 1518 } 1519 1520 /* 1521 * It is hooked at the NF_INET_LOCAL_OUT chain, used only for VS/NAT. 1522 * Check if packet is reply for established ip_vs_conn. 1523 */ 1524 static unsigned int 1525 ip_vs_local_reply6(void *priv, struct sk_buff *skb, 1526 const struct nf_hook_state *state) 1527 { 1528 return ip_vs_out(net_ipvs(state->net), state->hook, skb, AF_INET6); 1529 } 1530 1531 #endif 1532 1533 static unsigned int 1534 ip_vs_try_to_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb, 1535 struct ip_vs_proto_data *pd, 1536 int *verdict, struct ip_vs_conn **cpp, 1537 struct ip_vs_iphdr *iph) 1538 { 1539 struct ip_vs_protocol *pp = pd->pp; 1540 1541 if (!iph->fragoffs) { 1542 /* No (second) fragments need to enter here, as nf_defrag_ipv6 1543 * replayed fragment zero will already have created the cp 1544 */ 1545 1546 /* Schedule and create new connection entry into cpp */ 1547 if (!pp->conn_schedule(ipvs, af, skb, pd, verdict, cpp, iph)) 1548 return 0; 1549 } 1550 1551 if (unlikely(!*cpp)) { 1552 /* sorry, all this trouble for a no-hit :) */ 1553 IP_VS_DBG_PKT(12, af, pp, skb, iph->off, 1554 "ip_vs_in: packet continues traversal as normal"); 1555 1556 /* Fragment couldn't be mapped to a conn entry */ 1557 if (iph->fragoffs) 1558 IP_VS_DBG_PKT(7, af, pp, skb, iph->off, 1559 "unhandled fragment"); 1560 1561 *verdict = NF_ACCEPT; 1562 return 0; 1563 } 1564 1565 return 1; 1566 } 1567 1568 /* Check the UDP tunnel and return its header length */ 1569 static int ipvs_udp_decap(struct netns_ipvs *ipvs, struct sk_buff *skb, 1570 unsigned int offset, __u16 af, 1571 const union nf_inet_addr *daddr, __u8 *proto) 1572 { 1573 struct udphdr _udph, *udph; 1574 struct ip_vs_dest *dest; 1575 1576 udph = skb_header_pointer(skb, offset, sizeof(_udph), &_udph); 1577 if (!udph) 1578 goto unk; 1579 offset += sizeof(struct udphdr); 1580 dest = ip_vs_find_tunnel(ipvs, af, daddr, udph->dest); 1581 if (!dest) 1582 goto unk; 1583 if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { 1584 struct guehdr _gueh, *gueh; 1585 1586 gueh = skb_header_pointer(skb, offset, sizeof(_gueh), &_gueh); 1587 if (!gueh) 1588 goto unk; 1589 if (gueh->control != 0 || gueh->version != 0) 1590 goto unk; 1591 /* Later we can support also IPPROTO_IPV6 */ 1592 if (gueh->proto_ctype != IPPROTO_IPIP) 1593 goto unk; 1594 *proto = gueh->proto_ctype; 1595 return sizeof(struct udphdr) + sizeof(struct guehdr) + 1596 (gueh->hlen << 2); 1597 } 1598 1599 unk: 1600 return 0; 1601 } 1602 1603 /* Check the GRE tunnel and return its header length */ 1604 static int ipvs_gre_decap(struct netns_ipvs *ipvs, struct sk_buff *skb, 1605 unsigned int offset, __u16 af, 1606 const union nf_inet_addr *daddr, __u8 *proto) 1607 { 1608 struct gre_base_hdr _greh, *greh; 1609 struct ip_vs_dest *dest; 1610 1611 greh = skb_header_pointer(skb, offset, sizeof(_greh), &_greh); 1612 if (!greh) 1613 goto unk; 1614 dest = ip_vs_find_tunnel(ipvs, af, daddr, 0); 1615 if (!dest) 1616 goto unk; 1617 if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) { 1618 __be16 type; 1619 1620 /* Only support version 0 and C (csum) */ 1621 if ((greh->flags & ~GRE_CSUM) != 0) 1622 goto unk; 1623 type = greh->protocol; 1624 /* Later we can support also IPPROTO_IPV6 */ 1625 if (type != htons(ETH_P_IP)) 1626 goto unk; 1627 *proto = IPPROTO_IPIP; 1628 return gre_calc_hlen(gre_flags_to_tnl_flags(greh->flags)); 1629 } 1630 1631 unk: 1632 return 0; 1633 } 1634 1635 /* 1636 * Handle ICMP messages in the outside-to-inside direction (incoming). 1637 * Find any that might be relevant, check against existing connections, 1638 * forward to the right destination host if relevant. 1639 * Currently handles error types - unreachable, quench, ttl exceeded. 1640 */ 1641 static int 1642 ip_vs_in_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related, 1643 unsigned int hooknum) 1644 { 1645 struct iphdr *iph; 1646 struct icmphdr _icmph, *ic; 1647 struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */ 1648 struct ip_vs_iphdr ciph; 1649 struct ip_vs_conn *cp; 1650 struct ip_vs_protocol *pp; 1651 struct ip_vs_proto_data *pd; 1652 unsigned int offset, offset2, ihl, verdict; 1653 bool tunnel, new_cp = false; 1654 union nf_inet_addr *raddr; 1655 char *outer_proto = "IPIP"; 1656 1657 *related = 1; 1658 1659 /* reassemble IP fragments */ 1660 if (ip_is_fragment(ip_hdr(skb))) { 1661 if (ip_vs_gather_frags(ipvs, skb, ip_vs_defrag_user(hooknum))) 1662 return NF_STOLEN; 1663 } 1664 1665 iph = ip_hdr(skb); 1666 offset = ihl = iph->ihl * 4; 1667 ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph); 1668 if (ic == NULL) 1669 return NF_DROP; 1670 1671 IP_VS_DBG(12, "Incoming ICMP (%d,%d) %pI4->%pI4\n", 1672 ic->type, ntohs(icmp_id(ic)), 1673 &iph->saddr, &iph->daddr); 1674 1675 /* 1676 * Work through seeing if this is for us. 1677 * These checks are supposed to be in an order that means easy 1678 * things are checked first to speed up processing.... however 1679 * this means that some packets will manage to get a long way 1680 * down this stack and then be rejected, but that's life. 1681 */ 1682 if ((ic->type != ICMP_DEST_UNREACH) && 1683 (ic->type != ICMP_SOURCE_QUENCH) && 1684 (ic->type != ICMP_TIME_EXCEEDED)) { 1685 *related = 0; 1686 return NF_ACCEPT; 1687 } 1688 1689 /* Now find the contained IP header */ 1690 offset += sizeof(_icmph); 1691 cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph); 1692 if (cih == NULL) 1693 return NF_ACCEPT; /* The packet looks wrong, ignore */ 1694 raddr = (union nf_inet_addr *)&cih->daddr; 1695 1696 /* Special case for errors for IPIP/UDP/GRE tunnel packets */ 1697 tunnel = false; 1698 if (cih->protocol == IPPROTO_IPIP) { 1699 struct ip_vs_dest *dest; 1700 1701 if (unlikely(cih->frag_off & htons(IP_OFFSET))) 1702 return NF_ACCEPT; 1703 /* Error for our IPIP must arrive at LOCAL_IN */ 1704 if (!(skb_rtable(skb)->rt_flags & RTCF_LOCAL)) 1705 return NF_ACCEPT; 1706 dest = ip_vs_find_tunnel(ipvs, AF_INET, raddr, 0); 1707 /* Only for known tunnel */ 1708 if (!dest || dest->tun_type != IP_VS_CONN_F_TUNNEL_TYPE_IPIP) 1709 return NF_ACCEPT; 1710 offset += cih->ihl * 4; 1711 cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph); 1712 if (cih == NULL) 1713 return NF_ACCEPT; /* The packet looks wrong, ignore */ 1714 tunnel = true; 1715 } else if ((cih->protocol == IPPROTO_UDP || /* Can be UDP encap */ 1716 cih->protocol == IPPROTO_GRE) && /* Can be GRE encap */ 1717 /* Error for our tunnel must arrive at LOCAL_IN */ 1718 (skb_rtable(skb)->rt_flags & RTCF_LOCAL)) { 1719 __u8 iproto; 1720 int ulen; 1721 1722 /* Non-first fragment has no UDP/GRE header */ 1723 if (unlikely(cih->frag_off & htons(IP_OFFSET))) 1724 return NF_ACCEPT; 1725 offset2 = offset + cih->ihl * 4; 1726 if (cih->protocol == IPPROTO_UDP) { 1727 ulen = ipvs_udp_decap(ipvs, skb, offset2, AF_INET, 1728 raddr, &iproto); 1729 outer_proto = "UDP"; 1730 } else { 1731 ulen = ipvs_gre_decap(ipvs, skb, offset2, AF_INET, 1732 raddr, &iproto); 1733 outer_proto = "GRE"; 1734 } 1735 if (ulen > 0) { 1736 /* Skip IP and UDP/GRE tunnel headers */ 1737 offset = offset2 + ulen; 1738 /* Now we should be at the original IP header */ 1739 cih = skb_header_pointer(skb, offset, sizeof(_ciph), 1740 &_ciph); 1741 if (cih && cih->version == 4 && cih->ihl >= 5 && 1742 iproto == IPPROTO_IPIP) 1743 tunnel = true; 1744 else 1745 return NF_ACCEPT; 1746 } 1747 } 1748 1749 pd = ip_vs_proto_data_get(ipvs, cih->protocol); 1750 if (!pd) 1751 return NF_ACCEPT; 1752 pp = pd->pp; 1753 1754 /* Is the embedded protocol header present? */ 1755 if (unlikely(cih->frag_off & htons(IP_OFFSET) && 1756 pp->dont_defrag)) 1757 return NF_ACCEPT; 1758 1759 IP_VS_DBG_PKT(11, AF_INET, pp, skb, offset, 1760 "Checking incoming ICMP for"); 1761 1762 offset2 = offset; 1763 ip_vs_fill_iph_skb_icmp(AF_INET, skb, offset, !tunnel, &ciph); 1764 offset = ciph.len; 1765 1766 /* The embedded headers contain source and dest in reverse order. 1767 * For IPIP/UDP/GRE tunnel this is error for request, not for reply. 1768 */ 1769 cp = INDIRECT_CALL_1(pp->conn_in_get, ip_vs_conn_in_get_proto, 1770 ipvs, AF_INET, skb, &ciph); 1771 1772 if (!cp) { 1773 int v; 1774 1775 if (tunnel || !sysctl_schedule_icmp(ipvs)) 1776 return NF_ACCEPT; 1777 1778 if (!ip_vs_try_to_schedule(ipvs, AF_INET, skb, pd, &v, &cp, &ciph)) 1779 return v; 1780 new_cp = true; 1781 } 1782 1783 verdict = NF_DROP; 1784 1785 /* Ensure the checksum is correct */ 1786 if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) { 1787 /* Failed checksum! */ 1788 IP_VS_DBG(1, "Incoming ICMP: failed checksum from %pI4!\n", 1789 &iph->saddr); 1790 goto out; 1791 } 1792 1793 if (tunnel) { 1794 __be32 info = ic->un.gateway; 1795 __u8 type = ic->type; 1796 __u8 code = ic->code; 1797 1798 /* Update the MTU */ 1799 if (ic->type == ICMP_DEST_UNREACH && 1800 ic->code == ICMP_FRAG_NEEDED) { 1801 struct ip_vs_dest *dest = cp->dest; 1802 u32 mtu = ntohs(ic->un.frag.mtu); 1803 __be16 frag_off = cih->frag_off; 1804 1805 /* Strip outer IP and ICMP, go to IPIP/UDP/GRE header */ 1806 if (pskb_pull(skb, ihl + sizeof(_icmph)) == NULL) 1807 goto ignore_tunnel; 1808 offset2 -= ihl + sizeof(_icmph); 1809 skb_reset_network_header(skb); 1810 IP_VS_DBG(12, "ICMP for %s %pI4->%pI4: mtu=%u\n", 1811 outer_proto, &ip_hdr(skb)->saddr, 1812 &ip_hdr(skb)->daddr, mtu); 1813 ipv4_update_pmtu(skb, ipvs->net, mtu, 0, 0); 1814 /* Client uses PMTUD? */ 1815 if (!(frag_off & htons(IP_DF))) 1816 goto ignore_tunnel; 1817 /* Prefer the resulting PMTU */ 1818 if (dest) { 1819 struct ip_vs_dest_dst *dest_dst; 1820 1821 dest_dst = rcu_dereference(dest->dest_dst); 1822 if (dest_dst) 1823 mtu = dst_mtu(dest_dst->dst_cache); 1824 } 1825 if (mtu > 68 + sizeof(struct iphdr)) 1826 mtu -= sizeof(struct iphdr); 1827 info = htonl(mtu); 1828 } 1829 /* Strip outer IP, ICMP and IPIP/UDP/GRE, go to IP header of 1830 * original request. 1831 */ 1832 if (pskb_pull(skb, offset2) == NULL) 1833 goto ignore_tunnel; 1834 skb_reset_network_header(skb); 1835 IP_VS_DBG(12, "Sending ICMP for %pI4->%pI4: t=%u, c=%u, i=%u\n", 1836 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, 1837 type, code, ntohl(info)); 1838 icmp_send(skb, type, code, info); 1839 /* ICMP can be shorter but anyways, account it */ 1840 ip_vs_out_stats(cp, skb); 1841 1842 ignore_tunnel: 1843 consume_skb(skb); 1844 verdict = NF_STOLEN; 1845 goto out; 1846 } 1847 1848 /* do the statistics and put it back */ 1849 ip_vs_in_stats(cp, skb); 1850 if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol || 1851 IPPROTO_SCTP == cih->protocol) 1852 offset += 2 * sizeof(__u16); 1853 verdict = ip_vs_icmp_xmit(skb, cp, pp, offset, hooknum, &ciph); 1854 1855 out: 1856 if (likely(!new_cp)) 1857 __ip_vs_conn_put(cp); 1858 else 1859 ip_vs_conn_put(cp); 1860 1861 return verdict; 1862 } 1863 1864 #ifdef CONFIG_IP_VS_IPV6 1865 static int ip_vs_in_icmp_v6(struct netns_ipvs *ipvs, struct sk_buff *skb, 1866 int *related, unsigned int hooknum, 1867 struct ip_vs_iphdr *iph) 1868 { 1869 struct icmp6hdr _icmph, *ic; 1870 struct ip_vs_iphdr ciph = {.flags = 0, .fragoffs = 0};/*Contained IP */ 1871 struct ip_vs_conn *cp; 1872 struct ip_vs_protocol *pp; 1873 struct ip_vs_proto_data *pd; 1874 unsigned int offset, verdict; 1875 bool new_cp = false; 1876 1877 *related = 1; 1878 1879 ic = frag_safe_skb_hp(skb, iph->len, sizeof(_icmph), &_icmph); 1880 if (ic == NULL) 1881 return NF_DROP; 1882 1883 /* 1884 * Work through seeing if this is for us. 1885 * These checks are supposed to be in an order that means easy 1886 * things are checked first to speed up processing.... however 1887 * this means that some packets will manage to get a long way 1888 * down this stack and then be rejected, but that's life. 1889 */ 1890 if (ic->icmp6_type & ICMPV6_INFOMSG_MASK) { 1891 *related = 0; 1892 return NF_ACCEPT; 1893 } 1894 /* Fragment header that is before ICMP header tells us that: 1895 * it's not an error message since they can't be fragmented. 1896 */ 1897 if (iph->flags & IP6_FH_F_FRAG) 1898 return NF_DROP; 1899 1900 IP_VS_DBG(8, "Incoming ICMPv6 (%d,%d) %pI6c->%pI6c\n", 1901 ic->icmp6_type, ntohs(icmpv6_id(ic)), 1902 &iph->saddr, &iph->daddr); 1903 1904 offset = iph->len + sizeof(_icmph); 1905 if (!ip_vs_fill_iph_skb_icmp(AF_INET6, skb, offset, true, &ciph)) 1906 return NF_ACCEPT; 1907 1908 pd = ip_vs_proto_data_get(ipvs, ciph.protocol); 1909 if (!pd) 1910 return NF_ACCEPT; 1911 pp = pd->pp; 1912 1913 /* Cannot handle fragmented embedded protocol */ 1914 if (ciph.fragoffs) 1915 return NF_ACCEPT; 1916 1917 IP_VS_DBG_PKT(11, AF_INET6, pp, skb, offset, 1918 "Checking incoming ICMPv6 for"); 1919 1920 /* The embedded headers contain source and dest in reverse order 1921 * if not from localhost 1922 */ 1923 cp = INDIRECT_CALL_1(pp->conn_in_get, ip_vs_conn_in_get_proto, 1924 ipvs, AF_INET6, skb, &ciph); 1925 1926 if (!cp) { 1927 int v; 1928 1929 if (!sysctl_schedule_icmp(ipvs)) 1930 return NF_ACCEPT; 1931 1932 if (!ip_vs_try_to_schedule(ipvs, AF_INET6, skb, pd, &v, &cp, &ciph)) 1933 return v; 1934 1935 new_cp = true; 1936 } 1937 1938 /* VS/TUN, VS/DR and LOCALNODE just let it go */ 1939 if ((hooknum == NF_INET_LOCAL_OUT) && 1940 (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)) { 1941 verdict = NF_ACCEPT; 1942 goto out; 1943 } 1944 1945 /* do the statistics and put it back */ 1946 ip_vs_in_stats(cp, skb); 1947 1948 /* Need to mangle contained IPv6 header in ICMPv6 packet */ 1949 offset = ciph.len; 1950 if (IPPROTO_TCP == ciph.protocol || IPPROTO_UDP == ciph.protocol || 1951 IPPROTO_SCTP == ciph.protocol) 1952 offset += 2 * sizeof(__u16); /* Also mangle ports */ 1953 1954 verdict = ip_vs_icmp_xmit_v6(skb, cp, pp, offset, hooknum, &ciph); 1955 1956 out: 1957 if (likely(!new_cp)) 1958 __ip_vs_conn_put(cp); 1959 else 1960 ip_vs_conn_put(cp); 1961 1962 return verdict; 1963 } 1964 #endif 1965 1966 1967 /* 1968 * Check if it's for virtual services, look it up, 1969 * and send it on its way... 1970 */ 1971 static unsigned int 1972 ip_vs_in(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int af) 1973 { 1974 struct ip_vs_iphdr iph; 1975 struct ip_vs_protocol *pp; 1976 struct ip_vs_proto_data *pd; 1977 struct ip_vs_conn *cp; 1978 int ret, pkts; 1979 int conn_reuse_mode; 1980 struct sock *sk; 1981 1982 /* Already marked as IPVS request or reply? */ 1983 if (skb->ipvs_property) 1984 return NF_ACCEPT; 1985 1986 /* 1987 * Big tappo: 1988 * - remote client: only PACKET_HOST 1989 * - route: used for struct net when skb->dev is unset 1990 */ 1991 if (unlikely((skb->pkt_type != PACKET_HOST && 1992 hooknum != NF_INET_LOCAL_OUT) || 1993 !skb_dst(skb))) { 1994 ip_vs_fill_iph_skb(af, skb, false, &iph); 1995 IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s" 1996 " ignored in hook %u\n", 1997 skb->pkt_type, iph.protocol, 1998 IP_VS_DBG_ADDR(af, &iph.daddr), hooknum); 1999 return NF_ACCEPT; 2000 } 2001 /* ipvs enabled in this netns ? */ 2002 if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable)) 2003 return NF_ACCEPT; 2004 2005 ip_vs_fill_iph_skb(af, skb, false, &iph); 2006 2007 /* Bad... Do not break raw sockets */ 2008 sk = skb_to_full_sk(skb); 2009 if (unlikely(sk && hooknum == NF_INET_LOCAL_OUT && 2010 af == AF_INET)) { 2011 2012 if (sk->sk_family == PF_INET && inet_sk(sk)->nodefrag) 2013 return NF_ACCEPT; 2014 } 2015 2016 #ifdef CONFIG_IP_VS_IPV6 2017 if (af == AF_INET6) { 2018 if (unlikely(iph.protocol == IPPROTO_ICMPV6)) { 2019 int related; 2020 int verdict = ip_vs_in_icmp_v6(ipvs, skb, &related, 2021 hooknum, &iph); 2022 2023 if (related) 2024 return verdict; 2025 } 2026 } else 2027 #endif 2028 if (unlikely(iph.protocol == IPPROTO_ICMP)) { 2029 int related; 2030 int verdict = ip_vs_in_icmp(ipvs, skb, &related, 2031 hooknum); 2032 2033 if (related) 2034 return verdict; 2035 } 2036 2037 /* Protocol supported? */ 2038 pd = ip_vs_proto_data_get(ipvs, iph.protocol); 2039 if (unlikely(!pd)) { 2040 /* The only way we'll see this packet again is if it's 2041 * encapsulated, so mark it with ipvs_property=1 so we 2042 * skip it if we're ignoring tunneled packets 2043 */ 2044 if (sysctl_ignore_tunneled(ipvs)) 2045 skb->ipvs_property = 1; 2046 2047 return NF_ACCEPT; 2048 } 2049 pp = pd->pp; 2050 /* 2051 * Check if the packet belongs to an existing connection entry 2052 */ 2053 cp = INDIRECT_CALL_1(pp->conn_in_get, ip_vs_conn_in_get_proto, 2054 ipvs, af, skb, &iph); 2055 2056 conn_reuse_mode = sysctl_conn_reuse_mode(ipvs); 2057 if (conn_reuse_mode && !iph.fragoffs && is_new_conn(skb, &iph) && cp) { 2058 bool old_ct = false, resched = false; 2059 2060 if (unlikely(sysctl_expire_nodest_conn(ipvs)) && cp->dest && 2061 unlikely(!atomic_read(&cp->dest->weight))) { 2062 resched = true; 2063 old_ct = ip_vs_conn_uses_old_conntrack(cp, skb); 2064 } else if (is_new_conn_expected(cp, conn_reuse_mode)) { 2065 old_ct = ip_vs_conn_uses_old_conntrack(cp, skb); 2066 if (!atomic_read(&cp->n_control)) { 2067 resched = true; 2068 } else { 2069 /* Do not reschedule controlling connection 2070 * that uses conntrack while it is still 2071 * referenced by controlled connection(s). 2072 */ 2073 resched = !old_ct; 2074 } 2075 } 2076 2077 if (resched) { 2078 if (!old_ct) 2079 cp->flags &= ~IP_VS_CONN_F_NFCT; 2080 if (!atomic_read(&cp->n_control)) 2081 ip_vs_conn_expire_now(cp); 2082 __ip_vs_conn_put(cp); 2083 if (old_ct) 2084 return NF_DROP; 2085 cp = NULL; 2086 } 2087 } 2088 2089 /* Check the server status */ 2090 if (cp && cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) { 2091 /* the destination server is not available */ 2092 if (sysctl_expire_nodest_conn(ipvs)) { 2093 bool old_ct = ip_vs_conn_uses_old_conntrack(cp, skb); 2094 2095 if (!old_ct) 2096 cp->flags &= ~IP_VS_CONN_F_NFCT; 2097 2098 ip_vs_conn_expire_now(cp); 2099 __ip_vs_conn_put(cp); 2100 if (old_ct) 2101 return NF_DROP; 2102 cp = NULL; 2103 } else { 2104 __ip_vs_conn_put(cp); 2105 return NF_DROP; 2106 } 2107 } 2108 2109 if (unlikely(!cp)) { 2110 int v; 2111 2112 if (!ip_vs_try_to_schedule(ipvs, af, skb, pd, &v, &cp, &iph)) 2113 return v; 2114 } 2115 2116 IP_VS_DBG_PKT(11, af, pp, skb, iph.off, "Incoming packet"); 2117 2118 ip_vs_in_stats(cp, skb); 2119 ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pd); 2120 if (cp->packet_xmit) 2121 ret = cp->packet_xmit(skb, cp, pp, &iph); 2122 /* do not touch skb anymore */ 2123 else { 2124 IP_VS_DBG_RL("warning: packet_xmit is null"); 2125 ret = NF_ACCEPT; 2126 } 2127 2128 /* Increase its packet counter and check if it is needed 2129 * to be synchronized 2130 * 2131 * Sync connection if it is about to close to 2132 * encorage the standby servers to update the connections timeout 2133 * 2134 * For ONE_PKT let ip_vs_sync_conn() do the filter work. 2135 */ 2136 2137 if (cp->flags & IP_VS_CONN_F_ONE_PACKET) 2138 pkts = sysctl_sync_threshold(ipvs); 2139 else 2140 pkts = atomic_add_return(1, &cp->in_pkts); 2141 2142 if (ipvs->sync_state & IP_VS_STATE_MASTER) 2143 ip_vs_sync_conn(ipvs, cp, pkts); 2144 else if ((cp->flags & IP_VS_CONN_F_ONE_PACKET) && cp->control) 2145 /* increment is done inside ip_vs_sync_conn too */ 2146 atomic_inc(&cp->control->in_pkts); 2147 2148 ip_vs_conn_put(cp); 2149 return ret; 2150 } 2151 2152 /* 2153 * AF_INET handler in NF_INET_LOCAL_IN chain 2154 * Schedule and forward packets from remote clients 2155 */ 2156 static unsigned int 2157 ip_vs_remote_request4(void *priv, struct sk_buff *skb, 2158 const struct nf_hook_state *state) 2159 { 2160 return ip_vs_in(net_ipvs(state->net), state->hook, skb, AF_INET); 2161 } 2162 2163 /* 2164 * AF_INET handler in NF_INET_LOCAL_OUT chain 2165 * Schedule and forward packets from local clients 2166 */ 2167 static unsigned int 2168 ip_vs_local_request4(void *priv, struct sk_buff *skb, 2169 const struct nf_hook_state *state) 2170 { 2171 return ip_vs_in(net_ipvs(state->net), state->hook, skb, AF_INET); 2172 } 2173 2174 #ifdef CONFIG_IP_VS_IPV6 2175 2176 /* 2177 * AF_INET6 handler in NF_INET_LOCAL_IN chain 2178 * Schedule and forward packets from remote clients 2179 */ 2180 static unsigned int 2181 ip_vs_remote_request6(void *priv, struct sk_buff *skb, 2182 const struct nf_hook_state *state) 2183 { 2184 return ip_vs_in(net_ipvs(state->net), state->hook, skb, AF_INET6); 2185 } 2186 2187 /* 2188 * AF_INET6 handler in NF_INET_LOCAL_OUT chain 2189 * Schedule and forward packets from local clients 2190 */ 2191 static unsigned int 2192 ip_vs_local_request6(void *priv, struct sk_buff *skb, 2193 const struct nf_hook_state *state) 2194 { 2195 return ip_vs_in(net_ipvs(state->net), state->hook, skb, AF_INET6); 2196 } 2197 2198 #endif 2199 2200 2201 /* 2202 * It is hooked at the NF_INET_FORWARD chain, in order to catch ICMP 2203 * related packets destined for 0.0.0.0/0. 2204 * When fwmark-based virtual service is used, such as transparent 2205 * cache cluster, TCP packets can be marked and routed to ip_vs_in, 2206 * but ICMP destined for 0.0.0.0/0 cannot not be easily marked and 2207 * sent to ip_vs_in_icmp. So, catch them at the NF_INET_FORWARD chain 2208 * and send them to ip_vs_in_icmp. 2209 */ 2210 static unsigned int 2211 ip_vs_forward_icmp(void *priv, struct sk_buff *skb, 2212 const struct nf_hook_state *state) 2213 { 2214 int r; 2215 struct netns_ipvs *ipvs = net_ipvs(state->net); 2216 2217 if (ip_hdr(skb)->protocol != IPPROTO_ICMP) 2218 return NF_ACCEPT; 2219 2220 /* ipvs enabled in this netns ? */ 2221 if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable)) 2222 return NF_ACCEPT; 2223 2224 return ip_vs_in_icmp(ipvs, skb, &r, state->hook); 2225 } 2226 2227 #ifdef CONFIG_IP_VS_IPV6 2228 static unsigned int 2229 ip_vs_forward_icmp_v6(void *priv, struct sk_buff *skb, 2230 const struct nf_hook_state *state) 2231 { 2232 int r; 2233 struct netns_ipvs *ipvs = net_ipvs(state->net); 2234 struct ip_vs_iphdr iphdr; 2235 2236 ip_vs_fill_iph_skb(AF_INET6, skb, false, &iphdr); 2237 if (iphdr.protocol != IPPROTO_ICMPV6) 2238 return NF_ACCEPT; 2239 2240 /* ipvs enabled in this netns ? */ 2241 if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable)) 2242 return NF_ACCEPT; 2243 2244 return ip_vs_in_icmp_v6(ipvs, skb, &r, state->hook, &iphdr); 2245 } 2246 #endif 2247 2248 2249 static const struct nf_hook_ops ip_vs_ops4[] = { 2250 /* After packet filtering, change source only for VS/NAT */ 2251 { 2252 .hook = ip_vs_reply4, 2253 .pf = NFPROTO_IPV4, 2254 .hooknum = NF_INET_LOCAL_IN, 2255 .priority = NF_IP_PRI_NAT_SRC - 2, 2256 }, 2257 /* After packet filtering, forward packet through VS/DR, VS/TUN, 2258 * or VS/NAT(change destination), so that filtering rules can be 2259 * applied to IPVS. */ 2260 { 2261 .hook = ip_vs_remote_request4, 2262 .pf = NFPROTO_IPV4, 2263 .hooknum = NF_INET_LOCAL_IN, 2264 .priority = NF_IP_PRI_NAT_SRC - 1, 2265 }, 2266 /* Before ip_vs_in, change source only for VS/NAT */ 2267 { 2268 .hook = ip_vs_local_reply4, 2269 .pf = NFPROTO_IPV4, 2270 .hooknum = NF_INET_LOCAL_OUT, 2271 .priority = NF_IP_PRI_NAT_DST + 1, 2272 }, 2273 /* After mangle, schedule and forward local requests */ 2274 { 2275 .hook = ip_vs_local_request4, 2276 .pf = NFPROTO_IPV4, 2277 .hooknum = NF_INET_LOCAL_OUT, 2278 .priority = NF_IP_PRI_NAT_DST + 2, 2279 }, 2280 /* After packet filtering (but before ip_vs_out_icmp), catch icmp 2281 * destined for 0.0.0.0/0, which is for incoming IPVS connections */ 2282 { 2283 .hook = ip_vs_forward_icmp, 2284 .pf = NFPROTO_IPV4, 2285 .hooknum = NF_INET_FORWARD, 2286 .priority = 99, 2287 }, 2288 /* After packet filtering, change source only for VS/NAT */ 2289 { 2290 .hook = ip_vs_reply4, 2291 .pf = NFPROTO_IPV4, 2292 .hooknum = NF_INET_FORWARD, 2293 .priority = 100, 2294 }, 2295 }; 2296 2297 #ifdef CONFIG_IP_VS_IPV6 2298 static const struct nf_hook_ops ip_vs_ops6[] = { 2299 /* After packet filtering, change source only for VS/NAT */ 2300 { 2301 .hook = ip_vs_reply6, 2302 .pf = NFPROTO_IPV6, 2303 .hooknum = NF_INET_LOCAL_IN, 2304 .priority = NF_IP6_PRI_NAT_SRC - 2, 2305 }, 2306 /* After packet filtering, forward packet through VS/DR, VS/TUN, 2307 * or VS/NAT(change destination), so that filtering rules can be 2308 * applied to IPVS. */ 2309 { 2310 .hook = ip_vs_remote_request6, 2311 .pf = NFPROTO_IPV6, 2312 .hooknum = NF_INET_LOCAL_IN, 2313 .priority = NF_IP6_PRI_NAT_SRC - 1, 2314 }, 2315 /* Before ip_vs_in, change source only for VS/NAT */ 2316 { 2317 .hook = ip_vs_local_reply6, 2318 .pf = NFPROTO_IPV6, 2319 .hooknum = NF_INET_LOCAL_OUT, 2320 .priority = NF_IP6_PRI_NAT_DST + 1, 2321 }, 2322 /* After mangle, schedule and forward local requests */ 2323 { 2324 .hook = ip_vs_local_request6, 2325 .pf = NFPROTO_IPV6, 2326 .hooknum = NF_INET_LOCAL_OUT, 2327 .priority = NF_IP6_PRI_NAT_DST + 2, 2328 }, 2329 /* After packet filtering (but before ip_vs_out_icmp), catch icmp 2330 * destined for 0.0.0.0/0, which is for incoming IPVS connections */ 2331 { 2332 .hook = ip_vs_forward_icmp_v6, 2333 .pf = NFPROTO_IPV6, 2334 .hooknum = NF_INET_FORWARD, 2335 .priority = 99, 2336 }, 2337 /* After packet filtering, change source only for VS/NAT */ 2338 { 2339 .hook = ip_vs_reply6, 2340 .pf = NFPROTO_IPV6, 2341 .hooknum = NF_INET_FORWARD, 2342 .priority = 100, 2343 }, 2344 }; 2345 #endif 2346 2347 int ip_vs_register_hooks(struct netns_ipvs *ipvs, unsigned int af) 2348 { 2349 const struct nf_hook_ops *ops; 2350 unsigned int count; 2351 unsigned int afmask; 2352 int ret = 0; 2353 2354 if (af == AF_INET6) { 2355 #ifdef CONFIG_IP_VS_IPV6 2356 ops = ip_vs_ops6; 2357 count = ARRAY_SIZE(ip_vs_ops6); 2358 afmask = 2; 2359 #else 2360 return -EINVAL; 2361 #endif 2362 } else { 2363 ops = ip_vs_ops4; 2364 count = ARRAY_SIZE(ip_vs_ops4); 2365 afmask = 1; 2366 } 2367 2368 if (!(ipvs->hooks_afmask & afmask)) { 2369 ret = nf_register_net_hooks(ipvs->net, ops, count); 2370 if (ret >= 0) 2371 ipvs->hooks_afmask |= afmask; 2372 } 2373 return ret; 2374 } 2375 2376 void ip_vs_unregister_hooks(struct netns_ipvs *ipvs, unsigned int af) 2377 { 2378 const struct nf_hook_ops *ops; 2379 unsigned int count; 2380 unsigned int afmask; 2381 2382 if (af == AF_INET6) { 2383 #ifdef CONFIG_IP_VS_IPV6 2384 ops = ip_vs_ops6; 2385 count = ARRAY_SIZE(ip_vs_ops6); 2386 afmask = 2; 2387 #else 2388 return; 2389 #endif 2390 } else { 2391 ops = ip_vs_ops4; 2392 count = ARRAY_SIZE(ip_vs_ops4); 2393 afmask = 1; 2394 } 2395 2396 if (ipvs->hooks_afmask & afmask) { 2397 nf_unregister_net_hooks(ipvs->net, ops, count); 2398 ipvs->hooks_afmask &= ~afmask; 2399 } 2400 } 2401 2402 /* 2403 * Initialize IP Virtual Server netns mem. 2404 */ 2405 static int __net_init __ip_vs_init(struct net *net) 2406 { 2407 struct netns_ipvs *ipvs; 2408 2409 ipvs = net_generic(net, ip_vs_net_id); 2410 if (ipvs == NULL) 2411 return -ENOMEM; 2412 2413 /* Hold the beast until a service is registerd */ 2414 ipvs->enable = 0; 2415 ipvs->net = net; 2416 /* Counters used for creating unique names */ 2417 ipvs->gen = atomic_read(&ipvs_netns_cnt); 2418 atomic_inc(&ipvs_netns_cnt); 2419 net->ipvs = ipvs; 2420 2421 if (ip_vs_estimator_net_init(ipvs) < 0) 2422 goto estimator_fail; 2423 2424 if (ip_vs_control_net_init(ipvs) < 0) 2425 goto control_fail; 2426 2427 if (ip_vs_protocol_net_init(ipvs) < 0) 2428 goto protocol_fail; 2429 2430 if (ip_vs_app_net_init(ipvs) < 0) 2431 goto app_fail; 2432 2433 if (ip_vs_conn_net_init(ipvs) < 0) 2434 goto conn_fail; 2435 2436 if (ip_vs_sync_net_init(ipvs) < 0) 2437 goto sync_fail; 2438 2439 return 0; 2440 /* 2441 * Error handling 2442 */ 2443 2444 sync_fail: 2445 ip_vs_conn_net_cleanup(ipvs); 2446 conn_fail: 2447 ip_vs_app_net_cleanup(ipvs); 2448 app_fail: 2449 ip_vs_protocol_net_cleanup(ipvs); 2450 protocol_fail: 2451 ip_vs_control_net_cleanup(ipvs); 2452 control_fail: 2453 ip_vs_estimator_net_cleanup(ipvs); 2454 estimator_fail: 2455 net->ipvs = NULL; 2456 return -ENOMEM; 2457 } 2458 2459 static void __net_exit __ip_vs_cleanup_batch(struct list_head *net_list) 2460 { 2461 struct netns_ipvs *ipvs; 2462 struct net *net; 2463 2464 ip_vs_service_nets_cleanup(net_list); /* ip_vs_flush() with locks */ 2465 list_for_each_entry(net, net_list, exit_list) { 2466 ipvs = net_ipvs(net); 2467 ip_vs_conn_net_cleanup(ipvs); 2468 ip_vs_app_net_cleanup(ipvs); 2469 ip_vs_protocol_net_cleanup(ipvs); 2470 ip_vs_control_net_cleanup(ipvs); 2471 ip_vs_estimator_net_cleanup(ipvs); 2472 IP_VS_DBG(2, "ipvs netns %d released\n", ipvs->gen); 2473 net->ipvs = NULL; 2474 } 2475 } 2476 2477 static void __net_exit __ip_vs_dev_cleanup_batch(struct list_head *net_list) 2478 { 2479 struct netns_ipvs *ipvs; 2480 struct net *net; 2481 2482 EnterFunction(2); 2483 list_for_each_entry(net, net_list, exit_list) { 2484 ipvs = net_ipvs(net); 2485 ip_vs_unregister_hooks(ipvs, AF_INET); 2486 ip_vs_unregister_hooks(ipvs, AF_INET6); 2487 ipvs->enable = 0; /* Disable packet reception */ 2488 smp_wmb(); 2489 ip_vs_sync_net_cleanup(ipvs); 2490 } 2491 LeaveFunction(2); 2492 } 2493 2494 static struct pernet_operations ipvs_core_ops = { 2495 .init = __ip_vs_init, 2496 .exit_batch = __ip_vs_cleanup_batch, 2497 .id = &ip_vs_net_id, 2498 .size = sizeof(struct netns_ipvs), 2499 }; 2500 2501 static struct pernet_operations ipvs_core_dev_ops = { 2502 .exit_batch = __ip_vs_dev_cleanup_batch, 2503 }; 2504 2505 /* 2506 * Initialize IP Virtual Server 2507 */ 2508 static int __init ip_vs_init(void) 2509 { 2510 int ret; 2511 2512 ret = ip_vs_control_init(); 2513 if (ret < 0) { 2514 pr_err("can't setup control.\n"); 2515 goto exit; 2516 } 2517 2518 ip_vs_protocol_init(); 2519 2520 ret = ip_vs_conn_init(); 2521 if (ret < 0) { 2522 pr_err("can't setup connection table.\n"); 2523 goto cleanup_protocol; 2524 } 2525 2526 ret = register_pernet_subsys(&ipvs_core_ops); /* Alloc ip_vs struct */ 2527 if (ret < 0) 2528 goto cleanup_conn; 2529 2530 ret = register_pernet_device(&ipvs_core_dev_ops); 2531 if (ret < 0) 2532 goto cleanup_sub; 2533 2534 ret = ip_vs_register_nl_ioctl(); 2535 if (ret < 0) { 2536 pr_err("can't register netlink/ioctl.\n"); 2537 goto cleanup_dev; 2538 } 2539 2540 pr_info("ipvs loaded.\n"); 2541 2542 return ret; 2543 2544 cleanup_dev: 2545 unregister_pernet_device(&ipvs_core_dev_ops); 2546 cleanup_sub: 2547 unregister_pernet_subsys(&ipvs_core_ops); 2548 cleanup_conn: 2549 ip_vs_conn_cleanup(); 2550 cleanup_protocol: 2551 ip_vs_protocol_cleanup(); 2552 ip_vs_control_cleanup(); 2553 exit: 2554 return ret; 2555 } 2556 2557 static void __exit ip_vs_cleanup(void) 2558 { 2559 ip_vs_unregister_nl_ioctl(); 2560 unregister_pernet_device(&ipvs_core_dev_ops); 2561 unregister_pernet_subsys(&ipvs_core_ops); /* free ip_vs struct */ 2562 ip_vs_conn_cleanup(); 2563 ip_vs_protocol_cleanup(); 2564 ip_vs_control_cleanup(); 2565 pr_info("ipvs unloaded.\n"); 2566 } 2567 2568 module_init(ip_vs_init); 2569 module_exit(ip_vs_cleanup); 2570 MODULE_LICENSE("GPL"); 2571